From f6217f891ac0bb64f3d375211650a4c1ff8ca1ea Mon Sep 17 00:00:00 2001
From: Apple OSS Distributions <91980991+AppleOSSDistributions@users.noreply.github.com>
Date: Sat, 4 Oct 2025 02:38:34 +0000
Subject: [PATCH] xnu-12377.1.9

Imported from xnu-12377.1.9.tar.gz
---
 .upstream_base_commits | 1 +
 EXTERNAL_HEADERS/Makefile | 2 +
 EXTERNAL_HEADERS/_inttypes.h | 225 +
 EXTERNAL_HEADERS/corecrypto/ccdigest.h | 24 +-
 EXTERNAL_HEADERS/inttypes.h | 36 +
 Makefile | 7 +
 README.md | 4 +-
 SETUP/config/mkheaders.c | 21 +-
 bsd/Makefile | 1 +
 bsd/bsm/audit_kevents.h | 1 +
 bsd/conf/Makefile.template | 9 +
 bsd/conf/files | 23 +-
 bsd/conf/param.c | 2 -
 bsd/dev/arm/kern_machdep.c | 14 +-
 bsd/dev/arm64/sysctl.c | 27 +
 bsd/dev/dtrace/dtrace.c | 6 +-
 bsd/dev/dtrace/dtrace_subr.c | 17 +-
 bsd/dev/dtrace/fasttrap.c | 6 +-
 bsd/dev/dtrace/scripts/errno.d | 4 +-
 bsd/dev/dtrace/sdt_subr.c | 12 +
 bsd/dev/i386/kern_machdep.c | 8 +-
 bsd/dev/unix_startup.c | 91 +-
 bsd/kern/Makefile | 18 +
 bsd/kern/bsd_init.c | 13 +-
 bsd/kern/bsd_syscalls_stashtask.txt | 4 -
 bsd/kern/code_signing/ppl.c | 25 +-
 bsd/kern/code_signing/txm.c | 47 +-
 bsd/kern/decmpfs.c | 8 +-
 bsd/kern/imageboot.c | 56 +-
 bsd/kern/kdebug.c | 1416 ++--
 bsd/kern/kdebug_common.c | 634 +-
 bsd/kern/kdebug_triage.c | 299 +-
 bsd/kern/kern_aio.c | 1260 +++-
 bsd/kern/kern_authorization.c | 2 +-
 bsd/kern/kern_codesigning.c | 55 +-
 bsd/kern/kern_core.c | 141 +-
 bsd/kern/kern_credential.c | 4 +-
 bsd/kern/kern_csr.c | 21 +-
 bsd/kern/kern_descrip.c | 52 +-
 bsd/kern/kern_event.c | 17 +-
 bsd/kern/kern_exec.c | 631 +-
 bsd/kern/kern_exec_internal.h | 59 +-
 bsd/kern/kern_exit.c | 30 +-
 bsd/kern/kern_fork.c | 16 +-
 bsd/kern/kern_guarded.c | 11 +-
 bsd/kern/kern_malloc.c | 49 +-
 bsd/kern/kern_memorystatus.c | 1526 ++--
 bsd/kern/kern_memorystatus_freeze.c | 55 +-
 bsd/kern/kern_memorystatus_internal.h | 88 +-
 bsd/kern/kern_memorystatus_notify.c | 278 +-
 bsd/kern/kern_memorystatus_policy.c | 262 +-
 bsd/kern/kern_mib.c | 58 +-
 bsd/kern/kern_mman.c | 26 +-
 bsd/kern/kern_newsysctl.c | 80 +-
 bsd/kern/kern_proc.c | 368 +-
 bsd/kern/kern_resource.c | 491 +-
 bsd/kern/kern_shutdown.c | 14 +
 bsd/kern/kern_sig.c | 116 +-
 bsd/kern/kern_symfile.c | 42 +-
 bsd/kern/kern_sysctl.c | 606 +-
 bsd/kern/kern_time.c | 1 +
 bsd/kern/kpi_mbuf.c | 45 +-
 bsd/kern/mach_loader.c | 284 +-
 bsd/kern/mach_loader.h | 13 +-
 bsd/kern/mach_process.c | 10 +-
 bsd/kern/mcache.c | 4 +-
 bsd/kern/mem_acct.c | 496 ++
 bsd/kern/mem_acct.h | 71 +
 bsd/kern/policy_check.c | 16 +-
 bsd/kern/posix_sem.c | 317 +-
 bsd/kern/proc_info.c | 30 +-
 bsd/kern/qsort.c | 6 +-
 .../arm64/bti_telemetry.h => bsd/kern/qsort.h | 38 +-
 bsd/kern/socket_flows.c | 61 +-
 bsd/kern/stackshot.c | 6 +-
 bsd/kern/subr_log.c | 2 +-
 bsd/kern/subr_log_stream.c | 2 +-
 bsd/kern/subr_prf.c | 5 +-
 bsd/kern/sys_generic.c | 44 +-
 bsd/kern/sys_socket.c | 2 +-
 bsd/kern/sys_ulock.c | 2 +
 bsd/kern/syscalls.master | 2 +-
 bsd/kern/trace_codes | 199 +
 bsd/kern/tracker.c | 18 +-
 bsd/kern/tty_dev.c | 7 +-
 bsd/kern/tty_ptmx.c | 4 +
 bsd/kern/tty_pty.c | 1 +
 bsd/kern/ubc_subr.c | 54 +-
 bsd/kern/uipc_domain.c | 38 +-
 bsd/kern/uipc_domain.h | 106 +
 bsd/kern/uipc_mbuf.c | 6061 ++--------------
 bsd/kern/uipc_mbuf2.c | 278 +-
 bsd/kern/uipc_mbuf_mcache.c | 6207 +++++++++++++++++
 bsd/kern/uipc_socket.c | 402 +-
 bsd/kern/uipc_socket.h | 77 +
 bsd/kern/uipc_socket2.c | 140 +-
 bsd/kern/uipc_syscalls.c | 17 +-
 bsd/kern/uipc_usrreq.c | 2 +-
 bsd/kern/vsock_domain.c | 347 +-
 bsd/machine/exec.h | 4 +-
 bsd/man/man2/access.2 | 17 +-
 bsd/man/man2/chmod.2 | 15 +-
 bsd/man/man2/chown.2 | 15 +-
bsd/man/man2/clonefile.2 | 39 +- bsd/man/man2/execve.2 | 5 +- bsd/man/man2/fs_snapshot_create.2 | 2 + bsd/man/man2/getattrlist.2 | 36 +- bsd/man/man2/getattrlistbulk.2 | 9 +- bsd/man/man2/getxattr.2 | 13 +- bsd/man/man2/kqueue.2 | 12 +- bsd/man/man2/link.2 | 40 +- bsd/man/man2/listxattr.2 | 13 +- bsd/man/man2/mount.2 | 24 +- bsd/man/man2/open.2 | 17 +- bsd/man/man2/posix_spawn.2 | 5 +- bsd/man/man2/removexattr.2 | 13 +- bsd/man/man2/rename.2 | 17 +- bsd/man/man2/sem_open.2 | 19 + bsd/man/man2/setattrlist.2 | 14 +- bsd/man/man2/setxattr.2 | 13 +- bsd/man/man2/stat.2 | 15 +- bsd/man/man2/unlink.2 | 28 +- bsd/man/man2/utimensat.2 | 16 +- bsd/man/man3/Makefile | 4 +- .../man3/posix_spawn_file_actions_addclose.3 | 4 +- bsd/man/man9/Makefile | 1 + bsd/man/man9/byteorder.9 | 169 + bsd/miscfs/bindfs/bind_vfsops.c | 9 +- bsd/miscfs/deadfs/dead_vnops.c | 4 + bsd/miscfs/devfs/devfs_fdesc_support.c | 30 +- bsd/miscfs/devfs/devfs_vnops.c | 35 + bsd/miscfs/nullfs/null_vfsops.c | 9 +- bsd/miscfs/specfs/spec_vnops.c | 28 +- bsd/net/Makefile | 13 +- bsd/net/aop/Makefile | 33 + bsd/net/aop/aop_flow_stats.h | 60 + bsd/net/aop/aop_stats.h | 288 + bsd/net/aop/kpi_aop.c | 731 ++ bsd/net/aop/kpi_aop.h | 129 + bsd/net/bpf.c | 108 +- bsd/net/bpf.h | 293 +- bsd/net/bpf_private.h | 364 + bsd/net/classq/classq.h | 3 - bsd/net/classq/classq_fq_codel.c | 543 +- bsd/net/classq/classq_fq_codel.h | 46 +- bsd/net/classq/classq_subr.c | 434 +- bsd/net/classq/if_classq.h | 49 +- bsd/net/content_filter.c | 31 +- bsd/net/dlil.c | 589 +- bsd/net/dlil_ctl.c | 29 - bsd/net/dlil_input.c | 62 +- bsd/net/dlil_output.c | 40 +- bsd/net/dlil_subr.c | 221 +- bsd/net/dlil_var_private.h | 24 +- bsd/net/droptap.c | 8 + bsd/net/droptap.h | 62 +- bsd/net/ether_if_module.c | 7 +- bsd/net/ethernet.h | 1 + bsd/net/flowadv.c | 3 +- bsd/net/flowadv.h | 4 +- bsd/net/if.c | 346 +- bsd/net/if.h | 11 +- bsd/net/if_bond.c | 1 + bsd/net/if_bridge.c | 288 +- bsd/net/if_fake.c | 109 +- bsd/net/if_headless.c | 2 +- bsd/net/if_ipsec.c | 7 +- bsd/net/if_llreach.c | 1 + bsd/net/if_loop.c | 9 +- bsd/net/if_low_power_mode.c | 2 +- bsd/net/if_mib.c | 2 +- bsd/net/if_ports_used.c | 1099 ++- bsd/net/if_ports_used.h | 80 +- bsd/net/if_private.h | 85 +- bsd/net/if_redirect.c | 16 +- bsd/net/if_utun.c | 1445 ++-- bsd/net/if_utun.h | 2 + bsd/net/if_var_private.h | 95 +- bsd/net/if_var_status.h | 21 +- bsd/net/if_vlan.c | 2 +- bsd/net/iptap.c | 2 +- bsd/net/kpi_interface.c | 125 +- bsd/net/kpi_interface.h | 58 + bsd/net/nat464_utils.c | 24 +- bsd/net/nat464_utils.h | 4 +- bsd/net/necp.c | 697 +- bsd/net/necp.h | 55 +- bsd/net/necp_client.c | 1098 ++- bsd/net/net_private.modulemap | 12 + bsd/net/net_stubs.c | 8 + bsd/net/network_agent.c | 1 + bsd/net/network_agent.h | 4 +- bsd/net/ntstat.c | 2571 ++++--- bsd/net/ntstat.h | 281 +- bsd/net/packet_mangler.c | 2 +- bsd/net/pf.c | 34 +- bsd/net/pf_ioctl.c | 33 +- bsd/net/pf_norm.c | 31 +- bsd/net/pf_pbuf.c | 31 +- bsd/net/pf_pbuf.h | 4 +- bsd/net/pktap.c | 19 +- bsd/net/pktap.h | 27 +- bsd/net/pktsched/Makefile | 2 +- bsd/net/pktsched/pktsched.c | 188 +- bsd/net/pktsched/pktsched.h | 6 +- bsd/net/pktsched/pktsched_fq_codel.c | 449 +- bsd/net/pktsched/pktsched_fq_codel.h | 48 +- bsd/net/pktsched/pktsched_netem.c | 4 +- bsd/net/pktsched/pktsched_ops.c | 67 + bsd/net/pktsched/pktsched_ops.h | 83 + bsd/net/radix.c | 325 +- bsd/net/radix.h | 49 +- bsd/net/restricted_in_port.c | 10 +- bsd/net/route.c | 169 +- bsd/net/route_private.h | 9 +- bsd/net/rtsock.c | 9 +- bsd/net/siphash.c | 257 + bsd/net/siphash.h | 85 + 
bsd/netinet/Makefile | 3 +- bsd/netinet/cpu_in_cksum_gen.c | 27 +- bsd/netinet/flow_divert.c | 11 +- bsd/netinet/icmp6.h | 21 +- bsd/netinet/igmp.c | 15 +- bsd/netinet/in.c | 14 +- bsd/netinet/in_arp.c | 123 +- bsd/netinet/in_mcast.c | 2 +- bsd/netinet/in_pcb.c | 336 +- bsd/netinet/in_pcb.h | 176 +- bsd/netinet/in_pcblist.c | 6 +- bsd/netinet/in_private.h | 10 + bsd/netinet/in_proto.c | 19 +- bsd/netinet/in_rmx.c | 1 + bsd/netinet/in_stat.h | 20 + bsd/netinet/in_tclass.c | 54 +- bsd/netinet/in_var.h | 1 - bsd/netinet/ip_icmp.c | 5 + bsd/netinet/ip_input.c | 468 +- bsd/netinet/ip_output.c | 17 +- bsd/netinet/ip_var.h | 1 + bsd/netinet/mptcp.c | 47 +- bsd/netinet/mptcp.h | 10 - bsd/netinet/mptcp_opt.c | 414 +- bsd/netinet/mptcp_subr.c | 48 +- bsd/netinet/mptcp_usrreq.c | 42 +- bsd/netinet/mptcp_var.h | 8 +- bsd/netinet/raw_ip.c | 27 +- bsd/netinet/tcp.h | 7 +- bsd/netinet/tcp_cache.c | 377 +- bsd/netinet/tcp_cache.h | 85 +- bsd/netinet/tcp_cc.c | 148 +- bsd/netinet/tcp_cc.h | 8 +- bsd/netinet/tcp_cubic.c | 205 +- bsd/netinet/tcp_includes.h | 1 + bsd/netinet/tcp_input.c | 2353 ++++--- bsd/netinet/tcp_ledbat.c | 20 +- bsd/netinet/tcp_log.c | 23 +- bsd/netinet/tcp_newreno.c | 10 + bsd/netinet/tcp_output.c | 872 +-- bsd/netinet/tcp_pacing.c | 164 + bsd/netinet/tcp_pacing.h | 43 + bsd/netinet/tcp_prague.c | 47 +- bsd/netinet/tcp_private.h | 9 +- bsd/netinet/tcp_rack.c | 2 +- bsd/netinet/tcp_sack.c | 36 +- bsd/netinet/tcp_subr.c | 636 +- bsd/netinet/tcp_syncookie.c | 727 ++ .../nfsdiskless.h => netinet/tcp_syncookie.h} | 78 +- bsd/netinet/tcp_sysctls.c | 4 +- bsd/netinet/tcp_sysctls.h | 2 + bsd/netinet/tcp_timer.c | 357 +- bsd/netinet/tcp_timer.h | 31 +- bsd/netinet/tcp_usrreq.c | 105 +- bsd/netinet/tcp_utils.h | 31 + bsd/netinet/tcp_var.h | 395 +- bsd/netinet/udp_log.c | 15 +- bsd/netinet/udp_usrreq.c | 256 +- bsd/netinet/udp_var.h | 19 +- bsd/netinet6/dest6.c | 5 +- bsd/netinet6/esp_core.c | 2 +- bsd/netinet6/esp_input.c | 6 +- bsd/netinet6/frag6.c | 7 +- bsd/netinet6/icmp6.c | 43 +- bsd/netinet6/in6.c | 11 +- bsd/netinet6/in6.h | 1 + bsd/netinet6/in6_ifattach.c | 6 +- bsd/netinet6/in6_mcast.c | 2 +- bsd/netinet6/in6_pcb.c | 8 +- bsd/netinet6/in6_private.h | 11 +- bsd/netinet6/in6_proto.c | 71 +- bsd/netinet6/in6_rmx.c | 4 +- bsd/netinet6/ip6_forward.c | 6 +- bsd/netinet6/ip6_input.c | 51 +- bsd/netinet6/ip6_output.c | 22 +- bsd/netinet6/ip6_var.h | 44 +- bsd/netinet6/ip6protosw.h | 6 +- bsd/netinet6/mld6.c | 13 +- bsd/netinet6/nd6.c | 172 +- bsd/netinet6/nd6.h | 36 +- bsd/netinet6/nd6_nbr.c | 64 +- bsd/netinet6/nd6_prproxy.c | 1 + bsd/netinet6/nd6_rtr.c | 145 +- bsd/netinet6/nd6_var.h | 4 + bsd/netinet6/raw_ip6.c | 27 +- bsd/netinet6/route6.c | 3 +- bsd/netinet6/udp6_output.c | 31 +- bsd/netinet6/udp6_usrreq.c | 17 +- bsd/netkey/key.c | 9 +- bsd/nfs/gss/gss_krb5_mech.c | 11 +- bsd/nfs/nfs.h | 191 +- bsd/nfs/nfs_gss.c | 18 +- bsd/nfs/nfs_serv.c | 16 +- bsd/nfs/nfs_socket.c | 16 +- bsd/nfs/nfs_srvcache.c | 7 +- bsd/nfs/nfs_subs.c | 35 +- bsd/nfs/nfs_syscalls.c | 10 +- bsd/nfs/nfsm_subs.h | 10 +- bsd/nfs/nfsproto.h | 559 +- bsd/pgo/profile_runtime_data.c | 5 + bsd/pthread/pthread_shims.c | 12 +- bsd/pthread/pthread_workqueue.c | 10 +- bsd/security/audit/audit_bsm.c | 10 + bsd/skywalk/channel/channel.c | 219 +- bsd/skywalk/channel/channel_kern.c | 36 +- bsd/skywalk/channel/channel_ring.c | 368 +- bsd/skywalk/channel/channel_syscalls.c | 105 +- bsd/skywalk/channel/channel_var.h | 98 +- bsd/skywalk/channel/kern_channel_event.c | 14 +- bsd/skywalk/channel/os_channel.h | 29 +- 
bsd/skywalk/channel/os_channel_private.h | 20 +- bsd/skywalk/core/skywalk.c | 159 +- bsd/skywalk/core/skywalk_common.h | 6 +- bsd/skywalk/core/skywalk_proc_info.c | 9 - bsd/skywalk/core/skywalk_var.h | 21 +- bsd/skywalk/mem/skmem.c | 62 +- bsd/skywalk/mem/skmem_arena.c | 97 +- bsd/skywalk/mem/skmem_arena_var.h | 2 + bsd/skywalk/mem/skmem_cache.c | 51 +- bsd/skywalk/mem/skmem_region.c | 222 +- bsd/skywalk/mem/skmem_slab.c | 9 +- bsd/skywalk/mem/skmem_test.c | 249 +- bsd/skywalk/namespace/flowidns.c | 19 +- bsd/skywalk/namespace/netns.c | 120 +- bsd/skywalk/namespace/netns.h | 11 +- bsd/skywalk/namespace/protons.c | 1 + bsd/skywalk/nexus/Makefile | 2 +- bsd/skywalk/nexus/flowswitch/flow/flow.c | 3 +- bsd/skywalk/nexus/flowswitch/flow/flow_agg.c | 59 +- .../nexus/flowswitch/flow/flow_classifier.c | 46 +- .../nexus/flowswitch/flow/flow_entry.c | 350 +- .../nexus/flowswitch/flow/flow_manager.c | 31 +- .../nexus/flowswitch/flow/flow_namespace.c | 2 +- .../nexus/flowswitch/flow/flow_owner.c | 20 +- .../nexus/flowswitch/flow/flow_route.c | 62 +- .../nexus/flowswitch/flow/flow_stats.c | 8 +- .../nexus/flowswitch/flow/flow_track.c | 73 +- bsd/skywalk/nexus/flowswitch/flow/flow_var.h | 44 +- bsd/skywalk/nexus/flowswitch/fsw.c | 60 +- bsd/skywalk/nexus/flowswitch/fsw_classq.c | 19 +- bsd/skywalk/nexus/flowswitch/fsw_dp.c | 456 +- bsd/skywalk/nexus/flowswitch/fsw_ethernet.c | 17 +- bsd/skywalk/nexus/flowswitch/fsw_flow.c | 124 +- bsd/skywalk/nexus/flowswitch/fsw_ip_frag.c | 24 + bsd/skywalk/nexus/flowswitch/fsw_netagent.c | 13 +- bsd/skywalk/nexus/flowswitch/fsw_var.h | 4 +- bsd/skywalk/nexus/flowswitch/fsw_vp.c | 38 +- bsd/skywalk/nexus/flowswitch/nx_flowswitch.c | 54 +- bsd/skywalk/nexus/flowswitch/nx_flowswitch.h | 3 + bsd/skywalk/nexus/kpipe/nx_kernel_pipe.c | 58 +- bsd/skywalk/nexus/kpipe/nx_kpipe_loopback.c | 32 +- bsd/skywalk/nexus/monitor/Makefile | 46 - bsd/skywalk/nexus/monitor/nx_monitor.c | 1712 ----- bsd/skywalk/nexus/monitor/nx_monitor.h | 86 - bsd/skywalk/nexus/netif/nx_netif.c | 351 +- bsd/skywalk/nexus/netif/nx_netif.h | 38 +- bsd/skywalk/nexus/netif/nx_netif_compat.c | 117 +- bsd/skywalk/nexus/netif/nx_netif_filter.c | 4 +- .../nexus/netif/nx_netif_filter_compat.c | 2 +- .../nexus/netif/nx_netif_filter_native.c | 2 +- bsd/skywalk/nexus/netif/nx_netif_filter_vp.c | 4 +- bsd/skywalk/nexus/netif/nx_netif_flow.c | 14 +- bsd/skywalk/nexus/netif/nx_netif_gso.c | 35 +- bsd/skywalk/nexus/netif/nx_netif_host.c | 66 +- bsd/skywalk/nexus/netif/nx_netif_llink.c | 65 +- bsd/skywalk/nexus/netif/nx_netif_mit.c | 12 +- bsd/skywalk/nexus/netif/nx_netif_netagent.c | 24 +- bsd/skywalk/nexus/netif/nx_netif_poll.c | 4 +- bsd/skywalk/nexus/netif/nx_netif_util.c | 99 +- bsd/skywalk/nexus/netif/nx_netif_vp.c | 87 +- bsd/skywalk/nexus/nexus.c | 120 +- bsd/skywalk/nexus/nexus_adapter.c | 411 +- bsd/skywalk/nexus/nexus_adapter.h | 25 +- bsd/skywalk/nexus/nexus_ioctl.c | 39 +- bsd/skywalk/nexus/nexus_ioctl.h | 19 +- bsd/skywalk/nexus/nexus_kern.c | 83 +- bsd/skywalk/nexus/nexus_syscalls.c | 68 +- bsd/skywalk/nexus/nexus_traffic_rule.c | 1141 +-- bsd/skywalk/nexus/nexus_traffic_rule.h | 154 + bsd/skywalk/nexus/nexus_traffic_rule_eth.c | 652 ++ bsd/skywalk/nexus/nexus_traffic_rule_eth.h | 69 + bsd/skywalk/nexus/nexus_traffic_rule_inet.c | 915 +++ bsd/skywalk/nexus/nexus_traffic_rule_inet.h | 68 + bsd/skywalk/nexus/nexus_var.h | 34 +- bsd/skywalk/nexus/os_nexus.h | 29 +- bsd/skywalk/nexus/os_nexus_private.h | 55 +- bsd/skywalk/nexus/upipe/nx_user_pipe.c | 130 +- bsd/skywalk/os_skywalk_private.h | 104 +- 
bsd/skywalk/os_stats_private.h | 37 +- bsd/skywalk/os_sysctls_private.h | 13 +- bsd/skywalk/packet/os_packet.h | 17 +- bsd/skywalk/packet/os_packet_private.h | 31 +- bsd/skywalk/packet/packet_common.h | 334 +- bsd/skywalk/packet/packet_copy.c | 270 +- bsd/skywalk/packet/packet_kern.c | 82 +- bsd/skywalk/packet/packet_var.h | 124 +- bsd/skywalk/packet/pbufpool.c | 546 +- bsd/skywalk/packet/pbufpool_kern.c | 7 +- bsd/sys/Makefile | 33 +- bsd/sys/_types/_graftdmg_un.h | 3 + bsd/sys/attr.h | 7 +- bsd/sys/buf.h | 52 + bsd/sys/buf_internal.h | 11 +- bsd/sys/cdefs.h | 59 +- bsd/sys/clonefile.h | 9 +- bsd/sys/code_signing.h | 70 +- bsd/sys/code_signing_internal.h | 5 + bsd/sys/codesign.h | 1 + bsd/sys/disk.h | 87 +- bsd/sys/disk_private.h | 111 + bsd/sys/dtrace_impl.h | 1 - bsd/sys/endian.h | 227 + bsd/sys/errno.h | 10 +- bsd/sys/fcntl.h | 149 +- bsd/sys/fcntl_private.h | 176 + bsd/sys/fsctl.h | 15 +- bsd/sys/guarded.h | 1 + bsd/sys/imageboot.h | 2 +- bsd/sys/imgact.h | 2 - bsd/sys/kas_info.h | 1 + bsd/sys/kdebug.h | 47 +- bsd/sys/kdebug_common.h | 23 +- bsd/sys/kdebug_kernel.h | 10 +- bsd/sys/kdebug_private.h | 1 + bsd/sys/kdebug_triage.h | 5 +- bsd/sys/kern_control.h | 244 +- bsd/sys/kern_control_private.h | 318 + bsd/sys/kern_event.h | 48 +- bsd/sys/kern_event_private.h | 82 + bsd/sys/kern_memorystatus.h | 33 +- bsd/sys/kern_memorystatus_notify.h | 5 +- bsd/sys/kern_memorystatus_xnu.h | 6 + bsd/sys/kpi_mbuf.h | 116 +- bsd/sys/linker_set.h | 2 +- bsd/sys/mbuf.h | 271 +- bsd/sys/mcache.h | 5 - bsd/sys/mem_acct_private.h | 51 + bsd/sys/mount.h | 7 +- bsd/sys/namei.h | 15 +- bsd/sys/paths.h | 11 + bsd/sys/proc.h | 53 +- bsd/sys/proc_info.h | 9 + bsd/sys/proc_info_private.h | 21 +- bsd/sys/proc_internal.h | 60 +- bsd/sys/proc_ro.h | 3 +- bsd/sys/protosw.h | 18 +- bsd/sys/pthread_shims.h | 4 +- bsd/sys/reason.h | 9 +- bsd/sys/reboot.h | 1 + bsd/sys/resource.h | 4 + bsd/sys/resource_private.h | 27 +- bsd/sys/signal.h | 1 + bsd/sys/signalvar.h | 1 + bsd/sys/snapshot.h | 4 + bsd/sys/socket_private.h | 3 + bsd/sys/socketvar.h | 34 +- bsd/sys/sockio_private.h | 12 +- bsd/sys/spawn_internal.h | 9 +- bsd/sys/stdio.h | 1 + bsd/sys/sys_domain.h | 28 +- bsd/sys/sys_domain_private.h | 59 + bsd/sys/sysctl.h | 120 +- bsd/sys/ubc_internal.h | 2 +- bsd/sys/user.h | 4 +- bsd/sys/vnode.h | 44 +- bsd/sys/vnode_if.h | 29 +- bsd/sys/vnode_internal.h | 3 +- bsd/sys/vsock_private.h | 43 + bsd/sys/vsock_transport.h | 5 +- bsd/sys/work_interval.h | 7 +- bsd/sys/xattr.h | 3 +- bsd/tests/bsd_tests.c | 4 +- bsd/tests/ctrr_test_sysctl.c | 12 +- bsd/tests/pmap_test_sysctl.c | 33 +- bsd/vfs/kpi_vfs.c | 50 +- bsd/vfs/vfs_attrlist.c | 129 +- bsd/vfs/vfs_bio.c | 169 +- bsd/vfs/vfs_cache.c | 40 +- bsd/vfs/vfs_cluster.c | 348 +- bsd/vfs/vfs_cprotect.c | 2 +- bsd/vfs/vfs_exclave_fs.c | 75 +- bsd/vfs/vfs_exclave_fs.h | 1 + bsd/vfs/vfs_lookup.c | 236 +- bsd/vfs/vfs_subr.c | 254 +- bsd/vfs/vfs_syscalls.c | 541 +- bsd/vfs/vfs_unicode.c | 6 +- bsd/vfs/vfs_unicode_data.h | 833 +-- bsd/vfs/vfs_vnops.c | 57 +- bsd/vfs/vfs_xattr.c | 1938 +---- bsd/vm/vm_unix.c | 293 +- config/BSDKernel.arm.exports | 1 + config/BSDKernel.arm64.exports | 1 + config/BSDKernel.exports | 2 + config/BSDKernel.x86_64.exports | 1 + config/IOKit.arm.exports | 1 + config/IOKit.arm64.exports | 1 + config/IOKit.exports | 25 + config/IOKit.x86_64.exports | 1 + config/Kasan_enabled.arm.exports | 8 + config/Kasan_enabled.arm64.exports | 8 + config/Kasan_enabled.x86_64.exports | 8 + config/Kcov_enabled.exports | 9 + config/Libkern.exports | 1 + config/MASTER | 17 +- 
config/MASTER.arm | 15 +- config/MASTER.arm64 | 21 +- config/MASTER.arm64.BridgeOS | 15 +- config/MASTER.arm64.MacOSX | 12 +- config/MASTER.arm64.WatchOS | 20 +- config/MASTER.arm64.iPhoneOS | 20 +- config/MASTER.x86_64 | 14 +- config/Private.arm64.exports | 7 +- config/Private.exports | 43 +- config/libTightbeam.exports | 1 - doc/debugging/extensible_paniclog.md | 389 ++ doc/lifecycle/startup.md | 9 +- doc/mach_ipc/guard_exceptions.md | 178 +- doc/mach_ipc/ipc_security_concepts.md | 116 + doc/mach_ipc/port_types.md | 164 + doc/observability/coalitions.md | 134 + doc/scheduler/sched_clutch_edge.md | 7 +- doc/vm/memorystatus.md | 12 +- doc/vm/memorystatus_kills.md | 20 +- doc/vm/memorystatus_notify.md | 2 +- doc/vm/pageout_scan.md | 231 + iokit/DriverKit/IOService.iig | 50 +- iokit/DriverKit/queue_implementation.h | 4 +- iokit/Exclaves/Exclaves.cpp | 64 + iokit/Exclaves/Exclaves.h | 27 + iokit/IOKit/IOBSD.h | 15 + iokit/IOKit/IOBufferMemoryDescriptor.h | 6 +- iokit/IOKit/IOCircularDataQueue.h | 420 ++ .../IOKit/IOCircularDataQueueImplementation.h | 1918 +++++ iokit/IOKit/IOHibernatePrivate.h | 9 +- iokit/IOKit/IOKitDebug.h | 4 +- iokit/IOKit/IOKitKeysPrivate.h | 10 + iokit/IOKit/IOKitServer.h | 52 +- iokit/IOKit/IOLib.h | 30 +- iokit/IOKit/IOMemoryDescriptor.h | 33 +- iokit/IOKit/IOMultiMemoryDescriptor.h | 16 + iokit/IOKit/IONVRAM.h | 2 +- iokit/IOKit/IOPolledInterface.h | 2 +- iokit/IOKit/IOService.h | 17 +- iokit/IOKit/IOSubMemoryDescriptor.h | 11 + iokit/IOKit/IOUserServer.h | 24 +- iokit/IOKit/Makefile | 3 +- iokit/IOKit/perfcontrol/IOPerfControl.h | 38 +- iokit/IOKit/pwr_mgt/IOPM.h | 15 +- iokit/IOKit/pwr_mgt/IOPMLibDefs.h | 3 +- iokit/IOKit/pwr_mgt/IOPMPrivate.h | 16 +- iokit/IOKit/pwr_mgt/RootDomain.h | 55 +- iokit/Kernel/IOBufferMemoryDescriptor.cpp | 67 +- iokit/Kernel/IOCatalogue.cpp | 2 + iokit/Kernel/IOCircularDataQueue.cpp | 38 + iokit/Kernel/IOHibernateIO.cpp | 10 +- iokit/Kernel/IOKitKernelInternal.h | 9 +- iokit/Kernel/IOLib.cpp | 44 +- iokit/Kernel/IOMemoryDescriptor.cpp | 145 +- iokit/Kernel/IOMultiMemoryDescriptor.cpp | 21 +- iokit/Kernel/IONVRAM.cpp | 237 +- iokit/Kernel/IONVRAMCHRPHandler.cpp | 2 +- iokit/Kernel/IONVRAMV3Handler.cpp | 2 +- iokit/Kernel/IOPMrootDomain.cpp | 549 +- iokit/Kernel/IOPerfControl.cpp | 48 +- iokit/Kernel/IOPlatformActions.cpp | 7 + iokit/Kernel/IOPlatformExpert.cpp | 15 +- iokit/Kernel/IOPolledInterface.cpp | 2 +- iokit/Kernel/IOService.cpp | 305 +- iokit/Kernel/IOServicePM.cpp | 103 +- iokit/Kernel/IOServicePMPrivate.h | 13 + iokit/Kernel/IOServicePrivate.h | 15 +- iokit/Kernel/IOSharedDataQueue.cpp | 2 +- iokit/Kernel/IOSubMemoryDescriptor.cpp | 7 + iokit/Kernel/IOUserClient.cpp | 535 +- iokit/Kernel/IOUserServer.cpp | 520 +- iokit/Kernel/RootDomainUserClient.cpp | 35 + iokit/Kernel/RootDomainUserClient.h | 2 + iokit/Tests/TestIOMemoryDescriptor.cpp | 13 + iokit/Tests/TestServices/TestIODataQueues.cpp | 79 + iokit/Tests/TestServices/TestIODataQueues.h | 36 + .../TestIOServiceUserNotification.cpp | 90 +- .../TestIOServiceUserNotification.h | 15 + iokit/bsddev/IOKitBSDInit.cpp | 100 + iokit/conf/Makefile.template | 4 + iokit/conf/files | 2 + libkdd/kcdata.h | 73 +- libkdd/kcdtypes.c | 72 +- libkern/amfi/amfi.c | 12 +- libkern/c++/OSData.cpp | 10 +- libkern/c++/OSKext.cpp | 58 +- libkern/c++/OSObject.cpp | 14 +- libkern/c++/OSSerialize.cpp | 2 +- libkern/c++/OSSymbol.cpp | 8 +- libkern/conf/Makefile.template | 4 + libkern/firehose/firehose_types_private.h | 30 + libkern/kxld/kxld_util.c | 6 +- libkern/libkern/amfi/amfi.h | 24 + 
libkern/libkern/c++/OSKext.h | 5 +- libkern/libkern/section_keywords.h | 12 + libkern/os/atomic_private.h | 4 +- libkern/os/base.h | 21 + libkern/os/hash.h | 34 +- libkern/os/log.c | 4 +- libkern/os/log_encode.c | 12 + libkern/os/log_encode_types.h | 1 + libkern/os/log_queue.c | 46 +- libkern/os/log_queue.h | 1 + libkern/os/refcnt.c | 6 +- libkern/os/refcnt_internal.h | 20 +- libsa/conf/Makefile.template | 4 + libsyscall/mach/mach_vm.c | 313 + libsyscall/mach/vm_reclaim.c | 354 +- libsyscall/wrappers/_libkernel_init.c | 13 +- libsyscall/wrappers/exclaves.c | 46 + libsyscall/wrappers/getiopolicy_np.c | 1 - libsyscall/wrappers/skywalk/os_channel.c | 261 +- libsyscall/wrappers/skywalk/os_nexus.c | 104 +- libsyscall/wrappers/skywalk/os_packet.c | 38 +- libsyscall/wrappers/spawn/posix_spawn.c | 73 +- libsyscall/wrappers/spawn/spawn.h | 10 +- libsyscall/wrappers/spawn/spawn_private.h | 4 + libsyscall/wrappers/statfs_ext.c | 14 +- .../wrappers/system-version-compat-support.h | 32 + libsyscall/wrappers/system-version-compat.c | 33 +- libsyscall/wrappers/utimensat.c | 3 + makedefs/MakeInc.cmd | 45 +- makedefs/MakeInc.def | 137 +- makedefs/MakeInc.kernel | 13 +- makedefs/MakeInc.rule | 8 +- makedefs/MakeInc.top | 22 +- .../UserNotification/KUNCUserNotifications.c | 3 +- osfmk/arm/arm_features.inc | 2 + osfmk/arm/arm_init.c | 59 +- osfmk/arm/commpage/commpage.c | 26 + osfmk/arm/commpage/commpage.h | 3 + osfmk/arm/commpage/commpage_asm.s | 12 +- osfmk/arm/cpu_capabilities.h | 6 + osfmk/arm/cpu_capabilities_public.h | 3 +- osfmk/arm/cpu_common.c | 60 +- osfmk/arm/cpu_data.h | 6 + osfmk/arm/cpu_data_internal.h | 14 +- osfmk/arm/cpu_internal.h | 6 +- osfmk/arm/cpu_topology.h | 69 - osfmk/arm/cpuid.c | 16 +- osfmk/arm/cpuid.h | 22 +- osfmk/arm/data.s | 4 +- osfmk/arm/io_map.c | 4 +- osfmk/arm/locks.h | 4 + osfmk/arm/machine_cpu.h | 1 + osfmk/arm/machine_cpuid.c | 2 +- osfmk/arm/machine_routines.h | 198 +- osfmk/arm/machine_routines_apple.c | 5 + osfmk/arm/machine_routines_common.c | 342 +- osfmk/arm/misc_protos.h | 26 +- osfmk/arm/model_dep.c | 134 +- osfmk/arm/pmap/pmap.c | 634 +- osfmk/arm/pmap/pmap.h | 27 +- osfmk/arm/pmap/pmap_data.c | 34 +- osfmk/arm/pmap/pmap_data.h | 44 +- osfmk/arm/pmap/pmap_misc.c | 6 +- osfmk/arm/pmap/pmap_ppl_interface.c | 4 +- osfmk/arm/pmap/pmap_pt_geometry.h | 17 +- osfmk/arm/preemption_disable.c | 139 +- osfmk/arm/preemption_disable_internal.h | 30 +- osfmk/arm/task.h | 3 + osfmk/arm/thread.h | 29 +- osfmk/arm64/Makefile | 1 + osfmk/arm64/amcc_rorgn.h | 49 +- osfmk/arm64/apt.c | 55 + osfmk/arm64/arm_vm_init.c | 96 +- osfmk/arm64/asm.h | 1 + osfmk/arm64/bcopy.s | 10 +- osfmk/arm64/bti_telemetry.c | 549 -- osfmk/arm64/bzero.s | 24 +- osfmk/arm64/caches_asm.s | 4 +- osfmk/arm64/copyio.c | 11 +- osfmk/arm64/cpc_arm64_events.c | 158 +- osfmk/arm64/cpu.c | 4 - osfmk/arm64/cswitch.s | 17 +- osfmk/arm64/dbgwrap.c | 31 +- osfmk/arm64/exception_asm.h | 2 +- osfmk/arm64/genassym.c | 6 +- osfmk/arm64/hibernate_arm64.c | 67 +- osfmk/arm64/hibernate_restore.c | 5 + osfmk/arm64/iofilter_asm.s | 2 +- osfmk/arm64/locore.s | 119 +- osfmk/arm64/loose_ends.c | 101 +- osfmk/arm64/lowmem_vectors.c | 4 + osfmk/arm64/lz4_decode_arm64.s | 2 +- osfmk/arm64/lz4_encode_arm64.s | 2 +- osfmk/arm64/machine_routines.c | 133 +- osfmk/arm64/machine_routines_asm.s | 350 +- osfmk/arm64/memcmp_zero.s | 2 +- osfmk/arm64/monotonic_arm64.c | 33 +- osfmk/arm64/pac_asm.h | 2 +- osfmk/arm64/pcb.c | 21 +- osfmk/arm64/pinst.s | 4 +- osfmk/arm64/platform_tests.c | 234 +- osfmk/arm64/platform_tests_asm.s | 79 +- 
osfmk/arm64/proc_reg.h | 673 +- osfmk/arm64/sleh.c | 202 +- osfmk/arm64/sptm/arm_init_sptm.c | 168 +- osfmk/arm64/sptm/pmap/pmap.c | 2202 +++--- osfmk/arm64/sptm/pmap/pmap.h | 27 +- osfmk/arm64/sptm/pmap/pmap_data.c | 401 +- osfmk/arm64/sptm/pmap/pmap_data.h | 349 +- osfmk/arm64/sptm/pmap/pmap_internal.h | 66 +- osfmk/arm64/sptm/pmap/pmap_misc.c | 10 +- osfmk/arm64/sptm/pmap/pmap_ppl_interface.c | 4 +- osfmk/arm64/sptm/pmap/pmap_pt_geometry.h | 17 +- osfmk/arm64/sptm/sptm.h | 15 + osfmk/arm64/start.s | 23 +- osfmk/arm64/strncmp.s | 2 +- osfmk/arm64/strnlen.s | 11 +- osfmk/conf/Makefile.template | 9 +- osfmk/conf/files | 9 +- osfmk/conf/files.arm64 | 4 +- osfmk/console/serial_protos.h | 3 +- osfmk/corpses/corpse.c | 1 + osfmk/device/device.defs | 17 +- osfmk/device/device_init.c | 14 - osfmk/device/device_types.h | 5 +- osfmk/device/iokit_rpc.c | 190 +- osfmk/i386/AT386/model_dep.c | 6 + osfmk/i386/commpage/commpage.c | 7 + osfmk/i386/cpu.c | 6 + osfmk/i386/cpu_topology.c | 2 +- osfmk/i386/i386_vm_init.c | 4 +- osfmk/i386/machine_routines.c | 7 + osfmk/i386/pcb.c | 1 - osfmk/i386/pmCPU.c | 8 + osfmk/i386/pmap_x86_common.c | 17 + osfmk/i386/trap.c | 2 +- osfmk/ipc/flipc.c | 640 -- osfmk/ipc/flipc.h | 160 - osfmk/ipc/ipc_entry.c | 89 +- osfmk/ipc/ipc_entry.h | 101 +- osfmk/ipc/ipc_eventlink.c | 12 +- osfmk/ipc/ipc_hash.c | 1 - osfmk/ipc/ipc_importance.c | 24 +- osfmk/ipc/ipc_init.c | 40 +- osfmk/ipc/ipc_init.h | 155 - osfmk/ipc/ipc_kmsg.c | 399 +- osfmk/ipc/ipc_kmsg.h | 8 - osfmk/ipc/ipc_mqueue.c | 247 +- osfmk/ipc/ipc_mqueue.h | 40 +- osfmk/ipc/ipc_notify.c | 198 +- osfmk/ipc/ipc_notify.h | 187 +- osfmk/ipc/ipc_object.c | 358 +- osfmk/ipc/ipc_object.h | 482 +- osfmk/ipc/ipc_policy.c | 962 ++- osfmk/ipc/ipc_policy.h | 517 +- osfmk/ipc/ipc_port.c | 784 +-- osfmk/ipc/ipc_port.h | 484 +- osfmk/ipc/ipc_pset.c | 103 +- osfmk/ipc/ipc_pset.h | 8 +- osfmk/ipc/ipc_right.c | 944 ++- osfmk/ipc/ipc_right.h | 30 +- osfmk/ipc/ipc_service_port.c | 177 +- osfmk/ipc/ipc_service_port.h | 106 +- osfmk/ipc/ipc_space.c | 72 +- osfmk/ipc/ipc_space.h | 76 +- osfmk/ipc/ipc_types.h | 215 +- osfmk/ipc/ipc_voucher.c | 20 +- osfmk/ipc/mach_debug.c | 129 +- osfmk/ipc/mach_kernelrpc.c | 35 +- osfmk/ipc/mach_msg.c | 21 +- osfmk/ipc/mach_port.c | 720 +- osfmk/kdp/kdp_common.c | 2 + osfmk/kdp/kdp_core.c | 41 +- osfmk/kdp/kdp_core.h | 3 +- osfmk/kdp/kdp_out_stage.h | 3 +- osfmk/kdp/ml/arm/kdp_machdep.c | 1 + osfmk/kdp/ml/arm/kdp_vm.c | 1 + osfmk/kdp/ml/x86_64/kdp_machdep.c | 1 + osfmk/kdp/output_stages/out_aea.c | 8 +- osfmk/kdp/output_stages/out_buffer.c | 8 +- osfmk/kdp/output_stages/out_disk.c | 8 +- osfmk/kdp/output_stages/out_lz4.c | 70 +- .../out_memory_backing_aware_buffer.c | 13 +- osfmk/kdp/output_stages/out_net.c | 6 +- osfmk/kdp/output_stages/out_progress_notify.c | 8 +- osfmk/kdp/output_stages/out_shmem.c | 231 +- osfmk/kdp/output_stages/out_zlib.c | 20 +- osfmk/kdp/processor_core.c | 48 +- osfmk/kdp/processor_core.h | 17 +- osfmk/kdp/sk_core.c | 1 + osfmk/kern/Makefile | 11 +- osfmk/kern/arcade.c | 1 + osfmk/kern/assert.h | 71 +- osfmk/kern/ast.c | 6 + osfmk/kern/ast.h | 3 +- osfmk/kern/audit_sessionport.c | 6 +- osfmk/kern/backtrace.c | 22 +- osfmk/kern/bits.h | 33 +- osfmk/kern/block_hint.h | 9 + osfmk/kern/bsd_kern.c | 25 +- osfmk/kern/btlog.h | 4 +- osfmk/kern/coalition.c | 20 +- osfmk/kern/compact_id.h | 4 +- osfmk/kern/cpu_data.h | 2 +- osfmk/kern/cs_blobs.h | 12 + osfmk/kern/debug.c | 139 +- osfmk/kern/debug.h | 24 +- osfmk/kern/epoch_sync.c | 2 +- osfmk/kern/exc_resource.h | 3 +- osfmk/kern/exception.c | 
60 +- osfmk/kern/exclaves.c | 154 +- osfmk/kern/exclaves_aoe.c | 527 ++ osfmk/kern/exclaves_aoe.h | 114 + osfmk/kern/exclaves_boot.c | 21 +- osfmk/kern/exclaves_driverkit.c | 159 +- osfmk/kern/exclaves_driverkit.h | 12 + osfmk/kern/exclaves_inspection.c | 5 +- osfmk/kern/exclaves_log.c | 16 +- osfmk/kern/exclaves_memory.c | 10 +- osfmk/kern/exclaves_resource.c | 298 +- osfmk/kern/exclaves_resource.h | 101 +- osfmk/kern/exclaves_sensor.c | 190 +- osfmk/kern/exclaves_sensor.h | 8 + osfmk/kern/exclaves_storage.c | 4 +- osfmk/kern/exclaves_tests.c | 5 +- osfmk/kern/exclaves_upcalls.c | 55 +- osfmk/kern/hibernate.c | 2 - osfmk/kern/host.c | 43 +- osfmk/kern/host_notify.c | 13 +- osfmk/kern/host_statistics.h | 3 + osfmk/kern/hv_io_notifier.c | 9 +- osfmk/kern/ipc_clock.c | 3 +- osfmk/kern/ipc_host.c | 39 +- osfmk/kern/ipc_kobject.c | 744 +- osfmk/kern/ipc_kobject.h | 300 +- osfmk/kern/ipc_misc.c | 6 +- osfmk/kern/ipc_tt.c | 864 +-- osfmk/kern/ipc_tt.h | 73 +- osfmk/kern/kalloc.c | 197 +- osfmk/kern/kalloc.h | 48 +- osfmk/kern/kcdata.h | 73 +- osfmk/kern/kcdata_private.h | 29 + osfmk/kern/kern_cdata.c | 6 +- osfmk/kern/kern_stackshot.c | 370 +- osfmk/kern/kern_stackshot.h | 2 + osfmk/kern/kern_types.h | 14 + osfmk/kern/ledger.c | 241 +- osfmk/kern/ledger.h | 27 +- osfmk/kern/lock_group.h | 4 +- osfmk/kern/lock_mtx.c | 12 +- osfmk/kern/lock_rw.c | 15 + osfmk/kern/lock_ticket.c | 4 - osfmk/kern/mach_node.c | 903 --- osfmk/kern/mach_node.h | 258 - osfmk/kern/mach_node_link.h | 265 - osfmk/kern/machine.c | 309 +- osfmk/kern/machine.h | 2 + osfmk/kern/misc_protos.h | 1 + osfmk/kern/mk_sp.c | 2 - osfmk/kern/mk_timer.c | 76 +- osfmk/kern/policy_internal.h | 11 +- osfmk/kern/printf.c | 1 + osfmk/kern/priority.c | 4 +- osfmk/kern/processor.c | 232 +- osfmk/kern/processor.h | 94 +- osfmk/kern/queue.h | 4 +- osfmk/kern/sched.h | 22 +- osfmk/kern/sched_amp.c | 27 +- osfmk/kern/sched_amp_common.c | 3 +- osfmk/kern/sched_clutch.c | 1225 +++- osfmk/kern/sched_clutch.h | 28 +- osfmk/kern/sched_common.c | 111 + osfmk/kern/sched_common.h | 136 + osfmk/kern/sched_dualq.c | 20 +- osfmk/kern/sched_hygiene.h | 3 + osfmk/kern/sched_prim.c | 1549 +--- osfmk/kern/sched_prim.h | 63 +- osfmk/kern/sched_rt.c | 1495 ++++ osfmk/kern/sched_rt.h | 187 + osfmk/kern/sfi.c | 6 +- osfmk/kern/smr.c | 2 +- osfmk/kern/smr_hash.h | 6 + osfmk/kern/socd_client.c | 13 + osfmk/kern/socd_client.h | 11 +- osfmk/kern/stack.c | 4 +- osfmk/kern/startup.c | 62 +- osfmk/kern/startup.h | 52 +- osfmk/kern/static_if_common.c | 15 +- osfmk/kern/sync_sema.c | 12 +- osfmk/kern/syscall_subr.c | 14 +- osfmk/kern/syscall_sw.c | 4 +- osfmk/kern/task.c | 772 +- osfmk/kern/task.h | 154 +- osfmk/kern/task_ident.c | 39 +- osfmk/kern/task_policy.c | 142 +- osfmk/kern/telemetry.c | 34 +- osfmk/kern/thread.c | 110 +- osfmk/kern/thread.h | 97 +- osfmk/kern/thread_act.c | 149 +- osfmk/kern/thread_call.c | 29 +- osfmk/kern/thread_group.c | 208 +- osfmk/kern/thread_group.h | 1 + osfmk/kern/thread_policy.c | 14 +- osfmk/kern/ticket_lock.h | 3 - osfmk/kern/timeout.c | 310 + osfmk/kern/timeout.h | 112 + osfmk/kern/timeout_decl.h | 53 + osfmk/kern/timer_call.c | 5 +- osfmk/kern/timer_call.h | 1 + osfmk/kern/trap_telemetry.c | 100 +- osfmk/kern/trap_telemetry.h | 51 +- osfmk/kern/turnstile.c | 24 +- osfmk/kern/upsi.h | 70 + osfmk/kern/ux_handler.c | 1 + osfmk/kern/waitq.c | 96 +- osfmk/kern/waitq.h | 17 +- osfmk/kern/work_interval.c | 12 +- osfmk/kern/zalloc.c | 1018 +-- osfmk/kern/zalloc.h | 114 +- osfmk/kern/zalloc_internal.h | 18 +- osfmk/kperf/kptimer.c | 4 +- 
osfmk/kperf/task_samplers.c | 15 +- osfmk/kperf/task_samplers.h | 3 + osfmk/mach/Makefile | 3 + osfmk/mach/arm/_structs.h | 16 +- osfmk/mach/arm/exception.h | 8 +- osfmk/mach/arm/thread_status.h | 5 - osfmk/mach/arm/vm_param.h | 73 +- osfmk/mach/coalition.h | 1 + osfmk/mach/exception_types.h | 7 + osfmk/mach/exclaves.h | 149 +- osfmk/mach/host_info.h | 12 +- osfmk/mach/i386/vm_param.h | 6 + osfmk/mach/mach_port.defs | 4 +- osfmk/mach/mach_traps.h | 14 + osfmk/mach/mach_types.defs | 2 + osfmk/mach/mach_vm.defs | 39 +- osfmk/mach/machine.h | 12 +- osfmk/mach/memory_entry.defs | 18 + osfmk/mach/memory_object_types.h | 8 +- osfmk/mach/message.h | 99 +- osfmk/mach/mk_timer.h | 8 + osfmk/mach/port.h | 238 +- osfmk/mach/sfi_class.h | 8 +- osfmk/mach/syscall_sw.h | 3 + osfmk/mach/task_info.h | 32 +- osfmk/mach/task_policy.h | 1 + osfmk/mach/task_policy_private.h | 14 +- osfmk/mach/thread_info.h | 2 +- osfmk/mach/vm_param.h | 14 +- osfmk/mach/vm_reclaim_private.h | 155 +- osfmk/mach/vm_region.h | 18 + osfmk/mach/vm_statistics.h | 147 +- osfmk/mach/vm_types.h | 15 + osfmk/mach_debug/ipc_info.h | 75 + osfmk/mach_debug/mach_debug_types.defs | 2 + osfmk/machine/machine_routines.h | 37 + osfmk/machine/static_if.h | 21 +- osfmk/machine/trap.h | 25 + osfmk/man/vm_copy.html | 6 +- osfmk/prng/prng_random.c | 26 +- osfmk/tests/kernel_tests.c | 53 + osfmk/tests/pmap_tests.c | 253 +- osfmk/tests/ptrauth_data_tests.c | 4 +- osfmk/tests/vm_parameter_validation.h | 348 +- osfmk/tests/vm_parameter_validation_kern.c | 196 +- osfmk/vm/Makefile | 4 +- osfmk/vm/analytics.c | 16 +- osfmk/vm/bsd_vm.c | 48 +- osfmk/vm/device_vm.c | 2 - osfmk/vm/pmap.h | 58 +- osfmk/vm/pmap_cs.h | 2 +- osfmk/vm/vm32_user.c | 5 + osfmk/vm/vm_apple_protect.c | 5 +- osfmk/vm/vm_compressor.c | 171 +- osfmk/vm/vm_compressor_backing_store.c | 89 +- osfmk/vm/vm_compressor_internal.h | 3 +- osfmk/vm/vm_compressor_pager.c | 3 +- osfmk/vm/vm_compressor_pager_xnu.h | 1 + osfmk/vm/vm_compressor_xnu.h | 26 + osfmk/vm/vm_debug.c | 25 +- osfmk/vm/vm_dyld_pager.c | 296 +- osfmk/vm/vm_dyld_pager_internal.h | 1 + osfmk/vm/vm_fault.c | 330 +- osfmk/vm/vm_fault_internal.h | 3 +- osfmk/vm/vm_init.c | 1 - osfmk/vm/vm_iokit.h | 15 +- osfmk/vm/vm_kern.c | 295 +- osfmk/vm/vm_kern_xnu.h | 20 +- osfmk/vm/vm_lock_perf.h | 329 + osfmk/vm/vm_log.h | 55 + osfmk/vm/vm_map.c | 2488 ++++--- osfmk/vm/vm_map.h | 17 + osfmk/vm/vm_map_internal.h | 36 +- osfmk/vm/vm_map_store.c | 12 +- osfmk/vm/vm_map_store_internal.h | 11 +- osfmk/vm/vm_map_store_ll.c | 27 +- osfmk/vm/vm_map_store_rb.c | 52 +- osfmk/vm/vm_map_xnu.h | 128 +- osfmk/vm/vm_memory_entry.c | 223 +- osfmk/vm/vm_memory_entry.h | 12 + osfmk/vm/vm_memory_entry_xnu.h | 4 + osfmk/vm/vm_memtag.c | 12 + osfmk/vm/vm_memtag.h | 12 +- osfmk/vm/vm_object.c | 293 +- osfmk/vm/vm_object_internal.h | 21 +- osfmk/vm/vm_object_xnu.h | 28 +- osfmk/vm/vm_options.h | 14 + osfmk/vm/vm_page.h | 506 +- osfmk/vm/vm_page_internal.h | 358 +- osfmk/vm/vm_pageout.c | 686 +- osfmk/vm/vm_pageout_xnu.h | 29 +- osfmk/vm/vm_phantom_cache.c | 8 + osfmk/vm/vm_pmap.c | 37 + osfmk/vm/vm_protos.h | 7 +- osfmk/vm/vm_purgeable.c | 9 +- osfmk/vm/vm_reclaim.c | 748 +- osfmk/vm/vm_reclaim_internal.h | 19 +- osfmk/vm/vm_reclaim_xnu.h | 6 + osfmk/vm/vm_resident.c | 3161 +++++---- osfmk/vm/vm_sanitize.c | 7 +- osfmk/vm/vm_sanitize_internal.h | 2 +- osfmk/vm/vm_sanitize_telemetry.c | 4 +- osfmk/vm/vm_shared_region.c | 682 +- osfmk/vm/vm_shared_region_internal.h | 3 +- osfmk/vm/vm_shared_region_pager.c | 5 +- osfmk/vm/vm_shared_region_xnu.h | 7 +- 
osfmk/vm/vm_swapfile_pager.c | 4 +- osfmk/vm/vm_tests.c | 943 ++- osfmk/vm/vm_upl.c | 3 + osfmk/vm/vm_user.c | 186 +- osfmk/x86_64/pmap.c | 30 +- pexpert/arm/hwtrace/hwtrace.c | 55 +- pexpert/arm/pe_init.c | 18 +- pexpert/arm/pe_kprintf.c | 1 + pexpert/arm/pe_serial.c | 2 +- pexpert/conf/Makefile.template | 4 + pexpert/gen/bootargs.c | 5 +- pexpert/gen/device_tree.c | 6 +- pexpert/gen/pe_gen.c | 2 +- pexpert/i386/pe_init.c | 3 +- pexpert/i386/pe_kprintf.c | 3 + pexpert/pexpert/arm64/H16.h | 5 + pexpert/pexpert/arm64/VMAPPLE.h | 3 + pexpert/pexpert/arm64/apple_arm64_common.h | 19 +- pexpert/pexpert/arm64/apple_arm64_regs.h | 2 + pexpert/pexpert/arm64/board_config.h | 18 +- pexpert/pexpert/arm64/boot.h | 11 +- pexpert/pexpert/pexpert.h | 6 +- san/conf/Makefile.template | 10 +- san/conf/files | 2 +- san/coverage/Makefile | 8 +- san/coverage/kcov-blacklist-arm64 | 18 - .../{kcov-blacklist => kcov-denylist} | 13 + san/coverage/kcov-denylist-arm64 | 48 + ...-blacklist-x86_64 => kcov-denylist-x86_64} | 2 +- san/coverage/kcov.c | 215 +- san/coverage/kcov.h | 34 +- san/coverage/kcov_data.h | 2 +- san/coverage/kcov_ksancov.c | 316 +- san/coverage/kcov_ksancov.h | 46 +- san/coverage/kcov_ksancov_data.h | 60 + san/memory/Makefile | 6 +- san/memory/kasan-arm64.c | 8 +- san/memory/kasan-classic.h | 2 +- .../{kasan-blacklist => kasan-denylist} | 8 +- ...n-blacklist-arm64 => kasan-denylist-arm64} | 6 +- ...acklist-dynamic => kasan-denylist-dynamic} | 6 +- ...blacklist-x86_64 => kasan-denylist-x86_64} | 4 +- san/memory/kasan-memintrinsics.c | 127 +- san/memory/kasan-report.c | 2 +- san/memory/kasan-tbi-arm64.h | 4 +- san/memory/kasan-tbi.h | 2 +- san/memory/kasan-test.c | 14 +- san/memory/kasan.c | 16 +- ...c_blacklist.c => kasan_dynamic_denylist.c} | 170 +- san/memory/kasan_internal.h | 12 +- san/memory/memintrinsics.h | 124 +- .../{ubsan-blacklist => ubsan-denylist} | 0 ...cklist.py => generate_dynamic_denylist.py} | 6 +- san/tools/ksancov.c | 108 +- san/tools/ksancov.h | 164 +- san/tools/validate_blacklist.sh | 28 - san/tools/validate_denylist.sh | 28 + security/conf/Makefile.template | 4 + security/mac_audit.c | 2 +- security/mac_framework.h | 7 + security/mac_label.c | 4 +- security/mac_mach.c | 12 +- security/mac_mach_internal.h | 7 +- security/mac_policy.h | 144 +- security/mac_process.c | 17 + security/mac_skywalk.c | 16 +- security/mac_vfs.c | 154 +- tests/Makefile | 294 +- tests/accept_race.c | 4 + tests/aio.c | 564 ++ tests/arm_cpu_capabilities.c | 4 +- tests/arm_matrix.c | 3 +- tests/bingrade.c | 101 + tests/bingrade_helper.c | 53 + tests/coalition_policy.c | 13 + tests/coalition_policy_unentitled.c | 4 +- tests/codesigntests.c | 17 +- tests/context_helpers.h | 44 + tests/cpu_counters/cpc_security_tests.c | 11 +- tests/cpu_counters/kpc_tests.c | 8 +- tests/cpucount.c | 22 +- tests/decompression_failure.c | 44 +- tests/dev_zero.c | 22 + ...conditioner.c => disk_mount_conditioner.m} | 73 +- tests/ecc_test.c | 12 +- ...ate_conn_port_with_port_array.entitlements | 8 + ...nced-security-binary-entitlements-1.plist} | 0 ...nced-security-binary-entitlements-2.plist} | 0 .../hardened-heap-security.entitlements | 10 + ...ntitlements => hardened-heap.entitlements} | 2 + .../hardened-proc-invalid.entitlements | 10 + .../hardened-proc-security.entitlements | 8 + .../entitlements/platform-restrictions.plist | 10 + .../port_type_policy.entitlements | 12 + tests/entitlements/tpro.plist | 10 + tests/exc_guard_helper.c | 312 + tests/exc_guard_helper.h | 138 + tests/exc_guard_helper_test.c | 307 + 
tests/exc_guard_helper_test_unexpected.c | 118 + tests/exc_helpers.c | 227 +- tests/exc_helpers.h | 28 +- tests/exception_tests.c | 44 - tests/exec_set_proc_name.c | 50 + tests/extract_right_soft_fail.c | 136 - tests/flow_div_doubleconnect_55917185.c | 5 + tests/fp_exception.c | 9 +- tests/host_statistics_rate_limiting.c | 55 +- tests/imm_pinned_control_port.c | 4 +- tests/imm_pinned_control_port_crasher.c | 14 +- tests/inet6_addr_mode.c | 131 - tests/ioc_str.h | 7 +- tests/iokit/ioserviceusernotification_race.c | 163 +- tests/iokit/testiodataqueues.c | 86 + tests/iokit/testiodataqueues.entitlements | 12 + tests/iopolicy.c | 6 + tests/ip_pktinfo.c | 409 ++ tests/ipc/hardened_exceptions.c | 17 +- tests/ipc/ipc_read_inspect.c | 29 +- tests/ipc/ipc_thread_ports_race.c | 6 +- tests/ipc/ipcpv_telemetry_test.c | 31 + tests/ipc/mach_exc_port_substitute.c | 13 +- tests/ipc/mach_msg2.c | 11 +- tests/ipc/mach_msg_transport.c | 114 +- tests/ipc/mach_port_construct_errors.c | 146 + tests/ipc/mach_port_description.c | 87 + .../ipc/platform_restrictions_entitlements.c | 59 + tests/ipc/port_api.c | 36 + tests/ipc/port_peek.c | 2 +- tests/ipc/port_turnstile_stash.c | 2 +- tests/ipc/port_type_policy.c | 887 +++ tests/ipc/sys_perf_notify_test.c | 4 +- tests/ipc/tpro_entitlements.c | 60 + tests/ipv6_bind_race.c | 4 + tests/kern-trial.entitlements | 8 + tests/ktrace/kdebug_tests.c | 65 +- tests/ledger_entry_info_v2.c | 57 + tests/libmalloc_apple_array.c | 47 +- tests/mach_eventlink.c | 138 +- tests/mach_service_port.c | 26 +- tests/mcast_group_race_82820812.c | 6 + tests/mcast_ssm.c | 105 + tests/memorystatus_is_assertion.c | 3 +- tests/microstackshot_tests.c | 8 - tests/mktimer_kobject.c | 68 - tests/net_bridge.c | 8 +- tests/net_siocdifaddr.c | 111 + tests/net_test_lib.c | 186 +- tests/net_test_lib.h | 14 +- tests/net_tuntests.c | 159 +- tests/nox86exec.c | 26 + tests/nox86exec_helper.c | 8 + tests/nvram_tests/nvram_nonentitled.c | 14 + tests/os_refcnt.c | 2 +- tests/pac_exception_entitlement.c | 3 +- tests/perf_vmfault.c | 464 -- tests/pmap_fault_on_commpage.c | 67 + tests/pmap_stress.c | 34 + tests/poll.c | 5 + tests/posix_sem.c | 282 + tests/posix_sem_namespace_helper.c | 85 + tests/posix_spawn_file_actions.c | 24 +- tests/prng.c | 15 + tests/proc_archinfo.c | 61 + tests/proc_info.c | 29 +- tests/ptrauth_failure.c | 3 +- tests/recount/coalition_info_tests.c | 20 + tests/recount/recount_test_utils.c | 14 +- tests/recount/recount_test_utils.h | 2 +- tests/recount/recount_tests.c | 87 +- tests/recount/thread_selfcounts_tests.c | 58 +- tests/recv_link_addr_type.c | 428 ++ tests/reply_port_defense.c | 340 +- tests/reply_port_defense_client.c | 425 +- tests/rm/coalition_info_resource_usage.c | 176 + tests/runaway_mitigation.c | 582 ++ tests/runaway_mitigation.entitlements | 10 + tests/sched/Makefile | 11 + tests/sched/all_cores_running.c | 4 +- tests/sched/cluster_bound_threads.c | 21 +- tests/sched/clutch_runqueue.c | 4 +- tests/sched/edge_migration.c | 351 +- tests/sched/edge_runqueue.c | 2 +- tests/sched/rt_migration.c | 396 ++ tests/sched/rttimer.c | 291 + tests/sched/rttimer.entitlements | 8 + tests/sched/rttimer.workload_config.plist | 27 + tests/sched/sched_test_harness/Makefile | 93 - .../sched_test_harness/sched_clutch_harness.c | 24 +- .../sched_test_harness/sched_clutch_harness.h | 2 +- .../sched_clutch_harness_impl.c | 182 +- .../sched_test_harness/sched_edge_harness.c | 245 +- .../sched_test_harness/sched_edge_harness.h | 14 + .../sched_test_harness/sched_harness_impl.h | 13 +- 
.../sched_migration_harness.c | 65 +- .../sched_migration_harness.h | 12 +- .../sched_policy_darwintest.h | 40 +- .../sched_runqueue_harness.c | 71 +- .../sched_runqueue_harness.h | 8 +- .../shadow_headers/misc_needed_defines.h | 20 + .../shadow_headers/misc_needed_deps.c | 64 + .../shadow_headers/sched_prim.c | 55 +- tests/sched/sched_test_utils.c | 129 +- tests/sched/sched_test_utils.h | 15 +- tests/{ => sched}/setitimer.c | 133 +- tests/sched/thread_group_fairness.c | 174 + tests/sched/zero_to_n_tests.c | 277 +- tests/select_stress.c | 45 +- tests/sendmsg_test.c | 149 + tests/signal_exit_reason.c | 73 +- tests/signal_initproc.c | 32 + tests/skywalk/skt_badring.c | 4 +- tests/skywalk/skt_bind.c | 44 +- tests/skywalk/skt_closecfd.c | 6 +- tests/skywalk/skt_debug_verify.c | 2 +- tests/skywalk/skt_features.c | 1 - tests/skywalk/skt_filter.c | 3 +- tests/skywalk/skt_flow.c | 88 + tests/skywalk/skt_fsw29301703.c | 4 +- tests/skywalk/skt_fullupipe.c | 8 +- tests/skywalk/skt_kqueue.c | 8 +- tests/skywalk/skt_mangle.c | 4 +- tests/skywalk/skt_manyflows.c | 2 +- tests/skywalk/skt_netifcompat.c | 10 +- tests/skywalk/skt_nslots.c | 2 +- tests/skywalk/skt_oneslot.c | 2 +- tests/skywalk/skt_pllupipe.c | 12 +- tests/skywalk/skt_restricted_port.c | 31 +- tests/skywalk/skt_ringid.c | 6 +- tests/skywalk/skt_shutdown.c | 2 +- tests/skywalk/skt_shutdown2.c | 2 +- tests/skywalk/skt_steering.c | 119 + tests/skywalk/skt_teardown.c | 4 +- tests/skywalk/skt_utun27302538.c | 2 +- tests/skywalk/skt_utunloop.c | 95 +- tests/skywalk/skt_writemem.c | 12 +- tests/skywalk/skt_xfer.c | 172 +- tests/skywalk/skywalk_mptest_driver.c | 28 + tests/skywalk/skywalk_mptests.c | 25 +- tests/skywalk/skywalk_test_common.c | 74 +- tests/skywalk/skywalk_test_common.h | 16 +- tests/skywalk/skywalk_test_driver.c | 3 +- tests/skywalk/skywalk_test_driver.h | 8 +- tests/skywalk/skywalk_test_utils.c | 65 +- tests/skywalk/skywalk_test_utils.h | 14 +- tests/skywalk_test.entitlements | 6 +- tests/skywalk_tests.c | 68 +- tests/socket_bind_35243417.c | 5 + tests/socket_bind_35685803.c | 4 + tests/socket_v4mappedv6.c | 3 +- tests/stackshot_tests.m | 266 +- tests/sysctl_hw.c | 7 + tests/sysctl_wire_limits.c | 12 +- tests/task_suspend_stats.c | 3 + tests/task_vm_info_decompressions.c | 110 +- tests/tcp_cache_entitlements.plist | 10 + tests/tcp_cache_test.c | 194 + tests/tcp_input_outputopts_uaf_56155583.c | 4 + tests/test_utils.c | 26 + tests/test_utils.h | 7 + tests/trial_experiment_factors.c | 81 + tests/try_read_write.c | 356 + tests/try_read_write.h | 86 + tests/try_read_write_test.c | 299 + tests/try_read_write_test_unexpected.c | 126 + tests/udp_kao_opt.c | 3 +- tests/unit/Makefile | 346 + tests/unit/README.md | 210 + tests/unit/bits_test.c | 59 + tests/unit/debugger_xcall_test.c | 57 + tests/unit/ecc_test_remove_duplicates.c | 144 + tests/unit/example_dir/example_test_in_dir.c | 48 + tests/unit/example_test_bsd.c | 51 + tests/unit/example_test_iokit.cpp | 43 + tests/unit/example_test_osfmk.c | 52 + tests/unit/fibers_test.c | 194 + tests/unit/mach_vm_range_contains.c | 134 + tests/unit/mocks/dt_proxy.c | 89 + tests/unit/mocks/dt_proxy.h | 77 + tests/unit/mocks/fake_kinit.c | 119 + tests/unit/mocks/fibers/checker.c | 345 + tests/unit/mocks/fibers/checker.h | 49 + tests/unit/mocks/fibers/condition.c | 131 + tests/unit/mocks/fibers/condition.h | 46 + tests/unit/mocks/fibers/fibers.c | 411 ++ tests/unit/mocks/fibers/fibers.h | 349 + tests/unit/mocks/fibers/mutex.c | 172 + tests/unit/mocks/fibers/mutex.h | 44 + tests/unit/mocks/fibers/random.c 
| 82 + tests/unit/mocks/fibers/random.h | 35 + tests/unit/mocks/fibers/rwlock.c | 486 ++ tests/unit/mocks/fibers/rwlock.h | 63 + tests/unit/mocks/mock_3rd_party.c | 107 + tests/unit/mocks/mock_alloc.c | 156 + tests/unit/mocks/mock_attached.c | 162 + tests/unit/mocks/mock_cpu.c | 38 + tests/unit/mocks/mock_cpu.h | 42 + tests/unit/mocks/mock_dynamic.h | 286 + tests/unit/mocks/mock_mem.c | 86 + tests/unit/mocks/mock_misc.c | 211 + tests/unit/mocks/mock_misc.h | 41 + tests/unit/mocks/mock_pmap.c | 127 + tests/unit/mocks/mock_pmap.h | 53 + tests/unit/mocks/mock_thread.c | 1741 +++++ tests/unit/mocks/mock_thread.h | 115 + tests/unit/mocks/mock_unimpl.c | 44 + tests/unit/mocks/san_attached.c | 101 + tests/unit/mocks/std_safe.h | 199 + tests/unit/mocks/unit_test_utils.c | 114 + tests/unit/mocks/unit_test_utils.h | 97 + tests/unit/mocks_test.c | 305 + tests/unit/panic_path_test.c | 125 + tests/unit/pmap_steal_memory_overflow.c | 75 + tests/unit/tools/fibers_lldb.py | 390 ++ tests/unit/tools/generate_ut_proj.py | 757 ++ tests/unit/tools/get_target_details.py | 30 + tests/unit/tools/make_run_unittests.py | 43 + tests/unit/tools/merge_cmds_json.py | 56 + tests/unit/tools/quote_defines.py | 27 + tests/unit/tools/sanitizers-ignorelist | 30 + tests/unit/tools/xnu_lib.unexport | 25 + tests/unp_connect_thread_uaf.c | 4 + tests/unp_sock_release.c | 3 +- tests/unrecoverable_trap_test.c | 255 + tests/vfs/devfd_access.c | 120 + tests/vfs/devfd_access.entitlements | 8 + tests/vfs/direntries_permissions.c | 163 + tests/vfs/fmount_funmount.c | 115 + tests/vfs/getattrlist_fullpath.c | 67 + tests/vfs/getattrlist_mountextflags.c | 8 +- tests/vfs/linkat_flags.c | 203 + tests/vfs/longpaths.c | 6 + tests/vfs/named_fork_path.c | 173 + tests/vfs/open_symlink.c | 111 + tests/vfs/open_unique.c | 126 + tests/vfs/openbyid_stress.c | 182 + tests/vfs/openbyid_stress.entitlements | 8 + tests/vfs/resolve_beneath.c | 1018 ++- tests/vfs/resolve_namespace.c | 327 + tests/vfs/sandbox_appledouble_write.c | 195 + tests/vfs/sandbox_fstat.c | 159 + tests/vfs/sandbox_type_error.c | 169 + tests/vfs/statfs_ext.c | 4 +- tests/vfs/symlink_trailing_slash.c | 196 + tests/vfs/unlinkat_nodeletebusy.c | 108 + tests/vfs/volfs_chroot.c | 89 + tests/vm/Makefile | 14 +- tests/vm/compression_sweep.c | 2 + tests/vm/configurator/vm_configurator.c | 4322 ++++++++++++ tests/vm/configurator/vm_configurator.h | 1522 ++++ .../vm/configurator/vm_configurator_helpers.h | 137 + tests/vm/configurator/vm_configurator_tests.h | 1924 +++++ tests/vm/configurator_fault.c | 536 ++ tests/vm/configurator_mincore.c | 247 + tests/vm/configurator_mmap.c | 832 +++ tests/vm/configurator_test.c | 2124 ++++++ tests/vm/configurator_vm_allocate.c | 1019 +++ tests/vm/configurator_vm_behavior_set.c | 909 +++ tests/vm/configurator_vm_deallocate.c | 590 ++ tests/vm/configurator_vm_inherit.c | 210 + tests/vm/configurator_vm_protect.c | 385 + tests/vm/configurator_vm_wire.c | 696 ++ tests/vm/corpse_footprint.c | 128 + tests/vm/corpse_owned_vmobjects.c | 2 + tests/vm/entitlement_increased_memory_limit.c | 1 + tests/vm/entitlement_internal_bands.c | 2 +- .../vm/memory-ownership-transfer.entitlements | 8 + tests/vm/memorystatus_convert_limit_bytes.c | 1 + tests/vm/memorystatus_freeze_test.c | 34 +- tests/vm/memorystatus_kill_counts.c | 14 +- tests/vm/memorystatus_rearm.c | 328 + tests/vm/memorystatus_rearm.entitlements | 12 + tests/vm/memorystatus_sort_test.c | 352 +- tests/vm/mixed_pagesize.plist | 4 +- tests/vm/test_vm_no_pager.m | 9 + tests/vm/test_vm_no_pager_helper.c | 94 +- 
tests/vm/upl.c | 642 ++ tests/vm/upl.entitlements | 16 + tests/vm/vectorupl.c | 98 + tests/vm/vm_allocation.c | 880 ++- tests/vm/vm_memory_entry.c | 119 + tests/vm/vm_parameter_validation.c | 342 +- tests/vm/vm_reclaim.c | 261 +- tests/vm/vm_reclaim.entitlements | 8 + tests/vm/vm_stress.cpp | 1483 ++++ tests/vm/vm_stress_slow.sh | 113 + tests/vm/vm_sysctl_tests.c | 30 +- tests/vm/vm_user.c | 15 +- tests/vm/zalloc.c | 4 +- tests/vm_test_mach_map.c | 315 + tests/vm_test_mach_map.plist | 14 +- tests/vsock.c | 258 +- tests/vsock_entitled.c | 65 + tests/vsock_entitlements.plist | 8 + tests/vsock_helpers.c | 267 + tests/vsock_helpers.h | 95 + tests/x18_entitled.c | 17 +- tests/x18_legacy.c | 17 +- tests/x18_unentitled.c | 17 +- tools/cocci/remove-cassert.cocci | 11 + tools/lldbmacros/core/__init__.py | 1 + tools/lldbmacros/core/kernelcore.py | 29 +- tools/lldbmacros/ioreg.py | 8 +- tools/lldbmacros/ipc.py | 2142 +++--- tools/lldbmacros/kcdata.py | 99 +- tools/lldbmacros/kmemory/btlog.py | 14 +- tools/lldbmacros/kmemory/kmem.py | 9 +- tools/lldbmacros/kmemory/zone.py | 64 +- tools/lldbmacros/ktrace.py | 99 +- tools/lldbmacros/mbufs.py | 65 +- tools/lldbmacros/memory.py | 240 +- tools/lldbmacros/misc.py | 37 +- tools/lldbmacros/net.py | 277 +- tools/lldbmacros/netdefines.py | 30 +- tools/lldbmacros/ntstat.py | 111 +- tools/lldbmacros/pmap.py | 65 +- tools/lldbmacros/process.py | 17 +- tools/lldbmacros/recount.py | 490 +- tools/lldbmacros/ruff.toml | 23 +- tools/lldbmacros/scheduler.py | 8 +- .../integration_smoke/test_lldb_macros.py | 1 - tools/lldbmacros/userspace.py | 2 +- tools/lldbmacros/utils.py | 15 +- tools/lldbmacros/waitq.py | 18 +- tools/lldbmacros/xnudefines.py | 8 +- tools/pre-commit.sh | 84 +- tools/syscall_map.lua | 6 + tools/tests/zero-to-n/Makefile | 2 +- tools/tests/zero-to-n/zero-to-n.c | 284 +- .../zero-to-n/zero_to_n_workload_config.plist | 63 + 1471 files changed, 130806 insertions(+), 54469 deletions(-) create mode 100644 EXTERNAL_HEADERS/_inttypes.h create mode 100644 EXTERNAL_HEADERS/inttypes.h create mode 100644 bsd/kern/Makefile delete mode 100644 bsd/kern/bsd_syscalls_stashtask.txt create mode 100644 bsd/kern/mem_acct.c create mode 100644 bsd/kern/mem_acct.h rename osfmk/arm64/bti_telemetry.h => bsd/kern/qsort.h (68%) create mode 100644 bsd/kern/uipc_domain.h create mode 100644 bsd/kern/uipc_mbuf_mcache.c create mode 100644 bsd/kern/uipc_socket.h create mode 100644 bsd/man/man9/byteorder.9 create mode 100644 bsd/net/aop/Makefile create mode 100644 bsd/net/aop/aop_flow_stats.h create mode 100644 bsd/net/aop/aop_stats.h create mode 100644 bsd/net/aop/kpi_aop.c create mode 100644 bsd/net/aop/kpi_aop.h create mode 100644 bsd/net/bpf_private.h create mode 100644 bsd/net/pktsched/pktsched_ops.c create mode 100644 bsd/net/pktsched/pktsched_ops.h create mode 100644 bsd/net/siphash.c create mode 100644 bsd/net/siphash.h create mode 100644 bsd/netinet/tcp_pacing.c create mode 100644 bsd/netinet/tcp_pacing.h create mode 100644 bsd/netinet/tcp_syncookie.c rename bsd/{nfs/nfsdiskless.h => netinet/tcp_syncookie.h} (63%) delete mode 100644 bsd/skywalk/nexus/monitor/Makefile delete mode 100644 bsd/skywalk/nexus/monitor/nx_monitor.c delete mode 100644 bsd/skywalk/nexus/monitor/nx_monitor.h create mode 100644 bsd/skywalk/nexus/nexus_traffic_rule.h create mode 100644 bsd/skywalk/nexus/nexus_traffic_rule_eth.c create mode 100644 bsd/skywalk/nexus/nexus_traffic_rule_eth.h create mode 100644 bsd/skywalk/nexus/nexus_traffic_rule_inet.c create mode 100644 
bsd/skywalk/nexus/nexus_traffic_rule_inet.h create mode 100644 bsd/sys/disk_private.h create mode 100644 bsd/sys/endian.h create mode 100644 bsd/sys/fcntl_private.h create mode 100644 bsd/sys/kern_control_private.h create mode 100644 bsd/sys/kern_event_private.h create mode 100644 bsd/sys/mem_acct_private.h create mode 100644 bsd/sys/sys_domain_private.h create mode 100644 bsd/sys/vsock_private.h create mode 100644 doc/debugging/extensible_paniclog.md create mode 100644 doc/mach_ipc/ipc_security_concepts.md create mode 100644 doc/mach_ipc/port_types.md create mode 100644 doc/observability/coalitions.md create mode 100644 doc/vm/pageout_scan.md create mode 100644 iokit/IOKit/IOCircularDataQueue.h create mode 100644 iokit/IOKit/IOCircularDataQueueImplementation.h create mode 100644 iokit/Kernel/IOCircularDataQueue.cpp create mode 100644 iokit/Tests/TestServices/TestIODataQueues.cpp create mode 100644 iokit/Tests/TestServices/TestIODataQueues.h create mode 100644 osfmk/arm64/apt.c delete mode 100644 osfmk/arm64/bti_telemetry.c delete mode 100644 osfmk/ipc/flipc.c delete mode 100644 osfmk/ipc/flipc.h delete mode 100644 osfmk/ipc/ipc_init.h create mode 100644 osfmk/kern/exclaves_aoe.c create mode 100644 osfmk/kern/exclaves_aoe.h create mode 100644 osfmk/kern/kcdata_private.h delete mode 100644 osfmk/kern/mach_node.c delete mode 100644 osfmk/kern/mach_node.h delete mode 100644 osfmk/kern/mach_node_link.h create mode 100644 osfmk/kern/sched_common.c create mode 100644 osfmk/kern/sched_common.h create mode 100644 osfmk/kern/sched_rt.c create mode 100644 osfmk/kern/sched_rt.h create mode 100644 osfmk/kern/timeout.c create mode 100644 osfmk/kern/timeout.h create mode 100644 osfmk/kern/timeout_decl.h create mode 100644 osfmk/kern/upsi.h create mode 100644 osfmk/vm/vm_lock_perf.h create mode 100644 osfmk/vm/vm_log.h delete mode 100644 san/coverage/kcov-blacklist-arm64 rename san/coverage/{kcov-blacklist => kcov-denylist} (69%) create mode 100644 san/coverage/kcov-denylist-arm64 rename san/coverage/{kcov-blacklist-x86_64 => kcov-denylist-x86_64} (98%) rename san/memory/{kasan-blacklist => kasan-denylist} (93%) rename san/memory/{kasan-blacklist-arm64 => kasan-denylist-arm64} (90%) rename san/memory/{kasan-blacklist-dynamic => kasan-denylist-dynamic} (80%) rename san/memory/{kasan-blacklist-x86_64 => kasan-denylist-x86_64} (96%) rename san/memory/{kasan_dynamic_blacklist.c => kasan_dynamic_denylist.c} (72%) rename san/memory/{ubsan-blacklist => ubsan-denylist} (100%) rename san/tools/{generate_dynamic_blacklist.py => generate_dynamic_denylist.py} (87%) delete mode 100755 san/tools/validate_blacklist.sh create mode 100755 san/tools/validate_denylist.sh create mode 100644 tests/aio.c create mode 100644 tests/bingrade.c create mode 100644 tests/bingrade_helper.c create mode 100644 tests/context_helpers.h rename tests/{disk_mount_conditioner.c => disk_mount_conditioner.m} (83%) create mode 100644 tests/entitlements/create_conn_port_with_port_array.entitlements rename tests/entitlements/{hardened-binary-entitlements-1.plist => enhanced-security-binary-entitlements-1.plist} (100%) rename tests/entitlements/{hardened-binary-entitlements-2.plist => enhanced-security-binary-entitlements-2.plist} (100%) create mode 100644 tests/entitlements/hardened-heap-security.entitlements rename tests/entitlements/{hardened-heap-standalone.entitlements => hardened-heap.entitlements} (81%) create mode 100644 tests/entitlements/hardened-proc-invalid.entitlements create mode 100644 
tests/entitlements/hardened-proc-security.entitlements create mode 100644 tests/entitlements/platform-restrictions.plist create mode 100644 tests/entitlements/port_type_policy.entitlements create mode 100644 tests/entitlements/tpro.plist create mode 100644 tests/exc_guard_helper.c create mode 100644 tests/exc_guard_helper.h create mode 100644 tests/exc_guard_helper_test.c create mode 100644 tests/exc_guard_helper_test_unexpected.c create mode 100644 tests/exec_set_proc_name.c delete mode 100644 tests/extract_right_soft_fail.c create mode 100644 tests/iokit/testiodataqueues.c create mode 100644 tests/iokit/testiodataqueues.entitlements create mode 100644 tests/ip_pktinfo.c create mode 100644 tests/ipc/ipcpv_telemetry_test.c create mode 100644 tests/ipc/mach_port_construct_errors.c create mode 100644 tests/ipc/mach_port_description.c create mode 100644 tests/ipc/platform_restrictions_entitlements.c create mode 100644 tests/ipc/port_type_policy.c create mode 100644 tests/ipc/tpro_entitlements.c create mode 100644 tests/kern-trial.entitlements create mode 100644 tests/ledger_entry_info_v2.c create mode 100644 tests/mcast_ssm.c delete mode 100644 tests/mktimer_kobject.c create mode 100644 tests/net_siocdifaddr.c create mode 100644 tests/nox86exec.c create mode 100644 tests/nox86exec_helper.c delete mode 100644 tests/perf_vmfault.c create mode 100644 tests/pmap_fault_on_commpage.c create mode 100644 tests/posix_sem.c create mode 100644 tests/posix_sem_namespace_helper.c create mode 100644 tests/proc_archinfo.c create mode 100644 tests/recv_link_addr_type.c create mode 100644 tests/rm/coalition_info_resource_usage.c create mode 100644 tests/runaway_mitigation.c create mode 100644 tests/runaway_mitigation.entitlements create mode 100644 tests/sched/rt_migration.c create mode 100644 tests/sched/rttimer.c create mode 100644 tests/sched/rttimer.entitlements create mode 100644 tests/sched/rttimer.workload_config.plist delete mode 100644 tests/sched/sched_test_harness/Makefile rename tests/{ => sched}/setitimer.c (88%) create mode 100644 tests/sendmsg_test.c create mode 100644 tests/signal_initproc.c create mode 100644 tests/tcp_cache_entitlements.plist create mode 100644 tests/tcp_cache_test.c create mode 100644 tests/trial_experiment_factors.c create mode 100644 tests/try_read_write.c create mode 100644 tests/try_read_write.h create mode 100644 tests/try_read_write_test.c create mode 100644 tests/try_read_write_test_unexpected.c create mode 100644 tests/unit/Makefile create mode 100644 tests/unit/README.md create mode 100644 tests/unit/bits_test.c create mode 100644 tests/unit/debugger_xcall_test.c create mode 100644 tests/unit/ecc_test_remove_duplicates.c create mode 100644 tests/unit/example_dir/example_test_in_dir.c create mode 100644 tests/unit/example_test_bsd.c create mode 100644 tests/unit/example_test_iokit.cpp create mode 100644 tests/unit/example_test_osfmk.c create mode 100644 tests/unit/fibers_test.c create mode 100644 tests/unit/mach_vm_range_contains.c create mode 100644 tests/unit/mocks/dt_proxy.c create mode 100644 tests/unit/mocks/dt_proxy.h create mode 100644 tests/unit/mocks/fake_kinit.c create mode 100644 tests/unit/mocks/fibers/checker.c create mode 100644 tests/unit/mocks/fibers/checker.h create mode 100644 tests/unit/mocks/fibers/condition.c create mode 100644 tests/unit/mocks/fibers/condition.h create mode 100644 tests/unit/mocks/fibers/fibers.c create mode 100644 tests/unit/mocks/fibers/fibers.h create mode 100644 tests/unit/mocks/fibers/mutex.c create mode 100644 
tests/unit/mocks/fibers/mutex.h create mode 100644 tests/unit/mocks/fibers/random.c create mode 100644 tests/unit/mocks/fibers/random.h create mode 100644 tests/unit/mocks/fibers/rwlock.c create mode 100644 tests/unit/mocks/fibers/rwlock.h create mode 100644 tests/unit/mocks/mock_3rd_party.c create mode 100644 tests/unit/mocks/mock_alloc.c create mode 100644 tests/unit/mocks/mock_attached.c create mode 100644 tests/unit/mocks/mock_cpu.c create mode 100644 tests/unit/mocks/mock_cpu.h create mode 100644 tests/unit/mocks/mock_dynamic.h create mode 100644 tests/unit/mocks/mock_mem.c create mode 100644 tests/unit/mocks/mock_misc.c create mode 100644 tests/unit/mocks/mock_misc.h create mode 100644 tests/unit/mocks/mock_pmap.c create mode 100644 tests/unit/mocks/mock_pmap.h create mode 100644 tests/unit/mocks/mock_thread.c create mode 100644 tests/unit/mocks/mock_thread.h create mode 100644 tests/unit/mocks/mock_unimpl.c create mode 100644 tests/unit/mocks/san_attached.c create mode 100644 tests/unit/mocks/std_safe.h create mode 100644 tests/unit/mocks/unit_test_utils.c create mode 100644 tests/unit/mocks/unit_test_utils.h create mode 100644 tests/unit/mocks_test.c create mode 100644 tests/unit/panic_path_test.c create mode 100644 tests/unit/pmap_steal_memory_overflow.c create mode 100755 tests/unit/tools/fibers_lldb.py create mode 100755 tests/unit/tools/generate_ut_proj.py create mode 100755 tests/unit/tools/get_target_details.py create mode 100755 tests/unit/tools/make_run_unittests.py create mode 100755 tests/unit/tools/merge_cmds_json.py create mode 100755 tests/unit/tools/quote_defines.py create mode 100644 tests/unit/tools/sanitizers-ignorelist create mode 100644 tests/unit/tools/xnu_lib.unexport create mode 100644 tests/unrecoverable_trap_test.c create mode 100644 tests/vfs/devfd_access.c create mode 100644 tests/vfs/devfd_access.entitlements create mode 100644 tests/vfs/direntries_permissions.c create mode 100644 tests/vfs/fmount_funmount.c create mode 100644 tests/vfs/getattrlist_fullpath.c create mode 100644 tests/vfs/linkat_flags.c create mode 100644 tests/vfs/named_fork_path.c create mode 100644 tests/vfs/open_symlink.c create mode 100644 tests/vfs/open_unique.c create mode 100644 tests/vfs/openbyid_stress.c create mode 100644 tests/vfs/openbyid_stress.entitlements create mode 100644 tests/vfs/resolve_namespace.c create mode 100644 tests/vfs/sandbox_appledouble_write.c create mode 100644 tests/vfs/sandbox_fstat.c create mode 100644 tests/vfs/sandbox_type_error.c create mode 100644 tests/vfs/symlink_trailing_slash.c create mode 100644 tests/vfs/unlinkat_nodeletebusy.c create mode 100644 tests/vfs/volfs_chroot.c create mode 100644 tests/vm/configurator/vm_configurator.c create mode 100644 tests/vm/configurator/vm_configurator.h create mode 100644 tests/vm/configurator/vm_configurator_helpers.h create mode 100644 tests/vm/configurator/vm_configurator_tests.h create mode 100644 tests/vm/configurator_fault.c create mode 100644 tests/vm/configurator_mincore.c create mode 100644 tests/vm/configurator_mmap.c create mode 100644 tests/vm/configurator_test.c create mode 100644 tests/vm/configurator_vm_allocate.c create mode 100644 tests/vm/configurator_vm_behavior_set.c create mode 100644 tests/vm/configurator_vm_deallocate.c create mode 100644 tests/vm/configurator_vm_inherit.c create mode 100644 tests/vm/configurator_vm_protect.c create mode 100644 tests/vm/configurator_vm_wire.c create mode 100644 tests/vm/corpse_footprint.c create mode 100644 tests/vm/memory-ownership-transfer.entitlements 
create mode 100644 tests/vm/memorystatus_rearm.c create mode 100644 tests/vm/memorystatus_rearm.entitlements create mode 100644 tests/vm/upl.c create mode 100644 tests/vm/upl.entitlements create mode 100644 tests/vm/vectorupl.c create mode 100644 tests/vm/vm_memory_entry.c create mode 100644 tests/vm/vm_reclaim.entitlements create mode 100644 tests/vm/vm_stress.cpp create mode 100755 tests/vm/vm_stress_slow.sh create mode 100644 tests/vsock_entitled.c create mode 100644 tests/vsock_entitlements.plist create mode 100644 tests/vsock_helpers.c create mode 100644 tests/vsock_helpers.h create mode 100644 tools/cocci/remove-cassert.cocci create mode 100644 tools/tests/zero-to-n/zero_to_n_workload_config.plist diff --git a/.upstream_base_commits b/.upstream_base_commits index 0343ee6c1..d980df78d 100644 --- a/.upstream_base_commits +++ b/.upstream_base_commits @@ -3,3 +3,4 @@ bsd/man/man2/access.2 freebsd lib/libc/sys/access.2 5b882020081a138285227631c46a bsd/man/man7/sticky.7 freebsd share/man/man7/sticky.7 5b882020081a138285227631c46a406c08e17bc8 bsd/man/man2/utimensat.2 freebsd lib/libc/sys/utimensat.2 89c1fcc0d088065021703b658ef547f46b5481f0 tools/tests/darwintests/netbsd_utimensat.c freebsd contrib/netbsd-tests/lib/libc/c063/t_utimensat.c 89c1fcc0d088065021703b658ef547f46b5481f0 +bsd/man/man9/byteorder.9 freebsd share/man/man9/byteorder.9 5b882020081a138285227631c46a406c08e17bc8 diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile index 7d1ccf515..8cd215351 100644 --- a/EXTERNAL_HEADERS/Makefile +++ b/EXTERNAL_HEADERS/Makefile @@ -34,6 +34,8 @@ KERNEL_FILES = \ ptrauth.h LIBCXX_DATAFILES = \ + _inttypes.h \ + inttypes.h \ stddef.h \ stdint.h diff --git a/EXTERNAL_HEADERS/_inttypes.h b/EXTERNAL_HEADERS/_inttypes.h new file mode 100644 index 000000000..13ee7c25e --- /dev/null +++ b/EXTERNAL_HEADERS/_inttypes.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2023 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * -- Standard C header, defined in ISO/IEC 9899:1999 + * (aka "C99"), section 7.8. This defines format string conversion + * specifiers suitable for use within arguments to fprintf and fscanf + * and their ilk. 
+ */ + +#if !defined(__INTTYPES_H_) +#define __INTTYPES_H_ + +# define __PRI_8_LENGTH_MODIFIER__ "hh" +# define __PRI_64_LENGTH_MODIFIER__ "ll" +# define __SCN_64_LENGTH_MODIFIER__ "ll" +# define __PRI_MAX_LENGTH_MODIFIER__ "j" +# define __SCN_MAX_LENGTH_MODIFIER__ "j" + +# define PRId8 __PRI_8_LENGTH_MODIFIER__ "d" +# define PRIi8 __PRI_8_LENGTH_MODIFIER__ "i" +# define PRIo8 __PRI_8_LENGTH_MODIFIER__ "o" +# define PRIu8 __PRI_8_LENGTH_MODIFIER__ "u" +# define PRIx8 __PRI_8_LENGTH_MODIFIER__ "x" +# define PRIX8 __PRI_8_LENGTH_MODIFIER__ "X" + +# define PRId16 "hd" +# define PRIi16 "hi" +# define PRIo16 "ho" +# define PRIu16 "hu" +# define PRIx16 "hx" +# define PRIX16 "hX" + +# define PRId32 "d" +# define PRIi32 "i" +# define PRIo32 "o" +# define PRIu32 "u" +# define PRIx32 "x" +# define PRIX32 "X" + +# define PRId64 __PRI_64_LENGTH_MODIFIER__ "d" +# define PRIi64 __PRI_64_LENGTH_MODIFIER__ "i" +# define PRIo64 __PRI_64_LENGTH_MODIFIER__ "o" +# define PRIu64 __PRI_64_LENGTH_MODIFIER__ "u" +# define PRIx64 __PRI_64_LENGTH_MODIFIER__ "x" +# define PRIX64 __PRI_64_LENGTH_MODIFIER__ "X" + +# define PRIdLEAST8 PRId8 +# define PRIiLEAST8 PRIi8 +# define PRIoLEAST8 PRIo8 +# define PRIuLEAST8 PRIu8 +# define PRIxLEAST8 PRIx8 +# define PRIXLEAST8 PRIX8 + +# define PRIdLEAST16 PRId16 +# define PRIiLEAST16 PRIi16 +# define PRIoLEAST16 PRIo16 +# define PRIuLEAST16 PRIu16 +# define PRIxLEAST16 PRIx16 +# define PRIXLEAST16 PRIX16 + +# define PRIdLEAST32 PRId32 +# define PRIiLEAST32 PRIi32 +# define PRIoLEAST32 PRIo32 +# define PRIuLEAST32 PRIu32 +# define PRIxLEAST32 PRIx32 +# define PRIXLEAST32 PRIX32 + +# define PRIdLEAST64 PRId64 +# define PRIiLEAST64 PRIi64 +# define PRIoLEAST64 PRIo64 +# define PRIuLEAST64 PRIu64 +# define PRIxLEAST64 PRIx64 +# define PRIXLEAST64 PRIX64 + +# define PRIdFAST8 PRId8 +# define PRIiFAST8 PRIi8 +# define PRIoFAST8 PRIo8 +# define PRIuFAST8 PRIu8 +# define PRIxFAST8 PRIx8 +# define PRIXFAST8 PRIX8 + +# define PRIdFAST16 PRId16 +# define PRIiFAST16 PRIi16 +# define PRIoFAST16 PRIo16 +# define PRIuFAST16 PRIu16 +# define PRIxFAST16 PRIx16 +# define PRIXFAST16 PRIX16 + +# define PRIdFAST32 PRId32 +# define PRIiFAST32 PRIi32 +# define PRIoFAST32 PRIo32 +# define PRIuFAST32 PRIu32 +# define PRIxFAST32 PRIx32 +# define PRIXFAST32 PRIX32 + +# define PRIdFAST64 PRId64 +# define PRIiFAST64 PRIi64 +# define PRIoFAST64 PRIo64 +# define PRIuFAST64 PRIu64 +# define PRIxFAST64 PRIx64 +# define PRIXFAST64 PRIX64 + +/* int32_t is 'int', but intptr_t is 'long'. 
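For callers, these macros are spliced into format strings by string-literal concatenation, exactly as with the standard C99 inttypes.h. A minimal stand-alone illustration of the intended usage (not drawn from xnu itself; with the definitions above, PRIu64 expands to "llu" and PRIxPTR to "lx"):

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t bytes = 1536;
	uintptr_t addr = (uintptr_t)&bytes;

	/* The PRI* literals concatenate with the surrounding string into
	 * ordinary printf conversion specifications. */
	printf("bytes=%" PRIu64 " addr=0x%" PRIxPTR "\n", bytes, addr);

	/* The SCN* counterparts serve the scanf family. */
	uint32_t parsed = 0;
	sscanf("42", "%" SCNu32, &parsed);
	return 0;
}
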
*/ +# define PRIdPTR "ld" +# define PRIiPTR "li" +# define PRIoPTR "lo" +# define PRIuPTR "lu" +# define PRIxPTR "lx" +# define PRIXPTR "lX" + +# define PRIdMAX __PRI_MAX_LENGTH_MODIFIER__ "d" +# define PRIiMAX __PRI_MAX_LENGTH_MODIFIER__ "i" +# define PRIoMAX __PRI_MAX_LENGTH_MODIFIER__ "o" +# define PRIuMAX __PRI_MAX_LENGTH_MODIFIER__ "u" +# define PRIxMAX __PRI_MAX_LENGTH_MODIFIER__ "x" +# define PRIXMAX __PRI_MAX_LENGTH_MODIFIER__ "X" + +# define SCNd8 __PRI_8_LENGTH_MODIFIER__ "d" +# define SCNi8 __PRI_8_LENGTH_MODIFIER__ "i" +# define SCNo8 __PRI_8_LENGTH_MODIFIER__ "o" +# define SCNu8 __PRI_8_LENGTH_MODIFIER__ "u" +# define SCNx8 __PRI_8_LENGTH_MODIFIER__ "x" + +# define SCNd16 "hd" +# define SCNi16 "hi" +# define SCNo16 "ho" +# define SCNu16 "hu" +# define SCNx16 "hx" + +# define SCNd32 "d" +# define SCNi32 "i" +# define SCNo32 "o" +# define SCNu32 "u" +# define SCNx32 "x" + +# define SCNd64 __SCN_64_LENGTH_MODIFIER__ "d" +# define SCNi64 __SCN_64_LENGTH_MODIFIER__ "i" +# define SCNo64 __SCN_64_LENGTH_MODIFIER__ "o" +# define SCNu64 __SCN_64_LENGTH_MODIFIER__ "u" +# define SCNx64 __SCN_64_LENGTH_MODIFIER__ "x" + +# define SCNdLEAST8 SCNd8 +# define SCNiLEAST8 SCNi8 +# define SCNoLEAST8 SCNo8 +# define SCNuLEAST8 SCNu8 +# define SCNxLEAST8 SCNx8 + +# define SCNdLEAST16 SCNd16 +# define SCNiLEAST16 SCNi16 +# define SCNoLEAST16 SCNo16 +# define SCNuLEAST16 SCNu16 +# define SCNxLEAST16 SCNx16 + +# define SCNdLEAST32 SCNd32 +# define SCNiLEAST32 SCNi32 +# define SCNoLEAST32 SCNo32 +# define SCNuLEAST32 SCNu32 +# define SCNxLEAST32 SCNx32 + +# define SCNdLEAST64 SCNd64 +# define SCNiLEAST64 SCNi64 +# define SCNoLEAST64 SCNo64 +# define SCNuLEAST64 SCNu64 +# define SCNxLEAST64 SCNx64 + +# define SCNdFAST8 SCNd8 +# define SCNiFAST8 SCNi8 +# define SCNoFAST8 SCNo8 +# define SCNuFAST8 SCNu8 +# define SCNxFAST8 SCNx8 + +# define SCNdFAST16 SCNd16 +# define SCNiFAST16 SCNi16 +# define SCNoFAST16 SCNo16 +# define SCNuFAST16 SCNu16 +# define SCNxFAST16 SCNx16 + +# define SCNdFAST32 SCNd32 +# define SCNiFAST32 SCNi32 +# define SCNoFAST32 SCNo32 +# define SCNuFAST32 SCNu32 +# define SCNxFAST32 SCNx32 + +# define SCNdFAST64 SCNd64 +# define SCNiFAST64 SCNi64 +# define SCNoFAST64 SCNo64 +# define SCNuFAST64 SCNu64 +# define SCNxFAST64 SCNx64 + +# define SCNdPTR "ld" +# define SCNiPTR "li" +# define SCNoPTR "lo" +# define SCNuPTR "lu" +# define SCNxPTR "lx" + +# define SCNdMAX __SCN_MAX_LENGTH_MODIFIER__ "d" +# define SCNiMAX __SCN_MAX_LENGTH_MODIFIER__ "i" +# define SCNoMAX __SCN_MAX_LENGTH_MODIFIER__ "o" +# define SCNuMAX __SCN_MAX_LENGTH_MODIFIER__ "u" +# define SCNxMAX __SCN_MAX_LENGTH_MODIFIER__ "x" + +#include + +#endif /* !__INTTYPES_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccdigest.h b/EXTERNAL_HEADERS/corecrypto/ccdigest.h index 39db86afb..88b1a9c3f 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdigest.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdigest.h @@ -1,4 +1,4 @@ -/* Copyright (c) (2010-2012,2014-2022) Apple Inc. All rights reserved. +/* Copyright (c) (2010-2012,2014-2022,2024) Apple Inc. All rights reserved. 
* * corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which * is contained in the License.txt file distributed with corecrypto) and only to @@ -41,8 +41,12 @@ struct ccdigest_info { void(* CC_SPTR(ccdigest_info, final))(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned char *digest); cc_impl_t impl; + void(* CC_SPTR(ccdigest_info, compress_parallel))(ccdigest_state_t state1, size_t nblocks1, + const void *data1, ccdigest_state_t state2, size_t nblocks2, const void *data2); }; +typedef const struct ccdigest_info *(*ccdigest_info_selector_t)(void); + /* Return sizeof a ccdigest_ctx for a given size_t _state_size_ and size_t _block_size_. */ #define ccdigest_ctx_size(_state_size_, _block_size_) ((_state_size_) + sizeof(uint64_t) + (_block_size_) + sizeof(unsigned int)) @@ -89,6 +93,24 @@ void ccdigest_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned void ccdigest(const struct ccdigest_info *di, size_t len, const void *data, void *digest); +/*! + @function ccdigest_parallel + @abstract Hashes two inputs of the same size, in parallel where hardware support is available. + + @param di digest info struct specifying the hash to use + @param data_nbytes the size of the inputs + @param data1 pointer to the first input + @param digest1 output pointer for the hash of data1 + @param data2 pointer to the second input + @param digest2 output pointer for the hash of data2 + + @discussion This is intended for use in the construction of Merkle trees. +*/ +CC_NONNULL_ALL +void ccdigest_parallel(const struct ccdigest_info *di, size_t data_nbytes, + const void *data1, void *digest1, + const void *data2, void *digest2); + #define OID_DEF(_VALUE_) ((const unsigned char *)_VALUE_) // https://csrc.nist.gov/projects/computer-security-objects-register/algorithm-registration#Hash diff --git a/EXTERNAL_HEADERS/inttypes.h b/EXTERNAL_HEADERS/inttypes.h new file mode 100644 index 000000000..ee6e626c7 --- /dev/null +++ b/EXTERNAL_HEADERS/inttypes.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2000-2004, 2013, 2023 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * -- Standard C header, defined in ISO/IEC 9899:1999 + * (aka "C99"), section 7.8. This defines format string conversion + * specifiers suitable for use within arguments to fprintf and fscanf + * and their ilk. 
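The ccdigest_parallel() routine declared in the ccdigest.h hunk above lends itself to Merkle-tree construction, where two sibling inputs of the same size are hashed with the same digest. A minimal caller sketch, assuming the SHA-256 descriptor from corecrypto's ccsha2.h (ccsha256_di() and CCSHA256_OUTPUT_SIZE are existing corecrypto names, not part of this diff):

#include <corecrypto/ccdigest.h>
#include <corecrypto/ccsha2.h>

/* Hash two equal-sized sibling leaves with a single call; per the
 * @abstract above, the two lanes run in parallel where hardware
 * support is available. */
static void
hash_sibling_leaves(const void *leaf_a, const void *leaf_b, size_t leaf_nbytes,
    unsigned char digest_a[CCSHA256_OUTPUT_SIZE],
    unsigned char digest_b[CCSHA256_OUTPUT_SIZE])
{
	const struct ccdigest_info *di = ccsha256_di();

	ccdigest_parallel(di, leaf_nbytes, leaf_a, digest_a, leaf_b, digest_b);
}

The single data_nbytes parameter is what imposes the equal-size requirement; unequal inputs would need two ordinary ccdigest() calls.
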
+ */ + +#if !defined(_INTTYPES_H_) || __has_feature(modules) +#define _INTTYPES_H_ + +#include <_inttypes.h> + +#endif /* !_INTTYPES_H_ */ diff --git a/Makefile b/Makefile index ca0a2e25f..24b63f8cc 100644 --- a/Makefile +++ b/Makefile @@ -210,6 +210,7 @@ TOP_TARGETS = \ install install_desktop install_embedded \ install_release_embedded install_development_embedded \ install_release_desktop install_development_desktop \ + install_release_embedded_nohdrs install_release_desktop_nohdrs \ install_kernels \ cscope tags TAGS \ help @@ -336,6 +337,12 @@ xnu_tests_driverkit: $(MAKE) -C $(SRCROOT)/tests/driverkit $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \ SRCROOT=$(SRCROOT)/tests/driverkit +xnu_unittests: + $(MAKE) -C $(SRCROOT)/tests/unit $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \ + SRCROOT=$(SRCROOT)/tests/unit + $(MAKE) -C $(SRCROOT)/tests $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) sched/install_userspace_unit_tests \ + SRCROOT=$(SRCROOT)/tests + include $(MakeInc_cmd) diff --git a/README.md b/README.md index 6f9f6df1b..99c8db7e5 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ This can be customized by setting the `RC_DARWIN_KERNEL_VERSION` variable in the environment or on the `make` command line. -See doc/xnu_version.md for more details. +See doc/building/xnu_version.md for more details. ### Debug Information Formats @@ -421,6 +421,8 @@ DriverKit SDK headers used by userspace drivers. ExclaveKit SDK headers. 9. `EXCLAVECORE`: If defined, enclosed code is visible exclusively in the ExclaveCore SDK headers. +10. `MODULES_SUPPORTED` If defined, enclosed code is visible exclusively +in locations that support modules/Swift (i.e. not System or Kernel frameworks). ## VM header file name convention The VM headers follow the following naming conventions: diff --git a/SETUP/config/mkheaders.c b/SETUP/config/mkheaders.c index e3d2f34ac..e20463413 100644 --- a/SETUP/config/mkheaders.c +++ b/SETUP/config/mkheaders.c @@ -104,6 +104,17 @@ do_count(const char *dev, const char *hname, int search) do_header(dev, hname, count); } +static void +free_file_list(struct file_list *fl) +{ + struct file_list *fl_prev; + while (fl != 0) { + fl_prev = fl; + fl = fl->f_next; + free((char *)fl_prev); + } +} + static void do_header(const char *dev, const char *hname, int count) { @@ -111,7 +122,7 @@ do_header(const char *dev, const char *hname, int count) const char *inw; char *inwcopy; struct file_list *fl = NULL; /* may exit for(;;) uninitted */ - struct file_list *fl_head, *fl_prev; + struct file_list *fl_head; FILE *inf, *outf; int inc, oldcount; @@ -169,11 +180,7 @@ do_header(const char *dev, const char *hname, int count) } (void) fclose(inf); if (count == oldcount) { - while (fl != 0) { - fl_prev = fl; - fl = fl->f_next; - free((char *)fl_prev); - } + free_file_list(fl_head); return; } if (oldcount == -1) { @@ -192,8 +199,8 @@ do_header(const char *dev, const char *hname, int count) for (fl = fl_head; fl != 0; fl = fl->f_next) { fprintf(outf, "#define %s %d\n", fl->f_fn, count ? fl->f_type : 0); - free((char *)fl); } + free_file_list(fl_head); (void) fclose(outf); } diff --git a/bsd/Makefile b/bsd/Makefile index 1f459df0a..7461701de 100644 --- a/bsd/Makefile +++ b/bsd/Makefile @@ -42,6 +42,7 @@ EXPINC_SUBDIRS = \ bsm \ crypto/entropy \ dev \ + kern \ libkern \ machine \ miscfs \ diff --git a/bsd/bsm/audit_kevents.h b/bsd/bsm/audit_kevents.h index 5aba04a94..b86302cb4 100644 --- a/bsd/bsm/audit_kevents.h +++ b/bsd/bsm/audit_kevents.h @@ -619,6 +619,7 @@ #define AUE_PREADV 43216 /* Darwin. 
*/ #define AUE_PWRITEV 43217 /* Darwin. */ #define AUE_FREADLINK 43218 +#define AUE_FUNMOUNT 43219 /* Darwin. */ #define AUE_SESSION_START 44901 /* Darwin. */ #define AUE_SESSION_UPDATE 44902 /* Darwin. */ diff --git a/bsd/conf/Makefile.template b/bsd/conf/Makefile.template index 1b3361e51..8684c5b00 100644 --- a/bsd/conf/Makefile.template +++ b/bsd/conf/Makefile.template @@ -180,6 +180,7 @@ tty_compat.o_CWARNFLAGS_ADD += -Wno-cast-align tty_dev.o_CWARNFLAGS_ADD += -Wno-cast-align ubc_subr.o_CWARNFLAGS_ADD += -Wno-cast-align uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-cast-align +uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-cast-align uipc_usrreq.o_CWARNFLAGS_ADD += -Wno-cast-align vfs_attrlist.o_CWARNFLAGS_ADD += -Wno-cast-align vfs_fsevents.o_CWARNFLAGS_ADD += -Wno-cast-align @@ -247,6 +248,7 @@ systrace.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion sysv_msg.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion sysv_sem.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion +uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion vfs_quota.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion vsock_domain.o_CWARNFLAGS_ADD += -Wno-implicit-int-conversion # -Wno-shorten-64-to-32 @@ -306,6 +308,7 @@ sysv_msg.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32 sysv_sem.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32 sysv_shm.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32 uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32 +uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32 unix_signal.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32 ux_exception.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32 vfs_cluster.o_CWARNFLAGS_ADD += -Wno-shorten-64-to-32 @@ -326,6 +329,7 @@ radix.o_CWARNFLAGS_ADD += -Wno-sign-compare route6.o_CWARNFLAGS_ADD += -Wno-sign-compare scope6.o_CWARNFLAGS_ADD += -Wno-sign-compare uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-sign-compare +uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-sign-compare # -Wno-sign-conversion audit.o_CWARNFLAGS_ADD += -Wno-sign-conversion audit_arg.o_CWARNFLAGS_ADD += -Wno-sign-conversion @@ -494,6 +498,7 @@ tcp_cc.o_CWARNFLAGS_ADD += -Wno-sign-conversion tcp_cubic.o_CWARNFLAGS_ADD += -Wno-sign-conversion ubc_subr.o_CWARNFLAGS_ADD += -Wno-sign-conversion uipc_mbuf.o_CWARNFLAGS_ADD += -Wno-sign-conversion +uipc_mbuf_mcache.o_CWARNFLAGS_ADD += -Wno-sign-conversion unix_signal.o_CWARNFLAGS_ADD += -Wno-sign-conversion unix_startup.o_CWARNFLAGS_ADD += -Wno-sign-conversion ux_exception.o_CWARNFLAGS_ADD += -Wno-sign-conversion @@ -601,6 +606,10 @@ $(COMPONENT).filelist: $(OBJS) $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist +ifeq ($(XNU_LibAllFiles),1) +LIBOBJS := $(OBJS) +endif + $(COMPONENT).libfilelist: $(LIBOBJS) @$(LOG_LDFILELIST) "lib$(COMPONENT)" $(_v)for obj in ${LIBOBJS}; do \ diff --git a/bsd/conf/files b/bsd/conf/files index e35917680..202f1648a 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -58,7 +58,6 @@ OPTIONS/kctl_test optional kctl_test OPTIONS/skywalk optional skywalk OPTIONS/config_nexus_user_pipe optional config_nexus_user_pipe OPTIONS/config_nexus_kernel_pipe optional config_nexus_kernel_pipe -OPTIONS/config_nexus_monitor optional config_nexus_monitor OPTIONS/config_nexus_flowswitch optional config_nexus_flowswitch OPTIONS/config_nexus_netif optional config_nexus_netif @@ -169,7 +168,7 @@ bsd/net/if_bridge.c optional if_bridge bound-checks bsd/net/bridgestp.c optional bridgestp bsd/net/if.c optional networking bound-checks bsd/net/init.c optional sockets bound-checks -bsd/net/dlil.c optional 
networking bound-checks-pending +bsd/net/dlil.c optional networking bound-checks bsd/net/dlil_ctl.c optional networking bound-checks bsd/net/dlil_input.c optional networking bound-checks bsd/net/dlil_output.c optional networking bound-checks @@ -188,7 +187,7 @@ bsd/net/multicast_list.c optional networking bound-checks bsd/net/if_bond.c optional bond bound-checks bsd/net/devtimer.c optional bond bound-checks bsd/net/ndrv.c optional networking bound-checks -bsd/net/radix.c optional networking +bsd/net/radix.c optional networking bound-checks-pending bsd/net/raw_cb.c optional networking bound-checks bsd/net/raw_usrreq.c optional networking bound-checks bsd/net/route.c optional networking bound-checks @@ -206,7 +205,7 @@ bsd/net/kpi_interfacefilter.c optional networking bound-checks bsd/net/net_str_id.c optional networking bound-checks bsd/net/if_utun.c optional networking bound-checks bsd/net/if_ipsec.c optional ipsec bound-checks -bsd/net/necp.c optional necp +bsd/net/necp.c optional necp bound-checks bsd/net/necp_client.c optional necp bound-checks bsd/net/network_agent.c optional networking bound-checks bsd/net/bloom_filter.c optional networking bound-checks @@ -226,6 +225,7 @@ bsd/net/pktap.c optional networking bound-checks bsd/net/droptap.c optional networking bound-checks bsd/net/if_llreach.c optional networking bound-checks bsd/net/flowhash.c optional networking bound-checks +bsd/net/siphash.c optional networking bound-checks bsd/net/flowadv.c optional networking bound-checks bsd/net/content_filter.c optional content_filter bound-checks bsd/net/content_filter_crypto.c optional content_filter bound-checks @@ -245,6 +245,9 @@ bsd/net/classq/classq_fq_codel.c optional networking bound-checks bsd/net/pktsched/pktsched.c optional networking bound-checks bsd/net/pktsched/pktsched_fq_codel.c optional networking bound-checks bsd/net/pktsched/pktsched_netem.c optional networking bound-checks +bsd/net/pktsched/pktsched_ops.c optional networking bound-checks + +bsd/net/aop/kpi_aop.c optional networking bound-checks bsd/netinet/cpu_in_cksum_gen.c standard bound-checks bsd/netinet/in_cksum.c optional inet bound-checks @@ -277,10 +280,12 @@ bsd/netinet/tcp_cc.c optional inet bound-checks bsd/netinet/tcp_newreno.c optional inet bound-checks bsd/netinet/tcp_cubic.c optional inet bound-checks bsd/netinet/tcp_prague.c optional inet bound-checks +bsd/netinet/tcp_pacing.c optional inet bound-checks bsd/netinet/cbrtf.c optional inet bound-checks bsd/netinet/tcp_ledbat.c optional inet bound-checks bsd/netinet/tcp_rledbat.c optional inet bound-checks bsd/netinet/tcp_rack.c optional inet bound-checks +bsd/netinet/tcp_syncookie.c optional inet bound-checks bsd/netinet/tcp_log.c optional inet bound-checks bsd/netinet/tcp_sysctls.c optional inet bound-checks bsd/netinet/tcp_ccdbg.c optional inet bound-checks @@ -390,6 +395,7 @@ bsd/kern/kern_authorization.c standard bsd/kern/kern_backtrace.c standard bsd/kern/kern_clock.c standard bsd/kern/kern_core.c optional config_coredump +bsd/kern/kern_core.c optional config_ucoredump bsd/kern/kern_credential.c standard bsd/kern/kern_crossarch.c standard bsd/kern/kern_cs.c standard @@ -436,6 +442,7 @@ bsd/kern/kern_xxx.c standard bsd/kern/lockdown_mode.c standard bsd/kern/mach_process.c standard bsd/kern/mcache.c optional sockets config_mbuf_mcache +bsd/kern/mem_acct.c optional sockets bound-checks bsd/kern/stackshot.c standard bsd/kern/subr_log.c standard bsd/kern/subr_log_stream.c standard @@ -466,13 +473,14 @@ bsd/kern/tty_tty.c standard bsd/kern/ubc_subr.c standard 
bsd/kern/uipc_domain.c optional sockets bound-checks bsd/kern/uipc_mbuf.c optional sockets bound-checks -bsd/kern/uipc_mbuf2.c optional sockets bound-checks +bsd/kern/uipc_mbuf_mcache.c optional sockets config_mbuf_mcache +bsd/kern/uipc_mbuf2.c optional sockets bound-checks-soft bsd/kern/uipc_proto.c optional sockets bound-checks bsd/kern/uipc_socket.c optional sockets bound-checks bsd/kern/uipc_socket2.c optional sockets bound-checks bsd/kern/uipc_syscalls.c optional sockets bound-checks bsd/kern/uipc_usrreq.c optional sockets bound-checks -bsd/kern/vsock_domain.c optional sockets +bsd/kern/vsock_domain.c optional sockets bound-checks-soft bsd/kern/sysv_ipc.c standard bsd/kern/sysv_shm.c standard bsd/kern/sysv_sem.c standard @@ -568,6 +576,8 @@ bsd/skywalk/nexus/nexus_mbq.c optional skywalk bound-checks bsd/skywalk/nexus/nexus_pktq.c optional skywalk bound-checks bsd/skywalk/nexus/nexus_syscalls.c optional skywalk bound-checks bsd/skywalk/nexus/nexus_traffic_rule.c optional skywalk bound-checks +bsd/skywalk/nexus/nexus_traffic_rule_inet.c optional skywalk bound-checks +bsd/skywalk/nexus/nexus_traffic_rule_eth.c optional skywalk bound-checks bsd/skywalk/nexus/flowswitch/nx_flowswitch.c optional config_nexus_flowswitch bound-checks bsd/skywalk/nexus/flowswitch/fsw.c optional config_nexus_flowswitch bound-checks bsd/skywalk/nexus/flowswitch/fsw_vp.c optional config_nexus_flowswitch bound-checks @@ -590,7 +600,6 @@ bsd/skywalk/nexus/flowswitch/flow/flow_route.c optional config_nexus_flowswitch bsd/skywalk/nexus/flowswitch/flow/flow_stats.c optional config_nexus_flowswitch bound-checks bsd/skywalk/nexus/flowswitch/flow/flow_track.c optional config_nexus_flowswitch bound-checks bsd/skywalk/nexus/flowswitch/flow/flow_agg.c optional config_nexus_flowswitch bound-checks -bsd/skywalk/nexus/monitor/nx_monitor.c optional config_nexus_monitor bsd/skywalk/nexus/netif/nx_netif.c optional config_nexus_netif bound-checks bsd/skywalk/nexus/netif/nx_netif_compat.c optional config_nexus_netif bound-checks bsd/skywalk/nexus/netif/nx_netif_host.c optional config_nexus_netif bound-checks diff --git a/bsd/conf/param.c b/bsd/conf/param.c index fe3cfc9c2..a26380196 100644 --- a/bsd/conf/param.c +++ b/bsd/conf/param.c @@ -110,9 +110,7 @@ int maxfiles = 3 * OPEN_MAX; int maxfiles = OPEN_MAX + 2048; #endif -unsigned int ncallout = 16 + 2 * NPROC; unsigned int nmbclusters = NMBCLUSTERS; -int nport = NPROC / 2; /* * async IO (aio) configurable limits diff --git a/bsd/dev/arm/kern_machdep.c b/bsd/dev/arm/kern_machdep.c index 2d8067794..58769e07a 100644 --- a/bsd/dev/arm/kern_machdep.c +++ b/bsd/dev/arm/kern_machdep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2020 Apple Inc. All rights reserved. + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. */ /* * Copyright (C) 1990, NeXT, Inc. 
@@ -17,6 +17,8 @@ #include #include +int ml_grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool); + #if __arm64__ static cpu_subtype_t cpu_subtype32(void); #endif /* __arm64__ */ @@ -51,18 +53,18 @@ grade_arm64e_binary(cpu_subtype_t execfeatures) #endif /* XNU_TARGET_OS_IOS || XNU_TARGET_OS_XR */ /* The current ABI version is preferred over arm64 */ - if (CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(execfeatures) == - CPU_SUBTYPE_ARM64_PTR_AUTH_CURRENT_VERSION) { + if (CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(execfeatures) <= + CPU_SUBTYPE_ARM64_PTR_AUTH_MAX_PREFERRED_VERSION) { return 12; } - /* Future ABIs are allowed, but exec_mach_imgact will treat it like an arm64 slice */ + /* Non-preferred future and older ABIs are allowed, but exec_mach_imgact may treat them like an arm64 slice */ return 11; } #endif /* __arm64__ */ /********************************************************************** -* Routine: grade_binary() +* Routine: ml_grade_binary() * * Function: Return a relative preference for exectypes and * execsubtypes in fat executable files. The higher the @@ -70,7 +72,7 @@ grade_arm64e_binary(cpu_subtype_t execfeatures) * not acceptable. **********************************************************************/ int -grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures __unused, bool allow_simulator_binary __unused) +ml_grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures __unused, bool allow_simulator_binary __unused) { #if __arm64__ cpu_subtype_t hostsubtype = diff --git a/bsd/dev/arm64/sysctl.c b/bsd/dev/arm64/sysctl.c index 1be8210cb..777a1a27e 100644 --- a/bsd/dev/arm64/sysctl.c +++ b/bsd/dev/arm64/sysctl.c @@ -23,6 +23,9 @@ #include #endif +#define __STR(x) #x +#define STRINGIFY(x) __STR(x) + extern uint64_t wake_abstime; #if DEVELOPMENT || DEBUG @@ -438,6 +441,22 @@ SYSCTL_PROC(_machdep, OID_AUTO, ptrauth_enabled, 0, 0, machdep_ptrauth_enabled, "I", ""); +static const char _ctrr_type[] = +#if defined(KERNEL_CTRR_VERSION) + "ctrrv" STRINGIFY(KERNEL_CTRR_VERSION); +#elif defined(KERNEL_INTEGRITY_KTRR) + "ktrr"; +#elif defined(KERNEL_INTEGRITY_PV_CTRR) + "pv"; +#else + "none"; +#endif + +SYSCTL_STRING(_machdep, OID_AUTO, ctrr_type, + CTLFLAG_KERN | CTLFLAG_RD | CTLFLAG_LOCKED, + __DECONST(char *, _ctrr_type), 0, + "CTRR type supported by hardware/kernel"); + #if CONFIG_TELEMETRY && (DEBUG || DEVELOPMENT) extern unsigned long trap_telemetry_reported_events; SYSCTL_ULONG(_debug, OID_AUTO, trap_telemetry_reported_events, @@ -466,3 +485,11 @@ dram_ecc_error_injection_capable SYSCTL_HANDLER_ARGS SYSCTL_PROC(_vm, OID_AUTO, dram_ecc_error_injection_capable, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, &dram_ecc_error_injection_capable, "I", ""); #endif /* DEBUG || DEVELOPMENT */ + + +#if DEBUG || DEVELOPMENT +extern _Atomic unsigned int ipcpv_telemetry_count; +SYSCTL_UINT(_debug, OID_AUTO, ipcpv_telemetry_count, + CTLFLAG_RD | CTLFLAG_LOCKED, &ipcpv_telemetry_count, + 0, "Number of ipc policy violation telemetry emitted"); +#endif /* DEBUG || DEVELOPMENT */ diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index f94124cb1..c38ae193a 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -258,7 +258,7 @@ static uint8_t dtrace_kerneluuid[16]; /* the 128-bit uuid */ */ static ZONE_DEFINE_TYPE(dtrace_probe_t_zone, "dtrace.dtrace_probe_t", - dtrace_probe_t, ZC_PGZ_USE_GUARDS); + dtrace_probe_t, ZC_NONE); static ZONE_DEFINE(dtrace_state_pcpu_zone, "dtrace.dtrace_dstate_percpu_t", 
sizeof(dtrace_dstate_percpu_t), ZC_PERCPU); @@ -564,7 +564,7 @@ dtrace_load##bits(uintptr_t addr) \ int i; \ volatile uint16_t *flags = (volatile uint16_t *) \ &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \ - uintptr_t caddr = vm_memtag_canonicalize_kernel(addr); \ + uintptr_t caddr = VM_KERNEL_STRIP_PTR(addr); \ \ DTRACE_ALIGNCHECK(addr, size, flags); \ \ @@ -19277,8 +19277,6 @@ static int gMajDevNo; void dtrace_early_init (void) { - dtrace_restriction_policy_load(); - /* * See dtrace_impl.h for a description of kernel symbol modes. * The default is to wait for symbols from userspace (lazy symbols). diff --git a/bsd/dev/dtrace/dtrace_subr.c b/bsd/dev/dtrace/dtrace_subr.c index 3ffb811a6..63bfb556b 100644 --- a/bsd/dev/dtrace/dtrace_subr.c +++ b/bsd/dev/dtrace/dtrace_subr.c @@ -35,14 +35,11 @@ #include #include #include +#include #if CONFIG_CSR #include #include - -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) -extern bool csr_unsafe_kernel_text; -#endif #endif /* @@ -414,13 +411,6 @@ dtrace_state_free(minor_t minor) kfree_type(dtrace_state_t, state); } - - -void -dtrace_restriction_policy_load(void) -{ -} - /* * Check if DTrace has been restricted by the current security policy. */ @@ -449,7 +439,8 @@ dtrace_are_restrictions_relaxed(void) boolean_t dtrace_fbt_probes_restricted(void) { - + if (!ml_unsafe_kernel_text()) + return TRUE; #if CONFIG_CSR if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) return TRUE; @@ -462,6 +453,8 @@ boolean_t dtrace_sdt_probes_restricted(void) { + if (!ml_unsafe_kernel_text()) + return TRUE; return FALSE; } diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c index 8d4fbb91d..975587da2 100644 --- a/bsd/dev/dtrace/fasttrap.c +++ b/bsd/dev/dtrace/fasttrap.c @@ -592,7 +592,7 @@ fasttrap_setdebug(proc_t *p) * should not be possible for the process to actually * disappear. */ - struct proc_ident pident = proc_ident(p); + struct proc_ident pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT); sprunlock(p); p = PROC_NULL; @@ -2428,8 +2428,8 @@ fasttrap_check_cred_priv(cred_t *cr, proc_t *p) #if CONFIG_MACF /* Check with MAC framework when enabled. */ - struct proc_ident cur_ident = proc_ident(current_proc()); - struct proc_ident p_ident = proc_ident(p); + struct proc_ident cur_ident = proc_ident_with_policy(current_proc(), IDENT_VALIDATION_PROC_EXACT); + struct proc_ident p_ident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT); /* Do not hold ref to proc here to avoid deadlock. 
*/ proc_rele(p); diff --git a/bsd/dev/dtrace/scripts/errno.d b/bsd/dev/dtrace/scripts/errno.d index a8f16059f..1ccd4965e 100644 --- a/bsd/dev/dtrace/scripts/errno.d +++ b/bsd/dev/dtrace/scripts/errno.d @@ -234,5 +234,7 @@ inline int EOWNERDEAD = 105; #pragma D binding "1.0" EOWNERDEAD inline int EQFULL = 106; #pragma D binding "1.0" EQFULL -inline int ELAST = 106; +inline int ENOTCAPABLE = 107; +#pragma D binding "1.0" ENOTCAPABLE +inline int ELAST = 107; #pragma D binding "1.0" ELAST diff --git a/bsd/dev/dtrace/sdt_subr.c b/bsd/dev/dtrace/sdt_subr.c index 9726864d9..af6202304 100644 --- a/bsd/dev/dtrace/sdt_subr.c +++ b/bsd/dev/dtrace/sdt_subr.c @@ -996,6 +996,18 @@ sdt_argdesc_t sdt_args[] = { {"vminfo", "vm_sanitize", 4, 4, "uint64_t", "uint64_t" }, {"vminfo", "vm_sanitize", 5, 5, "uint64_t", "uint64_t" }, {"vminfo", "vm_sanitize", 6, 6, "uint64_t", "uint64_t" }, + {"vminfo", "corpse_footprint_collect", 0, 0, "uint32_t", "uint32_t" }, + {"vminfo", "corpse_footprint_collect", 1, 1, "vm_map_offset_t", "vm_map_offset_t" }, + {"vminfo", "corpse_footprint_collect", 2, 2, "uint32_t", "uint32_t" }, + {"vminfo", "corpse_footprint_collect", 3, 3, "vm_map_offset_t", "vm_map_offset_t" }, + {"vminfo", "corpse_footprint_collect_new_region", 0, 0, "vm_map_offset_t", "vm_map_offset_t" }, + {"vminfo", "corpse_footprint_collect_new_region", 1, 1, "vm_map_offset_t", "vm_map_offset_t" }, + {"vminfo", "corpse_footprint_collect_new_region", 2, 2, "uint64_t", "uint64_t" }, + {"vminfo", "corpse_footprint_collect_zero_gap", 0, 0, "vm_map_offset_t", "vm_map_offset_t" }, + {"vminfo", "corpse_footprint_collect_zero_gap", 1, 1, "vm_map_offset_t", "vm_map_offset_t" }, + {"vminfo", "corpse_footprint_collect_zero_gap", 2, 2, "uint64_t", "uint64_t" }, + {"vminfo", "corpse_footprint_collect_page_info", 0, 0, "vm_map_offset_t", "vm_map_offset_t" }, + {"vminfo", "corpse_footprint_collect_page_info", 1, 1, "uint8_t", "uint8_t" }, {"vminfo", "reclaim_ring_allocate", 0, 0, "mach_vm_address_t", "mach_vm_address_t" }, {"vminfo", "reclaim_ring_allocate", 1, 1, "mach_vm_reclaim_count_t", "mach_vm_reclaim_count_t" }, {"vminfo", "reclaim_ring_allocate", 2, 2, "mach_vm_reclaim_count_t", "mach_vm_reclaim_count_t" }, diff --git a/bsd/dev/i386/kern_machdep.c b/bsd/dev/i386/kern_machdep.c index 890c15f25..64e676f98 100644 --- a/bsd/dev/i386/kern_machdep.c +++ b/bsd/dev/i386/kern_machdep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,15 +44,17 @@ extern int bootarg_no32exec; /* bsd_init.c */ #endif +int ml_grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool); + /********************************************************************** -* Routine: grade_binary() +* Routine: ml_grade_binary() * * Function: Say OK to CPU types that we can actually execute on the given * system. 64-bit binaries have the highest preference, followed * by 32-bit binaries. 0 means unsupported. 
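The grading contract is the same on both architectures: every slice of a fat binary is graded, a grade of 0 rejects the slice, and the loader keeps the slice with the highest grade. A schematic selection loop is sketched below, assuming host-endian struct fat_arch records (the in-tree caller in bsd/kern/mach_fat.c also byte-swaps the big-endian fat headers and applies further policy):

#include <stddef.h>
#include <stdbool.h>
#include <mach/machine.h>
#include <mach-o/fat.h>

/* Renamed in this patch from grade_binary(); see the surrounding hunks. */
int ml_grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool);

/* Keep the slice with the highest non-zero grade; NULL means no slice is
 * acceptable on this machine. Capability bits (for example the arm64e
 * pointer-auth ABI version) travel in the masked-off high bits of
 * cpusubtype and are passed as the "features" argument. */
static const struct fat_arch *
pick_best_slice(const struct fat_arch *archs, size_t nfat_arch)
{
	const struct fat_arch *best = NULL;
	int best_grade = 0;

	for (size_t i = 0; i < nfat_arch; i++) {
		int grade = ml_grade_binary(archs[i].cputype,
		    archs[i].cpusubtype & ~CPU_SUBTYPE_MASK,
		    archs[i].cpusubtype & CPU_SUBTYPE_MASK,
		    false /* allow_simulator_binary */);
		if (grade > best_grade) {
			best_grade = grade;
			best = &archs[i];
		}
	}
	return best;
}
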
**********************************************************************/ int -grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures __unused, bool allow_simulator_binary __unused) +ml_grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures __unused, bool allow_simulator_binary __unused) { cpu_subtype_t hostsubtype = cpu_subtype(); diff --git a/bsd/dev/unix_startup.c b/bsd/dev/unix_startup.c index de25961ef..7055b7e81 100644 --- a/bsd/dev/unix_startup.c +++ b/bsd/dev/unix_startup.c @@ -59,11 +59,6 @@ extern uint32_t kern_maxvnodes; extern vm_map_t mb_map; #endif /* CONFIG_MBUF_MCACHE */ -#if INET -extern uint32_t tcp_sendspace; -extern uint32_t tcp_recvspace; -#endif - void bsd_bufferinit(void); unsigned int bsd_mbuf_cluster_reserve(boolean_t *); @@ -174,27 +169,6 @@ bsd_startupearly(void) buf_headers = (struct buf *)bufferhdr_range.min_address; -#if SOCKETS - { - static const unsigned int maxspace = 128 * 1024; - int scale; - -#if INET - if ((scale = nmbclusters / NMBCLUSTERS) > 1) { - tcp_sendspace *= scale; - tcp_recvspace *= scale; - - if (tcp_sendspace > maxspace) { - tcp_sendspace = maxspace; - } - if (tcp_recvspace > maxspace) { - tcp_recvspace = maxspace; - } - } -#endif /* INET */ - } -#endif /* SOCKETS */ - if (vnodes_sized == 0) { if (!PE_get_default("kern.maxvnodes", &desiredvnodes, sizeof(desiredvnodes))) { /* @@ -331,51 +305,42 @@ done: #if defined(__LP64__) extern int tcp_tcbhashsize; -extern int max_cached_sock_count; #endif void bsd_scale_setup(int scale) { #if defined(__LP64__) - if ((scale > 0) && (serverperfmode == 0)) { - maxproc *= scale; - maxprocperuid = (maxproc * 2) / 3; - if (scale > 2) { - maxfiles *= scale; - maxfilesperproc = maxfiles / 2; - } - } - /* Apply server scaling rules */ - if ((scale > 0) && (serverperfmode != 0)) { - maxproc = 2500 * scale; - hard_maxproc = maxproc; - /* no fp usage */ - maxprocperuid = (maxproc * 3) / 4; - maxfiles = (150000 * scale); - maxfilesperproc = maxfiles / 2; - desiredvnodes = maxfiles; - vnodes_sized = 1; - tcp_tfo_backlog = 100 * scale; - if (scale > 4) { - /* clip somaxconn at 32G level */ - somaxconn = 2048; - /* - * For scale > 4 (> 32G), clip - * tcp_tcbhashsize to 32K - */ - tcp_tcbhashsize = 32 * 1024; - - if (scale > 7) { - /* clip at 64G level */ - max_cached_sock_count = 165000; - } else { - max_cached_sock_count = 60000 + ((scale - 1) * 15000); + if (scale > 0) { + if (!serverperfmode) { + maxproc *= scale; + maxprocperuid = (maxproc * 2) / 3; + if (scale > 2) { + maxfiles *= scale; + maxfilesperproc = maxfiles / 2; } } else { - somaxconn = 512 * scale; - tcp_tcbhashsize = 4 * 1024 * scale; - max_cached_sock_count = 60000 + ((scale - 1) * 15000); + maxproc = 2500 * scale; + hard_maxproc = maxproc; + /* no fp usage */ + maxprocperuid = (maxproc * 3) / 4; + maxfiles = (150000 * scale); + maxfilesperproc = maxfiles / 2; + desiredvnodes = maxfiles; + vnodes_sized = 1; + tcp_tfo_backlog = 100 * scale; + if (scale > 4) { + /* clip somaxconn at 32G level */ + somaxconn = 2048; + /* + * For scale > 4 (> 32G), clip + * tcp_tcbhashsize to 32K + */ + tcp_tcbhashsize = 32 * 1024; + } else { + somaxconn = 512 * scale; + tcp_tcbhashsize = 4 * 1024 * scale; + } } } diff --git a/bsd/kern/Makefile b/bsd/kern/Makefile new file mode 100644 index 000000000..709fe3556 --- /dev/null +++ b/bsd/kern/Makefile @@ -0,0 +1,18 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export 
MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +EXPORT_MI_DIR = kern + +EXPORT_MI_LIST = qsort.h + +# Don't want these XNU-internal headers installed in the SDK +INSTALL_KF_MI_LIST = $(empty) +INSTALL_KF_MI_LCL_LIST = $(empty) + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index fc185bb90..c23e7fd57 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -163,6 +163,7 @@ #include /* for restricted_in_port_init() */ #include /* for rvi_init() */ #include /* for kctl_test_init() */ +#include /* for kern_aop_net_init() */ #include /* for ipfilter_init() */ #include /* for assert() */ #include /* for init_system_override() */ @@ -270,7 +271,9 @@ extern void bsd_bufferinit(void); extern void throttle_init(void); vm_map_t bsd_pageable_map; +#if CONFIG_MBUF_MCACHE vm_map_t mb_map; +#endif /* CONFIG_MBUF_MCACHE */ static int bsd_simul_execs; static int bsd_pageable_map_size; @@ -491,8 +494,8 @@ bsd_init(void) boolean_t netboot = FALSE; #endif -#if (DEVELOPMENT || DEBUG) - platform_stall_panic_or_spin(PLATFORM_STALL_XNU_LOCATION_BSD_INIT); +#if HAS_UPSI_FAILURE_INJECTION + check_for_failure_injection(XNU_STAGE_BSD_INIT_START); #endif #define DEBUG_BSDINIT 0 @@ -705,6 +708,7 @@ bsd_init(void) #endif #if SOCKETS + net_update_uptime(); #if CONFIG_MBUF_MCACHE /* Initialize per-CPU cache allocator */ mcache_init(); @@ -774,6 +778,7 @@ bsd_init(void) necp_init(); #endif netagent_init(); + net_aop_init(); #endif /* NETWORKING */ #if CONFIG_FREEZE @@ -1067,6 +1072,10 @@ bsd_init(void) machine_timeout_bsd_init(); #endif /* DEVELOPMENT || DEBUG */ +#if HAS_UPSI_FAILURE_INJECTION + check_for_failure_injection(XNU_STAGE_BSD_INIT_END); +#endif + bsd_init_kprintf("done\n"); } diff --git a/bsd/kern/bsd_syscalls_stashtask.txt b/bsd/kern/bsd_syscalls_stashtask.txt deleted file mode 100644 index dd98e3158..000000000 --- a/bsd/kern/bsd_syscalls_stashtask.txt +++ /dev/null @@ -1,4 +0,0 @@ -Ensure any new syscalls added: - -- Fill in any gaps before being added to the end of the list. -- Have been reviewed by a security engineer. diff --git a/bsd/kern/code_signing/ppl.c b/bsd/kern/code_signing/ppl.c index 1dc39d0af..5c27266ff 100644 --- a/bsd/kern/code_signing/ppl.c +++ b/bsd/kern/code_signing/ppl.c @@ -336,6 +336,20 @@ ppl_reconstitute_code_signature( #pragma mark Address Spaces +kern_return_t +ppl_setup_nested_address_space( + __unused pmap_t pmap, + __unused const vm_address_t region_addr, + __unused const vm_size_t region_size) +{ + /* + * We don't need to do anything here from the code-signing-monitor's perspective + * because the PMAP's base address fields are setup when someone eventually calls + * pmap_nest on the PMAP object. + */ + return KERN_SUCCESS; +} + kern_return_t ppl_associate_code_signature( pmap_t pmap, @@ -380,7 +394,6 @@ ppl_associate_debug_region( const vm_address_t region_addr, const vm_size_t region_size) { - volatile bool force_true = true; bool debugger_mapping = false; /* @@ -409,16 +422,6 @@ ppl_associate_debug_region( } #endif - /* - * For now, we're just going to revert back to our previous policy and continue - * to allow a debugger mapped to be created by a process on its own. - * - * For more information: rdar://145588999. 
- */ - if (force_true == true) { - debugger_mapping = true; - } - if (debugger_mapping == false) { printf("disallowed non-debugger initiated debug mapping\n"); return KERN_DENIED; diff --git a/bsd/kern/code_signing/txm.c b/bsd/kern/code_signing/txm.c index ae36361cf..82f8bd75c 100644 --- a/bsd/kern/code_signing/txm.c +++ b/bsd/kern/code_signing/txm.c @@ -621,25 +621,16 @@ get_code_signing_info(void) txm_restricted_mode_state = txm_ro_data->restrictedModeState; } +#if kTXMKernelAPIVersion >= 11 + research_mode_enabled = txm_ro_data->buildType.research; + extended_research_mode_enabled = txm_ro_data->buildType.extendedResearch; +#endif + /* Setup the number of boot trust caches */ num_static_trust_caches = os_atomic_load(&txm_metrics->trustCaches.numStatic, relaxed); num_engineering_trust_caches = os_atomic_load(&txm_metrics->trustCaches.numEngineering, relaxed); } -static void -set_shared_region_base_address(void) -{ - txm_call_t txm_call = { - .selector = kTXMKernelSelectorSetSharedRegionBaseAddress, - .failure_fatal = true, - .num_input_args = 2, - }; - - txm_kernel_call(&txm_call, - SHARED_REGION_BASE, - SHARED_REGION_SIZE); -} - void code_signing_init(void) { @@ -662,12 +653,6 @@ code_signing_init(void) lck_mtx_init(&compilation_service_lock, &txm_lck_grp, 0); lck_mtx_init(&unregister_sync_lock, &txm_lck_grp, 0); - /* - * We need to let TXM know what the shared region base address is going - * to be for this boot. - */ - set_shared_region_base_address(); - /* Require signed code when monitor is enabled */ if (code_signing_enabled == true) { cs_debug_fail_on_unsigned_code = 1; @@ -1228,6 +1213,26 @@ txm_unregister_address_space( return KERN_SUCCESS; } +kern_return_t +txm_setup_nested_address_space( + pmap_t pmap, + const vm_address_t region_addr, + const vm_size_t region_size) +{ + txm_call_t txm_call = { + .selector = kTXMKernelSelectorSetupNestedAddressSpace, + .num_input_args = 3 + }; + TXMAddressSpace_t *txm_addr_space = pmap_txm_addr_space(pmap); + kern_return_t ret = KERN_DENIED; + + pmap_txm_acquire_exclusive_lock(pmap); + ret = txm_kernel_call(&txm_call, txm_addr_space, region_addr, region_size); + pmap_txm_release_exclusive_lock(pmap); + + return ret; +} + kern_return_t txm_associate_code_signature( pmap_t pmap, @@ -1260,7 +1265,7 @@ txm_associate_code_signature( */ vm_address_t adjusted_region_addr = region_addr; if (txm_addr_space->addrSpaceID.type == kTXMAddressSpaceIDTypeSharedRegion) { - adjusted_region_addr += SHARED_REGION_BASE; + adjusted_region_addr += txm_addr_space->baseAddr; } /* diff --git a/bsd/kern/decmpfs.c b/bsd/kern/decmpfs.c index 2f89177b4..cad45017c 100644 --- a/bsd/kern/decmpfs.c +++ b/bsd/kern/decmpfs.c @@ -1347,7 +1347,7 @@ decmpfs_pagein_compressed(struct vnop_pagein_args *ap, int *is_compressed, decmp * alignment requirements. 
*/ err = VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, - VNODE_VERIFY_DEFAULT, NULL); + VNODE_VERIFY_DEFAULT, NULL, NULL); if (err) { ErrorLogWithPath("VNOP_VERIFY returned error = %d\n", err); goto out; @@ -1597,7 +1597,7 @@ decompress: if (!err && verify_block_size) { size_t cur_verify_block_size = verify_block_size; - if ((err = VNOP_VERIFY(vp, uplPos, vec.buf, rounded_uplSize, &cur_verify_block_size, NULL, 0, NULL))) { + if ((err = VNOP_VERIFY(vp, uplPos, vec.buf, rounded_uplSize, &cur_verify_block_size, NULL, 0, NULL, NULL))) { ErrorLogWithPath("Verification failed with error %d, uplPos = %lld, uplSize = %d, did_read = %d, valid_pages = %d, invalid_pages = %d, tail_page_valid = %d\n", err, (long long)uplPos, (int)rounded_uplSize, (int)did_read, num_valid_pages, num_invalid_pages, file_tail_page_valid); } @@ -1749,7 +1749,7 @@ decmpfs_read_compressed(struct vnop_read_args *ap, int *is_compressed, decmpfs_c */ /* If the verify block size is larger than the page size, the UPL needs to aligned to it */ - err = VNOP_VERIFY(vp, uplPos, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL); + err = VNOP_VERIFY(vp, uplPos, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL, NULL); if (err) { goto out; } else if (verify_block_size) { @@ -1858,7 +1858,7 @@ decompress: if (!err && verify_block_size) { size_t cur_verify_block_size = verify_block_size; - if ((err = VNOP_VERIFY(vp, curUplPos, data, curUplSize, &cur_verify_block_size, NULL, 0, NULL))) { + if ((err = VNOP_VERIFY(vp, curUplPos, data, curUplSize, &cur_verify_block_size, NULL, 0, NULL, NULL))) { ErrorLogWithPath("Verification failed with error %d\n", err); abort_read = 1; } diff --git a/bsd/kern/imageboot.c b/bsd/kern/imageboot.c index 8eb92b2ad..c3c7eb791 100644 --- a/bsd/kern/imageboot.c +++ b/bsd/kern/imageboot.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -243,7 +244,8 @@ imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char size_t bufsz = 0; void *buf = NULL; error_func = "imageboot_read_file"; - error = imageboot_read_file_pageable(image_path, &buf, &bufsz); + // no_softlimit: di_root_ramfile_buf is OK to handle a no_softlimit buffer + error = imageboot_read_file_pageable(image_path, &buf, &bufsz, /* no_softlimit */ true); if (error == 0) { error_func = "di_root_ramfile_buf"; error = di_root_ramfile_buf(buf, bufsz, devname, sizeof(devname), &dev); @@ -572,7 +574,7 @@ errorout: } static int -imageboot_read_file_internal(const char *path, const off_t offset, const bool pageable, void **bufp, size_t *bufszp, off_t *fsizep) +imageboot_read_file_internal(const char *path, const off_t offset, const bool pageable, void **bufp, size_t *bufszp, off_t *fsizep, bool no_softlimit) { int err = 0; struct nameidata ndp = {}; @@ -639,26 +641,41 @@ imageboot_read_file_internal(const char *path, const off_t offset, const bool pa PE_parse_boot_argn("rootdmg-maxsize", &maxsize, sizeof(maxsize)); if (maxsize && (maxsize < (size_t)fsize)) { AUTHPRNT("file is too large (%lld > %lld)", (long long) fsize, (long long) maxsize); - err = ENOMEM; + err = EFBIG; goto out; } if (pageable) { vm_offset_t addr = 0; + kma_flags_t kma_flags = 0; + + kma_flags = KMA_PAGEABLE | KMA_DATA_SHARED; + if (no_softlimit) { + kma_flags |= KMA_NOSOFTLIMIT; + } + if (kmem_alloc(kernel_map, &addr, (vm_size_t)fsize, - KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_FILE) == KERN_SUCCESS) { + kma_flags, VM_KERN_MEMORY_FILE) == KERN_SUCCESS) { buf = (char *)addr; } else { buf = NULL; } } else { + 
zalloc_flags_t zflags = 0; + //limit kalloc data calls to only 2GB. if (fsize > IMAGEBOOT_MAX_KALLOCSIZE) { AUTHPRNT("file is too large for non-pageable (%lld)", (long long) fsize); err = ENOMEM; goto out; } - buf = (char *)kalloc_data((vm_size_t)fsize, Z_WAITOK); + + zflags = Z_WAITOK; + if (no_softlimit) { + zflags |= Z_NOSOFTLIMIT; + } + + buf = (char *)kalloc_data((vm_size_t)fsize, zflags); } if (buf == NULL) { err = ENOMEM; @@ -699,7 +716,7 @@ imageboot_read_file_internal(const char *path, const off_t offset, const bool pa } } - readbuf = &readbuf[chunksize]; + readbuf = VM_FAR_ADD_PTR_UNBOUNDED(readbuf, chunksize); readsize -= chunksize; readoff += chunksize; } @@ -734,21 +751,21 @@ out: } int -imageboot_read_file_pageable(const char *path, void **bufp, size_t *bufszp) +imageboot_read_file_pageable(const char *path, void **bufp, size_t *bufszp, bool no_softlimit) { - return imageboot_read_file_internal(path, 0, true, bufp, bufszp, NULL); + return imageboot_read_file_internal(path, 0, true, bufp, bufszp, NULL, no_softlimit); } int imageboot_read_file_from_offset(const char *path, const off_t offset, void **bufp, size_t *bufszp) { - return imageboot_read_file_internal(path, offset, false, bufp, bufszp, NULL); + return imageboot_read_file_internal(path, offset, false, bufp, bufszp, NULL, /* no_softlimit */ false); } int imageboot_read_file(const char *path, void **bufp, size_t *bufszp, off_t *fsizep) { - return imageboot_read_file_internal(path, 0, false, bufp, bufszp, fsizep); + return imageboot_read_file_internal(path, 0, false, bufp, bufszp, fsizep, /* no_softlimit */ false); } #if CONFIG_IMAGEBOOT_IMG4 || CONFIG_IMAGEBOOT_CHUNKLIST @@ -896,8 +913,14 @@ imageboot_mount_ramdisk(const char *path) vnode_t tvp; mount_t new_rootfs; - /* Read our target image from disk */ - err = imageboot_read_file_pageable(path, &buf, &bufsz); + /* + * Read our target image from disk + * + * We override the allocator soft-limit in order to allow booting large RAM + * disks. As a consequence, we are responsible for manipulating the + * buffer only through vm_far safe APIs. 
+ */ + err = imageboot_read_file_pageable(path, &buf, &bufsz, /* no_softlimit */ true); if (err) { printf("%s: failed: imageboot_read_file_pageable() = %d\n", __func__, err); goto out; @@ -1091,8 +1114,13 @@ imageboot_setup_new(imageboot_type_t type) } if (error) { - panic("Failed to mount root image (err=%d, auth=%d, ramdisk=%d)", - error, auth_root, ramdisk_root); + if (error == EFBIG) { + panic("root imagefile is too large (err=%d, auth=%d, ramdisk=%d)", + error, auth_root, ramdisk_root); + } else { + panic("Failed to mount root image (err=%d, auth=%d, ramdisk=%d)", + error, auth_root, ramdisk_root); + } } #if CONFIG_IMAGEBOOT_CHUNKLIST diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 1b420d10f..e7c02adad 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -48,6 +48,7 @@ #include #include #include +#include #include @@ -142,7 +143,7 @@ typefilter_create(void) { typefilter_t tf; if (KERN_SUCCESS == kmem_alloc(kernel_map, (vm_offset_t*)&tf, - TYPEFILTER_ALLOC_SIZE, KMA_DATA | KMA_ZERO, VM_KERN_MEMORY_DIAG)) { + TYPEFILTER_ALLOC_SIZE, KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_DIAG)) { return tf; } return NULL; @@ -237,14 +238,13 @@ static int kdbg_reinit(unsigned int extra_cpus); static int kdbg_test(size_t flavor); #endif /* DEVELOPMENT || DEBUG */ -static int _write_legacy_header(bool write_thread_map, vnode_t vp, - vfs_context_t ctx); -static int kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx); static int kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size); static void _clear_thread_map(void); static bool kdbg_wait(uint64_t timeout_ms); -static void kdbg_wakeup(void); + +static void _try_wakeup_waiter(void); +static void _wakeup_waiter(void); static int _copy_cpu_map(int version, void **dst, size_t *size); @@ -254,9 +254,75 @@ static kd_threadmap *_thread_map_create_live(size_t max_count, static bool kdebug_current_proc_enabled(uint32_t debugid); static errno_t kdebug_check_trace_string(uint32_t debugid, uint64_t str_id); -int kernel_debug_trace_write_to_file(user_addr_t *buffer, size_t *number, - size_t *count, size_t tempbuf_number, vnode_t vp, vfs_context_t ctx, - bool chunk); +#define RAW_FLUSH_SIZE (2 * 1024 * 1024) + +__enum_closed_decl(kd_dest_kind_t, uint32_t, { + KD_DEST_COPYOUT = 0x1, + KD_DEST_VFS = 0x2, +}); + +struct kd_dest { + kd_dest_kind_t kdd_kind; + bool kdd_chunk_format; + off_t kdd_cur_offset; + union { + struct { + user_addr_t kdd_user_buffer; + size_t kdd_user_size; + }; + struct { + struct vfs_context kdd_vfs_ctx; + vnode_t kdd_vnode; + off_t kdd_file_written_since_flush; + }; + }; +}; + +static inline struct kd_dest +kd_dest_copyout(user_addr_t buf, size_t size) +{ + return (struct kd_dest){ + .kdd_kind = KD_DEST_COPYOUT, + .kdd_user_buffer = buf, + .kdd_user_size = size, + }; +} + +static inline int +kd_dest_init_write(struct kd_dest *dest, int fd, struct fileproc **fp_out) +{ + dest->kdd_kind = KD_DEST_VFS; + proc_t p = current_proc(); + struct fileproc *fp = NULL; + if (fp_get_ftype(p, fd, DTYPE_VNODE, EBADF, &fp)) { + return EBADF; + } + + dest->kdd_vnode = fp_get_data(fp); + int error = vnode_getwithref(dest->kdd_vnode); + if (error != 0) { + fp_drop(p, fd, fp, 0); + return error; + } + dest->kdd_vfs_ctx.vc_thread = current_thread(); + dest->kdd_vfs_ctx.vc_ucred = fp->fp_glob->fg_cred; + dest->kdd_cur_offset = fp->fp_glob->fg_offset; + *fp_out = fp; + return 0; +} + +static inline void +kd_dest_finish_write(struct kd_dest *dest, struct fileproc *fp, int fd) +{ + fp->fp_glob->fg_offset = dest->kdd_cur_offset; + 
vnode_put(dest->kdd_vnode); + fp_drop(current_proc(), fd, fp, 0); +} + +static int _send_events(struct kd_dest *dest, const void *src, + size_t event_count); +static int kdbg_write_thread_map(struct kd_dest *dest); +static int _write_legacy_header(bool write_thread_map, struct kd_dest *dest); extern void IOSleep(int); @@ -321,9 +387,6 @@ kd_threadmap *kd_mapptr = 0; vm_size_t kd_mapsize = 0; vm_size_t kd_mapcount = 0; -off_t RAW_file_offset = 0; -int RAW_file_written = 0; - /* * A globally increasing counter for identifying strings in trace. Starts at * 1 because 0 is a reserved return value. @@ -436,6 +499,8 @@ kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type) int intrs_en = kdebug_storage_lock(&kd_control_trace); if (enabled) { + // Latch the status of the user-controlled flags for wrapping. + kd_control_trace.kdc_live_flags = kd_control_trace.kdc_flags & KDBG_NOWRAP; // The oldest valid time is now; reject past events from coprocessors. kd_control_trace.kdc_oldest_time = kdebug_timestamp(); kdebug_enable |= trace_type; @@ -617,11 +682,264 @@ _try_wakeup_above_threshold(uint32_t debugid) if (eventid == INTERRUPT_EVENT || eventid == VMFAULT_EVENT || csc == BSD_SYSCALL_CSC || csc == MACH_SYSCALL_CSC) { - kdbg_wakeup(); + _try_wakeup_waiter(); } } } +__attribute__((always_inline)) +static struct kd_storage * +_next_storage_unit(struct kd_bufinfo *info, unsigned int cpu) +{ + struct kd_storage *store = NULL; + do { + bool needs_new_store = true; + union kds_ptr kds_raw = info->kd_list_tail; + if (kds_raw.raw != KDS_PTR_NULL) { + store = POINTER_FROM_KDS_PTR(kd_buffer_trace.kd_bufs, kds_raw); + if (store->kds_bufindx < kd_control_trace.kdebug_events_per_storage_unit) { + needs_new_store = false; + } + } + + if (!needs_new_store) { + return store; + } + bool allocated = kdebug_storage_alloc(&kd_control_trace, &kd_buffer_trace, cpu); + if (!allocated) { + // Failed to allocate while wrapping is disabled. + return NULL; + } + } while (true); +} + +__attribute__((always_inline)) +static kd_buf * +_next_timestamped_coproc_record(unsigned int cpu, uint64_t timestamp) +{ + struct kd_bufinfo *info = &kd_buffer_trace.kdb_info[cpu]; + bool timestamp_is_continuous = info->continuous_timestamps; + + if (kdebug_using_continuous_time()) { + if (!timestamp_is_continuous) { + timestamp = absolutetime_to_continuoustime(timestamp); + } + } else { + if (timestamp_is_continuous) { + timestamp = continuoustime_to_absolutetime(timestamp); + } + } + if (timestamp < kd_control_trace.kdc_oldest_time) { + if (info->latest_past_event_timestamp < timestamp) { + info->latest_past_event_timestamp = timestamp; + } + return NULL; + } + + struct kd_storage *store = NULL; + uint32_t store_index = 0; + + do { + store = _next_storage_unit(info, cpu); + if (!store) { + return NULL; + } + store_index = store->kds_bufindx; + // Prevent an interrupt from stealing this slot in the storage unit, + // retrying if necessary. No barriers are needed because this only + // concerns visibility on this same CPU. + if (os_atomic_cmpxchg(&store->kds_bufindx, store_index, store_index + 1, relaxed)) { + break; + } + } while (true); + + // Make sure kds_timestamp is less than any event in this buffer. This can + // only happen for coprocessors because this field is initialized to the + // current time when a storage unit is allocated by a CPU. 
+ if (timestamp < store->kds_timestamp) { + store->kds_timestamp = timestamp; + } + os_atomic_inc(&store->kds_bufcnt, relaxed); + kd_buf *kd = &store->kds_records[store_index]; + kd->timestamp = timestamp; + return kd; +} + +__attribute__((always_inline)) +static void +_write_trace_record_coproc_nopreempt( + uint64_t timestamp, + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5, + unsigned int cpu) +{ + if (kd_control_trace.enabled == 0) { + return; + } + kd_buf *kd = _next_timestamped_coproc_record(cpu, timestamp); + if (kd) { + kd->debugid = debugid; + kd->arg1 = arg1; + kd->arg2 = arg2; + kd->arg3 = arg3; + kd->arg4 = arg4; + kd->arg5 = arg5; + kd->cpuid = cpu; + } +} + +__attribute__((always_inline)) +static kd_buf * +_next_timestamped_record(unsigned int cpu) +{ + struct kd_bufinfo *info = &kd_buffer_trace.kdb_info[cpu]; + struct kd_storage *store = NULL; + uint64_t now = 0; + uint32_t store_index = 0; + + do { + store = _next_storage_unit(info, cpu); + if (!store) { + return NULL; + } + store_index = store->kds_bufindx; + + // Re-capture the timestamp to ensure time is monotonically-increasing + // within storage units. + now = kdebug_timestamp(); + if (os_atomic_cmpxchg(&store->kds_bufindx, store_index, store_index + 1, relaxed)) { + break; + } + } while (true); + + os_atomic_inc(&store->kds_bufcnt, relaxed); + kd_buf *kd = &store->kds_records[store_index]; + kd->timestamp = now; + return kd; +} + +static bool kdebug_debugid_procfilt_allowed(uint32_t debugid); + +static void +_write_trace_record( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5, + kdebug_emit_flags_t flags) +{ + kdebug_emit_filter_t emit = kd_control_trace.kdc_emit; + if (!emit || !kdebug_enable) { + return; + } + bool only_filter = flags & KDBG_FILTER_ONLY; + bool observe_procfilt = !(flags & KDBG_NON_PROCESS); + + if (!_should_emit_debugid(emit, debugid)) { + return; + } + if (emit == KDEMIT_ALL && only_filter) { + return; + } + if (!ml_at_interrupt_context() && observe_procfilt && + !kdebug_debugid_procfilt_allowed(debugid)) { + return; + } + + disable_preemption(); + if (kd_control_trace.enabled == 0) { + enable_preemption(); + return; + } + unsigned int cpu = cpu_number(); + kd_buf *kd = _next_timestamped_record(cpu); + if (kd) { + kd->debugid = debugid; + kd->arg1 = arg1; + kd->arg2 = arg2; + kd->arg3 = arg3; + kd->arg4 = arg4; + kd->arg5 = arg5; + kd->cpuid = cpu; + } + enable_preemption(); + +#if KPERF + kperf_kdebug_callback(debugid, __builtin_frame_address(0)); +#endif // KPERF +} + +static void +kernel_debug_internal( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5, + kdebug_emit_flags_t flags) +{ + _write_trace_record(debugid, arg1, arg2, arg3, arg4, arg5, flags); + _try_wakeup_above_threshold(debugid); +} + +__attribute__((noinline)) +void +kernel_debug(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, + uintptr_t arg4, __unused uintptr_t arg5) +{ + uintptr_t tid = (uintptr_t)thread_tid(current_thread()); + kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, tid, 0); +} + +__attribute__((noinline)) +void +kernel_debug1(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, + uintptr_t arg4, uintptr_t arg5) +{ + kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 0); +} + +__attribute__((noinline)) +void +kernel_debug_flags( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + 
uintptr_t arg3, + uintptr_t arg4, + kdebug_emit_flags_t flags) +{ + uintptr_t tid = (uintptr_t)thread_tid(current_thread()); + kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, tid, flags); +} + +__attribute__((noinline)) +void +kernel_debug_filtered( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4) +{ + kernel_debug_flags(debugid, arg1, arg2, arg3, arg4, KDBG_FILTER_ONLY); +} + +void +kernel_debug_string_early(const char *message) +{ + uintptr_t a[4] = { 0 }; + strncpy((char *)a, message, sizeof(a)); + KERNEL_DEBUG_EARLY(TRACE_INFO_STRING, a[0], a[1], a[2], a[3]); +} + // Emit events from coprocessors. void kernel_debug_enter( @@ -638,6 +956,9 @@ kernel_debug_enter( if (kd_control_trace.kdc_flags & KDBG_DISABLE_COPROCS) { return; } + if (coreid >= kd_control_trace.kdebug_cpus) { + return; + } kdebug_emit_filter_t emit = kd_control_trace.kdc_emit; if (!emit || !kdebug_enable) { return; @@ -646,17 +967,9 @@ kernel_debug_enter( return; } - struct kd_record kd_rec = { - .cpu = (int32_t)coreid, - .timestamp = (int64_t)timestamp, - .debugid = debugid, - .arg1 = arg1, - .arg2 = arg2, - .arg3 = arg3, - .arg4 = arg4, - .arg5 = threadid, - }; - kernel_debug_write(&kd_control_trace, &kd_buffer_trace, kd_rec); + disable_preemption(); + _write_trace_record_coproc_nopreempt(timestamp, debugid, arg1, arg2, arg3, arg4, threadid, coreid); + enable_preemption(); } __pure2 @@ -667,7 +980,8 @@ kdebug_current_proc_unsafe(void) } // Return true iff the debug ID should be traced by the current process. -static inline bool +__attribute__((always_inline)) +static bool kdebug_debugid_procfilt_allowed(uint32_t debugid) { uint32_t procfilt_flags = kd_control_trace.kdc_flags & @@ -698,116 +1012,6 @@ kdebug_debugid_procfilt_allowed(uint32_t debugid) } } -static void -kdebug_emit_internal(kdebug_emit_filter_t emit, - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t arg5, - uint64_t flags) -{ - bool only_filter = flags & KDBG_FLAG_FILTERED; - bool observe_procfilt = !(flags & KDBG_FLAG_NOPROCFILT); - - if (!_should_emit_debugid(emit, debugid)) { - return; - } - if (emit == KDEMIT_ALL && only_filter) { - return; - } - if (!ml_at_interrupt_context() && observe_procfilt && - !kdebug_debugid_procfilt_allowed(debugid)) { - return; - } - - struct kd_record kd_rec = { - .cpu = -1, - .timestamp = -1, - .debugid = debugid, - .arg1 = arg1, - .arg2 = arg2, - .arg3 = arg3, - .arg4 = arg4, - .arg5 = arg5, - }; - kernel_debug_write(&kd_control_trace, &kd_buffer_trace, kd_rec); - -#if KPERF - kperf_kdebug_callback(kd_rec.debugid, __builtin_frame_address(0)); -#endif // KPERF -} - -static void -kernel_debug_internal( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uintptr_t arg5, - uint64_t flags) -{ - kdebug_emit_filter_t emit = kd_control_trace.kdc_emit; - if (!emit || !kdebug_enable) { - return; - } - kdebug_emit_internal(emit, debugid, arg1, arg2, arg3, arg4, arg5, flags); - _try_wakeup_above_threshold(debugid); -} - -__attribute__((noinline)) -void -kernel_debug(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, - uintptr_t arg4, __unused uintptr_t arg5) -{ - kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, - (uintptr_t)thread_tid(current_thread()), 0); -} - -__attribute__((noinline)) -void -kernel_debug1(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, - uintptr_t arg4, uintptr_t arg5) -{ - kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, 
arg5, 0); -} - -__attribute__((noinline)) -void -kernel_debug_flags( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4, - uint64_t flags) -{ - kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, - (uintptr_t)thread_tid(current_thread()), flags); -} - -__attribute__((noinline)) -void -kernel_debug_filtered( - uint32_t debugid, - uintptr_t arg1, - uintptr_t arg2, - uintptr_t arg3, - uintptr_t arg4) -{ - kernel_debug_flags(debugid, arg1, arg2, arg3, arg4, KDBG_FLAG_FILTERED); -} - -void -kernel_debug_string_early(const char *message) -{ - uintptr_t a[4] = { 0 }; - strncpy((char *)a, message, sizeof(a)); - KERNEL_DEBUG_EARLY(TRACE_INFO_STRING, a[0], a[1], a[2], a[3]); -} - #define SIMPLE_STR_LEN (64) static_assert(SIMPLE_STR_LEN % sizeof(uintptr_t) == 0); @@ -843,10 +1047,8 @@ kernel_debug_string_simple(uint32_t eventid, const char *str) if ((written + (4 * sizeof(uintptr_t))) >= len) { debugid |= DBG_FUNC_END; } - kernel_debug_internal(debugid, str_buf[i], - str_buf[i + 1], - str_buf[i + 2], - str_buf[i + 3], thread_id, 0); + kernel_debug_internal(debugid, str_buf[i], str_buf[i + 1], + str_buf[i + 2], str_buf[i + 3], thread_id, 0); } } @@ -943,10 +1145,8 @@ kernel_debug_early_end(void) void kernel_debug_disable(void) { - if (kdebug_enable) { - kdbg_set_tracing_enabled(false, 0); - kdbg_wakeup(); - } + kdbg_set_tracing_enabled(false, 0); + _wakeup_waiter(); } // Returns true if debugid should only be traced from the kernel. @@ -1016,8 +1216,8 @@ kdebug_trace64(__unused struct proc *p, struct kdebug_trace64_args *uap, if (_kernel_only_event(uap->code)) { return EPERM; } - kernel_debug_internal(uap->code, (uintptr_t)uap->arg1, - (uintptr_t)uap->arg2, (uintptr_t)uap->arg3, (uintptr_t)uap->arg4, + kernel_debug_internal(uap->code, (uintptr_t)uap->arg1, (uintptr_t)uap->arg2, + (uintptr_t)uap->arg3, (uintptr_t)uap->arg4, (uintptr_t)thread_tid(current_thread()), 0); return 0; } @@ -1090,9 +1290,7 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr, if ((written + (4 * sizeof(uintptr_t))) >= str_len) { trace_debugid |= DBG_FUNC_END; } - kernel_debug_internal(trace_debugid, str[i], - str[i + 1], - str[i + 2], + kernel_debug_internal(trace_debugid, str[i], str[i + 1], str[i + 2], str[i + 3], thread_id, 0); } @@ -1300,10 +1498,6 @@ kdbg_reinit(unsigned int extra_cpus) _clear_thread_map(); kd_control_trace.kdc_live_flags &= ~KDBG_WRAPPED; - - RAW_file_offset = 0; - RAW_file_written = 0; - return create_buffers_trace(extra_cpus); } @@ -1382,7 +1576,7 @@ _copy_cpu_map(int map_version, void **dst, size_t *size) if (*dst == NULL) { kern_return_t alloc_ret = kmem_alloc(kernel_map, (vm_offset_t *)dst, - (vm_size_t)size_needed, KMA_DATA | KMA_ZERO, VM_KERN_MEMORY_DIAG); + (vm_size_t)size_needed, KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_DIAG); if (alloc_ret != KERN_SUCCESS) { return ENOMEM; } @@ -1559,6 +1753,7 @@ kdbg_clear(void) kd_control_trace.kdc_flags &= ~KDBG_CONTINUOUS_TIME; kd_control_trace.kdc_flags &= ~KDBG_DISABLE_COPROCS; kd_control_trace.kdc_flags &= ~KDBG_MATCH_DISABLE; + kd_control_trace.kdc_flags &= ~(KDBG_NOWRAP | KDBG_WRAPPED); kd_control_trace.kdc_live_flags &= ~(KDBG_NOWRAP | KDBG_WRAPPED); kd_control_trace.kdc_oldest_time = 0; @@ -1567,9 +1762,6 @@ kdbg_clear(void) kd_buffer_trace.kdb_event_count = 0; _clear_thread_map(); - - RAW_file_offset = 0; - RAW_file_written = 0; } void @@ -1844,15 +2036,6 @@ _copyout_event_disable_mask(user_addr_t uaddr, size_t usize) return 0; } -static int -kdbg_write_to_vnode(caddr_t buffer, 
size_t size, vnode_t vp, vfs_context_t ctx, off_t file_offset) -{ - assert(size < INT_MAX); - return vn_rdwr(UIO_WRITE, vp, buffer, (int)size, file_offset, UIO_SYSSPACE, - IO_NODELOCKED | IO_UNIT, vfs_context_ucred(ctx), (int *) 0, - vfs_context_proc(ctx)); -} - static errno_t _copyout_cpu_map(int map_version, user_addr_t udst, size_t *usize) { @@ -1904,159 +2087,6 @@ kdbg_readcurthrmap(user_addr_t buffer, size_t *bufsize) return ret; } - -static int -_write_legacy_header(bool write_thread_map, vnode_t vp, vfs_context_t ctx) -{ - int ret = 0; - RAW_header header; - clock_sec_t secs; - clock_usec_t usecs; - void *pad_buf; - uint32_t pad_size; - uint32_t extra_thread_count = 0; - uint32_t cpumap_size; - size_t map_size = 0; - uint32_t map_count = 0; - - if (write_thread_map) { - assert(kd_control_trace.kdc_flags & KDBG_MAPINIT); - if (kd_mapcount > UINT32_MAX) { - return ERANGE; - } - map_count = (uint32_t)kd_mapcount; - if (os_mul_overflow(map_count, sizeof(kd_threadmap), &map_size)) { - return ERANGE; - } - if (map_size >= INT_MAX) { - return ERANGE; - } - } - - /* - * Without the buffers initialized, we cannot construct a CPU map or a - * thread map, and cannot write a header. - */ - if (!(kd_control_trace.kdc_flags & KDBG_BUFINIT)) { - return EINVAL; - } - - /* - * To write a RAW_VERSION1+ file, we must embed a cpumap in the - * "padding" used to page align the events following the threadmap. If - * the threadmap happens to not require enough padding, we artificially - * increase its footprint until it needs enough padding. - */ - - assert(vp); - assert(ctx); - - pad_size = 16384 - ((sizeof(RAW_header) + map_size) & PAGE_MASK); - cpumap_size = sizeof(kd_cpumap_header) + kd_control_trace.kdebug_cpus * sizeof(kd_cpumap); - - if (cpumap_size > pad_size) { - /* If the cpu map doesn't fit in the current available pad_size, - * we increase the pad_size by 16K. We do this so that the event - * data is always available on a page aligned boundary for both - * 4k and 16k systems. We enforce this alignment for the event - * data so that we can take advantage of optimized file/disk writes. - */ - pad_size += 16384; - } - - /* The way we are silently embedding a cpumap in the "padding" is by artificially - * increasing the number of thread entries. However, we'll also need to ensure that - * the cpumap is embedded in the last 4K page before when the event data is expected. 
- * This way the tools can read the data starting the next page boundary on both - * 4K and 16K systems preserving compatibility with older versions of the tools - */ - if (pad_size > 4096) { - pad_size -= 4096; - extra_thread_count = (pad_size / sizeof(kd_threadmap)) + 1; - } - - memset(&header, 0, sizeof(header)); - header.version_no = RAW_VERSION1; - header.thread_count = map_count + extra_thread_count; - - clock_get_calendar_microtime(&secs, &usecs); - header.TOD_secs = secs; - header.TOD_usecs = usecs; - - ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)&header, (int)sizeof(RAW_header), RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); - if (ret) { - goto write_error; - } - RAW_file_offset += sizeof(RAW_header); - RAW_file_written += sizeof(RAW_header); - - if (write_thread_map) { - assert(map_size < INT_MAX); - ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)kd_mapptr, (int)map_size, RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); - if (ret) { - goto write_error; - } - - RAW_file_offset += map_size; - RAW_file_written += map_size; - } - - if (extra_thread_count) { - pad_size = extra_thread_count * sizeof(kd_threadmap); - pad_buf = (char *)kalloc_data(pad_size, Z_WAITOK | Z_ZERO); - if (!pad_buf) { - ret = ENOMEM; - goto write_error; - } - - assert(pad_size < INT_MAX); - ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)pad_buf, (int)pad_size, RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); - kfree_data(pad_buf, pad_size); - if (ret) { - goto write_error; - } - - RAW_file_offset += pad_size; - RAW_file_written += pad_size; - } - - pad_size = PAGE_SIZE - (RAW_file_offset & PAGE_MASK); - if (pad_size) { - pad_buf = (char *)kalloc_data(pad_size, Z_WAITOK | Z_ZERO); - if (!pad_buf) { - ret = ENOMEM; - goto write_error; - } - - /* - * Embed the CPU map in the padding bytes -- old code will skip it, - * while newer code knows it's there. - */ - size_t temp = pad_size; - errno_t error = _copy_cpu_map(RAW_VERSION1, &pad_buf, &temp); - if (0 != error) { - memset(pad_buf, 0, pad_size); - } - - assert(pad_size < INT_MAX); - ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)pad_buf, (int)pad_size, RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); - kfree_data(pad_buf, pad_size); - if (ret) { - goto write_error; - } - - RAW_file_offset += pad_size; - RAW_file_written += pad_size; - } - -write_error: - return ret; -} - static void _clear_thread_map(void) { @@ -2081,17 +2111,15 @@ _clear_thread_map(void) * Returns ENOMEM if padding could not be allocated. Returns 0 otherwise. 
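+ *
+ * The destination must be a file (KD_DEST_VFS); any other kind panics.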
*/ static int -kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx) +kdbg_write_thread_map(struct kd_dest *dest) { - int ret = 0; - bool map_initialized; - ktrace_assert_lock_held(); - assert(ctx != NULL); + if (dest->kdd_kind != KD_DEST_VFS) { + panic("kdebug: must write thread map to VFS"); + } - map_initialized = (kd_control_trace.kdc_flags & KDBG_MAPINIT); - - ret = _write_legacy_header(map_initialized, vp, ctx); + bool map_initialized = (kd_control_trace.kdc_flags & KDBG_MAPINIT); + int ret = _write_legacy_header(map_initialized, dest); if (ret == 0) { if (map_initialized) { _clear_thread_map(); @@ -2099,7 +2127,6 @@ kdbg_write_thread_map(vnode_t vp, vfs_context_t ctx) ret = ENODATA; } } - return ret; } @@ -2145,8 +2172,10 @@ kdbg_set_nkdbufs_trace(unsigned int req_nkdbufs_trace) * Only allow allocations of up to half the kernel's data range or "sane * size", whichever is smaller. */ + kmem_range_id_t range_id = kmem_needs_data_share_range() ? + KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA; const uint64_t max_nkdbufs_trace_64 = - MIN(kmem_range_id_size(KMEM_RANGE_ID_DATA), sane_size) / 2 / + MIN(kmem_range_id_size(range_id), sane_size) / 2 / sizeof(kd_buf); /* * Can't allocate more than 2^38 (2^32 * 64) bytes of events without @@ -2219,7 +2248,7 @@ kdbg_wait(uint64_t timeout_ms) * `kd_buffer_trace.kdb_storage_threshold` storage units in use. */ static void -kdbg_wakeup(void) +_try_wakeup_waiter(void) { bool need_kds_wakeup = false; @@ -2249,22 +2278,430 @@ kdbg_wakeup(void) } } +static void +_wakeup_waiter(void) +{ + bool was_waiting = false; + bool s = ml_set_interrupts_enabled(false); + lck_spin_lock(&kd_wait_lock); + if (kd_waiter) { + was_waiting = true; + kd_waiter = 0; + } + lck_spin_unlock(&kd_wait_lock); + ml_set_interrupts_enabled(s); + + if (was_waiting) { + wakeup(&kd_waiter); + } +} + +static void +_storage_free(struct kd_control *kd_ctrl_page, struct kd_buffer *kd_data_page, int cpu, uint32_t kdsp_raw) +{ + struct kd_storage *kdsp_actual; + struct kd_bufinfo *kdbp; + union kds_ptr kdsp; + + kdbp = &kd_data_page->kdb_info[cpu]; + + kdsp.raw = kdsp_raw; + + int intrs_en = kdebug_storage_lock(kd_ctrl_page); + + if (kdsp.raw == kdbp->kd_list_head.raw) { + /* + * it's possible for the storage unit pointed to + * by kdsp to have already been stolen... so + * check to see if it's still the head of the list + * now that we're behind the lock that protects + * adding and removing from the queue... 
+ * since we only ever release and steal units from + * that position, if it's no longer the head + * we having nothing to do in this context + */ + kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kdsp); + kdbp->kd_list_head = kdsp_actual->kds_next; + + kdsp_actual->kds_next = kd_ctrl_page->kds_free_list; + kd_ctrl_page->kds_free_list = kdsp; + + kd_ctrl_page->kdc_storage_used--; + } + + kdebug_storage_unlock(kd_ctrl_page, intrs_en); +} + +static bool +_reading_set_flags( + struct kd_control *ctl, + kdebug_emit_filter_t *old_emit, + kdebug_live_flags_t *old_live) +{ + int intrs_en = kdebug_storage_lock(ctl); + + *old_emit = ctl->kdc_emit; + *old_live = ctl->kdc_live_flags; + + bool wrapped = ctl->kdc_live_flags & KDBG_WRAPPED; + ctl->kdc_live_flags |= KDBG_NOWRAP; + + kdebug_storage_unlock(ctl, intrs_en); + + return wrapped; +} + +static bool +_reading_restore_flags( + struct kd_control *ctl, + kdebug_emit_filter_t old_emit, + kdebug_live_flags_t old_live) +{ + int intrs_en = kdebug_storage_lock(ctl); + bool disabled_during_read = !ctl->enabled; + // The wrapped bit was handled already, by adding a lost-events event, don't + // replace it. + ctl->kdc_live_flags = old_live & ~KDBG_WRAPPED; + bool was_wrapping = (old_live & KDBG_NOWRAP) == 0; + // Only re-enable trace if the reader causes lost events if wrapping was + // previously enabled. + if (was_wrapping && old_emit) { + ctl->kdc_emit = old_emit; + } + kdebug_storage_unlock(ctl, intrs_en); + return disabled_during_read; +} + +static inline void +_clear_oldest_lostevents(void) +{ + for (unsigned int cpu = 0; cpu < kd_control_trace.kdebug_cpus; cpu++) { + struct kd_bufinfo *info = &kd_buffer_trace.kdb_info[cpu]; + union kds_ptr oldest_ptr = info->kd_list_head; + if (oldest_ptr.raw != KDS_PTR_NULL) { + struct kd_storage *store = POINTER_FROM_KDS_PTR(kd_buffer_trace.kd_bufs, oldest_ptr); + store->kds_lostevents = false; + } + } +} + +static inline bool +_event_should_disable(kd_buf *event) +{ + if ((kd_control_trace.kdc_flags & KDBG_MATCH_DISABLE) == 0) { + return false; + } + kd_event_matcher *match = &kd_control_trace.disable_event_match; + kd_event_matcher *mask = &kd_control_trace.disable_event_mask; + return (event->debugid & mask->kem_debugid) == match->kem_debugid && + (event->arg1 & mask->kem_args[0]) == match->kem_args[0] && + (event->arg2 & mask->kem_args[1]) == match->kem_args[1] && + (event->arg3 & mask->kem_args[2]) == match->kem_args[2] && + (event->arg4 & mask->kem_args[3]) == match->kem_args[3]; +} + +static inline struct kd_storage * +_store_read_inc(struct kd_storage *store, struct kd_bufinfo *info, + unsigned int cpu, union kds_ptr *store_ptr) +{ + store->kds_readlast++; + if (store->kds_readlast < kd_control_trace.kdebug_events_per_storage_unit) { + return store; + } + _storage_free(&kd_control_trace, &kd_buffer_trace, cpu, store_ptr->raw); + union kds_ptr oldest_ptr = info->kd_list_head; + if (oldest_ptr.raw == KDS_PTR_NULL) { + return NULL; + } + *store_ptr = oldest_ptr; + return POINTER_FROM_KDS_PTR(kd_buffer_trace.kd_bufs, oldest_ptr); +} + +static inline uint64_t +_store_earliest_timestamp( + struct kd_storage *store, + uint64_t min, + uint64_t max, + struct kd_bufinfo *info, + unsigned int cpu, + union kds_ptr store_ptr) +{ + while (true) { + uint32_t rcursor = store->kds_readlast; + if (rcursor == store->kds_bufindx) { + // Out of events to read on this store. 
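+ // UINT64_MAX tells the caller to skip this CPU for the current scan.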
+ return UINT64_MAX; + } + uint64_t t = store->kds_records[rcursor].timestamp; + if (t > max) { + return UINT64_MAX; + } else if (__improbable(t < store->kds_timestamp)) { + // This can only happen for coprocessors that haven't + // finished emitting this event, it will be processed the + // next time through. + return UINT64_MAX; + } else if (t >= min) { + return t; + } + // Skip to the next event. + store = _store_read_inc(store, info, cpu, &store_ptr); + if (!store) { + return UINT64_MAX; + } + } +} + static int -_read_merged_trace_events(user_addr_t buffer, size_t *number, vnode_t vp, - vfs_context_t ctx, bool chunk) +_read_trace_events_internal(struct kd_dest *dest, size_t event_count, + uint64_t barrier_max, bool wrapped, bool *should_disable, + size_t *events_written) +{ + bool traced_retrograde = false; + bool out_of_events = false; + bool const wrapping_enabled = !(kd_control_trace.kdc_flags & KDBG_NOWRAP); + + struct kd_bufinfo *kdbip = kd_buffer_trace.kdb_info; + struct kd_region *kd_bufs = kd_buffer_trace.kd_bufs; + + event_count = MIN(event_count, kd_buffer_trace.kdb_event_count); + + if (wrapped) { + // If buffers have wrapped, do not emit additional lost events for the + // oldest storage units. + _clear_oldest_lostevents(); + } + + uint64_t barrier_min = kd_control_trace.kdc_oldest_time; + + while (event_count && !out_of_events) { + kd_buf *tempbuf = kd_buffer_trace.kdcopybuf; + uint32_t used_count = 0; + + size_t avail_count = MIN(event_count, kd_control_trace.kdebug_kdcopybuf_count); + while (used_count < avail_count) { + bool lostevents = false; + int lostcpu = -1; + uint64_t earliest_time = UINT64_MAX; + int min_cpu = -1; + + // Find the earliest event from all the oldest storage units. + for (unsigned int cpu = 0; cpu < kd_control_trace.kdebug_cpus; cpu++) { + struct kd_bufinfo *info = &kdbip[cpu]; + union kds_ptr oldest_ptr = info->kd_list_head; + if (oldest_ptr.raw == KDS_PTR_NULL) { + continue; + } + struct kd_storage *store = POINTER_FROM_KDS_PTR(kd_bufs, oldest_ptr); + + // If the storage unit was stolen, make sure to emit a lost + // events event with the earliest time to expect an event stream + // with no gaps. + if (__improbable(store->kds_lostevents)) { + store->kds_lostevents = false; + lostevents = true; + uint64_t lost_time = store->kds_records[0].timestamp; + if (kd_control_trace.kdc_oldest_time < lost_time) { + // This time is now the oldest that can be read to + // ensure an event stream with no gaps from this point + // forward. + kd_control_trace.kdc_oldest_time = barrier_min = lost_time; + lostcpu = cpu; + } + continue; + } else if (__improbable(lostevents)) { + // On lost events, just find the latest timestamp of the + // gaps. + continue; + } + + uint64_t t = _store_earliest_timestamp(store, barrier_min, + barrier_max, info, cpu, oldest_ptr); + if (t < earliest_time) { + earliest_time = t; + min_cpu = cpu; + } + } + if (lostevents) { + wrapped = false; + // Only emit a lost events event if the user allowed wrapping. + if (wrapping_enabled) { + tempbuf[used_count++] = (kd_buf){ + .debugid = TRACE_LOST_EVENTS, + .timestamp = barrier_min, + .cpuid = lostcpu, + .arg1 = 1, + }; + } + continue; + } + if (min_cpu == -1) { + out_of_events = true; + break; + } + if (wrapped) { + // Emit a single lost events event in the case of expected + // wrapping. 
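+ // Clearing the flag ensures at most one such event is emitted per read.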
+ wrapped = false; + if (wrapping_enabled) { + tempbuf[used_count++] = (kd_buf){ + .debugid = TRACE_LOST_EVENTS, + .timestamp = barrier_min, + }; + } + } + + struct kd_bufinfo *min_info = &kdbip[min_cpu]; + union kds_ptr oldest_ptr = min_info->kd_list_head; + struct kd_storage *min_store = POINTER_FROM_KDS_PTR(kd_bufs, oldest_ptr); + kd_buf *earliest_event = &min_store->kds_records[min_store->kds_readlast]; + + if (__improbable(min_info->latest_past_event_timestamp != 0)) { + if (__improbable(kdbg_debug)) { + printf("kdebug: PAST EVENT: debugid %#8x: " + "time %lld from CPU %u " + "(barrier at time %lld)\n", + earliest_event->debugid, + min_info->latest_past_event_timestamp, min_cpu, + barrier_min); + } + tempbuf[used_count++] = (kd_buf){ + .timestamp = earliest_time, + .cpuid = min_cpu, + .arg1 = (kd_buf_argtype)min_info->latest_past_event_timestamp, + .arg2 = 0, + .arg3 = 0, + .arg4 = 0, + .debugid = TRACE_PAST_EVENTS, + }; + min_info->latest_past_event_timestamp = 0; + continue; + } + + if (__improbable(_event_should_disable(earliest_event))) { + *should_disable = true; + } + tempbuf[used_count] = *earliest_event; + (void)_store_read_inc(min_store, min_info, min_cpu, &oldest_ptr); + if (__improbable(earliest_time < min_info->kd_prev_timebase)) { + if (traced_retrograde) { + continue; + } + traced_retrograde = true; + + if (__improbable(kdbg_debug)) { + printf("kdebug: RETRO EVENT: debugid %#8x: " + "time %lld from CPU %u " + "(previous earliest at time %lld)\n", + tempbuf[used_count].debugid, + earliest_time, min_cpu, min_info->kd_prev_timebase); + } + + tempbuf[used_count] = (kd_buf){ + .timestamp = min_info->kd_prev_timebase, + .cpuid = tempbuf[used_count].cpuid, + .arg1 = tempbuf->debugid, + .arg2 = (kd_buf_argtype)earliest_time, + .arg3 = 0, + .arg4 = 0, + .debugid = TRACE_RETROGRADE_EVENTS, + }; + } else { + min_info->kd_prev_timebase = earliest_time; + } + used_count++; + } + + if (used_count > 0) { + /* + * Remember the latest timestamp of events that we've merged so we + * don't think we've lost events later. + */ + uint64_t latest_time = tempbuf[used_count - 1].timestamp; + if (kd_control_trace.kdc_oldest_time < latest_time) { + kd_control_trace.kdc_oldest_time = latest_time; + } + + int error = _send_events(dest, kd_buffer_trace.kdcopybuf, used_count); + if (error != 0) { + // XXX Why zero this when some events may have been written? + *events_written = 0; + return error; + } + event_count -= used_count; + *events_written += used_count; + } + } + return 0; +} + +// Read events from kdebug storage units into a user space buffer or file. +// +// This code runs while events are emitted -- storage unit allocation and +// deallocation will synchronize with the emitters under the storage lock. +// Otherwise, mutual exclusion for this function must be provided by the caller, +// typically using the ktrace lock. +static int +_read_trace_events(struct kd_dest *dest, size_t event_count, size_t *events_written) +{ + bool should_disable = false; + int const prev_kdebug_enable = kdebug_enable; + *events_written = 0; + if (!(kd_control_trace.kdc_flags & KDBG_BUFINIT) || kd_buffer_trace.kdcopybuf == NULL) { + return EINVAL; + } + thread_set_eager_preempt(current_thread()); + + /* + * Capture the current time. Only sort events that have occured + * before now. Since the IOPs are being flushed here, it is possible + * that events occur on the AP while running live tracing. 
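+ * Events stamped after this barrier are left in place and picked up by a
+ * later read.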
+ */ + uint64_t barrier_max = kdebug_timestamp() & KDBG_TIMESTAMP_MASK; + + // Disable wrap so storage units cannot be stolen while inspecting events. + // + // With ktrace_lock held, no other control threads can be modifying + // kdc_flags. The code that emits new events could be running, but + // acquiring new storage units requires holding the storage lock, and it + // looks at the flags there. The only issue is if events are being written + // to the same chunk being read from. + kdebug_emit_filter_t old_emit; + kdebug_live_flags_t old_live_flags; + bool wrapped = _reading_set_flags(&kd_control_trace, &old_emit, &old_live_flags); + bool const no_wrapping = old_live_flags & KDBG_NOWRAP; + int error = _read_trace_events_internal(dest, event_count, barrier_max, + wrapped, &should_disable, events_written); + bool disabled_during_read = _reading_restore_flags(&kd_control_trace, old_emit, + old_live_flags); + should_disable = should_disable || (disabled_during_read && no_wrapping); + + thread_clear_eager_preempt(current_thread()); + + if (should_disable) { + kernel_debug_disable(); + } else if (disabled_during_read && !no_wrapping && old_emit) { + kd_control_trace.kdc_emit = old_emit; + kdebug_enable = prev_kdebug_enable; + kd_control_trace.enabled = 1; + commpage_update_kdebug_state(); + } + + return error; +} + +static int +_read_merged_trace_events(struct kd_dest *dest, size_t event_count, size_t *events_written) { ktrace_assert_lock_held(); - size_t count = *number / sizeof(kd_buf); - if (count == 0 || !(kd_control_trace.kdc_flags & KDBG_BUFINIT) || + if (event_count == 0 || !(kd_control_trace.kdc_flags & KDBG_BUFINIT) || kd_buffer_trace.kdcopybuf == 0) { - *number = 0; + *events_written = 0; return EINVAL; } // Before merging, make sure coprocessors have provided up-to-date events. _coproc_list_callback(KD_CALLBACK_SYNC_FLUSH, NULL); - return kernel_debug_read(&kd_control_trace, &kd_buffer_trace, buffer, - number, vp, ctx, chunk); + return _read_trace_events(dest, event_count, events_written); } struct event_chunk_header { @@ -2275,69 +2712,184 @@ struct event_chunk_header { }; static int -_write_event_chunk_header(user_addr_t udst, vnode_t vp, vfs_context_t ctx, - uint64_t length) +_send_data_vfs(struct kd_dest *dest, const void *src, size_t size) +{ + assert(size < INT_MAX); + assert(dest->kdd_kind == KD_DEST_VFS); + return vn_rdwr(UIO_WRITE, dest->kdd_vnode, (caddr_t)(uintptr_t)src, + (int)size, dest->kdd_cur_offset, UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, + vfs_context_ucred(&dest->kdd_vfs_ctx), (int *) 0, + vfs_context_proc(&dest->kdd_vfs_ctx)); +} + +static int +_send_data(struct kd_dest *dest, const void *src, size_t size) +{ + int error = 0; + switch (dest->kdd_kind) { + case KD_DEST_COPYOUT: + if (size > dest->kdd_user_size - dest->kdd_cur_offset) { + return ERANGE; + } + error = copyout(src, dest->kdd_user_buffer + dest->kdd_cur_offset, size); + break; + case KD_DEST_VFS: + error = _send_data_vfs(dest, src, size); + // XXX Previous code flushed with `VNOP_FSYNC` every 2MB, still needed? 
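+ // (RAW_FLUSH_SIZE and kdd_file_written_since_flush are declared above.)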
+ break; + default: + panic("kdebug: unrecognized destination %d", dest->kdd_kind); + } + if (error == 0) { + dest->kdd_cur_offset += size; + } + return error; +} + +static int +_send_event_chunk_header(struct kd_dest *dest, size_t event_count) { struct event_chunk_header header = { .tag = V3_RAW_EVENTS, .sub_tag = 1, - .length = length, + .length = event_count * sizeof(kd_buf), }; - if (vp) { - assert(udst == USER_ADDR_NULL); - assert(ctx != NULL); - int error = kdbg_write_to_vnode((caddr_t)&header, sizeof(header), vp, - ctx, RAW_file_offset); - if (0 == error) { - RAW_file_offset += sizeof(header); - } - return error; - } else { - assert(udst != USER_ADDR_NULL); - return copyout(&header, udst, sizeof(header)); - } + return _send_data(dest, &header, sizeof(header)); } int -kernel_debug_trace_write_to_file(user_addr_t *buffer, size_t *number, - size_t *count, size_t tempbuf_number, vnode_t vp, vfs_context_t ctx, - bool chunk) +_send_events(struct kd_dest *dest, const void *src, size_t event_count) { - int error = 0; - - if (chunk) { - error = _write_event_chunk_header(*buffer, vp, ctx, - tempbuf_number * sizeof(kd_buf)); - if (error) { + if (dest->kdd_chunk_format) { + int error = _send_event_chunk_header(dest, event_count); + if (error != 0) { return error; } - if (buffer) { - *buffer += sizeof(struct event_chunk_header); - } - - assert(*count >= sizeof(struct event_chunk_header)); - *count -= sizeof(struct event_chunk_header); - *number += sizeof(struct event_chunk_header); } - if (vp) { - size_t write_size = tempbuf_number * sizeof(kd_buf); - error = kdbg_write_to_vnode((caddr_t)kd_buffer_trace.kdcopybuf, - write_size, vp, ctx, RAW_file_offset); - if (!error) { - RAW_file_offset += write_size; - } + return _send_data(dest, src, event_count * sizeof(kd_buf)); +} - if (RAW_file_written >= RAW_FLUSH_SIZE) { - error = VNOP_FSYNC(vp, MNT_NOWAIT, ctx); +static int +_write_legacy_header(bool write_thread_map, struct kd_dest *dest) +{ + uint32_t pad_size; + uint32_t extra_thread_count = 0; + uint32_t cpumap_size; + size_t map_size = 0; + uint32_t map_count = 0; - RAW_file_written = 0; + if (write_thread_map) { + assert(kd_control_trace.kdc_flags & KDBG_MAPINIT); + if (kd_mapcount > UINT32_MAX) { + return ERANGE; + } + map_count = (uint32_t)kd_mapcount; + if (os_mul_overflow(map_count, sizeof(kd_threadmap), &map_size)) { + return ERANGE; + } + if (map_size >= INT_MAX) { + return ERANGE; } - } else { - error = copyout(kd_buffer_trace.kdcopybuf, *buffer, tempbuf_number * sizeof(kd_buf)); - *buffer += (tempbuf_number * sizeof(kd_buf)); } + /* + * Without the buffers initialized, we cannot construct a CPU map or a + * thread map, and cannot write a header. + */ + if (!(kd_control_trace.kdc_flags & KDBG_BUFINIT)) { + return EINVAL; + } + + /* + * To write a RAW_VERSION1+ file, we must embed a cpumap in the + * "padding" used to page align the events following the threadmap. If + * the threadmap happens to not require enough padding, we artificially + * increase its footprint until it needs enough padding. + */ + + pad_size = 16384 - ((sizeof(RAW_header) + map_size) & PAGE_MASK); + cpumap_size = sizeof(kd_cpumap_header) + kd_control_trace.kdebug_cpus * sizeof(kd_cpumap); + + if (cpumap_size > pad_size) { + /* If the cpu map doesn't fit in the current available pad_size, + * we increase the pad_size by 16K. We do this so that the event + * data is always available on a page aligned boundary for both + * 4k and 16k systems. 
We enforce this alignment for the event + * data so that we can take advantage of optimized file/disk writes. + */ + pad_size += 16384; + } + + /* The way we are silently embedding a cpumap in the "padding" is by artificially + * increasing the number of thread entries. However, we'll also need to ensure that + * the cpumap is embedded in the last 4K page before when the event data is expected. + * This way the tools can read the data starting the next page boundary on both + * 4K and 16K systems preserving compatibility with older versions of the tools + */ + if (pad_size > 4096) { + pad_size -= 4096; + extra_thread_count = (pad_size / sizeof(kd_threadmap)) + 1; + } + + int error = 0; + do { + clock_sec_t secs; + clock_usec_t usecs; + clock_get_calendar_microtime(&secs, &usecs); + RAW_header header = { + .version_no = RAW_VERSION1, + .thread_count = map_count + extra_thread_count, + .TOD_secs = secs, + .TOD_usecs = usecs, + }; + error = _send_data(dest, &header, sizeof(header)); + if (error != 0) { + break; + } + + if (write_thread_map) { + error = _send_data(dest, kd_mapptr, map_size); + if (error != 0) { + break; + } + } + + if (extra_thread_count) { + pad_size = extra_thread_count * sizeof(kd_threadmap); + void *pad_buf = kalloc_data(pad_size, Z_WAITOK | Z_ZERO); + if (!pad_buf) { + error = ENOMEM; + break; + } + error = _send_data(dest, pad_buf, pad_size); + if (error != 0) { + break; + } + } + + pad_size = PAGE_SIZE - (dest->kdd_cur_offset & PAGE_MASK); + if (pad_size) { + void *pad_buf = kalloc_data(pad_size, Z_WAITOK | Z_ZERO); + if (!pad_buf) { + error = ENOMEM; + break; + } + + /* + * Embed the CPU map in the padding bytes -- old code will skip it, + * while newer code knows it's there. + */ + size_t temp = pad_size; + (void)_copy_cpu_map(RAW_VERSION1, &pad_buf, &temp); + error = _send_data(dest, pad_buf, pad_size); + kfree_data(pad_buf, pad_size); + if (error != 0) { + break; + } + } + } while (false); + return error; } @@ -2348,7 +2900,6 @@ _kd_sysctl_internal(int op, int value, user_addr_t where, size_t *sizep) { size_t size = *sizep; kd_regtype kd_Reg; - proc_t p; bool read_only = (op == KERN_KDGETBUF || op == KERN_KDREADCURTHRMAP); int perm_error = read_only ? 
ktrace_read_check() : @@ -2360,17 +2911,13 @@ _kd_sysctl_internal(int op, int value, user_addr_t where, size_t *sizep) switch (op) { case KERN_KDGETBUF:; pid_t owning_pid = ktrace_get_owning_pid(); - kbufinfo_t info = { + const kbufinfo_t info = { .nkdbufs = kd_buffer_trace.kdb_event_count, .nkdthreads = (int)MIN(kd_mapcount, INT_MAX), .nolog = kd_control_trace.kdc_emit == KDEMIT_DISABLE, - .flags = kd_control_trace.kdc_flags | kd_control_trace.kdc_live_flags, + .flags = kd_control_trace.kdc_flags | kd_control_trace.kdc_live_flags | KDBG_LP64, .bufid = owning_pid ?: -1, }; -#if defined(__LP64__) - info.flags |= KDBG_LP64; -#endif // defined(__LP64__) - size = MIN(size, sizeof(info)); return copyout(&info, where, size); case KERN_KDREADCURTHRMAP: @@ -2418,17 +2965,19 @@ _kd_sysctl_internal(int op, int value, user_addr_t where, size_t *sizep) return kdbg_setreg(&kd_Reg); case KERN_KDGETREG: return EINVAL; - case KERN_KDREADTR: - return _read_merged_trace_events(where, sizep, NULL, NULL, false); + case KERN_KDREADTR: { + struct kd_dest copy_dest = kd_dest_copyout(where, *sizep); + size_t event_count = *sizep / sizeof(kd_buf); + size_t events_written = 0; + int error = _read_merged_trace_events(©_dest, event_count, &events_written); + *sizep = events_written; + return error; + } case KERN_KDWRITETR: case KERN_KDWRITETR_V3: case KERN_KDWRITEMAP: { - struct vfs_context context; - struct fileproc *fp; - size_t number; - vnode_t vp; - int fd; - int ret = 0; + struct kd_dest write_dest = {}; + int fd = value; if (op == KERN_KDWRITETR || op == KERN_KDWRITETR_V3) { (void)kdbg_wait(size); @@ -2440,38 +2989,31 @@ _kd_sysctl_internal(int op, int value, user_addr_t where, size_t *sizep) } } - p = current_proc(); - fd = value; - - if (fp_get_ftype(p, fd, DTYPE_VNODE, EBADF, &fp)) { - return EBADF; + struct fileproc *fp; + int error = kd_dest_init_write(&write_dest, fd, &fp); + if (error != 0) { + return error; } - - vp = fp_get_data(fp); - context.vc_thread = current_thread(); - context.vc_ucred = fp->fp_glob->fg_cred; - - if ((ret = vnode_getwithref(vp)) == 0) { - RAW_file_offset = fp->fp_glob->fg_offset; - if (op == KERN_KDWRITETR || op == KERN_KDWRITETR_V3) { - number = kd_buffer_trace.kdb_event_count * sizeof(kd_buf); - - KDBG_RELEASE(TRACE_WRITING_EVENTS | DBG_FUNC_START); - ret = _read_merged_trace_events(0, &number, vp, &context, - op == KERN_KDWRITETR_V3); - KDBG_RELEASE(TRACE_WRITING_EVENTS | DBG_FUNC_END, number); - - *sizep = number; - } else { - number = kd_mapcount * sizeof(kd_threadmap); - ret = kdbg_write_thread_map(vp, &context); + if (op == KERN_KDWRITETR || op == KERN_KDWRITETR_V3) { + size_t event_count = kd_buffer_trace.kdb_event_count; + size_t events_written = 0; + if (op == KERN_KDWRITETR_V3) { + write_dest.kdd_chunk_format = true; } - fp->fp_glob->fg_offset = RAW_file_offset; - vnode_put(vp); - } - fp_drop(p, fd, fp, 0); - return ret; + KDBG_RELEASE(TRACE_WRITING_EVENTS | DBG_FUNC_START); + error = _read_merged_trace_events(&write_dest, event_count, + &events_written); + KDBG_RELEASE(TRACE_WRITING_EVENTS | DBG_FUNC_END, events_written); + *sizep = events_written; + } else { + error = kdbg_write_thread_map(&write_dest); + if (error == 0) { + *sizep = kd_mapcount * sizeof(kd_threadmap); + } + } + kd_dest_finish_write(&write_dest, fp, fd); + return error; } case KERN_KDBUFWAIT: *sizep = kdbg_wait(size); @@ -2858,7 +3400,7 @@ kdebug_trace_start(unsigned int n_events, const char *filter_desc, * the earliest events, at the expense of later ones. 
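+ * (The KDBG_NOWRAP bit set here is latched into kdc_live_flags when
+ * tracing is enabled.)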
*/ if ((opts & KDOPT_WRAPPING) == 0) { - kd_control_trace.kdc_live_flags |= KDBG_NOWRAP; + kd_control_trace.kdc_flags |= KDBG_NOWRAP; } if (filter_desc && filter_desc[0] != '\0') { @@ -2900,7 +3442,6 @@ kdbg_dump_trace_to_file(const char *filename, bool reenable) { vfs_context_t ctx; vnode_t vp; - size_t write_size; int ret; int reenable_trace = 0; @@ -2929,15 +3470,20 @@ kdbg_dump_trace_to_file(const char *filename, bool reenable) commpage_update_kdebug_state(); ctx = vfs_context_kernel(); - if (vnode_open(filename, (O_CREAT | FWRITE | O_NOFOLLOW), 0600, 0, &vp, ctx)) { goto out; } + struct kd_dest file_dest = { + .kdd_kind = KD_DEST_VFS, + .kdd_vnode = vp, + .kdd_vfs_ctx = *ctx, + }; - kdbg_write_thread_map(vp, ctx); + kdbg_write_thread_map(&file_dest); - write_size = kd_buffer_trace.kdb_event_count * sizeof(kd_buf); - ret = _read_merged_trace_events(0, &write_size, vp, ctx, false); + size_t events_written = 0; + ret = _read_merged_trace_events(&file_dest, kd_buffer_trace.kdb_event_count, + &events_written); if (ret) { goto out_close; } @@ -2956,16 +3502,14 @@ kdbg_dump_trace_to_file(const char *filename, bool reenable) */ kd_buf end_event = { .debugid = TRACE_WRITING_EVENTS | DBG_FUNC_END, - .arg1 = write_size, + .arg1 = events_written, .arg2 = ret, .arg5 = (kd_buf_argtype)thread_tid(current_thread()), + .timestamp = kdebug_timestamp(), + .cpuid = cpu_number(), }; - kdbg_set_timestamp_and_cpu(&end_event, kdebug_timestamp(), - cpu_number()); - /* this is best effort -- ignore any errors */ - (void)kdbg_write_to_vnode((caddr_t)&end_event, sizeof(kd_buf), vp, ctx, - RAW_file_offset); + (void)_send_data_vfs(&file_dest, &end_event, sizeof(kd_buf)); out_close: vnode_close(vp, FWRITE, ctx); diff --git a/bsd/kern/kdebug_common.c b/bsd/kern/kdebug_common.c index 8523a9e9d..38be5880b 100644 --- a/bsd/kern/kdebug_common.c +++ b/bsd/kern/kdebug_common.c @@ -58,6 +58,7 @@ TUNABLE(unsigned int, write_trace_on_panic, "trace_panic", 0); // Obsolete leak logging system. TUNABLE(int, log_leaks, "-l", 0); +__startup_func void kdebug_startup(void) { @@ -137,7 +138,7 @@ create_buffers( kd_data_page->kdb_region_count++; } - if (kd_data_page->kdcopybuf == 0) { + if (kd_ctrl_page->kdebug_kdcopybuf_size > 0 && kd_data_page->kdcopybuf == NULL) { if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_data_page->kdcopybuf, (vm_size_t) kd_ctrl_page->kdebug_kdcopybuf_size, KMA_DATA | KMA_ZERO, tag) != KERN_SUCCESS) { @@ -252,9 +253,20 @@ delete_buffers(struct kd_control *kd_ctrl_page, kd_ctrl_page->kdc_flags &= ~KDBG_BUFINIT; } -static bool -allocate_storage_unit(struct kd_control *kd_ctrl_page, - struct kd_buffer *kd_data_page, int cpu) +static void +_register_out_of_space(struct kd_control *kd_ctrl_page) +{ + kd_ctrl_page->kdc_emit = KDEMIT_DISABLE; + kdebug_enable = 0; + kd_ctrl_page->enabled = 0; + commpage_update_kdebug_state(); +} + +bool +kdebug_storage_alloc( + struct kd_control *kd_ctrl_page, + struct kd_buffer *kd_data_page, + int cpu) { union kds_ptr kdsp; struct kd_storage *kdsp_actual, *kdsp_next_actual; @@ -292,11 +304,8 @@ allocate_storage_unit(struct kd_control *kd_ctrl_page, * storage unit we can find. 
*/ if (kd_ctrl_page->kdc_live_flags & KDBG_NOWRAP) { - kd_ctrl_page->kdc_emit = KDEMIT_DISABLE; + _register_out_of_space(kd_ctrl_page); kd_ctrl_page->kdc_live_flags |= KDBG_WRAPPED; - kdebug_enable = 0; - kd_ctrl_page->enabled = 0; - commpage_update_kdebug_state(); kdbp->kd_lostevents = true; retval = false; goto out; @@ -339,10 +348,7 @@ allocate_storage_unit(struct kd_control *kd_ctrl_page, } } if (kdbp_vict == NULL && kd_ctrl_page->mode == KDEBUG_MODE_TRACE) { - kd_ctrl_page->kdc_emit = KDEMIT_DISABLE; - kdebug_enable = 0; - kd_ctrl_page->enabled = 0; - commpage_update_kdebug_state(); + _register_out_of_space(kd_ctrl_page); retval = false; goto out; } @@ -388,607 +394,3 @@ out: return retval; } - -static void -release_storage_unit(struct kd_control *kd_ctrl_page, struct kd_buffer *kd_data_page, int cpu, uint32_t kdsp_raw) -{ - struct kd_storage *kdsp_actual; - struct kd_bufinfo *kdbp; - union kds_ptr kdsp; - - kdbp = &kd_data_page->kdb_info[cpu]; - - kdsp.raw = kdsp_raw; - - int intrs_en = kdebug_storage_lock(kd_ctrl_page); - - if (kdsp.raw == kdbp->kd_list_head.raw) { - /* - * it's possible for the storage unit pointed to - * by kdsp to have already been stolen... so - * check to see if it's still the head of the list - * now that we're behind the lock that protects - * adding and removing from the queue... - * since we only ever release and steal units from - * that position, if it's no longer the head - * we having nothing to do in this context - */ - kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kdsp); - kdbp->kd_list_head = kdsp_actual->kds_next; - - kdsp_actual->kds_next = kd_ctrl_page->kds_free_list; - kd_ctrl_page->kds_free_list = kdsp; - - kd_ctrl_page->kdc_storage_used--; - } - - kdebug_storage_unlock(kd_ctrl_page, intrs_en); -} - -bool -kdebug_disable_wrap(struct kd_control *ctl, - kdebug_emit_filter_t *old_emit, kdebug_live_flags_t *old_live) -{ - int intrs_en = kdebug_storage_lock(ctl); - - *old_emit = ctl->kdc_emit; - *old_live = ctl->kdc_live_flags; - - bool wrapped = ctl->kdc_live_flags & KDBG_WRAPPED; - ctl->kdc_live_flags &= ~KDBG_WRAPPED; - ctl->kdc_live_flags |= KDBG_NOWRAP; - - kdebug_storage_unlock(ctl, intrs_en); - - return wrapped; -} - -static void -_enable_wrap(struct kd_control *kd_ctrl_page, kdebug_emit_filter_t emit) -{ - int intrs_en = kdebug_storage_lock(kd_ctrl_page); - kd_ctrl_page->kdc_live_flags &= ~KDBG_NOWRAP; - if (emit) { - kd_ctrl_page->kdc_emit = emit; - } - kdebug_storage_unlock(kd_ctrl_page, intrs_en); -} - -__attribute__((always_inline)) -void -kernel_debug_write(struct kd_control *kd_ctrl_page, - struct kd_buffer *kd_data_page, - struct kd_record kd_rec) -{ - uint64_t now = 0; - uint32_t bindx; - kd_buf *kd; - int cpu; - struct kd_bufinfo *kdbp; - struct kd_storage *kdsp_actual; - union kds_ptr kds_raw; - - disable_preemption(); - - if (kd_ctrl_page->enabled == 0) { - goto out; - } - - if (kd_rec.cpu == -1) { - cpu = cpu_number(); - } else { - cpu = kd_rec.cpu; - } - - kdbp = &kd_data_page->kdb_info[cpu]; - - bool timestamp_is_continuous = kdbp->continuous_timestamps; - - if (kd_rec.timestamp != -1) { - if (kdebug_using_continuous_time()) { - if (!timestamp_is_continuous) { - kd_rec.timestamp = absolutetime_to_continuoustime(kd_rec.timestamp); - } - } else { - if (timestamp_is_continuous) { - kd_rec.timestamp = continuoustime_to_absolutetime(kd_rec.timestamp); - } - } - kd_rec.timestamp &= KDBG_TIMESTAMP_MASK; - if (kd_rec.timestamp < kd_ctrl_page->kdc_oldest_time) { - if (kdbp->latest_past_event_timestamp < kd_rec.timestamp) { - 
kdbp->latest_past_event_timestamp = kd_rec.timestamp; - } - goto out; - } - } - -retry_q: - kds_raw = kdbp->kd_list_tail; - - if (kds_raw.raw != KDS_PTR_NULL) { - kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kds_raw); - bindx = kdsp_actual->kds_bufindx; - } else { - kdsp_actual = NULL; - bindx = kd_ctrl_page->kdebug_events_per_storage_unit; - } - - if (kdsp_actual == NULL || bindx >= kd_ctrl_page->kdebug_events_per_storage_unit) { - if (allocate_storage_unit(kd_ctrl_page, kd_data_page, cpu) == false) { - /* - * this can only happen if wrapping - * has been disabled - */ - goto out; - } - goto retry_q; - } - - if (kd_rec.timestamp != -1) { - /* - * IOP entries can be allocated before xnu allocates and inits the buffer - * And, Intel uses a special 0 value as a early tracing timestamp sentinel - * to set the start of trace-time-start-of-interest. - */ - if (kd_rec.timestamp < kdsp_actual->kds_timestamp) { - kdsp_actual->kds_timestamp = kd_rec.timestamp; - } - now = kd_rec.timestamp; - } else { - if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) { - now = kdebug_timestamp() & KDBG_TIMESTAMP_MASK; - } else { - now = mach_continuous_time() & KDBG_TIMESTAMP_MASK; - } - } - - if (!OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) { - goto retry_q; - } - - kd = &kdsp_actual->kds_records[bindx]; - - if (kd_ctrl_page->kdc_flags & KDBG_DEBUGID_64) { - /*DebugID has been passed in arg 4*/ - kd->debugid = 0; - } else { - kd->debugid = kd_rec.debugid; - } - - kd->arg1 = kd_rec.arg1; - kd->arg2 = kd_rec.arg2; - kd->arg3 = kd_rec.arg3; - kd->arg4 = kd_rec.arg4; - kd->arg5 = kd_rec.arg5; - - kdbg_set_timestamp_and_cpu(kd, now, cpu); - - OSAddAtomic(1, &kdsp_actual->kds_bufcnt); - -out: - enable_preemption(); -} - -// Read events from kdebug storage units into a user space buffer or file. -// -// This code runs while events are emitted -- storage unit allocation and -// deallocation wll synchronize with the emitters. Only one reader per control -// structure is allowed. -int -kernel_debug_read(struct kd_control *kd_ctrl_page, - struct kd_buffer *kd_data_page, user_addr_t buffer, size_t *number, - vnode_t vp, vfs_context_t ctx, uint32_t file_version) -{ - size_t count; - unsigned int cpu, min_cpu; - uint64_t barrier_min = 0, barrier_max = 0, t, earliest_time; - int error = 0; - kd_buf *tempbuf; - uint32_t rcursor; - kd_buf lostevent; - union kds_ptr kdsp; - bool traced_retrograde = false; - struct kd_storage *kdsp_actual; - struct kd_bufinfo *kdbp; - struct kd_bufinfo *min_kdbp; - size_t tempbuf_count; - uint32_t tempbuf_number; - kdebug_emit_filter_t old_emit; - uint32_t old_live_flags; - bool out_of_events = false; - bool wrapped = false; - bool set_preempt = true; - bool should_disable = false; - - struct kd_bufinfo *kdbip = kd_data_page->kdb_info; - struct kd_region *kd_bufs = kd_data_page->kd_bufs; - - assert(number != NULL); - count = *number / sizeof(kd_buf); - *number = 0; - - if (count == 0 || !(kd_ctrl_page->kdc_flags & KDBG_BUFINIT) || kd_data_page->kdcopybuf == 0) { - return EINVAL; - } - - if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) { - /* - * A corpse can be created due to 'TASK_HAS_TOO_MANY_THREADS' - * and that can be handled by a callout thread that already - * has the eager-preemption set. - * So check to see if we are dealing with one such thread. 
- */ - set_preempt = !(thread_is_eager_preempt(current_thread())); - } - - if (set_preempt) { - thread_set_eager_preempt(current_thread()); - } - - memset(&lostevent, 0, sizeof(lostevent)); - lostevent.debugid = TRACE_LOST_EVENTS; - - /* - * Capture the current time. Only sort events that have occured - * before now. Since the IOPs are being flushed here, it is possible - * that events occur on the AP while running live tracing. - */ - if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) { - barrier_max = kdebug_timestamp() & KDBG_TIMESTAMP_MASK; - } else { - barrier_max = mach_continuous_time() & KDBG_TIMESTAMP_MASK; - } - - /* - * Disable wrap so storage units cannot be stolen out from underneath us - * while merging events. - * - * Because we hold ktrace_lock, no other control threads can be playing - * with kdc_flags. The code that emits new events could be running, - * but it grabs kdc_storage_lock if it needs to acquire a new storage - * chunk, which is where it examines kdc_flags. If it is adding to - * the same chunk we're reading from, check for that below. - */ - wrapped = kdebug_disable_wrap(kd_ctrl_page, &old_emit, &old_live_flags); - - if (count > kd_data_page->kdb_event_count) { - count = kd_data_page->kdb_event_count; - } - - if ((tempbuf_count = count) > kd_ctrl_page->kdebug_kdcopybuf_count) { - tempbuf_count = kd_ctrl_page->kdebug_kdcopybuf_count; - } - - /* - * If the buffers have wrapped, do not emit additional lost events for the - * oldest storage units. - */ - if (wrapped) { - kd_ctrl_page->kdc_live_flags &= ~KDBG_WRAPPED; - - for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) { - if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { - continue; - } - kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp); - kdsp_actual->kds_lostevents = false; - } - } - - if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) { - /* - * In TRIAGE mode we want to extract all the current - * records regardless of where we stopped reading last - * time so that we have the best shot at getting older - * records for threads before the buffers are wrapped. - * So set:- - * a) kd_prev_timebase to 0 so we (re-)consider older records - * b) readlast to 0 to initiate the search from the - * 1st record. - */ - for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) { - kdbp->kd_prev_timebase = 0; - if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { - continue; - } - kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp); - kdsp_actual->kds_readlast = 0; - } - } - - /* - * Capture the earliest time where there are events for all CPUs and don't - * emit events with timestamps prior. - */ - barrier_min = kd_ctrl_page->kdc_oldest_time; - - while (count) { - tempbuf = kd_data_page->kdcopybuf; - tempbuf_number = 0; - - if (wrapped) { - /* - * Emit a lost events tracepoint to indicate that previous events - * were lost -- the thread map cannot be trusted. A new one must - * be taken so tools can analyze the trace in a backwards-facing - * fashion. - */ - kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0); - *tempbuf = lostevent; - wrapped = false; - goto nextevent; - } - - /* While space left in merged events scratch buffer. */ - while (tempbuf_count) { - bool lostevents = false; - int lostcpu = 0; - earliest_time = UINT64_MAX; - min_kdbp = NULL; - min_cpu = 0; - - /* Check each CPU's buffers for the earliest event. */ - for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) { - /* Skip CPUs without data in their oldest storage unit. 
*/ - if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { -next_cpu: - continue; - } - /* From CPU data to buffer header to buffer. */ - kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp); - -next_event: - /* The next event to be read from this buffer. */ - rcursor = kdsp_actual->kds_readlast; - - /* Skip this buffer if there are no events left. */ - if (rcursor == kdsp_actual->kds_bufindx) { - continue; - } - - if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) { - /* - * TRIAGE mode record keeping doesn't (currently) - * use lostevent markers. It also doesn't want to - * call release_storage_unit() in this read call. - * It expects the buffers to wrap and records reclaimed - * in that way solely. - */ - t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); - goto skip_record_checks; - } - - /* - * Check that this storage unit wasn't stolen and events were - * lost. This must have happened while wrapping was disabled - * in this function. - */ - if (kdsp_actual->kds_lostevents) { - lostevents = true; - kdsp_actual->kds_lostevents = false; - - /* - * The earliest event we can trust is the first one in this - * stolen storage unit. - */ - uint64_t lost_time = - kdbg_get_timestamp(&kdsp_actual->kds_records[0]); - if (kd_ctrl_page->kdc_oldest_time < lost_time) { - /* - * If this is the first time we've seen lost events for - * this gap, record its timestamp as the oldest - * timestamp we're willing to merge for the lost events - * tracepoint. - */ - kd_ctrl_page->kdc_oldest_time = barrier_min = lost_time; - lostcpu = cpu; - } - } - - t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); - - if (t > barrier_max) { - goto next_cpu; - } - if (t < kdsp_actual->kds_timestamp) { - /* - * This indicates the event emitter hasn't completed - * filling in the event (becuase we're looking at the - * buffer that the record head is using). The max barrier - * timestamp should have saved us from seeing these kinds - * of things, but other CPUs might be slow on the up-take. - * - * Bail out so we don't get out-of-order events by - * continuing to read events from other CPUs' events. - */ - out_of_events = true; - break; - } - - /* - * Ignore events that have aged out due to wrapping or storage - * unit exhaustion while merging events. - */ - if (t < barrier_min) { - kdsp_actual->kds_readlast++; - if (kdsp_actual->kds_readlast >= kd_ctrl_page->kdebug_events_per_storage_unit) { - release_storage_unit(kd_ctrl_page, kd_data_page, cpu, kdsp.raw); - - if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { - goto next_cpu; - } - kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp); - } - goto next_event; - } - - /* - * Don't worry about merging any events -- just walk through - * the CPUs and find the latest timestamp of lost events. - */ - if (lostevents) { - continue; - } -skip_record_checks: - if (t < earliest_time) { - earliest_time = t; - min_kdbp = kdbp; - min_cpu = cpu; - } - } - if (lostevents) { - /* - * If any lost events were hit in the buffers, emit an event - * with the latest timestamp. - */ - kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, lostcpu); - *tempbuf = lostevent; - tempbuf->arg1 = 1; - goto nextevent; - } - if (min_kdbp == NULL) { - /* All buffers ran empty. 
*/ - out_of_events = true; - } - if (out_of_events) { - break; - } - - kdsp = min_kdbp->kd_list_head; - kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp); - - if (min_kdbp->latest_past_event_timestamp != 0) { - if (kdbg_debug) { - printf("kdebug: PAST EVENT: debugid %#8x: " - "time %lld from CPU %u " - "(barrier at time %lld)\n", - kdsp_actual->kds_records[rcursor].debugid, - t, cpu, barrier_min); - } - - kdbg_set_timestamp_and_cpu(tempbuf, earliest_time, min_cpu); - tempbuf->arg1 = (kd_buf_argtype)min_kdbp->latest_past_event_timestamp; - tempbuf->arg2 = 0; - tempbuf->arg3 = 0; - tempbuf->arg4 = 0; - tempbuf->debugid = TRACE_PAST_EVENTS; - min_kdbp->latest_past_event_timestamp = 0; - goto nextevent; - } - - /* Copy earliest event into merged events scratch buffer. */ - *tempbuf = kdsp_actual->kds_records[kdsp_actual->kds_readlast++]; - kd_buf *earliest_event = tempbuf; - if (kd_control_trace.kdc_flags & KDBG_MATCH_DISABLE) { - kd_event_matcher *match = &kd_control_trace.disable_event_match; - kd_event_matcher *mask = &kd_control_trace.disable_event_mask; - if ((earliest_event->debugid & mask->kem_debugid) == match->kem_debugid && - (earliest_event->arg1 & mask->kem_args[0]) == match->kem_args[0] && - (earliest_event->arg2 & mask->kem_args[1]) == match->kem_args[1] && - (earliest_event->arg3 & mask->kem_args[2]) == match->kem_args[2] && - (earliest_event->arg4 & mask->kem_args[3]) == match->kem_args[3]) { - should_disable = true; - } - } - - if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) { - if (kdsp_actual->kds_readlast == kd_ctrl_page->kdebug_events_per_storage_unit) { - release_storage_unit(kd_ctrl_page, kd_data_page, min_cpu, kdsp.raw); - } - } - - /* - * Watch for out of order timestamps (from IOPs). - */ - if (earliest_time < min_kdbp->kd_prev_timebase) { - /* - * If we haven't already, emit a retrograde events event. - * Otherwise, ignore this event. - */ - if (traced_retrograde) { - continue; - } - if (kdbg_debug) { - printf("kdebug: RETRO EVENT: debugid %#8x: " - "time %lld from CPU %u " - "(barrier at time %lld)\n", - kdsp_actual->kds_records[rcursor].debugid, - t, cpu, barrier_min); - } - - kdbg_set_timestamp_and_cpu(tempbuf, min_kdbp->kd_prev_timebase, - kdbg_get_cpu(tempbuf)); - tempbuf->arg1 = tempbuf->debugid; - tempbuf->arg2 = (kd_buf_argtype)earliest_time; - tempbuf->arg3 = 0; - tempbuf->arg4 = 0; - tempbuf->debugid = TRACE_RETROGRADE_EVENTS; - traced_retrograde = true; - } else { - min_kdbp->kd_prev_timebase = earliest_time; - } -nextevent: - tempbuf_count--; - tempbuf_number++; - tempbuf++; - - if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE && - (RAW_file_written += sizeof(kd_buf)) >= RAW_FLUSH_SIZE) { - break; - } - } - - if (tempbuf_number) { - /* - * Remember the latest timestamp of events that we've merged so we - * don't think we've lost events later. 
- */ - uint64_t latest_time = kdbg_get_timestamp(tempbuf - 1); - if (kd_ctrl_page->kdc_oldest_time < latest_time) { - kd_ctrl_page->kdc_oldest_time = latest_time; - } - - if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) { - extern int kernel_debug_trace_write_to_file(user_addr_t *buffer, - size_t *number, size_t *count, size_t tempbuf_number, - vnode_t vp, vfs_context_t ctx, uint32_t file_version); - error = kernel_debug_trace_write_to_file(&buffer, number, - &count, tempbuf_number, vp, ctx, file_version); - } else if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) { - memcpy((void*)buffer, kd_data_page->kdcopybuf, - tempbuf_number * sizeof(kd_buf)); - buffer += tempbuf_number * sizeof(kd_buf); - } else { - panic("kdebug: invalid kdebug mode %d", kd_ctrl_page->mode); - } - if (error) { - *number = 0; - error = EINVAL; - break; - } - count -= tempbuf_number; - *number += tempbuf_number; - } - if (out_of_events) { - break; - } - - if ((tempbuf_count = count) > kd_ctrl_page->kdebug_kdcopybuf_count) { - tempbuf_count = kd_ctrl_page->kdebug_kdcopybuf_count; - } - } - if ((old_live_flags & KDBG_NOWRAP) == 0) { - _enable_wrap(kd_ctrl_page, old_emit); - } - - if (set_preempt) { - thread_clear_eager_preempt(current_thread()); - } - - if (should_disable) { - kernel_debug_disable(); - } - - return error; -} diff --git a/bsd/kern/kdebug_triage.c b/bsd/kern/kdebug_triage.c index 575d13982..f07f9d60e 100644 --- a/bsd/kern/kdebug_triage.c +++ b/bsd/kern/kdebug_triage.c @@ -22,6 +22,7 @@ #include #include +#include #define TRIAGE_KDCOPYBUF_COUNT 128 #define TRIAGE_KDCOPYBUF_SIZE (TRIAGE_KDCOPYBUF_COUNT * sizeof(kd_buf)) @@ -31,8 +32,6 @@ struct kd_control kd_control_triage = { .mode = KDEBUG_MODE_TRIAGE, .kdebug_events_per_storage_unit = TRIAGE_EVENTS_PER_STORAGE_UNIT, .kdebug_min_storage_units_per_cpu = TRIAGE_MIN_STORAGE_UNITS_PER_CPU, - .kdebug_kdcopybuf_count = TRIAGE_KDCOPYBUF_COUNT, - .kdebug_kdcopybuf_size = TRIAGE_KDCOPYBUF_SIZE, .kdc_flags = KDBG_DEBUGID_64, .kdc_emit = KDEMIT_DISABLE, .kdc_oldest_time = 0 @@ -45,10 +44,9 @@ struct kd_buffer kd_buffer_triage = { .kdb_region_count = 0, .kdb_info = NULL, .kd_bufs = NULL, - .kdcopybuf = NULL + .kdcopybuf = NULL, }; - static LCK_GRP_DECLARE(ktriage_grp, "ktriage"); static LCK_MTX_DECLARE(ktriage_mtx, &ktriage_grp); @@ -64,54 +62,28 @@ ktriage_unlock(void) lck_mtx_unlock(&ktriage_mtx); } -int +__startup_func +void create_buffers_triage(void) { - int error = 0; - int events_per_storage_unit, min_storage_units_per_cpu; - if (kd_control_triage.kdc_flags & KDBG_BUFINIT) { - panic("create_buffers_triage shouldn't be called once we have inited the triage system."); + panic("kdebug_triage: double-init"); } - events_per_storage_unit = kd_control_triage.kdebug_events_per_storage_unit; - min_storage_units_per_cpu = kd_control_triage.kdebug_min_storage_units_per_cpu; + uint32_t cpu_count = kdbg_cpu_count(); + kd_control_triage.kdebug_cpus = cpu_count; + kd_control_triage.alloc_cpus = cpu_count; + uint32_t storage_count = cpu_count * kd_control_triage.kdebug_min_storage_units_per_cpu; - kd_control_triage.kdebug_cpus = kdbg_cpu_count(); - kd_control_triage.alloc_cpus = kd_control_triage.kdebug_cpus; - kd_control_triage.kdc_coprocs = NULL; + kd_buffer_triage.kdb_storage_count = storage_count; + kd_buffer_triage.kdb_event_count = storage_count * kd_control_triage.kdebug_events_per_storage_unit; - if (kd_buffer_triage.kdb_event_count < (kd_control_triage.kdebug_cpus * events_per_storage_unit * min_storage_units_per_cpu)) { - kd_buffer_triage.kdb_storage_count = 
kd_control_triage.kdebug_cpus * min_storage_units_per_cpu; - } else { - kd_buffer_triage.kdb_storage_count = kd_buffer_triage.kdb_event_count / events_per_storage_unit; + int error = create_buffers(&kd_control_triage, &kd_buffer_triage, VM_KERN_MEMORY_TRIAGE); + if (error != 0) { + panic("kdebug_triage: failed to create buffers, error = %d", error); } - - kd_buffer_triage.kdb_event_count = kd_buffer_triage.kdb_storage_count * events_per_storage_unit; - - kd_buffer_triage.kd_bufs = NULL; - - error = create_buffers(&kd_control_triage, &kd_buffer_triage, VM_KERN_MEMORY_TRIAGE); - - if (!error) { - kd_control_triage.kdc_oldest_time = mach_continuous_time(); - kd_control_triage.enabled = 1; - kd_buffer_triage.kdb_storage_threshold = kd_buffer_triage.kdb_storage_count / 2; - } - - return error; -} - -__attribute__((noreturn)) -void -delete_buffers_triage(void) -{ - /* - * If create_buffers() for triage mode fails, it will call the generic delete_buffers() to - * free the resources. This specific call should never be invoked because we expect the - * triage system to always be ON. - */ - panic("delete_buffers_triage shouldn't be invoked"); + // Immediately enable triage recording. + kd_control_triage.enabled = 1; } ktriage_strings_t ktriage_subsystems_strings[KDBG_TRIAGE_SUBSYS_MAX + 1]; @@ -150,38 +122,191 @@ ktriage_convert_to_string(uint64_t debugid, uintptr_t arg, char *buf, uint32_t b return; } +static void +_write_triage_record_nopreempt(uintptr_t debugid, uintptr_t arg, uintptr_t thread_id) +{ + uint64_t now = 0; + uint32_t bindx; + kd_buf *kd; + struct kd_storage *kdsp_actual; + union kds_ptr kds_raw; + + if (!kd_control_triage.enabled) { + return; + } + int cpu = cpu_number(); + struct kd_bufinfo *info = &kd_buffer_triage.kdb_info[cpu]; + const uint32_t events_per_storage = kd_control_triage.kdebug_events_per_storage_unit; + + while (true) { + kds_raw = info->kd_list_tail; + + if (kds_raw.raw != KDS_PTR_NULL) { + kdsp_actual = POINTER_FROM_KDS_PTR(kd_buffer_triage.kd_bufs, kds_raw); + bindx = kdsp_actual->kds_bufindx; + } else { + kdsp_actual = NULL; + bindx = events_per_storage; + } + + if (kdsp_actual == NULL || bindx >= events_per_storage) { + if (kdebug_storage_alloc(&kd_control_triage, &kd_buffer_triage, cpu) == false) { + break; + } + continue; + } + + now = mach_continuous_time() & KDBG_TIMESTAMP_MASK; + if (OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) { + kd = &kdsp_actual->kds_records[bindx]; + + kd->debugid = 0; + kd->arg1 = arg; + kd->arg2 = 0; + kd->arg3 = 0; + kd->arg4 = debugid; + kd->arg5 = thread_id; + kd->timestamp = now; + + os_atomic_inc(&kdsp_actual->kds_bufcnt, release); + break; + } + } +} + void ktriage_record( uint64_t thread_id, uint64_t debugid, uintptr_t arg) { - struct kd_record kd_rec; - if (thread_id == 0) { thread_id = thread_tid(current_thread()); } + disable_preemption(); + _write_triage_record_nopreempt(debugid, arg, thread_id); + enable_preemption(); +} - kd_rec.cpu = -1; - kd_rec.timestamp = -1; +static struct kd_storage * +_find_triage_min_storage(uint64_t thread_id) +{ + uint64_t earliest_time = UINT64_MAX; + struct kd_storage *min_store = NULL; - /* - * use 64-bit debugid per our flag KDBG_DEBUGID_64 - * that is set in kd_control_triage (on LP64 only). - */ - assert(kd_control_triage.kdc_flags & KDBG_DEBUGID_64); + // Find the earliest record from all CPUs. 
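+	// For each CPU, walk its chain of storage units from kd_list_head,
+	// resume at each unit's kds_readlast, skip records whose arg5 does not
+	// match the target thread, and remember the unit holding the earliest
+	// matching timestamp across all CPUs.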
+ for (unsigned int cpu = 0; cpu < kd_control_triage.kdebug_cpus; cpu++) { + struct kd_bufinfo *info = &kd_buffer_triage.kdb_info[cpu]; + union kds_ptr store_ptr = info->kd_list_head; + if (store_ptr.raw == KDS_PTR_NULL) { + continue; + } + struct kd_storage *store = POINTER_FROM_KDS_PTR(kd_buffer_triage.kd_bufs, store_ptr); + kd_buf *found_rec = NULL; - kd_rec.debugid = 0; - kd_rec.arg4 = (uintptr_t)debugid; + while (store) { + unsigned int last_read = store->kds_readlast; + unsigned int const limit = os_atomic_load(&store->kds_bufcnt, acquire); + while (last_read < limit) { + // Skip any records that didn't come from the target thread. + kd_buf *rec = &store->kds_records[last_read]; + if (rec->arg5 == thread_id) { + found_rec = rec; + break; + } + last_read++; + } + if (found_rec) { + store->kds_readlast = last_read; + break; + } - kd_rec.arg1 = arg; - kd_rec.arg2 = 0; - kd_rec.arg3 = 0; - kd_rec.arg5 = (uintptr_t)thread_id; + store_ptr = store->kds_next; + if (store_ptr.raw == KDS_PTR_NULL) { + break; + } + store = POINTER_FROM_KDS_PTR(kd_buffer_triage.kd_bufs, store_ptr); + } - kernel_debug_write(&kd_control_triage, - &kd_buffer_triage, - kd_rec); + if (found_rec) { + uint64_t t = found_rec->timestamp; + if (t < earliest_time) { + earliest_time = t; + min_store = store; + } + } + } + return min_store; +} + +/// Copy a time-ordered series of records pertaining to the given thread to a +/// buffer. Returns the number of records written into the buffer. +/// +/// Mutual exclusion must be provided by the caller. +/// +/// This is similar to `_read_trace_records`, except for a few triage-specific +/// additions and the removal of significant complexity for handling lost +/// events, coprocessors, and direct file writing. +static size_t +_read_triage_records(kd_buf *read_buffer, + size_t max_count, + uint64_t thread_id) +{ + struct kd_bufinfo *bufinfos = kd_buffer_triage.kdb_info; + struct kd_region *region = kd_buffer_triage.kd_bufs; + + size_t avail_count = MIN(max_count, kd_buffer_triage.kdb_event_count); + size_t read_count = 0; + + if (avail_count == 0 || + !(kd_control_triage.kdc_flags & KDBG_BUFINIT)) { + return 0; + } + + // `thread_call` threads created due to corpse creation may already have the + // eager preemption bit set, so don't over-do it. + bool set_preempt = !(thread_is_eager_preempt(current_thread())); + if (set_preempt) { + thread_set_eager_preempt(current_thread()); + } + + // Prevent any writers from stealing storage units -- just drop their logs + // on the floor instead. + int intrs_en = kdebug_storage_lock(&kd_control_triage); + kd_control_triage.kdc_flags |= KDBG_NOWRAP; + kdebug_storage_unlock(&kd_control_triage, intrs_en); + + // Clear out any previous accumulated state from earlier reads, as triage + // wants to reconsider all available data. 
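+	// Specifically, every CPU's kd_prev_timebase is zeroed and every storage
+	// unit's kds_readlast is rewound to 0, so records already walked by an
+	// earlier ktriage_extract() call are considered again.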
+ for (unsigned int cpu = 0; cpu < kd_control_triage.kdebug_cpus; cpu++) { + struct kd_bufinfo *info = &bufinfos[cpu]; + info->kd_prev_timebase = 0; + union kds_ptr kdsp = info->kd_list_head; + while (kdsp.raw != KDS_PTR_NULL) { + struct kd_storage *store = POINTER_FROM_KDS_PTR(region, kdsp); + store->kds_readlast = 0; + kdsp = store->kds_next; + } + } + + while (avail_count) { + struct kd_storage *min_store = _find_triage_min_storage(thread_id); + if (min_store == NULL) { + break; + } + *read_buffer++ = min_store->kds_records[min_store->kds_readlast++]; + avail_count--; + read_count++; + } + + intrs_en = kdebug_storage_lock(&kd_control_triage); + kd_control_triage.kdc_flags &= ~KDBG_NOWRAP; + kdebug_storage_unlock(&kd_control_triage, intrs_en); + if (set_preempt) { + thread_clear_eager_preempt(current_thread()); + } + + return read_count; } void @@ -190,11 +315,9 @@ ktriage_extract( void *buf, uint32_t bufsz) { - size_t i, record_bytes, record_cnt, record_bufsz; + size_t record_cnt = 0, record_bufsz; void *record_buf; void *local_buf; - int ret; - if (thread_id == 0 || buf == NULL || bufsz < KDBG_TRIAGE_MAX_STRLEN) { return; @@ -203,43 +326,28 @@ ktriage_extract( local_buf = buf; bzero(local_buf, bufsz); - record_bytes = record_bufsz = kd_buffer_triage.kdb_event_count * sizeof(kd_buf); + record_bufsz = kd_buffer_triage.kdb_event_count * sizeof(kd_buf); record_buf = kalloc_data(record_bufsz, Z_WAITOK); - if (record_buf == NULL) { - ret = ENOMEM; + printf("kdebug_triage: failed to allocate %lu bytes for record\n", + record_bufsz); + return; } else { ktriage_lock(); - ret = kernel_debug_read(&kd_control_triage, - &kd_buffer_triage, - (user_addr_t) record_buf, &record_bytes, NULL, NULL, 0); + record_cnt = _read_triage_records(record_buf, + kd_buffer_triage.kdb_event_count, thread_id); ktriage_unlock(); } - if (ret) { - printf("ktriage_extract: kernel_debug_read failed with %d\n", ret); - kfree_data(record_buf, record_bufsz); - return; - } - - kd_buf *kd = (kd_buf*) record_buf; - i = 0; - record_cnt = record_bytes; /* kernel_debug_read() takes number of bytes that it - * converts to kd_bufs. It processes a max of those and - * returns number of kd_buf read/processed. We use a - * different variable here to make our units clear. 
- */ - - while (i < record_cnt) { - if (kd->arg5 == (uintptr_t)thread_id) { - ktriage_convert_to_string(kd->arg4, kd->arg1, local_buf, KDBG_TRIAGE_MAX_STRLEN); - local_buf = (void *)((uintptr_t)local_buf + KDBG_TRIAGE_MAX_STRLEN); - bufsz -= KDBG_TRIAGE_MAX_STRLEN; - if (bufsz < KDBG_TRIAGE_MAX_STRLEN) { - break; - } + kd_buf *kd = (kd_buf *)record_buf; + for (size_t i = 0; i < record_cnt; i++) { + assert3u(kd->arg5, ==, thread_id); + ktriage_convert_to_string(kd->arg4, kd->arg1, local_buf, KDBG_TRIAGE_MAX_STRLEN); + local_buf = (void *)((uintptr_t)local_buf + KDBG_TRIAGE_MAX_STRLEN); + bufsz -= KDBG_TRIAGE_MAX_STRLEN; + if (bufsz < KDBG_TRIAGE_MAX_STRLEN) { + break; } - i++; kd++; } @@ -346,6 +454,9 @@ const char *vm_triage_strings[] = [KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADMAP_ERROR] = "mach_vm_allocate_kernel failed due to bad map\n", [KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADSIZE_ERROR] = "mach_vm_allocate_kernel failed due to bad size\n", [KDBG_TRIAGE_VM_ALLOCATE_KERNEL_VMMAPENTER_ERROR] = "mach_vm_allocate_kernel failed within call to vm_map_enter\n", + [KDBG_TRIAGE_VM_IOPL_ON_EXEC_PAGE] = "Attempted I/O wiring of page with executable mapping\n", + [KDBG_TRIAGE_VM_EXEC_ON_IOPL_PAGE] = "Attempted executable mapping of page already wired for I/O\n", + [KDBG_TRIAGE_VM_UPL_WRITE_ON_EXEC_REGION] = "Attempted writable UPL against executable VM region\n", }; /* VM end */ diff --git a/bsd/kern/kern_aio.c b/bsd/kern/kern_aio.c index cd3d84614..951b0994f 100644 --- a/bsd/kern/kern_aio.c +++ b/bsd/kern/kern_aio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2020 Apple Inc. All rights reserved. + * Copyright (c) 2003-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,18 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -/* - * todo: - * 1) ramesh is looking into how to replace taking a reference on - * the user's map (vm_map_reference()) since it is believed that - * would not hold the process for us. - * 2) david is looking into a way for us to set the priority of the - * worker threads to match that of the user's thread when the - * async IO was queued. - */ - - /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. 
*/ @@ -46,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -72,20 +61,29 @@ #include -#include +#include +#include +#include + +#if 0 +#undef KERNEL_DEBUG +#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT +#endif + #define AIO_work_queued 1 #define AIO_worker_wake 2 #define AIO_completion_sig 3 -#define AIO_completion_cleanup_wait 4 -#define AIO_completion_cleanup_wake 5 -#define AIO_completion_suspend_wake 6 -#define AIO_fsync_delay 7 +#define AIO_completion_kevent 4 +#define AIO_completion_cleanup_wait 5 +#define AIO_completion_cleanup_wake 6 +#define AIO_completion_suspend_wake 7 #define AIO_cancel 10 #define AIO_cancel_async_workq 11 #define AIO_cancel_sync_workq 12 #define AIO_cancel_activeq 13 #define AIO_cancel_doneq 14 #define AIO_fsync 20 +#define AIO_fsync_delay 21 #define AIO_read 30 #define AIO_write 40 #define AIO_listio 50 @@ -105,6 +103,17 @@ #define AIO_suspend 110 #define AIO_suspend_sleep 111 #define AIO_worker_thread 120 +#define AIO_register_kevent 130 +#define AIO_WQ_process_entry 140 +#define AIO_WQ_aio_thread_create 141 +#define AIO_WQ_aio_thread_terminate 142 +#define AIO_WQ_aio_death_call 143 +#define AIO_WQ_aio_thread_park 144 +#define AIO_WQ_aio_select_req 145 +#define AIO_WQ_aio_thread_create_failed 146 +#define AIO_WQ_aio_thread_wakeup 147 + +static TUNABLE(uint32_t, bootarg_aio_new_workq, "aio_new_workq", 1); __options_decl(aio_entry_flags_t, uint32_t, { AIO_READ = 0x00000001, /* a read */ @@ -114,6 +123,9 @@ __options_decl(aio_entry_flags_t, uint32_t, { AIO_LIO = 0x00000010, /* lio_listio generated IO */ AIO_LIO_WAIT = 0x00000020, /* lio_listio is waiting on the leader */ + AIO_COMPLETED = 0x00000100, /* request has completed */ + AIO_CANCELLED = 0x00000200, /* request has been cancelled */ + /* * These flags mean that this entry is blocking either: * - close (AIO_CLOSE_WAIT) @@ -205,6 +217,49 @@ struct aio_anchor_cb { }; typedef struct aio_anchor_cb aio_anchor_cb; + +/* New per process workqueue */ +#define WORKQUEUE_AIO_MAXTHREADS 16 + +TAILQ_HEAD(workq_aio_uthread_head, uthread); + +typedef struct workq_aio_s { + thread_call_t wa_death_call; + struct workq_aio_uthread_head wa_thrunlist; + struct workq_aio_uthread_head wa_thidlelist; + TAILQ_HEAD(, aio_workq_entry) wa_aioq_entries; + proc_t wa_proc; + workq_state_flags_t _Atomic wa_flags; + uint16_t wa_nthreads; + uint16_t wa_thidlecount; + uint16_t wa_thdying_count; +} workq_aio_s, *workq_aio_t; + +struct aio_workq_usec_var { + uint32_t usecs; + uint64_t abstime; +}; + +static int aio_workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS; + +#define AIO_WORKQ_SYSCTL_USECS(var, init) \ + static struct aio_workq_usec_var var = { .usecs = (init) }; \ + SYSCTL_OID(_kern, OID_AUTO, var##_usecs, \ + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &(var), 0, \ + aio_workq_sysctl_handle_usecs, "I", "") + +AIO_WORKQ_SYSCTL_USECS(aio_wq_reduce_pool_window, WQ_REDUCE_POOL_WINDOW_USECS); + +#define WQ_AIO_TRACE(x, wq, a, b, c, d) \ + ({ KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, (x)),\ + proc_getpid((wq)->wa_proc), (a), (b), (c), (d)); }) + +#define WQ_AIO_TRACE_WQ(x, wq) \ + ({ KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, (x)),\ + proc_getpid((wq)->wa_proc),\ + (uintptr_t)thread_tid(current_thread()),\ + (wq)->wa_nthreads, (wq)->wa_thidlecount, (wq)->wa_thdying_count); }) + /* * Notes on aio sleep / wake channels. 
* We currently pick a couple fields within the proc structure that will allow @@ -219,6 +274,8 @@ typedef struct aio_anchor_cb aio_anchor_cb; panic("AIO on a proc list that does not belong to that proc."); \ } +extern kern_return_t thread_terminate(thread_t); + /* * LOCAL PROTOTYPES */ @@ -235,7 +292,7 @@ static void aio_workq_remove_entry_locked(aio_workq_t queue, aio_wor static void aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp); static void aio_entry_ref(aio_workq_entry *entryp); static void aio_entry_unref(aio_workq_entry *entryp); -static bool aio_entry_try_workq_remove(aio_workq_entry *entryp); +static bool aio_entry_try_workq_remove(proc_t p, aio_workq_entry *entryp); static boolean_t aio_delay_fsync_request(aio_workq_entry *entryp); static void aio_free_request(aio_workq_entry *entryp); @@ -251,7 +308,7 @@ static int aio_queue_async_request(proc_t procp, user_addr_t aiocbp static int aio_validate(proc_t, aio_workq_entry *entryp); static int do_aio_cancel_locked(proc_t p, int fd, user_addr_t aiocbp, aio_entry_flags_t); -static void do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp); +static void do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp, aio_entry_flags_t reason); static int do_aio_fsync(aio_workq_entry *entryp); static int do_aio_read(aio_workq_entry *entryp); static int do_aio_write(aio_workq_entry *entryp); @@ -260,6 +317,19 @@ static void do_munge_aiocb_user64_to_user(struct user64_aiocb *my_ai static aio_workq_entry *aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t); static int aio_copy_in_list(proc_t, user_addr_t, user_addr_t *, int); +static void workq_aio_prepare(struct proc *p); +static bool workq_aio_entry_add_locked(struct proc *p, aio_workq_entry *entryp); +static void workq_aio_wakeup_thread(proc_t p); +static void workq_aio_wakeup_thread_and_unlock(proc_t p); +static int workq_aio_process_entry(aio_workq_entry *entryp); +static bool workq_aio_entry_remove_locked(struct proc *p, aio_workq_entry *entryp); + +static void workq_aio_kill_old_threads_call(void *param0, void *param1 __unused); +static void workq_aio_unpark_continue(void *parameter __unused, wait_result_t wr); + +static void workq_aio_mark_exiting(proc_t p); +static void workq_aio_exit(proc_t p); + #define ASSERT_AIO_PROC_LOCK_OWNED(p) LCK_MTX_ASSERT(aio_proc_mutex(p), LCK_MTX_ASSERT_OWNED) #define ASSERT_AIO_WORKQ_LOCK_OWNED(q) LCK_SPIN_ASSERT(aio_workq_lock(q), LCK_ASSERT_OWNED) @@ -294,6 +364,10 @@ static LCK_GRP_DECLARE(aio_proc_lock_grp, "aio_proc"); static LCK_GRP_DECLARE(aio_queue_lock_grp, "aio_queue"); static LCK_MTX_DECLARE(aio_proc_mtx, &aio_proc_lock_grp); +static struct klist aio_klist; +static LCK_GRP_DECLARE(aio_klist_lck_grp, "aio_klist"); +static LCK_MTX_DECLARE(aio_klist_lock, &aio_klist_lck_grp); + static KALLOC_TYPE_DEFINE(aio_workq_zonep, aio_workq_entry, KT_DEFAULT); /* Hash */ @@ -333,6 +407,10 @@ aio_workq_add_entry_locked(aio_workq_t queue, aio_workq_entry *entryp) { ASSERT_AIO_WORKQ_LOCK_OWNED(queue); + if (bootarg_aio_new_workq) { + panic("old workq implementation selected with bootarg set"); + } + TAILQ_INSERT_TAIL(&queue->aioq_entries, entryp, aio_workq_link); } @@ -428,11 +506,14 @@ aio_entry_unref(aio_workq_entry *entryp) } static bool -aio_entry_try_workq_remove(aio_workq_entry *entryp) +aio_entry_try_workq_remove(proc_t p, aio_workq_entry *entryp) { /* Can only be cancelled if it's still on a work queue */ if (entryp->aio_workq_link.tqe_prev != NULL) { aio_workq_t queue; + if 
(bootarg_aio_new_workq) { + return workq_aio_entry_remove_locked(p, entryp); + } /* Will have to check again under the lock */ queue = aio_entry_workq(entryp); @@ -481,7 +562,37 @@ aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval) int result; KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_START, - VM_KERNEL_ADDRPERM(p), uap->aiocbp, 0, 0, 0); + VM_KERNEL_ADDRPERM(p), uap->fd, uap->aiocbp, 0, 0); + + if (uap->fd) { + vnode_t vp = NULLVP; + const char *vname = NULL; + + result = vnode_getfromfd(vfs_context_current(), uap->fd, &vp); + if (result != 0) { + result = EBADF; + goto ExitRoutine; + } + + vname = vnode_getname(vp); + /* + * The aio_cancel() system call will always return AIO_NOTCANCELED for + * file descriptor associated with raw disk device. + */ + if (vnode_ischr(vp) && vname && !strncmp(vname, "rdisk", 5)) { + result = 0; + *retval = AIO_NOTCANCELED; + } + + if (vname) { + vnode_putname(vname); + } + vnode_put(vp); + + if (result == 0 && *retval == AIO_NOTCANCELED) { + goto ExitRoutine; + } + } /* quick check to see if there are any async IO requests queued up */ if (!aio_has_any_work()) { @@ -538,7 +649,7 @@ aio_cancel(proc_t p, struct aio_cancel_args *uap, int *retval) ExitRoutine: KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel) | DBG_FUNC_END, - VM_KERNEL_ADDRPERM(p), uap->aiocbp, result, 0, 0); + VM_KERNEL_ADDRPERM(p), uap->fd, uap->aiocbp, result, 0); return result; } @@ -816,9 +927,13 @@ _aio_exit(proc_t p) /* quick check to see if there are any async IO requests queued up */ if (!aio_has_any_work()) { + workq_aio_mark_exiting(p); + workq_aio_exit(p); return; } + workq_aio_mark_exiting(p); + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_exit) | DBG_FUNC_START, VM_KERNEL_ADDRPERM(p), 0, 0, 0, 0); @@ -860,6 +975,8 @@ _aio_exit(proc_t p) aio_proc_unlock(p); + workq_aio_exit(p); + /* free all the entries outside of the aio_proc_lock() */ TAILQ_FOREACH_SAFE(entryp, &tofree, aio_proc_link, tmp) { entryp->aio_proc_link.tqe_prev = NULL; @@ -935,7 +1052,7 @@ again: } /* Can only be cancelled if it's still on a work queue */ - if (aio_entry_try_workq_remove(entryp)) { + if (aio_entry_try_workq_remove(p, entryp)) { entryp->errorval = ECANCELED; entryp->returnval = -1; @@ -943,7 +1060,7 @@ again: KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) | DBG_FUNC_NONE, VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), fd, 0, 0); - do_aio_completion_and_unlock(p, entryp); + do_aio_completion_and_unlock(p, entryp, AIO_CANCELLED); aio_proc_lock(p); @@ -1291,6 +1408,16 @@ aio_sigev_validate(const struct user_sigevent *sigev) case SIGEV_NONE: break; + case SIGEV_KEVENT: + /* + * The sigev_signo should contain the descriptor of the kqueue. + * Validate that it contains some sane value. 
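+	 * The descriptor itself is only resolved later, in aio_register_kevent(),
+	 * where anything that is not a kqueue fails with EBADF.
+	 *
+	 * Illustrative caller setup (a sketch only; names are placeholders):
+	 *     cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
+	 *     cb.aio_sigevent.sigev_signo = kq;              <- a kqueue() descriptor
+	 *     cb.aio_sigevent.sigev_value.sival_ptr = udata; <- delivered as kevent udata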
+ */ + if (sigev->sigev_signo <= 0 || sigev->sigev_signo > maxfilesperproc) { + return EINVAL; + } + break; + case SIGEV_THREAD: /* Unsupported [RTS] */ @@ -1313,7 +1440,7 @@ aio_sigev_validate(const struct user_sigevent *sigev) * entryp The work queue entry being queued * leader The work leader if any * - * Returns: Wether the enqueue was successful + * Returns: Whether the enqueue was successful * * Notes: This function is used for both lio_listio and aio * @@ -1327,8 +1454,6 @@ static bool aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp, aio_workq_entry *leader) { - aio_workq_t queue = aio_entry_workq(entryp); - ASSERT_AIO_PROC_LOCK_OWNED(procp); /* Onto proc queue */ @@ -1344,11 +1469,19 @@ aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp, /* And work queue */ aio_entry_ref(entryp); /* consumed in do_aio_completion_and_unlock */ - aio_workq_lock_spin(queue); - aio_workq_add_entry_locked(queue, entryp); - waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue), - THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT); - aio_workq_unlock(queue); + if (bootarg_aio_new_workq) { + if (!workq_aio_entry_add_locked(procp, entryp)) { + (void)os_ref_release(&entryp->aio_refcount); + return false; + } + } else { + aio_workq_t queue = aio_entry_workq(entryp); + aio_workq_lock_spin(queue); + aio_workq_add_entry_locked(queue, entryp); + waitq_wakeup64_one(&queue->aioq_waitq, CAST_EVENT64_T(queue), + THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT); + aio_workq_unlock(queue); + } KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_AIO, AIO_work_queued) | DBG_FUNC_START, VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp->uaiocbp), @@ -1358,6 +1491,58 @@ aio_try_enqueue_work_locked(proc_t procp, aio_workq_entry *entryp, return true; } +/* + * EV_FLAG0/1 are filter specific flags. + * Repurpose EV_FLAG0 to indicate the kevent is registered from kernel. + */ +#define EV_KERNEL EV_FLAG0 + +/* Register a kevent for AIO completion notification. */ +static int +aio_register_kevent(proc_t procp, aio_workq_entry *entryp) +{ + struct kevent_qos_s kev; + struct fileproc *fp = NULL; + kqueue_t kqu; + int kqfd = entryp->aiocb.aio_sigevent.sigev_signo; + int error; + + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_register_kevent) | DBG_FUNC_START, + VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp), + VM_KERNEL_ADDRPERM(entryp->uaiocbp), kqfd, 0); + + error = fp_get_ftype(procp, kqfd, DTYPE_KQUEUE, EBADF, &fp); + if (error) { + goto exit; + } + + kqu.kq = (struct kqueue *)fp_get_data(fp); + + memset(&kev, 0, sizeof(kev)); + kev.ident = (uintptr_t)entryp->uaiocbp; + kev.filter = EVFILT_AIO; + /* + * Set the EV_FLAG0 to indicate the event is registered from the kernel. + * This flag later is checked in filt_aioattach() and to determine if + * a kevent is registered from kernel or user-space. + */ + kev.flags = EV_ADD | EV_ENABLE | EV_CLEAR | EV_ONESHOT | EV_KERNEL; + kev.udata = entryp->aiocb.aio_sigevent.sigev_value.sival_ptr; + kev.data = (intptr_t)entryp; + + error = kevent_register(kqu.kq, &kev, NULL); + assert((error & FILTER_REGISTER_WAIT) == 0); + +exit: + if (fp) { + fp_drop(procp, kqfd, fp, 0); + } + + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_register_kevent) | DBG_FUNC_END, + VM_KERNEL_ADDRPERM(procp), VM_KERNEL_ADDRPERM(entryp), error, 0, 0); + + return error; +} /* * lio_listio - initiate a list of IO requests. 
We process the list of @@ -1432,7 +1617,8 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval __unused) * isn't submitted */ entries[lio_count++] = entryp; - if (uap->mode == LIO_NOWAIT) { + if ((uap->mode == LIO_NOWAIT) && + (entryp->aiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)) { /* Set signal hander, if any */ entryp->aiocb.aio_sigevent = aiosigev; } @@ -1455,10 +1641,20 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval __unused) aio_entry_ref(leader); /* consumed below */ } - aio_proc_lock_spin(p); + aio_proc_lock(p); for (int i = 0; i < lio_count; i++) { if (aio_try_enqueue_work_locked(p, entries[i], leader)) { + workq_aio_wakeup_thread(p); /* this may drop and reacquire the proc lock */ + /* + * For SIGEV_KEVENT, every AIO in the list would get its own kevent + * notification upon completion as opposed to SIGEV_SIGNAL which a + * single notification is deliverd when all AIOs have completed. + */ + if ((uap->mode == LIO_NOWAIT) && + (entries[i]->aiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT)) { + aio_register_kevent(p, entries[i]); + } entries[i] = NULL; /* the entry was submitted */ } else { result = EAGAIN; @@ -1564,7 +1760,7 @@ aio_work_thread(void *arg __unused, wait_result_t wr __unused) vm_map_deallocate(entryp->aio_map); entryp->aio_map = VM_MAP_NULL; - KERNEL_DEBUG(SDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_END, + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_worker_thread) | DBG_FUNC_END, VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), entryp->errorval, entryp->returnval, 0); @@ -1572,7 +1768,7 @@ aio_work_thread(void *arg __unused, wait_result_t wr __unused) /* push it on the done queue */ aio_proc_lock(p); entryp->errorval = error; - do_aio_completion_and_unlock(p, entryp); + do_aio_completion_and_unlock(p, entryp, AIO_COMPLETED); } } @@ -1622,8 +1818,8 @@ aio_get_some_work(void) if (aio_delay_fsync_request(entryp)) { /* It needs to be delayed. Put it back on the end of the work queue */ KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) | DBG_FUNC_NONE, - VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), - 0, 0, 0); + VM_KERNEL_ADDRPERM(entryp->procp), + VM_KERNEL_ADDRPERM(entryp->uaiocbp), 0, 0, 0); aio_proc_unlock(entryp->procp); @@ -1701,14 +1897,19 @@ aio_create_queue_entry(proc_t procp, user_addr_t aiocbp, aio_entry_flags_t flags goto error_exit; } - /* get a reference to the user land map in order to keep it around */ - entryp->aio_map = get_task_map(proc_task(procp)); - vm_map_reference(entryp->aio_map); - /* get a reference on the current_thread, which is passed in vfs_context. 
*/ entryp->context = *vfs_context_current(); thread_reference(entryp->context.vc_thread); kauth_cred_ref(entryp->context.vc_ucred); + + if (bootarg_aio_new_workq) { + entryp->aio_map = VM_MAP_NULL; + workq_aio_prepare(procp); + } else { + /* get a reference to the user land map in order to keep it around */ + entryp->aio_map = get_task_map(proc_task(procp)); + vm_map_reference(entryp->aio_map); + } return entryp; error_exit: @@ -1736,12 +1937,18 @@ aio_queue_async_request(proc_t procp, user_addr_t aiocbp, goto error_noalloc; } - aio_proc_lock_spin(procp); + aio_proc_lock(procp); if (!aio_try_enqueue_work_locked(procp, entryp, NULL)) { result = EAGAIN; goto error_exit; } - aio_proc_unlock(procp); + + if ((entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) && + (aio_register_kevent(procp, entryp) != 0)) { + result = EAGAIN; + goto error_exit; + } + workq_aio_wakeup_thread_and_unlock(procp); return 0; error_exit: @@ -1854,15 +2061,17 @@ aio_validate(proc_t p, aio_workq_entry *entryp) * do_aio_completion_and_unlock. Handle async IO completion. */ static void -do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp) +do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp, + aio_entry_flags_t reason) { aio_workq_entry *leader = entryp->lio_leader; int lio_pending = 0; - bool do_signal = false; + bool do_signal, do_kevent; ASSERT_AIO_PROC_LOCK_OWNED(p); aio_proc_move_done_locked(p, entryp); + entryp->flags |= reason; if (leader) { lio_pending = --leader->lio_pending; @@ -1884,6 +2093,7 @@ do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp) * If there are some still active then do nothing - we only want to * wakeup when all active aio requests for the process are complete. */ + do_signal = do_kevent = false; if (__improbable(entryp->flags & AIO_EXIT_WAIT)) { KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_cleanup_wait) | DBG_FUNC_NONE, VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), @@ -1892,7 +2102,7 @@ do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp) if (!aio_has_active_requests_for_process(p)) { /* * no active aio requests for this process, continue exiting. In this - * case, there should be no one else waiting ont he proc in AIO... + * case, there should be no one else waiting on the proc in AIO... */ wakeup_one((caddr_t)&p->AIO_CLEANUP_SLEEP_CHAN); @@ -1906,6 +2116,12 @@ do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp) * a group, and that a signal is desired, send one. */ do_signal = (lio_pending == 0); + } else if (entryp->aiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) { + /* + * For SIGEV_KEVENT, every AIO (even it is part of a group) would get + * a kevent notification. + */ + do_kevent = true; } if (__improbable(entryp->flags & AIO_CLOSE_WAIT)) { @@ -1931,6 +2147,15 @@ do_aio_completion_and_unlock(proc_t p, aio_workq_entry *entryp) entryp->aiocb.aio_sigevent.sigev_signo, 0, 0); psignal(p, entryp->aiocb.aio_sigevent.sigev_signo); + } else if (do_kevent) { + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_completion_kevent) | DBG_FUNC_NONE, + VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), + entryp->aiocb.aio_sigevent.sigev_signo, 0, 0); + + /* We only support one event type for either completed/cancelled AIO. 
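+		 * KNOTE() below is a broadcast over the single global aio_klist;
+		 * filt_aioevent() only activates knotes whose aio_workq_entry has
+		 * AIO_COMPLETED set, and filt_aioprocess() then reports errorval and
+		 * returnval through ext[0]/ext[1] of the delivered kevent.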
*/ + lck_mtx_lock(&aio_klist_lock); + KNOTE(&aio_klist, 1); + lck_mtx_unlock(&aio_klist_lock); } /* @@ -2144,7 +2369,16 @@ aio_init(void) aio_workq_init(&aio_anchor.aio_async_workqs[i]); } - _aio_create_worker_threads(aio_worker_threads); + if (bootarg_aio_new_workq) { + printf("New aio workqueue implementation selected\n"); + } else { + _aio_create_worker_threads(aio_worker_threads); + } + + klist_init(&aio_klist); + + clock_interval_to_absolutetime_interval(aio_wq_reduce_pool_window.usecs, + NSEC_PER_USEC, &aio_wq_reduce_pool_window.abstime); } @@ -2203,8 +2437,8 @@ do_munge_aiocb_user32_to_user(struct user32_aiocb *my_aiocbp, struct user_aiocb //LP64 the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify; the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo; - the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int = - my_aiocbp->aio_sigevent.sigev_value.sival_int; + the_user_aiocbp->aio_sigevent.sigev_value.sival_ptr = + my_aiocbp->aio_sigevent.sigev_value.sival_ptr; the_user_aiocbp->aio_sigevent.sigev_notify_function = CAST_USER_ADDR_T(my_aiocbp->aio_sigevent.sigev_notify_function); the_user_aiocbp->aio_sigevent.sigev_notify_attributes = @@ -2230,8 +2464,8 @@ do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb the_user_aiocbp->aio_sigevent.sigev_notify = my_aiocbp->aio_sigevent.sigev_notify; the_user_aiocbp->aio_sigevent.sigev_signo = my_aiocbp->aio_sigevent.sigev_signo; - the_user_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int = - my_aiocbp->aio_sigevent.sigev_value.size_equivalent.sival_int; + the_user_aiocbp->aio_sigevent.sigev_value.sival_ptr = + my_aiocbp->aio_sigevent.sigev_value.sival_ptr; the_user_aiocbp->aio_sigevent.sigev_notify_function = my_aiocbp->aio_sigevent.sigev_notify_function; the_user_aiocbp->aio_sigevent.sigev_notify_attributes = @@ -2241,3 +2475,923 @@ do_munge_aiocb_user64_to_user(struct user64_aiocb *my_aiocbp, struct user_aiocb panic("64bit process on 32bit kernel is not supported"); #endif } + + +static int +filt_aioattach(struct knote *kn, struct kevent_qos_s *kev) +{ + aio_workq_entry *entryp = (aio_workq_entry *)kev->data; + + /* Don't allow kevent registration from the user-space. */ + if ((kev->flags & EV_KERNEL) == 0) { + return EPERM; + } + + kev->flags &= ~EV_KERNEL; + /* Clear the 'kn_fflags' state afte the knote has been processed. */ + kn->kn_flags |= EV_CLEAR; + + /* Associate the knote with the AIO work. */ + knote_kn_hook_set_raw(kn, (void *)entryp); + + lck_mtx_lock(&aio_klist_lock); + KNOTE_ATTACH(&aio_klist, kn); + lck_mtx_unlock(&aio_klist_lock); + + return 0; +} + +static void +filt_aiodetach(struct knote *kn) +{ + lck_mtx_lock(&aio_klist_lock); + KNOTE_DETACH(&aio_klist, kn); + lck_mtx_unlock(&aio_klist_lock); +} + +/* + * The 'f_event' is called with 'aio_klist_lock' held when KNOTE() was called + * in do_aio_completion_and_unlock(). + */ +static int +filt_aioevent(struct knote *kn, long hint) +{ + aio_workq_entry *entryp; + int activate = 0; + + /* + * The 'f_event' and 'f_process' can run concurrently so it is possible + * the aio_workq_entry has been detached from the knote when the + * filt_aioprocess() was called earlier. In this case, we will skip + * activating the event. + */ + entryp = knote_kn_hook_get_raw(kn); + if (__improbable(entryp == NULL)) { + goto out; + } + + /* We can only activate the filter if the AIO work has completed. 
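+	 * Requests torn down with AIO_CANCELLED never gain AIO_COMPLETED, so
+	 * their knotes are left inactive here.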
*/ + if (entryp->flags & AIO_COMPLETED) { + kn->kn_fflags |= hint; + activate = FILTER_ACTIVE; + } + +out: + return activate; +} + +static int +filt_aiotouch(struct knote *kn, struct kevent_qos_s *kev) +{ + panic("%s: kn %p kev %p (NOT EXPECTED TO BE CALLED!!)", __func__, kn, kev); +} + +static int +filt_aioprocess(struct knote *kn, struct kevent_qos_s *kev) +{ + aio_workq_entry *entryp; + proc_t p; + int res = 0; + + entryp = knote_kn_hook_get_raw(kn); + assert(entryp); + p = entryp->procp; + + lck_mtx_lock(&aio_klist_lock); + if (kn->kn_fflags) { + /* Propagate the error status and return value back to the user. */ + kn->kn_ext[0] = entryp->errorval; + kn->kn_ext[1] = entryp->returnval; + knote_fill_kevent(kn, kev, 0); + knote_kn_hook_set_raw(kn, NULL); + + aio_proc_lock(p); + aio_proc_remove_done_locked(p, entryp); + aio_proc_unlock(p); + aio_entry_unref(entryp); + + res = FILTER_ACTIVE; + } + lck_mtx_unlock(&aio_klist_lock); + + return res; +} + +SECURITY_READ_ONLY_EARLY(struct filterops) aio_filtops = { + .f_isfd = 0, + .f_attach = filt_aioattach, + .f_detach = filt_aiodetach, + .f_event = filt_aioevent, + .f_touch = filt_aiotouch, + .f_process = filt_aioprocess, +}; + +#pragma mark per process aio workqueue + +/* + * The per process workq threads call this function to handle the aio request. The threads + * belong to the same process so we don't need to change the vm maps as we would for kernel + * threads. + */ +static int +workq_aio_process_entry(aio_workq_entry *entryp) +{ + proc_t p = entryp->procp; + int error; + + assert(current_proc() == p && current_thread() != vfs_context_thread(&entryp->context)); + + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_WQ_process_entry) | DBG_FUNC_START, + VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), + entryp->flags, 0, 0); + + if ((entryp->flags & AIO_READ) != 0) { + error = do_aio_read(entryp); + } else if ((entryp->flags & AIO_WRITE) != 0) { + error = do_aio_write(entryp); + } else if ((entryp->flags & (AIO_FSYNC | AIO_DSYNC)) != 0) { + if ((entryp->flags & AIO_FSYNC) != 0) { + /* + * Check for unfinished operations on the same file + * in this proc's queue. + */ + aio_proc_lock_spin(p); + if (aio_delay_fsync_request(entryp)) { + /* It needs to be delayed. Put it back on the end of the work queue */ + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_fsync_delay) | DBG_FUNC_NONE, + VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), + 0, 0, 0); + + /* The references on this entry havn't been consumed */ + if (!workq_aio_entry_add_locked(p, entryp)) { + entryp->errorval = ECANCELED; + entryp->returnval = -1; + + /* Now it's officially cancelled. 
Do the completion */ + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_cancel_async_workq) | DBG_FUNC_NONE, + VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), + entryp->aiocb.aio_fildes, 0, 0); + + do_aio_completion_and_unlock(p, entryp, AIO_CANCELLED); + } else { + workq_aio_wakeup_thread_and_unlock(p); + } + return 0; + } + aio_proc_unlock(entryp->procp); + } + error = do_aio_fsync(entryp); + } else { + error = EINVAL; + } + + KERNEL_DEBUG(BSDDBG_CODE(DBG_BSD_AIO, AIO_WQ_process_entry) | DBG_FUNC_END, + VM_KERNEL_ADDRPERM(p), VM_KERNEL_ADDRPERM(entryp->uaiocbp), + entryp->errorval, entryp->returnval, 0); + + /* we're done with the IO request so pop it off the active queue and */ + /* push it on the done queue */ + aio_proc_lock(p); + entryp->errorval = error; + do_aio_completion_and_unlock(p, entryp, AIO_COMPLETED); + return 0; +} + +/* + * The functions below implement a workqueue for aio which is taken from the + * workqueue implementation for libdispatch/pthreads. They are stripped down versions + * of the corresponding functions for libdispatch/pthreads. + */ + +static int +aio_workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg2) + struct aio_workq_usec_var *v = arg1; + int error = sysctl_handle_int(oidp, &v->usecs, 0, req); + if (error || !req->newptr) { + return error; + } + clock_interval_to_absolutetime_interval(v->usecs, NSEC_PER_USEC, + &v->abstime); + return 0; +} + +#pragma mark wq_flags + +#define AIO_WQ_DEAD 0x1000 + +static inline uint32_t +_wa_flags(workq_aio_t wq_aio) +{ + return os_atomic_load(&wq_aio->wa_flags, relaxed); +} + +static inline bool +_wq_exiting(workq_aio_t wq_aio) +{ + return _wa_flags(wq_aio) & WQ_EXITING; +} + +static inline bool +_wq_dead(workq_aio_t wq_aio) +{ + return _wa_flags(wq_aio) & AIO_WQ_DEAD; +} + +#define AIO_WQPTR_IS_INITING_VALUE ((workq_aio_t)~(uintptr_t)0) + +static workq_aio_t +proc_get_aio_wqptr_fast(struct proc *p) +{ + return os_atomic_load(&p->p_aio_wqptr, relaxed); +} + +static workq_aio_t +proc_get_aio_wqptr(struct proc *p) +{ + workq_aio_t wq_aio = proc_get_aio_wqptr_fast(p); + return wq_aio == AIO_WQPTR_IS_INITING_VALUE ? NULL : wq_aio; +} + +static void +proc_set_aio_wqptr(struct proc *p, workq_aio_t wq_aio) +{ + wq_aio = os_atomic_xchg(&p->p_aio_wqptr, wq_aio, release); + if (wq_aio == AIO_WQPTR_IS_INITING_VALUE) { + proc_lock(p); + thread_wakeup(&p->p_aio_wqptr); + proc_unlock(p); + } +} + +static bool +proc_init_aio_wqptr_or_wait(struct proc *p) +{ + workq_aio_t wq_aio; + + proc_lock(p); + wq_aio = os_atomic_load(&p->p_aio_wqptr, relaxed); + + if (wq_aio == NULL) { + os_atomic_store(&p->p_aio_wqptr, AIO_WQPTR_IS_INITING_VALUE, relaxed); + proc_unlock(p); + return true; + } + + if (wq_aio == AIO_WQPTR_IS_INITING_VALUE) { + assert_wait(&p->p_aio_wqptr, THREAD_UNINT); + proc_unlock(p); + thread_block(THREAD_CONTINUE_NULL); + } else { + proc_unlock(p); + } + return false; +} + +static inline event_t +workq_aio_parked_wait_event(struct uthread *uth) +{ + return (event_t)&uth->uu_workq_stackaddr; +} + +static inline void +workq_aio_thread_wakeup(struct uthread *uth) +{ + thread_wakeup_thread(workq_aio_parked_wait_event(uth), get_machthread(uth)); +} + +/* + * Routine: workq_aio_mark_exiting + * + * Function: Mark the work queue such that new threads will not be added to the + * work queue after we return. + * + * Conditions: Called against the current process. 
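+ *
+ * Note: this only sets WQ_EXITING (and opportunistically cancels a
+ * scheduled death call); thread teardown and freeing of the workqueue
+ * structure happen later, in workq_aio_exit().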
+ */ +static void +workq_aio_mark_exiting(proc_t p) +{ + workq_aio_t wq_aio = proc_get_aio_wqptr(p); + uint32_t wq_flags; + + if (!wq_aio) { + return; + } + + wq_flags = os_atomic_or_orig(&wq_aio->wa_flags, WQ_EXITING, relaxed); + if (__improbable(wq_flags & WQ_EXITING)) { + panic("workq_aio_mark_exiting_locked called twice"); + } + + /* + * Opportunistically try to cancel thread calls that are likely in flight. + * workq_aio_exit() will do the proper cleanup. + */ + if (wq_flags & WQ_DEATH_CALL_SCHEDULED) { + thread_call_cancel(wq_aio->wa_death_call); + } +} + +static void +workq_aio_exit(proc_t p) +{ + workq_aio_t wq_aio; + + wq_aio = os_atomic_xchg(&p->p_aio_wqptr, NULL, release); + + if (!wq_aio) { + return; + } + + /* + * Thread calls are always scheduled by the proc itself or under the + * workqueue spinlock if WQ_EXITING is not yet set. + * + * Either way, when this runs, the proc has no threads left beside + * the one running this very code, so we know no thread call can be + * dispatched anymore. + */ + + thread_call_cancel_wait(wq_aio->wa_death_call); + thread_call_free(wq_aio->wa_death_call); + + /* + * Clean up workqueue data structures for threads that exited and + * didn't get a chance to clean up after themselves. + * + * idle/new threads should have been interrupted and died on their own + */ + + assert(TAILQ_EMPTY(&wq_aio->wa_aioq_entries)); + assert(TAILQ_EMPTY(&wq_aio->wa_thrunlist)); + + if (wq_aio->wa_nthreads) { + os_atomic_or(&wq_aio->wa_flags, AIO_WQ_DEAD, relaxed); + aio_proc_lock_spin(p); + if (wq_aio->wa_nthreads) { + struct uthread *uth; + + TAILQ_FOREACH(uth, &wq_aio->wa_thidlelist, uu_workq_entry) { + if (uth->uu_workq_flags & UT_WORKQ_DYING) { + workq_aio_thread_wakeup(uth); + continue; + } + wq_aio->wa_thdying_count++; + uth->uu_workq_flags |= UT_WORKQ_DYING; + workq_aio_thread_wakeup(uth); + } + while (wq_aio->wa_nthreads) { + msleep(&wq_aio->wa_nthreads, aio_proc_mutex(p), PRIBIO | PSPIN, "aio_workq_exit", 0); + } + } + aio_proc_unlock(p); + } + + assertf(TAILQ_EMPTY(&wq_aio->wa_thidlelist), + "wa_thidlecount = %d, wa_thdying_count = %d", + wq_aio->wa_thidlecount, wq_aio->wa_thdying_count); + assertf(wq_aio->wa_thidlecount == 0, + "wa_thidlecount = %d, wa_thdying_count = %d", + wq_aio->wa_thidlecount, wq_aio->wa_thdying_count); + assertf(wq_aio->wa_thdying_count == 0, + "wa_thdying_count = %d", wq_aio->wa_thdying_count); + + kfree_type(workq_aio_s, wq_aio); +} + +static int +workq_aio_open(struct proc *p) +{ + workq_aio_t wq_aio; + int error = 0; + + if (proc_get_aio_wqptr(p) == NULL) { + if (proc_init_aio_wqptr_or_wait(p) == FALSE) { + assert(proc_get_aio_wqptr(p) != NULL); + goto out; + } + + wq_aio = kalloc_type(workq_aio_s, Z_WAITOK | Z_ZERO); + + wq_aio->wa_proc = p; + + TAILQ_INIT(&wq_aio->wa_thidlelist); + TAILQ_INIT(&wq_aio->wa_thrunlist); + TAILQ_INIT(&wq_aio->wa_aioq_entries); + + wq_aio->wa_death_call = thread_call_allocate_with_options( + workq_aio_kill_old_threads_call, wq_aio, + THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE); + + proc_set_aio_wqptr(p, wq_aio); + } +out: + return error; +} + +#pragma mark aio workqueue idle thread accounting + +static inline struct uthread * +workq_oldest_killable_idle_aio_thread(workq_aio_t wq_aio) +{ + return TAILQ_LAST(&wq_aio->wa_thidlelist, workq_aio_uthread_head); +} + +static inline uint64_t +workq_kill_delay_for_idle_aio_thread() +{ + return aio_wq_reduce_pool_window.abstime; +} + +static inline bool +workq_should_kill_idle_aio_thread(struct uthread *uth, uint64_t now) +{ + uint64_t delay = 
workq_kill_delay_for_idle_aio_thread(); + return now - uth->uu_save.uus_workq_park_data.idle_stamp > delay; +} + +static void +workq_aio_death_call_schedule(workq_aio_t wq_aio, uint64_t deadline) +{ + uint32_t wa_flags = os_atomic_load(&wq_aio->wa_flags, relaxed); + + if (wa_flags & (WQ_EXITING | WQ_DEATH_CALL_SCHEDULED)) { + return; + } + os_atomic_or(&wq_aio->wa_flags, WQ_DEATH_CALL_SCHEDULED, relaxed); + + /* + * Due to how long term timers work, the leeway + * can't be too short, so use 500ms which is long enough that we will not + * wake up the CPU for killing threads, but short enough that it doesn't + * fall into long-term timer list shenanigans. + */ + thread_call_enter_delayed_with_leeway(wq_aio->wa_death_call, NULL, deadline, + aio_wq_reduce_pool_window.abstime / 10, + THREAD_CALL_DELAY_LEEWAY | THREAD_CALL_DELAY_USER_BACKGROUND); +} + +/* + * `decrement` is set to the number of threads that are no longer dying: + * - because they have been resuscitated just in time (workq_pop_idle_thread) + * - or have been killed (workq_thread_terminate). + */ +static void +workq_aio_death_policy_evaluate(workq_aio_t wq_aio, uint16_t decrement) +{ + struct uthread *uth; + + assert(wq_aio->wa_thdying_count >= decrement); +#if 0 + if (decrement) { + printf("VV_DEBUG_AIO : %s:%d : pid = %d, ctid = %d, after decrement thdying_count = %d\n", + __FUNCTION__, __LINE__, proc_pid(current_proc()), thread_get_ctid(thr), + wq_aio->wa_thdying_count - decrement); + } +#endif + + if ((wq_aio->wa_thdying_count -= decrement) > 0) { + return; + } + + if (wq_aio->wa_thidlecount <= 1) { + return; + } + + if (((uth = workq_oldest_killable_idle_aio_thread(wq_aio)) == NULL)) { + return; + } + + uint64_t now = mach_absolute_time(); + uint64_t delay = workq_kill_delay_for_idle_aio_thread(); + + if (now - uth->uu_save.uus_workq_park_data.idle_stamp > delay) { + if (!(uth->uu_workq_flags & UT_WORKQ_DYING)) { + wq_aio->wa_thdying_count++; + uth->uu_workq_flags |= UT_WORKQ_DYING; + } + workq_aio_thread_wakeup(uth); + return; + } + + workq_aio_death_call_schedule(wq_aio, + uth->uu_save.uus_workq_park_data.idle_stamp + delay); +} + +static void +workq_aio_kill_old_threads_call(void *param0, void *param1 __unused) +{ + workq_aio_t wq_aio = param0; + + aio_proc_lock_spin(wq_aio->wa_proc); + WQ_AIO_TRACE_WQ(AIO_WQ_aio_death_call | DBG_FUNC_START, wq_aio); + os_atomic_andnot(&wq_aio->wa_flags, WQ_DEATH_CALL_SCHEDULED, relaxed); + workq_aio_death_policy_evaluate(wq_aio, 0); + WQ_AIO_TRACE_WQ(AIO_WQ_aio_death_call | DBG_FUNC_END, wq_aio); + aio_proc_unlock(wq_aio->wa_proc);; +} + +#define WORKQ_UNPARK_FOR_DEATH_WAS_IDLE 0x1 +#define WQ_SETUP_NONE 0 + +__attribute__((noreturn, noinline)) +static void +workq_aio_unpark_for_death_and_unlock(proc_t p, workq_aio_t wq_aio, + struct uthread *uth, uint32_t death_flags, __unused uint32_t setup_flags) +{ + if (death_flags & WORKQ_UNPARK_FOR_DEATH_WAS_IDLE) { + wq_aio->wa_thidlecount--; + TAILQ_REMOVE(&wq_aio->wa_thidlelist, uth, uu_workq_entry); + } + + if (uth->uu_workq_flags & UT_WORKQ_DYING) { + wq_aio->wa_thdying_count--; + } + assert(wq_aio->wa_nthreads > 0); + wq_aio->wa_nthreads--; + + WQ_AIO_TRACE_WQ(AIO_WQ_aio_thread_terminate | DBG_FUNC_NONE, wq_aio); + + if (_wq_dead(wq_aio) && (wq_aio->wa_nthreads == 0)) { + wakeup(&wq_aio->wa_nthreads); + } + + aio_proc_unlock(p); + + thread_t th = get_machthread(uth); + assert(th == current_thread()); + + thread_deallocate(th); + thread_terminate(th); + thread_exception_return(); + __builtin_unreachable(); +} + +static void 
+workq_push_idle_aio_thread(proc_t p, workq_aio_t wq_aio, struct uthread *uth, + uint32_t setup_flags) +{ + uint64_t now = mach_absolute_time(); + + uth->uu_workq_flags &= ~(UT_WORKQ_RUNNING); + TAILQ_REMOVE(&wq_aio->wa_thrunlist, uth, uu_workq_entry); + + uth->uu_save.uus_workq_park_data.idle_stamp = now; + + struct uthread *oldest = workq_oldest_killable_idle_aio_thread(wq_aio); + uint16_t cur_idle = wq_aio->wa_thidlecount; + + if (_wq_exiting(wq_aio) || (wq_aio->wa_thdying_count == 0 && oldest && + workq_should_kill_idle_aio_thread(oldest, now))) { + /* + * Immediately kill threads if we have too may of them. + * + * And swap "place" with the oldest one we'd have woken up. + * This is a relatively desperate situation where we really + * need to kill threads quickly and it's best to kill + * the one that's currently on core than context switching. + */ + if (oldest) { + oldest->uu_save.uus_workq_park_data.idle_stamp = now; + TAILQ_REMOVE(&wq_aio->wa_thidlelist, oldest, uu_workq_entry); + TAILQ_INSERT_HEAD(&wq_aio->wa_thidlelist, oldest, uu_workq_entry); + } + + if (!(uth->uu_workq_flags & UT_WORKQ_DYING)) { + wq_aio->wa_thdying_count++; + uth->uu_workq_flags |= UT_WORKQ_DYING; + } + workq_aio_unpark_for_death_and_unlock(p, wq_aio, uth, 0, setup_flags); + __builtin_unreachable(); + } + + struct uthread *tail = TAILQ_LAST(&wq_aio->wa_thidlelist, workq_aio_uthread_head); + + cur_idle += 1; + wq_aio->wa_thidlecount = cur_idle; + uth->uu_save.uus_workq_park_data.has_stack = false; + TAILQ_INSERT_HEAD(&wq_aio->wa_thidlelist, uth, uu_workq_entry); + + if (!tail) { + uint64_t delay = workq_kill_delay_for_idle_aio_thread(); + workq_aio_death_call_schedule(wq_aio, now + delay); + } +} + +/* + * We have no work to do, park ourselves on the idle list. + * + * Consumes the workqueue lock and does not return. + */ +__attribute__((noreturn, noinline)) +static void +workq_aio_park_and_unlock(proc_t p, workq_aio_t wq_aio, struct uthread *uth, + uint32_t setup_flags) +{ + assert(uth == current_uthread()); + assert(uth->uu_kqr_bound == NULL); + + workq_push_idle_aio_thread(p, wq_aio, uth, setup_flags); // may not return + + if (uth->uu_workq_flags & UT_WORKQ_DYING) { + workq_aio_unpark_for_death_and_unlock(p, wq_aio, uth, + WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, setup_flags); + __builtin_unreachable(); + } + + WQ_AIO_TRACE_WQ(AIO_WQ_aio_thread_park | DBG_FUNC_NONE, wq_aio); + + thread_set_pending_block_hint(get_machthread(uth), kThreadWaitParkedWorkQueue); + /* XXX this should probably be THREAD_UNINT */ + assert_wait(workq_aio_parked_wait_event(uth), THREAD_INTERRUPTIBLE); + aio_proc_unlock(p); + thread_block(workq_aio_unpark_continue); + __builtin_unreachable(); +} + +#define WORKQ_POLICY_INIT(qos) \ + (struct uu_workq_policy){ .qos_req = (qos), .qos_bucket = (qos) } + +/* + * This function is always called with the workq lock. 
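+ * It mirrors the effective QoS of the thread that issued the request
+ * (src_th) onto the worker via thread_set_workq_pri(), so the request is
+ * serviced at roughly its issuer's priority.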
+ */ +static void +workq_aio_thread_reset_pri(struct uthread *uth, thread_t src_th) +{ + thread_t th = get_machthread(uth); + thread_qos_t qos = (thread_qos_t)proc_get_effective_thread_policy(src_th, TASK_POLICY_QOS); + int priority = 31; + int policy = POLICY_TIMESHARE; + + uth->uu_workq_pri = WORKQ_POLICY_INIT(qos); + thread_set_workq_pri(th, qos, priority, policy); +} + +static inline void +workq_aio_thread_set_type(struct uthread *uth, uint16_t flags) +{ + uth->uu_workq_flags &= ~(UT_WORKQ_OVERCOMMIT | UT_WORKQ_COOPERATIVE); + uth->uu_workq_flags |= flags; +} + +__attribute__((noreturn, noinline)) +static void +workq_aio_unpark_select_req_or_park_and_unlock(proc_t p, workq_aio_t wq_aio, + struct uthread *uth, uint32_t setup_flags) +{ + aio_workq_entry *entryp; + thread_t last_thread = NULL; + + WQ_AIO_TRACE_WQ(AIO_WQ_aio_select_req | DBG_FUNC_START, wq_aio); + thread_freeze_base_pri(get_machthread(uth)); + workq_aio_thread_set_type(uth, 0); + while ((entryp = TAILQ_FIRST(&wq_aio->wa_aioq_entries))) { + if (__improbable(_wq_exiting(wq_aio))) { + break; + } + + TAILQ_REMOVE(&wq_aio->wa_aioq_entries, entryp, aio_workq_link); + entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */ + + aio_proc_unlock(p); + + thread_t thr = vfs_context_thread(&entryp->context); + if (last_thread != thr) { + workq_aio_thread_reset_pri(uth, thr); + last_thread = thr; + } + + /* this frees references to workq entry */ + workq_aio_process_entry(entryp); + + aio_proc_lock_spin(p); + } + WQ_AIO_TRACE_WQ(AIO_WQ_aio_select_req | DBG_FUNC_END, wq_aio); + thread_unfreeze_base_pri(get_machthread(uth)); + workq_aio_park_and_unlock(p, wq_aio, uth, setup_flags); +} + +/* + * parked idle thread wakes up + */ +__attribute__((noreturn, noinline)) +static void +workq_aio_unpark_continue(void *parameter __unused, wait_result_t wr) +{ + thread_t th = current_thread(); + struct uthread *uth = get_bsdthread_info(th); + proc_t p = current_proc(); + workq_aio_t wq_aio = proc_get_aio_wqptr_fast(p); + + aio_proc_lock_spin(p); + + if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) { + workq_aio_unpark_select_req_or_park_and_unlock(p, wq_aio, uth, WQ_SETUP_NONE); + __builtin_unreachable(); + } + + if (__probable(wr == THREAD_AWAKENED)) { + /* + * We were set running, but for the purposes of dying. + */ + assert(uth->uu_workq_flags & UT_WORKQ_DYING); + assert((uth->uu_workq_flags & UT_WORKQ_NEW) == 0); + } else { + /* + * workaround for , + * in case we do hit userspace, make sure calling + * workq_thread_terminate() does the right thing here, + * and if we never call it, that workq_exit() will too because it sees + * this thread on the runlist. + */ + assert(wr == THREAD_INTERRUPTED); + + if (!(uth->uu_workq_flags & UT_WORKQ_DYING)) { + wq_aio->wa_thdying_count++; + uth->uu_workq_flags |= UT_WORKQ_DYING; + } + } + + workq_aio_unpark_for_death_and_unlock(p, wq_aio, uth, + WORKQ_UNPARK_FOR_DEATH_WAS_IDLE, WQ_SETUP_NONE); + + __builtin_unreachable(); +} + +/* + * Called by thread_create_workq_aio_waiting() during thread initialization, before + * assert_wait, before the thread has been started. 
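+ * The event returned here is the thread's park event; the thread is later
+ * woken on it by workq_aio_thread_wakeup() once it is selected to run
+ * queued requests or to die.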
+ */ +event_t +aio_workq_thread_init_and_wq_lock(task_t task, thread_t th) +{ + struct uthread *uth = get_bsdthread_info(th); + + uth->uu_workq_flags = UT_WORKQ_NEW; + uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY); + uth->uu_workq_thport = MACH_PORT_NULL; + uth->uu_workq_stackaddr = 0; + uth->uu_workq_pthread_kill_allowed = 0; + + thread_set_tag(th, THREAD_TAG_AIO_WORKQUEUE); + thread_reset_workq_qos(th, THREAD_QOS_LEGACY); + + aio_proc_lock(get_bsdtask_info(task)); + return workq_aio_parked_wait_event(uth); +} + +/** + * Try to add a new workqueue thread for aio. + * + * - called with workq lock held + * - dropped and retaken around thread creation + * - return with workq lock held + * - aio threads do not call into pthread functions to setup or destroy stacks. + */ +static kern_return_t +workq_aio_add_new_thread(proc_t p, workq_aio_t wq_aio) +{ + kern_return_t kret; + thread_t th; + + wq_aio->wa_nthreads++; + + aio_proc_unlock(p); + + kret = thread_create_aio_workq_waiting(proc_task(p), + workq_aio_unpark_continue, + &th); + + if (kret != KERN_SUCCESS) { + WQ_AIO_TRACE(AIO_WQ_aio_thread_create_failed | DBG_FUNC_NONE, wq_aio, + kret, 0, 0, 0); + goto out; + } + + /* + * thread_create_aio_workq_waiting() will return with the wq lock held + * on success, because it calls workq_thread_init_and_wq_lock(). + */ + struct uthread *uth = get_bsdthread_info(th); + TAILQ_INSERT_TAIL(&wq_aio->wa_thidlelist, uth, uu_workq_entry); + wq_aio->wa_thidlecount++; + uth->uu_workq_flags &= ~UT_WORKQ_NEW; + WQ_AIO_TRACE_WQ(AIO_WQ_aio_thread_create | DBG_FUNC_NONE, wq_aio); + return kret; + +out: + aio_proc_lock(p); + /* + * Do not redrive here if we went under wq_max_threads again, + * it is the responsibility of the callers of this function + * to do so when it fails. 
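+ *
+ * (Editor's note: illustrative sketch only, not part of this change; all
+ * names below are taken from this patch.) The expected "redrive" in a
+ * caller is to retry while it is still under the thread limit. A
+ * simplified form of the loop in workq_aio_wakeup_thread_internal() below:
+ *
+ *     while (!TAILQ_FIRST(&wq_aio->wa_thidlelist) &&
+ *         wq_aio->wa_nthreads < WORKQUEUE_AIO_MAXTHREADS) {
+ *         if (workq_aio_add_new_thread(p, wq_aio) != KERN_SUCCESS) {
+ *             break;
+ *         }
+ *     }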
+ */ + wq_aio->wa_nthreads--; + return kret; +} + +static void +workq_aio_wakeup_thread_internal(proc_t p, bool unlock) +{ + workq_aio_t wq_aio = proc_get_aio_wqptr(p); + bool needs_wakeup = false; + struct uthread *uth = NULL; + + if (!wq_aio) { + goto out; + } + + uth = TAILQ_FIRST(&wq_aio->wa_thidlelist); + while (!uth && (wq_aio->wa_nthreads < WORKQUEUE_AIO_MAXTHREADS) && + !(thread_get_tag(current_thread()) & THREAD_TAG_AIO_WORKQUEUE)) { + if (workq_aio_add_new_thread(p, wq_aio) != KERN_SUCCESS) { + break; + } + uth = TAILQ_FIRST(&wq_aio->wa_thidlelist); + } + + if (!uth) { + goto out; + } + + TAILQ_REMOVE(&wq_aio->wa_thidlelist, uth, uu_workq_entry); + wq_aio->wa_thidlecount--; + + TAILQ_INSERT_TAIL(&wq_aio->wa_thrunlist, uth, uu_workq_entry); + assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0); + uth->uu_workq_flags |= UT_WORKQ_RUNNING; + + WQ_AIO_TRACE_WQ(AIO_WQ_aio_thread_wakeup | DBG_FUNC_NONE, wq_aio); + + if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) { + uth->uu_workq_flags ^= UT_WORKQ_DYING; + workq_aio_death_policy_evaluate(wq_aio, 1); + needs_wakeup = false; + } else { + needs_wakeup = true; + } +out: + if (unlock) { + aio_proc_unlock(p); + } + + if (uth && needs_wakeup) { + workq_aio_thread_wakeup(uth); + } +} + +static void +workq_aio_wakeup_thread_and_unlock(proc_t p) +{ + return workq_aio_wakeup_thread_internal(p, true); +} + +static void +workq_aio_wakeup_thread(proc_t p) +{ + return workq_aio_wakeup_thread_internal(p, false); +} + +void +workq_aio_prepare(struct proc *p) +{ + workq_aio_t wq_aio = proc_get_aio_wqptr(p); + + if (__improbable(!wq_aio && !proc_in_teardown(p))) { + workq_aio_open(p); + } +} + +bool +workq_aio_entry_add_locked(struct proc *p, aio_workq_entry *entryp) +{ + workq_aio_t wq_aio = proc_get_aio_wqptr(p); + bool ret = false; + + ASSERT_AIO_PROC_LOCK_OWNED(p); + + if (!proc_in_teardown(p) && wq_aio && !_wq_exiting(wq_aio)) { + TAILQ_INSERT_TAIL(&wq_aio->wa_aioq_entries, entryp, aio_workq_link); + ret = true; + } + + return ret; +} + +bool +workq_aio_entry_remove_locked(struct proc *p, aio_workq_entry *entryp) +{ + workq_aio_t wq_aio = proc_get_aio_wqptr(p); + + ASSERT_AIO_PROC_LOCK_OWNED(p); + + if (entryp->aio_workq_link.tqe_prev == NULL) { + panic("Trying to remove an entry from a work queue, but it is not on a queue"); + } + + TAILQ_REMOVE(&wq_aio->wa_aioq_entries, entryp, aio_workq_link); + entryp->aio_workq_link.tqe_prev = NULL; /* Not on a workq */ + + return true; +} diff --git a/bsd/kern/kern_authorization.c b/bsd/kern/kern_authorization.c index efedae762..835a8ff8a 100644 --- a/bsd/kern/kern_authorization.c +++ b/bsd/kern/kern_authorization.c @@ -132,7 +132,7 @@ static int kauth_authorize_generic_callback(kauth_cred_t _credential, void uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3); kauth_scope_t kauth_scope_fileop; -extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); +extern bool cansignal(struct proc *, kauth_cred_t, struct proc *, int); extern char * get_pathbuff(void); extern void release_pathbuff(char *path); diff --git a/bsd/kern/kern_codesigning.c b/bsd/kern/kern_codesigning.c index 530202d16..f6f502f1a 100644 --- a/bsd/kern/kern_codesigning.c +++ b/bsd/kern/kern_codesigning.c @@ -232,6 +232,7 @@ code_signing_configuration( int cs_enforcement_disabled = 0; int cs_integrity_skip = 0; int amfi_relax_profile_trust = 0; + int amfi_dev_mode_policy = 0; /* Parse the AMFI mask */ PE_parse_boot_argn("amfi", &amfi_mask, sizeof(amfi_mask)); @@ -272,6 +273,12 @@ code_signing_configuration( 
&amfi_relax_profile_trust, sizeof(amfi_relax_profile_trust)); + /* Parse the AMFI customer developer mode policy */ + PE_parse_boot_argn( + "amfi_dev_mode_policy", + &amfi_dev_mode_policy, + sizeof(amfi_dev_mode_policy)); + /* CS_CONFIG_UNRESTRICTED_DEBUGGING */ if (amfi_mask & CS_AMFI_MASK_UNRESTRICT_TASK_FOR_PID) { config |= CS_CONFIG_UNRESTRICTED_DEBUGGING; @@ -316,6 +323,11 @@ code_signing_configuration( config |= CS_CONFIG_RELAX_PROFILE_TRUST; } + /* CS_CONFIG_DEV_MODE_POLICY */ + if (amfi_dev_mode_policy) { + config |= CS_CONFIG_DEV_MODE_POLICY; + } + #if CONFIG_SPTM if (csm_enabled() == true) { @@ -503,6 +515,29 @@ developer_mode_state(void) return os_atomic_load(developer_mode_enabled, relaxed); } +#pragma mark Research Mode + +SECURITY_READ_ONLY_LATE(bool) research_mode_enabled = false; +SECURITY_READ_ONLY_LATE(bool) extended_research_mode_enabled = false; + +bool +research_mode_state(void) +{ + if (allow_research_modes() == true) { + return research_mode_enabled; + } + return false; +} + +bool +extended_research_mode_state(void) +{ + if (allow_research_modes() == true) { + return extended_research_mode_enabled; + } + return false; +} + #pragma mark Restricted Execution Mode kern_return_t @@ -982,7 +1017,7 @@ csm_resolve_os_entitlements_from_proc( } kern_return_t -address_space_debugged( +address_space_debugged_state( const proc_t process) { /* Must pass in a valid proc_t */ @@ -1049,6 +1084,12 @@ address_space_debugged( return KERN_DENIED; } +bool +is_address_space_debugged(const proc_t process) +{ + return address_space_debugged_state(process) == KERN_SUCCESS; +} + #if CODE_SIGNING_MONITOR bool @@ -1207,6 +1248,18 @@ csm_reconstitute_code_signature( unneeded_size); } +kern_return_t +csm_setup_nested_address_space( + pmap_t pmap, + const vm_address_t region_addr, + const vm_size_t region_size) +{ + return CSM_PREFIX(setup_nested_address_space)( + pmap, + region_addr, + region_size); +} + kern_return_t csm_associate_code_signature( pmap_t monitor_pmap, diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index f91c86d72..94c4ec880 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2021 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,7 +32,7 @@ * This file contains machine independent code for performing core dumps. * */ -#if CONFIG_COREDUMP +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP #include #include @@ -73,13 +73,85 @@ #include #endif /* CONFIG_MACF */ + #include +extern int freespace_mb(vnode_t vp); + +/* XXX not in a Mach header anywhere */ +kern_return_t thread_getstatus(thread_t act, int flavor, + thread_state_t tstate, mach_msg_type_number_t *count); +void task_act_iterate_wth_args(task_t, void (*)(thread_t, void *), void *); + +#ifdef SECURE_KERNEL +__XNU_PRIVATE_EXTERN int do_coredump = 0; /* default: don't dump cores */ +#else +__XNU_PRIVATE_EXTERN int do_coredump = 1; /* default: dump cores */ +#endif /* SECURE_KERNEL */ +__XNU_PRIVATE_EXTERN int sugid_coredump = 0; /* default: but not SGUID binaries */ + +#if CONFIG_UCOREDUMP +__XNU_PRIVATE_EXTERN int do_ucoredump = 0; /* default: kernel does dumps */ +#endif + +/* + * is_coredump_eligible + * + * Determine if a core should even be dumped at all (by any mechanism) + * + * Does NOT include disk permission or space constraints + * + * core_proc Process to dump core [*] must be current proc! 
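+ *
+ * (Editor's note: illustrative usage sketch, not part of this change; it
+ * mirrors the check added to coredump() later in this patch.) A typical
+ * caller simply bails out on a non-zero errno:
+ *
+ *     if ((error = is_coredump_eligible(core_proc)) != 0) {
+ *         goto out2;
+ *     }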
+ * + * Return: 0 Success + * !0 Failure errno + */ +int +is_coredump_eligible(proc_t core_proc) +{ + if (current_proc() != core_proc && ( + core_proc->p_exit_reason && + core_proc->p_exit_reason->osr_namespace == OS_REASON_JETSAM)) { + return EPERM; + } + if (current_proc() != core_proc) { + panic("coredump for proc that is not current: %p)", core_proc); + } + + vfs_context_t ctx = vfs_context_current(); + kauth_cred_t cred = vfs_context_ucred(ctx); + + if (do_coredump == 0 || /* Not dumping at all */ + ((sugid_coredump == 0) && /* Not dumping SUID/SGID binaries */ + ((kauth_cred_getsvuid(cred) != kauth_cred_getruid(cred)) || + (kauth_cred_getsvgid(cred) != kauth_cred_getrgid(cred))))) { + return EPERM; + } + +#if CONFIG_MACF + const int error = mac_proc_check_dump_core(core_proc); + if (error != 0) { + return error; + } +#endif + return 0; +} + +#else /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */ + +/* When core dumps aren't needed, no need to compile this file at all */ + +#error assertion failed: this section is not compiled + +#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */ + +#if CONFIG_COREDUMP + #define COREDUMP_CUSTOM_LOCATION_ENTITLEMENT "com.apple.private.custom-coredump-location" typedef struct { int flavor; /* the number for this flavor */ - mach_msg_type_number_t count; /* count of ints in this flavor */ + mach_msg_type_number_t count; /* count of ints in this flavor */ } mythread_state_flavor_t; #if defined (__i386__) || defined (__x86_64__) @@ -109,21 +181,6 @@ typedef struct { size_t flavor_count; } tir_t; -extern int freespace_mb(vnode_t vp); - -/* XXX not in a Mach header anywhere */ -kern_return_t thread_getstatus(thread_t act, int flavor, - thread_state_t tstate, mach_msg_type_number_t *count); -void task_act_iterate_wth_args(task_t, void (*)(thread_t, void *), void *); - -#ifdef SECURE_KERNEL -__XNU_PRIVATE_EXTERN int do_coredump = 0; /* default: don't dump cores */ -#else -__XNU_PRIVATE_EXTERN int do_coredump = 1; /* default: dump cores */ -#endif /* SECURE_KERNEL */ -__XNU_PRIVATE_EXTERN int sugid_coredump = 0; /* default: but not SGUID binaries */ - - /* cpu_type returns only the most generic indication of the current CPU. */ /* in a core we want to know the kind of process. */ @@ -299,9 +356,9 @@ dump_notes(proc_t __unused core_proc, vm_offset_t header, size_t hoffset, struct * indicated * * Parameters: core_proc Process to dump core [*] - * reserve_mb If non-zero, leave filesystem with - * at least this much free space. - * coredump_flags Extra options (ignore rlimit, run fsync) + * reserve_mb If non-zero, leave filesystem with + * at least this much free space. 
+ * coredump_flags Extra options (ignore rlimit, run fsync) * * Returns: 0 Success * !0 Failure errno @@ -344,8 +401,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) mach_msg_type_number_t vbrcount = 0; tir_t tir1; struct vnode * vp; - struct mach_header *mh = NULL; /* protected by is_64 */ - struct mach_header_64 *mh64 = NULL; /* protected by is_64 */ + struct mach_header *mh = NULL; /* protected by is_64 */ + struct mach_header_64 *mh64 = NULL; /* protected by is_64 */ int is_64 = 0; size_t mach_header_sz = sizeof(struct mach_header); size_t segment_command_sz = sizeof(struct segment_command); @@ -358,27 +415,10 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) bool include_iokit_memory = task_is_driver(task); bool coredump_attempted = false; - if (current_proc() != core_proc) { - COREDUMPLOG("Skipping coredump (called against proc that is not current_proc: %p)", core_proc); - error = EFAULT; + if ((error = is_coredump_eligible(core_proc)) != 0) { goto out2; } - if (do_coredump == 0 || /* Not dumping at all */ - ((sugid_coredump == 0) && /* Not dumping SUID/SGID binaries */ - ((kauth_cred_getsvuid(cred) != kauth_cred_getruid(cred)) || - (kauth_cred_getsvgid(cred) != kauth_cred_getrgid(cred))))) { - error = EFAULT; - goto out2; - } - -#if CONFIG_MACF - error = mac_proc_check_dump_core(core_proc); - if (error != 0) { - goto out2; - } -#endif - if (IS_64BIT_PROCESS(core_proc)) { is_64 = 1; mach_header_sz = sizeof(struct mach_header_64); @@ -425,6 +465,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) (void) task_suspend_internal(task); + alloced_name = zalloc_flags(ZV_NAMEI, Z_NOWAIT | Z_ZERO); /* create name according to sysctl'able format string */ @@ -456,7 +497,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) goto out; } - VATTR_INIT(vap); /* better to do it here than waste more stack in vnode_setsize */ + VATTR_INIT(vap); /* better to do it here than waste more stack in vnode_setsize */ VATTR_SET(vap, va_data_size, 0); if (core_proc == initproc) { VATTR_SET(vap, va_dataprotect_class, PROTECTION_CLASS_D); @@ -479,7 +520,7 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) */ thread_count = get_task_numacts(task); - segment_count = get_vmmap_entries(map); /* XXX */ + segment_count = get_vmmap_entries(map); /* XXX */ tir1.flavor_count = sizeof(thread_flavor_array) / sizeof(mythread_state_flavor_t); bcopy(thread_flavor_array, flavors, sizeof(thread_flavor_array)); tstate_size = 0; @@ -561,8 +602,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) mh->sizeofcmds = (uint32_t)command_size; } - hoffset = mach_header_sz; /* offset into header */ - foffset = round_page(header_size); /* offset into file */ + hoffset = mach_header_sz; /* offset into header */ + foffset = round_page(header_size); /* offset into file */ vmoffset = MACH_VM_MIN_ADDRESS; /* offset into VM */ COREDUMPLOG("mach header size: %zu", header_size); @@ -694,8 +735,8 @@ coredump(proc_t core_proc, uint32_t reserve_mb, int coredump_flags) sc->segname[0] = 0; sc->vmaddr = CAST_DOWN_EXPLICIT(uint32_t, vmoffset); sc->vmsize = CAST_DOWN_EXPLICIT(uint32_t, vmsize); - sc->fileoff = CAST_DOWN_EXPLICIT(uint32_t, foffset); /* will never truncate */ - sc->filesize = CAST_DOWN_EXPLICIT(uint32_t, fsize); /* will never truncate */ + sc->fileoff = CAST_DOWN_EXPLICIT(uint32_t, foffset); /* will never truncate */ + sc->filesize = CAST_DOWN_EXPLICIT(uint32_t, fsize); /* will never truncate */ sc->maxprot = maxprot; 
sc->initprot = prot; sc->nsects = 0; @@ -791,10 +832,4 @@ out2: return error; } -#else /* CONFIG_COREDUMP */ - -/* When core dumps aren't needed, no need to compile this file at all */ - -#error assertion failed: this section is not compiled - #endif /* CONFIG_COREDUMP */ diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index b7984ffcb..d9a72d0c7 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -1918,7 +1918,7 @@ kauth_cred_change_egid(kauth_cred_t cred, gid_t new_egid) } -uid_t +__mockable uid_t kauth_cred_getuid(kauth_cred_t cred) { return posix_cred_get(cred)->cr_uid; @@ -3414,7 +3414,9 @@ kauth_cred_init(void) smr_shash_init(&kauth_cred_hash, SMRSH_BALANCED, maxproc / 4); vfs_context0.vc_ucred = posix_cred_create(&kernel_cred_template); } +#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* smr not supported in user-mode */ STARTUP(ZALLOC, STARTUP_RANK_LAST, kauth_cred_init); +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ uid_t kauth_getuid(void) diff --git a/bsd/kern/kern_csr.c b/bsd/kern/kern_csr.c index b2ab23271..604f16a3d 100644 --- a/bsd/kern/kern_csr.c +++ b/bsd/kern/kern_csr.c @@ -43,7 +43,7 @@ #include #include -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #include #endif @@ -120,6 +120,15 @@ _csr_is_restore_environment(void) return PE_parse_boot_argn("-restore", ¬used, sizeof(notused)); } +static bool +_csr_is_darwinos_ramdisk(void) +{ + DTEntry chosen; + + return SecureDTLookupEntry(0, "/chosen", &chosen) == kSuccess && + _csr_dt_string_is_equal(&chosen, "osenvironment", "darwinos-ramdisk"); +} + static bool _csr_is_iuou_or_iuos_device(void) { @@ -201,7 +210,8 @@ csr_bootstrap(void) // This is required so the MSU stack can mount/unmount the update volume // during paired recovery. if (_csr_is_recovery_environment() || - _csr_is_restore_environment()) { + _csr_is_restore_environment() || + _csr_is_darwinos_ramdisk()) { csr_config |= CSR_ALLOW_UNRESTRICTED_FS; } @@ -216,13 +226,6 @@ csr_bootstrap(void) } else { csr_config &= ~CSR_ALLOW_UNAUTHENTICATED_ROOT; } - -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) - // Check whether we have to disable CTRR. - // lp-sip2 in the local boot policy is the bit driving this, - // which csrutil also sets implicitly when e.g. requesting kernel debugging. - csr_unsafe_kernel_text = _csr_get_dt_bool(&entry, "lp-sip2", &bool_value) && bool_value; -#endif } STARTUP(TUNABLES, STARTUP_RANK_FIRST, csr_bootstrap); diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index 91969eb32..a54b019c4 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -1731,6 +1731,8 @@ fp_close_and_unlock(proc_t p, kauth_cred_t cred, int fd, struct fileproc *fp, in proc_fdunlock(p); if (FILEGLOB_DTYPE(fg) == DTYPE_VNODE) { + vnode_t vp = (vnode_t)fg_get_data(fg); + /* * call out to allow 3rd party notification of close. * Ignore result of kauth_authorize_fileop call. 
@@ -1742,15 +1744,15 @@ fp_close_and_unlock(proc_t p, kauth_cred_t cred, int fd, struct fileproc *fp, in #endif if (kauth_authorize_fileop_has_listeners() && - vnode_getwithref((vnode_t)fg_get_data(fg)) == 0) { + vnode_getwithref(vp) == 0) { u_int fileop_flags = 0; if (fg->fg_flag & FWASWRITTEN) { fileop_flags |= KAUTH_FILEOP_CLOSE_MODIFIED; } kauth_authorize_fileop(fg->fg_cred, KAUTH_FILEOP_CLOSE, - (uintptr_t)fg_get_data(fg), (uintptr_t)fileop_flags); + (uintptr_t)vp, (uintptr_t)fileop_flags); - vnode_put((vnode_t)fg_get_data(fg)); + vnode_put(vp); } } @@ -1861,6 +1863,16 @@ dupfdopen(proc_t p, int indx, int dfd, int flags, int error) return EPERM; } + if (wfp->f_type == DTYPE_VNODE) { + vnode_t vp = (vnode_t)fp_get_data(wfp); + + /* Don't allow opening symlink if O_SYMLINK was not specified. */ + if (vp && (vp->v_type == VLNK) && ((flags & O_SYMLINK) == 0)) { + proc_fdunlock(p); + return ELOOP; + } + } + /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. @@ -2764,6 +2776,7 @@ sys_fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) int i, tmp, error, error2, flg = 0; struct flock fl = {}; struct flocktimeout fltimeout; + struct user32_flocktimeout user32_fltimeout; struct timespec *timeout = NULL; off_t offset; int newmin; @@ -3025,9 +3038,20 @@ sys_fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) /* Copy in the lock structure */ if (F_SETLKWTIMEOUT == cmd || F_OFD_SETLKWTIMEOUT == cmd) { - error = copyin(argp, (caddr_t) &fltimeout, sizeof(fltimeout)); - if (error) { - goto outdrop; + /* timespec uses long, so munge when we're dealing with 32-bit userspace */ + if (is64bit) { + error = copyin(argp, (caddr_t) &fltimeout, sizeof(fltimeout)); + if (error) { + goto outdrop; + } + } else { + error = copyin(argp, (caddr_t) &user32_fltimeout, sizeof(user32_fltimeout)); + if (error) { + goto outdrop; + } + fltimeout.fl = user32_fltimeout.fl; + fltimeout.timeout.tv_sec = user32_fltimeout.timeout.tv_sec; + fltimeout.timeout.tv_nsec = user32_fltimeout.timeout.tv_nsec; } fl = fltimeout.fl; timeout = &fltimeout.timeout; @@ -4401,6 +4425,16 @@ dropboth: struct vnode_attr va; +#if CONFIG_MACF + // tmp has already explicitly downcast to uint32_t above. + uint32_t dataprotect_class = (uint32_t)tmp; + if ((error = mac_vnode_check_dataprotect_set(ctx, vp, &dataprotect_class))) { + vnode_put(vp); + goto outdrop; + } + tmp = (int)dataprotect_class; +#endif + VATTR_INIT(&va); VATTR_SET(&va, va_dataprotect_class, tmp); @@ -5479,11 +5513,11 @@ fstat(proc_t p, int fd, user_addr_t ub, user_addr_t xsecurity, case DTYPE_VNODE: if ((error = vnode_getwithref((vnode_t)data)) == 0) { /* - * If the caller has the file open, and is not - * requesting extended security information, we are + * If the caller has the file open for reading, and is + * not requesting extended security information, we are * going to let them get the basic stat information. 
*/ - if (xsecurity == USER_ADDR_NULL) { + if ((fp->f_flag & FREAD) && (xsecurity == USER_ADDR_NULL)) { error = vn_stat_noauth((vnode_t)data, sbptr, NULL, isstat64, 0, ctx, fp->fp_glob->fg_cred); } else { diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index c85e740ad..ab090688b 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -148,7 +148,7 @@ SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params = VM_PACKING_PARAMS(KNOTE_KQ_PACKED); extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */ -extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */ +extern bool cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */ #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code)) @@ -310,15 +310,16 @@ extern const struct filterops fsevent_filtops; extern const struct filterops vnode_filtops; extern const struct filterops tty_filtops; -const static struct filterops file_filtops; -const static struct filterops kqread_filtops; -const static struct filterops proc_filtops; -const static struct filterops timer_filtops; -const static struct filterops user_filtops; -const static struct filterops workloop_filtops; +__security_const_early static struct filterops file_filtops; +__security_const_early static struct filterops kqread_filtops; +__security_const_early static struct filterops proc_filtops; +__security_const_early static struct filterops timer_filtops; +__security_const_early static struct filterops user_filtops; +__security_const_early static struct filterops workloop_filtops; #if CONFIG_EXCLAVES extern const struct filterops exclaves_notification_filtops; #endif /* CONFIG_EXCLAVES */ +extern const struct filterops aio_filtops; /* * @@ -340,7 +341,7 @@ static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = { /* Public Filters */ [~EVFILT_READ] = &file_filtops, [~EVFILT_WRITE] = &file_filtops, - [~EVFILT_AIO] = &bad_filtops, + [~EVFILT_AIO] = &aio_filtops, [~EVFILT_VNODE] = &file_filtops, [~EVFILT_PROC] = &proc_filtops, [~EVFILT_SIGNAL] = &sig_filtops, diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 58b1d9c2a..47420f9f8 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -175,12 +176,14 @@ #include #include +#include #if CONFIG_MEMORYSTATUS #include #endif #include +#include /* kIODriverKitEntitlementKey */ #include "kern_exec_internal.h" @@ -204,7 +207,17 @@ static TUNABLE(bool, unentitled_ios_sim_launch, "unentitled_ios_sim_launch", fal #endif /* DEBUG || DEVELOPMENT */ #endif /* XNU_TARGET_OS_OSX */ - +#if DEVELOPMENT || DEBUG +os_log_t exec_log_handle = NULL; +#define EXEC_LOG(fmt, ...) \ +do { \ + if (exec_log_handle) { \ + os_log_with_type(exec_log_handle, OS_LOG_TYPE_INFO, "exec - %s:%d " fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } \ +} while (0) +#else /* DEVELOPMENT || DEBUG */ +#define EXEC_LOG(fmt, ...) 
do { } while (0) +#endif /* DEVELOPMENT || DEBUG */ #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ @@ -290,7 +303,6 @@ int task_add_conclave(task_t task, void *vnode, int64_t off, const char *task_co kern_return_t task_inherit_conclave(task_t old_task, task_t new_task, void *vnode, int64_t off); #endif /* CONFIG_EXCLAVES */ - /* * Mach things for which prototypes are unavailable from Mach headers */ @@ -351,40 +363,6 @@ extern int nextpidversion; */ #define SPAWN_SET_PANIC_CRASH_BEHAVIOR "com.apple.private.spawn-panic-crash-behavior" -/* - * This entitlement marks security critical binaries for which the spawned - * process should be hardened. Implies enable-by-default for enablement - * of security features. These defaults can be overridden with the control - * entitlements for the sub-features below. - */ -#define SPAWN_ENABLE_HARDENED_PROCESS "com.apple.developer.hardened-process" - -#if DEVELOPMENT || DEBUG -/* - * The following boot-arg defines the behavior for the case - * where a binary entitled as hardened-process but doesn't - * have a specific sub-feature entitlement, which is still - * under adoption. - */ -typedef enum { - HARDENED_PROCESS_CONFIG_SILENT = 0, - HARDENED_PROCESS_CONFIG_LOG = 1, - HARDENED_PROCESS_CONFIG_FATAL = 2, - HARDENED_PROCESS_CONFIG_MAX = 3 -} hardened_process_config_policy; - -TUNABLE(hardened_process_config_policy, - hardened_process_config, - "hardened_process_config", - HARDENED_PROCESS_CONFIG_SILENT); -#endif /* DEVELOPMENT || DEBUG */ - -/* - * Control entitlement to enable/disable hardened-heap in the process. - */ -#define SPAWN_ENABLE_HARDENED_HEAP "com.apple.developer.hardened-process.hardened-heap" - - /* Platform Code Exec Logging */ static int platform_exec_logging = 0; @@ -395,6 +373,7 @@ SYSCTL_INT(_security_mac, OID_AUTO, platform_exec_logging, CTLFLAG_RW, &platform static os_log_t peLog = OS_LOG_DEFAULT; + struct exception_port_action_t { ipc_port_t port; _ps_port_action_t *port_action; @@ -417,7 +396,7 @@ static int execargs_alloc(struct image_params *imgp); static int execargs_free(struct image_params *imgp); static int exec_check_permissions(struct image_params *imgp); static int exec_extract_strings(struct image_params *imgp); -static int exec_add_apple_strings(struct image_params *imgp, const load_result_t *load_result); +static int exec_add_apple_strings(struct image_params *imgp, const load_result_t *load_result, task_t task); static int exec_handle_sugid(struct image_params *imgp); static int sugid_scripts = 0; SYSCTL_INT(_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW | CTLFLAG_LOCKED, &sugid_scripts, 0, ""); @@ -434,8 +413,6 @@ static errno_t exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_a task_role_t psa_darwin_role, struct exec_port_actions *port_actions); static void exec_port_actions_destroy(struct exec_port_actions *port_actions); - - /* * exec_add_user_string * @@ -973,23 +950,24 @@ set_crash_behavior_from_bootarg(proc_t p) void set_proc_name(struct image_params *imgp, proc_t p) { - int p_name_len = sizeof(p->p_name) - 1; + uint64_t buflen = imgp->ip_ndp->ni_cnd.cn_namelen; + const int p_name_len = sizeof(p->p_name) - 1; + const int p_comm_len = sizeof(p->p_comm) - 1; - if (imgp->ip_ndp->ni_cnd.cn_namelen > p_name_len) { - imgp->ip_ndp->ni_cnd.cn_namelen = p_name_len; + if (buflen > p_name_len) { + buflen = p_name_len; } - bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name, - (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen); - p->p_name[imgp->ip_ndp->ni_cnd.cn_namelen] 
= '\0'; + bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_name, buflen); + p->p_name[buflen] = '\0'; - if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN) { - imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN; + if (buflen > p_comm_len) { + static_assert(MAXCOMLEN + 1 == sizeof(p->p_comm)); + buflen = p_comm_len; } - bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm, - (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen); - p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0'; + bcopy((caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, (caddr_t)p->p_comm, buflen); + p->p_comm[buflen] = '\0'; #if (DEVELOPMENT || DEBUG) /* @@ -1042,8 +1020,10 @@ get_teamid_for_shared_region(struct image_params *imgp) static inline bool arm64_cpusubtype_uses_ptrauth(cpu_subtype_t cpusubtype) { + int ptrauth_abi_version = (int)CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(cpusubtype); return (cpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E && - CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(cpusubtype) == CPU_SUBTYPE_ARM64_PTR_AUTH_CURRENT_VERSION; + (ptrauth_abi_version >= CPU_SUBTYPE_ARM64_PTR_AUTHV0_VERSION && + ptrauth_abi_version <= CPU_SUBTYPE_ARM64_PTR_AUTH_MAX_PREFERRED_VERSION); } #endif /* __has_feature(ptrauth_calls) */ @@ -1078,54 +1058,19 @@ binary_match(cpu_type_t mask, cpu_type_t req_cpu, } -#define MIN_IOS_TPRO_SDK_VERSION 0x00100000 -#define MIN_OSX_TPRO_SDK_VERSION 0x000D0000 -#define MIN_TVOS_TPRO_SDK_VERSION 0x000D0000 -#define MIN_WATCHOS_TPRO_SDK_VERSION 0x00090000 -#define MIN_DRIVERKIT_TPRO_SDK_VERSION 0x00600000 - -static void -exec_setup_tpro(struct image_params *imgp, load_result_t *load_result) +/* + * Check entitlements to see if this is a platform restrictions binary. + * Save this in load_result until later for two purposes: + * 1. We can mark the task at a certain security level once it's been created + * 2. 
We can propagate which entitlements are present to the apple array + */ +static inline void +encode_HR_entitlement(const char *entitlement, hardened_browser_flags_t mask, + const struct image_params *imgp, load_result_t *load_result) { - extern boolean_t xprr_tpro_enabled; - extern boolean_t enable_user_modifiable_perms; - uint32_t min_sdk_version = 0; - - /* x86-64 translated code cannot take advantage of TPRO */ - if (imgp->ip_flags & IMGPF_ROSETTA) { - return; + if (IOVnodeHasEntitlement(imgp->ip_vp, (int64_t)imgp->ip_arch_offset, entitlement)) { + load_result->hardened_browser |= mask; } - - /* Do not enable on 32-bit VA targets */ - if (!(imgp->ip_flags & IMGPF_IS_64BIT_ADDR)) { - return; - } - - switch (load_result->ip_platform) { - case PLATFORM_IOS: - case PLATFORM_IOSSIMULATOR: - case PLATFORM_MACCATALYST: - min_sdk_version = MIN_IOS_TPRO_SDK_VERSION; - break; - case PLATFORM_MACOS: - min_sdk_version = MIN_OSX_TPRO_SDK_VERSION; - break; - case PLATFORM_TVOS: - case PLATFORM_TVOSSIMULATOR: - min_sdk_version = MIN_TVOS_TPRO_SDK_VERSION; - break; - case PLATFORM_WATCHOS: - case PLATFORM_WATCHOSSIMULATOR: - min_sdk_version = MIN_WATCHOS_TPRO_SDK_VERSION; - break; - case PLATFORM_DRIVERKIT: - min_sdk_version = MIN_DRIVERKIT_TPRO_SDK_VERSION; - break; - default: - /* TPRO is on by default for newer platforms */ - break; - } - } /* @@ -1148,157 +1093,164 @@ vnode_is_rsr(vnode_t vp) return FALSE; } +static struct { + char *legacy; + char *security; +} exec_security_mitigation_entitlement[] = { + /* The following entries must match the enum declaration in kern_exec_internal.h */ + [HARDENED_PROCESS] = { + "com.apple.developer.hardened-process", + "com.apple.security.hardened-process" + }, + [HARDENED_HEAP] = { + "com.apple.developer.hardened-process.hardened-heap", + "com.apple.security.hardened-process.hardened-heap" + }, + [TPRO] = { + NULL, + "com.apple.security.hardened-process.dyld-ro", + }, +}; -// Check entitlements to see if this is a hardened runtime binary. -// Save this in load_result until later for two purposes: -// 1. Once the task is created, we can mark it as hardened runtime if needed -// 2. we can propagate which entitlements are present to the apple array +/* + * Platform Restrictions + * + * This mitigation opts you into the grab bag of various kernel mitigations + * including IPC security restrictions + * The presence of the entitlement opts the binary into the feature. + * The entitlement is an entitlement containing a version number + * for the platform restrictions you are opting into. + */ +#define SPAWN_ENABLE_PLATFORM_RESTRICTIONS "com.apple.security.hardened-process.platform-restrictions" + +/* + * Version number for enhanced security + * Currently stored with 3 bits in `hardened_process_version` + */ +#define HARDENED_PROCESS_VERSION "com.apple.security.hardened-process.enhanced-security-version" + +/* See kern_exec_internal.h for the extensive documentation. 
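+ *
+ * (Editor's note: illustrative usage sketch, not part of this change; all
+ * names are taken from this patch, condensing what
+ * imgact_setup_hardened_heap() does below.) A typical caller switches on
+ * the result and treats the dual-entitlement case as fatal:
+ *
+ *     switch (exec_check_security_entitlement(imgp, HARDENED_HEAP)) {
+ *     case EXEC_SECURITY_ENTITLED:
+ *         task_set_hardened_heap(task);
+ *         break;
+ *     case EXEC_SECURITY_NOT_ENTITLED:
+ *         task_clear_hardened_heap(task);
+ *         break;
+ *     case EXEC_SECURITY_INVALID_CONFIG:
+ *         return EINVAL;
+ *     }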
*/ +exec_security_err_t +exec_check_security_entitlement(struct image_params *imgp, + exec_security_mitigation_entitlement_t entitlement) +{ + bool has_legacy_entitlement = false, has_security_entitlement = false; + assert(exec_security_mitigation_entitlement[entitlement].security != NULL); + + if (exec_security_mitigation_entitlement[entitlement].legacy != NULL) { + has_legacy_entitlement = + IOVnodeHasEntitlement(imgp->ip_vp, (int64_t)imgp->ip_arch_offset, + exec_security_mitigation_entitlement[entitlement].legacy); + } + + has_security_entitlement = + IOVnodeHasEntitlement(imgp->ip_vp, (int64_t)imgp->ip_arch_offset, + exec_security_mitigation_entitlement[entitlement].security); + + /* If both entitlements are present, this is an invalid configuration. */ + if (has_legacy_entitlement && has_security_entitlement) { + EXEC_LOG("Binary has both legacy (%s) and security (%s) entitlements\n", + exec_security_mitigation_entitlement[entitlement].legacy, + exec_security_mitigation_entitlement[entitlement].security); + + return EXEC_SECURITY_INVALID_CONFIG; + } + + if (has_legacy_entitlement || has_security_entitlement) { + return EXEC_SECURITY_ENTITLED; + } + + return EXEC_SECURITY_NOT_ENTITLED; +} + + +/* + * Entitled binaries get hardened_heap + */ +static inline errno_t +imgact_setup_hardened_heap(struct image_params *imgp, task_t task) +{ + exec_security_err_t ret = exec_check_security_entitlement(imgp, HARDENED_HEAP); + if (ret == EXEC_SECURITY_ENTITLED) { + task_set_hardened_heap(task); + } else { + task_clear_hardened_heap(task); + } + switch (ret) { + case EXEC_SECURITY_INVALID_CONFIG: + return EINVAL; + case EXEC_SECURITY_ENTITLED: + case EXEC_SECURITY_NOT_ENTITLED: + return 0; + } +} + +/* + * Configure the platform restrictions security features on the task + * This must be done before `ipc_task_enable` so that the bits + * can be propogated to the ipc space. + * + * Requires `exectextresetvp` to be called on `task` previously so + * that we can use the `IOTaskGetEntitlement` API + */ static inline void -encode_HR_entitlement(const char *entitlement, HR_flags_t mask, - const struct image_params *imgp, load_result_t *load_result) +exec_setup_platform_restrictions(task_t task) { - if (IOVnodeHasEntitlement(imgp->ip_vp, (int64_t)imgp->ip_arch_offset, entitlement)) { - load_result->hardened_runtime_binary |= mask; - } -} - -#if DEVELOPMENT || DEBUG -/* - * This function handles the case where the hardened-process entitlement - * is set without a specific sub-feature entitlement, which is still under - * adoption. - * - * For in-adoption features, the fallout of having certain - * security sensitive components enabled but not qualified - * is potentially too large. Therefore, we allow to have a - * "forcing period" in which every binary entitled as - * hardened-process is required to have an explicit setting - * (true or false) for the security feature or otherwise - * gets killed or at least traced at launch. - * - * return value: true if all policies restrictions met, - * false otherwise. - */ -static inline bool -handle_missing_subfeature_entitlement( - const struct image_params *imgp, - const char *subfeature_entitlement) -{ - switch (hardened_process_config) { - case HARDENED_PROCESS_CONFIG_SILENT: - break; - case HARDENED_PROCESS_CONFIG_LOG: - /* - * Use the name directly from imgp since we haven't - * set_proc_name() yet. 
- */ - printf("[WARNING] %s has hardened-process but not %s\n", - imgp->ip_ndp->ni_cnd.cn_nameptr, - subfeature_entitlement); - break; - case HARDENED_PROCESS_CONFIG_FATAL: - /* - * When the policy defined as FATAL, we SIGKILL - * the process. - */ - printf("[ERROR] %s has hardened-process but not %s\n", - imgp->ip_ndp->ni_cnd.cn_nameptr, - subfeature_entitlement); - return false; - default: - panic("invalid hardened-process policy"); + uint64_t value = 0; + /* Set platform restrictions version */ + if (task_get_platform_binary(task)) { + task_set_platform_restrictions_version(task, 2); + } else if (IOTaskGetIntegerEntitlement(task, SPAWN_ENABLE_PLATFORM_RESTRICTIONS, &value) && + value > 1) { + task_set_platform_restrictions_version(task, value); } - return true; -} -#endif /* DEVELOPMENT || DEBUG */ - -/* - * Handle the hardened-process.hardened-heap entitlement. - * - * Note: hardened-heap is not inherited via spawn/exec; - * It is inherited (only) on fork, which is done - * via Apple strings. - */ -static inline bool -apply_hardened_heap_policy( - struct image_params *imgp, - bool is_hardened_process) -{ - bool result = true; - bool set_hardened_heap = false; - - bool hardened_heap_ent = false; - if (IOVnodeGetBooleanEntitlement(imgp->ip_vp, - (int64_t)imgp->ip_arch_offset, - SPAWN_ENABLE_HARDENED_HEAP, - &hardened_heap_ent)) { - /* - * The hardened-heap entitlement exists, use that - * to decide about enablement. - */ - set_hardened_heap = hardened_heap_ent; - } else if (is_hardened_process) { -#if DEVELOPMENT || DEBUG - /* - * We should imply default from hardened-process. However, - * bringup will take time and could be sensitive. We want - * to allow teams to adopt incrementally. - * - * We will link hardened-heap to hardened-process when - * adoption will be more stable. - */ - if (!handle_missing_subfeature_entitlement(imgp, - SPAWN_ENABLE_HARDENED_HEAP)) { - result = false; - } -#endif /* DEVELOPMENT || DEBUG */ + /* Set hardened process version*/ + if (IOTaskGetIntegerEntitlement(task, HARDENED_PROCESS_VERSION, &value)) { + task_set_hardened_process_version(task, value); } - - if (set_hardened_heap) { - imgp->ip_flags |= IMGPF_HARDENED_HEAP; - } - - return result; } /* - * This function handles all the hardened-process related - * mitigations, parse their entitlements, and apply policies. + * This routine configures the various runtime mitigations we can apply to a process + * during image activation. This occurs before `imgact_setup_runtime_mitigations` * - * For feature-ready mitigations, having hardened-process=true - * implies enablement. Sub-features specific entitlements can - * override this, which means that even if we have hardened-process - * exists and set to true, but a sub-feature entitlement exists - * and set to false, we do not enable the sub-feature. - * - * return value: true if all policies restrictions met, - * false otherwise. + * Returns true on success, false on failure. Failure will be fatal in exec_mach_imgact(). */ -static bool -apply_hardened_process_policy( - struct image_params *imgp, - __unused proc_t proc, - __unused bool is_platform_binary) +static inline errno_t +imgact_setup_runtime_mitigations(struct image_params *imgp, __unused load_result_t *load_result, + __unused task_t old_task, task_t new_task, __unused vm_map_t map, __unused proc_t proc) { - bool result = true; + /* + * It's safe to check entitlements anytime after `load_machfile` if you check + * based on the vnode in imgp. 
We must perform this entitlement check + * before we start using load_result->hardened_browser further down + */ + load_result->hardened_browser = 0; + encode_HR_entitlement(kCSWebBrowserHostEntitlement, BrowserHostEntitlementMask, imgp, load_result); + encode_HR_entitlement(kCSWebBrowserGPUEntitlement, BrowserGPUEntitlementMask, imgp, load_result); + encode_HR_entitlement(kCSWebBrowserNetworkEntitlement, BrowserNetworkEntitlementMask, imgp, load_result); + encode_HR_entitlement(kCSWebBrowserWebContentEntitlement, BrowserWebContentEntitlementMask, imgp, load_result); + + if (load_result->hardened_browser) { + task_set_platform_restrictions_version(new_task, 1); + } + + errno_t retval = 0; /* - * Check if the binary has hardened-process entitlement. + * Hardened-heap enables a set of extra security features in our system memory allocator. */ - bool is_hardened_process = false; - if (IOVnodeHasEntitlement(imgp->ip_vp, - (int64_t)imgp->ip_arch_offset, SPAWN_ENABLE_HARDENED_PROCESS)) { - is_hardened_process = true; - } - - if (!apply_hardened_heap_policy(imgp, is_hardened_process)) { - result = false; + if ((retval = imgact_setup_hardened_heap(imgp, new_task)) != 0) { + EXEC_LOG("Invalid configuration detected for hardened-heap"); + return retval; } - return result; + + + return retval; } uint32_t @@ -1372,7 +1324,7 @@ exec_mach_imgact(struct image_params *imgp) proc_t p = vfs_context_proc(imgp->ip_vfs_context); int error = 0; task_t task; - task_t new_task = NULL; /* protected by vfexec */ + task_t new_task = NULL; /* protected by vfexec */ thread_t thread; struct uthread *uthread; vm_map_switch_context_t switch_ctx; @@ -1547,16 +1499,6 @@ grade: assert(imgp->ip_free_map == NULL); - - // It's safe to check entitlements anytime after `load_machfile` if you check - // based on the vnode in imgp. We must perform this entitlement check - // before we start using load_result->hardened_runtime_binary further down - load_result.hardened_runtime_binary = 0; - encode_HR_entitlement(kCSWebBrowserHostEntitlement, BrowserHostEntitlementMask, imgp, &load_result); - encode_HR_entitlement(kCSWebBrowserGPUEntitlement, BrowserGPUEntitlementMask, imgp, &load_result); - encode_HR_entitlement(kCSWebBrowserNetworkEntitlement, BrowserNetworkEntitlementMask, imgp, &load_result); - encode_HR_entitlement(kCSWebBrowserWebContentEntitlement, BrowserWebContentEntitlementMask, imgp, &load_result); - /* * ERROR RECOVERY * @@ -1585,7 +1527,6 @@ grade: p->p_cputype = imgp->ip_origcputype; p->p_cpusubtype = imgp->ip_origcpusubtype; proc_setplatformdata(p, load_result.ip_platform, load_result.lr_min_sdk, load_result.lr_sdk); - exec_setup_tpro(imgp, &load_result); vm_map_set_size_limit(map, proc_limitgetcur(p, RLIMIT_AS)); vm_map_set_data_limit(map, proc_limitgetcur(p, RLIMIT_DATA)); @@ -1601,11 +1542,9 @@ grade: proc_unlock(p); /* - * Handle hardened-process mitigations, parse entitlements - * and apply enablements. + * Setup runtime mitigations. 
*/ - if (!apply_hardened_process_policy(imgp, p, load_result.platform_binary)) { -#if DEVELOPMENT || DEBUG + if ((error = imgact_setup_runtime_mitigations(imgp, &load_result, current_task(), new_task, map, p)) != 0) { set_proc_name(imgp, p); exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO); if (bootarg_execfailurereports) { @@ -1616,13 +1555,8 @@ grade: imgp->ip_free_map = map; map = VM_MAP_NULL; goto badtoolate; -#endif /* DEVELOPMENT || DEBUG */ } - /* - * Set TPRO flags if enabled - */ - /* * Set code-signing flags if this binary is signed, or if parent has * requested them on exec. @@ -1720,7 +1654,7 @@ grade: * for system processes that need to match and be able to inspect * a pre-existing task. */ - int cpu_subtype = 0; /* all cpu_subtypes use the same shared region */ + int cpu_subtype = 0; /* all cpu_subtypes use the same shared region */ #if __has_feature(ptrauth_calls) char *shared_region_id = NULL; size_t len; @@ -1749,7 +1683,7 @@ grade: * Determine which shared cache to select based on being told, * matching a team-id or matching an entitlement. */ - if (load_result.hardened_runtime_binary & BrowserWebContentEntitlementMask) { + if (load_result.hardened_browser & BrowserWebContentEntitlementMask) { len = sizeof(HARDENED_RUNTIME_CONTENT_ID); shared_region_id = kalloc_data(len, Z_WAITOK | Z_NOFAIL); strlcpy(shared_region_id, HARDENED_RUNTIME_CONTENT_ID, len); @@ -1929,16 +1863,6 @@ grade: goto badtoolate; } - if (load_result.hardened_runtime_binary) { - if (cs_debug) { - printf("setting hardened runtime with entitlement mask= " - "0x%x on task: pid = %d\n", - load_result.hardened_runtime_binary, - proc_getpid(p)); - } - task_set_hardened_runtime(task, true); - } - /* * The load result will have already been munged by AMFI to include the * platform binary flag if boot-args dictated it (AMFI will mark anything @@ -1984,22 +1908,7 @@ grade: #endif /* DEVELOPMENT || DEBUG */ #endif /* XNU_TARGET_OS_OSX */ - /* - * Set starting EXC_GUARD and control port behavior for task now that - * platform and hardened runtime is set. Use the name directly from imgp since we haven't - * set_proc_name() yet. Also make control port for the task and main thread - * immovable/pinned based on task's option. - * - * Must happen before main thread port copyout in exc_add_apple_strings. 
- */ - task_set_exc_guard_ctrl_port_default(task, thread, - imgp->ip_ndp->ni_cnd.cn_nameptr, - (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen, - proc_is_simulated(p), - load_result.ip_platform, - load_result.lr_sdk); - - error = exec_add_apple_strings(imgp, &load_result); /* copies out main thread port */ + error = exec_add_apple_strings(imgp, &load_result, task); /* copies out main thread port */ if (error) { KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, @@ -2284,7 +2193,6 @@ cleanup_rosetta_fp: dtrace_proc_exec(p); #endif - if (kdebug_enable) { long args[4] = {}; @@ -2638,6 +2546,7 @@ bad_notrans: return error; } + /* * exec_validate_spawnattr_policy * @@ -2700,6 +2609,9 @@ exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_apptype, uint64_ case POSIX_SPAWN_PROC_TYPE_APP_DEFAULT: apptype = TASK_APPTYPE_APP_DEFAULT; break; + case POSIX_SPAWN_PROC_TYPE_APP_NONUI: + apptype = TASK_APPTYPE_APP_NONUI; + break; case POSIX_SPAWN_PROC_TYPE_DRIVER: apptype = TASK_APPTYPE_DRIVER; break; @@ -2886,7 +2798,7 @@ exec_handle_port_actions(struct image_params *imgp, if (MACH_PORT_VALID(act->new_port)) { kr = ipc_typed_port_copyin_send(get_task_ipcspace(current_task()), - act->new_port, IKOT_UNKNOWN, &port); + act->new_port, IOT_ANY, &port); if (kr != KERN_SUCCESS) { ret = EINVAL; @@ -3684,12 +3596,14 @@ proc_apply_jit_and_vm_policies(struct image_params *imgp, proc_t p, task_t task) } #if CONFIG_MAP_RANGES - if (task_is_hardened_binary(task) && !proc_is_simulated(p)) { + if ((task_has_hardened_heap(task) || + (task_get_platform_restrictions_version(task) == 1) || + task_get_platform_binary(task)) && !proc_is_simulated(p)) { /* * This must be done last as it needs to observe * any kind of VA space growth that was requested. * This is used by the secure allocator, so - * must be applied to all hardened binaries + * must be applied to all platform restrictions binaries */ #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT needs_extra_jumbo_va = IOTaskHasEntitlement(task, @@ -3713,14 +3627,14 @@ proc_apply_jit_and_vm_policies(struct image_params *imgp, proc_t p, task_t task) const bool task_loads_3P_plugins = imgp->ip_flags & IMGPF_3P_PLUGINS; #endif /* XNU_TARGET_OS_OSX */ - if (task_is_hardened_binary(task) + if (task_has_tpro(task) #if XNU_TARGET_OS_OSX && !task_loads_3P_plugins #endif /* XNU_TARGET_OS_OSX */ ) { /* * Pre-emptively disable TPRO remapping for - * hardened binaries (which do not load 3P plugins) + * platform restrictions binaries (which do not load 3P plugins) */ vm_map_set_tpro_enforcement(get_task_map(task)); } @@ -3764,7 +3678,6 @@ spawn_posix_cred_adopt(proc_t p, return 0; } - /* * posix_spawn * @@ -4109,11 +4022,13 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) if ((psa->psa_options & PSA_OPTION_PLUGIN_HOST_DISABLE_A_KEYS) == PSA_OPTION_PLUGIN_HOST_DISABLE_A_KEYS) { imgp->ip_flags |= IMGPF_PLUGIN_HOST_DISABLE_A_KEYS; } + #if (DEVELOPMENT || DEBUG) if ((psa->psa_options & PSA_OPTION_ALT_ROSETTA) == PSA_OPTION_ALT_ROSETTA) { imgp->ip_flags |= (IMGPF_ROSETTA | IMGPF_ALT_ROSETTA); } -#endif +#endif /* (DEVELOPMENT || DEBUG) */ + if ((error = exec_validate_spawnattr_policy(psa->psa_apptype)) != 0) { goto bad; @@ -4501,8 +4416,6 @@ do_fork1: } } #endif - - /* * Activate the image. 
* Warning: If activation failed after point of no return, it returns error @@ -4693,13 +4606,27 @@ bad: } exec_resettextvp(p, imgp); + vm_map_setup(get_task_map(new_task), new_task); + + exec_setup_platform_restrictions(new_task); + + /* + * Set starting EXC_GUARD behavior for task now that platform + * and platform restrictions bits are set. + */ + task_set_exc_guard_default(new_task, + proc_best_name(p), + strlen(proc_best_name(p)), + proc_is_simulated(p), + proc_platform(p), + proc_sdk(p)); + /* * Enable new task IPC access if exec_activate_image() returned an * active task. (Checks active bit in ipc_task_enable() under lock). * Must enable after resettextvp so that task port policies are not evaluated * until the csblob in the textvp is accurately reflected. */ - vm_map_setup(get_task_map(new_task), new_task); ipc_task_enable(new_task); /* Set task exception ports now that we can check entitlements */ @@ -4779,6 +4706,9 @@ bad: if (imgp->ip_px_sa != NULL && px_sa.psa_thread_limit > 0) { task_set_thread_limit(new_task, (uint16_t)px_sa.psa_thread_limit); } + if (imgp->ip_px_sa != NULL && px_sa.psa_conclave_mem_limit > 0) { + task_set_conclave_mem_limit(new_task, px_sa.psa_conclave_mem_limit); + } #if CONFIG_PROC_RESOURCE_LIMITS if (imgp->ip_px_sa != NULL && (px_sa.psa_port_soft_limit > 0 || px_sa.psa_port_hard_limit > 0)) { @@ -4795,6 +4725,10 @@ bad: (int)px_sa.psa_kqworkloop_hard_limit); } #endif /* CONFIG_PROC_RESOURCE_LIMITS */ + + if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_REALTIME_AUDIO)) { + task_set_jetsam_realtime_audio(new_task, TRUE); + } } @@ -4952,8 +4886,9 @@ bad: } if (error == 0 && !spawn_no_exec) { + extern uint64_t kdp_task_exec_meta_flags(task_t task); KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXEC), - proc_getpid(p)); + proc_getpid(p), kdp_task_exec_meta_flags(proc_task(p))); } } @@ -5550,17 +5485,33 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval __unused) assert(imgp->ip_new_thread != NULL); exec_resettextvp(p, imgp); + + vm_map_setup(get_task_map(new_task), new_task); + + exec_setup_platform_restrictions(new_task); + + /* + * Set starting EXC_GUARD behavior for task now that platform + * and platform restrictions bits are set. + */ + task_set_exc_guard_default(new_task, + proc_best_name(p), + strlen(proc_best_name(p)), + proc_is_simulated(p), + proc_platform(p), + proc_sdk(p)); + /* * Enable new task IPC access if exec_activate_image() returned an * active task. (Checks active bit in ipc_task_enable() under lock). * Must enable after resettextvp so that task port policies are not evaluated * until the csblob in the textvp is accurately reflected. */ - vm_map_setup(get_task_map(new_task), new_task); ipc_task_enable(new_task); error = process_signature(p, imgp); } + #if defined(HAS_APPLE_PAC) if (imgp->ip_new_thread && !error) { ml_task_set_jop_pid_from_shared_region(new_task, imgp->ip_flags & IMGPF_NOJOP); @@ -6327,7 +6278,7 @@ bad: #define LIBMALLOC_EXPERIMENT_FACTORS_KEY "MallocExperiment=" /* - * Passes information about hardened runtime entitlements to libsystem/libmalloc + * Passes information about hardened heap/"hardened runtime" entitlements to libsystem/libmalloc */ #define HARDENED_RUNTIME_KEY "HardenedRuntime=" @@ -6423,7 +6374,7 @@ _Atomic uint64_t libmalloc_experiment_factors = 0; static int exec_add_apple_strings(struct image_params *imgp, - const load_result_t *load_result) + const load_result_t *load_result, task_t task) { int error; int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 
8 : 4; @@ -6547,7 +6498,7 @@ exec_add_apple_strings(struct image_params *imgp, } uint8_t cdhash[SHA1_RESULTLEN]; - int cdhash_errror = ubc_cs_getcdhash(imgp->ip_vp, imgp->ip_arch_offset, cdhash); + int cdhash_errror = ubc_cs_getcdhash(imgp->ip_vp, imgp->ip_arch_offset, cdhash, NULL); if (cdhash_errror == 0) { char hash_string[strlen(CDHASH_KEY) + 2 * SHA1_RESULTLEN + 1]; strncpy(hash_string, CDHASH_KEY, sizeof(hash_string)); @@ -6645,9 +6596,9 @@ exec_add_apple_strings(struct image_params *imgp, */ if ((new_thread = imgp->ip_new_thread) != THREAD_NULL) { thread_reference(new_thread); - sright = convert_thread_to_port_pinned(new_thread); + sright = convert_thread_to_port_immovable(new_thread); task_t new_task = get_threadtask(new_thread); - mach_port_name_t name = ipc_port_copyout_send(sright, get_task_ipcspace(new_task)); + mach_port_name_t name = ipc_port_copyout_send_pinned(sright, get_task_ipcspace(new_task)); char port_name_hex_str[strlen(MAIN_TH_PORT_KEY) + HEX_STR_LEN32 + 1]; snprintf(port_name_hex_str, sizeof(port_name_hex_str), MAIN_TH_PORT_KEY "0x%x", name); @@ -6694,35 +6645,32 @@ exec_add_apple_strings(struct image_params *imgp, imgp->ip_applec++; } - if (imgp->ip_flags & IMGPF_HARDENED_HEAP) { - const char *hardened_heap_shims = "hardened_heap=1"; - error = exec_add_user_string(imgp, CAST_USER_ADDR_T(hardened_heap_shims), UIO_SYSSPACE, FALSE); + /* + * Push down the task security configuration. To reduce confusion when userland parses the information + * still push an empty security configuration if nothing is active. + */ + { + #define SECURITY_CONFIG_KEY "security_config=" + char security_config_str[strlen(SECURITY_CONFIG_KEY) + HEX_STR_LEN + 1]; + + snprintf(security_config_str, sizeof(security_config_str), + SECURITY_CONFIG_KEY "0x%x", task_get_security_config(task)); + + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(security_config_str), UIO_SYSSPACE, FALSE); if (error) { - printf("Failed to add hardened heap string with error %d\n", error); + printf("Failed to add the security config string with error %d\n", error); goto bad; } - imgp->ip_applec++; } - /* tell dyld that it can leverage hardware for its read-only/read-write trusted path */ - if (imgp->ip_flags & IMGPF_HW_TPRO) { - const char *dyld_hw_tpro = "dyld_hw_tpro=1"; - error = exec_add_user_string(imgp, CAST_USER_ADDR_T(dyld_hw_tpro), UIO_SYSSPACE, FALSE); - if (error) { - printf("Failed to add dyld hw tpro setting with error %d\n", error); - goto bad; - } - imgp->ip_applec++; - } - - if (load_result->hardened_runtime_binary) { + if (load_result->hardened_browser) { const size_t HR_STRING_SIZE = sizeof(HARDENED_RUNTIME_KEY) + HR_FLAGS_NUM_NIBBLES + 2 + 1; char hardened_runtime[HR_STRING_SIZE]; - snprintf(hardened_runtime, HR_STRING_SIZE, HARDENED_RUNTIME_KEY"0x%x", load_result->hardened_runtime_binary); + snprintf(hardened_runtime, HR_STRING_SIZE, HARDENED_RUNTIME_KEY"0x%x", load_result->hardened_browser); error = exec_add_user_string(imgp, CAST_USER_ADDR_T(hardened_runtime), UIO_SYSSPACE, FALSE); if (error) { printf("Failed to add hardened runtime flag with error %d\n", error); @@ -7434,6 +7382,10 @@ load_init_program(proc_t p) mach_vm_offset_t scratch_addr = 0; mach_vm_size_t map_page_size = vm_map_page_size(map); +#if DEVELOPMENT || DEBUG + /* Use the opportunity to initialize exec's debug log stream */ + exec_log_handle = os_log_create("com.apple.xnu.bsd", "exec"); +#endif /* DEVELOPMENT || DEBUG */ (void) mach_vm_allocate_kernel(map, &scratch_addr, map_page_size, VM_MAP_KERNEL_FLAGS_ANYWHERE()); @@ 
-7884,6 +7836,36 @@ proc_process_signature(proc_t p, os_reason_t *signature_failure_reason) return error; } + +#define DT_UNRESTRICTED_SUBSYSTEM_ROOT "unrestricted-subsystem-root" + +static bool +allow_unrestricted_subsystem_root(void) +{ +#if !(DEVELOPMENT || DEBUG) + static bool allow_unrestricted_subsystem_root = false; + static bool has_been_set = false; + + if (!has_been_set) { + DTEntry chosen; + const uint32_t *value; + unsigned size; + + has_been_set = true; + if (SecureDTLookupEntry(0, "/chosen", &chosen) == kSuccess && + SecureDTGetProperty(chosen, DT_UNRESTRICTED_SUBSYSTEM_ROOT, (const void**)&value, &size) == kSuccess && + value != NULL && + size == sizeof(uint32_t)) { + allow_unrestricted_subsystem_root = (bool)*value; + } + } + + return allow_unrestricted_subsystem_root; +#else + return true; +#endif +} + static int process_signature(proc_t p, struct image_params *imgp) { @@ -7945,6 +7927,20 @@ process_signature(proc_t p, struct image_params *imgp) goto done; } + /* + * Reject when there's subsystem root path set, but the image is restricted, and doesn't require + * library validation. This is to avoid subsystem root being used to inject unsigned code + */ + if (!allow_unrestricted_subsystem_root()) { + if ((imgp->ip_csflags & CS_RESTRICT || proc_issetugid(p)) && + !(imgp->ip_csflags & CS_REQUIRE_LV) && + (imgp->ip_subsystem_root_path != NULL)) { + signature_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SECURITY_POLICY); + error = EACCES; + goto done; + } + } + #if XNU_TARGET_OS_OSX /* Check for platform passed in spawn attr if iOS binary is being spawned */ if (proc_platform(p) == PLATFORM_IOS) { @@ -8357,16 +8353,11 @@ sysctl_libmalloc_experiments SYSCTL_HANDLER_ARGS return 0; } -EXPERIMENT_FACTOR_PROC(_kern, libmalloc_experiments, CTLTYPE_QUAD | CTLFLAG_RW, 0, 0, &sysctl_libmalloc_experiments, "A", ""); +EXPERIMENT_FACTOR_LEGACY_PROC(_kern, libmalloc_experiments, CTLTYPE_QUAD | CTLFLAG_RW, 0, 0, &sysctl_libmalloc_experiments, "A", ""); SYSCTL_NODE(_kern, OID_AUTO, sec_transition, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "sec_transition"); - -SYSCTL_INT(_kern_sec_transition, OID_AUTO, available, - CTLFLAG_RD | CTLFLAG_LOCKED, (int *)NULL, 0, ""); - - #if DEBUG || DEVELOPMENT static int sysctl_setup_ensure_pidversion_changes_on_exec(__unused int64_t in, int64_t *out) diff --git a/bsd/kern/kern_exec_internal.h b/bsd/kern/kern_exec_internal.h index 0a0648bf6..6afea8ada 100644 --- a/bsd/kern/kern_exec_internal.h +++ b/bsd/kern/kern_exec_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2020-2025 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,6 +31,7 @@ #include #include +#include /* * Set p->p_comm and p->p_name to the name passed to exec @@ -38,4 +39,60 @@ extern void set_proc_name(struct image_params *imgp, proc_t p); +/* + * Runtime security mitigations in production are primarily controlled by + * entitlements. Third party processes/daemons on MacOS aren't allowed to use + * the com.apple.developer entitlement without a profile, whereby a special carve out + * exists for com.apple.security. + * + * Progressively we expect internal first party software to shift towards the com.apple.security + * format, but until then we support both cases, with a strict rule that only one can + * be present. + */ +__enum_decl(exec_security_mitigation_entitlement_t, uint8_t, { +/* + * Hardened-process. 
+ * + * Security mitigations follow the notion of "hardened-process": binaries that we + * have identified as being security critical. They are identified by the + * com.apple.{developer|security}.hardened-process entitlement, which is required to further + * configure the other security mitigations. + */ + HARDENED_PROCESS = 0, +/* + * Hardened-Heap. + * + * This mitigation extends libmalloc xzone with a number of security features, + * most notably increasing the number of buckets and adding guard pages. + * The presence of the entitlement opts the binary into the feature. + */ + HARDENED_HEAP, +/* + * TPRO - Trusted-Path Read-Only + * + * The TPRO mitigation allows to create memory regions that are read-only + * but that can be rapidly, locally, modified by trusted-paths to be temporarily + * read-write. TPRO is "enabled by default" (with the caveats in the exec_setup_tpro()) + * starting with the SDK versions below. + */ + TPRO, +}); + +/* + * exec_check_security_entitlement verifies whether a given entitlement is + * associated to the to-be-run process. It verifies both legacy and current + * format and returns: + * EXEC_SECURITY_NOT_ENTITLED - if no entitlement is present + * EXEC_SECURITY_ENTITLED - if the entitlement is present + * EXEC_SECURITY_INVALID_CONFIG - if _both_ entitlements are present (fatal condition) + */ +__enum_decl(exec_security_err_t, uint8_t, { + EXEC_SECURITY_NOT_ENTITLED, + EXEC_SECURITY_ENTITLED, + EXEC_SECURITY_INVALID_CONFIG +}); + +extern exec_security_err_t exec_check_security_entitlement(struct image_params *imgp, + exec_security_mitigation_entitlement_t entitlement); + #endif /* _KERN_EXEC_INTERNAL_H_ */ diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index e7e0ab602..1e79b94b1 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -824,6 +824,12 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset * kcdata_memcpy(crash_info_ptr, uaddr, &trust, sizeof(trust)); } + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_TASK_SECURITY_CONFIG, sizeof(uint32_t), &uaddr)) { + struct crashinfo_task_security_config task_security; + task_security.task_security_config = task_get_security_config(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &task_security, sizeof(task_security)); + } + uint64_t jit_start_addr = 0; uint64_t jit_end_addr = 0; kern_return_t ret = get_jit_address_range_kdp(get_task_pmap(corpse_task), (uintptr_t*)&jit_start_addr, (uintptr_t*)&jit_end_addr); @@ -841,9 +847,24 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset * kcdata_memcpy(crash_info_ptr, uaddr, &cs_auxiliary_info, sizeof(cs_auxiliary_info)); } + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_RLIM_CORE, sizeof(rlim_t), &uaddr)) { + const rlim_t lim = proc_limitgetcur(p, RLIMIT_CORE); + kcdata_memcpy(crash_info_ptr, uaddr, &lim, sizeof(lim)); + } + +#if CONFIG_UCOREDUMP + if (do_ucoredump && !task_is_driver(proc_task(p)) && + KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_CORE_ALLOWED, sizeof(uint8_t), &uaddr)) { + const uint8_t allow = is_coredump_eligible(p) == 0; + kcdata_memcpy(crash_info_ptr, uaddr, &allow, sizeof(allow)); + } +#endif /* CONFIG_UCOREDUMP */ + if (p->p_exit_reason != OS_REASON_NULL && reason == OS_REASON_NULL) { reason = p->p_exit_reason; } + + if (reason != OS_REASON_NULL) { if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, EXIT_REASON_SNAPSHOT, sizeof(struct exit_reason_snapshot), &uaddr)) { struct 
exit_reason_snapshot ers = { @@ -863,8 +884,6 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset * kcdata_memcpy(crash_info_ptr, uaddr, reason->osr_kcd_buf, reason_buf_size); } } -#if DEVELOPMENT || DEBUG -#endif /* DEVELOPMENT || DEBUG */ } if (num_udata > 0) { @@ -3537,7 +3556,7 @@ exit_with_exception_internal( } if (!(flags & PX_DEBUG_NO_HONOR) - && address_space_debugged(p) == KERN_SUCCESS) { + && is_address_space_debugged(p)) { return 0; } @@ -3623,12 +3642,13 @@ exit_with_exclave_exception( void exit_with_mach_exception_using_ast( exception_info_t exception, - uint32_t flags) + uint32_t flags, + bool fatal) { const uint32_t __assert_only supported_flags = PX_KTRIAGE; assert((flags & ~supported_flags) == 0); bool ktriage = flags & PX_KTRIAGE; thread_ast_mach_exception(current_thread(), exception.os_reason, exception.exception_type, - exception.mx_code, exception.mx_subcode, false, ktriage); + exception.mx_code, exception.mx_subcode, fatal, ktriage); } diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index 9a6e5335a..85b9ea215 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -407,8 +407,8 @@ bad: * fork_create_child * * Description: Common operations associated with the creation of a child - * process. Return with new task and first thread's control port movable - * and not pinned. + * process. Return with new task and first thread's control + * port movable * * Parameters: parent_task parent task * parent_coalitions parent's set of coalitions @@ -506,8 +506,7 @@ fork_create_child(task_t parent_task, } /* - * Create main thread for the child process. Its control port is not immovable/pinned - * until main_thread_set_immovable_pinned(). + * Create main thread for the child process. * * The new thread is waiting on the event triggered by 'task_clear_return_wait' */ @@ -588,14 +587,7 @@ fork(proc_t parent_proc, __unused struct fork_args *uap, int32_t *retval) child_task = (task_t)get_threadtask(child_thread); assert(child_task != TASK_NULL); - /* task_control_port_options has been inherited from parent, apply it */ - task_set_immovable_pinned(child_task); - main_thread_set_immovable_pinned(child_thread); - - /* - * Since the task ports for this new task are now set to be immovable, - * we can enable them. - */ + task_copyout_control_port(child_task); vm_map_setup(get_task_map(child_task), child_task); ipc_task_enable(child_task); diff --git a/bsd/kern/kern_guarded.c b/bsd/kern/kern_guarded.c index 74679eca8..443482139 100644 --- a/bsd/kern/kern_guarded.c +++ b/bsd/kern/kern_guarded.c @@ -1268,7 +1268,8 @@ vng_guard_violation(const struct vng_info *vgi, } } - if (vng_policy_flags & (kVNG_POLICY_EXC | kVNG_POLICY_EXC_CORPSE)) { + if (vng_policy_flags & + (kVNG_POLICY_EXC | kVNG_POLICY_EXC_CORPSE | kVNG_POLICY_EXC_CORE)) { /* EXC_GUARD exception */ const struct vng_owner *vgo = TAILQ_FIRST(&vgi->vgi_owners); pid_t pid = vgo ? 
proc_pid(vgo->vgo_p) : 0; @@ -1283,7 +1284,8 @@ vng_guard_violation(const struct vng_info *vgi, lck_rw_unlock_shared(&llock); - if (vng_policy_flags & kVNG_POLICY_EXC_CORPSE) { + if (vng_policy_flags & + (kVNG_POLICY_EXC_CORPSE | kVNG_POLICY_EXC_CORE)) { char *path; int len = MAXPATHLEN; @@ -1294,7 +1296,10 @@ vng_guard_violation(const struct vng_info *vgi, if (*path && len) { r = vng_reason_from_pathname(path, len); } - task_violated_guard(code, subcode, r, TRUE); /* not fatal */ + const bool backtrace_only = + !(vng_policy_flags & kVNG_POLICY_EXC_CORE); + /* not fatal */ + task_violated_guard(code, subcode, r, backtrace_only); if (NULL != r) { os_reason_free(r); } diff --git a/bsd/kern/kern_malloc.c b/bsd/kern/kern_malloc.c index d67fad2d3..5a1911e99 100644 --- a/bsd/kern/kern_malloc.c +++ b/bsd/kern/kern_malloc.c @@ -263,7 +263,7 @@ OSMalloc_external(uint32_t size, OSMallocTag tag) OSMalloc_Tagref(tag); if ((tag->OSMT_attr & OSMT_PAGEABLE) && (size & ~PAGE_MASK)) { if ((kr = kmem_alloc(kernel_map, (vm_offset_t *)&addr, size, - KMA_PAGEABLE | KMA_DATA, vm_tag_bt())) != KERN_SUCCESS) { + KMA_PAGEABLE | KMA_DATA_SHARED, vm_tag_bt())) != KERN_SUCCESS) { addr = NULL; } } else { @@ -433,3 +433,50 @@ SYSCTL_PROC(_kern, OID_AUTO, zones_collectable_bytes, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, 0, &sysctl_zones_collectable_bytes, "Q", "Collectable memory in zones"); + +#if DEVELOPMENT || DEBUG + +static int +sysctl_zone_reset_peak SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + kern_return_t kr; + int ret; + const size_t name_len = MAX_ZONE_NAME + 1; + char zonename[name_len]; + + ret = sysctl_io_string(req, zonename, name_len, 0, NULL); + if (ret) { + return ret; + } + + kr = zone_reset_peak(zonename); + return mach_to_bsd_errno(kr); +} + +SYSCTL_PROC(_kern, OID_AUTO, zone_reset_peak, + CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, &sysctl_zone_reset_peak, "-", + "Reset the peak size of a kernel zone by name."); + +static int +sysctl_zone_reset_all_peaks SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + kern_return_t kr; + + if (!req->newptr) { + /* Only reset on a write */ + return EINVAL; + } + + kr = zone_reset_all_peaks(); + return mach_to_bsd_errno(kr); +} + +SYSCTL_PROC(_kern, OID_AUTO, zone_reset_all_peaks, + CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, &sysctl_zone_reset_all_peaks, "I", + "Reset the peak size of all kernel zones."); + +#endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index 1e58568cf..a66554168 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -102,7 +102,7 @@ uint64_t memorystatus_freeze_last_pid_thawed_ts = 0; int block_corpses = 0; /* counter to block new corpses if jetsam purges them */ /* For logging clarity */ -static const char *memorystatus_kill_cause_name[] = { +static const char *memstat_kill_cause_name[] = { "", /* kMemorystatusInvalid */ "jettisoned", /* kMemorystatusKilled */ "highwater", /* kMemorystatusKilledHiwat */ @@ -119,7 +119,7 @@ static const char *memorystatus_kill_cause_name[] = { "low-swap", /* kMemorystatusKilledLowSwap */ "sustained-memory-pressure", /* kMemorystatusKilledSustainedPressure */ "vm-pageout-starvation", /* kMemorystatusKilledVMPageoutStarvation */ - "", /* Future kMemorystatusKilledConclaveLimit */ + "conclave-limit", /* kMemorystatusKilledConclaveLimit */ "long-idle-exit", /* kMemorystatusKilledLongIdleExit */ }; @@ -305,14 +305,15 @@ 
_memstat_write_memlimit_to_ledger_locked(proc_t p, bool is_active, bool drop_loc #define MEMORYSTATUS_REAPER_RATIO_NUM 4UL #define MEMORYSTATUS_REAPER_RATIO_DENOM 1UL -#if (XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) +#if (XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) || XNU_TARGET_OS_WATCH #define MEMORYSTATUS_REAPER_ENABLED_DEFAULT TRUE #else #define MEMORYSTATUS_REAPER_ENABLED_DEFAULT FALSE -#endif /* (XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) */ +#endif /* (XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) || XNU_TARGET_OS_WATCH */ #define MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT 300 +#define MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT 300 #define MEMORYSTATUS_REAPER_MAX_PRIORITY_DEFAULT JETSAM_PRIORITY_IDLE -#define MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT 300 +#define MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT 30 #define MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT -1 #define MEMORYSTATUS_REAPER_REAP_RELAUNCH_MASK_UNKNOWN (P_MEMSTAT_RELAUNCH_HIGH << 1) @@ -362,6 +363,8 @@ unsigned int memorystatus_jetsam_snapshot_size = 0; uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0; uint64_t memorystatus_jetsam_snapshot_timeout = 0; +#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30 + #if DEVELOPMENT || DEBUG /* * On development and debug kernels, we allow one pid to take ownership @@ -429,6 +432,9 @@ TUNABLE_DT_WRITEABLE(int32_t, memorystatus_entitled_bincompat_max_task_footprint "/defaults", "kern.entitled_bc_max_task_pmem", "entitled_bincompat_max_task_pmem", 0, TUNABLE_DT_NONE); #endif // CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT +#if DEBUG || DEVELOPMENT +TUNABLE(bool, memstat_ignore_task_limit_increase, "memstat_no_task_limit_increase", false); +#endif /* DEBUG || DEVELOPMENT */ #if __arm64__ #if DEVELOPMENT || DEBUG @@ -448,7 +454,7 @@ SYSCTL_INT(_kern, OID_AUTO, entitled_bincompat_max_task_pmem, #endif /* CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT */ #else /* !(DEVELOPMENT || DEBUG) */ SYSCTL_INT(_kern, OID_AUTO, entitled_max_task_pmem, - CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_KERN, &memorystatus_entitled_max_task_footprint_mb, 0, ""); #if CONFIG_SEPARATE_BINCOMPAT_ENTITLED_MEMLIMIT SYSCTL_INT(_kern, OID_AUTO, entitled_bincompat_max_task_pmem, @@ -485,13 +491,20 @@ static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused v static void _memstat_invalidate_idle_demotion_locked(proc_t p); static void _memstat_schedule_idle_demotion_locked(proc_t p); static void _memstat_reschedule_idle_demotion_locked(void); -int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap); +int memorystatus_update_priority_for_appnap(proc_t p); vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t); boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t); void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear); void memorystatus_send_low_swap_note(void); boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors, uint64_t *memory_reclaimed); +static bool memorystatus_kill_proc(proc_t p, uint32_t cause, + os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_of_killed_proc); +/* Synchronously kill a process in priority order */ +static bool memstat_kill_top_process(uint32_t cause, os_reason_t jetsam_reason, + int32_t max_priority, memstat_kill_options_t options, + int32_t *priority, uint32_t *errors, 
uint64_t *memory_reclaimed); + uint64_t memorystatus_available_memory_internal(proc_t p); void memorystatus_thread_wake(void); static bool _memstat_consider_waking_jetsam_thread(void); @@ -509,7 +522,7 @@ uint64_t memstat_idle_demotion_deadline = 0; unsigned int memorystatus_suspended_count = 0; #endif /* CONFIG_FREEZE */ -#ifdef XNU_TARGET_OS_OSX +#if XNU_TARGET_OS_OSX /* * Effectively disable the system process and application demotion * logic on macOS. This means system processes and apps won't get the @@ -672,7 +685,7 @@ sysctl_jetsam_set_apps_idle_delay_time SYSCTL_HANDLER_ARGS SYSCTL_PROC(_kern_memorystatus, OID_AUTO, apps_idle_delay_time_ns, CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_jetsam_set_apps_idle_delay_time, "I", "Aging window for applications"); -SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_KERN, &max_task_footprint_mb, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, max_task_pmem, CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_KERN, &max_task_footprint_mb, 0, ""); #if __arm64__ int legacy_footprint_bonus_mb = 50; /* This value was chosen after looking at the top 30 apps @@ -880,7 +893,7 @@ static void memorystatus_thread(void *param __unused, wait_result_t wr __unused) /* Memory Limits */ static bool memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason); -static bool memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason); +static bool memstat_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason); static int memorystatus_cmd_set_memlimit_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval); @@ -959,13 +972,11 @@ static void memorystatus_get_task_memory_region_count(task_t task, uint64_t *cou static memorystatus_proc_state_t _memstat_build_state(proc_t p); //static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured); -static bool memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reason_t jetsam_reason, - int32_t max_priority, bool only_swappable, bool only_long_idle, - int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed); static bool memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, int32_t max_kills, uint32_t *errors, uint64_t *memory_reclaimed); static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged, uint64_t *memory_reclaimed); static bool _memstat_proc_is_reapable(proc_t p); static void _memstat_refresh_oldest_reapable_proc_info(void); +static bool _memstat_proc_is_application(proc_t p); #if CONFIG_JETSAM static void _memstat_reaper_check_oldest_reapable_proc_info_timeout(void); @@ -974,18 +985,27 @@ static void _memstat_reaper_end_sweep(void); static void _memstat_reaper_record_kill(uint64_t bytes_freed); #endif /* CONFIG_JETSAM */ static const char* _memstat_relaunch_flags_description(uint32_t flags); +static const char* _memstat_proc_type_description(proc_t p); /* Priority Band Sorting Routines */ -static int memorystatus_sort_bucket(unsigned int bucket_index, int sort_order); -static int memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order); -static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_index); -static int memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz); +static int memstat_sort_bucket(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order); +static void 
memstat_sort_coals_locked(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order); +static void memstat_sort_by_footprint_locked(unsigned int bucket_index); + +#define JETSAM_SORT_IDLE_DEFAULT JETSAM_SORT_FOOTPRINT_NOCOAL +#if XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR +#define JETSAM_SORT_FG_DEFAULT JETSAM_SORT_LRU +#else /* XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR */ +#define JETSAM_SORT_FG_DEFAULT JETSAM_SORT_FOOTPRINT +#endif /* !(XNU_TARGET_OS_IOS && !XNU_TARGET_OS_XR) */ + +TUNABLE_DT(memorystatus_jetsam_sort_order_t, memstat_jetsam_fg_sort_order, "/defaults", + "kern.memstat_fg_sort_order", "memstat_fg_sort_order", JETSAM_SORT_FG_DEFAULT, TUNABLE_DT_NONE); /* qsort routines */ typedef int (*cmpfunc_t)(const void *a, const void *b); extern void qsort(void *a, size_t n, size_t es, cmpfunc_t cmp); -static int memstat_asc_cmp(const void *a, const void *b); /* VM pressure */ @@ -1028,6 +1048,7 @@ uint32_t memstat_ballast_offset = 0; uint32_t memstat_ctd_offset = 0; int32_t memstat_reaper_min_age_secs = MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT; +int32_t memstat_reaper_min_age_apps_secs = MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT; boolean_t memstat_reaper_enabled = MEMORYSTATUS_REAPER_ENABLED_DEFAULT; uint32_t memstat_reaper_max_priority = MEMORYSTATUS_REAPER_MAX_PRIORITY_DEFAULT; int32_t memstat_reaper_rescan_secs = MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT; @@ -1080,9 +1101,11 @@ uint32_t _Atomic memorystatus_idle_exit_kill_count = 0; TUNABLE_DT(int32_t, memorystatus_reaper_minimum_age_seconds, "/defaults", "kern.memstat_reaper_minage_secs", "memorystatus_reaper_minimum_age_seconds", MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT, TUNABLE_DT_NONE); +TUNABLE_DT(int32_t, memorystatus_reaper_minimum_age_apps_seconds, "/defaults", + "kern.memstat_reaper_minapp_secs", "memorystatus_reaper_minimum_age_apps_seconds", MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT, TUNABLE_DT_NONE); TUNABLE_DT(uint32_t, memorystatus_reaper_rescan_delay_seconds, "/defaults", "kern.memstat_reaper_rescan_secs", "memorystatus_reaper_rescan_delay_seconds", MEMORYSTATUS_REAPER_RESCAN_SECS_DEFAULT, TUNABLE_DT_NONE); -TUNABLE_DT(bool, memorystatus_reaper_enabled, "/defaults", +TUNABLE_DT(boolean_t, memorystatus_reaper_enabled, "/defaults", "kern.memstat_reaper_enabled", "memorystatus_reaper_enabled", MEMORYSTATUS_REAPER_ENABLED_DEFAULT, TUNABLE_DT_NONE); @@ -1120,6 +1143,10 @@ TUNABLE_WRITEABLE(bool, fast_jetsam_enabled, "fast_jetsam_enabled", true); #else /* !CONFIG_JETSAM */ +/* + * On compressor/swap exhaustion, kill the largest process regardless of + * its chosen process policy. 
+ */ #if DEVELOPMENT || DEBUG TUNABLE(bool, kill_on_no_paging_space, "-kill_on_no_paging_space", false); #else /* !(DEVELOPMENT || DEBUG) */ @@ -1134,13 +1161,13 @@ TUNABLE(uint64_t, no_paging_space_action_throttle_delay_ns, #endif /* CONFIG_JETSAM */ -#if DEVELOPMENT || DEBUG static inline uint32_t roundToNearestMB(uint32_t in) { return (in + ((1 << 20) - 1)) >> 20; } +#if DEVELOPMENT || DEBUG static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase); #endif @@ -1356,13 +1383,14 @@ SYSCTL_INT(_kern_memorystatus, OID_AUTO, ballast_offset_pages, SYSCTL_INT(_kern_memorystatus, OID_AUTO, ctd_offset_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &memstat_ctd_offset, 0, ""); -#endif /* DEVELOPMENT || DEBUG */ +#endif /* DEBUG || DEVELOPMENT */ -EXPERIMENT_FACTOR_UINT(_kern_memorystatus, reaper_enabled, &memstat_reaper_enabled, FALSE, TRUE, ""); -EXPERIMENT_FACTOR_UINT(_kern_memorystatus, reaper_min_age_secs, &memstat_reaper_min_age_secs, 0, UINT32_MAX, ""); -EXPERIMENT_FACTOR_UINT(_kern_memorystatus, reaper_max_priority, &memstat_reaper_max_priority, 0, JETSAM_PRIORITY_MAX, ""); -EXPERIMENT_FACTOR_UINT(_kern_memorystatus, reaper_reap_relaunch_mask, &memstat_reaper_reap_relaunch_mask, 0, UINT32_MAX, ""); -EXPERIMENT_FACTOR_UINT(_kern_memorystatus, reaper_rescan_secs, &memstat_reaper_rescan_secs, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_enabled, &memstat_reaper_enabled, FALSE, TRUE, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_min_age_secs, &memstat_reaper_min_age_secs, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_min_age_apps_secs, &memstat_reaper_min_age_apps_secs, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_max_priority, &memstat_reaper_max_priority, 0, JETSAM_PRIORITY_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_reap_relaunch_mask, &memstat_reaper_reap_relaunch_mask, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern_memorystatus, reaper_rescan_secs, &memstat_reaper_rescan_secs, 0, UINT32_MAX, ""); SYSCTL_INT(_kern_memorystatus, OID_AUTO, reaper_stats_sweep_count, CTLFLAG_RD | CTLFLAG_LOCKED, @@ -1440,38 +1468,38 @@ memstat_page_shortage_threshold_experiment_handler SYSCTL_HANDLER_ARGS #define MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS (CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED) #endif /* DEVELOPMENT || DEBUG */ -EXPERIMENT_FACTOR_PROC(_kern_memorystatus, soft_threshold_mb, +EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, soft_threshold_mb, MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, &memstat_soft_threshold, 0, memstat_page_shortage_threshold_experiment_handler, "IU", "The minimum amount of available memory to maintain before killing " "processes which have violated there soft memory limit"); -EXPERIMENT_FACTOR_PROC(_kern_memorystatus, idle_threshold_mb, +EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, idle_threshold_mb, MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, &memstat_idle_threshold, 0, memstat_page_shortage_threshold_experiment_handler, "IU", "The minimum amount of available memory to maintain before exiting idle " "processes"); -EXPERIMENT_FACTOR_PROC(_kern_memorystatus, critical_threshold_mb, +EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, critical_threshold_mb, MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, &memstat_critical_threshold, 0, memstat_page_shortage_threshold_experiment_handler, "IU", "The minimum amount of available memory to maintain before killing non-idle " "processes"); 
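The hunk above also moves roundToNearestMB() out of the DEVELOPMENT/DEBUG-only block; despite its name, the helper rounds a byte count up to a whole number of MiB. A minimal standalone sketch of the same arithmetic (plain user-space C; the function is renamed round_up_to_mb here to make clear it is not the kernel symbol):

    #include <assert.h>
    #include <stdint.h>

    /* Same expression as the kernel helper: round a byte count up to whole MiB. */
    static inline uint32_t
    round_up_to_mb(uint32_t bytes)
    {
        return (bytes + ((1u << 20) - 1)) >> 20;
    }

    int
    main(void)
    {
        assert(round_up_to_mb(0) == 0);
        assert(round_up_to_mb(1) == 1);               /* 1 byte rounds up to 1 MiB */
        assert(round_up_to_mb(5u << 20) == 5);        /* exactly 5 MiB stays 5     */
        assert(round_up_to_mb((5u << 20) + 1) == 6);  /* just over 5 MiB becomes 6 */
        return 0;
    }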
-EXPERIMENT_FACTOR_PROC(_kern_memorystatus, reaper_threshold_mb, +EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, reaper_threshold_mb, MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, &memstat_reaper_threshold, 0, memstat_page_shortage_threshold_experiment_handler, "IU", "The minimum amount of available memory to maintain before killing long-idle " "processes"); -EXPERIMENT_FACTOR_PROC(_kern_memorystatus, ballast_offset_mb, +EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, ballast_offset_mb, MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, &memstat_ballast_offset, 0, memstat_page_shortage_threshold_experiment_handler, "IU", "An offset to apply to all non-critical page shortage thresholds when " "ballast is filling"); -EXPERIMENT_FACTOR_PROC(_kern_memorystatus, clear_the_decks_offset_mb, +EXPERIMENT_FACTOR_LEGACY_PROC(_kern_memorystatus, clear_the_decks_offset_mb, MEMSTAT_PAGE_SHORTAGE_EXPERIMENT_FLAGS, &memstat_ctd_offset, 0, memstat_page_shortage_threshold_experiment_handler, "IU", @@ -1712,13 +1740,13 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_disconnect_page_mappings, CTLTYPE_INT * Input: * bucket_index - jetsam priority band to be sorted. * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h - * Currently sort_order is only meaningful when handling - * coalitions. * * proc_list_lock must be held by the caller. */ static void -memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order) +memstat_sort_bucket_locked( + unsigned int bucket_index, + memorystatus_jetsam_sort_order_t sort_order) { LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); assert(bucket_index < MEMSTAT_BUCKET_COUNT); @@ -1726,18 +1754,15 @@ memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order) return; } - switch (bucket_index) { - case JETSAM_PRIORITY_FOREGROUND: - if (memorystatus_sort_by_largest_coalition_locked(bucket_index, sort_order) == 0) { - /* - * Fall back to per process sorting when zero coalitions are found. - */ - memorystatus_sort_by_largest_process_locked(bucket_index); - } + switch (sort_order) { + case JETSAM_SORT_NONE: break; - default: - memorystatus_sort_by_largest_process_locked(bucket_index); + case JETSAM_SORT_LRU: + case JETSAM_SORT_FOOTPRINT: + memstat_sort_coals_locked(bucket_index, sort_order); break; + case JETSAM_SORT_FOOTPRINT_NOCOAL: + memstat_sort_by_footprint_locked(bucket_index); } } @@ -1746,43 +1771,21 @@ memorystatus_sort_bucket_locked(unsigned int bucket_index, int sort_order) * * Input: * bucket_index - jetsam priority band to be sorted. - * sort_order - JETSAM_SORT_xxx from kern_memorystatus.h - * Currently sort_order is only meaningful when handling - * coalitions. 
+ * sort_order - sort order to use * * Return: * 0 on success * non-0 on failure */ static int -memorystatus_sort_bucket(unsigned int bucket_index, int sort_order) +memstat_sort_bucket( + unsigned int bucket_index, + memorystatus_jetsam_sort_order_t sort_order) { - int coal_sort_order; - - /* - * Verify the jetsam priority - */ - if (bucket_index >= MEMSTAT_BUCKET_COUNT) { - return EINVAL; - } - -#if DEVELOPMENT || DEBUG - if (sort_order == JETSAM_SORT_DEFAULT) { - coal_sort_order = COALITION_SORT_DEFAULT; - } else { - coal_sort_order = sort_order; /* only used for testing scenarios */ - } -#else - /* Verify default */ - if (sort_order == JETSAM_SORT_DEFAULT) { - coal_sort_order = COALITION_SORT_DEFAULT; - } else { - return EINVAL; - } -#endif + assert(bucket_index < MEMSTAT_BUCKET_COUNT); proc_list_lock(); - memorystatus_sort_bucket_locked(bucket_index, coal_sort_order); + memstat_sort_bucket_locked(bucket_index, sort_order); proc_list_unlock(); return 0; @@ -1793,7 +1796,7 @@ memorystatus_sort_bucket(unsigned int bucket_index, int sort_order) */ static void -memorystatus_sort_by_largest_process_locked(unsigned int bucket_index) +memstat_sort_by_footprint_locked(unsigned int bucket_index) { proc_t p = NULL, insert_after_proc = NULL, max_proc = NULL; proc_t next_p = NULL, prev_max_proc = NULL; @@ -2105,6 +2108,12 @@ memorystatus_init(void) memstat_reaper_min_age_secs = MEMORYSTATUS_REAPER_MIN_AGE_SECS_DEFAULT; } + if (memorystatus_reaper_minimum_age_apps_seconds != MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT) { + memstat_reaper_min_age_apps_secs = memorystatus_reaper_minimum_age_apps_seconds; + } else { + memstat_reaper_min_age_apps_secs = MEMORYSTATUS_REAPER_MIN_AGE_APPS_SECS_DEFAULT; + } + if (memorystatus_reaper_rescan_delay_seconds != MEMORYSTATUS_REAPER_SENTINAL_VALUE_MEANING_USE_DEFAULT) { memstat_reaper_rescan_secs = memorystatus_reaper_rescan_delay_seconds; } else { @@ -2201,6 +2210,14 @@ memorystatus_init(void) if (PE_parse_boot_argn("-memorystatus-skip-fg-notify", &i, sizeof(i))) { memorystatus_should_issue_fg_band_notify = false; } + + if (PE_parse_boot_argn("memorystatus_kill_on_sustained_pressure", &i, sizeof(i))) { + if (i) { + memstat_pressure_config |= MEMSTAT_WARNING_KILL_SUSTAINED; + } else { + memstat_pressure_config &= ~MEMSTAT_WARNING_KILL_SUSTAINED; + } + } #endif /* DEVELOPMENT || DEBUG */ /* Initialize the jetsam_threads state array */ @@ -2371,6 +2388,7 @@ memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason, uint64 case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break; case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break; case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break; + case kMemorystatusKilledConclaveLimit: jetsam_flags |= P_JETSAM_PID; break; } /* jetsam_do_kill drops a reference. 
*/ os_reason_ref(jetsam_reason); @@ -2865,6 +2883,17 @@ memstat_update_priority_locked(proc_t p, old_bucket = &memstat_bucket[p->p_memstat_effectivepriority]; + if (priority == JETSAM_PRIORITY_IDLE && + !(_memstat_proc_can_idle_exit(p) && !_memstat_proc_is_dirty(p)) && + !(_memstat_proc_is_managed(p) && !_memstat_proc_has_priority_assertion(p))) { + priority = JETSAM_PRIORITY_BACKGROUND; + memorystatus_log_error("memorystatus: %s [%d] is neither " + "clean (0x%x) nor assertion-less (0x%x) and cannot " + "therefore be idle - overriding to pri %d\n", + proc_best_name(p), proc_getpid(p), p->p_memstat_dirty, + p->p_memstat_state, priority); + } + if (!(options & MEMSTAT_PRIORITY_NO_AGING)) { if (_memstat_proc_is_elevated(p)) { /* @@ -3065,7 +3094,7 @@ memorystatus_set_priority(proc_t p, int priority, uint64_t user_data, } else if (priority == JETSAM_PRIORITY_IDLE_HEAD) { /* JETSAM_PRIORITY_IDLE_HEAD inserts at the head of the idle queue */ priority = JETSAM_PRIORITY_IDLE; - options |= MEMSTAT_PRIORITY_INSERT_HEAD; + options |= (MEMSTAT_PRIORITY_INSERT_HEAD | MEMSTAT_PRIORITY_NO_AGING); } else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) { /* Sanity check */ ret = EINVAL; @@ -3131,6 +3160,21 @@ out: return ret; } +#if DEVELOPMENT || DEBUG +static int32_t +memstat_increased_limit(int32_t limit, int32_t increase) +{ + int32_t offset_limit; + if (limit <= 0) { + return 0; + } + if (os_add_overflow(limit, increase, &offset_limit)) { + return INT32_MAX; + } + return offset_limit; +} +#endif /* DEVELOPMENT || DEBUG */ + static int memstat_set_memlimits_locked(proc_t p, int32_t active_limit, int32_t inactive_limit, memlimit_options_t options) @@ -3145,6 +3189,8 @@ memstat_set_memlimits_locked(proc_t p, int32_t active_limit, LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); int32_t default_active_limit = memorystatus_get_default_task_active_limit(p); + int32_t default_inactive_limit = memorystatus_get_default_task_inactive_limit(p); + /* * The special value of -1 specifies that this proc wants the default * memory limit @@ -3152,6 +3198,19 @@ memstat_set_memlimits_locked(proc_t p, int32_t active_limit, if (active_limit <= 0) { active_limit = default_active_limit; } + if (inactive_limit <= 0) { + inactive_limit = default_inactive_limit; + } + +#if DEVELOPMENT || DEBUG + if (p->p_memlimit_increase) { + /* Apply memlimit increase (for testing with overlay roots) */ + int32_t memlimit_increase = roundToNearestMB(p->p_memlimit_increase); + active_limit = memstat_increased_limit(active_limit, memlimit_increase); + inactive_limit = memstat_increased_limit(inactive_limit, memlimit_increase); + } +#endif /* DEVELOPMENT || DEBUG */ + /* * Work around a bug in JetsamProperties whereby processes may mistakenly receive * ActiveSoftMemoryLimit := -1 by forcing the default task limit to be fatal. 
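memstat_increased_limit() above guards the DEVELOPMENT/DEBUG memlimit bump against signed overflow: a non-positive limit is treated as "no finite limit" (returns 0), and an overflowing addition saturates at INT32_MAX instead of wrapping. A standalone sketch of the same saturating-add pattern, written with the compiler builtin rather than the kernel's os_add_overflow() wrapper (assumption: plain user-space C, names are illustrative):

    #include <assert.h>
    #include <stdint.h>

    /* Mirror of the saturating limit bump: <= 0 means "no finite limit", overflow clamps. */
    static int32_t
    increased_limit(int32_t limit_mb, int32_t increase_mb)
    {
        int32_t bumped;

        if (limit_mb <= 0) {
            return 0;
        }
        if (__builtin_add_overflow(limit_mb, increase_mb, &bumped)) {
            return INT32_MAX;   /* saturate rather than wrap */
        }
        return bumped;
    }

    int
    main(void)
    {
        assert(increased_limit(-1, 100) == 0);                    /* unlimited stays unlimited */
        assert(increased_limit(512, 100) == 612);
        assert(increased_limit(INT32_MAX - 1, 100) == INT32_MAX); /* clamps instead of wrapping */
        return 0;
    }

Saturating is the conservative choice here: a test-only limit increase should never be able to turn a large limit into a small, wrapped-around one.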
@@ -3160,21 +3219,9 @@ memstat_set_memlimits_locked(proc_t p, int32_t active_limit, options |= MEMLIMIT_ACTIVE_FATAL; } - int32_t default_inactive_limit = memorystatus_get_default_task_inactive_limit(p); - if (inactive_limit <= 0) { - inactive_limit = default_inactive_limit; - } if (default_inactive_limit && inactive_limit == default_inactive_limit) { options |= MEMLIMIT_INACTIVE_FATAL; } -#if DEVELOPMENT || DEBUG - if (p->p_memlimit_increase) { - /* Apply memlimit increase (for testing with overlay roots) */ - int32_t memlimit_increase = roundToNearestMB(p->p_memlimit_increase); - active_limit = active_limit + memlimit_increase; - inactive_limit = inactive_limit + memlimit_increase; - } -#endif /* DEVELOPMENT || DEBUG */ memorystatus_log_debug( "memorystatus: setting memlimit for %s [%d], " @@ -3338,7 +3385,7 @@ static int memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) { /* See that the process isn't marked for termination */ - if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) { + if (_memstat_proc_is_terminating(target_p)) { return EBUSY; } @@ -3394,6 +3441,7 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) boolean_t defer_now = FALSE; int ret = 0; int priority; + bool kill = false; memstat_priority_options_t priority_options = MEMSTAT_PRIORITY_OPTIONS_NONE; @@ -3478,6 +3526,14 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) defer_now = TRUE; } + if (pcontrol & PROC_DIRTY_SHUTDOWN_ON_CLEAN) { + p->p_memstat_dirty |= P_DIRTY_SHUTDOWN_ON_CLEAN; + + if (_memstat_proc_is_tracked(p) && !_memstat_proc_is_dirty(p)) { + kill = true; + } + } + memorystatus_log_info( "%s [%d] enrolled in ActivityTracking tracked %d / idle-exit %d / defer %d / dirty %d", proc_best_name(p), proc_getpid(p), @@ -3505,7 +3561,13 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) memstat_update_priority_locked(p, priority, priority_options); exit: - proc_list_unlock(); + if (kill && proc_ref(p, true) == p) { + proc_list_unlock(); + psignal(p, SIGKILL); + proc_rele(p); + } else { + proc_list_unlock(); + } return ret; } @@ -3544,7 +3606,7 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) /* Dirty tracking not enabled */ ret = EINVAL; goto exit; - } else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) { + } else if (pcontrol && _memstat_proc_is_terminating(p)) { /* * Process is set to be terminated and we're attempting to mark it dirty. * Set for termination and marking as clean is OK - see . 
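In the hunks that follow, the idle-exit victim check in memstat_kill_idle_process() (and later in memstat_kill_top_process()) is collapsed into a single mask-and-compare on p_memstat_dirty: ANDing with all three flags and requiring the result to equal P_DIRTY_ALLOW_IDLE_EXIT means "allow-idle-exit is set, and neither dirty nor terminated is". A small standalone illustration of the idiom; the bit values below are placeholders, not the real P_DIRTY_* definitions:

    #include <assert.h>
    #include <stdint.h>

    /* Placeholder bits standing in for P_DIRTY_ALLOW_IDLE_EXIT / _IS_DIRTY / _TERMINATED. */
    #define ALLOW_IDLE_EXIT 0x1u
    #define IS_DIRTY        0x2u
    #define TERMINATED      0x4u

    /* True iff ALLOW_IDLE_EXIT is set while IS_DIRTY and TERMINATED are both clear. */
    static int
    idle_exit_eligible(uint32_t dirty)
    {
        return (dirty & (ALLOW_IDLE_EXIT | IS_DIRTY | TERMINATED)) == ALLOW_IDLE_EXIT;
    }

    int
    main(void)
    {
        assert(idle_exit_eligible(ALLOW_IDLE_EXIT));
        assert(!idle_exit_eligible(ALLOW_IDLE_EXIT | IS_DIRTY));
        assert(!idle_exit_eligible(ALLOW_IDLE_EXIT | TERMINATED));
        assert(!idle_exit_eligible(0));
        assert(idle_exit_eligible(ALLOW_IDLE_EXIT | 0x80u));  /* unrelated bits are ignored */
        return 0;
    }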
@@ -3562,7 +3624,7 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) /* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */ p->p_memstat_dirty |= P_DIRTY_TERMINATED; kill = true; - } else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) { + } else if ((flag == P_DIRTY) && _memstat_proc_is_terminating(p)) { /* Kill previously terminated processes if set clean */ kill = true; } @@ -3593,6 +3655,9 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) } task_ledger_settle_dirty_time(t); task_set_dirty_start(t, 0); + if (_memstat_proc_shutdown_on_clean(p)) { + kill = true; + } } else if (!was_dirty && now_dirty) { priority = p->p_memstat_requestedpriority; task_set_dirty_start(t, mach_absolute_time()); @@ -3604,17 +3669,14 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) memstat_update_priority_locked(p, priority, MEMSTAT_PRIORITY_OPTIONS_NONE); - if (kill) { - if (proc_ref(p, true) == p) { - proc_list_unlock(); - psignal(p, SIGKILL); - proc_list_lock(); - proc_rele(p); - } - } - exit: - proc_list_unlock(); + if (kill && proc_ref(p, true) == p) { + proc_list_unlock(); + psignal(p, SIGKILL); + proc_rele(p); + } else { + proc_list_unlock(); + } return ret; } @@ -3757,7 +3819,7 @@ memorystatus_on_suspend(proc_t p) _memstat_proc_set_suspended(p); /* Check if proc is marked for termination */ - bool kill_process = !!(p->p_memstat_dirty & P_DIRTY_TERMINATED); + bool kill_process = _memstat_proc_is_terminating(p); proc_list_unlock(); if (kill_process) { @@ -3904,7 +3966,7 @@ memstat_kill_idle_process(memorystatus_kill_cause_t cause, /* Pick next idle exit victim. */ current_time = mach_absolute_time(); - jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_IDLE_EXIT); + jetsam_reason = os_reason_create(OS_REASON_JETSAM, (jetsam_reason_t)cause); if (jetsam_reason == OS_REASON_NULL) { memorystatus_log_error("memorystatus: failed to allocate jetsam reason\n"); } @@ -3918,8 +3980,7 @@ memstat_kill_idle_process(memorystatus_kill_cause_t cause, break; } - if ((_memstat_proc_can_idle_exit(p) && !_memstat_proc_is_dirty(p)) || - (_memstat_proc_is_managed(p) && !_memstat_proc_has_priority_assertion(p))) { + if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) { if (current_time >= p->p_memstat_idledeadline) { p->p_memstat_dirty |= P_DIRTY_TERMINATED; p = proc_ref(p, true); @@ -3934,9 +3995,9 @@ memstat_kill_idle_process(memorystatus_kill_cause_t cause, if (p) { memorystatus_log( - "memorystatus: killing_idle_process pid %d [%s] jetsam_reason->osr_code: %llu\n", - proc_getpid(p), proc_best_name(p), jetsam_reason->osr_code); - killed = memorystatus_do_kill(p, cause, jetsam_reason, footprint_out); + "memorystatus: killing (idle) %s [%d] due to %s (%u)\n", + proc_best_name(p), proc_getpid(p), memstat_kill_cause_name[cause], cause); + memorystatus_kill_proc(p, cause, jetsam_reason, &killed, footprint_out); proc_rele(p); } else { os_reason_free(jetsam_reason); @@ -3982,11 +4043,6 @@ memorystatus_respond_to_compressor_exhaustion(void) if (kill_on_no_paging_space) { memorystatus_thread_wake(); } else { - if (memstat_get_idle_proccnt() > 0) { - /* There are idle processes to kill */ - memorystatus_thread_wake(); - } - /* * Throttle how often the jetsam thread is woken due to * compressor/swap exhaustion @@ -4000,14 +4056,7 @@ memorystatus_respond_to_compressor_exhaustion(void) } absolutetime_to_nanoseconds(now - last_action_ts, 
&delta_since_last_no_space_ns); if (delta_since_last_no_space_ns > no_paging_space_action_throttle_delay_ns) { - /* TODO: this should happen on the memorystatus thread (rdar://138409129) */ - if (os_atomic_cmpxchg(&last_no_space_action_ts, last_action_ts, now, relaxed)) { - bool should_notify = no_paging_space_action(); - if (should_notify) { - /* Put up the "Out of Application Memory" dialogue */ - memorystatus_send_low_swap_note(); - } - } + memorystatus_thread_wake(); } } #endif /* CONFIG_JETSAM */ @@ -4174,6 +4223,16 @@ _memstat_relaunch_flags_description(uint32_t flags) } } +const char* +_memstat_proc_type_description(proc_t p) +{ + if (_memstat_proc_is_application(p)) { + return "app"; + } else { + return "daemon"; + } +} + bool memstat_evaluate_page_shortage( bool *should_enforce_memlimits, @@ -4420,7 +4479,11 @@ memstat_purge_caches(jetsam_state_t state) #if CONFIG_DEFERRED_RECLAIM /* TODO: estimate memory recovered from deferred reclaim */ memorystatus_log("memorystatus: reclaiming all deferred user memory\n"); - vm_deferred_reclamation_gc(RECLAIM_GC_DRAIN, RECLAIM_NO_FAULT | RECLAIM_NO_WAIT); + mach_vm_size_t vmdr_bytes_reclaimed; + vm_deferred_reclamation_gc(RECLAIM_GC_DRAIN, &vmdr_bytes_reclaimed, + RECLAIM_NO_FAULT | RECLAIM_NO_WAIT); + memorystatus_log("memorystatus: purged %llu KiB of deferred user memory\n", + vmdr_bytes_reclaimed); #endif /* CONFIG_DEFERRED_RECLAIM */ /* TODO: estimate wired memory recovered from zone_gc */ @@ -4594,12 +4657,12 @@ create_jetsam_reason(memorystatus_kill_cause_t cause) /* * Do one kill as we're marching up the priority bands. - * This is a wrapper around memorystatus_kill_top_process that also + * This is a wrapper around memstat_kill_top_process that also * sets post_snapshot, tracks jld_idle_kills, and notifies if we're appraoching the fg band. */ static bool -memorystatus_do_priority_kill(jetsam_state_t state, - uint32_t kill_cause, int32_t max_priority, bool only_swappable, bool only_long_idle) +memstat_do_priority_kill(jetsam_state_t state, + uint32_t kill_cause, int32_t max_priority, memstat_kill_options_t options) { os_reason_t jetsam_reason = OS_REASON_NULL; bool killed = false; @@ -4607,15 +4670,18 @@ memorystatus_do_priority_kill(jetsam_state_t state, jetsam_reason = create_jetsam_reason(kill_cause); /* - * memorystatus_kill_top_process() drops a reference, + * memstat_kill_top_process() drops a reference, * so take another one so we can continue to use this exit reason * even after it returns */ os_reason_ref(jetsam_reason); + if (state->sort_flag) { + options |= MEMSTAT_SORT_BUCKET; + } /* LRU */ - killed = memorystatus_kill_top_process(true, state->sort_flag, kill_cause, jetsam_reason, max_priority, - only_swappable, only_long_idle, &priority, &state->errors, &state->memory_reclaimed); + killed = memstat_kill_top_process(kill_cause, jetsam_reason, max_priority, + options, &priority, &state->errors, &state->memory_reclaimed); state->sort_flag = false; if (killed) { @@ -4647,7 +4713,31 @@ memorystatus_do_priority_kill(jetsam_state_t state, } static bool -memorystatus_do_action(jetsam_state_t state, memorystatus_action_t action, uint32_t kill_cause) +memstat_perform_no_paging_space_action(memorystatus_kill_cause_t cause) +{ +#if !CONFIG_JETSAM + uint64_t now = mach_absolute_time(); + os_atomic_store(&last_no_space_action_ts, now, relaxed); + + bool should_notify = no_paging_space_action(cause); + if (should_notify) { + /* + * Put up the "Out of Application Memory" dialogue. 
The user will be + * prompted to select applications to Force Quit. + */ + memorystatus_log("memorystatus: sending out-of-application memory knote\n"); + memorystatus_send_low_swap_note(); + return false; + } + return true; +#else /* CONFIG_JETSAM */ + (void)cause; + panic("No-Paging-Space Action unsupported on this platform"); +#endif /* !CONFIG_JETSAM */ +} + +static bool +memorystatus_do_action(jetsam_state_t state, memorystatus_action_t action, memorystatus_kill_cause_t kill_cause) { bool killed = false; os_reason_t jetsam_reason = OS_REASON_NULL; @@ -4663,7 +4753,7 @@ memorystatus_do_action(jetsam_state_t state, memorystatus_action_t action, uint3 os_reason_free(jetsam_reason); break; case MEMORYSTATUS_KILL_TOP_PROCESS: - killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_MAX, false, false); + killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_MAX, 0); break; case MEMORYSTATUS_WAKE_SWAPPER: memorystatus_log_info( @@ -4679,16 +4769,23 @@ memorystatus_do_action(jetsam_state_t state, memorystatus_action_t action, uint3 vm_compressor_process_special_swapped_in_segments(); break; case MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE: - killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_BACKGROUND - 1, true, false); + killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_BACKGROUND - 1, MEMSTAT_ONLY_SWAPPABBLE); break; case MEMORYSTATUS_KILL_SWAPPABLE: - killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_MAX, true, false); + killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_MAX, MEMSTAT_ONLY_SWAPPABBLE); break; case MEMORYSTATUS_KILL_IDLE: - killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, false, false); + killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, 0); break; case MEMORYSTATUS_KILL_LONG_IDLE: - killed = memorystatus_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, false, true); + killed = memstat_do_priority_kill(state, kill_cause, JETSAM_PRIORITY_IDLE, MEMSTAT_ONLY_LONG_IDLE); + break; + case MEMORYSTATUS_NO_PAGING_SPACE: + killed = memstat_perform_no_paging_space_action(kill_cause); + break; + case MEMORYSTATUS_PURGE_CACHES: + memstat_purge_caches(state); + killed = true; break; case MEMORYSTATUS_KILL_NONE: panic("memorystatus_do_action: Impossible! 
memorystatus_do_action called with action = NONE\n"); @@ -4771,6 +4868,7 @@ memorystatus_thread_internal(jetsam_state_t state) assert(state != NULL); state->jld_idle_kills = 0; state->errors = 0; + state->errors_cleared = false; state->hwm_kills = 0; state->sort_flag = true; state->corpse_list_purged = false; @@ -4824,7 +4922,7 @@ memorystatus_thread_internal(jetsam_state_t state) } if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) { - memorystatus_log("memorystatus: killing due to \"%s\" - compression_ratio=%u\n", memorystatus_kill_cause_name[cause], vm_compression_ratio()); + memorystatus_log("memorystatus: killing due to \"%s\" - compression_ratio=%u\n", memstat_kill_cause_name[cause], vm_compression_ratio()); } killed = memorystatus_do_action(state, action, cause); @@ -4838,8 +4936,55 @@ memorystatus_thread_internal(jetsam_state_t state) suspended_swappable_apps_remaining = false; } else if (action == MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE) { suspended_swappable_apps_remaining = false; + } else if (action == MEMORYSTATUS_KILL_TOP_PROCESS || + action == MEMORYSTATUS_KILL_AGGRESSIVE) { + /* + * We tried to kill a process, but failed to find anyone to kill. It's + * possible we chose not to because we reclaimed some purgeable memory or + * hit this thread's priority limit. + */ + if (state->memory_reclaimed == 0 && !state->limit_to_low_bands) { + /* + * We should have found someone to kill. Either we failed because of a transient + * error or we've run out of candidates and the issue is caused by the kernel. + */ + memorystatus_log("memorystatus: failed to find a process to kill!\n"); + if (state->errors && !state->errors_cleared) { + /* + * It's possible that all of the kill candidates had the error bit set + * (e.g. because we caught them in exec()). Clear all the error bits and + * try to kill them one more time in the hopes that they are now killable. + */ + memorystatus_log("memorystatus: clearing kill errors and retrying\n"); + memorystatus_clear_errors(); + state->errors_cleared = true; + } else { + /* The memory may be held by a corpse or zalloc. */ + memstat_purge_caches(state); + struct memorystatus_system_health_s health_status; + bool is_system_healthy = memstat_check_system_health(&health_status); + if (!is_system_healthy) { + memorystatus_log("memorystatus: system still unhealthy after cache purge!\n"); + /* + * We trimmed the zones above but it's possible there is a bug with + * working set estimation and we needed a full drain. + */ + memorystatus_log_fault("memorystatus: fully draining kernel zone allocator\n"); + zone_gc_drain(); + is_system_healthy = memstat_check_system_health(&health_status); + if (!is_system_healthy) { + /* + * We've killed everything and purged all xnu caches. There is nothing + * left to do but panic. 
+ */ + panic("memorystatus: all victims exhausted"); + } + } + } + } } } else { + /* We successfully killed a process */ if (cause == kMemorystatusKilledVMCompressorThrashing || cause == kMemorystatusKilledVMCompressorSpaceShortage) { memorystatus_log("memorystatus: post-jetsam compressor fragmentation_level=%u\n", vm_compressor_fragmentation_level()); } @@ -4849,32 +4994,6 @@ memorystatus_thread_internal(jetsam_state_t state) suspended_swappable_apps_remaining = true; } - if (!killed && total_memory_reclaimed == 0) { - memorystatus_log("memorystatus: failed to kill a process and no memory was reclaimed\n"); - if ((action == MEMORYSTATUS_KILL_TOP_PROCESS || action == MEMORYSTATUS_KILL_AGGRESSIVE) && - memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) { - /* - * Still under pressure and unable to kill a process - purge corpse memory - * and get everything back from the pmap. - */ - memorystatus_log("memorystatus: ran out of %sprocesses to kill but " - "system is still in critical condition\n", - state->limit_to_low_bands ? "low-band " : ""); - memstat_purge_caches(state); - - if (!state->limit_to_low_bands && - memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) { - /* - * Still under pressure and unable to kill a process - */ - memorystatus_log_fault("memorystatus: attempting full drain of kernel zone allocator\n"); - zone_gc_drain(); - if (memorystatus_get_available_page_count() < memorystatus_get_critical_page_shortage_threshold()) { - panic("memorystatus_jetsam_thread: no victim! available pages:%llu", (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES); - } - } - } - } /* * If we did a kill on behalf of another subsystem (compressor or zalloc) @@ -4967,7 +5086,7 @@ memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_ jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; } - if (memorystatus_kill_process_sync(proc_getpid(p), kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) { + if (memstat_kill_process_sync(proc_getpid(p), kMemorystatusKilledPerProcessLimit, jetsam_reason) != TRUE) { memorystatus_log_error("task_exceeded_footprint: failed to kill the current task (exiting?).\n"); } } else { @@ -5022,6 +5141,31 @@ memorystatus_log_diag_threshold_exception(const int diag_threshold_value) ((p && *p->p_name) ? p->p_name : "unknown"), (p ? proc_getpid(p) : -1), diag_threshold_value); } +void +memorystatus_on_conclave_limit_exceeded(const int max_footprint_mb) +{ + os_reason_t jetsam_reason = OS_REASON_NULL; + proc_t p = current_proc(); + + /* + * The limit violation is logged here; it's always fatal. + */ + memorystatus_log("memorystatus: %s [%d] exceeded conclave limit: %d MB \n", + ((p && *p->p_name) ? p->p_name : "unknown"), (p ? 
proc_getpid(p) : -1), max_footprint_mb); + + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_CONCLAVELIMIT); + if (jetsam_reason == NULL) { + memorystatus_log_error("task_exceeded_conclave: failed to allocate jetsam reason\n"); + } else if (corpse_for_fatal_memkill && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) { + /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */ + jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; + } + + if (memstat_kill_process_sync(proc_getpid(p), kMemorystatusKilledConclaveLimit, jetsam_reason) != TRUE) { + memorystatus_log_error("task_exceeded_conclave: failed to kill the current task (exiting?).\n"); + } +} + /* * Description: * Evaluates process state to determine which limit @@ -5091,7 +5235,7 @@ memstat_proc_is_active_locked(proc_t p) } static bool -memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) +memstat_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jetsam_reason) { bool killed; @@ -5111,13 +5255,15 @@ memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause, os_reason_t jet if (kill_on_no_paging_space || cause == kMemorystatusKilledZoneMapExhaustion) { max_priority = JETSAM_PRIORITY_MAX; + } else if (cause == kMemorystatusKilledSustainedPressure) { + max_priority = memstat_sustained_pressure_max_pri; } else { max_priority = JETSAM_PRIORITY_IDLE; } #endif /* CONFIG_JETSAM */ /* No pid, so kill first process */ - killed = memorystatus_kill_top_process(true, true, cause, jetsam_reason, - max_priority, false, false, NULL, &errors, &memory_reclaimed); + killed = memstat_kill_top_process(cause, jetsam_reason, + max_priority, MEMSTAT_SORT_BUCKET, NULL, &errors, &memory_reclaimed); } else { killed = memorystatus_kill_specific_process(victim_pid, cause, jetsam_reason); } @@ -5175,7 +5321,7 @@ memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t proc_list_lock(); - if (p->p_memstat_state & P_MEMSTAT_TERMINATED) { + if (_memstat_proc_was_killed(p)) { /* * Someone beat us to this kill. * Nothing to do here. @@ -5204,9 +5350,10 @@ memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause, os_reason_t killed = memorystatus_do_kill(p, cause, jetsam_reason, &footprint_of_killed_proc); - memorystatus_log("%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d %llus rf:%s) %lluKB - memorystatus_available_pages: %llu\n", + memorystatus_log("%lu.%03d memorystatus: killing_specific_process pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu\n", (unsigned long)tv_sec, tv_msec, victim_pid, ((p && *p->p_name) ? p->p_name : "unknown"), - memorystatus_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1), time_in_priority_band_secs, (p ? _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags) : 0), + memstat_kill_cause_name[cause], (p ? p->p_memstat_effectivepriority: -1), time_in_priority_band_secs, + (p ? 
_memstat_relaunch_flags_description(p->p_memstat_relaunch_flags) : 0), _memstat_proc_type_description(p), footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES); if (!killed) { @@ -5664,6 +5811,7 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, entry->jse_killtime = killtime; entry->jse_gencount = snapshot->js_gencount; entry->jse_idle_delta = p->p_memstat_idle_delta; + entry->jse_prio_start = p->p_memstat_prio_start; #if CONFIG_FREEZE entry->jse_thaw_count = p->p_memstat_thaw_count; entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason; @@ -5923,6 +6071,7 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna entry->jse_gencount = gencount; /* indicates a pass through jetsam thread, when process was targeted to be killed */ entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */ + entry->jse_prio_start = p->p_memstat_prio_start; /* Time moved to current band */ #if CONFIG_FREEZE entry->jse_freeze_skip_reason = p->p_memstat_freeze_skip_reason; @@ -6069,83 +6218,39 @@ static const int memorystatus_artificial_snapshot_entry_count = 2; #if DEVELOPMENT || DEBUG /* - * Verify that the given bucket has been sorted correctly. - * - * Walks through the bucket and verifies that all pids in the - * expected_order buffer are in that bucket and in the same - * relative order. - * - * The proc_list_lock must be held by the caller. + * Fills an array with the given pids in the order they are seen in a + * jetsam band. */ static int -memorystatus_verify_sort_order(unsigned int bucket_index, pid_t *expected_order, size_t num_pids) +memorystatus_get_sort_order( + unsigned int bucket_index, + pid_t *pids, + pid_t *order, + size_t num_pids) { LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_OWNED); - int error = 0; proc_t p = NULL; - size_t i = 0; + size_t i, out_idx = 0; /* - * NB: We allow other procs to be mixed in within the expected ones. - * We just need the expected procs to be in the right order relative to each other. + * Read out the order of all the pids into the order array. */ p = memorystatus_get_first_proc_locked(&bucket_index, FALSE); while (p) { - if (proc_getpid(p) == expected_order[i]) { - i++; - } - if (i == num_pids) { - break; + for (i = 0; i < num_pids; i++) { + if (pids[i] == proc_getpid(p)) { + if (out_idx >= num_pids) { + /* Did we somehow see something twice? 
*/ + return EINVAL; + } + order[out_idx] = pids[i]; + out_idx++; + } } p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE); } - if (i != num_pids) { - char buffer[128]; - size_t len = sizeof(buffer); - size_t buffer_idx = 0; - memorystatus_log_error("memorystatus_verify_sort_order: Processes in bucket %d were not sorted properly\n", bucket_index); - for (i = 0; i < num_pids; i++) { - int num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", expected_order[i]); - if (num_written <= 0) { - break; - } - if (buffer_idx + (unsigned int) num_written >= len) { - break; - } - buffer_idx += num_written; - } - memorystatus_log_error("memorystatus_verify_sort_order: Expected order [%s]\n", buffer); - memset(buffer, 0, len); - buffer_idx = 0; - p = memorystatus_get_first_proc_locked(&bucket_index, FALSE); - i = 0; - memorystatus_log_error("memorystatus_verify_sort_order: Actual order:\n"); - while (p) { - int num_written; - if (buffer_idx == 0) { - num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%zu: %d,", i, proc_getpid(p)); - } else { - num_written = snprintf(buffer + buffer_idx, len - buffer_idx, "%d,", proc_getpid(p)); - } - if (num_written <= 0) { - break; - } - buffer_idx += (unsigned int) num_written; - assert(buffer_idx <= len); - if (i % 10 == 0) { - memorystatus_log_error("memorystatus_verify_sort_order: %s\n", buffer); - buffer_idx = 0; - } - p = memorystatus_get_next_proc_locked(&bucket_index, p, FALSE); - i++; - } - if (buffer_idx != 0) { - memorystatus_log_error("memorystatus_verify_sort_order: %s\n", buffer); - } - error = EINVAL; - } - return error; + return 0; } /* @@ -6159,48 +6264,46 @@ memorystatus_cmd_test_jetsam_sort(int priority, user_addr_t expected_order_user, size_t expected_order_user_len) { + pid_t *expected_order, *actual_order; int error = 0; - unsigned int bucket_index = 0; - const size_t kMaxPids = 8; - pid_t expected_order[kMaxPids]; - size_t copy_size = sizeof(expected_order); - size_t num_pids; + size_t num_pids = expected_order_user_len / sizeof(pid_t); - if (expected_order_user_len < copy_size) { - copy_size = expected_order_user_len; - } - num_pids = copy_size / sizeof(pid_t); - - error = copyin(expected_order_user, expected_order, copy_size); - if (error != 0) { - return error; - } - - if (priority == -1) { - /* Use as shorthand for default priority */ - bucket_index = JETSAM_PRIORITY_DEFAULT; - } else { - bucket_index = (unsigned int)priority; - } - - if (bucket_index >= MEMSTAT_BUCKET_COUNT) { + if (num_pids > 512) { /* Just so we don't allocate some huge buffer */ return EINVAL; } + if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) { + return EINVAL; + } + + expected_order = kalloc_data_tag(num_pids * sizeof(pid_t), Z_WAITOK, VM_KERN_MEMORY_DIAG); + actual_order = kalloc_data_tag(num_pids * sizeof(pid_t), Z_WAITOK, VM_KERN_MEMORY_DIAG); + + error = copyin(expected_order_user, expected_order, expected_order_user_len); + if (error != 0) { + goto err; + } + /* * Acquire lock before sorting so we can check the sort order * while still holding the lock. 
*/ proc_list_lock(); - memorystatus_sort_bucket_locked(bucket_index, sort_order); + memstat_sort_bucket_locked(priority, sort_order); if (expected_order_user != CAST_USER_ADDR_T(NULL) && expected_order_user_len > 0) { - error = memorystatus_verify_sort_order(bucket_index, expected_order, num_pids); + bzero(actual_order, num_pids * sizeof(pid_t)); + error = memorystatus_get_sort_order(priority, expected_order, actual_order, num_pids); + /* Even if we get an error, we still want to copyout what we had */ + copyout(actual_order, expected_order_user, num_pids * sizeof(pid_t)); } proc_list_unlock(); +err: + kfree_data(expected_order, num_pids * sizeof(pid_t)); + kfree_data(actual_order, num_pids * sizeof(pid_t)); return error; } @@ -6252,7 +6355,7 @@ networking_memstatus_callout(proc_t p, uint32_t status) #endif /* SOCKETS */ static bool -memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_of_killed_proc) +memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool *killed, uint64_t *footprint_out) { pid_t aPid = 0; uint32_t aPid_ep = 0; @@ -6334,10 +6437,10 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool memorystatus_purge_before_jetsam_success++; memorystatus_log_info("memorystatus: reclaimed %llu pages (%llu purged, %llu unsecluded) from pid %d [%s] and avoided %s\n", - num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memorystatus_kill_cause_name[cause]); + num_pages_reclaimed, num_pages_purged, num_pages_unsecluded, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), memstat_kill_cause_name[cause]); *killed = false; - *footprint_of_killed_proc = num_pages_reclaimed + num_pages_purged + num_pages_unsecluded; + *footprint_out = num_pages_reclaimed + num_pages_purged + num_pages_unsecluded; return true; } @@ -6371,17 +6474,18 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool */ os_reason_ref(jetsam_reason); - retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_of_killed_proc); + retval = memorystatus_do_kill(p, cause, jetsam_reason, footprint_out); *killed = retval; absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs); time_in_priority_band_secs /= NSEC_PER_SEC; - memorystatus_log("memorystatus: %s pid %d [%s] (%s %d %llus rf:%s) %lluKB - memorystatus_available_pages: %llu compressor_size:%u\n", + memorystatus_log("memorystatus: %s pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu compressor_size:%u\n", kill_reason_string, aPid, proc_best_name(p), - memorystatus_kill_cause_name[cause], aPid_ep, time_in_priority_band_secs, _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), - (*footprint_of_killed_proc) >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); + memstat_kill_cause_name[cause], aPid_ep, time_in_priority_band_secs, + _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), _memstat_proc_type_description(p), + (*footprint_out) >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); return retval; } @@ -6390,9 +6494,9 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool * Jetsam the first process in the queue. 
*/ static bool -memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reason_t jetsam_reason, - int32_t max_priority, bool only_swappable, bool only_long_idle, - int32_t *priority, uint32_t *errors, uint64_t *memory_reclaimed) +memstat_kill_top_process(uint32_t cause, os_reason_t jetsam_reason, + int32_t max_priority, memstat_kill_options_t options, + int32_t *priority_out, uint32_t *errors_out, uint64_t *memory_reclaimed_out) { pid_t aPid; proc_t p = PROC_NULL, next_p = PROC_NULL; @@ -6401,32 +6505,27 @@ memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reaso uint32_t aPid_ep; uint64_t footprint_of_killed_proc = 0; -#ifndef CONFIG_FREEZE -#pragma unused(any) -#endif - KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, MEMORYSTATUS_LOG_AVAILABLE_PAGES); + bool only_long_idle = options & MEMSTAT_ONLY_LONG_IDLE; + bool only_swappable = options & MEMSTAT_ONLY_SWAPPABBLE; + bool sort_bucket = options & MEMSTAT_SORT_BUCKET; #if CONFIG_JETSAM - if (sort_flag) { - (void)memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT); + if (sort_bucket) { + (void)memstat_sort_bucket(JETSAM_PRIORITY_FOREGROUND, memstat_jetsam_fg_sort_order); } - *memory_reclaimed = 0; - -#if VM_PRESSURE_EVENTS - if (cause == kMemorystatusKilledSustainedPressure) { - max_priority = memorystatus_sustained_pressure_maximum_band; + if (memory_reclaimed_out) { + *memory_reclaimed_out = 0; } -#endif /* VM_PRESSURE_EVENTS */ force_new_snapshot = false; #else /* CONFIG_JETSAM */ - if (sort_flag) { - (void)memorystatus_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_DEFAULT); + if (sort_bucket) { + (void)memstat_sort_bucket(JETSAM_PRIORITY_IDLE, JETSAM_SORT_IDLE_DEFAULT); } /* @@ -6482,18 +6581,17 @@ memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reaso if (only_long_idle) { if (!_memstat_proc_is_reapable(p)) { - memorystatus_log_debug("memorystatus: memorystatus_kill_top_process: skipping non-reapable process %s [%d]\n", + memorystatus_log_debug("memorystatus: memstat_kill_top_process: skipping non-reapable process %s [%d]\n", proc_best_name(p), p->p_pid); continue; } - memorystatus_log_debug("memorystatus: memorystatus_kill_top_process: found reapable long-idle process %s [%d]\n", + memorystatus_log_debug("memorystatus: memstat_kill_top_process: found reapable long-idle process %s [%d]\n", proc_best_name(p), p->p_pid); } #if !CONFIG_JETSAM if (max_priority == JETSAM_PRIORITY_IDLE && - !((_memstat_proc_can_idle_exit(p) && !_memstat_proc_is_dirty(p)) || - (_memstat_proc_is_managed(p) && !_memstat_proc_has_priority_assertion(p)))) { + ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT | P_DIRTY_IS_DIRTY | P_DIRTY_TERMINATED)) != (P_DIRTY_ALLOW_IDLE_EXIT))) { /* * This process is in the idle band but is not clean+idle-exitable or * managed+assertion-less. Skip it. @@ -6504,86 +6602,80 @@ memorystatus_kill_top_process(bool any, bool sort_flag, uint32_t cause, os_reaso } #endif /* !CONFIG_JETSAM */ #if CONFIG_FREEZE - boolean_t skip; - boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED); - if (any || reclaim_proc) { - skip = FALSE; - } else { - skip = TRUE; - } - - if (skip) { + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { continue; - } else + } #endif - { - if (proc_ref(p, true) == p) { - /* - * Mark as terminated so that if exit1() indicates success, but the process (for example) - * is blocked in task_exception_notify(), it'll be skipped if encountered again - see - * . 
This is cheaper than examining P_LEXIT, which requires the - * acquisition of the proc lock. - */ - p->p_memstat_state |= P_MEMSTAT_TERMINATED; - } else { - /* - * We need to restart the search again because - * proc_ref _can_ drop the proc_list lock - * and we could have lost our stored next_p via - * an exit() on another core. - */ - i = 0; - next_p = memorystatus_get_first_proc_locked(&i, TRUE); - continue; - } - + if (proc_ref(p, true) == p) { /* - * Capture a snapshot if none exists and: - * - we are forcing a new snapshot creation, either because: - * - on a particular platform we need these snapshots every time, OR - * - a boot-arg/embedded device tree property has been set. - * - priority was not requested (this is something other than an ambient kill) - * - the priority was requested *and* the targeted process is not at idle priority + * Mark as terminated so that if exit1() indicates success, but the process (for example) + * is blocked in task_exception_notify(), it'll be skipped if encountered again - see + * . This is cheaper than examining P_LEXIT, which requires the + * acquisition of the proc lock. */ - if ((memorystatus_jetsam_snapshot_count == 0) && - (force_new_snapshot || memorystatus_idle_snapshot || ((!priority) || (priority && (aPid_ep != JETSAM_PRIORITY_IDLE))))) { - memorystatus_init_jetsam_snapshot_locked(NULL, 0); - new_snapshot = true; - } - - proc_list_unlock(); - - freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */ - /* Success? */ - if (freed_mem) { - *memory_reclaimed = footprint_of_killed_proc; - if (killed) { - if (priority) { - *priority = aPid_ep; - } - } else { - /* purged */ - proc_list_lock(); - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - proc_list_unlock(); - } - proc_rele(p); - goto exit; - } - + p->p_memstat_state |= P_MEMSTAT_TERMINATED; + } else { /* - * Failure - first unwind the state, - * then fall through to restart the search. + * We need to restart the search again because + * proc_ref _can_ drop the proc_list lock + * and we could have lost our stored next_p via + * an exit() on another core. */ - proc_list_lock(); - proc_rele(p); - p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; - p->p_memstat_state |= P_MEMSTAT_ERROR; - *errors += 1; - i = 0; next_p = memorystatus_get_first_proc_locked(&i, TRUE); + continue; } + + /* + * Capture a snapshot if none exists and: + * - we are forcing a new snapshot creation, either because: + * - on a particular platform we need these snapshots every time, OR + * - a boot-arg/embedded device tree property has been set. + * - priority was not requested (this is something other than an ambient kill) + * - the priority was requested *and* the targeted process is not at idle priority + */ + if ((memorystatus_jetsam_snapshot_count == 0) && + (force_new_snapshot || memorystatus_idle_snapshot || ((!priority_out) || (priority_out && (aPid_ep != JETSAM_PRIORITY_IDLE))))) { + memorystatus_init_jetsam_snapshot_locked(NULL, 0); + new_snapshot = true; + } + + proc_list_unlock(); + + freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed, &footprint_of_killed_proc); /* purged and/or killed 'p' */ + /* Success? 
*/ + if (freed_mem) { + if (memory_reclaimed_out) { + *memory_reclaimed_out = footprint_of_killed_proc; + } + if (killed) { + if (priority_out) { + *priority_out = aPid_ep; + } + } else { + /* purged */ + proc_list_lock(); + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + proc_list_unlock(); + } + proc_rele(p); + goto exit; + } + + /* + * Failure - first unwind the state, + * then fall through to restart the search. + */ + proc_list_lock(); + proc_rele(p); + p->p_memstat_state &= ~P_MEMSTAT_TERMINATED; + p->p_memstat_state |= P_MEMSTAT_ERROR; + if (errors_out) { + *errors_out += 1; + } + + i = 0; + next_p = memorystatus_get_first_proc_locked(&i, TRUE); } proc_list_unlock(); @@ -6612,7 +6704,7 @@ exit: #endif KDBG(MEMSTAT_CODE(BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, - MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, *memory_reclaimed); + MEMORYSTATUS_LOG_AVAILABLE_PAGES, killed ? aPid : 0, killed, footprint_of_killed_proc); return killed; } @@ -6656,6 +6748,10 @@ _memstat_refresh_oldest_reapable_proc_info() uint64_t proc_prio_start = p->p_memstat_prio_start; if (proc_prio_start < oldest_prio_start) { oldest_prio_start = proc_prio_start; + /* Since the process list is sorted in age order within priority bands, + * the first process will be the oldest one, and we can bail out and skip the rest + */ + break; } } } @@ -6684,8 +6780,9 @@ _memstat_proc_is_reapable(proc_t proc) /* * To be potentially reapable, the process * - must be in or below the max reapable priority and + * - must not have a relaunch probability of High or Medium (per memstat_reaper_reap_relaunch_mask) * - must have been in that priority band longer than the reaper minimum age threshold - * - must not have a relaunch probability of High or Medium + * - must have been in that priority band longer than the reaper minimum age threshold for applications, if process is an application */ priority_band = proc->p_memstat_effectivepriority; if (priority_band > memstat_reaper_max_priority) { @@ -6694,14 +6791,6 @@ _memstat_proc_is_reapable(proc_t proc) return false; } - absolutetime_to_nanoseconds(mach_absolute_time() - proc->p_memstat_prio_start, &time_in_priority_band_secs); - time_in_priority_band_secs /= NSEC_PER_SEC; - if (time_in_priority_band_secs < memstat_reaper_min_age_secs) { - memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because age (%llu) is below min age (%d)\n", - proc_best_name(proc), proc->p_pid, time_in_priority_band_secs, memstat_reaper_min_age_secs); - return false; - } - uint32_t relaunch_flags = proc->p_memstat_relaunch_flags; // There's no explicit flag for "unknown" relaunch probability, and we need one for our control bitmask. 
// So if none of the Low Medium or High bits are set, we set the next higher bit as the "unknown relaunch probability" bit @@ -6717,11 +6806,42 @@ _memstat_proc_is_reapable(proc_t proc) return false; } - memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] is reapable; priority=%d, age=%d, relaunch_probability_acceptable_mask=0x%02X\n", - proc_best_name(proc), proc->p_pid, priority_band, (uint32_t)(time_in_priority_band_secs), relaunch_probability_acceptable_mask ); + absolutetime_to_nanoseconds(mach_absolute_time() - proc->p_memstat_prio_start, &time_in_priority_band_secs); + time_in_priority_band_secs /= NSEC_PER_SEC; + + if (_memstat_proc_is_application(proc)) { + if ((time_in_priority_band_secs < memstat_reaper_min_age_apps_secs)) { + memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because it is an application and age (%llu) is below min age for apps (%d)\n", + proc_best_name(proc), proc->p_pid, time_in_priority_band_secs, memstat_reaper_min_age_apps_secs); + return false; + } + } else { + if (time_in_priority_band_secs < memstat_reaper_min_age_secs) { + memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] not reapable because age (%llu) is below min age (%d)\n", + proc_best_name(proc), proc->p_pid, time_in_priority_band_secs, memstat_reaper_min_age_secs); + return false; + } + } + + memorystatus_log_debug("_memstat_proc_is_reapable: %s [%d] is reapable; priority=%d, age=%d, relaunch_probability_acceptable_mask=0x%02X, type=%s\n", + proc_best_name(proc), proc->p_pid, priority_band, (uint32_t)(time_in_priority_band_secs), relaunch_probability_acceptable_mask, + _memstat_proc_type_description(proc)); return true; } +static bool +_memstat_proc_is_application(proc_t proc) +{ + bool isApp = false; + + task_t task = proc_task(proc); + if (task != NULL) { + isApp = task_is_app( task); + } + + return isApp; +} + /* * Jetsam aggressively */ @@ -6755,7 +6875,7 @@ memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, * JETSAM_PRIORITY_FOREGROUND bucket. If yes, sort the FG band based on * coalition footprint. */ - memorystatus_sort_bucket(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_DEFAULT); + memstat_sort_bucket(JETSAM_PRIORITY_FOREGROUND, memstat_jetsam_fg_sort_order); } jetsam_reason = os_reason_create(OS_REASON_JETSAM, cause); @@ -6868,11 +6988,11 @@ memorystatus_kill_processes_aggressive(uint32_t cause, int aggr_count, absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs); time_in_priority_band_secs /= NSEC_PER_SEC; memorystatus_log( - "memorystatus: %s%d pid %d [%s] (%s %d %llus rf:%s - memorystatus_available_pages: %llu\n", + "memorystatus: %s%d pid %d [%s] (%s %d %llus rf:%s type:%s) - memorystatus_available_pages: %llu\n", ((aPid_ep == JETSAM_PRIORITY_IDLE) ? 
"killing_idle_process_aggressive" : "killing_top_process_aggressive"), aggr_count, aPid, proc_best_name(p), - memorystatus_kill_cause_name[cause], aPid_ep, - time_in_priority_band_secs, _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), + memstat_kill_cause_name[cause], aPid_ep, + time_in_priority_band_secs, _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), _memstat_proc_type_description(p), (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES); memorystatus_level_snapshot = memorystatus_level; @@ -7206,12 +7326,13 @@ memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, un absolutetime_to_nanoseconds(killtime - p->p_memstat_prio_start, &time_in_priority_band_secs); time_in_priority_band_secs /= NSEC_PER_SEC; - memorystatus_log("%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d %llus rf:%s) %lluKB - memorystatus_available_pages: %llu\n", + memorystatus_log("%lu.%03d memorystatus: killing_top_process_elevated%d pid %d [%s] (%s %d %llus rf:%s type:%s) %lluKB - memorystatus_available_pages: %llu\n", (unsigned long)tv_sec, tv_msec, aggr_count, aPid, ((p && *p->p_name) ? p->p_name : "unknown"), - memorystatus_kill_cause_name[cause], aPid_ep, + memstat_kill_cause_name[cause], aPid_ep, time_in_priority_band_secs, _memstat_relaunch_flags_description(p->p_memstat_relaunch_flags), + _memstat_proc_type_description(p), footprint_of_killed_proc >> 10, (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES); /* Success? */ @@ -7283,7 +7404,7 @@ memorystatus_kill_on_VM_compressor_space_shortage(bool async) memorystatus_log_error("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n"); } - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason); + return memstat_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason); } } @@ -7304,18 +7425,20 @@ memorystatus_kill_on_vnode_exhaustion(void) memorystatus_log_error("memorystatus_kill_on_vnode_limit: failed to allocate jetsam reason\n"); } - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason); + return memstat_kill_process_sync(-1, kMemorystatusKilledVnodes, jetsam_reason); } +#endif /* CONFIG_JETSAM */ + bool memorystatus_kill_on_sustained_pressure() { os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE); if (jetsam_reason == OS_REASON_NULL) { - memorystatus_log_error("memorystatus_kill_on_FC_thrashing -- sync: failed to allocate jetsam reason\n"); + memorystatus_log_error("%s() failed to allocate jetsam reason\n", __func__); } - return memorystatus_kill_process_sync(-1, kMemorystatusKilledSustainedPressure, jetsam_reason); + return memstat_kill_process_sync(-1, kMemorystatusKilledSustainedPressure, jetsam_reason); } bool @@ -7323,11 +7446,9 @@ memstat_kill_with_jetsam_reason_sync(pid_t pid, os_reason_t jetsam_reason) { uint32_t kill_cause = jetsam_reason->osr_code <= JETSAM_REASON_MEMORYSTATUS_MAX ? 
(uint32_t) jetsam_reason->osr_code : JETSAM_REASON_INVALID; - return memorystatus_kill_process_sync(pid, kill_cause, jetsam_reason); + return memstat_kill_process_sync(pid, kill_cause, jetsam_reason); } -#endif /* CONFIG_JETSAM */ - bool memorystatus_kill_on_zone_map_exhaustion(pid_t pid) { @@ -7340,7 +7461,7 @@ memorystatus_kill_on_zone_map_exhaustion(pid_t pid) if (jetsam_reason == OS_REASON_NULL) { memorystatus_log_error("memorystatus_kill_on_zone_map_exhaustion: failed to allocate jetsam reason\n"); } - return memorystatus_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason); + return memstat_kill_process_sync(pid, kMemorystatusKilledZoneMapExhaustion, jetsam_reason); } } @@ -7851,7 +7972,6 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b #endif /* (DEVELOPMENT || DEBUG)*/ if (is_default_snapshot) { snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; - memorystatus_jetsam_snapshot_last_timestamp = 0; } #if CONFIG_FREEZE else if (is_freezer_snapshot) { @@ -8080,7 +8200,7 @@ memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size) /* Allow head inserts -- but relative order is now */ if (table[i].priority == JETSAM_PRIORITY_IDLE_HEAD) { new_priority = JETSAM_PRIORITY_IDLE; - priority_options |= MEMSTAT_PRIORITY_INSERT_HEAD; + priority_options |= (MEMSTAT_PRIORITY_INSERT_HEAD | MEMSTAT_PRIORITY_NO_AGING); } else { new_priority = table[i].priority; } @@ -8382,6 +8502,26 @@ memorystatus_cmd_get_diag_memlimit_properties(pid_t pid, user_addr_t buffer, siz } #endif //DEBUG || DEVELOPMENT +static int +_memstat_get_process_conclave_mem_limit(pid_t pid, int32_t *retval) +{ + kern_return_t error; + proc_t p = proc_find(pid); + if (!p) { + return ESRCH; + } + + uint64_t conclave_limit; + error = task_get_conclave_mem_limit(proc_task(p), &conclave_limit); + + if (error == KERN_SUCCESS) { + *retval = roundToNearestMB((uint32_t)conclave_limit); + } + + proc_rele(p); + return mach_to_bsd_errno(error); +} + static void memorystatus_get_memlimit_properties_internal(proc_t p, memorystatus_memlimit_properties_t* p_entry) { @@ -8661,6 +8801,46 @@ memorystatus_cmd_convert_memlimit_mb(pid_t pid, int32_t limit, int32_t *retval) proc_rele(p); return error; } + +static int +_memstat_rearm_proc_memlimit(proc_t proc, void* flagsptr) +{ + task_t task = proc_task(proc); + uint32_t flags = *((uint32_t *) flagsptr); + + if (flags & MEMORYSTATUS_FLAGS_REARM_ACTIVE) { + task_reset_triggered_exc_resource(task, true); + } + if (flags & MEMORYSTATUS_FLAGS_REARM_INACTIVE) { + task_reset_triggered_exc_resource(task, false); + } + + return 0; +} + +static int +memorystatus_cmd_rearm_memlimit(pid_t pid, uint32_t flags, __unused int32_t *retval) +{ + if (pid == -1) { + /* Re-arm all pids */ + proc_iterate( + PROC_ALLPROCLIST, + _memstat_rearm_proc_memlimit, + &flags, + NULL, + NULL); + } else { + /* Re-arm one pid */ + proc_t p = (pid == proc_selfpid()) ? proc_self() : proc_find(pid); + if (!p) { + return ESRCH; + } + _memstat_rearm_proc_memlimit(p, &flags); + proc_rele(p); + } + + return 0; +} #endif /* CONFIG_JETSAM */ #if DEBUG || DEVELOPMENT @@ -8859,14 +9039,23 @@ memorystatus_get_process_is_managed(pid_t pid, int *is_managed) return ESRCH; } - proc_list_lock(); - *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0); + *is_managed = memorystatus_get_proc_is_managed(p) ? 
1 : 0; + proc_rele(p); - proc_list_unlock(); return 0; } +bool +memorystatus_get_proc_is_managed(proc_t proc) +{ + proc_list_lock(); + bool is_managed = _memstat_proc_is_managed(proc); + proc_list_unlock(); + return is_managed; +} + + static int memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed) { @@ -8990,6 +9179,16 @@ memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int } #endif /* DEVELOPMENT || DEBUG */ +#if DEVELOPMENT || DEBUG + /* + * On development kernels, processes should be able to re-arm themselves + * without entitlement for testing. + */ + if (args->command == MEMORYSTATUS_CMD_REARM_MEMLIMIT && proc_getpid(p) == args->pid) { + skip_auth_check = TRUE; + } +#endif + /* Need to be root or have entitlement. */ if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) { error = EPERM; @@ -9104,6 +9303,10 @@ memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int case MEMORYSTATUS_CMD_CONVERT_MEMLIMIT_MB: error = memorystatus_cmd_convert_memlimit_mb(args->pid, (int32_t) args->flags, ret); break; + + case MEMORYSTATUS_CMD_REARM_MEMLIMIT: + error = memorystatus_cmd_rearm_memlimit(args->pid, args->flags, ret); + break; #endif /* CONFIG_JETSAM */ /* Test commands */ #if DEVELOPMENT || DEBUG @@ -9113,7 +9316,7 @@ memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int memorystatus_log_error("memorystatus_control: failed to allocate jetsam reason\n"); } - error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL; + error = memstat_kill_process_sync(args->pid, kMemorystatusKilled, jetsam_reason) ? 0 : EINVAL; break; case MEMORYSTATUS_CMD_TEST_JETSAM_SORT: error = memorystatus_cmd_test_jetsam_sort(args->pid, (int32_t)args->flags, args->buffer, args->buffersize); @@ -9193,6 +9396,11 @@ memorystatus_control(struct proc *p, struct memorystatus_control_args *args, int case MEMORYSTATUS_CMD_GET_KILL_COUNTS: error = memorystatus_cmd_get_kill_counts(args->pid, args->buffer, args->buffersize, args->flags); break; + + case MEMORYSTATUS_CMD_GET_CONCLAVE_LIMIT: + error = _memstat_get_process_conclave_mem_limit(args->pid, ret); + break; + default: error = EINVAL; break; @@ -9204,109 +9412,80 @@ out: /* Coalition support */ -/* sorting info for a particular priority bucket */ -typedef struct memstat_sort_info { - coalition_t msi_coal; - uint64_t msi_page_count; - pid_t msi_pid; - int msi_ntasks; -} memstat_sort_info_t; - /* - * qsort from smallest page count to largest page count - * - * return < 0 for a < b - * 0 for a == b - * > 0 for a > b + * Inserts a list of pids before the given proc in the bucket. If any of the + * pids in the given list are not already in the bucket, they will be ignored. 
*/ -static int -memstat_asc_cmp(const void *a, const void *b) +static void +memstat_insert_list_locked( + proc_t before, + unsigned int bucket_idx, + pid_t *pid_list, + int list_sz) { - const memstat_sort_info_t *msA = (const memstat_sort_info_t *)a; - const memstat_sort_info_t *msB = (const memstat_sort_info_t *)b; + int i; + proc_t p; + memstat_bucket_t *bucket; - return (int)((uint64_t)msA->msi_page_count - (uint64_t)msB->msi_page_count); + assert(bucket_idx < MEMSTAT_BUCKET_COUNT); + + bucket = &memstat_bucket[bucket_idx]; + + if ((pid_list == NULL) || (list_sz <= 0)) { + return; + } + + for (i = list_sz - 1; i >= 0; i--) { + p = proc_find_locked(pid_list[i]); + + if (p == NULL || p == before) { + /* + * We can encounter p == before when we try to sort a coalition with an in- + * progress exec of the leader, such that the leader and the exec-ing + * member have the same PID. Just skip over it for now, since this member + * will soon be removed from the proc list anyway. + */ + continue; + } + + if (p->p_memstat_effectivepriority != bucket_idx) { + /* proc not in bucket, skip it */ + proc_rele(p); + continue; + } + + TAILQ_REMOVE(&bucket->list, p, p_memstat_list); + TAILQ_INSERT_BEFORE(before, p, p_memstat_list); + proc_rele(p); + } } - /* * Return the number of pids rearranged during this sort. */ -static int -memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coal_sort_order) +static void +memstat_sort_coals_locked(unsigned int bucket_index, memorystatus_jetsam_sort_order_t sort_order) { #define MAX_SORT_PIDS 80 -#define MAX_COAL_LEADERS 10 - unsigned int b = bucket_index; - int nleaders = 0; int ntasks = 0; proc_t p = NULL; coalition_t coal = COALITION_NULL; - int pids_moved = 0; - int total_pids_moved = 0; - int i; - - /* - * The system is typically under memory pressure when in this - * path, hence, we want to avoid dynamic memory allocation. - */ - memstat_sort_info_t leaders[MAX_COAL_LEADERS]; pid_t pid_list[MAX_SORT_PIDS]; + memstat_bucket_t *bucket; - if (bucket_index >= MEMSTAT_BUCKET_COUNT) { - return 0; - } + assert((sort_order == JETSAM_SORT_LRU) || (sort_order == JETSAM_SORT_FOOTPRINT)); + assert(bucket_index < MEMSTAT_BUCKET_COUNT); - /* - * Clear the array that holds coalition leader information - */ - for (i = 0; i < MAX_COAL_LEADERS; i++) { - leaders[i].msi_coal = COALITION_NULL; - leaders[i].msi_page_count = 0; /* will hold total coalition page count */ - leaders[i].msi_pid = 0; /* will hold coalition leader pid */ - leaders[i].msi_ntasks = 0; /* will hold the number of tasks in a coalition */ - } - - p = memorystatus_get_first_proc_locked(&b, FALSE); - while (p) { - coal = task_get_coalition(proc_task(p), COALITION_TYPE_JETSAM); - if (coalition_is_leader(proc_task(p), coal)) { - if (nleaders < MAX_COAL_LEADERS) { - int coal_ntasks = 0; - uint64_t coal_page_count = coalition_get_page_count(coal, &coal_ntasks); - leaders[nleaders].msi_coal = coal; - leaders[nleaders].msi_page_count = coal_page_count; - leaders[nleaders].msi_pid = proc_getpid(p); /* the coalition leader */ - leaders[nleaders].msi_ntasks = coal_ntasks; - nleaders++; - } else { - /* - * We've hit MAX_COAL_LEADERS meaning we can handle no more coalitions. - * Abandoned coalitions will linger at the tail of the priority band - * when this sort session ends. - * TODO: should this be an assert? 
- */ - memorystatus_log_error( - "%s: WARNING: more than %d leaders in priority band [%d]\n", - __FUNCTION__, MAX_COAL_LEADERS, bucket_index); - break; - } - } - p = memorystatus_get_next_proc_locked(&b, p, FALSE); - } - - if (nleaders == 0) { - /* Nothing to sort */ - return 0; - } - - /* - * Sort the coalition leader array, from smallest coalition page count - * to largest coalition page count. When inserted in the priority bucket, - * smallest coalition is handled first, resulting in the last to be jetsammed. - */ - if (nleaders > 1) { - qsort(leaders, nleaders, sizeof(memstat_sort_info_t), memstat_asc_cmp); + switch (sort_order) { + case JETSAM_SORT_LRU: + /* Nothing to do, buckets are already LRU */ + break; + case JETSAM_SORT_FOOTPRINT: + /* Sort bucket by footprint first */ + memstat_sort_by_footprint_locked(bucket_index); + break; + default: + panic("Invalid sort order %d passed to memstat_sort_coals", sort_order); } /* @@ -9327,118 +9506,62 @@ memorystatus_sort_by_largest_coalition_locked(unsigned int bucket_index, int coa * Coalition members are rearranged in the priority bucket here, * based on their coalition role. */ - total_pids_moved = 0; - for (i = 0; i < nleaders; i++) { - /* a bit of bookkeeping */ - pids_moved = 0; - /* Coalition leaders are jetsammed last, so move into place first */ - pid_list[0] = leaders[i].msi_pid; - pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, 1); + bucket = &memstat_bucket[bucket_index]; + p = TAILQ_FIRST(&bucket->list); + while (p) { + coal = task_get_coalition(proc_task(p), COALITION_TYPE_JETSAM); + if (!coalition_is_leader(proc_task(p), coal)) { + p = TAILQ_NEXT(p, p_memstat_list); + continue; + } - /* xpc services should jetsam after extensions */ - ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_XPC, - coal_sort_order, pid_list, MAX_SORT_PIDS); + /* undefined coalition members should be the first to jetsam */ + ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_UNDEF, + COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS); if (ntasks > 0) { - pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, + memstat_insert_list_locked(p, bucket_index, pid_list, (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS)); } /* extensions should jetsam after unmarked processes */ - ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_EXT, - coal_sort_order, pid_list, MAX_SORT_PIDS); + ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_EXT, + COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS); if (ntasks > 0) { - pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, + memstat_insert_list_locked(p, bucket_index, pid_list, (ntasks <= MAX_SORT_PIDS ? ntasks : MAX_SORT_PIDS)); } - /* undefined coalition members should be the first to jetsam */ - ntasks = coalition_get_pid_list(leaders[i].msi_coal, COALITION_ROLEMASK_UNDEF, - coal_sort_order, pid_list, MAX_SORT_PIDS); + /* xpc services should jetsam after extensions */ + ntasks = coalition_get_pid_list(coal, COALITION_ROLEMASK_XPC, + COALITION_SORT_DEFAULT, pid_list, MAX_SORT_PIDS); if (ntasks > 0) { - pids_moved += memorystatus_move_list_locked(bucket_index, pid_list, + memstat_insert_list_locked(p, bucket_index, pid_list, (ntasks <= MAX_SORT_PIDS ? 
ntasks : MAX_SORT_PIDS)); } - total_pids_moved += pids_moved; + /* + * And then, the leader will jetsam last since we inserted everyone else + * before it in the bucket + */ + + p = TAILQ_NEXT(p, p_memstat_list); } /* end for */ - - return total_pids_moved; } -/* - * Traverse a list of pids, searching for each within the priority band provided. - * If pid is found, move it to the front of the priority band. - * Never searches outside the priority band provided. - * - * Input: - * bucket_index - jetsam priority band. - * pid_list - pointer to a list of pids. - * list_sz - number of pids in the list. - * - * Pid list ordering is important in that, - * pid_list[n] is expected to jetsam ahead of pid_list[n+1]. - * The sort_order is set by the coalition default. - * - * Return: - * the number of pids found and hence moved within the priority band. - */ -static int -memorystatus_move_list_locked(unsigned int bucket_index, pid_t *pid_list, int list_sz) -{ - memstat_bucket_t *current_bucket; - int i; - int found_pids = 0; - - if ((pid_list == NULL) || (list_sz <= 0)) { - return 0; - } - - if (bucket_index >= MEMSTAT_BUCKET_COUNT) { - return 0; - } - - current_bucket = &memstat_bucket[bucket_index]; - for (i = 0; i < list_sz; i++) { - unsigned int b = bucket_index; - proc_t p = NULL; - proc_t aProc = NULL; - pid_t aPid; - int list_index; - - list_index = ((list_sz - 1) - i); - aPid = pid_list[list_index]; - - /* never search beyond bucket_index provided */ - p = memorystatus_get_first_proc_locked(&b, FALSE); - while (p) { - if (proc_getpid(p) == aPid) { - aProc = p; - break; - } - p = memorystatus_get_next_proc_locked(&b, p, FALSE); - } - - if (aProc == NULL) { - /* pid not found in this band, just skip it */ - continue; - } else { - TAILQ_REMOVE(¤t_bucket->list, aProc, p_memstat_list); - TAILQ_INSERT_HEAD(¤t_bucket->list, aProc, p_memstat_list); - found_pids++; - } - } - return found_pids; -} uint32_t memstat_get_idle_proccnt(void) { #if CONFIG_JETSAM + /* + * On fully jetsam-enabled systems, all processes on the idle band may + * be idle-exited + */ return os_atomic_load(&memstat_bucket[JETSAM_PRIORITY_IDLE].count, relaxed); #else /* !CONFIG_JETSAM */ uint32_t count = 0; @@ -9448,8 +9571,14 @@ memstat_get_idle_proccnt(void) for (proc_t p = memorystatus_get_first_proc_locked(&bucket, FALSE); p != PROC_NULL; p = memorystatus_get_next_proc_locked(&bucket, p, FALSE)) { - if ((_memstat_proc_can_idle_exit(p) && !_memstat_proc_is_dirty(p)) || - (_memstat_proc_is_managed(p) && !_memstat_proc_has_priority_assertion(p))) { + /* + * On macOS, we can only exit clean daemons. In the future, we + * should include assertion-less managed daemons. Apps may make + * their way into this band as well, and we cannot jetsam those. 
+ */ + if (_memstat_proc_can_idle_exit(p) && + !_memstat_proc_is_dirty(p) && + !_memstat_proc_is_terminating(p)) { count++; } } @@ -9459,64 +9588,56 @@ memstat_get_idle_proccnt(void) #endif /* CONFIG_JETSAM */ } +uint32_t +memstat_get_long_idle_proccnt(void) +{ + uint32_t count = 0; + uint32_t bucket = JETSAM_PRIORITY_IDLE; + + proc_list_lock(); + for (proc_t p = memorystatus_get_first_proc_locked(&bucket, FALSE); + p != PROC_NULL; + p = memorystatus_get_next_proc_locked(&bucket, p, FALSE)) { + if (!_memstat_proc_is_dirty(p) && _memstat_proc_can_idle_exit(p) && + !_memstat_proc_is_terminating(p) && _memstat_proc_is_reapable(p)) { + count++; + } + } + proc_list_unlock(); + + return count; +} + uint32_t memstat_get_proccnt_upto_priority(uint32_t max_bucket_index) { int32_t i = JETSAM_PRIORITY_IDLE; int count = 0; - if (max_bucket_index >= MEMSTAT_BUCKET_COUNT) { - return -1; - } + assert3u(max_bucket_index, <=, MEMSTAT_BUCKET_COUNT); while (i <= max_bucket_index) { - count += memstat_bucket[i++].count; + /* + * NB: We don't hold the proc-list lock here; that's ok b/c this is just an + * estimate. + */ + count += os_atomic_load(&memstat_bucket[i++].count, relaxed); } return count; } int -memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap) +memorystatus_update_priority_for_appnap(proc_t p) { #if !CONFIG_JETSAM if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) { /* * Ineligible processes OR system processes e.g. launchd. - * - * We also skip processes that have the P_MEMSTAT_MANAGED bit set, i.e. - * they're managed by assertiond. These are iOS apps that have been ported - * to macOS. assertiond might be in the process of modifying the app's - * priority / memory limit - so it might have the proc_list lock, and then try - * to take the task lock. Meanwhile we've entered this function with the task lock - * held, and we need the proc_list lock below. So we'll deadlock with assertiond. - * - * It should be fine to read the P_MEMSTAT_MANAGED bit without the proc_list - * lock here, since assertiond only sets this bit on process launch. */ return -1; } - /* - * For macOS only: - * We would like to use memorystatus_set_priority() here to move the processes - * within the bands. Unfortunately memorystatus_set_priority() calls - * memorystatus_update_priority_locked() which uses any band transitions - * as an indication to modify ledgers. For that it needs the task lock - * and since we came into this function with the task lock held, we'll deadlock. - * - * Unfortunately we can't completely disable ledger updates because we still - * need the ledger updates for a subset of processes i.e. daemons. - * When all processes on all platforms support memory limits, we can simply call - * memorystatus_set_priority(). - * - * It also has some logic to deal with 'aging' which, currently, is only applicable - * on CONFIG_JETSAM configs. So, till every platform has CONFIG_JETSAM we'll need - * to do this explicit band transition. 
- */ - - memstat_bucket_t *current_bucket, *new_bucket; - uint64_t now; int32_t priority = 0; proc_list_lock(); @@ -9532,61 +9653,17 @@ memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap) return 0; } - if (is_appnap) { - current_bucket = &memstat_bucket[p->p_memstat_effectivepriority]; - new_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; - priority = JETSAM_PRIORITY_IDLE; - } else { - if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) { - /* - * It is possible that someone pulled this process - * out of the IDLE band without updating its app-nap - * parameters. - */ - proc_list_unlock(); - return 0; - } - - current_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; - new_bucket = &memstat_bucket[p->p_memstat_requestedpriority]; - priority = p->p_memstat_requestedpriority; - } - - now = mach_absolute_time(); - - TAILQ_REMOVE(¤t_bucket->list, p, p_memstat_list); - current_bucket->count--; - if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) { - current_bucket->relaunch_high_count--; - } - TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list); - new_bucket->count++; - if (p->p_memstat_relaunch_flags & (P_MEMSTAT_RELAUNCH_HIGH)) { - new_bucket->relaunch_high_count++; - } /* - * Record idle start or idle delta. + * Update priority. We don't want the aging logic because that's only applicable on + * configs with CONFIG_JETSAM. */ - if (p->p_memstat_effectivepriority == priority) { - /* - * This process is not transitioning between - * jetsam priority buckets. Do nothing. - */ - } else if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE) { - /* - * Transitioning out of the idle priority bucket. - * Record idle delta. - */ - assert(p->p_memstat_prio_start != 0); - if (now > p->p_memstat_prio_start) { - p->p_memstat_idle_delta = now - p->p_memstat_prio_start; - } + priority = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_SUP_ACTIVE) ? + JETSAM_PRIORITY_BACKGROUND : + p->p_memstat_requestedpriority; + if (_memstat_proc_has_priority_assertion(p)) { + priority = MAX(priority, p->p_memstat_assertionpriority); } - - KDBG(MEMSTAT_CODE(BSD_MEMSTAT_CHANGE_PRIORITY), proc_getpid(p), priority, p->p_memstat_effectivepriority); - - p->p_memstat_effectivepriority = priority; - p->p_memstat_prio_start = now; + memstat_update_priority_locked(p, priority, MEMSTAT_PRIORITY_NO_AGING); proc_list_unlock(); @@ -9594,7 +9671,6 @@ memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap) #else /* !CONFIG_JETSAM */ #pragma unused(p) - #pragma unused(is_appnap) return -1; #endif /* !CONFIG_JETSAM */ } @@ -9641,155 +9717,6 @@ memorystatus_available_memory(struct proc *p, __unused struct memorystatus_avail return 0; } -void -memorystatus_log_system_health(const memorystatus_system_health_t *status) -{ - static struct memorystatus_system_health prev_status = {0}; - - bool healthy = memorystatus_is_system_healthy(status); - - /* - * Avoid spamming logs by only logging when the system status has changed. 
- */ - if (prev_status.msh_zone_map_is_exhausted == status->msh_zone_map_is_exhausted && - prev_status.msh_compressor_exhausted == status->msh_compressor_exhausted && - prev_status.msh_swap_low_on_space == status->msh_swap_low_on_space && - prev_status.msh_swap_exhausted == status->msh_swap_exhausted -#if CONFIG_JETSAM - && - prev_status.msh_available_pages_below_idle == status->msh_available_pages_below_idle && - prev_status.msh_available_pages_below_soft == status->msh_available_pages_below_soft && - prev_status.msh_available_pages_below_critical == status->msh_available_pages_below_critical && - prev_status.msh_available_pages_below_reaper == status->msh_available_pages_below_reaper && - prev_status.msh_compressor_needs_to_swap == status->msh_compressor_needs_to_swap && - prev_status.msh_compressor_is_thrashing == status->msh_compressor_is_thrashing && - prev_status.msh_filecache_is_thrashing == status->msh_filecache_is_thrashing && - prev_status.msh_phantom_cache_pressure == status->msh_phantom_cache_pressure && - prev_status.msh_swapin_queue_over_limit == status->msh_swapin_queue_over_limit && - prev_status.msh_pageout_starved == status->msh_pageout_starved -#endif /* CONFIG_JETSAM */ - ) { - /* No change */ - return; - } - -#if CONFIG_JETSAM - if (healthy) { - if (status->msh_available_pages_below_soft) { - memorystatus_log( - "memorystatus: System will begin enforcing " - "soft memory limits. " - "memorystatus_available_pages: %llu compressor_size: %u\n", - (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); - } else if (status->msh_available_pages_below_idle) { - memorystatus_log( - "memorystatus: System will begin enacting " - "idle-exits. " - "memorystatus_available_pages: %llu compressor_size: %u\n", - (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); - } else if (status->msh_available_pages_below_reaper) { - memorystatus_log( - "memorystatus: System will begin reaping " - "long-idle processes. " - "memorystatus_available_pages: %llu compressor_size: %u\n", - (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); - } else { - memorystatus_log( - "memorystatus: System is healthy. " - "memorystatus_available_pages: %llu compressor_size:%u\n", - (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); - } - } else { - /* Unhealthy */ - memorystatus_log("memorystatus: System is unhealthy! 
memorystatus_available_pages: %llu compressor_size:%u\n", - (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); - memorystatus_log( - "memorystatus: {" - "\"available_pages_below_critical\": %d, " - "\"available_pages_below_idle\": %d, " - "\"available_pages_below_soft\": %d, " - "\"available_pages_below_reaper\": %d, " - "\"compressor_needs_to_swap\": %d, " - "\"compressor_exhausted\": %d, " - "\"compressor_is_thrashing\": %d, " - "\"filecache_is_thrashing\": %d, " - "\"zone_map_is_exhausted\": %d, " - "\"phantom_cache_pressure\": %d, " - "\"swappable_compressor_segments_over_limit\": %d, " - "\"swapin_queue_over_limit\": %d, " - "\"swap_low\": %d, " - "\"swap_exhausted\": %d" - "}\n", - status->msh_available_pages_below_critical, - status->msh_available_pages_below_idle, - status->msh_available_pages_below_soft, - status->msh_available_pages_below_reaper, - status->msh_compressor_needs_to_swap, - status->msh_compressor_exhausted, - status->msh_compressor_is_thrashing, - status->msh_filecache_is_thrashing, - status->msh_zone_map_is_exhausted, - status->msh_phantom_cache_pressure, - status->msh_swappable_compressor_segments_over_limit, - status->msh_swapin_queue_over_limit, - status->msh_swap_low_on_space, - status->msh_swap_exhausted); - } -#else /* CONFIG_JETSAM */ - memorystatus_log("memorystatus: System is %s. memorystatus_available_pages: %llu compressor_size:%u\n", - healthy ? "healthy" : "unhealthy", - (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); - if (!healthy) { - memorystatus_log( - "memorystatus: {" - "\"compressor_exhausted\": %d, " - "\"zone_map_is_exhausted\": %d, " - "\"swap_low\": %d, " - "\"swap_exhausted\": %d" - "}\n", - status->msh_compressor_exhausted, - status->msh_zone_map_is_exhausted, - status->msh_swap_low_on_space, - status->msh_swap_exhausted); - } -#endif /* CONFIG_JETSAM */ - prev_status = *status; -} - -uint32_t -memorystatus_pick_kill_cause(const memorystatus_system_health_t *status) -{ - assert(!memorystatus_is_system_healthy(status)); -#if CONFIG_JETSAM - if (status->msh_compressor_is_thrashing) { - return kMemorystatusKilledVMCompressorThrashing; - } else if (status->msh_compressor_exhausted) { - return kMemorystatusKilledVMCompressorSpaceShortage; - } else if (status->msh_swap_low_on_space) { - return kMemorystatusKilledLowSwap; - } else if (status->msh_filecache_is_thrashing) { - return kMemorystatusKilledFCThrashing; - } else if (status->msh_zone_map_is_exhausted) { - return kMemorystatusKilledZoneMapExhaustion; - } else if (status->msh_pageout_starved) { - return kMemorystatusKilledVMPageoutStarvation; - } else { - assert(status->msh_available_pages_below_critical); - return kMemorystatusKilledVMPageShortage; - } -#else /* CONFIG_JETSAM */ - if (status->msh_zone_map_is_exhausted) { - return kMemorystatusKilledZoneMapExhaustion; - } else if (status->msh_compressor_exhausted) { - return kMemorystatusKilledVMCompressorSpaceShortage; - } else if (status->msh_swap_exhausted) { - return kMemorystatusKilledLowSwap; - } else { - return kMemorystatusKilled; - } -#endif /* CONFIG_JETSAM */ -} - #if DEVELOPMENT || DEBUG static int memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase) @@ -9801,6 +9728,11 @@ memorystatus_cmd_increase_jetsam_task_limit(pid_t pid, uint32_t byte_increase) return EINVAL; } + if (memstat_ignore_task_limit_increase) { + /* If the bootarg is set, lie and say we did it */ + return 0; + } + proc_t p = proc_find(pid); if (!p) { diff --git 
a/bsd/kern/kern_memorystatus_freeze.c b/bsd/kern/kern_memorystatus_freeze.c index 551fb6ace..7a82b885c 100644 --- a/bsd/kern/kern_memorystatus_freeze.c +++ b/bsd/kern/kern_memorystatus_freeze.c @@ -195,12 +195,12 @@ struct memorystatus_freezer_candidate_list memorystatus_global_demote_list = {NU #define FREEZER_USE_ORDERED_LIST_DEFAULT 0 #endif int memorystatus_freezer_use_ordered_list = FREEZER_USE_ORDERED_LIST_DEFAULT; -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freezer_use_ordered_list, &memorystatus_freezer_use_ordered_list, 0, 1, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freezer_use_ordered_list, &memorystatus_freezer_use_ordered_list, 0, 1, ""); /* * When enabled, demotion candidates are chosen from memorystatus_global_demotion_list */ int memorystatus_freezer_use_demotion_list = 0; -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freezer_use_demotion_list, &memorystatus_freezer_use_demotion_list, 0, 1, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freezer_use_demotion_list, &memorystatus_freezer_use_demotion_list, 0, 1, ""); extern boolean_t vm_swap_max_budget(uint64_t *); @@ -411,13 +411,13 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freezer_thaw_percentage_fg_non_xpc_ser #define FREEZER_ERROR_STRING_LENGTH 128 -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_pages_min, &memorystatus_freeze_pages_min, 0, UINT32_MAX, ""); -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_pages_max, &memorystatus_freeze_pages_max, 0, UINT32_MAX, ""); -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_processes_max, &memorystatus_frozen_processes_max, 0, UINT32_MAX, ""); -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_jetsam_band, &memorystatus_freeze_jetsam_band, JETSAM_PRIORITY_BACKGROUND, JETSAM_PRIORITY_FOREGROUND, ""); -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_private_shared_pages_ratio, &memorystatus_freeze_private_shared_pages_ratio, 0, UINT32_MAX, ""); -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_min_processes, &memorystatus_freeze_suspended_threshold, 0, UINT32_MAX, ""); -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_freeze_max_candidate_band, &memorystatus_freeze_max_candidate_band, JETSAM_PRIORITY_IDLE, JETSAM_PRIORITY_FOREGROUND, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_pages_min, &memorystatus_freeze_pages_min, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_pages_max, &memorystatus_freeze_pages_max, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_processes_max, &memorystatus_frozen_processes_max, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_jetsam_band, &memorystatus_freeze_jetsam_band, JETSAM_PRIORITY_BACKGROUND, JETSAM_PRIORITY_FOREGROUND, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_private_shared_pages_ratio, &memorystatus_freeze_private_shared_pages_ratio, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_min_processes, &memorystatus_freeze_suspended_threshold, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_freeze_max_candidate_band, &memorystatus_freeze_max_candidate_band, JETSAM_PRIORITY_IDLE, JETSAM_PRIORITY_FOREGROUND, ""); static int sysctl_memorystatus_freeze_budget_multiplier SYSCTL_HANDLER_ARGS { @@ -458,21 +458,21 @@ sysctl_memorystatus_freeze_budget_multiplier SYSCTL_HANDLER_ARGS } return 0; } -EXPERIMENT_FACTOR_PROC(_kern, memorystatus_freeze_budget_multiplier, CTLTYPE_QUAD | CTLFLAG_RW, 0, 0, &sysctl_memorystatus_freeze_budget_multiplier, "Q", ""); 
+EXPERIMENT_FACTOR_LEGACY_PROC(_kern, memorystatus_freeze_budget_multiplier, CTLTYPE_QUAD | CTLFLAG_RW, 0, 0, &sysctl_memorystatus_freeze_budget_multiplier, "Q", ""); /* * max. # of frozen process demotions we will allow in our daily cycle. */ -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_max_freeze_demotions_daily, &memorystatus_max_frozen_demotions_daily, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_max_freeze_demotions_daily, &memorystatus_max_frozen_demotions_daily, 0, UINT32_MAX, ""); /* * min # of thaws needed by a process to protect it from getting demoted into the IDLE band. */ -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_thaw_count_demotion_threshold, &memorystatus_thaw_count_demotion_threshold, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_thaw_count_demotion_threshold, &memorystatus_thaw_count_demotion_threshold, 0, UINT32_MAX, ""); /* * min # of global thaws needed for us to consider refreezing these processes. */ -EXPERIMENT_FACTOR_UINT(_kern, memorystatus_min_thaw_refreeze_threshold, &memorystatus_min_thaw_refreeze_threshold, 0, UINT32_MAX, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, memorystatus_min_thaw_refreeze_threshold, &memorystatus_min_thaw_refreeze_threshold, 0, UINT32_MAX, ""); #if DEVELOPMENT || DEBUG @@ -1441,7 +1441,7 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS return 0; } -EXPERIMENT_FACTOR_PROC(_vm, freeze_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, NULL, 0, sysctl_freeze_enabled, "I", ""); +EXPERIMENT_FACTOR_LEGACY_PROC(_vm, freeze_enabled, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, NULL, 0, sysctl_freeze_enabled, "I", ""); static void schedule_interval_reset(thread_call_t reset_thread_call, throttle_interval_t *interval) @@ -3086,36 +3086,41 @@ memorystatus_freeze_init_proc(proc_t p) } } - static int -sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS +sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS { -#pragma unused(oidp, arg1, arg2) - if (!req->newptr) { return EINVAL; } /* Need to be root or have entitlement */ - if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement( MEMORYSTATUS_ENTITLEMENT)) { + if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) { return EPERM; } - if (memorystatus_freeze_enabled == false) { - return ENOTSUP; - } - if (!VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { return ENOTSUP; } + if (!memorystatus_freeze_enabled && !memorystatus_swap_all_apps) { + /* Nothing to do. Swap is not enabled on this system. */ + assert3u(vm_compressor_get_swapped_segment_count(), ==, 0); + memorystatus_log("memorystatus: swap is disabled, bypassing fast-wake warmup"); + return 0; + } + + memorystatus_log("memorystatus: swapping-in all swapped-out compressor " + "segments\n"); + do_fastwake_warmup_all(); return 0; } -SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, - 0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", ""); +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_do_fastwake_warmup_all, + CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, &sysctl_memorystatus_do_fastwake_warmup_all, "I", + "Swap-in any compressed data that resides in swapfiles"); /* * Takes in a candidate list from the user_addr, validates it, and copies it into the list pointer. 
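Editor's note on the kill-options refactor carried by this patch: the kern_memorystatus.c hunks earlier in this file replace the trio of boolean parameters to the top-process kill path with a single flags argument, and the kern_memorystatus_internal.h hunk below declares that flags type (memstat_kill_options_t with MEMSTAT_ONLY_SWAPPABBLE, MEMSTAT_ONLY_LONG_IDLE, MEMSTAT_SORT_BUCKET). The standalone sketch below models the same pattern in plain userland C so the shape of the change is easy to see; the type and identifier names here are illustrative stand-ins, not the kernel definitions, and this is not part of the patch itself.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for an options bit mask like memstat_kill_options_t. */
typedef uint8_t kill_options_t;
#define KILL_ONLY_SWAPPABLE  ((kill_options_t)0x01)
#define KILL_ONLY_LONG_IDLE  ((kill_options_t)0x02)
#define KILL_SORT_BUCKET     ((kill_options_t)0x04)

/*
 * The callee unpacks the mask into locals, mirroring the refactored
 * memstat_kill_top_process() shape (function and flag names here are
 * hypothetical).
 */
static void
kill_top_process(uint32_t cause, kill_options_t options)
{
	bool only_long_idle = options & KILL_ONLY_LONG_IDLE;
	bool only_swappable = options & KILL_ONLY_SWAPPABLE;
	bool sort_bucket    = options & KILL_SORT_BUCKET;

	printf("cause=%u long_idle=%d swappable=%d sort=%d\n",
	    cause, only_long_idle, only_swappable, sort_bucket);
}

int
main(void)
{
	/* Callers OR flags together instead of threading three positional bools. */
	kill_top_process(1, KILL_SORT_BUCKET | KILL_ONLY_LONG_IDLE);
	return 0;
}

Compared with several positional booleans, a mask like this is self-documenting at call sites and can grow new options without changing the function signature, which appears to be the motivation for the signature change in this patch.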
diff --git a/bsd/kern/kern_memorystatus_internal.h b/bsd/kern/kern_memorystatus_internal.h index dd8723920..a25a7a471 100644 --- a/bsd/kern/kern_memorystatus_internal.h +++ b/bsd/kern/kern_memorystatus_internal.h @@ -121,9 +121,17 @@ OS_CLOSED_ENUM(memorystatus_action, uint32_t, MEMORYSTATUS_KILL_SWAPPABLE, // Kill a swap-eligible process (even if it's running) based on jetsam priority MEMORYSTATUS_KILL_IDLE, // Kill an idle process MEMORYSTATUS_KILL_LONG_IDLE, // Kill a long-idle process (reaper) + MEMORYSTATUS_NO_PAGING_SPACE, // Perform a no-paging-space-action + MEMORYSTATUS_PURGE_CACHES, // Purge system memory caches (e.g. corpses, deferred reclaim memory) MEMORYSTATUS_KILL_NONE, // Do nothing ); +__options_closed_decl(memstat_kill_options_t, uint8_t, { + MEMSTAT_ONLY_SWAPPABBLE = 0x01, + MEMSTAT_ONLY_LONG_IDLE = 0x02, + MEMSTAT_SORT_BUCKET = 0x04, +}); + /* * Structure to hold state for a jetsam thread. * Typically there should be a single jetsam thread @@ -136,6 +144,7 @@ typedef struct jetsam_state_s { thread_t thread; /* jetsam thread pointer */ int jld_idle_kills; /* idle jetsam kill counter for this session */ uint32_t errors; /* Error accumulator */ + bool errors_cleared; /* Have we tried clearing all errors this iteration? */ bool sort_flag; /* Sort the fg band (idle on macOS) before killing? */ bool corpse_list_purged; /* Has the corpse list been purged? */ bool post_snapshot; /* Do we need to post a jetsam snapshot after this session? */ @@ -149,7 +158,7 @@ typedef struct jetsam_state_s { * and will continue to act until the system is considered * healthy. */ -typedef struct memorystatus_system_health { +typedef struct memorystatus_system_health_s { #if CONFIG_JETSAM bool msh_available_pages_below_soft; bool msh_available_pages_below_idle; @@ -163,16 +172,28 @@ typedef struct memorystatus_system_health { bool msh_swapin_queue_over_limit; bool msh_pageout_starved; #endif /* CONFIG_JETSAM */ + bool msh_vm_pressure_warning; + bool msh_vm_pressure_critical; + bool msh_compressor_low_on_space; bool msh_compressor_exhausted; bool msh_swap_exhausted; bool msh_swap_low_on_space; bool msh_zone_map_is_exhausted; -} memorystatus_system_health_t; +} *memorystatus_system_health_t; -void memorystatus_log_system_health(const memorystatus_system_health_t *health); -bool memorystatus_is_system_healthy(const memorystatus_system_health_t *status); -/* Picks a kill cause given an unhealthy system status */ -uint32_t memorystatus_pick_kill_cause(const memorystatus_system_health_t *status); +/* + * @func memstat_check_system_health + * + * @brief Evaluate system memory conditions and return if the system is healthy. + * + * @discussion + * Evaluates various system memory conditions, including compressor size and + * available page quantities. If conditions indicate a kill should be + * performed, the system is considered "unhealthy". + * + * @returns @c true if the system is healthy, @c false otherwise. 
+ */ +extern bool memstat_check_system_health(memorystatus_system_health_t status); #pragma mark Locks @@ -193,6 +214,30 @@ extern int jld_idle_kill_candidates; extern _Atomic uint64_t last_no_space_action_ts; extern uint64_t no_paging_space_action_throttle_delay_ns; +#pragma mark Pressure Response Globals +extern uint64_t memstat_last_cache_purge_ts; +extern uint64_t memstat_cache_purge_backoff_ns; + +__options_decl(memstat_pressure_options_t, uint32_t, { + /* Kill long idle processes at kVMPressureWarning */ + MEMSTAT_WARNING_KILL_LONG_IDLE = 0x01, + /* Kill idle processes from the notify thread at kVMPressureWarning */ + MEMSTAT_WARNING_KILL_IDLE_THROTTLED = 0x02, + /* Purge memory caches (e.g. corpses, deferred reclaim rings) at kVMPressureCritical */ + MEMSTAT_CRITICAL_PURGE_CACHES = 0x04, + /* Kill all idle processes at kVMPressureCritical */ + MEMSTAT_CRITICAL_KILL_IDLE = 0x08, + /* Kill when at kVMPressureWarning for a prolonged period */ + MEMSTAT_WARNING_KILL_SUSTAINED = 0x10, +}); +/* Maximum value for sysctl handler */ +#define MEMSTAT_PRESSURE_CONFIG_MAX (0x18U) + +extern memstat_pressure_options_t memstat_pressure_config; + +#pragma mark Config Globals +extern boolean_t memstat_reaper_enabled; + #pragma mark VM globals read by the memorystatus subsystem extern unsigned int vm_page_free_count; @@ -302,6 +347,24 @@ _memstat_proc_is_dirty(proc_t p) return p->p_memstat_dirty & P_DIRTY_IS_DIRTY; } +/* + * Return true if this process is self-terminating via ActivityTracking. + */ +static inline bool +_memstat_proc_is_terminating(proc_t p) +{ + return p->p_memstat_dirty & P_DIRTY_TERMINATED; +} + +/* + * Return true if this process has been killed and is in the process of exiting. + */ +static inline bool +_memstat_proc_was_killed(proc_t p) +{ + return p->p_memstat_state & P_MEMSTAT_TERMINATED; +} + static inline bool _memstat_proc_is_internal(proc_t p) { @@ -315,6 +378,13 @@ _memstat_proc_can_idle_exit(proc_t p) (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT); } +static inline bool +_memstat_proc_shutdown_on_clean(proc_t p) +{ + return _memstat_proc_is_tracked(p) && + (p->p_memstat_dirty & P_DIRTY_SHUTDOWN_ON_CLEAN); +} + static inline bool _memstat_proc_has_priority_assertion(proc_t p) { @@ -485,6 +555,12 @@ uint32_t memstat_get_proccnt_upto_priority(uint32_t max_bucket_index); */ uint32_t memstat_get_idle_proccnt(void); +/* + * @func memstat_get_reapable_proccnt + * @brief Return the number of idle, reapable processes which may be terminated. 
+ */ +uint32_t memstat_get_long_idle_proccnt(void); + #pragma mark Freezer #if CONFIG_FREEZE /* diff --git a/bsd/kern/kern_memorystatus_notify.c b/bsd/kern/kern_memorystatus_notify.c index 738f7b4a5..296aa2225 100644 --- a/bsd/kern/kern_memorystatus_notify.c +++ b/bsd/kern/kern_memorystatus_notify.c @@ -142,7 +142,7 @@ kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_proces #define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */ #endif /* XNU_TARGET_OS_OSX */ -static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE; +static TUNABLE_DEV_WRITEABLE(uint32_t, vm_pressure_task_footprint_min, "vm_pressure_notify_min_footprint_mb", VM_PRESSURE_MINIMUM_RSIZE); #if DEVELOPMENT || DEBUG SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, ""); @@ -421,34 +421,25 @@ memorystatus_knote_unregister(struct knote *kn __unused) #if VM_PRESSURE_EVENTS -#if CONFIG_JETSAM - static thread_call_t sustained_pressure_handler_thread_call; -int memorystatus_should_kill_on_sustained_pressure = 1; /* Count the number of sustained pressure kills we've done since boot. */ uint64_t memorystatus_kill_on_sustained_pressure_count = 0; uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */ uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */ -#if DEVELOPMENT || DEBUG -SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, ""); -#endif /* DEVELOPMENT || DEBUG */ -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, ""); +SYSCTL_QUAD(_kern_memorystatus, OID_AUTO, kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, ""); +SYSCTL_QUAD(_kern_memorystatus, OID_AUTO, kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, ""); +SYSCTL_QUAD(_kern_memorystatus, OID_AUTO, kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, ""); static void sustained_pressure_handler(void*, void*); -#endif /* CONFIG_JETSAM */ + static thread_call_t memorystatus_notify_update_telemetry_thread_call; static void update_footprints_for_telemetry(void*, void*); - void memorystatus_notify_init() { -#if CONFIG_JETSAM sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE); -#endif /* CONFIG_JETSAM */ memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE); } @@ -708,17 +699,23 @@ memorystatus_is_foreground_locked(proc_t p) * to access the p_memstat_dirty field. 
*/ void -memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit) +memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit, boolean_t *is_active, boolean_t *is_managed, boolean_t *has_assertion) { if (!v) { *is_dirty = FALSE; *is_dirty_tracked = FALSE; *allow_idle_exit = FALSE; + *is_active = FALSE; + *is_managed = FALSE; + *has_assertion = FALSE; } else { proc_t p = (proc_t)v; *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0; *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0; *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0; + *is_active = (p->p_memstat_memlimit == p->p_memstat_memlimit_active); + *is_managed = (p->p_memstat_state & P_MEMSTAT_MANAGED) != 0; + *has_assertion = (p->p_memstat_state & P_MEMSTAT_PRIORITY_ASSERTION) != 0; } } @@ -783,7 +780,16 @@ CA_EVENT(memorystatus_pressure_interval, CA_INT, num_transitions, CA_INT, num_kills, CA_INT, duration); -static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry; + +/* Separate struct for tracking so that we have aligned members for atomics */ +struct memstat_cur_interval { + int64_t num_procs; + int64_t num_notifs; + int64_t num_transitions; + uint64_t start_mt; + _Atomic uint32_t num_kills; + vm_pressure_level_t max_level; +} memstat_cur_interval; CA_EVENT(memorystatus_proc_notification, CA_INT, footprint_before_notification, @@ -915,19 +921,15 @@ update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts) } extern char *proc_name_address(void *p); + /* - * Attempt to send the given level telemetry event. - * Finalizes the duration. - * Clears the src_event struct. + * Send pressure interval telemetry. */ static void -memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event) +memorystatus_pressure_interval_send(void) { - uint64_t duration_nanoseconds = 0; - uint64_t curr_ts = mach_absolute_time(); - src_event->duration = curr_ts - src_event->duration; - absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds); - src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC); + uint64_t duration_nanoseconds; + CA_EVENT_TYPE(memorystatus_pressure_interval) * evt_data; /* * Drop the event rather than block for memory. 
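memstat_cur_interval keeps most counters as plain fields updated under the klist lock, plus one _Atomic kill counter that the sustained-pressure path bumps with relaxed ordering (os_atomic_inc(..., relaxed) later in this file). A self-contained userspace analogue of that layout and update, with hypothetical example_* names:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct example_interval {
        int64_t  num_notifs;            /* updated only under a lock */
        uint64_t start_mt;
        _Atomic uint32_t num_kills;     /* may be bumped from another thread */
    };

    int
    main(void)
    {
        struct example_interval cur = { 0 };

        /* Userspace equivalent of os_atomic_inc(&cur.num_kills, relaxed). */
        atomic_fetch_add_explicit(&cur.num_kills, 1, memory_order_relaxed);

        printf("kills this interval: %u\n",
            atomic_load_explicit(&cur.num_kills, memory_order_relaxed));
        return 0;
    }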
We should be in a normal pressure level now, @@ -935,17 +937,23 @@ memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval */ ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT); if (event_wrapper) { - memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval))); - CA_EVENT_SEND(event_wrapper); - } - src_event->num_processes_registered = 0; - src_event->num_notifications_sent = 0; - src_event->max_level = 0; - src_event->num_transitions = 0; - src_event->num_kills = 0; - src_event->duration = 0; -} + absolutetime_to_nanoseconds( + mach_absolute_time() - memstat_cur_interval.start_mt, + &duration_nanoseconds); + evt_data = event_wrapper->data; + evt_data->num_processes_registered = memstat_cur_interval.num_procs; + evt_data->num_notifications_sent = memstat_cur_interval.num_notifs; + evt_data->max_level = memstat_cur_interval.max_level; + evt_data->num_transitions = memstat_cur_interval.num_transitions; + evt_data->num_kills = os_atomic_load(&memstat_cur_interval.num_kills, relaxed); + evt_data->duration = duration_nanoseconds / NSEC_PER_SEC; + + CA_EVENT_SEND(event_wrapper); + } else { + memorystatus_log_error("memorystatus: Dropping interval telemetry event\n"); + } +} /* * Attempt to send the per-proc telemetry events. @@ -955,7 +963,6 @@ static void memorystatus_pressure_proc_telemetry_send(void) { struct knote *kn = NULL; - memorystatus_klist_lock(); SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) { proc_t p = PROC_NULL; struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext; @@ -1000,21 +1007,8 @@ memorystatus_pressure_proc_telemetry_send(void) timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0; timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0; } - memorystatus_klist_unlock(); } -/* - * Send all telemetry associated with the increased pressure interval. 
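The interval duration above is a mach_absolute_time() delta converted with absolutetime_to_nanoseconds() and reported in whole seconds. In userspace the same conversion goes through mach_timebase_info(); a minimal runnable sketch:

    #include <mach/mach_time.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        uint64_t start_mt = mach_absolute_time();
        /* ... the elevated-pressure interval would run here ... */
        uint64_t delta_mt = mach_absolute_time() - start_mt;

        /* Userspace analogue of absolutetime_to_nanoseconds(). */
        uint64_t delta_ns = delta_mt * tb.numer / tb.denom;
        printf("duration: %llu s\n", (unsigned long long)(delta_ns / 1000000000ULL));
        return 0;
    }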
- */ -static void -memorystatus_pressure_telemetry_send(void) -{ - LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED); - memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry); - memorystatus_pressure_proc_telemetry_send(); -} - - /* * kn_max - knote * @@ -1286,12 +1280,49 @@ uint64_t next_critical_notification_sent_at_ts = 0; boolean_t memorystatus_manual_testing_on = FALSE; vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal; -unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE; +TUNABLE_DEV_WRITEABLE(unsigned int, memstat_sustained_pressure_max_pri, "memstat_sustained_pressure_max_pri", JETSAM_PRIORITY_IDLE); #if DEVELOPMENT || DEBUG -SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, ""); +SYSCTL_UINT(_kern_memorystatus, OID_AUTO, sustained_pressure_max_pri, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memstat_sustained_pressure_max_pri, 0, ""); #endif /* DEVELOPMENT || DEBUG */ #if CONFIG_JETSAM +#define MEMSTAT_PRESSURE_CONFIG_DEFAULT (MEMSTAT_WARNING_KILL_SUSTAINED) +#else +#define MEMSTAT_PRESSURE_CONFIG_DEFAULT (MEMSTAT_WARNING_KILL_IDLE_THROTTLED | MEMSTAT_CRITICAL_PURGE_CACHES) +#endif + +TUNABLE_WRITEABLE(memstat_pressure_options_t, memstat_pressure_config, + "memorystatus_pressure_config", MEMSTAT_PRESSURE_CONFIG_DEFAULT); +EXPERIMENT_FACTOR_UINT(memorystatus_pressure_config, &memstat_pressure_config, + 0, MEMSTAT_PRESSURE_CONFIG_MAX, + "Which actions to take in response to rising VM pressure"); +#if DEVELOPMENT || DEBUG +SYSCTL_UINT(_kern_memorystatus, OID_AUTO, pressure_config, + CTLFLAG_RW | CTLFLAG_LOCKED, &memstat_pressure_config, 0, + "How to respond to VM pressure"); + +static int +sysctl_memstat_should_kill_sustained SYSCTL_HANDLER_ARGS +{ + int old = !!(memstat_pressure_config & MEMSTAT_WARNING_KILL_SUSTAINED); + int new, changed; + + int ret = sysctl_io_number(req, old, sizeof(old), &new, &changed); + + if (changed) { + if (new) { + memstat_pressure_config |= MEMSTAT_WARNING_KILL_SUSTAINED; + } else { + memstat_pressure_config &= ~MEMSTAT_WARNING_KILL_SUSTAINED; + } + } + return ret; +} + +SYSCTL_PROC(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, + CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, sysctl_memstat_should_kill_sustained, "IU", + "Whether to kill idle processes under sustained pressure"); +#endif /* * TODO(jason): The memorystatus thread should be responsible for this @@ -1312,7 +1343,7 @@ sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused) * If the pressure hasn't been relieved by then, the problem is memory * consumption in a higher band and this churn is probably doing more harm than good. */ - max_kills = memstat_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2; + max_kills = memstat_get_proccnt_upto_priority(memstat_sustained_pressure_max_pri) * 2; memorystatus_log("memorystatus: Pressure level has been elevated for too long. 
killing up to %d idle processes\n", max_kills); while (memorystatus_vm_pressure_level != kVMPressureNormal && kill_count < max_kills) { bool killed = memorystatus_kill_on_sustained_pressure(); @@ -1323,8 +1354,7 @@ sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused) delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC)); kill_count++; memorystatus_kill_on_sustained_pressure_count++; - /* TODO(jason): Should use os_atomic but requires rdar://76310894. */ - memorystatus_pressure_interval_telemetry.num_kills++; + os_atomic_inc(&memstat_cur_interval.num_kills, relaxed); } else { /* Nothing left to kill */ break; @@ -1335,8 +1365,6 @@ sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused) } } -#endif /* CONFIG_JETSAM */ - /* * Returns the number of processes registered for notifications at this level. */ @@ -1355,6 +1383,48 @@ memorystatus_klist_length(int level) return count; } +/* + * Starts a pressure interval, setting up tracking for it + */ +static void +memstat_pressure_interval_start(uint64_t curr_ts) +{ + LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED); + memstat_cur_interval.num_procs = 0; + memstat_cur_interval.num_notifs = 0; + memstat_cur_interval.num_transitions = 0; + memstat_cur_interval.start_mt = curr_ts; + os_atomic_store(&memstat_cur_interval.num_kills, 0, relaxed); + memstat_cur_interval.max_level = kVMPressureNormal; +} + +/* + * Ends a pressure interval, sending all telemetry associated with it + */ +static void +memstat_pressure_interval_end(void) +{ + LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED); + memorystatus_pressure_interval_send(); + memorystatus_pressure_proc_telemetry_send(); +} + +/* + * Updates the pressure interval when the pressure level changes + */ +static void +memstat_pressure_interval_update(vm_pressure_level_t new_level) +{ + LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED); + memstat_cur_interval.num_transitions++; + if (new_level <= memstat_cur_interval.max_level) { + return; + } + memstat_cur_interval.num_procs = memorystatus_klist_length(new_level); + memstat_cur_interval.max_level = new_level; +} + + /* * Updates the footprint telemetry for procs that have received notifications. */ @@ -1421,14 +1491,12 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) * by immediately killing idle exitable processes. We use a delay * to avoid overkill. And we impose a max counter as a fail safe * in case daemons re-launch too fast. - * - * TODO: These jetsams should be performed on the memorystatus thread. We can - * provide the similar false-idle mitigation by skipping processes with med/high - * relaunch probability and/or using the sustained-pressure mechanism. - * (rdar://134075608) */ - while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) { - if (!memstat_kill_idle_process(kMemorystatusKilledIdleExit, NULL)) { + while (memstat_pressure_config & MEMSTAT_WARNING_KILL_IDLE_THROTTLED && + memorystatus_vm_pressure_level != kVMPressureNormal && + idle_kill_counter < MAX_IDLE_KILLS) { + uint64_t footprint; + if (!memstat_kill_idle_process(kMemorystatusKilledIdleExit, &footprint)) { /* No idle exitable processes left to kill */ break; } @@ -1440,7 +1508,7 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) * the pressure notification scheme. 
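memstat_pressure_interval_start/_update/_end replace the old single telemetry struct with an explicit lifecycle driven by level transitions at the call site in memorystatus_update_vm_pressure(). A self-contained conceptual sketch of how the three helpers relate, mirroring the order used there; the example_* stubs are hypothetical stand-ins, not the real control flow:

    #include <stdio.h>

    enum { EX_NORMAL = 0, EX_WARNING = 1, EX_CRITICAL = 2 };

    static void example_interval_start(void)     { printf("start: reset counters\n"); }
    static void example_interval_update(int lvl) { printf("update: transition to %d\n", lvl); }
    static void example_interval_end(void)       { printf("end: send telemetry\n"); }

    /* One interval spans the whole time the level stays above normal. */
    static void
    example_on_level_change(int prev, int cur)
    {
        if (prev == cur) {
            return;
        }
        if (cur == EX_NORMAL) {
            example_interval_end();          /* back to normal: report and close */
        } else if (prev == EX_NORMAL) {
            example_interval_start();        /* leaving normal: new interval */
        }
        example_interval_update(cur);        /* record the transition */
    }

    int
    main(void)
    {
        example_on_level_change(EX_NORMAL, EX_WARNING);
        example_on_level_change(EX_WARNING, EX_CRITICAL);
        example_on_level_change(EX_CRITICAL, EX_NORMAL);
        return 0;
    }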
*/ } else { - delay(1000000); /* 1 second */ + delay(1 * USEC_PER_SEC); } } #endif /* !CONFIG_JETSAM */ @@ -1476,26 +1544,24 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) } } -#if CONFIG_JETSAM - if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) { - if (memorystatus_should_kill_on_sustained_pressure) { + if (memstat_pressure_config & MEMSTAT_WARNING_KILL_SUSTAINED) { + if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) { memorystatus_log("memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam\n", memorystatus_vm_pressure_level); thread_call_cancel(sustained_pressure_handler_thread_call); + } else if (memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) { + /* + * Pressure has increased from normal. + * Hopefully the notifications will relieve it, + * but as a fail-safe we'll trigger jetsam + * after a configurable amount of time. + */ + memorystatus_log("memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level); + uint64_t kill_time; + nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time); + kill_time += mach_absolute_time(); + thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time); } - } else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) { - /* - * Pressure has increased from normal. - * Hopefully the notifications will relieve it, - * but as a fail-safe we'll trigger jetsam - * after a configurable amount of time. - */ - memorystatus_log("memorystatus: Pressure level has increased from %d to %d. 
Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level); - uint64_t kill_time; - nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time); - kill_time += mach_absolute_time(); - thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time); } -#endif /* CONFIG_JETSAM */ while (1) { /* @@ -1523,21 +1589,41 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) continue; } } - if (level_snapshot == kVMPressureNormal) { - memorystatus_pressure_telemetry_send(); - } + prev_level_snapshot = level_snapshot; smoothing_window_started = FALSE; + + if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE && + level_snapshot >= kVMPressureWarning && + memstat_get_long_idle_proccnt() > 0) { + /* There are long-idle daemons to kill */ + memorystatus_thread_wake(); + } else if (level_snapshot == kVMPressureCritical) { + if (memstat_pressure_config & MEMSTAT_CRITICAL_PURGE_CACHES) { + uint64_t now = mach_absolute_time(); + uint64_t delta_ns; + absolutetime_to_nanoseconds(now - memstat_last_cache_purge_ts, &delta_ns); + if (delta_ns >= memstat_cache_purge_backoff_ns) { + /* Wake up the jetsam thread to purge caches */ + memorystatus_thread_wake(); + } + } else if (memstat_pressure_config & MEMSTAT_CRITICAL_KILL_IDLE && + memstat_get_idle_proccnt() > 0) { + memorystatus_thread_wake(); + } + } + memorystatus_klist_lock(); - if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) { - memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot); - memorystatus_pressure_interval_telemetry.max_level = level_snapshot; - memorystatus_pressure_interval_telemetry.num_transitions++; - if (memorystatus_pressure_interval_telemetry.duration == 0) { - /* Set the start timestamp. Duration will be finalized when we send the event. */ - memorystatus_pressure_interval_telemetry.duration = curr_ts; + /* Interval tracking & telemetry */ + if (prev_level_snapshot != level_snapshot) { + if (level_snapshot == kVMPressureNormal) { + memstat_pressure_interval_end(); + } else if (prev_level_snapshot == kVMPressureNormal) { + memstat_pressure_interval_start(curr_ts); } + + memstat_pressure_interval_update(level_snapshot); } kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update); @@ -1624,10 +1710,16 @@ memorystatus_update_vm_pressure(boolean_t target_foreground_process) } } } + if (level_snapshot != kVMPressureNormal) { - mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot), - (uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent)); - memorystatus_pressure_interval_telemetry.num_notifications_sent++; + uint16_t num_notifications; + if (os_convert_overflow(memstat_cur_interval.num_notifs, &num_notifications)) { + num_notifications = UINT16_MAX; + } + mark_knote_send_time(kn_max, task, + convert_internal_pressure_level_to_dispatch_level(level_snapshot), + num_notifications); + memstat_cur_interval.num_notifs++; } KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? 
kMemorystatusPressure : kMemorystatusNoPressure); diff --git a/bsd/kern/kern_memorystatus_policy.c b/bsd/kern/kern_memorystatus_policy.c index 945474c85..2065fd558 100644 --- a/bsd/kern/kern_memorystatus_policy.c +++ b/bsd/kern/kern_memorystatus_policy.c @@ -80,12 +80,15 @@ extern uint64_t memstat_reaper_min_age_secs; extern uint64_t memstat_oldest_reapable_proc_will_be_reapable_at_ts_matu; extern bool memstat_reaper_is_currently_sweeping; +extern vm_pressure_level_t memorystatus_vm_pressure_level; + static void -memorystatus_health_check(memorystatus_system_health_t *status) +memstat_evaluate_health_conditions(memorystatus_system_health_t status) { memset(status, 0, sizeof(memorystatus_system_health_t)); - status->msh_compressor_exhausted = vm_compressor_low_on_space() || + status->msh_compressor_low_on_space = vm_compressor_low_on_space() || os_atomic_load(&memorystatus_compressor_space_shortage, relaxed); + status->msh_compressor_exhausted = vm_compressor_out_of_space(); status->msh_swap_low_on_space = vm_swap_low_on_space(); status->msh_swap_exhausted = vm_swap_out_of_space(); #if CONFIG_JETSAM @@ -108,27 +111,158 @@ memorystatus_health_check(memorystatus_system_health_t *status) status->msh_pageout_starved = os_atomic_load(&memorystatus_pageout_starved, relaxed); status->msh_swappable_compressor_segments_over_limit = memorystatus_swap_over_trigger(100); status->msh_swapin_queue_over_limit = memorystatus_swapin_over_trigger(); +#else /* !CONFIG_JETSAM */ + vm_pressure_level_t pressure_level = memorystatus_vm_pressure_level; + status->msh_vm_pressure_critical = (pressure_level == kVMPressureCritical); + status->msh_vm_pressure_warning = (pressure_level >= kVMPressureWarning); #endif /* CONFIG_JETSAM */ status->msh_zone_map_is_exhausted = os_atomic_load(&memorystatus_zone_map_is_exhausted, relaxed); } -bool -memorystatus_is_system_healthy(const memorystatus_system_health_t *status) +static bool +memstat_is_system_healthy(const memorystatus_system_health_t status) { #if CONFIG_JETSAM return !(status->msh_available_pages_below_critical || status->msh_compressor_is_thrashing || status->msh_compressor_exhausted || + status->msh_compressor_low_on_space || status->msh_filecache_is_thrashing || status->msh_zone_map_is_exhausted || status->msh_pageout_starved); #else /* CONFIG_JETSAM */ return !(status->msh_zone_map_is_exhausted || status->msh_compressor_exhausted || - status->msh_swap_exhausted); + status->msh_compressor_low_on_space || + status->msh_swap_exhausted || + status->msh_swap_low_on_space || + status->msh_vm_pressure_critical || + status->msh_vm_pressure_warning); #endif /* CONFIG_JETSAM */ } +static void +memstat_log_system_health(const memorystatus_system_health_t status) +{ + static struct memorystatus_system_health_s prev_status = {0}; + + bool healthy = memstat_is_system_healthy(status); + + /* + * Avoid spamming logs by only logging when the system status has changed. 
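The change detection above is written out field by field against a static prev_status. The generic log-on-change pattern, shown with a hypothetical two-field struct (memcmp() is safe here only because the example struct has no padding; the kernel code compares the fields explicitly):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct example_state {
        bool compressor_exhausted;
        bool swap_exhausted;
    };

    static void
    example_log_if_changed(const struct example_state *cur)
    {
        static struct example_state prev;   /* zero-initialized on first use */

        if (memcmp(&prev, cur, sizeof(prev)) == 0) {
            return;                         /* nothing changed: stay quiet */
        }
        printf("state changed: compressor=%d swap=%d\n",
            cur->compressor_exhausted, cur->swap_exhausted);
        prev = *cur;
    }

    int
    main(void)
    {
        struct example_state s = { .compressor_exhausted = true };
        example_log_if_changed(&s);     /* logs */
        example_log_if_changed(&s);     /* silent */
        return 0;
    }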
+ */ + if (prev_status.msh_zone_map_is_exhausted == status->msh_zone_map_is_exhausted && + prev_status.msh_compressor_exhausted == status->msh_compressor_exhausted && + prev_status.msh_swap_low_on_space == status->msh_swap_low_on_space && + prev_status.msh_swap_exhausted == status->msh_swap_exhausted +#if CONFIG_JETSAM + && + prev_status.msh_available_pages_below_idle == status->msh_available_pages_below_idle && + prev_status.msh_available_pages_below_soft == status->msh_available_pages_below_soft && + prev_status.msh_available_pages_below_critical == status->msh_available_pages_below_critical && + prev_status.msh_available_pages_below_reaper == status->msh_available_pages_below_reaper && + prev_status.msh_compressor_needs_to_swap == status->msh_compressor_needs_to_swap && + prev_status.msh_compressor_is_thrashing == status->msh_compressor_is_thrashing && + prev_status.msh_filecache_is_thrashing == status->msh_filecache_is_thrashing && + prev_status.msh_phantom_cache_pressure == status->msh_phantom_cache_pressure && + prev_status.msh_swapin_queue_over_limit == status->msh_swapin_queue_over_limit && + prev_status.msh_pageout_starved == status->msh_pageout_starved +#endif /* CONFIG_JETSAM */ + ) { + /* No change */ + return; + } + +#if CONFIG_JETSAM + if (healthy) { + if (status->msh_available_pages_below_soft) { + memorystatus_log( + "memorystatus: System will begin enforcing " + "soft memory limits. " + "memorystatus_available_pages: %llu compressor_size: %u\n", + (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); + } else if (status->msh_available_pages_below_idle) { + memorystatus_log( + "memorystatus: System will begin enacting " + "idle-exits. " + "memorystatus_available_pages: %llu compressor_size: %u\n", + (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); + } else if (status->msh_available_pages_below_reaper) { + memorystatus_log( + "memorystatus: System will begin reaping " + "long-idle processes. " + "memorystatus_available_pages: %llu compressor_size: %u\n", + (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); + } else { + memorystatus_log( + "memorystatus: System is healthy. " + "memorystatus_available_pages: %llu compressor_size:%u\n", + (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); + } + } else { + /* Unhealthy */ + memorystatus_log("memorystatus: System is unhealthy! 
memorystatus_available_pages: %llu compressor_size:%u\n", + (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); + memorystatus_log( + "memorystatus: {" + "\"available_pages_below_critical\": %d, " + "\"available_pages_below_idle\": %d, " + "\"available_pages_below_soft\": %d, " + "\"available_pages_below_reaper\": %d, " + "\"compressor_needs_to_swap\": %d, " + "\"compressor_exhausted\": %d, " + "\"compressor_is_thrashing\": %d, " + "\"filecache_is_thrashing\": %d, " + "\"zone_map_is_exhausted\": %d, " + "\"phantom_cache_pressure\": %d, " + "\"swappable_compressor_segments_over_limit\": %d, " + "\"swapin_queue_over_limit\": %d, " + "\"swap_low\": %d, " + "\"swap_exhausted\": %d" + "}\n", + status->msh_available_pages_below_critical, + status->msh_available_pages_below_idle, + status->msh_available_pages_below_soft, + status->msh_available_pages_below_reaper, + status->msh_compressor_needs_to_swap, + status->msh_compressor_exhausted, + status->msh_compressor_is_thrashing, + status->msh_filecache_is_thrashing, + status->msh_zone_map_is_exhausted, + status->msh_phantom_cache_pressure, + status->msh_swappable_compressor_segments_over_limit, + status->msh_swapin_queue_over_limit, + status->msh_swap_low_on_space, + status->msh_swap_exhausted); + } +#else /* CONFIG_JETSAM */ + memorystatus_log("memorystatus: System is %s. memorystatus_available_pages: %llu compressor_size:%u\n", + healthy ? "healthy" : "unhealthy", + (uint64_t)MEMORYSTATUS_LOG_AVAILABLE_PAGES, vm_compressor_pool_size()); + if (!healthy) { + memorystatus_log( + "memorystatus: {" + "\"compressor_exhausted\": %d, " + "\"zone_map_is_exhausted\": %d, " + "\"swap_low\": %d, " + "\"swap_exhausted\": %d" + "}\n", + status->msh_compressor_exhausted, + status->msh_zone_map_is_exhausted, + status->msh_swap_low_on_space, + status->msh_swap_exhausted); + } +#endif /* CONFIG_JETSAM */ + prev_status = *status; +} + +bool +memstat_check_system_health(memorystatus_system_health_t status) +{ + memstat_evaluate_health_conditions(status); + memstat_log_system_health(status); + return memstat_is_system_healthy(status); +} #pragma mark Memorystatus Thread Actions @@ -136,6 +270,45 @@ memorystatus_is_system_healthy(const memorystatus_system_health_t *status) * This section picks the appropriate memorystatus_action & deploys it. 
*/ +uint64_t memstat_last_cache_purge_ts; +/* Purge caches under critical pressure up to every 1 min */ +TUNABLE(uint64_t, memstat_cache_purge_backoff_ns, + "memorystatus_cache_purge_backoff_ns", 1 * 60 * NSEC_PER_SEC); + +static uint32_t +memorystatus_pick_kill_cause(const memorystatus_system_health_t status) +{ + assert(!memstat_is_system_healthy(status)); +#if CONFIG_JETSAM + if (status->msh_compressor_is_thrashing) { + return kMemorystatusKilledVMCompressorThrashing; + } else if (status->msh_compressor_exhausted) { + return kMemorystatusKilledVMCompressorSpaceShortage; + } else if (status->msh_swap_low_on_space) { + return kMemorystatusKilledLowSwap; + } else if (status->msh_filecache_is_thrashing) { + return kMemorystatusKilledFCThrashing; + } else if (status->msh_zone_map_is_exhausted) { + return kMemorystatusKilledZoneMapExhaustion; + } else if (status->msh_pageout_starved) { + return kMemorystatusKilledVMPageoutStarvation; + } else { + assert(status->msh_available_pages_below_critical); + return kMemorystatusKilledVMPageShortage; + } +#else /* CONFIG_JETSAM */ + if (status->msh_zone_map_is_exhausted) { + return kMemorystatusKilledZoneMapExhaustion; + } else if (status->msh_compressor_exhausted) { + return kMemorystatusKilledVMCompressorSpaceShortage; + } else if (status->msh_swap_exhausted) { + return kMemorystatusKilledLowSwap; + } else { + return kMemorystatusKilled; + } +#endif /* CONFIG_JETSAM */ +} + /* * Inspects the state of various resources in the system to see if * the system is healthy. If the system is not healthy, picks a @@ -153,10 +326,8 @@ memorystatus_pick_action(jetsam_state_t state, bool swappable_apps_remaining, int *jld_idle_kills) { - memorystatus_system_health_t status; - memorystatus_health_check(&status); - memorystatus_log_system_health(&status); - bool is_system_healthy = memorystatus_is_system_healthy(&status); + struct memorystatus_system_health_s status; + bool is_system_healthy = memstat_check_system_health(&status); #if CONFIG_JETSAM if (status.msh_available_pages_below_soft || !is_system_healthy) { @@ -195,7 +366,7 @@ memorystatus_pick_action(jetsam_state_t state, } } - if (status.msh_compressor_exhausted) { + if (status.msh_compressor_exhausted || status.msh_compressor_low_on_space) { *kill_cause = kMemorystatusKilledVMCompressorSpaceShortage; return MEMORYSTATUS_KILL_TOP_PROCESS; } @@ -255,6 +426,7 @@ memorystatus_pick_action(jetsam_state_t state, (void) jld_idle_kills; (void) suspended_swappable_apps_remaining; (void) swappable_apps_remaining; + (void) highwater_remaining; /* * Without CONFIG_JETSAM, we only kill if the system is unhealthy. 
@@ -265,29 +437,75 @@ memorystatus_pick_action(jetsam_state_t state, *kill_cause = 0; return MEMORYSTATUS_KILL_NONE; } - if (highwater_remaining) { - *kill_cause = kMemorystatusKilledHiwat; - return MEMORYSTATUS_KILL_HIWATER; - } *kill_cause = memorystatus_pick_kill_cause(&status); if (status.msh_zone_map_is_exhausted) { return MEMORYSTATUS_KILL_TOP_PROCESS; - } else if (status.msh_compressor_exhausted || status.msh_swap_exhausted) { + } + if (status.msh_compressor_exhausted || status.msh_swap_exhausted) { if (kill_on_no_paging_space) { return MEMORYSTATUS_KILL_TOP_PROCESS; - } else if (memstat_get_idle_proccnt() > 0) { + } + } + if (status.msh_compressor_low_on_space || status.msh_swap_low_on_space) { + if (memstat_get_idle_proccnt() > 0) { + /* Kill all idle processes before invoking the no paging space action */ return MEMORYSTATUS_KILL_IDLE; + } + /* + * Throttle how often the no-paging-space action is performed. + */ + uint64_t now = mach_absolute_time(); + uint64_t delta_since_last_no_space_ns; + uint64_t last_action_ts = os_atomic_load(&last_no_space_action_ts, relaxed); + assert3u(now, >=, last_action_ts); + absolutetime_to_nanoseconds(now - last_action_ts, &delta_since_last_no_space_ns); + if (delta_since_last_no_space_ns > no_paging_space_action_throttle_delay_ns) { + return MEMORYSTATUS_NO_PAGING_SPACE; } else { - /* - * The no paging space action will be performed synchronously by the the - * thread performing the compression/swap. - */ return MEMORYSTATUS_KILL_NONE; } - } else { - panic("System is unhealthy but compressor, swap, and zone map are not exhausted"); + } + if (status.msh_vm_pressure_critical) { + /* + * The system is under critical memory pressure. First terminate any low-risk + * idle processes. When they are exhausted, purge system memory caches. + */ + if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE && + memstat_get_long_idle_proccnt() > 0) { + *kill_cause = kMemorystatusKilledLongIdleExit; + return MEMORYSTATUS_KILL_LONG_IDLE; + } + if (memstat_pressure_config & MEMSTAT_CRITICAL_KILL_IDLE && + memstat_get_idle_proccnt() > 0) { + *kill_cause = kMemorystatusKilledIdleExit; + return MEMORYSTATUS_KILL_IDLE; + } + if (memstat_pressure_config & MEMSTAT_CRITICAL_PURGE_CACHES) { + uint64_t now = mach_absolute_time(); + uint64_t delta_ns; + uint64_t last_purge_ts = os_atomic_load(&memstat_last_cache_purge_ts, relaxed); + assert3u(now, >=, last_purge_ts); + absolutetime_to_nanoseconds(now - last_purge_ts, &delta_ns); + if (delta_ns > memstat_cache_purge_backoff_ns) { + memstat_last_cache_purge_ts = now; + return MEMORYSTATUS_PURGE_CACHES; + } + } + return MEMORYSTATUS_KILL_NONE; + } else if (status.msh_vm_pressure_warning) { + /* + * The system is under pressure and is likely to start swapping soon. Reap + * any long-idle daemons. 
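The !CONFIG_JETSAM branch above is a priority-ordered ladder: zone-map exhaustion first, then paging-space exhaustion, then the low-on-space path (idle kills before the throttled no-paging-space action), then the pressure-driven responses. A condensed, self-contained sketch of that ordering; the enum constants, predicates, and parameter names are hypothetical stand-ins, and the memstat_pressure_config gating is omitted for brevity:

    #include <stdbool.h>
    #include <stdint.h>

    enum example_action {
        EX_KILL_NONE, EX_KILL_TOP, EX_KILL_IDLE, EX_KILL_LONG_IDLE,
        EX_NO_PAGING_SPACE, EX_PURGE_CACHES,
    };

    enum example_action
    example_pick_action(bool zone_exhausted, bool space_exhausted, bool space_low,
        bool pressure_critical, bool pressure_warning, bool kill_on_no_paging_space,
        uint32_t idle_procs, uint32_t long_idle_procs,
        bool purge_backoff_elapsed, bool no_space_throttle_elapsed)
    {
        if (zone_exhausted) {
            return EX_KILL_TOP;                 /* zone map exhaustion always kills */
        }
        if (space_exhausted && kill_on_no_paging_space) {
            return EX_KILL_TOP;                 /* opt-in immediate kill */
        }
        if (space_low) {
            if (idle_procs > 0) {
                return EX_KILL_IDLE;            /* idle exits before user impact */
            }
            return no_space_throttle_elapsed ? EX_NO_PAGING_SPACE : EX_KILL_NONE;
        }
        if (pressure_critical) {
            if (long_idle_procs > 0) {
                return EX_KILL_LONG_IDLE;       /* reap long-idle daemons first */
            }
            if (idle_procs > 0) {
                return EX_KILL_IDLE;
            }
            return purge_backoff_elapsed ? EX_PURGE_CACHES : EX_KILL_NONE;
        }
        if (pressure_warning && long_idle_procs > 0) {
            return EX_KILL_LONG_IDLE;
        }
        return EX_KILL_NONE;
    }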
+ */ + if (memstat_pressure_config & MEMSTAT_WARNING_KILL_LONG_IDLE && + memstat_get_long_idle_proccnt() > 0) { + *kill_cause = kMemorystatusKilledLongIdleExit; + return MEMORYSTATUS_KILL_LONG_IDLE; + } + return MEMORYSTATUS_KILL_NONE; } #endif /* CONFIG_JETSAM */ + panic("System is unhealthy but no action has been chosen"); } #pragma mark Aggressive Jetsam diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index c5219604a..98757f29b 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -66,8 +66,10 @@ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ +#include #include #include +#include #include #include #include @@ -112,6 +114,7 @@ extern vm_map_t bsd_pageable_map; #include #include +#include #include #include @@ -165,6 +168,9 @@ static int osenvironment_initialized = 0; static uint32_t ephemeral_storage = 0; static uint32_t use_recovery_securityd = 0; +static char *mempath = NULL; +static size_t mempath_size = 0; + static struct { uint32_t ephemeral_storage:1; uint32_t use_recovery_securityd:1; @@ -575,7 +581,7 @@ sysctl_hw_generic(__unused struct sysctl_oid *oidp, void *arg1, #endif case HW_USERMEM: { - int usermem = (int)(mem_size - vm_page_wire_count * page_size); + int usermem = (int)(max_mem - vm_page_wire_count * page_size); return SYSCTL_RETURN(req, usermem); } @@ -876,6 +882,55 @@ sysctl_serialdebugmode return sysctl_io_number(req, serialdebugmode, sizeof(serialdebugmode), NULL, NULL); } +/* + * This sysctl is a string that contains the jetsam properties path used by launchd to apply. + * jetsam properties to service. This sysctl is set once by launchd at boot and after userspace reboots, + * before it spawns any services. + */ +#define kReadOnlyMempathEntitlement "com.apple.private.kernel.mempath-read-only" +static int +sysctl_mempath +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int error = EINVAL; + if (req->newptr != 0) { + /* initproc is the only process that can write to this sysctl */ + if (proc_getpid(req->p) != 1) { + return EPERM; + } + if (req->newlen > PATH_MAX) { + return EOVERFLOW; + } + size_t mempath_new_size = req->newlen + 1; + char *mempath_new = kalloc_data(mempath_new_size, Z_WAITOK); + if (!mempath_new) { + return ENOMEM; + } + mempath_new[mempath_new_size - 1] = '\0'; + error = SYSCTL_IN(req, mempath_new, mempath_new_size - 1); + if (0 != error) { + kfree_data(mempath_new, mempath_new_size); + return error; + } + /* copy in was successful; swap out old/new buffers */ + if (NULL != mempath) { + kfree_data(mempath, mempath_size); + } + mempath = mempath_new; + mempath_size = mempath_new_size; + } else { + /* A read entitlement is required to read this sysctl */ + if (!IOCurrentTaskHasEntitlement(kReadOnlyMempathEntitlement)) { + return EPERM; + } + error = EIO; + if (mempath && mempath_size) { + error = SYSCTL_OUT(req, mempath, mempath_size); + } + } + return error; +} + /* * hw.* MIB variables. 
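The mempath handler above accepts writes only from launchd (pid 1) and gates reads on com.apple.private.kernel.mempath-read-only; the OID is registered as hw.mempath just below. A userspace sketch of a read, assuming the calling task holds that entitlement (otherwise the probe fails with EPERM, and EIO means launchd has not set the path yet):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
        size_t len = 0;

        /* Probe the length first, then fetch the NUL-terminated path. */
        if (sysctlbyname("hw.mempath", NULL, &len, NULL, 0) != 0) {
            perror("hw.mempath");
            return 1;
        }
        char *path = malloc(len);
        if (path == NULL || sysctlbyname("hw.mempath", path, &len, NULL, 0) != 0) {
            perror("hw.mempath");
            free(path);
            return 1;
        }
        printf("jetsam properties path: %s\n", path);
        free(path);
        return 0;
    }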
*/ @@ -937,6 +992,7 @@ SYSCTL_PROC(_hw, OID_AUTO, ephemeral_storage, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG SYSCTL_PROC(_hw, OID_AUTO, use_recovery_securityd, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_use_recovery_securityd, "I", ""); SYSCTL_PROC(_hw, OID_AUTO, use_kernelmanagerd, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_use_kernelmanagerd, "I", ""); SYSCTL_PROC(_hw, OID_AUTO, serialdebugmode, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_serialdebugmode, "I", ""); +SYSCTL_PROC(_hw, OID_AUTO, mempath, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_mempath, "A", ""); /* * hw.perflevelN.* variables. diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 0599755c0..b23eb09d7 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -1168,10 +1168,12 @@ mprotect_sanitize( * check unaligned start due to UNIX SPEC: user address is not page-aligned, * return EINVAL */ + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_CHECK_ALIGNED_START | + VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH; + + result = vm_sanitize_addr_size(user_addr_u, user_size_u, - VM_SANITIZE_CALLER_MPROTECT, user_map, - VM_SANITIZE_FLAGS_CHECK_ALIGNED_START | - VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, + VM_SANITIZE_CALLER_MPROTECT, user_map, flags, user_addr, user_end_aligned, user_size); if (__improbable(result != KERN_SUCCESS)) { return result; @@ -1325,10 +1327,11 @@ minherit_sanitize( kern_return_t result; mach_vm_offset_t addr_end; + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH; + + result = vm_sanitize_addr_size(addr_u, size_u, VM_SANITIZE_CALLER_MINHERIT, - user_map, - VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, addr, - &addr_end, size); + user_map, flags, addr, &addr_end, size); if (__improbable(result != KERN_SUCCESS)) { return result; } @@ -1397,10 +1400,11 @@ madvise_sanitize( mach_vm_offset_t *end, mach_vm_size_t *size) { + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH; + + return vm_sanitize_addr_size(addr_u, len_u, VM_SANITIZE_CALLER_MADVISE, - user_map, - VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, - start, end, size); + user_map, flags, start, end, size); } int @@ -1510,8 +1514,10 @@ mincore_sanitize( mach_vm_offset_t *end, mach_vm_size_t *size) { + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + return vm_sanitize_addr_size(addr_u, len_u, VM_SANITIZE_CALLER_MINCORE, - map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, addr, end, size); + map, flags, addr, end, size); } int diff --git a/bsd/kern/kern_newsysctl.c b/bsd/kern/kern_newsysctl.c index 4d57bb531..26aacccad 100644 --- a/bsd/kern/kern_newsysctl.c +++ b/bsd/kern/kern_newsysctl.c @@ -77,6 +77,7 @@ #include #include #include +#include #include @@ -1676,16 +1677,36 @@ sysctl_new_user(struct sysctl_req *req, void *p, size_t l) return error; } -#define WRITE_EXPERIMENT_FACTORS_ENTITLEMENT "com.apple.private.write-kr-experiment-factors" +const char *trial_experiment_factors_entitlement = "com.apple.private.kernel.read-write-trial-experiment-factors"; + +/* + * Is the current task allowed to read/write trial experiment factors? 
+ * Requires either: + * - trial_experiment_factors_entitlement + * - root user (internal-diagnostics only) + */ +STATIC bool +can_rw_trial_experiment_factors(struct sysctl_req *req) +{ + if (IOTaskHasEntitlement(proc_task(req->p), trial_experiment_factors_entitlement)) { + return true; + } + if (os_variant_has_internal_diagnostics("com.apple.xnu")) { + return !proc_suser(req->p); + } + return false; +} + +#define WRITE_LEGACY_EXPERIMENT_FACTORS_ENTITLEMENT "com.apple.private.write-kr-experiment-factors" /* * Is the current task allowed to write to experiment factors? * tasks with the WRITE_EXPERIMENT_FACTORS_ENTITLEMENT are always allowed to write these. * In the development / debug kernel we also allow root to write them. */ STATIC bool -can_write_experiment_factors(__unused struct sysctl_req *req) +can_write_legacy_experiment_factors(__unused struct sysctl_req *req) { - if (IOCurrentTaskHasEntitlement(WRITE_EXPERIMENT_FACTORS_ENTITLEMENT)) { + if (IOCurrentTaskHasEntitlement(WRITE_LEGACY_EXPERIMENT_FACTORS_ENTITLEMENT)) { return true; } #if DEBUG || DEVELOPMENT @@ -1832,13 +1853,20 @@ found: goto err; } + if (oid->oid_kind & CTLFLAG_EXPERIMENT && req->p) { + if (!can_rw_trial_experiment_factors(req)) { + error = (EPERM); + goto err; + } + } + if (req->newptr && req->p) { - if (oid->oid_kind & CTLFLAG_EXPERIMENT) { + if (oid->oid_kind & CTLFLAG_LEGACY_EXPERIMENT) { /* * Experiment factors have different permissions since they need to be * writable by procs with WRITE_EXPERIMENT_FACTORS_ENTITLEMENT. */ - if (!can_write_experiment_factors(req)) { + if (!can_write_legacy_experiment_factors(req)) { error = (EPERM); goto err; } @@ -2223,6 +2251,9 @@ scalable_counter_sysctl_handler SYSCTL_HANDLER_ARGS return SYSCTL_OUT(req, &value, sizeof(value)); } +SYSCTL_NODE(_kern, OID_AUTO, trial, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "trial experiment factors"); + #define X(name, T) \ int \ experiment_factor_##name##_handler SYSCTL_HANDLER_ARGS \ @@ -2285,7 +2316,12 @@ sysctl_register_test_startup(struct sysctl_test_setup_spec *spec) .oid_parent = &sysctl__debug_test_children, .oid_number = OID_AUTO, .oid_kind = CTLTYPE_QUAD | CTLFLAG_OID2 | CTLFLAG_WR | - CTLFLAG_PERMANENT | CTLFLAG_LOCKED | CTLFLAG_MASKED, + CTLFLAG_PERMANENT | CTLFLAG_LOCKED | CTLFLAG_MASKED +#ifdef __BUILDING_XNU_LIB_UNITTEST__ + | CTLFLAG_KERN, /* allow calls from unit-test which use kernel_sysctlbyname() */ +#else /* __BUILDING_XNU_LIB_UNITTEST__ */ + , +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ .oid_arg1 = (void *)(uintptr_t)spec->st_func, .oid_name = spec->st_name, .oid_handler = sysctl_test_handler, @@ -2457,3 +2493,35 @@ SYSCTL_OID(_debug_test_sysctl_node_test_l2, OID_AUTO, hanging_oid, CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, NULL, "", "rdar://138698424 L2 hanging OID"); #endif /* DEBUG || DEVELOPMENT */ + +static int +sysctl_static_if_modified_keys SYSCTL_HANDLER_ARGS +{ + extern char __static_if_segment_start[] __SEGMENT_START_SYM(STATIC_IF_SEGMENT); + + uint64_t addr; + int err; + + for (static_if_key_t key = static_if_modified_keys; + key; key = key->sik_modified_next) { + if ((key->sik_enable_count >= 0) == key->sik_init_value) { + continue; + } + + addr = (vm_offset_t)key->sik_entries_head - (vm_offset_t)__static_if_segment_start; + err = SYSCTL_OUT(req, &addr, sizeof(addr)); + if (err) { + return err; + } + } + + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, static_if_modified_keys, + CTLFLAG_RD | CTLFLAG_LOCKED | CTLTYPE_OPAQUE, + 0, 0, sysctl_static_if_modified_keys, "-", + "List of unslid addresses of modified keys"); + 
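The handler above emits, for each static_if key whose state differs from its boot-time value, the key's offset from the start of the static_if segment. A userspace sketch that walks the list via the kern.static_if_modified_keys OID registered above (the list is normally empty, and the two-pass size probe is the standard sysctl idiom):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
        size_t len = 0;

        /* Probe the size, then fetch the array of segment-relative offsets. */
        if (sysctlbyname("kern.static_if_modified_keys", NULL, &len, NULL, 0) != 0) {
            perror("kern.static_if_modified_keys");
            return 1;
        }
        uint64_t *offsets = malloc(len ? len : sizeof(*offsets));
        if (offsets == NULL ||
            sysctlbyname("kern.static_if_modified_keys", offsets, &len, NULL, 0) != 0) {
            perror("kern.static_if_modified_keys");
            free(offsets);
            return 1;
        }
        for (size_t i = 0; i < len / sizeof(*offsets); i++) {
            printf("modified key at STATIC_IF_SEGMENT + 0x%llx\n",
                (unsigned long long)offsets[i]);
        }
        free(offsets);
        return 0;
    }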
+SYSCTL_UINT(_kern, OID_AUTO, static_if_abi, CTLFLAG_RD | CTLFLAG_LOCKED, + &static_if_abi, 0, "static_if ABI"); diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index 830b23545..a08362b2e 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include #include @@ -193,7 +194,7 @@ static TUNABLE(bool, syscallfilter_disable, "-disable_syscallfilter", false); #if DEBUG #define __PROC_INTERNAL_DEBUG 1 #endif -#if CONFIG_COREDUMP +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP /* Name to give to core files */ #if defined(XNU_TARGET_OS_BRIDGE) __XNU_PRIVATE_EXTERN const char * defaultcorefiledir = "/private/var/internal"; @@ -421,9 +422,11 @@ proc_isinferior(int pid1, int pid2) * racy for a current process or if a reference to the process is held. */ struct proc_ident -proc_ident(proc_t p) +proc_ident_with_policy(proc_t p, proc_ident_validation_policy_t policy) { struct proc_ident ident = { + .may_exit = (policy & IDENT_VALIDATION_PROC_MAY_EXIT) != 0, + .may_exec = (policy & IDENT_VALIDATION_PROC_MAY_EXEC) != 0, .p_pid = proc_pid(p), .p_uniqueid = proc_uniqueid(p), .p_idversion = proc_pidversion(p), @@ -432,6 +435,12 @@ proc_ident(proc_t p) return ident; } +/* + * Function: proc_find_audit_token + * + * Description: Lookup a process with the provided audit_token_t + * will validate that the embedded pidver matches. + */ proc_t proc_find_audit_token(const audit_token_t token) { @@ -456,23 +465,200 @@ proc_find_audit_token(const audit_token_t token) return proc; } -proc_t -proc_find_ident(struct proc_ident const *ident) +/* + * Function: proc_find_ident_validated + * + * Description: Obtain a proc ref from the provided proc_ident. + * + * Returns: + * - 0 on Success + * - EINVAL: When the provided arguments are invalid (NULL) + * - ESTALE: The process exists but is currently a zombie and + * has not been reaped via wait(). Callers may choose to handle + * this edge case as a non-error. + * - ESRCH: When the lookup or validation fails otherwise. The process + * described by the identifier no longer exists. + * + * Note: Caller must proc_rele() the out param when this function returns 0 + */ +errno_t +proc_find_ident_validated(const proc_ident_t ident, proc_t *out) { - proc_t proc = PROC_NULL; - - proc = proc_find(ident->p_pid); - if (proc == PROC_NULL) { - return PROC_NULL; + if (ident == NULL || out == NULL) { + return EINVAL; } - if (proc_uniqueid(proc) != ident->p_uniqueid || + proc_t proc = proc_find(ident->p_pid); + if (proc == PROC_NULL) { + // If the policy indicates the process may exit, we should also check + // the zombie list, and return ENOENT to indicate that the process is + // a zombie waiting to be reaped. + if (proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXIT) + && pzfind_unique(ident->p_pid, ident->p_uniqueid)) { + return ESTALE; + } + return ESRCH; + } + + // If the policy indicates that the process shouldn't exec, fail the + // lookup if the pidversion doesn't match + if (!proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXEC) && proc_pidversion(proc) != ident->p_idversion) { proc_rele(proc); - return PROC_NULL; + return ESRCH; } - return proc; + // Check the uniqueid which is always verified + if (proc_uniqueid(proc) != ident->p_uniqueid) { + proc_rele(proc); + return ESRCH; + } + + *out = proc; + return 0; +} + +/* + * Function: proc_find_ident + * + * Description: Obtain a proc ref from the provided proc_ident. 
+ * Discards the errno result from proc_find_ident_validated + * for callers using the old interface. + */ +inline proc_t +proc_find_ident(const proc_ident_t ident) +{ + proc_t p = PROC_NULL; + if (proc_find_ident_validated(ident, &p) != 0) { + return PROC_NULL; + } + return p; +} + +/* + * Function: proc_ident_equal_token + * + * Description: Compare a proc_ident_t to an audit token. The + * process described by the audit token must still exist (which + * includes a pidver check during the lookup). But the comparison + * with the proc_ident_t will respect IDENT_VALIDATION_PROC_MAY_EXEC + * and only compare PID and unique ID when it is set. + */ +bool +proc_ident_equal_token(proc_ident_t ident, audit_token_t token) +{ + if (ident == NULL) { + return false; + } + + // If the PIDs don't match, early return + if (ident->p_pid != get_audit_token_pid(&token)) { + return false; + } + + // Compare pidversion if IDENT_VALIDATION_PROC_MAY_EXEC is not set + if (!proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXEC) && + ident->p_idversion != token.val[7]) { + return false; + } + + // Lookup the process described by the provided audit token + proc_t proc = proc_find_audit_token(token); + if (proc == PROC_NULL) { + return false; + } + + // Always validate that the uniqueid matches + if (proc_uniqueid(proc) != ident->p_uniqueid) { + proc_rele(proc); + return false; + } + + proc_rele(proc); + return true; +} + +/* + * Function: proc_ident_equal_ref + * + * Description: Compare a proc_ident_t to a proc_t. Will + * respect IDENT_VALIDATION_PROC_MAY_EXEC and only compare + * PID and unique ID when set. + */ +bool +proc_ident_equal_ref(proc_ident_t ident, proc_t proc) +{ + if (ident == NULL || proc == PROC_NULL) { + return false; + } + + // Always compare PID and p_uniqueid + if (proc_pid(proc) != ident->p_pid || + proc_uniqueid(proc) != ident->p_uniqueid) { + return false; + } + + // Compare pidversion if IDENT_VALIDATION_PROC_MAY_EXEC is not set + if (!proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXEC) && + proc_pidversion(proc) != ident->p_idversion) { + return false; + } + + return true; +} + +/* + * Function: proc_ident_equal + * + * Description: Compare two proc_ident_t identifiers. Will + * respect IDENT_VALIDATION_PROC_MAY_EXEC and only compare + * PID and unique ID when set. + */ +bool +proc_ident_equal(proc_ident_t ident, proc_ident_t other) +{ + if (ident == NULL || other == NULL) { + return false; + } + + // Always compare PID and p_uniqueid + if (ident->p_pid != other->p_pid || + ident->p_uniqueid != other->p_uniqueid) { + return false; + } + + // Compare pidversion if IDENT_VALIDATION_PROC_MAY_EXEC is not set + if (!proc_ident_has_policy(ident, IDENT_VALIDATION_PROC_MAY_EXEC) && + ident->p_idversion != other->p_idversion) { + return false; + } + + return true; +} + +/* + * Function: proc_ident_has_policy + * + * Description: Validate that a particular policy is set. + * + * Stored in the upper 4 bits of the 32 bit + * p_pid field. 
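proc_find_ident_validated() above returns an errno instead of just PROC_NULL, with ESTALE distinguishing an unreaped zombie (only reported when the ident carries IDENT_VALIDATION_PROC_MAY_EXIT) from a process that is truly gone. A hedged sketch of an in-kernel caller using that contract; example_lookup is hypothetical and not part of this patch:

    static int
    example_lookup(const proc_ident_t ident)
    {
        proc_t p = PROC_NULL;
        errno_t err = proc_find_ident_validated(ident, &p);

        if (err == ESTALE) {
            /* The process exists but is a zombie awaiting wait(); a caller may
             * choose to treat this as a benign condition rather than a failure. */
            return 0;
        }
        if (err != 0) {
            return err;                 /* EINVAL or ESRCH */
        }

        /* ... use p ... */

        proc_rele(p);                   /* balance the reference taken on success */
        return 0;
    }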
+ */ +inline bool +proc_ident_has_policy(const proc_ident_t ident, enum proc_ident_validation_policy policy) +{ + if (ident == NULL) { + return false; + } + + switch (policy) { + case IDENT_VALIDATION_PROC_MAY_EXIT: + return ident->may_exit; + case IDENT_VALIDATION_PROC_MAY_EXEC: + return ident->may_exec; + case IDENT_VALIDATION_PROC_EXACT: + return ident->may_exec == 0 && ident->may_exit == 0; + } } void @@ -1467,6 +1653,21 @@ proc_archinfo_kdp(void* p, cpu_type_t* cputype, cpu_subtype_t* cpusubtype) } } +void +proc_memstat_data_kdp(void *p, int32_t *current_memlimit, int32_t *prio_effective, int32_t *prio_requested, int32_t *prio_assertion); + +void +proc_memstat_data_kdp(void *p, int32_t *current_memlimit, int32_t *prio_effective, int32_t *prio_requested, int32_t *prio_assertion) +{ + proc_t pp = (proc_t)p; + if (pp != PROC_NULL) { + *current_memlimit = pp->p_memstat_memlimit; + *prio_effective = pp->p_memstat_effectivepriority; + *prio_assertion = pp->p_memstat_assertionpriority; + *prio_requested = pp->p_memstat_requestedpriority; + } +} + const char * proc_name_address(void *p) { @@ -1840,7 +2041,7 @@ proc_getcdhash(proc_t p, unsigned char *cdhash) if (p == kernproc) { return EINVAL; } - return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash); + return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash, NULL); } uint64_t @@ -2264,27 +2465,59 @@ proc_findthread(thread_t thread) return p; } - /* - * Locate a zombie by PID + * Determine if the process described by the provided + * PID is a zombie */ -__private_extern__ proc_t +__private_extern__ bool pzfind(pid_t pid) { - proc_t p; - + bool found = false; + /* Enter critical section */ proc_list_lock(); - LIST_FOREACH(p, &zombproc, p_list) { - if (proc_getpid(p) == pid && !proc_is_shadow(p)) { - break; - } + /* Ensure the proc exists and is a zombie */ + proc_t p = phash_find_locked(pid); + if ((p == PROC_NULL) || !proc_list_exited(p)) { + goto out; } + found = true; +out: + /* Exit critical section */ proc_list_unlock(); + return found; +} - return p; +/* + * Determine if the process described by the provided + * uniqueid is a zombie. The same as pzfind but with an + * additional uniqueid check. + */ +__private_extern__ bool +pzfind_unique(pid_t pid, uint64_t uniqueid) +{ + bool found = false; + + /* Enter critical section */ + proc_list_lock(); + + /* Ensure the proc exists and is a zombie */ + proc_t p = phash_find_locked(pid); + if ((p == PROC_NULL) || !proc_list_exited(p)) { + goto out; + } + + if (proc_uniqueid(p) != uniqueid) { + goto out; + } + + found = true; +out: + /* Exit critical section */ + proc_list_unlock(); + return found; } /* @@ -3163,7 +3396,7 @@ proc_is_rsr(proc_t p) return os_atomic_load(&p->p_ladvflag, relaxed) & P_RSR; } -#if CONFIG_COREDUMP +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP /* * proc_core_name(format, name, uid, pid) * Expand the name described in format, using name, uid, and pid. 
@@ -3253,7 +3486,7 @@ endofstring: (long)pid, name, (uint32_t)uid); return 1; } -#endif /* CONFIG_COREDUMP */ +#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */ /* Code Signing related routines */ @@ -3311,9 +3544,10 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user int error; vnode_t tvp; off_t toff; - unsigned char cdhash[SHA1_RESULTLEN]; + csops_cdhash_t cdhash_info = {0}; audit_token_t token; unsigned int upid = 0, uidversion = 0; + bool mark_invalid_allowed = false; forself = error = 0; @@ -3322,12 +3556,13 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user } if (pid == proc_selfpid()) { forself = 1; + mark_invalid_allowed = true; } - switch (ops) { case CS_OPS_STATUS: case CS_OPS_CDHASH: + case CS_OPS_CDHASH_WITH_INFO: case CS_OPS_PIDOFFSET: case CS_OPS_ENTITLEMENTS_BLOB: case CS_OPS_DER_ENTITLEMENTS_BLOB: @@ -3411,6 +3646,10 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user break; } case CS_OPS_MARKINVALID: + if (mark_invalid_allowed == false) { + error = EPERM; + goto out; + } proc_lock(pt); if ((proc_getcsflags(pt) & CS_VALID) == CS_VALID) { /* is currently valid */ proc_csflags_clear(pt, CS_VALID); /* set invalid */ @@ -3470,16 +3709,36 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user tvp = pt->p_textvp; toff = pt->p_textoff; - if (tvp == NULLVP || usize != SHA1_RESULTLEN) { + if (tvp == NULLVP || usize != sizeof(cdhash_info.hash)) { proc_rele(pt); return EINVAL; } - error = vn_getcdhash(tvp, toff, cdhash); + error = vn_getcdhash(tvp, toff, cdhash_info.hash, &cdhash_info.type); proc_rele(pt); if (error == 0) { - error = copyout(cdhash, uaddr, sizeof(cdhash)); + error = copyout(cdhash_info.hash, uaddr, sizeof(cdhash_info.hash)); + } + + return error; + + case CS_OPS_CDHASH_WITH_INFO: + + /* pt already holds a reference on its p_textvp */ + tvp = pt->p_textvp; + toff = pt->p_textoff; + + if (tvp == NULLVP || usize != sizeof(csops_cdhash_t)) { + proc_rele(pt); + return EINVAL; + } + + error = vn_getcdhash(tvp, toff, cdhash_info.hash, &cdhash_info.type); + proc_rele(pt); + + if (error == 0) { + error = copyout(&cdhash_info, uaddr, sizeof(cdhash_info)); } return error; @@ -3641,7 +3900,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user */ if (forself == 1 && IOTaskHasEntitlement(proc_task(pt), CLEAR_LV_ENTITLEMENT)) { proc_lock(pt); - if (!(proc_getcsflags(pt) & CS_INSTALLER)) { + if (!(proc_getcsflags(pt) & CS_INSTALLER) && (pt->p_subsystem_root_path == NULL)) { proc_csflags_clear(pt, CS_REQUIRE_LV | CS_FORCED_LV); error = 0; } else { @@ -3742,11 +4001,8 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user break; } #endif /* CONFIG_CSR */ - task_t task = proc_task(pt); - proc_lock(pt); proc_csflags_clear(pt, CS_PLATFORM_BINARY | CS_PLATFORM_PATH); - task_set_hardened_runtime(task, false); csproc_clear_platform_binary(pt); proc_unlock(pt); break; @@ -4648,7 +4904,7 @@ proc_pcontrol_null(__unused proc_t p, __unused void *arg) extern int32_t max_kill_priority; bool -no_paging_space_action(void) +no_paging_space_action(memorystatus_kill_cause_t cause) { proc_t p; struct no_paging_space nps; @@ -4691,7 +4947,7 @@ no_paging_space_action(void) memorystatus_log("memorystatus: killing largest compressed process %s [%d] " "%llu MB\n", proc_best_name(p), proc_getpid(p), (nps.npcs_max_size / MB_SIZE)); - kill_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_LOWSWAP); + kill_reason = 
os_reason_create(OS_REASON_JETSAM, cause); psignal_with_reason(p, SIGKILL, kill_reason); proc_rele(p); @@ -4703,13 +4959,6 @@ no_paging_space_action(void) } } - if (memstat_get_idle_proccnt() > 0) { - /* - * There are still idle processes to kill. - */ - return false; - } - if (nps.pcs_max_size > 0) { memorystatus_log("memorystatus: attempting pcontrol on " "[%d]\n", nps.pcs_pid); @@ -4723,10 +4972,9 @@ no_paging_space_action(void) memorystatus_log("memorystatus: doing " "pcontrol on %s [%d]\n", proc_best_name(p), proc_getpid(p)); - proc_dopcontrol(p, JETSAM_REASON_LOWSWAP); + proc_dopcontrol(p, cause); proc_rele(p); - return true; } else { memorystatus_log("memorystatus: cannot " @@ -5217,6 +5465,19 @@ proc_get_ro(proc_t p) return ro; } +#ifdef __BUILDING_XNU_LIB_UNITTEST__ +/* this is here since unittest Makefile can't build BSD sources yet */ +void mock_init_proc(proc_t p, void* (*calloc_call)(size_t, size_t)); +void +mock_init_proc(proc_t p, void* (*calloc_call)(size_t, size_t)) +{ + proc_ro_t ro = calloc_call(1, sizeof(struct proc_ro)); + ro->pr_proc = p; + p->p_proc_ro = ro; +} +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ + + task_t proc_ro_task(proc_ro_t pr) { @@ -5471,7 +5732,7 @@ task_for_pid( error = KERN_FAILURE; goto tfpout; } - pident = proc_ident(p); + pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT); is_current_proc = (p == current_proc()); #if CONFIG_AUDIT @@ -5544,12 +5805,7 @@ task_for_pid( /* this reference will be consumed during conversion */ task_reference(task); - if (task == current_task()) { - /* return pinned self if current_task() so equality check with mach_task_self_ passes */ - sright = (void *)convert_task_to_port_pinned(task); - } else { - sright = (void *)convert_task_to_port(task); - } + sright = (void *)convert_task_to_port(task); /* extra task ref consumed */ /* @@ -5638,7 +5894,7 @@ task_name_for_pid( || IOCurrentTaskHasEntitlement("com.apple.system-task-ports.name.safe") )) { if (proc_task(p) != TASK_NULL) { - struct proc_ident pident = proc_ident(p); + struct proc_ident pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT); task_t task = proc_task(p); @@ -5726,7 +5982,7 @@ task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args * error = ESRCH; goto tifpout; } - pident = proc_ident(proc); + pident = proc_ident_with_policy(proc, IDENT_VALIDATION_PROC_EXACT); is_current_proc = (proc == current_proc()); if (!(task_for_pid_posix_check(proc))) { @@ -5848,7 +6104,7 @@ task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args, error = ESRCH; goto trfpout; } - pident = proc_ident(proc); + pident = proc_ident_with_policy(proc, IDENT_VALIDATION_PROC_EXACT); is_current_proc = (proc == current_proc()); if (!(task_for_pid_posix_check(proc))) { @@ -6061,7 +6317,7 @@ debug_control_port_for_pid(struct debug_control_port_for_pid_args *args) error = KERN_FAILURE; goto tfpout; } - pident = proc_ident(p); + pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT); is_current_proc = (p == current_proc()); #if CONFIG_AUDIT diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index 75dffd0d1..5190630bf 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -104,6 +104,7 @@ #if CONFIG_FREEZE #include /* for memorystatus_freeze_mark_ui_transition */ #endif /* CONFIG_FREEZE */ +#include /* for memorystatus_get_proc_is_managed */ #include /* for struct socket */ #if NECP #include @@ -131,13 +132,16 @@ static int dosetrlimit(struct proc *p, u_int which, struct rlimit *limp); 
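Returning to the csops_internal() changes above: the fixed-size CS_OPS_CDHASH path (historically SHA1_RESULTLEN, 20 bytes) is kept, and CS_OPS_CDHASH_WITH_INFO additionally returns the hash type in a csops_cdhash_t. A userspace sketch of the long-standing CS_OPS_CDHASH query, assuming the csops() prototype and CS_OPS_CDHASH constant from the SDK's <sys/codesign.h>; the new _WITH_INFO structure is private to this release and is not used here:

    #include <stdio.h>
    #include <sys/codesign.h>
    #include <unistd.h>

    int
    main(void)
    {
        unsigned char cdhash[20];   /* usersize must match the fixed 20-byte hash */

        if (csops(getpid(), CS_OPS_CDHASH, cdhash, sizeof(cdhash)) != 0) {
            perror("csops");
            return 1;
        }
        for (size_t i = 0; i < sizeof(cdhash); i++) {
            printf("%02x", cdhash[i]);
        }
        printf("\n");
        return 0;
    }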
static void do_background_socket(struct proc *p, thread_t thread); static int do_background_thread(thread_t thread, int priority); static int do_background_proc(struct proc *curp, struct proc *targetp, int priority); -static int set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority); +static int proc_set_gpurole(struct proc *curp, struct proc *targetp, int priority); +static int proc_get_gpurole(proc_t targetp, int *priority); static int proc_set_darwin_role(proc_t curp, proc_t targetp, int priority); static int proc_get_darwin_role(proc_t curp, proc_t targetp, int *priority); static int proc_set_game_mode(proc_t targetp, int priority); static int proc_get_game_mode(proc_t targetp, int *priority); static int proc_set_carplay_mode(proc_t targetp, int priority); static int proc_get_carplay_mode(proc_t targetp, int *priority); +static int proc_set_runaway_mitigation(proc_t targetp, int priority); +static int proc_get_runaway_mitigation(proc_t targetp, int *priority); static int get_background_proc(struct proc *curp, struct proc *targetp, int *priority); int fill_task_rusage(task_t task, rusage_info_current *ri); @@ -357,6 +361,50 @@ getpriority(struct proc *curp, struct getpriority_args *uap, int32_t *retval) } break; + case PRIO_DARWIN_GPU: + if (uap->who == 0) { + p = curp; + } else { + p = proc_find(uap->who); + if (p == PROC_NULL) { + break; + } + refheld = 1; + } + + + error = proc_get_gpurole(p, &low); + + if (refheld) { + proc_rele(p); + } + if (error) { + return error; + } + break; + + case PRIO_DARWIN_RUNAWAY_MITIGATION: + if (uap->who == 0) { + p = curp; + } else { + p = proc_find(uap->who); + if (p == PROC_NULL) { + break; + } + refheld = 1; + } + + + error = proc_get_runaway_mitigation(p, &low); + + if (refheld) { + proc_rele(p); + } + if (error) { + return error; + } + break; + default: return EINVAL; } @@ -533,7 +581,7 @@ setpriority(struct proc *curp, struct setpriority_args *uap, int32_t *retval) break; } - error = set_gpudeny_proc(curp, p, uap->prio); + error = proc_set_gpurole(curp, p, uap->prio); found++; proc_rele(p); @@ -601,6 +649,26 @@ setpriority(struct proc *curp, struct setpriority_args *uap, int32_t *retval) break; } + case PRIO_DARWIN_RUNAWAY_MITIGATION: { + if (uap->who == 0) { + p = curp; + } else { + p = proc_find(uap->who); + if (p == PROC_NULL) { + break; + } + refheld = 1; + } + + error = proc_set_runaway_mitigation(p, uap->prio); + + found++; + if (refheld != 0) { + proc_rele(p); + } + break; + } + default: return EINVAL; } @@ -663,8 +731,10 @@ out: return error; } +#define SET_GPU_ROLE_ENTITLEMENT "com.apple.private.set-gpu-role" + static int -set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority) +proc_set_gpurole(struct proc *curp, struct proc *targetp, int priority) { int error = 0; kauth_cred_t ucred; @@ -673,7 +743,12 @@ set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority) ucred = kauth_cred_get(); target_cred = kauth_cred_proc_ref(targetp); - /* TODO: Entitlement instead of uid check */ + boolean_t entitled = FALSE; + entitled = IOCurrentTaskHasEntitlement(SET_GPU_ROLE_ENTITLEMENT); + if (!entitled) { + error = EPERM; + goto out; + } if (!kauth_cred_issuser(ucred) && kauth_cred_getruid(ucred) && kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred) && @@ -695,11 +770,16 @@ set_gpudeny_proc(struct proc *curp, struct proc *targetp, int priority) #endif switch (priority) { - case PRIO_DARWIN_GPU_DENY: - task_set_gpu_denied(proc_task(targetp), TRUE); - break; + case PRIO_DARWIN_GPU_UNKNOWN: case 
PRIO_DARWIN_GPU_ALLOW: - task_set_gpu_denied(proc_task(targetp), FALSE); + case PRIO_DARWIN_GPU_DENY: + case PRIO_DARWIN_GPU_BACKGROUND: + case PRIO_DARWIN_GPU_UTILITY: + case PRIO_DARWIN_GPU_UI_NON_FOCAL: + case PRIO_DARWIN_GPU_UI: + case PRIO_DARWIN_GPU_UI_FOCAL: + task_set_gpu_role(proc_task(targetp), + (darwin_gpu_role_t)priority); break; default: error = EINVAL; @@ -711,6 +791,42 @@ out: return error; } +static int +proc_get_gpurole(proc_t targetp, int *priority) +{ + int error = 0; + + kauth_cred_t ucred, target_cred; + + ucred = kauth_cred_get(); + target_cred = kauth_cred_proc_ref(targetp); + + boolean_t entitled = FALSE; + entitled = IOCurrentTaskHasEntitlement(SET_GPU_ROLE_ENTITLEMENT); + + /* Root is allowed to get without entitlement */ + if (!kauth_cred_issuser(ucred) && !entitled) { + error = EPERM; + goto out; + } + + /* Even with entitlement, non-root is only alllowed to see same-user */ + if (!kauth_cred_issuser(ucred) && + kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred)) { + error = EPERM; + goto out; + } + + darwin_gpu_role_t gpurole = task_get_gpu_role(proc_task(targetp)); + + *priority = gpurole; + +out: + kauth_cred_unref(&target_cred); + return error; +} + + static int proc_set_darwin_role(proc_t curp, proc_t targetp, int priority) { @@ -960,6 +1076,116 @@ out: return error; } +#define RUNAWAY_MITIGATION_ENTITLEMENT "com.apple.private.runaway-mitigation" + +/* Boot arg to allow RunningBoard-managed processes to be mitigated */ +static TUNABLE(bool, allow_managed_mitigation, "allow_managed_mitigation", false); + +static int +proc_set_runaway_mitigation(proc_t targetp, int priority) +{ + int error = 0; + + kauth_cred_t ucred, target_cred; + + ucred = kauth_cred_get(); + target_cred = kauth_cred_proc_ref(targetp); + + boolean_t entitled = FALSE; + entitled = IOCurrentTaskHasEntitlement(RUNAWAY_MITIGATION_ENTITLEMENT); + if (!entitled) { + error = EPERM; + goto out; + } + + /* Even with entitlement, non-root is only alllowed to set same-user */ + if (!kauth_cred_issuser(ucred) && + kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred)) { + error = EPERM; + goto out; + } + + switch (priority) { + case PRIO_DARWIN_RUNAWAY_MITIGATION_OFF: + printf("%s[%d] disabling runaway mitigation on %s[%d]\n", + proc_best_name(current_proc()), proc_selfpid(), + proc_best_name(targetp), proc_getpid(targetp)); + + proc_set_task_policy(proc_task(targetp), TASK_POLICY_ATTRIBUTE, + TASK_POLICY_RUNAWAY_MITIGATION, TASK_POLICY_DISABLE); + break; + + case PRIO_DARWIN_RUNAWAY_MITIGATION_ON: + /* + * RunningBoard-managed processes are not mitigatable - they should be + * managed through RunningBoard-level interfaces instead. + * Set the boot arg allow_managed_mitigation=1 to allow this. 
+ */ + if (memorystatus_get_proc_is_managed(targetp) && !allow_managed_mitigation) { + printf("%s[%d] blocked from disabling runaway mitigation on RunningBoard managed process %s[%d]\n", + proc_best_name(current_proc()), proc_selfpid(), + proc_best_name(targetp), proc_getpid(targetp)); + + error = ENOTSUP; + goto out; + } + + proc_set_task_policy(proc_task(targetp), TASK_POLICY_ATTRIBUTE, + TASK_POLICY_RUNAWAY_MITIGATION, TASK_POLICY_ENABLE); + + printf("%s[%d] enabling runaway mitigation on %s[%d]\n", + proc_best_name(current_proc()), proc_selfpid(), + proc_best_name(targetp), proc_getpid(targetp)); + break; + + default: + error = EINVAL; + goto out; + } + +out: + kauth_cred_unref(&target_cred); + return error; +} + +static int +proc_get_runaway_mitigation(proc_t targetp, int *priority) +{ + int error = 0; + + kauth_cred_t ucred, target_cred; + + ucred = kauth_cred_get(); + target_cred = kauth_cred_proc_ref(targetp); + + boolean_t entitled = FALSE; + entitled = IOCurrentTaskHasEntitlement(RUNAWAY_MITIGATION_ENTITLEMENT); + + /* Root is allowed to get without entitlement */ + if (!kauth_cred_issuser(ucred) && !entitled) { + error = EPERM; + goto out; + } + + /* Even with entitlement, non-root is only alllowed to see same-user */ + if (!kauth_cred_issuser(ucred) && + kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred)) { + error = EPERM; + goto out; + } + + if (proc_get_task_policy(proc_task(targetp), TASK_POLICY_ATTRIBUTE, TASK_POLICY_RUNAWAY_MITIGATION)) { + *priority = PRIO_DARWIN_RUNAWAY_MITIGATION_ON; + } else { + *priority = PRIO_DARWIN_RUNAWAY_MITIGATION_OFF; + } + +out: + kauth_cred_unref(&target_cred); + return error; +} + + static int get_background_proc(struct proc *curp, struct proc *targetp, int *priority) { @@ -1501,22 +1727,30 @@ getrlimit(struct proc *p, struct getrlimit_args *uap, __unused int32_t *retval) return EINVAL; } lim = proc_limitget(p, uap->which); - return copyout((caddr_t)&lim, - uap->rlp, sizeof(struct rlimit)); + return copyout((caddr_t)&lim, uap->rlp, sizeof(struct rlimit)); +} + +static struct timeval +_absolutetime_to_timeval(uint64_t abstime) +{ + clock_sec_t sec; + clock_usec_t usec; + absolutetime_to_microtime(abstime, &sec, &usec); + return (struct timeval){ + .tv_sec = sec, + .tv_usec = usec, + }; } /* * Transform the running time and tick information in proc p into user, * system, and interrupt time usage. */ -/* No lock on proc is held for this.. 
*/ void calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *ip) { - task_t task; + task_t task; - timerclear(up); - timerclear(sp); if (ip != NULL) { timerclear(ip); } @@ -1524,51 +1758,39 @@ calcru(struct proc *p, struct timeval *up, struct timeval *sp, struct timeval *i task = proc_task(p); if (task) { mach_task_basic_info_data_t tinfo; - task_thread_times_info_data_t ttimesinfo; - task_events_info_data_t teventsinfo; - mach_msg_type_number_t task_info_count, task_ttimes_count; + mach_msg_type_number_t task_info_count; mach_msg_type_number_t task_events_count; - struct timeval ut, st; + task_events_info_data_t teventsinfo; + struct recount_times_mach times; task_info_count = MACH_TASK_BASIC_INFO_COUNT; task_info(task, MACH_TASK_BASIC_INFO, (task_info_t)&tinfo, &task_info_count); - ut.tv_sec = tinfo.user_time.seconds; - ut.tv_usec = tinfo.user_time.microseconds; - st.tv_sec = tinfo.system_time.seconds; - st.tv_usec = tinfo.system_time.microseconds; - timeradd(&ut, up, up); - timeradd(&st, sp, sp); - - task_ttimes_count = TASK_THREAD_TIMES_INFO_COUNT; - task_info(task, TASK_THREAD_TIMES_INFO, - (task_info_t)&ttimesinfo, &task_ttimes_count); - - ut.tv_sec = ttimesinfo.user_time.seconds; - ut.tv_usec = ttimesinfo.user_time.microseconds; - st.tv_sec = ttimesinfo.system_time.seconds; - st.tv_usec = ttimesinfo.system_time.microseconds; - timeradd(&ut, up, up); - timeradd(&st, sp, sp); - task_events_count = TASK_EVENTS_INFO_COUNT; task_info(task, TASK_EVENTS_INFO, (task_info_t)&teventsinfo, &task_events_count); + times = recount_task_times(task); + *up = _absolutetime_to_timeval(times.rtm_user); + *sp = _absolutetime_to_timeval(times.rtm_system); + /* - * No need to lock "p": this does not need to be - * completely consistent, right ? + * No lock is held here, but it's only a consistency issue for non- + * getrusage(2) callers of this function. 
*/ - p->p_stats->p_ru.ru_minflt = (teventsinfo.faults - - teventsinfo.pageins); + p->p_stats->p_ru.ru_minflt = teventsinfo.faults - + teventsinfo.pageins; p->p_stats->p_ru.ru_majflt = teventsinfo.pageins; - p->p_stats->p_ru.ru_nivcsw = (teventsinfo.csw - - p->p_stats->p_ru.ru_nvcsw); + p->p_stats->p_ru.ru_nivcsw = teventsinfo.csw - + p->p_stats->p_ru.ru_nvcsw; if (p->p_stats->p_ru.ru_nivcsw < 0) { p->p_stats->p_ru.ru_nivcsw = 0; } p->p_stats->p_ru.ru_maxrss = (long)tinfo.resident_size_max; + } else { + timerclear(up); + timerclear(sp); } } @@ -1587,7 +1809,6 @@ getrusage(struct proc *p, struct getrusage_args *uap, __unused int32_t *retval) struct timeval utime; struct timeval stime; - switch (uap->who) { case RUSAGE_SELF: calcru(p, &utime, &stime, NULL); @@ -1857,6 +2078,8 @@ static int iopolicysys_vfs_altlink(struct proc *p, int cmd, int scope, int polic static int iopolicysys_vfs_nocache_write_fs_blksize(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); static int iopolicysys_vfs_support_long_paths(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); +static int +iopolicysys_vfs_entitled_reserve_access(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); /* * iopolicysys @@ -1880,6 +2103,17 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval) goto out; } +#if CONFIG_MACF + error = mac_proc_check_iopolicysys(p, kauth_cred_get(), + uap->cmd, + iop_param.iop_iotype, + iop_param.iop_scope, + iop_param.iop_policy); + if (error) { + return error; + } +#endif + switch (iop_param.iop_iotype) { case IOPOL_TYPE_DISK: error = iopolicysys_disk(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param); @@ -1969,6 +2203,12 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval) goto out; } break; + case IOPOL_TYPE_VFS_ENTITLED_RESERVE_ACCESS: + error = iopolicysys_vfs_entitled_reserve_access(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param); + if (error) { + goto out; + } + break; default: error = EINVAL; @@ -2575,6 +2815,54 @@ out: return error; } +static int +get_proc_vfs_ignore_permissions_policy(struct proc *p) +{ + return os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS ? + IOPOL_VFS_IGNORE_PERMISSIONS_ON : IOPOL_VFS_IGNORE_PERMISSIONS_OFF; +} + +static int +get_thread_vfs_ignore_permissions_policy(thread_t thread) +{ + struct uthread *ut = get_bsdthread_info(thread); + + return (ut->uu_flag & UT_IGNORE_NODE_PERMISSIONS) ? 
+ IOPOL_VFS_IGNORE_PERMISSIONS_ON : IOPOL_VFS_IGNORE_PERMISSIONS_OFF; +} + +static void +set_proc_vfs_ignore_permissions_policy(struct proc *p, int policy) +{ + switch (policy) { + case IOPOL_VFS_IGNORE_PERMISSIONS_OFF: + os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed); + break; + case IOPOL_VFS_IGNORE_PERMISSIONS_ON: + os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed); + break; + default: + break; + } +} + +static void +set_thread_vfs_ignore_permissions_policy(thread_t thread, int policy) +{ + struct uthread *ut = get_bsdthread_info(thread); + + switch (policy) { + case IOPOL_VFS_IGNORE_PERMISSIONS_OFF: + ut->uu_flag &= ~UT_IGNORE_NODE_PERMISSIONS; + break; + case IOPOL_VFS_IGNORE_PERMISSIONS_ON: + ut->uu_flag |= UT_IGNORE_NODE_PERMISSIONS; + break; + default: + break; + } +} + #define AUTHORIZED_ACCESS_ENTITLEMENT \ "com.apple.private.vfs.authorized-access" int @@ -2582,8 +2870,12 @@ iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope, int policy, __unused struct _iopol_param_t *iop_param) { int error = EINVAL; + thread_t thread = THREAD_NULL; switch (scope) { + case IOPOL_SCOPE_THREAD: + thread = current_thread(); + break; case IOPOL_SCOPE_PROCESS: break; default: @@ -2592,8 +2884,11 @@ iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope, switch (cmd) { case IOPOL_CMD_GET: - policy = os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS ? - IOPOL_VFS_IGNORE_PERMISSIONS_ON : IOPOL_VFS_IGNORE_PERMISSIONS_OFF; + if (thread != THREAD_NULL) { + policy = get_thread_vfs_ignore_permissions_policy(thread); + } else { + policy = get_proc_vfs_ignore_permissions_policy(p); + } iop_param->iop_policy = policy; goto out_ok; case IOPOL_CMD_SET: @@ -2608,15 +2903,10 @@ iopolicysys_vfs_ignore_node_permissions(struct proc *p, int cmd, int scope, goto out; } - switch (policy) { - case IOPOL_VFS_IGNORE_PERMISSIONS_OFF: - os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed); - break; - case IOPOL_VFS_IGNORE_PERMISSIONS_ON: - os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_IGNORE_NODE_PERMISSIONS, relaxed); - break; - default: - break; + if (thread != THREAD_NULL) { + set_thread_vfs_ignore_permissions_policy(thread, policy); + } else { + set_proc_vfs_ignore_permissions_policy(p, policy); } out_ok: @@ -2863,40 +3153,20 @@ static int iopolicysys_vfs_nocache_write_fs_blksize(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param) { - thread_t thread; - - switch (scope) { - case IOPOL_SCOPE_THREAD: - thread = current_thread(); - break; - case IOPOL_SCOPE_PROCESS: - thread = THREAD_NULL; - break; - default: + if (scope != IOPOL_SCOPE_PROCESS) { return EINVAL; } if (cmd == IOPOL_CMD_GET) { - if (thread != THREAD_NULL) { - struct uthread *ut = get_bsdthread_info(thread); - policy = ut->uu_flag & UT_FS_BLKSIZE_NOCACHE_WRITES ? - IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON : IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_DEFAULT; - } else { - policy = (os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE) ? - IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON : IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_DEFAULT; - } + policy = (os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE) ? + IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON : IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_DEFAULT; iop_param->iop_policy = policy; return 0; } - /* Once set, we don't allow the process or thread to clear it. 
*/ - if ((cmd == IOPOL_CMD_SET) && (policy == IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON)) { - if (thread != THREAD_NULL) { - struct uthread *ut = get_bsdthread_info(thread); - ut->uu_flag |= UT_FS_BLKSIZE_NOCACHE_WRITES; - } else { - os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE, relaxed); - } + /* Once set, we don't allow the process to clear it. */ + if (policy == IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON) { + os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE, relaxed); return 0; } @@ -3002,6 +3272,67 @@ out: return error; } +#define ENTITLED_RESERVE_ACCESS_ENTITLEMENT \ + "com.apple.private.vfs.entitled-reserve-access" +static int +iopolicysys_vfs_entitled_reserve_access(struct proc *p, int cmd, int scope, + int policy, struct _iopol_param_t *iop_param) +{ + struct uthread *ut; + + switch (scope) { + case IOPOL_SCOPE_THREAD: + ut = get_bsdthread_info(current_thread()); + break; + case IOPOL_SCOPE_PROCESS: + ut = NULL; + break; + default: + return EINVAL; + } + + if (cmd == IOPOL_CMD_GET) { + if (scope == IOPOL_SCOPE_THREAD) { + policy = (os_atomic_load(&ut->uu_flag, relaxed) & UT_FS_ENTITLED_RESERVE_ACCESS) ? + IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON : IOPOL_VFS_ENTITLED_RESERVE_ACCESS_OFF; + } else { + policy = (os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS) ? + IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON : IOPOL_VFS_ENTITLED_RESERVE_ACCESS_OFF; + } + iop_param->iop_policy = policy; + return 0; + } + + if (cmd != IOPOL_CMD_SET) { + return EINVAL; + } + + if (!IOCurrentTaskHasEntitlement(ENTITLED_RESERVE_ACCESS_ENTITLEMENT)) { + return EPERM; + } + + switch (policy) { + case IOPOL_VFS_ENTITLED_RESERVE_ACCESS_OFF: + if (scope == IOPOL_SCOPE_THREAD) { + os_atomic_andnot(&ut->uu_flag, UT_FS_ENTITLED_RESERVE_ACCESS, relaxed); + } else { + os_atomic_andnot(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS, relaxed); + } + break; + case IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON: + if (scope == IOPOL_SCOPE_THREAD) { + os_atomic_or(&ut->uu_flag, UT_FS_ENTITLED_RESERVE_ACCESS, relaxed); + } else { + os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS, relaxed); + } + break; + default: + return EINVAL; + } + + return 0; +} + void proc_apply_task_networkbg(int pid, thread_t thread) { diff --git a/bsd/kern/kern_shutdown.c b/bsd/kern/kern_shutdown.c index d3d2517be..433e88abf 100644 --- a/bsd/kern/kern_shutdown.c +++ b/bsd/kern/kern_shutdown.c @@ -150,6 +150,15 @@ get_system_inshutdown() return system_inshutdown; } +extern int OSKextIsInUserspaceReboot(void); + +int +get_system_inuserspacereboot() +{ + /* set by launchd before performing a userspace reboot */ + return OSKextIsInUserspaceReboot(); +} + __abortlike static void panic_kernel(int howto, char *message) @@ -268,6 +277,11 @@ reboot_kernel(int howto, char *message) if (!(howto & RB_PANIC) || !kdp_has_polled_corefile()) #endif /* DEVELOPMENT || DEBUG */ { +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP + /* Disable user space core dump before unmounting non-system volume so + * that dext cores wouldn't be written to system volume */ + do_coredump = 0; +#endif /* COREDUMP || CONFIG_UCOREDUMP */ startTime = mach_absolute_time(); vfs_unmountall(TRUE); halt_log_enter("vfs_unmountall", 0, mach_absolute_time() - startTime); diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index d32c03ce8..93d9d7c61 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -145,8 +145,8 @@ extern void doexception(int exc, mach_exception_code_t code, 
mach_exception_subcode_t sub); static void stop(proc_t, proc_t); -int cansignal_nomac(proc_t, kauth_cred_t, proc_t, int); -int cansignal(proc_t, kauth_cred_t, proc_t, int); +bool cansignal_nomac(proc_t, kauth_cred_t, proc_t, int); +bool cansignal(proc_t, kauth_cred_t, proc_t, int); int killpg1(proc_t, int, int, int, int); kern_return_t do_bsdexception(int, int, int); void __posix_sem_syscall_return(kern_return_t); @@ -297,39 +297,45 @@ signal_setast(thread_t sig_actthread) act_set_astbsd(sig_actthread); } -int +bool cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum) { /* you can signal yourself */ if (src == dst) { - return 1; + return true; } - /* you can't send the init proc SIGKILL, even if root */ - if (signum == SIGKILL && dst == initproc) { - return 0; + /* + * You can't signal the initproc, even if root. + * Note that this still permits the kernel itself to signal initproc directly, + * e.g SIGCHLD when reparenting or SIGTERM at shutdown, because those are + * not considered to originate from a user process, so the cansignal() + * check isn't performed. + */ + if (dst == initproc) { + return false; } /* otherwise, root can always signal */ if (kauth_cred_issuser(uc_src)) { - return 1; + return true; } /* processes in the same session can send SIGCONT to each other */ if (signum == SIGCONT && proc_sessionid(src) == proc_sessionid(dst)) { - return 1; + return true; } #if XNU_TARGET_OS_IOS // Allow debugging of third party drivers on iOS if (proc_is_third_party_debuggable_driver(dst)) { - return 1; + return true; } #endif /* XNU_TARGET_OS_IOS */ /* the source process must be authorized to signal the target */ { - int allowed = 0; + bool allowed = false; kauth_cred_t uc_dst = NOCRED, uc_ref = NOCRED; uc_dst = uc_ref = kauth_cred_proc_ref(dst); @@ -342,7 +348,7 @@ cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum) kauth_cred_getruid(uc_src) == kauth_cred_getsvuid(uc_dst) || kauth_cred_getuid(uc_src) == kauth_cred_getruid(uc_dst) || kauth_cred_getuid(uc_src) == kauth_cred_getsvuid(uc_dst)) { - allowed = 1; + allowed = true; } if (uc_ref != NOCRED) { @@ -359,13 +365,13 @@ cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum) * `dst`? The ucred is referenced by the caller so internal fileds can be used * safely. 
*/ -int +bool cansignal(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum) { #if CONFIG_MACF - struct proc_ident dst_ident = proc_ident(dst); + struct proc_ident dst_ident = proc_ident_with_policy(dst, IDENT_VALIDATION_PROC_MAY_EXEC | IDENT_VALIDATION_PROC_MAY_EXIT); if (mac_proc_check_signal(src, NULL, &dst_ident, signum)) { - return 0; + return false; } #endif @@ -399,8 +405,7 @@ static int signal_is_restricted(proc_t p, int signum) { if (sigmask(signum) & sigrestrictmask()) { - if (sigrestrict_arg == 0 && - task_get_apptype(proc_task(p)) == TASK_APPTYPE_APP_DEFAULT) { + if (sigrestrict_arg == 0 && task_is_app(proc_task(p))) { return ENOTSUP; } else { return EINVAL; @@ -1125,8 +1130,9 @@ __pthread_kill(__unused proc_t p, struct __pthread_kill_args *uap, * workq threads must have kills enabled through either * BSDTHREAD_CTL_WORKQ_ALLOW_KILL or BSDTHREAD_CTL_WORKQ_ALLOW_SIGMASK */ - if ((thread_get_tag(target_act) & THREAD_TAG_WORKQUEUE) && - !(uth->uu_workq_pthread_kill_allowed || p->p_workq_allow_sigmask)) { + if (((thread_get_tag(target_act) & THREAD_TAG_WORKQUEUE) && + !(uth->uu_workq_pthread_kill_allowed || p->p_workq_allow_sigmask)) || + (thread_get_tag(target_act) & THREAD_TAG_AIO_WORKQUEUE)) { error = ENOTSUP; goto out; } @@ -1386,7 +1392,7 @@ kill(proc_t cp, struct kill_args *uap, __unused int32_t *retval) if (uap->pid > 0) { /* kill single process */ if ((p = proc_find(uap->pid)) == NULL) { - if ((p = pzfind(uap->pid)) != NULL) { + if (pzfind(uap->pid)) { /* * POSIX 1003.1-2001 requires returning success when killing a * zombie; see Rationale for kill(2). @@ -1862,7 +1868,8 @@ set_thread_extra_flags(task_t task, struct uthread *uth, os_reason_t reason) reason->osr_flags |= OS_REASON_FLAG_SHAREDREGION_FAULT; #if __has_feature(ptrauth_calls) - if (!vm_shared_region_reslide_restrict || task_is_hardened_binary(current_task())) { + if (!vm_shared_region_reslide_restrict || + (task_get_platform_restrictions_version(current_task()) >= 1)) { reslide_shared_region = TRUE; } #endif /* __has_feature(ptrauth_calls) */ @@ -1944,7 +1951,8 @@ again: if (((uth->uu_flag & UT_NO_SIGMASK) == 0) && (((uth->uu_sigmask & mask) == 0) || (uth->uu_sigwait & mask))) { thread_t th = get_machthread(uth); - if (skip_wqthreads && (thread_get_tag(th) & THREAD_TAG_WORKQUEUE)) { + if ((skip_wqthreads && (thread_get_tag(th) & THREAD_TAG_WORKQUEUE)) || + (thread_get_tag(th) & THREAD_TAG_AIO_WORKQUEUE)) { /* Workqueue threads may be parked in the kernel unable to * deliver signals for an extended period of time, so skip them * in favor of pthreads in a first pass. (rdar://50054475). */ @@ -3057,7 +3065,6 @@ postsig_locked(int signum) int mask, returnmask; struct uthread * ut; os_reason_t ut_exit_reason = OS_REASON_NULL; - int coredump_flags = 0; #if DIAGNOSTIC if (signum == 0) { @@ -3097,29 +3104,70 @@ postsig_locked(int signum) p->p_sigacts.ps_sig = signum; proc_signalend(p, 1); proc_unlock(p); - if (task_is_driver(proc_task(p))) { - coredump_flags |= COREDUMP_FULLFSYNC; - } +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP + /* + * For now, driver dumps are only performed by xnu. + * Regular processes can be configured to use xnu + * (synchronously generating very large core files), + * or xnu can generate a specially tagged corpse which + * (depending on other configuration) will cause + * ReportCrash to dump a core file asynchronously. 
+ * + * The userland dumping path must operate + * asynchronously to avoid deadlocks, yet may have + * unexpected failures => indicate dump *initiation* + * via WCOREFLAG (or CLD_DUMPED). + */ + do { + if (task_is_driver(proc_task(p))) { #if CONFIG_COREDUMP - if (coredump(p, 0, coredump_flags) == 0) { - signum |= WCOREFLAG; - } -#endif + if (coredump(p, 0, COREDUMP_FULLFSYNC) == 0) { + signum |= WCOREFLAG; + } +#endif /* CONFIG_COREDUMP */ + break; + } +#if CONFIG_UCOREDUMP + if (do_ucoredump) { + /* + * A compatibility nod to existing + * coredump behavior: only set + * WCOREFLAG here if the user has + * implicitly asked for a core + * file and it passes security + * checks. (A core file might still + * be dumped because of other policy.) + */ + if (proc_limitgetcur(p, RLIMIT_CORE) != 0 && + is_coredump_eligible(p) == 0) { + signum |= WCOREFLAG; + } + break; + } +#endif /* CONFIG_UCOREDUMP */ +#if CONFIG_COREDUMP + if (coredump(p, 0, 0) == 0) { + signum |= WCOREFLAG; + } +#endif /* CONFIG_COREDUMP */ + } while (0); +#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */ } else { proc_signalend(p, 1); proc_unlock(p); } #if CONFIG_DTRACE - bzero((caddr_t)&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo)); + bzero(&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo)); - ut->t_dtrace_siginfo.si_signo = signum; + const int signo = signum & ~WCOREFLAG; + ut->t_dtrace_siginfo.si_signo = signo; ut->t_dtrace_siginfo.si_pid = p->si_pid; ut->t_dtrace_siginfo.si_uid = p->si_uid; ut->t_dtrace_siginfo.si_status = WEXITSTATUS(p->si_status); /* Fire DTrace proc:::fault probe when signal is generated by hardware. */ - switch (signum) { + switch (signo) { case SIGILL: case SIGBUS: case SIGSEGV: case SIGFPE: case SIGTRAP: DTRACE_PROC2(fault, int, (int)(ut->uu_code), siginfo_t *, &(ut->t_dtrace_siginfo)); break; @@ -3128,7 +3176,7 @@ postsig_locked(int signum) } - DTRACE_PROC3(signal__handle, int, signum, siginfo_t *, &(ut->t_dtrace_siginfo), + DTRACE_PROC3(signal__handle, int, signo, siginfo_t *, &(ut->t_dtrace_siginfo), void (*)(void), SIG_DFL); #endif diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index a95320587..a6ac31632 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -235,7 +235,7 @@ kern_open_file_for_direct_io(const char * name, int isssd = 0; uint32_t flags = 0; uint32_t blksize; - off_t maxiocount, count, segcount, wbctotal; + off_t maxiocount, count, segcount, wbctotal, set_file_size; boolean_t locked = FALSE; int fmode; mode_t cmode; @@ -341,9 +341,10 @@ kern_open_file_for_direct_io(const char * name, } } - if (set_file_size_max) { + if ((set_file_size = set_file_size_max)) { // set file size if (wbctotal) { + // only hibernate if (wbctotal >= set_file_size_min) { set_file_size_min = HIBERNATE_MIN_FILE_SIZE; } else { @@ -352,32 +353,41 @@ kern_open_file_for_direct_io(const char * name, set_file_size_min = HIBERNATE_MIN_FILE_SIZE; } } - set_file_size_max = set_file_size_min; + set_file_size = set_file_size_min; } if (fs_free_size) { mpFree += va.va_data_alloc; - if ((mpFree < set_file_size_max) || ((mpFree - set_file_size_max) < fs_free_size)) { - set_file_size_max = mpFree - fs_free_size; + if ((mpFree < set_file_size) || ((mpFree - set_file_size) < fs_free_size)) { + set_file_size = mpFree - fs_free_size; if (0 == set_file_size_min) { // passing zero for set_file_size_min (coredumps) // means caller only accepts set_file_size_max error = ENOSPC; goto out; } - if (set_file_size_max < set_file_size_min) { - set_file_size_max = set_file_size_min; - } - 
printf("kern_direct_file(%s): using reduced size %qd\n", - ref->name, set_file_size_max); // if set_file_size_min is passed (hibernation), // it does not check free space on disk } } - error = vnode_setsize(ref->vp, set_file_size_max, IO_NOZEROFILL | IO_NOAUTH, ref->ctx); + while (TRUE) { + if (set_file_size < set_file_size_min) { + set_file_size = set_file_size_min; + } + if (set_file_size < set_file_size_max) { + printf("kern_direct_file(%s): using reduced size %qd\n", + ref->name, set_file_size); + } + error = vnode_setsize(ref->vp, set_file_size, IO_NOZEROFILL | IO_NOAUTH, ref->ctx); + if ((ENOSPC == error) && set_file_size_min && (set_file_size > set_file_size_min) && (set_file_size > fs_free_size)) { + set_file_size -= fs_free_size; + continue; + } + break; + } if (error) { goto out; } - ref->filelength = set_file_size_max; + ref->filelength = set_file_size; } } else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) { /* Partition. */ @@ -684,10 +694,10 @@ kern_file_mount(struct kern_direct_file_io_ref_t * ref) void kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, off_t write_offset, void * addr, size_t write_length, - off_t discard_offset, off_t discard_end, bool unlink) + off_t discard_offset, off_t discard_end, off_t set_file_size, bool unlink) { int error; - printf("kern_close_file_for_direct_io(%p)\n", ref); + printf("kern_close_file_for_direct_io(%p) %qd\n", ref, set_file_size); if (!ref) { return; @@ -737,7 +747,9 @@ kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, if (addr && write_length) { (void) kern_write_file(ref, write_offset, addr, write_length, IO_SKIP_ENCRYPTION); } - + if (set_file_size) { + error = vnode_setsize(ref->vp, set_file_size, IO_NOZEROFILL | IO_NOAUTH, ref->ctx); + } error = vnode_close(ref->vp, FWRITE, ref->ctx); ref->vp = NULLVP; diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index d5c281a9d..1e9baecfc 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -156,6 +156,7 @@ #include #include +#include #include #include #include @@ -219,6 +220,9 @@ extern unsigned int vm_page_free_target; extern unsigned int vm_page_free_reserved; extern unsigned int vm_page_max_speculative_age_q; +static uint64_t userspacereboottime = 0; +static unsigned int userspacerebootpurpose = 0; + #if (DEVELOPMENT || DEBUG) extern uint32_t vm_page_creation_throttled_hard; extern uint32_t vm_page_creation_throttled_soft; @@ -318,9 +322,12 @@ STATIC int sysctl_imgsrcdev(struct sysctl_oid *oidp, void *arg1, int arg2, struc #endif STATIC int sysctl_usrstack(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_usrstack64(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); -#if CONFIG_COREDUMP +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP STATIC int sysctl_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_suid_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#if CONFIG_UCOREDUMP +STATIC int sysctl_ucoredump(struct sysctl_oid *, void *, int, struct sysctl_req *); +#endif #endif STATIC int sysctl_delayterm(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); STATIC int sysctl_rage_vnode(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); @@ -1510,7 +1517,8 @@ sysctl_procargsx(int *name, u_int namelen, user_addr_t where, if (vm_map_copy_overwrite(kernel_map, (vm_map_address_t)copy_start, - tmp, (vm_map_size_t) arg_size, FALSE) != KERN_SUCCESS) { + tmp, (vm_map_size_t) arg_size, + FALSE) != KERN_SUCCESS) { error = EIO; goto finish; } @@ -2006,7 +2014,9 @@ sysctl_system_version_compat SYSCTL_PROC(_kern, OID_AUTO, system_version_compat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_system_version_compat, "A", ""); +#endif /* XNU_TARGET_OS_OSX */ +#if XNU_TARGET_OS_OSX || defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) char osproductversioncompat[48] = { '\0' }; static int @@ -2023,12 +2033,42 @@ SYSCTL_PROC(_kern, OID_AUTO, osproductversioncompat, CTLFLAG_RW | CTLFLAG_KERN | CTLTYPE_STRING | CTLFLAG_LOCKED, osproductversioncompat, sizeof(osproductversioncompat), sysctl_osproductversioncompat, "A", "The ProductVersion from SystemVersionCompat.plist"); -#endif +#endif /* XNU_TARGET_OS_OSX || defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) */ char osproductversion[48] = { '\0' }; static char iossupportversion_string[48] = { '\0' }; +#if defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) +/* + * Equivalent to dyld_program_sdk_at_least(dyld_fall_2025_os_versions). 
+ */ +static bool +proc_2025_fall_os_sdk_or_later(struct proc *p) +{ + const uint32_t proc_sdk_ver = proc_sdk(p); + + switch (proc_platform(p)) { + case PLATFORM_MACOS: + return proc_sdk_ver >= 0x00100000; // DYLD_MACOSX_VERSION_16_0 + case PLATFORM_IOS: + case PLATFORM_IOSSIMULATOR: + case PLATFORM_MACCATALYST: + return proc_sdk_ver >= 0x00130000; // DYLD_IOS_VERSION_19_0 + case PLATFORM_BRIDGEOS: + return proc_sdk_ver >= 0x000a0000; // DYLD_BRIDGEOS_VERSION_10_0 + case PLATFORM_TVOS: + case PLATFORM_TVOSSIMULATOR: + return proc_sdk_ver >= 0x00130000; // DYLD_TVOS_VERSION_19_0 + case PLATFORM_WATCHOS: + case PLATFORM_WATCHOSSIMULATOR: + return proc_sdk_ver >= 0x000c0000; // DYLD_WATCHOS_VERSION_12_0 + default: + return true; + } +} +#endif /* defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) */ + static int sysctl_osproductversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { @@ -2039,18 +2079,22 @@ sysctl_osproductversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, #if XNU_TARGET_OS_OSX if (task_has_system_version_compat_enabled(current_task()) && (osproductversioncompat[0] != '\0')) { return sysctl_handle_string(oidp, osproductversioncompat, arg2, req); - } else { - return sysctl_handle_string(oidp, arg1, arg2, req); } -#elif defined(XNU_TARGET_OS_XR) +#endif /* XNU_TARGET_OS_OSX */ + +#if defined(XNU_TARGET_OS_XR) if (proc_platform(req->p) == PLATFORM_IOS && (iossupportversion_string[0] != '\0')) { return sysctl_handle_string(oidp, iossupportversion_string, arg2, req); - } else { - return sysctl_handle_string(oidp, arg1, arg2, req); } -#else +#endif /* defined(XNU_TARGET_OS_XR) */ + +#if defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) + if (!proc_2025_fall_os_sdk_or_later(req->p) && (osproductversioncompat[0] != '\0')) { + return sysctl_handle_string(oidp, osproductversioncompat, arg2, req); + } +#endif /* defined(XNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) */ + return sysctl_handle_string(oidp, arg1, arg2, req); -#endif } #if XNU_TARGET_OS_OSX @@ -2487,10 +2531,6 @@ extern int sched_allow_rt_smt; SYSCTL_INT(_kern, OID_AUTO, sched_allow_rt_smt, CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, &sched_allow_rt_smt, 0, ""); -extern int sched_allow_rt_steal; -SYSCTL_INT(_kern, OID_AUTO, sched_allow_rt_steal, - CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, - &sched_allow_rt_steal, 0, ""); extern int sched_backup_cpu_timeout_count; SYSCTL_INT(_kern, OID_AUTO, sched_backup_cpu_timeout_count, CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, @@ -3352,8 +3392,7 @@ SYSCTL_UINT(_kern, OID_AUTO, secure_coredump, CTLFLAG_RD, &sc_dump_mode, 0, "sec #endif /* EXCLAVES_COREDUMP */ - -#if CONFIG_COREDUMP +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP SYSCTL_STRING(_kern, KERN_COREFILE, corefile, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, @@ -3413,7 +3452,34 @@ SYSCTL_PROC(_kern, KERN_SUGID_COREDUMP, sugid_coredump, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_suid_coredump, "I", ""); -#endif /* CONFIG_COREDUMP */ +#if CONFIG_UCOREDUMP + +STATIC int +sysctl_ucoredump +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ +#ifdef SECURE_KERNEL + (void)req; + return ENOTSUP; +#else + int new_value, changed; + int error = sysctl_io_number(req, do_ucoredump, sizeof(int), &new_value, &changed); + if (changed) { + if (new_value == 0 || new_value == 1) { + do_ucoredump = new_value; + } else { + error = EINVAL; + } + } + return error; +#endif +} + +SYSCTL_PROC(_kern, OID_AUTO, ucoredump, + CTLTYPE_INT | CTLFLAG_RW | 
CTLFLAG_LOCKED, + 0, 0, sysctl_ucoredump, "I", ""); +#endif /* CONFIG_UCOREDUMP */ +#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */ #if CONFIG_KDP_INTERACTIVE_DEBUGGING @@ -4394,12 +4460,14 @@ SYSCTL_PROC(_vm, OID_AUTO, add_wire_count_over_user_limit, CTLTYPE_QUAD | CTLFLA #if DEVELOPMENT || DEBUG /* These sysctls are used to test the wired limit. */ -extern unsigned int vm_page_wire_count; -extern uint32_t vm_lopage_free_count; -extern unsigned int vm_page_stolen_count; -SYSCTL_INT(_vm, OID_AUTO, page_wire_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_wire_count, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, page_wire_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_wire_count, 0, + "The number of physical pages which are pinned and cannot be evicted"); +#if XNU_VM_HAS_LOPAGE SYSCTL_INT(_vm, OID_AUTO, lopage_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_lopage_free_count, 0, ""); +#endif SYSCTL_INT(_vm, OID_AUTO, page_stolen_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stolen_count, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, page_swapped_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_swapped_count, 0, + "The number of virtual pages whose contents are currently compressed and swapped to disk"); /* * Setting the per task variable exclude_physfootprint_ledger to 1 will allow the calling task to exclude memory entries that are @@ -4929,6 +4997,7 @@ SCALABLE_COUNTER_DECLARE(oslog_e_metadata_count); SCALABLE_COUNTER_DECLARE(oslog_e_metadata_dropped_count); SCALABLE_COUNTER_DECLARE(oslog_e_signpost_count); SCALABLE_COUNTER_DECLARE(oslog_e_signpost_dropped_count); +SCALABLE_COUNTER_DECLARE(oslog_e_replay_failure_count); SCALABLE_COUNTER_DECLARE(oslog_e_query_count); SCALABLE_COUNTER_DECLARE(oslog_e_query_error_count); SCALABLE_COUNTER_DECLARE(oslog_e_trace_mode_set_count); @@ -4989,6 +5058,8 @@ SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_signpost_count, oslog_e_signpost_count, "Number of signposts retrieved from the exclaves log server"); SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_signpost_dropped_count, oslog_e_signpost_dropped_count, "Number of dropped signposts retrieved from the exclaves log server"); +SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_replay_failure_count, oslog_e_replay_failure_count, + "Number of dropped messages that couldn't be replayed and failed generically"); SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_query_count, oslog_e_query_count, "Number of sucessful queries to the exclaves log server"); SYSCTL_SCALABLE_COUNTER(_debug, oslog_e_query_error_count, oslog_e_query_error_count, @@ -5545,6 +5616,31 @@ sysctl_get_thread_group_id SYSCTL_HANDLER_ARGS SYSCTL_PROC(_kern, OID_AUTO, thread_group_id, CTLFLAG_RD | CTLFLAG_LOCKED | CTLTYPE_QUAD, 0, 0, &sysctl_get_thread_group_id, "I", "thread group id of the thread"); +extern kern_return_t sysctl_clutch_thread_group_cpu_time_for_thread(thread_t thread, int sched_bucket, uint64_t *cpu_stats); + +static int +sysctl_get_clutch_bucket_group_cpu_stats SYSCTL_HANDLER_ARGS +{ + int error; + kern_return_t kr; + int sched_bucket = -1; + error = SYSCTL_IN(req, &sched_bucket, sizeof(sched_bucket)); + if (error) { + return error; + } + uint64_t cpu_stats[2]; + kr = sysctl_clutch_thread_group_cpu_time_for_thread(current_thread(), sched_bucket, cpu_stats); + error = mach_to_bsd_errno(kr); + if (error) { + return error; + } + return SYSCTL_OUT(req, cpu_stats, sizeof(cpu_stats)); +} + +SYSCTL_PROC(_kern, OID_AUTO, clutch_bucket_group_cpu_stats, CTLFLAG_RW | CTLFLAG_LOCKED | CTLTYPE_OPAQUE, + 0, 0, &sysctl_get_clutch_bucket_group_cpu_stats, "I", + "CPU used and blocked time for the current 
thread group at a specified scheduling bucket"); + STATIC int sysctl_thread_group_count(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -5601,6 +5697,77 @@ SYSCTL_PROC(_kern, OID_AUTO, grade_cputype, 0, 0, &sysctl_grade_cputype, "S", "grade value of cpu_type_t+cpu_sub_type_t"); + +#if DEVELOPMENT || DEBUG +STATIC int +sysctl_binary_grade_override( __unused struct sysctl_oid *oidp, __unused void *arg1, + __unused int arg2, struct sysctl_req *req) +{ + int error; + user_addr_t oldp = 0, newp = 0; + size_t *oldlenp = NULL; + size_t newlen = 0; + + oldp = req->oldptr; + oldlenp = &(req->oldlen); + newp = req->newptr; + newlen = req->newlen; + + /* We want the current length, and maybe the string itself */ + if (oldlenp) { + char existing_overrides[256] = { 0 }; + + size_t currlen = bingrade_get_override_string(existing_overrides, sizeof(existing_overrides)); + + if (oldp && currlen > 0) { + if (*oldlenp < currlen) { + return ENOMEM; + } + /* NOTE - we do not copy the NULL terminator */ + error = copyout(existing_overrides, oldp, currlen); + if (error) { + return error; + } + } + /* return length of overrides minus the NULL terminator (just like strlen) */ + req->oldidx = currlen; + } + + /* We want to set the override string to something */ + if (newp) { + char *tmp_override = (char *)kalloc_data(newlen + 1, Z_WAITOK | Z_ZERO); + if (!tmp_override) { + return ENOMEM; + } + + error = copyin(newp, tmp_override, newlen); + if (error) { + kfree_data(tmp_override, newlen + 1); + return error; + } + + tmp_override[newlen] = 0; /* Terminate string */ + + /* Set the binary grading overrides */ + if (binary_grade_overrides_update(tmp_override) == 0) { + /* Nothing got set. */ + kfree_data(tmp_override, newlen + 1); + return EINVAL; + } + + kfree_data(tmp_override, newlen + 1); + } + + return 0; +} + + +SYSCTL_PROC(_kern, OID_AUTO, grade_override, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, &sysctl_binary_grade_override, "A", + ""); +#endif /* DEVELOPMENT || DEBUG */ + extern boolean_t allow_direct_handoff; SYSCTL_INT(_kern, OID_AUTO, direct_handoff, CTLFLAG_KERN | CTLFLAG_RW | CTLFLAG_LOCKED, @@ -6135,14 +6302,23 @@ uuid_string_t trial_treatment_id; uuid_string_t trial_experiment_id; int trial_deployment_id = -1; -SYSCTL_STRING(_kern, OID_AUTO, trial_treatment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, trial_treatment_id, sizeof(trial_treatment_id), ""); -SYSCTL_STRING(_kern, OID_AUTO, trial_experiment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, trial_experiment_id, sizeof(trial_experiment_id), ""); -SYSCTL_INT(_kern, OID_AUTO, trial_deployment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &trial_deployment_id, 0, ""); +SYSCTL_STRING(_kern, OID_AUTO, trial_treatment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, trial_treatment_id, sizeof(trial_treatment_id), ""); +SYSCTL_STRING(_kern, OID_AUTO, trial_experiment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, trial_experiment_id, sizeof(trial_experiment_id), ""); +SYSCTL_INT(_kern, OID_AUTO, trial_deployment_id, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, &trial_deployment_id, 0, ""); #if (DEVELOPMENT || DEBUG) /* For unit testing setting factors & limits. 
*/ unsigned int testing_experiment_factor; -EXPERIMENT_FACTOR_UINT(_kern, testing_experiment_factor, &testing_experiment_factor, 5, 10, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, testing_experiment_factor, &testing_experiment_factor, 5, 10, ""); + +static int32_t experiment_factor_test; +EXPERIMENT_FACTOR_INT(test, &experiment_factor_test, 0, 32, "test factor"); + +#if MACH_ASSERT && __arm64__ +/* rdar://149041040 */ +extern unsigned int panic_on_jit_guard; +EXPERIMENT_FACTOR_UINT(jitguard, &panic_on_jit_guard, 0, 7, "Panic on JIT guard failure"); +#endif /* MACH_ASSERT && __arm64__ */ extern int exception_log_max_pid; SYSCTL_INT(_debug, OID_AUTO, exception_log_max_pid, CTLFLAG_RW | CTLFLAG_LOCKED, &exception_log_max_pid, 0, "Log exceptions for all processes up to this pid"); @@ -6186,6 +6362,212 @@ SYSCTL_PROC(_kern, OID_AUTO, page_protection_type, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_page_protection_type, "I", "Type of page protection that the system supports"); +#if CONFIG_SPTM && HAS_SPTM_SYSCTL +extern bool disarm_protected_io; +static int sysctl_sptm_disarm_protected_io SYSCTL_HANDLER_ARGS +{ + int error = 0; + + uint64_t old_disarm_protected_io = (uint64_t) disarm_protected_io; + error = SYSCTL_OUT(req, &old_disarm_protected_io, sizeof(old_disarm_protected_io)); + + if (error) { + return error; + } + + uint64_t new_disarm_protected_io = old_disarm_protected_io; + if (req->newptr) { + error = SYSCTL_IN(req, &new_disarm_protected_io, sizeof(new_disarm_protected_io)); + if (!disarm_protected_io && new_disarm_protected_io) { + sptm_sysctl(SPTM_SYSCTL_DISARM_PROTECTED_IO, SPTM_SYSCTL_SET, 1); + os_atomic_thread_fence(release); + disarm_protected_io = true; + } + } + + return error; +} +SYSCTL_PROC(_kern, OID_AUTO, sptm_disarm_protected_io, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_sptm_disarm_protected_io, "Q", ""); + +/** + * Usage of kern.sptm_sysctl_poke + * + * This sysctl provides a convenient way to trigger the "getter" handler of a + * specified SPTM sysctl. With this sysctl, you can trigger arbitrary SPTM + * code without modifying xnu source code. All you need to do is define a + * new SPTM sysctl and implement its "getter". After that, you can write + * the SPTM sysctl number to this sysctl to trigger it. + */ +static int sysctl_sptm_sysctl_poke SYSCTL_HANDLER_ARGS +{ + int error = 0; + + /* Always read-as-zero. */ + const uint64_t out = 0; + error = SYSCTL_OUT(req, &out, sizeof(out)); + + if (error) { + return error; + } + + uint64_t selector; + if (req->newptr) { + error = SYSCTL_IN(req, &selector, sizeof(selector)); + sptm_sysctl(selector, SPTM_SYSCTL_GET, 0); + } + + return error; +} +SYSCTL_PROC(_kern, OID_AUTO, sptm_sysctl_poke, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_sptm_sysctl_poke, "Q", ""); +#endif /* CONFIG_SPTM && HAS_SPTM_SYSCTL */ + +#if CONFIG_SPTM && (DEVELOPMENT || DEBUG) +/** + * Sysctls to get SPTM allowed I/O ranges, pmap I/O ranges and I/O ranges by index. + * Used by SEAR/LASER tools. 
+ */ +static int +sysctl_sptm_allowed_io_ranges SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + sptm_io_range_t io_range = { 0 }; + unsigned int index = 0; + + int error = SYSCTL_IN(req, &index, sizeof(index)); + if (error) { + return error; + } + + libsptm_error_t ret = sptm_get_info(INFO_SPTM_ALLOWED_IO_RANGES, index, &io_range); + if (__improbable(ret != LIBSPTM_SUCCESS)) { + return EINVAL; + } + + return SYSCTL_OUT(req, &io_range, sizeof(io_range)); +} +SYSCTL_PROC(_kern, OID_AUTO, sptm_allowed_io_ranges, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_sptm_allowed_io_ranges, "S,sptm_io_range_t", "SPTM allowed I/O ranges by index"); + +static int +sysctl_sptm_allowed_io_ranges_count SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int count = 0; + + libsptm_error_t ret = sptm_get_info(INFO_SPTM_ALLOWED_IO_RANGES_COUNT, 0, &count); + if (__improbable(ret != LIBSPTM_SUCCESS)) { + return EINVAL; + } + + return SYSCTL_OUT(req, &count, sizeof(count)); +} +SYSCTL_PROC(_kern, OID_AUTO, sptm_allowed_io_ranges_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_sptm_allowed_io_ranges_count, "I", "SPTM allowed I/O ranges count"); + +static int +sysctl_sptm_pmap_io_ranges SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + sptm_io_range_t io_range = { 0 }; + unsigned int index = 0; + + int error = SYSCTL_IN(req, &index, sizeof(index)); + if (error) { + return error; + } + + libsptm_error_t ret = sptm_get_info(INFO_SPTM_PMAP_IO_RANGES, index, &io_range); + if (__improbable(ret != LIBSPTM_SUCCESS)) { + return EINVAL; + } + + return SYSCTL_OUT(req, &io_range, sizeof(io_range)); +} +SYSCTL_PROC(_kern, OID_AUTO, sptm_pmap_io_ranges, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_sptm_pmap_io_ranges, "S,sptm_io_range_t", "SPTM pmap I/O ranges by index"); + +static int +sysctl_sptm_pmap_io_ranges_count SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int count = 0; + + libsptm_error_t ret = sptm_get_info(INFO_SPTM_PMAP_IO_RANGES_COUNT, 0, &count); + if (__improbable(ret != LIBSPTM_SUCCESS)) { + return EINVAL; + } + + return SYSCTL_OUT(req, &count, sizeof(count)); +} +SYSCTL_PROC(_kern, OID_AUTO, sptm_pmap_io_ranges_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_sptm_pmap_io_ranges_count, "I", "SPTM pmap I/O ranges count"); + +static int +sysctl_sptm_io_ranges SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + sptm_io_range_t io_range = { 0 }; + unsigned int index = 0; + + int error = SYSCTL_IN(req, &index, sizeof(index)); + if (error) { + return error; + } + + libsptm_error_t ret = sptm_get_info(INFO_SPTM_IO_RANGES, index, &io_range); + if (__improbable(ret != LIBSPTM_SUCCESS)) { + return EINVAL; + } + + return SYSCTL_OUT(req, &io_range, sizeof(io_range)); +} +SYSCTL_PROC(_kern, OID_AUTO, sptm_io_ranges, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_sptm_io_ranges, "S,sptm_io_range_t", "SPTM I/O ranges by index"); + +static int +sysctl_sptm_io_ranges_count SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int count = 0; + + libsptm_error_t ret = sptm_get_info(INFO_SPTM_IO_RANGES_COUNT, 0, &count); + if (__improbable(ret != LIBSPTM_SUCCESS)) { + return EINVAL; + } + + return SYSCTL_OUT(req, &count, sizeof(count)); +} +SYSCTL_PROC(_kern, OID_AUTO, sptm_io_ranges_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_sptm_io_ranges_count, "I", "SPTM I/O ranges count"); +#endif /* CONFIG_SPTM && (DEVELOPMENT || DEBUG) */ + +#if 
__ARM64_PMAP_SUBPAGE_L1__ && CONFIG_SPTM +extern bool surt_ready; +static int +sysctl_surt_ready SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int surt_ready_uint = (unsigned int)surt_ready; + return SYSCTL_OUT(req, &surt_ready_uint, sizeof(surt_ready_uint)); +} +SYSCTL_PROC(_kern, OID_AUTO, surt_ready, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_surt_ready, "I", "SURT system readiness"); +#endif /* __ARM64_PMAP_SUBPAGE_L1__ && CONFIG_SPTM */ + +#if __arm64__ && (DEBUG || DEVELOPMENT) +extern unsigned int pmap_wcrt_on_non_dram_count_get(void); +static int +sysctl_pmap_wcrt_on_non_dram_count SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + unsigned int count = pmap_wcrt_on_non_dram_count_get(); + + return SYSCTL_OUT(req, &count, sizeof(count)); +} +SYSCTL_PROC(_kern, OID_AUTO, pmap_wcrt_on_non_dram_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_pmap_wcrt_on_non_dram_count, "I", "pmap WC/RT mapping request on non-DRAM count"); +#endif /* __arm64__ && (DEBUG || DEVELOPMENT) */ + TUNABLE_DT(int, gpu_pmem_selector, "defaults", "kern.gpu_pmem_selector", "gpu-pmem-selector", 0, TUNABLE_DT_NONE); #if CONFIG_EXCLAVES @@ -6299,3 +6681,175 @@ SYSCTL_PROC(_kern, OID_AUTO, exclaves_inspection_status, extern uint32_t disable_vm_sanitize_telemetry; SYSCTL_UINT(_debug, OID_AUTO, disable_vm_sanitize_telemetry, CTLFLAG_RW | CTLFLAG_LOCKED /*| CTLFLAG_MASKED*/, &disable_vm_sanitize_telemetry, 0, "disable VM API sanitization telemetry"); #endif + +#define kReadUserspaceRebootInfoEntitlement "com.apple.private.kernel.userspacereboot-info-read-only" +static int +_sysctl_userspacereboot_info(struct sysctl_req *req, void *ptr, size_t ptr_size) +{ + if (req->newptr != 0) { + /* initproc is the only process that can write to these sysctls */ + if (proc_getpid(req->p) != 1) { + return EPERM; + } + return SYSCTL_IN(req, ptr, ptr_size); + } else { + /* A read entitlement is required to read these sysctls */ + if (!IOCurrentTaskHasEntitlement(kReadUserspaceRebootInfoEntitlement)) { + return EPERM; + } + return SYSCTL_OUT(req, ptr, ptr_size); + } +} + +static int +sysctl_userspacereboottime(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + return _sysctl_userspacereboot_info(req, &userspacereboottime, sizeof(userspacereboottime)); +} + +static int +sysctl_userspacerebootpurpose(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + return _sysctl_userspacereboot_info(req, &userspacerebootpurpose, sizeof(userspacerebootpurpose)); +} + +SYSCTL_PROC(_kern, OID_AUTO, userspacereboottime, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_userspacereboottime, "Q", ""); +SYSCTL_PROC(_kern, OID_AUTO, userspacerebootpurpose, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_userspacerebootpurpose, "I", ""); + +#if XNU_TARGET_OS_IOS + +static LCK_GRP_DECLARE(erm_config_lock_grp, "ERM sysctl"); +static LCK_RW_DECLARE(erm_config_lock, &erm_config_lock_grp); +#define ERM_CONFIG_SYSCTL_WRITE_ENTITLEMENT "com.apple.private.security-research-device.extended-research-mode" +#define ERM_CONFIG_SYSCTL_MAX_SIZE PAGE_SIZE + +// This sysctl handler is only registered when Extended Research Mode (ERM) is active. 
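The kern.userspacereboottime and kern.userspacerebootpurpose sysctls above are writable only by PID 1 and readable only by tasks holding com.apple.private.kernel.userspacereboot-info-read-only. A minimal sketch of an entitled reader, not part of the imported sources, using the standard sysctlbyname(3) interface:

#include <stdint.h>
#include <sys/sysctl.h>

/* Reads the userspace-reboot bookkeeping published by launchd (PID 1).
 * Both calls fail with EPERM unless the caller has the read entitlement. */
static int
read_userspace_reboot_info(uint64_t *when, unsigned int *purpose)
{
	size_t len = sizeof(*when);

	if (sysctlbyname("kern.userspacereboottime", when, &len, NULL, 0) != 0) {
		return -1;
	}
	len = sizeof(*purpose);
	if (sysctlbyname("kern.userspacerebootpurpose", purpose, &len, NULL, 0) != 0) {
		return -1;
	}
	return 0;
}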
+static int +sysctl_user_extended_research_mode_config_handler(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + // Pointer for the dynamically allocated buffer + static void *extended_research_mode_config_data = NULL; + + // Current size of the valid data stored in the buffer + static size_t extended_research_mode_config_current_size = 0; + + // Handle Read request (user wants to read the current config, before it is overwritten) + if (req->oldptr != USER_ADDR_NULL) { + int error = 0; + + lck_rw_lock_shared(&erm_config_lock); + + if (req->oldlen < extended_research_mode_config_current_size) { + error = ENOMEM; + } else { + if (extended_research_mode_config_current_size > 0) { + error = copyout(extended_research_mode_config_data, + req->oldptr, + extended_research_mode_config_current_size); + } + } + // In all cases, report the total size of the currently stored config back to the user, + req->oldlen = extended_research_mode_config_current_size; + req->oldidx = req->oldlen; + + lck_rw_unlock_shared(&erm_config_lock); + + if (error != 0) { + return error; + } + } else { + // User just want to know the current buffer size. + // All accesses to extended_research_mode_config* variables are expected + // to be done under erm_config_lock. + lck_rw_lock_shared(&erm_config_lock); + req->oldidx = extended_research_mode_config_current_size; + lck_rw_unlock_shared(&erm_config_lock); + } + + + // Handle Write request (new data provided by user) + if (req->newptr != USER_ADDR_NULL) { + if (!IOTaskHasEntitlement(proc_task(req->p), ERM_CONFIG_SYSCTL_WRITE_ENTITLEMENT)) { + return EPERM; + } + + size_t requested_len = req->newlen; + + if (requested_len > ERM_CONFIG_SYSCTL_MAX_SIZE) { + // We ensure the config provided by user-space is not too big + return EINVAL; + } + + // Allocate a new buffer for the incoming data + void *new_buffer = (void *)kalloc_data(requested_len, Z_WAITOK | Z_ZERO); + + if (new_buffer == NULL) { + return ENOMEM; // Allocation failed + } + + // Copy data from user space into the newly allocated buffer + int error = copyin(req->newptr, new_buffer, requested_len); + + if (error == 0) { + // Success: Replace the old buffer with the new one + lck_rw_lock_exclusive(&erm_config_lock); + + // Backup old buffer info for freeing it in a second step + void *old_buffer_to_free = extended_research_mode_config_data; + size_t old_buffer_size = extended_research_mode_config_current_size; + + // Point to the new buffer and update size + extended_research_mode_config_data = new_buffer; + extended_research_mode_config_current_size = requested_len; + lck_rw_unlock_exclusive(&erm_config_lock); + new_buffer = NULL; // transferred to the static pointer + + // Previous buffer is not referenced anymore, good to be deleted. + kfree_data(old_buffer_to_free, old_buffer_size); + } else { + // Copyin failed, free the buffer we just allocated and keep the old data and size intact + kfree_data(new_buffer, requested_len); + return error; + } + } + + return 0; +} + +// We don't register this sysctl handler automatically , but rather only register it only if the extended +// research mode is active. 
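A minimal sketch of the read side of the user.extended_research_mode_config node, not part of the imported sources: it assumes the node has been registered (Extended Research Mode active) and uses the usual two-step sysctlbyname(3) size query followed by the data read. Writing the blob additionally requires the com.apple.private.security-research-device.extended-research-mode entitlement and is capped at ERM_CONFIG_SYSCTL_MAX_SIZE.

#include <stdlib.h>
#include <sys/sysctl.h>

/* Returns a malloc()ed copy of the current ERM configuration blob, or NULL
 * if the node is not registered (ERM inactive), empty, or the read fails.
 * The stored size can change between the two calls; a grown blob makes the
 * second call fail and this sketch simply gives up rather than retrying. */
static void *
read_erm_config(size_t *out_len)
{
	size_t len = 0;

	*out_len = 0;
	if (sysctlbyname("user.extended_research_mode_config", NULL, &len, NULL, 0) != 0 ||
	    len == 0) {
		return NULL;
	}

	void *buf = malloc(len);
	if (buf == NULL ||
	    sysctlbyname("user.extended_research_mode_config", buf, &len, NULL, 0) != 0) {
		free(buf);
		return NULL;
	}
	*out_len = len;
	return buf;
}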
+SYSCTL_PROC(_user, // Parent node structure (_user) + OID_AUTO, // Automatically assign OID + extended_research_mode_config, // Name of the node + CTLFLAG_NOAUTO | // We will register this sysctl on our own + CTLTYPE_OPAQUE | // Type: Opaque binary data + CTLFLAG_WR | // Allow both read and write + CTLFLAG_ANYBODY | // No user filtering + CTLFLAG_LOCKED, // The handler manages its own locking. + NULL, // arg1 (not used) + 0, // arg2 (not used) + &sysctl_user_extended_research_mode_config_handler, + "-", // don't print the content (as it is a blob) + "Configuration blob for Extended Research Mode"); + +// This function is defined in kern_codesigning.c, but it isn't worth including the whole .h just for it. +bool extended_research_mode_state(void); + +// Only register the research_mode_config sysctl if Extended Research Mode is active +__startup_func +static void +extended_research_mode_config_sysctl_startup(void) +{ + if (__improbable(extended_research_mode_state())) { + // Register the sysctl handler + sysctl_register_oid_early(&sysctl__user_extended_research_mode_config); + } +} +STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, extended_research_mode_config_sysctl_startup); +#endif /* XNU_TARGET_OS_IOS */ + +#if DEBUG || DEVELOPMENT +SCALABLE_COUNTER_DEFINE(mach_eventlink_handoff_success_count); +SYSCTL_SCALABLE_COUNTER(_kern, mach_eventlink_handoff_success_count, + mach_eventlink_handoff_success_count, "Number of successful handoffs"); +#endif /* DEBUG || DEVELOPMENT*/ diff --git a/bsd/kern/kern_time.c b/bsd/kern/kern_time.c index 10059b3fd..1dc7e152c 100644 --- a/bsd/kern/kern_time.c +++ b/bsd/kern/kern_time.c @@ -85,6 +85,7 @@ #include #include #include +#include #if CONFIG_MACF #include #endif diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c index e807e6f69..0e09124f3 100644 --- a/bsd/kern/kpi_mbuf.c +++ b/bsd/kern/kpi_mbuf.c @@ -29,6 +29,7 @@ #define __KPI__ #include +#include #include #include #include @@ -78,12 +79,35 @@ SYSCTL_QUAD(_kern_ipc_mbtxcf, OID_AUTO, aborted, CTLFLAG_RD | CTLFLAG_LOCKED, &mbuf_tx_compl_aborted, ""); #endif /* (DEBUG || DEVELOPMENT) */ -void * +void * __unsafe_indexable mbuf_data(mbuf_t mbuf) { return m_mtod_current(mbuf); } +errno_t +mbuf_data_len(mbuf_t mbuf, void *__sized_by(*out_len) *out_buf, size_t *out_len) +{ + size_t len; + void *buf; + + if (out_len == NULL || out_buf == NULL) { + return EINVAL; + } + + len = mbuf_len(mbuf); + buf = m_mtod_current(mbuf); + + if (len == 0 || buf == NULL) { + return ENOENT; + } + + *out_len = len; + *out_buf = buf; + + return 0; +} + void * mbuf_datastart(mbuf_t mbuf) { @@ -249,11 +273,6 @@ mbuf_alloccluster(mbuf_how_t how, size_t *size, char * __sized_by_or_null(*size) caddr_t _addr = NULL; size_t _size = *size; - /* Jumbo cluster pool not available?
*/ - if (_size > MBIGCLBYTES && njcl == 0) { - return ENOTSUP; - } - if (_size <= MCLBYTES && (_addr = m_mclalloc(how)) != NULL) { _size = MCLBYTES; } else if (_size > MCLBYTES && _size <= MBIGCLBYTES && @@ -288,10 +307,8 @@ mbuf_freecluster(caddr_t addr, size_t size) m_mclfree(addr); } else if (size == MBIGCLBYTES) { m_bigfree(addr, MBIGCLBYTES, NULL); - } else if (njcl > 0) { - m_16kfree(addr, M16KCLBYTES, NULL); } else { - panic("%s: freeing jumbo cluster to an empty pool", __func__); + m_16kfree(addr, M16KCLBYTES, NULL); } } @@ -321,13 +338,7 @@ mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t *mbuf) } else if (size == MBIGCLBYTES) { *mbuf = m_mbigget(*mbuf, how); } else if (size == M16KCLBYTES) { - if (njcl > 0) { - *mbuf = m_m16kget(*mbuf, how); - } else { - /* Jumbo cluster pool not available? */ - error = ENOTSUP; - goto out; - } + *mbuf = m_m16kget(*mbuf, how); } else { error = EINVAL; goto out; @@ -513,7 +524,7 @@ mbuf_adjustlen(mbuf_t m, int amount) { /* Verify m_len will be valid after adding amount */ if (amount > 0) { - size_t used = (size_t)mbuf_data(m) - (size_t)mbuf_datastart(m) + + size_t used = (size_t)mtod(m, void*) - (size_t)mbuf_datastart(m) + m->m_len; if ((size_t)(amount + used) > mbuf_maxlen(m)) { diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index e93052216..a07d07363 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2020 Apple Inc. All rights reserved. + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,8 +93,63 @@ #include #include +#include +#include + #include "kern_exec_internal.h" +#if APPLEVIRTUALPLATFORM +#define ALLOW_FORCING_ARM64_32 1 +#endif /* APPLEVIRTUALPLATFORM */ + +#if ALLOW_FORCING_ARM64_32 +#if DEVELOPMENT || DEBUG +TUNABLE_DT(uint32_t, force_arm64_32, "/defaults", "force-arm64-32", "force-arm64-32", 0, TUNABLE_DT_NONE); +#else +TUNABLE_DT(uint32_t, force_arm64_32, "/defaults", "force-arm64-32", "force-arm64-32", 0, TUNABLE_DT_NO_BOOTARG); +#endif +#endif /* ALLOW_FORCING_ARM64_32 */ + +#if ALLOW_FORCING_ARM64_32 || DEVELOPMENT || DEBUG +/* + * The binary grading priority for the highest priority override. Each progressive override + * receives a priority 1 less than its neighbor. + */ +#define BINGRADE_OVERRIDE_MAX 200 +#endif /* ALLOW_FORCING_ARM64_32 || DEVELOPMENT || DEBUG */ + +#if DEVELOPMENT || DEBUG +/* + * Maxmum number of overrides that can be passed via the bingrade boot-arg property. + */ +#define MAX_BINGRADE_OVERRIDES 4 +/* + * Max size of one bingrade override + 1 comma + * (technically, sizeof will also include the terminating NUL here, but an overestimation of + * buffer space is fine). + */ +#define BINGRADE_MAXSTRINGLEN sizeof("0x12345678:0x12345678:0x12345678,") + +/* + * Each binary grading override has a cpu type and cpu subtype to match against the values in + * the Mach-o header. + */ +typedef struct bingrade { + uint32_t cputype; + uint32_t cpusubtype; + uint32_t execfeatures; +#define EXECFEATURES_OVERRIDE_WILDCARD (~(uint32_t)0) +} bingrade_t; + +/* The number of binary grading overrides that are active */ +static int num_bingrade_overrides = -1; + +/* + * The bingrade_overrides array is an ordered list of binary grading overrides. The first element in the array + * has the highest priority. When parsing the `bingrade' boot-arg, elements are added to this array in order. 
+ */ +static bingrade_t bingrade_overrides[MAX_BINGRADE_OVERRIDES] = { 0 }; +#endif /* DEVELOPMENT || DEBUG */ /* An empty load_result_t */ @@ -304,6 +359,158 @@ get_macho_vnode( struct image_params *imgp ); +#if DEVELOPMENT || DEBUG +/* + * Parse the bingrade boot-arg, adding cputype/cpusubtype/execfeatures tuples to the global binary grading + * override array. The bingrade boot-arg must be of the form: + * + * NUM := '0x' | '0' | + * OVERRIDESPEC := | ':' | ':' ':' + * BINSPEC_BOOTARG := ',' | + * + * Returns the number of overrides specified in the boot-arg, or 0 if there were no overrides or the + * syntax of the overrides was found to be invalid. + */ +static int +parse_bingrade_override_bootarg(bingrade_t *overrides, int max_overrides, char *overrides_arg_string) +{ + char bingrade_arg[BINGRADE_MAXSTRINGLEN * MAX_BINGRADE_OVERRIDES + 1]; + int cputypespec_count = 0; + + /* Look for the bingrade boot-arg */ + if (overrides_arg_string != NULL || PE_parse_boot_arg_str("bingrade", bingrade_arg, sizeof(bingrade_arg))) { + char *bingrade_str = (overrides_arg_string != NULL) ? overrides_arg_string : &bingrade_arg[0]; + char *cputypespec; + + /* Skip leading whitespace */ + while (*bingrade_str == ' ' || *bingrade_str == '\t') { + bingrade_str++; + } + + if (*bingrade_str == 0) { + /* empty string, so just return 0 */ + return 0; + } + + /* If we found the boot-arg, iterate on each OVERRIDESPEC in the BOOTSPEC_BOOTARG */ + while ((cputypespec_count < max_overrides) && ((cputypespec = strsep(&bingrade_str, ",")) != NULL)) { + char *colon = strchr(cputypespec, ':'); + char *end; + char *cputypeptr; + char cputypestr[16] = { 0 }; + unsigned long cputype, cpusubtype, execfeatures; + + /* If there's a colon present, process the cpu subtype and possibly the execfeatures */ + if (colon != NULL) { + colon++; /* Move past the colon before parsing */ + + char execfeat_buf[16] = { 0 }; /* This *MUST* be preinitialized to zeroes */ + char *second_colon = strchr(colon, ':'); + ptrdiff_t amt_to_copy = 0; + + if (second_colon != NULL) { + strlcpy(execfeat_buf, second_colon + 1, MIN(strlen(second_colon + 1) + 1, sizeof(execfeat_buf))); + + execfeatures = strtoul(execfeat_buf, &end, 0); + if (execfeat_buf == end || execfeatures > UINT_MAX) { + printf("Invalid bingrade boot-arg (`%s').\n", cputypespec); + return 0; + } + + overrides[cputypespec_count].execfeatures = (uint32_t)execfeatures; + + /* + * Note there is no "+ 1" here because we are only copying up to but not + * including the second colon. Since cputypestr was initialized to all 0s + * above, the terminating NUL will already be there. + */ + amt_to_copy = second_colon - colon; + } else { + /* No second colon, so use the wildcard for execfeatures */ + overrides[cputypespec_count].execfeatures = EXECFEATURES_OVERRIDE_WILDCARD; + /* + * There is no "+ 1" here because colon was already moved forward by 1 (above). + * which allows this computation to include the terminating NUL in the length + * computed. 
+ */ + amt_to_copy = colon - cputypespec; + } + + /* Now determine the cpu subtype */ + cpusubtype = strtoul(colon, &end, 0); + if (colon == end || cpusubtype > UINT_MAX) { + printf("Invalid bingrade boot-arg (`%s').\n", cputypespec); + return 0; + } + overrides[cputypespec_count].cpusubtype = (uint32_t)cpusubtype; + + /* Copy the cputype string into a temp buffer */ + strlcpy(cputypestr, cputypespec, MIN(sizeof(cputypestr), amt_to_copy)); + + cputypeptr = &cputypestr[0]; + } else { + /* + * No colon present, set the cpu subtype to 0, the execfeatures to EXECFEATURES_OVERRIDE_WILDCARD + * and use the whole string as the cpu type + */ + overrides[cputypespec_count].cpusubtype = 0; + overrides[cputypespec_count].execfeatures = EXECFEATURES_OVERRIDE_WILDCARD; + cputypeptr = cputypespec; + } + + cputype = strtoul(cputypeptr, &end, 0); + if (cputypeptr == end || cputype > UINT_MAX) { + printf("Invalid bingrade boot-arg (`%s').\n", cputypespec); + return 0; + } + overrides[cputypespec_count].cputype = (uint32_t)cputype; + + cputypespec_count++; + } + } else { + /* No bingrade boot-arg; return 0 overrides */ + return 0; + } + + return cputypespec_count; +} + +size_t +bingrade_get_override_string(char *existing_overrides, size_t existing_overrides_bufsize) +{ + if (num_bingrade_overrides <= 0) { + return 0; /* No overrides set */ + } + + /* Init the empty string for strlcat */ + existing_overrides[0] = 0; + + for (int i = 0; i < num_bingrade_overrides; i++) { + char next_override[33]; /* 10char + ':' + 10char + ([future] ':' + 10char) */ + snprintf(next_override, sizeof(next_override), "0x%x:0x%x", bingrade_overrides[i].cputype, bingrade_overrides[i].cpusubtype); + if (i > 0) { + strlcat(existing_overrides, ",", existing_overrides_bufsize); + } + strlcat(existing_overrides, next_override, existing_overrides_bufsize); + } + + return strlen(existing_overrides); +} + +int +binary_grade_overrides_update(char *overrides_arg) +{ +#if ALLOW_FORCING_ARM64_32 + if (force_arm64_32) { + /* If forcing arm64_32, don't allow bingrade override. */ + return 0; + } +#endif /* ALLOW_FORCING_ARM64_32 */ + num_bingrade_overrides = parse_bingrade_override_bootarg(bingrade_overrides, MAX_BINGRADE_OVERRIDES, overrides_arg); + return num_bingrade_overrides; +} +#endif /* DEVELOPMENT || DEBUG */ + static inline void widen_segment_command(const struct segment_command *scp32, struct segment_command_64 *scp) @@ -420,6 +627,7 @@ process_is_plugin_host(struct image_params *imgp, load_result_t *result) "com.apple.bash", /* Required for the 'enable' command */ "com.apple.zsh", /* Required for the 'zmodload' command */ "com.apple.ksh", /* Required for 'builtin' command */ + "com.apple.sh", /* rdar://138353488: sh re-execs into zsh or bash, which are exempted */ }; for (size_t i = 0; i < ARRAY_COUNT(hardening_exceptions); i++) { if (strncmp(hardening_exceptions[i], identity, strlen(hardening_exceptions[i])) == 0) { @@ -434,6 +642,43 @@ process_is_plugin_host(struct image_params *imgp, load_result_t *result) } #endif /* XNU_TARGET_OS_OSX */ +static int +grade_binary_override(cpu_type_t __unused exectype, cpu_subtype_t __unused execsubtype, cpu_subtype_t execfeatures __unused, + bool allow_simulator_binary __unused) +{ +#if ALLOW_FORCING_ARM64_32 + if (force_arm64_32) { + /* Forcing ARM64_32 takes precedence over 'bingrade' boot-arg. */ + if (exectype == CPU_TYPE_ARM64_32 && execsubtype == CPU_SUBTYPE_ARM64_32_V8) { + return BINGRADE_OVERRIDE_MAX; + } else { + /* Stop trying to match. 
*/ + return 0; + } + } +#endif /* ALLOW_FORCING_ARM64_32 */ + +#if DEVELOPMENT || DEBUG + if (num_bingrade_overrides == -1) { + num_bingrade_overrides = parse_bingrade_override_bootarg(bingrade_overrides, MAX_BINGRADE_OVERRIDES, NULL); + } + + if (num_bingrade_overrides == 0) { + return -1; + } + + for (int i = 0; i < num_bingrade_overrides; i++) { + if (bingrade_overrides[i].cputype == exectype && bingrade_overrides[i].cpusubtype == execsubtype && + (bingrade_overrides[i].execfeatures == EXECFEATURES_OVERRIDE_WILDCARD || + bingrade_overrides[i].execfeatures == execfeatures)) { + return BINGRADE_OVERRIDE_MAX - i; + } + } +#endif /* DEVELOPMENT || DEBUG */ + /* exectype/execsubtype Not found in override list */ + return -1; +} + load_return_t load_machfile( struct image_params *imgp, @@ -580,6 +825,22 @@ load_machfile( return lret; } + /* + * From now on it's safe to query entitlements via the vnode interface. Let's get figuring + * out whether we're a security relevant binary out of the way immediately. + */ + switch (exec_check_security_entitlement(imgp, HARDENED_PROCESS)) { + case EXEC_SECURITY_INVALID_CONFIG: + imgp->ip_free_map = map; + return LOAD_BADMACHO; + case EXEC_SECURITY_ENTITLED: + result->is_hardened_process = true; + break; + case EXEC_SECURITY_NOT_ENTITLED: + result->is_hardened_process = false; + break; + } + #if __x86_64__ /* * On x86, for compatibility, don't enforce the hard page-zero restriction for 32-bit binaries. @@ -750,6 +1011,27 @@ pie_required( return FALSE; } +/* + * Grades the specified CPU type, CPU subtype, CPU features to determine an absolute weight, used in the determination + * of running the associated binary on this machine. + * + * If an override boot-arg is specified, the boot-arg is parsed and its values are stored for later use in overriding + * the system's hard-coded binary grading values. + */ +int +grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype, cpu_subtype_t execfeatures, bool allow_simulator_binary) +{ + extern int ml_grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool); + + int binary_grade; + + if ((binary_grade = grade_binary_override(exectype, execsubtype, execfeatures, allow_simulator_binary)) < 0) { + return ml_grade_binary(exectype, execsubtype, execfeatures, allow_simulator_binary); + } + + return binary_grade; +} + /* * The file size of a mach-o file is limited to 32 bits; this is because * this is the limit on the kalloc() of enough bytes for a mach_header and diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index e4fe3bf73..ba1f1ba4a 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -47,13 +47,13 @@ typedef int load_return_t; /* libmalloc relies on these values not changing. 
If they change, * you need to update the values in that project as well */ -__options_decl(HR_flags_t, uint32_t, { +__options_decl(hardened_browser_flags_t, uint32_t, { BrowserHostEntitlementMask = 0x01, BrowserGPUEntitlementMask = 0x02, BrowserNetworkEntitlementMask = 0x04, BrowserWebContentEntitlementMask = 0x08, }); - #define HR_FLAGS_NUM_NIBBLES (sizeof(HR_flags_t) / 2) + #define HR_FLAGS_NUM_NIBBLES (sizeof(hardened_browser_flags_t) / 2) /* * Structure describing the result from calling load_machfile(), if that @@ -88,7 +88,9 @@ typedef struct _load_result { is_64bit_addr : 1, is_64bit_data : 1, custom_stack : 1, - is_rosetta : 1; + is_rosetta : 1, + hardened_heap : 1, + is_hardened_process : 1; unsigned int csflags; unsigned char uuid[16]; mach_vm_address_t min_vm_addr; @@ -97,8 +99,9 @@ typedef struct _load_result { mach_vm_address_t ro_vm_end; unsigned int platform_binary; - /* Flags denoting which type of hardened runtime binary this is*/ - HR_flags_t hardened_runtime_binary; + /* Flags denoting which type of platform restrictions binary this is */ + hardened_browser_flags_t hardened_browser; + off_t cs_end_offset; void *threadstate; size_t threadstate_sz; diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c index 1021103f6..32f3a68c4 100644 --- a/bsd/kern/mach_process.c +++ b/bsd/kern/mach_process.c @@ -181,8 +181,8 @@ retry_trace_me: ; * when, in this case, it is the current process's parent. * Most of the other checks in cantrace() don't apply either. */ - struct proc_ident p_ident = proc_ident(p); - struct proc_ident pproc_ident = proc_ident(pproc); + struct proc_ident p_ident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT); + struct proc_ident pproc_ident = proc_ident_with_policy(pproc, IDENT_VALIDATION_PROC_EXACT); kauth_cred_t pproc_cred = kauth_cred_proc_ref(pproc); /* Release pproc and find it again after MAC call to avoid deadlock */ @@ -253,7 +253,7 @@ retry_proc_find: AUDIT_ARG(process, t); task = proc_task(t); - tident = proc_ident(t); + tident = proc_ident_with_policy(t, IDENT_VALIDATION_PROC_EXACT); if (uap->req == PT_ATTACHEXC) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" @@ -571,8 +571,8 @@ cantrace(proc_t cur_procp, kauth_cred_t creds, proc_t traced_procp, int *errp) } #if CONFIG_MACF - struct proc_ident cur_ident = proc_ident(cur_procp); - struct proc_ident traced_ident = proc_ident(traced_procp); + struct proc_ident cur_ident = proc_ident_with_policy(cur_procp, IDENT_VALIDATION_PROC_EXACT); + struct proc_ident traced_ident = proc_ident_with_policy(traced_procp, IDENT_VALIDATION_PROC_EXACT); kauth_cred_t cur_cred = kauth_cred_proc_ref(cur_procp); /* diff --git a/bsd/kern/mcache.c b/bsd/kern/mcache.c index ed98cb0ab..a1e880648 100644 --- a/bsd/kern/mcache.c +++ b/bsd/kern/mcache.c @@ -200,7 +200,7 @@ mcache_init(void) } mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE, - ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE); + ZC_DESTRUCTIBLE); LIST_INIT(&mcache_head); @@ -354,7 +354,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, chunksize += sizeof(uint64_t) + align; chunksize = P2ROUNDUP(chunksize, align); cp->mc_slab_zone = zone_create(cp->mc_name, chunksize, - ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE); + ZC_DESTRUCTIBLE); } cp->mc_chunksize = chunksize; diff --git a/bsd/kern/mem_acct.c b/bsd/kern/mem_acct.c new file mode 100644 index 000000000..29ce1d4e0 --- /dev/null +++ b/bsd/kern/mem_acct.c @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. 
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +struct mem_acct { + int64_t _Atomic ma_allocated; /* Amount of memory accounted towards this subsystem (ignore temporary per-CPU accounting from below) */ + int32_t *__zpercpu ma_percpu; /* Per-CPU "bounce-buffer" of accounting that will be folded in to `ma_allocated` */ + uint64_t ma_hardlimit; /* hard limit that will not be exceeded */ + uint8_t ma_percent; /* Percent of hard-limit we should start soft-limiting (if != 100 && != 0) */ + uint64_t _Atomic ma_peak; + char ma_name[MEM_ACCT_NAME_LENGTH]; /* Name of the subsystem using this instance of memory-accounting module */ +}; + +#define MEM_ACCT_PCPU_MAX 1024 * 1024 /* Update global var after 1MB in the per-cpu var */ + +static struct mem_acct *memacct[MEM_ACCT_MAX]; + +static uint64_t +mem_acct_softlimit(uint64_t hardlimit, uint8_t percent) +{ + return (hardlimit * percent) / 100; +} + +static uint64_t +mem_acct_presoftlimit(uint64_t hardlimit, uint8_t percent) +{ + return (mem_acct_softlimit(hardlimit, percent) * percent) / 100; +} + +int +mem_acct_limited(const struct mem_acct *macct) +{ + uint64_t hardlimit; + int64_t allocated; + uint8_t percent; + + allocated = os_atomic_load(&macct->ma_allocated, relaxed); + if (allocated < 0) { + return 0; + } + + hardlimit = os_access_once(macct->ma_hardlimit); + if (hardlimit && allocated > hardlimit) { + return MEMACCT_HARDLIMIT; + } + + percent = os_access_once(macct->ma_percent); + if (percent) { + if (allocated > mem_acct_softlimit(hardlimit, percent)) { + return MEMACCT_SOFTLIMIT; + } + + if (allocated > mem_acct_presoftlimit(hardlimit, percent)) { + return MEMACCT_PRESOFTLIMIT; + } + } + + return 0; +} + +void +_mem_acct_add(struct mem_acct *macct, int size) +{ + int *pcpu; + + /* + * Yes, the accounting is not 100% accurate with the per-cpu + * "bounce-buffer" storing intermediate results. For example, we may + * report "hard-limit" even though all the per-cpu counters may bring us + * below the limit. But honestly, we don't care... If we hit hard-limit + * the system is gonna be in a bad state anyways until we have given + * away enough memory. + * + * The same counts for softlimit, but softlimit still allows us to + * account memory and just makes us a bit more aggressive at freeing + * stuff. 
+ */ + + /* Now, add the size to the per-cpu variable */ + disable_preemption(); + pcpu = zpercpu_get(macct->ma_percpu); + *pcpu += size; + + /* If we added enough to the pcpu variable, fold it into the global variable */ + if (*pcpu > MEM_ACCT_PCPU_MAX || *pcpu < -MEM_ACCT_PCPU_MAX) { + int limited, newlimited; + int64_t allocated; + + limited = mem_acct_limited(macct); + + allocated = os_atomic_add(&macct->ma_allocated, *pcpu, relaxed); + + /* + * Can be temporarily < 0 if the CPU freeing memory hits + * MEM_ACCT_PCPU_MAX first. + */ + if (allocated > 0) { + os_atomic_max(&macct->ma_peak, allocated, relaxed); + } + + newlimited = mem_acct_limited(macct); + if (limited != newlimited) { + os_log(OS_LOG_DEFAULT, + "memacct: %s goes from %u to %u for its limit", + macct->ma_name, limited, newlimited); + } + + *pcpu = 0; + } + enable_preemption(); +} + +static LCK_GRP_DECLARE(mem_acct_mtx_grp, "mem_acct"); +static LCK_MTX_DECLARE(mem_acct_mtx, &mem_acct_mtx_grp); + +struct mem_acct * +mem_acct_register(const char *__null_terminated name, + uint64_t hardlimit, uint8_t percent) +{ + struct mem_acct *acct = NULL; + int i, index = -1; + + if (percent > 100) { + os_log(OS_LOG_DEFAULT, + "memacct: percentage for softlimit is out-of-bounds\n"); + return NULL; + } + + lck_mtx_lock(&mem_acct_mtx); + + /* Find an empty slot in the accounting array and check for name uniqueness */ + for (i = 0; i < MEM_ACCT_MAX; i++) { + if (memacct[i] == NULL) { + if (index == -1) { + index = i; + } + + continue; + } + + if (strlcmp(memacct[i]->ma_name, name, MEM_ACCT_NAME_LENGTH - 1) == 0) { + os_log(OS_LOG_DEFAULT, + "memacct: subsystem %s already exists", name); + goto exit; + } + } + + if (index == -1) { + os_log(OS_LOG_DEFAULT, "memacct: No space for additional subsystem"); + goto exit; + } + + memacct[index] = kalloc_type(struct mem_acct, Z_WAITOK_ZERO_NOFAIL); + + acct = memacct[index]; + + strlcpy(acct->ma_name, name, MEM_ACCT_NAME_LENGTH); + acct->ma_hardlimit = hardlimit; + if (percent >= 100) { + os_log(OS_LOG_DEFAULT, + "memacct: percent is > 100"); + + memacct[index] = NULL; + kfree_type(struct mem_acct, acct); + acct = NULL; + + goto exit; + } + acct->ma_percent = percent; + acct->ma_percpu = zalloc_percpu_permanent_type(int32_t); + +exit: + lck_mtx_unlock(&mem_acct_mtx); + + return acct; +} + +/* + * Memory Accounting sysctl handlers + */ + +struct walkarg { + int w_op, w_sub; + struct sysctl_req *w_req; +}; + +/* sysctls on a per-subsystem basis */ +static int sysctl_subsystem_peak(struct walkarg *w); +static int sysctl_subsystem_soft_limit(struct walkarg *w); +static int sysctl_subsystem_hard_limit(struct walkarg *w); +static int sysctl_subsystem_allocated(struct walkarg *w); +static int sysctl_all_subsystem_statistics(struct walkarg *w); + +/* sysctls for all active subsystems */ +static int sysctl_all_statistics(struct sysctl_req *); +static int sysctl_mem_acct_subsystems(struct sysctl_req *); + +/* Handler function for all Memory Accounting sysctls */ +static int sysctl_mem_acct SYSCTL_HANDLER_ARGS; + +/* Helper functions */ +static void memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a); + +SYSCTL_NODE(_kern, OID_AUTO, memacct, + CTLFLAG_RW | CTLFLAG_LOCKED, sysctl_mem_acct, "Memory Accounting"); + +static int +sysctl_mem_acct SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + DECLARE_SYSCTL_HANDLER_ARG_ARRAY(int, 2, name, namelen); + int error = EINVAL; + struct walkarg w; + + /* Verify the specified subsystem index is valid */ + if (name[1] >= MEM_ACCT_MAX || name[1] < 0) { + return 
EINVAL; + } + + bzero(&w, sizeof(w)); + w.w_req = req; + w.w_op = name[0]; + w.w_sub = name[1]; + + switch (w.w_op) { + case MEM_ACCT_PEAK: + error = sysctl_subsystem_peak(&w); + break; + case MEM_ACCT_SOFT_LIMIT: + error = sysctl_subsystem_soft_limit(&w); + break; + case MEM_ACCT_HARD_LIMIT: + error = sysctl_subsystem_hard_limit(&w); + break; + case MEM_ACCT_ALLOCATED: + error = sysctl_subsystem_allocated(&w); + break; + case MEM_ACCT_SUBSYSTEMS: + error = sysctl_mem_acct_subsystems(req); + break; + case MEM_ACCT_ALL_SUBSYSTEM_STATISTICS: + error = sysctl_all_subsystem_statistics(&w); + break; + case MEM_ACCT_ALL_STATISTICS: + error = sysctl_all_statistics(req); + break; + } + + return error; +} + +static int +sysctl_subsystem_peak(struct walkarg *w) +{ + int error; + uint64_t value; + int changed = 0; + struct mem_acct *acct = memacct[w->w_sub]; + + if (acct == NULL) { + return ENOENT; + } + + value = os_atomic_load(&acct->ma_peak, relaxed); + error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed); + if (error || !changed) { + return error; + } + + os_atomic_store(&acct->ma_peak, value, relaxed); + return 0; +} + +static int +sysctl_subsystem_soft_limit(struct walkarg *w) +{ + int error; + uint64_t hardlimit, value; + int changed = 0; + struct mem_acct *acct = memacct[w->w_sub]; + + if (acct == NULL) { + return ENOENT; + } + + hardlimit = os_atomic_load(&acct->ma_hardlimit, relaxed); + if (acct->ma_percent) { + value = mem_acct_softlimit(hardlimit, acct->ma_percent); + } else { + value = hardlimit; + } + error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed); + if (error || !changed) { + return error; + } + + return EPERM; +} + +static int +sysctl_subsystem_hard_limit(struct walkarg *w) +{ + int error; + uint64_t value; + int changed = 0; + struct mem_acct *acct = memacct[w->w_sub]; + + if (acct == NULL) { + return ENOENT; + } + + value = os_atomic_load(&acct->ma_hardlimit, relaxed); + error = sysctl_io_number(w->w_req, value, sizeof(value), &value, &changed); + if (error || !changed) { + return error; + } + + acct->ma_hardlimit = value; + return 0; +} + +static int +sysctl_subsystem_allocated(struct walkarg *w) +{ + int64_t value; + struct mem_acct *acct = memacct[w->w_sub]; + + lck_mtx_lock(&mem_acct_mtx); + + if (acct == NULL) { + return ENOENT; + } + + value = os_atomic_load(&acct->ma_allocated, relaxed); + zpercpu_foreach(v, acct->ma_percpu) { + value += *v; + } + + lck_mtx_unlock(&mem_acct_mtx); + + return sysctl_io_number(w->w_req, value, sizeof(value), NULL, NULL); +} + +static int +sysctl_all_subsystem_statistics(struct walkarg *w) +{ + /* Returns a single memacct_statistics struct for the specified subsystem */ + struct memacct_statistics stats = {}; + struct mem_acct *acct = memacct[w->w_sub]; + + lck_mtx_lock(&mem_acct_mtx); + + if (acct == NULL) { + return ENOENT; + } + + memacct_copy_stats(&stats, acct); + + lck_mtx_unlock(&mem_acct_mtx); + + return sysctl_io_opaque(w->w_req, &stats, sizeof(stats), NULL); +} + +static int +sysctl_all_statistics(struct sysctl_req *req) +{ + /* Returns an array of memacct_statistics structs for all active subsystems */ + int i, error; + int count = 0; + + lck_mtx_lock(&mem_acct_mtx); + + for (i = 0; i < MEM_ACCT_MAX; i++) { + if (memacct[i] == NULL) { + break; + } + count++; + } + + struct memacct_statistics *memstats = kalloc_data(sizeof(struct memacct_statistics) * count, Z_WAITOK_ZERO_NOFAIL); + + for (i = 0; i < count; i++) { + struct mem_acct *acct; + struct memacct_statistics *stats; + + acct = 
memacct[i]; + stats = &memstats[i]; + + memacct_copy_stats(stats, acct); + } + + lck_mtx_unlock(&mem_acct_mtx); + + error = sysctl_io_opaque(req, memstats, sizeof(struct memacct_statistics) * count, NULL); + if (error) { + kfree_data(memstats, sizeof(struct memacct_statistics) * count); + return error; + } + + kfree_data(memstats, sizeof(struct memacct_statistics) * count); + return 0; +} + +static int +sysctl_mem_acct_subsystems(struct sysctl_req *req) +{ + /* Returns an array names for all active subsystems */ + int i, j, error; + int count = 0; + int totalCharCount = 0; + + lck_mtx_lock(&mem_acct_mtx); + + for (i = 0; i < MEM_ACCT_MAX; i++) { + if (memacct[i] == NULL) { + break; + } + count++; + } + + char *names = kalloc_data(count * MEM_ACCT_NAME_LENGTH, Z_WAITOK_ZERO_NOFAIL); + + for (i = 0; i < count; i++) { + struct mem_acct *acct = memacct[i]; + char acct_name[MEM_ACCT_NAME_LENGTH]; + + strbufcpy(acct_name, acct->ma_name); + + for (j = 0; j < MEM_ACCT_NAME_LENGTH; j++) { + names[totalCharCount++] = acct_name[j]; + } + } + + lck_mtx_unlock(&mem_acct_mtx); + + error = sysctl_io_opaque(req, names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH, NULL); + if (error) { + kfree_data(names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH); + return error; + } + + kfree_data(names, sizeof(char) * count * MEM_ACCT_NAME_LENGTH); + return 0; +} + +static void +memacct_copy_stats(struct memacct_statistics *s, struct mem_acct *a) +{ + s->peak = os_atomic_load(&a->ma_peak, relaxed); + s->allocated = os_atomic_load(&a->ma_allocated, relaxed); + zpercpu_foreach(v, a->ma_percpu) { + s->allocated += *v; + } + if (a->ma_percent) { + s->softlimit = mem_acct_softlimit(a->ma_hardlimit, a->ma_percent); + } else { + s->softlimit = a->ma_hardlimit; + } + s->hardlimit = a->ma_hardlimit; + strbufcpy(s->ma_name, a->ma_name); +} diff --git a/bsd/kern/mem_acct.h b/bsd/kern/mem_acct.h new file mode 100644 index 000000000..f06806058 --- /dev/null +++ b/bsd/kern/mem_acct.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_MEM_ACCT_H +#define _KERN_MEM_ACCT_H + +#ifdef XNU_KERNEL_PRIVATE + +#include + +struct mem_acct; +/* + * Add "size" to the memory accounting module of "type". + */ +__private_extern__ void _mem_acct_add(struct mem_acct *macct, int size); +__private_extern__ struct mem_acct *mem_acct_register( + const char *__null_terminated name, uint64_t hardlimit, uint8_t percent); + +/* + * pre-softlimit means we are getting close to the softlimit (about 80% of it). + * The subsystem should start taking preventive actions. 
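As a hedged illustration (editorial, not part of the patch): a kernel subsystem adopting this accounting module registers an accountant once, balances every allocation and free against it, and consults mem_acct_limited() to decide when to start shedding memory. A minimal in-kernel sketch, using only the API declared in this header; the subsystem name and the trim hook are hypothetical.

/* Hypothetical subsystem using mem_acct; "my_subsys" and my_subsys_trim_caches() are illustrative. */
static struct mem_acct *my_subsys_acct;
static void my_subsys_trim_caches(void);  /* hypothetical cache-shedding hook */

static void
my_subsys_acct_init(void)
{
	/* 64MB hard limit, soft-limit threshold at 80% of it. */
	my_subsys_acct = mem_acct_register("my_subsys", 64ULL << 20, 80);
}

static void *
my_subsys_alloc(size_t size)
{
	if (my_subsys_acct != NULL &&
	    mem_acct_limited(my_subsys_acct) >= MEMACCT_SOFTLIMIT) {
		/* At or above the soft limit: shed caches before growing further. */
		my_subsys_trim_caches();
	}
	void *ptr = kalloc_data(size, Z_WAITOK);
	if (ptr != NULL && my_subsys_acct != NULL) {
		mem_acct_add(my_subsys_acct, (unsigned int)size);
	}
	return ptr;
}

static void
my_subsys_free(void *ptr, size_t size)
{
	kfree_data(ptr, size);
	if (my_subsys_acct != NULL) {
		mem_acct_sub(my_subsys_acct, (unsigned int)size);
	}
}

mem_acct_register() returns NULL when the percentage is out of range, the name is already taken, or the table is full, so a caller like the sketch above simply runs unaccounted when registration fails.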
+ */ +#define MEMACCT_PRESOFTLIMIT 1 +/* + * We are at the softlimit. Take actions to reduce memory usage, but don't take + * fully destructive actions yet. + */ +#define MEMACCT_SOFTLIMIT 2 +/* + * We are above the hardlimit. Prevent holding on to memory in this subsystem. + */ +#define MEMACCT_HARDLIMIT 3 + +extern int mem_acct_limited(const struct mem_acct *macct); + +static inline void +mem_acct_add(struct mem_acct *macct, unsigned int size) +{ + _mem_acct_add(macct, size); +} + +static inline void +mem_acct_sub(struct mem_acct *macct, unsigned int size) +{ + _mem_acct_add(macct, -size); +} + + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /*_KERN_MEM_ACCT_H */ diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c index e2d38e8f6..fa789ade2 100644 --- a/bsd/kern/policy_check.c +++ b/bsd/kern/policy_check.c @@ -121,7 +121,7 @@ common_hook(void) return rv; } -#if (MAC_POLICY_OPS_VERSION != 87) +#if (MAC_POLICY_OPS_VERSION != 91) # error "struct mac_policy_ops doesn't match definition in mac_policy.h" #endif /* @@ -134,10 +134,10 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(audit_check_postselect) CHECK_SET_HOOK(audit_check_preselect) - .mpo_reserved01 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved02 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved03 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved04 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(graft_check_graft) + CHECK_SET_HOOK(graft_check_ungraft) + CHECK_SET_HOOK(graft_notify_graft) + CHECK_SET_HOOK(graft_notify_ungraft) CHECK_SET_HOOK(cred_check_label_update_execve) CHECK_SET_HOOK(cred_check_label_update) @@ -221,8 +221,8 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(vnode_notify_unlink) CHECK_SET_HOOK(vnode_check_swap) - .mpo_reserved33 = (mpo_reserved_hook_t *)common_hook, - .mpo_reserved34 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(vnode_check_dataprotect_set) + CHECK_SET_HOOK(mount_check_remount_with_flags) CHECK_SET_HOOK(mount_notify_mount) CHECK_SET_HOOK(vnode_check_copyfile) @@ -319,7 +319,7 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(proc_check_sched) CHECK_SET_HOOK(proc_check_setaudit) CHECK_SET_HOOK(proc_check_setauid) - .mpo_reserved64 = (mpo_reserved_hook_t *)common_hook, + CHECK_SET_HOOK(proc_check_iopolicysys) CHECK_SET_HOOK(proc_check_signal) CHECK_SET_HOOK(proc_check_wait) CHECK_SET_HOOK(proc_check_dump_core) diff --git a/bsd/kern/posix_sem.c b/bsd/kern/posix_sem.c index 9446b1d1c..d7188c9ce 100644 --- a/bsd/kern/posix_sem.c +++ b/bsd/kern/posix_sem.c @@ -65,6 +65,8 @@ #include #include #include +#include +#include #if CONFIG_MACF #include @@ -87,6 +89,7 @@ #define f_ops fp_glob->fg_ops #define PSEMNAMLEN 31 /* maximum name segment length we bother with */ +#define PSEMTEAMIDLEN 31 /* maximum length of team ID we consider */ struct pseminfo { unsigned int psem_flags; @@ -114,8 +117,10 @@ struct pseminfo { struct psemcache { LIST_ENTRY(psemcache) psem_hash; /* hash chain */ struct pseminfo *pseminfo; /* vnode the name refers to */ - size_t psem_nlen; /* length of name */ + size_t psem_nlen; /* length of name */ + size_t psem_teamidlen; /* length of team ID */ char psem_name[PSEMNAMLEN + 1]; /* segment name */ + char psem_teamid[PSEMTEAMIDLEN + 1]; /* team ID of users, if any */ }; #define PSEMCACHE_NULL (struct psemcache *)0 @@ -124,18 +129,27 @@ struct psemcache { #define PSEMCACHE_NEGATIVE (ENOENT) struct psemstats { - long goodhits; /* hits that we can really use */ - long neghits; /* negative hits that 
we can use */ - long badhits; /* hits we must drop */ - long falsehits; /* hits with id mismatch */ - long miss; /* misses */ - long longnames; /* long names that ignore cache */ + long pstats_hits; + long pstats_miss; + long pstats_local_hits; + long pstats_global_hits; + long pstats_local_miss; + long pstats_global_miss; + long pstats_local_collisions; + long pstats_global_collisions; + long pstats_fallback_hits; /* hits that missed local but hit global */ + long pstats_fallback_miss; /* hits that missed both local and global */ + long pstats_neghits; /* hits to 'negative entries' (return ENOENT) */ + long pstats_longnames; /* semaphore or team ID ENAMETOOLONG */ }; struct psemname { - char *psem_nameptr; /* pointer to looked up name */ - size_t psem_namelen; /* length of looked up component */ - u_int32_t psem_hash; /* hash value of looked up name */ + char *psem_nameptr; /* pointer to looked up name */ + size_t psem_namelen; /* length of looked up component */ + uint64_t psem_hash_local; /* hash value of looked up name and team */ + uint64_t psem_hash_global; /* hash value of looked up name, without team */ + const char *psem_teamidptr; + size_t psem_teamidlen; }; struct psemnode { @@ -147,20 +161,45 @@ struct psemnode { }; #define PSEMNODE_NULL (struct psemnode *)0 +LIST_HEAD(psemhashhead, psemcache); + +struct psemhashtable { + struct psemhashhead *psem_table; + + /* Hash table mask, i.e size - 1 */ + u_long psem_table_mask; + + /* SipHash key, randomly assigned at boot */ + uint8_t psem_siphash_key[SIPHASH_KEY_LENGTH]; +}; +#define PSEMHASH(table, hash) (&(table).psem_table[(hash) & (table).psem_table_mask]) + +struct psemhashtable psem_global, psem_local; + +long posix_sem_num; /* number of POSIX semaphores on the system */ +long posix_sem_max = 10000; /* max number of POSIX semaphores on the system */ -#define PSEMHASH(pnp) \ - (&psemhashtbl[(pnp)->psem_hash & psemhash]) -LIST_HEAD(psemhashhead, psemcache) * psemhashtbl; /* Hash Table */ -u_long psemhash; /* size of hash table - 1 */ -long psemnument; /* number of cache entries allocated */ -long posix_sem_max = 10000; /* tunable for max POSIX semaphores */ - /* 10000 limits to ~1M of memory */ SYSCTL_NODE(_kern, KERN_POSIX, posix, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Posix"); SYSCTL_NODE(_kern_posix, OID_AUTO, sem, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Semaphores"); SYSCTL_LONG(_kern_posix_sem, OID_AUTO, max, CTLFLAG_RW | CTLFLAG_LOCKED, &posix_sem_max, "max"); struct psemstats psemstats; /* cache effectiveness statistics */ +#if DEBUG || DEVELOPMENT +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, hits, CTLFLAG_RD, &psemstats.pstats_hits, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, miss, CTLFLAG_RD, &psemstats.pstats_miss, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, local_hits, CTLFLAG_RD, &psemstats.pstats_local_hits, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, local_miss, CTLFLAG_RD, &psemstats.pstats_local_miss, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, global_hits, CTLFLAG_RD, &psemstats.pstats_global_hits, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, global_miss, CTLFLAG_RD, &psemstats.pstats_global_miss, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, fallback_hits, CTLFLAG_RD, &psemstats.pstats_fallback_hits, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, fallback_miss, CTLFLAG_RD, &psemstats.pstats_fallback_miss, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, local_collisions, CTLFLAG_RD, &psemstats.pstats_local_collisions, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, global_collisions, CTLFLAG_RD, &psemstats.pstats_global_collisions, ""); 
+SYSCTL_LONG(_kern_posix_sem, OID_AUTO, neghits, CTLFLAG_RD, &psemstats.pstats_neghits, ""); +SYSCTL_LONG(_kern_posix_sem, OID_AUTO, longnames, CTLFLAG_RD, &psemstats.pstats_longnames, ""); +#endif + static int psem_access(struct pseminfo *pinfo, mode_t mode, kauth_cred_t cred); static int psem_cache_search(struct pseminfo **, struct psemname *, struct psemcache **); @@ -169,6 +208,8 @@ static int psem_delete(struct pseminfo * pinfo); static int psem_closefile(struct fileglob *fp, vfs_context_t ctx); static int psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache); +static const char *psem_get_teamid(proc_t p); + static const struct fileops psemops = { .fo_type = DTYPE_PSXSEM, .fo_read = fo_no_read, @@ -192,6 +233,61 @@ static int psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct p static void psem_cache_delete(struct psemcache *pcp); int psem_cache_purge_all(void); +static struct psemname +psem_cache_hash(char *name, size_t len, const char *teamid, size_t teamidlen) +{ + SIPHASH_CTX ctx; + struct psemname nd; + + nd.psem_nameptr = name; + nd.psem_namelen = len; + nd.psem_teamidptr = teamid; + nd.psem_teamidlen = teamidlen; + nd.psem_hash_local = 0; + nd.psem_hash_global = 0; + + _Static_assert(sizeof(nd.psem_hash_local) == SIPHASH_DIGEST_LENGTH, "hash field is wrong size for SipHash"); + _Static_assert(sizeof(nd.psem_hash_global) == SIPHASH_DIGEST_LENGTH, "hash field is wrong size for SipHash"); + + /* + * This routine is called before taking the subsystem lock, so we'll prepare hashes + * for both global and local tables up front. + */ + SipHash24_Init(&ctx); + SipHash_SetKey(&ctx, psem_global.psem_siphash_key); + SipHash_Update(&ctx, name, len); + SipHash_Final((u_int8_t *)&nd.psem_hash_global, &ctx); + + if (teamidlen > 0) { + SipHash24_Init(&ctx); + SipHash_SetKey(&ctx, psem_local.psem_siphash_key); + SipHash_Update(&ctx, name, len); + SipHash_Update(&ctx, teamid, teamidlen); + SipHash_Final((u_int8_t *)&nd.psem_hash_local, &ctx); + } + + return nd; +} + +/* + * Returns 1 if the semaphore name matches what we're looking for, otherwise 0. + * When searching the local table, the team ID must match too. + */ +static int +psem_cache_is_match(struct psemcache *sem, struct psemname *target, bool local) +{ + bool name_matches = target->psem_namelen == sem->psem_nlen && + !bcmp(target->psem_nameptr, sem->psem_name, target->psem_namelen); + + if (local) { + bool teamid_matches = target->psem_teamidlen == sem->psem_teamidlen && + !bcmp(target->psem_teamidptr, sem->psem_teamid, target->psem_teamidlen); + return name_matches && teamid_matches; + } + + return name_matches; +} + /* * Lookup an entry in the cache * @@ -206,31 +302,66 @@ static int psem_cache_search(struct pseminfo **psemp, struct psemname *pnp, struct psemcache **pcache) { - struct psemcache *pcp, *nnp; + struct psemcache *pcp = NULL, *nnp; struct psemhashhead *pcpp; - if (pnp->psem_namelen > PSEMNAMLEN) { - psemstats.longnames++; + if (pnp->psem_namelen > PSEMNAMLEN || pnp->psem_teamidlen > PSEMTEAMIDLEN) { + os_atomic_inc(&psemstats.pstats_longnames, relaxed); return PSEMCACHE_NOTFOUND; } - pcpp = PSEMHASH(pnp); - for (pcp = pcpp->lh_first; pcp != 0; pcp = nnp) { - nnp = pcp->psem_hash.le_next; - if (pcp->psem_nlen == pnp->psem_namelen && - !bcmp(pcp->psem_name, pnp->psem_nameptr, pcp->psem_nlen)) { - break; + /* If Team ID is present, try to look up in the local table first. 
*/ + if (pnp->psem_teamidlen > 0) { + pcpp = PSEMHASH(psem_local, pnp->psem_hash_local); + + for (pcp = pcpp->lh_first; pcp != 0; pcp = nnp) { + nnp = pcp->psem_hash.le_next; + if (psem_cache_is_match(pcp, pnp, true)) { + break; + } + os_atomic_inc(&psemstats.pstats_local_collisions, relaxed); + } + + if (pcp == 0) { + os_atomic_inc(&psemstats.pstats_local_miss, relaxed); + } else { + os_atomic_inc(&psemstats.pstats_local_hits, relaxed); + } + } + + /* Otherwise, or if the local lookup failed, search the global table. */ + if (pcp == 0) { + pcpp = PSEMHASH(psem_global, pnp->psem_hash_global); + + for (pcp = pcpp->lh_first; pcp != 0; pcp = nnp) { + nnp = pcp->psem_hash.le_next; + if (psem_cache_is_match(pcp, pnp, false)) { + break; + } + os_atomic_inc(&psemstats.pstats_global_collisions, relaxed); + } + + if (pcp == 0) { + os_atomic_inc(&psemstats.pstats_global_miss, relaxed); + if (pnp->psem_teamidlen > 0) { + os_atomic_inc(&psemstats.pstats_fallback_miss, relaxed); + } + } else { + os_atomic_inc(&psemstats.pstats_global_hits, relaxed); + if (pnp->psem_teamidlen > 0) { + os_atomic_inc(&psemstats.pstats_fallback_hits, relaxed); + } } } if (pcp == 0) { - psemstats.miss++; + os_atomic_inc(&psemstats.pstats_miss, relaxed); return PSEMCACHE_NOTFOUND; } /* We found a "positive" match, return the vnode */ if (pcp->pseminfo) { - psemstats.goodhits++; + os_atomic_inc(&psemstats.pstats_hits, relaxed); /* TOUCH(ncp); */ *psemp = pcp->pseminfo; *pcache = pcp; @@ -241,7 +372,7 @@ psem_cache_search(struct pseminfo **psemp, struct psemname *pnp, * We found a "negative" match, ENOENT notifies client of this match. * The nc_vpid field records whether this is a whiteout. */ - psemstats.neghits++; + os_atomic_inc(&psemstats.pstats_neghits, relaxed); return PSEMCACHE_NEGATIVE; } @@ -252,24 +383,20 @@ static int psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *pcp) { struct psemhashhead *pcpp; - struct pseminfo *dpinfo; - struct psemcache *dpcp; #if DIAGNOSTIC if (pnp->psem_namelen > PSEMNAMLEN) { panic("cache_enter: name too long"); } + if (pnp->psem_teamidlen > PSEMTEAMIDLEN) { + panic("cache_enter: teamid too long"); + } #endif - - /* if the entry has already been added by some one else return */ - if (psem_cache_search(&dpinfo, pnp, &dpcp) == PSEMCACHE_FOUND) { - return EEXIST; - } - if (psemnument >= posix_sem_max) { + if (posix_sem_num >= posix_sem_max) { return ENOSPC; } - psemnument++; + posix_sem_num++; /* * Fill in cache info, if vp is NULL this is a "negative" cache entry. * For negative entries, we have to record whether it is a whiteout. @@ -279,7 +406,16 @@ psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *p pcp->pseminfo = psemp; pcp->psem_nlen = pnp->psem_namelen; bcopy(pnp->psem_nameptr, pcp->psem_name, pcp->psem_nlen); - pcpp = PSEMHASH(pnp); + pcp->psem_teamidlen = pnp->psem_teamidlen; + bcopy(pnp->psem_teamidptr, pcp->psem_teamid, pcp->psem_teamidlen); + + /* Insert into the right table based on Team ID. */ + if (pcp->psem_teamidlen > 0) { + pcpp = PSEMHASH(psem_local, pnp->psem_hash_local); + } else { + pcpp = PSEMHASH(psem_global, pnp->psem_hash_global); + } + #if DIAGNOSTIC { struct psemcache *p; @@ -301,7 +437,15 @@ psem_cache_add(struct pseminfo *psemp, struct psemname *pnp, struct psemcache *p void psem_cache_init(void) { - psemhashtbl = hashinit((int)(posix_sem_max / 2), M_SHM, &psemhash); + /* + * The global table stores semaphores created by processes without a Team + * ID (such as platform binaries). 
The local table stores all other semaphores. + */ + psem_global.psem_table = hashinit((int)(posix_sem_max / 2), M_SHM, &psem_global.psem_table_mask); + psem_local.psem_table = hashinit((int)(posix_sem_max / 2), M_SHM, &psem_local.psem_table_mask); + + read_frandom(psem_global.psem_siphash_key, sizeof(psem_global.psem_siphash_key)); + read_frandom(psem_local.psem_siphash_key, sizeof(psem_local.psem_siphash_key)); } static void @@ -317,7 +461,29 @@ psem_cache_delete(struct psemcache *pcp) #endif /* DIAGNOSTIC */ LIST_REMOVE(pcp, psem_hash); pcp->psem_hash.le_prev = NULL; - psemnument--; + posix_sem_num--; +} + +static int +psem_cache_purge_table(struct psemhashtable *table) +{ + struct psemcache *pcp, *tmppcp; + struct psemhashhead *pcpp; + + for (pcpp = &table->psem_table[table->psem_table_mask]; pcpp >= table->psem_table; pcpp--) { + LIST_FOREACH_SAFE(pcp, pcpp, psem_hash, tmppcp) { + assert(pcp->psem_nlen); + /* + * unconditionally unlink the cache entry + */ + int error = psem_unlink_internal(pcp->pseminfo, pcp); + if (error) { + return error; + } + } + } + + return 0; } /* @@ -328,8 +494,6 @@ psem_cache_delete(struct psemcache *pcp) int psem_cache_purge_all(void) { - struct psemcache *pcp, *tmppcp; - struct psemhashhead *pcpp; int error = 0; if (kauth_cred_issuser(kauth_cred_get()) == 0) { @@ -337,26 +501,22 @@ psem_cache_purge_all(void) } PSEM_SUBSYS_LOCK(); - for (pcpp = &psemhashtbl[psemhash]; pcpp >= psemhashtbl; pcpp--) { - LIST_FOREACH_SAFE(pcp, pcpp, psem_hash, tmppcp) { - assert(pcp->psem_nlen); - /* - * unconditionally unlink the cache entry - */ - error = psem_unlink_internal(pcp->pseminfo, pcp); - if (error) { - goto out; - } - } + error = psem_cache_purge_table(&psem_global); + if (error) { + goto out; } - assert(psemnument == 0); + error = psem_cache_purge_table(&psem_local); + if (error) { + goto out; + } + assert(posix_sem_num == 0); out: PSEM_SUBSYS_UNLOCK(); if (error) { printf("%s: Error %d removing all semaphores: %ld remain!\n", - __func__, error, psemnument); + __func__, error, posix_sem_num); } return error; } @@ -374,18 +534,17 @@ out: int sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) { - size_t i; int indx, error; struct psemname nd; struct pseminfo *pinfo; struct fileproc *fp = NULL; char *pnbuf = NULL; + const char *teamid = NULL; struct pseminfo *new_pinfo = PSEMINFO_NULL; struct psemnode *new_pnode = PSEMNODE_NULL; struct psemcache *pcache = PSEMCACHE_NULL; char * nameptr; - char * cp; - size_t pathlen, plen; + size_t pathlen, plen, teamidlen; mode_t fmode; mode_t cmode = (mode_t)uap->mode; int value = uap->value; @@ -432,13 +591,13 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) plen = pathlen; nameptr = pnbuf; - nd.psem_nameptr = nameptr; - nd.psem_namelen = plen; - nd.psem_hash = 0; - - for (cp = nameptr, i = 1; *cp != 0 && i <= plen; i++, cp++) { - nd.psem_hash += (unsigned char)*cp * i; + teamid = psem_get_teamid(p); + teamidlen = teamid ? 
strlen(teamid) : 0; + if (teamidlen > PSEMTEAMIDLEN) { + error = ENAMETOOLONG; + goto bad; } + nd = psem_cache_hash(nameptr, plen, teamid, teamidlen); /* * attempt to allocate a new fp; if unsuccessful, the fp will be @@ -697,18 +856,28 @@ psem_unlink_internal(struct pseminfo *pinfo, struct psemcache *pcache) return 0; } +static const char * +psem_get_teamid(proc_t p) +{ +#if XNU_TARGET_OS_OSX +#pragma unused(p) + return NULL; +#else + return csproc_get_teamid(p); +#endif +} + int sem_unlink(__unused proc_t p, struct sem_unlink_args *uap, __unused int32_t *retval) { - size_t i; int error = 0; struct psemname nd; struct pseminfo *pinfo; char * nameptr; - char * cp; char * pnbuf; - size_t pathlen; + const char *teamid; + size_t pathlen, teamidlen; struct psemcache *pcache = PSEMCACHE_NULL; pinfo = PSEMINFO_NULL; @@ -741,13 +910,13 @@ sem_unlink(__unused proc_t p, struct sem_unlink_args *uap, __unused int32_t *ret } #endif /* PSXSEM_NAME_RESTRICT */ - nd.psem_nameptr = nameptr; - nd.psem_namelen = pathlen; - nd.psem_hash = 0; - - for (cp = nameptr, i = 1; *cp != 0 && i <= pathlen; i++, cp++) { - nd.psem_hash += (unsigned char)*cp * i; + teamid = psem_get_teamid(p); + teamidlen = teamid ? strlen(teamid) : 0; + if (teamidlen > PSEMTEAMIDLEN) { + error = ENAMETOOLONG; + goto bad; } + nd = psem_cache_hash(nameptr, pathlen, teamid, teamidlen); PSEM_SUBSYS_LOCK(); error = psem_cache_search(&pinfo, &nd, &pcache); diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index 647285863..c061a0928 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -219,8 +219,8 @@ static void munge_vinfo_stat(struct stat64 *sbp, struct vinfo_stat *vsbp); static int proc_piduuidinfo(pid_t pid, uuid_t uuid_buf, uint32_t buffersize); extern int proc_pidpathinfo_internal(proc_t p, __unused uint64_t arg, char *buf, uint32_t buffersize, __unused int32_t *retval); -extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); -extern int cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum); +extern bool cansignal(struct proc *, kauth_cred_t, struct proc *, int); +extern bool cansignal_nomac(proc_t src, kauth_cred_t uc_src, proc_t dst, int signum); extern int proc_get_rusage(proc_t proc, int flavor, user_addr_t buffer, int is_zombie); #define CHECK_SAME_USER TRUE @@ -754,6 +754,19 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) } #endif /* CONFIG_DELAY_IDLE_SLEEP */ + + task_t task = proc_task(p); + + if (task) { + if (task_has_hardened_heap(task)) { + pbsd->pbi_flags |= PROC_FLAG_HARDENED_HEAP_ENABLED; + } + + if (task_has_tpro(task)) { + pbsd->pbi_flags |= PROC_FLAG_TPRO_ENABLED; + } + } + switch (PROC_CONTROL_STATE(p)) { case P_PCTHROTTLE: pbsd->pbi_flags |= PROC_FLAG_PC_THROTTLE; @@ -851,6 +864,17 @@ proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo * pbsd_shortp, int zombi pbsd_shortp->pbsi_flags |= PROC_FLAG_DELAYIDLESLEEP; } #endif /* CONFIG_DELAY_IDLE_SLEEP */ + task_t task = proc_task(p); + + if (task) { + if (task_has_hardened_heap(task)) { + pbsd_shortp->pbsi_flags |= PROC_FLAG_HARDENED_HEAP_ENABLED; + } + + if (task_has_tpro(task)) { + pbsd_shortp->pbsi_flags |= PROC_FLAG_TPRO_ENABLED; + } + } switch (PROC_CONTROL_STATE(p)) { case P_PCTHROTTLE: @@ -3515,7 +3539,7 @@ proc_ident_for_audit_token(proc_ident_t out, audit_token_t token) goto out; } - *out = proc_ident(p); + *out = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT); out: if (p != PROC_NULL) { proc_rele(p); diff --git a/bsd/kern/qsort.c b/bsd/kern/qsort.c index 74c506b20..8d471efb8 
100644 --- a/bsd/kern/qsort.c +++ b/bsd/kern/qsort.c @@ -64,12 +64,8 @@ #include -//#include #include - -__private_extern__ -void -qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *)); +#include static inline char *med3(char *, char *, char *, int (*)(const void *, const void *)); static inline void swapfunc(char *, char *, long, int); diff --git a/osfmk/arm64/bti_telemetry.h b/bsd/kern/qsort.h similarity index 68% rename from osfmk/arm64/bti_telemetry.h rename to bsd/kern/qsort.h index a1628e522..aa1071618 100644 --- a/osfmk/arm64/bti_telemetry.h +++ b/bsd/kern/qsort.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Apple Inc. All rights reserved. + * Copyright (c) 2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,25 +26,27 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _BTI_TELEMETRY_H_ -#define _BTI_TELEMETRY_H_ -#ifdef CONFIG_BTI_TELEMETRY -#include -#include -#include +#ifndef _KERN_QSORT_H_ +#define _KERN_QSORT_H_ -/** - * Wakes up the BTI exception telemetry subsystem. Call once per boot. +#include + +__BEGIN_DECLS + +/* + * The `cmpfunc_t` type is a pointer to a function that should return the + * following: + * + * return < 0 for a < b + * 0 for a == b + * > 0 for a > b */ +typedef int (*cmpfunc_t)(const void *a, const void *b); + +__private_extern__ void -bti_telemetry_init(void); +qsort(void *array, size_t num_elements, size_t element_size, cmpfunc_t compare); -/** - * Handle a BTI exception. Returns TRUE if handled and OK to return from the - * exception, false otherwise. - */ -bool -bti_telemetry_handle_exception(arm_saved_state_t *state); +__END_DECLS -#endif /* CONFIG_BTI_TELEMETRY */ -#endif /* _BTI_TELEMETRY_H_ */ +#endif /* _KERN_QSORT_H_ */ diff --git a/bsd/kern/socket_flows.c b/bsd/kern/socket_flows.c index 55d76b075..b7ddc861d 100644 --- a/bsd/kern/socket_flows.c +++ b/bsd/kern/socket_flows.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2023, 2024 Apple Inc. All rights reserved. + * Copyright (c) 2021, 2023-2025, Apple Inc. All rights reserved. 
* @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code @@ -319,7 +319,11 @@ soflow_fill_hash_entry_from_address(struct soflow_hash_entry *entry, bool isLoca in6_verify_ifscope(&sin6->sin6_addr, sin6->sin6_scope_id); } } - entry->soflow_family = AF_INET6; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + entry->soflow_family = AF_INET; + } else { + entry->soflow_family = AF_INET6; + } return TRUE; default: return FALSE; @@ -334,6 +338,7 @@ soflow_fill_hash_entry_from_inp(struct soflow_hash_entry *entry, bool isLocal, s } if (inp->inp_vflag & INP_IPV6) { + entry->soflow_family = AF_INET6; if (isLocal == TRUE) { if (inp->inp_lport) { entry->soflow_lport = inp->inp_lport; @@ -348,6 +353,9 @@ soflow_fill_hash_entry_from_inp(struct soflow_hash_entry *entry, bool isLocal, s if (islocalUpdate) { entry->soflow_laddr_updated = TRUE; } + if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) { + entry->soflow_family = AF_INET; + } } } else { if (inp->inp_fport) { @@ -357,9 +365,11 @@ soflow_fill_hash_entry_from_inp(struct soflow_hash_entry *entry, bool isLocal, s entry->soflow_faddr.addr6 = inp->in6p_faddr; entry->soflow_faddr6_ifscope = inp->inp_fifscope; in6_verify_ifscope(&entry->soflow_faddr.addr6, inp->inp_fifscope); + if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) { + entry->soflow_family = AF_INET; + } } } - entry->soflow_family = AF_INET6; return TRUE; } else if (inp->inp_vflag & INP_IPV4) { if (isLocal == TRUE) { @@ -572,7 +582,7 @@ soflow_db_lookup_entry_internal(struct soflow_db *db, struct sockaddr *local, st matchentry.soflow_debug = SOFLOW_ENABLE_DEBUG(db->soflow_db_so, (&matchentry)); SOFLOW_ENTRY_LOG(LOG_DEBUG, db->soflow_db_so, &matchentry, true, "Looking for entry"); - if (inp->inp_vflag & INP_IPV6) { + if (matchentry.soflow_family == AF_INET6) { hashkey_faddr = matchentry.soflow_faddr.addr6.s6_addr32[3]; hashkey_laddr = (remoteOnly == false) ? 
matchentry.soflow_laddr.addr6.s6_addr32[3] : 0; } else { @@ -588,12 +598,12 @@ soflow_db_lookup_entry_internal(struct soflow_db *db, struct sockaddr *local, st flowhash = &db->soflow_db_hashbase[inp_hash_element]; LIST_FOREACH(nextentry, flowhash, soflow_entry_link) { - if (inp->inp_vflag & INP_IPV6) { + if (matchentry.soflow_family == AF_INET6) { if (soflow_match_entries_v6(nextentry, &matchentry, remoteOnly)) { SOFLOW_ENTRY_LOG(LOG_DEBUG, db->soflow_db_so, nextentry, nextentry->soflow_debug, "Found entry v6"); break; } - } else if (inp->inp_vflag & INP_IPV4) { + } else if (matchentry.soflow_family == AF_INET) { if (soflow_match_entries_v4(nextentry, &matchentry, remoteOnly)) { SOFLOW_ENTRY_LOG(LOG_DEBUG, db->soflow_db_so, nextentry, nextentry->soflow_debug, "Found entry v4"); break; @@ -746,7 +756,7 @@ soflow_db_add_entry(struct soflow_db *db, struct sockaddr *local, struct sockadd entry->soflow_debug = SOFLOW_ENABLE_DEBUG(db->soflow_db_so, entry); microuptime(&entry->soflow_timestamp); - if (inp->inp_vflag & INP_IPV6) { + if (entry->soflow_family == AF_INET6) { hashkey_faddr = entry->soflow_faddr.addr6.s6_addr32[3]; hashkey_laddr = entry->soflow_laddr.addr6.s6_addr32[3]; } else { @@ -778,15 +788,15 @@ done: return entry; } -static boolean_t -soflow_udp_get_address_from_control(sa_family_t family, struct mbuf *control, uint8_t *__counted_by(*count) *address_ptr, int *count) +static sa_family_t +soflow_udp_get_address_from_control(struct mbuf *control, uint8_t *__counted_by(*count) *address_ptr, int *count) { struct cmsghdr *cm; struct in6_pktinfo *pi6; struct socket *so = NULL; if (control == NULL || address_ptr == NULL) { - return false; + return AF_UNSPEC; } for (; control != NULL; control = control->m_next) { @@ -801,23 +811,21 @@ soflow_udp_get_address_from_control(sa_family_t family, struct mbuf *control, ui switch (cm->cmsg_type) { case IP_RECVDSTADDR: - if (family == AF_INET && - cm->cmsg_level == IPPROTO_IP && + if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_len == CMSG_LEN(sizeof(struct in_addr))) { *address_ptr = CMSG_DATA(cm); *count = sizeof(struct in_addr); - return true; + return AF_INET; } break; case IPV6_PKTINFO: case IPV6_2292PKTINFO: - if (family == AF_INET6 && - cm->cmsg_level == IPPROTO_IPV6 && + if (cm->cmsg_level == IPPROTO_IPV6 && cm->cmsg_len == CMSG_LEN(sizeof(struct in6_pktinfo))) { pi6 = (struct in6_pktinfo *)(void *)CMSG_DATA(cm); *address_ptr = (uint8_t *)&pi6->ipi6_addr; *count = sizeof(struct in6_addr); - return true; + return AF_INET6; } break; default: @@ -825,7 +833,7 @@ soflow_udp_get_address_from_control(sa_family_t family, struct mbuf *control, ui } } } - return false; + return AF_UNSPEC; } static boolean_t @@ -869,10 +877,10 @@ soflow_entry_update_local(struct soflow_db *db, struct soflow_hash_entry *entry, if (local == NULL && control != NULL) { int size = 0; uint8_t * __counted_by(size) addr_ptr = NULL; - boolean_t result = soflow_udp_get_address_from_control(entry->soflow_family, control, &addr_ptr, &size); + sa_family_t family = soflow_udp_get_address_from_control(control, &addr_ptr, &size); - if (result && size && addr_ptr) { - switch (entry->soflow_family) { + if (family != AF_UNSPEC && size && addr_ptr) { + switch (family) { case AF_INET: if (size == sizeof(struct in_addr)) { address_buf.sin.sin_port = 0; @@ -941,6 +949,7 @@ static bool soflow_nstat_provider_request_vals(nstat_provider_context ctx, u_int32_t *ifflagsp, nstat_counts *countsp, + nstat_detailed_counts *detailsp, void *metadatap) { struct soflow_hash_entry *hash_entry = (struct 
soflow_hash_entry *) ctx; @@ -977,6 +986,18 @@ soflow_nstat_provider_request_vals(nstat_provider_context ctx, countsp->nstat_rxpackets, countsp->nstat_rxbytes, countsp->nstat_txpackets, countsp->nstat_txbytes); } + if (detailsp) { + bzero(detailsp, sizeof(*detailsp)); + detailsp->nstat_media_stats.ms_total.ts_rxbytes = hash_entry->soflow_rxbytes; + detailsp->nstat_media_stats.ms_total.ts_txbytes = hash_entry->soflow_txbytes; + detailsp->nstat_media_stats.ms_total.ts_rxpackets = hash_entry->soflow_rxpackets; + detailsp->nstat_media_stats.ms_total.ts_txpackets = hash_entry->soflow_txpackets; + + SOFLOW_LOG(LOG_DEBUG, so, hash_entry->soflow_debug, + "Collected NSTAT detailed counts: rxpackets %llu rxbytes %llu txpackets %llu txbytes %llu", + detailsp->nstat_media_stats.ms_total.ts_rxpackets, detailsp->nstat_media_stats.ms_total.ts_rxbytes, + detailsp->nstat_media_stats.ms_total.ts_txpackets, detailsp->nstat_media_stats.ms_total.ts_txbytes); + } if (metadatap) { nstat_udp_descriptor *desc = (nstat_udp_descriptor *)metadatap; bzero(desc, sizeof(*desc)); diff --git a/bsd/kern/stackshot.c b/bsd/kern/stackshot.c index 129ed2594..b93f3a562 100644 --- a/bsd/kern/stackshot.c +++ b/bsd/kern/stackshot.c @@ -36,7 +36,7 @@ #include extern uint32_t stackshot_estimate_adj; -EXPERIMENT_FACTOR_UINT(_kern, stackshot_estimate_adj, &stackshot_estimate_adj, 0, 100, +EXPERIMENT_FACTOR_LEGACY_UINT(_kern, stackshot_estimate_adj, &stackshot_estimate_adj, 0, 100, "adjust stackshot estimates up by this percentage"); extern unsigned int stackshot_single_thread; @@ -646,7 +646,7 @@ stackshot_dirty_buffer_test(__unused int64_t in, int64_t *out) kern_return_t kr; // 8MB buffer - kr = kmem_alloc(kernel_map, &buf, 8 * 1024 * 1024, KMA_ZERO | KMA_DATA, VM_KERN_MEMORY_DIAG); + kr = kmem_alloc(kernel_map, &buf, 8 * 1024 * 1024, KMA_ZERO | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (kr != KERN_SUCCESS) { printf("stackshot_dirty_buffer_test: kmem_alloc returned %d\n", kr); goto err; @@ -686,7 +686,7 @@ stackshot_kernel_initiator_test(int64_t in, int64_t *out) uint64_t ss_flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_NO_IO_STATS | STACKSHOT_SAVE_KEXT_LOADINFO | STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY | STACKSHOT_THREAD_WAITINFO | STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL; unsigned ss_bytes = 0; if (in == 1) { - kr = kmem_alloc(kernel_map, &buf, 8 * 1024 * 1024, KMA_ZERO | KMA_DATA, VM_KERN_MEMORY_DIAG); + kr = kmem_alloc(kernel_map, &buf, 8 * 1024 * 1024, KMA_ZERO | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (kr != KERN_SUCCESS) { printf("stackshot_kernel_initiator_test: kmem_alloc returned %d\n", kr); goto err; diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index cb7cd1c59..53aba1c5f 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -522,7 +522,7 @@ oslog_init_firehose(void) kmem_alloc(kernel_map, &kernel_firehose_addr, size + ptoa(2), KMA_NOFAIL | KMA_PERMANENT | KMA_GUARD_FIRST | KMA_GUARD_LAST | - KMA_DATA | KMA_ZERO, VM_KERN_MEMORY_LOG); + KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_LOG); kernel_firehose_addr += PAGE_SIZE; /* register buffer with firehose */ diff --git a/bsd/kern/subr_log_stream.c b/bsd/kern/subr_log_stream.c index e994afaf0..c5a40090d 100644 --- a/bsd/kern/subr_log_stream.c +++ b/bsd/kern/subr_log_stream.c @@ -653,7 +653,7 @@ log_stream_teardown(log_stream_t *ls) kfree_data(ls->ls_buf, buf_size); } if (ls->ls_blk) { - kfree_type(uint8_t, ls->ls_blk_count, ls->ls_blk); + kfree_data(ls->ls_blk, ls->ls_blk_count); } bzero(ls, sizeof(*ls)); } diff --git a/bsd/kern/subr_prf.c b/bsd/kern/subr_prf.c 
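/*
 * Illustrative user-space sketch, not part of this patch: the soflow changes
 * above treat an AF_INET6 socket whose addresses are IPv4-mapped
 * (::ffff:a.b.c.d) as an AF_INET flow, and hash on s6_addr32[3], which is
 * exactly where the embedded IPv4 address sits. The helper name below is
 * hypothetical.
 */
#include <netinet/in.h>
#include <string.h>

static int
demo_v4mapped_to_in_addr(const struct in6_addr *a6, struct in_addr *a4)
{
	if (!IN6_IS_ADDR_V4MAPPED(a6)) {
		return 0;
	}
	/* Bytes 12..15 (s6_addr32[3] in xnu) carry a.b.c.d, already in
	 * network byte order. */
	memcpy(&a4->s_addr, &a6->s6_addr[12], sizeof(a4->s_addr));
	return 1;
}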
index 1fdbeb291..518af2ca5 100644 --- a/bsd/kern/subr_prf.c +++ b/bsd/kern/subr_prf.c @@ -232,6 +232,7 @@ tprintf_impl(tpr_t tpr, const char *fmt, va_list ap) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, ap, __builtin_return_address(0)); #pragma clang diagnostic pop } @@ -421,13 +422,15 @@ printf_log_locked(bool addcr, const char *fmt, ...) return retval; } +extern bool IOSystemStateAOT(void); + bool vprintf_log_locked(const char *fmt, va_list ap, bool driverkit) { struct putchar_args pca; pca.flags = TOLOGLOCKED; - if (driverkit && enable_dklog_serial_output) { + if (driverkit && (enable_dklog_serial_output || IOSystemStateAOT())) { pca.flags |= TOCONS; } pca.tty = NULL; diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index ab6a96b47..d4f752387 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -2264,7 +2264,7 @@ ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval) /* Finish copying in the necessary args before taking the proc lock */ error = 0; len = 0; - if (args->cmd == LEDGER_ENTRY_INFO) { + if (args->cmd == LEDGER_ENTRY_INFO || args->cmd == LEDGER_ENTRY_INFO_V2) { error = copyin(args->arg3, (char *)&len, sizeof(len)); } else if (args->cmd == LEDGER_TEMPLATE_INFO) { error = copyin(args->arg2, (char *)&len, sizeof(len)); @@ -2327,17 +2327,20 @@ ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval) break; } - case LEDGER_ENTRY_INFO: { + case LEDGER_ENTRY_INFO: + case LEDGER_ENTRY_INFO_V2: { + bool v2 = (args->cmd == LEDGER_ENTRY_INFO_V2); + int entry_size = (v2) ? sizeof(struct ledger_entry_info_v2) : sizeof(struct ledger_entry_info); void *buf; int sz; /* Settle ledger entries for memorystatus and pages grabbed */ task_ledger_settle(task); - rval = ledger_get_task_entry_info_multiple(task, &buf, &len); + rval = ledger_get_task_entry_info_multiple(task, &buf, &len, v2); proc_rele(proc); if ((rval == 0) && (len >= 0)) { - sz = len * sizeof(struct ledger_entry_info); + sz = len * entry_size; rval = copyout(buf, args->arg2, sz); kfree_data(buf, sz); } @@ -2804,6 +2807,8 @@ SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | C #if __AMP__ +errno_t mach_to_bsd_errno(kern_return_t mach_err); + extern char sysctl_get_bound_cluster_type(void); static int sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS @@ -2825,15 +2830,11 @@ sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS goto out; } - if (cluster_type != 'P' && - cluster_type != 'p' && - cluster_type != 'E' && - cluster_type != 'e') { - return EINVAL; + kern_return_t kr = thread_soft_bind_cluster_type(current_thread(), cluster_type); + if (kr != KERN_SUCCESS) { + return mach_to_bsd_errno(kr); } - thread_soft_bind_cluster_type(current_thread(), cluster_type); - out: buff[0] = sysctl_get_bound_cluster_type(); @@ -2844,7 +2845,7 @@ SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_type, CTLTYPE_STRING | CT 0, 0, sysctl_kern_sched_thread_bind_cluster_type, "A", ""); extern char sysctl_get_task_cluster_type(void); -extern void sysctl_task_set_cluster_type(char cluster_type); +extern kern_return_t sysctl_task_set_cluster_type(char cluster_type); static int sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS { @@ -2865,14 +2866,11 @@ sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS goto out; } - if (cluster_type != 'E' && - cluster_type != 'e' && - 
cluster_type != 'P' && - cluster_type != 'p') { - return EINVAL; + kern_return_t kr = sysctl_task_set_cluster_type(cluster_type); + if (kr != KERN_SUCCESS) { + return mach_to_bsd_errno(kr); } - sysctl_task_set_cluster_type(cluster_type); out: cluster_type = sysctl_get_task_cluster_type(); buff[0] = cluster_type; @@ -2934,9 +2932,15 @@ SYSCTL_INT(_kern, OID_AUTO, sched_edge_migrate_ipi_immediate, CTLFLAG_RW | CTLFL #endif /* __AMP__ */ +#if DEVELOPMENT || DEBUG +extern int timeouts_are_fatal; +EXPERIMENT_FACTOR_INT(timeouts_are_fatal, &timeouts_are_fatal, 0, 1, + "Do timeouts panic or emit telemetry (0: telemetry, 1: panic)"); +#endif + #if SCHED_HYGIENE_DEBUG -SYSCTL_QUAD(_kern, OID_AUTO, interrupt_masked_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_QUAD(_kern, OID_AUTO, interrupt_masked_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_LEGACY_EXPERIMENT, &interrupt_masked_timeout, "Interrupt masked duration after which a tracepoint is emitted or the device panics (in mach timebase units)"); @@ -2944,7 +2948,7 @@ SYSCTL_INT(_kern, OID_AUTO, interrupt_masked_debug_mode, CTLFLAG_RW | CTLFLAG_LO &interrupt_masked_debug_mode, 0, "Enable interrupt masked tracing or panic (0: off, 1: trace, 2: panic)"); -SYSCTL_QUAD(_kern, OID_AUTO, sched_preemption_disable_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED, +SYSCTL_QUAD(_kern, OID_AUTO, sched_preemption_disable_threshold_mt, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_LEGACY_EXPERIMENT, &sched_preemption_disable_threshold_mt, "Preemption disablement duration after which a tracepoint is emitted or the device panics (in mach timebase units)"); diff --git a/bsd/kern/sys_socket.c b/bsd/kern/sys_socket.c index 04f5e7269..19fb4147e 100644 --- a/bsd/kern/sys_socket.c +++ b/bsd/kern/sys_socket.c @@ -453,7 +453,7 @@ static __attribute__((unused)) void soioctl_cassert(void) { /* - * This is equivalent to _CASSERT() and the compiler wouldn't + * This is equivalent to static_assert() and the compiler wouldn't * generate any instructions, thus for compile time only. 
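/*
 * Illustrative sketch, not taken from this patch: the idiom soioctl_cassert()
 * relies on. Case labels must be distinct integer constant expressions, so a
 * switch listing a set of command values fails to compile if any two collide
 * (or if one of them is 0, colliding with the anchor case), while emitting
 * no code at run time. The constants and function name below are made up.
 */
#define DEMO_CMD_A 0x1001
#define DEMO_CMD_B 0x1002

static __attribute__((unused)) void
demo_cmd_cassert(void)
{
	switch (0) {
	case 0:
	case DEMO_CMD_A:
	case DEMO_CMD_B:
		break;
	}
}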
*/ switch ((u_long)0) { diff --git a/bsd/kern/sys_ulock.c b/bsd/kern/sys_ulock.c index 8014590dd..39b845b1a 100644 --- a/bsd/kern/sys_ulock.c +++ b/bsd/kern/sys_ulock.c @@ -434,6 +434,8 @@ uaddr_findobj(user_addr_t uaddr, uint64_t *objectp, uint64_t *offsetp) kern_return_t ret; vm_page_info_basic_data_t info; mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT; + + ret = vm_map_page_info(current_map(), uaddr, VM_PAGE_INFO_BASIC, (vm_page_info_t)&info, &count); if (ret != KERN_SUCCESS) { return EINVAL; diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index a71991506..c08a74c6d 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -257,7 +257,7 @@ 162 AUE_NULL ALL { int nosys(void); } { old getdomainname } 163 AUE_NULL ALL { int nosys(void); } { old setdomainname } -164 AUE_NULL ALL { int nosys(void); } +164 AUE_FUNMOUNT ALL { int funmount(int fd, int flags); } 165 AUE_QUOTACTL ALL { int quotactl(const char *path, int cmd, int uid, caddr_t arg); } 166 AUE_NULL ALL { int nosys(void); } { old exportfs } 167 AUE_MOUNT ALL { int mount(char *type, char *path, int flags, caddr_t data); } diff --git a/bsd/kern/trace_codes b/bsd/kern/trace_codes index 2f110897f..d91580a5a 100644 --- a/bsd/kern/trace_codes +++ b/bsd/kern/trace_codes @@ -515,6 +515,10 @@ 0x1400188 MACH_SCHED_AST_CHECK 0x140018C MACH_SCHED_PREEMPT_TIMER_ACTIVE 0x1400190 MACH_PROCESSOR_SHUTDOWN +0x1400194 MACH_SCHED_PSET_BITMASKS +0x1400198 MACH_SUSPEND_DRIVERKIT_USERSPACE +0x140019c MACH_SCHED_PREFERRED_PSET +0x14001a0 MACH_SCHED_ONCORE_PREEMPT 0x1500000 MACH_MSGID_INVALID 0x1600000 MTX_SLEEP 0x1600004 MTX_SLEEP_DEADLINE @@ -631,6 +635,142 @@ 0x1b00020 VM_RECLAIM_SAMPLE 0x1b00028 VM_RECLAIM_RESIZE 0x1b0002c VM_RECLAIM_FLUSH +0x1b10804 VMLP_EVENT_API_FILL_PROCREGIONINFO +0x1b10808 VMLP_EVENT_API_FILL_PROCREGIONINFO_ONLYMAPPEDVNODES +0x1b1080c VMLP_EVENT_API_FIND_MAPPING_TO_SLIDE +0x1b10810 VMLP_EVENT_API_GET_VMMAP_ENTRIES +0x1b10814 VMLP_EVENT_API_GET_VMSUBMAP_ENTRIES +0x1b10818 VMLP_EVENT_API_KDP_LIGHTWEIGHT_FAULT +0x1b1081c VMLP_EVENT_API_KMEM_ALLOC_GUARD_INTERNAL +0x1b10820 VMLP_EVENT_API_KMEM_FREE_GUARD +0x1b10824 VMLP_EVENT_API_KMEM_GET_GOBJ_STATS +0x1b10828 VMLP_EVENT_API_KMEM_POPULATE_META_LOCKED +0x1b1082c VMLP_EVENT_API_KMEM_REALLOC_GUARD +0x1b10830 VMLP_EVENT_API_KMEM_SIZE_GUARD +0x1b10834 VMLP_EVENT_API_MACH_MAKE_MEMORY_ENTRY_SHARE +0x1b10838 VMLP_EVENT_API_MACH_VM_RANGE_CREATE_V1 +0x1b1083c VMLP_EVENT_API_MOVE_PAGES_TO_QUEUE +0x1b10840 VMLP_EVENT_API_TASK_FIND_REGION_DETAILS +0x1b10844 VMLP_EVENT_API_TASK_INFO +0x1b10848 VMLP_EVENT_API_VM32_REGION_INFO +0x1b1084c VMLP_EVENT_API_VM32_REGION_INFO_64 +0x1b10850 VMLP_EVENT_API_VM32__MAP_EXEC_LOCKDOWN +0x1b10854 VMLP_EVENT_API_VMTC_REVALIDATE_LOOKUP +0x1b10858 VMLP_EVENT_API_VM_FAULT_COPY +0x1b1085c VMLP_EVENT_API_VM_FAULT_INTERNAL +0x1b10860 VMLP_EVENT_API_VM_KERN_ALLOCATION_INFO +0x1b10864 VMLP_EVENT_API_VM_MAP_APPLE_PROTECTED +0x1b10868 VMLP_EVENT_API_VM_MAP_BEHAVIOR_SET +0x1b1086c VMLP_EVENT_API_VM_MAP_CAN_REUSE +0x1b10870 VMLP_EVENT_API_VM_MAP_CHECK_PROTECTION +0x1b10874 VMLP_EVENT_API_VM_MAP_COPYIN_INTERNAL +0x1b10878 VMLP_EVENT_API_VM_MAP_COPYOUT_INTERNAL +0x1b1087c VMLP_EVENT_API_VM_MAP_COPY_OVERWRITE +0x1b10880 VMLP_EVENT_API_VM_MAP_COPY_OVERWRITE_ALIGNED +0x1b10884 VMLP_EVENT_API_VM_MAP_COPY_OVERWRITE_NESTED +0x1b10888 VMLP_EVENT_API_VM_MAP_COPY_OVERWRITE_UNALIGNED +0x1b1088c VMLP_EVENT_API_VM_MAP_CREATE_UPL +0x1b10890 VMLP_EVENT_API_VM_MAP_CS_DEBUGGED_SET +0x1b10894 VMLP_EVENT_API_VM_MAP_CS_ENFORCEMENT_SET +0x1b10898 
VMLP_EVENT_API_VM_MAP_DELETE +0x1b1089c VMLP_EVENT_API_VM_MAP_DELETE_SUBMAP_RECURSE +0x1b108a0 VMLP_EVENT_API_VM_MAP_DESTROY +0x1b108a4 VMLP_EVENT_API_VM_MAP_DISCONNECT_PAGE_MAPPINGS +0x1b108a8 VMLP_EVENT_API_VM_MAP_ENTER +0x1b108ac VMLP_EVENT_API_VM_MAP_ENTER_MEM_OBJECT +0x1b108b0 VMLP_EVENT_API_VM_MAP_ENTRY_HAS_DEVICE_PAGER +0x1b108b4 VMLP_EVENT_API_VM_MAP_EXEC_LOCKDOWN +0x1b108b8 VMLP_EVENT_API_VM_MAP_FIND_SPACE +0x1b108bc VMLP_EVENT_API_VM_MAP_FORK +0x1b108c0 VMLP_EVENT_API_VM_MAP_FORK_COPY +0x1b108c4 VMLP_EVENT_API_VM_MAP_FREEZE +0x1b108c8 VMLP_EVENT_API_VM_MAP_GET_PHYS_PAGE +0x1b108cc VMLP_EVENT_API_VM_MAP_INHERIT +0x1b108d0 VMLP_EVENT_API_VM_MAP_INJECT_ERROR +0x1b108d4 VMLP_EVENT_API_VM_MAP_IS_CORPSE_SOURCE +0x1b108d8 VMLP_EVENT_API_VM_MAP_LOOKUP_AND_LOCK_OBJECT +0x1b108dc VMLP_EVENT_API_VM_MAP_MACHINE_ATTRIBUTE +0x1b108e0 VMLP_EVENT_API_VM_MAP_MARK_ALIEN +0x1b108e4 VMLP_EVENT_API_VM_MAP_MSYNC +0x1b108e8 VMLP_EVENT_API_VM_MAP_NON_ALIGNED_TEST +0x1b108ec VMLP_EVENT_API_VM_MAP_OVERWRITE_SUBMAP_RECURSE +0x1b108f0 VMLP_EVENT_API_VM_MAP_PAGEOUT +0x1b108f4 VMLP_EVENT_API_VM_MAP_PAGE_RANGE_INFO_INTERNAL +0x1b108f8 VMLP_EVENT_API_VM_MAP_PARTIAL_REAP +0x1b108fc VMLP_EVENT_API_VM_MAP_PROTECT +0x1b10900 VMLP_EVENT_API_VM_MAP_PURGABLE_CONTROL +0x1b10904 VMLP_EVENT_API_VM_MAP_RAISE_MAX_OFFSET +0x1b10908 VMLP_EVENT_API_VM_MAP_RAISE_MIN_OFFSET +0x1b1090c VMLP_EVENT_API_VM_MAP_RANGE_CONFIGURE +0x1b10910 VMLP_EVENT_API_VM_MAP_REGION +0x1b10914 VMLP_EVENT_API_VM_MAP_REGION_RECURSE_64 +0x1b10918 VMLP_EVENT_API_VM_MAP_REMAP +0x1b1091c VMLP_EVENT_API_VM_MAP_REMAP_EXTRACT +0x1b10920 VMLP_EVENT_API_VM_MAP_REMOVE_AND_UNLOCK +0x1b10924 VMLP_EVENT_API_VM_MAP_REMOVE_GUARD +0x1b10928 VMLP_EVENT_API_VM_MAP_REUSABLE_PAGES +0x1b1092c VMLP_EVENT_API_VM_MAP_REUSE_PAGES +0x1b10930 VMLP_EVENT_API_VM_MAP_SET_CACHE_ATTR +0x1b10934 VMLP_EVENT_API_VM_MAP_SET_CORPSE_SOURCE +0x1b10938 VMLP_EVENT_API_VM_MAP_SET_DATA_LIMIT +0x1b1093c VMLP_EVENT_API_VM_MAP_SET_MAX_ADDR +0x1b10940 VMLP_EVENT_API_VM_MAP_SET_SIZE_LIMIT +0x1b10944 VMLP_EVENT_API_VM_MAP_SET_TPRO_ENFORCEMENT +0x1b10948 VMLP_EVENT_API_VM_MAP_SET_TPRO_RANGE +0x1b1094c VMLP_EVENT_API_VM_MAP_SET_USER_WIRE_LIMIT +0x1b10950 VMLP_EVENT_API_VM_MAP_SHADOW_MAX +0x1b10954 VMLP_EVENT_API_VM_MAP_SIGN +0x1b10958 VMLP_EVENT_API_VM_MAP_SIMPLIFY +0x1b1095c VMLP_EVENT_API_VM_MAP_SINGLE_JIT +0x1b10960 VMLP_EVENT_API_VM_MAP_SIZES +0x1b10964 VMLP_EVENT_API_VM_MAP_SUBMAP_PMAP_CLEAN +0x1b10968 VMLP_EVENT_API_VM_MAP_SWITCH_PROTECT +0x1b1096c VMLP_EVENT_API_VM_MAP_TERMINATE +0x1b10970 VMLP_EVENT_API_VM_MAP_UNSET_CORPSE_SOURCE +0x1b10974 VMLP_EVENT_API_VM_MAP_UNWIRE_NESTED +0x1b10978 VMLP_EVENT_API_VM_MAP_WILLNEED +0x1b1097c VMLP_EVENT_API_VM_MAP_WIRE_NESTED +0x1b10980 VMLP_EVENT_API_VM_MAP_ZERO +0x1b10984 VMLP_EVENT_API_VM_PAGE_DIAGNOSE +0x1b10988 VMLP_EVENT_API_VM_SHARED_REGION_MAP_FILE +0x1b1098c VMLP_EVENT_API_VM_TOGGLE_ENTRY_REUSE +0x1b10990 VMLP_EVENT_API_ZONE_METADATA_INIT +0x1b10994 VMLP_EVENT_API_ZONE_SUBMAP_ALLOC_SEQUESTERED_VA +0x1b11004 VMLP_EVENT_LOCK_TRY_EXCL +0x1b11008 VMLP_EVENT_LOCK_FAIL_EXCL +0x1b1100c VMLP_EVENT_LOCK_REQ_EXCL +0x1b11010 VMLP_EVENT_LOCK_GOT_EXCL +0x1b11014 VMLP_EVENT_LOCK_UNLOCK_EXCL +0x1b11018 VMLP_EVENT_LOCK_DOWNGRADE +0x1b1101c VMLP_EVENT_LOCK_TRY_SH +0x1b11020 VMLP_EVENT_LOCK_FAIL_SH +0x1b11024 VMLP_EVENT_LOCK_REQ_SH +0x1b11028 VMLP_EVENT_LOCK_GOT_SH +0x1b1102c VMLP_EVENT_LOCK_UNLOCK_SH +0x1b11030 VMLP_EVENT_LOCK_TRY_UPGRADE +0x1b11034 VMLP_EVENT_LOCK_GOT_UPGRADE +0x1b11038 VMLP_EVENT_LOCK_FAIL_UPGRADE +0x1b1103c VMLP_EVENT_LOCK_SLEEP_BEGIN +0x1b11040 
VMLP_EVENT_LOCK_SLEEP_END +0x1b11044 VMLP_EVENT_LOCK_YIELD_BEGIN +0x1b11048 VMLP_EVENT_LOCK_YIELD_END +0x1b11804 VMLP_EVENT_RANGE +0x1b20004 MEMINFO_PGCNT1 +0x1b20008 MEMINFO_PGCNT2 +0x1b2000c MEMINFO_PGCNT3 +0x1b20010 MEMINFO_PGCNT4 +0x1b20014 MEMINFO_PGCNT5 +0x1b20018 MEMINFO_PGCNT6 +0x1b2001c MEMINFO_PGCNT7 +0x1b20020 MEMINFO_PGCNT8 +0x1b20044 MEMINFO_PGOUT1 +0x1b20048 MEMINFO_PGOUT2 +0x1b2004c MEMINFO_PGOUT3 +0x1b20050 MEMINFO_PGOUT4 +0x1b20054 MEMINFO_PGOUT5 +0x1b20058 MEMINFO_PGOUT6 +0x1b20084 MEMINFO_DEMAND1 +0x1b20088 MEMINFO_DEMAND2 0x2010000 L_IP_In_Beg 0x2010004 L_IP_Out_Beg 0x2010008 L_IP_In_End @@ -1199,7 +1339,15 @@ 0x3130180 VFS_check_getattrlistbulk 0x3130184 VFS_check_copyfile 0x3130188 VFS_notify_unlink +0x313018C VFS_notify_rename_swap +0x3130190 VFS_check_rename_swap +0x3130194 VFS_check_dataprotect_set 0x3130198 VFS_mount_notify_mount +0x313019C VFS_mount_check_remount_with_flags +0x31301A0 VFS_graft_check_graft +0x31301A4 VFS_graft_check_ungraft +0x31301A8 VFS_graft_notify_graft +0x31301AC VFS_graft_notify_ungraft 0X3134000 VFS_io_compression_stats 0x3CF0000 CP_OFFSET_IO 0x4010004 proc_exit @@ -1246,6 +1394,48 @@ 0x4030050 KEVENT_kqwl_unbind 0x4030054 KEVENT_knote_enable 0x4030058 KEVENT_knote_vanished +0x40d0004 AIO_work_queued +0x40d0008 AIO_worker_wake +0x40d000c AIO_completion_sig +0x40d0010 AIO_completion_kevent +0x40d0014 AIO_completion_cleanup_wait +0x40d0018 AIO_completion_cleanup_wake +0x40d001c AIO_completion_suspend_wake +0x40d0028 AIO_cancel +0x40d002c AIO_cancel_async_workq +0x40d0030 AIO_cancel_sync_workq +0x40d0034 AIO_cancel_activeq +0x40d0038 AIO_cancel_doneq +0x40d0050 AIO_fsync +0x40d0054 AIO_fsync_delay +0x40d0078 AIO_read +0x40d00a0 AIO_write +0x40d00c8 AIO_listio +0x40d00f0 AIO_error +0x40d00f4 AIO_error_val +0x40d00f8 AIO_error_activeq +0x40d00fc AIO_error_workq +0x40d0118 AIO_return +0x40d011c AIO_return_val +0x40d0120 AIO_return_activeq +0x40d0124 AIO_return_workq +0x40d0140 AIO_exec +0x40d0168 AIO_exit +0x40d016c AIO_exit_sleep +0x40d0190 AIO_close +0x40d0194 AIO_close_sleep +0x40d01b8 AIO_suspend +0x40d01bc AIO_suspend_sleep +0x40d01e0 AIO_worker_thread +0x40d0208 AIO_register_kevent +0x40d0230 AIO_WQ_process_entry +0x40d0234 AIO_WQ_aio_thread_create +0x40d0238 AIO_WQ_aio_thread_terminate +0x40d023c AIO_WQ_aio_death_call +0x40d0240 AIO_WQ_aio_thread_park +0x40d0244 AIO_WQ_aio_select_req +0x40d0248 AIO_WQ_aio_thread_create_failed +0x40d024c AIO_WQ_aio_thread_wakeup 0x40e0104 BSC_msync_extended_info 0x40e0264 BSC_pread_extended_info 0x40e0268 BSC_pwrite_extended_info @@ -1692,6 +1882,8 @@ 0x01a9002c MACH_SCHED_EDGE_LOAD_AVG 0x01a90030 MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD 0x01a90034 MACH_SCHED_EDGE_RSRC_HEAVY_THREAD +0x01a90038 MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE +0x01a9003c MACH_SCHED_EDGE_STIR_THE_POT 0x01ab0000 WORKGROUP_INTERVAL_CREATE 0x01ab0004 WORKGROUP_INTERVAL_DESTROY 0x01ab0008 WORKGROUP_INTERVAL_CHANGE @@ -1990,6 +2182,8 @@ 0x26120004 imp_msg_send 0x26120008 imp_msg_delv 0x26130000 imp_watchport +0x26150000 imp_thread_promote_above_task +0x26160000 imp_runaway_mitigation 0x26170000 imp_suppression_inactive 0x26170004 imp_suppression_active 0x26180000 imp_apptype_none @@ -2006,10 +2200,13 @@ 0x261a0004 imp_usynch_remove_override 0x261b0000 imp_donor_update_live_donor 0x261b0004 imp_donor_init_donor_state +0x261c0000 imp_main_thread_qos 0x261d0000 imp_sync_ipc_qos_applied 0x261d0004 imp_sync_ipc_qos_removed 0x261d0008 imp_sync_ipc_qos_overflow 0x261d000c imp_sync_ipc_qos_underflow +0x261e0000 imp_set_gpu_role +0x261f0000 
imp_query_gpu_role 0x26210010 imp_task_int_bg 0x26210014 imp_task_ext_bg 0x26210020 imp_thread_int_bg @@ -2047,6 +2244,8 @@ 0x263d0028 imp_thread_qos_ipc_override 0x263e0028 imp_thread_qos_servicer_override 0x263f0028 imp_thread_iotier_kevent_override +0x26400028 imp_thread_iotier_kevent_override +0x26410018 imp_task_runaway_mitigation 0x27000000 PERF_PCEVENT 0x27001000 PERF_CPU_IDLE 0x27001100 PERF_CPU_IDLE_TIMER diff --git a/bsd/kern/tracker.c b/bsd/kern/tracker.c index ec48c4e9a..5c9618ffa 100644 --- a/bsd/kern/tracker.c +++ b/bsd/kern/tracker.c @@ -56,8 +56,10 @@ static os_log_t tracker_db_log_handle = NULL; static struct thread *g_tracker_gc_thread; #define TRACKER_GC_RUN_INTERVAL_NSEC (10 * NSEC_PER_SEC) // GC wakes up periodically #define TRACKER_GC_IDLE_TO (10) // age out entries when not used for a while +#define TRACKER_GC_EXTENDED_IDLE_TO (120) // extended timeout for entries that are used for policy evaluation static int tracker_db_idle_timeout = TRACKER_GC_IDLE_TO; +static int tracker_db_extended_idle_timeout = TRACKER_GC_EXTENDED_IDLE_TO; /* * Sysctls for debug logs control @@ -70,6 +72,9 @@ SYSCTL_INT(_net_tracker, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED, SYSCTL_INT(_net_tracker, OID_AUTO, idle_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &tracker_db_idle_timeout, 0, ""); +SYSCTL_INT(_net_tracker, OID_AUTO, extended_idle_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, + &tracker_db_extended_idle_timeout, 0, ""); + #define TRACKER_LOG(level, fmt, ...) \ do { \ if (tracker_log_level >= level && tracker_db_log_handle) { \ @@ -317,6 +322,12 @@ copy_metadata(tracker_metadata_t *dst_metadata, tracker_metadata_t *src_metadata dst_domain_owner_buffer[0] = 0; } + if (dst_metadata->flags & SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT) { + // If the client says this needs to extend the timeout, save that. + // This flag is passed in by the caller, and updates the saved metadata. + src_metadata->flags |= SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT; + } + is_short = (dst_metadata->flags & SO_TRACKER_ATTRIBUTE_FLAGS_DOMAIN_SHORT); dst_metadata->flags = src_metadata->flags; if (is_short) { @@ -456,6 +467,7 @@ tracker_search_and_insert(struct tracker_db *db, struct tracker_hash_entry *matc if (insert) { if (copy_metadata(&nextentry->metadata, &matchentry->metadata) == true) { TRACKER_ENTRY_LOG(LOG_DEBUG, "Updated entry", nextentry, hash_element); + nextentry->lastused = net_uptime(); return nextentry; } else { // Failed to update found entry, delete it from db and allow insertion of new entry. 
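/*
 * Simplified sketch, not the kernel's code: how the garbage-collection pass
 * below selects a per-entry idle timeout once
 * SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT is honored. The struct and
 * function here are invented for illustration; the real entry is
 * struct tracker_hash_entry and the timestamp comes from net_uptime().
 */
#include <stdbool.h>
#include <stdint.h>

struct demo_tracker_entry {
	uint32_t flags;
	uint64_t lastused;      /* coarse seconds */
};

static bool
demo_tracker_idle_timed_out(const struct demo_tracker_entry *e, uint64_t now,
    int idle_to, int extended_idle_to, uint32_t extended_flag)
{
	int timeout = (e->flags & extended_flag) ? extended_idle_to : idle_to;

	return (now - e->lastused) >= (uint64_t)timeout;
}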
@@ -1069,7 +1081,11 @@ tracker_entry_expire(void *v, wait_result_t w) hash = &g_tracker_db.tracker_hashbase[i]; LIST_FOREACH_SAFE(entry, hash, entry_link, temp_entry) { - if (tracker_idle_timed_out(entry, tracker_db_idle_timeout, current_time)) { + int timeout_value = tracker_db_idle_timeout; + if (entry->metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT) { + timeout_value = tracker_db_extended_idle_timeout; + } + if (tracker_idle_timed_out(entry, timeout_value, current_time)) { TRACKER_ENTRY_LOG(LOG_DEBUG, "Deleting entry - IDLE TO", entry, i); g_tracker_db.tracker_count--; if (entry->metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_DOMAIN_SHORT) { diff --git a/bsd/kern/tty_dev.c b/bsd/kern/tty_dev.c index b9d5fbb7b..9deb01fa7 100644 --- a/bsd/kern/tty_dev.c +++ b/bsd/kern/tty_dev.c @@ -222,11 +222,16 @@ static struct ptmx_ioctl * pty_get_ioctl(dev_t dev, int open_flag, struct tty_dev_t **out_driver) { struct tty_dev_t *driver = pty_get_driver(dev); + struct ptmx_ioctl *out = NULL; if (out_driver) { *out_driver = driver; } if (driver && driver->open) { - return driver->open(minor(dev), open_flag); + out = driver->open(minor(dev), open_flag); + if (!out) { + printf("pty_get_ioctl: driver->open returned NULL\n"); + } + return out; } return NULL; } diff --git a/bsd/kern/tty_ptmx.c b/bsd/kern/tty_ptmx.c index be22d0e18..e41092b41 100644 --- a/bsd/kern/tty_ptmx.c +++ b/bsd/kern/tty_ptmx.c @@ -286,6 +286,7 @@ ptmx_get_ioctl(int minor, int open_flag) */ if ((_state.pis_total - _state.pis_free) >= ptmx_max) { DEVFS_UNLOCK(); + printf("ptmx_get_ioctl failed due to ptmx_max limit %d\n", ptmx_max); return NULL; } DEVFS_UNLOCK(); @@ -314,6 +315,7 @@ ptmx_get_ioctl(int minor, int open_flag) ttyfree(new_ptmx_ioctl->pt_tty); DEVFS_UNLOCK(); kfree_type(struct ptmx_ioctl, new_ptmx_ioctl); + printf("ptmx_get_ioctl failed due to ptmx_max limit %d\n", ptmx_max); return NULL; } @@ -348,6 +350,7 @@ ptmx_get_ioctl(int minor, int open_flag) ttyfree(new_ptmx_ioctl->pt_tty); DEVFS_UNLOCK(); kfree_type(struct ptmx_ioctl, new_ptmx_ioctl); + printf("ptmx_get_ioctl failed because minor number %d was out of range\n", minor); return NULL; } @@ -357,6 +360,7 @@ ptmx_get_ioctl(int minor, int open_flag) kfree_type(struct ptmx_ioctl, new_ptmx_ioctl); /* Special error value so we know to redrive the open, we've been raced */ + /* XXX Can this still occur? 
*/ return (struct ptmx_ioctl*)-1; } diff --git a/bsd/kern/tty_pty.c b/bsd/kern/tty_pty.c index fe4771fa6..27c04b854 100644 --- a/bsd/kern/tty_pty.c +++ b/bsd/kern/tty_pty.c @@ -117,6 +117,7 @@ static struct ptmx_ioctl * pty_get_ioctl(int minor, int open_flag) { if (minor >= NPTY) { + printf("pty_get_ioctl failed because minor number %d exceeded %d\n", minor, NPTY); return NULL; } struct ptmx_ioctl *pti = &pt_ioctl[minor]; diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 535ddcd38..70f126182 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -534,9 +534,6 @@ cs_validate_csblob( uint32_t n, count; const CS_CodeDirectory *best_cd = NULL; unsigned int best_rank = 0; -#if XNU_PLATFORM_WatchOS - const CS_CodeDirectory *sha1_cd = NULL; -#endif if (length < sizeof(CS_SuperBlob)) { return EBADEXEC; @@ -592,15 +589,6 @@ cs_validate_csblob( printf("multiple hash=%d CodeDirectories in signature; rejecting\n", best_cd->hashType); return EBADEXEC; } -#if XNU_PLATFORM_WatchOS - if (candidate->hashType == CS_HASHTYPE_SHA1) { - if (sha1_cd != NULL) { - printf("multiple sha1 CodeDirectories in signature; rejecting\n"); - return EBADEXEC; - } - sha1_cd = candidate; - } -#endif } else if (type == CSSLOT_ENTITLEMENTS) { if (ntohl(subBlob->magic) != CSMAGIC_EMBEDDED_ENTITLEMENTS) { return EBADEXEC; @@ -657,37 +645,6 @@ cs_validate_csblob( library_constraint = subBlob; } } - -#if XNU_PLATFORM_WatchOS - /* To keep watchOS fast enough, we have to resort to sha1 for - * some code. - * - * At the time of writing this comment, known sha1 attacks are - * collision attacks (not preimage or second preimage - * attacks), which do not apply to platform binaries since - * they have a fixed hash in the trust cache. Given this - * property, we only prefer sha1 code directories for adhoc - * signatures, which always have to be in a trust cache to be - * valid (can-load-cdhash does not exist for watchOS). Those - * are, incidentally, also the platform binaries, for which we - * care about the performance hit that sha256 would bring us. - * - * Platform binaries may still contain a (not chosen) sha256 - * code directory, which keeps software updates that switch to - * sha256-only small. - */ - - if (*rcd != NULL && sha1_cd != NULL && (ntohl(sha1_cd->flags) & CS_ADHOC)) { - if (sha1_cd->flags != (*rcd)->flags) { - printf("mismatched flags between hash %d (flags: %#x) and sha1 (flags: %#x) cd.\n", - (int)(*rcd)->hashType, (*rcd)->flags, sha1_cd->flags); - *rcd = NULL; - return EBADEXEC; - } - - *rcd = sha1_cd; - } -#endif } else if (ntohl(blob->magic) == CSMAGIC_CODEDIRECTORY) { if ((error = cs_validate_codedirectory((const CS_CodeDirectory *)(const void *)addr, length)) != 0) { return error; @@ -6158,7 +6115,8 @@ int ubc_cs_getcdhash( vnode_t vp, off_t offset, - unsigned char *cdhash) + unsigned char *cdhash, + uint8_t *type) { struct cs_blob *blobs, *blob; off_t rel_offset; @@ -6183,8 +6141,14 @@ ubc_cs_getcdhash( /* we didn't find a blob covering "offset" */ ret = EBADEXEC; /* XXX any better error ? 
*/ } else { - /* get the SHA1 hash of that blob */ + /* get the CDHash of that blob */ bcopy(blob->csb_cdhash, cdhash, sizeof(blob->csb_cdhash)); + + /* get the type of the CDHash */ + if (type != NULL) { + *type = blob->csb_cd->hashType; + } + ret = 0; } diff --git a/bsd/kern/uipc_domain.c b/bsd/kern/uipc_domain.c index 614df7172..35bc44dfe 100644 --- a/bsd/kern/uipc_domain.c +++ b/bsd/kern/uipc_domain.c @@ -75,6 +75,8 @@ #include #include +#include + #include #include #include @@ -535,7 +537,6 @@ net_add_proto_old(struct protosw_old *opp, struct domain_old *odp) pp->pr_usrreqs = pru; pp->pr_init = pr_init_old; pp->pr_drain = opp->pr_drain; - pp->pr_sysctl = opp->pr_sysctl; pp->pr_lock = opp->pr_lock; pp->pr_unlock = opp->pr_unlock; pp->pr_getlock = opp->pr_getlock; @@ -1024,41 +1025,6 @@ net_uptime2timeval(struct timeval *tv) tv->tv_sec = (time_t)net_uptime(); } -/* - * An alternative way to obtain the coarse-grained uptime (in seconds) - * for networking code which do not require high-precision timestamp, - * as this is significantly cheaper than microuptime(). - */ -uint64_t -net_uptime(void) -{ - if (_net_uptime == 0) { - net_update_uptime(); - } - - return _net_uptime; -} - -uint64_t -net_uptime_ms(void) -{ - if (_net_uptime_ms == 0) { - net_update_uptime(); - } - - return _net_uptime_ms; -} - -uint64_t -net_uptime_us(void) -{ - if (_net_uptime_us == 0) { - net_update_uptime(); - } - - return _net_uptime_us; -} - void domain_proto_mtx_lock_assert_held(void) { diff --git a/bsd/kern/uipc_domain.h b/bsd/kern/uipc_domain.h new file mode 100644 index 000000000..d2317bbbe --- /dev/null +++ b/bsd/kern/uipc_domain.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_UIPC_DOMAIN_H +#define _KERN_UIPC_DOMAIN_H + +#ifdef XNU_KERNEL_PRIVATE + + +#include +#include + +#include + +static inline void +proto_memacct_add(struct protosw *proto, unsigned int size) +{ + if (proto->pr_mem_acct) { + mem_acct_add(proto->pr_mem_acct, size); + } else { + socket_memacct_add(size); + } +} + +static inline void +proto_memacct_sub(struct protosw *proto, unsigned int size) +{ + if (proto->pr_mem_acct) { + mem_acct_sub(proto->pr_mem_acct, size); + } else { + socket_memacct_sub(size); + } +} + +static inline bool +proto_memacct_hardlimit(const struct protosw *proto) +{ + if (proto->pr_mem_acct) { + return mem_acct_limited(proto->pr_mem_acct) == MEMACCT_HARDLIMIT; + } else { + return socket_memacct_hardlimit(); + } +} + +static inline bool +proto_memacct_limited(const struct protosw *proto) +{ + if (proto->pr_mem_acct) { + return mem_acct_limited(proto->pr_mem_acct) != 0; + } else { + return socket_memacct_limited(); + } +} + +extern uint64_t _net_uptime; +extern uint64_t _net_uptime_ms; +extern uint64_t _net_uptime_us; + +extern void net_update_uptime(void); +extern void net_update_uptime_with_time(const struct timeval *); + +/* + * ToDo - we could even replace all callers of net_uptime* by a direct access + * to _net_uptime* + */ +static inline uint64_t +net_uptime(void) +{ + return _net_uptime; +} +static inline uint64_t +net_uptime_ms(void) +{ + return _net_uptime_ms; +} +static inline uint64_t +net_uptime_us(void) +{ + return _net_uptime_us; +} + +extern void net_uptime2timeval(struct timeval *); + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /*_KERN_UIPC_DOMAIN_H */ diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index e10c1d2e2..b002fff30 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -105,224 +105,13 @@ #include #include -#if CONFIG_MBUF_MCACHE -#include -#include -#endif /* CONFIG_MBUF_MCACHE */ - -#include #include +#include #if INET -extern int dump_tcp_reass_qlen(char *, int); extern int tcp_reass_qlen_space(struct socket *); #endif /* INET */ -#if MPTCP -extern int dump_mptcp_reass_qlen(char *, int); -#endif /* MPTCP */ - -#if NETWORKING -extern int dlil_dump_top_if_qlen(char *__counted_by(str_len), int str_len); -#endif /* NETWORKING */ - -#if CONFIG_MBUF_MCACHE -/* - * MBUF IMPLEMENTATION NOTES. - * - * There is a total of 5 per-CPU caches: - * - * MC_MBUF: - * This is a cache of rudimentary objects of _MSIZE in size; each - * object represents an mbuf structure. This cache preserves only - * the m_type field of the mbuf during its transactions. - * - * MC_CL: - * This is a cache of rudimentary objects of MCLBYTES in size; each - * object represents a mcluster structure. This cache does not - * preserve the contents of the objects during its transactions. - * - * MC_BIGCL: - * This is a cache of rudimentary objects of MBIGCLBYTES in size; each - * object represents a mbigcluster structure. This cache does not - * preserve the contents of the objects during its transaction. - * - * MC_MBUF_CL: - * This is a cache of mbufs each having a cluster attached to it. - * It is backed by MC_MBUF and MC_CL rudimentary caches. Several - * fields of the mbuf related to the external cluster are preserved - * during transactions. - * - * MC_MBUF_BIGCL: - * This is a cache of mbufs each having a big cluster attached to it. - * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several - * fields of the mbuf related to the external cluster are preserved - * during transactions. 
- * - * OBJECT ALLOCATION: - * - * Allocation requests are handled first at the per-CPU (mcache) layer - * before falling back to the slab layer. Performance is optimal when - * the request is satisfied at the CPU layer because global data/lock - * never gets accessed. When the slab layer is entered for allocation, - * the slab freelist will be checked first for available objects before - * the VM backing store is invoked. Slab layer operations are serialized - * for all of the caches as the mbuf global lock is held most of the time. - * Allocation paths are different depending on the class of objects: - * - * a. Rudimentary object: - * - * { m_get_common(), m_clattach(), m_mclget(), - * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(), - * composite object allocation } - * | ^ - * | | - * | +-----------------------+ - * v | - * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit() - * | ^ - * v | - * [CPU cache] -------> (found?) -------+ - * | | - * v | - * mbuf_slab_alloc() | - * | | - * v | - * +---------> [freelist] -------> (found?) -------+ - * | | - * | v - * | m_clalloc() - * | | - * | v - * +---<<---- kmem_mb_alloc() - * - * b. Composite object: - * - * { m_getpackets_internal(), m_allocpacket_internal() } - * | ^ - * | | - * | +------ (done) ---------+ - * v | - * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit() - * | ^ - * v | - * [CPU cache] -------> (found?) -------+ - * | | - * v | - * mbuf_cslab_alloc() | - * | | - * v | - * [freelist] -------> (found?) -------+ - * | | - * v | - * (rudimentary object) | - * mcache_alloc/mcache_alloc_ext() ------>>-----+ - * - * Auditing notes: If auditing is enabled, buffers will be subjected to - * integrity checks by the audit routine. This is done by verifying their - * contents against DEADBEEF (free) pattern before returning them to caller. - * As part of this step, the routine will also record the transaction and - * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will - * also restore any constructed data structure fields if necessary. - * - * OBJECT DEALLOCATION: - * - * Freeing an object simply involves placing it into the CPU cache; this - * pollutes the cache to benefit subsequent allocations. The slab layer - * will only be entered if the object is to be purged out of the cache. - * During normal operations, this happens only when the CPU layer resizes - * its bucket while it's adjusting to the allocation load. Deallocation - * paths are different depending on the class of objects: - * - * a. Rudimentary object: - * - * { m_free(), m_freem_list(), composite object deallocation } - * | ^ - * | | - * | +------ (done) ---------+ - * v | - * mcache_free/mcache_free_ext() | - * | | - * v | - * mbuf_slab_audit() | - * | | - * v | - * [CPU cache] ---> (not purging?) -----+ - * | | - * v | - * mbuf_slab_free() | - * | | - * v | - * [freelist] ----------->>------------+ - * (objects get purged to VM only on demand) - * - * b. Composite object: - * - * { m_free(), m_freem_list() } - * | ^ - * | | - * | +------ (done) ---------+ - * v | - * mcache_free/mcache_free_ext() | - * | | - * v | - * mbuf_cslab_audit() | - * | | - * v | - * [CPU cache] ---> (not purging?) -----+ - * | | - * v | - * mbuf_cslab_free() | - * | | - * v | - * [freelist] ---> (not purging?) 
-----+ - * | | - * v | - * (rudimentary object) | - * mcache_free/mcache_free_ext() ------->>------+ - * - * Auditing notes: If auditing is enabled, the audit routine will save - * any constructed data structure fields (if necessary) before filling the - * contents of the buffers with DEADBEEF (free) pattern and recording the - * transaction. Buffers that are freed (whether at CPU or slab layer) are - * expected to contain the free pattern. - * - * DEBUGGING: - * - * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this - * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally, - * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag, - * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak - * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g. - * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory. - * - * Each object is associated with exactly one mcache_audit_t structure that - * contains the information related to its last buffer transaction. Given - * an address of an object, the audit structure can be retrieved by finding - * the position of the object relevant to the base address of the cluster: - * - * +------------+ +=============+ - * | mbuf addr | | mclaudit[i] | - * +------------+ +=============+ - * | | cl_audit[0] | - * i = MTOBG(addr) +-------------+ - * | +-----> | cl_audit[1] | -----> mcache_audit_t - * b = BGTOM(i) | +-------------+ - * | | | ... | - * x = MCLIDX(b, addr) | +-------------+ - * | | | cl_audit[7] | - * +-----------------+ +-------------+ - * (e.g. x == 1) - * - * The mclaudit[] array is allocated at initialization time, but its contents - * get populated when the corresponding cluster is created. Because a page - * can be turned into NMBPG number of mbufs, we preserve enough space for the - * mbufs so that there is a 1-to-1 mapping between them. A page that never - * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the - * remaining entries unused. For 16KB cluster, only one entry from the first - * page is allocated and used for the entire object. - */ -#else /* * MBUF IMPLEMENTATION NOTES (using zalloc). * @@ -487,54 +276,6 @@ extern int dlil_dump_top_if_qlen(char *__counted_by(str_len), int str_len); * Debugging mbufs can be done by booting a KASAN enabled kernel. */ -#endif /* CONFIG_MBUF_MCACHE */ - -/* TODO: should be in header file */ -/* kernel translater */ -extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); -extern vm_map_t mb_map; /* special map */ - -#if CONFIG_MBUF_MCACHE -static uint32_t mb_kmem_contig_failed; -static uint32_t mb_kmem_failed; -static uint32_t mb_kmem_one_failed; -/* Timestamp of allocation failures. 
*/ -static uint64_t mb_kmem_contig_failed_ts; -static uint64_t mb_kmem_failed_ts; -static uint64_t mb_kmem_one_failed_ts; -static uint64_t mb_kmem_contig_failed_size; -static uint64_t mb_kmem_failed_size; -static uint32_t mb_kmem_stats[6]; -#endif /* CONFIG_MBUF_MCACHE */ - -/* Global lock */ -static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf"); -static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp); -static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data; - -#if CONFIG_MBUF_MCACHE -/* Back-end (common) layer */ -static uint64_t mb_expand_cnt; -static uint64_t mb_expand_cl_cnt; -static uint64_t mb_expand_cl_total; -static uint64_t mb_expand_bigcl_cnt; -static uint64_t mb_expand_bigcl_total; -static uint64_t mb_expand_16kcl_cnt; -static uint64_t mb_expand_16kcl_total; -static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */ -static uint32_t mbuf_worker_run_cnt; -static uint64_t mbuf_worker_last_runtime; -static uint64_t mbuf_drain_last_runtime; -static int mbuf_worker_ready; /* worker thread is runnable */ -static unsigned int ncpu; /* number of CPUs */ -static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */ -static ppnum_t mcl_pages; /* Size of array (# physical pages) */ -static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */ -static mcache_t *ref_cache; /* Cache of cluster reference & flags */ -static mcache_t *mcl_audit_con_cache; /* Audit contents cache */ -unsigned int mbuf_debug; /* patchable mbuf mcache flags */ -#endif /* CONFIG_MBUF_DEBUG */ -static unsigned int mb_normalized; /* number of packets "normalized" */ /* * Convention typedefs for local __single pointers. @@ -542,339 +283,39 @@ static unsigned int mb_normalized; /* number of packets "normalized" */ typedef typeof(*((zone_t)0)) *__single zone_ref_t; typedef void * __single any_ref_t; -#define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */ -#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */ - -typedef enum { - MC_MBUF = 0, /* Regular mbuf */ - MC_CL, /* Cluster */ - MC_BIGCL, /* Large (4KB) cluster */ - MC_16KCL, /* Jumbo (16KB) cluster */ - MC_MBUF_CL, /* mbuf + cluster */ - MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */ - MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */ -} mbuf_class_t; - -#define MBUF_CLASS_MIN MC_MBUF -#define MBUF_CLASS_MAX MC_MBUF_16KCL -#define MBUF_CLASS_LAST MC_16KCL -#define MBUF_CLASS_VALID(c) \ - ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX) -#define MBUF_CLASS_COMPOSITE(c) \ - ((int)(c) > MBUF_CLASS_LAST) - - -/* - * mbuf specific mcache allocation request flags. - */ -#define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */ - -/* - * Per-cluster slab structure. - * - * A slab is a cluster control structure that contains one or more object - * chunks; the available chunks are chained in the slab's freelist (sl_head). - * Each time a chunk is taken out of the slab, the slab's reference count - * gets incremented. When all chunks have been taken out, the empty slab - * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is - * returned to a slab causes the slab's reference count to be decremented; - * it also causes the slab to be reinserted back to class's slab list, if - * it's not already done. - * - * Compartmentalizing of the object chunks into slabs allows us to easily - * merge one or more slabs together when the adjacent slabs are idle, as - * well as to convert or move a slab from one class to another; e.g. 
the - * mbuf cluster slab can be converted to a regular cluster slab when all - * mbufs in the slab have been freed. - * - * A slab may also span across multiple clusters for chunks larger than - * a cluster's size. In this case, only the slab of the first cluster is - * used. The rest of the slabs are marked with SLF_PARTIAL to indicate - * that they are part of the larger slab. - * - * Each slab controls a page of memory. - */ -typedef struct mcl_slab { - struct mcl_slab *sl_next; /* neighboring slab */ - u_int8_t sl_class; /* controlling mbuf class */ - int8_t sl_refcnt; /* outstanding allocations */ - int8_t sl_chunks; /* chunks (bufs) in this slab */ - u_int16_t sl_flags; /* slab flags (see below) */ - u_int16_t sl_len; /* slab length */ - void *sl_base; /* base of allocated memory */ - void *sl_head; /* first free buffer */ - TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */ -} mcl_slab_t; - -#define SLF_MAPPED 0x0001 /* backed by a mapped page */ -#define SLF_PARTIAL 0x0002 /* part of another slab */ -#define SLF_DETACHED 0x0004 /* not in slab freelist */ - -/* - * The array of slabs are broken into groups of arrays per 1MB of kernel - * memory to reduce the footprint. Each group is allocated on demand - * whenever a new piece of memory mapped in from the VM crosses the 1MB - * boundary. - */ -#define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT) - -typedef struct mcl_slabg { - mcl_slab_t *slg_slab; /* group of slabs */ -} mcl_slabg_t; - -/* - * Number of slabs needed to control a 16KB cluster object. - */ -#define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT) - -#if CONFIG_MBUF_MCACHE -/* - * Per-cluster audit structure. - */ -typedef struct { - mcache_audit_t **cl_audit; /* array of audits */ -} mcl_audit_t; - -typedef struct { - struct thread *msa_thread; /* thread doing transaction */ - struct thread *msa_pthread; /* previous transaction thread */ - uint32_t msa_tstamp; /* transaction timestamp (ms) */ - uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */ - uint16_t msa_depth; /* pc stack depth */ - uint16_t msa_pdepth; /* previous transaction pc stack */ - void *msa_stack[MCACHE_STACK_DEPTH]; - void *msa_pstack[MCACHE_STACK_DEPTH]; -} mcl_scratch_audit_t; - -typedef struct { - /* - * Size of data from the beginning of an mbuf that covers m_hdr, - * pkthdr and m_ext structures. If auditing is enabled, we allocate - * a shadow mbuf structure of this size inside each audit structure, - * and the contents of the real mbuf gets copied into it when the mbuf - * is freed. This allows us to pattern-fill the mbuf for integrity - * check, and to preserve any constructed mbuf fields (e.g. mbuf + - * cluster cache case). Note that we don't save the contents of - * clusters when they are freed; we simply pattern-fill them. 
- */ - u_int8_t sc_mbuf[(_MSIZE - _MHLEN) + sizeof(_m_ext_t)]; - mcl_scratch_audit_t sc_scratch __attribute__((aligned(8))); -} mcl_saved_contents_t; - -#define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t)) - -#define MCA_SAVED_MBUF_PTR(_mca) \ - ((struct mbuf *)(void *)((mcl_saved_contents_t *) \ - (_mca)->mca_contents)->sc_mbuf) -#define MCA_SAVED_MBUF_SIZE \ - (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf)) -#define MCA_SAVED_SCRATCH_PTR(_mca) \ - (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch) - -/* - * mbuf specific mcache audit flags - */ -#define MB_INUSE 0x01 /* object has not been returned to slab */ -#define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */ -#define MB_SCVALID 0x04 /* object has valid saved contents */ - -/* - * Each of the following two arrays hold up to nmbclusters elements. - */ -static mcl_audit_t *mclaudit; /* array of cluster audit information */ -static unsigned int maxclaudit; /* max # of entries in audit table */ -static mcl_slabg_t **slabstbl; /* cluster slabs table */ -static unsigned int maxslabgrp; /* max # of entries in slabs table */ -static unsigned int slabgrp; /* # of entries in slabs table */ -#endif /* CONFIG_MBUF_MCACHE */ +/* Global lock */ +static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf"); +static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp); +#if !CONFIG_MBUF_MCACHE +static +#endif +lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data; /* Globals */ +#if !CONFIG_MBUF_MCACHE +static +#endif int nclusters; /* # of clusters for non-jumbo (legacy) sizes */ int njcl; /* # of clusters for jumbo sizes */ int njclbytes; /* size of a jumbo cluster */ -unsigned char *mbutl; /* first mapped cluster address */ -unsigned char *embutl; /* ending virtual address of mclusters */ int max_linkhdr; /* largest link-level header */ int max_protohdr; /* largest protocol header */ int max_hdr; /* largest link+protocol header */ int max_datalen; /* MHLEN - max_hdr */ -#if CONFIG_MBUF_MCACHE -static boolean_t mclverify; /* debug: pattern-checking */ -static boolean_t mcltrace; /* debug: stack tracing */ -static boolean_t mclfindleak; /* debug: leak detection */ -static boolean_t mclexpleak; /* debug: expose leak info to user space */ - -static struct timeval mb_start; /* beginning of time */ - -/* mbuf leak detection variables */ -static struct mleak_table mleak_table; -static mleak_stat_t *mleak_stat; - -#define MLEAK_STAT_SIZE(n) \ - __builtin_offsetof(mleak_stat_t, ml_trace[n]) - -struct mallocation { - mcache_obj_t *element; /* the alloc'ed element, NULL if unused */ - u_int32_t trace_index; /* mtrace index for corresponding backtrace */ - u_int32_t count; /* How many objects were requested */ - u_int64_t hitcount; /* for determining hash effectiveness */ -}; - -struct mtrace { - u_int64_t collisions; - u_int64_t hitcount; - u_int64_t allocs; - u_int64_t depth; - uintptr_t addr[MLEAK_STACK_DEPTH]; -}; - -/* Size must be a power of two for the zhash to be able to just mask off bits */ -#define MLEAK_ALLOCATION_MAP_NUM 512 -#define MLEAK_TRACE_MAP_NUM 256 - -/* - * Sample factor for how often to record a trace. This is overwritable - * by the boot-arg mleak_sample_factor. - */ -#define MLEAK_SAMPLE_FACTOR 500 - -/* - * Number of top leakers recorded. 
- */ -#define MLEAK_NUM_TRACES 5 - -#define MB_LEAK_SPACING_64 " " -#define MB_LEAK_SPACING_32 " " - - -#define MB_LEAK_HDR_32 "\n\ - trace [1] trace [2] trace [3] trace [4] trace [5] \n\ - ---------- ---------- ---------- ---------- ---------- \n\ -" - -#define MB_LEAK_HDR_64 "\n\ - trace [1] trace [2] trace [3] \ - trace [4] trace [5] \n\ - ------------------ ------------------ ------------------ \ - ------------------ ------------------ \n\ -" - -static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM; -static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM; - -/* Hashmaps of allocations and their corresponding traces */ -static struct mallocation *mleak_allocations; -static struct mtrace *mleak_traces; -static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES]; - -/* Lock to protect mleak tables from concurrent modification */ -static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock"); -static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp); -static lck_mtx_t *const mleak_lock = &mleak_lock_data; - -/* *Failed* large allocations. */ -struct mtracelarge { - uint64_t size; - uint64_t depth; - uintptr_t addr[MLEAK_STACK_DEPTH]; -}; - -#define MTRACELARGE_NUM_TRACES 5 -static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES]; - -static void mtracelarge_register(size_t size); -#endif /* CONFIG_MBUF_MCACHE */ - /* Lock to protect the completion callback table */ static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl"); LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp); -extern u_int32_t high_sb_max; - -/* The minimum number of objects that are allocated, to start. */ -#define MINCL 32 -#define MINBIGCL (MINCL >> 1) -#define MIN16KCL (MINCL >> 2) - -/* Low watermarks (only map in pages once free counts go below) */ -#define MBIGCL_LOWAT MINBIGCL -#define M16KCL_LOWAT MIN16KCL - -typedef struct { - mbuf_class_t mtbl_class; /* class type */ -#if CONFIG_MBUF_MCACHE - mcache_t *mtbl_cache; /* mcache for this buffer class */ - TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */ - mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */ -#endif /* CONFIG_MBUF_MCACHE */ - mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */ - u_int32_t mtbl_maxsize; /* maximum buffer size */ - int mtbl_minlimit; /* minimum allowed */ - int mtbl_maxlimit; /* maximum allowed */ - u_int32_t mtbl_wantpurge; /* purge during next reclaim */ - uint32_t mtbl_avgtotal; /* average total on iOS */ - u_int32_t mtbl_expand; /* worker should expand the class */ -} mbuf_table_t; - -#define m_class(c) mbuf_table[c].mtbl_class -#if CONFIG_MBUF_MCACHE -#define m_cache(c) mbuf_table[c].mtbl_cache -#define m_slablist(c) mbuf_table[c].mtbl_slablist -#define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist -#else #define m_stats(c) mbuf_table[c].mtbl_stats -#endif /* CONFIG_MBUF_MCACHE */ -#define m_maxsize(c) mbuf_table[c].mtbl_maxsize -#define m_minlimit(c) mbuf_table[c].mtbl_minlimit -#define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit -#define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge -#define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname -#define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size -#define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total -#define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active -#define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree -#define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt -#define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt -#define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt -#define 
m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified -#define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt -#define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal -#define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt -#define m_region_expand(c) mbuf_table[c].mtbl_expand +#if !CONFIG_MBUF_MCACHE /* * Note: number of entries in mbuf_table must not exceed * MB_STAT_MAX_MB_CLASSES */ static mbuf_table_t mbuf_table[] = { -#if CONFIG_MBUF_MCACHE - /* - * The caches for mbufs, regular clusters and big clusters. - * The average total values were based on data gathered by actual - * usage patterns on iOS. - */ - { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)), - NULL, NULL, 0, 0, 0, 0, 3000, 0 }, - { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)), - NULL, NULL, 0, 0, 0, 0, 2000, 0 }, - { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)), - NULL, NULL, 0, 0, 0, 0, 1000, 0 }, - { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)), - NULL, NULL, 0, 0, 0, 0, 200, 0 }, - /* - * The following are special caches; they serve as intermediate - * caches backed by the above rudimentary caches. Each object - * in the cache is an mbuf with a cluster attached to it. Unlike - * the above caches, these intermediate caches do not directly - * deal with the slab structures; instead, the constructed - * cached elements are simply stored in the freelists. - */ - { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 }, - { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 }, - { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 }, -#else { .mtbl_class = MC_MBUF }, { .mtbl_class = MC_CL }, { .mtbl_class = MC_BIGCL }, @@ -882,138 +323,18 @@ static mbuf_table_t mbuf_table[] = { { .mtbl_class = MC_MBUF_CL }, { .mtbl_class = MC_MBUF_BIGCL }, { .mtbl_class = MC_MBUF_16KCL }, -#endif /* CONFIG_MBUF_MCACHE */ }; +#endif /* !CONFIG_MBUF_MCACHE */ -#define NELEM(a) (sizeof (a) / sizeof ((a)[0])) +#if !CONFIG_MBUF_MCACHE +static +#endif /* !CONFIG_MBUF_MCACHE */ +unsigned int mb_memory_pressure_percentage = 80; -#if SKYWALK && CONFIG_MBUF_MCACHE -#define MC_THRESHOLD_SCALE_DOWN_FACTOR 2 -static unsigned int mc_threshold_scale_down_factor = - MC_THRESHOLD_SCALE_DOWN_FACTOR; -#endif /* SKYWALK */ - -#if CONFIG_MBUF_MCACHE -static uint32_t -m_avgtotal(mbuf_class_t c) -{ -#if SKYWALK - return if_is_fsw_transport_netagent_enabled() ? - (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) : - mbuf_table[c].mtbl_avgtotal; -#else /* !SKYWALK */ - return mbuf_table[c].mtbl_avgtotal; -#endif /* SKYWALK */ -} -#endif /* CONFIG_MBUF_MCACHE */ - -#if CONFIG_MBUF_MCACHE -static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */ -static int mb_waiters; /* number of waiters */ -#endif /* CONFIG_MBUF_MCACHE */ - -#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */ -#if CONFIG_MBUF_MCACHE -static struct timeval mb_wdtstart; /* watchdog start timestamp */ -static char *mbuf_dump_buf; - -#define MBUF_DUMP_BUF_SIZE 4096 - -/* - * mbuf watchdog is enabled by default. It is also toggeable via the - * kern.ipc.mb_watchdog sysctl. - * Garbage collection is enabled by default on embedded platforms. - * mb_drain_maxint controls the amount of time to wait (in seconds) before - * consecutive calls to mbuf_drain(). 
- */ -static unsigned int mb_watchdog = 1; -#if !XNU_TARGET_OS_OSX -static unsigned int mb_drain_maxint = 60; -#else /* XNU_TARGET_OS_OSX */ -static unsigned int mb_drain_maxint = 0; -#endif /* XNU_TARGET_OS_OSX */ -#endif /* CONFIG_MBUF_MCACHE */ -static unsigned int mb_memory_pressure_percentage = 80; - -static void m_set_rfa(struct mbuf *, struct ext_ref *); - -#if CONFIG_MBUF_MCACHE -/* The following are used to serialize m_clalloc() */ -static boolean_t mb_clalloc_busy; -static void *mb_clalloc_waitchan = &mb_clalloc_busy; -static int mb_clalloc_waiters; -#endif /* CONFIG_MBUF_MCACHE */ - -static void mbuf_mtypes_sync(boolean_t); static int mbstat_sysctl SYSCTL_HANDLER_ARGS; -static void mbuf_stat_sync(void); static int mb_stat_sysctl SYSCTL_HANDLER_ARGS; -#if CONFIG_MBUF_MCACHE -static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS; -static int mleak_table_sysctl SYSCTL_HANDLER_ARGS; -static char *mbuf_dump(void); -#endif /* CONFIG_MBUF_MCACHE */ -static void mbuf_table_init(void); -static inline void m_incref(struct mbuf *); -static inline u_int16_t m_decref(struct mbuf *); +#if !CONFIG_MBUF_MCACHE static void mbuf_watchdog_defunct(thread_call_param_t, thread_call_param_t); -#if CONFIG_MBUF_MCACHE -static int m_clalloc(const u_int32_t, const int, const u_int32_t); -static void mbuf_worker_thread_init(void); -static mcache_obj_t *slab_alloc(mbuf_class_t, int); -static void slab_free(mbuf_class_t, mcache_obj_t *); -static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***, - unsigned int, int); -static void mbuf_slab_free(void *, mcache_obj_t *, int); -static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t); -static void mbuf_slab_notify(void *, u_int32_t); -static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***, - unsigned int); -static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int); -static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***, - unsigned int, int); -static void mbuf_cslab_free(void *, mcache_obj_t *, int); -static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t); -static int freelist_populate(mbuf_class_t, unsigned int, int); -static void freelist_init(mbuf_class_t); -static boolean_t mbuf_cached_above(mbuf_class_t, int); -static boolean_t mbuf_steal(mbuf_class_t, unsigned int); -static void m_reclaim(mbuf_class_t, unsigned int, boolean_t); -static int m_howmany(int, size_t); -static void mbuf_worker_thread(void); -static void mbuf_watchdog(void); -static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int); - -static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **, - size_t, unsigned int); -static void mcl_audit_free(void *, unsigned int); -static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *); -static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t); -static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t, - boolean_t); -static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t); -static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *); -static void mcl_audit_scratch(mcache_audit_t *); -static void mcl_audit_mcheck_panic(struct mbuf *); -static void mcl_audit_verify_nextptr(void *, mcache_audit_t *); - -static void mleak_activate(void); -static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t); -static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int); -static void mleak_free(mcache_obj_t *); -static void mleak_sort_traces(void); -static void mleak_update_stats(void); - -static mcl_slab_t *slab_get(void *); 
-static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t, - void *, void *, unsigned int, int, int); -static void slab_insert(mcl_slab_t *, mbuf_class_t); -static void slab_remove(mcl_slab_t *, mbuf_class_t); -static boolean_t slab_inrange(mcl_slab_t *, void *); -static void slab_nextptr_panic(mcl_slab_t *, void *); -static void slab_detach(mcl_slab_t *); -static boolean_t slab_is_detached(mcl_slab_t *); -#else /* !CONFIG_MBUF_MCACHE */ static void mbuf_watchdog_drain_composite(thread_call_param_t, thread_call_param_t); static struct mbuf *mz_alloc(zalloc_flags_t); static void mz_free(struct mbuf *); @@ -1031,15 +352,15 @@ static void *mz_composite_mark_invalid(zone_id_t, void *); static void mz_composite_destroy(zone_id_t, void *); ZONE_DEFINE_ID(ZONE_ID_MBUF_REF, "mbuf.ref", struct ext_ref, - ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE); + ZC_CACHING | ZC_KASAN_NOQUARANTINE); ZONE_DEFINE_ID(ZONE_ID_MBUF, "mbuf", struct mbuf, - ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE); + ZC_CACHING | ZC_KASAN_NOQUARANTINE); ZONE_DEFINE_ID(ZONE_ID_CLUSTER_2K, "mbuf.cluster.2k", union mcluster, - ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA); + ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA); ZONE_DEFINE_ID(ZONE_ID_CLUSTER_4K, "mbuf.cluster.4k", union mbigcluster, - ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA); + ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA); ZONE_DEFINE_ID(ZONE_ID_CLUSTER_16K, "mbuf.cluster.16k", union m16kcluster, - ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA); + ZC_CACHING | ZC_KASAN_NOQUARANTINE | ZC_DATA); static_assert(sizeof(union mcluster) == MCLBYTES); static_assert(sizeof(union mbigcluster) == MBIGCLBYTES); static_assert(sizeof(union m16kcluster) == M16KCLBYTES); @@ -1084,22 +405,10 @@ m_class_from_zid(zone_id_t zid) static thread_call_t mbuf_defunct_tcall; static thread_call_t mbuf_drain_tcall; -#endif /* CONFIG_MBUF_MCACHE */ +#endif /* !CONFIG_MBUF_MCACHE */ static int m_copyback0(struct mbuf **, int, int len, const void * __sized_by_or_null(len), int, int); static struct mbuf *m_split0(struct mbuf *, int, int, int); -#if CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT) -#define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__) -static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...); -static char *mbwdog_logging; -const unsigned mbwdog_logging_size = 4096; -static size_t mbwdog_logging_used; -#else -#define mbwdog_logger(fmt, ...) do { } while (0) -#endif /* CONFIG_MBUF_MCACHE &&DEBUG || DEVELOPMENT */ -#if CONFIG_MBUF_MCACHE -static void mbuf_drain_locked(boolean_t); -#endif /* CONFIG_MBUF_MCACHE */ /* flags for m_copyback0 */ #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */ @@ -1107,163 +416,6 @@ static void mbuf_drain_locked(boolean_t); #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */ #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */ -/* - * This flag is set for all mbufs that come out of and into the composite - * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that - * are marked with such a flag have clusters attached to them, and will be - * treated differently when they are freed; instead of being placed back - * into the mbuf and cluster freelists, the composite mbuf + cluster objects - * are placed back into the appropriate composite cache's freelist, and the - * actual freeing is deferred until the composite objects are purged. 
At - * such a time, this flag will be cleared from the mbufs and the objects - * will be freed into their own separate freelists. - */ -#define EXTF_COMPOSITE 0x1 - -/* - * This flag indicates that the external cluster is read-only, i.e. it is - * or was referred to by more than one mbufs. Once set, this flag is never - * cleared. - */ -#define EXTF_READONLY 0x2 -/* - * This flag indicates that the external cluster is paired with the mbuf. - * Pairing implies an external free routine defined which will be invoked - * when the reference count drops to the minimum at m_free time. This - * flag is never cleared. - */ -#define EXTF_PAIRED 0x4 - -#define EXTF_MASK \ - (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED) - -#define MEXT_MINREF(m) ((m_get_rfa(m))->minref) -#define MEXT_REF(m) ((m_get_rfa(m))->refcnt) -#define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt) -#define MEXT_FLAGS(m) ((m_get_rfa(m))->flags) -#define MEXT_PRIV(m) ((m_get_rfa(m))->priv) -#define MEXT_PMBUF(m) ((m_get_rfa(m))->paired) -#define MBUF_IS_COMPOSITE(m) \ - (MEXT_REF(m) == MEXT_MINREF(m) && \ - (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE) -/* - * This macro can be used to test if the mbuf is paired to an external - * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject - * is important, as EXTF_PAIRED alone is insufficient since it is immutable, - * and thus survives calls to m_free_paired. - */ -#define MBUF_IS_PAIRED(m) \ - (((m)->m_flags & M_EXT) && \ - (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \ - MEXT_PMBUF(m) == (m)) - -/* - * Macros used to verify the integrity of the mbuf. - */ -#if CONFIG_MBUF_MCACHE -#define _MCHECK(m) { \ - if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \ - if (mclaudit == NULL) \ - panic("MCHECK: m_type=%d m=%p", \ - (u_int16_t)(m)->m_type, m); \ - else \ - mcl_audit_mcheck_panic(m); \ - } \ -} -#else -#define _MCHECK(m) \ - if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \ - panic("MCHECK: m_type=%d m=%p", \ - (u_int16_t)(m)->m_type, m); \ - } -#endif /* CONFIG_MBUF_MCACHE */ - -#if CONFIG_MBUF_MCACHE -#define MBUF_IN_MAP(addr) \ - ((unsigned char *)(addr) >= mbutl && \ - (unsigned char *)(addr) < embutl) - -#define MRANGE(addr) { \ - if (!MBUF_IN_MAP(addr)) \ - panic("MRANGE: address out of range 0x%p", addr); \ -} - -/* - * Macros to obtain page index given a base cluster address - */ -#define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT) -#define PGTOM(x) (mbutl + (x << PAGE_SHIFT)) - -/* - * Macro to find the mbuf index relative to a base. - */ -#define MBPAGEIDX(c, m) \ - (((unsigned char *)(m) - (unsigned char *)(c)) >> _MSIZESHIFT) - -/* - * Same thing for 2KB cluster index. - */ -#define CLPAGEIDX(c, m) \ - (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT) - -/* - * Macro to find 4KB cluster index relative to a base - */ -#define BCLPAGEIDX(c, m) \ - (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT) -#endif /* CONFIG_MBUF_MCACHE */ - -/* - * Macros used during mbuf and cluster initialization. 
- */ -#define MBUF_INIT_PKTHDR(m) { \ - (m)->m_pkthdr.rcvif = NULL; \ - (m)->m_pkthdr.pkt_hdr = NULL; \ - (m)->m_pkthdr.len = 0; \ - (m)->m_pkthdr.csum_flags = 0; \ - (m)->m_pkthdr.csum_data = 0; \ - (m)->m_pkthdr.vlan_tag = 0; \ - (m)->m_pkthdr.comp_gencnt = 0; \ - (m)->m_pkthdr.pkt_crumbs = 0; \ - m_classifier_init(m, 0); \ - m_tag_init(m, 1); \ - m_scratch_init(m); \ -} - -#define MBUF_INIT(m, pkthdr, type) { \ - _MCHECK(m); \ - (m)->m_next = (m)->m_nextpkt = NULL; \ - (m)->m_len = 0; \ - (m)->m_type = type; \ - if ((pkthdr) == 0) { \ - (m)->m_data = (uintptr_t)(m)->m_dat; \ - (m)->m_flags = 0; \ - } else { \ - (m)->m_data = (uintptr_t)(m)->m_pktdat; \ - (m)->m_flags = M_PKTHDR; \ - MBUF_INIT_PKTHDR(m); \ - } \ -} - -#define MEXT_INIT mext_init - -#define MBUF_CL_INIT(m, buf, rfa, ref, flag) \ - MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \ - ref, 0, flag, 0, NULL) - -#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \ - MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \ - ref, 0, flag, 0, NULL) - -#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \ - MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \ - ref, 0, flag, 0, NULL) - -/* - * Macro to convert BSD malloc sleep flag to mcache's - */ -#define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP) - /* * The structure that holds all mbuf class statistics exportable via sysctl. * Similar to mbstat structure, the mb_stat structure is protected by the @@ -1275,6 +427,7 @@ struct omb_stat *omb_stat; /* For backwards compatibility */ #define MB_STAT_SIZE(n) \ __builtin_offsetof(mb_stat_t, mbs_class[n]) + #define OMB_STAT_SIZE(n) \ __builtin_offsetof(struct omb_stat, mbs_class[n]) @@ -1294,37 +447,69 @@ struct mbstat mbstat; #define MBSTAT_MTYPES_MAX \ (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0])) -/* - * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated - * atomically and stored in a per-CPU structure which is lock-free; this is - * done in order to avoid writing to the global mbstat data structure which - * would cause false sharing. During sysctl request for kern.ipc.mbstat, - * the statistics across all CPUs will be converged into the mbstat.m_mtypes - * array and returned to the application. Any updates for types greater or - * equal than MT_MAX would be done atomically to the mbstat; this slows down - * performance but is okay since the kernel uses only up to MT_MAX-1 while - * anything beyond that (up to type 255) is considered a corner case. 
- */ -typedef struct { - unsigned int cpu_mtypes[MT_MAX]; -} mbuf_mtypes_t; +#if !CONFIG_MBUF_MCACHE +static +#endif +mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes); -static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes); - -#define mtype_stat_add(type, n) { \ - if ((unsigned)(type) < MT_MAX) { \ - mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \ - os_atomic_add(&mbs->cpu_mtypes[type], n, relaxed); \ - } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \ - os_atomic_add((int16_t *)&mbstat.m_mtypes[type], n, relaxed); \ - } \ +__private_extern__ inline struct ext_ref * +m_get_rfa(struct mbuf *m) +{ + return m->m_ext.ext_refflags; } -#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n)) -#define mtype_stat_inc(t) mtype_stat_add(t, 1) -#define mtype_stat_dec(t) mtype_stat_sub(t, 1) +__private_extern__ inline m_ext_free_func_t +m_get_ext_free(struct mbuf *m) +{ + if (m->m_ext.ext_free == NULL) { + return NULL; + } -static inline void + return ptrauth_nop_cast(m_ext_free_func_t, m->m_ext.ext_free); +} + +#if !CONFIG_MBUF_MCACHE +static +#endif +caddr_t +m_get_ext_arg(struct mbuf *m) +{ + return (caddr_t)m->m_ext.ext_arg; +} + +#if !CONFIG_MBUF_MCACHE +static +#endif +void +m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free, + caddr_t ext_arg) +{ + VERIFY(m->m_flags & M_EXT); + if (rfa != NULL) { + m->m_ext.ext_refflags = rfa; + if (ext_free != NULL) { + m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free); + m->m_ext.ext_arg = ext_arg; + } else { + m->m_ext.ext_free = NULL; + m->m_ext.ext_arg = NULL; + } + } else { + if (ext_free != NULL) { + m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free); + m->m_ext.ext_arg = ext_arg; + } else { + m->m_ext.ext_free = NULL; + m->m_ext.ext_arg = NULL; + } + m->m_ext.ext_refflags = NULL; + } +} + +#if !CONFIG_MBUF_MCACHE +static +#endif +void mext_init(struct mbuf *m, void *__sized_by(size)buf, u_int size, m_ext_free_func_t free, caddr_t free_arg, struct ext_ref *rfa, u_int16_t min, u_int16_t ref, u_int16_t pref, u_int16_t flag, @@ -1344,14 +529,15 @@ mext_init(struct mbuf *m, void *__sized_by(size)buf, u_int size, MEXT_PMBUF(m) = pm; } -static void -mbuf_mtypes_sync(boolean_t locked) +#if !CONFIG_MBUF_MCACHE +static +#endif +void +mbuf_mtypes_sync(void) { mbuf_mtypes_t mtc; - if (locked) { - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - } + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); mtc = *PERCPU_GET_MASTER(mbuf_mtypes); percpu_foreach_secondary(mtype, mbuf_mtypes) { @@ -1360,115 +546,23 @@ mbuf_mtypes_sync(boolean_t locked) } } - if (!locked) { - lck_mtx_lock(mbuf_mlock); - } for (int n = 0; n < MT_MAX; n++) { mbstat.m_mtypes[n] = mtc.cpu_mtypes[n]; } - if (!locked) { - lck_mtx_unlock(mbuf_mlock); - } -} - -static int -mbstat_sysctl SYSCTL_HANDLER_ARGS -{ -#pragma unused(oidp, arg1, arg2) - -#if CONFIG_MBUF_MCACHE - mbuf_mtypes_sync(FALSE); -#else - lck_mtx_lock(mbuf_mlock); - mbuf_stat_sync(); - mbuf_mtypes_sync(TRUE); - lck_mtx_unlock(mbuf_mlock); -#endif - - return SYSCTL_OUT(req, &mbstat, sizeof(mbstat)); } +#if !CONFIG_MBUF_MCACHE static void mbuf_stat_sync(void) { mb_class_stat_t *sp; -#if CONFIG_MBUF_MCACHE - mcache_cpu_t *ccp; - mcache_t *cp; - int k, m, bktsize; -#else int k; uint64_t drops = 0; -#endif /* CONFIG_MBUF_MCACHE */ LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); -#if CONFIG_MBUF_MCACHE - for (k = 0; k < NELEM(mbuf_table); k++) { - cp = m_cache(k); - ccp = &cp->mc_cpu[0]; - bktsize = ccp->cc_bktsize; - sp = mbuf_table[k].mtbl_stats; - - if (cp->mc_flags & MCF_NOCPUCACHE) { - 
sp->mbcl_mc_state = MCS_DISABLED; - } else if (cp->mc_purge_cnt > 0) { - sp->mbcl_mc_state = MCS_PURGING; - } else if (bktsize == 0) { - sp->mbcl_mc_state = MCS_OFFLINE; - } else { - sp->mbcl_mc_state = MCS_ONLINE; - } - - sp->mbcl_mc_cached = 0; - for (m = 0; m < ncpu; m++) { - ccp = &cp->mc_cpu[m]; - if (ccp->cc_objs > 0) { - sp->mbcl_mc_cached += ccp->cc_objs; - } - if (ccp->cc_pobjs > 0) { - sp->mbcl_mc_cached += ccp->cc_pobjs; - } - } - sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize); - sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached - - sp->mbcl_infree; - - sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt; - sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt; - sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt; - - /* Calculate total count specific to each class */ - sp->mbcl_ctotal = sp->mbcl_total; - switch (m_class(k)) { - case MC_MBUF: - /* Deduct mbufs used in composite caches */ - sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + - m_total(MC_MBUF_BIGCL) - m_total(MC_MBUF_16KCL)); - break; - - case MC_CL: - /* Deduct clusters used in composite cache */ - sp->mbcl_ctotal -= m_total(MC_MBUF_CL); - break; - - case MC_BIGCL: - /* Deduct clusters used in composite cache */ - sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL); - break; - - case MC_16KCL: - /* Deduct clusters used in composite cache */ - sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL); - break; - - default: - break; - } - } -#else - for (k = 0; k < NELEM(mbuf_table); k++) { + for (k = 0; k < MC_MAX; k++) { const zone_id_t zid = m_class_to_zid(m_class(k)); const zone_ref_t zone = zone_by_id(zid); struct zone_basic_stats stats = {}; @@ -1518,7 +612,20 @@ mbuf_stat_sync(void) mbstat.m_drops = drops; mbstat.m_bigclusters = m_total(MC_BIGCL); mbstat.m_bigclfree = m_infree(MC_BIGCL) + m_infree(MC_MBUF_BIGCL); -#endif /* CONFIG_MBUF_MCACHE */ +} +#endif /* !CONFIG_MBUF_MCACHE */ + +static int +mbstat_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + lck_mtx_lock(mbuf_mlock); + mbuf_stat_sync(); + mbuf_mtypes_sync(); + lck_mtx_unlock(mbuf_mlock); + + return SYSCTL_OUT(req, &mbstat, sizeof(mbstat)); } static int @@ -1560,10 +667,10 @@ mb_stat_sysctl SYSCTL_HANDLER_ARGS oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt; } statp = omb_stat; - statsz = OMB_STAT_SIZE(NELEM(mbuf_table)); + statsz = OMB_STAT_SIZE(MC_MAX); } else { statp = mb_stat; - statsz = MB_STAT_SIZE(NELEM(mbuf_table)); + statsz = MB_STAT_SIZE(MC_MAX); } lck_mtx_unlock(mbuf_mlock); @@ -1571,6 +678,72 @@ mb_stat_sysctl SYSCTL_HANDLER_ARGS return SYSCTL_OUT(req, statp, statsz); } +#if !CONFIG_MBUF_MCACHE +static void +mbuf_mcheck(struct mbuf *m) +{ + if (__improbable(m->m_type != MT_FREE && !MBUF_IS_PAIRED(m))) { + panic("MCHECK: m_type=%d m=%p", + (u_int16_t)(m)->m_type, m); + } +} +#endif /* !CONFIG_MBUF_MCACHE */ + +static void +m_scratch_init(struct mbuf *m) +{ + struct pkthdr *pkt = &m->m_pkthdr; + + VERIFY(m->m_flags & M_PKTHDR); + + /* See comments in */ + if (pkt->pkt_flags & PKTF_PRIV_GUARDED) { + panic_plain("Invalid attempt to modify guarded module-private " + "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags); + /* NOTREACHED */ + } + + bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv)); +} + + +static void +mbuf_init_pkthdr(struct mbuf *m) +{ + m->m_pkthdr.rcvif = NULL; + m->m_pkthdr.pkt_hdr = NULL; + m->m_pkthdr.len = 0; + m->m_pkthdr.csum_flags = 0; + m->m_pkthdr.csum_data = 0; + m->m_pkthdr.vlan_tag = 0; + m->m_pkthdr.comp_gencnt = 0; + m->m_pkthdr.pkt_crumbs = 0; + m_classifier_init(m, 0); + m_tag_init(m, 1); + m_scratch_init(m); +} + +#if !CONFIG_MBUF_MCACHE 
+static +#endif +void +mbuf_init(struct mbuf *m, int pkthdr, int type) +{ + mbuf_mcheck(m); + m->m_next = m->m_nextpkt = NULL; + m->m_len = 0; + m->m_type = type; + if (pkthdr == 0) { + m->m_data = (uintptr_t)m->m_dat; + m->m_flags = 0; + } else { + m->m_data = (uintptr_t)m->m_pktdat; + m->m_flags = M_PKTHDR; + mbuf_init_pkthdr(m); + } +} + + #if !CONFIG_MBUF_MCACHE /* * The following functions are wrappers around mbuf @@ -1766,7 +939,7 @@ mz_composite_build(zone_id_t zid, zalloc_flags_t flags) if (__improbable(m == NULL)) { goto out_free_rfa; } - MBUF_INIT(m, 0, MT_FREE); + mbuf_init(m, 0, MT_FREE); if (zid == ZONE_ID_MBUF_CLUSTER_2K) { MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE); } else if (zid == ZONE_ID_MBUF_CLUSTER_4K) { @@ -1805,7 +978,7 @@ mz_composite_mark_valid(zone_id_t zid, void *p) m->m_data = (uintptr_t)cl; m->m_ext.ext_buf = cl; m->m_ext.ext_size = m->m_ext.ext_size; - m_set_rfa(m, rfa); + m->m_ext.ext_refflags = rfa; #else #pragma unused(zid) #endif @@ -1834,7 +1007,7 @@ mz_composite_mark_invalid(zone_id_t zid, void *p) m->m_data = (uintptr_t)cl; m->m_ext.ext_buf = cl; m->m_ext.ext_size = m->m_ext.ext_size; - m_set_rfa(m, rfa); + m->m_ext.ext_refflags = rfa; #else #pragma unused(zid) #endif @@ -1873,46 +1046,10 @@ mz_composite_destroy(zone_id_t zid, void *p) } #endif /* !CONFIG_MBUF_MCACHE */ -#if CONFIG_MBUF_MCACHE -static int -mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS -{ -#pragma unused(oidp, arg1, arg2) - int i; - - /* Ensure leak tracing turned on */ - if (!mclfindleak || !mclexpleak) { - return ENXIO; - } - - lck_mtx_lock(mleak_lock); - mleak_update_stats(); - i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES)); - lck_mtx_unlock(mleak_lock); - - return i; -} - -static int -mleak_table_sysctl SYSCTL_HANDLER_ARGS -{ -#pragma unused(oidp, arg1, arg2) - int i = 0; - - /* Ensure leak tracing turned on */ - if (!mclfindleak || !mclexpleak) { - return ENXIO; - } - - lck_mtx_lock(mleak_lock); - i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table)); - lck_mtx_unlock(mleak_lock); - - return i; -} -#endif /* CONFIG_MBUF_MCACHE */ - -static inline void +#if !CONFIG_MBUF_MCACHE +static +#endif +void m_incref(struct mbuf *m) { uint16_t new = os_atomic_inc(&MEXT_REF(m), relaxed); @@ -1928,7 +1065,10 @@ m_incref(struct mbuf *m) } } -static inline uint16_t +#if !CONFIG_MBUF_MCACHE +static +#endif +uint16_t m_decref(struct mbuf *m) { VERIFY(MEXT_REF(m) != 0); @@ -1936,60 +1076,52 @@ m_decref(struct mbuf *m) return os_atomic_dec(&MEXT_REF(m), acq_rel); } -static void +/* By default, mbuf_limit is enabled. Except when serverperfmode is set. */ +static int mbuf_limit = 1; + +#if !CONFIG_MBUF_MCACHE +static +#endif +void mbuf_table_init(void) { unsigned int b, c, s; - int m, config_mbuf_jumbo = 0; + int m; - VERIFY(NELEM(mbuf_table) <= MB_STAT_MAX_MB_CLASSES); - /* - * Kernel version of mb_stat / omb_stat should be sufficient - * for the NELEM(mbuf_table). 
- */ - VERIFY(OMB_STAT_SIZE(NELEM(mbuf_table)) <= sizeof(*omb_stat)); - omb_stat = zalloc_permanent(sizeof(*omb_stat), + omb_stat = zalloc_permanent(OMB_STAT_SIZE(MC_MAX), ZALIGN(struct omb_stat)); - VERIFY(MB_STAT_SIZE(NELEM(mbuf_table)) <= sizeof(*mb_stat)); - mb_stat = zalloc_permanent(sizeof(*mb_stat), + mb_stat = zalloc_permanent(MB_STAT_SIZE(MC_MAX), ZALIGN(mb_stat_t)); - mb_stat->mbs_cnt = NELEM(mbuf_table); - for (m = 0; m < NELEM(mbuf_table); m++) { + mb_stat->mbs_cnt = MC_MAX; + for (m = 0; m < MC_MAX; m++) { mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m]; } -#if CONFIG_MBUF_JUMBO - config_mbuf_jumbo = 1; -#endif /* CONFIG_MBUF_JUMBO */ - - if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) { - /* - * Set aside 1/3 of the mbuf cluster map for jumbo - * clusters; we do this only on platforms where jumbo - * cluster pool is enabled. - */ - njcl = nmbclusters / 3; - njclbytes = M16KCLBYTES; - } + /* + * Set aside 1/3 of the mbuf cluster map for jumbo + * clusters; we do this only on platforms where jumbo + * cluster pool is enabled. + */ + njcl = nmbclusters / 3; + njclbytes = M16KCLBYTES; /* * nclusters holds both the 2KB and 4KB pools, so ensure it's * a multiple of 4KB clusters. */ nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG); - if (njcl > 0) { - /* - * Each jumbo cluster takes 8 2KB clusters, so make - * sure that the pool size is evenly divisible by 8; - * njcl is in 2KB unit, hence treated as such. - */ - njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL); - /* Update nclusters with rounded down value of njcl */ - nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG); - } + /* + * Each jumbo cluster takes 8 2KB clusters, so make + * sure that the pool size is evenly divisible by 8; + * njcl is in 2KB unit, hence treated as such. + */ + njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL); + + /* Update nclusters with rounded down value of njcl */ + nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG); /* * njcl is valid only on platforms with 16KB jumbo clusters or @@ -2015,7 +1147,11 @@ mbuf_table_init(void) * 1/64th (c) is reserved for 2KB clusters. */ m_minlimit(MC_CL) = c; - m_maxlimit(MC_CL) = s + c; /* in 2KB unit */ + if (mbuf_limit) { + m_maxlimit(MC_CL) = s + c; /* in 2KB unit */ + } else { + m_maxlimit(MC_CL) = INT_MAX; + } m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES; snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl"); @@ -2024,7 +1160,11 @@ mbuf_table_init(void) * It cannot be turned into 2KB clusters or mbufs. */ m_minlimit(MC_BIGCL) = b; - m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */ + if (mbuf_limit) { + m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */ + } else { + m_maxlimit(MC_BIGCL) = INT_MAX; + } m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES; snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl"); @@ -2032,7 +1172,11 @@ mbuf_table_init(void) * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB) */ m_minlimit(MC_MBUF) = 0; - m_maxlimit(MC_MBUF) = s * NMBPCL; /* in mbuf unit */ + if (mbuf_limit) { + m_maxlimit(MC_MBUF) = s * NMBPCL; /* in mbuf unit */ + } else { + m_maxlimit(MC_MBUF) = INT_MAX; + } m_maxsize(MC_MBUF) = m_size(MC_MBUF) = _MSIZE; snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf"); @@ -2040,13 +1184,21 @@ mbuf_table_init(void) * Set limits for the composite classes. 
*/ m_minlimit(MC_MBUF_CL) = 0; - m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL); + if (mbuf_limit) { + m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL); + } else { + m_maxlimit(MC_MBUF_CL) = INT_MAX; + } m_maxsize(MC_MBUF_CL) = MCLBYTES; m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL); snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl"); m_minlimit(MC_MBUF_BIGCL) = 0; - m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL); + if (mbuf_limit) { + m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL); + } else { + m_maxlimit(MC_MBUF_BIGCL) = INT_MAX; + } m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES; m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL); snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl"); @@ -2055,12 +1207,20 @@ mbuf_table_init(void) * And for jumbo classes. */ m_minlimit(MC_16KCL) = 0; - m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */ + if (mbuf_limit) { + m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */ + } else { + m_maxlimit(MC_16KCL) = INT_MAX; + } m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES; snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl"); m_minlimit(MC_MBUF_16KCL) = 0; - m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL); + if (mbuf_limit) { + m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL); + } else { + m_maxlimit(MC_MBUF_16KCL) = INT_MAX; + } m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES; m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL); snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl"); @@ -2077,7 +1237,10 @@ mbuf_table_init(void) mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL); } -static int +#if !CONFIG_MBUF_MCACHE +static +#endif +int mbuf_get_class(struct mbuf *m) { if (m->m_flags & M_EXT) { @@ -2108,61 +1271,27 @@ mbuf_get_class(struct mbuf *m) return MC_MBUF; } +#if !CONFIG_MBUF_MCACHE bool mbuf_class_under_pressure(struct mbuf *m) { - int mclass = mbuf_get_class(m); + struct zone_basic_stats stats = {}; + zone_ref_t zone; + zone_id_t zid; + int mclass; -#if CONFIG_MBUF_MCACHE - if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) { - /* - * The above computation does not include the per-CPU cached objects. - * As a fast-path check this is good-enough. But now we do - * the "slower" count of the cached objects to know exactly the - * number of active mbufs in use. - * - * We do not take the mbuf_lock here to avoid lock-contention. Numbers - * might be slightly off but we don't try to be 100% accurate. - * At worst, we drop a packet that we shouldn't have dropped or - * we might go slightly above our memory-pressure threshold. 
- */ - mcache_t *cp = m_cache(mclass); - mcache_cpu_t *ccp = &cp->mc_cpu[0]; - - int bktsize = os_access_once(ccp->cc_bktsize); - uint32_t bl_total = os_access_once(cp->mc_full.bl_total); - uint32_t cached = 0; - int i; - - for (i = 0; i < ncpu; i++) { - ccp = &cp->mc_cpu[i]; - - int cc_objs = os_access_once(ccp->cc_objs); - if (cc_objs > 0) { - cached += cc_objs; - } - - int cc_pobjs = os_access_once(ccp->cc_pobjs); - if (cc_pobjs > 0) { - cached += cc_pobjs; - } - } - cached += (bl_total * bktsize); - if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) { - os_log(OS_LOG_DEFAULT, - "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u", - __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass)); - return true; - } + if (mbuf_limit == 0) { + return false; } -#else + + mclass = mbuf_get_class(m); + /* * Grab the statistics from zalloc. * We can't call mbuf_stat_sync() since that requires a lock. */ - const zone_id_t zid = m_class_to_zid(m_class(mclass)); - const zone_ref_t zone = zone_by_id(zid); - struct zone_basic_stats stats = {}; + zid = m_class_to_zid(m_class(mclass)); + zone = zone_by_id(zid); zone_get_stats(zone, &stats); if (stats.zbs_avail - stats.zbs_free >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) { @@ -2171,10 +1300,10 @@ mbuf_class_under_pressure(struct mbuf *m) __func__, mclass, stats.zbs_avail, stats.zbs_free, m_maxlimit(mclass)); return true; } -#endif /* CONFIG_MBUF_MCACHE */ return false; } +#endif /* CONFIG_MBUF_MCACHE */ #if defined(__LP64__) typedef struct ncl_tbl { @@ -2220,94 +1349,81 @@ mbuf_default_ncl(uint64_t mem) return n; } +#if !CONFIG_MBUF_MCACHE __private_extern__ void mbinit(void) { unsigned int m; -#if CONFIG_MBUF_MCACHE - unsigned int initmcl = 0; - thread_t thread = THREAD_NULL; -#endif /* CONFIG_MBUF_MCACHE */ - -#if CONFIG_MBUF_MCACHE - microuptime(&mb_start); -#endif /* CONFIG_MBUF_MCACHE */ /* * These MBUF_ values must be equal to their private counterparts. 
*/ - _CASSERT(MBUF_EXT == M_EXT); - _CASSERT(MBUF_PKTHDR == M_PKTHDR); - _CASSERT(MBUF_EOR == M_EOR); - _CASSERT(MBUF_LOOP == M_LOOP); - _CASSERT(MBUF_BCAST == M_BCAST); - _CASSERT(MBUF_MCAST == M_MCAST); - _CASSERT(MBUF_FRAG == M_FRAG); - _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG); - _CASSERT(MBUF_LASTFRAG == M_LASTFRAG); - _CASSERT(MBUF_PROMISC == M_PROMISC); - _CASSERT(MBUF_HASFCS == M_HASFCS); + static_assert(MBUF_EXT == M_EXT); + static_assert(MBUF_PKTHDR == M_PKTHDR); + static_assert(MBUF_EOR == M_EOR); + static_assert(MBUF_LOOP == M_LOOP); + static_assert(MBUF_BCAST == M_BCAST); + static_assert(MBUF_MCAST == M_MCAST); + static_assert(MBUF_FRAG == M_FRAG); + static_assert(MBUF_FIRSTFRAG == M_FIRSTFRAG); + static_assert(MBUF_LASTFRAG == M_LASTFRAG); + static_assert(MBUF_PROMISC == M_PROMISC); + static_assert(MBUF_HASFCS == M_HASFCS); - _CASSERT(MBUF_TYPE_FREE == MT_FREE); - _CASSERT(MBUF_TYPE_DATA == MT_DATA); - _CASSERT(MBUF_TYPE_HEADER == MT_HEADER); - _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET); - _CASSERT(MBUF_TYPE_PCB == MT_PCB); - _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE); - _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE); - _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE); - _CASSERT(MBUF_TYPE_SONAME == MT_SONAME); - _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS); - _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE); - _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS); - _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR); - _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL); - _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA); + static_assert(MBUF_TYPE_FREE == MT_FREE); + static_assert(MBUF_TYPE_DATA == MT_DATA); + static_assert(MBUF_TYPE_HEADER == MT_HEADER); + static_assert(MBUF_TYPE_SOCKET == MT_SOCKET); + static_assert(MBUF_TYPE_PCB == MT_PCB); + static_assert(MBUF_TYPE_RTABLE == MT_RTABLE); + static_assert(MBUF_TYPE_HTABLE == MT_HTABLE); + static_assert(MBUF_TYPE_ATABLE == MT_ATABLE); + static_assert(MBUF_TYPE_SONAME == MT_SONAME); + static_assert(MBUF_TYPE_SOOPTS == MT_SOOPTS); + static_assert(MBUF_TYPE_FTABLE == MT_FTABLE); + static_assert(MBUF_TYPE_RIGHTS == MT_RIGHTS); + static_assert(MBUF_TYPE_IFADDR == MT_IFADDR); + static_assert(MBUF_TYPE_CONTROL == MT_CONTROL); + static_assert(MBUF_TYPE_OOBDATA == MT_OOBDATA); - _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4); - _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6); - _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL); - _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16); - _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT); - _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP); - _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP); - _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP); - _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6); - _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6); - _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED); - _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID); - _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID); - _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR); + static_assert(MBUF_TSO_IPV4 == CSUM_TSO_IPV4); + static_assert(MBUF_TSO_IPV6 == CSUM_TSO_IPV6); + static_assert(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL); + static_assert(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16); + static_assert(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT); + static_assert(MBUF_CSUM_REQ_IP == CSUM_IP); + static_assert(MBUF_CSUM_REQ_TCP == CSUM_TCP); + static_assert(MBUF_CSUM_REQ_UDP == CSUM_UDP); + static_assert(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6); + static_assert(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6); + static_assert(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED); + static_assert(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID); + static_assert(MBUF_CSUM_DID_DATA == 
CSUM_DATA_VALID); + static_assert(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR); - _CASSERT(MBUF_WAITOK == M_WAIT); - _CASSERT(MBUF_DONTWAIT == M_DONTWAIT); - _CASSERT(MBUF_COPYALL == M_COPYALL); + static_assert(MBUF_WAITOK == M_WAIT); + static_assert(MBUF_DONTWAIT == M_DONTWAIT); + static_assert(MBUF_COPYALL == M_COPYALL); - _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK); - _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK); - _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE); - _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE); - _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE); - _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI); - _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI); - _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI); - _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI); - _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO); - _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO); + static_assert(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK); + static_assert(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK); + static_assert(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE); + static_assert(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE); + static_assert(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE); + static_assert(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI); + static_assert(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI); + static_assert(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI); + static_assert(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI); + static_assert(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO); + static_assert(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO); - _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK); - _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE); - _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI); - _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO); + static_assert(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK); + static_assert(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE); + static_assert(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI); + static_assert(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO); /* Module specific scratch space (32-bit alignment requirement) */ - _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) % - sizeof(uint32_t))); - -#if CONFIG_MBUF_MCACHE - /* Make sure we don't save more than we should */ - _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf)); -#endif /* CONFIG_MBUF_MCACHE */ + static_assert(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) % sizeof(uint32_t))); if (nmbclusters == 0) { nmbclusters = NMBCLUSTERS; @@ -2316,100 +1432,16 @@ mbinit(void) /* This should be a sane (at least even) value by now */ VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1)); + PE_parse_boot_argn("mbuf_limit", &mbuf_limit, sizeof(mbuf_limit)); + if (serverperfmode) { + mbuf_limit = 0; + } + /* Setup the mbuf table */ mbuf_table_init(); - _CASSERT(sizeof(struct mbuf) == _MSIZE); + static_assert(sizeof(struct mbuf) == _MSIZE); -#if CONFIG_MBUF_MCACHE - /* - * Allocate cluster slabs table: - * - * maxslabgrp = (N * 2048) / (1024 * 1024) - * - * Where N is nmbclusters rounded up to the nearest 512. This yields - * mcl_slab_g_t units, each one representing a MB of memory. - */ - maxslabgrp = - (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT; - slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *), - ZALIGN(mcl_slabg_t)); - - /* - * Allocate audit structures, if needed: - * - * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE - * - * This yields mcl_audit_t units, each one representing a page. 
- */ - PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug)); - mbuf_debug |= mcache_getflags(); - if (mbuf_debug & MCF_DEBUG) { - int l; - mcl_audit_t *mclad; - maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT); - mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit), - ZALIGN(mcl_audit_t)); - for (l = 0, mclad = mclaudit; l < maxclaudit; l++) { - mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *), - ZALIGN_PTR); - } - - mcl_audit_con_cache = mcache_create("mcl_audit_contents", - AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP); - VERIFY(mcl_audit_con_cache != NULL); - } - mclverify = (mbuf_debug & MCF_VERIFY); - mcltrace = (mbuf_debug & MCF_TRACE); - mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG); - mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG); - - /* Enable mbuf leak logging, with a lock to protect the tables */ - - mleak_activate(); - - /* - * Allocate structure for per-CPU statistics that's aligned - * on the CPU cache boundary; this code assumes that we never - * uninitialize this framework, since the original address - * before alignment is not saved. - */ - ncpu = ml_wait_max_cpus(); - - /* Calculate the number of pages assigned to the cluster pool */ - mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE; - mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t), - ZALIGN(ppnum_t)); - - /* Register with the I/O Bus mapper */ - mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); - - embutl = (mbutl + (nmbclusters * MCLBYTES)); - VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0); - - /* Prime up the freelist */ - PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl)); - if (initmcl != 0) { - initmcl >>= NCLPBGSHIFT; /* become a 4K unit */ - if (initmcl > m_maxlimit(MC_BIGCL)) { - initmcl = m_maxlimit(MC_BIGCL); - } - } - if (initmcl < m_minlimit(MC_BIGCL)) { - initmcl = m_minlimit(MC_BIGCL); - } - - lck_mtx_lock(mbuf_mlock); - - /* - * For classes with non-zero minimum limits, populate their freelists - * so that m_total(class) is at least m_minlimit(class). 
- */ - VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0); - freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT); - VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); - freelist_init(m_class(MC_CL)); -#else /* * We have yet to create the non composite zones * and thus we haven't asked zalloc to allocate @@ -2421,68 +1453,21 @@ mbinit(void) */ m_total(MC_BIGCL) = m_minlimit(MC_BIGCL); m_total(MC_CL) = m_minlimit(MC_CL); -#endif /* CONFIG_MBUF_MCACHE */ - for (m = 0; m < NELEM(mbuf_table); m++) { + for (m = 0; m < MC_MAX; m++) { /* Make sure we didn't miss any */ VERIFY(m_minlimit(m_class(m)) == 0 || m_total(m_class(m)) >= m_minlimit(m_class(m))); } -#if CONFIG_MBUF_MCACHE - lck_mtx_unlock(mbuf_mlock); - - (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init, - NULL, &thread); - thread_deallocate(thread); - - ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref), - 0, 0, MCR_SLEEP); -#endif /* CONFIG_MBUF_MCACHE */ - /* Create the cache for each class */ - for (m = 0; m < NELEM(mbuf_table); m++) { -#if CONFIG_MBUF_MCACHE - void *allocfunc, *freefunc, *auditfunc, *logfunc; - u_int32_t flags; - - flags = mbuf_debug; - if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL || - m_class(m) == MC_MBUF_16KCL) { - allocfunc = mbuf_cslab_alloc; - freefunc = mbuf_cslab_free; - auditfunc = mbuf_cslab_audit; - logfunc = mleak_logger; - } else { - allocfunc = mbuf_slab_alloc; - freefunc = mbuf_slab_free; - auditfunc = mbuf_slab_audit; - logfunc = mleak_logger; - } - - /* - * Disable per-CPU caches for jumbo classes if there - * is no jumbo cluster pool available in the system. - * The cache itself is still created (but will never - * be populated) since it simplifies the code. - */ - if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) && - njcl == 0) { - flags |= MCF_NOCPUCACHE; - } - - if (!mclfindleak) { - flags |= MCF_NOLEAKLOG; - } - - m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m), - allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify, - (void *)(uintptr_t)m, flags, MCR_SLEEP); -#else + for (m = 0; m < MC_MAX; m++) { if (!MBUF_CLASS_COMPOSITE(m)) { zone_ref_t zone = zone_by_id(m_class_to_zid(m)); - zone_set_exhaustible(zone, m_maxlimit(m), false); + if (mbuf_limit) { + zone_set_exhaustible(zone, m_maxlimit(m), false); + } zone_raise_reserve(zone, m_minlimit(m)); /* * Pretend that we have allocated m_total() items @@ -2491,7 +1476,6 @@ mbinit(void) */ m_total(m) = m_minlimit(m); } -#endif /* CONFIG_MBUF_MCACHE */ } /* @@ -2515,15 +1499,6 @@ mbinit(void) } } -#if CONFIG_MBUF_MCACHE - /* allocate space for mbuf_dump_buf */ - mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE); - - if (mbuf_debug & MCF_DEBUG) { - printf("%s: MLEN %d, MHLEN %d\n", __func__, - (int)_MLEN, (int)_MHLEN); - } -#else mbuf_defunct_tcall = thread_call_allocate_with_options(mbuf_watchdog_defunct, NULL, @@ -2534,1857 +1509,26 @@ mbinit(void) NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE); -#endif /* CONFIG_MBUF_MCACHE */ printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__, (nmbclusters << MCLSHIFT) >> MBSHIFT, (nclusters << MCLSHIFT) >> MBSHIFT, (njcl << MCLSHIFT) >> MBSHIFT); } -#if CONFIG_MBUF_MCACHE -/* - * Obtain a slab of object(s) from the class's freelist. 
- */ -static mcache_obj_t * -slab_alloc(mbuf_class_t class, int wait) -{ - mcl_slab_t *sp; - mcache_obj_t *buf; - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - /* This should always be NULL for us */ - VERIFY(m_cobjlist(class) == NULL); - - /* - * Treat composite objects as having longer lifespan by using - * a slab from the reverse direction, in hoping that this could - * reduce the probability of fragmentation for slabs that hold - * more than one buffer chunks (e.g. mbuf slabs). For other - * slabs, this probably doesn't make much of a difference. - */ - if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL) - && (wait & MCR_COMP)) { - sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead); - } else { - sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class)); - } - - if (sp == NULL) { - VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0); - /* The slab list for this class is empty */ - return NULL; - } - - VERIFY(m_infree(class) > 0); - VERIFY(!slab_is_detached(sp)); - VERIFY(sp->sl_class == class && - (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); - buf = sp->sl_head; - VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf)); - sp->sl_head = buf->obj_next; - /* Increment slab reference */ - sp->sl_refcnt++; - - VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks); - - if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) { - slab_nextptr_panic(sp, sp->sl_head); - /* In case sl_head is in the map but not in the slab */ - VERIFY(slab_inrange(sp, sp->sl_head)); - /* NOTREACHED */ - } - - if (mclaudit != NULL) { - mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); - mca->mca_uflags = 0; - /* Save contents on mbuf objects only */ - if (class == MC_MBUF) { - mca->mca_uflags |= MB_SCVALID; - } - } - - if (class == MC_CL) { - mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL); - /* - * A 2K cluster slab can have at most NCLPG references. - */ - VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG && - sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE); - VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL); - } else if (class == MC_BIGCL) { - mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) + - m_infree(MC_MBUF_BIGCL); - /* - * A 4K cluster slab can have NBCLPG references. - */ - VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG && - sp->sl_len == PAGE_SIZE && - (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL)); - } else if (class == MC_16KCL) { - mcl_slab_t *nsp; - int k; - - --m_infree(MC_16KCL); - VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); - /* - * Increment 2nd-Nth slab reference, where N is NSLABSP16KB. - * A 16KB big cluster takes NSLABSP16KB slabs, each having at - * most 1 reference. - */ - for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { - nsp = nsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - nsp->sl_refcnt++; - VERIFY(!slab_is_detached(nsp)); - VERIFY(nsp->sl_class == MC_16KCL && - nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && - nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && - nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && - nsp->sl_head == NULL); - } - } else { - VERIFY(class == MC_MBUF); - --m_infree(MC_MBUF); - /* - * If auditing is turned on, this check is - * deferred until later in mbuf_slab_audit(). 
- */ - if (mclaudit == NULL) { - _MCHECK((struct mbuf *)buf); - } - /* - * Since we have incremented the reference count above, - * an mbuf slab (formerly a 4KB cluster slab that was cut - * up into mbufs) must have a reference count between 1 - * and NMBPG at this point. - */ - VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG && - sp->sl_chunks == NMBPG && - sp->sl_len == PAGE_SIZE); - VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL); - } - - /* If empty, remove this slab from the class's freelist */ - if (sp->sl_head == NULL) { - VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG); - VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG); - VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG); - slab_remove(sp, class); - } - - return buf; -} - -/* - * Place a slab of object(s) back into a class's slab list. - */ -static void -slab_free(mbuf_class_t class, mcache_obj_t *buf) -{ - mcl_slab_t *sp; - boolean_t reinit_supercl = false; - mbuf_class_t super_class; - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - VERIFY(class != MC_16KCL || njcl > 0); - VERIFY(buf->obj_next == NULL); - - /* - * Synchronizing with m_clalloc, as it reads m_total, while we here - * are modifying m_total. - */ - while (mb_clalloc_busy) { - mb_clalloc_waiters++; - (void) msleep(mb_clalloc_waitchan, mbuf_mlock, - (PZERO - 1), "m_clalloc", NULL); - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - } - - /* We are busy now; tell everyone else to go away */ - mb_clalloc_busy = TRUE; - - sp = slab_get(buf); - VERIFY(sp->sl_class == class && slab_inrange(sp, buf) && - (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); - - /* Decrement slab reference */ - sp->sl_refcnt--; - - if (class == MC_CL) { - VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); - /* - * A slab that has been splitted for 2KB clusters can have - * at most 1 outstanding reference at this point. - */ - VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) && - sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE); - VERIFY(sp->sl_refcnt < (NCLPG - 1) || - (slab_is_detached(sp) && sp->sl_head == NULL)); - } else if (class == MC_BIGCL) { - VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES)); - - /* A 4KB cluster slab can have NBCLPG references at most */ - VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG); - VERIFY(sp->sl_refcnt < (NBCLPG - 1) || - (slab_is_detached(sp) && sp->sl_head == NULL)); - } else if (class == MC_16KCL) { - mcl_slab_t *nsp; - int k; - /* - * A 16KB cluster takes NSLABSP16KB slabs, all must - * now have 0 reference. - */ - VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE)); - VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); - VERIFY(slab_is_detached(sp)); - for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { - nsp = nsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - nsp->sl_refcnt--; - VERIFY(slab_is_detached(nsp)); - VERIFY(nsp->sl_class == MC_16KCL && - (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && - nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && - nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && - nsp->sl_head == NULL); - } - } else { - /* - * A slab that has been splitted for mbufs has at most - * NMBPG reference counts. Since we have decremented - * one reference above, it must now be between 0 and - * NMBPG-1. 
- */ - VERIFY(class == MC_MBUF); - VERIFY(sp->sl_refcnt >= 0 && - sp->sl_refcnt <= (NMBPG - 1) && - sp->sl_chunks == NMBPG && - sp->sl_len == PAGE_SIZE); - VERIFY(sp->sl_refcnt < (NMBPG - 1) || - (slab_is_detached(sp) && sp->sl_head == NULL)); - } - - /* - * When auditing is enabled, ensure that the buffer still - * contains the free pattern. Otherwise it got corrupted - * while at the CPU cache layer. - */ - if (mclaudit != NULL) { - mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); - if (mclverify) { - mcache_audit_free_verify(mca, buf, 0, - m_maxsize(class)); - } - mca->mca_uflags &= ~MB_SCVALID; - } - - if (class == MC_CL) { - mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); - buf->obj_next = sp->sl_head; - } else if (class == MC_BIGCL) { - mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + - m_infree(MC_MBUF_BIGCL); - buf->obj_next = sp->sl_head; - } else if (class == MC_16KCL) { - ++m_infree(MC_16KCL); - } else { - ++m_infree(MC_MBUF); - buf->obj_next = sp->sl_head; - } - sp->sl_head = buf; - - /* - * If a slab has been split to either one which holds 2KB clusters, - * or one which holds mbufs, turn it back to one which holds a - * 4 or 16 KB cluster depending on the page size. - */ - if (m_maxsize(MC_BIGCL) == PAGE_SIZE) { - super_class = MC_BIGCL; - } else { - VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL)); - super_class = MC_16KCL; - } - if (class == MC_MBUF && sp->sl_refcnt == 0 && - m_total(class) >= (m_minlimit(class) + NMBPG) && - m_total(super_class) < m_maxlimit(super_class)) { - int i = NMBPG; - - m_total(MC_MBUF) -= NMBPG; - mbstat.m_mbufs = m_total(MC_MBUF); - m_infree(MC_MBUF) -= NMBPG; - mtype_stat_add(MT_FREE, -((unsigned)NMBPG)); - - while (i--) { - struct mbuf *m = sp->sl_head; - VERIFY(m != NULL); - sp->sl_head = m->m_next; - m->m_next = NULL; - } - reinit_supercl = true; - } else if (class == MC_CL && sp->sl_refcnt == 0 && - m_total(class) >= (m_minlimit(class) + NCLPG) && - m_total(super_class) < m_maxlimit(super_class)) { - int i = NCLPG; - - m_total(MC_CL) -= NCLPG; - mbstat.m_clusters = m_total(MC_CL); - m_infree(MC_CL) -= NCLPG; - - while (i--) { - union mcluster *c = sp->sl_head; - VERIFY(c != NULL); - sp->sl_head = c->mcl_next; - c->mcl_next = NULL; - } - reinit_supercl = true; - } else if (class == MC_BIGCL && super_class != MC_BIGCL && - sp->sl_refcnt == 0 && - m_total(class) >= (m_minlimit(class) + NBCLPG) && - m_total(super_class) < m_maxlimit(super_class)) { - int i = NBCLPG; - - VERIFY(super_class == MC_16KCL); - m_total(MC_BIGCL) -= NBCLPG; - mbstat.m_bigclusters = m_total(MC_BIGCL); - m_infree(MC_BIGCL) -= NBCLPG; - - while (i--) { - union mbigcluster *bc = sp->sl_head; - VERIFY(bc != NULL); - sp->sl_head = bc->mbc_next; - bc->mbc_next = NULL; - } - reinit_supercl = true; - } - - if (reinit_supercl) { - VERIFY(sp->sl_head == NULL); - VERIFY(m_total(class) >= m_minlimit(class)); - slab_remove(sp, class); - - /* Reinitialize it as a cluster for the super class */ - m_total(super_class)++; - m_infree(super_class)++; - VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) && - sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0); - - slab_init(sp, super_class, SLF_MAPPED, sp->sl_base, - sp->sl_base, PAGE_SIZE, 0, 1); - if (mclverify) { - mcache_set_pattern(MCACHE_FREE_PATTERN, - (caddr_t)sp->sl_base, sp->sl_len); - } - ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL; - - if (super_class == MC_BIGCL) { - mbstat.m_bigclusters = m_total(MC_BIGCL); - mbstat.m_bigclfree = m_infree(MC_BIGCL) + - m_infree(MC_MBUF_BIGCL); - } - - VERIFY(slab_is_detached(sp)); - 
VERIFY(m_total(super_class) <= m_maxlimit(super_class)); - - /* And finally switch class */ - class = super_class; - } - - /* Reinsert the slab to the class's slab list */ - if (slab_is_detached(sp)) { - slab_insert(sp, class); - } - - /* We're done; let others enter */ - mb_clalloc_busy = FALSE; - if (mb_clalloc_waiters > 0) { - mb_clalloc_waiters = 0; - wakeup(mb_clalloc_waitchan); - } -} - -/* - * Common allocator for rudimentary objects called by the CPU cache layer - * during an allocation request whenever there is no available element in the - * bucket layer. It returns one or more elements from the appropriate global - * freelist. If the freelist is empty, it will attempt to populate it and - * retry the allocation. - */ -static unsigned int -mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) -{ - mbuf_class_t class = (mbuf_class_t)arg; - unsigned int need = num; - mcache_obj_t **list = *plist; - - ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); - ASSERT(need > 0); - - lck_mtx_lock(mbuf_mlock); - - for (;;) { - if ((*list = slab_alloc(class, wait)) != NULL) { - (*list)->obj_next = NULL; - list = *plist = &(*list)->obj_next; - - if (--need == 0) { - /* - * If the number of elements in freelist has - * dropped below low watermark, asynchronously - * populate the freelist now rather than doing - * it later when we run out of elements. - */ - if (!mbuf_cached_above(class, wait) && - m_infree(class) < (m_total(class) >> 5)) { - (void) freelist_populate(class, 1, - M_DONTWAIT); - } - break; - } - } else { - VERIFY(m_infree(class) == 0 || class == MC_CL); - - (void) freelist_populate(class, 1, - (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT); - - if (m_infree(class) > 0) { - continue; - } - - /* Check if there's anything at the cache layer */ - if (mbuf_cached_above(class, wait)) { - break; - } - - /* watchdog checkpoint */ - mbuf_watchdog(); - - /* We have nothing and cannot block; give up */ - if (wait & MCR_NOSLEEP) { - if (!(wait & MCR_TRYHARD)) { - m_fail_cnt(class)++; - mbstat.m_drops++; - break; - } - } - - /* - * If the freelist is still empty and the caller is - * willing to be blocked, sleep on the wait channel - * until an element is available. Otherwise, if - * MCR_TRYHARD is set, do our best to satisfy the - * request without having to go to sleep. - */ - if (mbuf_worker_ready && - mbuf_sleep(class, need, wait)) { - break; - } - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - } - } - - m_alloc_cnt(class) += num - need; - lck_mtx_unlock(mbuf_mlock); - - return num - need; -} - -/* - * Common de-allocator for rudimentary objects called by the CPU cache - * layer when one or more elements need to be returned to the appropriate - * global freelist. - */ -static void -mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged) -{ - mbuf_class_t class = (mbuf_class_t)arg; - mcache_obj_t *nlist; - unsigned int num = 0; - int w; - - ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); - - lck_mtx_lock(mbuf_mlock); - - for (;;) { - nlist = list->obj_next; - list->obj_next = NULL; - slab_free(class, list); - ++num; - if ((list = nlist) == NULL) { - break; - } - } - m_free_cnt(class) += num; - - if ((w = mb_waiters) > 0) { - mb_waiters = 0; - } - if (w) { - mbwdog_logger("waking up all threads"); - } - lck_mtx_unlock(mbuf_mlock); - - if (w != 0) { - wakeup(mb_waitchan); - } -} - -/* - * Common auditor for rudimentary objects called by the CPU cache layer - * during an allocation or free request. 
For the former, this is called - * after the objects are obtained from either the bucket or slab layer - * and before they are returned to the caller. For the latter, this is - * called immediately during free and before placing the objects into - * the bucket or slab layer. - */ -static void -mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) -{ - mbuf_class_t class = (mbuf_class_t)arg; - mcache_audit_t *mca; - - ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); - - while (list != NULL) { - lck_mtx_lock(mbuf_mlock); - mca = mcl_audit_buf2mca(class, list); - - /* Do the sanity checks */ - if (class == MC_MBUF) { - mcl_audit_mbuf(mca, list, FALSE, alloc); - ASSERT(mca->mca_uflags & MB_SCVALID); - } else { - mcl_audit_cluster(mca, list, m_maxsize(class), - alloc, TRUE); - ASSERT(!(mca->mca_uflags & MB_SCVALID)); - } - /* Record this transaction */ - if (mcltrace) { - mcache_buffer_log(mca, list, m_cache(class), &mb_start); - } - - if (alloc) { - mca->mca_uflags |= MB_INUSE; - } else { - mca->mca_uflags &= ~MB_INUSE; - } - /* Unpair the object (unconditionally) */ - mca->mca_uptr = NULL; - lck_mtx_unlock(mbuf_mlock); - - list = list->obj_next; - } -} - -/* - * Common notify routine for all caches. It is called by mcache when - * one or more objects get freed. We use this indication to trigger - * the wakeup of any sleeping threads so that they can retry their - * allocation requests. - */ -static void -mbuf_slab_notify(void *arg, u_int32_t reason) -{ - mbuf_class_t class = (mbuf_class_t)arg; - int w; - - ASSERT(MBUF_CLASS_VALID(class)); - - if (reason != MCN_RETRYALLOC) { - return; - } - - lck_mtx_lock(mbuf_mlock); - if ((w = mb_waiters) > 0) { - m_notified(class)++; - mb_waiters = 0; - } - if (w) { - mbwdog_logger("waking up all threads"); - } - lck_mtx_unlock(mbuf_mlock); - - if (w != 0) { - wakeup(mb_waitchan); - } -} - -/* - * Obtain object(s) from the composite class's freelist. - */ -static unsigned int -cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) -{ - unsigned int need = num; - mcl_slab_t *sp, *clsp, *nsp; - struct mbuf *m; - mcache_obj_t **list = *plist; - void *cl; - - VERIFY(need > 0); - VERIFY(class != MC_MBUF_16KCL || njcl > 0); - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - /* Get what we can from the freelist */ - while ((*list = m_cobjlist(class)) != NULL) { - MRANGE(*list); - - m = (struct mbuf *)*list; - sp = slab_get(m); - cl = m->m_ext.ext_buf; - clsp = slab_get(cl); - VERIFY(m->m_flags == M_EXT && cl != NULL); - VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m)); - - if (class == MC_MBUF_CL) { - VERIFY(clsp->sl_refcnt >= 1 && - clsp->sl_refcnt <= NCLPG); - } else { - VERIFY(clsp->sl_refcnt >= 1 && - clsp->sl_refcnt <= NBCLPG); - } - - if (class == MC_MBUF_16KCL) { - int k; - for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { - nsp = nsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } - } - - if ((m_cobjlist(class) = (*list)->obj_next) != NULL && - !MBUF_IN_MAP(m_cobjlist(class))) { - slab_nextptr_panic(sp, m_cobjlist(class)); - /* NOTREACHED */ - } - (*list)->obj_next = NULL; - list = *plist = &(*list)->obj_next; - - if (--need == 0) { - break; - } - } - m_infree(class) -= (num - need); - - return num - need; -} - -/* - * Place object(s) back into a composite class's freelist. 
- */ -static unsigned int -cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) -{ - mcache_obj_t *o, *tail; - unsigned int num = 0; - struct mbuf *m, *ms; - mcache_audit_t *mca = NULL; - mcache_obj_t *ref_list = NULL; - mcl_slab_t *clsp, *nsp; - void *cl; - mbuf_class_t cl_class; - - ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); - VERIFY(class != MC_MBUF_16KCL || njcl > 0); - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - if (class == MC_MBUF_CL) { - cl_class = MC_CL; - } else if (class == MC_MBUF_BIGCL) { - cl_class = MC_BIGCL; - } else { - VERIFY(class == MC_MBUF_16KCL); - cl_class = MC_16KCL; - } - - o = tail = list; - - while ((m = ms = (struct mbuf *)o) != NULL) { - mcache_obj_t *rfa, *nexto = o->obj_next; - - /* Do the mbuf sanity checks */ - if (mclaudit != NULL) { - mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); - if (mclverify) { - mcache_audit_free_verify(mca, m, 0, - m_maxsize(MC_MBUF)); - } - ms = MCA_SAVED_MBUF_PTR(mca); - } - - /* Do the cluster sanity checks */ - cl = ms->m_ext.ext_buf; - clsp = slab_get(cl); - if (mclverify) { - size_t size = m_maxsize(cl_class); - mcache_audit_free_verify(mcl_audit_buf2mca(cl_class, - (mcache_obj_t *)cl), cl, 0, size); - } - VERIFY(ms->m_type == MT_FREE); - VERIFY(ms->m_flags == M_EXT); - VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms)); - if (cl_class == MC_CL) { - VERIFY(clsp->sl_refcnt >= 1 && - clsp->sl_refcnt <= NCLPG); - } else { - VERIFY(clsp->sl_refcnt >= 1 && - clsp->sl_refcnt <= NBCLPG); - } - if (cl_class == MC_16KCL) { - int k; - for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { - nsp = nsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } - } - - /* - * If we're asked to purge, restore the actual mbuf using - * contents of the shadow structure (if auditing is enabled) - * and clear EXTF_COMPOSITE flag from the mbuf, as we are - * about to free it and the attached cluster into their caches. - */ - if (purged) { - /* Restore constructed mbuf fields */ - if (mclaudit != NULL) { - mcl_audit_restore_mbuf(m, mca, TRUE); - } - - MEXT_MINREF(m) = 0; - MEXT_REF(m) = 0; - MEXT_PREF(m) = 0; - MEXT_FLAGS(m) = 0; - MEXT_PRIV(m) = 0; - MEXT_PMBUF(m) = NULL; - - rfa = (mcache_obj_t *)(void *)m_get_rfa(m); - m_set_ext(m, NULL, NULL, NULL); - rfa->obj_next = ref_list; - ref_list = rfa; - - m->m_type = MT_FREE; - m->m_flags = m->m_len = 0; - m->m_next = m->m_nextpkt = NULL; - - /* Save mbuf fields and make auditing happy */ - if (mclaudit != NULL) { - mcl_audit_mbuf(mca, o, FALSE, FALSE); - } - - VERIFY(m_total(class) > 0); - m_total(class)--; - - /* Free the mbuf */ - o->obj_next = NULL; - slab_free(MC_MBUF, o); - - /* And free the cluster */ - ((mcache_obj_t *)cl)->obj_next = NULL; - if (class == MC_MBUF_CL) { - slab_free(MC_CL, cl); - } else if (class == MC_MBUF_BIGCL) { - slab_free(MC_BIGCL, cl); - } else { - slab_free(MC_16KCL, cl); - } - } - - ++num; - tail = o; - o = nexto; - } - - if (!purged) { - tail->obj_next = m_cobjlist(class); - m_cobjlist(class) = list; - m_infree(class) += num; - } else if (ref_list != NULL) { - mcache_free_ext(ref_cache, ref_list); - } - - return num; -} - -/* - * Common allocator for composite objects called by the CPU cache layer - * during an allocation request whenever there is no available element in - * the bucket layer. It returns one or more composite elements from the - * appropriate global freelist. 
If the freelist is empty, it will attempt - * to obtain the rudimentary objects from their caches and construct them - * into composite mbuf + cluster objects. - */ -static unsigned int -mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, - int wait) -{ - mbuf_class_t class = (mbuf_class_t)arg; - mbuf_class_t cl_class = 0; - unsigned int num = 0, cnum = 0, want = needed; - mcache_obj_t *ref_list = NULL; - mcache_obj_t *mp_list = NULL; - mcache_obj_t *clp_list = NULL; - mcache_obj_t **list; - struct ext_ref *rfa; - struct mbuf *m; - void *cl; - - ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); - ASSERT(needed > 0); - - VERIFY(class != MC_MBUF_16KCL || njcl > 0); - - /* There should not be any slab for this class */ - VERIFY(m_slab_cnt(class) == 0 && - m_slablist(class).tqh_first == NULL && - m_slablist(class).tqh_last == NULL); - - lck_mtx_lock(mbuf_mlock); - - /* Try using the freelist first */ - num = cslab_alloc(class, plist, needed); - list = *plist; - if (num == needed) { - m_alloc_cnt(class) += num; - lck_mtx_unlock(mbuf_mlock); - return needed; - } - - lck_mtx_unlock(mbuf_mlock); - - /* - * We could not satisfy the request using the freelist alone; - * allocate from the appropriate rudimentary caches and use - * whatever we can get to construct the composite objects. - */ - needed -= num; - - /* - * Mark these allocation requests as coming from a composite cache. - * Also, if the caller is willing to be blocked, mark the request - * with MCR_FAILOK such that we don't end up sleeping at the mbuf - * slab layer waiting for the individual object when one or more - * of the already-constructed composite objects are available. - */ - wait |= MCR_COMP; - if (!(wait & MCR_NOSLEEP)) { - wait |= MCR_FAILOK; - } - - /* allocate mbufs */ - needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait); - if (needed == 0) { - ASSERT(mp_list == NULL); - goto fail; - } - - /* allocate clusters */ - if (class == MC_MBUF_CL) { - cl_class = MC_CL; - } else if (class == MC_MBUF_BIGCL) { - cl_class = MC_BIGCL; - } else { - VERIFY(class == MC_MBUF_16KCL); - cl_class = MC_16KCL; - } - needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait); - if (needed == 0) { - ASSERT(clp_list == NULL); - goto fail; - } - - needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait); - if (needed == 0) { - ASSERT(ref_list == NULL); - goto fail; - } - - /* - * By this time "needed" is MIN(mbuf, cluster, ref). Any left - * overs will get freed accordingly before we return to caller. - */ - for (cnum = 0; cnum < needed; cnum++) { - struct mbuf *ms; - - m = ms = (struct mbuf *)mp_list; - mp_list = mp_list->obj_next; - - cl = clp_list; - clp_list = clp_list->obj_next; - ((mcache_obj_t *)cl)->obj_next = NULL; - - rfa = (struct ext_ref *)ref_list; - ref_list = ref_list->obj_next; - ((mcache_obj_t *)(void *)rfa)->obj_next = NULL; - - /* - * If auditing is enabled, construct the shadow mbuf - * in the audit structure instead of in the actual one. - * mbuf_cslab_audit() will take care of restoring the - * contents after the integrity check. - */ - if (mclaudit != NULL) { - mcache_audit_t *mca, *cl_mca; - - lck_mtx_lock(mbuf_mlock); - mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); - ms = MCA_SAVED_MBUF_PTR(mca); - cl_mca = mcl_audit_buf2mca(cl_class, - (mcache_obj_t *)cl); - - /* - * Pair them up. Note that this is done at the time - * the mbuf+cluster objects are constructed. 
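mbuf_cslab_alloc() above assembles each composite out of three raw allocations, and because every mcache_alloc_ext() call reuses the running 'needed' count as its request, the number of composites constructed is effectively MIN(mbufs, clusters, refs); the leftovers are released on the fail path, which the sketch below omits. alloc_composites() and the toy backends are hypothetical, shown only to make the narrowing visible.

#include <stdio.h>

/*
 * Each stage reports how many objects it actually produced; asking the
 * next stage for only that many makes the final figure the minimum of
 * all three, which is how mbuf_cslab_alloc() sizes its construction loop.
 */
static unsigned int
alloc_composites(unsigned int want,
    unsigned int (*alloc_mbufs)(unsigned int),
    unsigned int (*alloc_clusters)(unsigned int),
    unsigned int (*alloc_refs)(unsigned int))
{
	unsigned int needed = want;

	needed = alloc_mbufs(needed);
	if (needed == 0) {
		return 0;
	}
	needed = alloc_clusters(needed);
	if (needed == 0) {
		return 0;
	}
	return alloc_refs(needed);  /* == MIN of what the three pools gave */
}

/* Toy backends: pretend the cluster pool is the scarce resource. */
static unsigned int give_mbufs(unsigned int n)    { return n; }
static unsigned int give_clusters(unsigned int n) { return n > 5 ? 5 : n; }
static unsigned int give_refs(unsigned int n)     { return n; }

int
main(void)
{
	printf("constructed %u composites\n",
	    alloc_composites(32, give_mbufs, give_clusters, give_refs));
	return 0;
}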
This - * information should be treated as "best effort" - * debugging hint since more than one mbufs can refer - * to a cluster. In that case, the cluster might not - * be freed along with the mbuf it was paired with. - */ - mca->mca_uptr = cl_mca; - cl_mca->mca_uptr = mca; - - ASSERT(mca->mca_uflags & MB_SCVALID); - ASSERT(!(cl_mca->mca_uflags & MB_SCVALID)); - lck_mtx_unlock(mbuf_mlock); - - /* Technically, they are in the freelist */ - if (mclverify) { - size_t size; - - mcache_set_pattern(MCACHE_FREE_PATTERN, m, - m_maxsize(MC_MBUF)); - - if (class == MC_MBUF_CL) { - size = m_maxsize(MC_CL); - } else if (class == MC_MBUF_BIGCL) { - size = m_maxsize(MC_BIGCL); - } else { - size = m_maxsize(MC_16KCL); - } - - mcache_set_pattern(MCACHE_FREE_PATTERN, cl, - size); - } - } - - MBUF_INIT(ms, 0, MT_FREE); - if (class == MC_MBUF_16KCL) { - MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); - } else if (class == MC_MBUF_BIGCL) { - MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); - } else { - MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); - } - VERIFY(ms->m_flags == M_EXT); - VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms)); - - *list = (mcache_obj_t *)m; - (*list)->obj_next = NULL; - list = *plist = &(*list)->obj_next; - } - -fail: - /* - * Free up what's left of the above. - */ - if (mp_list != NULL) { - mcache_free_ext(m_cache(MC_MBUF), mp_list); - } - if (clp_list != NULL) { - mcache_free_ext(m_cache(cl_class), clp_list); - } - if (ref_list != NULL) { - mcache_free_ext(ref_cache, ref_list); - } - - lck_mtx_lock(mbuf_mlock); - if (num > 0 || cnum > 0) { - m_total(class) += cnum; - VERIFY(m_total(class) <= m_maxlimit(class)); - m_alloc_cnt(class) += num + cnum; - } - if ((num + cnum) < want) { - m_fail_cnt(class) += (want - (num + cnum)); - } - lck_mtx_unlock(mbuf_mlock); - - return num + cnum; -} - -/* - * Common de-allocator for composite objects called by the CPU cache - * layer when one or more elements need to be returned to the appropriate - * global freelist. - */ -static void -mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged) -{ - mbuf_class_t class = (mbuf_class_t)arg; - unsigned int num; - int w; - - ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); - - lck_mtx_lock(mbuf_mlock); - - num = cslab_free(class, list, purged); - m_free_cnt(class) += num; - - if ((w = mb_waiters) > 0) { - mb_waiters = 0; - } - if (w) { - mbwdog_logger("waking up all threads"); - } - - lck_mtx_unlock(mbuf_mlock); - - if (w != 0) { - wakeup(mb_waitchan); - } -} - -/* - * Common auditor for composite objects called by the CPU cache layer - * during an allocation or free request. For the former, this is called - * after the objects are obtained from either the bucket or slab layer - * and before they are returned to the caller. For the latter, this is - * called immediately during free and before placing the objects into - * the bucket or slab layer. 
- */ -static void -mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) -{ - mbuf_class_t class = (mbuf_class_t)arg, cl_class; - mcache_audit_t *mca; - struct mbuf *m, *ms; - mcl_slab_t *clsp, *nsp; - size_t cl_size; - void *cl; - - ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); - if (class == MC_MBUF_CL) { - cl_class = MC_CL; - } else if (class == MC_MBUF_BIGCL) { - cl_class = MC_BIGCL; - } else { - cl_class = MC_16KCL; - } - cl_size = m_maxsize(cl_class); - - while ((m = ms = (struct mbuf *)list) != NULL) { - lck_mtx_lock(mbuf_mlock); - /* Do the mbuf sanity checks and record its transaction */ - mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); - mcl_audit_mbuf(mca, m, TRUE, alloc); - if (mcltrace) { - mcache_buffer_log(mca, m, m_cache(class), &mb_start); - } - - if (alloc) { - mca->mca_uflags |= MB_COMP_INUSE; - } else { - mca->mca_uflags &= ~MB_COMP_INUSE; - } - - /* - * Use the shadow mbuf in the audit structure if we are - * freeing, since the contents of the actual mbuf has been - * pattern-filled by the above call to mcl_audit_mbuf(). - */ - if (!alloc && mclverify) { - ms = MCA_SAVED_MBUF_PTR(mca); - } - - /* Do the cluster sanity checks and record its transaction */ - cl = ms->m_ext.ext_buf; - clsp = slab_get(cl); - VERIFY(ms->m_flags == M_EXT && cl != NULL); - VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms)); - if (class == MC_MBUF_CL) { - VERIFY(clsp->sl_refcnt >= 1 && - clsp->sl_refcnt <= NCLPG); - } else { - VERIFY(clsp->sl_refcnt >= 1 && - clsp->sl_refcnt <= NBCLPG); - } - - if (class == MC_MBUF_16KCL) { - int k; - for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { - nsp = nsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } - } - - - mca = mcl_audit_buf2mca(cl_class, cl); - mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE); - if (mcltrace) { - mcache_buffer_log(mca, cl, m_cache(class), &mb_start); - } - - if (alloc) { - mca->mca_uflags |= MB_COMP_INUSE; - } else { - mca->mca_uflags &= ~MB_COMP_INUSE; - } - lck_mtx_unlock(mbuf_mlock); - - list = list->obj_next; - } -} - -static void -m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size, - uint64_t alloc_size, kern_return_t error) -{ - *cnt = *cnt + 1; - *ts = net_uptime(); - if (size) { - *size = alloc_size; - } - switch (error) { - case KERN_SUCCESS: - break; - case KERN_INVALID_ARGUMENT: - mb_kmem_stats[0]++; - break; - case KERN_INVALID_ADDRESS: - mb_kmem_stats[1]++; - break; - case KERN_RESOURCE_SHORTAGE: - mb_kmem_stats[2]++; - break; - case KERN_NO_SPACE: - mb_kmem_stats[3]++; - break; - case KERN_FAILURE: - mb_kmem_stats[4]++; - break; - default: - mb_kmem_stats[5]++; - break; - } -} - -static vm_offset_t -kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err) -{ - vm_offset_t addr = 0; - kern_return_t kr = KERN_SUCCESS; - - if (!physContig) { - kr = kmem_alloc(mbmap, &addr, size, - KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF); - } else { - kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff, - 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF); - } - - if (kr != KERN_SUCCESS) { - addr = 0; - } - if (err) { - *err = kr; - } - - return addr; -} - -/* - * Allocate some number of mbuf clusters and place on cluster freelist. 
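m_vm_error_stats() above timestamps each failed kernel-memory allocation and buckets the Mach return code into the small mb_kmem_stats[] histogram. The code-to-slot mapping in isolation; the enum is an illustrative stand-in for the kern_return_t constants, not their real values.

#include <assert.h>
#include <stdint.h>

/* Illustrative stand-ins for the kern_return_t values handled. */
enum kmem_error {
	E_SUCCESS,
	E_INVALID_ARGUMENT,
	E_INVALID_ADDRESS,
	E_RESOURCE_SHORTAGE,
	E_NO_SPACE,
	E_FAILURE,
	E_UNKNOWN
};

static uint32_t kmem_error_buckets[6];  /* same shape as mb_kmem_stats[] */

/*
 * Bucket a failed allocation the way m_vm_error_stats() does: success is
 * recorded nowhere, and anything unrecognized lands in the last slot.
 */
static void
bucket_kmem_error(enum kmem_error e)
{
	switch (e) {
	case E_SUCCESS:           break;
	case E_INVALID_ARGUMENT:  kmem_error_buckets[0]++; break;
	case E_INVALID_ADDRESS:   kmem_error_buckets[1]++; break;
	case E_RESOURCE_SHORTAGE: kmem_error_buckets[2]++; break;
	case E_NO_SPACE:          kmem_error_buckets[3]++; break;
	case E_FAILURE:           kmem_error_buckets[4]++; break;
	default:                  kmem_error_buckets[5]++; break;
	}
}

int
main(void)
{
	bucket_kmem_error(E_NO_SPACE);
	assert(kmem_error_buckets[3] == 1);
	return 0;
}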
- */ -static int -m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) -{ - int i, count = 0; - vm_size_t size = 0; - int numpages = 0, large_buffer; - vm_offset_t page = 0; - mcache_audit_t *mca_list = NULL; - mcache_obj_t *con_list = NULL; - mcl_slab_t *sp; - mbuf_class_t class; - kern_return_t error; - - /* Set if a buffer allocation needs allocation of multiple pages */ - large_buffer = ((bufsize == m_maxsize(MC_16KCL)) && - PAGE_SIZE < M16KCLBYTES); - VERIFY(bufsize == m_maxsize(MC_BIGCL) || - bufsize == m_maxsize(MC_16KCL)); - - VERIFY((bufsize == PAGE_SIZE) || - (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL))); - - if (bufsize == m_size(MC_BIGCL)) { - class = MC_BIGCL; - } else { - class = MC_16KCL; - } - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - /* - * Multiple threads may attempt to populate the cluster map one - * after another. Since we drop the lock below prior to acquiring - * the physical page(s), our view of the cluster map may no longer - * be accurate, and we could end up over-committing the pages beyond - * the maximum allowed for each class. To prevent it, this entire - * operation (including the page mapping) is serialized. - */ - while (mb_clalloc_busy) { - mb_clalloc_waiters++; - (void) msleep(mb_clalloc_waitchan, mbuf_mlock, - (PZERO - 1), "m_clalloc", NULL); - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - } - - /* We are busy now; tell everyone else to go away */ - mb_clalloc_busy = TRUE; - - /* - * Honor the caller's wish to block or not block. We have a way - * to grow the pool asynchronously using the mbuf worker thread. - */ - i = m_howmany(num, bufsize); - if (i <= 0 || (wait & M_DONTWAIT)) { - goto out; - } - - lck_mtx_unlock(mbuf_mlock); - - size = round_page(i * bufsize); - page = kmem_mb_alloc(mb_map, size, large_buffer, &error); - - /* - * If we did ask for "n" 16KB physically contiguous chunks - * and didn't get them, then please try again without this - * restriction. - */ - net_update_uptime(); - if (large_buffer && page == 0) { - m_vm_error_stats(&mb_kmem_contig_failed, - &mb_kmem_contig_failed_ts, - &mb_kmem_contig_failed_size, - size, error); - page = kmem_mb_alloc(mb_map, size, 0, &error); - } - - if (page == 0) { - m_vm_error_stats(&mb_kmem_failed, - &mb_kmem_failed_ts, - &mb_kmem_failed_size, - size, error); -#if PAGE_SIZE == 4096 - if (bufsize == m_maxsize(MC_BIGCL)) { -#else - if (bufsize >= m_maxsize(MC_BIGCL)) { -#endif - /* Try for 1 page if failed */ - size = PAGE_SIZE; - page = kmem_mb_alloc(mb_map, size, 0, &error); - if (page == 0) { - m_vm_error_stats(&mb_kmem_one_failed, - &mb_kmem_one_failed_ts, - NULL, size, error); - } - } - - if (page == 0) { - lck_mtx_lock(mbuf_mlock); - goto out; - } - } - - VERIFY(IS_P2ALIGNED(page, PAGE_SIZE)); - numpages = size / PAGE_SIZE; - - /* If auditing is enabled, allocate the audit structures now */ - if (mclaudit != NULL) { - int needed; - - /* - * Yes, I realize this is a waste of memory for clusters - * that never get transformed into mbufs, as we may end - * up with NMBPG-1 unused audit structures per cluster. - * But doing so tremendously simplifies the allocation - * strategy, since at this point we are not holding the - * mbuf lock and the caller is okay to be blocked. 
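m_clalloc() above degrades gracefully when the VM cannot satisfy a request: a physically contiguous grab for 16 KB clusters is retried without the contiguity restriction, and a failed multi-page request falls back to a single page before the routine gives up. The ladder in isolation; try_alloc_fn, SKETCH_PAGE_SIZE and the stub backend are assumptions made for the sketch, not kernel interfaces.

#include <stdbool.h>
#include <stddef.h>

#define SKETCH_PAGE_SIZE 4096u  /* illustrative; the kernel uses PAGE_SIZE */

/*
 * Attempt an allocation of 'size' bytes, optionally physically contiguous.
 * Returns a nonzero address on success and 0 on failure, as kmem_mb_alloc()
 * does after normalizing the kern_return_t.
 */
typedef unsigned long (*try_alloc_fn)(size_t size, bool contiguous);

/*
 * Mirror m_clalloc()'s retreat: contiguous -> non-contiguous -> one page.
 * '*size' is updated so the caller knows how much was actually requested.
 */
static unsigned long
alloc_with_fallback(try_alloc_fn try_alloc, size_t *size, bool want_contig)
{
	unsigned long addr = try_alloc(*size, want_contig);

	if (addr == 0 && want_contig) {
		/* Retry without the physical-contiguity restriction. */
		addr = try_alloc(*size, false);
	}
	if (addr == 0 && *size > SKETCH_PAGE_SIZE) {
		/* Last resort: settle for a single page. */
		*size = SKETCH_PAGE_SIZE;
		addr = try_alloc(*size, false);
	}
	return addr;
}

/* Stub backend: pretend only non-contiguous memory is available. */
static unsigned long
fake_backend(size_t size, bool contiguous)
{
	(void)size;
	return contiguous ? 0UL : 0x1000UL;
}

int
main(void)
{
	size_t size = 4 * SKETCH_PAGE_SIZE;

	return alloc_with_fallback(fake_backend, &size, true) != 0 ? 0 : 1;
}

In the kernel each rung of the ladder also records the failure through m_vm_error_stats(), which the sketch leaves out.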
- */ - if (bufsize == PAGE_SIZE) { - needed = numpages * NMBPG; - - i = mcache_alloc_ext(mcl_audit_con_cache, - &con_list, needed, MCR_SLEEP); - - VERIFY(con_list != NULL && i == needed); - } else { - /* - * if multiple 4K pages are being used for a - * 16K cluster - */ - needed = numpages / NSLABSP16KB; - } - - i = mcache_alloc_ext(mcache_audit_cache, - (mcache_obj_t **)&mca_list, needed, MCR_SLEEP); - - VERIFY(mca_list != NULL && i == needed); - } - - lck_mtx_lock(mbuf_mlock); - - for (i = 0; i < numpages; i++, page += PAGE_SIZE) { - ppnum_t offset = - ((unsigned char *)page - mbutl) >> PAGE_SHIFT; - ppnum_t new_page = pmap_find_phys(kernel_pmap, page); - - /* - * If there is a mapper the appropriate I/O page is - * returned; zero out the page to discard its past - * contents to prevent exposing leftover kernel memory. - */ - VERIFY(offset < mcl_pages); - if (mcl_paddr_base != 0) { - bzero((void *)(uintptr_t) page, PAGE_SIZE); - new_page = IOMapperInsertPage(mcl_paddr_base, - offset, new_page); - } - mcl_paddr[offset] = new_page; - - /* Pattern-fill this fresh page */ - if (mclverify) { - mcache_set_pattern(MCACHE_FREE_PATTERN, - (caddr_t)page, PAGE_SIZE); - } - if (bufsize == PAGE_SIZE) { - mcache_obj_t *buf; - /* One for the entire page */ - sp = slab_get((void *)page); - if (mclaudit != NULL) { - mcl_audit_init((void *)page, - &mca_list, &con_list, - AUDIT_CONTENTS_SIZE, NMBPG); - } - VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); - slab_init(sp, class, SLF_MAPPED, (void *)page, - (void *)page, PAGE_SIZE, 0, 1); - buf = (mcache_obj_t *)page; - buf->obj_next = NULL; - - /* Insert this slab */ - slab_insert(sp, class); - - /* Update stats now since slab_get drops the lock */ - ++m_infree(class); - ++m_total(class); - VERIFY(m_total(class) <= m_maxlimit(class)); - if (class == MC_BIGCL) { - mbstat.m_bigclfree = m_infree(MC_BIGCL) + - m_infree(MC_MBUF_BIGCL); - mbstat.m_bigclusters = m_total(MC_BIGCL); - } - ++count; - } else if ((bufsize > PAGE_SIZE) && - (i % NSLABSP16KB) == 0) { - union m16kcluster *m16kcl = (union m16kcluster *)page; - mcl_slab_t *nsp; - int k; - - /* One for the entire 16KB */ - sp = slab_get(m16kcl); - if (mclaudit != NULL) { - mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1); - } - - VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); - slab_init(sp, MC_16KCL, SLF_MAPPED, - m16kcl, m16kcl, bufsize, 0, 1); - m16kcl->m16kcl_next = NULL; - - /* - * 2nd-Nth page's slab is part of the first one, - * where N is NSLABSP16KB. 
- */ - for (k = 1; k < NSLABSP16KB; k++) { - nsp = slab_get(((union mbigcluster *)page) + k); - VERIFY(nsp->sl_refcnt == 0 && - nsp->sl_flags == 0); - slab_init(nsp, MC_16KCL, - SLF_MAPPED | SLF_PARTIAL, - m16kcl, NULL, 0, 0, 0); - } - /* Insert this slab */ - slab_insert(sp, MC_16KCL); - - /* Update stats now since slab_get drops the lock */ - ++m_infree(MC_16KCL); - ++m_total(MC_16KCL); - VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); - ++count; - } - } - VERIFY(mca_list == NULL && con_list == NULL); - - /* We're done; let others enter */ - mb_clalloc_busy = FALSE; - if (mb_clalloc_waiters > 0) { - mb_clalloc_waiters = 0; - wakeup(mb_clalloc_waitchan); - } - - return count; -out: - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - mtracelarge_register(size); - - /* We're done; let others enter */ - mb_clalloc_busy = FALSE; - if (mb_clalloc_waiters > 0) { - mb_clalloc_waiters = 0; - wakeup(mb_clalloc_waitchan); - } - - /* - * When non-blocking we kick a thread if we have to grow the - * pool or if the number of free clusters is less than requested. - */ - if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) { - mbwdog_logger("waking up the worker thread to to grow %s by %d", - m_cname(class), i); - wakeup((caddr_t)&mbuf_worker_needs_wakeup); - mbuf_worker_needs_wakeup = FALSE; - } - if (class == MC_BIGCL) { - if (i > 0) { - /* - * Remember total number of 4KB clusters needed - * at this time. - */ - i += m_total(MC_BIGCL); - if (i > m_region_expand(MC_BIGCL)) { - m_region_expand(MC_BIGCL) = i; - } - } - if (m_infree(MC_BIGCL) >= num) { - return 1; - } - } else { - if (i > 0) { - /* - * Remember total number of 16KB clusters needed - * at this time. - */ - i += m_total(MC_16KCL); - if (i > m_region_expand(MC_16KCL)) { - m_region_expand(MC_16KCL) = i; - } - } - if (m_infree(MC_16KCL) >= num) { - return 1; - } - } - return 0; -} - -/* - * Populate the global freelist of the corresponding buffer class. - */ -static int -freelist_populate(mbuf_class_t class, unsigned int num, int wait) -{ - mcache_obj_t *o = NULL; - int i, numpages = 0, count; - mbuf_class_t super_class; - - VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL || - class == MC_16KCL); - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) || - PAGE_SIZE == m_maxsize(MC_16KCL)); - - if (m_maxsize(class) >= PAGE_SIZE) { - return m_clalloc(num, wait, m_maxsize(class)) != 0; - } - - /* - * The rest of the function will allocate pages and will slice - * them up into the right size - */ - - numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE; - - /* Currently assume that pages are 4K or 16K */ - if (PAGE_SIZE == m_maxsize(MC_BIGCL)) { - super_class = MC_BIGCL; - } else { - super_class = MC_16KCL; - } - - i = m_clalloc(numpages, wait, m_maxsize(super_class)); - - /* how many objects will we cut the page into? 
*/ - int numobj = PAGE_SIZE / m_maxsize(class); - - for (count = 0; count < numpages; count++) { - /* respect totals, minlimit, maxlimit */ - if (m_total(super_class) <= m_minlimit(super_class) || - m_total(class) >= m_maxlimit(class)) { - break; - } - - if ((o = slab_alloc(super_class, wait)) == NULL) { - break; - } - - struct mbuf *m = (struct mbuf *)o; - union mcluster *c = (union mcluster *)o; - union mbigcluster *mbc = (union mbigcluster *)o; - mcl_slab_t *sp = slab_get(o); - mcache_audit_t *mca = NULL; - - /* - * since one full page will be converted to MC_MBUF or - * MC_CL, verify that the reference count will match that - * assumption - */ - VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp)); - VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); - /* - * Make sure that the cluster is unmolested - * while in freelist - */ - if (mclverify) { - mca = mcl_audit_buf2mca(super_class, - (mcache_obj_t *)o); - mcache_audit_free_verify(mca, - (mcache_obj_t *)o, 0, m_maxsize(super_class)); - } - - /* Reinitialize it as an mbuf or 2K or 4K slab */ - slab_init(sp, class, sp->sl_flags, - sp->sl_base, NULL, PAGE_SIZE, 0, numobj); - - VERIFY(sp->sl_head == NULL); - - VERIFY(m_total(super_class) >= 1); - m_total(super_class)--; - - if (super_class == MC_BIGCL) { - mbstat.m_bigclusters = m_total(MC_BIGCL); - } - - m_total(class) += numobj; - VERIFY(m_total(class) <= m_maxlimit(class)); - m_infree(class) += numobj; - - i = numobj; - if (class == MC_MBUF) { - mbstat.m_mbufs = m_total(MC_MBUF); - mtype_stat_add(MT_FREE, NMBPG); - while (i--) { - /* - * If auditing is enabled, construct the - * shadow mbuf in the audit structure - * instead of the actual one. - * mbuf_slab_audit() will take care of - * restoring the contents after the - * integrity check. - */ - if (mclaudit != NULL) { - struct mbuf *ms; - mca = mcl_audit_buf2mca(MC_MBUF, - (mcache_obj_t *)m); - ms = MCA_SAVED_MBUF_PTR(mca); - ms->m_type = MT_FREE; - } else { - m->m_type = MT_FREE; - } - m->m_next = sp->sl_head; - sp->sl_head = (void *)m++; - } - } else if (class == MC_CL) { /* MC_CL */ - mbstat.m_clfree = - m_infree(MC_CL) + m_infree(MC_MBUF_CL); - mbstat.m_clusters = m_total(MC_CL); - while (i--) { - c->mcl_next = sp->sl_head; - sp->sl_head = (void *)c++; - } - } else { - VERIFY(class == MC_BIGCL); - mbstat.m_bigclusters = m_total(MC_BIGCL); - mbstat.m_bigclfree = m_infree(MC_BIGCL) + - m_infree(MC_MBUF_BIGCL); - while (i--) { - mbc->mbc_next = sp->sl_head; - sp->sl_head = (void *)mbc++; - } - } - - /* Insert into the mbuf or 2k or 4k slab list */ - slab_insert(sp, class); - - if ((i = mb_waiters) > 0) { - mb_waiters = 0; - } - if (i != 0) { - mbwdog_logger("waking up all threads"); - wakeup(mb_waitchan); - } - } - return count != 0; -} - -/* - * For each class, initialize the freelist to hold m_minlimit() objects. - */ -static void -freelist_init(mbuf_class_t class) -{ - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - VERIFY(class == MC_CL || class == MC_BIGCL); - VERIFY(m_total(class) == 0); - VERIFY(m_minlimit(class) > 0); - - while (m_total(class) < m_minlimit(class)) { - (void) freelist_populate(class, m_minlimit(class), M_WAIT); - } - - VERIFY(m_total(class) >= m_minlimit(class)); -} - -/* - * (Inaccurately) check if it might be worth a trip back to the - * mcache layer due the availability of objects there. We'll - * end up back here if there's nothing up there. 
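freelist_populate() above converts whole pages owned by the 'super class' into smaller objects: the request is first rounded up to whole pages, then each page is divided into as many objects as fit. The two size computations, isolated; the page and object sizes are passed in as example parameters here rather than taken from the kernel configuration.

#include <stdio.h>

/*
 * Pages needed to hold 'num' objects of 'objsize' bytes each: a ceiling
 * division, matching (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE.
 */
static unsigned int
pages_for_objects(unsigned int num, unsigned int objsize, unsigned int pagesz)
{
	return (num * objsize + pagesz - 1) / pagesz;
}

/* Objects carved out of one page: PAGE_SIZE / m_maxsize(class). */
static unsigned int
objects_per_page(unsigned int objsize, unsigned int pagesz)
{
	return pagesz / objsize;
}

int
main(void)
{
	/* e.g. 100 objects of 256 bytes on 4 KB pages -> 7 pages, 16 per page */
	printf("pages=%u per_page=%u\n",
	    pages_for_objects(100, 256, 4096), objects_per_page(256, 4096));
	return 0;
}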
- */ -static boolean_t -mbuf_cached_above(mbuf_class_t class, int wait) -{ - switch (class) { - case MC_MBUF: - if (wait & MCR_COMP) { - return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) || - !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)); - } - break; - - case MC_CL: - if (wait & MCR_COMP) { - return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)); - } - break; - - case MC_BIGCL: - if (wait & MCR_COMP) { - return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)); - } - break; - - case MC_16KCL: - if (wait & MCR_COMP) { - return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)); - } - break; - - case MC_MBUF_CL: - case MC_MBUF_BIGCL: - case MC_MBUF_16KCL: - break; - - default: - VERIFY(0); - /* NOTREACHED */ - } - - return !mcache_bkt_isempty(m_cache(class)); -} - -/* - * If possible, convert constructed objects to raw ones. - */ -static boolean_t -mbuf_steal(mbuf_class_t class, unsigned int num) -{ - mcache_obj_t *top = NULL; - mcache_obj_t **list = ⊤ - unsigned int tot = 0; - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - switch (class) { - case MC_MBUF: - case MC_CL: - case MC_BIGCL: - case MC_16KCL: - return FALSE; - - case MC_MBUF_CL: - case MC_MBUF_BIGCL: - case MC_MBUF_16KCL: - /* Get the required number of constructed objects if possible */ - if (m_infree(class) > m_minlimit(class)) { - tot = cslab_alloc(class, &list, - MIN(num, m_infree(class))); - } - - /* And destroy them to get back the raw objects */ - if (top != NULL) { - (void) cslab_free(class, top, 1); - } - break; - - default: - VERIFY(0); - /* NOTREACHED */ - } - - return tot == num; -} - -static void -m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) -{ - int m, bmap = 0; - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); - VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); - VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); - - /* - * This logic can be made smarter; for now, simply mark - * all other related classes as potential victims. - */ - switch (class) { - case MC_MBUF: - m_wantpurge(MC_CL)++; - m_wantpurge(MC_BIGCL)++; - m_wantpurge(MC_MBUF_CL)++; - m_wantpurge(MC_MBUF_BIGCL)++; - break; - - case MC_CL: - m_wantpurge(MC_MBUF)++; - m_wantpurge(MC_BIGCL)++; - m_wantpurge(MC_MBUF_BIGCL)++; - if (!comp) { - m_wantpurge(MC_MBUF_CL)++; - } - break; - - case MC_BIGCL: - m_wantpurge(MC_MBUF)++; - m_wantpurge(MC_CL)++; - m_wantpurge(MC_MBUF_CL)++; - if (!comp) { - m_wantpurge(MC_MBUF_BIGCL)++; - } - break; - - case MC_16KCL: - if (!comp) { - m_wantpurge(MC_MBUF_16KCL)++; - } - break; - - default: - VERIFY(0); - /* NOTREACHED */ - } - - /* - * Run through each marked class and check if we really need to - * purge (and therefore temporarily disable) the per-CPU caches - * layer used by the class. If so, remember the classes since - * we are going to drop the lock below prior to purging. - */ - for (m = 0; m < NELEM(mbuf_table); m++) { - if (m_wantpurge(m) > 0) { - m_wantpurge(m) = 0; - /* - * Try hard to steal the required number of objects - * from the freelist of other mbuf classes. Only - * purge and disable the per-CPU caches layer when - * we don't have enough; it's the last resort. 
- */ - if (!mbuf_steal(m, num)) { - bmap |= (1 << m); - } - } - } - - lck_mtx_unlock(mbuf_mlock); - - if (bmap != 0) { - /* signal the domains to drain */ - net_drain_domains(); - - /* Sigh; we have no other choices but to ask mcache to purge */ - for (m = 0; m < NELEM(mbuf_table); m++) { - if ((bmap & (1 << m)) && - mcache_purge_cache(m_cache(m), TRUE)) { - lck_mtx_lock(mbuf_mlock); - m_purge_cnt(m)++; - mbstat.m_drain++; - lck_mtx_unlock(mbuf_mlock); - } - } - } else { - /* - * Request mcache to reap extra elements from all of its caches; - * note that all reaps are serialized and happen only at a fixed - * interval. - */ - mcache_reap(); - } - lck_mtx_lock(mbuf_mlock); -} -#endif /* CONFIG_MBUF_MCACHE */ - static inline struct mbuf * m_get_common(int wait, short type, int hdr) { struct mbuf *m; -#if CONFIG_MBUF_MCACHE - int mcflags = MSLEEPF(wait); - - /* Is this due to a non-blocking retry? If so, then try harder */ - if (mcflags & MCR_NOSLEEP) { - mcflags |= MCR_TRYHARD; - } - - m = mcache_alloc(m_cache(MC_MBUF), mcflags); -#else m = mz_alloc(wait); -#endif /* CONFIG_MBUF_MCACHE */ if (m != NULL) { - MBUF_INIT(m, hdr, type); + mbuf_init(m, hdr, type); mtype_stat_inc(type); mtype_stat_dec(MT_FREE); } return m; } +#endif /* !CONFIG_MBUF_MCACHE */ /* * Space allocation routines; these are also available as macros @@ -4433,7 +1577,10 @@ m_getclr(int wait, int type) return m; } -static int +#if !CONFIG_MBUF_MCACHE +static +#endif +int m_free_paired(struct mbuf *m) { VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED)); @@ -4500,6 +1647,7 @@ m_free_paired(struct mbuf *m) return 0; } +#if !CONFIG_MBUF_MCACHE struct mbuf * m_free(struct mbuf *m) { @@ -4532,21 +1680,6 @@ m_free(struct mbuf *m) const uint16_t refcnt = m_decref(m); if (refcnt == minref && !composite) { -#if CONFIG_MBUF_MCACHE - if (m_free_func == NULL) { - mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); - } else if (m_free_func == m_bigfree) { - mcache_free(m_cache(MC_BIGCL), - m->m_ext.ext_buf); - } else if (m_free_func == m_16kfree) { - mcache_free(m_cache(MC_16KCL), - m->m_ext.ext_buf); - } else { - (*m_free_func)(m->m_ext.ext_buf, - m->m_ext.ext_size, m_get_ext_arg(m)); - } - mcache_free(ref_cache, m_get_rfa(m)); -#else if (m_free_func == NULL) { mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf); } else if (m_free_func == m_bigfree) { @@ -4558,7 +1691,6 @@ m_free(struct mbuf *m) m->m_ext.ext_size, m_get_ext_arg(m)); } mz_ref_free(m_get_rfa(m)); -#endif /* CONFIG_MBUF_MCACHE */ m_set_ext(m, NULL, NULL, NULL); } else if (refcnt == minref && composite) { VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED)); @@ -4577,17 +1709,6 @@ m_free(struct mbuf *m) */ MEXT_FLAGS(m) &= ~EXTF_READONLY; -#if CONFIG_MBUF_MCACHE - /* "Free" into the intermediate cache */ - if (m_free_func == NULL) { - mcache_free(m_cache(MC_MBUF_CL), m); - } else if (m_free_func == m_bigfree) { - mcache_free(m_cache(MC_MBUF_BIGCL), m); - } else { - VERIFY(m_free_func == m_16kfree); - mcache_free(m_cache(MC_MBUF_16KCL), m); - } -#else /* "Free" into the intermediate cache */ if (m_free_func == NULL) { mz_composite_free(MC_MBUF_CL, m); @@ -4597,7 +1718,6 @@ m_free(struct mbuf *m) VERIFY(m_free_func == m_16kfree); mz_composite_free(MC_MBUF_16KCL, m); } -#endif /* CONFIG_MBUF_MCACHE */ return n; } } @@ -4609,11 +1729,7 @@ m_free(struct mbuf *m) m->m_flags = m->m_len = 0; m->m_next = m->m_nextpkt = NULL; -#if CONFIG_MBUF_MCACHE - mcache_free(m_cache(MC_MBUF), m); -#else mz_free(m); -#endif /* CONFIG_MBUF_MCACHE */ return n; } @@ -4649,20 +1765,6 @@ m_clattach(struct mbuf 
*m, int type, caddr_t extbuf __sized_by(extsize), const uint16_t refcnt = m_decref(m); if (refcnt == minref && !composite) { -#if CONFIG_MBUF_MCACHE - if (m_free_func == NULL) { - mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); - } else if (m_free_func == m_bigfree) { - mcache_free(m_cache(MC_BIGCL), - m->m_ext.ext_buf); - } else if (m_free_func == m_16kfree) { - mcache_free(m_cache(MC_16KCL), - m->m_ext.ext_buf); - } else { - (*m_free_func)(m->m_ext.ext_buf, - m->m_ext.ext_size, m_get_ext_arg(m)); - } -#else if (m_free_func == NULL) { mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf); } else if (m_free_func == m_bigfree) { @@ -4673,7 +1775,6 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf __sized_by(extsize), (*m_free_func)(m->m_ext.ext_buf, m->m_ext.ext_size, m_get_ext_arg(m)); } -#endif /* CONFIG_MBUF_MCACHE */ /* Re-use the reference structure */ rfa = m_get_rfa(m); } else if (refcnt == minref && composite) { @@ -4695,16 +1796,6 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf __sized_by(extsize), MEXT_FLAGS(m) &= ~EXTF_READONLY; /* "Free" into the intermediate cache */ -#if CONFIG_MBUF_MCACHE - if (m_free_func == NULL) { - mcache_free(m_cache(MC_MBUF_CL), m); - } else if (m_free_func == m_bigfree) { - mcache_free(m_cache(MC_MBUF_BIGCL), m); - } else { - VERIFY(m_free_func == m_16kfree); - mcache_free(m_cache(MC_MBUF_16KCL), m); - } -#else if (m_free_func == NULL) { mz_composite_free(MC_MBUF_CL, m); } else if (m_free_func == m_bigfree) { @@ -4713,7 +1804,6 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf __sized_by(extsize), VERIFY(m_free_func == m_16kfree); mz_composite_free(MC_MBUF_16KCL, m); } -#endif /* CONFIG_MBUF_MCACHE */ /* * Allocate a new mbuf, since we didn't divorce * the composite mbuf + cluster pair above. @@ -4724,25 +1814,17 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf __sized_by(extsize), } } -#if CONFIG_MBUF_MCACHE - if (rfa == NULL && - (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { - m_free(m); - return NULL; - } -#else if (rfa == NULL && (rfa = mz_ref_alloc(wait)) == NULL) { m_free(m); return NULL; } -#endif /* CONFIG_MBUF_MCACHE */ if (!pair) { - MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, + mext_init(m, extbuf, extsize, extfree, extarg, rfa, 0, 1, 0, 0, 0, NULL); } else { - MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa, + mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa, 1, 1, 1, EXTF_PAIRED, 0, m); } @@ -4759,18 +1841,7 @@ m_getcl(int wait, int type, int flags) struct mbuf *m = NULL; int hdr = (flags & M_PKTHDR); -#if CONFIG_MBUF_MCACHE - int mcflags = MSLEEPF(wait); - - /* Is this due to a non-blocking retry? 
If so, then try harder */ - if (mcflags & MCR_NOSLEEP) { - mcflags |= MCR_TRYHARD; - } - - m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); -#else m = mz_composite_alloc(MC_MBUF_CL, wait); -#endif /* CONFIG_MBUF_MCACHE */ if (m != NULL) { u_int16_t flag; struct ext_ref *rfa; @@ -4785,7 +1856,7 @@ m_getcl(int wait, int type, int flags) flag = MEXT_FLAGS(m); - MBUF_INIT(m, hdr, type); + mbuf_init(m, hdr, type); MBUF_CL_INIT(m, cl, rfa, 1, flag); mtype_stat_inc(type); @@ -4801,15 +1872,10 @@ m_mclget(struct mbuf *m, int wait) struct ext_ref *rfa = NULL; char *bytes = NULL; -#if CONFIG_MBUF_MCACHE - if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { - return m; - } -#else if ((rfa = mz_ref_alloc(wait)) == NULL) { return m; } -#endif /* CONFIG_MBUF_MCACHE */ + if ((bytes = m_mclalloc(wait)) != NULL) { m->m_ext.ext_size = MCLBYTES; m->m_ext.ext_buf = bytes; @@ -4817,11 +1883,7 @@ m_mclget(struct mbuf *m, int wait) } else { m->m_ext.ext_size = 0; m->m_ext.ext_buf = NULL; -#if CONFIG_MBUF_MCACHE - mcache_free(ref_cache, rfa); -#else mz_ref_free(rfa); -#endif /* CONFIG_MBUF_MCACHE */ } return m; @@ -4832,30 +1894,16 @@ char * __sized_by_or_null(MCLBYTES) m_mclalloc(int wait) { -#if CONFIG_MBUF_MCACHE - int mcflags = MSLEEPF(wait); - - /* Is this due to a non-blocking retry? If so, then try harder */ - if (mcflags & MCR_NOSLEEP) { - mcflags |= MCR_TRYHARD; - } - - return mcache_alloc(m_cache(MC_CL), mcflags); -#else return mz_cl_alloc(ZONE_ID_CLUSTER_2K, wait); -#endif /* CONFIG_MBUF_MCACHE */ } /* Free an mbuf cluster */ void m_mclfree(caddr_t p) { -#if CONFIG_MBUF_MCACHE - mcache_free(m_cache(MC_CL), p); -#else mz_cl_free(ZONE_ID_CLUSTER_2K, p); -#endif /* CONFIG_MBUF_MCACHE */ } +#endif /* !CONFIG_MBUF_MCACHE */ /* * mcl_hasreference() checks if a cluster of an mbuf is referenced by @@ -4873,32 +1921,18 @@ m_mclhasreference(struct mbuf *m) return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0; } +#if !CONFIG_MBUF_MCACHE __private_extern__ char * __sized_by_or_null(MBIGCLBYTES) m_bigalloc(int wait) { -#if CONFIG_MBUF_MCACHE - int mcflags = MSLEEPF(wait); - - /* Is this due to a non-blocking retry? If so, then try harder */ - if (mcflags & MCR_NOSLEEP) { - mcflags |= MCR_TRYHARD; - } - - return mcache_alloc(m_cache(MC_BIGCL), mcflags); -#else return mz_cl_alloc(ZONE_ID_CLUSTER_4K, wait); -#endif /* CONFIG_MBUF_MCACHE */ } __private_extern__ void m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg) { -#if CONFIG_MBUF_MCACHE - mcache_free(m_cache(MC_BIGCL), p); -#else mz_cl_free(ZONE_ID_CLUSTER_4K, p); -#endif /* CONFIG_MBUF_MCACHE */ } /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */ @@ -4908,15 +1942,10 @@ m_mbigget(struct mbuf *m, int wait) struct ext_ref *rfa = NULL; void * bytes = NULL; -#if CONFIG_MBUF_MCACHE - if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { - return m; - } -#else if ((rfa = mz_ref_alloc(wait)) == NULL) { return m; } -#endif /* CONFIG_MBUF_MCACHE */ + if ((bytes = m_bigalloc(wait)) != NULL) { m->m_ext.ext_size = MBIGCLBYTES; m->m_ext.ext_buf = bytes; @@ -4924,11 +1953,7 @@ m_mbigget(struct mbuf *m, int wait) } else { m->m_ext.ext_size = 0; m->m_ext.ext_buf = NULL; -#if CONFIG_MBUF_MCACHE - mcache_free(ref_cache, rfa); -#else mz_ref_free(rfa); -#endif /* CONFIG_MBUF_MCACHE */ } return m; @@ -4938,28 +1963,13 @@ __private_extern__ char * __sized_by_or_null(M16KCLBYTES) m_16kalloc(int wait) { -#if CONFIG_MBUF_MCACHE - int mcflags = MSLEEPF(wait); - - /* Is this due to a non-blocking retry? 
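m_mclget() and m_mbigget() above (and m_m16kget() just after) share one shape: allocate the ext_ref first, then the cluster, and release the ref again if the cluster allocation fails so nothing leaks. A minimal standalone rendering of that acquire/rollback ordering; struct ref, struct node and attach_ext_buffer() are hypothetical, with malloc/free standing in for the zone allocators.

#include <stdlib.h>

/* Illustrative stand-ins for struct ext_ref and an mbuf carrying a cluster. */
struct ref  { int refcnt; };
struct node { struct ref *rfa; void *buf; size_t bufsize; };

/*
 * Attach a 'size'-byte external buffer to 'n'.  The reference structure is
 * allocated first; if the buffer allocation fails, the ref is freed and the
 * node is left untouched, mirroring the error ordering in m_mclget().
 */
static int
attach_ext_buffer(struct node *n, size_t size)
{
	struct ref *rfa = calloc(1, sizeof(*rfa));
	void *buf;

	if (rfa == NULL) {
		return -1;
	}
	buf = malloc(size);
	if (buf == NULL) {
		free(rfa);              /* roll back the ref allocation */
		return -1;
	}
	n->rfa = rfa;
	n->buf = buf;
	n->bufsize = size;
	return 0;
}

int
main(void)
{
	struct node n = { NULL, NULL, 0 };

	if (attach_ext_buffer(&n, 2048) == 0) {
		free(n.buf);
		free(n.rfa);
	}
	return 0;
}

The kernel variant differs in one respect: on failure it still returns the mbuf, with ext_buf cleared, rather than reporting an error code.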
If so, then try harder */ - if (mcflags & MCR_NOSLEEP) { - mcflags |= MCR_TRYHARD; - } - - return mcache_alloc(m_cache(MC_16KCL), mcflags); -#else return mz_cl_alloc(ZONE_ID_CLUSTER_16K, wait); -#endif /* CONFIG_MBUF_MCACHE */ } __private_extern__ void m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg) { -#if CONFIG_MBUF_MCACHE - mcache_free(m_cache(MC_16KCL), p); -#else mz_cl_free(ZONE_ID_CLUSTER_16K, p); -#endif /* CONFIG_MBUF_MCACHE */ } /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */ @@ -4969,15 +1979,10 @@ m_m16kget(struct mbuf *m, int wait) struct ext_ref *rfa = NULL; void *bytes = NULL; -#if CONFIG_MBUF_MCACHE - if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { - return m; - } -#else if ((rfa = mz_ref_alloc(wait)) == NULL) { return m; } -#endif /* CONFIG_MBUF_MCACHE */ + if ((bytes = m_16kalloc(wait)) != NULL) { m->m_ext.ext_size = M16KCLBYTES; m->m_ext.ext_buf = bytes; @@ -4985,15 +1990,12 @@ m_m16kget(struct mbuf *m, int wait) } else { m->m_ext.ext_size = 0; m->m_ext.ext_buf = NULL; -#if CONFIG_MBUF_MCACHE - mcache_free(ref_cache, rfa); -#else mz_ref_free(rfa); -#endif /* CONFIG_MBUF_MCACHE */ } return m; } +#endif /* !CONFIG_MBUF_MCACHE */ /* * "Move" mbuf pkthdr from "from" to "to". @@ -5100,6 +2102,7 @@ m_copy_classifier(struct mbuf *to, struct mbuf *from) to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo; } +#if !CONFIG_MBUF_MCACHE /* * Return a list of mbuf hdrs that point to clusters. Try for num_needed; * if wantall is not set, return whatever number were available. Set up the @@ -5115,14 +2118,8 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, mbuf_ref_t m = NULL; mbuf_ref_t *np, top; unsigned int pnum, needed = *num_needed; -#if CONFIG_MBUF_MCACHE - mcache_obj_t *mp_list = NULL; - int mcflags = MSLEEPF(wait); - mcache_t *cp; -#else zstack_t mp_list = {}; mbuf_class_t class = MC_MBUF_CL; -#endif /* CONFIG_MBUF_MCACHE */ u_int16_t flag; struct ext_ref *rfa; void *cl; @@ -5131,12 +2128,6 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL)); - /* - * Caller must first check for njcl because this - * routine is internal and not exposed/used via KPI. - */ - VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0); - top = NULL; np = ⊤ pnum = 0; @@ -5147,21 +2138,6 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, * overrides MCR_SLEEP, since this thread will not go to sleep * if we can't get all the buffers. 
*/ -#if CONFIG_MBUF_MCACHE - if (!wantall || (mcflags & MCR_NOSLEEP)) { - mcflags |= MCR_TRYHARD; - } - - /* Allocate the composite mbuf + cluster elements from the cache */ - if (bufsize == m_maxsize(MC_CL)) { - cp = m_cache(MC_MBUF_CL); - } else if (bufsize == m_maxsize(MC_BIGCL)) { - cp = m_cache(MC_MBUF_BIGCL); - } else { - cp = m_cache(MC_MBUF_16KCL); - } - needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags); -#else if (!wantall || (wait & Z_NOWAIT)) { wait &= ~Z_NOWAIT; wait |= Z_NOPAGEWAIT; @@ -5177,15 +2153,9 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, } mp_list = mz_composite_alloc_n(class, needed, wait); needed = zstack_count(mp_list); -#endif /* CONFIG_MBUF_MCACHE */ for (pnum = 0; pnum < needed; pnum++) { -#if CONFIG_MBUF_MCACHE - m = (struct mbuf *)mp_list; - mp_list = mp_list->obj_next; -#else m = zstack_pop(&mp_list); -#endif /* CONFIG_MBUF_MCACHE */ VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); cl = m->m_ext.ext_buf; @@ -5196,7 +2166,7 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, flag = MEXT_FLAGS(m); - MBUF_INIT(m, num_with_pkthdrs, MT_DATA); + mbuf_init(m, num_with_pkthdrs, MT_DATA); if (bufsize == m_maxsize(MC_16KCL)) { MBUF_16KCL_INIT(m, cl, rfa, 1, flag); } else if (bufsize == m_maxsize(MC_BIGCL)) { @@ -5216,17 +2186,10 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, np = &m->m_next; } } -#if CONFIG_MBUF_MCACHE - ASSERT(pnum != *num_needed || mp_list == NULL); - if (mp_list != NULL) { - mcache_free_ext(cp, mp_list); - } -#else ASSERT(pnum != *num_needed || zstack_empty(mp_list)); if (!zstack_empty(mp_list)) { mz_composite_free_n(class, mp_list); } -#endif /* CONFIG_MBUF_MCACHE */ if (pnum > 0) { mtype_stat_add(MT_DATA, pnum); mtype_stat_sub(MT_FREE, pnum); @@ -5270,14 +2233,8 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, unsigned int num = 0; unsigned int nsegs = 0; unsigned int needed = 0, resid; -#if CONFIG_MBUF_MCACHE - int mcflags = MSLEEPF(wait); - mcache_obj_t *mp_list = NULL, *rmp_list = NULL; - mcache_t *cp = NULL, *rcp = NULL; -#else zstack_t mp_list = {}, rmp_list = {}; mbuf_class_t class = MC_MBUF, rclass = MC_MBUF_CL; -#endif /* CONFIG_MBUF_MCACHE */ if (*numlist == 0) { os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0"); @@ -5292,7 +2249,7 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, bufsize = packetlen; } else if (packetlen > m_maxsize(MC_CL)) { /* Use 4KB if jumbo cluster pool isn't available */ - if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) { + if (packetlen <= m_maxsize(MC_BIGCL)) { bufsize = m_maxsize(MC_BIGCL); } else { bufsize = m_maxsize(MC_16KCL); @@ -5302,7 +2259,7 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, } } else if (wantsize == m_maxsize(MC_CL) || wantsize == m_maxsize(MC_BIGCL) || - (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) { + (wantsize == m_maxsize(MC_16KCL))) { bufsize = wantsize; } else { *numlist = 0; @@ -5320,7 +2277,6 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, nsegs = 2; } } else if (bufsize == m_maxsize(MC_16KCL)) { - VERIFY(njcl > 0); nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1; } else if (bufsize == m_maxsize(MC_BIGCL)) { nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1; @@ -5343,16 +2299,10 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, * overrides MCR_SLEEP, since this thread will not go to sleep * if we can't get all the buffers. 
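The nsegs arithmetic in m_allocpacket_internal() above is a ceiling division by a power-of-two cluster size done with a shift: ((packetlen - 1) >> shift) + 1. A quick check of that identity; the shift values used below assume the usual 2 KB/4 KB/16 KB cluster sizes (shifts of 11, 12 and 14) and are not taken from this diff.

#include <assert.h>

/* Ceiling-divide 'len' by a 2^shift cluster size using the shift trick. */
static unsigned int
segments_needed(unsigned int len, unsigned int shift)
{
	return ((len - 1) >> shift) + 1;        /* assumes len > 0 */
}

int
main(void)
{
	assert(segments_needed(16384, 14) == 1);  /* exactly one 16 KB cluster */
	assert(segments_needed(16385, 14) == 2);  /* one byte over -> two */
	assert(segments_needed(9000, 12) == 3);   /* 9000 bytes in 4 KB chunks */
	assert(segments_needed(9000, 11) == 5);   /* 9000 bytes in 2 KB chunks */
	return 0;
}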
*/ -#if CONFIG_MBUF_MCACHE - if (!wantall || (mcflags & MCR_NOSLEEP)) { - mcflags |= MCR_TRYHARD; - } -#else if (!wantall || (wait & Z_NOWAIT)) { wait &= ~Z_NOWAIT; wait |= Z_NOPAGEWAIT; } -#endif /* !CONFIG_MBUF_MCACHE */ /* * Simple case where all elements in the lists/chains are mbufs. @@ -5364,15 +2314,9 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, if (bufsize <= MINCLSIZE) { /* Allocate the elements in one shot from the mbuf cache */ ASSERT(bufsize <= MHLEN || nsegs == 2); -#if CONFIG_MBUF_MCACHE - cp = m_cache(MC_MBUF); - needed = mcache_alloc_ext(cp, &mp_list, - (*numlist) * nsegs, mcflags); -#else class = MC_MBUF; mp_list = mz_alloc_n((*numlist) * nsegs, wait); needed = zstack_count(mp_list); -#endif /* CONFIG_MBUF_MCACHE */ /* * The number of elements must be even if we are to use an @@ -5388,38 +2332,24 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, while (num < needed) { mbuf_ref_t m = NULL; -#if CONFIG_MBUF_MCACHE - m = (struct mbuf *)mp_list; - mp_list = mp_list->obj_next; -#else m = zstack_pop(&mp_list); -#endif /* CONFIG_MBUF_MCACHE */ ASSERT(m != NULL); - MBUF_INIT(m, 1, MT_DATA); + mbuf_init(m, 1, MT_DATA); num++; if (bufsize > MHLEN) { /* A second mbuf for this segment chain */ -#if CONFIG_MBUF_MCACHE - m->m_next = (struct mbuf *)mp_list; - mp_list = mp_list->obj_next; -#else m->m_next = zstack_pop(&mp_list); -#endif /* CONFIG_MBUF_MCACHE */ ASSERT(m->m_next != NULL); - MBUF_INIT(m->m_next, 0, MT_DATA); + mbuf_init(m->m_next, 0, MT_DATA); num++; } *np = m; np = &m->m_nextpkt; } -#if CONFIG_MBUF_MCACHE - ASSERT(num != *numlist || mp_list == NULL); -#else ASSERT(num != *numlist || zstack_empty(mp_list)); -#endif /* CONFIG_MBUF_MCACHE */ if (num > 0) { mtype_stat_add(MT_DATA, num); @@ -5461,7 +2391,7 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, * in the chain use the same cluster size; use the * smaller of the cluster sizes. */ - if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) { + if (resid > m_maxsize(MC_BIGCL)) { r_bufsize = m_maxsize(MC_16KCL); } else if (resid > m_maxsize(MC_CL)) { r_bufsize = m_maxsize(MC_BIGCL); @@ -5482,16 +2412,6 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, * elements that can be allocated so that we know how many * segment chains we can afford to create. */ -#if CONFIG_MBUF_MCACHE - if (r_bufsize <= m_maxsize(MC_CL)) { - rcp = m_cache(MC_MBUF_CL); - } else if (r_bufsize <= m_maxsize(MC_BIGCL)) { - rcp = m_cache(MC_MBUF_BIGCL); - } else { - rcp = m_cache(MC_MBUF_16KCL); - } - needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags); -#else if (r_bufsize <= m_maxsize(MC_CL)) { rclass = MC_MBUF_CL; } else if (r_bufsize <= m_maxsize(MC_BIGCL)) { @@ -5501,7 +2421,6 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, } rmp_list = mz_composite_alloc_n(rclass, *numlist, wait); needed = zstack_count(rmp_list); -#endif /* CONFIG_MBUF_MCACHE */ if (needed == 0) { goto fail; } @@ -5515,16 +2434,6 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, * Attempt to allocate the rest of the composite mbuf + cluster * elements for the number of segment chains that we need. 
*/ -#if CONFIG_MBUF_MCACHE - if (bufsize <= m_maxsize(MC_CL)) { - cp = m_cache(MC_MBUF_CL); - } else if (bufsize <= m_maxsize(MC_BIGCL)) { - cp = m_cache(MC_MBUF_BIGCL); - } else { - cp = m_cache(MC_MBUF_16KCL); - } - needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags); -#else if (bufsize <= m_maxsize(MC_CL)) { class = MC_MBUF_CL; } else if (bufsize <= m_maxsize(MC_BIGCL)) { @@ -5534,7 +2443,6 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, } mp_list = mz_composite_alloc_n(class, needed * nsegs, wait); needed = zstack_count(mp_list); -#endif /* CONFIG_MBUF_MCACHE */ /* Round it down to avoid creating a partial segment chain */ needed = (needed / nsegs) * nsegs; @@ -5565,19 +2473,9 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, ++num; if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) { -#if CONFIG_MBUF_MCACHE - m = (struct mbuf *)mp_list; - mp_list = mp_list->obj_next; -#else m = zstack_pop(&mp_list); -#endif /* CONFIG_MBUF_MCACHE */ } else { -#if CONFIG_MBUF_MCACHE - m = (struct mbuf *)rmp_list; - rmp_list = rmp_list->obj_next; -#else m = zstack_pop(&rmp_list); -#endif /* CONFIG_MBUF_MCACHE */ } m_free_func = m_get_ext_free(m); ASSERT(m != NULL); @@ -5597,7 +2495,7 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, if (pkthdr) { first = m; } - MBUF_INIT(m, pkthdr, MT_DATA); + mbuf_init(m, pkthdr, MT_DATA); if (m_free_func == m_16kfree) { MBUF_16KCL_INIT(m, cl, rfa, 1, flag); } else if (m_free_func == m_bigfree) { @@ -5627,24 +2525,12 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, /* We've got them all; return to caller */ if (num == *numlist) { -#if CONFIG_MBUF_MCACHE - ASSERT(mp_list == NULL && rmp_list == NULL); -#else ASSERT(zstack_empty(mp_list) && zstack_empty(rmp_list)); -#endif /* CONFIG_MBUF_MCACHE */ return top; } fail: /* Free up what's left of the above */ -#if CONFIG_MBUF_MCACHE - if (mp_list != NULL) { - mcache_free_ext(cp, mp_list); - } - if (rmp_list != NULL) { - mcache_free_ext(rcp, rmp_list); - } -#else if (!zstack_empty(mp_list)) { if (class == MC_MBUF) { /* No need to elide, these mbufs came from the cache. */ @@ -5656,7 +2542,6 @@ fail: if (!zstack_empty(rmp_list)) { mz_composite_free_n(rclass, rmp_list); } -#endif /* CONFIG_MBUF_MCACHE */ if (wantall && top != NULL) { m_freem_list(top); *numlist = 0; @@ -5665,6 +2550,7 @@ fail: *numlist = num; return top; } +#endif /* !CONFIG_MBUF_MCACHE */ /* * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated @@ -5733,6 +2619,7 @@ m_getpackethdrs(int num_needed, int how) return top; } +#if !CONFIG_MBUF_MCACHE /* * Free an mbuf list (m_nextpkt) while following m_next. Returns the count * for mbufs packets freed. Used by the drivers. 
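m_freem_list(), whose body follows, does not return each buffer to its zone as it walks the chain; it sorts everything onto per-class stacks (plain mbufs, the three cluster sizes, composites, refs) and hands each stack back in one bulk call at the end, so the per-zone work is paid once per class rather than once per object. The batching idea in miniature; struct item, struct stack and the function names are illustrative, not the kernel's zstack API.

#include <stddef.h>

/* Illustrative intrusive stack, standing in for zstack_t. */
struct item  { struct item *next; };
struct stack { struct item *top; size_t count; };

static void
stack_push(struct stack *s, struct item *it)
{
	it->next = s->top;
	s->top = it;
	s->count++;
}

/* Stand-in for a single bulk zone free such as mz_free_n(). */
static void
bulk_release(struct stack *s)
{
	s->top = NULL;
	s->count = 0;
}

/*
 * Walk a chain, defer every element onto the (single, for brevity) class
 * stack, then release the whole stack at once.  Returns the element count.
 */
static size_t
free_chain_batched(struct item *chain, struct stack *cls,
    void (*bulk_free)(struct stack *))
{
	size_t n = 0;

	while (chain != NULL) {
		struct item *next = chain->next;  /* push clobbers ->next */
		stack_push(cls, chain);
		chain = next;
		n++;
	}
	bulk_free(cls);
	return n;
}

int
main(void)
{
	struct item items[3] = {{ &items[1] }, { &items[2] }, { NULL }};
	struct stack s = { NULL, 0 };

	return free_chain_batched(&items[0], &s, bulk_release) == 3 ? 0 : 1;
}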
@@ -5741,20 +2628,9 @@ int m_freem_list(struct mbuf *m) { struct mbuf *nextpkt; -#if CONFIG_MBUF_MCACHE - mcache_obj_t *mp_list = NULL; - mcache_obj_t *mcl_list = NULL; - mcache_obj_t *mbc_list = NULL; - mcache_obj_t *m16k_list = NULL; - mcache_obj_t *m_mcl_list = NULL; - mcache_obj_t *m_mbc_list = NULL; - mcache_obj_t *m_m16k_list = NULL; - mcache_obj_t *ref_list = NULL; -#else zstack_t mp_list = {}, mcl_list = {}, mbc_list = {}, m16k_list = {}, m_mcl_list = {}, m_mbc_list = {}, m_m16k_list = {}, ref_list = {}; -#endif /* CONFIG_MBUF_MCACHE */ int pktcount = 0; int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0; @@ -5766,11 +2642,7 @@ m_freem_list(struct mbuf *m) while (m != NULL) { struct mbuf *next = m->m_next; -#if CONFIG_MBUF_MCACHE - mcache_obj_t *o, *rfa; -#else void *cl = NULL; -#endif /* CONFIG_MBUF_MCACHE */ if (m->m_type == MT_FREE) { panic("m_free: freeing an already freed mbuf"); } @@ -5793,11 +2665,7 @@ m_freem_list(struct mbuf *m) mt_free++; -#if CONFIG_MBUF_MCACHE - o = (mcache_obj_t *)(void *)m->m_ext.ext_buf; -#else cl = m->m_ext.ext_buf; -#endif /* CONFIG_MBUF_MCACHE */ /* * Make sure that we don't touch any ext_ref * member after we decrement the reference count @@ -5809,25 +2677,6 @@ m_freem_list(struct mbuf *m) const uint16_t minref = MEXT_MINREF(m); const uint16_t refcnt = m_decref(m); if (refcnt == minref && !composite) { -#if CONFIG_MBUF_MCACHE - if (m_free_func == NULL) { - o->obj_next = mcl_list; - mcl_list = o; - } else if (m_free_func == m_bigfree) { - o->obj_next = mbc_list; - mbc_list = o; - } else if (m_free_func == m_16kfree) { - o->obj_next = m16k_list; - m16k_list = o; - } else { - (*(m_free_func))((caddr_t)o, - m->m_ext.ext_size, - m_get_ext_arg(m)); - } - rfa = (mcache_obj_t *)(void *)m_get_rfa(m); - rfa->obj_next = ref_list; - ref_list = rfa; -#else if (m_free_func == NULL) { zstack_push(&mcl_list, cl); } else if (m_free_func == m_bigfree) { @@ -5840,7 +2689,6 @@ m_freem_list(struct mbuf *m) m_get_ext_arg(m)); } zstack_push(&ref_list, m_get_rfa(m)); -#endif /* CONFIG_MBUF_MCACHE */ m_set_ext(m, NULL, NULL, NULL); } else if (refcnt == minref && composite) { VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED)); @@ -5873,20 +2721,6 @@ m_freem_list(struct mbuf *m) MEXT_FLAGS(m) &= ~EXTF_READONLY; /* "Free" into the intermediate cache */ -#if CONFIG_MBUF_MCACHE - o = (mcache_obj_t *)m; - if (m_free_func == NULL) { - o->obj_next = m_mcl_list; - m_mcl_list = o; - } else if (m_free_func == m_bigfree) { - o->obj_next = m_mbc_list; - m_mbc_list = o; - } else { - VERIFY(m_free_func == m_16kfree); - o->obj_next = m_m16k_list; - m_m16k_list = o; - } -#else if (m_free_func == NULL) { zstack_push(&m_mcl_list, m); } else if (m_free_func == m_bigfree) { @@ -5895,7 +2729,6 @@ m_freem_list(struct mbuf *m) VERIFY(m_free_func == m_16kfree); zstack_push(&m_m16k_list, m); } -#endif /* CONFIG_MBUF_MCACHE */ m = next; continue; } @@ -5920,13 +2753,8 @@ simple_free: m->m_flags = m->m_len = 0; m->m_next = m->m_nextpkt = NULL; -#if CONFIG_MBUF_MCACHE - ((mcache_obj_t *)m)->obj_next = mp_list; - mp_list = (mcache_obj_t *)m; -#else m_elide(m); zstack_push(&mp_list, m); -#endif /* CONFIG_MBUF_MCACHE */ m = next; } @@ -5949,32 +2777,6 @@ simple_free: if (mt_tag > 0) { mtype_stat_sub(MT_TAG, mt_tag); } -#if CONFIG_MBUF_MCACHE - if (mp_list != NULL) { - mcache_free_ext(m_cache(MC_MBUF), mp_list); - } - if (mcl_list != NULL) { - mcache_free_ext(m_cache(MC_CL), mcl_list); - } - if (mbc_list != NULL) { - mcache_free_ext(m_cache(MC_BIGCL), mbc_list); - } - if (m16k_list != NULL) { - 
mcache_free_ext(m_cache(MC_16KCL), m16k_list); - } - if (m_mcl_list != NULL) { - mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list); - } - if (m_mbc_list != NULL) { - mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list); - } - if (m_m16k_list != NULL) { - mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list); - } - if (ref_list != NULL) { - mcache_free_ext(ref_cache, ref_list); - } -#else if (!zstack_empty(mp_list)) { /* mbufs elided above. */ mz_free_n(mp_list); @@ -6000,10 +2802,10 @@ simple_free: if (!zstack_empty(ref_list)) { zfree_nozero_n(ZONE_ID_MBUF_REF, ref_list); } -#endif /* CONFIG_MBUF_MCACHE */ return pktcount; } +#endif /* !CONFIG_MBUF_MCACHE */ /* * Wrapper around m_freem_list which captures the packet that's going to be @@ -6013,17 +2815,18 @@ simple_free: * DROPTAP_FLAG_DIR_IN), or the packet will not be captured. */ void -m_drop_list(mbuf_t m, struct ifnet *ifp, uint16_t flags, uint32_t reason, const char *funcname, +m_drop_list(mbuf_t m_head, struct ifnet *ifp, uint16_t flags, uint32_t reason, const char *funcname, uint16_t linenum) { + struct mbuf *m = m_head; struct mbuf *nextpkt; - if (m == NULL) { + if (m_head == NULL) { return; } if (__probable(droptap_total_tap_count == 0)) { - m_freem_list(m); + m_freem_list(m_head); return; } @@ -6059,7 +2862,7 @@ m_drop_list(mbuf_t m, struct ifnet *ifp, uint16_t flags, uint32_t reason, const m = nextpkt; } } - m_freem_list(m); + m_freem_list(m_head); } void @@ -6136,6 +2939,29 @@ m_drop_if(mbuf_t m, struct ifnet *ifp, uint16_t flags, uint32_t reason, const ch m_drop_common(m, ifp, flags, reason, funcname, linenum); } +void +m_drop_extended(mbuf_t m, struct ifnet *ifp, char *frame_header, + uint16_t flags, uint32_t reason, const char *funcname, uint16_t linenum) +{ + if (m == NULL) { + return; + } + + if (__probable(droptap_total_tap_count == 0)) { + m_freem(m); + return; + } + + if (flags & DROPTAP_FLAG_DIR_OUT) { + droptap_output_mbuf(m, reason, funcname, linenum, flags, + ifp); + } else if (flags & DROPTAP_FLAG_DIR_IN) { + droptap_input_mbuf(m, reason, funcname, linenum, flags, + m->m_pkthdr.rcvif, frame_header); + } + m_freem(m); +} + /* * Mbuffer utility routines. */ @@ -6349,6 +3175,7 @@ m_copym(struct mbuf *m, int off0, int len, int wait) return m_copym_mode(m, off0, len, wait, NULL, NULL, M_COPYM_MOVE_HDR); } +#if !CONFIG_MBUF_MCACHE /* * Equivalent to m_copym except that all necessary mbuf hdrs are allocated * within this routine also. @@ -6362,12 +3189,7 @@ m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait, { mbuf_ref_t m = m0, n, *np = NULL, top = NULL; int off = off0, len = len0; -#if CONFIG_MBUF_MCACHE - int mcflags = MSLEEPF(wait); - mcache_obj_t *list = NULL; -#else zstack_t list = {}; -#endif /* CONFIG_MBUF_MCACHE */ int copyhdr = 0; int type = 0; int needed = 0; @@ -6397,39 +3219,18 @@ m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait, needed++; len = len0; -#if CONFIG_MBUF_MCACHE - /* - * If the caller doesn't want to be put to sleep, mark it with - * MCR_TRYHARD so that we may reclaim buffers from other places - * before giving up. 
- */ - if (mcflags & MCR_NOSLEEP) { - mcflags |= MCR_TRYHARD; - } - - if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed, - mcflags) != needed) { - goto nospace; - } -#else list = mz_alloc_n(needed, wait); if (zstack_count(list) != needed) { goto nospace; } -#endif /* CONFIG_MBUF_MCACHE */ needed = 0; while (len > 0) { -#if CONFIG_MBUF_MCACHE - n = (struct mbuf *)list; - list = list->obj_next; -#else n = zstack_pop(&list); -#endif /* CONFIG_MBUF_MCACHE */ ASSERT(n != NULL && m != NULL); type = (top == NULL) ? MT_HEADER : m->m_type; - MBUF_INIT(n, (top == NULL), type); + mbuf_init(n, (top == NULL), type); if (top == NULL) { top = n; @@ -6447,9 +3248,7 @@ m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait, } else if ((mode == M_COPYM_COPY_HDR) || (mode == M_COPYM_MUST_COPY_HDR)) { if (m_dup_pkthdr(n, m, wait) == 0) { -#if !CONFIG_MBUF_MCACHE m_elide(n); -#endif goto nospace; } } @@ -6490,30 +3289,21 @@ m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait, mtype_stat_add(type, needed); mtype_stat_sub(MT_FREE, needed + 1); -#if CONFIG_MBUF_MCACHE - ASSERT(list == NULL); -#else ASSERT(zstack_empty(list)); -#endif /* CONFIG_MBUF_MCACHE */ return top; nospace: -#if CONFIG_MBUF_MCACHE - if (list != NULL) { - mcache_free_ext(m_cache(MC_MBUF), list); - } -#else if (!zstack_empty(list)) { /* No need to elide, these mbufs came from the cache. */ mz_free_n(list); } -#endif /* CONFIG_MBUF_MCACHE */ if (top != NULL) { m_freem(top); } return NULL; } +#endif /* !CONFIG_MBUF_MCACHE */ /* * Copy data from an mbuf chain starting "off" bytes from the beginning, @@ -6922,136 +3712,6 @@ extpacket: } -#if CONFIG_MBUF_MCACHE -#ifndef MBUF_GROWTH_NORMAL_THRESH -#define MBUF_GROWTH_NORMAL_THRESH 25 -#endif - -/* - * Cluster freelist allocation check. 
- */ -static int -m_howmany(int num, size_t bufsize) -{ - int i = 0, j = 0; - u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters; - u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree; - u_int32_t sumclusters, freeclusters; - u_int32_t percent_pool, percent_kmem; - u_int32_t mb_growth, mb_growth_thresh; - - VERIFY(bufsize == m_maxsize(MC_BIGCL) || - bufsize == m_maxsize(MC_16KCL)); - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - /* Numbers in 2K cluster units */ - m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; - m_clusters = m_total(MC_CL); - m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; - m_16kclusters = m_total(MC_16KCL); - sumclusters = m_mbclusters + m_clusters + m_bigclusters; - - m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT; - m_clfree = m_infree(MC_CL); - m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT; - m_16kclfree = m_infree(MC_16KCL); - freeclusters = m_mbfree + m_clfree + m_bigclfree; - - /* Bail if we've maxed out the mbuf memory map */ - if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) || - (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && - (m_16kclusters << NCLPJCLSHIFT) >= njcl)) { - mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)", - sumclusters, nclusters, - (m_16kclusters << NCLPJCLSHIFT), njcl); - return 0; - } - - if (bufsize == m_maxsize(MC_BIGCL)) { - /* Under minimum */ - if (m_bigclusters < m_minlimit(MC_BIGCL)) { - return m_minlimit(MC_BIGCL) - m_bigclusters; - } - - percent_pool = - ((sumclusters - freeclusters) * 100) / sumclusters; - percent_kmem = (sumclusters * 100) / nclusters; - - /* - * If a light/normal user, grow conservatively (75%) - * If a heavy user, grow aggressively (50%) - */ - if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) { - mb_growth = MB_GROWTH_NORMAL; - } else { - mb_growth = MB_GROWTH_AGGRESSIVE; - } - - if (percent_kmem < 5) { - /* For initial allocations */ - i = num; - } else { - /* Return if >= MBIGCL_LOWAT clusters available */ - if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT && - m_total(MC_BIGCL) >= - MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) { - return 0; - } - - /* Ensure at least num clusters are accessible */ - if (num >= m_infree(MC_BIGCL)) { - i = num - m_infree(MC_BIGCL); - } - if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) { - j = num - (m_total(MC_BIGCL) - - m_minlimit(MC_BIGCL)); - } - - i = MAX(i, j); - - /* - * Grow pool if percent_pool > 75 (normal growth) - * or percent_pool > 50 (aggressive growth). 
- */ - mb_growth_thresh = 100 - (100 / (1 << mb_growth)); - if (percent_pool > mb_growth_thresh) { - j = ((sumclusters + num) >> mb_growth) - - freeclusters; - } - i = MAX(i, j); - } - - /* Check to ensure we didn't go over limits */ - if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) { - i = m_maxlimit(MC_BIGCL) - m_bigclusters; - } - if ((i << 1) + sumclusters >= nclusters) { - i = (nclusters - sumclusters) >> 1; - } - VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); - VERIFY(sumclusters + (i << 1) <= nclusters); - } else { /* 16K CL */ - VERIFY(njcl > 0); - /* Ensure at least num clusters are available */ - if (num >= m_16kclfree) { - i = num - m_16kclfree; - } - - /* Always grow 16KCL pool aggressively */ - if (((m_16kclusters + num) >> 1) > m_16kclfree) { - j = ((m_16kclusters + num) >> 1) - m_16kclfree; - } - i = MAX(i, j); - - /* Check to ensure we don't go over limit */ - if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) { - i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL); - } - } - return i; -} -#endif /* CONFIG_MBUF_MCACHE */ /* * Return the number of bytes in the mbuf chain, m. */ @@ -7072,6 +3732,19 @@ m_length(struct mbuf *m) return pktlen; } +int +m_chain_capacity(const struct mbuf *m) +{ + int rawlen = 0; + while (m) { + rawlen += m_capacity(m); + m = m->m_next; + } + + return rawlen; +} + + /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf @@ -7137,12 +3810,6 @@ m_makewritable(struct mbuf **mp, int off, int len, int how) origlen = m_length(*mp); #endif /* DEBUG */ -#if 0 /* M_COPYALL is large enough */ - if (len == M_COPYALL) { - len = m_length(*mp) - off; /* XXX */ - } -#endif - error = m_copyback0(mp, off, len, NULL, M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how); @@ -7246,12 +3913,6 @@ extend: * allocate a new writable mbuf and try again. */ -#if DIAGNOSTIC - if (!(flags & M_COPYBACK0_COW)) { - panic("m_copyback0: read-only"); - } -#endif /* DIAGNOSTIC */ - /* * if we're going to write into the middle of * a mbuf, split it first. @@ -7368,27 +4029,15 @@ enobufs: return ENOBUFS; } +#if !CONFIG_MBUF_MCACHE uint64_t mcl_to_paddr(char *addr) { -#if CONFIG_MBUF_MCACHE - vm_offset_t base_phys; - - if (!MBUF_IN_MAP(addr)) { - return 0; - } - base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)]; - - if (base_phys == 0) { - return 0; - } - return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)); -#else extern addr64_t kvtophys(vm_offset_t va); return kvtophys((vm_offset_t)addr); -#endif /* CONFIG_MBUF_MCACHE */ } +#endif /* !CONFIG_MBUF_MCACHE */ /* * Dup the mbuf chain passed in. The whole thing. No cute additional cruft. @@ -7452,7 +4101,7 @@ m_dup(struct mbuf *m, int how) MCLGET(n, how); } else if (m->m_len <= m_maxsize(MC_BIGCL)) { n = m_mbigget(n, how); - } else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) { + } else if (m->m_len <= m_maxsize(MC_16KCL)) { n = m_m16kget(n, how); } if (!(n->m_flags & M_EXT)) { @@ -7587,9 +4236,6 @@ m_normalize(struct mbuf *m) } m = n; } - if (expanded) { - os_atomic_inc(&mb_normalized, relaxed); - } return top; } @@ -7802,12 +4448,6 @@ m_mtod(struct mbuf *m) return m_mtod_current(m); } -void -m_mcheck(struct mbuf *m) -{ - _MCHECK(m); -} - /* * Return a pointer to mbuf/offset of location in mbuf chain. */ @@ -7835,53 +4475,6 @@ m_getptr(struct mbuf *m, int loc, int *off) return NULL; } -#if CONFIG_MBUF_MCACHE -/* - * Inform the corresponding mcache(s) that there's a waiter below. 
- */ -static void -mbuf_waiter_inc(mbuf_class_t class, boolean_t comp) -{ - mcache_waiter_inc(m_cache(class)); - if (comp) { - if (class == MC_CL) { - mcache_waiter_inc(m_cache(MC_MBUF_CL)); - } else if (class == MC_BIGCL) { - mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); - } else if (class == MC_16KCL) { - mcache_waiter_inc(m_cache(MC_MBUF_16KCL)); - } else { - mcache_waiter_inc(m_cache(MC_MBUF_CL)); - mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); - } - } -} - -/* - * Inform the corresponding mcache(s) that there's no more waiter below. - */ -static void -mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) -{ - mcache_waiter_dec(m_cache(class)); - if (comp) { - if (class == MC_CL) { - mcache_waiter_dec(m_cache(MC_MBUF_CL)); - } else if (class == MC_BIGCL) { - mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); - } else if (class == MC_16KCL) { - mcache_waiter_dec(m_cache(MC_MBUF_16KCL)); - } else { - mcache_waiter_dec(m_cache(MC_MBUF_CL)); - mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); - } - } -} - -static bool mbuf_watchdog_defunct_active = false; - -#endif /* CONFIG_MBUF_MCACHE */ - static uint32_t mbuf_watchdog_socket_space(struct socket *so) { @@ -7915,7 +4508,10 @@ proc_fd_trylock(proc_t p) return lck_mtx_try_lock(&p->p_fd.fd_lock); } -static int +#if !CONFIG_MBUF_MCACHE +static +#endif +int mbuf_watchdog_defunct_iterate(proc_t p, void *arg) { struct fileproc *fp = NULL; @@ -7968,6 +4564,7 @@ mbuf_watchdog_defunct_iterate(proc_t p, void *arg) extern char *proc_name_address(void *p); +#if !CONFIG_MBUF_MCACHE static void mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1) { @@ -7983,12 +4580,6 @@ mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1) * Defunct all sockets from this app. */ if (args.top_app != NULL) { -#if CONFIG_MBUF_MCACHE - /* Restart the watchdog count. */ - lck_mtx_lock(mbuf_mlock); - microuptime(&mb_wdtstart); - lck_mtx_unlock(mbuf_mlock); -#endif os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d", __func__, proc_name_address(args.top_app), @@ -8016,7 +4607,6 @@ mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1) proc_fdunlock(args.top_app); proc_rele(args.top_app); mbstat.m_forcedefunct++; -#if !CONFIG_MBUF_MCACHE zcache_drain(ZONE_ID_MBUF_CLUSTER_2K); zcache_drain(ZONE_ID_MBUF_CLUSTER_4K); zcache_drain(ZONE_ID_MBUF_CLUSTER_16K); @@ -8025,14 +4615,9 @@ mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1) zone_drain(zone_by_id(ZONE_ID_CLUSTER_4K)); zone_drain(zone_by_id(ZONE_ID_CLUSTER_16K)); zone_drain(zone_by_id(ZONE_ID_MBUF_REF)); -#endif } -#if CONFIG_MBUF_MCACHE - mbuf_watchdog_defunct_active = false; -#endif } -#if !CONFIG_MBUF_MCACHE static LCK_GRP_DECLARE(mbuf_exhausted_grp, "mbuf-exhausted"); static LCK_TICKET_DECLARE(mbuf_exhausted_lock, &mbuf_exhausted_grp); static uint32_t mbuf_exhausted_mask; @@ -8125,1362 +4710,6 @@ mbuf_zone_exhausted(zone_id_t zid, zone_t zone __unused, bool exhausted) EVENT_REGISTER_HANDLER(ZONE_EXHAUSTED, mbuf_zone_exhausted); #endif /* !CONFIG_MBUF_MCACHE */ -#if CONFIG_MBUF_MCACHE -/* - * Called during slab (blocking and non-blocking) allocation. If there - * is at least one waiter, and the time since the first waiter is blocked - * is greater than the watchdog timeout, panic the system. 
- */ -static void -mbuf_watchdog(void) -{ - struct timeval now; - unsigned int since; - static thread_call_t defunct_tcall = NULL; - - if (mb_waiters == 0 || !mb_watchdog) { - return; - } - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - microuptime(&now); - since = now.tv_sec - mb_wdtstart.tv_sec; - - if (mbuf_watchdog_defunct_active) { - /* - * Don't panic the system while we are trying - * to find sockets to defunct. - */ - return; - } - if (since >= MB_WDT_MAXTIME) { - panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__, - mb_waiters, since, mbuf_dump()); - /* NOTREACHED */ - } - /* - * Check if we are about to panic the system due - * to lack of mbufs and start defuncting sockets - * from processes that use too many sockets. - * - * We're always called with the mbuf_mlock held, - * so that also protects mbuf_watchdog_defunct_active. - */ - if (since >= MB_WDT_MAXTIME / 2) { - /* - * Start a thread to defunct sockets - * from apps that are over-using their socket - * buffers. - */ - if (defunct_tcall == NULL) { - defunct_tcall = - thread_call_allocate_with_options(mbuf_watchdog_defunct, - NULL, - THREAD_CALL_PRIORITY_KERNEL, - THREAD_CALL_OPTIONS_ONCE); - } - if (defunct_tcall != NULL) { - mbuf_watchdog_defunct_active = true; - thread_call_enter(defunct_tcall); - } - } -} - -/* - * Called during blocking allocation. Returns TRUE if one or more objects - * are available at the per-CPU caches layer and that allocation should be - * retried at that level. - */ -static boolean_t -mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) -{ - boolean_t mcache_retry = FALSE; - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - /* Check if there's anything at the cache layer */ - if (mbuf_cached_above(class, wait)) { - mcache_retry = TRUE; - goto done; - } - - /* Nothing? Then try hard to get it from somewhere */ - m_reclaim(class, num, (wait & MCR_COMP)); - - /* We tried hard and got something? */ - if (m_infree(class) > 0) { - mbstat.m_wait++; - goto done; - } else if (mbuf_cached_above(class, wait)) { - mbstat.m_wait++; - mcache_retry = TRUE; - goto done; - } else if (wait & MCR_TRYHARD) { - mcache_retry = TRUE; - goto done; - } - - /* - * There's really nothing for us right now; inform the - * cache(s) that there is a waiter below and go to sleep. - */ - mbuf_waiter_inc(class, (wait & MCR_COMP)); - - VERIFY(!(wait & MCR_NOSLEEP)); - - /* - * If this is the first waiter, arm the watchdog timer. Otherwise - * check if we need to panic the system due to watchdog timeout. 
- */ - if (mb_waiters == 0) { - microuptime(&mb_wdtstart); - } else { - mbuf_watchdog(); - } - - mb_waiters++; - m_region_expand(class) += m_total(class) + num; - /* wake up the worker thread */ - if (mbuf_worker_ready && - mbuf_worker_needs_wakeup) { - wakeup((caddr_t)&mbuf_worker_needs_wakeup); - mbuf_worker_needs_wakeup = FALSE; - } - mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class)); - (void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL); - mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class)); - - /* We are now up; stop getting notified until next round */ - mbuf_waiter_dec(class, (wait & MCR_COMP)); - - /* We waited and got something */ - if (m_infree(class) > 0) { - mbstat.m_wait++; - goto done; - } else if (mbuf_cached_above(class, wait)) { - mbstat.m_wait++; - mcache_retry = TRUE; - } -done: - return mcache_retry; -} - -__attribute__((noreturn)) -static void -mbuf_worker_thread(void) -{ - int mbuf_expand; - - while (1) { - lck_mtx_lock(mbuf_mlock); - mbwdog_logger("worker thread running"); - mbuf_worker_run_cnt++; - mbuf_expand = 0; - /* - * Allocations are based on page size, so if we have depleted - * the reserved spaces, try to free mbufs from the major classes. - */ -#if PAGE_SIZE == 4096 - uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; - uint32_t m_clusters = m_total(MC_CL); - uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; - uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters; - if (sumclusters >= nclusters) { - mbwdog_logger("reclaiming bigcl"); - mbuf_drain_locked(TRUE); - m_reclaim(MC_BIGCL, 4, FALSE); - } -#else - uint32_t m_16kclusters = m_total(MC_16KCL); - if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) { - mbwdog_logger("reclaiming 16kcl"); - mbuf_drain_locked(TRUE); - m_reclaim(MC_16KCL, 4, FALSE); - } -#endif - if (m_region_expand(MC_CL) > 0) { - int n; - mb_expand_cl_cnt++; - /* Adjust to current number of cluster in use */ - n = m_region_expand(MC_CL) - - (m_total(MC_CL) - m_infree(MC_CL)); - if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) { - n = m_maxlimit(MC_CL) - m_total(MC_CL); - } - if (n > 0) { - mb_expand_cl_total += n; - } - m_region_expand(MC_CL) = 0; - - if (n > 0) { - mbwdog_logger("expanding MC_CL by %d", n); - freelist_populate(MC_CL, n, M_WAIT); - } - } - if (m_region_expand(MC_BIGCL) > 0) { - int n; - mb_expand_bigcl_cnt++; - /* Adjust to current number of 4 KB cluster in use */ - n = m_region_expand(MC_BIGCL) - - (m_total(MC_BIGCL) - m_infree(MC_BIGCL)); - if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) { - n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL); - } - if (n > 0) { - mb_expand_bigcl_total += n; - } - m_region_expand(MC_BIGCL) = 0; - - if (n > 0) { - mbwdog_logger("expanding MC_BIGCL by %d", n); - freelist_populate(MC_BIGCL, n, M_WAIT); - } - } - if (m_region_expand(MC_16KCL) > 0) { - int n; - mb_expand_16kcl_cnt++; - /* Adjust to current number of 16 KB cluster in use */ - n = m_region_expand(MC_16KCL) - - (m_total(MC_16KCL) - m_infree(MC_16KCL)); - if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) { - n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL); - } - if (n > 0) { - mb_expand_16kcl_total += n; - } - m_region_expand(MC_16KCL) = 0; - - if (n > 0) { - mbwdog_logger("expanding MC_16KCL by %d", n); - (void) freelist_populate(MC_16KCL, n, M_WAIT); - } - } - - /* - * Because we can run out of memory before filling the mbuf - * map, we should not allocate more clusters than they are - * mbufs -- otherwise we could have a large number of useless 
- * clusters allocated. - */ - mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d", - m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL), - m_total(MC_16KCL)); - uint32_t total_mbufs = m_total(MC_MBUF); - uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) + - m_total(MC_16KCL); - if (total_mbufs < total_clusters) { - mbwdog_logger("expanding MC_MBUF by %d", - total_clusters - total_mbufs); - } - while (total_mbufs < total_clusters) { - mb_expand_cnt++; - if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) { - break; - } - total_mbufs = m_total(MC_MBUF); - total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) + - m_total(MC_16KCL); - } - - mbuf_worker_needs_wakeup = TRUE; - /* - * If there's a deadlock and we're not sending / receiving - * packets, net_uptime() won't be updated. Update it here - * so we are sure it's correct. - */ - net_update_uptime(); - mbuf_worker_last_runtime = net_uptime(); - assert_wait((caddr_t)&mbuf_worker_needs_wakeup, - THREAD_UNINT); - mbwdog_logger("worker thread sleeping"); - lck_mtx_unlock(mbuf_mlock); - (void) thread_block((thread_continue_t)mbuf_worker_thread); - } -} - -__attribute__((noreturn)) -static void -mbuf_worker_thread_init(void) -{ - mbuf_worker_ready++; - mbuf_worker_thread(); -} - -static mcl_slab_t * -slab_get(void *buf) -{ - mcl_slabg_t *slg; - unsigned int ix, k; - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - - VERIFY(MBUF_IN_MAP(buf)); - ix = ((unsigned char *)buf - mbutl) >> MBSHIFT; - VERIFY(ix < maxslabgrp); - - if ((slg = slabstbl[ix]) == NULL) { - /* - * In the current implementation, we never shrink the slabs - * table; if we attempt to reallocate a cluster group when - * it's already allocated, panic since this is a sign of a - * memory corruption (slabstbl[ix] got nullified). - */ - ++slabgrp; - VERIFY(ix < slabgrp); - /* - * Slabs expansion can only be done single threaded; when - * we get here, it must be as a result of m_clalloc() which - * is serialized and therefore mb_clalloc_busy must be set. - */ - VERIFY(mb_clalloc_busy); - lck_mtx_unlock(mbuf_mlock); - - /* This is a new buffer; create the slabs group for it */ - slg = zalloc_permanent_type(mcl_slabg_t); - slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB, - ZALIGN(mcl_slab_t)); - - lck_mtx_lock(mbuf_mlock); - /* - * No other thread could have gone into m_clalloc() after - * we dropped the lock above, so verify that it's true. 
- */ - VERIFY(mb_clalloc_busy); - - slabstbl[ix] = slg; - - /* Chain each slab in the group to its forward neighbor */ - for (k = 1; k < NSLABSPMB; k++) { - slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k]; - } - VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL); - - /* And chain the last slab in the previous group to this */ - if (ix > 0) { - VERIFY(slabstbl[ix - 1]-> - slg_slab[NSLABSPMB - 1].sl_next == NULL); - slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next = - &slg->slg_slab[0]; - } - } - - ix = MTOPG(buf) % NSLABSPMB; - VERIFY(ix < NSLABSPMB); - - return &slg->slg_slab[ix]; -} - -static void -slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags, - void *base, void *head, unsigned int len, int refcnt, int chunks) -{ - sp->sl_class = class; - sp->sl_flags = flags; - sp->sl_base = base; - sp->sl_head = head; - sp->sl_len = len; - sp->sl_refcnt = refcnt; - sp->sl_chunks = chunks; - slab_detach(sp); -} - -static void -slab_insert(mcl_slab_t *sp, mbuf_class_t class) -{ - VERIFY(slab_is_detached(sp)); - m_slab_cnt(class)++; - TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link); - sp->sl_flags &= ~SLF_DETACHED; - - /* - * If a buffer spans multiple contiguous pages then mark them as - * detached too - */ - if (class == MC_16KCL) { - int k; - for (k = 1; k < NSLABSP16KB; k++) { - sp = sp->sl_next; - /* Next slab must already be present */ - VERIFY(sp != NULL && slab_is_detached(sp)); - sp->sl_flags &= ~SLF_DETACHED; - } - } -} - -static void -slab_remove(mcl_slab_t *sp, mbuf_class_t class) -{ - int k; - VERIFY(!slab_is_detached(sp)); - VERIFY(m_slab_cnt(class) > 0); - m_slab_cnt(class)--; - TAILQ_REMOVE(&m_slablist(class), sp, sl_link); - slab_detach(sp); - if (class == MC_16KCL) { - for (k = 1; k < NSLABSP16KB; k++) { - sp = sp->sl_next; - /* Next slab must already be present */ - VERIFY(sp != NULL); - VERIFY(!slab_is_detached(sp)); - slab_detach(sp); - } - } -} - -static boolean_t -slab_inrange(mcl_slab_t *sp, void *buf) -{ - return (uintptr_t)buf >= (uintptr_t)sp->sl_base && - (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len); -} - -#undef panic - -static void -slab_nextptr_panic(mcl_slab_t *sp, void *addr) -{ - int i; - unsigned int chunk_len = sp->sl_len / sp->sl_chunks; - uintptr_t buf = (uintptr_t)sp->sl_base; - - for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) { - void *next = ((mcache_obj_t *)buf)->obj_next; - if (next != addr) { - continue; - } - if (!mclverify) { - if (next != NULL && !MBUF_IN_MAP(next)) { - mcache_t *cp = m_cache(sp->sl_class); - panic("%s: %s buffer %p in slab %p modified " - "after free at offset 0: %p out of range " - "[%p-%p)\n", __func__, cp->mc_name, - (void *)buf, sp, next, mbutl, embutl); - /* NOTREACHED */ - } - } else { - mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class, - (mcache_obj_t *)buf); - mcl_audit_verify_nextptr(next, mca); - } - } -} - -static void -slab_detach(mcl_slab_t *sp) -{ - sp->sl_link.tqe_next = (mcl_slab_t *)-1; - sp->sl_link.tqe_prev = (mcl_slab_t **)-1; - sp->sl_flags |= SLF_DETACHED; -} - -static boolean_t -slab_is_detached(mcl_slab_t *sp) -{ - return (intptr_t)sp->sl_link.tqe_next == -1 && - (intptr_t)sp->sl_link.tqe_prev == -1 && - (sp->sl_flags & SLF_DETACHED); -} - -static void -mcl_audit_init(void *buf, mcache_audit_t **mca_list, - mcache_obj_t **con_list, size_t con_size, unsigned int num) -{ - mcache_audit_t *mca, *mca_tail; - mcache_obj_t *con = NULL; - boolean_t save_contents = (con_list != NULL); - unsigned int i, ix; - - ASSERT(num <= NMBPG); - ASSERT(con_list == NULL || con_size != 0); - - ix 
= MTOPG(buf); - VERIFY(ix < maxclaudit); - - /* Make sure we haven't been here before */ - for (i = 0; i < num; i++) { - VERIFY(mclaudit[ix].cl_audit[i] == NULL); - } - - mca = mca_tail = *mca_list; - if (save_contents) { - con = *con_list; - } - - for (i = 0; i < num; i++) { - mcache_audit_t *next; - - next = mca->mca_next; - bzero(mca, sizeof(*mca)); - mca->mca_next = next; - mclaudit[ix].cl_audit[i] = mca; - - /* Attach the contents buffer if requested */ - if (save_contents) { - mcl_saved_contents_t *msc = - (mcl_saved_contents_t *)(void *)con; - - VERIFY(msc != NULL); - VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t))); - VERIFY(con_size == sizeof(*msc)); - mca->mca_contents_size = con_size; - mca->mca_contents = msc; - con = con->obj_next; - bzero(mca->mca_contents, mca->mca_contents_size); - } - - mca_tail = mca; - mca = mca->mca_next; - } - - if (save_contents) { - *con_list = con; - } - - *mca_list = mca_tail->mca_next; - mca_tail->mca_next = NULL; -} - -static void -mcl_audit_free(void *buf, unsigned int num) -{ - unsigned int i, ix; - mcache_audit_t *mca, *mca_list; - - ix = MTOPG(buf); - VERIFY(ix < maxclaudit); - - if (mclaudit[ix].cl_audit[0] != NULL) { - mca_list = mclaudit[ix].cl_audit[0]; - for (i = 0; i < num; i++) { - mca = mclaudit[ix].cl_audit[i]; - mclaudit[ix].cl_audit[i] = NULL; - if (mca->mca_contents) { - mcache_free(mcl_audit_con_cache, - mca->mca_contents); - } - } - mcache_free_ext(mcache_audit_cache, - (mcache_obj_t *)mca_list); - } -} - -/* - * Given an address of a buffer (mbuf/2KB/4KB/16KB), return - * the corresponding audit structure for that buffer. - */ -static mcache_audit_t * -mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj) -{ - mcache_audit_t *mca = NULL; - int ix = MTOPG(mobj), m_idx = 0; - unsigned char *page_addr; - - VERIFY(ix < maxclaudit); - VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE))); - - page_addr = PGTOM(ix); - - switch (class) { - case MC_MBUF: - /* - * For the mbuf case, find the index of the page - * used by the mbuf and use that index to locate the - * base address of the page. Then find out the - * mbuf index relative to the page base and use - * it to locate the audit structure. - */ - m_idx = MBPAGEIDX(page_addr, mobj); - VERIFY(m_idx < (int)NMBPG); - mca = mclaudit[ix].cl_audit[m_idx]; - break; - - case MC_CL: - /* - * Same thing as above, but for 2KB clusters in a page. - */ - m_idx = CLPAGEIDX(page_addr, mobj); - VERIFY(m_idx < (int)NCLPG); - mca = mclaudit[ix].cl_audit[m_idx]; - break; - - case MC_BIGCL: - m_idx = BCLPAGEIDX(page_addr, mobj); - VERIFY(m_idx < (int)NBCLPG); - mca = mclaudit[ix].cl_audit[m_idx]; - break; - case MC_16KCL: - /* - * Same as above, but only return the first element. 
- */ - mca = mclaudit[ix].cl_audit[0]; - break; - - default: - VERIFY(0); - /* NOTREACHED */ - } - - return mca; -} - -static void -mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite, - boolean_t alloc) -{ - struct mbuf *m = addr; - mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next; - - VERIFY(mca->mca_contents != NULL && - mca->mca_contents_size == AUDIT_CONTENTS_SIZE); - - if (mclverify) { - mcl_audit_verify_nextptr(next, mca); - } - - if (!alloc) { - /* Save constructed mbuf fields */ - mcl_audit_save_mbuf(m, mca); - if (mclverify) { - mcache_set_pattern(MCACHE_FREE_PATTERN, m, - m_maxsize(MC_MBUF)); - } - ((mcache_obj_t *)m)->obj_next = next; - return; - } - - /* Check if the buffer has been corrupted while in freelist */ - if (mclverify) { - mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); - } - /* Restore constructed mbuf fields */ - mcl_audit_restore_mbuf(m, mca, composite); -} - -static void -mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite) -{ - struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca); - - if (composite) { - struct mbuf *next = m->m_next; - VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL && - MBUF_IS_COMPOSITE(ms)); - VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE); - /* - * We could have hand-picked the mbuf fields and restore - * them individually, but that will be a maintenance - * headache. Instead, restore everything that was saved; - * the mbuf layer will recheck and reinitialize anyway. - */ - bcopy(ms, m, MCA_SAVED_MBUF_SIZE); - m->m_next = next; - } else { - /* - * For a regular mbuf (no cluster attached) there's nothing - * to restore other than the type field, which is expected - * to be MT_FREE. - */ - m->m_type = ms->m_type; - } - _MCHECK(m); -} - -static void -mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca) -{ - VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE); - _MCHECK(m); - bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE); -} - -static void -mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc, - boolean_t save_next) -{ - mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next; - - if (!alloc) { - if (mclverify) { - mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); - } - if (save_next) { - mcl_audit_verify_nextptr(next, mca); - ((mcache_obj_t *)addr)->obj_next = next; - } - } else if (mclverify) { - /* Check if the buffer has been corrupted while in freelist */ - mcl_audit_verify_nextptr(next, mca); - mcache_audit_free_verify_set(mca, addr, 0, size); - } -} - -static void -mcl_audit_scratch(mcache_audit_t *mca) -{ - void *stack[MCACHE_STACK_DEPTH + 1]; - mcl_scratch_audit_t *msa; - struct timeval now; - - VERIFY(mca->mca_contents != NULL); - msa = MCA_SAVED_SCRATCH_PTR(mca); - - msa->msa_pthread = msa->msa_thread; - msa->msa_thread = current_thread(); - bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack)); - msa->msa_pdepth = msa->msa_depth; - bzero(stack, sizeof(stack)); - msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1; - bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack)); - - msa->msa_ptstamp = msa->msa_tstamp; - microuptime(&now); - /* tstamp is in ms relative to base_ts */ - msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000); - if ((now.tv_sec - mb_start.tv_sec) > 0) { - msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000); - } -} - -__abortlike -static void -mcl_audit_mcheck_panic(struct mbuf *m) -{ - char buf[DUMP_MCA_BUF_SIZE]; - mcache_audit_t *mca; - - MRANGE(m); - mca = 
mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); - - panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s", - m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca)); - /* NOTREACHED */ -} - -__abortlike -static void -mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca) -{ - char buf[DUMP_MCA_BUF_SIZE]; - panic("mcl_audit: buffer %p modified after free at offset 0: " - "%p out of range [%p-%p)\n%s\n", - mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca)); - /* NOTREACHED */ -} - -static void -mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) -{ - if (next != NULL && !MBUF_IN_MAP(next) && - (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) { - mcl_audit_verify_nextptr_panic(next, mca); - } -} - -static uintptr_t -hash_mix(uintptr_t x) -{ -#ifndef __LP64__ - x += ~(x << 15); - x ^= (x >> 10); - x += (x << 3); - x ^= (x >> 6); - x += ~(x << 11); - x ^= (x >> 16); -#else - x += ~(x << 32); - x ^= (x >> 22); - x += ~(x << 13); - x ^= (x >> 8); - x += (x << 3); - x ^= (x >> 15); - x += ~(x << 27); - x ^= (x >> 31); -#endif - return x; -} - -static uint32_t -hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) -{ - uintptr_t hash = 0; - uintptr_t mask = max_size - 1; - - while (depth) { - hash += bt[--depth]; - } - - hash = hash_mix(hash) & mask; - - assert(hash < max_size); - - return (uint32_t) hash; -} - -static uint32_t -hashaddr(uintptr_t pt, uint32_t max_size) -{ - uintptr_t hash = 0; - uintptr_t mask = max_size - 1; - - hash = hash_mix(pt) & mask; - - assert(hash < max_size); - - return (uint32_t) hash; -} - -/* This function turns on mbuf leak detection */ -static void -mleak_activate(void) -{ - mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR; - PE_parse_boot_argn("mleak_sample_factor", - &mleak_table.mleak_sample_factor, - sizeof(mleak_table.mleak_sample_factor)); - - if (mleak_table.mleak_sample_factor == 0) { - mclfindleak = 0; - } - - if (mclfindleak == 0) { - return; - } - - vm_size_t alloc_size = - mleak_alloc_buckets * sizeof(struct mallocation); - vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace); - - mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation)); - mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace)); - mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES), - ZALIGN(mleak_stat_t)); - - mleak_stat->ml_cnt = MLEAK_NUM_TRACES; -#ifdef __LP64__ - mleak_stat->ml_isaddr64 = 1; -#endif /* __LP64__ */ -} - -static void -mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc) -{ - int temp; - - if (mclfindleak == 0) { - return; - } - - if (!alloc) { - return mleak_free(addr); - } - - temp = os_atomic_inc_orig(&mleak_table.mleak_capture, relaxed); - - if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) { - uintptr_t bt[MLEAK_STACK_DEPTH]; - unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL); - mleak_log(bt, addr, logged, num); - } -} - -/* - * This function records the allocation in the mleak_allocations table - * and the backtrace in the mleak_traces table; if allocation slot is in use, - * replace old allocation with new one if the trace slot is in use, return - * (or increment refcount if same trace). 
- */ -static boolean_t -mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num) -{ - struct mallocation *allocation; - struct mtrace *trace; - uint32_t trace_index; - - /* Quit if someone else modifying the tables */ - if (!lck_mtx_try_lock_spin(mleak_lock)) { - mleak_table.total_conflicts++; - return FALSE; - } - - allocation = &mleak_allocations[hashaddr((uintptr_t)addr, - mleak_alloc_buckets)]; - trace_index = hashbacktrace(bt, depth, mleak_trace_buckets); - trace = &mleak_traces[trace_index]; - - VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]); - VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]); - - allocation->hitcount++; - trace->hitcount++; - - /* - * If the allocation bucket we want is occupied - * and the occupier has the same trace, just bail. - */ - if (allocation->element != NULL && - trace_index == allocation->trace_index) { - mleak_table.alloc_collisions++; - lck_mtx_unlock(mleak_lock); - return TRUE; - } - - /* - * Store the backtrace in the traces array; - * Size of zero = trace bucket is free. - */ - if (trace->allocs > 0 && - bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) { - /* Different, unique trace, but the same hash! Bail out. */ - trace->collisions++; - mleak_table.trace_collisions++; - lck_mtx_unlock(mleak_lock); - return TRUE; - } else if (trace->allocs > 0) { - /* Same trace, already added, so increment refcount */ - trace->allocs++; - } else { - /* Found an unused trace bucket, so record the trace here */ - if (trace->depth != 0) { - /* this slot previously used but not currently in use */ - mleak_table.trace_overwrites++; - } - mleak_table.trace_recorded++; - trace->allocs = 1; - memcpy(trace->addr, bt, (depth * sizeof(uintptr_t))); - trace->depth = depth; - trace->collisions = 0; - } - - /* Step 2: Store the allocation record in the allocations array */ - if (allocation->element != NULL) { - /* - * Replace an existing allocation. No need to preserve - * because only a subset of the allocations are being - * recorded anyway. 
- */ - mleak_table.alloc_collisions++; - } else if (allocation->trace_index != 0) { - mleak_table.alloc_overwrites++; - } - allocation->element = addr; - allocation->trace_index = trace_index; - allocation->count = num; - mleak_table.alloc_recorded++; - mleak_table.outstanding_allocs++; - - lck_mtx_unlock(mleak_lock); - return TRUE; -} - -static void -mleak_free(mcache_obj_t *addr) -{ - while (addr != NULL) { - struct mallocation *allocation = &mleak_allocations - [hashaddr((uintptr_t)addr, mleak_alloc_buckets)]; - - if (allocation->element == addr && - allocation->trace_index < mleak_trace_buckets) { - lck_mtx_lock_spin(mleak_lock); - if (allocation->element == addr && - allocation->trace_index < mleak_trace_buckets) { - struct mtrace *trace; - trace = &mleak_traces[allocation->trace_index]; - /* allocs = 0 means trace bucket is unused */ - if (trace->allocs > 0) { - trace->allocs--; - } - if (trace->allocs == 0) { - trace->depth = 0; - } - /* NULL element means alloc bucket is unused */ - allocation->element = NULL; - mleak_table.outstanding_allocs--; - } - lck_mtx_unlock(mleak_lock); - } - addr = addr->obj_next; - } -} - -static void -mleak_sort_traces() -{ - int i, j, k; - struct mtrace *swap; - - for (i = 0; i < MLEAK_NUM_TRACES; i++) { - mleak_top_trace[i] = NULL; - } - - for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) { - if (mleak_traces[i].allocs <= 0) { - continue; - } - - mleak_top_trace[j] = &mleak_traces[i]; - for (k = j; k > 0; k--) { - if (mleak_top_trace[k]->allocs <= - mleak_top_trace[k - 1]->allocs) { - break; - } - - swap = mleak_top_trace[k - 1]; - mleak_top_trace[k - 1] = mleak_top_trace[k]; - mleak_top_trace[k] = swap; - } - j++; - } - - j--; - for (; i < mleak_trace_buckets; i++) { - if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) { - continue; - } - - mleak_top_trace[j] = &mleak_traces[i]; - - for (k = j; k > 0; k--) { - if (mleak_top_trace[k]->allocs <= - mleak_top_trace[k - 1]->allocs) { - break; - } - - swap = mleak_top_trace[k - 1]; - mleak_top_trace[k - 1] = mleak_top_trace[k]; - mleak_top_trace[k] = swap; - } - } -} - -static void -mleak_update_stats() -{ - mleak_trace_stat_t *mltr; - int i; - - VERIFY(mleak_stat != NULL); -#ifdef __LP64__ - VERIFY(mleak_stat->ml_isaddr64); -#else - VERIFY(!mleak_stat->ml_isaddr64); -#endif /* !__LP64__ */ - VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES); - - mleak_sort_traces(); - - mltr = &mleak_stat->ml_trace[0]; - bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES); - for (i = 0; i < MLEAK_NUM_TRACES; i++) { - int j; - - if (mleak_top_trace[i] == NULL || - mleak_top_trace[i]->allocs == 0) { - continue; - } - - mltr->mltr_collisions = mleak_top_trace[i]->collisions; - mltr->mltr_hitcount = mleak_top_trace[i]->hitcount; - mltr->mltr_allocs = mleak_top_trace[i]->allocs; - mltr->mltr_depth = mleak_top_trace[i]->depth; - - VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH); - for (j = 0; j < mltr->mltr_depth; j++) { - mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j]; - } - - mltr++; - } -} - -static struct mbtypes { - int mt_type; - const char *mt_name; -} mbtypes[] = { - { MT_DATA, "data" }, - { MT_OOBDATA, "oob data" }, - { MT_CONTROL, "ancillary data" }, - { MT_HEADER, "packet headers" }, - { MT_SOCKET, "socket structures" }, - { MT_PCB, "protocol control blocks" }, - { MT_RTABLE, "routing table entries" }, - { MT_HTABLE, "IMP host table entries" }, - { MT_ATABLE, "address resolution tables" }, - { MT_FTABLE, "fragment reassembly queue headers" }, - { MT_SONAME, "socket names and addresses" }, - { 
MT_SOOPTS, "socket options" }, - { MT_RIGHTS, "access rights" }, - { MT_IFADDR, "interface addresses" }, - { MT_TAG, "packet tags" }, - { 0, NULL } -}; - -#define MBUF_DUMP_BUF_CHK() { \ - clen -= k; \ - if (clen < 1) \ - goto done; \ - c += k; \ -} - -static char * -mbuf_dump(void) -{ - unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct, - totreturned = 0; - u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0; - u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0; - u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0; - int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short); - uint8_t seen[256]; - struct mbtypes *mp; - mb_class_stat_t *sp; - mleak_trace_stat_t *mltr; - char *c = mbuf_dump_buf; - int i, j, k, clen = MBUF_DUMP_BUF_SIZE; - struct mbuf_watchdog_defunct_args args = {}; - - mbuf_dump_buf[0] = '\0'; - - /* synchronize all statistics in the mbuf table */ - mbuf_stat_sync(); - mbuf_mtypes_sync(TRUE); - - sp = &mb_stat->mbs_class[0]; - for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) { - u_int32_t mem; - - if (m_class(i) == MC_MBUF) { - m_mbufs = sp->mbcl_active; - } else if (m_class(i) == MC_CL) { - m_clfree = sp->mbcl_total - sp->mbcl_active; - } else if (m_class(i) == MC_BIGCL) { - m_bigclfree = sp->mbcl_total - sp->mbcl_active; - } else if (njcl > 0 && m_class(i) == MC_16KCL) { - m_16kclfree = sp->mbcl_total - sp->mbcl_active; - m_16kclusters = sp->mbcl_total; - } else if (m_class(i) == MC_MBUF_CL) { - m_mbufclfree = sp->mbcl_total - sp->mbcl_active; - } else if (m_class(i) == MC_MBUF_BIGCL) { - m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active; - } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) { - m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active; - } - - mem = sp->mbcl_ctotal * sp->mbcl_size; - totmem += mem; - totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) * - sp->mbcl_size; - totreturned += sp->mbcl_release_cnt; - } - - /* adjust free counts to include composite caches */ - m_clfree += m_mbufclfree; - m_bigclfree += m_mbufbigclfree; - m_16kclfree += m_mbuf16kclfree; - - totmbufs = 0; - for (mp = mbtypes; mp->mt_name != NULL; mp++) { - totmbufs += mbstat.m_mtypes[mp->mt_type]; - } - if (totmbufs > m_mbufs) { - totmbufs = m_mbufs; - } - k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs); - MBUF_DUMP_BUF_CHK(); - - bzero(&seen, sizeof(seen)); - for (mp = mbtypes; mp->mt_name != NULL; mp++) { - if (mbstat.m_mtypes[mp->mt_type] != 0) { - seen[mp->mt_type] = 1; - k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n", - mbstat.m_mtypes[mp->mt_type], mp->mt_name); - MBUF_DUMP_BUF_CHK(); - } - } - seen[MT_FREE] = 1; - for (i = 0; i < nmbtypes; i++) { - if (!seen[i] && mbstat.m_mtypes[i] != 0) { - k = scnprintf(c, clen, "\t%u mbufs allocated to " - "\n", mbstat.m_mtypes[i], i); - MBUF_DUMP_BUF_CHK(); - } - } - if ((m_mbufs - totmbufs) > 0) { - k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n", - m_mbufs - totmbufs); - MBUF_DUMP_BUF_CHK(); - } - k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n" - "%u/%u mbuf 4KB clusters in use\n", - (unsigned int)(mbstat.m_clusters - m_clfree), - (unsigned int)mbstat.m_clusters, - (unsigned int)(mbstat.m_bigclusters - m_bigclfree), - (unsigned int)mbstat.m_bigclusters); - MBUF_DUMP_BUF_CHK(); - - if (njcl > 0) { - k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n", - m_16kclusters - m_16kclfree, m_16kclusters, - njclbytes / 1024); - MBUF_DUMP_BUF_CHK(); - } - totused = totmem - totfree; - if (totmem == 0) { - totpct = 0; - } else if (totused < (ULONG_MAX / 100)) { - totpct = 
(totused * 100) / totmem; - } else { - u_long totmem1 = totmem / 100; - u_long totused1 = totused / 100; - totpct = (totused1 * 100) / totmem1; - } - k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% " - "in use)\n", totmem / 1024, totpct); - MBUF_DUMP_BUF_CHK(); - k = scnprintf(c, clen, "%lu KB returned to the system\n", - totreturned / 1024); - MBUF_DUMP_BUF_CHK(); - - net_update_uptime(); - - k = scnprintf(c, clen, - "worker thread runs: %u, expansions: %llu, cl %llu/%llu, " - "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt, - mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total, - mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt, - mb_expand_16kcl_total); - MBUF_DUMP_BUF_CHK(); - if (mbuf_worker_last_runtime != 0) { - k = scnprintf(c, clen, "worker thread last run time: " - "%llu (%llu seconds ago)\n", - mbuf_worker_last_runtime, - net_uptime() - mbuf_worker_last_runtime); - MBUF_DUMP_BUF_CHK(); - } - if (mbuf_drain_last_runtime != 0) { - k = scnprintf(c, clen, "drain routine last run time: " - "%llu (%llu seconds ago)\n", - mbuf_drain_last_runtime, - net_uptime() - mbuf_drain_last_runtime); - MBUF_DUMP_BUF_CHK(); - } - - /* - * Log where the most mbufs have accumulated: - * - Process socket buffers - * - TCP reassembly queue - * - Interface AQM queue (output) and DLIL input queue - */ - args.non_blocking = true; - proc_iterate(PROC_ALLPROCLIST, - mbuf_watchdog_defunct_iterate, &args, NULL, NULL); - if (args.top_app != NULL) { - k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n", - args.top_app_space_used, - proc_name_address(args.top_app), - proc_pid(args.top_app)); - proc_rele(args.top_app); - } - MBUF_DUMP_BUF_CHK(); - -#if INET - k = dump_tcp_reass_qlen(c, clen); - MBUF_DUMP_BUF_CHK(); -#endif /* INET */ - -#if MPTCP - k = dump_mptcp_reass_qlen(c, clen); - MBUF_DUMP_BUF_CHK(); -#endif /* MPTCP */ - -#if NETWORKING - k = dlil_dump_top_if_qlen(c, clen); - MBUF_DUMP_BUF_CHK(); -#endif /* NETWORKING */ - - /* mbuf leak detection statistics */ - mleak_update_stats(); - - k = scnprintf(c, clen, "\nmbuf leak detection table:\n"); - MBUF_DUMP_BUF_CHK(); - k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n", - mleak_table.mleak_capture / mleak_table.mleak_sample_factor, - mleak_table.mleak_sample_factor); - MBUF_DUMP_BUF_CHK(); - k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n", - mleak_table.outstanding_allocs); - MBUF_DUMP_BUF_CHK(); - k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n", - mleak_table.alloc_recorded, mleak_table.trace_recorded); - MBUF_DUMP_BUF_CHK(); - k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n", - mleak_table.alloc_collisions, mleak_table.trace_collisions); - MBUF_DUMP_BUF_CHK(); - k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n", - mleak_table.alloc_overwrites, mleak_table.trace_overwrites); - MBUF_DUMP_BUF_CHK(); - k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n", - mleak_table.total_conflicts); - MBUF_DUMP_BUF_CHK(); - - k = scnprintf(c, clen, "top %d outstanding traces:\n", - mleak_stat->ml_cnt); - MBUF_DUMP_BUF_CHK(); - for (i = 0; i < mleak_stat->ml_cnt; i++) { - mltr = &mleak_stat->ml_trace[i]; - k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), " - "%llu hit(s), %llu collision(s)\n", (i + 1), - mltr->mltr_allocs, mltr->mltr_hitcount, - mltr->mltr_collisions); - MBUF_DUMP_BUF_CHK(); - } - - if (mleak_stat->ml_isaddr64) { - k = scnprintf(c, clen, MB_LEAK_HDR_64); - } else { - k = scnprintf(c, clen, MB_LEAK_HDR_32); - 
} - MBUF_DUMP_BUF_CHK(); - - for (i = 0; i < MLEAK_STACK_DEPTH; i++) { - k = scnprintf(c, clen, "%2d: ", (i + 1)); - MBUF_DUMP_BUF_CHK(); - for (j = 0; j < mleak_stat->ml_cnt; j++) { - mltr = &mleak_stat->ml_trace[j]; - if (i < mltr->mltr_depth) { - if (mleak_stat->ml_isaddr64) { - k = scnprintf(c, clen, "0x%0llx ", - (uint64_t)VM_KERNEL_UNSLIDE( - mltr->mltr_addr[i])); - } else { - k = scnprintf(c, clen, - "0x%08x ", - (uint32_t)VM_KERNEL_UNSLIDE( - mltr->mltr_addr[i])); - } - } else { - if (mleak_stat->ml_isaddr64) { - k = scnprintf(c, clen, - MB_LEAK_SPACING_64); - } else { - k = scnprintf(c, clen, - MB_LEAK_SPACING_32); - } - } - MBUF_DUMP_BUF_CHK(); - } - k = scnprintf(c, clen, "\n"); - MBUF_DUMP_BUF_CHK(); - } - -done: - return mbuf_dump_buf; -} - -#undef MBUF_DUMP_BUF_CHK -#endif /* CONFIG_MBUF_MCACHE */ - /* * Convert between a regular and a packet header mbuf. Caller is responsible * for setting or clearing M_PKTHDR; this routine does the rest of the work. @@ -9512,7 +4741,7 @@ m_reinit(struct mbuf *m, int hdr) } else { VERIFY((m->m_flags & M_EXT) || m->m_data == (uintptr_t)m->m_dat); m->m_flags |= M_PKTHDR; - MBUF_INIT_PKTHDR(m); + mbuf_init_pkthdr(m); } } else { /* Free the aux data and tags if there is any */ @@ -9572,28 +4801,12 @@ m_ext_paired_activate(struct mbuf *m) * what's done in m_clattach() for the cluster. Bump * up MEXT_PREF to indicate activation. */ - MBUF_INIT(m, hdr, type); - MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa, + mbuf_init(m, hdr, type); + mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa, 1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m); } -void -m_scratch_init(struct mbuf *m) -{ - struct pkthdr *pkt = &m->m_pkthdr; - - VERIFY(m->m_flags & M_PKTHDR); - - /* See comments in */ - if (pkt->pkt_flags & PKTF_PRIV_GUARDED) { - panic_plain("Invalid attempt to modify guarded module-private " - "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags); - /* NOTREACHED */ - } - - bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv)); -} - +#if !CONFIG_MBUF_MCACHE /* * This routine is reserved for mbuf_get_driver_scratch(); clients inside * xnu that intend on utilizing the module-private area should directly @@ -9601,8 +4814,8 @@ m_scratch_init(struct mbuf *m) * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior * to handing it off to another module, respectively. */ -u_int32_t -m_scratch_get(struct mbuf *m, u_int8_t **p) +uint32_t +m_scratch_get(struct mbuf *m, uint8_t **p) { struct pkthdr *pkt = &m->m_pkthdr; @@ -9615,22 +4828,10 @@ m_scratch_get(struct mbuf *m, u_int8_t **p) /* NOTREACHED */ } -#if CONFIG_MBUF_MCACHE - if (mcltrace) { - mcache_audit_t *mca; - - lck_mtx_lock(mbuf_mlock); - mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); - if (mca->mca_uflags & MB_SCVALID) { - mcl_audit_scratch(mca); - } - lck_mtx_unlock(mbuf_mlock); - } -#endif /* CONFIG_MBUF_MCACHE */ - - *p = (u_int8_t *)&pkt->pkt_mpriv; + *p = (uint8_t *)&pkt->pkt_mpriv; return sizeof(pkt->pkt_mpriv); } +#endif /* !CONFIG_MBUF_MCACHE */ void m_add_crumb(struct mbuf *m, uint16_t crumb) @@ -9668,429 +4869,13 @@ m_add_hdr_crumb_chain(struct mbuf *head, uint64_t crumb, uint64_t flag) #endif /*__arm64__*/ } -__private_extern__ inline void -m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free, - caddr_t ext_arg) -{ - VERIFY(m->m_flags & M_EXT); - - m_set_rfa(m, rfa); - m->m_ext.ext_free = ext_free; - m->m_ext.ext_arg = ext_free == NULL ? 
NULL : ext_arg; -} - -__private_extern__ inline struct ext_ref * __stateful_pure -m_get_rfa(struct mbuf *m) -{ - return __unsafe_forge_single(struct ext_ref *, m->m_ext.ext_refflags); -} - -static inline void -m_set_rfa(struct mbuf *m, struct ext_ref *rfa) -{ - m->m_ext.ext_refflags = rfa; -} - -__private_extern__ inline m_ext_free_func_t __stateful_pure -m_get_ext_free(struct mbuf *m) -{ - return m->m_ext.ext_free; -} - -__private_extern__ inline caddr_t -m_get_ext_arg(struct mbuf *m) -{ - return __unsafe_forge_single(caddr_t, m->m_ext.ext_arg); -} - -#if CONFIG_MBUF_MCACHE -/* - * Simple routine to avoid taking the lock when we can't run the - * mbuf drain. - */ -static int -mbuf_drain_checks(boolean_t ignore_waiters) -{ - if (mb_drain_maxint == 0) { - return 0; - } - if (!ignore_waiters && mb_waiters != 0) { - return 0; - } - - return 1; -} - -/* - * Called by the VM when there's memory pressure or when we exhausted - * the 4k/16k reserved space. - */ -static void -mbuf_drain_locked(boolean_t ignore_waiters) -{ - mbuf_class_t mc; - mcl_slab_t *sp, *sp_tmp, *nsp; - unsigned int num, k, interval, released = 0; - unsigned long total_mem = 0, use_mem = 0; - boolean_t ret, purge_caches = FALSE; - ppnum_t offset; - mcache_obj_t *obj; - unsigned long per; - static unsigned char scratch[32]; - static ppnum_t scratch_pa = 0; - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - if (!mbuf_drain_checks(ignore_waiters)) { - return; - } - if (scratch_pa == 0) { - bzero(scratch, sizeof(scratch)); - scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch); - VERIFY(scratch_pa); - } else if (mclverify) { - /* - * Panic if a driver wrote to our scratch memory. - */ - for (k = 0; k < sizeof(scratch); k++) { - if (scratch[k]) { - panic("suspect DMA to freed address"); - } - } - } - /* - * Don't free memory too often as that could cause excessive - * waiting times for mbufs. Purge caches if we were asked to drain - * in the last 5 minutes. - */ - if (mbuf_drain_last_runtime != 0) { - interval = net_uptime() - mbuf_drain_last_runtime; - if (interval <= mb_drain_maxint) { - return; - } - if (interval <= mb_drain_maxint * 5) { - purge_caches = TRUE; - } - } - mbuf_drain_last_runtime = net_uptime(); - /* - * Don't free any memory if we're using 60% or more. - */ - for (mc = 0; mc < NELEM(mbuf_table); mc++) { - total_mem += m_total(mc) * m_maxsize(mc); - use_mem += m_active(mc) * m_maxsize(mc); - } - per = (use_mem * 100) / total_mem; - if (per >= 60) { - return; - } - /* - * Purge all the caches. This effectively disables - * caching for a few seconds, but the mbuf worker thread will - * re-enable them again. - */ - if (purge_caches == TRUE) { - for (mc = 0; mc < NELEM(mbuf_table); mc++) { - if (m_total(mc) < m_avgtotal(mc)) { - continue; - } - lck_mtx_unlock(mbuf_mlock); - ret = mcache_purge_cache(m_cache(mc), FALSE); - lck_mtx_lock(mbuf_mlock); - if (ret == TRUE) { - m_purge_cnt(mc)++; - } - } - } - /* - * Move the objects from the composite class freelist to - * the rudimentary slabs list, but keep at least 10% of the average - * total in the freelist. 
- */ - for (mc = 0; mc < NELEM(mbuf_table); mc++) { - while (m_cobjlist(mc) && - m_total(mc) < m_avgtotal(mc) && - m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) { - obj = m_cobjlist(mc); - m_cobjlist(mc) = obj->obj_next; - obj->obj_next = NULL; - num = cslab_free(mc, obj, 1); - VERIFY(num == 1); - m_free_cnt(mc)++; - m_infree(mc)--; - /* cslab_free() handles m_total */ - } - } - /* - * Free the buffers present in the slab list up to 10% of the total - * average per class. - * - * We walk the list backwards in an attempt to reduce fragmentation. - */ - for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) { - TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) { - /* - * Process only unused slabs occupying memory. - */ - if (sp->sl_refcnt != 0 || sp->sl_len == 0 || - sp->sl_base == NULL) { - continue; - } - if (m_total(mc) < m_avgtotal(mc) || - m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) { - break; - } - slab_remove(sp, mc); - switch (mc) { - case MC_MBUF: - m_infree(mc) -= NMBPG; - m_total(mc) -= NMBPG; - if (mclaudit != NULL) { - mcl_audit_free(sp->sl_base, NMBPG); - } - break; - case MC_CL: - m_infree(mc) -= NCLPG; - m_total(mc) -= NCLPG; - if (mclaudit != NULL) { - mcl_audit_free(sp->sl_base, NMBPG); - } - break; - case MC_BIGCL: - { - m_infree(mc) -= NBCLPG; - m_total(mc) -= NBCLPG; - if (mclaudit != NULL) { - mcl_audit_free(sp->sl_base, NMBPG); - } - break; - } - case MC_16KCL: - m_infree(mc)--; - m_total(mc)--; - for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { - nsp = nsp->sl_next; - VERIFY(nsp->sl_refcnt == 0 && - nsp->sl_base != NULL && - nsp->sl_len == 0); - slab_init(nsp, 0, 0, NULL, NULL, 0, 0, - 0); - nsp->sl_flags = 0; - } - if (mclaudit != NULL) { - if (sp->sl_len == PAGE_SIZE) { - mcl_audit_free(sp->sl_base, - NMBPG); - } else { - mcl_audit_free(sp->sl_base, 1); - } - } - break; - default: - /* - * The composite classes have their own - * freelist (m_cobjlist), so we only - * process rudimentary classes here. - */ - VERIFY(0); - } - m_release_cnt(mc) += m_size(mc); - released += m_size(mc); - VERIFY(sp->sl_base != NULL && - sp->sl_len >= PAGE_SIZE); - offset = MTOPG(sp->sl_base); - /* - * Make sure the IOMapper points to a valid, but - * bogus, address. This should prevent further DMA - * accesses to freed memory. - */ - IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa); - mcl_paddr[offset] = 0; - kmem_free(mb_map, (vm_offset_t)sp->sl_base, - sp->sl_len); - slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0); - sp->sl_flags = 0; - } - } - mbstat.m_drain++; - mbstat.m_bigclusters = m_total(MC_BIGCL); - mbstat.m_clusters = m_total(MC_CL); - mbstat.m_mbufs = m_total(MC_MBUF); - mbuf_stat_sync(); - mbuf_mtypes_sync(TRUE); -} - -__private_extern__ void -mbuf_drain(boolean_t ignore_waiters) -{ - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED); - if (!mbuf_drain_checks(ignore_waiters)) { - return; - } - lck_mtx_lock(mbuf_mlock); - mbuf_drain_locked(ignore_waiters); - lck_mtx_unlock(mbuf_mlock); -} - - -static int -m_drain_force_sysctl SYSCTL_HANDLER_ARGS -{ -#pragma unused(arg1, arg2) - int val = 0, err; - - err = sysctl_handle_int(oidp, &val, 0, req); - if (err != 0 || req->newptr == USER_ADDR_NULL) { - return err; - } - if (val) { - mbuf_drain(TRUE); - } - - return err; -} - -#if DEBUG || DEVELOPMENT -__printflike(3, 4) -static void -_mbwdog_logger(const char *func, const int line, const char *fmt, ...) 
-{ - va_list ap; - struct timeval now; - char str[384], p[256]; - int len; - - LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - if (mbwdog_logging == NULL) { - /* - * This might block under a mutex, which isn't really great, - * but this happens once, so we'll live. - */ - mbwdog_logging = zalloc_permanent(mbwdog_logging_size, - ZALIGN_NONE); - } - va_start(ap, fmt); - vsnprintf(p, sizeof(p), fmt, ap); - va_end(ap); - microuptime(&now); - len = scnprintf(str, sizeof(str), - "\n%ld.%d (%d/%llx) %s:%d %s", - now.tv_sec, now.tv_usec, - proc_getpid(current_proc()), - (uint64_t)VM_KERNEL_ADDRPERM(current_thread()), - func, line, p); - if (len < 0) { - return; - } - if (mbwdog_logging_used + len > mbwdog_logging_size) { - mbwdog_logging_used = mbwdog_logging_used / 2; - memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used, - mbwdog_logging_size - mbwdog_logging_used); - mbwdog_logging[mbwdog_logging_used] = 0; - } - strlcat(mbwdog_logging, str, mbwdog_logging_size); - mbwdog_logging_used += len; -} - -#endif // DEBUG || DEVELOPMENT - -static void -mtracelarge_register(size_t size) -{ - int i; - struct mtracelarge *trace; - uintptr_t bt[MLEAK_STACK_DEPTH]; - unsigned int depth; - - depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL); - /* Check if this entry is already on the list. */ - for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) { - trace = &mtracelarge_table[i]; - if (trace->size == size && trace->depth == depth && - memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) { - return; - } - } - for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) { - trace = &mtracelarge_table[i]; - if (size > trace->size) { - trace->depth = depth; - memcpy(trace->addr, bt, depth * sizeof(uintptr_t)); - trace->size = size; - break; - } - } -} - -#if DEBUG || DEVELOPMENT - -static int -mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS -{ - char *str; - - ifnet_head_lock_shared(); - lck_mtx_lock(mbuf_mlock); - - str = mbuf_dump(); - - lck_mtx_unlock(mbuf_mlock); - ifnet_head_done(); - - return sysctl_io_string(req, str, 0, 0, NULL); -} - -#endif /* DEBUG || DEVELOPMENT */ -#endif /* CONFIG_MBUF_MCACHE */ - SYSCTL_DECL(_kern_ipc); -#if DEBUG || DEVELOPMENT -#if SKYWALK && CONFIG_MBUF_MCACHE -SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor, - CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor, - MC_THRESHOLD_SCALE_DOWN_FACTOR, - "scale down factor for mbuf cache thresholds"); -#endif /* SKYWALK && CONFIG_MBUF_MCACHE */ -#if CONFIG_MBUF_MCACHE -SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump, - CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, - 0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump"); -#endif /* CONFIG_MBUF_MCACHE */ -#endif /* DEBUG || DEVELOPMENT */ SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, mbstat_sysctl, "S,mbstat", ""); SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, mb_stat_sysctl, "S,mb_stat", ""); -#if CONFIG_MBUF_MCACHE -SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, - 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", ""); -SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, - 0, 0, mleak_table_sysctl, "S,mleak_table", ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor, - CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, - CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog, - 
CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, ""); -SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, - m_drain_force_sysctl, "I", - "Forces the mbuf garbage collection to run"); -SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint, - CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0, - "Minimum time interval between garbage collection"); -#endif /* CONFIG_MBUF_MCACHE */ SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage, CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0, "Percentage of when we trigger memory-pressure for an mbuf-class"); -#if CONFIG_MBUF_MCACHE -static int mb_uses_mcache = 1; -#else -static int mb_uses_mcache = 0; -#endif /* CONFIG_MBUF_MCACHE */ -SYSCTL_INT(_kern_ipc, OID_AUTO, mb_uses_mcache, - CTLFLAG_LOCKED, &mb_uses_mcache, 0, - "Whether mbufs use mcache"); diff --git a/bsd/kern/uipc_mbuf2.c b/bsd/kern/uipc_mbuf2.c index adec1b399..a857ca240 100644 --- a/bsd/kern/uipc_mbuf2.c +++ b/bsd/kern/uipc_mbuf2.c @@ -453,25 +453,6 @@ m_tag_verify_cookie(struct m_tag *tag) #endif /* defined(HAS_APPLE_PAC) */ - -struct m_tag * -m_tag_create(uint32_t id, uint16_t type, int len, int wait, struct mbuf *buf) -{ -#ifdef MB_TAG_MBUF - /* - * Create and return an m_tag, either by re-using space in a previous tag - * or by allocating a new mbuf/cluster - */ - return m_tag_create_mbuf(id, type, (uint16_t)len, wait, buf); -#else /* MB_TAG_MBUF */ -#pragma unused(buf) - /* - * Each packet tag has its own allocation - */ - return m_tag_alloc(id, type, (uint16_t)len, wait); -#endif /* MB_TAG_MBUF */ -} - #ifdef MB_TAG_MBUF /* Get a packet tag structure along with specified data following. */ static struct m_tag * @@ -517,7 +498,121 @@ m_tag_alloc_mbuf(u_int32_t id, u_int16_t type, uint16_t len, int wait) } return t; } +#endif /* MB_TAG_MBUF */ +static struct m_tag_type_entry * +get_m_tag_type_entry(uint32_t id, uint16_t type, struct m_tag_type_stats **pmtts) +{ + m_tag_type_entry_ref_t mtte = &m_tag_type_table[KERNEL_TAG_TYPE_NONE]; + + if (pmtts != NULL) { + *pmtts = &m_tag_type_stats[KERNEL_TAG_TYPE_NONE]; + } + + if (id == KERNEL_MODULE_TAG_ID) { + switch (type) { + case KERNEL_TAG_TYPE_DUMMYNET: + case KERNEL_TAG_TYPE_IPFILT: + case KERNEL_TAG_TYPE_ENCAP: + case KERNEL_TAG_TYPE_INET6: + case KERNEL_TAG_TYPE_IPSEC: + case KERNEL_TAG_TYPE_CFIL_UDP: + case KERNEL_TAG_TYPE_PF_REASS: + case KERNEL_TAG_TYPE_AQM: + case KERNEL_TAG_TYPE_DRVAUX: + mtte = &m_tag_type_table[type]; + if (pmtts != NULL) { + *pmtts = &m_tag_type_stats[type]; + } + break; + default: +#if DEBUG || DEVELOPMENT + if (type > 0 && type < KERNEL_TAG_TYPE_COUNT) { + panic("get_m_tag_type_entry unexpected m_tag type %u", + type); + } +#endif /* DEBUG || DEVELOPMENT */ + break; + } + } + + return mtte; +} + +#ifndef MB_TAG_MBUF +static struct m_tag * +m_tag_kalloc(uint32_t id, uint16_t type, uint16_t len, int wait, struct m_tag_type_entry *mtte) +{ + struct m_tag *tag = NULL; + + tag = mtte->mt_alloc_func(id, type, len, wait); + + if (__probable(tag != NULL)) { + VERIFY(IS_P2ALIGNED(tag, sizeof(uint64_t))); + + if (__improbable(tag->m_tag_data == NULL)) { + VERIFY(len == 0); + } else { + VERIFY(len != 0); + VERIFY(IS_P2ALIGNED(tag->m_tag_data, sizeof(uint64_t))); + } + } + return tag; +} + +static void +m_tag_kfree(struct m_tag *tag, struct m_tag_type_entry *mtte) +{ + mtte->mt_free_func(tag); +} +#endif /* MB_TAG_MBUF */ + +static struct m_tag * +m_tag_alloc(uint32_t id, uint16_t type, int len, int wait) +{ + struct m_tag *tag = NULL; + 
m_tag_type_entry_ref_t mtte = NULL; + m_tag_type_stats_ref_t mtts = NULL; + + mtte = get_m_tag_type_entry(id, type, &mtts); + + if (__improbable(len < 0 || len >= MCLBYTES - sizeof(struct m_tag))) { + goto done; + } + +#ifdef MB_TAG_MBUF + tag = m_tag_alloc_mbuf(id, type, (uint16_t)len, wait); +#else /* MB_TAG_MBUF */ + /* + * Using Z_NOWAIT could cause retransmission delays when there aren't + * many other colocated types in the zone that would prime it. Use + * Z_NOPAGEWAIT instead which will only fail to allocate when zalloc + * needs to block on the VM for pages. + */ + if (wait & Z_NOWAIT) { + wait &= ~Z_NOWAIT; + wait |= Z_NOPAGEWAIT; + } + tag = m_tag_kalloc(id, type, (uint16_t)len, wait, mtte); +#endif /* MB_TAG_MBUF */ + +done: + if (__probable(tag != NULL)) { + m_tag_verify_cookie(tag); + assert3u(tag->m_tag_id, ==, id); + assert3u(tag->m_tag_type, ==, type); + assert3u(tag->m_tag_len, ==, len); + + os_atomic_inc(&mtts->mt_alloc_count, relaxed); + } else { + os_atomic_inc(&mtts->mt_alloc_failed, relaxed); + } + + return tag; +} + + +#ifdef MB_TAG_MBUF static struct m_tag * m_tag_create_mbuf(uint32_t id, uint16_t type, uint16_t len, int wait, struct mbuf *buf) { @@ -610,6 +705,24 @@ m_tag_free_mbuf(struct m_tag *t) } #endif /* MB_TAG_MBUF */ +struct m_tag * +m_tag_create(uint32_t id, uint16_t type, int len, int wait, struct mbuf *buf) +{ +#ifdef MB_TAG_MBUF + /* + * Create and return an m_tag, either by re-using space in a previous tag + * or by allocating a new mbuf/cluster + */ + return m_tag_create_mbuf(id, type, (uint16_t)len, wait, buf); +#else /* MB_TAG_MBUF */ +#pragma unused(buf) + /* + * Each packet tag has its own allocation + */ + return m_tag_alloc(id, type, (uint16_t)len, wait); +#endif /* MB_TAG_MBUF */ +} + /* * Allocations for external data are known to not have pointers for * most platforms -- for macOS this is not guaranteed @@ -684,117 +797,6 @@ m_tag_kfree_external(struct m_tag *tag) kfree_type(struct m_tag, tag); } -static struct m_tag_type_entry * -get_m_tag_type_entry(uint32_t id, uint16_t type, struct m_tag_type_stats **pmtts) -{ - m_tag_type_entry_ref_t mtte = &m_tag_type_table[KERNEL_TAG_TYPE_NONE]; - - if (pmtts != NULL) { - *pmtts = &m_tag_type_stats[KERNEL_TAG_TYPE_NONE]; - } - - if (id == KERNEL_MODULE_TAG_ID) { - switch (type) { - case KERNEL_TAG_TYPE_DUMMYNET: - case KERNEL_TAG_TYPE_IPFILT: - case KERNEL_TAG_TYPE_ENCAP: - case KERNEL_TAG_TYPE_INET6: - case KERNEL_TAG_TYPE_IPSEC: - case KERNEL_TAG_TYPE_CFIL_UDP: - case KERNEL_TAG_TYPE_PF_REASS: - case KERNEL_TAG_TYPE_AQM: - case KERNEL_TAG_TYPE_DRVAUX: - mtte = &m_tag_type_table[type]; - if (pmtts != NULL) { - *pmtts = &m_tag_type_stats[type]; - } - break; - default: -#if DEBUG || DEVELOPMENT - if (type > 0 && type < KERNEL_TAG_TYPE_COUNT) { - panic("get_m_tag_type_entry unexpected m_tag type %u", - type); - } -#endif /* DEBUG || DEVELOPMENT */ - break; - } - } - - return mtte; -} - -#ifndef MB_TAG_MBUF -static struct m_tag * -m_tag_kalloc(uint32_t id, uint16_t type, uint16_t len, int wait, struct m_tag_type_entry *mtte) -{ - struct m_tag *tag = NULL; - - tag = mtte->mt_alloc_func(id, type, len, wait); - - if (__probable(tag != NULL)) { - VERIFY(IS_P2ALIGNED(tag, sizeof(uint64_t))); - - if (__improbable(tag->m_tag_data == NULL)) { - VERIFY(len == 0); - } else { - VERIFY(len != 0); - VERIFY(IS_P2ALIGNED(tag->m_tag_data, sizeof(uint64_t))); - } - } - return tag; -} - -static void -m_tag_kfree(struct m_tag *tag, struct m_tag_type_entry *mtte) -{ - mtte->mt_free_func(tag); -} -#endif /* MB_TAG_MBUF */ - 
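The relocated m_tag_alloc() above rejects lengths outside [0, MCLBYTES - sizeof(struct m_tag)) and, in the kalloc-backed build, trades Z_NOWAIT for Z_NOPAGEWAIT so a tag allocation only fails when the zone allocator would have to block on the VM for pages. A minimal caller-side sketch of the resulting API, using only the m_tag_create()/m_tag_prepend() calls that appear in this patch; the helper name, parameter and error handling are illustrative, not part of the diff:

static int
example_stamp_packet(struct mbuf *m, uint64_t when)
{
    struct m_tag *tag;

    tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM,
        sizeof(uint64_t), M_WAITOK, m);
    if (tag == NULL) {
        /* length out of range or allocation failed */
        return ENOBUFS;
    }
    m_tag_prepend(m, tag);                 /* attach to the packet */
    *(uint64_t *)tag->m_tag_data = when;   /* payload follows the tag header */
    return 0;
}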
-struct m_tag * -m_tag_alloc(uint32_t id, uint16_t type, int len, int wait) -{ - struct m_tag *tag = NULL; - m_tag_type_entry_ref_t mtte = NULL; - m_tag_type_stats_ref_t mtts = NULL; - - mtte = get_m_tag_type_entry(id, type, &mtts); - - if (__improbable(len < 0 || len >= MCLBYTES - sizeof(struct m_tag))) { - goto done; - } - -#ifdef MB_TAG_MBUF - tag = m_tag_alloc_mbuf(id, type, (uint16_t)len, wait); -#else /* MB_TAG_MBUF */ - /* - * Using Z_NOWAIT could cause retransmission delays when there aren't - * many other colocated types in the zone that would prime it. Use - * Z_NOPAGEWAIT instead which will only fail to allocate when zalloc - * needs to block on the VM for pages. - */ - if (wait & Z_NOWAIT) { - wait &= ~Z_NOWAIT; - wait |= Z_NOPAGEWAIT; - } - tag = m_tag_kalloc(id, type, (uint16_t)len, wait, mtte); -#endif /* MB_TAG_MBUF */ - -done: - if (__probable(tag != NULL)) { - m_tag_verify_cookie(tag); - assert3u(tag->m_tag_id, ==, id); - assert3u(tag->m_tag_type, ==, type); - assert3u(tag->m_tag_len, ==, len); - - os_atomic_inc(&mtts->mt_alloc_count, relaxed); - } else { - os_atomic_inc(&mtts->mt_alloc_failed, relaxed); - } - - return tag; -} - /* Free a packet tag. */ void m_tag_free(struct m_tag *tag) @@ -1262,6 +1264,22 @@ m_sum16(struct mbuf *m, uint32_t off, uint32_t len) return (uint16_t)os_cpu_in_cksum_mbuf(m, len, off, 0); } +/* + * Write packet tx_time to the mbuf's meta data. + */ +void +mbuf_set_tx_time(struct mbuf *m, uint64_t tx_time) +{ + struct m_tag *tag = NULL; + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM, + sizeof(uint64_t), M_WAITOK, m); + if (tag != NULL) { + m_tag_prepend(m, tag); + *(uint64_t *)tag->m_tag_data = tx_time; + } +} + + static int sysctl_mb_tag_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) diff --git a/bsd/kern/uipc_mbuf_mcache.c b/bsd/kern/uipc_mbuf_mcache.c new file mode 100644 index 000000000..cd9c9a58d --- /dev/null +++ b/bsd/kern/uipc_mbuf_mcache.c @@ -0,0 +1,6207 @@ +/* + * Copyright (c) 1998-2022 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + */ +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +#include +#include + +#include + +#include +#include +#include + +#define DUMP_BUF_CHK() { \ + clen -= k; \ + if (clen < 1) \ + goto done; \ + c += k; \ +} + +#if INET +static int +dump_tcp_reass_qlen(char *str, int str_len) +{ + char *c = str; + int k, clen = str_len; + + if (tcp_reass_total_qlen != 0) { + k = scnprintf(c, clen, "\ntcp reass qlen %d\n", tcp_reass_total_qlen); + DUMP_BUF_CHK(); + } + +done: + return str_len - clen; +} +#endif /* INET */ + +#if MPTCP +static int +dump_mptcp_reass_qlen(char *str, int str_len) +{ + char *c = str; + int k, clen = str_len; + + if (mptcp_reass_total_qlen != 0) { + k = scnprintf(c, clen, "\nmptcp reass qlen %d\n", mptcp_reass_total_qlen); + DUMP_BUF_CHK(); + } + +done: + return str_len - clen; +} +#endif /* MPTCP */ + +#if NETWORKING +extern int dlil_dump_top_if_qlen(char *__counted_by(str_len), int str_len); +#endif /* NETWORKING */ + +/* + * MBUF IMPLEMENTATION NOTES. + * + * There is a total of 5 per-CPU caches: + * + * MC_MBUF: + * This is a cache of rudimentary objects of _MSIZE in size; each + * object represents an mbuf structure. This cache preserves only + * the m_type field of the mbuf during its transactions. 
+ * + * MC_CL: + * This is a cache of rudimentary objects of MCLBYTES in size; each + * object represents a mcluster structure. This cache does not + * preserve the contents of the objects during its transactions. + * + * MC_BIGCL: + * This is a cache of rudimentary objects of MBIGCLBYTES in size; each + * object represents a mbigcluster structure. This cache does not + * preserve the contents of the objects during its transaction. + * + * MC_MBUF_CL: + * This is a cache of mbufs each having a cluster attached to it. + * It is backed by MC_MBUF and MC_CL rudimentary caches. Several + * fields of the mbuf related to the external cluster are preserved + * during transactions. + * + * MC_MBUF_BIGCL: + * This is a cache of mbufs each having a big cluster attached to it. + * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several + * fields of the mbuf related to the external cluster are preserved + * during transactions. + * + * OBJECT ALLOCATION: + * + * Allocation requests are handled first at the per-CPU (mcache) layer + * before falling back to the slab layer. Performance is optimal when + * the request is satisfied at the CPU layer because global data/lock + * never gets accessed. When the slab layer is entered for allocation, + * the slab freelist will be checked first for available objects before + * the VM backing store is invoked. Slab layer operations are serialized + * for all of the caches as the mbuf global lock is held most of the time. + * Allocation paths are different depending on the class of objects: + * + * a. Rudimentary object: + * + * { m_get_common(), m_clattach(), m_mclget(), + * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(), + * composite object allocation } + * | ^ + * | | + * | +-----------------------+ + * v | + * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit() + * | ^ + * v | + * [CPU cache] -------> (found?) -------+ + * | | + * v | + * mbuf_slab_alloc() | + * | | + * v | + * +---------> [freelist] -------> (found?) -------+ + * | | + * | v + * | m_clalloc() + * | | + * | v + * +---<<---- kmem_mb_alloc() + * + * b. Composite object: + * + * { m_getpackets_internal(), m_allocpacket_internal() } + * | ^ + * | | + * | +------ (done) ---------+ + * v | + * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit() + * | ^ + * v | + * [CPU cache] -------> (found?) -------+ + * | | + * v | + * mbuf_cslab_alloc() | + * | | + * v | + * [freelist] -------> (found?) -------+ + * | | + * v | + * (rudimentary object) | + * mcache_alloc/mcache_alloc_ext() ------>>-----+ + * + * Auditing notes: If auditing is enabled, buffers will be subjected to + * integrity checks by the audit routine. This is done by verifying their + * contents against DEADBEEF (free) pattern before returning them to caller. + * As part of this step, the routine will also record the transaction and + * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will + * also restore any constructed data structure fields if necessary. + * + * OBJECT DEALLOCATION: + * + * Freeing an object simply involves placing it into the CPU cache; this + * pollutes the cache to benefit subsequent allocations. The slab layer + * will only be entered if the object is to be purged out of the cache. + * During normal operations, this happens only when the CPU layer resizes + * its bucket while it's adjusting to the allocation load. Deallocation + * paths are different depending on the class of objects: + * + * a. 
Rudimentary object: + * + * { m_free(), m_freem_list(), composite object deallocation } + * | ^ + * | | + * | +------ (done) ---------+ + * v | + * mcache_free/mcache_free_ext() | + * | | + * v | + * mbuf_slab_audit() | + * | | + * v | + * [CPU cache] ---> (not purging?) -----+ + * | | + * v | + * mbuf_slab_free() | + * | | + * v | + * [freelist] ----------->>------------+ + * (objects get purged to VM only on demand) + * + * b. Composite object: + * + * { m_free(), m_freem_list() } + * | ^ + * | | + * | +------ (done) ---------+ + * v | + * mcache_free/mcache_free_ext() | + * | | + * v | + * mbuf_cslab_audit() | + * | | + * v | + * [CPU cache] ---> (not purging?) -----+ + * | | + * v | + * mbuf_cslab_free() | + * | | + * v | + * [freelist] ---> (not purging?) -----+ + * | | + * v | + * (rudimentary object) | + * mcache_free/mcache_free_ext() ------->>------+ + * + * Auditing notes: If auditing is enabled, the audit routine will save + * any constructed data structure fields (if necessary) before filling the + * contents of the buffers with DEADBEEF (free) pattern and recording the + * transaction. Buffers that are freed (whether at CPU or slab layer) are + * expected to contain the free pattern. + * + * DEBUGGING: + * + * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this + * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally, + * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag, + * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak + * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g. + * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory. + * + * Each object is associated with exactly one mcache_audit_t structure that + * contains the information related to its last buffer transaction. Given + * an address of an object, the audit structure can be retrieved by finding + * the position of the object relevant to the base address of the cluster: + * + * +------------+ +=============+ + * | mbuf addr | | mclaudit[i] | + * +------------+ +=============+ + * | | cl_audit[0] | + * i = MTOBG(addr) +-------------+ + * | +-----> | cl_audit[1] | -----> mcache_audit_t + * b = BGTOM(i) | +-------------+ + * | | | ... | + * x = MCLIDX(b, addr) | +-------------+ + * | | | cl_audit[7] | + * +-----------------+ +-------------+ + * (e.g. x == 1) + * + * The mclaudit[] array is allocated at initialization time, but its contents + * get populated when the corresponding cluster is created. Because a page + * can be turned into NMBPG number of mbufs, we preserve enough space for the + * mbufs so that there is a 1-to-1 mapping between them. A page that never + * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the + * remaining entries unused. For 16KB cluster, only one entry from the first + * page is allocated and used for the entire object. + */ + +extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); +extern vm_map_t mb_map; /* special map */ + +static uint32_t mb_kmem_contig_failed; +static uint32_t mb_kmem_failed; +static uint32_t mb_kmem_one_failed; +/* Timestamp of allocation failures. 
*/ +static uint64_t mb_kmem_contig_failed_ts; +static uint64_t mb_kmem_failed_ts; +static uint64_t mb_kmem_one_failed_ts; +static uint64_t mb_kmem_contig_failed_size; +static uint64_t mb_kmem_failed_size; +static uint32_t mb_kmem_stats[6]; + +/* Back-end (common) layer */ +static uint64_t mb_expand_cnt; +static uint64_t mb_expand_cl_cnt; +static uint64_t mb_expand_cl_total; +static uint64_t mb_expand_bigcl_cnt; +static uint64_t mb_expand_bigcl_total; +static uint64_t mb_expand_16kcl_cnt; +static uint64_t mb_expand_16kcl_total; +static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */ +static uint32_t mbuf_worker_run_cnt; +static uint64_t mbuf_worker_last_runtime; +static uint64_t mbuf_drain_last_runtime; +static int mbuf_worker_ready; /* worker thread is runnable */ +static unsigned int ncpu; /* number of CPUs */ +static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */ +static ppnum_t mcl_pages; /* Size of array (# physical pages) */ +static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */ +static mcache_t *ref_cache; /* Cache of cluster reference & flags */ +static mcache_t *mcl_audit_con_cache; /* Audit contents cache */ +unsigned int mbuf_debug; /* patchable mbuf mcache flags */ +static unsigned int mb_normalized; /* number of packets "normalized" */ + +#define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */ +#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */ + +#define MBUF_CLASS_VALID(c) \ + ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX) + +/* + * mbuf specific mcache allocation request flags. + */ +#define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */ + +/* + * Per-cluster slab structure. + * + * A slab is a cluster control structure that contains one or more object + * chunks; the available chunks are chained in the slab's freelist (sl_head). + * Each time a chunk is taken out of the slab, the slab's reference count + * gets incremented. When all chunks have been taken out, the empty slab + * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is + * returned to a slab causes the slab's reference count to be decremented; + * it also causes the slab to be reinserted back to class's slab list, if + * it's not already done. + * + * Compartmentalizing of the object chunks into slabs allows us to easily + * merge one or more slabs together when the adjacent slabs are idle, as + * well as to convert or move a slab from one class to another; e.g. the + * mbuf cluster slab can be converted to a regular cluster slab when all + * mbufs in the slab have been freed. + * + * A slab may also span across multiple clusters for chunks larger than + * a cluster's size. In this case, only the slab of the first cluster is + * used. The rest of the slabs are marked with SLF_PARTIAL to indicate + * that they are part of the larger slab. + * + * Each slab controls a page of memory. 
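A minimal sketch of the bookkeeping the comment above describes, using the mcl_slab fields defined just below; slab_alloc()/slab_free() later in this file are the real implementations, and the helper name here is only illustrative:

static mcache_obj_t *
example_slab_pop(mcl_slab_t *sp)
{
    mcache_obj_t *buf = sp->sl_head;

    if (buf != NULL) {
        sp->sl_head = buf->obj_next;   /* unlink the first free chunk */
        buf->obj_next = NULL;
        sp->sl_refcnt++;               /* one more chunk outstanding */
    }
    return buf;
}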
+ */ +typedef struct mcl_slab { + struct mcl_slab *sl_next; /* neighboring slab */ + u_int8_t sl_class; /* controlling mbuf class */ + int8_t sl_refcnt; /* outstanding allocations */ + int8_t sl_chunks; /* chunks (bufs) in this slab */ + u_int16_t sl_flags; /* slab flags (see below) */ + u_int16_t sl_len; /* slab length */ + void *sl_base; /* base of allocated memory */ + void *sl_head; /* first free buffer */ + TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */ +} mcl_slab_t; + +#define SLF_MAPPED 0x0001 /* backed by a mapped page */ +#define SLF_PARTIAL 0x0002 /* part of another slab */ +#define SLF_DETACHED 0x0004 /* not in slab freelist */ + +/* + * The array of slabs are broken into groups of arrays per 1MB of kernel + * memory to reduce the footprint. Each group is allocated on demand + * whenever a new piece of memory mapped in from the VM crosses the 1MB + * boundary. + */ +#define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT) + +typedef struct mcl_slabg { + mcl_slab_t *slg_slab; /* group of slabs */ +} mcl_slabg_t; + +/* + * Number of slabs needed to control a 16KB cluster object. + */ +#define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT) + +/* + * Per-cluster audit structure. + */ +typedef struct { + mcache_audit_t **cl_audit; /* array of audits */ +} mcl_audit_t; + +typedef struct { + struct thread *msa_thread; /* thread doing transaction */ + struct thread *msa_pthread; /* previous transaction thread */ + uint32_t msa_tstamp; /* transaction timestamp (ms) */ + uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */ + uint16_t msa_depth; /* pc stack depth */ + uint16_t msa_pdepth; /* previous transaction pc stack */ + void *msa_stack[MCACHE_STACK_DEPTH]; + void *msa_pstack[MCACHE_STACK_DEPTH]; +} mcl_scratch_audit_t; + +typedef struct { + /* + * Size of data from the beginning of an mbuf that covers m_hdr, + * pkthdr and m_ext structures. If auditing is enabled, we allocate + * a shadow mbuf structure of this size inside each audit structure, + * and the contents of the real mbuf gets copied into it when the mbuf + * is freed. This allows us to pattern-fill the mbuf for integrity + * check, and to preserve any constructed mbuf fields (e.g. mbuf + + * cluster cache case). Note that we don't save the contents of + * clusters when they are freed; we simply pattern-fill them. + */ + u_int8_t sc_mbuf[(_MSIZE - _MHLEN) + sizeof(_m_ext_t)]; + mcl_scratch_audit_t sc_scratch __attribute__((aligned(8))); +} mcl_saved_contents_t; + +#define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t)) + +#define MCA_SAVED_MBUF_PTR(_mca) \ + ((struct mbuf *)(void *)((mcl_saved_contents_t *) \ + (_mca)->mca_contents)->sc_mbuf) +#define MCA_SAVED_MBUF_SIZE \ + (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf)) +#define MCA_SAVED_SCRATCH_PTR(_mca) \ + (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch) + +/* + * mbuf specific mcache audit flags + */ +#define MB_INUSE 0x01 /* object has not been returned to slab */ +#define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */ +#define MB_SCVALID 0x04 /* object has valid saved contents */ + +/* + * Each of the following two arrays hold up to nmbclusters elements. 
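Given the NSLABSPMB grouping above, translating a cluster address into its controlling slab is a two-level lookup: page index, then 1MB group, then offset within the group. A sketch of that mapping (slab_get() in this file is the authoritative version, MTOPG() is defined further down, and the helper name is illustrative):

static mcl_slab_t *
example_slab_lookup(void *addr)
{
    unsigned int ix  = MTOPG(addr);      /* page index relative to mbutl */
    unsigned int grp = ix / NSLABSPMB;   /* which 1MB slab group */
    unsigned int off = ix % NSLABSPMB;   /* slab within that group */

    return &slabstbl[grp]->slg_slab[off];
}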
+ */ +static mcl_audit_t *mclaudit; /* array of cluster audit information */ +static unsigned int maxclaudit; /* max # of entries in audit table */ +static mcl_slabg_t **slabstbl; /* cluster slabs table */ +static unsigned int maxslabgrp; /* max # of entries in slabs table */ +static unsigned int slabgrp; /* # of entries in slabs table */ + +/* Globals */ +unsigned char *mbutl; /* first mapped cluster address */ +static unsigned char *embutl; /* ending virtual address of mclusters */ + +static boolean_t mclverify; /* debug: pattern-checking */ +static boolean_t mcltrace; /* debug: stack tracing */ +static boolean_t mclfindleak; /* debug: leak detection */ +static boolean_t mclexpleak; /* debug: expose leak info to user space */ + +static struct timeval mb_start; /* beginning of time */ + +/* mbuf leak detection variables */ +static struct mleak_table mleak_table; +static mleak_stat_t *mleak_stat; + +#define MLEAK_STAT_SIZE(n) \ + __builtin_offsetof(mleak_stat_t, ml_trace[n]) + +struct mallocation { + mcache_obj_t *element; /* the alloc'ed element, NULL if unused */ + u_int32_t trace_index; /* mtrace index for corresponding backtrace */ + u_int32_t count; /* How many objects were requested */ + u_int64_t hitcount; /* for determining hash effectiveness */ +}; + +struct mtrace { + u_int64_t collisions; + u_int64_t hitcount; + u_int64_t allocs; + u_int64_t depth; + uintptr_t addr[MLEAK_STACK_DEPTH]; +}; + +/* Size must be a power of two for the zhash to be able to just mask off bits */ +#define MLEAK_ALLOCATION_MAP_NUM 512 +#define MLEAK_TRACE_MAP_NUM 256 + +/* + * Sample factor for how often to record a trace. This is overwritable + * by the boot-arg mleak_sample_factor. + */ +#define MLEAK_SAMPLE_FACTOR 500 + +/* + * Number of top leakers recorded. + */ +#define MLEAK_NUM_TRACES 5 + +#define MB_LEAK_SPACING_64 " " +#define MB_LEAK_SPACING_32 " " + + +#define MB_LEAK_HDR_32 "\n\ + trace [1] trace [2] trace [3] trace [4] trace [5] \n\ + ---------- ---------- ---------- ---------- ---------- \n\ +" + +#define MB_LEAK_HDR_64 "\n\ + trace [1] trace [2] trace [3] \ + trace [4] trace [5] \n\ + ------------------ ------------------ ------------------ \ + ------------------ ------------------ \n\ +" + +static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM; +static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM; + +/* Hashmaps of allocations and their corresponding traces */ +static struct mallocation *mleak_allocations; +static struct mtrace *mleak_traces; +static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES]; + +/* Lock to protect mleak tables from concurrent modification */ +static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock"); +static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp); +static lck_mtx_t *const mleak_lock = &mleak_lock_data; + +/* *Failed* large allocations. */ +struct mtracelarge { + uint64_t size; + uint64_t depth; + uintptr_t addr[MLEAK_STACK_DEPTH]; +}; + +#define MTRACELARGE_NUM_TRACES 5 +static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES]; + +static void mtracelarge_register(size_t size); + +/* The minimum number of objects that are allocated, to start. 
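The map sizes above must be powers of two precisely so the hash of an element can be reduced to a bucket index with a mask rather than a modulo; a one-line illustration (the real hashing lives in the mleak_log() path of this file, and the helper name is illustrative):

static inline u_int32_t
example_mleak_bucket(uintptr_t elem_hash)
{
    /* valid only because MLEAK_ALLOCATION_MAP_NUM is a power of two */
    return (u_int32_t)(elem_hash & (MLEAK_ALLOCATION_MAP_NUM - 1));
}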
*/ +#define MINCL 32 +#define MINBIGCL (MINCL >> 1) + +/* Low watermarks (only map in pages once free counts go below) */ +#define MBIGCL_LOWAT MINBIGCL + +#define m_cache(c) mbuf_table[c].mtbl_cache +#define m_slablist(c) mbuf_table[c].mtbl_slablist +#define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist +#define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge +#define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active +#define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt +#define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt +#define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt +#define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified +#define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt +#define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt +#define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt +#define m_region_expand(c) mbuf_table[c].mtbl_expand + +mbuf_table_t mbuf_table[] = { + /* + * The caches for mbufs, regular clusters and big clusters. + * The average total values were based on data gathered by actual + * usage patterns on iOS. + */ + { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)), + NULL, NULL, 0, 0, 0, 0, 3000, 0 }, + { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)), + NULL, NULL, 0, 0, 0, 0, 2000, 0 }, + { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)), + NULL, NULL, 0, 0, 0, 0, 1000, 0 }, + { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)), + NULL, NULL, 0, 0, 0, 0, 200, 0 }, + /* + * The following are special caches; they serve as intermediate + * caches backed by the above rudimentary caches. Each object + * in the cache is an mbuf with a cluster attached to it. Unlike + * the above caches, these intermediate caches do not directly + * deal with the slab structures; instead, the constructed + * cached elements are simply stored in the freelists. + */ + { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 }, + { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 }, + { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 }, +}; + +#if SKYWALK +#define MC_THRESHOLD_SCALE_DOWN_FACTOR 2 +static unsigned int mc_threshold_scale_down_factor = + MC_THRESHOLD_SCALE_DOWN_FACTOR; +#endif /* SKYWALK */ + +static uint32_t +m_avgtotal(mbuf_class_t c) +{ +#if SKYWALK + return if_is_fsw_transport_netagent_enabled() ? + (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) : + mbuf_table[c].mtbl_avgtotal; +#else /* !SKYWALK */ + return mbuf_table[c].mtbl_avgtotal; +#endif /* SKYWALK */ +} + +static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */ +static int mb_waiters; /* number of waiters */ + +static struct timeval mb_wdtstart; /* watchdog start timestamp */ +static char *mbuf_dump_buf; + +#define MBUF_DUMP_BUF_SIZE 4096 + +/* + * mbuf watchdog is enabled by default. It is also toggeable via the + * kern.ipc.mb_watchdog sysctl. + * Garbage collection is enabled by default on embedded platforms. + * mb_drain_maxint controls the amount of time to wait (in seconds) before + * consecutive calls to mbuf_drain(). 
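As a worked example of m_avgtotal() above, assuming Skywalk's flowswitch netagent is enabled and the default scale-down factor of 2 is in effect, the per-class targets drawn from mbuf_table[] become:

/*
 *   MC_MBUF:   3000 / 2 = 1500
 *   MC_CL:     2000 / 2 = 1000
 *   MC_BIGCL:  1000 / 2 =  500
 *   MC_16KCL:   200 / 2 =  100
 *
 * These averages are what mbuf_drain_locked() compares m_total() and
 * m_infree() against when deciding how much memory to give back.
 */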
+ */ +static unsigned int mb_watchdog = 1; +#if !XNU_TARGET_OS_OSX +static unsigned int mb_drain_maxint = 60; +#else /* XNU_TARGET_OS_OSX */ +static unsigned int mb_drain_maxint = 0; +#endif /* XNU_TARGET_OS_OSX */ + +/* The following are used to serialize m_clalloc() */ +static boolean_t mb_clalloc_busy; +static void *mb_clalloc_waitchan = &mb_clalloc_busy; +static int mb_clalloc_waiters; + +static char *mbuf_dump(void); +static void mbuf_worker_thread_init(void); +static mcache_obj_t *slab_alloc(mbuf_class_t, int); +static void slab_free(mbuf_class_t, mcache_obj_t *); +static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***, + unsigned int, int); +static void mbuf_slab_free(void *, mcache_obj_t *, int); +static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t); +static void mbuf_slab_notify(void *, u_int32_t); +static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***, + unsigned int); +static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int); +static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***, + unsigned int, int); +static void mbuf_cslab_free(void *, mcache_obj_t *, int); +static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t); +static int freelist_populate(mbuf_class_t, unsigned int, int); +static void freelist_init(mbuf_class_t); +static boolean_t mbuf_cached_above(mbuf_class_t, int); +static boolean_t mbuf_steal(mbuf_class_t, unsigned int); +static void m_reclaim(mbuf_class_t, unsigned int, boolean_t); +static int m_howmany(int, size_t); +static void mbuf_worker_thread(void); +static void mbuf_watchdog(void); +static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int); + +static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **, + size_t, unsigned int); +static void mcl_audit_free(void *, unsigned int); +static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *); +static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t); +static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t, + boolean_t); +static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t); +static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *); +static void mcl_audit_scratch(mcache_audit_t *); +static void mcl_audit_mcheck_panic(struct mbuf *); +static void mcl_audit_verify_nextptr(void *, mcache_audit_t *); + +static void mleak_activate(void); +static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t); +static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int); +static void mleak_free(mcache_obj_t *); +static void mleak_sort_traces(void); +static void mleak_update_stats(void); + +static mcl_slab_t *slab_get(void *); +static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t, + void *, void *, unsigned int, int, int); +static void slab_insert(mcl_slab_t *, mbuf_class_t); +static void slab_remove(mcl_slab_t *, mbuf_class_t); +static boolean_t slab_inrange(mcl_slab_t *, void *); +static void slab_nextptr_panic(mcl_slab_t *, void *); +static void slab_detach(mcl_slab_t *); +static boolean_t slab_is_detached(mcl_slab_t *); + +#if (DEBUG || DEVELOPMENT) +#define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__) +static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...); +static char *mbwdog_logging; +const unsigned mbwdog_logging_size = 4096; +static size_t mbwdog_logging_used; +#else +#define mbwdog_logger(fmt, ...) 
do { } while (0) +#endif /* DEBUG || DEVELOPMENT */ +static void mbuf_drain_locked(boolean_t); + +void +mbuf_mcheck(struct mbuf *m) +{ + if (__improbable(m->m_type != MT_FREE && !MBUF_IS_PAIRED(m))) { + if (mclaudit == NULL) { + panic("MCHECK: m_type=%d m=%p", + (u_int16_t)(m)->m_type, m); + } else { + mcl_audit_mcheck_panic(m); + } + } +} + +#define MBUF_IN_MAP(addr) \ + ((unsigned char *)(addr) >= mbutl && \ + (unsigned char *)(addr) < embutl) + +#define MRANGE(addr) { \ + if (!MBUF_IN_MAP(addr)) \ + panic("MRANGE: address out of range 0x%p", addr); \ +} + +/* + * Macros to obtain page index given a base cluster address + */ +#define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT) +#define PGTOM(x) (mbutl + (x << PAGE_SHIFT)) + +/* + * Macro to find the mbuf index relative to a base. + */ +#define MBPAGEIDX(c, m) \ + (((unsigned char *)(m) - (unsigned char *)(c)) >> _MSIZESHIFT) + +/* + * Same thing for 2KB cluster index. + */ +#define CLPAGEIDX(c, m) \ + (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT) + +/* + * Macro to find 4KB cluster index relative to a base + */ +#define BCLPAGEIDX(c, m) \ + (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT) + +/* + * Macro to convert BSD malloc sleep flag to mcache's + */ +#define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP) + +static int +mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int i; + + /* Ensure leak tracing turned on */ + if (!mclfindleak || !mclexpleak) { + return ENXIO; + } + + lck_mtx_lock(mleak_lock); + mleak_update_stats(); + i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES)); + lck_mtx_unlock(mleak_lock); + + return i; +} + +static int +mleak_table_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int i = 0; + + /* Ensure leak tracing turned on */ + if (!mclfindleak || !mclexpleak) { + return ENXIO; + } + + lck_mtx_lock(mleak_lock); + i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table)); + lck_mtx_unlock(mleak_lock); + + return i; +} + +void +mbuf_stat_sync(void) +{ + mb_class_stat_t *sp; + mcache_cpu_t *ccp; + mcache_t *cp; + int k, m, bktsize; + + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + for (k = 0; k < MC_MAX; k++) { + cp = m_cache(k); + ccp = &cp->mc_cpu[0]; + bktsize = ccp->cc_bktsize; + sp = mbuf_table[k].mtbl_stats; + + if (cp->mc_flags & MCF_NOCPUCACHE) { + sp->mbcl_mc_state = MCS_DISABLED; + } else if (cp->mc_purge_cnt > 0) { + sp->mbcl_mc_state = MCS_PURGING; + } else if (bktsize == 0) { + sp->mbcl_mc_state = MCS_OFFLINE; + } else { + sp->mbcl_mc_state = MCS_ONLINE; + } + + sp->mbcl_mc_cached = 0; + for (m = 0; m < ncpu; m++) { + ccp = &cp->mc_cpu[m]; + if (ccp->cc_objs > 0) { + sp->mbcl_mc_cached += ccp->cc_objs; + } + if (ccp->cc_pobjs > 0) { + sp->mbcl_mc_cached += ccp->cc_pobjs; + } + } + sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize); + sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached - + sp->mbcl_infree; + + sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt; + sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt; + sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt; + + /* Calculate total count specific to each class */ + sp->mbcl_ctotal = sp->mbcl_total; + switch (m_class(k)) { + case MC_MBUF: + /* Deduct mbufs used in composite caches */ + sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + + m_total(MC_MBUF_BIGCL) - m_total(MC_MBUF_16KCL)); + break; + + case MC_CL: + /* Deduct clusters used in composite cache */ + sp->mbcl_ctotal -= m_total(MC_MBUF_CL); + break; + + case MC_BIGCL: + /* Deduct clusters 
used in composite cache */ + sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL); + break; + + case MC_16KCL: + /* Deduct clusters used in composite cache */ + sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL); + break; + + default: + break; + } + } +} + +bool +mbuf_class_under_pressure(struct mbuf *m) +{ + int mclass = mbuf_get_class(m); + + if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) { + /* + * The above computation does not include the per-CPU cached objects. + * As a fast-path check this is good-enough. But now we do + * the "slower" count of the cached objects to know exactly the + * number of active mbufs in use. + * + * We do not take the mbuf_lock here to avoid lock-contention. Numbers + * might be slightly off but we don't try to be 100% accurate. + * At worst, we drop a packet that we shouldn't have dropped or + * we might go slightly above our memory-pressure threshold. + */ + mcache_t *cp = m_cache(mclass); + mcache_cpu_t *ccp = &cp->mc_cpu[0]; + + int bktsize = os_access_once(ccp->cc_bktsize); + uint32_t bl_total = os_access_once(cp->mc_full.bl_total); + uint32_t cached = 0; + int i; + + for (i = 0; i < ncpu; i++) { + ccp = &cp->mc_cpu[i]; + + int cc_objs = os_access_once(ccp->cc_objs); + if (cc_objs > 0) { + cached += cc_objs; + } + + int cc_pobjs = os_access_once(ccp->cc_pobjs); + if (cc_pobjs > 0) { + cached += cc_pobjs; + } + } + cached += (bl_total * bktsize); + if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) { + os_log(OS_LOG_DEFAULT, + "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u", + __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass)); + return true; + } + } + + return false; +} + +__private_extern__ void +mbinit(void) +{ + unsigned int m; + unsigned int initmcl = 0; + thread_t thread = THREAD_NULL; + + microuptime(&mb_start); + + /* + * These MBUF_ values must be equal to their private counterparts. 
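The mbuf_class_under_pressure() routine above is deliberately two-staged: a cheap check first, and only when that trips does it pay for an exact walk of the per-CPU caches. A condensed sketch of the first-stage predicate, with the percentage left as a parameter and the helper name purely illustrative:

static inline bool
example_pressure_fastpath(mbuf_class_t class, int pct)
{
    /* approximate "active" count; ignores objects parked in CPU caches */
    uint32_t active = m_total(class) - m_infree(class);

    return active >= (m_maxlimit(class) * pct) / 100;
}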
+ */ + static_assert(MBUF_EXT == M_EXT); + static_assert(MBUF_PKTHDR == M_PKTHDR); + static_assert(MBUF_EOR == M_EOR); + static_assert(MBUF_LOOP == M_LOOP); + static_assert(MBUF_BCAST == M_BCAST); + static_assert(MBUF_MCAST == M_MCAST); + static_assert(MBUF_FRAG == M_FRAG); + static_assert(MBUF_FIRSTFRAG == M_FIRSTFRAG); + static_assert(MBUF_LASTFRAG == M_LASTFRAG); + static_assert(MBUF_PROMISC == M_PROMISC); + static_assert(MBUF_HASFCS == M_HASFCS); + + static_assert(MBUF_TYPE_FREE == MT_FREE); + static_assert(MBUF_TYPE_DATA == MT_DATA); + static_assert(MBUF_TYPE_HEADER == MT_HEADER); + static_assert(MBUF_TYPE_SOCKET == MT_SOCKET); + static_assert(MBUF_TYPE_PCB == MT_PCB); + static_assert(MBUF_TYPE_RTABLE == MT_RTABLE); + static_assert(MBUF_TYPE_HTABLE == MT_HTABLE); + static_assert(MBUF_TYPE_ATABLE == MT_ATABLE); + static_assert(MBUF_TYPE_SONAME == MT_SONAME); + static_assert(MBUF_TYPE_SOOPTS == MT_SOOPTS); + static_assert(MBUF_TYPE_FTABLE == MT_FTABLE); + static_assert(MBUF_TYPE_RIGHTS == MT_RIGHTS); + static_assert(MBUF_TYPE_IFADDR == MT_IFADDR); + static_assert(MBUF_TYPE_CONTROL == MT_CONTROL); + static_assert(MBUF_TYPE_OOBDATA == MT_OOBDATA); + + static_assert(MBUF_TSO_IPV4 == CSUM_TSO_IPV4); + static_assert(MBUF_TSO_IPV6 == CSUM_TSO_IPV6); + static_assert(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL); + static_assert(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16); + static_assert(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT); + static_assert(MBUF_CSUM_REQ_IP == CSUM_IP); + static_assert(MBUF_CSUM_REQ_TCP == CSUM_TCP); + static_assert(MBUF_CSUM_REQ_UDP == CSUM_UDP); + static_assert(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6); + static_assert(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6); + static_assert(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED); + static_assert(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID); + static_assert(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID); + static_assert(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR); + + static_assert(MBUF_WAITOK == M_WAIT); + static_assert(MBUF_DONTWAIT == M_DONTWAIT); + static_assert(MBUF_COPYALL == M_COPYALL); + + static_assert(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK); + static_assert(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK); + static_assert(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE); + static_assert(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE); + static_assert(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE); + static_assert(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI); + static_assert(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI); + static_assert(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI); + static_assert(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI); + static_assert(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO); + static_assert(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO); + + static_assert(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK); + static_assert(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE); + static_assert(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI); + static_assert(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO); + + /* Module specific scratch space (32-bit alignment requirement) */ + static_assert(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) % sizeof(uint32_t))); + + /* Make sure we don't save more than we should */ + static_assert(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf)); + + if (nmbclusters == 0) { + nmbclusters = NMBCLUSTERS; + } + + /* This should be a sane (at least even) value by now */ + VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1)); + + /* Setup the mbuf table */ + mbuf_table_init(); + + static_assert(sizeof(struct mbuf) == _MSIZE); + + /* + * Allocate cluster slabs table: + * + * maxslabgrp = (N * 2048) / (1024 * 1024) + * + * Where N is 
nmbclusters rounded up to the nearest 512. This yields + * mcl_slab_g_t units, each one representing a MB of memory. + */ + maxslabgrp = + (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT; + slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *), + ZALIGN(mcl_slabg_t)); + + /* + * Allocate audit structures, if needed: + * + * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE + * + * This yields mcl_audit_t units, each one representing a page. + */ + PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug)); + mbuf_debug |= mcache_getflags(); + if (mbuf_debug & MCF_DEBUG) { + int l; + mcl_audit_t *mclad; + maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT); + mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit), + ZALIGN(mcl_audit_t)); + for (l = 0, mclad = mclaudit; l < maxclaudit; l++) { + mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *), + ZALIGN_PTR); + } + + mcl_audit_con_cache = mcache_create("mcl_audit_contents", + AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP); + VERIFY(mcl_audit_con_cache != NULL); + } + mclverify = (mbuf_debug & MCF_VERIFY); + mcltrace = (mbuf_debug & MCF_TRACE); + mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG); + mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG); + + /* Enable mbuf leak logging, with a lock to protect the tables */ + + mleak_activate(); + + /* + * Allocate structure for per-CPU statistics that's aligned + * on the CPU cache boundary; this code assumes that we never + * uninitialize this framework, since the original address + * before alignment is not saved. + */ + ncpu = ml_wait_max_cpus(); + + /* Calculate the number of pages assigned to the cluster pool */ + mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE; + mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t), + ZALIGN(ppnum_t)); + + /* Register with the I/O Bus mapper */ + mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); + + embutl = (mbutl + (nmbclusters * MCLBYTES)); + VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0); + + /* Prime up the freelist */ + PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl)); + if (initmcl != 0) { + initmcl >>= NCLPBGSHIFT; /* become a 4K unit */ + if (initmcl > m_maxlimit(MC_BIGCL)) { + initmcl = m_maxlimit(MC_BIGCL); + } + } + if (initmcl < m_minlimit(MC_BIGCL)) { + initmcl = m_minlimit(MC_BIGCL); + } + + lck_mtx_lock(mbuf_mlock); + + /* + * For classes with non-zero minimum limits, populate their freelists + * so that m_total(class) is at least m_minlimit(class). 
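A worked example of the maxslabgrp computation earlier in mbinit(), under the assumption of a 64MB pool (nmbclusters = 32768 2KB clusters):

/*
 *   N          = P2ROUNDUP(32768, 512)  = 32768 clusters
 *   pool bytes = N << MCLSHIFT          = 64MB
 *   maxslabgrp = pool bytes >> MBSHIFT  = 64 groups of 1MB each
 *
 * Each group holds NSLABSPMB slab headers and is only populated once
 * the VM actually maps pages that fall inside that megabyte.
 */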
+ */ + VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0); + freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT); + VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); + freelist_init(m_class(MC_CL)); + + for (m = 0; m < MC_MAX; m++) { + /* Make sure we didn't miss any */ + VERIFY(m_minlimit(m_class(m)) == 0 || + m_total(m_class(m)) >= m_minlimit(m_class(m))); + } + + lck_mtx_unlock(mbuf_mlock); + + (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init, + NULL, &thread); + thread_deallocate(thread); + + ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref), + 0, 0, MCR_SLEEP); + + /* Create the cache for each class */ + for (m = 0; m < MC_MAX; m++) { + void *allocfunc, *freefunc, *auditfunc, *logfunc; + u_int32_t flags; + + flags = mbuf_debug; + if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL || + m_class(m) == MC_MBUF_16KCL) { + allocfunc = mbuf_cslab_alloc; + freefunc = mbuf_cslab_free; + auditfunc = mbuf_cslab_audit; + logfunc = mleak_logger; + } else { + allocfunc = mbuf_slab_alloc; + freefunc = mbuf_slab_free; + auditfunc = mbuf_slab_audit; + logfunc = mleak_logger; + } + + if (!mclfindleak) { + flags |= MCF_NOLEAKLOG; + } + + m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m), + allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify, + (void *)(uintptr_t)m, flags, MCR_SLEEP); + } + + /* + * Set the max limit on sb_max to be 1/16 th of the size of + * memory allocated for mbuf clusters. + */ + high_sb_max = (nmbclusters << (MCLSHIFT - 4)); + if (high_sb_max < sb_max) { + /* sb_max is too large for this configuration, scale it down */ + if (high_sb_max > (1 << MBSHIFT)) { + /* We have atleast 16 M of mbuf pool */ + sb_max = high_sb_max; + } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) { + /* + * If we have more than 1M of mbufpool, cap the size of + * max sock buf at 1M + */ + sb_max = high_sb_max = (1 << MBSHIFT); + } else { + sb_max = high_sb_max; + } + } + + /* allocate space for mbuf_dump_buf */ + mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE); + + if (mbuf_debug & MCF_DEBUG) { + printf("%s: MLEN %d, MHLEN %d\n", __func__, + (int)_MLEN, (int)_MHLEN); + } + printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__, + (nmbclusters << MCLSHIFT) >> MBSHIFT, + (nclusters << MCLSHIFT) >> MBSHIFT, + (njcl << MCLSHIFT) >> MBSHIFT); +} + +/* + * Obtain a slab of object(s) from the class's freelist. + */ +static mcache_obj_t * +slab_alloc(mbuf_class_t class, int wait) +{ + mcl_slab_t *sp; + mcache_obj_t *buf; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + /* This should always be NULL for us */ + VERIFY(m_cobjlist(class) == NULL); + + /* + * Treat composite objects as having longer lifespan by using + * a slab from the reverse direction, in hoping that this could + * reduce the probability of fragmentation for slabs that hold + * more than one buffer chunks (e.g. mbuf slabs). For other + * slabs, this probably doesn't make much of a difference. 
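Returning to the sb_max clamp in mbinit() above, a worked example under the same assumed 64MB pool (nmbclusters = 32768):

/*
 *   high_sb_max = nmbclusters << (MCLSHIFT - 4)
 *               = 32768 * 128 = 4MB      (1/16 of cluster memory)
 *
 * If the configured sb_max exceeds 4MB it is scaled down to 4MB via
 * the high_sb_max > (1 << MBSHIFT) branch; pools whose 1/16 share
 * falls below 1MB are capped at 1MB instead when they still have more
 * than 1MB of clusters, and smaller pools fall back to high_sb_max.
 */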
+ */ + if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL) + && (wait & MCR_COMP)) { + sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead); + } else { + sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class)); + } + + if (sp == NULL) { + VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0); + /* The slab list for this class is empty */ + return NULL; + } + + VERIFY(m_infree(class) > 0); + VERIFY(!slab_is_detached(sp)); + VERIFY(sp->sl_class == class && + (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); + buf = sp->sl_head; + VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf)); + sp->sl_head = buf->obj_next; + /* Increment slab reference */ + sp->sl_refcnt++; + + VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks); + + if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) { + slab_nextptr_panic(sp, sp->sl_head); + /* In case sl_head is in the map but not in the slab */ + VERIFY(slab_inrange(sp, sp->sl_head)); + /* NOTREACHED */ + } + + if (mclaudit != NULL) { + mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); + mca->mca_uflags = 0; + /* Save contents on mbuf objects only */ + if (class == MC_MBUF) { + mca->mca_uflags |= MB_SCVALID; + } + } + + if (class == MC_CL) { + mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + /* + * A 2K cluster slab can have at most NCLPG references. + */ + VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG && + sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE); + VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL); + } else if (class == MC_BIGCL) { + mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) + + m_infree(MC_MBUF_BIGCL); + /* + * A 4K cluster slab can have NBCLPG references. + */ + VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG && + sp->sl_len == PAGE_SIZE && + (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL)); + } else if (class == MC_16KCL) { + mcl_slab_t *nsp; + int k; + + --m_infree(MC_16KCL); + VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); + /* + * Increment 2nd-Nth slab reference, where N is NSLABSP16KB. + * A 16KB big cluster takes NSLABSP16KB slabs, each having at + * most 1 reference. + */ + for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { + nsp = nsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + nsp->sl_refcnt++; + VERIFY(!slab_is_detached(nsp)); + VERIFY(nsp->sl_class == MC_16KCL && + nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && + nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && + nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && + nsp->sl_head == NULL); + } + } else { + VERIFY(class == MC_MBUF); + --m_infree(MC_MBUF); + /* + * If auditing is turned on, this check is + * deferred until later in mbuf_slab_audit(). + */ + if (mclaudit == NULL) { + mbuf_mcheck((struct mbuf *)buf); + } + /* + * Since we have incremented the reference count above, + * an mbuf slab (formerly a 4KB cluster slab that was cut + * up into mbufs) must have a reference count between 1 + * and NMBPG at this point. 
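For reference while reading the VERIFYs around here, the per-page chunk counts they use work out as follows under common configurations; the concrete numbers assume the usual 256-byte mbuf and are derived from the macro definitions rather than stated anywhere in this patch:

/*
 *   4KB pages:   NMBPG = 16 mbufs/page, NCLPG = 2, NBCLPG = 1,
 *                NSLABSP16KB = 4 slabs per 16KB cluster
 *   16KB pages:  NMBPG = 64 mbufs/page, NCLPG = 8, NBCLPG = 4,
 *                NSLABSP16KB = 1 (a 16KB cluster is exactly one page)
 */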
+ */ + VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG && + sp->sl_chunks == NMBPG && + sp->sl_len == PAGE_SIZE); + VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL); + } + + /* If empty, remove this slab from the class's freelist */ + if (sp->sl_head == NULL) { + VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG); + VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG); + VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG); + slab_remove(sp, class); + } + + return buf; +} + +/* + * Place a slab of object(s) back into a class's slab list. + */ +static void +slab_free(mbuf_class_t class, mcache_obj_t *buf) +{ + mcl_slab_t *sp; + boolean_t reinit_supercl = false; + mbuf_class_t super_class; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(buf->obj_next == NULL); + + /* + * Synchronizing with m_clalloc, as it reads m_total, while we here + * are modifying m_total. + */ + while (mb_clalloc_busy) { + mb_clalloc_waiters++; + (void) msleep(mb_clalloc_waitchan, mbuf_mlock, + (PZERO - 1), "m_clalloc", NULL); + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + } + + /* We are busy now; tell everyone else to go away */ + mb_clalloc_busy = TRUE; + + sp = slab_get(buf); + VERIFY(sp->sl_class == class && slab_inrange(sp, buf) && + (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); + + /* Decrement slab reference */ + sp->sl_refcnt--; + + if (class == MC_CL) { + VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); + /* + * A slab that has been splitted for 2KB clusters can have + * at most 1 outstanding reference at this point. + */ + VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) && + sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE); + VERIFY(sp->sl_refcnt < (NCLPG - 1) || + (slab_is_detached(sp) && sp->sl_head == NULL)); + } else if (class == MC_BIGCL) { + VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES)); + + /* A 4KB cluster slab can have NBCLPG references at most */ + VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG); + VERIFY(sp->sl_refcnt < (NBCLPG - 1) || + (slab_is_detached(sp) && sp->sl_head == NULL)); + } else if (class == MC_16KCL) { + mcl_slab_t *nsp; + int k; + /* + * A 16KB cluster takes NSLABSP16KB slabs, all must + * now have 0 reference. + */ + VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE)); + VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); + VERIFY(slab_is_detached(sp)); + for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { + nsp = nsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + nsp->sl_refcnt--; + VERIFY(slab_is_detached(nsp)); + VERIFY(nsp->sl_class == MC_16KCL && + (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && + nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && + nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && + nsp->sl_head == NULL); + } + } else { + /* + * A slab that has been splitted for mbufs has at most + * NMBPG reference counts. Since we have decremented + * one reference above, it must now be between 0 and + * NMBPG-1. + */ + VERIFY(class == MC_MBUF); + VERIFY(sp->sl_refcnt >= 0 && + sp->sl_refcnt <= (NMBPG - 1) && + sp->sl_chunks == NMBPG && + sp->sl_len == PAGE_SIZE); + VERIFY(sp->sl_refcnt < (NMBPG - 1) || + (slab_is_detached(sp) && sp->sl_head == NULL)); + } + + /* + * When auditing is enabled, ensure that the buffer still + * contains the free pattern. Otherwise it got corrupted + * while at the CPU cache layer. 
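When mclverify is set, the check that follows relies on a freed buffer still holding the free fill pattern laid down at free time. A conceptual sketch of what that verification amounts to; the real routine is mcache_audit_free_verify(), the pattern constant comes from the mcache layer, and the helper below is illustrative only:

static bool
example_pattern_intact(const uint64_t *buf, size_t len)
{
    for (size_t i = 0; i < len / sizeof(uint64_t); i++) {
        if (buf[i] != MCACHE_FREE_PATTERN) {
            return false;   /* written to while sitting "free" in a cache */
        }
    }
    return true;
}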
+ */ + if (mclaudit != NULL) { + mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); + if (mclverify) { + mcache_audit_free_verify(mca, buf, 0, + m_maxsize(class)); + } + mca->mca_uflags &= ~MB_SCVALID; + } + + if (class == MC_CL) { + mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + buf->obj_next = sp->sl_head; + } else if (class == MC_BIGCL) { + mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + + m_infree(MC_MBUF_BIGCL); + buf->obj_next = sp->sl_head; + } else if (class == MC_16KCL) { + ++m_infree(MC_16KCL); + } else { + ++m_infree(MC_MBUF); + buf->obj_next = sp->sl_head; + } + sp->sl_head = buf; + + /* + * If a slab has been split to either one which holds 2KB clusters, + * or one which holds mbufs, turn it back to one which holds a + * 4 or 16 KB cluster depending on the page size. + */ + if (m_maxsize(MC_BIGCL) == PAGE_SIZE) { + super_class = MC_BIGCL; + } else { + VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL)); + super_class = MC_16KCL; + } + if (class == MC_MBUF && sp->sl_refcnt == 0 && + m_total(class) >= (m_minlimit(class) + NMBPG) && + m_total(super_class) < m_maxlimit(super_class)) { + int i = NMBPG; + + m_total(MC_MBUF) -= NMBPG; + mbstat.m_mbufs = m_total(MC_MBUF); + m_infree(MC_MBUF) -= NMBPG; + mtype_stat_add(MT_FREE, -((unsigned)NMBPG)); + + while (i--) { + struct mbuf *m = sp->sl_head; + VERIFY(m != NULL); + sp->sl_head = m->m_next; + m->m_next = NULL; + } + reinit_supercl = true; + } else if (class == MC_CL && sp->sl_refcnt == 0 && + m_total(class) >= (m_minlimit(class) + NCLPG) && + m_total(super_class) < m_maxlimit(super_class)) { + int i = NCLPG; + + m_total(MC_CL) -= NCLPG; + mbstat.m_clusters = m_total(MC_CL); + m_infree(MC_CL) -= NCLPG; + + while (i--) { + union mcluster *c = sp->sl_head; + VERIFY(c != NULL); + sp->sl_head = c->mcl_next; + c->mcl_next = NULL; + } + reinit_supercl = true; + } else if (class == MC_BIGCL && super_class != MC_BIGCL && + sp->sl_refcnt == 0 && + m_total(class) >= (m_minlimit(class) + NBCLPG) && + m_total(super_class) < m_maxlimit(super_class)) { + int i = NBCLPG; + + VERIFY(super_class == MC_16KCL); + m_total(MC_BIGCL) -= NBCLPG; + mbstat.m_bigclusters = m_total(MC_BIGCL); + m_infree(MC_BIGCL) -= NBCLPG; + + while (i--) { + union mbigcluster *bc = sp->sl_head; + VERIFY(bc != NULL); + sp->sl_head = bc->mbc_next; + bc->mbc_next = NULL; + } + reinit_supercl = true; + } + + if (reinit_supercl) { + VERIFY(sp->sl_head == NULL); + VERIFY(m_total(class) >= m_minlimit(class)); + slab_remove(sp, class); + + /* Reinitialize it as a cluster for the super class */ + m_total(super_class)++; + m_infree(super_class)++; + VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) && + sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0); + + slab_init(sp, super_class, SLF_MAPPED, sp->sl_base, + sp->sl_base, PAGE_SIZE, 0, 1); + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, + (caddr_t)sp->sl_base, sp->sl_len); + } + ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL; + + if (super_class == MC_BIGCL) { + mbstat.m_bigclusters = m_total(MC_BIGCL); + mbstat.m_bigclfree = m_infree(MC_BIGCL) + + m_infree(MC_MBUF_BIGCL); + } + + VERIFY(slab_is_detached(sp)); + VERIFY(m_total(super_class) <= m_maxlimit(super_class)); + + /* And finally switch class */ + class = super_class; + } + + /* Reinsert the slab to the class's slab list */ + if (slab_is_detached(sp)) { + slab_insert(sp, class); + } + + /* We're done; let others enter */ + mb_clalloc_busy = FALSE; + if (mb_clalloc_waiters > 0) { + mb_clalloc_waiters = 0; + wakeup(mb_clalloc_waitchan); + } +} + +/* + * 
Common allocator for rudimentary objects called by the CPU cache layer + * during an allocation request whenever there is no available element in the + * bucket layer. It returns one or more elements from the appropriate global + * freelist. If the freelist is empty, it will attempt to populate it and + * retry the allocation. + */ +static unsigned int +mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) +{ + mbuf_class_t class = (mbuf_class_t)arg; + unsigned int need = num; + mcache_obj_t **list = *plist; + + ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); + ASSERT(need > 0); + + lck_mtx_lock(mbuf_mlock); + + for (;;) { + if ((*list = slab_alloc(class, wait)) != NULL) { + (*list)->obj_next = NULL; + list = *plist = &(*list)->obj_next; + + if (--need == 0) { + /* + * If the number of elements in freelist has + * dropped below low watermark, asynchronously + * populate the freelist now rather than doing + * it later when we run out of elements. + */ + if (!mbuf_cached_above(class, wait) && + m_infree(class) < (m_total(class) >> 5)) { + (void) freelist_populate(class, 1, + M_DONTWAIT); + } + break; + } + } else { + VERIFY(m_infree(class) == 0 || class == MC_CL); + + (void) freelist_populate(class, 1, + (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT); + + if (m_infree(class) > 0) { + continue; + } + + /* Check if there's anything at the cache layer */ + if (mbuf_cached_above(class, wait)) { + break; + } + + /* watchdog checkpoint */ + mbuf_watchdog(); + + /* We have nothing and cannot block; give up */ + if (wait & MCR_NOSLEEP) { + if (!(wait & MCR_TRYHARD)) { + m_fail_cnt(class)++; + mbstat.m_drops++; + break; + } + } + + /* + * If the freelist is still empty and the caller is + * willing to be blocked, sleep on the wait channel + * until an element is available. Otherwise, if + * MCR_TRYHARD is set, do our best to satisfy the + * request without having to go to sleep. + */ + if (mbuf_worker_ready && + mbuf_sleep(class, need, wait)) { + break; + } + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + } + } + + m_alloc_cnt(class) += num - need; + lck_mtx_unlock(mbuf_mlock); + + return num - need; +} + +/* + * Common de-allocator for rudimentary objects called by the CPU cache + * layer when one or more elements need to be returned to the appropriate + * global freelist. + */ +static void +mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged) +{ + mbuf_class_t class = (mbuf_class_t)arg; + mcache_obj_t *nlist; + unsigned int num = 0; + int w; + + ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); + + lck_mtx_lock(mbuf_mlock); + + for (;;) { + nlist = list->obj_next; + list->obj_next = NULL; + slab_free(class, list); + ++num; + if ((list = nlist) == NULL) { + break; + } + } + m_free_cnt(class) += num; + + if ((w = mb_waiters) > 0) { + mb_waiters = 0; + } + if (w) { + mbwdog_logger("waking up all threads"); + } + lck_mtx_unlock(mbuf_mlock); + + if (w != 0) { + wakeup(mb_waitchan); + } +} + +/* + * Common auditor for rudimentary objects called by the CPU cache layer + * during an allocation or free request. For the former, this is called + * after the objects are obtained from either the bucket or slab layer + * and before they are returned to the caller. For the latter, this is + * called immediately during free and before placing the objects into + * the bucket or slab layer. 
+ */ +static void +mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) +{ + mbuf_class_t class = (mbuf_class_t)arg; + mcache_audit_t *mca; + + ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); + + while (list != NULL) { + lck_mtx_lock(mbuf_mlock); + mca = mcl_audit_buf2mca(class, list); + + /* Do the sanity checks */ + if (class == MC_MBUF) { + mcl_audit_mbuf(mca, list, FALSE, alloc); + ASSERT(mca->mca_uflags & MB_SCVALID); + } else { + mcl_audit_cluster(mca, list, m_maxsize(class), + alloc, TRUE); + ASSERT(!(mca->mca_uflags & MB_SCVALID)); + } + /* Record this transaction */ + if (mcltrace) { + mcache_buffer_log(mca, list, m_cache(class), &mb_start); + } + + if (alloc) { + mca->mca_uflags |= MB_INUSE; + } else { + mca->mca_uflags &= ~MB_INUSE; + } + /* Unpair the object (unconditionally) */ + mca->mca_uptr = NULL; + lck_mtx_unlock(mbuf_mlock); + + list = list->obj_next; + } +} + +/* + * Common notify routine for all caches. It is called by mcache when + * one or more objects get freed. We use this indication to trigger + * the wakeup of any sleeping threads so that they can retry their + * allocation requests. + */ +static void +mbuf_slab_notify(void *arg, u_int32_t reason) +{ + mbuf_class_t class = (mbuf_class_t)arg; + int w; + + ASSERT(MBUF_CLASS_VALID(class)); + + if (reason != MCN_RETRYALLOC) { + return; + } + + lck_mtx_lock(mbuf_mlock); + if ((w = mb_waiters) > 0) { + m_notified(class)++; + mb_waiters = 0; + } + if (w) { + mbwdog_logger("waking up all threads"); + } + lck_mtx_unlock(mbuf_mlock); + + if (w != 0) { + wakeup(mb_waitchan); + } +} + +/* + * Obtain object(s) from the composite class's freelist. + */ +static unsigned int +cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) +{ + unsigned int need = num; + mcl_slab_t *sp, *clsp, *nsp; + struct mbuf *m; + mcache_obj_t **list = *plist; + void *cl; + + VERIFY(need > 0); + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + /* Get what we can from the freelist */ + while ((*list = m_cobjlist(class)) != NULL) { + MRANGE(*list); + + m = (struct mbuf *)*list; + sp = slab_get(m); + cl = m->m_ext.ext_buf; + clsp = slab_get(cl); + VERIFY(m->m_flags == M_EXT && cl != NULL); + VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m)); + + if (class == MC_MBUF_CL) { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPG); + } else { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NBCLPG); + } + + if (class == MC_MBUF_16KCL) { + int k; + for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { + nsp = nsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + VERIFY(nsp->sl_refcnt == 1); + } + } + + if ((m_cobjlist(class) = (*list)->obj_next) != NULL && + !MBUF_IN_MAP(m_cobjlist(class))) { + slab_nextptr_panic(sp, m_cobjlist(class)); + /* NOTREACHED */ + } + (*list)->obj_next = NULL; + list = *plist = &(*list)->obj_next; + + if (--need == 0) { + break; + } + } + m_infree(class) -= (num - need); + + return num - need; +} + +/* + * Place object(s) back into a composite class's freelist. 
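+ *
+ * When `purged' is non-zero the pairs are torn apart instead of being
+ * cached: the saved mbuf state is restored (if auditing is enabled),
+ * each ext_ref is collected onto ref_list and later handed back to
+ * ref_cache, and the bare mbuf and cluster are returned to their raw
+ * classes via slab_free().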
+ */ +static unsigned int +cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) +{ + mcache_obj_t *o, *tail; + unsigned int num = 0; + struct mbuf *m, *ms; + mcache_audit_t *mca = NULL; + mcache_obj_t *ref_list = NULL; + mcl_slab_t *clsp, *nsp; + void *cl; + mbuf_class_t cl_class; + + ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + if (class == MC_MBUF_CL) { + cl_class = MC_CL; + } else if (class == MC_MBUF_BIGCL) { + cl_class = MC_BIGCL; + } else { + VERIFY(class == MC_MBUF_16KCL); + cl_class = MC_16KCL; + } + + o = tail = list; + + while ((m = ms = (struct mbuf *)o) != NULL) { + mcache_obj_t *rfa, *nexto = o->obj_next; + + /* Do the mbuf sanity checks */ + if (mclaudit != NULL) { + mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); + if (mclverify) { + mcache_audit_free_verify(mca, m, 0, + m_maxsize(MC_MBUF)); + } + ms = MCA_SAVED_MBUF_PTR(mca); + } + + /* Do the cluster sanity checks */ + cl = ms->m_ext.ext_buf; + clsp = slab_get(cl); + if (mclverify) { + size_t size = m_maxsize(cl_class); + mcache_audit_free_verify(mcl_audit_buf2mca(cl_class, + (mcache_obj_t *)cl), cl, 0, size); + } + VERIFY(ms->m_type == MT_FREE); + VERIFY(ms->m_flags == M_EXT); + VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms)); + if (cl_class == MC_CL) { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPG); + } else { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NBCLPG); + } + if (cl_class == MC_16KCL) { + int k; + for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { + nsp = nsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + VERIFY(nsp->sl_refcnt == 1); + } + } + + /* + * If we're asked to purge, restore the actual mbuf using + * contents of the shadow structure (if auditing is enabled) + * and clear EXTF_COMPOSITE flag from the mbuf, as we are + * about to free it and the attached cluster into their caches. + */ + if (purged) { + /* Restore constructed mbuf fields */ + if (mclaudit != NULL) { + mcl_audit_restore_mbuf(m, mca, TRUE); + } + + MEXT_MINREF(m) = 0; + MEXT_REF(m) = 0; + MEXT_PREF(m) = 0; + MEXT_FLAGS(m) = 0; + MEXT_PRIV(m) = 0; + MEXT_PMBUF(m) = NULL; + + rfa = (mcache_obj_t *)(void *)m_get_rfa(m); + m_set_ext(m, NULL, NULL, NULL); + rfa->obj_next = ref_list; + ref_list = rfa; + + m->m_type = MT_FREE; + m->m_flags = m->m_len = 0; + m->m_next = m->m_nextpkt = NULL; + + /* Save mbuf fields and make auditing happy */ + if (mclaudit != NULL) { + mcl_audit_mbuf(mca, o, FALSE, FALSE); + } + + VERIFY(m_total(class) > 0); + m_total(class)--; + + /* Free the mbuf */ + o->obj_next = NULL; + slab_free(MC_MBUF, o); + + /* And free the cluster */ + ((mcache_obj_t *)cl)->obj_next = NULL; + if (class == MC_MBUF_CL) { + slab_free(MC_CL, cl); + } else if (class == MC_MBUF_BIGCL) { + slab_free(MC_BIGCL, cl); + } else { + slab_free(MC_16KCL, cl); + } + } + + ++num; + tail = o; + o = nexto; + } + + if (!purged) { + tail->obj_next = m_cobjlist(class); + m_cobjlist(class) = list; + m_infree(class) += num; + } else if (ref_list != NULL) { + mcache_free_ext(ref_cache, ref_list); + } + + return num; +} + +/* + * Common allocator for composite objects called by the CPU cache layer + * during an allocation request whenever there is no available element in + * the bucket layer. It returns one or more composite elements from the + * appropriate global freelist. 
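+ * (A composite element is a pre-constructed pair: an mbuf with M_EXT
+ * set whose m_ext.ext_buf already points at its companion cluster and
+ * whose ext_ref carries EXTF_COMPOSITE, so that a later free returns
+ * the pair to this cache intact rather than tearing it apart.)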
If the freelist is empty, it will attempt + * to obtain the rudimentary objects from their caches and construct them + * into composite mbuf + cluster objects. + */ +static unsigned int +mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, + int wait) +{ + mbuf_class_t class = (mbuf_class_t)arg; + mbuf_class_t cl_class = 0; + unsigned int num = 0, cnum = 0, want = needed; + mcache_obj_t *ref_list = NULL; + mcache_obj_t *mp_list = NULL; + mcache_obj_t *clp_list = NULL; + mcache_obj_t **list; + struct ext_ref *rfa; + struct mbuf *m; + void *cl; + + ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); + ASSERT(needed > 0); + + /* There should not be any slab for this class */ + VERIFY(m_slab_cnt(class) == 0 && + m_slablist(class).tqh_first == NULL && + m_slablist(class).tqh_last == NULL); + + lck_mtx_lock(mbuf_mlock); + + /* Try using the freelist first */ + num = cslab_alloc(class, plist, needed); + list = *plist; + if (num == needed) { + m_alloc_cnt(class) += num; + lck_mtx_unlock(mbuf_mlock); + return needed; + } + + lck_mtx_unlock(mbuf_mlock); + + /* + * We could not satisfy the request using the freelist alone; + * allocate from the appropriate rudimentary caches and use + * whatever we can get to construct the composite objects. + */ + needed -= num; + + /* + * Mark these allocation requests as coming from a composite cache. + * Also, if the caller is willing to be blocked, mark the request + * with MCR_FAILOK such that we don't end up sleeping at the mbuf + * slab layer waiting for the individual object when one or more + * of the already-constructed composite objects are available. + */ + wait |= MCR_COMP; + if (!(wait & MCR_NOSLEEP)) { + wait |= MCR_FAILOK; + } + + /* allocate mbufs */ + needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait); + if (needed == 0) { + ASSERT(mp_list == NULL); + goto fail; + } + + /* allocate clusters */ + if (class == MC_MBUF_CL) { + cl_class = MC_CL; + } else if (class == MC_MBUF_BIGCL) { + cl_class = MC_BIGCL; + } else { + VERIFY(class == MC_MBUF_16KCL); + cl_class = MC_16KCL; + } + needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait); + if (needed == 0) { + ASSERT(clp_list == NULL); + goto fail; + } + + needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait); + if (needed == 0) { + ASSERT(ref_list == NULL); + goto fail; + } + + /* + * By this time "needed" is MIN(mbuf, cluster, ref). Any left + * overs will get freed accordingly before we return to caller. + */ + for (cnum = 0; cnum < needed; cnum++) { + struct mbuf *ms; + + m = ms = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + + cl = clp_list; + clp_list = clp_list->obj_next; + ((mcache_obj_t *)cl)->obj_next = NULL; + + rfa = (struct ext_ref *)ref_list; + ref_list = ref_list->obj_next; + ((mcache_obj_t *)(void *)rfa)->obj_next = NULL; + + /* + * If auditing is enabled, construct the shadow mbuf + * in the audit structure instead of in the actual one. + * mbuf_cslab_audit() will take care of restoring the + * contents after the integrity check. + */ + if (mclaudit != NULL) { + mcache_audit_t *mca, *cl_mca; + + lck_mtx_lock(mbuf_mlock); + mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); + ms = MCA_SAVED_MBUF_PTR(mca); + cl_mca = mcl_audit_buf2mca(cl_class, + (mcache_obj_t *)cl); + + /* + * Pair them up. Note that this is done at the time + * the mbuf+cluster objects are constructed. This + * information should be treated as "best effort" + * debugging hint since more than one mbufs can refer + * to a cluster. 
In that case, the cluster might not + * be freed along with the mbuf it was paired with. + */ + mca->mca_uptr = cl_mca; + cl_mca->mca_uptr = mca; + + ASSERT(mca->mca_uflags & MB_SCVALID); + ASSERT(!(cl_mca->mca_uflags & MB_SCVALID)); + lck_mtx_unlock(mbuf_mlock); + + /* Technically, they are in the freelist */ + if (mclverify) { + size_t size; + + mcache_set_pattern(MCACHE_FREE_PATTERN, m, + m_maxsize(MC_MBUF)); + + if (class == MC_MBUF_CL) { + size = m_maxsize(MC_CL); + } else if (class == MC_MBUF_BIGCL) { + size = m_maxsize(MC_BIGCL); + } else { + size = m_maxsize(MC_16KCL); + } + + mcache_set_pattern(MCACHE_FREE_PATTERN, cl, + size); + } + } + + mbuf_init(ms, 0, MT_FREE); + if (class == MC_MBUF_16KCL) { + MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); + } else if (class == MC_MBUF_BIGCL) { + MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); + } else { + MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); + } + VERIFY(ms->m_flags == M_EXT); + VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms)); + + *list = (mcache_obj_t *)m; + (*list)->obj_next = NULL; + list = *plist = &(*list)->obj_next; + } + +fail: + /* + * Free up what's left of the above. + */ + if (mp_list != NULL) { + mcache_free_ext(m_cache(MC_MBUF), mp_list); + } + if (clp_list != NULL) { + mcache_free_ext(m_cache(cl_class), clp_list); + } + if (ref_list != NULL) { + mcache_free_ext(ref_cache, ref_list); + } + + lck_mtx_lock(mbuf_mlock); + if (num > 0 || cnum > 0) { + m_total(class) += cnum; + VERIFY(m_total(class) <= m_maxlimit(class)); + m_alloc_cnt(class) += num + cnum; + } + if ((num + cnum) < want) { + m_fail_cnt(class) += (want - (num + cnum)); + } + lck_mtx_unlock(mbuf_mlock); + + return num + cnum; +} + +/* + * Common de-allocator for composite objects called by the CPU cache + * layer when one or more elements need to be returned to the appropriate + * global freelist. + */ +static void +mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged) +{ + mbuf_class_t class = (mbuf_class_t)arg; + unsigned int num; + int w; + + ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); + + lck_mtx_lock(mbuf_mlock); + + num = cslab_free(class, list, purged); + m_free_cnt(class) += num; + + if ((w = mb_waiters) > 0) { + mb_waiters = 0; + } + if (w) { + mbwdog_logger("waking up all threads"); + } + + lck_mtx_unlock(mbuf_mlock); + + if (w != 0) { + wakeup(mb_waitchan); + } +} + +/* + * Common auditor for composite objects called by the CPU cache layer + * during an allocation or free request. For the former, this is called + * after the objects are obtained from either the bucket or slab layer + * and before they are returned to the caller. For the latter, this is + * called immediately during free and before placing the objects into + * the bucket or slab layer. 
+ */ +static void +mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) +{ + mbuf_class_t class = (mbuf_class_t)arg, cl_class; + mcache_audit_t *mca; + struct mbuf *m, *ms; + mcl_slab_t *clsp, *nsp; + size_t cl_size; + void *cl; + + ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); + if (class == MC_MBUF_CL) { + cl_class = MC_CL; + } else if (class == MC_MBUF_BIGCL) { + cl_class = MC_BIGCL; + } else { + cl_class = MC_16KCL; + } + cl_size = m_maxsize(cl_class); + + while ((m = ms = (struct mbuf *)list) != NULL) { + lck_mtx_lock(mbuf_mlock); + /* Do the mbuf sanity checks and record its transaction */ + mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); + mcl_audit_mbuf(mca, m, TRUE, alloc); + if (mcltrace) { + mcache_buffer_log(mca, m, m_cache(class), &mb_start); + } + + if (alloc) { + mca->mca_uflags |= MB_COMP_INUSE; + } else { + mca->mca_uflags &= ~MB_COMP_INUSE; + } + + /* + * Use the shadow mbuf in the audit structure if we are + * freeing, since the contents of the actual mbuf has been + * pattern-filled by the above call to mcl_audit_mbuf(). + */ + if (!alloc && mclverify) { + ms = MCA_SAVED_MBUF_PTR(mca); + } + + /* Do the cluster sanity checks and record its transaction */ + cl = ms->m_ext.ext_buf; + clsp = slab_get(cl); + VERIFY(ms->m_flags == M_EXT && cl != NULL); + VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms)); + if (class == MC_MBUF_CL) { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPG); + } else { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NBCLPG); + } + + if (class == MC_MBUF_16KCL) { + int k; + for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { + nsp = nsp->sl_next; + /* Next slab must already be present */ + VERIFY(nsp != NULL); + VERIFY(nsp->sl_refcnt == 1); + } + } + + + mca = mcl_audit_buf2mca(cl_class, cl); + mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE); + if (mcltrace) { + mcache_buffer_log(mca, cl, m_cache(class), &mb_start); + } + + if (alloc) { + mca->mca_uflags |= MB_COMP_INUSE; + } else { + mca->mca_uflags &= ~MB_COMP_INUSE; + } + lck_mtx_unlock(mbuf_mlock); + + list = list->obj_next; + } +} + +static void +m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size, + uint64_t alloc_size, kern_return_t error) +{ + *cnt = *cnt + 1; + *ts = net_uptime(); + if (size) { + *size = alloc_size; + } + switch (error) { + case KERN_SUCCESS: + break; + case KERN_INVALID_ARGUMENT: + mb_kmem_stats[0]++; + break; + case KERN_INVALID_ADDRESS: + mb_kmem_stats[1]++; + break; + case KERN_RESOURCE_SHORTAGE: + mb_kmem_stats[2]++; + break; + case KERN_NO_SPACE: + mb_kmem_stats[3]++; + break; + case KERN_FAILURE: + mb_kmem_stats[4]++; + break; + default: + mb_kmem_stats[5]++; + break; + } +} + +static vm_offset_t +kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err) +{ + vm_offset_t addr = 0; + kern_return_t kr = KERN_SUCCESS; + + if (!physContig) { + kr = kmem_alloc(mbmap, &addr, size, + KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF); + } else { + kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff, + 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF); + } + + if (kr != KERN_SUCCESS) { + addr = 0; + } + if (err) { + *err = kr; + } + + return addr; +} + +/* + * Allocate some number of mbuf clusters and place on cluster freelist. 
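+ *
+ * A hypothetical call such as m_clalloc(32, M_WAIT, m_maxsize(MC_BIGCL))
+ * asks for enough pages to back thirty-two page-sized MC_BIGCL
+ * clusters; on the normal path the return value is the number of
+ * buffers actually added, while the early-out path instead reports
+ * whether m_infree() already covers the request.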
+ */ +static int +m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) +{ + int i, count = 0; + vm_size_t size = 0; + int numpages = 0, large_buffer; + vm_offset_t page = 0; + mcache_audit_t *mca_list = NULL; + mcache_obj_t *con_list = NULL; + mcl_slab_t *sp; + mbuf_class_t class; + kern_return_t error; + + /* Set if a buffer allocation needs allocation of multiple pages */ + large_buffer = ((bufsize == m_maxsize(MC_16KCL)) && + PAGE_SIZE < M16KCLBYTES); + VERIFY(bufsize == m_maxsize(MC_BIGCL) || + bufsize == m_maxsize(MC_16KCL)); + + VERIFY((bufsize == PAGE_SIZE) || + (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL))); + + if (bufsize == m_size(MC_BIGCL)) { + class = MC_BIGCL; + } else { + class = MC_16KCL; + } + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + /* + * Multiple threads may attempt to populate the cluster map one + * after another. Since we drop the lock below prior to acquiring + * the physical page(s), our view of the cluster map may no longer + * be accurate, and we could end up over-committing the pages beyond + * the maximum allowed for each class. To prevent it, this entire + * operation (including the page mapping) is serialized. + */ + while (mb_clalloc_busy) { + mb_clalloc_waiters++; + (void) msleep(mb_clalloc_waitchan, mbuf_mlock, + (PZERO - 1), "m_clalloc", NULL); + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + } + + /* We are busy now; tell everyone else to go away */ + mb_clalloc_busy = TRUE; + + /* + * Honor the caller's wish to block or not block. We have a way + * to grow the pool asynchronously using the mbuf worker thread. + */ + i = m_howmany(num, bufsize); + if (i <= 0 || (wait & M_DONTWAIT)) { + goto out; + } + + lck_mtx_unlock(mbuf_mlock); + + size = round_page(i * bufsize); + page = kmem_mb_alloc(mb_map, size, large_buffer, &error); + + /* + * If we did ask for "n" 16KB physically contiguous chunks + * and didn't get them, then please try again without this + * restriction. + */ + net_update_uptime(); + if (large_buffer && page == 0) { + m_vm_error_stats(&mb_kmem_contig_failed, + &mb_kmem_contig_failed_ts, + &mb_kmem_contig_failed_size, + size, error); + page = kmem_mb_alloc(mb_map, size, 0, &error); + } + + if (page == 0) { + m_vm_error_stats(&mb_kmem_failed, + &mb_kmem_failed_ts, + &mb_kmem_failed_size, + size, error); +#if PAGE_SIZE == 4096 + if (bufsize == m_maxsize(MC_BIGCL)) { +#else + if (bufsize >= m_maxsize(MC_BIGCL)) { +#endif + /* Try for 1 page if failed */ + size = PAGE_SIZE; + page = kmem_mb_alloc(mb_map, size, 0, &error); + if (page == 0) { + m_vm_error_stats(&mb_kmem_one_failed, + &mb_kmem_one_failed_ts, + NULL, size, error); + } + } + + if (page == 0) { + lck_mtx_lock(mbuf_mlock); + goto out; + } + } + + VERIFY(IS_P2ALIGNED(page, PAGE_SIZE)); + numpages = size / PAGE_SIZE; + + /* If auditing is enabled, allocate the audit structures now */ + if (mclaudit != NULL) { + int needed; + + /* + * Yes, I realize this is a waste of memory for clusters + * that never get transformed into mbufs, as we may end + * up with NMBPG-1 unused audit structures per cluster. + * But doing so tremendously simplifies the allocation + * strategy, since at this point we are not holding the + * mbuf lock and the caller is okay to be blocked. 
+ */ + if (bufsize == PAGE_SIZE) { + needed = numpages * NMBPG; + + i = mcache_alloc_ext(mcl_audit_con_cache, + &con_list, needed, MCR_SLEEP); + + VERIFY(con_list != NULL && i == needed); + } else { + /* + * if multiple 4K pages are being used for a + * 16K cluster + */ + needed = numpages / NSLABSP16KB; + } + + i = mcache_alloc_ext(mcache_audit_cache, + (mcache_obj_t **)&mca_list, needed, MCR_SLEEP); + + VERIFY(mca_list != NULL && i == needed); + } + + lck_mtx_lock(mbuf_mlock); + + for (i = 0; i < numpages; i++, page += PAGE_SIZE) { + ppnum_t offset = + ((unsigned char *)page - mbutl) >> PAGE_SHIFT; + ppnum_t new_page = pmap_find_phys(kernel_pmap, page); + + /* + * If there is a mapper the appropriate I/O page is + * returned; zero out the page to discard its past + * contents to prevent exposing leftover kernel memory. + */ + VERIFY(offset < mcl_pages); + if (mcl_paddr_base != 0) { + bzero((void *)(uintptr_t) page, PAGE_SIZE); + new_page = IOMapperInsertPage(mcl_paddr_base, + offset, new_page); + } + mcl_paddr[offset] = new_page; + + /* Pattern-fill this fresh page */ + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, + (caddr_t)page, PAGE_SIZE); + } + if (bufsize == PAGE_SIZE) { + mcache_obj_t *buf; + /* One for the entire page */ + sp = slab_get((void *)page); + if (mclaudit != NULL) { + mcl_audit_init((void *)page, + &mca_list, &con_list, + AUDIT_CONTENTS_SIZE, NMBPG); + } + VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); + slab_init(sp, class, SLF_MAPPED, (void *)page, + (void *)page, PAGE_SIZE, 0, 1); + buf = (mcache_obj_t *)page; + buf->obj_next = NULL; + + /* Insert this slab */ + slab_insert(sp, class); + + /* Update stats now since slab_get drops the lock */ + ++m_infree(class); + ++m_total(class); + VERIFY(m_total(class) <= m_maxlimit(class)); + if (class == MC_BIGCL) { + mbstat.m_bigclfree = m_infree(MC_BIGCL) + + m_infree(MC_MBUF_BIGCL); + mbstat.m_bigclusters = m_total(MC_BIGCL); + } + ++count; + } else if ((bufsize > PAGE_SIZE) && + (i % NSLABSP16KB) == 0) { + union m16kcluster *m16kcl = (union m16kcluster *)page; + mcl_slab_t *nsp; + int k; + + /* One for the entire 16KB */ + sp = slab_get(m16kcl); + if (mclaudit != NULL) { + mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1); + } + + VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); + slab_init(sp, MC_16KCL, SLF_MAPPED, + m16kcl, m16kcl, bufsize, 0, 1); + m16kcl->m16kcl_next = NULL; + + /* + * 2nd-Nth page's slab is part of the first one, + * where N is NSLABSP16KB. 
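+			 * (On a hypothetical 4KB-page configuration,
+			 * NSLABSP16KB is 16384/4096 = 4: the first page slab
+			 * owns the 16KB buffer and the three trailing slabs
+			 * are SLF_MAPPED | SLF_PARTIAL placeholders with zero
+			 * length and no sl_head of their own.)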
+ */ + for (k = 1; k < NSLABSP16KB; k++) { + nsp = slab_get(((union mbigcluster *)page) + k); + VERIFY(nsp->sl_refcnt == 0 && + nsp->sl_flags == 0); + slab_init(nsp, MC_16KCL, + SLF_MAPPED | SLF_PARTIAL, + m16kcl, NULL, 0, 0, 0); + } + /* Insert this slab */ + slab_insert(sp, MC_16KCL); + + /* Update stats now since slab_get drops the lock */ + ++m_infree(MC_16KCL); + ++m_total(MC_16KCL); + VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); + ++count; + } + } + VERIFY(mca_list == NULL && con_list == NULL); + + /* We're done; let others enter */ + mb_clalloc_busy = FALSE; + if (mb_clalloc_waiters > 0) { + mb_clalloc_waiters = 0; + wakeup(mb_clalloc_waitchan); + } + + return count; +out: + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + mtracelarge_register(size); + + /* We're done; let others enter */ + mb_clalloc_busy = FALSE; + if (mb_clalloc_waiters > 0) { + mb_clalloc_waiters = 0; + wakeup(mb_clalloc_waitchan); + } + + /* + * When non-blocking we kick a thread if we have to grow the + * pool or if the number of free clusters is less than requested. + */ + if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) { + mbwdog_logger("waking up the worker thread to to grow %s by %d", + m_cname(class), i); + wakeup((caddr_t)&mbuf_worker_needs_wakeup); + mbuf_worker_needs_wakeup = FALSE; + } + if (class == MC_BIGCL) { + if (i > 0) { + /* + * Remember total number of 4KB clusters needed + * at this time. + */ + i += m_total(MC_BIGCL); + if (i > m_region_expand(MC_BIGCL)) { + m_region_expand(MC_BIGCL) = i; + } + } + if (m_infree(MC_BIGCL) >= num) { + return 1; + } + } else { + if (i > 0) { + /* + * Remember total number of 16KB clusters needed + * at this time. + */ + i += m_total(MC_16KCL); + if (i > m_region_expand(MC_16KCL)) { + m_region_expand(MC_16KCL) = i; + } + } + if (m_infree(MC_16KCL) >= num) { + return 1; + } + } + return 0; +} + +/* + * Populate the global freelist of the corresponding buffer class. + */ +static int +freelist_populate(mbuf_class_t class, unsigned int num, int wait) +{ + mcache_obj_t *o = NULL; + int i, numpages = 0, count; + mbuf_class_t super_class; + + VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL || + class == MC_16KCL); + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) || + PAGE_SIZE == m_maxsize(MC_16KCL)); + + if (m_maxsize(class) >= PAGE_SIZE) { + return m_clalloc(num, wait, m_maxsize(class)) != 0; + } + + /* + * The rest of the function will allocate pages and will slice + * them up into the right size + */ + + numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE; + + /* Currently assume that pages are 4K or 16K */ + if (PAGE_SIZE == m_maxsize(MC_BIGCL)) { + super_class = MC_BIGCL; + } else { + super_class = MC_16KCL; + } + + i = m_clalloc(numpages, wait, m_maxsize(super_class)); + + /* how many objects will we cut the page into? 
*/ + int numobj = PAGE_SIZE / m_maxsize(class); + + for (count = 0; count < numpages; count++) { + /* respect totals, minlimit, maxlimit */ + if (m_total(super_class) <= m_minlimit(super_class) || + m_total(class) >= m_maxlimit(class)) { + break; + } + + if ((o = slab_alloc(super_class, wait)) == NULL) { + break; + } + + struct mbuf *m = (struct mbuf *)o; + union mcluster *c = (union mcluster *)o; + union mbigcluster *mbc = (union mbigcluster *)o; + mcl_slab_t *sp = slab_get(o); + mcache_audit_t *mca = NULL; + + /* + * since one full page will be converted to MC_MBUF or + * MC_CL, verify that the reference count will match that + * assumption + */ + VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp)); + VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); + /* + * Make sure that the cluster is unmolested + * while in freelist + */ + if (mclverify) { + mca = mcl_audit_buf2mca(super_class, + (mcache_obj_t *)o); + mcache_audit_free_verify(mca, + (mcache_obj_t *)o, 0, m_maxsize(super_class)); + } + + /* Reinitialize it as an mbuf or 2K or 4K slab */ + slab_init(sp, class, sp->sl_flags, + sp->sl_base, NULL, PAGE_SIZE, 0, numobj); + + VERIFY(sp->sl_head == NULL); + + VERIFY(m_total(super_class) >= 1); + m_total(super_class)--; + + if (super_class == MC_BIGCL) { + mbstat.m_bigclusters = m_total(MC_BIGCL); + } + + m_total(class) += numobj; + VERIFY(m_total(class) <= m_maxlimit(class)); + m_infree(class) += numobj; + + i = numobj; + if (class == MC_MBUF) { + mbstat.m_mbufs = m_total(MC_MBUF); + mtype_stat_add(MT_FREE, NMBPG); + while (i--) { + /* + * If auditing is enabled, construct the + * shadow mbuf in the audit structure + * instead of the actual one. + * mbuf_slab_audit() will take care of + * restoring the contents after the + * integrity check. + */ + if (mclaudit != NULL) { + struct mbuf *ms; + mca = mcl_audit_buf2mca(MC_MBUF, + (mcache_obj_t *)m); + ms = MCA_SAVED_MBUF_PTR(mca); + ms->m_type = MT_FREE; + } else { + m->m_type = MT_FREE; + } + m->m_next = sp->sl_head; + sp->sl_head = (void *)m++; + } + } else if (class == MC_CL) { /* MC_CL */ + mbstat.m_clfree = + m_infree(MC_CL) + m_infree(MC_MBUF_CL); + mbstat.m_clusters = m_total(MC_CL); + while (i--) { + c->mcl_next = sp->sl_head; + sp->sl_head = (void *)c++; + } + } else { + VERIFY(class == MC_BIGCL); + mbstat.m_bigclusters = m_total(MC_BIGCL); + mbstat.m_bigclfree = m_infree(MC_BIGCL) + + m_infree(MC_MBUF_BIGCL); + while (i--) { + mbc->mbc_next = sp->sl_head; + sp->sl_head = (void *)mbc++; + } + } + + /* Insert into the mbuf or 2k or 4k slab list */ + slab_insert(sp, class); + + if ((i = mb_waiters) > 0) { + mb_waiters = 0; + } + if (i != 0) { + mbwdog_logger("waking up all threads"); + wakeup(mb_waitchan); + } + } + return count != 0; +} + +/* + * For each class, initialize the freelist to hold m_minlimit() objects. + */ +static void +freelist_init(mbuf_class_t class) +{ + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(class == MC_CL || class == MC_BIGCL); + VERIFY(m_total(class) == 0); + VERIFY(m_minlimit(class) > 0); + + while (m_total(class) < m_minlimit(class)) { + (void) freelist_populate(class, m_minlimit(class), M_WAIT); + } + + VERIFY(m_total(class) >= m_minlimit(class)); +} + +/* + * (Inaccurately) check if it might be worth a trip back to the + * mcache layer due the availability of objects there. We'll + * end up back here if there's nothing up there. 
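+ *
+ * MCR_COMP marks a request that originated in one of the composite
+ * (mbuf + cluster) caches; in that case there is no point growing the
+ * raw class when already-constructed composites are sitting in the
+ * bucket layer above, since the original request can be satisfied
+ * from those directly.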
+ */ +static boolean_t +mbuf_cached_above(mbuf_class_t class, int wait) +{ + switch (class) { + case MC_MBUF: + if (wait & MCR_COMP) { + return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) || + !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)); + } + break; + + case MC_CL: + if (wait & MCR_COMP) { + return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)); + } + break; + + case MC_BIGCL: + if (wait & MCR_COMP) { + return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)); + } + break; + + case MC_16KCL: + if (wait & MCR_COMP) { + return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)); + } + break; + + case MC_MBUF_CL: + case MC_MBUF_BIGCL: + case MC_MBUF_16KCL: + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + return !mcache_bkt_isempty(m_cache(class)); +} + +/* + * If possible, convert constructed objects to raw ones. + */ +static boolean_t +mbuf_steal(mbuf_class_t class, unsigned int num) +{ + mcache_obj_t *top = NULL; + mcache_obj_t **list = ⊤ + unsigned int tot = 0; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + switch (class) { + case MC_MBUF: + case MC_CL: + case MC_BIGCL: + case MC_16KCL: + return FALSE; + + case MC_MBUF_CL: + case MC_MBUF_BIGCL: + case MC_MBUF_16KCL: + /* Get the required number of constructed objects if possible */ + if (m_infree(class) > m_minlimit(class)) { + tot = cslab_alloc(class, &list, + MIN(num, m_infree(class))); + } + + /* And destroy them to get back the raw objects */ + if (top != NULL) { + (void) cslab_free(class, top, 1); + } + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + return tot == num; +} + +static void +m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) +{ + int m, bmap = 0; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); + VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); + VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); + + /* + * This logic can be made smarter; for now, simply mark + * all other related classes as potential victims. + */ + switch (class) { + case MC_MBUF: + m_wantpurge(MC_CL)++; + m_wantpurge(MC_BIGCL)++; + m_wantpurge(MC_MBUF_CL)++; + m_wantpurge(MC_MBUF_BIGCL)++; + break; + + case MC_CL: + m_wantpurge(MC_MBUF)++; + m_wantpurge(MC_BIGCL)++; + m_wantpurge(MC_MBUF_BIGCL)++; + if (!comp) { + m_wantpurge(MC_MBUF_CL)++; + } + break; + + case MC_BIGCL: + m_wantpurge(MC_MBUF)++; + m_wantpurge(MC_CL)++; + m_wantpurge(MC_MBUF_CL)++; + if (!comp) { + m_wantpurge(MC_MBUF_BIGCL)++; + } + break; + + case MC_16KCL: + if (!comp) { + m_wantpurge(MC_MBUF_16KCL)++; + } + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + /* + * Run through each marked class and check if we really need to + * purge (and therefore temporarily disable) the per-CPU caches + * layer used by the class. If so, remember the classes since + * we are going to drop the lock below prior to purging. + */ + for (m = 0; m < MC_MAX; m++) { + if (m_wantpurge(m) > 0) { + m_wantpurge(m) = 0; + /* + * Try hard to steal the required number of objects + * from the freelist of other mbuf classes. Only + * purge and disable the per-CPU caches layer when + * we don't have enough; it's the last resort. 
+ */ + if (!mbuf_steal(m, num)) { + bmap |= (1 << m); + } + } + } + + lck_mtx_unlock(mbuf_mlock); + + if (bmap != 0) { + /* signal the domains to drain */ + net_drain_domains(); + + /* Sigh; we have no other choices but to ask mcache to purge */ + for (m = 0; m < MC_MAX; m++) { + if ((bmap & (1 << m)) && + mcache_purge_cache(m_cache(m), TRUE)) { + lck_mtx_lock(mbuf_mlock); + m_purge_cnt(m)++; + mbstat.m_drain++; + lck_mtx_unlock(mbuf_mlock); + } + } + } else { + /* + * Request mcache to reap extra elements from all of its caches; + * note that all reaps are serialized and happen only at a fixed + * interval. + */ + mcache_reap(); + } + lck_mtx_lock(mbuf_mlock); +} + +struct mbuf * +m_get_common(int wait, short type, int hdr) +{ + struct mbuf *m; + + int mcflags = MSLEEPF(wait); + + /* Is this due to a non-blocking retry? If so, then try harder */ + if (mcflags & MCR_NOSLEEP) { + mcflags |= MCR_TRYHARD; + } + + m = mcache_alloc(m_cache(MC_MBUF), mcflags); + if (m != NULL) { + mbuf_init(m, hdr, type); + mtype_stat_inc(type); + mtype_stat_dec(MT_FREE); + } + return m; +} + +/* + * Space allocation routines; these are also available as macros + * for critical paths. + */ +#define _M_GETHDR(wait, type) m_get_common(wait, type, 1) + +struct mbuf * +m_free(struct mbuf *m) +{ + struct mbuf *n = m->m_next; + + if (m->m_type == MT_FREE) { + panic("m_free: freeing an already freed mbuf"); + } + + if (m->m_flags & M_PKTHDR) { + /* Free the aux data and tags if there is any */ + m_tag_delete_chain(m); + + m_do_tx_compl_callback(m, NULL); + } + + if (m->m_flags & M_EXT) { + if (MBUF_IS_PAIRED(m) && m_free_paired(m)) { + return n; + } + /* + * Make sure that we don't touch any ext_ref + * member after we decrement the reference count + * since that may lead to use-after-free + * when we do not hold the last reference. + */ + const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE); + const m_ext_free_func_t m_free_func = m_get_ext_free(m); + const uint16_t minref = MEXT_MINREF(m); + const uint16_t refcnt = m_decref(m); + + if (refcnt == minref && !composite) { + if (m_free_func == NULL) { + mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); + } else if (m_free_func == m_bigfree) { + mcache_free(m_cache(MC_BIGCL), + m->m_ext.ext_buf); + } else if (m_free_func == m_16kfree) { + mcache_free(m_cache(MC_16KCL), + m->m_ext.ext_buf); + } else { + (*m_free_func)(m->m_ext.ext_buf, + m->m_ext.ext_size, m_get_ext_arg(m)); + } + mcache_free(ref_cache, m_get_rfa(m)); + m_set_ext(m, NULL, NULL, NULL); + } else if (refcnt == minref && composite) { + VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED)); + + mtype_stat_dec(m->m_type); + mtype_stat_inc(MT_FREE); + + m->m_type = MT_FREE; + m->m_flags = M_EXT; + m->m_len = 0; + m->m_next = m->m_nextpkt = NULL; + /* + * MEXT_FLAGS is safe to access here + * since we are now sure that we held + * the last reference to ext_ref. 
+ */ + MEXT_FLAGS(m) &= ~EXTF_READONLY; + + /* "Free" into the intermediate cache */ + if (m_free_func == NULL) { + mcache_free(m_cache(MC_MBUF_CL), m); + } else if (m_free_func == m_bigfree) { + mcache_free(m_cache(MC_MBUF_BIGCL), m); + } else { + VERIFY(m_free_func == m_16kfree); + mcache_free(m_cache(MC_MBUF_16KCL), m); + } + return n; + } + } + + mtype_stat_dec(m->m_type); + mtype_stat_inc(MT_FREE); + + m->m_type = MT_FREE; + m->m_flags = m->m_len = 0; + m->m_next = m->m_nextpkt = NULL; + + mcache_free(m_cache(MC_MBUF), m); + + return n; +} + +__private_extern__ struct mbuf * +m_clattach(struct mbuf *m, int type, caddr_t extbuf, + void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg, + int wait, int pair) +{ + struct ext_ref *rfa = NULL; + + /* + * If pairing is requested and an existing mbuf is provided, reject + * it if it's already been paired to another cluster. Otherwise, + * allocate a new one or free any existing below. + */ + if ((m != NULL && MBUF_IS_PAIRED(m)) || + (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) { + return NULL; + } + + if (m->m_flags & M_EXT) { + /* + * Make sure that we don't touch any ext_ref + * member after we decrement the reference count + * since that may lead to use-after-free + * when we do not hold the last reference. + */ + const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE); + VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL); + const m_ext_free_func_t m_free_func = m_get_ext_free(m); + const uint16_t minref = MEXT_MINREF(m); + const uint16_t refcnt = m_decref(m); + + if (refcnt == minref && !composite) { + if (m_free_func == NULL) { + mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); + } else if (m_free_func == m_bigfree) { + mcache_free(m_cache(MC_BIGCL), + m->m_ext.ext_buf); + } else if (m_free_func == m_16kfree) { + mcache_free(m_cache(MC_16KCL), + m->m_ext.ext_buf); + } else { + (*m_free_func)(m->m_ext.ext_buf, + m->m_ext.ext_size, m_get_ext_arg(m)); + } + /* Re-use the reference structure */ + rfa = m_get_rfa(m); + } else if (refcnt == minref && composite) { + VERIFY(m->m_type != MT_FREE); + + mtype_stat_dec(m->m_type); + mtype_stat_inc(MT_FREE); + + m->m_type = MT_FREE; + m->m_flags = M_EXT; + m->m_len = 0; + m->m_next = m->m_nextpkt = NULL; + + /* + * MEXT_FLAGS is safe to access here + * since we are now sure that we held + * the last reference to ext_ref. + */ + MEXT_FLAGS(m) &= ~EXTF_READONLY; + + /* "Free" into the intermediate cache */ + if (m_free_func == NULL) { + mcache_free(m_cache(MC_MBUF_CL), m); + } else if (m_free_func == m_bigfree) { + mcache_free(m_cache(MC_MBUF_BIGCL), m); + } else { + VERIFY(m_free_func == m_16kfree); + mcache_free(m_cache(MC_MBUF_16KCL), m); + } + /* + * Allocate a new mbuf, since we didn't divorce + * the composite mbuf + cluster pair above. + */ + if ((m = _M_GETHDR(wait, type)) == NULL) { + return NULL; + } + } + } + + if (rfa == NULL && + (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { + m_free(m); + return NULL; + } + + if (!pair) { + mext_init(m, extbuf, extsize, extfree, extarg, rfa, + 0, 1, 0, 0, 0, NULL); + } else { + mext_init(m, extbuf, extsize, extfree, (caddr_t)m, rfa, + 1, 1, 1, EXTF_PAIRED, 0, m); + } + + return m; +} + +/* + * Perform `fast' allocation mbuf clusters from a cache of recently-freed + * clusters. (If the cache is empty, new clusters are allocated en-masse.) 
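+ *
+ * A typical (hypothetical) caller:
+ *
+ *	struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+ *	if (m == NULL)
+ *		return ENOBUFS;
+ *
+ * which, on success, yields a packet-header mbuf with a 2KB cluster
+ * already attached via the MC_MBUF_CL composite cache.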
+ */ +struct mbuf * +m_getcl(int wait, int type, int flags) +{ + struct mbuf *m = NULL; + int hdr = (flags & M_PKTHDR); + + int mcflags = MSLEEPF(wait); + + /* Is this due to a non-blocking retry? If so, then try harder */ + if (mcflags & MCR_NOSLEEP) { + mcflags |= MCR_TRYHARD; + } + + m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); + if (m != NULL) { + u_int16_t flag; + struct ext_ref *rfa; + void *cl; + + VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); + cl = m->m_ext.ext_buf; + rfa = m_get_rfa(m); + + ASSERT(cl != NULL && rfa != NULL); + VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL); + + flag = MEXT_FLAGS(m); + + mbuf_init(m, hdr, type); + MBUF_CL_INIT(m, cl, rfa, 1, flag); + + mtype_stat_inc(type); + mtype_stat_dec(MT_FREE); + } + return m; +} + +/* m_mclget() add an mbuf cluster to a normal mbuf */ +struct mbuf * +m_mclget(struct mbuf *m, int wait) +{ + struct ext_ref *rfa = NULL; + + if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { + return m; + } + m->m_ext.ext_buf = m_mclalloc(wait); + if (m->m_ext.ext_buf != NULL) { + MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); + } else { + mcache_free(ref_cache, rfa); + } + + return m; +} + +/* Allocate an mbuf cluster */ +caddr_t +m_mclalloc(int wait) +{ + int mcflags = MSLEEPF(wait); + + /* Is this due to a non-blocking retry? If so, then try harder */ + if (mcflags & MCR_NOSLEEP) { + mcflags |= MCR_TRYHARD; + } + + return mcache_alloc(m_cache(MC_CL), mcflags); +} + +/* Free an mbuf cluster */ +void +m_mclfree(caddr_t p) +{ + mcache_free(m_cache(MC_CL), p); +} + +__private_extern__ caddr_t +m_bigalloc(int wait) +{ + int mcflags = MSLEEPF(wait); + + /* Is this due to a non-blocking retry? If so, then try harder */ + if (mcflags & MCR_NOSLEEP) { + mcflags |= MCR_TRYHARD; + } + + return mcache_alloc(m_cache(MC_BIGCL), mcflags); +} + +__private_extern__ void +m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg) +{ + mcache_free(m_cache(MC_BIGCL), p); +} + +/* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */ +__private_extern__ struct mbuf * +m_mbigget(struct mbuf *m, int wait) +{ + struct ext_ref *rfa = NULL; + + if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { + return m; + } + m->m_ext.ext_buf = m_bigalloc(wait); + if (m->m_ext.ext_buf != NULL) { + MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); + } else { + mcache_free(ref_cache, rfa); + } + return m; +} + +__private_extern__ caddr_t +m_16kalloc(int wait) +{ + int mcflags = MSLEEPF(wait); + + /* Is this due to a non-blocking retry? If so, then try harder */ + if (mcflags & MCR_NOSLEEP) { + mcflags |= MCR_TRYHARD; + } + + return mcache_alloc(m_cache(MC_16KCL), mcflags); +} + +__private_extern__ void +m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg) +{ + mcache_free(m_cache(MC_16KCL), p); +} + +/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */ +__private_extern__ struct mbuf * +m_m16kget(struct mbuf *m, int wait) +{ + struct ext_ref *rfa = NULL; + + if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { + return m; + } + m->m_ext.ext_buf = m_16kalloc(wait); + if (m->m_ext.ext_buf != NULL) { + MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); + } else { + mcache_free(ref_cache, rfa); + } + + return m; +} + +/* + * Return a list of mbuf hdrs that point to clusters. Try for num_needed; + * if wantall is not set, return whatever number were available. Set up the + * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these + * are chained on the m_nextpkt field. 
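+ * (For example, a hypothetical m_getpackets_internal(&n, n, M_DONTWAIT,
+ * 0, m_maxsize(MC_CL)) with n == 32 requests thirty-two 2KB-cluster
+ * packets, every one carrying a pkthdr; on return n is updated to the
+ * number actually built.)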
Any packets requested beyond this + * are chained onto the last packet header's m_next field. The size of + * the cluster is controlled by the parameter bufsize. + */ +__private_extern__ struct mbuf * +m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, + int wait, int wantall, size_t bufsize) +{ + struct mbuf *m = NULL; + struct mbuf **np, *top; + unsigned int pnum, needed = *num_needed; + mcache_obj_t *mp_list = NULL; + int mcflags = MSLEEPF(wait); + mcache_t *cp; + u_int16_t flag; + struct ext_ref *rfa; + void *cl; + + ASSERT(bufsize == m_maxsize(MC_CL) || + bufsize == m_maxsize(MC_BIGCL) || + bufsize == m_maxsize(MC_16KCL)); + + top = NULL; + np = ⊤ + pnum = 0; + + /* + * The caller doesn't want all the requested buffers; only some. + * Try hard to get what we can, but don't block. This effectively + * overrides MCR_SLEEP, since this thread will not go to sleep + * if we can't get all the buffers. + */ + if (!wantall || (mcflags & MCR_NOSLEEP)) { + mcflags |= MCR_TRYHARD; + } + + /* Allocate the composite mbuf + cluster elements from the cache */ + if (bufsize == m_maxsize(MC_CL)) { + cp = m_cache(MC_MBUF_CL); + } else if (bufsize == m_maxsize(MC_BIGCL)) { + cp = m_cache(MC_MBUF_BIGCL); + } else { + cp = m_cache(MC_MBUF_16KCL); + } + needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags); + + for (pnum = 0; pnum < needed; pnum++) { + m = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + + VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); + cl = m->m_ext.ext_buf; + rfa = m_get_rfa(m); + + ASSERT(cl != NULL && rfa != NULL); + VERIFY(MBUF_IS_COMPOSITE(m)); + + flag = MEXT_FLAGS(m); + + mbuf_init(m, num_with_pkthdrs, MT_DATA); + if (bufsize == m_maxsize(MC_16KCL)) { + MBUF_16KCL_INIT(m, cl, rfa, 1, flag); + } else if (bufsize == m_maxsize(MC_BIGCL)) { + MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); + } else { + MBUF_CL_INIT(m, cl, rfa, 1, flag); + } + + if (num_with_pkthdrs > 0) { + --num_with_pkthdrs; + } + + *np = m; + if (num_with_pkthdrs > 0) { + np = &m->m_nextpkt; + } else { + np = &m->m_next; + } + } + ASSERT(pnum != *num_needed || mp_list == NULL); + if (mp_list != NULL) { + mcache_free_ext(cp, mp_list); + } + if (pnum > 0) { + mtype_stat_add(MT_DATA, pnum); + mtype_stat_sub(MT_FREE, pnum); + } + + if (wantall && (pnum != *num_needed)) { + if (top != NULL) { + m_freem_list(top); + } + return NULL; + } + + if (pnum > *num_needed) { + printf("%s: File a radar related to . \ + needed = %u, pnum = %u, num_needed = %u \n", + __func__, needed, pnum, *num_needed); + } + *num_needed = pnum; + + return top; +} + +/* + * Return list of mbuf linked by m_nextpkt. Try for numlist, and if + * wantall is not set, return whatever number were available. The size of + * each mbuf in the list is controlled by the parameter packetlen. Each + * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf + * in the chain is called a segment. If maxsegments is not null and the + * value pointed to is not null, this specify the maximum number of segments + * for a chain of mbufs. If maxsegments is zero or the value pointed to + * is zero the caller does not have any restriction on the number of segments. + * The actual number of segments of a mbuf chain is return in the value + * pointed to by maxsegments. 
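+ *
+ * Worked example (hypothetical sizes): for packetlen = 20000 and
+ * wantsize = 0, bufsize becomes m_maxsize(MC_16KCL), so
+ * nsegs = ((20000 - 1) >> M16KCLSHIFT) + 1 = 2 and the residual
+ * 20000 % 16384 = 3616 bytes ride in a 4KB composite; each packet is
+ * then the chain [mbuf + 16KB cluster] -> [mbuf + 4KB cluster], and
+ * *maxsegments (if supplied) is set to 2.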
+ */ +__private_extern__ struct mbuf * +m_allocpacket_internal(unsigned int *numlist, size_t packetlen, + unsigned int *maxsegments, int wait, int wantall, size_t wantsize) +{ + struct mbuf **np, *top, *first = NULL; + size_t bufsize, r_bufsize; + unsigned int num = 0; + unsigned int nsegs = 0; + unsigned int needed = 0, resid; + int mcflags = MSLEEPF(wait); + mcache_obj_t *mp_list = NULL, *rmp_list = NULL; + mcache_t *cp = NULL, *rcp = NULL; + + if (*numlist == 0) { + os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0"); + return NULL; + } + + top = NULL; + np = ⊤ + + if (wantsize == 0) { + if (packetlen <= MINCLSIZE) { + bufsize = packetlen; + } else if (packetlen > m_maxsize(MC_CL)) { + /* Use 4KB if jumbo cluster pool isn't available */ + if (packetlen <= m_maxsize(MC_BIGCL)) { + bufsize = m_maxsize(MC_BIGCL); + } else { + bufsize = m_maxsize(MC_16KCL); + } + } else { + bufsize = m_maxsize(MC_CL); + } + } else if (wantsize == m_maxsize(MC_CL) || + wantsize == m_maxsize(MC_BIGCL) || + wantsize == m_maxsize(MC_16KCL)) { + bufsize = wantsize; + } else { + *numlist = 0; + os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported"); + return NULL; + } + + if (bufsize <= MHLEN) { + nsegs = 1; + } else if (bufsize <= MINCLSIZE) { + if (maxsegments != NULL && *maxsegments == 1) { + bufsize = m_maxsize(MC_CL); + nsegs = 1; + } else { + nsegs = 2; + } + } else if (bufsize == m_maxsize(MC_16KCL)) { + nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1; + } else if (bufsize == m_maxsize(MC_BIGCL)) { + nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1; + } else { + nsegs = ((packetlen - 1) >> MCLSHIFT) + 1; + } + if (maxsegments != NULL) { + if (*maxsegments && nsegs > *maxsegments) { + *maxsegments = nsegs; + *numlist = 0; + os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments"); + return NULL; + } + *maxsegments = nsegs; + } + + /* + * The caller doesn't want all the requested buffers; only some. + * Try hard to get what we can, but don't block. This effectively + * overrides MCR_SLEEP, since this thread will not go to sleep + * if we can't get all the buffers. + */ + if (!wantall || (mcflags & MCR_NOSLEEP)) { + mcflags |= MCR_TRYHARD; + } + + /* + * Simple case where all elements in the lists/chains are mbufs. + * Unless bufsize is greater than MHLEN, each segment chain is made + * up of exactly 1 mbuf. Otherwise, each segment chain is made up + * of 2 mbufs; the second one is used for the residual data, i.e. + * the remaining data that cannot fit into the first mbuf. + */ + if (bufsize <= MINCLSIZE) { + /* Allocate the elements in one shot from the mbuf cache */ + ASSERT(bufsize <= MHLEN || nsegs == 2); + cp = m_cache(MC_MBUF); + needed = mcache_alloc_ext(cp, &mp_list, + (*numlist) * nsegs, mcflags); + + /* + * The number of elements must be even if we are to use an + * mbuf (instead of a cluster) to store the residual data. + * If we couldn't allocate the requested number of mbufs, + * trim the number down (if it's odd) in order to avoid + * creating a partial segment chain. 
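+	 * For instance, if 7 mbufs come back for two-mbuf segment
+	 * chains, the count is trimmed to 6 and three complete chains
+	 * are built; the odd mbuf stays on mp_list and is returned to
+	 * the MC_MBUF cache at the fail: label.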
+ */ + if (bufsize > MHLEN && (needed & 0x1)) { + needed--; + } + + while (num < needed) { + struct mbuf *m = NULL; + + m = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + ASSERT(m != NULL); + + mbuf_init(m, 1, MT_DATA); + num++; + if (bufsize > MHLEN) { + /* A second mbuf for this segment chain */ + m->m_next = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + + ASSERT(m->m_next != NULL); + + mbuf_init(m->m_next, 0, MT_DATA); + num++; + } + *np = m; + np = &m->m_nextpkt; + } + ASSERT(num != *numlist || mp_list == NULL); + + if (num > 0) { + mtype_stat_add(MT_DATA, num); + mtype_stat_sub(MT_FREE, num); + } + num /= nsegs; + + /* We've got them all; return to caller */ + if (num == *numlist) { + return top; + } + + goto fail; + } + + /* + * Complex cases where elements are made up of one or more composite + * mbufs + cluster, depending on packetlen. Each N-segment chain can + * be illustrated as follows: + * + * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N] + * + * Every composite mbuf + cluster element comes from the intermediate + * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency, + * the last composite element will come from the MC_MBUF_CL cache, + * unless the residual data is larger than 2KB where we use the + * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual + * data is defined as extra data beyond the first element that cannot + * fit into the previous element, i.e. there is no residual data if + * the chain only has 1 segment. + */ + r_bufsize = bufsize; + resid = packetlen > bufsize ? packetlen % bufsize : 0; + if (resid > 0) { + /* There is residual data; figure out the cluster size */ + if (wantsize == 0 && packetlen > MINCLSIZE) { + /* + * Caller didn't request that all of the segments + * in the chain use the same cluster size; use the + * smaller of the cluster sizes. + */ + if (resid > m_maxsize(MC_BIGCL)) { + r_bufsize = m_maxsize(MC_16KCL); + } else if (resid > m_maxsize(MC_CL)) { + r_bufsize = m_maxsize(MC_BIGCL); + } else { + r_bufsize = m_maxsize(MC_CL); + } + } else { + /* Use the same cluster size as the other segments */ + resid = 0; + } + } + + needed = *numlist; + if (resid > 0) { + /* + * Attempt to allocate composite mbuf + cluster elements for + * the residual data in each chain; record the number of such + * elements that can be allocated so that we know how many + * segment chains we can afford to create. + */ + if (r_bufsize <= m_maxsize(MC_CL)) { + rcp = m_cache(MC_MBUF_CL); + } else if (r_bufsize <= m_maxsize(MC_BIGCL)) { + rcp = m_cache(MC_MBUF_BIGCL); + } else { + rcp = m_cache(MC_MBUF_16KCL); + } + needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags); + if (needed == 0) { + goto fail; + } + + /* This is temporarily reduced for calculation */ + ASSERT(nsegs > 1); + nsegs--; + } + + /* + * Attempt to allocate the rest of the composite mbuf + cluster + * elements for the number of segment chains that we need. 
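+	 * E.g. (hypothetical counts) with *numlist = 10, nsegs = 2 and a
+	 * residual: 10 residual composites are reserved above, nsegs is
+	 * temporarily dropped to 1, so this step asks for 10 * 1 lead
+	 * elements; needed is then bumped back up by needed / nsegs and
+	 * nsegs restored to 2, for 20 elements forming 10 two-segment
+	 * chains.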
+ */ + if (bufsize <= m_maxsize(MC_CL)) { + cp = m_cache(MC_MBUF_CL); + } else if (bufsize <= m_maxsize(MC_BIGCL)) { + cp = m_cache(MC_MBUF_BIGCL); + } else { + cp = m_cache(MC_MBUF_16KCL); + } + needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags); + + /* Round it down to avoid creating a partial segment chain */ + needed = (needed / nsegs) * nsegs; + if (needed == 0) { + goto fail; + } + + if (resid > 0) { + /* + * We're about to construct the chain(s); take into account + * the number of segments we have created above to hold the + * residual data for each chain, as well as restore the + * original count of segments per chain. + */ + ASSERT(nsegs > 0); + needed += needed / nsegs; + nsegs++; + } + + for (;;) { + struct mbuf *m = NULL; + u_int16_t flag; + struct ext_ref *rfa; + void *cl; + int pkthdr; + m_ext_free_func_t m_free_func; + + ++num; + + if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) { + m = (struct mbuf *)mp_list; + mp_list = mp_list->obj_next; + } else { + m = (struct mbuf *)rmp_list; + rmp_list = rmp_list->obj_next; + } + m_free_func = m_get_ext_free(m); + ASSERT(m != NULL); + VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); + VERIFY(m_free_func == NULL || m_free_func == m_bigfree || + m_free_func == m_16kfree); + + cl = m->m_ext.ext_buf; + rfa = m_get_rfa(m); + + ASSERT(cl != NULL && rfa != NULL); + VERIFY(MBUF_IS_COMPOSITE(m)); + + flag = MEXT_FLAGS(m); + + pkthdr = (nsegs == 1 || (num % nsegs) == 1); + if (pkthdr) { + first = m; + } + mbuf_init(m, pkthdr, MT_DATA); + if (m_free_func == m_16kfree) { + MBUF_16KCL_INIT(m, cl, rfa, 1, flag); + } else if (m_free_func == m_bigfree) { + MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); + } else { + MBUF_CL_INIT(m, cl, rfa, 1, flag); + } + + *np = m; + if ((num % nsegs) == 0) { + np = &first->m_nextpkt; + } else { + np = &m->m_next; + } + + if (num == needed) { + break; + } + } + + if (num > 0) { + mtype_stat_add(MT_DATA, num); + mtype_stat_sub(MT_FREE, num); + } + + num /= nsegs; + + /* We've got them all; return to caller */ + if (num == *numlist) { + ASSERT(mp_list == NULL && rmp_list == NULL); + return top; + } + +fail: + /* Free up what's left of the above */ + if (mp_list != NULL) { + mcache_free_ext(cp, mp_list); + } + if (rmp_list != NULL) { + mcache_free_ext(rcp, rmp_list); + } + if (wantall && top != NULL) { + m_freem_list(top); + *numlist = 0; + return NULL; + } + *numlist = num; + return top; +} + +/* + * Free an mbuf list (m_nextpkt) while following m_next. Returns the count + * for mbufs packets freed. Used by the drivers. 
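+ *
+ * Illustrative use (hypothetical driver snippet): after a transmit
+ * completion, an entire batch of packets linked through m_nextpkt can
+ * be handed back in one call instead of one m_freem() per packet:
+ *
+ *	int freed = m_freem_list(completed_chain);
+ *
+ * where completed_chain and freed are placeholder names; the return
+ * value counts packets, not individual mbufs.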
+ */ +int +m_freem_list(struct mbuf *m) +{ + struct mbuf *nextpkt; + mcache_obj_t *mp_list = NULL; + mcache_obj_t *mcl_list = NULL; + mcache_obj_t *mbc_list = NULL; + mcache_obj_t *m16k_list = NULL; + mcache_obj_t *m_mcl_list = NULL; + mcache_obj_t *m_mbc_list = NULL; + mcache_obj_t *m_m16k_list = NULL; + mcache_obj_t *ref_list = NULL; + int pktcount = 0; + int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0; + + while (m != NULL) { + pktcount++; + + nextpkt = m->m_nextpkt; + m->m_nextpkt = NULL; + + while (m != NULL) { + struct mbuf *next = m->m_next; + mcache_obj_t *o, *rfa; + if (m->m_type == MT_FREE) { + panic("m_free: freeing an already freed mbuf"); + } + + if (m->m_flags & M_PKTHDR) { + /* Free the aux data and tags if there is any */ + m_tag_delete_chain(m); + m_do_tx_compl_callback(m, NULL); + } + + if (!(m->m_flags & M_EXT)) { + mt_free++; + goto simple_free; + } + + if (MBUF_IS_PAIRED(m) && m_free_paired(m)) { + m = next; + continue; + } + + mt_free++; + + o = (mcache_obj_t *)(void *)m->m_ext.ext_buf; + /* + * Make sure that we don't touch any ext_ref + * member after we decrement the reference count + * since that may lead to use-after-free + * when we do not hold the last reference. + */ + const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE); + const m_ext_free_func_t m_free_func = m_get_ext_free(m); + const uint16_t minref = MEXT_MINREF(m); + const uint16_t refcnt = m_decref(m); + if (refcnt == minref && !composite) { + if (m_free_func == NULL) { + o->obj_next = mcl_list; + mcl_list = o; + } else if (m_free_func == m_bigfree) { + o->obj_next = mbc_list; + mbc_list = o; + } else if (m_free_func == m_16kfree) { + o->obj_next = m16k_list; + m16k_list = o; + } else { + (*(m_free_func))((caddr_t)o, + m->m_ext.ext_size, + m_get_ext_arg(m)); + } + rfa = (mcache_obj_t *)(void *)m_get_rfa(m); + rfa->obj_next = ref_list; + ref_list = rfa; + m_set_ext(m, NULL, NULL, NULL); + } else if (refcnt == minref && composite) { + VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED)); + /* + * Amortize the costs of atomic operations + * by doing them at the end, if possible. + */ + if (m->m_type == MT_DATA) { + mt_data++; + } else if (m->m_type == MT_HEADER) { + mt_header++; + } else if (m->m_type == MT_SONAME) { + mt_soname++; + } else if (m->m_type == MT_TAG) { + mt_tag++; + } else { + mtype_stat_dec(m->m_type); + } + + m->m_type = MT_FREE; + m->m_flags = M_EXT; + m->m_len = 0; + m->m_next = m->m_nextpkt = NULL; + + /* + * MEXT_FLAGS is safe to access here + * since we are now sure that we held + * the last reference to ext_ref. + */ + MEXT_FLAGS(m) &= ~EXTF_READONLY; + + /* "Free" into the intermediate cache */ + o = (mcache_obj_t *)m; + if (m_free_func == NULL) { + o->obj_next = m_mcl_list; + m_mcl_list = o; + } else if (m_free_func == m_bigfree) { + o->obj_next = m_mbc_list; + m_mbc_list = o; + } else { + VERIFY(m_free_func == m_16kfree); + o->obj_next = m_m16k_list; + m_m16k_list = o; + } + m = next; + continue; + } +simple_free: + /* + * Amortize the costs of atomic operations + * by doing them at the end, if possible. 
+ */ + if (m->m_type == MT_DATA) { + mt_data++; + } else if (m->m_type == MT_HEADER) { + mt_header++; + } else if (m->m_type == MT_SONAME) { + mt_soname++; + } else if (m->m_type == MT_TAG) { + mt_tag++; + } else if (m->m_type != MT_FREE) { + mtype_stat_dec(m->m_type); + } + + m->m_type = MT_FREE; + m->m_flags = m->m_len = 0; + m->m_next = m->m_nextpkt = NULL; + + ((mcache_obj_t *)m)->obj_next = mp_list; + mp_list = (mcache_obj_t *)m; + + m = next; + } + + m = nextpkt; + } + + if (mt_free > 0) { + mtype_stat_add(MT_FREE, mt_free); + } + if (mt_data > 0) { + mtype_stat_sub(MT_DATA, mt_data); + } + if (mt_header > 0) { + mtype_stat_sub(MT_HEADER, mt_header); + } + if (mt_soname > 0) { + mtype_stat_sub(MT_SONAME, mt_soname); + } + if (mt_tag > 0) { + mtype_stat_sub(MT_TAG, mt_tag); + } + if (mp_list != NULL) { + mcache_free_ext(m_cache(MC_MBUF), mp_list); + } + if (mcl_list != NULL) { + mcache_free_ext(m_cache(MC_CL), mcl_list); + } + if (mbc_list != NULL) { + mcache_free_ext(m_cache(MC_BIGCL), mbc_list); + } + if (m16k_list != NULL) { + mcache_free_ext(m_cache(MC_16KCL), m16k_list); + } + if (m_mcl_list != NULL) { + mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list); + } + if (m_mbc_list != NULL) { + mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list); + } + if (m_m16k_list != NULL) { + mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list); + } + if (ref_list != NULL) { + mcache_free_ext(ref_cache, ref_list); + } + + return pktcount; +} + +/* + * Equivalent to m_copym except that all necessary mbuf hdrs are allocated + * within this routine also. + * + * The last mbuf and offset accessed are passed in and adjusted on return to + * avoid having to iterate over the entire mbuf chain each time. + */ +struct mbuf * +m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait, + struct mbuf **m_lastm, int *m_off, uint32_t mode) +{ + struct mbuf *m = m0, *n, **np = NULL; + int off = off0, len = len0; + struct mbuf *top = NULL; + int mcflags = MSLEEPF(wait); + mcache_obj_t *list = NULL; + int copyhdr = 0; + int type = 0; + int needed = 0; + + if (off == 0 && (m->m_flags & M_PKTHDR)) { + copyhdr = 1; + } + + if (m_lastm != NULL && *m_lastm != NULL) { + if (off0 >= *m_off) { + m = *m_lastm; + off = off0 - *m_off; + } + } + + while (off >= m->m_len) { + off -= m->m_len; + m = m->m_next; + } + + n = m; + while (len > 0) { + needed++; + len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0))); + n = n->m_next; + } + needed++; + len = len0; + + /* + * If the caller doesn't want to be put to sleep, mark it with + * MCR_TRYHARD so that we may reclaim buffers from other places + * before giving up. + */ + if (mcflags & MCR_NOSLEEP) { + mcflags |= MCR_TRYHARD; + } + + if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed, + mcflags) != needed) { + goto nospace; + } + + needed = 0; + while (len > 0) { + n = (struct mbuf *)list; + list = list->obj_next; + ASSERT(n != NULL && m != NULL); + + type = (top == NULL) ? 
MT_HEADER : m->m_type; + mbuf_init(n, (top == NULL), type); + + if (top == NULL) { + top = n; + np = &top->m_next; + continue; + } else { + needed++; + *np = n; + } + + if (copyhdr) { + if ((mode == M_COPYM_MOVE_HDR) || + (mode == M_COPYM_MUST_MOVE_HDR)) { + M_COPY_PKTHDR(n, m); + } else if ((mode == M_COPYM_COPY_HDR) || + (mode == M_COPYM_MUST_COPY_HDR)) { + if (m_dup_pkthdr(n, m, wait) == 0) { + goto nospace; + } + } + n->m_pkthdr.len = len; + copyhdr = 0; + } + n->m_len = MIN(len, (m->m_len - off)); + + if (m->m_flags & M_EXT) { + n->m_ext = m->m_ext; + m_incref(m); + n->m_data = m->m_data + off; + n->m_flags |= M_EXT; + } else { + if (m_mtod_end(n) > m_mtod_upper_bound(n)) { + panic("%s n %p copy overflow", + __func__, n); + } + + bcopy(mtod(m, caddr_t) + off, mtod(n, caddr_t), + (unsigned)n->m_len); + } + len -= n->m_len; + + if (len == 0) { + if (m_lastm != NULL) { + *m_lastm = m; + *m_off = off0 + len0 - (off + n->m_len); + } + break; + } + off = 0; + m = m->m_next; + np = &n->m_next; + } + + mtype_stat_inc(MT_HEADER); + mtype_stat_add(type, needed); + mtype_stat_sub(MT_FREE, needed + 1); + + ASSERT(list == NULL); + + return top; + +nospace: + if (list != NULL) { + mcache_free_ext(m_cache(MC_MBUF), list); + } + if (top != NULL) { + m_freem(top); + } + return NULL; +} + +#ifndef MBUF_GROWTH_NORMAL_THRESH +#define MBUF_GROWTH_NORMAL_THRESH 25 +#endif + +/* + * Cluster freelist allocation check. + */ +static int +m_howmany(int num, size_t bufsize) +{ + int i = 0, j = 0; + u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters; + u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree; + u_int32_t sumclusters, freeclusters; + u_int32_t percent_pool, percent_kmem; + u_int32_t mb_growth, mb_growth_thresh; + + VERIFY(bufsize == m_maxsize(MC_BIGCL) || + bufsize == m_maxsize(MC_16KCL)); + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + /* Numbers in 2K cluster units */ + m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; + m_clusters = m_total(MC_CL); + m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; + m_16kclusters = m_total(MC_16KCL); + sumclusters = m_mbclusters + m_clusters + m_bigclusters; + + m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT; + m_clfree = m_infree(MC_CL); + m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT; + m_16kclfree = m_infree(MC_16KCL); + freeclusters = m_mbfree + m_clfree + m_bigclfree; + + /* Bail if we've maxed out the mbuf memory map */ + if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) || + (bufsize == m_maxsize(MC_16KCL) && + (m_16kclusters << NCLPJCLSHIFT) >= njcl)) { + mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)", + sumclusters, nclusters, + (m_16kclusters << NCLPJCLSHIFT), njcl); + return 0; + } + + if (bufsize == m_maxsize(MC_BIGCL)) { + /* Under minimum */ + if (m_bigclusters < m_minlimit(MC_BIGCL)) { + return m_minlimit(MC_BIGCL) - m_bigclusters; + } + + percent_pool = + ((sumclusters - freeclusters) * 100) / sumclusters; + percent_kmem = (sumclusters * 100) / nclusters; + + /* + * If a light/normal user, grow conservatively (75%) + * If a heavy user, grow aggressively (50%) + */ + if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) { + mb_growth = MB_GROWTH_NORMAL; + } else { + mb_growth = MB_GROWTH_AGGRESSIVE; + } + + if (percent_kmem < 5) { + /* For initial allocations */ + i = num; + } else { + /* Return if >= MBIGCL_LOWAT clusters available */ + if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT && + m_total(MC_BIGCL) >= + MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) { + return 0; + } + + /* Ensure at least num clusters are 
accessible */ + if (num >= m_infree(MC_BIGCL)) { + i = num - m_infree(MC_BIGCL); + } + if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) { + j = num - (m_total(MC_BIGCL) - + m_minlimit(MC_BIGCL)); + } + + i = MAX(i, j); + + /* + * Grow pool if percent_pool > 75 (normal growth) + * or percent_pool > 50 (aggressive growth). + */ + mb_growth_thresh = 100 - (100 / (1 << mb_growth)); + if (percent_pool > mb_growth_thresh) { + j = ((sumclusters + num) >> mb_growth) - + freeclusters; + } + i = MAX(i, j); + } + + /* Check to ensure we didn't go over limits */ + if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) { + i = m_maxlimit(MC_BIGCL) - m_bigclusters; + } + if ((i << 1) + sumclusters >= nclusters) { + i = (nclusters - sumclusters) >> 1; + } + VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); + VERIFY(sumclusters + (i << 1) <= nclusters); + } else { /* 16K CL */ + /* Ensure at least num clusters are available */ + if (num >= m_16kclfree) { + i = num - m_16kclfree; + } + + /* Always grow 16KCL pool aggressively */ + if (((m_16kclusters + num) >> 1) > m_16kclfree) { + j = ((m_16kclusters + num) >> 1) - m_16kclfree; + } + i = MAX(i, j); + + /* Check to ensure we don't go over limit */ + if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) { + i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL); + } + } + return i; +} + +uint64_t +mcl_to_paddr(char *addr) +{ + vm_offset_t base_phys; + + if (!MBUF_IN_MAP(addr)) { + return 0; + } + base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)]; + + if (base_phys == 0) { + return 0; + } + return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)); +} + +/* + * Inform the corresponding mcache(s) that there's a waiter below. + */ +static void +mbuf_waiter_inc(mbuf_class_t class, boolean_t comp) +{ + mcache_waiter_inc(m_cache(class)); + if (comp) { + if (class == MC_CL) { + mcache_waiter_inc(m_cache(MC_MBUF_CL)); + } else if (class == MC_BIGCL) { + mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); + } else if (class == MC_16KCL) { + mcache_waiter_inc(m_cache(MC_MBUF_16KCL)); + } else { + mcache_waiter_inc(m_cache(MC_MBUF_CL)); + mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); + } + } +} + +/* + * Inform the corresponding mcache(s) that there's no more waiter below. + */ +static void +mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) +{ + mcache_waiter_dec(m_cache(class)); + if (comp) { + if (class == MC_CL) { + mcache_waiter_dec(m_cache(MC_MBUF_CL)); + } else if (class == MC_BIGCL) { + mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); + } else if (class == MC_16KCL) { + mcache_waiter_dec(m_cache(MC_MBUF_16KCL)); + } else { + mcache_waiter_dec(m_cache(MC_MBUF_CL)); + mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); + } + } +} + +static bool mbuf_watchdog_defunct_active = false; + +struct mbuf_watchdog_defunct_args { + struct proc *top_app; + uint32_t top_app_space_used; + bool non_blocking; +}; + +extern const char *proc_name_address(void *p); + +static void +mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1) +{ +#pragma unused(arg0, arg1) + struct mbuf_watchdog_defunct_args args = {}; + struct fileproc *fp = NULL; + + args.non_blocking = false; + proc_iterate(PROC_ALLPROCLIST, + mbuf_watchdog_defunct_iterate, &args, NULL, NULL); + + /* + * Defunct all sockets from this app. + */ + if (args.top_app != NULL) { + /* Restart the watchdog count. 
*/ + lck_mtx_lock(mbuf_mlock); + microuptime(&mb_wdtstart); + lck_mtx_unlock(mbuf_mlock); + os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d", + __func__, + proc_name_address(args.top_app), + proc_pid(args.top_app)); + proc_fdlock(args.top_app); + fdt_foreach(fp, args.top_app) { + struct fileglob *fg = fp->fp_glob; + struct socket *so = NULL; + + if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) { + continue; + } + so = (struct socket *)fp_get_data(fp); + if (!socket_try_lock(so)) { + continue; + } + if (sosetdefunct(args.top_app, so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, + TRUE) == 0) { + sodefunct(args.top_app, so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); + } + socket_unlock(so, 0); + } + proc_fdunlock(args.top_app); + proc_rele(args.top_app); + mbstat.m_forcedefunct++; + } + mbuf_watchdog_defunct_active = false; +} + +/* + * Called during slab (blocking and non-blocking) allocation. If there + * is at least one waiter, and the time since the first waiter is blocked + * is greater than the watchdog timeout, panic the system. + */ +static void +mbuf_watchdog(void) +{ + struct timeval now; + unsigned int since; + static thread_call_t defunct_tcall = NULL; + + if (mb_waiters == 0 || !mb_watchdog) { + return; + } + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + microuptime(&now); + since = now.tv_sec - mb_wdtstart.tv_sec; + + if (mbuf_watchdog_defunct_active) { + /* + * Don't panic the system while we are trying + * to find sockets to defunct. + */ + return; + } + if (since >= MB_WDT_MAXTIME) { + panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__, + mb_waiters, since, mbuf_dump()); + /* NOTREACHED */ + } + /* + * Check if we are about to panic the system due + * to lack of mbufs and start defuncting sockets + * from processes that use too many sockets. + * + * We're always called with the mbuf_mlock held, + * so that also protects mbuf_watchdog_defunct_active. + */ + if (since >= MB_WDT_MAXTIME / 2) { + /* + * Start a thread to defunct sockets + * from apps that are over-using their socket + * buffers. + */ + if (defunct_tcall == NULL) { + defunct_tcall = + thread_call_allocate_with_options(mbuf_watchdog_defunct, + NULL, + THREAD_CALL_PRIORITY_KERNEL, + THREAD_CALL_OPTIONS_ONCE); + } + if (defunct_tcall != NULL) { + mbuf_watchdog_defunct_active = true; + thread_call_enter(defunct_tcall); + } + } +} + +/* + * Called during blocking allocation. Returns TRUE if one or more objects + * are available at the per-CPU caches layer and that allocation should be + * retried at that level. + */ +static boolean_t +mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) +{ + boolean_t mcache_retry = FALSE; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + /* Check if there's anything at the cache layer */ + if (mbuf_cached_above(class, wait)) { + mcache_retry = TRUE; + goto done; + } + + /* Nothing? Then try hard to get it from somewhere */ + m_reclaim(class, num, (wait & MCR_COMP)); + + /* We tried hard and got something? */ + if (m_infree(class) > 0) { + mbstat.m_wait++; + goto done; + } else if (mbuf_cached_above(class, wait)) { + mbstat.m_wait++; + mcache_retry = TRUE; + goto done; + } else if (wait & MCR_TRYHARD) { + mcache_retry = TRUE; + goto done; + } + + /* + * There's really nothing for us right now; inform the + * cache(s) that there is a waiter below and go to sleep. + */ + mbuf_waiter_inc(class, (wait & MCR_COMP)); + + VERIFY(!(wait & MCR_NOSLEEP)); + + /* + * If this is the first waiter, arm the watchdog timer. 
Otherwise + * check if we need to panic the system due to watchdog timeout. + */ + if (mb_waiters == 0) { + microuptime(&mb_wdtstart); + } else { + mbuf_watchdog(); + } + + mb_waiters++; + m_region_expand(class) += m_total(class) + num; + /* wake up the worker thread */ + if (mbuf_worker_ready && + mbuf_worker_needs_wakeup) { + wakeup((caddr_t)&mbuf_worker_needs_wakeup); + mbuf_worker_needs_wakeup = FALSE; + } + mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class)); + (void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL); + mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class)); + + /* We are now up; stop getting notified until next round */ + mbuf_waiter_dec(class, (wait & MCR_COMP)); + + /* We waited and got something */ + if (m_infree(class) > 0) { + mbstat.m_wait++; + goto done; + } else if (mbuf_cached_above(class, wait)) { + mbstat.m_wait++; + mcache_retry = TRUE; + } +done: + return mcache_retry; +} + +__attribute__((noreturn)) +static void +mbuf_worker_thread(void) +{ + int mbuf_expand; + + while (1) { + lck_mtx_lock(mbuf_mlock); + mbwdog_logger("worker thread running"); + mbuf_worker_run_cnt++; + mbuf_expand = 0; + /* + * Allocations are based on page size, so if we have depleted + * the reserved spaces, try to free mbufs from the major classes. + */ +#if PAGE_SIZE == 4096 + uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; + uint32_t m_clusters = m_total(MC_CL); + uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; + uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters; + if (sumclusters >= nclusters) { + mbwdog_logger("reclaiming bigcl"); + mbuf_drain_locked(TRUE); + m_reclaim(MC_BIGCL, 4, FALSE); + } +#else + uint32_t m_16kclusters = m_total(MC_16KCL); + if ((m_16kclusters << NCLPJCLSHIFT) >= njcl) { + mbwdog_logger("reclaiming 16kcl"); + mbuf_drain_locked(TRUE); + m_reclaim(MC_16KCL, 4, FALSE); + } +#endif + if (m_region_expand(MC_CL) > 0) { + int n; + mb_expand_cl_cnt++; + /* Adjust to current number of cluster in use */ + n = m_region_expand(MC_CL) - + (m_total(MC_CL) - m_infree(MC_CL)); + if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) { + n = m_maxlimit(MC_CL) - m_total(MC_CL); + } + if (n > 0) { + mb_expand_cl_total += n; + } + m_region_expand(MC_CL) = 0; + + if (n > 0) { + mbwdog_logger("expanding MC_CL by %d", n); + freelist_populate(MC_CL, n, M_WAIT); + } + } + if (m_region_expand(MC_BIGCL) > 0) { + int n; + mb_expand_bigcl_cnt++; + /* Adjust to current number of 4 KB cluster in use */ + n = m_region_expand(MC_BIGCL) - + (m_total(MC_BIGCL) - m_infree(MC_BIGCL)); + if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) { + n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL); + } + if (n > 0) { + mb_expand_bigcl_total += n; + } + m_region_expand(MC_BIGCL) = 0; + + if (n > 0) { + mbwdog_logger("expanding MC_BIGCL by %d", n); + freelist_populate(MC_BIGCL, n, M_WAIT); + } + } + if (m_region_expand(MC_16KCL) > 0) { + int n; + mb_expand_16kcl_cnt++; + /* Adjust to current number of 16 KB cluster in use */ + n = m_region_expand(MC_16KCL) - + (m_total(MC_16KCL) - m_infree(MC_16KCL)); + if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) { + n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL); + } + if (n > 0) { + mb_expand_16kcl_total += n; + } + m_region_expand(MC_16KCL) = 0; + + if (n > 0) { + mbwdog_logger("expanding MC_16KCL by %d", n); + (void) freelist_populate(MC_16KCL, n, M_WAIT); + } + } + + /* + * Because we can run out of memory before filling the mbuf + * map, we should not allocate more clusters than they are 
+ * mbufs -- otherwise we could have a large number of useless + * clusters allocated. + */ + mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d", + m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL), + m_total(MC_16KCL)); + uint32_t total_mbufs = m_total(MC_MBUF); + uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) + + m_total(MC_16KCL); + if (total_mbufs < total_clusters) { + mbwdog_logger("expanding MC_MBUF by %d", + total_clusters - total_mbufs); + } + while (total_mbufs < total_clusters) { + mb_expand_cnt++; + if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) { + break; + } + total_mbufs = m_total(MC_MBUF); + total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) + + m_total(MC_16KCL); + } + + mbuf_worker_needs_wakeup = TRUE; + /* + * If there's a deadlock and we're not sending / receiving + * packets, net_uptime() won't be updated. Update it here + * so we are sure it's correct. + */ + net_update_uptime(); + mbuf_worker_last_runtime = net_uptime(); + assert_wait((caddr_t)&mbuf_worker_needs_wakeup, + THREAD_UNINT); + mbwdog_logger("worker thread sleeping"); + lck_mtx_unlock(mbuf_mlock); + (void) thread_block((thread_continue_t)mbuf_worker_thread); + } +} + +__attribute__((noreturn)) +static void +mbuf_worker_thread_init(void) +{ + mbuf_worker_ready++; + mbuf_worker_thread(); +} + +static mcl_slab_t * +slab_get(void *buf) +{ + mcl_slabg_t *slg; + unsigned int ix, k; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(MBUF_IN_MAP(buf)); + ix = ((unsigned char *)buf - mbutl) >> MBSHIFT; + VERIFY(ix < maxslabgrp); + + if ((slg = slabstbl[ix]) == NULL) { + /* + * In the current implementation, we never shrink the slabs + * table; if we attempt to reallocate a cluster group when + * it's already allocated, panic since this is a sign of a + * memory corruption (slabstbl[ix] got nullified). + */ + ++slabgrp; + VERIFY(ix < slabgrp); + /* + * Slabs expansion can only be done single threaded; when + * we get here, it must be as a result of m_clalloc() which + * is serialized and therefore mb_clalloc_busy must be set. + */ + VERIFY(mb_clalloc_busy); + lck_mtx_unlock(mbuf_mlock); + + /* This is a new buffer; create the slabs group for it */ + slg = zalloc_permanent_type(mcl_slabg_t); + slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB, + ZALIGN(mcl_slab_t)); + + lck_mtx_lock(mbuf_mlock); + /* + * No other thread could have gone into m_clalloc() after + * we dropped the lock above, so verify that it's true. 
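+		 *
+		 * In other words, mb_clalloc_busy acts as the
+		 * single-threading token for slab-table growth, which is
+		 * what makes it safe to drop mbuf_mlock around the
+		 * permanent zone allocations above.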
+ */ + VERIFY(mb_clalloc_busy); + + slabstbl[ix] = slg; + + /* Chain each slab in the group to its forward neighbor */ + for (k = 1; k < NSLABSPMB; k++) { + slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k]; + } + VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL); + + /* And chain the last slab in the previous group to this */ + if (ix > 0) { + VERIFY(slabstbl[ix - 1]-> + slg_slab[NSLABSPMB - 1].sl_next == NULL); + slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next = + &slg->slg_slab[0]; + } + } + + ix = MTOPG(buf) % NSLABSPMB; + VERIFY(ix < NSLABSPMB); + + return &slg->slg_slab[ix]; +} + +static void +slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags, + void *base, void *head, unsigned int len, int refcnt, int chunks) +{ + sp->sl_class = class; + sp->sl_flags = flags; + sp->sl_base = base; + sp->sl_head = head; + sp->sl_len = len; + sp->sl_refcnt = refcnt; + sp->sl_chunks = chunks; + slab_detach(sp); +} + +static void +slab_insert(mcl_slab_t *sp, mbuf_class_t class) +{ + VERIFY(slab_is_detached(sp)); + m_slab_cnt(class)++; + TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link); + sp->sl_flags &= ~SLF_DETACHED; + + /* + * If a buffer spans multiple contiguous pages then mark them as + * detached too + */ + if (class == MC_16KCL) { + int k; + for (k = 1; k < NSLABSP16KB; k++) { + sp = sp->sl_next; + /* Next slab must already be present */ + VERIFY(sp != NULL && slab_is_detached(sp)); + sp->sl_flags &= ~SLF_DETACHED; + } + } +} + +static void +slab_remove(mcl_slab_t *sp, mbuf_class_t class) +{ + int k; + VERIFY(!slab_is_detached(sp)); + VERIFY(m_slab_cnt(class) > 0); + m_slab_cnt(class)--; + TAILQ_REMOVE(&m_slablist(class), sp, sl_link); + slab_detach(sp); + if (class == MC_16KCL) { + for (k = 1; k < NSLABSP16KB; k++) { + sp = sp->sl_next; + /* Next slab must already be present */ + VERIFY(sp != NULL); + VERIFY(!slab_is_detached(sp)); + slab_detach(sp); + } + } +} + +static boolean_t +slab_inrange(mcl_slab_t *sp, void *buf) +{ + return (uintptr_t)buf >= (uintptr_t)sp->sl_base && + (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len); +} + +#undef panic + +static void +slab_nextptr_panic(mcl_slab_t *sp, void *addr) +{ + int i; + unsigned int chunk_len = sp->sl_len / sp->sl_chunks; + uintptr_t buf = (uintptr_t)sp->sl_base; + + for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) { + void *next = ((mcache_obj_t *)buf)->obj_next; + if (next != addr) { + continue; + } + if (!mclverify) { + if (next != NULL && !MBUF_IN_MAP(next)) { + mcache_t *cp = m_cache(sp->sl_class); + panic("%s: %s buffer %p in slab %p modified " + "after free at offset 0: %p out of range " + "[%p-%p)\n", __func__, cp->mc_name, + (void *)buf, sp, next, mbutl, embutl); + /* NOTREACHED */ + } + } else { + mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class, + (mcache_obj_t *)buf); + mcl_audit_verify_nextptr(next, mca); + } + } +} + +static void +slab_detach(mcl_slab_t *sp) +{ + sp->sl_link.tqe_next = (mcl_slab_t *)-1; + sp->sl_link.tqe_prev = (mcl_slab_t **)-1; + sp->sl_flags |= SLF_DETACHED; +} + +static boolean_t +slab_is_detached(mcl_slab_t *sp) +{ + return (intptr_t)sp->sl_link.tqe_next == -1 && + (intptr_t)sp->sl_link.tqe_prev == -1 && + (sp->sl_flags & SLF_DETACHED); +} + +static void +mcl_audit_init(void *buf, mcache_audit_t **mca_list, + mcache_obj_t **con_list, size_t con_size, unsigned int num) +{ + mcache_audit_t *mca, *mca_tail; + mcache_obj_t *con = NULL; + boolean_t save_contents = (con_list != NULL); + unsigned int i, ix; + + ASSERT(num <= NMBPG); + ASSERT(con_list == NULL || con_size != 0); + + ix 
= MTOPG(buf); + VERIFY(ix < maxclaudit); + + /* Make sure we haven't been here before */ + for (i = 0; i < num; i++) { + VERIFY(mclaudit[ix].cl_audit[i] == NULL); + } + + mca = mca_tail = *mca_list; + if (save_contents) { + con = *con_list; + } + + for (i = 0; i < num; i++) { + mcache_audit_t *next; + + next = mca->mca_next; + bzero(mca, sizeof(*mca)); + mca->mca_next = next; + mclaudit[ix].cl_audit[i] = mca; + + /* Attach the contents buffer if requested */ + if (save_contents) { + mcl_saved_contents_t *msc = + (mcl_saved_contents_t *)(void *)con; + + VERIFY(msc != NULL); + VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t))); + VERIFY(con_size == sizeof(*msc)); + mca->mca_contents_size = con_size; + mca->mca_contents = msc; + con = con->obj_next; + bzero(mca->mca_contents, mca->mca_contents_size); + } + + mca_tail = mca; + mca = mca->mca_next; + } + + if (save_contents) { + *con_list = con; + } + + *mca_list = mca_tail->mca_next; + mca_tail->mca_next = NULL; +} + +static void +mcl_audit_free(void *buf, unsigned int num) +{ + unsigned int i, ix; + mcache_audit_t *mca, *mca_list; + + ix = MTOPG(buf); + VERIFY(ix < maxclaudit); + + if (mclaudit[ix].cl_audit[0] != NULL) { + mca_list = mclaudit[ix].cl_audit[0]; + for (i = 0; i < num; i++) { + mca = mclaudit[ix].cl_audit[i]; + mclaudit[ix].cl_audit[i] = NULL; + if (mca->mca_contents) { + mcache_free(mcl_audit_con_cache, + mca->mca_contents); + } + } + mcache_free_ext(mcache_audit_cache, + (mcache_obj_t *)mca_list); + } +} + +/* + * Given an address of a buffer (mbuf/2KB/4KB/16KB), return + * the corresponding audit structure for that buffer. + */ +static mcache_audit_t * +mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj) +{ + mcache_audit_t *mca = NULL; + int ix = MTOPG(mobj), m_idx = 0; + unsigned char *page_addr; + + VERIFY(ix < maxclaudit); + VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE))); + + page_addr = PGTOM(ix); + + switch (class) { + case MC_MBUF: + /* + * For the mbuf case, find the index of the page + * used by the mbuf and use that index to locate the + * base address of the page. Then find out the + * mbuf index relative to the page base and use + * it to locate the audit structure. + */ + m_idx = MBPAGEIDX(page_addr, mobj); + VERIFY(m_idx < (int)NMBPG); + mca = mclaudit[ix].cl_audit[m_idx]; + break; + + case MC_CL: + /* + * Same thing as above, but for 2KB clusters in a page. + */ + m_idx = CLPAGEIDX(page_addr, mobj); + VERIFY(m_idx < (int)NCLPG); + mca = mclaudit[ix].cl_audit[m_idx]; + break; + + case MC_BIGCL: + m_idx = BCLPAGEIDX(page_addr, mobj); + VERIFY(m_idx < (int)NBCLPG); + mca = mclaudit[ix].cl_audit[m_idx]; + break; + case MC_16KCL: + /* + * Same as above, but only return the first element. 
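+		 * A 16KB cluster is covered by a single audit structure,
+		 * so the entry at index 0 of the page is returned and no
+		 * per-buffer index needs to be computed.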
+ */ + mca = mclaudit[ix].cl_audit[0]; + break; + + default: + VERIFY(0); + /* NOTREACHED */ + } + + return mca; +} + +static void +mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite, + boolean_t alloc) +{ + struct mbuf *m = addr; + mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next; + + VERIFY(mca->mca_contents != NULL && + mca->mca_contents_size == AUDIT_CONTENTS_SIZE); + + if (mclverify) { + mcl_audit_verify_nextptr(next, mca); + } + + if (!alloc) { + /* Save constructed mbuf fields */ + mcl_audit_save_mbuf(m, mca); + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, m, + m_maxsize(MC_MBUF)); + } + ((mcache_obj_t *)m)->obj_next = next; + return; + } + + /* Check if the buffer has been corrupted while in freelist */ + if (mclverify) { + mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); + } + /* Restore constructed mbuf fields */ + mcl_audit_restore_mbuf(m, mca, composite); +} + +static void +mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite) +{ + struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca); + + if (composite) { + struct mbuf *next = m->m_next; + VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL && + MBUF_IS_COMPOSITE(ms)); + VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE); + /* + * We could have hand-picked the mbuf fields and restore + * them individually, but that will be a maintenance + * headache. Instead, restore everything that was saved; + * the mbuf layer will recheck and reinitialize anyway. + */ + bcopy(ms, m, MCA_SAVED_MBUF_SIZE); + m->m_next = next; + } else { + /* + * For a regular mbuf (no cluster attached) there's nothing + * to restore other than the type field, which is expected + * to be MT_FREE. + */ + m->m_type = ms->m_type; + } + mbuf_mcheck(m); +} + +static void +mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca) +{ + VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE); + mbuf_mcheck(m); + bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE); +} + +static void +mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc, + boolean_t save_next) +{ + mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next; + + if (!alloc) { + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); + } + if (save_next) { + mcl_audit_verify_nextptr(next, mca); + ((mcache_obj_t *)addr)->obj_next = next; + } + } else if (mclverify) { + /* Check if the buffer has been corrupted while in freelist */ + mcl_audit_verify_nextptr(next, mca); + mcache_audit_free_verify_set(mca, addr, 0, size); + } +} + +static void +mcl_audit_scratch(mcache_audit_t *mca) +{ + void *stack[MCACHE_STACK_DEPTH + 1]; + mcl_scratch_audit_t *msa; + struct timeval now; + + VERIFY(mca->mca_contents != NULL); + msa = MCA_SAVED_SCRATCH_PTR(mca); + + msa->msa_pthread = msa->msa_thread; + msa->msa_thread = current_thread(); + bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack)); + msa->msa_pdepth = msa->msa_depth; + bzero(stack, sizeof(stack)); + msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1; + bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack)); + + msa->msa_ptstamp = msa->msa_tstamp; + microuptime(&now); + /* tstamp is in ms relative to base_ts */ + msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000); + if ((now.tv_sec - mb_start.tv_sec) > 0) { + msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000); + } +} + +__abortlike +static void +mcl_audit_mcheck_panic(struct mbuf *m) +{ + char buf[DUMP_MCA_BUF_SIZE]; + mcache_audit_t *mca; + + MRANGE(m); + mca = 
mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); + + panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s", + m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca)); + /* NOTREACHED */ +} + +__abortlike +static void +mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca) +{ + char buf[DUMP_MCA_BUF_SIZE]; + panic("mcl_audit: buffer %p modified after free at offset 0: " + "%p out of range [%p-%p)\n%s\n", + mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca)); + /* NOTREACHED */ +} + +static void +mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) +{ + if (next != NULL && !MBUF_IN_MAP(next) && + (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) { + mcl_audit_verify_nextptr_panic(next, mca); + } +} + +static uintptr_t +hash_mix(uintptr_t x) +{ +#ifndef __LP64__ + x += ~(x << 15); + x ^= (x >> 10); + x += (x << 3); + x ^= (x >> 6); + x += ~(x << 11); + x ^= (x >> 16); +#else + x += ~(x << 32); + x ^= (x >> 22); + x += ~(x << 13); + x ^= (x >> 8); + x += (x << 3); + x ^= (x >> 15); + x += ~(x << 27); + x ^= (x >> 31); +#endif + return x; +} + +static uint32_t +hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) +{ + uintptr_t hash = 0; + uintptr_t mask = max_size - 1; + + while (depth) { + hash += bt[--depth]; + } + + hash = hash_mix(hash) & mask; + + assert(hash < max_size); + + return (uint32_t) hash; +} + +static uint32_t +hashaddr(uintptr_t pt, uint32_t max_size) +{ + uintptr_t hash = 0; + uintptr_t mask = max_size - 1; + + hash = hash_mix(pt) & mask; + + assert(hash < max_size); + + return (uint32_t) hash; +} + +/* This function turns on mbuf leak detection */ +static void +mleak_activate(void) +{ + mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR; + PE_parse_boot_argn("mleak_sample_factor", + &mleak_table.mleak_sample_factor, + sizeof(mleak_table.mleak_sample_factor)); + + if (mleak_table.mleak_sample_factor == 0) { + mclfindleak = 0; + } + + if (mclfindleak == 0) { + return; + } + + vm_size_t alloc_size = + mleak_alloc_buckets * sizeof(struct mallocation); + vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace); + + mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation)); + mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace)); + mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES), + ZALIGN(mleak_stat_t)); + + mleak_stat->ml_cnt = MLEAK_NUM_TRACES; +#ifdef __LP64__ + mleak_stat->ml_isaddr64 = 1; +#endif /* __LP64__ */ +} + +static void +mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc) +{ + int temp; + + if (mclfindleak == 0) { + return; + } + + if (!alloc) { + return mleak_free(addr); + } + + temp = os_atomic_inc_orig(&mleak_table.mleak_capture, relaxed); + + if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) { + uintptr_t bt[MLEAK_STACK_DEPTH]; + unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL); + mleak_log(bt, addr, logged, num); + } +} + +/* + * This function records the allocation in the mleak_allocations table + * and the backtrace in the mleak_traces table; if allocation slot is in use, + * replace old allocation with new one if the trace slot is in use, return + * (or increment refcount if same trace). 
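+ *
+ * Roughly: the allocation address hashes into mleak_allocations[] (one
+ * slot per sampled outstanding allocation) and the captured backtrace
+ * hashes into mleak_traces[] (one slot per unique call path, refcounted
+ * via its allocs field).  Hash collisions are simply counted: a new
+ * allocation replaces the old occupant of its slot, while a colliding
+ * but different trace is dropped.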
+ */ +static boolean_t +mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num) +{ + struct mallocation *allocation; + struct mtrace *trace; + uint32_t trace_index; + + /* Quit if someone else modifying the tables */ + if (!lck_mtx_try_lock_spin(mleak_lock)) { + mleak_table.total_conflicts++; + return FALSE; + } + + allocation = &mleak_allocations[hashaddr((uintptr_t)addr, + mleak_alloc_buckets)]; + trace_index = hashbacktrace(bt, depth, mleak_trace_buckets); + trace = &mleak_traces[trace_index]; + + VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]); + VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]); + + allocation->hitcount++; + trace->hitcount++; + + /* + * If the allocation bucket we want is occupied + * and the occupier has the same trace, just bail. + */ + if (allocation->element != NULL && + trace_index == allocation->trace_index) { + mleak_table.alloc_collisions++; + lck_mtx_unlock(mleak_lock); + return TRUE; + } + + /* + * Store the backtrace in the traces array; + * Size of zero = trace bucket is free. + */ + if (trace->allocs > 0 && + bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) { + /* Different, unique trace, but the same hash! Bail out. */ + trace->collisions++; + mleak_table.trace_collisions++; + lck_mtx_unlock(mleak_lock); + return TRUE; + } else if (trace->allocs > 0) { + /* Same trace, already added, so increment refcount */ + trace->allocs++; + } else { + /* Found an unused trace bucket, so record the trace here */ + if (trace->depth != 0) { + /* this slot previously used but not currently in use */ + mleak_table.trace_overwrites++; + } + mleak_table.trace_recorded++; + trace->allocs = 1; + memcpy(trace->addr, bt, (depth * sizeof(uintptr_t))); + trace->depth = depth; + trace->collisions = 0; + } + + /* Step 2: Store the allocation record in the allocations array */ + if (allocation->element != NULL) { + /* + * Replace an existing allocation. No need to preserve + * because only a subset of the allocations are being + * recorded anyway. 
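+		 * (Only roughly one in mleak_table.mleak_sample_factor
+		 * allocations is logged by mleak_logger() in the first
+		 * place, so individual records are expendable.)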
+ */ + mleak_table.alloc_collisions++; + } else if (allocation->trace_index != 0) { + mleak_table.alloc_overwrites++; + } + allocation->element = addr; + allocation->trace_index = trace_index; + allocation->count = num; + mleak_table.alloc_recorded++; + mleak_table.outstanding_allocs++; + + lck_mtx_unlock(mleak_lock); + return TRUE; +} + +static void +mleak_free(mcache_obj_t *addr) +{ + while (addr != NULL) { + struct mallocation *allocation = &mleak_allocations + [hashaddr((uintptr_t)addr, mleak_alloc_buckets)]; + + if (allocation->element == addr && + allocation->trace_index < mleak_trace_buckets) { + lck_mtx_lock_spin(mleak_lock); + if (allocation->element == addr && + allocation->trace_index < mleak_trace_buckets) { + struct mtrace *trace; + trace = &mleak_traces[allocation->trace_index]; + /* allocs = 0 means trace bucket is unused */ + if (trace->allocs > 0) { + trace->allocs--; + } + if (trace->allocs == 0) { + trace->depth = 0; + } + /* NULL element means alloc bucket is unused */ + allocation->element = NULL; + mleak_table.outstanding_allocs--; + } + lck_mtx_unlock(mleak_lock); + } + addr = addr->obj_next; + } +} + +static void +mleak_sort_traces() +{ + int i, j, k; + struct mtrace *swap; + + for (i = 0; i < MLEAK_NUM_TRACES; i++) { + mleak_top_trace[i] = NULL; + } + + for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) { + if (mleak_traces[i].allocs <= 0) { + continue; + } + + mleak_top_trace[j] = &mleak_traces[i]; + for (k = j; k > 0; k--) { + if (mleak_top_trace[k]->allocs <= + mleak_top_trace[k - 1]->allocs) { + break; + } + + swap = mleak_top_trace[k - 1]; + mleak_top_trace[k - 1] = mleak_top_trace[k]; + mleak_top_trace[k] = swap; + } + j++; + } + + j--; + for (; i < mleak_trace_buckets; i++) { + if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) { + continue; + } + + mleak_top_trace[j] = &mleak_traces[i]; + + for (k = j; k > 0; k--) { + if (mleak_top_trace[k]->allocs <= + mleak_top_trace[k - 1]->allocs) { + break; + } + + swap = mleak_top_trace[k - 1]; + mleak_top_trace[k - 1] = mleak_top_trace[k]; + mleak_top_trace[k] = swap; + } + } +} + +static void +mleak_update_stats() +{ + mleak_trace_stat_t *mltr; + int i; + + VERIFY(mleak_stat != NULL); +#ifdef __LP64__ + VERIFY(mleak_stat->ml_isaddr64); +#else + VERIFY(!mleak_stat->ml_isaddr64); +#endif /* !__LP64__ */ + VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES); + + mleak_sort_traces(); + + mltr = &mleak_stat->ml_trace[0]; + bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES); + for (i = 0; i < MLEAK_NUM_TRACES; i++) { + int j; + + if (mleak_top_trace[i] == NULL || + mleak_top_trace[i]->allocs == 0) { + continue; + } + + mltr->mltr_collisions = mleak_top_trace[i]->collisions; + mltr->mltr_hitcount = mleak_top_trace[i]->hitcount; + mltr->mltr_allocs = mleak_top_trace[i]->allocs; + mltr->mltr_depth = mleak_top_trace[i]->depth; + + VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH); + for (j = 0; j < mltr->mltr_depth; j++) { + mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j]; + } + + mltr++; + } +} + +static struct mbtypes { + int mt_type; + const char *mt_name; +} mbtypes[] = { + { MT_DATA, "data" }, + { MT_OOBDATA, "oob data" }, + { MT_CONTROL, "ancillary data" }, + { MT_HEADER, "packet headers" }, + { MT_SOCKET, "socket structures" }, + { MT_PCB, "protocol control blocks" }, + { MT_RTABLE, "routing table entries" }, + { MT_HTABLE, "IMP host table entries" }, + { MT_ATABLE, "address resolution tables" }, + { MT_FTABLE, "fragment reassembly queue headers" }, + { MT_SONAME, "socket names and addresses" }, + { 
MT_SOOPTS, "socket options" }, + { MT_RIGHTS, "access rights" }, + { MT_IFADDR, "interface addresses" }, + { MT_TAG, "packet tags" }, + { 0, NULL } +}; + +#define MBUF_DUMP_BUF_CHK() { \ + clen -= k; \ + if (clen < 1) \ + goto done; \ + c += k; \ +} + +static char * +mbuf_dump(void) +{ + unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct, + totreturned = 0; + u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0; + u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0; + u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0; + int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short); + uint8_t seen[256]; + struct mbtypes *mp; + mb_class_stat_t *sp; + mleak_trace_stat_t *mltr; + char *c = mbuf_dump_buf; + int i, j, k, clen = MBUF_DUMP_BUF_SIZE; + struct mbuf_watchdog_defunct_args args = {}; + + mbuf_dump_buf[0] = '\0'; + + /* synchronize all statistics in the mbuf table */ + mbuf_stat_sync(); + mbuf_mtypes_sync(); + + sp = &mb_stat->mbs_class[0]; + for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) { + u_int32_t mem; + + if (m_class(i) == MC_MBUF) { + m_mbufs = sp->mbcl_active; + } else if (m_class(i) == MC_CL) { + m_clfree = sp->mbcl_total - sp->mbcl_active; + } else if (m_class(i) == MC_BIGCL) { + m_bigclfree = sp->mbcl_total - sp->mbcl_active; + } else if (m_class(i) == MC_16KCL) { + m_16kclfree = sp->mbcl_total - sp->mbcl_active; + m_16kclusters = sp->mbcl_total; + } else if (m_class(i) == MC_MBUF_CL) { + m_mbufclfree = sp->mbcl_total - sp->mbcl_active; + } else if (m_class(i) == MC_MBUF_BIGCL) { + m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active; + } else if (m_class(i) == MC_MBUF_16KCL) { + m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active; + } + + mem = sp->mbcl_ctotal * sp->mbcl_size; + totmem += mem; + totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) * + sp->mbcl_size; + totreturned += sp->mbcl_release_cnt; + } + + /* adjust free counts to include composite caches */ + m_clfree += m_mbufclfree; + m_bigclfree += m_mbufbigclfree; + m_16kclfree += m_mbuf16kclfree; + + totmbufs = 0; + for (mp = mbtypes; mp->mt_name != NULL; mp++) { + totmbufs += mbstat.m_mtypes[mp->mt_type]; + } + if (totmbufs > m_mbufs) { + totmbufs = m_mbufs; + } + k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs); + MBUF_DUMP_BUF_CHK(); + + bzero(&seen, sizeof(seen)); + for (mp = mbtypes; mp->mt_name != NULL; mp++) { + if (mbstat.m_mtypes[mp->mt_type] != 0) { + seen[mp->mt_type] = 1; + k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n", + mbstat.m_mtypes[mp->mt_type], mp->mt_name); + MBUF_DUMP_BUF_CHK(); + } + } + seen[MT_FREE] = 1; + for (i = 0; i < nmbtypes; i++) { + if (!seen[i] && mbstat.m_mtypes[i] != 0) { + k = scnprintf(c, clen, "\t%u mbufs allocated to " + "\n", mbstat.m_mtypes[i], i); + MBUF_DUMP_BUF_CHK(); + } + } + if ((m_mbufs - totmbufs) > 0) { + k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n", + m_mbufs - totmbufs); + MBUF_DUMP_BUF_CHK(); + } + k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n" + "%u/%u mbuf 4KB clusters in use\n", + (unsigned int)(mbstat.m_clusters - m_clfree), + (unsigned int)mbstat.m_clusters, + (unsigned int)(mbstat.m_bigclusters - m_bigclfree), + (unsigned int)mbstat.m_bigclusters); + MBUF_DUMP_BUF_CHK(); + + k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n", + m_16kclusters - m_16kclfree, m_16kclusters, + njclbytes / 1024); + MBUF_DUMP_BUF_CHK(); + totused = totmem - totfree; + if (totmem == 0) { + totpct = 0; + } else if (totused < (ULONG_MAX / 100)) { + totpct = (totused * 100) / totmem; + } else { + u_long totmem1 
= totmem / 100; + u_long totused1 = totused / 100; + totpct = (totused1 * 100) / totmem1; + } + k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% " + "in use)\n", totmem / 1024, totpct); + MBUF_DUMP_BUF_CHK(); + k = scnprintf(c, clen, "%lu KB returned to the system\n", + totreturned / 1024); + MBUF_DUMP_BUF_CHK(); + + net_update_uptime(); + + k = scnprintf(c, clen, + "worker thread runs: %u, expansions: %llu, cl %llu/%llu, " + "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt, + mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total, + mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt, + mb_expand_16kcl_total); + MBUF_DUMP_BUF_CHK(); + if (mbuf_worker_last_runtime != 0) { + k = scnprintf(c, clen, "worker thread last run time: " + "%llu (%llu seconds ago)\n", + mbuf_worker_last_runtime, + net_uptime() - mbuf_worker_last_runtime); + MBUF_DUMP_BUF_CHK(); + } + if (mbuf_drain_last_runtime != 0) { + k = scnprintf(c, clen, "drain routine last run time: " + "%llu (%llu seconds ago)\n", + mbuf_drain_last_runtime, + net_uptime() - mbuf_drain_last_runtime); + MBUF_DUMP_BUF_CHK(); + } + + /* + * Log where the most mbufs have accumulated: + * - Process socket buffers + * - TCP reassembly queue + * - Interface AQM queue (output) and DLIL input queue + */ + args.non_blocking = true; + proc_iterate(PROC_ALLPROCLIST, + mbuf_watchdog_defunct_iterate, &args, NULL, NULL); + if (args.top_app != NULL) { + k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n", + args.top_app_space_used, + proc_name_address(args.top_app), + proc_pid(args.top_app)); + proc_rele(args.top_app); + } + MBUF_DUMP_BUF_CHK(); + +#if INET + k = dump_tcp_reass_qlen(c, clen); + MBUF_DUMP_BUF_CHK(); +#endif /* INET */ + +#if MPTCP + k = dump_mptcp_reass_qlen(c, clen); + MBUF_DUMP_BUF_CHK(); +#endif /* MPTCP */ + +#if NETWORKING + k = dlil_dump_top_if_qlen(c, clen); + MBUF_DUMP_BUF_CHK(); +#endif /* NETWORKING */ + + /* mbuf leak detection statistics */ + mleak_update_stats(); + + k = scnprintf(c, clen, "\nmbuf leak detection table:\n"); + MBUF_DUMP_BUF_CHK(); + k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n", + mleak_table.mleak_capture / mleak_table.mleak_sample_factor, + mleak_table.mleak_sample_factor); + MBUF_DUMP_BUF_CHK(); + k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n", + mleak_table.outstanding_allocs); + MBUF_DUMP_BUF_CHK(); + k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n", + mleak_table.alloc_recorded, mleak_table.trace_recorded); + MBUF_DUMP_BUF_CHK(); + k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n", + mleak_table.alloc_collisions, mleak_table.trace_collisions); + MBUF_DUMP_BUF_CHK(); + k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n", + mleak_table.alloc_overwrites, mleak_table.trace_overwrites); + MBUF_DUMP_BUF_CHK(); + k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n", + mleak_table.total_conflicts); + MBUF_DUMP_BUF_CHK(); + + k = scnprintf(c, clen, "top %d outstanding traces:\n", + mleak_stat->ml_cnt); + MBUF_DUMP_BUF_CHK(); + for (i = 0; i < mleak_stat->ml_cnt; i++) { + mltr = &mleak_stat->ml_trace[i]; + k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), " + "%llu hit(s), %llu collision(s)\n", (i + 1), + mltr->mltr_allocs, mltr->mltr_hitcount, + mltr->mltr_collisions); + MBUF_DUMP_BUF_CHK(); + } + + if (mleak_stat->ml_isaddr64) { + k = scnprintf(c, clen, MB_LEAK_HDR_64); + } else { + k = scnprintf(c, clen, MB_LEAK_HDR_32); + } + MBUF_DUMP_BUF_CHK(); + + for (i = 0; i < 
MLEAK_STACK_DEPTH; i++) { + k = scnprintf(c, clen, "%2d: ", (i + 1)); + MBUF_DUMP_BUF_CHK(); + for (j = 0; j < mleak_stat->ml_cnt; j++) { + mltr = &mleak_stat->ml_trace[j]; + if (i < mltr->mltr_depth) { + if (mleak_stat->ml_isaddr64) { + k = scnprintf(c, clen, "0x%0llx ", + (uint64_t)VM_KERNEL_UNSLIDE( + mltr->mltr_addr[i])); + } else { + k = scnprintf(c, clen, + "0x%08x ", + (uint32_t)VM_KERNEL_UNSLIDE( + mltr->mltr_addr[i])); + } + } else { + if (mleak_stat->ml_isaddr64) { + k = scnprintf(c, clen, + MB_LEAK_SPACING_64); + } else { + k = scnprintf(c, clen, + MB_LEAK_SPACING_32); + } + } + MBUF_DUMP_BUF_CHK(); + } + k = scnprintf(c, clen, "\n"); + MBUF_DUMP_BUF_CHK(); + } + +done: + return mbuf_dump_buf; +} + +#undef MBUF_DUMP_BUF_CHK + +/* + * This routine is reserved for mbuf_get_driver_scratch(); clients inside + * xnu that intend on utilizing the module-private area should directly + * refer to the pkt_mpriv structure in the pkthdr. They are also expected + * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior + * to handing it off to another module, respectively. + */ +u_int32_t +m_scratch_get(struct mbuf *m, u_int8_t **p) +{ + struct pkthdr *pkt = &m->m_pkthdr; + + VERIFY(m->m_flags & M_PKTHDR); + + /* See comments in */ + if (pkt->pkt_flags & PKTF_PRIV_GUARDED) { + panic_plain("Invalid attempt to access guarded module-private " + "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags); + /* NOTREACHED */ + } + + if (mcltrace) { + mcache_audit_t *mca; + + lck_mtx_lock(mbuf_mlock); + mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); + if (mca->mca_uflags & MB_SCVALID) { + mcl_audit_scratch(mca); + } + lck_mtx_unlock(mbuf_mlock); + } + + *p = (u_int8_t *)&pkt->pkt_mpriv; + return sizeof(pkt->pkt_mpriv); +} + +/* + * Simple routine to avoid taking the lock when we can't run the + * mbuf drain. + */ +static int +mbuf_drain_checks(boolean_t ignore_waiters) +{ + if (mb_drain_maxint == 0) { + return 0; + } + if (!ignore_waiters && mb_waiters != 0) { + return 0; + } + + return 1; +} + +/* + * Called by the VM when there's memory pressure or when we exhausted + * the 4k/16k reserved space. + */ +static void +mbuf_drain_locked(boolean_t ignore_waiters) +{ + mbuf_class_t mc; + mcl_slab_t *sp, *sp_tmp, *nsp; + unsigned int num, k, interval, released = 0; + unsigned long total_mem = 0, use_mem = 0; + boolean_t ret, purge_caches = FALSE; + ppnum_t offset; + mcache_obj_t *obj; + unsigned long per; + static unsigned char scratch[32]; + static ppnum_t scratch_pa = 0; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + if (!mbuf_drain_checks(ignore_waiters)) { + return; + } + if (scratch_pa == 0) { + bzero(scratch, sizeof(scratch)); + scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch); + VERIFY(scratch_pa); + } else if (mclverify) { + /* + * Panic if a driver wrote to our scratch memory. + */ + for (k = 0; k < sizeof(scratch); k++) { + if (scratch[k]) { + panic("suspect DMA to freed address"); + } + } + } + /* + * Don't free memory too often as that could cause excessive + * waiting times for mbufs. Purge caches if we were asked to drain + * in the last 5 minutes. + */ + if (mbuf_drain_last_runtime != 0) { + interval = net_uptime() - mbuf_drain_last_runtime; + if (interval <= mb_drain_maxint) { + return; + } + if (interval <= mb_drain_maxint * 5) { + purge_caches = TRUE; + } + } + mbuf_drain_last_runtime = net_uptime(); + /* + * Don't free any memory if we're using 60% or more. 
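+	 * "In use" is computed from the per-class counters right below:
+	 * per = (use_mem * 100) / total_mem, where use_mem sums
+	 * m_active(mc) * m_maxsize(mc) and total_mem sums
+	 * m_total(mc) * m_maxsize(mc) across all classes; draining only
+	 * proceeds when per < 60.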
+ */ + for (mc = 0; mc < MC_MAX; mc++) { + total_mem += m_total(mc) * m_maxsize(mc); + use_mem += m_active(mc) * m_maxsize(mc); + } + per = (use_mem * 100) / total_mem; + if (per >= 60) { + return; + } + /* + * Purge all the caches. This effectively disables + * caching for a few seconds, but the mbuf worker thread will + * re-enable them again. + */ + if (purge_caches == TRUE) { + for (mc = 0; mc < MC_MAX; mc++) { + if (m_total(mc) < m_avgtotal(mc)) { + continue; + } + lck_mtx_unlock(mbuf_mlock); + ret = mcache_purge_cache(m_cache(mc), FALSE); + lck_mtx_lock(mbuf_mlock); + if (ret == TRUE) { + m_purge_cnt(mc)++; + } + } + } + /* + * Move the objects from the composite class freelist to + * the rudimentary slabs list, but keep at least 10% of the average + * total in the freelist. + */ + for (mc = 0; mc < MC_MAX; mc++) { + while (m_cobjlist(mc) && + m_total(mc) < m_avgtotal(mc) && + m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) { + obj = m_cobjlist(mc); + m_cobjlist(mc) = obj->obj_next; + obj->obj_next = NULL; + num = cslab_free(mc, obj, 1); + VERIFY(num == 1); + m_free_cnt(mc)++; + m_infree(mc)--; + /* cslab_free() handles m_total */ + } + } + /* + * Free the buffers present in the slab list up to 10% of the total + * average per class. + * + * We walk the list backwards in an attempt to reduce fragmentation. + */ + for (mc = MC_MAX - 1; (int)mc >= 0; mc--) { + TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) { + /* + * Process only unused slabs occupying memory. + */ + if (sp->sl_refcnt != 0 || sp->sl_len == 0 || + sp->sl_base == NULL) { + continue; + } + if (m_total(mc) < m_avgtotal(mc) || + m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) { + break; + } + slab_remove(sp, mc); + switch (mc) { + case MC_MBUF: + m_infree(mc) -= NMBPG; + m_total(mc) -= NMBPG; + if (mclaudit != NULL) { + mcl_audit_free(sp->sl_base, NMBPG); + } + break; + case MC_CL: + m_infree(mc) -= NCLPG; + m_total(mc) -= NCLPG; + if (mclaudit != NULL) { + mcl_audit_free(sp->sl_base, NMBPG); + } + break; + case MC_BIGCL: + { + m_infree(mc) -= NBCLPG; + m_total(mc) -= NBCLPG; + if (mclaudit != NULL) { + mcl_audit_free(sp->sl_base, NMBPG); + } + break; + } + case MC_16KCL: + m_infree(mc)--; + m_total(mc)--; + for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { + nsp = nsp->sl_next; + VERIFY(nsp->sl_refcnt == 0 && + nsp->sl_base != NULL && + nsp->sl_len == 0); + slab_init(nsp, 0, 0, NULL, NULL, 0, 0, + 0); + nsp->sl_flags = 0; + } + if (mclaudit != NULL) { + if (sp->sl_len == PAGE_SIZE) { + mcl_audit_free(sp->sl_base, + NMBPG); + } else { + mcl_audit_free(sp->sl_base, 1); + } + } + break; + default: + /* + * The composite classes have their own + * freelist (m_cobjlist), so we only + * process rudimentary classes here. + */ + VERIFY(0); + } + m_release_cnt(mc) += m_size(mc); + released += m_size(mc); + VERIFY(sp->sl_base != NULL && + sp->sl_len >= PAGE_SIZE); + offset = MTOPG(sp->sl_base); + /* + * Make sure the IOMapper points to a valid, but + * bogus, address. This should prevent further DMA + * accesses to freed memory. 
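+			 * The bogus target is the physical page behind the
+			 * static scratch buffer set up at the top of this
+			 * routine; with mclverify enabled, any later write
+			 * to it is caught by the "suspect DMA to freed
+			 * address" check on the next drain.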
+ */ + IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa); + mcl_paddr[offset] = 0; + kmem_free(mb_map, (vm_offset_t)sp->sl_base, + sp->sl_len); + slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0); + sp->sl_flags = 0; + } + } + mbstat.m_drain++; + mbstat.m_bigclusters = m_total(MC_BIGCL); + mbstat.m_clusters = m_total(MC_CL); + mbstat.m_mbufs = m_total(MC_MBUF); + mbuf_stat_sync(); + mbuf_mtypes_sync(); +} + +__private_extern__ void +mbuf_drain(boolean_t ignore_waiters) +{ + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED); + if (!mbuf_drain_checks(ignore_waiters)) { + return; + } + lck_mtx_lock(mbuf_mlock); + mbuf_drain_locked(ignore_waiters); + lck_mtx_unlock(mbuf_mlock); +} + + +static int +m_drain_force_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int val = 0, err; + + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == USER_ADDR_NULL) { + return err; + } + if (val) { + mbuf_drain(TRUE); + } + + return err; +} + +#if DEBUG || DEVELOPMENT +__printflike(3, 4) +static void +_mbwdog_logger(const char *func, const int line, const char *fmt, ...) +{ + va_list ap; + struct timeval now; + char str[384], p[256]; + int len; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + if (mbwdog_logging == NULL) { + /* + * This might block under a mutex, which isn't really great, + * but this happens once, so we'll live. + */ + mbwdog_logging = zalloc_permanent(mbwdog_logging_size, + ZALIGN_NONE); + } + va_start(ap, fmt); + vsnprintf(p, sizeof(p), fmt, ap); + va_end(ap); + microuptime(&now); + len = scnprintf(str, sizeof(str), + "\n%ld.%d (%d/%llx) %s:%d %s", + now.tv_sec, now.tv_usec, + proc_getpid(current_proc()), + (uint64_t)VM_KERNEL_ADDRPERM(current_thread()), + func, line, p); + if (len < 0) { + return; + } + if (mbwdog_logging_used + len > mbwdog_logging_size) { + mbwdog_logging_used = mbwdog_logging_used / 2; + memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used, + mbwdog_logging_size - mbwdog_logging_used); + mbwdog_logging[mbwdog_logging_used] = 0; + } + strlcat(mbwdog_logging, str, mbwdog_logging_size); + mbwdog_logging_used += len; +} + +#endif // DEBUG || DEVELOPMENT + +static void +mtracelarge_register(size_t size) +{ + int i; + struct mtracelarge *trace; + uintptr_t bt[MLEAK_STACK_DEPTH]; + unsigned int depth; + + depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL); + /* Check if this entry is already on the list. 
*/ + for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) { + trace = &mtracelarge_table[i]; + if (trace->size == size && trace->depth == depth && + memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) { + return; + } + } + for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) { + trace = &mtracelarge_table[i]; + if (size > trace->size) { + trace->depth = depth; + memcpy(trace->addr, bt, depth * sizeof(uintptr_t)); + trace->size = size; + break; + } + } +} + +#if DEBUG || DEVELOPMENT + +static int +mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS +{ + char *str; + + ifnet_head_lock_shared(); + lck_mtx_lock(mbuf_mlock); + + str = mbuf_dump(); + + lck_mtx_unlock(mbuf_mlock); + ifnet_head_done(); + + return sysctl_io_string(req, str, 0, 0, NULL); +} + +#endif /* DEBUG || DEVELOPMENT */ + +SYSCTL_DECL(_kern_ipc); +#if DEBUG || DEVELOPMENT +#if SKYWALK +SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor, + CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor, + MC_THRESHOLD_SCALE_DOWN_FACTOR, + "scale down factor for mbuf cache thresholds"); +#endif /* SKYWALK */ +SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump"); +#endif /* DEBUG || DEVELOPMENT */ +SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mleak_table_sysctl, "S,mleak_table", ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor, + CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, + CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog, + CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, + m_drain_force_sysctl, "I", + "Forces the mbuf garbage collection to run"); +SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint, + CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0, + "Minimum time interval between garbage collection"); diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 638b3c10a..d117de42d 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -111,14 +111,17 @@ #include #include #include -#include +#include #include +#include +#include +#include +#include +#include +#include #include #include #include -#include -#include -#include #include #include @@ -147,19 +150,8 @@ /* TODO: this should be in a header file somewhere */ extern char *proc_name_address(void *p); -static u_int32_t so_cache_hw; /* High water mark for socache */ -static u_int32_t so_cache_timeouts; /* number of timeouts */ -static u_int32_t so_cache_max_freed; /* max freed per timeout */ -static u_int32_t cached_sock_count = 0; -STAILQ_HEAD(, socket) so_cache_head; -int max_cached_sock_count = MAX_CACHED_SOCKETS; -static uint64_t so_cache_time; static int socketinit_done; -static struct zone *so_cache_zone; -ZONE_DECLARE(so_cache_zone, struct zone *); - -static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache"); -static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp); +struct mem_acct *socket_memacct; #include @@ -245,8 +237,6 @@ SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED, ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM); so_gen_t so_gencnt; /* generation count for sockets */ 
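Not part of the patch: the drain machinery above is exposed through the kern.ipc.mb_drain_force and kern.ipc.mb_drain_maxint sysctls declared at the end of the mbuf changes. Draining is disabled entirely while mb_drain_maxint is 0; with it set to 60, a forced drain within 60 seconds of the previous run is a no-op, and one arriving between 60 and 300 seconds may also purge the per-class mbuf caches before freeing unused slabs. A minimal userspace sketch (writing the force knob normally requires root):

#include <stdio.h>
#include <sys/sysctl.h>

int
main(void)
{
    int one = 1, maxint = 0;
    size_t len = sizeof(maxint);

    /* Minimum interval (seconds) between drains; 0 disables draining. */
    if (sysctlbyname("kern.ipc.mb_drain_maxint", &maxint, &len, NULL, 0) == 0) {
        printf("mb_drain_maxint = %d\n", maxint);
    }
    /* Any non-zero write runs mbuf_drain(TRUE) in the kernel. */
    if (sysctlbyname("kern.ipc.mb_drain_force", NULL, NULL, &one, sizeof(one)) != 0) {
        perror("kern.ipc.mb_drain_force");
        return 1;
    }
    return 0;
}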
-MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); - #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0) #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2) #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1) @@ -257,8 +247,6 @@ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3) #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8)) -#define MAX_SOOPTGETM_SIZE (128 * MCLBYTES) - int somaxconn = SOMAXCONN; SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, ""); @@ -272,29 +260,6 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, ""); -/* - * Set to enable jumbo clusters (if available) for large writes when - * the socket is marked with SOF_MULTIPAGES; see below. - */ -int sosendjcl = 1; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, - CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, ""); - -/* - * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large - * writes on the socket for all protocols on any network interfaces, - * depending upon sosendjcl above. Be extra careful when setting this - * to 1, because sending down packets that cross physical pages down to - * broken drivers (those that falsely assume that the physical pages - * are contiguous) might lead to system panics or silent data corruption. - * When set to 0, the system will respect SOF_MULTIPAGES, which is set - * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES - * capable. Set this to 1 only for testing/debugging purposes. - */ -int sosendjcl_ignore_capab = 0; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, - CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, ""); - /* * Set this to ignore SOF1_IF_2KCL and use big clusters for large * writes on the socket for all protocols on any network interfaces. 
@@ -342,16 +307,8 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED, extern struct inpcbinfo tcbinfo; -/* TODO: these should be in header file */ -extern int get_inpcb_str_size(void); -extern int get_tcp_str_size(void); - -vm_size_t so_cache_zone_element_size; - static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, user_ssize_t *); -static void cached_sock_alloc(struct socket **, zalloc_flags_t); -static void cached_sock_free(struct socket *); /* * Maximum of extended background idle sockets per process @@ -395,23 +352,23 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED, void socketinit(void) { - _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t)); + static_assert(sizeof(so_gencnt) == sizeof(uint64_t)); VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t))); #ifdef __LP64__ - _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints)); - _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif)); - _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr)); - _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen)); - _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr)); - _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen)); + static_assert(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints)); + static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif)); + static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr)); + static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen)); + static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr)); + static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen)); #else - _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints)); - _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif)); - _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr)); - _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen)); - _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr)); - _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen)); + static_assert(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints)); + static_assert(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif)); + static_assert(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr)); + static_assert(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen)); + static_assert(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr)); + static_assert(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen)); #endif if (socketinit_done) { @@ -426,92 +383,16 @@ socketinit(void) PE_parse_boot_argn("sosend_assert_panic", 
&sosend_assert_panic, sizeof(sosend_assert_panic)); - STAILQ_INIT(&so_cache_head); - - so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4 - + get_inpcb_str_size() + 4 + get_tcp_str_size()); - - so_cache_zone = zone_create("socache zone", so_cache_zone_element_size, - ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM); - bzero(&soextbkidlestat, sizeof(struct soextbkidlestat)); soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC; soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME; soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT; in_pcbinit(); -} -static void -cached_sock_alloc(struct socket **so, zalloc_flags_t how) -{ - caddr_t temp; - uintptr_t offset; - - lck_mtx_lock(&so_cache_mtx); - - if (!STAILQ_EMPTY(&so_cache_head)) { - VERIFY(cached_sock_count > 0); - - *so = STAILQ_FIRST(&so_cache_head); - STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent); - STAILQ_NEXT((*so), so_cache_ent) = NULL; - - cached_sock_count--; - lck_mtx_unlock(&so_cache_mtx); - - temp = (*so)->so_saved_pcb; - bzero(*so, sizeof(struct socket)); - - (*so)->so_saved_pcb = temp; - } else { - lck_mtx_unlock(&so_cache_mtx); - - uint8_t *so_mem = zalloc_flags_buf(so_cache_zone, how | Z_ZERO); -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wcast-align" - *so = (struct socket *)so_mem; - - /* - * Define offsets for extra structures into our - * single block of memory. Align extra structures - * on longword boundaries. - */ - - offset = (uintptr_t)so_mem; - offset += sizeof(struct socket); - offset = ALIGN(offset); - struct inpcb *pcb = (struct inpcb *)(so_mem + (offset - (uintptr_t)so_mem)); -#pragma clang diagnostic pop - (*so)->so_saved_pcb = (caddr_t)pcb; - - offset += get_inpcb_str_size(); - offset = ALIGN(offset); - pcb->inp_saved_ppcb = (caddr_t)(so_mem + (offset - (uintptr_t)so_mem)); - } - - OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1); -} - -static void -cached_sock_free(struct socket *so) -{ - lck_mtx_lock(&so_cache_mtx); - - so_cache_time = net_uptime(); - if (++cached_sock_count > max_cached_sock_count) { - --cached_sock_count; - lck_mtx_unlock(&so_cache_mtx); - zfree(so_cache_zone, so); - } else { - if (so_cache_hw < cached_sock_count) { - so_cache_hw = cached_sock_count; - } - - STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent); - - so->cache_timestamp = so_cache_time; - lck_mtx_unlock(&so_cache_mtx); + socket_memacct = mem_acct_register("SOCKET", 0, 0); + if (socket_memacct == NULL) { + panic("mem_acct_register returned NULL"); } } @@ -561,63 +442,19 @@ so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr, } #endif /* NECP */ -boolean_t -so_cache_timer(void) -{ - struct socket *p; - int n_freed = 0; - boolean_t rc = FALSE; - - lck_mtx_lock(&so_cache_mtx); - so_cache_timeouts++; - so_cache_time = net_uptime(); - - while (!STAILQ_EMPTY(&so_cache_head)) { - VERIFY(cached_sock_count > 0); - p = STAILQ_FIRST(&so_cache_head); - if ((so_cache_time - p->cache_timestamp) < - SO_CACHE_TIME_LIMIT) { - break; - } - - STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent); - --cached_sock_count; - - zfree(so_cache_zone, p); - - if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) { - so_cache_max_freed++; - break; - } - } - - /* Schedule again if there is more to cleanup */ - if (!STAILQ_EMPTY(&so_cache_head)) { - rc = TRUE; - } - - lck_mtx_unlock(&so_cache_mtx); - return rc; -} - /* * Get a socket structure from our zone, and initialize it. - * We don't implement `waitok' yet (see comments in uipc_domain.c). 
+ * * Note that it would probably be better to allocate socket * and PCB at the same time, but I'm not convinced that all * the protocols can be easily modified to do this. */ struct socket * -soalloc(int waitok, int dom, int type) +soalloc(void) { - zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT; struct socket *__single so; - if ((dom == PF_INET) && (type == SOCK_STREAM)) { - cached_sock_alloc(&so, how); - } else { - so = zalloc_flags(socket_zone, how | Z_ZERO); - } + so = zalloc_flags(socket_zone, Z_WAITOK_ZERO); if (so != NULL) { so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt); @@ -662,7 +499,10 @@ socreate_internal(int dom, struct socket **aso, int type, int proto, if (prp->pr_type != type) { return EPROTOTYPE; } - so = soalloc(1, dom, type); + if (proto_memacct_hardlimit(prp)) { + return ENOBUFS; + } + so = soalloc(); if (so == NULL) { return ENOBUFS; } @@ -754,6 +594,8 @@ socreate_internal(int dom, struct socket **aso, int type, int proto, so->next_lock_lr = 0; so->next_unlock_lr = 0; + proto_memacct_add(so->so_proto, sizeof(struct socket)); + /* * Attachment will create the per pcb lock if necessary and * increase refcount for creation, make sure it's done before @@ -952,6 +794,8 @@ out: void sodealloc(struct socket *so) { + proto_memacct_sub(so->so_proto, sizeof(struct socket)); + kauth_cred_unref(&so->so_cred); /* Remove any filters */ @@ -959,11 +803,7 @@ sodealloc(struct socket *so) so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt); - if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) { - cached_sock_free(so); - } else { - zfree(socket_zone, so); - } + zfree(socket_zone, so); } /* @@ -1695,6 +1535,9 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) * This allows user to disconnect by connecting to, e.g., * a null address. */ +#if NECP + bool set_domain_from_tracker_lookup = false; +#endif /* NECP */ if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnectlocked(so)))) { @@ -1712,6 +1555,9 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) { so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN; } +#if NECP + set_domain_from_tracker_lookup = (metadata.domain[0] != 0); +#endif /* NECP */ necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain), __unsafe_null_terminated_from_indexable(metadata.domain_owner)); @@ -1721,6 +1567,12 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) #if NECP /* Update NECP evaluation after setting any domain via the tracker checks */ so_update_necp_policy(so, NULL, nam); + if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) { + // Mark extended timeout on tracker lookup to ensure that the entry stays around + tracker_metadata_t update_metadata = { }; + update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT; + (void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, &update_metadata); + } #endif /* NECP */ /* @@ -1817,6 +1669,9 @@ soconnectxlocked(struct socket *so, struct sockaddr *src, * try to disconnect first. This allows user to disconnect * by connecting to, e.g., a null address. 
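As an aside (not in the patch): taken together, the socket-layer changes above replace the old cached-socket path with protocol memory accounting. socketinit() registers the "SOCKET" accounting domain, socreate_internal() refuses new sockets at the hard limit and charges sizeof(struct socket) for each one it creates, and sodealloc() releases that charge. A sketch of that discipline for a made-up object type (only the proto_memacct_*() calls are taken from this diff; every other name here is illustrative):

/* Illustration only: the allocate/charge and free/release pairing. */
struct example_pcb {
    struct protosw *ep_proto;
    /* ... protocol-specific state ... */
};

static struct example_pcb *
example_alloc(struct protosw *prp)
{
    struct example_pcb *ep;

    if (proto_memacct_hardlimit(prp)) {
        return NULL;                            /* refuse at the hard limit */
    }
    ep = kalloc_type(struct example_pcb, Z_WAITOK | Z_ZERO);
    if (ep != NULL) {
        ep->ep_proto = prp;
        proto_memacct_add(prp, sizeof(*ep));    /* charge the protocol */
    }
    return ep;
}

static void
example_free(struct example_pcb *ep)
{
    proto_memacct_sub(ep->ep_proto, sizeof(*ep));   /* release the charge */
    kfree_type(struct example_pcb, ep);
}

Later in this file the same accounting is applied to data: sosendcheck() returns ENOMEM when proto_memacct_limited() reports MEMACCT_HARDLIMIT, or MEMACCT_SOFTLIMIT while bytes are already queued in the send buffer.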
*/ +#if NECP + bool set_domain_from_tracker_lookup = false; +#endif /* NECP */ if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) && !(so->so_proto->pr_flags & PR_MULTICONN) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || @@ -1836,6 +1691,9 @@ soconnectxlocked(struct socket *so, struct sockaddr *src, if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) { so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN; } +#if NECP + set_domain_from_tracker_lookup = (metadata.domain[0] != 0); +#endif /* NECP */ necp_set_socket_domain_attributes(so, __unsafe_null_terminated_from_indexable(metadata.domain), __unsafe_null_terminated_from_indexable(metadata.domain_owner)); } @@ -1895,6 +1753,15 @@ soconnectxlocked(struct socket *so, struct sockaddr *src, so->so_flags1 &= ~SOF1_PRECONNECT_DATA; } } + +#if NECP + if (set_domain_from_tracker_lookup && (so->so_flags1 & SOF1_DOMAIN_MATCHED_POLICY)) { + // Mark extended timeout on tracker lookup to ensure that the entry stays around + tracker_metadata_t update_metadata = { }; + update_metadata.flags = SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT; + (void)tracker_lookup(so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, &update_metadata); + } +#endif /* NECP */ } } @@ -1987,9 +1854,10 @@ int sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid, int32_t clen, int32_t atomic, int flags, int *sblocked) { - int error = 0; + int assumelock = 0; + int error = 0; int32_t space; - int assumelock = 0; + int ret; restart: if (*sblocked == 0) { @@ -2104,6 +1972,12 @@ defunct: } goto restart; } + + ret = proto_memacct_limited(so->so_proto); + if (ret == MEMACCT_HARDLIMIT || + (ret == MEMACCT_SOFTLIMIT && so->so_snd.sb_cc > 0)) { + return ENOMEM; + } return 0; } @@ -2313,9 +2187,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, * a jumbo cluster pool and if the socket is * marked accordingly. */ - jumbocl = sosendjcl && njcl > 0 && - ((so->so_flags & SOF_MULTIPAGES) || - sosendjcl_ignore_capab) && + jumbocl = (so->so_flags & SOF_MULTIPAGES) != 0 && bigcl; socket_unlock(so, 0); @@ -4176,12 +4048,12 @@ restart: if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) == (SS_NOFDREF | SS_CANTRCVMORE)) { error = 0; - goto out; + goto release; } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) { - goto out; + goto release; } sblocked = 1; @@ -4379,7 +4251,6 @@ release: socket_unlock(so, 1); } -out: *pktcntp = npkts; /* * Amortize the cost of freeing the mbufs @@ -5810,7 +5681,48 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) so->so_flags1 |= SOF1_DOMAIN_INFO_SILENT; } break; + case SO_MAX_PACING_RATE: { + uint64_t pacingrate; + if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { + error = EINVAL; + goto out; + } + + error = sooptcopyin(sopt, &pacingrate, + sizeof(pacingrate), sizeof(pacingrate)); + if (error != 0) { + goto out; + } + + if (pacingrate == 0) { + error = EINVAL; + goto out; + } + sotoinpcb(so)->inp_max_pacing_rate = pacingrate; + break; + } + case SO_CONNECTION_IDLE: { + int is_idle; + + if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { + error = EINVAL; + goto out; + } + + error = sooptcopyin(sopt, &is_idle, + sizeof(is_idle), sizeof(is_idle)); + if (error != 0) { + goto out; + } + + if (is_idle != 0) { + sotoinpcb(so)->inp_flags2 |= INP2_CONNECTION_IDLE; + } else { + sotoinpcb(so)->inp_flags2 &= ~INP2_CONNECTION_IDLE; + } + break; + } default: error = ENOPROTOOPT; break; @@ -6326,6 +6238,28 @@ integer: optval = ((so->so_flags1 & SOF1_DOMAIN_INFO_SILENT) > 0) ? 
1 : 0; goto integer; + case SO_MAX_PACING_RATE: { + uint64_t pacingrate; + + if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { + error = EINVAL; + goto out; + } + + pacingrate = sotoinpcb(so)->inp_max_pacing_rate; + + error = sooptcopyout(sopt, &pacingrate, sizeof(pacingrate)); + break; + } + case SO_CONNECTION_IDLE: { + if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { + error = EINVAL; + goto out; + } + optval = sotoinpcb(so)->inp_flags2 & INP2_CONNECTION_IDLE ? + 1 : 0; + goto integer; + } default: error = ENOPROTOOPT; break; @@ -8228,6 +8162,66 @@ socket_post_kev_msg_closed(struct socket *so) free_sockaddr(peersa); } +void +sock_parse_cm_info(struct mbuf *control, struct sock_cm_info *sockcminfo) +{ + struct cmsghdr *cm; + + for (cm = M_FIRST_CMSGHDR(control); + is_cmsg_valid(control, cm); + cm = M_NXT_CMSGHDR(control, cm)) { + int val; + + if (cm->cmsg_level != SOL_SOCKET) { + continue; + } + + if (cm->cmsg_len == CMSG_LEN(sizeof(int))) { + val = *(int *)(void *)CMSG_DATA(cm); + } + + switch (cm->cmsg_type) { + case SO_TRAFFIC_CLASS: + if (cm->cmsg_len != CMSG_LEN(sizeof(int))) { + break; + } + if (SO_VALID_TC(val)) { + sockcminfo->sotc = val; + break; + } else if (val < SO_TC_NET_SERVICE_OFFSET) { + break; + } + /* + * Handle the case SO_NET_SERVICE_TYPE values are + * passed using SO_TRAFFIC_CLASS + */ + val = val - SO_TC_NET_SERVICE_OFFSET; + + OS_FALLTHROUGH; + case SO_NET_SERVICE_TYPE: + if (cm->cmsg_len != CMSG_LEN(sizeof(int))) { + break; + } + + if (!IS_VALID_NET_SERVICE_TYPE(val)) { + break; + } + sockcminfo->netsvctype = val; + sockcminfo->sotc = sotc_by_netservicetype[val]; + break; + case SCM_TXTIME: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint64_t))) { + break; + } + + sockcminfo->tx_time = *(uint64_t *)(void *)CMSG_DATA(cm); + break; + default: + break; + } + } +} + __attribute__((noinline, cold, not_tail_called, noreturn)) __private_extern__ int assfail(const char *a, const char *f, int l) diff --git a/bsd/kern/uipc_socket.h b/bsd/kern/uipc_socket.h new file mode 100644 index 000000000..66824afba --- /dev/null +++ b/bsd/kern/uipc_socket.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
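For illustration only (not in the patch): sock_parse_cm_info() above accepts SOL_SOCKET ancillary data carrying SO_TRAFFIC_CLASS, SO_NET_SERVICE_TYPE, or SCM_TXTIME and folds it into the per-send classification. A hedged sketch of the sending side in userspace; whether a given protocol honors the classification for a particular socket type is outside this hunk:

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/*
 * Illustration only: attach a NET_SERVICE_TYPE classification as
 * SOL_SOCKET ancillary data so the kernel cmsg parser can map it to a
 * traffic class for this send.
 */
static ssize_t
send_classified(int fd, const void *buf, size_t len)
{
    char cmsgbuf[CMSG_SPACE(sizeof(int))];
    struct iovec iov = { .iov_base = (void *)(uintptr_t)buf, .iov_len = len };
    struct msghdr msg;
    struct cmsghdr *cm;
    int svc = NET_SERVICE_TYPE_VI;      /* interactive video class */

    memset(&msg, 0, sizeof(msg));
    memset(cmsgbuf, 0, sizeof(cmsgbuf));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = cmsgbuf;
    msg.msg_controllen = sizeof(cmsgbuf);

    cm = CMSG_FIRSTHDR(&msg);
    cm->cmsg_level = SOL_SOCKET;
    cm->cmsg_type = SO_NET_SERVICE_TYPE;
    cm->cmsg_len = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cm), &svc, sizeof(svc));

    return sendmsg(fd, &msg, 0);
}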
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifdef XNU_KERNEL_PRIVATE + +#ifndef _KERN_UIPC_SOCKET_H +#define _KERN_UIPC_SOCKET_H + +#include + +#include + +extern struct mem_acct *socket_memacct; + +static inline void +socket_memacct_add(unsigned int size) +{ + mem_acct_add(socket_memacct, size); +} + +static inline void +socket_memacct_sub(unsigned int size) +{ + mem_acct_sub(socket_memacct, size); +} + +static inline bool +socket_memacct_hardlimit() +{ + return mem_acct_limited(socket_memacct) == MEMACCT_HARDLIMIT; +} + +static inline bool +socket_memacct_limited() +{ + return mem_acct_limited(socket_memacct) != 0; +} + +struct sock_cm_info { + int sotc; + int netsvctype; + uint64_t tx_time; +}; + +static inline void +sock_init_cm_info(struct sock_cm_info *sockcminfo, const struct socket *so) +{ + sockcminfo->sotc = so->so_traffic_class; + sockcminfo->netsvctype = so->so_netsvctype; + sockcminfo->tx_time = 0; +} + +extern void sock_parse_cm_info(struct mbuf *control, struct sock_cm_info *sockcminfo); + +#endif /*_KERN_UIPC_SOCKET_H */ + +#endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index e8d3b5677..2b7aa05c4 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -86,6 +86,8 @@ #include #include #include +#include +#include #include #include #include @@ -130,7 +132,7 @@ static int sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop) /* * Primitive routines for operating on sockets and socket buffers */ -static int soqlimitcompat = 1; +int soqlimitcompat = 1; static int soqlencomp = 0; /* @@ -357,16 +359,14 @@ sonewconn_internal(struct socket *head, int connstatus) if (so_qlen >= (soqlimitcompat ? head->so_qlimit : (3 * head->so_qlimit / 2))) { - return (struct socket *)0; + return NULL; } - so = soalloc(1, SOCK_DOM(head), head->so_type); + if (proto_memacct_hardlimit(head->so_proto)) { + return NULL; + } + so = soalloc(); if (so == NULL) { - return (struct socket *)0; - } - /* check if head was closed during the soalloc */ - if (head->so_proto == NULL) { - sodealloc(so); - return (struct socket *)0; + return NULL; } so->so_type = head->so_type; @@ -411,9 +411,11 @@ sonewconn_internal(struct socket *head, int connstatus) so->so_traffic_class = head->so_traffic_class; so->so_netsvctype = head->so_netsvctype; + proto_memacct_add(so->so_proto, sizeof(struct socket)); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); - return (struct socket *)0; + return NULL; } so->so_rcv.sb_flags |= (head->so_rcv.sb_flags & SB_USRSIZE); so->so_snd.sb_flags |= (head->so_snd.sb_flags & SB_USRSIZE); @@ -431,7 +433,7 @@ sonewconn_internal(struct socket *head, int connstatus) if (head->so_proto->pr_unlock) { socket_lock(head, 0); } - return (struct socket *)0; + return NULL; } if (head->so_proto->pr_unlock) { socket_lock(head, 0); @@ -442,7 +444,7 @@ sonewconn_internal(struct socket *head, int connstatus) if ((head->so_options & SO_ACCEPTCONN) == 0) { so->so_state &= ~SS_NOFDREF; soclose(so); - return (struct socket *)0; + return NULL; } } @@ -1038,45 +1040,6 @@ sbappendstream(struct sockbuf *sb, struct mbuf *m) return 1; } -#ifdef SOCKBUF_DEBUG -void -sbcheck(struct sockbuf *sb) -{ - struct mbuf *m; - struct mbuf *n = 0; - u_int32_t len = 0, mbcnt = 0; - lck_mtx_t *mutex_held; - - if (sb->sb_so->so_proto->pr_getlock != NULL) { - mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0); - } else { - mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx; - } - - LCK_MTX_ASSERT(mutex_held, 
LCK_MTX_ASSERT_OWNED); - - if (sbchecking == 0) { - return; - } - - for (m = sb->sb_mb; m; m = n) { - n = m->m_nextpkt; - for (; m; m = m->m_next) { - len += m->m_len; - mbcnt += _MSIZE; - /* XXX pretty sure this is bogus */ - if (m->m_flags & M_EXT) { - mbcnt += m->m_ext.ext_size; - } - } - } - if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { - panic("cc %ld != %ld || mbcnt %ld != %ld", len, sb->sb_cc, - mbcnt, sb->sb_mbcnt); - } -} -#endif - void sblastrecordchk(struct sockbuf *sb, const char *where) { @@ -1265,7 +1228,7 @@ sbconcat_mbufs(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct } if (asa != NULL) { - _CASSERT(sizeof(asa->sa_len) == sizeof(__uint8_t)); + static_assert(sizeof(asa->sa_len) == sizeof(__uint8_t)); if (MLEN <= UINT8_MAX && asa->sa_len > MLEN) { return NULL; } @@ -1713,9 +1676,6 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) continue; } if (compress && n != NULL && (n->m_flags & M_EOR) == 0 && -#ifndef __APPLE__ - M_WRITABLE(n) && -#endif m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { @@ -1724,7 +1684,6 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) n->m_len += m->m_len; sb->sb_cc += m->m_len; if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) { - /* XXX: Probably don't need */ sb->sb_ctl += m->m_len; } @@ -1738,6 +1697,36 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) m = m_free(m); continue; } + if (compress && n != NULL && (n->m_flags & M_EOR) == 0 && + proto_memacct_limited(sb->sb_so->so_proto) && + n->m_type == m->m_type) { + int tocopy = min((int)M_TRAILINGSPACE(n), m->m_len); + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + tocopy); + n->m_len += tocopy; + sb->sb_cc += tocopy; + if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) { + sb->sb_ctl += m->m_len; + } + + /* update send byte count */ + if (sb->sb_flags & SB_SNDBYTE_CNT) { + inp_incr_sndbytes_total(sb->sb_so, + m->m_len); + inp_incr_sndbytes_unsent(sb->sb_so, + m->m_len); + } + + if (tocopy < m->m_len) { + memmove(mtod(m, caddr_t), + mtod(m, caddr_t) + tocopy, m->m_len - tocopy); + + m->m_len -= tocopy; + } else { + m = m_free(m); + continue; + } + } if (n != NULL) { n->m_next = m; } else { @@ -1871,19 +1860,12 @@ sbdrop(struct sockbuf *sb, int len) if (m == NULL) { if (next == NULL) { /* - * temporarily replacing this panic with printf - * because it occurs occasionally when closing - * a socket when there is no harm in ignoring - * it. This problem will be investigated - * further. + * We have reached the end of the mbuf chain before + * freeing the requested amount of data. + * Since there is no data left, zero the counts + * and exit the loop. */ - /* panic("sbdrop"); */ - printf("sbdrop - count not zero\n"); len = 0; - /* - * zero the counts. 
if we have no mbufs, - * we have no data (PR-2986815) - */ sb->sb_cc = 0; sb->sb_mbcnt = 0; break; @@ -2449,15 +2431,15 @@ sowriteable(struct socket *so) void sballoc(struct sockbuf *sb, struct mbuf *m) { + int mbcnt = m_capacity(m); + sb->sb_cc += m->m_len; if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) { sb->sb_ctl += m->m_len; } - sb->sb_mbcnt += _MSIZE; - if (m->m_flags & M_EXT) { - sb->sb_mbcnt += m->m_ext.ext_size; - } + sb->sb_mbcnt += mbcnt; + proto_memacct_add(sb->sb_so->so_proto, mbcnt); /* * If data is being added to the send socket buffer, @@ -2473,14 +2455,15 @@ sballoc(struct sockbuf *sb, struct mbuf *m) void sbfree(struct sockbuf *sb, struct mbuf *m) { + int mbcnt = m_capacity(m); + sb->sb_cc -= m->m_len; if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) { sb->sb_ctl -= m->m_len; } - sb->sb_mbcnt -= _MSIZE; - if (m->m_flags & M_EXT) { - sb->sb_mbcnt -= m->m_ext.ext_size; - } + + sb->sb_mbcnt -= mbcnt; + proto_memacct_sub(sb->sb_so->so_proto, mbcnt); /* * If data is being removed from the send socket buffer, @@ -2717,8 +2700,8 @@ void soevent(struct socket *so, uint32_t hint) { if (net_wake_pkt_debug > 0 && (hint & SO_FILT_HINT_WAKE_PKT)) { - os_log(OS_LOG_DEFAULT, "%s: SO_FILT_HINT_WAKE_PKT so %p", - __func__, so); + os_log(wake_packet_log_handle, "soevents: SO_FILT_HINT_WAKE_PKT so_gencnt: %llu", + so->so_gencnt); } if (so->so_flags & SOF_KNOTE) { @@ -2997,9 +2980,6 @@ SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, njcl, - CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, ""); - SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes, CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, ""); diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 288bf201a..8e7e708bf 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -1771,7 +1771,8 @@ static int sendit_x(proc_ref_t p, socket_ref_t so, struct sendmsg_x_args *uap, u_int *retval) { int error = 0; - uio_t __single auio = NULL; + UIO_STACKBUF(uio_buf, UIO_SMALLIOV); + uio_t __single auio; const bool is_p_64bit_process = IS_64BIT_PROCESS(p); void *src; MBUFQ_HEAD() pktlist = {}; @@ -1785,15 +1786,10 @@ sendit_x(proc_ref_t p, socket_ref_t so, struct sendmsg_x_args *uap, u_int *retva *retval = 0; /* We re-use the uio when possible */ - auio = uio_create(1, 0, + auio = uio_createwithbuffer(UIO_SMALLIOV, 0, (is_p_64bit_process ? UIO_USERSPACE64 : UIO_USERSPACE32), - UIO_WRITE); - if (auio == NULL) { - error = ENOBUFS; - DBG_PRINTF("%s uio_create() failed %d", - __func__, error); - goto done; - } + UIO_WRITE, &uio_buf[0], + UIO_SIZEOF(UIO_SMALLIOV)); src = __unsafe_forge_bidi_indexable(void *, uap->msgp, uap->cnt); @@ -3933,8 +3929,7 @@ sendfile(proc_ref_t p, struct sendfile_args *uap, __unused int *retval) * large writes only if there is a jumbo cluster pool and * if the socket is marked accordingly. 
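Not part of the patch: the SO_MAX_PACING_RATE and SO_CONNECTION_IDLE options added to sosetoptlock()/sogetoptlock() earlier in this diff take a uint64_t and an int respectively, reject a pacing rate of 0, and are limited to PF_INET/PF_INET6 sockets. A hypothetical userspace sketch; the option macros are taken from the diff, but their numeric values and SDK visibility are not shown here:

#include <stdint.h>
#include <sys/socket.h>

/* Illustration only: apply both new options to an existing inet socket. */
static int
tune_socket(int fd)
{
    uint64_t rate = 10ULL * 1000 * 1000;    /* pacing rate; 0 is rejected */
    int idle = 1;

    if (setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
        &rate, sizeof(rate)) != 0) {
        return -1;
    }
    return setsockopt(fd, SOL_SOCKET, SO_CONNECTION_IDLE,
        &idle, sizeof(idle));
}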
*/ - jumbocl = sosendjcl && njcl > 0 && - ((so->so_flags & SOF_MULTIPAGES) || sosendjcl_ignore_capab); + jumbocl = (so->so_flags & SOF_MULTIPAGES) != 0; socket_unlock(so, 0); alloc_sendpkt(M_WAIT, xfsize, &nbufs, &m0, jumbocl); diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index efa517468..c42b5bd6b 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -2473,7 +2473,7 @@ out: void unp_init(void) { - _CASSERT(UIPC_MAX_CMSG_FD >= (MCLBYTES / sizeof(int))); + static_assert(UIPC_MAX_CMSG_FD >= (MCLBYTES / sizeof(int))); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); } diff --git a/bsd/kern/vsock_domain.c b/bsd/kern/vsock_domain.c index ebabb5104..ff7e1c115 100644 --- a/bsd/kern/vsock_domain.c +++ b/bsd/kern/vsock_domain.c @@ -40,39 +40,65 @@ #include #include #include +#include #define sotovsockpcb(so) ((struct vsockpcb *)(so)->so_pcb) #define VSOCK_PORT_RESERVED 1024 +#define VSOCK_PRIVATE_ENTITLEMENT "com.apple.private.vsock" /* VSock Protocol Globals */ -static struct vsock_transport * _Atomic the_vsock_transport = NULL; -static ZONE_DEFINE(vsockpcb_zone, "vsockpcbzone", - sizeof(struct vsockpcb), ZC_NONE); -static LCK_GRP_DECLARE(vsock_lock_grp, "vsock"); -static struct vsockpcbinfo vsockinfo; +static struct vsock_transport * _Atomic the_vsock_transport[VSOCK_PROTO_MAX]; +static ZONE_DEFINE_TYPE(vsockpcb_zone, "vsockpcbzone", struct vsockpcb, ZC_NONE); +static struct vsockpcbinfo vsockinfo[VSOCK_PROTO_MAX]; -static uint32_t vsock_sendspace = VSOCK_MAX_PACKET_SIZE * 8; -static uint32_t vsock_recvspace = VSOCK_MAX_PACKET_SIZE * 8; +static uint32_t vsock_sendspace[VSOCK_PROTO_MAX]; +static uint32_t vsock_recvspace[VSOCK_PROTO_MAX]; + +/* VSock Private Entitlements */ + +static errno_t +vsock_validate_entitlements(uint16_t protocol, struct proc *p) +{ + if (protocol != VSOCK_PROTO_PRIVATE) { + return 0; + } + + if (!p) { + p = current_proc(); + } + + if (p == kernproc) { + // Assume kernel callers are entitled. + return 0; + } + + if (!IOTaskHasEntitlement(proc_task(p), VSOCK_PRIVATE_ENTITLEMENT)) { + return EPERM; + } + + return 0; +} /* VSock PCB Helpers */ static uint32_t -vsock_get_peer_space(struct vsockpcb *pcb) +vsock_get_peer_space(struct vsockpcb *_Nonnull pcb) { + VERIFY(pcb != NULL); return pcb->peer_buf_alloc - (pcb->tx_cnt - pcb->peer_fwd_cnt); } static struct vsockpcb * -vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst) +vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst, uint16_t protocol) { struct vsockpcb *preferred = NULL; struct vsockpcb *match = NULL; struct vsockpcb *pcb = NULL; - lck_rw_lock_shared(&vsockinfo.bound_lock); - LIST_FOREACH(pcb, &vsockinfo.bound, bound) { + lck_rw_lock_shared(&vsockinfo[protocol].bound_lock); + LIST_FOREACH(pcb, &vsockinfo[protocol].bound, bound) { // Source cid and port must match. Only destination port must match. 
(Allows for a changing CID during migration) socket_lock(pcb->so, 1); if ((pcb->so->so_state & SS_ISCONNECTED || pcb->so->so_state & SS_ISCONNECTING) && @@ -90,14 +116,15 @@ vsock_get_matching_pcb(struct vsock_address src, struct vsock_address dst) socket_lock(match->so, 1); preferred = match; } - lck_rw_done(&vsockinfo.bound_lock); + lck_rw_done(&vsockinfo[protocol].bound_lock); return preferred; } static errno_t -vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t local_port, uint32_t remote_cid, uint32_t remote_port) +vsock_bind_address_if_free(struct vsockpcb *_Nonnull pcb, uint32_t local_cid, uint32_t local_port, uint32_t remote_cid, uint32_t remote_port) { + VERIFY(pcb != NULL); socket_lock_assert_owned(pcb->so); // Privileged ports. @@ -108,12 +135,13 @@ vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t lo bool taken = false; const bool check_remote = (remote_cid != VMADDR_CID_ANY && remote_port != VMADDR_PORT_ANY); + const uint16_t protocol = pcb->so->so_protocol; struct vsockpcb *pcb_match = NULL; socket_unlock(pcb->so, 0); - lck_rw_lock_exclusive(&vsockinfo.bound_lock); - LIST_FOREACH(pcb_match, &vsockinfo.bound, bound) { + lck_rw_lock_exclusive(&vsockinfo[protocol].bound_lock); + LIST_FOREACH(pcb_match, &vsockinfo[protocol].bound, bound) { socket_lock(pcb_match->so, 1); if (pcb == pcb_match || (!check_remote && pcb_match->local_address.port == local_port) || @@ -129,9 +157,9 @@ vsock_bind_address_if_free(struct vsockpcb *pcb, uint32_t local_cid, uint32_t lo if (!taken) { pcb->local_address = (struct vsock_address) { .cid = local_cid, .port = local_port }; pcb->remote_address = (struct vsock_address) { .cid = remote_cid, .port = remote_port }; - LIST_INSERT_HEAD(&vsockinfo.bound, pcb, bound); + LIST_INSERT_HEAD(&vsockinfo[protocol].bound, pcb, bound); } - lck_rw_done(&vsockinfo.bound_lock); + lck_rw_done(&vsockinfo[protocol].bound_lock); return taken ? 
EADDRINUSE : 0; } @@ -175,14 +203,16 @@ vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsoc if (laddr.port != VMADDR_PORT_ANY) { error = vsock_bind_address_if_free(pcb, laddr.cid, laddr.port, raddr.cid, raddr.port); } else { + const uint16_t protocol = pcb->so->so_protocol; + socket_unlock(pcb->so, 0); - lck_mtx_lock(&vsockinfo.port_lock); + lck_mtx_lock(&vsockinfo[protocol].port_lock); socket_lock(pcb->so, 0); const uint32_t first = VSOCK_PORT_RESERVED; const uint32_t last = VMADDR_PORT_ANY - 1; uint32_t count = last - first + 1; - uint32_t *last_port = &vsockinfo.last_port; + uint32_t *last_port = &vsockinfo[protocol].last_port; if (pcb->so->so_flags & SOF_BINDRANDOMPORT) { uint32_t random = 0; @@ -192,7 +222,7 @@ vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsoc do { if (count == 0) { - lck_mtx_unlock(&vsockinfo.port_lock); + lck_mtx_unlock(&vsockinfo[protocol].port_lock); return EADDRNOTAVAIL; } count--; @@ -205,7 +235,7 @@ vsock_bind_address(struct vsockpcb *pcb, struct vsock_address laddr, struct vsoc error = vsock_bind_address_if_free(pcb, laddr.cid, *last_port, raddr.cid, raddr.port); } while (error); - lck_mtx_unlock(&vsockinfo.port_lock); + lck_mtx_unlock(&vsockinfo[protocol].port_lock); } return error; @@ -228,15 +258,17 @@ vsock_unbind_pcb_locked(struct vsockpcb *pcb, bool is_locked) return; } + const uint16_t protocol = so->so_protocol; + if (!is_locked) { socket_unlock(so, 0); - lck_rw_lock_exclusive(&vsockinfo.bound_lock); + lck_rw_lock_exclusive(&vsockinfo[protocol].bound_lock); socket_lock(so, 0); // Case where some other thread also called unbind() on this socket while waiting to acquire its lock. if (!pcb->bound.le_prev) { soisdisconnected(so); - lck_rw_done(&vsockinfo.bound_lock); + lck_rw_done(&vsockinfo[protocol].bound_lock); return; } } @@ -248,7 +280,7 @@ vsock_unbind_pcb_locked(struct vsockpcb *pcb, bool is_locked) pcb->bound.le_prev = NULL; if (!is_locked) { - lck_rw_done(&vsockinfo.bound_lock); + lck_rw_done(&vsockinfo[protocol].bound_lock); } } @@ -312,15 +344,16 @@ vsock_pcb_send_message(struct vsockpcb *pcb, enum vsock_operation operation, mbu src.cid = transport_cid; } - uint32_t buf_alloc = pcb->so->so_rcv.sb_hiwat; - uint32_t fwd_cnt = pcb->fwd_cnt; + const uint16_t protocol = pcb->so->so_protocol; + const uint32_t buf_alloc = pcb->so->so_rcv.sb_hiwat; + const uint32_t fwd_cnt = pcb->fwd_cnt; if (src.cid == dst.cid) { pcb->last_buf_alloc = buf_alloc; pcb->last_fwd_cnt = fwd_cnt; socket_unlock(pcb->so, 0); - error = vsock_put_message(src, dst, operation, buf_alloc, fwd_cnt, m); + error = vsock_put_message(src, dst, operation, buf_alloc, fwd_cnt, m, protocol); socket_lock(pcb->so, 0); } else { struct vsock_transport *transport = pcb->transport; @@ -336,7 +369,7 @@ vsock_pcb_send_message(struct vsockpcb *pcb, enum vsock_operation operation, mbu } static errno_t -vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst) +vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst, uint16_t protocol) { if (dst.cid == VMADDR_CID_ANY || dst.port == VMADDR_PORT_ANY) { return EINVAL; @@ -346,7 +379,7 @@ vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst) struct vsock_transport *transport = NULL; if (src.cid == VMADDR_CID_ANY) { - transport = os_atomic_load(&the_vsock_transport, relaxed); + transport = os_atomic_load(&the_vsock_transport[protocol], relaxed); if (transport == NULL) { return ENODEV; } @@ -361,7 +394,7 @@ vsock_pcb_reset_address(struct 
vsock_address src, struct vsock_address dst) if (src.cid == dst.cid) { // Reset both sockets. - struct vsockpcb *pcb = vsock_get_matching_pcb(src, dst); + struct vsockpcb *pcb = vsock_get_matching_pcb(src, dst, protocol); if (pcb) { socket_lock_assert_owned(pcb->so); vsock_unbind_pcb(pcb); @@ -369,7 +402,7 @@ vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst) } } else { if (!transport) { - transport = os_atomic_load(&the_vsock_transport, relaxed); + transport = os_atomic_load(&the_vsock_transport[protocol], relaxed); if (transport == NULL) { return ENODEV; } @@ -381,13 +414,13 @@ vsock_pcb_reset_address(struct vsock_address src, struct vsock_address dst) } static errno_t -vsock_pcb_safe_reset_address(struct vsockpcb *pcb, struct vsock_address src, struct vsock_address dst) +vsock_pcb_safe_reset_address(struct vsockpcb *pcb, struct vsock_address src, struct vsock_address dst, uint16_t protocol) { if (pcb) { socket_lock_assert_owned(pcb->so); socket_unlock(pcb->so, 0); } - errno_t error = vsock_pcb_reset_address(src, dst); + errno_t error = vsock_pcb_reset_address(src, dst, protocol); if (pcb) { socket_lock(pcb->so, 0); } @@ -430,6 +463,18 @@ vsock_pcb_credit_update(struct vsockpcb *pcb) return vsock_pcb_send_message(pcb, VSOCK_CREDIT_UPDATE, NULL); } +static errno_t +vsock_pcb_credit_update_if_needed(struct vsockpcb *_Nonnull pcb) +{ + VERIFY(pcb != NULL); + + // Sends a credit update if the credit values have changed since the last sent message. + if (pcb->so->so_rcv.sb_hiwat != pcb->last_buf_alloc || pcb->fwd_cnt != pcb->last_fwd_cnt) { + return vsock_pcb_credit_update(pcb); + } + return 0; +} + static errno_t vsock_pcb_credit_request(struct vsockpcb *pcb) { @@ -457,7 +502,7 @@ vsock_disconnect_pcb(struct vsockpcb *pcb) } static errno_t -vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr) +vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr, struct proc *p) { if (!pcb || !pcb->so || !addr) { return EINVAL; @@ -478,14 +523,20 @@ vsock_sockaddr_vm_validate(struct vsockpcb *pcb, struct sockaddr_vm *addr) return EAFNOSUPPORT; } + errno_t error = vsock_validate_entitlements(pcb->so->so_protocol, p); + if (error) { + return error; + } + return 0; } /* VSock Receive Handlers */ static errno_t -vsock_put_message_connected(struct vsockpcb *pcb, enum vsock_operation op, mbuf_t m) +vsock_put_message_connected(struct vsockpcb *_Nonnull pcb, enum vsock_operation op, mbuf_t m) { + VERIFY(pcb != NULL); socket_lock_assert_owned(pcb->so); errno_t error = 0; @@ -520,8 +571,9 @@ vsock_put_message_connected(struct vsockpcb *pcb, enum vsock_operation op, mbuf_ } static errno_t -vsock_put_message_connecting(struct vsockpcb *pcb, enum vsock_operation op) +vsock_put_message_connecting(struct vsockpcb *_Nonnull pcb, enum vsock_operation op) { + VERIFY(pcb != NULL); socket_lock_assert_owned(pcb->so); errno_t error = 0; @@ -544,14 +596,17 @@ vsock_put_message_connecting(struct vsockpcb *pcb, enum vsock_operation op) } static errno_t -vsock_put_message_listening(struct vsockpcb *pcb, enum vsock_operation op, struct vsock_address src, struct vsock_address dst) +vsock_put_message_listening(struct vsockpcb *_Nonnull pcb, enum vsock_operation op, struct vsock_address src, struct vsock_address dst) { + VERIFY(pcb != NULL); socket_lock_assert_owned(pcb->so); struct sockaddr_vm addr; struct socket *so2 = NULL; struct vsockpcb *pcb2 = NULL; + const uint16_t protocol = pcb->so->so_protocol; + errno_t error = 0; switch (op) { @@ -566,7 +621,7 @@ 
vsock_put_message_listening(struct vsockpcb *pcb, enum vsock_operation op, struc so2 = sonewconn(pcb->so, 0, (struct sockaddr *)&addr); if (!so2) { // It is likely that the backlog is full. Deny this request. - vsock_pcb_safe_reset_address(pcb, dst, src); + vsock_pcb_safe_reset_address(pcb, dst, src, protocol); error = ECONNREFUSED; break; } @@ -597,7 +652,7 @@ done: soisdisconnected(so2); } socket_unlock(so2, 1); - vsock_pcb_reset_address(dst, src); + vsock_pcb_reset_address(dst, src, protocol); } else { socket_unlock(so2, 0); } @@ -605,10 +660,10 @@ done: break; case VSOCK_RESET: - error = vsock_pcb_safe_reset_address(pcb, dst, src); + error = vsock_pcb_safe_reset_address(pcb, dst, src, protocol); break; default: - vsock_pcb_safe_reset_address(pcb, dst, src); + vsock_pcb_safe_reset_address(pcb, dst, src, protocol); error = ENOTSUP; break; } @@ -621,10 +676,10 @@ done: errno_t vsock_add_transport(struct vsock_transport *transport) { - if (transport == NULL || transport->provider == NULL) { + if (transport == NULL || transport->provider == NULL || transport->protocol >= VSOCK_PROTO_MAX) { return EINVAL; } - if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, NULL, transport, acq_rel)) { + if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport[transport->protocol], NULL, transport, acq_rel)) { return EEXIST; } return 0; @@ -633,7 +688,7 @@ vsock_add_transport(struct vsock_transport *transport) errno_t vsock_remove_transport(struct vsock_transport *transport) { - if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport, transport, NULL, acq_rel)) { + if (!os_atomic_cmpxchg((void * volatile *)&the_vsock_transport[transport->protocol], transport, NULL, acq_rel)) { return ENODEV; } return 0; @@ -650,8 +705,8 @@ vsock_reset_transport(struct vsock_transport *transport) struct vsockpcb *pcb = NULL; struct vsockpcb *tmp_pcb = NULL; - lck_rw_lock_exclusive(&vsockinfo.bound_lock); - LIST_FOREACH_SAFE(pcb, &vsockinfo.bound, bound, tmp_pcb) { + lck_rw_lock_exclusive(&vsockinfo[transport->protocol].bound_lock); + LIST_FOREACH_SAFE(pcb, &vsockinfo[transport->protocol].bound, bound, tmp_pcb) { // Disconnect this transport's sockets. Listen and bind sockets must stay alive. 
socket_lock(pcb->so, 1); if (pcb->transport == transport && pcb->so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) { @@ -662,18 +717,18 @@ vsock_reset_transport(struct vsock_transport *transport) } socket_unlock(pcb->so, 1); } - lck_rw_done(&vsockinfo.bound_lock); + lck_rw_done(&vsockinfo[transport->protocol].bound_lock); return error; } errno_t -vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m) +vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m, uint16_t protocol) { - struct vsockpcb *pcb = vsock_get_matching_pcb(dst, src); + struct vsockpcb *pcb = vsock_get_matching_pcb(dst, src, protocol); if (!pcb) { if (op != VSOCK_RESET) { - vsock_pcb_reset_address(dst, src); + vsock_pcb_reset_address(dst, src, protocol); } if (m != NULL) { mbuf_freem_list(m); @@ -731,9 +786,10 @@ vsock_put_message(struct vsock_address src, struct vsock_address dst, enum vsock /* VSock Sysctl */ static int -vsock_pcblist SYSCTL_HANDLER_ARGS +common_vsock_pcblist(struct sysctl_oid *oidp __unused, void *arg1, int arg2 __unused, struct sysctl_req *_Nonnull req, uint16_t protocol) { -#pragma unused(oidp,arg2) + #pragma unused(oidp,arg2) + VERIFY(req != NULL); int error; @@ -743,10 +799,10 @@ vsock_pcblist SYSCTL_HANDLER_ARGS } // Get the generation count and the count of all vsock sockets. - lck_rw_lock_shared(&vsockinfo.all_lock); - uint64_t n = vsockinfo.all_pcb_count; - vsock_gen_t gen_count = vsockinfo.vsock_gencnt; - lck_rw_done(&vsockinfo.all_lock); + lck_rw_lock_shared(&vsockinfo[protocol].all_lock); + uint64_t n = vsockinfo[protocol].all_pcb_count; + vsock_gen_t gen_count = vsockinfo[protocol].vsock_gencnt; + lck_rw_done(&vsockinfo[protocol].all_lock); const size_t xpcb_len = sizeof(struct xvsockpcb); struct xvsockpgen xvg; @@ -779,11 +835,11 @@ vsock_pcblist SYSCTL_HANDLER_ARGS return 0; } - lck_rw_lock_shared(&vsockinfo.all_lock); + lck_rw_lock_shared(&vsockinfo[protocol].all_lock); n = 0; struct vsockpcb *pcb = NULL; - TAILQ_FOREACH(pcb, &vsockinfo.all, all) { + TAILQ_FOREACH(pcb, &vsockinfo[protocol].all, all) { // Bail if there is not enough user buffer for this next socket. if (req->oldlen - req->oldidx - sizeof(xvg) < xpcb_len) { break; @@ -822,9 +878,9 @@ vsock_pcblist SYSCTL_HANDLER_ARGS } // Update the generation count to match the sockets being returned. - gen_count = vsockinfo.vsock_gencnt; + gen_count = vsockinfo[protocol].vsock_gencnt; - lck_rw_done(&vsockinfo.all_lock); + lck_rw_done(&vsockinfo[protocol].all_lock); if (!error) { /* @@ -845,30 +901,68 @@ vsock_pcblist SYSCTL_HANDLER_ARGS return error; } +static int +vsock_pcblist SYSCTL_HANDLER_ARGS +{ + return common_vsock_pcblist(oidp, arg1, arg2, req, VSOCK_PROTO_STANDARD); +} + +static int +vsock_private_pcblist SYSCTL_HANDLER_ARGS +{ + return common_vsock_pcblist(oidp, arg1, arg2, req, VSOCK_PROTO_PRIVATE); +} + #ifdef SYSCTL_DECL +// Standard namespace. 
SYSCTL_NODE(_net, OID_AUTO, vsock, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "vsock"); SYSCTL_UINT(_net_vsock, OID_AUTO, sendspace, CTLFLAG_RW | CTLFLAG_LOCKED, - &vsock_sendspace, 0, "Maximum outgoing vsock datagram size"); + &vsock_sendspace[VSOCK_PROTO_STANDARD], 0, "Maximum outgoing vsock datagram size"); SYSCTL_UINT(_net_vsock, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED, - &vsock_recvspace, 0, "Maximum incoming vsock datagram size"); + &vsock_recvspace[VSOCK_PROTO_STANDARD], 0, "Maximum incoming vsock datagram size"); SYSCTL_PROC(_net_vsock, OID_AUTO, pcblist, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, - (caddr_t)(long)SOCK_STREAM, 0, vsock_pcblist, "S,xvsockpcb", + __unsafe_forge_single(caddr_t, SOCK_STREAM), 0, vsock_pcblist, "S,xvsockpcb", "List of active vsock sockets"); - SYSCTL_UINT(_net_vsock, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, - (u_int *)&vsockinfo.all_pcb_count, 0, ""); + (u_int *)&vsockinfo[VSOCK_PROTO_STANDARD].all_pcb_count, 0, ""); + +// Private namespace. +SYSCTL_NODE(_net, OID_AUTO, vsock_private, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "vsock_private"); +SYSCTL_PROC(_net_vsock_private, OID_AUTO, pcblist, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, + __unsafe_forge_single(caddr_t, SOCK_STREAM), 0, vsock_private_pcblist, "S,xvsockpcb", + "List of active private vsock sockets"); +SYSCTL_UINT(_net_vsock_private, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, + (u_int *)&vsockinfo[VSOCK_PROTO_PRIVATE].all_pcb_count, 0, ""); #endif /* VSock Protocol */ static int -vsock_attach(struct socket *so, int proto, struct proc *p) +vsock_attach(struct socket *_Nonnull so, int proto, struct proc *p) { #pragma unused(proto, p) + VERIFY(so != NULL); + + const uint16_t protocol = so->so_protocol; + if (protocol >= VSOCK_PROTO_MAX) { + return EINVAL; + } + + errno_t error = vsock_validate_entitlements(protocol, p); + if (error) { + return error; + } + + const uint32_t send_space = vsock_sendspace[protocol]; + const uint32_t receive_space = vsock_recvspace[protocol]; + if (send_space == 0 || receive_space == 0) { + return ENOMEM; + } // Reserve send and receive buffers. - errno_t error = soreserve(so, vsock_sendspace, vsock_recvspace); + error = soreserve(so, send_space, receive_space); if (error) { return error; } @@ -880,7 +974,7 @@ vsock_attach(struct socket *so, int proto, struct proc *p) } // Get the transport for this socket. - struct vsock_transport *transport = os_atomic_load(&the_vsock_transport, relaxed); + struct vsock_transport *transport = os_atomic_load(&the_vsock_transport[protocol], relaxed); if (transport == NULL) { return ENODEV; } @@ -908,11 +1002,11 @@ vsock_attach(struct socket *so, int proto, struct proc *p) } // Add to the list of all vsock sockets. 
- lck_rw_lock_exclusive(&vsockinfo.all_lock); - TAILQ_INSERT_TAIL(&vsockinfo.all, pcb, all); - vsockinfo.all_pcb_count++; - pcb->vsock_gencnt = ++vsockinfo.vsock_gencnt; - lck_rw_done(&vsockinfo.all_lock); + lck_rw_lock_exclusive(&vsockinfo[protocol].all_lock); + TAILQ_INSERT_TAIL(&vsockinfo[protocol].all, pcb, all); + vsockinfo[protocol].all_pcb_count++; + pcb->vsock_gencnt = ++vsockinfo[protocol].vsock_gencnt; + lck_rw_done(&vsockinfo[protocol].all_lock); return 0; } @@ -920,25 +1014,24 @@ vsock_attach(struct socket *so, int proto, struct proc *p) static int vsock_control(struct socket *so, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) data, struct ifnet *ifp, struct proc *p) { - #pragma unused(ifp) + #pragma unused(ifp, p) - VERIFY(so != NULL || p == kernproc); + VERIFY(so != NULL); if (cmd != IOCTL_VM_SOCKETS_GET_LOCAL_CID) { return EINVAL; } - struct vsock_transport *transport; - if (so) { - struct vsockpcb *pcb = sotovsockpcb(so); - if (pcb == NULL) { - return EINVAL; - } - transport = pcb->transport; - } else { - transport = os_atomic_load(&the_vsock_transport, relaxed); + if (so == NULL) { + return EINVAL; } + struct vsockpcb *pcb = sotovsockpcb(so); + if (pcb == NULL) { + return EINVAL; + } + + struct vsock_transport *transport = pcb->transport; if (transport == NULL) { return ENODEV; } @@ -971,18 +1064,24 @@ vsock_detach(struct socket *so) return error; } - // Remove from the list of all vsock sockets. - lck_rw_lock_exclusive(&vsockinfo.all_lock); - TAILQ_REMOVE(&vsockinfo.all, pcb, all); - pcb->all.tqe_next = NULL; - pcb->all.tqe_prev = NULL; - vsockinfo.all_pcb_count--; - vsockinfo.vsock_gencnt++; - lck_rw_done(&vsockinfo.all_lock); + const uint16_t protocol = so->so_protocol; // Mark this socket for deallocation. so->so_flags |= SOF_PCBCLEARING; + // Reorder locks. + socket_unlock(so, 0); + lck_rw_lock_exclusive(&vsockinfo[protocol].all_lock); + socket_lock(so, 0); + + // Remove from the list of all vsock sockets. + TAILQ_REMOVE(&vsockinfo[protocol].all, pcb, all); + pcb->all.tqe_next = NULL; + pcb->all.tqe_prev = NULL; + vsockinfo[protocol].all_pcb_count--; + vsockinfo[protocol].vsock_gencnt++; + lck_rw_done(&vsockinfo[protocol].all_lock); + return 0; } @@ -1004,7 +1103,7 @@ vsock_bind(struct socket *so, struct sockaddr *nam, struct proc *p) struct sockaddr_vm *addr = (struct sockaddr_vm *)nam; - errno_t error = vsock_sockaddr_vm_validate(pcb, addr); + errno_t error = vsock_sockaddr_vm_validate(pcb, addr, p); if (error) { return error; } @@ -1093,7 +1192,7 @@ vsock_connect(struct socket *so, struct sockaddr *nam, struct proc *p) struct sockaddr_vm *addr = (struct sockaddr_vm *)nam; - errno_t error = vsock_sockaddr_vm_validate(pcb, addr); + errno_t error = vsock_sockaddr_vm_validate(pcb, addr, p); if (error) { return error; } @@ -1376,9 +1475,9 @@ vsock_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, const uint32_t threshold = VSOCK_MAX_PACKET_SIZE; - // Send a credit update if is possible that the peer will no longer send. + // Send a credit update if it is possible that the peer will no longer send. 
if ((pcb->fwd_cnt - pcb->last_fwd_cnt + threshold) >= pcb->last_buf_alloc) { - errno_t error = vsock_pcb_credit_update(pcb); + errno_t error = vsock_pcb_credit_update_if_needed(pcb); if (!result && error) { result = error; } @@ -1408,23 +1507,37 @@ static struct pr_usrreqs vsock_usrreqs = { }; static void -vsock_init(struct protosw *pp, struct domain *dp) +common_vsock_init(struct protosw *pp, struct domain *dp, uint16_t protocol, lck_grp_t *lock_group) { #pragma unused(dp) - static int vsock_initialized = 0; + static int vsock_initialized[VSOCK_PROTO_MAX] = {0}; VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); - if (!os_atomic_cmpxchg((volatile int *)&vsock_initialized, 0, 1, acq_rel)) { + if (!os_atomic_cmpxchg((volatile int *)&vsock_initialized[protocol], 0, 1, acq_rel)) { return; } // Setup VSock protocol info struct. - lck_rw_init(&vsockinfo.all_lock, &vsock_lock_grp, LCK_ATTR_NULL); - lck_rw_init(&vsockinfo.bound_lock, &vsock_lock_grp, LCK_ATTR_NULL); - lck_mtx_init(&vsockinfo.port_lock, &vsock_lock_grp, LCK_ATTR_NULL); - TAILQ_INIT(&vsockinfo.all); - LIST_INIT(&vsockinfo.bound); - vsockinfo.last_port = VMADDR_PORT_ANY; + lck_rw_init(&vsockinfo[protocol].all_lock, lock_group, LCK_ATTR_NULL); + lck_rw_init(&vsockinfo[protocol].bound_lock, lock_group, LCK_ATTR_NULL); + lck_mtx_init(&vsockinfo[protocol].port_lock, lock_group, LCK_ATTR_NULL); + TAILQ_INIT(&vsockinfo[protocol].all); + LIST_INIT(&vsockinfo[protocol].bound); + vsockinfo[protocol].last_port = VMADDR_PORT_ANY; +} + +static void +vsock_init(struct protosw *pp, struct domain *dp) +{ + static LCK_GRP_DECLARE(vsock_lock_grp, "vsock"); + common_vsock_init(pp, dp, VSOCK_PROTO_STANDARD, &vsock_lock_grp); +} + +static void +vsock_private_init(struct protosw *pp, struct domain *dp) +{ + static LCK_GRP_DECLARE(vsock_private_lock_grp, "vsock_private"); + common_vsock_init(pp, dp, VSOCK_PROTO_PRIVATE, &vsock_private_lock_grp); } static int @@ -1444,8 +1557,10 @@ vsock_sofreelastref(struct socket *so, int dealloc) } static int -vsock_unlock(struct socket *so, int refcount, void *lr_saved) +vsock_unlock(struct socket *_Nonnull so, int refcount, void *lr_saved) { + VERIFY(so != NULL); + lck_mtx_t *mutex_held = so->so_proto->pr_domain->dom_mtx; #ifdef MORE_LOCKING_DEBUG LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); @@ -1472,14 +1587,22 @@ vsock_unlock(struct socket *so, int refcount, void *lr_saved) return 0; } -static struct protosw vsocksw[] = { +static struct protosw vsocksw[VSOCK_PROTO_MAX] = { { .pr_type = SOCK_STREAM, - .pr_protocol = 0, + .pr_protocol = VSOCK_PROTO_STANDARD, .pr_flags = PR_CONNREQUIRED | PR_WANTRCVD, .pr_init = vsock_init, .pr_unlock = vsock_unlock, .pr_usrreqs = &vsock_usrreqs, + }, + { + .pr_type = SOCK_STREAM, + .pr_protocol = VSOCK_PROTO_PRIVATE, + .pr_flags = PR_CONNREQUIRED | PR_WANTRCVD, + .pr_init = vsock_private_init, + .pr_unlock = vsock_unlock, + .pr_usrreqs = &vsock_usrreqs, } }; @@ -1490,15 +1613,21 @@ static const int vsock_proto_count = (sizeof(vsocksw) / sizeof(struct protosw)); static struct domain *vsock_domain = NULL; static void -vsock_dinit(struct domain *dp) +vsock_dinit(struct domain *_Nonnull dp) { // The VSock domain is initialized with a singleton pattern. + VERIFY(dp != NULL); VERIFY(!(dp->dom_flags & DOM_INITIALIZED)); VERIFY(vsock_domain == NULL); vsock_domain = dp; + const uint32_t default_buffer_size = VSOCK_MAX_PACKET_SIZE * 8; + // Add protocols and initialize. 
for (int i = 0; i < vsock_proto_count; i++) { + vsock_sendspace[i] = default_buffer_size; + vsock_recvspace[i] = default_buffer_size; + net_add_proto((struct protosw *)&vsocksw[i], dp, 1); } } diff --git a/bsd/machine/exec.h b/bsd/machine/exec.h index cb2616336..231f33409 100644 --- a/bsd/machine/exec.h +++ b/bsd/machine/exec.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,6 +43,8 @@ struct exec_info { }; int grade_binary(cpu_type_t, cpu_subtype_t, cpu_subtype_t, bool allow_simulator_binary); +int binary_grade_overrides_update(char *overrides_arg); +size_t bingrade_get_override_string(char *existing_overrides, size_t existing_overrides_bufsize); boolean_t binary_match(cpu_type_t mask_bits, cpu_type_t req_cpu, cpu_subtype_t req_subcpu, cpu_type_t test_cpu, cpu_subtype_t test_subcpu); diff --git a/bsd/man/man2/access.2 b/bsd/man/man2/access.2 index c17e78835..23cd6afd5 100644 --- a/bsd/man/man2/access.2 +++ b/bsd/man/man2/access.2 @@ -103,25 +103,32 @@ Values for are constructed by a bitwise-inclusive OR of flags from the following list, defined in .In fcntl.h : -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_EACCESS The checks for accessibility are performed using the effective user and group IDs instead of the real user and group ID as required in a call to .Fn access . .El -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW If .Fa path names a symbolic link, the status of the symbolic link is returned. .El -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW_ANY If .Fa path names a symbolic link, the status of the symbolic link is returned and if the path has any other symbolic links, an error is returned. .El +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY +.It Dv AT_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. +.El .Pp Even if a process has appropriate privileges and indicates success for .Dv X_OK , @@ -200,6 +207,10 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Er ENOTCAPABLE +AT_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .El .Sh SEE ALSO .Xr chmod 2 , diff --git a/bsd/man/man2/chmod.2 b/bsd/man/man2/chmod.2 index 7f5b148a9..0cc2f7236 100644 --- a/bsd/man/man2/chmod.2 +++ b/bsd/man/man2/chmod.2 @@ -94,19 +94,26 @@ The values for the are constructed by a bitwise-inclusive OR of flags from the following list, defined in .In fcntl.h : -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW If .Fa path names a symbolic link, then the mode of the symbolic link is changed. .El -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW_ANY If .Fa path names a symbolic link, then the mode of the symbolic link is changed and if if the path has any other symbolic links, an error is returned. .El +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY +.It Dv AT_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. +.El .Pp If .Fn fchmodat @@ -280,6 +287,10 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. 
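The AT_RESOLVE_BENEATH flag documented above for access(2)/faccessat(2), and in the related pages that follow, confines path resolution to the hierarchy under the starting directory and reports ENOTCAPABLE when the path escapes it. A minimal sketch, assuming the flag and errno names exactly as documented in these pages:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Returns 0 if relpath is readable and resolves entirely beneath dirfd. */
int
check_readable_beneath(int dirfd, const char *relpath)
{
    if (faccessat(dirfd, relpath, R_OK, AT_RESOLVE_BENEATH) == 0) {
        return 0;
    }
    if (errno == ENOTCAPABLE) {
        fprintf(stderr, "%s escapes the starting directory\n", relpath);
    } else {
        perror("faccessat");
    }
    return -1;
}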
+.It Bq Er ENOTCAPABLE +if AT_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .El .Sh LEGACY SYNOPSIS .Fd #include diff --git a/bsd/man/man2/chown.2 b/bsd/man/man2/chown.2 index 14f002621..4471030f5 100644 --- a/bsd/man/man2/chown.2 +++ b/bsd/man/man2/chown.2 @@ -121,13 +121,13 @@ Values for are constructed by a bitwise-inclusive OR of flags from the following list, defined in .In fcntl.h : -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW If .Fa path names a symbolic link, ownership of the symbolic link is changed. .El -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW_ANY If .Fa path @@ -135,6 +135,13 @@ names a symbolic link, ownership of the symbolic link is changed. If any any other symbolic link is encountered it is not followed and an error is returned instead. .El +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY +.It Dv AT_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. +.El .Pp If .Fn fchownat @@ -258,6 +265,10 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Er ENOTCAPABLE +if AT_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .El .Sh SEE ALSO .Xr chgrp 1 , diff --git a/bsd/man/man2/clonefile.2 b/bsd/man/man2/clonefile.2 index b15e7a11a..58b53da78 100644 --- a/bsd/man/man2/clonefile.2 +++ b/bsd/man/man2/clonefile.2 @@ -134,7 +134,7 @@ parameter specifies the options that can be passed. Options are specified in the .Fa flags argument by or'ing the following values: . -.Bl -tag -width CLONE_NOFOLLOW +.Bl -tag -width CLONE_RESOLVE_BENEATH . .It CLONE_NOFOLLOW Don't follow the src file if it is a symbolic link (applicable only if the source is not a directory). @@ -144,7 +144,7 @@ names a symbolic link. . .El .Pp -.Bl -tag -width CLONE_NOOWNERCOPY +.Bl -tag -width CLONE_RESOLVE_BENEATH . .It CLONE_NOOWNERCOPY Don't copy ownership information from the source when run called with superuser privileges. @@ -154,12 +154,32 @@ names a symbolic link. . .El .Pp -.Bl -tag -width CLONE_ACL +.Bl -tag -width CLONE_RESOLVE_BENEATH . .It CLONE_ACL Copy ACLs from the source file. .El .Pp +.Bl -tag -width CLONE_RESOLVE_BENEATH +. +.It CLONE_NOFOLLOW_ANY +do not follow any symbolic links encountered during pathname resolution. An error is returned if a symlink is encountered before the last component of either +.Fa src +or +.Fa dst. +.El +.Pp +.Bl -tag -width CLONE_RESOLVE_BENEATH +. +.It CLONE_RESOLVE_BENEATH +If +.Fa src +and +.Fa dst +does not reside in the hierarchy beneath the starting directory, +an error is returned. +.El +.Pp The .Fn clonefile , .Fn clonefileat @@ -232,6 +252,19 @@ or .Fa dst path arguments. . +.It Bq Er ELOOP +If CLONE_NOFOLLOW_ANY option was passed and a symbolic link was encountered in +translating the pathname of either +.Fa src +or +.Fa dst +arguments. +.It Bq Er ENOTCAPABLE +If CLONE_RESOLVE_BENEATH was passed and either +.Fa src +or +.Fa dst +does not reside in the directory hierarchy beneath the starting directory. .It Bq Er EROFS The requested operation requires writing in a directory on a read-only file system. . 
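The clonefile.2 text above adds CLONE_NOFOLLOW_ANY and CLONE_RESOLVE_BENEATH. A short sketch of clonefileat(2) with both flags, assuming the flag names as documented and <sys/clonefile.h> for the prototype:

#include <errno.h>
#include <stdio.h>
#include <sys/clonefile.h>

/* Clone src to dst under the same directory fd without following symlinks
 * and without letting either path escape dirfd. */
int
clone_beneath(int dirfd, const char *src, const char *dst)
{
    int flags = CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH;

    if (clonefileat(dirfd, src, dirfd, dst, flags) == 0) {
        return 0;
    }
    if (errno == ELOOP) {
        fprintf(stderr, "symlink encountered resolving %s or %s\n", src, dst);
    } else if (errno == ENOTCAPABLE) {
        fprintf(stderr, "%s or %s escapes the starting directory\n", src, dst);
    } else {
        perror("clonefileat");
    }
    return -1;
}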
diff --git a/bsd/man/man2/execve.2 b/bsd/man/man2/execve.2 index 94f939a17..8c984e947 100644 --- a/bsd/man/man2/execve.2 +++ b/bsd/man/man2/execve.2 @@ -170,9 +170,8 @@ When a program is executed as a result of an .Fn execve call, it is entered as follows: .Bd -literal -offset indent -main(argc, argv, envp) -int argc; -char **argv, **envp; +.Ft int +.Fn main "int argc" "char **argv" "char **envp"; .Ed .Pp where diff --git a/bsd/man/man2/fs_snapshot_create.2 b/bsd/man/man2/fs_snapshot_create.2 index 1a05b3317..4075bd398 100644 --- a/bsd/man/man2/fs_snapshot_create.2 +++ b/bsd/man/man2/fs_snapshot_create.2 @@ -93,6 +93,8 @@ parameter specifies the options that can be passed. only .Fn fs_snapshot_mount options are currently defined: .Bl -tag -width SNAPSHOT_MNT_IGNORE_OWNERSHIP +.It Dv SNAPSHOT_MNT_NOEXEC +Can't exec from filesystem. .It Dv SNAPSHOT_MNT_NOSUID Setuid bits are not honored on this filesystem. .It Dv SNAPSHOT_MNT_NODEV diff --git a/bsd/man/man2/getattrlist.2 b/bsd/man/man2/getattrlist.2 index 793554b11..8429aa7b2 100644 --- a/bsd/man/man2/getattrlist.2 +++ b/bsd/man/man2/getattrlist.2 @@ -16,7 +16,7 @@ .\" .\" @(#)getattrlist.2 . -.Dd July 31, 2024 +.Dd March 7, 2025 .Dt GETATTRLIST 2 .Os Darwin .Sh NAME @@ -215,7 +215,7 @@ parameter is a bit set that controls the behaviour of the functions. The following option bits are defined. . -.Bl -tag -width FSOPT_PACK_INVAL_ATTRS +.Bl -tag -width FSOPT_ATTR_CMN_EXTENDED . .It FSOPT_NOFOLLOW If this bit is set, @@ -234,6 +234,14 @@ In addition an error is returned if a symlink is encountered before the last component of .Fa path . . +.It FSOPT_RESOLVE_BENEATH +If this bit is set, +.Fn getattrlistat +and +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. +. .It FSOPT_REPORT_FULLSIZE The size of the attributes reported (in the first .Vt u_int32_t @@ -363,7 +371,7 @@ is a pointer to the ) + .Fa attr_dataoffset + -.Fa attr_datalength +.Fa attr_length ) > ( ( (char *) .Fa attrBuf ) + @@ -389,7 +397,7 @@ be the first attribute returned. By default, unsupported attributes will be skipped (i.e. not packed into the output buffer). This behavior can be over-ridden using the FSOPT_PACK_INVAL_ATTRS option flag. Both .Xr getattrlist 2 and -.Xr getatttrlistbulk 2 support this attribute while +.Xr getattrlistbulk 2 support this attribute while .Xr searchfs 2 does not. . .It ATTR_CMN_NAME @@ -460,7 +468,7 @@ ATTR_CMN_OBJID of a file system object may appear similar (in whole or in part) to it's ATTR_CMN_FILEID (see description of ATTR_CMN_FILEID below), \fBno relation between the two attributes should ever be implied.\fP .Pp -ATTR_CMN_OBJID is deprecated sarting with macOS 10.13, iOS 11.0, watchOS 4.0 and +ATTR_CMN_OBJID is deprecated starting with macOS 10.13, iOS 11.0, watchOS 4.0 and tvOS 11.0 and ATTR_CMNEXT_LINKID should be used in its place. ATTR_CMN_OBJID can only be used on older operating systems only if the file system doesn't 64 bit IDs. See the @@ -683,7 +691,7 @@ structure, of which only the ACL entry is used. .It ATTR_CMN_UUID A .Vt guid_t -of the owner of the file system object. Analoguous to +of the owner of the file system object. Analogous to .Dv ATTR_CMN_OWNERID . . .It ATTR_CMN_GRPUUID @@ -1238,7 +1246,8 @@ or snapshot, and which would be freed immediately if the file were deleted. A .Vt u_int64_t that uniquely identifies the file system object within a mounted volume for the -duration of its mount. +duration of its mount. 
This identifier is persistent on volumes that support the +VOL_CAP_FMT_PERSISTENTOBJECTIDS capability, such as HFS+ and APFS. .Pp On HFS+ and APFS volumes, the ATTR_CMNEXT_LINKID of a file system object is distinct from the ATTR_CMNEXT_LINKID of any hard link to that file @@ -1338,7 +1347,7 @@ directory is not marked maintain-dir-stats, a zero is returned. .It ATTR_CMNEXT_ATTRIBUTION_TAG An optional .Vt u_int64_t -id that represents the bundle id (owner) assoicated with the file +id that represents the bundle id (owner) associated with the file (zero means the file isn't attributed yet) . .It ATTR_CMNEXT_CLONE_REFCNT @@ -1769,6 +1778,13 @@ See .Xr fcntl 2 for more details. . +.It VOL_CAP_INT_BARRIERFSYNC +If this bit is set, the file system supports the +.Dv F_BARRIERFSYNC operation. +See +.Xr fcntl 2 +for more details. +. .El .Pp . @@ -1964,6 +1980,10 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Er ENOTCAPABLE +if FSOPT_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .El .Pp . diff --git a/bsd/man/man2/getattrlistbulk.2 b/bsd/man/man2/getattrlistbulk.2 index 60f7e4566..3f3432135 100644 --- a/bsd/man/man2/getattrlistbulk.2 +++ b/bsd/man/man2/getattrlistbulk.2 @@ -126,7 +126,7 @@ parameter is a bit set that controls the behaviour of .Fn getattrlistbulk . The following option bits are defined. . -.Bl -tag -width FSOPT_PACK_INVAL_ATTRS +.Bl -tag -width FSOPT_ATTR_CMN_EXTENDED . .It FSOPT_PACK_INVAL_ATTRS If this is bit is set, then all requested attributes, @@ -138,8 +138,13 @@ attribute_set_t structure returned for the attribute. Default values will be returned for invalid attributes and should be ignored. .Pp -Please see the discussion of this flag in +.It FSOPT_ATTR_CMN_EXTENDED +If this is bit is set, forkattrs are reinterpreted as a set of extended common attributes. +.El +.Pp +Please refer to the .Xr getattrlist 2 +manual page for a discussion of the option flags. . .El .Pp diff --git a/bsd/man/man2/getxattr.2 b/bsd/man/man2/getxattr.2 index 0db90e921..290df101f 100644 --- a/bsd/man/man2/getxattr.2 +++ b/bsd/man/man2/getxattr.2 @@ -74,7 +74,7 @@ currently associated with the attribute. .Fa options specify options for retrieving extended attributes: .Pp -.Bl -tag -width XATTR_NOFOLLOW +.Bl -tag -width XATTR_RESOLVE_BENEATH .It Dv XATTR_NOFOLLOW do not follow symbolic links. .Fn getxattr @@ -86,6 +86,11 @@ will return extended attribute data from the symbolic link instead. .It Dv XATTR_NOFOLLOW_ANY do not follow any symbolic links encountered during pathname resolution. An error is returned if a symlink is encountered before the last component of path. +.It Dv XATTR_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. .It Dv XATTR_SHOWCOMPRESSION .Fn getxattr and @@ -129,7 +134,7 @@ is invalid or .Fa options has an unsupported bit set. .It Bq Er EINVAL -XATTR_NOFOLLOW or XATTR_NOFOLLOW_ANY option was passed in +XATTR_NOFOLLOW, XATTR_NOFOLLOW_ANY or XATTR_RESOLVE_BENEATH option was passed in .Fn fgetxattr . .It Bq Er EISDIR .Fa path @@ -165,6 +170,10 @@ Too many symbolic links were encountered in translating the pathname. .It Bq Er ELOOP If XATTR_NOFOLLOW_ANY option was passed and a symbolic link was encountered in translating the pathname. 
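FSOPT_RESOLVE_BENEATH, documented above for getattrlist(2)/getattrlistat(2), follows the same pattern as the other *_RESOLVE_BENEATH options. A minimal sketch that fetches ATTR_CMN_OBJTYPE while keeping resolution beneath the starting directory; the attrlist and length-prefixed result-buffer layout follow the existing getattrlist(2) conventions, and the option and errno names are assumed as documented here:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/attr.h>
#include <unistd.h>

/* Fetch ATTR_CMN_OBJTYPE for relpath while keeping resolution beneath dirfd. */
int
objtype_beneath(int dirfd, const char *relpath, uint32_t *objtype_out)
{
    struct attrlist al;
    struct {
        uint32_t length;    /* total size of the returned attributes */
        uint32_t objtype;   /* fsobj_type_t payload for ATTR_CMN_OBJTYPE */
    } __attribute__((packed)) buf;

    memset(&al, 0, sizeof(al));
    al.bitmapcount = ATTR_BIT_MAP_COUNT;
    al.commonattr = ATTR_CMN_OBJTYPE;

    if (getattrlistat(dirfd, relpath, &al, &buf, sizeof(buf),
        FSOPT_RESOLVE_BENEATH) != 0) {
        if (errno == ENOTCAPABLE) {
            fprintf(stderr, "%s escapes the starting directory\n", relpath);
        } else {
            perror("getattrlistat");
        }
        return -1;
    }

    *objtype_out = buf.objtype;
    return 0;
}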
+.It Bq Er ENOTCAPABLE +If XATTR_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .It Bq Er EFAULT .Fa path or diff --git a/bsd/man/man2/kqueue.2 b/bsd/man/man2/kqueue.2 index 13049a96d..f7d4769d7 100644 --- a/bsd/man/man2/kqueue.2 +++ b/bsd/man/man2/kqueue.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008-2016 Apple Inc. All rights reserved. +.\" Copyright (c) 2008-2024 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -462,9 +462,15 @@ Note that this filter is not supported for vnodes. For sockets, the low water mark and socket error handling is identical to the EVFILT_READ case. .It EVFILT_AIO -This filter is currently unsupported. +Events for this filter are not registered with +.Fn kevent64 +directly but are registered via the +.Va aio_sigevent +member of an asynchronous +I/O request when it is scheduled via an asynchronous I/O system call such as +.Fn aio_read . .\"The sigevent portion of the AIO request is filled in, with -.\".Va sigev_notify_kqueue +.\".Va sigev_signo .\"containing the descriptor of the kqueue that the event should .\"be attached to, .\".Va sigev_value diff --git a/bsd/man/man2/link.2 b/bsd/man/man2/link.2 index 8dc15d179..988e9b567 100644 --- a/bsd/man/man2/link.2 +++ b/bsd/man/man2/link.2 @@ -126,12 +126,23 @@ Values for are constructed by a bitwise-inclusive OR of flags from the following list, defined in .In fcntl.h : -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_FOLLOW If .Fa name1 names a symbolic link, a new link for the target of the symbolic link is created. +.It Dv AT_SYMLINK_NOFOLLOW_ANY +If +.Fa name1 +names a symbolic link, a new link for the symbolic link is +created. +If a symbolic link is encountered during pathname resolution, an error is returned. +.It Dv AT_RESOLVE_BENEATH +If +.Fa name1 +does not reside in the hierarchy beneath the starting directory, +an error is returned. .El .Pp If @@ -156,12 +167,11 @@ the behavior is identical to a call to Unless .Fa flag contains the -.Dv AT_SYMLINK_FOLLOW -flag, if -.Fa name1 -names a symbolic link, a new link is created for the symbolic link -.Fa name1 -and not its target. On OS X, not assigning AT_SYMLINK_FOLLOW to +.Dv AT_SYMLINK_FOLLOW, +.Dv AT_SYMLINK_NOFOLLOW_ANY +or the +.Dv AT_RESOLVE_BENEATH +flags. On OS X, not assigning AT_SYMLINK_FOLLOW, AT_SYMLINK_NOFOLLOW_ANY or AT_RESOLVE_BENEATH to .Fa flag may result in some file systems returning an error. .Sh RETURN VALUES @@ -301,6 +311,22 @@ or respectively, is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Er ELOOP +If AT_SYMLINK_NOFOLLOW_ANY was passed and a symbolic link was encountered in +translating either the +.Fa name1 +or +.Fa name2 +arguments. +.It Bq Er ENOTCAPABLE +AT_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory encountered in +translating either the +.Fa name1 +or +.Fa name2 +arguments. .El .Sh SEE ALSO .Xr symlink 2 , diff --git a/bsd/man/man2/listxattr.2 b/bsd/man/man2/listxattr.2 index 3e13c193c..652a3d9e2 100644 --- a/bsd/man/man2/listxattr.2 +++ b/bsd/man/man2/listxattr.2 @@ -57,7 +57,7 @@ the size of the list of names. .Fa options controls how the attribute list is generated: .Pp -.Bl -tag -width XATTR_NOFOLLOW +.Bl -tag -width XATTR_RESOLVE_BENEATH .It Dv XATTR_NOFOLLOW do not follow symbolic links. .Fn listxattr @@ -69,6 +69,11 @@ will list attributes of the link itself. 
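For linkat(2), the new AT_SYMLINK_NOFOLLOW_ANY and AT_RESOLVE_BENEATH flags documented above change both which object gets linked and how the paths may resolve. A minimal sketch, assuming the flags combine by bitwise OR as the page describes:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Link to name1 itself (not to a symlink target), refusing any symlink along
 * the way and refusing paths that escape dirfd. */
int
link_beneath(int dirfd, const char *name1, const char *name2)
{
    if (linkat(dirfd, name1, dirfd, name2,
        AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH) == 0) {
        return 0;
    }
    if (errno == ELOOP) {
        fprintf(stderr, "symlink encountered resolving %s or %s\n", name1, name2);
    } else if (errno == ENOTCAPABLE) {
        fprintf(stderr, "%s or %s escapes the starting directory\n", name1, name2);
    } else {
        perror("linkat");
    }
    return -1;
}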
.It Dv XATTR_NOFOLLOW_ANY do not follow any symbolic links encountered during pathname resolution. An error is returned if a symlink is encountered before the last component of path. +.It Dv XATTR_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. .It Dv XATTR_SHOWCOMPRESSION .Fn listxattr and @@ -156,8 +161,12 @@ An I/O error occurred. .Fa options has an unsupported bit set. .It Bq Er EINVAL -XATTR_NOFOLLOW or XATTR_NOFOLLOW_ANY option was passed in +XATTR_NOFOLLOW, XATTR_NOFOLLOW_ANY or XATTR_RESOLVE_BENEATH option was passed in .Fn flistxattr . +.It Bq Er ENOTCAPABLE +If XATTR_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .El .Sh SEE ALSO .Xr getxattr 2 , diff --git a/bsd/man/man2/mount.2 b/bsd/man/man2/mount.2 index aabf352ee..588770f64 100644 --- a/bsd/man/man2/mount.2 +++ b/bsd/man/man2/mount.2 @@ -41,6 +41,7 @@ .Nm mount , .Nm fmount, .Nm unmount +.Nm funmount .Nd mount or dismount a filesystem .Sh SYNOPSIS .Fd #include @@ -51,6 +52,8 @@ .Fn fmount "const char *type" "int fd" "int flags" "void *data" .Ft int .Fn unmount "const char *dir" "int flags" +.Ft int +.Fn funmount "int fd" "int flags" .Sh DESCRIPTION The .Fn mount @@ -129,7 +132,7 @@ The function call is equivalent to the .Fn mount function call, except in the use of the second argument. -It takes an open file descriptor representing mount point +It takes an open file descriptor representing the mount point instead of the string literal containing full path to the mount point in the filesystem hierarchy. .Pp @@ -148,6 +151,15 @@ still active. Active special devices continue to work, but any further accesses to any other active files result in errors even if the filesystem is later remounted. +.Pp +The +.Fn funmount +function call is equivalent to the +.Fn unmount +function call, except in the use of the first argument. +It takes an open file descriptor representing the mount point +instead of the string literal containing full path to the mount +point in the filesystem hierarchy. .Sh RETURN VALUES The .Fn mount @@ -158,7 +170,9 @@ and the variable .Va errno is set to indicate the error. .Pp -.Nm unmount +.Fn unmount +and +.Fn funmount returns the value 0 if the unmount succeeded; otherwise -1 is returned and the variable .Va errno @@ -201,7 +215,9 @@ Another process currently holds a reference to points outside the process's allocated address space. .El .Pp -.Nm unmount +.Fn unmount +and +.Fn funmount may fail with one of the following errors: .Bl -tag -width [ENAMETOOLONG] .It Bq Er EPERM @@ -251,3 +267,5 @@ function calls appeared in .At v6 . .Fn fmount function call first appeared in macOS version 10.13. +.Fn funmount +function call first appeared in macOS version 16. diff --git a/bsd/man/man2/open.2 b/bsd/man/man2/open.2 index 7c6c24591..df903f728 100644 --- a/bsd/man/man2/open.2 +++ b/bsd/man/man2/open.2 @@ -157,7 +157,8 @@ O_SYMLINK allow open of symlinks O_EVTONLY descriptor requested for event notifications only O_CLOEXEC mark as close-on-exec O_NOFOLLOW_ANY do not follow symlinks in the entire path -O_RESOLVE_BENEATH path must reside in the hierarchy beneath the starting directory +O_RESOLVE_BENEATH path resolution must not escape the directory associated with the file descriptor +O_UNIQUE ensure a file is opened only if it has a single hard link .Ed .Pp Opening a file with @@ -257,12 +258,17 @@ will fail. 
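funmount(2), documented in the mount.2 change above, mirrors unmount(2) but identifies the mount point by an open file descriptor rather than a path. A minimal sketch, assuming a wrapper matching the documented prototype (int funmount(int fd, int flags)) is available:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mount.h>
#include <unistd.h>

/* Unmount a filesystem identified by an open descriptor on its mount point. */
int
unmount_by_fd(const char *mountpoint)
{
    int fd = open(mountpoint, O_RDONLY);
    if (fd < 0) {
        perror("open");
        return -1;
    }

    int rc = funmount(fd, 0);   /* flags as for unmount(2), e.g. MNT_FORCE */
    if (rc != 0) {
        perror("funmount");
    }
    close(fd);
    return rc;
}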
.Pp If .Dv O_RESOLVE_BENEATH -is used in the mask and the specified relative path does not reside in the directory hierarchy beneath the starting directory passed by +is used in the mask and the specified path resolution escapes the directory associated with the .Fa fd then the .Fn openat will fail. .Pp +If +.Dv O_UNIQUE +is used in the mask, the open operation will fail if the file has more than one hard link pointing to it. +The operation succeeds only if the file is guaranteed to be uniquely accessible through the provided pathname. +.Pp If successful, .Fn open returns a non-negative integer, termed a file descriptor. @@ -487,11 +493,14 @@ nor a file descriptor associated with a directory. The filename does not match the encoding rules. .It Bq Er EWOULDBLOCK O_SHLOCK or O_EXLOCK is specified, but the file is locked and the O_NONBLOCK option was specified. -.It Bq Er EACCES +.It Bq Er ENOTCAPABLE .Fa path -does not reside in the directory hierarchy beneath the starting directory passed by +resolution escapes the directory associated with .Fa fd and O_RESOLVE_BENEATH option was specified. +.It Bq Er ENOTCAPABLE +.Fa path +has multiple hard links and O_UNIQUE option was specified. .El .Sh COMPATIBILITY .Fn open diff --git a/bsd/man/man2/posix_spawn.2 b/bsd/man/man2/posix_spawn.2 index a2d521bcd..bdac44e56 100644 --- a/bsd/man/man2/posix_spawn.2 +++ b/bsd/man/man2/posix_spawn.2 @@ -212,9 +212,8 @@ or .Fn posix_spawnp call, it is entered as follows: .Bd -literal -offset indent -main(argc, argv, envp) -int argc; -char **argv, **envp; +.Ft int +.Fn main "int argc" "char **argv" "char **envp"; .Ed .Pp where diff --git a/bsd/man/man2/removexattr.2 b/bsd/man/man2/removexattr.2 index 3578a4d3f..405c74854 100644 --- a/bsd/man/man2/removexattr.2 +++ b/bsd/man/man2/removexattr.2 @@ -50,7 +50,7 @@ is a simple NULL-terminated UTF-8 string. .Fa Options is a bit mask specifying various options: .Pp -.Bl -tag -width XATTR_NOFOLLOW +.Bl -tag -width XATTR_RESOLVE_BENEATH .It Dv XATTR_NOFOLLOW do not follow symbolic links. Normally, .Fn removexattr @@ -62,6 +62,11 @@ will act on the link itself. .It Dv XATTR_NOFOLLOW_ANY do not follow any symbolic links encountered during pathname resolution. An error is returned if a symlink is encountered before the last component of path. +.It Dv XATTR_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. .It Dv XATTR_SHOWCOMPRESSION .Fn removexattr and @@ -106,7 +111,7 @@ must be valid UTF-8 .Fa options must make sense. .It Bq Er EINVAL -XATTR_NOFOLLOW or XATTR_NOFOLLOW_ANY option was passed in +XATTR_NOFOLLOW, XATTR_NOFOLLOW_ANY or XATTR_RESOLVE_BENEATH option was passed in .Fn fremovexattr . .It Bq Er ENOTDIR A component of the @@ -135,6 +140,10 @@ Too many symbolic links were encountered in .It Bq Er ELOOP If XATTR_NOFOLLOW_ANY option was passed and a symbolic link was encountered in translating the pathname. +.It Bq Er ENOTCAPABLE +If XATTR_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .It Bq Er EFAULT .Fa path or diff --git a/bsd/man/man2/rename.2 b/bsd/man/man2/rename.2 index 5f512e188..e879a1cd7 100644 --- a/bsd/man/man2/rename.2 +++ b/bsd/man/man2/rename.2 @@ -127,7 +127,7 @@ argument. 
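open.2 above adds O_RESOLVE_BENEATH and O_UNIQUE, both of which report ENOTCAPABLE on failure. A minimal sketch combining them in an openat(2) call, assuming the flag and errno names as documented:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Open relpath read-only, failing if resolution escapes dirfd or if the file
 * has more than one hard link. */
int
open_unique_beneath(int dirfd, const char *relpath)
{
    int fd = openat(dirfd, relpath, O_RDONLY | O_RESOLVE_BENEATH | O_UNIQUE);
    if (fd >= 0) {
        return fd;
    }
    if (errno == ENOTCAPABLE) {
        fprintf(stderr, "%s escapes dirfd or is not uniquely linked\n", relpath);
    } else {
        perror("openat");
    }
    return -1;
}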
Values for .Fa flags are constructed with below bits set: -.Bl -tag -offset indent +.Bl -tag -width VOL_CAP_INT_RENAME_EXCL .It Dv RENAME_SWAP On file systems that support it (see .Xr getattrlist 2 @@ -146,6 +146,13 @@ to be returned if the destination already exists. EINVAL is returned in case of .Dv RENAME_SWAP . .It Dv RENAME_NOFOLLOW_ANY If any symbolic links are encountered during pathname resolution, an error is returned. +.It Dv RENAME_RESOLVE_BENEATH +Either +.Fa from +or +.Fa to +does not reside in the hierarchy beneath the starting directory, +an error is returned. .El .Sh CAVEATS The system can deadlock if a loop is present in the file system graph. @@ -275,6 +282,14 @@ This is taken to be indicative of a looping symbolic link. If RENAME_NOFOLLOW_ANY was passed and a symbolic link was encountered in translating either pathname. .\" =========== +.It Bq Er ENOTCAPABLE +RENAME_RESOLVE_BENEATH was passed and either +.Fa from +or +.Fa to +does not reside in the +directory hierarchy beneath the starting directory. +.\" =========== .It Bq Er ENAMETOOLONG A component of a pathname exceeds .Dv {NAME_MAX} diff --git a/bsd/man/man2/sem_open.2 b/bsd/man/man2/sem_open.2 index 9f06c7775..8ce59a964 100644 --- a/bsd/man/man2/sem_open.2 +++ b/bsd/man/man2/sem_open.2 @@ -88,6 +88,22 @@ which correspond to the effective user and group IDs of the calling process. There is no visible entry in the file system for the created object in this implementation. .Pp +There are additional restrictions for non-macOS platforms only: if the +calling process is signed with a Team ID entitlement, then +.Fa name +is +.Ql namespaced , +meaning calls to +.Fn sem_open +or +.Fn sem_unlink +will not observe any semaphores created by processes signed with different Team +IDs. If a process lacks a Team ID entitlement, then +.Fn sem_open +and +.Fn sem_unlink +operate on a global namespace, which is always searched as a fallback. +.Pp The returned semaphore descriptor is available to the calling process until it is closed with .Fn sem_close , @@ -143,6 +159,9 @@ descriptors in use. .Fa name exceeded .Dv PSEMNAMLEN +characters, or Team ID of calling binary exceeded +.Dv +PSEMTEAMIDLEN characters. .It Bq Er ENFILE Too many semaphores or file descriptors are open on the system. diff --git a/bsd/man/man2/setattrlist.2 b/bsd/man/man2/setattrlist.2 index c2b9092f3..eaac65e10 100644 --- a/bsd/man/man2/setattrlist.2 +++ b/bsd/man/man2/setattrlist.2 @@ -237,7 +237,7 @@ parameter is a bit set that controls the behaviour of .Fn setattrlist . The following option bits are defined. . -.Bl -tag -width XXXbitmapcount +.Bl -tag -width FSOPT_RESOLVE_BENEATH . .It FSOPT_NOFOLLOW If this bit is set, @@ -256,6 +256,14 @@ In addition, if a symbolic link is encountered before the final component, an error is returned . +.It FSOPT_RESOLVE_BENEATH +If this bit is set, +.Fn setattrlistat +and +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. +. .El . .Sh RETURN VALUES @@ -408,6 +416,10 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Er ENOTCAPABLE +if FSOPT_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .El .Pp . diff --git a/bsd/man/man2/setxattr.2 b/bsd/man/man2/setxattr.2 index 3818edaf4..7d45fd74e 100644 --- a/bsd/man/man2/setxattr.2 +++ b/bsd/man/man2/setxattr.2 @@ -65,7 +65,7 @@ set to zero. 
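RENAME_RESOLVE_BENEATH, documented above for rename(2), is passed through the flag-taking variants such as renameatx_np(2). A minimal sketch, assuming the new flag combines with the existing RENAME_EXCL as the page's bitwise-OR wording suggests:

#include <errno.h>
#include <stdio.h>

/* Rename from -> to under dirfd, failing if to exists (RENAME_EXCL) or if
 * either path escapes the starting directory (RENAME_RESOLVE_BENEATH). */
int
rename_excl_beneath(int dirfd, const char *from, const char *to)
{
    if (renameatx_np(dirfd, from, dirfd, to,
        RENAME_EXCL | RENAME_RESOLVE_BENEATH) == 0) {
        return 0;
    }
    if (errno == EEXIST) {
        fprintf(stderr, "%s already exists\n", to);
    } else if (errno == ENOTCAPABLE) {
        fprintf(stderr, "%s or %s escapes the starting directory\n", from, to);
    } else {
        perror("renameatx_np");
    }
    return -1;
}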
.Fa options controls how the attribute is set: .Pp -.Bl -tag -width XATTR_NOFOLLOW +.Bl -tag -width XATTR_RESOLVE_BENEATH .It Dv XATTR_NOFOLLOW do not follow symbolic links. .Fn setxattr @@ -78,6 +78,11 @@ will act on the link itself. .It Dv XATTR_NOFOLLOW_ANY do not follow any symbolic links encountered during pathname resolution. An error is returned if a symlink is encountered before the last component of path. +.It Dv XATTR_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. .It Dv XATTR_CREATE fail if the named attribute already exists. .It Dv XATTR_REPLACE @@ -152,7 +157,7 @@ must be valid UTF-8 and .Fa options must make sense. .It Bq Er EINVAL -XATTR_NOFOLLOW or XATTR_NOFOLLOW_ANY option was passed in +XATTR_NOFOLLOW, XATTR_NOFOLLOW_ANY or XATTR_RESOLVE_BENEATH option was passed in .Fn fsetxattr . .It Bq Er ENOTDIR A component of @@ -181,6 +186,10 @@ Too many symbolic links were encountered resolving .It Bq Er ELOOP If XATTR_NOFOLLOW_ANY option was passed and a symbolic link was encountered in translating the pathname. +.It Bq Er ENOTCAPABLE +If XATTR_RESOLVE_BENEATH was passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .It Bq Er EFAULT .Fa path or diff --git a/bsd/man/man2/stat.2 b/bsd/man/man2/stat.2 index 2df91d8a7..2545effe9 100644 --- a/bsd/man/man2/stat.2 +++ b/bsd/man/man2/stat.2 @@ -138,13 +138,13 @@ The values for the are constructed by a bitwise-inclusive OR of flags from the following list, defined in .In fcntl.h : -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW If .Fa path names a symbolic link, the status of the symbolic link is returned. .El -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW_ANY If .Fa path @@ -152,6 +152,13 @@ names a symbolic link, the status of the symbolic link is returned. If a symbolic link is encountered during pathname resolution, an error is returned. .El +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY +.It Dv AT_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. +.El .Pp If .Fn fstatat @@ -570,6 +577,10 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Er ENOTCAPABLE +if AT_RESOLVE_BENEATH is passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .El .Sh CAVEATS The file generation number, diff --git a/bsd/man/man2/unlink.2 b/bsd/man/man2/unlink.2 index ec130537d..3120af94f 100644 --- a/bsd/man/man2/unlink.2 +++ b/bsd/man/man2/unlink.2 @@ -83,7 +83,7 @@ The values for are constructed by a bitwise-inclusive OR of flags from the following list, defined in .In fcntl.h : -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_REMOVEDIR Remove the directory entry specified by .Fa fd @@ -91,13 +91,26 @@ and .Fa path as a directory, not a normal file. .El -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW_ANY If .Fa path contains a symbolic link in any intermediate directory of the path, it is not followed and an error is returned instead. .El +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY +.It Dv AT_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. 
+.El +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY +.It Dv AT_NODELETEBUSY +If +.Fa path +refers to a file with any open file descriptors, an error is returned. +.El .Pp If .Fn unlinkat @@ -141,6 +154,11 @@ The file named by the argument cannot be unlinked because it is being used by the system or by another process. .\" =========== +.It Bq Er EBUSY +The flag parameter has the AT_NODELETEBUSY bit set and the file referred to by +.Fa path +has one or more open file descriptors. +.\" =========== .It Bq Er EFAULT .Fa Path points outside the process's allocated address space. @@ -231,6 +249,12 @@ parameter has the bit set and one of the intermediate directories in the .Fa path argument is a symbolic link. +.It Bq Er ENOTCAPABLE +The flag parameter has the +.Dv AT_RESOLVE_BENEATH +bit set and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. .El .Sh SEE ALSO .Xr close 2 , diff --git a/bsd/man/man2/utimensat.2 b/bsd/man/man2/utimensat.2 index df63d03a1..b7feaf480 100644 --- a/bsd/man/man2/utimensat.2 +++ b/bsd/man/man2/utimensat.2 @@ -127,7 +127,7 @@ system call are constructed by a bitwise-inclusive OR of flags from the following list, defined in .In fcntl.h : -.Bl -tag -width indent +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW If .Fa path @@ -135,7 +135,8 @@ names a symbolic link, the symbolic link's times are changed. By default, .Fn utimensat changes the times of the file referenced by the symbolic link. -.Bl -tag -width indent +.El +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY .It Dv AT_SYMLINK_NOFOLLOW_ANY If .Fa path @@ -146,6 +147,13 @@ By default, .Fn utimensat changes the times of the file referenced by the symbolic link. .El +.Bl -tag -width AT_SYMLINK_NOFOLLOW_ANY +.It Dv AT_RESOLVE_BENEATH +If +.Fa path +does not reside in the hierarchy beneath the starting directory, +an error is returned. +.El .Sh RETURN VALUES .Rv -std .Sh ERRORS @@ -249,6 +257,10 @@ argument is not an absolute path and is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. +.It Bq Er ENOTCAPABLE +if AT_RESOLVE_BENEATH is passed and +.Fa path +does not reside in the directory hierarchy beneath the starting directory. 
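unlink.2 above adds AT_NODELETEBUSY (fail with EBUSY if the file is open anywhere) and AT_RESOLVE_BENEATH. A minimal unlinkat(2) sketch, assuming the flag names as documented; fstatat(2) and utimensat(2) accept AT_RESOLVE_BENEATH the same way per the pages above:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Remove relpath only if no one has it open and it resolves beneath dirfd. */
int
unlink_idle_beneath(int dirfd, const char *relpath)
{
    if (unlinkat(dirfd, relpath, AT_NODELETEBUSY | AT_RESOLVE_BENEATH) == 0) {
        return 0;
    }
    if (errno == EBUSY) {
        fprintf(stderr, "%s is still open somewhere\n", relpath);
    } else if (errno == ENOTCAPABLE) {
        fprintf(stderr, "%s escapes the starting directory\n", relpath);
    } else {
        perror("unlinkat");
    }
    return -1;
}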
.El .Sh SEE ALSO .Xr chflags 2 , diff --git a/bsd/man/man3/Makefile b/bsd/man/man3/Makefile index 21c65aeaa..3b71b5c66 100644 --- a/bsd/man/man3/Makefile +++ b/bsd/man/man3/Makefile @@ -49,8 +49,8 @@ INSTALL_MAN_LINKS = \ posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addopen.3 \ posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_adddup2.3 \ posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addinherit_np.3 \ - posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addchdir_np.3 \ - posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addfchdir_np.3 \ + posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addchdir.3 \ + posix_spawn_file_actions_addclose.3 posix_spawn_file_actions_addfchdir.3 \ posix_spawn_file_actions_init.3 posix_spawn_file_actions_destroy.3 \ posix_spawnattr_init.3 posix_spawnattr_destroy.3 \ posix_spawnattr_setarchpref_np.3 posix_spawnattr_getarchpref_np.3 \ diff --git a/bsd/man/man3/posix_spawn_file_actions_addclose.3 b/bsd/man/man3/posix_spawn_file_actions_addclose.3 index e2915720a..49532c836 100644 --- a/bsd/man/man3/posix_spawn_file_actions_addclose.3 +++ b/bsd/man/man3/posix_spawn_file_actions_addclose.3 @@ -62,12 +62,12 @@ .Fa "int filedes" .Fc .Ft int -.Fo posix_spawn_file_actions_addchdir_np +.Fo posix_spawn_file_actions_addchdir .Fa "posix_spawn_file_actions_t *file_actions" .Fa "const char *restrict path" .Fc .Ft int -.Fo posix_spawn_file_actions_addfchdir_np +.Fo posix_spawn_file_actions_addfchdir .Fa "posix_spawn_file_actions_t *file_actions" .Fa "int filedes" .Fc diff --git a/bsd/man/man9/Makefile b/bsd/man/man9/Makefile index d78240361..24b9c8825 100644 --- a/bsd/man/man9/Makefile +++ b/bsd/man/man9/Makefile @@ -12,6 +12,7 @@ DATAFILES = \ backtrace_pack.9 \ backtrace_unpack.9 \ backtrace_user.9 \ + byteorder.9 \ copy.9 \ copyin.9 \ copyinstr.9 \ diff --git a/bsd/man/man9/byteorder.9 b/bsd/man/man9/byteorder.9 new file mode 100644 index 000000000..b40d231fe --- /dev/null +++ b/bsd/man/man9/byteorder.9 @@ -0,0 +1,169 @@ +.\" Copyright (c) 2002 Mike Barcroft +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd April 29, 2002 +.Dt BYTEORDER 9 +.Os +.Sh NAME +.Nm bswap16 , bswap32 , bswap64 , +.Nm be16toh , be32toh , be64toh , htobe16 , htobe32 , htobe64 , +.Nm htole16 , htole32 , htole64 , le16toh , le32toh , le64toh , +.Nm be16enc , be16dec , be32enc , be32dec , be64enc , be64dec , +.Nm le16enc , le16dec , le32enc , le32dec , le64enc , le64dec +.Nd byte order operations +.Sh SYNOPSIS +.In sys/endian.h +.Ft uint16_t +.Fn bswap16 "uint16_t int16" +.Ft uint32_t +.Fn bswap32 "uint32_t int32" +.Ft uint64_t +.Fn bswap64 "uint64_t int64" +.Ft uint16_t +.Fn be16toh "uint16_t big16" +.Ft uint32_t +.Fn be32toh "uint32_t big32" +.Ft uint64_t +.Fn be64toh "uint64_t big64" +.Ft uint16_t +.Fn htobe16 "uint16_t host16" +.Ft uint32_t +.Fn htobe32 "uint32_t host32" +.Ft uint64_t +.Fn htobe64 "uint64_t host64" +.Ft uint16_t +.Fn htole16 "uint16_t host16" +.Ft uint32_t +.Fn htole32 "uint32_t host32" +.Ft uint64_t +.Fn htole64 "uint64_t host64" +.Ft uint16_t +.Fn le16toh "uint16_t little16" +.Ft uint32_t +.Fn le32toh "uint32_t little32" +.Ft uint64_t +.Fn le64toh "uint64_t little64" +.Ft uint16_t +.Fn be16dec "const void *" +.Ft uint32_t +.Fn be32dec "const void *" +.Ft uint64_t +.Fn be64dec "const void *" +.Ft uint16_t +.Fn le16dec "const void *" +.Ft uint32_t +.Fn le32dec "const void *" +.Ft uint64_t +.Fn le64dec "const void *" +.Ft void +.Fn be16enc "void *" uint16_t +.Ft void +.Fn be32enc "void *" uint32_t +.Ft void +.Fn be64enc "void *" uint64_t +.Ft void +.Fn le16enc "void *" uint16_t +.Ft void +.Fn le32enc "void *" uint32_t +.Ft void +.Fn le64enc "void *" uint64_t +.Sh DESCRIPTION +The +.Fn bswap16 , +.Fn bswap32 , +and +.Fn bswap64 +functions return a byte order swapped integer. +On big endian systems, the number is converted to little endian byte order. +On little endian systems, the number is converted to big endian byte order. +.Pp +The +.Fn be16toh , +.Fn be32toh , +and +.Fn be64toh +functions return a big endian byte ordered integer +converted to the system's native byte order. +The return value will be the same as the argument on big endian systems. +.Pp +The +.Fn le16toh , +.Fn le32toh , +and +.Fn le64toh +functions return a little endian byte ordered integer +converted to the system's native byte order. +The return value will be the same as the argument on little endian systems. +.Pp +The +.Fn htobe16 , +.Fn htobe32 , +and +.Fn htobe64 +functions return an integer in the system's native +byte order converted to big endian byte order. +The return value will be the same as the argument on big endian systems. +.Pp +The +.Fn htole16 , +.Fn htole32 , +and +.Fn htole64 +functions return a integer in the system's native +byte order converted to little endian byte order. +The return value will be the same as the argument on little endian systems. +.Pp +The +.Fn be16enc , +.Fn be16dec , +.Fn be32enc , +.Fn be32dec , +.Fn be64enc , +.Fn be64dec , +.Fn le16enc , +.Fn le16dec , +.Fn le32enc , +.Fn le32dec , +.Fn le64enc , +and +.Fn le64dec +functions encode and decode integers to/from byte strings on any alignment +in big/little endian format. +.Sh SEE ALSO +.Xr byteorder 3 +.Sh HISTORY +The +.Fn hto* +and +.Fn *toh +functions first appeared in +.Fx 5.0 , +and were originally developed by the +.Nx +project. +.Pp +The encode/decode functions first appeared in +.Fx 5.1 . 
\ No newline at end of file diff --git a/bsd/miscfs/bindfs/bind_vfsops.c b/bsd/miscfs/bindfs/bind_vfsops.c index 412f9ecfc..4d50604b2 100644 --- a/bsd/miscfs/bindfs/bind_vfsops.c +++ b/bsd/miscfs/bindfs/bind_vfsops.c @@ -80,9 +80,6 @@ #define BINDFS_ENTITLEMENT "com.apple.private.bindfs-allow" -#define SIZEOF_MEMBER(type, member) (sizeof(((type *)0)->member)) -#define MAX_MNT_FROM_LENGTH (SIZEOF_MEMBER(struct vfsstatfs, f_mntfromname)) - static int bindfs_vfs_getlowerattr(mount_t mp, struct vfs_attr * vfap, vfs_context_t ctx) { @@ -155,9 +152,9 @@ bindfs_mount(struct mount * mp, __unused vnode_t devvp, user_addr_t user_data, v /* This could happen if the system is configured for 32 bit inodes instead of * 64 bit */ - if (count > MAX_MNT_FROM_LENGTH) { + if (count > sizeof(vfs_statfs(mp)->f_mntfromname)) { error = EINVAL; - BINDFSERROR("path to mount too large for this system %zu vs %lu\n", count, MAX_MNT_FROM_LENGTH); + BINDFSERROR("path to mount too large for this system %zu vs %lu\n", count, sizeof(vfs_statfs(mp)->f_mntfromname)); goto error; } @@ -225,7 +222,7 @@ bindfs_mount(struct mount * mp, __unused vnode_t devvp, user_addr_t user_data, v /* fill in the stat block */ sp = vfs_statfs(mp); - strlcpy(sp->f_mntfromname, data, MAX_MNT_FROM_LENGTH); + strlcpy(sp->f_mntfromname, data, sizeof(sp->f_mntfromname)); sp->f_flags = flags; diff --git a/bsd/miscfs/deadfs/dead_vnops.c b/bsd/miscfs/deadfs/dead_vnops.c index 97a856219..622df4041 100644 --- a/bsd/miscfs/deadfs/dead_vnops.c +++ b/bsd/miscfs/deadfs/dead_vnops.c @@ -350,6 +350,10 @@ chkvnlock(vnode_t vp) } while (vp->v_lflag & VL_DEAD) { msleep(&vp->v_lflag, &vp->v_lock, PVFS, "chkvnlock", NULL); + if (!(vp->v_lflag & VL_OPSCHANGE)) { + vnode_unlock(vp); + return 0; + } } vnode_unlock(vp); return 1; diff --git a/bsd/miscfs/devfs/devfs_fdesc_support.c b/bsd/miscfs/devfs/devfs_fdesc_support.c index a17c69922..d591bf14b 100644 --- a/bsd/miscfs/devfs/devfs_fdesc_support.c +++ b/bsd/miscfs/devfs/devfs_fdesc_support.c @@ -411,15 +411,17 @@ fdesc_attr(int fd, struct vnode_attr *vap, vfs_context_t a_context) return error; } switch (FILEGLOB_DTYPE(fp->fp_glob)) { - case DTYPE_VNODE: - if ((error = vnode_getwithref((struct vnode *)fp_get_data(fp))) != 0) { + case DTYPE_VNODE: { + vnode_t vp = (vnode_t)fp_get_data(fp); + + if ((error = vnode_getwithref(vp)) != 0) { break; } - if ((error = vnode_authorize((struct vnode *)fp_get_data(fp), + if ((error = vnode_authorize(vp, NULL, KAUTH_VNODE_READ_ATTRIBUTES | KAUTH_VNODE_READ_SECURITY, a_context)) == 0) { - error = vnode_getattr((struct vnode *)fp_get_data(fp), vap, a_context); + error = vnode_getattr(vp, vap, a_context); } if (error == 0 && vap->va_type == VDIR) { /* @@ -430,9 +432,9 @@ fdesc_attr(int fd, struct vnode_attr *vap, vfs_context_t a_context) */ vap->va_mode &= ~((VEXEC) | (VEXEC >> 3) | (VEXEC >> 6)); } - (void)vnode_put((struct vnode *)fp_get_data(fp)); + (void)vnode_put(vp); break; - + } case DTYPE_SOCKET: case DTYPE_PIPE: #if SOCKETS @@ -471,6 +473,22 @@ fdesc_attr(int fd, struct vnode_attr *vap, vfs_context_t a_context) error = EBADF; } + /* Update 'va_mode' to take into account the bits on the fd. 
*/ + if (error == 0 && VATTR_IS_SUPPORTED(vap, va_mode)) { + int flags; + + flags = fp->fp_glob->fg_flag; + if (!(flags & FREAD)) { + vap->va_mode &= ~(S_IRUSR | S_IRGRP | S_IROTH); + } + if (!(flags & FWRITE)) { + vap->va_mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); + } + if (!(flags & O_EXEC)) { + vap->va_mode &= ~(S_IXUSR | S_IXGRP | S_IXOTH); + } + } + fp_drop(p, fd, fp, 0); return error; } diff --git a/bsd/miscfs/devfs/devfs_vnops.c b/bsd/miscfs/devfs/devfs_vnops.c index b77407b45..a5591e2ea 100644 --- a/bsd/miscfs/devfs/devfs_vnops.c +++ b/bsd/miscfs/devfs/devfs_vnops.c @@ -116,6 +116,7 @@ void devfs_rele_node(devnode_t *); static void devfs_consider_time_update(devnode_t *dnp, uint32_t just_changed_flags); static boolean_t devfs_update_needed(long now_s, long last_s); static boolean_t devfs_is_name_protected(struct vnode *dvp, const char *name); +static boolean_t devfs_is_vnode_protected(struct vnode *vp); void dn_times_locked(devnode_t * dnp, struct timeval *t1, struct timeval *t2, struct timeval *t3, uint32_t just_changed_flags); void dn_times_now(devnode_t *dnp, uint32_t just_changed_flags); void dn_mark_for_delayed_times_update(devnode_t *dnp, uint32_t just_changed_flags); @@ -211,6 +212,33 @@ devfs_is_name_protected(struct vnode *dvp, const char *name) return FALSE; } +/* + * These devfs devices cannot have their permissions updated. + */ +static boolean_t +devfs_is_vnode_protected(struct vnode *vp) +{ + struct vnode *dvp = NULLVP; + const char *vname = NULL; + boolean_t ret = FALSE; + vnode_getparent_and_name(vp, &dvp, &vname); + if (!dvp || !vname) { + ret = FALSE; + goto out; + } + + ret = devfs_is_name_protected(dvp, vname); + +out: + if (vname) { + vnode_putname(vname); + } + if (dvp != NULLVP) { + vnode_put(dvp); + } + + return ret; +} /* * Convert a component of a pathname into a pointer to a locked node. @@ -579,6 +607,13 @@ devfs_setattr(struct vnop_setattr_args *ap) * Change the permissions. 
*/ if (VATTR_IS_ACTIVE(vap, va_mode)) { + /* + * Don't allow permission updates of critical devfs devices + */ + if (devfs_is_vnode_protected(vp)) { + error = EPERM; + goto exit; + } file_node->dn_mode &= ~07777; file_node->dn_mode |= vap->va_mode & 07777; } diff --git a/bsd/miscfs/nullfs/null_vfsops.c b/bsd/miscfs/nullfs/null_vfsops.c index d24c6e0d7..caea0f33f 100644 --- a/bsd/miscfs/nullfs/null_vfsops.c +++ b/bsd/miscfs/nullfs/null_vfsops.c @@ -80,9 +80,6 @@ #define NULLFS_ENTITLEMENT "com.apple.private.nullfs_allow" -#define SIZEOF_MEMBER(type, member) (sizeof(((type *)0)->member)) -#define MAX_MNT_FROM_LENGTH (SIZEOF_MEMBER(struct vfsstatfs, f_mntfromname)) - static int nullfs_vfs_getlowerattr(mount_t mp, struct vfs_attr * vfap, vfs_context_t ctx) { @@ -166,9 +163,9 @@ nullfs_mount(struct mount * mp, __unused vnode_t devvp, user_addr_t user_data, v /* This could happen if the system is configured for 32 bit inodes instead of * 64 bit */ - if (count > MAX_MNT_FROM_LENGTH) { + if (count > sizeof(vfs_statfs(mp)->f_mntfromname)) { error = EINVAL; - NULLFSDEBUG("nullfs: path to translocate too large for this system %ld vs %ld\n", count, MAX_MNT_FROM_LENGTH); + NULLFSDEBUG("nullfs: path to translocate too large for this system %ld vs %ld\n", count, sizeof(vfs_statfs(mp)->f_mntfromname)); goto error; } @@ -240,7 +237,7 @@ nullfs_mount(struct mount * mp, __unused vnode_t devvp, user_addr_t user_data, v /* fill in the stat block */ sp = vfs_statfs(mp); - strlcpy(sp->f_mntfromname, path, MAX_MNT_FROM_LENGTH); + strlcpy(sp->f_mntfromname, path, sizeof(sp->f_mntfromname)); sp->f_flags = flags; diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 62c670973..1f048affe 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -249,6 +249,7 @@ struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; int lowpri_throttle_enabled = 1; +static int spec_close_internal(struct vnode *vp, dev_t dev, int flags, vfs_context_t ctx); static void throttle_info_end_io_internal(struct _throttle_io_info_t *info, int throttle_level); static int throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap); @@ -486,6 +487,9 @@ spec_open(struct vnop_open_args *ap) } /* If it doesn't set back, we can't recover */ if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context)) { + /* Perform an explicit close on the block device, as the device is already open */ + spec_close_internal(vp, dev, ap->a_mode, ap->a_context); + error = ENXIO; } } @@ -2786,18 +2790,11 @@ spec_blockmap(__unused struct vnop_blockmap_args *ap) return ENOTSUP; } - -/* - * Device close routine - */ -int -spec_close(struct vnop_close_args *ap) +static int +spec_close_internal(struct vnode *vp, dev_t dev, int flags, vfs_context_t ctx) { - struct vnode *vp = ap->a_vp; - dev_t dev = vp->v_rdev; int error = 0; - int flags = ap->a_fflag; - struct proc *p = vfs_context_proc(ap->a_context); + struct proc *p = vfs_context_proc(ctx); struct session *sessp; struct pgrp *pg; @@ -2865,7 +2862,7 @@ spec_close(struct vnop_close_args *ap) * we must invalidate any in core blocks, so that * we can, for instance, change floppy disks. 
*/ - if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context))) { + if ((error = spec_fsync_internal(vp, MNT_WAIT, ctx))) { return error; } @@ -2895,6 +2892,15 @@ spec_close(struct vnop_close_args *ap) return error; } +/* + * Device close routine + */ +int +spec_close(struct vnop_close_args *ap) +{ + return spec_close_internal(ap->a_vp, ap->a_vp->v_rdev, ap->a_fflag, ap->a_context); +} + /* * Return POSIX pathconf information applicable to special devices. */ diff --git a/bsd/net/Makefile b/bsd/net/Makefile index f0f5dcf74..16000c342 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -7,7 +7,7 @@ include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = \ - altq classq pktsched + altq classq pktsched aop # Additional private file for macOS ifeq ($(PLATFORM),MacOSX) @@ -17,6 +17,7 @@ endif # Files that are public on macOS, but private on embedded EMBEDDED_PRIVATE_DATAFILES = \ + bpf.h \ if_media.h \ if_mib.h \ route.h @@ -28,7 +29,6 @@ EXTRA_PRIVATE_DATAFILES = $(EMBEDDED_PRIVATE_DATAFILES) endif DATAFILES= $(sort \ - bpf.h \ dlil.h \ ethernet.h \ if.h \ @@ -53,7 +53,7 @@ KERNELFILES= \ if_ether.h init.h radix.h PRIVATE_DATAFILES = $(sort \ - bpf.h \ + bpf_private.h \ content_filter.h \ etherdefs.h \ firewire.h \ @@ -115,7 +115,7 @@ DRIVERKIT_DATAFILES = \ PRIVATE_KERNELFILES = $(filter-out radix.h,${KERNELFILES}) \ bpfdesc.h ppp_comp.h \ zlib.h bpf_compat.h net_osdep.h \ - flowadv.h net_perf.h + flowadv.h net_perf.h pktap.h INSTALL_MI_LIST = ${DATAFILES} @@ -125,8 +125,9 @@ INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} INSTALL_MI_DIR = net -INSTALL_MI_LCL_LIST = $(sort if_ipsec.h if_mib_private.h if_private.h if_var_private.h \ - if_vlan_var.h necp.h network_agent.h route_private.h ${EXTRA_PRIVATE_DATAFILES}) +INSTALL_MI_LCL_LIST = $(sort bpf_private.h if_ipsec.h if_mib_private.h if_private.h \ + if_var_private.h if_vlan_var.h necp.h network_agent.h route_private.h \ + ${EXTRA_PRIVATE_DATAFILES}) INSTALL_MODULEMAP_MI_LCL_LIST = ${PRIVATE_MODULEMAPFILES} diff --git a/bsd/net/aop/Makefile b/bsd/net/aop/Makefile new file mode 100644 index 000000000..c6910e9b5 --- /dev/null +++ b/bsd/net/aop/Makefile @@ -0,0 +1,33 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +DATAFILES= \ + +KERNELFILES= \ + +PRIVATE_DATAFILES = \ + +PRIVATE_KERNELFILES = \ + kpi_aop.h aop_stats.h aop_flow_stats.h + +INSTALL_MI_LIST = ${DATAFILES} + +INSTALL_MI_DIR = net/aop/ + +INSTALL_MI_LCL_LIST = $(sort aop_stats.h) + +EXPORT_MI_LIST = ${INSTALL_MI_LIST} ${KERNELFILES} + +EXPORT_MI_DIR = ${INSTALL_MI_DIR} + +INSTALL_SF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} + +INSTALL_KF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${PRIVATE_KERNELFILES} + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/net/aop/aop_flow_stats.h b/bsd/net/aop/aop_flow_stats.h new file mode 100644 index 000000000..3f3872a95 --- /dev/null +++ b/bsd/net/aop/aop_flow_stats.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_AOP_FLOW_STATS_H_ +#define _NET_AOP_FLOW_STATS_H_ + +#include +#include + +typedef struct aop_buffer { + uint32_t bufsize; /* Transport buffer size */ + uint32_t bufused; /* Transport buffer used count */ +} aop_buffer_t; + +typedef struct aop_tcp_info { + struct tcp_info tcp_info; /* TCP information */ + uint8_t tcp_cc_algo; /* TCP congestion control algo from tcp_cc.h */ +} aop_tcp_info_t; + +struct aop_flow_stats { + uint32_t flow_id; /* Flow ID */ + uint32_t reserved; + uint64_t rxbytes; /* Total Rx bytes */ + uint64_t txbytes; /* Total Tx bytes */ + uint64_t rxpkts; /* Total Rx packets */ + uint64_t txpkts; /* Total Tx packets */ + aop_buffer_t tx_buffer_stats; /* Transport Tx buffer stats */ + aop_buffer_t rx_buffer_stats; /* Transport Rx buffer stats */ + activity_bitmap_t activity_bitmap; /* Activity bitmap */ + union { + aop_tcp_info_t tcp_stats; /* TCP stats */ + } transport; +}; + +#endif /* _NET_AOP_FLOW_STATS_H_ */ diff --git a/bsd/net/aop/aop_stats.h b/bsd/net/aop/aop_stats.h new file mode 100644 index 000000000..91ecdb327 --- /dev/null +++ b/bsd/net/aop/aop_stats.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_AOP_STATS_H_ +#define _NET_AOP_STATS_H_ + +/* ip stats definitions */ +#define AOP_IP_STATS_TABLE(X) \ + /* Input stats */ \ + X(AOP_IP_STATS_TOTAL, "TotalRcvd", "\t%llu total packet received\n") \ + X(AOP_IP_STATS_BADSUM, "BadCsum", "\t\t%llu bad header checksum\n") \ + X(AOP_IP_STATS_TOOSMALL, "DataTooSmall", "\t\t%llu with size smaller than minimum\n")\ + X(AOP_IP_STATS_TOOSHORT, "PktTooShort", "\t\t%llu with data size < data length\n") \ + X(AOP_IP_STATS_ADJ, "TotalAdj", "\t\t%llu with data size > data length\n") \ + X(AOP_IP_STATS_TOOLONG, "TooLong", "\t\t%llu with ip length > max ip packet size\n") \ + X(AOP_IP_STATS_BADHLEN, "BadHdrLen", "\t\t%llu with header length < data size\n") \ + X(AOP_IP_STATS_BADLEN, "BadLen", "\t\t%llu with data length < header length\n") \ + X(AOP_IP_STATS_BADOPTIONS, "BadOptions", "\t\t%llu with bad options\n") \ + X(AOP_IP_STATS_BADVERS, "BadVer", "\t\t%llu with incorrect version number\n") \ + X(AOP_IP_STATS_FRAGMENTS, "FragRcvd", "\t\t%llu fragment received\n") \ + X(AOP_IP_STATS_FRAGDROPPED, "FragDrop", "\t\t\t%llu dropped (dup or out of space)\n") \ + X(AOP_IP_STATS_FRAGTIMEOUT, "FragTimeO", "\t\t\t%llu dropped after timeout\n") \ + X(AOP_IP_STATS_REASSEMBLED, "Reassembled", "\t\t\t%llu reassembled ok\n") \ + X(AOP_IP_STATS_DELIVERED, "Delivered", "\t\t%llu packet for this host\n") \ + X(AOP_IP_STATS_NOPROTO, "UnkwnProto", "\t\t%llu packet for unknown/unsupported protocol\n") \ + /* Output stats */ \ + X(AOP_IP_STATS_LOCALOUT, "LocalOut", "\t%llu packet sent from this host\n") \ + X(AOP_IP_STATS_ODROPPED, "DropNoBuf", "\t\t%llu output packet dropped due to no bufs, etc.\n") \ + X(AOP_IP_STATS_NOROUTE, "NoRoute", "\t\t%llu output packet discarded due to no route\n") \ + X(AOP_IP_STATS_FRAGMENTED, "Fragmented", "\t\t%llu output datagram fragmented\n") \ + X(AOP_IP_STATS_OFRAGMENTS, "OutFraged", "\t\t%llu fragment created\n") \ + X(AOP_IP_STATS_CANTFRAG, "CantFrag", "\t\t%llu datagram that can't be fragmented\n") \ + X(__AOP_IP_STATS_MAX, "", "end of ip stats") + +/* ipv6 stats definitions */ +#define AOP_IP6_STATS_TABLE(X) \ + /* Input Stats */ \ + X(AOP_IP6_STATS_TOTAL, "TotalRcvd", "\t%llu total packet received\n") \ + X(AOP_IP6_STATS_TOOSMALL, "DataTooSmall", "\t\t%llu with size smaller than minimum\n") \ + X(AOP_IP6_STATS_TOOSHORT, "PktTooShort", "\t\t%llu with data size < data length\n") \ + X(AOP_IP6_STATS_ADJ, "TotalAdj", "\t\t%llu with data size > data length\n") \ + X(AOP_IP6_STATS_BADOPTIONS, "BadOptions", "\t\t%llu with bad options\n") \ + X(AOP_IP6_STATS_BADVERS, "BadVer", "\t\t%llu with incorrect version number\n") \ + X(AOP_IP6_STATS_FRAGMENTS, "FrafRcvd", "\t\t%llu fragment received\n") \ + X(AOP_IP6_STATS_FRAGDROPPED, "FragDrop", "\t\t\t%llu dropped (dup or out of space)\n") \ + X(AOP_IP6_STATS_FRAGTIMEOUT, "FragTimeO", "\t\t\t%llu dropped after timeout\n") \ + X(AOP_IP6_STATS_FRAGOVERFLOW, "FragOverFlow", "\t\t\t%llu exceeded limit\n") \ + X(AOP_IP6_STATS_REASSEMBLED, "FragReassembled","\t\t\t%llu reassembled ok\n") \ + X(AOP_IP6_STATS_DELIVERED, "Delivered", "\t\t%llu packet for this host\n") \ + X(AOP_IP6_STATS_TOOMANYHDR, "TooManyHdr", "\t\t%llu packet discarded due to too may headers\n") \ + /* Output stats */ \ + X(AOP_IP6_STATS_LOCALOUT, "LocalOut", "\t%llu packet sent from this host\n") \ + X(AOP_IP6_STATS_ODROPPED, "DropNoBuf", "\t\t%llu output packet dropped due to no bufs, etc.\n") \ + X(AOP_IP6_STATS_NOROUTE, "NoRoute", "\t\t%llu output packet discarded due 
to no route\n") \ + X(AOP_IP6_STATS_FRAGMENTED, "Fragmented", "\t\t%llu output datagram fragmented\n") \ + X(AOP_IP6_STATS_OFRAGMENTS, "OutFraged", "\t\t%llu fragment created\n") \ + X(AOP_IP6_STATS_CANTFRAG, "CantFrag", "\t\t%llu datagram that can't be fragmented\n")\ + X(__AOP_IP6_STATS_MAX, "", "end of ipv6 stats") + +/* tcp stats definitions */ +#define AOP_TCP_STATS_TABLE(X) \ + /* Output stats */ \ + X(AOP_TCP_STATS_SNDTOTAL, "SndTotalPkt", "\t%llu packet sent\n") \ + X(AOP_TCP_STATS_SNDPACK, "SndTotalDP", "\t\t%llu data packet") \ + X(AOP_TCP_STATS_SNDBYTE, "SndDataByte", " (%llu byte)\n") \ + X(AOP_TCP_STATS_SNDREXMITPACK, "SndDPktReXmt", "\t\t%llu data packet retransmitted") \ + X(AOP_TCP_STATS_SNDREXMITBYTE, "SndDByteReXmt"," (%llu byte)\n") \ + X(AOP_TCP_STATS_MTURESENT, "MTUReSnd", "\t\t%llu resend initiated by MTU discovery\n") \ + X(AOP_TCP_STATS_SNDACKS, "SndAck", "\t\t%llu ack-only packet") \ + X(AOP_TCP_STATS_DELACK, "DelayAck", " (%llu delayed)\n") \ + X(AOP_TCP_STATS_SNDURG, "SndURG", "\t\t%llu URG only packet\n") \ + X(AOP_TCP_STATS_SNDPROBE, "SndWinProb", "\t\t%llu window probe packet\n") \ + X(AOP_TCP_STATS_SNDWINUP, "SndWinUpd", "\t\t%llu window update packet\n") \ + X(AOP_TCP_STATS_SNDCTRL, "SndCtlPkt", "\t\t%llu control packet\n") \ + X(AOP_TCP_STATS_SYNCHALLENGE, "SYNChallenge", "\t\t%llu challenge ACK sent due to unexpected SYN\n") \ + X(AOP_TCP_STATS_RSTCHALLENGE, "RSTChallenge", "\t\t%llu challenge ACK sent due to unexpected RST\n") \ + \ + /* Input stats */ \ + X(AOP_TCP_STATS_RCVTOTAL, "RcvTotalPkt", "\t%llu packet received\n") \ + X(AOP_TCP_STATS_RCVACKPACK, "RcvAckPkt", "\t\t%llu ack") \ + X(AOP_TCP_STATS_RCVACKBYTE, "RcvAckByte", " (for %llu byte)\n") \ + X(AOP_TCP_STATS_RCVDUPACK, "RcvDupAck", "\t\t%llu duplicate ack\n") \ + X(AOP_TCP_STATS_RCVACKTOOMUCH, "RcvAckUnSnd", "\t\t%llu ack for unsent data\n") \ + X(AOP_TCP_STATS_RCVPACK, "RcvPktInSeq", "\t\t%llu packet received in-sequence") \ + X(AOP_TCP_STATS_RCVBYTE, "RcvBInSeq", " (%llu byte)\n") \ + X(AOP_TCP_STATS_RCVDUPPACK, "RcvDupPkt", "\t\t%llu completely duplicate packet") \ + X(AOP_TCP_STATS_RCVDUPBYTE, "RcvDupByte", " (%llu byte)\n") \ + X(AOP_TCP_STATS_PAWSDROP, "PAWSDrop", "\t\t%llu old duplicate packet\n") \ + X(AOP_TCP_STATS_RCVMEMDROP, "RcvMemDrop", "\t\t%llu received packet dropped due to low memory\n") \ + X(AOP_TCP_STATS_RCVPARTDUPPACK, "RcvDupData", "\t\t%llu packet with some dup. 
data") \ + X(AOP_TCP_STATS_RCVPARTDUPBYTE, "RcvPDupByte", " (%llu byte duped)\n") \ + X(AOP_TCP_STATS_RCVOOPACK, "RcvOOPkt", "\t\t%llu out-of-order packet") \ + X(AOP_TCP_STATS_RCVOOBYTE, "RcvOOByte", " (%llu byte)\n") \ + X(AOP_TCP_STATS_RCVPACKAFTERWIN, "RcvAftWinPkt", "\t\t%llu packet of data after window") \ + X(AOP_TCP_STATS_RCVBYTEAFTERWIN, "RcvAftWinByte"," (%llu byte)\n") \ + X(AOP_TCP_STATS_RCVWINPROBE, "RcvWinProbPkt","\t\t%llu window probe\n") \ + X(AOP_TCP_STATS_RCVWINUPD, "RcvWinUpdPkt", "\t\t%llu window update packet\n") \ + X(AOP_TCP_STATS_RCVAFTERCLOSE, "RcvAftCloPkt", "\t\t%llu packet received after close\n") \ + X(AOP_TCP_STATS_BADRST, "BadRST", "\t\t%llu bad reset\n") \ + X(AOP_TCP_STATS_RCVBADSUM, "RcvBadCsum", "\t\t%llu discarded for bad checksum\n") \ + X(AOP_TCP_STATS_RCVBADOFF, "RcvBadOff", "\t\t%llu discarded for bad header offset field\n") \ + X(AOP_TCP_STATS_RCVSHORT, "RcvTooShort", "\t\t%llu discarded because packet too short\n") \ + X(AOP_TCP_STATS_CONNATTEMPT, "ConnInit", "\t\t%llu discarded because packet too short\n") \ + \ + /* Connection stats */ \ + X(AOP_TCP_STATS_CONNECTS, "ConnEst", "\t%llu connection established (including accepts)\n") \ + X(AOP_TCP_STATS_CLOSED, "ConnClosed", "\t%llu connection closed") \ + X(AOP_TCP_STATS_DROPS, "ConnDrop", " (including %llu drop)\n") \ + X(AOP_TCP_STATS_RTTUPDATED, "RTTUpdated", "\t%llu segment updated rtt") \ + X(AOP_TCP_STATS_SEGSTIMED, "RTTTimed", " (of %llu attempt)\n") \ + X(AOP_TCP_STATS_REXMTTIMEO, "ReXmtTO", "\t%llu retransmit timeout\n") \ + X(AOP_TCP_STATS_TIMEOUTDROP, "DropTO", "\t\t%llu connection dropped by rexmit timeout\n") \ + X(AOP_TCP_STATS_RXTFINDROP, "ReXmtFINDrop", "\t\t%llu connection dropped after retransmitting FIN\n") \ + X(AOP_TCP_STATS_PERSISTTIMEO, "PersistTO", "\t%llu persist timeout\n") \ + X(AOP_TCP_STATS_PERSISTDROP, "PersisStateTO","\t\t%llu connection dropped by persist timeout\n") \ + X(AOP_TCP_STATS_KEEPTIMEO, "KATO", "\t%llu keepalive timeout\n") \ + X(AOP_TCP_STATS_KEEPPROBE, "KAProbe", "\t\t%llu keepalive probe sent\n") \ + X(AOP_TCP_STATS_KEEPDROPS, "KADrop", "\t\t%llu connection dropped by keepalive\n") \ + \ + /* SACK/RACK related stats */ \ + X(AOP_TCP_STATS_SACK_RECOVERY_EPISODE, "SACKRecEpi", "\t%llu SACK recovery episode\n") \ + X(AOP_TCP_STATS_SACK_REXMITS, "SACKReXmt", "\t%llu segment rexmit in SACK recovery episodes\n") \ + X(AOP_TCP_STATS_SACK_REXMIT_BYTES, "SACKReXmtB", "\t%llu byte rexmit in SACK recovery episodes\n") \ + X(AOP_TCP_STATS_SACK_RCV_BLOCKS, "SACKRcvBlk", "\t%llu SACK option (SACK blocks) received\n") \ + X(AOP_TCP_STATS_SACK_SEND_BLOCKS, "SACKSntBlk", "\t%llu SACK option (SACK blocks) sent\n") \ + X(AOP_TCP_STATS_SACK_SBOVERFLOW, "SACKSndBlkOF", "\t%llu SACK scoreboard overflow\n") \ + \ + X(AOP_TCP_STATS_LIMITED_TXT, "LimitedXmt", "\t%llu limited transmit done\n") \ + X(AOP_TCP_STATS_EARLY_REXMT, "EarlyReXmt", "\t%llu early retransmit done\n") \ + X(AOP_TCP_STATS_SACK_ACKADV, "SACKAdvAck", "\t%llu time cumulative ack advanced along with SACK\n") \ + X(AOP_TCP_STATS_PTO, "ProbTO", "\t%llu probe timeout\n") \ + X(AOP_TCP_STATS_RTO_AFTER_PTO, "RTOAfProb", "\t\t%llu time retransmit timeout triggered after probe\n") \ + X(AOP_TCP_STATS_PROBE_IF, "ProbeIF", "\t\t%llu time probe packets were sent for an interface\n") \ + X(AOP_TCP_STATS_PROBE_IF_CONFLICT, "ProbeIFConfl", "\t\t%llu time couldn't send probe packets for an interface\n") \ + X(AOP_TCP_STATS_TLP_RECOVERY, "TLPFastRecvr", "\t\t%llu time fast recovery after tail loss\n") \ + 
X(AOP_TCP_STATS_TLP_RECOVERLASTPKT, "TLPRecvrLPkt", "\t\t%llu time recovered last packet \n") \ + X(AOP_TCP_STATS_PTO_IN_RECOVERY, "PTOInRecvr", "\t\t%llu SACK based rescue retransmit\n") \ + \ + /* DSACK related statistics */ \ + X(AOP_TCP_STATS_DSACK_SENT, "DSACKSnd", "\t%llu time DSACK option was sent\n") \ + X(AOP_TCP_STATS_DSACK_RECVD, "DSACKRcv", "\t\t%llu time DSACK option was received\n") \ + X(AOP_TCP_STATS_DSACK_DISABLE, "DSACKDisable", "\t\t%llu time DSACK was disabled on a connection\n") \ + X(AOP_TCP_STATS_DSACK_BADREXMT, "DSACKBadReXmt","\t\t%llu time recovered from bad retransmission using DSACK\n") \ + X(AOP_TCP_STATS_DSACK_ACKLOSS, "DSACKAckLoss", "\t\t%llu time ignored DSACK due to ack loss\n") \ + X(AOP_TCP_STATS_DSACK_RECVD_OLD, "DSACKRcvOld", "\t\t%llu time ignored old DSACK options\n") \ + X(AOP_TCP_STATS_PMTUDBH_REVERTED, "PMTUDBHRevert","\t%llu time PMTU Blackhole detection, size reverted\n") \ + X(AOP_TCP_STATS_DROP_AFTER_SLEEP, "DropAPSleep", "\t%llu connection were dropped after long sleep\n") \ + X(__AOP_TCP_STATS_MAX, "", "end of tcp stats") + +#define AOP_UDP_STATS_TABLE(X) \ + /* Input stats */ \ + X(AOP_UDP_STATS_IPACKETS, "RcvPkt", "\t%llu datagram received\n") \ + X(AOP_UDP_STATS_HDROPS, "HdrDrop", "\t\t%llu with incomplete header\n") \ + X(AOP_UDP_STATS_BADSUM, "BadCsum", "\t\t%llu with bad data length field\n") \ + X(AOP_UDP_STATS_BADLEN, "BadLen", "\t\t%llu with bad checksum\n") \ + X(AOP_UDP_STATS_NOSUM, "NoCsum", "\t\t%llu with no checksum\n") \ + X(AOP_UDP_STATS_NOPORT, "NoPort", "\t\t%llu dropped due to no socket\n") \ + X(AOP_UDP_STATS_FULLSOCK, "FullSock", "\t\t%llu dropped due to full socket buffers\n") \ + \ + /* Output stats */ \ + X(AOP_UDP_STATS_OPACKETS, "SndPkt", "\t%llu datagram output\n") \ + \ + X(__AOP_UDP_STATS_MAX, "", "end of UDP stats") + +#define AOP_DRIVER_STATS_TABLE(X) \ + /* AOP driver stats */ \ + X(AOP_DRIVER_STATS_TXDROP, "TxDrop", "\t%llu total Tx dropped\n") \ + X(AOP_DRIVER_STATS_TXPENDING, "TxPending", "\t%llu total pending Tx not completed\n") \ + X(AOP_DRIVER_STATS_RXDROP, "RxDrop", "\t%llu total Rx dropped\n") \ + X(AOP_DRIVER_STATS_RXPENDING, "RxPending", "\t%llu total pending Rx not completed\n") \ + X(__AOP_DRIVER_STATS_MAX, "", "end of driver stats") + +/* + * Common stats operation and macro + */ +#define EXPAND_TO_ENUMERATION(a, b, c) a, +#define EXPAND_TO_STRING(a, b, c) b, +#define EXPAND_TO_FORMAT(a, b, c) c, + +#define DEFINE_STATS_STR_FUNC(type, table) \ +__attribute__((always_inline)) \ +static inline const char * \ +type##_str(enum _##type value) \ +{ \ + static const char *table[] = { \ + table(EXPAND_TO_STRING) \ + }; \ + return (table[value]); \ +} + +#define DEFINE_STATS_FMT_FUNC(type, table) \ +__attribute__((always_inline)) \ +static inline const char * \ +type##_fmt(enum _##type value) \ +{ \ + static const char *table[] = { \ + table(EXPAND_TO_FORMAT) \ + }; \ + return (table[value]); \ +} + +#define STATS_ALIGN 16 /* align for vector instruction */ + +#define STATS_REGISTER(name, NAME) \ +enum _##name { NAME##_TABLE(EXPAND_TO_ENUMERATION) }; \ +struct name { \ + uint64_t _arr[__##NAME##_MAX]; \ +} __attribute__((aligned(STATS_ALIGN))); \ +DEFINE_STATS_STR_FUNC(name, NAME##_TABLE) \ +DEFINE_STATS_FMT_FUNC(name, NAME##_TABLE) + +/* Stats registration stub */ +STATS_REGISTER(aop_ip_stats, AOP_IP_STATS); +STATS_REGISTER(aop_ip6_stats, AOP_IP6_STATS); +STATS_REGISTER(aop_tcp_stats, AOP_TCP_STATS); +STATS_REGISTER(aop_udp_stats, AOP_UDP_STATS); +STATS_REGISTER(aop_driver_stats, AOP_DRIVER_STATS); + 
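The EXPAND_TO_*/STATS_REGISTER block above is an X-macro pattern: each *_STATS_TABLE(X) is expanded several times so that the enum constants, the short display names, and the printf formats all come from one table and cannot drift out of sync. The following is a minimal standalone sketch of the same technique; the EXAMPLE_STATS_* names and the use of plain lookup arrays in place of the inline _str/_fmt helpers are illustrative assumptions, not code from this patch.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical table for illustration: each X() row is (enum name, label, format). */
#define EXAMPLE_STATS_TABLE(X) \
	X(EXAMPLE_STATS_RXPKTS, "RxPkts", "\t%llu packet received\n") \
	X(EXAMPLE_STATS_TXPKTS, "TxPkts", "\t%llu packet sent\n") \
	X(__EXAMPLE_STATS_MAX,  "",       "end of example stats")

#define EXPAND_TO_ENUMERATION(a, b, c) a,
#define EXPAND_TO_STRING(a, b, c)      b,
#define EXPAND_TO_FORMAT(a, b, c)      c,

/* One expansion per artifact: enum values, labels, and formats stay aligned by construction. */
enum example_stats_id { EXAMPLE_STATS_TABLE(EXPAND_TO_ENUMERATION) };
static const char *example_stats_label[] = { EXAMPLE_STATS_TABLE(EXPAND_TO_STRING) };
static const char *example_stats_fmt[] = { EXAMPLE_STATS_TABLE(EXPAND_TO_FORMAT) };

struct example_stats {
	uint64_t _arr[__EXAMPLE_STATS_MAX]; /* one counter per table row, as in STATS_REGISTER */
};

static void
example_stats_dump(const struct example_stats *st)
{
	/* The sentinel __EXAMPLE_STATS_MAX row only sizes the array and is skipped here. */
	for (int i = 0; i < __EXAMPLE_STATS_MAX; i++) {
		printf("%s:", example_stats_label[i]);
		printf(example_stats_fmt[i], (unsigned long long)st->_arr[i]);
	}
}

int
main(void)
{
	struct example_stats st = { ._arr = { 5, 3 } };
	example_stats_dump(&st);
	return 0;
}

Because every consumer re-expands the same table, adding a counter is a one-line change to the table macro, which is why each AOP stats group above is defined as a *_STATS_TABLE(X) rather than as separate enum, string, and format lists.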
+#undef STATS_REGISTER +#undef DEFINE_STATS_STR_FUNC +#undef EXPAND_TO_STRING +#undef EXPAND_TO_ENUMERATION + +#define NET_AOP_PROTOCOL_STATS "net.aop.protocol_stats" +#define NET_AOP_DRIVER_STATS "net.aop.driver_stats" +#define NET_AOP_ACTIVITY_BITMAP "net.aop.proc_activity_bitmaps" + +struct net_aop_protocol_stats { + struct aop_ip_stats aop_ip; + struct aop_ip6_stats aop_ip6; + struct aop_tcp_stats aop_tcp; + struct aop_udp_stats aop_udp; +}; + +struct net_aop_global_stats { + struct net_aop_protocol_stats aop_proto_stats; + struct aop_driver_stats aop_driver; +}__attribute__((aligned(64))); + +struct aop_activity_bitmap { + /* + * `start` maintains the start time of the + * bitmap. The value is set based on + * mach_continuous_time(). + */ + uint64_t start; + /* + * AOP maintains a larger bitmap to track + * state when AP goes to sleep. A bitmap of + * size 8 allows tracking network activity for + * more than 60 mins. + */ + uint64_t bitmap[8]; +}; + +#define AOP_MAX_PROC_BUNDLE_ID_LEN 256 +struct aop_proc_activity_bitmap { + char proc_bundle_id[AOP_MAX_PROC_BUNDLE_ID_LEN]; + struct aop_activity_bitmap wifi_bitmap; + struct aop_activity_bitmap cell_bitmap; +}; + +#endif /*_NET_AOP_STATS_H_*/ diff --git a/bsd/net/aop/kpi_aop.c b/bsd/net/aop/kpi_aop.c new file mode 100644 index 000000000..af75ac353 --- /dev/null +++ b/bsd/net/aop/kpi_aop.c @@ -0,0 +1,731 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#define __KPI__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static LCK_GRP_DECLARE(kaop_lock_group, "net_aop"); +static LCK_ATTR_DECLARE(kaop_lock_attr, 0, 0); +static LCK_MTX_DECLARE_ATTR(kaop_lock, &kaop_lock_group, &kaop_lock_attr); + +#define KAOP_LOCK() \ + lck_mtx_lock(&kaop_lock) +#define KAOP_LOCK_ASSERT_HELD() \ + LCK_MTX_ASSERT(&kaop_lock, LCK_MTX_ASSERT_OWNED) +#define KAOP_LOCK_ASSERT_NOTHELD() \ + LCK_MTX_ASSERT(&kaop_lock, LCK_MTX_ASSERT_NOTOWNED) +#define KAOP_UNLOCK() \ + lck_mtx_unlock(&kaop_lock) + +os_log_t kaop_log_handle = NULL; + +#define _KAOPLOG(level, type, fmt, ...) 
do { \ + os_log_with_type(kaop_log_handle, type, "%s - " fmt, __func__, ##__VA_ARGS__); \ +} while(0) + +#define KAOPLOG(fmt, ...) _KAOPLOG(kaop_log_handle, OS_LOG_TYPE_DEFAULT, fmt, ##__VA_ARGS__) +#define KAOPLOG_DEBUG(fmt, ...) _KAOPLOG(kaop_log_handle, OS_LOG_TYPE_DEBUG, fmt, ##__VA_ARGS__) +#define KAOPLOG_INFO(fmt, ...) _KAOPLOG(kaop_log_handle, OS_LOG_TYPE_INFO, fmt, ##__VA_ARGS__) +#define KAOPLOG_ERR(fmt, ...) _KAOPLOG(kaop_log_handle, OS_LOG_TYPE_ERROR, fmt, ##__VA_ARGS__) + +os_refgrp_decl(static, kaop_refgrp, "kaop_ref_group", NULL); + +#define KAOP_DRIVER_STATS (((uint32_t)1) << 1) +#define KAOP_PROC_ACTIVITY_BITMAPS (((uint32_t)1) << 2) + +#define KAOP_IP_STATS (((uint32_t)1) << 24) +#define KAOP_IP6_STATS (((uint32_t)1) << 25) +#define KAOP_TCP_STATS (((uint32_t)1) << 26) +#define KAOP_UDP_STATS (((uint32_t)1) << 27) + +#define KAOP_PROTOCOL_STATS (KAOP_IP_STATS | KAOP_IP6_STATS \ + | KAOP_TCP_STATS | KAOP_UDP_STATS) + +/* + * sysctl interfaces + */ +static int net_aop_stats_get_sysctl SYSCTL_HANDLER_ARGS; + +SYSCTL_NODE(_net, OID_AUTO, aop, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "AOP"); +SYSCTL_PROC(_net_aop, OID_AUTO, driver_stats, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, KAOP_DRIVER_STATS, + net_aop_stats_get_sysctl, "S,aop_driver_stats", + "AOP driver statistics counter"); +SYSCTL_PROC(_net_aop, OID_AUTO, protocol_stats, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, KAOP_PROTOCOL_STATS, + net_aop_stats_get_sysctl, "S,net_aop_protocol_stats", + "AOP protocol statistics counter"); +SYSCTL_PROC(_net_aop, OID_AUTO, proc_activity_bitmaps, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, KAOP_PROC_ACTIVITY_BITMAPS, + net_aop_stats_get_sysctl, "S,aop_proc_activity_bitmap", + "AOP process activity bitmaps"); + +#define KAOP_CAPAB_FLOW_SETUP 0x00000001 +#define KAOP_CAPAB_FLOW_STATS 0x00000002 +#define KAOP_CAPAB_STATS 0x00000004 +#define KAOP_CAPAB_PROC_ACTIVITY_BITMAPS 0x00000008 + +struct net_aop_flow_setup { + net_aop_flow_setup_fn_t fsp_flow_setup; + void *fsp_prov_ctx; +}; + +struct net_aop_flow_stats { + net_aop_flow_stats_fn_t fs_flow_stats; + void *fs_prov_ctx; +}; + +struct net_aop_stats { + net_aop_stats_fn_t gs_stats; + void *gs_prov_ctx; +}; + +struct net_aop_proc_activity_bitmaps { + net_aop_proc_activity_bitmap_fn_t pab_activity_bitmap; + void *pab_prov_ctx; +}; + +#define KAOP_FLAG_ATTACHED 0x00000001 + +struct net_aop_provider_handle { + struct net_aop_provider_init kaop_ext; + void *kaop_prov_ctx; + struct net_aop_flow_setup kaop_fsp; + struct net_aop_flow_stats kaop_fs; + struct net_aop_stats kaop_gs; + struct net_aop_proc_activity_bitmaps kaop_pb; + uint32_t kaop_capabilities; + uint32_t kaop_flags; + os_refcnt_t kaop_refcnt; +}; + +static struct net_aop_provider_handle g_aop_net_provider; + +static errno_t +net_aop_validate_init_params( + const struct net_aop_provider_init *init, const uint32_t init_len) +{ + errno_t err = 0; + + static_assert(__builtin_offsetof(struct net_aop_provider_init, kaopi_version) == 0); + static_assert(sizeof(init->kaopi_version) == sizeof(uint32_t)); + + if (init == NULL) { + KAOPLOG_ERR("init is null"); + return EINVAL; + } + + if (init_len < sizeof(uint32_t)) { + KAOPLOG_ERR("init_len[%u] < sizeof(uint32_t)", init_len); + return EINVAL; + } + + switch (init->kaopi_version) { + case NET_AOP_VERSION_1: + if (init_len != sizeof(struct net_aop_provider_init)) { + KAOPLOG_ERR("init_len[%u] != sizeof(struct net_aop_provider_init", init_len); + err = EINVAL; + break; + } + if (init->kaopi_config_capab == NULL) { + 
KAOPLOG_ERR("kaopi_config_capab is null"); + err = EINVAL; + break; + } + break; + default: + KAOPLOG_ERR("invalid version[%u]", init->kaopi_version); + err = EINVAL; + break; + } + + return err; +} + +static int +configure_capab_flow_setup(net_aop_provider_handle_t prov, + net_aop_config_fn_t capab_fn) +{ + struct net_aop_capab_flow_setup capab; + uint32_t capab_len; + int error; + + bzero(&capab, sizeof(capab)); + capab.kaopcfsp_version = NET_AOP_CAPAB_FLOW_SETUP_VERSION_1; + capab_len = sizeof(capab); + + error = capab_fn(prov->kaop_prov_ctx, NET_AOP_CAPAB_FLOW_SETUP, + &capab, &capab_len); + if (error != 0) { + KAOPLOG_ERR("Failed to get flow setup capability for provider"); + return error; + } + + VERIFY(capab.kaopcfsp_config != NULL); + VERIFY(capab.kaopcfsp_prov_ctx != NULL); + prov->kaop_fsp.fsp_prov_ctx = capab.kaopcfsp_prov_ctx; + prov->kaop_fsp.fsp_flow_setup = capab.kaopcfsp_config; + prov->kaop_capabilities |= KAOP_CAPAB_FLOW_SETUP; + return 0; +} + +static void +unconfigure_capab_flow_setup(net_aop_provider_handle_t prov) +{ + if ((prov->kaop_capabilities & KAOP_CAPAB_FLOW_SETUP) == 0) { + return; + } + + bzero(&prov->kaop_fsp, sizeof(prov->kaop_fsp)); + prov->kaop_capabilities &= ~KAOP_CAPAB_FLOW_SETUP; +} + +static int +configure_capab_flow_stats(net_aop_provider_handle_t prov, + net_aop_config_fn_t capab_fn) +{ + struct net_aop_capab_flow_stats capab; + uint32_t capab_len; + int error; + + bzero(&capab, sizeof(capab)); + capab.kaopcfs_version = NET_AOP_CAPAB_FLOW_STATS_VERSION_1; + capab_len = sizeof(capab); + error = capab_fn(prov->kaop_prov_ctx, NET_AOP_CAPAB_FLOW_STATS, + &capab, &capab_len); + if (error != 0) { + KAOPLOG_ERR("Failed to get flow stats capability for KAOP provider"); + return error; + } + + VERIFY(capab.kaopcfs_config != NULL); + VERIFY(capab.kaopcfs_prov_ctx != NULL); + prov->kaop_fs.fs_prov_ctx = capab.kaopcfs_prov_ctx; + prov->kaop_fs.fs_flow_stats = capab.kaopcfs_config; + prov->kaop_capabilities |= KAOP_CAPAB_FLOW_STATS; + return 0; +} + +static void +unconfigure_capab_flow_stats(net_aop_provider_handle_t prov) +{ + if ((prov->kaop_capabilities & KAOP_CAPAB_FLOW_STATS) == 0) { + return; + } + + bzero(&prov->kaop_fs, sizeof(prov->kaop_fs)); + prov->kaop_capabilities &= ~KAOP_CAPAB_FLOW_STATS; +} + +static int +configure_capab_stats(net_aop_provider_handle_t prov, + net_aop_config_fn_t capab_fn) +{ + struct net_aop_capab_stats capab; + uint32_t capab_len; + int error; + + bzero(&capab, sizeof(capab)); + capab.kaopcgs_version = NET_AOP_CAPAB_STATS_VERSION_1; + capab_len = sizeof(capab); + error = capab_fn(prov->kaop_prov_ctx, NET_AOP_CAPAB_STATS, + &capab, &capab_len); + if (error != 0) { + KAOPLOG_ERR("Failed to get stats capability provider"); + return error; + } + + VERIFY(capab.kaopcgs_config != NULL); + VERIFY(capab.kaopcgs_prov_ctx != NULL); + prov->kaop_gs.gs_prov_ctx = capab.kaopcgs_prov_ctx; + prov->kaop_gs.gs_stats = capab.kaopcgs_config; + prov->kaop_capabilities |= KAOP_CAPAB_STATS; + return 0; +} + +static void +unconfigure_capab_stats(net_aop_provider_handle_t prov) +{ + if ((prov->kaop_capabilities & KAOP_CAPAB_STATS) == 0) { + return; + } + + bzero(&prov->kaop_gs, sizeof(prov->kaop_gs)); + prov->kaop_capabilities &= ~KAOP_CAPAB_STATS; +} + +static int +configure_capab_process_bitmaps(net_aop_provider_handle_t prov, + net_aop_config_fn_t capab_fn) +{ + struct net_aop_capab_proc_activity_bitmap capab; + uint32_t capab_len; + int error; + + bzero(&capab, sizeof(capab)); + capab.kaopbm_version = NET_AOP_CAPAB_PROC_ACTIVITY_BITMAP_VERSION_1; 
+ capab_len = sizeof(capab); + error = capab_fn(prov->kaop_prov_ctx, NET_AOP_CAPAB_PROC_ACTIVITY_BITMAP, + &capab, &capab_len); + if (error != 0) { + KAOPLOG_ERR("Failed to get proc bitmap capability provider"); + return error; + } + + VERIFY(capab.kaopbm_config != NULL); + VERIFY(capab.kaopbm_prov_ctx != NULL); + prov->kaop_pb.pab_prov_ctx = capab.kaopbm_prov_ctx; + prov->kaop_pb.pab_activity_bitmap = capab.kaopbm_config; + prov->kaop_capabilities |= KAOP_CAPAB_PROC_ACTIVITY_BITMAPS; + return 0; +} + +static void +unconfigure_capab_process_bitmaps(net_aop_provider_handle_t prov) +{ + if ((prov->kaop_capabilities & KAOP_CAPAB_PROC_ACTIVITY_BITMAPS) == 0) { + return; + } + + bzero(&prov->kaop_pb, sizeof(prov->kaop_pb)); + prov->kaop_capabilities &= ~KAOP_CAPAB_PROC_ACTIVITY_BITMAPS; +} + +static int +net_aop_provider_initialize(net_aop_provider_handle_t prov) +{ + net_aop_config_fn_t capab_fn = prov->kaop_ext.kaopi_config_capab; + if (capab_fn == NULL) { + KAOPLOG_ERR("kaop provider missing capability function"); + return EINVAL; + } + + configure_capab_flow_setup(prov, capab_fn); + configure_capab_flow_stats(prov, capab_fn); + configure_capab_stats(prov, capab_fn); + configure_capab_process_bitmaps(prov, capab_fn); + return 0; +} + +static void +net_aop_provider_cleanup(net_aop_provider_handle_t prov) +{ + KAOP_LOCK_ASSERT_HELD(); + + prov->kaop_flags &= ~KAOP_FLAG_ATTACHED; + + if (os_ref_release(&prov->kaop_refcnt) != 0) { + while (os_ref_get_count(&prov->kaop_refcnt) > 0) { + msleep(&prov->kaop_refcnt, + &kaop_lock, (PZERO + 1), __FUNCTION__, NULL); + } + } + + unconfigure_capab_flow_setup(prov); + unconfigure_capab_flow_stats(prov); + unconfigure_capab_stats(prov); + unconfigure_capab_process_bitmaps(prov); + memset(&prov->kaop_ext, 0, sizeof(prov->kaop_ext)); + prov->kaop_prov_ctx = NULL; +} + +static void +net_aop_release_refcnt(net_aop_provider_handle_t prov) +{ + KAOP_LOCK(); + if (os_ref_release(&prov->kaop_refcnt) == 0) { + wakeup(&prov->kaop_refcnt); + } + KAOP_UNLOCK(); +} + +int +net_aop_setup_flow(uint32_t flow_id, bool add, uint32_t *stats_index) +{ + net_aop_flow_setup_fn_t fsp = NULL; + void *__single fsp_ctx = NULL; + int err = 0; + + if (stats_index == NULL) { + KAOPLOG_ERR("invalid stats index param"); + return EINVAL; + } + + KAOP_LOCK(); + if ((g_aop_net_provider.kaop_capabilities & KAOP_CAPAB_FLOW_SETUP) == 0) { + KAOP_UNLOCK(); + KAOPLOG_ERR("kern aop provider does not support flow setup"); + return ENOTSUP; + } + + if (!(g_aop_net_provider.kaop_flags & KAOP_FLAG_ATTACHED) || + !os_ref_retain_try(&g_aop_net_provider.kaop_refcnt)) { + KAOP_UNLOCK(); + KAOPLOG_ERR("kernel aop provider is not valid"); + return ENOENT; + } + + fsp = g_aop_net_provider.kaop_fsp.fsp_flow_setup; + fsp_ctx = g_aop_net_provider.kaop_fsp.fsp_prov_ctx; + KAOP_UNLOCK(); + + err = fsp(fsp_ctx, flow_id, add, stats_index); + net_aop_release_refcnt(&g_aop_net_provider); + return err; +} + +int +net_aop_get_flow_stats(uint32_t stats_index, struct aop_flow_stats *flow_stats) +{ + net_aop_flow_stats_fn_t fs = NULL; + void *__single fs_ctx = NULL; + int err = 0; + + if (flow_stats == NULL) { + KAOPLOG_ERR("invalid flow stats param"); + return EINVAL; + } + + KAOP_LOCK(); + if ((g_aop_net_provider.kaop_capabilities & KAOP_CAPAB_FLOW_STATS) == 0) { + KAOP_UNLOCK(); + KAOPLOG_ERR("kern aop provider does not support flow stats"); + return ENOTSUP; + } + + if (!(g_aop_net_provider.kaop_flags & KAOP_FLAG_ATTACHED) || + !os_ref_retain_try(&g_aop_net_provider.kaop_refcnt)) { + KAOP_UNLOCK(); + 
KAOPLOG_ERR("kernel aop provider is not valid"); + return ENOENT; + } + + fs = g_aop_net_provider.kaop_fs.fs_flow_stats; + fs_ctx = g_aop_net_provider.kaop_fs.fs_prov_ctx; + KAOP_UNLOCK(); + + err = fs(fs_ctx, stats_index, flow_stats); + net_aop_release_refcnt(&g_aop_net_provider); + return err; +} + +int +net_aop_get_stats(net_aop_stats_type_t type, uint8_t *__sized_by(stats_len) stats, size_t stats_len) +{ + net_aop_stats_fn_t gs = NULL; + void *__single gs_ctx = NULL; + int err = 0; + + if (type == NET_AOP_STATS_TYPE_INVALID || + type > NET_AOP_STATS_TYPE_MAX) { + KAOPLOG_ERR("invalid stats type %u", type); + return EINVAL; + } + + if (stats == NULL || stats_len == 0) { + KAOPLOG_ERR("invalid stats param"); + return EINVAL; + } + + KAOP_LOCK(); + if ((g_aop_net_provider.kaop_capabilities & KAOP_CAPAB_STATS) == 0) { + KAOP_UNLOCK(); + KAOPLOG_ERR("kern aop provider does not support stats"); + return ENOTSUP; + } + + if (!(g_aop_net_provider.kaop_flags & KAOP_FLAG_ATTACHED) || + !os_ref_retain_try(&g_aop_net_provider.kaop_refcnt)) { + KAOP_UNLOCK(); + KAOPLOG_ERR("kernel aop provider is not valid"); + return ENOENT; + } + + gs = g_aop_net_provider.kaop_gs.gs_stats; + gs_ctx = g_aop_net_provider.kaop_gs.gs_prov_ctx; + KAOP_UNLOCK(); + + err = gs(gs_ctx, type, stats, stats_len); + net_aop_release_refcnt(&g_aop_net_provider); + return err; +} + +int +net_aop_get_proc_activity_bitmaps(struct aop_proc_activity_bitmap *proc_activity_bitmaps, + uint16_t *inout_count) +{ + net_aop_proc_activity_bitmap_fn_t pb = NULL; + void *__single pb_ctx = NULL; + int err = 0; + + if (inout_count == NULL) { + KAOPLOG_ERR("invalid inout_count param"); + return EINVAL; + } + + KAOP_LOCK(); + if ((g_aop_net_provider.kaop_capabilities & KAOP_CAPAB_PROC_ACTIVITY_BITMAPS) == 0) { + KAOP_UNLOCK(); + KAOPLOG_ERR("kern aop provider does not support proc bitmaps"); + return ENOTSUP; + } + + if (!(g_aop_net_provider.kaop_flags & KAOP_FLAG_ATTACHED) || + !os_ref_retain_try(&g_aop_net_provider.kaop_refcnt)) { + KAOP_UNLOCK(); + KAOPLOG_ERR("kernel aop provider is not valid"); + return ENOENT; + } + + pb = g_aop_net_provider.kaop_pb.pab_activity_bitmap; + pb_ctx = g_aop_net_provider.kaop_pb.pab_prov_ctx; + KAOP_UNLOCK(); + + err = pb(pb_ctx, proc_activity_bitmaps, inout_count); + net_aop_release_refcnt(&g_aop_net_provider); + return err; +} + +net_aop_provider_handle_t +net_aop_register_provider(const struct net_aop_provider_init *init, + const uint32_t init_len, void *ctx) +{ + errno_t err = 0; + + err = net_aop_validate_init_params(init, init_len); + if (err != 0) { + return NULL; + } + + KAOP_LOCK(); + if (g_aop_net_provider.kaop_flags & KAOP_FLAG_ATTACHED) { + KAOP_UNLOCK(); + KAOPLOG_ERR("kernel aop provider already registered"); + return NULL; + } + + os_ref_init(&g_aop_net_provider.kaop_refcnt, &kaop_refgrp); + memcpy(&g_aop_net_provider.kaop_ext, init, sizeof(g_aop_net_provider.kaop_ext)); + g_aop_net_provider.kaop_prov_ctx = ctx; + + err = net_aop_provider_initialize(&g_aop_net_provider); + if (err != 0) { + KAOP_UNLOCK(); + KAOPLOG_ERR("provider type failed to initialize"); + goto done; + } + + g_aop_net_provider.kaop_flags |= KAOP_FLAG_ATTACHED; + KAOP_UNLOCK(); +done: + KAOP_LOCK_ASSERT_NOTHELD(); + if (err != 0) { + KAOP_LOCK(); + net_aop_provider_cleanup(&g_aop_net_provider); + KAOP_UNLOCK(); + return NULL; + } + return &g_aop_net_provider; +} + +void +net_aop_deregister_provider(net_aop_provider_handle_t prov) +{ + if (prov == NULL) { + return; + } + + KAOP_LOCK(); + ASSERT(prov->kaop_flags & 
KAOP_FLAG_ATTACHED); + net_aop_provider_cleanup(prov); + KAOP_UNLOCK(); + return; +} + +static int +net_aop_get_protocol_stats(struct net_aop_protocol_stats *aop_proto_stats) +{ + int error = 0; + + error = net_aop_get_stats(NET_AOP_STATS_TYPE_IP, + (uint8_t *)&aop_proto_stats->aop_ip, sizeof(aop_proto_stats->aop_ip)); + if (error != 0) { + return error; + } + error = net_aop_get_stats(NET_AOP_STATS_TYPE_IPV6, + (uint8_t *)&aop_proto_stats->aop_ip6, sizeof(aop_proto_stats->aop_ip6)); + if (error != 0) { + return error; + } + error = net_aop_get_stats(NET_AOP_STATS_TYPE_TCP, + (uint8_t *)&aop_proto_stats->aop_tcp, sizeof(aop_proto_stats->aop_tcp)); + if (error != 0) { + return error; + } + error = net_aop_get_stats(NET_AOP_STATS_TYPE_UDP, + (uint8_t *)&aop_proto_stats->aop_udp, sizeof(aop_proto_stats->aop_udp)); + if (error != 0) { + return error; + } + + return error; +} + +static int +net_aop_get_driver_stats(struct aop_driver_stats *driver_stats) +{ + int error = 0; + error = net_aop_get_stats(NET_AOP_STATS_TYPE_DRIVER, + (uint8_t *)(struct aop_driver_stats *__bidi_indexable)driver_stats, sizeof(struct aop_driver_stats)); + return error; +} + +static int +aop_get_process_activity_bitmaps(struct aop_proc_activity_bitmap **bitmaps, size_t requested_buffer_space, + size_t *out_len) +{ + size_t bitmap_size = 0; + struct aop_proc_activity_bitmap *__sized_by(bitmap_size) proc_activity_bitmap = NULL; + uint16_t proc_bitmap_count = 0; + int err = 0; + + net_aop_get_proc_activity_bitmaps(NULL, &proc_bitmap_count); + + if (proc_bitmap_count > 0) { + size_t requested_count = (requested_buffer_space / (sizeof(struct aop_proc_activity_bitmap))); + requested_count = (requested_count > proc_bitmap_count) ? proc_bitmap_count : requested_count; + + size_t required_buffer_len = (requested_count * sizeof(struct aop_proc_activity_bitmap)); + proc_activity_bitmap = (struct aop_proc_activity_bitmap *)kalloc_data(required_buffer_len, Z_WAITOK | Z_ZERO); + bitmap_size = required_buffer_len; + if (proc_activity_bitmap == NULL) { + return ENOBUFS; + } + + err = net_aop_get_proc_activity_bitmaps(proc_activity_bitmap, (uint16_t *)&requested_count); + if (err != 0) { + kfree_data_sized_by(proc_activity_bitmap, bitmap_size); + return err; + } + + *bitmaps = proc_activity_bitmap; + *out_len = required_buffer_len; + return 0; + } + + return ENOENT; +} + +static int +net_aop_stats_get_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + struct net_aop_protocol_stats proto_stats = {}; + struct aop_driver_stats driver_stats = {}; + struct proc *p = NULL; + task_t __single task = NULL; + size_t buffer_space; + uint8_t *out_buffer = NULL; + size_t out_size = 0; + int error = 0; + + if (!kauth_cred_issuser(kauth_cred_get())) { + p = current_proc(); + task = proc_task(p); + bool has_aop_stats_entitlement = IOTaskHasEntitlement(task, "com.apple.private.network.aop_stats"); + if (!has_aop_stats_entitlement) { + KAOPLOG_ERR("aop stats request rejected, EPERM"); + return EPERM; + } + } + + buffer_space = req->oldlen; + if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) { + if (oidp->oid_arg2 == KAOP_PROTOCOL_STATS) { + if (buffer_space < sizeof(proto_stats)) { + return ENOMEM; + } + + error = net_aop_get_protocol_stats(&proto_stats); + out_buffer = (uint8_t *)&proto_stats; + out_size = sizeof(proto_stats); + } else if (oidp->oid_arg2 == KAOP_DRIVER_STATS) { + if (buffer_space < sizeof(driver_stats)) { + return ENOMEM; + } + + error = net_aop_get_driver_stats(&driver_stats); + out_buffer = (uint8_t *)(&driver_stats); + 
out_size = sizeof(driver_stats); + } else if (oidp->oid_arg2 == KAOP_PROC_ACTIVITY_BITMAPS) { + struct aop_proc_activity_bitmap *__single bitmap = NULL; + error = aop_get_process_activity_bitmaps(&bitmap, buffer_space, &out_size); + out_buffer = (uint8_t *)bitmap; + } + + if (error == 0) { + error = SYSCTL_OUT(req, out_buffer, out_size); + } + } else if (req->oldptr == USER_ADDR_NULL) { + if (oidp->oid_arg2 == KAOP_PROTOCOL_STATS) { + buffer_space = sizeof(proto_stats); + } else if (oidp->oid_arg2 == KAOP_DRIVER_STATS) { + buffer_space = sizeof(driver_stats); + } else if (oidp->oid_arg2 == KAOP_PROC_ACTIVITY_BITMAPS) { + uint16_t proc_bitmap_count = 0; + net_aop_get_proc_activity_bitmaps(NULL, &proc_bitmap_count); + buffer_space = (proc_bitmap_count * sizeof(struct aop_proc_activity_bitmap)); + } + error = SYSCTL_OUT(req, NULL, buffer_space); + } + + return error; +} + +void +net_aop_init(void) +{ + kaop_log_handle = os_log_create("com.apple.xnu.net.aopnet", "aopnet"); +} diff --git a/bsd/net/aop/kpi_aop.h b/bsd/net/aop/kpi_aop.h new file mode 100644 index 000000000..2d2e6c17f --- /dev/null +++ b/bsd/net/aop/kpi_aop.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*! + * @header kpi_aop.h + * This header defines an SPI to interact with the AOP + * using shared memory. The SPIs could be used to collect networking + * stats associated with flows in AOP. 
+ */ + +#ifndef __NET_KPI_AOP_H__ +#define __NET_KPI_AOP_H__ + +#include +#include + +__BEGIN_DECLS + +typedef enum { + NET_AOP_CAPAB_FLOW_SETUP = 1, + NET_AOP_CAPAB_FLOW_STATS = 2, + NET_AOP_CAPAB_STATS = 3, + NET_AOP_CAPAB_PROC_ACTIVITY_BITMAP = 4, +} net_aop_capab_t; + +typedef errno_t (*net_aop_config_fn_t)(void *prov_ctx, + net_aop_capab_t capab, void *contents, uint32_t *len); + +#define NET_AOP_VERSION_1 1 +typedef struct net_aop_provider_init { + uint32_t kaopi_version; + net_aop_config_fn_t kaopi_config_capab; +} net_aop_provider_init_t; + +#define NET_AOP_CAPAB_FLOW_SETUP_VERSION_1 1 +typedef errno_t (*net_aop_flow_setup_fn_t)(void *prov_ctx, + uint32_t flow_id, bool add, uint32_t *stats_index); +struct net_aop_capab_flow_setup { + uint32_t kaopcfsp_version; + void *kaopcfsp_prov_ctx; + net_aop_flow_setup_fn_t kaopcfsp_config; +}; + +#define NET_AOP_CAPAB_FLOW_STATS_VERSION_1 1 +typedef errno_t (*net_aop_flow_stats_fn_t)(void *prov_ctx, + uint32_t stats_index, struct aop_flow_stats *flow_stats); +struct net_aop_capab_flow_stats { + uint32_t kaopcfs_version; + void *kaopcfs_prov_ctx; + net_aop_flow_stats_fn_t kaopcfs_config; +}; + +typedef enum { + NET_AOP_STATS_TYPE_INVALID = 0, + NET_AOP_STATS_TYPE_IP = 1, + NET_AOP_STATS_TYPE_IPV6 = 2, + NET_AOP_STATS_TYPE_TCP = 3, + NET_AOP_STATS_TYPE_UDP = 4, + NET_AOP_STATS_TYPE_DRIVER = 5, + NET_AOP_STATS_TYPE_MAX = NET_AOP_STATS_TYPE_DRIVER, +} net_aop_stats_type_t; + +#define NET_AOP_CAPAB_STATS_VERSION_1 1 +typedef errno_t (*net_aop_stats_fn_t)(void *prov_ctx, + net_aop_stats_type_t type, uint8_t *stats, size_t stats_len); +struct net_aop_capab_stats { + uint32_t kaopcgs_version; + void *kaopcgs_prov_ctx; + net_aop_stats_fn_t kaopcgs_config; +}; + +#define NET_AOP_CAPAB_PROC_ACTIVITY_BITMAP_VERSION_1 1 +typedef errno_t (*net_aop_proc_activity_bitmap_fn_t)(void *prov_ctx, + struct aop_proc_activity_bitmap *proc_activity_bitmaps, uint16_t *inout_count); +struct net_aop_capab_proc_activity_bitmap { + uint32_t kaopbm_version; + void *kaopbm_prov_ctx; + net_aop_proc_activity_bitmap_fn_t kaopbm_config; +}; + +typedef struct net_aop_provider_handle *net_aop_provider_handle_t; + +extern net_aop_provider_handle_t +net_aop_register_provider(const struct net_aop_provider_init *init, + const uint32_t init_len, void *ctx); +extern void +net_aop_deregister_provider(net_aop_provider_handle_t prov); + +#ifdef BSD_KERNEL_PRIVATE +void net_aop_init(void); + +int net_aop_setup_flow(uint32_t flow_id, bool add, uint32_t *stats_index); + +int net_aop_get_flow_stats(uint32_t stats_index, struct aop_flow_stats *flow_stats); + +int net_aop_get_stats(net_aop_stats_type_t type, + uint8_t *__sized_by(stats_len) stats, size_t stats_len); + +int net_aop_get_proc_activity_bitmaps(struct aop_proc_activity_bitmap *proc_activity_bitmaps, + uint16_t *inout_count); +#endif // BSD_KERNEL_PRIVATE + +__END_DECLS + +#endif /* __NET_KPI_AOP_H__ */ diff --git a/bsd/net/bpf.c b/bsd/net/bpf.c index 719b8f218..1c6101575 100644 --- a/bsd/net/bpf.c +++ b/bsd/net/bpf.c @@ -140,7 +140,7 @@ #include extern int tvtohz(struct timeval *); -extern const char *proc_name_address(void *p); +extern const char *proc_name_address(void *); #define BPF_BUFSIZE 4096 @@ -237,9 +237,9 @@ static lck_mtx_t *const bpf_mlock = &bpf_mlock_data; static int bpf_allocbufs(struct bpf_d *); static errno_t bpf_attachd(struct bpf_d *d, struct bpf_if *bp); -static int bpf_detachd(struct bpf_d *d); +static int bpf_detachd(struct bpf_d *d, struct proc *); static void bpf_freed(struct bpf_d *); -static int 
bpf_setif(struct bpf_d *, ifnet_t ifp, bool, bool, bool); +static int bpf_setif(struct bpf_d *, ifnet_t ifp, bool, bool, bool, struct proc *); static void bpf_timed_out(void *, void *); static void bpf_wakeup(struct bpf_d *); static uint32_t get_pkt_trunc_len(struct bpf_packet *); @@ -247,7 +247,7 @@ static void catchpacket(struct bpf_d *, struct bpf_packet *, u_int, int); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, u_int, user_addr_t, u_long); static int bpf_getdltlist(struct bpf_d *, caddr_t __bidi_indexable, struct proc *); -static int bpf_setdlt(struct bpf_d *, u_int); +static int bpf_setdlt(struct bpf_d *, u_int, struct proc *); static int bpf_set_traffic_class(struct bpf_d *, int); static void bpf_set_packet_service_class(struct mbuf *, int); @@ -769,7 +769,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) * Return 1 if was closed by some thread, 0 otherwise */ static int -bpf_detachd(struct bpf_d *d) +bpf_detachd(struct bpf_d *d, struct proc *proc) { struct bpf_d **p; struct bpf_if *bp; @@ -878,9 +878,9 @@ bpf_detachd(struct bpf_d *d) /* Refresh the local variable as d could have been modified */ bpf_closed = d->bd_flags & BPF_CLOSING; - os_log(OS_LOG_DEFAULT, "bpf%d%s detached from %s fcount %llu dcount %llu", + os_log(OS_LOG_DEFAULT, "bpf%d%s detached from %s fcount %llu dcount %llu by %s:%u", d->bd_dev_minor, bpf_closed ? " closed and" : "", if_name(ifp), - d->bd_fcount, d->bd_dcount); + d->bd_fcount, d->bd_dcount, proc_name_address(proc), proc_pid(proc)); /* * Note that We've kept the reference because we may have dropped @@ -995,7 +995,7 @@ bpf_release_d(struct bpf_d *d) /* ARGSUSED */ int bpfopen(dev_t dev, int flags, __unused int fmt, - struct proc *p) + struct proc *proc) { struct bpf_d *d; @@ -1068,9 +1068,11 @@ bpfopen(dev_t dev, int flags, __unused int fmt, return ENOMEM; } - d->bd_opened_by = p; + + /* Use the proc pointer for comparaison so no need to take a reference */ + d->bd_opened_by = proc; uuid_generate(d->bd_uuid); - d->bd_pid = proc_pid(p); + d->bd_pid = proc_pid(proc); d->bd_dev_minor = minor(dev); bpf_dtab[minor(dev)] = d; /* Mark opened */ @@ -1078,7 +1080,7 @@ bpfopen(dev_t dev, int flags, __unused int fmt, if (bpf_debug) { os_log(OS_LOG_DEFAULT, "bpf%u opened by %s.%u", - d->bd_dev_minor, proc_name_address(p), d->bd_pid); + d->bd_dev_minor, proc_name_address(proc), d->bd_pid); } return 0; } @@ -1090,7 +1092,7 @@ bpfopen(dev_t dev, int flags, __unused int fmt, /* ARGSUSED */ int bpfclose(dev_t dev, __unused int flags, __unused int fmt, - __unused struct proc *p) + struct proc *proc) { struct bpf_d *d; @@ -1171,7 +1173,7 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, } if (d->bd_bif) { - bpf_detachd(d); + bpf_detachd(d, proc); } selthreadclear(&d->bd_sel); thread_call_free(d->bd_thread_call); @@ -1183,7 +1185,7 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, if (bpf_debug) { os_log(OS_LOG_DEFAULT, "bpf%u closed by %s.%u dcount %llu fcount %llu ccount %llu", - d->bd_dev_minor, proc_name_address(p), d->bd_pid, + d->bd_dev_minor, proc_name_address(proc), proc_pid(proc), d->bd_dcount, d->bd_fcount, d->bd_bcs.bcs_count_compressed_prefix); } @@ -1789,7 +1791,7 @@ bpf_get_device_from_uuid(uuid_t uuid) * the BPF global lock */ static int -bpf_setup(struct bpf_d *d_to, uuid_t uuid_from, ifnet_t ifp) +bpf_setup(struct bpf_d *d_to, uuid_t uuid_from, ifnet_t ifp, struct proc *proc) { struct bpf_d *d_from; int error = 0; @@ -1915,7 +1917,7 @@ bpf_setup(struct bpf_d *d_to, uuid_t uuid_from, ifnet_t ifp) * - we 
already prevent reads and writes * - the buffers are already allocated */ - error = bpf_setif(d_to, ifp, false, true, true); + error = bpf_setif(d_to, ifp, false, true, true, proc); if (error != 0) { os_log_error(OS_LOG_DEFAULT, "%s: bpf_setif() failed error %d", @@ -1992,7 +1994,7 @@ done: X(BIOCSNOTSTAMP) static void -log_bpf_ioctl_str(struct bpf_d *d, u_long cmd) +log_bpf_ioctl_str(struct bpf_d *d, u_long cmd, int error) { const char *p = NULL; char str[32]; @@ -2006,8 +2008,8 @@ log_bpf_ioctl_str(struct bpf_d *d, u_long cmd) snprintf(str, sizeof(str), "0x%08x", (unsigned int)cmd); p = str; } - os_log(OS_LOG_DEFAULT, "bpfioctl bpf%u %s", - d->bd_dev_minor, p); + os_log(OS_LOG_DEFAULT, "bpfioctl bpf%u %s error: %d", + d->bd_dev_minor, p, error); } #endif /* DEVELOPMENT || DEBUG */ @@ -2039,7 +2041,7 @@ log_bpf_ioctl_str(struct bpf_d *d, u_long cmd) /* ARGSUSED */ int bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, - __unused int flags, struct proc *p) + __unused int flags, struct proc *proc) { struct bpf_d *d; int error = 0; @@ -2061,12 +2063,6 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, } d->bd_state = BPF_IDLE; -#if DEVELOPMENT || DEBUG - if (bpf_debug > 0) { - log_bpf_ioctl_str(d, cmd); - } -#endif /* DEVELOPMENT || DEBUG */ - switch (cmd) { default: error = EINVAL; @@ -2103,7 +2099,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, * Get buffer len [for read()]. */ case BIOCGBLEN: { /* u_int */ - _CASSERT(sizeof(d->bd_bufsize) == sizeof(u_int)); + static_assert(sizeof(d->bd_bufsize) == sizeof(u_int)); bcopy(&d->bd_bufsize, addr, sizeof(u_int)); break; } @@ -2212,7 +2208,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, if (d->bd_bif == 0) { error = EINVAL; } else { - _CASSERT(sizeof(d->bd_bif->bif_dlt) == sizeof(u_int)); + static_assert(sizeof(d->bd_bif->bif_dlt) == sizeof(u_int)); bcopy(&d->bd_bif->bif_dlt, addr, sizeof(u_int)); } break; @@ -2225,7 +2221,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, if (d->bd_bif == NULL) { error = EINVAL; } else { - error = bpf_getdltlist(d, addr, p); + error = bpf_getdltlist(d, addr, proc); } break; @@ -2244,7 +2240,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, !(d->bd_flags & BPF_WANT_PKTAP)) { dlt = DLT_RAW; } - error = bpf_setdlt(d, dlt); + error = bpf_setdlt(d, dlt, proc); } break; @@ -2275,7 +2271,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, if (ifp == NULL) { error = ENXIO; } else { - error = bpf_setif(d, ifp, true, false, false); + error = bpf_setif(d, ifp, true, false, false, proc); } break; } @@ -2471,7 +2467,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, * Get traffic service class */ case BIOCGETTC: { /* int */ - _CASSERT(sizeof(d->bd_traffic_class) == sizeof(int)); + static_assert(sizeof(d->bd_traffic_class) == sizeof(int)); bcopy(&d->bd_traffic_class, addr, sizeof(int)); break; } @@ -2480,7 +2476,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, break; case FIOASYNC: { /* Send signal on receive packets; int */ - _CASSERT(sizeof(d->bd_async) == sizeof(int)); + static_assert(sizeof(d->bd_async) == sizeof(int)); bcopy(addr, &d->bd_async, sizeof(int)); break; } @@ -2498,7 +2494,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, break; } case BIOCGRSIG: { /* u_int */ - _CASSERT(sizeof(d->bd_sig) == sizeof(u_int)); + 
static_assert(sizeof(d->bd_sig) == sizeof(u_int)); bcopy(&d->bd_sig, addr, sizeof(u_int)); break; } @@ -2593,7 +2589,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, break; } - error = bpf_setup(d, bsa.bsa_uuid, ifp); + error = bpf_setup(d, bsa.bsa_uuid, ifp, proc); break; } case BIOCSPKTHDRV2: @@ -2744,6 +2740,13 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) addr, break; } +#if DEVELOPMENT || DEBUG + if (bpf_debug > 0) { + log_bpf_ioctl_str(d, cmd, error); + } +#endif /* DEVELOPMENT || DEBUG */ + + bpf_release_d(d); lck_mtx_unlock(bpf_mlock); @@ -2818,7 +2821,7 @@ bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns, */ static int bpf_setif(struct bpf_d *d, ifnet_t theywant, bool do_reset, bool has_hbuf_read_write, - bool has_bufs_allocated) + bool has_bufs_allocated, struct proc *proc) { struct bpf_if *bp; int error; @@ -2867,7 +2870,7 @@ bpf_setif(struct bpf_d *d, ifnet_t theywant, bool do_reset, bool has_hbuf_read_w */ if (bp != d->bd_bif) { if (d->bd_bif != NULL) { - if (bpf_detachd(d) != 0) { + if (bpf_detachd(d, proc) != 0) { return ENXIO; } } @@ -2878,8 +2881,8 @@ bpf_setif(struct bpf_d *d, ifnet_t theywant, bool do_reset, bool has_hbuf_read_w if (do_reset) { reset_d(d); } - os_log(OS_LOG_DEFAULT, "bpf%u attached to %s", - d->bd_dev_minor, if_name(theywant)); + os_log(OS_LOG_DEFAULT, "bpf%u attached to %s by %s:%u", + d->bd_dev_minor, if_name(theywant), proc_name_address(proc), proc_pid(proc)); return 0; } /* Not found. */ @@ -2890,7 +2893,7 @@ bpf_setif(struct bpf_d *d, ifnet_t theywant, bool do_reset, bool has_hbuf_read_w * Get a list of available data link type of the interface. */ static int -bpf_getdltlist(struct bpf_d *d, caddr_t __bidi_indexable addr, struct proc *p) +bpf_getdltlist(struct bpf_d *d, caddr_t __bidi_indexable addr, struct proc *proc) { u_int n; int error; @@ -2900,7 +2903,7 @@ bpf_getdltlist(struct bpf_d *d, caddr_t __bidi_indexable addr, struct proc *p) struct bpf_dltlist bfl; bcopy(addr, &bfl, sizeof(bfl)); - if (proc_is64bit(p)) { + if (proc_is64bit(proc)) { dlist = (user_addr_t)bfl.bfl_u.bflu_pad; } else { dlist = CAST_USER_ADDR_T(bfl.bfl_u.bflu_list); @@ -2943,7 +2946,7 @@ bpf_getdltlist(struct bpf_d *d, caddr_t __bidi_indexable addr, struct proc *p) * Set the data link type of a BPF instance. */ static int -bpf_setdlt(struct bpf_d *d, uint32_t dlt) +bpf_setdlt(struct bpf_d *d, uint32_t dlt, struct proc *proc) { int error, opromisc; struct ifnet *ifp; @@ -2976,7 +2979,7 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt) } if (bp != NULL) { opromisc = d->bd_promisc; - if (bpf_detachd(d) != 0) { + if (bpf_detachd(d, proc) != 0) { return ENXIO; } error = bpf_attachd(d, bp); @@ -3036,7 +3039,7 @@ bpf_set_packet_service_class(struct mbuf *m, int tc) * Otherwise, return false but make a note that a selwakeup() must be done. */ int -bpfselect(dev_t dev, int which, void * wql, struct proc *p) +bpfselect(dev_t dev, int which, void * wql, struct proc *proc) { struct bpf_d *d; int ret = 0; @@ -3080,7 +3083,7 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p) * Make the select wait, and start a timer if * necessary. 
*/ - selrecord(p, &d->bd_sel, wql); + selrecord(proc, &d->bd_sel, wql); bpf_start_timer(d); } break; @@ -4123,6 +4126,10 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt, if (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF) { mbuf_ref_t m = pkt->bpfp_mbuf; + if (m->m_pkthdr.pkt_ext_flags & PKTF_EXT_ULPN) { + ehp->bh_pktflags |= BPF_PKTFLAGS_ULPN; + } + if (outbound) { /* only do lookups on non-raw INPCB */ if ((m->m_pkthdr.pkt_flags & (PKTF_FLOW_ID | @@ -4163,6 +4170,10 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt, kern_packet_t kern_pkt = pkt->bpfp_pkt; packet_flowid_t flowid = 0; + if (kern_packet_get_ulpn_flag(kern_pkt)) { + ehp->bh_pktflags |= BPF_PKTFLAGS_ULPN; + } + if (outbound) { /* * Note: pp_init() asserts that kern_packet_svc_class_t is equivalent @@ -4517,7 +4528,7 @@ bpfdetach(struct ifnet *ifp) msleep((caddr_t)d, bpf_mlock, PRINET, "bpfdetach", NULL); } - bpf_detachd(d); + bpf_detachd(d, current_proc()); bpf_wakeup(d); bpf_release_d(d); } @@ -4533,11 +4544,10 @@ bpf_init(__unused void *unused) int maj; /* bpf_comp_hdr is an overlay of bpf_hdr */ - _CASSERT(BPF_WORDALIGN(sizeof(struct bpf_hdr)) == - BPF_WORDALIGN(sizeof(struct bpf_comp_hdr))); + static_assert(BPF_WORDALIGN(sizeof(struct bpf_hdr)) == BPF_WORDALIGN(sizeof(struct bpf_comp_hdr))); /* compression length must fits in a byte */ - _CASSERT(BPF_HDR_COMP_LEN_MAX <= UCHAR_MAX ); + static_assert(BPF_HDR_COMP_LEN_MAX <= UCHAR_MAX); (void) PE_parse_boot_argn("bpf_hdr_comp", &bpf_hdr_comp_enable, sizeof(bpf_hdr_comp_enable)); diff --git a/bsd/net/bpf.h b/bsd/net/bpf.h index ab3641d06..2d662f580 100644 --- a/bsd/net/bpf.h +++ b/bsd/net/bpf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2022 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,22 +80,13 @@ #include #if !defined(DRIVERKIT) +#include #include #include #include #include #include -#ifdef PRIVATE -#include -#include - -struct bpf_setup_args { - uuid_t bsa_uuid; - char bsa_ifname[IFNAMSIZ]; -}; -#endif /* PRIVATE */ - #ifdef KERNEL #include @@ -147,23 +138,6 @@ struct bpf_program { struct bpf_insn *bf_insns; }; -#ifdef KERNEL_PRIVATE -/* - * LP64 version of bpf_program. all pointers - * grow when we're dealing with a 64-bit process. - * WARNING - keep in sync with bpf_program - */ -struct bpf_program64 { - u_int bf_len; - user64_addr_t bf_insns __attribute__((aligned(8))); -}; - -struct bpf_program32 { - u_int bf_len; - user32_addr_t bf_insns; -}; -#endif /* KERNEL_PRIVATE */ - /* * Struct returned by BIOCGSTATS. 
*/ @@ -188,18 +162,6 @@ struct bpf_version { u_short bv_minor; }; -#ifdef PRIVATE -struct bpf_comp_stats { - uint64_t bcs_total_read; /* number of packets read from device */ - uint64_t bcs_total_size; /* total size of filtered packets */ - uint64_t bcs_total_hdr_size; /* total header size of captured packets */ - uint64_t bcs_count_no_common_prefix; /* count of packets not compressible */ - uint64_t bcs_count_compressed_prefix; /* count of compressed packets */ - uint64_t bcs_total_compressed_prefix_size; /* total size of compressed data */ - uint64_t bcs_max_compressed_prefix_size; /* max compressed data size */ -}; -#endif /* PRIVATE */ - #if defined(__LP64__) #include @@ -214,25 +176,13 @@ struct bpf_comp_stats { #define BIOCGBLEN _IOR('B',102, u_int) #define BIOCSBLEN _IOWR('B',102, u_int) #define BIOCSETF _IOW('B',103, struct bpf_program) -#ifdef KERNEL_PRIVATE -#define BIOCSETF64 _IOW('B',103, struct bpf_program64) -#define BIOCSETF32 _IOW('B',103, struct bpf_program32) -#endif /* KERNEL_PRIVATE */ #define BIOCFLUSH _IO('B',104) #define BIOCPROMISC _IO('B',105) #define BIOCGDLT _IOR('B',106, u_int) #define BIOCGETIF _IOR('B',107, struct ifreq) #define BIOCSETIF _IOW('B',108, struct ifreq) #define BIOCSRTIMEOUT _IOW('B',109, struct timeval) -#ifdef KERNEL_PRIVATE -#define BIOCSRTIMEOUT64 _IOW('B',109, struct user64_timeval) -#define BIOCSRTIMEOUT32 _IOW('B',109, struct user32_timeval) -#endif /* KERNEL_PRIVATE */ #define BIOCGRTIMEOUT _IOR('B',110, struct timeval) -#ifdef KERNEL_PRIVATE -#define BIOCGRTIMEOUT64 _IOR('B',110, struct user64_timeval) -#define BIOCGRTIMEOUT32 _IOR('B',110, struct user32_timeval) -#endif /* KERNEL_PRIVATE */ #define BIOCGSTATS _IOR('B',111, struct bpf_stat) #define BIOCIMMEDIATE _IOW('B',112, u_int) #define BIOCVERSION _IOR('B',113, struct bpf_version) @@ -244,41 +194,9 @@ struct bpf_comp_stats { #define BIOCSSEESENT _IOW('B',119, u_int) #define BIOCSDLT _IOW('B',120, u_int) #define BIOCGDLTLIST _IOWR('B',121, struct bpf_dltlist) -#ifdef PRIVATE -#define BIOCGETTC _IOR('B', 122, int) -#define BIOCSETTC _IOW('B', 123, int) -#define BIOCSEXTHDR _IOW('B', 124, u_int) -#define BIOCGIFATTACHCOUNT _IOWR('B', 125, struct ifreq) -#endif /* PRIVATE */ #define BIOCSETFNR _IOW('B', 126, struct bpf_program) -#ifdef KERNEL_PRIVATE -#define BIOCSETFNR64 _IOW('B',126, struct bpf_program64) -#define BIOCSETFNR32 _IOW('B',126, struct bpf_program32) -#endif /* KERNEL_PRIVATE */ #ifdef PRIVATE -#define BIOCGWANTPKTAP _IOR('B', 127, u_int) -#define BIOCSWANTPKTAP _IOWR('B', 127, u_int) -#define BIOCSHEADDROP _IOW('B', 128, int) -#define BIOCGHEADDROP _IOR('B', 128, int) -#define BIOCSTRUNCATE _IOW('B', 129, u_int) -#define BIOCGETUUID _IOR('B', 130, uuid_t) -#define BIOCSETUP _IOW('B', 131, struct bpf_setup_args) -#define BIOCSPKTHDRV2 _IOW('B', 132, int) -#define BIOCGPKTHDRV2 _IOW('B', 133, int) -#define BIOCGHDRCOMP _IOR('B', 134, int) -#define BIOCSHDRCOMP _IOW('B', 135, int) -#define BIOCGHDRCOMPSTATS _IOR('B', 136, struct bpf_comp_stats) -#define BIOCGHDRCOMPON _IOR('B', 137, int) -#define BIOCGDIRECTION _IOR('B', 138, int) -#define BIOCSDIRECTION _IOW('B', 139, int) -#define BIOCSWRITEMAX _IOW('B', 140, u_int) -#define BIOCGWRITEMAX _IOR('B', 141, u_int) -#define BIOCGBATCHWRITE _IOR('B', 142, int) -#define BIOCSBATCHWRITE _IOW('B', 143, int) -#define BIOCGNOTSTAMP _IOR('B', 144, int) -#define BIOCSNOTSTAMP _IOW('B', 145, int) -#define BIOCGDVRTIN _IOR('B', 146, int) -#define BIOCSDVRTIN _IOW('B', 146, int) +/* See bpf_private.h for additional ioctls */ #endif /* 
PRIVATE */ /* @@ -300,121 +218,6 @@ struct bpf_hdr { #define SIZEOF_BPF_HDR (sizeof(struct bpf_hdr) <= 20 ? 18 : \ sizeof(struct bpf_hdr)) #endif -#ifdef PRIVATE -/* - * This structure must be a multiple of 4 bytes. - * It includes padding and spare fields that we can use later if desired. - */ -struct bpf_hdr_ext { - struct BPF_TIMEVAL bh_tstamp; /* time stamp */ - bpf_u_int32 bh_caplen; /* length of captured portion */ - bpf_u_int32 bh_datalen; /* original length of packet */ - u_short bh_hdrlen; /* length of bpf header */ - u_char bh_complen; - u_char bh_flags; -#define BPF_HDR_EXT_FLAGS_DIR_IN 0x00 -#define BPF_HDR_EXT_FLAGS_DIR_OUT 0x01 -#ifdef BSD_KERNEL_PRIVATE -#define BPF_HDR_EXT_FLAGS_TCP 0x02 -#define BPF_HDR_EXT_FLAGS_UDP 0x04 -#endif /* BSD_KERNEL_PRIVATE */ - pid_t bh_pid; /* process PID */ - char bh_comm[MAXCOMLEN + 1]; /* process command */ - u_char bh_pktflags; -#define BPF_PKTFLAGS_TCP_REXMT 0x01 -#define BPF_PKTFLAGS_START_SEQ 0x02 -#define BPF_PKTFLAGS_LAST_PKT 0x04 -#define BPF_PKTFLAGS_WAKE_PKT 0x08 - uint16_t bh_trace_tag; - bpf_u_int32 bh_svc; /* service class */ - bpf_u_int32 bh_flowid; /* kernel reserved; 0 in userland */ - bpf_u_int32 bh_unsent_bytes; /* unsent bytes at interface */ - bpf_u_int32 bh_unsent_snd; /* unsent bytes at socket buffer */ - bpf_u_int32 bh_comp_gencnt; /* unsent bytes at socket buffer */ -}; - -#define BPF_HDR_EXT_HAS_TRACE_TAG 1 -#define BPF_HDR_EXT_HAS_COMP_GENCNT 1 - -/* - * External representation of the bpf descriptor - */ -struct xbpf_d { - uint32_t bd_structsize; /* Size of this structure. */ - int32_t bd_dev_minor; - int32_t bd_sig; - uint32_t bd_slen; - uint32_t bd_hlen; - uint32_t bd_bufsize; - pid_t bd_pid; - - uint8_t bd_promisc; - uint8_t bd_immediate; - uint8_t bd_hdrcmplt; - uint8_t bd_async; - - uint8_t bd_headdrop; - uint8_t bd_direction; - uint8_t bh_compreq; - uint8_t bh_compenabled; - - uint8_t bd_exthdr; - uint8_t bd_trunc; - uint8_t bd_pkthdrv2; - uint8_t bd_batch_write : 1; - uint8_t bd_divert_in : 1; - uint8_t bd_padding : 6; - - uint64_t bd_rcount; - uint64_t bd_dcount; - uint64_t bd_fcount; - uint64_t bd_wcount; - uint64_t bd_wdcount; - - char bd_ifname[IFNAMSIZ]; - - uint64_t bd_comp_count; - uint64_t bd_comp_size; - - uint32_t bd_scnt; /* number of packets in store buffer */ - uint32_t bd_hcnt; /* number of packets in hold buffer */ - - uint64_t bd_read_count; - uint64_t bd_fsize; -}; - -#ifndef bd_seesent -/* - * Code compatibility workaround so that old versions of network_cmds will continue to build - * even if netstat -B shows an incorrect value. 
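/*
 * A minimal capture sketch using only the public pieces that remain in bpf.h
 * after the split: the BIOC* ioctls listed above and the struct bpf_hdr record
 * walk with BPF_WORDALIGN. "/dev/bpf0" and "en0" are arbitrary choices and
 * error handling is reduced to err(3).
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <net/if.h>
#include <err.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/bpf0", O_RDWR);
	u_int bufsize = 0, on = 1;
	struct ifreq ifr;

	if (fd < 0)
		err(1, "open");
	if (ioctl(fd, BIOCGBLEN, &bufsize) == -1)	/* kernel-chosen buffer size */
		err(1, "BIOCGBLEN");
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "en0", sizeof(ifr.ifr_name));
	if (ioctl(fd, BIOCSETIF, &ifr) == -1)		/* attach to the interface */
		err(1, "BIOCSETIF");
	if (ioctl(fd, BIOCIMMEDIATE, &on) == -1)	/* wake the reader per packet */
		err(1, "BIOCIMMEDIATE");

	char *buf = malloc(bufsize);
	ssize_t nread = read(fd, buf, bufsize);

	for (char *p = buf; nread > 0 && p < buf + nread; ) {
		struct bpf_hdr *hp = (struct bpf_hdr *)(void *)p;
		/* bh_hdrlen bytes of header, then bh_caplen captured bytes */
		p += BPF_WORDALIGN(hp->bh_hdrlen + hp->bh_caplen);
	}
	free(buf);
	close(fd);
	return 0;
}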
- */ -#define bd_seesent bd_direction -#endif /* bd_seesent */ - -#define _HAS_STRUCT_XBPF_D_ 2 - -struct bpf_comp_hdr { - struct BPF_TIMEVAL bh_tstamp; /* time stamp */ - bpf_u_int32 bh_caplen; /* length of captured portion */ - bpf_u_int32 bh_datalen; /* original length of packet */ - u_short bh_hdrlen; /* length of bpf header (this struct - * plus alignment padding) */ - u_char bh_complen; /* data portion compressed */ - u_char bh_padding; /* data portion compressed */ -}; - -#define HAS_BPF_HDR_COMP 1 -#define BPF_HDR_COMP_LEN_MAX 255 - -/* - * Packet tap directions - */ -#define BPF_D_NONE 0x0 /* See no packet (for writing only) */ -#define BPF_D_IN 0x1 /* See incoming packets */ -#define BPF_D_OUT 0x2 /* See outgoing packets */ -#define BPF_D_INOUT 0x3 /* See incoming and outgoing packets */ - -#endif /* PRIVATE */ #endif /* !defined(DRIVERKIT) */ /* @@ -756,17 +559,6 @@ struct bpf_comp_hdr { #define DLT_USER14 161 #define DLT_USER15 162 -#ifdef PRIVATE -/* - * For Apple private usage - */ -#define DLT_USER0_APPLE_INTERNAL DLT_USER0 /* rdar://12019509 */ -#define DLT_USER1_APPLE_INTERNAL DLT_USER1 /* rdar://12019509 */ -#define DLT_PKTAP DLT_USER2 /* rdar://11779467 */ -#define DLT_USER3_APPLE_INTERNAL DLT_USER3 /* rdar://19614531 */ -#define DLT_USER4_APPLE_INTERNAL DLT_USER4 /* rdar://19614531 */ -#endif /* PRIVATE */ - /* * For future use with 802.11 captures - defined by AbsoluteValue * Systems to store a number of bits of link-layer information @@ -1451,51 +1243,6 @@ struct bpf_dltlist { #pragma pack() -#ifdef KERNEL_PRIVATE -#define BPF_MIN_PKT_SIZE 40 -#define PORT_DNS 53 -#define PORT_BOOTPS 67 -#define PORT_BOOTPC 68 -#define PORT_ISAKMP 500 -#define PORT_ISAKMP_NATT 4500 /* rfc3948 */ - -#define BPF_T_MICROTIME 0x0000 /* The default */ -#define BPF_T_NONE 0x0003 - -/* Forward declerations */ -struct ifnet; -struct mbuf; - -#define BPF_PACKET_TYPE_MBUF 0 -#if SKYWALK -#define BPF_PACKET_TYPE_PKT 1 -#include -#endif /* SKYWALK */ - -struct bpf_packet { - int bpfp_type; - void *__sized_by(bpfp_header_length) bpfp_header; /* optional */ - size_t bpfp_header_length; - union { - struct mbuf *bpfpu_mbuf; - void * bpfpu_ptr; -#if SKYWALK - kern_packet_t bpfpu_pkt; -#define bpfp_pkt bpfp_u.bpfpu_pkt -#endif /* SKYWALK */ - } bpfp_u; -#define bpfp_mbuf bpfp_u.bpfpu_mbuf -#define bpfp_ptr bpfp_u.bpfpu_ptr - size_t bpfp_total_length; /* length including optional header */ -}; - -extern int bpf_validate(const struct bpf_insn *__counted_by(len), int len); -extern void bpfdetach(struct ifnet *); -extern void bpfilterattach(int); -extern u_int bpf_filter(const struct bpf_insn *__counted_by(pc_len), u_int pc_len, - u_char *__sized_by(sizeof(struct bpf_packet)), u_int wirelen, u_int); -#endif /* KERNEL_PRIVATE */ - #endif /* !defined(DRIVERKIT) */ #if defined(DRIVERKIT) || defined(KERNEL) @@ -1619,36 +1366,10 @@ extern void bpf_tap_in(ifnet_t interface, u_int32_t dlt, mbuf_t packet, extern void bpf_tap_out(ifnet_t interface, u_int32_t dlt, mbuf_t packet, void *__sized_by(header_len) header, size_t header_len); -#if SKYWALK -/*! - * @function bpf_tap_packet_in - * @discussion Call this function when your interface receives a - * packet. This function will check if any bpf devices need a - * a copy of the packet. - * @param interface The interface the packet was received on. - * @param dlt The data link type of the packet. - * @param packet The packet received. - * @param header An optional pointer to a header that will be prepended. 
- * @param header_len If the header was specified, the length of the header. - */ -extern void bpf_tap_packet_in(ifnet_t interface, u_int32_t dlt, - kern_packet_t packet, void *__sized_by(header_len) header, size_t header_len); - -/*! - * @function bpf_tap_packet_out - * @discussion Call this function when your interface transmits a - * packet. This function will check if any bpf devices need a - * a copy of the packet. - * @param interface The interface the packet was or will be transmitted on. - * @param dlt The data link type of the packet. - * @param packet The packet received. - * @param header An optional pointer to a header that will be prepended. - * @param header_len If the header was specified, the length of the header. - */ -extern void bpf_tap_packet_out(ifnet_t interface, u_int32_t dlt, - kern_packet_t packet, void *__sized_by(header_len) header, size_t header_len); - -#endif /* SKYWALK */ #endif /* KERNEL */ +#if defined(PRIVATE) && !defined(MODULES_SUPPORTED) +#include +#endif /* PRIVATE && !MODULES_SUPPORTED */ + #endif /* _NET_BPF_H_ */ diff --git a/bsd/net/bpf_private.h b/bsd/net/bpf_private.h new file mode 100644 index 000000000..ddeadb32c --- /dev/null +++ b/bsd/net/bpf_private.h @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)bpf.h 8.1 (Berkeley) 6/10/93 + * @(#)bpf.h 1.34 (LBL) 6/16/96 + * + * $FreeBSD: src/sys/net/bpf.h,v 1.21.2.3 2001/08/01 00:23:13 fenner Exp $ + */ +/* + * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce + * support for mandatory and extensible security protections. This notice + * is included in support of clause 2.2 (b) of the Apple Public License, + * Version 2.0. + */ + +#ifndef _NET_BPF_PRIVATE_H_ +#define _NET_BPF_PRIVATE_H_ + +#include + +#if !defined(DRIVERKIT) +#include +#include + +struct bpf_setup_args { + uuid_t bsa_uuid; + char bsa_ifname[IFNAMSIZ]; +}; + +#ifdef KERNEL_PRIVATE +/* + * LP64 version of bpf_program. all pointers + * grow when we're dealing with a 64-bit process. 
+ * WARNING - keep in sync with bpf_program + */ +struct bpf_program64 { + u_int bf_len; + user64_addr_t bf_insns __attribute__((aligned(8))); +}; + +struct bpf_program32 { + u_int bf_len; + user32_addr_t bf_insns; +}; +#endif /* KERNEL_PRIVATE */ + +struct bpf_comp_stats { + uint64_t bcs_total_read; /* number of packets read from device */ + uint64_t bcs_total_size; /* total size of filtered packets */ + uint64_t bcs_total_hdr_size; /* total header size of captured packets */ + uint64_t bcs_count_no_common_prefix; /* count of packets not compressible */ + uint64_t bcs_count_compressed_prefix; /* count of compressed packets */ + uint64_t bcs_total_compressed_prefix_size; /* total size of compressed data */ + uint64_t bcs_max_compressed_prefix_size; /* max compressed data size */ +}; + +#ifdef KERNEL_PRIVATE +#define BIOCSETF64 _IOW('B',103, struct bpf_program64) +#define BIOCSETF32 _IOW('B',103, struct bpf_program32) +#define BIOCSRTIMEOUT64 _IOW('B',109, struct user64_timeval) +#define BIOCSRTIMEOUT32 _IOW('B',109, struct user32_timeval) +#define BIOCGRTIMEOUT64 _IOR('B',110, struct user64_timeval) +#define BIOCGRTIMEOUT32 _IOR('B',110, struct user32_timeval) +#endif /* KERNEL_PRIVATE */ +#define BIOCGETTC _IOR('B', 122, int) +#define BIOCSETTC _IOW('B', 123, int) +#define BIOCSEXTHDR _IOW('B', 124, u_int) +#define BIOCGIFATTACHCOUNT _IOWR('B', 125, struct ifreq) +#ifdef KERNEL_PRIVATE +#define BIOCSETFNR64 _IOW('B',126, struct bpf_program64) +#define BIOCSETFNR32 _IOW('B',126, struct bpf_program32) +#endif /* KERNEL_PRIVATE */ +#define BIOCGWANTPKTAP _IOR('B', 127, u_int) +#define BIOCSWANTPKTAP _IOWR('B', 127, u_int) +#define BIOCSHEADDROP _IOW('B', 128, int) +#define BIOCGHEADDROP _IOR('B', 128, int) +#define BIOCSTRUNCATE _IOW('B', 129, u_int) +#define BIOCGETUUID _IOR('B', 130, uuid_t) +#define BIOCSETUP _IOW('B', 131, struct bpf_setup_args) +#define BIOCSPKTHDRV2 _IOW('B', 132, int) +#define BIOCGPKTHDRV2 _IOW('B', 133, int) +#define BIOCGHDRCOMP _IOR('B', 134, int) +#define BIOCSHDRCOMP _IOW('B', 135, int) +#define BIOCGHDRCOMPSTATS _IOR('B', 136, struct bpf_comp_stats) +#define BIOCGHDRCOMPON _IOR('B', 137, int) +#define BIOCGDIRECTION _IOR('B', 138, int) +#define BIOCSDIRECTION _IOW('B', 139, int) +#define BIOCSWRITEMAX _IOW('B', 140, u_int) +#define BIOCGWRITEMAX _IOR('B', 141, u_int) +#define BIOCGBATCHWRITE _IOR('B', 142, int) +#define BIOCSBATCHWRITE _IOW('B', 143, int) +#define BIOCGNOTSTAMP _IOR('B', 144, int) +#define BIOCSNOTSTAMP _IOW('B', 145, int) +#define BIOCGDVRTIN _IOR('B', 146, int) +#define BIOCSDVRTIN _IOW('B', 146, int) + +/* + * This structure must be a multiple of 4 bytes. + * It includes padding and spare fields that we can use later if desired. 
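/*
 * Why the bpf_program64/bpf_program32 shadow layouts above exist: the public
 * struct bpf_program embeds a pointer, so a 32-bit and a 64-bit caller hand
 * the kernel differently sized structures for the same ioctl code. A rough
 * sketch of the normalization the ioctl path has to perform; this is
 * illustrative only (the real conversion lives in bpf.c) and assumes it sits
 * in a kernel source file with the usual BSD headers already included.
 */
static int
copyin_user_bpf_program(user_addr_t uaddr, boolean_t is64,
    u_int *out_len, user_addr_t *out_insns)
{
	int error;

	if (is64) {
		struct bpf_program64 prg64;

		error = copyin(uaddr, &prg64, sizeof(prg64));
		if (error != 0)
			return error;
		*out_len = prg64.bf_len;
		*out_insns = prg64.bf_insns;		/* still a user pointer */
	} else {
		struct bpf_program32 prg32;

		error = copyin(uaddr, &prg32, sizeof(prg32));
		if (error != 0)
			return error;
		*out_len = prg32.bf_len;
		*out_insns = CAST_USER_ADDR_T(prg32.bf_insns);
	}
	return 0;
}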
+ */ +struct bpf_hdr_ext { + struct BPF_TIMEVAL bh_tstamp; /* time stamp */ + bpf_u_int32 bh_caplen; /* length of captured portion */ + bpf_u_int32 bh_datalen; /* original length of packet */ + u_short bh_hdrlen; /* length of bpf header */ + u_char bh_complen; + u_char bh_flags; +#define BPF_HDR_EXT_FLAGS_DIR_IN 0x00 +#define BPF_HDR_EXT_FLAGS_DIR_OUT 0x01 +#ifdef BSD_KERNEL_PRIVATE +#define BPF_HDR_EXT_FLAGS_TCP 0x02 +#define BPF_HDR_EXT_FLAGS_UDP 0x04 +#endif /* BSD_KERNEL_PRIVATE */ + pid_t bh_pid; /* process PID */ + char bh_comm[MAXCOMLEN + 1]; /* process command */ + u_char bh_pktflags; +#define BPF_PKTFLAGS_TCP_REXMT 0x01 +#define BPF_PKTFLAGS_START_SEQ 0x02 +#define BPF_PKTFLAGS_LAST_PKT 0x04 +#define BPF_PKTFLAGS_WAKE_PKT 0x08 +#define BPF_PKTFLAGS_ULPN 0x10 + uint16_t bh_trace_tag; + bpf_u_int32 bh_svc; /* service class */ + bpf_u_int32 bh_flowid; /* kernel reserved; 0 in userland */ + bpf_u_int32 bh_unsent_bytes; /* unsent bytes at interface */ + bpf_u_int32 bh_unsent_snd; /* unsent bytes at socket buffer */ + bpf_u_int32 bh_comp_gencnt; /* unsent bytes at socket buffer */ +}; + +#define BPF_HDR_EXT_HAS_TRACE_TAG 1 +#define BPF_HDR_EXT_HAS_COMP_GENCNT 1 + +/* + * External representation of the bpf descriptor + */ +struct xbpf_d { + uint32_t bd_structsize; /* Size of this structure. */ + int32_t bd_dev_minor; + int32_t bd_sig; + uint32_t bd_slen; + uint32_t bd_hlen; + uint32_t bd_bufsize; + pid_t bd_pid; + + uint8_t bd_promisc; + uint8_t bd_immediate; + uint8_t bd_hdrcmplt; + uint8_t bd_async; + + uint8_t bd_headdrop; + uint8_t bd_direction; + uint8_t bh_compreq; + uint8_t bh_compenabled; + + uint8_t bd_exthdr; + uint8_t bd_trunc; + uint8_t bd_pkthdrv2; + uint8_t bd_batch_write : 1; + uint8_t bd_divert_in : 1; + uint8_t bd_padding : 6; + + uint64_t bd_rcount; + uint64_t bd_dcount; + uint64_t bd_fcount; + uint64_t bd_wcount; + uint64_t bd_wdcount; + + char bd_ifname[IFNAMSIZ]; + + uint64_t bd_comp_count; + uint64_t bd_comp_size; + + uint32_t bd_scnt; /* number of packets in store buffer */ + uint32_t bd_hcnt; /* number of packets in hold buffer */ + + uint64_t bd_read_count; + uint64_t bd_fsize; +}; + +#ifndef bd_seesent +/* + * Code compatibility workaround so that old versions of network_cmds will continue to build + * even if netstat -B shows an incorrect value. 
+ */ +#define bd_seesent bd_direction +#endif /* bd_seesent */ + +#define _HAS_STRUCT_XBPF_D_ 2 + +struct bpf_comp_hdr { + struct BPF_TIMEVAL bh_tstamp; /* time stamp */ + bpf_u_int32 bh_caplen; /* length of captured portion */ + bpf_u_int32 bh_datalen; /* original length of packet */ + u_short bh_hdrlen; /* length of bpf header (this struct + * plus alignment padding) */ + u_char bh_complen; /* data portion compressed */ + u_char bh_padding; /* data portion compressed */ +}; + +#define HAS_BPF_HDR_COMP 1 +#define BPF_HDR_COMP_LEN_MAX 255 + +/* + * Packet tap directions + */ +#define BPF_D_NONE 0x0 /* See no packet (for writing only) */ +#define BPF_D_IN 0x1 /* See incoming packets */ +#define BPF_D_OUT 0x2 /* See outgoing packets */ +#define BPF_D_INOUT 0x3 /* See incoming and outgoing packets */ + +#endif /* !defined(DRIVERKIT) */ + +/* + * For Apple private usage + */ +#define DLT_USER0_APPLE_INTERNAL DLT_USER0 /* rdar://12019509 */ +#define DLT_USER1_APPLE_INTERNAL DLT_USER1 /* rdar://12019509 */ +#define DLT_PKTAP DLT_USER2 /* rdar://11779467 */ +#define DLT_USER3_APPLE_INTERNAL DLT_USER3 /* rdar://19614531 */ +#define DLT_USER4_APPLE_INTERNAL DLT_USER4 /* rdar://19614531 */ + +#if !defined(DRIVERKIT) +#ifdef KERNEL_PRIVATE +#define BPF_MIN_PKT_SIZE 40 +#define PORT_DNS 53 +#define PORT_BOOTPS 67 +#define PORT_BOOTPC 68 +#define PORT_ISAKMP 500 +#define PORT_ISAKMP_NATT 4500 /* rfc3948 */ + +#define BPF_T_MICROTIME 0x0000 /* The default */ +#define BPF_T_NONE 0x0003 + +/* Forward declerations */ +struct ifnet; +struct mbuf; + +#define BPF_PACKET_TYPE_MBUF 0 +#if SKYWALK +#define BPF_PACKET_TYPE_PKT 1 +#include +#endif /* SKYWALK */ + +struct bpf_packet { + int bpfp_type; + void *__sized_by(bpfp_header_length) bpfp_header; /* optional */ + size_t bpfp_header_length; + union { + struct mbuf *bpfpu_mbuf; + void * bpfpu_ptr; +#if SKYWALK + kern_packet_t bpfpu_pkt; +#define bpfp_pkt bpfp_u.bpfpu_pkt +#endif /* SKYWALK */ + } bpfp_u; +#define bpfp_mbuf bpfp_u.bpfpu_mbuf +#define bpfp_ptr bpfp_u.bpfpu_ptr + size_t bpfp_total_length; /* length including optional header */ +}; + +extern int bpf_validate(const struct bpf_insn *__counted_by(len), int len); +extern void bpfdetach(struct ifnet *); +extern void bpfilterattach(int); +extern u_int bpf_filter(const struct bpf_insn *__counted_by(pc_len), u_int pc_len, + u_char *__sized_by(sizeof(struct bpf_packet)), u_int wirelen, u_int); +#endif /* KERNEL_PRIVATE */ + +#endif /* !defined(DRIVERKIT) */ + +#ifdef KERNEL +#if SKYWALK +/*! + * @function bpf_tap_packet_in + * @discussion Call this function when your interface receives a + * packet. This function will check if any bpf devices need a + * a copy of the packet. + * @param interface The interface the packet was received on. + * @param dlt The data link type of the packet. + * @param packet The packet received. + * @param header An optional pointer to a header that will be prepended. + * @param header_len If the header was specified, the length of the header. + */ +extern void bpf_tap_packet_in(ifnet_t interface, u_int32_t dlt, + kern_packet_t packet, void *__sized_by(header_len) header, size_t header_len); + +/*! + * @function bpf_tap_packet_out + * @discussion Call this function when your interface transmits a + * packet. This function will check if any bpf devices need a + * a copy of the packet. + * @param interface The interface the packet was or will be transmitted on. + * @param dlt The data link type of the packet. + * @param packet The packet received. 
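/*
 * The BPF_D_* values above pair with the private BIOCGDIRECTION and
 * BIOCSDIRECTION ioctls defined earlier in this header. A short sketch,
 * assuming `bpf_fd' is an already-attached descriptor and the build defines
 * PRIVATE:
 */
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <err.h>

static void
capture_inbound_only(int bpf_fd)
{
	int dir = BPF_D_IN;			/* see incoming packets only */

	if (ioctl(bpf_fd, BIOCSDIRECTION, &dir) == -1)
		err(1, "BIOCSDIRECTION");
	/* BPF_D_INOUT would restore capture of both directions */
}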
+ * @param header An optional pointer to a header that will be prepended. + * @param header_len If the header was specified, the length of the header. + */ +extern void bpf_tap_packet_out(ifnet_t interface, u_int32_t dlt, + kern_packet_t packet, void *__sized_by(header_len) header, size_t header_len); + +#endif /* SKYWALK */ +#endif /* KERNEL */ + +#endif /* _NET_BPF_PRIVATE_H_ */ diff --git a/bsd/net/classq/classq.h b/bsd/net/classq/classq.h index 261337694..6ae51c0dd 100644 --- a/bsd/net/classq/classq.h +++ b/bsd/net/classq/classq.h @@ -215,9 +215,6 @@ typedef struct _class_queue_ { #define CLASSQF_ECN (CLASSQF_ECN4 | CLASSQF_ECN6) extern u_int32_t classq_verbose; -#if DEBUG || DEVELOPMENT -extern uint32_t fq_codel_quantum; -#endif /* DEBUG || DEVELOPMENT */ SYSCTL_DECL(_net_classq); diff --git a/bsd/net/classq/classq_fq_codel.c b/bsd/net/classq/classq_fq_codel.c index 4ec10c810..0657b9780 100644 --- a/bsd/net/classq/classq_fq_codel.c +++ b/bsd/net/classq/classq_fq_codel.c @@ -117,20 +117,24 @@ SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, l4s_min_delay_threshold, void fq_codel_init(void) { - _CASSERT(AQM_KTRACE_AON_FLOW_HIGH_DELAY == 0x8300004); - _CASSERT(AQM_KTRACE_AON_THROTTLE == 0x8300008); - _CASSERT(AQM_KTRACE_AON_FLOW_OVERWHELMING == 0x830000c); - _CASSERT(AQM_KTRACE_AON_FLOW_DQ_STALL == 0x8300010); + static_assert(AQM_KTRACE_AON_FLOW_HIGH_DELAY == 0x8300004); + static_assert(AQM_KTRACE_AON_THROTTLE == 0x8300008); + static_assert(AQM_KTRACE_AON_FLOW_OVERWHELMING == 0x830000c); + static_assert(AQM_KTRACE_AON_FLOW_DQ_STALL == 0x8300010); - _CASSERT(AQM_KTRACE_STATS_FLOW_ENQUEUE == 0x8310004); - _CASSERT(AQM_KTRACE_STATS_FLOW_DEQUEUE == 0x8310008); - _CASSERT(AQM_KTRACE_STATS_FLOW_CTL == 0x831000c); - _CASSERT(AQM_KTRACE_STATS_FLOW_ALLOC == 0x8310010); - _CASSERT(AQM_KTRACE_STATS_FLOW_DESTROY == 0x8310014); - _CASSERT(AQM_KTRACE_STATS_FLOW_REPORT_CE == 0x8310018); - _CASSERT(AQM_KTRACE_STATS_GET_QLEN == 0x831001c); - _CASSERT(AQM_KTRACE_TX_NOT_READY == 0x8310020); - _CASSERT(AQM_KTRACE_TX_PACEMAKER == 0x8310024); + static_assert(AQM_KTRACE_STATS_FLOW_ENQUEUE == 0x8310004); + static_assert(AQM_KTRACE_STATS_FLOW_DEQUEUE == 0x8310008); + static_assert(AQM_KTRACE_STATS_FLOW_CTL == 0x831000c); + static_assert(AQM_KTRACE_STATS_FLOW_ALLOC == 0x8310010); + static_assert(AQM_KTRACE_STATS_FLOW_DESTROY == 0x8310014); + static_assert(AQM_KTRACE_STATS_FLOW_REPORT_CE == 0x8310018); + static_assert(AQM_KTRACE_STATS_GET_QLEN == 0x831001c); + static_assert(AQM_KTRACE_TX_NOT_READY == 0x8310020); + static_assert(AQM_KTRACE_TX_PACEMAKER == 0x8310024); + static_assert(AQM_KTRACE_PKT_DROP == 0x8310028); + static_assert(AQM_KTRACE_OK_TO_DROP == 0x831002c); + static_assert(AQM_KTRACE_CONGESTION_INC == 0x8310030); + static_assert(AQM_KTRACE_CONGESTION_NOTIFIED == 0x8310034); } fq_t * @@ -327,9 +331,10 @@ fq_compressor(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl, } int -fq_addq(fq_if_t *fqs, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt, +fq_codel_enq_legacy(void *fqs_p, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl) { + fq_if_t *fqs = (fq_if_t *)fqs_p; int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS; u_int64_t now; fq_t *fq = NULL; @@ -366,7 +371,7 @@ fq_addq(fq_if_t *fqs, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt, __builtin_unreachable(); } - if (ifclassq_enable_l4s) { + if (fq_codel_enable_l4s) { tfc_type = pktsched_is_pkt_l4s(pkt) ? 
FQ_TFC_L4S : FQ_TFC_C; } @@ -378,7 +383,7 @@ fq_addq(fq_if_t *fqs, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt, /* find the flowq for this packet */ fq = fq_if_hash_pkt(fqs, fq_grp, pkt_flowid, pktsched_get_pkt_svc(pkt), - now, true, tfc_type); + now, pkt_proto, pkt_flowsrc, tfc_type, true); if (__improbable(fq == NULL)) { DTRACE_IP1(memfail__drop, fq_if_t *, fqs); /* drop the packet if we could not allocate a flow queue */ @@ -398,7 +403,7 @@ fq_addq(fq_if_t *fqs, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt, * Skip the dropping part if it's L4S. Flow control or ECN marking decision * will be made at dequeue time. */ - if (ifclassq_enable_l4s && tfc_type == FQ_TFC_L4S) { + if (fq_codel_enable_l4s && tfc_type == FQ_TFC_L4S) { fq_cl->fcl_stat.fcl_l4s_pkts += cnt; droptype = DTYPE_NODROP; } @@ -695,7 +700,7 @@ fq_tx_time_ready(fq_if_t *fqs, fq_t *fq, uint64_t now, uint64_t *ready_time) uint64_t pkt_tx_time; fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq); - if (!ifclassq_enable_pacing || !ifclassq_enable_l4s || fq->fq_tfc_type != FQ_TFC_L4S) { + if (!fq_codel_enable_pacing) { return TRUE; } @@ -724,8 +729,10 @@ fq_tx_time_ready(fq_if_t *fqs, fq_t *fq, uint64_t now, uint64_t *ready_time) } void -fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt, uint64_t now) +fq_codel_dq_legacy(void *fqs_p, void *fq_p, pktsched_pkt_t *pkt, uint64_t now) { + fq_if_t *fqs = (fq_if_t *)fqs_p; + fq_t *fq = (fq_t *)fq_p; fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq); int64_t qdelay = 0; volatile uint32_t *__single pkt_flags; @@ -743,7 +750,7 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt, uint64_t now) pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, &pkt_flowsrc, NULL, NULL, &pkt_tx_time); l4s_pkt = pktsched_is_pkt_l4s(pkt); - if (ifclassq_enable_pacing && ifclassq_enable_l4s) { + if (fq_codel_enable_pacing && fq_codel_enable_l4s) { if (pkt_tx_time > *pkt_timestamp) { pacing_delay = pkt_tx_time - *pkt_timestamp; fq_cl->fcl_stat.fcl_paced_pkts++; @@ -802,7 +809,7 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt, uint64_t now) } fq->fq_pkts_since_last_report++; - if (ifclassq_enable_l4s && l4s_pkt) { + if (fq_codel_enable_l4s && l4s_pkt) { /* * A safe guard to make sure that L4S is not going to build a huge * queue if we encounter unexpected problems (for eg., if ACKs don't @@ -824,7 +831,7 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt, uint64_t now) IFCQ_CONVERT_LOCK(fqs->fqs_ifq); if (__improbable(l4s_local_ce_report != 0) && (*pkt_flags & PKTF_FLOW_ADV) != 0 && - fq_if_report_ce(fqs, pkt, 1, fq->fq_pkts_since_last_report)) { + fq_if_report_congestion(fqs, pkt, 0, 1, fq->fq_pkts_since_last_report)) { fq->fq_pkts_since_last_report = 0; fq_cl->fcl_stat.fcl_ce_reported++; } else if (pktsched_mark_ecn(pkt) == 0) { @@ -892,3 +899,495 @@ fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt, uint64_t now) __builtin_unreachable(); } } + +int +fq_codel_enq(void *fqs_p, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt, + fq_if_classq_t *fq_cl) +{ + fq_if_t *fqs = (fq_if_t *)fqs_p; + int droptype = DTYPE_NODROP, ret = CLASSQEQ_SUCCESS; + u_int64_t now; + fq_t *fq = NULL; + uint64_t *__single pkt_timestamp; + volatile uint32_t *__single pkt_flags; + uint32_t pkt_flowid, cnt; + uint8_t pkt_proto, pkt_flowsrc; + fq_tfc_type_t tfc_type = FQ_TFC_C; + + cnt = pkt->pktsched_pcnt; + pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid, + &pkt_flowsrc, &pkt_proto, NULL, NULL); + + /* + * XXX Not walking the chain to set this flag on every packet. 
+ * This flag is only used for debugging. Nothing is affected if it's + * not set. + */ + switch (pkt->pktsched_ptype) { + case QP_MBUF: + /* See comments in */ + VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED)); + break; +#if SKYWALK + case QP_PACKET: + /* sanity check */ + ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0); + break; +#endif /* SKYWALK */ + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); + } + + if (fq_codel_enable_l4s) { + tfc_type = pktsched_is_pkt_l4s(pkt) ? FQ_TFC_L4S : FQ_TFC_C; + } + + /* + * Timestamps for every packet must be set prior to entering this path. + */ + now = *pkt_timestamp; + ASSERT(now > 0); + + /* find the flowq for this packet */ + fq = fq_if_hash_pkt(fqs, fq_grp, pkt_flowid, pktsched_get_pkt_svc(pkt), + now, pkt_proto, pkt_flowsrc, tfc_type, true); + if (__improbable(fq == NULL)) { + DTRACE_IP1(memfail__drop, fq_if_t *, fqs); + /* drop the packet if we could not allocate a flow queue */ + fq_cl->fcl_stat.fcl_drop_memfailure += cnt; + return CLASSQEQ_DROP; + } + VERIFY(fq->fq_group == fq_grp); + VERIFY(fqs->fqs_ptype == pkt->pktsched_ptype); + + KDBG(AQM_KTRACE_STATS_FLOW_ENQUEUE, fq->fq_flowhash, + AQM_KTRACE_FQ_GRP_SC_IDX(fq), + fq->fq_bytes, pktsched_get_pkt_len(pkt)); + + /* + * Skip the dropping part if it's L4S. Flow control or ECN marking decision + * will be made at dequeue time. + */ + if (fq_codel_enable_l4s && tfc_type == FQ_TFC_L4S) { + fq_cl->fcl_stat.fcl_l4s_pkts += cnt; + droptype = DTYPE_NODROP; + } + + /* + * If the queue length hits the queue limit, drop a chain with the + * same number of packets from the front of the queue for a flow with + * maximum number of bytes. This will penalize heavy and unresponsive + * flows. It will also avoid a tail drop. + */ + if (__improbable(droptype == DTYPE_NODROP && + fq_if_at_drop_limit(fqs))) { + uint32_t i; + + if (fqs->fqs_large_flow == fq) { + /* + * Drop from the head of the current fq. Since a + * new packet will be added to the tail, it is ok + * to leave fq in place. + */ + DTRACE_IP5(large__flow, fq_if_t *, fqs, + fq_if_classq_t *, fq_cl, fq_t *, fq, + pktsched_pkt_t *, pkt, uint32_t, cnt); + + for (i = 0; i < cnt; i++) { + fq_head_drop(fqs, fq); + } + fq_cl->fcl_stat.fcl_drop_overflow += cnt; + /* + * For UDP, flow control it here so that we won't waste too much + * CPU dropping packets. 
+ */ + if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) && + (*pkt_flags & PKTF_FLOW_ADV) && + (pkt_proto != IPPROTO_TCP) && + (pkt_proto != IPPROTO_QUIC)) { + if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) { + fq->fq_flags |= FQF_FLOWCTL_ON; + FQ_SET_OVERWHELMING(fq); + fq_cl->fcl_stat.fcl_overwhelming++; + /* deliver flow control advisory error */ + ret = CLASSQEQ_SUCCESS_FC; + } + } + } else { + if (fqs->fqs_large_flow == NULL) { + droptype = DTYPE_FORCED; + fq_cl->fcl_stat.fcl_drop_overflow += cnt; + ret = CLASSQEQ_DROP; + + DTRACE_IP5(no__large__flow, fq_if_t *, fqs, + fq_if_classq_t *, fq_cl, fq_t *, fq, + pktsched_pkt_t *, pkt, uint32_t, cnt); + + /* + * if this fq was freshly created and there + * is nothing to enqueue, move it to empty list + */ + if (fq_empty(fq, fqs->fqs_ptype) && + !(fq->fq_flags & (FQF_NEW_FLOW | + FQF_OLD_FLOW))) { + fq_if_move_to_empty_flow(fqs, fq_cl, + fq, now); + fq = NULL; + } + } else { + DTRACE_IP5(different__large__flow, + fq_if_t *, fqs, fq_if_classq_t *, fq_cl, + fq_t *, fq, pktsched_pkt_t *, pkt, + uint32_t, cnt); + + for (i = 0; i < cnt; i++) { + fq_if_drop_packet(fqs, now); + } + } + } + } + + fq_cl->fcl_flags &= ~FCL_PACED; + + if (__probable(droptype == DTYPE_NODROP)) { + uint32_t chain_len = pktsched_get_pkt_len(pkt); + int ret_compress = 0; + + /* + * We do not compress if we are enqueuing a chain. + * Traversing the chain to look for acks would defeat the + * purpose of batch enqueueing. + */ + if (cnt == 1) { + ret_compress = fq_compressor(fqs, fq, fq_cl, pkt); + if (ret_compress == CLASSQEQ_COMPRESSED) { + fq_cl->fcl_stat.fcl_pkts_compressed++; + } + } + DTRACE_IP5(fq_enqueue, fq_if_t *, fqs, fq_if_classq_t *, fq_cl, + fq_t *, fq, pktsched_pkt_t *, pkt, uint32_t, cnt); + fq_enqueue(fq, pkt->pktsched_pkt, pkt->pktsched_tail, cnt, + pkt->pktsched_ptype); + + fq->fq_bytes += chain_len; + fq_cl->fcl_stat.fcl_byte_cnt += chain_len; + fq_cl->fcl_stat.fcl_pkt_cnt += cnt; + + /* + * check if this queue will qualify to be the next + * victim queue + */ + fq_if_is_flow_heavy(fqs, fq); + + if (FQ_CONGESTION_FEEDBACK_CAPABLE(fq) && fq->fq_flowsrc == FLOWSRC_INPCB && + fq->fq_congestion_cnt > fq->fq_last_congestion_cnt) { + KDBG(AQM_KTRACE_CONGESTION_NOTIFIED, + fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq), + fq->fq_bytes, fq->fq_congestion_cnt); + fq_cl->fcl_stat.fcl_congestion_feedback++; + ret = CLASSQEQ_CONGESTED; + } + } else { + DTRACE_IP3(fq_drop, fq_if_t *, fqs, int, droptype, int, ret); + return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP; + } + + /* + * If the queue is not currently active, add it to the end of new + * flows list for that service class. 
+ */ + if ((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == 0) { + VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL); + STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink); + fq->fq_flags |= FQF_NEW_FLOW; + + fq_cl->fcl_stat.fcl_newflows_cnt++; + + fq->fq_deficit = fq_cl->fcl_quantum; + } + fq->fq_last_congestion_cnt = fq->fq_congestion_cnt; + + return ret; +} + +static boolean_t +codel_ok_to_drop(pktsched_pkt_t *pkt, fq_t *fq, + struct codel_status *codel_status, uint64_t now) +{ + uint64_t *__single pkt_enq_time, sojourn_time; + boolean_t ok_to_drop = false; + + if (pkt->pktsched_pcnt == 0) { + codel_status->first_above_time = 0; + return false; + } + + pktsched_get_pkt_vars(pkt, NULL, &pkt_enq_time, NULL, NULL, NULL, NULL, NULL); + sojourn_time = 0; + // TODO: handle pacing + VERIFY(now >= *pkt_enq_time); + if (__probable(now > *pkt_enq_time)) { + sojourn_time = now - *pkt_enq_time; + } + + if (sojourn_time <= FQ_TARGET_DELAY(fq)) { + codel_status->first_above_time = 0; + goto end; + } + if (codel_status->first_above_time == 0) { + codel_status->first_above_time = now += FQ_UPDATE_INTERVAL(fq); + } else if (now >= codel_status->first_above_time) { + ok_to_drop = true; + KDBG(AQM_KTRACE_OK_TO_DROP, fq->fq_flowhash, sojourn_time, + *pkt_enq_time, now); + } + +end: + /* we shouldn't need to access pkt_enq_time again, clear it now */ + *pkt_enq_time = 0; + return ok_to_drop; +} + +static float +fast_inv_sqrt(float number) +{ + union { + float f; + uint32_t i; + } conv = { .f = number }; + conv.i = 0x5f3759df - (conv.i >> 1); + conv.f *= 1.5F - (number * 0.5F * conv.f * conv.f); + return conv.f; +} + +static uint64_t +codel_control_law(float inv_sqrt, uint64_t t, uint64_t interval) +{ + /* Drop becomes more frequeut as drop count increases */ + uint64_t val = (uint64_t)(inv_sqrt * interval); + uint64_t result = t + val; + VERIFY(val > 0 && val <= interval); + return result; +} + +static void +fq_drop_pkt(struct ifclassq *ifcq, struct ifnet *ifp, pktsched_pkt_t *pkt) +{ + IFCQ_DROP_ADD(ifcq, 1, pktsched_get_pkt_len(pkt)); + if (__improbable(droptap_verbose > 0)) { + pktsched_drop_pkt(pkt, ifp, DROP_REASON_AQM_HIGH_DELAY, + __func__, __LINE__, 0); + } else { + pktsched_free_pkt(pkt); + } +} + +void +fq_codel_dq(void *fqs_p, void *fq_p, pktsched_pkt_t *pkt, uint64_t now) +{ + fq_if_t *fqs = (fq_if_t *)fqs_p; + fq_t *fq = (fq_t *)fq_p; + fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq); + struct ifnet *ifp = fqs->fqs_ifq->ifcq_ifp; + struct codel_status *status = &fq->codel_status; + struct ifclassq *ifcq = fqs->fqs_ifq; + volatile uint32_t *__single pkt_flags; + uint64_t *__single pkt_timestamp, pkt_tx_time = 0, pacing_delay = 0, pkt_enq_time = 0; + int64_t qdelay = 0; + boolean_t l4s_pkt; + boolean_t ok_to_drop = false; + uint32_t delta = 0; + + fq_getq_flow_internal(fqs, fq, pkt); + if (pkt->pktsched_ptype == QP_INVALID) { + VERIFY(pkt->pktsched_pkt_mbuf == NULL); + return; + } + + pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, NULL, + NULL, NULL, &pkt_tx_time); + l4s_pkt = pktsched_is_pkt_l4s(pkt); + pkt_enq_time = *pkt_timestamp; + + if (fq_codel_enable_pacing && fq_codel_enable_l4s) { + if (pkt_tx_time > pkt_enq_time) { + pacing_delay = pkt_tx_time - pkt_enq_time; + fq_cl->fcl_stat.fcl_paced_pkts++; + DTRACE_SKYWALK3(aqm__pacing__delta, uint64_t, now - pkt_tx_time, + fq_if_t *, fqs, fq_t *, fq); + } +#if (DEVELOPMENT || DEBUG) + else if (pkt_tx_time != 0) { + DTRACE_SKYWALK5(aqm__miss__pacing__delay, uint64_t, pkt_enq_time, + uint64_t, pkt_tx_time, uint64_t, now, fq_if_t *, + fqs, 
fq_t *, fq); + } +#endif // (DEVELOPMENT || DEBUG) + } + + if (fq_codel_enable_l4s && l4s_pkt) { + /* this will compute qdelay in nanoseconds */ + if (now > pkt_enq_time) { + qdelay = now - pkt_enq_time; + } + + fq->fq_pkts_since_last_report++; + + if ((l4s_ce_threshold != 0 && qdelay > l4s_ce_threshold + pacing_delay) || + (l4s_ce_threshold == 0 && qdelay > FQ_TARGET_DELAY(fq) + pacing_delay)) { + DTRACE_SKYWALK4(aqm__mark__ce, uint64_t, qdelay, uint64_t, pacing_delay, + fq_if_t *, fqs, fq_t *, fq); + KDBG(AQM_KTRACE_STATS_FLOW_REPORT_CE, fq->fq_flowhash, + AQM_KTRACE_FQ_GRP_SC_IDX(fq), qdelay, pacing_delay); + /* + * The packet buffer that pktsched_mark_ecn writes to can be pageable. + * Since it is not safe to write to pageable memory while preemption + * is disabled, convert the spin lock into mutex. + */ + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + if (__improbable(l4s_local_ce_report != 0) && + (*pkt_flags & PKTF_FLOW_ADV) != 0 && + fq_if_report_congestion(fqs, pkt, 0, 1, fq->fq_pkts_since_last_report)) { + fq->fq_pkts_since_last_report = 0; + fq_cl->fcl_stat.fcl_ce_reported++; + } else if (pktsched_mark_ecn(pkt) == 0) { + fq_cl->fcl_stat.fcl_ce_marked++; + } else { + fq_cl->fcl_stat.fcl_ce_mark_failures++; + } + } + *pkt_timestamp = 0; + } else { + ok_to_drop = codel_ok_to_drop(pkt, fq, status, now); + + if (status->dropping) { + if (!ok_to_drop) { + status->dropping = false; + } + /* + * Time for the next drop. Drop current packet and dequeue + * next. If the dequeue doesn't take us out of dropping + * state, schedule the next drop. A large backlog might + * result in drop rates so high that the next drop should + * happen now, hence the 'while' loop. + */ + while (now >= status->drop_next && status->dropping) { + status->count++; + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + KDBG(AQM_KTRACE_CONGESTION_INC, fq->fq_flowhash, + AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, 0); + if (FQ_CONGESTION_FEEDBACK_CAPABLE(fq) && (*pkt_flags & PKTF_FLOW_ADV)) { + fq->fq_congestion_cnt++; + /* Like in the case of ECN, return now and dequeue again. */ + if (fq->fq_flowsrc == FLOWSRC_CHANNEL) { + fq_if_report_congestion(fqs, pkt, 1, 0, fq->fq_pkts_since_last_report); + fq_cl->fcl_stat.fcl_congestion_feedback++; + } + goto end; + /* + * If congestion feedback is not supported, + * try ECN first, if fail, drop the packet. + */ + } else if (FQ_IS_ECN_CAPABLE(fq) && pktsched_mark_ecn(pkt) == 0) { + fq_cl->fcl_stat.fcl_ce_marked++; + status->drop_next = + codel_control_law(fast_inv_sqrt(status->count), status->drop_next, FQ_UPDATE_INTERVAL(fq)); + KDBG(AQM_KTRACE_PKT_DROP, fq->fq_flowhash, + status->count, status->drop_next, now); + /* + * Since we are not dropping, return now and let the + * caller dequeue again + */ + goto end; + } else { + /* Disable ECN marking for this flow if marking fails once */ + FQ_CLEAR_ECN_CAPABLE(fq); + fq_drop_pkt(ifcq, ifp, pkt); + fq_cl->fcl_stat.fcl_ce_mark_failures++; + fq_cl->fcl_stat.fcl_high_delay_drop++; + + fq_getq_flow_internal(fqs, fq, pkt); + if (!codel_ok_to_drop(pkt, fq, status, now)) { + /* exit dropping state when queuing delay falls below threshold */ + status->dropping = false; + } else { + status->drop_next = + codel_control_law(fast_inv_sqrt(status->count), status->drop_next, FQ_UPDATE_INTERVAL(fq)); + } + } + KDBG(AQM_KTRACE_PKT_DROP, fq->fq_flowhash, + status->count, status->drop_next, now); + } + /* + * If we get here, we're not in drop state. 
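/*
 * The control law above spaces successive drops/marks at interval/sqrt(count),
 * with fast_inv_sqrt() supplying the square-root approximation (one Newton
 * iteration on the classic bit trick). A user-space check of the same spacing,
 * assuming a 100 ms update interval purely for illustration:
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t interval_ns = 100ULL * 1000 * 1000;	/* stand-in for FQ_UPDATE_INTERVAL */
	uint64_t drop_next = 0;

	for (uint32_t count = 1; count <= 5; count++) {
		/* same shape as codel_control_law(fast_inv_sqrt(count), t, interval) */
		uint64_t step = (uint64_t)((1.0 / sqrt((double)count)) * interval_ns);

		drop_next += step;
		printf("count=%u step %.1f ms (drop_next %.1f ms)\n",
		    count, step / 1e6, drop_next / 1e6);
	}
	return 0;
}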
The 'ok_to_drop' + * return from dodequeue means that the sojourn time has been + * above 'TARGET' for 'INTERVAL', so enter drop state. + */ + } else if (ok_to_drop) { + IFCQ_CONVERT_LOCK(fqs->fqs_ifq); + KDBG(AQM_KTRACE_CONGESTION_INC, fq->fq_flowhash, + AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, 0); + if (FQ_CONGESTION_FEEDBACK_CAPABLE(fq) && (*pkt_flags & PKTF_FLOW_ADV)) { + fq->fq_congestion_cnt++; + if (fq->fq_flowsrc == FLOWSRC_CHANNEL) { + fq_if_report_congestion(fqs, pkt, 1, 0, fq->fq_pkts_since_last_report); + fq_cl->fcl_stat.fcl_congestion_feedback++; + } + } else if (FQ_IS_ECN_CAPABLE(fq) && pktsched_mark_ecn(pkt) == 0) { + fq_cl->fcl_stat.fcl_ce_marked++; + } else { + FQ_CLEAR_ECN_CAPABLE(fq); + fq_drop_pkt(ifcq, ifp, pkt); + fq_cl->fcl_stat.fcl_ce_mark_failures++; + fq_cl->fcl_stat.fcl_high_delay_drop++; + + fq_getq_flow_internal(fqs, fq, pkt); + ok_to_drop = codel_ok_to_drop(pkt, fq, status, now); + } + status->dropping = true; + + /* + * If min went above TARGET close to when it last went + * below, assume that the drop rate that controlled the + * queue on the last cycle is a good starting point to + * control it now. ('drop_next' will be at most 'INTERVAL' + * later than the time of the last drop, so 'now - drop_next' + * is a good approximation of the time from the last drop + * until now.) + */ + delta = status->count - status->lastcnt; + if (delta > 0 && now - status->drop_next <= 16 * FQ_UPDATE_INTERVAL(fq)) { + status->count = MAX(status->count - 1, 1); + } else { + status->count = 1; + } + + status->drop_next = + codel_control_law(fast_inv_sqrt(status->count), now, FQ_UPDATE_INTERVAL(fq)); + status->lastcnt = status->count; + + KDBG(AQM_KTRACE_PKT_DROP, fq->fq_flowhash, + status->count, status->drop_next, now); + } + } + +end: + if (fqs->fqs_large_flow != fq || !fq_if_almost_at_drop_limit(fqs) || + fq_empty(fq, fqs->fqs_ptype)) { + FQ_CLEAR_OVERWHELMING(fq); + } + if ((fq->fq_flags & FQF_FLOWCTL_ON) && !FQ_IS_OVERWHELMING(fq)) { + fq_if_flow_feedback(fqs, fq, fq_cl); + } + + if (fq_empty(fq, fqs->fqs_ptype)) { + /* Reset getqtime so that we don't count idle times */ + fq->fq_getqtime = 0; + fq->codel_status.dropping = false; + } else { + fq->fq_getqtime = now; + } + fq_if_is_flow_heavy(fqs, fq); +} diff --git a/bsd/net/classq/classq_fq_codel.h b/bsd/net/classq/classq_fq_codel.h index 61657e9c3..dbe04510b 100644 --- a/bsd/net/classq/classq_fq_codel.h +++ b/bsd/net/classq/classq_fq_codel.h @@ -56,6 +56,10 @@ extern "C" { #define AQM_KTRACE_STATS_GET_QLEN AQMDBG_CODE(DBG_AQM_STATS, 0x007) #define AQM_KTRACE_TX_NOT_READY AQMDBG_CODE(DBG_AQM_STATS, 0x008) #define AQM_KTRACE_TX_PACEMAKER AQMDBG_CODE(DBG_AQM_STATS, 0x009) +#define AQM_KTRACE_PKT_DROP AQMDBG_CODE(DBG_AQM_STATS, 0x00a) +#define AQM_KTRACE_OK_TO_DROP AQMDBG_CODE(DBG_AQM_STATS, 0x00b) +#define AQM_KTRACE_CONGESTION_INC AQMDBG_CODE(DBG_AQM_STATS, 0x00c) +#define AQM_KTRACE_CONGESTION_NOTIFIED AQMDBG_CODE(DBG_AQM_STATS, 0x00d) #define AQM_KTRACE_FQ_GRP_SC_IDX(_fq_) \ ((_fq_)->fq_group->fqg_index << 4 | (_fq_)->fq_sc_index) @@ -79,6 +83,19 @@ extern "C" { (_fq_)->fq_flags &= ~FQF_DELAY_HIGH; \ } while (0) +#define FQ_IS_ECN_CAPABLE(_fq_) ((_fq_)->fq_flags & FQF_ECN_CAPABLE) +#define FQ_SET_ECN_CAPABLE(_fq_) do { \ + (_fq_)->fq_flags |= FQF_ECN_CAPABLE; \ + } while (0) +#define FQ_CLEAR_ECN_CAPABLE(_fq_) do { \ + (_fq_)->fq_flags &= ~FQF_ECN_CAPABLE; \ +} while (0) + +#define FQ_CONGESTION_FEEDBACK_CAPABLE(_fq_) ((_fq_)->fq_flags & FQF_CONGESTION_FEEDBACK) +#define FQ_ENABLE_CONGESTION_FEEDBACK(_fq_) do { \ + 
(_fq_)->fq_flags |= FQF_CONGESTION_FEEDBACK; \ +} while (0) + #define FQ_IS_OVERWHELMING(_fq_) ((_fq_)->fq_flags & FQF_OVERWHELMING) #define FQ_SET_OVERWHELMING(_fq_) do { \ if (!FQ_IS_OVERWHELMING(_fq_)) { \ @@ -109,6 +126,14 @@ extern "C" { #define FQ_INVALID_TX_TS UINT64_MAX +typedef struct codel_status { + boolean_t dropping; + uint32_t lastcnt; + uint32_t count; + uint64_t first_above_time; + uint64_t drop_next; +} codel_status_t; + struct flowq { #pragma pack(push,1) union { @@ -130,7 +155,10 @@ struct flowq { #define FQF_EMPTY_FLOW 0x20 /* Currently on empty flows queue */ #define FQF_OVERWHELMING 0x40 /* The largest flow when AQM hits queue limit */ #define FQF_FRESH_FLOW 0x80 /* The flow queue has just been allocated */ - uint8_t fq_flags; /* flags */ +#define FQF_ECN_CAPABLE 0x100 /* The flow is capable for doing ECN for classic traffic */ +#define FQF_CONGESTION_FEEDBACK 0x200 /* The flow is capable for doing congestion feedbacks */ + uint16_t fq_flags; /* flags */ + uint8_t fq_flowsrc; uint8_t fq_sc_index; /* service_class index */ bool fq_in_dqlist; fq_tfc_type_t fq_tfc_type; @@ -141,6 +169,10 @@ struct flowq { uint32_t fq_pkts_since_last_report; /* the next time that a paced packet is ready to go*/ uint64_t fq_next_tx_time; + /* number of packets that have experienced congestion event */ + uint32_t fq_congestion_cnt; + uint32_t fq_last_congestion_cnt; + codel_status_t codel_status; union { uint64_t fq_updatetime; /* next update interval */ /* empty list purge time (in nanoseconds) */ @@ -252,18 +284,18 @@ struct fq_if_classq; extern void fq_codel_init(void); extern fq_t *fq_alloc(classq_pkt_type_t); extern void fq_destroy(fq_t *, classq_pkt_type_t); -extern int fq_addq(struct fq_codel_sched_data *, fq_if_group_t *, - pktsched_pkt_t *, struct fq_if_classq *); -extern void fq_getq_flow(struct fq_codel_sched_data *, fq_t *, - pktsched_pkt_t *, uint64_t now); -extern void fq_codel_dequeue(fq_if_t *fqs, fq_t *fq, - pktsched_pkt_t *pkt, uint64_t now); +extern int fq_codel_enq_legacy(void * fqs, fq_if_group_t *, pktsched_pkt_t *, struct fq_if_classq *); +extern void fq_codel_dq_legacy(void *fqs, void *fq, pktsched_pkt_t *pkt, uint64_t now); extern void fq_getq_flow_internal(struct fq_codel_sched_data *, fq_t *, pktsched_pkt_t *); extern void fq_head_drop(struct fq_codel_sched_data *, fq_t *); extern boolean_t fq_tx_time_ready(fq_if_t *fqs, fq_t *fq, uint64_t now, uint64_t *ready_time); +extern void fq_codel_dq(void *fqs_p, void *fq_p, pktsched_pkt_t *pkt, uint64_t now); +extern int fq_codel_enq(void *fqs_p, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt, + fq_if_classq_t *fq_cl); + #ifdef __cplusplus } #endif diff --git a/bsd/net/classq/classq_subr.c b/bsd/net/classq/classq_subr.c index 06d21b1bc..ea92d6eb3 100644 --- a/bsd/net/classq/classq_subr.c +++ b/bsd/net/classq/classq_subr.c @@ -44,69 +44,42 @@ #include #include #include +#include #include #include #if SKYWALK #include +#include #include #endif /* SKYWALK */ - -static errno_t ifclassq_dequeue_common(struct ifclassq *, mbuf_svc_class_t, - u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, - u_int32_t *, boolean_t, u_int8_t); +static int ifclassq_tbr_set_locked(struct ifclassq *ifq, struct tb_profile *profile, + boolean_t update); static void ifclassq_tbr_dequeue_common(struct ifclassq *, mbuf_svc_class_t, boolean_t, classq_pkt_t *, u_int8_t); -static uint64_t ifclassq_def_c_target_qdelay = 0; -SYSCTL_QUAD(_net_classq, OID_AUTO, def_c_target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, - &ifclassq_def_c_target_qdelay, "def 
classic target queue delay in nanoseconds"); - -static uint64_t ifclassq_def_c_update_interval = 0; -SYSCTL_QUAD(_net_classq, OID_AUTO, def_c_update_interval, - CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_def_c_update_interval, - "def classic update interval in nanoseconds"); - -static uint64_t ifclassq_def_l4s_target_qdelay = 0; -SYSCTL_QUAD(_net_classq, OID_AUTO, def_l4s_target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, - &ifclassq_def_l4s_target_qdelay, "def L4S target queue delay in nanoseconds"); - -static uint64_t ifclassq_def_l4s_update_interval = 0; -SYSCTL_QUAD(_net_classq, OID_AUTO, def_l4s_update_interval, - CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_def_l4s_update_interval, - "def L4S update interval in nanoseconds"); - -static uint64_t ifclassq_ll_c_target_qdelay = 0; -SYSCTL_QUAD(_net_classq, OID_AUTO, ll_c_target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, - &ifclassq_ll_c_target_qdelay, "low latency classic target queue delay in nanoseconds"); - -static uint64_t ifclassq_ll_c_update_interval = 0; -SYSCTL_QUAD(_net_classq, OID_AUTO, ll_c_update_interval, - CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_ll_c_update_interval, - "low latency classic update interval in nanoseconds"); - -static uint64_t ifclassq_ll_l4s_target_qdelay = 0; -SYSCTL_QUAD(_net_classq, OID_AUTO, ll_l4s_target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, - &ifclassq_ll_l4s_target_qdelay, "low latency L4S target queue delay in nanoseconds"); - -static uint64_t ifclassq_ll_l4s_update_interval = 0; -SYSCTL_QUAD(_net_classq, OID_AUTO, ll_l4s_update_interval, - CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_ll_l4s_update_interval, - "low latency L4S update interval in nanoseconds"); - -uint32_t ifclassq_enable_l4s = 1; -SYSCTL_UINT(_net_classq, OID_AUTO, enable_l4s, - CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_enable_l4s, 0, - "enable/disable L4S"); - #if DEBUG || DEVELOPMENT uint32_t ifclassq_flow_control_adv = 1; /* flow control advisory */ SYSCTL_UINT(_net_classq, OID_AUTO, flow_control_adv, CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_flow_control_adv, 1, "enable/disable flow control advisory"); -uint32_t fq_codel_quantum = 0; +uint32_t ifclassq_congestion_feedback = 1; +SYSCTL_UINT(_net_classq, OID_AUTO, flow_congestion_feedback, + CTLFLAG_RW | CTLFLAG_LOCKED, &ifclassq_congestion_feedback, 1, + "enable/disable congestion feedback (flow control v2)"); + +SYSCTL_EXTENSIBLE_NODE(_net_classq, OID_AUTO, scheduler, + CTLFLAG_RW | CTLFLAG_LOCKED, 0, "classq scheduler"); + +/* list value and description of each model */ +#define X(name, value, description, ...) 
#description ":" #value " " +SYSCTL_STRING(_net_classq_scheduler, OID_AUTO, available_models, CTLFLAG_RD | CTLFLAG_LOCKED, + IFNET_SCHED_MODEL_LIST, 0, ""); +#undef X + +static int ifclassq_configure_sysctl SYSCTL_HANDLER_ARGS; #endif /* DEBUG || DEVELOPMENT */ static KALLOC_TYPE_DEFINE(ifcq_zone, struct ifclassq, NET_KT_DEFAULT); @@ -116,30 +89,9 @@ static LCK_GRP_DECLARE(ifcq_lock_group, "ifclassq locks"); void classq_init(void) { - _CASSERT(MBUF_TC_BE == 0); - _CASSERT(MBUF_SC_BE == 0); - _CASSERT(IFCQ_SC_MAX == MBUF_SC_MAX_CLASSES); -#if DEBUG || DEVELOPMENT - PE_parse_boot_argn("fq_codel_quantum", &fq_codel_quantum, - sizeof(fq_codel_quantum)); - PE_parse_boot_argn("ifclassq_def_c_target_qdelay", &ifclassq_def_c_target_qdelay, - sizeof(ifclassq_def_c_target_qdelay)); - PE_parse_boot_argn("ifclassq_def_c_update_interval", - &ifclassq_def_c_update_interval, sizeof(ifclassq_def_c_update_interval)); - PE_parse_boot_argn("ifclassq_def_l4s_target_qdelay", &ifclassq_def_l4s_target_qdelay, - sizeof(ifclassq_def_l4s_target_qdelay)); - PE_parse_boot_argn("ifclassq_def_l4s_update_interval", - &ifclassq_def_l4s_update_interval, sizeof(ifclassq_def_l4s_update_interval)); - PE_parse_boot_argn("ifclassq_ll_c_target_qdelay", &ifclassq_ll_c_target_qdelay, - sizeof(ifclassq_ll_c_target_qdelay)); - PE_parse_boot_argn("ifclassq_ll_c_update_interval", - &ifclassq_ll_c_update_interval, sizeof(ifclassq_ll_c_update_interval)); - PE_parse_boot_argn("ifclassq_ll_l4s_target_qdelay", &ifclassq_ll_l4s_target_qdelay, - sizeof(ifclassq_ll_l4s_target_qdelay)); - PE_parse_boot_argn("ifclassq_ll_l4s_update_interval", - &ifclassq_ll_l4s_update_interval, sizeof(ifclassq_ll_l4s_update_interval)); -#endif /* DEBUG || DEVELOPMENT */ - fq_codel_init(); + static_assert(MBUF_TC_BE == 0); + static_assert(MBUF_SC_BE == 0); + static_assert(IFCQ_SC_MAX == MBUF_SC_MAX_CLASSES); } int @@ -183,7 +135,50 @@ ifclassq_setup(struct ifclassq *ifq, struct ifnet *ifp, uint32_t sflags) ifq->ifcq_flags = (IFCQF_READY | IFCQF_ENABLED); } } + +#if (DEBUG || DEVELOPMENT) + static_assert(sizeof(struct skoid) == sizeof(ifcq_oid_t)); + static_assert(offsetof(struct skoid, sko_oid_list) == offsetof(ifcq_oid_t, ifcq_oid_list)); + static_assert(offsetof(struct skoid, sko_oid) == offsetof(ifcq_oid_t, ifcq_oid)); + static_assert(offsetof(struct skoid, sko_name) == offsetof(ifcq_oid_t, ifcq_name)); + + struct skoid *ifcq_skoid = (struct skoid *)&ifq->ifcq_oid; + skoid_create(ifcq_skoid, + SKOID_SNODE(_net_classq_scheduler), if_name(ifp), + CTLFLAG_RW); + skoid_add_handler(ifcq_skoid, "model", CTLFLAG_RW, + ifclassq_configure_sysctl, ifq, 0); +#endif /* (DEBUG || DEVELOPMENT) */ + IFCQ_UNLOCK(ifq); + + return err; +} + +int +ifclassq_change(struct ifclassq *ifq, uint32_t model) +{ + struct ifnet *ifp = ifq->ifcq_ifp; + uint32_t omodel; + errno_t err; + + if (ifp == NULL || !IFNET_MODEL_IS_VALID(model) || + (!!(model & IFNET_SCHED_DRIVER_MANGED_MODELS)) != + (!!(ifp->if_output_sched_model & IFNET_SCHED_DRIVER_MANGED_MODELS))) { + return EINVAL; + } else if (!(ifp->if_eflags & IFEF_TXSTART)) { + return ENXIO; + } + + IFCQ_LOCK(ifq); + omodel = ifp->if_output_sched_model; + ifp->if_output_sched_model = model; + + if ((err = ifclassq_pktsched_setup(ifq)) != 0) { + ifp->if_output_sched_model = omodel; + } + IFCQ_UNLOCK(ifq); + return err; } @@ -199,7 +194,7 @@ ifclassq_teardown(struct ifclassq *ifq) if (IFCQ_TBR_IS_ENABLED(ifq)) { struct tb_profile tb = { .rate = 0, .percent = 0, .depth = 0 }; - (void) ifclassq_tbr_set(ifq, &tb, FALSE); + (void) 
ifclassq_tbr_set_locked(ifq, &tb, FALSE); } pktsched_teardown(ifq); ifq->ifcq_flags &= ~IFCQF_READY; @@ -217,6 +212,11 @@ ifclassq_teardown(struct ifclassq *ifq) bzero(&ifq->ifcq_xmitcnt, sizeof(ifq->ifcq_xmitcnt)); bzero(&ifq->ifcq_dropcnt, sizeof(ifq->ifcq_dropcnt)); ifq->ifcq_flags |= IFCQF_DESTROYED; + +#if (DEBUG || DEVELOPMENT) + struct skoid *ifcq_skoid = (struct skoid *)&ifq->ifcq_oid; + skoid_destroy(ifcq_skoid); +#endif /* (DEBUG || DEVELOPMENT) */ done: IFCQ_UNLOCK(ifq); } @@ -235,7 +235,27 @@ ifclassq_pktsched_setup(struct ifclassq *ifq) QP_MBUF; #endif /* SKYWALK */ - err = pktsched_setup(ifq, PKTSCHEDT_FQ_CODEL, ifq->ifcq_sflags, ptype); + switch (ifp->if_output_sched_model) { + case IFNET_SCHED_MODEL_DRIVER_MANAGED: + case IFNET_SCHED_MODEL_NORMAL: + if (ifp->if_family == IFNET_FAMILY_ETHERNET && + (ifp->if_subfamily != IFNET_SUBFAMILY_WIFI)) { + err = pktsched_setup(ifq, PKTSCHEDT_FQ_CODEL_NEW, ifq->ifcq_sflags, ptype); + } else { + err = pktsched_setup(ifq, PKTSCHEDT_FQ_CODEL, ifq->ifcq_sflags, ptype); + } + break; + case IFNET_SCHED_MODEL_FQ_CODEL: + case IFNET_SCHED_MODEL_FQ_CODEL_DM: + err = pktsched_setup(ifq, PKTSCHEDT_FQ_CODEL, ifq->ifcq_sflags, ptype); + break; + case IFNET_SCHED_MODEL_FQ_CODEL_NEW: + case IFNET_SCHED_MODEL_FQ_CODEL_NEW_DM: + err = pktsched_setup(ifq, PKTSCHEDT_FQ_CODEL_NEW, ifq->ifcq_sflags, ptype); + break; + default: + err = EINVAL; + } return err; } @@ -262,6 +282,7 @@ ifclassq_get_len(struct ifclassq *ifq, mbuf_svc_class_t sc, u_int8_t grp_idx, u_int32_t *packets, u_int32_t *bytes) { int err = 0; + boolean_t dequeue_paused = false; IFCQ_LOCK(ifq); if ((ifq->ifcq_flags & (IFCQF_READY | IFCQF_ENABLED)) != @@ -270,7 +291,7 @@ ifclassq_get_len(struct ifclassq *ifq, mbuf_svc_class_t sc, u_int8_t grp_idx, } if (sc == MBUF_SC_UNSPEC && grp_idx == IF_CLASSQ_ALL_GRPS) { VERIFY(packets != NULL); - if (fq_if_is_all_paced(ifq)) { + if ((dequeue_paused = ifq->ifcq_ops->ps_allow_dequeue(ifq))) { *packets = 0; } else { *packets = IFCQ_LEN(ifq); @@ -280,7 +301,7 @@ ifclassq_get_len(struct ifclassq *ifq, mbuf_svc_class_t sc, u_int8_t grp_idx, VERIFY(MBUF_VALID_SC(sc) || sc == MBUF_SC_UNSPEC); - err = fq_if_request_classq(ifq, CLASSQRQ_STAT_SC, &req); + err = ifclassq_request(ifq, CLASSQRQ_STAT_SC, &req, true); if (packets != NULL) { *packets = req.packets; } @@ -289,7 +310,7 @@ ifclassq_get_len(struct ifclassq *ifq, mbuf_svc_class_t sc, u_int8_t grp_idx, } } KDBG(AQM_KTRACE_STATS_GET_QLEN, ifq->ifcq_ifp->if_index, - packets ? *packets : 0, bytes ? *bytes : 0, fq_if_is_all_paced(ifq)); + packets ? *packets : 0, bytes ? 
*bytes : 0, dequeue_paused); IFCQ_UNLOCK(ifq); @@ -343,25 +364,7 @@ errno_t ifclassq_enqueue(struct ifclassq *ifq, classq_pkt_t *head, classq_pkt_t *tail, u_int32_t cnt, u_int32_t bytes, boolean_t *pdrop) { - return fq_if_enqueue_classq(ifq, head, tail, cnt, bytes, pdrop); -} - -errno_t -ifclassq_dequeue(struct ifclassq *ifq, u_int32_t pkt_limit, - u_int32_t byte_limit, classq_pkt_t *head, classq_pkt_t *tail, - u_int32_t *cnt, u_int32_t *len, u_int8_t grp_idx) -{ - return ifclassq_dequeue_common(ifq, MBUF_SC_UNSPEC, pkt_limit, - byte_limit, head, tail, cnt, len, FALSE, grp_idx); -} - -errno_t -ifclassq_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t sc, - u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head, - classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, u_int8_t grp_idx) -{ - return ifclassq_dequeue_common(ifq, sc, pkt_limit, byte_limit, - head, tail, cnt, len, TRUE, grp_idx); + return ifq->ifcq_ops->ps_enq(ifq, head, tail, cnt, bytes, pdrop); } static errno_t @@ -377,6 +380,7 @@ ifclassq_dequeue_common_default(struct ifclassq *ifq, mbuf_svc_class_t sc, VERIFY(!drvmgt || MBUF_VALID_SC(sc)); + IFCQ_LOCK_SPIN(ifq); if (IFCQ_TBR_IS_ENABLED(ifq)) { goto dequeue_loop; } @@ -388,8 +392,7 @@ ifclassq_dequeue_common_default(struct ifclassq *ifq, mbuf_svc_class_t sc, if (drvmgt) { int err; - IFCQ_LOCK_SPIN(ifq); - err = fq_if_dequeue_sc_classq_multi(ifq, sc, pkt_limit, + err = ifq->ifcq_ops->ps_deq_sc(ifq, sc, pkt_limit, byte_limit, head, tail, cnt, len, grp_idx); IFCQ_UNLOCK(ifq); @@ -400,8 +403,7 @@ ifclassq_dequeue_common_default(struct ifclassq *ifq, mbuf_svc_class_t sc, } else { int err; - IFCQ_LOCK_SPIN(ifq); - err = fq_if_dequeue_classq_multi(ifq, pkt_limit, byte_limit, + err = ifq->ifcq_ops->ps_deq(ifq, pkt_limit, byte_limit, head, tail, cnt, len, grp_idx); IFCQ_UNLOCK(ifq); @@ -413,7 +415,6 @@ ifclassq_dequeue_common_default(struct ifclassq *ifq, mbuf_svc_class_t sc, dequeue_loop: VERIFY(IFCQ_TBR_IS_ENABLED(ifq)); - IFCQ_LOCK_SPIN(ifq); while (i < pkt_limit && l < byte_limit) { if (drvmgt) { @@ -477,13 +478,12 @@ dequeue_loop: return (first.cp_mbuf != NULL) ? 
0 : EAGAIN; } -static errno_t -ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, +errno_t +ifclassq_dequeue(struct ifclassq *ifq, mbuf_svc_class_t sc, u_int32_t pkt_limit, u_int32_t byte_limit, classq_pkt_t *head, - classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, boolean_t drvmgt, - u_int8_t grp_idx) + classq_pkt_t *tail, u_int32_t *cnt, u_int32_t *len, u_int8_t grp_idx) { -#if SKYWALK + boolean_t drvmgt = sc != MBUF_SC_UNSPEC; struct ifnet *ifp = ifq->ifcq_ifp; if (__improbable(ifp->if_na_ops != NULL && @@ -501,20 +501,38 @@ ifclassq_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, return ifp->if_na_ops->ni_dequeue(ifp->if_na, sc, pkt_limit, byte_limit, head, tail, cnt, len, drvmgt, err); } -#endif /* SKYWALK */ return ifclassq_dequeue_common_default(ifq, sc, pkt_limit, byte_limit, head, tail, cnt, len, drvmgt, grp_idx); } void -ifclassq_update(struct ifclassq *ifq, cqev_t ev) +ifclassq_update(struct ifclassq *ifq, cqev_t ev, bool locked) { void *ev_p = (void *)&ev; + if (!locked) { + IFCQ_LOCK(ifq); + } IFCQ_LOCK_ASSERT_HELD(ifq); - VERIFY(IFCQ_IS_READY(ifq)); - fq_if_request_classq(ifq, CLASSQRQ_EVENT, ev_p); + if (!(IFCQ_IS_READY(ifq))) { + goto out; + } + + if (IFCQ_TBR_IS_ENABLED(ifq)) { + struct tb_profile tb = { + .rate = ifq->ifcq_tbr.tbr_rate_raw, + .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0 + }; + (void) ifclassq_tbr_set_locked(ifq, &tb, FALSE); + } + + ifclassq_request(ifq, CLASSQRQ_EVENT, ev_p, true); + +out: + if (!locked) { + IFCQ_UNLOCK(ifq); + } } int @@ -533,6 +551,7 @@ ifclassq_detach(struct ifclassq *ifq) IFCQ_LOCK_ASSERT_HELD(ifq); VERIFY(ifq->ifcq_disc == NULL); ifq->ifcq_type = PKTSCHEDT_NONE; + ifq->ifcq_ops = pktsched_ops_find(PKTSCHEDT_NONE); } int @@ -561,7 +580,6 @@ ifclassq_getqstats(struct ifclassq *ifq, u_int8_t gid, u_int32_t qid, void *ubuf *(&ifqs->ifqs_xmitcnt) = *(&ifq->ifcq_xmitcnt); *(&ifqs->ifqs_dropcnt) = *(&ifq->ifcq_dropcnt); ifqs->ifqs_scheduler = ifq->ifcq_type; - ifqs->ifqs_doorbells = ifq->ifcq_doorbells; err = pktsched_getqstats(ifq, gid, qid, ifqs); IFCQ_UNLOCK(ifq); @@ -673,9 +691,9 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, * ifcq_drain count is adjusted by the caller. */ if (drvmgt) { - fq_if_dequeue_sc_classq(ifq, sc, pkt, grp_idx); + ifq->ifcq_ops->ps_deq_sc(ifq, sc, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx); } else { - fq_if_dequeue_classq(ifq, pkt, grp_idx); + ifq->ifcq_ops->ps_deq(ifq, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx); } if (pkt->cp_mbuf != NULL) { @@ -702,13 +720,14 @@ ifclassq_tbr_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t sc, * set a token bucket regulator. * if the specified rate is zero, the token bucket regulator is deleted. 
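/*
 * A sketch of driving the token bucket regulator from scheduler-independent
 * code, assuming `ifq' is a valid struct ifclassq * whose interface is ready;
 * the rate value is purely illustrative and has to match the units the driver
 * reports. As the comment above notes, a zero rate deletes the regulator.
 */
static void
tbr_example(struct ifclassq *ifq)
{
	struct tb_profile cap = { .rate = 100000000, .percent = 0, .depth = 0 };
	struct tb_profile off = { .rate = 0, .percent = 0, .depth = 0 };

	/* install the regulator and let the scheduler see the bandwidth change */
	(void) ifclassq_tbr_set(ifq, &cap, TRUE);

	/* ... later: remove it again */
	(void) ifclassq_tbr_set(ifq, &off, TRUE);
}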
*/ -int -ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile, +static int +ifclassq_tbr_set_locked(struct ifclassq *ifq, struct tb_profile *profile, boolean_t update) { struct tb_regulator *tbr; struct ifnet *ifp = ifq->ifcq_ifp; u_int64_t rate, old_rate; + uint8_t ev = CLASSQ_EV_LINK_BANDWIDTH; IFCQ_LOCK_ASSERT_HELD(ifq); VERIFY(IFCQ_IS_READY(ifq)); @@ -745,7 +764,7 @@ ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile, bzero(tbr, sizeof(*tbr)); ifnet_set_start_cycle(ifp, NULL); if (update) { - ifclassq_update(ifq, CLASSQ_EV_LINK_BANDWIDTH); + ifclassq_request(ifq, CLASSQRQ_EVENT, (void*)&ev, true); } return 0; } @@ -831,118 +850,29 @@ ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile, ifnet_set_start_cycle(ifp, NULL); } if (update && tbr->tbr_rate_raw != old_rate) { - ifclassq_update(ifq, CLASSQ_EV_LINK_BANDWIDTH); + ifclassq_request(ifq, CLASSQRQ_EVENT, (void*)&ev, true); } return 0; } -void -ifclassq_calc_target_qdelay(struct ifnet *ifp, uint64_t *if_target_qdelay, - uint32_t flags) +int +ifclassq_tbr_set(struct ifclassq *ifq, struct tb_profile *profile, + boolean_t update) { - uint64_t qdelay = 0, qdelay_configed = 0, qdely_default = 0; - if (flags == IF_CLASSQ_DEF) { - qdelay = IFCQ_TARGET_QDELAY(ifp->if_snd); + int error = 0; + + IFCQ_LOCK(ifq); + if (!IFCQ_IS_READY(ifq)) { + error = ENXIO; + goto out; } - switch (flags) { - case IF_CLASSQ_DEF: - qdelay_configed = ifclassq_def_c_target_qdelay; - qdely_default = IFQ_DEF_C_TARGET_DELAY; - break; - case IF_CLASSQ_L4S: - qdelay_configed = ifclassq_def_l4s_target_qdelay; - if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI || - ifp->if_family == IFNET_FAMILY_CELLULAR) { - qdely_default = IFQ_DEF_L4S_WIRELESS_TARGET_DELAY; - } else { - qdely_default = IFQ_DEF_L4S_TARGET_DELAY; - } - break; - case IF_CLASSQ_LOW_LATENCY: - qdelay_configed = ifclassq_ll_c_target_qdelay; - qdely_default = IFQ_LL_C_TARGET_DELAY; - break; - case (IF_CLASSQ_LOW_LATENCY | IF_CLASSQ_L4S): - qdelay_configed = ifclassq_ll_l4s_target_qdelay; - if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI || - ifp->if_family == IFNET_FAMILY_CELLULAR) { - qdely_default = IFQ_LL_L4S_WIRELESS_TARGET_DELAY; - } else { - qdely_default = IFQ_LL_L4S_TARGET_DELAY; - } - break; - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - } + error = ifclassq_tbr_set_locked(ifq, profile, update); - if (qdelay_configed != 0) { - qdelay = qdelay_configed; - } - - /* - * If we do not know the effective bandwidth, use the default - * target queue delay. - */ - if (qdelay == 0) { - qdelay = qdely_default; - } - - /* - * If a delay has been added to ifnet start callback for - * coalescing, we have to add that to the pre-set target delay - * because the packets can be in the queue longer. 
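
ifclassq_tbr_set_locked() above (re)configures the send queue's token bucket regulator; the dequeue side then only releases a packet once enough tokens have accumulated for its size, which is what paces output to the profile's rate. A self-contained sketch of the token-bucket idea, assuming invented names and plain nanosecond timestamps instead of the kernel's mach-time units; this is an illustration, not the xnu implementation:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tbr {
    uint64_t rate_bps;   /* configured rate, bits per second */
    uint64_t depth_bits; /* bucket depth (burst allowance), bits */
    uint64_t tokens;     /* currently accumulated tokens, bits */
    uint64_t last_ns;    /* last refill timestamp, nanoseconds */
};

/* Refill the bucket based on elapsed time, capped at the depth. */
static void
tbr_refill(struct tbr *t, uint64_t now_ns)
{
    uint64_t earned = (now_ns - t->last_ns) * t->rate_bps / 1000000000ULL;

    t->tokens += earned;
    if (t->tokens > t->depth_bits) {
        t->tokens = t->depth_bits;
    }
    t->last_ns = now_ns;
}

/* True if a packet of pkt_bytes may be dequeued now; consumes tokens. */
static bool
tbr_dequeue_ok(struct tbr *t, uint64_t now_ns, uint32_t pkt_bytes)
{
    uint64_t need = (uint64_t)pkt_bytes * 8;

    tbr_refill(t, now_ns);
    if (t->tokens < need) {
        return false;   /* caller would reschedule the start thread */
    }
    t->tokens -= need;
    return true;
}

int
main(void)
{
    /* 10 Mbit/s with a 15 KB burst budget, bucket initially full. */
    struct tbr t = { 10000000ULL, 15000ULL * 8, 15000ULL * 8, 0 };

    printf("t=0us   1500B: %d\n", tbr_dequeue_ok(&t, 0, 1500));
    printf("t=100us 1500B: %d\n", tbr_dequeue_ok(&t, 100000, 1500));
    return 0;
}
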
- */ - if ((ifp->if_eflags & IFEF_ENQUEUE_MULTI) && - ifp->if_start_delay_timeout > 0) { - qdelay += ifp->if_start_delay_timeout; - } - - *(if_target_qdelay) = qdelay; -} - -void -ifclassq_calc_update_interval(uint64_t *update_interval, uint32_t flags) -{ - uint64_t interval = 0, interval_configed = 0, interval_default = 0; - - switch (flags) { - case IF_CLASSQ_DEF: - interval_configed = ifclassq_def_c_update_interval; - interval_default = IFQ_DEF_C_UPDATE_INTERVAL; - break; - case IF_CLASSQ_L4S: - interval_configed = ifclassq_def_l4s_update_interval; - interval_default = IFQ_DEF_L4S_UPDATE_INTERVAL; - break; - case IF_CLASSQ_LOW_LATENCY: - interval_configed = ifclassq_ll_c_update_interval; - interval_default = IFQ_LL_C_UPDATE_INTERVAL; - break; - case (IF_CLASSQ_LOW_LATENCY | IF_CLASSQ_L4S): - interval_configed = ifclassq_ll_l4s_update_interval; - interval_default = IFQ_LL_L4S_UPDATE_INTERVAL; - break; - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - } - - /* If the system level override is set, use it */ - if (interval_configed != 0) { - interval = interval_configed; - } - - /* Otherwise use the default value */ - if (interval == 0) { - interval = interval_default; - } - - *update_interval = interval; +out: + IFCQ_UNLOCK(ifq); + return error; } struct ifclassq * @@ -952,8 +882,10 @@ ifclassq_alloc(void) ifcq = zalloc_flags(ifcq_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); os_ref_init(&ifcq->ifcq_refcnt, NULL); - os_ref_retain(&ifcq->ifcq_refcnt); lck_mtx_init(&ifcq->ifcq_lock, &ifcq_lock_group, &ifcq_lock_attr); + ifcq->ifcq_ops = pktsched_ops_find(PKTSCHEDT_NONE); + VERIFY(ifcq->ifcq_ops != NULL); + os_log(OS_LOG_DEFAULT, "ifclassq instance %p created", ifcq); return ifcq; } @@ -971,6 +903,7 @@ ifclassq_release(struct ifclassq **pifcq) *pifcq = NULL; if (os_ref_release(&ifcq->ifcq_refcnt) == 0) { ifclassq_teardown(ifcq); + os_log(OS_LOG_DEFAULT, "ifclassq instance %p freed", ifcq); zfree(ifcq_zone, ifcq); } } @@ -982,7 +915,7 @@ ifclassq_setup_group(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags) IFCQ_LOCK(ifcq); VERIFY(ifcq->ifcq_disc != NULL); - VERIFY(ifcq->ifcq_type == PKTSCHEDT_FQ_CODEL); + VERIFY(ifcq->ifcq_type == PKTSCHEDT_FQ_CODEL || ifcq->ifcq_type == PKTSCHEDT_FQ_CODEL_NEW); err = fq_if_create_grp(ifcq, grp_idx, flags); IFCQ_UNLOCK(ifcq); @@ -990,24 +923,65 @@ ifclassq_setup_group(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags) return err; } -void -ifclassq_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx) +int +ifclassq_request(struct ifclassq * ifcq, enum cqrq rq, void *arg, bool locked) { - IFCQ_LOCK(ifcq); - VERIFY(ifcq->ifcq_disc != NULL); - VERIFY(ifcq->ifcq_type == PKTSCHEDT_FQ_CODEL); + int err = 0; - fq_if_set_grp_combined(ifcq, grp_idx); - IFCQ_UNLOCK(ifcq); + if (!locked) { + IFCQ_LOCK(ifcq); + } + IFCQ_LOCK_ASSERT_HELD(ifcq); + + if (!IFCQ_IS_ENABLED(ifcq)) { + err = ENXIO; + goto out; + } + + err = ifcq->ifcq_ops->ps_req(ifcq, rq, arg); + +out: + if (!locked) { + IFCQ_UNLOCK(ifcq); + } + return err; } void -ifclassq_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx) +ifclassq_tbr_get(struct ifclassq *ifcq, u_int32_t *sched_type, u_int64_t *tbr_bw, + u_int64_t *tbr_pct) { IFCQ_LOCK(ifcq); - VERIFY(ifcq->ifcq_disc != NULL); - VERIFY(ifcq->ifcq_type == PKTSCHEDT_FQ_CODEL); - fq_if_set_grp_separated(ifcq, grp_idx); + *sched_type = ifcq->ifcq_type; + if (IFCQ_TBR_IS_ENABLED(ifcq)) { + *tbr_bw = ifcq->ifcq_tbr.tbr_rate_raw; + *tbr_pct = ifcq->ifcq_tbr.tbr_percent; + } + IFCQ_UNLOCK(ifcq); } + +#if (DEBUG || DEVELOPMENT) 
+static int +ifclassq_configure_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + struct ifclassq *__single ifcq = arg1; + struct ifnet *ifp = ifcq->ifcq_ifp; + uint32_t new_model; + int changed; + int error; + + if (ifp == NULL || !IFCQ_IS_ENABLED(ifcq)) { + return ENXIO; + } + + error = sysctl_io_number(req, ifp->if_output_sched_model, + sizeof(ifp->if_output_sched_model), &new_model, &changed); + if (error == 0 && changed != 0) { + error = ifclassq_change(ifcq, new_model); + } + return error; +} +#endif /* (DEBUG || DEVELOPMENT) */ diff --git a/bsd/net/classq/if_classq.h b/bsd/net/classq/if_classq.h index 4dec76ebe..ffe9133b6 100644 --- a/bsd/net/classq/if_classq.h +++ b/bsd/net/classq/if_classq.h @@ -103,20 +103,14 @@ enum cqrq; #if DEBUG || DEVELOPMENT extern uint32_t ifclassq_flow_control_adv; +extern uint32_t ifclassq_congestion_feedback; #endif /* DEBUG || DEVELOPMENT */ -extern uint32_t ifclassq_enable_l4s; -extern unsigned int ifclassq_enable_pacing; -typedef int (*ifclassq_enq_func)(struct ifclassq *, classq_pkt_t *, - boolean_t *); -typedef void (*ifclassq_deq_func)(struct ifclassq *, classq_pkt_t *); -typedef void (*ifclassq_deq_sc_func)(struct ifclassq *, mbuf_svc_class_t, - classq_pkt_t *); -typedef int (*ifclassq_deq_multi_func)(struct ifclassq *, u_int32_t, - u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *); -typedef int (*ifclassq_deq_sc_multi_func)(struct ifclassq *, - mbuf_svc_class_t, u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, - u_int32_t *, u_int32_t *); -typedef int (*ifclassq_req_func)(struct ifclassq *, enum cqrq, void *); + +typedef struct ifcq_sysctl_oid { + struct sysctl_oid_list ifcq_oid_list; /* oid & properties */ + struct sysctl_oid ifcq_oid; /* sysctl oid storage */ + char ifcq_name[32]; /* name */ +} ifcq_oid_t; /* * Structure defining a queue for a network interface. 
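
The if_classq.h changes around this point drop the per-scheduler function-pointer typedefs and instead hang a single struct pktsched_ops off the queue (ifcq_ops), so classq_subr.c dispatches enqueue/dequeue/request calls through the table rather than naming fq_codel directly. A compact sketch of that ops-table-plus-registry shape, with invented names (sched_ops, sched_ops_find) standing in for the real pktsched_ops interface:

#include <stddef.h>
#include <stdio.h>

struct pkt { int len; };

/* Per-scheduler operations; the queue only ever calls through these. */
struct sched_ops {
    const char *name;
    int  (*enq)(struct pkt *p);
    int  (*deq)(struct pkt **p);
};

static int noop_enq(struct pkt *p) { (void)p; return -1; }
static int noop_deq(struct pkt **p) { *p = NULL; return -1; }

static int fifo_enq(struct pkt *p) { printf("fifo enq %d\n", p->len); return 0; }
static int fifo_deq(struct pkt **p) { *p = NULL; return 0; }

static const struct sched_ops ops_table[] = {
    { "none", noop_enq, noop_deq },
    { "fifo", fifo_enq, fifo_deq },
};

/* Look up a scheduler by type index; unknown types fall back to "none". */
static const struct sched_ops *
sched_ops_find(size_t type)
{
    if (type >= sizeof(ops_table) / sizeof(ops_table[0])) {
        type = 0;
    }
    return &ops_table[type];
}

struct queue {
    const struct sched_ops *ops;   /* analogous to ifcq_ops */
};

int
main(void)
{
    struct queue q = { sched_ops_find(1) };
    struct pkt p = { 1500 };

    /* Callers dispatch through the table instead of naming a scheduler. */
    q.ops->enq(&p);
    printf("scheduler: %s\n", q.ops->name);
    return 0;
}

The "none" entry mirrors how ifclassq_alloc() and ifclassq_detach() in the earlier hunks install the PKTSCHEDT_NONE ops, so a queue without a configured scheduler still has something safe to dispatch through.
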
@@ -137,9 +131,8 @@ struct ifclassq { u_int32_t ifcq_target_qdelay; /* target queue delay */ u_int32_t ifcq_bytes; /* bytes count */ u_int32_t ifcq_pkt_drop_limit; - /* number of doorbells introduced by pacemaker thread */ - uint64_t ifcq_doorbells; void *ifcq_disc; /* for scheduler-specific use */ + struct pktsched_ops *ifcq_ops; /* * ifcq_disc_slots[] represents the leaf classes configured for the * corresponding discpline/scheduler, ordered by their corresponding @@ -166,6 +159,7 @@ struct ifclassq { /* token bucket regulator */ struct tb_regulator ifcq_tbr; /* TBR */ + ifcq_oid_t ifcq_oid; }; /* ifcq_flags */ @@ -173,6 +167,7 @@ struct ifclassq { #define IFCQF_ENABLED 0x02 /* ifclassq is in use */ #define IFCQF_TBR 0x04 /* Token Bucket Regulator is in use */ #define IFCQF_DESTROYED 0x08 /* ifclassq torndown */ +#define IFCQF_LOCKLESS 0x10 /* lockless */ #define IFCQ_IS_READY(_ifcq) ((_ifcq)->ifcq_flags & IFCQF_READY) #define IFCQ_IS_ENABLED(_ifcq) ((_ifcq)->ifcq_flags & IFCQF_ENABLED) @@ -193,6 +188,8 @@ struct ifclassq { /* packet has been compressed with another one */ #define CLASSQEQ_COMPRESSED 4 +#define CLASSQEQ_CONGESTED 5 + /* interface event argument for CLASSQRQ_EVENT */ typedef enum cqev { CLASSQ_EV_INIT = 0, @@ -224,7 +221,9 @@ struct if_ifclassq_stats { struct pktcntr ifqs_xmitcnt; struct pktcntr ifqs_dropcnt; u_int32_t ifqs_scheduler; - struct fq_codel_classstats ifqs_fq_codel_stats; + union { + struct fq_codel_classstats ifqs_fq_codel_stats; + }; } __attribute__((aligned(8))); #ifdef __cplusplus @@ -293,6 +292,7 @@ struct if_ifclassq_stats { #define IFCQ_PKT_DROP_LIMIT(_ifcq) ((_ifcq)->ifcq_pkt_drop_limit) extern int ifclassq_setup(struct ifclassq *, struct ifnet *, uint32_t); +extern int ifclassq_change(struct ifclassq *ifq, uint32_t model); extern void ifclassq_teardown(struct ifclassq *); extern int ifclassq_pktsched_setup(struct ifclassq *); extern void ifclassq_set_maxlen(struct ifclassq *, u_int32_t); @@ -301,28 +301,20 @@ extern int ifclassq_get_len(struct ifclassq *, mbuf_svc_class_t, u_int8_t, u_int32_t *, u_int32_t *); extern errno_t ifclassq_enqueue(struct ifclassq *, classq_pkt_t *, classq_pkt_t *, u_int32_t, u_int32_t, boolean_t *); -extern errno_t ifclassq_dequeue(struct ifclassq *, u_int32_t, u_int32_t, - classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *, u_int8_t); -extern errno_t ifclassq_dequeue_sc(struct ifclassq *, mbuf_svc_class_t, +extern errno_t ifclassq_dequeue(struct ifclassq *, mbuf_svc_class_t, u_int32_t, u_int32_t, classq_pkt_t *, classq_pkt_t *, u_int32_t *, u_int32_t *, u_int8_t); -extern void *ifclassq_poll(struct ifclassq *, classq_pkt_type_t *); -extern void *ifclassq_poll_sc(struct ifclassq *, mbuf_svc_class_t, - classq_pkt_type_t *); -extern void ifclassq_update(struct ifclassq *, cqev_t); +extern void ifclassq_update(struct ifclassq *, cqev_t, bool); extern int ifclassq_attach(struct ifclassq *, u_int32_t, void *); extern void ifclassq_detach(struct ifclassq *); extern int ifclassq_getqstats(struct ifclassq *, u_int8_t, u_int32_t, void *, u_int32_t *); extern const char *__null_terminated ifclassq_ev2str(cqev_t); extern int ifclassq_tbr_set(struct ifclassq *, struct tb_profile *, boolean_t); +extern void ifclassq_tbr_get(struct ifclassq *, u_int32_t *, u_int64_t *, u_int64_t *); extern void ifclassq_tbr_dequeue(struct ifclassq *, classq_pkt_t *, u_int8_t); extern void ifclassq_tbr_dequeue_sc(struct ifclassq *, mbuf_svc_class_t, classq_pkt_t *, u_int8_t); -extern void ifclassq_calc_target_qdelay(struct ifnet *ifp, - uint64_t 
*if_target_qdelay, uint32_t flags); -extern void ifclassq_calc_update_interval(uint64_t *update_interval, - uint32_t flags); extern void ifclassq_set_packet_metadata(struct ifclassq *ifq, struct ifnet *ifp, classq_pkt_t *p); extern struct ifclassq *ifclassq_alloc(void); @@ -330,8 +322,7 @@ extern void ifclassq_retain(struct ifclassq *); extern void ifclassq_release(struct ifclassq **); extern int ifclassq_setup_group(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags); -extern void ifclassq_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx); -extern void ifclassq_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx); +extern int ifclassq_request(struct ifclassq *, enum cqrq, void *, bool); #endif /* BSD_KERNEL_PRIVATE */ #endif /* PRIVATE */ diff --git a/bsd/net/content_filter.c b/bsd/net/content_filter.c index 8ecb8d6bc..afb582e1a 100644 --- a/bsd/net/content_filter.c +++ b/bsd/net/content_filter.c @@ -852,8 +852,7 @@ struct m_tag *cfil_dgram_save_socket_state(struct cfil_info *, struct mbuf *); boolean_t cfil_dgram_peek_socket_state(struct mbuf *m, int *inp_flags); static void cfil_sock_received_verdict(struct socket *so); static void cfil_fill_event_msg_addresses(struct soflow_hash_entry *, struct inpcb *, - union sockaddr_in_4_6 *, union sockaddr_in_4_6 *, - boolean_t, boolean_t); + union sockaddr_in_4_6 *, union sockaddr_in_4_6 *, boolean_t); static void cfil_stats_report_thread_func(void *, wait_result_t); static void cfil_stats_report(void *v, wait_result_t w); static bool cfil_dgram_gc_needed(struct socket *, struct soflow_hash_entry *, u_int64_t); @@ -993,10 +992,7 @@ cfil_data_length(struct mbuf *m, int *retmbcnt, int *retmbnum) for (m0 = m; m0 != NULL; m0 = m0->m_next) { pktlen += m0->m_len; mbnum++; - mbcnt += _MSIZE; - if (m0->m_flags & M_EXT) { - mbcnt += m0->m_ext.ext_size; - } + mbcnt += m_capacity(m0); } if (retmbcnt) { *retmbcnt = mbcnt; @@ -2549,10 +2545,10 @@ cfil_init(void) /* * Compile time verifications */ - _CASSERT(CFIL_MAX_FILTER_COUNT == MAX_CONTENT_FILTER); - _CASSERT(sizeof(struct cfil_filter_stat) % sizeof(uint32_t) == 0); - _CASSERT(sizeof(struct cfil_entry_stat) % sizeof(uint32_t) == 0); - _CASSERT(sizeof(struct cfil_sock_stat) % sizeof(uint32_t) == 0); + static_assert(CFIL_MAX_FILTER_COUNT == MAX_CONTENT_FILTER); + static_assert(sizeof(struct cfil_filter_stat) % sizeof(uint32_t) == 0); + static_assert(sizeof(struct cfil_entry_stat) % sizeof(uint32_t) == 0); + static_assert(sizeof(struct cfil_sock_stat) % sizeof(uint32_t) == 0); /* * Runtime time verifications @@ -3115,9 +3111,10 @@ cfil_sock_detach(struct socket *so) static void cfil_fill_event_msg_addresses(struct soflow_hash_entry *entry, struct inpcb *inp, union sockaddr_in_4_6 *sin_src, union sockaddr_in_4_6 *sin_dst, - boolean_t isIPv4, boolean_t outgoing) + boolean_t outgoing) { - if (isIPv4) { + if ((entry != NULL && entry->soflow_family == AF_INET) || + !IS_INP_V6(inp)) { struct in_addr laddr = {0}, faddr = {0}; u_int16_t lport = 0, fport = 0; @@ -3344,7 +3341,7 @@ cfil_dispatch_closed_event_sign(cfil_crypto_state_t crypto_state, boolean_t outgoing = (cfil_info->cfi_dir == CFS_CONNECTION_DIR_OUT); union sockaddr_in_4_6 *src = outgoing ? &data.local : &data.remote; union sockaddr_in_4_6 *dst = outgoing ? 
&data.remote : &data.local; - cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, !IS_INP_V6(inp), outgoing); + cfil_fill_event_msg_addresses(hash_entry_ptr, inp, src, dst, outgoing); } data.byte_count_in = cfil_info->cfi_byte_inbound_count; @@ -3503,7 +3500,7 @@ cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, if (hash_entry_ptr != NULL) { cfil_fill_event_msg_addresses(hash_entry_ptr, inp, &msg_attached->cfs_src, &msg_attached->cfs_dst, - !IS_INP_V6(inp), conn_dir == CFS_CONNECTION_DIR_OUT); + conn_dir == CFS_CONNECTION_DIR_OUT); } msg_attached->cfs_conn_dir = conn_dir; @@ -3752,7 +3749,7 @@ cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int k union sockaddr_in_4_6 *src = outgoing ? &cfil_info->cfi_so_attach_laddr : NULL; union sockaddr_in_4_6 *dst = outgoing ? NULL : &cfil_info->cfi_so_attach_laddr; cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp, - src, dst, !IS_INP_V6(inp), outgoing); + src, dst, outgoing); } } @@ -4015,7 +4012,7 @@ cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_ */ cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp, &data_req->cfc_src, &data_req->cfc_dst, - !IS_INP_V6(inp), outgoing); + outgoing); if (cfil_info->cfi_debug && cfil_log_data) { cfil_info_log(LOG_ERR, cfil_info, "CFIL: SENDING DATA UP"); @@ -7434,7 +7431,7 @@ cfil_stats_collect_flow_stats_for_filter(int kcunit, union sockaddr_in_4_6 *src = outgoing ? &cfil_info->cfi_so_attach_laddr : NULL; union sockaddr_in_4_6 *dst = outgoing ? NULL : &cfil_info->cfi_so_attach_laddr; cfil_fill_event_msg_addresses(cfil_info->cfi_hash_entry, inp, - src, dst, !IS_INP_V6(inp), outgoing); + src, dst, outgoing); } } diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index c079922a4..676122350 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -64,11 +65,13 @@ #include #include +#include +#include #include #include -#include -#include +#include #include +#include #include #include @@ -150,17 +153,8 @@ uint64_t if_creation_generation_count = 0; -__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE; - dlil_ifnet_queue_t dlil_ifnet_head; -#if DEBUG -unsigned int ifnet_debug = 1; /* debugging (enabled) */ -#else -unsigned int ifnet_debug; /* debugging (disabled) */ -#endif /* !DEBUG */ - - static u_int32_t net_rtref; static struct dlil_main_threading_info dlil_main_input_thread_info; @@ -239,7 +233,7 @@ static void ifnet_start_thread_cont(void *, wait_result_t); static void ifnet_poll_thread_func(void *, wait_result_t); static void ifnet_poll_thread_cont(void *, wait_result_t); -static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *, +static errno_t ifnet_enqueue_common_single(struct ifnet *, struct ifclassq *, classq_pkt_t *, boolean_t, boolean_t *); static void ifp_src_route_copyout(struct ifnet *, struct route *); @@ -338,6 +332,11 @@ ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable) routegenid_update(); } +os_refgrp_decl(static, if_refiogrp, "if refio refcounts", NULL); +os_refgrp_decl(static, if_datamovgrp, "if datamov refcounts", NULL); +#define IF_DATAMOV_BITS 1 +#define IF_DATAMOV_DRAINING 1 + #if SKYWALK static bool net_check_compatible_if_filter(struct ifnet *ifp); @@ -777,7 +776,7 @@ skip_mtu_ioctl: } if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) { - _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET); + static_assert((NX_FSW_BUFSIZE * 
NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET); *use_multi_buflet = true; /* default flowswitch buffer size */ *buf_size = NX_FSW_BUFSIZE; @@ -915,7 +914,22 @@ dlil_attach_flowswitch_nexus(ifnet_t ifp) return FALSE; } bzero(&nexus_fsw, sizeof(nexus_fsw)); - if (!ifnet_is_attached(ifp, 1)) { + + /* + * A race can happen between a thread creating a flowswitch and another thread + * detaching the interface (also destroying the flowswitch). + * + * ifnet_datamov_begin() is used here to force dlil_quiesce_and_detach_nexuses() + * (called by another thread) to wait until this function finishes so the + * flowswitch can be cleaned up by dlil_detach_flowswitch_nexus(). + * + * If ifnet_get_ioref() is used instead, dlil_quiesce_and_detach_nexuses() + * would not wait (because ifp->if_nx_flowswitch isn't assigned) and the + * created flowswitch would be left hanging and ifnet_detach_final() would never + * wakeup because the existence of the flowswitch prevents the ifnet's ioref + * from being released. + */ + if (!ifnet_datamov_begin(ifp)) { os_log(OS_LOG_DEFAULT, "%s: %s not attached", __func__, ifp->if_xname); goto done; @@ -928,7 +942,7 @@ dlil_attach_flowswitch_nexus(ifnet_t ifp) ifnet_lock_done(ifp); } } - ifnet_decr_iorefcnt(ifp); + ifnet_datamov_end(ifp); done: return attached; @@ -1014,7 +1028,7 @@ ifnet_remove_netagent(ifnet_t ifp) boolean_t ifnet_attach_flowswitch_nexus(ifnet_t ifp) { - if (!IF_FULLY_ATTACHED(ifp)) { + if (!ifnet_is_fully_attached(ifp)) { return FALSE; } return dlil_attach_flowswitch_nexus(ifp); @@ -1366,84 +1380,84 @@ dlil_init(void) /* * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts. */ - _CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP); - _CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP); - _CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP); - _CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT); - _CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT); - _CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6); - _CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6); - _CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT); - _CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL); - _CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT); - _CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING); - _CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU); - _CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4); - _CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6); + static_assert(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP); + static_assert(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP); + static_assert(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP); + static_assert(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT); + static_assert(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT); + static_assert(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6); + static_assert(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6); + static_assert(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT); + static_assert(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL); + static_assert(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT); + static_assert(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING); + static_assert(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU); + static_assert(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4); + static_assert(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6); /* * ... as well as the mbuf checksum flags counterparts. 
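
The _CASSERT to static_assert conversion in these hunks keeps the flag-parity checks at compile time but lets a mismatch fail the build with a readable diagnostic. A tiny standalone example of the same pattern, with made-up flag names:

#include <assert.h>   /* static_assert (C11) */

/* Two headers that are supposed to define bit-identical flag values. */
#define IF_FLAG_CSUM_IP   0x0001
#define NET_FLAG_CSUM_IP  0x0001

/* Fails the build, with the message below, if the values ever diverge. */
static_assert(IF_FLAG_CSUM_IP == NET_FLAG_CSUM_IP,
    "interface and mbuf checksum flag values must match");

int
main(void)
{
    return 0;
}
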
*/ - _CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP); - _CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP); - _CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP); - _CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS); - _CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT); - _CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6); - _CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6); - _CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6); - _CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL); - _CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT); - _CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING); + static_assert(CSUM_IP == IF_HWASSIST_CSUM_IP); + static_assert(CSUM_TCP == IF_HWASSIST_CSUM_TCP); + static_assert(CSUM_UDP == IF_HWASSIST_CSUM_UDP); + static_assert(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS); + static_assert(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT); + static_assert(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6); + static_assert(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6); + static_assert(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6); + static_assert(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL); + static_assert(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT); + static_assert(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING); /* * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info. */ - _CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN); - _CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN); + static_assert(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN); + static_assert(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN); - _CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL); - _CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY); - _CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER); - _CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE); + static_assert(IFRLOGF_DLIL == IFNET_LOGF_DLIL); + static_assert(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY); + static_assert(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER); + static_assert(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE); - _CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY); - _CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY); - _CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE); + static_assert(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY); + static_assert(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY); + static_assert(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE); - _CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY); - _CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK); - _CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET); - _CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP); - _CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN); - _CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN); - _CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP); - _CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC); - _CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC); - _CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP); - _CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF); - _CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH); - _CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF); - _CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE); - _CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND); - _CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR); - _CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN); - _CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC); + static_assert(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY); + static_assert(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK); + static_assert(IFRTYPE_FAMILY_ETHERNET == 
IFNET_FAMILY_ETHERNET); + static_assert(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP); + static_assert(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN); + static_assert(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN); + static_assert(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP); + static_assert(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC); + static_assert(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC); + static_assert(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP); + static_assert(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF); + static_assert(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH); + static_assert(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF); + static_assert(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE); + static_assert(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND); + static_assert(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR); + static_assert(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN); + static_assert(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC); - _CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY); - _CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB); - _CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH); - _CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI); - _CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT); - _CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED); - _CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC); - _CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY); - _CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET); - _CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL); - _CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT); + static_assert(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY); + static_assert(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB); + static_assert(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH); + static_assert(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI); + static_assert(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT); + static_assert(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED); + static_assert(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC); + static_assert(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY); + static_assert(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET); + static_assert(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL); + static_assert(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT); - _CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN); - _CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN); + static_assert(DLIL_MODIDLEN == IFNET_MODIDLEN); + static_assert(DLIL_MODARGLEN == IFNET_MODARGLEN); PE_parse_boot_argn("net_affinity", &net_affinity, sizeof(net_affinity)); @@ -1454,8 +1468,6 @@ dlil_init(void) PE_parse_boot_argn("net_async", &net_async, sizeof(net_async)); - PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug)); - PE_parse_boot_argn("if_link_heuristics", &if_link_heuristics_flags, sizeof(if_link_heuristics_flags)); VERIFY(dlil_pending_thread_cnt == 0); @@ -1534,7 +1546,6 @@ dlil_init(void) #endif /* SKYWALK */ - dlil_allocation_zones_init(); eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt); TAILQ_INIT(&dlil_ifnet_head); @@ -1632,7 +1643,7 @@ dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter, retval = ENXIO; goto done; } - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached", __func__, if_name(ifp)); retval = ENXIO; @@ -2255,6 +2266,12 @@ ifnet_start_thread_func(void *v, wait_result_t w) 
ASSERT(ifp->if_start_thread == current_thread()); thread_set_thread_name(current_thread(), __unsafe_null_terminated_from_indexable(thread_name)); +#if CONFIG_THREAD_GROUPS + if (IFNET_REQUIRES_CELL_GROUP(ifp)) { + thread_group_join_cellular(); + } +#endif + /* * Treat the dedicated starter thread for lo0 as equivalent to * the driver workloop thread; if net_affinity is enabled for @@ -2562,7 +2579,7 @@ ifnet_poll_thread_cont(void *v, wait_result_t wres) * else hold an IO refcnt to prevent the interface * from being detached (will be released below.) */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { lck_mtx_lock_spin(&ifp->if_poll_lock); break; } @@ -2698,30 +2715,10 @@ void ifnet_purge(struct ifnet *ifp) { if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) { - if_qflush_snd(ifp, false); + if_qflush(ifp, ifp->if_snd); } } -void -ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev) -{ - IFCQ_LOCK_ASSERT_HELD(ifq); - - if (!(IFCQ_IS_READY(ifq))) { - return; - } - - if (IFCQ_TBR_IS_ENABLED(ifq)) { - struct tb_profile tb = { - .rate = ifq->ifcq_tbr.tbr_rate_raw, - .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0 - }; - (void) ifclassq_tbr_set(ifq, &tb, FALSE); - } - - ifclassq_update(ifq, ev); -} - void ifnet_update_rcv(struct ifnet *ifp, cqev_t ev) { @@ -2740,26 +2737,7 @@ ifnet_update_rcv(struct ifnet *ifp, cqev_t ev) errno_t ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model) { - struct ifclassq *ifq; - u_int32_t omodel; - errno_t err; - - if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) { - return EINVAL; - } else if (!(ifp->if_eflags & IFEF_TXSTART)) { - return ENXIO; - } - - ifq = ifp->if_snd; - IFCQ_LOCK(ifq); - omodel = ifp->if_output_sched_model; - ifp->if_output_sched_model = model; - if ((err = ifclassq_pktsched_setup(ifq)) != 0) { - ifp->if_output_sched_model = omodel; - } - IFCQ_UNLOCK(ifq); - - return err; + return ifclassq_change(ifp->if_snd, model); } errno_t @@ -2949,7 +2927,7 @@ ifnet_mcast_clear_dscp(uint8_t *__indexable buf, uint8_t ip_ver) } static inline errno_t -ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq, +ifnet_enqueue_single(struct ifnet *ifp, struct ifclassq *ifcq, classq_pkt_t *p, boolean_t flush, boolean_t *pdrop) { #if SKYWALK @@ -3012,15 +2990,15 @@ ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq, p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) { if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_BACKGROUND)) { - ifp->if_fg_sendts = (uint32_t)_net_uptime; + ifp->if_fg_sendts = (uint32_t)net_uptime(); if (fg_ts != NULL) { - *fg_ts = (uint32_t)_net_uptime; + *fg_ts = (uint32_t)net_uptime(); } } if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) { - ifp->if_rt_sendts = (uint32_t)_net_uptime; + ifp->if_rt_sendts = (uint32_t)net_uptime(); if (rt_ts != NULL) { - *rt_ts = (uint32_t)_net_uptime; + *rt_ts = (uint32_t)net_uptime(); } } } @@ -3107,15 +3085,15 @@ ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq, * activity on a foreground flow. 
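
In the enqueue path above, reads of the cached _net_uptime global are replaced with net_uptime() calls when stamping the interface's last foreground and real-time send times. A rough userland analogue of that bookkeeping, assuming clock_gettime() in place of the kernel uptime counter and invented field names:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct iface_activity {
    uint32_t fg_sendts;   /* last foreground send, seconds of uptime */
    uint32_t rt_sendts;   /* last real-time send, seconds of uptime */
};

/* Coarse monotonic uptime in seconds (stands in for net_uptime()). */
static uint32_t
uptime_sec(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint32_t)ts.tv_sec;
}

/* Stamp the class that just transmitted so readers can detect recent activity. */
static void
note_send(struct iface_activity *ia, int realtime, int background)
{
    uint32_t now = uptime_sec();

    if (!background) {
        ia->fg_sendts = now;
    }
    if (realtime) {
        ia->rt_sendts = now;
    }
}

int
main(void)
{
    struct iface_activity ia = { 0, 0 };

    note_send(&ia, /*realtime*/ 1, /*background*/ 0);
    printf("fg=%u rt=%u\n", ia.fg_sendts, ia.rt_sendts);
    return 0;
}
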
*/ if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) { - ifp->if_fg_sendts = (uint32_t)_net_uptime; + ifp->if_fg_sendts = (uint32_t)net_uptime(); if (fg_ts != NULL) { - *fg_ts = (uint32_t)_net_uptime; + *fg_ts = (uint32_t)net_uptime(); } } if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) { - ifp->if_rt_sendts = (uint32_t)_net_uptime; + ifp->if_rt_sendts = (uint32_t)net_uptime(); if (rt_ts != NULL) { - *rt_ts = (uint32_t)_net_uptime; + *rt_ts = (uint32_t)net_uptime(); } } pktlen = p->cp_kpkt->pkt_length; @@ -3259,7 +3237,7 @@ ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq, } static inline errno_t -ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq, +ifnet_enqueue_chain(struct ifnet *ifp, struct ifclassq *ifcq, classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t flush, boolean_t *pdrop) { @@ -3289,18 +3267,18 @@ ifnet_enqueue_netem(void *handle, pktsched_pkt_t *__sized_by(n_pkts)pkts, uint32 ASSERT(n_pkts >= 1); for (i = 0; i < n_pkts - 1; i++) { - (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt, + (void) ifnet_enqueue_single(ifp, ifp->if_snd, &pkts[i].pktsched_pkt, FALSE, &pdrop); } /* flush with the last packet */ - (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt, + (void) ifnet_enqueue_single(ifp, ifp->if_snd, &pkts[i].pktsched_pkt, TRUE, &pdrop); return 0; } static inline errno_t -ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq, +ifnet_enqueue_common_single(struct ifnet *ifp, struct ifclassq *ifcq, classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop) { if (ifp->if_output_netem != NULL) { @@ -3310,7 +3288,7 @@ ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq, *pdrop = drop ? TRUE : FALSE; return error; } else { - return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop); + return ifnet_enqueue_single(ifp, ifcq, pkt, flush, pdrop); } } @@ -3347,7 +3325,7 @@ ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush, } return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - !IF_FULLY_ATTACHED(ifp)) { + !ifnet_is_fully_attached(ifp)) { /* flag tested without lock for performance */ m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_ENQUEUE_IF_NOT_ATTACHED, NULL, 0); *pdrop = TRUE; @@ -3359,7 +3337,7 @@ ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush, } CLASSQ_PKT_INIT_MBUF(&pkt, m); - return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop); + return ifnet_enqueue_common_single(ifp, NULL, &pkt, flush, pdrop); } errno_t @@ -3377,7 +3355,7 @@ ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head, ASSERT(ifp != NULL); ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0); - if (!IF_FULLY_ATTACHED(ifp)) { + if (!ifnet_is_fully_attached(ifp)) { /* flag tested without lock for performance */ m_drop_list(m_head, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_ENQUEUE_IF_NOT_ATTACHED, NULL, 0); *pdrop = TRUE; @@ -3390,13 +3368,13 @@ ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head, CLASSQ_PKT_INIT_MBUF(&head, m_head); CLASSQ_PKT_INIT_MBUF(&tail, m_tail); - return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes, + return ifnet_enqueue_chain(ifp, NULL, &head, &tail, cnt, bytes, flush, pdrop); } #if SKYWALK -static errno_t -ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq, +errno_t +ifnet_enqueue_pkt(struct ifnet *ifp, struct ifclassq *ifcq, struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop) { classq_pkt_t pkt; @@ -3411,7 +3389,7 @@ ifnet_enqueue_pkt_common(struct ifnet *ifp, 
struct ifclassq *ifcq, } return EINVAL; } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) || - !IF_FULLY_ATTACHED(ifp))) { + !ifnet_is_fully_attached(ifp))) { /* flag tested without lock for performance */ pp_free_packet(__DECONST(struct kern_pbufpool *, kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt)); @@ -3425,25 +3403,11 @@ ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq, } CLASSQ_PKT_INIT_PACKET(&pkt, kpkt); - return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop); + return ifnet_enqueue_common_single(ifp, ifcq, &pkt, flush, pdrop); } errno_t -ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt, - boolean_t flush, boolean_t *pdrop) -{ - return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop); -} - -errno_t -ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq, - struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop) -{ - return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop); -} - -static errno_t -ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq, +ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq, struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush, boolean_t *pdrop) { @@ -3454,7 +3418,7 @@ ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq, ASSERT(ifp != NULL); ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0); - if (!IF_FULLY_ATTACHED(ifp)) { + if (!ifnet_is_fully_attached(ifp)) { /* flag tested without lock for performance */ pp_free_packet_chain(k_head, NULL); *pdrop = TRUE; @@ -3467,27 +3431,9 @@ ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq, CLASSQ_PKT_INIT_PACKET(&head, k_head); CLASSQ_PKT_INIT_PACKET(&tail, k_tail); - return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes, + return ifnet_enqueue_chain(ifp, ifcq, &head, &tail, cnt, bytes, flush, pdrop); } - -errno_t -ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head, - struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush, - boolean_t *pdrop) -{ - return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail, - cnt, bytes, flush, pdrop); -} - -errno_t -ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq, - struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt, - uint32_t bytes, boolean_t flush, boolean_t *pdrop) -{ - return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail, - cnt, bytes, flush, pdrop); -} #endif /* SKYWALK */ errno_t @@ -3499,17 +3445,17 @@ ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp) if (ifp == NULL || mp == NULL) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) { + !IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)) { return ENXIO; } - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return ENXIO; } #if SKYWALK ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE)); #endif /* SKYWALK */ - rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, + rc = ifclassq_dequeue(ifp->if_snd, MBUF_SC_UNSPEC, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0); VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); @@ -3528,17 +3474,17 @@ ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc, if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) { + 
!IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)) { return ENXIO; } - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return ENXIO; } #if SKYWALK ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE)); #endif /* SKYWALK */ - rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1, + rc = ifclassq_dequeue(ifp->if_snd, sc, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0); VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); @@ -3558,17 +3504,17 @@ ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit, if (ifp == NULL || head == NULL || pkt_limit < 1) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) { + !IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)) { return ENXIO; } - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return ENXIO; } #if SKYWALK ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE)); #endif /* SKYWALK */ - rc = ifclassq_dequeue(ifp->if_snd, pkt_limit, + rc = ifclassq_dequeue(ifp->if_snd, MBUF_SC_UNSPEC, pkt_limit, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0); VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); @@ -3591,17 +3537,17 @@ ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit, if (ifp == NULL || head == NULL || byte_limit < 1) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) { + !IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)) { return ENXIO; } - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return ENXIO; } #if SKYWALK ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE)); #endif /* SKYWALK */ - rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT, + rc = ifclassq_dequeue(ifp->if_snd, MBUF_SC_UNSPEC, CLASSQ_DEQUEUE_MAX_PKT_LIMIT, byte_limit, &pkt_head, &pkt_tail, cnt, len, 0); VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL)); ifnet_decr_iorefcnt(ifp); @@ -3626,17 +3572,17 @@ ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc, !MBUF_VALID_SC(sc)) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) { + !IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)) { return ENXIO; } - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return ENXIO; } #if SKYWALK ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE)); #endif /* SKYWALK */ - rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit, + rc = ifclassq_dequeue(ifp->if_snd, sc, pkt_limit, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0); VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL)); @@ -3736,7 +3682,7 @@ dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_genera lck_mtx_unlock(&ifp->if_flt_lock); /* Get an io ref count if the interface is attached */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { goto done; } @@ -3853,7 +3799,7 @@ ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code) * At this point it most likely is. We are taking a reference for * deferred processing. 
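
These call sites show the effect of folding ifclassq_dequeue_sc() into ifclassq_dequeue(): callers that do not want a driver-managed class pass MBUF_SC_UNSPEC, and the implementation derives the old drvmgt boolean from that sentinel. A small sketch of collapsing two wrappers into one entry point this way, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

#define SC_UNSPEC 0   /* sentinel: "no specific service class" */
#define SC_VOICE  7

/*
 * One dequeue entry point: a real service class selects the
 * driver-managed path, SC_UNSPEC selects the plain path.
 */
static int
queue_dequeue(int sc, int limit)
{
    bool drvmgt = (sc != SC_UNSPEC);

    if (drvmgt) {
        printf("dequeue up to %d pkts from class %d\n", limit, sc);
    } else {
        printf("dequeue up to %d pkts, any class\n", limit);
    }
    return 0;
}

int
main(void)
{
    queue_dequeue(SC_UNSPEC, 4);   /* shape of the old ifclassq_dequeue() */
    queue_dequeue(SC_VOICE, 4);    /* shape of the old ifclassq_dequeue_sc() */
    return 0;
}
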
*/ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface " "is not attached", __func__, __LINE__, if_name(ifp), ioctl_code); @@ -3933,7 +3879,7 @@ ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code, } /* Get an io ref count if the interface is attached */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return EOPNOTSUPP; } @@ -4045,7 +3991,7 @@ dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback) if (ifp->if_set_bpf_tap) { /* Get an io reference on the interface if it is attached */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return ENXIO; } error = ifp->if_set_bpf_tap(ifp, mode, callback); @@ -4063,7 +4009,7 @@ dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr, const struct sockaddr *verify; proto_media_resolve_multi resolvep; - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return result; } @@ -4287,17 +4233,17 @@ ifnet_lookup(struct ifnet *ifp) * being called when there are outstanding io reference counts. */ int -ifnet_is_attached(struct ifnet *ifp, int refio) +ifnet_get_ioref(struct ifnet *ifp) { - int ret; + bool ret; - lck_mtx_lock_spin(&ifp->if_ref_lock); - if ((ret = IF_FULLY_ATTACHED(ifp))) { - if (refio > 0) { - ifp->if_refio++; + ret = ifnet_is_fully_attached(ifp); + if (ret) { + if (os_ref_retain_try(&ifp->if_refio) == false) { + /* refio became 0 which means it is detaching */ + return false; } } - lck_mtx_unlock(&ifp->if_ref_lock); return ret; } @@ -4330,40 +4276,33 @@ ifnet_decr_pending_thread_count(struct ifnet *ifp) void ifnet_incr_iorefcnt(struct ifnet *ifp) { - lck_mtx_lock_spin(&ifp->if_ref_lock); - VERIFY(IF_FULLY_ATTACHED(ifp)); - VERIFY(ifp->if_refio > 0); - ifp->if_refio++; - lck_mtx_unlock(&ifp->if_ref_lock); -} - -__attribute__((always_inline)) -static void -ifnet_decr_iorefcnt_locked(struct ifnet *ifp) -{ - LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED); - - VERIFY(ifp->if_refio > 0); - VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)); - - ifp->if_refio--; - VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0); - - /* - * if there are no more outstanding io references, wakeup the - * ifnet_detach thread if detaching flag is set. - */ - if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) { - wakeup(&(ifp->if_refio)); - } + os_ref_retain(&ifp->if_refio); } void ifnet_decr_iorefcnt(struct ifnet *ifp) { - lck_mtx_lock_spin(&ifp->if_ref_lock); - ifnet_decr_iorefcnt_locked(ifp); - lck_mtx_unlock(&ifp->if_ref_lock); + /* + * if there are no more outstanding io references, wakeup the + * ifnet_detach thread. + */ + if (os_ref_release_relaxed(&ifp->if_refio) == 0) { + lck_mtx_lock(&ifp->if_ref_lock); + wakeup(&(ifp->if_refio)); + lck_mtx_unlock(&ifp->if_ref_lock); + } +} + +static void +ifnet_decr_iorefcnt_locked(struct ifnet *ifp) +{ + /* + * if there are no more outstanding io references, wakeup the + * ifnet_detach thread. 
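
ifnet_get_ioref() above replaces the if_ref_lock-protected if_refio counter with an os_ref count: the attach state is checked first, the retain is a try operation that fails once the count has already reached zero (detach in progress), and the release side wakes the detaching thread when the last reference drops. A standalone approximation using C11 atomics and a condition variable in place of os_ref and wakeup(); all names are invented:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct iface {
    atomic_uint     refio;     /* outstanding I/O references */
    bool            attached;
    pthread_mutex_t ref_lock;  /* only for the detach wait/wake */
    pthread_cond_t  ref_cv;
};

/* Try to take an I/O reference; fails if detach already dropped it to 0. */
static bool
iface_get_ioref(struct iface *ifp)
{
    unsigned cur = atomic_load(&ifp->refio);

    if (!ifp->attached) {
        return false;
    }
    do {
        if (cur == 0) {
            return false;       /* detaching: refuse new references */
        }
    } while (!atomic_compare_exchange_weak(&ifp->refio, &cur, cur + 1));
    return true;
}

/* Drop an I/O reference; wake the detach thread when the last one goes. */
static void
iface_put_ioref(struct iface *ifp)
{
    if (atomic_fetch_sub(&ifp->refio, 1) == 1) {
        pthread_mutex_lock(&ifp->ref_lock);
        pthread_cond_broadcast(&ifp->ref_cv);
        pthread_mutex_unlock(&ifp->ref_lock);
    }
}

int
main(void)
{
    struct iface ifp = {
        .refio = 1, .attached = true,
        .ref_lock = PTHREAD_MUTEX_INITIALIZER,
        .ref_cv = PTHREAD_COND_INITIALIZER,
    };

    if (iface_get_ioref(&ifp)) {
        printf("got ioref, count now %u\n", atomic_load(&ifp.refio));
        iface_put_ioref(&ifp);
    }
    return 0;
}
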
+ */ + if (os_ref_release_relaxed(&ifp->if_refio) == 0) { + wakeup(&(ifp->if_refio)); + } } boolean_t @@ -4371,12 +4310,14 @@ ifnet_datamov_begin(struct ifnet *ifp) { boolean_t ret; - lck_mtx_lock_spin(&ifp->if_ref_lock); - if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) { - ifp->if_refio++; - ifp->if_datamov++; + ret = ifnet_is_attached_and_ready(ifp); + if (ret) { + if (os_ref_retain_try(&ifp->if_refio) == false) { + /* refio became 0 which means it is detaching */ + return false; + } + os_ref_retain_mask(&ifp->if_datamov, IF_DATAMOV_BITS, &if_datamovgrp); } - lck_mtx_unlock(&ifp->if_ref_lock); DTRACE_IP2(datamov__begin, struct ifnet *, ifp, boolean_t, ret); return ret; @@ -4385,19 +4326,20 @@ ifnet_datamov_begin(struct ifnet *ifp) void ifnet_datamov_end(struct ifnet *ifp) { - lck_mtx_lock_spin(&ifp->if_ref_lock); - VERIFY(ifp->if_datamov > 0); + uint32_t datamov; /* * if there's no more thread moving data, wakeup any * drainers that's blocked waiting for this. */ - if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) { + datamov = os_ref_release_raw_relaxed_mask(&ifp->if_datamov, IF_DATAMOV_BITS, &if_datamovgrp); + if (datamov >> IF_DATAMOV_BITS == 1 && (datamov & IF_DATAMOV_DRAINING)) { + lck_mtx_lock(&ifp->if_ref_lock); DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp)); DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp); wakeup(&(ifp->if_datamov)); + lck_mtx_unlock(&ifp->if_ref_lock); } - ifnet_decr_iorefcnt_locked(ifp); - lck_mtx_unlock(&ifp->if_ref_lock); + ifnet_decr_iorefcnt(ifp); DTRACE_IP1(datamov__end, struct ifnet *, ifp); } @@ -4406,14 +4348,14 @@ static void ifnet_datamov_suspend_locked(struct ifnet *ifp) { LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED); - ifp->if_refio++; + ifnet_incr_iorefcnt(ifp); if (ifp->if_suspend++ == 0) { VERIFY(ifp->if_refflags & IFRF_READY); ifp->if_refflags &= ~IFRF_READY; } } -void +static void ifnet_datamov_suspend(struct ifnet *ifp) { lck_mtx_lock_spin(&ifp->if_ref_lock); @@ -4444,8 +4386,8 @@ ifnet_datamov_drain(struct ifnet *ifp) /* data movement must already be suspended */ VERIFY(ifp->if_suspend > 0); VERIFY(!(ifp->if_refflags & IFRF_READY)); - ifp->if_drainers++; - while (ifp->if_datamov != 0) { + os_atomic_or(&ifp->if_datamov, IF_DATAMOV_DRAINING, relaxed); + while (os_ref_get_count_mask(&ifp->if_datamov, IF_DATAMOV_BITS) > 1) { DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n", if_name(ifp)); DTRACE_IP1(datamov__wait, struct ifnet *, ifp); @@ -4454,13 +4396,12 @@ ifnet_datamov_drain(struct ifnet *ifp) DTRACE_IP1(datamov__wake, struct ifnet *, ifp); } VERIFY(!(ifp->if_refflags & IFRF_READY)); - VERIFY(ifp->if_drainers > 0); - ifp->if_drainers--; + os_atomic_andnot(&ifp->if_datamov, IF_DATAMOV_DRAINING, relaxed); lck_mtx_unlock(&ifp->if_ref_lock); /* purge the interface queues */ if ((ifp->if_eflags & IFEF_TXSTART) != 0) { - if_qflush_snd(ifp, false); + if_qflush(ifp, ifp->if_snd); } } @@ -4502,7 +4443,7 @@ dlil_attach_protocol(struct if_proto *proto, return EINVAL; } - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached", __func__, if_name(ifp)); return ENXIO; @@ -4837,7 +4778,7 @@ ifproto_media_send_arp(struct ifnet *ifp, u_short arpop, } extern int if_next_index(void); -extern int tcp_ecn_outbound; +extern int tcp_ecn; void dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq) @@ -4853,8 +4794,8 @@ dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq) sflags |= PKTSCHEDF_QALG_DELAYBASED; } - if (ifp->if_output_sched_model == - 
IFNET_SCHED_MODEL_DRIVER_MANAGED) { + if (ifp->if_output_sched_model & IFNET_SCHED_DRIVER_MANGED_MODELS) { + VERIFY(IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)); sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED; } /* Inherit drop limit from the default queue */ @@ -5050,7 +4991,8 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL || ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED || - ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL); + ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL || + ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL_DM); dlil_ifclassq_setup(ifp, ifp->if_snd); @@ -5229,15 +5171,6 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN; } - /* - * Enable ECN capability on this interface depending on the - * value of ECN global setting - */ - if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) { - if_set_eflags(ifp, IFEF_ECN_ENABLE); - if_clear_eflags(ifp, IFEF_ECN_DISABLE); - } - /* * Built-in Cyclops always on policy for WiFi infra */ @@ -5330,6 +5263,8 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) ifnet_lock_exclusive(ifp); lck_mtx_lock_spin(&ifp->if_ref_lock); ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */ + os_ref_init(&ifp->if_refio, &if_refiogrp); + os_ref_init_mask(&ifp->if_datamov, IF_DATAMOV_BITS, &if_datamovgrp, 0); lck_mtx_unlock(&ifp->if_ref_lock); if (net_rtref) { /* boot-args override; enable idle notification */ @@ -5366,11 +5301,8 @@ ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE); - if (dlil_verbose) { - DLIL_PRINTF("%s: attached%s\n", if_name(ifp), - (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : ""); - } - + os_log(OS_LOG_DEFAULT, "%s: attached%s\n", if_name(ifp), + (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : ""); return 0; } @@ -5406,7 +5338,7 @@ ifnet_detach(ifnet_t ifp) * IMPORTANT NOTE * * Any field in the ifnet that relies on IF_FULLY_ATTACHED() - * or equivalently, ifnet_is_attached(ifp, 1), can't be modified + * or equivalently, ifnet_get_ioref(ifp, 1), can't be modified * until after we've waited for all I/O references to drain * in ifnet_detach_final(). */ @@ -5451,9 +5383,8 @@ ifnet_detach(ifnet_t ifp) ifnet_flowadv(ifp->if_flowhash); } - /* Reset ECN enable/disable flags */ /* Reset CLAT46 flag */ - if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46); + if_clear_eflags(ifp, IFEF_CLAT46); /* * We do not reset the TCP keep alive counters in case @@ -5683,7 +5614,6 @@ ifnet_detach_final(struct ifnet *ifp) struct ifaddr *ifa; ifnet_detached_func if_free; int i; - bool waited = false; /* Let BPF know we're detaching */ bpfdetach(ifp); @@ -5709,19 +5639,22 @@ ifnet_detach_final(struct ifnet *ifp) * before we proceed with ifnet_detach. This is not a * common case, so block without using a continuation. 
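
The datamov changes in the surrounding hunks pack a draining flag into the low bit of the if_datamov reference count (IF_DATAMOV_BITS / IF_DATAMOV_DRAINING), so ifnet_datamov_end() can learn from a single atomic release whether a drainer is waiting to be woken. A simplified model of packing a flag next to a counter in one atomic word, with invented names and plain shifts in place of the os_ref_*_mask helpers:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define FLAG_BITS     1u
#define FLAG_DRAINING 1u           /* low bit: a drainer is waiting */
#define COUNT_ONE     (1u << FLAG_BITS)

static atomic_uint datamov = COUNT_ONE;   /* count = 1 (baseline), no flags */

static void
datamov_begin(void)
{
    atomic_fetch_add(&datamov, COUNT_ONE);
}

/* Returns true when the last mover finishes while a drainer is waiting. */
static bool
datamov_end(void)
{
    unsigned prev = atomic_fetch_sub(&datamov, COUNT_ONE);
    unsigned newval = prev - COUNT_ONE;

    return (newval >> FLAG_BITS) == 1 && (newval & FLAG_DRAINING);
}

static void
datamov_mark_draining(void)
{
    atomic_fetch_or(&datamov, FLAG_DRAINING);
}

int
main(void)
{
    datamov_begin();            /* a data path is active: count = 2 */
    datamov_mark_draining();    /* detach wants to drain */
    printf("wake drainer: %d\n", datamov_end());   /* back to count = 1 */
    return 0;
}
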
*/ - while (ifp->if_refio > 0) { - waited = true; - DLIL_PRINTF("%s: %s waiting for IO references to drain\n", - __func__, if_name(ifp)); - (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock, - (PZERO - 1), "ifnet_ioref_wait", NULL); + if (os_ref_release_relaxed(&ifp->if_refio) > 0) { + bool waited = false; + + while (os_ref_get_count(&ifp->if_refio) > 0) { + waited = true; + DLIL_PRINTF("%s: %s waiting for IO references to drain\n", + __func__, if_name(ifp)); + (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock, + (PZERO - 1), "ifnet_ioref_wait", NULL); + } + if (waited) { + DLIL_PRINTF("%s: %s IO references drained\n", + __func__, if_name(ifp)); + } } - if (waited) { - DLIL_PRINTF("%s: %s IO references drained\n", - __func__, if_name(ifp)); - } - VERIFY(ifp->if_datamov == 0); - VERIFY(ifp->if_drainers == 0); + os_ref_release_last_mask(&ifp->if_datamov, IF_DATAMOV_BITS, &if_datamovgrp); VERIFY(ifp->if_suspend == 0); ifp->if_refflags &= ~IFRF_READY; lck_mtx_unlock(&ifp->if_ref_lock); @@ -6015,7 +5948,7 @@ ifnet_detach_final(struct ifnet *ifp) lck_mtx_unlock(&ifp->if_flt_lock); /* Last chance to drain send queue */ - if_qflush_snd(ifp, 0); + if_qflush(ifp, ifp->if_snd); /* Last chance to cleanup any cached route */ lck_mtx_lock(&ifp->if_cached_route_lock); @@ -6365,20 +6298,10 @@ if_lqm_update(struct ifnet *ifp, int lqm, int locked) VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX); - /* Normalize to edge */ - if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) { - lqm = IFNET_LQM_THRESH_ABORT; + lqm = ifnet_lqm_normalize(lqm); + if (lqm == IFNET_LQM_THRESH_ABORT) { os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed); inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST); - } else if (lqm > IFNET_LQM_THRESH_ABORT && - lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) { - lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE; - } else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE && - lqm <= IFNET_LQM_THRESH_POOR) { - lqm = IFNET_LQM_THRESH_POOR; - } else if (lqm > IFNET_LQM_THRESH_POOR && - lqm <= IFNET_LQM_THRESH_GOOD) { - lqm = IFNET_LQM_THRESH_GOOD; } /* @@ -6618,6 +6541,9 @@ if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe) if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY); } + os_log(OS_LOG_DEFAULT, "interface probing on %s set to %u by %s:%d", + if_name(ifp), conn_probe, proc_best_name(current_proc()), proc_selfpid()); + #if NECP necp_update_all_clients(); #endif /* NECP */ @@ -6883,8 +6809,8 @@ dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN], VERIFY(ifp != NULL); VERIFY(modid != NULL); - _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN); - _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN); + static_assert(sizeof(kev.modid) == DLIL_MODIDLEN); + static_assert(sizeof(kev.info) == DLIL_MODARGLEN); bzero(&kev, sizeof(kev)); @@ -6962,6 +6888,7 @@ int ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level) { struct ifclassq *ifq; + cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF }; int err = 0; if (!(ifp->if_eflags & IFEF_TXSTART)) { @@ -6971,15 +6898,8 @@ ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level) *level = IFNET_THROTTLE_OFF; ifq = ifp->if_snd; - IFCQ_LOCK(ifq); - /* Throttling works only for IFCQ, not ALTQ instances */ - if (IFCQ_IS_ENABLED(ifq)) { - cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF }; - - err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req); - *level = req.level; - } - IFCQ_UNLOCK(ifq); + err = ifclassq_request(ifq, CLASSQRQ_THROTTLE, &req, false); + *level = req.level; return err; } @@ -6988,6 +6908,7 @@ int ifnet_set_throttle(struct ifnet *ifp, u_int32_t 
level) { struct ifclassq *ifq; + cqrq_throttle_t req = { 1, level }; int err = 0; if (!(ifp->if_eflags & IFEF_TXSTART)) { @@ -7004,13 +6925,7 @@ ifnet_set_throttle(struct ifnet *ifp, u_int32_t level) return EINVAL; } - IFCQ_LOCK(ifq); - if (IFCQ_IS_ENABLED(ifq)) { - cqrq_throttle_t req = { 1, level }; - - err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req); - } - IFCQ_UNLOCK(ifq); + err = ifclassq_request(ifq, CLASSQRQ_THROTTLE, &req, false); if (err == 0) { DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp), @@ -7178,7 +7093,7 @@ ifnet_flowid(struct ifnet *ifp, uint32_t *flowid) if (ifp == NULL || flowid == NULL) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - !IF_FULLY_ATTACHED(ifp)) { + !ifnet_is_fully_attached(ifp)) { return ENXIO; } @@ -7195,7 +7110,7 @@ ifnet_disable_output(struct ifnet *ifp) if (ifp == NULL) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - !IF_FULLY_ATTACHED(ifp)) { + !ifnet_is_fully_attached(ifp)) { return ENXIO; } @@ -7216,7 +7131,7 @@ ifnet_enable_output(struct ifnet *ifp) if (ifp == NULL) { return EINVAL; } else if (!(ifp->if_eflags & IFEF_TXSTART) || - !IF_FULLY_ATTACHED(ifp)) { + !ifnet_is_fully_attached(ifp)) { return ENXIO; } @@ -7239,7 +7154,7 @@ ifnet_flowadv(uint32_t flowhash) ifp = ifce->ifce_ifp; /* flow hash gets recalculated per attach, so check */ - if (ifnet_is_attached(ifp, 1)) { + if (ifnet_get_ioref(ifp)) { if (ifp->if_flowhash == flowhash) { lck_mtx_lock_spin(&ifp->if_start_lock); if ((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) == 0) { @@ -7331,7 +7246,7 @@ ifnet_fc_get(uint32_t flowhash) /* become regular mutex */ lck_mtx_convert_spin(&ifnet_fc_lock); - if (!ifnet_is_attached(ifp, 0)) { + if (!ifnet_is_fully_attached(ifp)) { /* * This ifp is not attached or in the process of being * detached; just don't process it. @@ -7683,7 +7598,7 @@ dlil_verify_sum16(void) int n; /* Make sure test data plus extra room for alignment fits in cluster */ - _CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES); + static_assert((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES); kprintf("DLIL: running SUM16 self-tests ... 
"); @@ -7808,7 +7723,7 @@ dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1) #pragma unused(arg1) ifnet_ref_t ifp = arg0; - if (ifnet_is_attached(ifp, 1)) { + if (ifnet_get_ioref(ifp)) { nstat_ifnet_threshold_reached(ifp->if_index); ifnet_decr_iorefcnt(ifp); } @@ -7904,12 +7819,18 @@ ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid) return FALSE; } __private_extern__ void -ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count) +ifnet_update_inet_traffic_rule_count(ifnet_t ifp, uint32_t count) { - os_atomic_store(&ifp->if_traffic_rule_count, count, release); + os_atomic_store(&ifp->if_inet_traffic_rule_count, count, relaxed); ifnet_update_traffic_rule_genid(ifp); } +__private_extern__ void +ifnet_update_eth_traffic_rule_count(ifnet_t ifp, uint32_t count) +{ + os_atomic_store(&ifp->if_eth_traffic_rule_count, count, relaxed); + ifnet_update_traffic_rule_genid(ifp); +} #if SKYWALK static bool @@ -7928,6 +7849,7 @@ net_check_compatible_if_filter(struct ifnet *ifp) } #endif /* SKYWALK */ +#if CONFIG_MBUF_MCACHE #define DUMP_BUF_CHK() { \ clen -= k; \ if (clen < 1) \ @@ -7935,6 +7857,7 @@ net_check_compatible_if_filter(struct ifnet *ifp) c += k; \ } +#if NETWORKING int dlil_dump_top_if_qlen(char *__counted_by(str_len), int str_len); int dlil_dump_top_if_qlen(char *__counted_by(str_len) str, int str_len) @@ -7976,3 +7899,5 @@ dlil_dump_top_if_qlen(char *__counted_by(str_len) str, int str_len) done: return str_len - clen; } +#endif /* NETWORKING */ +#endif /* CONFIG_MBUF_MCACHE */ diff --git a/bsd/net/dlil_ctl.c b/bsd/net/dlil_ctl.c index b953a738c..940f129d3 100644 --- a/bsd/net/dlil_ctl.c +++ b/bsd/net/dlil_ctl.c @@ -141,10 +141,6 @@ dlil_if_acquire(uint32_t family, const void *uniqueid __sized_by(uniqueid_len), ifp1 = (struct ifnet *)dlifp1; dlifp1->dl_if_flags = DLIF_INUSE; - if (ifnet_debug) { - dlifp1->dl_if_flags |= DLIF_DEBUG; - dlifp1->dl_if_trace = dlil_if_trace; - } ifp1->if_name = tsnprintf(dlifp1->dl_if_namestorage, sizeof(dlifp1->dl_if_namestorage), ""); ifp1->if_xname = tsnprintf(dlifp1->dl_if_xnamestorage, sizeof(dlifp1->dl_if_xnamestorage), ""); @@ -218,31 +214,6 @@ end: return ret; } -void -dlil_if_trace(struct dlil_ifnet *dl_if, int refhold) -{ - struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if; - ctrace_t *tr; - u_int32_t idx; - u_int16_t *cnt; - - if (!(dl_if->dl_if_flags & DLIF_DEBUG)) { - panic("%s: dl_if %p has no debug structure", __func__, dl_if); - /* NOTREACHED */ - } - - if (refhold) { - cnt = &dl_if_dbg->dldbg_if_refhold_cnt; - tr = dl_if_dbg->dldbg_if_refhold; - } else { - cnt = &dl_if_dbg->dldbg_if_refrele_cnt; - tr = dl_if_dbg->dldbg_if_refrele; - } - - idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE; - ctrace_record(&tr[idx]); -} - /* * Stats management. 
*/ diff --git a/bsd/net/dlil_input.c b/bsd/net/dlil_input.c index 1c7071067..55b9dbc92 100644 --- a/bsd/net/dlil_input.c +++ b/bsd/net/dlil_input.c @@ -27,6 +27,7 @@ */ #include +#include #include #include #include @@ -736,6 +737,31 @@ dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header, } } +#if (DEVELOPMENT || DEBUG) +static void +dlil_input_process_wake_packet(ifnet_t ifp, protocol_family_t protocol_family, mbuf_ref_t m) +{ + /* + * For testing we do not care about broadcast and multicast packets as + * they are not as controllable as unicast traffic + */ + if (check_wake_mbuf(ifp, protocol_family, m) == false) { + return; + } + if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) { + if ((protocol_family == PF_INET || protocol_family == PF_INET6) && + (m->m_flags & (M_BCAST | M_MCAST)) == 0) { + /* + * This is a one-shot command + */ + ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT; + + m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT; + } + } +} +#endif /* (DEVELOPMENT || DEBUG) */ + static void dlil_input_packet_list_common(struct ifnet *ifp_param, mbuf_ref_t m, u_int32_t cnt, ifnet_model_t mode, boolean_t ext) @@ -839,31 +865,9 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, mbuf_ref_t m, } #if (DEVELOPMENT || DEBUG) - /* - * For testing we do not care about broadcast and multicast packets as - * they are not as controllable as unicast traffic - */ - if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) { - if ((protocol_family == PF_INET || protocol_family == PF_INET6) && - (m->m_flags & (M_BCAST | M_MCAST)) == 0) { - /* - * This is a one-shot command - */ - ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT; - m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT; - } - } + /* For testing only */ + dlil_input_process_wake_packet(ifp, protocol_family, m); #endif /* (DEVELOPMENT || DEBUG) */ - if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) { - char buffer[64]; - size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer)); - - os_log(OS_LOG_DEFAULT, "wake packet from %s len %d", - ifp->if_xname, m_pktlen(m)); - if (mbuf_copydata(m, 0, buflen, buffer) == 0) { - log_hexdump(buffer, buflen); - } - } pktap_input(ifp, protocol_family, m, frame_header); @@ -1047,7 +1051,7 @@ skip_clat: } if (ifproto == NULL) { /* no protocol for this packet, discard */ - m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_NO_PROTO, NULL, 0); + m_drop_extended(m, ifp, frame_header, DROPTAP_FLAG_DIR_IN, DROP_REASON_DLIL_NO_PROTO, NULL, 0); goto next; } if (ifproto != last_ifproto) { @@ -1126,6 +1130,12 @@ dlil_input_thread_func(void *v, wait_result_t w) "dlil_input_%s", ifp->if_xname); thread_set_thread_name(inp->dlth_thread, thread_name); +#if CONFIG_THREAD_GROUPS + if (IFNET_REQUIRES_CELL_GROUP(ifp)) { + thread_group_join_cellular(); + } +#endif /* CONFIG_THREAD_GROUPS */ + lck_mtx_lock(&inp->dlth_lock); VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING))); (void) assert_wait(&inp->dlth_flags, THREAD_UNINT); @@ -1668,7 +1678,7 @@ skip: * hold an IO refcnt on the interface to prevent it from * being detached (will be release below.) 
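 *
 * [Editor's aside, not part of the imported hunk: throughout this import,
 * call sites that used ifnet_is_attached(ifp, 1) to take an I/O reference
 * are switched to ifnet_get_ioref(ifp), and the plain "fully attached"
 * checks move to ifnet_is_fully_attached(ifp). The usage pattern itself is
 * unchanged; a minimal sketch, where do_work() is only a hypothetical
 * placeholder and the two ifnet calls are taken verbatim from the hunks in
 * this patch:
 *
 *     if (ifnet_get_ioref(ifp)) {
 *             do_work(ifp);               <- placeholder for the real work;
 *                                            the ifnet cannot detach while
 *                                            the I/O reference is held
 *             ifnet_decr_iorefcnt(ifp);   <- drop the I/O reference when done
 *     }
 * ]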
*/ - if (poll_req != 0 && ifnet_is_attached(ifp, 1)) { + if (poll_req != 0 && ifnet_get_ioref(ifp)) { struct ifnet_model_params p = { .model = mode, .reserved = { 0 } }; diff --git a/bsd/net/dlil_output.c b/bsd/net/dlil_output.c index 20afd8358..fa97daf45 100644 --- a/bsd/net/dlil_output.c +++ b/bsd/net/dlil_output.c @@ -86,6 +86,8 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, rtentry_ref_t rt = NULL; u_int16_t m_loop_set = 0; bool raw = (flags & DLIL_OUTPUT_FLAGS_RAW) != 0; + uint64_t qset_id; + uint8_t qset_id_valid_flag; KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); @@ -146,7 +148,7 @@ preout_again: * Go to the next packet if translation fails */ if (retval != 0) { - m_drop(m, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_CLAT64, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_CLAT64, NULL, 0); m = NULL; ip6stat.ip6s_clat464_out_drop++; /* Make sure that the proto family is PF_INET */ @@ -171,7 +173,7 @@ preout_again: if (proto == NULL) { ifnet_lock_done(ifp); retval = ENXIO; - m_drop(m, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_CLAT64, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_CLAT64, NULL, 0); m = NULL; goto cleanup; } @@ -224,7 +226,7 @@ preout_again: if (retval == EJUSTRETURN) { goto preout_again; } - m_drop(m, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_PRE_OUTPUT, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_PRE_OUTPUT, NULL, 0); m = NULL; goto cleanup; } @@ -234,6 +236,10 @@ preout_again: nanouptime(&now); net_timernsec(&now, &now_nsec); + qset_id = m->m_pkthdr.pkt_mpriv_qsetid; + qset_id_valid_flag = (m->m_pkthdr.pkt_ext_flags & PKTF_EXT_QSET_ID_VALID) + ? PKTF_EXT_QSET_ID_VALID : 0; + do { m_add_hdr_crumb_interface_output(m, ifp->if_index, false); /* @@ -270,7 +276,7 @@ preout_again: retval = dlil_clat46(ifp, &proto_family, &m); /* Goto the next packet if the translation fails */ if (retval != 0) { - m_drop(m, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_CLAT64, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_CLAT64, NULL, 0); m = NULL; ip6stat.ip6s_clat464_out_drop++; goto next; @@ -304,7 +310,7 @@ preout_again: frame_type, &pre, &post); if (retval != 0) { if (retval != EJUSTRETURN) { - m_drop(m, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_IF_FRAMER, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_IF_FRAMER, NULL, 0); } goto next; } @@ -346,7 +352,7 @@ preout_again: retval = dlil_interface_filters_output(ifp, &m, proto_family); if (retval != 0) { if (retval != EJUSTRETURN) { - m_drop(m, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_IF_FILTER, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_IF_FILTER, NULL, 0); } goto next; } @@ -377,7 +383,7 @@ preout_again: */ if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) { retval = EMSGSIZE; - m_drop(m, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_TSO_NOT_OK, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT, DROP_REASON_DLIL_TSO_NOT_OK, NULL, 0); goto cleanup; } @@ -459,6 +465,12 @@ preout_again: } retval = 0; } + if (retval == EQCONGESTED) { + if (adv != NULL && adv->code == FADV_SUCCESS) { + adv->code = FADV_CONGESTED; + } + retval = 0; + } if (retval == 0 && flen > 0) { fbytes += flen; fpkts++; @@ -477,6 +489,8 @@ next: m = packetlist; if (m != NULL) { m->m_flags |= m_loop_set; + m->m_pkthdr.pkt_ext_flags |= qset_id_valid_flag; + m->m_pkthdr.pkt_mpriv_qsetid = qset_id; packetlist = packetlist->m_nextpkt; m->m_nextpkt = NULL; } @@ -492,13 +506,17 @@ next: if (ifp->if_eflags & 
IFEF_SENDLIST) { retval = (*ifp->if_output_dlil)(ifp, send_head); if (retval == EQFULL || retval == EQSUSPENDED) { - if (adv != NULL) { + if (adv != NULL && adv->code != FADV_CONGESTED) { adv->code = (retval == EQFULL ? FADV_FLOW_CONTROLLED : FADV_SUSPENDED); } retval = 0; } + if (retval == EQCONGESTED && adv != NULL) { + adv->code = FADV_CONGESTED; + retval = 0; + } if (retval == 0 && flen > 0) { fbytes += flen; fpkts++; @@ -517,13 +535,17 @@ next: send_m->m_nextpkt = NULL; retval = (*ifp->if_output_dlil)(ifp, send_m); if (retval == EQFULL || retval == EQSUSPENDED) { - if (adv != NULL) { + if (adv != NULL && adv->code != FADV_CONGESTED) { adv->code = (retval == EQFULL ? FADV_FLOW_CONTROLLED : FADV_SUSPENDED); } retval = 0; } + if (retval == EQCONGESTED && adv != NULL) { + adv->code = FADV_CONGESTED; + retval = 0; + } if (retval == 0) { enq_cnt++; if (flen > 0) { diff --git a/bsd/net/dlil_subr.c b/bsd/net/dlil_subr.c index 5aab4444c..dcb8c4f2c 100644 --- a/bsd/net/dlil_subr.c +++ b/bsd/net/dlil_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2024 Apple Inc. All rights reserved. + * Copyright (c) 1999-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -57,32 +57,6 @@ uint32_t dlil_pending_thread_cnt = 0; __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); __private_extern__ void if_rtproto_del(struct ifnet *ifp, int protocol); - -/* - * Allocation zones - */ -unsigned int dlif_size; /* size of dlil_ifnet to allocate */ -unsigned int dlif_bufsize; /* size of dlif_size + headroom */ -ZONE_DECLARE(dlif_zone, struct dlil_ifnet); -#define DLIF_ZONE_NAME "ifnet" /* zone name */ -zone_t dlif_zone; /* zone for dlil_ifnet */ - -unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */ -unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */ -ZONE_DECLARE(dlif_tcpstat_zone, struct ifnet_tcpstat); -#define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */ -zone_t dlif_tcpstat_zone; /* zone for tcpstat_local */ - -unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */ -unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */ -ZONE_DECLARE(dlif_udpstat_zone, struct ifnet_udpstat); -#define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */ -zone_t dlif_udpstat_zone; /* zone for udpstat_local */ - -KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT); - -KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT); - /* * Utility routines */ @@ -134,34 +108,6 @@ packet_has_vlan_tag(struct mbuf * m) return tag != 0; } -void -log_hexdump(void *__sized_by(len) data, size_t len) -{ - size_t i, j, k; - unsigned char *ptr = (unsigned char *)data; -#define MAX_DUMP_BUF 32 - unsigned char buf[3 * MAX_DUMP_BUF + 1]; - - for (i = 0; i < len; i += MAX_DUMP_BUF) { - for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) { - unsigned char msnbl = ptr[j] >> 4; - unsigned char lsnbl = ptr[j] & 0x0f; - - buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10; - buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10; - - if ((j % 2) == 1) { - buf[k++] = ' '; - } - if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) { - buf[k++] = ' '; - } - } - buf[k] = 0; - os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf); - } -} - /* * Monitor functions. */ @@ -207,149 +153,75 @@ if_flt_monitor_leave(struct ifnet *ifp) } } -/* - * Allocation routines - */ -void -dlil_allocation_zones_init(void) -{ - dlif_size = (ifnet_debug == 0) ? 
sizeof(struct dlil_ifnet) : - sizeof(struct dlil_ifnet_dbg); - /* Enforce 64-bit alignment for dlil_ifnet structure */ - dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t); - dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t)); - dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM); - - dlif_tcpstat_size = sizeof(struct tcpstat_local); - /* Enforce 64-bit alignment for tcpstat_local structure */ - dlif_tcpstat_bufsize = - dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t); - dlif_tcpstat_bufsize = (uint32_t) - P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t)); - dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME, - dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM); - - dlif_udpstat_size = sizeof(struct udpstat_local); - /* Enforce 64-bit alignment for udpstat_local structure */ - dlif_udpstat_bufsize = - dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t); - dlif_udpstat_bufsize = (uint32_t) - P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t)); - dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME, - dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM); -} - -static void -_dlil_alloc_aligned_object(struct zone *zone, - size_t buffer_size, void *__indexable *__single pbuffer, - size_t object_size, void *__indexable *__single pobject) -{ - void *base, *buf, **pbuf; - - void *__unsafe_indexable addr = __zalloc_flags(zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); - __builtin_assume(addr != NULL); - buf = __unsafe_forge_bidi_indexable(void*, addr, buffer_size); - - /* Get the 64-bit aligned base address for this object */ - base = (void*)((char*)buf + (P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t), sizeof(u_int64_t)) - (intptr_t)buf)); - VERIFY(((intptr_t)base + object_size) <= - ((intptr_t)buf + buffer_size)); - - /* - * Wind back a pointer size from the aligned base and - * save the original address so we can free it later. 
- */ - pbuf = __unsafe_forge_bidi_indexable(void**, (intptr_t)base - sizeof(void *), sizeof(void *)); - *pbuf = buf; - *pbuffer = buf; - *pobject = base; -} - -static void -_dlil_free_aligned_object(struct zone *zone, void *pobject) -{ - if (pobject != NULL) { - void *__single *pbuf; - pbuf = __unsafe_forge_single(void**, ((intptr_t)pobject - sizeof(void*))); - zfree(zone, *pbuf); - } -} struct dlil_ifnet * dlif_ifnet_alloc(void) { - void *__indexable base, *__indexable buf; - _dlil_alloc_aligned_object(dlif_zone, - dlif_bufsize, &buf, - dlif_size, &base); - - return base; + return kalloc_type(struct dlil_ifnet, Z_WAITOK | Z_ZERO | Z_NOFAIL); } void dlif_ifnet_free(struct dlil_ifnet *ifnet) { - _dlil_free_aligned_object(dlif_zone, ifnet); + if (ifnet != NULL) { + kfree_type(struct dlil_ifnet, ifnet); + } } struct ifnet_filter * dlif_filt_alloc(void) { - return zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); + return kalloc_type(struct ifnet_filter, Z_WAITOK | Z_ZERO | Z_NOFAIL); } void dlif_filt_free(struct ifnet_filter *filt) { if (filt != NULL) { - zfree(dlif_filt_zone, filt); + kfree_type(struct ifnet_filter, filt); } } struct if_proto * dlif_proto_alloc(void) { - return zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); + return kalloc_type(struct if_proto, Z_WAITOK | Z_ZERO | Z_NOFAIL); } void dlif_proto_free(struct if_proto *ifproto) { if (ifproto != NULL) { - zfree(dlif_proto_zone, ifproto); + kfree_type(struct if_proto, ifproto); } } struct tcpstat_local * dlif_tcpstat_alloc(void) { - void *__indexable base, *__indexable buf; - _dlil_alloc_aligned_object(dlif_tcpstat_zone, - dlif_tcpstat_bufsize, &buf, - dlif_tcpstat_size, &base); - return base; + return kalloc_type(struct tcpstat_local, Z_WAITOK | Z_ZERO | Z_NOFAIL); } void dlif_tcpstat_free(struct tcpstat_local *if_tcp_stat) { - _dlil_free_aligned_object(dlif_tcpstat_zone, if_tcp_stat); + if (if_tcp_stat != NULL) { + kfree_type(struct tcpstat_local, if_tcp_stat); + } } struct udpstat_local * dlif_udpstat_alloc(void) { - void *__indexable base, *__indexable buf; - _dlil_alloc_aligned_object(dlif_udpstat_zone, - dlif_udpstat_bufsize, &buf, - dlif_udpstat_size, &base); - return base; + return kalloc_type(struct udpstat_local, Z_WAITOK | Z_ZERO | Z_NOFAIL); } void dlif_udpstat_free(struct udpstat_local *if_udp_stat) { - _dlil_free_aligned_object(dlif_tcpstat_zone, if_udp_stat); + if (if_udp_stat != NULL) { + kfree_type(struct udpstat_local, if_udp_stat); + } } struct ifaddr * @@ -538,9 +410,6 @@ dlil_if_ref(struct ifnet *ifp) panic("%s: wraparound refcnt for ifp=%p", __func__, ifp); /* NOTREACHED */ } - if (dl_if->dl_if_trace != NULL) { - (*dl_if->dl_if_trace)(dl_if, TRUE); - } lck_mtx_unlock(&dl_if->dl_if_lock); return 0; @@ -571,9 +440,6 @@ dlil_if_free(struct ifnet *ifp) break; } --dl_if->dl_if_refcnt; - if (dl_if->dl_if_trace != NULL) { - (*dl_if->dl_if_trace)(dl_if, FALSE); - } lck_mtx_unlock(&dl_if->dl_if_lock); if (need_release) { _dlil_if_release(ifp, true); @@ -826,10 +692,18 @@ dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m) boolean_t is_first_frag = TRUE; boolean_t is_last_frag = TRUE; - pbuf_init_mbuf(&pbuf_store, *m, ifp); - pbuf = &pbuf_store; - iph = pbuf->pb_data; + /* + * Ensure that the incoming mbuf chain contains a valid + * IPv4 header in contiguous memory, or exit early. 
+ */ + if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip) || + ((size_t)(*m)->m_len < sizeof(struct ip) && + (*m = m_pullup(*m, sizeof(struct ip))) == NULL)) { + ip6stat.ip6s_clat464_in_tooshort_drop++; + return -1; + } + iph = mtod(*m, struct ip *); osrc = iph->ip_src; odst = iph->ip_dst; proto = iph->ip_p; @@ -839,6 +713,15 @@ dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m) tot_len = ntohs(iph->ip_len); + /* Validate that mbuf contains IP payload equal to `iph->ip_len' */ + if ((size_t)(*m)->m_pkthdr.len < tot_len) { + ip6stat.ip6s_clat464_in_tooshort_drop++; + return -1; + } + + pbuf_init_mbuf(&pbuf_store, *m, ifp); + pbuf = &pbuf_store; + /* * For packets that are not first frags * we only need to adjust CSUM. @@ -884,8 +767,12 @@ dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m) goto cleanup; } - - /* Translate the IP header part first */ + /* + * Translate the IP header part first. + * NOTE: `nat464_translate_46' handles the situation where the value + * `off' is past the end of the mbuf chain that is associated with + * the pbuf, in a graceful manner. + */ error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p, iph->ip_ttl, src_storage, dstsock.sin6_addr, tot_len) == NT_NAT64) ? 0 : -1; @@ -963,7 +850,10 @@ dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m) uint8_t tos = 0; boolean_t is_first_frag = TRUE; - /* Incoming mbuf does not contain valid IP6 header */ + /* + * Ensure that the incoming mbuf chain contains a valid + * IPv6 header in contiguous memory, or exit early. + */ if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) || ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) && (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) { @@ -996,12 +886,13 @@ dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m) * CLAT46 IPv6 address */ if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) { + bool translate = false; pbuf_t pbuf_store, *pbuf = NULL; pbuf_init_mbuf(&pbuf_store, *m, ifp); pbuf = &pbuf_store; /* - * Retrive the local CLAT46 IPv4 address reserved for stateless + * Retrieve the local CLAT46 IPv4 address reserved for stateless * translation. 
*/ ia4_clat_dst = inifa_ifpclatv4(ifp); @@ -1015,11 +906,21 @@ dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m) /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */ dst = &ia4_clat_dst->ia_addr.sin_addr; - if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) { + error = nat464_synthesize_ipv4(ifp, &osrc, &src, &translate); + if (error != 0) { ip6stat.ip6s_clat464_in_v4synthfail_drop++; error = -1; goto cleanup; } + if (!translate) { + /* no translation required */ + if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { + /* only allow icmpv6 */ + ip6stat.ip6s_clat464_in_v4synthfail_drop++; + error = -1; + } + goto cleanup; + } ip6h = pbuf->pb_data; off = sizeof(struct ip6_hdr); @@ -1069,7 +970,7 @@ cleanup: ip6stat.ip6s_clat464_in_invalpbuf_drop++; } - if (error == 0) { + if (error == 0 && translate) { *proto_family = PF_INET; ip6stat.ip6s_clat464_in_success++; } diff --git a/bsd/net/dlil_var_private.h b/bsd/net/dlil_var_private.h index 523dc79e3..d18fbbc46 100644 --- a/bsd/net/dlil_var_private.h +++ b/bsd/net/dlil_var_private.h @@ -164,10 +164,10 @@ #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8)) #define IF_DATA_REQUIRE_ALIGNED_64(f) \ - _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t))) + static_assert(!(offsetof(struct if_data_internal, f) % sizeof(u_int64_t))) #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \ - _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t))) + static_assert(!(offsetof(struct ifnet, if_data.f) % sizeof(u_int64_t))) enum { kProtoKPI_v1 = 1, @@ -180,8 +180,6 @@ enum { #define DLIL_PRINTF kprintf #endif -extern unsigned int ifnet_debug; - extern unsigned int net_rxpoll; extern unsigned int net_affinity; @@ -232,7 +230,6 @@ struct dlil_ifnet { TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */ u_int32_t dl_if_flags; /* flags (below) */ u_int32_t dl_if_refcnt; /* refcnt */ - void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */ void *dl_if_uniqueid __sized_by_or_null(dl_if_uniqueid_len); /* unique interface id */ size_t dl_if_uniqueid_len; /* length of the unique id */ char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */ @@ -251,23 +248,7 @@ struct dlil_ifnet { /* Values for dl_if_flags (private to DLIL) */ #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */ #define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */ -#define DLIF_DEBUG 0x4 /* has debugging info */ -#define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */ - -/* For gdb */ -extern unsigned int if_ref_trace_hist_size; - -struct dlil_ifnet_dbg { - struct dlil_ifnet dldbg_dlif; /* dlil_ifnet */ - u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */ - u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */ - /* - * Circular lists of ifnet_{reference,release} callers. - */ - ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE]; - ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE]; -}; #define DLIL_TO_IFP(s) (&s->dl_if) #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s) @@ -302,7 +283,6 @@ struct proto_input_entry; */ extern kern_return_t dlil_affinity_set(struct thread *, u_int32_t); extern boolean_t packet_has_vlan_tag(struct mbuf * m); -void log_hexdump(void *__sized_by(len) data, size_t len); /* * Monitor routines. 
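[Editor's note on the dlil_subr.c allocation rework above, not part of the
imported patch: the hand-rolled aligned-zone machinery (dlif_zone,
dlif_tcpstat_zone, dlif_udpstat_zone and _dlil_alloc_aligned_object() with its
pointer-stashing trick) is dropped in favor of typed kalloc allocations. A
minimal sketch of the pattern the new dlif_*_alloc()/dlif_*_free() helpers
follow; `struct my_obj', my_obj_alloc() and my_obj_free() are placeholders,
while kalloc_type(), kfree_type() and the Z_WAITOK | Z_ZERO | Z_NOFAIL flags
are used exactly as in the hunks above:

    struct my_obj {                 /* placeholder type, illustration only */
        uint32_t        value;
    };

    static struct my_obj *
    my_obj_alloc(void)
    {
        /* typed, zero-filled, non-failing allocation (cf. dlif_ifnet_alloc) */
        return kalloc_type(struct my_obj, Z_WAITOK | Z_ZERO | Z_NOFAIL);
    }

    static void
    my_obj_free(struct my_obj *obj)
    {
        /* typed free; NULL is tolerated, matching dlif_ifnet_free above */
        if (obj != NULL) {
            kfree_type(struct my_obj, obj);
        }
    }

Since kalloc_type() hands back storage suitably aligned for the declared type,
the manual 64-bit alignment padding carried by the old helpers is presumably
no longer needed.]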
diff --git a/bsd/net/droptap.c b/bsd/net/droptap.c index a4fc7df81..2e4d85258 100644 --- a/bsd/net/droptap.c +++ b/bsd/net/droptap.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -330,6 +331,10 @@ droptap_bpf_tap_packet(kern_packet_t pkt, uint32_t flags, if (kern_packet_get_wake_flag(pkt)) { hdr->pth_flags |= PTH_FLAG_WAKE_PKT; } + /* Need to check the packet flag in case full wake has been requested */ + if (kern_packet_get_lpw_flag(pkt) || if_is_lpw_enabled(ifp)) { + hdr->pth_flags |= PTH_FLAG_LPW; + } hdr->pth_trace_tag = kern_packet_get_trace_tag(pkt); hdr->pth_svc = so_svc2tc((mbuf_svc_class_t) kern_packet_get_service_class(pkt)); @@ -431,6 +436,9 @@ droptap_bpf_tap_mbuf(struct mbuf *m, uint16_t flags, if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) { hdr->pth_flags |= PTH_FLAG_WAKE_PKT; } + if (m->m_pkthdr.pkt_ext_flags & PKTF_EXT_LPW || if_is_lpw_enabled(ifp)) { + hdr->pth_flags |= PTH_FLAG_LPW; + } hdr->pth_svc = so_svc2tc(m->m_pkthdr.pkt_svc); diff --git a/bsd/net/droptap.h b/bsd/net/droptap.h index 6e79aa301..e774ba73a 100644 --- a/bsd/net/droptap.h +++ b/bsd/net/droptap.h @@ -111,6 +111,8 @@ struct droptap_header { #define DROPTAP_IPSEC 6 #define DROPTAP_IP6 7 #define DROPTAP_MPTCP 8 +#define DROPTAP_PF 9 +#define DROPTAP_BRIDGE 10 #define DROPTAP_UNSPEC 0 @@ -142,6 +144,7 @@ struct droptap_header { X(DROP_REASON_AQM_BK_SYS_THROTTLED, DROPTAP_SKYWALK, DROPTAP_AQM, 3, "AQM BK_SYS throttled") \ X(DROP_REASON_AQM_PURGE_FLOW, DROPTAP_SKYWALK, DROPTAP_AQM, 4, "AQM purge flow") \ X(DROP_REASON_AQM_DROP, DROPTAP_SKYWALK, DROPTAP_AQM, 5, "AQM drop") \ + X(DROP_REASON_AQM_HIGH_DELAY, DROPTAP_SKYWALK, DROPTAP_AQM, 6, "AQM drop due to high delay") \ /* Socket */ \ X(DROP_REASON_FULL_SOCK_RCVBUF, DROPTAP_BSD, DROPTAP_SOCK, 1, "Socket receive buffer full") \ /* DLIL */ \ @@ -159,6 +162,32 @@ struct droptap_header { X(DROP_REASON_DLIL_TSO_NOT_OK, DROPTAP_BSD, DROPTAP_DLIL, 12, "DLIL interface TSO not OK") \ /* MPTCP */ \ X(DROP_REASON_MPTCP_INPUT_MALFORMED, DROPTAP_BSD, DROPTAP_MPTCP,1, "MPTCP input packet malformed") \ + X(DROP_REASON_MPTCP_REASSEMBLY_ALLOC, DROPTAP_BSD, DROPTAP_MPTCP,2, "MPTCP reassembly allocation") \ + /* PF */ \ + X(DROP_REASON_PF_UNSPECIFIED, DROPTAP_BSD, DROPTAP_PF, 1, "PF unspecified reason") \ + X(DROP_REASON_PF_UNDERSIZED, DROPTAP_BSD, DROPTAP_PF, 2, "PF undersized") \ + X(DROP_REASON_PF_NO_ROUTE, DROPTAP_BSD, DROPTAP_PF, 3, "PF no route") \ + X(DROP_REASON_PF_NULL_IFP, DROPTAP_BSD, DROPTAP_PF, 4, "PF NULL ifp") \ + X(DROP_REASON_PF_NO_TSO, DROPTAP_BSD, DROPTAP_PF, 5, "PF No TSO?") \ + X(DROP_REASON_PF_CANNOT_FRAGMENT, DROPTAP_BSD, DROPTAP_PF, 6, "PF Cannot fragment") \ + X(DROP_REASON_PF_OVERLAPPING_FRAGMENT, DROPTAP_BSD, DROPTAP_PF, 7, "PF overlapping fragment") \ + X(DROP_REASON_PF_BAD_FRAGMENT, DROPTAP_BSD, DROPTAP_PF, 8, "PF overlapping fragment") \ + X(DROP_REASON_PF_MEM_ALLOC, DROPTAP_BSD, DROPTAP_PF, 9, "PF memory allocation") \ + X(DROP_REASON_PF_DROP, DROPTAP_BSD, DROPTAP_PF, 10, "PF drop") \ + /* BRIDGE */ \ + X(DROP_REASON_BRIDGE_UNSPECIFIED, DROPTAP_BSD, DROPTAP_BRIDGE, 1, "Bridge unspecified reason") \ + X(DROP_REASON_BRIDGE_CHECKSUM, DROPTAP_BSD, DROPTAP_BRIDGE, 2, "Bridge checksum") \ + X(DROP_REASON_BRIDGE_NOT_RUNNING, DROPTAP_BSD, DROPTAP_BRIDGE, 3, "Bridge not running") \ + X(DROP_REASON_BRIDGE_PRIVATE_SEGMENT, DROPTAP_BSD, DROPTAP_BRIDGE, 4, "Bridge private segment") \ + X(DROP_REASON_BRIDGE_NO_PROTO, DROPTAP_BSD, DROPTAP_BRIDGE, 5, "Bridge unknown protocol") \ + X(DROP_REASON_BRIDGE_BAD_PROTO, DROPTAP_BSD, 
DROPTAP_BRIDGE, 6, "Bridge bad protocol") \ + X(DROP_REASON_BRIDGE_MAC_NAT_FAILURE, DROPTAP_BSD, DROPTAP_BRIDGE, 7, "Bridge NAT failure") \ + X(DROP_REASON_BRIDGE_HOST_FILTER, DROPTAP_BSD, DROPTAP_BRIDGE, 8, "Bridge host filter") \ + X(DROP_REASON_BRIDGE_HWASSIST, DROPTAP_BSD, DROPTAP_BRIDGE, 9, "Bridge HW assisst") \ + X(DROP_REASON_BRIDGE_NOREF, DROPTAP_BSD, DROPTAP_BRIDGE, 10, "Bridge noref") \ + X(DROP_REASON_BRIDGE_PF, DROPTAP_BSD, DROPTAP_BRIDGE, 11, "Bridge PF") \ + X(DROP_REASON_BRIDGE_LOOP, DROPTAP_BSD, DROPTAP_BRIDGE, 12, "Bridge loop") \ + X(DROP_REASON_BRIDGE_NOT_A_MEMBER, DROPTAP_BSD, DROPTAP_BRIDGE, 13, "Bridge not a member") \ /* TCP */ \ X(DROP_REASON_TCP_RST, DROPTAP_BSD, DROPTAP_TCP, 1, "TCP connection reset") \ X(DROP_REASON_TCP_REASSEMBLY_ALLOC, DROPTAP_BSD, DROPTAP_TCP, 2, "TCP reassembly allocation") \ @@ -199,7 +228,9 @@ struct droptap_header { X(DROP_REASON_TCP_BAD_ACK, DROPTAP_BSD, DROPTAP_TCP, 37, "TCP bad ACK") \ X(DROP_REASON_TCP_BAD_RST, DROPTAP_BSD, DROPTAP_TCP, 38, "TCP bad RST") \ X(DROP_REASON_TCP_PAWS, DROPTAP_BSD, DROPTAP_TCP, 39, "TCP PAWS") \ - X(DROP_REASON__TCP_REASS_MEMORY_PRESSURE, DROPTAP_BSD, DROPTAP_TCP, 40, "TCP reassembly queue memory pressure") \ + X(DROP_REASON_TCP_REASS_MEMORY_PRESSURE, DROPTAP_BSD, DROPTAP_TCP, 40, "TCP reassembly queue memory pressure") \ + X(DROP_REASON_TCP_CREATE_SERVER_SOCKET, DROPTAP_BSD, DROPTAP_TCP, 41, "TCP create server socket failed") \ + X(DROP_REASON_TCP_INSEQ_MEMORY_PRESSURE, DROPTAP_BSD, DROPTAP_TCP, 42, "TCP in-seq input under memory pressure") \ /* IP */ \ X(DROP_REASON_IP_UNKNOWN_MULTICAST_GROUP, DROPTAP_BSD, DROPTAP_IP, 2, "IP unknown multicast group join") \ X(DROP_REASON_IP_INVALID_ADDR, DROPTAP_BSD, DROPTAP_IP, 3, "Invalid IP address") \ @@ -248,7 +279,7 @@ struct droptap_header { X(DROP_REASON_IP_MULTICAST_NO_PORT, DROPTAP_BSD, DROPTAP_IP, 46, "IP Multicast no port") \ X(DROP_REASON_IP_EISCONN, DROPTAP_BSD, DROPTAP_IP, 47, "IP Socket is already connected") \ X(DROP_REASON_IP_EAFNOSUPPORT, DROPTAP_BSD, DROPTAP_IP, 48, "IP Address family not supported by protocol family") \ - X(DROP_REASON_IP_NO_SOCK, DROPTAP_BSD, DROPTAP_IP, 49, "IP No matching sock") \ + X(DROP_REASON_IP_NO_SOCK, DROPTAP_BSD, DROPTAP_IP, 49, "IP No matching sock") \ /* IPsec */ \ X(DROP_REASON_IPSEC_REJECT, DROPTAP_BSD, DROPTAP_IPSEC,1, "IPsec reject") \ /* IPv6 */ \ @@ -268,6 +299,30 @@ struct droptap_header { X(DROP_REASON_IP6_ADDR_UNSPECIFIED, DROPTAP_BSD, DROPTAP_IP6, 14, "IPv6 Address is unspecified") \ X(DROP_REASON_IP6_FRAG_OVERLAPPING, DROPTAP_BSD, DROPTAP_IP6, 15, "IPv6 Fragment overlaping") \ X(DROP_REASON_IP6_FRAG_MIXED_CE, DROPTAP_BSD, DROPTAP_IP6, 16, "IPv6 Fragment mixed CE bits") \ + X(DROP_REASON_IP6_RA_NOT_LL, DROPTAP_BSD, DROPTAP_IP6, 17, "IPv6 RA src is not LL") \ + X(DROP_REASON_IP6_RA_BAD_LLADDR_LEN, DROPTAP_BSD, DROPTAP_IP6, 18, "IPv6 RA bad LL length") \ + X(DROP_REASON_IP6_RS_BAD_LLADDR_LEN, DROPTAP_BSD, DROPTAP_IP6, 19, "IPv6 RS bad LL length") \ + X(DROP_REASON_IP6_MEM_ALLOC, DROPTAP_BSD, DROPTAP_IP6, 20, "IPv6 memory allocation") \ + X(DROP_REASON_IP6_TOO_BIG, DROPTAP_BSD, DROPTAP_IP6, 21, "IPv6 too big for MTU") \ + X(DROP_REASON_IP6_POSSIBLE_LOOP, DROPTAP_BSD, DROPTAP_IP6, 22, "IPv6 possible loop") \ + X(DROP_REASON_IP6_ICMP_DROP, DROPTAP_BSD, DROPTAP_IP6, 23, "IPv6 ICMPv6 drop") \ + X(DROP_REASON_IP6_BAD_NI, DROPTAP_BSD, DROPTAP_IP6, 24, "IPv6 bad NI") \ + X(DROP_REASON_IP6_NS_FROM_NON_NEIGHBOR, DROPTAP_BSD, DROPTAP_IP6, 25, "IPv6 NS from non-neighbor") \ + X(DROP_REASON_IP6_NS_TO_MULTICAST, DROPTAP_BSD, 
DROPTAP_IP6, 26, "IPv6 NS targeting multicast") \ + X(DROP_REASON_IP6_NS_BAD_ND_OPT, DROPTAP_BSD, DROPTAP_IP6, 27, "IPv6 NS with invalid ND opt") \ + X(DROP_REASON_IP6_NS_BAD_LLADDR_LEN, DROPTAP_BSD, DROPTAP_IP6, 28, "IPv6 NS bad LL length") \ + X(DROP_REASON_IP6_NS_DUPLICATE_ADDRESS, DROPTAP_BSD, DROPTAP_IP6, 29, "IPv6 NS duplicate address") \ + X(DROP_REASON_IP6_NS_INVALID_TARGET, DROPTAP_BSD, DROPTAP_IP6, 30, "IPv6 NS invalid target") \ + X(DROP_REASON_IP6_NA_INVALID_TARGET, DROPTAP_BSD, DROPTAP_IP6, 31, "IPv6 NA invalid target") \ + X(DROP_REASON_IP6_NA_DST_MULTICAST, DROPTAP_BSD, DROPTAP_IP6, 32, "IPv6 NA destination is multicast") \ + X(DROP_REASON_IP6_NA_UNKNOWN_SRC_ADDR, DROPTAP_BSD, DROPTAP_IP6, 33, "IPv6 NA destination is multicast") \ + X(DROP_REASON_IP6_NA_BAD_LLADDR_LEN, DROPTAP_BSD, DROPTAP_IP6, 34, "IPv6 NA bad LL length") \ + X(DROP_REASON_IP6_NA_NOT_CACHED_SCOPED, DROPTAP_BSD, DROPTAP_IP6, 35, "IPv6 NA not cached scoped ") \ + X(DROP_REASON_IP6_NA_NOT_CACHED, DROPTAP_BSD, DROPTAP_IP6, 36, "IPv6 NA not cached") \ + X(DROP_REASON_IP6_NA_MISSING_LLADDR_OPT, DROPTAP_BSD, DROPTAP_IP6, 37, "IPv6 NA missing lladdr opt") \ + X(DROP_REASON_IP6_NA_MISSING_ROUTE, DROPTAP_BSD, DROPTAP_IP6, 38, "IPv6 NA missing route info") \ + X(DROP_REASON_IP6_BAD_UDP_CHECKSUM, DROPTAP_BSD, DROPTAP_IP6, 39, "IPv6 invalid UDP checksum") \ + X(DROP_REASON_IP6_ILLEGAL_PORT, DROPTAP_BSD, DROPTAP_IP6, 40, "IPv6 Illegal port") \ /* UDP */ \ X(DROP_REASON_UDP_SET_PORT_FAILURE, DROPTAP_BSD, DROPTAP_UDP, 1, "UDP failed to set ephemeral port ") \ X(DROP_REASON_UDP_DST_PORT_ZERO, DROPTAP_BSD, DROPTAP_UDP, 2, "UDP destination port zero") \ @@ -278,6 +333,9 @@ struct droptap_header { X(DROP_REASON_UDP_NECP, DROPTAP_BSD, DROPTAP_UDP, 7, "UDP denied by NECP") \ X(DROP_REASON_UDP_CANNOT_SAVE_CONTROL, DROPTAP_BSD, DROPTAP_UDP, 8, "UDP cannot save control mbufs") \ X(DROP_REASON_UDP_IPSEC, DROPTAP_BSD, DROPTAP_UDP, 9, "UDP IPsec") \ + X(DROP_REASON_UDP_PACKET_SHORTER_THAN_HEADER, DROPTAP_BSD, DROPTAP_UDP, 10, "UDP packet shorter than header") \ + X(DROP_REASON_UDP_NAT_KEEPALIVE, DROPTAP_BSD, DROPTAP_UDP, 11, "UDP NAT keepalive") \ + X(DROP_REASON_UDP_PCB_GARBAGE_COLLECTED, DROPTAP_BSD, DROPTAP_UDP, 12, "UDP PCB garbage collected") \ typedef enum drop_reason : uint32_t { #define X(reason, component, domain, code, ...) 
\ diff --git a/bsd/net/ether_if_module.c b/bsd/net/ether_if_module.c index 6de9e982b..ccc14eee1 100644 --- a/bsd/net/ether_if_module.c +++ b/bsd/net/ether_if_module.c @@ -449,7 +449,7 @@ ether_demux(ifnet_t ifp, mbuf_t m, char *frame_header, protocol_family_t *protocol_family) { struct ether_header * __single eh = (struct ether_header *)(void *)frame_header; - u_short ether_type = eh->ether_type; + u_short ether_type; u_int16_t type; u_int8_t *data; u_int32_t i = 0; @@ -459,6 +459,11 @@ ether_demux(ifnet_t ifp, mbuf_t m, char *frame_header, u_int32_t extProto1 = 0; u_int32_t extProto2 = 0; + if (__improbable(eh == NULL)) { + return EINVAL; + } + ether_type = eh->ether_type; + if ((eh->ether_dhost[0] & 1) != 0) { /* Check for broadcast */ if (_ether_cmp(etherbroadcastaddr, eh->ether_dhost) == 0) { diff --git a/bsd/net/ethernet.h b/bsd/net/ethernet.h index d93b8f33a..05e600dd1 100644 --- a/bsd/net/ethernet.h +++ b/bsd/net/ethernet.h @@ -110,6 +110,7 @@ typedef struct ether_addr { #define ETHERTYPE_VLAN 0x8100 /* IEEE 802.1Q VLAN tagging */ #define ETHERTYPE_IPV6 0x86dd /* IPv6 */ #define ETHERTYPE_PAE 0x888e /* EAPOL PAE/802.1x */ +#define ETHERTYPE_WAI 0x88b4 /* WAI Authentication Protocol */ #define ETHERTYPE_RSN_PREAUTH 0x88c7 /* 802.11i / RSN Pre-Authentication */ #define ETHERTYPE_PTP 0x88f7 /* IEEE 1588 Precision Time Protocol */ #define ETHERTYPE_LOOPBACK 0x9000 /* used to test interfaces */ diff --git a/bsd/net/flowadv.c b/bsd/net/flowadv.c index 71756bc6f..fd650b226 100644 --- a/bsd/net/flowadv.c +++ b/bsd/net/flowadv.c @@ -198,7 +198,8 @@ flowadv_thread_cont(int err) if (fce->fce_event_type == FCE_EVENT_TYPE_CONGESTION_EXPERIENCED) { switch (fce->fce_flowsrc_type) { case FLOWSRC_CHANNEL: - kern_channel_flowadv_report_ce_event(fce, fce->fce_ce_cnt, + kern_channel_flowadv_report_congestion_event(fce, + fce->fce_congestion_cnt, fce->l4s_ce_cnt, fce->fce_pkts_since_last_report); break; case FLOWSRC_INPCB: diff --git a/bsd/net/flowadv.h b/bsd/net/flowadv.h index c175ef606..37965f410 100644 --- a/bsd/net/flowadv.h +++ b/bsd/net/flowadv.h @@ -40,6 +40,7 @@ #define FADV_SUCCESS 0 /* success */ #define FADV_FLOW_CONTROLLED 1 /* regular flow control */ #define FADV_SUSPENDED 2 /* flow control due to suspension */ +#define FADV_CONGESTED 3 /* AQM gives congestion notification signal */ struct flowadv { int32_t code; /* FADV advisory code */ @@ -55,7 +56,8 @@ struct flowadv_fcentry { STAILQ_ENTRY(flowadv_fcentry) fce_link; u_int32_t fce_flowsrc_type; /* FLOWSRC values */ u_int32_t fce_flowid; - u_int32_t fce_ce_cnt; + u_int32_t fce_congestion_cnt; + u_int32_t l4s_ce_cnt; u_int32_t fce_pkts_since_last_report; fce_event_type_t fce_event_type; #if SKYWALK diff --git a/bsd/net/if.c b/bsd/net/if.c index e053c730e..3e28c564b 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -68,6 +68,7 @@ */ #include +#include #include #include @@ -173,6 +174,8 @@ static void if_rtmtu_update(struct ifnet *); static int if_clone_list(int, int *, user_addr_t); static int if_set_congested_link(struct ifnet *, boolean_t); +static int if_set_inband_wake_packet_tagging(struct ifnet *, boolean_t); +static int if_set_low_power_wake(struct ifnet *, boolean_t); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); @@ -313,6 +316,10 @@ SYSCTL_INT(_net_link_generic_system_management, OID_AUTO, verbose, */ TUNABLE_DEV_WRITEABLE(bool, management_data_unrestricted, "management_data_unrestricted", false); +int if_ultra_constrained_default_allowed = 0; // 0=Off, 1=On +SYSCTL_INT(_net_link_generic_system, OID_AUTO, 
ultra_constrained_default_allowed, + CTLFLAG_RW | CTLFLAG_LOCKED, &if_ultra_constrained_default_allowed, 0, ""); + #if DEBUG || DEVELOPMENT #define MANAGEMENT_CTLFLAG_ACCESS CTLFLAG_RW #else @@ -1612,7 +1619,7 @@ if_updown(struct ifnet *ifp, int up) ifp->if_flags &= ~IFF_UP; } - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { /* * The interface is not attached or is detaching, so * skip modifying any other state. @@ -1627,9 +1634,8 @@ if_updown(struct ifnet *ifp, int up) ifq = ifp->if_snd; ASSERT(ifq != NULL); IFCQ_LOCK(ifq); - if_qflush_snd(ifp, true); - ifnet_update_sndq(ifq, - up ? CLASSQ_EV_LINK_UP : CLASSQ_EV_LINK_DOWN); + ifclassq_request(ifq, CLASSQRQ_PURGE, NULL, true); + ifclassq_update(ifq, up ? CLASSQ_EV_LINK_UP : CLASSQ_EV_LINK_DOWN, true); IFCQ_UNLOCK(ifq); /* Inform protocols of changed interface state */ @@ -1681,7 +1687,7 @@ if_up( * Flush an interface queue. */ void -if_qflush(struct ifnet *ifp, struct ifclassq *ifq, bool ifq_locked) +if_qflush(struct ifnet *ifp, struct ifclassq *ifq) { lck_mtx_lock(&ifp->if_ref_lock); if ((ifp->if_refflags & IFRF_ATTACH_MASK) == 0) { @@ -1692,31 +1698,14 @@ if_qflush(struct ifnet *ifp, struct ifclassq *ifq, bool ifq_locked) ifclassq_retain(ifq); lck_mtx_unlock(&ifp->if_ref_lock); - if (!ifq_locked) { - IFCQ_LOCK(ifq); - } + ifclassq_request(ifq, CLASSQRQ_PURGE, NULL, false); - if (IFCQ_IS_ENABLED(ifq)) { - fq_if_request_classq(ifq, CLASSQRQ_PURGE, NULL); - } - - VERIFY(IFCQ_IS_EMPTY(ifq)); - - if (!ifq_locked) { - IFCQ_UNLOCK(ifq); - } ifclassq_release(&ifq); } -void -if_qflush_snd(struct ifnet *ifp, bool ifq_locked) -{ - if_qflush(ifp, ifp->if_snd, ifq_locked); -} - void if_qflush_sc(struct ifnet *ifp, mbuf_svc_class_t sc, u_int32_t flow, - u_int32_t *packets, u_int32_t *bytes, int ifq_locked) + u_int32_t *packets, u_int32_t *bytes) { struct ifclassq *ifq; u_int32_t cnt = 0, len = 0; @@ -1729,21 +1718,10 @@ if_qflush_sc(struct ifnet *ifp, mbuf_svc_class_t sc, u_int32_t flow, VERIFY(sc == MBUF_SC_UNSPEC || MBUF_VALID_SC(sc)); VERIFY(flow != 0); - if (!ifq_locked) { - IFCQ_LOCK(ifq); - } - - if (IFCQ_IS_ENABLED(ifq)) { - cqrq_purge_sc_t req = { sc, flow, 0, 0 }; - - fq_if_request_classq(ifq, CLASSQRQ_PURGE_SC, &req); - cnt = req.packets; - len = req.bytes; - } - - if (!ifq_locked) { - IFCQ_UNLOCK(ifq); - } + cqrq_purge_sc_t req = { sc, flow, 0, 0 }; + ifclassq_request(ifq, CLASSQRQ_PURGE_SC, &req, false); + cnt = req.packets; + len = req.bytes; if (packets != NULL) { *packets = cnt; @@ -1841,7 +1819,7 @@ ifunit_common(const char *name, boolean_t hold) } /* if called from ifunit_ref() and ifnet is not attached, bail */ - if (hold && ifp != NULL && !ifnet_is_attached(ifp, 1)) { + if (hold && ifp != NULL && !ifnet_get_ioref(ifp)) { ifp = NULL; } @@ -2060,18 +2038,11 @@ ifioctl_linkparams(struct ifnet *ifp, u_long cmd, caddr_t __sized_by(IOCPARM_LEN break; } - IFCQ_LOCK(ifq); - if (!IFCQ_IS_READY(ifq)) { - error = ENXIO; - IFCQ_UNLOCK(ifq); - break; - } bcopy(&iflpr->iflpr_output_tbr_rate, &tb.rate, sizeof(tb.rate)); bcopy(&iflpr->iflpr_output_tbr_percent, &tb.percent, sizeof(tb.percent)); error = ifclassq_tbr_set(ifq, &tb, TRUE); - IFCQ_UNLOCK(ifq); break; } @@ -2079,27 +2050,16 @@ ifioctl_linkparams(struct ifnet *ifp, u_long cmd, caddr_t __sized_by(IOCPARM_LEN u_int32_t sched_type = PKTSCHEDT_NONE, flags = 0; u_int64_t tbr_bw = 0, tbr_pct = 0; - IFCQ_LOCK(ifq); - - if (IFCQ_IS_ENABLED(ifq)) { - sched_type = ifq->ifcq_type; - } - + ifclassq_tbr_get(ifq, &sched_type, &tbr_bw, &tbr_pct); bcopy(&sched_type, 
&iflpr->iflpr_output_sched, sizeof(iflpr->iflpr_output_sched)); - - if (IFCQ_TBR_IS_ENABLED(ifq)) { - tbr_bw = ifq->ifcq_tbr.tbr_rate_raw; - tbr_pct = ifq->ifcq_tbr.tbr_percent; - } bcopy(&tbr_bw, &iflpr->iflpr_output_tbr_rate, sizeof(iflpr->iflpr_output_tbr_rate)); bcopy(&tbr_pct, &iflpr->iflpr_output_tbr_percent, sizeof(iflpr->iflpr_output_tbr_percent)); - IFCQ_UNLOCK(ifq); - if (ifp->if_output_sched_model == - IFNET_SCHED_MODEL_DRIVER_MANAGED) { + if (ifp->if_output_sched_model & IFNET_SCHED_DRIVER_MANGED_MODELS) { + VERIFY(IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)); flags |= IFLPRF_DRVMANAGED; } bcopy(&flags, &iflpr->iflpr_flags, sizeof(iflpr->iflpr_flags)); @@ -2397,7 +2357,7 @@ ifioctl_netagent(struct ifnet *ifp, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(c VERIFY(ifp != NULL); /* Get an io ref count if the interface is attached */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return EOPNOTSUPP; } @@ -2626,6 +2586,10 @@ ifioctl_iforder(u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) data) case SIOCSIFORDER: { /* struct if_order */ struct if_order *ifo = (struct if_order *)(void *)data; + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + break; + } if (ifo->ifo_count > (u_int32_t)if_index) { error = EINVAL; break; @@ -2732,7 +2696,10 @@ ifioctl_networkid(struct ifnet *ifp, caddr_t __indexable data) int len = ifnetidr->ifnetid_len; VERIFY(ifp != NULL); - + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + goto end; + } if (len > sizeof(ifnetidr->ifnetid)) { error = EINVAL; goto end; @@ -2762,6 +2729,10 @@ ifioctl_netsignature(struct ifnet *ifp, u_long cmd, caddr_t __sized_by(IOCPARM_L switch (cmd) { case SIOCSIFNETSIGNATURE: /* struct if_nsreq */ + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + break; + } if (ifnsr->ifnsr_len > sizeof(ifnsr->ifnsr_data)) { error = EINVAL; break; @@ -2800,6 +2771,10 @@ ifioctl_nat64prefix(struct ifnet *ifp, u_long cmd, caddr_t __sized_by(IOCPARM_LE switch (cmd) { case SIOCSIFNAT64PREFIX: /* struct if_nat64req */ + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + break; + } error = ifnet_set_nat64prefix(ifp, ifnat64->ifnat64_prefixes); if (error != 0) { ip6stat.ip6s_clat464_plat64_pfx_setfail++; @@ -3013,7 +2988,6 @@ ifioctl_restrict_intcoproc(unsigned long cmd, const char *__null_terminated ifna case SIOCGIFINTERFACESTATE: case SIOCGIFPROBECONNECTIVITY: case SIOCGIFTIMESTAMPENABLED: - case SIOCGECNMODE: case SIOCGQOSMARKINGMODE: case SIOCGQOSMARKINGENABLED: case SIOCGIFLOWINTERNET: @@ -3045,6 +3019,9 @@ ifioctl_restrict_intcoproc(unsigned long cmd, const char *__null_terminated ifna case SIOCSIFDIRECTLINK: case SIOCGIFDIRECTLINK: case SIOCGIFCONGESTEDLINK: + case SIOCGIFL4S: + case SIOCGINBANDWAKEPKT: + case SIOCGLOWPOWERWAKE: return false; default: #if (DEBUG || DEVELOPMENT) @@ -3146,7 +3123,6 @@ ifioctl_restrict_management(unsigned long cmd, const char *__null_terminated ifn case SIOCGIFPROBECONNECTIVITY: case SIOCGIFFUNCTIONALTYPE: case SIOCGIFNETSIGNATURE: - case SIOCGECNMODE: case SIOCGIFORDER: case SIOCGQOSMARKINGMODE: case SIOCGQOSMARKINGENABLED: @@ -3179,6 +3155,9 @@ ifioctl_restrict_management(unsigned long cmd, const char *__null_terminated ifn case SIOCGIFDELAYWAKEPKTEVENT: case SIOCGIFDISABLEINPUT: case SIOCGIFCONGESTEDLINK: + case SIOCSIFISCOMPANIONLINK: + case SIOCGIFL4S: + case SIOCGINBANDWAKEPKT: return false; default: if 
(!IOCurrentTaskHasEntitlement(MANAGEMENT_CONTROL_ENTITLEMENT)) { @@ -3449,7 +3428,6 @@ ifioctl(struct socket *so, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) data case SIOCSIFDISABLEOUTPUT: /* struct ifreq */ #endif /* (DEBUG || DEVELOPMENT) */ case SIOCSIFSUBFAMILY: /* struct ifreq */ - case SIOCGECNMODE: /* struct ifreq */ case SIOCSECNMODE: case SIOCSQOSMARKINGMODE: /* struct ifreq */ case SIOCSQOSMARKINGENABLED: /* struct ifreq */ @@ -3483,6 +3461,13 @@ ifioctl(struct socket *so, u_long cmd, caddr_t __sized_by(IOCPARM_LEN(cmd)) data case SIOCGIFDISABLEINPUT: /* struct ifreq */ case SIOCSIFCONGESTEDLINK: /* struct ifreq */ case SIOCGIFCONGESTEDLINK: /* struct ifreq */ + case SIOCSIFISCOMPANIONLINK: /* struct ifreq */ + case SIOCGIFL4S: /* struct ifreq */ + case SIOCSIFL4S: /* struct ifreq */ + case SIOCGINBANDWAKEPKT: /* struct ifreq */ + case SIOCSINBANDWAKEPKT: /* struct ifreq */ + case SIOCGLOWPOWERWAKE: /* struct ifreq */ + case SIOCSLOWPOWERWAKE: /* struct ifreq */ { struct ifreq ifr; bcopy(data, &ifr, sizeof(ifr)); @@ -4083,9 +4068,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) if_rtmtu_update(ifp); nd6_setmtu(ifp); /* Inform all transmit queues about the new MTU */ - IFCQ_LOCK(ifq); - ifnet_update_sndq(ifq, CLASSQ_EV_LINK_MTU); - IFCQ_UNLOCK(ifq); + ifclassq_update(ifq, CLASSQ_EV_LINK_MTU, false); } break; } @@ -4254,7 +4237,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) IF_INTERFACE_STATE_LQM_STATE_VALID)) { ifr->ifr_link_quality_metric = ifp->if_interface_state.lqm_state; - } else if (IF_FULLY_ATTACHED(ifp)) { + } else if (ifnet_is_fully_attached(ifp)) { ifr->ifr_link_quality_metric = IFNET_LQM_THRESH_UNKNOWN; } else { @@ -4625,33 +4608,8 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) ifr->ifr_probe_connectivity = 0; } break; - case SIOCGECNMODE: - if ((ifp->if_eflags & (IFEF_ECN_ENABLE | IFEF_ECN_DISABLE)) == - IFEF_ECN_ENABLE) { - ifr->ifr_ecn_mode = IFRTYPE_ECN_ENABLE; - } else if ((ifp->if_eflags & (IFEF_ECN_ENABLE | IFEF_ECN_DISABLE)) == - IFEF_ECN_DISABLE) { - ifr->ifr_ecn_mode = IFRTYPE_ECN_DISABLE; - } else { - ifr->ifr_ecn_mode = IFRTYPE_ECN_DEFAULT; - } - break; case SIOCSECNMODE: - if ((error = priv_check_cred(kauth_cred_get(), - PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { - return error; - } - if (ifr->ifr_ecn_mode == IFRTYPE_ECN_DEFAULT) { - if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE); - } else if (ifr->ifr_ecn_mode == IFRTYPE_ECN_ENABLE) { - if_set_eflags(ifp, IFEF_ECN_ENABLE); - if_clear_eflags(ifp, IFEF_ECN_DISABLE); - } else if (ifr->ifr_ecn_mode == IFRTYPE_ECN_DISABLE) { - if_set_eflags(ifp, IFEF_ECN_DISABLE); - if_clear_eflags(ifp, IFEF_ECN_ENABLE); - } else { - error = EINVAL; - } + error = EINVAL; break; case SIOCSIFTIMESTAMPENABLE: @@ -4911,7 +4869,7 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) return error; } if (net_wake_pkt_debug) { - os_log(OS_LOG_DEFAULT, + os_log(wake_packet_log_handle, "SIOCSIFMARKWAKEPKT %s", ifp->if_xname); } if (ifr->ifr_intval != 0) { @@ -4989,6 +4947,8 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) } else { ifp->if_xflags |= IFXF_DELAYWAKEPKTEVENT; } + os_log(OS_LOG_DEFAULT, "interface %s DELAYWAKEPKTEVENT set to %d", + ifp->if_xname, (ifr->ifr_delay_wake_pkt_event == 0) ? 
0 : 1); break; case SIOCGIFDELAYWAKEPKTEVENT: ifr->ifr_delay_wake_pkt_event = @@ -5022,6 +4982,18 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) (ifp->if_xflags & IFXF_DISABLE_INPUT) != 0 ? 1 : 0; break; + case SIOCSIFISCOMPANIONLINK: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + return error; + } + if (ifr->ifr_is_companionlink) { + if_set_xflags(ifp, IFXF_IS_COMPANIONLINK); + } else { + if_clear_xflags(ifp, IFXF_IS_COMPANIONLINK); + } + break; + case SIOCSIFCONGESTEDLINK: if ((error = priv_check_cred(kauth_cred_get(), PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { @@ -5048,6 +5020,70 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) (ifp->if_xflags & IFXF_CONGESTED_LINK) != 0 ? 1 : 0; break; + case SIOCGIFL4S: + ifr->ifr_l4s_mode = ifp->if_l4s_mode; + break; + + case SIOCSIFL4S: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { + return error; + } + uint8_t mode = (uint8_t)ifr->ifr_l4s_mode; + + switch (mode) { + case IFRTYPE_L4S_DEFAULT: + case IFRTYPE_L4S_ENABLE: + case IFRTYPE_L4S_DISABLE: + ifp->if_l4s_mode = mode; + break; + default: + error = EINVAL; + break; + } + break; + + case SIOCSINBANDWAKEPKT: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { +#if (DEBUG || DEVELOPMENT) + error = proc_suser(p); + if (error != 0) { + return error; + } +#else /* (DEBUG || DEVELOPMENT) */ + return error; +#endif /* (DEBUG || DEVELOPMENT) */ + } + if_set_inband_wake_packet_tagging(ifp, (ifr->ifr_intval != 0)); + break; + + case SIOCGINBANDWAKEPKT: + ifr->ifr_intval = + (ifp->if_xflags & IFXF_INBAND_WAKE_PKT_TAGGING) != 0 ? 1 : 0; + break; + + case SIOCSLOWPOWERWAKE: + if ((error = priv_check_cred(kauth_cred_get(), + PRIV_NET_INTERFACE_CONTROL, 0)) != 0) { +#if (DEBUG || DEVELOPMENT) + error = proc_suser(p); + if (error != 0) { + return error; + } +#else /* (DEBUG || DEVELOPMENT) */ + return error; +#endif /* (DEBUG || DEVELOPMENT) */ + } + if_set_low_power_wake(ifp, (ifr->ifr_intval != 0)); + break; + + case SIOCGLOWPOWERWAKE: + ifr->ifr_intval = + (ifp->if_xflags & IFXF_LOW_POWER_WAKE) != 0 ? 1 : 0; + break; + + default: VERIFY(0); /* NOTREACHED */ @@ -5179,7 +5215,7 @@ ifconf(u_long cmd, user_addr_t ifrp, int *ret_space) * Make sure to accomodate the largest possible * size of SA(if_lladdr)->sa_len. 
*/ - _CASSERT(sizeof(u) == (SOCK_MAXADDRLEN + 1)); + static_assert(sizeof(u) == (SOCK_MAXADDRLEN + 1)); bzero(u.buf, sizeof(u.buf)); @@ -6361,7 +6397,7 @@ void if_copy_rxpoll_stats(struct ifnet *ifp, struct if_rxpoll_stats *if_rs) { bzero(if_rs, sizeof(*if_rs)); - if (!(ifp->if_eflags & IFEF_RXPOLL) || !ifnet_is_attached(ifp, 1)) { + if (!(ifp->if_eflags & IFEF_RXPOLL) || !ifnet_get_ioref(ifp)) { return; } bcopy(&ifp->if_poll_pstats, if_rs, sizeof(*if_rs)); @@ -6375,7 +6411,7 @@ if_copy_netif_stats(struct ifnet *ifp, struct if_netif_stats *if_ns) bzero(if_ns, sizeof(*if_ns)); #if SKYWALK if (!(ifp->if_capabilities & IFCAP_SKYWALK) || - !ifnet_is_attached(ifp, 1)) { + !ifnet_get_ioref(ifp)) { return; } @@ -6496,10 +6532,12 @@ ifa_deallocated(struct ifaddr *ifa) } } +os_refgrp_decl(static, ifa_refgrp, "ifa refcounts", NULL); + void ifa_initref(struct ifaddr *ifa) { - os_ref_init_raw(&ifa->ifa_refcnt, &ifa_refgrp); + os_ref_init(&ifa->ifa_refcnt, &ifa_refgrp); } void @@ -6530,7 +6568,7 @@ static __attribute__((unused)) void ifioctl_cassert(void) { /* - * This is equivalent to _CASSERT() and the compiler wouldn't + * This is equivalent to static_assert() and the compiler wouldn't * generate any instructions, thus for compile time only. */ switch ((u_long)0) { @@ -6740,7 +6778,6 @@ ifioctl_cassert(void) case SIOCGIFNETSIGNATURE: case SIOCSIFNETWORKID: - case SIOCGECNMODE: case SIOCSECNMODE: case SIOCSIFORDER: @@ -6811,6 +6848,17 @@ ifioctl_cassert(void) case SIOCSIFCONGESTEDLINK: case SIOCGIFCONGESTEDLINK: + + case SIOCSIFISCOMPANIONLINK: + + case SIOCGIFL4S: + case SIOCSIFL4S: + + case SIOCGINBANDWAKEPKT: + case SIOCSINBANDWAKEPKT: + + case SIOCGLOWPOWERWAKE: + case SIOCSLOWPOWERWAKE: ; } } @@ -7143,3 +7191,95 @@ ifnet_get_congested_link(ifnet_t ifp, boolean_t *on) *on = ((ifp->if_xflags & IFXF_CONGESTED_LINK) != 0); return 0; } + +static int +if_set_inband_wake_packet_tagging(ifnet_t ifp, boolean_t on) +{ + ifnet_lock_exclusive(ifp); + + if (on) { + if_set_xflags(ifp, IFXF_INBAND_WAKE_PKT_TAGGING); + } else { + if_clear_xflags(ifp, IFXF_INBAND_WAKE_PKT_TAGGING); + } + + ifnet_lock_done(ifp); + + os_log(OS_LOG_DEFAULT, "interface %s INBAND_WAKE_PKT_TAGGING set to %d", + ifp->if_xname, on); + return 0; +} + +errno_t +ifnet_set_inband_wake_packet_tagging(struct ifnet *ifp, boolean_t on) +{ + if (ifp == NULL) { + return EINVAL; + } + return if_set_inband_wake_packet_tagging(ifp, on); +} + +errno_t +ifnet_get_inband_wake_packet_tagging(ifnet_t ifp, boolean_t *on) +{ + if (ifp == NULL || on == NULL) { + return EINVAL; + } + + *on = ((ifp->if_xflags & IFXF_INBAND_WAKE_PKT_TAGGING) != 0); + return 0; +} + +static int +if_set_low_power_wake(ifnet_t ifp, boolean_t on) +{ + ifnet_lock_exclusive(ifp); + + if (on) { + if_set_xflags(ifp, IFXF_LOW_POWER_WAKE); + } else { + if_clear_xflags(ifp, IFXF_LOW_POWER_WAKE); + } + + ifnet_lock_done(ifp); + + os_log(OS_LOG_DEFAULT, "interface %s LPW mode set to %d", + ifp->if_xname, on); + return 0; +} + +errno_t +ifnet_set_low_power_wake(struct ifnet *ifp, boolean_t on) +{ + if (ifp == NULL) { + return EINVAL; + } + return if_set_low_power_wake(ifp, on); +} + +errno_t +ifnet_get_low_power_wake(ifnet_t ifp, boolean_t *on) +{ + if (ifp == NULL || on == NULL) { + return EINVAL; + } + + *on = ((ifp->if_xflags & IFXF_LOW_POWER_WAKE) != 0); + return 0; +} + + +/* Return the hwassist flags that are actually supported by the hardware */ +uint32_t +if_get_driver_hwassist(struct ifnet *ifp) +{ + if (NA(ifp) == NULL) { + /* When if_na is not attached yet, just use if_hwassist, 
which at this + * time stil only contains the assist flags that are actually supported + * by the hardware. + */ + return ifp->if_hwassist; + } else { + return NA(ifp)->nifna_netif->nif_hwassist; + } +} diff --git a/bsd/net/if.h b/bsd/net/if.h index 8feec06f0..c93a8fa6d 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -162,16 +162,6 @@ struct if_clonereq { #define IFQ_MAXLEN 128 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ -#define IFQ_DEF_C_TARGET_DELAY (10ULL * 1000 * 1000) /* 10 ms */ -#define IFQ_DEF_C_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */ -#define IFQ_DEF_L4S_TARGET_DELAY (2ULL * 1000 * 1000) /* 2 ms */ -#define IFQ_DEF_L4S_WIRELESS_TARGET_DELAY (15ULL * 1000 * 1000) /* 15 ms */ -#define IFQ_DEF_L4S_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */ -#define IFQ_LL_C_TARGET_DELAY (10ULL * 1000 * 1000) /* 10 ms */ -#define IFQ_LL_C_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */ -#define IFQ_LL_L4S_TARGET_DELAY (2ULL * 1000 * 1000) /* 2 ms */ -#define IFQ_LL_L4S_WIRELESS_TARGET_DELAY (15ULL * 1000 * 1000) /* 15 ms */ -#define IFQ_LL_L4S_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */ /* * Message format for use in obtaining information about interfaces * from sysctl and the routing socket @@ -357,6 +347,7 @@ struct ifreq { #define IFRTYPE_FUNCTIONAL_LAST 8 u_int8_t ifru_is_directlink; u_int8_t ifru_is_vpn; + u_int8_t ifru_is_companionlink; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ diff --git a/bsd/net/if_bond.c b/bsd/net/if_bond.c index f80c10010..28d7dbd4a 100644 --- a/bsd/net/if_bond.c +++ b/bsd/net/if_bond.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c index 8c5941ed1..cf3150440 100644 --- a/bsd/net/if_bridge.c +++ b/bsd/net/if_bridge.c @@ -127,6 +127,7 @@ #include +#include #include #if NBPFILTER > 0 @@ -162,6 +163,7 @@ #include #include +#include #include #include @@ -176,6 +178,8 @@ #include +#define _TSO_CSUM (CSUM_TSO_IPV4 | CSUM_TSO_IPV6) + static struct in_addr inaddr_any = { .s_addr = INADDR_ANY }; @@ -591,7 +595,7 @@ struct bridge_softc { #define SCF_DETACHING 0x01 #define SCF_RESIZING 0x02 #define SCF_MEDIA_ACTIVE 0x04 -#define SCF_ADDRESS_ASSIGNED 0x08 +#define SCF_PROTO_ATTACHED 0x08 typedef enum { CHECKSUM_OPERATION_NONE = 0, @@ -616,6 +620,11 @@ struct bridge_hostfilter_stats bridge_hostfilter_stats; typedef uint8_t ether_type_flag_t; +typedef enum { + pkt_direction_RX, + pkt_direction_TX +} pkt_direction_t; + static LCK_GRP_DECLARE(bridge_lock_grp, "if_bridge"); #if BRIDGE_LOCK_DEBUG static LCK_ATTR_DECLARE(bridge_lock_attr, 0, 0); @@ -654,7 +663,7 @@ static errno_t bridge_iff_output(void *, ifnet_t, protocol_family_t, static errno_t bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t *m); static int bridge_enqueue(ifnet_t, ifnet_t, ifnet_t, - ether_type_flag_t, mbuf_t, ChecksumOperation); + ether_type_flag_t, mbuf_t, ChecksumOperation, pkt_direction_t); static mbuf_t bridge_checksum_offload_list(ifnet_t, struct bridge_iflist *, mbuf_t, bool); static mbuf_t bridge_filter_checksum(ifnet_t, struct bridge_iflist * bif, @@ -665,6 +674,9 @@ static void bridge_aging_timer(struct bridge_softc *sc); static void bridge_broadcast(struct bridge_softc *, struct bridge_iflist *, ether_type_flag_t, mbuf_t); +static void bridge_broadcast_list(struct bridge_softc *, + struct bridge_iflist *, ether_type_flag_t, mbuf_t, pkt_direction_t); + static void bridge_span(struct 
bridge_softc *, ether_type_flag_t, struct mbuf *); static int bridge_rtupdate(struct bridge_softc *, const uint8_t[ETHER_ADDR_LEN], @@ -1461,7 +1473,6 @@ _mbuf_get_tso_mss(mbuf_t m) { int mss = 0; -#define _TSO_CSUM (CSUM_TSO_IPV4 | CSUM_TSO_IPV6) if ((m->m_pkthdr.csum_flags & _TSO_CSUM) != 0) { mss = m->m_pkthdr.tso_segsz; } @@ -1531,6 +1542,91 @@ done: return error; } +static void +bridge_interface_proto_attach_changed(ifnet_t ifp) +{ + uint32_t proto_count; + struct bridge_softc * __single sc = ifp->if_softc; + + proto_count = if_get_protolist(ifp, NULL, 0); + BRIDGE_LOG(LOG_DEBUG, BR_DBGF_LIFECYCLE, + "%s: proto count %d", ifp->if_xname, proto_count); + + if (sc == NULL) { + return; + } + BRIDGE_LOCK(sc); + if ((sc->sc_flags & SCF_DETACHING) != 0) { + BRIDGE_UNLOCK(sc); + return; + } + if (proto_count >= 2) { + /* an upper layer protocol is attached */ + sc->sc_flags |= SCF_PROTO_ATTACHED; + BRIDGE_LOG(LOG_DEBUG, BR_DBGF_LIFECYCLE, + "%s: setting SCF_PROTO_ATTACHED", ifp->if_xname); + } else { + /* an upper layer protocol was detached */ + sc->sc_flags &= ~SCF_PROTO_ATTACHED; + BRIDGE_LOG(LOG_DEBUG, BR_DBGF_LIFECYCLE, + "%s: clearing SCF_PROTO_ATTACHED", ifp->if_xname); + } + BRIDGE_UNLOCK(sc); +} + +static void +bridge_interface_event(struct ifnet * ifp, + __unused protocol_family_t protocol, const struct kev_msg * event) +{ + int event_code; + + if (event->vendor_code != KEV_VENDOR_APPLE + || event->kev_class != KEV_NETWORK_CLASS + || event->kev_subclass != KEV_DL_SUBCLASS) { + return; + } + event_code = event->event_code; + switch (event_code) { + case KEV_DL_PROTO_DETACHED: + case KEV_DL_PROTO_ATTACHED: + bridge_interface_proto_attach_changed(ifp); + break; + default: + break; + } + return; +} + +/* + * Function: bridge_interface_attach_protocol + * Purpose: + * Attach a protocol to the bridge to get events on the interface, + * in particular, whether protocols are attached/detached. 
+ */ +static int +bridge_interface_attach_protocol(ifnet_t ifp) +{ + int error; + struct ifnet_attach_proto_param_v2 reg; + + bzero(®, sizeof(reg)); + reg.event = bridge_interface_event; + + error = ifnet_attach_protocol_v2(ifp, PF_BRIDGE, ®); + if (error != 0) { + BRIDGE_LOG(LOG_NOTICE, BR_DBGF_LIFECYCLE, + "%s: ifnet_attach_protocol failed, %d", + ifp->if_xname, error); + } + return error; +} + +static void +bridge_interface_detach_protocol(ifnet_t ifp) +{ + (void)ifnet_detach_protocol(ifp, PF_BRIDGE); +} + /* * bridge_clone_create: * @@ -1666,6 +1762,7 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) BRIDGE_LOG(LOG_NOTICE, 0, "ifnet_attach failed %d", error); goto done; } + (void)bridge_interface_attach_protocol(ifp); error = ifnet_set_lladdr_and_type(ifp, sc->sc_defaddr, ETHER_ADDR_LEN, IFT_ETHER); @@ -1697,6 +1794,9 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) done: if (error != 0) { + if (ifp != NULL) { + bridge_interface_detach_protocol(ifp); + } BRIDGE_LOG(LOG_NOTICE, 0, "failed error %d", error); /* TBD: Clean up: sc, sc_rthash etc */ } @@ -1716,6 +1816,8 @@ bridge_clone_destroy(struct ifnet *ifp) struct bridge_iflist *bif; errno_t error; + bridge_interface_detach_protocol(ifp); + BRIDGE_LOCK(sc); if ((sc->sc_flags & SCF_DETACHING)) { BRIDGE_UNLOCK(sc); @@ -1848,16 +1950,9 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, void *__sized_by(IOCPARM_LEN(cmd)) d (char)IOCGROUP(cmd), cmd & 0xff); switch (cmd) { - case SIOCAIFADDR_IN6_32: - case SIOCAIFADDR_IN6_64: case SIOCSIFADDR: case SIOCAIFADDR: ifnet_set_flags(ifp, IFF_UP, IFF_UP); - BRIDGE_LOCK(sc); - sc->sc_flags |= SCF_ADDRESS_ASSIGNED; - BRIDGE_UNLOCK(sc); - BRIDGE_LOG(LOG_NOTICE, 0, - "ifp %s has address", ifp->if_xname); break; case SIOCGIFMEDIA32: @@ -2660,7 +2755,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif) BRIDGE_UNLOCK(sc); /* only perform these steps if the interface is still attached */ - if (ifnet_is_attached(ifs, 1)) { + if (ifnet_get_ioref(ifs)) { #if SKYWALK add_netagent = (bif_flags & BIFF_NETAGENT_REMOVED) != 0; @@ -2711,7 +2806,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif) kfree_type(struct bridge_iflist, bif); ifs->if_bridge = NULL; #if SKYWALK - if (add_netagent && ifnet_is_attached(ifs, 1)) { + if (add_netagent && ifnet_get_ioref(ifs)) { (void)ifnet_add_netagent(ifs); ifnet_decr_iorefcnt(ifs); } @@ -2812,7 +2907,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *__sized_by(arg_len) arg, size_t } /* prevent the interface from detaching while we add the member */ - if (!ifnet_is_attached(ifs, 1)) { + if (!ifnet_get_ioref(ifs)) { return ENXIO; } @@ -4970,7 +5065,8 @@ bridge_verify_checksum_list(ifnet_t bridge_ifp, struct bridge_iflist * dbif, error = bridge_verify_checksum(&scan, &dbif->bif_stats); if (error != 0) { if (scan != NULL) { - m_freem(scan); + m_drop(scan, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_CHECKSUM, NULL, 0); scan = NULL; } } @@ -5202,8 +5298,6 @@ tso_hwassist(struct mbuf **mp, bool is_ipv4, struct ifnet * ifp, u_int mac_hlen, goto done; } } - *is_large_tcp = true; - (*mp)->m_pkthdr.pkt_proto = IPPROTO_TCP; if (mss == 0) { uint32_t hdr_len; struct tcphdr * tcp; @@ -5221,11 +5315,16 @@ tso_hwassist(struct mbuf **mp, bool is_ipv4, struct ifnet * ifp, u_int mac_hlen, BRIDGE_LOG(LOG_DEBUG, BR_DBGF_CHECKSUM, "%s: mss %d = len %d / seg cnt %d", ifp->if_xname, mss, len, seg_cnt); + if (mss <= 0) { + /* unexpected value */ + mss = 0; + goto done; + } } else { mss = ifp->if_mtu - 
hdr_len - if_bridge_tso_reduce_mss_tx; + assert(mss > 0); } - assert(mss > 0); csum_flags = mbuf_tso; if (supports_cksum) { csum_flags |= if_csum; @@ -5234,6 +5333,8 @@ tso_hwassist(struct mbuf **mp, bool is_ipv4, struct ifnet * ifp, u_int mac_hlen, (*mp)->m_pkthdr.csum_flags |= csum_flags; (*mp)->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } + *is_large_tcp = true; + (*mp)->m_pkthdr.pkt_proto = IPPROTO_TCP; if ((ifp->if_hwassist & if_tso) == 0) { /* need gso if no hardware support */ *need_gso = true; @@ -5259,7 +5360,8 @@ done: */ static int bridge_enqueue(ifnet_t bridge_ifp, ifnet_t src_if, ifnet_t dst_if, - ether_type_flag_t etypef, mbuf_t in_list, ChecksumOperation orig_cksum_op) + ether_type_flag_t etypef, mbuf_t in_list, ChecksumOperation orig_cksum_op, + pkt_direction_t direction) { int enqueue_error = 0; mbuf_t next_packet; @@ -5287,7 +5389,8 @@ bridge_enqueue(ifnet_t bridge_ifp, ifnet_t src_if, ifnet_t dst_if, if (mss != 0) { /* packet is marked for segmentation */ check_gso = true; - } else if (scan->m_pkthdr.rx_seg_cnt != 0) { + } else if (direction == pkt_direction_RX && + scan->m_pkthdr.rx_seg_cnt != 0) { /* LRO packet */ check_gso = true; } else if (ether_type_flag_is_ip(etypef) && @@ -5314,7 +5417,9 @@ bridge_enqueue(ifnet_t bridge_ifp, ifnet_t src_if, ifnet_t dst_if, } if (error != 0) { if (scan != NULL) { - m_freem(scan); + m_drop(scan, + direction == pkt_direction_RX ? DROPTAP_FLAG_DIR_IN : DROPTAP_FLAG_DIR_OUT, + DROP_REASON_BRIDGE_HWASSIST, NULL, 0); scan = NULL; } out_errors++; @@ -5396,10 +5501,6 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t *data) } } - eh = mtod(m, struct ether_header *); - vlan = VLANTAGOF(m); - etypef = ether_type_flag_get(eh->ether_type); - BRIDGE_LOCK(sc); mac_nat_bif = sc->sc_mac_nat_bif; mac_nat_ifp = (mac_nat_bif != NULL) ? mac_nat_bif->bif_ifp : NULL; @@ -5414,6 +5515,9 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t *data) } } bridge_ifp = sc->sc_ifp; + eh = mtod(m, struct ether_header *); + vlan = VLANTAGOF(m); + etypef = ether_type_flag_get(eh->ether_type); /* * APPLE MODIFICATION @@ -5456,7 +5560,8 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t *data) BRIDGE_LOCK2REF(sc, error); if (error != 0) { - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_OUT, + DROP_REASON_BRIDGE_NOREF, NULL, 0); return EJUSTRETURN; } @@ -5505,12 +5610,13 @@ bridge_member_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t *data) continue; } (void)bridge_enqueue(bridge_ifp, ifp, dst_if, etypef, - mc, CHECKSUM_OPERATION_COMPUTE); + mc, CHECKSUM_OPERATION_COMPUTE, pkt_direction_TX); } BRIDGE_UNREF(sc); if ((ifp->if_flags & IFF_RUNNING) == 0) { - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_OUT, + DROP_REASON_BRIDGE_NOT_RUNNING, NULL, 0); return EJUSTRETURN; } /* allow packet to continue on the originating interface */ @@ -5524,7 +5630,8 @@ sendunicast: bridge_span(sc, etypef, m); if ((dst_if->if_flags & IFF_RUNNING) == 0) { - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_OUT, + DROP_REASON_BRIDGE_NOT_RUNNING, NULL, 0); BRIDGE_UNLOCK(sc); return EJUSTRETURN; } @@ -5536,7 +5643,7 @@ sendunicast: } if (dst_if != mac_nat_ifp) { (void) bridge_enqueue(bridge_ifp, ifp, dst_if, etypef, m, - CHECKSUM_OPERATION_COMPUTE); + CHECKSUM_OPERATION_COMPUTE, pkt_direction_TX); } else { /* * This is not the original output interface @@ -5544,7 +5651,8 @@ sendunicast: * Drop the packet because the packet can't be sent * if the source MAC is incorrect. 
*/ - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_OUT, + DROP_REASON_BRIDGE_MAC_NAT_FAILURE, NULL, 0); } return EJUSTRETURN; } @@ -5590,7 +5698,7 @@ bridge_output(struct ifnet *ifp, struct mbuf *m) BRIDGE_UNLOCK(sc); error = bridge_enqueue(bridge_ifp, NULL, dst_if, etypef, m, - CHECKSUM_OPERATION_FINALIZE); + CHECKSUM_OPERATION_FINALIZE, pkt_direction_TX); } return error; @@ -5922,11 +6030,11 @@ bridge_interface_input_list(ifnet_t bridge_ifp, ether_type_flag_t etypef, * we think it's a large packet, segment it. */ if (info.ip_proto_hdr != NULL && - (_mbuf_get_tso_mss(scan) != 0 || - scan->m_pkthdr.rx_seg_cnt > 1 || + ((bif_uses_virtio && _mbuf_get_tso_mss(scan) != 0) || (!bif_uses_virtio && + (scan->m_pkthdr.rx_seg_cnt > 1 || (mbuf_pkthdr_len(scan) > - (bridge_ifp->if_mtu + ETHER_HDR_LEN))))) { + (bridge_ifp->if_mtu + ETHER_HDR_LEN)))))) { mblist seg; seg = gso_tcp_with_info(bridge_ifp, scan, &info, @@ -6146,7 +6254,8 @@ bridge_broadcast(struct bridge_softc *sc, struct bridge_iflist * sbif, } if (mc != NULL) { (void) bridge_enqueue(bridge_ifp, - NULL, dst_if, etypef, mc, cksum_op); + NULL, dst_if, etypef, mc, cksum_op, + pkt_direction_TX); } } @@ -6203,10 +6312,10 @@ copy_packet_list(mbuf_t m) */ static void bridge_broadcast_list(struct bridge_softc *sc, struct bridge_iflist * sbif, - ether_type_flag_t etypef, mbuf_t m) + ether_type_flag_t etypef, mbuf_t m, pkt_direction_t direction) { - bool bridge_has_address; ifnet_t bridge_ifp; + bool bridge_needs_input; struct bridge_iflist * dbif; bool is_bcast_mcast; errno_t error = 0; @@ -6253,7 +6362,7 @@ bridge_broadcast_list(struct bridge_softc *sc, struct bridge_iflist * sbif, } else { /* * sbif is NULL when the bridge interface calls - * bridge_broadcast(). + * bridge_broadcast_list() (TBD). */ cksum_op = CHECKSUM_OPERATION_FINALIZE; src_if = NULL; @@ -6268,7 +6377,7 @@ bridge_broadcast_list(struct bridge_softc *sc, struct bridge_iflist * sbif, mac_nat_if, m); } sc_filter_flags = sc->sc_filter_flags; - bridge_has_address = (sc->sc_flags & SCF_ADDRESS_ASSIGNED) != 0; + bridge_needs_input = (sc->sc_flags & SCF_PROTO_ATTACHED) != 0; BRIDGE_LOCK2REF(sc, error); if (error) { goto done; @@ -6276,7 +6385,7 @@ bridge_broadcast_list(struct bridge_softc *sc, struct bridge_iflist * sbif, is_bcast_mcast = IS_BCAST_MCAST(m); /* make a copy for the bridge interface */ - if (is_bcast_mcast && bridge_has_address) { + if (sbif != NULL && is_bcast_mcast && bridge_needs_input) { mbuf_t in_list; in_list = copy_packet_list(m); @@ -6369,7 +6478,7 @@ bridge_broadcast_list(struct bridge_softc *sc, struct bridge_iflist * sbif, } if (out_m != NULL) { bridge_enqueue(bridge_ifp, src_if, dst_if, - etypef, out_m, cksum_op); + etypef, out_m, cksum_op, direction); } } @@ -6413,22 +6522,26 @@ bridge_forward_list(struct bridge_softc *sc, struct bridge_iflist * sbif, { bool checksum_ok = false; ChecksumOperation cksum_op; - ifnet_t bridge_ifp; + ifnet_t bridge_ifp = NULL; struct bridge_iflist * dbif; uint32_t sc_filter_flags; ifnet_t src_if; + drop_reason_t drop_reason = DROP_REASON_BRIDGE_UNSPECIFIED; if ((dst_if->if_flags & IFF_RUNNING) == 0) { + drop_reason = DROP_REASON_BRIDGE_NOT_RUNNING; goto drop; } dbif = bridge_lookup_member_if(sc, dst_if); if (dbif == NULL) { /* Not a member of the bridge (anymore?) 
*/ + drop_reason = DROP_REASON_BRIDGE_NOT_A_MEMBER; goto drop; } /* Private segments can not talk to each other */ if ((sbif->bif_ifflags & dbif->bif_ifflags & IFBIF_PRIVATE) != 0) { + drop_reason = DROP_REASON_BRIDGE_PRIVATE_SEGMENT; goto drop; } bridge_ifp = sc->sc_ifp; @@ -6480,13 +6593,13 @@ bridge_forward_list(struct bridge_softc *sc, struct bridge_iflist * sbif, */ if (m != NULL) { bridge_enqueue(bridge_ifp, src_if, dst_if, etypef, m, - cksum_op); + cksum_op, pkt_direction_RX); } return; drop: BRIDGE_UNLOCK(sc); - m_freem_list(m); + m_drop_list(m, bridge_ifp, DROPTAP_FLAG_DIR_IN, drop_reason, NULL, 0); return; } @@ -6521,7 +6634,7 @@ bridge_span(struct bridge_softc *sc, ether_type_flag_t etypef, struct mbuf *m) } (void) bridge_enqueue(sc->sc_ifp, NULL, dst_if, etypef, mc, - CHECKSUM_OPERATION_NONE); + CHECKSUM_OPERATION_NONE, pkt_direction_TX); } } @@ -8587,7 +8700,7 @@ bridge_mac_nat_forward_list(ifnet_t bridge_ifp, ether_type_flag_t etypef, dst_if->if_xname, n_lists, count); bridge_enqueue(bridge_ifp, NULL, dst_if, etypef, list.head, - CHECKSUM_OPERATION_CLEAR_OFFLOAD); + CHECKSUM_OPERATION_CLEAR_OFFLOAD, pkt_direction_RX); /* start new list */ list.head = list.tail = scan; @@ -8605,7 +8718,7 @@ bridge_mac_nat_forward_list(ifnet_t bridge_ifp, ether_type_flag_t etypef, dst_if->if_xname, n_lists, count); bridge_enqueue(bridge_ifp, NULL, dst_if, etypef, list.head, - CHECKSUM_OPERATION_CLEAR_OFFLOAD); + CHECKSUM_OPERATION_CLEAR_OFFLOAD, pkt_direction_RX); } } return; @@ -8681,7 +8794,8 @@ bridge_mac_nat_arp_translate(mbuf_t *data, struct mac_nat_record *mnr, if (error != 0) { BRIDGE_LOG(LOG_NOTICE, BR_DBGF_MAC_NAT, "mbuf_copyback failed"); - m_freem(*data); + m_drop(*data, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_MAC_NAT_FAILURE, NULL, 0); *data = NULL; } return; @@ -8705,7 +8819,8 @@ bridge_mac_nat_ip_translate(mbuf_t *data, struct mac_nat_record *mnr) if (error != 0) { BRIDGE_LOG(LOG_NOTICE, BR_DBGF_MAC_NAT, "mbuf_copyback uh_sum failed"); - m_freem(*data); + m_drop(*data, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_MAC_NAT_FAILURE, NULL, 0); *data = NULL; } /* update the DHCP must broadcast flag */ @@ -8717,7 +8832,8 @@ bridge_mac_nat_ip_translate(mbuf_t *data, struct mac_nat_record *mnr) if (error != 0) { BRIDGE_LOG(LOG_NOTICE, BR_DBGF_MAC_NAT, "mbuf_copyback dp_flags failed"); - m_freem(*data); + m_drop(*data, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_MAC_NAT_FAILURE, NULL, 0); *data = NULL; } } @@ -8756,7 +8872,8 @@ bridge_mac_nat_ipv6_translate(mbuf_t *data, struct mac_nat_record *mnr, if (error != 0) { BRIDGE_LOG(LOG_NOTICE, BR_DBGF_MAC_NAT, "mbuf_copyback lladdr failed"); - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_MAC_NAT_FAILURE, NULL, 0); *data = NULL; return; } @@ -8776,7 +8893,8 @@ bridge_mac_nat_ipv6_translate(mbuf_t *data, struct mac_nat_record *mnr, if (error != 0) { BRIDGE_LOG(LOG_NOTICE, BR_DBGF_MAC_NAT, "mbuf_copyback cksum=0 failed"); - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_CHECKSUM, NULL, 0); *data = NULL; return; } @@ -8788,7 +8906,8 @@ bridge_mac_nat_ipv6_translate(mbuf_t *data, struct mac_nat_record *mnr, if (error != 0) { BRIDGE_LOG(LOG_NOTICE, BR_DBGF_MAC_NAT, "mbuf_copyback cksum failed"); - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_CHECKSUM, NULL, 0); *data = NULL; return; } @@ -9191,7 +9310,7 @@ bridge_pf(struct mbuf **mp, struct ifnet *ifp, uint32_t sc_filter_flags, return 0; bad: - m_freem(*mp); + m_drop(*mp, DROPTAP_FLAG_DIR_IN, DROP_REASON_BRIDGE_PF, NULL, 0); *mp = NULL; return 
error; } @@ -9231,7 +9350,8 @@ bridge_filter_arp_list(struct bridge_iflist * bif, mbuf_t m) sizeof(struct ether_header) + sizeof(struct ip)); } - m_freem(scan); + m_drop(scan, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_HOST_FILTER, NULL, 0); scan = NULL; } if (scan != NULL) { @@ -9250,6 +9370,7 @@ bridge_filter_checksum(ifnet_t bridge_ifp, struct bridge_iflist * bif, mbuf_t m, errno_t error; ip_packet_info info; u_int mac_hlen = sizeof(struct ether_header); + drop_reason_t drop_reason = DROP_REASON_BRIDGE_UNSPECIFIED; if (host_filter) { dbgf |= BR_DBGF_HOSTFILTER; @@ -9265,6 +9386,7 @@ bridge_filter_checksum(ifnet_t bridge_ifp, struct bridge_iflist * bif, mbuf_t m, "%s(%s) bridge_get_ip_proto failed %d", bridge_ifp->if_xname, bif->bif_ifp->if_xname, error); + drop_reason = DROP_REASON_BRIDGE_NO_PROTO; goto drop; } if (host_filter) { @@ -9288,6 +9410,7 @@ bridge_filter_checksum(ifnet_t bridge_ifp, struct bridge_iflist * bif, mbuf_t m, } if (drop) { BRIDGE_HF_DROP(brhf_ip_bad_proto, __func__, __LINE__); + drop_reason = DROP_REASON_BRIDGE_BAD_PROTO; goto drop; } bridge_hostfilter_stats.brhf_ip_ok += 1; @@ -9300,6 +9423,7 @@ bridge_filter_checksum(ifnet_t bridge_ifp, struct bridge_iflist * bif, mbuf_t m, "%s(%s) bridge_offload_checksum failed %d", bridge_ifp->if_xname, bif->bif_ifp->if_xname, error); + drop_reason = DROP_REASON_BRIDGE_CHECKSUM; goto drop; } } @@ -9314,7 +9438,7 @@ drop: sizeof(struct ether_header) + sizeof(struct ip)); } - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, drop_reason, NULL, 0); m = NULL; } return NULL; @@ -9473,6 +9597,7 @@ bridge_input_list(struct bridge_softc * sc, ifnet_t ifp, { struct bridge_iflist * bif; ifnet_t bridge_ifp; + bool bridge_needs_input; bool checksum_offload; uint8_t * dhost; #if BRIDGESTP @@ -9552,7 +9677,8 @@ bridge_input_list(struct bridge_softc * sc, ifnet_t ifp, } if (host_filter_drop) { BRIDGE_UNLOCK(sc); - m_freem_list(list.head); + m_drop_list(list.head, bridge_ifp, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_HOST_FILTER, NULL, 0); list.head = list.tail = NULL; goto done; } @@ -9669,11 +9795,12 @@ bridge_input_list(struct bridge_softc * sc, ifnet_t ifp, } /* - * If the bridge has an address assigned, and the destination MAC + * If the bridge has ULP attached, and the destination MAC * matches the bridge interface, claim the packets for the bridge * interface. 
*/ - if ((sc->sc_flags & SCF_ADDRESS_ASSIGNED) != 0 && + bridge_needs_input = (sc->sc_flags & SCF_PROTO_ATTACHED) != 0; + if (bridge_needs_input && !is_broadcast && _ether_cmp(dhost, IF_LLADDR(bridge_ifp)) == 0) { is_bridge_mac = true; } @@ -9691,7 +9818,7 @@ bridge_input_list(struct bridge_softc * sc, ifnet_t ifp, /* forward to all members except this one */ /* bridge_broadcast_list unlocks */ bridge_broadcast_list(sc, bif, etypef, - ip_bcast); + ip_bcast, pkt_direction_RX); } else { BRIDGE_UNLOCK(sc); } @@ -9728,7 +9855,8 @@ bridge_input_list(struct bridge_softc * sc, ifnet_t ifp, /* if a member is shost, there's a loop, drop it */ if (bridge_find_member(sc, shost, bif) != NULL) { BRIDGE_UNLOCK(sc); - m_freem_list(list.head); + m_drop_list(list.head, bridge_ifp, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_LOOP, NULL, 0); list.head = list.tail = NULL; goto done; } @@ -9740,7 +9868,8 @@ bridge_input_list(struct bridge_softc * sc, ifnet_t ifp, m = copy_packet_list(list.head); if (m != NULL) { /* bridge_broadcast_list unlocks */ - bridge_broadcast_list(sc, bif, etypef, m); + bridge_broadcast_list(sc, bif, etypef, m, + pkt_direction_RX); } else { BRIDGE_UNLOCK(sc); } @@ -10009,7 +10138,6 @@ m_seg(struct mbuf *m0, int hdr_len, int mss, char * hdr2_buf __sized_by_or_null( n = 1; goto done; } - if (hdr2_buf == NULL || hdr2_len <= 0) { hdr2_buf = NULL; hdr2_len = 0; @@ -10301,13 +10429,15 @@ static mblist gso_ip_tcp(ifnet_t ifp, mbuf_t m0, struct gso_ip_tcp_state *state, bool is_tx) { struct mbuf *m; + int orig_mss; int mss = 0; #ifdef GSO_STATS int total_len = m0->m_pkthdr.len; #endif /* GSO_STATS */ mblist seg; + bool tso_with_gso = false; - mss = _mbuf_get_tso_mss(m0); + orig_mss = mss = _mbuf_get_tso_mss(m0); if (mss == 0 && !is_tx) { uint8_t seg_cnt = m0->m_pkthdr.rx_seg_cnt; @@ -10349,18 +10479,31 @@ gso_ip_tcp(ifnet_t ifp, mbuf_t m0, struct gso_ip_tcp_state *state, bool is_tx) uint32_t if_tso_max; if_tso_max = get_if_tso_mtu(ifp, is_ipv4); - mss = if_tso_max - state->ip_hlen - state->tcp_hlen; + mss = if_tso_max - state->ip_hlen - state->tcp_hlen + - ETHER_HDR_LEN; + tso_with_gso = true; } } + if (!tso_with_gso) { + /* clear TSO flags */ + m0->m_pkthdr.csum_flags &= ~_TSO_CSUM; + } seg = m_seg(m0, state->hlen, mss, 0, 0); if (seg.head == NULL || seg.head->m_nextpkt == NULL) { return seg; } - BRIDGE_LOG(LOG_DEBUG, BR_DBGF_CHECKSUM, - "%s %s mss %d nsegs %d", - ifp->if_xname, - is_tx ? "TX" : "RX", - mss, seg.count); + if (tso_with_gso) { + BRIDGE_LOG(LOG_DEBUG, BR_DBGF_CHECKSUM, + "%s TX gso size %d mss %d nsegs %d", + ifp->if_xname, + mss, orig_mss, seg.count); + } else { + BRIDGE_LOG(LOG_DEBUG, BR_DBGF_CHECKSUM, + "%s %s mss %d nsegs %d", + ifp->if_xname, + is_tx ? "TX" : "RX", + mss, seg.count); + } #ifdef GSO_STATS GSOSTAT_SET_MAX(tcp.gsos_max_mss, mss); GSOSTAT_SET_MIN(tcp.gsos_min_mss, mss); @@ -10414,7 +10557,7 @@ gso_tcp_with_info(ifnet_t ifp, mbuf_t m, ip_packet_info_t info_p, info_p->ip_hlen + info_p->ip_opt_len, info_p->ip_hdr, info_p->ip_m0_len, tcp); csum_flags = is_ipv4 ? CSUM_DELAY_DATA : CSUM_DELAY_IPV6_DATA; /* XXX */ - m->m_pkthdr.csum_flags = csum_flags; + m->m_pkthdr.csum_flags |= csum_flags; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); return gso_ip_tcp(ifp, m, &state, is_tx); } @@ -10434,7 +10577,8 @@ gso_tcp(ifnet_t ifp, mbuf_t m, u_int mac_hlen, bool is_ipv4, bool is_tx) ifp->if_xname, error, is_tx ? 
"TX" : "RX"); if (m != NULL) { - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, + DROP_REASON_BRIDGE_CHECKSUM, NULL, 0); m = NULL; } goto no_segment; diff --git a/bsd/net/if_fake.c b/bsd/net/if_fake.c index 15660ded7..33dc65c76 100644 --- a/bsd/net/if_fake.c +++ b/bsd/net/if_fake.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2024 Apple Inc. All rights reserved. + * Copyright (c) 2015-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -182,16 +182,6 @@ static int if_fake_low_latency = 0; SYSCTL_INT(_net_link_fake, OID_AUTO, low_latency, CTLFLAG_RW | CTLFLAG_LOCKED, &if_fake_low_latency, 0, "Fake interface with a low latency qset"); -static int if_fake_switch_combined_mode = 0; -SYSCTL_INT(_net_link_fake, OID_AUTO, switch_combined_mode, - CTLFLAG_RW | CTLFLAG_LOCKED, &if_fake_switch_combined_mode, 0, - "Switch a qset between combined and separate mode during dequeues"); - -static int if_fake_switch_mode_frequency = 10; -SYSCTL_INT(_net_link_fake, OID_AUTO, switch_mode_frequency, - CTLFLAG_RW | CTLFLAG_LOCKED, &if_fake_switch_mode_frequency, 0, - "The number of dequeues before we switch between the combined and separated mode"); - static int if_fake_tso_support = 0; SYSCTL_INT(_net_link_fake, OID_AUTO, tso_support, CTLFLAG_RW | CTLFLAG_LOCKED, &if_fake_tso_support, 0, "Fake interface with support for TSO offload"); @@ -211,6 +201,10 @@ SYSCTL_INT(_net_link_fake, OID_AUTO, separate_frame_header, CTLFLAG_RW | CTLFLAG_LOCKED, &if_fake_separate_frame_header, 0, "Put frame header in separate mbuf"); +static int if_fake_fail_ioctl = 0; +SYSCTL_INT(_net_link_fake, OID_AUTO, fail_ioctl, CTLFLAG_RW | CTLFLAG_LOCKED, + &if_fake_fail_ioctl, 0, "Fake interface fail ioctl"); + typedef enum { IFF_PP_MODE_GLOBAL = 0, /* share a global pool */ IFF_PP_MODE_PRIVATE = 1, /* creates its own rx/tx pool */ @@ -221,6 +215,10 @@ SYSCTL_INT(_net_link_fake, OID_AUTO, pktpool_mode, CTLFLAG_RW | CTLFLAG_LOCKED, &if_fake_pktpool_mode, IFF_PP_MODE_GLOBAL, "Fake interface packet pool mode (0 global, 1 private, 2 private split"); +static int if_fake_rx_flow_steering_support = 0; +SYSCTL_INT(_net_link_fake, OID_AUTO, rx_flow_steering_support, CTLFLAG_RW | CTLFLAG_LOCKED, + &if_fake_rx_flow_steering_support, 0, "Fake interface with support for Rx flow steering"); + #define FETH_LINK_LAYER_AGGRETATION_FACTOR_MAX 512 #define FETH_LINK_LAYER_AGGRETATION_FACTOR_DEF 96 static int if_fake_link_layer_aggregation_factor = @@ -714,6 +712,7 @@ typedef uint16_t iff_flags_t; #define IFF_FLAGS_VLAN_TAGGING 0x0100 #define IFF_FLAGS_SEPARATE_FRAME_HEADER 0x0200 #define IFF_FLAGS_NX_ATTACHED 0x0400 +#define IFF_FLAGS_RX_FLOW_STEERING 0x0800 #if SKYWALK @@ -736,7 +735,6 @@ typedef struct { uint32_t fqs_idx; uint32_t fqs_dequeue_cnt; uint64_t fqs_id; - boolean_t fqs_combined_mode; } fake_qset; typedef struct { @@ -893,6 +891,17 @@ feth_set_supports_vlan_tagging(if_fake_ref fakeif) fakeif->iff_flags |= IFF_FLAGS_VLAN_TAGGING; } +static inline void +feth_set_supports_rx_flow_steering(if_fake_ref fakeif) +{ + fakeif->iff_flags |= IFF_FLAGS_RX_FLOW_STEERING; +} + +static inline bool +feth_supports_rx_flow_steering(if_fake_ref fakeif) +{ + return (fakeif->iff_flags & IFF_FLAGS_RX_FLOW_STEERING) != 0; +} #define FETH_MAXUNIT IF_MAXUNIT #define FETH_ZONE_MAX_ELEM MIN(IFNETS_MAX, FETH_MAXUNIT) @@ -961,8 +970,7 @@ get_max_mtu(int bsd_mode, unsigned int max_mtu) unsigned int mtu; if (bsd_mode != 0) { - mtu = (njcl > 0) ? 
(M16KCLBYTES - ETHER_HDR_LEN) - : MBIGCLBYTES - ETHER_HDR_LEN; + mtu = M16KCLBYTES - ETHER_HDR_LEN; if (mtu > max_mtu) { mtu = max_mtu; } @@ -2564,6 +2572,37 @@ fill_capab_qset_extensions(if_fake_ref fakeif, void *contents, uint32_t *len) return 0; } +static errno_t +feth_nx_rx_flow_steering_config(void *prov_ctx, uint32_t id, + struct ifnet_traffic_descriptor_common *td, uint32_t action) +{ +#pragma unused(td) + if_fake_ref fakeif = prov_ctx; + + FAKE_LOG(LOG_DEBUG, FE_DBGF_MISC, + "%s: nx_rx_flow_steering_config: id 0x%x, action %u", + fakeif->iff_name, id, action); + return 0; +} + +static errno_t +fill_capab_rx_flow_steering(if_fake_ref fakeif, void *contents, uint32_t *len) +{ + struct kern_nexus_capab_rx_flow_steering * __single capab = contents; + + if (*len != sizeof(*capab)) { + return EINVAL; + } + if (capab->kncrxfs_version != + KERN_NEXUS_CAPAB_RX_FLOW_STEERING_VERSION_1) { + return EINVAL; + } + + capab->kncrxfs_prov_ctx = fakeif; + capab->kncrxfs_config = feth_nx_rx_flow_steering_config; + return 0; +} + static errno_t feth_nx_capab_config(kern_nexus_provider_t nxprov, kern_nexus_t nx, kern_nexus_capab_t capab, void *contents, uint32_t *len) @@ -2582,6 +2621,9 @@ feth_nx_capab_config(kern_nexus_provider_t nxprov, kern_nexus_t nx, case KERN_NEXUS_CAPAB_QSET_EXTENSIONS: error = fill_capab_qset_extensions(fakeif, contents, len); break; + case KERN_NEXUS_CAPAB_RX_FLOW_STEERING: + error = fill_capab_rx_flow_steering(fakeif, contents, len); + break; default: error = ENOTSUP; break; @@ -2658,7 +2700,7 @@ create_netif_provider_and_instance(if_fake_ref fakeif, .nxpi_config_capab = feth_nx_capab_config, }; - _CASSERT(IFF_MAX_RX_RINGS == 1); + static_assert(IFF_MAX_RX_RINGS == 1); err = kern_nexus_attr_create(&nexus_attr); if (err != 0) { FAKE_LOG(LOG_NOTICE, FE_DBGF_LIFECYCLE, @@ -2923,7 +2965,6 @@ feth_nx_tx_qset_notify(kern_nexus_provider_t nxprov, kern_nexus_t nexus, { #pragma unused(nxprov) if_fake_ref fakeif; - ifnet_t ifp; ifnet_t peer_ifp; if_fake_ref peer_fakeif = NULL; struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats; @@ -2950,7 +2991,6 @@ feth_nx_tx_qset_notify(kern_nexus_provider_t nxprov, kern_nexus_t nexus, feth_unlock(); return 0; } - ifp = fakeif->iff_ifp; peer_ifp = fakeif->iff_peer; if (peer_ifp != NULL) { peer_fakeif = ifnet_get_if_fake(peer_ifp); @@ -2976,17 +3016,6 @@ feth_nx_tx_qset_notify(kern_nexus_provider_t nxprov, kern_nexus_t nexus, goto done; } - if (if_fake_switch_combined_mode && - qset->fqs_dequeue_cnt >= if_fake_switch_mode_frequency) { - if (qset->fqs_combined_mode) { - kern_netif_set_qset_separate(qset->fqs_qset); - } else { - kern_netif_set_qset_combined(qset->fqs_qset); - } - qset->fqs_combined_mode = !qset->fqs_combined_mode; - qset->fqs_dequeue_cnt = 0; - } - for (i = 0; i < qset->fqs_tx_queue_cnt; i++) { kern_packet_t sph = 0; kern_netif_queue_t queue = qset->fqs_tx_queue[i].fq_queue; @@ -3016,7 +3045,6 @@ feth_nx_queue_tx_push(kern_nexus_provider_t nxprov, { #pragma unused(nxprov) if_fake_ref fakeif; - ifnet_t ifp; ifnet_t peer_ifp; if_fake_ref peer_fakeif = NULL; struct netif_stats *nifs = &NX_NETIF_PRIVATE(nexus)->nif_stats; @@ -3039,7 +3067,6 @@ feth_nx_queue_tx_push(kern_nexus_provider_t nxprov, (connected ? "true" : "false")); goto done; } - ifp = fakeif->iff_ifp; peer_ifp = fakeif->iff_peer; if (peer_ifp != NULL) { peer_fakeif = ifnet_get_if_fake(peer_ifp); @@ -3264,7 +3291,7 @@ create_netif_llink_provider_and_instance(if_fake_ref fakeif, * Assume llink id is same as the index for if_fake. 
* This is not required for other drivers. */ - _CASSERT(NETIF_LLINK_ID_DEFAULT == 0); + static_assert(NETIF_LLINK_ID_DEFAULT == 0); fill_llink_info_and_params(fakeif, 0, &llink_init, NETIF_LLINK_ID_DEFAULT, qsets, if_fake_qset_cnt, KERN_NEXUS_NET_LLINK_DEFAULT); @@ -3455,6 +3482,9 @@ feth_ifnet_set_attrs(if_fake_ref fakeif, ifnet_t ifp) "ifnet_set_offload(%s, 0x%x) succeeded", ifp->if_xname, offload); } + if (feth_supports_rx_flow_steering(fakeif)) { + ifnet_set_rx_flow_steering(ifp, true); + } } static void @@ -3499,12 +3529,14 @@ feth_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) bool multi_buflet; iff_pktpool_mode_t pktpool_mode; bool tso_support; + bool rx_flow_steering_support; /* make local copy of globals needed to make consistency checks below */ bsd_mode = (if_fake_bsd_mode != 0); multi_buflet = (if_fake_multibuflet != 0); tso_support = (if_fake_tso_support != 0); pktpool_mode = if_fake_pktpool_mode; + rx_flow_steering_support = (if_fake_rx_flow_steering_support != 0); if (!bsd_mode) { /* consistency checks */ @@ -3535,7 +3567,7 @@ feth_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) fakeif->iff_llink = iff_llink; fakeif->iff_retain_count = 1; #define FAKE_ETHER_NAME_LEN (sizeof(FAKE_ETHER_NAME) - 1) - _CASSERT(FAKE_ETHER_NAME_LEN == 4); + static_assert(FAKE_ETHER_NAME_LEN == 4); strbufcpy(mac_address, FAKE_ETHER_NAME); mac_address[ETHER_ADDR_LEN - 2] = (unit & 0xff00) >> 8; mac_address[ETHER_ADDR_LEN - 1] = unit & 0xff; @@ -3611,6 +3643,10 @@ feth_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) fakeif->iff_tx_drop_rate = if_fake_tx_drops; fakeif->iff_tx_completion_mode = if_tx_completion_mode; fakeif->iff_tx_exp_policy = if_fake_tx_exp_policy; + + if (rx_flow_steering_support) { + feth_set_supports_rx_flow_steering(fakeif); + } } feth_init.tx_headroom = fakeif->iff_tx_headroom; #endif /* SKYWALK */ @@ -4249,6 +4285,15 @@ feth_ioctl(ifnet_t ifp, u_long cmd, void * data) } break; + case SIOCDIFADDR: + if (if_fake_fail_ioctl != 0) { + FAKE_LOG(LOG_NOTICE, FE_DBGF_LIFECYCLE, + "%s: failing SIOCDIFADDR with EPWROFF", + ifp->if_xname); + error = EPWROFF; + } + break; + case SIOCADDMULTI: case SIOCDELMULTI: error = 0; diff --git a/bsd/net/if_headless.c b/bsd/net/if_headless.c index 911cb87f4..d824ca394 100644 --- a/bsd/net/if_headless.c +++ b/bsd/net/if_headless.c @@ -973,7 +973,7 @@ create_netif_provider_and_instance(if_headless_ref headlessif, prov_init.nxpi_sync_tx = headless_nx_sync_tx; } - _CASSERT(IFF_MAX_RX_RINGS == 1); + static_assert(IFF_MAX_RX_RINGS == 1); snprintf((char *)provider_name, sizeof(provider_name), "com.apple.netif.%s", headlessif->iff_name); diff --git a/bsd/net/if_ipsec.c b/bsd/net/if_ipsec.c index 37fd5776c..bc5634f5f 100644 --- a/bsd/net/if_ipsec.c +++ b/bsd/net/if_ipsec.c @@ -539,7 +539,7 @@ ipsec_nexus_connected(kern_nexus_provider_t nxprov, kern_nexus_t nexus, { #pragma unused(nxprov, channel) struct ipsec_pcb *__single pcb = kern_nexus_get_context(nexus); - boolean_t ok = ifnet_is_attached(pcb->ipsec_ifp, 1); + boolean_t ok = ifnet_get_ioref(pcb->ipsec_ifp); /* Mark the data path as ready */ if (ok) { lck_mtx_lock(&pcb->ipsec_pcb_data_move_lock); @@ -1870,7 +1870,7 @@ ipsec_nexus_ifattach(struct ipsec_pcb *pcb, bzero(&pp_init, sizeof(pp_init)); pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; - pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE; + pp_init.kbi_flags |= (KBIF_VIRTUAL_DEVICE | KBIF_USER_ACCESS); // Note: we need more packets than can be held in the tx and rx rings 
because // packets can also be in the AQM queue(s) pp_init.kbi_packets = pcb->ipsec_netif_ring_size * (2 * pcb->ipsec_kpipe_count + 1); @@ -2359,13 +2359,12 @@ ipsec_enable_channel(struct ipsec_pcb *pcb, struct proc *proc) bzero(&pp_init, sizeof(pp_init)); pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; - pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE; + pp_init.kbi_flags |= (KBIF_VIRTUAL_DEVICE | KBIF_USER_ACCESS); // Note: We only needs are many packets as can be held in the tx and rx rings pp_init.kbi_packets = pcb->ipsec_netif_ring_size * 2 * pcb->ipsec_kpipe_count; pp_init.kbi_bufsize = pcb->ipsec_slot_size; pp_init.kbi_buf_seg_size = IPSEC_IF_DEFAULT_BUF_SEG_SIZE; pp_init.kbi_max_frags = 1; - pp_init.kbi_flags |= KBIF_QUANTUM; (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name), "com.apple.kpipe.%s", pcb->ipsec_if_xname); pp_init.kbi_ctx = NULL; diff --git a/bsd/net/if_llreach.c b/bsd/net/if_llreach.c index 94d61b1f5..18c9953f6 100644 --- a/bsd/net/if_llreach.c +++ b/bsd/net/if_llreach.c @@ -129,6 +129,7 @@ #include #include +#include #include #include diff --git a/bsd/net/if_loop.c b/bsd/net/if_loop.c index 8b47bf8b9..b02e7f736 100644 --- a/bsd/net/if_loop.c +++ b/bsd/net/if_loop.c @@ -698,12 +698,6 @@ loopattach(void) __func__, result); /* NOTREACHED */ } - /* - * Disable ECN on loopback as ECN serves no purpose and otherwise - * TCP connections are subject to heuristics like SYN retransmits on RST - */ - if_clear_eflags(lo_ifp, IFEF_ECN_ENABLE); - if_set_eflags(lo_ifp, IFEF_ECN_DISABLE); bpfattach(lo_ifp, DLT_NULL, sizeof(u_int32_t)); } @@ -751,6 +745,7 @@ sysctl_sched_model SYSCTL_HANDLER_ARGS case IFNET_SCHED_MODEL_NORMAL: case IFNET_SCHED_MODEL_DRIVER_MANAGED: case IFNET_SCHED_MODEL_FQ_CODEL: + case IFNET_SCHED_MODEL_FQ_CODEL_DM: break; default: @@ -783,7 +778,7 @@ sysctl_dequeue_scidx SYSCTL_HANDLER_ARGS return EINVAL; } - if (lo_sched_model != IFNET_SCHED_MODEL_DRIVER_MANAGED) { + if ((lo_sched_model & IFNET_SCHED_DRIVER_MANGED_MODELS) == 0) { return ENODEV; } diff --git a/bsd/net/if_low_power_mode.c b/bsd/net/if_low_power_mode.c index ada0e1d40..1b6b47140 100644 --- a/bsd/net/if_low_power_mode.c +++ b/bsd/net/if_low_power_mode.c @@ -87,7 +87,7 @@ if_low_power_evhdlr_callback(__unused struct eventhandler_entry_arg arg, { struct kev_dl_low_power_mode kev; - if (!IF_FULLY_ATTACHED(ifp)) { + if (!ifnet_is_fully_attached(ifp)) { return; } diff --git a/bsd/net/if_mib.c b/bsd/net/if_mib.c index 805157c16..99d98cf84 100644 --- a/bsd/net/if_mib.c +++ b/bsd/net/if_mib.c @@ -122,7 +122,7 @@ make_ifmibdata(struct ifnet *ifp, int *__counted_by(2) name, struct sysctl_req * /* * Make sure the interface is in use */ - if (ifnet_is_attached(ifp, 0)) { + if (ifnet_is_fully_attached(ifp)) { snprintf(ifmd.ifmd_name, sizeof(ifmd.ifmd_name), "%s", if_name(ifp)); diff --git a/bsd/net/if_ports_used.c b/bsd/net/if_ports_used.c index b57d7374b..5188afdde 100644 --- a/bsd/net/if_ports_used.c +++ b/bsd/net/if_ports_used.c @@ -67,6 +67,8 @@ #include +#include + #define ESP_HDR_SIZE 4 #define PORT_ISAKMP 500 #define PORT_ISAKMP_NATT 4500 /* rfc3948 */ @@ -104,12 +106,12 @@ SYSCTL_INT(_net_link_generic_system_port_used, OID_AUTO, use_test_wakeuuid, CTLFLAG_RW | CTLFLAG_LOCKED, &use_test_wakeuuid, 0, ""); -int sysctl_new_test_wakeuuid SYSCTL_HANDLER_ARGS; +static int sysctl_new_test_wakeuuid SYSCTL_HANDLER_ARGS; SYSCTL_PROC(_net_link_generic_system_port_used, OID_AUTO, new_test_wakeuuid, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_new_test_wakeuuid, "S,uuid_t", ""); -int 
sysctl_clear_test_wakeuuid SYSCTL_HANDLER_ARGS; +static int sysctl_clear_test_wakeuuid SYSCTL_HANDLER_ARGS; SYSCTL_PROC(_net_link_generic_system_port_used, OID_AUTO, clear_test_wakeuuid, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_clear_test_wakeuuid, "S,uuid_t", ""); @@ -117,6 +119,50 @@ SYSCTL_PROC(_net_link_generic_system_port_used, OID_AUTO, clear_test_wakeuuid, SYSCTL_OPAQUE(_net_link_generic_system_port_used, OID_AUTO, test_wakeuuid, CTLFLAG_RD | CTLFLAG_LOCKED, test_wakeuuid, sizeof(uuid_t), "S,uuid_t", ""); + +/* + * use_fake_lpw is used for testing only + */ +#define FAKE_LPW_OFF 0 /* fake LPW off */ +#define FAKE_LPW_ON_ONCE 1 /* use fake LPW once */ +#define FAKE_LPW_ALWAYS_ON 2 /* permanent fake LPW mode */ +#define FAKE_LPW_FLIP_ON 3 /* LPW on, then switch to off */ +#define FAKE_LPW_FLIP_OFF 4 /* LPW off, then switch to on */ + +static int use_fake_lpw = 0; +static int sysctl_use_fake_lpw SYSCTL_HANDLER_ARGS; +SYSCTL_PROC(_net_link_generic_system_port_used, OID_AUTO, use_fake_lpw, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &use_fake_lpw, 0, &sysctl_use_fake_lpw, "I", ""); + +bool fake_lpw_mode_is_set = false; + +SYSCTL_NODE(_net_link_generic_system_port_used, OID_AUTO, mark_wake_packet, + CTLFLAG_RW | CTLFLAG_LOCKED, 0, "if port used"); + +static int sysctl_mark_wake_packet_port SYSCTL_HANDLER_ARGS; +static int sysctl_mark_wake_packet_if SYSCTL_HANDLER_ARGS; + +static int mark_wake_packet_local_port = 0; +SYSCTL_PROC(_net_link_generic_system_port_used_mark_wake_packet, OID_AUTO, local_port, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &mark_wake_packet_local_port, 0, &sysctl_mark_wake_packet_port, "I", ""); + +static int mark_wake_packet_remote_port = 0; +SYSCTL_PROC(_net_link_generic_system_port_used_mark_wake_packet, OID_AUTO, remote_port, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &mark_wake_packet_remote_port, 0, &sysctl_mark_wake_packet_port, "I", ""); + +static int mark_wake_packet_ipproto = 0; +SYSCTL_INT(_net_link_generic_system_port_used_mark_wake_packet, OID_AUTO, ipproto, + CTLFLAG_RW | CTLFLAG_LOCKED, + &mark_wake_packet_ipproto, 0, ""); + +static char mark_wake_packet_if[IFNAMSIZ]; +SYSCTL_PROC(_net_link_generic_system_port_used_mark_wake_packet, OID_AUTO, if, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_mark_wake_packet_if, "A", ""); + #endif /* (DEVELOPMENT || DEBUG) */ static int sysctl_get_ports_used SYSCTL_HANDLER_ARGS; @@ -153,10 +199,13 @@ static uint32_t last_wake_phy_if_family; static uint32_t last_wake_phy_if_subfamily; static uint32_t last_wake_phy_if_functional_type; static bool last_wake_phy_if_delay_wake_pkt = false; +static bool last_wake_phy_if_lpw = false; static bool has_notified_wake_pkt = false; static bool has_notified_unattributed_wake = false; +static bool is_lpw_mode = false; + static LCK_GRP_DECLARE(net_port_entry_head_lock_group, "net port entry lock"); static LCK_MTX_DECLARE(net_port_entry_head_lock, &net_port_entry_head_lock_group); @@ -174,19 +223,6 @@ static SLIST_HEAD(net_port_entry_list, net_port_entry) net_port_entry_list = struct timeval wakeuiid_last_check; - -#if (DEBUG | DEVELOPMENT) -static int64_t npi_search_list_total = 0; -SYSCTL_QUAD(_net_link_generic_system_port_used, OID_AUTO, npi_search_list_total, - CTLFLAG_RD | CTLFLAG_LOCKED, - &npi_search_list_total, ""); - -static int64_t npi_search_list_max = 0; -SYSCTL_QUAD(_net_link_generic_system_port_used, OID_AUTO, npi_search_list_max, - CTLFLAG_RD | CTLFLAG_LOCKED, - &npi_search_list_max, ""); -#endif /* (DEBUG | DEVELOPMENT) */ - 
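The use_fake_lpw and mark_wake_packet knobs added above are DEVELOPMENT/DEBUG-only test hooks for exercising low-power-wake attribution without real hardware support. As a minimal, non-authoritative sketch of how a test could drive them from user space — assuming the parent node resolves to net.link.generic.system.port_used, as its declaration suggests, and using en0 purely as a placeholder interface name (FAKE_LPW_FLIP_ON = 3 matches the defines above):

#include <stdio.h>
#include <string.h>
#include <sys/sysctl.h>

int
main(void)
{
	int mode = 3;           /* FAKE_LPW_FLIP_ON: report LPW on, then flip off */
	char ifname[] = "en0";  /* packets on this interface are treated as LPW wakes */

	if (sysctlbyname("net.link.generic.system.port_used.use_fake_lpw",
	    NULL, NULL, &mode, sizeof(mode)) != 0) {
		perror("use_fake_lpw");
		return 1;
	}
	if (sysctlbyname("net.link.generic.system.port_used.mark_wake_packet.if",
	    NULL, NULL, ifname, strlen(ifname) + 1) != 0) {
		perror("mark_wake_packet.if");
		return 1;
	}
	return 0;
}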
/* * Hashing of the net_port_entry list is based on the local port * @@ -202,8 +238,8 @@ SYSCTL_QUAD(_net_link_generic_system_port_used, OID_AUTO, npi_search_list_max, static TAILQ_HEAD(net_port_entry_hash_table, net_port_entry) * __indexable net_port_entry_hash_table = NULL; /* - * For some types of physical interface we need to delay the notiication of wake packet event - * until a user land interface controller confirms the wake was caused by its packet + * For some types of physical interface we need to delay the notification of wake packet events + * until a user land interface controller confirms the AP wake was caused by its packet */ struct net_port_info_wake_pkt_event { uint32_t npi_wp_code; @@ -233,9 +269,16 @@ SYSCTL_PROC(_net_link_generic_system_port_used, OID_AUTO, wake_pkt_event_delay_i CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0, sysctl_wake_pkt_event_delay_if_families, "I", ""); - +/* last_wake_pkt_event is informational */ static struct net_port_info_wake_pkt_event last_wake_pkt_event; +/* + * delay_wake_pkt_event hold the current wake packet event that is delayed waiting for + * confirmation from a userspace agent + * It can be overwritten as a wake packet makes its way up the stack + */ +static struct net_port_info_wake_pkt_event delay_wake_pkt_event; + int sysctl_last_attributed_wake_event SYSCTL_HANDLER_ARGS; static SYSCTL_PROC(_net_link_generic_system_port_used, OID_AUTO, last_attributed_wake_event, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, @@ -246,6 +289,34 @@ static SYSCTL_PROC(_net_link_generic_system_port_used, OID_AUTO, last_unattributed_wake_event, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_last_unattributed_wake_event, "S,net_port_info_una_wake_event", ""); +os_log_t wake_packet_log_handle = NULL; + +static bool is_wake_pkt_event_delay(uint32_t ifrtype); + +static bool +_if_need_delayed_wake_pkt_event_inner(struct ifnet *ifp) +{ + if ((ifp->if_xflags & IFXF_DELAYWAKEPKTEVENT) != 0 || + is_wake_pkt_event_delay(ifp->if_family)) { + return true; + } + return false; +} + +static bool +if_need_delayed_wake_pkt_event(struct ifnet *ifp) +{ + if (ifp != NULL) { + if (_if_need_delayed_wake_pkt_event_inner(ifp) == true) { + return true; + } + if (ifp->if_delegated.ifp != NULL) { + return _if_need_delayed_wake_pkt_event_inner(ifp->if_delegated.ifp); + } + } + return false; +} + /* * Initialize IPv4 source address hash table. 
*/ @@ -256,11 +327,98 @@ if_ports_used_init(void) return; } + wake_packet_log_handle = os_log_create("com.apple.xnu.net.wake_packet", ""); + net_port_entry_hash_table = zalloc_permanent( NPE_HASH_BUCKET_COUNT * sizeof(*net_port_entry_hash_table), ZALIGN_PTR); } +bool +if_is_lpw_enabled(struct ifnet *ifp) +{ + bool old_is_lpw_mode = is_lpw_mode; + + if (ifp == NULL) { + return false; + } + + if ((ifp->if_xflags & IFXF_LOW_POWER_WAKE) == 0 && last_wake_phy_if_lpw == false) { + return false; + } + +#if (DEBUG || DEVELOPMENT) + if (use_fake_lpw != FAKE_LPW_OFF) { + if (strlcmp(mark_wake_packet_if, IF_XNAME(ifp), IFNAMSIZ) == 0) { + fake_lpw_mode_is_set = true; + + switch (use_fake_lpw) { + case FAKE_LPW_ON_ONCE: + is_lpw_mode = true; + use_fake_lpw = FAKE_LPW_OFF; + break; + case FAKE_LPW_ALWAYS_ON: + is_lpw_mode = true; + break; + case FAKE_LPW_FLIP_ON: + is_lpw_mode = true; + use_fake_lpw = FAKE_LPW_FLIP_OFF; + break; + case FAKE_LPW_FLIP_OFF: + is_lpw_mode = false; + use_fake_lpw = FAKE_LPW_FLIP_ON; + break; + } + + if (if_ports_used_verbose && is_lpw_mode != old_is_lpw_mode) { + os_log(wake_packet_log_handle, "if_is_lpw_enabled %s set LPW to %d", + IF_XNAME(ifp), is_lpw_mode == true ? 1 : 0); + } + + return is_lpw_mode; + } + /* In fake mode, ignore packets from other interfaces */ + return false; + } +#endif /* (DEBUG || DEVELOPMENT) */ + + if (IOPMIsLPWMode()) { + is_lpw_mode = true; + } else { + is_lpw_mode = false; + } + if (if_ports_used_verbose && is_lpw_mode != old_is_lpw_mode) { + os_log(wake_packet_log_handle, "if_is_lpw_enabled %s set LPW to %d", + IF_XNAME(ifp), is_lpw_mode == true ? 1 : 0); + } + + return is_lpw_mode; +} + +void +if_exit_lpw(struct ifnet *ifp, const char *lpw_exit_reason) +{ + if (if_is_lpw_enabled(ifp) == false) { + return; + } + is_lpw_mode = false; + + if_ports_used_stats.ifpu_lpw_to_full_wake++; + os_log_error(wake_packet_log_handle, "if_exit_lpw: LPW to Full Wake requested on %s reason %s", + IF_XNAME(ifp), lpw_exit_reason); + +#if (DEVELOPMENT || DEBUG) + if (fake_lpw_mode_is_set == true) { + /* Let's not mess up with the IO power management subsystem */ + if (IOPMIsLPWMode() == false) { + return; + } + } +#endif /* (DEVELOPMENT || DEBUG) */ + + IOPMNetworkStackFullWake(kIOPMNetworkStackFullWakeFlag, "Network.ConnectionNotIdle"); +} + static void net_port_entry_list_clear(void) { @@ -329,9 +487,9 @@ if_ports_used_update_wakeuuid(struct ifnet *ifp) if (wakeuuid_is_set) { if (uuid_parse(wakeuuid_str, wakeuuid) != 0) { - os_log(OS_LOG_DEFAULT, - "%s: IOPMCopySleepWakeUUIDKey got bad value %s\n", - __func__, wakeuuid_str); + os_log(wake_packet_log_handle, + "if_ports_used_update_wakeuuid: IOPMCopySleepWakeUUIDKey got bad value %s\n", + wakeuuid_str); wakeuuid_is_set = false; } } @@ -339,10 +497,10 @@ if_ports_used_update_wakeuuid(struct ifnet *ifp) if (!wakeuuid_is_set) { if (ifp != NULL) { if (if_ports_used_verbose > 0) { - os_log_info(OS_LOG_DEFAULT, - "%s: SleepWakeUUID not set, " + os_log_info(wake_packet_log_handle, + "if_ports_used_update_wakeuuid: SleepWakeUUID not set, " "don't update the port list for %s\n", - __func__, ifp != NULL ? if_name(ifp) : ""); + ifp != NULL ? 
if_name(ifp) : ""); } if_ports_used_stats.ifpu_wakeuuid_not_set_count += 1; microtime(&wakeuuid_not_set_last_time); @@ -367,6 +525,7 @@ if_ports_used_update_wakeuuid(struct ifnet *ifp) has_notified_unattributed_wake = false; memset(&last_wake_pkt_event, 0, sizeof(last_wake_pkt_event)); + memset(&delay_wake_pkt_event, 0, sizeof(delay_wake_pkt_event)); last_wake_phy_if_set = false; memset(&last_wake_phy_if_name, 0, sizeof(last_wake_phy_if_name)); @@ -374,6 +533,12 @@ if_ports_used_update_wakeuuid(struct ifnet *ifp) last_wake_phy_if_subfamily = IFRTYPE_SUBFAMILY_ANY; last_wake_phy_if_functional_type = IFRTYPE_FUNCTIONAL_UNKNOWN; last_wake_phy_if_delay_wake_pkt = false; + last_wake_phy_if_lpw = false; + + is_lpw_mode = false; +#if (DEVELOPMENT || DEBUG) + fake_lpw_mode_is_set = false; +#endif /* (DEVELOPMENT || DEBUG) */ } /* * Record the time last checked @@ -385,8 +550,8 @@ if_ports_used_update_wakeuuid(struct ifnet *ifp) uuid_string_t uuid_str; uuid_unparse(current_wakeuuid, uuid_str); - os_log(OS_LOG_DEFAULT, "%s: current wakeuuid %s", - __func__, uuid_str); + os_log(wake_packet_log_handle, "if_ports_used_update_wakeuuid: current wakeuuid %s for %s", + uuid_str, ifp != NULL ? if_name(ifp) : ""); } } @@ -447,7 +612,7 @@ net_port_info_add_entry(const struct net_port_info *npi) if (__improbable(is_wakeuuid_set() == false)) { if_ports_used_stats.ifpu_npi_not_added_no_wakeuuid++; if (if_ports_used_verbose > 0) { - os_log(OS_LOG_DEFAULT, "%s: wakeuuid not set not adding " + os_log(wake_packet_log_handle, "%s: wakeuuid not set not adding " "port: %u flags: 0x%xif: %u pid: %u epid %u", __func__, ntohs(npi->npi_local_port), @@ -461,7 +626,7 @@ net_port_info_add_entry(const struct net_port_info *npi) npe = zalloc_flags(net_port_entry_zone, Z_WAITOK | Z_ZERO); if (__improbable(npe == NULL)) { - os_log(OS_LOG_DEFAULT, "%s: zalloc() failed for " + os_log(wake_packet_log_handle, "%s: zalloc() failed for " "port: %u flags: 0x%x if: %u pid: %u epid %u", __func__, ntohs(npi->npi_local_port), @@ -474,6 +639,18 @@ net_port_info_add_entry(const struct net_port_info *npi) memcpy(&npe->npe_npi, npi, sizeof(npe->npe_npi)); + if (IF_INDEX_IN_RANGE(npe->npe_npi.npi_if_index)) { + struct ifnet *ifp = ifindex2ifnet[npe->npe_npi.npi_if_index]; + if (ifp != NULL) { + if (IFNET_IS_COMPANION_LINK(ifp)) { + npe->npe_npi.npi_flags |= NPIF_COMPLINK; + } + if (if_need_delayed_wake_pkt_event(ifp)) { + npe->npe_npi.npi_flags |= NPIF_DELAYWAKEPKTEVENT; + } + } + } + lck_mtx_lock(&net_port_entry_head_lock); if (net_port_info_has_entry(npi) == false) { @@ -488,7 +665,7 @@ net_port_info_add_entry(const struct net_port_info *npi) if_ports_used_stats.ifpu_npe_total++; if (if_ports_used_verbose > 1) { - os_log(OS_LOG_DEFAULT, "%s: num %u for " + os_log(wake_packet_log_handle, "%s: num %u for " "port: %u flags: 0x%x if: %u pid: %u epid %u", __func__, num, @@ -501,7 +678,7 @@ net_port_info_add_entry(const struct net_port_info *npi) } else { if_ports_used_stats.ifpu_npe_dup++; if (if_ports_used_verbose > 2) { - os_log(OS_LOG_DEFAULT, "%s: already added " + os_log(wake_packet_log_handle, "%s: already added " "port: %u flags: 0x%x if: %u pid: %u epid %u", __func__, ntohs(npi->npi_local_port), @@ -521,7 +698,7 @@ net_port_info_add_entry(const struct net_port_info *npi) } #if (DEVELOPMENT || DEBUG) -int +static int sysctl_new_test_wakeuuid SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) @@ -544,7 +721,7 @@ sysctl_new_test_wakeuuid SYSCTL_HANDLER_ARGS return error; } -int +static int sysctl_clear_test_wakeuuid SYSCTL_HANDLER_ARGS { 
#pragma unused(oidp, arg1, arg2) @@ -747,7 +924,7 @@ if_ports_used_add_inpcb(const uint32_t ifindex, const struct inpcb *inp) /* This is unlikely to happen but better be safe than sorry */ if (ifindex > UINT16_MAX) { - os_log(OS_LOG_DEFAULT, "%s: ifindex %u too big", __func__, ifindex); + os_log(wake_packet_log_handle, "%s: ifindex %u too big", __func__, ifindex); return false; } @@ -756,12 +933,6 @@ if_ports_used_add_inpcb(const uint32_t ifindex, const struct inpcb *inp) } else if (inp->inp_last_outifp != NULL) { npi.npi_if_index = (uint16_t)inp->inp_last_outifp->if_index; } - if (IF_INDEX_IN_RANGE(npi.npi_if_index)) { - struct ifnet *ifp = ifindex2ifnet[npi.npi_if_index]; - if (ifp != NULL && IFNET_IS_COMPANION_LINK(ifp)) { - npi.npi_flags |= NPIF_COMPLINK; - } - } npi.npi_flags |= NPIF_SOCKET; @@ -772,6 +943,10 @@ if_ports_used_add_inpcb(const uint32_t ifindex, const struct inpcb *inp) npi.npi_flags |= NPIF_NOWAKE; } + if (inp->inp_flags2 & INP2_CONNECTION_IDLE) { + npi.npi_flags |= NPIF_CONNECTION_IDLE; + } + if (SOCK_PROTO(so) == IPPROTO_TCP) { struct tcpcb *tp = intotcpcb(inp); @@ -782,7 +957,7 @@ if_ports_used_add_inpcb(const uint32_t ifindex, const struct inpcb *inp) } else if (SOCK_PROTO(so) == IPPROTO_UDP) { npi.npi_flags |= NPIF_UDP; } else { - os_log(OS_LOG_DEFAULT, "%s: unexpected protocol %u for inp %p", __func__, + os_log(wake_packet_log_handle, "%s: unexpected protocol %u for inp %p", __func__, SOCK_PROTO(inp->inp_socket), inp); return false; } @@ -856,16 +1031,10 @@ if_ports_used_add_flow_entry(const struct flow_entry *fe, const uint32_t ifindex /* This is unlikely to happen but better be safe than sorry */ if (ifindex > UINT16_MAX) { - os_log(OS_LOG_DEFAULT, "%s: ifindex %u too big", __func__, ifindex); + os_log(wake_packet_log_handle, "%s: ifindex %u too big", __func__, ifindex); return false; } npi.npi_if_index = (uint16_t)ifindex; - if (IF_INDEX_IN_RANGE(ifindex)) { - struct ifnet *ifp = ifindex2ifnet[ifindex]; - if (ifp != NULL && IFNET_IS_COMPANION_LINK(ifp)) { - npi.npi_flags |= NPIF_COMPLINK; - } - } npi.npi_flags |= NPIF_CHANNEL; @@ -875,6 +1044,9 @@ if_ports_used_add_flow_entry(const struct flow_entry *fe, const uint32_t ifindex if (ns_flags & NETNS_NOWAKEFROMSLEEP) { npi.npi_flags |= NPIF_NOWAKE; } + if (ns_flags & NETNS_CONNECTION_IDLE) { + npi.npi_flags |= NPIF_CONNECTION_IDLE; + } if ((ns_flags & NETNS_OWNER_MASK) == NETNS_LISTENER) { npi.npi_flags |= NPIF_LISTEN; } @@ -886,7 +1058,7 @@ if_ports_used_add_flow_entry(const struct flow_entry *fe, const uint32_t ifindex } else if (nfi->nfi_protocol == IPPROTO_UDP) { npi.npi_flags |= NPIF_UDP; } else { - os_log(OS_LOG_DEFAULT, "%s: unexpected protocol %u for nfi %p", + os_log(wake_packet_log_handle, "%s: unexpected protocol %u for nfi %p", __func__, nfi->nfi_protocol, nfi); return false; } @@ -952,6 +1124,11 @@ net_port_info_log_npi(const char *s, const struct net_port_info *npi) char lbuf[MAX_IPv6_STR_LEN] = {}; char fbuf[MAX_IPv6_STR_LEN] = {}; + if (npi == NULL) { + os_log(wake_packet_log_handle, "%s", s); + return; + } + if (npi->npi_flags & NPIF_IPV4) { inet_ntop(PF_INET, &npi->npi_local_addr_in.s_addr, lbuf, sizeof(lbuf)); @@ -963,7 +1140,7 @@ net_port_info_log_npi(const char *s, const struct net_port_info *npi) inet_ntop(PF_INET6, &npi->npi_foreign_addr_in6, fbuf, sizeof(fbuf)); } - os_log(OS_LOG_DEFAULT, "%s net_port_info if_index %u arch %s family %s proto %s local %s:%u foreign %s:%u pid: %u epid %u", + os_log(wake_packet_log_handle, "%s net_port_info if_index %u arch %s family %s proto %s local %s:%u 
foreign %s:%u pid: %u epid %u", s != NULL ? s : "", npi->npi_if_index, (npi->npi_flags & NPIF_SOCKET) ? "so" : (npi->npi_flags & NPIF_CHANNEL) ? "ch" : "unknown", @@ -987,7 +1164,7 @@ net_port_info_match_npi(struct net_port_entry *npe, const struct net_port_info * struct net_port_entry **best_match) { if (__improbable(net_wake_pkt_debug > 1)) { - net_port_info_log_npi(" ", &npe->npe_npi); + net_port_info_log_npi("net_port_info_match_npi", &npe->npe_npi); } /* @@ -1104,9 +1281,7 @@ net_port_info_find_match(struct net_port_info *in_npi) if (best_match != NULL) { best_match->npe_npi.npi_flags |= NPIF_WAKEPKT; - if (best_match->npe_npi.npi_flags & NPIF_NOWAKE) { - in_npi->npi_flags |= NPIF_NOWAKE; - } + in_npi->npi_flags = best_match->npe_npi.npi_flags; in_npi->npi_owner_pid = best_match->npe_npi.npi_owner_pid; in_npi->npi_effective_pid = best_match->npe_npi.npi_effective_pid; strbufcpy(in_npi->npi_owner_pname, best_match->npe_npi.npi_owner_pname); @@ -1145,7 +1320,7 @@ net_port_info_log_una_wake_event(const char *s, struct net_port_info_una_wake_ev inet_ntop(PF_INET6, &ev->una_wake_pkt_foreign_addr_._in_a_6.s6_addr, fbuf, sizeof(fbuf)); } - os_log(OS_LOG_DEFAULT, "%s if %s (%u) phy_if %s proto %s local %s:%u foreign %s:%u len: %u datalen: %u cflags: 0x%x proto: %u", + os_log(wake_packet_log_handle, "%s if %s (%u) phy_if %s proto %s local %s:%u foreign %s:%u len: %u datalen: %u cflags: 0x%x proto: %u lpw: %d", s != NULL ? s : "", ev->una_wake_pkt_ifname, ev->una_wake_pkt_if_index, ev->una_wake_pkt_phy_ifname, ev->una_wake_pkt_flags & NPIF_TCP ? "tcp" : ev->una_wake_pkt_flags & NPIF_UDP ? "udp" : @@ -1153,7 +1328,8 @@ net_port_info_log_una_wake_event(const char *s, struct net_port_info_una_wake_ev lbuf, ntohs(ev->una_wake_pkt_local_port), fbuf, ntohs(ev->una_wake_pkt_foreign_port), ev->una_wake_pkt_total_len, ev->una_wake_pkt_data_len, - ev->una_wake_pkt_control_flags, ev->una_wake_pkt_proto); + ev->una_wake_pkt_control_flags, ev->una_wake_pkt_proto, + ev->una_wake_pkt_flags & NPIF_LPW ? 1 : 0); } static void @@ -1173,7 +1349,7 @@ net_port_info_log_wake_event(const char *s, struct net_port_info_wake_event *ev) inet_ntop(PF_INET6, &ev->wake_pkt_foreign_addr_._in_a_6.s6_addr, fbuf, sizeof(fbuf)); } - os_log(OS_LOG_DEFAULT, "%s if %s (%u) phy_if %s proto %s local %s:%u foreign %s:%u len: %u datalen: %u cflags: 0x%x proc %s eproc %s", + os_log(wake_packet_log_handle, "%s if %s (%u) phy_if %s proto %s local %s:%u foreign %s:%u len: %u datalen: %u cflags: 0x%x proc %s eproc %s idle %d lpw %d", s != NULL ? s : "", ev->wake_pkt_ifname, ev->wake_pkt_if_index, ev->wake_pkt_phy_ifname, ev->wake_pkt_flags & NPIF_TCP ? "tcp" : ev->wake_pkt_flags ? "udp" : @@ -1181,7 +1357,9 @@ net_port_info_log_wake_event(const char *s, struct net_port_info_wake_event *ev) lbuf, ntohs(ev->wake_pkt_port), fbuf, ntohs(ev->wake_pkt_foreign_port), ev->wake_pkt_total_len, ev->wake_pkt_data_len, ev->wake_pkt_control_flags, - ev->wake_pkt_owner_pname, ev->wake_pkt_effective_pname); + ev->wake_pkt_owner_pname, ev->wake_pkt_effective_pname, + ev->wake_pkt_flags & NPIF_CONNECTION_IDLE ? 1 : 0, + ev->wake_pkt_flags & NPIF_LPW ? 
1 : 0); } #endif /* (DEBUG || DEVELOPMENT) */ @@ -1212,20 +1390,46 @@ is_wake_pkt_event_delay(uint32_t ifrtype) return false; } -static void +static int if_set_wake_physical_interface(struct ifnet *ifp) { - if (last_wake_phy_if_set == true || ifp == NULL) { - return; + /* + * A physical interface is either Ethernet, cellular or companion link over BT + * otherwise assumes it is some kind of tunnel + */ + if (ifp->if_family != IFNET_FAMILY_ETHERNET && ifp->if_family != IFNET_FAMILY_CELLULAR && + IFNET_IS_COMPANION_LINK_BLUETOOTH(ifp) == false) { + return 0; } + + /* + * Only handle a wake from a physical interface per wake cycle + */ + if (last_wake_phy_if_set == true) { + if_ports_used_stats.ifpu_wake_pkt_event_error += 1; + os_log(wake_packet_log_handle, + "if_set_wake_physical_interface ignored on %s because already set on %s", + IF_XNAME(ifp), last_wake_phy_if_name); + return EJUSTRETURN; + } + last_wake_phy_if_set = true; strlcpy(last_wake_phy_if_name, IF_XNAME(ifp), sizeof(last_wake_phy_if_name)); last_wake_phy_if_family = ifp->if_family; last_wake_phy_if_subfamily = ifp->if_subfamily; last_wake_phy_if_functional_type = if_functional_type(ifp, true); - if ((ifp->if_xflags & IFXF_DELAYWAKEPKTEVENT) != 0 || is_wake_pkt_event_delay(last_wake_phy_if_family)) { + + if (if_need_delayed_wake_pkt_event(ifp)) { + if_ports_used_stats.ifpu_delay_phy_wake_pkt += 1; last_wake_phy_if_delay_wake_pkt = true; + os_log(wake_packet_log_handle, "if_set_wake_physical_interface %s last_wake_phy_if_delay_wake_pkt set", + IF_XNAME(ifp)); } + if ((ifp->if_flags & IFXF_LOW_POWER_WAKE) != 0) { + last_wake_phy_if_lpw = true; + } + + return 0; } static void @@ -1233,6 +1437,12 @@ deliver_unattributed_wake_packet_event(struct net_port_info_una_wake_event *even { struct kev_msg ev_msg = {}; + if_ports_used_stats.ifpu_unattributed_wake_event += 1; + + last_wake_pkt_event.npi_wp_code = KEV_POWER_UNATTRIBUTED_WAKE; + memcpy(&last_wake_pkt_event.npi_ev_wake_pkt_unattributed, event_data, + sizeof(struct net_port_info_una_wake_event)); + ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_POWER_SUBCLASS; @@ -1246,7 +1456,7 @@ deliver_unattributed_wake_packet_event(struct net_port_info_una_wake_event *even uuid_string_t wake_uuid_str; uuid_unparse(event_data->una_wake_uuid, wake_uuid_str); - os_log_error(OS_LOG_DEFAULT, + os_log_error(wake_packet_log_handle, "%s: kev_post_msg() failed with error %d for wake uuid %s", __func__, result, wake_uuid_str); @@ -1262,6 +1472,14 @@ deliver_attributed_wake_packet_event(struct net_port_info_wake_event *event_data { struct kev_msg ev_msg = {}; + has_notified_wake_pkt = true; + + if_ports_used_stats.ifpu_wake_pkt_event += 1; + + last_wake_pkt_event.npi_wp_code = KEV_POWER_WAKE_PACKET; + memcpy(&last_wake_pkt_event.npi_ev_wake_pkt_attributed, event_data, + sizeof(struct net_port_info_wake_event)); + ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_POWER_SUBCLASS; @@ -1275,7 +1493,7 @@ deliver_attributed_wake_packet_event(struct net_port_info_wake_event *event_data uuid_string_t wake_uuid_str; uuid_unparse(event_data->wake_uuid, wake_uuid_str); - os_log_error(OS_LOG_DEFAULT, + os_log_error(wake_packet_log_handle, "%s: kev_post_msg() failed with error %d for wake uuid %s", __func__, result, wake_uuid_str); @@ -1286,31 +1504,81 @@ deliver_attributed_wake_packet_event(struct net_port_info_wake_event *event_data #endif /* (DEBUG || DEVELOPMENT) */ } -static void 
-if_notify_unattributed_wake_mbuf(struct ifnet *ifp, struct mbuf *m, - struct net_port_info *npi, uint32_t pkt_total_len, uint32_t pkt_data_len, - uint16_t pkt_control_flags, uint16_t proto) +static bool +is_unattributed_wake_already_notified(struct net_port_info *npi) { - LCK_MTX_ASSERT(&net_port_entry_head_lock, LCK_MTX_ASSERT_NOTOWNED); + bool retval = false; - lck_mtx_lock(&net_port_entry_head_lock); - if (has_notified_unattributed_wake) { - lck_mtx_unlock(&net_port_entry_head_lock); + if (has_notified_unattributed_wake == true || has_notified_wake_pkt == true) { if_ports_used_stats.ifpu_dup_unattributed_wake_event += 1; if (__improbable(net_wake_pkt_debug > 0)) { net_port_info_log_npi("already notified unattributed wake packet", npi); } - return; + retval = true; } - has_notified_unattributed_wake = true; + + return retval; +} + +static void +check_for_existing_delayed_wake_event() +{ + /* + * Count the delayed events that are ignored as the most recent delayed + * wake event wins as the packet makes up its way up the stack + */ + if (delay_wake_pkt_event.npi_wp_code == KEV_POWER_WAKE_PACKET) { + if_ports_used_stats.ifpu_ignored_delayed_attributed_events += 1; + } else if (delay_wake_pkt_event.npi_wp_code == KEV_POWER_UNATTRIBUTED_WAKE) { + if_ports_used_stats.ifpu_ignored_delayed_unattributed_events += 1; + } +} + +static void +if_notify_unattributed_wake_common(struct ifnet *ifp, struct net_port_info *npi, + struct net_port_info_una_wake_event *event_data) +{ + LCK_MTX_ASSERT(&net_port_entry_head_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(&net_port_entry_head_lock); + + if (is_unattributed_wake_already_notified(npi) == true) { + goto done; + } + + /* + * Check if this is a wake packet that we cannot process inline + */ + if (if_need_delayed_wake_pkt_event(ifp)) { + check_for_existing_delayed_wake_event(); + + delay_wake_pkt_event.npi_wp_code = KEV_POWER_UNATTRIBUTED_WAKE; + memcpy(&delay_wake_pkt_event.npi_ev_wake_pkt_unattributed, event_data, + sizeof(struct net_port_info_una_wake_event)); + +#if (DEBUG || DEVELOPMENT) + if (if_ports_used_verbose > 0) { + net_port_info_log_una_wake_event("delay unattributed wake packet event", event_data); + } +#endif /* (DEBUG || DEVELOPMENT) */ + + goto done; + } + deliver_unattributed_wake_packet_event(event_data); + +done: lck_mtx_unlock(&net_port_entry_head_lock); +} - if_ports_used_stats.ifpu_unattributed_wake_event += 1; - +static void +if_notify_unattributed_wake_mbuf(struct ifnet *ifp, struct mbuf *m, + struct net_port_info *npi, uint32_t pkt_total_len, uint32_t pkt_data_len, + uint16_t pkt_control_flags, uint16_t proto) +{ struct net_port_info_una_wake_event event_data = {}; + uuid_copy(event_data.una_wake_uuid, current_wakeuuid); - event_data.una_wake_pkt_if_index = ifp != NULL ? 
ifp->if_index : 0; + event_data.una_wake_pkt_if_index = ifp->if_index; event_data.una_wake_pkt_flags = npi->npi_flags; event_data.una_wake_pkt_local_port = npi->npi_local_port; @@ -1323,20 +1591,16 @@ if_notify_unattributed_wake_mbuf(struct ifnet *ifp, struct mbuf *m, event_data.una_wake_pkt_control_flags = pkt_control_flags; event_data.una_wake_pkt_proto = proto; - if (ifp != NULL) { - strlcpy(event_data.una_wake_pkt_ifname, IF_XNAME(ifp), - sizeof(event_data.una_wake_pkt_ifname)); - event_data.una_wake_pkt_if_info.npi_if_family = ifp->if_family; - event_data.una_wake_pkt_if_info.npi_if_subfamily = ifp->if_subfamily; - event_data.una_wake_pkt_if_info.npi_if_functional_type = if_functional_type(ifp, true); + strlcpy(event_data.una_wake_pkt_ifname, IF_XNAME(ifp), + sizeof(event_data.una_wake_pkt_ifname)); + event_data.una_wake_pkt_if_info.npi_if_family = ifp->if_family; + event_data.una_wake_pkt_if_info.npi_if_subfamily = ifp->if_subfamily; + event_data.una_wake_pkt_if_info.npi_if_functional_type = if_functional_type(ifp, true); - strbufcpy(event_data.una_wake_pkt_phy_ifname, last_wake_phy_if_name); - event_data.una_wake_pkt_phy_if_info.npi_if_family = last_wake_phy_if_family; - event_data.una_wake_pkt_phy_if_info.npi_if_subfamily = last_wake_phy_if_subfamily; - event_data.una_wake_pkt_phy_if_info.npi_if_functional_type = last_wake_phy_if_functional_type; - } else { - if_ports_used_stats.ifpu_unattributed_null_recvif += 1; - } + strbufcpy(event_data.una_wake_pkt_phy_ifname, last_wake_phy_if_name); + event_data.una_wake_pkt_phy_if_info.npi_if_family = last_wake_phy_if_family; + event_data.una_wake_pkt_phy_if_info.npi_if_subfamily = last_wake_phy_if_subfamily; + event_data.una_wake_pkt_phy_if_info.npi_if_functional_type = last_wake_phy_if_functional_type; event_data.una_wake_ptk_len = m->m_pkthdr.len > NPI_MAX_UNA_WAKE_PKT_LEN ? 
NPI_MAX_UNA_WAKE_PKT_LEN : (u_int16_t)m->m_pkthdr.len; @@ -1347,7 +1611,7 @@ if_notify_unattributed_wake_mbuf(struct ifnet *ifp, struct mbuf *m, uuid_string_t wake_uuid_str; uuid_unparse(event_data.una_wake_uuid, wake_uuid_str); - os_log_error(OS_LOG_DEFAULT, + os_log_error(wake_packet_log_handle, "%s: mbuf_copydata() failed with error %d for wake uuid %s", __func__, error, wake_uuid_str); @@ -1355,19 +1619,21 @@ if_notify_unattributed_wake_mbuf(struct ifnet *ifp, struct mbuf *m, return; } - last_wake_pkt_event.npi_wp_code = KEV_POWER_UNATTRIBUTED_WAKE; - memcpy(&last_wake_pkt_event.npi_ev_wake_pkt_unattributed, &event_data, sizeof(last_wake_pkt_event.npi_ev_wake_pkt_unattributed)); + if_notify_unattributed_wake_common(ifp, npi, &event_data); +} - if (last_wake_phy_if_delay_wake_pkt) { -#if (DEBUG || DEVELOPMENT) - if (if_ports_used_verbose > 0) { - net_port_info_log_una_wake_event("delay unattributed wake packet event", &event_data); +static bool +is_attributed_wake_already_notified(struct net_port_info *npi) +{ + if (has_notified_wake_pkt == true) { + if_ports_used_stats.ifpu_dup_wake_pkt_event += 1; + if (__improbable(net_wake_pkt_debug > 0)) { + net_port_info_log_npi("already notified attributed wake packet", npi); } -#endif /* (DEBUG || DEVELOPMENT) */ - return; + return true; } - deliver_unattributed_wake_packet_event(&event_data); + return false; } static void @@ -1412,38 +1678,50 @@ if_notify_wake_packet(struct ifnet *ifp, struct net_port_info *npi, lck_mtx_lock(&net_port_entry_head_lock); - if (has_notified_wake_pkt) { - lck_mtx_unlock(&net_port_entry_head_lock); - if_ports_used_stats.ifpu_dup_wake_pkt_event += 1; - - if (__improbable(net_wake_pkt_debug > 0)) { - net_port_info_log_npi("already notified wake packet", npi); - } - return; - } - has_notified_wake_pkt = true; - - last_wake_pkt_event.npi_wp_code = KEV_POWER_WAKE_PACKET; - memcpy(&last_wake_pkt_event.npi_ev_wake_pkt_attributed, &event_data, sizeof(last_wake_pkt_event.npi_ev_wake_pkt_attributed)); - - lck_mtx_unlock(&net_port_entry_head_lock); - - if (npi->npi_flags & NPIF_NOWAKE) { - if_ports_used_stats.ifpu_spurious_wake_event += 1; - } else { - if_ports_used_stats.ifpu_wake_pkt_event += 1; + /* + * Always immediately notify attributed wake for idle connections in LPW + * even if an attributed wake has already been notified or + * the interface requires delayed wake attribution + */ + if (if_is_lpw_enabled(ifp) && + (npi->npi_flags & NPIF_CONNECTION_IDLE) != 0) { + goto deliver; } - if (last_wake_phy_if_delay_wake_pkt) { + if (is_attributed_wake_already_notified(npi) == true) { + goto done; + } + + /* + * Check if this is a wake packet that we cannot process inline + * We do not delay attributed idle connections in LPW because it is more + * important to get accurate count about attributed idle connections in LPW + * than an accurate count of attributed wake. 
+ */ + if (if_need_delayed_wake_pkt_event(ifp)) { + check_for_existing_delayed_wake_event(); + + delay_wake_pkt_event.npi_wp_code = KEV_POWER_WAKE_PACKET; + memcpy(&delay_wake_pkt_event.npi_ev_wake_pkt_attributed, &event_data, + sizeof(struct net_port_info_wake_event)); + #if (DEBUG || DEVELOPMENT) if (if_ports_used_verbose > 0) { net_port_info_log_wake_event("delay attributed wake packet event", &event_data); } #endif /* (DEBUG || DEVELOPMENT) */ - return; + + goto done; + } + +deliver: + if (npi->npi_flags & NPIF_NOWAKE) { + if_ports_used_stats.ifpu_spurious_wake_event += 1; } deliver_attributed_wake_packet_event(&event_data); +done: + lck_mtx_unlock(&net_port_entry_head_lock); } static bool @@ -1465,7 +1743,7 @@ is_encapsulated_esp(struct mbuf *m, size_t data_offset) errno_t error = mbuf_copydata(m, data_offset, ESP_HDR_SIZE, &payload); if (error != 0) { - os_log(OS_LOG_DEFAULT, "%s: mbuf_copydata(ESP_HDR_SIZE) error %d", + os_log(wake_packet_log_handle, "%s: mbuf_copydata(ESP_HDR_SIZE) error %d", __func__, error); } else if (payload[0] == 0 && payload[1] == 0 && payload[2] == 0 && payload[3] == 0) { @@ -1475,6 +1753,48 @@ is_encapsulated_esp(struct mbuf *m, size_t data_offset) return true; } +static void +log_hexdump(os_log_t log_handle, void *__sized_by(len) data, size_t len) +{ + size_t i, j, k; + unsigned char *ptr = (unsigned char *)data; +#define MAX_DUMP_BUF 32 + unsigned char buf[3 * MAX_DUMP_BUF + 1]; + + for (i = 0; i < len; i += MAX_DUMP_BUF) { + for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) { + unsigned char msnbl = ptr[j] >> 4; + unsigned char lsnbl = ptr[j] & 0x0f; + + buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10; + buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10; + + if ((j % 2) == 1) { + buf[k++] = ' '; + } + if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) { + buf[k++] = ' '; + } + } + buf[k] = 0; + os_log(log_handle, "%3lu: %s", i, buf); + } +} + +__attribute__((noinline)) +static void +log_wake_mbuf(struct ifnet *ifp, struct mbuf *m) +{ + char buffer[64]; + size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer)); + + os_log(wake_packet_log_handle, "wake packet from %s len %d", + ifp->if_xname, m_pktlen(m)); + if (mbuf_copydata(m, 0, buflen, buffer) == 0) { + log_hexdump(wake_packet_log_handle, buffer, buflen); + } +} + void if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, struct mbuf *m) { @@ -1486,10 +1806,28 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru uint16_t pkt_control_flags = 0; uint16_t pkt_proto = 0; + if (ifp == NULL) { + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: receive interface is NULL"); + if_ports_used_stats.ifpu_unattributed_null_recvif += 1; + return; + } + if ((m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) == 0) { if_ports_used_stats.ifpu_match_wake_pkt_no_flag += 1; - os_log_error(OS_LOG_DEFAULT, "%s: called PKTF_WAKE_PKT not set from %s", - __func__, ifp != NULL ? 
IF_XNAME(ifp) : ""); + os_log_error(wake_packet_log_handle, "if_ports_used_match_mbuf: called PKTF_WAKE_PKT not set from %s", + IF_XNAME(ifp)); + return; + } + + if (__improbable(net_wake_pkt_debug > 0)) { + log_wake_mbuf(ifp, m); + } + + /* + * Only accept one wake from a physical interface per wake cycle + */ + if (if_set_wake_physical_interface(ifp) == EJUSTRETURN) { + m->m_pkthdr.pkt_flags &= ~PKTF_WAKE_PKT; return; } @@ -1498,12 +1836,9 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru pkt_total_len = m->m_pkthdr.len; pkt_data_len = pkt_total_len; - if (ifp != NULL) { - npi.npi_if_index = ifp->if_index; - if (IFNET_IS_COMPANION_LINK(ifp)) { - npi.npi_flags |= NPIF_COMPLINK; - } - if_set_wake_physical_interface(ifp); + npi.npi_if_index = ifp->if_index; + if (IFNET_IS_COMPANION_LINK(ifp)) { + npi.npi_flags |= NPIF_COMPLINK; } if (proto_family == PF_INET) { @@ -1513,8 +1848,8 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru error = mbuf_copydata(m, 0, sizeof(struct ip), &iphdr); if (error != 0) { - os_log(OS_LOG_DEFAULT, "%s: mbuf_copydata(ip) error %d", - __func__, error); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: mbuf_copydata(ip) error %d", + error); goto failed; } npi.npi_flags |= NPIF_IPV4; @@ -1550,8 +1885,8 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru struct tcphdr th = {}; error = mbuf_copydata(m, iphdr.ip_hl << 2, sizeof(struct tcphdr), &th); if (error != 0) { - os_log(OS_LOG_DEFAULT, "%s: mbuf_copydata(tcphdr) error %d", - __func__, error); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: mbuf_copydata(tcphdr) error %d", + error); goto failed; } npi.npi_local_port = th.th_dport; @@ -1578,8 +1913,8 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru error = mbuf_copydata(m, udp_offset, sizeof(struct udphdr), &uh); if (error != 0) { - os_log(OS_LOG_DEFAULT, "%s: mbuf_copydata(udphdr) error %d", - __func__, error); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: mbuf_copydata(udphdr) error %d", + error); goto failed; } npi.npi_local_port = uh.uh_dport; @@ -1618,8 +1953,8 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru } default: if_ports_used_stats.ifpu_bad_proto_wake_pkt += 1; - os_log(OS_LOG_DEFAULT, "%s: unexpected IPv4 protocol %u from %s", - __func__, iphdr.ip_p, IF_XNAME(ifp)); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: unexpected IPv4 protocol %u from %s", + iphdr.ip_p, IF_XNAME(ifp)); goto failed; } } else if (proto_family == PF_INET6) { @@ -1629,8 +1964,8 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru error = mbuf_copydata(m, 0, sizeof(struct ip6_hdr), &ip6_hdr); if (error != 0) { - os_log(OS_LOG_DEFAULT, "%s: mbuf_copydata(ip6_hdr) error %d", - __func__, error); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: mbuf_copydata(ip6_hdr) error %d", + error); goto failed; } npi.npi_flags |= NPIF_IPV6; @@ -1656,8 +1991,8 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru error = mbuf_copydata(m, sizeof(struct ip6_hdr), sizeof(struct ip6_frag), &ip6_frag); if (error != 0) { - os_log(OS_LOG_DEFAULT, "%s: mbuf_copydata(ip6_frag) error %d", - __func__, error); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: mbuf_copydata(ip6_frag) error %d", + error); goto failed; } @@ -1688,8 +2023,8 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t 
proto_family, stru error = mbuf_copydata(m, l3_len, sizeof(struct tcphdr), &th); if (error != 0) { - os_log(OS_LOG_DEFAULT, "%s: mbuf_copydata(tcphdr) error %d", - __func__, error); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: mbuf_copydata(tcphdr) error %d", + error); if_ports_used_stats.ifpu_incomplete_tcp_hdr_pkt += 1; goto failed; } @@ -1721,8 +2056,8 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru error = mbuf_copydata(m, l3_len, sizeof(struct udphdr), &uh); if (error != 0) { - os_log(OS_LOG_DEFAULT, "%s: mbuf_copydata(udphdr) error %d", - __func__, error); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: mbuf_copydata(udphdr) error %d", + error); if_ports_used_stats.ifpu_incomplete_udp_hdr_pkt += 1; goto failed; } @@ -1763,21 +2098,35 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru default: if_ports_used_stats.ifpu_bad_proto_wake_pkt += 1; - os_log(OS_LOG_DEFAULT, "%s: unexpected IPv6 protocol %u from %s", - __func__, ip6_hdr.ip6_nxt, IF_XNAME(ifp)); + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: unexpected IPv6 protocol %u from %s", + ip6_hdr.ip6_nxt, IF_XNAME(ifp)); goto failed; } } else { if_ports_used_stats.ifpu_bad_family_wake_pkt += 1; - os_log(OS_LOG_DEFAULT, "%s: unexpected protocol family %d from %s", - __func__, proto_family, IF_XNAME(ifp)); - goto failed; - } - if (ifp == NULL) { + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: unexpected protocol family %d from %s", + proto_family, IF_XNAME(ifp)); goto failed; } found = net_port_info_find_match(&npi); + +failed: + if (__improbable(if_is_lpw_enabled(ifp))) { + npi.npi_flags |= NPIF_LPW; + + if (found && (npi.npi_flags & NPIF_CONNECTION_IDLE)) { + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: idle connection in LPW on %s", + IF_XNAME(ifp)); + + if_ports_used_stats.ifpu_lpw_connection_idle_wake++; + } else { + os_log(wake_packet_log_handle, "if_ports_used_match_mbuf: not idle connection in LPW on %s", + IF_XNAME(ifp)); + + if_ports_used_stats.ifpu_lpw_not_idle_wake++; + } + } if (found) { if_notify_wake_packet(ifp, &npi, pkt_total_len, pkt_data_len, pkt_control_flags); @@ -1785,10 +2134,6 @@ if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, stru if_notify_unattributed_wake_mbuf(ifp, m, &npi, pkt_total_len, pkt_data_len, pkt_control_flags, pkt_proto); } - return; -failed: - if_notify_unattributed_wake_mbuf(ifp, m, &npi, - pkt_total_len, pkt_data_len, pkt_control_flags, pkt_proto); } #if SKYWALK @@ -1798,32 +2143,10 @@ if_notify_unattributed_wake_pkt(struct ifnet *ifp, struct __kern_packet *pkt, struct net_port_info *npi, uint32_t pkt_total_len, uint32_t pkt_data_len, uint16_t pkt_control_flags, uint16_t proto) { - LCK_MTX_ASSERT(&net_port_entry_head_lock, LCK_MTX_ASSERT_NOTOWNED); - - lck_mtx_lock(&net_port_entry_head_lock); - if (has_notified_unattributed_wake) { - lck_mtx_unlock(&net_port_entry_head_lock); - if_ports_used_stats.ifpu_dup_unattributed_wake_event += 1; - - if (__improbable(net_wake_pkt_debug > 0)) { - net_port_info_log_npi("already notified unattributed wake packet", npi); - } - return; - } - has_notified_unattributed_wake = true; - lck_mtx_unlock(&net_port_entry_head_lock); - - if_ports_used_stats.ifpu_unattributed_wake_event += 1; - - if (ifp == NULL) { - os_log(OS_LOG_DEFAULT, "%s: receive interface is NULL", - __func__); - if_ports_used_stats.ifpu_unattributed_null_recvif += 1; - } - struct net_port_info_una_wake_event event_data = {}; + 
uuid_copy(event_data.una_wake_uuid, current_wakeuuid); - event_data.una_wake_pkt_if_index = ifp != NULL ? ifp->if_index : 0; + event_data.una_wake_pkt_if_index = ifp->if_index; event_data.una_wake_pkt_flags = npi->npi_flags; uint16_t offset = kern_packet_get_network_header_offset(SK_PKT2PH(pkt)); @@ -1838,29 +2161,31 @@ if_notify_unattributed_wake_pkt(struct ifnet *ifp, struct __kern_packet *pkt, event_data.una_wake_pkt_foreign_port = npi->npi_foreign_port; event_data.una_wake_pkt_local_addr_ = npi->npi_local_addr_; event_data.una_wake_pkt_foreign_addr_ = npi->npi_foreign_addr_; - if (ifp != NULL) { - strlcpy(event_data.una_wake_pkt_ifname, IF_XNAME(ifp), - sizeof(event_data.una_wake_pkt_ifname)); - } + strlcpy(event_data.una_wake_pkt_ifname, IF_XNAME(ifp), + sizeof(event_data.una_wake_pkt_ifname)); event_data.una_wake_pkt_total_len = pkt_total_len; event_data.una_wake_pkt_data_len = pkt_data_len; event_data.una_wake_pkt_control_flags = pkt_control_flags; event_data.una_wake_pkt_proto = proto; - last_wake_pkt_event.npi_wp_code = KEV_POWER_UNATTRIBUTED_WAKE; - memcpy(&last_wake_pkt_event.npi_ev_wake_pkt_unattributed, &event_data, sizeof(last_wake_pkt_event.npi_ev_wake_pkt_unattributed)); + if_notify_unattributed_wake_common(ifp, npi, &event_data); +} - if (last_wake_phy_if_delay_wake_pkt) { -#if (DEBUG || DEVELOPMENT) - if (if_ports_used_verbose > 0) { - net_port_info_log_una_wake_event("delay unattributed wake packet event", &event_data); - } -#endif /* (DEBUG || DEVELOPMENT) */ - return; +__attribute__((noinline)) +static void +log_wake_pkt(struct ifnet *ifp, struct __kern_packet *pkt) +{ + uint32_t len; + + if (pkt->pkt_pflags & PKT_F_MBUF_DATA) { + len = m_pktlen(pkt->pkt_mbuf); + } else { + len = __packet_get_real_data_length(pkt); } - deliver_unattributed_wake_packet_event(&event_data); + os_log(wake_packet_log_handle, "wake packet from %s len %d", + ifp->if_xname, len); } void @@ -1873,27 +2198,44 @@ if_ports_used_match_pkt(struct ifnet *ifp, struct __kern_packet *pkt) uint16_t pkt_control_flags = 0; uint16_t pkt_proto = 0; + if (ifp == NULL) { + os_log(wake_packet_log_handle, "if_ports_used_match_pkt: receive interface is NULL"); + if_ports_used_stats.ifpu_unattributed_null_recvif += 1; + return; + } + if ((pkt->pkt_pflags & PKT_F_WAKE_PKT) == 0) { if_ports_used_stats.ifpu_match_wake_pkt_no_flag += 1; - os_log_error(OS_LOG_DEFAULT, "%s: called PKT_F_WAKE_PKT not set from %s", + os_log_error(wake_packet_log_handle, "%s: called PKT_F_WAKE_PKT not set from %s", __func__, IF_XNAME(ifp)); return; } + + if (__improbable(net_wake_pkt_debug > 0)) { + log_wake_pkt(ifp, pkt); + } + + /* + * Only accept one wake from a physical interface per wake cycle + */ + if (if_set_wake_physical_interface(ifp) == EJUSTRETURN) { + pkt->pkt_pflags &= ~PKT_F_WAKE_PKT; + return; + } + if_ports_used_stats.ifpu_ch_match_wake_pkt += 1; npi.npi_flags |= NPIF_CHANNEL; /* For logging */ pkt_total_len = pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen; pkt_data_len = pkt->pkt_flow_ulen; - if (ifp != NULL) { - npi.npi_if_index = ifp->if_index; - if (IFNET_IS_COMPANION_LINK(ifp)) { - npi.npi_flags |= NPIF_COMPLINK; - } - if_set_wake_physical_interface(ifp); + npi.npi_if_index = ifp->if_index; + if (IFNET_IS_COMPANION_LINK(ifp)) { + npi.npi_flags |= NPIF_COMPLINK; } + switch (pkt->pkt_flow_ip_ver) { case IPVERSION: if_ports_used_stats.ifpu_ipv4_wake_pkt += 1; @@ -1914,7 +2256,7 @@ if_ports_used_match_pkt(struct ifnet *ifp, struct __kern_packet *pkt) default: 
if_ports_used_stats.ifpu_bad_family_wake_pkt += 1; - os_log(OS_LOG_DEFAULT, "%s: unexpected protocol family %u from %s", + os_log(wake_packet_log_handle, "%s: unexpected protocol family %u from %s", __func__, pkt->pkt_flow_ip_ver, IF_XNAME(ifp)); goto failed; } @@ -1924,7 +2266,7 @@ if_ports_used_match_pkt(struct ifnet *ifp, struct __kern_packet *pkt) * Check if this is a fragment that is not the first fragment */ if (pkt->pkt_flow_ip_is_frag && !pkt->pkt_flow_ip_is_first_frag) { - os_log(OS_LOG_DEFAULT, "%s: unexpected wake fragment from %s", + os_log(wake_packet_log_handle, "%s: unexpected wake fragment from %s", __func__, IF_XNAME(ifp)); npi.npi_flags |= NPIF_FRAG; if_ports_used_stats.ifpu_frag_wake_pkt += 1; @@ -1944,7 +2286,7 @@ if_ports_used_match_pkt(struct ifnet *ifp, struct __kern_packet *pkt) } struct tcphdr * __single tcp = __unsafe_forge_single(struct tcphdr *, pkt->pkt_flow_tcp_hdr); if (tcp == NULL) { - os_log(OS_LOG_DEFAULT, "%s: pkt with unassigned TCP header from %s", + os_log(wake_packet_log_handle, "%s: pkt with unassigned TCP header from %s", __func__, IF_XNAME(ifp)); if_ports_used_stats.ifpu_incomplete_tcp_hdr_pkt += 1; goto failed; @@ -1967,7 +2309,7 @@ if_ports_used_match_pkt(struct ifnet *ifp, struct __kern_packet *pkt) } struct udphdr * __single uh = __unsafe_forge_single(struct udphdr *, pkt->pkt_flow_udp_hdr); if (uh == NULL) { - os_log(OS_LOG_DEFAULT, "%s: pkt with unassigned UDP header from %s", + os_log(wake_packet_log_handle, "%s: pkt with unassigned UDP header from %s", __func__, IF_XNAME(ifp)); if_ports_used_stats.ifpu_incomplete_udp_hdr_pkt += 1; goto failed; @@ -2003,16 +2345,30 @@ if_ports_used_match_pkt(struct ifnet *ifp, struct __kern_packet *pkt) default: if_ports_used_stats.ifpu_bad_proto_wake_pkt += 1; - os_log(OS_LOG_DEFAULT, "%s: unexpected IP protocol %u from %s", + os_log(wake_packet_log_handle, "%s: unexpected IP protocol %u from %s", __func__, pkt->pkt_flow_ip_proto, IF_XNAME(ifp)); goto failed; } - if (ifp == NULL) { - goto failed; + found = net_port_info_find_match(&npi); + +failed: + if (__improbable(if_is_lpw_enabled(ifp))) { + npi.npi_flags |= NPIF_LPW; + + if (found && (npi.npi_flags & NPIF_CONNECTION_IDLE)) { + os_log(wake_packet_log_handle, "if_ports_used_match_pkt: idle connection in LPW on %s", + IF_XNAME(ifp)); + + if_ports_used_stats.ifpu_lpw_connection_idle_wake++; + } else { + os_log(wake_packet_log_handle, "if_ports_used_match_pkt: not idle connection in LPW on %s", + IF_XNAME(ifp)); + + if_ports_used_stats.ifpu_lpw_not_idle_wake++; + } } - found = net_port_info_find_match(&npi); if (found) { if_notify_wake_packet(ifp, &npi, pkt_total_len, pkt_data_len, pkt_control_flags); @@ -2020,10 +2376,6 @@ if_ports_used_match_pkt(struct ifnet *ifp, struct __kern_packet *pkt) if_notify_unattributed_wake_pkt(ifp, pkt, &npi, pkt_total_len, pkt_data_len, pkt_control_flags, pkt_proto); } - return; -failed: - if_notify_unattributed_wake_pkt(ifp, pkt, &npi, - pkt_total_len, pkt_data_len, pkt_control_flags, pkt_proto); } #endif /* SKYWALK */ @@ -2097,22 +2449,62 @@ sysctl_wake_pkt_event_notify SYSCTL_HANDLER_ARGS return EPERM; } - os_log(OS_LOG_DEFAULT, "sysctl_wake_pkt_event_notify proc %s:%u val %u last_wake_phy_if_delay_wake_pkt %d last_wake_phy_if_family %u", + os_log(wake_packet_log_handle, "sysctl_wake_pkt_event_notify proc %s:%u val %u last_wake_phy_if_delay_wake_pkt %d last_wake_phy_if_family %u delay_wake_pkt_event %d", proc_best_name(current_proc()), proc_selfpid(), - if_family, last_wake_phy_if_delay_wake_pkt, last_wake_phy_if_family); 
- - if (last_wake_phy_if_delay_wake_pkt && val == last_wake_phy_if_family) { - last_wake_phy_if_delay_wake_pkt = false; - if (last_wake_pkt_event.npi_wp_code == KEV_POWER_WAKE_PACKET) { - deliver_attributed_wake_packet_event(&last_wake_pkt_event.npi_ev_wake_pkt_attributed); - } else { - deliver_unattributed_wake_packet_event(&last_wake_pkt_event.npi_ev_wake_pkt_unattributed); + if_family, last_wake_phy_if_delay_wake_pkt, last_wake_phy_if_family, + delay_wake_pkt_event.npi_wp_code); +#if (DEBUG || DEVELOPMENT) + if (if_ports_used_verbose > 0) { + if (delay_wake_pkt_event.npi_wp_code == KEV_POWER_WAKE_PACKET) { + net_port_info_log_wake_event("sysctl_wake_pkt_event_notify", &delay_wake_pkt_event.npi_ev_wake_pkt_attributed); + } else if (delay_wake_pkt_event.npi_wp_code == KEV_POWER_UNATTRIBUTED_WAKE) { + net_port_info_log_una_wake_event("sysctl_wake_pkt_event_notify", &delay_wake_pkt_event.npi_ev_wake_pkt_unattributed); } } +#endif /* (DEBUG || DEVELOPMENT) */ + + lck_mtx_lock(&net_port_entry_head_lock); + + if (last_wake_phy_if_delay_wake_pkt == true && val == last_wake_phy_if_family) { + last_wake_phy_if_delay_wake_pkt = false; + + if (delay_wake_pkt_event.npi_wp_code == KEV_POWER_WAKE_PACKET) { + if (is_attributed_wake_already_notified(NULL) == false) { + deliver_attributed_wake_packet_event(&delay_wake_pkt_event.npi_ev_wake_pkt_attributed); + } else { + os_log(wake_packet_log_handle, "sysctl_wake_pkt_event_notify attributed_wake_already_notified"); + } + } else if (delay_wake_pkt_event.npi_wp_code == KEV_POWER_UNATTRIBUTED_WAKE) { + if (is_unattributed_wake_already_notified(NULL)) { + deliver_unattributed_wake_packet_event(&delay_wake_pkt_event.npi_ev_wake_pkt_unattributed); + } else { + os_log(wake_packet_log_handle, "sysctl_wake_pkt_event_notify unattributed_wake_already_notified"); + } + } else { + if_ports_used_stats.ifpu_wake_pkt_event_notify_in_vain += 1; + os_log(wake_packet_log_handle, "sysctl_wake_pkt_event_notify bad npi_wp_code"); + } + } else { + if_ports_used_stats.ifpu_wake_pkt_event_notify_in_vain += 1; + os_log(wake_packet_log_handle, "sysctl_wake_pkt_event_notify in vain"); + } + lck_mtx_unlock(&net_port_entry_head_lock); return 0; } +static void +if_set_delay_wake_flags(ifnet_t ifp, bool delay) +{ + if (delay) { + if_set_xflags(ifp, IFXF_DELAYWAKEPKTEVENT); + if_clear_xflags(ifp, IFXF_INBAND_WAKE_PKT_TAGGING); + } else { + if_clear_xflags(ifp, IFXF_DELAYWAKEPKTEVENT); + if_set_xflags(ifp, IFXF_INBAND_WAKE_PKT_TAGGING); + } +} + int sysctl_wake_pkt_event_delay_if_families SYSCTL_HANDLER_ARGS { @@ -2120,27 +2512,266 @@ sysctl_wake_pkt_event_delay_if_families SYSCTL_HANDLER_ARGS long long val = npi_wake_packet_event_delay_if_families; int error; int changed = 0; - uint32_t if_families = 0; + uint32_t old_value = npi_wake_packet_event_delay_if_families; error = sysctl_io_number(req, val, sizeof(val), &val, &changed); if (error != 0 || req->newptr == 0 || changed == 0) { return error; } - if (val < 0 || val > UINT32_MAX) { - return EINVAL; - } - if_families = (uint32_t)val; - if (!IOCurrentTaskHasEntitlement(WAKE_PKT_EVENT_CONTROL_ENTITLEMENT)) { return EPERM; } - - os_log(OS_LOG_DEFAULT, "sysctl_wake_pkt_event_delay_if_families proc %s:%u npi_wake_packet_event_delay_if_families 0x%x -> 0x%x", - proc_best_name(current_proc()), proc_selfpid(), - npi_wake_packet_event_delay_if_families, if_families); + if (val < 0 || val > UINT32_MAX) { + return EINVAL; + } /* The value is the bitmap of the functional types to delay */ - npi_wake_packet_event_delay_if_families = if_families; + 
old_value = npi_wake_packet_event_delay_if_families; + npi_wake_packet_event_delay_if_families = (uint32_t)val; + + /* Need to reevaluate the capability of doing in-band wake packet tagging */ + if (npi_wake_packet_event_delay_if_families != 0) { + uint32_t count, i; + ifnet_t *__counted_by(count) ifp_list; + + error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count); + if (error != 0) { + os_log_error(wake_packet_log_handle, + "%s: ifnet_list_get_all() failed %d", + __func__, error); + npi_wake_packet_event_delay_if_families = old_value; + return error; + } + for (i = 0; i < count; i++) { + ifnet_t ifp = ifp_list[i]; + bool delay = is_wake_pkt_event_delay(ifp->if_family); + const uint32_t flags = IFXF_INBAND_WAKE_PKT_TAGGING | IFXF_DELAYWAKEPKTEVENT; + + if ((delay && (ifp->if_xflags & flags) != IFXF_DELAYWAKEPKTEVENT) || + (!delay && (ifp->if_xflags & flags) != IFXF_INBAND_WAKE_PKT_TAGGING)) { + if_set_delay_wake_flags(ifp, delay); + + if (if_ports_used_verbose || ifp->if_family == IFNET_FAMILY_CELLULAR) { + os_log(wake_packet_log_handle, "interface %s reset INBAND_WAKE_PKT_TAGGING %d DELAYWAKEPKTEVENT %d", + ifp->if_xname, + ifp->if_xflags & IFXF_INBAND_WAKE_PKT_TAGGING ? 1 : 0, + ifp->if_xflags & IFXF_DELAYWAKEPKTEVENT ? 1 : 0); + } + } + } + ifnet_list_free_counted_by(ifp_list, count); + } + + os_log(wake_packet_log_handle, "sysctl_wake_pkt_event_delay_if_families proc %s:%u npi_wake_packet_event_delay_if_families 0x%x -> 0x%x", + proc_best_name(current_proc()), proc_selfpid(), + old_value, npi_wake_packet_event_delay_if_families); + return 0; } + +void +init_inband_wake_pkt_tagging_for_family(struct ifnet *ifp) +{ + bool delay = is_wake_pkt_event_delay(ifp->if_family); + + if_set_delay_wake_flags(ifp, delay); + + if (if_ports_used_verbose || ifp->if_family == IFNET_FAMILY_CELLULAR) { + os_log(wake_packet_log_handle, "interface %s initialized INBAND_WAKE_PKT_TAGGING %d DELAYWAKEPKTEVENT %d", + ifp->if_xname, + ifp->if_xflags & IFXF_INBAND_WAKE_PKT_TAGGING ? 1 : 0, + ifp->if_xflags & IFXF_DELAYWAKEPKTEVENT ?
1 : 0); + } +} + +#if (DEBUG | DEVELOPMENT) + +static int +sysctl_use_fake_lpw SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0; + int old_value = use_fake_lpw; + int new_value = *(int *)oidp->oid_arg1; + + error = sysctl_handle_int(oidp, &new_value, 0, req); + if (error == 0) { + *(int *)oidp->oid_arg1 = new_value; + + if (new_value != old_value) { + os_log(wake_packet_log_handle, "use_fake_lpw %d", new_value); + } + } + return error; +} + +static int +sysctl_mark_wake_packet_port SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0; + int new_value = *(int *)oidp->oid_arg1; + + error = sysctl_handle_int(oidp, &new_value, 0, req); + if (error == 0) { + if (new_value < 0 || new_value >= UINT16_MAX) { + error = EINVAL; + goto done; + } + *(int *)oidp->oid_arg1 = new_value; + } +done: + return error; +} + +static int +sysctl_mark_wake_packet_if SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0; + char new_value[IFNAMSIZ] = { 0 }; + int changed = 0; + + strbufcpy(new_value, IFNAMSIZ, mark_wake_packet_if, IFNAMSIZ); + error = sysctl_io_string(req, new_value, IFNAMSIZ, 0, &changed); + if (error == 0) { + strbufcpy(mark_wake_packet_if, IFNAMSIZ, new_value, IFNAMSIZ); + } + + return error; +} + +bool +check_wake_mbuf(ifnet_t ifp, protocol_family_t protocol_family, mbuf_ref_t m) +{ + uint8_t ipproto = 0; + size_t offset = 0; + + /* The protocol and interface must both be specified */ + if (mark_wake_packet_ipproto == 0 || mark_wake_packet_if[0] == 0) { + return false; + } + /* The interface must match */ + if (strlcmp(mark_wake_packet_if, IF_XNAME(ifp), IFNAMSIZ) != 0) { + return false; + } + /* The protocol must match */ + if (protocol_family == PF_INET6) { + struct ip6_hdr ip6; + + if ((size_t)(m)->m_pkthdr.len < sizeof(struct ip6_hdr)) { + os_log(wake_packet_log_handle, "check_wake_mbuf: IP6 too short"); + return false; + } + mbuf_copydata(m, 0, sizeof(struct ip6_hdr), &ip6); + + if ((ipproto = ip6.ip6_nxt) != mark_wake_packet_ipproto) { + return false; + } + offset = sizeof(struct ip6_hdr); + } else if (protocol_family == PF_INET) { + struct ip ip; + + if ((size_t)(m)->m_pkthdr.len < sizeof(struct ip)) { + os_log(wake_packet_log_handle, "check_wake_mbuf: IP too short"); + return false; + } + mbuf_copydata(m, 0, sizeof(struct ip), &ip); + + if ((ipproto = ip.ip_p) != mark_wake_packet_ipproto) { + return false; + } + offset = sizeof(struct ip); + } + + /* Check the ports for TCP and UDP */ + if (ipproto == IPPROTO_TCP) { + struct tcphdr th; + + if ((size_t)(m)->m_pkthdr.len < offset + sizeof(struct tcphdr)) { + os_log(wake_packet_log_handle, "check_wake_mbuf: TCP too short"); + return false; + } + mbuf_copydata(m, offset, sizeof(struct tcphdr), &th); + + if (mark_wake_packet_local_port != 0 && + ntohs(th.th_dport) != mark_wake_packet_local_port) { + return false; + } + if (mark_wake_packet_remote_port != 0 && + ntohs(th.th_sport) != mark_wake_packet_remote_port) { + return false; + } + return true; + } else if (ipproto == IPPROTO_UDP) { + struct udphdr uh; + + if ((size_t)(m)->m_pkthdr.len < offset + sizeof(struct udphdr)) { + os_log(wake_packet_log_handle, "check_wake_mbufL UDP too short"); + return false; + } + mbuf_copydata(m, offset, sizeof(struct udphdr), &uh); + + if (mark_wake_packet_local_port != 0 && + ntohs(uh.uh_dport) != mark_wake_packet_local_port) { + return false; + } + if (mark_wake_packet_remote_port != 0 && + ntohs(uh.uh_sport) != mark_wake_packet_remote_port) { + return false; + } + return true; + } + + return ipproto == 
mark_wake_packet_ipproto; +} + +bool +check_wake_pkt(ifnet_t ifp __unused, struct __kern_packet *pkt) +{ + /* The protocol and interface must both be specified */ + if (mark_wake_packet_ipproto == 0 || mark_wake_packet_if[0] == 0) { + return false; + } + /* The interface must match */ + if (strlcmp(mark_wake_packet_if, IF_XNAME(ifp), IFNAMSIZ) != 0) { + return false; + } + /* Cannot deal with fragments */ + if (pkt->pkt_flow_ip_is_frag && !pkt->pkt_flow_ip_is_first_frag) { + return false; + } + /* Check the ports for TCP and UDP */ + if (pkt->pkt_flow_ip_proto == IPPROTO_TCP) { + struct tcphdr * __single th = __unsafe_forge_single(struct tcphdr *, pkt->pkt_flow_tcp_hdr); + if (th == NULL) { + return false; + } + if (mark_wake_packet_local_port != 0 && + ntohs(th->th_dport) != mark_wake_packet_local_port) { + return false; + } + if (mark_wake_packet_remote_port != 0 && + ntohs(th->th_sport) != mark_wake_packet_remote_port) { + return false; + } + return true; + } else if (pkt->pkt_flow_ip_proto == IPPROTO_UDP) { + struct udphdr * __single uh = __unsafe_forge_single(struct udphdr *, pkt->pkt_flow_udp_hdr); + if (uh == NULL) { + return false; + } + if (mark_wake_packet_local_port != 0 && + ntohs(uh->uh_dport) != mark_wake_packet_local_port) { + return false; + } + if (mark_wake_packet_remote_port != 0 && + ntohs(uh->uh_sport) != mark_wake_packet_remote_port) { + return false; + } + } + return pkt->pkt_flow_ip_proto == mark_wake_packet_ipproto; +} + +#endif /* (DEBUG | DEVELOPMENT) */ diff --git a/bsd/net/if_ports_used.h b/bsd/net/if_ports_used.h index 3fbda1bc8..0488c8b99 100644 --- a/bsd/net/if_ports_used.h +++ b/bsd/net/if_ports_used.h @@ -32,6 +32,7 @@ #ifdef PRIVATE #include +#include #include #include #include @@ -71,19 +72,40 @@ union in_addr_4_6 { struct in6_addr _in_a_6; }; -#define NPIF_IPV4 0x0001 -#define NPIF_IPV6 0x0002 -#define NPIF_TCP 0x0004 -#define NPIF_UDP 0x0008 -#define NPIF_DELEGATED 0x0010 -#define NPIF_SOCKET 0x0020 -#define NPIF_CHANNEL 0x0040 -#define NPIF_LISTEN 0x0080 -#define NPIF_WAKEPKT 0x0100 -#define NPIF_NOWAKE 0x0200 /* flow marked with SO_NOWAKEFROMSLEEP are normally excluded */ -#define NPIF_FRAG 0x0400 /* packet is pure fragment (i.e. 
no src and dst port) */ -#define NPIF_ESP 0x0800 /* for logging only */ -#define NPIF_COMPLINK 0x1000 /* interface is companion link */ +#define NPIF_IPV4 0x0001 +#define NPIF_IPV6 0x0002 +#define NPIF_TCP 0x0004 +#define NPIF_UDP 0x0008 +#define NPIF_DELEGATED 0x0010 +#define NPIF_SOCKET 0x0020 +#define NPIF_CHANNEL 0x0040 +#define NPIF_LISTEN 0x0080 +#define NPIF_WAKEPKT 0x0100 +#define NPIF_NOWAKE 0x0200 +#define NPIF_FRAG 0x0400 +#define NPIF_ESP 0x0800 +#define NPIF_COMPLINK 0x1000 +#define NPIF_CONNECTION_IDLE 0x2000 +#define NPIF_LPW 0x4000 +#define NPIF_DELAYWAKEPKTEVENT 0x8000 + +#define NPI_FLAGS_TABLE(x) \ + X(NPIF_IPV4, "4", "IPv4 flow") \ + X(NPIF_IPV6, "6", "IPv6 flow") \ + X(NPIF_TCP, "T", "TCP flow") \ + X(NPIF_UDP, "U", "UDP flow") \ + X(NPIF_DELEGATED, "D", "process delegated") \ + X(NPIF_SOCKET, "S", "socket flow") \ + X(NPIF_CHANNEL, "C", "channel") \ + X(NPIF_LISTEN, "L", "listening flow") \ + X(NPIF_WAKEPKT, "W", "wake packet") \ + X(NPIF_NOWAKE, "N", "flow marked with NOWAKEFROMSLEEP") \ + X(NPIF_FRAG, "F", "packet is pure fragment") \ + X(NPIF_ESP, "E", "ESP packet") \ + X(NPIF_COMPLINK, "c", "interface is companion link") \ + X(NPIF_CONNECTION_IDLE, "i", "flow connection is idle") \ + X(NPIF_LPW, "l", "packet received in low power wake") \ + X(NPIF_DELAYWAKEPKTEVENT, "d", "delayed wake packet attribution") #define NPI_HAS_EFFECTIVE_UUID 1 @@ -178,7 +200,7 @@ struct npi_if_info { #define NPICF_NOWAKE 0x1000 /* - * struct net_port_info_una_wake_event is the event data for KEV_POWER_WAKE_PACKE + * struct net_port_info_wake_event is the event data for KEV_POWER_WAKE_PACKET * * See for definiton of values of these kinds of fields: * - xxx_if_family IFRTYPE_FAMILY_YYY @@ -250,7 +272,7 @@ struct net_port_info_una_wake_event { struct npi_if_info una_wake_pkt_phy_if_info; /* outer-most interface of wake packet */ }; -#define IFPU_HAS_MATCH_WAKE_PKT_NO_FLAG 1 /* ifpu_match_wake_pkt_no_flag is defined */ +#define IFPU_HAS_DELAY_WAKE_EVENT_FIELDS 1 /* ifpu_delay_phy_wake_pkt and co are defined */ #define IF_PORTS_USED_STATS_LIST \ X(uint64_t, ifpu_wakeuid_gen, "wakeuuid generation%s", "", "s") \ @@ -284,10 +306,19 @@ struct net_port_info_una_wake_event { X(uint64_t, ifpu_incomplete_udp_hdr_pkt, "packet%s with incomplete UDP header", "", "s") \ X(uint64_t, ifpu_npi_not_added_no_wakeuuid, "port entr%s not added with wakeuuid not set", "y", "ies") \ X(uint64_t, ifpu_deferred_isakmp_natt_wake_pkt, "deferred matching of ISAKMP NAT traversal wake packet%s", "", "s") \ - X(uint64_t, ifpu_spurious_wake_event, "spurious wake packet event%s", "", "s") \ + X(uint64_t, ifpu_spurious_wake_event, "spurious no wake from sleep packet event%s", "", "s") \ X(uint64_t, ifpu_delayed_attributed_wake_event, "delayed attributed wake packet event%s", "", "s") \ X(uint64_t, ifpu_delayed_unattributed_wake_event, "delayed unattributed wake packet event%s", "", "s") \ - X(uint64_t, ifpu_delayed_wake_event_undelivered, "undelivered delayed wake packet event%s", "", "s") + X(uint64_t, ifpu_delayed_wake_event_undelivered, "undelivered delayed wake packet event%s", "", "s") \ + X(uint64_t, ifpu_connection_idle_wake, "connection idle wake%s", "", "s") \ + X(uint64_t, ifpu_lpw_connection_idle_wake, "LPW connection idle wake%s", "", "s") \ + X(uint64_t, ifpu_lpw_not_idle_wake, "LPW not idle connection wake%s", "", "s") \ + X(uint64_t, ifpu_lpw_to_full_wake, "LPW to full wake transition%s", "", "s") \ + X(uint64_t, ifpu_ignored_phy_wake_pkt, "ignored wake packet%s in same wake cycle", "", "s") \ + X(uint64_t, 
ifpu_delay_phy_wake_pkt, "delayed wake packet%s", "", "s") \ + X(uint64_t, ifpu_ignored_delayed_attributed_events, "ignored delayed attributed event%s", "", "s") \ + X(uint64_t, ifpu_ignored_delayed_unattributed_events, "ignored delayed unattributed event%s", "", "s") \ + X(uint64_t, ifpu_wake_pkt_event_notify_in_vain, "wake pkt event notifications%s in vain", "", "s") struct if_ports_used_stats { #define X(_type, _field, ...) _type _field; @@ -297,6 +328,10 @@ struct if_ports_used_stats { #ifdef XNU_KERNEL_PRIVATE +#include + +extern os_log_t wake_packet_log_handle; + extern int if_ports_used_verbose; void if_ports_used_init(void); @@ -317,7 +352,18 @@ void if_ports_used_match_pkt(struct ifnet *ifp, struct __kern_packet *pkt); void if_ports_used_match_mbuf(struct ifnet *ifp, protocol_family_t proto_family, struct mbuf *m); +void init_inband_wake_pkt_tagging_for_family(struct ifnet *ifp); + +#if (DEBUG | DEVELOPMENT) +bool check_wake_mbuf(ifnet_t ifp, protocol_family_t protocol_family, mbuf_ref_t m); +bool check_wake_pkt(ifnet_t ifp, struct __kern_packet *pkt); +#endif /* (DEBUG | DEVELOPMENT) */ + +bool if_is_lpw_enabled(struct ifnet *); +void if_exit_lpw(struct ifnet *ifp, const char *lpw_exit_reason); + #endif /* XNU_KERNEL_PRIVATE */ #endif /* PRIVATE */ + #endif /* _NET_IF_PORT_USED_H_ */ diff --git a/bsd/net/if_private.h b/bsd/net/if_private.h index 12395e361..52dd4f9d5 100644 --- a/bsd/net/if_private.h +++ b/bsd/net/if_private.h @@ -128,8 +128,8 @@ struct if_clonereq32 { #define IFEF_NOACKPRI 0x00200000 /* No TCP ACK prioritization */ #define IFEF_AWDL_RESTRICTED 0x00400000 /* Restricted AWDL mode */ #define IFEF_2KCL 0x00800000 /* prefers 2K cluster (socket based tunnel) */ -#define IFEF_ECN_ENABLE 0x01000000 /* use ECN for TCP connections on the interface */ -#define IFEF_ECN_DISABLE 0x02000000 /* do not use ECN for TCP connections on the interface */ +#define IFEF_UNUSED1 0x01000000 +#define IFEF_UNUSED2 0x02000000 #define IFEF_SKYWALK_NATIVE 0x04000000 /* Native Skywalk support */ #define IFEF_3CA 0x08000000 /* Capable of 3CA */ #define IFEF_SENDLIST 0x10000000 /* Supports tx packet lists */ @@ -161,9 +161,14 @@ struct if_clonereq32 { #define IFXF_DELAYWAKEPKTEVENT 0x00020000 /* Delay notification of wake packet events */ #define IFXF_DISABLE_INPUT 0x00040000 /* Drop receive traffic */ #define IFXF_CONGESTED_LINK 0x00080000 /* Link is congested */ +#define IFXF_IS_COMPANIONLINK 0x00100000 /* Is companion link */ +#define IFXF_RX_FLOW_STEERING 0x00200000 /* Rx flow steering */ #define IFXF_LINK_HEURISTICS 0x00800000 /* Link heuristics enabled */ #define IFXF_LINK_HEUR_OFF_PENDING 0x01000000 /* Link heurisitics delay disable */ #define IFXF_POINTOPOINT_MDNS 0x02000000 /* Point-to-point interface supports mDNS */ +#define IFXF_INBAND_WAKE_PKT_TAGGING 0x04000000 /* Inband tagging of packet wake flag */ +#define IFXF_LOW_POWER_WAKE 0x08000000 /* Low Power Wake */ +#define IFXF_REQUIRE_CELL_THREAD_GROUP 0x10000000 /* Require cellular thread group */ /* * Current requirements for an AWDL interface. 
Setting/clearing IFEF_AWDL @@ -307,6 +312,10 @@ struct ifreq { #define IFRTYPE_ECN_DEFAULT 0 #define IFRTYPE_ECN_ENABLE 1 #define IFRTYPE_ECN_DISABLE 2 + uint32_t ifru_l4s_mode; +#define IFRTYPE_L4S_DEFAULT 0 +#define IFRTYPE_L4S_ENABLE 1 +#define IFRTYPE_L4S_DISABLE 2 u_int32_t ifru_qosmarking_mode; #define IFRTYPE_QOSMARKING_MODE_NONE 0 #define IFRTYPE_QOSMARKING_FASTLANE 1 /* supported: socket/channel */ @@ -335,6 +344,7 @@ struct ifreq { u_int8_t ifru_is_directlink; u_int8_t ifru_is_vpn; uint32_t ifru_delay_wake_pkt_event; + u_int8_t ifru_is_companionlink; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ @@ -377,6 +387,7 @@ struct ifreq { #define ifr_interface_state ifr_ifru.ifru_interface_state #define ifr_probe_connectivity ifr_ifru.ifru_probe_connectivity #define ifr_ecn_mode ifr_ifru.ifru_ecn_mode +#define ifr_l4s_mode ifr_ifru.ifru_l4s_mode #define ifr_qosmarking_mode ifr_ifru.ifru_qosmarking_mode #define ifr_fastlane_capable ifr_qosmarking_mode #define ifr_qosmarking_enabled ifr_ifru.ifru_qosmarking_enabled @@ -394,6 +405,7 @@ struct ifreq { #define ifr_is_directlink ifr_ifru.ifru_is_directlink #define ifr_is_vpn ifr_ifru.ifru_is_vpn #define ifr_delay_wake_pkt_event ifr_ifru.ifru_delay_wake_pkt_event +#define ifr_is_companionlink ifr_ifru.ifru_is_companionlink }; #define _SIZEOF_ADDR_IFREQ(ifr) \ @@ -473,6 +485,35 @@ enum { }; #define IFNET_LQM_THRESH_BAD IFNET_LQM_THRESH_ABORT +#ifdef XNU_KERNEL_PRIVATE +/* + * Determine the LQM threshold corresponding to the input. + */ +static inline int8_t +ifnet_lqm_normalize(int32_t raw_lqm) +{ + if (raw_lqm == IFNET_LQM_THRESH_OFF) { + return IFNET_LQM_THRESH_OFF; + } + if (raw_lqm <= 0) { + return IFNET_LQM_THRESH_UNKNOWN; + } + if (raw_lqm <= IFNET_LQM_THRESH_ABORT) { + return IFNET_LQM_THRESH_ABORT; + } + if (raw_lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) { + return IFNET_LQM_THRESH_MINIMALLY_VIABLE; + } + if (raw_lqm <= IFNET_LQM_THRESH_POOR) { + return IFNET_LQM_THRESH_POOR; + } + if (raw_lqm <= IFNET_LQM_THRESH_GOOD) { + return IFNET_LQM_THRESH_GOOD; + } + return IFNET_LQM_THRESH_UNKNOWN; +} +#endif /* XNU_KERNEL_PRIVATE */ + #ifdef XNU_KERNEL_PRIVATE #define IFNET_LQM_MIN IFNET_LQM_THRESH_OFF #define IFNET_LQM_MAX IFNET_LQM_THRESH_GOOD @@ -510,16 +551,44 @@ struct if_descreq { * scheduling strategy (e.g. 802.11 WMM), and that the networking * stack is only responsible for creating multiple queues for the * corresponding service classes. + * IFNET_SCHED_MODEL_FQ_CODEL Use legacy FQ_CoDel as the output packet + * scheduling model. This also schedules traffic classes. + * This legacy FQ-CoDel implementation employs flow control + * when queuing delay is above the configured threshold. + * IFNET_SCHED_MODEL_FQ_CODEL_DM Legacy FQ_CoDel but for driver/media that + * requires strict scheduling strategy. The driver is responsible + * for selecting the appropriate SVC at dequeue time. + * IFNET_SCHED_MODEL_FQ_CODEL_NEW RFC compliant FQ_CoDel implementation. + * This implementation does not rely on flow control but rather packet + * drops and ECN markings to bring down queuing delay. + * IFNET_SCHED_MODEL_FQ_CODEL_NEW_DM Same as IFNET_SCHED_MODEL_FQ_CODEL_NEW + * but for driver/media that requires strict scheduling strategy.
*/ + + #define IFNET_SCHED_MODEL_LIST \ + X(IFNET_SCHED_MODEL_NORMAL, 0x00000000, normal) \ + X(IFNET_SCHED_MODEL_DRIVER_MANAGED, 0x00000001, driver managed) \ + X(IFNET_SCHED_MODEL_FQ_CODEL, 0x00000002, fq_codel) \ + X(IFNET_SCHED_MODEL_FQ_CODEL_DM, 0x00000004, fq_codel DM) \ + X(IFNET_SCHED_MODEL_FQ_CODEL_NEW, 0x00000008, fq_codel_new) \ + X(IFNET_SCHED_MODEL_FQ_CODEL_NEW_DM, 0x00000010, fq_codel_new DM) enum { - IFNET_SCHED_MODEL_NORMAL = 0, - IFNET_SCHED_MODEL_DRIVER_MANAGED = 1, - IFNET_SCHED_MODEL_FQ_CODEL = 2, -#ifdef XNU_KERNEL_PRIVATE - IFNET_SCHED_MODEL_MAX = 3, -#endif /* XNU_KERNEL_PRIVATE */ +#define X(name, value, ...) name = value, + IFNET_SCHED_MODEL_LIST +#undef X }; +#define IFNET_SCHED_DRIVER_MANGED_MODELS \ + (IFNET_SCHED_MODEL_DRIVER_MANAGED | IFNET_SCHED_MODEL_FQ_CODEL_DM | IFNET_SCHED_MODEL_FQ_CODEL_NEW_DM) + +#define IFNET_SCHED_VALID_MODELS \ + (IFNET_SCHED_MODEL_NORMAL | IFNET_SCHED_MODEL_DRIVER_MANAGED | \ + IFNET_SCHED_MODEL_FQ_CODEL | IFNET_SCHED_MODEL_FQ_CODEL_DM | \ + IFNET_SCHED_MODEL_FQ_CODEL_NEW | IFNET_SCHED_MODEL_FQ_CODEL_NEW_DM) + +#define IFNET_MODEL_IS_VALID(_model) \ + (((_model == IFNET_SCHED_MODEL_NORMAL) || ((_model) & IFNET_SCHED_VALID_MODELS)) && ((_model) & (_model - 1)) == 0) + /* * Values for iflpr_flags */ diff --git a/bsd/net/if_redirect.c b/bsd/net/if_redirect.c index 2ab5edf7d..8cd35126d 100644 --- a/bsd/net/if_redirect.c +++ b/bsd/net/if_redirect.c @@ -292,7 +292,7 @@ redirect_enqueue_pkt(struct nx_netif *nif, struct __kern_packet *pkt, int err; if (NX_LLINK_PROV(nif->nif_nx) && - ifp->if_traffic_rule_count > 0 && + ifp->if_inet_traffic_rule_count > 0 && nxctl_inet_traffic_rule_find_qset_id_with_pkt(ifp->if_xname, pkt, &qset_id) == 0) { struct netif_qset * __single qset; @@ -304,11 +304,11 @@ redirect_enqueue_pkt(struct nx_netif *nif, struct __kern_packet *pkt, qset = nx_netif_find_qset(nif, qset_id); ASSERT(qset != NULL); pkt->pkt_qset_idx = qset->nqs_idx; - err = ifnet_enqueue_ifcq_pkt(ifp, qset->nqs_ifcq, pkt, flush, drop); + err = ifnet_enqueue_pkt(ifp, qset->nqs_ifcq, pkt, flush, drop); nx_netif_qset_release(&qset); } else { /* callee consumes packet */ - err = ifnet_enqueue_pkt(ifp, pkt, flush, drop); + err = ifnet_enqueue_pkt(ifp, ifp->if_snd, pkt, flush, drop); } return err; } @@ -474,11 +474,11 @@ redirect_nx_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus, return ENXIO; } if (is_tx_ring) { - _CASSERT(RD_MAX_TX_RINGS == 1); + static_assert(RD_MAX_TX_RINGS == 1); VERIFY(rd->rd_tx_ring[0] == NULL); rd->rd_tx_ring[0] = ring; } else { - _CASSERT(RD_MAX_RX_RINGS == 1); + static_assert(RD_MAX_RX_RINGS == 1); VERIFY(rd->rd_rx_ring[0] == NULL); rd->rd_rx_ring[0] = ring; } @@ -1393,7 +1393,7 @@ redirect_set_delegate(if_redirect_t rd, ifnet_t delegate_ifp) } ASSERT(rd->rd_delegate_ifp == NULL); - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { RDLOG_ERR("failed to get self reference"); DTRACE_SKYWALK2(ifp__detaching, if_redirect_t, rd, ifnet_t, ifp); error = ENXIO; @@ -1413,7 +1413,7 @@ redirect_set_delegate(if_redirect_t rd, ifnet_t delegate_ifp) ASSERT(!rd->rd_delegate_parent_set); rd->rd_delegate_parent_set = TRUE; - if (!ifnet_is_attached(delegate_ifp, 1)) { + if (!ifnet_get_ioref(delegate_ifp)) { RDLOG_ERR("failed to get delegate reference"); DTRACE_SKYWALK2(delegate__detaching, if_redirect_t, rd, ifnet_t, delegate_ifp); @@ -1483,7 +1483,7 @@ redirect_set_delegate(if_redirect_t rd, ifnet_t delegate_ifp) * Check that the delegate is still attached. 
If not, the detach notify above * could've been missed and we would have to cleanup everything here. */ - if (!ifnet_is_attached(delegate_ifp, 0)) { + if (!ifnet_is_fully_attached(delegate_ifp)) { RDLOG_ERR("delegate %s detached during setup", if_name(delegate_ifp)); DTRACE_SKYWALK2(delegate__detached, if_redirect_t, rd, ifnet_t, delegate_ifp); diff --git a/bsd/net/if_utun.c b/bsd/net/if_utun.c index 9bf3977b2..0b329462a 100644 --- a/bsd/net/if_utun.c +++ b/bsd/net/if_utun.c @@ -68,79 +68,6 @@ #define UTUN_NEXUS 0 #endif // SKYWALK && CONFIG_NEXUS_KERNEL_PIPE -#if UTUN_NEXUS -static nexus_controller_t utun_ncd; -static int utun_ncd_refcount; -static uuid_t utun_kpipe_uuid; -static uuid_t utun_nx_dom_prov; - -typedef struct utun_nx { - uuid_t if_provider; - uuid_t if_instance; - uuid_t fsw_provider; - uuid_t fsw_instance; - uuid_t fsw_device; - uuid_t fsw_agent; -} *utun_nx_t; - -#endif // UTUN_NEXUS - -/* Control block allocated for each kernel control connection */ -struct utun_pcb { - TAILQ_ENTRY(utun_pcb) utun_chain; - kern_ctl_ref utun_ctlref; - ifnet_t utun_ifp; - u_int32_t utun_unit; - u_int32_t utun_unique_id; - u_int32_t utun_flags; - int utun_ext_ifdata_stats; - u_int32_t utun_max_pending_packets; - char utun_if_xname[IFXNAMSIZ]; - char utun_unique_name[IFXNAMSIZ]; - // PCB lock protects state fields and rings - decl_lck_rw_data(, utun_pcb_lock); - struct mbuf * utun_input_chain; - struct mbuf * utun_input_chain_last; - u_int32_t utun_input_chain_count; - // Input chain lock protects the list of input mbufs - // The input chain lock must be taken AFTER the PCB lock if both are held - lck_mtx_t utun_input_chain_lock; - -#if UTUN_NEXUS - // lock to protect utun_pcb_data_move & utun_pcb_drainers - decl_lck_mtx_data(, utun_pcb_data_move_lock); - u_int32_t utun_pcb_data_move; /* number of data moving contexts */ - u_int32_t utun_pcb_drainers; /* number of threads waiting to drain */ - u_int32_t utun_pcb_data_path_state; /* internal state of interface data path */ - - struct utun_nx utun_nx; - int utun_kpipe_enabled; - uuid_t utun_kpipe_uuid; - void * utun_kpipe_rxring; - void * utun_kpipe_txring; - kern_pbufpool_t utun_kpipe_pp; - u_int32_t utun_kpipe_tx_ring_size; - u_int32_t utun_kpipe_rx_ring_size; - - kern_nexus_t utun_netif_nexus; - kern_pbufpool_t utun_netif_pp; - void * utun_netif_rxring; - void * utun_netif_txring; - uint64_t utun_netif_txring_size; - - u_int32_t utun_slot_size; - u_int32_t utun_netif_ring_size; - u_int32_t utun_tx_fsw_ring_size; - u_int32_t utun_rx_fsw_ring_size; - // Auto attach flowswitch when netif is enabled. When set to false, - // it allows userspace nexus controller to attach and own flowswitch. 
- bool utun_attach_fsw; - bool utun_netif_connected; - bool utun_use_netif; - bool utun_needs_netagent; -#endif // UTUN_NEXUS -}; - /* Kernel Control functions */ static errno_t utun_ctl_setup(u_int32_t *unit, void **unitinfo); static errno_t utun_ctl_bind(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, @@ -182,13 +109,21 @@ static errno_t utun_proto_input(ifnet_t interface, protocol_family_t protocol, static errno_t utun_proto_pre_output(ifnet_t interface, protocol_family_t protocol, mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type, char *link_layer_dest); -static errno_t utun_pkt_input(struct utun_pcb *pcb, mbuf_t m); -/* data movement refcounting functions */ #if UTUN_NEXUS -static boolean_t utun_data_move_begin(struct utun_pcb *pcb); -static void utun_data_move_end(struct utun_pcb *pcb); -static void utun_wait_data_move_drain(struct utun_pcb *pcb); +static nexus_controller_t utun_ncd; +static int utun_ncd_refcount; +static uuid_t utun_kpipe_uuid; +static uuid_t utun_nx_dom_prov; + +typedef struct utun_nx { + uuid_t if_provider; + uuid_t if_instance; + uuid_t fsw_provider; + uuid_t fsw_instance; + uuid_t fsw_device; + uuid_t fsw_agent; +} *utun_nx_t; /* Data path states */ #define UTUN_PCB_DATA_PATH_READY 0x1 @@ -208,6 +143,13 @@ static void utun_wait_data_move_drain(struct utun_pcb *pcb); #define UTUN_IF_DEFAULT_BUF_SEG_SIZE skmem_usr_buf_seg_size #define UTUN_IF_HEADROOM_SIZE 32 +#define UTUN_IF_WMM_RING_COUNT NEXUS_NUM_WMM_QUEUES +#define UTUN_IF_MAX_RING_COUNT UTUN_IF_WMM_RING_COUNT +#define UTUN_NETIF_WMM_TX_RING_COUNT UTUN_IF_WMM_RING_COUNT +#define UTUN_NETIF_WMM_RX_RING_COUNT 1 +#define UTUN_NETIF_MAX_TX_RING_COUNT UTUN_NETIF_WMM_TX_RING_COUNT +#define UTUN_NETIF_MAX_RX_RING_COUNT UTUN_NETIF_WMM_RX_RING_COUNT + #define UTUN_IF_MIN_RING_SIZE 8 #define UTUN_IF_MAX_RING_SIZE 1024 @@ -273,8 +215,80 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_ring_t ring, uint32_t flags); #endif // UTUN_NEXUS +/* Control block allocated for each kernel control connection */ +struct utun_pcb { + TAILQ_ENTRY(utun_pcb) utun_chain; + kern_ctl_ref utun_ctlref; + ifnet_t utun_ifp; + u_int32_t utun_unit; + u_int32_t utun_unique_id; + u_int32_t utun_external_flags; + // These internal flags are only used within this driver + u_int32_t utun_internal_flags; + int utun_ext_ifdata_stats; + u_int32_t utun_max_pending_packets; + char utun_if_xname[IFXNAMSIZ]; + char utun_unique_name[IFXNAMSIZ]; + // PCB lock protects state fields and rings + decl_lck_rw_data(, utun_pcb_lock); + struct mbuf * utun_input_chain; + struct mbuf * utun_input_chain_last; + u_int32_t utun_input_chain_count; + // Input chain lock protects the list of input mbufs + // The input chain lock must be taken AFTER the PCB lock if both are held + lck_mtx_t utun_input_chain_lock; + +#if UTUN_NEXUS + // lock to protect utun_pcb_data_move & utun_pcb_drainers + decl_lck_mtx_data(, utun_pcb_data_move_lock); + u_int32_t utun_pcb_data_move; /* number of data moving contexts */ + u_int32_t utun_pcb_drainers; /* number of threads waiting to drain */ + u_int32_t utun_pcb_data_path_state; /* internal state of interface data path */ + + struct utun_nx utun_nx; + u_int32_t utun_kpipe_count; + pid_t utun_kpipe_pid; + uuid_t utun_kpipe_uuid[UTUN_IF_MAX_RING_COUNT]; + void * utun_kpipe_rxring[UTUN_IF_MAX_RING_COUNT]; + void * utun_kpipe_txring[UTUN_IF_MAX_RING_COUNT]; + kern_pbufpool_t utun_kpipe_pp; + u_int32_t utun_kpipe_tx_ring_size; + u_int32_t utun_kpipe_rx_ring_size; + uuid_t 
utun_kpipe_proc_uuid; + + kern_nexus_t utun_netif_nexus; + kern_pbufpool_t utun_netif_pp; + void * utun_netif_rxring[UTUN_NETIF_MAX_RX_RING_COUNT]; + void * utun_netif_txring[UTUN_NETIF_MAX_TX_RING_COUNT]; + uint64_t utun_netif_txring_size; + + u_int32_t utun_slot_size; + u_int32_t utun_netif_ring_size; + u_int32_t utun_tx_fsw_ring_size; + u_int32_t utun_rx_fsw_ring_size; + // Auto attach flowswitch when netif is enabled. When set to false, + // it allows userspace nexus controller to attach and own flowswitch. + bool utun_attach_fsw; + bool utun_netif_connected; + bool utun_use_netif; + bool utun_needs_netagent; +#endif // UTUN_NEXUS +}; + +/* These are internal flags not exposed outside this file */ +#define UTUN_FLAGS_KPIPE_ALLOCATED 1 + +static errno_t utun_pkt_input(struct utun_pcb *pcb, mbuf_t m); + +/* data movement refcounting functions */ +#if UTUN_NEXUS +static boolean_t utun_data_move_begin(struct utun_pcb *pcb); +static void utun_data_move_end(struct utun_pcb *pcb); +static void utun_wait_data_move_drain(struct utun_pcb *pcb); +#endif // UTUN_NEXUS + #define UTUN_DEFAULT_MTU 1500 -#define UTUN_HEADER_SIZE(_pcb) (sizeof(u_int32_t) + (((_pcb)->utun_flags & UTUN_FLAGS_ENABLE_PROC_UUID) ? sizeof(uuid_t) : 0)) +#define UTUN_HEADER_SIZE(_pcb) (sizeof(u_int32_t) + (((_pcb)->utun_external_flags & UTUN_FLAGS_ENABLE_PROC_UUID) ? sizeof(uuid_t) : 0)) static kern_ctl_ref utun_kctlref; static LCK_ATTR_DECLARE(utun_lck_attr, 0, 0); @@ -286,6 +300,53 @@ TAILQ_HEAD(utun_list, utun_pcb) utun_head; static KALLOC_TYPE_DEFINE(utun_pcb_zone, struct utun_pcb, NET_KT_DEFAULT); #if UTUN_NEXUS +/* Macros to clear/set/test flags. */ +static inline void +utun_flag_set(struct utun_pcb *pcb, uint32_t flag) +{ + pcb->utun_internal_flags |= flag; +} + +static inline void +utun_flag_clr(struct utun_pcb *pcb, uint32_t flag) +{ + pcb->utun_internal_flags &= ~flag; +} + +static inline bool +utun_flag_isset(struct utun_pcb *pcb, uint32_t flag) +{ + return !!(pcb->utun_internal_flags & flag); +} + +static inline bool +utun_in_wmm_mode(struct utun_pcb *pcb) +{ + return pcb->utun_kpipe_count == UTUN_IF_WMM_RING_COUNT; +} + +static uint8_t +utun_find_tx_ring_by_svc(kern_packet_svc_class_t svc_class) +{ + switch (svc_class) { + case KPKT_SC_VO: { + return 0; + } + case KPKT_SC_VI: { + return 1; + } + case KPKT_SC_BE: { + return 2; + } + case KPKT_SC_BK: { + return 3; + } + default: { + VERIFY(0); + return 0; + } + } +} static int sysctl_if_utun_ring_size SYSCTL_HANDLER_ARGS @@ -360,11 +421,23 @@ utun_netif_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(ring_ctx) struct utun_pcb *__single pcb = kern_nexus_get_context(nexus); if (!is_tx_ring) { - VERIFY(pcb->utun_netif_rxring == NULL); - pcb->utun_netif_rxring = ring; + VERIFY(pcb->utun_netif_rxring[0] == NULL); + pcb->utun_netif_rxring[0] = ring; } else { - VERIFY(pcb->utun_netif_txring == NULL); - pcb->utun_netif_txring = ring; + uint8_t ring_idx = 0; + if (utun_in_wmm_mode(pcb)) { + int err; + kern_packet_svc_class_t svc_class; + err = kern_channel_get_service_class(ring, &svc_class); + VERIFY(err == 0); + ring_idx = utun_find_tx_ring_by_svc(svc_class); + VERIFY(ring_idx < UTUN_IF_WMM_RING_COUNT); + } + + *ring_ctx = __unsafe_forge_single(void *, (uintptr_t)ring_idx); + + VERIFY(pcb->utun_netif_txring[ring_idx] == NULL); + pcb->utun_netif_txring[ring_idx] = ring; } return 0; } @@ -375,11 +448,23 @@ utun_netif_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus, { #pragma unused(nxprov) struct utun_pcb *__single pcb = 
kern_nexus_get_context(nexus); - if (pcb->utun_netif_rxring == ring) { - pcb->utun_netif_rxring = NULL; - } else if (pcb->utun_netif_txring == ring) { - pcb->utun_netif_txring = NULL; + bool found = false; + + for (int i = 0; i < UTUN_NETIF_MAX_RX_RING_COUNT; i++) { + if (pcb->utun_netif_rxring[i] == ring) { + pcb->utun_netif_rxring[i] = NULL; + VERIFY(!found); + found = true; + } } + for (int i = 0; i < UTUN_NETIF_MAX_TX_RING_COUNT; i++) { + if (pcb->utun_netif_txring[i] == ring) { + pcb->utun_netif_txring[i] = NULL; + VERIFY(!found); + found = true; + } + } + VERIFY(found); } static errno_t @@ -415,8 +500,14 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, return 0; } - if (pcb->utun_kpipe_enabled) { - kern_channel_ring_t __single rx_ring = pcb->utun_kpipe_rxring; + if (pcb->utun_kpipe_count > 0 && + utun_flag_isset(pcb, UTUN_FLAGS_KPIPE_ALLOCATED)) { + // Select the corresponding kpipe rx ring + uint8_t ring_idx = (uint8_t)(uintptr_t)kern_channel_ring_get_context(tx_ring); + VERIFY(ring_idx < UTUN_IF_MAX_RING_COUNT); + kern_channel_ring_t __single rx_ring = pcb->utun_kpipe_rxring[ring_idx]; + + // Unlock while calling notify lck_rw_unlock_shared(&pcb->utun_pcb_lock); // Signal the kernel pipe ring to read @@ -495,7 +586,7 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, // Copy in family memcpy(tx_baddr, &af, sizeof(af)); - if (pcb->utun_flags & UTUN_FLAGS_ENABLE_PROC_UUID) { + if (pcb->utun_external_flags & UTUN_FLAGS_ENABLE_PROC_UUID) { kern_packet_get_euuid(tx_ph, (void *)(tx_baddr + sizeof(af))); } @@ -551,14 +642,16 @@ utun_netif_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, } static errno_t -utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, - kern_channel_ring_t ring, __unused uint32_t flags) +utun_netif_tx_doorbell_one(kern_nexus_provider_t nxprov, kern_nexus_t nexus, + kern_channel_ring_t ring, uint32_t flags, uint8_t ring_idx) { #pragma unused(nxprov) struct utun_pcb *__single pcb = kern_nexus_get_context(nexus); boolean_t more = false; errno_t rc = 0; + VERIFY((flags & KERN_NEXUS_TXDOORBELLF_ASYNC_REFILL) == 0); + if (!utun_data_move_begin(pcb)) { os_log_info(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->utun_ifp)); @@ -572,13 +665,22 @@ utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, */ rc = kern_channel_tx_refill_canblock(ring, UINT32_MAX, UINT32_MAX, true, &more); if (rc != 0 && rc != EAGAIN && rc != EBUSY) { - os_log_error(OS_LOG_DEFAULT, "%s, tx refill failed %d\n", __func__, rc); + os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s tx refill failed %d\n", __func__, + if_name(pcb->utun_ifp), ring->ckr_name, rc); } (void) kr_enter(ring, TRUE); lck_rw_lock_shared(&pcb->utun_pcb_lock); + if (ring != pcb->utun_netif_txring[ring_idx]) { + // ring no longer valid + lck_rw_unlock_shared(&pcb->utun_pcb_lock); + kr_exit(ring); + os_log_error(OS_LOG_DEFAULT, "%s: %s ring %s index %d bad netif_txring 3\n", __func__, + if_name(pcb->utun_ifp), ring->ckr_name, ring_idx); + return ENXIO; + } - if (pcb->utun_kpipe_enabled) { + if (pcb->utun_kpipe_count > 0) { uint32_t tx_available = kern_channel_available_slot_count(ring); if (pcb->utun_netif_txring_size > 0 && tx_available >= pcb->utun_netif_txring_size - 1) { @@ -590,8 +692,8 @@ utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, } } - if (pcb->utun_kpipe_enabled) { - kern_channel_ring_t __single rx_ring = pcb->utun_kpipe_rxring; + if (pcb->utun_kpipe_count > 0) { + kern_channel_ring_t 
__single rx_ring = pcb->utun_kpipe_rxring[ring_idx]; // Unlock while calling notify lck_rw_unlock_shared(&pcb->utun_pcb_lock); @@ -608,6 +710,33 @@ utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, return 0; } +static errno_t +utun_netif_tx_doorbell(kern_nexus_provider_t nxprov, kern_nexus_t nexus, + kern_channel_ring_t ring, __unused uint32_t flags) +{ + errno_t ret = 0; + struct utun_pcb *__single pcb = kern_nexus_get_context(nexus); + + if (!utun_data_move_begin(pcb)) { + os_log_error(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", __func__, if_name(pcb->utun_ifp)); + return 0; + } + + if (utun_in_wmm_mode(pcb)) { + for (uint8_t i = 0; i < UTUN_IF_WMM_RING_COUNT; i++) { + kern_channel_ring_t __single nring = pcb->utun_netif_txring[i]; + ret = utun_netif_tx_doorbell_one(nxprov, nexus, nring, flags, i); + if (ret) { + break; + } + } + } else { + ret = utun_netif_tx_doorbell_one(nxprov, nexus, ring, flags, 0); + } + + utun_data_move_end(pcb); + return ret; +} static errno_t utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_ring_t rx_ring, uint32_t flags) @@ -741,138 +870,149 @@ utun_netif_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); } - struct kern_channel_ring_stat_increment tx_ring_stats; - bzero(&tx_ring_stats, sizeof(tx_ring_stats)); - kern_channel_ring_t __single tx_ring = pcb->utun_kpipe_txring; - kern_channel_slot_t tx_pslot = NULL; - kern_channel_slot_t tx_slot = NULL; - if (tx_ring == NULL) { - // Net-If TX ring not set up yet, nothing to read - goto done; - } - // Unlock utun before entering ring - lck_rw_unlock_shared(&pcb->utun_pcb_lock); + for (uint8_t ring_idx = 0; ring_idx < pcb->utun_kpipe_count; ring_idx++) { + struct kern_channel_ring_stat_increment tx_ring_stats = {}; + kern_channel_slot_t tx_pslot = NULL; + kern_channel_slot_t tx_slot = NULL; - (void)kr_enter(tx_ring, TRUE); + kern_channel_ring_t __single tx_ring = pcb->utun_kpipe_txring[ring_idx]; + if (tx_ring == NULL) { + // Net-If TX ring not set up yet, nothing to read + goto done; + } + // Unlock utun before entering ring + lck_rw_unlock_shared(&pcb->utun_pcb_lock); - // Lock again after entering and validate - lck_rw_lock_shared(&pcb->utun_pcb_lock); - if (tx_ring != pcb->utun_kpipe_txring) { - goto done; - } + (void)kr_enter(tx_ring, TRUE); - tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); - if (tx_slot == NULL) { - // Nothing to read, don't bother signalling - goto done; - } - - while (rx_slot != NULL && tx_slot != NULL) { - // Allocate rx packet - kern_packet_t rx_ph = 0; - kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot); - - // Advance TX ring - tx_pslot = tx_slot; - tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL); - - /* Skip slot if packet is zero-length or marked as dropped (QUMF_DROPPED) */ - if (tx_ph == 0) { - continue; + // Lock again after entering and validate + lck_rw_lock_shared(&pcb->utun_pcb_lock); + if (tx_ring != pcb->utun_kpipe_txring[ring_idx]) { + goto done; } - /* XXX We could try this alloc before advancing the slot to avoid - * dropping the packet on failure to allocate. 
- */ - errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); - if (__improbable(error != 0)) { - STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT); - STATS_INC(nifs, NETIF_STATS_DROP); - break; + tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); + if (tx_slot == NULL) { + // Nothing to read, don't bother signalling + goto done; } - kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL); - VERIFY(tx_buf != NULL); - uint8_t *tx_baddr = __unsafe_forge_bidi_indexable(uint8_t *, - kern_buflet_get_data_address(tx_buf), - kern_buflet_get_data_limit(tx_buf)); - VERIFY(tx_baddr != 0); - tx_baddr += kern_buflet_get_data_offset(tx_buf); + while (rx_slot != NULL && tx_slot != NULL) { + // Allocate rx packet + kern_packet_t rx_ph = 0; + kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot); - // Check packet length - size_t header_offset = UTUN_HEADER_SIZE(pcb); - uint32_t tx_length = kern_packet_get_data_length(tx_ph); - if (tx_length < header_offset) { - // Packet is too small - kern_pbufpool_free(rx_pp, rx_ph); - STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROP); - os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_rx %s: packet length too short for header %u < %zu\n", - pcb->utun_ifp->if_xname, tx_length, header_offset); - continue; + // Advance TX ring + tx_pslot = tx_slot; + tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL); + + /* Skip slot if packet is zero-length or marked as dropped (QUMF_DROPPED) */ + if (tx_ph == 0) { + continue; + } + + /* XXX We could try this alloc before advancing the slot to avoid + * dropping the packet on failure to allocate. + */ + errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); + if (__improbable(error != 0)) { + STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT); + STATS_INC(nifs, NETIF_STATS_DROP); + break; + } + + kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL); + VERIFY(tx_buf != NULL); + uint8_t *tx_baddr = __unsafe_forge_bidi_indexable(uint8_t *, + kern_buflet_get_data_address(tx_buf), + kern_buflet_get_data_limit(tx_buf)); + VERIFY(tx_baddr != 0); + tx_baddr += kern_buflet_get_data_offset(tx_buf); + + // Check packet length + size_t header_offset = UTUN_HEADER_SIZE(pcb); + uint32_t tx_length = kern_packet_get_data_length(tx_ph); + if (tx_length < header_offset) { + // Packet is too small + kern_pbufpool_free(rx_pp, rx_ph); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + os_log_error(OS_LOG_DEFAULT, "utun_netif_sync_rx %s: packet length too short for header %u < %zu\n", + pcb->utun_ifp->if_xname, tx_length, header_offset); + continue; + } + + size_t length = MIN(tx_length - header_offset, + pcb->utun_slot_size); + + tx_ring_stats.kcrsi_slots_transferred++; + tx_ring_stats.kcrsi_bytes_transferred += length; + + // Fillout rx packet + kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL); + VERIFY(rx_buf != NULL); + void *rx_baddr = __unsafe_forge_bidi_indexable(void *, + kern_buflet_get_data_address(rx_buf), + kern_buflet_get_data_limit(rx_buf)); + VERIFY(rx_baddr != NULL); + + // Copy-in data from tx to rx + memcpy((void *)rx_baddr, (void *)(tx_baddr + header_offset), length); + kern_packet_clear_flow_uuid(rx_ph); // Zero flow id + + // Finalize and attach the packet + error = kern_buflet_set_data_offset(rx_buf, 0); + VERIFY(error == 0); + error = kern_buflet_set_data_length(rx_buf, length); + VERIFY(error == 0); + error = kern_packet_set_headroom(rx_ph, 0); + VERIFY(error == 0); + + if (__packet_get_wake_flag(tx_ph)) { + 
__packet_set_wake_flag(rx_ph); + } + + error = kern_packet_finalize(rx_ph); + VERIFY(error == 0); + error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); + VERIFY(error == 0); + + STATS_INC(nifs, NETIF_STATS_RX_PACKETS); + STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT); + bpf_tap_packet_in(pcb->utun_ifp, DLT_RAW, rx_ph, NULL, 0); + + rx_ring_stats.kcrsi_slots_transferred++; + rx_ring_stats.kcrsi_bytes_transferred += length; + + rx_pslot = rx_slot; + rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); } - size_t length = MIN(tx_length - header_offset, - pcb->utun_slot_size); - - tx_ring_stats.kcrsi_slots_transferred++; - tx_ring_stats.kcrsi_bytes_transferred += length; - - // Fillout rx packet - kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL); - VERIFY(rx_buf != NULL); - void *rx_baddr = __unsafe_forge_bidi_indexable(void *, - kern_buflet_get_data_address(rx_buf), - kern_buflet_get_data_limit(rx_buf)); - VERIFY(rx_baddr != NULL); - - // Copy-in data from tx to rx - memcpy((void *)rx_baddr, (void *)(tx_baddr + header_offset), length); - kern_packet_clear_flow_uuid(rx_ph); // Zero flow id - - // Finalize and attach the packet - error = kern_buflet_set_data_offset(rx_buf, 0); - VERIFY(error == 0); - error = kern_buflet_set_data_length(rx_buf, length); - VERIFY(error == 0); - error = kern_packet_set_headroom(rx_ph, 0); - VERIFY(error == 0); - error = kern_packet_finalize(rx_ph); - VERIFY(error == 0); - error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); - VERIFY(error == 0); - - STATS_INC(nifs, NETIF_STATS_RX_PACKETS); - STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT); - bpf_tap_packet_in(pcb->utun_ifp, DLT_RAW, rx_ph, NULL, 0); - - rx_ring_stats.kcrsi_slots_transferred++; - rx_ring_stats.kcrsi_bytes_transferred += length; - - rx_pslot = rx_slot; - rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); - } - done: + if (tx_pslot) { + kern_channel_advance_slot(tx_ring, tx_pslot); + kern_channel_increment_ring_net_stats(tx_ring, pcb->utun_ifp, &tx_ring_stats); + (void)kern_channel_reclaim(tx_ring); + } + + // Unlock first, then exit ring + lck_rw_unlock_shared(&pcb->utun_pcb_lock); + if (tx_ring != NULL) { + if (tx_pslot != NULL) { + kern_channel_notify(tx_ring, 0); + } + kr_exit(tx_ring); + } + lck_rw_lock_shared(&pcb->utun_pcb_lock); + } + if (rx_pslot) { kern_channel_advance_slot(rx_ring, rx_pslot); kern_channel_increment_ring_net_stats(rx_ring, pcb->utun_ifp, &rx_ring_stats); } - if (tx_pslot) { - kern_channel_advance_slot(tx_ring, tx_pslot); - kern_channel_increment_ring_net_stats(tx_ring, pcb->utun_ifp, &tx_ring_stats); - (void)kern_channel_reclaim(tx_ring); - } - // Unlock first, then exit ring lck_rw_unlock_shared(&pcb->utun_pcb_lock); - if (tx_ring != NULL) { - if (tx_pslot != NULL) { - kern_channel_notify(tx_ring, 0); - } - kr_exit(tx_ring); - } utun_data_move_end(pcb); return 0; @@ -927,12 +1067,31 @@ utun_nexus_ifattach(struct utun_pcb *pcb, err = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_SLOTS, ring_size); VERIFY(err == 0); + if (utun_in_wmm_mode(pcb)) { + os_log(OS_LOG_DEFAULT, "%s: %s enabling wmm mode\n", + __func__, pcb->utun_if_xname); + + init_params->output_sched_model = IFNET_SCHED_MODEL_DRIVER_MANAGED; + + err = kern_nexus_attr_set(nxa, NEXUS_ATTR_TX_RINGS, + UTUN_NETIF_WMM_TX_RING_COUNT); + VERIFY(err == 0); + err = kern_nexus_attr_set(nxa, NEXUS_ATTR_RX_RINGS, + UTUN_NETIF_WMM_RX_RING_COUNT); + VERIFY(err == 0); + + err = kern_nexus_attr_set(nxa, NEXUS_ATTR_QMAP, NEXUS_QMAP_TYPE_WMM); + VERIFY(err == 0); + } + 
pcb->utun_netif_txring_size = ring_size; bzero(&pp_init, sizeof(pp_init)); pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; - pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE; - pp_init.kbi_packets = pcb->utun_netif_ring_size * 2; + pp_init.kbi_flags |= (KBIF_VIRTUAL_DEVICE | KBIF_USER_ACCESS); + // Note: we need more packets than can be held in the tx and rx rings because + // packets can also be in the AQM queue(s) + pp_init.kbi_packets = pcb->utun_netif_ring_size * (2 * pcb->utun_kpipe_count + 1); pp_init.kbi_bufsize = pcb->utun_slot_size; pp_init.kbi_buf_seg_size = UTUN_IF_DEFAULT_BUF_SEG_SIZE; pp_init.kbi_max_frags = 1; @@ -1305,44 +1464,75 @@ utun_unregister_kernel_pipe_nexus(void) lck_mtx_unlock(&utun_lock); } -// For use by socket option, not internally -static errno_t -utun_disable_channel(struct utun_pcb *pcb) +/* This structure only holds onto kpipe channels that need to be + * freed in the future, but are cleared from the pcb under lock + */ +struct utun_detached_channels { + int count; + kern_pbufpool_t pp; + uuid_t uuids[UTUN_IF_MAX_RING_COUNT]; +}; + +static void +utun_detach_channels(struct utun_pcb *pcb, struct utun_detached_channels *dc) { - errno_t result; - int enabled; - uuid_t uuid; + LCK_RW_ASSERT(&pcb->utun_pcb_lock, LCK_RW_TYPE_EXCLUSIVE); - /* Wait until all threads in the data paths are done. */ - utun_wait_data_move_drain(pcb); - - lck_rw_lock_exclusive(&pcb->utun_pcb_lock); - - enabled = pcb->utun_kpipe_enabled; - uuid_copy(uuid, pcb->utun_kpipe_uuid); - - VERIFY(uuid_is_null(pcb->utun_kpipe_uuid) == !enabled); - - pcb->utun_kpipe_enabled = 0; - uuid_clear(pcb->utun_kpipe_uuid); - - lck_rw_unlock_exclusive(&pcb->utun_pcb_lock); - - if (enabled) { - result = kern_nexus_controller_free_provider_instance(utun_ncd, uuid); - } else { - result = ENXIO; - } - - if (!result) { - if (pcb->utun_kpipe_pp != NULL) { - kern_pbufpool_destroy(pcb->utun_kpipe_pp); - pcb->utun_kpipe_pp = NULL; + if (!utun_flag_isset(pcb, UTUN_FLAGS_KPIPE_ALLOCATED)) { + for (int i = 0; i < UTUN_IF_MAX_RING_COUNT; i++) { + VERIFY(uuid_is_null(pcb->utun_kpipe_uuid[i])); } - utun_unregister_kernel_pipe_nexus(); + dc->count = 0; + return; } - return result; + dc->count = pcb->utun_kpipe_count; + + VERIFY(dc->count >= 0); + VERIFY(dc->count <= UTUN_IF_MAX_RING_COUNT); + + for (int i = 0; i < dc->count; i++) { + VERIFY(!uuid_is_null(pcb->utun_kpipe_uuid[i])); + uuid_copy(dc->uuids[i], pcb->utun_kpipe_uuid[i]); + uuid_clear(pcb->utun_kpipe_uuid[i]); + } + for (int i = dc->count; i < UTUN_IF_MAX_RING_COUNT; i++) { + VERIFY(uuid_is_null(pcb->utun_kpipe_uuid[i])); + } + + if (dc->count) { + VERIFY(pcb->utun_kpipe_pp); + } else { + VERIFY(!pcb->utun_kpipe_pp); + } + + dc->pp = pcb->utun_kpipe_pp; + + pcb->utun_kpipe_pp = NULL; + + utun_flag_clr(pcb, UTUN_FLAGS_KPIPE_ALLOCATED); +} + +static void +utun_free_channels(struct utun_detached_channels *dc) +{ + if (!dc->count) { + return; + } + + for (int i = 0; i < dc->count; i++) { + errno_t result; + result = kern_nexus_controller_free_provider_instance(utun_ncd, + dc->uuids[i]); + VERIFY(!result); + } + + VERIFY(dc->pp); + kern_pbufpool_destroy(dc->pp); + + utun_unregister_kernel_pipe_nexus(); + + memset(dc, 0, sizeof(*dc)); } static errno_t @@ -1358,6 +1548,9 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc) return result; } + VERIFY(pcb->utun_kpipe_count); + VERIFY(!utun_flag_isset(pcb, UTUN_FLAGS_KPIPE_ALLOCATED)); + result = utun_register_kernel_pipe_nexus(pcb); if (result) { return result; @@ -1367,11 +1560,6 @@ utun_enable_channel(struct 
utun_pcb *pcb, struct proc *proc) lck_rw_lock_exclusive(&pcb->utun_pcb_lock); - if (pcb->utun_kpipe_enabled) { - result = EEXIST; // return success instead? - goto done; - } - /* * Make sure we can fit packets in the channel buffers and * Allow an extra 4 bytes for the protocol number header in the channel @@ -1383,12 +1571,12 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc) bzero(&pp_init, sizeof(pp_init)); pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; - pp_init.kbi_flags |= KBIF_VIRTUAL_DEVICE; - pp_init.kbi_packets = pcb->utun_netif_ring_size * 2; + pp_init.kbi_flags |= (KBIF_VIRTUAL_DEVICE | KBIF_USER_ACCESS); + // Note: We only need as many packets as can be held in the tx and rx rings + pp_init.kbi_packets = pcb->utun_netif_ring_size * 2 * pcb->utun_kpipe_count; pp_init.kbi_bufsize = pcb->utun_slot_size; pp_init.kbi_buf_seg_size = UTUN_IF_DEFAULT_BUF_SEG_SIZE; pp_init.kbi_max_frags = 1; - pp_init.kbi_flags |= KBIF_QUANTUM; (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name), "com.apple.kpipe.%s", pcb->utun_if_xname); pp_init.kbi_ctx = NULL; @@ -1402,29 +1590,41 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc) goto done; } - VERIFY(uuid_is_null(pcb->utun_kpipe_uuid)); bzero(&init, sizeof(init)); init.nxi_version = KERN_NEXUS_CURRENT_VERSION; init.nxi_tx_pbufpool = pcb->utun_kpipe_pp; - result = kern_nexus_controller_alloc_provider_instance(utun_ncd, - utun_kpipe_uuid, pcb, NULL, &pcb->utun_kpipe_uuid, &init); - if (result) { - goto done; - } - nexus_port_t port = NEXUS_PORT_KERNEL_PIPE_CLIENT; - uuid_t uuid_null = {}; - result = kern_nexus_controller_bind_provider_instance(utun_ncd, - pcb->utun_kpipe_uuid, &port, - proc_pid(proc), uuid_null, NULL, 0, NEXUS_BIND_PID); - if (result) { - kern_nexus_controller_free_provider_instance(utun_ncd, - pcb->utun_kpipe_uuid); - uuid_clear(pcb->utun_kpipe_uuid); - goto done; - } + for (unsigned int i = 0; i < pcb->utun_kpipe_count; i++) { + VERIFY(uuid_is_null(pcb->utun_kpipe_uuid[i])); + result = kern_nexus_controller_alloc_provider_instance(utun_ncd, + utun_kpipe_uuid, pcb, NULL, &pcb->utun_kpipe_uuid[i], &init); - pcb->utun_kpipe_enabled = 1; + if (result == 0) { + nexus_port_t port = NEXUS_PORT_KERNEL_PIPE_CLIENT; + uuid_t uuid_null = {}; + const bool has_proc_uuid = !uuid_is_null(pcb->utun_kpipe_proc_uuid); + pid_t pid = pcb->utun_kpipe_pid; + if (!pid && !has_proc_uuid) { + pid = proc_pid(proc); + } + result = kern_nexus_controller_bind_provider_instance(utun_ncd, + pcb->utun_kpipe_uuid[i], &port, + pid, has_proc_uuid ? pcb->utun_kpipe_proc_uuid : uuid_null, NULL, + 0, has_proc_uuid ? 
NEXUS_BIND_EXEC_UUID : NEXUS_BIND_PID); + } + + if (result != 0) { + /* Unwind all of them on error */ + for (int j = 0; j < UTUN_IF_MAX_RING_COUNT; j++) { + if (!uuid_is_null(pcb->utun_kpipe_uuid[j])) { + kern_nexus_controller_free_provider_instance(utun_ncd, + pcb->utun_kpipe_uuid[j]); + uuid_clear(pcb->utun_kpipe_uuid[j]); + } + } + goto done; + } + } done: lck_rw_unlock_exclusive(&pcb->utun_pcb_lock); @@ -1435,6 +1635,8 @@ done: pcb->utun_kpipe_pp = NULL; } utun_unregister_kernel_pipe_nexus(); + } else { + utun_flag_set(pcb, UTUN_FLAGS_KPIPE_ALLOCATED); } return result; @@ -1666,7 +1868,7 @@ utun_ctl_connect(kern_ctl_ref kctlref, return EINVAL; } - /* Handle case where utun_ctl_setup() was called, but ipsec_ctl_bind() was not */ + /* Handle case where utun_ctl_setup() was called, but utun_ctl_bind() was not */ if (pcb->utun_ctlref == NULL) { (void)utun_ctl_bind(kctlref, sac, unitinfo); } @@ -1704,6 +1906,15 @@ utun_ctl_connect(kern_ctl_ref kctlref, utun_init.free = utun_detached; #if UTUN_NEXUS + /* We don't support kpipes without a netif */ + if (pcb->utun_kpipe_count > 0 && !pcb->utun_use_netif) { + result = ENOTSUP; + os_log_error(OS_LOG_DEFAULT, "utun_ctl_connect - kpipe requires netif: failed %d\n", result); + utun_free_pcb(pcb, false); + *unitinfo = NULL; + return result; + } + if (pcb->utun_use_netif) { result = utun_nexus_ifattach(pcb, &utun_init, &pcb->utun_ifp); if (result != 0) { @@ -1713,6 +1924,17 @@ utun_ctl_connect(kern_ctl_ref kctlref, return result; } + if (pcb->utun_kpipe_count) { + result = utun_enable_channel(pcb, current_proc()); + if (result) { + os_log_error(OS_LOG_DEFAULT, "%s: %s failed to enable channels\n", + __func__, pcb->utun_if_xname); + utun_free_pcb(pcb, false); + *unitinfo = NULL; + return result; + } + } + if (pcb->utun_attach_fsw) { result = utun_flowswitch_attach(pcb); if (result != 0) { @@ -1953,10 +2175,8 @@ utun_ctl_disconnect(__unused kern_ctl_ref kctlref, lck_rw_lock_exclusive(&pcb->utun_pcb_lock); #if UTUN_NEXUS - uuid_t kpipe_uuid; - uuid_copy(kpipe_uuid, pcb->utun_kpipe_uuid); - uuid_clear(pcb->utun_kpipe_uuid); - pcb->utun_kpipe_enabled = FALSE; + struct utun_detached_channels dc = {}; + utun_detach_channels(pcb, &dc); #endif // UTUN_NEXUS pcb->utun_ctlref = NULL; @@ -1992,15 +2212,7 @@ utun_ctl_disconnect(__unused kern_ctl_ref kctlref, lck_rw_unlock_exclusive(&pcb->utun_pcb_lock); - if (!uuid_is_null(kpipe_uuid)) { - if (kern_nexus_controller_free_provider_instance(utun_ncd, kpipe_uuid) == 0) { - if (pcb->utun_kpipe_pp != NULL) { - kern_pbufpool_destroy(pcb->utun_kpipe_pp); - pcb->utun_kpipe_pp = NULL; - } - utun_unregister_kernel_pipe_nexus(); - } - } + utun_free_channels(&dc); utun_nexus_detach(pcb); /* Decrement refcnt added by ifnet_datamov_suspend_and_drain(). 
*/ @@ -2011,15 +2223,7 @@ utun_ctl_disconnect(__unused kern_ctl_ref kctlref, lck_rw_unlock_exclusive(&pcb->utun_pcb_lock); #if UTUN_NEXUS - if (!uuid_is_null(kpipe_uuid)) { - if (kern_nexus_controller_free_provider_instance(utun_ncd, kpipe_uuid) == 0) { - if (pcb->utun_kpipe_pp != NULL) { - kern_pbufpool_destroy(pcb->utun_kpipe_pp); - pcb->utun_kpipe_pp = NULL; - } - utun_unregister_kernel_pipe_nexus(); - } - } + utun_free_channels(&dc); #endif // UTUN_NEXUS /* @@ -2101,7 +2305,7 @@ utun_ctl_setopt(__unused kern_ctl_ref kctlref, result = EINVAL; break; } - pcb->utun_flags = *(u_int32_t *)data; + pcb->utun_external_flags = *(u_int32_t *)data; break; case UTUN_OPT_EXT_IFDATA_STATS: @@ -2189,18 +2393,55 @@ utun_ctl_setopt(__unused kern_ctl_ref kctlref, result = EMSGSIZE; break; } - if (pcb->utun_ifp == NULL) { - // Only can set after connecting + if (pcb->utun_ifp != NULL) { + // Only can set before connecting result = EINVAL; break; } - if (*(int *)data) { - result = utun_enable_channel(pcb, current_proc()); - } else { - result = utun_disable_channel(pcb); + int *intp = __unsafe_forge_single(int *, data); + if (*intp != 0 && + *intp != 1 && + *intp != UTUN_IF_WMM_RING_COUNT) { + result = EINVAL; + break; } + lck_rw_lock_exclusive(&pcb->utun_pcb_lock); + pcb->utun_kpipe_count = *(int *)data; + lck_rw_unlock_exclusive(&pcb->utun_pcb_lock); break; } + case UTUN_OPT_CHANNEL_BIND_PID: { + if (len != sizeof(pid_t)) { + result = EMSGSIZE; + break; + } + if (pcb->utun_ifp != NULL) { + // Only can set before connecting + result = EINVAL; + break; + } + lck_rw_lock_exclusive(&pcb->utun_pcb_lock); + pcb->utun_kpipe_pid = *(pid_t *)data; + lck_rw_unlock_exclusive(&pcb->utun_pcb_lock); + break; + } + + case UTUN_OPT_CHANNEL_BIND_UUID: { + if (len != sizeof(uuid_t)) { + result = EMSGSIZE; + break; + } + if (pcb->utun_ifp != NULL) { + // Only can set before connecting + result = EINVAL; + break; + } + lck_rw_lock_exclusive(&pcb->utun_pcb_lock); + uuid_copy(pcb->utun_kpipe_proc_uuid, *((uuid_t *)data)); + lck_rw_unlock_exclusive(&pcb->utun_pcb_lock); + break; + } + case UTUN_OPT_ENABLE_FLOWSWITCH: { if (len != sizeof(int)) { result = EMSGSIZE; @@ -2399,7 +2640,7 @@ utun_ctl_getopt(__unused kern_ctl_ref kctlref, if (*len != sizeof(u_int32_t)) { result = EMSGSIZE; } else { - *(u_int32_t *)data = pcb->utun_flags; + *(u_int32_t *)data = pcb->utun_external_flags; } break; @@ -2439,7 +2680,29 @@ utun_ctl_getopt(__unused kern_ctl_ref kctlref, result = EMSGSIZE; } else { lck_rw_lock_shared(&pcb->utun_pcb_lock); - *(int *)data = pcb->utun_kpipe_enabled; + *(int *)data = pcb->utun_kpipe_count; + lck_rw_unlock_shared(&pcb->utun_pcb_lock); + } + break; + } + + case UTUN_OPT_CHANNEL_BIND_PID: { + if (*len != sizeof(pid_t)) { + result = EMSGSIZE; + } else { + lck_rw_lock_shared(&pcb->utun_pcb_lock); + *(pid_t *)data = pcb->utun_kpipe_pid; + lck_rw_unlock_shared(&pcb->utun_pcb_lock); + } + break; + } + + case UTUN_OPT_CHANNEL_BIND_UUID: { + if (*len != sizeof(uuid_t)) { + result = EMSGSIZE; + } else { + lck_rw_lock_shared(&pcb->utun_pcb_lock); + uuid_copy(*((uuid_t *)data), pcb->utun_kpipe_proc_uuid); lck_rw_unlock_shared(&pcb->utun_pcb_lock); } break; @@ -2467,12 +2730,14 @@ utun_ctl_getopt(__unused kern_ctl_ref kctlref, case UTUN_OPT_GET_CHANNEL_UUID: { lck_rw_lock_shared(&pcb->utun_pcb_lock); - if (uuid_is_null(pcb->utun_kpipe_uuid)) { + if (!utun_flag_isset(pcb, UTUN_FLAGS_KPIPE_ALLOCATED)) { result = ENXIO; - } else if (*len != sizeof(uuid_t)) { + } else if (*len != sizeof(uuid_t) * pcb->utun_kpipe_count) { result = 
EMSGSIZE; } else { - uuid_copy(data, pcb->utun_kpipe_uuid); + for (unsigned i = 0; i < pcb->utun_kpipe_count; i++) { + uuid_copy(((uuid_t *)data)[i], pcb->utun_kpipe_uuid[i]); + } } lck_rw_unlock_shared(&pcb->utun_pcb_lock); break; @@ -2576,25 +2841,7 @@ utun_start(ifnet_t interface) VERIFY(pcb != NULL); #if UTUN_NEXUS - lck_rw_lock_shared(&pcb->utun_pcb_lock); - if (pcb->utun_kpipe_enabled) { - lck_rw_unlock_shared(&pcb->utun_pcb_lock); - if (!utun_data_move_begin(pcb)) { - os_log_info(OS_LOG_DEFAULT, - "%s: data path stopped for %s\n", - __func__, if_name(pcb->utun_ifp)); - return; - } - /* It's possible to have channels enabled, but not yet have the channel opened, - * in which case the rxring will not be set - */ - if (pcb->utun_kpipe_rxring != NULL) { - kern_channel_notify(pcb->utun_kpipe_rxring, 0); - } - utun_data_move_end(pcb); - return; - } - lck_rw_unlock_shared(&pcb->utun_pcb_lock); + VERIFY(pcb->utun_kpipe_count == 0); // kpipe > 0 enforces use_netif #endif // UTUN_NEXUS for (;;) { @@ -2655,7 +2902,7 @@ utun_output(ifnet_t interface, } } - if (pcb->utun_flags & UTUN_FLAGS_NO_OUTPUT) { + if (pcb->utun_external_flags & UTUN_FLAGS_NO_OUTPUT) { /* flush data */ mbuf_freem(data); return 0; @@ -2778,7 +3025,7 @@ utun_framer(ifnet_t interface, #if NECP // Add process uuid if applicable - if (pcb->utun_flags & UTUN_FLAGS_ENABLE_PROC_UUID) { + if (pcb->utun_external_flags & UTUN_FLAGS_ENABLE_PROC_UUID) { if (m_pktlen(*packet) >= (int32_t)UTUN_HEADER_SIZE(pcb)) { u_int8_t *header = mtod(*packet, uint8_t*); int uuid_err = necp_get_app_uuid_from_packet(*packet, (void *)(header + sizeof(u_int32_t))); @@ -2849,6 +3096,47 @@ utun_ioctl(ifnet_t interface, break; } + case SIOCSIFSUBFAMILY: { + uint32_t subfamily; + + subfamily = ((struct ifreq*)data)->ifr_type.ift_subfamily; + switch (subfamily) { + case IFRTYPE_SUBFAMILY_BLUETOOTH: + interface->if_subfamily = IFNET_SUBFAMILY_BLUETOOTH; + break; + case IFRTYPE_SUBFAMILY_WIFI: + interface->if_subfamily = IFNET_SUBFAMILY_WIFI; + break; + case IFRTYPE_SUBFAMILY_QUICKRELAY: + interface->if_subfamily = IFNET_SUBFAMILY_QUICKRELAY; + break; + case IFRTYPE_SUBFAMILY_DEFAULT: + interface->if_subfamily = IFNET_SUBFAMILY_DEFAULT; + break; + default: + result = EINVAL; + break; + } + break; + } + + case SIOCSIFPEEREGRESSFUNCTIONALTYPE: { + uint32_t peeregressinterfacetype; + peeregressinterfacetype = ((struct ifreq*)data)->ifr_ifru.ifru_peer_egress_functional_type; + switch (peeregressinterfacetype) { + case IFRTYPE_FUNCTIONAL_WIFI_INFRA: + case IFRTYPE_FUNCTIONAL_CELLULAR: + case IFRTYPE_FUNCTIONAL_WIRED: + case IFRTYPE_FUNCTIONAL_UNKNOWN: + interface->peer_egress_functional_type = peeregressinterfacetype; + break; + default: + result = EINVAL; + break; + } + break; + } + case SIOCSIFFLAGS: /* ifioctl() takes care of it */ break; @@ -2976,7 +3264,7 @@ utun_pkt_input(struct utun_pcb *pcb, mbuf_t packet) pcb->utun_input_chain_last = packet; lck_mtx_unlock(&pcb->utun_input_chain_lock); - kern_channel_ring_t __single rx_ring = pcb->utun_netif_rxring; + kern_channel_ring_t __single rx_ring = pcb->utun_netif_rxring[0]; lck_rw_unlock_shared(&pcb->utun_pcb_lock); if (rx_ring != NULL) { @@ -2993,7 +3281,7 @@ utun_pkt_input(struct utun_pcb *pcb, mbuf_t packet) if (m_pktlen(packet) >= (int32_t)UTUN_HEADER_SIZE(pcb)) { bpf_tap_in(pcb->utun_ifp, DLT_NULL, packet, 0, 0); } - if (pcb->utun_flags & UTUN_FLAGS_NO_INPUT) { + if (pcb->utun_external_flags & UTUN_FLAGS_NO_INPUT) { /* flush data */ mbuf_freem(packet); return 0; @@ -3112,7 +3400,7 @@ 
utun_nexus_connected(kern_nexus_provider_t nxprov, kern_nexus_t nexus, { #pragma unused(nxprov, channel) struct utun_pcb *__single pcb = kern_nexus_get_context(nexus); - boolean_t ok = ifnet_is_attached(pcb->utun_ifp, 1); + boolean_t ok = ifnet_get_ioref(pcb->utun_ifp); if (pcb->utun_netif_nexus == nexus) { pcb->utun_netif_connected = true; } @@ -3167,14 +3455,30 @@ utun_kpipe_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus, { #pragma unused(nxprov) #pragma unused(channel) -#pragma unused(ring_ctx) struct utun_pcb *__single pcb = kern_nexus_get_context(nexus); + uint8_t ring_idx; + + for (ring_idx = 0; ring_idx < pcb->utun_kpipe_count; ring_idx++) { + if (!uuid_compare(channel->ch_info->cinfo_nx_uuid, pcb->utun_kpipe_uuid[ring_idx])) { + break; + } + } + + if (ring_idx == pcb->utun_kpipe_count) { + uuid_string_t uuidstr; + uuid_unparse(channel->ch_info->cinfo_nx_uuid, uuidstr); + os_log_error(OS_LOG_DEFAULT, "%s: %s cannot find channel %s\n", __func__, pcb->utun_if_xname, uuidstr); + return ENOENT; + } + + *ring_ctx = __unsafe_forge_single(void *, (uintptr_t)ring_idx); + if (!is_tx_ring) { - VERIFY(pcb->utun_kpipe_rxring == NULL); - pcb->utun_kpipe_rxring = ring; + VERIFY(pcb->utun_kpipe_rxring[ring_idx] == NULL); + pcb->utun_kpipe_rxring[ring_idx] = ring; } else { - VERIFY(pcb->utun_kpipe_txring == NULL); - pcb->utun_kpipe_txring = ring; + VERIFY(pcb->utun_kpipe_txring[ring_idx] == NULL); + pcb->utun_kpipe_txring[ring_idx] = ring; } return 0; } @@ -3184,12 +3488,19 @@ utun_kpipe_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_ring_t ring) { #pragma unused(nxprov) + bool found = false; struct utun_pcb *__single pcb = kern_nexus_get_context(nexus); - if (pcb->utun_kpipe_rxring == ring) { - pcb->utun_kpipe_rxring = NULL; - } else if (pcb->utun_kpipe_txring == ring) { - pcb->utun_kpipe_txring = NULL; + + for (unsigned int i = 0; i < pcb->utun_kpipe_count; i++) { + if (pcb->utun_kpipe_rxring[i] == ring) { + pcb->utun_kpipe_rxring[i] = NULL; + found = true; + } else if (pcb->utun_kpipe_txring[i] == ring) { + pcb->utun_kpipe_txring[i] = NULL; + found = true; + } } + VERIFY(found); } static errno_t @@ -3207,8 +3518,7 @@ utun_kpipe_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, } lck_rw_lock_shared(&pcb->utun_pcb_lock); - int channel_enabled = pcb->utun_kpipe_enabled; - if (!channel_enabled) { + if (!utun_flag_isset(pcb, UTUN_FLAGS_KPIPE_ALLOCATED)) { lck_rw_unlock_shared(&pcb->utun_pcb_lock); utun_data_move_end(pcb); return 0; @@ -3224,7 +3534,7 @@ utun_kpipe_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, } // Signal the netif ring to read - kern_channel_ring_t __single rx_ring = pcb->utun_netif_rxring; + kern_channel_ring_t __single rx_ring = pcb->utun_netif_rxring[0]; lck_rw_unlock_shared(&pcb->utun_pcb_lock); if (rx_ring != NULL) { kern_channel_notify(rx_ring, 0); @@ -3262,7 +3572,7 @@ utun_kpipe_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, mbuf_ref_t data = NULL; if (length >= UTUN_HEADER_SIZE(pcb) && - !(pcb->utun_flags & UTUN_FLAGS_NO_INPUT)) { + !(pcb->utun_external_flags & UTUN_FLAGS_NO_INPUT)) { errno_t error = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_HEADER, &data); VERIFY(0 == error); error = mbuf_copyback(data, 0, length, tx_baddr, MBUF_WAITOK); @@ -3305,6 +3615,7 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(flags) struct utun_pcb *__single pcb = kern_nexus_get_context(nexus); struct kern_channel_ring_stat_increment rx_ring_stats = {}; + uint8_t ring_idx = 
(uint8_t)(uintptr_t)kern_channel_ring_get_context(rx_ring); if (!utun_data_move_begin(pcb)) { os_log_info(OS_LOG_DEFAULT, "%s: data path stopped for %s\n", @@ -3314,13 +3625,15 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, lck_rw_lock_shared(&pcb->utun_pcb_lock); - int channel_enabled = pcb->utun_kpipe_enabled; - if (!channel_enabled) { + if (!utun_flag_isset(pcb, UTUN_FLAGS_KPIPE_ALLOCATED)) { lck_rw_unlock_shared(&pcb->utun_pcb_lock); utun_data_move_end(pcb); return 0; } + VERIFY(pcb->utun_kpipe_count > 0); + VERIFY(ring_idx <= pcb->utun_kpipe_count); + /* reclaim user-released slots */ (void) kern_channel_reclaim(rx_ring); @@ -3331,304 +3644,206 @@ utun_kpipe_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, return 0; } - if (pcb->utun_use_netif) { - kern_channel_ring_t __single tx_ring = pcb->utun_netif_txring; - if (tx_ring == NULL || - pcb->utun_netif_nexus == NULL) { - // Net-If TX ring not set up yet, nothing to read - lck_rw_unlock_shared(&pcb->utun_pcb_lock); - utun_data_move_end(pcb); - return 0; - } - - struct netif_stats *nifs = &NX_NETIF_PRIVATE(pcb->utun_netif_nexus)->nif_stats; - - // Unlock utun before entering ring + kern_channel_ring_t __single tx_ring = pcb->utun_netif_txring[ring_idx]; + if (tx_ring == NULL || + pcb->utun_netif_nexus == NULL) { + // Net-If TX ring not set up yet, nothing to read lck_rw_unlock_shared(&pcb->utun_pcb_lock); + utun_data_move_end(pcb); + return 0; + } - (void)kr_enter(tx_ring, TRUE); + struct netif_stats *nifs = &NX_NETIF_PRIVATE(pcb->utun_netif_nexus)->nif_stats; - // Lock again after entering and validate - lck_rw_lock_shared(&pcb->utun_pcb_lock); - if (tx_ring != pcb->utun_netif_txring) { - // Ring no longer valid - // Unlock first, then exit ring - lck_rw_unlock_shared(&pcb->utun_pcb_lock); - kr_exit(tx_ring); - utun_data_move_end(pcb); - return 0; - } + // Unlock utun before entering ring + lck_rw_unlock_shared(&pcb->utun_pcb_lock); - struct kern_channel_ring_stat_increment tx_ring_stats; - bzero(&tx_ring_stats, sizeof(tx_ring_stats)); - kern_channel_slot_t tx_pslot = NULL; - kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); - if (tx_slot == NULL) { - // Nothing to read, don't bother signalling - // Unlock first, then exit ring - lck_rw_unlock_shared(&pcb->utun_pcb_lock); - kr_exit(tx_ring); - utun_data_move_end(pcb); - return 0; - } + (void)kr_enter(tx_ring, TRUE); - struct kern_pbufpool *rx_pp = rx_ring->ckr_pp; - VERIFY(rx_pp != NULL); - struct kern_pbufpool *tx_pp = tx_ring->ckr_pp; - VERIFY(tx_pp != NULL); - kern_channel_slot_t rx_pslot = NULL; - kern_channel_slot_t rx_slot = kern_channel_get_next_slot(rx_ring, NULL, NULL); - kern_packet_t tx_chain_ph = 0; + // Lock again after entering and validate + lck_rw_lock_shared(&pcb->utun_pcb_lock); + if (tx_ring != pcb->utun_netif_txring[ring_idx]) { + // Ring no longer valid + // Unlock first, then exit ring + lck_rw_unlock_shared(&pcb->utun_pcb_lock); + kr_exit(tx_ring); + utun_data_move_end(pcb); + return 0; + } - while (rx_slot != NULL && tx_slot != NULL) { - size_t length; - kern_buflet_t rx_buf; - uint8_t *rx_baddr; + struct kern_channel_ring_stat_increment tx_ring_stats; + bzero(&tx_ring_stats, sizeof(tx_ring_stats)); + kern_channel_slot_t tx_pslot = NULL; + kern_channel_slot_t tx_slot = kern_channel_get_next_slot(tx_ring, NULL, NULL); + if (tx_slot == NULL) { + // Nothing to read, don't bother signalling + // Unlock first, then exit ring + lck_rw_unlock_shared(&pcb->utun_pcb_lock); + kr_exit(tx_ring); + 
utun_data_move_end(pcb); + return 0; + } - kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot); + struct kern_pbufpool *rx_pp = rx_ring->ckr_pp; + VERIFY(rx_pp != NULL); + struct kern_pbufpool *tx_pp = tx_ring->ckr_pp; + VERIFY(tx_pp != NULL); + kern_channel_slot_t rx_pslot = NULL; + kern_channel_slot_t rx_slot = kern_channel_get_next_slot(rx_ring, NULL, NULL); + kern_packet_t tx_chain_ph = 0; - /* Skip slot if packet is zero-length or marked as dropped (QUMF_DROPPED) */ - if (tx_ph == 0) { - // Advance TX ring - tx_pslot = tx_slot; - tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL); - continue; - } - (void) kern_channel_slot_detach_packet(tx_ring, tx_slot, tx_ph); - if (tx_chain_ph != 0) { - kern_packet_append(tx_ph, tx_chain_ph); - } - tx_chain_ph = tx_ph; + while (rx_slot != NULL && tx_slot != NULL) { + size_t length; + kern_buflet_t rx_buf; + uint8_t *rx_baddr; + kern_packet_t tx_ph = kern_channel_slot_get_packet(tx_ring, tx_slot); + + /* Skip slot if packet is zero-length or marked as dropped (QUMF_DROPPED) */ + if (tx_ph == 0) { // Advance TX ring tx_pslot = tx_slot; tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL); - - // Allocate rx packet - kern_packet_t rx_ph = 0; - errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); - if (__improbable(error != 0)) { - os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: failed to allocate packet\n", - pcb->utun_ifp->if_xname); - break; - } - - kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL); - VERIFY(tx_buf != NULL); - uint8_t *tx_baddr = __unsafe_forge_bidi_indexable(uint8_t *, - kern_buflet_get_data_address(tx_buf), - kern_buflet_get_data_limit(tx_buf)); - VERIFY(tx_baddr != NULL); - tx_baddr += kern_buflet_get_data_offset(tx_buf); - - bpf_tap_packet_out(pcb->utun_ifp, DLT_RAW, tx_ph, NULL, 0); - - length = MIN(kern_packet_get_data_length(tx_ph) + UTUN_HEADER_SIZE(pcb), - pcb->utun_slot_size); - - tx_ring_stats.kcrsi_slots_transferred++; - tx_ring_stats.kcrsi_bytes_transferred += length; - - if (length < UTUN_HEADER_SIZE(pcb) || - length > pcb->utun_slot_size || - length > PP_BUF_SIZE_DEF(rx_pp) || - (pcb->utun_flags & UTUN_FLAGS_NO_OUTPUT)) { - /* flush data */ - kern_pbufpool_free(rx_pp, rx_ph); - os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: invalid length %zu header_size %zu\n", - pcb->utun_ifp->if_xname, length, UTUN_HEADER_SIZE(pcb)); - STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); - STATS_INC(nifs, NETIF_STATS_DROP); - continue; - } - - /* fillout packet */ - rx_buf = kern_packet_get_next_buflet(rx_ph, NULL); - VERIFY(rx_buf != NULL); - rx_baddr = __unsafe_forge_bidi_indexable(uint8_t *, - kern_buflet_get_data_address(rx_buf), - kern_buflet_get_data_limit(rx_buf)); - VERIFY(rx_baddr != NULL); - - // Find family - uint32_t af = 0; - uint8_t vhl = *(uint8_t *)(tx_baddr); - u_int ip_version = (vhl >> 4); - switch (ip_version) { - case 4: { - af = AF_INET; - break; - } - case 6: { - af = AF_INET6; - break; - } - default: { - os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: unknown ip version %u vhl %u header_size %zu\n", - pcb->utun_ifp->if_xname, ip_version, vhl, UTUN_HEADER_SIZE(pcb)); - break; - } - } - - // Copy header - af = htonl(af); - memcpy(rx_baddr, &af, sizeof(af)); - if (pcb->utun_flags & UTUN_FLAGS_ENABLE_PROC_UUID) { - uuid_t uuid; - kern_packet_get_euuid(tx_ph, uuid); - memcpy(rx_baddr + sizeof(af), uuid, sizeof(uuid)); - } - - // Copy data from tx to rx - memcpy(rx_baddr + UTUN_HEADER_SIZE(pcb), tx_baddr, length - UTUN_HEADER_SIZE(pcb)); - 
kern_packet_clear_flow_uuid(rx_ph); // zero flow id - - /* finalize and attach the packet */ - error = kern_buflet_set_data_offset(rx_buf, 0); - VERIFY(error == 0); - error = kern_buflet_set_data_length(rx_buf, length); - VERIFY(error == 0); - error = kern_packet_finalize(rx_ph); - VERIFY(error == 0); - error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); - VERIFY(error == 0); - - STATS_INC(nifs, NETIF_STATS_TX_PACKETS); - STATS_INC(nifs, NETIF_STATS_TX_COPY_DIRECT); - - rx_ring_stats.kcrsi_slots_transferred++; - rx_ring_stats.kcrsi_bytes_transferred += length; - - rx_pslot = rx_slot; - rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); + continue; } - - if (rx_pslot) { - kern_channel_advance_slot(rx_ring, rx_pslot); - kern_channel_increment_ring_net_stats(rx_ring, pcb->utun_ifp, &rx_ring_stats); - } - + (void) kern_channel_slot_detach_packet(tx_ring, tx_slot, tx_ph); if (tx_chain_ph != 0) { - kern_pbufpool_free_chain(tx_pp, tx_chain_ph); + kern_packet_append(tx_ph, tx_chain_ph); + } + tx_chain_ph = tx_ph; + + // Advance TX ring + tx_pslot = tx_slot; + tx_slot = kern_channel_get_next_slot(tx_ring, tx_slot, NULL); + + // Allocate rx packet + kern_packet_t rx_ph = 0; + errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); + if (__improbable(error != 0)) { + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: failed to allocate packet\n", + pcb->utun_ifp->if_xname); + break; } - if (tx_pslot) { - kern_channel_advance_slot(tx_ring, tx_pslot); - kern_channel_increment_ring_net_stats(tx_ring, pcb->utun_ifp, &tx_ring_stats); - (void)kern_channel_reclaim(tx_ring); + kern_buflet_t tx_buf = kern_packet_get_next_buflet(tx_ph, NULL); + VERIFY(tx_buf != NULL); + uint8_t *tx_baddr = __unsafe_forge_bidi_indexable(uint8_t *, + kern_buflet_get_data_address(tx_buf), + kern_buflet_get_data_limit(tx_buf)); + VERIFY(tx_baddr != NULL); + tx_baddr += kern_buflet_get_data_offset(tx_buf); + + bpf_tap_packet_out(pcb->utun_ifp, DLT_RAW, tx_ph, NULL, 0); + + length = MIN(kern_packet_get_data_length(tx_ph) + UTUN_HEADER_SIZE(pcb), + pcb->utun_slot_size); + + tx_ring_stats.kcrsi_slots_transferred++; + tx_ring_stats.kcrsi_bytes_transferred += length; + + if (length < UTUN_HEADER_SIZE(pcb) || + length > pcb->utun_slot_size || + length > PP_BUF_SIZE_DEF(rx_pp) || + (pcb->utun_external_flags & UTUN_FLAGS_NO_OUTPUT)) { + /* flush data */ + kern_pbufpool_free(rx_pp, rx_ph); + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: invalid length %zu header_size %zu\n", + pcb->utun_ifp->if_xname, length, UTUN_HEADER_SIZE(pcb)); + STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); + STATS_INC(nifs, NETIF_STATS_DROP); + continue; } - /* just like utun_ctl_rcvd(), always reenable output */ - errno_t error = ifnet_enable_output(pcb->utun_ifp); - if (error != 0) { - os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error); + /* fillout packet */ + rx_buf = kern_packet_get_next_buflet(rx_ph, NULL); + VERIFY(rx_buf != NULL); + rx_baddr = __unsafe_forge_bidi_indexable(uint8_t *, + kern_buflet_get_data_address(rx_buf), + kern_buflet_get_data_limit(rx_buf)); + VERIFY(rx_baddr != NULL); + + // Find family + uint32_t af = 0; + uint8_t vhl = *(uint8_t *)(tx_baddr); + u_int ip_version = (vhl >> 4); + switch (ip_version) { + case 4: { + af = AF_INET; + break; + } + case 6: { + af = AF_INET6; + break; + } + default: { + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: unknown ip version %u vhl %u header_size %zu\n", + pcb->utun_ifp->if_xname, ip_version, vhl, 
UTUN_HEADER_SIZE(pcb)); + break; + } } - // Unlock first, then exit ring - lck_rw_unlock_shared(&pcb->utun_pcb_lock); - - if (tx_pslot != NULL) { - kern_channel_notify(tx_ring, 0); + // Copy header + af = htonl(af); + memcpy(rx_baddr, &af, sizeof(af)); + if (pcb->utun_external_flags & UTUN_FLAGS_ENABLE_PROC_UUID) { + uuid_t uuid; + kern_packet_get_euuid(tx_ph, uuid); + memcpy(rx_baddr + sizeof(af), uuid, sizeof(uuid)); } - kr_exit(tx_ring); - } else { - lck_rw_unlock_shared(&pcb->utun_pcb_lock); - uint32_t mb_cnt = 0; - uint32_t mb_len = 0; - mbuf_ref_t mb_head = NULL; - mbuf_ref_t mb_tail = NULL; + // Copy data from tx to rx + memcpy(rx_baddr + UTUN_HEADER_SIZE(pcb), tx_baddr, length - UTUN_HEADER_SIZE(pcb)); + kern_packet_clear_flow_uuid(rx_ph); // zero flow id - if (ifnet_dequeue_multi(pcb->utun_ifp, avail, &mb_head, - &mb_tail, &mb_cnt, &mb_len) != 0) { - utun_data_move_end(pcb); - return 0; - } - VERIFY(mb_cnt <= avail); + /* finalize and attach the packet */ + error = kern_buflet_set_data_offset(rx_buf, 0); + VERIFY(error == 0); + error = kern_buflet_set_data_length(rx_buf, length); + VERIFY(error == 0); + error = kern_packet_finalize(rx_ph); + VERIFY(error == 0); + error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); + VERIFY(error == 0); - struct kern_pbufpool *rx_pp = rx_ring->ckr_pp; - VERIFY(rx_pp != NULL); - kern_channel_slot_t rx_pslot = NULL; - kern_channel_slot_t rx_slot = kern_channel_get_next_slot(rx_ring, NULL, NULL); - while (rx_slot) { - size_t length = 0; - mbuf_t data = NULL; - if ((data = mb_head) == NULL) { - VERIFY(mb_cnt == 0); - break; - } - mb_head = mbuf_nextpkt(mb_head); - mbuf_setnextpkt(data, NULL); - VERIFY(mb_cnt != 0); - --mb_cnt; - length = mbuf_pkthdr_len(data); - if (length < UTUN_HEADER_SIZE(pcb) || - length > pcb->utun_slot_size || - (pcb->utun_flags & UTUN_FLAGS_NO_OUTPUT)) { - /* flush data */ - mbuf_freem(data); - continue; - } - bpf_tap_out(pcb->utun_ifp, DLT_NULL, data, 0, 0); + STATS_INC(nifs, NETIF_STATS_TX_PACKETS); + STATS_INC(nifs, NETIF_STATS_TX_COPY_DIRECT); - // Allocate rx packet - kern_packet_t rx_ph = 0; - errno_t error = kern_pbufpool_alloc_nosleep(rx_pp, 1, &rx_ph); - if (__improbable(error != 0)) { - os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx %s: failed to allocate packet\n", - pcb->utun_ifp->if_xname); - break; - } + rx_ring_stats.kcrsi_slots_transferred++; + rx_ring_stats.kcrsi_bytes_transferred += length; - /* - * The ABI requires the protocol in network byte order - */ - *mtod(data, uint32_t*) = htonl(*mtod(data, uint32_t *)); - - // Fillout rx packet - kern_buflet_t rx_buf = kern_packet_get_next_buflet(rx_ph, NULL); - VERIFY(rx_buf != NULL); - void *rx_baddr = __unsafe_forge_bidi_indexable(void *, - kern_buflet_get_data_address(rx_buf), - kern_buflet_get_data_limit(rx_buf)); - VERIFY(rx_baddr != NULL); - - // Copy-in data from mbuf to buflet - mbuf_copydata(data, 0, length, rx_baddr); - kern_packet_clear_flow_uuid(rx_ph); // Zero flow id - - // Finalize and attach the packet - error = kern_buflet_set_data_offset(rx_buf, 0); - VERIFY(error == 0); - error = kern_buflet_set_data_length(rx_buf, length); - VERIFY(error == 0); - error = kern_packet_finalize(rx_ph); - VERIFY(error == 0); - error = kern_channel_slot_attach_packet(rx_ring, rx_slot, rx_ph); - VERIFY(error == 0); - - rx_ring_stats.kcrsi_slots_transferred++; - rx_ring_stats.kcrsi_bytes_transferred += length; - - if (!pcb->utun_ext_ifdata_stats) { - ifnet_stat_increment_out(pcb->utun_ifp, 1, length, 0); - } - - mbuf_freem(data); - - rx_pslot = rx_slot; - 
rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); - } - if (rx_pslot) { - kern_channel_advance_slot(rx_ring, rx_pslot); - kern_channel_increment_ring_stats(rx_ring, &rx_ring_stats); - } - if (mb_head != NULL) { - VERIFY(mb_cnt != 0); - mbuf_freem_list(mb_head); - } + rx_pslot = rx_slot; + rx_slot = kern_channel_get_next_slot(rx_ring, rx_slot, NULL); } + if (rx_pslot) { + kern_channel_advance_slot(rx_ring, rx_pslot); + kern_channel_increment_ring_net_stats(rx_ring, pcb->utun_ifp, &rx_ring_stats); + } + + if (tx_chain_ph != 0) { + kern_pbufpool_free_chain(tx_pp, tx_chain_ph); + } + + if (tx_pslot) { + kern_channel_advance_slot(tx_ring, tx_pslot); + kern_channel_increment_ring_net_stats(tx_ring, pcb->utun_ifp, &tx_ring_stats); + (void)kern_channel_reclaim(tx_ring); + } + + /* just like utun_ctl_rcvd(), always reenable output */ + errno_t error = ifnet_enable_output(pcb->utun_ifp); + if (error != 0) { + os_log_error(OS_LOG_DEFAULT, "utun_kpipe_sync_rx: ifnet_enable_output returned error %d\n", error); + } + + // Unlock first, then exit ring + lck_rw_unlock_shared(&pcb->utun_pcb_lock); + + if (tx_pslot != NULL) { + kern_channel_notify(tx_ring, 0); + } + kr_exit(tx_ring); + utun_data_move_end(pcb); return 0; } diff --git a/bsd/net/if_utun.h b/bsd/net/if_utun.h index 65cdb6f8a..8c101c986 100644 --- a/bsd/net/if_utun.h +++ b/bsd/net/if_utun.h @@ -74,6 +74,8 @@ boolean_t utun_interface_needs_netagent(ifnet_t interface); #define UTUN_OPT_KPIPE_TX_RING_SIZE 25 /* Must be set before connecting */ #define UTUN_OPT_KPIPE_RX_RING_SIZE 26 /* Must be set before connecting */ #define UTUN_OPT_ATTACH_FLOWSWITCH 27 /* Must be set before connecting */ +#define UTUN_OPT_CHANNEL_BIND_UUID 28 /* Must be set before connecting */ +#define UTUN_OPT_CHANNEL_BIND_PID 29 /* Must be set before connecting */ /* * Flags for by UTUN_OPT_FLAGS diff --git a/bsd/net/if_var_private.h b/bsd/net/if_var_private.h index ab1874227..0952a8ddf 100644 --- a/bsd/net/if_var_private.h +++ b/bsd/net/if_var_private.h @@ -611,6 +611,7 @@ extern bool management_data_unrestricted; extern bool management_control_unrestricted; extern bool if_management_interface_check_needed; extern int if_management_verbose; +extern int if_ultra_constrained_default_allowed; extern bool if_ultra_constrained_check_needed; #endif /* BSD_KERNEL_PRIVATE */ @@ -718,10 +719,9 @@ struct ifnet { decl_lck_mtx_data(, if_ref_lock); u_int32_t if_refflags; /* see IFRF flags below */ - u_int32_t if_refio; /* number of io ops to the underlying driver */ + os_refcnt_t if_refio; /* number of io ops to the underlying driver */ u_int32_t if_threads_pending; /* Threads created but waiting for first run */ - u_int32_t if_datamov; /* number of threads moving data */ - u_int32_t if_drainers; /* number of draining threads */ + os_ref_atomic_t if_datamov; /* number of threads moving data */ u_int32_t if_suspend; /* number of suspend requests */ #define if_list if_link @@ -1043,10 +1043,13 @@ struct ifnet { uint8_t network_id[IFNET_NETWORK_ID_LEN]; uint8_t network_id_len; + uint8_t if_l4s_mode; /* L4S capability on an interface */ + atomic_bool if_mcast_add_signaled; atomic_bool if_mcast_del_signaled; - uint32_t if_traffic_rule_count; + uint32_t if_inet_traffic_rule_count; + uint32_t if_eth_traffic_rule_count; uint32_t if_traffic_rule_genid; /* @@ -1094,14 +1097,20 @@ EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); #define IFRF_DETACHING 0x4 /* detach has been requested */ #define IFRF_READY 0x8 /* data path is ready */ -#define IFRF_ATTACH_MASK \ - 
(IFRF_EMBRYONIC|IFRF_ATTACHED|IFRF_DETACHING) +#define IFRF_ATTACH_MASK (IFRF_EMBRYONIC|IFRF_ATTACHED|IFRF_DETACHING) -#define IF_FULLY_ATTACHED(_ifp) \ - (((_ifp)->if_refflags & IFRF_ATTACH_MASK) == IFRF_ATTACHED) +static inline bool +ifnet_is_fully_attached(const struct ifnet *ifp) +{ + return (ifp->if_refflags & IFRF_ATTACH_MASK) == IFRF_ATTACHED; +} + +static inline bool +ifnet_is_attached_and_ready(const struct ifnet *ifp) +{ + return ifp->if_refflags == (IFRF_ATTACHED | IFRF_READY); +} -#define IF_FULLY_ATTACHED_AND_READY(_ifp) \ - (IF_FULLY_ATTACHED(_ifp) && ((_ifp)->if_refflags & IFRF_READY)) /* * Valid values for if_start_flags */ @@ -1212,7 +1221,7 @@ struct if_clone { */ struct ifaddr { decl_lck_mtx_data(, ifa_lock); /* lock for ifaddr */ - os_ref_atomic_t ifa_refcnt; /* ref count, use IFA_{ADD,REM}REF */ + os_refcnt_t ifa_refcnt; /* ref count, use IFA_{ADD,REM}REF */ uint32_t ifa_debug; /* debug flags */ struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ @@ -1266,12 +1275,10 @@ struct ifaddr { #define IFA_UNLOCK(_ifa) \ lck_mtx_unlock(&(_ifa)->ifa_lock) -os_refgrp_decl(static, ifa_refgrp, "ifa refcounts", NULL); - static inline void ifa_addref(struct ifaddr *ifa) { - os_ref_retain_raw(&ifa->ifa_refcnt, &ifa_refgrp); + os_ref_retain(&ifa->ifa_refcnt); } __private_extern__ void ifa_deallocated(struct ifaddr *ifa); @@ -1280,7 +1287,7 @@ static inline void ifa_remref(struct ifaddr *ifa) { /* We can use _relaxed, because if we hit 0 we make sure the lock is held */ - if (os_ref_release_raw_relaxed(&ifa->ifa_refcnt, &ifa_refgrp) == 0) { + if (os_ref_release_relaxed(&ifa->ifa_refcnt) == 0) { ifa_deallocated(ifa); } } @@ -1420,23 +1427,28 @@ struct ifmultiaddr { !((_ifp)->if_eflags & IFEF_AWDL) && \ !((_ifp)->if_xflags & IFXF_LOW_LATENCY)) +/* + * Indicate whether or not the immediate WiFi interface is on an AWDL link + */ +#define IFNET_IS_WIFI_AWDL(_ifp) \ + ((_ifp)->if_family == IFNET_FAMILY_ETHERNET && \ + (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI && \ + ((_ifp)->if_eflags & IFEF_AWDL)) + + /* * Indicate whether or not the immediate interface is a companion link * interface. */ #define IFNET_IS_COMPANION_LINK(_ifp) \ - ((_ifp)->if_family == IFNET_FAMILY_IPSEC && \ - ((_ifp)->if_subfamily == IFNET_SUBFAMILY_BLUETOOTH || \ - (_ifp)->if_subfamily == IFNET_SUBFAMILY_WIFI || \ - (_ifp)->if_subfamily == IFNET_SUBFAMILY_QUICKRELAY || \ - (_ifp)->if_subfamily == IFNET_SUBFAMILY_DEFAULT)) + ((_ifp)->if_xflags & IFXF_IS_COMPANIONLINK) /* * Indicate whether or not the immediate interface is a companion link * interface using Bluetooth */ #define IFNET_IS_COMPANION_LINK_BLUETOOTH(_ifp) \ - ((_ifp)->if_family == IFNET_FAMILY_IPSEC && \ + (IFNET_IS_COMPANION_LINK(_ifp) && \ (_ifp)->if_subfamily == IFNET_SUBFAMILY_BLUETOOTH) /* @@ -1468,7 +1480,10 @@ struct ifmultiaddr { (_ifp)->if_delegated.ultra_constrained) #define IFNET_IS_VPN(_ifp) \ - ((_ifp)->if_xflags & IFXF_IS_VPN) \ + ((_ifp)->if_xflags & IFXF_IS_VPN) + +#define IFNET_IS_LOOPBACK(_ifp) \ + ((_ifp)->if_flags & IFF_LOOPBACK) /* * We don't support AWDL interface delegation. 
@@ -1500,6 +1515,12 @@ struct ifmultiaddr { (_ifp)->if_family == IFNET_FAMILY_CELLULAR) && \ (_ifp)->if_subfamily == IFNET_SUBFAMILY_REDIRECT) +/* + * Indicate whether or not the interface requires joining cellular thread group + */ +#define IFNET_REQUIRES_CELL_GROUP(_ifp) \ + ((_ifp)->if_family == IFNET_FAMILY_CELLULAR && \ + ((_ifp)->if_xflags & IFXF_REQUIRE_CELL_THREAD_GROUP)) extern int if_index; extern int if_indexcount; @@ -1535,10 +1556,9 @@ extern struct ifnet *ifunit(const char *); extern struct ifnet *ifunit_ref(const char *); extern int ifunit_extract(const char *src, char *__counted_by(dstlen)dst, size_t dstlen, int *unit); extern struct ifnet *if_withname(struct sockaddr *); -extern void if_qflush(struct ifnet *, struct ifclassq *, bool); -extern void if_qflush_snd(struct ifnet *, bool); +extern void if_qflush(struct ifnet *, struct ifclassq *); extern void if_qflush_sc(struct ifnet *, mbuf_svc_class_t, u_int32_t, - u_int32_t *, u_int32_t *, int); + u_int32_t *, u_int32_t *); extern struct if_clone *if_clone_lookup(const char *__counted_by(namelen) name, size_t namelen, u_int32_t *); extern int if_clone_attach(struct if_clone *); @@ -1550,6 +1570,8 @@ extern errno_t if_mcasts_update(struct ifnet *); extern const char *intf_event2str(int); +extern uint32_t if_get_driver_hwassist(struct ifnet *ifp); + typedef enum { IFNET_LCK_ASSERT_EXCLUSIVE, /* RW: held as writer */ IFNET_LCK_ASSERT_SHARED, /* RW: held as reader */ @@ -1597,14 +1619,13 @@ __private_extern__ void ifnet_head_assert_exclusive(void); __private_extern__ errno_t ifnet_set_idle_flags_locked(ifnet_t, u_int32_t, u_int32_t); -__private_extern__ int ifnet_is_attached(struct ifnet *, int refio); +__private_extern__ int ifnet_get_ioref(struct ifnet *ifp); __private_extern__ void ifnet_incr_pending_thread_count(struct ifnet *); __private_extern__ void ifnet_decr_pending_thread_count(struct ifnet *); __private_extern__ void ifnet_incr_iorefcnt(struct ifnet *); __private_extern__ void ifnet_decr_iorefcnt(struct ifnet *); __private_extern__ boolean_t ifnet_datamov_begin(struct ifnet *); __private_extern__ void ifnet_datamov_end(struct ifnet *); -__private_extern__ void ifnet_datamov_suspend(struct ifnet *); __private_extern__ boolean_t ifnet_datamov_suspend_if_needed(struct ifnet *); __private_extern__ void ifnet_datamov_drain(struct ifnet *); __private_extern__ void ifnet_datamov_suspend_and_drain(struct ifnet *); @@ -1707,7 +1728,6 @@ __private_extern__ void if_get_state(struct ifnet *, __private_extern__ errno_t if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe); __private_extern__ void if_lqm_update(struct ifnet *, int32_t, int); -__private_extern__ void ifnet_update_sndq(struct ifclassq *, cqev_t); __private_extern__ void ifnet_update_rcv(struct ifnet *, cqev_t); __private_extern__ void ifnet_flowadv(uint32_t); @@ -1715,14 +1735,14 @@ __private_extern__ void ifnet_flowadv(uint32_t); __private_extern__ errno_t ifnet_set_input_bandwidths(struct ifnet *, struct if_bandwidths *); __private_extern__ errno_t ifnet_set_output_bandwidths(struct ifnet *, - struct if_bandwidths *, boolean_t); + struct if_bandwidths *); __private_extern__ u_int64_t ifnet_output_linkrate(struct ifnet *); __private_extern__ u_int64_t ifnet_input_linkrate(struct ifnet *); __private_extern__ errno_t ifnet_set_input_latencies(struct ifnet *, struct if_latencies *); __private_extern__ errno_t ifnet_set_output_latencies(struct ifnet *, - struct if_latencies *, boolean_t); + struct if_latencies *); __private_extern__ void 
ifnet_clear_netagent(uuid_t); @@ -1789,15 +1809,11 @@ __private_extern__ int ifnet_enqueue_netem(void *handle, pktsched_pkt_t *__sized_by(n_pkts) pkts, uint32_t n_pkts); #if SKYWALK struct __kern_packet; -extern errno_t ifnet_enqueue_pkt(struct ifnet *, +extern errno_t ifnet_enqueue_pkt(struct ifnet *, struct ifclassq *ifcq, struct __kern_packet *, boolean_t, boolean_t *); -extern errno_t ifnet_enqueue_ifcq_pkt(struct ifnet *, struct ifclassq *, - struct __kern_packet *, boolean_t, boolean_t *); -extern errno_t ifnet_enqueue_pkt_chain(struct ifnet *, struct __kern_packet *, - struct __kern_packet *, uint32_t, uint32_t, boolean_t, boolean_t *); -extern errno_t ifnet_enqueue_ifcq_pkt_chain(struct ifnet *, struct ifclassq *, - struct __kern_packet *, struct __kern_packet *, uint32_t, uint32_t, boolean_t, - boolean_t *); +extern errno_t ifnet_enqueue_pkt_chain(struct ifnet *, struct ifclassq *, + struct __kern_packet *, struct __kern_packet *, uint32_t, uint32_t, + boolean_t, boolean_t *); extern errno_t ifnet_set_output_handler(struct ifnet *, ifnet_output_func); extern void ifnet_reset_output_handler(struct ifnet *); extern errno_t ifnet_set_start_handler(struct ifnet *, ifnet_start_func); @@ -1818,7 +1834,8 @@ extern u_int32_t if_clear_xflags(ifnet_t, u_int32_t); extern boolean_t sa_equal(const struct sockaddr *, const struct sockaddr *); extern void ifnet_update_traffic_rule_genid(struct ifnet *); extern boolean_t ifnet_sync_traffic_rule_genid(struct ifnet *, uint32_t *); -extern void ifnet_update_traffic_rule_count(struct ifnet *, uint32_t); +extern void ifnet_update_inet_traffic_rule_count(struct ifnet *, uint32_t); +extern void ifnet_update_eth_traffic_rule_count(struct ifnet *, uint32_t); extern bool if_update_link_heuristic(struct ifnet *); diff --git a/bsd/net/if_var_status.h b/bsd/net/if_var_status.h index 0cbd65406..60a53b62a 100644 --- a/bsd/net/if_var_status.h +++ b/bsd/net/if_var_status.h @@ -66,6 +66,7 @@ #include #include +#include #pragma pack(4) @@ -603,7 +604,7 @@ struct ifnet_interface_advisory { #pragma pack(push, 1) /* Supported types */ -/* Reserving 1 for link layer */ +#define IFNET_TRAFFIC_DESCRIPTOR_TYPE_ETH 1 #define IFNET_TRAFFIC_DESCRIPTOR_TYPE_INET 2 /* Supported flags */ @@ -617,6 +618,16 @@ struct ifnet_traffic_descriptor_common { uint32_t itd_flags; }; +#define IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE 0x01 +#define IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR 0x02 + +struct ifnet_traffic_descriptor_eth { + struct ifnet_traffic_descriptor_common eth_common; + ether_addr_t eth_raddr; + uint16_t eth_type; + uint8_t eth_mask; +}; + #define IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER 0x01 #define IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO 0x02 #define IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR 0x04 @@ -656,6 +667,14 @@ struct ifnet_traffic_rule_action_steer { struct ifnet_traffic_rule_action ras_common; uint64_t ras_qset_id; }; + +#if KERNEL_PRIVATE +typedef enum { + RX_FLOW_STEERING_ACTION_ADD_AOP = 1, + RX_FLOW_STEERING_ACTION_REMOVE_AOP = 2, +} rx_flow_steering_action_t; +#endif /* KERNEL_PRIVATE */ + #pragma pack(pop) #pragma pack() diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index 1c4ea5194..cffc06448 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1968,7 +1968,7 @@ vlan_event(struct ifnet * p, __unused protocol_family_t protocol, static errno_t vlan_detached(ifnet_t p, __unused protocol_family_t protocol) { - if (ifnet_is_attached(p, 0) == 0) { + if (!ifnet_is_fully_attached(p)) { /* if the parent isn't attached, remove all VLANs */ 
vlan_parent_remove_all_vlans(p); } diff --git a/bsd/net/iptap.c b/bsd/net/iptap.c index f340fb5b3..f434ceb1f 100644 --- a/bsd/net/iptap.c +++ b/bsd/net/iptap.c @@ -599,7 +599,7 @@ iptap_bpf_tap(struct mbuf *m, u_int32_t proto, int outgoing) struct ifnet *ifp = outgoing ? NULL : m->m_pkthdr.rcvif; /* Verify the structure is packed */ - _CASSERT(sizeof(hdr_buffer) == sizeof(struct pktap_header) + sizeof(u_int32_t)); + static_assert(sizeof(hdr_buffer) == sizeof(struct pktap_header) + sizeof(u_int32_t)); bzero(&hdr_buffer, sizeof(hdr_buffer)); hdr->pth_length = sizeof(struct pktap_header); diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c index a62834174..0e885593a 100644 --- a/bsd/net/kpi_interface.c +++ b/bsd/net/kpi_interface.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -281,7 +282,8 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, } einit.output = NULL; - if (einit.output_sched_model >= IFNET_SCHED_MODEL_MAX) { + if (!IFNET_MODEL_IS_VALID(einit.output_sched_model)) { + panic("wrong model %u", einit.output_sched_model); return EINVAL; } @@ -568,6 +570,11 @@ ifnet_allocate_extended(const struct ifnet_init_eparams *einit0, if_management_interface_check_needed = true; } + /* + * Set the default inband wake packet tagging for the interface family + */ + init_inband_wake_pkt_tagging_for_family(ifp); + /* * Increment the generation count on interface creation */ @@ -835,7 +842,7 @@ ifnet_set_idle_flags_locked(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) * be done at attach time. Otherwise, if it is called after * ifnet detach, then it is a no-op. */ - if (!ifnet_is_attached(ifp, 0)) { + if (!ifnet_is_fully_attached(ifp)) { ifp->if_idle_new_flags = new_flags; ifp->if_idle_new_flags_mask = mask; return 0; @@ -875,7 +882,7 @@ ifnet_set_link_quality(ifnet_t ifp, int quality) goto done; } - if (!ifnet_is_attached(ifp, 0)) { + if (!ifnet_is_fully_attached(ifp)) { err = ENXIO; goto done; } @@ -913,7 +920,7 @@ ifnet_set_interface_state(ifnet_t ifp, goto done; } - if (!ifnet_is_attached(ifp, 0)) { + if (!ifnet_is_fully_attached(ifp)) { err = ENXIO; goto done; } @@ -935,7 +942,7 @@ ifnet_get_interface_state(ifnet_t ifp, goto done; } - if (!ifnet_is_attached(ifp, 0)) { + if (!ifnet_is_fully_attached(ifp)) { err = ENXIO; goto done; } @@ -1189,9 +1196,7 @@ ifnet_set_tso_mtu(ifnet_t interface, sa_family_t family, u_int32_t mtuLen) struct ifclassq *ifq = interface->if_snd; ASSERT(ifq != NULL); /* Inform all transmit queues about the new TSO MTU */ - IFCQ_LOCK(ifq); - ifnet_update_sndq(ifq, CLASSQ_EV_LINK_MTU); - IFCQ_UNLOCK(ifq); + ifclassq_update(ifq, CLASSQ_EV_LINK_MTU, false); } return error; @@ -1478,7 +1483,7 @@ ifnet_set_bandwidths(struct ifnet *ifp, struct if_bandwidths *output_bw, } if (output_bw != NULL) { - (void) ifnet_set_output_bandwidths(ifp, output_bw, FALSE); + (void) ifnet_set_output_bandwidths(ifp, output_bw); } return 0; @@ -1508,8 +1513,7 @@ ifnet_set_link_status_outbw(struct ifnet *ifp) } errno_t -ifnet_set_output_bandwidths(struct ifnet *ifp, struct if_bandwidths *bw, - boolean_t locked) +ifnet_set_output_bandwidths(struct ifnet *ifp, struct if_bandwidths *bw) { struct if_bandwidths old_bw; struct ifclassq *ifq; @@ -1518,10 +1522,7 @@ ifnet_set_output_bandwidths(struct ifnet *ifp, struct if_bandwidths *bw, VERIFY(ifp != NULL && bw != NULL); ifq = ifp->if_snd; - if (!locked) { - IFCQ_LOCK(ifq); - } - IFCQ_LOCK_ASSERT_HELD(ifq); + IFCQ_LOCK(ifq); old_bw = ifp->if_output_bw; if (bw->eff_bw != 0) { @@ -1545,12 
+1546,9 @@ ifnet_set_output_bandwidths(struct ifnet *ifp, struct if_bandwidths *bw, /* Adjust queue parameters if needed */ if (old_bw.eff_bw != ifp->if_output_bw.eff_bw || old_bw.max_bw != ifp->if_output_bw.max_bw) { - ifnet_update_sndq(ifq, CLASSQ_EV_LINK_BANDWIDTH); - } - - if (!locked) { - IFCQ_UNLOCK(ifq); + ifclassq_update(ifq, CLASSQ_EV_LINK_BANDWIDTH, true); } + IFCQ_UNLOCK(ifq); /* * If this is a Wifi interface, update the values in @@ -1673,7 +1671,7 @@ ifnet_set_latencies(struct ifnet *ifp, struct if_latencies *output_lt, } if (output_lt != NULL) { - (void) ifnet_set_output_latencies(ifp, output_lt, FALSE); + (void) ifnet_set_output_latencies(ifp, output_lt); } if (input_lt != NULL) { @@ -1684,8 +1682,7 @@ ifnet_set_latencies(struct ifnet *ifp, struct if_latencies *output_lt, } errno_t -ifnet_set_output_latencies(struct ifnet *ifp, struct if_latencies *lt, - boolean_t locked) +ifnet_set_output_latencies(struct ifnet *ifp, struct if_latencies *lt) { struct if_latencies old_lt; struct ifclassq *ifq; @@ -1693,10 +1690,7 @@ ifnet_set_output_latencies(struct ifnet *ifp, struct if_latencies *lt, VERIFY(ifp != NULL && lt != NULL); ifq = ifp->if_snd; - if (!locked) { - IFCQ_LOCK(ifq); - } - IFCQ_LOCK_ASSERT_HELD(ifq); + IFCQ_LOCK(ifq); old_lt = ifp->if_output_lt; if (lt->eff_lt != 0) { @@ -1714,12 +1708,9 @@ ifnet_set_output_latencies(struct ifnet *ifp, struct if_latencies *lt, /* Adjust queue parameters if needed */ if (old_lt.eff_lt != ifp->if_output_lt.eff_lt || old_lt.max_lt != ifp->if_output_lt.max_lt) { - ifnet_update_sndq(ifq, CLASSQ_EV_LINK_LATENCY); - } - - if (!locked) { - IFCQ_UNLOCK(ifq); + ifclassq_update(ifq, CLASSQ_EV_LINK_LATENCY, true); } + IFCQ_UNLOCK(ifq); return 0; } @@ -1777,7 +1768,7 @@ ifnet_set_poll_params(struct ifnet *ifp, struct ifnet_poll_params *p) if (ifp == NULL) { return EINVAL; - } else if (!ifnet_is_attached(ifp, 1)) { + } else if (!ifnet_get_ioref(ifp)) { return ENXIO; } @@ -1803,7 +1794,7 @@ ifnet_poll_params(struct ifnet *ifp, struct ifnet_poll_params *p) if (ifp == NULL || p == NULL) { return EINVAL; - } else if (!ifnet_is_attached(ifp, 1)) { + } else if (!ifnet_get_ioref(ifp)) { return ENXIO; } @@ -2330,7 +2321,7 @@ ifnet_lladdr_copy_bytes_internal(ifnet_t interface, void *__sized_by(lladdr_len) * Make sure to accomodate the largest possible * size of SA(if_lladdr)->sa_len. */ - _CASSERT(sizeof(sdlbuf) == (SOCK_MAXADDRLEN + 1)); + static_assert(sizeof(sdlbuf) == (SOCK_MAXADDRLEN + 1)); if (interface == NULL || lladdr == NULL) { return EINVAL; @@ -3283,7 +3274,7 @@ ifnet_set_delegate(ifnet_t ifp, ifnet_t delegated_ifp) if (ifp == NULL) { return EINVAL; - } else if (!ifnet_is_attached(ifp, 1)) { + } else if (!ifnet_get_ioref(ifp)) { return ENXIO; } @@ -3307,8 +3298,6 @@ ifnet_set_delegate(ifnet_t ifp, ifnet_t delegated_ifp) } bzero(&ifp->if_delegated, sizeof(ifp->if_delegated)); if (delegated_ifp != NULL && ifp != delegated_ifp) { - uint32_t set_eflags; - ifp->if_delegated.ifp = delegated_ifp; ifnet_reference(delegated_ifp); ifp->if_delegated.type = delegated_ifp->if_type; @@ -3321,13 +3310,6 @@ ifnet_set_delegate(ifnet_t ifp, ifnet_t delegated_ifp) ifp->if_delegated.ultra_constrained = delegated_ifp->if_xflags & IFXF_ULTRA_CONSTRAINED ? 
1 : 0; - /* - * Propogate flags related to ECN from delegated interface - */ - if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE); - set_eflags = (delegated_ifp->if_eflags & - (IFEF_ECN_ENABLE | IFEF_ECN_DISABLE)); - if_set_eflags(ifp, set_eflags); printf("%s: is now delegating %s (type 0x%x, family %u, " "sub-family %u)\n", ifp->if_xname, delegated_ifp->if_xname, delegated_ifp->if_type, delegated_ifp->if_family, @@ -3359,7 +3341,7 @@ ifnet_get_delegate(ifnet_t ifp, ifnet_t *pdelegated_ifp) { if (ifp == NULL || pdelegated_ifp == NULL) { return EINVAL; - } else if (!ifnet_is_attached(ifp, 1)) { + } else if (!ifnet_get_ioref(ifp)) { return ENXIO; } @@ -3477,7 +3459,7 @@ ifnet_link_status_report(ifnet_t ifp, const void *__sized_by(buffer_len) buffer, * Make sure that the interface is attached but there is no need * to take a reference because this call is coming from the driver. */ - if (!ifnet_is_attached(ifp, 0)) { + if (!ifnet_is_fully_attached(ifp)) { ifnet_lock_done(ifp); return ENXIO; } @@ -3652,7 +3634,7 @@ ifnet_get_unsent_bytes(ifnet_t interface, int64_t *unsent_bytes) bytes = *unsent_bytes = 0; - if (!IF_FULLY_ATTACHED(interface)) { + if (!ifnet_is_fully_attached(interface)) { return ENXIO; } @@ -3675,7 +3657,7 @@ ifnet_get_buffer_status(const ifnet_t ifp, ifnet_buffer_status_t *buf_status) bzero(buf_status, sizeof(*buf_status)); - if (!IF_FULLY_ATTACHED(ifp)) { + if (!ifnet_is_fully_attached(ifp)) { return ENXIO; } @@ -3697,7 +3679,7 @@ ifnet_normalise_unsent_data(void) ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { ifnet_lock_exclusive(ifp); - if (!IF_FULLY_ATTACHED(ifp)) { + if (!ifnet_is_fully_attached(ifp)) { ifnet_lock_done(ifp); continue; } @@ -3736,3 +3718,46 @@ ifnet_get_low_power_mode(ifnet_t ifp, boolean_t *on) *on = ((ifp->if_xflags & IFXF_LOW_POWER) != 0); return 0; } + +errno_t +ifnet_set_rx_flow_steering(ifnet_t ifp, boolean_t on) +{ + errno_t error = 0; + + if (ifp == NULL) { + return EINVAL; + } + + if (on) { + error = if_set_xflags(ifp, IFXF_RX_FLOW_STEERING); + } else { + if_clear_xflags(ifp, IFXF_RX_FLOW_STEERING); + } + + return error; +} + +errno_t +ifnet_get_rx_flow_steering(ifnet_t ifp, boolean_t *on) +{ + if (ifp == NULL || on == NULL) { + return EINVAL; + } + + *on = ((ifp->if_xflags & IFXF_RX_FLOW_STEERING) != 0); + return 0; +} + +void +ifnet_enable_cellular_thread_group(ifnet_t ifp) +{ + VERIFY(ifp != NULL); + + /* This function can only be called when the ifp is just created and + * not yet attached. + */ + VERIFY(ifp->if_inp == NULL); + VERIFY(ifp->if_refflags & IFRF_EMBRYONIC); + + if_set_xflags(ifp, IFXF_REQUIRE_CELL_THREAD_GROUP); +} diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index 9801ab91d..6d528010c 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -3910,6 +3910,64 @@ extern errno_t ifnet_set_congested_link(ifnet_t interface, boolean_t on); */ extern errno_t ifnet_get_congested_link(ifnet_t interface, boolean_t *on); +/*! + * @function ifnet_set_rx_flow_steering + * @param interface The interface. + * @param on Set the truth value that the interface supports Rx flow steering. + * @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_set_rx_flow_steering(ifnet_t interface, boolean_t on); + +/*! + * @function ifnet_get_rx_flow_steering + * @param interface The interface. + * @param on On output contains the truth value that the interface + * supports Rx flow steering + * @result Returns 0 on success, error number otherwise. 
+ */ +extern errno_t ifnet_get_rx_flow_steering(ifnet_t interface, boolean_t *on); + +/*! + * @function ifnet_set_inband_wake_packet_tagging + * @param interface The interface. + * @param on Set the truth value that the interface supports inband tagging of + * the wake packet flag. + * @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_set_inband_wake_packet_tagging(ifnet_t interface, boolean_t on); + +/*! + * @function ifnet_get_inband_wake_packet_tagging + * @param interface The interface. + * @param on On output contains the truth value whether the interface supports + * inband tagging of the wake packet flag. + * @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_get_inband_wake_packet_tagging(ifnet_t interface, boolean_t *on); + +/*! + * @function ifnet_set_low_power_wake + * @param interface The interface. + * @param on Set the truth value that the interface supports LPW. + * @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_set_low_power_wake(ifnet_t interface, boolean_t on); + +#define HAS_IFNET_SET_LOW_POWER_WAKE 1 + +/*! + * @function ifnet_get_low_power_wake + * @param interface The interface. + * @param on On output contains the truth value whether the interface supports LPW. + * @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_get_low_power_wake(ifnet_t interface, boolean_t *on); + +/*! + * @function ifnet_enable_cellular_thread_group + * @param interface The interface. + */ +extern void ifnet_enable_cellular_thread_group(ifnet_t interface); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/net/nat464_utils.c b/bsd/net/nat464_utils.c index e46bc2fb6..54e8dff1a 100644 --- a/bsd/net/nat464_utils.c +++ b/bsd/net/nat464_utils.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023 Apple Inc. All rights reserved. + * Copyright (c) 2018-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -183,14 +183,16 @@ nat464_synthesize_ipv6(ifnet_t ifp, const struct in_addr *addrv4, struct in6_add /* Synthesize ipv4 from ipv6 */ int -nat464_synthesize_ipv4(ifnet_t ifp, const struct in6_addr *addr, struct in_addr *addrv4) +nat464_synthesize_ipv4(ifnet_t ifp, const struct in6_addr *addr, + struct in_addr *addrv4, bool * translate_p) { struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES]; int error = 0, i = 0; + bool translate = false; /* Below call is not optimized as it creates a copy of prefixes */ if ((error = ifnet_get_nat64prefix(ifp, nat64prefixes)) != 0) { - return error; + goto done; } for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) { @@ -214,7 +216,8 @@ nat464_synthesize_ipv4(ifnet_t ifp, const struct in6_addr *addr, struct in_addr * we already checked that above. 
*/
if (memcmp((const struct in6_addr *__indexable)addr, &prefix, prefix_len) != 0) {
- return -1;
+ /* it's not the NAT64 prefix, so let it pass */
+ goto done;
}
switch (prefix_len) {
@@ -249,6 +252,9 @@ nat464_synthesize_ipv4(ifnet_t ifp, const struct in6_addr *addr, struct in_addr
clat_log2((LOG_DEBUG, "%s desynthesized to %s\n", __func__, inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf))));
}
+ translate = true;
+done:
+ *translate_p = translate;
return error;
}
@@ -961,8 +967,14 @@ nat464_translate_proto(pbuf_t *pbuf, struct nat464_addr *osrc,
hlen2 = (uint16_t)(ip2off + (iph2->ip_hl << 2));
tot_len2 = ntohs(iph2->ip_len);
- /* Destination in outer IP should be Source in inner IP */
- VERIFY(IN_ARE_ADDR_EQUAL(&odst->natv4addr, &iph2->ip_src));
+ /*
+ * Destination in outer IP should be Source in inner IP,
+ * otherwise the ICMP is likely erroneous.
+ */
+ if (!IN_ARE_ADDR_EQUAL(&odst->natv4addr, &iph2->ip_src)) {
+ return NT_DROP;
+ }
+
if (nat464_translate_icmp_ip(pbuf, ip2off, &tot_len, &hlen2, iph2->ip_p, iph2->ip_ttl, tot_len2, (struct nat464_addr *)ndst, (struct nat464_addr *)nsrc,
diff --git a/bsd/net/nat464_utils.h b/bsd/net/nat464_utils.h
index 5e7e22528..f080532f3 100644
--- a/bsd/net/nat464_utils.h
+++ b/bsd/net/nat464_utils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2018-2025 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
@@ -109,7 +109,7 @@ int
nat464_synthesize_ipv6(ifnet_t, const struct in_addr *, struct in6_addr *);
int
- nat464_synthesize_ipv4(ifnet_t, const struct in6_addr *, struct in_addr *);
+ nat464_synthesize_ipv4(ifnet_t, const struct in6_addr *, struct in_addr *, bool *);
int
nat464_translate_64(pbuf_t *, int, uint8_t, uint8_t *, uint8_t, struct in_addr,
diff --git a/bsd/net/necp.c b/bsd/net/necp.c
index 003c0348d..a5a8e4e7b 100644
--- a/bsd/net/necp.c
+++ b/bsd/net/necp.c
@@ -283,12 +283,6 @@ typedef enum {
// Cap the policy size at the max result + conditions size, with room for extra TLVs
#define NECP_MAX_POLICY_SIZE (1024 + NECP_MAX_POLICY_RESULT_SIZE + NECP_MAX_CONDITIONS_ARRAY_SIZE)
-struct necp_service_registration {
- LIST_ENTRY(necp_service_registration) session_chain;
- LIST_ENTRY(necp_service_registration) kernel_chain;
- u_int32_t service_id;
-};
-
struct necp_domain_filter {
LIST_ENTRY(necp_domain_filter) owner_chain;
LIST_ENTRY(necp_domain_filter) chain;
@@ -335,7 +329,6 @@ struct necp_session {
bool dirty;
LIST_HEAD(_policies, necp_session_policy) policies;
- LIST_HEAD(_services, necp_service_registration) services;
struct necp_domain_filter_list domain_filters;
struct necp_domain_trie_list domain_tries;
@@ -480,8 +473,6 @@ static inline struct necp_kernel_socket_policy * necp_socket_find_policy_match_w
u_int32_t * __counted_by(route_rule_id_array_count)return_route_rule_id_array, size_t *return_route_rule_id_array_count, size_t route_rule_id_array_count,
- necp_kernel_policy_result *return_service_action,
- necp_kernel_policy_service *return_service,
u_int32_t * __counted_by(netagent_array_count)return_netagent_array, size_t netagent_array_count, u_int32_t * __counted_by(netagent_use_flags_array_count)return_netagent_use_flags_array,
@@ -522,17 +513,29 @@ static bool necp_uuid_app_id_mappings_dirty;
#define NECP_UUID_APP_ID_HASH_SIZE 64
static u_long necp_uuid_app_id_hash_mask;
static u_long necp_uuid_app_id_hash_num_buckets;
-static LIST_HEAD(necp_uuid_id_mapping_head, necp_uuid_id_mapping) * __counted_by(necp_uuid_app_id_hash_num_buckets)
necp_uuid_app_id_hashtbl, necp_uuid_service_id_list; // App map is real hash table, service map is just mapping +static LIST_HEAD(necp_uuid_id_mapping_head, necp_uuid_id_mapping) * __counted_by(necp_uuid_app_id_hash_num_buckets) necp_uuid_app_id_hashtbl, necp_agent_uuid_id_list; // App map is real hash table, agent map is just mapping #define APPUUIDHASH(uuid) (&necp_uuid_app_id_hashtbl[uuid[0] & necp_uuid_app_id_hash_mask]) // Assume first byte of UUIDs are evenly distributed static u_int32_t necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_policy_table); static bool necp_remove_uuid_app_id_mapping(uuid_t uuid, bool *removed_mapping, bool uuid_policy_table); -static struct necp_uuid_id_mapping *necp_uuid_lookup_uuid_with_app_id_locked(u_int32_t local_id); +static struct necp_uuid_id_mapping *necp_uuid_lookup_uuid_with_app_id_locked(u_int32_t agent_id); -static struct necp_uuid_id_mapping *necp_uuid_lookup_service_id_locked(uuid_t uuid); -static struct necp_uuid_id_mapping *necp_uuid_lookup_uuid_with_service_id_locked(u_int32_t local_id); -static u_int32_t necp_create_uuid_service_id_mapping(uuid_t uuid); -static bool necp_remove_uuid_service_id_mapping(uuid_t uuid); -static bool necp_remove_uuid_service_id_mapping_with_service_id(u_int32_t service_id); +static bool necp_agent_id_is_uuid(u_int32_t agent_id); +static struct necp_uuid_id_mapping *necp_uuid_lookup_agent_id_with_uuid_locked(uuid_t uuid); +static struct necp_uuid_id_mapping *necp_uuid_lookup_uuid_with_agent_id_locked(u_int32_t agent_id); +static u_int32_t necp_create_agent_uuid_id_mapping(uuid_t uuid); +static bool necp_remove_agent_uuid_id_mapping(uuid_t uuid); +static bool necp_remove_agent_uuid_id_mapping_with_agent_id(u_int32_t agent_id); + +struct necp_agent_type_id_mapping { + LIST_ENTRY(necp_agent_type_id_mapping) chain; + struct necp_policy_condition_agent_type agent_type; + u_int32_t id; + os_refcnt_t refcount; +}; +static LIST_HEAD(necp_agent_type_id_mapping_list, necp_agent_type_id_mapping) necp_agent_type_id_list; +static u_int32_t necp_create_agent_type_to_id_mapping(struct necp_policy_condition_agent_type *agent_type); +static bool necp_remove_agent_type_to_id_mapping(u_int32_t agent_type_id); +static struct necp_agent_type_id_mapping *necp_lookup_agent_type_with_id_locked(u_int32_t agent_id); struct necp_string_id_mapping { LIST_ENTRY(necp_string_id_mapping) chain; @@ -559,8 +562,6 @@ static Boolean necp_match_domain_with_trie(struct necp_domain_trie_list *list, u static struct necp_kernel_socket_policy *necp_kernel_socket_policy_find(necp_kernel_policy_id policy_id); static struct necp_kernel_ip_output_policy *necp_kernel_ip_output_policy_find(necp_kernel_policy_id policy_id); -static LIST_HEAD(_necp_kernel_service_list, necp_service_registration) necp_registered_service_list; - static char * __null_terminated necp_create_trimmed_domain(char * __sized_by(length)string, size_t length); static inline int necp_count_dots(char * __sized_by(length)string, size_t length); @@ -585,6 +586,7 @@ struct necp_route_rule { u_int8_t constrained_action; u_int8_t companion_action; u_int8_t vpn_action; + u_int8_t ultra_constrained_action; u_int exception_if_indices[MAX_ROUTE_RULE_INTERFACES]; u_int8_t exception_if_actions[MAX_ROUTE_RULE_INTERFACES]; os_refcnt_t refcount; @@ -594,7 +596,7 @@ static u_int32_t necp_create_route_rule(struct necp_route_rule_list *list, u_int static bool necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_id); static bool 
necp_route_is_interface_type_allowed(struct rtentry *route, struct ifnet *ifp, proc_t proc, struct inpcb *inp); static bool necp_route_is_allowed(struct rtentry *route, ifnet_t interface, u_int32_t * __counted_by(netagent_array_count)netagent_array, size_t netagent_array_count, - u_int32_t route_rule_id, u_int32_t *interface_type_denied); + u_int32_t route_rule_id, u_int32_t *interface_type_denied, bool *ultra_constrained_denied); static uint32_t necp_route_get_netagent(struct rtentry *route, u_int32_t * __counted_by(netagent_array_count)netagent_array, size_t netagent_array_count, u_int32_t route_rule_id, bool *remove); static bool necp_route_rule_matches_agents(u_int32_t route_rule_id); static uint32_t necp_route_get_flow_divert(struct rtentry *route, u_int32_t * __counted_by(netagent_array_count)netagent_array, size_t netagent_array_count, u_int32_t route_rule_id, u_int32_t *flow_divert_aggregate_unit); @@ -855,7 +857,8 @@ const char* resultString[NECP_POLICY_RESULT_MAX + 1] = { "NETAGENT_SCOPED", "SCOPED_DIRECT", "ALLOW_UNENTITLED", - "REMOVE_NETAGENT" + "REMOVE_NETAGENT", + "REMOVE_NETAGENT_TYPE" }; @@ -1412,80 +1415,6 @@ necp_session_lock_to_process(struct necp_session *session, struct necp_session_a return 0; } -static int -necp_session_register_service(struct necp_session *session, struct necp_session_action_args *uap, int *retval) -{ - int error = 0; - struct necp_service_registration *new_service = NULL; - - if (uap->in_buffer_length < sizeof(uuid_t) || uap->in_buffer == 0) { - NECPLOG(LOG_ERR, "necp_session_register_service invalid input (%zu)", (size_t)uap->in_buffer_length); - error = EINVAL; - goto done; - } - - uuid_t service_uuid; - error = copyin(uap->in_buffer, service_uuid, sizeof(service_uuid)); - if (error != 0) { - NECPLOG(LOG_ERR, "necp_session_register_service uuid copyin error (%d)", error); - goto done; - } - - new_service = kalloc_type(struct necp_service_registration, - Z_WAITOK | Z_ZERO | Z_NOFAIL); - - lck_rw_lock_exclusive(&necp_kernel_policy_lock); - new_service->service_id = necp_create_uuid_service_id_mapping(service_uuid); - LIST_INSERT_HEAD(&session->services, new_service, session_chain); - LIST_INSERT_HEAD(&necp_registered_service_list, new_service, kernel_chain); - lck_rw_done(&necp_kernel_policy_lock); - -done: - *retval = error; - return error; -} - -static int -necp_session_unregister_service(struct necp_session *session, struct necp_session_action_args *uap, int *retval) -{ - int error = 0; - struct necp_service_registration * __single service = NULL; - struct necp_service_registration *temp_service = NULL; - struct necp_uuid_id_mapping *mapping = NULL; - - if (uap->in_buffer_length < sizeof(uuid_t) || uap->in_buffer == 0) { - NECPLOG(LOG_ERR, "necp_session_unregister_service invalid input (%zu)", (size_t)uap->in_buffer_length); - error = EINVAL; - goto done; - } - - uuid_t service_uuid; - error = copyin(uap->in_buffer, service_uuid, sizeof(service_uuid)); - if (error != 0) { - NECPLOG(LOG_ERR, "necp_session_unregister_service uuid copyin error (%d)", error); - goto done; - } - - // Remove all matching services for this session - lck_rw_lock_exclusive(&necp_kernel_policy_lock); - mapping = necp_uuid_lookup_service_id_locked(service_uuid); - if (mapping != NULL) { - LIST_FOREACH_SAFE(service, &session->services, session_chain, temp_service) { - if (service->service_id == mapping->id) { - LIST_REMOVE(service, session_chain); - LIST_REMOVE(service, kernel_chain); - kfree_type(struct necp_service_registration, service); - } - } - 
necp_remove_uuid_service_id_mapping(service_uuid); - } - lck_rw_done(&necp_kernel_policy_lock); - -done: - *retval = error; - return error; -} - static int necp_session_dump_all(struct necp_session *session, struct necp_session_action_args *uap, int *retval) { @@ -1856,11 +1785,11 @@ necp_session_action(struct proc *p, struct necp_session_action_args *uap, int *r break; } case NECP_SESSION_ACTION_REGISTER_SERVICE: { - return_value = necp_session_register_service(session, uap, retval); + return_value = 0; // Ignore break; } case NECP_SESSION_ACTION_UNREGISTER_SERVICE: { - return_value = necp_session_unregister_service(session, uap, retval); + return_value = 0; // Ignore break; } case NECP_SESSION_ACTION_POLICY_DUMP_ALL: { @@ -2118,9 +2047,7 @@ necp_init(void) LIST_INIT(&necp_account_id_list); - LIST_INIT(&necp_uuid_service_id_list); - - LIST_INIT(&necp_registered_service_list); + LIST_INIT(&necp_agent_uuid_id_list); LIST_INIT(&necp_route_rules); LIST_INIT(&necp_aggregate_route_rules); @@ -2463,7 +2390,6 @@ necp_create_session(void) new_session->session_priority = NECP_SESSION_PRIORITY_UNKNOWN; new_session->dirty = FALSE; LIST_INIT(&new_session->policies); - LIST_INIT(&new_session->services); LIST_INIT(&new_session->domain_filters); LIST_INIT(&new_session->domain_tries); lck_mtx_init(&new_session->lock, &necp_kernel_policy_mtx_grp, &necp_kernel_policy_mtx_attr); @@ -2507,15 +2433,6 @@ static void necp_delete_session(struct necp_session *session) { if (session != NULL) { - struct necp_service_registration * __single service = NULL; - struct necp_service_registration *temp_service = NULL; - LIST_FOREACH_SAFE(service, &session->services, session_chain, temp_service) { - LIST_REMOVE(service, session_chain); - lck_rw_lock_exclusive(&necp_kernel_policy_lock); - LIST_REMOVE(service, kernel_chain); - lck_rw_done(&necp_kernel_policy_lock); - kfree_type(struct necp_service_registration, service); - } struct necp_domain_filter * __single filter = NULL; struct necp_domain_filter *temp_filter = NULL; LIST_FOREACH_SAFE(filter, &session->domain_filters, owner_chain, temp_filter) { @@ -2642,6 +2559,12 @@ necp_policy_result_is_valid(u_int8_t * __sized_by(length)buffer, u_int32_t lengt } break; } + case NECP_POLICY_RESULT_REMOVE_NETAGENT_TYPE: { + if (parameter_length >= sizeof(struct necp_policy_condition_agent_type)) { + validated = TRUE; + } + break; + } default: { validated = FALSE; break; @@ -2733,7 +2656,8 @@ necp_policy_condition_is_valid(u_int8_t * __sized_by(length)buffer, u_int32_t le policy_result_type == NECP_POLICY_RESULT_NETAGENT_SCOPED || policy_result_type == NECP_POLICY_RESULT_SCOPED_DIRECT || policy_result_type == NECP_POLICY_RESULT_ALLOW_UNENTITLED || - policy_result_type == NECP_POLICY_RESULT_REMOVE_NETAGENT) ? TRUE : FALSE; + policy_result_type == NECP_POLICY_RESULT_REMOVE_NETAGENT || + policy_result_type == NECP_POLICY_RESULT_REMOVE_NETAGENT_TYPE) ? 
TRUE : FALSE; u_int32_t condition_length = necp_policy_condition_get_value_length_from_buffer(buffer, length); u_int8_t *condition_value = necp_policy_condition_get_value_pointer_from_buffer(buffer, length); u_int8_t type = necp_policy_condition_get_type_from_buffer(buffer, length); @@ -4090,7 +4014,7 @@ necp_policy_unapply(struct necp_session_policy *policy) uuid_clear(policy->applied_real_app_uuid); } if (!uuid_is_null(policy->applied_result_uuid)) { - necp_remove_uuid_service_id_mapping(policy->applied_result_uuid); + necp_remove_agent_uuid_id_mapping(policy->applied_result_uuid); uuid_clear(policy->applied_result_uuid); } @@ -4108,6 +4032,12 @@ necp_policy_unapply(struct necp_session_policy *policy) policy->applied_route_rules_id = 0; } + // Release agent type mapping + if (policy->applied_agent_type_id != 0) { + necp_remove_agent_type_to_id_mapping(policy->applied_agent_type_id); + policy->applied_agent_type_id = 0; + } + // Remove socket policies for (i = 0; i < MAX_KERNEL_SOCKET_POLICIES; i++) { if (policy->kernel_socket_policies[i] != 0) { @@ -4144,7 +4074,7 @@ struct necp_policy_result_service { } __attribute__((__packed__)); static bool -necp_policy_apply(struct necp_session *session, struct necp_session_policy *policy) +necp_policy_apply(struct necp_session *session, struct necp_session_policy *policy, bool *should_update_immediately) { bool socket_only_conditions = FALSE; bool socket_ip_conditions = FALSE; @@ -4694,6 +4624,11 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli if (necp_policy_result_get_parameter_length_from_buffer(policy->result, policy->result_size) > 0) { if (necp_policy_get_result_parameter(policy, (u_int8_t *)&drop_flags, sizeof(drop_flags))) { ultimate_result_parameter.drop_flags = drop_flags; + if (ultimate_result_parameter.drop_flags & NECP_KERNEL_POLICY_DROP_FLAG_DEFUNCT_ALL_FLOWS) { + if (should_update_immediately != NULL) { + *should_update_immediately = TRUE; + } + } } } if (socket_only_conditions) { // socket_ip_conditions can be TRUE or FALSE @@ -4770,7 +4705,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli case NECP_POLICY_RESULT_REMOVE_NETAGENT: { uuid_t netagent_uuid; if (necp_policy_get_result_parameter(policy, (u_int8_t *)&netagent_uuid, sizeof(netagent_uuid))) { - ultimate_result_parameter.netagent_id = necp_create_uuid_service_id_mapping(netagent_uuid); + ultimate_result_parameter.netagent_id = necp_create_agent_uuid_id_mapping(netagent_uuid); if (ultimate_result_parameter.netagent_id != 0) { uuid_copy(policy->applied_result_uuid, netagent_uuid); socket_layer_non_id_conditions = TRUE; @@ -4778,6 +4713,17 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } break; } + case NECP_POLICY_RESULT_REMOVE_NETAGENT_TYPE: { + struct necp_policy_condition_agent_type netagent_type = {}; + if (necp_policy_get_result_parameter(policy, (u_int8_t *)&netagent_type, sizeof(netagent_type))) { + ultimate_result_parameter.netagent_id = necp_create_agent_type_to_id_mapping(&netagent_type); + if (ultimate_result_parameter.netagent_id != 0) { + policy->applied_agent_type_id = ultimate_result_parameter.netagent_id; + socket_layer_non_id_conditions = TRUE; + } + } + break; + } case NECP_POLICY_RESULT_SOCKET_SCOPED: { u_int32_t interface_name_length = necp_policy_get_result_parameter_length(policy); if (interface_name_length <= IFXNAMSIZ && interface_name_length > 0) { @@ -4901,6 +4847,7 @@ necp_policy_apply_all(struct necp_session *session) struct 
necp_session_policy *temp_policy = NULL; struct kev_necp_policies_changed_data kev_data; kev_data.changed_count = 0; + bool should_update_immediately = FALSE; lck_rw_lock_exclusive(&necp_kernel_policy_lock); @@ -4914,11 +4861,11 @@ necp_policy_apply_all(struct necp_session *session) // Delete the policy necp_policy_delete(session, policy); } else if (!policy->applied) { - necp_policy_apply(session, policy); + necp_policy_apply(session, policy, &should_update_immediately); } else if (policy->pending_update) { // Must have been applied, but needs an update. Remove and re-add. necp_policy_unapply(policy); - necp_policy_apply(session, policy); + necp_policy_apply(session, policy, &should_update_immediately); } } @@ -4932,7 +4879,12 @@ necp_policy_apply_all(struct necp_session *session) lck_rw_done(&necp_kernel_policy_lock); - necp_update_all_clients(); + if (!should_update_immediately) { + necp_update_all_clients(); + } else { + necp_update_all_clients_immediately_if_needed(true); + } + necp_post_change_event(&kev_data); if (necp_debug) { @@ -5270,12 +5222,13 @@ necp_get_result_description(char * __sized_by(MAX_RESULT_STRING_LEN) result_stri switch (route_rule->default_action) { case NECP_ROUTE_RULE_DENY_INTERFACE: case NECP_ROUTE_RULE_DENY_INTERFACE_WITH_TYPE: - snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Only %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (Only %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", (route_rule->cellular_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Cell " : "", (route_rule->wifi_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "WiFi " : "", (route_rule->wired_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Wired " : "", (route_rule->expensive_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Exp " : "", (route_rule->constrained_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Constrained " : "", + (route_rule->ultra_constrained_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Ultra-Constrained " : "", (route_rule->companion_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "Companion " : "", (route_rule->vpn_action == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? "VPN " : "", (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[0] : "", @@ -5299,12 +5252,13 @@ necp_get_result_description(char * __sized_by(MAX_RESULT_STRING_LEN) result_stri (route_rule->exception_if_actions[9] == NECP_ROUTE_RULE_ALLOW_INTERFACE) ? interface_names[9] : ""); break; case NECP_ROUTE_RULE_ALLOW_INTERFACE: - snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", IS_NECP_ROUTE_RULE_DENY(route_rule->cellular_action) ? "!Cell " : "", IS_NECP_ROUTE_RULE_DENY(route_rule->wifi_action) ? "!WiFi " : "", IS_NECP_ROUTE_RULE_DENY(route_rule->wired_action) ? "!Wired " : "", IS_NECP_ROUTE_RULE_DENY(route_rule->expensive_action) ? "!Exp " : "", IS_NECP_ROUTE_RULE_DENY(route_rule->constrained_action) ? "!Constrained " : "", + IS_NECP_ROUTE_RULE_DENY(route_rule->ultra_constrained_action) ? "!Ultra-Constrained " : "", IS_NECP_ROUTE_RULE_DENY(route_rule->companion_action) ? "!Companion " : "", IS_NECP_ROUTE_RULE_DENY(route_rule->vpn_action) ? "!VPN " : "", IS_NECP_ROUTE_RULE_DENY(route_rule->exception_if_actions[0]) ? "!" 
: "", @@ -5329,12 +5283,13 @@ necp_get_result_description(char * __sized_by(MAX_RESULT_STRING_LEN) result_stri IS_NECP_ROUTE_RULE_DENY(route_rule->exception_if_actions[9]) ? interface_names[9] : ""); break; case NECP_ROUTE_RULE_QOS_MARKING: - snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (QoSMarking %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", + snprintf(result_string, MAX_RESULT_STRING_LEN, "RouteRules (QoSMarking %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)", (route_rule->cellular_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Cell " : "", (route_rule->wifi_action == NECP_ROUTE_RULE_QOS_MARKING) ? "WiFi " : "", (route_rule->wired_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Wired " : "", (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Exp " : "", (route_rule->constrained_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Constrained " : "", + (route_rule->ultra_constrained_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Ultra-Constrained " : "", (route_rule->companion_action == NECP_ROUTE_RULE_QOS_MARKING) ? "Companion " : "", (route_rule->vpn_action == NECP_ROUTE_RULE_QOS_MARKING) ? "VPN " : "", (route_rule->exception_if_actions[0] == NECP_ROUTE_RULE_QOS_MARKING) ? interface_names[0] : "", @@ -5366,7 +5321,7 @@ necp_get_result_description(char * __sized_by(MAX_RESULT_STRING_LEN) result_stri } case NECP_KERNEL_POLICY_RESULT_USE_NETAGENT: { bool found_mapping = FALSE; - struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(result_parameter.netagent_id); + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_agent_id_locked(result_parameter.netagent_id); if (mapping != NULL) { uuid_unparse(mapping->uuid, uuid_string); found_mapping = TRUE; @@ -5376,7 +5331,7 @@ necp_get_result_description(char * __sized_by(MAX_RESULT_STRING_LEN) result_stri } case NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED: { bool found_mapping = FALSE; - struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(result_parameter.netagent_id); + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_agent_id_locked(result_parameter.netagent_id); if (mapping != NULL) { uuid_unparse(mapping->uuid, uuid_string); found_mapping = TRUE; @@ -5386,7 +5341,7 @@ necp_get_result_description(char * __sized_by(MAX_RESULT_STRING_LEN) result_stri } case NECP_KERNEL_POLICY_RESULT_REMOVE_NETAGENT: { bool found_mapping = FALSE; - struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(result_parameter.netagent_id); + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_agent_id_locked(result_parameter.netagent_id); if (mapping != NULL) { uuid_unparse(mapping->uuid, uuid_string); found_mapping = TRUE; @@ -5394,6 +5349,15 @@ necp_get_result_description(char * __sized_by(MAX_RESULT_STRING_LEN) result_stri snprintf(result_string, MAX_RESULT_STRING_LEN, "RemoveNetAgent (%s)", found_mapping ? uuid_string : "Unknown"); break; } + case NECP_KERNEL_POLICY_RESULT_REMOVE_NETAGENT_TYPE: { + bool found_mapping = FALSE; + struct necp_agent_type_id_mapping *mapping = necp_lookup_agent_type_with_id_locked(result_parameter.netagent_id); + if (mapping != NULL) { + found_mapping = TRUE; + } + snprintf(result_string, MAX_RESULT_STRING_LEN, "RemoveNetAgentType (%s/%s)", found_mapping ? mapping->agent_type.agent_domain : "Unknown", found_mapping ? 
mapping->agent_type.agent_type : "Unknown"); + break; + } default: { snprintf(result_string, MAX_RESULT_STRING_LEN, "Unknown %d (%d)", result, result_parameter.tunnel_interface_index); break; @@ -5450,7 +5414,8 @@ necp_kernel_socket_policy_results_overlap(struct necp_kernel_socket_policy *uppe upper_policy->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT || upper_policy->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED || upper_policy->result == NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED || - upper_policy->result == NECP_KERNEL_POLICY_RESULT_REMOVE_NETAGENT) { + upper_policy->result == NECP_KERNEL_POLICY_RESULT_REMOVE_NETAGENT || + upper_policy->result == NECP_KERNEL_POLICY_RESULT_REMOVE_NETAGENT_TYPE) { // Filters and route rules never cancel out lower policies return FALSE; } else if (upper_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) { @@ -6231,7 +6196,7 @@ necp_lookup_route_rule_locked(struct necp_route_rule_list *list, u_int32_t route } static struct necp_route_rule * -necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int8_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int8_t constrained_action, u_int8_t companion_action, u_int8_t vpn_action, u_int32_t * __indexable if_indices, u_int8_t * __indexable if_actions, uuid_t netagent_uuid, uuid_t match_netagent_uuid, u_int32_t control_unit, u_int32_t effective_type) +necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_int8_t default_action, u_int8_t cellular_action, u_int8_t wifi_action, u_int8_t wired_action, u_int8_t expensive_action, u_int8_t constrained_action, u_int8_t companion_action, u_int8_t vpn_action, u_int8_t ultra_constrained_action, u_int32_t * __indexable if_indices, u_int8_t * __indexable if_actions, uuid_t netagent_uuid, uuid_t match_netagent_uuid, u_int32_t control_unit, u_int32_t effective_type) { struct necp_route_rule *searchentry = NULL; struct necp_route_rule *foundentry = NULL; @@ -6245,6 +6210,7 @@ necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_i searchentry->constrained_action == constrained_action && searchentry->companion_action == companion_action && searchentry->vpn_action == vpn_action && + searchentry->ultra_constrained_action == ultra_constrained_action && searchentry->control_unit == control_unit && searchentry->effective_type == effective_type) { bool match_failed = FALSE; @@ -6288,7 +6254,7 @@ necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_i } if (has_agent_a) { - struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(searchentry->netagent_id); + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_agent_id_locked(searchentry->netagent_id); if (mapping == NULL) { // Bad mapping, doesn't match continue; @@ -6306,7 +6272,7 @@ necp_lookup_route_rule_by_contents_locked(struct necp_route_rule_list *list, u_i } if (has_match_agent_a) { - struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(searchentry->match_netagent_id); + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_agent_id_locked(searchentry->match_netagent_id); if (mapping == NULL) { // Bad mapping, doesn't match continue; @@ -6341,6 +6307,7 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t * __sized_by( u_int8_t constrained_action = NECP_ROUTE_RULE_NONE; u_int8_t companion_action = NECP_ROUTE_RULE_NONE; u_int8_t vpn_action = NECP_ROUTE_RULE_NONE; 
+ u_int8_t ultra_constrained_action = NECP_ROUTE_RULE_NONE; u_int32_t if_indices[MAX_ROUTE_RULE_INTERFACES]; size_t num_valid_indices = 0; memset(&if_indices, 0, sizeof(if_indices)); @@ -6443,7 +6410,9 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t * __sized_by( if (rule_flags & NECP_ROUTE_RULE_FLAG_EXPENSIVE) { expensive_action = rule_action; } - if (rule_flags & NECP_ROUTE_RULE_FLAG_CONSTRAINED) { + if ((rule_flags & NECP_ROUTE_RULE_FLAG_ULTRA_CONSTRAINED) == NECP_ROUTE_RULE_FLAG_ULTRA_CONSTRAINED) { + ultra_constrained_action = rule_action; + } else if (rule_flags & NECP_ROUTE_RULE_FLAG_CONSTRAINED) { constrained_action = rule_action; } if (rule_flags & NECP_ROUTE_RULE_FLAG_COMPANION) { @@ -6491,7 +6460,7 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t * __sized_by( } } - existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, constrained_action, companion_action, vpn_action, if_indices, if_actions, netagent_uuid, match_netagent_uuid, control_unit, effective_type); + existing_rule = necp_lookup_route_rule_by_contents_locked(list, default_action, cellular_action, wifi_action, wired_action, expensive_action, constrained_action, companion_action, vpn_action, ultra_constrained_action, if_indices, if_actions, netagent_uuid, match_netagent_uuid, control_unit, effective_type); if (existing_rule != NULL) { route_rule_id = existing_rule->id; os_ref_retain_locked(&existing_rule->refcount); @@ -6501,10 +6470,10 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t * __sized_by( Z_WAITOK | Z_ZERO | Z_NOFAIL); route_rule_id = new_rule->id = necp_get_new_route_rule_id(false); if (!uuid_is_null(netagent_uuid)) { - new_rule->netagent_id = necp_create_uuid_service_id_mapping(netagent_uuid); + new_rule->netagent_id = necp_create_agent_uuid_id_mapping(netagent_uuid); } if (!uuid_is_null(match_netagent_uuid)) { - new_rule->match_netagent_id = necp_create_uuid_service_id_mapping(match_netagent_uuid); + new_rule->match_netagent_id = necp_create_agent_uuid_id_mapping(match_netagent_uuid); } new_rule->effective_type = effective_type; new_rule->control_unit = control_unit; @@ -6513,9 +6482,10 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t * __sized_by( new_rule->wifi_action = wifi_action; new_rule->wired_action = wired_action; new_rule->expensive_action = expensive_action; - new_rule->constrained_action = constrained_action; - new_rule->companion_action = companion_action; - new_rule->vpn_action = vpn_action; + new_rule->constrained_action = constrained_action; + new_rule->companion_action = companion_action; + new_rule->vpn_action = vpn_action; + new_rule->ultra_constrained_action = ultra_constrained_action; memcpy(&new_rule->exception_if_indices, &if_indices, sizeof(if_indices)); memcpy(&new_rule->exception_if_actions, &if_actions, sizeof(if_actions)); os_ref_init(&new_rule->refcount, &necp_refgrp); @@ -6560,8 +6530,8 @@ necp_remove_route_rule(struct necp_route_rule_list *list, u_int32_t route_rule_i if (existing_rule != NULL) { if (os_ref_release_locked(&existing_rule->refcount) == 0) { necp_remove_aggregate_route_rule_for_id(existing_rule->id); - necp_remove_uuid_service_id_mapping_with_service_id(existing_rule->netagent_id); - necp_remove_uuid_service_id_mapping_with_service_id(existing_rule->match_netagent_id); + necp_remove_agent_uuid_id_mapping_with_agent_id(existing_rule->netagent_id); + 
necp_remove_agent_uuid_id_mapping_with_agent_id(existing_rule->match_netagent_id); LIST_REMOVE(existing_rule, chain); kfree_type(struct necp_route_rule, existing_rule); } @@ -6619,55 +6589,33 @@ necp_create_aggregate_route_rule(u_int32_t * __counted_by(MAX_AGGREGATE_ROUTE_RU return aggregate_route_rule_id; } -#define NECP_NULL_SERVICE_ID 1 -#define NECP_FIRST_VALID_SERVICE_ID 2 -#define NECP_FIRST_VALID_APP_ID UINT16_MAX static u_int32_t -necp_get_new_uuid_id(bool service) +necp_get_new_app_uuid_id(void) { - static u_int32_t necp_last_service_uuid_id = 0; static u_int32_t necp_last_app_uuid_id = 0; u_int32_t newid = 0; LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - if (service) { - bool wrapped = FALSE; - do { - necp_last_service_uuid_id++; - if (necp_last_service_uuid_id < NECP_FIRST_VALID_SERVICE_ID || - necp_last_service_uuid_id >= NECP_FIRST_VALID_APP_ID) { - if (wrapped) { - // Already wrapped, give up - NECPLOG0(LOG_ERR, "Failed to find a free service UUID.\n"); - return NECP_NULL_SERVICE_ID; - } - necp_last_service_uuid_id = NECP_FIRST_VALID_SERVICE_ID; - wrapped = TRUE; + bool wrapped = FALSE; + do { + necp_last_app_uuid_id++; + if (necp_last_app_uuid_id < 1) { + if (wrapped) { + // Already wrapped, give up + NECPLOG0(LOG_ERR, "Failed to find a free app UUID ID.\n"); + return 0; } - newid = necp_last_service_uuid_id; - } while (necp_uuid_lookup_uuid_with_service_id_locked(newid) != NULL); // If already used, keep trying - } else { - bool wrapped = FALSE; - do { - necp_last_app_uuid_id++; - if (necp_last_app_uuid_id < NECP_FIRST_VALID_APP_ID) { - if (wrapped) { - // Already wrapped, give up - NECPLOG0(LOG_ERR, "Failed to find a free app UUID.\n"); - return NECP_NULL_SERVICE_ID; - } - necp_last_app_uuid_id = NECP_FIRST_VALID_APP_ID; - wrapped = TRUE; - } - newid = necp_last_app_uuid_id; - } while (necp_uuid_lookup_uuid_with_app_id_locked(newid) != NULL); // If already used, keep trying - } + necp_last_app_uuid_id = 1; + wrapped = TRUE; + } + newid = necp_last_app_uuid_id; + } while (necp_uuid_lookup_uuid_with_app_id_locked(newid) != NULL); // If already used, keep trying - if (newid == NECP_NULL_SERVICE_ID) { - NECPLOG0(LOG_ERR, "Allocate uuid ID failed.\n"); - return NECP_NULL_SERVICE_ID; + if (newid == 0) { + NECPLOG0(LOG_ERR, "Allocate app UUID ID failed.\n"); + return 0; } return newid; @@ -6732,7 +6680,7 @@ necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_ new_mapping = kalloc_type(struct necp_uuid_id_mapping, Z_WAITOK | Z_NOFAIL); uuid_copy(new_mapping->uuid, uuid); - new_mapping->id = necp_get_new_uuid_id(false); + new_mapping->id = necp_get_new_app_uuid_id(); os_ref_init(&new_mapping->refcount, &necp_refgrp); if (uuid_policy_table) { new_mapping->table_usecount = 1; @@ -6781,27 +6729,88 @@ necp_remove_uuid_app_id_mapping(uuid_t uuid, bool *removed_mapping, bool uuid_po return FALSE; } +#define NECP_NULL_AGENT_ID 1 +#define NECP_FIRST_VALID_AGENT_UUID_ID 2 +#define NECP_FIRST_VALID_AGENT_TYPE_ID UINT16_MAX + +static bool +necp_agent_id_is_uuid(u_int32_t agent_id) +{ + return agent_id < NECP_FIRST_VALID_AGENT_TYPE_ID; +} + +static u_int32_t +necp_get_new_agent_id(bool uuid) +{ + static u_int32_t necp_last_agent_id = 0; + static u_int32_t necp_last_agent_type_id = 0; + + u_int32_t newid = 0; + + LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); + + if (uuid) { + bool wrapped = FALSE; + do { + necp_last_agent_id++; + if (necp_last_agent_id < NECP_FIRST_VALID_AGENT_UUID_ID || + necp_last_agent_id >= 
NECP_FIRST_VALID_AGENT_TYPE_ID) { + if (wrapped) { + // Already wrapped, give up + NECPLOG0(LOG_ERR, "Failed to find a free agent UUID ID.\n"); + return NECP_NULL_AGENT_ID; + } + necp_last_agent_id = NECP_FIRST_VALID_AGENT_UUID_ID; + wrapped = TRUE; + } + newid = necp_last_agent_id; + } while (necp_uuid_lookup_uuid_with_agent_id_locked(newid) != NULL); // If already used, keep trying + } else { + bool wrapped = FALSE; + do { + necp_last_agent_type_id++; + if (necp_last_agent_type_id < NECP_FIRST_VALID_AGENT_TYPE_ID) { + if (wrapped) { + // Already wrapped, give up + NECPLOG0(LOG_ERR, "Failed to find a free agent type ID.\n"); + return NECP_NULL_AGENT_ID; + } + necp_last_agent_type_id = NECP_FIRST_VALID_AGENT_TYPE_ID; + wrapped = TRUE; + } + newid = necp_last_agent_type_id; + } while (necp_lookup_agent_type_with_id_locked(newid) != NULL); // If already used, keep trying + } + + if (newid == NECP_NULL_AGENT_ID) { + NECPLOG0(LOG_ERR, "Allocate agent ID failed.\n"); + return NECP_NULL_AGENT_ID; + } + + return newid; +} + static struct necp_uuid_id_mapping * -necp_uuid_get_null_service_id_mapping(void) +necp_uuid_get_null_agent_id_mapping(void) { static struct necp_uuid_id_mapping null_mapping; uuid_clear(null_mapping.uuid); - null_mapping.id = NECP_NULL_SERVICE_ID; + null_mapping.id = NECP_NULL_AGENT_ID; return &null_mapping; } static struct necp_uuid_id_mapping * -necp_uuid_lookup_service_id_locked(uuid_t uuid) +necp_uuid_lookup_agent_id_with_uuid_locked(uuid_t uuid) { struct necp_uuid_id_mapping *searchentry = NULL; struct necp_uuid_id_mapping *foundentry = NULL; if (uuid_is_null(uuid)) { - return necp_uuid_get_null_service_id_mapping(); + return necp_uuid_get_null_agent_id_mapping(); } - LIST_FOREACH(searchentry, &necp_uuid_service_id_list, chain) { + LIST_FOREACH(searchentry, &necp_agent_uuid_id_list, chain) { if (uuid_compare(searchentry->uuid, uuid) == 0) { foundentry = searchentry; break; @@ -6812,17 +6821,17 @@ necp_uuid_lookup_service_id_locked(uuid_t uuid) } static struct necp_uuid_id_mapping * -necp_uuid_lookup_uuid_with_service_id_locked(u_int32_t local_id) +necp_uuid_lookup_uuid_with_agent_id_locked(u_int32_t agent_id) { struct necp_uuid_id_mapping *searchentry = NULL; struct necp_uuid_id_mapping *foundentry = NULL; - if (local_id == NECP_NULL_SERVICE_ID) { - return necp_uuid_get_null_service_id_mapping(); + if (agent_id == NECP_NULL_AGENT_ID) { + return necp_uuid_get_null_agent_id_mapping(); } - LIST_FOREACH(searchentry, &necp_uuid_service_id_list, chain) { - if (searchentry->id == local_id) { + LIST_FOREACH(searchentry, &necp_agent_uuid_id_list, chain) { + if (searchentry->id == agent_id) { foundentry = searchentry; break; } @@ -6832,39 +6841,39 @@ necp_uuid_lookup_uuid_with_service_id_locked(u_int32_t local_id) } static u_int32_t -necp_create_uuid_service_id_mapping(uuid_t uuid) +necp_create_agent_uuid_id_mapping(uuid_t uuid) { - u_int32_t local_id = 0; + u_int32_t agent_id = 0; struct necp_uuid_id_mapping *existing_mapping = NULL; if (uuid_is_null(uuid)) { - return NECP_NULL_SERVICE_ID; + return NECP_NULL_AGENT_ID; } LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - existing_mapping = necp_uuid_lookup_service_id_locked(uuid); + existing_mapping = necp_uuid_lookup_agent_id_with_uuid_locked(uuid); if (existing_mapping != NULL) { - local_id = existing_mapping->id; + agent_id = existing_mapping->id; os_ref_retain_locked(&existing_mapping->refcount); } else { struct necp_uuid_id_mapping *new_mapping = NULL; new_mapping = kalloc_type(struct necp_uuid_id_mapping, 
Z_WAITOK | Z_NOFAIL); uuid_copy(new_mapping->uuid, uuid); - new_mapping->id = necp_get_new_uuid_id(true); + new_mapping->id = necp_get_new_agent_id(true); os_ref_init(&new_mapping->refcount, &necp_refgrp); - LIST_INSERT_HEAD(&necp_uuid_service_id_list, new_mapping, chain); + LIST_INSERT_HEAD(&necp_agent_uuid_id_list, new_mapping, chain); - local_id = new_mapping->id; + agent_id = new_mapping->id; } - return local_id; + return agent_id; } static bool -necp_remove_uuid_service_id_mapping(uuid_t uuid) +necp_remove_agent_uuid_id_mapping(uuid_t uuid) { struct necp_uuid_id_mapping * __single existing_mapping = NULL; @@ -6874,7 +6883,7 @@ necp_remove_uuid_service_id_mapping(uuid_t uuid) LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - existing_mapping = necp_uuid_lookup_service_id_locked(uuid); + existing_mapping = necp_uuid_lookup_agent_id_with_uuid_locked(uuid); if (existing_mapping != NULL) { if (os_ref_release_locked(&existing_mapping->refcount) == 0) { LIST_REMOVE(existing_mapping, chain); @@ -6887,17 +6896,17 @@ necp_remove_uuid_service_id_mapping(uuid_t uuid) } static bool -necp_remove_uuid_service_id_mapping_with_service_id(u_int32_t service_id) +necp_remove_agent_uuid_id_mapping_with_agent_id(u_int32_t agent_id) { struct necp_uuid_id_mapping * __single existing_mapping = NULL; - if (service_id == 0) { + if (agent_id == 0) { return TRUE; } LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - existing_mapping = necp_uuid_lookup_uuid_with_service_id_locked(service_id); + existing_mapping = necp_uuid_lookup_uuid_with_agent_id_locked(agent_id); if (existing_mapping != NULL) { if (os_ref_release_locked(&existing_mapping->refcount) == 0) { LIST_REMOVE(existing_mapping, chain); @@ -6909,6 +6918,84 @@ necp_remove_uuid_service_id_mapping_with_service_id(u_int32_t service_id) return FALSE; } +static struct necp_agent_type_id_mapping * +necp_lookup_agent_type_to_id_locked(struct necp_policy_condition_agent_type *agent_type) +{ + struct necp_agent_type_id_mapping *searchentry = NULL; + struct necp_agent_type_id_mapping *foundentry = NULL; + + LIST_FOREACH(searchentry, &necp_agent_type_id_list, chain) { + if (strlcmp(searchentry->agent_type.agent_domain, __unsafe_null_terminated_from_indexable(agent_type->agent_domain), NETAGENT_DOMAINSIZE) == 0 && + strlcmp(searchentry->agent_type.agent_type, __unsafe_null_terminated_from_indexable(agent_type->agent_type), NETAGENT_TYPESIZE) == 0) { + foundentry = searchentry; + break; + } + } + + return foundentry; +} + +static struct necp_agent_type_id_mapping * +necp_lookup_agent_type_with_id_locked(u_int32_t agent_id) +{ + struct necp_agent_type_id_mapping *searchentry = NULL; + struct necp_agent_type_id_mapping *foundentry = NULL; + + LIST_FOREACH(searchentry, &necp_agent_type_id_list, chain) { + if (searchentry->id == agent_id) { + foundentry = searchentry; + break; + } + } + + return foundentry; +} + +static u_int32_t +necp_create_agent_type_to_id_mapping(struct necp_policy_condition_agent_type *agent_type) +{ + u_int32_t agent_type_id = 0; + struct necp_agent_type_id_mapping *existing_mapping = NULL; + + LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); + + existing_mapping = necp_lookup_agent_type_to_id_locked(agent_type); + if (existing_mapping != NULL) { + agent_type_id = existing_mapping->id; + os_ref_retain_locked(&existing_mapping->refcount); + } else { + struct necp_agent_type_id_mapping * __single new_mapping = NULL; + new_mapping = kalloc_type(struct necp_agent_type_id_mapping, + Z_WAITOK | Z_ZERO | 
Z_NOFAIL); + strlcpy(new_mapping->agent_type.agent_domain, __unsafe_null_terminated_from_indexable(agent_type->agent_domain), NETAGENT_DOMAINSIZE); + strlcpy(new_mapping->agent_type.agent_type, __unsafe_null_terminated_from_indexable(agent_type->agent_type), NETAGENT_TYPESIZE); + new_mapping->id = necp_get_new_agent_id(false); + os_ref_init(&new_mapping->refcount, &necp_refgrp); + LIST_INSERT_HEAD(&necp_agent_type_id_list, new_mapping, chain); + agent_type_id = new_mapping->id; + } + return agent_type_id; +} + +static bool +necp_remove_agent_type_to_id_mapping(u_int32_t agent_type_id) +{ + struct necp_agent_type_id_mapping * __single existing_mapping = NULL; + + LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); + + existing_mapping = necp_lookup_agent_type_with_id_locked(agent_type_id); + if (existing_mapping != NULL) { + if (os_ref_release_locked(&existing_mapping->refcount) == 0) { + LIST_REMOVE(existing_mapping, chain); + kfree_type(struct necp_agent_type_id_mapping, existing_mapping); + } + return true; + } + + return false; +} + static bool necp_kernel_socket_policies_update_uuid_table(void) { @@ -7891,9 +7978,8 @@ necp_application_find_policy_match_internal(proc_t proc, struct necp_kernel_socket_policy *matched_policy = NULL; struct necp_socket_info info = {}; necp_kernel_policy_filter filter_control_unit = 0; - necp_kernel_policy_result service_action = 0; - necp_kernel_policy_service service = { 0, 0 }; + u_int64_t extended_client_flags = 0; u_int16_t protocol = 0; u_int32_t bound_interface_index = required_interface_index; u_int32_t traffic_class = 0; @@ -8198,6 +8284,19 @@ necp_application_find_policy_match_internal(proc_t proc, } break; } + case NECP_CLIENT_PARAMETER_PREFER_AGENT: { + if (num_required_agent_types >= NECP_MAX_REQUIRED_AGENTS) { + break; + } + if (length >= sizeof(uuid_t)) { + if (netagent_get_agent_domain_and_type(value, + required_agent_types[num_required_agent_types].netagent_domain, + required_agent_types[num_required_agent_types].netagent_type)) { + num_required_agent_types++; + } + } + break; + } case NECP_CLIENT_PARAMETER_SCHEME_PORT: { if (length >= sizeof(scheme_port)) { memcpy(&scheme_port, value, sizeof(scheme_port)); @@ -8217,6 +8316,12 @@ necp_application_find_policy_match_internal(proc_t proc, } break; } + case NECP_CLIENT_PARAMETER_EXTENDED_FLAGS: { + if (length >= sizeof(extended_client_flags)) { + memcpy(&extended_client_flags, value, sizeof(extended_client_flags)); + } + break; + } default: { break; } @@ -8318,8 +8423,6 @@ necp_application_find_policy_match_internal(proc_t proc, route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, - &service_action, - &service, netagent_ids, NECP_MAX_NETAGENTS, netagent_use_flags, @@ -8407,42 +8510,29 @@ necp_application_find_policy_match_internal(proc_t proc, returned_result->flow_divert_aggregate_unit = flow_divert_aggregate_unit; } - returned_result->service_action = service_action; - - // Fetch service registration - if (service.identifier != 0) { - struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(service.identifier); - if (mapping != NULL) { - struct necp_service_registration *service_registration = NULL; - uuid_copy(returned_result->service_uuid, mapping->uuid); - returned_result->service_data = service.data; - if (service.identifier == NECP_NULL_SERVICE_ID) { - // NULL service is always 'registered' - returned_result->service_flags |= NECP_SERVICE_FLAGS_REGISTERED; - } else { - LIST_FOREACH(service_registration, 
&necp_registered_service_list, kernel_chain) { - if (service.identifier == service_registration->service_id) { - returned_result->service_flags |= NECP_SERVICE_FLAGS_REGISTERED; - break; - } - } - } - } - } - // Handle netagents size_t netagent_i = 0; + size_t removed_netagent_type_i = 0; for (netagent_cursor = 0; netagent_cursor < NECP_MAX_NETAGENTS; netagent_cursor++) { - struct necp_uuid_id_mapping *mapping = NULL; u_int32_t netagent_id = netagent_ids[netagent_cursor]; if (netagent_id == 0) { continue; } - mapping = necp_uuid_lookup_uuid_with_service_id_locked(netagent_id); - if (mapping != NULL) { - uuid_copy(returned_result->netagents[netagent_i], mapping->uuid); - returned_result->netagent_use_flags[netagent_i] = netagent_use_flags[netagent_cursor]; - netagent_i++; + + if (necp_agent_id_is_uuid(netagent_id)) { + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_agent_id_locked(netagent_id); + if (mapping != NULL) { + uuid_copy(returned_result->netagents[netagent_i], mapping->uuid); + returned_result->netagent_use_flags[netagent_i] = netagent_use_flags[netagent_cursor]; + netagent_i++; + } + } else { + struct necp_agent_type_id_mapping *mapping = necp_lookup_agent_type_with_id_locked(netagent_id); + if (mapping != NULL && removed_netagent_type_i < NECP_MAX_REMOVE_NETAGENT_TYPES && + netagent_use_flags[netagent_cursor] & NECP_AGENT_USE_FLAG_REMOVE) { + memcpy(&returned_result->remove_netagent_types[removed_netagent_type_i], &mapping->agent_type, sizeof(mapping->agent_type)); + removed_netagent_type_i++; + } } // If the flags say to remove, clear the local copy @@ -8816,7 +8906,8 @@ necp_application_find_policy_match_internal(proc_t proc, } u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN; - bool route_is_allowed = necp_route_is_allowed(rt, NULL, netagent_ids, NECP_MAX_NETAGENTS, route_rule_id_array[route_rule_index], &interface_type_denied); + bool ultra_constrained_denied = false; + bool route_is_allowed = necp_route_is_allowed(rt, NULL, netagent_ids, NECP_MAX_NETAGENTS, route_rule_id_array[route_rule_index], &interface_type_denied, &ultra_constrained_denied); if (!route_is_allowed) { // If the route is blocked, treat the lookup as a drop returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; @@ -8831,7 +8922,13 @@ necp_application_find_policy_match_internal(proc_t proc, } } necp_send_application_interface_denied_event(pid, application_uuid, interface_type_denied); + } else if (ultra_constrained_denied) { + if (reason != NULL) { + *reason = NECP_CLIENT_RESULT_REASON_ULTRA_CONSTRAINED_NOT_ALLOWED; + } + necp_send_network_denied_event(pid, application_uuid, NETPOLICY_NETWORKTYPE_ULTRA_CONSTRAINED); } + // If the route gets denied, stop matching rules break; } @@ -8853,7 +8950,7 @@ necp_application_find_policy_match_internal(proc_t proc, bool remove = false; u_int32_t netagent_id = necp_route_get_netagent(rt, netagent_ids, NECP_MAX_NETAGENTS, route_rule_id_array[route_rule_index], &remove); if (netagent_id != 0) { - struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(netagent_id); + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_agent_id_locked(netagent_id); if (mapping != NULL) { bool agent_already_present = false; for (netagent_cursor = 0; netagent_cursor < NECP_MAX_NETAGENTS; netagent_cursor++) { @@ -8924,7 +9021,7 @@ necp_application_find_policy_match_internal(proc_t proc, IFNET_IS_CONSTRAINED(rt->rt_ifp)); const bool ultra_constrained_not_allowed = (!(client_flags & 
NECP_CLIENT_PARAMETER_FLAG_ALLOW_ULTRA_CONSTRAINED) && IFNET_IS_ULTRA_CONSTRAINED(rt->rt_ifp) && (task == NULL || - !IOTaskHasEntitlement(task, ULTRA_CONSTRAINED_ENTITLEMENT))); + (!if_ultra_constrained_default_allowed && !IOTaskHasEntitlement(task, ULTRA_CONSTRAINED_ENTITLEMENT)))); const bool interface_type_blocked = !necp_route_is_interface_type_allowed(rt, NULL, proc, NULL); if (!is_listener && !is_browser) { @@ -8944,6 +9041,12 @@ necp_application_find_policy_match_internal(proc_t proc, memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter)); } } + + if ((extended_client_flags & NECP_CLIENT_PARAMETER_EXTENDED_FLAG_AOP2_OFFLOAD) && + ((rt->rt_ifp->if_xflags & IFXF_RX_FLOW_STEERING) == 0)) { + returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; + memset(&returned_result->routing_result_parameter, 0, sizeof(returned_result->routing_result_parameter)); + } } if (rt != NULL) { @@ -9240,7 +9343,7 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, } task_t __single task = proc_task(proc); if (task == NULL || - !IOTaskHasEntitlement(task, kernel_policy->cond_custom_entitlement)) { + !IOTaskHasEntitlementAsBooleanOrObject(task, kernel_policy->cond_custom_entitlement)) { // Process is missing custom entitlement return FALSE; } @@ -9253,6 +9356,9 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, // Exact match requires the number of dots to match (no suffix matching allowed) bool domain_matches = (domain_dot_count == kernel_policy->cond_domain_dot_count && necp_hostname_matches_domain(domain, domain_dot_count, kernel_policy->cond_domain, kernel_policy->cond_domain_dot_count)); + if (domain_matches && socket != NULL) { + socket->so_flags1 |= SOF1_DOMAIN_MATCHED_POLICY; + } if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_EXACT_DOMAIN) { if (domain_matches) { // No match, matches forbidden domain @@ -9268,6 +9374,9 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, NECP_DATA_TRACE_LOG_CONDITION_SOCKET_STR(debug, socket, "SOCKET", kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_DOMAIN, "NECP_KERNEL_CONDITION_DOMAIN", kernel_policy->cond_domain, domain.string); bool domain_matches = necp_hostname_matches_domain(domain, domain_dot_count, kernel_policy->cond_domain, kernel_policy->cond_domain_dot_count); + if (domain_matches && socket != NULL) { + socket->so_flags1 |= SOF1_DOMAIN_MATCHED_POLICY; + } if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_DOMAIN) { if (domain_matches) { // No match, matches forbidden domain @@ -9298,6 +9407,9 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, NECPLOG(LOG_ERR, "DATA-TRACE: matching <%s %zu> with trie id %d - matched %d", domain.string, domain.length, kernel_policy->cond_domain_filter, domain_matches); } } + if (domain_matches && socket != NULL) { + socket->so_flags1 |= SOF1_DOMAIN_MATCHED_POLICY; + } if (kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_DOMAIN_FILTER) { if (domain_matches) { // No match, matches forbidden domain @@ -10005,8 +10117,6 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy u_int32_t * __counted_by(route_rule_id_array_count)return_route_rule_id_array, size_t *return_route_rule_id_array_count, size_t route_rule_id_array_count, - necp_kernel_policy_result *return_service_action, - necp_kernel_policy_service *return_service, u_int32_t * 
__counted_by(netagent_array_count)return_netagent_array, size_t netagent_array_count, u_int32_t * __counted_by(netagent_use_flags_array_count)return_netagent_use_flags_array, @@ -10064,15 +10174,6 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy *return_route_rule_id_array_count = 0; } - if (return_service_action != NULL) { - *return_service_action = 0; - } - - if (return_service != NULL) { - return_service->identifier = 0; - return_service->data = 0; - } - // Do not subject layer-2 filter to NECP policies, return a PASS policy if (necp_pass_interpose > 0 && info->client_flags & NECP_CLIENT_PARAMETER_FLAG_INTERPOSE) { return &pass_policy; @@ -10269,6 +10370,27 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy policy_search_array[i]->result_parameter.netagent_id); } continue; + } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_REMOVE_NETAGENT_TYPE) { + bool agent_already_present = false; + for (size_t netagent_i = 0; netagent_i < netagent_cursor; netagent_i++) { + if (netagent_ids[netagent_i] == policy_search_array[i]->result_parameter.netagent_id) { + // Already present. Mark the "REMOVE" flag if flags are supported, or just clear the entry + agent_already_present = true; + netagent_use_flags[netagent_i] = NECP_AGENT_USE_FLAG_REMOVE; + } + } + if (!agent_already_present && netagent_cursor < netagent_array_count_adjusted) { + // If not present, and flags are supported, add an entry with the "REMOVE" flag + netagent_ids[netagent_cursor] = policy_search_array[i]->result_parameter.netagent_id; + netagent_use_flags[netagent_cursor] = NECP_AGENT_USE_FLAG_REMOVE; + netagent_cursor++; + } + if (necp_debug > 1 || NECP_DATA_TRACE_POLICY_ON(debug)) { + NECPLOG(LOG_DEBUG, "DATA-TRACE: Socket Policy : (Application %d Real Application %d BoundInterface %d Proto %d) Remove NetagentType %d", + (uint64_t)VM_KERNEL_ADDRPERM(so), info->application_id, info->real_application_id, info->bound_interface_index, info->protocol, + policy_search_array[i]->result_parameter.netagent_id); + } + continue; } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT) { u_int32_t control_unit = policy_search_array[i]->result_parameter.flow_divert_control_unit; if (control_unit & FLOW_DIVERT_IS_TRANSPARENT) { @@ -10462,7 +10584,7 @@ necp_socket_verify_netagents(u_int32_t * __counted_by(NECP_MAX_NETAGENTS)netagen if (netagent_id == 0) { continue; } - mapping = necp_uuid_lookup_uuid_with_service_id_locked(netagent_id); + mapping = necp_uuid_lookup_uuid_with_agent_id_locked(netagent_id); if (mapping != NULL) { u_int32_t agent_flags = 0; agent_flags = netagent_get_flags(mapping->uuid); @@ -10492,8 +10614,6 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local necp_kernel_policy_filter filter_control_unit = 0; struct necp_kernel_socket_policy *matched_policy = NULL; necp_kernel_policy_id matched_policy_id = NECP_KERNEL_POLICY_ID_NONE; - necp_kernel_policy_result service_action = 0; - necp_kernel_policy_service service = { 0, 0 }; u_int32_t drop_dest_policy_result = NECP_KERNEL_POLICY_RESULT_NONE; necp_drop_all_bypass_check_result_t drop_all_bypass = NECP_DROP_ALL_BYPASS_CHECK_RESULT_NONE; proc_t __single socket_proc = NULL; @@ -10629,8 +10749,6 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, - &service_action, - &service, netagent_ids, NECP_MAX_NETAGENTS, NULL, @@ -11899,13 
+12017,15 @@ necp_update_qos_marking(struct ifnet *ifp, u_int32_t * __counted_by(netagent_arr (route_rule->wired_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_WIRED(ifp)) || (route_rule->expensive_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_EXPENSIVE(ifp)) || (route_rule->constrained_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_CONSTRAINED(ifp)) || + (route_rule->ultra_constrained_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_ULTRA_CONSTRAINED(ifp)) || (route_rule->companion_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_COMPANION_LINK(ifp)) || (route_rule->vpn_action == NECP_ROUTE_RULE_QOS_MARKING && IFNET_IS_VPN(ifp))) { qos_marking = TRUE; if (necp_debug > 2) { - NECPLOG(LOG_DEBUG, "QoS Marking: C:%d WF:%d W:%d E:%d Cn:%d Cmpn:%d VPN:%d for Rule %d Allowed %d", + NECPLOG(LOG_DEBUG, "QoS Marking: C:%d WF:%d W:%d E:%d Cn:%d Cmpn:%d VPN:%d UlCn:%d for Rule %d Allowed %d", route_rule->cellular_action, route_rule->wifi_action, route_rule->wired_action, - route_rule->expensive_action, route_rule->constrained_action, route_rule->companion_action, route_rule->vpn_action, route_rule_id, qos_marking); + route_rule->expensive_action, route_rule->constrained_action, route_rule->companion_action, route_rule->vpn_action, + route_rule->ultra_constrained_action, route_rule_id, qos_marking); } goto done; } @@ -12017,7 +12137,7 @@ necp_route_is_lqm_abort(struct ifnet *ifp, struct ifnet *delegated_ifp) static bool necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t * __counted_by(netagent_array_count)netagent_array, size_t netagent_array_count, - u_int32_t route_rule_id, u_int32_t *interface_type_denied) + u_int32_t route_rule_id, u_int32_t *interface_type_denied, bool *ultra_constrained_denied) { bool default_is_allowed = TRUE; u_int8_t type_aggregate_action = NECP_ROUTE_RULE_NONE; @@ -12246,6 +12366,25 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t } } + if (IFNET_IS_ULTRA_CONSTRAINED(ifp)) { + if (route_rule->ultra_constrained_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) { + if (necp_route_is_lqm_abort(ifp, delegated_ifp)) { + // Mark aggregate action as deny + type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE; + } + } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->ultra_constrained_action)) { + if (ultra_constrained_denied != NULL && IS_NECP_ROUTE_RULE_DENY(route_rule->ultra_constrained_action)) { + *ultra_constrained_denied = true; + } + if (type_aggregate_action == NECP_ROUTE_RULE_NONE || + (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && + IS_NECP_ROUTE_RULE_DENY(route_rule->ultra_constrained_action))) { + // Deny wins if there is a conflict + type_aggregate_action = route_rule->ultra_constrained_action; + } + } + } + if (type_aggregate_action != NECP_ROUTE_RULE_NONE) { if (necp_debug > 1) { NECPLOG(LOG_DEBUG, "Route Allowed: C:%d WF:%d W:%d E:%d Cmpn:%d VPN:%d for Rule %d Allowed %d", route_rule->cellular_action, route_rule->wifi_action, route_rule->wired_action, route_rule->expensive_action, route_rule->companion_action, route_rule->vpn_action, route_rule_id, (IS_NECP_ROUTE_RULE_DENY(type_aggregate_action) ? 
FALSE : TRUE)); @@ -12335,7 +12474,7 @@ necp_route_is_interface_type_allowed(struct rtentry *route, struct ifnet *ifp, p static bool necp_route_is_allowed(struct rtentry *route, struct ifnet *interface, u_int32_t * __counted_by(netagent_array_count)netagent_array, size_t netagent_array_count, - u_int32_t route_rule_id, u_int32_t *interface_type_denied) + u_int32_t route_rule_id, u_int32_t *interface_type_denied, bool *ultra_constrained_denied) { if ((route == NULL && interface == NULL && netagent_array == NULL) || route_rule_id == 0) { if (necp_debug > 1) { @@ -12353,13 +12492,13 @@ necp_route_is_allowed(struct rtentry *route, struct ifnet *interface, u_int32_t if (sub_route_rule_id == 0) { break; } - if (!necp_route_is_allowed_inner(route, interface, netagent_array, netagent_array_count, sub_route_rule_id, interface_type_denied)) { + if (!necp_route_is_allowed_inner(route, interface, netagent_array, netagent_array_count, sub_route_rule_id, interface_type_denied, ultra_constrained_denied)) { return FALSE; } } } } else { - return necp_route_is_allowed_inner(route, interface, netagent_array, netagent_array_count, route_rule_id, interface_type_denied); + return necp_route_is_allowed_inner(route, interface, netagent_array, netagent_array_count, route_rule_id, interface_type_denied, ultra_constrained_denied); } return TRUE; @@ -12497,6 +12636,16 @@ necp_route_get_netagent(struct rtentry *route, u_int32_t * __counted_by(netagent return 0; } + if ((ifp->if_xflags & IFXF_ULTRA_CONSTRAINED) && + route_rule->ultra_constrained_action != NECP_ROUTE_RULE_NONE) { + if (route_rule->ultra_constrained_action == NECP_ROUTE_RULE_USE_NETAGENT || + route_rule->ultra_constrained_action == NECP_ROUTE_RULE_REMOVE_NETAGENT) { + *remove = (route_rule->ultra_constrained_action == NECP_ROUTE_RULE_REMOVE_NETAGENT); + return route_rule->netagent_id; + } + return 0; + } + if (ifp->if_xflags & IFXF_IS_VPN && route_rule->vpn_action != NECP_ROUTE_RULE_NONE) { if (route_rule->vpn_action == NECP_ROUTE_RULE_USE_NETAGENT || @@ -12618,6 +12767,14 @@ necp_route_get_flow_divert_inner(struct rtentry *route, u_int32_t * __counted_by return 0; } + if ((ifp->if_xflags & IFXF_ULTRA_CONSTRAINED) && + route_rule->ultra_constrained_action != NECP_ROUTE_RULE_NONE) { + if (route_rule->ultra_constrained_action == NECP_ROUTE_RULE_DIVERT_SOCKET) { + return route_rule->control_unit; + } + return 0; + } + if (ifp->if_xflags & IFXF_IS_VPN && route_rule->vpn_action != NECP_ROUTE_RULE_NONE) { if (route_rule->vpn_action == NECP_ROUTE_RULE_DIVERT_SOCKET) { @@ -12683,7 +12840,7 @@ necp_packet_is_allowed_over_interface(struct mbuf *packet, struct ifnet *interfa if (route_rule_id != 0 && interface != NULL) { lck_rw_lock_shared(&necp_kernel_policy_lock); - is_allowed = necp_route_is_allowed(NULL, interface, NULL, 0, necp_get_route_rule_id_from_packet(packet), NULL); + is_allowed = necp_route_is_allowed(NULL, interface, NULL, 0, necp_get_route_rule_id_from_packet(packet), NULL, NULL); lck_rw_done(&necp_kernel_policy_lock); } return is_allowed; @@ -12699,7 +12856,7 @@ necp_netagents_allow_traffic(u_int32_t * __counted_by(netagent_id_count)netagent if (netagent_id == 0) { continue; } - mapping = necp_uuid_lookup_uuid_with_service_id_locked(netagent_id); + mapping = necp_uuid_lookup_uuid_with_agent_id_locked(netagent_id); if (mapping != NULL) { u_int32_t agent_flags = 0; agent_flags = netagent_get_flags(mapping->uuid); @@ -12735,8 +12892,6 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr bool allowed_to_receive = TRUE; 
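/*
 * The helpers added earlier in this file (necp_lookup_agent_type_to_id_locked,
 * necp_create_agent_type_to_id_mapping, necp_remove_agent_type_to_id_mapping)
 * follow a refcounted create/lookup/remove pattern: creating a mapping for a
 * (domain, type) pair that already exists simply retains the existing entry and
 * returns its ID, and an entry is only unlinked and freed once the last
 * reference is released.  Below is a minimal stand-alone sketch of that pattern;
 * the singly linked list, plain integer refcount, and next_id counter are
 * simplified stand-ins for the kernel's LIST_ macros, os_ref refcounts, and
 * necp_get_new_agent_id(), not the actual implementation.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DOMAINSIZE 32
#define TYPESIZE   32

struct type_id_mapping {
    struct type_id_mapping *next;
    char     domain[DOMAINSIZE];
    char     type[TYPESIZE];
    uint32_t id;
    uint32_t refcount;            /* stand-in for an os_refcnt */
};

static struct type_id_mapping *mapping_list = NULL;
static uint32_t next_id = 1;      /* stand-in for necp_get_new_agent_id() */

/* Look up an existing mapping by (domain, type). */
static struct type_id_mapping *
lookup_mapping(const char *domain, const char *type)
{
    for (struct type_id_mapping *m = mapping_list; m != NULL; m = m->next) {
        if (strncmp(m->domain, domain, DOMAINSIZE) == 0 &&
            strncmp(m->type, type, TYPESIZE) == 0) {
            return m;
        }
    }
    return NULL;
}

/* Create (or retain) a mapping for (domain, type) and return its ID. */
static uint32_t
create_mapping(const char *domain, const char *type)
{
    struct type_id_mapping *m = lookup_mapping(domain, type);
    if (m != NULL) {
        m->refcount++;            /* existing entry: just take a reference */
        return m->id;
    }
    m = calloc(1, sizeof(*m));
    if (m == NULL) {
        return 0;
    }
    snprintf(m->domain, sizeof(m->domain), "%s", domain);
    snprintf(m->type, sizeof(m->type), "%s", type);
    m->id = next_id++;
    m->refcount = 1;
    m->next = mapping_list;
    mapping_list = m;
    return m->id;
}

/* Release a mapping by ID; unlink and free it only on the last release. */
static bool
remove_mapping(uint32_t id)
{
    struct type_id_mapping **link = &mapping_list;
    for (struct type_id_mapping *m = mapping_list; m != NULL; link = &m->next, m = m->next) {
        if (m->id == id) {
            if (--m->refcount == 0) {
                *link = m->next;
                free(m);
            }
            return true;
        }
    }
    return false;
}

int
main(void)
{
    /* Hypothetical domain/type strings, for illustration only. */
    uint32_t a = create_mapping("WirelessRadioManager", "tethering");
    uint32_t b = create_mapping("WirelessRadioManager", "tethering"); /* same pair: retained, same ID */
    printf("a=%u b=%u\n", a, b);
    remove_mapping(a);   /* entry stays alive, one reference left */
    remove_mapping(b);   /* last release frees the entry */
    return 0;
}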
struct necp_socket_info info = {}; u_int32_t flowhash = 0; - necp_kernel_policy_result service_action = 0; - necp_kernel_policy_service service = { 0, 0 }; u_int32_t route_rule_id = 0; struct rtentry *route = NULL; u_int32_t interface_type_denied = IFRTYPE_FUNCTIONAL_UNKNOWN; @@ -12799,7 +12954,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr } else { if (inp->inp_policyresult.results.route_rule_id != 0) { lck_rw_lock_shared(&necp_kernel_policy_lock); - if (!necp_route_is_allowed(route, input_interface, NULL, 0, inp->inp_policyresult.results.route_rule_id, &interface_type_denied)) { + if (!necp_route_is_allowed(route, input_interface, NULL, 0, inp->inp_policyresult.results.route_rule_id, &interface_type_denied, NULL)) { route_allowed = FALSE; } lck_rw_done(&necp_kernel_policy_lock); @@ -12866,7 +13021,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr inp->inp_policyresult.results.result_parameter.tunnel_interface_index != verifyifindex) || !necp_route_is_interface_type_allowed(route, input_interface, NULL, inp) || (inp->inp_policyresult.results.route_rule_id != 0 && - !necp_route_is_allowed(route, input_interface, NULL, 0, inp->inp_policyresult.results.route_rule_id, &interface_type_denied))) { + !necp_route_is_allowed(route, input_interface, NULL, 0, inp->inp_policyresult.results.route_rule_id, &interface_type_denied, NULL))) { allowed_to_receive = FALSE; NECP_DATA_TRACE_LOG_SOCKET_DP(debug, so, "SOCKET - DATA PATH", "RESULT - CACHED ", return_policy_id ? *return_policy_id : 0, return_skip_policy_id ? *return_skip_policy_id : 0); } else { @@ -12907,8 +13062,6 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr route_rule_id_array, &route_rule_id_array_count, MAX_AGGREGATE_ROUTE_RULES, - &service_action, - &service, netagent_ids, NECP_MAX_NETAGENTS, NULL, @@ -13013,7 +13166,7 @@ skip_agent_check: matched_policy->result_parameter.tunnel_interface_index != verifyifindex) || !necp_route_is_interface_type_allowed(route, input_interface, NULL, inp) || (route_rule_id != 0 && - !necp_route_is_allowed(route, input_interface, netagent_ids, NECP_MAX_NETAGENTS, route_rule_id, &interface_type_denied)) || + !necp_route_is_allowed(route, input_interface, netagent_ids, NECP_MAX_NETAGENTS, route_rule_id, &interface_type_denied, NULL)) || !necp_netagents_allow_traffic(netagent_ids, NECP_MAX_NETAGENTS)) { allowed_to_receive = FALSE; } else { @@ -13640,7 +13793,7 @@ necp_addr_is_loopback(struct sockaddr *address) } if (address->sa_family == AF_INET) { - return ntohl(SIN(address)->sin_addr.s_addr) == INADDR_LOOPBACK; + return IN_LOOPBACK(ntohl(SIN(address)->sin_addr.s_addr)); } else if (address->sa_family == AF_INET6) { if (!IN6_IS_ADDR_V4MAPPED(&SIN6(address)->sin6_addr)) { return IN6_IS_ADDR_LOOPBACK(&SIN6(address)->sin6_addr); @@ -13675,8 +13828,8 @@ necp_is_loopback(struct sockaddr *local_addr, struct sockaddr *remote_addr, stru return TRUE; } if (inp->inp_vflag & INP_IPV4) { - if (ntohl(inp->inp_laddr.s_addr) == INADDR_LOOPBACK || - ntohl(inp->inp_faddr.s_addr) == INADDR_LOOPBACK) { + if (IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)) || + IN_LOOPBACK(ntohl(inp->inp_faddr.s_addr))) { return TRUE; } } else if (inp->inp_vflag & INP_IPV6) { @@ -13692,10 +13845,10 @@ necp_is_loopback(struct sockaddr *local_addr, struct sockaddr *remote_addr, stru if (packet != NULL) { struct ip *ip = mtod(packet, struct ip *); if (ip->ip_v == 4) { - if (ntohl(ip->ip_src.s_addr) == INADDR_LOOPBACK) { + if 
(IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { return TRUE; } - if (ntohl(ip->ip_dst.s_addr) == INADDR_LOOPBACK) { + if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr))) { return TRUE; } } else if (ip->ip_v == 6) { diff --git a/bsd/net/necp.h b/bsd/net/necp.h index 8ba8df326..eb3e8d9e6 100644 --- a/bsd/net/necp.h +++ b/bsd/net/necp.h @@ -38,6 +38,7 @@ #include #include #include +#include #if SKYWALK #include @@ -254,8 +255,9 @@ typedef struct necp_domain_trie_request { #define NECP_POLICY_RESULT_SCOPED_DIRECT 16 // N/A, scopes to primary physical interface #define NECP_POLICY_RESULT_ALLOW_UNENTITLED 17 // N/A #define NECP_POLICY_RESULT_REMOVE_NETAGENT 18 // netagent uuid_t +#define NECP_POLICY_RESULT_REMOVE_NETAGENT_TYPE 19 // necp_policy_condition_agent_type -#define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_REMOVE_NETAGENT +#define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_REMOVE_NETAGENT_TYPE /* * PASS Result Flags @@ -268,6 +270,7 @@ typedef struct necp_domain_trie_request { */ #define NECP_POLICY_DROP_FLAG_LOCAL_NETWORK 0x01 #define NECP_POLICY_DROP_FLAG_SUPPRESS_ALERTS 0x02 +#define NECP_POLICY_DROP_FLAG_DEFUNCT_ALL_FLOWS 0x04 /* * Local-Networks Condition Flags @@ -295,6 +298,7 @@ typedef struct necp_domain_trie_request { #define NECP_ROUTE_RULE_FLAG_CONSTRAINED 0x10 #define NECP_ROUTE_RULE_FLAG_COMPANION 0x20 #define NECP_ROUTE_RULE_FLAG_VPN 0x40 +#define NECP_ROUTE_RULE_FLAG_ULTRA_CONSTRAINED 0x90 // Note that this includes the 0x80 bit, so cannot be combined with agent UUID matching. #define NECP_ROUTE_RULE_FLAG_NETAGENT 0x80 // Last bit, reserved to mark that this applies only when an agent UUID is present @@ -386,6 +390,7 @@ typedef union { #define NECP_SERVICE_FLAGS_REGISTERED 0x01 #define NECP_MAX_NETAGENTS 16 +#define NECP_MAX_REMOVE_NETAGENT_TYPES 4 #define NECP_AGENT_USE_FLAG_SCOPE 0x01 #define NECP_AGENT_USE_FLAG_REMOVE 0x02 @@ -396,15 +401,12 @@ struct necp_aggregate_result { necp_kernel_policy_routing_result_parameter routing_result_parameter; necp_kernel_policy_filter filter_control_unit; u_int32_t flow_divert_aggregate_unit; - necp_kernel_policy_result service_action; - uuid_t service_uuid; - u_int32_t service_flags; - u_int32_t service_data; u_int routed_interface_index; u_int32_t policy_id; u_int32_t skip_policy_id; uuid_t netagents[NECP_MAX_NETAGENTS]; u_int32_t netagent_use_flags[NECP_MAX_NETAGENTS]; + struct necp_policy_condition_agent_type remove_netagent_types[NECP_MAX_REMOVE_NETAGENT_TYPES]; struct ipv6_prefix nat64_prefixes[NAT64_MAX_NUM_PREFIXES]; u_int8_t mss_recommended; }; @@ -514,7 +516,8 @@ struct necp_udp_stats { * the structures to diverge later as new stats are added. */ #define QUIC_STATELESS_RESET_TOKEN_SIZE 16 -#define NECP_QUIC_HAS_PROBE_STATUS 1 +#define NECP_QUIC_HAS_FALLBACK 1 + struct necp_extra_quic_metadata { u_int32_t sndbufsize; u_int32_t sndbufused; @@ -524,6 +527,8 @@ struct necp_extra_quic_metadata { u_int32_t traffic_mgt_flags; u_int32_t cc_alg_index; u_int32_t state; + u_int32_t fallback : 1, + unused : 31; u_int8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE]; struct necp_connection_probe_status probestatus; }; @@ -639,6 +644,7 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_ACTION_GET_SIGNED_CLIENT_ID 24 // Get a client ID for the appliction along with a signature. #define NECP_CLIENT_ACTION_SET_SIGNED_CLIENT_ID 25 // Set a client ID for the appliction along with a signature. #define NECP_CLIENT_ACTION_COPY_UPDATED_RESULT_FINAL 26 // Copy client result only if changed, discard data if buffer is too small. 
Input: client_id; Output: result in buffer +#define NECP_CLIENT_ACTION_GET_FLOW_STATISTICS 27 // Get flow statistics for aop flow #define NECP_CLIENT_PARAMETER_APPLICATION NECP_POLICY_CONDITION_APPLICATION // Requires entitlement #define NECP_CLIENT_PARAMETER_REAL_APPLICATION NECP_POLICY_CONDITION_REAL_APPLICATION // Requires entitlement @@ -713,6 +719,7 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_PARAMETER_FLAGS 250 // u_int32_t, see NECP_CLIENT_PAREMETER_FLAG_* values #define NECP_CLIENT_PARAMETER_FLOW_DEMUX_PATTERN 251 // struct necp_demux_pattern +#define NECP_CLIENT_PARAMETER_EXTENDED_FLAGS 252 // u_int64_t, see NECP_CLIENT_PARAMETER_EXTENDED_FLAG_* values #define NECP_CLIENT_PARAMETER_FLAG_MULTIPATH 0x0001 // Get multipath interface results #define NECP_CLIENT_PARAMETER_FLAG_BROWSE 0x0002 // Agent assertions on nexuses are requests to browse @@ -742,6 +749,12 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_PARAMETER_FLAG_WEB_SEARCH_CONTENT 0x2000000 // Web search traffic #define NECP_CLIENT_PARAMETER_FLAG_ALLOW_ULTRA_CONSTRAINED 0x4000000 // Allow ultra-constrained interfaces #define NECP_CLIENT_PARAMETER_FLAG_HAS_ACCOUNT_ID 0x8000000 // Client has provided an account identifier +#define NECP_CLIENT_PARAMETER_FLAG_PREFER_COMPANION 0x10000000 // Client prefers companion proxy for internet access +#define NECP_CLIENT_PARAMETER_FLAG_AVOID_COMPANION 0x20000000 // Client avoids companion proxy for internet access, potentially falling back +#define NECP_CLIENT_PARAMETER_FLAG_REQUIRE_COMPANION 0x40000000 // Client requires connecting to companion +#define NECP_CLIENT_PARAMETER_FLAG_PROHIBIT_COMPANION 0x80000000 // Client prohibits connecting to companion + +#define NECP_CLIENT_PARAMETER_EXTENDED_FLAG_AOP2_OFFLOAD 0x0000000000000001 // Flow is offloaded to AOP2 #define NECP_CLIENT_RESULT_CLIENT_ID 1 // uuid_t #define NECP_CLIENT_RESULT_POLICY_RESULT 2 // u_int32_t @@ -767,7 +780,7 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_NEXUS_INSTANCE 100 // uuid_t #define NECP_CLIENT_RESULT_NEXUS_PORT 101 // nexus_port_t #define NECP_CLIENT_RESULT_NEXUS_KEY 102 // uuid_t -#define NECP_CLIENT_RESULT_NEXUS_PORT_FLOW_INDEX 103 // u_int32_t +#define NECP_CLIENT_RESULT_NEXUS_PORT_FLOW_INDEX 103 // u_int32_t #define NECP_CLIENT_RESULT_NEXUS_FLOW_STATS 104 // struct sk_stats_flow * #define NECP_CLIENT_RESULT_LOCAL_ENDPOINT 200 // struct necp_client_endpoint @@ -783,6 +796,9 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_NAT64 214 // struct ipv6_prefix[NAT64_MAX_NUM_PREFIXES] #define NECP_CLIENT_RESULT_ESTIMATED_THROUGHPUT 215 // struct necp_client_result_estimated_throughput #define NECP_CLIENT_RESULT_AGENT_ERROR 216 // struct necp_client_result_agent_error +#define NECP_CLIENT_RESULT_UNIQUE_FLOW_TAG 217 // u_int32_t +#define NECP_CLIENT_RESULT_LINK_QUALITY 218 // int8_t +#define NECP_CLIENT_RESULT_FLOW_STATS_INDEX 219 // u_int32_t #define NECP_CLIENT_RESULT_FLAG_IS_LOCAL 0x0001 // Routes to this device #define NECP_CLIENT_RESULT_FLAG_IS_DIRECT 0x0002 // Routes to directly accessible peer @@ -853,6 +869,8 @@ struct necp_interface_details { u_int32_t hwcsum_flags; u_int8_t radio_type; u_int8_t radio_channel; +#define NECP_INTERFACE_SUPPORTS_L4S 1 + u_int8_t l4s_mode; }; #define NECP_INTERFACE_FLAG_EXPENSIVE 0x0001 @@ -870,6 +888,7 @@ struct necp_interface_details { #define NECP_INTERFACE_FLAG_IPV4_ROUTABLE 0x1000 #define NECP_INTERFACE_FLAG_IPV6_ROUTABLE 0x2000 #define NECP_INTERFACE_FLAG_ULTRA_CONSTRAINED 0x4000 +#define 
NECP_INTERFACE_FLAG_LOW_POWER_WAKE 0x8000 struct necp_client_parameter_netagent_type { char netagent_domain[32]; @@ -941,6 +960,7 @@ struct kev_necp_policies_changed_data { #define NECP_CLIENT_FLOW_FLAGS_RESOLVE 0x08 // Create request with a resolution agent #define NECP_CLIENT_FLOW_FLAGS_OVERRIDE_ADDRESS 0x10 // Flow has a different remote address than the parent flow #define NECP_CLIENT_FLOW_FLAGS_OVERRIDE_IP_PROTOCOL 0x20 // Flow has a different IP protocol than the parent flow +#define NECP_CLIENT_FLOW_FLAGS_OPEN_FLOW_ON_BEHALF_OF_CLIENT 0x40 // Flow is opened on behalf of a client struct necp_client_flow_stats { u_int32_t stats_type; // NECP_CLIENT_STATISTICS_TYPE_* @@ -1093,6 +1113,14 @@ struct necp_demux_pattern { uint8_t value[NECP_DEMUX_MAX_LEN]; }; +struct necp_flow_statistics { + union { + struct tcp_info tcpi; + } transport; + uint8_t transport_proto; + uint8_t pad[3]; +}; + #ifdef BSD_KERNEL_PRIVATE #include #include @@ -1270,17 +1298,14 @@ typedef u_int32_t necp_app_id; #define NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT NECP_POLICY_RESULT_SCOPED_DIRECT #define NECP_KERNEL_POLICY_RESULT_ALLOW_UNENTITLED NECP_POLICY_RESULT_ALLOW_UNENTITLED #define NECP_KERNEL_POLICY_RESULT_REMOVE_NETAGENT NECP_POLICY_RESULT_REMOVE_NETAGENT +#define NECP_KERNEL_POLICY_RESULT_REMOVE_NETAGENT_TYPE NECP_POLICY_RESULT_REMOVE_NETAGENT_TYPE #define NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC NECP_POLICY_PASS_NO_SKIP_IPSEC #define NECP_KERNEL_POLICY_PASS_PF_TAG NECP_POLICY_PASS_PF_TAG #define NECP_KERNEL_POLICY_DROP_FLAG_LOCAL_NETWORK NECP_POLICY_DROP_FLAG_LOCAL_NETWORK #define NECP_KERNEL_POLICY_DROP_FLAG_SUPPRESS_ALERTS NECP_POLICY_DROP_FLAG_SUPPRESS_ALERTS - -typedef struct { - u_int32_t identifier; - u_int32_t data; -} necp_kernel_policy_service; +#define NECP_KERNEL_POLICY_DROP_FLAG_DEFUNCT_ALL_FLOWS NECP_POLICY_DROP_FLAG_DEFUNCT_ALL_FLOWS typedef union { u_int tunnel_interface_index; @@ -1292,7 +1317,6 @@ typedef union { u_int32_t netagent_id; u_int32_t pass_flags; u_int32_t drop_flags; - necp_kernel_policy_service service; } necp_kernel_policy_result_parameter; enum necp_boolean_state { @@ -1401,6 +1425,8 @@ struct necp_session_policy { uuid_t applied_result_uuid; + u_int32_t applied_agent_type_id; + u_int32_t applied_route_rules_id; necp_kernel_policy_id kernel_socket_policies[MAX_KERNEL_SOCKET_POLICIES]; @@ -1561,7 +1587,7 @@ necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id, extern void * __sized_by(*message_length) necp_create_nexus_assign_message(uuid_t nexus_instance, nexus_port_t nexus_port, void *key __sized_by(key_length), uint32_t key_length, struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint, struct ether_addr *local_ether_addr, - u_int32_t flow_adv_index, void *flow_stats, size_t *message_length); + u_int32_t flow_adv_index, void *flow_stats, uint32_t flow_id, size_t *message_length); #define NECP_MAX_DEMUX_PATTERNS 4 @@ -1591,6 +1617,7 @@ struct necp_client_nexus_parameters { unsigned no_wake_from_sleep:1; unsigned is_demuxable_parent:1; unsigned reuse_port:1; + unsigned use_aop_offload:1; uuid_t parent_flow_uuid; struct necp_demux_pattern demux_patterns[NECP_MAX_DEMUX_PATTERNS]; diff --git a/bsd/net/necp_client.c b/bsd/net/necp_client.c index 9ffc871bf..4ecc44458 100644 --- a/bsd/net/necp_client.c +++ b/bsd/net/necp_client.c @@ -29,14 +29,18 @@ #include #include +#include #include #include +#include #include #include #include #include #include +#include +#include #include #include @@ -257,6 +261,8 @@ static uint32_t 
necp_client_stats_rtt_ceiling = 1920000; // 60s const static struct sk_stats_flow ntstat_sk_stats_zero; #endif /* SKYWALK */ +static int necp_client_stats_use_route_metrics = 0; + /* * Global lock to protect socket inp_necp_attributes across updates. * NECP updating these attributes and clients accessing these attributes @@ -284,6 +290,7 @@ SYSCTL_INT(_net_necp, NECPCTL_NEXUS_FLOW_COUNT, nexus_flow_count, CTLFLAG_LOCKED SYSCTL_UINT(_net_necp, OID_AUTO, collect_stats_interval_us, CTLFLAG_RW | CTLFLAG_LOCKED, &necp_collect_stats_timeout_microseconds, 0, ""); SYSCTL_UINT(_net_necp, OID_AUTO, necp_client_stats_rtt_floor, CTLFLAG_RW | CTLFLAG_LOCKED, &necp_client_stats_rtt_floor, 0, ""); SYSCTL_UINT(_net_necp, OID_AUTO, necp_client_stats_rtt_ceiling, CTLFLAG_RW | CTLFLAG_LOCKED, &necp_client_stats_rtt_ceiling, 0, ""); +SYSCTL_INT(_net_necp, OID_AUTO, necp_client_stats_use_route_metrics, CTLFLAG_RW | CTLFLAG_LOCKED, &necp_client_stats_use_route_metrics, 0, ""); #endif /* (DEVELOPMENT || DEBUG) */ #endif /* SKYWALK */ @@ -323,6 +330,7 @@ extern unsigned int get_maxmtu(struct rtentry *); #define NECP_PARSED_PARAMETERS_FIELD_FLOW_DEMUX_PATTERN 0x4000000 #define NECP_PARSED_PARAMETERS_FIELD_UID 0x8000000 #define NECP_PARSED_PARAMETERS_FIELD_PERSONA_ID 0x10000000 +#define NECP_PARSED_PARAMETERS_FIELD_EXTENDED_FLAGS 0x20000000 #define NECP_MAX_INTERFACE_PARAMETERS 16 @@ -357,6 +365,7 @@ struct necp_client_parsed_parameters { u_int8_t demux_pattern_count; uid_t uid; uid_t persona_id; + u_int64_t extended_flags; }; static bool @@ -405,6 +414,11 @@ struct necp_client_flow_protoctl_event_header { struct necp_client_flow_protoctl_event protoctl_event; } __attribute__((__packed__)); +struct necp_client_flow_stats_index_header { + struct necp_tlv_header stats_index_tlv_header; + uint32_t stats_index; +} __attribute__((__packed__)); + struct necp_client_nexus_flow_header { struct necp_client_flow_header flow_header; struct necp_tlv_header agent_tlv_header; @@ -426,7 +440,8 @@ struct necp_client_flow { unsigned assigned : 1; unsigned has_protoctl_event : 1; unsigned check_tcp_heuristics : 1; - unsigned _reserved : 1; + unsigned aop_offload : 1; + unsigned aop_stat_index_valid : 1; union { uuid_t nexus_agent; struct { @@ -441,6 +456,8 @@ struct necp_client_flow { struct necp_client_flow_protoctl_event protoctl_event; union necp_sockaddr_union local_addr; union necp_sockaddr_union remote_addr; + uint32_t flow_tag; + uint32_t stats_index; // Index associated with AOP flows size_t assigned_results_length; u_int8_t *__counted_by(assigned_results_length) assigned_results; @@ -455,6 +472,7 @@ struct necp_client_flow_registration { u_int32_t flags; unsigned flow_result_read : 1; unsigned defunct : 1; + unsigned aop_offload : 1; void *interface_handle; necp_client_flow_cb interface_cb; struct necp_client *client; @@ -571,6 +589,7 @@ struct necp_flow_defunct { uuid_t flow_id; uuid_t nexus_agent; void *agent_handle; + void *socket_handle; int proc_pid; u_int32_t flags; struct necp_client_agent_parameters close_parameters; @@ -771,6 +790,9 @@ static void necp_sysctl_arena_destroy(struct necp_fd_data *fd_data); static void *necp_arena_sysctls_obj(struct necp_fd_data *fd_data, mach_vm_offset_t *off, size_t *size); #endif /* !SKYWALK */ +static int necp_aop_offload_stats_initialize(struct necp_client_flow_registration *flow_registration, uuid_t netagent_uuid); +static void necp_aop_offload_stats_destroy(struct necp_client_flow *flow); + void necp_copy_inp_domain_info(struct inpcb *, struct socket *, nstat_domain_info *); 
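/*
 * necp_route_is_allowed_inner() (extended in this patch to cover
 * ultra-constrained interfaces) folds the per-interface-class rule actions
 * into a single aggregate in which a deny always overrides an allow, falling
 * back to the rule's default when no class-specific action matched.  The
 * stand-alone sketch below shows only that aggregation with simplified enum
 * values; the LQM-abort, netagent, and divert actions handled by the real
 * code are omitted.
 */
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for NECP_ROUTE_RULE_NONE / _ALLOW_INTERFACE / _DENY_INTERFACE. */
enum rule_action { ACTION_NONE = 0, ACTION_ALLOW, ACTION_DENY };

/*
 * Fold the actions of every interface class that matched (cellular, Wi-Fi,
 * constrained, ultra-constrained, ...) into one aggregate, deny winning over
 * allow, in the spirit of the type_aggregate_action handling above.
 */
static bool
route_is_allowed(const enum rule_action *matched_actions, int count, bool default_is_allowed)
{
    enum rule_action aggregate = ACTION_NONE;

    for (int i = 0; i < count; i++) {
        enum rule_action action = matched_actions[i];
        if (action == ACTION_NONE) {
            continue;
        }
        if (aggregate == ACTION_NONE ||
            (aggregate == ACTION_ALLOW && action == ACTION_DENY)) {
            /* Deny wins if there is a conflict */
            aggregate = action;
        }
    }

    if (aggregate != ACTION_NONE) {
        return aggregate == ACTION_ALLOW;
    }
    return default_is_allowed;   /* no class-specific rule matched */
}

int
main(void)
{
    enum rule_action conflict[] = { ACTION_ALLOW, ACTION_NONE, ACTION_DENY };
    enum rule_action none[]     = { ACTION_NONE, ACTION_NONE };

    printf("conflict -> %d\n", route_is_allowed(conflict, 3, true));  /* 0: deny wins */
    printf("no match -> %d\n", route_is_allowed(none, 2, true));      /* 1: default applies */
    return 0;
}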
void necp_with_inp_domain_name(struct socket *so, void *ctx, void (*with_func)(char *domain_name __null_terminated, void *ctx)); @@ -1256,8 +1278,113 @@ necp_client_collect_interface_stats(struct necp_client_flow_registration *flow_r } static void -necp_client_collect_stats(struct necp_client_flow_registration *flow_registration) +necp_client_collect_aop_flow_stats(struct necp_client_flow_registration *flow_registration) { + struct aop_flow_stats flow_stats = {}; + struct tcp_info *tcpi = &flow_stats.transport.tcp_stats.tcp_info; + uint32_t aop_flow_count = 0; + int err = 0; + + ASSERT(flow_registration->aop_offload); + struct necp_all_kstats *kstats = (struct necp_all_kstats *)flow_registration->kstats_kaddr; + if (kstats == NULL) { + return; + } + + struct necp_stat_counts *prev_tcpstats = &(((struct necp_tcp_stats *)&kstats->necp_stats_comm)->necp_tcp_counts); + struct sk_stats_flow *sf = &flow_registration->nexus_stats->fs_stats; + + struct necp_client_flow *flow = NULL; + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { + aop_flow_count++; + ASSERT(flow->aop_offload && aop_flow_count == 1); + if (flow->flow_tag > 0 && flow->aop_stat_index_valid) { + err = net_aop_get_flow_stats(flow->stats_index, &flow_stats); + if (err != 0) { + NECPLOG(LOG_ERR, "failed to get aop flow stats " + "for flow id %u with error %d", flow->flow_tag, err); + continue; + } + + if (__improbable(flow->flow_tag != flow_stats.flow_id)) { + NECPLOG(LOG_NOTICE, "aop flow stats, flow tag 0x%x != 0x%x", + flow->flow_tag, flow_stats.flow_id); + continue; + } + + if ((prev_tcpstats->necp_stat_rxpackets == tcpi->tcpi_rxpackets) && + prev_tcpstats->necp_stat_txpackets == tcpi->tcpi_txpackets) { + continue; + } + + uint32_t d_rxpackets = tcpi->tcpi_rxpackets - prev_tcpstats->necp_stat_rxpackets; + prev_tcpstats->necp_stat_rxpackets += d_rxpackets; + + uint32_t d_txpackets = tcpi->tcpi_txpackets - prev_tcpstats->necp_stat_txpackets; + prev_tcpstats->necp_stat_txpackets += d_txpackets; + + uint32_t d_rxbytes = tcpi->tcpi_rxbytes - prev_tcpstats->necp_stat_rxbytes; + prev_tcpstats->necp_stat_rxbytes += d_rxbytes; + + uint32_t d_txbytes = tcpi->tcpi_txbytes - prev_tcpstats->necp_stat_txbytes; + prev_tcpstats->necp_stat_txbytes += d_txbytes; + + uint32_t d_rxduplicatebytes = tcpi->tcpi_rxduplicatebytes - prev_tcpstats->necp_stat_rxduplicatebytes; + prev_tcpstats->necp_stat_rxduplicatebytes += d_rxduplicatebytes; + + uint32_t d_rxoutoforderbytes = tcpi->tcpi_rxoutoforderbytes - prev_tcpstats->necp_stat_rxoutoforderbytes; + prev_tcpstats->necp_stat_rxoutoforderbytes += d_rxoutoforderbytes; + + uint32_t d_txretransmit = tcpi->tcpi_txretransmitbytes - prev_tcpstats->necp_stat_txretransmit; + prev_tcpstats->necp_stat_txretransmit += d_txretransmit; + + uint32_t d_connectattempts = prev_tcpstats->necp_stat_connectattempts - (tcpi->tcpi_state >= TCPS_SYN_SENT ? 1 : 0); + prev_tcpstats->necp_stat_connectattempts += d_connectattempts; + + uint32_t d_connectsuccesses = prev_tcpstats->necp_stat_connectsuccesses - (tcpi->tcpi_state >= TCPS_ESTABLISHED ? 
1 : 0); + prev_tcpstats->necp_stat_connectsuccesses += d_connectsuccesses; + + prev_tcpstats->necp_stat_avg_rtt = tcpi->tcpi_srtt; + prev_tcpstats->necp_stat_var_rtt = tcpi->tcpi_rttvar; + + /* Update route stats */ + NECP_CLIENT_ROUTE_LOCK(flow_registration->client); + struct rtentry *route = flow_registration->client->current_route; + if (route != NULL) { + nstat_route_update(route, d_connectattempts, + d_connectsuccesses, d_rxpackets, d_rxbytes, + d_rxduplicatebytes, d_rxoutoforderbytes, + d_txpackets, d_txbytes, d_txretransmit, + prev_tcpstats->necp_stat_avg_rtt, prev_tcpstats->necp_stat_var_rtt); + } + NECP_CLIENT_ROUTE_UNLOCK(flow_registration->client); + + /* Update nexus flow stats */ + if (sf != NULL) { + sf->sf_ibytes = flow_stats.rxbytes; + sf->sf_obytes = flow_stats.txbytes; + sf->sf_ipackets = flow_stats.rxpkts; + sf->sf_opackets = flow_stats.txpkts; + sf->sf_lseq = tcpi->tcpi_snd_nxt - 1; + sf->sf_rseq = tcpi->tcpi_rcv_nxt - 1; + sf->sf_lrtt = tcpi->tcpi_srtt; + sf->sf_rrtt = tcpi->tcpi_rcv_srtt; + sf->sf_ltrack.sft_state = tcpi->tcpi_state; + sf->sf_lwscale = tcpi->tcpi_snd_wscale; + sf->sf_rwscale = tcpi->tcpi_rcv_wscale; + + memcpy(&sf->sf_activity, &flow_stats.activity_bitmap, + sizeof(sf->sf_activity)); + } + } + } +} + +static void +necp_client_collect_nexus_flow_stats(struct necp_client_flow_registration *flow_registration) +{ + ASSERT(!flow_registration->aop_offload); + struct necp_all_kstats *kstats = (struct necp_all_kstats *)flow_registration->kstats_kaddr; if (kstats == NULL) { return; @@ -1301,6 +1428,16 @@ necp_client_collect_stats(struct necp_client_flow_registration *flow_registratio NECP_CLIENT_ROUTE_UNLOCK(flow_registration->client); } +static void +necp_client_collect_stats(struct necp_client_flow_registration *flow_registration) +{ + if (__probable(!flow_registration->aop_offload)) { + necp_client_collect_nexus_flow_stats(flow_registration); + } else { + necp_client_collect_aop_flow_stats(flow_registration); + } +} + // This is called from various places; "closing" here implies the client being closed/removed if true, otherwise being // defunct. In the former, we expect the caller to not hold the lock; for the latter it must have acquired it. 
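/*
 * necp_client_collect_aop_flow_stats() above converts the cumulative counters
 * reported for an offloaded flow into per-interval deltas: each pass subtracts
 * the totals already accounted for, folds the difference back into the saved
 * totals, and hands only the deltas to the route statistics
 * (nstat_route_update), skipping the pass entirely when the packet counts have
 * not moved.  The sketch below shows the same bookkeeping with simplified
 * structures; the field names and the printf standing in for the route update
 * are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

/* Cumulative counters as reported by the offload engine (simplified stand-in for tcp_info). */
struct flow_snapshot {
    uint64_t rxpackets, txpackets;
    uint64_t rxbytes, txbytes;
};

/* Totals already accounted for (stand-in for the per-flow necp_stat_counts copy). */
struct flow_accumulated {
    uint64_t rxpackets, txpackets;
    uint64_t rxbytes, txbytes;
};

static void
accumulate_deltas(struct flow_accumulated *acc, const struct flow_snapshot *now)
{
    if (acc->rxpackets == now->rxpackets && acc->txpackets == now->txpackets) {
        return;   /* nothing new since the last collection pass */
    }

    uint64_t d_rxpackets = now->rxpackets - acc->rxpackets;
    uint64_t d_txpackets = now->txpackets - acc->txpackets;
    uint64_t d_rxbytes   = now->rxbytes   - acc->rxbytes;
    uint64_t d_txbytes   = now->txbytes   - acc->txbytes;

    acc->rxpackets += d_rxpackets;
    acc->txpackets += d_txpackets;
    acc->rxbytes   += d_rxbytes;
    acc->txbytes   += d_txbytes;

    /* A route-stats consumer only ever sees the increments, never the totals. */
    printf("route update: +%llu/+%llu pkts, +%llu/+%llu bytes\n",
        (unsigned long long)d_rxpackets, (unsigned long long)d_txpackets,
        (unsigned long long)d_rxbytes, (unsigned long long)d_txbytes);
}

int
main(void)
{
    struct flow_accumulated acc = {0};
    struct flow_snapshot first  = { .rxpackets = 10, .txpackets = 8,  .rxbytes = 1400, .txbytes = 900 };
    struct flow_snapshot second = { .rxpackets = 25, .txpackets = 20, .rxbytes = 3600, .txbytes = 2500 };

    accumulate_deltas(&acc, &first);    /* +10/+8 packets */
    accumulate_deltas(&acc, &second);   /* +15/+12 packets */
    accumulate_deltas(&acc, &second);   /* unchanged: skipped */
    return 0;
}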
static void @@ -1397,7 +1534,8 @@ necp_collect_stats_client_callout(__unused thread_call_param_t dummy, static void necp_defunct_flow_registration(struct necp_client *client, struct necp_client_flow_registration *flow_registration, - struct _necp_flow_defunct_list *defunct_list) + struct _necp_flow_defunct_list *defunct_list, + bool defunct_socket_flows) { NECP_CLIENT_ASSERT_LOCKED(client); @@ -1405,14 +1543,27 @@ necp_defunct_flow_registration(struct necp_client *client, bool needs_defunct = false; struct necp_client_flow *search_flow = NULL; LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) { + bool should_defunct_flow = false; if (search_flow->nexus && !uuid_is_null(search_flow->u.nexus_agent)) { - // Save defunct values for the nexus + should_defunct_flow = true; + } else if (defunct_socket_flows && + search_flow->socket && + search_flow->u.socket_handle != NULL) { + should_defunct_flow = true; + } + + if (should_defunct_flow) { + // Save defunct values for the nexus/socket if (defunct_list != NULL) { // Sleeping alloc won't fail; copy only what's necessary struct necp_flow_defunct *flow_defunct = kalloc_type(struct necp_flow_defunct, Z_WAITOK | Z_ZERO); - uuid_copy(flow_defunct->nexus_agent, search_flow->u.nexus_agent); + if (search_flow->nexus) { + uuid_copy(flow_defunct->nexus_agent, search_flow->u.nexus_agent); + } else if (search_flow->socket) { + flow_defunct->socket_handle = search_flow->u.socket_handle; + } uuid_copy(flow_defunct->flow_id, ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ? client->client_id : flow_registration->registration_id)); @@ -1454,13 +1605,13 @@ necp_defunct_flow_registration(struct necp_client *client, static void necp_defunct_client_for_policy(struct necp_client *client, - struct _necp_flow_defunct_list *defunct_list) + struct _necp_flow_defunct_list *defunct_list, bool defunct_socket_flows) { NECP_CLIENT_ASSERT_LOCKED(client); struct necp_client_flow_registration *flow_registration = NULL; RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { - necp_defunct_flow_registration(client, flow_registration, defunct_list); + necp_defunct_flow_registration(client, flow_registration, defunct_list, defunct_socket_flows); } } @@ -1755,6 +1906,9 @@ necp_destroy_client_flow_registration(struct necp_client *client, } else { OSDecrementAtomic(&necp_if_flow_count); } + + necp_aop_offload_stats_destroy(search_flow); + kfree_type(struct necp_client_flow, search_flow); } @@ -1861,6 +2015,18 @@ necp_process_defunct_list(struct _necp_flow_defunct_list *defunct_list) proc_name(flow_defunct->proc_pid, namebuf, sizeof(namebuf)); NECPLOG((netagent_error == ENOENT ? 
LOG_DEBUG : LOG_ERR), "necp_update_client abort nexus error (%d) for pid %d %s", netagent_error, flow_defunct->proc_pid, namebuf); } + } else if (flow_defunct->socket_handle != NULL) { + struct inpcb *inp = (struct inpcb *)flow_defunct->socket_handle; + struct socket *so = inp->inp_socket; + if (so != NULL) { + proc_t proc = proc_find(flow_defunct->proc_pid); + if (proc != PROC_NULL) { + proc_fdlock(proc); + (void)socket_defunct(proc, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); + proc_fdunlock(proc); + proc_rele(proc); + } + } } LIST_REMOVE(flow_defunct, chain); kfree_type(struct necp_flow_defunct, flow_defunct); @@ -2007,7 +2173,8 @@ static void necp_client_add_nexus_flow(struct necp_client_flow_registration *flow_registration, uuid_t nexus_agent, uint32_t interface_index, - uint32_t interface_flags) + uint32_t interface_flags, + bool aop_offload) { struct necp_client_flow *new_flow = kalloc_type(struct necp_client_flow, Z_WAITOK | Z_ZERO | Z_NOFAIL); @@ -2016,7 +2183,7 @@ necp_client_add_nexus_flow(struct necp_client_flow_registration *flow_registrati new_flow->interface_index = interface_index; new_flow->interface_flags = interface_flags; new_flow->check_tcp_heuristics = TRUE; - + new_flow->aop_offload = aop_offload ? TRUE : FALSE; #if SKYWALK OSIncrementAtomic(&necp_nexus_flow_count); #endif /* SKYWALK */ @@ -2030,8 +2197,7 @@ necp_client_add_nexus_flow(struct necp_client_flow_registration *flow_registrati static void necp_client_add_nexus_flow_if_needed(struct necp_client_flow_registration *flow_registration, - uuid_t nexus_agent, - uint32_t interface_index) + uuid_t nexus_agent, uint32_t interface_index, bool aop_offload) { struct necp_client_flow *flow = NULL; LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { @@ -2053,7 +2219,7 @@ necp_client_add_nexus_flow_if_needed(struct necp_client_flow_registration *flow_ } } ifnet_head_done(); - necp_client_add_nexus_flow(flow_registration, nexus_agent, interface_index, interface_flags); + necp_client_add_nexus_flow(flow_registration, nexus_agent, interface_index, interface_flags, aop_offload); } static struct necp_client_flow * @@ -2334,7 +2500,7 @@ necp_client_update_flows(proc_t proc, u_int32_t flags = netagent_get_flags(flow->u.nexus_agent); if (!(flags & NETAGENT_FLAG_REGISTERED)) { // The agent is no longer registered! Mark defunct. 
- necp_defunct_flow_registration(client, flow_registration, defunct_list); + necp_defunct_flow_registration(client, flow_registration, defunct_list, false); client_updated = TRUE; } } @@ -2362,6 +2528,9 @@ necp_client_update_flows(proc_t proc, } else { OSDecrementAtomic(&necp_if_flow_count); } + + necp_aop_offload_stats_destroy(flow); + kfree_type(struct necp_client_flow, flow); } } @@ -2802,11 +2971,14 @@ necp_client_trace_parsed_parameters(struct necp_client *client, struct necp_clie break; } - NECP_CLIENT_PARAMS_LOG(client, "Parsed params - valid_fields %X flags %X delegated_upid %llu local_addr %s remote_addr %s " + NECP_CLIENT_PARAMS_LOG(client, "Parsed params - valid_fields %X flags %X " + "extended flags %llX delegated_upid %llu local_addr %s remote_addr %s " "required_interface_index %u required_interface_type %d local_address_preference %d " - "ip_protocol %d transport_protocol %d ethertype %d effective_pid %d effective_uuid %s uid %d persona_id %d traffic_class %d", + "ip_protocol %d transport_protocol %d ethertype %d effective_pid %d " + "effective_uuid %s uid %d persona_id %d traffic_class %d", parsed_parameters->valid_fields, parsed_parameters->flags, + parsed_parameters->extended_flags, parsed_parameters->delegated_upid, local_buffer, remote_buffer, parsed_parameters->required_interface_index, @@ -3246,6 +3418,13 @@ necp_client_parse_parameters(struct necp_client *client, u_int8_t * __sized_by(p } break; } + case NECP_CLIENT_PARAMETER_EXTENDED_FLAGS: { + if (length >= sizeof(u_int64_t)) { + memcpy(&parsed_parameters->extended_flags, value, sizeof(parsed_parameters->extended_flags)); + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_EXTENDED_FLAGS; + } + break; + } default: { break; } @@ -3376,8 +3555,7 @@ necp_client_parse_parameters(struct necp_client *client, u_int8_t * __sized_by(p static int necp_client_parse_result(u_int8_t * __indexable result, u_int32_t result_size, - union necp_sockaddr_union *local_address, - union necp_sockaddr_union *remote_address, + struct necp_client_flow *flow, void **flow_stats) { #pragma unused(flow_stats) @@ -3395,8 +3573,8 @@ necp_client_parse_result(u_int8_t * __indexable result, case NECP_CLIENT_RESULT_LOCAL_ENDPOINT: { if (length >= sizeof(struct necp_client_endpoint)) { struct necp_client_endpoint *endpoint = (struct necp_client_endpoint *)(void *)value; - if (local_address != NULL && necp_client_address_is_valid(&endpoint->u.sa)) { - local_address->sin6 = endpoint->u.sin6; + if (necp_client_address_is_valid(&endpoint->u.sa)) { + flow->local_addr.sin6 = endpoint->u.sin6; } } break; @@ -3404,8 +3582,8 @@ necp_client_parse_result(u_int8_t * __indexable result, case NECP_CLIENT_RESULT_REMOTE_ENDPOINT: { if (length >= sizeof(struct necp_client_endpoint)) { struct necp_client_endpoint *endpoint = (struct necp_client_endpoint *)(void *)value; - if (remote_address != NULL && necp_client_address_is_valid(&endpoint->u.sa)) { - remote_address->sin6 = endpoint->u.sin6; + if (necp_client_address_is_valid(&endpoint->u.sa)) { + flow->remote_addr.sin6 = endpoint->u.sin6; } } break; @@ -3421,6 +3599,12 @@ necp_client_parse_result(u_int8_t * __indexable result, memset(value, 0, length); // nullify TLV always break; } + case NECP_CLIENT_RESULT_UNIQUE_FLOW_TAG: { + if (length >= sizeof(uint32_t)) { + flow->flow_tag = *(uint32_t *)(void *)value; + break; + } + } #endif /* SKYWALK */ default: { break; @@ -3978,7 +4162,7 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp) void *message = 
necp_create_nexus_assign_message(empty_uuid, 0, NULL, 0, (struct necp_client_endpoint *)&flow->local_addr, (struct necp_client_endpoint *)&flow->remote_addr, - NULL, 0, NULL, &message_length); + NULL, 0, NULL, 0, &message_length); flow->assigned_results = message; flow->assigned_results_length = message_length; flow_registration->flow_result_read = FALSE; @@ -4161,8 +4345,7 @@ necp_assign_client_result_locked(struct proc *proc, void * __single nexus_stats = NULL; if (assigned_results != NULL && assigned_results_length > 0) { int error = necp_client_parse_result(assigned_results, (u_int32_t)assigned_results_length, - &flow->local_addr, &flow->remote_addr, - assigned_from_userspace_agent ? NULL : &nexus_stats); // Only assign stats from kernel agents + flow, assigned_from_userspace_agent ? NULL : &nexus_stats); // Only assign stats from kernel agents VERIFY(error == 0); } @@ -4541,6 +4724,30 @@ necp_calculate_client_result(proc_t proc, return TRUE; } +static bool +necp_agent_is_removed_by_type(struct necp_aggregate_result *result, + uuid_t agent_uuid) +{ + for (int i = 0; i < NECP_MAX_REMOVE_NETAGENT_TYPES; i++) { + if (result->remove_netagent_types[i].agent_domain[0] == 0 && + result->remove_netagent_types[i].agent_type[0] == 0) { + // Empty type, hit the end of the list + return false; + } + + char compare_agent_domain[NETAGENT_DOMAINSIZE] = { 0 }; + char compare_agent_type[NETAGENT_TYPESIZE] = { 0 }; + if (netagent_get_agent_domain_and_type(agent_uuid, compare_agent_domain, compare_agent_type)) { + if (necp_agent_types_match(result->remove_netagent_types[i].agent_domain, + result->remove_netagent_types[i].agent_type, + compare_agent_domain, compare_agent_type)) { + return true; + } + } + } + return false; +} + #define NECP_PARSED_PARAMETERS_REQUIRED_FIELDS (NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF | \ NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE | \ NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT | \ @@ -4627,7 +4834,11 @@ necp_update_client_result(proc_t proc, if (defunct_list != NULL && result.routing_result == NECP_KERNEL_POLICY_RESULT_DROP) { // If we are forced to drop the client, defunct it if it has flows - necp_defunct_client_for_policy(client, defunct_list); + bool defunct_socket_flows = false; + if (result.routing_result_parameter.drop_flags & NECP_KERNEL_POLICY_DROP_FLAG_DEFUNCT_ALL_FLOWS) { + defunct_socket_flows = true; + } + necp_defunct_client_for_policy(client, defunct_list, defunct_socket_flows); } // Recalculate flags @@ -4726,6 +4937,23 @@ necp_update_client_result(proc_t proc, write_v4_gateway = false; } } + + if (client->current_route->rt_ifp != NULL) { + int8_t if_lqm = client->current_route->rt_ifp->if_interface_state.lqm_state; + + // Upgrade to enhancedLQM for cellular interfaces that support it + if (client->current_route->rt_ifp->if_type == IFT_CELLULAR && client->current_route->rt_ifp->if_link_status != NULL) { + struct if_cellular_status_v1 *cell_link_status = &client->current_route->rt_ifp->if_link_status->ifsr_u.ifsr_cell.if_cell_u.if_status_v1; + + if (cell_link_status->valid_bitmask & IF_CELL_LINK_QUALITY_METRIC_VALID) { + if_lqm = ifnet_lqm_normalize(cell_link_status->link_quality_metric); + } + } + + cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_LINK_QUALITY, + sizeof(if_lqm), &if_lqm, &updated, + client->result, sizeof(client->result)); + } } NECP_CLIENT_ROUTE_UNLOCK(client); @@ -4764,6 +4992,12 @@ necp_update_client_result(proc_t proc, // A removed agent, ignore continue; } + + if (necp_agent_is_removed_by_type(&result, 
result.netagents[i])) { + // A removed agent, ignore + continue; + } + uuid_copy(netagent.netagent_uuid, result.netagents[i]); netagent.generation = netagent_get_generation(netagent.netagent_uuid); if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, TRUE, 0, 0)) { @@ -4930,9 +5164,15 @@ necp_update_client_result(proc_t proc, break; } } + + if (!skip_agent && necp_agent_is_removed_by_type(&result, original_scoped_interface->if_agentids[i])) { + skip_agent = true; + } + if (skip_agent) { continue; } + uuid_copy(netagent.netagent_uuid, original_scoped_interface->if_agentids[i]); netagent.generation = netagent_get_generation(netagent.netagent_uuid); if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, FALSE, @@ -4964,6 +5204,11 @@ necp_update_client_result(proc_t proc, break; } } + + if (!skip_agent && necp_agent_is_removed_by_type(&result, direct_interface->if_agentids[i])) { + skip_agent = true; + } + if (skip_agent) { continue; } @@ -5000,6 +5245,11 @@ necp_update_client_result(proc_t proc, break; } } + + if (!skip_agent && necp_agent_is_removed_by_type(&result, delegate_interface->if_agentids[i])) { + skip_agent = true; + } + if (skip_agent) { continue; } @@ -6284,6 +6534,30 @@ necp_find_conn_extension_info(nstat_provider_context ctx, #if SKYWALK +static struct traffic_stats* +media_stats_embedded_ts(struct media_stats *media_stats, uint32_t ifflags) +{ + struct traffic_stats *ts = NULL; + if (media_stats) { + if (ifflags & NSTAT_IFNET_IS_WIFI) { + if (ifflags & NSTAT_IFNET_IS_WIFI_INFRA) { + ts = &media_stats->ms_wifi_infra; + } else { + ts = &media_stats->ms_wifi_non_infra; + } + } else if (ifflags & NSTAT_IFNET_IS_CELLULAR) { + ts = &media_stats->ms_cellular; + } else if (ifflags & NSTAT_IFNET_IS_WIRED) { + ts = &media_stats->ms_wired; + } else if (ifflags & NSTAT_IFNET_IS_COMPANIONLINK_BT) { + ts = &media_stats->ms_bluetooth; + } else if (!(ifflags & NSTAT_IFNET_IS_LOOPBACK)) { + ts = &media_stats->ms_alternate; + } + } + return ts; +} + static size_t necp_find_extension_info(userland_stats_provider_context *ctx, int requested_extension, /* The extension to be returned */ @@ -6503,24 +6777,20 @@ necp_find_netstat_initial_properties(struct necp_client *client) return retval; } -// Called from NetworkStatistics when it wishes to collect latest information for a TCP flow. -// It is a responsibility of NetworkStatistics to have previously zeroed any supplied memory. 
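/*
 * necp_agent_is_removed_by_type() above walks the fixed-size
 * remove_netagent_types array carried in the aggregate result, treating the
 * first all-zero entry as the end of the list and comparing the candidate
 * agent's (domain, type) pair against each populated entry.  The sketch below
 * mirrors that walk with an exact string comparison standing in for
 * necp_agent_types_match(); the buffer sizes, example domain/type strings,
 * and removal-list contents are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define DOMAINSIZE       32
#define TYPESIZE         32
#define MAX_REMOVE_TYPES 4    /* mirrors NECP_MAX_REMOVE_NETAGENT_TYPES */

struct agent_type {
    char domain[DOMAINSIZE];
    char type[TYPESIZE];
};

static bool
agent_is_removed_by_type(const struct agent_type remove_list[MAX_REMOVE_TYPES],
    const char *agent_domain, const char *agent_type)
{
    for (int i = 0; i < MAX_REMOVE_TYPES; i++) {
        if (remove_list[i].domain[0] == '\0' && remove_list[i].type[0] == '\0') {
            return false;   /* empty slot: end of the list */
        }
        if (strncmp(remove_list[i].domain, agent_domain, DOMAINSIZE) == 0 &&
            strncmp(remove_list[i].type, agent_type, TYPESIZE) == 0) {
            return true;
        }
    }
    return false;
}

int
main(void)
{
    /* Hypothetical removal list with a single populated entry. */
    struct agent_type removals[MAX_REMOVE_TYPES] = {
        { .domain = "WirelessRadioManager", .type = "tethering" },
    };

    printf("%d\n", agent_is_removed_by_type(removals, "WirelessRadioManager", "tethering")); /* 1 */
    printf("%d\n", agent_is_removed_by_type(removals, "NetworkExtension", "proxy"));         /* 0 */
    return 0;
}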
static bool -necp_request_tcp_netstats(userland_stats_provider_context *ctx, +necp_request_nexus_tcp_netstats(userland_stats_provider_context *ctx, u_int32_t *ifflagsp, nstat_progress_digest *digestp, nstat_counts *countsp, + nstat_detailed_counts *detailed_countsp, void *metadatap) { - if (ctx == NULL) { - return false; - } - struct necp_client_flow_registration * __single flow_registration = (struct necp_client_flow_registration *)(void *)ctx; struct necp_client *client = flow_registration->client; struct necp_all_stats *ustats_kaddr = ((struct necp_all_kstats *)flow_registration->kstats_kaddr)->necp_stats_ustats; struct necp_tcp_stats *tcpstats = (struct necp_tcp_stats *)ustats_kaddr; ASSERT(tcpstats != NULL); + ASSERT(!flow_registration->aop_offload); u_int32_t nstat_diagnostic_flags = 0; @@ -6557,7 +6827,7 @@ necp_request_tcp_netstats(userland_stats_provider_context *ctx, if (tcpstats->necp_tcp_extra.flags1 & SOF1_CELLFALLBACK) { *ifflagsp |= NSTAT_IFNET_VIA_CELLFALLBACK; } - if ((digestp == NULL) && (countsp == NULL) && (metadatap == NULL)) { + if ((digestp == NULL) && (countsp == NULL) && (detailed_countsp == NULL) && (metadatap == NULL)) { return true; } } @@ -6599,6 +6869,8 @@ necp_request_tcp_netstats(userland_stats_provider_context *ctx, countsp->nstat_connectsuccesses = tcpstats->necp_tcp_extra.state >= TCPS_ESTABLISHED ? 1 : 0; // Supplement what the user level has told us with what we know from the flowswitch + // The nstat_counts structure has only one set of packet counts so set them from the + // trusted flowswitch as clients may use them to calculate header overhead for cell/wifi/wired counts countsp->nstat_rxpackets = sf->sf_ipackets; countsp->nstat_txpackets = sf->sf_opackets; if (route_ifflags & NSTAT_IFNET_IS_CELLULAR) { @@ -6613,6 +6885,34 @@ necp_request_tcp_netstats(userland_stats_provider_context *ctx, } } + if (detailed_countsp) { + detailed_countsp->nstat_media_stats.ms_total.ts_rxbytes = tcpstats->necp_tcp_counts.necp_stat_rxbytes; + detailed_countsp->nstat_media_stats.ms_total.ts_txbytes = tcpstats->necp_tcp_counts.necp_stat_txbytes; + detailed_countsp->nstat_media_stats.ms_total.ts_rxpackets = tcpstats->necp_tcp_counts.necp_stat_rxpackets; + detailed_countsp->nstat_media_stats.ms_total.ts_txpackets = tcpstats->necp_tcp_counts.necp_stat_txpackets; + + detailed_countsp->nstat_rxduplicatebytes = tcpstats->necp_tcp_counts.necp_stat_rxduplicatebytes; + detailed_countsp->nstat_rxoutoforderbytes = tcpstats->necp_tcp_counts.necp_stat_rxoutoforderbytes; + detailed_countsp->nstat_txretransmit = tcpstats->necp_tcp_counts.necp_stat_txretransmit; + + detailed_countsp->nstat_min_rtt = tcpstats->necp_tcp_counts.necp_stat_min_rtt; + detailed_countsp->nstat_avg_rtt = tcpstats->necp_tcp_counts.necp_stat_avg_rtt; + detailed_countsp->nstat_var_rtt = tcpstats->necp_tcp_counts.necp_stat_var_rtt; + + // Supplement what the user level has told us with what we know from the flowswitch + // The user level statistics don't include a bitmap so use the one within the kernel, + memcpy(&detailed_countsp->nstat_media_stats.ms_total.ts_bitmap, &sf->sf_activity, sizeof(sf->sf_activity)); + + struct traffic_stats *ts = media_stats_embedded_ts(&detailed_countsp->nstat_media_stats, route_ifflags); + if (ts) { + ts->ts_rxpackets = sf->sf_ipackets; + ts->ts_txpackets = sf->sf_opackets; + ts->ts_rxbytes = sf->sf_ibytes; + ts->ts_txbytes = sf->sf_obytes; + memcpy(&ts->ts_bitmap, &sf->sf_activity, sizeof(sf->sf_activity)); + } + } + if (metadatap) { nstat_tcp_descriptor *desc = (nstat_tcp_descriptor 
*)metadatap; memset(desc, 0, sizeof(*desc)); @@ -6678,12 +6978,238 @@ necp_request_tcp_netstats(userland_stats_provider_context *ctx, return true; } +static bool +necp_request_aop_tcp_netstats(userland_stats_provider_context *ctx, + u_int32_t *ifflagsp, + nstat_progress_digest *digestp, + nstat_counts *countsp, + nstat_detailed_counts *detailed_countsp, + void *metadatap) +{ + struct aop_flow_stats flow_stats = {}; + struct tcp_info *tcpi = &flow_stats.transport.tcp_stats.tcp_info; + struct necp_client_flow_registration * __single flow_registration = (struct necp_client_flow_registration *)(void *)ctx; + struct necp_client *client = flow_registration->client; + int err = 0; + + ASSERT(flow_registration->aop_offload); + + u_int32_t nstat_diagnostic_flags = 0; + + // Retrieve details from the last time the assigned flows were updated + u_int32_t route_ifindex = IFSCOPE_NONE; + u_int32_t route_ifflags = NSTAT_IFNET_IS_UNKNOWN_TYPE; + u_int64_t combined_interface_details = 0; + + combined_interface_details = os_atomic_load(&flow_registration->last_interface_details, relaxed); + split_interface_details(combined_interface_details, &route_ifindex, &route_ifflags); + + if (route_ifindex == IFSCOPE_NONE) { + // Mark no interface + nstat_diagnostic_flags |= NSTAT_IFNET_ROUTE_VALUE_UNOBTAINABLE; + route_ifflags = NSTAT_IFNET_IS_UNKNOWN_TYPE; + NECPLOG(LOG_INFO, "req tcp stats, failed to get route details for pid %d curproc %d %s\n", + client->proc_pid, proc_pid(current_proc()), proc_best_name(current_proc())); + } + + const struct sk_stats_flow *sf = &flow_registration->nexus_stats->fs_stats; + if (sf == NULL) { + nstat_diagnostic_flags |= NSTAT_IFNET_FLOWSWITCH_VALUE_UNOBTAINABLE; + char namebuf[MAXCOMLEN + 1]; + (void) strlcpy(namebuf, "unknown", sizeof(namebuf)); + proc_name(client->proc_pid, namebuf, sizeof(namebuf)); + NECPLOG(LOG_ERR, "req tcp stats, necp_client flow_registration flow_stats missing for pid %d %s curproc %d %s\n", + client->proc_pid, namebuf, proc_pid(current_proc()), proc_best_name(current_proc())); + sf = &ntstat_sk_stats_zero; + } + + if (ifflagsp) { + *ifflagsp = route_ifflags | nstat_diagnostic_flags; + *ifflagsp |= (sf->sf_flags & SFLOWF_ONLINK) ? 
NSTAT_IFNET_IS_LOCAL : NSTAT_IFNET_IS_NON_LOCAL; + if ((digestp == NULL) && (countsp == NULL) && (detailed_countsp == NULL) && (metadatap == NULL)) { + return true; + } + } + + // This needs to revisited if multiple flows are created from one flow registration + struct necp_client_flow *flow = LIST_FIRST(&flow_registration->flow_list); + if (flow == NULL) { + return false; + } + + ASSERT(flow->aop_offload && flow->flow_tag > 0); + if (!flow->aop_stat_index_valid) { + return false; + } + err = net_aop_get_flow_stats(flow->stats_index, &flow_stats); + if (err != 0) { + NECPLOG(LOG_ERR, "failed to get aop flow stats " + "for flow id %u with error %d", flow->flow_tag, err); + return false; + } + + if (__improbable(flow->flow_tag != flow_stats.flow_id)) { + NECPLOG(LOG_ERR, "aop flow stats, flow tag 0x%x != 0x%x", + flow->flow_tag, flow_stats.flow_id); + return false; + } + + if (digestp) { + // The digest is intended to give information that may help give insight into the state of the link + digestp->rxbytes = tcpi->tcpi_rxbytes; + digestp->txbytes = tcpi->tcpi_txbytes; + digestp->rxduplicatebytes = tcpi->tcpi_rxduplicatebytes; + digestp->rxoutoforderbytes = tcpi->tcpi_rxoutoforderbytes; + digestp->txretransmit = tcpi->tcpi_txretransmitbytes; + digestp->ifindex = route_ifindex; + digestp->state = tcpi->tcpi_state; + digestp->txunacked = tcpi->tcpi_txunacked; + digestp->txwindow = tcpi->tcpi_snd_wnd; + + if ((countsp == NULL) && (metadatap == NULL)) { + return true; + } + } + + if (countsp) { + countsp->nstat_rxbytes = tcpi->tcpi_rxbytes; + countsp->nstat_txbytes = tcpi->tcpi_txbytes; + + countsp->nstat_rxduplicatebytes = tcpi->tcpi_rxduplicatebytes; + countsp->nstat_rxoutoforderbytes = tcpi->tcpi_rxoutoforderbytes; + countsp->nstat_txretransmit = tcpi->tcpi_txretransmitbytes; + + countsp->nstat_min_rtt = tcpi->tcpi_rttbest; + countsp->nstat_avg_rtt = tcpi->tcpi_srtt; + countsp->nstat_var_rtt = tcpi->tcpi_rttvar; + + countsp->nstat_connectattempts = tcpi->tcpi_state >= TCPS_SYN_SENT ? 1 : 0; + countsp->nstat_connectsuccesses = tcpi->tcpi_state >= TCPS_ESTABLISHED ? 
1 : 0; + + // Supplement what the user level has told us with what we know from the flowswitch + // The nstat_counts structure has only one set of packet counts so set them from the + // trusted flowswitch as clients may use them to calculate header overhead for cell/wifi/wired counts + countsp->nstat_rxpackets = sf->sf_ipackets; + countsp->nstat_txpackets = sf->sf_opackets; + if (route_ifflags & NSTAT_IFNET_IS_CELLULAR) { + countsp->nstat_cell_rxbytes = sf->sf_ibytes; + countsp->nstat_cell_txbytes = sf->sf_obytes; + } else if (route_ifflags & NSTAT_IFNET_IS_WIFI) { + countsp->nstat_wifi_rxbytes = sf->sf_ibytes; + countsp->nstat_wifi_txbytes = sf->sf_obytes; + } else if (route_ifflags & NSTAT_IFNET_IS_WIRED) { + countsp->nstat_wired_rxbytes = sf->sf_ibytes; + countsp->nstat_wired_txbytes = sf->sf_obytes; + } + } + + if (detailed_countsp) { + detailed_countsp->nstat_media_stats.ms_total.ts_rxbytes = tcpi->tcpi_rxbytes; + detailed_countsp->nstat_media_stats.ms_total.ts_txbytes = tcpi->tcpi_txbytes; + detailed_countsp->nstat_media_stats.ms_total.ts_rxpackets = tcpi->tcpi_rxpackets; + detailed_countsp->nstat_media_stats.ms_total.ts_txpackets = tcpi->tcpi_txpackets; + + detailed_countsp->nstat_rxduplicatebytes = tcpi->tcpi_rxduplicatebytes; + detailed_countsp->nstat_rxoutoforderbytes = tcpi->tcpi_rxoutoforderbytes; + detailed_countsp->nstat_txretransmit = tcpi->tcpi_txretransmitbytes; + + detailed_countsp->nstat_min_rtt = tcpi->tcpi_rttbest; + detailed_countsp->nstat_avg_rtt = tcpi->tcpi_srtt; + detailed_countsp->nstat_var_rtt = tcpi->tcpi_rttvar; + + struct traffic_stats *ts = media_stats_embedded_ts(&detailed_countsp->nstat_media_stats, route_ifflags); + if (ts) { + ts->ts_rxpackets = sf->sf_ipackets; + ts->ts_txpackets = sf->sf_opackets; + ts->ts_rxbytes = sf->sf_ibytes; + ts->ts_txbytes = sf->sf_obytes; + } + } + + if (metadatap) { + nstat_tcp_descriptor *desc = (nstat_tcp_descriptor *)metadatap; + memset(desc, 0, sizeof(*desc)); + + // Metadata from the flow registration + uuid_copy(desc->fuuid, flow_registration->registration_id); + + // Metadata that the necp client should have in TLV format. + pid_t effective_pid = client->proc_pid; + necp_find_netstat_data(client, (union necp_sockaddr_union *)&desc->remote, &effective_pid, &desc->uid, desc->euuid, &desc->persona_id, &desc->traffic_class, &desc->fallback_mode); + desc->epid = (u_int32_t)effective_pid; + + // Metadata from the flow registration + memcpy(&desc->local, &flow->local_addr, sizeof(desc->local)); + + // Metadata from the route + desc->ifindex = route_ifindex; + desc->ifnet_properties = route_ifflags | nstat_diagnostic_flags; + desc->ifnet_properties |= (sf->sf_flags & SFLOWF_ONLINK) ? 
NSTAT_IFNET_IS_LOCAL : NSTAT_IFNET_IS_NON_LOCAL; + + // Basic metadata from userland + desc->rcvbufsize = flow_stats.rx_buffer_stats.bufsize; + desc->rcvbufused = flow_stats.rx_buffer_stats.bufused; + + // Additional TCP specific data + desc->sndbufsize = flow_stats.tx_buffer_stats.bufsize; + desc->sndbufused = flow_stats.tx_buffer_stats.bufused; + desc->txunacked = tcpi->tcpi_txunacked; + desc->txwindow = tcpi->tcpi_snd_wnd; + desc->txcwindow = tcpi->tcpi_snd_cwnd; + desc->traffic_mgt_flags = 0; + desc->state = tcpi->tcpi_state; + + u_int32_t cc_alg_index = flow_stats.transport.tcp_stats.tcp_cc_algo; + if (cc_alg_index < TCP_CC_ALGO_COUNT) { + strbufcpy(desc->cc_algo, sizeof(desc->cc_algo), tcp_cc_algo_list[cc_alg_index]->name, sizeof(tcp_cc_algo_list[cc_alg_index]->name)); + } else { + strlcpy(desc->cc_algo, "unknown", sizeof(desc->cc_algo)); + } + + desc->connstatus.probe_activated = 0; + desc->connstatus.write_probe_failed = 0; + desc->connstatus.read_probe_failed = 0; + desc->connstatus.conn_probe_failed = 0; + + if (NECP_ENABLE_CLIENT_TRACE(NECP_CLIENT_TRACE_LEVEL_FLOW)) { + uuid_string_t euuid_str = { 0 }; + uuid_unparse(desc->euuid, euuid_str); + NECPLOG(LOG_NOTICE, "Collected stats - TCP - epid %d uid %d euuid %s persona id %d", desc->epid, desc->uid, euuid_str, desc->persona_id); + } + } + + return true; +} + +// Called from NetworkStatistics when it wishes to collect latest information for a TCP flow. +// It is a responsibility of NetworkStatistics to have previously zeroed any supplied memory. +static bool +necp_request_tcp_netstats(userland_stats_provider_context *ctx, + u_int32_t *ifflagsp, + nstat_progress_digest *digestp, + nstat_counts *countsp, + nstat_detailed_counts *detailed_countsp, + void *metadatap) +{ + if (ctx == NULL) { + return false; + } + + struct necp_client_flow_registration * __single flow_registration = (struct necp_client_flow_registration *)(void *)ctx; + if (__probable(!flow_registration->aop_offload)) { + return necp_request_nexus_tcp_netstats(ctx, ifflagsp, digestp, countsp, detailed_countsp, metadatap); + } else { + return necp_request_aop_tcp_netstats(ctx, ifflagsp, digestp, countsp, detailed_countsp, metadatap); + } +} + // Called from NetworkStatistics when it wishes to collect latest information for a UDP flow. static bool necp_request_udp_netstats(userland_stats_provider_context *ctx, u_int32_t *ifflagsp, nstat_progress_digest *digestp, nstat_counts *countsp, + nstat_detailed_counts *detailed_countsp, void *metadatap) { #pragma unused(digestp) @@ -6730,7 +7256,7 @@ necp_request_udp_netstats(userland_stats_provider_context *ctx, if (ifflagsp) { *ifflagsp = route_ifflags | nstat_diagnostic_flags; *ifflagsp |= (sf->sf_flags & SFLOWF_ONLINK) ? 
NSTAT_IFNET_IS_LOCAL : NSTAT_IFNET_IS_NON_LOCAL; - if ((countsp == NULL) && (metadatap == NULL)) { + if ((digestp == NULL) && (countsp == NULL) && (detailed_countsp == NULL) && (metadatap == NULL)) { return true; } } @@ -6748,6 +7274,8 @@ necp_request_udp_netstats(userland_stats_provider_context *ctx, countsp->nstat_var_rtt = udpstats->necp_udp_counts.necp_stat_var_rtt; // Supplement what the user level has told us with what we know from the flowswitch + // The nstat_counts structure has only one set of packet counts so set them from the + // trusted flowswitch as clients may use them to calculate header overhead for cell/wifi/wired counts countsp->nstat_rxpackets = sf->sf_ipackets; countsp->nstat_txpackets = sf->sf_opackets; if (route_ifflags & NSTAT_IFNET_IS_CELLULAR) { @@ -6762,6 +7290,34 @@ necp_request_udp_netstats(userland_stats_provider_context *ctx, } } + if (detailed_countsp) { + detailed_countsp->nstat_media_stats.ms_total.ts_rxbytes = udpstats->necp_udp_counts.necp_stat_rxbytes; + detailed_countsp->nstat_media_stats.ms_total.ts_txbytes = udpstats->necp_udp_counts.necp_stat_txbytes; + detailed_countsp->nstat_media_stats.ms_total.ts_rxpackets = udpstats->necp_udp_counts.necp_stat_rxpackets; + detailed_countsp->nstat_media_stats.ms_total.ts_txpackets = udpstats->necp_udp_counts.necp_stat_txpackets; + + detailed_countsp->nstat_rxduplicatebytes = udpstats->necp_udp_counts.necp_stat_rxduplicatebytes; + detailed_countsp->nstat_rxoutoforderbytes = udpstats->necp_udp_counts.necp_stat_rxoutoforderbytes; + detailed_countsp->nstat_txretransmit = udpstats->necp_udp_counts.necp_stat_txretransmit; + + detailed_countsp->nstat_min_rtt = udpstats->necp_udp_counts.necp_stat_min_rtt; + detailed_countsp->nstat_avg_rtt = udpstats->necp_udp_counts.necp_stat_avg_rtt; + detailed_countsp->nstat_var_rtt = udpstats->necp_udp_counts.necp_stat_var_rtt; + + // Supplement what the user level has told us with what we know from the flowswitch + // The user level statistics don't include a bitmap so use the one within the kernel, + memcpy(&detailed_countsp->nstat_media_stats.ms_total.ts_bitmap, &sf->sf_activity, sizeof(sf->sf_activity)); + + struct traffic_stats *ts = media_stats_embedded_ts(&detailed_countsp->nstat_media_stats, route_ifflags); + if (ts) { + ts->ts_rxpackets = sf->sf_ipackets; + ts->ts_txpackets = sf->sf_opackets; + ts->ts_rxbytes = sf->sf_ibytes; + ts->ts_txbytes = sf->sf_obytes; + memcpy(&ts->ts_bitmap, &sf->sf_activity, sizeof(sf->sf_activity)); + } + } + if (metadatap) { nstat_udp_descriptor *desc = (nstat_udp_descriptor *)metadatap; memset(desc, 0, sizeof(*desc)); @@ -6814,6 +7370,7 @@ necp_request_quic_netstats(userland_stats_provider_context *ctx, u_int32_t *ifflagsp, nstat_progress_digest *digestp, nstat_counts *countsp, + nstat_detailed_counts *detailed_countsp, void *metadatap) { if (ctx == NULL) { @@ -6858,7 +7415,10 @@ necp_request_quic_netstats(userland_stats_provider_context *ctx, if (ifflagsp) { *ifflagsp = route_ifflags | nstat_diagnostic_flags; *ifflagsp |= (sf->sf_flags & SFLOWF_ONLINK) ? 
NSTAT_IFNET_IS_LOCAL : NSTAT_IFNET_IS_NON_LOCAL; - if ((digestp == NULL) && (countsp == NULL) && (metadatap == NULL)) { + if (quicstats->necp_quic_extra.fallback) { + *ifflagsp |= NSTAT_IFNET_VIA_CELLFALLBACK; + } + if ((digestp == NULL) && (countsp == NULL) && (detailed_countsp == NULL) && (metadatap == NULL)) { return true; } } @@ -6898,6 +7458,8 @@ necp_request_quic_netstats(userland_stats_provider_context *ctx, // TODO: It would be good to expose QUIC stats for CH/SH retransmission and connection state // Supplement what the user level has told us with what we know from the flowswitch + // The nstat_counts structure has only one set of packet counts so set them from the + // trusted flowswitch as clients may use them to calculate header overhead for cell/wifi/wired counts countsp->nstat_rxpackets = sf->sf_ipackets; countsp->nstat_txpackets = sf->sf_opackets; if (route_ifflags & NSTAT_IFNET_IS_CELLULAR) { @@ -6912,6 +7474,34 @@ necp_request_quic_netstats(userland_stats_provider_context *ctx, } } + if (detailed_countsp) { + detailed_countsp->nstat_media_stats.ms_total.ts_rxbytes = quicstats->necp_quic_counts.necp_stat_rxbytes; + detailed_countsp->nstat_media_stats.ms_total.ts_txbytes = quicstats->necp_quic_counts.necp_stat_txbytes; + detailed_countsp->nstat_media_stats.ms_total.ts_rxpackets = quicstats->necp_quic_counts.necp_stat_rxpackets; + detailed_countsp->nstat_media_stats.ms_total.ts_txpackets = quicstats->necp_quic_counts.necp_stat_txpackets; + + detailed_countsp->nstat_rxduplicatebytes = quicstats->necp_quic_counts.necp_stat_rxduplicatebytes; + detailed_countsp->nstat_rxoutoforderbytes = quicstats->necp_quic_counts.necp_stat_rxoutoforderbytes; + detailed_countsp->nstat_txretransmit = quicstats->necp_quic_counts.necp_stat_txretransmit; + + detailed_countsp->nstat_min_rtt = quicstats->necp_quic_counts.necp_stat_min_rtt; + detailed_countsp->nstat_avg_rtt = quicstats->necp_quic_counts.necp_stat_avg_rtt; + detailed_countsp->nstat_var_rtt = quicstats->necp_quic_counts.necp_stat_var_rtt; + + // Supplement what the user level has told us with what we know from the flowswitch + // The user level statistics don't include a bitmap so use the one within the kernel, + memcpy(&detailed_countsp->nstat_media_stats.ms_total.ts_bitmap, &sf->sf_activity, sizeof(sf->sf_activity)); + + struct traffic_stats *ts = media_stats_embedded_ts(&detailed_countsp->nstat_media_stats, route_ifflags); + if (ts) { + ts->ts_rxpackets = sf->sf_ipackets; + ts->ts_txpackets = sf->sf_opackets; + ts->ts_rxbytes = sf->sf_ibytes; + ts->ts_txbytes = sf->sf_obytes; + memcpy(&ts->ts_bitmap, &sf->sf_activity, sizeof(sf->sf_activity)); + } + } + if (metadatap) { nstat_quic_descriptor *desc = (nstat_quic_descriptor *)metadatap; memset(desc, 0, sizeof(*desc)); @@ -6936,6 +7526,10 @@ necp_request_quic_netstats(userland_stats_provider_context *ctx, desc->ifindex = route_ifindex; desc->ifnet_properties = route_ifflags | nstat_diagnostic_flags; desc->ifnet_properties |= (sf->sf_flags & SFLOWF_ONLINK) ? 
NSTAT_IFNET_IS_LOCAL : NSTAT_IFNET_IS_NON_LOCAL; + if (quicstats->necp_quic_extra.fallback) { + desc->ifnet_properties |= NSTAT_IFNET_VIA_CELLFALLBACK; + desc->fallback_mode = SO_FALLBACK_MODE_FAST; + } // Basic metadata from userland desc->rcvbufsize = quicstats->necp_quic_basic.rcvbufsize; @@ -7093,6 +7687,7 @@ static bool necp_request_conn_netstats(nstat_provider_context ctx, u_int32_t *ifflagsp, nstat_counts *countsp, + nstat_detailed_counts *detailsp, void *metadatap) { if (ctx == NULL) { @@ -7107,6 +7702,9 @@ necp_request_conn_netstats(nstat_provider_context ctx, if (countsp) { memset(countsp, 0, sizeof(*countsp)); } + if (detailsp) { + memset(detailsp, 0, sizeof(*detailsp)); + } if (desc) { memset(desc, 0, sizeof(*desc)); // Metadata, that the necp client should have, in TLV format. @@ -7243,6 +7841,7 @@ necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client int error = 0; struct necp_client * __single client = NULL; const size_t buffer_size = uap->buffer_size; + const task_t __single task = proc_task(p); if (fd_data->flags & NECP_OPEN_FLAG_PUSH_OBSERVER) { NECPLOG0(LOG_ERR, "NECP client observers with push enabled may not add their own clients"); @@ -7332,6 +7931,24 @@ necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client } } + if (parse_error == 0 && (parsed_parameters.extended_flags & NECP_CLIENT_PARAMETER_EXTENDED_FLAG_AOP2_OFFLOAD)) { + bool has_aop_offload_entitlement = IOTaskHasEntitlement(task, "com.apple.private.network.aop2_offload"); + if (!has_aop_offload_entitlement) { + NECPLOG(LOG_ERR, "%s(%d) does not hold the necessary entitlement for aop offload", + proc_name_address(p), proc_pid(p)); + error = EPERM; + goto done; + } + + if ((parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_MULTIPATH) || + (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_BROWSE) || + (parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER)) { + NECPLOG0(LOG_INFO, "necp_client_add, aop_offload not supported for multipath/listener"); + error = EINVAL; + goto done; + } + } + if (parse_error == 0 && parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER && (parsed_parameters.ip_protocol == IPPROTO_TCP || parsed_parameters.ip_protocol == IPPROTO_UDP)) { uint32_t *netns_addr = NULL; @@ -7767,7 +8384,8 @@ necp_client_remove_flow(struct necp_fd_data *fd_data, struct necp_client_action_ if (flow_registration != NULL && client != NULL) { NECP_CLIENT_LOCK(client); if (flow_registration->client == client) { - necp_destroy_client_flow_registration(client, flow_registration, fd_data->proc_pid, false); + bool abort = (flow_registration->aop_offload) ? 
true : false; + necp_destroy_client_flow_registration(client, flow_registration, fd_data->proc_pid, abort); } necp_client_release_locked(client); NECP_CLIENT_UNLOCK(client); @@ -7826,19 +8444,7 @@ necp_client_check_tcp_heuristics(struct necp_client *client, struct necp_client_ break; } - if (client->current_route != NULL) { - if (client->current_route->rt_ifp->if_eflags & IFEF_ECN_ENABLE) { - check_ecn = true; - break; - } - if (client->current_route->rt_ifp->if_eflags & IFEF_ECN_DISABLE) { - break; - } - } - - bool inbound = ((parsed_parameters.flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) == 0); - if ((inbound && tcp_ecn_inbound == 1) || - (!inbound && tcp_ecn_outbound == 1)) { + if (tcp_ecn == 1) { check_ecn = true; } } while (false); @@ -7892,6 +8498,131 @@ necp_client_calculate_flow_tlv_size(struct necp_client_flow_registration *flow_r return assigned_results_size; } +static errno_t +necp_client_destination_mac_address(struct sockaddr *remote, uint32_t index, + struct ether_addr *remote_mac) +{ + struct rtentry *rt = NULL; + struct rtentry *tgt_rt = NULL; + struct rtentry *__single gwrt = NULL; + errno_t err = 0; + + ASSERT(remote_mac != NULL); + ASSERT(remote != NULL); + + rt = rtalloc1_scoped(remote, 0, 0, index); + if (rt == NULL) { + return ENOENT; + } + + if (IS_DIRECT_HOSTROUTE(rt)) { + tgt_rt = rt; + } else { + err = route_to_gwroute(remote, rt, &gwrt); + if (err != 0) { + goto done; + } + + ASSERT(gwrt != NULL); + RT_LOCK_ASSERT_HELD(gwrt); + tgt_rt = gwrt; + } + + if ((tgt_rt->rt_flags & RTF_HOST) && + (tgt_rt->rt_flags & RTF_LLINFO) && + (tgt_rt->rt_gateway->sa_family == AF_LINK) && + (SDL(tgt_rt->rt_gateway)->sdl_alen == ETHER_ADDR_LEN)) { + struct sockaddr_dl *__bidi_indexable sdl = + (struct sockaddr_dl *__bidi_indexable)SDL(tgt_rt->rt_gateway); + bcopy(LLADDR(sdl), remote_mac->octet, ETHER_ADDR_LEN); + } else { + err = ENOENT; + } +done: + if (gwrt != NULL) { + RT_UNLOCK(gwrt); + rtfree(gwrt); + gwrt = NULL; + } + + if (rt != NULL) { + rtfree(rt); + rt = NULL; + } + + return err; +} + +static uint8_t * +__sized_by(*buflen) +necp_client_flow_mac_and_gateway(struct necp_client_flow *flow, size_t *buflen) +{ + u_int8_t * __indexable buffer = NULL; + u_int8_t * __indexable cursor = NULL; + size_t valsize = 0; + + ASSERT(flow != NULL); + ASSERT(buflen != NULL); + + *buflen = 0; + + ifnet_t ifp = NULL; + ifnet_head_lock_shared(); + if (flow->interface_index != IFSCOPE_NONE && flow->interface_index <= if_index) { + ifp = ifindex2ifnet[flow->interface_index]; + } + ifnet_head_done(); + + if (ifp == NULL) { + NECPLOG0(LOG_ERR, "necp_client_flow_mac_and_gateway: ifp is NULL"); + return NULL; + } + + if (!IFNET_IS_ETHERNET(ifp)) { + return NULL; + } + + /* local MAC */ + struct ether_addr local_ether = {}; + bool local_ether_set = false; + if (ifnet_lladdr_copy_bytes(ifp, local_ether.octet, ETHER_ADDR_LEN) == 0) { + local_ether_set = true; + valsize += sizeof(struct necp_tlv_header) + sizeof(struct ether_addr); + } + + /*remote MAC */ + struct ether_addr remote_ether = {}; + bool remote_ether_set = false; + if (necp_client_destination_mac_address(SA(&flow->remote_addr), + flow->interface_index, &remote_ether) == 0) { + remote_ether_set = true; + valsize += sizeof(struct necp_tlv_header) + sizeof(struct ether_addr); + } + + if (valsize == 0) { + return NULL; + } + + buffer = kalloc_data(valsize, Z_WAITOK | Z_ZERO); + if (buffer == NULL) { + return NULL; + } + + cursor = buffer; + if (local_ether_set) { + cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_LOCAL_ETHER_ADDR, + 
sizeof(struct ether_addr), (uint8_t *)(struct ether_addr * __bidi_indexable)&local_ether, + buffer, valsize); + } + if (remote_ether_set) { + cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_REMOTE_ETHER_ADDR, + sizeof(struct ether_addr), (uint8_t *)(struct ether_addr * __bidi_indexable)&remote_ether, + buffer, valsize); + } + *buflen = valsize; + return buffer; +} + static int necp_client_fillout_flow_tlvs(struct necp_client *client, bool client_is_observed, @@ -7909,6 +8640,8 @@ necp_client_fillout_flow_tlvs(struct necp_client *client, u_int32_t flags = 0; u_int8_t tfo_cookie_len = 0; u_int8_t type = 0; + size_t buflen = 0; + uint8_t *buffer = NULL; type = NECP_CLIENT_RESULT_FLOW_ID; length = sizeof(header.flow_header.flow_id); @@ -7994,6 +8727,14 @@ necp_client_fillout_flow_tlvs(struct necp_client *client, if (flow->has_protoctl_event) { length += sizeof(struct necp_client_flow_protoctl_event_header); } + if (flow->nexus && flow->aop_offload) { + buffer = necp_client_flow_mac_and_gateway(flow, &buflen); + length += buflen; + + if (flow->aop_stat_index_valid) { + length += sizeof(struct necp_client_flow_stats_index_header); + } + } header.flow_header.outer_header.type = type; header.flow_header.outer_header.length = length; @@ -8040,6 +8781,41 @@ necp_client_fillout_flow_tlvs(struct necp_client *client, flow->protoctl_event.protoctl_event_val = 0; flow->protoctl_event.protoctl_event_tcp_seq_num = 0; } + + if (flow->nexus && flow->aop_offload) { + if (buffer != NULL) { + ASSERT(buflen > 0); + error = copyout(buffer, uap->buffer + client->result_length + *assigned_results_cursor, + buflen); + *assigned_results_cursor += buflen; + kfree_data_counted_by(buffer, buflen); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy mac address results" + " tlv_header copyout error (%d)", error); + return error; + } + } + + if (flow->aop_stat_index_valid) { + struct necp_client_flow_stats_index_header flow_stats_header = {}; + + type = NECP_CLIENT_RESULT_FLOW_STATS_INDEX; + length = sizeof(flow_stats_header.stats_index); + + flow_stats_header.stats_index_tlv_header.type = type; + flow_stats_header.stats_index_tlv_header.length = length; + flow_stats_header.stats_index = flow->stats_index; + + error = copyout(&flow_stats_header, uap->buffer + + client->result_length + *assigned_results_cursor, sizeof(flow_stats_header)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy flow stats index " + "tlv header copyout error (%d)", error); + return error; + } + *assigned_results_cursor += sizeof(flow_stats_header); + } + } } } if (!client_is_observed) { @@ -8460,6 +9236,12 @@ necp_client_copy_parameters_locked(struct necp_client *client, } } } + + if (parsed_parameters.valid_fields & NECP_PARSED_PARAMETERS_FIELD_EXTENDED_FLAGS) { + if (parsed_parameters.extended_flags & NECP_CLIENT_PARAMETER_EXTENDED_FLAG_AOP2_OFFLOAD) { + parameters->use_aop_offload = true; + } + } } #endif // SKYWALK @@ -8648,8 +9430,19 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg new_registration->flags = add_request->flags; + // If NECP_CLIENT_FLOW_FLAGS_OPEN_FLOW_ON_BEHALF_OF_CLIENT is set, then set registration_id_to_add to the old + // value in add_request->registration_id, otherwise use the new value in new_registration->registration_id. 
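/*
 * [Editor's illustrative sketch -- not part of the imported patch.] The comment
 * above describes how the registration id handed to the nexus agent is chosen
 * when NECP_CLIENT_FLOW_FLAGS_OPEN_FLOW_ON_BEHALF_OF_CLIENT is set. Below is a
 * minimal standalone restatement of that rule, assuming only the uuid(3)
 * helpers; the function name pick_registration_id() is hypothetical and exists
 * purely for illustration of the selection logic implemented just after this.
 */
#include <stdbool.h>
#include <uuid/uuid.h>

static void
pick_registration_id(bool open_on_behalf_of_client, const uuid_t requested_id,
    const uuid_t newly_minted_id, uuid_t id_to_register)
{
	/*
	 * Reuse the caller-supplied id only when acting on behalf of the client
	 * and the caller actually provided one; otherwise fall back to the id
	 * minted for the new flow registration.
	 */
	if (open_on_behalf_of_client && !uuid_is_null(requested_id)) {
		uuid_copy(id_to_register, requested_id);
	} else {
		uuid_copy(id_to_register, newly_minted_id);
	}
}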
+ bool open_flow_on_behalf_of_client = (add_request->flags & NECP_CLIENT_FLOW_FLAGS_OPEN_FLOW_ON_BEHALF_OF_CLIENT); + uuid_t registration_id_to_add = {}; + if (open_flow_on_behalf_of_client && !uuid_is_null(add_request->registration_id)) { + uuid_copy(registration_id_to_add, add_request->registration_id); + } else { + uuid_copy(registration_id_to_add, new_registration->registration_id); + } + // Copy new ID out to caller uuid_copy(add_request->registration_id, new_registration->registration_id); + new_registration->aop_offload = parameters.use_aop_offload; NECP_CLIENT_FLOW_LOG(client, new_registration, "adding flow"); @@ -8666,6 +9459,18 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg override_address->sa_len <= sizeof(parameters.remote_addr)) { SOCKADDR_COPY(override_address, ¶meters.remote_addr, override_address->sa_len); trailer_offset += override_address->sa_len; + + // Clear out any local address if the remote address is overridden + if (parameters.remote_addr.sa.sa_family == AF_INET) { + parameters.local_addr.sin.sin_family = AF_INET; + parameters.local_addr.sin.sin_len = sizeof(struct sockaddr_in); + parameters.local_addr.sin.sin_addr.s_addr = 0; + } else if (parameters.remote_addr.sa.sa_family == AF_INET6) { + parameters.local_addr.sin6.sin6_family = AF_INET6; + parameters.local_addr.sin6.sin6_len = sizeof(struct sockaddr_in6); + memset((uint8_t *)¶meters.local_addr.sin6.sin6_addr, 0, sizeof(struct in6_addr)); + parameters.local_addr.sin6.sin6_scope_id = 0; + } } else { override_address = NULL; } @@ -8681,6 +9486,13 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg } } + // If opening the flow on behalf of the client, then replace the pid and parameters.pid with the effective PID + // so that the client's PID is used for this flow instead of the PID of the process making the requests. + if (open_flow_on_behalf_of_client) { + parameters.pid = parameters.epid; + pid = parameters.epid; + } + #if SKYWALK if (add_request->flags & NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS) { size_t assigned_results_length = 0; @@ -8710,12 +9522,12 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg if (!found_nexus) { NECPLOG0(LOG_ERR, "Requested nexus not found"); } else { - necp_client_add_nexus_flow_if_needed(new_registration, add_request->agent_uuid, interface_index); + necp_client_add_nexus_flow_if_needed(new_registration, add_request->agent_uuid, interface_index, parameters.use_aop_offload); error = netagent_client_message_with_params(add_request->agent_uuid, ((new_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ? 
client->client_id : - new_registration->registration_id), + registration_id_to_add), pid, client->agent_handle, NETAGENT_MESSAGE_TYPE_REQUEST_NEXUS, (struct necp_client_agent_parameters *)¶meters, @@ -8744,7 +9556,7 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg void *message = necp_create_nexus_assign_message(empty_uuid, 0, NULL, 0, (struct necp_client_endpoint *)&flow->local_addr, (struct necp_client_endpoint *)&flow->remote_addr, - NULL, 0, NULL, &message_length); + NULL, 0, NULL, 0, &message_length); flow->assigned_results = message; flow->assigned_results_length = message_length; } @@ -8773,6 +9585,11 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg stats_request->stats_size = bufreq.necp_stats_bufreq_size; stats_request->stats_addr = bufreq.necp_stats_bufreq_uaddr; } + + if (error == 0 && parameters.use_aop_offload) { + error = necp_aop_offload_stats_initialize( + new_registration, add_request->agent_uuid); + } #endif /* !SKYWALK */ if (error == 0 && @@ -8798,7 +9615,7 @@ necp_client_add_flow(struct necp_fd_data *fd_data, struct necp_client_action_arg } ifnet_head_done(); - necp_client_add_nexus_flow_if_needed(new_registration, add_request->agent_uuid, interface_index); + necp_client_add_nexus_flow_if_needed(new_registration, add_request->agent_uuid, interface_index, parameters.use_aop_offload); size_t dummy_length = 0; void * __sized_by(dummy_length) dummy_results = NULL; @@ -8954,7 +9771,7 @@ necp_client_request_nexus(struct necp_fd_data *fd_data, struct necp_client_actio new_registration->flags = (NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS | NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID); - necp_client_add_nexus_flow_if_needed(new_registration, nexus_uuid, interface_index); + necp_client_add_nexus_flow_if_needed(new_registration, nexus_uuid, interface_index, parameters.use_aop_offload); // Note: Any clients using "request_nexus" are not flow-registration aware. 
// Register the Client ID rather than the Registration ID with the nexus, since @@ -9447,6 +10264,10 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl if (IS_INTF_CLAT46(interface)) { interface_details.flags |= NECP_INTERFACE_FLAG_HAS_NAT64; } + if (interface->if_xflags & IFXF_LOW_POWER_WAKE) { + interface_details.flags |= NECP_INTERFACE_FLAG_LOW_POWER_WAKE; + } + interface_details.l4s_mode = interface->if_l4s_mode; interface_details.mtu = interface->if_mtu; #if SKYWALK fsw_get_tso_capabilities(interface, &interface_details.tso_max_segment_size_v4, @@ -10214,6 +11035,51 @@ necp_stats_initialize(struct necp_fd_data *fd_data, return error; } +static int +necp_aop_offload_stats_initialize(struct necp_client_flow_registration *flow_registration, + uuid_t netagent_uuid) +{ + int error = 0; + + struct necp_client_flow *flow = NULL; + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { + // Verify that the client nexus agent matches + if (flow->nexus && + uuid_compare(flow->u.nexus_agent, netagent_uuid) == 0) { + ASSERT(flow->flow_tag != 0); + ASSERT(flow->aop_offload); + + error = net_aop_setup_flow(flow->flow_tag, + true, &flow->stats_index); + if (error != 0) { + NECPLOG(LOG_ERR, "failed to setup aop flow " + "stats area, error %d", error); + } else { + flow->aop_stat_index_valid = true; + } + break; + } + } + + return error; +} + +static void +necp_aop_offload_stats_destroy(struct necp_client_flow *flow) +{ + int error = 0; + + if (flow->flow_tag != 0 && flow->aop_stat_index_valid) { + error = net_aop_setup_flow(flow->flow_tag, + false, &flow->stats_index); + if (error != 0) { + NECPLOG(LOG_ERR, "failed to cleanup aop offload stats with error %d", error); + } + flow->aop_stat_index_valid = false; + } + return; +} + static NECP_CLIENT_ACTION_FUNCTION int necp_client_map_sysctls(__unused struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { @@ -10291,9 +11157,15 @@ necp_client_copy_route_statistics(__unused struct necp_fd_data *fd_data, struct route_stats.necp_stat_txretransmit = rt_stats->nstat_txretransmit; route_stats.necp_stat_connectattempts = rt_stats->nstat_connectattempts; route_stats.necp_stat_connectsuccesses = rt_stats->nstat_connectsuccesses; - route_stats.necp_stat_min_rtt = rt_stats->nstat_min_rtt; - route_stats.necp_stat_avg_rtt = rt_stats->nstat_avg_rtt; - route_stats.necp_stat_var_rtt = rt_stats->nstat_var_rtt; + if (__probable(necp_client_stats_use_route_metrics == 0)) { + route_stats.necp_stat_min_rtt = rt_stats->nstat_min_rtt; + route_stats.necp_stat_avg_rtt = rt_stats->nstat_avg_rtt; + route_stats.necp_stat_var_rtt = rt_stats->nstat_var_rtt; + } else { + route_stats.necp_stat_min_rtt = client->current_route->rtt_min; + route_stats.necp_stat_avg_rtt = client->current_route->rt_rmx.rmx_rtt; + route_stats.necp_stat_var_rtt = client->current_route->rt_rmx.rmx_rttvar; + } route_stats.necp_stat_route_flags = client->current_route->rt_flags; } @@ -10768,6 +11640,106 @@ done: return error; } +static int +necp_client_copy_flow_stats(struct necp_client_flow_registration *flow_registration, + struct necp_flow_statistics *flow_stats) +{ + struct aop_flow_stats aop_flow_stats = {}; + int error = 0; + + struct necp_client_flow *flow = LIST_FIRST(&flow_registration->flow_list); + if (flow == NULL || !flow->aop_offload || !flow->aop_stat_index_valid) { + NECPLOG0(LOG_ERR, "necp_client_copy_flow_stats only supported for aop flows"); + return EINVAL; + } + error = net_aop_get_flow_stats(flow->stats_index, 
&aop_flow_stats); + if (error != 0) { + NECPLOG(LOG_ERR, "net_aop_get_flow_stats failed (%d)", error); + return error; + } + + if (flow_stats->transport_proto == IPPROTO_TCP) { + struct tcp_info *tcpi = &flow_stats->transport.tcpi; + struct tcp_info *a_tcpi = &aop_flow_stats.transport.tcp_stats.tcp_info; + memcpy(tcpi, a_tcpi, sizeof(*tcpi)); + } + + return 0; +} + +static NECP_CLIENT_ACTION_FUNCTION int +necp_client_get_flow_statistics(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) +{ + int error = 0; + uuid_t flow_id = {}; + struct necp_flow_statistics flow_stats = {}; + + if (uap->client_id == 0 || uap->client_id_len != sizeof(uuid_t)) { + error = EINVAL; + NECPLOG(LOG_ERR, "necp_client_remove_flow invalid client_id (length %zu)", (size_t)uap->client_id_len); + goto done; + } + + error = copyin(uap->client_id, flow_id, sizeof(uuid_t)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_get_flow_statistics copyin client_id error (%d)", error); + goto done; + } + + if (uap->buffer_size < sizeof(flow_stats) || uap->buffer == 0) { + error = EINVAL; + goto done; + } + + error = copyin(uap->buffer, &flow_stats, sizeof(flow_stats)); + if (error) { + NECPLOG(LOG_ERR, "necp_client_get_flow_statistics copyin protocol error (%d)", error); + goto done; + } + + if (flow_stats.transport_proto != IPPROTO_TCP) { + NECPLOG(LOG_ERR, "necp_client_get_flow_statistics, transport proto %u not supported", + flow_stats.transport_proto); + error = ENOTSUP; + goto done; + } + + NECP_FD_LOCK(fd_data); + struct necp_client *client = NULL; + struct necp_client_flow_registration *flow_registration = necp_client_fd_find_flow(fd_data, flow_id); + if (flow_registration != NULL) { + client = flow_registration->client; + if (client != NULL) { + necp_client_retain(client); + } + } + NECP_FD_UNLOCK(fd_data); + + if (flow_registration != NULL && client != NULL) { + NECP_CLIENT_LOCK(client); + if (flow_registration->client == client) { + error = necp_client_copy_flow_stats(flow_registration, &flow_stats); + if (error == 0) { + error = copyout(&flow_stats, uap->buffer, sizeof(flow_stats)); + if (error != 0) { + NECPLOG(LOG_ERR, "necp_client_get_flow_statistics copyout failed (%d)", error); + } + } + } + + necp_client_release_locked(client); + NECP_CLIENT_UNLOCK(client); + } + +done: + *retval = error; + if (error != 0) { + NECPLOG(LOG_ERR, "get flow statistics error (%d)", error); + } + + return error; +} + int necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *retval) { @@ -10892,6 +11864,10 @@ necp_client_action(struct proc *p, struct necp_client_action_args *uap, int *ret return_value = necp_client_set_signed_client_id(fd_data, uap, retval); break; } + case NECP_CLIENT_ACTION_GET_FLOW_STATISTICS: { + return_value = necp_client_get_flow_statistics(fd_data, uap, retval); + break; + } default: { NECPLOG(LOG_ERR, "necp_client_action unknown action (%u)", action); return_value = EINVAL; @@ -11394,7 +12370,7 @@ void * __sized_by(*message_length) necp_create_nexus_assign_message(uuid_t nexus_instance, nexus_port_t nexus_port, void * __sized_by(key_length) key, uint32_t key_length, struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint, struct ether_addr *local_ether_addr, - u_int32_t flow_adv_index, void *flow_stats, size_t *message_length) + u_int32_t flow_adv_index, void *flow_stats, uint32_t flow_id, size_t *message_length) { u_int8_t * __indexable buffer = NULL; u_int8_t * __indexable cursor = NULL; @@ -11424,6 +12400,9 @@ 
necp_create_nexus_assign_message(uuid_t nexus_instance, nexus_port_t nexus_port, if (flow_stats != NULL) { valsize += sizeof(struct necp_tlv_header) + sizeof(void *); } + if (flow_id != 0) { + valsize += sizeof(struct necp_tlv_header) + sizeof(u_int32_t); + } if (valsize == 0) { *message_length = 0; return NULL; @@ -11458,6 +12437,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, nexus_port_t nexus_port, if (flow_stats != NULL) { cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_NEXUS_FLOW_STATS, sizeof(void *), &flow_stats, buffer, valsize); } + if (flow_id != 0) { + cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_UNIQUE_FLOW_TAG, sizeof(u_int32_t), &flow_id, buffer, valsize); + } *message_length = valsize; diff --git a/bsd/net/net_private.modulemap b/bsd/net/net_private.modulemap index 85ae2dbe2..9a875ca41 100644 --- a/bsd/net/net_private.modulemap +++ b/bsd/net/net_private.modulemap @@ -1,4 +1,16 @@ module DarwinPrivate.net [system] { + #ifndef XNU_PLATFORM_MacOSX + module bpf { + header "net/bpf.h" + export * + } + + #endif + explicit module bpf_private { + header "net/bpf_private.h" + export * + } + module if_ipsec { header "net/if_ipsec.h" export * diff --git a/bsd/net/net_stubs.c b/bsd/net/net_stubs.c index f119d09fa..74a3f18d5 100644 --- a/bsd/net/net_stubs.c +++ b/bsd/net/net_stubs.c @@ -108,6 +108,7 @@ STUB(ifnet_free_address_list); STUB(ifnet_free_multicast_list); STUB(ifnet_get_address_list); STUB(ifnet_get_address_list_family); +STUB(ifnet_get_inband_wake_packet_tagging); STUB(ifnet_get_link_mib_data); STUB(ifnet_get_link_mib_data_length); STUB(ifnet_get_multicast_list); @@ -143,6 +144,8 @@ STUB(ifnet_set_delegate); STUB(ifnet_set_eflags); STUB(ifnet_set_flags); STUB(ifnet_set_hdrlen); +STUB(ifnet_set_inband_wake_packet_tagging); +STUB(ifnet_set_low_power_wake); STUB(ifnet_set_link_mib_data); STUB(ifnet_set_lladdr); STUB(ifnet_set_metric); @@ -368,7 +371,10 @@ STUB(ifnet_get_unsent_bytes); STUB(ifnet_get_buffer_status); STUB(ifnet_normalise_unsent_data); STUB(ifnet_set_low_power_mode); +STUB(ifnet_set_rx_flow_steering); +STUB(ifnet_get_rx_flow_steering); STUB(ifnet_notify_tcp_keepalive_offload_timeout); +STUB(ifnet_enable_cellular_thread_group); STUB(in6_localaddr); STUB(in_localaddr); STUB(in6addr_local); @@ -411,6 +417,8 @@ STUB(pffinddomain); STUB(pffinddomain_old); STUB(pffindproto); STUB(pffindproto_old); +STUB(pktap_input_packet); +STUB(pktap_output_packet); STUB(pru_abort_notsupp); STUB(pru_accept_notsupp); STUB(pru_bind_notsupp); diff --git a/bsd/net/network_agent.c b/bsd/net/network_agent.c index 492e61ebe..86de159f4 100644 --- a/bsd/net/network_agent.c +++ b/bsd/net/network_agent.c @@ -1065,6 +1065,7 @@ netagent_handle_register_inner(struct netagent_session *session, struct netagent struct netagent_registration *existing_registration = netagent_find_agent_with_uuid_and_lock(new_registration->netagent->netagent_uuid, false, true); if (existing_registration != NULL) { + NETAGENTLOG0(LOG_ERR, "Existing agent registration UUID conflicts with new agent registration"); NETAGENT_SESSION_UNLOCK(session); NETAGENT_LIST_UNLOCK(); return EEXIST; diff --git a/bsd/net/network_agent.h b/bsd/net/network_agent.h index 84edc1153..e6ee17bd6 100644 --- a/bsd/net/network_agent.h +++ b/bsd/net/network_agent.h @@ -182,8 +182,10 @@ struct netagent_session_assign_nexus_message { #define NETAGENT_NEXUS_ENDPOINT_TYPE_SRV 5 #define NETAGENT_NEXUS_FLAG_SUPPORTS_USER_PACKET_POOL 0x1 -#define NETAGENT_NEXUS_FLAG_ASSERT_UNSUPPORTED 0x2 // No calls to assert the agent 
are required +#define NETAGENT_NEXUS_FLAG_ASSERT_UNSUPPORTED 0x2 // No calls to assert the agent are required #define NETAGENT_NEXUS_FLAG_SHOULD_USE_EVENT_RING 0x4 // indicates that nexus agent should use event rings +#define NETAGENT_NEXUS_FLAG_COMPLETE_RESOLVE_ON_CONNECT 0x8 // Indicates resolver should mark itself as complete once it has a connected child +#define NETAGENT_NEXUS_FLAG_CANCEL_REMOVED_ENDPOINTS 0x10 // Indicates resolver should cancel connection attempts to endpoints removed from resolved endpoints list struct netagent_nexus { u_int32_t frame_type; diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c index 7319f164a..44840aac4 100644 --- a/bsd/net/ntstat.c +++ b/bsd/net/ntstat.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -123,6 +124,10 @@ static struct nstat_stats nstat_stats; SYSCTL_STRUCT(_net_stats, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &nstat_stats, nstat_stats, ""); +static struct nstat_global_counts nstat_global_counts; +SYSCTL_STRUCT(_net_stats, OID_AUTO, global_counts, CTLFLAG_RD | CTLFLAG_LOCKED, + &nstat_global_counts, nstat_global_counts, ""); + static u_int32_t nstat_lim_interval = 30 * 60; /* Report interval, seconds */ static u_int32_t nstat_lim_min_tx_pkts = 100; static u_int32_t nstat_lim_min_rx_pkts = 100; @@ -141,6 +146,7 @@ SYSCTL_INT(_net_stats, OID_AUTO, lim_min_rx_pkts, #endif /* DEBUG || DEVELOPMENT */ static int ntstat_progress_indicators(struct sysctl_req *); +static int ntstat_get_metrics(struct sysctl_req *); static struct net_api_stats net_api_stats_before; static u_int64_t net_api_stats_last_report_time; #define NET_API_STATS_REPORT_INTERVAL (12 * 60 * 60) /* 12 hours, in seconds */ @@ -177,6 +183,7 @@ enum{ NSTAT_FLAG_REQCOUNTS = (1 << 1), NSTAT_FLAG_SUPPORTS_UPDATES = (1 << 2), NSTAT_FLAG_SYSINFO_SUBSCRIBED = (1 << 3), + NSTAT_FLAG_SUPPORTS_DETAILS = (1 << 4), }; static int @@ -204,9 +211,260 @@ SYSCTL_PROC(_net_stats, OID_AUTO, progress, #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n)) #endif +/* + * A note on data structures + * + * Each user space "client" of NetworkStatistics is represented by a nstat_client structure. + * Communication between user space and kernel is currently via a kernel control socket, + * but other mechanisms could theoretically be used + * + * Each ntstat_client has a queue of nstat_src structures. These typically represent flows + * but could represent whatever the relevant provider makes available to be reported to user level. + * The "nstat_src" name is perhaps a little unfortunate as these don't directly refer to the socket, channel + * or whatever is the ultimate source of the information but instead map directly to each client's idea of + * that source. They therefore contain things like the source reference quoted to the client, which is + * unique only on a per client basis. Currently each client has source references which are allocated sequentially + * starting from one + * + * There are multiple "providers", one each for things like TCP sockets, QUIC channels, routes etc. + * The ntstat_src structure contains the nts_provider field to identify the particular provider and the + * nts_cookie field, which is a pointer to a provider-dependent structure + * + * The nstat_src structure has a pointer to a provider dependent "cookie" structure + * Some, for example the QUIC channel provider, may have a single instance of a structure + * for all nstat_srcs. 
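 *
 * [Editor's inset, illustrative only -- not part of the imported patch.] The
 * linkage described in this note can be summarised in abridged C. The field and
 * type names below are taken from the structures defined later in this file,
 * but the definitions are deliberately trimmed to the members that the picture
 * relies on; they are a sketch, not the actual declarations:
 *
 *     typedef struct nstat_locus {
 *         tailq_head_nstat_src     ntl_src_queue;   // every nstat_src hanging off this locus
 *     } nstat_locus;
 *
 *     typedef struct nstat_src {
 *         tailq_entry_nstat_src    nts_client_link; // linkage on the owning client's queue
 *         tailq_entry_nstat_src    nts_locus_link;  // linkage on the (optional) locus queue
 *         nstat_locus             *nts_locus;       // optional locus, NULL if the provider keeps none
 *         nstat_client            *nts_client;      // back pointer to the owning client
 *         nstat_src_ref_t          nts_srcref;      // per-client reference, allocated sequentially
 *         nstat_provider          *nts_provider;    // provider ops (TCP sockets, QUIC channels, routes, ...)
 *         nstat_provider_cookie_t  nts_cookie;      // provider-dependent state (tu_shadow, sock_locus, rtentry)
 *     } nstat_src;
 *
 *     typedef struct nstat_client {
 *         struct nstat_client     *ntc_next;        // next client on the global nstat_clients list
 *         tailq_head_nstat_src     ntc_src_queue;   // every source being reported to this client
 *     } nstat_client;
 *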
Optionally the provider code may use an nstat_locus structure to allow it + * to quickly iterate all the associated nstat_srcs. The alternative is to iterate over the nstat_srcs + * belonging to each of the nstat_clients and look for any matches. + * + * That gives rise to the example picture below, with notes as follows: + * 1) The second nstat_client has been in existence for longer than the first one, + * and the larger nstat_srcref numbers reflect flows that were assigned numbers earlier + * but which have now closed. + * 2) The nts_cookie field contains a pointer to a provider dependent structure, + * here either a nstat_tu_shadow, nstat_sock_locus or rtentry, for channel based or socket based flows + * or routes respectively + * 3) The nstat_tu_shadow and nstat_sock_locus structures contain an embedded nstat_locus + * 4) The nstat_tu_shadow structures are linked, and the links are traversed when collecting all + * the associated flows when there is a new "watcher". In contrast, the nstat_sock_locus structures are not + * linked and the ntstat code must understand how to traverse the relevant structures in the BSD socket code + * 5) For simplicity, various linkages are not shown. In the nstat_src we have: + * - nts_client is a back pointer to the owning nstat_client + * - nts_locus is a pointer to the associated nstat_locus, or NULL if there is no such item + * - nts_provider is a pointer to a provider-dependent structure with a list of function pointers + * + * + * Generic Code . Per Provider Code + * . + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * . + * nstat_clients ---+ . + * | . Channel based flows + * +-----------<---------+ . + * | . + * | . + * V . + * | nstat_client nstat_client . nstat_userprot_shad_head --+ + * | +-----------------+ +-----------------+ . | + * +->| ntc_next |---------->| ntc_next | . +-----------<--------------+ + * +-----------------+ +-----------------+ . | + * | | | | . | + * +-----------------+ +-----------------+ . | + * | ntc_src_queue |---+ | ntc_src_queue |---+ . | + * +-----------------+ | +-----------------+ | . | + * | | | | | | . | + * | | . | + * | | V | | V . V + * | | . | + * | | | | | | . | + * +-----------------+ | +-----------------+ | . | + * | | . | + * | | . | + * +---------<------------+ +--------<-------------+ . | + * | | . | + * | | . | + * | | . | + * | +-------------------------- | -------<-------------------.- | -------<-----------------+ + * | | | . | | + * | | nstat_src | nstat_src . | | + * | | +-----------------+ | +-----------------+ . | | + * +--->| nts_client_link |---+ +--->| nts_client_link |---+ . | | + * | +-----------------+ | +-----------------+ | . | | + * -->| nts_locus_link |---|------>| nts_locus_link | | . V ^ + * +-----------------+ | +-----------------+ | . | | + * | nts_locus | | | nts_locus | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_client | | | nts_client | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_srcref = 5 | | | nts_srcref = 28 | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_provider | | | nts_provider | | . | nstat_tu_shadow | + * +-----------------+ | +-----------------+ | . | +-----------------+ | + * | nts_cookie |-->|---+ | nts_coookie |-->|-.--->-->| shad_locus |---+ + * +-----------------+ | | +-----------------+ | . | +-----------------+ + * | nts_filter | | | | nts_filter | | . | | shad_link |---+ + * +-----------------+ | | +-----------------+ | . 
| +-----------------+ | + * | | | | | | | . | | | | + * | | | | | | | . ^ +-----------------+ | + * +-----------------+ | | +-----------------+ | . | | | | + * nstat_src struct | | | . | +-----------------+ | + * | +------------>------------|-.--+ | | | + * | | . | | | + * | | . | | | + * | | . | | | + * +---------<----------------+ +--------<-----------------+ . +-----------------+ | + * | | . | + * | | . | + * | | . +--------<-----------------+ + * | | . | + * | | . | + * | +-------------------------- | -------<-------------------.- | -------<-----------------+ + * | | | . | | + * | | nstat_src | nstat_src . | | + * | | +-----------------+ | +-----------------+ . | | + * +--->| nts_client_link |---+ +--->| nts_client_link |---+ . | | + * | +-----------------+ | +-----------------+ | . | | + * -->| nts_locus_link |---|------>| nts_locus_link | | . V ^ + * +-----------------+ | +-----------------+ | . | | + * | nts_locus | | | nts_locus | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_client | | | nts_client | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_srcref = 4 | | | nts_srcref = 27 | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_provider | | | nts_provider | | . | nstat_tu_shadow | + * +-----------------+ | +-----------------+ | . | +-----------------+ | + * | nts_cookie |-->|---+ | nts_coookie |-->|----->-->| shad_locus |---+ + * +-----------------+ | | +-----------------+ | . | +-----------------+ + * | nts_filter | | | | nts_filter | | . | | shad_link | + * +-----------------+ | | +-----------------+ | . | +-----------------+ + * | | | | | | | . | | | + * | | | | | | | . ^ +-----------------+ + * +-----------------+ | | +-----------------+ | . | | | + * | | | . | +-----------------+ + * | +------------>------------|----+ | | + * | | . | | + * | | . | | + * | | . | | + * +---------<----------------+ +--------<-----------------+ . +-----------------+ + * | | . + * | | . + * | | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * | | . + * | | . Socket based flows + * | | . + * | | . + * | | . nstat_tcp_sock_locus_head -+ + * | | . | + * | | . +-----------<--------------+ + * | | . | + * | | . | + * | +-------------------------- | -------<-------------------.- | -------<-----------------+ + * | | | . | | + * | | nstat_src | nstat_src . | | + * | | +-----------------+ | +-----------------+ . | | + * +--->| nts_client_link |---+ +--->| nts_client_link |---+ . | | + * | +-----------------+ | +-----------------+ | . | | + * -->| nts_locus_link |---|------>| nts_locus_link | | . V ^ + * +-----------------+ | +-----------------+ | . | | + * | nts_locus | | | nts_locus | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_client | | | nts_client | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_srcref = 3 | | | nts_srcref = 26 | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_provider | | | nts_provider | | . | nstat_sock_locus | inpcb + * +-----------------+ | +-----------------+ | . | +-----------------+ | +-----------------+ + * | nts_cookie |-->|---+ | nts_coookie |-->|-.--->-->| nsl_locus |---+ | | + * +-----------------+ | | +-----------------+ | . | +-----------------+ | | + * | nts_filter | | | | nts_filter | | . | | nsl_link |---+ | | + * +-----------------+ | | +-----------------+ | . | +-----------------+ | | | + * | | | | | | | . | | nsl_inp |-- |-->| | + * | | | | | | | . 
^ +-----------------+ | + * +-----------------+ | | +-----------------+ | . | | | | | | + * nstat_src struct | | | . | +-----------------+ | + * | +------------>------------|-.--+ | nsl_pname | | | | + * | | . | | | + * | | . | | | | | + * | | . | | | + * +---------<----------------+ +--------<-----------------+ . +-----------------+ | | | + * | | . | + * | | . | | | + * | | . | +-----------------+ + * | | . | + * | | . | + * | | . +--------<-----------------+ + * | | . | + * | | . | + * | +-------------------------- | -------<-------------------.- | -------<-----------------+ + * | | | . | | + * | | nstat_src | nstat_src . | | + * | | +-----------------+ | +-----------------+ . | | + * +--->| nts_client_link |---+ +--->| nts_client_link |---+ . | | + * | +-----------------+ | +-----------------+ | . | | + * -->| nts_locus_link |---|------>| nts_locus_link | | . V ^ + * +-----------------+ | +-----------------+ | . | | + * | nts_locus | | | nts_locus | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_client | | | nts_client | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_srcref = 3 | | | nts_srcref = 26 | | . | | + * +-----------------+ | +-----------------+ | . | | + * | nts_provider | | | nts_provider | | . | nstat_sock_locus | inpcb + * +-----------------+ | +-----------------+ | . | +-----------------+ | +-----------------+ + * | nts_cookie |-->|---+ | nts_coookie |-->|-.--->-->| nsl_locus |---+ | | + * +-----------------+ | | +-----------------+ | . | +-----------------+ | | + * | nts_filter | | | | nts_filter | | . | | nsl_link | | | + * +-----------------+ | | +-----------------+ | . | +-----------------+ | | + * | | | | | | | . | | nsl_inp |------>| | + * | | | | | | | . ^ +-----------------+ + * +-----------------+ | | +-----------------+ | . | | | | | + * nstat_src struct | | | . | +-----------------+ + * | +------------>------------|-.--+ | nsl_pname | | | + * | | . | | + * | | . | | | | + * | | . | | + * +---------<----------------+ +--------<-----------------+ . +-----------------+ | | + * | | . + * | | . | | + * | | . +-----------------+ + * | | . + * | | . + * | | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * | | . + * | +-----------------+ | +-----------------+ . Routes + * +--->| nts_client_link | +--->| nts_client_link | . + * +-----------------+ +-----------------+ . + * | nts_locus_link | | nts_locus_link | . + * +-----------------+ +-----------------+ . + * | nts_locus | | nts_locus | . + * +-----------------+ +-----------------+ . + * | nts_client | | nts_client | . + * +-----------------+ +-----------------+ . + * | nts_srcref = 1 | | nts_srcref = 24 | . + * +-----------------+ +-----------------+ . + * | nts_provider | | nts_provider | . retentry + * +-----------------+ +-----------------+ . +-----------------+ + * | nts_cookie |------+ | nts_cookie |--------->-->| | + * +-----------------+ | +-----------------+ . | | | + * | nts_filter | | | nts_filter | . | + * +-----------------+ | +-----------------+ . | | | + * | | | | | . | + * | | | | | . | | | + * +-----------------+ | +-----------------+ . | + * | . | | | + * +-------------------------------+ + * . | | + * . + * . | | + * . 
+-----------------+ + * + */ typedef TAILQ_HEAD(, nstat_src) tailq_head_nstat_src; typedef TAILQ_ENTRY(nstat_src) tailq_entry_nstat_src; +typedef TAILQ_HEAD(, nstat_sock_locus) tailq_head_sock_locus; +typedef TAILQ_ENTRY(nstat_sock_locus) tailq_entry_sock_locus; + typedef TAILQ_HEAD(, nstat_tu_shadow) tailq_head_tu_shadow; typedef TAILQ_ENTRY(nstat_tu_shadow) tailq_entry_tu_shadow; @@ -216,9 +474,23 @@ typedef TAILQ_ENTRY(nstat_generic_shadow) tailq_entry_generic_shadow; typedef TAILQ_HEAD(, nstat_procdetails) tailq_head_procdetails; typedef TAILQ_ENTRY(nstat_procdetails) tailq_entry_procdetails; +static int +metrics_collection SYSCTL_HANDLER_ARGS +{ + return ntstat_get_metrics(req); +} + +SYSCTL_PROC(_net_stats, OID_AUTO, metrics, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0, + metrics_collection, "S", "Various metrics for NetworkStatistics clients, individually or collectively"); + typedef void *nstat_provider_cookie_t; +typedef struct nstat_locus { + tailq_head_nstat_src ntl_src_queue; +} nstat_locus; + struct nstat_procdetails { tailq_entry_procdetails pdet_link; int pdet_pid; @@ -238,87 +510,6 @@ typedef struct nstat_provider_filter { } nstat_provider_filter; -struct nstat_global_counts { - uint64_t nstat_global_client_current; // current number of clients overall - uint64_t nstat_global_client_max; // max number of clients overall - uint64_t nstat_global_client_allocs; // total number of clients allocated - - uint64_t nstat_global_src_current; // current number of srcs overall - uint64_t nstat_global_src_max; // max number of srcs overall - uint64_t nstat_global_src_allocs; // total number of sources allocated - uint64_t nstat_global_src_idlecheck_gone;// total number of sources discovered "gone" in idle check - - uint64_t nstat_global_tucookie_current; // current number of tucookies overall - uint64_t nstat_global_tucookie_max; // max number of tucookies overall - uint64_t nstat_global_tucookie_allocs; // total number of tucookies allocated - // Details for tucookie lifecycle - uint64_t nstat_global_tucookie_skip_dead; // When adding a watcher, pcb with "dead" state skipped over - uint64_t nstat_global_tucookie_skip_stopusing; // When adding a watcher, pcb marked as stop using - uint64_t nstat_global_tucookie_alloc_fail; // Allocation failure for a tucookie - - uint64_t nstat_global_tu_shad_current; // current number of nstat_tu_shadow objects overall - uint64_t nstat_global_tu_shad_max; // max number of tu_shadows overall - uint64_t nstat_global_tu_shad_allocs; // total number of tu_shadows allocated - - uint64_t nstat_global_gshad_current; // current number of generic shadow objects overall - uint64_t nstat_global_gshad_max; // max number of srcs overall - uint64_t nstat_global_gshad_allocs; // total number of sources allocated - - uint64_t nstat_global_procdetails_current;// current number of procdetails objects overall - uint64_t nstat_global_procdetails_max; // max number of procdetails overall - uint64_t nstat_global_procdetails_allocs;// total number of procdetails allocated -}; - -struct nstat_metrics { - uint32_t nstat_src_current; // current number of srcs for client - uint32_t nstat_src_max; // max number of srcs for client - uint32_t nstat_first_uint32_count; // Subsequent fields must be uint32_t values that, if kept per-client, - // should simply added to the global counts when the client exit - - // Tracking client requests - uint32_t nstat_query_request_all; // Client requests for all counts - uint32_t nstat_query_request_one; // Client 
request for counts on a single source - uint32_t nstat_query_description_all; // Client requests for all descriptors - uint32_t nstat_query_description_one; // Client requests for descriptor on a single source - uint32_t nstat_query_update_all; // Client requests for all updates - uint32_t nstat_query_update_one; // Client requests for update on a single source - uint32_t nstat_remove_src_found; // Client request to remove a source which is still in existence - uint32_t nstat_remove_src_missed; // Client request to remove a source which is no longer there - - // Details for nstat_query_request all/one - uint32_t nstat_query_request_nobuf; // No buffers for message send - uint32_t nstat_query_request_upgrade; // Successful lock upgrade to handle "gone" source - uint32_t nstat_query_request_noupgrade; // Unsuccessful lock upgrade to handle "gone" source - uint32_t nstat_query_request_nodesc; // Can't send a descriptor for "gone" source - uint32_t nstat_query_request_yield; // Client yields lock due to possibly higher priority processing - uint32_t nstat_query_request_limit; // Client requests for all counts - - // Details for nstat_query_description all/one - uint32_t nstat_query_description_nobuf; // No buffers for message send - uint32_t nstat_query_description_yield; // Client yields lock due to possibly higher priority processing - uint32_t nstat_query_description_limit; // Client requests for all counts - - // Details for nstat_query_update all/one - uint32_t nstat_query_update_nobuf; // No buffers for message send - uint32_t nstat_query_update_upgrade; // Successful lock upgrade to handle "gone" source - uint32_t nstat_query_update_noupgrade; // Unsuccessful lock upgrade to handle "gone" source - uint32_t nstat_query_update_nodesc; // Can't send a descriptor for "gone" source - uint32_t nstat_query_update_yield; // Client yields lock due to possibly higher priority processing - uint32_t nstat_query_update_limit; // Client requests for all counts - - // Details for adding a source - uint32_t nstat_src_add_success; // successful src_add - uint32_t nstat_src_add_no_buf; // fail to get buffer for initial src-added - uint32_t nstat_src_add_no_src_mem; // fail to get memory for nstat_src structure - uint32_t nstat_src_add_send_err; // fail to send initial src-added - uint32_t nstat_src_add_while_cleanup; // fail to add because client is in clean up state - - uint32_t nstat_src_gone_idlecheck; // src gone noted during periodic idle check - - uint32_t nstat_last_uint32_count; // Must be the last uint32_t count in the structure - uint32_t nstat_stats_pad; -}; - #define NUM_NSTAT_METRICS_UINT32_COUNTS ((__builtin_offsetof(struct nstat_metrics, nstat_last_uint32_count) - \ __builtin_offsetof(struct nstat_metrics, nstat_first_uint32_count)) / sizeof(uint32_t)) @@ -347,6 +538,7 @@ typedef struct nstat_client { decl_lck_mtx_data(, ntc_user_mtx); // Mutual exclusion for user level requests on this ntc_client kern_ctl_ref ntc_kctl; u_int32_t ntc_unit; + u_int32_t ntc_client_id; nstat_src_ref_t ntc_next_srcref; tailq_head_nstat_src ntc_src_queue; mbuf_t ntc_accumulated; @@ -370,10 +562,11 @@ typedef struct nstat_provider { errno_t (*nstat_lookup)(const void *__sized_by (length)data, u_int32_t length, nstat_provider_cookie_t *out_cookie); int (*nstat_gone)(nstat_provider_cookie_t cookie); errno_t (*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone); + errno_t (*nstat_details)(nstat_provider_cookie_t cookie, struct nstat_detailed_counts *out_details, int 
*out_gone); errno_t (*nstat_watcher_add)(nstat_client *client, nstat_msg_add_all_srcs *req); void (*nstat_watcher_remove)(nstat_client *client); errno_t (*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *__sized_by (len)data, size_t len); - void (*nstat_release)(nstat_provider_cookie_t cookie, boolean_t locked); + void (*nstat_release)(nstat_provider_cookie_t cookie); bool (*nstat_reporting_allowed)(nstat_provider_cookie_t cookie, nstat_provider_filter *filter, u_int64_t suppression_flags); bool (*nstat_cookie_equal)(nstat_provider_cookie_t cookie1, nstat_provider_cookie_t cookie2); size_t (*nstat_copy_extension)(nstat_provider_cookie_t cookie, u_int32_t extension_id, void *buf, size_t len); @@ -381,6 +574,8 @@ typedef struct nstat_provider { typedef struct nstat_src { tailq_entry_nstat_src nts_client_link; // All sources for the nstat_client, for iterating over. + tailq_entry_nstat_src nts_locus_link; // All sources for the nstat_locus, for iterating over. + nstat_locus *nts_locus; // The (optional) locus, with further details nstat_client *nts_client; // The nstat_client that this is a source for nstat_src_ref_t nts_srcref; // The reference quoted in any messages nstat_provider *nts_provider; // The "provider" for the source, e.g. for kernel TCP sockets @@ -405,13 +600,13 @@ static int nstat_client_send_description(nstat_client *client, nstat_sr static int nstat_client_send_update(nstat_client *client, nstat_src *src, u_int64_t context, u_int64_t event, u_int16_t hdr_flags, int *gone); static errno_t nstat_client_send_removed(nstat_client *client, nstat_src *src, u_int16_t hdr_flags); static errno_t nstat_client_send_goodbye(nstat_client *client, nstat_src *src); -static void nstat_client_cleanup_source(nstat_client *client, nstat_src *src, boolean_t); +static void nstat_client_cleanup_source(nstat_client *client, nstat_src *src); static bool nstat_client_reporting_allowed(nstat_client *client, nstat_src *src, u_int64_t suppression_flags); static boolean_t nstat_client_begin_query(nstat_client *client, const nstat_msg_hdr *hdrp); static u_int16_t nstat_client_end_query(nstat_client *client, nstat_src *last_src, boolean_t partial); static void nstat_ifnet_report_lim_stats(void); static void nstat_net_api_report_stats(void); -static errno_t nstat_set_provider_filter( nstat_client *client, nstat_msg_add_all_srcs *req); +static errno_t nstat_set_provider_filter(nstat_client *client, nstat_msg_add_all_srcs *req); static errno_t nstat_client_send_event(nstat_client *client, nstat_src *src, u_int64_t event); static u_int32_t nstat_udp_watchers = 0; @@ -420,6 +615,7 @@ static nstat_merged_provider_filters merged_filters = {}; static nstat_client *nstat_clients = NULL; static uint64_t nstat_idle_time = 0; +static uint32_t nstat_next_client_id = 3; // Deliberate offset from zero to reserve values 0 and 1 #if NSTAT_FUZZ_TIMING static uint32_t nstat_random_delay_insert_modulo = 5000; @@ -427,7 +623,6 @@ static uint32_t nstat_max_nsec_delay = (NSEC_PER_SEC / 1000); #endif // NSTAT_FUZZ_TIMING static struct nstat_metrics nstat_metrics; -static struct nstat_global_counts nstat_global_counts; // For lldb macro usage static __unused const size_t nstat_trace_entries_per_client = NSTAT_TRACE_ENTRIES_PER_CLIENT; @@ -489,15 +684,15 @@ static nstat_cyclic_trace nstat_global_trace; static errno_t nstat_client_send_counts(nstat_client *client, nstat_src *src, unsigned long long context, u_int16_t hdr_flags, int *gone); static int nstat_client_send_description(nstat_client *client, nstat_src *src, 
u_int64_t context, u_int16_t hdr_flags); static int nstat_client_send_update(nstat_client *client, nstat_src *src, u_int64_t context, u_int64_t event, u_int16_t hdr_flags, int *gone); +static int nstat_client_send_details(nstat_client *client, nstat_src *src, u_int64_t context, u_int64_t event, u_int16_t hdr_flags, int *gone); static errno_t nstat_client_send_removed(nstat_client *client, nstat_src *src, u_int16_t hdr_flags); static errno_t nstat_client_send_goodbye(nstat_client *client, nstat_src *src); -static void nstat_client_cleanup_source(nstat_client *client, nstat_src *src, boolean_t); +static void nstat_client_cleanup_source(nstat_client *client, nstat_src *src); static bool nstat_client_reporting_allowed(nstat_client *client, nstat_src *src, u_int64_t suppression_flags); static boolean_t nstat_client_begin_query(nstat_client *client, const nstat_msg_hdr *hdrp); static u_int16_t nstat_client_end_query(nstat_client *client, nstat_src *last_src, boolean_t partial); static void nstat_ifnet_report_lim_stats(void); static void nstat_net_api_report_stats(void); -static errno_t nstat_set_provider_filter( nstat_client *client, nstat_msg_add_all_srcs *req); static errno_t nstat_client_send_event(nstat_client *client, nstat_src *src, u_int64_t event); static void nstat_client_register(void); @@ -531,25 +726,36 @@ static LCK_ATTR_DECLARE(nstat_lck_attr, 0, 0); static LCK_GRP_DECLARE(nstat_lck_grp, "network statistics kctl"); static LCK_RW_DECLARE_ATTR(nstat_rwlock, &nstat_lck_grp, &nstat_lck_attr); -#define NSTAT_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&nstat_rwlock) -#define NSTAT_LOCK_SHARED() lck_rw_lock_shared(&nstat_rwlock) +#define NSTAT_LOCK_EXCLUSIVE() \ +if (lck_rw_try_lock_exclusive(&nstat_rwlock)) { \ + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_exclusive_lock_uncontended); \ +} else { \ + lck_rw_lock_exclusive(&nstat_rwlock); \ + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_exclusive_lock_contended); \ +} + +#define NSTAT_LOCK_SHARED() \ +if (lck_rw_try_lock_shared(&nstat_rwlock)) { \ + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_shared_lock_uncontended); \ +} else { \ + lck_rw_lock_shared(&nstat_rwlock); \ + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_shared_lock_contended); \ +} + #define NSTAT_LOCK_SHARED_TO_EXCLUSIVE() lck_rw_lock_shared_to_exclusive(&nstat_rwlock) #define NSTAT_LOCK_EXCLUSIVE_TO_SHARED() lck_rw_lock_exclusive_to_shared(&nstat_rwlock) +#define NSTAT_TRY_LOCK_EXCLUSIVE() lck_rw_try_lock_exclusive(&nstat_rwlock) #define NSTAT_UNLOCK() lck_rw_done(&nstat_rwlock) #define NSTAT_UNLOCK_EXCLUSIVE() lck_rw_unlock_exclusive(&nstat_rwlock) #define NSTAT_UNLOCK_SHARED() lck_rw_unlock_shared(&nstat_rwlock) #define NSTAT_LOCK_WOULD_YIELD() lck_rw_lock_would_yield_shared(&nstat_rwlock) #define NSTAT_LOCK_YIELD() lck_rw_lock_yield_shared(&nstat_rwlock, FALSE) +#define NSTAT_LOCK_YIELD_EXCLUSIVE() lck_rw_lock_yield_exclusive(&nstat_rwlock, LCK_RW_YIELD_ANY_WAITER) #define NSTAT_ASSERT_LOCKED_EXCLUSIVE() LCK_RW_ASSERT(&nstat_rwlock, LCK_RW_ASSERT_EXCLUSIVE) #define NSTAT_ASSERT_LOCKED_SHARED() LCK_RW_ASSERT(&nstat_rwlock, LCK_RW_ASSERT_SHARED) #define NSTAT_ASSERT_LOCKED() LCK_RW_ASSERT(&nstat_rwlock, LCK_RW_ASSERT_HELD) #define NSTAT_ASSERT_UNLOCKED() LCK_RW_ASSERT(&nstat_rwlock, LCK_RW_ASSERT_NOTHELD) -typedef enum { - NSTAT_LOCK_NOTHELD = 0, - NSTAT_LOCK_HELD = 1, -} nstat_lock_status; - /* some extern definitions */ extern void tcp_report_stats(void); @@ -602,17 +808,17 @@ nstat_random_delay(nstat_client *client) #endif static void -nstat_accumulate_client_metrics(nstat_client 
*client) +nstat_accumulate_client_metrics(struct nstat_metrics *dest, nstat_client *client) { - if (nstat_metrics.nstat_src_max < client->ntc_metrics.nstat_src_max) { - nstat_metrics.nstat_src_max = client->ntc_metrics.nstat_src_max; + if (dest->nstat_src_max < client->ntc_metrics.nstat_src_max) { + dest->nstat_src_max = client->ntc_metrics.nstat_src_max; } // Most of the counts happen to be consecutive uint32_t values that can be picked up via pointer iteration rather than name uint32_t *srcptr = __unsafe_forge_bidi_indexable(uint32_t *, (uint32_t *)(void *)&client->ntc_metrics.nstat_first_uint32_count, (NUM_NSTAT_METRICS_UINT32_COUNTS * sizeof(uint32_t))); uint32_t *destptr = __unsafe_forge_bidi_indexable(uint32_t *, - (uint32_t *)(void *)&nstat_metrics.nstat_first_uint32_count, + (uint32_t *)(void *)&dest->nstat_first_uint32_count, (NUM_NSTAT_METRICS_UINT32_COUNTS * sizeof(uint32_t))); for (int i = 0; i < NUM_NSTAT_METRICS_UINT32_COUNTS; i++) { @@ -620,6 +826,78 @@ nstat_accumulate_client_metrics(nstat_client *client) } } +static int +metrics_for_client_id(uint32_t req_id, struct nstat_client_info *client_info) +{ + int err = 0; + struct nstat_metrics *metrics = &client_info->nstat_metrics; + bzero(client_info, sizeof(*client_info)); + + if (req_id == NSTAT_METRIC_ID_ACCUMULATED) { + client_info->nstat_client_details.nstat_client_id = NSTAT_METRIC_ID_ACCUMULATED; + *metrics = nstat_metrics; + } else { + NSTAT_LOCK_EXCLUSIVE(); + if (req_id == NSTAT_METRIC_ID_GRAND_TOTAL) { + client_info->nstat_client_details.nstat_client_id = NSTAT_METRIC_ID_GRAND_TOTAL; + *metrics = nstat_metrics; + nstat_client *client; + for (client = nstat_clients; client; client = client->ntc_next) { + nstat_accumulate_client_metrics(metrics, client); + } + } else { + nstat_client *client; + err = ERANGE; + for (client = nstat_clients; client; client = client->ntc_next) { + if (client->ntc_client_id <= req_id) { + client_info->nstat_client_details.nstat_client_id = client->ntc_client_id; + client_info->nstat_client_details.nstat_client_pid = client->ntc_procdetails->pdet_pid; + client_info->nstat_client_details.nstat_client_watching = client->ntc_watching; + client_info->nstat_client_details.nstat_client_added_src = client->ntc_added_src; + *metrics = client->ntc_metrics; + err = 0; + break; + } + } + } + NSTAT_UNLOCK_EXCLUSIVE(); + } + return err; +} + +static int +ntstat_get_metrics(struct sysctl_req *req) +{ + // The following assumes that the client_info structure is small such that stack allocation is reasonable + struct nstat_client_info client_info = {}; + int error = 0; + struct nstat_metrics_req requested; + + if (priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0) != 0) { + return EACCES; + } + if (req->newptr == USER_ADDR_NULL) { + return EINVAL; + } + if (req->newlen < sizeof(requested)) { + return EINVAL; + } + error = SYSCTL_IN(req, &requested, sizeof(requested)); + if (error != 0) { + return error; + } + if (requested.mr_version != NSTAT_METRIC_VERSION) { + return ENOTSUP; + } + error = metrics_for_client_id(requested.mr_id, &client_info); + if (error != 0) { + return error; + } + error = SYSCTL_OUT(req, &client_info, sizeof(client_info)); + return error; +} + + static void nstat_copy_sa_out( const struct sockaddr *src, @@ -717,6 +995,9 @@ nstat_ifnet_to_flags( } if (IFNET_IS_CONSTRAINED(ifp)) { flags |= NSTAT_IFNET_IS_CONSTRAINED; + if (IFNET_IS_ULTRA_CONSTRAINED(ifp)) { + flags |= NSTAT_IFNET_IS_ULTRA_CONSTRAINED; + } } if (ifnet_is_low_latency(ifp)) { flags |= 
NSTAT_IFNET_IS_WIFI | NSTAT_IFNET_IS_LLW; @@ -732,10 +1013,7 @@ static void nstat_update_local_flag_from_inpcb_route(const struct inpcb *inp, u_int32_t *flags) { - if (inp != NULL && - ((inp->inp_route.ro_rt != NULL && - IS_LOCALNET_ROUTE(inp->inp_route.ro_rt)) || - (inp->inp_flags2 & INP2_LAST_ROUTE_LOCAL))) { + if (inp != NULL && (inp->inp_flags2 & INP2_LAST_ROUTE_LOCAL)) { *flags |= NSTAT_IFNET_IS_LOCAL; } else { *flags |= NSTAT_IFNET_IS_NON_LOCAL; @@ -796,11 +1074,31 @@ merge_current_event_filters(void) } } +static inline void +nstat_src_remove_linkages(nstat_client *client, + nstat_src *src) +{ + NSTAT_ASSERT_LOCKED_EXCLUSIVE(); + NSTAT_NOTE_SRC(nstat_src_removed_linkage, client, src); + + TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); + + if (src->nts_locus != NULL) { + TAILQ_REMOVE(&src->nts_locus->ntl_src_queue, src, nts_locus_link); + src->nts_locus = NULL; + } + assert(client->ntc_metrics.nstat_src_current > 0); + client->ntc_metrics.nstat_src_current--; +} #pragma mark -- Network Statistic Providers -- -static errno_t nstat_client_source_add(u_int64_t context, nstat_client *client, nstat_provider *provider, - nstat_provider_cookie_t cookie, nstat_lock_status lock_status); +static errno_t nstat_client_source_add(u_int64_t context, + nstat_client *client, + nstat_provider *provider, + nstat_provider_cookie_t cookie, + nstat_locus *locus); + struct nstat_provider *nstat_providers = NULL; static struct nstat_provider* @@ -855,12 +1153,12 @@ nstat_client_sanitize_cookie( } if (src) { nstat_client_send_goodbye(client, src); - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); + nstat_src_remove_linkages(client, src); } NSTAT_UNLOCK_EXCLUSIVE(); if (src) { - nstat_client_cleanup_source(NULL, src, TRUE); + nstat_client_cleanup_source(NULL, src); } } @@ -880,6 +1178,7 @@ __private_extern__ void nstat_init(void) { nstat_log_handle = os_log_create("com.apple.xnu.net", "ntstat"); + nstat_global_counts.nstat_global_count_version = NSTAT_GLOBAL_COUNTS_VERSION; nstat_init_route_provider(); nstat_init_tcp_provider(); nstat_init_udp_provider(); @@ -1078,8 +1377,7 @@ nstat_route_counts( static void nstat_route_release( - nstat_provider_cookie_t cookie, - __unused int locked) + nstat_provider_cookie_t cookie) { rtfree((struct rtentry*)cookie); } @@ -1114,7 +1412,9 @@ nstat_route_walktree_add( return 0; } - result = nstat_client_source_add(0, client, &nstat_route_provider, rt, NSTAT_LOCK_NOTHELD); + NSTAT_LOCK_EXCLUSIVE(); + result = nstat_client_source_add(0, client, &nstat_route_provider, rt, NULL); + NSTAT_UNLOCK_EXCLUSIVE(); if (result != 0) { rtfree_locked(rt); } @@ -1177,7 +1477,7 @@ nstat_route_new_entry( RT_ADDREF(rt); // add the source, if that fails, release the reference - if (nstat_client_source_add(0, client, &nstat_route_provider, rt, NSTAT_LOCK_HELD) != 0) { + if (nstat_client_source_add(0, client, &nstat_route_provider, rt, NULL) != 0) { RT_REMREF(rt); } } @@ -1504,102 +1804,134 @@ nstat_route_update( * the interface index. This is necessary because when UDP sockets are * disconnected, the connection tuples are forever lost from the inpcb, thus * we need to keep track of the last call to connect() in ntstat. + * + * There may be either zero or one of the TCP and UDP variants of the nstat_sock_locus + * for any socket. There may be multiple nstat_src structures linked to the locus. + * At the time of original implementation, it is not expected that the nstat_sock_locus + * should live beyond the lifetime of the socket. 
There is therefore no reference counting + * and the structures are disposed of when the socket closes. */ -struct nstat_tucookie { - struct inpcb *inp; - char pname[PNAME_MAX_LENGTH]; - bool cached; - union{ - struct sockaddr_in v4; - struct sockaddr_in6 v6; - } local; - union{ - struct sockaddr_in v4; - struct sockaddr_in6 v6; - } remote; - unsigned int if_index; - uint32_t ifnet_properties; -}; -static struct nstat_tucookie * -nstat_tucookie_alloc_ref_internal( +#define NSTAT_SOCK_LOCUS_MAGIC 0xfeedf001 +#define NSTAT_SOCK_LOCUS_UNMAGIC 0xdeadf001 + +typedef struct nstat_sock_locus { + nstat_locus nsl_locus; // The locus as used for generic processing + tailq_entry_sock_locus nsl_link; // TCP and UDP sock_locus structures queued here + struct inpcb *nsl_inp; // As per the associated socket + uint32_t nsl_magic; // Debug aid + pid_t nsl_pid; + uint32_t nsl_ifnet_properties; + char nsl_pname[2 * MAXCOMLEN + 1]; + char nsl_is_tcp; // A boolean indicating TCP or UDP usage +} nstat_sock_locus; + +// An extended version is used for UDP +typedef struct nstat_extended_sock_locus { + nstat_sock_locus nesl_sock_locus; + union{ + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } nesl_local; + union{ + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } nesl_remote; + bool nesl_cached; + unsigned int nesl_if_index; +} nstat_extended_sock_locus; + +static tailq_head_sock_locus nstat_tcp_sock_locus_head = TAILQ_HEAD_INITIALIZER(nstat_tcp_sock_locus_head); +static tailq_head_sock_locus nstat_udp_sock_locus_head = TAILQ_HEAD_INITIALIZER(nstat_udp_sock_locus_head); + + +static bool +nstat_tcpudp_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_filter *filter, bool is_UDP); + +static struct nstat_sock_locus * +nstat_sock_locus_alloc_internal( struct inpcb *inp, bool locked) { - struct nstat_tucookie *cookie; + struct nstat_sock_locus *sol; - if (inp->inp_state == INPCB_STATE_DEAD) { - NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tucookie_skip_dead); - return NULL; - } - - cookie = kalloc_type(struct nstat_tucookie, Z_WAITOK | Z_ZERO); - - if (cookie == NULL) { - NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tucookie_alloc_fail); - return NULL; - } - NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tucookie_allocs); - NSTAT_GLOBAL_COUNT_INCREMENT_WITH_MAX(nstat_global_tucookie_current, nstat_global_tucookie_max); - - if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING) { - NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tucookie_skip_stopusing); - NSTAT_GLOBAL_COUNT_DECREMENT(nstat_global_tucookie_current); - kfree_type(struct nstat_tucookie, cookie); - return NULL; - } - cookie->inp = inp; - proc_best_name_for_pid(inp->inp_socket->last_pid, cookie->pname, sizeof(cookie->pname)); - /* - * We only increment the reference count for UDP sockets because we - * only cache UDP socket tuples. 
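As an aside on the queueing scheme described in the comment above: each nstat_src now sits on two tail queues at once, the per-client queue (via nts_client_link) and the per-locus queue (via nts_locus_link), which is what later lets nstat_pcb_detach() find every source attached to a socket without scanning every client's full source list. The standalone sketch below shows the same dual-queue pattern using the BSD <sys/queue.h> macros; the type and field names are invented for illustration and are not the kernel definitions.

#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct src;
TAILQ_HEAD(src_list, src);

struct src {
	TAILQ_ENTRY(src) client_link;	/* linkage on the owning client's queue */
	TAILQ_ENTRY(src) locus_link;	/* linkage on the per-socket locus queue */
	int id;
};

int
main(void)
{
	struct src_list client_q = TAILQ_HEAD_INITIALIZER(client_q);
	struct src_list locus_q = TAILQ_HEAD_INITIALIZER(locus_q);
	struct src *s, *tmp;

	/* The same element is inserted on both queues. */
	for (int i = 0; i < 3; i++) {
		s = calloc(1, sizeof(*s));
		if (s == NULL) {
			return 1;
		}
		s->id = i;
		TAILQ_INSERT_TAIL(&client_q, s, client_link);
		TAILQ_INSERT_TAIL(&locus_q, s, locus_link);
	}

	/*
	 * Tear-down in the style of nstat_pcb_detach(): walk the locus queue
	 * with the removal-safe iterator and unlink each element from both
	 * queues before freeing it.
	 */
	TAILQ_FOREACH_SAFE(s, &locus_q, locus_link, tmp) {
		TAILQ_REMOVE(&locus_q, s, locus_link);
		TAILQ_REMOVE(&client_q, s, client_link);
		printf("released src %d\n", s->id);
		free(s);
	}
	return 0;
}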
- */ if (SOCK_PROTO(inp->inp_socket) == IPPROTO_UDP) { - OSIncrementAtomic(&inp->inp_nstat_refcnt); + struct nstat_extended_sock_locus *esol; + esol = kalloc_type(struct nstat_extended_sock_locus, Z_WAITOK | Z_ZERO); + if (!esol) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_udp_sck_locus_alloc_fails); + return NULL; + } + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_udp_sck_locus_allocs); + NSTAT_GLOBAL_COUNT_INCREMENT_WITH_MAX(nstat_global_udp_sck_locus_current, nstat_global_udp_sck_locus_max); + if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_udp_sck_locus_stop_using); + kfree_type(struct nstat_extended_sock_locus, esol); + NSTAT_GLOBAL_COUNT_DECREMENT(nstat_global_udp_sck_locus_current); + return NULL; + } + sol = &esol->nesl_sock_locus; + } else { + sol = kalloc_type(struct nstat_sock_locus, Z_WAITOK | Z_ZERO); + if (!sol) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tcp_sck_locus_alloc_fails); + return NULL; + } + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tcp_sck_locus_allocs); + NSTAT_GLOBAL_COUNT_INCREMENT_WITH_MAX(nstat_global_tcp_sck_locus_current, nstat_global_tcp_sck_locus_max); + // The acquire here may be redundant, it is balanced by a release when the pcb is detached + // Copied from tucookie precedent, which had a different mechanism but similar constraints + if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tcp_sck_locus_stop_using); + kfree_type(struct nstat_sock_locus, sol); + NSTAT_GLOBAL_COUNT_DECREMENT(nstat_global_tcp_sck_locus_current); + return NULL; + } } + sol->nsl_inp = inp; + sol->nsl_magic = NSTAT_SOCK_LOCUS_MAGIC; + bzero(sol->nsl_pname, sizeof(sol->nsl_pname)); - return cookie; + pid_t initial_pid = inp->inp_socket->last_pid; + proc_best_name_for_pid(initial_pid, sol->nsl_pname, sizeof(sol->nsl_pname)); + if (sol->nsl_pname[0] != '\0') { + sol->nsl_pid = initial_pid; + } else { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_sck_fail_first_owner); + } + inp->inp_nstat_locus = sol; + return sol; } -static struct nstat_tucookie * -nstat_tucookie_alloc_ref( +static struct nstat_sock_locus * +nstat_sock_locus_alloc_locked( struct inpcb *inp) { - return nstat_tucookie_alloc_ref_internal(inp, false); -} - -static struct nstat_tucookie * -nstat_tucookie_alloc_ref_locked( - struct inpcb *inp) -{ - return nstat_tucookie_alloc_ref_internal(inp, true); + return nstat_sock_locus_alloc_internal(inp, true); } static void -nstat_tucookie_release_internal( - struct nstat_tucookie *cookie, - int inplock) +nstat_sock_locus_release( + struct nstat_sock_locus *sol, + int inplock) { - if (SOCK_PROTO(cookie->inp->inp_socket) == IPPROTO_UDP) { - OSDecrementAtomic(&cookie->inp->inp_nstat_refcnt); + // Note, caller should already hace cleared inp->inp_nstat_locus under lock + // to prevent multiple calls to this cleanup function + struct nstat_extended_sock_locus *esol = NULL; + struct inpcb *inp = sol->nsl_inp; + + if (SOCK_PROTO(inp->inp_socket) == IPPROTO_UDP) { + esol = (struct nstat_extended_sock_locus *)sol; } - in_pcb_checkstate(cookie->inp, WNT_RELEASE, inplock); - NSTAT_GLOBAL_COUNT_DECREMENT(nstat_global_tucookie_current); - kfree_type(struct nstat_tucookie, cookie); -} - -static void -nstat_tucookie_release( - struct nstat_tucookie *cookie) -{ - nstat_tucookie_release_internal(cookie, false); -} - -static void -nstat_tucookie_release_locked( - struct nstat_tucookie *cookie) -{ - nstat_tucookie_release_internal(cookie, true); + sol->nsl_magic = 
NSTAT_SOCK_LOCUS_UNMAGIC; + if (esol != NULL) { + NSTAT_GLOBAL_COUNT_DECREMENT(nstat_global_udp_sck_locus_current); + kfree_type(struct nstat_extended_sock_locus, esol); + } else { + NSTAT_GLOBAL_COUNT_DECREMENT(nstat_global_tcp_sck_locus_current); + kfree_type(struct nstat_sock_locus, sol); + } + in_pcb_checkstate(inp, WNT_RELEASE, inplock); } @@ -1652,8 +1984,8 @@ nstat_inp_bluetooth_counts(struct inpcb *inp, nstat_interface_counts *buf, size_ if (buf == NULL) { uint64_t rxbytes = 0; uint64_t txbytes = 0; - rxbytes = os_atomic_load(&inp->inp_btstat->rxbytes, relaxed); - txbytes = os_atomic_load(&inp->inp_btstat->txbytes, relaxed); + rxbytes = os_atomic_load(&inp->inp_mstat.ms_bluetooth.ts_rxbytes, relaxed); + txbytes = os_atomic_load(&inp->inp_mstat.ms_bluetooth.ts_txbytes, relaxed); if ((rxbytes == 0) && (txbytes == 0)) { // It's more efficient to skip sending counts if they're only going to be zero @@ -1670,10 +2002,10 @@ nstat_inp_bluetooth_counts(struct inpcb *inp, nstat_interface_counts *buf, size_ if (!(intotcpcb(inp)) || inp->inp_state == INPCB_STATE_DEAD) { return 0; } - buf->nstat_rxpackets = os_atomic_load(&inp->inp_btstat->rxpackets, relaxed); - buf->nstat_rxbytes = os_atomic_load(&inp->inp_btstat->rxbytes, relaxed); - buf->nstat_txpackets = os_atomic_load(&inp->inp_btstat->txpackets, relaxed); - buf->nstat_txbytes = os_atomic_load(&inp->inp_btstat->txbytes, relaxed); + buf->nstat_rxpackets = os_atomic_load(&inp->inp_mstat.ms_bluetooth.ts_rxpackets, relaxed); + buf->nstat_rxbytes = os_atomic_load(&inp->inp_mstat.ms_bluetooth.ts_rxbytes, relaxed); + buf->nstat_txpackets = os_atomic_load(&inp->inp_mstat.ms_bluetooth.ts_txpackets, relaxed); + buf->nstat_txbytes = os_atomic_load(&inp->inp_mstat.ms_bluetooth.ts_txbytes, relaxed); return sizeof(nstat_interface_counts); } @@ -1693,12 +2025,11 @@ static int nstat_tcp_gone( nstat_provider_cookie_t cookie) { - struct nstat_tucookie *tucookie = - (struct nstat_tucookie *)cookie; + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; struct inpcb *inp; struct tcpcb *tp; - return (!(inp = tucookie->inp) || + return (!(inp = sol->nsl_inp) || !(tp = intotcpcb(inp)) || inp->inp_state == INPCB_STATE_DEAD) ? 
1 : 0; } @@ -1709,8 +2040,7 @@ nstat_tcp_counts( struct nstat_counts *out_counts, int *out_gone) { - struct nstat_tucookie *tucookie = - (struct nstat_tucookie *)cookie; + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; struct inpcb *inp; bzero(out_counts, sizeof(*out_counts)); @@ -1724,17 +2054,17 @@ nstat_tcp_counts( if (out_gone) { *out_gone = 1; } - if (!(inp = tucookie->inp) || !intotcpcb(inp)) { + if (!(inp = sol->nsl_inp) || !intotcpcb(inp)) { return EINVAL; } } - inp = tucookie->inp; + inp = sol->nsl_inp; struct tcpcb *tp = intotcpcb(inp); - out_counts->nstat_rxpackets = os_atomic_load(&inp->inp_stat->rxpackets, relaxed); - out_counts->nstat_rxbytes = os_atomic_load(&inp->inp_stat->rxbytes, relaxed); - out_counts->nstat_txpackets = os_atomic_load(&inp->inp_stat->txpackets, relaxed); - out_counts->nstat_txbytes = os_atomic_load(&inp->inp_stat->txbytes, relaxed); + out_counts->nstat_rxpackets = os_atomic_load(&inp->inp_mstat.ms_total.ts_rxpackets, relaxed); + out_counts->nstat_rxbytes = os_atomic_load(&inp->inp_mstat.ms_total.ts_rxbytes, relaxed); + out_counts->nstat_txpackets = os_atomic_load(&inp->inp_mstat.ms_total.ts_txpackets, relaxed); + out_counts->nstat_txbytes = os_atomic_load(&inp->inp_mstat.ms_total.ts_txbytes, relaxed); out_counts->nstat_rxduplicatebytes = tp->t_stat.rxduplicatebytes; out_counts->nstat_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; out_counts->nstat_txretransmit = tp->t_stat.txretransmitbytes; @@ -1746,25 +2076,65 @@ nstat_tcp_counts( if (out_counts->nstat_avg_rtt < out_counts->nstat_min_rtt) { out_counts->nstat_min_rtt = out_counts->nstat_avg_rtt; } - out_counts->nstat_cell_rxbytes = os_atomic_load(&inp->inp_cstat->rxbytes, relaxed); - out_counts->nstat_cell_txbytes = os_atomic_load(&inp->inp_cstat->txbytes, relaxed); - out_counts->nstat_wifi_rxbytes = os_atomic_load(&inp->inp_wstat->rxbytes, relaxed); - out_counts->nstat_wifi_txbytes = os_atomic_load(&inp->inp_wstat->txbytes, relaxed); - out_counts->nstat_wired_rxbytes = os_atomic_load(&inp->inp_Wstat->rxbytes, relaxed); - out_counts->nstat_wired_txbytes = os_atomic_load(&inp->inp_Wstat->txbytes, relaxed); + out_counts->nstat_cell_rxbytes = os_atomic_load(&inp->inp_mstat.ms_cellular.ts_rxbytes, relaxed); + out_counts->nstat_cell_txbytes = os_atomic_load(&inp->inp_mstat.ms_cellular.ts_txbytes, relaxed); + out_counts->nstat_wifi_rxbytes = os_atomic_load(&inp->inp_mstat.ms_wifi_infra.ts_rxbytes, relaxed) + + os_atomic_load(&inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes, relaxed); + out_counts->nstat_wifi_txbytes = os_atomic_load(&inp->inp_mstat.ms_wifi_infra.ts_txbytes, relaxed) + + os_atomic_load(&inp->inp_mstat.ms_wifi_non_infra.ts_txbytes, relaxed); + out_counts->nstat_wired_rxbytes = os_atomic_load(&inp->inp_mstat.ms_wired.ts_rxbytes, relaxed); + out_counts->nstat_wired_txbytes = os_atomic_load(&inp->inp_mstat.ms_wired.ts_txbytes, relaxed); + + return 0; +} + +static errno_t +nstat_tcp_details( + nstat_provider_cookie_t cookie, + struct nstat_detailed_counts *out_details, + int *out_gone) +{ + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; + struct inpcb *inp; + + bzero(out_details, sizeof(*out_details)); + + if (out_gone) { + *out_gone = 0; + } + + // if the pcb is in the dead state, we should stop using it + if (nstat_tcp_gone(cookie)) { + if (out_gone) { + *out_gone = 1; + } + if (!(inp = sol->nsl_inp) || !intotcpcb(inp)) { + return EINVAL; + } + } + inp = sol->nsl_inp; + struct tcpcb *tp = intotcpcb(inp); + if (tp == NULL) { + return EINVAL; + } + 
memcpy(out_details, &inp->inp_mstat, sizeof(inp->inp_mstat)); + out_details->nstat_rxduplicatebytes = tp->t_stat.rxduplicatebytes; + out_details->nstat_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; + out_details->nstat_txretransmit = tp->t_stat.txretransmitbytes; + out_details->nstat_avg_rtt = tp->t_srtt; + out_details->nstat_min_rtt = tp->t_rttbest; + out_details->nstat_var_rtt = tp->t_rttvar; + if (out_details->nstat_avg_rtt < out_details->nstat_min_rtt) { + out_details->nstat_min_rtt = out_details->nstat_avg_rtt; + } return 0; } static void nstat_tcp_release( - nstat_provider_cookie_t cookie, - int locked) + __unused nstat_provider_cookie_t cookie) { - struct nstat_tucookie *tucookie = - (struct nstat_tucookie *)cookie; - - nstat_tucookie_release_internal(tucookie, locked); } static errno_t @@ -1772,42 +2142,37 @@ nstat_tcp_add_watcher( nstat_client *client, nstat_msg_add_all_srcs *req) { - // There is a tricky issue around getting all TCP sockets added once - // and only once. nstat_tcp_new_pcb() is called prior to the new item - // being placed on any lists where it might be found. - // By locking the tcbinfo.ipi_lock prior to marking the client as a watcher, - // it should be impossible for a new socket to be added twice. - // On the other hand, there is still a timing issue where a new socket - // results in a call to nstat_tcp_new_pcb() before this watcher - // is instantiated and yet the socket doesn't make it into ipi_listhead - // prior to the scan. - errno_t result; - lck_rw_lock_shared(&tcbinfo.ipi_lock); + NSTAT_LOCK_EXCLUSIVE(); result = nstat_set_provider_filter(client, req); + + // Only now can nstat_tcp_new_pcb() find the client via the provider filter + // so the NSTAT_LOCK_EXCLUSIVE() ensures that sockets are added once and only once + if (result == 0) { OSIncrementAtomic(&nstat_tcp_watchers); - // Add all current tcp inpcbs. 
Ignore those in timewait - struct inpcb *inp; - struct nstat_tucookie *cookie; - LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) - { - cookie = nstat_tucookie_alloc_ref(inp); - if (cookie == NULL) { + struct nstat_sock_locus *sol; + + TAILQ_FOREACH(sol, &nstat_tcp_sock_locus_head, nsl_link) { + assert(sol->nsl_magic == NSTAT_SOCK_LOCUS_MAGIC); + assert(sol->nsl_inp != NULL); + assert(sol->nsl_is_tcp != 0); + struct inpcb *inp = sol->nsl_inp; + + // Ideally all dead inpcbs should have been removed from the list, but be paranoid + if (inp->inp_state == INPCB_STATE_DEAD) { + NSTAT_NOTE_QUAL(nstat_add_all_tcp_skip_dead, client, 0); continue; } - if (nstat_client_source_add(0, client, &nstat_tcp_provider, - cookie, NSTAT_LOCK_NOTHELD) != 0) { - nstat_tucookie_release(cookie); - break; - } + // The client may not be interested in all TCP flows + // There's no need to check the return code from nstat_client_source_add() + // which will have performed any recovery actions directly + nstat_client_source_add(0, client, &nstat_tcp_provider, sol, &sol->nsl_locus); } } - - lck_rw_done(&tcbinfo.ipi_lock); - + NSTAT_UNLOCK_EXCLUSIVE(); return result; } @@ -1822,34 +2187,28 @@ __private_extern__ void nstat_tcp_new_pcb( struct inpcb *inp) { - struct nstat_tucookie *cookie; - + struct nstat_sock_locus *sol = NULL; inp->inp_start_timestamp = mach_continuous_time(); - if (nstat_tcp_watchers == 0) { + assert(inp->inp_nstat_locus == NULL); + sol = nstat_sock_locus_alloc_locked(inp); + if (sol == NULL) { return; } - - socket_lock(inp->inp_socket, 0); + sol->nsl_is_tcp = 1; NSTAT_LOCK_EXCLUSIVE(); - nstat_client *client; + TAILQ_INSERT_HEAD(&nstat_tcp_sock_locus_head, sol, nsl_link); + + nstat_client *client; for (client = nstat_clients; client; client = client->ntc_next) { if ((client->ntc_watching & (1 << NSTAT_PROVIDER_TCP_KERNEL)) != 0) { - // this client is watching tcp - // acquire a reference for it - cookie = nstat_tucookie_alloc_ref_locked(inp); - if (cookie == NULL) { - continue; - } - // add the source, if that fails, release the reference - if (nstat_client_source_add(0, client, - &nstat_tcp_provider, cookie, NSTAT_LOCK_HELD) != 0) { - nstat_tucookie_release_locked(cookie); - } + // this client is watching tcp and may or may not be interested in this example + // Metrics are gathered by nstat_client_source_add(), + // There's no possible recovery action or logging needed here, no need to check return code + nstat_client_source_add(0, client, &nstat_tcp_provider, sol, &sol->nsl_locus); } } NSTAT_UNLOCK_EXCLUSIVE(); - socket_unlock(inp->inp_socket, 0); } __private_extern__ void @@ -1857,40 +2216,46 @@ nstat_pcb_detach(struct inpcb *inp) { nstat_client *client; nstat_src *src; + nstat_src *tmpsrc; tailq_head_nstat_src dead_list; - struct nstat_tucookie *tucookie; - errno_t result; + struct nstat_sock_locus *sol = NULL; - if (inp == NULL || (nstat_tcp_watchers == 0 && nstat_udp_watchers == 0)) { + if (inp == NULL) { return; } - TAILQ_INIT(&dead_list); NSTAT_LOCK_EXCLUSIVE(); - for (client = nstat_clients; client; client = client->ntc_next) { - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) - { - nstat_provider_id_t provider_id = src->nts_provider->nstat_provider_id; - if (provider_id == NSTAT_PROVIDER_TCP_KERNEL || provider_id == NSTAT_PROVIDER_UDP_KERNEL) { - tucookie = (struct nstat_tucookie *)src->nts_cookie; - if (tucookie->inp == inp) { - break; - } - } - } - if (src) { - result = nstat_client_send_goodbye(client, src); - - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); + 
sol = inp->inp_nstat_locus; + if (sol) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_pcb_detach_with_locus); + TAILQ_FOREACH_SAFE(src, &inp->inp_nstat_locus->nsl_locus.ntl_src_queue, nts_locus_link, tmpsrc) { + assert(sol == (struct nstat_sock_locus *)src->nts_cookie); + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_pcb_detach_with_src); + client = src->nts_client; + nstat_client_send_goodbye(client, src); + nstat_src_remove_linkages(client, src); TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); } + if (sol->nsl_is_tcp) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_pcb_detach_tcp); + TAILQ_REMOVE(&nstat_tcp_sock_locus_head, sol, nsl_link); + } else { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_pcb_detach_udp); + TAILQ_REMOVE(&nstat_udp_sock_locus_head, sol, nsl_link); + } + inp->inp_nstat_locus = NULL; + } else { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_pcb_detach_without_locus); } NSTAT_UNLOCK_EXCLUSIVE(); while ((src = TAILQ_FIRST(&dead_list))) { TAILQ_REMOVE(&dead_list, src, nts_client_link); - nstat_client_cleanup_source(NULL, src, TRUE); + nstat_client_cleanup_source(NULL, src); + } + if (sol != NULL) { + nstat_sock_locus_release(sol, TRUE); } } @@ -1899,11 +2264,11 @@ nstat_pcb_event(struct inpcb *inp, u_int64_t event) { nstat_client *client; nstat_src *src; - struct nstat_tucookie *tucookie; + struct nstat_sock_locus *sol; errno_t result; nstat_provider_id_t provider_id; - if (inp == NULL || (nstat_tcp_watchers == 0 && nstat_udp_watchers == 0)) { + if (inp == NULL || (inp->inp_nstat_locus == NULL) || (nstat_tcp_watchers == 0 && nstat_udp_watchers == 0)) { return; } if (((merged_filters.mpf_filters[NSTAT_PROVIDER_TCP_KERNEL].mf_events & event) == 0) && @@ -1912,24 +2277,21 @@ nstat_pcb_event(struct inpcb *inp, u_int64_t event) // This check saves taking the mutex and scanning the list return; } - NSTAT_LOCK_EXCLUSIVE(); - for (client = nstat_clients; client; client = client->ntc_next) { - if (((client->ntc_provider_filters[NSTAT_PROVIDER_TCP_KERNEL].npf_events & event) == 0) && - ((client->ntc_provider_filters[NSTAT_PROVIDER_UDP_KERNEL].npf_events & event) == 0)) { - continue; - } - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) - { - provider_id = src->nts_provider->nstat_provider_id; - if (provider_id == NSTAT_PROVIDER_TCP_KERNEL || provider_id == NSTAT_PROVIDER_UDP_KERNEL) { - tucookie = (struct nstat_tucookie *)src->nts_cookie; - if (tucookie->inp == inp) { - break; - } - } - } + sol = inp->inp_nstat_locus; - if (src && ((client->ntc_provider_filters[provider_id].npf_events & event) != 0)) { + NSTAT_LOCK_EXCLUSIVE(); + + TAILQ_FOREACH(src, &sol->nsl_locus.ntl_src_queue, nts_locus_link) { + assert(sol == (struct nstat_sock_locus *)src->nts_cookie); + client = src->nts_client; + provider_id = src->nts_provider->nstat_provider_id; + assert((provider_id == NSTAT_PROVIDER_TCP_KERNEL) || (provider_id == NSTAT_PROVIDER_UDP_KERNEL)); + + struct nstat_provider_filter *filter = &client->ntc_provider_filters[provider_id]; + bool isUDP = (provider_id == NSTAT_PROVIDER_UDP_KERNEL); + if (((filter->npf_events & event) != 0) && + (nstat_tcpudp_reporting_allowed(src->nts_cookie, filter, isUDP))) { + NSTAT_NOTE_SRC(nstat_pcb_event, client, src); result = nstat_client_send_event(client, src, event); } } @@ -1938,82 +2300,95 @@ nstat_pcb_event(struct inpcb *inp, u_int64_t event) __private_extern__ void -nstat_pcb_cache(struct inpcb *inp) +nstat_udp_pcb_cache(struct inpcb *inp) { - nstat_client *client; - nstat_src *src; - struct nstat_tucookie *tucookie; - - if (inp == NULL || 
nstat_udp_watchers == 0 || - inp->inp_nstat_refcnt == 0) { + if (inp == NULL) { return; } VERIFY(SOCK_PROTO(inp->inp_socket) == IPPROTO_UDP); NSTAT_LOCK_EXCLUSIVE(); - for (client = nstat_clients; client; client = client->ntc_next) { - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) - { - tucookie = (struct nstat_tucookie *)src->nts_cookie; - if (tucookie->inp == inp) { - if (inp->inp_vflag & INP_IPV6) { - in6_ip6_to_sockaddr(&inp->in6p_laddr, - inp->inp_lport, - inp->inp_lifscope, - &tucookie->local.v6, - sizeof(tucookie->local)); - in6_ip6_to_sockaddr(&inp->in6p_faddr, - inp->inp_fport, - inp->inp_fifscope, - &tucookie->remote.v6, - sizeof(tucookie->remote)); - } else if (inp->inp_vflag & INP_IPV4) { - nstat_ip_to_sockaddr(&inp->inp_laddr, - inp->inp_lport, - &tucookie->local.v4, - sizeof(tucookie->local)); - nstat_ip_to_sockaddr(&inp->inp_faddr, - inp->inp_fport, - &tucookie->remote.v4, - sizeof(tucookie->remote)); - } - if (inp->inp_last_outifp) { - tucookie->if_index = - inp->inp_last_outifp->if_index; - } - tucookie->ifnet_properties = nstat_inpcb_to_flags(inp); - tucookie->cached = true; - break; - } + struct nstat_extended_sock_locus *esol = (struct nstat_extended_sock_locus *)inp->inp_nstat_locus; + if (esol != NULL) { + assert(esol->nesl_sock_locus.nsl_magic == NSTAT_SOCK_LOCUS_MAGIC); + if (inp->inp_vflag & INP_IPV6) { + in6_ip6_to_sockaddr(&inp->in6p_laddr, + inp->inp_lport, + inp->inp_lifscope, + &esol->nesl_local.v6, + sizeof(esol->nesl_local)); + in6_ip6_to_sockaddr(&inp->in6p_faddr, + inp->inp_fport, + inp->inp_fifscope, + &esol->nesl_remote.v6, + sizeof(esol->nesl_remote)); + } else if (inp->inp_vflag & INP_IPV4) { + nstat_ip_to_sockaddr(&inp->inp_laddr, + inp->inp_lport, + &esol->nesl_local.v4, + sizeof(esol->nesl_local)); + nstat_ip_to_sockaddr(&inp->inp_faddr, + inp->inp_fport, + &esol->nesl_remote.v4, + sizeof(esol->nesl_remote)); + } else { + bzero(&esol->nesl_local, sizeof(esol->nesl_local)); + bzero(&esol->nesl_remote, sizeof(esol->nesl_remote)); } + if (inp->inp_last_outifp) { + esol->nesl_if_index = inp->inp_last_outifp->if_index; + } else { + esol->nesl_if_index = 0; + } + esol->nesl_sock_locus.nsl_ifnet_properties = nstat_inpcb_to_flags(inp); + esol->nesl_cached = true; } NSTAT_UNLOCK_EXCLUSIVE(); } __private_extern__ void -nstat_pcb_invalidate_cache(struct inpcb *inp) +nstat_udp_pcb_invalidate_cache(struct inpcb *inp) { - nstat_client *client; - nstat_src *src; - struct nstat_tucookie *tucookie; - - if (inp == NULL || nstat_udp_watchers == 0 || - inp->inp_nstat_refcnt == 0) { + if (inp == NULL) { return; } VERIFY(SOCK_PROTO(inp->inp_socket) == IPPROTO_UDP); + struct nstat_extended_sock_locus *esol = (struct nstat_extended_sock_locus *)inp->inp_nstat_locus; + + if (esol != NULL) { + assert(esol->nesl_sock_locus.nsl_magic == NSTAT_SOCK_LOCUS_MAGIC); + if (esol->nesl_cached) { + NSTAT_LOCK_EXCLUSIVE(); + esol->nesl_cached = false; + NSTAT_UNLOCK_EXCLUSIVE(); + } + } +} + +__private_extern__ void +nstat_pcb_update_last_owner(struct inpcb *inp) +{ + bool cause_event = false; NSTAT_LOCK_EXCLUSIVE(); - for (client = nstat_clients; client; client = client->ntc_next) { - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) - { - tucookie = (struct nstat_tucookie *)src->nts_cookie; - if (tucookie->inp == inp) { - tucookie->cached = false; - break; + struct nstat_sock_locus *sol = inp->inp_nstat_locus; + if (sol != NULL) { + pid_t current_pid = inp->inp_socket->last_pid; + if (sol->nsl_pid != current_pid) { + proc_best_name_for_pid(current_pid, 
sol->nsl_pname, sizeof(sol->nsl_pname)); + if (sol->nsl_pname[0] != '\0') { + sol->nsl_pid = current_pid; + cause_event = true; + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_sck_update_last_owner); + } else { + // How best to recover? Just leaving things untouched seems reasonable + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_sck_fail_last_owner); } } } NSTAT_UNLOCK_EXCLUSIVE(); + if (cause_event) { + nstat_pcb_event(inp, NSTAT_EVENT_SRC_DID_CHANGE_OWNER); + } } static errno_t @@ -2031,9 +2406,8 @@ nstat_tcp_copy_descriptor( } nstat_tcp_descriptor *desc = (nstat_tcp_descriptor*)data; - struct nstat_tucookie *tucookie = - (struct nstat_tucookie *)cookie; - struct inpcb *inp = tucookie->inp; + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; + struct inpcb *inp = sol->nsl_inp; struct tcpcb *tp = intotcpcb(inp); bzero(desc, sizeof(*desc)); @@ -2080,19 +2454,27 @@ nstat_tcp_copy_descriptor( desc->ifnet_properties |= NSTAT_SOURCE_IS_INBOUND; } else if (desc->state == TCPS_LISTEN) { desc->ifnet_properties |= NSTAT_SOURCE_IS_LISTENER; - tucookie->ifnet_properties = NSTAT_SOURCE_IS_LISTENER; + sol->nsl_ifnet_properties = NSTAT_SOURCE_IS_LISTENER; } else if (desc->state != TCPS_CLOSED) { desc->ifnet_properties |= NSTAT_SOURCE_IS_OUTBOUND; - tucookie->ifnet_properties = NSTAT_SOURCE_IS_OUTBOUND; + sol->nsl_ifnet_properties = NSTAT_SOURCE_IS_OUTBOUND; } else { - desc->ifnet_properties |= tucookie->ifnet_properties; + desc->ifnet_properties |= sol->nsl_ifnet_properties; } - proc_best_name_for_pid(desc->pid, desc->pname, sizeof(desc->pname)); - if (desc->pname[0] == 0) { - strbufcpy(desc->pname, tucookie->pname); + if ((desc->pid == sol->nsl_pid) && (sol->nsl_pname[0] != '\0')) { + // There should be a nicely cached name to use, for efficiency + strbufcpy(desc->pname, sol->nsl_pname); } else { - desc->pname[sizeof(desc->pname) - 1] = 0; - strbufcpy(tucookie->pname, desc->pname); + proc_best_name_for_pid(desc->pid, desc->pname, sizeof(desc->pname)); + if (desc->pname[0] != '\0') { + // This may not be fully synchronized but multiple updates should be fine + desc->pname[sizeof(desc->pname) - 1] = 0; + strbufcpy(sol->nsl_pname, desc->pname); + sol->nsl_pid = desc->pid; + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tcp_desc_new_name); + } else { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_tcp_desc_fail_name); + } } memcpy(desc->uuid, so->last_uuid, sizeof(so->last_uuid)); memcpy(desc->vuuid, so->so_vuuid, sizeof(so->so_vuuid)); @@ -2136,8 +2518,8 @@ nstat_tcpudp_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_fi bool retval = true; if ((filter->npf_flags & (NSTAT_FILTER_IFNET_FLAGS | NSTAT_FILTER_SPECIFIC_USER)) != 0) { - struct nstat_tucookie *tucookie = (struct nstat_tucookie *)cookie; - struct inpcb *inp = tucookie->inp; + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; + struct inpcb *inp = sol->nsl_inp; /* Only apply interface filter if at least one is allowed. 
*/ if ((filter->npf_flags & NSTAT_FILTER_IFNET_FLAGS) != 0) { @@ -2151,19 +2533,28 @@ nstat_tcpudp_reporting_allowed(nstat_provider_cookie_t cookie, nstat_provider_fi if (is_UDP) { do{ if ((filter->npf_flags & (NSTAT_FILTER_ACCEPT_CELLULAR | NSTAT_FILTER_ACCEPT_EXPENSIVE)) && - (inp->inp_cstat->rxbytes || inp->inp_cstat->txbytes)) { + (inp->inp_mstat.ms_cellular.ts_rxbytes || inp->inp_mstat.ms_cellular.ts_txbytes)) { break; } if ((filter->npf_flags & NSTAT_FILTER_ACCEPT_WIFI) && - (inp->inp_wstat->rxbytes || inp->inp_wstat->txbytes)) { + (inp->inp_mstat.ms_wifi_infra.ts_rxbytes || inp->inp_mstat.ms_wifi_infra.ts_txbytes || + inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes || inp->inp_mstat.ms_wifi_non_infra.ts_txbytes)) { + break; + } + if ((filter->npf_flags & NSTAT_FILTER_ACCEPT_WIFI_INFRA) && + (inp->inp_mstat.ms_wifi_infra.ts_rxbytes || inp->inp_mstat.ms_wifi_infra.ts_txbytes)) { + break; + } + if ((filter->npf_flags & NSTAT_FILTER_ACCEPT_AWDL) && + (inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes || inp->inp_mstat.ms_wifi_non_infra.ts_txbytes)) { break; } if ((filter->npf_flags & NSTAT_FILTER_ACCEPT_WIRED) && - (inp->inp_Wstat->rxbytes || inp->inp_Wstat->txbytes)) { + (inp->inp_mstat.ms_wired.ts_rxbytes || inp->inp_mstat.ms_wired.ts_txbytes)) { break; } if ((filter->npf_flags & NSTAT_FILTER_ACCEPT_COMPANIONLINK_BT) && - (inp->inp_btstat->rxbytes || inp->inp_btstat->txbytes)) { + (inp->inp_mstat.ms_bluetooth.ts_rxbytes || inp->inp_mstat.ms_bluetooth.ts_txbytes)) { break; } return false; @@ -2211,8 +2602,8 @@ nstat_tcp_reporting_allowed( static size_t nstat_tcp_extensions(nstat_provider_cookie_t cookie, u_int32_t extension_id, void *buf, size_t len) { - struct nstat_tucookie *tucookie = (struct nstat_tucookie *)cookie; - struct inpcb *inp = tucookie->inp; + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; + struct inpcb *inp = sol->nsl_inp; if (nstat_tcp_gone(cookie)) { return 0; @@ -2241,6 +2632,7 @@ nstat_init_tcp_provider(void) nstat_tcp_provider.nstat_lookup = nstat_tcp_lookup; nstat_tcp_provider.nstat_gone = nstat_tcp_gone; nstat_tcp_provider.nstat_counts = nstat_tcp_counts; + nstat_tcp_provider.nstat_details = nstat_tcp_details; nstat_tcp_provider.nstat_release = nstat_tcp_release; nstat_tcp_provider.nstat_watcher_add = nstat_tcp_add_watcher; nstat_tcp_provider.nstat_watcher_remove = nstat_tcp_remove_watcher; @@ -2269,11 +2661,10 @@ static int nstat_udp_gone( nstat_provider_cookie_t cookie) { - struct nstat_tucookie *tucookie = - (struct nstat_tucookie *)cookie; + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; struct inpcb *inp; - return (!(inp = tucookie->inp) || + return (!(inp = sol->nsl_inp) || inp->inp_state == INPCB_STATE_DEAD) ? 
1 : 0; } @@ -2283,8 +2674,7 @@ nstat_udp_counts( struct nstat_counts *out_counts, int *out_gone) { - struct nstat_tucookie *tucookie = - (struct nstat_tucookie *)cookie; + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; if (out_gone) { *out_gone = 0; @@ -2295,35 +2685,57 @@ nstat_udp_counts( if (out_gone) { *out_gone = 1; } - if (!tucookie->inp) { + if (!sol->nsl_inp) { return EINVAL; } } - struct inpcb *inp = tucookie->inp; + struct inpcb *inp = sol->nsl_inp; - out_counts->nstat_rxpackets = os_atomic_load(&inp->inp_stat->rxpackets, relaxed); - out_counts->nstat_rxbytes = os_atomic_load(&inp->inp_stat->rxbytes, relaxed); - out_counts->nstat_txpackets = os_atomic_load(&inp->inp_stat->txpackets, relaxed); - out_counts->nstat_txbytes = os_atomic_load(&inp->inp_stat->txbytes, relaxed); - out_counts->nstat_cell_rxbytes = os_atomic_load(&inp->inp_cstat->rxbytes, relaxed); - out_counts->nstat_cell_txbytes = os_atomic_load(&inp->inp_cstat->txbytes, relaxed); - out_counts->nstat_wifi_rxbytes = os_atomic_load(&inp->inp_wstat->rxbytes, relaxed); - out_counts->nstat_wifi_txbytes = os_atomic_load(&inp->inp_wstat->txbytes, relaxed); - out_counts->nstat_wired_rxbytes = os_atomic_load(&inp->inp_Wstat->rxbytes, relaxed); - out_counts->nstat_wired_txbytes = os_atomic_load(&inp->inp_Wstat->txbytes, relaxed); + out_counts->nstat_rxpackets = os_atomic_load(&inp->inp_mstat.ms_total.ts_rxpackets, relaxed); + out_counts->nstat_rxbytes = os_atomic_load(&inp->inp_mstat.ms_total.ts_rxbytes, relaxed); + out_counts->nstat_txpackets = os_atomic_load(&inp->inp_mstat.ms_total.ts_txpackets, relaxed); + out_counts->nstat_txbytes = os_atomic_load(&inp->inp_mstat.ms_total.ts_txbytes, relaxed); + out_counts->nstat_cell_rxbytes = os_atomic_load(&inp->inp_mstat.ms_cellular.ts_rxbytes, relaxed); + out_counts->nstat_cell_txbytes = os_atomic_load(&inp->inp_mstat.ms_cellular.ts_txbytes, relaxed); + out_counts->nstat_wifi_rxbytes = os_atomic_load(&inp->inp_mstat.ms_wifi_infra.ts_rxbytes, relaxed) + + os_atomic_load(&inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes, relaxed); + out_counts->nstat_wifi_txbytes = os_atomic_load(&inp->inp_mstat.ms_wifi_infra.ts_txbytes, relaxed) + + os_atomic_load(&inp->inp_mstat.ms_wifi_non_infra.ts_txbytes, relaxed); + out_counts->nstat_wired_rxbytes = os_atomic_load(&inp->inp_mstat.ms_wired.ts_rxbytes, relaxed); + out_counts->nstat_wired_txbytes = os_atomic_load(&inp->inp_mstat.ms_wired.ts_txbytes, relaxed); return 0; } +static errno_t +nstat_udp_details( + nstat_provider_cookie_t cookie, + struct nstat_detailed_counts *out_details, + int *out_gone) +{ + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; + + if (out_gone) { + *out_gone = 0; + } + + // if the pcb is in the dead state, we should stop using it + if (nstat_udp_gone(cookie)) { + if (out_gone) { + *out_gone = 1; + } + } + struct inpcb *inp = sol->nsl_inp; + + // Note, other field in the out_details structure guaranteed to be zeroed + memcpy(out_details, &inp->inp_mstat, sizeof(inp->inp_mstat)); + return 0; +} + static void nstat_udp_release( - nstat_provider_cookie_t cookie, - int locked) + __unused nstat_provider_cookie_t cookie) { - struct nstat_tucookie *tucookie = - (struct nstat_tucookie *)cookie; - - nstat_tucookie_release_internal(tucookie, locked); } static errno_t @@ -2331,44 +2743,36 @@ nstat_udp_add_watcher( nstat_client *client, nstat_msg_add_all_srcs *req) { - // There is a tricky issue around getting all UDP sockets added once - // and only once. 
nstat_udp_new_pcb() is called prior to the new item - // being placed on any lists where it might be found. - // By locking the udpinfo.ipi_lock prior to marking the client as a watcher, - // it should be impossible for a new socket to be added twice. - // On the other hand, there is still a timing issue where a new socket - // results in a call to nstat_udp_new_pcb() before this watcher - // is instantiated and yet the socket doesn't make it into ipi_listhead - // prior to the scan. - errno_t result; - lck_rw_lock_shared(&udbinfo.ipi_lock); + NSTAT_LOCK_EXCLUSIVE(); result = nstat_set_provider_filter(client, req); - if (result == 0) { - struct inpcb *inp; - struct nstat_tucookie *cookie; + // Only now can nstat_udp_new_pcb() find the client via the provider filter + // so the NSTAT_LOCK_EXCLUSIVE() ensures that sockets are added once and only once + if (result == 0) { OSIncrementAtomic(&nstat_udp_watchers); - // Add all current UDP inpcbs. - LIST_FOREACH(inp, udbinfo.ipi_listhead, inp_list) - { - cookie = nstat_tucookie_alloc_ref(inp); - if (cookie == NULL) { + struct nstat_sock_locus *sol; + + TAILQ_FOREACH(sol, &nstat_udp_sock_locus_head, nsl_link) { + assert(sol->nsl_magic == NSTAT_SOCK_LOCUS_MAGIC); + assert(sol->nsl_inp != NULL); + struct inpcb *inp = sol->nsl_inp; + + // Ideally all dead inpcbs should have been removed from the list, but be paranoid + if (inp->inp_state == INPCB_STATE_DEAD) { + NSTAT_NOTE_QUAL(nstat_add_all_udp_skip_dead, client, 0); continue; } - if (nstat_client_source_add(0, client, &nstat_udp_provider, - cookie, NSTAT_LOCK_NOTHELD) != 0) { - nstat_tucookie_release(cookie); - break; - } + // The client may not be interested in all TCP flows + // There's no need to check the return code from nstat_client_source_add() + // which will have performed any recovery actions directly + nstat_client_source_add(0, client, &nstat_udp_provider, sol, &sol->nsl_locus); } } - - lck_rw_done(&udbinfo.ipi_lock); - + NSTAT_UNLOCK_EXCLUSIVE(); return result; } @@ -2383,34 +2787,28 @@ __private_extern__ void nstat_udp_new_pcb( struct inpcb *inp) { - struct nstat_tucookie *cookie; - + struct nstat_sock_locus *sol = NULL; inp->inp_start_timestamp = mach_continuous_time(); - if (nstat_udp_watchers == 0) { + assert(inp->inp_nstat_locus == NULL); + sol = nstat_sock_locus_alloc_locked(inp); + if (sol == NULL) { return; } - - socket_lock(inp->inp_socket, 0); + sol->nsl_is_tcp = 0; NSTAT_LOCK_EXCLUSIVE(); - nstat_client *client; + TAILQ_INSERT_HEAD(&nstat_udp_sock_locus_head, sol, nsl_link); + + nstat_client *client; for (client = nstat_clients; client; client = client->ntc_next) { if ((client->ntc_watching & (1 << NSTAT_PROVIDER_UDP_KERNEL)) != 0) { - // this client is watching udp - // acquire a reference for it - cookie = nstat_tucookie_alloc_ref_locked(inp); - if (cookie == NULL) { - continue; - } - // add the source, if that fails, release the reference - if (nstat_client_source_add(0, client, - &nstat_udp_provider, cookie, NSTAT_LOCK_HELD) != 0) { - nstat_tucookie_release_locked(cookie); - } + // this client is watching tcp and may or may not be interested in this example + // Metrics are gathered by nstat_client_source_add(), + // There's no possible recovery action or logging needed here, no need to check return code + nstat_client_source_add(0, client, &nstat_udp_provider, sol, &sol->nsl_locus); } } NSTAT_UNLOCK_EXCLUSIVE(); - socket_unlock(inp->inp_socket, 0); } static errno_t @@ -2427,14 +2825,13 @@ nstat_udp_copy_descriptor( return EINVAL; } - struct nstat_tucookie *tucookie = - 
(struct nstat_tucookie *)cookie; - nstat_udp_descriptor *desc = (nstat_udp_descriptor*)data; - struct inpcb *inp = tucookie->inp; + struct nstat_extended_sock_locus *esol = (struct nstat_extended_sock_locus *)cookie; + nstat_udp_descriptor *desc = (nstat_udp_descriptor*)data; + struct inpcb *inp = esol->nesl_sock_locus.nsl_inp; bzero(desc, sizeof(*desc)); - if (tucookie->cached == false) { + if (esol->nesl_cached == false) { if (inp->inp_vflag & INP_IPV6) { in6_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, inp->inp_lifscope, &desc->local.v6, sizeof(desc->local.v6)); @@ -2449,23 +2846,23 @@ nstat_udp_copy_descriptor( desc->ifnet_properties = nstat_inpcb_to_flags(inp); } else { if (inp->inp_vflag & INP_IPV6) { - memcpy(&desc->local.v6, &tucookie->local.v6, + memcpy(&desc->local.v6, &esol->nesl_local.v6, sizeof(desc->local.v6)); - memcpy(&desc->remote.v6, &tucookie->remote.v6, + memcpy(&desc->remote.v6, &esol->nesl_remote.v6, sizeof(desc->remote.v6)); } else if (inp->inp_vflag & INP_IPV4) { - memcpy(&desc->local.v4, &tucookie->local.v4, + memcpy(&desc->local.v4, &esol->nesl_local.v4, sizeof(desc->local.v4)); - memcpy(&desc->remote.v4, &tucookie->remote.v4, + memcpy(&desc->remote.v4, &esol->nesl_remote.v4, sizeof(desc->remote.v4)); } - desc->ifnet_properties = tucookie->ifnet_properties; + desc->ifnet_properties = esol->nesl_sock_locus.nsl_ifnet_properties; } if (inp->inp_last_outifp) { desc->ifindex = inp->inp_last_outifp->if_index; } else { - desc->ifindex = tucookie->if_index; + desc->ifindex = esol->nesl_if_index; } struct socket *so = inp->inp_socket; @@ -2474,12 +2871,25 @@ nstat_udp_copy_descriptor( // they're in sync? desc->upid = so->last_upid; desc->pid = so->last_pid; - proc_best_name_for_pid(desc->pid, desc->pname, sizeof(desc->pname)); - if (desc->pname[0] == 0) { - strbufcpy(desc->pname, tucookie->pname); + if (so->so_flags1 & SOF1_INBOUND) { + desc->ifnet_properties |= NSTAT_SOURCE_IS_INBOUND; } else { - desc->pname[sizeof(desc->pname) - 1] = 0; - strbufcpy(tucookie->pname, desc->pname); + desc->ifnet_properties |= esol->nesl_sock_locus.nsl_ifnet_properties; + } + if (desc->pid == esol->nesl_sock_locus.nsl_pid) { + // There should be a nicely cached name to use, for efficiency + strbufcpy(desc->pname, esol->nesl_sock_locus.nsl_pname); + } else { + proc_best_name_for_pid(desc->pid, desc->pname, sizeof(desc->pname)); + if (desc->pname[0] != '\0') { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_udp_desc_new_name); + // This may not be fully synchronized but multiple updates should be fine + desc->pname[sizeof(desc->pname) - 1] = 0; + strbufcpy(esol->nesl_sock_locus.nsl_pname, desc->pname); + esol->nesl_sock_locus.nsl_pid = desc->pid; + } else { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_udp_desc_fail_name); + } } memcpy(desc->uuid, so->last_uuid, sizeof(so->last_uuid)); memcpy(desc->vuuid, so->so_vuuid, sizeof(so->so_vuuid)); @@ -2528,8 +2938,8 @@ nstat_udp_reporting_allowed( static size_t nstat_udp_extensions(nstat_provider_cookie_t cookie, u_int32_t extension_id, void *buf, size_t len) { - struct nstat_tucookie *tucookie = (struct nstat_tucookie *)cookie; - struct inpcb *inp = tucookie->inp; + struct nstat_sock_locus *sol = (struct nstat_sock_locus *)cookie; + struct inpcb *inp = sol->nsl_inp; if (nstat_udp_gone(cookie)) { return 0; } @@ -2557,6 +2967,7 @@ nstat_init_udp_provider(void) nstat_udp_provider.nstat_lookup = nstat_udp_lookup; nstat_udp_provider.nstat_gone = nstat_udp_gone; nstat_udp_provider.nstat_counts = nstat_udp_counts; + nstat_udp_provider.nstat_details = 
nstat_udp_details; nstat_udp_provider.nstat_watcher_add = nstat_udp_add_watcher; nstat_udp_provider.nstat_watcher_remove = nstat_udp_remove_watcher; nstat_udp_provider.nstat_copy_descriptor = nstat_udp_copy_descriptor; @@ -2592,6 +3003,7 @@ enum nstat_rnf_override { }; struct nstat_tu_shadow { + nstat_locus shad_locus; tailq_entry_tu_shadow shad_link; userland_stats_request_vals_fn *shad_getvals_fn; userland_stats_request_extension_fn *shad_get_extension_fn; @@ -2600,7 +3012,7 @@ struct nstat_tu_shadow { u_int64_t shad_start_timestamp; nstat_provider_id_t shad_provider; struct nstat_procdetails *shad_procdetails; - bool shad_live; // false if defunct + bool shad_live; // false if defunct enum nstat_rnf_override shad_rnf_override; uint32_t shad_magic; }; @@ -2610,6 +3022,7 @@ struct nstat_tu_shadow { #define TU_SHADOW_UNMAGIC 0xdeaddeed static tailq_head_tu_shadow nstat_userprot_shad_head = TAILQ_HEAD_INITIALIZER(nstat_userprot_shad_head); +static tailq_head_tu_shadow nstat_defunct_userprot_shad_head = TAILQ_HEAD_INITIALIZER(nstat_defunct_userprot_shad_head); static errno_t nstat_userland_tu_lookup( @@ -2640,7 +3053,26 @@ nstat_userland_tu_counts( assert(shad->shad_magic == TU_SHADOW_MAGIC); assert(shad->shad_live); - bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, out_counts, NULL); + bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, out_counts, NULL, NULL); + + if (out_gone) { + *out_gone = 0; + } + + return (result)? 0 : EIO; +} + +static errno_t +nstat_userland_tu_details( + nstat_provider_cookie_t cookie, + struct nstat_detailed_counts *out_details, + int *out_gone) +{ + struct nstat_tu_shadow *shad = (struct nstat_tu_shadow *)cookie; + assert(shad->shad_magic == TU_SHADOW_MAGIC); + assert(shad->shad_live); + + bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, out_details, NULL); if (out_gone) { *out_gone = 0; @@ -2662,7 +3094,7 @@ nstat_userland_tu_copy_descriptor( struct nstat_procdetails *procdetails = shad->shad_procdetails; assert(procdetails->pdet_magic == NSTAT_PROCDETAILS_MAGIC); - bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, data); + bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, NULL, data); switch (shad->shad_provider) { case NSTAT_PROVIDER_TCP_USERLAND: @@ -2730,8 +3162,7 @@ nstat_userland_tu_copy_descriptor( static void nstat_userland_tu_release( - __unused nstat_provider_cookie_t cookie, - __unused int locked) + __unused nstat_provider_cookie_t cookie) { // Called when a nstat_src is detached. // We don't reference count or ask for delayed release so nothing to do here. @@ -2777,7 +3208,7 @@ nstat_userland_tcp_reporting_allowed( if ((filter->npf_flags & NSTAT_FILTER_IFNET_FLAGS) != 0) { u_int32_t ifflags = NSTAT_IFNET_IS_UNKNOWN_TYPE; - if ((*shad->shad_getvals_fn)(shad->shad_provider_context, &ifflags, NULL, NULL, NULL)) { + if ((*shad->shad_getvals_fn)(shad->shad_provider_context, &ifflags, NULL, NULL, NULL, NULL)) { if ((filter->npf_flags & ifflags) == 0) { return false; } @@ -2786,7 +3217,7 @@ nstat_userland_tcp_reporting_allowed( if ((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER) != 0) { nstat_tcp_descriptor tcp_desc; // Stack allocation - OK or pushing the limits too far? 
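The structural change running through the socket, userland-shadow, and (further down) generic-shadow hunks is a per-cookie "locus": every tracked object now carries a small anchor that queues the nstat_src instances minted for it, so teardown and event paths can walk exactly the affected sources instead of scanning every client's source queue. The sketch below is illustrative only; the type and field names (nstat_locus, ntl_src_queue, nts_locus_link, nstat_sock_locus and its nsl_* fields) are inferred from the hunks in this patch, not copied from the real headers, and the pname buffer size is a guess.

    /* Illustrative sketch, assuming <sys/queue.h> and the shapes implied by this patch. */
    typedef struct nstat_locus {
        TAILQ_HEAD(, nstat_src) ntl_src_queue;     /* every nstat_src minted for this cookie */
    } nstat_locus;

    struct nstat_sock_locus {
        uint32_t                      nsl_magic;   /* NSTAT_SOCK_LOCUS_MAGIC */
        TAILQ_ENTRY(nstat_sock_locus) nsl_link;    /* on nstat_tcp/udp_sock_locus_head */
        struct inpcb                 *nsl_inp;     /* backing protocol control block */
        uint32_t                      nsl_is_tcp;  /* 0 for UDP sockets */
        uint32_t                      nsl_ifnet_properties;
        pid_t                         nsl_pid;     /* cached owner; refreshed in copy_descriptor */
        char                          nsl_pname[64];  /* size assumed */
        nstat_locus                   nsl_locus;   /* anchor for this socket's sources */
    };

    /* Each nstat_src is now threaded onto two queues while NSTAT_LOCK_EXCLUSIVE is
     * held: the owning client's ntc_src_queue (nts_client_link, as before) and the
     * cookie's ntl_src_queue (nts_locus_link, new). */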
- if ((*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, &tcp_desc)) { + if ((*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, NULL, &tcp_desc)) { retval = check_reporting_for_user(filter, (pid_t)tcp_desc.pid, (pid_t)tcp_desc.epid, &tcp_desc.uuid, &tcp_desc.euuid); } else { @@ -2823,7 +3254,7 @@ nstat_userland_udp_reporting_allowed( if ((filter->npf_flags & NSTAT_FILTER_IFNET_FLAGS) != 0) { u_int32_t ifflags = NSTAT_IFNET_IS_UNKNOWN_TYPE; - if ((*shad->shad_getvals_fn)(shad->shad_provider_context, &ifflags, NULL, NULL, NULL)) { + if ((*shad->shad_getvals_fn)(shad->shad_provider_context, &ifflags, NULL, NULL, NULL, NULL)) { if ((filter->npf_flags & ifflags) == 0) { return false; } @@ -2831,7 +3262,7 @@ nstat_userland_udp_reporting_allowed( } if ((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER) != 0) { nstat_udp_descriptor udp_desc; // Stack allocation - OK or pushing the limits too far? - if ((*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, &udp_desc)) { + if ((*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, NULL, &udp_desc)) { retval = check_reporting_for_user(filter, (pid_t)udp_desc.pid, (pid_t)udp_desc.epid, &udp_desc.uuid, &udp_desc.euuid); } else { @@ -2855,7 +3286,7 @@ nstat_userland_quic_reporting_allowed( if ((filter->npf_flags & NSTAT_FILTER_IFNET_FLAGS) != 0) { u_int32_t ifflags = NSTAT_IFNET_IS_UNKNOWN_TYPE; - if ((*shad->shad_getvals_fn)(shad->shad_provider_context, &ifflags, NULL, NULL, NULL)) { + if ((*shad->shad_getvals_fn)(shad->shad_provider_context, &ifflags, NULL, NULL, NULL, NULL)) { if ((filter->npf_flags & ifflags) == 0) { return false; } @@ -2863,7 +3294,7 @@ nstat_userland_quic_reporting_allowed( } if ((filter->npf_flags & NSTAT_FILTER_SPECIFIC_USER) != 0) { nstat_quic_descriptor quic_desc; // Stack allocation - OK or pushing the limits too far? 
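Every call through shad_getvals_fn in the hunks above and below now passes one extra argument, and the new nstat_userland_tu_details() is what requests it: a detailed-counts block sitting between the existing counts and descriptor outputs. The reconstruction below is inferred from the call sites only; the parameter names are invented here, the second always-NULL argument is left opaque because these call sites never exercise it, and the real typedef may differ.

    /* Hedged reconstruction of the widened callback, from call sites only. */
    typedef bool userland_stats_request_vals_fn(userland_stats_provider_context *ctx,
        u_int32_t *out_ifflags,                     /* reporting_allowed asks for this */
        void *unused_here,                          /* always NULL at these call sites */
        struct nstat_counts *out_counts,            /* nstat_userland_tu_counts */
        struct nstat_detailed_counts *out_details,  /* new slot: nstat_userland_tu_details */
        void *out_descriptor);                      /* tcp/udp/quic descriptor */

    /* e.g. the details hook only wants the fifth slot filled in: */
    (*shad->shad_getvals_fn)(shad->shad_provider_context,
        NULL, NULL, NULL, out_details, NULL);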
- if ((*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, &quic_desc)) { + if ((*shad->shad_getvals_fn)(shad->shad_provider_context, NULL, NULL, NULL, NULL, &quic_desc)) { retval = check_reporting_for_user(filter, (pid_t)quic_desc.pid, (pid_t)quic_desc.epid, &quic_desc.uuid, &quic_desc.euuid); } else { @@ -2894,8 +3325,8 @@ nstat_userland_protocol_add_watcher( TAILQ_FOREACH(shad, &nstat_userprot_shad_head, shad_link) { assert(shad->shad_magic == TU_SHADOW_MAGIC); - if ((shad->shad_provider == nstat_provider_type) && (shad->shad_live)) { - result = nstat_client_source_add(0, client, nstat_provider, shad, NSTAT_LOCK_HELD); + if (shad->shad_provider == nstat_provider_type) { + result = nstat_client_source_add(0, client, nstat_provider, shad, &shad->shad_locus); if (result != 0) { NSTAT_LOG_ERROR("nstat_client_source_add returned %d for " "provider type: %d", result, nstat_provider_type); @@ -2967,6 +3398,7 @@ nstat_init_userland_tcp_provider(void) nstat_userland_tcp_provider.nstat_lookup = nstat_userland_tu_lookup; nstat_userland_tcp_provider.nstat_gone = nstat_userland_tu_gone; nstat_userland_tcp_provider.nstat_counts = nstat_userland_tu_counts; + nstat_userland_tcp_provider.nstat_details = nstat_userland_tu_details; nstat_userland_tcp_provider.nstat_release = nstat_userland_tu_release; nstat_userland_tcp_provider.nstat_watcher_add = nstat_userland_tcp_add_watcher; nstat_userland_tcp_provider.nstat_watcher_remove = nstat_userland_tcp_remove_watcher; @@ -2987,6 +3419,7 @@ nstat_init_userland_udp_provider(void) nstat_userland_udp_provider.nstat_lookup = nstat_userland_tu_lookup; nstat_userland_udp_provider.nstat_gone = nstat_userland_tu_gone; nstat_userland_udp_provider.nstat_counts = nstat_userland_tu_counts; + nstat_userland_udp_provider.nstat_details = nstat_userland_tu_details; nstat_userland_udp_provider.nstat_release = nstat_userland_tu_release; nstat_userland_udp_provider.nstat_watcher_add = nstat_userland_udp_add_watcher; nstat_userland_udp_provider.nstat_watcher_remove = nstat_userland_udp_remove_watcher; @@ -3006,6 +3439,7 @@ nstat_init_userland_quic_provider(void) nstat_userland_quic_provider.nstat_lookup = nstat_userland_tu_lookup; nstat_userland_quic_provider.nstat_gone = nstat_userland_tu_gone; nstat_userland_quic_provider.nstat_counts = nstat_userland_tu_counts; + nstat_userland_quic_provider.nstat_details = nstat_userland_tu_details; nstat_userland_quic_provider.nstat_release = nstat_userland_tu_release; nstat_userland_quic_provider.nstat_watcher_add = nstat_userland_quic_add_watcher; nstat_userland_quic_provider.nstat_watcher_remove = nstat_userland_quic_remove_watcher; @@ -3044,7 +3478,7 @@ ntstat_userland_stats_open(userland_stats_provider_context *ctx, kfree_type(struct nstat_tu_shadow, shad); return NULL; } - + TAILQ_INIT(&shad->shad_locus.ntl_src_queue); shad->shad_getvals_fn = req_fn; shad->shad_get_extension_fn = req_extension_fn; shad->shad_provider_context = ctx; @@ -3080,7 +3514,7 @@ ntstat_userland_stats_open(userland_stats_provider_context *ctx, if ((client->ntc_watching & (1 << provider_id)) != 0) { // this client is watching tcp/udp/quic userland // Link to it. - int result = nstat_client_source_add(0, client, provider, shad, NSTAT_LOCK_HELD); + int result = nstat_client_source_add(0, client, provider, shad, &shad->shad_locus); if (result != 0) { // There should be some kind of statistics for failures like this. 
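A second signature change threads through the watcher and open paths here: nstat_client_source_add() no longer takes the NSTAT_LOCK_HELD/NSTAT_LOCK_NOTHELD flag and instead receives the cookie's locus, letting it link the new source onto the per-cookie queue while the exclusive lock is held. The prototype below is inferred purely from the call sites in this patch, not from the declaration earlier in ntstat.c.

    /* Assumed updated prototype (call-site inference only). */
    static errno_t nstat_client_source_add(u_int64_t context,
        nstat_client *client,
        nstat_provider *provider,
        nstat_provider_cookie_t cookie,
        nstat_locus *locus);     /* replaces the old "int locked" argument */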
// The kernel ntstat component should keep some @@ -3100,6 +3534,9 @@ ntstat_userland_stats_close(nstat_userland_context nstat_ctx) struct nstat_tu_shadow *shad = (struct nstat_tu_shadow *)nstat_ctx; tailq_head_nstat_src dead_list; nstat_src *src; + nstat_src *tmpsrc; + nstat_client *client; + errno_t result; if (shad == NULL) { return; @@ -3109,36 +3546,17 @@ ntstat_userland_stats_close(nstat_userland_context nstat_ctx) TAILQ_INIT(&dead_list); NSTAT_LOCK_EXCLUSIVE(); - if (nstat_userland_udp_watchers != 0 || - nstat_userland_tcp_watchers != 0 || - nstat_userland_quic_watchers != 0) { - nstat_client *client; - errno_t result; - for (client = nstat_clients; client; client = client->ntc_next) { - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) - { - if (shad == (struct nstat_tu_shadow *)src->nts_cookie) { - nstat_provider_id_t provider_id = src->nts_provider->nstat_provider_id; - if (provider_id == NSTAT_PROVIDER_TCP_USERLAND || - provider_id == NSTAT_PROVIDER_UDP_USERLAND || - provider_id == NSTAT_PROVIDER_QUIC_USERLAND) { - break; - } - } - } - - if (src) { - result = nstat_client_send_goodbye(client, src); - - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); - TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); - } - } + TAILQ_FOREACH_SAFE(src, &shad->shad_locus.ntl_src_queue, nts_locus_link, tmpsrc) { + assert(shad == (struct nstat_tu_shadow *)src->nts_cookie); + client = src->nts_client; + result = nstat_client_send_goodbye(client, src); + nstat_src_remove_linkages(client, src); + TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); } - TAILQ_REMOVE(&nstat_userprot_shad_head, shad, shad_link); if (shad->shad_live) { + TAILQ_REMOVE(&nstat_userprot_shad_head, shad, shad_link); if (shad->shad_provider == NSTAT_PROVIDER_TCP_USERLAND) { nstat_userland_tcp_shadows--; } else if (shad->shad_provider == NSTAT_PROVIDER_UDP_USERLAND) { @@ -3146,13 +3564,15 @@ ntstat_userland_stats_close(nstat_userland_context nstat_ctx) } else { nstat_userland_quic_shadows--; } + } else { + TAILQ_REMOVE(&nstat_defunct_userprot_shad_head, shad, shad_link); } NSTAT_UNLOCK_EXCLUSIVE(); while ((src = TAILQ_FIRST(&dead_list))) { TAILQ_REMOVE(&dead_list, src, nts_client_link); - nstat_client_cleanup_source(NULL, src, TRUE); + nstat_client_cleanup_source(NULL, src); } nstat_release_procdetails(shad->shad_procdetails); shad->shad_magic = TU_SHADOW_UNMAGIC; @@ -3167,30 +3587,18 @@ ntstat_userland_stats_event_locked( { nstat_client *client; nstat_src *src; - errno_t result; nstat_provider_id_t provider_id; NSTAT_ASSERT_LOCKED_EXCLUSIVE(); - if (nstat_userland_udp_watchers != 0 || nstat_userland_tcp_watchers != 0 || nstat_userland_quic_watchers != 0) { - for (client = nstat_clients; client; client = client->ntc_next) { - if (((client->ntc_provider_filters[NSTAT_PROVIDER_TCP_USERLAND].npf_events & event) == 0) && - ((client->ntc_provider_filters[NSTAT_PROVIDER_UDP_USERLAND].npf_events & event) == 0) && - ((client->ntc_provider_filters[NSTAT_PROVIDER_QUIC_USERLAND].npf_events & event) == 0)) { - continue; - } - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) { - provider_id = src->nts_provider->nstat_provider_id; - if (provider_id == NSTAT_PROVIDER_TCP_USERLAND || provider_id == NSTAT_PROVIDER_UDP_USERLAND || - provider_id == NSTAT_PROVIDER_QUIC_USERLAND) { - if (shad == (struct nstat_tu_shadow *)src->nts_cookie) { - break; - } - } - } - if (src && ((client->ntc_provider_filters[provider_id].npf_events & event) != 0)) { - result = nstat_client_send_event(client, src, event); - } + 
assert(shad->shad_magic == TU_SHADOW_MAGIC); + provider_id = shad->shad_provider; + + TAILQ_FOREACH(src, &shad->shad_locus.ntl_src_queue, nts_locus_link) { + assert(shad == (struct nstat_tu_shadow *)src->nts_cookie); + client = src->nts_client; + if ((client->ntc_provider_filters[provider_id].npf_events & event) != 0) { + nstat_client_send_event(client, src, event); } } } @@ -3207,6 +3615,9 @@ ntstat_userland_stats_event( struct nstat_tu_shadow *shad = (struct nstat_tu_shadow *)nstat_ctx; tailq_head_nstat_src dead_list; nstat_src *src; + nstat_src *tmpsrc; + nstat_client *client; + errno_t result; if (shad == NULL) { return; @@ -3218,35 +3629,22 @@ ntstat_userland_stats_event( TAILQ_INIT(&dead_list); NSTAT_LOCK_EXCLUSIVE(); - if (nstat_userland_udp_watchers != 0 || - nstat_userland_tcp_watchers != 0 || - nstat_userland_quic_watchers != 0) { - nstat_client *client; - errno_t result; - for (client = nstat_clients; client; client = client->ntc_next) { - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) - { - if (shad == (struct nstat_tu_shadow *)src->nts_cookie) { - break; - } - } - - if (src) { - if (!(src->nts_filter & NSTAT_FILTER_TCP_NO_EARLY_CLOSE)) { - result = nstat_client_send_goodbye(client, src); - - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); - TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); - } - } + TAILQ_FOREACH_SAFE(src, &shad->shad_locus.ntl_src_queue, nts_locus_link, tmpsrc) { + assert(shad == (struct nstat_tu_shadow *)src->nts_cookie); + if (!(src->nts_filter & NSTAT_FILTER_TCP_NO_EARLY_CLOSE)) { + assert(src->nts_locus != NULL); + client = src->nts_client; + result = nstat_client_send_goodbye(client, src); + nstat_src_remove_linkages(client, src); + TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); } } NSTAT_UNLOCK_EXCLUSIVE(); while ((src = TAILQ_FIRST(&dead_list))) { TAILQ_REMOVE(&dead_list, src, nts_client_link); - nstat_client_cleanup_source(NULL, src, TRUE); + nstat_client_cleanup_source(NULL, src); } } } @@ -3257,51 +3655,37 @@ nstats_userland_stats_defunct_for_process(int pid) // Note that this can be called multiple times for the same process tailq_head_nstat_src dead_list; nstat_src *src, *tmpsrc; - struct nstat_tu_shadow *shad; + struct nstat_tu_shadow *shad, *tmpshad; TAILQ_INIT(&dead_list); NSTAT_LOCK_EXCLUSIVE(); - if (nstat_userland_udp_watchers != 0 || - nstat_userland_tcp_watchers != 0 || - nstat_userland_quic_watchers != 0) { + TAILQ_FOREACH_SAFE(shad, &nstat_userprot_shad_head, shad_link, tmpshad) { + assert(shad->shad_magic == TU_SHADOW_MAGIC); nstat_client *client; errno_t result; - - for (client = nstat_clients; client; client = client->ntc_next) { - TAILQ_FOREACH_SAFE(src, &client->ntc_src_queue, nts_client_link, tmpsrc) - { - nstat_provider_id_t provider_id = src->nts_provider->nstat_provider_id; - if (provider_id == NSTAT_PROVIDER_TCP_USERLAND || - provider_id == NSTAT_PROVIDER_UDP_USERLAND || - provider_id == NSTAT_PROVIDER_QUIC_USERLAND) { - shad = (struct nstat_tu_shadow *)src->nts_cookie; - if (shad->shad_procdetails->pdet_pid == pid) { - result = nstat_client_send_goodbye(client, src); - - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); - TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); - } - } - } - } - } - - TAILQ_FOREACH(shad, &nstat_userprot_shad_head, shad_link) { assert(shad->shad_magic == TU_SHADOW_MAGIC); + assert(shad->shad_live); - if (shad->shad_live) { - if (shad->shad_procdetails->pdet_pid == pid) { - shad->shad_live = false; - if (shad->shad_provider == NSTAT_PROVIDER_TCP_USERLAND) 
{ - nstat_userland_tcp_shadows--; - } else if (shad->shad_provider == NSTAT_PROVIDER_UDP_USERLAND) { - nstat_userland_udp_shadows--; - } else { - nstat_userland_quic_shadows--; - } + if (shad->shad_procdetails->pdet_pid == pid) { + TAILQ_FOREACH_SAFE(src, &shad->shad_locus.ntl_src_queue, nts_locus_link, tmpsrc) { + assert(shad == (struct nstat_tu_shadow *)src->nts_cookie); + client = src->nts_client; + result = nstat_client_send_goodbye(client, src); + nstat_src_remove_linkages(client, src); + TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); } + shad->shad_live = false; + if (shad->shad_provider == NSTAT_PROVIDER_TCP_USERLAND) { + nstat_userland_tcp_shadows--; + } else if (shad->shad_provider == NSTAT_PROVIDER_UDP_USERLAND) { + nstat_userland_udp_shadows--; + } else { + nstat_userland_quic_shadows--; + } + TAILQ_REMOVE(&nstat_userprot_shad_head, shad, shad_link); + TAILQ_INSERT_TAIL(&nstat_defunct_userprot_shad_head, shad, shad_link); } } @@ -3309,7 +3693,7 @@ nstats_userland_stats_defunct_for_process(int pid) while ((src = TAILQ_FIRST(&dead_list))) { TAILQ_REMOVE(&dead_list, src, nts_client_link); - nstat_client_cleanup_source(NULL, src, TRUE); + nstat_client_cleanup_source(NULL, src); } } @@ -3364,6 +3748,7 @@ static nstat_provider nstat_udp_subflow_provider; static u_int32_t nstat_generic_provider_watchers[NSTAT_PROVIDER_COUNT]; struct nstat_generic_shadow { + nstat_locus gshad_locus; tailq_entry_generic_shadow gshad_link; nstat_provider_context gshad_provider_context; nstat_provider_request_vals_fn *gshad_getvals_fn; @@ -3435,7 +3820,7 @@ nstat_generic_provider_counts( memset(out_counts, 0, sizeof(*out_counts)); - bool result = (*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, NULL, out_counts, NULL); + bool result = (*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, NULL, out_counts, NULL, NULL); if (out_gone) { *out_gone = 0; @@ -3443,6 +3828,24 @@ nstat_generic_provider_counts( return (result)? 0 : EIO; } +static errno_t +nstat_generic_provider_details( + nstat_provider_cookie_t cookie, + struct nstat_detailed_counts *out_counts, + int *out_gone) +{ + struct nstat_generic_shadow *gshad = (struct nstat_generic_shadow *)cookie; + assert(gshad->gshad_magic == NSTAT_GENERIC_SHADOW_MAGIC); + + memset(out_counts, 0, sizeof(*out_counts)); + + bool result = (*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, NULL, NULL, out_counts, NULL); + + if (out_gone) { + *out_gone = 0; + } + return (result)? 0 : EIO; +} static errno_t nstat_generic_provider_copy_descriptor( @@ -3455,7 +3858,7 @@ nstat_generic_provider_copy_descriptor( struct nstat_procdetails *procdetails = gshad->gshad_procdetails; assert(procdetails->pdet_magic == NSTAT_PROCDETAILS_MAGIC); - bool result = (*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, NULL, NULL, data); + bool result = (*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, NULL, NULL, NULL, data); switch (gshad->gshad_provider) { case NSTAT_PROVIDER_CONN_USERLAND: @@ -3488,8 +3891,7 @@ nstat_generic_provider_copy_descriptor( static void nstat_generic_provider_release( - __unused nstat_provider_cookie_t cookie, - __unused int locked) + __unused nstat_provider_cookie_t cookie) { // Called when a nstat_src is detached. 
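The userland close, early-close event, and defunct-for-process paths above (and the generic-provider close further down) all converge on one teardown sequence now that sources hang off the cookie's locus queue. The condensed sketch below uses the helper names exactly as they appear in these hunks, where locus stands for whichever anchor applies (&shad->shad_locus here, &gshad->gshad_locus below); it is a shape summary, not a verbatim excerpt.

    /* Sketch of the common teardown shape. */
    tailq_head_nstat_src dead_list;
    nstat_src *src, *tmpsrc;

    TAILQ_INIT(&dead_list);
    NSTAT_LOCK_EXCLUSIVE();
    TAILQ_FOREACH_SAFE(src, &locus->ntl_src_queue, nts_locus_link, tmpsrc) {
        nstat_client *client = src->nts_client;
        nstat_client_send_goodbye(client, src);     /* best effort; no recovery possible */
        nstat_src_remove_linkages(client, src);     /* off both client and locus queues */
        TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link);
    }
    NSTAT_UNLOCK_EXCLUSIVE();

    /* Destruction is deferred until the exclusive lock is dropped. */
    while ((src = TAILQ_FIRST(&dead_list))) {
        TAILQ_REMOVE(&dead_list, src, nts_client_link);
        nstat_client_cleanup_source(NULL, src);
    }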
struct nstat_generic_shadow *gshad = (struct nstat_generic_shadow *)cookie; @@ -3518,7 +3920,7 @@ nstat_generic_provider_reporting_allowed( if ((filter->npf_flags & NSTAT_FILTER_IFNET_AND_CONN_FLAGS) != 0) { u_int32_t ifflags = NSTAT_IFNET_IS_UNKNOWN_TYPE; - if ((*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, &ifflags, NULL, NULL)) { + if ((*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, &ifflags, NULL, NULL, NULL)) { if ((filter->npf_flags & ifflags) == 0) { return false; } @@ -3539,7 +3941,7 @@ nstat_generic_provider_reporting_allowed( return true; } if ((filter->npf_flags & (NSTAT_FILTER_SPECIFIC_USER_BY_EPID | NSTAT_FILTER_SPECIFIC_USER_BY_EUUID)) != 0) { - nstat_udp_descriptor udp_desc; // Stack allocation - OK or pushing the limits too far? + nstat_udp_descriptor udp_desc; // Stack allocation - OK or pushing the limits too far? switch (gshad->gshad_provider) { case NSTAT_PROVIDER_CONN_USERLAND: // Filtering by effective uuid or effective pid is currently not supported @@ -3548,7 +3950,7 @@ nstat_generic_provider_reporting_allowed( return true; case NSTAT_PROVIDER_UDP_SUBFLOW: - if ((*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, NULL, NULL, &udp_desc)) { + if ((*gshad->gshad_getvals_fn)(gshad->gshad_provider_context, NULL, NULL, NULL, &udp_desc)) { if (check_reporting_for_user(filter, procdetails->pdet_pid, (pid_t)udp_desc.epid, &procdetails->pdet_uuid, &udp_desc.euuid)) { return true; @@ -3619,7 +4021,7 @@ nstat_generic_provider_add_watcher( } } nstat_retain_gshad(gshad); - result = nstat_client_source_add(0, client, provider, gshad, NSTAT_LOCK_HELD); + result = nstat_client_source_add(0, client, provider, gshad, &gshad->gshad_locus); if (result != 0) { NSTAT_LOG_ERROR("nstat_client_source_add returned %d for " "provider type: %d", result, provider_id); @@ -3657,6 +4059,7 @@ nstat_init_userland_conn_provider(void) nstat_userland_conn_provider.nstat_lookup = nstat_generic_provider_lookup; nstat_userland_conn_provider.nstat_gone = nstat_generic_provider_gone; nstat_userland_conn_provider.nstat_counts = nstat_generic_provider_counts; + nstat_userland_conn_provider.nstat_details = nstat_generic_provider_details; nstat_userland_conn_provider.nstat_release = nstat_generic_provider_release; nstat_userland_conn_provider.nstat_watcher_add = nstat_generic_provider_add_watcher; nstat_userland_conn_provider.nstat_watcher_remove = nstat_userland_conn_remove_watcher; @@ -3676,6 +4079,7 @@ nstat_init_udp_subflow_provider(void) nstat_udp_subflow_provider.nstat_lookup = nstat_generic_provider_lookup; nstat_udp_subflow_provider.nstat_gone = nstat_generic_provider_gone; nstat_udp_subflow_provider.nstat_counts = nstat_generic_provider_counts; + nstat_udp_subflow_provider.nstat_details = nstat_generic_provider_details; nstat_udp_subflow_provider.nstat_release = nstat_generic_provider_release; nstat_udp_subflow_provider.nstat_watcher_add = nstat_generic_provider_add_watcher; nstat_udp_subflow_provider.nstat_watcher_remove = nstat_udp_subflow_remove_watcher; @@ -3707,6 +4111,7 @@ nstat_provider_stats_open(nstat_provider_context ctx, return NULL; } + TAILQ_INIT(&gshad->gshad_locus.ntl_src_queue); gshad->gshad_getvals_fn = req_fn; gshad->gshad_getextensions_fn = req_extensions_fn; gshad->gshad_provider_context = ctx; @@ -3740,7 +4145,7 @@ nstat_provider_stats_open(nstat_provider_context ctx, } // this client is watching, so link to it. 
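As with the kernel-socket and userland TCP/UDP/QUIC providers earlier in the patch, the generic providers now register an nstat_details callback beside nstat_counts, and the per-cookie request function gains a matching struct nstat_detailed_counts * slot. The fragment below sketches only the assumed vtable addition; the other nstat_provider members are listed from the init functions in this patch and the real declaration may differ.

    /* Assumed addition to the provider vtable (sketch, not the real struct). */
    struct nstat_provider_sketch {
        errno_t (*nstat_counts)(nstat_provider_cookie_t cookie,
            struct nstat_counts *out_counts, int *out_gone);
        errno_t (*nstat_details)(nstat_provider_cookie_t cookie,        /* new */
            struct nstat_detailed_counts *out_details, int *out_gone);
        /* ... nstat_lookup, nstat_gone, nstat_release, nstat_copy_descriptor,
         *     nstat_watcher_add, nstat_watcher_remove, ... as before ... */
    };

    /* Wired up per provider, e.g.: */
    nstat_udp_subflow_provider.nstat_details = nstat_generic_provider_details;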
nstat_retain_gshad(gshad); - int result = nstat_client_source_add(0, client, provider, gshad, NSTAT_LOCK_HELD); + int result = nstat_client_source_add(0, client, provider, gshad, &gshad->gshad_locus); if (result != 0) { // There should be some kind of statistics for failures like this. // The kernel ntstat component should keep some @@ -3761,6 +4166,7 @@ nstat_provider_stats_close(nstat_context nstat_ctx) { tailq_head_nstat_src dead_list; nstat_src *src; + nstat_src *tmpsrc; struct nstat_generic_shadow *gshad = (struct nstat_generic_shadow *)nstat_ctx; if (gshad == NULL) { @@ -3778,46 +4184,20 @@ nstat_provider_stats_close(nstat_context nstat_ctx) NSTAT_LOCK_EXCLUSIVE(); + TAILQ_FOREACH_SAFE(src, &gshad->gshad_locus.ntl_src_queue, nts_locus_link, tmpsrc) { + assert(gshad == (struct nstat_generic_shadow *)src->nts_cookie); + nstat_client *client = src->nts_client; + nstat_client_send_goodbye(client, src); + nstat_src_remove_linkages(client, src); + TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); + } TAILQ_REMOVE(&nstat_gshad_head, gshad, gshad_link); - int32_t num_srcs = gshad->gshad_refcnt - 1; - if ((nstat_generic_provider_watchers[gshad->gshad_provider] != 0) && (num_srcs > 0)) { - nstat_client *client; - errno_t result; - - for (client = nstat_clients; client; client = client->ntc_next) { - // Only scan further if this client is watching - if ((client->ntc_watching & (1 << gshad->gshad_provider)) != 0) { - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) - { - if ((gshad == (struct nstat_generic_shadow *)src->nts_cookie) && - (gshad->gshad_provider == src->nts_provider->nstat_provider_id)) { - break; - } - } - if (src) { - result = nstat_client_send_goodbye(client, src); - // There is currently no recovery possible from failure to send, - // so no need to check the return code. 
- // rdar://28312774 (Scalability and resilience issues in ntstat.c) - - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); - TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); - --num_srcs; - } - - // Performance optimization, don't scan full lists if no chance of presence - if (num_srcs == 0) { - break; - } - } - } - } NSTAT_UNLOCK_EXCLUSIVE(); while ((src = TAILQ_FIRST(&dead_list))) { TAILQ_REMOVE(&dead_list, src, nts_client_link); - nstat_client_cleanup_source(NULL, src, TRUE); + nstat_client_cleanup_source(NULL, src); } nstat_release_gshad(gshad); } @@ -3842,33 +4222,22 @@ nstat_provider_stats_event(__unused nstat_context nstat_ctx, __unused uint64_t e NSTAT_LOCK_EXCLUSIVE(); - if (nstat_generic_provider_watchers[gshad->gshad_provider] != 0) { - nstat_client *client; - errno_t result; + TAILQ_FOREACH(src, &gshad->gshad_locus.ntl_src_queue, nts_locus_link) { + assert(gshad == (struct nstat_generic_shadow *)src->nts_cookie); + nstat_client *client = src->nts_client; nstat_provider_id_t provider_id = gshad->gshad_provider; - for (client = nstat_clients; client; client = client->ntc_next) { - // Only scan further if this client is watching and has interest in the event - // or the client has requested "boring" unchanged status to be ignored - if (((client->ntc_watching & (1 << provider_id)) != 0) && - (((client->ntc_provider_filters[provider_id].npf_events & event) != 0) || - ((client->ntc_provider_filters[provider_id].npf_flags & NSTAT_FILTER_SUPPRESS_BORING_FLAGS) != 0))) { - TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) - { - if (gshad == (struct nstat_generic_shadow *)src->nts_cookie) { - break; - } - } - - if (src) { - src->nts_reported = false; - if ((client->ntc_provider_filters[provider_id].npf_events & event) != 0) { - result = nstat_client_send_event(client, src, event); - // There is currently no recovery possible from failure to send, - // so no need to check the return code. - // rdar://28312774 (Scalability and resilience issues in ntstat.c) - } - } + // Check if this client is watching and has interest in the event + // or the client has requested "boring" unchanged status to be ignored + if (((client->ntc_watching & (1 << provider_id)) != 0) && + (((client->ntc_provider_filters[provider_id].npf_events & event) != 0) || + ((client->ntc_provider_filters[provider_id].npf_flags & NSTAT_FILTER_SUPPRESS_BORING_FLAGS) != 0))) { + src->nts_reported = false; + if ((client->ntc_provider_filters[provider_id].npf_events & event) != 0) { + nstat_client_send_event(client, src, event); + // There is currently no recovery possible from failure to send, + // so no need to check the return code. 
+ // rdar://28312774 (Scalability and resilience issues in ntstat.c) } } } @@ -3920,7 +4289,7 @@ nstat_ifnet_lookup( ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { continue; } ifnet_lock_exclusive(ifp); @@ -4022,8 +4391,7 @@ nstat_ifnet_counts( static void nstat_ifnet_release( - nstat_provider_cookie_t cookie, - __unused int locked) + nstat_provider_cookie_t cookie) { struct nstat_ifnet_cookie *ifcookie; struct ifnet *ifp; @@ -4056,7 +4424,7 @@ nstat_ifnet_release( */ ifcookie = (struct nstat_ifnet_cookie *)cookie; ifp = ifcookie->ifp; - if (ifnet_is_attached(ifp, 1)) { + if (ifnet_get_ioref(ifp)) { ifnet_lock_exclusive(ifp); if (minthreshold == UINT64_MAX) { ifp->if_data_threshold = 0; @@ -4371,7 +4739,7 @@ nstat_ifnet_report_lim_stats(void) ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (!IF_FULLY_ATTACHED(ifp)) { + if (!ifnet_is_fully_attached(ifp)) { continue; } @@ -4537,16 +4905,32 @@ nstat_set_keyval_string(nstat_sysinfo_keyval *kv, int key, u_int8_t *__counted_b bcopy(buf, kv->u.nstat_sysinfo_string, kv->nstat_sysinfo_valsize); } +struct ntstat_sysinfo_keyval_iter { + uint32_t i, nkeymax; + nstat_sysinfo_keyval *__counted_by(nkeymax) kv; +}; + +static inline nstat_sysinfo_keyval * +ntstat_sysinfo_next(struct ntstat_sysinfo_keyval_iter *iter) +{ + size_t index = iter->i; + if (index < iter->nkeymax) { + iter->i++; + return &iter->kv[index]; + } + return NULL; +} + static void nstat_sysinfo_send_data_internal( nstat_client *client, nstat_sysinfo_data *data) { nstat_msg_sysinfo_counts *syscnt = NULL; - size_t allocsize = 0, countsize = 0, nkeyvals = 0, finalsize = 0; - nstat_sysinfo_keyval *kv; + size_t allocsize = 0, countsize = 0, finalsize = 0; + uint32_t nkeyvals = 0; + struct ntstat_sysinfo_keyval_iter iter = {}; errno_t result = 0; - size_t i = 0; allocsize = offsetof(nstat_msg_sysinfo_counts, counts); countsize = offsetof(nstat_sysinfo_counts, nstat_sysinfo_keyvals); @@ -4575,409 +4959,411 @@ nstat_sysinfo_send_data_internal( return; } - kv = nstat_sysinfo_get_keyvals(syscnt); + iter.i = 0; + iter.kv = nstat_sysinfo_get_keyvals(syscnt); + iter.nkeymax = nkeyvals; switch (data->flags) { case NSTAT_SYSINFO_TCP_STATS: { - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_KEY_IPV4_AVGRTT, data->u.tcp_stats.ipv4_avgrtt); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_KEY_IPV6_AVGRTT, data->u.tcp_stats.ipv6_avgrtt); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_KEY_SEND_PLR, data->u.tcp_stats.send_plr); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_KEY_RECV_PLR, data->u.tcp_stats.recv_plr); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_KEY_SEND_TLRTO, data->u.tcp_stats.send_tlrto_rate); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_KEY_SEND_REORDERRATE, data->u.tcp_stats.send_reorder_rate); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_CONNECTION_ATTEMPTS, data->u.tcp_stats.connection_attempts); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_CONNECTION_ACCEPTS, data->u.tcp_stats.connection_accepts); - 
nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_CLIENT_ENABLED, data->u.tcp_stats.ecn_client_enabled); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_SERVER_ENABLED, data->u.tcp_stats.ecn_server_enabled); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_CLIENT_SETUP, data->u.tcp_stats.ecn_client_setup); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_SERVER_SETUP, data->u.tcp_stats.ecn_server_setup); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_CLIENT_SUCCESS, data->u.tcp_stats.ecn_client_success); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_SERVER_SUCCESS, data->u.tcp_stats.ecn_server_success); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_NOT_SUPPORTED, data->u.tcp_stats.ecn_not_supported); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_LOST_SYN, data->u.tcp_stats.ecn_lost_syn); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_LOST_SYNACK, data->u.tcp_stats.ecn_lost_synack); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_RECV_CE, data->u.tcp_stats.ecn_recv_ce); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_RECV_ECE, data->u.tcp_stats.ecn_recv_ece); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_SENT_ECE, data->u.tcp_stats.ecn_sent_ece); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_CONN_RECV_CE, data->u.tcp_stats.ecn_conn_recv_ce); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_CONN_RECV_ECE, data->u.tcp_stats.ecn_conn_recv_ece); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_CONN_PLNOCE, data->u.tcp_stats.ecn_conn_plnoce); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_CONN_PL_CE, data->u.tcp_stats.ecn_conn_pl_ce); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_CONN_NOPL_CE, data->u.tcp_stats.ecn_conn_nopl_ce); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_FALLBACK_SYNLOSS, data->u.tcp_stats.ecn_fallback_synloss); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_FALLBACK_REORDER, data->u.tcp_stats.ecn_fallback_reorder); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_ECN_FALLBACK_CE, data->u.tcp_stats.ecn_fallback_ce); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_SYN_DATA_RCV, data->u.tcp_stats.tfo_syn_data_rcv); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_COOKIE_REQ_RCV, data->u.tcp_stats.tfo_cookie_req_rcv); - nstat_set_keyval_scalar(&kv[i++], + 
nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_COOKIE_SENT, data->u.tcp_stats.tfo_cookie_sent); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_COOKIE_INVALID, data->u.tcp_stats.tfo_cookie_invalid); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_COOKIE_REQ, data->u.tcp_stats.tfo_cookie_req); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_COOKIE_RCV, data->u.tcp_stats.tfo_cookie_rcv); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_SYN_DATA_SENT, data->u.tcp_stats.tfo_syn_data_sent); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_SYN_DATA_ACKED, data->u.tcp_stats.tfo_syn_data_acked); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_SYN_LOSS, data->u.tcp_stats.tfo_syn_loss); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_BLACKHOLE, data->u.tcp_stats.tfo_blackhole); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_COOKIE_WRONG, data->u.tcp_stats.tfo_cookie_wrong); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_NO_COOKIE_RCV, data->u.tcp_stats.tfo_no_cookie_rcv); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_HEURISTICS_DISABLE, data->u.tcp_stats.tfo_heuristics_disable); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_TFO_SEND_BLACKHOLE, data->u.tcp_stats.tfo_sndblackhole); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_HANDOVER_ATTEMPT, data->u.tcp_stats.mptcp_handover_attempt); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_INTERACTIVE_ATTEMPT, data->u.tcp_stats.mptcp_interactive_attempt); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_AGGREGATE_ATTEMPT, data->u.tcp_stats.mptcp_aggregate_attempt); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_FP_HANDOVER_ATTEMPT, data->u.tcp_stats.mptcp_fp_handover_attempt); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_FP_INTERACTIVE_ATTEMPT, data->u.tcp_stats.mptcp_fp_interactive_attempt); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_FP_AGGREGATE_ATTEMPT, data->u.tcp_stats.mptcp_fp_aggregate_attempt); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_HEURISTIC_FALLBACK, data->u.tcp_stats.mptcp_heuristic_fallback); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_FP_HEURISTIC_FALLBACK, data->u.tcp_stats.mptcp_fp_heuristic_fallback); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_HANDOVER_SUCCESS_WIFI, data->u.tcp_stats.mptcp_handover_success_wifi); - nstat_set_keyval_scalar(&kv[i++], + 
nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_HANDOVER_SUCCESS_CELL, data->u.tcp_stats.mptcp_handover_success_cell); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_INTERACTIVE_SUCCESS, data->u.tcp_stats.mptcp_interactive_success); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_AGGREGATE_SUCCESS, data->u.tcp_stats.mptcp_aggregate_success); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_FP_HANDOVER_SUCCESS_WIFI, data->u.tcp_stats.mptcp_fp_handover_success_wifi); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_FP_HANDOVER_SUCCESS_CELL, data->u.tcp_stats.mptcp_fp_handover_success_cell); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_FP_INTERACTIVE_SUCCESS, data->u.tcp_stats.mptcp_fp_interactive_success); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_FP_AGGREGATE_SUCCESS, data->u.tcp_stats.mptcp_fp_aggregate_success); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_HANDOVER_CELL_FROM_WIFI, data->u.tcp_stats.mptcp_handover_cell_from_wifi); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_HANDOVER_WIFI_FROM_CELL, data->u.tcp_stats.mptcp_handover_wifi_from_cell); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_INTERACTIVE_CELL_FROM_WIFI, data->u.tcp_stats.mptcp_interactive_cell_from_wifi); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_HANDOVER_CELL_BYTES, data->u.tcp_stats.mptcp_handover_cell_bytes); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_INTERACTIVE_CELL_BYTES, data->u.tcp_stats.mptcp_interactive_cell_bytes); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_AGGREGATE_CELL_BYTES, data->u.tcp_stats.mptcp_aggregate_cell_bytes); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_HANDOVER_ALL_BYTES, data->u.tcp_stats.mptcp_handover_all_bytes); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_INTERACTIVE_ALL_BYTES, data->u.tcp_stats.mptcp_interactive_all_bytes); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_AGGREGATE_ALL_BYTES, data->u.tcp_stats.mptcp_aggregate_all_bytes); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_BACK_TO_WIFI, data->u.tcp_stats.mptcp_back_to_wifi); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_WIFI_PROXY, data->u.tcp_stats.mptcp_wifi_proxy); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_CELL_PROXY, data->u.tcp_stats.mptcp_cell_proxy); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_MPTCP_TRIGGERED_CELL, 
data->u.tcp_stats.mptcp_triggered_cell); - VERIFY(i == nkeyvals); + VERIFY(iter.i == nkeyvals); break; } case NSTAT_SYSINFO_LIM_STATS: { - nstat_set_keyval_string(&kv[i++], + nstat_set_keyval_string(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_SIGNATURE, data->u.lim_stats.ifnet_signature, min(data->u.lim_stats.ifnet_siglen, NSTAT_SYSINFO_KEYVAL_STRING_MAXSIZE)); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_DL_MAX_BANDWIDTH, data->u.lim_stats.lim_stat.lim_dl_max_bandwidth); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_UL_MAX_BANDWIDTH, data->u.lim_stats.lim_stat.lim_ul_max_bandwidth); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_PACKET_LOSS_PERCENT, data->u.lim_stats.lim_stat.lim_packet_loss_percent); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_PACKET_OOO_PERCENT, data->u.lim_stats.lim_stat.lim_packet_ooo_percent); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_RTT_VARIANCE, data->u.lim_stats.lim_stat.lim_rtt_variance); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_RTT_MIN, data->u.lim_stats.lim_stat.lim_rtt_min); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_RTT_AVG, data->u.lim_stats.lim_stat.lim_rtt_average); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_CONN_TIMEOUT_PERCENT, data->u.lim_stats.lim_stat.lim_conn_timeout_percent); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_DL_DETECTED, data->u.lim_stats.lim_stat.lim_dl_detected); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_UL_DETECTED, data->u.lim_stats.lim_stat.lim_ul_detected); - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_LIM_IFNET_TYPE, data->u.lim_stats.ifnet_type); break; } case NSTAT_SYSINFO_NET_API_STATS: { - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_IF_FLTR_ATTACH, data->u.net_api_stats.net_api_stats.nas_iflt_attach_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_IF_FLTR_ATTACH_OS, data->u.net_api_stats.net_api_stats.nas_iflt_attach_os_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_IP_FLTR_ADD, data->u.net_api_stats.net_api_stats.nas_ipf_add_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_IP_FLTR_ADD_OS, data->u.net_api_stats.net_api_stats.nas_ipf_add_os_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_FLTR_ATTACH, data->u.net_api_stats.net_api_stats.nas_sfltr_register_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_FLTR_ATTACH_OS, 
data->u.net_api_stats.net_api_stats.nas_sfltr_register_os_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_ALLOC_TOTAL, data->u.net_api_stats.net_api_stats.nas_socket_alloc_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_ALLOC_KERNEL, data->u.net_api_stats.net_api_stats.nas_socket_in_kernel_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_ALLOC_KERNEL_OS, data->u.net_api_stats.net_api_stats.nas_socket_in_kernel_os_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_NECP_CLIENTUUID, data->u.net_api_stats.net_api_stats.nas_socket_necp_clientuuid_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_LOCAL, data->u.net_api_stats.net_api_stats.nas_socket_domain_local_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_ROUTE, data->u.net_api_stats.net_api_stats.nas_socket_domain_route_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_INET, data->u.net_api_stats.net_api_stats.nas_socket_domain_inet_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_INET6, data->u.net_api_stats.net_api_stats.nas_socket_domain_inet6_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_SYSTEM, data->u.net_api_stats.net_api_stats.nas_socket_domain_system_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_MULTIPATH, data->u.net_api_stats.net_api_stats.nas_socket_domain_multipath_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_KEY, data->u.net_api_stats.net_api_stats.nas_socket_domain_key_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_NDRV, data->u.net_api_stats.net_api_stats.nas_socket_domain_ndrv_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_DOMAIN_OTHER, data->u.net_api_stats.net_api_stats.nas_socket_domain_other_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET_STREAM, data->u.net_api_stats.net_api_stats.nas_socket_inet_stream_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET_DGRAM, data->u.net_api_stats.net_api_stats.nas_socket_inet_dgram_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET_DGRAM_CONNECTED, data->u.net_api_stats.net_api_stats.nas_socket_inet_dgram_connected); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET_DGRAM_DNS, data->u.net_api_stats.net_api_stats.nas_socket_inet_dgram_dns); - 
nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET_DGRAM_NO_DATA, data->u.net_api_stats.net_api_stats.nas_socket_inet_dgram_no_data); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET6_STREAM, data->u.net_api_stats.net_api_stats.nas_socket_inet6_stream_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET6_DGRAM, data->u.net_api_stats.net_api_stats.nas_socket_inet6_dgram_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET6_DGRAM_CONNECTED, data->u.net_api_stats.net_api_stats.nas_socket_inet6_dgram_connected); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET6_DGRAM_DNS, data->u.net_api_stats.net_api_stats.nas_socket_inet6_dgram_dns); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET6_DGRAM_NO_DATA, data->u.net_api_stats.net_api_stats.nas_socket_inet6_dgram_no_data); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET_MCAST_JOIN, data->u.net_api_stats.net_api_stats.nas_socket_mcast_join_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_SOCK_INET_MCAST_JOIN_OS, data->u.net_api_stats.net_api_stats.nas_socket_mcast_join_os_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_NEXUS_FLOW_INET_STREAM, data->u.net_api_stats.net_api_stats.nas_nx_flow_inet_stream_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_NEXUS_FLOW_INET_DATAGRAM, data->u.net_api_stats.net_api_stats.nas_nx_flow_inet_dgram_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_NEXUS_FLOW_INET6_STREAM, data->u.net_api_stats.net_api_stats.nas_nx_flow_inet6_stream_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_NEXUS_FLOW_INET6_DATAGRAM, data->u.net_api_stats.net_api_stats.nas_nx_flow_inet6_dgram_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_IFNET_ALLOC, data->u.net_api_stats.net_api_stats.nas_ifnet_alloc_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_IFNET_ALLOC_OS, data->u.net_api_stats.net_api_stats.nas_ifnet_alloc_os_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_PF_ADDRULE, data->u.net_api_stats.net_api_stats.nas_pf_addrule_total); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_PF_ADDRULE_OS, data->u.net_api_stats.net_api_stats.nas_pf_addrule_os); - nstat_set_keyval_u64_scalar(&kv[i++], + nstat_set_keyval_u64_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_VMNET_START, data->u.net_api_stats.net_api_stats.nas_vmnet_total); #if SKYWALK - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), 
NSTAT_SYSINFO_API_IF_NETAGENT_ENABLED, if_is_fsw_transport_netagent_enabled()); #endif /* SKYWALK */ - nstat_set_keyval_scalar(&kv[i++], + nstat_set_keyval_scalar(ntstat_sysinfo_next(&iter), NSTAT_SYSINFO_API_REPORT_INTERVAL, data->u.net_api_stats.report_interval); @@ -4985,10 +5371,9 @@ nstat_sysinfo_send_data_internal( } } if (syscnt != NULL) { - VERIFY(i > 0 && i <= nkeyvals); + VERIFY(iter.i > 0); countsize = offsetof(nstat_sysinfo_counts, - nstat_sysinfo_keyvals) + - sizeof(nstat_sysinfo_keyval) * i; + nstat_sysinfo_keyvals[iter.i]); finalsize += countsize; syscnt->hdr.type = NSTAT_MSG_TYPE_SYSINFO_COUNTS; assert(finalsize <= MAX_NSTAT_MSG_HDR_LENGTH); @@ -5143,7 +5528,7 @@ nstat_net_api_report_stats(void) */ memcpy(&net_api_stats_before, &net_api_stats, sizeof(struct net_api_stats)); - _CASSERT(sizeof(net_api_stats_before) == sizeof(net_api_stats)); + static_assert(sizeof(net_api_stats_before) == sizeof(net_api_stats)); } @@ -5187,18 +5572,27 @@ nstat_client_send_event( u_int64_t event) { errno_t result = ENOTSUP; - int failed = 0; if (nstat_client_reporting_allowed(client, src, 0)) { - if ((client->ntc_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) { - result = nstat_client_send_update(client, src, 0, event, 0, NULL); + if ((client->ntc_flags & NSTAT_FLAG_SUPPORTS_DETAILS) != 0) { + result = nstat_client_send_details(client, src, 0, event, 0, NULL); if (result != 0) { - failed = 1; + if (nstat_debug != 0) { + NSTAT_LOG_ERROR("nstat_client_send_event() %d", result); + } + } + } else if ((client->ntc_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) { + result = nstat_client_send_update(client, src, 0, event, 0, NULL); + if (result == 0) { + NSTAT_NOTE_SRC(nstat_send_event, client, src); + } else { + NSTAT_NOTE_SRC(nstat_send_event_fail, client, src); if (nstat_debug != 0) { NSTAT_LOG_ERROR("nstat_client_send_event() %d", result); } } } else { + NSTAT_NOTE_SRC(nstat_send_event_notsup, client, src); if (nstat_debug != 0) { NSTAT_LOG_ERROR("nstat_client_send_event() used when updates not supported"); } @@ -5218,45 +5612,77 @@ nstat_client_send_goodbye( if (nstat_client_reporting_allowed(client, src, (src->nts_reported)? 
NSTAT_FILTER_SUPPRESS_BORING_CLOSE: 0)) { hdr_flags = 0; - if ((client->ntc_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) { + if ((client->ntc_flags & NSTAT_FLAG_SUPPORTS_DETAILS) != 0) { + result = nstat_client_send_details(client, src, 0, 0, NSTAT_MSG_HDR_FLAG_CLOSING, NULL); + if (result != 0) { + NSTAT_NOTE_SRC(nstat_src_goodbye_failed_details, client, src); + failed = 1; + hdr_flags = NSTAT_MSG_HDR_FLAG_CLOSED_AFTER_DROP; + if (nstat_debug != 0) { + NSTAT_LOG_ERROR("nstat_client_send_details() %d", result); + } + } else { + NSTAT_NOTE_SRC(nstat_src_goodbye_sent_details, client, src); + } + } else if ((client->ntc_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) { result = nstat_client_send_update(client, src, 0, 0, NSTAT_MSG_HDR_FLAG_CLOSING, NULL); if (result != 0) { + NSTAT_NOTE_SRC(nstat_src_goodbye_failed_update, client, src); failed = 1; hdr_flags = NSTAT_MSG_HDR_FLAG_CLOSED_AFTER_DROP; if (nstat_debug != 0) { NSTAT_LOG_ERROR("nstat_client_send_update() %d", result); } + } else { + NSTAT_NOTE_SRC(nstat_src_goodbye_sent_update, client, src); } } else { // send one last counts notification result = nstat_client_send_counts(client, src, 0, NSTAT_MSG_HDR_FLAG_CLOSING, NULL); if (result != 0) { + NSTAT_NOTE_SRC(nstat_src_goodbye_failed_counts, client, src); failed = 1; hdr_flags = NSTAT_MSG_HDR_FLAG_CLOSED_AFTER_DROP; if (nstat_debug != 0) { NSTAT_LOG_ERROR("nstat_client_send_counts() %d", result); } + } else { + NSTAT_NOTE_SRC(nstat_src_goodbye_sent_counts, client, src); } // send a last description result = nstat_client_send_description(client, src, 0, NSTAT_MSG_HDR_FLAG_CLOSING); if (result != 0) { + NSTAT_NOTE_SRC(nstat_src_goodbye_failed_description, client, src); failed = 1; hdr_flags = NSTAT_MSG_HDR_FLAG_CLOSED_AFTER_DROP; if (nstat_debug != 0) { NSTAT_LOG_ERROR("nstat_client_send_description() %d", result); } + } else { + NSTAT_NOTE_SRC(nstat_src_goodbye_sent_description, client, src); } } + } else { + if ((client->ntc_flags & NSTAT_FLAG_SUPPORTS_DETAILS) != 0) { + NSTAT_NOTE_SRC(nstat_src_goodbye_filtered_details, client, src); + } else if ((client->ntc_flags & NSTAT_FLAG_SUPPORTS_UPDATES) != 0) { + NSTAT_NOTE_SRC(nstat_src_goodbye_filtered_update, client, src); + } else { + NSTAT_NOTE_SRC(nstat_src_goodbye_filtered_counts, client, src); + } } // send the source removed notification result = nstat_client_send_removed(client, src, hdr_flags); if (result != 0 && nstat_debug) { + NSTAT_NOTE_SRC(nstat_src_goodbye_failed_removed, client, src); failed = 1; if (nstat_debug != 0) { NSTAT_LOG_ERROR("nstat_client_send_removed() %d", result); } + } else { + NSTAT_NOTE_SRC(nstat_src_goodbye_sent_removed, client, src); } if (failed != 0) { @@ -5341,28 +5767,93 @@ nstat_idle_check( { nstat_client *client; nstat_src *src, *tmpsrc; + struct nstat_sock_locus *sol, *tmpsol; tailq_head_nstat_src dead_list; TAILQ_INIT(&dead_list); NSTAT_LOCK_EXCLUSIVE(); - nstat_idle_time = 0; - for (client = nstat_clients; client; client = client->ntc_next) { - TAILQ_FOREACH_SAFE(src, &client->ntc_src_queue, nts_client_link, tmpsrc) - { - if (src->nts_provider->nstat_gone(src->nts_cookie)) { - errno_t result; - // Pull it off the list - NSTAT_NOTE_SRC(nstat_src_gone_idlecheck, client, src); - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); + TAILQ_FOREACH_SAFE(sol, &nstat_tcp_sock_locus_head, nsl_link, tmpsol) { + assert(sol->nsl_magic == NSTAT_SOCK_LOCUS_MAGIC); + assert(sol->nsl_inp != NULL); + assert(sol->nsl_is_tcp != 0); + struct inpcb *inp = sol->nsl_inp; + assert(inp->inp_nstat_locus == sol); - result 
= nstat_client_send_goodbye(client, src); - - // Put this on the list to release later + // Ideally all dead inpcbs should have been removed from the list, but be paranoid + if (inp->inp_state == INPCB_STATE_DEAD) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_idlecheck_tcp_gone); + TAILQ_FOREACH_SAFE(src, &inp->inp_nstat_locus->nsl_locus.ntl_src_queue, nts_locus_link, tmpsrc) { + assert(sol == (struct nstat_sock_locus *)src->nts_cookie); + client = src->nts_client; + NSTAT_NOTE_QUAL(nstat_add_all_tcp_skip_dead, client, 0); + nstat_client_send_goodbye(client, src); + nstat_src_remove_linkages(client, src); TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); } + TAILQ_REMOVE(&nstat_tcp_sock_locus_head, sol, nsl_link); + inp->inp_nstat_locus = NULL; + nstat_sock_locus_release(sol, FALSE); + } + } + // Give others a chance to run + NSTAT_LOCK_YIELD_EXCLUSIVE(); + + TAILQ_FOREACH_SAFE(sol, &nstat_udp_sock_locus_head, nsl_link, tmpsol) { + assert(sol->nsl_magic == NSTAT_SOCK_LOCUS_MAGIC); + assert(sol->nsl_inp != NULL); + assert(sol->nsl_is_tcp == 0); + struct inpcb *inp = sol->nsl_inp; + + // Ideally all dead inpcbs should have been removed from the list, but be paranoid + if (inp->inp_state == INPCB_STATE_DEAD) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_idlecheck_udp_gone); + TAILQ_FOREACH_SAFE(src, &inp->inp_nstat_locus->nsl_locus.ntl_src_queue, nts_locus_link, tmpsrc) { + assert(sol == (struct nstat_sock_locus *)src->nts_cookie); + client = src->nts_client; + NSTAT_NOTE_QUAL(nstat_add_all_tcp_skip_dead, client, 0); + nstat_client_send_goodbye(client, src); + nstat_src_remove_linkages(client, src); + TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); + } + TAILQ_REMOVE(&nstat_tcp_sock_locus_head, sol, nsl_link); + inp->inp_nstat_locus = NULL; + nstat_sock_locus_release(sol, FALSE); + } + } + // Give others a chance to run + NSTAT_LOCK_YIELD_EXCLUSIVE(); + + // Only routes and interfaces are left for the "gone" check. + // Routes should at some point be moved to the locus mechanism + // and interface functionality provided by alternative means to the + // NSTAT_PROVIDER_IFNET, which will remove the need to scan all possible sources. 
+ // In the meantime, they should be a minority interest and + // it is expected that most clients will be skipped in the sequence below + for (client = nstat_clients; client; client = client->ntc_next) { + if (((client->ntc_watching & (1 << NSTAT_PROVIDER_ROUTE)) != 0) || + ((client->ntc_added_src & (1 << NSTAT_PROVIDER_ROUTE)) != 0) || + ((client->ntc_added_src & (1 << NSTAT_PROVIDER_IFNET)) != 0)) { + // this client is watching routes + TAILQ_FOREACH_SAFE(src, &client->ntc_src_queue, nts_client_link, tmpsrc) { + if (((src->nts_provider->nstat_provider_id == NSTAT_PROVIDER_ROUTE) || + (src->nts_provider->nstat_provider_id == NSTAT_PROVIDER_IFNET)) && + (src->nts_provider->nstat_gone(src->nts_cookie))) { + errno_t result; + + // Pull it off the list + NSTAT_NOTE_SRC(nstat_route_src_gone_idlecheck, client, src); + nstat_src_remove_linkages(client, src); + + result = nstat_client_send_goodbye(client, src); + + // Put this on the list to release later + TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_idlecheck_route_src_gone); + } + } } } @@ -5376,11 +5867,11 @@ nstat_idle_check( /* Generate any system level reports, if needed */ nstat_sysinfo_generate_report(); - // Release the sources now that we aren't holding lots of locks + // Release the sources now that we aren't holding locks while ((src = TAILQ_FIRST(&dead_list))) { - TAILQ_REMOVE(&dead_list, src, nts_client_link); NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_src_idlecheck_gone); - nstat_client_cleanup_source(NULL, src, FALSE); + TAILQ_REMOVE(&dead_list, src, nts_client_link); + nstat_client_cleanup_source(NULL, src); } nstat_prune_procdetails(); @@ -5406,8 +5897,7 @@ nstat_client_register(void) static void nstat_client_cleanup_source( nstat_client *client, - struct nstat_src *src, - boolean_t locked) + struct nstat_src *src) { errno_t result; @@ -5421,7 +5911,7 @@ nstat_client_cleanup_source( } } // Cleanup the source if we found it. 
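The `locked` flag disappears from the provider release below because source cleanup now always runs after the nstat lock has been dropped: the idle check and the disconnect path unlink sources and queue them on a local dead list while the exclusive lock is held, then release them once the lock is gone. A minimal sketch of that collect-then-release pattern, reusing only routines visible in this file (the wrapper name reap_gone_sources_sketch is hypothetical):

/*
 * Hedged sketch only: illustrates why nstat_client_cleanup_source() and
 * nstat_release() no longer need a "locked" argument.  Sources are unlinked
 * under the exclusive lock and only released once it has been dropped.
 */
static void
reap_gone_sources_sketch(nstat_client *client)
{
	tailq_head_nstat_src dead_list;
	nstat_src *src, *tmpsrc;

	TAILQ_INIT(&dead_list);

	NSTAT_LOCK_EXCLUSIVE();
	TAILQ_FOREACH_SAFE(src, &client->ntc_src_queue, nts_client_link, tmpsrc) {
		if (src->nts_provider->nstat_gone(src->nts_cookie)) {
			nstat_client_send_goodbye(client, src);
			nstat_src_remove_linkages(client, src);
			TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link);
		}
	}
	NSTAT_UNLOCK_EXCLUSIVE();

	/* lock no longer held, so the provider release cannot deadlock */
	while ((src = TAILQ_FIRST(&dead_list)) != NULL) {
		TAILQ_REMOVE(&dead_list, src, nts_client_link);
		nstat_client_cleanup_source(client, src);
	}
}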
- src->nts_provider->nstat_release(src->nts_cookie, locked); + src->nts_provider->nstat_release(src->nts_cookie); NSTAT_GLOBAL_COUNT_DECREMENT(nstat_global_src_current); kfree_type(struct nstat_src, src); } @@ -5450,6 +5940,7 @@ nstat_client_connect( nstat_client *client = kalloc_type(nstat_client, Z_WAITOK | Z_ZERO); if (client == NULL) { + NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_client_alloc_fails); return ENOMEM; } @@ -5466,6 +5957,7 @@ nstat_client_connect( client->ntc_trace = (nstat_cyclic_trace *)kalloc_data(sizeof(nstat_cyclic_trace), Z_WAITOK | Z_ZERO); #endif NSTAT_LOCK_EXCLUSIVE(); + client->ntc_client_id = nstat_next_client_id++; client->ntc_next = nstat_clients; nstat_clients = client; @@ -5523,6 +6015,14 @@ nstat_client_disconnect( client->ntc_accumulated = NULL; } + // Remove any source links to associated locus + TAILQ_FOREACH(src, &client->ntc_src_queue, nts_client_link) { + if (src->nts_locus != NULL) { + TAILQ_REMOVE(&src->nts_locus->ntl_src_queue, src, nts_locus_link); + src->nts_locus = NULL; + } + } + // Copy out the list of sources TAILQ_CONCAT(&cleanup_list, &client->ntc_src_queue, nts_client_link); @@ -5530,12 +6030,12 @@ nstat_client_disconnect( while ((src = TAILQ_FIRST(&cleanup_list))) { TAILQ_REMOVE(&cleanup_list, src, nts_client_link); - nstat_client_cleanup_source(NULL, src, FALSE); + nstat_client_cleanup_source(NULL, src); } lck_mtx_destroy(&client->ntc_user_mtx, &nstat_lck_grp); nstat_release_procdetails(client->ntc_procdetails); - nstat_accumulate_client_metrics(client); + nstat_accumulate_client_metrics(&nstat_metrics, client); #if NSTAT_TRACE_ENABLED if (client->ntc_trace != NULL) { kfree_data(client->ntc_trace, sizeof(nstat_cyclic_trace)); @@ -5865,6 +6365,142 @@ nstat_client_send_update( return result; } +static int +nstat_client_send_details( + nstat_client *client, + nstat_src *src, + u_int64_t context, + u_int64_t event, + u_int16_t hdr_flags, + int *gone) +{ + // Provider doesn't support getting the descriptor or counts? Done. + if ((src->nts_provider->nstat_descriptor_length == 0 || + src->nts_provider->nstat_copy_descriptor == NULL) || + src->nts_provider->nstat_details == NULL) { + return EOPNOTSUPP; + } + + // Allocate storage for the descriptor message + mbuf_ref_t msg; + unsigned int one = 1; + size_t size = offsetof(nstat_msg_src_details, data) + + src->nts_provider->nstat_descriptor_length; + size_t total_extension_size = 0; + u_int32_t num_extensions = 0; + u_int64_t extension_mask = nstat_extension_flags_for_source(client, src); + + if ((extension_mask != 0) && (src->nts_provider->nstat_copy_extension != NULL)) { + uint32_t extension_id = 0; + for (extension_id = NSTAT_EXTENDED_UPDATE_TYPE_MIN; extension_id <= NSTAT_EXTENDED_UPDATE_TYPE_MAX; extension_id++) { + if ((extension_mask & (1ull << extension_id)) != 0) { + size_t extension_size = src->nts_provider->nstat_copy_extension(src->nts_cookie, extension_id, NULL, 0); + if (extension_size == 0) { + extension_mask &= ~(1ull << extension_id); + } else { + num_extensions++; + total_extension_size += ROUNDUP64(extension_size); + } + } + } + size += total_extension_size + (sizeof(nstat_msg_src_extended_item_hdr) * num_extensions); + } + assert(size <= MAX_NSTAT_MSG_HDR_LENGTH); + + /* + * XXX Would be interesting to see how extended details affect mbuf + * allocations, given the max segments defined as 1, one may get + * allocations with higher fragmentation. 
+ */ + if (mbuf_allocpacket(MBUF_DONTWAIT, size, &one, &msg) != 0) { + return ENOMEM; + } + + /* zero out for nstat_msg_src_details */ + bzero(m_mtod_current(msg), size); + + nstat_msg_src_details *desc = mtod(msg, nstat_msg_src_details *); + desc->hdr.context = context; + desc->hdr.type = (num_extensions == 0) ? NSTAT_MSG_TYPE_SRC_DETAILS : + NSTAT_MSG_TYPE_SRC_EXTENDED_DETAILS; + desc->hdr.length = (u_int16_t)size; + desc->hdr.flags = hdr_flags; + desc->srcref = src->nts_srcref; + desc->event_flags = event; + desc->provider = src->nts_provider->nstat_provider_id; + + /* + * XXX The following two lines are only valid when max-segments is passed + * as one. + * Other computations with offset also depend on that being true. + * Be aware of that before making any modifications that changes that + * behavior. + */ + mbuf_setlen(msg, size); + mbuf_pkthdr_setlen(msg, mbuf_len(msg)); + + errno_t result = 0; + if (src->nts_provider->nstat_descriptor_length != 0 && src->nts_provider->nstat_copy_descriptor) { + // Query the provider for the provider specific bits + u_int8_t *desc_data_ptr = nstat_get_data(desc); + result = src->nts_provider->nstat_copy_descriptor(src->nts_cookie, desc_data_ptr, + src->nts_provider->nstat_descriptor_length); + if (result != 0) { + mbuf_freem(msg); + return result; + } + } + + if (num_extensions > 0) { + nstat_msg_src_extended_item_hdr *p_extension_hdr = (nstat_msg_src_extended_item_hdr *)mtodo(msg, sizeof(nstat_msg_src_details_hdr) + src->nts_provider->nstat_descriptor_length); + uint32_t extension_id = 0; + + for (extension_id = NSTAT_EXTENDED_UPDATE_TYPE_MIN; extension_id <= NSTAT_EXTENDED_UPDATE_TYPE_MAX; extension_id++) { + if ((extension_mask & (1ull << extension_id)) != 0) { + void *buf = (void *)(p_extension_hdr + 1); + size_t extension_size = src->nts_provider->nstat_copy_extension(src->nts_cookie, extension_id, buf, total_extension_size); + if ((extension_size == 0) || (extension_size > total_extension_size)) { + // Something has gone wrong. 
Instead of attempting to wind back the excess buffer space, mark it as unused + p_extension_hdr->type = NSTAT_EXTENDED_UPDATE_TYPE_UNKNOWN; + p_extension_hdr->length = total_extension_size + (sizeof(nstat_msg_src_extended_item_hdr) * (num_extensions - 1)); + break; + } else { + // The extension may be of any size alignment, reported as such in the extension header, + // but we pad to ensure that whatever comes next is suitably aligned + p_extension_hdr->type = extension_id; + p_extension_hdr->length = extension_size; + extension_size = ROUNDUP64(extension_size); + total_extension_size -= extension_size; + p_extension_hdr = (nstat_msg_src_extended_item_hdr *)(void *)((char *)buf + extension_size); + num_extensions--; + } + } + } + } + + if (src->nts_provider->nstat_details) { + result = src->nts_provider->nstat_details(src->nts_cookie, &desc->detailed_counts, gone); + if (result == 0) { + if ((src->nts_filter & NSTAT_FILTER_NOZEROBYTES) == NSTAT_FILTER_NOZEROBYTES && + desc->detailed_counts.nstat_media_stats.ms_total.ts_rxbytes == 0 && + desc->detailed_counts.nstat_media_stats.ms_total.ts_txbytes == 0) { + result = EAGAIN; + } else { + result = ctl_enqueuembuf(client->ntc_kctl, client->ntc_unit, msg, CTL_DATA_EOR); + } + } + } + + if (result != 0) { + nstat_stats.nstat_srcupatefailures += 1; + mbuf_freem(msg); + } else { + src->nts_reported = true; + } + + return result; +} + static errno_t nstat_client_append_update( nstat_client *client, @@ -5988,6 +6624,130 @@ nstat_client_append_update( return result; } +static errno_t +nstat_client_append_details( + nstat_client *client, + nstat_src *src, + int *gone) +{ + if ((src->nts_provider->nstat_descriptor_length == 0 || + src->nts_provider->nstat_copy_descriptor == NULL) && + src->nts_provider->nstat_details == NULL) { + return EOPNOTSUPP; + } + + size_t size = offsetof(nstat_msg_src_details, data) + src->nts_provider->nstat_descriptor_length; + size_t total_extension_size = 0; + u_int32_t num_extensions = 0; + u_int64_t extension_mask = nstat_extension_flags_for_source(client, src); + + if ((extension_mask != 0) && (src->nts_provider->nstat_copy_extension != NULL)) { + uint32_t extension_id = 0; + for (extension_id = NSTAT_EXTENDED_UPDATE_TYPE_MIN; extension_id <= NSTAT_EXTENDED_UPDATE_TYPE_MAX; extension_id++) { + if ((extension_mask & (1ull << extension_id)) != 0) { + size_t extension_size = src->nts_provider->nstat_copy_extension(src->nts_cookie, extension_id, NULL, 0); + if (extension_size == 0) { + extension_mask &= ~(1ull << extension_id); + } else { + num_extensions++; + total_extension_size += ROUNDUP64(extension_size); + } + } + } + size += total_extension_size + (sizeof(nstat_msg_src_extended_item_hdr) * num_extensions); + } + + /* + * This kind of limits extensions. + * The optimization is around being able to deliver multiple + * numbers of details bundled together. + * Increasing the size runs the risk of too much stack usage. + * One could potentially changed the allocation below to be on heap. + * For now limiting it to half of NSTAT_MAX_MSG_SIZE. + */ + if (size > (NSTAT_MAX_MSG_SIZE >> 1)) { + return EOPNOTSUPP; + } + + // Fill out a buffer on the stack, we will copy to the mbuf later + u_int64_t buffer[size / sizeof(u_int64_t) + 1]; // u_int64_t to ensure alignment + bzero(buffer, size); + + nstat_msg_src_details *desc = (nstat_msg_src_details *)buffer; + desc->hdr.type = (num_extensions == 0) ? 
NSTAT_MSG_TYPE_SRC_DETAILS : + NSTAT_MSG_TYPE_SRC_EXTENDED_DETAILS; + desc->hdr.length = (u_int16_t)size; + desc->srcref = src->nts_srcref; + desc->event_flags = 0; + desc->provider = src->nts_provider->nstat_provider_id; + + errno_t result = 0; + // Fill in the description + if (src->nts_provider->nstat_descriptor_length != 0 && src->nts_provider->nstat_copy_descriptor) { + // Query the provider for the provider specific bits + u_int8_t *desc_data_ptr = nstat_get_data(desc); + result = src->nts_provider->nstat_copy_descriptor(src->nts_cookie, desc_data_ptr, + src->nts_provider->nstat_descriptor_length); + if (result != 0) { + nstat_stats.nstat_copy_descriptor_failures++; + if (nstat_debug != 0) { + NSTAT_LOG_ERROR("src->nts_provider->nstat_copy_descriptor: %d", result); + } + return result; + } + } + + if (num_extensions > 0) { + nstat_msg_src_extended_item_hdr *p_extension_hdr = (nstat_msg_src_extended_item_hdr *)(void *)((char *)buffer + + sizeof(nstat_msg_src_details_hdr) + src->nts_provider->nstat_descriptor_length); + uint32_t extension_id = 0; + bzero(p_extension_hdr, total_extension_size + (sizeof(nstat_msg_src_extended_item_hdr) * num_extensions)); + + for (extension_id = NSTAT_EXTENDED_UPDATE_TYPE_MIN; extension_id <= NSTAT_EXTENDED_UPDATE_TYPE_MAX; extension_id++) { + if ((extension_mask & (1ull << extension_id)) != 0) { + void *buf = (void *)(p_extension_hdr + 1); + size_t extension_size = src->nts_provider->nstat_copy_extension(src->nts_cookie, extension_id, buf, total_extension_size); + if ((extension_size == 0) || (extension_size > total_extension_size)) { + // Something has gone wrong. Instead of attempting to wind back the excess buffer space, mark it as unused + p_extension_hdr->type = NSTAT_EXTENDED_UPDATE_TYPE_UNKNOWN; + p_extension_hdr->length = total_extension_size + (sizeof(nstat_msg_src_extended_item_hdr) * (num_extensions - 1)); + break; + } else { + extension_size = ROUNDUP64(extension_size); + p_extension_hdr->type = extension_id; + p_extension_hdr->length = extension_size; + total_extension_size -= extension_size; + p_extension_hdr = (nstat_msg_src_extended_item_hdr *)(void *)((char *)buf + extension_size); + num_extensions--; + } + } + } + } + + if (src->nts_provider->nstat_details) { + result = src->nts_provider->nstat_details(src->nts_cookie, &desc->detailed_counts, gone); + if (result != 0) { + nstat_stats.nstat_provider_counts_failures++; + if (nstat_debug != 0) { + NSTAT_LOG_ERROR("src->nts_provider->nstat_counts: %d", result); + } + return result; + } + + if ((src->nts_filter & NSTAT_FILTER_NOZEROBYTES) == NSTAT_FILTER_NOZEROBYTES && + desc->detailed_counts.nstat_media_stats.ms_total.ts_rxbytes == 0 && + desc->detailed_counts.nstat_media_stats.ms_total.ts_txbytes == 0) { + return EAGAIN; + } + } + + result = nstat_accumulate_msg(client, (uint8_t *)buffer, size); + if (result == 0) { + src->nts_reported = true; + } + return result; +} + static errno_t nstat_client_send_removed( nstat_client *client, @@ -6057,10 +6817,12 @@ nstat_client_handle_add_request( // sanitize cookie nstat_client_sanitize_cookie(client, provider->nstat_provider_id, cookie); - result = nstat_client_source_add(req->hdr.context, client, provider, cookie, NSTAT_LOCK_NOTHELD); + NSTAT_LOCK_EXCLUSIVE(); + result = nstat_client_source_add(req->hdr.context, client, provider, cookie, NULL); + NSTAT_UNLOCK_EXCLUSIVE(); if (result != 0) { - provider->nstat_release(cookie, 0); + provider->nstat_release(cookie); } // Set the flag if a provider added a single source @@ -6169,13 +6931,10 @@ 
nstat_client_source_add( nstat_client *client, nstat_provider *provider, nstat_provider_cookie_t cookie, - nstat_lock_status lock_status) + nstat_locus *locus) { - if (lock_status == NSTAT_LOCK_NOTHELD) { - NSTAT_LOCK_EXCLUSIVE(); - } else { - NSTAT_ASSERT_LOCKED_EXCLUSIVE(); - } + // It is a condition of this function that the lock be held in exclusive mode + NSTAT_ASSERT_LOCKED_EXCLUSIVE(); // Fill out source added message if appropriate errno_t result = 0; @@ -6241,6 +7000,7 @@ nstat_client_source_add( result = EINVAL; break; } + src->nts_locus = locus; src->nts_provider = provider; src->nts_cookie = cookie; src->nts_filter = src_filter; @@ -6260,7 +7020,9 @@ nstat_client_source_add( // Put the source in the list TAILQ_INSERT_HEAD(&client->ntc_src_queue, src, nts_client_link); src->nts_client = client; - + if (locus != NULL) { + TAILQ_INSERT_HEAD(&locus->ntl_src_queue, src, nts_locus_link); + } NSTAT_GLOBAL_COUNT_INCREMENT(nstat_global_src_allocs); NSTAT_GLOBAL_COUNT_INCREMENT_WITH_MAX(nstat_global_src_current, nstat_global_src_max); @@ -6272,9 +7034,6 @@ nstat_client_source_add( NSTAT_NOTE_SRC(nstat_src_add_success, client, src); } while (0); - if (lock_status == NSTAT_LOCK_NOTHELD) { - NSTAT_UNLOCK_EXCLUSIVE(); - } return result; } @@ -6300,13 +7059,13 @@ nstat_client_handle_remove_request( } } if (src) { - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); + nstat_src_remove_linkages(client, src); } NSTAT_UNLOCK_EXCLUSIVE(); if (src) { - nstat_client_cleanup_source(client, src, FALSE); + nstat_client_cleanup_source(client, src); NSTAT_NOTE_QUAL(nstat_remove_src_found, client, srcref); } else { NSTAT_NOTE_QUAL(nstat_remove_src_missed, client, srcref); @@ -6419,7 +7178,7 @@ nstat_client_handle_query_request( } // pull src out of the list - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); + nstat_src_remove_linkages(client, src); TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); NSTAT_LOCK_EXCLUSIVE_TO_SHARED(); } else { @@ -6470,7 +7229,7 @@ nstat_client_handle_query_request( while ((src = TAILQ_FIRST(&dead_list))) { TAILQ_REMOVE(&dead_list, src, nts_client_link); - nstat_client_cleanup_source(client, src, FALSE); + nstat_client_cleanup_source(client, src); } return result; @@ -6716,7 +7475,7 @@ nstat_client_handle_get_update( if (NSTAT_LOCK_SHARED_TO_EXCLUSIVE()) { // Successfully upgraded the lock, now we can remove the source from the client // pull src out of the list - TAILQ_REMOVE(&client->ntc_src_queue, src, nts_client_link); + nstat_src_remove_linkages(client, src); TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); NSTAT_LOCK_EXCLUSIVE_TO_SHARED(); NSTAT_NOTE_SRC(nstat_query_update_upgrade, client, src); @@ -6768,7 +7527,137 @@ nstat_client_handle_get_update( while ((src = TAILQ_FIRST(&dead_list))) { TAILQ_REMOVE(&dead_list, src, nts_client_link); // release src and send notification - nstat_client_cleanup_source(client, src, FALSE); + nstat_client_cleanup_source(client, src); + } + + return result; +} + + +static errno_t +nstat_client_handle_get_details( + nstat_client *client, + mbuf_t m) +{ + nstat_msg_query_src_req req; + + if (mbuf_copydata(m, 0, sizeof(req), &req) != 0) { + return EINVAL; + } + + NSTAT_LOCK_SHARED(); + + client->ntc_flags |= NSTAT_FLAG_SUPPORTS_DETAILS; + + errno_t result = ENOENT; + nstat_src *src, *tmpsrc; + tailq_head_nstat_src dead_list; + u_int64_t src_count = 0; + boolean_t partial = FALSE; + const boolean_t all_srcs = (req.srcref == NSTAT_SRC_REF_ALL); + TAILQ_INIT(&dead_list); + + if (all_srcs) { + 
NSTAT_NOTE_QUAL(nstat_query_details_all, client, 0); + } else { + NSTAT_NOTE_QUAL(nstat_query_details_one, client, req.srcref); + } + + /* + * Error handling policy and sequence number generation is folded into + * nstat_client_begin_query. + */ + partial = nstat_client_begin_query(client, &req.hdr); + + TAILQ_FOREACH_SAFE(src, &client->ntc_src_queue, nts_client_link, tmpsrc) { + int gone = 0; + if (all_srcs) { + // Check to see if we should handle this source or if we're still skipping to find where to continue + if ((FALSE == partial || src->nts_seq != client->ntc_seq)) { + u_int64_t suppression_flags = (src->nts_reported)? NSTAT_FILTER_SUPPRESS_BORING_POLL: 0; + if (nstat_client_reporting_allowed(client, src, suppression_flags)) { + result = nstat_client_append_details(client, src, &gone); + if (ENOMEM == result || ENOBUFS == result) { + /* + * If the details message failed to + * enqueue then give up. + */ + NSTAT_NOTE_SRC(nstat_query_details_nobuf, client, src); + break; + } + if (partial) { + /* + * We skip over hard errors and + * filtered sources. + */ + src->nts_seq = client->ntc_seq; + src_count++; + } + } + } + } else if (src->nts_srcref == req.srcref) { + if (nstat_client_reporting_allowed(client, src, 0)) { + result = nstat_client_send_details(client, src, req.hdr.context, 0, 0, &gone); + } + } + + if (gone) { + if (NSTAT_LOCK_SHARED_TO_EXCLUSIVE()) { + // Successfully upgraded the lock, now we can remove the source from the client + // pull src out of the list + nstat_src_remove_linkages(client, src); + TAILQ_INSERT_TAIL(&dead_list, src, nts_client_link); + NSTAT_LOCK_EXCLUSIVE_TO_SHARED(); + NSTAT_NOTE_SRC(nstat_query_details_upgrade, client, src); + } else { + // The upgrade failed and the shared lock has been dropped + // This should be rare. Simply drop out here and have user level retry + // the poll, have the idle cleanup catch the "gone" source + NSTAT_NOTE_SRC(nstat_query_details_noupgrade, client, src); + NSTAT_LOCK_SHARED(); + break; + } + } + + if (!all_srcs && req.srcref == src->nts_srcref) { + break; + } + if (src_count >= QUERY_CONTINUATION_SRC_COUNT) { + NSTAT_NOTE_SRC(nstat_query_details_limit, client, src); + break; + } + if ((src_count >= QUERY_CONTINUATION_MIN_SRC_COUNT) && + (NSTAT_LOCK_WOULD_YIELD())) { + // A possibly higher priority thread is waiting + // Exit from here and have user level initiate the next fragment + NSTAT_NOTE_SRC(nstat_query_details_yield, client, src); + break; + } + } + + nstat_flush_accumulated_msgs(client); + + + u_int16_t flags = 0; + if (req.srcref == NSTAT_SRC_REF_ALL) { + flags = nstat_client_end_query(client, src, partial); + } + NSTAT_UNLOCK_SHARED(); + + /* + * If an error occurred enqueueing data, then allow the error to + * propagate to nstat_client_send. This way, the error is sent to + * user-level. 
+ */ + if (all_srcs && ENOMEM != result && ENOBUFS != result) { + nstat_enqueue_success(req.hdr.context, client, flags); + result = 0; + } + + while ((src = TAILQ_FIRST(&dead_list))) { + TAILQ_REMOVE(&dead_list, src, nts_client_link); + // release src and send notification + nstat_client_cleanup_source(client, src); } return result; @@ -6855,6 +7744,10 @@ nstat_client_send( result = nstat_client_handle_get_update(client, m); break; + case NSTAT_MSG_TYPE_GET_DETAILS: + result = nstat_client_handle_get_details(client, m); + break; + case NSTAT_MSG_TYPE_SUBSCRIBE_SYSINFO: result = nstat_client_handle_subscribe_sysinfo(client); break; @@ -6986,9 +7879,9 @@ progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxd uint64_t flow_count; indicators->np_recentflows++; - flow_count = os_atomic_load(&inp->inp_stat->rxbytes, relaxed); + flow_count = os_atomic_load(&inp->inp_mstat.ms_total.ts_rxbytes, relaxed); indicators->np_recentflows_rxbytes += flow_count; - flow_count = os_atomic_load(&inp->inp_stat->txbytes, relaxed); + flow_count = os_atomic_load(&inp->inp_mstat.ms_total.ts_txbytes, relaxed); indicators->np_recentflows_txbytes += flow_count; indicators->np_recentflows_rxooo += tp->t_stat.rxoutoforderbytes; @@ -7014,12 +7907,10 @@ progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxd assert(shad->shad_magic == TU_SHADOW_MAGIC); bool consider_shad = false; - if (shad->shad_live) { - if (shad->shad_provider == NSTAT_PROVIDER_QUIC_USERLAND) { - consider_shad = update_quic_indicators; - } else if (shad->shad_provider == NSTAT_PROVIDER_TCP_USERLAND) { - consider_shad = update_tcp_indicators; - } + if (shad->shad_provider == NSTAT_PROVIDER_QUIC_USERLAND) { + consider_shad = update_quic_indicators; + } else if (shad->shad_provider == NSTAT_PROVIDER_TCP_USERLAND) { + consider_shad = update_tcp_indicators; } if (consider_shad) { @@ -7028,7 +7919,7 @@ progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxd bzero(&digest, sizeof(digest)); // fetch ifflags and digest from necp_client - bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, &ifflags, &digest, NULL, NULL); + bool result = (*shad->shad_getvals_fn)(shad->shad_provider_context, &ifflags, &digest, NULL, NULL, NULL); error = (result)? 
0 : EIO; if (error) { NSTAT_LOG_ERROR("nstat get ifflags and progressdigest returned %d", error); @@ -7183,7 +8074,7 @@ nstat_gather_flow_data(nstat_provider_id_t provider, nstat_flow_data *__counted_ TAILQ_FOREACH(shad, &nstat_userprot_shad_head, shad_link) { assert(shad->shad_magic == TU_SHADOW_MAGIC); - if ((shad->shad_provider == provider) && (shad->shad_live)) { + if (shad->shad_provider == provider) { if (prepared >= n) { break; } diff --git a/bsd/net/ntstat.h b/bsd/net/ntstat.h index f6b10be76..96fb46942 100644 --- a/bsd/net/ntstat.h +++ b/bsd/net/ntstat.h @@ -95,6 +95,23 @@ typedef struct nstat_counts { u_int32_t nstat_var_rtt; } nstat_counts; +// Note, the nstat_detailed_counts structure is not intended for route statistics, +// hence no equivalent of the nstat_connectattempts and nstat_connectsuccesses within nstat_counts +typedef struct nstat_detailed_counts { + /* Counters */ + struct media_stats nstat_media_stats __attribute__((aligned(sizeof(u_int64_t)))); + + u_int64_t nstat_rxduplicatebytes; + u_int64_t nstat_rxoutoforderbytes; + u_int64_t nstat_txretransmit; + + u_int32_t nstat_min_rtt; + u_int32_t nstat_avg_rtt; + u_int32_t nstat_var_rtt; + u_int32_t nstat_xtra_flags; // Reserved + uuid_t nstat_xtra_uuid; // Reserved +} nstat_detailed_counts; + #define NSTAT_SYSINFO_KEYVAL_STRING_MAXSIZE 24 typedef struct nstat_sysinfo_keyval { u_int32_t nstat_sysinfo_key; @@ -363,6 +380,7 @@ enum{ #define NSTAT_IFNET_IS_WIFI_INFRA 0x00010000 #define NSTAT_IFNET_PEEREGRESSINTERFACE_IS_CELLULAR 0x00020000 #define NSTAT_IFNET_IS_COMPANIONLINK_BT 0x00040000 +#define NSTAT_IFNET_IS_ULTRA_CONSTRAINED 0x00080000 // Not interface properties, but used for filtering in similar fashion #define NSTAT_NECP_CONN_HAS_NET_ACCESS 0x01000000 @@ -780,6 +798,7 @@ enum{ , NSTAT_MSG_TYPE_SET_FILTER = 1006 // Obsolete , NSTAT_MSG_TYPE_GET_UPDATE = 1007 , NSTAT_MSG_TYPE_SUBSCRIBE_SYSINFO = 1008 + , NSTAT_MSG_TYPE_GET_DETAILS = 1009 // Responses/Notfications , NSTAT_MSG_TYPE_SRC_ADDED = 10001 @@ -789,6 +808,8 @@ enum{ , NSTAT_MSG_TYPE_SYSINFO_COUNTS = 10005 , NSTAT_MSG_TYPE_SRC_UPDATE = 10006 , NSTAT_MSG_TYPE_SRC_EXTENDED_UPDATE = 10007 + , NSTAT_MSG_TYPE_SRC_DETAILS = 10008 + , NSTAT_MSG_TYPE_SRC_EXTENDED_DETAILS = 10009 }; enum{ @@ -1124,6 +1145,60 @@ typedef struct nstat_msg_src_update_convenient { }; } nstat_msg_src_update_convenient; +#define NSTAT_SRC_DETAILS_FIELDS \ + nstat_msg_hdr hdr; \ + nstat_src_ref_t srcref __attribute__((aligned(sizeof(u_int64_t)))); \ + nstat_event_flags_t event_flags __attribute__((aligned(sizeof(u_int64_t)))); \ + nstat_detailed_counts detailed_counts; \ + nstat_provider_id_t provider; \ + u_int8_t reserved[4] + +typedef struct nstat_msg_src_details { + NSTAT_SRC_DETAILS_FIELDS; + u_int8_t data[]; +} nstat_msg_src_details; +DEFINE_NTSTAT_DATA_ACCESSOR(struct nstat_msg_src_details) + +typedef struct nstat_msg_src_details_hdr { + NSTAT_SRC_DETAILS_FIELDS; +} nstat_msg_src_details_hdr; + +typedef struct nstat_msg_src_details_tcp { + NSTAT_SRC_DETAILS_FIELDS; + nstat_tcp_descriptor tcp_desc; +} nstat_msg_src_details_tcp; + +typedef struct nstat_msg_src_details_udp { + NSTAT_SRC_DETAILS_FIELDS; + nstat_udp_descriptor udp_desc; +} nstat_msg_src_details_udp; + +typedef struct nstat_msg_src_details_quic { + NSTAT_SRC_DETAILS_FIELDS; + nstat_quic_descriptor quic_desc; +} nstat_msg_src_details_quic; + +typedef struct nstat_msg_src_details_conn { + NSTAT_SRC_DETAILS_FIELDS; + nstat_connection_descriptor conn_desc; +} nstat_msg_src_details_conn; + + +typedef struct 
nstat_msg_src_details_convenient { + nstat_msg_src_details_hdr hdr; + union { + nstat_tcp_descriptor tcp; + nstat_udp_descriptor udp; + nstat_route_descriptor route; + nstat_ifnet_descriptor ifnet; + nstat_sysinfo_descriptor sysinfo; + nstat_quic_descriptor quic; + nstat_connection_descriptor conn; + }; +} nstat_msg_src_details_convenient; + + + typedef struct nstat_msg_src_extended_item_hdr { u_int32_t type; u_int32_t length; @@ -1220,6 +1295,7 @@ nstat_sysinfo_get_keyvals(struct nstat_msg_sysinfo_counts *__header_indexable co #pragma mark -- Statitiscs about Network Statistics -- +// For historic "netstat -s -p nstat" command struct nstat_stats { u_int32_t nstat_successmsgfailures; u_int32_t nstat_sendcountfailures; @@ -1239,6 +1315,204 @@ struct nstat_stats { u_int32_t nstat_handle_msg_failures; }; +// Additional counts that are "global, i.e. not per client + +#define NSTAT_GLOBAL_COUNTS_VERSION 1 +struct nstat_global_counts { + uint64_t nstat_global_count_version; // current version number for this structure + + uint64_t nstat_global_exclusive_lock_uncontended; // Uncontended acquisitions of exlusive lock + uint64_t nstat_global_exclusive_lock_contended; // Contended acquisitions of exlusive lock + + uint64_t nstat_global_shared_lock_uncontended; // Uncontended acquisitions of shared lock + uint64_t nstat_global_shared_lock_contended; // Contended acquisitions of shared lock + + uint64_t nstat_global_client_current; // current number of clients overall + uint64_t nstat_global_client_max; // max number of clients overall + uint64_t nstat_global_client_allocs; // total number of clients allocated + uint64_t nstat_global_client_alloc_fails; // total number of failures to allocate a client + + uint64_t nstat_global_src_current; // current number of srcs overall + uint64_t nstat_global_src_max; // max number of srcs overall + uint64_t nstat_global_src_allocs; // total number of sources allocated + uint64_t nstat_global_src_alloc_fails; // total number of failures to allocate a source + + uint64_t nstat_global_tcp_sck_locus_current; // current number of tcp nstat_sock_locus overall + uint64_t nstat_global_tcp_sck_locus_max; // max number of tcp nstat_sock_locus overall + uint64_t nstat_global_tcp_sck_locus_allocs; // total number of tcp nstat_sock_locus allocated + uint64_t nstat_global_tcp_sck_locus_alloc_fails;// total number of failures to allocate a tcp nstat_sock_locus + + uint64_t nstat_global_udp_sck_locus_current; // current number of udp nstat_extended_sock_locus overall + uint64_t nstat_global_udp_sck_locus_max; // max number of udp nstat_extended_sock_locus overall + uint64_t nstat_global_udp_sck_locus_allocs; // total number of udp nstat_extended_sock_locus allocated + uint64_t nstat_global_udp_sck_locus_alloc_fails;// total number of failures to allocate a udp nstat_extended_sock_locus + + uint64_t nstat_global_tu_shad_current; // current number of nstat_tu_shadow objects overall + uint64_t nstat_global_tu_shad_max; // max number of tu_shadows overall + uint64_t nstat_global_tu_shad_allocs; // total number of tu_shadows allocated + + uint64_t nstat_global_gshad_current; // current number of generic shadow objects overall + uint64_t nstat_global_gshad_max; // max number of srcs overall + uint64_t nstat_global_gshad_allocs; // total number of sources allocated + + uint64_t nstat_global_procdetails_current; // current number of procdetails objects overall + uint64_t nstat_global_procdetails_max; // max number of procdetails overall + uint64_t nstat_global_procdetails_allocs; // 
total number of procdetails allocated + + uint64_t nstat_global_idlecheck_tcp_gone; // idle check removes a TCP locus + uint64_t nstat_global_idlecheck_udp_gone; // idle check removes a UDP locus + uint64_t nstat_global_idlecheck_route_src_gone; // total number of route sources discovered "gone" in idle check + + // Extra details for sock locus lifecycle + uint64_t nstat_global_tcp_sck_locus_stop_using; // Socket has WNT_STOPUSING when creating the initial locus + uint64_t nstat_global_udp_sck_locus_stop_using; // Socket has WNT_STOPUSING when creating the initial locus + uint64_t nstat_global_pcb_detach_with_locus; // Expected path, locus on pcb_detach + uint64_t nstat_global_pcb_detach_with_src; // Expected path, locus on pcb_detach, an associated source being detached + uint64_t nstat_global_pcb_detach_without_locus; // Unexpected path, no locus on pcb_detach + uint64_t nstat_global_pcb_detach_udp; // pcb detach removes a UDP locus + uint64_t nstat_global_pcb_detach_tcp; // pcb detach removes a TCP locus + + uint64_t nstat_global_sck_update_last_owner; // nstat_pcb_update_last_owner() was called + uint64_t nstat_global_sck_fail_first_owner; // can't set name on sock locus create + uint64_t nstat_global_sck_fail_last_owner; // nstat_pcb_update_last_owner() was called, no name available + uint64_t nstat_global_tcp_desc_new_name; // Socket ownership discovered to have changed + uint64_t nstat_global_tcp_desc_fail_name; // Socket ownership discovered to have changed, fail to get new name + uint64_t nstat_global_udp_desc_new_name; // Socket ownership discovered to have changed + uint64_t nstat_global_udp_desc_fail_name; // Socket ownership discovered to have changed, fail to get new name + + // The following are expected to be removed as and when the socket handling code is refined + uint64_t nstat_global_tucookie_current; + uint64_t nstat_global_tucookie_max; + uint64_t nstat_global_tucookie_allocs; + uint64_t nstat_global_tucookie_alloc_fail; + uint64_t nstat_global_tucookie_skip_dead; + uint64_t nstat_global_tucookie_skip_stopusing; + uint64_t nstat_global_src_idlecheck_gone; +}; + + +// Counts that are typically per-client +// They are also accumulated globally for all previous clients +// +// The "net.stats.metrics" systctl can request these metrics either from a specific client, +// the accumulated counts for closed clients, or a summary of all the closed and current clients. +// To collect individual metrics for all clients, an initial request is made targeting +// NSTAT_METRIC_ID_MAX via the mr_id field in the request structure. The returned metrics will be +// for the client with the highest identifer, as returned in the nstat_client_id field. +// The next request should target that returned identifier minus one, which will collect +// the client with the next highest identifier. 
This sequence can continue until metrics +// for all clients have been collected +#define NSTAT_METRIC_VERSION 2 +#define NSTAT_METRIC_ID_ACCUMULATED 0x0 /* Accumulation from all clients that have previously closed */ +#define NSTAT_METRIC_ID_GRAND_TOTAL 0x1 /* Accumulation from all clients, current and historic */ +#define NSTAT_METRIC_ID_MAX 0xffffffff /* Start scanning all clients with this initial value */ + +struct nstat_metrics_req { + uint32_t mr_version; // The version of metrics being requested + uint32_t mr_id; // Identifier for the metrics, a client id, or accumulated or grand total +}; + +struct nstat_client_details { + uint32_t nstat_client_id; // Identifier for this set of metrics, a client id, or accumulated + pid_t nstat_client_pid; // Process id of client that owns these metrics + uint32_t nstat_client_watching; // Bitmap of providers being watched + uint32_t nstat_client_added_src; // Bitmap of providers with individually added sources +}; + +struct nstat_metrics { + uint32_t nstat_src_current; // current number of srcs for client + uint32_t nstat_src_max; // max number of srcs for client + uint32_t nstat_first_uint32_count; // Subsequent fields must be uint32_t values that, if kept per-client, + // should simply added to the global counts when the client exit + + // Tracking client requests + uint32_t nstat_query_request_all; // Client requests for all counts + uint32_t nstat_query_request_one; // Client request for counts on a single source + uint32_t nstat_query_description_all; // Client requests for all descriptors + uint32_t nstat_query_description_one; // Client requests for descriptor on a single source + uint32_t nstat_query_update_all; // Client requests for all updates + uint32_t nstat_query_update_one; // Client requests for update on a single source + uint32_t nstat_remove_src_found; // Client request to remove a source which is still in existence + uint32_t nstat_remove_src_missed; // Client request to remove a source which is no longer there + + // Details for nstat_query_request all/one + uint32_t nstat_query_request_nobuf; // No buffers for message send + uint32_t nstat_query_request_upgrade; // Successful lock upgrade to handle "gone" source + uint32_t nstat_query_request_noupgrade; // Unsuccessful lock upgrade to handle "gone" source + uint32_t nstat_query_request_nodesc; // Can't send a descriptor for "gone" source + uint32_t nstat_query_request_yield; // Client yields lock due to possibly higher priority processing + uint32_t nstat_query_request_limit; // Client requests for all counts + + // Details for nstat_query_description all/one + uint32_t nstat_query_description_nobuf; // No buffers for message send + uint32_t nstat_query_description_yield; // Client yields lock due to possibly higher priority processing + uint32_t nstat_query_description_limit; // Client requests for all counts + + // Details for nstat_query_details all/one + uint32_t nstat_query_details_nobuf; // No buffers for message send + uint32_t nstat_query_details_upgrade; // Successful lock upgrade to handle "gone" source + uint32_t nstat_query_details_noupgrade; // Unsuccessful lock upgrade to handle "gone" source + uint32_t nstat_query_details_yield; // Client yields lock due to possibly higher priority processing + uint32_t nstat_query_details_limit; // Client requests for all counts + uint32_t nstat_query_details_all; // Request received for all sources + uint32_t nstat_query_details_one; // Request received for a specific source + + // Details for nstat_query_update all/one + 
uint32_t nstat_query_update_nobuf; // No buffers for message send + uint32_t nstat_query_update_upgrade; // Successful lock upgrade to handle "gone" source + uint32_t nstat_query_update_noupgrade; // Unsuccessful lock upgrade to handle "gone" source + uint32_t nstat_query_update_nodesc; // Can't send a descriptor for "gone" source + uint32_t nstat_query_update_yield; // Client yields lock due to possibly higher priority processing + uint32_t nstat_query_update_limit; // Client requests for all counts + + // Details for adding a source + uint32_t nstat_src_add_success; // successful src_add + uint32_t nstat_src_add_no_buf; // fail to get buffer for initial src-added + uint32_t nstat_src_add_no_src_mem; // fail to get memory for nstat_src structure + uint32_t nstat_src_add_send_err; // fail to send initial src-added + uint32_t nstat_src_add_while_cleanup; // fail to add because client is in clean up state + + // Details for adding the client as a watcher + uint32_t nstat_add_all_tcp_skip_dead; // Skip a dead PCB when adding all TCP + uint32_t nstat_add_all_udp_skip_dead; // Skip a dead PCB when adding all UDP + + // Details for sending "goodbye" on source removal + uint32_t nstat_src_goodbye_successes;// Successful goodbyes (include cases messages filtered out) + uint32_t nstat_src_goodbye_failures; // Failed goodbyes, further qualified by.. + uint32_t nstat_src_goodbye_sent_details; // Sent a concluding details message + uint32_t nstat_src_goodbye_failed_details; // Failed to send a details message + uint32_t nstat_src_goodbye_filtered_details;// Skipped trying to send a details message + uint32_t nstat_src_goodbye_sent_update; // Sent a concluding update message + uint32_t nstat_src_goodbye_failed_update; // Failed to send an update message + uint32_t nstat_src_goodbye_filtered_update; // Skipped trying to send an update message + uint32_t nstat_src_goodbye_sent_counts; // Sent a concluding counts message + uint32_t nstat_src_goodbye_failed_counts; // Failed to send a counts message + uint32_t nstat_src_goodbye_filtered_counts; // Skipped trying to send both counts and descriptor messages + uint32_t nstat_src_goodbye_sent_description;// Sent a concluding description message + uint32_t nstat_src_goodbye_failed_description; // Failed to send a description message + uint32_t nstat_src_goodbye_sent_removed; // Sent a concluding removed message + uint32_t nstat_src_goodbye_failed_removed; // Failed to send a removed message + uint32_t nstat_src_goodbye_filtered_removed; // Skipped on sending a removed message + + uint32_t nstat_pcb_event; // send pcb event code called, one precursor to the send_event metrics + uint32_t nstat_send_event; // send event successful + uint32_t nstat_send_event_fail; // send event fail, likely lack of buffers + uint32_t nstat_send_event_notsup; // send event not supported, old style client + + + uint32_t nstat_route_src_gone_idlecheck; // route src gone noted during periodic idle check + uint32_t nstat_src_removed_linkage; // removed src linkages on the way to deletion + + uint32_t nstat_src_gone_idlecheck; // Expected to be redundant/removed when socket handling code is refined + + uint32_t nstat_last_uint32_count; // Must be the last uint32_t count in the structure + uint32_t nstat_stats_pad; +}; + +struct nstat_client_info { + struct nstat_client_details nstat_client_details; + struct nstat_metrics nstat_metrics; +}; /* * Structure with information that gives insight into forward progress on an * interface, exported to user-land via sysctl(3). 
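The per-client metrics interface added above is meant to be walked client by client: start at NSTAT_METRIC_ID_MAX, then step down from the identifier returned in nstat_client_id until only the accumulated and grand-total pseudo entries remain. A rough user-space illustration of that walk, assuming the "net.stats.metrics" sysctl takes struct nstat_metrics_req as its new-value argument, returns a single struct nstat_client_info, and fails with ENOENT once no client at or below the requested id exists (the error convention and the availability of the private ntstat.h definitions are assumptions):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/sysctl.h>
/* assumes the private ntstat.h shown above is visible to user space */

static void
scan_nstat_client_metrics(void)
{
	struct nstat_metrics_req req;
	struct nstat_client_info info;
	uint32_t next_id = NSTAT_METRIC_ID_MAX;   /* start with the highest client id */

	for (;;) {
		size_t len = sizeof(info);

		memset(&req, 0, sizeof(req));
		req.mr_version = NSTAT_METRIC_VERSION;
		req.mr_id = next_id;

		if (sysctlbyname("net.stats.metrics", &info, &len, &req, sizeof(req)) != 0) {
			if (errno != ENOENT) {
				perror("net.stats.metrics");
			}
			break;                            /* assumed: no client at or below next_id */
		}

		printf("client %u pid %d: %u current srcs\n",
		    info.nstat_client_details.nstat_client_id,
		    (int)info.nstat_client_details.nstat_client_pid,
		    info.nstat_metrics.nstat_src_current);

		if (info.nstat_client_details.nstat_client_id <= NSTAT_METRIC_ID_GRAND_TOTAL) {
			break;                            /* ids 0 and 1 are the accumulated/grand-total slots */
		}
		/* next request targets the returned identifier minus one */
		next_id = info.nstat_client_details.nstat_client_id - 1;
	}
}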
@@ -1433,8 +1707,9 @@ void nstat_udp_new_pcb(struct inpcb *inp); void nstat_route_new_entry(struct rtentry *rt); void nstat_pcb_detach(struct inpcb *inp); void nstat_pcb_event(struct inpcb *inp, u_int64_t event); -void nstat_pcb_cache(struct inpcb *inp); -void nstat_pcb_invalidate_cache(struct inpcb *inp); +void nstat_udp_pcb_cache(struct inpcb *inp); +void nstat_udp_pcb_invalidate_cache(struct inpcb *inp); +void nstat_pcb_update_last_owner(struct inpcb *inp); void nstat_ifnet_threshold_reached(unsigned int ifindex); @@ -1474,6 +1749,7 @@ typedef bool (userland_stats_request_vals_fn)(userland_stats_provider_context *c u_int32_t *ifflagsp, nstat_progress_digest *digestp, nstat_counts *countsp, + nstat_detailed_counts *detailed_countsp, void *metadatap); // Netstats can also request "extension" items, specified by the allowed_extensions flag @@ -1534,6 +1810,7 @@ typedef void *nstat_context; /* This is quoted by the external provid typedef bool (nstat_provider_request_vals_fn)(nstat_provider_context ctx, u_int32_t *ifflagsp, /* Flags for being on cell/wifi etc, used for filtering */ nstat_counts *countsp, /* Counts to be filled in */ + nstat_detailed_counts *detailsp, /* Detailed Counts to be filled in */ void *metadatap); /* A descriptor for the particular provider */ // Netstats can also request "extension" items, specified by the allowed_extensions flag diff --git a/bsd/net/packet_mangler.c b/bsd/net/packet_mangler.c index 76e7f9fcf..dd394e09e 100644 --- a/bsd/net/packet_mangler.c +++ b/bsd/net/packet_mangler.c @@ -664,7 +664,7 @@ pkt_mnglr_init(void) /* * Compile time verifications */ - _CASSERT(PKT_MNGLR_MAX_FILTER_COUNT == MAX_PACKET_MANGLER); + static_assert(PKT_MNGLR_MAX_FILTER_COUNT == MAX_PACKET_MANGLER); /* * Register kernel control diff --git a/bsd/net/pf.c b/bsd/net/pf.c index 733764828..e85ec0d9d 100644 --- a/bsd/net/pf.c +++ b/bsd/net/pf.c @@ -83,6 +83,9 @@ #include +#include + +#include #include #include #include @@ -2157,8 +2160,8 @@ pf_calc_state_key_flowhash(struct pf_state_key *sk) VERIFY(sk->flowsrc == FLOWSRC_PF); bzero(&fk, sizeof(fk)); - _CASSERT(sizeof(sk->lan.addr) == sizeof(fk.ffk_laddr)); - _CASSERT(sizeof(sk->ext_lan.addr) == sizeof(fk.ffk_laddr)); + static_assert(sizeof(sk->lan.addr) == sizeof(fk.ffk_laddr)); + static_assert(sizeof(sk->ext_lan.addr) == sizeof(fk.ffk_laddr)); bcopy(&sk->lan.addr, &fk.ffk_laddr, sizeof(fk.ffk_laddr)); bcopy(&sk->ext_lan.addr, &fk.ffk_raddr, sizeof(fk.ffk_raddr)); fk.ffk_af = sk->af_lan; @@ -4853,7 +4856,7 @@ pf_nat64_ipv6(pbuf_t *pbuf, int off, struct pf_pdesc *pd) } if ((m = pbuf_to_mbuf(pbuf, TRUE)) != NULL) { - ip_input(m); + ip_proto_input(AF_INET, m); } return PF_NAT64; @@ -9227,6 +9230,8 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, int error = 0; uint32_t sw_csum; int interface_mtu = 0; + drop_reason_t drop_reason = DROP_REASON_PF_UNSPECIFIED; + bzero(&iproute, sizeof(iproute)); if (pbufp == NULL || !pbuf_is_valid(*pbufp) || r == NULL || @@ -9265,6 +9270,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, if (m0->m_len < (int)sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route: packet length < sizeof (struct ip)\n")); + drop_reason = DROP_REASON_PF_UNDERSIZED; goto bad; } @@ -9279,6 +9285,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, rtalloc(ro); if (ro->ro_rt == NULL) { ipstat.ips_noroute++; + drop_reason = DROP_REASON_PF_NO_ROUTE; goto bad; } @@ -9294,6 +9301,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet 
*oifp, if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route: TAILQ_EMPTY(&r->rpool.list)\n")); + drop_reason = DROP_REASON_PF_NO_ROUTE; goto bad; } if (s == NULL) { @@ -9313,11 +9321,13 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, } } if (ifp == NULL) { + drop_reason = DROP_REASON_PF_NULL_IFP; goto bad; } if (oifp != ifp) { if (pf_test_mbuf(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) { + drop_reason = DROP_REASON_PF_DROP; goto bad; } else if (m0 == NULL) { goto done; @@ -9325,6 +9335,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, if (m0->m_len < (int)sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route: packet length < sizeof (struct ip)\n")); + drop_reason = DROP_REASON_PF_UNDERSIZED; goto bad; } ip = mtod(m0, struct ip *); @@ -9368,6 +9379,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, interface_mtu); goto done; } else { + drop_reason = DROP_REASON_PF_NO_TSO; goto bad; } } @@ -9383,6 +9395,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, if (error) { m0 = NULL; + drop_reason = DROP_REASON_PF_CANNOT_FRAGMENT; goto bad; } @@ -9407,7 +9420,8 @@ done: bad: if (m0) { - m_freem(m0); + m_drop(m0, DROPTAP_FLAG_DIR_IN, drop_reason, NULL, 0); + m0 = NULL; } goto done; } @@ -9428,6 +9442,7 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_src_node *__single sn = NULL; int error = 0; struct pf_mtag *__single pf_mtag; + drop_reason_t drop_reason = DROP_REASON_PF_UNSPECIFIED; if (pbufp == NULL || !pbuf_is_valid(*pbufp) || r == NULL || (dir != PF_IN && dir != PF_OUT) || oifp == NULL) { @@ -9463,6 +9478,7 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, if (m0->m_len < (int)sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route6: m0->m_len < sizeof (struct ip6_hdr)\n")); + drop_reason = DROP_REASON_PF_UNDERSIZED; goto bad; } ip6 = mtod(m0, struct ip6_hdr *); @@ -9488,6 +9504,7 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route6: TAILQ_EMPTY(&r->rpool.list)\n")); + drop_reason = DROP_REASON_PF_NO_ROUTE; goto bad; } if (s == NULL) { @@ -9506,11 +9523,13 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, ifp = s->rt_kif ? 
s->rt_kif->pfik_ifp : NULL; } if (ifp == NULL) { + drop_reason = DROP_REASON_PF_NULL_IFP; goto bad; } if (oifp != ifp) { if (pf_test6_mbuf(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) { + drop_reason = DROP_REASON_PF_DROP; goto bad; } else if (m0 == NULL) { goto done; @@ -9518,6 +9537,7 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, if (m0->m_len < (int)sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_route6: m0->m_len " "< sizeof (struct ip6_hdr)\n")); + drop_reason = DROP_REASON_PF_UNDERSIZED; goto bad; } pf_mtag = pf_get_mtag(m0); @@ -9555,6 +9575,7 @@ pf_route6(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, if (r->rt != PF_DUPTO) { icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); } else { + drop_reason = DROP_REASON_PF_NO_TSO; goto bad; } } @@ -9564,7 +9585,7 @@ done: bad: if (m0) { - m_freem(m0); + m_drop(m0, DROPTAP_FLAG_DIR_IN, drop_reason, NULL, 0); m0 = NULL; } goto done; @@ -10922,8 +10943,7 @@ pool_init(struct pool *pp, size_t size, unsigned int align, unsigned int ioff, { #pragma unused(align, ioff, flags, palloc) bzero(pp, sizeof(*pp)); - pp->pool_zone = zone_create(wchan, size, - ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM); + pp->pool_zone = zone_create(wchan, size, ZC_ZFREE_CLEARMEM); pp->pool_hiwat = pp->pool_limit = (unsigned int)-1; pp->pool_name = wchan; } diff --git a/bsd/net/pf_ioctl.c b/bsd/net/pf_ioctl.c index 31f1df37d..f8c2154df 100644 --- a/bsd/net/pf_ioctl.c +++ b/bsd/net/pf_ioctl.c @@ -85,6 +85,7 @@ #include #include +#include #include #include #include @@ -490,17 +491,17 @@ pfinit(void) TAILQ_INIT(&pf_pabuf); TAILQ_INIT(&state_list); - _CASSERT((SC_BE & SCIDX_MASK) == SCIDX_BE); - _CASSERT((SC_BK_SYS & SCIDX_MASK) == SCIDX_BK_SYS); - _CASSERT((SC_BK & SCIDX_MASK) == SCIDX_BK); - _CASSERT((SC_RD & SCIDX_MASK) == SCIDX_RD); - _CASSERT((SC_OAM & SCIDX_MASK) == SCIDX_OAM); - _CASSERT((SC_AV & SCIDX_MASK) == SCIDX_AV); - _CASSERT((SC_RV & SCIDX_MASK) == SCIDX_RV); - _CASSERT((SC_VI & SCIDX_MASK) == SCIDX_VI); - _CASSERT((SC_SIG & SCIDX_MASK) == SCIDX_SIG); - _CASSERT((SC_VO & SCIDX_MASK) == SCIDX_VO); - _CASSERT((SC_CTL & SCIDX_MASK) == SCIDX_CTL); + static_assert((SC_BE & SCIDX_MASK) == SCIDX_BE); + static_assert((SC_BK_SYS & SCIDX_MASK) == SCIDX_BK_SYS); + static_assert((SC_BK & SCIDX_MASK) == SCIDX_BK); + static_assert((SC_RD & SCIDX_MASK) == SCIDX_RD); + static_assert((SC_OAM & SCIDX_MASK) == SCIDX_OAM); + static_assert((SC_AV & SCIDX_MASK) == SCIDX_AV); + static_assert((SC_RV & SCIDX_MASK) == SCIDX_RV); + static_assert((SC_VI & SCIDX_MASK) == SCIDX_VI); + static_assert((SC_SIG & SCIDX_MASK) == SCIDX_SIG); + static_assert((SC_VO & SCIDX_MASK) == SCIDX_VO); + static_assert((SC_CTL & SCIDX_MASK) == SCIDX_CTL); /* default rule should never be garbage collected */ pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next; @@ -828,7 +829,7 @@ tagname2tag(struct pf_tags *head, char const *tagname) /* * check if it is a reserved tag. */ - _CASSERT(RESERVED_TAG_ID_MIN > DYNAMIC_TAG_ID_MAX); + static_assert(RESERVED_TAG_ID_MIN > DYNAMIC_TAG_ID_MAX); for (int i = 0; i < NUM_RESERVED_TAGS; i++) { if (strlcmp(pf_reserved_tag_table[i].tag_name, tagname, PF_TAG_NAME_SIZE) == 0) { new_tagid = pf_reserved_tag_table[i].tag_id; @@ -4737,7 +4738,8 @@ pf_inet_hook(struct ifnet *ifp, struct mbuf **mp, int input, #endif if (pf_test_mbuf(input ? PF_IN : PF_OUT, ifp, mp, NULL, fwa) != PF_PASS) { if (*mp != NULL) { - m_freem(*mp); + m_drop(*mp, input ? 
DROPTAP_FLAG_DIR_IN : DROPTAP_FLAG_DIR_OUT, + DROP_REASON_PF_NO_ROUTE, NULL, 0); *mp = NULL; error = EHOSTUNREACH; } else { @@ -4787,7 +4789,8 @@ pf_inet6_hook(struct ifnet *ifp, struct mbuf **mp, int input, if (pf_test6_mbuf(input ? PF_IN : PF_OUT, ifp, mp, NULL, fwa) != PF_PASS) { if (*mp != NULL) { - m_freem(*mp); + m_drop(*mp, input ? DROPTAP_FLAG_DIR_IN : DROPTAP_FLAG_DIR_OUT, + DROP_REASON_PF_NO_ROUTE, NULL, 0); *mp = NULL; error = EHOSTUNREACH; } else { @@ -4888,7 +4891,7 @@ static __attribute__((unused)) void pfioctl_cassert(void) { /* - * This is equivalent to _CASSERT() and the compiler wouldn't + * This is equivalent to static_assert() and the compiler wouldn't * generate any instructions, thus for compile time only. */ switch ((u_long)0) { diff --git a/bsd/net/pf_norm.c b/bsd/net/pf_norm.c index 82f5f0dc8..22beae601 100644 --- a/bsd/net/pf_norm.c +++ b/bsd/net/pf_norm.c @@ -86,6 +86,7 @@ #include #include +#include struct pf_frent { LIST_ENTRY(pf_frent) fr_next; @@ -758,7 +759,8 @@ insert: m->m_pkthdr.csum_rx_val = csum; m->m_pkthdr.csum_rx_start = sizeof(struct ip); m->m_pkthdr.csum_flags = (*frag)->fr_csum_flags; - } else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || + } else if ((m->m_pkthdr.rcvif != NULL && + m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || (m->m_pkthdr.pkt_flags & PKTF_LOOP)) { /* loopback checksums are always OK */ m->m_pkthdr.csum_data = 0xffff; @@ -794,7 +796,7 @@ drop_fragment: /* Oops - fail safe - drop packet */ pool_put(&pf_frent_pl, frent); pf_nfrents--; - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_PF_BAD_FRAGMENT, NULL, 0); return NULL; } @@ -1085,7 +1087,8 @@ no_mem: (*frag)->fr_flags |= PFFRAG_SEENLAST; } - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, + DROP_REASON_PF_MEM_ALLOC, NULL, 0); return NULL; drop_fragment: @@ -1104,7 +1107,8 @@ drop_fragment: (*frag)->fr_flags |= PFFRAG_DROP; } - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, + DROP_REASON_PF_BAD_FRAGMENT, NULL, 0); return NULL; } @@ -1402,7 +1406,8 @@ insert: m->m_pkthdr.csum_rx_val = csum; m->m_pkthdr.csum_rx_start = sizeof(struct ip6_hdr); m->m_pkthdr.csum_flags = (*frag)->fr_csum_flags; - } else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || + } else if ((m->m_pkthdr.rcvif != NULL && + m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || (m->m_pkthdr.pkt_flags & PKTF_LOOP)) { /* loopback checksums are always OK */ m->m_pkthdr.csum_data = 0xffff; @@ -1450,7 +1455,7 @@ insert: sizeof(*ftag), M_NOWAIT, m); if (mtag == NULL) { /* XXX: add stats */ - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_PF_MEM_ALLOC, NULL, 0); return NULL; } ftag = (struct pf_fragment_tag *)mtag->m_tag_data; @@ -1470,7 +1475,7 @@ drop_fragment: /* Oops - fail safe - drop packet */ pool_put(&pf_frent_pl, frent); --pf_nfrents; - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_PF_BAD_FRAGMENT, NULL, 0); return NULL; } @@ -1787,7 +1792,7 @@ no_mem: (*frag)->fr_flags |= PFFRAG_SEENLAST; } - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_PF_MEM_ALLOC, NULL, 0); return NULL; drop_fragment: @@ -1806,7 +1811,7 @@ drop_fragment: (*frag)->fr_flags |= PFFRAG_DROP; } - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_PF_BAD_FRAGMENT, NULL, 0); return NULL; } @@ -1823,7 +1828,6 @@ pf_refragment6(struct ifnet *ifp, pbuf_t **pbufp, struct pf_fragment_tag *ftag) struct route_in6 *__single ro; struct sockaddr_in6 *__single dst; struct ip6_hdr *__single hdr; - struct pf_mtag *__single mtag; struct m_tag *__single tag; if (pbufp == NULL || !pbuf_is_valid(*pbufp) || ftag == NULL) 
{ @@ -1832,7 +1836,6 @@ pf_refragment6(struct ifnet *ifp, pbuf_t **pbufp, struct pf_fragment_tag *ftag) } m = pbuf_to_mbuf(*pbufp, FALSE); hdr = mtod(m, struct ip6_hdr *); - mtag = pf_find_mtag(m); hdrlen = ftag->ft_hdrlen - sizeof(struct ip6_hdr); extoff = ftag->ft_extoff; maxlen = ftag->ft_maxlen; @@ -1842,7 +1845,7 @@ pf_refragment6(struct ifnet *ifp, pbuf_t **pbufp, struct pf_fragment_tag *ftag) m_tag_delete(m, tag); ftag = NULL; tag = NULL; - mtag->pftag_flags &= ~PF_TAG_REASSEMBLED; + pf_find_mtag(m)->pftag_flags &= ~PF_TAG_REASSEMBLED; ro = &ip6route; bzero((struct route_in6 *__bidi_indexable)ro, sizeof(*ro)); dst = (struct sockaddr_in6 *)&ro->ro_dst; @@ -1889,7 +1892,7 @@ pf_refragment6(struct ifnet *ifp, pbuf_t **pbufp, struct pf_fragment_tag *ftag) * PF_TAG_REFRAGMENTED flag set to indicate ip6_forward() * and pf_route6() that the mbuf contains a chain of fragments. */ - mtag->pftag_flags |= PF_TAG_REFRAGMENTED; + pf_find_mtag(m)->pftag_flags |= PF_TAG_REFRAGMENTED; action = PF_PASS; pbuf_init_mbuf(*pbufp, m, ifp); } else { @@ -2033,7 +2036,7 @@ pf_normalize_ip(pbuf_t *pbuf, int dir, struct pfi_kif *kif, u_short *reason, frent = pool_get(&pf_frent_pl, PR_NOWAIT); if (frent == NULL) { REASON_SET(reason, PFRES_MEMORY); - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN, DROP_REASON_PF_MEM_ALLOC, NULL, 0); return PF_DROP; } pf_nfrents++; diff --git a/bsd/net/pf_pbuf.c b/bsd/net/pf_pbuf.c index 4047c0637..a91a3eb54 100644 --- a/bsd/net/pf_pbuf.c +++ b/bsd/net/pf_pbuf.c @@ -229,12 +229,21 @@ pbuf_ensure_writable(pbuf_t *pbuf, size_t len) } void * +__attribute__((warn_unused_result)) pbuf_resize_segment(pbuf_t *pbuf, int off, int olen, int nlen) { void *rv = NULL; VERIFY(off >= 0); - VERIFY((u_int)off <= pbuf->pb_packet_len); + + /* + * Gracefully handle the case where `pbuf' + * does not have sufficient data + * for the requested `off'/`olen' combination. + */ + if ((u_int)(off + olen) > pbuf->pb_packet_len) { + return NULL; + } if (pbuf->pb_type == PBUF_TYPE_MBUF) { struct mbuf *m, *n; @@ -263,18 +272,23 @@ pbuf_resize_segment(pbuf_t *pbuf, int off, int olen, int nlen) return NULL; } - rv = mtod(n, void *); - if (off > 0) { /* Merge the two chains */ int mlen; + struct mbuf *new_n; + int new_off = 0; mlen = n->m_pkthdr.len; m_cat(m, n); m->m_pkthdr.len += mlen; + + new_n = m_getptr(m, off, &new_off); + rv = mtod(new_n, uint8_t *) + new_off; } else { /* The new mbuf becomes the packet header */ pbuf->pb_mbuf = n; + + rv = mtod(n, void *); } pbuf_sync(pbuf); @@ -307,13 +321,22 @@ pbuf_resize_segment(pbuf_t *pbuf, int off, int olen, int nlen) } void * +__attribute__((warn_unused_result)) pbuf_contig_segment(pbuf_t *pbuf, int off, int len) { void *__single rv = NULL; VERIFY(off >= 0); VERIFY(len >= 0); - VERIFY((u_int)(off + len) <= pbuf->pb_packet_len); + + /* + * Gracefully handle the case where `pbuf' + * does not have sufficient data + * for the requested `off'/`len' combination. + */ + if ((u_int)(off + len) > pbuf->pb_packet_len) { + return NULL; + } /* * Note: If this fails, then the pbuf is destroyed. 
This is a diff --git a/bsd/net/pf_pbuf.h b/bsd/net/pf_pbuf.h index b995fa291..67bfe038c 100644 --- a/bsd/net/pf_pbuf.h +++ b/bsd/net/pf_pbuf.h @@ -96,8 +96,8 @@ struct mbuf *pbuf_clone_to_mbuf(pbuf_t *); void * pbuf_ensure_contig(pbuf_t *, size_t); void * pbuf_ensure_writable(pbuf_t *, size_t); -void * pbuf_resize_segment(pbuf_t *, int off, int olen, int nlen); -void * pbuf_contig_segment(pbuf_t *, int off, int len); +void * pbuf_resize_segment(pbuf_t *, int off, int olen, int nlen) __attribute__((warn_unused_result)); +void * pbuf_contig_segment(pbuf_t *, int off, int len) __attribute__((warn_unused_result)); void pbuf_copy_data(pbuf_t *, int, int, void *__sized_by(buflen), size_t buflen); void pbuf_copy_back(pbuf_t *, int, int, void *__sized_by(buflen), size_t buflen); diff --git a/bsd/net/pktap.c b/bsd/net/pktap.c index adaa8d29f..e4c4b3a57 100644 --- a/bsd/net/pktap.c +++ b/bsd/net/pktap.c @@ -201,15 +201,15 @@ pktap_hexdump(int mask, void *__sized_by(len) addr, size_t len) } } -#define _CASSERT_OFFFSETOF_FIELD(s1, s2, f) \ - _CASSERT(offsetof(struct s1, f) == offsetof(struct s2, f)) +#define ASSERT_OFFFSETOF_FIELD(s1, s2, f) \ + static_assert(offsetof(struct s1, f) == offsetof(struct s2, f)) __private_extern__ void pktap_init(void) { int error = 0; - _CASSERT_OFFFSETOF_FIELD(pktap_header, pktap_v2_hdr, pth_flags); + ASSERT_OFFFSETOF_FIELD(pktap_header, pktap_v2_hdr, pth_flags); /* Make sure we're called only once */ VERIFY(pktap_inited == 0); @@ -1186,7 +1186,7 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, u_int32_t pre_adjust = 0; /* Verify the structure is packed */ - _CASSERT(sizeof(hdr_buffer) == sizeof(struct pktap_header) + sizeof(u_int32_t)); + static_assert(sizeof(hdr_buffer) == sizeof(struct pktap_header) + sizeof(u_int32_t)); bzero(&hdr_buffer, sizeof(hdr_buffer)); hdr->pth_length = sizeof(struct pktap_header); @@ -1300,6 +1300,11 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) { hdr->pth_flags |= PTH_FLAG_WAKE_PKT; } + + /* Need to check the packet flag in case full wake has been requested */ + if (m->m_pkthdr.pkt_ext_flags & PKTF_EXT_LPW || if_is_lpw_enabled(ifp)) { + hdr->pth_flags |= PTH_FLAG_LPW; + } if (outgoing != 0) { hdr->pth_comp_gencnt = m->m_pkthdr.comp_gencnt; } @@ -1450,7 +1455,13 @@ pktap_bpf_tap_packet(struct ifnet *ifp, protocol_family_t proto, uint32_t dlt, if (kern_packet_get_wake_flag(pkt)) { hdr->pth_flags |= PTH_FLAG_WAKE_PKT; } + + /* Need to check the packet flag in case full wake has been requested */ + if (kern_packet_get_lpw_flag(pkt) || if_is_lpw_enabled(ifp)) { + hdr->pth_flags |= PTH_FLAG_LPW; + } kern_packet_get_compression_generation_count(pkt, &hdr->pth_comp_gencnt); + hdr->pth_trace_tag = kern_packet_get_trace_tag(pkt); hdr->pth_protocol_family = proto; hdr->pth_svc = so_svc2tc((mbuf_svc_class_t) diff --git a/bsd/net/pktap.h b/bsd/net/pktap.h index 49406c14d..2418f5017 100644 --- a/bsd/net/pktap.h +++ b/bsd/net/pktap.h @@ -216,9 +216,25 @@ struct pktap_buffer_v2_hdr_extra { #define PTH_FLAG_NEXUS_CHAN 0x00040000 /* Packet on a nexus channel */ #define PTH_FLAG_V2_HDR 0x00080000 /* Version 2 of pktap */ #define PTH_FLAG_WAKE_PKT 0x00100000 /* Packet caused system to ake from sleep */ +#define PTH_FLAG_ULPN 0x00200000 /* Packet transitted coprocessor */ +#define PTH_FLAG_LPW 0x00400000 /* Packet in low power wake */ #include +#ifdef KERNEL_PRIVATE +#if SKYWALK +#include +extern void pktap_input_packet(struct ifnet *, protocol_family_t, 
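/*
 * The pktap hunks above swap the old _CASSERT() macro for C11
 * static_assert() to pin structure layouts at compile time.  A standalone
 * sketch of the same offsetof() check, using stand-in structures that only
 * illustrate the idea of two header versions sharing one field's position.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct hdr_v1 {                   /* stand-ins for pktap_header / pktap_v2_hdr */
    uint32_t h_length;
    uint32_t h_flags;
};

struct hdr_v2 {
    uint32_t h_length;
    uint32_t h_flags;
    uint16_t h_extra;
};

/* Fails the build, not the boot, if the shared field ever moves. */
static_assert(offsetof(struct hdr_v1, h_flags) == offsetof(struct hdr_v2, h_flags),
    "h_flags must sit at the same offset in both header versions");

int
main(void)
{
    printf("h_flags offset: %zu\n", offsetof(struct hdr_v1, h_flags));
    return 0;
}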
uint32_t, + pid_t, const char *, pid_t, const char *, kern_packet_t, + const void *__sized_by(header_len) header, size_t header_len, + uint8_t, uint32_t, uint32_t); +extern void pktap_output_packet(struct ifnet *, protocol_family_t, uint32_t, + pid_t, const char *, pid_t, const char *, kern_packet_t, + const void *__sized_by(header_len) header, size_t header_len, + uint8_t, uint32_t, uint32_t); +#endif /* SKYWALK */ +#endif /* KERNEL_PRIVATE */ + #ifdef BSD_KERNEL_PRIVATE #include @@ -238,17 +254,6 @@ extern void pktap_fill_proc_info(struct pktap_header *, protocol_family_t, struct mbuf *, u_int32_t, int, struct ifnet *); extern void pktap_finalize_proc_info(struct pktap_header *); extern void pktap_v2_finalize_proc_info(struct pktap_v2_hdr *); -#if SKYWALK -#include -extern void pktap_input_packet(struct ifnet *, protocol_family_t, uint32_t, - pid_t, const char *, pid_t, const char *, kern_packet_t, - const void *__sized_by(header_len) header, size_t header_len, - uint8_t, uint32_t, uint32_t); -extern void pktap_output_packet(struct ifnet *, protocol_family_t, uint32_t, - pid_t, const char *, pid_t, const char *, kern_packet_t, - const void *__sized_by(header_len) header, size_t header_len, - uint8_t, uint32_t, uint32_t); -#endif /* SKYWALK */ extern void convert_to_pktap_header_to_v2(struct bpf_packet *bpf_pkt, bool truncate); #endif /* BSD_KERNEL_PRIVATE */ #endif /* PRIVATE */ diff --git a/bsd/net/pktsched/Makefile b/bsd/net/pktsched/Makefile index fc5f59f77..8e14c2dc4 100644 --- a/bsd/net/pktsched/Makefile +++ b/bsd/net/pktsched/Makefile @@ -12,7 +12,7 @@ KERNELFILES= \ PRIVATE_DATAFILES = \ pktsched.h pktsched_cbq.h pktsched_fairq.h pktsched_hfsc.h \ - pktsched_priq.h pktsched_rmclass.h pktsched_fq_codel.h + pktsched_priq.h pktsched_rmclass.h pktsched_fq_codel.h pktsched_ops.h PRIVATE_KERNELFILES = ${KERNELFILES} diff --git a/bsd/net/pktsched/pktsched.c b/bsd/net/pktsched/pktsched.c index 34e35d4a1..3357a62ea 100644 --- a/bsd/net/pktsched/pktsched.c +++ b/bsd/net/pktsched/pktsched.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -70,6 +71,95 @@ SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched" SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, &pktsched_verbose, 0, "Packet scheduler verbosity level"); +static void +pktsched_teardown_noop(__unused struct ifclassq *ifq) +{ + return; +} + +static int +pktsched_request_noop(struct ifclassq *ifq, cqrq_t rq, void *arg) +{ +#pragma unused(ifq, rq, arg) + return ENXIO; +} + +static int +pktsched_getqstats_noop(struct ifclassq *ifq, + uint8_t gid, u_int32_t qid, + struct if_ifclassq_stats *ifqs) +{ +#pragma unused(ifq, gid, qid, ifqs) + return ENXIO; +} + +static int +pktsched_enqueue_noop(struct ifclassq *ifq, + classq_pkt_t *h, classq_pkt_t *t, uint32_t cnt, + uint32_t bytes, boolean_t *pdrop) +{ + pktsched_pkt_t pkt; + pktsched_pkt_encap_chain(&pkt, h, t, cnt, bytes); + if (__improbable(droptap_verbose > 0)) { + pktsched_drop_pkt(&pkt, ifq->ifcq_ifp, DROP_REASON_AQM_BK_SYS_THROTTLED, + __func__, __LINE__, 0); + } else { + pktsched_free_pkt(&pkt); + } + + *pdrop = true; + return ENXIO; +} + +static int +pktsched_dequeue_noop(struct ifclassq *ifq, + u_int32_t maxpktcnt, u_int32_t maxbytecnt, + classq_pkt_t *first_packet, classq_pkt_t *last_packet, + u_int32_t *retpktcnt, u_int32_t *retbytecnt, + uint8_t grp_idx) +{ +#pragma unused(ifq, maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx) + return ENXIO; +} + +static int 
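/*
 * The pktsched_*_noop() handlers added above give an unconfigured queue a
 * complete ops table whose entries simply fail with ENXIO (the enqueue
 * no-op also disposes of the packets it was handed).  A compact userspace
 * sketch of that null-object pattern with stand-in types; the field and
 * function names here are illustrative, not the kernel's.
 */
#include <errno.h>
#include <stdio.h>

struct queue;                         /* opaque stand-in for struct ifclassq */

struct sched_ops {
    int  (*enq)(struct queue *, int pktcnt);
    int  (*deq)(struct queue *, int maxpkt);
    void (*teardown)(struct queue *);
};

static int
noop_enq(struct queue *q, int pktcnt)
{
    (void)q;
    printf("noop enqueue: dropping %d packet(s)\n", pktcnt);
    return ENXIO;                     /* queue not set up: refuse, never crash */
}

static int
noop_deq(struct queue *q, int maxpkt)
{
    (void)q; (void)maxpkt;
    return ENXIO;
}

static void
noop_teardown(struct queue *q)
{
    (void)q;                          /* nothing to tear down */
}

static const struct sched_ops noop_ops = {
    .enq = noop_enq,
    .deq = noop_deq,
    .teardown = noop_teardown,
};

int
main(void)
{
    /* Callers can always dispatch through the table, even before setup. */
    int err = noop_ops.enq(NULL, 3);

    printf("enqueue returned %d\n", err);
    return 0;
}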
+pktsched_dequeue_sc_noop(struct ifclassq *ifq, + mbuf_svc_class_t svc, u_int32_t maxpktcnt, + u_int32_t maxbytecnt, classq_pkt_t *first_packet, + classq_pkt_t *last_packet, u_int32_t *retpktcnt, + u_int32_t *retbytecnt, uint8_t grp_idx) +{ +#pragma unused(ifq, svc, maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx) + return ENXIO; +} + +static int +pktsched_setup_noop(struct ifclassq *ifq, u_int32_t flags, + classq_pkt_type_t ptype) +{ +#pragma unused(ifq, flags, ptype) + return ENXIO; +} + +static boolean_t +pktsched_allow_dequeue_noop(struct ifclassq *ifq) +{ +#pragma unused(ifq) + return false; +} + +struct pktsched_ops pktsched_noops = { + .ps_id = PKTSCHEDT_NONE, + .ps_setup = pktsched_setup_noop, + .ps_teardown = pktsched_teardown_noop, + .ps_enq = pktsched_enqueue_noop, + .ps_deq = pktsched_dequeue_noop, + .ps_deq_sc = pktsched_dequeue_sc_noop, + .ps_req = pktsched_request_noop, + .ps_stats = pktsched_getqstats_noop, + .ps_allow_dequeue = pktsched_allow_dequeue_noop, +}; + void pktsched_init(void) { @@ -78,6 +168,7 @@ pktsched_init(void) panic("%s: no CPU clock available!", __func__); /* NOTREACHED */ } + pktsched_ops_register(&pktsched_noops); pktsched_fq_init(); } @@ -113,11 +204,12 @@ pktsched_nsecs_to_abstime(u_int64_t nsecs) } int -pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags, +pktsched_setup(struct ifclassq *ifq, u_int8_t scheduler, u_int32_t sflags, classq_pkt_type_t ptype) { int error = 0; u_int32_t rflags; + pktsched_ops_t *ops; IFCQ_LOCK_ASSERT_HELD(ifq); @@ -135,6 +227,17 @@ pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags, rflags = (ifq->ifcq_flags & IFCQF_ENABLED); if (ifq->ifcq_type != PKTSCHEDT_NONE) { + /* Don't support changing qdisc for fq_codel that has multiple groups */ + if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) { + fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; + uint8_t grp_idx; + for (grp_idx = 1; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) { + if (fqs->fqs_classq_groups[grp_idx] != NULL) { + return ENOTSUP; + } + } + } + pktsched_teardown(ifq); /* Teardown should have succeeded */ @@ -142,10 +245,16 @@ pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags, VERIFY(ifq->ifcq_disc == NULL); } - error = fq_if_setup_ifclassq(ifq, sflags, ptype); + ops = pktsched_ops_find(scheduler); + ASSERT(ops != NULL); + ifq->ifcq_ops = ops; + error = ops->ps_setup(ifq, sflags, ptype); if (error == 0) { ifq->ifcq_flags |= rflags; } + if (ops->ps_ops_flags & PKTSCHED_OPS_LOCKLESS) { + ifq->ifcq_flags |= IFCQF_LOCKLESS; + } return error; } @@ -154,30 +263,21 @@ void pktsched_teardown(struct ifclassq *ifq) { IFCQ_LOCK_ASSERT_HELD(ifq); - if_qflush(ifq->ifcq_ifp, ifq, true); + ifq->ifcq_ops->ps_req(ifq, CLASSQRQ_PURGE, 0); VERIFY(IFCQ_IS_EMPTY(ifq)); ifq->ifcq_flags &= ~IFCQF_ENABLED; - if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) { - /* Could be PKTSCHEDT_NONE */ - fq_if_teardown_ifclassq(ifq); - } + ifq->ifcq_ops->ps_teardown(ifq); return; } +// TODO: change function signature to be more generic int pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid, struct if_ifclassq_stats *ifqs) { - int error = 0; - IFCQ_LOCK_ASSERT_HELD(ifq); - if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) { - /* Could be PKTSCHEDT_NONE */ - error = fq_if_getqstats_ifclassq(ifq, (uint8_t)gid, qid, ifqs); - } - - return error; + return ifq->ifcq_ops->ps_stats(ifq, (uint8_t)gid, qid, ifqs); } void @@ -414,6 +514,7 @@ pktsched_drop_pkt(pktsched_pkt_t *pkt, struct ifnet *ifp, drop_reason_t reason, } 
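/*
 * pktsched_setup() above now resolves the scheduler by id through the
 * registered ops list and dispatches ps_setup() through the table, copying
 * the PKTSCHED_OPS_LOCKLESS flag into the queue flags on success.  A small
 * sketch of that lookup-and-dispatch flow with stand-in names; find_ops()
 * models pktsched_ops_find() over a fixed array rather than a kernel list.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define OPS_LOCKLESS 0x1

struct ops {
    uint8_t  id;
    uint8_t  flags;
    int    (*setup)(void);
};

static int setup_a(void) { printf("scheduler A set up\n"); return 0; }
static int setup_b(void) { printf("scheduler B set up\n"); return 0; }

static const struct ops ops_table[] = {
    { .id = 7, .flags = 0,            .setup = setup_a },
    { .id = 8, .flags = OPS_LOCKLESS, .setup = setup_b },
};

static const struct ops *
find_ops(uint8_t id)
{
    for (size_t i = 0; i < sizeof(ops_table) / sizeof(ops_table[0]); i++) {
        if (ops_table[i].id == id) {
            return &ops_table[i];
        }
    }
    return NULL;
}

int
main(void)
{
    uint8_t queue_flags = 0;
    const struct ops *ops = find_ops(8);

    if (ops == NULL) {
        return 1;                     /* unknown scheduler id */
    }
    if (ops->setup() == 0 && (ops->flags & OPS_LOCKLESS)) {
        queue_flags |= OPS_LOCKLESS;  /* mirrors IFCQF_LOCKLESS */
    }
    printf("queue flags: 0x%x\n", queue_flags);
    return 0;
}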
droptap_output_packet(SK_PKT2PH(kpkt), reason, funcname, linenum, flags, ifp, kpkt->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0); + pktsched_free_pkt(pkt); break; } #endif /* SKYWALK */ @@ -423,8 +524,6 @@ pktsched_drop_pkt(pktsched_pkt_t *pkt, struct ifnet *ifp, drop_reason_t reason, /* NOTREACHED */ __builtin_unreachable(); } - - pktsched_free_pkt(pkt); } mbuf_svc_class_t @@ -556,16 +655,13 @@ pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how) break; } - _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) == - sizeof(fce->fce_flowid)); + static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(fce->fce_flowid)); fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc; fce->fce_flowid = m->m_pkthdr.pkt_flowid; #if SKYWALK - _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == - sizeof(fce->fce_flowsrc_token)); - _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == - sizeof(fce->fce_flowsrc_fidx)); + static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(fce->fce_flowsrc_token)); + static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(fce->fce_flowsrc_fidx)); if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) { fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx; @@ -585,12 +681,9 @@ pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how) break; } - _CASSERT(sizeof(fce->fce_flowid) == - sizeof(kp->pkt_flow_token)); - _CASSERT(sizeof(fce->fce_flowsrc_fidx) == - sizeof(kp->pkt_flowsrc_fidx)); - _CASSERT(sizeof(fce->fce_flowsrc_token) == - sizeof(kp->pkt_flowsrc_token)); + static_assert(sizeof(fce->fce_flowid) == sizeof(kp->pkt_flow_token)); + static_assert(sizeof(fce->fce_flowsrc_fidx) == sizeof(kp->pkt_flowsrc_fidx)); + static_assert(sizeof(fce->fce_flowsrc_token) == sizeof(kp->pkt_flowsrc_token)); ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV); fce->fce_flowsrc_type = kp->pkt_flowsrc_type; @@ -611,43 +704,6 @@ pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how) return fce; } -uint32_t * -pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags) -{ - uint32_t *hashp = NULL; - - switch (pkt->pktsched_ptype) { - case QP_MBUF: { - struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr); - - _CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t)); - _CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t)); - *sfb_flags = &pkth->pkt_mpriv_flags; - hashp = &pkth->pkt_mpriv_hash; - break; - } - -#if SKYWALK - case QP_PACKET: { - struct __kern_packet *kp = pkt->pktsched_pkt_kpkt; - - _CASSERT(sizeof(kp->pkt_classq_hash) == sizeof(uint32_t)); - _CASSERT(sizeof(kp->pkt_classq_flags) == sizeof(uint32_t)); - *sfb_flags = &kp->pkt_classq_flags; - hashp = &kp->pkt_classq_hash; - break; - } -#endif /* SKYWALK */ - - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - } - - return hashp; -} - static int pktsched_mbuf_mark_ecn(struct mbuf* m) { diff --git a/bsd/net/pktsched/pktsched.h b/bsd/net/pktsched/pktsched.h index b297660cf..be5bae01a 100644 --- a/bsd/net/pktsched/pktsched.h +++ b/bsd/net/pktsched/pktsched.h @@ -43,7 +43,8 @@ extern "C" { #define PKTSCHEDT_TCQ 5 /* traffic class queue */ #define PKTSCHEDT_QFQ 6 /* quick fair queueing */ #define PKTSCHEDT_FQ_CODEL 7 /* Flow queues with CoDel */ -#define PKTSCHEDT_MAX 8 /* should be max sched type + 1 */ +#define PKTSCHEDT_FQ_CODEL_NEW 8 /* Flow queues with RFC compliant CoDel */ +#define PKTSCHEDT_MAX 9 /* should be max sched type + 1 */ #ifdef BSD_KERNEL_PRIVATE #include @@ -185,7 +186,7 @@ struct if_ifclassq_stats; extern void pktsched_register_m_tag(void); extern void pktsched_init(void); -extern 
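/*
 * The pktsched hunks above route packet frees through the droptap reporting
 * path only when droptap verbosity is enabled, and otherwise fall back to a
 * plain free (see the enqueue no-op earlier and the SKYWALK case that now
 * frees inside the switch).  A toy model of that "report only when someone
 * is listening" split, with stand-in names throughout.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned int droptap_verbose;      /* models the global verbosity knob */

struct pkt { int len; };

static void
pkt_free(struct pkt *p)
{
    free(p);
}

static void
pkt_drop_reported(struct pkt *p, int reason)
{
    /* Models the droptap output path: emit the reason, then free. */
    printf("droptap: reason=%d len=%d\n", reason, p->len);
    pkt_free(p);
}

static void
drop_pkt(struct pkt *p, int reason)
{
    if (droptap_verbose > 0) {
        pkt_drop_reported(p, reason);     /* slow path, observable */
    } else {
        pkt_free(p);                      /* fast path, silent     */
    }
}

int
main(void)
{
    struct pkt *p = malloc(sizeof(*p));

    if (p == NULL) {
        return 1;
    }
    p->len = 1500;
    droptap_verbose = 1;
    drop_pkt(p, 42);
    return 0;
}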
int pktsched_setup(struct ifclassq *, u_int32_t, u_int32_t, +extern int pktsched_setup(struct ifclassq *, u_int8_t, u_int32_t, classq_pkt_type_t); extern void pktsched_teardown(struct ifclassq *); extern int pktsched_getqstats(struct ifclassq *, u_int32_t, u_int32_t, @@ -199,7 +200,6 @@ extern int pktsched_clone_pkt(pktsched_pkt_t *, pktsched_pkt_t *); extern void pktsched_corrupt_packet(pktsched_pkt_t *pkt); extern void pktsched_get_pkt_vars(pktsched_pkt_t *, volatile uint32_t **, uint64_t **, uint32_t *, uint8_t *, uint8_t *, uint32_t *, uint64_t *); -extern uint32_t *pktsched_get_pkt_sfb_vars(pktsched_pkt_t *, uint32_t **); extern void pktsched_pkt_encap(pktsched_pkt_t *, classq_pkt_t *); extern void pktsched_pkt_encap_chain(pktsched_pkt_t *, classq_pkt_t *, classq_pkt_t *, uint32_t, uint32_t); diff --git a/bsd/net/pktsched/pktsched_fq_codel.c b/bsd/net/pktsched/pktsched_fq_codel.c index 70fad7bd8..4e4c65322 100644 --- a/bsd/net/pktsched/pktsched_fq_codel.c +++ b/bsd/net/pktsched/pktsched_fq_codel.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include /* for PE_parse_boot_argn */ @@ -42,6 +43,8 @@ #include #include +#include + #define FQ_CODEL_DEFAULT_QUANTUM 1500 #define FQ_CODEL_QUANTUM_BK_SYS(_q) (_q) @@ -55,22 +58,63 @@ #define FQ_CODEL_QUANTUM_VO(_q) ((_q * 2) / 5) #define FQ_CODEL_QUANTUM_CTL(_q) ((_q * 2) / 5) +#define IFQ_DEF_C_TARGET_DELAY (10ULL * 1000 * 1000) /* 10 ms */ +#define IFQ_DEF_C_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */ +#define IFQ_DEF_L4S_TARGET_DELAY (2ULL * 1000 * 1000) /* 2 ms */ +#define IFQ_DEF_L4S_WIRELESS_TARGET_DELAY (15ULL * 1000 * 1000) /* 15 ms */ +#define IFQ_DEF_L4S_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */ +#define IFQ_LL_C_TARGET_DELAY (10ULL * 1000 * 1000) /* 10 ms */ +#define IFQ_LL_C_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */ +#define IFQ_LL_L4S_TARGET_DELAY (2ULL * 1000 * 1000) /* 2 ms */ +#define IFQ_LL_L4S_WIRELESS_TARGET_DELAY (15ULL * 1000 * 1000) /* 15 ms */ +#define IFQ_LL_L4S_UPDATE_INTERVAL (100ULL * 1000 * 1000) /* 100 ms */ + +static uint64_t fq_if_def_c_target_qdelay = 0; +static uint64_t fq_if_def_c_update_interval = 0; +static uint64_t fq_if_def_l4s_target_qdelay = 0; +static uint64_t fq_if_def_l4s_update_interval = 0; +static uint64_t fq_if_ll_c_target_qdelay = 0; +static uint64_t fq_if_ll_c_update_interval = 0; +static uint64_t fq_if_ll_l4s_target_qdelay = 0; +static uint64_t fq_if_ll_l4s_update_interval = 0; + +uint32_t fq_codel_quantum = 0; + static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT); static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT); SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "FQ-CODEL parameters"); -SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, fq_enable_pacing, CTLFLAG_RW | CTLFLAG_LOCKED, - &ifclassq_enable_pacing, 0, "Enable pacing"); - static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY; #if (DEVELOPMENT || DEBUG) +SYSCTL_EXTENSIBLE_NODE(_net_classq_fq_codel, OID_AUTO, params, + CTLFLAG_RW | CTLFLAG_LOCKED, 0, "classq fq codel parameters"); + SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &fq_empty_purge_delay, "Empty flow queue purge delay (ns)"); + +SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, + &fq_if_def_c_target_qdelay, "classic target queue delay in nanoseconds"); + +SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, update_interval, + CTLFLAG_RW | CTLFLAG_LOCKED, &fq_if_def_c_update_interval, + 
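/*
 * The IFQ_*_TARGET_DELAY / *_UPDATE_INTERVAL macros above express their
 * values in nanoseconds, e.g. (10ULL * 1000 * 1000) for 10 ms.  A small
 * sanity-check sketch of that unit convention; NSEC_PER_MSEC here is a
 * local convenience, not a claim about which kernel constant the code uses.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

#define DEF_C_TARGET_DELAY_NS   (10ULL * 1000 * 1000)   /* 10 ms  */
#define DEF_L4S_TARGET_DELAY_NS (2ULL * 1000 * 1000)    /* 2 ms   */
#define DEF_UPDATE_INTERVAL_NS  (100ULL * 1000 * 1000)  /* 100 ms */

int
main(void)
{
    printf("classic target:  %" PRIu64 " ms\n",
        (uint64_t)(DEF_C_TARGET_DELAY_NS / NSEC_PER_MSEC));
    printf("L4S target:      %" PRIu64 " ms\n",
        (uint64_t)(DEF_L4S_TARGET_DELAY_NS / NSEC_PER_MSEC));
    printf("update interval: %" PRIu64 " ms\n",
        (uint64_t)(DEF_UPDATE_INTERVAL_NS / NSEC_PER_MSEC));
    return 0;
}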
"classic update interval in nanoseconds"); #endif /* !DEVELOPMENT && !DEBUG */ -unsigned int ifclassq_enable_pacing = 1; +unsigned int fq_codel_enable_pacing = 1; +SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, enable_pacing, CTLFLAG_RW | CTLFLAG_LOCKED, + &fq_codel_enable_pacing, 0, "Enable pacing"); + +uint32_t fq_codel_enable_l4s = 1; +SYSCTL_UINT(_net_classq_fq_codel, OID_AUTO, enable_l4s, + CTLFLAG_RW | CTLFLAG_LOCKED, &fq_codel_enable_l4s, 0, + "enable/disable L4S"); + +uint32_t fq_codel_enable_ecn = 0; +SYSCTL_UINT(_net_classq_fq_codel, OID_AUTO, enable_ecn, + CTLFLAG_RW | CTLFLAG_LOCKED, &fq_codel_enable_ecn, 0, + "enable/disable ECN for classic traffic"); typedef STAILQ_HEAD(, flowq) flowq_dqlist_t; @@ -78,7 +122,7 @@ static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t); static void fq_if_destroy(fq_if_t *fqs); static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority, uint32_t quantum, uint32_t drr_max, uint32_t svc_class); -static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t, +static void fq_if_dequeue_class(fq_if_t *, fq_if_classq_t *, uint32_t, int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *, uint32_t *, flowq_dqlist_t *, bool, uint64_t, bool*, uint64_t*); void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat); @@ -93,7 +137,7 @@ static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq); static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all); static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now); -static int fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, +static int fq_if_dequeue_sc_separate(struct ifclassq *ifq, mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx); @@ -102,6 +146,11 @@ static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp); static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx); static void fq_if_destroy_grps(fq_if_t *fqs); +static void fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx); +static void fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx); +static void fq_if_calc_target_qdelay(struct ifnet *ifp, uint64_t *if_target_qdelay, + uint32_t flags); +static void fq_if_calc_update_interval(uint64_t *update_interval, uint32_t flags); uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = { [FQ_IF_CTL_INDEX] = 8, @@ -140,6 +189,26 @@ static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state, fq_if_state src_state); +void fq_if_teardown(struct ifclassq *ifq); +int fq_if_request(struct ifclassq *ifq, cqrq_t rq, void *arg); +int fq_if_getqstats(struct ifclassq *ifq, uint8_t gid, + u_int32_t qid, struct if_ifclassq_stats *ifqs); +int fq_if_enqueue(struct ifclassq *ifq, classq_pkt_t *h, + classq_pkt_t *t, uint32_t cnt, uint32_t bytes, boolean_t *pdrop); +int fq_if_dequeue(struct ifclassq *ifq, u_int32_t maxpktcnt, + u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, + u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx); +int fq_if_dequeue_sc(struct ifclassq *ifq, + mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt, + classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, + u_int32_t *retbytecnt, uint8_t grp_idx); +int 
fq_if_setup_legacy(struct ifclassq *ifq, u_int32_t flags, + classq_pkt_type_t ptype); +boolean_t fq_if_allow_dequeue(struct ifclassq *ifq); + +int fq_if_setup_new(struct ifclassq *ifq, u_int32_t flags, + classq_pkt_type_t ptype); + bitmap_ops_t fq_if_grps_bitmap_ops = { .ffs = fq_if_grps_bitmap_ffs, @@ -160,13 +229,35 @@ bitmap_ops_t fq_if_grps_sc_bitmap_ops = static uint32_t fq_if_hash_table_size; -extern int serverperfmode; // Temporary to resolve build dependency +struct pktsched_ops fq_codel_classq_ops = { + .ps_id = PKTSCHEDT_FQ_CODEL, + .ps_setup = fq_if_setup_legacy, + .ps_teardown = fq_if_teardown, + .ps_enq = fq_if_enqueue, + .ps_deq = fq_if_dequeue, + .ps_deq_sc = fq_if_dequeue_sc, + .ps_req = fq_if_request, + .ps_stats = fq_if_getqstats, + .ps_allow_dequeue = fq_if_allow_dequeue, +}; + +struct pktsched_ops new_fq_codel_classq_ops = { + .ps_id = PKTSCHEDT_FQ_CODEL_NEW, + .ps_setup = fq_if_setup_new, + .ps_teardown = fq_if_teardown, + .ps_enq = fq_if_enqueue, + .ps_deq = fq_if_dequeue, + .ps_deq_sc = fq_if_dequeue_sc, + .ps_req = fq_if_request, + .ps_stats = fq_if_getqstats, + .ps_allow_dequeue = fq_if_allow_dequeue, +}; void pktsched_fq_init(void) { - PE_parse_boot_argn("ifclassq_enable_pacing", &ifclassq_enable_pacing, - sizeof(ifclassq_enable_pacing)); + pktsched_ops_register(&fq_codel_classq_ops); + pktsched_ops_register(&new_fq_codel_classq_ops); if (serverperfmode) { fq_if_hash_table_size = (1 << 16); @@ -193,6 +284,32 @@ pktsched_fq_init(void) pri_index += 1; drr = 0; } + +#if DEBUG || DEVELOPMENT + PE_parse_boot_argn("fq_codel_quantum", &fq_codel_quantum, + sizeof(fq_codel_quantum)); + PE_parse_boot_argn("fq_if_def_c_target_qdelay", &fq_if_def_c_target_qdelay, + sizeof(fq_if_def_c_target_qdelay)); + PE_parse_boot_argn("fq_if_def_c_update_interval", + &fq_if_def_c_update_interval, sizeof(fq_if_def_c_update_interval)); + PE_parse_boot_argn("fq_if_def_l4s_target_qdelay", &fq_if_def_l4s_target_qdelay, + sizeof(fq_if_def_l4s_target_qdelay)); + PE_parse_boot_argn("fq_if_def_l4s_update_interval", + &fq_if_def_l4s_update_interval, sizeof(fq_if_def_l4s_update_interval)); + PE_parse_boot_argn("fq_if_ll_c_target_qdelay", &fq_if_ll_c_target_qdelay, + sizeof(fq_if_ll_c_target_qdelay)); + PE_parse_boot_argn("fq_if_ll_c_update_interval", + &fq_if_ll_c_update_interval, sizeof(fq_if_ll_c_update_interval)); + PE_parse_boot_argn("fq_if_ll_l4s_target_qdelay", &fq_if_ll_l4s_target_qdelay, + sizeof(fq_if_ll_l4s_target_qdelay)); + PE_parse_boot_argn("fq_if_ll_l4s_update_interval", + &fq_if_ll_l4s_update_interval, sizeof(fq_if_ll_l4s_update_interval)); +#endif /* DEBUG || DEVELOPMENT */ + + PE_parse_boot_argn("fq_codel_enable_pacing", &fq_codel_enable_pacing, + sizeof(fq_codel_enable_pacing)); + + fq_codel_init(); } static uint32_t @@ -255,7 +372,11 @@ fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, while (fq->fq_deficit > 0 && limit_reached == FALSE && !KPKTQ_EMPTY(&fq->fq_kpktq) && fq_tx_time_ready(fqs, fq, now, NULL)) { _PKTSCHED_PKT_INIT(&pkt); - fq_getq_flow(fqs, fq, &pkt, now); + fqs->fqs_dequeue(fqs, fq, &pkt, now); + if (pkt.pktsched_pcnt == 0) { + continue; + } + ASSERT(pkt.pktsched_ptype == QP_PACKET); plen = pktsched_get_pkt_len(&pkt); @@ -310,7 +431,11 @@ fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq, while (fq->fq_deficit > 0 && limit_reached == FALSE && !MBUFQ_EMPTY(&fq->fq_mbufq) && fq_tx_time_ready(fqs, fq, now, NULL)) { _PKTSCHED_PKT_INIT(&pkt); - fq_getq_flow(fqs, fq, &pkt, now); + fqs->fqs_dequeue(fqs, fq, &pkt, now); + if (pkt.pktsched_pcnt 
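/*
 * pktsched_fq_init() above now registers two ops tables, one for the legacy
 * fq_codel scheduler and one for the RFC-style variant, and
 * pktsched_ops_register() (later in this patch) refuses duplicate ids.  A
 * userspace sketch of that registry, assuming the BSD-style <sys/queue.h>
 * LIST macros; the struct is reduced to an id plus one hook, and the
 * assert-on-duplicate is modeled with a plain assert().
 */
#include <assert.h>
#include <stdio.h>
#include <sys/queue.h>

struct sched_ops {
    unsigned char id;
    int (*setup)(void);
    LIST_ENTRY(sched_ops) link;
};

static LIST_HEAD(, sched_ops) ops_list = LIST_HEAD_INITIALIZER(ops_list);

static void
ops_register(struct sched_ops *new_ops)
{
    struct sched_ops *ops;

    assert(new_ops->setup != NULL);
    LIST_FOREACH(ops, &ops_list, link) {
        assert(ops->id != new_ops->id);   /* scheduler ids must be unique */
    }
    LIST_INSERT_HEAD(&ops_list, new_ops, link);
}

static struct sched_ops *
ops_find(unsigned char id)
{
    struct sched_ops *ops;

    LIST_FOREACH(ops, &ops_list, link) {
        if (ops->id == id) {
            return ops;
        }
    }
    return NULL;
}

static int setup_legacy(void) { return 0; }
static int setup_new(void)    { return 0; }

static struct sched_ops legacy_ops = { .id = 7, .setup = setup_legacy };
static struct sched_ops newer_ops  = { .id = 8, .setup = setup_new };

int
main(void)
{
    ops_register(&legacy_ops);
    ops_register(&newer_ops);
    printf("found id 8: %s\n", ops_find(8) != NULL ? "yes" : "no");
    return 0;
}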
== 0) { + continue; + } + ASSERT(pkt.pktsched_ptype == QP_MBUF); plen = pktsched_get_pkt_len(&pkt); @@ -360,6 +485,114 @@ fq_if_pacemaker_tcall(thread_call_param_t arg0, thread_call_param_t arg1) ifnet_start_ignore_delay(ifp); } +static void +fq_if_calc_target_qdelay(struct ifnet *ifp, uint64_t *if_target_qdelay, + uint32_t flags) +{ + uint64_t qdelay = 0, qdelay_configed = 0, qdely_default = 0; + if (flags == IF_CLASSQ_DEF) { + qdelay = IFCQ_TARGET_QDELAY(ifp->if_snd); + } + + switch (flags) { + case IF_CLASSQ_DEF: + qdelay_configed = fq_if_def_c_target_qdelay; + qdely_default = IFQ_DEF_C_TARGET_DELAY; + break; + case IF_CLASSQ_L4S: + qdelay_configed = fq_if_def_l4s_target_qdelay; + if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI || + ifp->if_family == IFNET_FAMILY_CELLULAR) { + qdely_default = IFQ_DEF_L4S_WIRELESS_TARGET_DELAY; + } else { + qdely_default = IFQ_DEF_L4S_TARGET_DELAY; + } + break; + case IF_CLASSQ_LOW_LATENCY: + qdelay_configed = fq_if_ll_c_target_qdelay; + qdely_default = IFQ_LL_C_TARGET_DELAY; + break; + case (IF_CLASSQ_LOW_LATENCY | IF_CLASSQ_L4S): + qdelay_configed = fq_if_ll_l4s_target_qdelay; + if (ifp->if_subfamily == IFNET_SUBFAMILY_WIFI || + ifp->if_family == IFNET_FAMILY_CELLULAR) { + qdely_default = IFQ_LL_L4S_WIRELESS_TARGET_DELAY; + } else { + qdely_default = IFQ_LL_L4S_TARGET_DELAY; + } + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); + } + + if (qdelay_configed != 0) { + qdelay = qdelay_configed; + } + + /* + * If we do not know the effective bandwidth, use the default + * target queue delay. + */ + if (qdelay == 0) { + qdelay = qdely_default; + } + + /* + * If a delay has been added to ifnet start callback for + * coalescing, we have to add that to the pre-set target delay + * because the packets can be in the queue longer. 
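/*
 * A condensed model of the fq_if_calc_target_qdelay() precedence shown
 * above: a nonzero boot-arg/sysctl override wins, otherwise the interface's
 * own estimate (only the default/classic class supplies one), and only then
 * the per-class default, with wireless links getting the larger L4S default.
 * Values are nanoseconds; the function and parameter names are stand-ins.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEF_C_TARGET_NS            (10ULL * 1000 * 1000)  /* 10 ms */
#define DEF_L4S_TARGET_NS          (2ULL * 1000 * 1000)   /* 2 ms  */
#define DEF_L4S_WIRELESS_TARGET_NS (15ULL * 1000 * 1000)  /* 15 ms */

static uint64_t
pick_target_qdelay(uint64_t override_ns, uint64_t if_estimate_ns, uint64_t default_ns)
{
    uint64_t qdelay = if_estimate_ns;

    if (override_ns != 0) {
        qdelay = override_ns;         /* explicit tuning always wins    */
    }
    if (qdelay == 0) {
        qdelay = default_ns;          /* fall back to the class default */
    }
    return qdelay;
}

int
main(void)
{
    /* Classic class on a link that advertises a 7 ms estimate. */
    printf("classic:  %" PRIu64 " ns\n",
        pick_target_qdelay(0, 7ULL * 1000 * 1000, DEF_C_TARGET_NS));
    /* L4S class on a wireless link with nothing configured. */
    printf("l4s/wifi: %" PRIu64 " ns\n",
        pick_target_qdelay(0, 0, DEF_L4S_WIRELESS_TARGET_NS));
    /* A 5 ms override beats both the estimate and the default. */
    printf("override: %" PRIu64 " ns\n",
        pick_target_qdelay(5ULL * 1000 * 1000, 0, DEF_L4S_TARGET_NS));
    return 0;
}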
+ */ + if ((ifp->if_eflags & IFEF_ENQUEUE_MULTI) && + ifp->if_start_delay_timeout > 0) { + qdelay += ifp->if_start_delay_timeout; + } + + *(if_target_qdelay) = qdelay; +} + +static void +fq_if_calc_update_interval(uint64_t *update_interval, uint32_t flags) +{ + uint64_t interval = 0, interval_configed = 0, interval_default = 0; + + switch (flags) { + case IF_CLASSQ_DEF: + interval_configed = fq_if_def_c_update_interval; + interval_default = IFQ_DEF_C_UPDATE_INTERVAL; + break; + case IF_CLASSQ_L4S: + interval_configed = fq_if_def_l4s_update_interval; + interval_default = IFQ_DEF_L4S_UPDATE_INTERVAL; + break; + case IF_CLASSQ_LOW_LATENCY: + interval_configed = fq_if_ll_c_update_interval; + interval_default = IFQ_LL_C_UPDATE_INTERVAL; + break; + case (IF_CLASSQ_LOW_LATENCY | IF_CLASSQ_L4S): + interval_configed = fq_if_ll_l4s_update_interval; + interval_default = IFQ_LL_L4S_UPDATE_INTERVAL; + break; + default: + VERIFY(0); + /* NOTREACHED */ + __builtin_unreachable(); + } + + /* If the system level override is set, use it */ + if (interval_configed != 0) { + interval = interval_configed; + } + + /* Otherwise use the default value */ + if (interval == 0) { + interval = interval_default; + } + + *update_interval = interval; +} + fq_if_t * fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype) { @@ -419,6 +652,11 @@ fq_if_destroy(fq_if_t *fqs) fqs->fqs_ifq = NULL; +#if (DEBUG || DEVELOPMENT) + struct skoid *fqs_skoid = (struct skoid *)&fqs->fqs_oid; + skoid_destroy(fqs_skoid); +#endif /* (DEBUG || DEVELOPMENT) */ + kfree_type_counted_by(flowq_list_t, fqs->fqs_flows_count, fqs->fqs_flows); zfree(fq_if_zone, fqs); } @@ -519,7 +757,7 @@ fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum, } int -fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head, +fq_if_enqueue(struct ifclassq *ifq, classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop) { uint8_t pri, grp_idx = 0; @@ -532,6 +770,7 @@ fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head, pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes); + IFCQ_LOCK_SPIN(ifq); fqs = (fq_if_t *)ifq->ifcq_disc; svc = pktsched_get_pkt_svc(&pkt); #if SKYWALK @@ -542,7 +781,6 @@ fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head, pri = fq_if_service_to_priority(fqs, svc); VERIFY(pri < FQ_IF_MAX_CLASSES); - IFCQ_LOCK_SPIN(ifq); fq_group = fq_if_find_grp(fqs, grp_idx); fq_cl = &fq_group->fqg_classq[pri]; @@ -562,7 +800,7 @@ fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head, } ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype); - ret = fq_addq(fqs, fq_group, &pkt, fq_cl); + ret = fqs->fqs_enqueue(fqs, fq_group, &pkt, fq_cl); if (!FQ_IF_CLASSQ_IDLE(fq_cl)) { if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) & (1 << pri)) == 0) { @@ -582,6 +820,9 @@ fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head, } else if (ret == CLASSQEQ_COMPRESSED) { ret = 0; *pdrop = FALSE; + } else if (ret == CLASSQEQ_CONGESTED) { + ret = EQCONGESTED; + *pdrop = FALSE; } else { IFCQ_UNLOCK(ifq); *pdrop = TRUE; @@ -625,21 +866,6 @@ done: return ret; } -void -fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt, uint8_t grp_idx) -{ - (void) fq_if_dequeue_classq_multi(ifq, 1, - CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx); -} - -void -fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc, - classq_pkt_t *pkt, uint8_t grp_idx) -{ - (void) fq_if_dequeue_sc_classq_multi(ifq, svc, 1, - CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, 
grp_idx); -} - static inline void fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq) { @@ -854,7 +1080,7 @@ static void fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time) { uint64_t deadline = 0; - if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) { + if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) { return; } ASSERT(next_tx_time != FQ_INVALID_TX_TS); @@ -871,7 +1097,7 @@ fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time) } static int -fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc, +fq_if_dequeue_common(struct ifclassq *ifq, mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx) @@ -941,7 +1167,7 @@ fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc, fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB, FQ_IF_IB); fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB); if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) { - if (ifclassq_enable_pacing && ifclassq_enable_l4s) { + if (fq_codel_enable_pacing && fq_codel_enable_l4s) { /* * Move fq_cl in IR back to ER, so that they will inspected with priority * the next time the driver dequeues @@ -978,7 +1204,7 @@ fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc, goto state_change; } } - fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt), + fq_if_dequeue_class(fqs, fq_cl, (maxpktcnt - total_pktcnt), (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt, &bytecnt, &fq_dqlist_head, true, now, &fq_cl_all_paced, &fq_cl_next_tx_time); @@ -1013,7 +1239,7 @@ state_change: pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]); pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]); } else if (fq_cl_all_paced) { - if (ifclassq_enable_pacing && ifclassq_enable_l4s) { + if (fq_codel_enable_pacing && fq_codel_enable_l4s) { /* * If a fq_cl still has budget but only paced queues, park it * to IR so that we will not keep loopping over it @@ -1030,7 +1256,7 @@ state_change: fq_cl->fcl_budget = 0; } if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) { - if (ifclassq_enable_pacing && ifclassq_enable_l4s) { + if (fq_codel_enable_pacing && fq_codel_enable_l4s) { /* * Move fq_cl in IR back to ER, so that they will inspected with priority * the next time the driver dequeues @@ -1072,17 +1298,17 @@ state_change: } int -fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt, +fq_if_dequeue(struct ifclassq *ifq, u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx) { - return fq_if_dequeue_classq_multi_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt, + return fq_if_dequeue_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx); } int -fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc, +fq_if_dequeue_sc(struct ifclassq *ifq, mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx) @@ -1090,20 +1316,20 @@ fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc, fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; if (fq_if_is_grp_combined(fqs, grp_idx)) { - return fq_if_dequeue_classq_multi_common(ifq, svc, maxpktcnt, maxbytecnt, + return fq_if_dequeue_common(ifq, svc, 
maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx); } else { /* * take a shortcut here since there is no need to schedule * one single service class. */ - return fq_if_dequeue_sc_classq_multi_separate(ifq, svc, maxpktcnt, maxbytecnt, + return fq_if_dequeue_sc_separate(ifq, svc, maxpktcnt, maxbytecnt, first_packet, last_packet, retpktcnt, retbytecnt, grp_idx); } } static int -fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, mbuf_svc_class_t svc, +fq_if_dequeue_sc_separate(struct ifclassq *ifq, mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx) @@ -1156,7 +1382,7 @@ fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, mbuf_svc_class_t sv bool all_paced = false; uint64_t next_tx_time = FQ_INVALID_TX_TS; - fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt), + fq_if_dequeue_class(fqs, fq_cl, (maxpktcnt - total_pktcnt), (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt, &bytecnt, &fq_dqlist_head, false, now, &all_paced, &next_tx_time); if (head.cp_mbuf != NULL) { @@ -1229,7 +1455,7 @@ fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp, pkts = bytes = 0; _PKTSCHED_PKT_INIT(&pkt); for (;;) { - fq_getq_flow(fqs, fq, &pkt, now); + fqs->fqs_dequeue(fqs, fq, &pkt, now); if (pkt.pktsched_pkt_mbuf == NULL) { VERIFY(pkt.pktsched_ptype == QP_INVALID); break; @@ -1357,7 +1583,7 @@ fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req) * Packet and traffic type are needed only if we want * to create a flow queue. */ - fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, false, FQ_TFC_C); + fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, 0, 0, FQ_TFC_C, false); if (fq != NULL) { fq_if_purge_flow(fqs, fq, &pkts, &bytes, now); req->bytes += bytes; @@ -1369,7 +1595,7 @@ fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req) static uint32_t fq_if_calc_quantum(struct ifnet *ifp) { - uint32_t quantum; + uint32_t quantum, hwassist_flags; switch (ifp->if_family) { case IFNET_FAMILY_ETHERNET: @@ -1389,7 +1615,8 @@ fq_if_calc_quantum(struct ifnet *ifp) break; } - if ((ifp->if_hwassist & IFNET_TSOF) != 0) { + hwassist_flags = if_get_driver_hwassist(ifp); + if ((hwassist_flags & IFNET_TSOF) != 0) { VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX); VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX); quantum = MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu); @@ -1551,7 +1778,7 @@ fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64 stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt; stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt; - if (ifclassq_enable_pacing && ifclassq_enable_l4s && + if (fq_codel_enable_pacing && fq_codel_enable_l4s && fq_if_is_fq_cl_paced(fq_cl, now)) { stat->packets = 0; stat->bytes = 0; @@ -1564,7 +1791,7 @@ fq_if_is_grp_all_paced(fq_if_group_t *grp) fq_if_classq_t *fq_cl; uint64_t now; - if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) { + if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) { return false; } @@ -1583,14 +1810,14 @@ fq_if_is_grp_all_paced(fq_if_group_t *grp) } boolean_t -fq_if_is_all_paced(struct ifclassq *ifq) +fq_if_allow_dequeue(struct ifclassq *ifq) { fq_if_group_t *grp; fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; IFCQ_LOCK_ASSERT_HELD(ifq); - if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) { + if (!fq_codel_enable_pacing || !fq_codel_enable_l4s) { return false; } @@ -1624,7 +1851,7 @@ fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat) if (stat->grp_idx == 
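/*
 * A sketch of the quantum selection visible above: the base quantum starts
 * at the link MTU (FQ_CODEL_DEFAULT_QUANTUM is 1500 for Ethernet-class
 * links), but when the driver advertises TSO the larger of the v4/v6 TSO
 * sizes is used instead, and each service class then scales that base with
 * FQ_CODEL_QUANTUM_*-style ratios.  Names and the exact ratios shown here
 * are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEFAULT_QUANTUM 1500u

static uint32_t
calc_base_quantum(bool tso_capable, uint32_t tso_v4_mtu, uint32_t tso_v6_mtu)
{
    uint32_t quantum = DEFAULT_QUANTUM;

    if (tso_capable) {
        /* With TSO, one "packet" handed to the driver can be much larger. */
        quantum = tso_v4_mtu > tso_v6_mtu ? tso_v4_mtu : tso_v6_mtu;
    }
    return quantum;
}

int
main(void)
{
    uint32_t q = calc_base_quantum(true, 65535, 65535);

    printf("base quantum: %u\n", q);
    printf("background:   %u\n", q / 2);        /* models FQ_CODEL_QUANTUM_BK(_q) */
    printf("voice:        %u\n", (q * 2) / 5);  /* models FQ_CODEL_QUANTUM_VO(_q) */
    return 0;
}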
IF_CLASSQ_ALL_GRPS) { if (stat->sc == MBUF_SC_UNSPEC) { - if (!fq_if_is_all_paced(fqs->fqs_ifq)) { + if (!fq_if_allow_dequeue(fqs->fqs_ifq)) { stat->packets = IFCQ_LEN(fqs->fqs_ifq); stat->bytes = IFCQ_BYTES(fqs->fqs_ifq); } @@ -1677,7 +1904,7 @@ fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat) } int -fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg) +fq_if_request(struct ifclassq *ifq, cqrq_t rq, void *arg) { int err = 0; fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; @@ -1691,6 +1918,7 @@ fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg) switch (rq) { case CLASSQRQ_PURGE: fq_if_purge(fqs); + VERIFY(IFCQ_IS_EMPTY(ifq)); break; case CLASSQRQ_PURGE_SC: fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg); @@ -1708,9 +1936,36 @@ fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg) return err; } -int -fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags, - classq_pkt_type_t ptype) +#if (DEBUG || DEVELOPMENT) +static int +fq_if_configure_target_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + fq_if_t *__single fqs = arg1; + uint64_t *target_delay; + uint64_t new_target = 0; + int changed; + int error; + + if (fqs->fqs_ifq == NULL || !IFCQ_IS_ENABLED(fqs->fqs_ifq) || fqs->fqs_classq_groups[0] == NULL) { + return ENXIO; + } + + target_delay = &fqs->fqs_classq_groups[0]->fqg_target_qdelays[FQ_TFC_C]; + error = sysctl_io_number(req, *target_delay, + sizeof(*target_delay), &new_target, &changed); + if (error == 0 && changed != 0) { + *target_delay = new_target; + } + return error; +} +#endif /* (DEBUG || DEVELOPMENT) */ + + + +static int +fq_if_setup_common(struct ifclassq *ifq, u_int32_t flags, + classq_pkt_type_t ptype, boolean_t legacy) { fq_if_t *fqs = NULL; int err = 0; @@ -1730,7 +1985,27 @@ fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags, fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops; } - err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs); + if (legacy) { + fqs->fqs_dequeue = fq_codel_dq_legacy; + fqs->fqs_enqueue = fq_codel_enq_legacy; + fqs->fqs_flags |= FQS_LEGACY; + err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs); + } else { + fqs->fqs_dequeue = fq_codel_dq; + fqs->fqs_enqueue = fq_codel_enq; + err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL_NEW, fqs); + } + +#if (DEBUG || DEVELOPMENT) + struct ifnet *ifp = ifq->ifcq_ifp; + struct skoid *fqs_skoid = (struct skoid *)&fqs->fqs_oid; + skoid_create(fqs_skoid, + SKOID_SNODE(_net_classq_fq_codel_params), if_name(ifp), + CTLFLAG_RW); + skoid_add_handler((struct skoid *)fqs_skoid, "target_delay", CTLFLAG_RW, + fq_if_configure_target_sysctl, fqs, 0); +#endif /* (DEBUG || DEVELOPMENT) */ + if (err != 0) { os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, " "failed to attach fq_if: %d\n", __func__, err); @@ -1752,10 +2027,24 @@ fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags, return err; } +int +fq_if_setup_legacy(struct ifclassq *ifq, u_int32_t flags, + classq_pkt_type_t ptype) +{ + return fq_if_setup_common(ifq, flags, ptype, true); +} + +int +fq_if_setup_new(struct ifclassq *ifq, u_int32_t flags, + classq_pkt_type_t ptype) +{ + return fq_if_setup_common(ifq, flags, ptype, false); +} + fq_t * fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, uint32_t flowid, - mbuf_svc_class_t svc_class, uint64_t now, bool create, - fq_tfc_type_t tfc_type) + mbuf_svc_class_t svc_class, uint64_t now, uint8_t pkt_proto, + uint8_t pkt_flowsrc, fq_tfc_type_t tfc_type, bool create) { fq_t *fq = NULL; flowq_list_t *fq_list; @@ -1792,6 +2081,18 @@ fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t 
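/*
 * fq_if_setup_common() above selects the enqueue/dequeue implementations at
 * attach time: the legacy path installs fq_codel_enq_legacy /
 * fq_codel_dq_legacy and keeps the PKTSCHEDT_FQ_CODEL id, while the new
 * path installs fq_codel_enq / fq_codel_dq under PKTSCHEDT_FQ_CODEL_NEW.  A
 * stripped-down model of that hook selection; all names are stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

struct flow_sched {
    int  type_id;
    int (*enqueue)(int len);
    int (*dequeue)(void);
    unsigned int flags;
#define SCHED_LEGACY 0x2              /* models FQS_LEGACY */
};

static int enq_legacy(int len) { printf("legacy enq %d\n", len); return 0; }
static int deq_legacy(void)    { return 0; }
static int enq_new(int len)    { printf("new enq %d\n", len); return 0; }
static int deq_new(void)       { return 0; }

static void
sched_setup(struct flow_sched *s, bool legacy)
{
    if (legacy) {
        s->enqueue = enq_legacy;
        s->dequeue = deq_legacy;
        s->flags  |= SCHED_LEGACY;
        s->type_id = 7;               /* stands in for PKTSCHEDT_FQ_CODEL     */
    } else {
        s->enqueue = enq_new;
        s->dequeue = deq_new;
        s->type_id = 8;               /* stands in for PKTSCHEDT_FQ_CODEL_NEW */
    }
}

int
main(void)
{
    struct flow_sched s = { 0 };

    sched_setup(&s, false);
    s.enqueue(1500);                  /* callers always go through the hook */
    printf("type=%d legacy=%d\n", s.type_id, (s.flags & SCHED_LEGACY) != 0);
    return 0;
}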
*fq_grp, uint32_t flowid, fq->fq_next_tx_time = FQ_INVALID_TX_TS; LIST_INSERT_HEAD(fq_list, fq, fq_hashlink); fq_cl->fcl_stat.fcl_flows_cnt++; + fq->fq_flags |= fq_codel_enable_ecn ? FQF_ECN_CAPABLE : 0; + if ( +#if (DEBUG || DEVELOPMENT) + ifclassq_congestion_feedback && +#endif /* (DEBUG || DEVELOPMENT) */ + tfc_type != FQ_TFC_L4S && + !(fqs->fqs_flags & FQS_LEGACY) && + (pkt_proto == IPPROTO_TCP || pkt_proto == IPPROTO_QUIC) && + (pkt_flowsrc == FLOWSRC_INPCB || pkt_flowsrc == FLOWSRC_CHANNEL)) { + FQ_ENABLE_CONGESTION_FEEDBACK(fq); + fq->fq_flowsrc = pkt_flowsrc; + } } KDBG(AQM_KTRACE_STATS_FLOW_ALLOC, fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash, @@ -1811,7 +2112,7 @@ fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, uint32_t flowid, return fq; } -void +static void fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq) { ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0); @@ -2115,8 +2416,8 @@ fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl) } boolean_t -fq_if_report_ce(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t ce_cnt, - uint32_t pkt_cnt) +fq_if_report_congestion(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t congestion_cnt, + uint32_t l4s_ce_cnt, uint32_t pkt_cnt) { struct flowadv_fcentry *fce; @@ -2131,7 +2432,8 @@ fq_if_report_ce(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t ce_cnt, fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK); if (fce != NULL) { fce->fce_event_type = FCE_EVENT_TYPE_CONGESTION_EXPERIENCED; - fce->fce_ce_cnt = ce_cnt; + fce->fce_congestion_cnt = congestion_cnt; + fce->l4s_ce_cnt = l4s_ce_cnt; fce->fce_pkts_since_last_report = pkt_cnt; flowadv_add_entry(fce); @@ -2141,7 +2443,7 @@ fq_if_report_ce(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t ce_cnt, void -fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit, +fq_if_dequeue_class(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit, int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom, uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist, bool budget_restricted, uint64_t now, bool *fq_cl_paced, @@ -2326,12 +2628,13 @@ done: } void -fq_if_teardown_ifclassq(struct ifclassq *ifq) +fq_if_teardown(struct ifclassq *ifq) { fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc; IFCQ_LOCK_ASSERT_HELD(ifq); - VERIFY(fqs != NULL && ifq->ifcq_type == PKTSCHEDT_FQ_CODEL); + VERIFY(fqs != NULL); + VERIFY(ifq->ifcq_type == PKTSCHEDT_FQ_CODEL || ifq->ifcq_type == PKTSCHEDT_FQ_CODEL_NEW); fq_if_destroy(fqs); ifq->ifcq_disc = NULL; ifclassq_detach(ifq); @@ -2363,7 +2666,7 @@ fq_export_flowstats(fq_if_t *fqs, fq_t *fq, } int -fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, u_int32_t qid, +fq_if_getqstats(struct ifclassq *ifq, uint8_t gid, u_int32_t qid, struct if_ifclassq_stats *ifqs) { struct fq_codel_classstats *fcls; @@ -2427,6 +2730,8 @@ fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, u_int32_t qid, fcls->fcls_ignore_tx_time = fq_cl->fcl_stat.fcl_ignore_tx_time; fcls->fcls_paced_pkts = fq_cl->fcl_stat.fcl_paced_pkts; fcls->fcls_fcl_pacing_needed = fq_cl->fcl_stat.fcl_fcl_pacemaker_needed; + fcls->fcls_high_delay_drop = fq_cl->fcl_stat.fcl_high_delay_drop; + fcls->fcls_congestion_feedback = fq_cl->fcl_stat.fcl_congestion_feedback; /* Gather per flow stats */ flowstat_cnt = min((fcls->fcls_newflows_cnt + @@ -2499,8 +2804,8 @@ fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags) _FQ_CLASSQ_INIT(grp, VO, quantum); } else { /* SIG shares same INDEX with VI */ - _CASSERT(SCIDX_SIG == SCIDX_VI); - _CASSERT(FQ_IF_SIG_INDEX 
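/*
 * fq_if_hash_pkt() above only turns on per-flow congestion feedback when
 * the flow is not L4S, the scheduler is not the legacy variant, the
 * protocol is TCP or QUIC, and the flow source is a socket or a channel.
 * A compact model of that eligibility predicate; the enum values are
 * stand-ins for the kernel constants named in the hunk.
 */
#include <stdbool.h>
#include <stdio.h>

enum proto   { PROTO_TCP, PROTO_UDP, PROTO_QUIC };
enum flowsrc { SRC_INPCB, SRC_CHANNEL, SRC_OTHER };

static bool
congestion_feedback_eligible(bool is_l4s, bool legacy_sched,
    enum proto proto, enum flowsrc src)
{
    if (is_l4s || legacy_sched) {
        return false;                 /* L4S flows and the legacy qdisc opt out */
    }
    if (proto != PROTO_TCP && proto != PROTO_QUIC) {
        return false;                 /* only transports that can react         */
    }
    return src == SRC_INPCB || src == SRC_CHANNEL;
}

int
main(void)
{
    printf("tcp/socket: %d\n",
        congestion_feedback_eligible(false, false, PROTO_TCP, SRC_INPCB));
    printf("udp/socket: %d\n",
        congestion_feedback_eligible(false, false, PROTO_UDP, SRC_INPCB));
    printf("tcp/legacy: %d\n",
        congestion_feedback_eligible(false, true, PROTO_TCP, SRC_INPCB));
    return 0;
}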
== FQ_IF_VI_INDEX); + static_assert(SCIDX_SIG == SCIDX_VI); + static_assert(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX); _FQ_CLASSQ_INIT(grp, BK_SYS, quantum); _FQ_CLASSQ_INIT(grp, BK, quantum); @@ -2524,14 +2829,14 @@ update: } calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY); - ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C], + fq_if_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C], calc_flags); - ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S], + fq_if_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S], calc_flags | IF_CLASSQ_L4S); - ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C], + fq_if_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C], calc_flags); - ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S], + fq_if_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S], calc_flags | IF_CLASSQ_L4S); return 0; diff --git a/bsd/net/pktsched/pktsched_fq_codel.h b/bsd/net/pktsched/pktsched_fq_codel.h index b572fdf94..ad210628c 100644 --- a/bsd/net/pktsched/pktsched_fq_codel.h +++ b/bsd/net/pktsched/pktsched_fq_codel.h @@ -75,6 +75,8 @@ struct fcl_stat { uint64_t fcl_ignore_tx_time; uint64_t fcl_paced_pkts; uint64_t fcl_fcl_pacemaker_needed; + uint64_t fcl_high_delay_drop; + uint64_t fcl_congestion_feedback; }; /* Set the quantum to be one MTU */ @@ -190,6 +192,11 @@ typedef struct fq_if_bitmap_ops { fq_if_bitmaps_move move; } bitmap_ops_t; +typedef void (*fq_codel_dq_t)(void *fqs, void *fq, + pktsched_pkt_t *pkt, uint64_t now); +typedef int (*fq_codel_enq_t)(void *fqs, fq_if_group_t *fq_grp, + pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl); + typedef struct fq_codel_sched_data { struct ifclassq *fqs_ifq; /* back pointer to ifclassq */ flowq_list_t *fqs_flows __counted_by(fqs_flows_count); /* flows table */ @@ -198,6 +205,7 @@ typedef struct fq_codel_sched_data { uint8_t fqs_throttle; /* throttle on or off */ uint8_t fqs_flags; /* flags */ #define FQS_DRIVER_MANAGED 0x1 +#define FQS_LEGACY 0x2 struct flowadv_fclist fqs_fclist; /* flow control state */ struct flowq *fqs_large_flow; /* flow has highest number of bytes */ TAILQ_HEAD(, flowq) fqs_empty_list; /* list of empty flows */ @@ -215,6 +223,9 @@ typedef struct fq_codel_sched_data { #define grp_bitmaps_clr fqs_bm_ops->clr #define grp_bitmaps_move fqs_bm_ops->move fq_if_group_t *fqs_classq_groups[FQ_IF_MAX_GROUPS]; + fq_codel_enq_t fqs_enqueue; + fq_codel_dq_t fqs_dequeue; + ifcq_oid_t fqs_oid; } fq_if_t; #define FQS_GROUP(_fqs, _group_idx) \ @@ -320,31 +331,24 @@ struct fq_codel_classstats { uint64_t fcls_ignore_tx_time; uint64_t fcls_paced_pkts; uint64_t fcls_fcl_pacing_needed; + uint64_t fcls_high_delay_drop; + uint64_t fcls_congestion_feedback; }; +extern uint32_t fq_codel_enable_l4s; +extern unsigned int fq_codel_enable_pacing; +#if DEBUG || DEVELOPMENT +extern uint32_t fq_codel_quantum; +#endif /* DEBUG || DEVELOPMENT */ + #ifdef BSD_KERNEL_PRIVATE _Static_assert(FQ_IF_STATS_MAX_GROUPS == FQ_IF_MAX_GROUPS, "max group counts do not match"); extern void pktsched_fq_init(void); -extern void fq_codel_scheduler_init(void); -extern int fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *h, - classq_pkt_t *t, uint32_t cnt, uint32_t bytes, boolean_t *pdrop); -extern void fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt, - uint8_t grp_idx); -extern void fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc, - classq_pkt_t *pkt, uint8_t grp_idx); -extern int fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t 
maxpktcnt, - u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, - u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx); -extern int fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, - mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt, - classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt, - u_int32_t *retbytecnt, uint8_t grp_idx); -extern int fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg); extern struct flowq *fq_if_hash_pkt(fq_if_t *, fq_if_group_t *, - u_int32_t, mbuf_svc_class_t, u_int64_t, bool, fq_tfc_type_t); + u_int32_t, mbuf_svc_class_t, u_int64_t, uint8_t, uint8_t, fq_tfc_type_t, bool); extern boolean_t fq_if_at_drop_limit(fq_if_t *); extern boolean_t fq_if_almost_at_drop_limit(fq_if_t *fqs); extern void fq_if_drop_packet(fq_if_t *, uint64_t); @@ -352,20 +356,12 @@ extern void fq_if_is_flow_heavy(fq_if_t *, struct flowq *); extern boolean_t fq_if_add_fcentry(fq_if_t *, pktsched_pkt_t *, uint8_t, struct flowq *, fq_if_classq_t *); extern void fq_if_flow_feedback(fq_if_t *, struct flowq *, fq_if_classq_t *); -extern boolean_t fq_if_report_ce(fq_if_t *, pktsched_pkt_t *, uint32_t, uint32_t); -extern int fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags, - classq_pkt_type_t ptype); -extern void fq_if_teardown_ifclassq(struct ifclassq *ifq); -extern int fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, - u_int32_t qid, struct if_ifclassq_stats *ifqs); -extern void fq_if_destroy_flow(fq_if_t *, fq_if_classq_t *, struct flowq *); +extern boolean_t fq_if_report_congestion(fq_if_t *, pktsched_pkt_t *, uint32_t, + uint32_t, uint32_t); extern void fq_if_move_to_empty_flow(fq_if_t *, fq_if_classq_t *, struct flowq *, uint64_t); extern int fq_if_create_grp(struct ifclassq *ifcq, uint8_t qset_idx, uint8_t flags); -extern void fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t qset_idx); -extern void fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t qset_idx); extern fq_if_group_t *fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx); -extern boolean_t fq_if_is_all_paced(struct ifclassq *ifq); #endif /* BSD_KERNEL_PRIVATE */ #ifdef __cplusplus diff --git a/bsd/net/pktsched/pktsched_netem.c b/bsd/net/pktsched/pktsched_netem.c index 6e13585b3..34fc3ba12 100644 --- a/bsd/net/pktsched/pktsched_netem.c +++ b/bsd/net/pktsched/pktsched_netem.c @@ -1318,8 +1318,8 @@ netem_create(const char *name, struct ifnet *ifp, void *output_handle, { struct netem *ne; - _CASSERT(IF_NETEM_MODEL_NULL == NETEM_MODEL_NULL); - _CASSERT(IF_NETEM_MODEL_NLC == NETEM_MODEL_NLC); + static_assert(IF_NETEM_MODEL_NULL == NETEM_MODEL_NULL); + static_assert(IF_NETEM_MODEL_NLC == NETEM_MODEL_NLC); ne = kalloc_type(struct netem, Z_WAITOK | Z_ZERO); diff --git a/bsd/net/pktsched/pktsched_ops.c b/bsd/net/pktsched/pktsched_ops.c new file mode 100644 index 000000000..76c11beab --- /dev/null +++ b/bsd/net/pktsched/pktsched_ops.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include + +pktsched_ops_list_t pktsched_ops_list; + +void +pktsched_ops_register(pktsched_ops_t *new_ops) +{ + pktsched_ops_t *ops; + + ASSERT(new_ops->ps_deq != NULL); + ASSERT(new_ops->ps_enq != NULL); + ASSERT(new_ops->ps_deq_sc != NULL); + ASSERT(new_ops->ps_setup != NULL); + ASSERT(new_ops->ps_teardown != NULL); + ASSERT(new_ops->ps_req != NULL); + ASSERT(new_ops->ps_allow_dequeue != NULL); + + LIST_FOREACH(ops, &pktsched_ops_list, ps_ops_link) { + VERIFY(ops->ps_id != new_ops->ps_id); + } + + LIST_INSERT_HEAD(&pktsched_ops_list, new_ops, ps_ops_link); +} + +pktsched_ops_t * +pktsched_ops_find(uint8_t ops_id) +{ + pktsched_ops_t *ops; + + LIST_FOREACH(ops, &pktsched_ops_list, ps_ops_link) { + if (ops->ps_id == ops_id) { + return ops; + } + } + + return NULL; +} diff --git a/bsd/net/pktsched/pktsched_ops.h b/bsd/net/pktsched/pktsched_ops.h new file mode 100644 index 000000000..4eb290a76 --- /dev/null +++ b/bsd/net/pktsched/pktsched_ops.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _PKTSCHED_PKTSCHED_OPS_H_ +#define _PKTSCHED_PKTSCHED_OPS_H_ + +#ifdef PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef int (*pktsched_setup_t)(struct ifclassq *ifcq, u_int32_t flags, + classq_pkt_type_t ptype); +typedef void (*pktsched_teardown_t)(struct ifclassq *ifcq); +typedef int (*pktsched_request_t)(struct ifclassq *ifcq, enum cqrq, void *arg); +typedef boolean_t (*pktsched_allow_dequeue_t)(struct ifclassq *ifcq); +typedef int (*pktsched_stats_t)(struct ifclassq *ifcq, uint8_t gid, + u_int32_t qid, struct if_ifclassq_stats *ifqs); +typedef int (*pktsched_enq_t)(struct ifclassq *ifq, classq_pkt_t *head, + classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop); +typedef int (*pktsched_deq_t)(struct ifclassq *ifq, u_int32_t maxpktcnt, + u_int32_t maxbytecnt, classq_pkt_t *first_packet, classq_pkt_t *last_packet, + u_int32_t *retpktcnt, u_int32_t *retbytecnt, uint8_t grp_idx); +typedef int (*pktsched_deq_sc_t)(struct ifclassq *ifq, mbuf_svc_class_t svc, + u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet, + classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt, + uint8_t grp_idx); + +typedef struct pktsched_ops { + uint8_t ps_id; +#define PKTSCHED_OPS_LOCKLESS 0x1 + uint8_t ps_ops_flags; + pktsched_setup_t ps_setup; + pktsched_teardown_t ps_teardown; + pktsched_enq_t ps_enq; + pktsched_deq_t ps_deq; + pktsched_deq_sc_t ps_deq_sc; + pktsched_request_t ps_req; + pktsched_stats_t ps_stats; + pktsched_allow_dequeue_t ps_allow_dequeue; + LIST_ENTRY(pktsched_ops) ps_ops_link; +}pktsched_ops_t; + +typedef LIST_HEAD(, pktsched_ops) pktsched_ops_list_t; + +void +pktsched_ops_register(pktsched_ops_t *new_ops); + +pktsched_ops_t * +pktsched_ops_find(uint8_t ps_id); + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* _PKTSCHED_PKTSCHED_OPS_H_ */ diff --git a/bsd/net/radix.c b/bsd/net/radix.c index 660e0a321..0c59f5fd8 100644 --- a/bsd/net/radix.c +++ b/bsd/net/radix.c @@ -80,20 +80,47 @@ static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, walktree_f_t *f, void *w); static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); static struct radix_node *rn_insert(void *, struct radix_node_head *, int *, struct radix_node[2]); -static struct radix_node *rn_newpair(void * __sized_by(vlen), uint8_t vlen, int, struct radix_node[2]); +static struct radix_node *rn_newpair(const void * __sized_by(vlen), uint8_t vlen, int, struct radix_node[2]); static struct radix_node *rn_search(void *, struct radix_node *); static struct radix_node *rn_search_m(void *, struct radix_node *, void *); -static int max_keylen; static struct radix_mask *rn_mkfreelist; static struct radix_node_head *mask_rnhead; -static char *addmask_key; static char normal_chars[] = {0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, -1}; -static char *rn_zeros, *rn_ones; -static zone_t radix_node_zone; KALLOC_TYPE_DEFINE(radix_node_head_zone, struct radix_node_head, KT_DEFAULT); + +#define MAX_KEYLEN 32 +#define MAX_KEYLEN_BMASK 0x1F + +/* + * Constant size buffers that are used for the netmask radix maintenance. + * + * rn_ones - buffer with all bits set to 1, used when constructing new keys. + * rn_zeros - buffer with all bits set to 0, used when constructing new keys. + */ + +/* + * Constant size buffers that are used for the netmask radix maintenance. + * + * rn_ones - buffer with all bits set to 1, used when constructing new keys. 
+ * rn_zeros - buffer with all bits set to 0, used when constructing new keys. + */ +static const char rn_zeros[MAX_KEYLEN] = { + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 +}; + +static const char rn_ones[MAX_KEYLEN] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + #define rn_masktop (mask_rnhead->rnh_treetop) #undef Bcmp #define Bcmp(a, b, l) \ @@ -106,6 +133,52 @@ static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip, #define RN_MATCHF(rn, f, arg) (f == NULL || (*f)((rn), arg)) + +/* + * Netmask radix tree. + * + * Most netmasks are going to be same for the different routes. + * Because of that, it is important to avoid wasting memory + * for the duplicate copies of same mask bitpatterns. + * + * The radix datastructure solves this by using another radix tree, + * which is keyed by the netmask bits. + * + */ + +/* + * Radix tree glue. + */ +struct rn_base_entry { + union { + struct { + struct radix_node tt; + struct radix_node t; + } _split_nodes; +#define rnb_tt _split_nodes.tt +#define rnb_t _split_nodes.t + struct radix_node rnb_nodes[2]; + }; +}; + +static struct radix_node * +rn_lexical_parent(struct radix_node *tt) +{ + struct rn_base_entry *nrn; + nrn = __container_of(tt, struct rn_base_entry, rnb_tt); + return &nrn->rnb_t; +} + +/* + * The entry in the netmask radix tree. + */ +struct netmask_rn_entry { + struct rn_base_entry nrn_base; +#define nrn_tt nrn_base.rnb_tt +#define nrn_t nrn_base.rnb_t + char nrn_netmask[MAX_KEYLEN]; +}; + /* * The data structure for the keys is a radix tree with one way * branching removed. The index rn_bit at an internal node n represents a bit @@ -283,17 +356,20 @@ rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip, rn_matchf_t *f, void *w) { uint8_t cplen; - char *cp = rnarg_unpack(trial, &cplen); - char *cp2 = rn_get_key(leaf); - char *cp3 = rn_get_mask(leaf); - char *cplim; - int length = min(*(u_char *)cp, *(u_char *)cp2); + const char *cp = rnarg_unpack(trial, &cplen); + const char *cp2 = rn_get_key(leaf); + const char *cp3 = rn_get_mask(leaf); + const char *cplim; + int length = min(*cp, *cp2); if (cp3 == 0) { cp3 = rn_ones; } else { - length = min(length, *(u_char *)cp3); + length = min(length, *cp3); } + + length = min(length, MAX_KEYLEN); + cplim = cp + length; cp3 += skip; cp2 += skip; for (cp += skip; cp < cplim; cp++, cp2++, cp3++) { if ((*cp ^ *cp2) & *cp3) { @@ -317,10 +393,10 @@ rn_match_args(void *v_arg, struct radix_node_head *head, uint8_t vlen0; caddr_t v = rnarg_unpack(v_arg, &vlen0); struct radix_node *t = head->rnh_treetop, *x; - caddr_t cp = v, cp2; - caddr_t cplim; + caddr_t cp, cp2, cplim, key; struct radix_node *saved_t, *top = t; - int off = t->rn_offset, vlen = vlen0, matched_off; + int off = t->rn_offset, matched_off; + uint8_t klen, cmp_len; int test, b, rn_bit; /* @@ -328,7 +404,8 @@ rn_match_args(void *v_arg, struct radix_node_head *head, * subroutine call. */ for (; t->rn_bit >= 0;) { - if (t->rn_bmask & cp[t->rn_offset]) { + uint8_t test_byte = rnarg_get(v, vlen0, t->rn_offset); + if (t->rn_bmask & test_byte) { t = t->rn_right; } else { t = t->rn_left; @@ -346,11 +423,23 @@ rn_match_args(void *v_arg, struct radix_node_head *head, * are probably the most common case... 
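/*
 * Editor's note: illustrative sketch only, not part of the imported patch.
 * struct rn_base_entry above overlays the named leaf/internal pair
 * (rnb_tt/rnb_t) with the legacy two-node array (rnb_nodes[2]) so that
 * rn_lexical_parent() can recover the internal node from a leaf with
 * __container_of().  One way to pin those layout assumptions down at compile
 * time, in the single-argument static_assert style this patch uses elsewhere:
 */
static_assert(offsetof(struct rn_base_entry, rnb_tt) ==
    offsetof(struct rn_base_entry, rnb_nodes));
static_assert(offsetof(struct rn_base_entry, rnb_t) ==
    offsetof(struct rn_base_entry, rnb_nodes) + sizeof(struct radix_node));
static_assert(sizeof(struct rn_base_entry) == 2 * sizeof(struct radix_node));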
*/ if (rn_get_mask(t)) { - vlen = rn_get_masklen(t); + cmp_len = rn_get_masklen(t); + } else { + cmp_len = vlen0; } - cp += off; - cp2 = rn_get_key(t) + off; - cplim = v + vlen; + + /* + * Set the `cmp_len' to the minimal of the 3 lengths: + * - the length of t's mask (cmp_len) + * - the length of t's key (klen) + * - the length of the v argument (vlen) + */ + key = rn_get_key(t, &klen); + cmp_len = (uint8_t)min(min(cmp_len, klen), vlen0); + + cp = v + off; + cp2 = key + off; + cplim = v + cmp_len; for (; cp < cplim; cp++, cp2++) { if (*cp != *cp2) { @@ -454,15 +543,18 @@ int rn_debug = 1; #endif static struct radix_node * -rn_newpair(void *v __sized_by(vlen), uint8_t vlen, int b, struct radix_node nodes[2]) +rn_newpair(const void *v __sized_by(vlen), uint8_t vlen, int b, struct radix_node nodes[2]) { - struct radix_node *tt = nodes, *t = tt + 1; + struct radix_node *tt = &nodes[0]; + struct radix_node *t = &nodes[1]; + t->rn_bit = (short)b; t->rn_bmask = 0x80 >> (b & 7); t->rn_left = tt; t->rn_offset = b >> 3; tt->rn_bit = -1; rn_set_key(tt, v, vlen); + tt->rn_parent = t; tt->rn_flags = t->rn_flags = RNF_ACTIVE; tt->rn_mklist = t->rn_mklist = NULL; @@ -487,6 +579,7 @@ rn_insert(void *v_arg, struct radix_node_head *head, int *dupentry, caddr_t cp = v + head_off; int b; struct radix_node *tt; + uint8_t test_byte; /* * Find first bit at which v and t->rn_key differ */ @@ -511,10 +604,10 @@ on1: } { struct radix_node *p, *x = top; - cp = v; do { p = x; - if (cp[x->rn_offset] & x->rn_bmask) { + test_byte = rnarg_get(v, vlen, x->rn_offset); + if (x->rn_bmask & test_byte) { x = x->rn_right; } else { x = x->rn_left; @@ -526,16 +619,18 @@ on1: log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); } #endif - t = rn_newpair(v_arg, vlen, b, nodes); + t = rn_newpair(v, vlen, b, nodes); tt = t->rn_left; - if ((cp[p->rn_offset] & p->rn_bmask) == 0) { + test_byte = rnarg_get(v, vlen, p->rn_offset); + if ((p->rn_bmask & test_byte) == 0) { p->rn_left = t; } else { p->rn_right = t; } x->rn_parent = t; t->rn_parent = p; /* frees x, p as temp vars below */ - if ((cp[t->rn_offset] & t->rn_bmask) == 0) { + test_byte = rnarg_get(v, vlen, t->rn_offset); + if ((t->rn_bmask & test_byte) == 0) { t->rn_right = x; } else { t->rn_right = tt; @@ -553,17 +648,19 @@ on1: struct radix_node * rn_addmask(void *n_arg, int search, int skip) { - caddr_t netmask = (caddr_t)n_arg; - struct radix_node *x; + uint8_t mlen0; + caddr_t netmask = rnarg_unpack(n_arg, &mlen0); + struct radix_node *x __single; + struct netmask_rn_entry *nrn_entry; caddr_t cp, cplim; int b = 0, mlen, j; + uint8_t cmp_len; + caddr_t key; int maskduplicated, m0, isnormal; - struct radix_node *saved_x; - static int last_zeroed = 0; - if ((mlen = *(u_char *)netmask) > max_keylen) { - mlen = max_keylen; - } + char addmask_key[MAX_KEYLEN] = {0, }; + + mlen = min(mlen0, MAX_KEYLEN); if (skip == 0) { skip = 1; } @@ -571,10 +668,10 @@ rn_addmask(void *n_arg, int search, int skip) return mask_rnhead->rnh_nodes; } if (skip > 1) { - Bcopy(rn_ones + 1, addmask_key + 1, skip - 1); + bcopy(rn_ones + 1, addmask_key + 1, skip - 1); } if ((m0 = mlen) > skip) { - Bcopy(netmask + skip, addmask_key + skip, mlen - skip); + bcopy(netmask + skip, addmask_key + skip, mlen - skip); } /* * Trim trailing zeroes. 
@@ -584,36 +681,36 @@ rn_addmask(void *n_arg, int search, int skip) } mlen = (int)(cp - addmask_key); if (mlen <= skip) { - if (m0 >= last_zeroed) { - last_zeroed = mlen; - } return mask_rnhead->rnh_nodes; } - if (m0 < last_zeroed) { - Bzero(addmask_key + m0, last_zeroed - m0); - } - *addmask_key = last_zeroed = (char)mlen; + + *addmask_key = (char)mlen; x = rn_search(addmask_key, rn_masktop); - if (Bcmp(addmask_key, rn_get_key(x), mlen) != 0) { + key = rn_get_key(x, &cmp_len); + if (mlen < cmp_len) { + cmp_len = (int8_t)mlen; + } + if (Bcmp(addmask_key, key, cmp_len) != 0) { x = NULL; } if (x || search) { return x; } - x = saved_x = zalloc_flags(radix_node_zone, Z_WAITOK_ZERO_NOFAIL); - netmask = cp = (caddr_t)(x + 2); - Bcopy(addmask_key, cp, mlen); - x = rn_insert(cp, mask_rnhead, &maskduplicated, x); + nrn_entry = kalloc_type(struct netmask_rn_entry, Z_WAITOK_ZERO_NOFAIL); + netmask = nrn_entry->nrn_netmask; + Bcopy(addmask_key, netmask, mlen); + x = rn_insert(netmask, mask_rnhead, &maskduplicated, nrn_entry->nrn_base.rnb_nodes); if (maskduplicated) { log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); - zfree(radix_node_zone, saved_x); + kfree_type(struct netmask_rn_entry, nrn_entry); return x; } mask_rnhead->rnh_cnt++; /* * Calculate index of mask, and check for normalcy. */ - cplim = netmask + mlen; isnormal = 1; + cplim = netmask + mlen; + isnormal = 1; for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) { cp++; } @@ -637,14 +734,16 @@ static int /* XXX: arbitrary ordering for non-contiguous masks */ rn_lexobetter(void *m_arg, void *n_arg) { - u_char *mp = m_arg, *np = n_arg, *lim; + uint8_t mplen, nlen; + caddr_t mp = rnarg_unpack(m_arg, &mplen); + caddr_t np = rnarg_unpack(n_arg, &nlen); if (*mp > *np) { return 1; /* not really, but need to check longer one first */ } if (*mp == *np) { - for (lim = mp + *mp; mp < lim;) { - if (*mp++ > *np++) { + for (int i = 1; i < mplen; ++i) { + if (mp[i] > np[i]) { return 1; } } @@ -674,9 +773,10 @@ struct radix_node * rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node treenodes[2]) { - uint8_t vlen, masklen; + uint8_t vlen, mlen0; caddr_t v = rnarg_unpack(v_arg, &vlen); - caddr_t netmask = rnarg_unpack(n_arg, &masklen); + caddr_t netmask = rnarg_unpack(n_arg, &mlen0); + uint8_t mlen = mlen0; struct radix_node *t, *x = NULL, *tt; struct radix_node *saved_tt, *top = head->rnh_treetop; short b = 0, b_leaf = 0; @@ -700,7 +800,7 @@ rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, /* * Note: the auxillary mask is stored as a "key". */ - netmask = rn_get_key(x); + netmask = rn_get_key(x, &mlen); } /* * Deal with duplicated keys: attach node to previous instance @@ -763,7 +863,7 @@ rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * Put mask in tree. 
*/ if (netmask) { - rn_set_mask(tt, netmask, masklen); + rn_set_mask(tt, netmask, mlen); tt->rn_bit = x->rn_bit; tt->rn_flags |= x->rn_flags & RNF_NORMAL; } @@ -848,24 +948,44 @@ on2: struct radix_node * rn_delete(void *v_arg, void *netmask_arg, struct radix_node_head *head) { - struct radix_node *t, *p, *x, *tt; + uint8_t vlen, mlen0; + caddr_t v = rnarg_unpack(v_arg, &vlen); + caddr_t netmask = rnarg_unpack(netmask_arg, &mlen0); + uint8_t masklen = mlen0, key_cmp_len, tt_key_len; + + struct radix_node *t __single, *p __single, *x __single, *tt __single; struct radix_mask *m, *saved_m, **mp; struct radix_node *dupedkey, *saved_tt, *top; - caddr_t v, netmask; - int b, head_off, vlen; + int b, head_off; - v = v_arg; - netmask = netmask_arg; - x = head->rnh_treetop; - tt = rn_search(v, x); - head_off = x->rn_offset; - vlen = *(u_char *)v; - saved_tt = tt; - top = x; - if (tt == 0 || - Bcmp(v + head_off, rn_get_key(tt) + head_off, vlen - head_off)) { + x = top = head->rnh_treetop; + tt = saved_tt = rn_search(v, x); + + /* + * Verify that the found node (`tt'), is valid, and that it can + * be compared against `v_arg'. + */ + if (tt == NULL) { + log(LOG_ERR, "rn_delete: key not found (key_len=%d)\n", vlen); return NULL; } + + head_off = x->rn_offset; + tt_key_len = rn_get_keylen(tt); + key_cmp_len = (uint8_t)min(vlen, tt_key_len); + + if (key_cmp_len < head_off) { + log(LOG_ERR, "rn_delete: key too short (cmp_len=%d, head_offset=%d)\n", + key_cmp_len, head_off); + return NULL; + } + + if (Bcmp(v + head_off, rn_get_key(tt) + head_off, key_cmp_len - head_off)) { + log(LOG_ERR, "rn_delete: key mismatch (cmp_len=%d, head_offset=%d)\n", + key_cmp_len, head_off); + return NULL; + } + /* * Delete our route from mask lists. */ @@ -873,7 +993,7 @@ rn_delete(void *v_arg, void *netmask_arg, struct radix_node_head *head) if ((x = rn_addmask(netmask, 1, head_off)) == 0) { return NULL; } - netmask = rn_get_key(x); + netmask = rn_get_key(x, &masklen); while (rn_get_mask(tt) != netmask) { if ((tt = tt->rn_dupedkey) == 0) { return NULL; @@ -947,7 +1067,8 @@ on1: */ if (tt == saved_tt) { /* remove from head of chain */ - x = dupedkey; x->rn_parent = t; + x = dupedkey; + x->rn_parent = t; if (t->rn_left == tt) { t->rn_left = x; } else { @@ -968,10 +1089,11 @@ on1: log(LOG_ERR, "rn_delete: couldn't find us\n"); } } - t = tt + 1; + t = rn_lexical_parent(tt); if (t->rn_flags & RNF_ACTIVE) { #ifndef RN_DEBUG - *++x = *t; + x = rn_lexical_parent(x); + *x = *t; p = t->rn_parent; #else b = t->rn_info; @@ -1035,7 +1157,7 @@ on1: /* * We may be holding an active internal node in the tree. 
*/ - x = tt + 1; + x = rn_lexical_parent(tt); if (t != x) { #ifndef RN_DEBUG *t = *x; @@ -1054,8 +1176,10 @@ on1: } } out: + x = rn_lexical_parent(tt); + x->rn_flags &= ~RNF_ACTIVE; tt->rn_flags &= ~RNF_ACTIVE; - tt[1].rn_flags &= ~RNF_ACTIVE; + return tt; } @@ -1068,9 +1192,10 @@ rn_walktree_from(struct radix_node_head *h, void *a, void *m, walktree_f_t *f, void *w) { int error; + uint8_t alen, mlen; struct radix_node *base, *next; - u_char *xa = (u_char *)a; - u_char *xm = (u_char *)m; + caddr_t xa = rnarg_unpack(a, &alen); + caddr_t xm = rnarg_unpack(m, &mlen); struct radix_node *rn, *last; int stopping; int lastb; @@ -1097,11 +1222,13 @@ restart: */ for (rn = h->rnh_treetop; rn->rn_bit >= 0;) { last = rn; - if (!(rn->rn_bmask & xm[rn->rn_offset])) { + uint8_t test_byte; + test_byte = rnarg_get(xm, mlen, rn->rn_offset); + if (!(rn->rn_bmask & test_byte)) { break; } - - if (rn->rn_bmask & xa[rn->rn_offset]) { + test_byte = rnarg_get(xa, alen, rn->rn_offset); + if (rn->rn_bmask & test_byte) { rn = rn->rn_right; } else { rn = rn->rn_left; @@ -1269,7 +1396,7 @@ rn_inithead(void **head, int off) rnh = zalloc_flags(radix_node_head_zone, Z_WAITOK_ZERO_NOFAIL); *head = rnh; - t = rn_newpair(rn_zeros, (int8_t)max_keylen, off, rnh->rnh_nodes); + t = rn_newpair(rn_zeros, (int8_t)MAX_KEYLEN, off, rnh->rnh_nodes); ttt = rnh->rnh_nodes + 2; t->rn_right = ttt; t->rn_parent = t; @@ -1277,7 +1404,7 @@ rn_inithead(void **head, int off) tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; tt->rn_bit = (short)(-1 - off); *ttt = *tt; - rn_set_key(ttt, rn_ones, (int8_t)max_keylen); + rn_set_key(ttt, rn_ones, (int8_t)MAX_KEYLEN); rnh->rnh_addaddr = rn_addroute; rnh->rnh_deladdr = rn_delete; rnh->rnh_matchaddr = rn_match; @@ -1294,31 +1421,27 @@ rn_inithead(void **head, int off) void rn_init(void) { - char *cp, *cplim; struct domain *dom; - /* lock already held when rn_init is called */ + /* + * Validate that no domain has max key that exceeds the MAX_KEYLEN constant. + * This is really not expected to happen unless we introduce a new domain. + * In such case, the MAX_KEYLEN constant will need to be updated, + * along with the layout of `struct rn_base_entry'. + * + * N.B. lock already held when rn_init is called. + */ TAILQ_FOREACH(dom, &domains, dom_entry) { - if (dom->dom_maxrtkey > max_keylen) { - max_keylen = dom->dom_maxrtkey; + if (MAX_KEYLEN < dom->dom_maxrtkey) { + log(LOG_ERR, "rn_init: encountered domain %s with max key len %d exceeding the limit %d", + dom->dom_name, + dom->dom_maxrtkey, + MAX_KEYLEN); + return; } } - if (max_keylen == 0) { - log(LOG_ERR, - "rn_init: radix functions require max_keylen be set\n"); - return; - } - rn_zeros = zalloc_permanent(3 * max_keylen, ZALIGN_NONE); - rn_ones = cp = rn_zeros + max_keylen; - addmask_key = cplim = rn_ones + max_keylen; - while (cp < cplim) { - *cp++ = -1; - } + if (rn_inithead((void **)&mask_rnhead, 0) == 0) { panic("rn_init 2"); } - - radix_node_zone = zone_create("radix_node", - sizeof(struct radix_node) * 2 + max_keylen, - ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM); } diff --git a/bsd/net/radix.h b/bsd/net/radix.h index 4472f6b2b..ec5df66c4 100644 --- a/bsd/net/radix.h +++ b/bsd/net/radix.h @@ -82,6 +82,17 @@ MALLOC_DECLARE(M_RTABLE); #define __RN_INLINE_LENGTHS (__BIGGEST_ALIGNMENT__ > 4) +#if __arm__ && (__BIGGEST_ALIGNMENT__ > 4) +/* + * For the newer ARMv7k ABI where 64-bit types are 64-bit aligned, but pointers + * are 32-bit: + * Aligned to 64-bit since this is cast to rtentry, which is 64-bit aligned. 
+ */ +#define __RN_NODE_ALIGNMENT_ATTR__ __attribute__((aligned(8))) +#else /* __arm__ && (__BIGGEST_ALIGNMENT__ > 4) */ +#define __RN_NODE_ALIGNMENT_ATTR__ +#endif /* __arm__ && (__BIGGEST_ALIGNMENT__ > 4) */ + /* * Radix search tree node layout. */ @@ -117,16 +128,7 @@ struct radix_node { struct radix_node *rn_twin; struct radix_node *rn_ybro; #endif - -#if __arm__ && (__BIGGEST_ALIGNMENT__ > 4) -/* For the newer ARMv7k ABI where 64-bit types are 64-bit aligned, but pointers - * are 32-bit: - * Aligned to 64-bit since this is cast to rtentry, which is 64-bit aligned. - */ -} __attribute__ ((aligned(8))); -#else -}; -#endif +} __RN_NODE_ALIGNMENT_ATTR__; #define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey #define rn_offset rn_u.rn_node.rn_Off @@ -157,6 +159,7 @@ typedef struct radix_node * __single radix_node_ref_t; */ static inline void __attribute__((always_inline)) +__attribute__((overloadable)) rn_set_key(struct radix_node *rn, void *key __sized_by(keylen), uint8_t keylen) { #if __RN_INLINE_LENGTHS @@ -167,6 +170,19 @@ rn_set_key(struct radix_node *rn, void *key __sized_by(keylen), uint8_t keylen) rn->__rn_key = key; } +static inline void +__attribute__((always_inline)) +__attribute__((overloadable)) +rn_set_key(struct radix_node *rn, const void *key __sized_by(keylen), uint8_t keylen) +{ +#if __RN_INLINE_LENGTHS + rn->__rn_keylen = keylen; +#else /* !__RN_INLINE_LENGTHS */ + (void)keylen; +#endif /* !__RN_INLINE_LENGTHS */ + rn->__rn_key = __DECONST(void *, key); +} + /* * Returns the routing key length. */ @@ -194,12 +210,24 @@ rn_get_keylen(struct radix_node *rn) */ static inline char * __header_indexable __attribute__((always_inline)) __stateful_pure +__attribute__((overloadable)) rn_get_key(struct radix_node *rn) { return __unsafe_forge_bidi_indexable(char *, rn->rn_u.rn_leaf.rn_Key, rn_get_keylen(rn)); } +static inline char * __header_indexable +__attribute__((always_inline)) __stateful_pure +__attribute__((overloadable)) +rn_get_key(struct radix_node *rn, uint8_t *plen) +{ + uint8_t keylen = rn_get_keylen(rn); + caddr_t key = __unsafe_forge_bidi_indexable(char *, rn->rn_u.rn_leaf.rn_Key, keylen); + *plen = keylen; + return key; +} + /* * Sets the routing mask bytes and length. */ @@ -209,7 +237,6 @@ rn_set_mask(struct radix_node *rn, void *mask __sized_by(masklen), uint8_t maskl { #if __RN_INLINE_LENGTHS /* - * Unlike the keys, the masks are always sockaddrs. * The first byte is the length of the addressable bytes, * whereas the second is the address family. 
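/*
 * Editor's note: illustrative sketch only, not part of the imported patch.
 * The two-argument rn_get_key() overload added above hands back the key and
 * its length in one call, which lets callers clamp comparisons to both
 * leaves' bounds the same way rn_match_args() and rn_delete() now do.
 * rn_keys_equal() below is a hypothetical helper, not an existing KPI.
 */
static inline boolean_t
rn_keys_equal(struct radix_node *a, struct radix_node *b)
{
	uint8_t alen, blen;
	const char *akey = rn_get_key(a, &alen);
	const char *bkey = rn_get_key(b, &blen);

	if (alen != blen) {
		return FALSE;
	}
	return bcmp(akey, bkey, alen) == 0 ? TRUE : FALSE;
}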
* diff --git a/bsd/net/restricted_in_port.c b/bsd/net/restricted_in_port.c index 47540f6b3..43196efc5 100644 --- a/bsd/net/restricted_in_port.c +++ b/bsd/net/restricted_in_port.c @@ -368,11 +368,11 @@ restricted_in_port_init(void) unsigned int i; #if SKYWALK - _CASSERT(PORT_FLAGS_LISTENER == NETNS_LISTENER); - _CASSERT(PORT_FLAGS_SKYWALK == NETNS_SKYWALK); - _CASSERT(PORT_FLAGS_BSD == NETNS_BSD); - _CASSERT(PORT_FLAGS_PF == NETNS_PF); - _CASSERT(PORT_FLAGS_MAX == NETNS_OWNER_MAX); + static_assert(PORT_FLAGS_LISTENER == NETNS_LISTENER); + static_assert(PORT_FLAGS_SKYWALK == NETNS_SKYWALK); + static_assert(PORT_FLAGS_BSD == NETNS_BSD); + static_assert(PORT_FLAGS_PF == NETNS_PF); + static_assert(PORT_FLAGS_MAX == NETNS_OWNER_MAX); #endif /* SKYWALK */ restricted_port_bitmap = bitmap_alloc(UINT16_MAX); diff --git a/bsd/net/route.c b/bsd/net/route.c index 025b17d2e..288bc02d4 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -115,8 +116,12 @@ #include #endif + +#include + #include + /* * Synchronization notes: * @@ -411,10 +416,10 @@ static int sysctl_rt_verbose SYSCTL_HANDLER_ARGS; SYSCTL_DECL(_net_route); SYSCTL_PROC(_net_route, OID_AUTO, verbose, - CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY, - &rt_verbose, 0, - sysctl_rt_verbose, "I", - "Route logging verbosity level"); + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW | CTLFLAG_ANYBODY, + &rt_verbose, 0, + sysctl_rt_verbose, "I", + "Route logging verbosity level"); static int sysctl_rt_verbose SYSCTL_HANDLER_ARGS @@ -436,10 +441,10 @@ sysctl_rt_verbose SYSCTL_HANDLER_ARGS } if (!(kauth_cred_issuser(kauth_cred_get()) != 0 || - IOCurrentTaskHasEntitlement("com.apple.private.networking.elevated-logging"))) { + IOCurrentTaskHasEntitlement("com.apple.private.networking.elevated-logging"))) { #if (DEBUG || DEVELOPMENT) os_log(OS_LOG_DEFAULT, "%s:%s: sysctl not allowed\n", - proc_name_string, __func__); + proc_name_string, __func__); #endif error = EPERM; goto done; @@ -456,12 +461,12 @@ sysctl_rt_verbose SYSCTL_HANDLER_ARGS done: #if (DEBUG || DEVELOPMENT) os_log(OS_LOG_DEFAULT, "%s:%s return: verbose is %d " - "and error is %d\n", proc_name_string, __func__, rt_verbose, error); + "and error is %d\n", proc_name_string, __func__, rt_verbose, error); #endif return error; } - static void +static void rtable_init(struct radix_node_head * __single * __header_indexable table) { struct domain *dom; @@ -484,14 +489,10 @@ route_init(void) { int size; - _CASSERT(offsetof(struct route, ro_rt) == - offsetof(struct route_in6, ro_rt)); - _CASSERT(offsetof(struct route, ro_srcia) == - offsetof(struct route_in6, ro_srcia)); - _CASSERT(offsetof(struct route, ro_flags) == - offsetof(struct route_in6, ro_flags)); - _CASSERT(offsetof(struct route, ro_dst) == - offsetof(struct route_in6, ro_dst)); + static_assert(offsetof(struct route, ro_rt) == offsetof(struct route_in6, ro_rt)); + static_assert(offsetof(struct route, ro_srcia) == offsetof(struct route_in6, ro_srcia)); + static_assert(offsetof(struct route, ro_flags) == offsetof(struct route_in6, ro_flags)); + static_assert(offsetof(struct route, ro_dst) == offsetof(struct route_in6, ro_dst)); PE_parse_boot_argn("rte_debug", &rte_debug, sizeof(rte_debug)); if (rte_debug != 0) { @@ -3185,6 +3186,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, struct sockaddr_storage dst_ss; struct sockaddr_storage mask_ss; boolean_t dontcare; + boolean_t empty_dst; char gbuf[MAX_IPv6_STR_LEN], 
s_dst[MAX_IPv6_STR_LEN], s_netmask[MAX_IPv6_STR_LEN]; VERIFY(!coarse || ifscope == IFSCOPE_NONE); @@ -3200,6 +3202,11 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, netmask = NULL; } + if (rt_verbose > 1) { + empty_dst = ((af == AF_INET && SIN(dst)->sin_addr.s_addr == 0) || + (af == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&SIN6(dst)->sin6_addr))); + } + /* * Non-scoped route lookup. */ @@ -3235,7 +3242,7 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, dontcare = (ifscope == IFSCOPE_NONE); #if (DEVELOPMENT || DEBUG) - if (rt_verbose > 2) { + if (rt_verbose > 2 && !empty_dst) { if (af == AF_INET) { (void) inet_ntop(af, &SIN(dst)->sin_addr.s_addr, s_dst, sizeof(s_dst)); @@ -3404,48 +3411,44 @@ rt_lookup_common(boolean_t lookup_only, boolean_t coarse, struct sockaddr *dst, } } - if (rn == NULL) { - if (rt_verbose == 2) { - if (af == AF_INET) { - (void) inet_ntop(af, &SIN(dst)->sin_addr.s_addr, - s_dst, sizeof(s_dst)); - } else { - (void) inet_ntop(af, &SIN6(dst)->sin6_addr, - s_dst, sizeof(s_dst)); - } + if (rn == NULL) { + if (rt_verbose > 1 && !empty_dst) { + if (af == AF_INET) { + (void) inet_ntop(af, &SIN(dst)->sin_addr.s_addr, + s_dst, sizeof(s_dst)); + } else { + (void) inet_ntop(af, &SIN6(dst)->sin6_addr, + s_dst, sizeof(s_dst)); + } - if (netmask != NULL && af == AF_INET) { - (void) inet_ntop(af, &SIN(netmask)->sin_addr.s_addr, - s_netmask, sizeof(s_netmask)); - } - if (netmask != NULL && af == AF_INET6) { - (void) inet_ntop(af, &SIN6(netmask)->sin6_addr, - s_netmask, sizeof(s_netmask)); - } else { - *s_netmask = '\0'; - } - os_log(OS_LOG_DEFAULT, "%s:%d (%s, %s, %u) return NULL\n", - __func__, __LINE__, s_dst, s_netmask, ifscope); - } else { - if (rt_verbose > 2) { - os_log(OS_LOG_DEFAULT, "%s:%d %u return NULL\n", __func__, __LINE__, ifscope); - } - } - } else if (rt_verbose > 2) { - char dbuf[MAX_SCOPE_ADDR_STR_LEN]; - rtentry_ref_t rt = RT(rn); + if (netmask != NULL && af == AF_INET) { + (void) inet_ntop(af, &SIN(netmask)->sin_addr.s_addr, + s_netmask, sizeof(s_netmask)); + } + if (netmask != NULL && af == AF_INET6) { + (void) inet_ntop(af, &SIN6(netmask)->sin6_addr, + s_netmask, sizeof(s_netmask)); + } else { + *s_netmask = '\0'; + } + os_log(OS_LOG_DEFAULT, "%s:%d (%s, %s, %u) return NULL\n", + __func__, __LINE__, s_dst, s_netmask, ifscope); + } + } else if (rt_verbose > 2) { + char dbuf[MAX_SCOPE_ADDR_STR_LEN]; + rtentry_ref_t rt = RT(rn); - rt_str(rt, dbuf, sizeof(dbuf), gbuf, sizeof(gbuf)); + rt_str(rt, dbuf, sizeof(dbuf), gbuf, sizeof(gbuf)); - os_log(OS_LOG_DEFAULT, "%s %u return %p to %s->%s->%s ifa_ifp %s\n", - __func__, ifscope, rt, - dbuf, gbuf, - (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "", - (rt->rt_ifa->ifa_ifp != NULL) ? - rt->rt_ifa->ifa_ifp->if_xname : ""); - } + os_log(OS_LOG_DEFAULT, "%s %u return %p to %s->%s->%s ifa_ifp %s\n", + __func__, ifscope, rt, + dbuf, gbuf, + (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "", + (rt->rt_ifa->ifa_ifp != NULL) ? + rt->rt_ifa->ifa_ifp->if_xname : ""); + } - return RT(rn); + return RT(rn); } struct rtentry * @@ -4773,7 +4776,7 @@ static __attribute__((unused)) void rtm_cassert(void) { /* - * This is equivalent to _CASSERT() and the compiler wouldn't + * This is equivalent to static_assert() and the compiler wouldn't * generate any instructions, thus for compile time only. 
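/*
 * Editor's note: illustrative sketch only, not part of the imported patch.
 * rt_lookup_qset_id() above resolves (and caches, keyed by the interface's
 * traffic-rule generation id) the queue-set id for a route's link-layer
 * gateway.  A transmit path that wants to steer packets could use it roughly
 * as below; the in_arp.c hunk later in this patch does the same thing when an
 * ARP entry is resolved.  example_tag_pkt_with_qset() is hypothetical, and
 * the caller is assumed to hold whatever route reference/lock it needs.
 */
static void
example_tag_pkt_with_qset(route_t rt, struct mbuf *m)
{
	/* only worth doing when the interface has ethernet traffic rules */
	if (rt->rt_ifp == NULL || rt->rt_ifp->if_eth_traffic_rule_count == 0) {
		return;
	}

	/* skip_if_no_change == true reuses rt_qset_id while the rule genid is unchanged */
	uint64_t qset_id = rt_lookup_qset_id(rt, true);

	m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QSET_ID_VALID;
	m->m_pkthdr.pkt_mpriv_qsetid = qset_id;
}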
*/ switch ((u_int16_t)0) { @@ -4828,3 +4831,57 @@ rtv_cassert(void) ; } } + +static inline ether_addr_t * +_sockaddr_get_lladdr(struct sockaddr * gateway) +{ + ether_addr_t *lladdr = NULL; + + if (gateway && gateway->sa_family == AF_LINK) { + struct sockaddr_dl *sdl = SDL(gateway); + + if (sdl->sdl_alen != 0) { + lladdr = (ether_addr_t *)LLADDR(sdl); + } + } + return lladdr; +} + +uint64_t +rt_lookup_qset_id(route_t rt, bool skip_if_no_change) +{ + ifnet_t ifp = rt->rt_ifp; + uint64_t qset_id; + + if (!ifp->if_eth_traffic_rule_count) { + DTRACE_IP1(no__eth__rules, route_t, rt); + qset_id = 0; + goto done; + } else if (!ifnet_sync_traffic_rule_genid(ifp, &rt->rt_tr_genid) && + skip_if_no_change) { + DTRACE_IP1(same__eth__rule__genid, route_t, rt); + qset_id = rt->rt_qset_id; + goto done; + } + + uint16_t eth_type = (rt_key(rt)->sa_family == AF_INET) + ? ETHERTYPE_IP : ETHERTYPE_IPV6; + ether_addr_t *eth_raddr = _sockaddr_get_lladdr(rt->rt_gateway); + + int err = nxctl_eth_traffic_rule_find_qset_id(ifp->if_xname, + eth_type, eth_raddr, &rt->rt_qset_id); + if (err != 0) { + DTRACE_IP3(qset__id__not__found__eth, + route_t, rt, + uint16_t, eth_type, ether_addr_t *, eth_raddr); + rt->rt_qset_id = 0; + } else { + DTRACE_IP3(qset__id__found__eth, + route_t, rt, + uint16_t, eth_type, ether_addr_t *, eth_raddr); + } + qset_id = rt->rt_qset_id; + +done: + return qset_id; +} diff --git a/bsd/net/route_private.h b/bsd/net/route_private.h index 731f1e051..4146d01e2 100644 --- a/bsd/net/route_private.h +++ b/bsd/net/route_private.h @@ -191,6 +191,8 @@ struct rtentry { u_int32_t rtt_min; /* minimum RTT computed from history */ u_int32_t rtt_expire_ts; /* RTT history expire timestamp */ u_int8_t rtt_index; /* Index into RTT history */ + uint64_t rt_qset_id; /* QSet to route packets to */ + uint32_t rt_tr_genid; /* Traffic rule gen id used to determine qset_id */ /* Event handler context for the rtentrt */ struct eventhandler_lists_ctxt rt_evhdlr_ctxt; }; @@ -204,9 +206,9 @@ rn_rtentry(struct radix_node *rn) /* Backward compatibility. */ #define RT(r) rn_rtentry((r)) -#define rt_key_free(r) ({ \ - void *__r __single = rt_key(r); \ - kheap_free_addr(KHEAP_DATA_BUFFERS, __r); \ +#define rt_key_free(r) ({ \ + void *__r __single = rt_key(r); \ + kfree_data_addr(__r); \ }) enum { @@ -539,6 +541,7 @@ extern void route_event_init(struct route_event *p_route_ev, struct rtentry *rt, extern int route_event_walktree(struct radix_node *rn, void *arg); extern void route_event_enqueue_nwk_wq_entry(struct rtentry *, struct rtentry *, uint32_t, eventhandler_tag, boolean_t); +extern uint64_t rt_lookup_qset_id(route_t, bool); #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NET_ROUTE_PRIVATE_H_ */ diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index 3cecea78d..e8fcfecf5 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -590,6 +591,10 @@ route_output(struct mbuf *m, struct socket *so) saved_nrt->rt_rmx.rmx_locks |= (RTM->rtm_inits & RTM->rtm_rmx.rmx_locks); saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK]; + if ((saved_nrt->rt_flags & (RTF_UP | RTF_LLINFO)) == + (RTF_UP | RTF_LLINFO)) { + rt_lookup_qset_id(saved_nrt, false); + } RT_REMREF_LOCKED(saved_nrt); RT_UNLOCK(saved_nrt); } @@ -1337,7 +1342,7 @@ rt_msg1(u_char type, struct rt_addrinfo *rtinfo) /* * Make sure to accomodate the largest possible size of sa_len. 
*/ - _CASSERT(sizeof(ssbuf) == (SOCK_MAXADDRLEN + 1)); + static_assert(sizeof(ssbuf) == (SOCK_MAXADDRLEN + 1)); if ((sa = rtinfo->rti_info[i]) == NULL) { continue; @@ -1431,7 +1436,7 @@ again: /* * Make sure to accomodate the largest possible size of sa_len. */ - _CASSERT(sizeof(ssbuf) == (SOCK_MAXADDRLEN + 1)); + static_assert(sizeof(ssbuf) == (SOCK_MAXADDRLEN + 1)); if ((sa = rtinfo->rti_info[i]) == NULL) { continue; diff --git a/bsd/net/siphash.c b/bsd/net/siphash.c new file mode 100644 index 000000000..e5b5a5f8e --- /dev/null +++ b/bsd/net/siphash.c @@ -0,0 +1,257 @@ +/*- + * Copyright (c) 2013 Andre Oppermann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d + * are the number of compression rounds and the number of finalization rounds. + * A compression round is identical to a finalization round and this round + * function is called SipRound. Given a 128-bit key k and a (possibly empty) + * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). + * + * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, + * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, + * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa + * https://131002.net/siphash/siphash.pdf + * https://131002.net/siphash/ + */ + +#include +#include +#include +#include +#include +#include +#include + +static void SipRounds(SIPHASH_CTX *ctx, int final); + +void +SipHash_InitX(SIPHASH_CTX *ctx, uint8_t rc, uint8_t rf) +{ + ctx->v[0] = 0x736f6d6570736575ull; + ctx->v[1] = 0x646f72616e646f6dull; + ctx->v[2] = 0x6c7967656e657261ull; + ctx->v[3] = 0x7465646279746573ull; + ctx->buf.b64 = 0; + ctx->bytes = 0; + ctx->buflen = 0; + ctx->rounds_compr = rc; + ctx->rounds_final = rf; + ctx->initialized = 1; +} + +void +SipHash_SetKey(SIPHASH_CTX *ctx, const uint8_t key[SIPHASH_KEY_LENGTH]) +{ + uint64_t k[2]; + + ASSERT(ctx->v[0] == 0x736f6d6570736575ull && + ctx->initialized == 1); + + k[0] = le64dec(&key[0]); + k[1] = le64dec(&key[8]); + + ctx->v[0] ^= k[0]; + ctx->v[1] ^= k[1]; + ctx->v[2] ^= k[0]; + ctx->v[3] ^= k[1]; + + ctx->initialized = 2; +} + +static const uint8_t *__indexable +SipBuf(SIPHASH_CTX *ctx, const uint8_t * __sized_by_or_null(len)src, size_t len, size_t *delta, int final) +{ + size_t x = 0; + + const uint8_t *buf = src; + + /* handle hashing 0 length buffer - needed for test vectors */ + if (len == 0 && final == 0) { + return 0; + } + + if (final) { + ASSERT(len == 0); + ctx->buf.b8[7] = (uint8_t)ctx->bytes; + } else { + ASSERT((len > 0) && src); + x = MIN(len, sizeof(ctx->buf.b64) - ctx->buflen); + bcopy(buf, &ctx->buf.b8[ctx->buflen], x); + ctx->buflen += x; + buf += x; + } + + if (ctx->buflen == 8 || final) { + ctx->v[3] ^= le64toh(ctx->buf.b64); + SipRounds(ctx, 0); + ctx->v[0] ^= le64toh(ctx->buf.b64); + ctx->buf.b64 = 0; + ctx->buflen = 0; + } + + if (delta != NULL) { + *delta = x; + } + + return buf; +} + +void +SipHash_Update(SIPHASH_CTX *ctx, const void *src __sized_by(len0), size_t len0) +{ + uint64_t m; + const uint64_t *p; + const uint8_t *s; + size_t rem; + size_t len = len0; + size_t len_in_bytes = 0; + + ASSERT(ctx->initialized == 2); + + s = src; + ctx->bytes += len; + + /* + * Push length smaller than block size into buffer or + * fill up the buffer if there is already something + * in it. + */ + if (ctx->buflen > 0 || len < 8) { + size_t delta = 0; + s = SipBuf(ctx, s, len, &delta, 0); + len -= delta; + } + if (len == 0) { + return; + } + + rem = len & 0x7; + len_in_bytes = len; + len >>= 3; + + /* Optimze for 64bit aligned/unaligned access. */ + if (((uintptr_t)s & 0x7) == 0) { + p = __unsafe_forge_bidi_indexable(const uint64_t *, + __builtin_assume_aligned((const uint8_t *__unsafe_indexable)s, sizeof(uint64_t)), len_in_bytes); + for (; len > 0; len--, p++) { + m = le64toh(*p); + ctx->v[3] ^= m; + SipRounds(ctx, 0); + ctx->v[0] ^= m; + } + s = (const uint8_t *)p; + } else { + for (; len > 0; len--, s += 8) { + m = le64dec(s); + ctx->v[3] ^= m; + SipRounds(ctx, 0); + ctx->v[0] ^= m; + } + } + + /* Push remainder into buffer. 
*/ + if (rem > 0) { + s = SipBuf(ctx, s, rem, NULL, 0); + } +} + +void +SipHash_Final(uint8_t dst[SIPHASH_DIGEST_LENGTH], SIPHASH_CTX *ctx) +{ + uint64_t r; + + ASSERT(ctx->initialized == 2); + + r = SipHash_End(ctx); + le64enc(dst, r); +} + +uint64_t +SipHash_End(SIPHASH_CTX *ctx) +{ + uint64_t r; + + ASSERT(ctx->initialized == 2); + + SipBuf(ctx, NULL, 0, NULL, 1); + + ctx->v[2] ^= 0xff; + SipRounds(ctx, 1); + r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); + + bzero(ctx, sizeof(*ctx)); + return r; +} + +uint64_t +SipHashX(SIPHASH_CTX *ctx, uint8_t rc, uint8_t rf, + const uint8_t key[SIPHASH_KEY_LENGTH], + const void *src __sized_by(len), size_t len) +{ + SipHash_InitX(ctx, rc, rf); + SipHash_SetKey(ctx, key); + SipHash_Update(ctx, src, len); + + return SipHash_End(ctx); +} + +#define SIP_ROTL(x, b) (uint64_t)(((x) << (b)) | ( (x) >> (64 - (b)))) + +static void +SipRounds(SIPHASH_CTX *ctx, int final) +{ + int rounds; + + if (!final) { + rounds = ctx->rounds_compr; + } else { + rounds = ctx->rounds_final; + } + + while (rounds--) { + ctx->v[0] += ctx->v[1]; + ctx->v[2] += ctx->v[3]; + ctx->v[1] = SIP_ROTL(ctx->v[1], 13); + ctx->v[3] = SIP_ROTL(ctx->v[3], 16); + + ctx->v[1] ^= ctx->v[0]; + ctx->v[3] ^= ctx->v[2]; + ctx->v[0] = SIP_ROTL(ctx->v[0], 32); + + ctx->v[2] += ctx->v[1]; + ctx->v[0] += ctx->v[3]; + ctx->v[1] = SIP_ROTL(ctx->v[1], 17); + ctx->v[3] = SIP_ROTL(ctx->v[3], 21); + + ctx->v[1] ^= ctx->v[2]; + ctx->v[3] ^= ctx->v[0]; + ctx->v[2] = SIP_ROTL(ctx->v[2], 32); + } +} diff --git a/bsd/net/siphash.h b/bsd/net/siphash.h new file mode 100644 index 000000000..cb7b89852 --- /dev/null +++ b/bsd/net/siphash.h @@ -0,0 +1,85 @@ +/*- + * Copyright (c) 2013 Andre Oppermann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) + * optimized for speed on short messages returning a 64bit hash/digest value. 
+ * + * The number of rounds is defined during the initialization: + * SipHash24_Init() for the fast and resonable strong version + * SipHash48_Init() for the strong version (half as fast) + * + * struct SIPHASH_CTX ctx; + * SipHash24_Init(&ctx); + * SipHash_SetKey(&ctx, "16bytes long key"); + * SipHash_Update(&ctx, pointer_to_string, length_of_string); + * SipHash_Final(output, &ctx); + */ + +#ifndef _NET_SIPHASH_H_ +#define _NET_SIPHASH_H_ + +#include +#include + +#define SIPHASH_BLOCK_LENGTH 8 +#define SIPHASH_KEY_LENGTH 16 +#define SIPHASH_DIGEST_LENGTH 8 + +typedef struct _SIPHASH_CTX { + uint64_t v[4]; + union { + uint64_t b64; + uint8_t b8[8]; + } buf; + uint64_t bytes; + uint8_t buflen; + uint8_t rounds_compr; + uint8_t rounds_final; + uint8_t initialized; +} SIPHASH_CTX; + + +#define SipHash24_Init(x) SipHash_InitX((x), 2, 4) +#define SipHash48_Init(x) SipHash_InitX((x), 4, 8) +void SipHash_InitX(SIPHASH_CTX *, uint8_t, uint8_t); +void SipHash_SetKey(SIPHASH_CTX *, + const uint8_t[SIPHASH_KEY_LENGTH]); +void SipHash_Update(SIPHASH_CTX *ctx, const void *src __sized_by(len0), size_t len0); +void SipHash_Final(uint8_t[SIPHASH_DIGEST_LENGTH], SIPHASH_CTX *); +uint64_t SipHash_End(SIPHASH_CTX *); + +#define SipHash24(x, y, z, i) SipHashX((x), 2, 4, (y), (z), (i)); +#define SipHash48(x, y, z, i) SipHashX((x), 4, 8, (y), (z), (i)); +uint64_t SipHashX(SIPHASH_CTX *ctx, uint8_t rc, uint8_t rf, + const uint8_t key[SIPHASH_KEY_LENGTH], const void *src __sized_by(len), size_t len); + +int SipHash24_TestVectors(void); + +#endif /* _SIPHASH_H_ */ diff --git a/bsd/netinet/Makefile b/bsd/netinet/Makefile index 8310b3bb8..a45074913 100644 --- a/bsd/netinet/Makefile +++ b/bsd/netinet/Makefile @@ -35,6 +35,7 @@ PRIVATE_DATAFILES = \ ip_flowid.h \ mptcp_var.h \ tcp.h \ + tcp_cache.h \ tcp_cc.h \ tcp_log.h \ tcp_private.h \ @@ -78,7 +79,7 @@ INSTALL_DRIVERKIT_MI_LIST = ${DRIVERKIT_DATAFILES} INSTALL_MI_DIR = netinet -INSTALL_MI_LCL_LIST = in_private.h in_stat.h tcp_private.h +INSTALL_MI_LCL_LIST = in_private.h in_stat.h tcp_private.h mptcp_var.h INSTALL_MODULEMAP_MI_LCL_LIST = ${PRIVATE_MODULEMAPFILES} diff --git a/bsd/netinet/cpu_in_cksum_gen.c b/bsd/netinet/cpu_in_cksum_gen.c index ab6c19a89..0292bcfc1 100644 --- a/bsd/netinet/cpu_in_cksum_gen.c +++ b/bsd/netinet/cpu_in_cksum_gen.c @@ -79,11 +79,6 @@ #define CKSUM_ERR(fmt, args...) fprintf_stderr(fmt, ## args) #endif /* !KERNEL */ -/* compile time assert */ -#ifndef _CASSERT -#define _CASSERT(x) _Static_assert(x, "compile-time assertion failed") -#endif /* !_CASSERT */ - #ifndef VERIFY #define VERIFY(EX) ((void)0) #endif /* !VERIFY */ @@ -95,6 +90,10 @@ #define PREDICT_TRUE(x) __builtin_expect(!!((long)(x)), 1L) #define PREDICT_FALSE(x) __builtin_expect(!!((long)(x)), 0L) +#if !defined(static_assert) +#define static_assert(x) _Static_assert(x, #x) +#endif + /* fake mbuf struct used only for calling os_cpu_in_cksum_mbuf() */ struct _mbuf { struct _mbuf *_m_next; @@ -168,20 +167,20 @@ os_cpu_in_cksum(const void *__sized_by(len) data, uint32_t len, uint32_t initial * sure the offsets are as expected. 
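/*
 * Editor's note: illustrative sketch only, not part of the imported patch.
 * Minimal use of the SipHash KPI added in bsd/net/siphash.h above: the
 * incremental Init/SetKey/Update/End sequence and the one-shot SipHash24()
 * wrapper yield the same 64-bit digest for the same key and message.  The
 * key bytes are arbitrary placeholders, and the <net/siphash.h> include path
 * assumes the usual bsd/net install location.
 */
#include <net/siphash.h>

static uint64_t
example_siphash24(const void *msg __sized_by(len), size_t len)
{
	static const uint8_t key[SIPHASH_KEY_LENGTH] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	};
	SIPHASH_CTX ctx;
	uint64_t h1, h2;

	/* incremental form: 2 compression rounds, 4 finalization rounds */
	SipHash24_Init(&ctx);
	SipHash_SetKey(&ctx, key);
	SipHash_Update(&ctx, msg, len);
	h1 = SipHash_End(&ctx);

	/* one-shot form; the macro's trailing ';' just adds an empty statement */
	h2 = SipHash24(&ctx, key, msg, len);

	VERIFY(h1 == h2);
	return h1;
}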
*/ #if defined(__LP64__) - _CASSERT(offsetof(struct _mbuf, _m_next) == 0); - _CASSERT(offsetof(struct _mbuf, _m_data) == 16); - _CASSERT(offsetof(struct _mbuf, _m_len) == 24); + static_assert(offsetof(struct _mbuf, _m_next) == 0); + static_assert(offsetof(struct _mbuf, _m_data) == 16); + static_assert(offsetof(struct _mbuf, _m_len) == 24); #else /* !__LP64__ */ - _CASSERT(offsetof(struct _mbuf, _m_next) == 0); - _CASSERT(offsetof(struct _mbuf, _m_data) == 8); - _CASSERT(offsetof(struct _mbuf, _m_len) == 12); + static_assert(offsetof(struct _mbuf, _m_next) == 0); + static_assert(offsetof(struct _mbuf, _m_data) == 8); + static_assert(offsetof(struct _mbuf, _m_len) == 12); #endif /* !__LP64__ */ #ifdef KERNEL - _CASSERT(offsetof(struct _mbuf, _m_next) == + static_assert(offsetof(struct _mbuf, _m_next) == offsetof(struct mbuf, m_next)); - _CASSERT(offsetof(struct _mbuf, _m_data) == + static_assert(offsetof(struct _mbuf, _m_data) == offsetof(struct mbuf, m_data)); - _CASSERT(offsetof(struct _mbuf, _m_len) == + static_assert(offsetof(struct _mbuf, _m_len) == offsetof(struct mbuf, m_len)); #endif /* KERNEL */ struct _mbuf m = { diff --git a/bsd/netinet/flow_divert.c b/bsd/netinet/flow_divert.c index bc2922383..e9fd6eba9 100644 --- a/bsd/netinet/flow_divert.c +++ b/bsd/netinet/flow_divert.c @@ -656,7 +656,7 @@ flow_divert_add_data_statistics(struct flow_divert_pcb *fd_cb, size_t data_len, { struct inpcb *inp = NULL; struct ifnet *ifp = NULL; - stats_functional_type ifnet_count_type = stats_functional_type_none; + stats_functional_type ifnet_count_type = stats_functional_type_unclassified; inp = sotoinpcb(fd_cb->so); if (inp == NULL) { @@ -673,13 +673,10 @@ flow_divert_add_data_statistics(struct flow_divert_pcb *fd_cb, size_t data_len, } if (send) { - INP_ADD_STAT(inp, ifnet_count_type, txpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, txbytes, data_len); + INP_ADD_TXSTAT(inp, ifnet_count_type, 1, data_len); } else { - INP_ADD_STAT(inp, ifnet_count_type, rxpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, rxbytes, data_len); + INP_ADD_RXSTAT(inp, ifnet_count_type, 1, data_len); } - inp_set_activity_bitmap(inp); } static errno_t @@ -3415,6 +3412,8 @@ flow_divert_detach(struct socket *so) sbflush(&so->so_snd); sbflush(&so->so_rcv); + flow_divert_disconnect_socket(so, !(fd_cb->flags & FLOW_DIVERT_IMPLICIT_CONNECT), false); + if (!fd_cb->plugin_locked) { socket_unlock(so, 0); FDLOCK(fd_cb); diff --git a/bsd/netinet/icmp6.h b/bsd/netinet/icmp6.h index 382d3368e..87caf68ba 100644 --- a/bsd/netinet/icmp6.h +++ b/bsd/netinet/icmp6.h @@ -343,6 +343,7 @@ struct nd_opt_hdr { /* Neighbor discovery option header */ #define ND_OPT_DNSSL 31 /* RFC 6106 */ #define ND_OPT_CAPTIVE_PORTAL 37 /* RFC 7710 */ #define ND_OPT_PREF64 38 /* RFC 8781 */ +#define ND_OPT_DNR 144 /* RFC 9463 */ struct nd_opt_prefix_info { /* prefix information */ u_int8_t nd_opt_pi_type; @@ -409,6 +410,20 @@ struct nd_opt_dnssl { /* domain name search list */ u_int8_t nd_opt_dnssl_domains[8]; } __attribute__((__packed__)); +/* + * DNR (Discovery of Network-designated Resolvers) RFC 9463 + */ +struct nd_opt_dnr { + u_int8_t nd_opt_dnr_type; + u_int8_t nd_opt_dnr_len; + u_int8_t nd_opt_dnr_svc_priority[2]; + u_int8_t nd_opt_dnr_lifetime[4]; + u_int8_t nd_opt_dnr_adn_len[2]; + u_int8_t nd_opt_dnr_continuation[1]; +} __attribute__((__packed__)); + +#define ND_OPT_DNR_MIN_LENGTH offsetof(struct nd_opt_dnr, nd_opt_dnr_continuation) + /* * PREF64 (NAT64 prefix) RFC 8781 */ @@ -728,7 +743,8 @@ struct icmp6stat { #define ICMPV6CTL_ND6_ACCEPT_6TO4 25 
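/*
 * Editor's note: illustrative sketch only, not part of the imported patch.
 * The RFC 9463 DNR option fields above are declared as raw byte arrays, so a
 * consumer assembles the big-endian values itself and validates the lengths
 * before using them.  example_parse_nd_opt_dnr() is a hypothetical decode
 * helper; the actual nd6 option handling is not part of this hunk.
 */
static int
example_parse_nd_opt_dnr(const struct nd_opt_dnr *dnr, size_t optlen,
    uint16_t *prio, uint32_t *lifetime, uint16_t *adn_len)
{
	/* nd_opt_dnr_len counts 8-octet units, like every ND option */
	size_t dnr_len = (size_t)dnr->nd_opt_dnr_len << 3;

	if (dnr->nd_opt_dnr_type != ND_OPT_DNR ||
	    dnr_len < ND_OPT_DNR_MIN_LENGTH || dnr_len > optlen) {
		return EINVAL;
	}

	*prio = (uint16_t)((dnr->nd_opt_dnr_svc_priority[0] << 8) |
	    dnr->nd_opt_dnr_svc_priority[1]);
	*lifetime = ((uint32_t)dnr->nd_opt_dnr_lifetime[0] << 24) |
	    ((uint32_t)dnr->nd_opt_dnr_lifetime[1] << 16) |
	    ((uint32_t)dnr->nd_opt_dnr_lifetime[2] << 8) |
	    (uint32_t)dnr->nd_opt_dnr_lifetime[3];
	*adn_len = (uint16_t)((dnr->nd_opt_dnr_adn_len[0] << 8) |
	    dnr->nd_opt_dnr_adn_len[1]);

	/* the Authentication Domain Name begins at nd_opt_dnr_continuation */
	if (ND_OPT_DNR_MIN_LENGTH + (size_t)*adn_len > dnr_len) {
		return EINVAL;
	}
	return 0;
}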
#define ICMPV6CTL_ND6_OPTIMISTIC_DAD 26 /* RFC 4429 */ #define ICMPV6CTL_ERRPPSLIMIT_RANDOM_INCR 27 -#define ICMPV6CTL_MAXID 28 +#define ICMPV6CTL_ND6_RTILIST 28 +#define ICMPV6CTL_MAXID 29 #ifdef BSD_KERNEL_PRIVATE #define ICMPV6CTL_NAMES { \ @@ -766,8 +782,7 @@ struct rtentry; struct rttimer; struct in6_multi; # endif -struct ip6protosw; -void icmp6_init(struct ip6protosw *, struct domain *); +void icmp6_init(struct protosw *, struct domain *); void icmp6_paramerror(struct mbuf *, int); void icmp6_error_flag(struct mbuf *, int, int, int, int); diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index f6dc05b93..9bd98c73e 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -96,6 +96,7 @@ #include #include +#include #include #include @@ -2973,7 +2974,6 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi, struct igmp_tparams *itp) { struct ifnet *ifp; - int retval = 0; INM_LOCK_ASSERT_HELD(inm); IGI_LOCK_ASSERT_NOTHELD(igi); @@ -3008,13 +3008,13 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi, IF_DRAIN(&inm->inm_scq); - retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); + int retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); itp->cst = (inm->inm_scq.ifq_len > 0); IGMP_PRINTF(("%s: enqueue record = %d\n", __func__, retval)); + // N.B.: igmp_v3_enqueue_group_record() returned the number of bytes sent. if (retval <= 0) { IGI_UNLOCK(igi); - retval *= -1; - goto done; + return -retval; } /* * If record(s) were enqueued, start the state-change @@ -3025,7 +3025,7 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi, itp->sct = 1; IGI_UNLOCK(igi); done: - return retval; + return 0; } /* @@ -4024,7 +4024,7 @@ igmp_sendpkt(struct mbuf *m) /* * Check if the ifnet is still attached. */ - if (ifp == NULL || !ifnet_is_attached(ifp, 0)) { + if (ifp == NULL || !ifnet_is_fully_attached(ifp)) { os_log_error(OS_LOG_DEFAULT, "%s: dropped 0x%llx as interface went away\n", __func__, (uint64_t)VM_KERNEL_ADDRPERM(m)); m_freem(m); @@ -4215,10 +4215,9 @@ igmp_init(struct protosw *pp, struct domain *dp) VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); - if (igmp_initialized) { + if (!os_atomic_cmpxchg(&igmp_initialized, 0, 1, relaxed)) { return; } - igmp_initialized = 1; os_log(OS_LOG_DEFAULT, "%s: initializing\n", __func__); igmp_timers_are_running = 0; LIST_INIT(&igi_head); diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index e72c547a2..9ef30e293 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2021 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -700,13 +700,8 @@ inctl_ifaddr(struct ifnet *ifp, struct in_ifaddr *ia, u_long cmd, case SIOCDIFADDR: /* struct ifreq */ VERIFY(ia != NULL); - error = ifnet_ioctl(ifp, PF_INET, SIOCDIFADDR, ia); - if (error == EOPNOTSUPP) { - error = 0; - } - if (error != 0) { - break; - } + + (void)ifnet_ioctl(ifp, PF_INET, SIOCDIFADDR, ia); /* Fill out the kernel event information */ ev_msg.vendor_code = KEV_VENDOR_APPLE; @@ -2239,7 +2234,7 @@ static __attribute__((unused)) void ipproto_cassert(void) { /* - * This is equivalent to _CASSERT() and the compiler wouldn't + * This is equivalent to static_assert() and the compiler wouldn't * generate any instructions, thus for compile time only. 
*/ switch ((u_int16_t)0) { @@ -2431,6 +2426,7 @@ ipsockopt_cassert(void) case IP_NO_IFT_CELLULAR: // #define IP_NO_IFT_PDP IP_NO_IFT_CELLULAR /* deprecated */ case IP_OUT_IF: + case IP_RECV_LINK_ADDR_TYPE: ; } } diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index 6dbe279c8..86354c825 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -85,10 +85,11 @@ #include #include #include -#include -#include #include +#include +#include +#include #include @@ -1428,66 +1429,67 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, } rt_ifa = route->rt_ifa; - /* Become a regular mutex, just in case */ - RT_CONVERT_LOCK(route); - IFLR_LOCK_SPIN(lr); + if (unreachable || (llinfo->la_flags & LLINFO_PROBING)) { + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(route); + IFLR_LOCK_SPIN(lr); - if ((unreachable || (llinfo->la_flags & LLINFO_PROBING)) && - lr->lr_probes < arp_unicast_lim) { - /* - * Thus mark the entry with la_probeexp deadline to - * trigger the probe timer to be scheduled (if not - * already). This gets cleared the moment we get - * an ARP reply. - */ - probing = TRUE; - if (lr->lr_probes == 0) { - llinfo->la_probeexp = (timenow + arpt_probe); - llinfo->la_flags |= LLINFO_PROBING; + if (lr->lr_probes < arp_unicast_lim) { /* - * Provide notification that ARP unicast - * probing has started. - * We only do it for the first unicast probe - * attempt. + * Thus mark the entry with la_probeexp deadline to + * trigger the probe timer to be scheduled (if not + * already). This gets cleared the moment we get + * an ARP reply. */ - send_probe_notif = TRUE; - } + probing = TRUE; + if (lr->lr_probes == 0) { + llinfo->la_probeexp = (timenow + arpt_probe); + llinfo->la_flags |= LLINFO_PROBING; + /* + * Provide notification that ARP unicast + * probing has started. + * We only do it for the first unicast probe + * attempt. + */ + send_probe_notif = TRUE; + } - /* - * Start the unicast probe and anticipate a reply; - * afterwards, return existing entry to caller and - * let it be used anyway. If peer is non-existent - * we'll broadcast ARP next time around. - */ - lr->lr_probes++; - SOCKADDR_ZERO(&sdl, sizeof(sdl)); - sdl.sdl_alen = ifp->if_addrlen; - bcopy(&lr->lr_key.addr, LLADDR(&sdl), - ifp->if_addrlen); - IFLR_UNLOCK(lr); - IFA_LOCK_SPIN(rt_ifa); - ifa_addref(rt_ifa); - sa = rt_ifa->ifa_addr; - IFA_UNLOCK(rt_ifa); - rtflags = route->rt_flags; - RT_UNLOCK(route); - dlil_send_arp(ifp, ARPOP_REQUEST, NULL, sa, - SDL(&sdl), - SA(net_dest), rtflags); - ifa_remref(rt_ifa); - RT_LOCK(route); - goto release; - } else { - IFLR_UNLOCK(lr); - if (!unreachable && - !(llinfo->la_flags & LLINFO_PROBING)) { /* - * Normal case where peer is still reachable, - * we're not probing and if_addrlen is anything - * but IF_LLREACH_MAXLEN. + * Start the unicast probe and anticipate a reply; + * afterwards, return existing entry to caller and + * let it be used anyway. If peer is non-existent + * we'll broadcast ARP next time around. 
*/ + lr->lr_probes++; + SOCKADDR_ZERO(&sdl, sizeof(sdl)); + sdl.sdl_alen = ifp->if_addrlen; + bcopy(&lr->lr_key.addr, LLADDR(&sdl), + ifp->if_addrlen); + IFLR_UNLOCK(lr); + IFA_LOCK_SPIN(rt_ifa); + ifa_addref(rt_ifa); + sa = rt_ifa->ifa_addr; + IFA_UNLOCK(rt_ifa); + rtflags = route->rt_flags; + RT_UNLOCK(route); + dlil_send_arp(ifp, ARPOP_REQUEST, NULL, sa, + SDL(&sdl), + SA(net_dest), rtflags); + ifa_remref(rt_ifa); + RT_LOCK(route); goto release; } + + IFLR_UNLOCK(lr); + } + if (!unreachable && + !(llinfo->la_flags & LLINFO_PROBING)) { + /* + * Normal case where peer is still reachable, + * we're not probing and if_addrlen is anything + * but IF_LLREACH_MAXLEN. + */ + goto release; } } @@ -1603,6 +1605,18 @@ release: } if (route != NULL) { + /* Set qset id only if there are traffic rules. Else, for bridge + * use cases, the flag will be set and traffic rules won't be + * run on the downstream interface. + */ + if (result == 0 && ifp->if_eth_traffic_rule_count) { + uint64_t qset_id = rt_lookup_qset_id(route, true); + if (packet != NULL) { + packet->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QSET_ID_VALID; + packet->m_pkthdr.pkt_mpriv_qsetid = qset_id; + } + } + if (send_probe_notif) { arp_send_probe_notification(route); } @@ -2115,6 +2129,7 @@ match: llinfo->la_prbreq_cnt = 0; if (rt_evcode) { + rt_lookup_qset_id(route, false); /* * Enqueue work item to invoke callback for this route entry */ diff --git a/bsd/netinet/in_mcast.c b/bsd/netinet/in_mcast.c index f793b74e7..f0536b00c 100644 --- a/bsd/netinet/in_mcast.c +++ b/bsd/netinet/in_mcast.c @@ -243,7 +243,7 @@ inm_is_ifp_detached(const struct in_multi *inm) VERIFY(inm->inm_ifma != NULL); VERIFY(inm->inm_ifp == inm->inm_ifma->ifma_ifp); - return !ifnet_is_attached(inm->inm_ifp, 0); + return !ifnet_is_fully_attached(inm->inm_ifp); } /* diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index 23e28873f..5bef6846b 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -86,6 +86,7 @@ #include +#include #include #include @@ -134,6 +135,9 @@ #include +extern int udp_use_randomport; +extern int tcp_use_randomport; + extern const char *proc_name_address(struct proc *); static LCK_GRP_DECLARE(inpcb_lock_grp, "inpcb"); @@ -672,7 +676,7 @@ in_pcb_check_ultra_constrained_entitled(struct inpcb *inp) if (if_ultra_constrained_check_needed) { inp->inp_flags2 |= INP2_ULTRA_CONSTRAINED_CHECKED; - if (IOCurrentTaskHasEntitlement(ULTRA_CONSTRAINED_ENTITLEMENT)) { + if (if_ultra_constrained_default_allowed || IOCurrentTaskHasEntitlement(ULTRA_CONSTRAINED_ENTITLEMENT)) { inp->inp_flags2 |= INP2_ULTRA_CONSTRAINED_ALLOWED; } } @@ -689,78 +693,32 @@ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p) { #pragma unused(p) + void *__unsafe_indexable addr; struct inpcb *inp; - caddr_t temp; - if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) { - void *__unsafe_indexable addr = __zalloc_flags(pcbinfo->ipi_zone, - Z_WAITOK | Z_ZERO | Z_NOFAIL); - __builtin_assume(addr != NULL); - /* - * N.B: the allocation above may actually be inp_tp - * which is a structure that includes inpcb, but for - * the purposes of this function we just touch - * struct inpcb. 
- */ - inp = __unsafe_forge_single(struct inpcb *, addr); - } else { - inp = (struct inpcb *)(void *)so->so_saved_pcb; - temp = inp->inp_saved_ppcb; - bzero((caddr_t)inp, sizeof(*inp)); - inp->inp_saved_ppcb = temp; + if (proto_memacct_hardlimit(so->so_proto)) { + return ENOBUFS; } + addr = __zalloc_flags(pcbinfo->ipi_zone, Z_WAITOK_ZERO_NOFAIL); + __builtin_assume(addr != NULL); + + proto_memacct_add(so->so_proto, kalloc_type_size(pcbinfo->ipi_zone)); + + /* + * N.B: the allocation above may actually be inp_tp + * which is a structure that includes inpcb, but for + * the purposes of this function we just touch + * struct inpcb. + */ + inp = __unsafe_forge_single(struct inpcb *, addr); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; -#define INP_ALIGN_AND_CAST(_type, _ptr) ({ \ - typeof((_type)(void *__header_bidi_indexable)NULL) __roundup_type;\ - const volatile char *__roundup_align_ptr = (const volatile char *)(_ptr); \ - __roundup_align_ptr += P2ROUNDUP((uintptr_t)__roundup_align_ptr, \ - _Alignof(typeof(*__roundup_type))) - (uintptr_t)__roundup_align_ptr; \ - __DEQUALIFY(_type, __roundup_align_ptr); \ -}) - /* make sure inp_stat is always 64-bit aligned */ - inp->inp_stat = INP_ALIGN_AND_CAST(struct inp_stat *, inp->inp_stat_store); - if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store) + - sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) { - panic("%s: insufficient space to align inp_stat", __func__); - /* NOTREACHED */ - } - - /* make sure inp_cstat is always 64-bit aligned */ - inp->inp_cstat = INP_ALIGN_AND_CAST(struct inp_stat *, inp->inp_cstat_store); - if (((uintptr_t)inp->inp_cstat - (uintptr_t)inp->inp_cstat_store) + - sizeof(*inp->inp_cstat) > sizeof(inp->inp_cstat_store)) { - panic("%s: insufficient space to align inp_cstat", __func__); - /* NOTREACHED */ - } - - /* make sure inp_wstat is always 64-bit aligned */ - inp->inp_wstat = INP_ALIGN_AND_CAST(struct inp_stat *, inp->inp_wstat_store); - if (((uintptr_t)inp->inp_wstat - (uintptr_t)inp->inp_wstat_store) + - sizeof(*inp->inp_wstat) > sizeof(inp->inp_wstat_store)) { - panic("%s: insufficient space to align inp_wstat", __func__); - /* NOTREACHED */ - } - - /* make sure inp_Wstat is always 64-bit aligned */ - inp->inp_Wstat = INP_ALIGN_AND_CAST(struct inp_stat *, inp->inp_Wstat_store); - if (((uintptr_t)inp->inp_Wstat - (uintptr_t)inp->inp_Wstat_store) + - sizeof(*inp->inp_Wstat) > sizeof(inp->inp_Wstat_store)) { - panic("%s: insufficient space to align inp_Wstat", __func__); - /* NOTREACHED */ - } - - /* make sure inp_btstat is always 64-bit aligned */ - inp->inp_btstat = INP_ALIGN_AND_CAST(struct inp_stat *, inp->inp_btstat_store); - if (((uintptr_t)inp->inp_btstat - (uintptr_t)inp->inp_btstat_store) + - sizeof(*inp->inp_btstat) > sizeof(inp->inp_btstat_store)) { - panic("%s: insufficient space to align inp_btstat", __func__); - /* NOTREACHED */ - } -#undef INP_ALIGN_AND_CAST so->so_pcb = (caddr_t)inp; + // There was some history about alignment of statistics counters + // Ensure that all is as expected + VERIFY(IS_P2ALIGNED(&inp->inp_mstat, sizeof(u_int64_t))); if (so->so_proto->pr_flags & PR_PCBLOCK) { lck_mtx_init(&inp->inpcb_mtx, pcbinfo->ipi_lock_grp, @@ -780,6 +738,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, struct proc *p) (void) inp_update_policy(inp); + inp->inp_max_pacing_rate = UINT64_MAX; + lck_rw_lock_exclusive(&pcbinfo->ipi_lock); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); @@ 
-1226,7 +1186,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct sockaddr *remote, str /* * Skip if this is a restricted port as we do not want to - * restricted ports as ephemeral + * use restricted ports as ephemeral */ if (IS_RESTRICTED_IN_PORT(lport)) { continue; @@ -1288,7 +1248,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct sockaddr *remote, str /* * Skip if this is a restricted port as we do not want to - * restricted ports as ephemeral + * use restricted ports as ephemeral */ if (IS_RESTRICTED_IN_PORT(lport)) { continue; @@ -1956,7 +1916,7 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p, inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) { - nstat_pcb_invalidate_cache(inp); + nstat_udp_pcb_invalidate_cache(inp); } in_pcbrehash(inp); lck_rw_done(&inp->inp_pcbinfo->ipi_lock); @@ -1969,7 +1929,7 @@ in_pcbdisconnect(struct inpcb *inp) struct socket *so = inp->inp_socket; if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) { - nstat_pcb_cache(inp); + nstat_udp_pcb_cache(inp); } inp->inp_faddr.s_addr = INADDR_ANY; @@ -2018,8 +1978,8 @@ in_pcbdetach(struct inpcb *inp) } #endif /* IPSEC */ - if (inp->inp_stat != NULL && SOCK_PROTO(so) == IPPROTO_UDP) { - if (inp->inp_stat->rxpackets == 0 && inp->inp_stat->txpackets == 0) { + if (SOCK_PROTO(so) == IPPROTO_UDP) { + if (inp->inp_mstat.ms_total.ts_rxpackets == 0 && inp->inp_mstat.ms_total.ts_txpackets == 0) { INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_no_data); } } @@ -2081,9 +2041,8 @@ in_pcbdetach(struct inpcb *inp) * Schedule a notification to report that flow is * using client side translation. */ - if (inp->inp_stat != NULL && - (inp->inp_stat->txbytes != 0 || - inp->inp_stat->rxbytes != 0)) { + if (inp->inp_mstat.ms_total.ts_txbytes != 0 || + inp->inp_mstat.ms_total.ts_rxbytes != 0) { if (so->so_flags & SOF_DELEGATED) { in6_clat46_event_enqueue_nwk_wq_entry( IN6_CLAT46_EVENT_V4_FLOW, @@ -2165,7 +2124,6 @@ in_pcbdispose(struct inpcb *inp) } /* makes sure we're not called twice from so_close */ so->so_flags |= SOF_PCBCLEARING; - so->so_saved_pcb = (caddr_t)inp; so->so_pcb = NULL; inp->inp_socket = NULL; #if NECP @@ -2177,9 +2135,9 @@ in_pcbdispose(struct inpcb *inp) * we deallocate the structure. 
*/ ROUTE_RELEASE(&inp->inp_route); - if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) { - zfree(ipi->ipi_zone, inp); - } + zfree(ipi->ipi_zone, inp); + proto_memacct_sub(so->so_proto, kalloc_type_size(ipi->ipi_zone)); + sodealloc(so); } } @@ -3782,10 +3740,10 @@ inp_flush(struct inpcb *inp, int optval) oifp = inp->inp_last_outifp; if (rtifp != NULL) { - if_qflush_sc(rtifp, so_tc2msc(optval), flowhash, NULL, NULL, 0); + if_qflush_sc(rtifp, so_tc2msc(optval), flowhash, NULL, NULL); } if (oifp != NULL && oifp != rtifp) { - if_qflush_sc(oifp, so_tc2msc(optval), flowhash, NULL, NULL, 0); + if_qflush_sc(oifp, so_tc2msc(optval), flowhash, NULL, NULL); } return 0; @@ -4350,16 +4308,11 @@ inp_update_netns_flags(struct socket *so) } #endif /* SKYWALK */ -inline void -inp_set_activity_bitmap(struct inpcb *inp) -{ - in_stat_set_activity_bitmap(&inp->inp_nw_activity, net_uptime()); -} - inline void inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *ab) { - bcopy(&inp->inp_nw_activity, ab, sizeof(*ab)); + // Just grab the total bitmap until we have more precision in bitmap retrieval + bcopy(&inp->inp_mstat.ms_total.ts_bitmap, ab, sizeof(*ab)); } void @@ -4383,6 +4336,7 @@ inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep) } else { inp->inp_e_proc_name[0] = 0; } + nstat_pcb_update_last_owner(inp); } void @@ -4539,3 +4493,211 @@ inp_exit_bind_in_progress(struct socket *so) wakeup_one((caddr_t)&inp->inp_bind_in_progress_waiters); } } + +/* + * XXX: this is borrowed from in6_pcbsetport(). If possible, we should + * share this function by all *bsd*... + */ +int +in_pcbsetport(struct in_addr laddr, struct sockaddr *remote, struct inpcb *inp, struct proc *p, + int locked) +{ + struct socket *__single so = inp->inp_socket; + uint16_t lport = 0, first, last, rand_port; + uint16_t *__single lastport; + int count, error = 0, wild = 0; + boolean_t counting_down; + bool found, randomport; + struct inpcbinfo *__single pcbinfo = inp->inp_pcbinfo; + kauth_cred_t __single cred; +#if SKYWALK + bool laddr_unspecified = laddr.s_addr == INADDR_ANY; +#else +#pragma unused(laddr) +#endif + if (!locked) { /* Make sure we don't run into a deadlock: 4052373 */ + if (!lck_rw_try_lock_exclusive(&pcbinfo->ipi_lock)) { + socket_unlock(inp->inp_socket, 0); + lck_rw_lock_exclusive(&pcbinfo->ipi_lock); + socket_lock(inp->inp_socket, 0); + } + + /* + * Check if a local port was assigned to the inp while + * this thread was waiting for the pcbinfo lock + */ + if (inp->inp_lport != 0) { + VERIFY(inp->inp_flags2 & INP2_INHASHLIST); + lck_rw_done(&pcbinfo->ipi_lock); + + /* + * It is not an error if another thread allocated + * a port + */ + return 0; + } + } + + /* XXX: this is redundant when called from in6_pcbbind */ + if ((so->so_options & (SO_REUSEADDR | SO_REUSEPORT)) == 0) { + wild = INPLOOKUP_WILDCARD; + } + + randomport = (so->so_flags & SOF_BINDRANDOMPORT) > 0 || + (so->so_type == SOCK_STREAM ? 
tcp_use_randomport : + udp_use_randomport) > 0; + + if (inp->inp_flags & INP_HIGHPORT) { + first = (uint16_t)ipport_hifirstauto; /* sysctl */ + last = (uint16_t)ipport_hilastauto; + lastport = &pcbinfo->ipi_lasthi; + } else if (inp->inp_flags & INP_LOWPORT) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { + if (!locked) { + lck_rw_done(&pcbinfo->ipi_lock); + } + return error; + } + first = (uint16_t)ipport_lowfirstauto; /* 1023 */ + last = (uint16_t)ipport_lowlastauto; /* 600 */ + lastport = &pcbinfo->ipi_lastlow; + } else { + first = (uint16_t)ipport_firstauto; /* sysctl */ + last = (uint16_t)ipport_lastauto; + lastport = &pcbinfo->ipi_lastport; + } + + if (first == last) { + randomport = false; + } + /* + * Simple check to ensure all ports are not used up causing + * a deadlock here. + */ + found = false; + if (first > last) { + /* counting down */ + if (randomport) { + read_frandom(&rand_port, sizeof(rand_port)); + *lastport = first - (rand_port % (first - last)); + } + count = first - last; + counting_down = TRUE; + } else { + /* counting up */ + if (randomport) { + read_frandom(&rand_port, sizeof(rand_port)); + *lastport = first + (rand_port % (first - last)); + } + count = last - first; + counting_down = FALSE; + } + do { + if (count-- < 0) { /* completely used? */ + /* + * Undo any address bind that may have + * occurred above. + */ + inp->in6p_laddr = in6addr_any; + inp->in6p_last_outifp = NULL; +#if SKYWALK + if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) { + netns_set_ifnet(&inp->inp_netns_token, + NULL); + } +#endif /* SKYWALK */ + if (!locked) { + lck_rw_done(&pcbinfo->ipi_lock); + } + return EAGAIN; + } + if (counting_down) { + --*lastport; + if (*lastport > first || *lastport < last) { + *lastport = first; + } + } else { + ++*lastport; + if (*lastport < first || *lastport > last) { + *lastport = first; + } + } + lport = htons(*lastport); + + /* + * Skip if this is a restricted port as we do not want to + * use restricted ports as ephemeral + */ + if (IS_RESTRICTED_IN_PORT(lport)) { + continue; + } + + found = (in_pcblookup_local(pcbinfo, inp->inp_laddr, + lport, wild) == NULL); +#if SKYWALK + if (found && + (SOCK_PROTO(so) == IPPROTO_TCP || + SOCK_PROTO(so) == IPPROTO_UDP) && + !(inp->inp_flags2 & INP2_EXTERNAL_PORT)) { + if (laddr_unspecified && + (inp->inp_vflag & INP_IPV6) != 0 && + (inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { + struct in_addr ip_zero = { .s_addr = 0 }; + + netns_release(&inp->inp_wildcard_netns_token); + if (netns_reserve_in( + &inp->inp_wildcard_netns_token, + ip_zero, + (uint8_t)SOCK_PROTO(so), lport, + NETNS_BSD, NULL) != 0) { + /* port in use in IPv4 namespace */ + found = false; + } + } + if (found && + netns_reserve_in(&inp->inp_netns_token, + inp->inp_laddr, (uint8_t)SOCK_PROTO(so), lport, + NETNS_BSD, NULL) != 0) { + netns_release(&inp->inp_wildcard_netns_token); + found = false; + } + } +#endif /* SKYWALK */ + } while (!found); + + inp->inp_lport = lport; + inp->inp_flags |= INP_ANONPORT; + + bool is_ipv6 = (inp->inp_vflag & INP_IPV6); + if (is_ipv6) { + inp->inp_vflag &= ~INP_IPV6; + } + + if (in_pcbinshash(inp, remote, 1) != 0) { + inp->inp_last_outifp = NULL; + inp->inp_lifscope = IFSCOPE_NONE; +#if SKYWALK + netns_release(&inp->inp_netns_token); +#endif /* SKYWALK */ + inp->inp_lport = 0; + inp->inp_flags &= ~INP_ANONPORT; + if (is_ipv6) { + inp->inp_vflag |= INP_IPV6; + } + if (!locked) { + lck_rw_done(&pcbinfo->ipi_lock); + } + return EAGAIN; + } + 
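The in_pcbsetport() body added above walks the configured ephemeral range from a randomized starting point, wraps at the bounds, skips restricted ports, and gives up with EAGAIN once every candidate has been tried. A compact sketch of that scan, with hypothetical stand-ins for IS_RESTRICTED_IN_PORT() and the in_pcblookup_local() collision check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the kernel checks used above. */
static bool port_is_restricted(uint16_t port) { return port == 5060; }
static bool port_is_in_use(uint16_t port)     { return port % 7 == 0; }

/*
 * Scan the range from a random starting offset, counting up or down
 * depending on how the bounds are ordered; return 0 and the chosen port,
 * or -1 (the EAGAIN case) when every candidate is restricted or in use.
 */
static int
pick_ephemeral_port(uint16_t first, uint16_t last, uint16_t *chosen)
{
    bool counting_down = first > last;
    uint16_t lo = counting_down ? last : first;
    uint16_t hi = counting_down ? first : last;
    int count = hi - lo;
    uint16_t cursor = (uint16_t)(lo + (rand() % (count ? count : 1)));
    bool found = false;

    do {
        if (count-- < 0) {
            return -1;               /* range completely used */
        }
        if (counting_down) {
            cursor = (cursor <= lo) ? hi : (uint16_t)(cursor - 1);
        } else {
            cursor = (cursor >= hi) ? lo : (uint16_t)(cursor + 1);
        }
        if (port_is_restricted(cursor)) {
            continue;                /* never hand out restricted ports */
        }
        found = !port_is_in_use(cursor);
    } while (!found);

    *chosen = cursor;
    return 0;
}

int
main(void)
{
    uint16_t port;

    if (pick_ephemeral_port(49152, 65535, &port) == 0) {
        printf("picked ephemeral port %u\n", port);
    }
    return 0;
}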
if (is_ipv6) { + inp->inp_vflag |= INP_IPV6; + } + + if (!locked) { + lck_rw_done(&pcbinfo->ipi_lock); + } + return 0; +} diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index a9d5ac700..09bafd582 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -133,19 +134,14 @@ struct in_addr_4in6 { struct icmp6_filter; struct ifnet; -struct inp_stat { - u_int64_t rxpackets; - u_int64_t rxbytes; - u_int64_t txpackets; - u_int64_t txbytes; -}; - typedef enum { - stats_functional_type_none = 0, - stats_functional_type_cell = 1, - stats_functional_type_wifi = 2, - stats_functional_type_wired = 3, - stats_functional_type_bluetooth = 4 + stats_functional_type_untracked = 0, /* Deliberately ignored for detailed stats, e.g. loopback */ + stats_functional_type_cell = 1, + stats_functional_type_wifi_infra = 2, + stats_functional_type_wifi_non_infra = 3, + stats_functional_type_wired = 4, + stats_functional_type_bluetooth = 5, + stats_functional_type_unclassified = 6, /* Catch-all, appearance may need further investigation */ } stats_functional_type; struct inp_necp_attributes { @@ -237,7 +233,6 @@ struct inpcb { uint64_t inp_fadv_start_time; uint64_t inp_fadv_cnt; - caddr_t inp_saved_ppcb; /* place to save pointer while cached */ #if IPSEC struct inpcbpolicy *inp_sp; /* for IPsec */ #endif /* IPSEC */ @@ -264,52 +259,109 @@ struct inpcb { uint8_t inp_keepalive_datalen; /* keepalive data length */ uint8_t inp_keepalive_type; /* type of application */ uint16_t inp_keepalive_interval; /* keepalive interval */ - uint32_t inp_nstat_refcnt __attribute__((aligned(4))); - struct inp_stat *inp_stat; - struct inp_stat *inp_cstat; /* cellular data */ - struct inp_stat *inp_wstat; /* Wi-Fi data */ - struct inp_stat *inp_Wstat; /* Wired data */ - struct inp_stat *inp_btstat; /* Bluetooth data */ - uint8_t inp_stat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; - uint8_t inp_cstat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; - uint8_t inp_wstat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; - uint8_t inp_Wstat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; - uint8_t inp_btstat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; - activity_bitmap_t inp_nw_activity; + struct nstat_sock_locus *inp_nstat_locus __attribute__((aligned(sizeof(u_int64_t)))); + struct media_stats inp_mstat __attribute__((aligned(8))); /* All counts, total/cell/wifi etc */ uint64_t inp_start_timestamp; uint64_t inp_connect_timestamp; char inp_last_proc_name[MAXCOMLEN + 1]; char inp_e_proc_name[MAXCOMLEN + 1]; + + uint64_t inp_max_pacing_rate; /* Per-connection maximumg pacing rate to be enforced (Bytes/second) */ }; -#define IFNET_COUNT_TYPE(_ifp) \ - IFNET_IS_CELLULAR(_ifp) ? stats_functional_type_cell: \ - IFNET_IS_WIFI(_ifp) ? stats_functional_type_wifi: \ - IFNET_IS_WIRED(_ifp) ? stats_functional_type_wired: \ - IFNET_IS_COMPANION_LINK_BLUETOOTH(_ifp)? stats_functional_type_bluetooth: stats_functional_type_none; +#define IFNET_COUNT_TYPE(_ifp) \ + IFNET_IS_LOOPBACK(_ifp) ? stats_functional_type_untracked: \ + IFNET_IS_CELLULAR(_ifp) ? stats_functional_type_cell: \ + IFNET_IS_WIFI(_ifp) ? \ + IFNET_IS_WIFI_INFRA(_ifp) ? stats_functional_type_wifi_infra: \ + stats_functional_type_wifi_non_infra: \ + IFNET_IS_WIRED(_ifp) ? stats_functional_type_wired: \ + IFNET_IS_COMPANION_LINK_BLUETOOTH(_ifp)? 
stats_functional_type_bluetooth: stats_functional_type_unclassified; -#define INP_ADD_STAT(_inp, _stats_functional_type, _a, _n) \ -do { \ - locked_add_64(&((_inp)->inp_stat->_a), (_n)); \ - switch(_stats_functional_type) { \ +#define INP_ADD_RXSTAT(_inp, _stats_functional_type, _p, _b) \ +do { \ + locked_add_64(&((_inp)->inp_mstat.ms_total.ts_rxpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_total.ts_rxbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_total.ts_bitmap), net_uptime()); \ + switch(_stats_functional_type) { \ case stats_functional_type_cell: \ - locked_add_64(&((_inp)->inp_cstat->_a), (_n)); \ + locked_add_64(&((_inp)->inp_mstat.ms_cellular.ts_rxpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_cellular.ts_rxbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_cellular.ts_bitmap), net_uptime()); \ break; \ - case stats_functional_type_wifi: \ - locked_add_64(&((_inp)->inp_wstat->_a), (_n)); \ + case stats_functional_type_wifi_infra: \ + locked_add_64(&((_inp)->inp_mstat.ms_wifi_infra.ts_rxpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_wifi_infra.ts_rxbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_wifi_infra.ts_bitmap), net_uptime()); \ + break; \ + case stats_functional_type_wifi_non_infra: \ + locked_add_64(&((_inp)->inp_mstat.ms_wifi_non_infra.ts_rxpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_wifi_non_infra.ts_rxbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_wifi_non_infra.ts_bitmap), net_uptime()); \ break; \ case stats_functional_type_wired: \ - locked_add_64(&((_inp)->inp_Wstat->_a), (_n)); \ + locked_add_64(&((_inp)->inp_mstat.ms_wired.ts_rxpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_wired.ts_rxbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_wired.ts_bitmap), net_uptime()); \ break; \ case stats_functional_type_bluetooth: \ - locked_add_64(&((_inp)->inp_btstat->_a), (_n)); \ + locked_add_64(&((_inp)->inp_mstat.ms_bluetooth.ts_rxpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_bluetooth.ts_rxbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_bluetooth.ts_bitmap), net_uptime()); \ + break; \ + case stats_functional_type_unclassified: \ + locked_add_64(&((_inp)->inp_mstat.ms_alternate.ts_rxpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_alternate.ts_rxbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_alternate.ts_bitmap), net_uptime()); \ break; \ default: \ break; \ }; \ } while (0); +#define INP_ADD_TXSTAT(_inp, _stats_functional_type, _p, _b) \ +do { \ + locked_add_64(&((_inp)->inp_mstat.ms_total.ts_txpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_total.ts_txbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_total.ts_bitmap), net_uptime()); \ + switch(_stats_functional_type) { \ + case stats_functional_type_cell: \ + locked_add_64(&((_inp)->inp_mstat.ms_cellular.ts_txpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_cellular.ts_txbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_cellular.ts_bitmap), net_uptime()); \ + break; \ + case stats_functional_type_wifi_infra: \ + locked_add_64(&((_inp)->inp_mstat.ms_wifi_infra.ts_txpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_wifi_infra.ts_txbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_wifi_infra.ts_bitmap), net_uptime()); \ + break; \ + case stats_functional_type_wifi_non_infra: \ + 
locked_add_64(&((_inp)->inp_mstat.ms_wifi_non_infra.ts_txpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_wifi_non_infra.ts_txbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_wifi_non_infra.ts_bitmap), net_uptime()); \ + break; \ + case stats_functional_type_wired: \ + locked_add_64(&((_inp)->inp_mstat.ms_wired.ts_txpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_wired.ts_txbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_wired.ts_bitmap), net_uptime()); \ + break; \ + case stats_functional_type_bluetooth: \ + locked_add_64(&((_inp)->inp_mstat.ms_bluetooth.ts_txpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_bluetooth.ts_txbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_bluetooth.ts_bitmap), net_uptime()); \ + break; \ + case stats_functional_type_unclassified: \ + locked_add_64(&((_inp)->inp_mstat.ms_alternate.ts_txpackets), (_p)); \ + locked_add_64(&((_inp)->inp_mstat.ms_alternate.ts_txbytes), (_b)); \ + in_stat_set_activity_bitmap(&((_inp)->inp_mstat.ms_alternate.ts_bitmap), net_uptime()); \ + break; \ + default: \ + break; \ + }; \ +} while (0); + + #endif /* BSD_KERNEL_PRIVATE */ /* @@ -775,28 +827,30 @@ struct inpcbinfo { * * Overflowed INP flags; use INP2 prefix to avoid misuse. */ -#define INP2_TIMEWAIT 0x00000001 /* in TIMEWAIT */ -#define INP2_IN_FCTREE 0x00000002 /* in inp_fc_tree */ -#define INP2_WANT_APP_POLICY 0x00000004 /* necp app policy check is desired */ -#define INP2_NO_IFF_EXPENSIVE 0x00000008 /* do not use expensive interface */ -#define INP2_INHASHLIST 0x00000010 /* pcb is in inp_hash list */ -#define INP2_AWDL_UNRESTRICTED 0x00000020 /* AWDL restricted mode allowed */ -#define INP2_KEEPALIVE_OFFLOAD 0x00000040 /* Enable UDP or TCP keepalive offload */ -#define INP2_INTCOPROC_ALLOWED 0x00000080 /* Allow communication via internal co-processor interfaces */ +#define INP2_TIMEWAIT 0x00000001 /* in TIMEWAIT */ +#define INP2_IN_FCTREE 0x00000002 /* in inp_fc_tree */ +#define INP2_WANT_APP_POLICY 0x00000004 /* necp app policy check is desired */ +#define INP2_NO_IFF_EXPENSIVE 0x00000008 /* do not use expensive interface */ +#define INP2_INHASHLIST 0x00000010 /* pcb is in inp_hash list */ +#define INP2_AWDL_UNRESTRICTED 0x00000020 /* AWDL restricted mode allowed */ +#define INP2_KEEPALIVE_OFFLOAD 0x00000040 /* Enable UDP or TCP keepalive offload */ +#define INP2_INTCOPROC_ALLOWED 0x00000080 /* Allow communication via internal co-processor interfaces */ #define INP2_CONNECT_IN_PROGRESS 0x00000100 /* A connect call is in progress, so binds are intermediate steps */ -#define INP2_CLAT46_FLOW 0x00000200 /* The flow is going to use CLAT46 path */ -#define INP2_EXTERNAL_PORT 0x00000400 /* The port is registered externally, for NECP listeners */ -#define INP2_NO_IFF_CONSTRAINED 0x00000800 /* do not use constrained interface */ -#define INP2_DONTFRAG 0x00001000 /* mark the DF bit in the IP header to avoid fragmentation */ -#define INP2_SCOPED_BY_NECP 0x00002000 /* NECP scoped the pcb */ -#define INP2_LOGGING_ENABLED 0x00004000 /* logging enabled for the socket */ -#define INP2_LOGGED_SUMMARY 0x00008000 /* logged: the final summary */ -#define INP2_MANAGEMENT_ALLOWED 0x00010000 /* Allow communication over a management interface */ -#define INP2_MANAGEMENT_CHECKED 0x00020000 /* Checked entitlements for a management interface */ -#define INP2_BIND_IN_PROGRESS 0x00040000 /* A bind call is in progress */ -#define INP2_LAST_ROUTE_LOCAL 0x00080000 /* Last used route was local */ -#define 
INP2_ULTRA_CONSTRAINED_ALLOWED 0x00100000 /* Allow communication over ultra-constrained interfaces */ -#define INP2_ULTRA_CONSTRAINED_CHECKED 0x00200000 /* Checked entitlements for ultra-constrained interfaces */ +#define INP2_CLAT46_FLOW 0x00000200 /* The flow is going to use CLAT46 path */ +#define INP2_EXTERNAL_PORT 0x00000400 /* The port is registered externally, for NECP listeners */ +#define INP2_NO_IFF_CONSTRAINED 0x00000800 /* do not use constrained interface */ +#define INP2_DONTFRAG 0x00001000 /* mark the DF bit in the IP header to avoid fragmentation */ +#define INP2_SCOPED_BY_NECP 0x00002000 /* NECP scoped the pcb */ +#define INP2_LOGGING_ENABLED 0x00004000 /* logging enabled for the socket */ +#define INP2_LOGGED_SUMMARY 0x00008000 /* logged: the final summary */ +#define INP2_MANAGEMENT_ALLOWED 0x00010000 /* Allow communication over a management interface */ +#define INP2_MANAGEMENT_CHECKED 0x00020000 /* Checked entitlements for a management interface */ +#define INP2_BIND_IN_PROGRESS 0x00040000 /* A bind call is in progress */ +#define INP2_LAST_ROUTE_LOCAL 0x00080000 /* Last used route was local */ +#define INP2_ULTRA_CONSTRAINED_ALLOWED 0x00100000 /* Allow communication over ultra-constrained interfaces */ +#define INP2_ULTRA_CONSTRAINED_CHECKED 0x00200000 /* Checked entitlements for ultra-constrained interfaces */ +#define INP2_RECV_LINK_ADDR_TYPE 0x00400000 /* receive the type of the link level address */ +#define INP2_CONNECTION_IDLE 0x00800000 /* Connection is idle */ /* * Flags passed to in_pcblookup*() functions. @@ -931,7 +985,6 @@ extern void inp_incr_sndbytes_unsent(struct socket *, int32_t); extern void inp_decr_sndbytes_unsent(struct socket *, int32_t); extern int32_t inp_get_sndbytes_allunsent(struct socket *, u_int32_t); extern void inp_decr_sndbytes_allunsent(struct socket *, u_int32_t); -extern void inp_set_activity_bitmap(struct inpcb *inp); extern void inp_get_activity_bitmap(struct inpcb *inp, activity_bitmap_t *b); extern void inp_update_last_owner(struct socket *so, struct proc *p, struct proc *ep); extern void inp_copy_last_owner(struct socket *so, struct socket *head); @@ -950,5 +1003,6 @@ extern void in_management_interface_check(void); extern void in_pcb_check_management_entitled(struct inpcb *inp); extern void in_pcb_check_ultra_constrained_entitled(struct inpcb *inp); extern char *inp_snprintf_tuple(struct inpcb *, char *__sized_by(buflen) buf, size_t buflen); +extern int in_pcbsetport(struct in_addr, struct sockaddr *, struct inpcb *, struct proc *, int); #endif /* KERNEL_PRIVATE */ #endif /* !_NETINET_IN_PCB_H_ */ diff --git a/bsd/netinet/in_pcblist.c b/bsd/netinet/in_pcblist.c index a17acb833..945582ea0 100644 --- a/bsd/netinet/in_pcblist.c +++ b/bsd/netinet/in_pcblist.c @@ -616,7 +616,7 @@ inpcb_get_if_ports_used(ifnet_t ifp, int protocol, uint32_t flags, fbuf, sizeof(fbuf)); } - os_log(OS_LOG_DEFAULT, + os_log(wake_packet_log_handle, "inpcb_get_if_ports_used: route is down %s %s:%u %s:%u ifp %s proc %s:%d", SOCK_PROTO(inp->inp_socket) == IPPROTO_TCP ? "tcp" : "udp", lbuf, ntohs(inp->inp_lport), fbuf, ntohs(inp->inp_fport), @@ -713,7 +713,7 @@ inpcb_get_if_ports_used(ifnet_t ifp, int protocol, uint32_t flags, fbuf, sizeof(fbuf)); } - os_log(OS_LOG_DEFAULT, + os_log(wake_packet_log_handle, "inpcb_get_if_ports_used: no wake from sleep %s %s:%u %s:%u ifp %s proc %s:%d", SOCK_PROTO(inp->inp_socket) == IPPROTO_TCP ? 
"tcp" : "udp", lbuf, ntohs(inp->inp_lport), fbuf, ntohs(inp->inp_fport), @@ -750,7 +750,7 @@ inpcb_get_ports_used(ifnet_t ifp, int protocol, uint32_t flags, error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count); if (error != 0) { - os_log_error(OS_LOG_DEFAULT, + os_log_error(wake_packet_log_handle, "%s: ifnet_list_get_all() failed %d", __func__, error); return; diff --git a/bsd/netinet/in_private.h b/bsd/netinet/in_private.h index 7283539e7..95c966c3b 100644 --- a/bsd/netinet/in_private.h +++ b/bsd/netinet/in_private.h @@ -119,6 +119,16 @@ struct sockaddr_inifscope { #define IP_NO_IFT_PDP IP_NO_IFT_CELLULAR /* deprecated */ #define IP_OUT_IF 9696 /* for internal use only */ +#define IP_RECV_LINK_ADDR_TYPE 9697 /* bool: receive the type of the link level address */ + +/* + * Values for IP_RECV_LINK_ADDR_TYPE in ancillary message + */ +#define IP_RECV_LINK_ADDR_UNICAST 0 +#define IP_RECV_LINK_ADDR_BROADCAST 1 +#define IP_RECV_LINK_ADDR_MULTICAST 2 + + #ifdef BSD_KERNEL_PRIVATE #define CTL_IPPROTO_NAMES { \ { "ip", CTLTYPE_NODE }, \ diff --git a/bsd/netinet/in_proto.c b/bsd/netinet/in_proto.c index 8996192b4..b00879ae4 100644 --- a/bsd/netinet/in_proto.c +++ b/bsd/netinet/in_proto.c @@ -103,7 +103,6 @@ #endif /* IPSEC */ static void in_dinit(struct domain *); -static void ip_proto_input(protocol_family_t, mbuf_t); extern struct domain inetdomain_s; static struct pr_usrreqs nousrreqs; @@ -300,7 +299,7 @@ in_dinit(struct domain *dp) * fit in a small mbuf because m_pullup only puls into 256 * byte mbuf */ - _CASSERT((sizeof(struct tcpiphdr) + TCP_MAXOLEN) <= _MHLEN); + static_assert((sizeof(struct tcpiphdr) + TCP_MAXOLEN) <= _MHLEN); /* * Attach first, then initialize; ip_init() needs raw IP handler. @@ -324,22 +323,6 @@ in_dinit(struct domain *dp) domain_unguard_release(unguard); } -static void -ip_proto_input(protocol_family_t protocol, mbuf_t packet_list) -{ -#pragma unused(protocol) - - if (packet_list->m_nextpkt != NULL) { - ip_input_process_list(packet_list); - } else { - /* - * XXX remove this path if ip_input_process_list is proven - * to be stable and has minimum overhead on most platforms. 
- */ - ip_input(packet_list); - } -} - SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Internet Family"); diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c index c6d8cc017..556896d58 100644 --- a/bsd/netinet/in_rmx.c +++ b/bsd/netinet/in_rmx.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include diff --git a/bsd/netinet/in_stat.h b/bsd/netinet/in_stat.h index 6c8e7529d..d802a1ad0 100644 --- a/bsd/netinet/in_stat.h +++ b/bsd/netinet/in_stat.h @@ -38,6 +38,26 @@ typedef struct activity_bitmap { uint64_t bitmap[2]; /* 128 bit map, each bit == 8 sec */ } activity_bitmap_t; +typedef struct traffic_stats { + uint64_t ts_rxpackets; + uint64_t ts_rxbytes; + uint64_t ts_txpackets; + uint64_t ts_txbytes; + activity_bitmap_t ts_bitmap; +} traffic_stats_t; + +typedef struct media_stats { + traffic_stats_t ms_total; + traffic_stats_t ms_cellular; + traffic_stats_t ms_wifi_infra; + traffic_stats_t ms_wifi_non_infra; + traffic_stats_t ms_wired; + traffic_stats_t ms_bluetooth; + traffic_stats_t ms_alternate; // "Spare", use initially for unclassified +} media_stats_t; + +#define USE_MS_ALTERNATE_FOR_UNCLASSIFIED 1 + #endif /* PRIVATE */ #ifdef BSD_KERNEL_PRIVATE diff --git a/bsd/netinet/in_tclass.c b/bsd/netinet/in_tclass.c index 522e22d64..56e97e573 100644 --- a/bsd/netinet/in_tclass.c +++ b/bsd/netinet/in_tclass.c @@ -1020,59 +1020,7 @@ so_get_opportunistic(struct socket *so) } __private_extern__ int -so_tc_from_control(struct mbuf *control, int *out_netsvctype) -{ - struct cmsghdr *cm; - int sotc = SO_TC_UNSPEC; - - *out_netsvctype = _NET_SERVICE_TYPE_UNSPEC; - - for (cm = M_FIRST_CMSGHDR(control); - is_cmsg_valid(control, cm); - cm = M_NXT_CMSGHDR(control, cm)) { - int val; - - if (cm->cmsg_level != SOL_SOCKET || - cm->cmsg_len != CMSG_LEN(sizeof(int))) { - continue; - } - val = *(int *)(void *)CMSG_DATA(cm); - /* - * The first valid option wins - */ - switch (cm->cmsg_type) { - case SO_TRAFFIC_CLASS: - if (SO_VALID_TC(val)) { - sotc = val; - return sotc; - /* NOT REACHED */ - } else if (val < SO_TC_NET_SERVICE_OFFSET) { - break; - } - /* - * Handle the case SO_NET_SERVICE_TYPE values are - * passed using SO_TRAFFIC_CLASS - */ - val = val - SO_TC_NET_SERVICE_OFFSET; - OS_FALLTHROUGH; - case SO_NET_SERVICE_TYPE: - if (!IS_VALID_NET_SERVICE_TYPE(val)) { - break; - } - *out_netsvctype = val; - sotc = sotc_by_netservicetype[val]; - return sotc; - /* NOT REACHED */ - default: - break; - } - } - - return sotc; -} - -__private_extern__ int -so_tos_from_control(struct mbuf *control) +ip_tos_from_control(struct mbuf *control) { struct cmsghdr *cm; int tos = IPTOS_UNSPEC; diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h index b3a7d8d9d..9ab442834 100644 --- a/bsd/netinet/in_var.h +++ b/bsd/netinet/in_var.h @@ -575,7 +575,6 @@ extern int in_control(struct socket *, u_long cmd, caddr_t __sized_by(IOCPARM_LE extern int in_inithead(void **, int); extern void in_rtqdrain(void); extern struct radix_node *in_validate(struct radix_node *); -extern void ip_input(struct mbuf *); extern void ip_input_process_list(struct mbuf *); extern int in_ifadown(struct ifaddr *ifa, int); extern void in_ifscrub(struct ifnet *, struct in_ifaddr *, int); diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index 452182408..6ba9244f1 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -1242,6 +1242,11 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, if ((inp_flags & INP_HDRINCL) != 0) { /* Expect 32-bit aligned data ptr on strict-align platforms */ 
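The traffic_stats/media_stats layout added to in_stat.h above always accumulates into ms_total plus one per-media bucket chosen by the stats_functional_type classification, which is what the INP_ADD_RXSTAT/INP_ADD_TXSTAT macros earlier in this patch do with locked_add_64(). A simplified, non-atomic sketch of that bucket selection, using trimmed-down copies of the structures:

#include <stdint.h>
#include <stdio.h>

/* Trimmed-down copies of the in_stat.h structures above (no activity bitmap). */
typedef struct traffic_stats {
    uint64_t ts_rxpackets, ts_rxbytes, ts_txpackets, ts_txbytes;
} traffic_stats_t;

typedef struct media_stats {
    traffic_stats_t ms_total, ms_cellular, ms_wifi_infra, ms_wifi_non_infra,
        ms_wired, ms_bluetooth, ms_alternate;
} media_stats_t;

typedef enum {
    stats_functional_type_untracked = 0,
    stats_functional_type_cell,
    stats_functional_type_wifi_infra,
    stats_functional_type_wifi_non_infra,
    stats_functional_type_wired,
    stats_functional_type_bluetooth,
    stats_functional_type_unclassified,
} stats_functional_type;

/* Pick the per-media bucket; untracked traffic only lands in ms_total. */
static traffic_stats_t *
media_bucket(media_stats_t *ms, stats_functional_type t)
{
    switch (t) {
    case stats_functional_type_cell:           return &ms->ms_cellular;
    case stats_functional_type_wifi_infra:     return &ms->ms_wifi_infra;
    case stats_functional_type_wifi_non_infra: return &ms->ms_wifi_non_infra;
    case stats_functional_type_wired:          return &ms->ms_wired;
    case stats_functional_type_bluetooth:      return &ms->ms_bluetooth;
    case stats_functional_type_unclassified:   return &ms->ms_alternate;
    default:                                   return NULL;
    }
}

static void
add_rxstat(media_stats_t *ms, stats_functional_type t, uint64_t pkts, uint64_t bytes)
{
    traffic_stats_t *bucket = media_bucket(ms, t);

    ms->ms_total.ts_rxpackets += pkts;   /* the kernel uses locked_add_64() here */
    ms->ms_total.ts_rxbytes   += bytes;
    if (bucket != NULL) {
        bucket->ts_rxpackets += pkts;
        bucket->ts_rxbytes   += bytes;
    }
}

int
main(void)
{
    media_stats_t ms = { 0 };

    add_rxstat(&ms, stats_functional_type_wifi_infra, 3, 4096);
    printf("total rx %llu bytes, wifi-infra rx %llu bytes\n",
        (unsigned long long)ms.ms_total.ts_rxbytes,
        (unsigned long long)ms.ms_wifi_infra.ts_rxbytes);
    return 0;
}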
MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); + + if (m->m_pkthdr.len < sizeof(struct ip)) { + goto bad; + } + /* * This is not raw IP, we liberal only for fields TOS, * id and TTL. diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 0c7d8bec5..ce48ccbd4 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -291,10 +291,6 @@ static int ip_checkinterface_debug = IP_CHECK_IF_DEBUG; SYSCTL_INT(_net_inet_ip, OID_AUTO, checkinterface_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_checkinterface_debug, IP_CHECK_IF_DEBUG, ""); -static int ip_chaining = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chaining, CTLFLAG_RW | CTLFLAG_LOCKED, - &ip_chaining, 1, "Do receive side ip address based chaining"); - static int ip_chainsz = 6; SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chainsz, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_chainsz, 1, "IP receive side max chaining"); @@ -485,12 +481,11 @@ ip_init(struct protosw *pp, struct domain *dp) * Some ioctls (e.g. SIOCAIFADDR) use ifaliasreq struct, which is * interchangeable with in_aliasreq; they must have the same size. */ - _CASSERT(sizeof(struct ifaliasreq) == sizeof(struct in_aliasreq)); + static_assert(sizeof(struct ifaliasreq) == sizeof(struct in_aliasreq)); - if (ip_initialized) { + if (!os_atomic_cmpxchg(&ip_initialized, 0, 1, relaxed)) { return; } - ip_initialized = 1; TAILQ_INIT(&in_ifaddrhead); in_ifaddrhashtbl_init(); @@ -529,7 +524,7 @@ ip_init(struct protosw *pp, struct domain *dp) TAILQ_INIT(&ipq[i]); } - maxnipq = nmbclusters / 32; + maxnipq = 8192; maxfragsperpacket = 128; /* enough for 64k in 512 byte fragments */ ipq_updateparams(); lck_mtx_unlock(&ipqlock); @@ -618,6 +613,16 @@ inaddr_hashlookup(uint32_t key) return &in_ifaddrhashtbl[inaddr_hashval(key)]; } +static void +ip_proto_process_wake_packet(struct mbuf *m) +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + + if (if_is_lpw_enabled(ifp)) { + if_exit_lpw(ifp, "IP packet"); + } +} + __private_extern__ void ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto, ipfilter_t inject_ipfref) @@ -688,6 +693,13 @@ ip_proto_dispatch_in(struct mbuf *m, int hlen, u_int8_t proto, ip->ip_off = ntohs(ip->ip_off); } + /* + * Check if need to switch to full wake mode -- TCP knows about idle connections + */ + if (__improbable(ip->ip_p != IPPROTO_TCP && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) != 0)) { + ip_proto_process_wake_packet(m); + } + /* * If there isn't a specific lock for the protocol * we're about to call, use the generic lock for AF_INET. @@ -1609,8 +1621,9 @@ ours: } void -ip_input_process_list(struct mbuf *packet_list) +ip_proto_input(protocol_family_t protocol, mbuf_t packet_list) { +#pragma unused(protocol) pktchain_elm_t pktchain_tbl[PKTTBL_SZ]; mbuf_ref_t packet = NULL; @@ -1623,28 +1636,6 @@ ip_input_process_list(struct mbuf *packet_list) int chain = 0; struct ip_fw_in_args args; - if (ip_chaining == 0) { - mbuf_ref_t m = packet_list; -#if (DEBUG || DEVELOPMENT) - if (ip_input_measure) { - net_perf_start_time(&net_perf, &start_tv); - } -#endif /* (DEBUG || DEVELOPMENT) */ - - while (m) { - packet_list = mbuf_nextpkt(m); - mbuf_setnextpkt(m, NULL); - ip_input(m); - m = packet_list; - num_pkts++; - } -#if (DEBUG || DEVELOPMENT) - if (ip_input_measure) { - net_perf_measure_time(&net_perf, &start_tv, num_pkts); - } -#endif /* (DEBUG || DEVELOPMENT) */ - return; - } #if (DEBUG || DEVELOPMENT) if (ip_input_measure) { net_perf_start_time(&net_perf, &start_tv); @@ -1722,400 +1713,6 @@ restart_list_process: } #endif /* (DEBUG || DEVELOPMENT) */ } -/* - * Ip input routine. 
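The ip_init() hunk above replaces the check-then-set of ip_initialized with a single os_atomic_cmpxchg(), so exactly one caller performs the one-time setup even if several race into the function. A minimal C11 sketch of that once-init idiom, with atomic_compare_exchange_strong() standing in for the kernel primitive:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ip_initialized;   /* zero-initialized, like the kernel global */

static void
ip_init_once(void)
{
    int expected = 0;

    /* Only the caller that flips 0 -> 1 runs the body; everyone else returns. */
    if (!atomic_compare_exchange_strong(&ip_initialized, &expected, 1)) {
        return;
    }
    printf("performing one-time initialization\n");
}

int
main(void)
{
    ip_init_once();
    ip_init_once();     /* second call is a no-op */
    return 0;
}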
Checksum and byte swap header. If fragmented - * try to reassemble. Process options. Pass to next level. - */ -void -ip_input(struct mbuf *m) -{ - struct ip *__single ip; - unsigned int hlen; - u_short sum = 0; -#if DUMMYNET - struct ip_fw_args args; - struct m_tag *__single tag; -#endif - ipfilter_t __single inject_filter_ref = NULL; - ifnet_ref_t inifp; - - /* Check if the mbuf is still valid after interface filter processing */ - MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); - inifp = m->m_pkthdr.rcvif; - VERIFY(inifp != NULL); - - m_add_crumb(m, PKT_CRUMB_IP_INPUT); - - ipstat.ips_rxc_notlist++; - - /* Perform IP header alignment fixup, if needed */ - IP_HDR_ALIGNMENT_FIXUP(m, inifp, return ); - - m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED; - -#if DUMMYNET - bzero(&args, sizeof(struct ip_fw_args)); - - /* - * Don't bother searching for tag(s) if there's none. - */ - if (SLIST_EMPTY(&m->m_pkthdr.tags)) { - goto ipfw_tags_done; - } - - /* Grab info from mtags prepended to the chain */ - if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, - KERNEL_TAG_TYPE_DUMMYNET)) != NULL) { - struct dn_pkt_tag *__single dn_tag; - - dn_tag = (struct dn_pkt_tag *)(tag->m_tag_data); - args.fwa_pf_rule = dn_tag->dn_pf_rule; - - m_tag_delete(m, tag); - } - -#if DIAGNOSTIC - if (m == NULL || !(m->m_flags & M_PKTHDR)) { - panic("ip_input no HDR"); - } -#endif - - if (args.fwa_pf_rule) { - /* dummynet already filtered us */ - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2; - inject_filter_ref = ipf_get_inject_filter(m); - if (args.fwa_pf_rule) { - goto check_with_pf; - } - } -ipfw_tags_done: -#endif /* DUMMYNET */ - - /* - * No need to process packet twice if we've already seen it. - */ - if (!SLIST_EMPTY(&m->m_pkthdr.tags)) { - inject_filter_ref = ipf_get_inject_filter(m); - } - if (inject_filter_ref != NULL) { - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2; - - DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, - struct ip *, ip, struct ifnet *, inifp, - struct ip *, ip, struct ip6_hdr *, NULL); - - ip->ip_len = ntohs(ip->ip_len) - (u_short)hlen; - ip->ip_off = ntohs(ip->ip_off); - ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref); - return; - } - - if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) { - if_ports_used_match_mbuf(inifp, PF_INET, m); - } - - OSAddAtomic(1, &ipstat.ips_total); - if (m->m_pkthdr.len < sizeof(struct ip)) { - goto tooshort; - } - - if (m->m_len < sizeof(struct ip) && - (m = m_pullup(m, sizeof(struct ip))) == NULL) { - OSAddAtomic(1, &ipstat.ips_toosmall); - return; - } - ip = mtod(m, struct ip *); - - KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, - ip->ip_p, ip->ip_off, ip->ip_len); - - if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { - OSAddAtomic(1, &ipstat.ips_badvers); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_BAD_VERSION, - NULL, 0); - goto bad; - } - - hlen = IP_VHL_HL(ip->ip_vhl) << 2; - if (hlen < sizeof(struct ip)) { /* minimum header length */ - OSAddAtomic(1, &ipstat.ips_badhlen); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_BAD_HDR_LENGTH, - NULL, 0); - goto bad; - } - if (hlen > m->m_len) { - if ((m = m_pullup(m, hlen)) == NULL) { - OSAddAtomic(1, &ipstat.ips_badhlen); - return; - } - ip = mtod(m, struct ip *); - } - - /* 127/8 must not appear on wire - RFC1122 */ - if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || - (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { - /* - * Allow for the following exceptions: 
- * - * 1. If the packet was sent to loopback (i.e. rcvif - * would have been set earlier at output time.) - * - * 2. If the packet was sent out on loopback from a local - * source address which belongs to a non-loopback - * interface (i.e. rcvif may not necessarily be a - * loopback interface, hence the test for PKTF_LOOP.) - * Unlike IPv6, there is no interface scope ID, and - * therefore we don't care so much about PKTF_IFINFO. - */ - if (!(inifp->if_flags & IFF_LOOPBACK) && - !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) { - OSAddAtomic(1, &ipstat.ips_badaddr); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_INVALID_ADDR, - NULL, 0); - goto bad; - } - } - - /* IPv4 Link-Local Addresses as defined in RFC3927 */ - if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || - IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) { - ip_linklocal_stat.iplls_in_total++; - if (ip->ip_ttl != MAXTTL) { - OSAddAtomic(1, &ip_linklocal_stat.iplls_in_badttl); - /* Silently drop link local traffic with bad TTL */ - if (!ip_linklocal_in_allowbadttl) { - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_BAD_TTL, - NULL, 0); - goto bad; - } - } - } - - sum = ip_cksum(m, hlen); - if (sum) { - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_BAD_CHECKSUM, - NULL, 0); - goto bad; - } - - DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, - struct ip *, ip, struct ifnet *, inifp, - struct ip *, ip, struct ip6_hdr *, NULL); - - /* - * Naively assume we can attribute inbound data to the route we would - * use to send to this destination. Asymmetric routing breaks this - * assumption, but it still allows us to account for traffic from - * a remote node in the routing table. - * this has a very significant performance impact so we bypass - * if nstat_collect is disabled. We may also bypass if the - * protocol is tcp in the future because tcp will have a route that - * we can use to attribute the data to. That does mean we would not - * account for forwarded tcp traffic. - */ - if (nstat_collect) { - rtentry_ref_t rt = ifnet_cached_rtlookup_inet(inifp, ip->ip_src); - if (rt != NULL) { - nstat_route_rx(rt, 1, m->m_pkthdr.len, 0); - rtfree(rt); - } - } - - /* - * Convert fields to host representation. - */ -#if BYTE_ORDER != BIG_ENDIAN - NTOHS(ip->ip_len); -#endif - - if (ip->ip_len < hlen) { - OSAddAtomic(1, &ipstat.ips_badlen); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_BAD_LENGTH, - NULL, 0); - goto bad; - } - -#if BYTE_ORDER != BIG_ENDIAN - NTOHS(ip->ip_off); -#endif - /* - * Check that the amount of data in the buffers - * is as at least much as the IP header would have us expect. - * Trim mbufs if longer than we expect. - * Drop packet if shorter than we expect. 
- */ - if (m->m_pkthdr.len < ip->ip_len) { -tooshort: - OSAddAtomic(1, &ipstat.ips_tooshort); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_TOO_SHORT, - NULL, 0); - goto bad; - } - if (m->m_pkthdr.len > ip->ip_len) { - ip_input_adjust(m, ip, inifp); - } - -#if DUMMYNET -check_with_pf: -#endif -#if PF - /* Invoke inbound packet filter */ - if (PF_IS_ENABLED) { - int error; -#if DUMMYNET - error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args); -#else - error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL); -#endif /* DUMMYNET */ - if (error != 0 || m == NULL) { - if (m != NULL) { - panic("%s: unexpected packet %p", - __func__, m); - /* NOTREACHED */ - } - /* Already freed by callee */ - return; - } - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2; - } -#endif /* PF */ - -#if IPSEC - if (ipsec_bypass == 0 && ipsec_get_history_count(m)) { - goto pass; - } -#endif - -pass: - /* - * Process options and, if not destined for us, - * ship it on. ip_dooptions returns 1 when an - * error was detected (causing an icmp message - * to be sent and the original packet to be freed). - */ - ip_nhops = 0; /* for source routed packets */ - if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, NULL)) { - return; - } - - /* - * Check our list of addresses, to see if the packet is for us. - * If we don't have any addresses, assume any unicast packet - * we receive might be for us (and let the upper layers deal - * with it). - */ - if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST | M_BCAST))) { - ip_setdstifaddr_info(m, inifp->if_index, NULL); - goto ours; - } - - /* - * Enable a consistency check between the destination address - * and the arrival interface for a unicast packet (the RFC 1122 - * strong ES model) if IP forwarding is disabled and the packet - * is not locally generated and the packet is not subject to - * 'ipfw fwd'. - * - * XXX - Checking also should be disabled if the destination - * address is ipnat'ed to a different interface. - * - * XXX - Checking is incompatible with IP aliases added - * to the loopback interface instead of the interface where - * the packets are received. - */ - if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { - ip_check_if_result_t check_if_result = IP_CHECK_IF_NONE; - - check_if_result = ip_input_check_interface(&m, ip, inifp); - ASSERT(check_if_result != IP_CHECK_IF_NONE); - if (check_if_result == IP_CHECK_IF_OURS) { - goto ours; - } else if (check_if_result == IP_CHECK_IF_DROP) { - return; - } - } else { - struct in_multi *__single inm; - /* - * See if we belong to the destination multicast group on the - * arrival interface. - */ - in_multihead_lock_shared(); - IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm); - in_multihead_lock_done(); - if (inm == NULL) { - OSAddAtomic(1, &ipstat.ips_notmember); - HTONS(ip->ip_len); - HTONS(ip->ip_off); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, - DROP_REASON_IP_UNKNOWN_MULTICAST_GROUP, NULL, 0); - return; - } - ip_setdstifaddr_info(m, inifp->if_index, NULL); - INM_REMREF(inm); - goto ours; - } - - /* - * Not for us; forward if possible and desirable. - */ - if (ipforwarding == 0) { - OSAddAtomic(1, &ipstat.ips_cantforward); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_UNKNOWN_MULTICAST_GROUP, - NULL, 0); - } else { - ip_forward(m, 0, NULL); - } - return; - -ours: - /* - * If offset or IP_MF are set, must reassemble. 
- */ - if (ip->ip_off & ~(IP_DF | IP_RF)) { - m = ip_reass(m); - if (m == NULL) { - return; - } - ip = mtod(m, struct ip *); - /* Get the header length of the reassembled packet */ - hlen = IP_VHL_HL(ip->ip_vhl) << 2; - } - - /* - * Further protocols expect the packet length to be w/o the - * IP header. - */ - ip->ip_len -= hlen; - - -#if IPSEC - /* - * enforce IPsec policy checking if we are seeing last header. - * note that we do not visit this with protocols with pcb layer - * code - like udp/tcp/raw ip. - */ - if (ipsec_bypass == 0 && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) { - if (ipsec4_in_reject(m, NULL)) { - IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IPSEC_REJECT, - NULL, 0); - goto bad; - } - } -#endif /* IPSEC */ - - /* - * Switch out to protocol's input routine. - */ - OSAddAtomic(1, &ipstat.ips_delivered); - - ip_proto_dispatch_in(m, hlen, ip->ip_p, 0); -bad: - KERNEL_DEBUG(DBG_LAYER_END, 0, 0, 0, 0, 0); -} static void ipq_updateparams(void) @@ -2160,7 +1757,7 @@ sysctl_maxnipq SYSCTL_HANDLER_ARGS goto done; } /* impose bounds */ - if (i < -1 || i > (nmbclusters / 4)) { + if (i < -1) { error = EINVAL; goto done; } @@ -3881,7 +3478,7 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, * Make sure to accomodate the largest possible * size of SA(if_lladdr)->sa_len. */ - _CASSERT(sizeof(sdlbuf) == (SOCK_MAXADDRLEN + 1)); + static_assert(sizeof(sdlbuf) == (SOCK_MAXADDRLEN + 1)); ifnet_head_lock_shared(); if ((ifp = m->m_pkthdr.rcvif) != NULL && @@ -3902,7 +3499,7 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, IFA_UNLOCK(ifa); goto makedummy; } - /* the above _CASSERT ensures sdl_len fits in sdlbuf */ + /* the above static_assert() ensures sdl_len fits in sdlbuf */ SOCKADDR_COPY(sdp, sdl2, sdp->sdl_len); IFA_UNLOCK(ifa); } else { @@ -3948,6 +3545,21 @@ makedummy: goto no_mbufs; } } + if (inp->inp_flags2 & INP2_RECV_LINK_ADDR_TYPE) { + int mode = IP_RECV_LINK_ADDR_UNICAST; + + if (m->m_flags & M_BCAST) { + mode = IP_RECV_LINK_ADDR_BROADCAST; + } else if (m->m_flags & M_MCAST) { + mode = IP_RECV_LINK_ADDR_MULTICAST; + } + + mp = sbcreatecontrol_mbuf((caddr_t)&mode, + sizeof(int), IP_RECV_LINK_ADDR_TYPE, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } return 0; no_mbufs: diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index 6ea057960..33d876991 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1911,7 +1911,7 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, uint32_t mtu, int sw_csum) firstlen = len = (mtu - hlen) & ~7; if (len < 8) { OSAddAtomic(1, &ipstat.ips_odropped); - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_FRAG_TOO_SMALL, + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_FRAG_TOO_SMALL, NULL, 0); return EMSGSIZE; } @@ -1934,7 +1934,7 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, uint32_t mtu, int sw_csum) for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ if (m == NULL) { - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_FRAG_NO_MEM, + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_FRAG_NO_MEM, NULL, 0); OSAddAtomic(1, &ipstat.ips_odropped); return ENOBUFS; @@ -1960,7 +1960,7 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, uint32_t mtu, int sw_csum) mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copy(m0, off, 
len); if (m->m_next == NULL) { - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_FRAG_NO_MEM, + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_FRAG_NO_MEM, NULL, 0); OSAddAtomic(1, &ipstat.ips_odropped); return ENOBUFS; @@ -2048,7 +2048,7 @@ in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags) uint32_t offset, _hlen, mlen, hlen, len, sw_csum; uint16_t csum, ip_len; - _CASSERT(sizeof(csum) == sizeof(uint16_t)); + static_assert(sizeof(csum) == sizeof(uint16_t)); VERIFY(m->m_flags & M_PKTHDR); sw_csum = (csum_flags & m->m_pkthdr.csum_flags); @@ -2406,6 +2406,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVPKTINFO: case IP_RECVTOS: case IP_DONTFRAG: + case IP_RECV_LINK_ADDR_TYPE: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) { @@ -2480,6 +2481,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) } OPTSET2(INP2_DONTFRAG); break; + + case IP_RECV_LINK_ADDR_TYPE: + OPTSET2(INP2_RECV_LINK_ADDR_TYPE); + break; #undef OPTSET #undef OPTSET2 } @@ -2689,6 +2694,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVPKTINFO: case IP_RECVTOS: case IP_DONTFRAG: + case IP_RECV_LINK_ADDR_TYPE: switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; @@ -2740,6 +2746,9 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_DONTFRAG: optval = OPTBIT2(INP2_DONTFRAG); break; + case IP_RECV_LINK_ADDR_TYPE: + optval = OPTBIT2(INP2_RECV_LINK_ADDR_TYPE); + break; } error = sooptcopyout(sopt, &optval, sizeof(optval)); break; diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h index e150c54e0..9fc654f69 100644 --- a/bsd/netinet/ip_var.h +++ b/bsd/netinet/ip_var.h @@ -368,6 +368,7 @@ struct domain; extern int ip_checkrouteralert(struct mbuf *); extern int ip_ctloutput(struct socket *, struct sockopt *sopt); +extern void ip_proto_input(protocol_family_t protocol, mbuf_t packet_list); extern void ip_drain(void); extern void ip_init(struct protosw *, struct domain *); extern int ip_output(struct mbuf *, struct mbuf *, struct route *, int, diff --git a/bsd/netinet/mptcp.c b/bsd/netinet/mptcp.c index ab97d3e19..2f93bc901 100644 --- a/bsd/netinet/mptcp.c +++ b/bsd/netinet/mptcp.c @@ -248,7 +248,7 @@ mptcp_reass_present(struct socket *mp_so) dowakeup = 1; } } - tcp_reass_qent_free(q); + tcp_reass_qent_free(mp_so->so_proto, q); mp_tp->mpt_reassqlen--; count++; q = LIST_FIRST(&mp_tp->mpt_segq); @@ -293,7 +293,14 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf * } /* Allocate a new queue entry. If we can't, just drop the pkt. 
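IP_RECV_LINK_ADDR_TYPE, defined earlier in this patch and wired into ip_ctloutput() and ip_savecontrol() above, is a boolean IPPROTO_IP option: once enabled, each received datagram carries a control message whose int payload is IP_RECV_LINK_ADDR_UNICAST, _BROADCAST, or _MULTICAST. A hypothetical userspace sketch of enabling and reading it, assuming the private constants are visible to the caller:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef IP_RECV_LINK_ADDR_TYPE            /* private constants from the in_private.h hunk */
#define IP_RECV_LINK_ADDR_TYPE      9697
#define IP_RECV_LINK_ADDR_UNICAST   0
#define IP_RECV_LINK_ADDR_BROADCAST 1
#define IP_RECV_LINK_ADDR_MULTICAST 2
#endif

int
main(void)
{
    int s = socket(AF_INET, SOCK_DGRAM, 0);
    int on = 1;
    struct sockaddr_in sin = { .sin_family = AF_INET, .sin_port = htons(12345) };

    if (s < 0) {
        return 1;
    }
    /* Ask the stack to annotate each datagram with its link-level address type. */
    setsockopt(s, IPPROTO_IP, IP_RECV_LINK_ADDR_TYPE, &on, sizeof(on));
    bind(s, (struct sockaddr *)&sin, sizeof(sin));

    char payload[1500], cbuf[256];
    struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
    struct msghdr msg = {
        .msg_iov = &iov, .msg_iovlen = 1,
        .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
    };

    if (recvmsg(s, &msg, 0) >= 0) {
        for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm != NULL;
            cm = CMSG_NXTHDR(&msg, cm)) {
            if (cm->cmsg_level == IPPROTO_IP &&
                cm->cmsg_type == IP_RECV_LINK_ADDR_TYPE) {
                int mode;

                memcpy(&mode, CMSG_DATA(cm), sizeof(mode));
                printf("link-level address type: %s\n",
                    mode == IP_RECV_LINK_ADDR_BROADCAST ? "broadcast" :
                    mode == IP_RECV_LINK_ADDR_MULTICAST ? "multicast" : "unicast");
            }
        }
    }
    close(s);
    return 0;
}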
XXX */ - te = tcp_reass_qent_alloc(); + te = tcp_reass_qent_alloc(mp_so->so_proto); + if (te == NULL) { + m_drop_list(m, NULL, + DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, + DROP_REASON_MPTCP_REASSEMBLY_ALLOC, NULL, 0); + *tlenp = 0; + return 0; + } mp_tp->mpt_reassqlen++; OSIncrementAtomic(&mptcp_reass_total_qlen); @@ -321,7 +328,7 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf * if (i >= *tlenp) { tcpstat.tcps_mptcp_rcvduppack++; m_freem(m); - tcp_reass_qent_free(te); + tcp_reass_qent_free(mp_so->so_proto, te); te = NULL; mp_tp->mpt_reassqlen--; OSDecrementAtomic(&mptcp_reass_total_qlen); @@ -364,7 +371,7 @@ mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf * nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - tcp_reass_qent_free(q); + tcp_reass_qent_free(mp_so->so_proto, q); mp_tp->mpt_reassqlen--; OSDecrementAtomic(&mptcp_reass_total_qlen); q = nq; @@ -1122,6 +1129,8 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len, uint16_t csum) { + struct mptsub *mpts = tp->t_mpsub; + if (mdss_data_len == 0) { os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte)); @@ -1136,10 +1145,10 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp, mptcp_notify_mpready(tp->t_inpcb->inp_socket); - tp->t_rcv_map.mpt_dsn = full_dsn; - tp->t_rcv_map.mpt_sseq = seqn; - tp->t_rcv_map.mpt_len = mdss_data_len; - tp->t_rcv_map.mpt_csum = csum; + mpts->mpts_rcv_map.mpt_dsn = full_dsn; + mpts->mpts_rcv_map.mpt_sseq = seqn; + mpts->mpts_rcv_map.mpt_len = mdss_data_len; + mpts->mpts_rcv_map.mpt_csum = csum; tp->t_mpflags |= TMPF_EMBED_DSN; } @@ -1523,25 +1532,3 @@ mptcp_set_restrictions(struct socket *mp_so) ifnet_head_done(); } - -#define DUMP_BUF_CHK() { \ - clen -= k; \ - if (clen < 1) \ - goto done; \ - c += k; \ -} - -int -dump_mptcp_reass_qlen(char *str __sized_by(str_len), int str_len) -{ - char *c = str; - int k, clen = str_len; - - if (mptcp_reass_total_qlen != 0) { - k = scnprintf(c, clen, "\nmptcp reass qlen %d\n", mptcp_reass_total_qlen); - DUMP_BUF_CHK(); - } - -done: - return str_len - clen; -} diff --git a/bsd/netinet/mptcp.h b/bsd/netinet/mptcp.h index 2fadd2437..611bd10d0 100644 --- a/bsd/netinet/mptcp.h +++ b/bsd/netinet/mptcp.h @@ -316,16 +316,6 @@ struct mptcp_add_addr_opt { maddr_flags:4; #endif uint8_t maddr_addrid; - union { - struct { - struct in_addr maddr_addrv4; - uint32_t maddr_pad[3]; - }; - - struct { - struct in6_addr maddr_addrv6; - }; - } maddr_u; }__attribute__((__packed__)); struct mptcp_add_addr_hmac_msg_v4 { diff --git a/bsd/netinet/mptcp_opt.c b/bsd/netinet/mptcp_opt.c index 69f42d181..5693f3d33 100644 --- a/bsd/netinet/mptcp_opt.c +++ b/bsd/netinet/mptcp_opt.c @@ -57,8 +57,8 @@ static int mptcp_validate_join_hmac(struct tcpcb *, u_char* __sized_by(maclen), int maclen); static int mptcp_snd_mpprio(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend, int optlen); -static void mptcp_send_remaddr_opt(struct tcpcb *, struct mptcp_remaddr_opt *); -static int mptcp_echo_add_addr(struct tcpcb *, u_char * __ended_by(optend), u_char *optend, unsigned int); +static void mptcp_send_remaddr_opt(struct tcpcb *tp, struct mptcp_remaddr_opt *opt); +static int mptcp_echo_add_addr(struct tcpcb *tp, u_char * __indexable cp, unsigned int optlen); /* * MPTCP Options Output Processing @@ -447,7 +447,7 @@ mptcp_setup_opts(struct tcpcb *tp, 
int32_t off, u_char *opt __ended_by(optend), } /* Start a timer to retransmit the ACK */ tp->t_timer[TCPT_JACK_RXMT] = - OFFSET_FROM_START(tp, tcp_jack_rxmt); + tcp_offset_from_start(tp, tcp_jack_rxmt); tp->t_mpflags &= ~TMPF_SND_JACK; goto ret_optlen; @@ -476,7 +476,7 @@ mptcp_setup_opts(struct tcpcb *tp, int32_t off, u_char *opt __ended_by(optend), } if (tp->t_mpflags & TMPF_MPTCP_ECHO_ADDR) { - optlen = mptcp_echo_add_addr(tp, opt, optend, optlen); + optlen = mptcp_echo_add_addr(tp, opt, optlen); } if (tp->t_mpflags & TMPF_SND_MPPRIO) { @@ -899,85 +899,6 @@ ret_optlen: * MPTCP Options Input Processing */ -/* - * In most cases, option can be parsed by performing the cast - * - * opt_type *opt = (opt_type*)optp; - * - * However, in some cases there will be less bytes on the wire - * the size of the corresponding C struct, i.e.: - * - * (optend - optp) < sizeof(opt_type) - * - * In such cases, the bounds of `opt' will be smaller than - * the size of its declared pointee type. Any attempt to - * dereference `opt' (or to access its fields) - * will lead to an `-fbounds-safety' trap. - * - * To prevent such undesirable situation, we are using - * the "shadow storage" pattern: - * - If there are enough bytes so that the cast expression - * opt_type *opt = (opt_type*)optp; - * will produce a "valid" pointer, we will perform a cast. - * - Otherwise, we will copy the bytes into a stack allocated - * structure, and return a pointer to that structure. - * - * If the `VERBOSE_OPTION_PARSING_LOGGING' is set to 1, - * the code will produce additional logging at the detriment - * of performance. This is off by default, but the code is kept for now. - */ -#define VERBOSE_OPTION_PARSING_LOGGING 0 -#if VERBOSE_OPTION_PARSING_LOGGING - -#define MPTCP_OPT_CHECK_UNDERRUN(shadow_opt, optlen) do { \ - if (__improbable(sizeof((shadow_opt)) < (optlen))) { \ - size_t ignored = (optlen) - sizeof((shadow_opt)); \ - os_log(mptcp_log_handle, \ - "%s - option length exceeds the size of underlying storage " \ - "(optlen=%lu, storage size=%lu) %lu bytes will be ignored\n", \ - __func__, (size_t)(optlen), sizeof((shadow_opt)), ignored); \ - } \ -} while(0) - -#define MPTCP_OPT_REPORT_COPY(shadow_opt, available) do { \ - os_log(mptcp_log_handle, \ - "%s - insufficent input to use cast-parsing (required=%lu; available=%ld); " \ - " option data will be copied to local storage\n", \ - __func__, sizeof((shadow_opt)), available); \ - \ -} while(0) - -#else /* !VERBOSE_OPTION_PARSING_LOGGING*/ - -#define MPTCP_OPT_CHECK_UNDERRUN(shadow_opt, optlen) do { \ - (void)(optlen); \ -} while(0) - -#define MPTCP_OPT_REPORT_COPY(shadow_opt, optlen) do { \ - (void)(optlen); \ -} while(0) -#endif /* DEBUG || DEVELOPMENT */ - - -#define MPTCP_OPT_GET(shadow_opt, optp, optend, optlen) ({ \ - __typeof__((shadow_opt)) * __single opt_ptr; \ - \ - ptrdiff_t available = (optend) - (optp); \ - \ - MPTCP_OPT_CHECK_UNDERRUN(shadow_opt, optlen); \ - \ - if (__improbable(available < sizeof((shadow_opt)))) { \ - MPTCP_OPT_REPORT_COPY(shadow_opt, available); \ - memset((caddr_t)&(shadow_opt) + available, \ - 0, sizeof((shadow_opt)) - available); \ - memcpy(&(shadow_opt), (optp), available); \ - opt_ptr = &(shadow_opt); \ - } else { \ - opt_ptr = __unsafe_forge_single(__typeof__((shadow_opt))*, (optp)); \ - } \ - opt_ptr; \ -}) - static int mptcp_sanitize_option(struct tcpcb *tp, int mptcp_subtype) { @@ -1007,31 +928,11 @@ mptcp_sanitize_option(struct tcpcb *tp, int mptcp_subtype) return ret; } -static int -mptcp_valid_mpcapable_common_opt(struct 
mptcp_mpcapable_opt_common *crsp) -{ - /* mmco_kind, mmco_len and mmco_subtype are validated before */ - - if (!(crsp->mmco_flags & MPCAP_PROPOSAL_SBIT)) { - return 0; - } - - if (crsp->mmco_flags & (MPCAP_BBIT | MPCAP_DBIT | - MPCAP_EBIT | MPCAP_FBIT | MPCAP_GBIT)) { - return 0; - } - - return 1; -} - - static void -mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend, struct tcphdr *th, +mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend __unused, struct tcphdr *th, uint8_t optlen) { - struct mptcp_mpcapable_opt_common crsp_s, *crsp; - crsp = MPTCP_OPT_GET(crsp_s, cp, optend, optlen); - struct mptcp_mpcapable_opt_rsp rsp_s, *rsp = NULL; + struct mptcp_mpcapable_opt_rsp *rsp; struct mptcb *mp_tp = tptomptp(tp); struct mptses *mpte = mp_tp->mpt_mpte; @@ -1040,12 +941,6 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char * return; } - /* Validate the kind, len, flags */ - if (mptcp_valid_mpcapable_common_opt(crsp) != 1) { - tcpstat.tcps_invalid_mpcap++; - return; - } - /* handle SYN/ACK retransmission by acknowledging with ACK */ if (mp_tp->mpt_state >= MPTCPS_ESTABLISHED) { return; @@ -1061,15 +956,24 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char * return; } + rsp = (struct mptcp_mpcapable_opt_rsp *)cp; + + if (!(rsp->mmc_common.mmco_flags & MPCAP_PROPOSAL_SBIT) || + rsp->mmc_common.mmco_flags & (MPCAP_BBIT | MPCAP_DBIT | + MPCAP_EBIT | MPCAP_FBIT | MPCAP_GBIT)) { + tcpstat.tcps_invalid_mpcap++; + return; + } + /* * If checksum flag is set, enable MPTCP checksum, even if * it was not negotiated on the first SYN. */ - if (crsp->mmco_flags & MPCAP_CHECKSUM_CBIT) { + if (rsp->mmc_common.mmco_flags & MPCAP_CHECKSUM_CBIT) { mp_tp->mpt_flags |= MPTCPF_CHECKSUM; } - if (crsp->mmco_flags & MPCAP_UNICAST_IPBIT) { + if (rsp->mmc_common.mmco_flags & MPCAP_UNICAST_IPBIT) { mpte->mpte_flags |= MPTE_UNICAST_IP; /* We need an explicit signal for the addresses - zero the existing ones */ @@ -1077,7 +981,6 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char * memset(&mpte->mpte_sub_dst_v6, 0, sizeof(mpte->mpte_sub_dst_v6)); } - rsp = MPTCP_OPT_GET(rsp_s, cp, optend, optlen); mp_tp->mpt_remotekey = rsp->mmc_localkey; /* For now just downgrade to the peer's version */ if (rsp->mmc_common.mmco_version < mp_tp->mpt_version) { @@ -1097,18 +1000,10 @@ mptcp_do_mpcapable_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char * static void -mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend, struct tcphdr *th, uint8_t optlen) +mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend __unused, struct tcphdr *th, uint8_t optlen) { -#define MPTCP_JOPT_ERROR_PATH(tp) { \ - tcpstat.tcps_invalid_joins++; \ - if (tp->t_inpcb->inp_socket != NULL) { \ - soevent(tp->t_inpcb->inp_socket, \ - SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); \ - } \ -} - int error = 0; - struct mptcp_mpjoin_opt_rsp join_rsp_s, *join_rsp; - join_rsp = MPTCP_OPT_GET(join_rsp_s, cp, optend, optlen); + struct mptcp_mpjoin_opt_rsp *join_rsp; + int error; /* Only valid on SYN/ACK */ if ((th->th_flags & (TH_SYN | TH_ACK)) != (TH_SYN | TH_ACK)) { @@ -1116,29 +1011,40 @@ mptcp_do_mpjoin_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *opt } if (optlen != sizeof(struct mptcp_mpjoin_opt_rsp)) { - os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK: unexpected optlen = %u mp option = %lu\n", - __func__, (unsigned 
long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), + os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK: unexpected " + "optlen = %u mp option = %lu\n", __func__, + (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), optlen, sizeof(struct mptcp_mpjoin_opt_rsp)); tp->t_mpflags &= ~TMPF_PREESTABLISHED; /* send RST and close */ - MPTCP_JOPT_ERROR_PATH(tp); - return; + goto join_error; } + join_rsp = (struct mptcp_mpjoin_opt_rsp *)cp; + mptcp_set_raddr_rand(tp->t_local_aid, tptomptp(tp), join_rsp->mmjo_addr_id, join_rsp->mmjo_rand); error = mptcp_validate_join_hmac(tp, (u_char*)&join_rsp->mmjo_mac, HMAC_TRUNCATED_SYNACK); if (error) { - os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK error = %d \n", - __func__, (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), + os_log_error(mptcp_log_handle, "%s - %lx: SYN_ACK error=%d \n", + __func__, + (unsigned long)VM_KERNEL_ADDRPERM(tptomptp(tp)->mpt_mpte), error); tp->t_mpflags &= ~TMPF_PREESTABLISHED; /* send RST and close */ - MPTCP_JOPT_ERROR_PATH(tp); - return; + goto join_error; } tp->t_mpflags |= (TMPF_SENT_JOIN | TMPF_SND_JACK); + + return; + +join_error: + tcpstat.tcps_invalid_joins++; + if (tp->t_inpcb->inp_socket != NULL) { + soevent(tp->t_inpcb->inp_socket, + SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST); + } } static int @@ -1290,23 +1196,16 @@ mptcp_do_dss_opt_ack_meat(u_int64_t full_dack, u_int64_t full_dsn, } static void -mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, struct tcpcb *tp, struct tcphdr *th, uint8_t optlen) +mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, struct tcpcb *tp, struct tcphdr *th) { - struct mptcp_dss_copt dss_rsp_s, *dss_rsp; - dss_rsp = MPTCP_OPT_GET(dss_rsp_s, cp, optend, optlen); - u_int64_t full_dack = 0; - u_int32_t tiwin = th->th_win << tp->snd_scale; + struct mptcp_dss_copt *dss_rsp; + uint64_t full_dack = 0; + uint32_t tiwin = th->th_win << tp->snd_scale; struct mptcb *mp_tp = tptomptp(tp); - int csum_len = 0; + unsigned int csum_len = 0; -#define MPTCP_DSS_OPT_SZ_CHK(len, expected_len) { \ - if (len != expected_len) { \ - os_log_error(mptcp_log_handle, "%s - %lx: bad len = %d dss: %x\n",\ - __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), \ - len, dss_rsp->mdss_flags); \ - return; \ - } \ -} + /* bounds-checks happens in the caller of the function */ + dss_rsp = (struct mptcp_dss_copt *)cp; if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) { csum_len = 2; @@ -1317,11 +1216,14 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st case (MDSS_M): { /* 32-bit DSS, No Data ACK */ - struct mptcp_dsn_opt dss_rsp1_s, *dss_rsp1; - dss_rsp1 = MPTCP_OPT_GET(dss_rsp1_s, cp, optend, optlen); + struct mptcp_dsn_opt *dss_rsp1; + + if (dss_rsp->mdss_len != sizeof(struct mptcp_dsn_opt) + csum_len) { + goto err_len; + } + + dss_rsp1 = (struct mptcp_dsn_opt *)cp; - MPTCP_DSS_OPT_SZ_CHK(dss_rsp1->mdss_copt.mdss_len, - sizeof(struct mptcp_dsn_opt) + csum_len); if (csum_len == 0) { mptcp_update_dss_rcv_state(dss_rsp1, tp, 0); } else { @@ -1334,13 +1236,16 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st case (MDSS_A): { /* 32-bit Data ACK, no DSS */ - struct mptcp_data_ack_opt dack_opt_s, *dack_opt; - dack_opt = MPTCP_OPT_GET(dack_opt_s, cp, optend, optlen); + struct mptcp_data_ack_opt *dack_opt; + uint32_t dack; - MPTCP_DSS_OPT_SZ_CHK(dack_opt->mdss_copt.mdss_len, - sizeof(struct mptcp_data_ack_opt)); + if (dss_rsp->mdss_len != sizeof(struct mptcp_data_ack_opt)) { + goto err_len; + } - 
u_int32_t dack = dack_opt->mdss_ack; + dack_opt = (struct mptcp_data_ack_opt *)cp; + + dack = dack_opt->mdss_ack; NTOHL(dack); MPTCP_EXTEND_DSN(mp_tp->mpt_snduna, dack, full_dack); mptcp_do_dss_opt_ack_meat(full_dack, mp_tp->mpt_sndwl1, tp, tiwin); @@ -1349,16 +1254,19 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st case (MDSS_M | MDSS_A): { /* 32-bit Data ACK + 32-bit DSS */ - struct mptcp_dss_ack_opt dss_ack_rsp_s, *dss_ack_rsp; - dss_ack_rsp = MPTCP_OPT_GET(dss_ack_rsp_s, cp, optend, optlen); - u_int64_t full_dsn; + struct mptcp_dss_ack_opt *dss_ack_rsp; + uint64_t full_dsn; uint16_t csum = 0; + uint32_t dack; - MPTCP_DSS_OPT_SZ_CHK(dss_ack_rsp->mdss_copt.mdss_len, - sizeof(struct mptcp_dss_ack_opt) + csum_len); + if (dss_rsp->mdss_len != sizeof(struct mptcp_dss_ack_opt) + csum_len) { + goto err_len; + } + + dss_ack_rsp = (struct mptcp_dss_ack_opt *)cp; + + dack = ntohl(dss_ack_rsp->mdss_ack); - u_int32_t dack = dss_ack_rsp->mdss_ack; - NTOHL(dack); MPTCP_EXTEND_DSN(mp_tp->mpt_snduna, dack, full_dack); NTOHL(dss_ack_rsp->mdss_dsn); @@ -1382,13 +1290,15 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st case (MDSS_M | MDSS_m): { /* 64-bit DSS , No Data ACK */ - struct mptcp_dsn64_opt dsn64_s, *dsn64; - dsn64 = MPTCP_OPT_GET(dsn64_s, cp, optend, optlen); - u_int64_t full_dsn; + struct mptcp_dsn64_opt *dsn64; + uint64_t full_dsn; uint16_t csum = 0; - MPTCP_DSS_OPT_SZ_CHK(dsn64->mdss_copt.mdss_len, - sizeof(struct mptcp_dsn64_opt) + csum_len); + if (dss_rsp->mdss_len != sizeof(struct mptcp_dsn64_opt) + csum_len) { + goto err_len; + } + + dsn64 = (struct mptcp_dsn64_opt *)cp; mp_tp->mpt_flags |= MPTCPF_SND_64BITACK; @@ -1409,11 +1319,13 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st case (MDSS_A | MDSS_a): { /* 64-bit Data ACK, no DSS */ - struct mptcp_data_ack64_opt dack64_s, *dack64; - dack64 = MPTCP_OPT_GET(dack64_s, cp, optend, optlen); + struct mptcp_data_ack64_opt *dack64; - MPTCP_DSS_OPT_SZ_CHK(dack64->mdss_copt.mdss_len, - sizeof(struct mptcp_data_ack64_opt)); + if (dss_rsp->mdss_len != sizeof(struct mptcp_data_ack64_opt)) { + goto err_len; + } + + dack64 = (struct mptcp_data_ack64_opt *)cp; mp_tp->mpt_flags |= MPTCPF_RCVD_64BITACK; @@ -1424,15 +1336,17 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st case (MDSS_M | MDSS_m | MDSS_A): { /* 64-bit DSS + 32-bit Data ACK */ - struct mptcp_dss64_ack32_opt dss_ack_rsp_s, *dss_ack_rsp; - dss_ack_rsp = MPTCP_OPT_GET(dss_ack_rsp_s, cp, optend, optlen); - u_int64_t full_dsn; + struct mptcp_dss64_ack32_opt *dss_ack_rsp; + uint64_t full_dsn; uint16_t csum = 0; - MPTCP_DSS_OPT_SZ_CHK(dss_ack_rsp->mdss_copt.mdss_len, - sizeof(struct mptcp_dss64_ack32_opt) + csum_len); + if (dss_rsp->mdss_len != sizeof(struct mptcp_dss64_ack32_opt) + csum_len) { + goto err_len; + } - u_int32_t dack = dss_ack_rsp->mdss_ack; + dss_ack_rsp = (struct mptcp_dss64_ack32_opt *)cp; + + uint32_t dack = dss_ack_rsp->mdss_ack; NTOHL(dack); mp_tp->mpt_flags |= MPTCPF_SND_64BITACK; MPTCP_EXTEND_DSN(mp_tp->mpt_snduna, dack, full_dack); @@ -1457,13 +1371,14 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st case (MDSS_M | MDSS_A | MDSS_a): { /* 32-bit DSS + 64-bit Data ACK */ - struct mptcp_dss32_ack64_opt dss32_ack_64_opt_s, *dss32_ack64_opt; - dss32_ack64_opt = MPTCP_OPT_GET(dss32_ack_64_opt_s, cp, optend, optlen); - u_int64_t full_dsn; + struct mptcp_dss32_ack64_opt *dss32_ack64_opt; + uint64_t full_dsn; - 
MPTCP_DSS_OPT_SZ_CHK( - dss32_ack64_opt->mdss_copt.mdss_len, - sizeof(struct mptcp_dss32_ack64_opt) + csum_len); + if (dss_rsp->mdss_len != sizeof(struct mptcp_dss32_ack64_opt) + csum_len) { + goto err_len; + } + + dss32_ack64_opt = (struct mptcp_dss32_ack64_opt *)cp; full_dack = mptcp_ntoh64(dss32_ack64_opt->mdss_ack); NTOHL(dss32_ack64_opt->mdss_dsn); @@ -1491,12 +1406,14 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st case (MDSS_M | MDSS_m | MDSS_A | MDSS_a): { /* 64-bit DSS + 64-bit Data ACK */ - struct mptcp_dss64_ack64_opt dss64_ack_64_s, *dss64_ack64; - dss64_ack64 = MPTCP_OPT_GET(dss64_ack_64_s, cp, optend, optlen); - u_int64_t full_dsn; + struct mptcp_dss64_ack64_opt *dss64_ack64; + uint64_t full_dsn; - MPTCP_DSS_OPT_SZ_CHK(dss64_ack64->mdss_copt.mdss_len, - sizeof(struct mptcp_dss64_ack64_opt) + csum_len); + if (dss_rsp->mdss_len != sizeof(struct mptcp_dss64_ack64_opt) + csum_len) { + goto err_len; + } + + dss64_ack64 = (struct mptcp_dss64_ack64_opt *)cp; mp_tp->mpt_flags |= MPTCPF_RCVD_64BITACK; mp_tp->mpt_flags |= MPTCPF_SND_64BITACK; @@ -1522,42 +1439,53 @@ mptcp_do_dss_opt_meat(u_char *cp __ended_by(optend), u_char *optend __unused, st default: break; } + + return; + +err_len: + os_log_error(mptcp_log_handle, "%s - %lx: bad len = %d dss: %x\n", + __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), + dss_rsp->mdss_len, dss_rsp->mdss_flags); + return; } static void mptcp_do_dss_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend, struct tcphdr *th, uint8_t optlen) { - struct mptcp_dss_copt dss_rsp_s, *dss_rsp; - dss_rsp = MPTCP_OPT_GET(dss_rsp_s, cp, optend, optlen); - struct mptcb *mp_tp = tptomptp(tp); + struct mptcp_dss_copt *dss_rsp; - if (!mp_tp) { + if (!tptomptp(tp)) { return; } + if (optlen < sizeof(struct mptcp_dss_copt)) { + tcpstat.tcps_invalid_opt++; + return; + } + dss_rsp = (struct mptcp_dss_copt *)cp; + if (dss_rsp->mdss_subtype == MPO_DSS) { if (dss_rsp->mdss_flags & MDSS_F) { - tp->t_rcv_map.mpt_dfin = 1; + tp->t_mpsub->mpts_rcv_map.mpt_dfin = 1; } else { - tp->t_rcv_map.mpt_dfin = 0; + tp->t_mpsub->mpts_rcv_map.mpt_dfin = 0; } - mptcp_do_dss_opt_meat(cp, optend, tp, th, optlen); + mptcp_do_dss_opt_meat(cp, optend, tp, th); } } static void mptcp_do_fastclose_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend __unused, struct tcphdr *th, uint8_t optlen) { - struct mptcb *mp_tp = NULL; - struct mptcp_fastclose_opt fc_opt_s, *fc_opt; - fc_opt = MPTCP_OPT_GET(fc_opt_s, cp, optend, optlen); + struct mptcp_fastclose_opt *fc_opt; + struct mptcb *mp_tp; if (th->th_flags != TH_ACK) { return; } - if (fc_opt->mfast_len != sizeof(struct mptcp_fastclose_opt)) { + if (optlen != sizeof(struct mptcp_fastclose_opt)) { tcpstat.tcps_invalid_opt++; return; } @@ -1567,6 +1495,8 @@ mptcp_do_fastclose_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char * return; } + fc_opt = (struct mptcp_fastclose_opt *)cp; + if (fc_opt->mfast_key != mp_tp->mpt_localkey) { tcpstat.tcps_invalid_opt++; return; @@ -1594,9 +1524,8 @@ mptcp_do_fastclose_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char * static void mptcp_do_mpfail_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend __unused, struct tcphdr *th, uint8_t optlen) { - struct mptcp_mpfail_opt fail_opt_s, *fail_opt; - fail_opt = MPTCP_OPT_GET(fail_opt_s, cp, optend, optlen); - u_int32_t mdss_subflow_seqn = 0; + struct mptcp_mpfail_opt *fail_opt; + uint32_t mdss_subflow_seqn = 0; struct mptcb *mp_tp; int error = 0; @@ -1614,10 
+1543,12 @@ mptcp_do_mpfail_opt(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *opt return; } - if (fail_opt->mfail_len != sizeof(struct mptcp_mpfail_opt)) { + if (optlen != sizeof(struct mptcp_mpfail_opt)) { return; } + fail_opt = (struct mptcp_mpfail_opt *)cp; + mp_tp = tptomptp(tp); mp_tp->mpt_flags |= MPTCPF_RECVD_MPFAIL; @@ -1649,25 +1580,25 @@ mptcp_validate_add_addr_hmac(struct tcpcb *tp, u_char *hmac __sized_by(mac_len), } static void -mptcp_do_add_addr_opt_v1(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend, uint8_t optlen) +mptcp_do_add_addr_opt_v1(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend __unused, uint8_t optlen) { + struct mptcp_add_addr_opt *addr_opt; struct mptcb *mp_tp = tptomptp(tp); struct mptses *mpte = mp_tp->mpt_mpte; - struct mptcp_add_addr_opt addr_opt_s, *addr_opt; - addr_opt = MPTCP_OPT_GET(addr_opt_s, cp, optend, optlen); - - if (addr_opt->maddr_len != MPTCP_V1_ADD_ADDR_OPT_LEN_V4 && - addr_opt->maddr_len != MPTCP_V1_ADD_ADDR_OPT_LEN_V4 + 2 && - addr_opt->maddr_len != MPTCP_V1_ADD_ADDR_OPT_LEN_V6 && - addr_opt->maddr_len != MPTCP_V1_ADD_ADDR_OPT_LEN_V6 + 2) { + if (optlen != MPTCP_V1_ADD_ADDR_OPT_LEN_V4 && + optlen != MPTCP_V1_ADD_ADDR_OPT_LEN_V4 + 2 && + optlen != MPTCP_V1_ADD_ADDR_OPT_LEN_V6 && + optlen != MPTCP_V1_ADD_ADDR_OPT_LEN_V6 + 2) { os_log_error(mptcp_log_handle, "%s - %lx: Wrong ADD_ADDR length %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), - addr_opt->maddr_len); + optlen); return; } + addr_opt = (struct mptcp_add_addr_opt *)cp; + if ((addr_opt->maddr_flags & MPTCP_V1_ADD_ADDR_ECHO) != 0) { os_log(mptcp_log_handle, "%s - %lx: Received ADD_ADDR with echo bit\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte)); @@ -1677,7 +1608,7 @@ mptcp_do_add_addr_opt_v1(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char if (addr_opt->maddr_len < MPTCP_V1_ADD_ADDR_OPT_LEN_V6) { struct sockaddr_in *dst = &mpte->mpte_sub_dst_v4; - struct in_addr *addr = &addr_opt->maddr_u.maddr_addrv4; + struct in_addr *addr = (struct in_addr *)(void *)(cp + sizeof(*addr_opt)); in_addr_t haddr = ntohl(addr->s_addr); if (IN_ZERONET(haddr) || @@ -1700,7 +1631,7 @@ mptcp_do_add_addr_opt_v1(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char uint16_t msg_len = sizeof(struct mptcp_add_addr_hmac_msg_v4); struct mptcp_add_addr_hmac_msg_v4 msg = {0}; msg.maddr_addrid = addr_opt->maddr_addrid; - msg.maddr_addr = addr_opt->maddr_u.maddr_addrv4; + msg.maddr_addr = *addr; if (addr_opt->maddr_len > MPTCP_V1_ADD_ADDR_OPT_LEN_V4) { msg.maddr_port = *(uint16_t *)(void *)(cp + addr_opt->maddr_len - HMAC_TRUNCATED_ADD_ADDR - 2); } @@ -1723,7 +1654,7 @@ mptcp_do_add_addr_opt_v1(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char mpte->mpte_last_added_addr_is_v4 = TRUE; } else { struct sockaddr_in6 *dst = &mpte->mpte_sub_dst_v6; - struct in6_addr *addr = &addr_opt->maddr_u.maddr_addrv6; + struct in6_addr *addr = (struct in6_addr *)(void *)(cp + sizeof(*addr_opt)); if (IN6_IS_ADDR_LINKLOCAL(addr) || IN6_IS_ADDR_MULTICAST(addr) || @@ -1745,7 +1676,7 @@ mptcp_do_add_addr_opt_v1(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char uint16_t msg_len = sizeof(struct mptcp_add_addr_hmac_msg_v6); struct mptcp_add_addr_hmac_msg_v6 msg = {0}; msg.maddr_addrid = addr_opt->maddr_addrid; - msg.maddr_addr = addr_opt->maddr_u.maddr_addrv6; + msg.maddr_addr = *addr; if (addr_opt->maddr_len > MPTCP_V1_ADD_ADDR_OPT_LEN_V6) { msg.maddr_port = *(uint16_t *)(void *)(cp + addr_opt->maddr_len - HMAC_TRUNCATED_ADD_ADDR - 2); } @@ -1786,18 +1717,19 @@ 
mptcp_do_add_addr_opt_v1(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char static void mptcp_do_add_addr_opt_v0(struct mptses *mpte, u_char *cp __ended_by(optend), u_char *optend __unused, uint8_t optlen) { - struct mptcp_add_addr_opt addr_opt_s, *addr_opt; - addr_opt = MPTCP_OPT_GET(addr_opt_s, cp, optend, optlen); + struct mptcp_add_addr_opt *addr_opt; - if (addr_opt->maddr_len != MPTCP_V0_ADD_ADDR_OPT_LEN_V4 && - addr_opt->maddr_len != MPTCP_V0_ADD_ADDR_OPT_LEN_V6) { + if (optlen != MPTCP_V0_ADD_ADDR_OPT_LEN_V4 && + optlen != MPTCP_V0_ADD_ADDR_OPT_LEN_V6) { os_log_error(mptcp_log_handle, "%s - %lx: Wrong ADD_ADDR length %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), - addr_opt->maddr_len); + optlen); return; } + addr_opt = (struct mptcp_add_addr_opt *)cp; + if (addr_opt->maddr_len == MPTCP_V0_ADD_ADDR_OPT_LEN_V4 && addr_opt->maddr_flags != MPTCP_V0_ADD_ADDR_IPV4) { os_log_error(mptcp_log_handle, "%s - %lx: ADD_ADDR length for v4 but version is %u\n", @@ -1818,7 +1750,7 @@ mptcp_do_add_addr_opt_v0(struct mptses *mpte, u_char *cp __ended_by(optend), u_c if (addr_opt->maddr_len == MPTCP_V0_ADD_ADDR_OPT_LEN_V4) { struct sockaddr_in *dst = &mpte->mpte_sub_dst_v4; - struct in_addr *addr = &addr_opt->maddr_u.maddr_addrv4; + struct in_addr *addr = (struct in_addr *)(void *)(cp + sizeof(*addr_opt)); in_addr_t haddr = ntohl(addr->s_addr); if (IN_ZERONET(haddr) || @@ -1844,7 +1776,7 @@ mptcp_do_add_addr_opt_v0(struct mptses *mpte, u_char *cp __ended_by(optend), u_c mpte->mpte_last_added_addr_is_v4 = TRUE; } else { struct sockaddr_in6 *dst = &mpte->mpte_sub_dst_v6; - struct in6_addr *addr = &addr_opt->maddr_u.maddr_addrv6; + struct in6_addr *addr = (struct in6_addr *)(void *)(cp + sizeof(*addr_opt)); if (IN6_IS_ADDR_LINKLOCAL(addr) || IN6_IS_ADDR_MULTICAST(addr) || @@ -1948,11 +1880,14 @@ mptcp_send_remaddr_opt(struct tcpcb *tp, struct mptcp_remaddr_opt *opt) } static int -mptcp_echo_add_addr(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *optend __unused, unsigned int optlen) +mptcp_echo_add_addr(struct tcpcb *tp, u_char * __indexable cp, unsigned int optlen) { - struct mptcp_add_addr_opt mpaddr; - struct mptcb *mp_tp = tptomptp(tp); - struct mptses *mpte = mp_tp->mpt_mpte; + struct mptcp_add_addr_opt *mpaddr; + struct mptcb *mp_tp; + struct mptses *mpte; + + mp_tp = tptomptp(tp); + mpte = mp_tp->mpt_mpte; // MPTCP v0 doesn't require echoing add_addr if (mp_tp->mpt_version == MPTCP_VERSION_0) { @@ -1964,20 +1899,23 @@ mptcp_echo_add_addr(struct tcpcb *tp, u_char *cp __ended_by(optend), u_char *opt return optlen; } - bzero(&mpaddr, sizeof(mpaddr)); - mpaddr.maddr_kind = TCPOPT_MULTIPATH; - mpaddr.maddr_len = (uint8_t)mpaddr_size; - mpaddr.maddr_subtype = MPO_ADD_ADDR; - mpaddr.maddr_flags = MPTCP_V1_ADD_ADDR_ECHO; + cp += optlen; + mpaddr = (struct mptcp_add_addr_opt *)cp; + + mpaddr->maddr_kind = TCPOPT_MULTIPATH; + mpaddr->maddr_len = (uint8_t)mpaddr_size; + mpaddr->maddr_subtype = MPO_ADD_ADDR; + mpaddr->maddr_flags = MPTCP_V1_ADD_ADDR_ECHO; if (mpte->mpte_last_added_addr_is_v4) { - mpaddr.maddr_u.maddr_addrv4.s_addr = mpte->mpte_sub_dst_v4.sin_addr.s_addr; - mpaddr.maddr_addrid = mpte->sub_dst_addr_id_v4; + struct in_addr *addr = (struct in_addr *)(void *)(cp + sizeof(struct mptcp_add_addr_opt)); + addr->s_addr = mpte->mpte_sub_dst_v4.sin_addr.s_addr; + mpaddr->maddr_addrid = mpte->sub_dst_addr_id_v4; } else { - mpaddr.maddr_u.maddr_addrv6 = mpte->mpte_sub_dst_v6.sin6_addr; - mpaddr.maddr_addrid = mpte->sub_dst_addr_id_v6; + struct in6_addr *addr = (struct in6_addr 
*)(void *)(cp + sizeof(struct mptcp_add_addr_opt)); + *addr = mpte->mpte_sub_dst_v6.sin6_addr; + mpaddr->maddr_addrid = mpte->sub_dst_addr_id_v6; } - memcpy(cp + optlen, &mpaddr, mpaddr_size); optlen += mpaddr_size; tp->t_mpflags &= ~TMPF_MPTCP_ECHO_ADDR; return optlen; diff --git a/bsd/netinet/mptcp_subr.c b/bsd/netinet/mptcp_subr.c index e7fa75e30..b80e118e0 100644 --- a/bsd/netinet/mptcp_subr.c +++ b/bsd/netinet/mptcp_subr.c @@ -2712,17 +2712,19 @@ mptcpstats_update(struct mptcp_itf_stats *stats __counted_by(stats_count), uint1 if (index != -1) { struct inpcb *inp = sotoinpcb(mpts->mpts_socket); - stats[index].mpis_txbytes += inp->inp_stat->txbytes; - stats[index].mpis_rxbytes += inp->inp_stat->rxbytes; + stats[index].mpis_txbytes += inp->inp_mstat.ms_total.ts_txbytes; + stats[index].mpis_rxbytes += inp->inp_mstat.ms_total.ts_rxbytes; - stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes; - stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes; + stats[index].mpis_wifi_txbytes += inp->inp_mstat.ms_wifi_infra.ts_txbytes + + inp->inp_mstat.ms_wifi_non_infra.ts_txbytes; + stats[index].mpis_wifi_rxbytes += inp->inp_mstat.ms_wifi_infra.ts_rxbytes + + inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes; - stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes; - stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes; + stats[index].mpis_wired_txbytes += inp->inp_mstat.ms_wired.ts_txbytes; + stats[index].mpis_wired_rxbytes += inp->inp_mstat.ms_wired.ts_rxbytes; - stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes; - stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes; + stats[index].mpis_cell_txbytes += inp->inp_mstat.ms_cellular.ts_txbytes; + stats[index].mpis_cell_rxbytes += inp->inp_mstat.ms_cellular.ts_rxbytes; } } @@ -2747,8 +2749,8 @@ mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts) mptcp_unset_cellicon(mpte, mpts, 1); - mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes; - mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes; + mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_mstat.ms_total.ts_rxbytes; + mpte->mpte_init_txbytes = sotoinpcb(so)->inp_mstat.ms_total.ts_txbytes; TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry); mpte->mpte_numflows--; @@ -4345,7 +4347,7 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, if (tp->t_state != TCPS_CLOSED) { mbuf_ref_t m; - struct tcptemp *t_template = tcp_maketemplate(tp, &m); + struct tcptemp *t_template = tcp_maketemplate(tp, &m, NULL, NULL); if (t_template) { struct tcp_respond_args tra; @@ -4360,7 +4362,7 @@ mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts, tcp_respond(tp, t_template->tt_ipgen, sizeof(t_template->tt_ipgen), &t_template->tt_t, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una, TH_RST, &tra); + tp->rcv_nxt, tp->snd_una, 0, TH_RST, NULL, 0, 0, 0, &tra, false); (void) m_free(m); } } @@ -5631,11 +5633,11 @@ mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th) VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)); if (tp->t_mpflags & TMPF_EMBED_DSN) { - m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn; - m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq; - m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len; - m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum; - if (tp->t_rcv_map.mpt_dfin) { + m->m_pkthdr.mp_dsn = tp->t_mpsub->mpts_rcv_map.mpt_dsn; + m->m_pkthdr.mp_rseq = tp->t_mpsub->mpts_rcv_map.mpt_sseq; + m->m_pkthdr.mp_rlen = tp->t_mpsub->mpts_rcv_map.mpt_len; + m->m_pkthdr.mp_csum = tp->t_mpsub->mpts_rcv_map.mpt_csum; + if 
(tp->t_mpsub->mpts_rcv_map.mpt_dfin) { m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN; } @@ -6160,7 +6162,6 @@ mptcp_pcblist SYSCTL_HANDLER_ARGS mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn; mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd; mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt; - mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt; mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn; mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd; @@ -6676,14 +6677,15 @@ symptoms_is_wifi_lossy(void) int mptcp_freeq(struct mptcb *mp_tp) { + struct protosw *proto = mptetoso(mp_tp->mpt_mpte)->so_proto; struct tseg_qent *q; - int rv = 0; int count = 0; + int rv = 0; while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - tcp_reass_qent_free(q); + tcp_reass_qent_free(proto, q); count++; rv = 1; } @@ -6741,7 +6743,8 @@ mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts) /* Remember the last time we set the cellicon. Needed for debouncing */ mpte->mpte_last_cellicon_set = tcp_now; - tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE); + tp->t_timer[TCPT_CELLICON] = tcp_offset_from_start(tp, + MPTCP_CELLICON_TOGGLE_RATE); tcp_sched_timers(tp); if (mpts->mpts_flags & MPTSF_CELLICON_SET && @@ -6927,10 +6930,9 @@ mptcp_init(struct protosw *pp, struct domain *dp) VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); /* do this only once */ - if (mptcp_initialized) { + if (!os_atomic_cmpxchg(&mptcp_initialized, 0, 1, relaxed)) { return; } - mptcp_initialized = 1; mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK; diff --git a/bsd/netinet/mptcp_usrreq.c b/bsd/netinet/mptcp_usrreq.c index d14eba178..8e748dca9 100644 --- a/bsd/netinet/mptcp_usrreq.c +++ b/bsd/netinet/mptcp_usrreq.c @@ -533,8 +533,8 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags, inp = sotoinpcb(mpts->mpts_socket); - mptcp_ci.mptcpci_init_rxbytes = inp->inp_stat->rxbytes; - mptcp_ci.mptcpci_init_txbytes = inp->inp_stat->txbytes; + mptcp_ci.mptcpci_init_rxbytes = inp->inp_mstat.ms_total.ts_rxbytes; + mptcp_ci.mptcpci_init_txbytes = inp->inp_mstat.ms_total.ts_txbytes; initial_info_set = 1; } @@ -702,24 +702,26 @@ mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags, /* Roll the itf-stats into the tcp_info */ tcp_ci.tcpci_tcp_info.tcpi_txbytes += - mptsinp->inp_stat->txbytes; + mptsinp->inp_mstat.ms_total.ts_txbytes; tcp_ci.tcpci_tcp_info.tcpi_rxbytes += - mptsinp->inp_stat->rxbytes; + mptsinp->inp_mstat.ms_total.ts_rxbytes; tcp_ci.tcpci_tcp_info.tcpi_wifi_txbytes += - mptsinp->inp_wstat->txbytes; + mptsinp->inp_mstat.ms_wifi_infra.ts_txbytes + + mptsinp->inp_mstat.ms_wifi_non_infra.ts_txbytes; tcp_ci.tcpci_tcp_info.tcpi_wifi_rxbytes += - mptsinp->inp_wstat->rxbytes; + mptsinp->inp_mstat.ms_wifi_infra.ts_rxbytes + + mptsinp->inp_mstat.ms_wifi_non_infra.ts_rxbytes; tcp_ci.tcpci_tcp_info.tcpi_wired_txbytes += - mptsinp->inp_Wstat->txbytes; + mptsinp->inp_mstat.ms_wired.ts_txbytes; tcp_ci.tcpci_tcp_info.tcpi_wired_rxbytes += - mptsinp->inp_Wstat->rxbytes; + mptsinp->inp_mstat.ms_wired.ts_rxbytes; tcp_ci.tcpci_tcp_info.tcpi_cell_txbytes += - mptsinp->inp_cstat->txbytes; + mptsinp->inp_mstat.ms_cellular.ts_txbytes; tcp_ci.tcpci_tcp_info.tcpi_cell_rxbytes += - mptsinp->inp_cstat->rxbytes; + mptsinp->inp_mstat.ms_cellular.ts_rxbytes; } } @@ -1176,7 +1178,7 @@ mptcp_uiotombuf(struct uio *uio, int how, user_ssize_t space, struct mbuf **top) while (len > 0) { uint32_t m_needed = 1; - if (njcl > 0 && len > MBIGCLBYTES) { + if (len > 
MBIGCLBYTES) { mb = m_getpackets_internal(&m_needed, 1, how, 1, M16KCLBYTES); } else if (len > MCLBYTES) { @@ -1946,14 +1948,16 @@ mptcp_fill_info_bytestats(struct tcp_info *ti, struct mptses *mpte) continue; } - ti->tcpi_txbytes += inp->inp_stat->txbytes; - ti->tcpi_rxbytes += inp->inp_stat->rxbytes; - ti->tcpi_cell_txbytes += inp->inp_cstat->txbytes; - ti->tcpi_cell_rxbytes += inp->inp_cstat->rxbytes; - ti->tcpi_wifi_txbytes += inp->inp_wstat->txbytes; - ti->tcpi_wifi_rxbytes += inp->inp_wstat->rxbytes; - ti->tcpi_wired_txbytes += inp->inp_Wstat->txbytes; - ti->tcpi_wired_rxbytes += inp->inp_Wstat->rxbytes; + ti->tcpi_txbytes += inp->inp_mstat.ms_total.ts_txbytes; + ti->tcpi_rxbytes += inp->inp_mstat.ms_total.ts_rxbytes; + ti->tcpi_cell_txbytes += inp->inp_mstat.ms_cellular.ts_txbytes; + ti->tcpi_cell_rxbytes += inp->inp_mstat.ms_cellular.ts_rxbytes; + ti->tcpi_wifi_txbytes += inp->inp_mstat.ms_wifi_infra.ts_txbytes + + inp->inp_mstat.ms_wifi_non_infra.ts_txbytes; + ti->tcpi_wifi_rxbytes += inp->inp_mstat.ms_wifi_infra.ts_rxbytes + + inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes; + ti->tcpi_wired_txbytes += inp->inp_mstat.ms_wired.ts_txbytes; + ti->tcpi_wired_rxbytes += inp->inp_mstat.ms_wired.ts_rxbytes; } for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) { diff --git a/bsd/netinet/mptcp_var.h b/bsd/netinet/mptcp_var.h index cb9213afc..aab655879 100644 --- a/bsd/netinet/mptcp_var.h +++ b/bsd/netinet/mptcp_var.h @@ -29,10 +29,10 @@ #ifndef _NETINET_MPTCP_VAR_H_ #define _NETINET_MPTCP_VAR_H_ -#ifdef PRIVATE #include #include -#endif +#include +#include #ifdef BSD_KERNEL_PRIVATE #include @@ -267,6 +267,7 @@ struct mptsub { uint32_t mpts_probesoon; /* send probe after probeto */ uint32_t mpts_probecnt; /* number of probes sent */ uint32_t mpts_maxseg; /* cached value of t_maxseg */ + struct mpt_dsn_map mpts_rcv_map; /* Receive mapping list */ }; /* @@ -644,7 +645,6 @@ extern int mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, __END_DECLS #endif /* BSD_KERNEL_PRIVATE */ -#ifdef PRIVATE typedef struct mptcp_flow { uint64_t flow_len; @@ -679,7 +679,6 @@ typedef struct conninfo_mptcp { /* Receive side */ uint64_t mptcpci_rcvnxt; /* Next expected DSN */ - uint64_t mptcpci_rcvatmark; /* Session level rcvnxt */ uint64_t mptcpci_ridsn; /* Peer's IDSN */ uint32_t mptcpci_rcvwnd; /* Receive window */ @@ -733,5 +732,4 @@ struct kev_mptcp_data { int value; }; -#endif /* PRIVATE */ #endif /* _NETINET_MPTCP_VAR_H_ */ diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index 1de7a91cc..39c7f3ddd 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -148,10 +149,9 @@ rip_init(struct protosw *pp, struct domain *dp) VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); - if (rip_initialized) { + if (!os_atomic_cmpxchg(&rip_initialized, 0, 1, relaxed)) { return; } - rip_initialized = 1; LIST_INIT(&ripcb); ripcbinfo.ipi_listhead = &ripcb; @@ -390,6 +390,7 @@ rip_output( struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; int inp_flags = inp ? 
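/*
 * Editorial aside -- mptcp_init() and rip_init() above now guard their
 * one-time setup with an atomic compare-and-swap instead of a plain
 * flag test.  A hedged user-space sketch of the same idea using C11
 * atomics (os_atomic_cmpxchg is xnu's own wrapper; the names below are
 * illustrative only):
 */
#include <stdatomic.h>

static _Atomic int example_initialized;

static void
example_init_once(void (*do_init)(void))
{
	int expected = 0;

	/* Only the caller that flips 0 -> 1 performs the initialization. */
	if (!atomic_compare_exchange_strong_explicit(&example_initialized,
	    &expected, 1, memory_order_relaxed, memory_order_relaxed)) {
		return; /* someone else already initialized (or is doing so) */
	}
	do_init();
}
/*
 * Relaxed ordering mirrors the kernel call sites, where protocol
 * registration is already serialized; a general-purpose init-once
 * helper would want acquire/release semantics or a lock.
 */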
inp->inp_flags : 0; + struct sock_cm_info sockcminfo; struct ip_out_args ipoa; struct ip_moptions *imo; int tos = IPTOS_UNSPEC; @@ -464,22 +465,15 @@ rip_output( ipoa.ipoa_boundif = IFSCOPE_NONE; ipoa.ipoa_flags = IPOAF_SELECT_SRCIF; - int sotc = SO_TC_UNSPEC; - int netsvctype = _NET_SERVICE_TYPE_UNSPEC; - + sock_init_cm_info(&sockcminfo, so); if (control != NULL) { - tos = so_tos_from_control(control); - sotc = so_tc_from_control(control, &netsvctype); + tos = ip_tos_from_control(control); + sock_parse_cm_info(control, &sockcminfo); m_freem(control); control = NULL; } - if (sotc == SO_TC_UNSPEC) { - sotc = so->so_traffic_class; - netsvctype = so->so_netsvctype; - } - if (inp == NULL #if NECP || (necp_socket_should_use_flow_divert(inp)) @@ -516,8 +510,8 @@ rip_output( if (INP_ULTRA_CONSTRAINED_ALLOWED(inp)) { ipoa.ipoa_flags |= IPOAF_ULTRA_CONSTRAINED_ALLOWED; } - ipoa.ipoa_sotc = sotc; - ipoa.ipoa_netsvctype = netsvctype; + ipoa.ipoa_sotc = sockcminfo.sotc; + ipoa.ipoa_netsvctype = sockcminfo.netsvctype; if (inp->inp_flowhash == 0) { inp_calc_flowhash(inp); @@ -660,7 +654,10 @@ rip_output( ROUTE_RELEASE(&inp->inp_route); } - set_packet_service_class(m, so, sotc, 0); + set_packet_service_class(m, so, sockcminfo.sotc, 0); + if (sockcminfo.tx_time) { + mbuf_set_tx_time(m, sockcminfo.tx_time); + } m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; m->m_pkthdr.pkt_flowid = inp->inp_flowhash; m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index 8fb9091e5..8e5234126 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -124,14 +124,17 @@ struct tcphdr { }; #define TCPOPT_EOL 0 +#define TCPOLEN_EOL 1 #define TCPOPT_NOP 1 +#define TCPOLEN_NOP 1 #define TCPOPT_MAXSEG 2 #define TCPOLEN_MAXSEG 4 #define TCPOPT_WINDOW 3 #define TCPOLEN_WINDOW 3 -#define TCPOPT_SACK_PERMITTED 4 /* Experimental */ +#define TCPOPT_SACK_PERMITTED 4 /* SACK capability in SYN */ #define TCPOLEN_SACK_PERMITTED 2 -#define TCPOPT_SACK 5 /* Experimental */ +#define TCPOPT_SACK 5 +#define TCPOLEN_SACKHDR 2 #define TCPOLEN_SACK 8 /* len of sack block */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 diff --git a/bsd/netinet/tcp_cache.c b/bsd/netinet/tcp_cache.c index 8afdbb6ab..58a5e4737 100644 --- a/bsd/netinet/tcp_cache.c +++ b/bsd/netinet/tcp_cache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2024 Apple Inc. All rights reserved. + * Copyright (c) 2015-2025 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,18 +42,7 @@ #include #include -typedef union { - struct in_addr addr; - struct in6_addr addr6; -} in_4_6_addr; - -struct tcp_heuristic_key { - union { - uint8_t thk_net_signature[IFNET_SIGNATURELEN]; - in_4_6_addr thk_ip; - }; - sa_family_t thk_family; -}; +#include struct tcp_heuristic { SLIST_ENTRY(tcp_heuristic) list; @@ -69,10 +58,9 @@ struct tcp_heuristic { uint8_t th_tfo_req_rst; /* The number of times a SYN+cookie-req has received a RST */ uint8_t th_mptcp_loss; /* The number of times a SYN+MP_CAPABLE has been lost */ uint8_t th_mptcp_success; /* The number of times MPTCP-negotiation has been successful */ - uint8_t th_ecn_loss; /* The number of times a SYN+ecn has been lost */ + uint8_t th_ecn_loss; /* The number of times a SYN+ecn was likely dropped */ uint8_t th_ecn_aggressive; /* The number of times we did an aggressive fallback */ uint8_t th_ecn_droprst; /* The number of times ECN connections received a RST after first data pkt */ - uint8_t th_ecn_droprxmt; /* The number of times ECN connection is dropped after multiple retransmits */ uint8_t th_ecn_synrst; /* number of times RST was received in response to an ECN enabled SYN */ uint32_t th_tfo_enabled_time; /* The moment when we reenabled TFO after backing off */ uint32_t th_tfo_backoff_until; /* Time until when we should not try out TFO */ @@ -86,6 +74,7 @@ struct tcp_heuristic { // N.B.: we may sometimes erase ALL values from th_val_start to the end of the structure. }; + struct tcp_heuristics_head { SLIST_HEAD(tcp_heur_bucket, tcp_heuristic) tcp_heuristics; @@ -93,16 +82,6 @@ struct tcp_heuristics_head { lck_mtx_t thh_mtx; }; -struct tcp_cache_key { - sa_family_t tck_family; - - struct tcp_heuristic_key tck_src; - in_4_6_addr tck_dst; -}; - -#define MPTCP_VERSION_SUPPORTED 1 -#define MPTCP_VERSION_UNSUPPORTED -1 -#define MPTCP_VERSION_SUPPORTED_UNKNOWN 0 struct tcp_cache { SLIST_ENTRY(tcp_cache) list; @@ -158,10 +137,10 @@ static uint32_t tcp_backoff_maximum = 65536; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, backoff_maximum, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_backoff_maximum, 0, "Maximum time for which we won't try TFO"); -static uint32_t tcp_ecn_timeout = 60; +static uint32_t tcp_ecn_timeout = 5; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ecn_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, - &tcp_ecn_timeout, 60, "Initial minutes to wait before re-trying ECN"); + &tcp_ecn_timeout, 5, "Initial minutes to wait before re-trying ECN"); static int disable_tcp_heuristics = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_tcp_heuristics, CTLFLAG_RW | CTLFLAG_LOCKED, @@ -192,24 +171,25 @@ tcp_min_to_hz(uint32_t minutes) /* Number of SYN-losses we accept */ #define TFO_MAX_COOKIE_LOSS 2 -#define ECN_MAX_SYN_LOSS 2 #define MPTCP_MAX_SYN_LOSS 2 #define MPTCP_SUCCESS_TRIGGER 10 #define MPTCP_VERSION_MAX_FAIL 2 +#define ECN_MAX_SYN_LOSS 5 #define ECN_MAX_DROPRST 1 -#define ECN_MAX_DROPRXMT 4 #define ECN_MAX_SYNRST 4 +#define ECN_MAX_CE_AGGRESSIVE 1 /* Flags for setting/unsetting loss-heuristics, limited to 4 bytes */ -#define TCPCACHE_F_TFO_REQ 0x01 -#define TCPCACHE_F_TFO_DATA 0x02 -#define TCPCACHE_F_ECN 0x04 -#define TCPCACHE_F_MPTCP 0x08 -#define TCPCACHE_F_ECN_DROPRST 0x10 -#define TCPCACHE_F_ECN_DROPRXMT 0x20 -#define TCPCACHE_F_TFO_REQ_RST 0x40 -#define TCPCACHE_F_TFO_DATA_RST 0x80 -#define TCPCACHE_F_ECN_SYNRST 0x100 +#define TCPCACHE_F_TFO_REQ 0x01 +#define TCPCACHE_F_TFO_DATA 0x02 +#define TCPCACHE_F_ECN 0x04 +#define TCPCACHE_F_MPTCP 0x08 +#define TCPCACHE_F_ECN_DROPRST 0x10 +#define 
TCPCACHE_F_ECN_AGGRESSIVE 0x20 +#define TCPCACHE_F_TFO_REQ_RST 0x40 +#define TCPCACHE_F_TFO_DATA_RST 0x80 +#define TCPCACHE_F_ECN_SYNRST 0x100 +#define TCPCACHE_F_ECN_SYN_LOSS 0x200 /* Always retry ECN after backing off to this level for some heuristics */ #define ECN_RETRY_LIMIT 9 @@ -814,12 +794,10 @@ tcp_heuristic_reset_counters(struct tcp_cache_key_src *tcks, uint8_t flags) } if (flags & TCPCACHE_F_ECN) { - if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS || tpheur->th_ecn_synrst >= ECN_MAX_SYNRST) { - os_log(OS_LOG_DEFAULT, "%s: Resetting ECN-loss to 0 from %u and synrst from %u on heur %lx\n", - __func__, tpheur->th_ecn_loss, tpheur->th_ecn_synrst, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); - } tpheur->th_ecn_loss = 0; + tpheur->th_ecn_aggressive = 0; tpheur->th_ecn_synrst = 0; + tpheur->th_ecn_droprst = 0; } if (flags & TCPCACHE_F_MPTCP) { @@ -968,23 +946,6 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, } } - if ((flags & TCPCACHE_F_ECN) && - tpheur->th_ecn_loss < TCP_CACHE_OVERFLOW_PROTECT && - TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) { - tpheur->th_ecn_loss++; - if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS) { - tcpstat.tcps_ecn_fallback_synloss++; - TCP_CACHE_INC_IFNET_STAT(tcks->ifp, tcks->af, ecn_fallback_synloss); - tpheur->th_ecn_backoff = tcp_now + - (tcp_min_to_hz(tcp_ecn_timeout) << - (tpheur->th_ecn_loss - ECN_MAX_SYN_LOSS)); - - os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for SYN-loss\n", - __func__, tpheur->th_ecn_backoff, tcp_now, - (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); - } - } - if ((flags & TCPCACHE_F_MPTCP) && tpheur->th_mptcp_loss < TCP_CACHE_OVERFLOW_PROTECT && tpheur->th_mptcp_heuristic_disabled == 0) { @@ -1005,6 +966,32 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, } } + if ((flags & TCPCACHE_F_ECN_SYN_LOSS) && + tpheur->th_ecn_loss < TCP_CACHE_OVERFLOW_PROTECT && + TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) { + tpheur->th_ecn_loss++; + if (tpheur->th_ecn_loss >= ECN_MAX_SYN_LOSS) { + tcpstat.tcps_ecn_fallback_synloss++; + TCP_CACHE_INC_IFNET_STAT(tcks->ifp, tcks->af, ecn_fallback_synloss); + tpheur->th_ecn_backoff = tcp_now + + (tcp_min_to_hz(tcp_ecn_timeout) << + (tpheur->th_ecn_loss - ECN_MAX_SYN_LOSS)); + } + } + + if ((flags & TCPCACHE_F_ECN_AGGRESSIVE) && + tpheur->th_ecn_aggressive < TCP_CACHE_OVERFLOW_PROTECT && + TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) { + tpheur->th_ecn_aggressive++; + if (tpheur->th_ecn_aggressive >= ECN_MAX_CE_AGGRESSIVE) { + tcpstat.tcps_ecn_fallback_ce++; + TCP_CACHE_INC_IFNET_STAT(tcks->ifp, tcks->af, ecn_fallback_ce); + tpheur->th_ecn_backoff = tcp_now + + (tcp_min_to_hz(tcp_ecn_timeout) << + (tpheur->th_ecn_aggressive - ECN_MAX_CE_AGGRESSIVE)); + } + } + if ((flags & TCPCACHE_F_ECN_DROPRST) && tpheur->th_ecn_droprst < TCP_CACHE_OVERFLOW_PROTECT && TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) { @@ -1016,30 +1003,9 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, tpheur->th_ecn_backoff = tcp_now + (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_droprst - ECN_MAX_DROPRST)); - - os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for drop-RST\n", - __func__, tpheur->th_ecn_backoff, tcp_now, - (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } } - if ((flags & TCPCACHE_F_ECN_DROPRXMT) && - tpheur->th_ecn_droprxmt < TCP_CACHE_OVERFLOW_PROTECT && - TSTMP_LEQ(tpheur->th_ecn_backoff, tcp_now)) { - tpheur->th_ecn_droprxmt++; - if (tpheur->th_ecn_droprxmt >= ECN_MAX_DROPRXMT) { - tcpstat.tcps_ecn_fallback_droprxmt++; - TCP_CACHE_INC_IFNET_STAT(tcks->ifp, 
tcks->af, - ecn_fallback_droprxmt); - tpheur->th_ecn_backoff = tcp_now + - (tcp_min_to_hz(tcp_ecn_timeout) << - (tpheur->th_ecn_droprxmt - ECN_MAX_DROPRXMT)); - - os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for drop-Rxmit\n", - __func__, tpheur->th_ecn_backoff, tcp_now, - (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); - } - } if ((flags & TCPCACHE_F_ECN_SYNRST) && tpheur->th_ecn_synrst < TCP_CACHE_OVERFLOW_PROTECT) { tpheur->th_ecn_synrst++; @@ -1050,10 +1016,6 @@ tcp_heuristic_inc_counters(struct tcp_cache_key_src *tcks, tpheur->th_ecn_backoff = tcp_now + (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_synrst - ECN_MAX_SYNRST)); - - os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx for SYN-RST\n", - __func__, tpheur->th_ecn_backoff, tcp_now, - (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); } } tcp_heuristic_unlock(head); @@ -1126,8 +1088,7 @@ tcp_heuristic_ecn_loss(struct tcpcb *tp) } tcp_cache_key_src_create(tp, &tcks); - - tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN); + tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_SYN_LOSS); } void @@ -1136,30 +1097,27 @@ tcp_heuristic_ecn_droprst(struct tcpcb *tp) struct tcp_cache_key_src tcks; tcp_cache_key_src_create(tp, &tcks); - tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_DROPRST); } -void -tcp_heuristic_ecn_droprxmt(struct tcpcb *tp) -{ - struct tcp_cache_key_src tcks; - - tcp_cache_key_src_create(tp, &tcks); - - tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_DROPRXMT); -} - void tcp_heuristic_ecn_synrst(struct tcpcb *tp) { struct tcp_cache_key_src tcks; tcp_cache_key_src_create(tp, &tcks); - tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_SYNRST); } +void +tcp_heuristic_ecn_aggressive(struct tcpcb *tp) +{ + struct tcp_cache_key_src tcks; + + tcp_cache_key_src_create(tp, &tcks); + tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_AGGRESSIVE); +} + void tcp_heuristic_tfo_middlebox(struct tcpcb *tp) { @@ -1171,50 +1129,6 @@ tcp_heuristic_tfo_middlebox(struct tcpcb *tp) tcp_heuristic_tfo_middlebox_common(&tcks); } -static void -tcp_heuristic_ecn_aggressive_common(struct tcp_cache_key_src *tcks) -{ - struct tcp_heuristics_head *__single head; - struct tcp_heuristic *__single tpheur; - - tpheur = tcp_getheuristic_with_lock(tcks, 1, &head); - if (tpheur == NULL) { - return; - } - - if (TSTMP_GT(tpheur->th_ecn_backoff, tcp_now)) { - /* We are already in aggressive mode */ - tcp_heuristic_unlock(head); - return; - } - - /* Must be done before, otherwise we will start off with expo-backoff */ - tpheur->th_ecn_backoff = tcp_now + - (tcp_min_to_hz(tcp_ecn_timeout) << (tpheur->th_ecn_aggressive)); - - /* - * Ugly way to prevent integer overflow... limit to prevent in - * overflow during exp. backoff. 
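/*
 * Editorial aside -- the ECN heuristics above back off exponentially:
 * once a failure counter reaches its threshold, every further failure
 * doubles the quiet period by shifting the base timeout left.  A
 * minimal sketch of that arithmetic; the shift cap below is an
 * illustrative stand-in for the kernel's counter clamping
 * (TCP_CACHE_OVERFLOW_PROTECT), and the names are not xnu symbols.
 */
#include <stdint.h>

static uint32_t
example_ecn_backoff_until(uint32_t now, uint32_t base_ticks,
    uint32_t failures, uint32_t threshold)
{
	uint32_t shift = (failures > threshold) ? (failures - threshold) : 0;

	if (shift > 16) {
		shift = 16; /* crude guard against shifting into overflow */
	}
	return now + (base_ticks << shift);
}
/*
 * With the new 5-minute base and ECN_MAX_SYN_LOSS = 5, the 5th, 6th and
 * 7th SYN loss would disable ECN for roughly 5, 10 and 20 minutes.
 */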
- */ - if (tpheur->th_ecn_aggressive < TCP_CACHE_OVERFLOW_PROTECT) { - tpheur->th_ecn_aggressive++; - } - - tcp_heuristic_unlock(head); - - os_log(OS_LOG_DEFAULT, "%s disable ECN until %u now %u on %lx\n", __func__, - tpheur->th_ecn_backoff, tcp_now, (unsigned long)VM_KERNEL_ADDRPERM(tpheur)); -} - -void -tcp_heuristic_ecn_aggressive(struct tcpcb *tp) -{ - struct tcp_cache_key_src tcks; - - tcp_cache_key_src_create(tp, &tcks); - tcp_heuristic_ecn_aggressive_common(&tcks); -} - static boolean_t tcp_heuristic_do_tfo_common(struct tcp_cache_key_src *tcks) { @@ -1351,9 +1265,7 @@ tcp_heuristic_do_ecn_common(struct tcp_cache_key_src *tcks) if (tpheur->th_ecn_droprst >= ECN_RETRY_LIMIT) { tpheur->th_ecn_droprst = 0; } - if (tpheur->th_ecn_droprxmt >= ECN_RETRY_LIMIT) { - tpheur->th_ecn_droprxmt = 0; - } + if (tpheur->th_ecn_synrst >= ECN_RETRY_LIMIT) { tpheur->th_ecn_synrst = 0; } @@ -1420,15 +1332,13 @@ tcp_heuristics_ecn_update(struct necp_tcp_ecn_cache *necp_buffer, if (necp_buffer->necp_tcp_ecn_heuristics_success) { tcp_heuristic_reset_counters(&tcks, TCPCACHE_F_ECN); } else if (necp_buffer->necp_tcp_ecn_heuristics_loss) { - tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN); + tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_SYN_LOSS); } else if (necp_buffer->necp_tcp_ecn_heuristics_drop_rst) { tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_DROPRST); - } else if (necp_buffer->necp_tcp_ecn_heuristics_drop_rxmt) { - tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_DROPRXMT); } else if (necp_buffer->necp_tcp_ecn_heuristics_syn_rst) { tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_SYNRST); } else if (necp_buffer->necp_tcp_ecn_heuristics_aggressive) { - tcp_heuristic_ecn_aggressive_common(&tcks); + tcp_heuristic_inc_counters(&tcks, TCPCACHE_F_ECN_AGGRESSIVE); } return; @@ -1638,6 +1548,175 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, clear_tfocache, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcpcleartfo, 0, &sysctl_cleartfo, "I", "Toggle to clear the TFO destination based heuristic cache"); +static int +sysctl_tcp_heuristics_list SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0; + size_t total_entries = 0; + size_t total_size; + bool entitled = false; + + if (tcp_heuristics == NULL || tcp_heuristics_size == 0) { + return ENOENT; + } + + if (IOCurrentTaskHasEntitlement(TCP_HEURISTICS_LIST_ENTITLEMENT)) { + entitled = true; + } + + /* First pass: count total number of heuristic entries across all buckets */ + for (size_t i = 0; i < tcp_heuristics_size; i++) { + struct tcp_heuristics_head *head = &tcp_heuristics[i]; + struct tcp_heuristic *tpheur; + + lck_mtx_lock(&head->thh_mtx); + SLIST_FOREACH(tpheur, &head->tcp_heuristics, list) { + total_entries++; + } + lck_mtx_unlock(&head->thh_mtx); + } + + total_size = total_entries * sizeof(struct tcp_heuristics_data); + + if (req->oldptr == USER_ADDR_NULL) { + /* Just return the size needed */ + return SYSCTL_OUT(req, NULL, total_size); + } + + if (req->oldlen < total_size) { + return ENOMEM; + } + + /* Second pass: copy out all heuristic entries */ + for (size_t i = 0; i < tcp_heuristics_size; i++) { + struct tcp_heuristics_head *head = &tcp_heuristics[i]; + struct tcp_heuristic *tpheur; + + lck_mtx_lock(&head->thh_mtx); + SLIST_FOREACH(tpheur, &head->tcp_heuristics, list) { + struct tcp_heuristics_data heur_data; + + /* Copy data from tcp_heuristic to tcp_heuristics_data (excluding list field) */ + heur_data.th_last_access = tpheur->th_last_access; + if (entitled) { + heur_data.th_key = tpheur->th_key; + } else { + 
heur_data.th_key.thk_family = tpheur->th_key.thk_family; + } + heur_data.th_tfo_data_loss = tpheur->th_tfo_data_loss; + heur_data.th_tfo_req_loss = tpheur->th_tfo_req_loss; + heur_data.th_tfo_data_rst = tpheur->th_tfo_data_rst; + heur_data.th_tfo_req_rst = tpheur->th_tfo_req_rst; + heur_data.th_mptcp_loss = tpheur->th_mptcp_loss; + heur_data.th_mptcp_success = tpheur->th_mptcp_success; + heur_data.th_ecn_droprst = tpheur->th_ecn_droprst; + heur_data.th_ecn_synrst = tpheur->th_ecn_synrst; + heur_data.th_tfo_enabled_time = tpheur->th_tfo_enabled_time; + heur_data.th_tfo_backoff_until = tpheur->th_tfo_backoff_until; + heur_data.th_tfo_backoff = tpheur->th_tfo_backoff; + heur_data.th_mptcp_backoff = tpheur->th_mptcp_backoff; + heur_data.th_ecn_backoff = tpheur->th_ecn_backoff; + heur_data.th_tfo_in_backoff = tpheur->th_tfo_in_backoff; + heur_data.th_mptcp_in_backoff = tpheur->th_mptcp_in_backoff; + heur_data.th_mptcp_heuristic_disabled = tpheur->th_mptcp_heuristic_disabled; + + error = SYSCTL_OUT(req, &heur_data, sizeof(struct tcp_heuristics_data)); + if (error) { + lck_mtx_unlock(&head->thh_mtx); + return error; + } + } + lck_mtx_unlock(&head->thh_mtx); + } + + return error; +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, heuristics_list, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_LOCKED, + NULL, 0, sysctl_tcp_heuristics_list, "S,tcp_heuristics_data", + "TCP heuristics entries from all buckets"); + +static int +sysctl_tcp_cache_list SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0; + size_t total_entries = 0; + size_t total_size; + bool entitled = false; + + if (tcp_cache == NULL || tcp_cache_size == 0) { + return ENOENT; + } + + if (IOCurrentTaskHasEntitlement(TCP_CACHE_LIST_ENTITLEMENT)) { + entitled = true; + } + + /* First pass: count total number of cache entries across all buckets */ + for (size_t i = 0; i < tcp_cache_size; i++) { + struct tcp_cache_head *head = &tcp_cache[i]; + struct tcp_cache *tpcache; + + lck_mtx_lock(&head->tch_mtx); + SLIST_FOREACH(tpcache, &head->tcp_caches, list) { + total_entries++; + } + lck_mtx_unlock(&head->tch_mtx); + } + + total_size = total_entries * sizeof(struct tcp_cache_data); + + if (req->oldptr == USER_ADDR_NULL) { + /* Just return the size needed */ + return SYSCTL_OUT(req, NULL, total_size); + } + + if (req->oldlen < total_size) { + return ENOMEM; + } + + /* Second pass: copy out all cache entries */ + for (size_t i = 0; i < tcp_cache_size; i++) { + struct tcp_cache_head *head = &tcp_cache[i]; + struct tcp_cache *tpcache; + + lck_mtx_lock(&head->tch_mtx); + SLIST_FOREACH(tpcache, &head->tcp_caches, list) { + struct tcp_cache_data cache_data; + + /* Copy data from tcp_cache to tcp_cache_data (excluding list field) */ + cache_data.tc_last_access = tpcache->tc_last_access; + if (entitled) { + cache_data.tc_key = tpcache->tc_key; + } else { + cache_data.tc_key.tck_family = tpcache->tc_key.tck_family; + } + memcpy(cache_data.tc_tfo_cookie, tpcache->tc_tfo_cookie, TFO_COOKIE_LEN_MAX); + cache_data.tc_tfo_cookie_len = tpcache->tc_tfo_cookie_len; + cache_data.tc_mptcp_version_confirmed = tpcache->tc_mptcp_version_confirmed; + cache_data.tc_mptcp_version = tpcache->tc_mptcp_version; + cache_data.tc_mptcp_next_version_try = tpcache->tc_mptcp_next_version_try; + + error = SYSCTL_OUT(req, &cache_data, sizeof(struct tcp_cache_data)); + if (error) { + lck_mtx_unlock(&head->tch_mtx); + return error; + } + } + lck_mtx_unlock(&head->tch_mtx); + } + + return error; +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, cache_list, + CTLTYPE_OPAQUE | CTLFLAG_RD | 
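/*
 * Editorial aside -- the two sysctl handlers above share the usual
 * size-then-copy shape: a first pass counts entries (and, when the
 * caller supplied no buffer, only the required size is reported), a
 * second pass walks the buckets again and copies the entries out.  A
 * hedged user-space sketch of that shape over a plain array, without
 * the xnu sysctl request types:
 */
#include <errno.h>
#include <stddef.h>
#include <string.h>

struct example_record {
	int value;
};

static int
example_export(const struct example_record *table, size_t count,
    void *out, size_t *inout_len)
{
	size_t need = count * sizeof(struct example_record);

	if (out == NULL) {              /* size query only */
		*inout_len = need;
		return 0;
	}
	if (*inout_len < need) {        /* caller's buffer is too small */
		return ENOMEM;
	}
	memcpy(out, table, need);       /* "second pass": copy everything out */
	*inout_len = need;
	return 0;
}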
CTLFLAG_LOCKED, + NULL, 0, sysctl_tcp_cache_list, "S,tcp_cache_data", + "TCP cache entries from all buckets"); + void tcp_cache_init(void) { diff --git a/bsd/netinet/tcp_cache.h b/bsd/netinet/tcp_cache.h index a12c9f1b1..45df2526a 100644 --- a/bsd/netinet/tcp_cache.h +++ b/bsd/netinet/tcp_cache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2024 Apple Inc. All rights reserved. + * Copyright (c) 2015-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -31,11 +31,86 @@ #ifndef _NETINET_TCP_CACHE_H #define _NETINET_TCP_CACHE_H +#include #include #include -#define ECN_MIN_CE_PROBES 10 /* Probes are basically the number of incoming packets */ -#define ECN_MAX_CE_RATIO 7 /* Ratio is the maximum number of CE-packets we accept per incoming "probe" */ +#ifdef PRIVATE + +#define TCP_HEURISTICS_LIST_ENTITLEMENT "com.apple.private.tcp.heuristics_list" +#define TCP_CACHE_LIST_ENTITLEMENT "com.apple.private.tcp.cache_list" + +typedef union { + struct in_addr addr; + struct in6_addr addr6; +} in_4_6_addr; + +struct tcp_heuristic_key { + union { + uint8_t thk_net_signature[IFNET_SIGNATURELEN]; + in_4_6_addr thk_ip; + }; + sa_family_t thk_family; +}; + +/* Data structure for sysctl export - same as tcp_heuristic but without list field */ +struct tcp_heuristics_data { + uint32_t th_last_access; + + struct tcp_heuristic_key th_key; + + uint8_t th_tfo_data_loss; /* The number of times a SYN+data has been lost */ + uint8_t th_tfo_req_loss; /* The number of times a SYN+cookie-req has been lost */ + uint8_t th_tfo_data_rst; /* The number of times a SYN+data has received a RST */ + uint8_t th_tfo_req_rst; /* The number of times a SYN+cookie-req has received a RST */ + uint8_t th_mptcp_loss; /* The number of times a SYN+MP_CAPABLE has been lost */ + uint8_t th_mptcp_success; /* The number of times MPTCP-negotiation has been successful */ + uint8_t th_ecn_droprst; /* The number of times ECN connections received a RST after first data pkt */ + uint8_t th_ecn_synrst; /* number of times RST was received in response to an ECN enabled SYN */ + uint32_t th_tfo_enabled_time; /* The moment when we reenabled TFO after backing off */ + uint32_t th_tfo_backoff_until; /* Time until when we should not try out TFO */ + uint32_t th_tfo_backoff; /* Current backoff timer */ + uint32_t th_mptcp_backoff; /* Time until when we should not try out MPTCP */ + uint32_t th_ecn_backoff; /* Time until when we should not try out ECN */ + + uint8_t th_tfo_in_backoff:1, /* Are we avoiding TFO due to the backoff timer? */ + th_mptcp_in_backoff:1, /* Are we avoiding MPTCP due to the backoff timer? */ + th_mptcp_heuristic_disabled:1; /* Are heuristics disabled? 
*/ +}; + +struct tcp_cache_key { + sa_family_t tck_family; + + struct tcp_heuristic_key tck_src; + in_4_6_addr tck_dst; +}; + +#define TFO_COOKIE_LEN_MAX 16 + +/* Data structure for sysctl export - same as tcp_cache but without list field */ +struct tcp_cache_data { + uint32_t tc_last_access; + + struct tcp_cache_key tc_key; + + uint8_t tc_tfo_cookie[TFO_COOKIE_LEN_MAX]; + uint8_t tc_tfo_cookie_len; + + uint8_t tc_mptcp_version_confirmed:1; + uint8_t tc_mptcp_version; /* version to use right now */ + uint32_t tc_mptcp_next_version_try; /* Time, until we try preferred version again */ +}; + +#define MPTCP_VERSION_SUPPORTED 1 +#define MPTCP_VERSION_UNSUPPORTED -1 +#define MPTCP_VERSION_SUPPORTED_UNKNOWN 0 + +#endif /* PRIVATE */ + +#ifdef KERNEL_PRIVATE + +#define ECN_MIN_CE_PROBES (20) /* Probes are basically the number of incoming packets */ +#define ECN_MAX_CE_RATIO (18) /* Ratio is the maximum number of E/CE-packets we accept per incoming "probe" */ extern void tcp_cache_set_cookie(struct tcpcb *tp, u_char *__counted_by(len) cookie, u_int8_t len); extern int tcp_cache_get_cookie(struct tcpcb *tp, u_char *__counted_by(buflen) cookie, uint8_t buflen, u_int8_t *len); @@ -47,8 +122,8 @@ extern void tcp_heuristic_tfo_loss(struct tcpcb *tp); extern void tcp_heuristic_tfo_rst(struct tcpcb *tp); extern void tcp_heuristic_mptcp_loss(struct tcpcb *tp); extern void tcp_heuristic_ecn_loss(struct tcpcb *tp); -extern void tcp_heuristic_tfo_middlebox(struct tcpcb *tp); extern void tcp_heuristic_ecn_aggressive(struct tcpcb *tp); +extern void tcp_heuristic_tfo_middlebox(struct tcpcb *tp); extern void tcp_heuristic_tfo_success(struct tcpcb *tp); extern void tcp_heuristic_mptcp_success(struct tcpcb *tp); extern void tcp_heuristic_ecn_success(struct tcpcb *tp); @@ -56,7 +131,6 @@ extern boolean_t tcp_heuristic_do_tfo(struct tcpcb *tp); extern int tcp_heuristic_do_mptcp(struct tcpcb *tp); extern boolean_t tcp_heuristic_do_ecn(struct tcpcb *tp); extern void tcp_heuristic_ecn_droprst(struct tcpcb *tp); -extern void tcp_heuristic_ecn_droprxmt(struct tcpcb *tp); extern void tcp_heuristic_ecn_synrst(struct tcpcb *tp); extern boolean_t tcp_heuristic_do_ecn_with_address(struct ifnet *ifp, @@ -72,4 +146,5 @@ extern void tcp_heuristics_tfo_update(struct necp_tcp_tfo_cache *necp_buffer, extern void tcp_cache_init(void); +#endif /* KERNEL_PRIVATE */ #endif /* _NETINET_TCP_CACHE_H */ diff --git a/bsd/netinet/tcp_cc.c b/bsd/netinet/tcp_cc.c index 4c17ee8ad..c2a82fed8 100644 --- a/bsd/netinet/tcp_cc.c +++ b/bsd/netinet/tcp_cc.c @@ -116,13 +116,7 @@ tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp) void tcp_cc_cwnd_init_or_reset(struct tcpcb *tp) { - if (tcp_cubic_minor_fixes) { - tp->snd_cwnd = tcp_initial_cwnd(tp); - } else { - /* initial congestion window according to RFC 3390 */ - tp->snd_cwnd = min(4 * tp->t_maxseg, - max(2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES)); - } + tp->snd_cwnd = tcp_initial_cwnd(tp); } /* @@ -157,55 +151,52 @@ tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th) } break; case 3: - if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) { - if ((tp->t_flags & TF_RXWIN0SENT) == 0 && - (th->th_flags & TH_PUSH) == 0 && - ((tp->t_unacksegs == 1) || - ((tp->t_flags & TF_STRETCHACK) && - tp->t_unacksegs < maxseg_unacked))) { - return 1; - } - } else { - uint32_t recwin; + { + uint32_t recwin; - /* Get the receive-window we would announce */ - recwin = tcp_sbspace(tp); - if (recwin > (uint32_t)(TCP_MAXWIN << tp->rcv_scale)) { - recwin = (uint32_t)(TCP_MAXWIN << tp->rcv_scale); - } - - /* Delay ACK, if: - * - * 1. 
We are not sending a zero-window - * 2. We are not forcing fast ACKs - * 3. We have more than the low-water mark in receive-buffer - * 4. The receive-window is not increasing - * 5. We have less than or equal of an MSS unacked or - * Window actually has been growing larger than the initial value by half of it. - * (this makes sure that during ramp-up we ACK every second MSS - * until we pass the tcp_recvspace * 1.5-threshold) - * 6. We haven't waited for half a BDP - * 7. The amount of unacked data is less than the maximum ACK-burst (256 MSS) - * We try to avoid having the sender end up hitting huge ACK-ranges. - * - * (a note on 6: The receive-window is - * roughly 2 BDP. Thus, recwin / 4 means half a BDP and - * thus we enforce an ACK roughly twice per RTT - even - * if the app does not read) - */ - if ((tp->t_flags & TF_RXWIN0SENT) == 0 && - tp->t_forced_acks == 0 && - tp->t_inpcb->inp_socket->so_rcv.sb_cc > tp->t_inpcb->inp_socket->so_rcv.sb_lowat && - recwin <= tp->t_last_recwin && - (tp->rcv_nxt - tp->last_ack_sent <= tp->t_maxseg || - recwin > (uint32_t)(tcp_recvspace + (tcp_recvspace >> 1))) && - (tp->rcv_nxt - tp->last_ack_sent) < (recwin >> 2) && - (tp->rcv_nxt - tp->last_ack_sent) < 256 * tp->t_maxseg) { - tp->t_stat.acks_delayed++; - return 1; - } + /* Get the receive-window we would announce */ + recwin = tcp_sbspace(tp); + if (recwin > (uint32_t)(TCP_MAXWIN << tp->rcv_scale)) { + recwin = (uint32_t)(TCP_MAXWIN << tp->rcv_scale); } - break; + + if ((tp->t_flagsext & TF_QUICKACK) && + tp->rcv_nxt - tp->last_ack_sent <= tp->t_maxseg) { + return 0; + } + + /* Delay ACK, if: + * + * 1. We are not sending a zero-window + * 2. We are not forcing fast ACKs + * 3. We have more than the low-water mark in receive-buffer + * 4. The receive-window is not increasing + * 5. We have less than or equal of an MSS unacked or + * Window actually has been growing larger than the initial value by half of it. + * (this makes sure that during ramp-up we ACK every second MSS + * until we pass the tcp_recvspace * 1.5-threshold) + * 6. We haven't waited for half a BDP + * 7. The amount of unacked data is less than the maximum ACK-burst (256 MSS) + * We try to avoid having the sender end up hitting huge ACK-ranges. + * + * (a note on 6: The receive-window is + * roughly 2 BDP. Thus, recwin / 4 means half a BDP and + * thus we enforce an ACK roughly twice per RTT - even + * if the app does not read) + */ + if ((tp->t_flags & TF_RXWIN0SENT) == 0 && + tp->t_forced_acks == 0 && + tp->t_inpcb->inp_socket->so_rcv.sb_cc > tp->t_inpcb->inp_socket->so_rcv.sb_lowat && + recwin <= tp->t_last_recwin && + (tp->rcv_nxt - tp->last_ack_sent <= tp->t_maxseg || + recwin > (uint32_t)(tcp_recvspace + (tcp_recvspace >> 1))) && + (tp->rcv_nxt - tp->last_ack_sent) < (recwin >> 2) && + (tp->rcv_nxt - tp->last_ack_sent) < 256 * tp->t_maxseg) { + tp->t_stat.acks_delayed++; + return 1; + } + } + break; } return 0; } @@ -223,35 +214,6 @@ tcp_cc_allocate_state(struct tcpcb *tp) } } -/* - * If stretch ack was disabled automatically on long standing connections, - * re-evaluate the situation after 15 minutes to enable it. 
- */ -#define TCP_STRETCHACK_DISABLE_WIN (15 * 60 * TCP_RETRANSHZ) -void -tcp_cc_after_idle_stretchack(struct tcpcb *tp) -{ - struct tcp_globals *globals; - int32_t tdiff; - - if (!(tp->t_flagsext & TF_DISABLE_STRETCHACK)) { - return; - } - - globals = tcp_get_globals(tp); - tdiff = timer_diff(tcp_globals_now(globals), 0, tp->rcv_nostrack_ts, 0); - if (tdiff < 0) { - tdiff = -tdiff; - } - - if (tdiff > TCP_STRETCHACK_DISABLE_WIN) { - tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; - tp->t_stretchack_delayed = 0; - - tcp_reset_stretch_ack(tp); - } -} - /* * Detect if the congestion window is non-validated according to * draft-ietf-tcpm-newcwv-07 @@ -261,6 +223,20 @@ tcp_cc_is_cwnd_nonvalidated(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; + if (tp->t_inpcb->inp_max_pacing_rate != UINT64_MAX) { + uint64_t rate; + + rate = tcp_compute_measured_rate(tp); + + /* + * Multiply by 2 because we want some amount of standing queue + * in the AQM + */ + if (tp->t_inpcb->inp_max_pacing_rate < (rate >> 1)) { + return 1; + } + } + if (tp->t_pipeack == 0) { tp->t_flagsext &= ~TF_CWND_NONVALIDATED; return 0; @@ -291,11 +267,7 @@ tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp) tp->t_pipeack = tcp_get_max_pipeack(tp); tcp_clear_pipeack_state(tp); tp->snd_cwnd = (max(tp->t_pipeack, tp->t_lossflightsize) >> 1); - if (tcp_cubic_minor_fixes) { - tp->snd_cwnd = max(tp->snd_cwnd, tp->t_maxseg); - } else { - tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES); - } + tp->snd_cwnd = max(tp->snd_cwnd, tp->t_maxseg); tp->snd_cwnd += tp->t_maxseg * tcprexmtthresh; tp->t_flagsext &= ~TF_CWND_NONVALIDATED; } diff --git a/bsd/netinet/tcp_cc.h b/bsd/netinet/tcp_cc.h index 2db6b46a4..042933685 100644 --- a/bsd/netinet/tcp_cc.h +++ b/bsd/netinet/tcp_cc.h @@ -139,6 +139,7 @@ struct tcp_cc_debug_state { X(TCP_CC_TLP_IN_FASTRECOVERY) \ X(TCP_CC_DSACK_BAD_REXMT) \ X(TCP_CC_FIRST_REXMT) \ + X(TCP_CC_FLOW_CONGESTION_NOTIFIED) \ X(MAX_TCP_CC_EVENTS) enum tcp_cc_event { @@ -275,7 +276,6 @@ extern void tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp); extern void tcp_cc_cwnd_init_or_reset(struct tcpcb *tp); extern int tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th); extern void tcp_cc_allocate_state(struct tcpcb *tp); -extern void tcp_cc_after_idle_stretchack(struct tcpcb *tp); extern uint32_t tcp_cc_is_cwnd_nonvalidated(struct tcpcb *tp); extern void tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp); extern u_int32_t tcp_get_max_pipeack(struct tcpcb *tp); @@ -284,11 +284,7 @@ extern void tcp_clear_pipeack_state(struct tcpcb *tp); static inline uint32_t tcp_initial_cwnd(struct tcpcb *tp) { - if (tcp_cubic_minor_fixes) { - return TCP_CC_CWND_INIT_PKTS * tp->t_maxseg; - } else { - return TCP_CC_CWND_INIT_BYTES; - } + return TCP_CC_CWND_INIT_PKTS * tp->t_maxseg; } #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/tcp_cubic.c b/bsd/netinet/tcp_cubic.c index e14626743..4e1eee34b 100644 --- a/bsd/netinet/tcp_cubic.c +++ b/bsd/netinet/tcp_cubic.c @@ -79,15 +79,9 @@ tcp_cubic_init(struct tcpcb *tp) { os_atomic_inc(&tcp_cc_cubic.num_sockets, relaxed); - if (tcp_cubic_rfc_compliant) { - tcp_cubic_backoff = 0.3f; /* multiplicative decrease factor */ - tcp_cubic_fast_convergence_factor = 0.85f; - tcp_cubic_beta = 0.7f; - } else { - tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */ - tcp_cubic_fast_convergence_factor = 0.875f; - tcp_cubic_beta = 0.8f; - } + tcp_cubic_backoff = 0.3f; /* multiplicative decrease factor */ + tcp_cubic_fast_convergence_factor = 0.85f; + tcp_cubic_beta = 0.7f; VERIFY(tp->t_ccstate != 
NULL); tcp_cubic_clear_state(tp); @@ -126,13 +120,16 @@ tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp) * loss and Cubic will enter steady-state too early. It is better * to always probe to find the initial slow-start threshold. */ - if (tp->t_inpcb->inp_stat->txbytes <= tcp_initial_cwnd(tp) && + if (tp->t_inpcb->inp_mstat.ms_total.ts_txbytes <= tcp_initial_cwnd(tp) && tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT)) { tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; } /* Initialize cubic last max to be same as ssthresh */ tp->t_ccstate->cub_last_max = tp->snd_ssthresh; + + /* Set initial pacer state */ + tcp_update_pacer_state(tp); } /* @@ -169,11 +166,7 @@ tcp_cubic_update(struct tcpcb *tp, uint32_t rtt) * period that the window will take to increase to * last_max again after backoff due to loss. */ - if (tcp_cubic_minor_fixes) { - K = ((float)tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff; - } else { - K = (tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff; - } + K = ((float)tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff; K = cbrtf(K); tp->t_ccstate->cub_epoch_period = K * TCP_RETRANSHZ; /* Origin point */ @@ -240,36 +233,25 @@ tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th) * at the beginning of the epoch. */ tp->t_ccstate->cub_tcp_win = min(tp->snd_cwnd, tp->snd_wnd); - if (tcp_cubic_minor_fixes) { - tp->t_ccstate->cub_tcp_bytes_acked = BYTES_ACKED(th, tp); - } else { - tp->t_ccstate->cub_tcp_bytes_acked = 0; - } + tp->t_ccstate->cub_tcp_bytes_acked = BYTES_ACKED(th, tp); } else { tp->t_ccstate->cub_tcp_bytes_acked += BYTES_ACKED(th, tp); - if (tcp_cubic_minor_fixes) { - /* - * Increase by ai_factor * MSS, once per RTT. Counting bytes_acked - * against the snd_cwnd represents exactly one RTT at full rate. - */ - while (tp->t_ccstate->cub_tcp_bytes_acked >= tp->snd_cwnd) { - /* Enough bytes have been ACK'd for TCP to do AIMD*/ - tp->t_ccstate->cub_tcp_bytes_acked -= tp->snd_cwnd; + /* + * Increase by ai_factor * MSS, once per RTT. Counting bytes_acked + * against the snd_cwnd represents exactly one RTT at full rate. 
+ */ + while (tp->t_ccstate->cub_tcp_bytes_acked >= tp->snd_cwnd) { + /* Enough bytes have been ACK'd for TCP to do AIMD */ + tp->t_ccstate->cub_tcp_bytes_acked -= tp->snd_cwnd; - if (tp->snd_cwnd >= tp->t_ccstate->cub_last_max || !tcp_cubic_rfc_compliant) { - tp->t_ccstate->cub_tcp_win += tp->t_maxseg; - } else { - /* Increase-rate from Section 4.2, RFC 8312 */ - float ai_factor = (float)3 * (1 - tcp_cubic_beta) / (1 + tcp_cubic_beta); - - tp->t_ccstate->cub_tcp_win += (uint32_t)(tp->t_maxseg * ai_factor); - } - } - } else { - if (tp->t_ccstate->cub_tcp_bytes_acked >= tp->t_ccstate->cub_tcp_win) { - tp->t_ccstate->cub_tcp_bytes_acked -= tp->t_ccstate->cub_tcp_win; + if (tp->snd_cwnd >= tp->t_ccstate->cub_last_max) { tp->t_ccstate->cub_tcp_win += tp->t_maxseg; + } else { + /* Increase-rate from Section 4.2, RFC 8312 */ + float ai_factor = (float)3 * (1 - tcp_cubic_beta) / (1 + tcp_cubic_beta); + + tp->t_ccstate->cub_tcp_win += (uint32_t)(tp->t_maxseg * ai_factor); } } } @@ -303,54 +285,38 @@ tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th) /* Compute TCP window if a multiplicative decrease of 0.2 is used */ tcp_win = tcp_cubic_tcpwin(tp, th); - if (tp->snd_cwnd < tcp_win && tcp_cubic_minor_fixes == 0 && TCP_CUBIC_ENABLE_TCPMODE(tp)) { - /* this connection is in TCP-friendly region */ - if (tp->t_bytes_acked >= tp->snd_cwnd) { - tp->t_bytes_acked -= tp->snd_cwnd; - tp->snd_cwnd = min(tcp_win, TCP_MAXWIN << tp->snd_scale); - } - } else { - if (cubic_target_win > tp->snd_cwnd) { - /* - * The target win is computed for the next RTT. - * To reach this value, cwnd will have to be updated - * one segment at a time. Compute how many bytes - * need to be acknowledged before we can increase - * the cwnd by one segment. - */ - incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg; - incr_win /= (cubic_target_win - tp->snd_cwnd); - if (!tcp_cubic_minor_fixes) { - if (incr_win > 0 && tp->t_bytes_acked >= incr_win) { - tp->t_bytes_acked -= incr_win; - tp->snd_cwnd = - min((tp->snd_cwnd + tp->t_maxseg), - TCP_MAXWIN << tp->snd_scale); - } - } + if (cubic_target_win > tp->snd_cwnd) { + /* + * The target win is computed for the next RTT. + * To reach this value, cwnd will have to be updated + * one segment at a time. Compute how many bytes + * need to be acknowledged before we can increase + * the cwnd by one segment. 
+ */ + incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg; + incr_win /= (cubic_target_win - tp->snd_cwnd); + } + + tcp_win = tcp_round_to(tcp_win, tp->t_maxseg); + + if (tp->snd_cwnd < tcp_win) { + uint64_t tcp_incr_win; + + tcp_incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg; + tcp_incr_win /= (tcp_win - tp->snd_cwnd); + + if (tcp_incr_win < incr_win) { + /* this connection is in TCP-friendly region */ + incr_win = tcp_incr_win; } } - if (tcp_cubic_minor_fixes) { - tcp_win = tcp_round_to(tcp_win, tp->t_maxseg); - - if (tp->snd_cwnd < tcp_win) { - uint64_t tcp_incr_win; - - tcp_incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg; - tcp_incr_win /= (tcp_win - tp->snd_cwnd); - - if (tcp_incr_win < incr_win) { - /* this connection is in TCP-friendly region */ - incr_win = tcp_incr_win; - } - } - - if (incr_win > 0 && tp->t_bytes_acked >= incr_win) { - tp->t_bytes_acked -= incr_win; - tp->snd_cwnd = min(tp->snd_cwnd + tp->t_maxseg, TCP_MAXWIN << tp->snd_scale); - } + if (incr_win > 0 && tp->t_bytes_acked >= incr_win) { + tp->t_bytes_acked -= incr_win; + tp->snd_cwnd = min(tp->snd_cwnd + tp->t_maxseg, TCP_MAXWIN << tp->snd_scale); } + + tcp_update_pacer_state(tp); } static void @@ -372,19 +338,17 @@ tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) uint32_t acked, abc_lim, incr; acked = BYTES_ACKED(th, tp); - if (tcp_cubic_minor_fixes) { - /* - * Maximum burst-size is limited to the initial congestion-window. - * We know that the network can survive this kind of burst. - */ - abc_lim = tcp_initial_cwnd(tp); - } else { - abc_lim = (tp->snd_nxt == tp->snd_max) ? 2 * tp->t_maxseg : tp->t_maxseg; - } + /* + * Maximum burst-size is limited to the initial congestion-window. + * We know that the network can survive this kind of burst. + */ + abc_lim = tcp_initial_cwnd(tp); incr = min(acked, abc_lim); tp->snd_cwnd += incr; tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale); + + tcp_update_pacer_state(tp); } } @@ -400,11 +364,7 @@ tcp_cubic_pre_fr(struct tcpcb *tp) win = min(tp->snd_cwnd, tp->snd_wnd); if (tp->t_flagsext & TF_CWND_NONVALIDATED) { tp->t_lossflightsize = tp->snd_max - tp->snd_una; - if (tcp_flow_control_response) { - win = max(tp->t_pipeack, tp->t_lossflightsize); - } else { - win = (max(tp->t_pipeack, tp->t_lossflightsize)) >> 1; - } + win = max(tp->t_pipeack, tp->t_lossflightsize); } else { tp->t_lossflightsize = 0; } @@ -418,7 +378,7 @@ tcp_cubic_pre_fr(struct tcpcb *tp) * and it is capturing some of the bandwidth. To reach convergence * quickly, backoff a little more. */ - if (win < tp->t_ccstate->cub_last_max && tcp_cubic_minor_fixes) { + if (win < tp->t_ccstate->cub_last_max) { tp->t_ccstate->cub_last_max = (uint32_t)((float)win * tcp_cubic_fast_convergence_factor); } else { tp->t_ccstate->cub_last_max = win; @@ -488,48 +448,9 @@ tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th) ack = tp->snd_una; } - if (SEQ_LEQ(ack, tp->snd_max) && (!tcp_cubic_minor_fixes || tcp_flow_control_response)) { - flight_size = tp->snd_max - ack; - } else if (tcp_cubic_minor_fixes) { - /* - * Cubic Minor Fixes: snd_max - th_ack is a very very bad estimate - * of the flight size. Either the app is sending at full speed and - * flight_size *is* snd_sshtresh, or the app is not sending at full - * speed and congestion-window validation would have kicked in earlier. - * - * Except that for the latter, snd_ssthresh is way too high. - * When we exit recovery we will burst a lot of data out... - * - * So, tcp_flow_control_response brings us back to the old behavior. 
- * Too many feature-flags... - */ - flight_size = tp->snd_ssthresh; - } + VERIFY(SEQ_LEQ(ack, tp->snd_max)); + flight_size = tp->snd_max - ack; - /* - * Cubic Minor Fixes: t_lossflightsize is always 0, because of - * EXIT_FASTRECOVERY. This here is basically dead code... - */ - if (SACK_ENABLED(tp) && tp->t_lossflightsize > 0 && !tcp_cubic_minor_fixes) { - uint32_t total_rxt_size = 0, ncwnd; - /* - * When SACK is enabled, the number of retransmitted bytes - * can be counted more accurately. - */ - total_rxt_size = tcp_rxtseg_total_size(tp); - ncwnd = max(tp->t_pipeack, tp->t_lossflightsize); - if (total_rxt_size <= ncwnd) { - ncwnd = ncwnd - total_rxt_size; - } - - /* - * To avoid sending a large burst at the end of recovery - * set a max limit on ncwnd - */ - ncwnd = min(ncwnd, (tp->t_maxseg << 6)); - ncwnd = ncwnd >> 1; - flight_size = max(ncwnd, flight_size); - } /* * Complete ack. The current window was inflated for fast recovery. * It has to be deflated post recovery. @@ -547,6 +468,8 @@ tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th) tp->t_ccstate->cub_tcp_win = 0; tp->t_ccstate->cub_tcp_bytes_acked = 0; + + tcp_update_pacer_state(tp); } static void @@ -574,6 +497,8 @@ tcp_cubic_after_timeout(struct tcpcb *tp) * timeout might indicate severe congestion. */ tp->snd_cwnd = tp->t_maxseg; + + tcp_update_pacer_state(tp); } static int diff --git a/bsd/netinet/tcp_includes.h b/bsd/netinet/tcp_includes.h index 13a40368b..d801931af 100644 --- a/bsd/netinet/tcp_includes.h +++ b/bsd/netinet/tcp_includes.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 3586a3a93..b283c3b85 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -109,6 +109,7 @@ #include #include #include +#include #include #include #include @@ -145,33 +146,19 @@ #define TCP_RTT_HISTORY_EXPIRE_TIME (60 * TCP_RETRANSHZ) #define TCP_RECV_THROTTLE_WIN (5 * TCP_RETRANSHZ) -#define TCP_STRETCHACK_ENABLE_PKTCNT 2000 struct tcpstat tcpstat; -SYSCTL_SKMEM_TCP_INT(OID_AUTO, flow_control_response, - CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_flow_control_response, 1, - "Improved response to Flow-control events"); - static int log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0, "Log all incoming TCP connections"); -SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_strategy, - CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ack_strategy, TCP_ACK_STRATEGY_MODERN, - "Revised TCP ACK-strategy, avoiding stretch-ACK implementation"); - static int blackhole = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0, "Do not send RST when dropping refused connections"); -/* TODO - remove once uTCP stopped using it */ -SYSCTL_SKMEM_TCP_INT(OID_AUTO, aggressive_rcvwnd_inc, - CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_aggressive_rcvwnd_inc, 1, - "Be more aggressive about increasing the receive-window."); - SYSCTL_SKMEM_TCP_INT(OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_delack_enabled, 3, "Delay ACK to try and piggyback it onto a data packet"); @@ -224,6 +211,7 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautorcvbuf, CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_do_autorcvbuf, 1, "Enable automatic socket buffer tuning"); +/* ToDo - remove once uTCP stops using it. 
*/ SYSCTL_SKMEM_TCP_INT(OID_AUTO, autotunereorder, CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autotune_reorder, 1, "Enable automatic socket buffer tuning even when reordering is present"); @@ -241,15 +229,6 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, challengeack_limit, CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_challengeack_limit, 10, "Maximum number of challenge ACKs per connection per second"); -/* TO BE REMOVED */ -SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961, - CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1, - "Enable/Disable full RFC 5961 compliance"); - -SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_better_lr, - CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_better_lr, 1, - "Improved TCP Loss Recovery"); - SYSCTL_SKMEM_TCP_INT(OID_AUTO, use_min_curr_rtt, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_use_min_curr_rtt, 1, "Use a min of k=4 RTT samples for congestion controllers"); @@ -258,6 +237,11 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, awdl_rtobase, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_awdl_rtobase, 100, "Initial RTO for AWDL interface"); +int tcp_syncookie = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookie, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_syncookie, 1, + "0: disable, 1: Use SYN cookies when backlog is full, 2: Always use SYN cookies"); + extern int tcp_acc_iaj_high; extern int tcp_acc_iaj_react_limit; extern int tcp_fin_timeout; @@ -265,11 +249,6 @@ extern int tcp_fin_timeout; uint8_t tcprexmtthresh = 3; uint32_t tcp_now; -struct timeval tcp_uptime; /* uptime when tcp_now was last updated */ - -/* Used to sychronize updates to tcp_now */ -static LCK_GRP_DECLARE(tcp_uptime_mtx_grp, "tcpuptime"); -LCK_SPIN_DECLARE(tcp_uptime_lock, &tcp_uptime_mtx_grp); struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ @@ -282,7 +261,6 @@ static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq); static inline unsigned int tcp_maxmtu(struct rtentry *); -static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags); static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int); #if TRAFFIC_MGT @@ -324,8 +302,8 @@ static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, #define log_in_vain_log( a ) { log a; } -int tcp_rcvunackwin = TCPTV_UNACKWIN; -int tcp_maxrcvidle = TCPTV_MAXRCVIDLE; +/* ToDo - to be removed once uTCP stops using it */ +#define TCP_RCV_SS_PKTCOUNT 512 SYSCTL_SKMEM_TCP_INT(OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_rcvsspktcnt, TCP_RCV_SS_PKTCOUNT, "packets to be seen before receiver stretches acks"); @@ -574,9 +552,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, /* * If the reassembly queue already has entries or if we are going * to add a new one, then the connection has reached a loss state. - * Reset the stretch-ack algorithm at this point. + * Reset the force-ACK counter at this point. 
*/ - tcp_reset_stretch_ack(tp); tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; #if TRAFFIC_MGT @@ -587,9 +564,18 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, if (th->th_seq != tp->rcv_nxt) { struct mbuf *tmp = m; + + if (tcp_memacct_softlimit()) { + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_REASS_MEMORY_PRESSURE, NULL, 0); + tcp_reass_overflows++; + tcpstat.tcps_rcvmemdrop++; + *tlenp = 0; + return 0; + } + while (tmp != NULL) { if (mbuf_class_under_pressure(tmp)) { - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON__TCP_REASS_MEMORY_PRESSURE, NULL, 0); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_REASS_MEMORY_PRESSURE, NULL, 0); tcp_reass_overflows++; tcpstat.tcps_rcvmemdrop++; *tlenp = 0; @@ -618,10 +604,15 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, return 0; } - /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ - te = tcp_reass_qent_alloc(); - tp->t_reassqlen++; - OSIncrementAtomic(&tcp_reass_total_qlen); + /* Create a new queue entry. If we can't, just drop the pkt. */ + te = tcp_create_reass_qent(tp, m, th, *tlenp); + if (te == NULL) { + m_drop_list(m, NULL, + DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, + DROP_REASON_TCP_REASSEMBLY_ALLOC, NULL, 0); + *tlenp = 0; + return 0; + } /* * Find a segment which begins after this one does. @@ -645,7 +636,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, if (i > 0) { if (i > 1) { /* - * Note duplicate data sequnce numbers + * Note duplicate data sequence numbers * to report in DSACK option */ tp->t_dsack_lseq = th->th_seq; @@ -659,24 +650,20 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, dsack_set = TRUE; } if (i >= *tlenp) { + struct mbuf *tmp; + tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; if (nstat_collect) { nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_DUPLICATE); - INP_ADD_STAT(inp, ifnet_count_type, - rxpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, - rxbytes, *tlenp); + INP_ADD_RXSTAT(inp, ifnet_count_type, 1, *tlenp); tp->t_stat.rxduplicatebytes += *tlenp; - inp_set_activity_bitmap(inp); } - m_freem(m); - tcp_reass_qent_free(te); + tmp = tcp_destroy_reass_qent(tp, te); + m_freem(tmp); te = NULL; - tp->t_reassqlen--; - OSDecrementAtomic(&tcp_reass_total_qlen); /* * Try to present any queued data * at the left window edge to the user. @@ -687,6 +674,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, } m_adj(m, i); *tlenp -= i; + te->tqe_len -= i; th->th_seq += i; } } @@ -703,9 +691,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, if (nstat_collect) { nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_OUT_OF_ORDER); - INP_ADD_STAT(inp, ifnet_count_type, rxpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, rxbytes, *tlenp); - inp_set_activity_bitmap(inp); + INP_ADD_RXSTAT(inp, ifnet_count_type, 1, *tlenp); } /* @@ -713,6 +699,8 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, * if they are completely covered, dequeue them. */ while (q) { + struct mbuf *tmp; + int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; if (i <= 0) { break; @@ -747,23 +735,16 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, } nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); - tp->t_reassq_mbcnt -= _MSIZE + (q->tqe_m->m_flags & M_EXT) ? 
- q->tqe_m->m_ext.ext_size : 0; - m_freem(q->tqe_m); - tcp_reass_qent_free(q); - tp->t_reassqlen--; - OSDecrementAtomic(&tcp_reass_total_qlen); + + tmp = tcp_destroy_reass_qent(tp, q); + + m_freem(tmp); q = nq; } /* Insert the new segment queue entry into place. */ - te->tqe_m = m; - te->tqe_th = th; - te->tqe_len = *tlenp; - - tp->t_reassq_mbcnt += _MSIZE + (m->m_flags & M_EXT) ? m->m_ext.ext_size : 0; - if (p == NULL) { LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { @@ -797,28 +778,30 @@ present: tcpstat.tcps_recovered_pkts++; do { + uint8_t psh = q->tqe_th->th_flags & TH_PUSH; + struct mbuf *tmp; + tp->rcv_nxt += q->tqe_len; flags = q->tqe_th->th_flags & TH_FIN; + LIST_REMOVE(q, tqe_q); - tp->t_reassq_mbcnt -= _MSIZE + (q->tqe_m->m_flags & M_EXT) ? - q->tqe_m->m_ext.ext_size : 0; + + tmp = tcp_destroy_reass_qent(tp, q); + if (so->so_state & SS_CANTRCVMORE) { - m_freem(q->tqe_m); + m_freem(tmp); } else { - so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */ - if (q->tqe_th->th_flags & TH_PUSH) { + so_recv_data_stat(so, tmp, 0); /* XXXX */ + if (psh) { tp->t_flagsext |= TF_LAST_IS_PSH; } else { tp->t_flagsext &= ~TF_LAST_IS_PSH; } - if (sbappendstream_rcvdemux(so, q->tqe_m)) { + if (sbappendstream_rcvdemux(so, tmp)) { *dowakeup = 1; } } - tcp_reass_qent_free(q); - tp->t_reassqlen--; - OSDecrementAtomic(&tcp_reass_total_qlen); q = LIST_FIRST(&tp->t_segq); } while (q && q->tqe_th->th_seq == tp->rcv_nxt); tp->t_flagsext &= ~TF_REASS_INPROG; @@ -840,6 +823,21 @@ present: return flags; } +/* + * Reduce congestion window when local AQM sends + * congestion event. We don't enter FAST_RECOVERY here + * as there is no packet loss. + */ +void +tcp_local_congestion_notification(struct tcpcb *tp) +{ + if (CC_ALGO(tp)->pre_fr != NULL) { + CC_ALGO(tp)->pre_fr(tp); + } + + tp->snd_cwnd = tp->snd_ssthresh; +} + /* * Enter fast recovery and reduce congestion window, * used when CE is seen or when a tail loss @@ -897,7 +895,7 @@ tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen) (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY))) && tlen > 0 && tp->t_state == TCPS_ESTABLISHED) { - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, (TCP_REXMTVAL(tp) << 1)); tp->t_flagsext |= TF_DETECT_READSTALL; tp->t_rtimo_probes = 0; @@ -907,7 +905,7 @@ tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen) inline void tcp_keepalive_reset(struct tcpcb *tp) { - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_CONN_KEEPIDLE(tp)); tp->t_flagsext &= ~(TF_DETECT_READSTALL); tp->t_rtimo_probes = 0; @@ -926,9 +924,9 @@ tcp_set_finwait_timeout(struct tcpcb *tp) if (tcp_fin_timeout > 0 && tcp_fin_timeout < TCP_CONN_MAXIDLE(tp)) { - tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, tcp_fin_timeout); + tp->t_timer[TCPT_2MSL] = tcp_offset_from_start(tp, tcp_fin_timeout); } else { - tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, TCP_CONN_MAXIDLE(tp)); + tp->t_timer[TCPT_2MSL] = tcp_offset_from_start(tp, TCP_CONN_MAXIDLE(tp)); } } @@ -1016,13 +1014,14 @@ tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, * - the high water mark already reached the maximum * - the stream is in background and receive side is being * throttled + * - we are memory-limited */ if (tcp_do_autorcvbuf == 0 || (sbrcv->sb_flags & SB_AUTOSIZE) == 0 || sbrcv->sb_hiwat >= tcp_autorcvbuf_max || (tp->t_flagsext & TF_RECV_THROTTLE) || (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) || - (!tcp_autotune_reorder && !LIST_EMPTY(&tp->t_segq))) { + (tcp_memacct_limited() && 
sbrcv->sb_hiwat >= tcp_recvspace)) { /* Can not resize the socket buffer, just return */ goto out; } @@ -1223,92 +1222,6 @@ tcp_sbrcv_tstmp_check(struct tcpcb *tp) } } -/* A receiver will evaluate the flow of packets on a connection - * to see if it can reduce ack traffic. The receiver will start - * stretching acks if all of the following conditions are met: - * 1. tcp_delack_enabled is set to 3 - * 2. If the bytes received in the last 100ms is greater than a threshold - * defined by maxseg_unacked - * 3. If the connection has not been idle for tcp_maxrcvidle period. - * 4. If the connection has seen enough packets to let the slow-start - * finish after connection establishment or after some packet loss. - * - * The receiver will stop stretching acks if there is congestion/reordering - * as indicated by packets on reassembly queue or an ECN. If the delayed-ack - * timer fires while stretching acks, it means that the packet flow has gone - * below the threshold defined by maxseg_unacked and the receiver will stop - * stretching acks. The receiver gets no indication when slow-start is completed - * or when the connection reaches an idle state. That is why we use - * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle - * state. - */ -static inline int -tcp_stretch_ack_enable(struct tcpcb *tp, int thflags) -{ - if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) && - TSTMP_GEQ(tp->rcv_unackwin, tcp_now)) { - tp->t_flags |= TF_STREAMING_ON; - } else { - tp->t_flags &= ~TF_STREAMING_ON; - } - - /* If there has been an idle time, reset streaming detection */ - if (TSTMP_GT(tcp_now, tp->rcv_unackwin + tcp_maxrcvidle)) { - tp->t_flags &= ~TF_STREAMING_ON; - } - - /* - * If there are flags other than TH_ACK set, reset streaming - * detection - */ - if (thflags & ~TH_ACK) { - tp->t_flags &= ~TF_STREAMING_ON; - } - - if (tp->t_flagsext & TF_DISABLE_STRETCHACK) { - if (tp->rcv_nostrack_pkts >= TCP_STRETCHACK_ENABLE_PKTCNT) { - tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; - tp->rcv_nostrack_pkts = 0; - tp->rcv_nostrack_ts = 0; - } else { - tp->rcv_nostrack_pkts++; - } - } - - if (!(tp->t_flagsext & (TF_NOSTRETCHACK | TF_DISABLE_STRETCHACK)) && - (tp->t_flags & TF_STREAMING_ON) && - (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) || - (tp->rcv_waitforss >= tcp_rcvsspktcnt))) { - return 1; - } - - return 0; -} - -/* - * Reset the state related to stretch-ack algorithm. This will make - * the receiver generate an ack every other packet. The receiver - * will start re-evaluating the rate at which packets come to decide - * if it can benefit by lowering the ack traffic. - */ -void -tcp_reset_stretch_ack(struct tcpcb *tp) -{ - tp->t_flags &= ~(TF_STRETCHACK | TF_STREAMING_ON); - tp->rcv_by_unackwin = 0; - tp->rcv_by_unackhalfwin = 0; - tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; - - /* - * When there is packet loss or packet re-ordering or CWR due to - * ECN, the sender's congestion window is reduced. In these states, - * generate an ack for every other packet for some time to allow - * the sender's congestion window to grow. - */ - tp->t_flagsext |= TF_RCVUNACK_WAITSS; - tp->rcv_waitforss = 0; -} - /* * The last packet was a retransmission, check if this ack * indicates that the retransmission was spurious. 
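/*
 * The hunk above deletes the receiver-side stretch-ACK machinery
 * (tcp_stretch_ack_enable(), tcp_reset_stretch_ack() and the
 * TF_STREAMING_ON bookkeeping).  For reference, below is a minimal
 * user-space model of the eligibility test being dropped: ACKs were
 * stretched only while the receiver kept seeing several full segments
 * per unack window, stretching had not been force-disabled, and enough
 * packets had arrived for slow-start to have finished.  The struct and
 * helper names are illustrative stand-ins, not the kernel code; the
 * 8-segment and 512-packet thresholds echo the maxseg_unacked-style
 * defaults and TCP_RCV_SS_PKTCOUNT seen in the hunks above, but are
 * assumptions in this sketch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct rcv_model {
	uint32_t bytes_in_unackwin;  /* bytes received in the current unack window */
	uint32_t unackwin_deadline;  /* timestamp at which that window expires */
	uint32_t pkts_since_start;   /* packets seen since connect or last loss */
	bool     stretch_disabled;   /* models TF_DISABLE_STRETCHACK / TF_NOSTRETCHACK */
};

static bool
stretch_ack_eligible(const struct rcv_model *r, uint32_t now, uint32_t maxseg,
    uint32_t maxseg_unacked, uint32_t rcvsspktcnt)
{
	/* "streaming" means a window's worth of back-to-back full segments */
	bool streaming = r->bytes_in_unackwin >= maxseg_unacked * maxseg &&
	    now <= r->unackwin_deadline;

	if (r->stretch_disabled || !streaming) {
		return false;
	}
	/* wait out slow-start so the sender's cwnd can still grow quickly */
	return r->pkts_since_start >= rcvsspktcnt;
}

int
main(void)
{
	struct rcv_model r = {
		.bytes_in_unackwin = 16 * 1448,
		.unackwin_deadline = 1000,
		.pkts_since_start = 600,
		.stretch_disabled = false,
	};

	printf("stretch acks: %s\n",
	    stretch_ack_eligible(&r, 900, 1448, 8, 512) ? "yes" : "no");
	return 0;
}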
@@ -1329,7 +1242,7 @@ tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th, bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); /* If the ack has ECN CE bit, then cwnd has to be adjusted */ - if ((TCP_ACC_ECN_ON(tp) && tp->t_aecn.t_delta_ce_packets > 0) || + if ((tp->accurate_ecn_on && tp->t_aecn.t_delta_ce_packets > 0) || (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE))) { return 0; } @@ -1687,7 +1600,7 @@ tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen) * We send the probe out rather quickly (after one RTO). It does not * really hurt that much, it's only one additional segment on the wire. */ - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp))); + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, (TCP_REXMTVAL(tp))); } static void @@ -1710,7 +1623,7 @@ tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th) } else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) { /* There is a hole! Wait a bit for data... */ tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA; - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_REXMTVAL(tp)); } } @@ -1762,6 +1675,9 @@ tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup) static void tcp_update_snd_una(struct tcpcb *tp, uint32_t ack) { + uint32_t delta = ack - tp->snd_una; + + tp->t_stat.bytes_acked += delta; tp->snd_una = ack; } @@ -1779,7 +1695,7 @@ tcp_syn_data_valid(struct tcpcb *tp, struct tcphdr *tcp_hdr, int tlen) } /* We could have wrapped around, check that */ - if (tp->t_inpcb->inp_stat->rxbytes > INT32_MAX) { + if (tp->t_inpcb->inp_mstat.ms_total.ts_rxbytes > INT32_MAX) { return false; } @@ -1829,7 +1745,7 @@ tcp_input_process_accecn_syn(struct tcpcb *tp, int ace_flags, uint8_t ip_ecn) break; case (TH_ACE): /* Accurate ECN */ - if (TCP_L4S_ENABLED(tp)) { + if (tp->l4s_enabled) { switch (ip_ecn) { case IPTOS_ECN_NOTECT: tp->ecn_flags |= TE_ACE_SETUP_NON_ECT; @@ -1864,7 +1780,7 @@ tcp_input_process_accecn_syn(struct tcpcb *tp, int ace_flags, uint8_t ip_ecn) default: /* Forward Compatibility */ /* Accurate ECN */ - if (TCP_L4S_ENABLED(tp)) { + if (tp->l4s_enabled) { switch (ip_ecn) { case IPTOS_ECN_NOTECT: tp->ecn_flags |= TE_ACE_SETUP_NON_ECT; @@ -1894,6 +1810,274 @@ tcp_input_process_accecn_syn(struct tcpcb *tp, int ace_flags, uint8_t ip_ecn) } } +/* Process SYN/ACK packet that wishes to negotiate Accurate ECN */ +static void +tcp_input_process_accecn_synack(struct tcpcb *tp, struct inpcb *inp, struct tcpopt *to, + int thflags, int ace_flags, uint8_t ip_ecn, uint32_t tlen, uint32_t segment_count) +{ + if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) { + /* Receiving Any|0|1 is classic ECN-setup SYN-ACK */ + tp->ecn_flags |= TE_SETUPRECEIVED; + if (TCP_ECN_ENABLED(tp)) { + tcp_heuristic_ecn_success(tp); + tcpstat.tcps_ecn_client_success++; + } + + if (tp->ecn_flags & TE_ACE_SETUPSENT) { + /* + * Sent AccECN SYN but received classic ECN SYN-ACK + * Set classic ECN related flags + */ + tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT); + tp->ecn_flags &= ~TE_ACE_SETUPSENT; + if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { + tp->t_client_accecn_state = tcp_connection_client_classic_ecn_available; + } + } + } else if (tp->l4s_enabled && ace_flags != 0 && + ace_flags != TH_ACE) { + /* Initialize sender side packet & byte counters */ + tp->t_aecn.t_snd_ce_packets = 5; + tp->t_aecn.t_snd_ect1_bytes = tp->t_aecn.t_snd_ect0_bytes = 1; + tp->t_aecn.t_snd_ce_bytes = 0; + tp->ecn_flags |= TE_ACE_FINAL_ACK_3WHS; + /* + * Client received AccECN SYN-ACK that 
reflects the state (ECN) + * in which SYN packet was delivered. This helps to detect if + * there was mangling of the SYN packet on the path. Currently, we + * only send Not-ECT on SYN packets. So, we should set Not-ECT in + * all packets if we receive any encoding other than 0|TH_CWR|0. + * If 0|0|0 and 1|1|1 were received, fail Accurate ECN negotiation + * by not setting TE_ACE_SETUPRECEIVED. + */ + uint32_t ecn_flags = TE_ACE_SETUPRECEIVED; + if (tp->l4s_enabled) { + ecn_flags |= TE_SENDIPECT; + } + switch (ace_flags) { + case (0 | TH_CWR | 0): + /* Non-ECT SYN was delivered */ + tp->ecn_flags |= ecn_flags; + tcpstat.tcps_ecn_ace_syn_not_ect++; + tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success; + break; + case (0 | TH_CWR | TH_ECE): + /* ECT1 SYN was delivered */ + tp->ecn_flags |= ecn_flags; + /* Mangling detected, set Non-ECT on outgoing packets */ + tp->ecn_flags &= ~TE_SENDIPECT; + tcpstat.tcps_ecn_ace_syn_ect1++; + tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; + break; + case (TH_AE | 0 | 0): + /* ECT0 SYN was delivered */ + tp->ecn_flags |= ecn_flags; + /* Mangling detected, set Non-ECT on outgoing packets */ + tp->ecn_flags &= ~TE_SENDIPECT; + tcpstat.tcps_ecn_ace_syn_ect0++; + tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; + break; + case (TH_AE | TH_CWR | 0): + /* CE SYN was delivered */ + tp->ecn_flags |= ecn_flags; + /* Mangling detected, set Non-ECT on outgoing packets */ + tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; + tp->ecn_flags &= ~TE_SENDIPECT; + /* + * Although we don't send ECT SYN yet, it is possible that + * a network element changed Not-ECT to ECT and later there + * was congestion at another network element that set it to CE. + * To keep it simple, we will consider this as a congestion event + * for the congestion controller. + * If a TCP client in AccECN mode receives CE feedback in the TCP + * flags of a SYN/ACK, it MUST NOT increment s.cep. + */ + tp->snd_cwnd = 2 * tp->t_maxseg; + tcpstat.tcps_ecn_ace_syn_ce++; + break; + default: + break; + } + /* Set Accurate ECN state for client */ + tcp_set_accurate_ecn(tp); + + if (TCP_ECN_ENABLED(tp)) { + tcp_heuristic_ecn_success(tp); + tcpstat.tcps_ecn_client_success++; + } + /* + * A TCP client in AccECN mode MUST feed back which of the 4 + * possible values of the IP-ECN field that was received in the + * SYN/ACK. Set the setup flag for final ACK accordingly. + * We will initialize r.cep, r.e1b, r.e0b first and then increment + * if CE was set on the IP-ECN field of the SYN-ACK. 
+ */ + tp->t_aecn.t_rcv_ce_packets = 5; + tp->t_aecn.t_rcv_ect0_bytes = tp->t_aecn.t_rcv_ect1_bytes = 1; + tp->t_aecn.t_rcv_ce_bytes = 0; + + /* Increment packet & byte counters based on IP-ECN */ + tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn); + switch (ip_ecn) { + case IPTOS_ECN_NOTECT: + /* Not-ECT SYN-ACK was received */ + tp->ecn_flags |= TE_ACE_SETUP_NON_ECT; + break; + case IPTOS_ECN_ECT1: + /* ECT1 SYN-ACK was received */ + tp->ecn_flags |= TE_ACE_SETUP_ECT1; + break; + case IPTOS_ECN_ECT0: + /* ECT0 SYN-ACK was received */ + tp->ecn_flags |= TE_ACE_SETUP_ECT0; + break; + case IPTOS_ECN_CE: + tp->ecn_flags |= TE_ACE_SETUP_CE; + break; + } + /* Update the time for this newly SYN-ACK packet */ + if ((to->to_flags & TOF_TS) != 0 && (to->to_tsecr != 0) && + (tp->t_last_ack_tsecr == 0 || TSTMP_GEQ(to->to_tsecr, tp->t_last_ack_tsecr))) { + tp->t_last_ack_tsecr = to->to_tsecr; + } + } else { + if ((tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) && + tp->t_rxtshift == 0) { + tcp_heuristic_ecn_success(tp); + tcpstat.tcps_ecn_not_supported++; + } + if (((tp->ecn_flags & TE_SETUPSENT) != 0 && tp->t_rxtshift == 1) || + ((tp->ecn_flags & TE_ACE_SETUPSENT) != 0 && tp->t_rxtshift == 2)) { + /* + * We keep heuristics for when SYN ECN was likely dropped at the network by + * checking that we received an ACK for the subsequent retransmission without ECN + */ + tcp_heuristic_ecn_loss(tp); + } + + /* non-ECN-setup SYN-ACK */ + tp->ecn_flags &= ~TE_SENDIPECT; + /* + * If Accurate ECN SYN was retransmitted twice and non-ECN SYN-ACK + * was received, then we consider it as Accurate ECN blackholing + */ + if ((tp->ecn_flags & TE_LOST_SYN) && tp->t_rxtshift <= 2 && + tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { + tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_blackholed; + } + /* + * If SYN wasn't retransmitted twice yet, the server supports neither classic nor + * accurate ECN SYN-ACK. Accurate ECN should already be disabled for both half connections + * as TE_ACE_SETUPRECEIVED flag is not set. + */ + if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { + tp->t_client_accecn_state = tcp_connection_client_ecn_not_available; + } + } +} + +static void +tcp_input_process_accecn_last_ack(struct tcpcb *tp, struct tcpopt *to, + uint32_t tlen, uint16_t ace_flags, bool syn_cookie_processed) +{ + if (syn_cookie_processed) { + /* Set AccECN and L4S flags as if these were negotiated successfully. */ + if (tp->l4s_enabled) { + tp->ecn_flags |= (TE_ACC_ECN_ON | TE_SENDIPECT); + tcp_set_accurate_ecn(tp); + } + tp->t_aecn.t_rcv_ce_packets = 5; + tp->t_aecn.t_snd_ce_packets = 5; + /* Initialize CE byte counter to 0 */ + tp->t_aecn.t_rcv_ce_bytes = tp->t_aecn.t_snd_ce_bytes = 0; + /* Initialize ECT byte counter to 1 to distinguish zeroing of options */ + tp->t_aecn.t_rcv_ect1_bytes = tp->t_aecn.t_rcv_ect0_bytes = 1; + tp->t_aecn.t_snd_ect1_bytes = tp->t_aecn.t_snd_ect0_bytes = 1; + } + if (tlen == 0 && to->to_nsacks == 0) { + /* + * ACK for SYN-ACK reflects the state (ECN) in which SYN-ACK packet + * was delivered. Use Table 4 of Accurate ECN draft to decode only + * when a pure ACK with no SACK block is received. + * 0|0|0 will fail Accurate ECN negotiation and disable ECN. 
+ */ + switch (ace_flags) { + case (0 | TH_CWR | 0): + /* Non-ECT SYN-ACK was delivered */ + tp->t_aecn.t_snd_ce_packets = 5; + if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested || syn_cookie_processed) { + tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success; + } + break; + case (0 | TH_CWR | TH_ECE): + /* ECT1 SYN-ACK was delivered, mangling detected */ + OS_FALLTHROUGH; + case (TH_AE | 0 | 0): + /* ECT0 SYN-ACK was delivered, mangling detected */ + tp->t_aecn.t_snd_ce_packets = 5; + if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested || syn_cookie_processed) { + tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success_ect_mangling_detected; + } + break; + case (TH_AE | TH_CWR | 0): + /* + * CE SYN-ACK was delivered, even though mangling happened, + * CE could indicate congestion at a node after mangling occured. + * Set cwnd to 2 segments + */ + tp->t_aecn.t_snd_ce_packets = 6; + tp->snd_cwnd = 2 * tp->t_maxseg; + if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested || syn_cookie_processed) { + tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success_ect_mangling_detected; + } + break; + case (0 | 0 | 0): + /* Disable ECN, as ACE fields were zeroed */ + tp->ecn_flags &= ~(TE_SETUPRECEIVED | TE_SENDIPECT | + TE_SENDCWR | TE_ACE_SETUPRECEIVED); + tcp_set_accurate_ecn(tp); + /* + * Since last ACK has no ECN flag set and TE_LOST_SYNACK is set, this is in response + * to the second (non-ECN setup) SYN-ACK retransmission. In such a case, we assume + * that AccECN SYN-ACK was blackholed. + */ + if ((tp->ecn_flags & TE_LOST_SYNACK) && tp->t_rxtshift <= 2 && + (tp->t_server_accecn_state == tcp_connection_server_classic_ecn_requested || + tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested || + syn_cookie_processed)) { + tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_blackholed; + } + /* + * SYN-ACK hasn't been retransmitted twice yet, so this could likely mean bleaching of ACE + * on the path from client to server on last ACK. + */ + if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { + tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_ace_bleaching_detected; + } + break; + default: + /* Unused values for forward compatibility */ + tp->t_aecn.t_snd_ce_packets = 5; + break; + } + /* Update the time for this newly received last ACK */ + if ((to->to_flags & TOF_TS) != 0 && (to->to_tsecr != 0) && + (tp->t_last_ack_tsecr == 0 || TSTMP_GEQ(to->to_tsecr, tp->t_last_ack_tsecr))) { + tp->t_last_ack_tsecr = to->to_tsecr; + } + } else if (to->to_nsacks == 0) { + /* + * If 3rd ACK is lost, we won't receive the last ACK + * encoding. We will move the server to AccECN mode + * regardless. 
+ */ + tp->t_aecn.t_snd_ce_packets = 5; + if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested || syn_cookie_processed) { + tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success; + } + } +} + static uint32_t tcp_process_ace_field(struct tcpcb *tp, uint32_t pkts_acked, uint64_t old_sceb, uint8_t ace) { @@ -2012,25 +2196,575 @@ tcp_process_accecn(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, tp->t_aecn.accecn_processed = 1; } +static void +tcp_ece_aggressive_heur(struct tcpcb *tp, uint32_t pkts_acked) +{ + if (tp->ecn_flags & TE_ECEHEURI_SET) { + /* ECN heuristic already determined */ + return; + } + + tp->t_ecn_recv_ece_pkt += pkts_acked; + + if (tp->t_ecn_capable_packets_acked < ECN_MIN_CE_PROBES) { + /* Still in learning phase - insufficient probe data */ + return; + } + + if (tp->t_ecn_recv_ece_pkt > ECN_MAX_CE_RATIO) { + /* Excessive congestion detected - disable ECN */ + tcp_heuristic_ecn_aggressive(tp); + tp->ecn_flags |= TE_ECEHEURI_SET; + tp->ecn_flags &= ~TE_SENDIPECT; /* Disable ECT for future packets */ + } else { + /* Path is suitable for ECN */ + tp->ecn_flags |= TE_ECEHEURI_SET; + } +} +/* + * Process SYN from clients and create a new connecting socket + * from the listener socket. If the listen queue exceeds a certain + * threshold, then generate a SYN cookie instead. + * + * When SYN cookie is used, this function is also called when + * we receive last ACK from the client to create a new connecting + * socket. + */ + +bool +tcp_create_server_socket(struct tcp_inp *tpi, struct socket **so2, + bool *syn_cookie_sent, int *dropsocket) +{ +#define TCP_LOG_HDR (tpi->isipv6 ? (void *)tpi->ip6 : (void *)tpi->ip) + + struct socket *so = tpi->so; + struct tcpcb *otp = *tpi->tp; + struct inpcb *oinp = sotoinpcb(so); + struct tcphdr *th = tpi->th; + struct sockaddr_storage from; + struct sockaddr_storage to2; + struct tcpcb *tp; + struct inpcb *inp; + struct ifnet *head_ifscope; + bool head_nocell, head_recvanyif, + head_noexpensive, head_awdl_unrestricted, + head_intcoproc_allowed, head_external_port, + head_noconstrained, head_management_allowed, + head_ultra_constrained_allowed; + boolean_t check_cfil = cfil_filter_present(); + + /* Get listener's bound-to-interface, if any */ + // TODO check that oinp is same as inp set in tcp_input + head_ifscope = (oinp->inp_flags & INP_BOUND_IF) ? 
+ oinp->inp_boundifp : NULL; + /* Get listener's no-cellular information, if any */ + head_nocell = INP_NO_CELLULAR(oinp); + /* Get listener's recv-any-interface, if any */ + head_recvanyif = (oinp->inp_flags & INP_RECV_ANYIF); + /* Get listener's no-expensive information, if any */ + head_noexpensive = INP_NO_EXPENSIVE(oinp); + head_noconstrained = INP_NO_CONSTRAINED(oinp); + head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(oinp); + head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(oinp); + head_external_port = (oinp->inp_flags2 & INP2_EXTERNAL_PORT); + head_management_allowed = INP_MANAGEMENT_ALLOWED(oinp); + head_ultra_constrained_allowed = INP_ULTRA_CONSTRAINED_ALLOWED(oinp); + + if (so->so_filt || check_cfil || TCP_SYN_COOKIE_ENABLED(otp)) { + if (tpi->isipv6) { + struct sockaddr_in6 *sin6 = SIN6(&from); + + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = th->th_sport; + sin6->sin6_flowinfo = 0; + sin6->sin6_addr = tpi->ip6->ip6_src; + sin6->sin6_scope_id = 0; + + sin6 = SIN6(&to2); + + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = th->th_dport; + sin6->sin6_flowinfo = 0; + sin6->sin6_addr = tpi->ip6->ip6_dst; + sin6->sin6_scope_id = 0; + } else { + struct sockaddr_in *sin = SIN(&from); + + sin->sin_len = sizeof(*sin); + sin->sin_family = AF_INET; + sin->sin_port = th->th_sport; + sin->sin_addr = tpi->ip->ip_src; + + sin = SIN(&to2); + + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + sin->sin_port = th->th_dport; + sin->sin_addr = tpi->ip->ip_dst; + } + } + + if (so->so_filt) { + *so2 = sonewconn(so, 0, SA(&from)); + } else { + if (tcp_can_send_syncookie(so, otp, th->th_flags)) { + ASSERT(tpi->to != NULL); + + tcp_dooptions(otp, tpi->optp, tpi->optlen, th, tpi->to); + tcp_syncookie_syn(tpi, SA(&to2), SA(&from)); + if (syn_cookie_sent) { + *syn_cookie_sent = true; + } + /* Release reference and unlock listener socket */ + socket_unlock(so, 1); + /* + * In case of SYN cookies, we don't allocate connected + * socket yet, return success. + */ + return true; + } else { + *so2 = sonewconn(so, 0, NULL); + } + } + if (*so2 == 0) { + tcpstat.tcps_listendrop++; + if (tcp_dropdropablreq(so)) { + if (so->so_filt) { + *so2 = sonewconn(so, 0, SA(&from)); + } else { + if (tcp_can_send_syncookie(so, otp, th->th_flags)) { + ASSERT(tpi->to != NULL); + tcp_dooptions(otp, tpi->optp, tpi->optlen, th, tpi->to); + tcp_syncookie_syn(tpi, SA(&to2), SA(&from)); + if (syn_cookie_sent) { + *syn_cookie_sent = true; + } + /* Release reference and unlock listener socket */ + socket_unlock(so, 1); + + return true; + } else { + *so2 = sonewconn(so, 0, NULL); + } + } + } + if (*so2 == 0) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, otp, false, " listen drop"); + goto drop; + } + } + + /* Point "inp" and "tp" in tandem to new socket */ + *(tpi->inp) = inp = (struct inpcb *)(*so2)->so_pcb; + *(tpi->tp) = tp = intotcpcb(inp); + + socket_unlock(so, 0); /* Unlock but keep a reference on listener for now */ + + socket_lock(*so2, 1); + + /* + * Mark socket as temporary until we're + * committed to keeping it. The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. + * There are some error conditions in which we + * have to drop the temporary socket. 
+ */ + (*dropsocket)++; + + /* + * Inherit INP_BOUND_IF from listener; testing if + * head_ifscope is non-NULL is sufficient, since it + * can only be set to a non-zero value earlier if + * the listener has such a flag set. + */ + if (head_ifscope != NULL) { + inp->inp_flags |= INP_BOUND_IF; + inp->inp_boundifp = head_ifscope; + } else { + inp->inp_flags &= ~INP_BOUND_IF; + } + /* + * Inherit restrictions from listener. + */ + if (head_nocell) { + inp_set_nocellular(inp); + } + if (head_noexpensive) { + inp_set_noexpensive(inp); + } + if (head_noconstrained) { + inp_set_noconstrained(inp); + } + if (head_awdl_unrestricted) { + inp_set_awdl_unrestricted(inp); + } + if (head_intcoproc_allowed) { + inp_set_intcoproc_allowed(inp); + } + if (head_management_allowed) { + inp_set_management_allowed(inp); + } + if (head_ultra_constrained_allowed) { + inp_set_ultra_constrained_allowed(inp); + } + /* + * Inherit {IN,IN6}_RECV_ANYIF from listener. + */ + if (head_recvanyif) { + inp->inp_flags |= INP_RECV_ANYIF; + } else { + inp->inp_flags &= ~INP_RECV_ANYIF; + } + + if (head_external_port) { + inp->inp_flags2 |= INP2_EXTERNAL_PORT; + } + if (tpi->isipv6) { + inp->in6p_laddr = tpi->ip6->ip6_dst; + inp->inp_lifscope = in6_addr2scopeid(tpi->ifp, &inp->in6p_laddr); + in6_verify_ifscope(&tpi->ip6->ip6_dst, inp->inp_lifscope); + } else { + inp->inp_vflag &= ~INP_IPV6; + inp->inp_vflag |= INP_IPV4; + inp->inp_laddr = tpi->ip->ip_dst; + } + inp->inp_lport = th->th_dport; + if (in_pcbinshash(inp, SA(&from), 0) != 0) { + /* + * Undo the assignments above if we failed to + * put the PCB on the hash lists. + */ + if (tpi->isipv6) { + inp->in6p_laddr = in6addr_any; + inp->inp_lifscope = IFSCOPE_NONE; + } else { + inp->inp_laddr.s_addr = INADDR_ANY; + } +#if SKYWALK + netns_release(&inp->inp_netns_token); +#endif /* SKYWALK */ + inp->inp_lport = 0; + socket_lock(so, 0); /* release ref on parent */ + socket_unlock(so, 1); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " in_pcbinshash failed"); + goto drop; + } + socket_lock(so, 0); + if (tpi->isipv6) { + /* + * Inherit socket options from the listening + * socket. + * Note that in6p_inputopts are not (even + * should not be) copied, since it stores + * previously received options and is used to + * detect if each new option is different than + * the previous one and hence should be passed + * to a user. + * If we copied in6p_inputopts, a user would + * not be able to receive options just after + * calling the accept system call. + */ + inp->inp_flags |= + oinp->inp_flags & INP_CONTROLOPTS; + if (oinp->in6p_outputopts) { + inp->in6p_outputopts = + ip6_copypktopts(oinp->in6p_outputopts, Z_NOWAIT); + } + } else { + inp->inp_options = ip_srcroute(); + inp->inp_ip_tos = oinp->inp_ip_tos; + } +#if IPSEC + /* copy old policy into new socket's */ + if (sotoinpcb(so)->inp_sp) { + int error = 0; + /* Is it a security hole here to silently fail to copy the policy? 
*/ + if (inp->inp_sp == NULL) { + error = ipsec_init_policy(*so2, &inp->inp_sp); + } + if (error != 0 || ipsec_copy_policy(sotoinpcb(so)->inp_sp, inp->inp_sp)) { + printf("tcp_input: could not copy policy\n"); + } + } +#endif + /* inherit states from the listener */ + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_LISTEN); + TCP_LOG_STATE(tp, TCPS_LISTEN); + tp->t_state = TCPS_LISTEN; + tp->t_flags |= otp->t_flags & (TF_NOPUSH | TF_NOOPT | TF_NODELAY); + tp->t_flagsext |= (otp->t_flagsext & (TF_RXTFINDROP | TF_NOTIMEWAIT | + TF_FASTOPEN | TF_L4S_ENABLED | TF_L4S_DISABLED)); + tp->t_keepinit = otp->t_keepinit; + tp->t_keepcnt = otp->t_keepcnt; + tp->t_keepintvl = otp->t_keepintvl; + tp->t_adaptive_wtimo = otp->t_adaptive_wtimo; + tp->t_adaptive_rtimo = otp->t_adaptive_rtimo; + tp->t_inpcb->inp_ip_ttl = otp->t_inpcb->inp_ip_ttl; + if (((*so2)->so_flags & SOF_NOTSENT_LOWAT) != 0) { + tp->t_notsent_lowat = otp->t_notsent_lowat; + } + if (tp->t_flagsext & (TF_L4S_ENABLED | TF_L4S_DISABLED)) { + tcp_set_foreground_cc(*so2); + } + tp->t_inpcb->inp_flags2 |= + otp->t_inpcb->inp_flags2 & INP2_KEEPALIVE_OFFLOAD; + + /* now drop the reference on the listener */ + socket_unlock(so, 1); + + tp->request_r_scale = tcp_get_max_rwinscale(tp, *so2); + +#if CONTENT_FILTER + if (check_cfil) { + int error = cfil_sock_attach(*so2, SA(&to2), SA(&from), CFS_CONNECTION_DIR_IN); + if (error != 0) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " cfil_sock_attach failed"); + goto drop; + } + } +#endif /* CONTENT_FILTER */ + + KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0); + + return true; +drop: + return false; +#undef TCP_LOG_HDR +} + +/* + * This function is used to setup TCP server socket in either of below cases, + * 1. SYN cookie is disabled and SYN is received. + * 2. SYN cookie is enabled and SYN cookie is received with last ACK + * Socket MUST already be created before this function is called. + * It returns true for success and false for failure. + */ +bool +tcp_setup_server_socket(struct tcp_inp *tpi, struct socket *so, bool syn_cookie_used) +{ +#define TCP_LOG_HDR (tpi->isipv6 ? 
(void *)tpi->ip6 : (void *)tpi->ip) + + struct inpcb *inp = *tpi->inp; + struct tcpcb *tp = *tpi->tp; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + int error = 0; + struct in_addr laddr; + struct in6_addr laddr6; + + socket_lock_assert_owned(so); + + /* Clear the logging flags inherited from the listening socket */ + inp->inp_log_flags = 0; + inp->inp_flags2 &= ~INP2_LOGGING_ENABLED; + + if (__improbable(inp->inp_flags2 & INP2_BIND_IN_PROGRESS)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, tpi->th, tp, false, "LISTEN bind in progress"); + + return false; + } + inp_enter_bind_in_progress(so); + + if (tpi->isipv6) { + sin6 = kalloc_type(struct sockaddr_in6, Z_NOWAIT | Z_ZERO); + if (sin6 == NULL) { + error = ENOMEM; + TCP_LOG_DROP_PCB(TCP_LOG_HDR, tpi->th, tp, false, "LISTEN kalloc_type failed"); + goto pcbconnect_done; + } + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_addr = tpi->ip6->ip6_src; + sin6->sin6_port = tpi->th->th_sport; + if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&tpi->ip6->ip6_src)) { + sin6->sin6_scope_id = ip6_input_getsrcifscope(tpi->m); + } + laddr6 = inp->in6p_laddr; + uint32_t lifscope = inp->inp_lifscope; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { + inp->in6p_laddr = tpi->ip6->ip6_dst; + inp->inp_lifscope = in6_addr2scopeid(tpi->ifp, &inp->in6p_laddr); + in6_verify_ifscope(&inp->in6p_laddr, inp->inp_lifscope); + } + if ((error = in6_pcbconnect(inp, SA(sin6), tpi->kernel_proc)) != 0) { + inp->in6p_laddr = laddr6; + kfree_type(struct sockaddr_in6, sin6); + inp->inp_lifscope = lifscope; + in6_verify_ifscope(&inp->in6p_laddr, inp->inp_lifscope); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, tpi->th, tp, false, " LISTEN in6_pcbconnect failed"); + goto pcbconnect_done; + } + kfree_type(struct sockaddr_in6, sin6); + } else { + socket_lock_assert_owned(so); + sin = kalloc_type(struct sockaddr_in, Z_NOWAIT); + if (sin == NULL) { + error = ENOMEM; + TCP_LOG_DROP_PCB(TCP_LOG_HDR, tpi->th, tp, false, "LISTEN kalloc_type failed"); + goto pcbconnect_done; + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = tpi->ip->ip_src; + sin->sin_port = tpi->th->th_sport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) { + inp->inp_laddr = tpi->ip->ip_dst; + } + if ((error = in_pcbconnect(inp, SA(sin), tpi->kernel_proc, IFSCOPE_NONE, NULL)) != 0) { + inp->inp_laddr = laddr; + kfree_type(struct sockaddr_in, sin); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, tpi->th, tp, false, " LISTEN in_pcbconnect failed"); + goto pcbconnect_done; + } + kfree_type(struct sockaddr_in, sin); + } +pcbconnect_done: + inp_exit_bind_in_progress(so); + if (error != 0) { + return false; + } + /* + * We already processed the options just before calling + * tcp_syncookie_ack. If timestamp option is present in + * last ACK, then we assume that it was already negotiated + * during SYN/ACK. For other options, we derive the state + * from the cookie. 
+ */ + if (syn_cookie_used) { + tpi->to->to_flags |= TOF_SCALE; + tpi->to->to_wscale = MIN(tpi->peer_wscale, TCP_MAX_WINSHIFT); + tpi->to->to_mss = tpi->peer_mss; + tpi->to->to_flags |= TOF_MSS; + + if (tpi->sackok == 1) { + tpi->to->to_flags |= TOF_SACKPERM; + } + if (tpi->ecnok == 1) { + tp->ecn_flags |= (TE_ECN_ON | TE_SENDIPECT); + } + } + /* Get timestamp and other options that are in either: + * SYN, when SYN cookies are disabled + * OR last ACK, when SYN cookies are enabled + */ + if (tpi->optp != NULL) { + tcp_dooptions(tp, tpi->optp, tpi->optlen, tpi->th, tpi->to); + } + tcp_finalize_options(tp, tpi->to, tpi->ifscope); + + if (tpi->iss) { + tp->iss = tpi->iss; + } else { + tp->iss = tcp_new_isn(tp); + } + if (syn_cookie_used) { + tp->irs = tpi->irs; + } else { + tp->irs = tpi->th->th_seq; + } + if (tpi->ts_offset) { + tp->t_ts_offset = tpi->ts_offset; + /* Adjust received tsecr when SYN cookie is used */ + tpi->to->to_tsecr -= tpi->ts_offset; + } + tcp_sendseqinit(tp); + tcp_rcvseqinit(tp); + tp->snd_recover = tp->snd_una; + /* + * Initialization of the tcpcb for transaction; + * set SND.WND = SEG.WND, + * initialize CCsend and CCrecv. + */ + tp->snd_wnd = tpi->tiwin; /* initial send-window */ + tp->max_sndwnd = tp->snd_wnd; + tp->t_flags |= TF_ACKNOW; + tp->t_unacksegs = 0; + tp->t_unacksegs_ce = 0; + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); + TCP_LOG_STATE(tp, TCPS_SYN_RECEIVED); + + tp->t_state = TCPS_SYN_RECEIVED; + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, + TCP_CONN_KEEPINIT(tp)); + tp->t_connect_time = tcp_now; + + if (inp->inp_flowhash == 0) { + inp_calc_flowhash(inp); + ASSERT(inp->inp_flowhash != 0); + } + /* update flowinfo - RFC 6437 */ + if (inp->inp_flow == 0 && + inp->in6p_flags & IN6P_AUTOFLOWLABEL) { + inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; + inp->inp_flow |= + (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); + } + + /* reset the incomp processing flag */ + so->so_flags &= ~(SOF_INCOMP_INPROGRESS); + tcpstat.tcps_accepts++; + + if (!syn_cookie_used) { + int ace_flags = ((tpi->th->th_x2 << 8) | tpi->th->th_flags) & TH_ACE; + tcp_input_process_accecn_syn(tp, ace_flags, tpi->ip_ecn); + } + /* + * The address and connection state are finalized + */ + TCP_LOG_CONNECT(tp, false, 0); + + tcp_add_fsw_flow(tp, tpi->ifp); + + return true; +#undef TCP_LOG_HDR +} + +static void +tcp_input_process_wake_packet(__unused struct mbuf *m, __unused protocol_family_t protocol_family, struct inpcb *inp) +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + + /* + * Note: we will stay in LPW if the TCP packet is invalid or have not found a PCB + */ + if (__improbable(if_is_lpw_enabled(ifp))) { + if (inp->inp_flags2 & INP2_CONNECTION_IDLE) { + struct tcpcb *tp = intotcpcb(inp); + TCP_LOG(tp, "LPW drop TCP connection idle"); + tcp_drop(tp, 0); + } else { + if_exit_lpw(ifp, "TCP connection not idle "); + } + } +} + void tcp_input(struct mbuf *m, int off0) { int exiting_fr = 0; struct tcphdr *th; struct ip *ip = NULL; - struct inpcb *inp; + struct inpcb *__single inp; u_char *optp = NULL; int optlen = 0; int tlen, off; int drop_hdrlen; - struct tcpcb *tp = 0; + struct tcpcb *__single tp = 0; int thflags; struct socket *so = 0; int todrop, acked = 0, ourfinisacked, needoutput = 0; int read_wakeup = 0; int write_wakeup = 0; - struct in_addr laddr; - struct in6_addr laddr6; int dropsocket = 0; int iss = 0, nosock = 0; uint32_t tiwin, sack_bytes_acked = 0; @@ -2051,10 +2785,11 @@ tcp_input(struct mbuf *m, int off0) 
boolean_t dsack_tlp = false; struct tcp_respond_args tra; int prev_t_state; - boolean_t check_cfil = cfil_filter_present(); bool findpcb_iterated = false; bool rack_loss_detected = false; bool is_th_swapped = false; + bool syn_cookie_processed = false; + bool ret = false; /* * The mbuf may be freed after it has been added to the receive socket * buffer or the reassembly queue, so we reinitialize th to point to a @@ -2074,9 +2809,6 @@ tcp_input(struct mbuf *m, int off0) } while (0) drop_reason_t drop_reason = DROP_REASON_UNSPECIFIED; - if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) { - segment_count = 1; - } TCP_INC_VAR(tcpstat.tcps_rcvtotal, segment_count); struct ip6_hdr *ip6 = NULL; @@ -2224,6 +2956,7 @@ tcp_input(struct mbuf *m, int off0) to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4)); to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8)); optp = NULL; /* we've parsed the options */ + optlen = 0; } } thflags = th->th_flags; @@ -2394,9 +3127,6 @@ findpcb: * as this isn't cause for a panic (the socket might be leaked however)... */ inp = NULL; -#if TEMPDEBUG - printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp); -#endif TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp_socket NULL"); drop_reason = DROP_REASON_TCP_NO_SOCK; goto dropnosock; @@ -2468,6 +3198,13 @@ findpcb: goto drop; } + /* + * Note: we will stay in LPW if the TCP packet is invalid or have not found a PCB + */ + if (__improbable((m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) != 0)) { + tcp_input_process_wake_packet(m, isipv6 ? PF_INET6 : PF_INET, inp); + } + #if NECP if (so->so_state & SS_ISCONNECTED) { // Connected TCP sockets have a fully-bound local and remote, @@ -2545,379 +3282,203 @@ findpcb: soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_WAKE_PKT); } - if (so->so_options & (SO_DEBUG | SO_ACCEPTCONN)) { - if (so->so_options & SO_ACCEPTCONN) { - struct tcpcb *tp0 = tp; - struct socket *so2; - struct socket *oso; - struct sockaddr_storage from; - struct sockaddr_storage to2; - struct inpcb *oinp = sotoinpcb(so); - struct ifnet *head_ifscope; - bool head_nocell, head_recvanyif, - head_noexpensive, head_awdl_unrestricted, - head_intcoproc_allowed, head_external_port, - head_noconstrained, head_management_allowed, - head_ultra_constrained_allowed; - - /* Get listener's bound-to-interface, if any */ - head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundifp : NULL; - /* Get listener's no-cellular information, if any */ - head_nocell = INP_NO_CELLULAR(inp); - /* Get listener's recv-any-interface, if any */ - head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF); - /* Get listener's no-expensive information, if any */ - head_noexpensive = INP_NO_EXPENSIVE(inp); - head_noconstrained = INP_NO_CONSTRAINED(inp); - head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp); - head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp); - head_external_port = (inp->inp_flags2 & INP2_EXTERNAL_PORT); - head_management_allowed = INP_MANAGEMENT_ALLOWED(inp); - head_ultra_constrained_allowed = INP_ULTRA_CONSTRAINED_ALLOWED(inp); - + if (so->so_options & SO_ACCEPTCONN) { + struct socket *__single so2; + /* + * Initialize with fields common to both case: + * 1. SYN is received + * 2. 
Last ACK is received for listening socket (when SYN cookie is enabled) + */ + struct tcp_inp tpi = {.so = so, .inp = &inp, .tp = &tp, .m = m, .th = th, + .to = &to, .optp = optp, .optlen = optlen, .ip6 = ip6, .ip = ip, + .isipv6 = isipv6, .ifp = ifp, .ifscope = ifscope, .kernel_proc = kernel_proc}; + /* + * When SYN cookie is enabled, check for an existing connection + * attempt if the flag is only ACK. A successful lookup creates a new + * socket appended to the listen queue in SYN_RECEIVED state. + */ + if (TCP_SYN_COOKIE_ENABLED(tp) && (thflags & (TH_RST | TH_ACK | TH_SYN)) == TH_ACK) { /* - * If the state is LISTEN then ignore segment if it contains an RST. - * If the segment contains an ACK then it is bad and send a RST. - * If it does not contain a SYN then it is not interesting; drop it. - * If it is from this socket, drop it, it must be forged. + * Pull initial sequence numbers out of last ACK and + * revert sequence number advances. Populate other fields + * needed to create and setup the server socket. */ - if ((thflags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) { - IF_TCP_STATINC(ifp, listbadsyn); - - if (thflags & TH_RST) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, - thflags & TH_SYN ? "ignore SYN with RST" : "ignore RST"); - drop_reason = DROP_REASON_TCP_SYN_RST; - goto drop; - } - if (thflags & TH_ACK) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, - thflags & TH_SYN ? "bad SYN with ACK" : "bad ACK"); - tp = NULL; - tcpstat.tcps_badsyn++; - drop_reason = DROP_REASON_TCP_SYN_ACK_LISTENER; - goto dropwithreset; - } - - /* We come here if there is no SYN set */ - tcpstat.tcps_badsyn++; - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN"); - drop_reason = DROP_REASON_TCP_LISTENER_NO_SYN; - goto drop; - } - KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0); - if (th->th_dport == th->th_sport) { - if (isipv6) { - if (in6_are_addr_equal_scoped(&ip6->ip6_dst, &ip6->ip6_src, ip6_input_getdstifscope(m), ip6_input_getsrcifscope(m))) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same port"); - drop_reason = DROP_REASON_TCP_SAME_PORT; - goto drop; - } - } else if (ip->ip_dst.s_addr == ip->ip_src.s_addr) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same IPv4 address"); - drop_reason = DROP_REASON_TCP_SAME_PORT; - goto drop; - } - } - /* - * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN - * in_broadcast() should never return true on a received - * packet with M_BCAST not set. - * - * Packets with a multicast source address should also - * be discarded. - */ - if (m->m_flags & (M_BCAST | M_MCAST)) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mbuf M_BCAST | M_MCAST"); - drop_reason = DROP_REASON_TCP_BCAST_MCAST; - goto drop; - } - if (isipv6) { - if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || - IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "IN6_IS_ADDR_MULTICAST"); - drop_reason = DROP_REASON_TCP_BCAST_MCAST; - goto drop; - } - } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || - ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || - in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "multicast or broadcast address"); - drop_reason = DROP_REASON_TCP_BCAST_MCAST; - goto drop; - } - - - /* - * If deprecated address is forbidden, - * we do not accept SYN to deprecated interface - * address to prevent any new inbound connection from - * getting established. 
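/*
 * Illustrative sketch (editorial, not part of the xnu patch): the
 * SYN-cookie ACK path in this hunk recovers the initial sequence numbers
 * as th_ack - 1 and th_seq - 1.  The SYN-ACK consumed one sequence number
 * in each direction, so the handshake-completing ACK carries
 * th_ack = ISS + 1 and th_seq = IRS + 1; subtracting one reverts those
 * advances without any stored SYN state.  Plain integers stand in for the
 * kernel types.
 */
#include <stdint.h>

struct handshake_ack {
	uint32_t th_seq;	/* peer's next sequence number: IRS + 1 */
	uint32_t th_ack;	/* acknowledges our SYN-ACK:    ISS + 1 */
};

static void
recover_initial_seqnums(const struct handshake_ack *ack,
    uint32_t *iss, uint32_t *irs)
{
	*iss = ack->th_ack - 1;	/* our initial send sequence (the cookie) */
	*irs = ack->th_seq - 1;	/* peer's initial receive sequence */
}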
- * When we do not accept SYN, we send a TCP RST, - * with deprecated source address (instead of dropping - * it). We compromise it as it is much better for peer - * to send a RST, and RST will be the final packet - * for the exchange. - * - * If we do not forbid deprecated addresses, we accept - * the SYN packet. RFC 4862 forbids dropping SYN in - * this case. - */ - if (isipv6 && !ip6_use_deprecated) { - uint32_t ia6_flags; - - if (ip6_getdstifaddr_info(m, NULL, - &ia6_flags) == 0) { - if (ia6_flags & IN6_IFF_DEPRECATED) { - tp = NULL; - IF_TCP_STATINC(ifp, deprecate6); - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "deprecated IPv6 address"); - drop_reason = DROP_REASON_TCP_DEPRECATED_ADDR; - goto dropwithreset; - } - } - } - if (so->so_filt || check_cfil) { - if (isipv6) { - struct sockaddr_in6 *sin6 = SIN6(&from); - - sin6->sin6_len = sizeof(*sin6); - sin6->sin6_family = AF_INET6; - sin6->sin6_port = th->th_sport; - sin6->sin6_flowinfo = 0; - sin6->sin6_addr = ip6->ip6_src; - sin6->sin6_scope_id = 0; - - sin6 = SIN6(&to2); - - sin6->sin6_len = sizeof(struct sockaddr_in6); - sin6->sin6_family = AF_INET6; - sin6->sin6_port = th->th_dport; - sin6->sin6_flowinfo = 0; - sin6->sin6_addr = ip6->ip6_dst; - sin6->sin6_scope_id = 0; - } else { - struct sockaddr_in *sin = SIN(&from); - - sin->sin_len = sizeof(*sin); - sin->sin_family = AF_INET; - sin->sin_port = th->th_sport; - sin->sin_addr = ip->ip_src; - - sin = SIN(&to2); - - sin->sin_len = sizeof(struct sockaddr_in); - sin->sin_family = AF_INET; - sin->sin_port = th->th_dport; - sin->sin_addr = ip->ip_dst; - } - } - - if (so->so_filt) { - so2 = sonewconn(so, 0, SA(&from)); - } else { - so2 = sonewconn(so, 0, NULL); - } - if (so2 == 0) { + tpi.iss = th->th_ack - 1; + tpi.irs = th->th_seq - 1; + tpi.tiwin = tiwin; + tpi.ip_ecn = ip_ecn; + ret = tcp_syncookie_ack(&tpi, &so2, &dropsocket); + if (so2 == NULL) { + /* Either cookie validation failed or we could not allocate a socket */ tcpstat.tcps_listendrop++; - if (tcp_dropdropablreq(so)) { - if (so->so_filt) { - so2 = sonewconn(so, 0, SA(&from)); - } else { - so2 = sonewconn(so, 0, NULL); - } - } - if (!so2) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop"); - drop_reason = DROP_REASON_TCP_LISTENER_DROP; - goto drop; - } + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop with SYN cookies"); + drop_reason = DROP_REASON_TCP_LISTENER_DROP; + goto drop; + } + /* Set so to newly connected socket */ + so = so2; + if (ret == false) { + /* + * There are multiple reasons for tcp_syncookie_ack() to return + * failure even if server socket was created successfully + * 1. During server socket creation, if we failed to put the + * PCB on the hash lists or cfil_sock_attach failed. + * 2. During server socket setup, if in_pcbconnect failed. + * Need to check th behavior when ACK was not for our + * SYN/ACK. Do our protection against double ACK. If peer + * sent us 2 ACKs, then for the first one tcp_syncookie_ack() + * successfully creates a connected socket, while we were + * waiting on the inpcb lock. + */ + drop_reason = DROP_REASON_TCP_LISTENER_DROP; + goto drop; } /* Point "inp" and "tp" in tandem to new socket */ - inp = (struct inpcb *)so2->so_pcb; + inp = (struct inpcb *)so->so_pcb; tp = intotcpcb(inp); + syn_cookie_processed = true; + /* + * New connection inpcb is already locked by + * tcp_syncookie_ack() when it calls tcp_create_server_socket. + */ + ASSERT(tp->t_state == TCPS_SYN_RECEIVED); + /* + * Process the segment and the data it + * contains. 
+ */ + goto syn_cookie_valid; + } - oso = so; - socket_unlock(so, 0); /* Unlock but keep a reference on listener for now */ + /* + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * If it is from this socket, drop it, it must be forged. + */ + if ((thflags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) { + IF_TCP_STATINC(ifp, listbadsyn); - so = so2; - socket_lock(so, 1); - /* - * Mark socket as temporary until we're - * committed to keeping it. The code at - * ``drop'' and ``dropwithreset'' check the - * flag dropsocket to see if the temporary - * socket created here should be discarded. - * We mark the socket as discardable until - * we're committed to it below in TCPS_LISTEN. - * There are some error conditions in which we - * have to drop the temporary socket. - */ - dropsocket++; - /* - * Inherit INP_BOUND_IF from listener; testing if - * head_ifscope is non-NULL is sufficient, since it - * can only be set to a non-zero value earlier if - * the listener has such a flag set. - */ - if (head_ifscope != NULL) { - inp->inp_flags |= INP_BOUND_IF; - inp->inp_boundifp = head_ifscope; - } else { - inp->inp_flags &= ~INP_BOUND_IF; - } - /* - * Inherit restrictions from listener. - */ - if (head_nocell) { - inp_set_nocellular(inp); - } - if (head_noexpensive) { - inp_set_noexpensive(inp); - } - if (head_noconstrained) { - inp_set_noconstrained(inp); - } - if (head_awdl_unrestricted) { - inp_set_awdl_unrestricted(inp); - } - if (head_intcoproc_allowed) { - inp_set_intcoproc_allowed(inp); - } - if (head_management_allowed) { - inp_set_management_allowed(inp); - } - if (head_ultra_constrained_allowed) { - inp_set_ultra_constrained_allowed(inp); - } - /* - * Inherit {IN,IN6}_RECV_ANYIF from listener. - */ - if (head_recvanyif) { - inp->inp_flags |= INP_RECV_ANYIF; - } else { - inp->inp_flags &= ~INP_RECV_ANYIF; - } - - if (head_external_port) { - inp->inp_flags2 |= INP2_EXTERNAL_PORT; - } - if (isipv6) { - inp->in6p_laddr = ip6->ip6_dst; - inp->inp_lifscope = in6_addr2scopeid(ifp, &inp->in6p_laddr); - in6_verify_ifscope(&ip6->ip6_dst, inp->inp_lifscope); - } else { - inp->inp_vflag &= ~INP_IPV6; - inp->inp_vflag |= INP_IPV4; - inp->inp_laddr = ip->ip_dst; - } - inp->inp_lport = th->th_dport; - if (in_pcbinshash(inp, SA(&from), 0) != 0) { - /* - * Undo the assignments above if we failed to - * put the PCB on the hash lists. - */ - if (isipv6) { - inp->in6p_laddr = in6addr_any; - inp->inp_lifscope = IFSCOPE_NONE; - } else { - inp->inp_laddr.s_addr = INADDR_ANY; - } -#if SKYWALK - netns_release(&inp->inp_netns_token); -#endif /* SKYWALK */ - inp->inp_lport = 0; - socket_lock(oso, 0); /* release ref on parent */ - socket_unlock(oso, 1); - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " in_pcbinshash failed"); - drop_reason = DROP_REASON_TCP_PCB_HASH_FAILED; + if (thflags & TH_RST) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, + thflags & TH_SYN ? "ignore SYN with RST" : "ignore RST"); + drop_reason = DROP_REASON_TCP_SYN_RST; goto drop; } - socket_lock(oso, 0); + if (thflags & TH_ACK) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, + thflags & TH_SYN ? 
"bad SYN with ACK" : "bad ACK"); + tp = NULL; + tcpstat.tcps_badsyn++; + drop_reason = DROP_REASON_TCP_SYN_ACK_LISTENER; + goto dropwithreset; + } + + /* We come here if there is no SYN set */ + tcpstat.tcps_badsyn++; + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN"); + drop_reason = DROP_REASON_TCP_LISTENER_NO_SYN; + goto drop; + } + KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0); + if (th->th_dport == th->th_sport) { if (isipv6) { - /* - * Inherit socket options from the listening - * socket. - * Note that in6p_inputopts are not (even - * should not be) copied, since it stores - * previously received options and is used to - * detect if each new option is different than - * the previous one and hence should be passed - * to a user. - * If we copied in6p_inputopts, a user would - * not be able to receive options just after - * calling the accept system call. - */ - inp->inp_flags |= - oinp->inp_flags & INP_CONTROLOPTS; - if (oinp->in6p_outputopts) { - inp->in6p_outputopts = - ip6_copypktopts(oinp->in6p_outputopts, - Z_NOWAIT); - } - } else { - inp->inp_options = ip_srcroute(); - inp->inp_ip_tos = oinp->inp_ip_tos; - } -#if IPSEC - /* copy old policy into new socket's */ - if (sotoinpcb(oso)->inp_sp) { - int error = 0; - /* Is it a security hole here to silently fail to copy the policy? */ - if (inp->inp_sp == NULL) { - error = ipsec_init_policy(so, &inp->inp_sp); - } - if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) { - printf("tcp_input: could not copy policy\n"); - } - } -#endif - /* inherit states from the listener */ - DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, - struct tcpcb *, tp, int32_t, TCPS_LISTEN); - TCP_LOG_STATE(tp, TCPS_LISTEN); - tp->t_state = TCPS_LISTEN; - tp->t_flags |= tp0->t_flags & (TF_NOPUSH | TF_NOOPT | TF_NODELAY); - tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP | TF_NOTIMEWAIT | TF_FASTOPEN | TF_L4S_ENABLED | TF_L4S_DISABLED)); - tp->t_keepinit = tp0->t_keepinit; - tp->t_keepcnt = tp0->t_keepcnt; - tp->t_keepintvl = tp0->t_keepintvl; - tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo; - tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo; - tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; - if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) { - tp->t_notsent_lowat = tp0->t_notsent_lowat; - } - if (tp->t_flagsext & (TF_L4S_ENABLED | TF_L4S_DISABLED)) { - tcp_set_foreground_cc(so); - } - tp->t_inpcb->inp_flags2 |= - tp0->t_inpcb->inp_flags2 & INP2_KEEPALIVE_OFFLOAD; - - /* now drop the reference on the listener */ - socket_unlock(oso, 1); - - tcp_set_max_rwinscale(tp, so); - -#if CONTENT_FILTER - if (check_cfil) { - int error = cfil_sock_attach(so2, SA(&to2), SA(&from), CFS_CONNECTION_DIR_IN); - if (error != 0) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " cfil_sock_attach failed"); - drop_reason = DROP_REASON_TCP_CONTENT_FILTER_ATTACH; + if (in6_are_addr_equal_scoped(&ip6->ip6_dst, &ip6->ip6_src, ip6_input_getdstifscope(m), ip6_input_getsrcifscope(m))) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same port"); + drop_reason = DROP_REASON_TCP_SAME_PORT; goto drop; } + } else if (ip->ip_dst.s_addr == ip->ip_src.s_addr) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same IPv4 address"); + drop_reason = DROP_REASON_TCP_SAME_PORT; + goto drop; } -#endif /* CONTENT_FILTER */ + } + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set. 
+ * + * Packets with a multicast source address should also + * be discarded. + */ + if (m->m_flags & (M_BCAST | M_MCAST)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mbuf M_BCAST | M_MCAST"); + drop_reason = DROP_REASON_TCP_BCAST_MCAST; + goto drop; + } + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "IN6_IS_ADDR_MULTICAST"); + drop_reason = DROP_REASON_TCP_BCAST_MCAST; + goto drop; + } + } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "multicast or broadcast address"); + drop_reason = DROP_REASON_TCP_BCAST_MCAST; + goto drop; + } - KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0); + /* + * If deprecated address is forbidden, + * we do not accept SYN to deprecated interface + * address to prevent any new inbound connection from + * getting established. + * When we do not accept SYN, we send a TCP RST, + * with deprecated source address (instead of dropping + * it). We compromise it as it is much better for peer + * to send a RST, and RST will be the final packet + * for the exchange. + * + * If we do not forbid deprecated addresses, we accept + * the SYN packet. RFC 4862 forbids dropping SYN in + * this case. + */ + if (isipv6 && !ip6_use_deprecated) { + uint32_t ia6_flags; + + if (ip6_getdstifaddr_info(m, NULL, + &ia6_flags) == 0) { + if (ia6_flags & IN6_IFF_DEPRECATED) { + tp = NULL; + IF_TCP_STATINC(ifp, deprecate6); + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "deprecated IPv6 address"); + drop_reason = DROP_REASON_TCP_DEPRECATED_ADDR; + goto dropwithreset; + } + } + } + + bool syn_cookie_sent = false; + ret = tcp_create_server_socket(&tpi, &so2, &syn_cookie_sent, &dropsocket); + + if (syn_cookie_sent) { + /* + * SYN cookie sent and mbuf consumed. + * Only the listen socket is unlocked by tcp_syncookie_syn(). + */ + KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); + return; + } + if (!so2) { + TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop"); + drop_reason = DROP_REASON_TCP_LISTENER_DROP; + goto drop; + } + /* Set so to newly connected socket */ + so = so2; + + if (ret == false) { + drop_reason = DROP_REASON_TCP_CREATE_SERVER_SOCKET; + goto drop; } } +syn_cookie_valid: socket_lock_assert_owned(so); - /* * Packet accounting should not be done on listening socket */ @@ -2939,34 +3500,7 @@ findpcb: so->last_pid, so->so_log_seqn++); } - if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) { - /* - * Evaluate the rate of arrival of packets to see if the - * receiver can reduce the ack traffic. The algorithm to - * stretch acks will be enabled if the connection meets - * certain criteria defined in tcp_stretch_ack_enable function. 
- */ - if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) { - TCP_INC_VAR(tp->rcv_waitforss, segment_count); - } - if (tcp_stretch_ack_enable(tp, thflags)) { - tp->t_flags |= TF_STRETCHACK; - tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS); - tp->rcv_waitforss = 0; - } else { - tp->t_flags &= ~(TF_STRETCHACK); - } - if (TSTMP_GT(tp->rcv_unackwin - (tcp_rcvunackwin >> 1), tcp_now)) { - tp->rcv_by_unackhalfwin += (tlen + off); - tp->rcv_by_unackwin += (tlen + off); - } else { - tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; - tp->rcv_by_unackwin = tp->rcv_by_unackhalfwin + tlen + off; - tp->rcv_by_unackhalfwin = tlen + off; - } - } - - if (TCP_L4S_ENABLED(tp) && TCP_ACC_ECN_ON(tp)) { + if (tp->accurate_ecn_on) { /* Reset the state used for AccECN processing */ tp->t_aecn.accecn_processed = 0; } @@ -2988,7 +3522,7 @@ findpcb: } /* Accurate ECN has different semantics for TH_CWR. */ - if (!TCP_ACC_ECN_ON(tp)) { + if (!tp->accurate_ecn_on) { /* * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't * bother doing extensive checks for state and whatnot. @@ -3005,7 +3539,7 @@ findpcb: * or valid data packets */ uint8_t ace = tcp_get_ace(th); - if (TCP_ACC_ECN_ON(tp) && tp->t_state == TCPS_ESTABLISHED) { + if (tp->accurate_ecn_on && tp->t_state == TCPS_ESTABLISHED) { /* Update receive side counters */ if (tlen == 0 || (tlen > 0 && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && @@ -3044,38 +3578,34 @@ findpcb: /* * If we received an explicit notification of congestion in * ip tos ecn bits or by the CWR bit in TCP header flags, reset - * the ack-stretching state. We need to handle ECN notification if + * the force-ACK counter. We need to handle ECN notification if * an ECN setup SYN was sent even once. */ if (tp->t_state == TCPS_ESTABLISHED && (tp->ecn_flags & TE_SETUPSENT) && (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) { - tcp_reset_stretch_ack(tp); tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; CLEAR_IAJ_STATE(tp); } - if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED && - !TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) { - tcpstat.tcps_ecn_fallback_ce++; - tcp_heuristic_ecn_aggressive(tp); - tp->ecn_flags |= TE_CEHEURI_SET; - } - if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) && - ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) { - if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) { - tp->t_ecn_recv_ce_pkt++; - } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) { + if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED) { + /* Received CE on a non-ECN enabled connection */ + if (!TCP_ECN_ENABLED(tp)) { tcpstat.tcps_ecn_fallback_ce++; - tcp_heuristic_ecn_aggressive(tp); - tp->ecn_flags |= TE_CEHEURI_SET; INP_INC_IFNET_STAT(inp, ecn_fallback_ce); - } else { - /* We tracked the first ECN_MIN_CE_PROBES segments, we - * now know that the path is good. - */ - tp->ecn_flags |= TE_CEHEURI_SET; + } else if (!(tp->ecn_flags & TE_ECEHEURI_SET)) { + if (inp->inp_mstat.ms_total.ts_rxpackets < ECN_MIN_CE_PROBES) { + tp->t_ecn_recv_ce_pkt++; + } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) { + tcp_heuristic_ecn_aggressive(tp); + tp->ecn_flags |= TE_ECEHEURI_SET; + } else { + /* We tracked the first ECN_MIN_CE_PROBES segments, we + * now know that the path is good. 
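/*
 * Illustrative sketch (editorial, not part of the xnu patch): the CE
 * sampling heuristic in the hunk above, in isolation.  While fewer than
 * ECN_MIN_CE_PROBES packets have been received the connection only counts
 * CE marks; once past that window, an excessive count is treated as a
 * broken or hostile path and ECN is handled more aggressively, otherwise
 * the path is deemed good and the check is not repeated.  The constants
 * and state struct below are stand-ins, not the kernel's values.
 */
#include <stdbool.h>
#include <stdint.h>

#define CE_MIN_PROBES   10	/* packets to sample before judging (assumed) */
#define CE_MAX_MARKS     3	/* more CE marks than this => fall back (assumed) */

struct ce_heuristic {
	uint32_t rxpackets;	/* total packets received, maintained elsewhere */
	uint32_t ce_marks;	/* CE-marked packets seen during the sample */
	bool     decided;	/* heuristic already ran for this flow */
	bool     fallback;	/* true if ECN should be de-emphasized */
};

static void
ce_heuristic_on_ce_mark(struct ce_heuristic *h)
{
	if (h->decided) {
		return;
	}
	if (h->rxpackets < CE_MIN_PROBES) {
		h->ce_marks++;		/* still sampling */
	} else if (h->ce_marks > CE_MAX_MARKS) {
		h->fallback = true;	/* excessive marking: back off */
		h->decided = true;
	} else {
		h->decided = true;	/* sample complete, path looks good */
	}
}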
+ */ + tp->ecn_flags |= TE_ECEHEURI_SET; + } } } @@ -3221,7 +3751,7 @@ findpcb: * We increment t_unacksegs_ce for both data segments * and pure ACKs for Accurate ECN */ - if (TCP_ACC_ECN_ON(tp) && ip_ecn == IPTOS_ECN_CE) { + if (tp->accurate_ecn_on && ip_ecn == IPTOS_ECN_CE) { TCP_INC_VAR(tp->t_unacksegs_ce, segment_count); } @@ -3250,10 +3780,7 @@ findpcb: tcpstat.tcps_rcvackbyte += acked; /* TE_SENDIPECT is only set when L4S sysctl is enabled */ - if (TCP_ACC_ECN_ON(tp) && (tp->ecn_flags & TE_SENDIPECT)) { - if (!TCP_L4S_ENABLED(tp)) { - os_log_error(OS_LOG_DEFAULT, "TE_SENDIPECT flag is set but TCP_L4S_ENABLED is not"); - } + if (tp->accurate_ecn_on && (tp->ecn_flags & TE_SENDIPECT)) { uint32_t pkts_acked = tcp_packets_this_ack(tp, acked); tp->total_ect_packets_acked += pkts_acked; @@ -3336,8 +3863,7 @@ findpcb: tp->t_timer[TCPT_REORDER] = 0; tcp_rack_reset_segs_retransmitted(tp); } else if (tp->t_timer[TCPT_PERSIST] == 0) { - tcp_set_link_heur_rtomin(tp, inp->inp_last_outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + tcp_set_rto(tp); } if (!SLIST_EMPTY(&tp->t_rxt_segments) && !TCP_DSACK_SEQ_IN_WINDOW(tp, @@ -3373,6 +3899,18 @@ findpcb: } } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && tlen <= tcp_sbspace(tp)) { + int mem = tcp_memacct_limited(); + if (mem == MEMACCT_HARDLIMIT || + (mem == MEMACCT_SOFTLIMIT && so->so_rcv.sb_cc > 0)) { + /* + * If we are at the hard limit, just drop. + * If we are at the softlimit, only accept one + * packet into the receive-queue. + */ + drop_reason = DROP_REASON_TCP_INSEQ_MEMORY_PRESSURE; + tcpstat.tcps_rcvmemdrop++; + goto drop; + } /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and @@ -3406,11 +3944,7 @@ findpcb: TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count); tcpstat.tcps_rcvbyte += tlen; if (nstat_collect) { - INP_ADD_STAT(inp, ifnet_count_type, - rxpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, rxbytes, - tlen); - inp_set_activity_bitmap(inp); + INP_ADD_RXSTAT(inp, ifnet_count_type, 1, tlen); } /* Calculate the RTT on the receiver */ @@ -3461,7 +3995,7 @@ findpcb: if (DELAY_ACK(tp, th)) { if ((tp->t_flags & TF_DELACK) == 0) { tp->t_flags |= TF_DELACK; - tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + tp->t_timer[TCPT_DELACK] = tcp_offset_from_start(tp, tcp_delack); } } else { tp->t_flags |= TF_ACKNOW; @@ -3528,149 +4062,21 @@ findpcb: * segment in this state. 
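/*
 * Illustrative sketch (editorial, not part of the xnu patch): the
 * soft/hard memory-accounting gate applied above before queueing an
 * in-sequence segment on the fast path.  At the hard limit every new
 * segment is dropped; at the soft limit one segment may still be queued,
 * but only while the receive buffer is empty.  The enum and helper are
 * simplified stand-ins for tcp_memacct_limited() and the sockbuf fields.
 */
#include <stdbool.h>
#include <stdint.h>

enum memacct_state { MEMACCT_OK, MEMACCT_SOFT, MEMACCT_HARD };

static bool
should_drop_for_memory(enum memacct_state st, uint32_t rcv_queued_bytes)
{
	if (st == MEMACCT_HARD) {
		return true;			/* always drop under hard pressure */
	}
	if (st == MEMACCT_SOFT && rcv_queued_bytes > 0) {
		return true;			/* allow at most one queued segment */
	}
	return false;				/* accept the segment */
}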
*/ case TCPS_LISTEN: { - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - int error = 0; + struct tcp_inp tpi = {.inp = &inp, .tp = &tp, .m = m, .th = th, + .iss = iss, .tiwin = tiwin, .to = &to, .optp = optp, .optlen = optlen, + .ip6 = ip6, .ip = ip, .ip_ecn = ip_ecn, .isipv6 = isipv6, .ifp = ifp, + .ifscope = ifscope, .kernel_proc = kernel_proc}; + ret = tcp_setup_server_socket(&tpi, so, false); - socket_lock_assert_owned(so); - - /* Clear the logging flags inherited from the listening socket */ - inp->inp_log_flags = 0; - inp->inp_flags2 &= ~INP2_LOGGING_ENABLED; - - if (__improbable(inp->inp_flags2 & INP2_BIND_IN_PROGRESS)) { - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN bind in progress"); - drop_reason = DROP_REASON_TCP_BIND_IN_PROGRESS; + if (ret == false) { + drop_reason = DROP_REASON_TCP_CREATE_SERVER_SOCKET; goto drop; } - inp_enter_bind_in_progress(so); - - if (isipv6) { - sin6 = kalloc_type(struct sockaddr_in6, Z_NOWAIT | Z_ZERO); - if (sin6 == NULL) { - error = ENOMEM; - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN kalloc_type failed"); - drop_reason = DROP_REASON_TCP_MEM_ALLOC; - goto pcbconnect_done; - } - sin6->sin6_family = AF_INET6; - sin6->sin6_len = sizeof(*sin6); - sin6->sin6_addr = ip6->ip6_src; - sin6->sin6_port = th->th_sport; - if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) { - sin6->sin6_scope_id = ip6_input_getsrcifscope(m); - } - laddr6 = inp->in6p_laddr; - uint32_t lifscope = inp->inp_lifscope; - if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { - inp->in6p_laddr = ip6->ip6_dst; - inp->inp_lifscope = in6_addr2scopeid(ifp, &inp->in6p_laddr); - in6_verify_ifscope(&inp->in6p_laddr, inp->inp_lifscope); - } - if ((error = in6_pcbconnect(inp, SA(sin6), kernel_proc)) != 0) { - inp->in6p_laddr = laddr6; - kfree_type(struct sockaddr_in6, sin6); - inp->inp_lifscope = lifscope; - in6_verify_ifscope(&inp->in6p_laddr, inp->inp_lifscope); - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in6_pcbconnect failed"); - drop_reason = DROP_REASON_TCP_PCB_CONNECT; - goto pcbconnect_done; - } - kfree_type(struct sockaddr_in6, sin6); - } else { - socket_lock_assert_owned(so); - sin = kalloc_type(struct sockaddr_in, Z_NOWAIT); - if (sin == NULL) { - error = ENOMEM; - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN kalloc_type failed"); - drop_reason = DROP_REASON_TCP_MEM_ALLOC; - goto pcbconnect_done; - } - sin->sin_family = AF_INET; - sin->sin_len = sizeof(*sin); - sin->sin_addr = ip->ip_src; - sin->sin_port = th->th_sport; - bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); - laddr = inp->inp_laddr; - if (inp->inp_laddr.s_addr == INADDR_ANY) { - inp->inp_laddr = ip->ip_dst; - } - if ((error = in_pcbconnect(inp, SA(sin), kernel_proc, IFSCOPE_NONE, NULL)) != 0) { - inp->inp_laddr = laddr; - kfree_type(struct sockaddr_in, sin); - TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in_pcbconnect failed"); - drop_reason = DROP_REASON_TCP_PCB_CONNECT; - goto pcbconnect_done; - } - kfree_type(struct sockaddr_in, sin); - } -pcbconnect_done: - inp_exit_bind_in_progress(so); - if (error != 0) { - goto drop; - } - - tcp_dooptions(tp, optp, optlen, th, &to); - tcp_finalize_options(tp, &to, ifscope); - if (TFO_ENABLED(tp) && tcp_tfo_syn(tp, &to)) { isconnected = TRUE; } - - if (iss) { - tp->iss = iss; - } else { - tp->iss = tcp_new_isn(tp); - } - tp->irs = th->th_seq; - tcp_sendseqinit(tp); - tcp_rcvseqinit(tp); - tp->snd_recover = tp->snd_una; - /* - * Initialization of the tcpcb for transaction; - * set SND.WND = SEG.WND, - * initialize 
CCsend and CCrecv. - */ - tp->snd_wnd = tiwin; /* initial send-window */ - tp->max_sndwnd = tp->snd_wnd; - tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; - tp->t_unacksegs_ce = 0; - DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, - struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); - TCP_LOG_STATE(tp, TCPS_SYN_RECEIVED); - tp->t_state = TCPS_SYN_RECEIVED; - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, - TCP_CONN_KEEPINIT(tp)); - tp->t_connect_time = tcp_now; dropsocket = 0; /* committed to socket */ - if (inp->inp_flowhash == 0) { - inp_calc_flowhash(inp); - ASSERT(inp->inp_flowhash != 0); - } - /* update flowinfo - RFC 6437 */ - if (inp->inp_flow == 0 && - inp->in6p_flags & IN6P_AUTOFLOWLABEL) { - inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; - inp->inp_flow |= - (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); - } - - /* reset the incomp processing flag */ - so->so_flags &= ~(SOF_INCOMP_INPROGRESS); - tcpstat.tcps_accepts++; - - int ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE; - tcp_input_process_accecn_syn(tp, ace_flags, ip_ecn); - - /* - * The address and connection state are finalized - */ - TCP_LOG_CONNECT(tp, false, 0); - - tcp_add_fsw_flow(tp, ifp); - goto trimthenstep6; } @@ -3767,160 +4173,8 @@ pcbconnect_done: tcpstat.tcps_connects++; const uint32_t ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE; - - if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) { - /* Receiving Any|0|1 is classic ECN-setup SYN-ACK */ - tp->ecn_flags |= TE_SETUPRECEIVED; - if (TCP_ECN_ENABLED(tp)) { - tcp_heuristic_ecn_success(tp); - tcpstat.tcps_ecn_client_success++; - } - - if (tp->ecn_flags & TE_ACE_SETUPSENT) { - /* - * Sent AccECN SYN but received classic ECN SYN-ACK - * Set classic ECN related flags - */ - tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT); - tp->ecn_flags &= ~TE_ACE_SETUPSENT; - if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { - tp->t_client_accecn_state = tcp_connection_client_classic_ecn_available; - } - } - } else if (TCP_L4S_ENABLED(tp) && ace_flags != 0 && - ace_flags != TH_ACE) { - /* Initialize sender side packet & byte counters */ - tp->t_aecn.t_snd_ce_packets = 5; - tp->t_aecn.t_snd_ect1_bytes = tp->t_aecn.t_snd_ect0_bytes = 1; - tp->t_aecn.t_snd_ce_bytes = 0; - tp->ecn_flags |= TE_ACE_FINAL_ACK_3WHS; - /* - * Client received AccECN SYN-ACK that reflects the state (ECN) - * in which SYN packet was delivered. This helps to detect if - * there was mangling of the SYN packet on the path. Currently, we - * only send Not-ECT on SYN packets. So, we should set Not-ECT in - * all packets if we receive any encoding other than 0|TH_CWR|0. - * If 0|0|0 and 1|1|1 were received, fail Accurate ECN negotiation - * by not setting TE_ACE_SETUPRECEIVED. 
- */ - uint32_t ecn_flags = TE_ACE_SETUPRECEIVED; - if (TCP_L4S_ENABLED(tp)) { - ecn_flags |= TE_SENDIPECT; - } - switch (ace_flags) { - case (0 | TH_CWR | 0): - /* Non-ECT SYN was delivered */ - tp->ecn_flags |= ecn_flags; - tcpstat.tcps_ecn_ace_syn_not_ect++; - tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success; - break; - case (0 | TH_CWR | TH_ECE): - /* ECT1 SYN was delivered */ - tp->ecn_flags |= ecn_flags; - /* Mangling detected, set Non-ECT on outgoing packets */ - tp->ecn_flags &= ~TE_SENDIPECT; - tcpstat.tcps_ecn_ace_syn_ect1++; - tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; - break; - case (TH_AE | 0 | 0): - /* ECT0 SYN was delivered */ - tp->ecn_flags |= ecn_flags; - /* Mangling detected, set Non-ECT on outgoing packets */ - tp->ecn_flags &= ~TE_SENDIPECT; - tcpstat.tcps_ecn_ace_syn_ect0++; - tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; - break; - case (TH_AE | TH_CWR | 0): - /* CE SYN was delivered */ - tp->ecn_flags |= ecn_flags; - /* Mangling detected, set Non-ECT on outgoing packets */ - tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; - tp->ecn_flags &= ~TE_SENDIPECT; - /* - * Although we don't send ECT SYN yet, it is possible that - * a network element changed Not-ECT to ECT and later there - * was congestion at another network element that set it to CE. - * To keep it simple, we will consider this as a congestion event - * for the congestion controller. - * If a TCP client in AccECN mode receives CE feedback in the TCP - * flags of a SYN/ACK, it MUST NOT increment s.cep. - */ - tp->snd_cwnd = 2 * tp->t_maxseg; - tcpstat.tcps_ecn_ace_syn_ce++; - break; - default: - break; - } - if (TCP_ECN_ENABLED(tp)) { - tcp_heuristic_ecn_success(tp); - tcpstat.tcps_ecn_client_success++; - } - /* - * A TCP client in AccECN mode MUST feed back which of the 4 - * possible values of the IP-ECN field that was received in the - * SYN/ACK. Set the setup flag for final ACK accordingly. - * We will initialize r.cep, r.e1b, r.e0b first and then increment - * if CE was set on the IP-ECN field of the SYN-ACK. 
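/*
 * Illustrative sketch (editorial, not part of the xnu patch): the table
 * that the (now refactored) client-side code above encodes.  An AccECN
 * client reads the SYN-ACK's AE/CWR/ECE bits as feedback about the IP-ECN
 * codepoint its SYN arrived with, which is how path mangling is detected.
 * Bit positions follow the usual TCP flag layout with AE in the former NS
 * position (bit 8); names and macros below are local stand-ins.
 */
#include <stdint.h>

#define ACE_ECE 0x040
#define ACE_CWR 0x080
#define ACE_AE  0x100

enum syn_ecn_feedback {
	SYN_WAS_NOT_ECT,	/* 0 |CWR| 0                               */
	SYN_WAS_ECT1,		/* 0 |CWR|ECE: ECT(1) seen, mangling on path */
	SYN_WAS_ECT0,		/* AE| 0 | 0 : ECT(0) seen, mangling on path */
	SYN_WAS_CE,		/* AE|CWR| 0 : CE seen (congestion/mangling) */
	SYN_FEEDBACK_INVALID
};

static enum syn_ecn_feedback
decode_synack_ace(uint16_t ace_bits)
{
	switch (ace_bits & (ACE_AE | ACE_CWR | ACE_ECE)) {
	case ACE_CWR:			return SYN_WAS_NOT_ECT;
	case ACE_CWR | ACE_ECE:		return SYN_WAS_ECT1;
	case ACE_AE:			return SYN_WAS_ECT0;
	case ACE_AE | ACE_CWR:		return SYN_WAS_CE;
	default:			return SYN_FEEDBACK_INVALID;
	}
}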
- */ - tp->t_aecn.t_rcv_ce_packets = 5; - tp->t_aecn.t_rcv_ect0_bytes = tp->t_aecn.t_rcv_ect1_bytes = 1; - tp->t_aecn.t_rcv_ce_bytes = 0; - - /* Increment packet & byte counters based on IP-ECN */ - tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn); - - switch (ip_ecn) { - case IPTOS_ECN_NOTECT: - /* Not-ECT SYN-ACK was received */ - tp->ecn_flags |= TE_ACE_SETUP_NON_ECT; - break; - case IPTOS_ECN_ECT1: - /* ECT1 SYN-ACK was received */ - tp->ecn_flags |= TE_ACE_SETUP_ECT1; - break; - case IPTOS_ECN_ECT0: - /* ECT0 SYN-ACK was received */ - tp->ecn_flags |= TE_ACE_SETUP_ECT0; - break; - case IPTOS_ECN_CE: - tp->ecn_flags |= TE_ACE_SETUP_CE; - break; - } - /* Update the time for this newly SYN-ACK packet */ - if ((to.to_flags & TOF_TS) != 0 && (to.to_tsecr != 0) && - (tp->t_last_ack_tsecr == 0 || TSTMP_GEQ(to.to_tsecr, tp->t_last_ack_tsecr))) { - tp->t_last_ack_tsecr = to.to_tsecr; - } - } else { - if ((tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) && - tp->t_rxtshift == 0) { - tcp_heuristic_ecn_success(tp); - tcpstat.tcps_ecn_not_supported++; - } - if ((tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) && - tp->t_rxtshift > 0) { - tcp_heuristic_ecn_loss(tp); - } - - /* non-ECN-setup SYN-ACK */ - tp->ecn_flags &= ~TE_SENDIPECT; - /* - * If Accurate ECN SYN was retransmitted twice and non-ECN SYN-ACK - * was received, then we consider it as Accurate ECN blackholing - */ - if ((tp->ecn_flags & TE_LOST_SYN) && tp->t_rxtshift <= 2 && - tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { - tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_blackholed; - } - /* - * If SYN wasn't retransmitted twice yet, the server supports neither classic nor - * accurate ECN SYN-ACK. Accurate ECN should already be disabled for both half connections - * as TE_ACE_SETUPRECEIVED flag is not set. - */ - if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { - tp->t_client_accecn_state = tcp_connection_client_ecn_not_available; - } - } + tcp_input_process_accecn_synack(tp, inp, &to, thflags, ace_flags, ip_ecn, + (uint32_t)tlen, (uint32_t)segment_count); /* Do window scaling on this connection? */ if (TCP_WINDOW_SCALE_ENABLED(tp)) { @@ -3976,13 +4230,13 @@ pcbconnect_done: * ACKNOW will be turned on later. 
*/ TCP_INC_VAR(tp->t_unacksegs, segment_count); - if (TCP_ACC_ECN_ON(tp) && ip_ecn == IPTOS_ECN_CE) { + if (tp->accurate_ecn_on && ip_ecn == IPTOS_ECN_CE) { TCP_INC_VAR(tp->t_unacksegs_ce, segment_count); } if (DELAY_ACK(tp, th) && tlen != 0) { if ((tp->t_flags & TF_DELACK) == 0) { tp->t_flags |= TF_DELACK; - tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + tp->t_timer[TCPT_DELACK] = tcp_offset_from_start(tp, tcp_delack); } } else { tp->t_flags |= TF_ACKNOW; @@ -4013,7 +4267,7 @@ pcbconnect_done: TCP_LOG_STATE(tp, TCPS_ESTABLISHED); tp->t_state = TCPS_ESTABLISHED; tp->t_timer[TCPT_KEEP] = - OFFSET_FROM_START(tp, + tcp_offset_from_start(tp, TCP_CONN_KEEPIDLE(tp)); if (nstat_collect) { nstat_route_connect_success( @@ -4233,12 +4487,12 @@ trimthenstep6: goto close; case TCPS_ESTABLISHED: - if ((TCP_ECN_ENABLED(tp) || TCP_ACC_ECN_ON(tp)) && + if ((TCP_ECN_ENABLED(tp) || tp->accurate_ecn_on) && tp->snd_una == tp->iss + 1 && SEQ_GT(tp->snd_max, tp->snd_una)) { /* * If the first data packet on an - * ECN connection, receives a RST + * ECN connection receives a RST * increment the heuristic */ tcp_heuristic_ecn_droprst(tp); @@ -4309,30 +4563,11 @@ close: tp->t_pawsdrop++; tcpstat.tcps_pawsdrop++; - /* - * PAWS-drop when ECN is being used? That indicates - * that ECT-marked packets take a different path, with - * different congestion-characteristics. - * - * Only fallback when we did send less than 2GB as PAWS - * really has no reason to kick in earlier. - */ - if ((TCP_ECN_ENABLED(tp) || TCP_ACC_ECN_ON(tp)) && - inp->inp_stat->rxbytes < 2147483648) { - INP_INC_IFNET_STAT(inp, ecn_fallback_reorder); - tcpstat.tcps_ecn_fallback_reorder++; - tcp_heuristic_ecn_aggressive(tp); - } - if (nstat_collect) { nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, tlen, NSTAT_RX_FLAG_DUPLICATE); - INP_ADD_STAT(inp, ifnet_count_type, - rxpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, - rxbytes, tlen); + INP_ADD_RXSTAT(inp, ifnet_count_type, 1, tlen); tp->t_stat.rxduplicatebytes += tlen; - inp_set_activity_bitmap(inp); } if (tlen > 0) { goto dropafterack; @@ -4356,6 +4591,30 @@ close: goto dropwithreset; } + /* + * For SYN received in TIME_WAIT state: + * A valid SYN with the intention to create a new connection + * should have a higher timestamp than seen for the current + * connection, if timestamp is supported. OR if timestamp + * is either equal or not supported, sequence number of the + * incoming SYN should be greater than the last sequence + * number seen on the current connection. + */ + if (tp->t_state == TCPS_TIME_WAIT && tlen == 0 && + (thflags & (TH_SYN | TH_ACK | TH_RST)) == TH_SYN) { + bool higher_seq = SEQ_GT(th->th_seq, tp->rcv_nxt); + bool newer_time = TSTMP_GT(to.to_tsval, tp->ts_recent) || + (to.to_tsval == tp->ts_recent && higher_seq); + bool tstmp_received = to.to_flags & TOF_TS; + + if ((tstmp_received && newer_time) || (!tstmp_received && higher_seq)) { + iss = tcp_new_isn(tp); + tp = tcp_close(tp); + socket_unlock(so, 1); + goto findpcb; + } + } + /* * Check if there is old data at the beginning of the window * i.e. 
the sequence number is before rcv_nxt @@ -4430,10 +4689,8 @@ close: if (nstat_collect) { nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, todrop, NSTAT_RX_FLAG_DUPLICATE); - INP_ADD_STAT(inp, ifnet_count_type, rxpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, rxbytes, todrop); + INP_ADD_RXSTAT(inp, ifnet_count_type, 1, todrop); tp->t_stat.rxduplicatebytes += todrop; - inp_set_activity_bitmap(inp); } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; @@ -4498,20 +4755,6 @@ close: tcpstat.tcps_rcvpackafterwin++; if (todrop >= tlen) { tcpstat.tcps_rcvbyteafterwin += tlen; - /* - * If a new connection request is received - * while in TIME_WAIT, drop the old connection - * and start over if the sequence numbers - * are above the previous ones. - */ - if (thflags & TH_SYN && - tp->t_state == TCPS_TIME_WAIT && - SEQ_GT(th->th_seq, tp->rcv_nxt)) { - iss = tcp_new_isn(tp); - tp = tcp_close(tp); - socket_unlock(so, 1); - goto findpcb; - } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from @@ -4683,7 +4926,7 @@ close: struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED); TCP_LOG_STATE(tp, TCPS_ESTABLISHED); tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_CONN_KEEPIDLE(tp)); if (nstat_collect) { nstat_route_connect_success( @@ -4706,89 +4949,15 @@ close: * AccECN server in SYN-RCVD state received an ACK with * SYN=0, process handshake encoding present in the ACK for SYN-ACK * and update receive side counters. + * + * When SYN cookies are used, process last ACK only if classic ECN + * wasn't negotiated. */ - if (TCP_ACC_ECN_ON(tp) && (thflags & (TH_SYN | TH_ACK)) == TH_ACK) { - const uint32_t ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE; - if (tlen == 0 && to.to_nsacks == 0) { - /* - * ACK for SYN-ACK reflects the state (ECN) in which SYN-ACK packet - * was delivered. Use Table 4 of Accurate ECN draft to decode only - * when a pure ACK with no SACK block is received. - * 0|0|0 will fail Accurate ECN negotiation and disable ECN. - */ - switch (ace_flags) { - case (0 | TH_CWR | 0): - /* Non-ECT SYN-ACK was delivered */ - tp->t_aecn.t_snd_ce_packets = 5; - if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { - tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success; - } - break; - case (0 | TH_CWR | TH_ECE): - /* ECT1 SYN-ACK was delivered, mangling detected */ - OS_FALLTHROUGH; - case (TH_AE | 0 | 0): - /* ECT0 SYN-ACK was delivered, mangling detected */ - tp->t_aecn.t_snd_ce_packets = 5; - if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { - tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success_ect_mangling_detected; - } - break; - case (TH_AE | TH_CWR | 0): - /* - * CE SYN-ACK was delivered, even though mangling happened, - * CE could indicate congestion at a node after mangling occured. 
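/*
 * Illustrative sketch (editorial, not part of the xnu patch): the
 * TIME_WAIT reopen rule added in the hunk above.  A pure SYN may replace
 * the old connection only if it is provably newer: a larger echoed
 * timestamp than ts_recent, or, when timestamps tie or are absent, a
 * sequence number beyond everything already seen.  The helpers are local
 * stand-ins for SEQ_GT()/TSTMP_GT().
 */
#include <stdbool.h>
#include <stdint.h>

static bool
seq32_gt(uint32_t a, uint32_t b)
{
	/* Modular comparison, as used for both sequence numbers and timestamps. */
	return (int32_t)(a - b) > 0;
}

static bool
timewait_accepts_new_syn(bool has_tstamp, uint32_t tsval, uint32_t ts_recent,
    uint32_t th_seq, uint32_t rcv_nxt)
{
	bool higher_seq = seq32_gt(th_seq, rcv_nxt);

	if (has_tstamp) {
		return seq32_gt(tsval, ts_recent) ||
		    (tsval == ts_recent && higher_seq);
	}
	return higher_seq;
}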
- * Set cwnd to 2 segments - */ - tp->t_aecn.t_snd_ce_packets = 6; - tp->snd_cwnd = 2 * tp->t_maxseg; - if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { - tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success_ect_mangling_detected; - } - break; - case (0 | 0 | 0): - /* Disable ECN, as ACE fields were zeroed */ - tp->ecn_flags &= ~(TE_SETUPRECEIVED | TE_SENDIPECT | - TE_SENDCWR | TE_ACE_SETUPRECEIVED); - /* - * Since last ACK has no ECN flag set and TE_LOST_SYNACK is set, this is in response - * to the second (non-ECN setup) SYN-ACK retransmission. In such a case, we assume - * that AccECN SYN-ACK was blackholed. - */ - if ((tp->ecn_flags & TE_LOST_SYNACK) && tp->t_rxtshift <= 2 && - (tp->t_server_accecn_state == tcp_connection_server_classic_ecn_requested || - tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested)) { - tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_blackholed; - } - /* - * SYN-ACK hasn't been retransmitted twice yet, so this could likely mean bleaching of ACE - * on the path from client to server on last ACK. - */ - if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { - tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_ace_bleaching_detected; - } - break; - default: - /* Unused values for forward compatibility */ - tp->t_aecn.t_snd_ce_packets = 5; - break; - } - /* Update the time for this newly received last ACK */ - if ((to.to_flags & TOF_TS) != 0 && (to.to_tsecr != 0) && - (tp->t_last_ack_tsecr == 0 || TSTMP_GEQ(to.to_tsecr, tp->t_last_ack_tsecr))) { - tp->t_last_ack_tsecr = to.to_tsecr; - } - } else if (to.to_nsacks == 0) { - /* - * If 3rd ACK is lost, we won't receive the last ACK - * encoding. We will move the server to AccECN mode - * regardless. 
- */ - tp->t_aecn.t_snd_ce_packets = 5; - if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { - tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success; - } - } + if ((tp->accurate_ecn_on || (tp->l4s_enabled && !TCP_ECN_ENABLED(tp) && syn_cookie_processed)) + && (thflags & (TH_SYN | TH_ACK)) == TH_ACK) { + uint16_t aceflags = tcp_get_flags(th); + aceflags &= TH_ACE; + tcp_input_process_accecn_last_ack(tp, &to, (uint32_t)tlen, aceflags, syn_cookie_processed); /* Increment receive side counters based on IP-ECN */ tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn); } @@ -4869,6 +5038,9 @@ close: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: + { + const uint64_t byte_limit = MIN(tp->t_stat.bytes_acked, tp->max_sndwnd); + if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; if (tcp_is_ack_ratelimited(tp)) { @@ -4880,7 +5052,7 @@ close: goto dropafterack; } } - if (SEQ_LT(th->th_ack, tp->snd_una - tp->max_sndwnd)) { + if (SEQ_LT(th->th_ack, tp->snd_una - byte_limit)) { if (tcp_is_ack_ratelimited(tp)) { TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad ACK"); drop_reason = DROP_REASON_TCP_OLD_ACK; @@ -4967,7 +5139,7 @@ close: * Process AccECN feedback here for control packets * that don't have s/acked bytes */ - if (TCP_ACC_ECN_ON(tp) && (tp->ecn_flags & TE_SENDIPECT) && + if (tp->accurate_ecn_on && (tp->ecn_flags & TE_SENDIPECT) && (sack_bytes_acked == 0)) { tp->total_ect_packets_acked += 1; @@ -4983,7 +5155,7 @@ close: if (tlen == 0 && (tiwin == tp->snd_wnd || (to.to_nsacks > 0 && sack_bytes_acked > 0))) { - uint32_t old_dupacks; + uint32_t old_dupacks = 0; /* * If both ends send FIN at the same time, * then the ack will be a duplicate ack @@ -5028,9 +5200,11 @@ process_dupack: ++tp->t_dupacks; } - tp->sackhint.sack_bytes_acked += sack_bytes_acked; + if (!TCP_RACK_ENABLED(tp)) { + tp->sackhint.sack_bytes_acked += sack_bytes_acked; + } - if (sack_bytes_acked > 0 && TCP_ACC_ECN_ON(tp) && + if (sack_bytes_acked > 0 && tp->accurate_ecn_on && (tp->ecn_flags & TE_SENDIPECT) && tp->t_state == TCPS_ESTABLISHED) { uint32_t pkts_sacked = tcp_packets_this_ack(tp, sack_bytes_acked); tp->total_ect_packets_acked += pkts_sacked; @@ -5191,7 +5365,7 @@ process_dupack: (tp->t_state == TCPS_ESTABLISHED || tp->t_state == TCPS_FIN_WAIT_1)) { tp->t_timer[TCPT_DELAYFR] = - OFFSET_FROM_START(tp, + tcp_offset_from_start(tp, tp->t_reorderwin); tp->t_flagsext |= TF_DELAY_RECOVERY; tcpstat.tcps_delay_recovery++; @@ -5211,7 +5385,7 @@ process_dupack: } ENTER_FASTRECOVERY(tp); tp->t_timer[TCPT_REXMT] = 0; - if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) { + if (!tp->accurate_ecn_on && TCP_ECN_ENABLED(tp)) { tp->ecn_flags |= TE_SENDCWR; } @@ -5303,9 +5477,7 @@ process_dupack: tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK); } } else { - if (tcp_cubic_minor_fixes) { - exiting_fr = 1; - } + exiting_fr = 1; EXIT_FASTRECOVERY(tp); if (CC_ALGO(tp)->post_fr != NULL) { CC_ALGO(tp)->post_fr(tp, th); @@ -5384,8 +5556,7 @@ process_ACK: tcp_rack_reset_segs_retransmitted(tp); needoutput = 1; } else if (tp->t_timer[TCPT_PERSIST] == 0) { - tcp_set_link_heur_rtomin(tp, inp->inp_last_outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + tcp_set_rto(tp); } if ((prev_t_state == TCPS_SYN_SENT || @@ -5434,7 +5605,7 @@ process_ACK: * Since SYN-ACK has a special encoding, exclude it from below. * Only perform it before CC is called and snd_una is updated. 
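/*
 * Illustrative sketch (editorial, not part of the xnu patch): the ACE
 * value that tcp_get_flags(th) & TH_ACE extracts above is the 3-bit
 * AE/CWR/ECE concatenation used by Accurate ECN as a counter of CE-marked
 * packets.  Newly reported CE marks are the difference modulo 8 against
 * the last value seen.  This is the general AccECN accounting idea; the
 * bit positions and names below are stand-ins.
 */
#include <stdint.h>

static uint8_t
ace_from_flags(uint16_t flags16)	/* flags16 = (th_x2 << 8) | th_flags */
{
	uint8_t ece = (flags16 & 0x040) ? 1 : 0;
	uint8_t cwr = (flags16 & 0x080) ? 2 : 0;
	uint8_t ae  = (flags16 & 0x100) ? 4 : 0;

	return (uint8_t)(ae | cwr | ece);	/* 3-bit ACE value, 0..7 */
}

static uint8_t
new_ce_marks(uint8_t ace_now, uint8_t ace_prev)
{
	/* The counter wraps at 8, so the delta is taken modulo 8. */
	return (uint8_t)((ace_now - ace_prev) & 0x7);
}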
*/ - if (TCP_ACC_ECN_ON(tp) && !(thflags & TH_SYN)) { + if (tp->accurate_ecn_on && !(thflags & TH_SYN)) { /* * For a server in SYN_RECEIVED state (that switched to * ESTABLISHED in this ACK, exclude processing the last ACK @@ -5467,8 +5638,13 @@ process_ACK: tcp_process_accecn(tp, &to, th, pkts_acked, ace); } } else if (TCP_ECN_ENABLED(tp) && (thflags & TH_ECE)) { + uint32_t pkts_acked = tcp_packets_this_ack(tp, acked); /* * For classic ECN, congestion event is receiving TH_ECE. + * Disable ECN if > 90% marking is observed in ACK packets + */ + tcp_ece_aggressive_heur(tp, pkts_acked); + /* * Reduce the congestion window if we haven't * done so. */ @@ -5488,7 +5664,7 @@ process_ACK: tp->ecn_flags |= TE_RECV_ECN_ECE; INP_INC_IFNET_STAT(inp, ecn_recv_ece); tcpstat.tcps_ecn_recv_ece++; - tp->t_ecn_capable_packets_marked++; + tp->t_ecn_capable_packets_marked += pkts_acked; tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD); } } @@ -5674,6 +5850,7 @@ process_ACK: goto process_dupack; } } + } step6: /* @@ -5788,6 +5965,21 @@ dodata: tcp_seq save_start = th->th_seq; tcp_seq save_end = th->th_seq + tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ + + if (th->th_seq == tp->rcv_nxt) { + int mem = tcp_memacct_limited(); + if (mem == MEMACCT_HARDLIMIT || + (mem == MEMACCT_SOFTLIMIT && so->so_rcv.sb_cc > 0)) { + /* + * If we are at the hard limit, just drop. + * If we are at the softlimit, only accept one + * packet into the receive-queue. + */ + drop_reason = DROP_REASON_TCP_INSEQ_MEMORY_PRESSURE; + tcpstat.tcps_rcvmemdrop++; + goto drop; + } + } /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now @@ -5811,7 +6003,7 @@ dodata: if ((tp->t_flags & TF_DELACK) == 0) { tp->t_flags |= TF_DELACK; tp->t_timer[TCPT_DELACK] = - OFFSET_FROM_START(tp, tcp_delack); + tcp_offset_from_start(tp, tcp_delack); } } else { tp->t_flags |= TF_ACKNOW; @@ -5829,11 +6021,7 @@ dodata: TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count); tcpstat.tcps_rcvbyte += tlen; if (nstat_collect) { - INP_ADD_STAT(inp, ifnet_count_type, - rxpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, - rxbytes, tlen); - inp_set_activity_bitmap(inp); + INP_ADD_RXSTAT(inp, ifnet_count_type, 1, tlen); } tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); if (TCP_USE_RLEDBAT(tp, so) && @@ -5885,9 +6073,7 @@ dodata: */ tcp_compute_rcv_rtt(tp, &to, th); - if (tcp_autotune_reorder) { - tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); - } + tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen); if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.data_rcvd != NULL) { tcp_cc_rledbat.data_rcvd(tp, th, &to, tlen); @@ -5944,7 +6130,7 @@ dodata: * We increment t_unacksegs_ce for both data segments and pure ACKs * No need to increment if a FIN has already been received. 
*/ - if (TCP_ACC_ECN_ON(tp) && TCPS_HAVEESTABLISHED(tp->t_state) && + if (tp->accurate_ecn_on && TCPS_HAVEESTABLISHED(tp->t_state) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { if (ip_ecn == IPTOS_ECN_CE) { TCP_INC_VAR(tp->t_unacksegs_ce, segment_count); @@ -6120,14 +6306,14 @@ dropwithreset: if (thflags & TH_ACK) { /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), m->m_len, th, m, (tcp_seq)0, th->th_ack, - TH_RST, &tra); + 0, TH_RST, NULL, 0, 0, 0, &tra, false); } else { if (thflags & TH_SYN) { tlen++; } /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), m->m_len, th, m, th->th_seq + tlen, - (tcp_seq)0, TH_RST | TH_ACK, &tra); + (tcp_seq)0, 0, TH_RST | TH_ACK, NULL, 0, 0, 0, &tra, false); } /* destroy temporarily created socket */ if (dropsocket) { @@ -6239,7 +6425,7 @@ tcp_dooptions(struct tcpcb *tp, u_char *cp0 __counted_by(cnt0), int cnt0, struct continue; } to->to_flags |= TOF_SCALE; - to->to_requested_s_scale = MIN(cp[2], TCP_MAX_WINSHIFT); + to->to_wscale = MIN(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: @@ -6264,13 +6450,14 @@ tcp_dooptions(struct tcpcb *tp, u_char *cp0 __counted_by(cnt0), int cnt0, struct continue; } if (th->th_flags & TH_SYN) { - to->to_flags |= TOF_SACK; + to->to_flags |= TOF_SACKPERM; } break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) { continue; } + to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks_size = optlen - 2; to->to_sacks = cp + 2; @@ -6338,7 +6525,7 @@ tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope) tcp_mss(tp, to->to_mss, ifscope); } if (SACK_ENABLED(tp)) { - if (!(to->to_flags & TOF_SACK)) { + if (!(to->to_flags & TOF_SACKPERM)) { tp->t_flagsext &= ~(TF_SACK_ENABLE); } else { tp->t_flags |= TF_SACK_PERMIT; @@ -6346,7 +6533,7 @@ tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope) } if (to->to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; - tp->requested_s_scale = to->to_requested_s_scale; + tp->requested_s_scale = to->to_wscale; /* Re-enable window scaling, if the option is received */ if (tp->request_r_scale > 0) { @@ -6802,6 +6989,8 @@ compute_rto: TCP_LOG_RTT_INFO(tp); } + tcp_update_pacer_state(tp); + TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar); } @@ -6905,7 +7094,7 @@ tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope) { struct rtentry *rt; struct ifnet *ifp; - int rtt, mss; + int mss; uint32_t bufsize; struct inpcb *inp; struct socket *so; @@ -6983,7 +7172,7 @@ tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope) * or rttvar. Convert from the route-table units * to scaled multiples of the slow timeout timer. 
*/ - if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) { + if (tp->t_srtt == 0 && rt->rt_rmx.rmx_rtt != 0) { tcp_getrt_rtt(tp, rt); } else { tp->t_rttmin = TCPTV_REXMTMIN; @@ -7422,7 +7611,7 @@ tcp_set_foreground_cc(struct socket *so) } else { struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); - if (TCP_L4S_ENABLED(tp)) { + if (tp->l4s_enabled) { tcp_set_new_cc(so, TCP_CC_ALGO_PRAGUE_INDEX); } else { tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX); @@ -7480,10 +7669,6 @@ inp_fc_throttle_tcp(struct inpcb *inp) { tcpcb_ref_t tp = inp->inp_ppcb; - if (!tcp_flow_control_response) { - return; - } - /* * Back off the slow-start threshold and enter * congestion avoidance phase @@ -7499,54 +7684,14 @@ inp_fc_unthrottle_tcp(struct inpcb *inp) tcpcb_ref_t tp = inp->inp_ppcb; struct ifnet *outifp = inp->inp_last_outifp; - if (tcp_flow_control_response) { - if (CC_ALGO(tp)->post_fr != NULL) { - CC_ALGO(tp)->post_fr(tp, NULL); - } - - tp->t_bytes_acked = 0; - - /* - * Reset retransmit shift as we know that the reason - * for delay in sending a packet is due to flow - * control on the outgoing interface. There is no need - * to backoff retransmit timer except for cellular interface - */ - if (tp->t_rxtshift != 0 && outifp != NULL && - IFNET_IS_CELLULAR(outifp)) { - TCP_LOG(tp, "inp_fc_unthrottle_tcp keep rxmit state t_rxtshift %d", tp->t_rxtshift); - } else { - TCP_RESET_REXMT_STATE(tp); - } - - tp->t_flagsext &= ~TF_CWND_NONVALIDATED; - - /* - * Start the output stream again. Since we are - * not retransmitting data, do not reset the - * retransmit timer or rtt calculation. - */ - tcp_output(tp); - return; + if (CC_ALGO(tp)->post_fr != NULL) { + CC_ALGO(tp)->post_fr(tp, NULL); } - /* - * Back off the slow-start threshold and enter - * congestion avoidance phase - */ - if (CC_ALGO(tp)->pre_fr != NULL) { - CC_ALGO(tp)->pre_fr(tp); - } - - tp->snd_cwnd = tp->snd_ssthresh; - tp->t_flagsext &= ~TF_CWND_NONVALIDATED; - /* - * Restart counting for ABC as we changed the - * congestion window just now. - */ tp->t_bytes_acked = 0; - /* Reset retransmit shift as we know that the reason + /* + * Reset retransmit shift as we know that the reason * for delay in sending a packet is due to flow * control on the outgoing interface. There is no need * to backoff retransmit timer. @@ -7558,6 +7703,8 @@ inp_fc_unthrottle_tcp(struct inpcb *inp) TCP_RESET_REXMT_STATE(tp); } + tp->t_flagsext &= ~TF_CWND_NONVALIDATED; + /* * Start the output stream again. 
Since we are * not retransmitting data, do not reset the @@ -7783,28 +7930,6 @@ tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen) return 0; } -#define DUMP_BUF_CHK() { \ - clen -= k; \ - if (clen < 1) \ - goto done; \ - c += k; \ -} - -int -dump_tcp_reass_qlen(char *str __sized_by(str_len), int str_len) -{ - char *c = str; - int k, clen = str_len; - - if (tcp_reass_total_qlen != 0) { - k = scnprintf(c, clen, "\ntcp reass qlen %d\n", tcp_reass_total_qlen); - DUMP_BUF_CHK(); - } - -done: - return str_len - clen; -} - uint32_t tcp_reass_qlen_space(struct socket *so) { diff --git a/bsd/netinet/tcp_ledbat.c b/bsd/netinet/tcp_ledbat.c index 2bd97b674..d52538c33 100644 --- a/bsd/netinet/tcp_ledbat.c +++ b/bsd/netinet/tcp_ledbat.c @@ -184,6 +184,8 @@ tcp_ledbat_cwnd_init(struct tcpcb *tp) { tp->snd_cwnd = tp->t_maxseg * bg_ss_fltsz; tp->bg_ssthresh = tp->snd_ssthresh; + + tcp_update_pacer_state(tp); } /* Function to handle an in-sequence ack which is fast-path processing @@ -369,6 +371,8 @@ ledbat_pp_ack_rcvd(struct tcpcb *tp, uint32_t bytes_acked) /* Congestion avoidance */ ledbat_pp_congestion_avd(tp, bytes_acked, base_rtt, curr_rtt, tcp_globals_now(globals)); } + + tcp_update_pacer_state(tp); } /* Function to process an ack. @@ -428,6 +432,8 @@ tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) if (incr > 0) { update_cwnd(tp, incr, true); } + + tcp_update_pacer_state(tp); } void @@ -486,6 +492,8 @@ tcp_ledbat_post_fr(struct tcpcb *tp, struct tcphdr *th) } tp->t_bytes_acked = 0; tp->t_ccstate->ledbat_md_bytes_acked = 0; + + tcp_update_pacer_state(tp); } /* @@ -517,6 +525,8 @@ tcp_ledbat_after_timeout(struct tcpcb *tp) tcp_ledbat_clear_state(tp); tcp_ledbat_pre_fr(tp); tp->snd_cwnd = tp->t_maxseg; + + tcp_update_pacer_state(tp); } } @@ -537,15 +547,7 @@ tcp_ledbat_after_timeout(struct tcpcb *tp) static int tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th) { - if (tcp_ack_strategy == TCP_ACK_STRATEGY_MODERN) { - return tcp_cc_delay_ack(tp, th); - } else { - if ((tp->t_flags & TF_RXWIN0SENT) == 0 && - (th->th_flags & TH_PUSH) == 0 && (tp->t_unacksegs == 1)) { - return 1; - } - return 0; - } + return tcp_cc_delay_ack(tp, th); } /* Change a connection to use ledbat. 
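/*
 * Illustrative sketch (editorial, not part of the xnu patch): the ledbat
 * and newreno hunks below/above call tcp_update_pacer_state() whenever the
 * congestion window changes.  The body of that routine is not part of this
 * diff; a common way to refresh a pacing rate from cwnd is shown here, and
 * every name, unit, and the gain factor is an assumption for illustration
 * only.
 */
#include <stdint.h>

static uint64_t
pacing_rate_bytes_per_sec(uint32_t snd_cwnd_bytes, uint32_t srtt_usec,
    uint32_t gain_percent)
{
	if (srtt_usec == 0) {
		return 0;	/* no RTT sample yet: leave pacing disabled */
	}
	/* rate = cwnd / srtt, scaled by a gain (e.g. 120%) to avoid underuse. */
	uint64_t rate = ((uint64_t)snd_cwnd_bytes * 1000000ULL) / srtt_usec;

	return (rate * gain_percent) / 100;
}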
First, lower bg_ssthresh value diff --git a/bsd/netinet/tcp_log.c b/bsd/netinet/tcp_log.c index 4a1b99caf..66cf61402 100644 --- a/bsd/netinet/tcp_log.c +++ b/bsd/netinet/tcp_log.c @@ -51,7 +51,9 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED, 0, TLEF_DROP_NECP | TLEF_DROP_PCB | TLEF_DROP_PKT | \ TLEF_SYN_RXMT | TLEF_LOG) #else /* (DEVELOPMENT || DEBUG) */ -#define TCP_LOG_ENABLE_DEFAULT 0 +#define TCP_LOG_ENABLE_DEFAULT \ + (TLEF_CONNECTION | TLEF_DST_LOCAL | TLEF_DST_GW | \ + TLEF_DROP_NECP) #endif /* (DEVELOPMENT || DEBUG) */ uint32_t tcp_log_enable_flags = TCP_LOG_ENABLE_DEFAULT; @@ -355,6 +357,7 @@ tcp_log_keepalive(const char *func_name, int line_no, struct tcpcb *tp, TCP_LOG_COMMON_PCB_FMT "snd_una: %u snd_max: %u " "SO_KA: %d RSTALL: %d TFOPRB: %d idle_time: %u " + "rtimo_probes: %u adaptive_rtimo: %u " "KIDLE: %d KINTV: %d KCNT: %d", func_name, line_no, TCP_LOG_COMMON_PCB_ARGS, @@ -362,7 +365,7 @@ tcp_log_keepalive(const char *func_name, int line_no, struct tcpcb *tp, tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE, tp->t_flagsext & TF_DETECT_READSTALL, tp->t_tfo_probe_state == TFO_PROBE_PROBING, - idle_time, + idle_time, tp->t_rtimo_probes, tp->t_adaptive_rtimo, TCP_CONN_KEEPIDLE(tp), TCP_CONN_KEEPINTVL(tp), TCP_CONN_KEEPCNT(tp)); } @@ -420,8 +423,8 @@ tcp_log_connection(struct tcpcb *tp, const char *event, int error) event, \ TCP_LOG_COMMON_PCB_ARGS, \ tp->t_syn_rcvd, tp->t_syn_sent, \ - inp->inp_stat->rxbytes, inp->inp_stat->txbytes, \ - inp->inp_stat->rxpackets, inp->inp_stat->txpackets, \ + inp->inp_mstat.ms_total.ts_rxbytes, inp->inp_mstat.ms_total.ts_txbytes, \ + inp->inp_mstat.ms_total.ts_rxpackets, inp->inp_mstat.ms_total.ts_txpackets, \ P_MS(tp->t_srtt, TCP_RTT_SHIFT), \ P_MS(tp->t_rttvar, TCP_RTTVAR_SHIFT), \ get_base_rtt(tp), \ @@ -637,8 +640,8 @@ tcp_log_connection_summary(const char *func_name, int line_no, struct tcpcb *tp) TCP_LOG_COMMON_PCB_ARGS, \ duration / TCP_RETRANSHZ, duration % TCP_RETRANSHZ, \ conntime / TCP_RETRANSHZ, conntime % TCP_RETRANSHZ, \ - inp->inp_stat->rxbytes, inp->inp_stat->txbytes, \ - inp->inp_stat->rxpackets, inp->inp_stat->txpackets, \ + inp->inp_mstat.ms_total.ts_rxbytes, inp->inp_mstat.ms_total.ts_txbytes, \ + inp->inp_mstat.ms_total.ts_rxpackets, inp->inp_mstat.ms_total.ts_txpackets, \ tp->t_stat.rxmitpkts, \ tp->t_rcvoopack, tp->t_stat.rxduplicatebytes, tp->t_stat.acks_delayed, tp->t_stat.delayed_acks_sent, \ P_MS(tp->t_srtt, TCP_RTT_SHIFT), \ @@ -1038,8 +1041,8 @@ tcp_log_message(const char *func_name, int line_no, struct tcpcb *tp, const char #define TCP_LOG_MESSAGE_ARGS \ func_name, line_no, \ TCP_LOG_COMMON_PCB_ARGS, \ - inp->inp_stat->rxbytes, inp->inp_stat->txbytes, \ - inp->inp_stat->rxpackets, inp->inp_stat->txpackets, \ + inp->inp_mstat.ms_total.ts_rxbytes, inp->inp_mstat.ms_total.ts_txbytes, \ + inp->inp_mstat.ms_total.ts_rxpackets, inp->inp_mstat.ms_total.ts_txpackets, \ message os_log(OS_LOG_DEFAULT, TCP_LOG_MESSAGE_FMT, @@ -1208,8 +1211,8 @@ tcp_log_output(const char *func_name, int line_no, struct tcpcb *tp, const char #define TCP_LOG_MESSAGE_ARGS \ func_name, line_no, \ TCP_LOG_COMMON_PCB_ARGS, \ - inp->inp_stat->rxbytes, inp->inp_stat->txbytes, \ - inp->inp_stat->rxpackets, inp->inp_stat->txpackets, \ + inp->inp_mstat.ms_total.ts_rxbytes, inp->inp_mstat.ms_total.ts_txbytes, \ + inp->inp_mstat.ms_total.ts_rxpackets, inp->inp_mstat.ms_total.ts_txpackets, \ tp->t_stat.rxmitpkts, tp->t_stat.txretransmitbytes, \ message diff --git a/bsd/netinet/tcp_newreno.c b/bsd/netinet/tcp_newreno.c index 
6337c7018..345644da9 100644 --- a/bsd/netinet/tcp_newreno.c +++ b/bsd/netinet/tcp_newreno.c @@ -130,6 +130,8 @@ void tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp) { tcp_cc_cwnd_init_or_reset(tp); + + tcp_update_pacer_state(tp); } @@ -152,6 +154,8 @@ tcp_newreno_congestion_avd(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd += tp->t_maxseg; } } + + tcp_update_pacer_state(tp); } /* Function to process an ack. */ @@ -199,6 +203,8 @@ tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) incr = ulmin(acked, abc_lim); } tp->snd_cwnd = min(cw + incr, TCP_MAXWIN << tp->snd_scale); + + tcp_update_pacer_state(tp); } void @@ -245,6 +251,8 @@ tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd = tp->snd_ssthresh; } tp->t_bytes_acked = 0; + + tcp_update_pacer_state(tp); } /* Function to change the congestion window when the retransmit @@ -286,6 +294,8 @@ tcp_newreno_after_timeout(struct tcpcb *tp) tp->snd_cwnd = tp->t_maxseg; tcp_cc_resize_sndbuf(tp); + + tcp_update_pacer_state(tp); } } diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 461d8db8b..602ed3139 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -185,72 +185,32 @@ sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) int i, err = 0, changed = 0; - ifnet_ref_t ifp; - err = sysctl_io_number(req, tcp_ecn_outbound, sizeof(int32_t), + err = sysctl_io_number(req, tcp_ecn, sizeof(int32_t), &i, &changed); if (err != 0 || req->newptr == USER_ADDR_NULL) { return err; } if (changed) { - if ((tcp_ecn_outbound == 0 || tcp_ecn_outbound == 1) && - (i == 0 || i == 1)) { - tcp_ecn_outbound = i; - SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out, tcp_ecn_outbound); - return err; - } - if (tcp_ecn_outbound == 2 && (i == 0 || i == 1)) { - /* - * Reset ECN enable flags on non-cellular - * interfaces so that the system default will take - * over - */ - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (!IFNET_IS_CELLULAR(ifp)) { - if_clear_eflags(ifp, - IFEF_ECN_ENABLE | - IFEF_ECN_DISABLE); - } - } - ifnet_head_done(); - } else { - /* - * Set ECN enable flags on non-cellular - * interfaces - */ - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (!IFNET_IS_CELLULAR(ifp)) { - if_set_eflags(ifp, IFEF_ECN_ENABLE); - if_clear_eflags(ifp, IFEF_ECN_DISABLE); - } - } - ifnet_head_done(); - } - tcp_ecn_outbound = i; - SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out, tcp_ecn_outbound); - } - /* Change the other one too as the work is done */ - if (i == 2 || tcp_ecn_inbound == 2) { - tcp_ecn_inbound = i; - SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_negotiate_in, tcp_ecn_inbound); + tcp_ecn = i; + SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn, tcp_ecn); } return err; } +/* TODO: remove ecn_initiate_out once libnetcore ECN cleanup changes land */ int tcp_ecn_outbound = 2; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0, sysctl_change_ecn_setting, "IU", "Initiate ECN for outbound connections"); -int tcp_ecn_inbound = 2; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0, +int tcp_ecn = 1; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn, 0, sysctl_change_ecn_setting, "IU", - "Initiate ECN for inbound connections"); + "ECN system setting (0: disable, 1: enable)"); SYSCTL_SKMEM_TCP_INT(OID_AUTO, packetchain, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_packet_chaining, 50, @@ 
-483,54 +443,51 @@ tcp_send_ecn_flags_on_syn(struct tcpcb *tp) } void -tcp_set_ecn(struct tcpcb *tp, struct ifnet *ifp) +tcp_set_l4s(struct tcpcb *tp, struct ifnet *ifp) { - boolean_t inbound; - - /* - * Socket option has precedence - */ - if (tp->ecn_flags & TE_ECN_MODE_ENABLE) { - tp->ecn_flags |= TE_ENABLE_ECN; - goto check_heuristic; + if (tp->t_state >= TCPS_ESTABLISHED) { + return; } - if (tp->ecn_flags & TE_ECN_MODE_DISABLE) { + /* + * L4S is enabled if, + * 1. It is not disabled explicitly by developer or interface setting or tcp options + * 2. It is enabled either by developer or interface setting or A/B deployment or tcp_options, + * It implicitly enables Accurate ECN which supports ACE and AccECN option for ECN feedback + */ + bool l4s_disabled = (tcp_l4s_developer == tcp_l4s_developer_disable || + (ifp != NULL && ifp->if_l4s_mode == IFRTYPE_L4S_DISABLE) || + (tp->t_flagsext & TF_L4S_DISABLED) == 1); + + tp->l4s_enabled = !l4s_disabled && (tcp_l4s_developer == tcp_l4s_developer_enable || + (ifp != NULL && ifp->if_l4s_mode == IFRTYPE_L4S_ENABLE) || tcp_l4s == 1 || + ((tp->t_flagsext & TF_L4S_ENABLED))); +} + +void +tcp_set_accurate_ecn(struct tcpcb *tp) +{ + if ((tp->ecn_flags & TE_ACC_ECN_ON) == TE_ACC_ECN_ON) { + tp->accurate_ecn_on = true; + } else { + tp->accurate_ecn_on = false; + } +} + +void +tcp_set_ecn(struct tcpcb *tp) +{ + bool ecn_enabled = tcp_ecn_enabled(tp->ecn_flags); + + if (!ecn_enabled || !tcp_heuristic_do_ecn(tp)) { tp->ecn_flags &= ~TE_ENABLE_ECN; return; } - /* - * Per interface setting comes next - */ - if (ifp != NULL) { - if (ifp->if_eflags & IFEF_ECN_ENABLE) { - tp->ecn_flags |= TE_ENABLE_ECN; - goto check_heuristic; - } - if (ifp->if_eflags & IFEF_ECN_DISABLE) { - tp->ecn_flags &= ~TE_ENABLE_ECN; - return; - } - } - /* - * System wide settings come last - */ - inbound = (tp->t_inpcb->inp_socket->so_head != NULL); - if ((inbound && tcp_ecn_inbound == 1) || - (!inbound && tcp_ecn_outbound == 1)) { - tp->ecn_flags |= TE_ENABLE_ECN; - goto check_heuristic; - } else { - tp->ecn_flags &= ~TE_ENABLE_ECN; - } + /* ECN is enabled based on system settings */ + tp->ecn_flags |= TE_ENABLE_ECN; - return; - -check_heuristic: - if (TCP_L4S_ENABLED(tp)) { - /* Allow ECN when Accurate ECN is enabled until heuristics are fixed */ - tp->ecn_flags |= TE_ENABLE_ECN; + if (tp->l4s_enabled) { /* Set the accurate ECN state */ if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_disabled) { tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_feature_enabled; @@ -539,25 +496,28 @@ check_heuristic: tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_feature_enabled; } } - if (!tcp_heuristic_do_ecn(tp) && !TCP_L4S_ENABLED(tp)) { - /* Allow ECN when Accurate ECN is enabled until heuristics are fixed */ - tp->ecn_flags &= ~TE_ENABLE_ECN; - } +} + +bool +tcp_ecn_enabled(uint32_t ecn_flags) +{ /* - * If the interface setting, system-level setting and heuristics - * allow to enable ECN, randomly select 5% of connections to - * enable it + * Socket option has precedence */ - if ((tp->ecn_flags & (TE_ECN_MODE_ENABLE | TE_ECN_MODE_DISABLE - | TE_ENABLE_ECN)) == TE_ENABLE_ECN) { - /* - * Use the random value in iss for randomizing - * this selection - */ - if ((tp->iss % 100) >= tcp_ecn_setup_percentage && !TCP_L4S_ENABLED(tp)) { - /* Don't disable Accurate ECN randomly */ - tp->ecn_flags &= ~TE_ENABLE_ECN; - } + if (ecn_flags & TE_ECN_MODE_ENABLE) { + return true; + } + if (ecn_flags & TE_ECN_MODE_DISABLE) { + return false; + } + + /* + * 
System wide settings come last + */ + if (tcp_ecn == 1) { + return true; + } else { + return false; } } @@ -642,24 +602,22 @@ tcp_add_accecn_option(struct tcpcb *tp, uint16_t flags, uint32_t *__indexable lp } if (max_len < (TCPOLEN_ACCECN_EMPTY + 1 * TCPOLEN_ACCECN_COUNTER)) { - /* Can carry EMPTY option which can be used to test path in SYN-ACK packet */ + /* Can carry EMPTY option (2 bytes) which can be used to test path in SYN-ACK packet */ if (flags & TH_SYN) { - *lp++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | - (TCPOPT_NOP << 8) | TCPOPT_NOP); - *optlen += len + 2; /* 2 NOPs */ - TCP_LOG(tp, "add empty AccECN option, optlen=%u", *optlen); + *(uint16_t *)lp++ = htons((TCPOPT_ACCECN1 << 8) | len); + *optlen += len; } } else if (max_len < (TCPOLEN_ACCECN_EMPTY + 2 * TCPOLEN_ACCECN_COUNTER)) { /* Can carry one option */ len += 1 * TCPOLEN_ACCECN_COUNTER; if (tp->ecn_flags & TE_ACO_ECT1) { *lp++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | ((e1b >> 8) & 0xffff)); - *lp++ = htonl(((e1b & 0xff) << 24) | (TCPOPT_NOP << 16) | (TCPOPT_NOP << 8) | TCPOPT_NOP); + *(uint16_t *)lp++ = htons((uint16_t)((e1b & 0xff) << 8) | TCPOPT_NOP); } else { *lp++ = htonl((TCPOPT_ACCECN0 << 24) | (len << 16) | ((e0b >> 8) & 0xffff)); - *lp++ = htonl(((e0b & 0xff) << 24) | (TCPOPT_NOP << 16) | (TCPOPT_NOP << 8) | TCPOPT_NOP); + *(uint16_t *)lp++ = htons((uint16_t)((e0b & 0xff) << 8) | TCPOPT_NOP); } - *optlen += len + 3; /* 3 NOPs */ + *optlen += len + 1; /* 1 NOPs */ } else if (max_len < (TCPOLEN_ACCECN_EMPTY + 3 * TCPOLEN_ACCECN_COUNTER)) { /* Can carry two options */ len += 2 * TCPOLEN_ACCECN_COUNTER; @@ -691,6 +649,278 @@ tcp_add_accecn_option(struct tcpcb *tp, uint16_t flags, uint32_t *__indexable lp } } +/* + * Insert TCP options according to the supplied parameters to the place + * optp in a consistent way. Can handle unaligned destinations. + * + * The order of the option processing is crucial for optimal packing and + * alignment for the scarce option space. + * + * The optimal order for a SYN/SYN-ACK segment is: + * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. + * + * The SACK options should be last. SACK blocks consume 8*n+2 bytes. + * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). + * At minimum we need 10 bytes (to generate 1 SACK block). If both + * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, + * we only have 10 bytes for SACK options (40 - (12 + 18)). 
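The packing comment above can be made concrete with a little arithmetic. Assuming the usual option sizes (TCP_MAXOLEN = 40, TCPOLEN_TIMESTAMP = 10, TCPOLEN_SACKHDR = 2, TCPOLEN_SACK = 8), walking the TOF_TS and TOF_SACK cases of tcp_addoptions() for a data segment that carries timestamps plus SACK leaves room for the familiar three SACK blocks; a small stand-alone check:

#include <stdio.h>

/* Standard TCP option sizes, mirroring the macros used above. */
#define TCP_MAXOLEN        40
#define TCPOLEN_NOP         1
#define TCPOLEN_TIMESTAMP  10
#define TCPOLEN_SACKHDR     2
#define TCPOLEN_SACK        8

int main(void)
{
    int optlen = 0;

    /* TOF_TS: pad until optlen % 4 == 2, then the 10-byte timestamp option. */
    while (!optlen || optlen % 4 != 2)
        optlen += TCPOLEN_NOP;              /* 2 NOPs */
    optlen += TCPOLEN_TIMESTAMP;            /* optlen == 12 */

    /* TOF_SACK: pad again to optlen % 4 == 2, then the kind/length header. */
    while (!optlen || optlen % 4 != 2)
        optlen += TCPOLEN_NOP;              /* 2 NOPs -> 14 */
    optlen += TCPOLEN_SACKHDR;              /* 16 */

    int sackblks = (TCP_MAXOLEN - optlen) / TCPOLEN_SACK;
    printf("room for %d SACK blocks (optlen before blocks = %d)\n", sackblks, optlen);
    /* Prints: room for 3 SACK blocks (optlen before blocks = 16) */
    return 0;
}

With an 18-byte signature also present, the same walk leaves only 10 bytes, i.e. a single SACK block, which is exactly the case the comment calls out.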
+ */ +uint8_t +tcp_addoptions(struct tcpopt *to, u_char * __ended_by(optend) optp, u_char * optend) +{ + uint32_t mask; + uint8_t optlen = 0; + + for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { + if ((to->to_flags & mask) != mask) { + continue; + } + if (optlen == TCP_MAXOLEN) { + break; + } + switch (to->to_flags & mask) { + case TOF_MSS: + while (optlen % 4) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) { + continue; + } + optlen += TCPOLEN_MAXSEG; + *optp++ = TCPOPT_MAXSEG; + *optp++ = TCPOLEN_MAXSEG; + to->to_mss = htons(to->to_mss); + bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); + optp += sizeof(to->to_mss); + optend = optend; + break; + case TOF_SCALE: + while (!optlen || optlen % 2 != 1) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) { + continue; + } + optlen += TCPOLEN_WINDOW; + *optp++ = TCPOPT_WINDOW; + *optp++ = TCPOLEN_WINDOW; + *optp++ = to->to_wscale; + break; + case TOF_SACKPERM: + while (optlen % 2) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) { + continue; + } + optlen += TCPOLEN_SACK_PERMITTED; + *optp++ = TCPOPT_SACK_PERMITTED; + *optp++ = TCPOLEN_SACK_PERMITTED; + break; + case TOF_TS: + while (!optlen || optlen % 4 != 2) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) { + continue; + } + optlen += TCPOLEN_TIMESTAMP; + *optp++ = TCPOPT_TIMESTAMP; + *optp++ = TCPOLEN_TIMESTAMP; + to->to_tsval = htonl(to->to_tsval); + to->to_tsecr = htonl(to->to_tsecr); + bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); + optp += sizeof(to->to_tsval); + optend = optend; + bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); + optp += sizeof(to->to_tsecr); + optend = optend; + break; + case TOF_SACK: + { + int sackblks = 0; + struct sackblk *sack = (struct sackblk *)(void *)to->to_sacks; + tcp_seq sack_seq; + + while (!optlen || optlen % 4 != 2) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) { + continue; + } + optlen += TCPOLEN_SACKHDR; + *optp++ = TCPOPT_SACK; + sackblks = min(to->to_nsacks, + (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); + *optp++ = TCPOLEN_SACKHDR + (uint8_t)sackblks * TCPOLEN_SACK; + while (sackblks--) { + sack_seq = htonl(sack->start); + bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); + optp += sizeof(sack_seq); + optend = optend; + sack_seq = htonl(sack->end); + bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); + optp += sizeof(sack_seq); + optend = optend; + optlen += TCPOLEN_SACK; + sack++; + } + tcpstat.tcps_sack_send_blocks++; + break; + } + default: + /* SYN cookies are disabled when TFO is used */ + break; + } + } + + /* Terminate and pad TCP options to a 4 byte boundary. */ + if (optlen % 4) { + optlen += TCPOLEN_EOL; + *optp++ = TCPOPT_EOL; + } + /* + * According to RFC 793 (STD0007): + * "The content of the header beyond the End-of-Option option + * must be header padding (i.e., zero)." + * and later: "The padding is composed of zeros." + */ + while (optlen % 4) { + optlen += TCPOLEN_EOL; + *optp++ = TCPOPT_EOL; + } + + ASSERT(optlen <= TCP_MAXOLEN); + return optlen; +} +/* + * Set up the ECN information for the from + * client SYN information. 
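tcp_addoptions() above is driven purely by the to_flags mask, and its padding loops reproduce the layout the header comment describes for a SYN: MSS(4) + NOP(1) + window scale(3) + SACK-permitted(2) + timestamps(10) = 20 bytes. A stand-alone sketch of the resulting byte layout (MSS 1448 and window shift 6 are illustrative values, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

/* Option kinds/lengths as used by tcp_addoptions() above. */
#define TCPOPT_NOP             1
#define TCPOPT_MAXSEG          2
#define TCPOLEN_MAXSEG         4
#define TCPOPT_WINDOW          3
#define TCPOLEN_WINDOW         3
#define TCPOPT_SACK_PERMITTED  4
#define TCPOLEN_SACK_PERMITTED 2
#define TCPOPT_TIMESTAMP       8
#define TCPOLEN_TIMESTAMP      10

int main(void)
{
    /* The layout the function produces for a SYN carrying MSS, wscale,
     * SACK-permitted and timestamps: one NOP aligns the window-scale option,
     * and the total of 20 bytes already ends on a 4-byte boundary, so no
     * EOL padding is appended. */
    uint8_t opt[20] = {
        TCPOPT_MAXSEG, TCPOLEN_MAXSEG, 0x05, 0xa8,          /* MSS 1448 */
        TCPOPT_NOP,                                         /* align wscale */
        TCPOPT_WINDOW, TCPOLEN_WINDOW, 6,                    /* shift 6 */
        TCPOPT_SACK_PERMITTED, TCPOLEN_SACK_PERMITTED,
        TCPOPT_TIMESTAMP, TCPOLEN_TIMESTAMP, 0, 0, 0, 0, 0, 0, 0, 0,
    };

    for (size_t i = 0; i < sizeof(opt); i++)
        printf("%02x ", opt[i]);
    printf("\n");
    return 0;
}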
+ */ +static uint16_t +tcp_accecn_synack_respond(struct tcpcb * tp, uint16_t thflags) +{ + /* Server received either legacy or Accurate ECN setup SYN */ + if (tp->ecn_flags & (TE_SETUPRECEIVED | TE_ACE_SETUPRECEIVED)) { + if (tcp_send_ecn_flags_on_syn(tp)) { + if (tp->l4s_enabled && (tp->ecn_flags & TE_ACE_SETUPRECEIVED)) { + /* + * Accurate ECN mode is on. Initialize packet and byte counters + * for the server sending SYN-ACK. Although s_cep will be initialized + * during input processing of ACK of SYN-ACK, initialize here as well + * in case ACK gets lost. + * + * Non-zero initial values are used to + * support a stateless handshake (see + * Section 5.1 of AccECN draft) and to be + * distinct from cases where the fields + * are incorrectly zeroed. + */ + tp->t_aecn.t_rcv_ce_packets = 5; + tp->t_aecn.t_snd_ce_packets = 5; + + /* Initialize CE byte counter to 0 */ + tp->t_aecn.t_rcv_ce_bytes = tp->t_aecn.t_snd_ce_bytes = 0; + + if (tp->ecn_flags & TE_ACE_SETUP_NON_ECT) { + tp->t_prev_ace_flags = TH_CWR; + thflags |= tp->t_prev_ace_flags; + /* Remove the setup flag as it is also used for final ACK */ + tp->ecn_flags &= ~TE_ACE_SETUP_NON_ECT; + tcpstat.tcps_ecn_ace_syn_not_ect++; + } else if (tp->ecn_flags & TE_ACE_SETUP_ECT1) { + tp->t_prev_ace_flags = (TH_CWR | TH_ECE); + thflags |= tp->t_prev_ace_flags; + tp->ecn_flags &= ~TE_ACE_SETUP_ECT1; + tcpstat.tcps_ecn_ace_syn_ect1++; + } else if (tp->ecn_flags & TE_ACE_SETUP_ECT0) { + tp->t_prev_ace_flags = TH_AE; + thflags |= tp->t_prev_ace_flags; + tp->ecn_flags &= ~TE_ACE_SETUP_ECT0; + tcpstat.tcps_ecn_ace_syn_ect0++; + } else if (tp->ecn_flags & TE_ACE_SETUP_CE) { + tp->t_prev_ace_flags = (TH_AE | TH_CWR); + thflags |= tp->t_prev_ace_flags; + tp->ecn_flags &= ~TE_ACE_SETUP_CE; + /* + * Receive counter is updated on + * all acceptable packets except + * CE on SYN packets (SYN=1, ACK=0) + */ + tcpstat.tcps_ecn_ace_syn_ce++; + } else { + if (tp->t_prev_ace_flags != 0) { + /* Set the flags for retransmitted SYN-ACK same as the previous one */ + thflags |= tp->t_prev_ace_flags; + } else { + /* We shouldn't come here */ + panic("ECN flags (0x%x) not set correctly", tp->ecn_flags); + } + } + /* + * We now send ECT1 packets when + * L4S and Accurate ECN mode is on + */ + tp->ecn_flags |= TE_ACE_SETUPSENT; + if (tp->l4s_enabled) { + tp->ecn_flags |= TE_SENDIPECT; + tcp_set_accurate_ecn(tp); + } + } else if (tp->ecn_flags & TE_SETUPRECEIVED) { + /* + * Setting TH_ECE makes this an ECN-setup + * SYN-ACK + */ + thflags |= TH_ECE; + /* + * Record that we sent the ECN-setup and + * default to setting IP ECT. + */ + tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT); + } + tcpstat.tcps_ecn_server_setup++; + tcpstat.tcps_ecn_server_success++; + } else { + /* + * For classic ECN, we sent an ECN-setup SYN-ACK but it was + * dropped. Fallback to non-ECN-setup + * SYN-ACK and clear flag to indicate that + * we should not send data with IP ECT set + * + * Pretend we didn't receive an + * ECN-setup SYN. + * + * We already incremented the counter + * assuming that the ECN setup will + * succeed. Decrementing here + * tcps_ecn_server_success to correct it. + * + * For Accurate ECN, we don't yet remove TE_ACE_SETUPRECEIVED + * as the client might have received Accurate ECN SYN-ACK. + * We decide Accurate ECN's state on processing last ACK from the client. 
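The first half of tcp_accecn_synack_respond(), whose body continues below, is essentially a lookup table: the server echoes the IP-ECN codepoint it observed on the client's SYN back in the ACE bits (AE/CWR/ECE) of the SYN-ACK, as the TE_ACE_SETUP_* branches above show. A compact restatement of that mapping, using a hypothetical enum in place of those flags (TH_AE is the extra bit carried in th_x2):

#include <stdint.h>
#include <stdio.h>

/* TCP header flag bits, including the AE bit used by Accurate ECN. */
#define TH_ECE 0x0040
#define TH_CWR 0x0080
#define TH_AE  0x0100

/* Hypothetical stand-in for the TE_ACE_SETUP_* state recorded from the SYN. */
enum syn_ecn_codepoint { SYN_NON_ECT, SYN_ECT1, SYN_ECT0, SYN_CE };

/* Same mapping the SYN-ACK code above applies (and caches in t_prev_ace_flags). */
static uint16_t ace_flags_for_synack(enum syn_ecn_codepoint cp)
{
    switch (cp) {
    case SYN_NON_ECT: return TH_CWR;            /* Not-ECT arrived on the SYN */
    case SYN_ECT1:    return TH_CWR | TH_ECE;   /* ECT(1) arrived */
    case SYN_ECT0:    return TH_AE;             /* ECT(0) arrived */
    case SYN_CE:      return TH_AE | TH_CWR;    /* CE arrived */
    }
    return 0;
}

int main(void)
{
    printf("CE on SYN -> SYN-ACK ACE flags 0x%x\n", ace_flags_for_synack(SYN_CE));
    return 0;
}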
+ */ + if (tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) { + tcpstat.tcps_ecn_lost_synack++; + tcpstat.tcps_ecn_server_success--; + tp->ecn_flags |= TE_LOST_SYNACK; + } + if (!tp->l4s_enabled) { + /* Do this only for classic ECN. */ + tp->ecn_flags &= + ~(TE_SETUPRECEIVED | TE_SENDIPECT | + TE_SENDCWR); + } + } + } + return thflags; +} + /* * Tcp output routine: figure out what should be sent and send it. * @@ -721,6 +951,8 @@ int tcp_output(struct tcpcb *tp) { uint32_t tcp_now_local = os_access_once(tcp_now); + uint32_t *tsvalptr; + uint64_t pacing_tx_time; struct inpcb *__single inp = tp->t_inpcb; struct socket *__single so = inp->inp_socket; int32_t len, recwin, sendwin, off; @@ -755,13 +987,15 @@ tcp_output(struct tcpcb *tp) #if MPTCP boolean_t mptcp_acknow; #endif /* MPTCP */ - stats_functional_type ifnet_count_type = stats_functional_type_none; + stats_functional_type ifnet_count_type = stats_functional_type_unclassified; int sotc = so->so_traffic_class; boolean_t do_not_compress = FALSE; bool sack_rescue_rxt = false; bool sack_rxmted = false; bool link_heuristics_enabled = false; + struct ifnet *outifp = inp != NULL ? inp->inp_last_outifp : NULL; + /* * Determine length of data that should be transmitted, * and flags that will be used. @@ -791,8 +1025,6 @@ tcp_output(struct tcpcb *tp) tcp_rxtseg_clean(tp); } - /* If stretch ack was auto-disabled, re-evaluate it */ - tcp_cc_after_idle_stretchack(tp); tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; } tp->t_flags &= ~TF_LASTIDLE; @@ -814,10 +1046,12 @@ tcp_output(struct tcpcb *tp) } #endif /* MPTCP */ - link_heuristics_enabled = if_link_heuristics_enabled(tp->t_inpcb->inp_last_outifp); + link_heuristics_enabled = if_link_heuristics_enabled(outifp); again: tcp_now_local = os_access_once(tcp_now); + pacing_tx_time = 0; + tsvalptr = NULL; #if MPTCP mptcp_acknow = FALSE; @@ -894,8 +1128,7 @@ again: * advertised peer window may not be valid anymore */ if (tp->t_timer[TCPT_REXMT] == 0) { - tcp_set_link_heur_rtomin(tp, inp->inp_last_outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + tcp_set_rto(tp); if (tp->t_timer[TCPT_PERSIST] != 0) { tp->t_timer[TCPT_PERSIST] = 0; tp->t_persist_stop = 0; @@ -935,7 +1168,10 @@ again: somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES)); tcp_set_tso(tp, ifp); soif2kcl(so, (ifp->if_eflags & IFEF_2KCL)); - tcp_set_ecn(tp, ifp); + /* Don't do ECN for Loopback & Cellular */ + if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0 && !IFNET_IS_CELLULAR(ifp)) { + tcp_set_ecn(tp); + } /* * If the route changes, we cannot use the link heuristics @@ -1289,7 +1525,7 @@ after_sack_rexmit: len = tcp_tfo_check(tp, len); } - if (tp->tcp_cc_index == TCP_CC_ALGO_PRAGUE_INDEX && + if ((tp->tcp_cc_index == TCP_CC_ALGO_PRAGUE_INDEX || inp->inp_max_pacing_rate != UINT64_MAX) && tp->t_pacer.tso_burst_size != 0 && len > 0 && (uint32_t)len > tp->t_pacer.tso_burst_size) { len = tp->t_pacer.tso_burst_size; @@ -1656,73 +1892,36 @@ after_sack_rexmit: oldwin = tp->rcv_adv - tp->rcv_nxt; } - if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) { - if (adv >= (int32_t) (2 * tp->t_maxseg)) { - /* - * Update only if the resulting scaled value of - * the window changed, or if there is a change in - * the sequence since the last ack. This avoids - * what appears as dupe ACKS (see rdar://5640997) - * - * If streaming is detected avoid sending too many - * window updates. We will depend on the delack - * timer to send a window update when needed. - * - * If there is more data to read, don't send an ACK. 
- * Otherwise we will end up sending many ACKs if the - * application is doing micro-reads. - */ - if (!(tp->t_flags & TF_STRETCHACK) && - (tp->last_ack_sent != tp->rcv_nxt || - ((oldwin + adv) >> tp->rcv_scale) > - (oldwin >> tp->rcv_scale))) { - goto send; - } + if (adv >= (int32_t) (2 * tp->t_maxseg)) { + /* + * ACK every second full-sized segment, if the + * ACK is advancing or the window becomes bigger + */ + if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat && + (tp->last_ack_sent != tp->rcv_nxt || + ((oldwin + adv) >> tp->rcv_scale) > + (oldwin >> tp->rcv_scale))) { + goto send; } - } else { - if (adv >= (int32_t) (2 * tp->t_maxseg)) { - /* - * ACK every second full-sized segment, if the - * ACK is advancing or the window becomes bigger - */ - if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat && - (tp->last_ack_sent != tp->rcv_nxt || - ((oldwin + adv) >> tp->rcv_scale) > - (oldwin >> tp->rcv_scale))) { - goto send; - } - } else if (tp->t_flags & TF_DELACK) { - /* - * If we delayed the ACK and the window - * is not advancing by a lot (< 2MSS), ACK - * immediately if the last incoming packet had - * the push flag set and we emptied the buffer. - * - * This takes care of a sender doing small - * repeated writes with Nagle enabled. - */ - if (so->so_rcv.sb_cc == 0 && - tp->last_ack_sent != tp->rcv_nxt && - (tp->t_flagsext & TF_LAST_IS_PSH)) { - goto send; - } + } else if (tp->t_flags & TF_DELACK) { + /* + * If we delayed the ACK and the window + * is not advancing by a lot (< 2MSS), ACK + * immediately if the last incoming packet had + * the push flag set and we emptied the buffer. + * + * This takes care of a sender doing small + * repeated writes with Nagle enabled. + */ + if (so->so_rcv.sb_cc == 0 && + tp->last_ack_sent != tp->rcv_nxt && + (tp->t_flagsext & TF_LAST_IS_PSH)) { + goto send; } } - if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat) { - goto send; - } - /* - * Make sure that the delayed ack timer is set if - * we delayed sending a window update because of - * streaming detection. - */ - if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY && - (tp->t_flags & TF_STRETCHACK) && - !(tp->t_flags & TF_DELACK)) { - tp->t_flags |= TF_DELACK; - tp->t_timer[TCPT_DELACK] = - OFFSET_FROM_START(tp, tcp_delack); + if (4 * adv >= (int32_t) so->so_rcv.sb_hiwat) { + goto send; } } @@ -1764,9 +1963,7 @@ after_sack_rexmit: SEQ_GT(tp->snd_max, tp->snd_una) && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { - tcp_set_link_heur_rtomin(tp, inp->inp_last_outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, - tp->t_rxtcur); + tcp_set_rto(tp); goto just_return; } /* @@ -1843,13 +2040,14 @@ send: * hit a retransmission timeout, then we should disable AccECN option * for the rest of the connection. */ - if (TCP_ACC_ECN_ON(tp) && tp->t_state == TCPS_ESTABLISHED && + if (tp->accurate_ecn_on && tp->t_state == TCPS_ESTABLISHED && tp->snd_una == tp->iss + 1 && (tp->snd_fack == tp->iss) && tp->t_rxtshift > 0) { if ((tp->ecn_flags & TE_RETRY_WITHOUT_ACO) == 0) { tp->ecn_flags |= TE_RETRY_WITHOUT_ACO; } } + /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. 
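With the legacy ACK strategy removed above, the window-update decision reduces to three checks: ACK every second full-sized segment while the application keeps the receive buffer drained, ACK a delayed small-write/PSH sequence once the buffer empties, and always ACK when the window opens by a quarter of the receive buffer. A stand-alone restatement of that logic (field names mirror the hunk; this is a sketch, not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-ins for the fields consulted by the window-update check above. */
struct wupd {
    int32_t  adv;            /* newly opened receive window, in bytes */
    uint32_t t_maxseg;
    uint32_t sb_cc, sb_lowat, sb_hiwat;
    bool     ack_advancing;  /* last_ack_sent != rcv_nxt */
    bool     scaled_win_grew;
    bool     delack_pending;
    bool     last_seg_pushed;
};

static bool should_send_window_update(const struct wupd *w)
{
    if (w->adv >= (int32_t)(2 * w->t_maxseg)) {
        /* ACK every second full-sized segment while the app is keeping up. */
        if (w->sb_cc < w->sb_lowat && (w->ack_advancing || w->scaled_win_grew))
            return true;
    } else if (w->delack_pending) {
        /* Small writes with PSH: ACK once the buffer has been drained. */
        if (w->sb_cc == 0 && w->ack_advancing && w->last_seg_pushed)
            return true;
    }
    /* Window opened by at least a quarter of the receive buffer. */
    return 4 * w->adv >= (int32_t)w->sb_hiwat;
}

int main(void)
{
    struct wupd w = { .adv = 4096, .t_maxseg = 1448, .sb_lowat = 1,
                      .sb_hiwat = 128 * 1024, .ack_advancing = true };

    printf("%d\n", should_send_window_update(&w));
    return 0;
}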
@@ -1886,6 +2084,10 @@ send: tp->request_r_scale); optlen += 4; } + /* Check if L4S is enabled after outifp has been set and update the CC */ + if (tp->l4s_enabled && tp->tcp_cc_index == TCP_CC_ALGO_CUBIC_INDEX) { + tcp_set_foreground_cc(so); + } #if MPTCP if (mptcp_enable && (so->so_flags & SOF_MP_SUBFLOW)) { optlen = mptcp_setup_syn_opts(so, opt, opt + sizeof(opt), optlen); @@ -1903,11 +2105,13 @@ send: (flags & TH_RST) == 0 && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_TSTMP))) { - u_int32_t *lp = (u_int32_t *)(void *)(opt + optlen); + uint32_t *lp = (u_int32_t *)(void *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); - *lp++ = htonl(tcp_now_local + tp->t_ts_offset); + + tsvalptr = lp; + lp++; /* tsval will be set later (see access to tsvalptr) */ *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } @@ -1925,7 +2129,7 @@ send: * ACK), include SACK permitted option. If this is a * SYN ACK, include SACK permitted option if peer has * already done so. This is only for active connect, - * since the syncache takes care of the passive connect. + * since the syncookie takes care of the passive connect. */ if ((flags & TH_SYN) && (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) { @@ -1997,7 +2201,7 @@ send: * to send the smallest recommended AccECN Option * if the space wouldn't permit sending all blocks. */ - if (nsack > 2 && TCP_ACC_ECN_ON(tp) && + if (nsack > 2 && tp->accurate_ecn_on && (tp->ecn_flags & TE_RETRY_WITHOUT_ACO) == 0 && tp->ecn_flags & (TE_ACO_ECT1 | TE_ACO_ECT0)) { nsack--; @@ -2058,13 +2262,15 @@ send: * when doing an AccECN session. Don't send AccECN option * if retransmitting a SYN-ACK or a data segment */ - if ((TCP_ACC_ECN_ON(tp) || - (TCP_L4S_ENABLED(tp) && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) && + if ((tp->accurate_ecn_on || + (tp->l4s_enabled && (flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) && (tp->ecn_flags & TE_ACE_SETUPRECEIVED))) && (tp->ecn_flags & TE_RETRY_WITHOUT_ACO) == 0) { uint32_t *lp = (uint32_t *)(void *)(opt + optlen); /* lp will become outdated after options are added */ tcp_add_accecn_option(tp, flags, lp, (uint8_t *)&optlen); + /* Make sure we didn't write more than 40 bytes */ + ASSERT((u_char *)lp - opt <= MAX_TCPOPTLEN); } /* Pad TCP options to a 4 byte boundary */ if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) { @@ -2083,7 +2289,7 @@ send: * We have completed handshake and are in ESTABLISHED state, and * This is not the final ACK of 3WHS. */ - if (TCP_ACC_ECN_ON(tp) && TCPS_HAVEESTABLISHED(tp->t_state) && + if (tp->accurate_ecn_on && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) == 0) { uint8_t ace = tp->t_aecn.t_rcv_ce_packets & TCP_ACE_MASK; if (ace & 0x01) { @@ -2129,121 +2335,11 @@ send: */ if ((flags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK) && (tp->ecn_flags & TE_ENABLE_ECN)) { - /* Server received either legacy or Accurate ECN setup SYN */ - if (tp->ecn_flags & (TE_SETUPRECEIVED | TE_ACE_SETUPRECEIVED)) { - if (tcp_send_ecn_flags_on_syn(tp)) { - if (TCP_L4S_ENABLED(tp) && (tp->ecn_flags & TE_ACE_SETUPRECEIVED)) { - /* - * Accurate ECN mode is on. Initialize packet and byte counters - * for the server sending SYN-ACK. Although s_cep will be initialized - * during input processing of ACK of SYN-ACK, initialize here as well - * in case ACK gets lost. 
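The timestamp hunk above no longer writes TSval while the options are being built; it only remembers tsvalptr, and the value is filled in later from t_ts_offset + t_latest_tx (see the hunk further down) so that a paced segment carries the time it is actually released rather than the time tcp_output() started, keeping the RTT estimate honest. A small model of that two-phase fill, with illustrative values for the offset and t_latest_tx:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_TSTAMP_HDR 0x0101080a   /* NOP NOP TIMESTAMP 10, as in the kernel */

int main(void)
{
    uint32_t opt[3];
    uint32_t *lp = opt;
    uint32_t *tsvalptr;

    /* Phase 1: while building options, leave TSval blank and remember its slot. */
    *lp++ = htonl(TCPOPT_TSTAMP_HDR);
    tsvalptr = lp;                      /* TSval goes here later */
    lp++;
    *lp = htonl(12345u);                /* TSecr: peer's ts_recent (illustrative) */

    /* Phase 2: just before handing the segment down, stamp the paced send time. */
    uint32_t ts_offset = 0x5a5a0000u;   /* per-connection randomization (illustrative) */
    uint32_t latest_tx = 400123u;       /* tcp_now plus pacing delay, in ms ticks */
    *tsvalptr = htonl(ts_offset + latest_tx);

    const uint8_t *p = (const uint8_t *)opt;
    for (int i = 0; i < 12; i++)
        printf("%02x ", p[i]);
    printf("\n");
    return 0;
}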
- * - * Non-zero initial values are used to - * support a stateless handshake (see - * Section 5.1 of AccECN draft) and to be - * distinct from cases where the fields - * are incorrectly zeroed. - */ - tp->t_aecn.t_rcv_ce_packets = 5; - tp->t_aecn.t_snd_ce_packets = 5; - - /* Initialize CE byte counter to 0 */ - tp->t_aecn.t_rcv_ce_bytes = tp->t_aecn.t_snd_ce_bytes = 0; - - if (tp->ecn_flags & TE_ACE_SETUP_NON_ECT) { - tp->t_prev_ace_flags = TH_CWR; - flags |= tp->t_prev_ace_flags; - /* Remove the setup flag as it is also used for final ACK */ - tp->ecn_flags &= ~TE_ACE_SETUP_NON_ECT; - tcpstat.tcps_ecn_ace_syn_not_ect++; - } else if (tp->ecn_flags & TE_ACE_SETUP_ECT1) { - tp->t_prev_ace_flags = (TH_CWR | TH_ECE); - flags |= tp->t_prev_ace_flags; - tp->ecn_flags &= ~TE_ACE_SETUP_ECT1; - tcpstat.tcps_ecn_ace_syn_ect1++; - } else if (tp->ecn_flags & TE_ACE_SETUP_ECT0) { - tp->t_prev_ace_flags = TH_AE; - flags |= tp->t_prev_ace_flags; - tp->ecn_flags &= ~TE_ACE_SETUP_ECT0; - tcpstat.tcps_ecn_ace_syn_ect0++; - } else if (tp->ecn_flags & TE_ACE_SETUP_CE) { - tp->t_prev_ace_flags = (TH_AE | TH_CWR); - flags |= tp->t_prev_ace_flags; - tp->ecn_flags &= ~TE_ACE_SETUP_CE; - /* - * Receive counter is updated on - * all acceptable packets except - * CE on SYN packets (SYN=1, ACK=0) - */ - tcpstat.tcps_ecn_ace_syn_ce++; - } else { - if (tp->t_prev_ace_flags != 0) { - /* Set the flags for retransmitted SYN-ACK same as the previous one */ - flags |= tp->t_prev_ace_flags; - } else { - /* We shouldn't come here */ - panic("ECN flags (0x%x) not set correctly", tp->ecn_flags); - } - } - /* - * We now send ECT1 packets when - * L4S and Accurate ECN mode is on - */ - tp->ecn_flags |= TE_ACE_SETUPSENT; - if (TCP_L4S_ENABLED(tp)) { - tp->ecn_flags |= TE_SENDIPECT; - } - } else if (tp->ecn_flags & TE_SETUPRECEIVED) { - /* - * Setting TH_ECE makes this an ECN-setup - * SYN-ACK - */ - flags |= TH_ECE; - /* - * Record that we sent the ECN-setup and - * default to setting IP ECT. - */ - tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT); - } - tcpstat.tcps_ecn_server_setup++; - tcpstat.tcps_ecn_server_success++; - } else { - /* - * For classic ECN, we sent an ECN-setup SYN-ACK but it was - * dropped. Fallback to non-ECN-setup - * SYN-ACK and clear flag to indicate that - * we should not send data with IP ECT set - * - * Pretend we didn't receive an - * ECN-setup SYN. - * - * We already incremented the counter - * assuming that the ECN setup will - * succeed. Decrementing here - * tcps_ecn_server_success to correct it. - * - * For Accurate ECN, we don't yet remove TE_ACE_SETUPRECEIVED - * as the client might have received Accurate ECN SYN-ACK. - * We decide Accurate ECN's state on processing last ACK from the client. - */ - if (tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) { - tcpstat.tcps_ecn_lost_synack++; - tcpstat.tcps_ecn_server_success--; - tp->ecn_flags |= TE_LOST_SYNACK; - } - if (!TCP_L4S_ENABLED(tp)) { - /* Do this only for classic ECN. */ - tp->ecn_flags &= - ~(TE_SETUPRECEIVED | TE_SENDIPECT | - TE_SENDCWR); - } - } - } + flags = tcp_accecn_synack_respond(tp, flags); } else if ((flags & (TH_SYN | TH_ACK)) == TH_SYN && (tp->ecn_flags & TE_ENABLE_ECN)) { if (tcp_send_ecn_flags_on_syn(tp)) { - if (TCP_L4S_ENABLED(tp)) { + if (tp->l4s_enabled) { /* * We are negotiating AccECN in SYN. 
* We only set TE_SENDIPECT after the handshake @@ -2277,7 +2373,7 @@ send: } tp->ecn_flags &= ~TE_SENDIPECT; } - } else if (TCP_ACC_ECN_ON(tp) && (tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) && + } else if (tp->accurate_ecn_on && (tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) && len == 0 && (flags & (TH_FLAGS_ALL)) == TH_ACK) { /* * Client has processed SYN-ACK and moved to ESTABLISHED. @@ -2387,6 +2483,41 @@ send: } } + if (!(flags & TH_SYN) && + ((tp->accurate_ecn_on && (tp->ecn_flags & TE_SENDIPECT) != 0) || + inp->inp_max_pacing_rate != UINT64_MAX)) { + uint32_t pacing_delay; + + pacing_delay = tcp_pacer_get_packet_tx_time(tp, len, &pacing_tx_time); + + if (TSTMP_GT(tcp_now_local + pacing_delay, tp->t_latest_tx)) { + /* + * We need to make sure that time never moves backwards. This is + * needed because `tcp_now` is not the same as `microuptime` + * and thus two threads trying to send (one from the app, one + * from dlil_input) may end up with different views on the time + * and thus we may end up going backwards... + * So, make sure t_latest_tx is strictly increasing. + */ + tp->t_latest_tx = tcp_now_local + pacing_delay; + } + } else { + if (TSTMP_GT(tcp_now_local, tp->t_latest_tx)) { + tp->t_latest_tx = tcp_now_local; + } + } + + if (tsvalptr != NULL) { + uint32_t tsval; + + /* + * pacing_delay is folded into t_latest_tx, so that our + * RTT-estimate is not artificially inflated. + */ + tsval = tp->t_ts_offset + tp->t_latest_tx; + *tsvalptr = htonl(tsval); + } + if (max_linkhdr + hdrlen > MCLBYTES) { panic("tcphdr too big"); } @@ -2435,10 +2566,7 @@ send: if (nstat_collect) { nstat_route_tx(inp->inp_route.ro_rt, 1, len, NSTAT_TX_FLAG_RETRANSMIT); - INP_ADD_STAT(inp, ifnet_count_type, - txpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, - txbytes, len); + INP_ADD_TXSTAT(inp, ifnet_count_type, 1, len); tp->t_stat.txretransmitbytes += len; tp->t_stat.rxmitpkts++; } @@ -2450,17 +2578,13 @@ send: tcpstat.tcps_sndbyte += len; if (nstat_collect) { - INP_ADD_STAT(inp, ifnet_count_type, - txpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, - txbytes, len); + INP_ADD_TXSTAT(inp, ifnet_count_type, 1, len); } if (tp->ecn_flags & TE_SENDIPECT) { tp->t_ecn_capable_packets_sent++; } inp_decr_sndbytes_unsent(so, len); } - inp_set_activity_bitmap(inp); #if MPTCP if (tp->t_mpflags & TMPF_MPTCP_TRUE) { tcpstat.tcps_mp_sndpacks++; @@ -2670,16 +2794,12 @@ send: if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(void *)(ip6 + 1); - tcp_fillheaders(m, tp, ip6, th); + tcp_fillheaders(m, tp, ip6, th, NULL, NULL); - if (TCP_L4S_ENABLED(tp) && TCP_ACC_ECN_ON(tp)) { + if (tp->accurate_ecn_on) { /* We send ECT1 for ALL packets (data, control, fast retransmits, RTO) */ if ((tp->ecn_flags & TE_SENDIPECT) != 0 && !(flags & TH_SYN)) { ip6->ip6_flow |= htonl(IPTOS_ECN_ECT1 << 20); - uint64_t tx_time = tcp_pacer_get_packet_tx_time(tp, (uint16_t)len); - if (tx_time) { - tcp_set_mbuf_tx_time(m, tx_time); - } } } else { if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && @@ -2696,16 +2816,12 @@ send: ip = mtod(m, struct ip *); th = (struct tcphdr *)(void *)(ip + 1); /* this picks up the pseudo header (w/o the length) */ - tcp_fillheaders(m, tp, ip, th); + tcp_fillheaders(m, tp, ip, th, NULL, NULL); - if (TCP_L4S_ENABLED(tp) && TCP_ACC_ECN_ON(tp)) { + if (tp->accurate_ecn_on) { /* We send ECT1 for ALL packets (data, control, fast retransmits, RTO) */ if ((tp->ecn_flags & TE_SENDIPECT) != 0 && !(flags & TH_SYN)) { ip->ip_tos |= IPTOS_ECN_ECT1; - uint64_t tx_time = tcp_pacer_get_packet_tx_time(tp, (uint16_t)len); - 
if (tx_time) { - tcp_set_mbuf_tx_time(m, tx_time); - } } } else { if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && @@ -2720,6 +2836,10 @@ send: #endif /* PF_ECN */ } + if (pacing_tx_time) { + mbuf_set_tx_time(m, pacing_tx_time); + } + /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. @@ -2791,8 +2911,7 @@ send: th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; } /* Separate AE from flags */ - th->th_flags = (flags & (TH_FLAGS_ALL)); - th->th_x2 = (flags & (TH_AE)) >> 8; + tcp_set_flags(th, flags); th->th_win = htons((u_short) (recwin >> tp->rcv_scale)); tp->t_last_recwin = recwin; if (!(so->so_flags & SOF_MP_SUBFLOW)) { @@ -2924,13 +3043,13 @@ send: } if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; - tp->t_sndtime = tcp_now_local; + tp->t_sndtime = tp->t_latest_tx; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { - tp->t_rtttime = tcp_now_local; + tp->t_rtttime = tp->t_latest_tx; tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; @@ -2952,46 +3071,11 @@ timer: tp->t_persist_stop = 0; TCP_RESET_REXMT_STATE(tp); } - tcp_set_link_heur_rtomin(tp, inp->inp_last_outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + tcp_set_rto(tp); } - /* - * Set tail loss probe timeout if new data is being - * transmitted. This will be supported only when - * SACK option is enabled on a connection. - * - * Every time new data is sent PTO will get reset. - */ - if (tcp_enable_tlp && len != 0 && tp->t_state == TCPS_ESTABLISHED && - SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) && - tp->snd_nxt == tp->snd_max && - SEQ_GT(tp->snd_nxt, tp->snd_una) && - tp->t_rxtshift == 0 && - (tp->t_flagsext & (TF_SENT_TLPROBE | TF_PKTS_REORDERED)) == 0) { - uint32_t pto, srtt; - struct ifnet *outifp = tp->t_inpcb->inp_last_outifp; - - /* - * Don't use TLP on congested link - */ - srtt = tp->t_srtt >> TCP_RTT_SHIFT; - if (((tcp_link_heuristics_flags & TCP_LINK_HEUR_NOTLP) != 0 && - if_link_heuristics_enabled(outifp)) == false) { - pto = 2 * srtt; - if ((tp->snd_max - tp->snd_una) <= tp->t_maxseg) { - pto += tcp_delack; - } else { - pto += 2; - } - - /* if RTO is less than PTO, choose RTO instead */ - if (tp->t_rxtcur < pto) { - pto = tp->t_rxtcur; - } - - tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto); - } + if (tcp_enable_tlp && len != 0) { + tcp_set_pto(tp); } } else { /* @@ -3009,7 +3093,7 @@ timer: } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { tp->snd_max = tp->snd_nxt + len; - tp->t_sndtime = tcp_now_local; + tp->t_sndtime = tp->t_latest_tx; } } @@ -3091,7 +3175,7 @@ timer: u_int32_t pass_flags; if (!necp_socket_is_allowed_to_send_recv(inp, NULL, 0, &policy_id, &route_rule_id, &skip_policy_id, &pass_flags)) { TCP_LOG_DROP_NECP(isipv6 ? (void *)ip6 : (void *)ip, th, tp, true); - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_NECP, NULL, 0); + m_drop_if(m, outifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_NECP, NULL, 0); error = EHOSTUNREACH; goto out; } @@ -3131,8 +3215,8 @@ timer: m->m_nextpkt = NULL; - if (inp->inp_last_outifp != NULL && - !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { + if (outifp != NULL && + !(outifp->if_flags & IFF_LOOPBACK)) { /* Hint to prioritize this packet if * 1. if the packet has no data * 2. the interface supports transmit-start model and did @@ -3141,9 +3225,9 @@ timer: * 4. there is no outstanding data on this connection. * 5. 
Link heuristics are not enabled for the interface */ - if (len == 0 && (inp->inp_last_outifp->if_eflags & (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART) { + if (len == 0 && (outifp->if_eflags & (IFEF_TXSTART | IFEF_NOACKPRI)) == IFEF_TXSTART) { if (link_heuristics_enabled && (tcp_link_heuristics_flags & TCP_LINK_HEUR_NOACKPRI) != 0) { - IF_TCP_STATINC(inp->inp_last_outifp, linkheur_noackpri); + IF_TCP_STATINC(outifp, linkheur_noackpri); } else { if (th->th_flags == TH_ACK && tp->snd_una == tp->snd_max && @@ -3194,7 +3278,8 @@ timer: /* Set both RACK and EVER retransmitted flags */ retransmit_flag = (m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) ? TCP_SEGMENT_RETRANSMITTED : 0; } - tcp_seg_sent_insert(tp, seg, ntohl(th->th_seq), ntohl(th->th_seq) + len, tcp_now_local, retransmit_flag); + tcp_seg_sent_insert(tp, seg, ntohl(th->th_seq), + ntohl(th->th_seq) + len, tp->t_latest_tx, retransmit_flag); } if ((th->th_flags & TH_SYN) != 0) { @@ -3216,8 +3301,7 @@ timer: } } TCP_LOG_TH_FLAGS(isipv6 ? (void *)ip6 : (void *)ip, th, tp, true, - inp->inp_last_outifp != NULL ? inp->inp_last_outifp : - inp->inp_boundifp); + outifp != NULL ? outifp : inp->inp_boundifp); if (link_heuristics_enabled && (tcp_link_heuristics_flags & TCP_LINK_HEUR_RXMT_COMP) != 0 && (len != 0 || (th->th_flags & TH_FIN) != 0)) { @@ -3228,7 +3312,7 @@ timer: uint32_t gencnt = ntohl(th->th_seq) & TCP_COMP_RXMT_GENCNT_MASK; if ((m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) != 0 && gencnt == tp->t_comp_rxmt_gencnt) { - IF_TCP_STATINC(inp->inp_last_outifp, linkheur_comprxmt); + IF_TCP_STATINC(outifp, linkheur_comprxmt); m->m_pkthdr.comp_gencnt = gencnt; } else { tp->t_comp_rxmt_gencnt = gencnt; @@ -3332,7 +3416,7 @@ timer: } out: if (tp->t_pktlist_head != NULL) { - m_drop_list(tp->t_pktlist_head, NULL, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_PKT_UNSENT, NULL, 0); + m_drop_list(tp->t_pktlist_head, outifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_TCP_PKT_UNSENT, NULL, 0); } TCP_PKTLIST_CLEAR(tp); @@ -3346,8 +3430,7 @@ out: tp->t_timer[TCPT_PERSIST] == 0 && (len != 0 || (flags & (TH_SYN | TH_FIN)) != 0 || so->so_snd.sb_cc > 0)) { - tcp_set_link_heur_rtomin(tp, inp->inp_last_outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + tcp_set_rto(tp); } tp->snd_cwnd = tp->t_maxseg; tp->t_bytes_acked = 0; @@ -3390,7 +3473,7 @@ out: */ if ((error == EHOSTUNREACH || error == ENETDOWN || error == EADDRNOTAVAIL) && TCPS_HAVERCVDSYN(tp->t_state) && - !inp_restricted_send(inp, inp->inp_last_outifp)) { + !inp_restricted_send(inp, outifp)) { tp->t_softerror = error; TCP_LOG_OUTPUT(tp, "soft error %d silently handled", error); error = 0; @@ -3425,6 +3508,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, struct inpcb *__single inp = tp->t_inpcb; struct ifnet *__single outif = NULL; bool check_qos_marking_again = (so->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE) ? 
FALSE : TRUE; + bool fadv_congested = FALSE; union { struct route _ro; @@ -3627,6 +3711,10 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, ifdenied = (ipoa.ipoa_flags & IPOAF_R_IFDENIED); } + if (adv->code == FADV_CONGESTED) { + fadv_congested = TRUE; + } + if (chain || error) { /* * If we sent down a chain then we are done since @@ -3673,6 +3761,13 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, } } + if (fadv_congested && !IN_FASTRECOVERY(tp) && !(tp->t_flags & TF_CLOSING) && + tp->t_state == TCPS_ESTABLISHED) { + TCP_LOG_OUTPUT(tp, "flow congestion notified"); + tcp_local_congestion_notification(tp); + tcp_ccdbg_trace(tp, NULL, TCP_CC_FLOW_CONGESTION_NOTIFIED); + } + /* * When an interface queue gets suspended, some of the * packets are dropped. Return ENOBUFS, to update the @@ -3697,7 +3792,9 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, * outgoing interface */ if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) { + ifnet_head_lock_shared(); outif = ifindex2ifnet[ip6oa.ip6oa_boundif]; + ifnet_head_done(); } else if (ro6.ro_rt != NULL) { outif = ro6.ro_rt->rt_ifp; } @@ -3758,8 +3855,7 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, * reset the retransmit timer */ tcp_getrt_rtt(tp, tp->t_inpcb->in6p_route.ro_rt); - tcp_set_link_heur_rtomin(tp, inp->inp_last_outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + tcp_set_rto(tp); } return error; #undef ro @@ -3793,7 +3889,7 @@ tcp_setpersist(struct tcpcb *tp) TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], t * tcp_backoff[tp->t_rxtshift], tcptv_persmin_val, TCPTV_PERSMAX, 0); - tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]); + tp->t_timer[TCPT_PERSIST] = tcp_offset_from_start(tp, tp->t_timer[TCPT_PERSIST]); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) { tp->t_rxtshift++; diff --git a/bsd/netinet/tcp_pacing.c b/bsd/netinet/tcp_pacing.c new file mode 100644 index 000000000..d46f5f995 --- /dev/null +++ b/bsd/netinet/tcp_pacing.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* TCP-pacing implementation and helper functions */ + +#include "tcp_includes.h" + +static uint64_t +microuptime_ns(void) +{ + uint64_t abstime = mach_absolute_time(); + uint64_t ns = 0; + + absolutetime_to_nanoseconds(abstime, &ns); + + return ns; +} + +/* Compute interval to use for specified (size) amount of data */ +static uint32_t +tcp_pacer_get_packet_interval(struct tcpcb *tp, uint64_t size) +{ + uint64_t rate = tp->t_pacer.rate; + uint64_t interval; + + if (rate == 0) { + os_log_error(OS_LOG_DEFAULT, + "%s: pacer rate shouldn't be 0, CCA is %s (cwnd=%u, smoothed rtt=%u ms)", + __func__, CC_ALGO(tp)->name, tp->snd_cwnd, tp->t_srtt >> TCP_RTT_SHIFT); + + return 0; + } + + interval = (size * NSEC_PER_SEC) / rate; + + if (interval > UINT32_MAX) { + interval = UINT32_MAX; + } + + return (uint32_t)interval; +} + +/* + * Computes packet's (of length pkt_len) tx_time according to the TCP-connection + * state. Also, returns the delay between now and the tx_time in milli-seconds. + * All values are in nano-seconds. + */ +uint32_t +tcp_pacer_get_packet_tx_time(struct tcpcb *tp, int pkt_len, uint64_t *tx_time) +{ + uint64_t now = microuptime_ns(); + + if (pkt_len < 0) { + pkt_len = 0; + } + + if (tp->t_pacer.packet_tx_time == 0) { + tp->t_pacer.packet_tx_time = now; + tp->t_pacer.current_size = pkt_len; + } else { + if (tp->t_pacer.current_size >= tp->t_pacer.tso_burst_size) { + /* + * Increment tx_time by packet_interval and + * reset current_size to this packet's len + */ + tp->t_pacer.packet_tx_time += + tcp_pacer_get_packet_interval(tp, tp->t_pacer.current_size); + tp->t_pacer.current_size = pkt_len; + if (now > tp->t_pacer.packet_tx_time) { + /* + * If current time is bigger, then application + * has already paced the packet. Also, we can't + * set tx_time in the past. + */ + tp->t_pacer.packet_tx_time = now; + } + } else { + tp->t_pacer.current_size += pkt_len; + } + } + + if (now < tp->t_pacer.packet_tx_time) { + *tx_time = tp->t_pacer.packet_tx_time; + } else { + *tx_time = now; + } + + /* + * tcp_pacer_get_packet_interval() guarantees that the below substraction + * is less than UINT32_MAX. + */ + return (uint32_t)(*tx_time - now) / NSEC_PER_MSEC; +} + +#define MSEC_PER_SEC (1000) /* milliseconds per second */ +uint64_t +tcp_compute_measured_rate(const struct tcpcb *tp) +{ + uint32_t srtt = tp->t_srtt; + uint64_t rate; + + if (srtt == 0) { + /* Can't pace when it's at 0 */ + return 0; + } + + rate = tp->snd_cwnd; + + /* Multiply by MSEC_PER_SEC as srtt is in milliseconds */ + rate *= MSEC_PER_SEC; + rate = (rate << TCP_RTT_SHIFT) / srtt; + + return rate; +} + +#define BURST_SHIFT (12) /* 1/(2^12) = 0.000244s, we allow a burst queue of at least 250us */ +void +tcp_update_pacer_state(struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + uint32_t burst; + uint64_t rate; + + rate = tcp_compute_measured_rate(tp); + /* Use 200% rate when in slow start */ + if (tp->snd_cwnd < tp->snd_ssthresh) { + rate *= 2; + } + + if (inp->inp_max_pacing_rate != UINT64_MAX) { + if (inp->inp_max_pacing_rate < rate) { + rate = inp->inp_max_pacing_rate; + } + } + burst = (uint32_t)(rate >> BURST_SHIFT); + + tp->t_pacer.rate = rate; + tp->t_pacer.tso_burst_size = max(tp->t_maxseg, burst); +} diff --git a/bsd/netinet/tcp_pacing.h b/bsd/netinet/tcp_pacing.h new file mode 100644 index 000000000..6a668fe1f --- /dev/null +++ b/bsd/netinet/tcp_pacing.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. 
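Putting the tcp_pacing.c helpers above together: the rate is cwnd per smoothed RTT (doubled in slow start and capped by any per-socket inp_max_pacing_rate), the TSO burst is roughly 250 microseconds' worth of that rate but never less than one MSS, and successive bursts are stamped one packet interval apart. A stand-alone model of the arithmetic for one illustrative connection (TCP_RTT_SHIFT = 5 is the conventional value; the numbers are not from the patch):

#include <stdint.h>
#include <stdio.h>

#define TCP_RTT_SHIFT  5              /* srtt is kept left-shifted by 5, as usual */
#define MSEC_PER_SEC   1000ULL
#define NSEC_PER_SEC   1000000000ULL
#define BURST_SHIFT    12             /* ~250us worth of data per burst */

int main(void)
{
    uint32_t snd_cwnd = 100000;                  /* bytes */
    uint32_t t_maxseg = 1448;
    uint32_t srtt     = 50 << TCP_RTT_SHIFT;     /* smoothed RTT of 50 ms */

    /* tcp_compute_measured_rate(): cwnd per smoothed RTT, in bytes/second. */
    uint64_t rate = ((uint64_t)snd_cwnd * MSEC_PER_SEC << TCP_RTT_SHIFT) / srtt;

    /* tcp_update_pacer_state(): burst is rate >> 12, never below one MSS. */
    uint32_t burst = (uint32_t)(rate >> BURST_SHIFT);
    if (burst < t_maxseg)
        burst = t_maxseg;

    /* tcp_pacer_get_packet_interval(): spacing between bursts of that size. */
    uint64_t interval_ns = ((uint64_t)burst * NSEC_PER_SEC) / rate;

    printf("rate=%llu B/s burst=%u B interval=%llu ns\n",
        (unsigned long long)rate, burst, (unsigned long long)interval_ns);
    /* -> rate=2000000 B/s burst=1448 B interval=724000 ns */
    return 0;
}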
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* TCP-pacing implementation and helper functions */ + +#ifndef _NETINET_TCP_PACING_H_ +#define _NETINET_TCP_PACING_H_ + +#include "tcp_includes.h" + +uint32_t tcp_pacer_get_packet_tx_time(struct tcpcb *tp, int pkt_len, + uint64_t *tx_time); + +uint64_t tcp_compute_measured_rate(const struct tcpcb *tp); + +void tcp_update_pacer_state(struct tcpcb *tp); + +#endif /* _NETINET_TCP_PACING_H_ */ diff --git a/bsd/netinet/tcp_prague.c b/bsd/netinet/tcp_prague.c index ab7df63f9..d6c63140c 100644 --- a/bsd/netinet/tcp_prague.c +++ b/bsd/netinet/tcp_prague.c @@ -42,7 +42,6 @@ static void tcp_prague_process_ecn(struct tcpcb *tp, struct tcphdr *th, uint32_t uint32_t packets_marked, uint32_t packets_acked); static void tcp_prague_set_bytes_acked(struct tcpcb *tp, uint32_t acked); -static void prague_update_pacer_state(struct tcpcb *tp); static void prague_ca_after_ce(struct tcpcb *tp, uint32_t acked); extern float cbrtf(float x); @@ -76,11 +75,6 @@ struct tcp_cc_algo tcp_cc_prague = { #define MAX_ALPHA (1ULL << ALPHA_SHIFT) #define REF_RTT_RATE (25) /* 25 ms */ -#define BURST_SHIFT (12) /* 1/(2^12) = 0.000244s, we allow a burst queue of at least 250us */ - -#define PACING_INITIAL_RTT (100) /* 100ms, Only used to calculate startup pacer rate */ -#define MSEC_PER_SEC (1000) /* milliseconds per second */ - static float cubic_beta = 0.7f; static float cubic_one_sub_beta = 0.3f; static float cubic_one_add_beta = 1.7f; @@ -278,7 +272,7 @@ tcp_prague_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd = 2 * tp->t_maxseg; } - prague_update_pacer_state(tp); + tcp_update_pacer_state(tp); } static void @@ -332,7 +326,7 @@ tcp_prague_post_fr(struct tcpcb *tp, __unused struct tcphdr *th) */ tp->snd_cwnd = tp->snd_ssthresh; - prague_update_pacer_state(tp); + tcp_update_pacer_state(tp); tp->t_ccstate->reno_cwnd = 0; tp->t_ccstate->reno_acked = 0; @@ -351,31 +345,6 @@ rtt_elapsed(uint32_t largest_snd_nxt, uint32_t ack) return largest_snd_nxt == 0 || SEQ_GT(ack, largest_snd_nxt); } -static void -prague_update_pacer_state(struct tcpcb *tp) -{ - uint32_t srtt = tp->t_srtt >> TCP_RTT_SHIFT; - if (srtt == 0) { - srtt = PACING_INITIAL_RTT; - } - - uint64_t rate = tp->snd_cwnd; - - /* Use 200% rate when in slow start */ - if (tp->snd_cwnd < 
tp->snd_ssthresh) { - rate *= 2; - } - - /* Multiply by MSEC_PER_SEC as srtt is in milliseconds */ - rate *= MSEC_PER_SEC; - rate = rate / srtt; - - uint32_t burst = (uint32_t)(rate >> BURST_SHIFT); - - tp->t_pacer.rate = rate; - tp->t_pacer.tso_burst_size = max(tp->t_maxseg, burst); -} - /* * RTT independence using square of RTT ratio to acheive rate fairness. * For additive increase, alpha = (RTT / REF_RTT) ^ 2 @@ -571,7 +540,7 @@ tcp_prague_process_ecn(struct tcpcb *tp, struct tcphdr *th, uint32_t new_bytes_m /* Update pacer state if cwnd has changed */ if (cwnd_changed) { - prague_update_pacer_state(tp); + tcp_update_pacer_state(tp); } /* New round for CWR */ tp->t_ccstate->snd_nxt_cwr = tp->snd_nxt; @@ -643,7 +612,7 @@ tcp_prague_cwnd_init_or_reset(struct tcpcb *tp) * loss and Cubic will enter steady-state too early. It is better * to always probe to find the initial slow-start threshold. */ - if (tp->t_inpcb->inp_stat->txbytes <= tcp_initial_cwnd(tp) && + if (tp->t_inpcb->inp_mstat.ms_total.ts_txbytes <= tcp_initial_cwnd(tp) && tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT)) { tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; } @@ -652,11 +621,7 @@ tcp_prague_cwnd_init_or_reset(struct tcpcb *tp) tp->t_ccstate->cubic_W_max = tp->snd_ssthresh; /* Set initial pacer state */ - uint64_t startup_rate = - tp->snd_cwnd * MSEC_PER_SEC / PACING_INITIAL_RTT; - uint32_t startup_burst_size = tp->t_maxseg; - tp->t_pacer.rate = startup_rate; - tp->t_pacer.tso_burst_size = startup_burst_size; + tcp_update_pacer_state(tp); } static void @@ -684,6 +649,8 @@ tcp_prague_after_timeout(struct tcpcb *tp) * timeout might indicate severe congestion. */ tp->snd_cwnd = tp->t_maxseg; + + tcp_update_pacer_state(tp); } static int diff --git a/bsd/netinet/tcp_private.h b/bsd/netinet/tcp_private.h index 86eaad80c..16265d21b 100644 --- a/bsd/netinet/tcp_private.h +++ b/bsd/netinet/tcp_private.h @@ -118,7 +118,7 @@ struct tcp_notify_ack_complete { #define TCP_NOTIFY_ACKNOWLEDGEMENT 0x212 /* Notify when data is acknowledged */ #define MPTCP_SERVICE_TYPE 0x213 /* MPTCP Service type */ -#define TCP_FASTOPEN_FORCE_HEURISTICS 0x214 /* Make sure TFO-heuristics never get disabled */ +/* UNUSED 0x214 */ #define MPTCP_SVCTYPE_HANDOVER 0 /* Default 0 */ #define MPTCP_SVCTYPE_INTERACTIVE 1 @@ -156,7 +156,7 @@ struct tcp_notify_ack_complete { #define TCPI_OPT_ECN 0x08 #define TCPI_FLAG_LOSSRECOVERY 0x01 /* Currently in loss recovery */ -#define TCPI_FLAG_STREAMING_ON 0x02 /* Streaming detection on */ +#define TCPI_FLAG_STREAMING_ON 0x02 /* Streaming detection on - remove when uTCP stops using it */ struct tcp_conn_status { union { @@ -309,7 +309,6 @@ struct tcp_info { u_int64_t tcpi_txretransmitpackets __attribute__((aligned(8))); -#define TCPINFO_HAS_RCV_RTT 1 uint32_t tcpi_rcv_srtt; /* Receiver's Smoothed RTT */ uint32_t tcpi_client_accecn_state; /* Client's Accurate ECN state */ uint32_t tcpi_server_accecn_state; /* Server's Accurate ECN state as seen by clent */ @@ -330,6 +329,10 @@ struct tcp_info { #define TCPINFO_HAS_LIMITED_TIME 1 uint64_t tcpi_flow_control_total_time; uint64_t tcpi_rcvwnd_limited_total_time; + +#define TCPINFO_HAS_PACING_RATE 1 + uint64_t tcpi_pacing_rate; + uint64_t tcpi_max_pacing_rate; }; struct tcp_measure_bw_burst { diff --git a/bsd/netinet/tcp_rack.c b/bsd/netinet/tcp_rack.c index eeef24bae..1ef74db38 100644 --- a/bsd/netinet/tcp_rack.c +++ b/bsd/netinet/tcp_rack.c @@ -301,7 +301,7 @@ tcp_rack_detect_loss_and_arm_timer(struct tcpcb *tp, uint32_t dup_acks) reordering_timeout = 
tcp_rack_detect_loss(tp, dup_acks, &loss_detected); if (reordering_timeout) { - tp->t_timer[TCPT_REORDER] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_REORDER] = tcp_offset_from_start(tp, reordering_timeout + REORDERING_WINDOW_FLOOR); /* Since losses can be marked at future point, clear the TLP timer */ tp->t_timer[TCPT_PTO] = 0; diff --git a/bsd/netinet/tcp_sack.c b/bsd/netinet/tcp_sack.c index abd5fea96..747447529 100644 --- a/bsd/netinet/tcp_sack.c +++ b/bsd/netinet/tcp_sack.c @@ -93,6 +93,8 @@ #include #include +#include "tcp_includes.h" + #if IPSEC #include #endif /*IPSEC*/ @@ -105,15 +107,11 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack_maxholes, CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_sack_maxholes, 128, "Maximum number of TCP SACK holes allowed per connection"); +/* ToDo - remove when uTCP stops using it */ SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack_globalmaxholes, CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_sack_globalmaxholes, 65536, "Global maximum number of TCP SACK holes"); -static SInt32 tcp_sack_globalholes = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD | CTLFLAG_LOCKED, - &tcp_sack_globalholes, 0, - "Global number of TCP SACK holes currently allocated"); - static KALLOC_TYPE_DEFINE(sack_hole_zone, struct sackhole, NET_KT_DEFAULT); #define TCP_VALIDATE_SACK_SEQ_NUMBERS(_tp_, _sb_, _ack_) \ @@ -208,13 +206,10 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) /* Save the number of SACK blocks. */ tp->rcv_numsacks = num_head + num_saved; - /* If we are requesting SACK recovery, reset the stretch-ack state + /* If we are requesting SACK recovery, reset the force-ACK counter * so that connection will generate more acks after recovery and * sender's cwnd will open. */ - if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0) { - tcp_reset_stretch_ack(tp); - } if (tp->rcv_numsacks > 0) { tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; } @@ -245,19 +240,23 @@ tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) struct sackhole *hole; if (tp->snd_numholes >= tcp_sack_maxholes || - tcp_sack_globalholes >= tcp_sack_globalmaxholes) { + tcp_memacct_hardlimit()) { + /* + * We only check for hardlimit, because properly handling SACK + * will allow us to recover quicker (and thus free memory). 
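The SACK-hole hunks above drop the global atomic hole counter in favor of the shared TCP memory accountant that tcp_init() registers below (tcp_memacct, with a hard limit of max_mem_actual >> 5): allocation is refused only at the hard limit, and every zone alloc/free is bracketed by tcp_memacct_add()/tcp_memacct_sub(). A toy user-space sketch of that pattern, with stand-ins for the accounting calls (not the kernel mem_acct API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for the kernel's mem_acct: one counter and a hard limit. */
struct mem_acct_sketch { uint64_t used, hard_limit; };

static bool acct_hardlimit(const struct mem_acct_sketch *a) { return a->used >= a->hard_limit; }
static void acct_add(struct mem_acct_sketch *a, size_t sz)  { a->used += sz; }
static void acct_sub(struct mem_acct_sketch *a, size_t sz)  { a->used -= sz; }

struct sackhole { uint32_t start, end, rxmit; };

/* Same shape as tcp_sackhole_alloc()/free(): refuse only at the hard limit,
 * and charge/refund the accountant around the allocation. */
static struct sackhole *sackhole_alloc(struct mem_acct_sketch *a, uint32_t start, uint32_t end)
{
    if (acct_hardlimit(a))
        return NULL;                    /* counted as tcps_sack_sboverflow in the kernel */
    struct sackhole *hole = malloc(sizeof(*hole));
    if (hole == NULL)
        return NULL;
    acct_add(a, sizeof(*hole));
    hole->start = hole->rxmit = start;
    hole->end = end;
    return hole;
}

static void sackhole_free(struct mem_acct_sketch *a, struct sackhole *hole)
{
    free(hole);
    acct_sub(a, sizeof(*hole));
}

int main(void)
{
    struct mem_acct_sketch tcp_acct = { .hard_limit = 1 << 20 };
    struct sackhole *h = sackhole_alloc(&tcp_acct, 1000, 2448);

    printf("accounted %llu bytes\n", (unsigned long long)tcp_acct.used);
    if (h != NULL)
        sackhole_free(&tcp_acct, h);
    return 0;
}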
+ */ tcpstat.tcps_sack_sboverflow++; return NULL; } hole = zalloc_flags(sack_hole_zone, Z_WAITOK | Z_NOFAIL); + tcp_memacct_add(kalloc_type_size(sack_hole_zone)); hole->start = start; hole->end = end; hole->rxmit = start; tp->snd_numholes++; - OSIncrementAtomic(&tcp_sack_globalholes); return hole; } @@ -269,9 +268,9 @@ static void tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) { zfree(sack_hole_zone, hole); + tcp_memacct_sub(kalloc_type_size(sack_hole_zone)); tp->snd_numholes--; - OSDecrementAtomic(&tcp_sack_globalholes); } /* @@ -374,16 +373,6 @@ tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s, tcpstat.tcps_reordered_pkts++; tp->t_reordered_pkts++; - /* - * If reordering is seen on a connection wth ECN enabled, - * increment the heuristic - */ - if (TCP_ECN_ENABLED(tp)) { - INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_reorder); - tcpstat.tcps_ecn_fallback_reorder++; - tcp_heuristic_ecn_aggressive(tp); - } - VERIFY(SEQ_GEQ(snd_fack, s->rxmit)); if (s->rxmit_start > 0) { @@ -490,7 +479,8 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, *highest_sacked_seq = sblkp->end; - while (sblkp >= sack_blocks) { + /* RACK can get disabled if segment allocation fails */ + while (sblkp >= sack_blocks && TCP_RACK_ENABLED(tp)) { /* * Mark SACKed segments which allows us to skip through such * segments during RACK loss detection diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index 674a8654c..f18449a98 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include @@ -112,6 +113,7 @@ #include #include #include +#include #include #include #include @@ -145,11 +147,14 @@ #include #include #include +#include #define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) static tcp_cc tcp_ccgen; +struct mem_acct *tcp_memacct; + extern struct tcptimerlist tcp_timer_list; extern struct tcptailq tcp_tw_tailq; @@ -179,9 +184,11 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_fastopen, TCP_FASTOPEN_CLIENT | TCP_FASTOPEN_SERVER, "Enable TCP Fastopen (RFC 7413)"); +/* ToDo - remove once uTCP stops using it */ SYSCTL_SKMEM_TCP_INT(OID_AUTO, now_init, CTLFLAG_RD | CTLFLAG_LOCKED, uint32_t, tcp_now_init, 0, "Initial tcp now value"); +/* ToDo - remove once uTCP stops using it */ SYSCTL_SKMEM_TCP_INT(OID_AUTO, microuptime_init, CTLFLAG_RD | CTLFLAG_LOCKED, uint32_t, tcp_microuptime_init, 0, "Initial tcp uptime value in micro seconds"); @@ -254,8 +261,6 @@ KALLOC_TYPE_DEFINE(tcp_seg_sent_zone, struct tcp_seg_sent, NET_KT_DEFAULT); extern int slowlink_wsize; /* window correction for slow links */ extern int path_mtu_discovery; -uint32_t tcp_now_remainder_us = 0; /* remaining micro seconds for tcp_now */ - static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb); #define TCP_BWMEAS_BURST_MINSIZE 6 @@ -292,9 +297,6 @@ struct inp_tp { static KALLOC_TYPE_DEFINE(tcpcbzone, struct inp_tp, NET_KT_DEFAULT); -int get_inpcb_str_size(void); -int get_tcp_str_size(void); - os_log_t tcp_mpkl_log_object = NULL; static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *); @@ -389,18 +391,6 @@ exit: return error; } -int -get_inpcb_str_size(void) -{ - return sizeof(struct inpcb); -} - -int -get_tcp_str_size(void) -{ - return sizeof(struct tcpcb); -} - static int scale_to_powerof2(int size); /* @@ -515,13 +505,22 @@ tcp_init(struct protosw *pp, struct domain *dp) #pragma unused(dp) static int tcp_initialized = 0; struct inpcbinfo *pcbinfo; + struct 
timeval now; VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); - if (tcp_initialized) { + if (tcp_memacct == NULL) { + uint64_t hlimit = max_mem_actual >> 5; + tcp_memacct = mem_acct_register("TCP", hlimit, 80); + if (tcp_memacct == NULL) { + panic("mem_acct_register returned NULL"); + } + } + pp->pr_mem_acct = tcp_memacct; + + if (!os_atomic_cmpxchg(&tcp_initialized, 0, 1, relaxed)) { return; } - tcp_initialized = 1; #if DEBUG || DEVELOPMENT (void) PE_parse_boot_argn("tcp_rxt_seg_max", &tcp_rxt_seg_max, @@ -536,20 +535,17 @@ tcp_init(struct protosw *pp, struct domain *dp) tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; - microuptime(&tcp_uptime); - read_frandom(&tcp_now, sizeof(tcp_now)); + microuptime(&now); + tcp_now = (uint32_t)now.tv_sec * 1000 + now.tv_usec / TCP_RETRANSHZ_TO_USEC; - /* Starts tcp internal clock at a random value */ - tcp_now = tcp_now & 0x3fffffff; - - /* expose initial uptime/now via systcl for utcp to keep time sync */ + /* ToDo - remove once uTCP stops using it */ tcp_now_init = tcp_now; - tcp_microuptime_init = - (uint32_t)(tcp_uptime.tv_usec + (tcp_uptime.tv_sec * USEC_PER_SEC)); + tcp_microuptime_init = tcp_now; SYSCTL_SKMEM_UPDATE_FIELD(tcp.microuptime_init, tcp_microuptime_init); SYSCTL_SKMEM_UPDATE_FIELD(tcp.now_init, tcp_now_init); tcp_tfo_init(); + tcp_syncookie_init(); LIST_INIT(&tcb); tcbinfo.ipi_listhead = &tcb; @@ -667,12 +663,21 @@ tcp_init(struct protosw *pp, struct domain *dp) * of the tcpcb each time to conserve mbufs. */ void -tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr) +tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr, + struct sockaddr *local, struct sockaddr *remote) { struct inpcb *inp = tp->t_inpcb; struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr; - if ((inp->inp_vflag & INP_IPV6) != 0) { + bool isipv6 = false; + + if (local != NULL && remote != NULL) { + isipv6 = (local->sa_family == AF_INET6); + } else { + isipv6 = (inp->inp_vflag & INP_IPV6) != 0; + } + + if (isipv6) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; @@ -683,15 +688,33 @@ tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr) ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_hlim = 0; - ip6->ip6_src = inp->in6p_laddr; - ip6->ip6_dst = inp->in6p_faddr; + if (local != NULL) { + ip6->ip6_src = SIN6(local)->sin6_addr; + } else { + ip6->ip6_src = inp->in6p_laddr; + } + if (remote != NULL) { + ip6->ip6_dst = SIN6(remote)->sin6_addr; + } else { + ip6->ip6_dst = inp->in6p_faddr; + } + if (m->m_flags & M_PKTHDR) { - uint32_t lifscope = inp->inp_lifscope != 0 ? inp->inp_lifscope : inp->inp_fifscope; - uint32_t fifscope = inp->inp_fifscope != 0 ? 
inp->inp_fifscope : inp->inp_lifscope; + uint32_t lifscope = IFSCOPE_NONE, fifscope = IFSCOPE_NONE; + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { + lifscope = inp->inp_lifscope; + } else if (SIN6(local)->sin6_scope_id != IFSCOPE_NONE) { + lifscope = SIN6(local)->sin6_scope_id; + } + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + fifscope = inp->inp_fifscope; + } else if (SIN6(remote)->sin6_scope_id != IFSCOPE_NONE) { + fifscope = SIN6(remote)->sin6_scope_id; + } ip6_output_setsrcifscope(m, lifscope, NULL); ip6_output_setdstifscope(m, fifscope, NULL); } - tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr, + tcp_hdr->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst, htonl(sizeof(struct tcphdr) + IPPROTO_TCP)); } else { struct ip *ip = (struct ip *) ip_ptr; @@ -704,15 +727,30 @@ tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr) ip->ip_ttl = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; - ip->ip_src = inp->inp_laddr; - ip->ip_dst = inp->inp_faddr; + if (local != NULL) { + ip->ip_src = SIN(local)->sin_addr; + } else { + ip->ip_src = inp->inp_laddr; + } + if (remote != NULL) { + ip->ip_dst = SIN(remote)->sin_addr; + } else { + ip->ip_dst = inp->inp_faddr; + } tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP)); } - - tcp_hdr->th_sport = inp->inp_lport; - tcp_hdr->th_dport = inp->inp_fport; + if (local != NULL) { + tcp_hdr->th_sport = SIN(local)->sin_port; + } else { + tcp_hdr->th_sport = inp->inp_lport; + } + if (remote != NULL) { + tcp_hdr->th_dport = SIN(remote)->sin_port; + } else { + tcp_hdr->th_dport = inp->inp_fport; + } tcp_hdr->th_seq = 0; tcp_hdr->th_ack = 0; tcp_hdr->th_x2 = 0; @@ -722,13 +760,45 @@ tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr) tcp_hdr->th_urp = 0; } +static uint8_t +tcp_filloptions(struct tcpopt *peer_to, uint16_t thflags, uint16_t mss, uint8_t rcv_scale, + uint32_t ts_offset, u_char *__counted_by(TCP_MAXOLEN) optp) +{ + uint8_t optlen = 0; + struct tcpopt to; + + to.to_flags = 0; + + if (thflags & TH_SYN) { + to.to_mss = mss; + to.to_flags = TOF_MSS; + if (peer_to->to_flags & TOF_SCALE) { + to.to_wscale = rcv_scale; + to.to_flags |= TOF_SCALE; + } + if (peer_to->to_flags & TOF_SACKPERM) { + to.to_flags |= TOF_SACKPERM; + } + } + if ((peer_to->to_flags & TOF_TS)) { + uint32_t tcp_now_local = os_access_once(tcp_now); + to.to_tsval = ts_offset + tcp_now_local; + to.to_tsecr = peer_to->to_tsval; + to.to_flags |= TOF_TS; + } + optlen = tcp_addoptions(&to, optp, optp + TCP_MAXOLEN); + + return optlen; +} + /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * -tcp_maketemplate(struct tcpcb *tp, struct mbuf **mp) +tcp_maketemplate(struct tcpcb *tp, struct mbuf **mp, + struct sockaddr *local, struct sockaddr *remote) { struct mbuf *m; struct tcptemp *n; @@ -740,7 +810,7 @@ tcp_maketemplate(struct tcpcb *tp, struct mbuf **mp) m->m_len = sizeof(struct tcptemp); n = mtod(m, struct tcptemp *); - tcp_fillheaders(m, tp, (void *)&n->tt_ipgen, (void *)&n->tt_t); + tcp_fillheaders(m, tp, (void *)&n->tt_ipgen, (void *)&n->tt_t, local, remote); return n; } @@ -759,10 +829,13 @@ tcp_maketemplate(struct tcpcb *tp, struct mbuf **mp) * NOTE: If m != NULL, then ti must point to *inside* the mbuf. 
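The new local/remote parameters on tcp_fillheaders() and tcp_maketemplate() exist because the syncookie path has to build a SYN-ACK from the listening socket, whose inpcb is not connected to any peer; the addresses and ports therefore come from the received SYN instead of the inpcb. A hedged usage sketch (the addresses and ports are made up; passing NULL for both keeps the old inpcb-based behaviour used by the keepalive and IPsec callers):

        /* Illustrative sketch, not part of the patch. */
        struct sockaddr_in local = {
                .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET,
                .sin_port = htons(443), .sin_addr.s_addr = htonl(0xC0A80001),   /* 192.168.0.1 */
        };
        struct sockaddr_in remote = {
                .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET,
                .sin_port = htons(50000), .sin_addr.s_addr = htonl(0xC0A80002), /* 192.168.0.2 */
        };
        struct mbuf *m = NULL;
        struct tcptemp *t = tcp_maketemplate(tp, &m, (struct sockaddr *)&local,
            (struct sockaddr *)&remote);
        /* ... hand t->tt_ipgen / t->tt_t to tcp_respond(), then m_free(m) ... */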
*/ void -tcp_respond(struct tcpcb *tp, void *ipgen __sized_by(ipgen_size), size_t ipgen_size __unused, struct tcphdr *th, struct mbuf *m, - tcp_seq ack, tcp_seq seq, uint8_t flags, struct tcp_respond_args *tra) +tcp_respond(struct tcpcb *tp, void *ipgen __sized_by(ipgen_size), size_t ipgen_size __unused, + struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, uint32_t rcv_win, uint16_t flags, + struct tcpopt *peer_to, uint16_t mss, uint8_t rcv_scale, uint32_t ts_offset, + struct tcp_respond_args *tra, bool send_syncookie) { uint16_t tlen; + uint8_t optlen = 0; int win = 0; struct route *ro = 0; struct route sro; @@ -804,6 +877,10 @@ tcp_respond(struct tcpcb *tp, void *ipgen __sized_by(ipgen_size), size_t ipgen_s ro = &sro; bzero(ro, sizeof(*ro)); } + if (rcv_win != 0) { + /* Set TCP receive window if provided */ + win = rcv_win; + } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */ @@ -830,7 +907,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen __sized_by(ipgen_size), size_t ipgen_s #if MPTCP if ((tp) && (tp->t_mpflags & TMPF_RESET)) { flags = (TH_RST | TH_ACK); - } else + } else if (!send_syncookie) #endif flags = TH_ACK; } else { @@ -868,6 +945,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen __sized_by(ipgen_size), size_t ipgen_s xchg(nth->th_dport, nth->th_sport, n_short); #undef xchg } + + if (peer_to != NULL) { + u_char *optp = (u_char *)(nth + 1); + optlen = tcp_filloptions(peer_to, flags, mss, rcv_scale, ts_offset, optp); + tlen += optlen; + } + if (isipv6) { ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) + tlen)); @@ -889,8 +973,8 @@ tcp_respond(struct tcpcb *tp, void *ipgen __sized_by(ipgen_size), size_t ipgen_s nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; - nth->th_off = sizeof(struct tcphdr) >> 2; - nth->th_flags = flags; + nth->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + tcp_set_flags(nth, flags); if (tp) { nth->th_win = htons((u_short) (win >> tp->rcv_scale)); } else { @@ -1110,18 +1194,13 @@ tcp_newtcpcb(struct inpcb *inp) { struct inp_tp *it; struct tcpcb *tp; - struct socket *so = inp->inp_socket; int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; uint32_t random_32; calculate_tcp_clock(); - if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) { - it = (struct inp_tp *)(void *)inp; - tp = &it->tcb; - } else { - tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb; - } + it = (struct inp_tp *)(void *)inp; + tp = &it->tcb; bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); @@ -1134,6 +1213,12 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_flagsext |= TF_RACK_ENABLED; } + if (tcp_syncookie == 1) { + tp->t_flagsext |= TF_SYN_COOKIE_ENABLED; + } else if (tcp_syncookie == 2) { + tp->t_flagsext |= TF_SYN_COOKIE_FORCE_ENABLED; + } + TAILQ_INIT(&tp->snd_holes); SLIST_INIT(&tp->t_rxt_segments); TAILQ_INIT(&tp->t_segs_sent); @@ -1162,7 +1247,9 @@ tcp_newtcpcb(struct inpcb *inp) tp->tcp_cc_index = TCP_CC_ALGO_BACKGROUND_INDEX; #endif } else { - if (TCP_L4S_ENABLED(tp)) { + /* Set L4S state even if ifp might be NULL */ + tcp_set_l4s(tp, inp->inp_last_outifp); + if (tp->l4s_enabled) { tp->tcp_cc_index = TCP_CC_ALGO_PRAGUE_INDEX; } else { tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX; @@ -1185,11 +1272,9 @@ tcp_newtcpcb(struct inpcb *inp) tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = tcp_now; - tp->tentry.timer_start = tcp_now; - tp->rcv_unackwin = tcp_now; + tp->tentry.te_timer_start = tcp_now; tp->t_persist_timeout = tcp_max_persist_timeout; 
tp->t_persist_stop = 0; - tp->t_flagsext |= TF_RCVUNACK_WAITSS; tp->t_rexmtthresh = (uint8_t)tcprexmtthresh; tp->rack.reo_wnd_multi = 1; tp->rfbuf_ts = tcp_now; @@ -1268,11 +1353,11 @@ tcp_drop(struct tcpcb *tp, int errno) void tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) { - uint32_t rtt = rt->rt_rmx.rmx_rtt; - TCP_LOG_RTM_RTT(tp, rt); - if (rtt != 0 && tcp_init_rtt_from_cache != 0) { + if (rt->rt_rmx.rmx_rtt != 0 && tcp_init_rtt_from_cache != 0) { + uint32_t rtt = rt->rt_rmx.rmx_rtt; + uint32_t rttvar; /* * XXX the lock bit for RTT indicates that the value * is also a minimum value; this is subject to time. @@ -1283,27 +1368,21 @@ tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt) tp->t_rttmin = TCPTV_REXMTMIN; } - tp->t_srtt = - rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); + rtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); tcpstat.tcps_usedrtt++; if (rt->rt_rmx.rmx_rttvar) { - tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + rttvar = rt->rt_rmx.rmx_rttvar / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE)); tcpstat.tcps_usedrttvar++; } else { /* default variation is +- 1 rtt */ - tp->t_rttvar = + rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } - /* - * The RTO formula in the route metric case is based on: - * srtt + 4 * rttvar - * modulo the min, max and slop - */ TCPT_RANGESET(tp->t_rxtcur, - TCP_REXMTVAL(tp), + tcp_rto_formula(tp->t_rttmin, rtt, rttvar), tp->t_rttmin, TCPTV_REXMTMAX, TCP_ADD_REXMTSLOP(tp)); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_srtt == 0 && @@ -1372,8 +1451,8 @@ tcp_create_ifnet_stats_per_flow(struct tcpcb *tp, ifs->bw_rcvbw_max = 0; } ifs->bk_txpackets = so->so_tc_stats[MBUF_TC_BK].txpackets; - ifs->txpackets = inp->inp_stat->txpackets; - ifs->rxpackets = inp->inp_stat->rxpackets; + ifs->txpackets = inp->inp_mstat.ms_total.ts_txpackets; + ifs->rxpackets = inp->inp_mstat.ms_total.ts_rxpackets; } static inline void @@ -1483,6 +1562,88 @@ tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs, stat->lim_bk_txpkts += ifs->bk_txpackets; } +static void +tcp_free_reassq(struct tcpcb *tp) +{ + struct tseg_qent *q; + + while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { + struct mbuf *m; + + LIST_REMOVE(q, tqe_q); + m = tcp_destroy_reass_qent(tp, q); + m_freem(m); + } +} + +struct tseg_qent * +tcp_create_reass_qent(struct tcpcb *tp, struct mbuf *m, + struct tcphdr *th, int len) +{ + struct tseg_qent *te; + int size; + + te = tcp_reass_qent_alloc(tp->t_inpcb->inp_socket->so_proto); + if (te == NULL) { + return NULL; + } + + tp->t_reassqlen++; + OSIncrementAtomic(&tcp_reass_total_qlen); + + size = m_chain_capacity(m); + tcp_memacct_add(size); + tp->t_reassq_mbcnt += size; + + te->tqe_m = m; + te->tqe_th = th; + te->tqe_len = len; + + return te; +} + +struct mbuf * +tcp_destroy_reass_qent(struct tcpcb *tp, struct tseg_qent *q) +{ + struct mbuf *m = q->tqe_m; + int size; + + size = m_chain_capacity(m); + tcp_memacct_sub(size); + tp->t_reassq_mbcnt -= size; + + tp->t_reassqlen--; + OSDecrementAtomic(&tcp_reass_total_qlen); + tcp_reass_qent_free(tp->t_inpcb->inp_socket->so_proto, q); + + return m; +} + +struct tseg_qent * +tcp_reass_qent_alloc(struct protosw *proto) +{ + struct tseg_qent *reass; + + if (proto_memacct_hardlimit(proto)) { + return NULL; + } + reass = zalloc_flags(tcp_reass_zone, Z_NOPAGEWAIT); + if (reass == NULL) { + return NULL; + } + + proto_memacct_add(proto, kalloc_type_size(tcp_reass_zone)); + + return reass; +} + +void +tcp_reass_qent_free(struct protosw *proto, struct tseg_qent *te) +{ + proto_memacct_sub(proto, 
kalloc_type_size(tcp_reass_zone)); + zfree(tcp_reass_zone, te); +} + /* * Close a TCP control block: * discard all space held by the tcp @@ -1670,7 +1831,7 @@ no_valid_rt: } /* free the reassembly queue, if any */ - (void) tcp_freeq(tp); + tcp_free_reassq(tp); /* performance stats per interface */ tcp_create_ifnet_stats_per_flow(tp, &ifs); @@ -1693,10 +1854,6 @@ no_valid_rt: } TCP_PKTLIST_CLEAR(tp); - if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) { - inp->inp_saved_ppcb = (caddr_t) tp; - } - TCP_LOG_STATE(tp, TCPS_CLOSED); tp->t_state = TCPS_CLOSED; @@ -1759,30 +1916,6 @@ no_valid_rt: return NULL; } -int -tcp_freeq(struct tcpcb *tp) -{ - struct tseg_qent *q; - int rv = 0; - int count = 0; - - while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { - LIST_REMOVE(q, tqe_q); - tp->t_reassq_mbcnt -= _MSIZE + (q->tqe_m->m_flags & M_EXT) ? - q->tqe_m->m_ext.ext_size : 0; - m_freem(q->tqe_m); - zfree(tcp_reass_zone, q); - rv = 1; - count++; - } - tp->t_reassqlen = 0; - if (count > 0) { - OSAddAtomic(-count, &tcp_reass_total_qlen); - } - return rv; -} - - void tcp_drain(void) { @@ -2805,6 +2938,7 @@ tcp_new_isn(struct tcpcb *tp) if (__probable(tcp_randomize_timestamps)) { tp->t_ts_offset = md5_buffer[1]; } + tp->t_latest_tx = tcp_now; return new_isn; } @@ -3073,7 +3207,12 @@ tcp_rtlookup(struct inpcb *inp, unsigned int input_ifscope) tcp_set_tso(tp, rt->rt_ifp); soif2kcl(inp->inp_socket, (rt->rt_ifp->if_eflags & IFEF_2KCL)); - tcp_set_ecn(tp, rt->rt_ifp); + /* Don't do ECN and L4S for Loopback & Cellular (if L4S is default) */ + if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0 && + !(IFNET_IS_CELLULAR(rt->rt_ifp) && rt->rt_ifp->if_l4s_mode == IFRTYPE_L4S_DEFAULT)) { + tcp_set_ecn(tp); + tcp_set_l4s(tp, rt->rt_ifp); + } if (inp->inp_last_outifp == NULL) { inp->inp_last_outifp = rt->rt_ifp; #if SKYWALK @@ -3181,7 +3320,12 @@ tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope) tcp_set_tso(tp, rt->rt_ifp); soif2kcl(inp->inp_socket, (rt->rt_ifp->if_eflags & IFEF_2KCL)); - tcp_set_ecn(tp, rt->rt_ifp); + /* Don't do ECN and L4S for Loopback & Cellular (if L4S is default) */ + if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0 && + !(IFNET_IS_CELLULAR(rt->rt_ifp) && rt->rt_ifp->if_l4s_mode == IFRTYPE_L4S_DEFAULT)) { + tcp_set_ecn(tp); + tcp_set_l4s(tp, rt->rt_ifp); + } if (inp->inp_last_outifp == NULL) { inp->inp_last_outifp = rt->rt_ifp; #if SKYWALK @@ -3233,13 +3377,13 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp) th = (struct tcphdr *)(void *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); - tcp_fillheaders(m, tp, ip6, th); + tcp_fillheaders(m, tp, ip6, th, NULL, NULL); hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); - tcp_fillheaders(m, tp, ip, th); + tcp_fillheaders(m, tp, ip, th, NULL, NULL); hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); @@ -3574,9 +3718,6 @@ tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp) } } -#define TIMEVAL_TO_TCPHZ(_tv_) ((uint32_t)((_tv_).tv_sec * TCP_RETRANSHZ + \ - (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC)) - /* * Function to calculate the tcp clock. The tcp clock will get updated * at the boundaries of the tcp layer. This is done at 3 places: @@ -3585,14 +3726,12 @@ tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp) * 3. 
When a tcp timer fires or before tcp slow timeout * */ - void calculate_tcp_clock(void) { - struct timeval tv = tcp_uptime; - struct timeval interval = {.tv_sec = 0, .tv_usec = TCP_RETRANSHZ_TO_USEC}; - struct timeval now, hold_now; - uint32_t incr = 0; + uint32_t current_tcp_now; + struct timeval now; + uint32_t tmp; microuptime(&now); @@ -3603,122 +3742,17 @@ calculate_tcp_clock(void) */ net_update_uptime_with_time(&now); - timevaladd(&tv, &interval); - if (timevalcmp(&now, &tv, >)) { - /* time to update the clock */ - lck_spin_lock(&tcp_uptime_lock); - if (timevalcmp(&tcp_uptime, &now, >=)) { - /* clock got updated while waiting for the lock */ - lck_spin_unlock(&tcp_uptime_lock); - return; - } + current_tcp_now = (uint32_t)now.tv_sec * 1000 + now.tv_usec / TCP_RETRANSHZ_TO_USEC; - microuptime(&now); - hold_now = now; - tv = tcp_uptime; - timevalsub(&now, &tv); + tmp = os_atomic_load(&tcp_now, relaxed); + if (tmp < current_tcp_now) { + os_atomic_cmpxchg(&tcp_now, tmp, current_tcp_now, relaxed); - incr = TIMEVAL_TO_TCPHZ(now); - - /* Account for the previous remainder */ - uint32_t remaining_us = (now.tv_usec % TCP_RETRANSHZ_TO_USEC) + - tcp_now_remainder_us; - if (remaining_us >= TCP_RETRANSHZ_TO_USEC) { - incr += (remaining_us / TCP_RETRANSHZ_TO_USEC); - } - - if (incr > 0) { - tcp_uptime = hold_now; - tcp_now_remainder_us = remaining_us % TCP_RETRANSHZ_TO_USEC; - tcp_now += incr; - } - - lck_spin_unlock(&tcp_uptime_lock); - } -} - -uint64_t -microuptime_ns(void) -{ - uint64_t abstime = mach_absolute_time(); - uint64_t ns = 0; - absolutetime_to_nanoseconds(abstime, &ns); - - return ns; -} - -#define MAX_BURST_INTERVAL_KERNEL_PACING_NSEC \ - (10 * NSEC_PER_MSEC) // Don't delay more than 10ms between two bursts -static uint64_t -tcp_pacer_get_packet_interval(struct tcpcb *tp, uint32_t size) -{ - if (tp->t_pacer.rate == 0) { - os_log_error(OS_LOG_DEFAULT, - "pacer rate shouldn't be 0, CCA is %s (cwnd=%u, smoothed rtt=%u ms)", - CC_ALGO(tp)->name, tp->snd_cwnd, tp->t_srtt >> TCP_RTT_SHIFT); - - return MAX_BURST_INTERVAL_KERNEL_PACING_NSEC; - } - - uint64_t interval = (uint64_t)size * NSEC_PER_SEC / tp->t_pacer.rate; - if (interval > MAX_BURST_INTERVAL_KERNEL_PACING_NSEC) { - interval = MAX_BURST_INTERVAL_KERNEL_PACING_NSEC; - } - - return interval; -} - -/* Return packet tx_time in nanoseconds (absolute as well as continuous) */ -uint64_t -tcp_pacer_get_packet_tx_time(struct tcpcb *tp, uint16_t pkt_len) -{ - /* - * This function is called multiple times for mss-sized packets - * and for high-speeds, we'd want to send multiple packets - * that add up to burst_size at the same time. - */ - uint64_t now = microuptime_ns(); - - if (pkt_len == 0 || now == 0) { - return now; - } - - if (tp->t_pacer.packet_tx_time == 0) { - tp->t_pacer.packet_tx_time = now; - tp->t_pacer.current_size = pkt_len; - } else { - tp->t_pacer.current_size += pkt_len; - if (tp->t_pacer.current_size > tp->t_pacer.tso_burst_size) { - /* - * Increment tx_time by packet_interval and - * reset size to this packet's len - */ - tp->t_pacer.packet_tx_time += - tcp_pacer_get_packet_interval(tp, tp->t_pacer.current_size); - tp->t_pacer.current_size = 0; - if (now > tp->t_pacer.packet_tx_time) { - /* - * If current time is bigger, then application - * has already paced the packet. Also, we can't - * set tx_time in the past. 
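With the rewrite of calculate_tcp_clock() above, tcp_now is simply the system uptime converted to TCP_RETRANSHZ ticks; the multiplication by 1000 together with the division by TCP_RETRANSHZ_TO_USEC implies one tick per millisecond. A small worked example of the conversion, and of why a single compare-and-swap without a retry loop is enough:

        /* Illustrative sketch, not part of the patch: uptime of 12 s 345678 us. */
        struct timeval now = { .tv_sec = 12, .tv_usec = 345678 };
        uint32_t ticks = (uint32_t)now.tv_sec * 1000 + now.tv_usec / TCP_RETRANSHZ_TO_USEC;
        /* 12 * 1000 + 345678 / 1000 = 12000 + 345 = 12345 ticks (milliseconds) */

        /*
         * The cmpxchg never needs to be retried: a new value is installed only
         * if it is larger than the snapshot, and losing the race just means
         * another CPU already advanced tcp_now with an equally fresh reading,
         * so the "never decreases" invariant holds either way.
         */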
- */ - tp->t_pacer.packet_tx_time = now; - } - } - } - - return tp->t_pacer.packet_tx_time; -} - -void -tcp_set_mbuf_tx_time(struct mbuf *m, uint64_t tx_time) -{ - struct m_tag *tag = NULL; - tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM, - sizeof(uint64_t), M_WAITOK, m); - if (tag != NULL) { - m_tag_prepend(m, tag); - *(uint64_t *)tag->m_tag_data = tx_time; + /* + * No cmpxchg loop needed here. If someone else updated quicker, + * we can take that value. The only requirement is that + * tcp_now never decreases. + */ } } @@ -3728,12 +3762,13 @@ tcp_set_mbuf_tx_time(struct mbuf *m, uint64_t tx_time) * room to potentially increase the window size upto a maximum * defined by the constant tcp_autorcvbuf_max. */ -void -tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so) +uint8_t +tcp_get_max_rwinscale(struct tcpcb *tp, struct socket *so) { + uint8_t rcv_wscale; uint32_t maxsockbufsize; - tp->request_r_scale = MAX((uint8_t)tcp_win_scale, tp->request_r_scale); + rcv_wscale = MAX((uint8_t)tcp_win_scale, tp->request_r_scale); maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ? so->so_rcv.sb_hiwat : tcp_autorcvbuf_max; @@ -3742,11 +3777,13 @@ tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so) * to send the max receive window size; adding 1 to TCP_MAXWIN * ensures that. */ - while (tp->request_r_scale < TCP_MAX_WINSHIFT && - ((TCP_MAXWIN + 1) << tp->request_r_scale) < maxsockbufsize) { - tp->request_r_scale++; + while (rcv_wscale < TCP_MAX_WINSHIFT && + ((TCP_MAXWIN + 1) << rcv_wscale) < maxsockbufsize) { + rcv_wscale++; } - tp->request_r_scale = MIN(tp->request_r_scale, TCP_MAX_WINSHIFT); + rcv_wscale = MIN(rcv_wscale, TCP_MAX_WINSHIFT); + + return rcv_wscale; } int @@ -3859,11 +3896,13 @@ tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end) tcp_rxt_seg_drop++; tp->t_rxt_seg_drop++; zfree(tcp_rxt_seg_zone, rxseg); + tcp_memacct_sub(kalloc_type_size(tcp_rxt_seg_zone)); tp->t_rxt_seg_count -= 1; } rxseg = zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); + tcp_memacct_add(kalloc_type_size(tcp_rxt_seg_zone)); rxseg->rx_start = start; rxseg->rx_end = end; rxseg->rx_count = rxcount + 1; @@ -3934,6 +3973,7 @@ tcp_rxtseg_clean(struct tcpcb *tp) SLIST_REMOVE(&tp->t_rxt_segments, rxseg, tcp_rxt_seg, rx_link); zfree(tcp_rxt_seg_zone, rxseg); + tcp_memacct_sub(kalloc_type_size(tcp_rxt_seg_zone)); } tp->t_rxt_seg_count = 0; tp->t_dsack_lastuna = tp->snd_max; @@ -3980,6 +4020,8 @@ tcp_rxtseg_total_size(struct tcpcb *tp) return total_size; } +static void tcp_rack_free_and_disable(struct tcpcb *tp); + int tcp_seg_cmp(const struct tcp_seg_sent *seg1, const struct tcp_seg_sent *seg2) { @@ -4006,14 +4048,19 @@ tcp_seg_alloc_init(struct tcpcb *tp) if (seg != NULL) { TAILQ_REMOVE(&tp->seg_pool.free_segs, seg, free_link); tp->seg_pool.free_segs_count--; + + bzero(seg, sizeof(*seg)); } else { - // TODO: remove Z_WAITOK and Z_NOFAIL? 
- seg = zalloc_flags(tcp_seg_sent_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); + if (tcp_memacct_hardlimit()) { + return NULL; + } + + seg = zalloc_flags(tcp_seg_sent_zone, Z_NOPAGEWAIT | Z_ZERO); if (seg == NULL) { return NULL; } + tcp_memacct_add(kalloc_type_size(tcp_seg_sent_zone)); } - bzero(seg, sizeof(*seg)); return seg; } @@ -4058,11 +4105,14 @@ tcp_seg_sent_insert_before(struct tcpcb *tp, struct tcp_seg_sent *before, tcp_se uint32_t xmit_ts, uint8_t flags) { struct tcp_seg_sent *seg = tcp_seg_alloc_init(tp); - /* segment MUST be allocated, there is no other fail-safe here */ + if (seg == NULL) { + tcp_rack_free_and_disable(tp); + return NULL; + } tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags); struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg); if (not_inserted) { - os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted, + TCP_LOG(tp, "segment %p[%u %u) was not inserted in the RB tree", not_inserted, not_inserted->start_seq, not_inserted->end_seq); } TAILQ_INSERT_BEFORE(before, seg, tx_link); @@ -4075,11 +4125,15 @@ tcp_seg_rto_insert_end(struct tcpcb *tp, tcp_seq start, tcp_seq end, uint32_t xmit_ts, uint8_t flags) { struct tcp_seg_sent *seg = tcp_seg_alloc_init(tp); + if (seg == NULL) { + tcp_rack_free_and_disable(tp); + return NULL; + } /* segment MUST be allocated, there is no other fail-safe here */ tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags); struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg); if (not_inserted) { - os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted, + TCP_LOG(tp, "segment %p[%u %u) was not inserted in the RB tree", not_inserted, not_inserted->start_seq, not_inserted->end_seq); } TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link); @@ -4109,6 +4163,7 @@ tcp_seg_sent_insert(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, t */ struct tcp_seg_sent *partial_seg = tcp_seg_alloc_init(tp); if (partial_seg == NULL) { + tcp_rack_free_and_disable(tp); return; } seg->start_seq += (end - start); @@ -4116,7 +4171,7 @@ tcp_seg_sent_insert(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, t struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, partial_seg); if (not_inserted) { - os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted, + TCP_LOG(tp, "segment %p[%u %u) was not inserted in the RB tree", not_inserted, not_inserted->start_seq, not_inserted->end_seq); } TAILQ_INSERT_TAIL(&tp->t_segs_sent, partial_seg, tx_link); @@ -4129,13 +4184,14 @@ tcp_seg_sent_insert(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, t /* This is a new segment */ seg = tcp_seg_alloc_init(tp); if (seg == NULL) { + tcp_rack_free_and_disable(tp); return; } tcp_rack_transmit_seg(tp, seg, start, end, xmit_ts, flags); struct tcp_seg_sent *not_inserted = RB_INSERT(tcp_seg_sent_tree_head, &tp->t_segs_sent_tree, seg); if (not_inserted) { - os_log(OS_LOG_DEFAULT, "segment %p[%u %u) was not inserted in the RB tree", not_inserted, + TCP_LOG(tp, "segment %p[%u %u) was not inserted in the RB tree", not_inserted, not_inserted->start_seq, not_inserted->end_seq); } TAILQ_INSERT_TAIL(&tp->t_segs_sent, seg, tx_link); @@ -4171,8 +4227,12 @@ tcp_seg_sent_insert(struct tcpcb *tp, struct tcp_seg_sent *seg, tcp_seq start, t * This segment is partially retransmitted. We split this segment at the boundary of end * sequence. 
First insert the part being retransmitted at the end of time-ordered list. */ - tcp_seg_rto_insert_end(tp, found_seg->start_seq, end, xmit_ts, + struct tcp_seg_sent *inserted_seg = tcp_seg_rto_insert_end(tp, found_seg->start_seq, end, xmit_ts, found_seg->flags | flags); + /* If segment is not allocated, RACK is already disabled and cleaned up */ + if (inserted_seg == NULL) { + return; + } if (SEQ_LEQ(found_seg->start_seq, start)) { /* @@ -4446,8 +4506,12 @@ tcp_segs_dosack_matched(struct tcpcb *tp, struct tcp_seg_sent *found_seg, * SACKed parts. */ /* First create a new segment for unSACKed part */ - tcp_seg_sent_insert_before(tp, found_seg, found_seg->start_seq, sblk_start, + struct tcp_seg_sent *inserted_seg = tcp_seg_sent_insert_before(tp, found_seg, found_seg->start_seq, sblk_start, found_seg->xmit_ts, found_seg->flags); + /* If segment is not allocated, RACK is already disabled and cleaned up */ + if (inserted_seg == NULL) { + return; + } /* Now, update the SACKed part */ found_seg->start_seq = sblk_start; /* Record seg flags before they get erased. */ @@ -4512,16 +4576,20 @@ tcp_segs_dosack(struct tcpcb *tp, tcp_seq sblk_start, tcp_seq sblk_end, * of SACK block. First insert the newly SACKed part */ tcp_seq start = SEQ_LEQ(sblk_start, found_seg->start_seq) ? found_seg->start_seq : sblk_start; - struct tcp_seg_sent *inserted = tcp_seg_sent_insert_before(tp, found_seg, start, + struct tcp_seg_sent *newly_sacked = tcp_seg_sent_insert_before(tp, found_seg, start, sblk_end, found_seg->xmit_ts, found_seg->flags); + /* If segment is not allocated, RACK is already disabled and cleaned up */ + if (newly_sacked == NULL) { + return; + } /* Record seg flags before they get erased. */ - uint8_t seg_flags = inserted->flags; + uint8_t seg_flags = newly_sacked->flags; /* Mark the SACKed segment */ - tcp_seg_mark_sacked(tp, inserted, newbytes_sacked); + tcp_seg_mark_sacked(tp, newly_sacked, newbytes_sacked); /* Advance RACK state */ - tcp_rack_update_segment_acked(tp, tsecr, inserted->xmit_ts, - inserted->end_seq, !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE)); + tcp_rack_update_segment_acked(tp, tsecr, newly_sacked->xmit_ts, + newly_sacked->end_seq, !!(seg_flags & TCP_SEGMENT_RETRANSMITTED_ATLEAST_ONCE)); if (sblk_start == found_seg->start_seq) { /* @@ -4534,8 +4602,12 @@ tcp_segs_dosack(struct tcpcb *tp, tcp_seq sblk_start, tcp_seq sblk_end, if (SEQ_GT(sblk_start, found_seg->start_seq)) { /* Insert the remaining unSACKed part before the SACKED segment inserted above */ - tcp_seg_sent_insert_before(tp, inserted, found_seg->start_seq, + struct tcp_seg_sent *unsacked = tcp_seg_sent_insert_before(tp, newly_sacked, found_seg->start_seq, sblk_start, found_seg->xmit_ts, found_seg->flags); + /* If segment is not allocated, RACK is already disabled and cleaned up */ + if (unsacked == NULL) { + return; + } /* Move the start of existing segment */ found_seg->start_seq = sblk_end; break; @@ -4552,8 +4624,13 @@ tcp_segs_dosack(struct tcpcb *tp, tcp_seq sblk_start, tcp_seq sblk_end, tcp_segs_dosack_matched(tp, sacked_seg, sblk_start, tsecr, newbytes_sacked); } - /* Move the start of existing segment */ - found_seg->start_seq = sblk_end; + /* + * RACK might have been disabled (if a segment allocation failed) and all associated + * state freed. If RACK hasn't been disabled, move the start of existing segment. 
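The NULL checks added above all follow one pattern: segment bookkeeping allocations may now fail (Z_NOPAGEWAIT plus the memory-accounting hard limit), and instead of blocking, the patch tears RACK down with tcp_rack_free_and_disable() and lets the connection continue without it. A minimal caller-side sketch of that pattern, using only names from this patch:

        /* Illustrative sketch, not part of the patch. */
        struct tcp_seg_sent *seg = tcp_seg_alloc_init(tp);
        if (seg == NULL) {
                /* Frees t_segs_sent, the RB tree and the per-tcpcb free pool,
                 * and clears TF_RACK_ENABLED. */
                tcp_rack_free_and_disable(tp);
                return;                 /* all RACK segment state is gone */
        }
        /* ... use seg ... */

        /* Code that runs after a helper which may have failed internally must
         * re-check before touching previously looked-up segments: */
        if (TCP_RACK_ENABLED(tp)) {
                found_seg->start_seq = sblk_end;
        }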
+ */ + if (TCP_RACK_ENABLED(tp)) { + found_seg->start_seq = sblk_end; + } } break; } @@ -4608,6 +4685,7 @@ tcp_seg_delete(struct tcpcb *tp, struct tcp_seg_sent *seg) { if (tp->seg_pool.free_segs_count >= TCP_SEG_POOL_MAX_ITEM_COUNT) { zfree(tcp_seg_sent_zone, seg); + tcp_memacct_sub(kalloc_type_size(tcp_seg_sent_zone)); } else { bzero(seg, sizeof(*seg)); TAILQ_INSERT_TAIL(&tp->seg_pool.free_segs, seg, free_link); @@ -4645,11 +4723,20 @@ tcp_segs_sent_clean(struct tcpcb *tp, bool free_segs) TAILQ_FOREACH_SAFE(seg, &tp->seg_pool.free_segs, free_link, next) { TAILQ_REMOVE(&tp->seg_pool.free_segs, seg, free_link); zfree(tcp_seg_sent_zone, seg); + tcp_memacct_sub(kalloc_type_size(tcp_seg_sent_zone)); } tp->seg_pool.free_segs_count = 0; } } +void +tcp_rack_free_and_disable(struct tcpcb *tp) +{ + TCP_LOG(tp, "not enough memory to allocate segment, disabling RACK"); + tcp_segs_sent_clean(tp, true); + tp->t_flagsext &= ~TF_RACK_ENABLED; +} + void tcp_get_connectivity_status(struct tcpcb *tp, struct tcp_conn_status *connstatus) @@ -4693,8 +4780,8 @@ tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp, /* * The code assumes the IP + TCP headers fit in an mbuf packet header */ - _CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN); - _CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN); + static_assert(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN); + static_assert(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN); MGETHDR(m, M_WAIT, MT_HEADER); if (m == NULL) { @@ -4720,7 +4807,7 @@ tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp, m->m_pkthdr.len = m->m_len; } - tcp_fillheaders(m, tp, data, th); + tcp_fillheaders(m, tp, data, th, NULL, NULL); if (inp->inp_vflag & INP_IPV4) { struct ip *ip; @@ -5268,7 +5355,7 @@ void tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs, struct ifnet *ifp) { - if (ifp == NULL || !IF_FULLY_ATTACHED(ifp)) { + if (ifp == NULL || !ifnet_is_fully_attached(ifp)) { return; } @@ -5394,43 +5481,6 @@ tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs, ifnet_lock_done(ifp); } -struct tseg_qent * -tcp_reass_qent_alloc(void) -{ - return zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL); -} - -void -tcp_reass_qent_free(struct tseg_qent *te) -{ - zfree(tcp_reass_zone, te); -} - -struct tcp_rxt_seg * -tcp_rxt_seg_qent_alloc(void) -{ - return zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); -} - -void -tcp_rxt_seg_qent_free(struct tcp_rxt_seg *te) -{ - zfree(tcp_rxt_seg_zone, te); -} - - -struct tcp_seg_sent * -tcp_seg_sent_qent_alloc(void) -{ - return zalloc_flags(tcp_seg_sent_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); -} - -void -tcp_seg_sent_qent_free(struct tcp_seg_sent *te) -{ - zfree(tcp_seg_sent_zone, te); -} - #if SKYWALK #include diff --git a/bsd/netinet/tcp_syncookie.c b/bsd/netinet/tcp_syncookie.c new file mode 100644 index 000000000..7cba88728 --- /dev/null +++ b/bsd/netinet/tcp_syncookie.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2001 McAfee, Inc. + * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jonathan Lemon + * and McAfee Research, the Security Research Division of McAfee, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. [2001 McAfee, Inc.] + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "tcp_includes.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int path_mtu_discovery; +int tcp_syncookie_hmac_sha256 = 0; + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookie_hmac_sha256, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_syncookie_hmac_sha256, 0, + "0: disable, 1: Use HMAC with SHA-256 for generating SYN cookie"); + +static bool +syncookie_respond(struct socket *so, struct tcpcb *tp, struct tcp_inp *tpi, uint16_t flags, + struct sockaddr *local, struct sockaddr *remote); +static uint32_t syncookie_siphash(struct tcp_inp *tpi, uint8_t flags, uint8_t key[SYNCOOKIE_SECRET_SIZE]); +static uint32_t syncookie_hmac_sha256(struct tcp_inp *tpi, uint8_t flags, uint8_t key[CCSHA256_OUTPUT_SIZE]); +static uint32_t syncookie_mac(struct tcp_inp *tpi, uint8_t flags, uint8_t secbit); +static tcp_seq syncookie_generate(struct tcp_inp *tpi, bool has_ecn); +static bool syncookie_lookup(struct tcp_inp *tpi); +static void syncookie_reseed(void); + +static struct syncookie_secret tcp_syncookie_secret; + +/* + * This function gets called when we receive an ACK for a + * socket in the LISTEN state. We create the connection + * and set its state based on information from SYN cookies + * and options/flags received in last ACK. The returned + * tcpcb is in the SYN-RECEIVED state. + * + * Return true on success and false on failure. + */ +bool +tcp_syncookie_ack(struct tcp_inp *tpi, struct socket **so2, int* dropsocket) +{ +#define TCP_LOG_HDR (isipv6 ? (void *)ip6 : (void *)ip) + + ASSERT((tcp_get_flags(tpi->th) & (TH_RST | TH_ACK | TH_SYN)) == TH_ACK); + /* + * We don't support syncache, so see if this ACK is + * a returning syncookie. To do this, check that the + * syncookie is valid. 
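The sequence-number bookkeeping that this function and syncookie_lookup() rely on is worth spelling out: the cookie is the ISS of our SYN-ACK, and the SYN and SYN-ACK each consume one sequence number, so on the handshake-completing ACK:

        SYN (peer -> us):        SEQ = IRS
        SYN-ACK (us -> peer):    SEQ = ISS (the cookie),  ACK = IRS + 1
        final ACK (peer -> us):  SEQ = IRS + 1,           ACK = ISS + 1

        => syncookie_lookup() reverts the advances: cookie ISS = th_ack - 1,
           peer's IRS = th_seq - 1
        => the new tcpcb starts with snd_nxt = snd_max = th_ack (= ISS + 1)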
+ */ + bool ret = syncookie_lookup(tpi); + + if (ret == false) { + TCP_LOG(*tpi->tp, "Segment failed SYNCOOKIE authentication, " + "segment rejected (probably spoofed)"); + goto failed; + } + + ret = tcp_create_server_socket(tpi, so2, NULL, dropsocket); + + if (ret == false) { + goto failed; + } + + ret = tcp_setup_server_socket(tpi, *so2, true); + + /* Set snd state for newly created tcpcb */ + (*tpi->tp)->snd_nxt = (*tpi->tp)->snd_max = tpi->th->th_ack; + + if (ret == false) { + /* + * We failed to setup the server socket, return failure + * so that tcp_input can cleanup the socket and the + * incoming segment + */ + goto failed; + } + *dropsocket = 0; /* committed to socket */ + + if (__improbable(*so2 == NULL)) { + tcpstat.tcps_sc_aborted++; + } else { + tcpstat.tcps_sc_completed++; + } + + return true; + +failed: + return false; +} + +static uint8_t +syncookie_process_accecn_syn(struct tcpcb *tp, uint32_t ace_flags, + uint8_t ip_ecn) +{ + uint8_t setup_flags = 0; + switch (ace_flags) { + case (0 | 0 | 0): + /* No ECN */ + break; + case (0 | TH_CWR | TH_ECE): + /* Legacy ECN-setup */ + setup_flags |= SC_ECN_SETUP; + break; + case (TH_ACE): + /* Accurate ECN */ + if (tp->l4s_enabled) { + switch (ip_ecn) { + case IPTOS_ECN_NOTECT: + setup_flags |= SC_ACE_SETUP_NOT_ECT; + break; + case IPTOS_ECN_ECT1: + setup_flags |= SC_ACE_SETUP_ECT1; + break; + case IPTOS_ECN_ECT0: + setup_flags |= SC_ACE_SETUP_ECT0; + break; + case IPTOS_ECN_CE: + setup_flags |= SC_ACE_SETUP_CE; + break; + } + } else { + /* + * If AccECN is not enabled, ignore + * the TH_AE bit and do Legacy ECN-setup + */ + setup_flags |= SC_ECN_SETUP; + } + default: + /* Forward Compatibility */ + /* Accurate ECN */ + if (tp->l4s_enabled) { + switch (ip_ecn) { + case IPTOS_ECN_NOTECT: + setup_flags |= SC_ACE_SETUP_NOT_ECT; + break; + case IPTOS_ECN_ECT1: + setup_flags |= SC_ACE_SETUP_ECT1; + break; + case IPTOS_ECN_ECT0: + setup_flags |= SC_ACE_SETUP_ECT0; + break; + case IPTOS_ECN_CE: + setup_flags |= SC_ACE_SETUP_CE; + break; + } + } + break; + } + return setup_flags; +} + +static uint16_t +syncookie_respond_accecn(uint8_t setup_flags, uint16_t thflags) +{ + switch (setup_flags) { + case SC_ECN_SETUP: + thflags |= TH_ECE; + break; + case SC_ACE_SETUP_NOT_ECT: + thflags |= TH_CWR; + break; + case SC_ACE_SETUP_ECT1: + thflags |= (TH_CWR | TH_ECE); + break; + case SC_ACE_SETUP_ECT0: + thflags |= TH_AE; + break; + case SC_ACE_SETUP_CE: + thflags |= (TH_AE | TH_CWR); + break; + } + + return thflags; +} + +/* + * Given a LISTEN socket and an inbound SYN request, generate + * a SYN cookie, and send back a segment: + * + * to the source. 
+ */ +void +tcp_syncookie_syn(struct tcp_inp *tpi, struct sockaddr *local, + struct sockaddr *remote) +{ + struct socket *so = tpi->so; + struct inpcb *inp; + struct tcpcb *tp; + uint8_t ip_tos, ip_ecn; + uint8_t ace_setup_flags = 0; + + /* make sure inp is locked for listen socket */ + socket_lock_assert_owned(so); + + ASSERT((tcp_get_flags(tpi->th) & (TH_RST | TH_ACK | TH_SYN)) == TH_SYN); + + ASSERT((so->so_options & SO_ACCEPTCONN) != 0); + + /* Reseed the key if SYNCOOKIE_LIFETIME time has elapsed */ + if (tcp_now > tcp_syncookie_secret.last_updated + + SYNCOOKIE_LIFETIME * TCP_RETRANSHZ) { + syncookie_reseed(); + } + inp = sotoinpcb(so); + tp = sototcpcb(so); + + if (tpi->isipv6) { + if ((inp->in6p_outputopts == NULL) || + (inp->in6p_outputopts->ip6po_tclass == -1)) { + ip_tos = 0; + } else { + ip_tos = (uint8_t)inp->in6p_outputopts->ip6po_tclass; + } + } else { + ip_tos = inp->inp_ip_tos; + } + + ip_ecn = ip_tos & IPTOS_ECN_MASK; + + /* Is ECN enabled? */ + bool is_ecn = tcp_ecn_enabled(tp->ecn_flags); + /* ECN Handshake */ + if (is_ecn) { + int ace_flags = ((tpi->th->th_x2 << 8) | tpi->th->th_flags) & TH_ACE; + ace_setup_flags = syncookie_process_accecn_syn(tp, ace_flags, ip_ecn); + } + bool classic_ecn = !!(ace_setup_flags & SC_ECN_SETUP); + + tpi->iss = syncookie_generate(tpi, classic_ecn); + + uint16_t output_flags = TH_SYN | TH_ACK; + output_flags = syncookie_respond_accecn(ace_setup_flags, output_flags); + /* + * Do a standard 3-way handshake. + */ + if (syncookie_respond(so, tp, tpi, output_flags, local, remote)) { + tcpstat.tcps_sndacks++; + tcpstat.tcps_sndtotal++; + } else { + tcpstat.tcps_sc_dropped++; + } + if (tpi->m != NULL) { + m_freem(tpi->m); + } +} + +/* + * Send SYN|ACK to the peer in response to a peer's SYN segment + */ +static bool +syncookie_respond(struct socket *so, struct tcpcb *tp, struct tcp_inp *tpi, uint16_t flags, + struct sockaddr *local, struct sockaddr *remote) +{ + struct tcptemp *__single t_template; + struct mbuf *__single m; + tcp_seq seq; + uint16_t mss = 0; + uint32_t win; + + if (flags & TH_SYN) { + seq = tpi->iss; + } else { + seq = tpi->iss + 1; + } + + t_template = tcp_maketemplate(tp, &m, local, remote); + if (t_template != NULL) { + /* Use the properties of listener socket for sending SYN-ACK with cookie */ + struct inpcb *inp = tp->t_inpcb; + + uint16_t min_protoh = tpi->isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + : sizeof(struct tcpiphdr); + if (tpi->isipv6) { + mss = (uint16_t)IN6_LINKMTU(tpi->ifp); + } else { + mss = (uint16_t)tpi->ifp->if_mtu; + } + mss -= min_protoh; + + win = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ? + so->so_rcv.sb_hiwat : tcp_autorcvbuf_max; + win = imin(win, TCP_MAXWIN); + uint8_t rcv_scale = tcp_get_max_rwinscale(tp, so); + + struct tcp_respond_args tra; + + bzero(&tra, sizeof(tra)); + tra.nocell = INP_NO_CELLULAR(inp) ? 1 : 0; + tra.noexpensive = INP_NO_EXPENSIVE(inp) ? 1 : 0; + tra.noconstrained = INP_NO_CONSTRAINED(inp) ? 1 : 0; + tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp) ? 1 : 0; + tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp) ? 1 : 0; + tra.management_allowed = INP_MANAGEMENT_ALLOWED(inp) ? 
1 : 0; + tra.keep_alive = 1; + if (tp->t_inpcb->inp_flags & INP_BOUND_IF) { + tra.ifscope = tp->t_inpcb->inp_boundifp->if_index; + } else { + tra.ifscope = IFSCOPE_NONE; + } + tcp_respond((struct tcpcb*) 0, t_template->tt_ipgen, sizeof(t_template->tt_ipgen), + &t_template->tt_t, (struct mbuf *)NULL, + tpi->th->th_seq + 1, seq, win, flags, tpi->to, mss, rcv_scale, tpi->ts_offset, &tra, true); + (void) m_free(m); + + tcpstat.tcps_sc_sendcookie++; + + return true; + } else { + return false; + } +} + +/* + * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks + * that exceed the capacity of the listen queue by avoiding the storage of any + * of the SYNs we receive. Syncookies defend against blind SYN flooding + * attacks where the attacker does not have access to our responses. + * + * Syncookies encode and include all necessary information about the + * connection setup within the SYN|ACK that we send back. That way we + * can avoid keeping any local state until the ACK to our SYN|ACK returns + * (if ever). + * + * The only reliable information persisting the 3WHS is our initial sequence + * number ISS of 32 bits. Syncookies embed a cryptographically sufficient + * strong hash (MAC) value and a few bits of TCP SYN options in the ISS + * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK + * returns and signifies a legitimate connection if it matches the ACK. + * + * The available space of 32 bits to store the hash and to encode the SYN + * option information is very tight and we should have at least 24 bits for + * the MAC to keep the number of guesses by blind spoofing reasonably high. + * + * SYN option information we have to encode to fully restore a connection: + * MSS: is imporant to chose an optimal segment size to avoid IP level + * fragmentation along the path. The common MSS values can be encoded + * in a 3-bit table. Uncommon values are captured by the next lower value + * in the table leading to a slight increase in packetization overhead. + * WSCALE: is necessary to allow large windows to be used for high delay- + * bandwidth product links. Not scaling the window when it was initially + * negotiated is bad for performance as lack of scaling further decreases + * the apparent available send window. We only need to encode the WSCALE + * we received from the remote end. Our end can be recalculated at any + * time. The common WSCALE values can be encoded in a 3-bit table. + * Uncommon values are captured by the next lower value in the table + * making us under-estimate the available window size halving our + * theoretically possible maximum throughput for that connection. + * SACK: Greatly assists in packet loss recovery and requires 1 bit. + * TIMESTAMP is not encoded because it is a permanent option + * that is included in all segments on a connection. We enable it when + * the ACK has it. + * Accurate ECN is not encoded because the last ACK has enough state to + * determine the state negotiated during SYN/ACK. + * + * Security of syncookies and attack vectors: + * + * The MAC is computed over (faddr||laddr||fport||lport||irs||flags) + * together with the global secret to make it unique per connection attempt. + * Thus any change of any of those parameters results in a different MAC output + * in an unpredictable way unless a collision is encountered. 24 bits of the + * MAC are embedded into the ISS. + * + * To prevent replay attacks two rotating global secrets are updated with a + * new random value every 15 seconds. 
The life-time of a syncookie is thus + * 15-30 seconds. + * + * Vector 1: Attacking the secret. This requires finding a weakness in the + * MAC itself or the way it is used here. The attacker can do a chosen plain + * text attack by varying and testing the all parameters under his control. + * The strength depends on the size and randomness of the secret, and the + * cryptographic security of the MAC function. Due to the constant updating + * of the secret the attacker has at most 29.999 seconds to find the secret + * and launch spoofed connections. After that he has to start all over again. + * + * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC + * size an average of 4,823 attempts are required for a 50% chance of success + * to spoof a single syncookie (birthday collision paradox). However the + * attacker is blind and doesn't know if one of his attempts succeeded unless + * he has a side channel to interfere success from. A single connection setup + * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. + * This many attempts are required for each one blind spoofed connection. For + * every additional spoofed connection he has to launch another N attempts. + * Thus for a sustained rate 100 spoofed connections per second approximately + * 1,800,000 packets per second would have to be sent. + * + * NB: The MAC function should be fast so that it doesn't become a CPU + * exhaustion attack vector itself. + * + * References: + * RFC4987 TCP SYN Flooding Attacks and Common Mitigations + * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 + * http://cr.yp.to/syncookies.html (overview) + * http://cr.yp.to/syncookies/archive (details) + * + * + * Schematic construction of a syncookie enabled Initial Sequence Number: + * 0 1 2 3 + * 12345678901234567890123456789012 + * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| + * + * x 24 MAC (truncated) + * W 3 Send Window Scale index + * M 2 MSS index + * E 1 Classic ECN permitted + * S 1 SACK permitted + * P 1 Odd/even secret + */ +/* + * Distribution and probability of certain MSS values. Those in between are + * rounded down to the next lower one. + */ +static uint16_t tcp_sc_msstab_v4[] = { 536, 1300, 1460, 4036 }; + +static uint16_t tcp_sc_msstab_v6[] = { 1220, 1420, 1440, 4016 }; + +/* + * Distribution and probability of certain WSCALE values. We have to map the + * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 + * bits based on prevalence of certain values. Where we don't have an exact + * match for are rounded down to the next lower one letting us under-estimate + * the true available window. At the moment this would happen only for the + * very uncommon values 2, 5 and those above 9 (more than 32MB socket buffer + * and window size). The absence of the WSCALE option (no scaling in either + * direction) is encoded with index zero. + */ +static uint8_t tcp_sc_wstab[] = { 0, 1, 3, 4, 6, 7, 8, 9 }; + +#define nitems(_x_) (sizeof(_x_) / sizeof(*_x_)) + +/* + * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed + * and good cryptographic properties. 
+ */ +static uint32_t +syncookie_siphash(struct tcp_inp *tpi, uint8_t flags, uint8_t key[SYNCOOKIE_SECRET_SIZE]) +{ + SIPHASH_CTX ctx; + uint32_t siphash[2]; + + SipHash24_Init(&ctx); + SipHash_SetKey(&ctx, key); + if (tpi->isipv6) { + SipHash_Update(&ctx, &tpi->ip6->ip6_src.s6_addr, sizeof(tpi->ip6->ip6_src.s6_addr)); + SipHash_Update(&ctx, &tpi->ip6->ip6_dst.s6_addr, sizeof(tpi->ip6->ip6_dst.s6_addr)); + } else { + SipHash_Update(&ctx, &tpi->ip->ip_src.s_addr, sizeof(tpi->ip->ip_src.s_addr)); + SipHash_Update(&ctx, &tpi->ip->ip_dst.s_addr, sizeof(tpi->ip->ip_dst.s_addr)); + } + + SipHash_Update(&ctx, &tpi->th->th_sport, sizeof(tpi->th->th_sport)); + SipHash_Update(&ctx, &tpi->th->th_dport, sizeof(tpi->th->th_dport)); + SipHash_Update(&ctx, &tpi->irs, sizeof(tpi->irs)); + SipHash_Update(&ctx, &flags, sizeof(flags)); + SipHash_Final((u_int8_t *)&siphash, &ctx); + + tpi->ts_offset = siphash[1]; + + return siphash[0] ^ siphash[1]; +} + +/* + * HMAC with SHA-256 is only used for comparison with Siphash + */ +static uint32_t +syncookie_hmac_sha256(struct tcp_inp *tpi, uint8_t flags, uint8_t key[CCSHA256_OUTPUT_SIZE]) +{ + /* SHA256 mac is 32 bytes */ + uint32_t mac[8] = {}; + const struct ccdigest_info *di = ccsha256_di(); + + cchmac_ctx_decl(di->state_size, di->block_size, ctx); + cchmac_init(di, ctx, CCSHA256_OUTPUT_SIZE, key); + if (tpi->isipv6) { + cchmac_update(di, ctx, sizeof(tpi->ip6->ip6_src.s6_addr), &tpi->ip6->ip6_src.s6_addr); + cchmac_update(di, ctx, sizeof(tpi->ip6->ip6_dst.s6_addr), &tpi->ip6->ip6_dst.s6_addr); + } else { + cchmac_update(di, ctx, sizeof(tpi->ip->ip_src.s_addr), &tpi->ip->ip_src.s_addr); + cchmac_update(di, ctx, sizeof(tpi->ip->ip_dst.s_addr), &tpi->ip->ip_dst.s_addr); + } + cchmac_update(di, ctx, sizeof(tpi->th->th_sport), &tpi->th->th_sport); + cchmac_update(di, ctx, sizeof(tpi->th->th_dport), &tpi->th->th_dport); + cchmac_update(di, ctx, sizeof(tpi->irs), &tpi->irs); + cchmac_update(di, ctx, sizeof(flags), &flags); + cchmac_final(di, ctx, (uint8_t *)mac); + + tpi->ts_offset = mac[1]; + + return mac[0] ^ mac[1] ^ mac[2] ^ mac[3] ^ mac[4] ^ mac[5] ^ mac[6] ^ mac[7]; +} + +static uint32_t +syncookie_mac(struct tcp_inp *tpi, uint8_t flags, uint8_t secbit) +{ + if (tcp_syncookie_hmac_sha256) { + /* key size is 32 bytes */ + return syncookie_hmac_sha256(tpi, flags, (uint8_t *) tcp_syncookie_secret.key); + } else { + /* key size is 16 bytes */ + return syncookie_siphash(tpi, flags, tcp_syncookie_secret.key[secbit]); + } +} + +static tcp_seq +syncookie_generate(struct tcp_inp *tpi, bool has_ecn) +{ + uint8_t i, secbit, peer_wscale = 0; + uint32_t iss, hash; + syncookie cookie; + uint16_t peer_mss = 0; + + cookie.cookie = 0; + + struct tcpopt *to = tpi->to; + + if (to->to_flags & TOF_MSS) { + peer_mss = to->to_mss; /* peer mss may be zero */ + } + if (to->to_flags & TOF_SCALE) { + peer_wscale = to->to_wscale; + } + + /* Map our computed MSS into the 2-bit index. */ + if (tpi->isipv6) { + for (i = nitems(tcp_sc_msstab_v6) - 1; + tcp_sc_msstab_v6[i] > peer_mss && i > 0; + i--) { + ; + } + } else { + for (i = nitems(tcp_sc_msstab_v4) - 1; + tcp_sc_msstab_v4[i] > peer_mss && i > 0; + i--) { + ; + } + } + cookie.flags.mss_idx = i; + /* + * Map the send window scale into the 3-bit index but only if + * the wscale option was received. + */ + if (peer_wscale > 0) { + for (i = nitems(tcp_sc_wstab) - 1; + tcp_sc_wstab[i] > peer_wscale && i > 0; + i--) { + ; + } + cookie.flags.wscale_idx = i; + } + /* Can we do SACK? 
*/ + if (to->to_flags & TOF_SACKPERM) { + cookie.flags.sack_ok = 1; + } + + /* Should we do classic ECN? */ + if (has_ecn) { + cookie.flags.ecn_ok = 1; + } + + /* Which of the two secrets to use. */ + secbit = tcp_syncookie_secret.oddeven & 0x1; + cookie.flags.odd_even = secbit; + tpi->irs = tpi->th->th_seq; + hash = syncookie_mac(tpi, cookie.cookie, secbit); + /* + * Put the flags into the hash and XOR them to get better ISS number + * variance. This doesn't enhance the cryptographic strength and is + * done to prevent the 8 cookie bits from showing up directly on the + * wire. + */ + iss = hash & ~0xff; + iss |= cookie.cookie ^ (hash >> 24); + + tcpstat.tcps_sc_sendcookie++; + + return iss; +} + +/* + * Validate received SYN cookie in th_ack. Returns true on success + * and a false on failure + */ +static bool +syncookie_lookup(struct tcp_inp *tpi) +{ + syncookie cookie; + uint32_t hash; + tcp_seq ack; + /* + * Pull information out of SYN-ACK/ACK and revert sequence number + * advances. + */ + ack = tpi->th->th_ack - 1; + tpi->irs = tpi->th->th_seq - 1; + + /* + * Unpack the flags containing enough information to restore the + * connection. + */ + cookie.cookie = (ack & 0xff) ^ (ack >> 24); + hash = syncookie_mac(tpi, cookie.cookie, cookie.flags.odd_even); + + /* The recomputed hash failed to match the ACK */ + if ((ack & ~0xff) != (hash & ~0xff)) { + return false; + } + if (tpi->isipv6) { + tpi->peer_mss = tcp_sc_msstab_v6[cookie.flags.mss_idx]; + } else { + tpi->peer_mss = tcp_sc_msstab_v4[cookie.flags.mss_idx]; + } + + /* Only use wscale if it was enabled in the orignal SYN. */ + if (cookie.flags.wscale_idx > 0) { + tpi->peer_wscale = tcp_sc_wstab[cookie.flags.wscale_idx]; + } + if (cookie.flags.sack_ok) { + tpi->sackok = true; + } + + if (cookie.flags.ecn_ok) { + tpi->ecnok = true; + } + + tcpstat.tcps_sc_recvcookie++; + return true; +} + +/* + * We reseed when we receive a new connection request if + * last update was done SYNCOOKIE_LIFETIME ago + */ +static void +syncookie_reseed(void) +{ + struct syncookie_secret *secret = &tcp_syncookie_secret; + uint8_t *secbits; + int secbit; + + /* + * Reseeding the secret doesn't have to be protected by a lock. + * It only must be ensured that the new random values are visible + * to all CPUs in a SMP environment. The atomic with release + * semantics ensures that. + */ + secbit = (secret->oddeven & 0x1) ? 0 : 1; + secbits = secret->key[secbit]; + read_frandom(secbits, SYNCOOKIE_SECRET_SIZE); + os_atomic_add(&secret->oddeven, 1, relaxed); + + tcp_syncookie_secret.last_updated = tcp_now; +} + +void +tcp_syncookie_init() +{ + /* Init syncookie secret */ + read_frandom(tcp_syncookie_secret.key[0], SYNCOOKIE_SECRET_SIZE); + read_frandom(tcp_syncookie_secret.key[1], SYNCOOKIE_SECRET_SIZE); + tcp_syncookie_secret.last_updated = tcp_now; +} diff --git a/bsd/nfs/nfsdiskless.h b/bsd/netinet/tcp_syncookie.h similarity index 63% rename from bsd/nfs/nfsdiskless.h rename to bsd/netinet/tcp_syncookie.h index 3ed0195e1..08da917a8 100644 --- a/bsd/nfs/nfsdiskless.h +++ b/bsd/netinet/tcp_syncookie.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,12 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. 
+ +/*- + * SPDX-License-Identifier: BSD-3-Clause * - * This code is derived from software contributed to Berkeley by - * Rick Macklem at The University of Guelph. + * Copyright (c) 1982, 1986, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -41,11 +40,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -60,34 +55,49 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * @(#)nfsdiskless.h 8.2 (Berkeley) 3/30/95 - * FreeBSD-Id: nfsdiskless.h,v 1.10 1997/09/07 12:56:46 bde Exp $ */ +#ifndef _NETINET_TCP_SYNCOOKIE_H_ +#define _NETINET_TCP_SYNCOOKIE_H_ -#ifndef _NFS_NFSDISKLESS_H_ -#define _NFS_NFSDISKLESS_H_ +#include +#include -#include +#ifdef KERNEL_PRIVATE -#ifdef __APPLE_API_PRIVATE +void tcp_syncookie_init(void); +void tcp_syncookie_syn(struct tcp_inp *tpi, struct sockaddr *local, struct sockaddr *remote); +bool tcp_syncookie_ack(struct tcp_inp *tpi, struct socket **so2, int* dropsocket); -struct nfs_dlmount { - struct sockaddr_in ndm_saddr; /* Address of file server */ - char ndm_host[MAXHOSTNAMELEN];/* Host name for mount pt */ - char *ndm_path; /* path name for mount pt */ - char *ndm_mntfrom; /* mntfromname for mount pt */ - u_int32_t ndm_nfsv3; /* NFSv3 or NFSv2? */ - u_int32_t ndm_sotype; /* SOCK_STREAM or SOCK_DGRAM? 
*/ - u_int32_t ndm_fhlen; /* length of file handle */ - u_char ndm_fh[NFSX_V3FHMAX]; /* The file's file handle */ +/* + * Flags for the Accurate ECN setup + */ +#define SC_ECN_SETUP 0x01 /* send classic ECN setup */ +#define SC_ACE_SETUP_NOT_ECT 0x02 /* send ACE not-ECT setup */ +#define SC_ACE_SETUP_ECT1 0x04 /* send ACE ECT1 setup */ +#define SC_ACE_SETUP_ECT0 0x08 /* send ACE ECT0 setup */ +#define SC_ACE_SETUP_CE 0x10 /* send ACE CE setup */ + + +#define SYNCOOKIE_SECRET_SIZE 16 +#define SYNCOOKIE_LIFETIME 15 /* seconds */ + +struct syncookie_secret { + volatile u_int oddeven; + uint8_t key[2][SYNCOOKIE_SECRET_SIZE]; + uint32_t last_updated; }; -struct nfs_diskless { - struct nfs_dlmount nd_root; /* Mount info for root */ - struct nfs_dlmount nd_private; /* Mount info for private */ -}; +typedef union { + uint8_t cookie; + struct { + uint8_t odd_even:1, + sack_ok:1, + ecn_ok:1, /* Only needed for classic ECN */ + wscale_idx:3, + mss_idx:2; + } flags; +} syncookie; +#endif /* KERNEL_PRIVATE */ -#endif /* __APPLE_API_PRIVATE */ -#endif /* _NFS_NFSDISKLESS_H_ */ +#endif /* _NETINET_TCP_SYNCOOKIE_H_ */ diff --git a/bsd/netinet/tcp_sysctls.c b/bsd/netinet/tcp_sysctls.c index ef3e569c5..981bb68a0 100644 --- a/bsd/netinet/tcp_sysctls.c +++ b/bsd/netinet/tcp_sysctls.c @@ -42,9 +42,11 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_fast_convergence, CTLFLAG_RW | CTLFLAG_LOCK SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_use_minrtt, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_cubic_use_minrtt, 0, "use a min of 5 sec rtt"); +/* TODO - remove once uTCP stops using it */ SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_minor_fixes, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_cubic_minor_fixes, 1, "Minor fixes to TCP Cubic"); +/* TODO - remove once uTCP stops using it */ SYSCTL_SKMEM_TCP_INT(OID_AUTO, cubic_rfc_compliant, CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_cubic_rfc_compliant, 1, "RFC Compliance for TCP Cubic"); @@ -76,7 +78,7 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, bg_target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, * 'Allowed_increase' parameter is set to 8. If the flight size is zero, then * we want the congestion window to be at least 8 packets to reduce the * delay induced by delayed ack. This helps when the receiver is acking - * more than 2 packets at a time (stretching acks for better performance). + * more than 2 packets at a time. * * 'Tether' is also set to 2. We do not want this to limit the growth of cwnd * during slow-start. diff --git a/bsd/netinet/tcp_sysctls.h b/bsd/netinet/tcp_sysctls.h index 69d3bb3e1..16de3e1f0 100644 --- a/bsd/netinet/tcp_sysctls.h +++ b/bsd/netinet/tcp_sysctls.h @@ -34,7 +34,9 @@ extern int tcp_cubic_tcp_friendliness; extern int tcp_cubic_fast_convergence; extern int tcp_cubic_use_minrtt; +/* TODO - remove once uTCP stops using it */ extern int tcp_cubic_minor_fixes; +/* TODO - remove once uTCP stops using it */ extern int tcp_cubic_rfc_compliant; extern int tcp_rack; diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index e08d7cac1..6dce87da5 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -105,9 +105,6 @@ #include #include -/* Max number of times a stretch ack can be delayed on a connection */ -#define TCP_STRETCHACK_DELAY_THRESHOLD 5 - /* * If the host processor has been sleeping for too long, this is the threshold * used to avoid sending stale retransmissions. 
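[Editor's note: illustrative sketch, not part of the patch.] The SYN-cookie machinery above is stateless: the listener encodes the negotiated parameters (MSS table index, window-scale index, SACK/ECN bits and which of the two rotating secrets was used) into the 8-bit syncookie union, hides it in the low byte of the ISS, and lets the upper 24 bits come from a keyed MAC over the 4-tuple (SipHash-2-4 by default; HMAC-SHA-256 exists only for comparison). The stand-alone sketch below mirrors just the encode/decode arithmetic from syncookie_generate() and syncookie_lookup(); mock_hash stands in for syncookie_mac(), and the field values are arbitrary examples.

/*
 * Sketch of the cookie <-> ISS packing used above. "mock_hash" stands in
 * for syncookie_mac(); the real code feeds the 4-tuple, IRS and flags
 * through SipHash keyed with one of the two rotating secrets.
 */
#include <stdint.h>
#include <stdio.h>

typedef union {
	uint8_t cookie;
	struct {
		uint8_t odd_even:1,
		    sack_ok:1,
		    ecn_ok:1,
		    wscale_idx:3,
		    mss_idx:2;
	} flags;
} syncookie_t;

static uint32_t
encode_iss(syncookie_t c, uint32_t hash)
{
	/* Upper 24 bits come from the MAC; the low byte carries the cookie,
	 * XORed with hash bits so it never appears verbatim on the wire. */
	uint32_t iss = hash & ~0xffu;
	iss |= (uint32_t)(c.cookie ^ (uint8_t)(hash >> 24));
	return iss;
}

static int
decode_ack(uint32_t ack_minus_1, uint32_t hash, syncookie_t *out)
{
	out->cookie = (uint8_t)((ack_minus_1 & 0xff) ^ (ack_minus_1 >> 24));
	/* Validation: the recomputed MAC must match the upper 24 bits. */
	return (ack_minus_1 & ~0xffu) == (hash & ~0xffu);
}

int
main(void)
{
	uint32_t mock_hash = 0xdeadbeef;   /* placeholder for syncookie_mac() */
	syncookie_t c = { .flags = { .odd_even = 1, .sack_ok = 1,
	                             .ecn_ok = 0, .wscale_idx = 5, .mss_idx = 2 } };

	/* The peer ACKs iss+1; syncookie_lookup() works on th_ack - 1, i.e. iss. */
	uint32_t iss = encode_iss(c, mock_hash);
	syncookie_t back;
	int ok = decode_ack(iss, mock_hash, &back);

	printf("iss=0x%08x valid=%d mss_idx=%d wscale_idx=%d sack=%d\n",
	    (unsigned)iss, ok, back.flags.mss_idx, back.flags.wscale_idx,
	    back.flags.sack_ok);
	return 0;
}

Because the low byte is XORed with hash bits before it goes on the wire, the cookie flags are never visible directly in the ISS, which is exactly the point made in the comment in syncookie_generate() above.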
@@ -448,7 +445,7 @@ struct tcp_last_report_stats { static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay); static boolean_t tcp_garbage_collect(struct inpcb *, int); -#define TIMERENTRY_TO_TP(te) (__unsafe_forge_single(struct tcpcb *, ((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))) +#define TIMERENTRY_TO_TP(te) (__unsafe_forge_single(struct tcpcb *, ((uintptr_t)te - offsetof(struct tcpcb, tentry.te_le.le_next)))) #define VERIFY_NEXT_LINK(elm, field) do { \ if (LIST_NEXT((elm),field) != NULL && \ @@ -853,11 +850,6 @@ tcp_gc(struct inpcbinfo *ipi) lck_rw_done(&ipi->ipi_lock); - /* Clean up the socache while we are here */ - if (so_cache_timer()) { - os_atomic_inc(&ipi->ipi_gc_req.intimer_lazy, relaxed); - } - KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot, 0, 0, 0); @@ -876,14 +868,14 @@ tcp_canceltimers(struct tcpcb *tp) for (i = 0; i < TCPT_NTIMERS; i++) { tp->t_timer[i] = 0; } - tp->tentry.timer_start = tcp_now; - tp->tentry.index = TCPT_NONE; + tp->tentry.te_timer_start = tcp_now; + tp->tentry.te_index = TCPT_NONE; } -int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = +static int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; -int tcp_backoff[TCP_MAXRXTSHIFT + 1] = +int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ @@ -995,7 +987,7 @@ tcp_send_keep_alive(struct tcpcb *tp) struct mbuf *__single m; tcpstat.tcps_keepprobe++; - t_template = tcp_maketemplate(tp, &m); + t_template = tcp_maketemplate(tp, &m, NULL, NULL); if (t_template != NULL) { struct inpcb *inp = tp->t_inpcb; struct tcp_respond_args tra; @@ -1015,7 +1007,7 @@ tcp_send_keep_alive(struct tcpcb *tp) } tcp_respond(tp, t_template->tt_ipgen, sizeof(t_template->tt_ipgen), &t_template->tt_t, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una - 1, 0, &tra); + tp->rcv_nxt, tp->snd_una - 1, 0, 0, NULL, 0, 0, 0, &tra, false); (void) m_free(m); return true; } else { @@ -1052,7 +1044,7 @@ tcp_timers(struct tcpcb *tp, int timer) if (tp->t_state != TCPS_TIME_WAIT && tp->t_state != TCPS_FIN_WAIT_2 && ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) { - tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_2MSL] = tcp_offset_from_start(tp, (u_int32_t)TCP_CONN_KEEPINTVL(tp)); } else { if (tp->t_state == TCPS_FIN_WAIT_2) { @@ -1117,24 +1109,11 @@ tcp_timers(struct tcpcb *tp, int timer) } else { tcpstat.tcps_timeoutdrop++; } - if (tp->t_rxtshift >= TCP_MAXRXTSHIFT) { - if (TCP_ECN_ENABLED(tp)) { - INP_INC_IFNET_STAT(tp->t_inpcb, - ecn_on.rxmit_drop); - } else { - INP_INC_IFNET_STAT(tp->t_inpcb, - ecn_off.rxmit_drop); - } - } + tp->t_rxtshift = TCP_MAXRXTSHIFT; soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT)); - if (TCP_ECN_ENABLED(tp) && - tp->t_state == TCPS_ESTABLISHED) { - tcp_heuristic_ecn_droprxmt(tp); - } - TCP_LOG_DROP_PCB(NULL, NULL, tp, false, "retransmission timeout drop"); tp = tcp_drop(tp, tp->t_softerror ? @@ -1258,9 +1237,16 @@ retransmit_packet: if ((tcp_link_heuristics_flags & TCP_LINK_HEUR_SYNRMXT) != 0 && if_link_heuristics_enabled(outifp)) { IF_TCP_STATINC(outifp, linkheur_synrxmt); - rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + /* + * The following increases the RTO by the expected backoff. + * + * We don't want to use TCP_REXMTVAL() as that would take + * the SRTT into account. But, we are in SYN_SENT state and + * thus don't have an SRTT. 
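[Editor's note: illustrative sketch, not part of the patch.] As the comment notes, there is no SRTT yet in SYN_SENT, so the expression just below grows t_rxtcur by the same factor the backoff table grows between consecutive entries: (tab[i] - tab[i-1]) / tab[i-1] is 1 wherever the table doubles and 0 where it stays flat, so the shift either doubles the current RTO or leaves it unchanged. A quick stand-alone check of that arithmetic against the two tables defined above:

/* Effective per-shift multiplier implied by the backoff tables
 * (values copied from this hunk). */
#include <stdio.h>

static const int tcp_syn_backoff[] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
static const int tcp_backoff[]     = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static void
dump(const char *name, const int *tab, int n)
{
	printf("%s:", name);
	for (int i = 1; i < n; i++) {
		/* The same expression the kernel uses to grow t_rxtcur. */
		int shift = (tab[i] - tab[i - 1]) / tab[i - 1];
		printf(" x%d", 1 << shift);
	}
	printf("\n");
}

int
main(void)
{
	dump("syn_backoff", tcp_syn_backoff,
	    (int)(sizeof(tcp_syn_backoff) / sizeof(tcp_syn_backoff[0])));
	dump("backoff    ", tcp_backoff,
	    (int)(sizeof(tcp_backoff) / sizeof(tcp_backoff[0])));
	return 0;
}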
+ */ + rexmt = tp->t_rxtcur << ((tcp_backoff[tp->t_rxtshift] - tcp_backoff[tp->t_rxtshift - 1]) / tcp_backoff[tp->t_rxtshift - 1]); } else { - rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; + rexmt = tp->t_rxtcur << ((tcp_syn_backoff[tp->t_rxtshift] - tcp_syn_backoff[tp->t_rxtshift - 1]) / tcp_syn_backoff[tp->t_rxtshift - 1]); } tp->t_stat.synrxtshift = tp->t_rxtshift; tp->t_stat.rxmitsyns++; @@ -1277,8 +1263,7 @@ retransmit_packet: TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX, TCP_ADD_REXMTSLOP(tp)); - tcp_set_link_heur_rtomin(tp, outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); + tcp_set_rto(tp); TCP_LOG_RTT_INFO(tp); @@ -1420,7 +1405,7 @@ retransmit_packet: * right after Fast Retransmits and ECE * notification receipts. */ - if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) { + if (!tp->accurate_ecn_on && TCP_ECN_ENABLED(tp)) { tp->ecn_flags |= TE_SENDCWR; } } @@ -1521,10 +1506,10 @@ fc_output: TCP_LOG_KEEP_ALIVE(tp, idle_time); } - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_CONN_KEEPINTVL(tp)); } else { - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_CONN_KEEPIDLE(tp)); } if (tp->t_flagsext & TF_DETECT_READSTALL) { @@ -1556,7 +1541,7 @@ fc_output: if (reenable_probe) { int ind = min(tp->t_rtimo_probes, TCP_MAXRXTSHIFT); - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START( + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start( tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)); } } @@ -1573,7 +1558,7 @@ fc_output: * timeout slower than regular keepalive due to the * backing off. */ - tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START( + tp->t_timer[TCPT_KEEP] = min(tcp_offset_from_start( tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)), tp->t_timer[TCPT_KEEP]); } else if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && @@ -1596,35 +1581,6 @@ fc_output: tp->t_timer[TCPT_DELACK] = 0; tp->t_flags |= TF_ACKNOW; - /* - * If delayed ack timer fired while stretching - * acks, count the number of times the streaming - * detection was not correct. If this exceeds a - * threshold, disable strech ack on this - * connection - * - * Also, go back to acking every other packet. 
- */ - if ((tp->t_flags & TF_STRETCHACK)) { - if (tp->t_unacksegs > 1 && - tp->t_unacksegs < maxseg_unacked) { - tp->t_stretchack_delayed++; - } - - if (tp->t_stretchack_delayed > - TCP_STRETCHACK_DELAY_THRESHOLD) { - tp->t_flagsext |= TF_DISABLE_STRETCHACK; - /* - * Note the time at which stretch - * ack was disabled automatically - */ - tp->rcv_nostrack_ts = tcp_now; - tcpstat.tcps_nostretchack++; - tp->t_stretchack_delayed = 0; - tp->rcv_nostrack_pkts = 0; - } - tcp_reset_stretch_ack(tp); - } tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; /* @@ -1682,7 +1638,7 @@ fc_output: } if (mpte->mpte_cellicon_increments) { - tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE); + tp->t_timer[TCPT_CELLICON] = tcp_offset_from_start(tp, MPTCP_CELLICON_TOGGLE_RATE); } break; @@ -1829,17 +1785,7 @@ fc_output: if (tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0 && (tp->t_inpcb->inp_socket->so_snd.sb_cc != 0 || tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED)) { - tcp_set_link_heur_rtomin(tp, outifp); - tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); - - os_log(OS_LOG_DEFAULT, - "%s: tcp_output() returned %u with retransmission timer disabled " - "for %u > %u in state %d, reset timer to %d", - __func__, ret, - ntohs(tp->t_inpcb->inp_lport), - ntohs(tp->t_inpcb->inp_fport), - tp->t_state, - tp->t_timer[TCPT_REXMT]); + tcp_set_rto(tp); tcp_check_timer_state(tp); } @@ -1870,7 +1816,7 @@ fc_output: tcp_rexmt_save_state(tp); if (CC_ALGO(tp)->pre_fr != NULL) { CC_ALGO(tp)->pre_fr(tp); - if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) { + if (!tp->accurate_ecn_on && TCP_ECN_ENABLED(tp)) { tp->ecn_flags |= TE_SENDCWR; } } @@ -1912,16 +1858,28 @@ tcp_remove_timer(struct tcpcb *tp) lck_mtx_lock(&listp->mtx); if (listp->next_te != NULL && listp->next_te == &tp->tentry) { - listp->next_te = LIST_NEXT(&tp->tentry, le); + listp->next_te = LIST_NEXT(&tp->tentry, te_le); + } + + LIST_REMOVE(&tp->tentry, te_le); + /* + * The use count has been incremented when the PCB + * was placed on the timer list, and needs to be decremented. + * As a safety precaution, we are checking against underflow. + */ + if (__improbable(tp->t_inpcb->inp_socket->so_usecount == 0)) { + TCP_LOG(tp, "%s: inpcb socket so_usecount underflow " + " when removing timer entry\n", __func__); + } else { + tp->t_inpcb->inp_socket->so_usecount--; } - LIST_REMOVE(&tp->tentry, le); tp->t_flags &= ~(TF_TIMER_ONLIST); listp->entries--; - tp->tentry.le.le_next = NULL; - tp->tentry.le.le_prev = NULL; + tp->tentry.te_le.le_next = NULL; + tp->tentry.te_le.le_prev = NULL; lck_mtx_unlock(&listp->mtx); } @@ -2051,17 +2009,17 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, * with another thread that can cancel or reschedule the timer * that is about to run. Check if we need to run anything. */ - if ((index = tp->tentry.index) == TCPT_NONE) { + if ((index = tp->tentry.te_index) == TCPT_NONE) { goto done; } timer_val = tp->t_timer[index]; - diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0); + diff = timer_diff(tp->tentry.te_runtime, 0, tcp_now, 0); if (diff > 0) { - if (tp->tentry.index != TCPT_NONE) { + if (tp->tentry.te_index != TCPT_NONE) { offset = diff; - *(te_mode) = tp->tentry.mode; + *(te_mode) = tp->tentry.te_mode; } goto done; } @@ -2078,10 +2036,10 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, * Check if there are any other timers that need to be run. * While doing it, adjust the timer values wrt tcp_now. 
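[Editor's note: illustrative sketch, not part of the patch.] The comment above introduces the rescan that follows: once the due timer has been serviced, tcp_run_conn_timer() walks every per-connection timer, finds the smallest positive offset relative to te_timer_start, and records it in te_index/te_runtime so the global list knows when to come back. A minimal user-space sketch of that scan; timer_diff() is approximated here as a wrap-safe signed difference, and the timer counts and values are made up rather than the kernel's:

/* Sketch of the lowest-timer scan; NTIMERS/T_NONE and the values below
 * are illustrative stand-ins. */
#include <stdint.h>
#include <stdio.h>

#define NTIMERS 5
#define T_NONE  NTIMERS

/* Assumed shape of timer_diff(): signed offset (t1+toff1) - (t2+toff2). */
static int32_t
timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2)
{
	return (int32_t)((t1 + toff1) - (t2 + toff2));
}

int
main(void)
{
	uint32_t now = 1000, timer_start = 900;
	uint32_t t_timer[NTIMERS] = { 0, 250, 0, 180, 400 };   /* 0 = disarmed */
	int lo_index = T_NONE;
	int32_t lo_diff = 0;

	for (int i = 0; i < NTIMERS; i++) {
		if (t_timer[i] == 0)
			continue;
		int32_t diff = timer_diff(timer_start, t_timer[i], now, 0);
		if (diff <= 0) {
			printf("timer %d already due\n", i);
		} else if (lo_index == T_NONE || diff < lo_diff) {
			lo_diff = diff;
			lo_index = i;
		}
	}
	if (lo_index != T_NONE)
		printf("next timer: index %d, fires in %d\n", lo_index, (int)lo_diff);
	return 0;
}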
*/ - tp->tentry.mode = 0; + tp->tentry.te_mode = 0; for (i = 0; i < TCPT_NTIMERS; ++i) { if (tp->t_timer[i] != 0) { - diff = timer_diff(tp->tentry.timer_start, + diff = timer_diff(tp->tentry.te_timer_start, tp->t_timer[i], tcp_now, 0); if (diff <= 0) { needtorun[i] = TRUE; @@ -2093,20 +2051,20 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, lo_timer = diff; lo_index = i; } - TCP_SET_TIMER_MODE(tp->tentry.mode, i); + TCP_SET_TIMER_MODE(tp->tentry.te_mode, i); } } } - tp->tentry.timer_start = tcp_now; - tp->tentry.index = lo_index; - VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0); + tp->tentry.te_timer_start = tcp_now; + tp->tentry.te_index = lo_index; + VERIFY(tp->tentry.te_index == TCPT_NONE || tp->tentry.te_mode > 0); - if (tp->tentry.index != TCPT_NONE) { - tp->tentry.runtime = tp->tentry.timer_start + - tp->t_timer[tp->tentry.index]; - if (tp->tentry.runtime == 0) { - tp->tentry.runtime++; + if (tp->tentry.te_index != TCPT_NONE) { + tp->tentry.te_runtime = tp->tentry.te_timer_start + + tp->t_timer[tp->tentry.te_index]; + if (tp->tentry.te_runtime == 0) { + tp->tentry.te_runtime++; } } @@ -2126,13 +2084,13 @@ tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode, tcp_set_lotimer_index(tp); } - if (tp->tentry.index < TCPT_NONE) { - offset = tp->t_timer[tp->tentry.index]; - *(te_mode) = tp->tentry.mode; + if (tp->tentry.te_index < TCPT_NONE) { + offset = tp->t_timer[tp->tentry.te_index]; + *(te_mode) = tp->tentry.te_mode; } done: - if (tp != NULL && tp->tentry.index == TCPT_NONE) { + if (tp != NULL && tp->tentry.te_index == TCPT_NONE) { tcp_remove_timer(tp); offset = 0; } @@ -2141,22 +2099,9 @@ done: return offset; } -void -tcp_run_timerlist(void * arg1, void * arg2) +static void +tcp_timer_update_drift_stats(struct tcptimerlist *listp) { -#pragma unused(arg1, arg2) - struct tcptimerentry *te, *__single next_te; - struct tcptimerlist *__single listp = &tcp_timer_list; - struct tcpcb *__single tp; - uint32_t next_timer = 0; /* offset of the next timer on the list */ - u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */ - u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */ - uint32_t active_count = 0; - - calculate_tcp_clock(); - - lck_mtx_lock(&listp->mtx); - int32_t drift = tcp_now - listp->runtime; if (drift <= 1) { tcpstat.tcps_timer_drift_le_1_ms++; @@ -2177,25 +2122,55 @@ tcp_run_timerlist(void * arg1, void * arg2) } else { tcpstat.tcps_timer_drift_gt_1000_ms++; } +} +void +tcp_run_timerlist(void * arg1, void * arg2) +{ +#pragma unused(arg1, arg2) + struct tcptimerentry *te, *__single next_te; + struct tcptimerlist *__single listp = &tcp_timer_list; + struct tcpcb *__single tp; + uint32_t next_timer = 0; /* offset of the next timer on the list */ + u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */ + u_int16_t list_mode = 0; /* cumulative of modes of all tcpcbs */ + uint32_t num_entries; + + calculate_tcp_clock(); + + lck_mtx_lock(&listp->mtx); + + tcp_timer_update_drift_stats(listp); + + listp->started_at = tcp_now; + + num_entries = listp->entries; listp->running = TRUE; + listp->processed_count = 0; - LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) { + LIST_FOREACH_SAFE(te, &listp->lhead, te_le, next_te) { uint32_t offset = 0; - uint32_t runtime = te->runtime; + uint32_t runtime = te->te_runtime; tp = TIMERENTRY_TO_TP(te); + listp->processed_count++; + if (listp->processed_count > num_entries) { + os_log(OS_LOG_DEFAULT, "tcp_run_timerlist done: processed_count %u > num_entries %u current %u", + 
listp->processed_count, num_entries, listp->entries); + break; + } + /* * An interface probe may need to happen before the previously scheduled runtime */ - if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now) && + if (te->te_index < TCPT_NONE && TSTMP_GT(runtime, tcp_now) && !TCP_IF_STATE_CHANGED(tp, listp->probe_if_index)) { offset = timer_diff(runtime, 0, tcp_now, 0); if (next_timer == 0 || offset < next_timer) { next_timer = offset; } - list_mode |= te->mode; + list_mode |= te->te_mode; continue; } @@ -2216,7 +2191,6 @@ tcp_run_timerlist(void * arg1, void * arg2) lck_mtx_lock(&listp->mtx); continue; } - active_count++; /* * Store the next timerentry pointer before releasing the @@ -2226,8 +2200,8 @@ tcp_run_timerlist(void * arg1, void * arg2) */ listp->next_te = next_te; - VERIFY_NEXT_LINK(&tp->tentry, le); - VERIFY_PREV_LINK(&tp->tentry, le); + VERIFY_NEXT_LINK(&tp->tentry, te_le); + VERIFY_PREV_LINK(&tp->tentry, te_le); lck_mtx_unlock(&listp->mtx); @@ -2303,6 +2277,8 @@ tcp_run_timerlist(void * arg1, void * arg2) listp->pref_mode = 0; listp->pref_offset = 0; listp->probe_if_index = 0; + listp->started_at = 0; + listp->processed_count = 0; lck_mtx_unlock(&listp->mtx); } @@ -2315,8 +2291,8 @@ void tcp_sched_timers(struct tcpcb *tp) { struct tcptimerentry *te = &tp->tentry; - u_int16_t index = te->index; - u_int16_t mode = te->mode; + u_int16_t index = te->te_index; + u_int16_t mode = te->te_mode; struct tcptimerlist *listp = &tcp_timer_list; int32_t offset = 0; boolean_t list_locked = FALSE; @@ -2339,7 +2315,7 @@ tcp_sched_timers(struct tcpcb *tp) * compute the offset at which the next timer for this connection * has to run. */ - offset = timer_diff(te->runtime, 0, tcp_now, 0); + offset = timer_diff(te->te_runtime, 0, tcp_now, 0); if (offset <= 0) { offset = 1; tcp_timer_advanced++; @@ -2352,7 +2328,16 @@ tcp_sched_timers(struct tcpcb *tp) } if (!TIMER_IS_ON_LIST(tp)) { - LIST_INSERT_HEAD(&listp->lhead, te, le); + /* + * Adding the timer entry should constitute an incresed socket use count, + * otherwise the socket use count may reach zero while being referenced + * via the timer entry. If this happens, the timer service routine + * will run into an UAF (use after free) when attempting + * to get the related protocol control block. + */ + tp->t_inpcb->inp_socket->so_usecount++; + + LIST_INSERT_HEAD(&listp->lhead, te, te_le); tp->t_flags |= TF_TIMER_ONLIST; listp->entries++; @@ -2371,7 +2356,7 @@ tcp_sched_timers(struct tcpcb *tp) * Timer entry is currently on the list, check if the list needs * to be rescheduled. 
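[Editor's note: illustrative sketch, not part of the patch.] The so_usecount++ added above pairs with the decrement (and its underflow check) added to tcp_remove_timer() earlier in this file: the timer list now holds its own reference on the socket for as long as the tcpcb is linked, so the timer service routine cannot race a final socket release and dereference a freed pcb. A toy sketch of that retain/release discipline; the sock type below is a stand-in, not the kernel's struct socket:

/* Toy retain/release pairing for timer-list membership. */
#include <stdbool.h>
#include <stdio.h>

struct sock {
	int usecount;
	bool on_timer_list;
};

static void
timer_list_insert(struct sock *so)
{
	if (so->on_timer_list)
		return;
	so->usecount++;            /* reference held by the timer list */
	so->on_timer_list = true;
}

static void
timer_list_remove(struct sock *so)
{
	if (!so->on_timer_list)
		return;
	if (so->usecount == 0) {
		/* Mirrors the kernel's defensive underflow check. */
		printf("usecount underflow when removing timer entry\n");
	} else {
		so->usecount--;
	}
	so->on_timer_list = false;
}

int
main(void)
{
	struct sock so = { .usecount = 1, .on_timer_list = false };

	timer_list_insert(&so);    /* 1 -> 2 */
	timer_list_remove(&so);    /* 2 -> 1 */
	printf("usecount=%d\n", so.usecount);
	return 0;
}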
*/ - if (need_to_resched_timerlist(te->runtime, mode)) { + if (need_to_resched_timerlist(te->te_runtime, mode)) { tcp_resched_timerlist++; if (!list_locked) { @@ -2379,8 +2364,8 @@ tcp_sched_timers(struct tcpcb *tp) list_locked = TRUE; } - VERIFY_NEXT_LINK(te, le); - VERIFY_PREV_LINK(te, le); + VERIFY_NEXT_LINK(te, te_le); + VERIFY_PREV_LINK(te, te_le); if (listp->running) { listp->pref_mode |= mode; @@ -2450,15 +2435,15 @@ tcp_set_lotimer_index(struct tcpcb *tp) } } } - tp->tentry.index = lo_index; - tp->tentry.mode = mode; - VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0); + tp->tentry.te_index = lo_index; + tp->tentry.te_mode = mode; + VERIFY(tp->tentry.te_index == TCPT_NONE || tp->tentry.te_mode > 0); - if (tp->tentry.index != TCPT_NONE) { - tp->tentry.runtime = tp->tentry.timer_start - + tp->t_timer[tp->tentry.index]; - if (tp->tentry.runtime == 0) { - tp->tentry.runtime++; + if (tp->tentry.te_index != TCPT_NONE) { + tp->tentry.te_runtime = tp->tentry.te_timer_start + + tp->t_timer[tp->tentry.te_index]; + if (tp->tentry.te_runtime == 0) { + tp->tentry.te_runtime++; } } } @@ -2594,12 +2579,11 @@ tcp_report_stats(void) (uint32_t)((var * 100) / tcpstat.tcps_sndpack); } - if (tcp_ecn_outbound == 1) { + if (tcp_ecn == 1) { stat.ecn_client_enabled = 1; - } - if (tcp_ecn_inbound == 1) { stat.ecn_server_enabled = 1; } + tcp_cumulative_stat(tcpstat.tcps_connattempt, &prev.tcps_connattempt, &stat.connection_attempts); tcp_cumulative_stat(tcpstat.tcps_accepts, @@ -2807,24 +2791,24 @@ tcp_enable_read_probe(struct tcpcb *tp, struct ifnet *ifp) tp->t_rtimo_probes == 0) { tp->t_flagsext |= TF_DETECT_READSTALL; tp->t_rtimo_probes = 0; - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_TIMER_10MS_QUANTUM); - if (tp->tentry.index == TCPT_NONE) { - tp->tentry.index = TCPT_KEEP; - tp->tentry.runtime = tcp_now + + if (tp->tentry.te_index == TCPT_NONE) { + tp->tentry.te_index = TCPT_KEEP; + tp->tentry.te_runtime = tcp_now + TCP_TIMER_10MS_QUANTUM; } else { int32_t diff = 0; /* Reset runtime to be in next 10ms */ - diff = timer_diff(tp->tentry.runtime, 0, + diff = timer_diff(tp->tentry.te_runtime, 0, tcp_now, TCP_TIMER_10MS_QUANTUM); if (diff > 0) { - tp->tentry.index = TCPT_KEEP; - tp->tentry.runtime = tcp_now + + tp->tentry.te_index = TCPT_KEEP; + tp->tentry.te_runtime = tcp_now + TCP_TIMER_10MS_QUANTUM; - if (tp->tentry.runtime == 0) { - tp->tentry.runtime++; + if (tp->tentry.te_runtime == 0) { + tp->tentry.te_runtime++; } } } @@ -3054,9 +3038,22 @@ tcp_itimer(struct inpcbinfo *ipi) lck_rw_done(&ipi->ipi_lock); } -void -tcp_set_link_heur_rtomin(struct tcpcb *tp, ifnet_t ifp) +static uint32_t +tcp_offset_from_latest_tx(const struct tcpcb *tp, uint32_t offset) { + if (TSTMP_GT(tp->t_latest_tx, tcp_now)) { + return _tcp_offset_from_start(tp, offset, tp->t_latest_tx); + } else { + return _tcp_offset_from_start(tp, offset, tcp_now); + } +} + + +void +tcp_set_rto(struct tcpcb *tp) +{ + struct ifnet *ifp = tp->t_inpcb->inp_last_outifp; + if ((tcp_link_heuristics_flags & TCP_LINK_HEUR_RTOMIN) != 0 && ifp != NULL && if_link_heuristics_enabled(ifp)) { if (tp->t_rxtcur < tcp_link_heuristics_rto_min) { @@ -3064,4 +3061,54 @@ tcp_set_link_heur_rtomin(struct tcpcb *tp, ifnet_t ifp) tp->t_rxtcur = tcp_link_heuristics_rto_min; } } + + tp->t_timer[TCPT_REXMT] = tcp_offset_from_latest_tx(tp, tp->t_rxtcur); +} + +void +tcp_set_pto(struct tcpcb *tp) +{ + uint32_t pto, srtt; + struct ifnet *ifp; + + /* + * Set tail loss probe timeout if new data is being + * 
transmitted. This will be supported only when + * SACK option is enabled on a connection. + * + * Every time new data is sent PTO will get reset. + */ + if (tp->t_state != TCPS_ESTABLISHED || + !SACK_ENABLED(tp) || IN_FASTRECOVERY(tp) || + tp->snd_nxt != tp->snd_max || + SEQ_LEQ(tp->snd_nxt, tp->snd_una) || + tp->t_rxtshift != 0 || + (tp->t_flagsext & (TF_SENT_TLPROBE | TF_PKTS_REORDERED)) != 0) { + return; + } + + ifp = tp->t_inpcb->inp_last_outifp; + + /* + * Don't use TLP on congested link + */ + if ((tcp_link_heuristics_flags & TCP_LINK_HEUR_NOTLP) != 0 && + if_link_heuristics_enabled(ifp)) { + return; + } + + srtt = tp->t_srtt >> TCP_RTT_SHIFT; + pto = 2 * srtt; + if ((tp->snd_max - tp->snd_una) <= tp->t_maxseg) { + pto += tcp_delack; + } else { + pto += 2; + } + + /* if RTO is less than PTO, choose RTO instead */ + if (tp->t_rxtcur < pto) { + pto = tp->t_rxtcur; + } + + tp->t_timer[TCPT_PTO] = tcp_offset_from_latest_tx(tp, pto); } diff --git a/bsd/netinet/tcp_timer.h b/bsd/netinet/tcp_timer.h index fbe482de8..5d221c5b5 100644 --- a/bsd/netinet/tcp_timer.h +++ b/bsd/netinet/tcp_timer.h @@ -191,23 +191,6 @@ extern int tcptv_persmin_val; #define TCPTV_FINWAIT2 ( 60*TCP_RETRANSHZ) /* timeout to get out of FIN_WAIT_2 */ -/* - * Window for counting received bytes to see if ack-stretching - * can start (default 100 ms) - */ -#define TCPTV_UNACKWIN ( TCP_RETRANSHZ/10 ) - -/* Receiver idle time, avoid ack-stretching after this idle time */ -#define TCPTV_MAXRCVIDLE (TCP_RETRANSHZ/5 ) - -/* - * No ack stretching during slow-start, until we see some packets. - * By the time the receiver gets 512 packets, the senders cwnd - * should open by a few hundred packets consdering the - * slow-start progression. - */ -#define TCP_RCV_SS_PKTCOUNT 512 - #define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ #define TCP_LINGERTIME 120 /* linger at most 2 minutes */ @@ -235,11 +218,11 @@ static char *tcptimers[] = struct tcptimerlist; struct tcptimerentry { - LIST_ENTRY(tcptimerentry) le; /* links for timer list */ - uint32_t timer_start; /* tcp clock when the timer was started */ - uint16_t index; /* index of lowest timer that needs to run first */ - uint16_t mode; /* Bit-wise OR of timers that are active */ - uint32_t runtime; /* deadline at which the first timer has to fire */ + LIST_ENTRY(tcptimerentry) te_le; /* links for timer list */ + uint32_t te_timer_start; /* tcp clock when the timer was started */ + uint16_t te_index; /* index of lowest timer that needs to run first */ + uint16_t te_mode; /* Bit-wise OR of timers that are active */ + uint32_t te_runtime; /* deadline at which the first timer has to fire */ }; LIST_HEAD(timerlisthead, tcptimerentry); @@ -251,8 +234,10 @@ struct tcptimerlist { thread_call_t call; /* call entry */ uint32_t runtime; /* time at which this list is going to run */ uint32_t schedtime; /* time at which this list was scheduled */ + uint32_t started_at; /* time at which this list started to run */ uint32_t entries; /* Number of entries on the list */ uint32_t maxentries; /* Max number of entries at any time */ + uint32_t processed_count; /* Number of entries that have been processed */ /* Set desired mode when timer list running */ boolean_t running; /* Set when timer list is being processed */ @@ -329,7 +314,5 @@ extern int tcp_backoff[TCP_MAXRXTSHIFT + 1]; extern int tcp_rexmt_slop; extern u_int32_t tcp_max_persist_timeout; /* Maximum persistence for Zero Window Probes */ -#define OFFSET_FROM_START(tp, off) ((tcp_now + (off)) - (tp)->tentry.timer_start) - #endif /* 
BSD_KERNEL_PRIVATE */ #endif /* !_NETINET_TCP_TIMER_H_ */ diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index 082e651d0..43f59f8e7 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -1587,13 +1587,13 @@ skip_oinp: ASSERT(inp->inp_flowhash != 0); } - tcp_set_max_rwinscale(tp, so); + tp->request_r_scale = tcp_get_max_rwinscale(tp, so); soisconnecting(so); tcpstat.tcps_connattempt++; TCP_LOG_STATE(tp, TCPS_SYN_SENT); tp->t_state = TCPS_SYN_SENT; - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_CONN_KEEPINIT(tp)); + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_CONN_KEEPINIT(tp)); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); tp->t_connect_time = tcp_now; @@ -1721,13 +1721,13 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p) (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); } - tcp_set_max_rwinscale(tp, so); + tp->request_r_scale = tcp_get_max_rwinscale(tp, so); soisconnecting(so); tcpstat.tcps_connattempt++; TCP_LOG_STATE(tp, TCPS_SYN_SENT); tp->t_state = TCPS_SYN_SENT; - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_CONN_KEEPINIT(tp)); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1780,10 +1780,6 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_flags |= TCPI_FLAG_LOSSRECOVERY; } - if (tp->t_flags & TF_STREAMING_ON) { - ti->tcpi_flags |= TCPI_FLAG_STREAMING_ON; - } - ti->tcpi_rto = tp->t_timer[TCPT_REXMT] ? tp->t_rxtcur : 0; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; @@ -1813,14 +1809,14 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_snd_bw = (tp->t_bwmeas->bw_sndbw * 8000); } - ti->tcpi_txpackets = inp != NULL ? inp->inp_stat->txpackets : 0; - ti->tcpi_txbytes = inp != NULL ? inp->inp_stat->txbytes : 0; + ti->tcpi_txpackets = inp != NULL ? inp->inp_mstat.ms_total.ts_txpackets : 0; + ti->tcpi_txbytes = inp != NULL ? inp->inp_mstat.ms_total.ts_txbytes : 0; ti->tcpi_txretransmitbytes = tp->t_stat.txretransmitbytes; ti->tcpi_txretransmitpackets = tp->t_stat.rxmitpkts; ti->tcpi_txunacked = tp->snd_max - tp->snd_una; - ti->tcpi_rxpackets = inp != NULL ? inp->inp_stat->rxpackets : 0; - ti->tcpi_rxbytes = inp != NULL ? inp->inp_stat->rxbytes : 0; + ti->tcpi_rxpackets = inp != NULL ? inp->inp_mstat.ms_total.ts_rxpackets : 0; + ti->tcpi_rxbytes = inp != NULL ? 
inp->inp_mstat.ms_total.ts_rxbytes : 0; ti->tcpi_rxduplicatebytes = tp->t_stat.rxduplicatebytes; ti->tcpi_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; @@ -1828,20 +1824,25 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_synrexmits = (uint8_t)tp->t_stat.rxmitsyns; } if (inp != NULL) { - ti->tcpi_cell_rxpackets = inp->inp_cstat->rxpackets; - ti->tcpi_cell_rxbytes = inp->inp_cstat->rxbytes; - ti->tcpi_cell_txpackets = inp->inp_cstat->txpackets; - ti->tcpi_cell_txbytes = inp->inp_cstat->txbytes; + ti->tcpi_cell_rxpackets = inp->inp_mstat.ms_cellular.ts_rxpackets; + ti->tcpi_cell_rxbytes = inp->inp_mstat.ms_cellular.ts_rxbytes; + ti->tcpi_cell_txpackets = inp->inp_mstat.ms_cellular.ts_txpackets; + ti->tcpi_cell_txbytes = inp->inp_mstat.ms_cellular.ts_txbytes; - ti->tcpi_wifi_rxpackets = inp->inp_wstat->rxpackets; - ti->tcpi_wifi_rxbytes = inp->inp_wstat->rxbytes; - ti->tcpi_wifi_txpackets = inp->inp_wstat->txpackets; - ti->tcpi_wifi_txbytes = inp->inp_wstat->txbytes; + ti->tcpi_wifi_rxpackets = inp->inp_mstat.ms_wifi_infra.ts_rxpackets + + inp->inp_mstat.ms_wifi_non_infra.ts_rxpackets; + ti->tcpi_wifi_rxbytes = inp->inp_mstat.ms_wifi_infra.ts_rxbytes + + inp->inp_mstat.ms_wifi_non_infra.ts_rxbytes; - ti->tcpi_wired_rxpackets = inp->inp_Wstat->rxpackets; - ti->tcpi_wired_rxbytes = inp->inp_Wstat->rxbytes; - ti->tcpi_wired_txpackets = inp->inp_Wstat->txpackets; - ti->tcpi_wired_txbytes = inp->inp_Wstat->txbytes; + ti->tcpi_wifi_txpackets = inp->inp_mstat.ms_wifi_infra.ts_txpackets + + inp->inp_mstat.ms_wifi_non_infra.ts_txpackets; + ti->tcpi_wifi_txbytes = inp->inp_mstat.ms_wifi_infra.ts_txbytes + + inp->inp_mstat.ms_wifi_non_infra.ts_txbytes; + + ti->tcpi_wired_rxpackets = inp->inp_mstat.ms_wired.ts_rxpackets; + ti->tcpi_wired_rxbytes = inp->inp_mstat.ms_wired.ts_rxbytes; + ti->tcpi_wired_txpackets = inp->inp_mstat.ms_wired.ts_txpackets; + ti->tcpi_wired_txbytes = inp->inp_mstat.ms_wired.ts_txbytes; } tcp_get_connectivity_status(tp, &ti->tcpi_connstatus); @@ -1864,7 +1865,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_ecn_client_setup = !!(tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)); ti->tcpi_ecn_server_setup = !!(tp->ecn_flags & (TE_SETUPRECEIVED | TE_ACE_SETUPRECEIVED)); - ti->tcpi_ecn_success = (TCP_ECN_ENABLED(tp) || TCP_ACC_ECN_ON(tp)) ? 1 : 0; + ti->tcpi_ecn_success = (TCP_ECN_ENABLED(tp) || tp->accurate_ecn_on) ? 1 : 0; ti->tcpi_ecn_lost_syn = !!(tp->ecn_flags & TE_LOST_SYN); ti->tcpi_ecn_lost_synack = !!(tp->ecn_flags & TE_LOST_SYNACK); @@ -1918,18 +1919,21 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) * As some of the AccECN fields are initialized to non-zero * values, we subtract the initial values. */ - ti->tcpi_received_ce_packets = tp->t_aecn.t_rcv_ce_packets - 5; - ti->tcpi_received_ect0_bytes = tp->t_aecn.t_rcv_ect0_bytes - 1; - ti->tcpi_received_ect1_bytes = tp->t_aecn.t_rcv_ect1_bytes - 1; + ti->tcpi_received_ce_packets = tp->t_aecn.t_rcv_ce_packets ? tp->t_aecn.t_rcv_ce_packets - 5 : 0; + ti->tcpi_received_ect0_bytes = tp->t_aecn.t_rcv_ect0_bytes ? tp->t_aecn.t_rcv_ect0_bytes - 1 : 0; + ti->tcpi_received_ect1_bytes = tp->t_aecn.t_rcv_ect1_bytes ? tp->t_aecn.t_rcv_ect1_bytes - 1 : 0; ti->tcpi_received_ce_bytes = tp->t_aecn.t_rcv_ce_bytes; - ti->tcpi_delivered_ect0_bytes = tp->t_aecn.t_snd_ect0_bytes - 1; - ti->tcpi_delivered_ect1_bytes = tp->t_aecn.t_snd_ect1_bytes - 1; + ti->tcpi_delivered_ect0_bytes = tp->t_aecn.t_snd_ect0_bytes ? 
tp->t_aecn.t_snd_ect0_bytes - 1 : 0; + ti->tcpi_delivered_ect1_bytes = tp->t_aecn.t_snd_ect1_bytes ? tp->t_aecn.t_snd_ect1_bytes - 1 : 0; ti->tcpi_delivered_ce_bytes = tp->t_aecn.t_snd_ce_bytes; - ti->tcpi_l4s_enabled = TCP_L4S_ENABLED(tp); + ti->tcpi_l4s_enabled = tp->l4s_enabled; ti->tcpi_flow_control_total_time = inp->inp_fadv_total_time; ti->tcpi_rcvwnd_limited_total_time = tp->t_rcvwnd_limited_total_time; + + ti->tcpi_pacing_rate = tp->t_pacer.rate; + ti->tcpi_max_pacing_rate = inp->inp_max_pacing_rate; } __private_extern__ errno_t @@ -2077,12 +2081,12 @@ tcp_connection_fill_info(struct tcpcb *tp, struct tcp_connection_info *tci) tci->tcpi_rttcur = tp->t_rttcur; tci->tcpi_srtt = (tp->t_srtt >> TCP_RTT_SHIFT); tci->tcpi_rttvar = (tp->t_rttvar >> TCP_RTTVAR_SHIFT); - tci->tcpi_txpackets = inp != NULL ? inp->inp_stat->txpackets : 0; - tci->tcpi_txbytes = inp != NULL ? inp->inp_stat->txbytes : 0; + tci->tcpi_txpackets = inp != NULL ? inp->inp_mstat.ms_total.ts_txpackets : 0; + tci->tcpi_txbytes = inp != NULL ? inp->inp_mstat.ms_total.ts_txbytes : 0; tci->tcpi_txretransmitbytes = tp->t_stat.txretransmitbytes; tci->tcpi_txretransmitpackets = tp->t_stat.rxmitpkts; - tci->tcpi_rxpackets = inp != NULL ? inp->inp_stat->rxpackets : 0; - tci->tcpi_rxbytes = inp != NULL ? inp->inp_stat->rxbytes : 0; + tci->tcpi_rxpackets = inp != NULL ? inp->inp_mstat.ms_total.ts_rxpackets : 0; + tci->tcpi_rxbytes = inp != NULL ? inp->inp_mstat.ms_total.ts_rxbytes : 0; tci->tcpi_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; tci->tcpi_tfo_syn_data_rcv = !!(tp->t_tfo_stats & TFO_S_SYNDATA_RCV); @@ -2451,7 +2455,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_keepidle = optval * TCP_RETRANSHZ; /* reset the timer to new value */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_CONN_KEEPIDLE(tp)); tcp_check_timer_state(tp); } @@ -2470,7 +2474,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_keepinit = optval * TCP_RETRANSHZ; if (tp->t_state == TCPS_SYN_RECEIVED || tp->t_state == TCPS_SYN_SENT) { - tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_KEEP] = tcp_offset_from_start(tp, TCP_CONN_KEEPINIT(tp)); tcp_check_timer_state(tp); } @@ -2489,7 +2493,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_keepintvl = optval * TCP_RETRANSHZ; if (tp->t_state == TCPS_FIN_WAIT_2 && TCP_CONN_MAXIDLE(tp) > 0) { - tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_2MSL] = tcp_offset_from_start(tp, TCP_CONN_MAXIDLE(tp)); tcp_check_timer_state(tp); } @@ -2508,7 +2512,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_keepcnt = optval; if (tp->t_state == TCPS_FIN_WAIT_2 && TCP_CONN_MAXIDLE(tp) > 0) { - tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, + tp->t_timer[TCPT_2MSL] = tcp_offset_from_start(tp, TCP_CONN_MAXIDLE(tp)); tcp_check_timer_state(tp); } @@ -2624,9 +2628,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) if (optval < 0 || optval > 1) { error = EINVAL; } else if (optval == 0) { - tp->t_flagsext &= ~(TF_NOSTRETCHACK); + tp->t_flagsext &= ~(TF_QUICKACK); } else { - tp->t_flagsext |= TF_NOSTRETCHACK; + tp->t_flagsext |= TF_QUICKACK; } break; case TCP_DISABLE_BLACKHOLE_DETECTION: @@ -2671,9 +2675,6 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) } else { tcp_disable_tfo(tp); } - break; - case TCP_FASTOPEN_FORCE_HEURISTICS: - break; case TCP_FASTOPEN_FORCE_ENABLE: error = sooptcopyin(sopt, &optval, sizeof(optval), 
@@ -2899,9 +2900,6 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) } optval = !!TFO_ENABLED(tp); break; - case TCP_FASTOPEN_FORCE_HEURISTICS: - optval = 0; - break; case TCP_FASTOPEN_FORCE_ENABLE: optval = (tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) ? 1 : 0; break; @@ -2943,7 +2941,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) } break; case TCP_SENDMOREACKS: - if (tp->t_flagsext & TF_NOSTRETCHACK) { + if (tp->t_flagsext & TF_QUICKACK) { optval = 1; } else { optval = 0; @@ -3007,12 +3005,11 @@ done: } /* - * tcp_sendspace and tcp_recvspace are the default send and receive window - * sizes, respectively. These are obsolescent (this information should - * be set by the route). + * tcp_sendspace and tcp_recvspace are the initial send and receive window + * sizes, respectively. */ -uint32_t tcp_sendspace = 1448 * 256; -uint32_t tcp_recvspace = 1448 * 384; +uint32_t tcp_sendspace = 128 * 1024; +uint32_t tcp_recvspace = 128 * 1024; /* During attach, the size of socket buffer allocated is limited to * sb_max in sbreserve. Disallow setting the tcp send and recv space @@ -3306,7 +3303,7 @@ static __attribute__((unused)) void tcpsockopt_cassert(void) { /* - * This is equivalent to _CASSERT() and the compiler wouldn't + * This is equivalent to static_assert() and the compiler wouldn't * generate any instructions, thus for compile time only. */ switch ((int)0) { diff --git a/bsd/netinet/tcp_utils.h b/bsd/netinet/tcp_utils.h index 2617dfd28..9209c7680 100644 --- a/bsd/netinet/tcp_utils.h +++ b/bsd/netinet/tcp_utils.h @@ -29,6 +29,7 @@ #ifndef _NETINET_TCP_UTILS_H_ #define _NETINET_TCP_UTILS_H_ +#include #include struct tcp_globals {}; @@ -50,4 +51,34 @@ tcp_globals_now(struct tcp_globals *globals) extern void tcp_ccdbg_control_register(void); extern void tcp_ccdbg_trace(struct tcpcb *tp, struct tcphdr *th, int32_t event); +static inline void +tcp_memacct_add(unsigned int size) +{ + mem_acct_add(tcp_memacct, size); +} + +static inline void +tcp_memacct_sub(unsigned int size) +{ + mem_acct_sub(tcp_memacct, size); +} + +static inline int +tcp_memacct_limited(void) +{ + return mem_acct_limited(tcp_memacct); +} + +static inline bool +tcp_memacct_hardlimit(void) +{ + return mem_acct_limited(tcp_memacct) == MEMACCT_HARDLIMIT; +} + +static inline bool +tcp_memacct_softlimit(void) +{ + return mem_acct_limited(tcp_memacct) > MEMACCT_PRESOFTLIMIT; +} + #endif /* _NETINET_TCP_UTILS_H_ */ diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 5922aa668..72c1808f2 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -160,6 +160,19 @@ struct name { \ /* Divisor used for Accurate ECN options field */ #define TCP_ACO_DIV (1 << 24) +static __inline uint16_t +tcp_get_flags(const struct tcphdr *th) +{ + return (uint16_t)((th->th_x2 << 8) | th->th_flags); +} + +static __inline void +tcp_set_flags(struct tcphdr *th, uint16_t flags) +{ + th->th_x2 = (flags >> 8) & 0x0f; + th->th_flags = flags & 0xff; +} + /* * Kernel variables for tcp. 
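[Editor's note: illustrative sketch, not part of the patch.] The tcp_get_flags()/tcp_set_flags() inlines added just above widen the header flags to 12 bits by folding th_x2 (the formerly reserved nibble next to the data offset) in as the high-order bits, which is what Accurate ECN needs for its extra flag bit. A stand-alone round-trip check of the same packing, using a simplified local struct rather than the real struct tcphdr:

/* Round-trip check of the 12-bit flags packing; "hdr" is a simplified
 * stand-in for struct tcphdr. */
#include <stdint.h>
#include <stdio.h>

struct hdr {
	uint8_t th_x2;     /* upper 4 flag bits (e.g. an AccECN bit) */
	uint8_t th_flags;  /* classic 8 flag bits: FIN..CWR */
};

static uint16_t
get_flags(const struct hdr *th)
{
	return (uint16_t)((th->th_x2 << 8) | th->th_flags);
}

static void
set_flags(struct hdr *th, uint16_t flags)
{
	th->th_x2 = (flags >> 8) & 0x0f;   /* only 12 bits fit on the wire */
	th->th_flags = flags & 0xff;
}

int
main(void)
{
	struct hdr th;
	uint16_t in = 0x0112;              /* illustrative: one high bit plus SYN|ACK */

	set_flags(&th, in);
	printf("in=0x%03x out=0x%03x\n", in, get_flags(&th));
	return 0;
}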
*/ @@ -244,6 +257,32 @@ struct tcptemp { struct tcphdr tt_t; }; +struct tcp_inp { + struct socket *so; + struct inpcb **inp; + struct tcpcb **tp; + struct mbuf *m; + struct tcphdr *th; + struct tcpopt *to; + u_char *optp __counted_by(optlen); + struct ip6_hdr *ip6; + struct ip *ip; + struct ifnet *ifp; + struct proc *kernel_proc; + tcp_seq iss; + tcp_seq irs; + uint32_t tiwin; + uint32_t ts_offset; + int optlen; + unsigned int ifscope; + uint16_t peer_mss; + uint8_t peer_wscale; + bool sackok; + bool ecnok; + uint8_t ip_ecn; + bool isipv6; +}; + struct bwmeas { tcp_seq bw_start; /* start of bw measurement */ uint32_t bw_ts; /* timestamp when bw measurement started */ @@ -414,13 +453,13 @@ struct tcpcb { #define TF_WASFRECOVERY 0x400000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x800000 /* require MD5 digests (RFC2385) */ #define TF_MAXSEGSNT 0x1000000 /* last segment sent was a full segment */ -#define TF_STREAMING_ON 0x2000000 /* Receiver detected streaming */ +/* Unused 0x2000000 */ #define TF_PMTUD 0x4000000 /* Perform Path MTU Discovery for this connection */ #define TF_CLOSING 0x8000000 /* pending tcp close */ #define TF_TSO 0x10000000 /* TCP Segment Offloading is enable on this connection */ #define TF_BLACKHOLE 0x20000000 /* Path MTU Discovery Black Hole detection */ #define TF_TIMER_ONLIST 0x40000000 /* pcb is on tcp_timer_list */ -#define TF_STRETCHACK 0x80000000 /* receiver is going to delay acks */ +/* Unused 0x80000000 */ tcp_seq snd_una; /* send unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -439,6 +478,7 @@ struct tcpcb { uint32_t rcv_wnd; /* receive window */ uint32_t t_last_recwin; tcp_seq rcv_up; /* receive urgent pointer */ + uint32_t t_latest_tx; /* Most recent transmit scheduled (including potential pacing) */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ @@ -483,23 +523,23 @@ struct tcpcb { #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* RFC 1323 variables */ - u_int8_t snd_scale; /* window scaling for send window */ - u_int8_t rcv_scale; /* window scaling for recv window */ - u_int8_t request_r_scale; /* pending window scaling */ - u_int8_t requested_s_scale; - u_int8_t tcp_cc_index; /* index of congestion control algorithm */ - u_int8_t t_adaptive_rtimo; /* Read timeout used as a multiple of RTT */ - u_int8_t t_adaptive_wtimo; /* Write timeout used as a multiple of RTT */ - u_int8_t t_stretchack_delayed; /* stretch ack delayed */ + uint8_t snd_scale; /* window scaling for send window */ + uint8_t rcv_scale; /* window scaling for recv window */ + uint8_t request_r_scale; /* pending window scaling */ + uint8_t requested_s_scale; + uint8_t tcp_cc_index; /* index of congestion control algorithm */ + uint8_t t_adaptive_rtimo; /* Read timeout used as a multiple of RTT */ + uint8_t t_adaptive_wtimo; /* Write timeout used as a multiple of RTT */ /* State for limiting early retransmits when SACK is not enabled */ - u_int16_t t_early_rexmt_count; /* count of early rexmts */ - u_int32_t t_early_rexmt_win; /* window for limiting early rexmts */ + uint16_t t_early_rexmt_count; /* count of early rexmts */ + uint32_t t_early_rexmt_win; /* window for limiting early rexmts */ - u_int32_t ts_recent; /* timestamp echo data */ + uint32_t ts_recent; /* timestamp echo data */ + uint32_t t_ts_offset; /* Randomized timestamp offset to hide on-the-wire timestamp */ - u_int32_t ts_recent_age; /* when last updated */ - tcp_seq last_ack_sent; + uint32_t ts_recent_age; /* when last updated */ + tcp_seq 
last_ack_sent; uint32_t t_bytes_acked; /* RFC 3465 variable for ABC, used by CCA only */ uint32_t total_ect_packets_marked; /* Cumulative count of total ECT packets marked */ @@ -522,16 +562,8 @@ struct tcpcb { uint32_t t_persist_stop; /* persistence limit deadline if triggered by ZWP */ uint32_t t_notsent_lowat; /* Low water for not sent data */ -/* Receiver state for stretch-ack algorithm */ - u_int32_t rcv_unackwin; /* to measure win for stretching acks */ - u_int32_t rcv_by_unackwin; /* bytes seen during the last ack-stretching win */ - u_int32_t rcv_by_unackhalfwin; - u_int32_t rcv_nostrack_ts; /* timestamp when stretch ack was disabled automatically */ - u_int32_t rcv_nostrack_pkts; /* pkts received since strech ack was disabled */ - u_int16_t rcv_waitforss; /* wait for packets during slow-start */ - /* ECN stats */ - u_int32_t ecn_flags; + uint32_t ecn_flags; #define TE_SETUPSENT 0x00000001 /* We have sent classic ECN-SETUP SYN or SYN-ACK */ #define TE_SETUPRECEIVED 0x00000002 /* We have received classic ECN-SETUP SYN or SYN-ACK */ #define TE_SENDIPECT 0x00000004 /* We haven't sent or received non-ECN-setup SYN or SYN-ACK, set IP ECT on outbound packet */ @@ -546,7 +578,7 @@ struct tcpcb { #define TE_ECN_MODE_DISABLE 0x00000800 /* Option ECN mode set to disable */ #define TE_ENABLE_ECN 0x00001000 /* Enable negotiation of ECN */ #define TE_ECN_ON (TE_SETUPSENT | TE_SETUPRECEIVED) /* ECN was successfully negotiated on a connection */ -#define TE_CEHEURI_SET 0x00002000 /* We did our CE-probing at the beginning */ +#define TE_ECEHEURI_SET 0x00002000 /* We did our E/CE-probing at the beginning */ #define TE_CLIENT_SETUP 0x00004000 /* setup from client side */ #define TE_RCVD_SYN_RST 0x00008000 /* Received RST to the first ECN enabled SYN */ #define TE_ACE_SETUP_NON_ECT 0x00010000 /* Encode received non-ECT either for SYN-ACK (server) or final ACK (client) */ @@ -563,8 +595,8 @@ struct tcpcb { #define TE_FORCE_ECT1 0x40000000 /* Force setting ECT1 on outgoing packets for testing purpose */ #define TE_FORCE_ECT0 0x80000000 /* Force setting ECT0 on outgoing packets for testing purpose */ - u_int32_t t_ecn_recv_ce; /* Received CE from the network */ - u_int32_t t_ecn_recv_cwr; /* Packets received with CWR */ + uint32_t t_ecn_recv_ce; /* Received CE from the network */ + uint32_t t_ecn_recv_cwr; /* Packets received with CWR */ uint32_t t_client_accecn_state; /* Client's Accurate ECN state */ uint32_t t_server_accecn_state; /* Server's Accurate ECN state */ uint64_t t_ecn_capable_packets_sent; /* Packets sent with ECT */ @@ -581,15 +613,15 @@ struct tcpcb { struct pacer t_pacer; /* Pacer state used to pace packets */ /* state for bad retransmit recovery */ - u_int32_t snd_cwnd_prev; /* cwnd prior to retransmit */ - u_int32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ - tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ - int t_srtt_prev; /* srtt prior to retransmit */ - int t_rttvar_prev; /* rttvar prior to retransmit */ - u_int32_t t_badrexmt_time; /* bad rexmt detection time */ + uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ + uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ + tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ + int t_srtt_prev; /* srtt prior to retransmit */ + int t_rttvar_prev; /* rttvar prior to retransmit */ + uint32_t t_badrexmt_time; /* bad rexmt detection time */ /* Packet reordering metric */ - u_int32_t t_reorderwin; /* Reordering late time offset */ + uint32_t t_reorderwin; /* Reordering late time offset 
*/ /* SACK related state */ int16_t snd_numholes; /* number of holes seen by sender */ @@ -602,40 +634,42 @@ struct tcpcb { struct mbuf *t_pktlist_head; /* First packet in transmit chain */ struct mbuf *t_pktlist_tail; /* Last packet in transmit chain */ - u_int32_t t_pktlist_sentlen; /* total bytes in transmit chain */ + uint32_t t_pktlist_sentlen; /* total bytes in transmit chain */ - u_int32_t t_keepidle; /* keepalive idle timer (override global if > 0) */ - u_int32_t t_keepinit; /* connection timeout, i.e. idle time + uint32_t t_keepidle; /* keepalive idle timer (override global if > 0) */ + uint32_t t_keepinit; /* connection timeout, i.e. idle time * in SYN_SENT or SYN_RECV state */ - u_int32_t t_keepintvl; /* interval between keepalives */ - u_int32_t t_keepcnt; /* number of keepalives before close */ + uint32_t t_keepintvl; /* interval between keepalives */ + uint32_t t_keepcnt; /* number of keepalives before close */ - u_int32_t tso_max_segment_size; /* TSO maximum segment unit for NIC */ - u_int16_t t_pmtud_lastseg_size; /* size of the last sent segment */ - u_int32_t t_pmtud_saved_maxopd; /* MSS saved before performing PMTU-D BlackHole detection */ - u_int32_t t_pmtud_start_ts; /* Time of PMTUD blackhole detection */ + uint32_t tso_max_segment_size; /* TSO maximum segment unit for NIC */ + uint16_t t_pmtud_lastseg_size; /* size of the last sent segment */ + uint32_t t_pmtud_saved_maxopd; /* MSS saved before performing PMTU-D BlackHole detection */ + uint32_t t_pmtud_start_ts; /* Time of PMTUD blackhole detection */ struct{ - u_int32_t rxduplicatebytes; - u_int32_t rxoutoforderbytes; - u_int32_t txretransmitbytes; - u_int16_t synrxtshift; - u_int16_t rxmitsyns; - u_int16_t unused_pad_to_8; - u_int32_t rxmitpkts; + uint32_t rxduplicatebytes; + uint32_t rxoutoforderbytes; + uint32_t txretransmitbytes; + uint16_t synrxtshift; + uint16_t rxmitsyns; + uint16_t unused_pad_to_8; + uint32_t rxmitpkts; uint32_t delayed_acks_sent; uint32_t acks_delayed; + uint64_t bytes_acked; } t_stat; - u_int8_t t_syn_sent; - u_int8_t t_syn_rcvd; - u_int8_t t_notify_ack_count; - u_int8_t t_ecn_recv_ce_pkt; /* Received packet with CE-bit set (independent from last_ack_sent) */ - u_int32_t t_cached_maxopd; /* default for MSS adjustment using link status report */ + uint8_t t_syn_sent; + uint8_t t_syn_rcvd; + uint8_t t_notify_ack_count; + uint8_t t_ecn_recv_ce_pkt; /* Received data packet with CE bit set */ + uint8_t t_ecn_recv_ece_pkt; /* Received ACK packet with ECE bit set */ + uint32_t t_cached_maxopd; /* default for MSS adjustment using link status report */ uint32_t bg_ssthresh; /* Slow start threshold until delay increases */ uint32_t t_flagsext; /* Another field to accommodate more flags */ -#define TF_RXTFINDROP 0x1 /* Drop conn after retransmitting FIN 3 times */ -#define TF_RCVUNACK_WAITSS 0x2 /* set when the receiver should not stretch acks */ +#define TF_RXTFINDROP 0x1 /* Drop conn after retransmitting FIN 3 times */ +/* Unused 0x2 */ #define TF_BWMEAS_INPROGRESS 0x4 /* Indicate BW meas is happening */ #define TF_MEASURESNDBW 0x8 /* Measure send bw on this connection */ #define TF_LAST_IS_PSH 0x10 /* Indicates whether the last packet in the rcv socket buffer had the PUSH-flag set */ @@ -643,14 +677,16 @@ struct tcpcb { #define TF_RECOMPUTE_RTT 0x40 /* recompute RTT after spurious retransmit */ #define TF_DETECT_READSTALL 0x80 /* Used to detect a stall during read operation */ #define TF_RECV_THROTTLE 0x100 /* Input throttling active */ -#define TF_NOSTRETCHACK 0x200 /* ack every other packet */ 
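[Editor's note: illustrative sketch, not part of the patch.] The TF_NOSTRETCHACK bit removed just above is replaced by TF_QUICKACK in the lines that follow; user space continues to reach it through the private TCP_SENDMOREACKS socket option handled in tcp_ctloutput() earlier in this patch. A sketch of how a client might request it, assuming the option is exposed by the SDK being built against (the #ifdef guards that assumption):

/* Requesting quick-ACK behaviour via the private TCP_SENDMOREACKS option;
 * availability of the option in the SDK is an assumption. */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}
#ifdef TCP_SENDMOREACKS
	int one = 1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_SENDMOREACKS, &one, sizeof(one)) < 0)
		perror("setsockopt(TCP_SENDMOREACKS)");
	else
		printf("quick-ACK requested (kernel sets TF_QUICKACK)\n");
#else
	printf("TCP_SENDMOREACKS not exposed by this SDK\n");
#endif
	close(fd);
	return 0;
}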
+#define TF_QUICKACK 0x200 /* Force-ACK every other packet */ +#define TF_SYN_COOKIE_ENABLED 0x400 /* SYN cookie is enabled for listener when max backlog is reached */ #define TF_NOTIMEWAIT 0x800 /* Avoid going into time-wait */ #define TF_SENT_TLPROBE 0x1000 /* Sent data in PTO */ #define TF_PKTS_REORDERED 0x2000 /* Detected reordering */ #define TF_DELAY_RECOVERY 0x4000 /* delay fast recovery */ #define TF_FORCE 0x8000 /* force 1 byte out */ -#define TF_DISABLE_STRETCHACK 0x10000 /* auto-disable stretch ack */ +/* Unused 0x10000 */ #define TF_NOBLACKHOLE_DETECTION 0x20000 /* Disable PMTU blackhole detection */ +#define TF_SYN_COOKIE_FORCE_ENABLED 0x40000 /* SYN cookie is enabled for listener unconditionally */ #define TF_RESCUE_RXT 0x80000 /* SACK rescue retransmit */ #define TF_CWND_NONVALIDATED 0x100000 /* cwnd non validated */ #define TF_IF_PROBING 0x200000 /* Trigger interface probe timeout */ @@ -697,13 +733,13 @@ struct tcpcb { tcp_seq t_dsack_lastuna; /* snd_una when last recovery episode started */ /* state for congestion window validation (draft-ietf-tcpm-newcwv-07) */ #define TCP_PIPEACK_SAMPLE_COUNT 3 - u_int32_t t_pipeack_sample[TCP_PIPEACK_SAMPLE_COUNT]; /* pipeack, bytes acked within RTT */ + uint32_t t_pipeack_sample[TCP_PIPEACK_SAMPLE_COUNT]; /* pipeack, bytes acked within RTT */ tcp_seq t_pipeack_lastuna; /* una when pipeack measurement started */ - u_int32_t t_pipeack; - u_int32_t t_lossflightsize; + uint32_t t_pipeack; + uint32_t t_lossflightsize; #if MPTCP - u_int32_t t_mpflags; /* flags for multipath TCP */ + uint32_t t_mpflags; /* flags for multipath TCP */ #define TMPF_PREESTABLISHED 0x00000001 /* conn in pre-established state */ #define TMPF_SND_KEYS 0x00000002 /* indicates that keys should be send */ @@ -736,10 +772,9 @@ struct tcpcb { tcp_seq t_mpuna; /* unacknowledged sequence */ struct mptcb *t_mptcb; /* pointer to MPTCP TCB */ struct mptsub *t_mpsub; /* pointer to the MPTCP subflow */ - struct mpt_dsn_map t_rcv_map; /* Receive mapping list */ - u_int8_t t_local_aid; /* Addr Id for authentication */ - u_int8_t t_rem_aid; /* Addr ID of another subflow */ - u_int8_t t_mprxtshift; /* join retransmission */ + uint8_t t_local_aid; /* Addr Id for authentication */ + uint8_t t_rem_aid; /* Addr ID of another subflow */ + uint8_t t_mprxtshift; /* join retransmission */ #endif /* MPTCP */ #define TFO_F_OFFER_COOKIE 0x01 /* We will offer a cookie */ @@ -749,7 +784,7 @@ struct tcpcb { #define TFO_F_SYN_LOSS 0x10 /* A SYN-loss triggered a fallback to regular TCP on the client-side */ #define TFO_F_NO_SNDPROBING 0x20 /* This network is guaranteed to support TFO in the upstream direction */ #define TFO_F_HEURISTIC_DONE 0x40 /* We have already marked this network as bad */ - u_int8_t t_tfo_flags; + uint8_t t_tfo_flags; #define TFO_S_SYNDATA_RCV 0x01 /* SYN+data has been received */ #define TFO_S_COOKIEREQ_RECV 0x02 /* TFO-cookie request received */ #define TFO_S_COOKIE_SENT 0x04 /* TFO-cookie announced in SYN/ACK */ @@ -765,9 +800,9 @@ struct tcpcb { #define TFO_S_SEND_BLACKHOLE 0x1000 /* TFO got blackholed in the send direction */ #define TFO_S_RECV_BLACKHOLE 0x2000 /* TFO got blackholed in the recv direction */ #define TFO_S_ONE_BYTE_PROXY 0x4000 /* TFO failed because of a proxy acknowledging just one byte */ - u_int16_t t_tfo_stats; + uint16_t t_tfo_stats; - u_int8_t t_tfo_probes; /* TFO-probes we did send */ + uint8_t t_tfo_probes; /* TFO-probes we did send */ /* * This here is the TFO-probing state-machine. 
Transitions are as follows: * @@ -795,23 +830,23 @@ struct tcpcb { #define TFO_PROBE_NONE 0 /* Not probing now */ #define TFO_PROBE_PROBING 1 /* Sending out TCP-keepalives waiting for reply */ #define TFO_PROBE_WAIT_DATA 2 /* Received reply, waiting for data */ - u_int8_t t_tfo_probe_state; + uint8_t t_tfo_probe_state; - u_int32_t t_rcvoopack; /* out-of-order packets received */ - u_int32_t t_pawsdrop; /* segments dropped due to PAWS */ - u_int32_t t_sack_recovery_episode; /* SACK recovery episodes */ + uint32_t t_rcvoopack; /* out-of-order packets received */ + uint32_t t_pawsdrop; /* segments dropped due to PAWS */ + uint32_t t_sack_recovery_episode; /* SACK recovery episodes */ uint32_t t_rack_recovery_episode; /* RACK recovery episodes */ uint32_t t_rack_reo_timeout_recovery_episode; /* RACK recovery triggered by reordering timeout */ - u_int32_t t_reordered_pkts; /* packets reorderd */ - u_int32_t t_dsack_sent; /* Sent DSACK notification */ - u_int32_t t_dsack_recvd; /* Received a valid DSACK option */ + uint32_t t_reordered_pkts; /* packets reorderd */ + uint32_t t_dsack_sent; /* Sent DSACK notification */ + uint32_t t_dsack_recvd; /* Received a valid DSACK option */ SLIST_HEAD(, tcp_notify_ack_marker) t_notify_ack; /* state for notifying data acknowledgements */ - u_int32_t t_recv_throttle_ts; /* TS for start of recv throttle */ - u_int32_t t_rxt_minimum_timeout; /* minimum retransmit timeout in ms */ + uint32_t t_recv_throttle_ts; /* TS for start of recv throttle */ + uint32_t t_rxt_minimum_timeout; /* minimum retransmit timeout in ms */ uint32_t t_challengeack_last; /* last time challenge ACK was sent per sec */ uint32_t t_challengeack_count; /* # of challenge ACKs already sent per sec */ - u_int32_t t_connect_time; /* time when the connection started */ + uint32_t t_connect_time; /* time when the connection started */ uint64_t t_rcvwnd_limited_total_time; uint64_t t_rcvwnd_limited_start_time; @@ -823,8 +858,6 @@ struct tcpcb { uint32_t t_comp_ack_lastinc; /* Last time the gen-count was changed - should change every TCP_COMP_CHANGE_RATE ms */ #define TCP_COMP_CHANGE_RATE 5 /* Intervals at which we change the gencnt. 
Means that worst-case we send one ACK every TCP_COMP_CHANGE_RATE ms */ - uint32_t t_ts_offset; /* Randomized timestamp offset to hide on-the-wire timestamp */ - #define NCURR_RTT_HIST 4 /* Number of current RTT samples (k) */ uint32_t curr_rtt_hist[NCURR_RTT_HIST]; /* last k current RTT samples */ uint32_t curr_rtt_min; /* Minimum current RTT from last k samples */ @@ -865,6 +898,10 @@ struct tcpcb { uint32_t bytes_retransmitted; uint32_t bytes_sacked; + uint8_t l4s_enabled:1, + accurate_ecn_on:1, + _pad:6; + uuid_t t_fsw_uuid; uuid_t t_flow_uuid; }; @@ -876,6 +913,32 @@ __CCT_DECLARE_CONSTRAINED_PTR_TYPES(struct tcpcb, tcpcb); #define TFO_ENABLED(tp) (tp->t_flagsext & TF_FASTOPEN) #define TCP_RACK_ENABLED(tp) ((tp->t_flagsext & TF_RACK_ENABLED) && SACK_ENABLED(tp) && !TFO_ENABLED(tp)) +extern int tcp_syncookie; +extern int soqlimitcompat; +extern int mptcp_enable; + +#define LOOPBACK_INTERFACE(tp) (tp->t_inpcb->inp_boundifp != NULL && \ + (tp->t_inpcb->inp_boundifp->if_flags & IFF_LOOPBACK)) + +#define TCP_SYN_COOKIE_DISABLED(tp) (TFO_ENABLED(tp) || tp->t_mpflags & TMPF_MPTCP_TRUE || LOOPBACK_INTERFACE(tp)) + +#define TCP_SYN_COOKIE_FORCE_ENABLED(tp) ((tp->t_flagsext & TF_SYN_COOKIE_FORCE_ENABLED) \ + && !TCP_SYN_COOKIE_DISABLED(tp)) + +#define TCP_SYN_COOKIE_ENABLED(tp) ((tp->t_flagsext & (TF_SYN_COOKIE_ENABLED | TF_SYN_COOKIE_FORCE_ENABLED)) \ + && !TCP_SYN_COOKIE_DISABLED(tp)) + +static __inline bool +tcp_can_send_syncookie(const struct socket *head, const struct tcpcb *tp, const uint8_t th_flags) +{ + int backlog = soqlimitcompat ? head->so_qlimit : (3 * head->so_qlimit / 2); + backlog = backlog - (backlog >> 3); + bool backlog_reached = head->so_incqlen >= (backlog >> 1) || head->so_qlen >= backlog; + bool can_send_syncookie = (TCP_SYN_COOKIE_FORCE_ENABLED(tp) || (TCP_SYN_COOKIE_ENABLED(tp) && backlog_reached)); + can_send_syncookie &= ((th_flags & (TH_RST | TH_ACK | TH_SYN)) == TH_SYN); + return can_send_syncookie; +} + static inline bool tcp_sent_tlp_retrans(const struct tcpcb *tp) { @@ -950,27 +1013,6 @@ typedef enum { tcp_l4s_developer_disable = 2 } tcp_l4s_t; -/* - * TCP L4S is enabled if, - * 1. It is not disabled explicitly by developer or tcp options - * 2. 
It is enabled either by developer or A/B deployment or tcp_options, - * It implicitly enables Accurate ECN which supports ACE and AccECN option for ECN feedback - */ - -#define TCP_L4S_DISABLED(_tp_) \ - (tcp_l4s_developer == tcp_l4s_developer_disable || \ - ((_tp_)->t_flagsext & TF_L4S_DISABLED) == 1) - -#define TCP_L4S_ENABLED(_tp_) \ - (!TCP_L4S_DISABLED(tp) && \ - (tcp_l4s_developer == tcp_l4s_developer_enable || \ - tcp_l4s == 1 || ((_tp_)->t_flagsext & TF_L4S_ENABLED))) - -/* L4S is enabled and Accurate ECN has been negotiated end-to-end */ -#define TCP_ACC_ECN_ON(_tp_) \ - (TCP_L4S_ENABLED(_tp_) && \ - (((_tp_)->ecn_flags & (TE_ACC_ECN_ON)) == (TE_ACC_ECN_ON))) - /* * Gives number of bytes acked by this ack */ @@ -1025,18 +1067,20 @@ typedef enum { struct tcpopt { uint32_t to_flags; /* which options are present */ #define TOF_TS 0x0001 /* timestamp */ +#define TOF_SACKPERM 0x0004 /* SACK permitted (only in SYN/ACK) */ #define TOF_MSS 0x0010 #define TOF_SCALE 0x0020 #define TOF_SIGNATURE 0x0040 /* signature option present */ #define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */ -#define TOF_SACK 0x0100 /* Peer sent SACK option */ +#define TOF_SACK 0x0100 /* SACK option present */ #define TOF_MPTCP 0x0200 /* MPTCP options to be dropped */ #define TOF_TFO 0x0400 /* TFO cookie option present */ #define TOF_TFOREQ 0x0800 /* TFO cookie request present */ +#define TOF_MAXOPT 0x1000 uint32_t to_tsval; uint32_t to_tsecr; uint16_t to_mss; - uint8_t to_requested_s_scale; + uint8_t to_wscale; uint8_t to_nsacks; /* number of SACK blocks */ u_char *to_sacks __sized_by(to_sacks_size); /* pointer to the first SACK blocks */ uint32_t to_sacks_size; /* boundary for to_sacks */ @@ -1056,26 +1100,6 @@ struct tcpopt { #define TFO_COOKIE_LEN_DEFAULT 8 #define TFO_COOKIE_LEN_MAX 16 -/* - * The initial retransmission should happen at rtt + 4 * rttvar. - * Because of the way we do the smoothing, srtt and rttvar - * will each average +1/2 tick of bias. When we compute - * the retransmit timer, we want 1/2 tick of rounding and - * 1 extra tick because of +-1/2 tick uncertainty in the - * firing of the timer. The bias will give us exactly the - * 1.5 tick we need. But, because the bias is - * statistical, we have to test that we don't drop below - * the minimum feasible timer (which is 2 ticks). - * This version of the macro adapted from a paper by Lawrence - * Brakmo and Larry Peterson which outlines a problem caused - * by insufficient precision in the original implementation, - * which results in inappropriately large RTO values for very - * fast networks. 
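The comment removed here (its macro follows just below, and both reappear later in this patch as tcp_rto_formula() and a rewritten TCP_REXMTVAL) describes a retransmit timeout of roughly srtt + 4 * rttvar, floored at t_rttmin. A minimal user-space sketch of that arithmetic, assuming the classic 4.4BSD scale factors (t_srtt stored as srtt << 5, t_rttvar as rttvar << 4), which this section does not restate:

/*
 * Illustrative only; not part of the patch. Assumes TCP_RTT_SHIFT = 5,
 * TCP_RTTVAR_SHIFT = 4 and TCP_DELTA_SHIFT = 2, i.e. t_srtt is kept as
 * srtt * 32 and t_rttvar as rttvar * 16.
 */
#include <stdint.h>
#include <stdio.h>

#define RTT_SHIFT    5   /* srtt stored as srtt * 32 */
#define RTTVAR_SHIFT 4   /* rttvar stored as rttvar * 16 */
#define DELTA_SHIFT  2

static uint32_t
rto_sketch(uint32_t rttmin, uint32_t srtt_scaled, uint32_t rttvar_scaled)
{
    uint32_t rto = ((srtt_scaled >> (RTT_SHIFT - DELTA_SHIFT)) +
        rttvar_scaled) >> DELTA_SHIFT;
    return rto > rttmin ? rto : rttmin;
}

int
main(void)
{
    /* srtt = 100 ms, rttvar = 10 ms, rttmin = 100 ms */
    uint32_t rto = rto_sketch(100, 100 << RTT_SHIFT, 10 << RTTVAR_SHIFT);

    /* (3200 >> 3) + 160 = 560; 560 >> 2 = 140 = srtt + 4 * rttvar */
    printf("RTO = %u ms\n", rto);
    return 0;
}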
- */ -#define TCP_REXMTVAL(tp) \ - max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ - + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) - /* * Jaguar compatible TCP control block, for xtcpcb * Does not have the old fields @@ -1250,7 +1274,6 @@ struct tcpstat { u_int32_t tcps_pawsdrop; /* segments dropped due to PAWS */ u_int32_t tcps_predack; /* times hdr predict ok for acks */ u_int32_t tcps_preddat; /* times hdr predict ok for data pkts */ - u_int32_t tcps_pcbcachemiss; u_int32_t tcps_cachedrtt; /* times cached RTT in route updated */ u_int32_t tcps_cachedrttvar; /* times cached rttvar updated */ u_int32_t tcps_cachedssthresh; /* times cached ssthresh updated */ @@ -1270,25 +1293,12 @@ struct tcpstat { u_int32_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ u_int32_t tcps_badrst; /* ignored RSTs in the window */ - u_int32_t tcps_sc_added; /* entry added to syncache */ - u_int32_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ - u_int32_t tcps_sc_dupsyn; /* duplicate SYN packet */ u_int32_t tcps_sc_dropped; /* could not reply to packet */ u_int32_t tcps_sc_completed; /* successful extraction of entry */ - u_int32_t tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */ - u_int32_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ - u_int32_t tcps_sc_reset; /* RST removed entry from syncache */ - u_int32_t tcps_sc_stale; /* timed out or listen socket gone */ u_int32_t tcps_sc_aborted; /* syncache entry aborted */ - u_int32_t tcps_sc_badack; /* removed due to bad ACK */ - u_int32_t tcps_sc_unreach; /* ICMP unreachable received */ - u_int32_t tcps_sc_zonefail; /* zalloc() failed */ u_int32_t tcps_sc_sendcookie; /* SYN cookie sent */ u_int32_t tcps_sc_recvcookie; /* SYN cookie received */ - u_int32_t tcps_hc_added; /* entry added to hostcache */ - u_int32_t tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */ - /* SACK related stats */ u_int32_t tcps_sack_recovery_episode; /* SACK recovery episodes */ u_int32_t tcps_sack_rexmits; /* SACK rexmit segments */ @@ -1319,9 +1329,6 @@ struct tcpstat { u_int32_t tcps_snd_swcsum_bytes; /* tcp swcksum (outbound), bytes */ u_int32_t tcps_snd6_swcsum; /* tcp6 swcksum (outbound), packets */ u_int32_t tcps_snd6_swcsum_bytes; /* tcp6 swcksum (outbound), bytes */ - u_int32_t tcps_unused_1; - u_int32_t tcps_unused_2; - u_int32_t tcps_unused_3; /* MPTCP Related stats */ u_int32_t tcps_invalid_mpcap; /* Invalid MPTCP capable opts */ @@ -1330,8 +1337,6 @@ struct tcpstat { u_int32_t tcps_join_fallback; /* No MPTCP in secondary */ u_int32_t tcps_estab_fallback; /* DSS option dropped */ u_int32_t tcps_invalid_opt; /* Catchall error stat */ - u_int32_t tcps_mp_outofwin; /* Packet lies outside the - * shared recv window */ u_int32_t tcps_mp_reducedwin; /* Reduced subflow window */ u_int32_t tcps_mp_badcsum; /* Bad DSS csum */ u_int32_t tcps_mp_oodata; /* Out of order data */ @@ -1354,14 +1359,10 @@ struct tcpstat { u_int32_t tcps_detect_reordering; /* Detect pkt reordering */ u_int32_t tcps_delay_recovery; /* Delay fast recovery */ u_int32_t tcps_avoid_rxmt; /* Retransmission was avoided */ - u_int32_t tcps_unnecessary_rxmt; /* Retransmission was not needed */ - u_int32_t tcps_nostretchack; /* disabled stretch ack algorithm on a connection */ - u_int32_t tcps_rescue_rxmt; /* SACK rescue retransmit */ u_int32_t tcps_pto_in_recovery; /* rescue retransmit in fast recovery */ u_int32_t tcps_pmtudbh_reverted; /* PMTU Blackhole detection, segment size reverted */ /* DSACK related statistics */ - u_int32_t 
tcps_dsack_disable; /* DSACK disabled due to n/w duplication */ u_int32_t tcps_dsack_ackloss; /* ignore DSACK due to ack loss */ u_int32_t tcps_dsack_badrexmt; /* DSACK based bad rexmt recovery */ u_int32_t tcps_dsack_sent; /* Sent DSACK notification */ @@ -1369,10 +1370,8 @@ struct tcpstat { u_int32_t tcps_dsack_recvd_old; /* Received an out of window DSACK option */ /* MPTCP Subflow selection stats */ - u_int32_t tcps_mp_sel_symtomsd; /* By symptomsd */ u_int32_t tcps_mp_sel_rtt; /* By RTT comparison */ u_int32_t tcps_mp_sel_rto; /* By RTO comparison */ - u_int32_t tcps_mp_sel_peer; /* By peer's output pattern */ u_int32_t tcps_mp_num_probes; /* Number of probes sent */ u_int32_t tcps_mp_verdowngrade; /* MPTCP version downgrade */ u_int32_t tcps_drop_after_sleep; /* drop after long AP sleep */ @@ -1756,12 +1755,9 @@ extern int tcp_fastopen; extern int ss_fltsz_local; extern int target_qdelay; extern uint32_t tcp_now; /* for RFC 1323 timestamps */ -extern struct timeval tcp_uptime; -extern lck_spin_t tcp_uptime_lock; extern int tcp_delack_enabled; extern int maxseg_unacked; -extern int tcp_ecn_outbound; -extern int tcp_ecn_inbound; +extern int tcp_ecn; extern uint32_t tcp_do_autorcvbuf; extern uint32_t tcp_autorcvbuf_max; extern int tcp_recv_bg; @@ -1804,23 +1800,13 @@ extern int32_t tcp_link_heuristics_rto_min; */ #define TCP_ACK_COMPRESSION_DUMMY 1 +extern struct tseg_qent *tcp_create_reass_qent(struct tcpcb *tp, struct mbuf *m, + struct tcphdr *th, int len); +extern struct mbuf *tcp_destroy_reass_qent(struct tcpcb *tp, + struct tseg_qent *q); KALLOC_TYPE_DECLARE(tcp_reass_zone); -extern struct tseg_qent * tcp_reass_qent_alloc(void); -extern void tcp_reass_qent_free(struct tseg_qent *te); - -KALLOC_TYPE_DECLARE(tcp_rxt_seg_zone); -extern struct tcp_rxt_seg * tcp_rxt_seg_qent_alloc(void); -extern void tcp_rxt_seg_qent_free(struct tcp_rxt_seg *te); - -KALLOC_TYPE_DECLARE(tcp_seg_sent_zone); -extern struct tcp_seg_sent * tcp_seg_sent_qent_alloc(void); -extern void tcp_seg_sent_qent_free(struct tcp_seg_sent *te); - - -extern int tcp_do_better_lr; -extern int tcp_cubic_minor_fixes; -extern int tcp_cubic_rfc_compliant; -extern int tcp_flow_control_response; +extern struct tseg_qent *tcp_reass_qent_alloc(struct protosw *proto); +extern void tcp_reass_qent_free(struct protosw *proto, struct tseg_qent *te); extern int tcp_rack; @@ -1842,6 +1828,7 @@ struct tcp_respond_args { }; void tcp_canceltimers(struct tcpcb *); +uint8_t tcp_addoptions(struct tcpopt *to, u_char * __ended_by(optend) optp, u_char * optend); struct tcpcb * tcp_close(struct tcpcb *); void tcp_ctlinput(int, struct sockaddr *, void *, struct ifnet *); @@ -1851,6 +1838,8 @@ tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt); void tcp_init(struct protosw *, struct domain *); +bool tcp_create_server_socket(struct tcp_inp *tpi, struct socket **so2, bool *syn_cookie_sent, int *dropsocket); +bool tcp_setup_server_socket(struct tcp_inp *tpi, struct socket *so, bool syn_cookie_used); void tcp_input(struct mbuf *, int); void tcp_mss(struct tcpcb *, int, unsigned int); uint32_t tcp_ceil(double a); @@ -1866,7 +1855,7 @@ struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); void tcp_respond(struct tcpcb *, void *ipgen __sized_by(ipgen_size), size_t ipgen_size, struct tcphdr *, struct mbuf *, - tcp_seq, tcp_seq, uint8_t, struct tcp_respond_args *); + tcp_seq, tcp_seq, uint32_t, uint16_t, struct tcpopt *, uint16_t, uint8_t, uint32_t, struct tcp_respond_args *, 
bool send_syncookie); struct rtentry * tcp_rtlookup(struct inpcb *, unsigned int); void tcp_setpersist(struct tcpcb *); @@ -1876,8 +1865,8 @@ void tcp_check_timer_state(struct tcpcb *tp); void tcp_run_timerlist(void *arg1, void *arg2); void tcp_sched_timers(struct tcpcb *tp); -struct tcptemp *tcp_maketemplate(struct tcpcb *, struct mbuf **); -void tcp_fillheaders(struct mbuf *, struct tcpcb *, void *, void *); +struct tcptemp *tcp_maketemplate(struct tcpcb *, struct mbuf **, struct sockaddr *, struct sockaddr *); +void tcp_fillheaders(struct mbuf *, struct tcpcb *, void *, void *, struct sockaddr *, struct sockaddr *); struct tcpcb *tcp_timers(struct tcpcb *, int); void tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int); @@ -1896,14 +1885,16 @@ void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int32_t tcp_sbspace(struct tcpcb *tp); void tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp); -void tcp_set_ecn(struct tcpcb *tp, struct ifnet *ifp); +void tcp_set_ecn(struct tcpcb *tp); +void tcp_set_l4s(struct tcpcb *tp, struct ifnet *ifp); +void tcp_set_accurate_ecn(struct tcpcb *tp); +bool tcp_ecn_enabled(uint32_t ecn_flags); uint8_t tcp_get_ace(struct tcphdr *th); uint32_t tcp_flight_size(struct tcpcb *tp); -void tcp_reset_stretch_ack(struct tcpcb *tp); extern void tcp_get_ports_used(ifnet_t ifp, int, u_int32_t, bitstr_t *__counted_by(bitstr_size(IP_PORTRANGE_SIZE))); uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags); uint32_t tcp_find_anypcb_byaddr(struct ifaddr *ifa); -void tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so); +uint8_t tcp_get_max_rwinscale(struct tcpcb *tp, struct socket *so); struct bwmeas* tcp_bwmeas_alloc(struct tcpcb *tp); void tcp_bwmeas_free(struct tcpcb *tp); extern int32_t timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2); @@ -1948,9 +1939,6 @@ void reset_acc_iaj(struct tcpcb *tp); int tcp_lock(struct socket *, int, void *); int tcp_unlock(struct socket *, int, void *); void calculate_tcp_clock(void); -uint64_t microuptime_ns(void); -uint64_t tcp_pacer_get_packet_tx_time(struct tcpcb *tp, uint16_t pkt_len); -void tcp_set_mbuf_tx_time(struct mbuf *m, uint64_t tx_time); extern void tcp_keepalive_reset(struct tcpcb *); extern uint32_t get_base_rtt(struct tcpcb *tp); @@ -1979,6 +1967,7 @@ extern boolean_t tcp_rxtseg_detect_bad_rexmt(struct tcpcb *, tcp_seq); extern boolean_t tcp_rxtseg_dsack_for_tlp(struct tcpcb *); extern u_int32_t tcp_rxtseg_total_size(struct tcpcb *tp); extern void tcp_rexmt_save_state(struct tcpcb *tp); +void tcp_local_congestion_notification(struct tcpcb *tp); void tcp_enter_fast_recovery(struct tcpcb *tp); extern void tcp_interface_send_probe(u_int16_t if_index_available); extern void tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable); @@ -1994,7 +1983,6 @@ extern int tcp_notify_kao_timeout(ifnet_t ifp, extern void tcp_disable_tfo(struct tcpcb *tp); extern void tcp_tfo_gen_cookie(struct inpcb *inp, u_char *out __sized_by(blk_size), size_t blk_size); #define TCP_FASTOPEN_KEYLEN 16 -extern int tcp_freeq(struct tcpcb *tp); extern errno_t tcp_notify_ack_id_valid(struct tcpcb *, struct socket *, u_int32_t); extern errno_t tcp_add_notify_ack_marker(struct tcpcb *, u_int32_t); extern void tcp_notify_ack_free(struct tcpcb *); @@ -2014,21 +2002,17 @@ extern uint16_t mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen); extern int mptcp_adj_mss(struct tcpcb *, boolean_t); extern void mptcp_insert_rmap(struct 
tcpcb *tp, struct mbuf *m, struct tcphdr *th); -extern int dump_mptcp_reass_qlen(char * __sized_by(str_len), int str_len); #endif -extern int dump_tcp_reass_qlen(char * str __sized_by(str_len), int str_len); extern uint32_t tcp_reass_qlen_space(struct socket *); __private_extern__ void tcp_update_stats_per_flow( struct ifnet_stats_per_flow *, struct ifnet *); -extern void tcp_set_link_heur_rtomin(struct tcpcb *tp, ifnet_t ifp); +extern void tcp_set_rto(struct tcpcb *tp); +extern void tcp_set_pto(struct tcpcb *tp); -#define TCP_ACK_STRATEGY_LEGACY 0 -#define TCP_ACK_STRATEGY_MODERN 1 - -extern int tcp_ack_strategy; +extern struct mem_acct *tcp_memacct; #if SKYWALK void tcp_add_fsw_flow(struct tcpcb *, struct ifnet *); @@ -2043,6 +2027,43 @@ typedef void *__single lr_ref_t; ? __unsafe_forge_single(void *, __builtin_return_address(0)) \ : (lr)) +/* + * The initial retransmission should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + * This version of the macro adapted from a paper by Lawrence + * Brakmo and Larry Peterson which outlines a problem caused + * by insufficient precision in the original implementation, + * which results in inappropriately large RTO values for very + * fast networks. + */ +static inline uint32_t +tcp_rto_formula(uint32_t rttmin, uint32_t srtt, uint32_t rttvar) +{ + return max(rttmin, + ((srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) + rttvar) >> TCP_DELTA_SHIFT); +} + +static inline uint32_t +_tcp_offset_from_start(const struct tcpcb *tp, uint32_t offset, + uint32_t tcp_now_var) +{ + return tcp_now_var + offset - tp->tentry.te_timer_start; +} + +static inline uint32_t +tcp_offset_from_start(const struct tcpcb *tp, uint32_t offset) +{ + return _tcp_offset_from_start(tp, offset, tcp_now); +} + +#define TCP_REXMTVAL(tp) tcp_rto_formula((tp)->t_rttmin, (tp)->t_srtt, (tp)->t_rttvar) #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/bsd/netinet/udp_log.c b/bsd/netinet/udp_log.c index ef82794fd..20da59893 100644 --- a/bsd/netinet/udp_log.c +++ b/bsd/netinet/udp_log.c @@ -49,7 +49,8 @@ SYSCTL_NODE(_net_inet_udp, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED, 0, #define UDP_LOG_ENABLE_DEFAULT \ (ULEF_CONNECT | ULEF_DST_LOCAL | ULEF_DST_GW) #else /* (DEVELOPMENT || DEBUG) */ -#define UDP_LOG_ENABLE_DEFAULT 0 +#define UDP_LOG_ENABLE_DEFAULT \ + (ULEF_CONNECT | ULEF_DST_LOCAL | ULEF_DST_GW) #endif /* (DEVELOPMENT || DEBUG) */ uint32_t udp_log_enable_flags = UDP_LOG_ENABLE_DEFAULT; @@ -239,8 +240,8 @@ udp_log_common(struct inpcb *inp, const char *event, int error) #define UDP_LOG_CONNECTION_ARGS \ event, \ UDP_LOG_COMMON_PCB_ARGS, \ - inp->inp_stat->rxbytes, inp->inp_stat->txbytes, \ - inp->inp_stat->rxpackets, inp->inp_stat->txpackets, \ + inp->inp_mstat.ms_total.ts_rxbytes, inp->inp_mstat.ms_total.ts_txbytes, \ + inp->inp_mstat.ms_total.ts_rxpackets, inp->inp_mstat.ms_total.ts_txpackets, \ error, \ so->so_error, \ (so->so_flags1 & SOF1_TC_NET_SERV_TYPE) ? 
so->so_netsvctype : so->so_traffic_class, \ @@ -370,15 +371,15 @@ udp_log_connection_summary(struct inpcb *inp) "rxnospace pkts/bytes: %llu/%llu " \ "so_error: %d " \ "svc/tc: %u " \ - "flow: 0x%x" \ + "flow: 0x%x " \ "flowctl: %lluus (%llux) " #define UDP_LOG_CONNECTION_SUMMARY_ARGS \ UDP_LOG_COMMON_PCB_ARGS, \ duration_secs, duration_microsecs / 1000, \ connection_secs, connection_microsecs / 1000, \ - inp->inp_stat->rxbytes, inp->inp_stat->txbytes, \ - inp->inp_stat->rxpackets, inp->inp_stat->txpackets, \ + inp->inp_mstat.ms_total.ts_rxbytes, inp->inp_mstat.ms_total.ts_txbytes, \ + inp->inp_mstat.ms_total.ts_rxpackets, inp->inp_mstat.ms_total.ts_txpackets, \ so->so_tc_stats[SO_STATS_SBNOSPACE].rxpackets, \ so->so_tc_stats[SO_STATS_SBNOSPACE].rxbytes, \ so->so_error, \ @@ -441,7 +442,7 @@ udp_log_message(const char *func_name, int line_no, struct inpcb *inp, const cha #define UDP_LOG_MESSAGE_ARGS \ func_name, line_no, \ UDP_LOG_COMMON_PCB_ARGS, \ - format + message os_log(OS_LOG_DEFAULT, UDP_LOG_MESSAGE_FMT, UDP_LOG_MESSAGE_ARGS); diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 91997efd7..476443f06 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -74,6 +74,7 @@ #include #include +#include #include #include #include @@ -196,22 +197,22 @@ struct udp_ip6 { u_char uip6_init_done : 1; }; -int udp_abort(struct socket *); -int udp_attach(struct socket *, int, struct proc *); -int udp_bind(struct socket *, struct sockaddr *, struct proc *); -int udp_connect(struct socket *, struct sockaddr *, struct proc *); -int udp_connectx(struct socket *, struct sockaddr *, +static int udp_abort(struct socket *); +static int udp_attach(struct socket *, int, struct proc *); +static int udp_bind(struct socket *, struct sockaddr *, struct proc *); +static int udp_connect(struct socket *, struct sockaddr *, struct proc *); +static int udp_connectx(struct socket *, struct sockaddr *, struct sockaddr *, struct proc *, uint32_t, sae_associd_t, sae_connid_t *, uint32_t, void *, uint32_t, struct uio *, user_ssize_t *); -int udp_detach(struct socket *); -int udp_disconnect(struct socket *); -int udp_disconnectx(struct socket *, sae_associd_t, sae_connid_t); -int udp_send(struct socket *, int, struct mbuf *, struct sockaddr *, +static int udp_detach(struct socket *); +static int udp_disconnect(struct socket *); +static int udp_disconnectx(struct socket *, sae_associd_t, sae_connid_t); +static int udp_send(struct socket *, int, struct mbuf *, struct sockaddr *, struct mbuf *, struct proc *); static void udp_append(struct inpcb *, struct ip *, struct mbuf *, int, struct sockaddr_in *, struct udp_in6 *, struct udp_ip6 *, struct ifnet *); static int udp_input_checksum(struct mbuf *, struct udphdr *, int, int); -int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, +static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct proc *); static void ip_2_ip6_hdr(struct ip6_hdr *ip6, struct ip *ip); static void udp_gc(struct inpcbinfo *); @@ -236,20 +237,31 @@ struct pr_usrreqs udp_usrreqs = { .pru_defunct = udp_defunct, }; +struct mem_acct *udp_memacct; + void udp_init(struct protosw *pp, struct domain *dp) { #pragma unused(dp) static int udp_initialized = 0; struct inpcbinfo *pcbinfo; + uint32_t pool_size = 0; VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); - if (udp_initialized) { + if (udp_memacct == NULL) { + udp_memacct = mem_acct_register("UDP", 0, 0); + if (udp_memacct == NULL) { + panic("mem_acct_register 
returned NULL"); + } + } + pp->pr_mem_acct = udp_memacct; + + if (!os_atomic_cmpxchg(&udp_initialized, 0, 1, relaxed)) { return; } - udp_initialized = 1; - uint32_t pool_size = (nmbclusters << MCLSHIFT) >> MBSHIFT; + + pool_size = (nmbclusters << MCLSHIFT) >> MBSHIFT; if (pool_size >= 96) { /* Improves 10GbE UDP performance. */ udp_recvspace = 786896; @@ -788,9 +800,7 @@ udp_input(struct mbuf *m, int iphlen) } if (nstat_collect) { stats_functional_type ifnet_count_type = IFNET_COUNT_TYPE(ifp); - INP_ADD_STAT(inp, ifnet_count_type, rxpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, rxbytes, m->m_pkthdr.len); - inp_set_activity_bitmap(inp); + INP_ADD_RXSTAT(inp, ifnet_count_type, 1, m->m_pkthdr.len); } #if CONTENT_FILTER && NECP if (check_cfil && inp != NULL && inp->inp_policyresult.results.filter_control_unit == 0) { @@ -903,10 +913,7 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, } if (nstat_collect) { stats_functional_type ifnet_count_type = IFNET_COUNT_TYPE(ifp); - INP_ADD_STAT(last, ifnet_count_type, rxpackets, 1); - INP_ADD_STAT(last, ifnet_count_type, rxbytes, - n->m_pkthdr.len); - inp_set_activity_bitmap(last); + INP_ADD_RXSTAT(last, ifnet_count_type, 1, n->m_pkthdr.len); } so_recv_data_stat(last->inp_socket, n, 0); m_adj(n, off); @@ -1485,9 +1492,7 @@ udp_check_pktinfo(struct mbuf *control, struct ifnet **outif, struct in_pktinfo *pktinfo; struct ifnet *ifp; - if (outif != NULL) { - *outif = NULL; - } + *outif = NULL; /* * XXX: Currently, we assume all the optional information is stored @@ -1533,10 +1538,8 @@ udp_check_pktinfo(struct mbuf *control, struct ifnet **outif, ifnet_head_done(); return ENXIO; } - if (outif != NULL) { - ifnet_reference(ifp); - *outif = ifp; - } + ifnet_reference(ifp); + *outif = ifp; ifnet_head_done(); laddr->s_addr = INADDR_ANY; break; @@ -1554,16 +1557,16 @@ udp_check_pktinfo(struct mbuf *control, struct ifnet **outif, return 0; } -int +static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct proc *p) { struct udpiphdr *ui; int len = m->m_pkthdr.len; struct sockaddr_in *sin; - struct in_addr origladdr, laddr, faddr, pi_laddr; + struct in_addr laddr, faddr, pi_laddr; u_short lport, fport; - int error = 0, udp_dodisconnect = 0, pktinfo = 0; + int error = 0, pktinfo = 0; struct socket *so = inp->inp_socket; int soopts = 0; struct mbuf *inpopts; @@ -1586,9 +1589,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ifnet_ref_t outif = NULL; struct flowadv *adv = &ipoa.ipoa_flowadv; - int sotc = SO_TC_UNSPEC; - int netsvctype = _NET_SERVICE_TYPE_UNSPEC; - struct ifnet *origoutifp = NULL; + struct sock_cm_info sockcminfo; int flowadv = 0; int tos = IPTOS_UNSPEC; @@ -1628,10 +1629,13 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } #endif + sock_init_cm_info(&sockcminfo, so); + if (control != NULL) { - tos = so_tos_from_control(control); - sotc = so_tc_from_control(control, &netsvctype); - VERIFY(outif == NULL); + tos = ip_tos_from_control(control); + + sock_parse_cm_info(control, &sockcminfo); + error = udp_check_pktinfo(control, &outif, &pi_laddr); m_freem(control); control = NULL; @@ -1644,10 +1648,6 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ipoa.ipoa_boundif = outif->if_index; } } - if (sotc == SO_TC_UNSPEC) { - sotc = so->so_traffic_class; - netsvctype = so->so_netsvctype; - } KERNEL_DEBUG(DBG_LAYER_OUT_BEG, inp->inp_fport, inp->inp_lport, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, @@ -1701,8 +1701,8 
@@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (INP_ULTRA_CONSTRAINED_ALLOWED(inp)) { ipoa.ipoa_flags |= IPOAF_ULTRA_CONSTRAINED_ALLOWED; } - ipoa.ipoa_sotc = sotc; - ipoa.ipoa_netsvctype = netsvctype; + ipoa.ipoa_sotc = sockcminfo.sotc; + ipoa.ipoa_netsvctype = sockcminfo.netsvctype; soopts |= IP_OUTARGS; /* @@ -1764,16 +1764,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, (ipoa.ipoa_boundif != IFSCOPE_NONE && pktinfo)) { /* temp src address for this datagram only */ laddr = pi_laddr; - origladdr.s_addr = INADDR_ANY; - /* we don't want to keep the laddr or route */ - udp_dodisconnect = 1; - /* remember we don't care about src addr */ - inp->inp_flags |= INP_INADDR_ANY; } else { - origladdr = laddr = inp->inp_laddr; + laddr = inp->inp_laddr; } - origoutifp = inp->inp_last_outifp; faddr = inp->inp_faddr; lport = inp->inp_lport; fport = inp->inp_fport; @@ -1788,82 +1782,72 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, sndinprog_cnt_used = true; if (addr) { - sin = SIN(addr); - if (faddr.s_addr != INADDR_ANY) { + if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; UDP_LOG(inp, "socket already connected error EISCONN"); goto release; } - if (lport == 0) { - inp_enter_bind_in_progress(so); + sin = SIN(addr); + faddr = sin->sin_addr; + fport = sin->sin_port; /* allow 0 port */ - /* - * In case we don't have a local port set, go through - * the full connect. We don't have a local port yet - * (i.e., we can't be looked up), so it's not an issue - * if the input runs at the same time we do this. - */ - /* if we have a source address specified, use that */ - if (pi_laddr.s_addr != INADDR_ANY) { - inp->inp_laddr = pi_laddr; - } - /* - * If a scope is specified, use it. Scope from - * IP_PKTINFO takes precendence over the the scope - * set via INP_BOUND_IF. - */ - error = in_pcbconnect(inp, addr, p, ipoa.ipoa_boundif, - &outif); + /* + * Fast path case + * + * If neeed get a local address and a local port to build + * the packet without changing the pcb + * and interfering with the input path. See 3851370. + * + * And don't disconnect as this could unbind the local port + * + * Scope from IP_PKTINFO takes precendence over the + * the scope set via INP_BOUND_IF. + */ + if (laddr.s_addr == INADDR_ANY) { + char laddr_str[MAX_IPv4_STR_LEN]; + char addr_str[MAX_IPv4_STR_LEN]; - inp_exit_bind_in_progress(so); + inet_ntop(AF_INET, &laddr.s_addr, laddr_str, sizeof(laddr_str)); + inet_ntop(AF_INET, &sin->sin_addr.s_addr, addr_str, sizeof(addr_str)); + UDP_LOG(inp, "calling in_pcbladdr addr %s laddr %s ipoa_boundif %u outif %s", + addr_str, laddr_str, ipoa.ipoa_boundif, outif != NULL ? if_name(outif) : ""); - if (error) { - UDP_LOG(inp, "in_pcbconnect error %d", error); + if ((error = in_pcbladdr(inp, addr, &laddr, + ipoa.ipoa_boundif, &outif, 0)) != 0) { + UDP_LOG(inp, "in_pcbladdr error %d", error); goto release; } - laddr = inp->inp_laddr; - lport = inp->inp_lport; - faddr = inp->inp_faddr; - fport = inp->inp_fport; - udp_dodisconnect = 1; - /* synch up in case in_pcbladdr() overrides */ - if (outif != NULL && ipoa.ipoa_boundif != IFSCOPE_NONE) { + if (outif != NULL && + ipoa.ipoa_boundif != IFSCOPE_NONE) { ipoa.ipoa_boundif = outif->if_index; } - } else { - /* - * Fast path case - * - * We have a full address and a local port; use those - * info to build the packet without changing the pcb - * and interfering with the input path. See 3851370. 
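The rework above drops the connect/disconnect dance from udp_output(): for a datagram sent with an explicit destination, a source address is derived with in_pcbladdr() and, if the socket has no local port yet, one is assigned with in_pcbsetport(), leaving the PCB unconnected. A user-space illustration of the behaviour this preserves; the helper below is hypothetical and not taken from the patch:

/*
 * Sending on an unconnected UDP socket: the kernel chooses a source
 * address per datagram and an ephemeral local port on first use,
 * without pinning the socket to that destination.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

int
send_one_datagram(const char *dst_ip, in_port_t dst_port,
    const void *buf, size_t len)
{
    struct sockaddr_in dst;
    int s = socket(AF_INET, SOCK_DGRAM, 0);

    if (s < 0) {
        return -1;
    }
    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_port = htons(dst_port);
    inet_pton(AF_INET, dst_ip, &dst.sin_addr);

    /*
     * No bind()/connect(): source address selection and, if needed,
     * ephemeral port assignment happen inside the send path (the
     * in_pcbladdr()/in_pcbsetport() calls above) and the socket stays
     * unconnected afterwards.
     */
    ssize_t n = sendto(s, buf, len, 0,
        (struct sockaddr *)&dst, sizeof(dst));
    close(s);
    return n < 0 ? -1 : 0;
}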
- * - * Scope from IP_PKTINFO takes precendence over the - * the scope set via INP_BOUND_IF. - */ - if (laddr.s_addr == INADDR_ANY) { - if ((error = in_pcbladdr(inp, addr, &laddr, - ipoa.ipoa_boundif, &outif, 0)) != 0) { - UDP_LOG(inp, "in_pcbladdr error %d", error); - goto release; - } - /* - * from pcbconnect: remember we don't - * care about src addr. - */ - inp->inp_flags |= INP_INADDR_ANY; - /* synch up in case in_pcbladdr() overrides */ - if (outif != NULL && - ipoa.ipoa_boundif != IFSCOPE_NONE) { - ipoa.ipoa_boundif = outif->if_index; - } + inet_ntop(AF_INET, &laddr.s_addr, laddr_str, sizeof(laddr_str)); + inet_ntop(AF_INET, &sin->sin_addr.s_addr, addr_str, sizeof(addr_str)); + UDP_LOG(inp, "after in_pcbladdr addr %s laddr %s ipoa_boundif %u outif %s", + addr_str, laddr_str, ipoa.ipoa_boundif, outif != NULL ? if_name(outif) : ""); + } + + if (lport == 0) { + inp_enter_bind_in_progress(so); + + error = in_pcbsetport(laddr, addr, inp, p, 0); + + if (error == 0) { + ASSERT(inp->inp_lport != 0); } - faddr = sin->sin_addr; - fport = sin->sin_port; + inp_exit_bind_in_progress(so); + + if (error != 0) { + UDP_LOG(inp, "in_pcbsetport error %d", error); + goto release; + } + lport = inp->inp_lport; + UDP_LOG(inp, "in_pcbsetport returned lport %u", + ntohs(lport)); } } else { if (faddr.s_addr == INADDR_ANY) { @@ -1999,8 +1983,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, &laddr, &faddr, NULL, 0, &policy_id, &route_rule_id, &skip_policy_id, &pass_flags)) { error = EHOSTUNREACH; UDP_LOG_DROP_NECP((struct ip *)&ui->ui_i, &ui->ui_u, inp, true); - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_UDP_NECP, NULL, 0); + m_drop_if(m, outif, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_UDP_NECP, NULL, 0); m = NULL; + UDP_LOG(inp, "necp_socket_is_allowed_to_send_recv_v4 error %d", error); goto abort; } @@ -2023,8 +2008,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (inp->inp_sp != NULL && ipsec_setsocket(m, inp->inp_socket) != 0) { error = ENOBUFS; UDP_LOG_DROP_PCB((struct ip *)&ui->ui_i, &ui->ui_u, inp, true, "ipsec_setsocket error ENOBUFS"); - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_UDP_IPSEC, NULL, 0); + m_drop_if(m, outif, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_UDP_IPSEC, NULL, 0); m = NULL; + UDP_LOG(inp, "necp_socket_is_allowed_to_send_recv_v4 error %d", error); goto abort; } #endif /* IPSEC */ @@ -2058,7 +2044,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); - set_packet_service_class(m, so, sotc, 0); + set_packet_service_class(m, so, sockcminfo.sotc, 0); + if (sockcminfo.tx_time) { + mbuf_set_tx_time(m, sockcminfo.tx_time); + } m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; m->m_pkthdr.pkt_flowid = inp->inp_flowhash; m->m_pkthdr.pkt_proto = IPPROTO_UDP; @@ -2097,6 +2086,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, IMO_REMREF(mopts); } + if (error != 0) { + UDP_LOG(inp, "ip_output error %d", error); + } + if (check_qos_marking_again) { inp->inp_policyresult.results.qos_marking_gencount = ipoa.qos_marking_gencount; @@ -2108,14 +2101,12 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } if (error == 0 && nstat_collect) { - stats_functional_type ifnet_count_type = stats_functional_type_none; + stats_functional_type ifnet_count_type = stats_functional_type_unclassified; if (ro.ro_rt != NULL) { 
ifnet_count_type = IFNET_COUNT_TYPE(ro.ro_rt->rt_ifp); } - INP_ADD_STAT(inp, ifnet_count_type, txpackets, 1); - INP_ADD_STAT(inp, ifnet_count_type, txbytes, len); - inp_set_activity_bitmap(inp); + INP_ADD_TXSTAT(inp, ifnet_count_type, 1, len); } if (flowadv && (adv->code == FADV_FLOW_CONTROLLED || @@ -2140,20 +2131,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } abort: - if (udp_dodisconnect) { - /* Always discard the cached route for unconnected socket */ - ROUTE_RELEASE(&inp->inp_route); - in_pcbdisconnect(inp); - inp->inp_laddr = origladdr; /* XXX rehash? */ - /* no reference needed */ - inp->inp_last_outifp = origoutifp; -#if SKYWALK - if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) { - netns_set_ifnet(&inp->inp_netns_token, - inp->inp_last_outifp); - } -#endif /* SKYWALK */ - } else if (inp->inp_route.ro_rt != NULL) { + if (inp->inp_route.ro_rt != NULL) { struct rtentry *rt = inp->inp_route.ro_rt; struct ifnet *outifp; @@ -2283,7 +2261,7 @@ SYSCTL_PROC(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_sendspace, 0, &sysctl_udp_sospace, "IU", "Maximum outgoing UDP datagram size"); -int +static int udp_abort(struct socket *so) { struct inpcb *inp; @@ -2298,7 +2276,7 @@ udp_abort(struct socket *so) return 0; } -int +static int udp_attach(struct socket *so, int proto, struct proc *p) { #pragma unused(proto) @@ -2327,7 +2305,7 @@ udp_attach(struct socket *so, int proto, struct proc *p) return 0; } -int +static int udp_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp; @@ -2365,7 +2343,7 @@ udp_bind(struct socket *so, struct sockaddr *nam, struct proc *p) return error; } -int +static int udp_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp; @@ -2519,7 +2497,7 @@ done: return error; } -int +static int udp_connectx(struct socket *so, struct sockaddr *src, struct sockaddr *dst, struct proc *p, uint32_t ifscope, sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg, @@ -2529,7 +2507,7 @@ udp_connectx(struct socket *so, struct sockaddr *src, p, ifscope, aid, pcid, flags, arg, arglen, uio, bytes_written); } -int +static int udp_detach(struct socket *so) { struct inpcb *inp; @@ -2557,7 +2535,7 @@ udp_detach(struct socket *so) return 0; } -int +static int udp_disconnect(struct socket *so) { struct inpcb *inp; @@ -2589,7 +2567,7 @@ udp_disconnect(struct socket *so) return 0; } -int +static int udp_disconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid) { #pragma unused(cid) @@ -2600,7 +2578,7 @@ udp_disconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid) return udp_disconnect(so); } -int +static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct proc *p) { diff --git a/bsd/netinet/udp_var.h b/bsd/netinet/udp_var.h index d31e194f5..a83a108be 100644 --- a/bsd/netinet/udp_var.h +++ b/bsd/netinet/udp_var.h @@ -63,10 +63,10 @@ #ifndef _NETINET_UDP_VAR_H_ #define _NETINET_UDP_VAR_H_ -#include -#include #include #include +#include +#include /* * UDP kernel structures and variables. 
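The handlers turned static above (udp_abort(), udp_bind(), udp_connect() and the rest) are only ever reached through the pr_usrreqs table populated earlier in this file, so they need no external linkage. A generic sketch of that pattern, using toy names rather than the kernel's actual pr_usrreqs layout:

/* Handlers stay private; only the ops table is exported. */
struct sock;                     /* opaque in this sketch */

struct proto_ops {
    int (*attach)(struct sock *);
    int (*detach)(struct sock *);
};

static int
toy_attach(struct sock *so)
{
    (void)so;                    /* per-socket setup would go here */
    return 0;
}

static int
toy_detach(struct sock *so)
{
    (void)so;                    /* matching teardown */
    return 0;
}

const struct proto_ops toy_proto_ops = {
    .attach = toy_attach,
    .detach = toy_detach,
};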
@@ -127,6 +127,7 @@ struct udpstat { #ifdef BSD_KERNEL_PRIVATE #include +#include #include #define UDPCTL_NAMES { \ @@ -162,6 +163,20 @@ extern u_int32_t udp_sendspace; extern u_int32_t udp_recvspace; extern struct udpstat udpstat; extern int udp_log_in_vain; +extern struct mem_acct *udp_memacct; + +static inline void +udp_memacct_add(int size) +{ + mem_acct_add(udp_memacct, size); +} + +static inline void +udp_memacct_sub(int size) +{ + mem_acct_sub(udp_memacct, size); +} + __BEGIN_DECLS extern void udp_ctlinput(int, struct sockaddr *, void *, struct ifnet *); diff --git a/bsd/netinet6/dest6.c b/bsd/netinet6/dest6.c index 17a257f15..8efb2c4c7 100644 --- a/bsd/netinet6/dest6.c +++ b/bsd/netinet6/dest6.c @@ -68,6 +68,7 @@ #include #include +#include #include #include @@ -88,6 +89,7 @@ dest6_input(struct mbuf **mp, int *offp, int proto) int off = *offp, dstoptlen = 0, optlen = 0; struct ip6_dest *dstopts = NULL; u_int8_t *opt = NULL; + drop_reason_t drop_reason = DROP_REASON_UNSPECIFIED; /* validation of the length of the header */ IP6_EXTHDR_CHECK(m, off, sizeof(*dstopts), return IPPROTO_DONE); @@ -105,6 +107,7 @@ dest6_input(struct mbuf **mp, int *offp, int proto) if (*opt != IP6OPT_PAD1 && (dstoptlen < IP6OPT_MINLEN || *(opt + 1) + 2 > dstoptlen)) { ip6stat.ip6s_toosmall++; + drop_reason = DROP_REASON_IP_TOO_SMALL; goto bad; } @@ -132,6 +135,6 @@ dest6_input(struct mbuf **mp, int *offp, int proto) bad: *mp = NULL; - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, drop_reason, NULL, 0); return IPPROTO_DONE; } diff --git a/bsd/netinet6/esp_core.c b/bsd/netinet6/esp_core.c index 07c5aa250..40e1f5fc5 100644 --- a/bsd/netinet6/esp_core.c +++ b/bsd/netinet6/esp_core.c @@ -1471,7 +1471,7 @@ esp_auth( size_t siz; int error; - _CASSERT(ESP_AUTH_MAXSUMSIZE == AH_MAXSUMSIZE); + static_assert(ESP_AUTH_MAXSUMSIZE == AH_MAXSUMSIZE); /* sanity checks */ if (m0->m_pkthdr.len < skip) { diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index 3e7437744..e87db0fdc 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -335,7 +335,7 @@ esp4_input_extended(struct mbuf *m, int off, ifnet_t interface) /* * check for sequence number. 
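The udp_memacct_add()/udp_memacct_sub() wrappers added to udp_var.h above simply forward to mem_acct_add()/mem_acct_sub() on the "UDP" bucket registered in udp_init(). The real charge/credit call sites are not visible in this section, so the placement below (charge when a datagram is queued, credit when it is released) is an assumption, shown only to illustrate how the pair is meant to bracket a buffer's lifetime:

#include <sys/mbuf.h>
#include <netinet/udp_var.h>

/* Hypothetical accounting points; not the patch's actual call sites. */
static void
udp_queue_charge_sketch(struct mbuf *m)
{
    /* charge the UDP memory-accounting bucket for the queued bytes */
    udp_memacct_add(m->m_pkthdr.len);
}

static void
udp_queue_release_sketch(struct mbuf *m)
{
    /* credit the same amount back once the datagram is consumed or freed */
    udp_memacct_sub(m->m_pkthdr.len);
}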
*/ - _CASSERT(MBUF_TC_MAX <= UINT8_MAX); + static_assert(MBUF_TC_MAX <= UINT8_MAX); if (ipsec_chkreplay(seq, sav, (u_int8_t)replay_index)) { ; /*okey*/ } else { @@ -779,7 +779,7 @@ noreplaycheck: if (nxt == IPPROTO_TCP || nxt == IPPROTO_UDP) { m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; - _CASSERT(offsetof(struct pkthdr, csum_data) == offsetof(struct pkthdr, csum_rx_val)); + static_assert(offsetof(struct pkthdr, csum_data) == offsetof(struct pkthdr, csum_rx_val)); } if (nxt != IPPROTO_DONE) { @@ -1540,7 +1540,7 @@ noreplaycheck: if (nxt == IPPROTO_TCP || nxt == IPPROTO_UDP) { m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xFFFF; - _CASSERT(offsetof(struct pkthdr, csum_data) == offsetof(struct pkthdr, csum_rx_val)); + static_assert(offsetof(struct pkthdr, csum_data) == offsetof(struct pkthdr, csum_rx_val)); } // Input via IPsec interface diff --git a/bsd/netinet6/frag6.c b/bsd/netinet6/frag6.c index 190b90e91..82d7c06e2 100644 --- a/bsd/netinet6/frag6.c +++ b/bsd/netinet6/frag6.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include @@ -173,7 +174,7 @@ frag6_init(void) ip6q.ip6q_next = ip6q.ip6q_prev = &ip6q; /* same limits as IPv4 */ - ip6_maxfragpackets = nmbclusters / 32; + ip6_maxfragpackets = 8192; ip6_maxfrags = ip6_maxfragpackets * 2; ip6q_updateparams(); lck_mtx_unlock(&ip6qlock); @@ -1303,7 +1304,7 @@ sysctl_maxfragpackets SYSCTL_HANDLER_ARGS goto done; } /* impose bounds */ - if (i < -1 || i > (nmbclusters / 4)) { + if (i < -1) { error = EINVAL; goto done; } @@ -1327,7 +1328,7 @@ sysctl_maxfrags SYSCTL_HANDLER_ARGS goto done; } /* impose bounds */ - if (i < -1 || i > (nmbclusters / 4)) { + if (i < -1) { error = EINVAL; goto done; } diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index c7ac43da9..32062883f 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -109,6 +109,7 @@ #include #include +#include #include #include #include @@ -176,7 +177,7 @@ static int icmp6_notify_error(struct mbuf *, int, int, int); void -icmp6_init(struct ip6protosw *pp, struct domain *dp) +icmp6_init(struct protosw *pp, struct domain *dp) { #pragma unused(dp) static int icmp6_initialized = 0; @@ -186,14 +187,15 @@ icmp6_init(struct ip6protosw *pp, struct domain *dp) (pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); /* This gets called by more than one protocols, so initialize once */ - if (!icmp6_initialized) { - icmp6_initialized = 1; - mld_init(); - if (icmp6errppslim >= 0 && - icmp6errppslim_random_incr > 0 && - icmp6errppslim <= INT32_MAX - (icmp6errppslim_random_incr + 1)) { - icmp6errppslim += (random() % icmp6errppslim_random_incr) + 1; - } + if (!os_atomic_cmpxchg(&icmp6_initialized, 0, 1, relaxed)) { + return; + } + + mld_init(); + if (icmp6errppslim >= 0 && + icmp6errppslim_random_incr > 0 && + icmp6errppslim <= INT32_MAX - (icmp6errppslim_random_incr + 1)) { + icmp6errppslim += (random() % icmp6errppslim_random_incr) + 1; } } @@ -456,7 +458,7 @@ freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. 
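udp_init() earlier in this patch, icmp6_init() above and ip6_init() further below all replace a plain "initialized" flag with os_atomic_cmpxchg(&initialized, 0, 1, relaxed): only the caller that wins the 0 -> 1 transition performs the one-time setup, everyone else returns early. A portable sketch of the same guard using C11 atomics as a stand-in for the kernel primitive:

#include <stdatomic.h>

static atomic_int subsystem_initialized;   /* zero-initialized */

void
subsystem_init(void)
{
    int expected = 0;

    /* Only the first caller to flip 0 -> 1 runs the setup. */
    if (!atomic_compare_exchange_strong(&subsystem_initialized,
        &expected, 1)) {
        return;
    }

    /* ... one-time setup goes here ... */
}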
*/ - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_ICMP_DROP, NULL, 0); } /* @@ -1614,7 +1616,7 @@ ni6_input(struct mbuf *m, int off) return n; bad: - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_NI, NULL, 0); if (n) { m_freem(n); } @@ -2381,7 +2383,7 @@ icmp6_reflect(struct mbuf *m, size_t off) TAILQ_FOREACH(ia, IN6ADDR_HASH(&t), ia6_hash) { IFA_LOCK(&ia->ia_ifa); if (in6_are_addr_equal_scoped(&t, &ia->ia_addr.sin6_addr, tifscope, ia->ia_addr.sin6_scope_id) && - (ia->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) == 0) { + (ia->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) == 0) { IFA_UNLOCK(&ia->ia_ifa); src = &t; sifscope = tifscope; @@ -2575,13 +2577,13 @@ icmp6_redirect_input(struct mbuf *m, int off, int icmp6len) /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { - nd6log(error, + nd6log0(error, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(&src6)); goto bad; } if (ip6->ip6_hlim != IPV6_MAXHLIM) { - nd6log(error, + nd6log0(error, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(&src6), ip6->ip6_hlim); @@ -2604,7 +2606,7 @@ icmp6_redirect_input(struct mbuf *m, int off, int icmp6len) RT_LOCK(rt); if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6) { - nd6log(error, + nd6log0(error, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); @@ -2615,7 +2617,7 @@ icmp6_redirect_input(struct mbuf *m, int off, int icmp6len) gw6 = &((SIN6(rt->rt_gateway))->sin6_addr); if (!in6_are_addr_equal_scoped(&src6, gw6, src_ifscope, (SIN6(rt->rt_gateway))->sin6_scope_id)) { - nd6log(error, + nd6log0(error, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", @@ -2626,7 +2628,7 @@ icmp6_redirect_input(struct mbuf *m, int off, int icmp6len) goto bad; } } else { - nd6log(error, + nd6log0(error, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); @@ -2637,7 +2639,7 @@ icmp6_redirect_input(struct mbuf *m, int off, int icmp6len) rt = NULL; } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { - nd6log(error, + nd6log0(error, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); @@ -2652,7 +2654,7 @@ icmp6_redirect_input(struct mbuf *m, int off, int icmp6len) is_onlink = 1; /* on-link destination case */ } if (!is_router && !is_onlink) { - nd6log(error, + nd6log0(error, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6)); @@ -3219,6 +3221,7 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) case IPV6_2292RTHDR: case IPV6_BOUND_IF: case IPV6_NO_IFT_CELLULAR: + case IPV6_RECV_LINK_ADDR_TYPE: return ip6_ctloutput(so, sopt); default: diff --git a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index 309110876..eb67cf656 100644 --- a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -3032,6 +3032,10 @@ in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *__single ifa; + if (ifp == NULL) { + return NULL; + } + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { @@ -4429,7 +4433,7 @@ static __attribute__((unused)) void in6ioctl_cassert(void) { /* - * This is equivalent to _CASSERT() and the compiler wouldn't + * This is equivalent to static_assert() and the compiler wouldn't * generate 
any instructions, thus for compile time only. */ switch ((u_long)0) { @@ -4716,10 +4720,10 @@ in6_iahash_insert_ptp(struct in6_ifaddr *ia) * values. */ static __attribute__((unused)) void -tcpsockopt_cassert(void) +ipv6sockopt_cassert(void) { /* - * This is equivalent to _CASSERT() and the compiler wouldn't + * This is equivalent to static_assert() and the compiler wouldn't * generate any instructions, thus for compile time only. */ switch ((int)0) { @@ -4800,6 +4804,7 @@ tcpsockopt_cassert(void) /* bsd/netinet6/in6_private.h */ case IPV6_NO_IFT_CELLULAR: case IPV6_OUT_IF: + case IPV6_RECV_LINK_ADDR_TYPE: ; } } diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index 50df2c163..b9f255f5e 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -107,6 +107,7 @@ #include #include +#include /* * Identification of the network protocol stack diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index ddd264952..dce3227c5 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2023 Apple Inc. All rights reserved. + * Copyright (c) 2003-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -592,7 +592,7 @@ in6_ifattach_loopback( /* add the new interface address */ error = in6_update_ifa(ifp, &ifra, 0, &ia); if (error != 0) { - nd6log(error, + nd6log0(error, "%s: failed to configure loopback address %s (error=%d)\n", __func__, if_name(ifp), error); VERIFY(ia == NULL); @@ -897,7 +897,7 @@ in6_ifattach_aliasreq(struct ifnet *ifp, struct ifnet *altifp, } else { if (in6_select_iid_from_all_hw(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { - nd6log(error, "%s: no IID available\n", + nd6log0(error, "%s: no IID available\n", if_name(ifp)); return EADDRNOTAVAIL; } diff --git a/bsd/netinet6/in6_mcast.c b/bsd/netinet6/in6_mcast.c index 2924045f3..1b23477e1 100644 --- a/bsd/netinet6/in6_mcast.c +++ b/bsd/netinet6/in6_mcast.c @@ -242,7 +242,7 @@ in6m_is_ifp_detached(const struct in6_multi *inm) VERIFY(inm->in6m_ifma != NULL); VERIFY(inm->in6m_ifp == inm->in6m_ifma->ifma_ifp); - return !ifnet_is_attached(inm->in6m_ifp, 0); + return !ifnet_is_fully_attached(inm->in6m_ifp); } /* diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index 4cfbaaf17..49f2950c5 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -811,7 +811,7 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) inp->inp_fifscope = sin6->sin6_scope_id; in6_verify_ifscope(&inp->in6p_faddr, inp->inp_fifscope); if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) { - nstat_pcb_invalidate_cache(inp); + nstat_udp_pcb_invalidate_cache(inp); } in_pcbrehash(inp); lck_rw_done(&inp->inp_pcbinfo->ipi_lock); @@ -842,7 +842,7 @@ in6_pcbdisconnect(struct inpcb *inp) socket_lock(so, 0); } if (nstat_collect && SOCK_PROTO(so) == IPPROTO_UDP) { - nstat_pcb_cache(inp); + nstat_udp_pcb_cache(inp); } bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; @@ -878,8 +878,8 @@ in6_pcbdetach(struct inpcb *inp) } #endif /* IPSEC */ - if (inp->inp_stat != NULL && SOCK_PROTO(so) == IPPROTO_UDP) { - if (inp->inp_stat->rxpackets == 0 && inp->inp_stat->txpackets == 0) { + if (SOCK_PROTO(so) == IPPROTO_UDP) { + if (inp->inp_mstat.ms_total.ts_rxpackets == 0 && inp->inp_mstat.ms_total.ts_txpackets == 0) { INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_no_data); } } diff --git a/bsd/netinet6/in6_private.h b/bsd/netinet6/in6_private.h index c4b9ba032..7b6b79f00 100644 --- 
a/bsd/netinet6/in6_private.h +++ b/bsd/netinet6/in6_private.h @@ -163,8 +163,15 @@ struct route_in6 { * Options for use with [gs]etsockopt at the IPV6 level. * First word of comment is data type; bool is stored in int. */ -#define IPV6_NO_IFT_CELLULAR 6969 /* for internal use only */ -#define IPV6_OUT_IF 9696 /* for internal use only */ +#define IPV6_NO_IFT_CELLULAR 6969 /* for internal use only */ +#define IPV6_OUT_IF 9696 /* for internal use only */ + +#define IPV6_RECV_LINK_ADDR_TYPE 9697 /* bool: receive the type of the link level address */ + +/* + * Values for IPV6_RECV_LINK_ADDR_TYPE in ancillary messages are the same as + * IP6_RECV_LINK_ADDR_TYPE -- see netinet/in_private.h + */ #ifdef BSD_KERNEL_PRIVATE #define CTL_IPV6PROTO_NAMES { \ diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index 9cf56311a..ef2f893d0 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -177,9 +177,7 @@ struct ip6protosw inet6sw[] = { .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = udp_ctloutput, -#if !INET /* don't call initialization twice */ .pr_init = udp_init, -#endif /* !INET */ .pr_usrreqs = &udp6_usrreqs, .pr_lock = udp_lock, .pr_unlock = udp_unlock, @@ -196,9 +194,7 @@ struct ip6protosw inet6sw[] = { .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, -#if !INET /* don't call initialization and timeout routines twice */ .pr_init = tcp_init, -#endif /* !INET */ .pr_drain = tcp_drain, .pr_usrreqs = &tcp6_usrreqs, .pr_lock = tcp_lock, @@ -215,9 +211,7 @@ struct ip6protosw inet6sw[] = { .pr_output = rip6_pr_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, -#if !INET /* don't call initialization and timeout routines twice */ .pr_init = rip_init, -#endif /* !INET */ .pr_usrreqs = &rip6_usrreqs, .pr_unlock = rip_unlock, .pr_update_last_owner = inp_update_last_owner, @@ -357,49 +351,28 @@ in6_dinit(struct domain *dp) inet6domain = dp; - _CASSERT(sizeof(struct protosw) == sizeof(struct ip6protosw)); - _CASSERT(offsetof(struct ip6protosw, pr_entry) == - offsetof(struct protosw, pr_entry)); - _CASSERT(offsetof(struct ip6protosw, pr_domain) == - offsetof(struct protosw, pr_domain)); - _CASSERT(offsetof(struct ip6protosw, pr_protosw) == - offsetof(struct protosw, pr_protosw)); - _CASSERT(offsetof(struct ip6protosw, pr_type) == - offsetof(struct protosw, pr_type)); - _CASSERT(offsetof(struct ip6protosw, pr_protocol) == - offsetof(struct protosw, pr_protocol)); - _CASSERT(offsetof(struct ip6protosw, pr_flags) == - offsetof(struct protosw, pr_flags)); - _CASSERT(offsetof(struct ip6protosw, pr_input) == - offsetof(struct protosw, pr_input)); - _CASSERT(offsetof(struct ip6protosw, pr_output) == - offsetof(struct protosw, pr_output)); - _CASSERT(offsetof(struct ip6protosw, pr_ctlinput) == - offsetof(struct protosw, pr_ctlinput)); - _CASSERT(offsetof(struct ip6protosw, pr_ctloutput) == - offsetof(struct protosw, pr_ctloutput)); - _CASSERT(offsetof(struct ip6protosw, pr_usrreqs) == - offsetof(struct protosw, pr_usrreqs)); - _CASSERT(offsetof(struct ip6protosw, pr_init) == - offsetof(struct protosw, pr_init)); - _CASSERT(offsetof(struct ip6protosw, pr_drain) == - offsetof(struct protosw, pr_drain)); - _CASSERT(offsetof(struct ip6protosw, pr_sysctl) == - offsetof(struct protosw, pr_sysctl)); - _CASSERT(offsetof(struct ip6protosw, pr_lock) == - offsetof(struct protosw, pr_lock)); - _CASSERT(offsetof(struct ip6protosw, pr_unlock) == - offsetof(struct protosw, pr_unlock)); - _CASSERT(offsetof(struct ip6protosw, 
pr_getlock) == - offsetof(struct protosw, pr_getlock)); - _CASSERT(offsetof(struct ip6protosw, pr_filter_head) == - offsetof(struct protosw, pr_filter_head)); - _CASSERT(offsetof(struct ip6protosw, pr_old) == - offsetof(struct protosw, pr_old)); - _CASSERT(offsetof(struct ip6protosw, pr_update_last_owner) == - offsetof(struct protosw, pr_update_last_owner)); - _CASSERT(offsetof(struct ip6protosw, pr_copy_last_owner) == - offsetof(struct protosw, pr_copy_last_owner)); + static_assert(sizeof(struct protosw) == sizeof(struct ip6protosw)); + static_assert(offsetof(struct ip6protosw, pr_entry) == offsetof(struct protosw, pr_entry)); + static_assert(offsetof(struct ip6protosw, pr_domain) == offsetof(struct protosw, pr_domain)); + static_assert(offsetof(struct ip6protosw, pr_protosw) == offsetof(struct protosw, pr_protosw)); + static_assert(offsetof(struct ip6protosw, pr_type) == offsetof(struct protosw, pr_type)); + static_assert(offsetof(struct ip6protosw, pr_protocol) == offsetof(struct protosw, pr_protocol)); + static_assert(offsetof(struct ip6protosw, pr_flags) == offsetof(struct protosw, pr_flags)); + static_assert(offsetof(struct ip6protosw, pr_input) == offsetof(struct protosw, pr_input)); + static_assert(offsetof(struct ip6protosw, pr_output) == offsetof(struct protosw, pr_output)); + static_assert(offsetof(struct ip6protosw, pr_ctlinput) == offsetof(struct protosw, pr_ctlinput)); + static_assert(offsetof(struct ip6protosw, pr_ctloutput) == offsetof(struct protosw, pr_ctloutput)); + static_assert(offsetof(struct ip6protosw, pr_usrreqs) == offsetof(struct protosw, pr_usrreqs)); + static_assert(offsetof(struct ip6protosw, pr_init) == offsetof(struct protosw, pr_init)); + static_assert(offsetof(struct ip6protosw, pr_drain) == offsetof(struct protosw, pr_drain)); + static_assert(offsetof(struct ip6protosw, pr_lock) == offsetof(struct protosw, pr_lock)); + static_assert(offsetof(struct ip6protosw, pr_unlock) == offsetof(struct protosw, pr_unlock)); + static_assert(offsetof(struct ip6protosw, pr_getlock) == offsetof(struct protosw, pr_getlock)); + static_assert(offsetof(struct ip6protosw, pr_filter_head) == offsetof(struct protosw, pr_filter_head)); + static_assert(offsetof(struct ip6protosw, pr_old) == offsetof(struct protosw, pr_old)); + static_assert(offsetof(struct ip6protosw, pr_update_last_owner) == offsetof(struct protosw, pr_update_last_owner)); + static_assert(offsetof(struct ip6protosw, pr_copy_last_owner) == offsetof(struct protosw, pr_copy_last_owner)); + static_assert(offsetof(struct ip6protosw, pr_mem_acct) == offsetof(struct protosw, pr_mem_acct)); /* * Attach first, then initialize. ip6_init() needs raw IP6 handler. diff --git a/bsd/netinet6/in6_rmx.c b/bsd/netinet6/in6_rmx.c index afa0eb874..a4d4bda6c 100644 --- a/bsd/netinet6/in6_rmx.c +++ b/bsd/netinet6/in6_rmx.c @@ -319,13 +319,13 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, if (ret != NULL) { if (flags != rt->rt_flags) { - os_log_debug(OS_LOG_DEFAULT, "%s: route to %s->%s->%s inserted, " + os_log(OS_LOG_DEFAULT, "%s: route to %s->%s->%s inserted, " "oflags=0x%x, flags=0x%x\n", __func__, dbuf, gbuf, (rt->rt_ifp != NULL) ? rt->rt_ifp->if_xname : "", flags, rt->rt_flags); } else { - os_log_debug(OS_LOG_DEFAULT, "%s: route to %s->%s->%s inserted, " + os_log(OS_LOG_DEFAULT, "%s: route to %s->%s->%s inserted, " "flags=0x%x\n", __func__, dbuf, gbuf, (rt->rt_ifp != NULL) ? 
rt->rt_ifp->if_xname : "", rt->rt_flags); diff --git a/bsd/netinet6/ip6_forward.c b/bsd/netinet6/ip6_forward.c index 96bc73a90..ca721d8ea 100644 --- a/bsd/netinet6/ip6_forward.c +++ b/bsd/netinet6/ip6_forward.c @@ -405,7 +405,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, m_freem(mcopy); #endif } - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_CANNOT_FORWARD, NULL, 0); return NULL; } } @@ -608,7 +608,7 @@ skip_ipsec: RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); } - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_TOO_BIG, NULL, 0); return NULL; } @@ -643,7 +643,7 @@ skip_ipsec: RT_UNLOCK(rt); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_POSSIBLE_LOOP, NULL, 0); return NULL; } type = ND_REDIRECT; diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index bdc597352..6038d4e24 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2024 Apple Inc. All rights reserved. + * Copyright (c) 2003-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -177,10 +177,10 @@ u_int32_t in6addr_nhash = 0; /* hash table size */ struct in6_ifaddrhashhead *__counted_by(in6addr_nhash) in6_ifaddrhashtbl = 0; #define IN6_IFSTAT_REQUIRE_ALIGNED_64(f) \ - _CASSERT(!(offsetof(struct in6_ifstat, f) % sizeof (uint64_t))) + static_assert(!(offsetof(struct in6_ifstat, f) % sizeof (uint64_t))) #define ICMP6_IFSTAT_REQUIRE_ALIGNED_64(f) \ - _CASSERT(!(offsetof(struct icmp6_ifstat, f) % sizeof (uint64_t))) + static_assert(!(offsetof(struct icmp6_ifstat, f) % sizeof (uint64_t))) struct ip6stat ip6stat; @@ -352,7 +352,7 @@ ip6_proto_input(protocol_family_t protocol, mbuf_t packet) * All protocols not implemented in kernel go to raw IP6 protocol handler. 
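The _CASSERT list rewritten as static_assert in in6_dinit() above pins struct ip6protosw to the exact size and field offsets of struct protosw, which is what keeps pointers to the two types interchangeable (and why this patch can change icmp6_init() and ip6_init() to take a struct protosw * directly). A toy version of that layout-compatibility check, with invented types:

#include <assert.h>
#include <stddef.h>

struct base_sw {
    int   pr_type;
    void (*pr_init)(void);
};

struct v6_sw {
    int   pr_type;
    void (*pr_init)(void);
};

/* Break the build, not the runtime, if the two layouts ever diverge. */
static_assert(sizeof(struct v6_sw) == sizeof(struct base_sw),
    "sizes must match");
static_assert(offsetof(struct v6_sw, pr_type) == offsetof(struct base_sw, pr_type),
    "pr_type offset must match");
static_assert(offsetof(struct v6_sw, pr_init) == offsetof(struct base_sw, pr_init),
    "pr_init offset must match");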
*/ void -ip6_init(struct ip6protosw *pp, struct domain *dp) +ip6_init(struct protosw *pp, struct domain *dp) { static int ip6_initialized = 0; struct protosw *__single pr; @@ -363,13 +363,13 @@ ip6_init(struct ip6protosw *pp, struct domain *dp) domain_proto_mtx_lock_assert_held(); VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); - _CASSERT((sizeof(struct ip6_hdr) + - sizeof(struct icmp6_hdr)) <= _MHLEN); + static_assert((sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr)) <= _MHLEN); - if (ip6_initialized) { + static_assert(IP_RECV_LINK_ADDR_TYPE == IPV6_RECV_LINK_ADDR_TYPE); + + if (!os_atomic_cmpxchg(&ip6_initialized, 0, 1, relaxed)) { return; } - ip6_initialized = 1; eventhandler_lists_ctxt_init(&in6_evhdlr_ctxt); (void)EVENTHANDLER_REGISTER(&in6_evhdlr_ctxt, in6_event, @@ -598,6 +598,7 @@ ip6_input_adjust(struct mbuf *m, struct ip6_hdr *ip6, uint32_t plen, } } } + static ip6_check_if_result_t ip6_input_check_interface(struct mbuf *m, struct ip6_hdr *ip6, struct ifnet *inifp, struct route_in6 *rin6, struct ifnet **deliverifp) { @@ -623,7 +624,7 @@ ip6_input_check_interface(struct mbuf *m, struct ip6_hdr *ip6, struct ifnet *ini * TODO: should we accept loopback */ if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &tmp_dst, ia6->ia_ifp->if_index, dst_ifscope)) { - if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) { + if ((ia6->ia6_flags & IN6_IFF_NOTREADY) != 0) { continue; } best_ia6 = ia6; @@ -775,6 +776,16 @@ ip6_input_check_interface(struct mbuf *m, struct ip6_hdr *ip6, struct ifnet *ini return result; } +static void +ip6_input_process_wake_packet(struct mbuf *m) +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + + if (if_is_lpw_enabled(ifp)) { + if_exit_lpw(ifp, "IP6 packet"); + } +} + void ip6_input(struct mbuf *m) { @@ -1428,6 +1439,13 @@ injectit: struct ip6_hdr *, ip6, struct ifnet *, inifp, struct ip *, NULL, struct ip6_hdr *, ip6); + /* + * Check if need to switch to full wake mode -- TCP knows about idle connections + */ + if (__improbable(nxt != IPPROTO_TCP && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) != 0)) { + ip6_input_process_wake_packet(m); + } + if ((pr_input = ip6_protox[nxt]->pr_input) == NULL) { m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_NO_PROTO, NULL, 0); m = NULL; @@ -1924,6 +1942,21 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, } } + if (inp->inp_flags2 & INP2_RECV_LINK_ADDR_TYPE) { + int mode = IP_RECV_LINK_ADDR_UNICAST; + + /* There is no broadcast for IPv6 */ + if (m->m_flags & M_MCAST) { + mode = IP_RECV_LINK_ADDR_MULTICAST; + } + + mp = sbcreatecontrol_mbuf((caddr_t)&mode, + sizeof(int), IPV6_RECV_LINK_ADDR_TYPE, IPPROTO_IPV6, mp); + if (*mp == NULL) { + return NULL; + } + } + if (v4only != NULL) { *v4only = 0; } diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index e467cdb0e..3376f1985 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -2061,7 +2061,7 @@ in6_finalize_cksum(struct mbuf *m, uint32_t hoff, int32_t optlen, uint16_t csum, ulpoff, plen; uint8_t nxt; - _CASSERT(sizeof(csum) == sizeof(uint16_t)); + static_assert(sizeof(csum) == sizeof(uint16_t)); VERIFY(m->m_flags & M_PKTHDR); sw_csum = (csum_flags & m->m_pkthdr.csum_flags); @@ -2498,6 +2498,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) case IPV6_RECVTCLASS: case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: + case IPV6_RECV_LINK_ADDR_TYPE: if (optlen != sizeof(int)) { error = EINVAL; break; @@ -2546,6 +2547,7 @@ ip6_ctloutput(struct socket *so, struct sockopt 
*sopt) } while (0) #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) +#define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0) case IPV6_RECVPKTINFO: /* cannot mix with RFC2292 */ @@ -2660,6 +2662,10 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; + + case IPV6_RECV_LINK_ADDR_TYPE: + OPTSET2(INP2_RECV_LINK_ADDR_TYPE); + break; } break; @@ -2933,6 +2939,7 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: + case IPV6_RECV_LINK_ADDR_TYPE: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); @@ -2989,7 +2996,12 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; + + case IPV6_RECV_LINK_ADDR_TYPE: + optval = OPTBIT2(INP2_RECV_LINK_ADDR_TYPE); + break; } + if (error) { break; } @@ -3869,7 +3881,7 @@ ip6_setpktopt(int optname, uint8_t *buf __sized_by(len), int len, struct ip6_pkt return EINVAL; } - opt->ip6po_hlim = *hlimp; + opt->ip6po_hlim = (int16_t)(*hlimp); break; } @@ -3884,7 +3896,7 @@ ip6_setpktopt(int optname, uint8_t *buf __sized_by(len), int len, struct ip6_pkt return EINVAL; } - opt->ip6po_tclass = tclass; + opt->ip6po_tclass = (int16_t)tclass; break; } @@ -4111,7 +4123,7 @@ ip6_setpktopt(int optname, uint8_t *buf __sized_by(len), int len, struct ip6_pkt minmtupolicy != IP6PO_MINMTU_ALL) { return EINVAL; } - opt->ip6po_minmtu = minmtupolicy; + opt->ip6po_minmtu = (int8_t)minmtupolicy; break; case IPV6_DONTFRAG: @@ -4140,7 +4152,7 @@ ip6_setpktopt(int optname, uint8_t *buf __sized_by(len), int len, struct ip6_pkt preftemp != IP6PO_TEMPADDR_PREFER) { return EINVAL; } - opt->ip6po_prefer_tempaddr = preftemp; + opt->ip6po_prefer_tempaddr = (int8_t)preftemp; break; default: diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index ff43ebf0e..4e8b7d486 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -207,8 +207,24 @@ struct ip6po_nhinfo { #define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route struct ip6_pktopts { - struct mbuf *ip6po_m; /* Pointer to mbuf storing the data */ - int ip6po_hlim; /* Hoplimit for outgoing packets */ + int16_t ip6po_hlim; /* Hoplimit for outgoing packets */ + + int16_t ip6po_tclass; /* traffic class */ + + int8_t ip6po_minmtu:3, /* fragment vs PMTU discovery policy */ + ip6po_prefer_tempaddr:3, /* whether temporary addresses are preferred as source address */ + ip6po_flags:2; + +#define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast */ +#define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */ +#define IP6PO_MINMTU_ALL 1 /* always send at min MTU */ + +#define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */ +#define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */ +#define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */ + +#define IP6PO_DONTFRAG 0x01 /* no fragmentation (IPV6_DONTFRAG) */ +#define IP6PO_USECOA 0x02 /* use care of address */ /* Outgoing IF/address information */ struct in6_pktinfo *ip6po_pktinfo; @@ -226,28 +242,6 @@ struct ip6_pktopts { /* Destination options header (after a routing header) */ struct ip6_dest *ip6po_dest2; - - int ip6po_tclass; /* traffic class */ - - int ip6po_minmtu; /* fragment vs PMTU discovery policy */ -#define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast */ -#define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */ -#define IP6PO_MINMTU_ALL 1 /* always send at min MTU */ - - /* 
whether temporary addresses are preferred as source address */ - int ip6po_prefer_tempaddr; - -#define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */ -#define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */ -#define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */ - - int ip6po_flags; -#if 0 /* parameters in this block is obsolete. do not reuse the values. */ -#define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. */ -#define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */ -#endif -#define IP6PO_DONTFRAG 0x04 /* no fragmentation (IPV6_DONTFRAG) */ -#define IP6PO_USECOA 0x08 /* use care of address */ }; /* @@ -534,7 +528,7 @@ extern int icmp6_dgram_attach(struct socket *, int, struct proc *); extern void ip6_register_m_tag(void); -extern void ip6_init(struct ip6protosw *, struct domain *); +extern void ip6_init(struct protosw *, struct domain *); extern void ip6_input(struct mbuf *); extern void ip6_setsrcifaddr_info(struct mbuf *, uint32_t, struct in6_ifaddr *); extern void ip6_setdstifaddr_info(struct mbuf *, uint32_t, struct in6_ifaddr *); diff --git a/bsd/netinet6/ip6protosw.h b/bsd/netinet6/ip6protosw.h index 55d1bf799..30444410e 100644 --- a/bsd/netinet6/ip6protosw.h +++ b/bsd/netinet6/ip6protosw.h @@ -179,10 +179,9 @@ struct ip6protosw { * utility hooks */ void (*pr_init) /* initialization hook */ - (struct ip6protosw *, struct domain *); + (struct protosw *, struct domain *); void (*pr_drain)(void); /* flush any excess space possible */ /* for compat. with IPv4 protosw */ - int (*pr_sysctl)(void); /* sysctl for protocol */ int (*pr_lock) /* lock function for protocol */ (struct socket *so, int refcnt, void *debug); int (*pr_unlock) /* unlock for protocol */ @@ -200,6 +199,9 @@ struct ip6protosw { void (*pr_copy_last_owner) /* copy last socket from listener */ (struct socket *so, struct socket *head); + + /* Memory Accounting instance for this subsystem. */ + struct mem_acct *pr_mem_acct; }; #endif /* BSD_KERNEL_PRIVATE */ #endif /* _NETINET6_IP6PROTOSW_H_ */ diff --git a/bsd/netinet6/mld6.c b/bsd/netinet6/mld6.c index 3565efe19..afa8ef5e1 100644 --- a/bsd/netinet6/mld6.c +++ b/bsd/netinet6/mld6.c @@ -112,6 +112,7 @@ #include #include +#include #include #include @@ -2663,7 +2664,6 @@ mld_handle_state_change(struct in6_multi *inm, struct mld_ifinfo *mli, struct mld_tparams *mtp) { struct ifnet *ifp; - int retval = 0; IN6M_LOCK_ASSERT_HELD(inm); MLI_LOCK_ASSERT_NOTHELD(mli); @@ -2698,16 +2698,13 @@ mld_handle_state_change(struct in6_multi *inm, struct mld_ifinfo *mli, IF_DRAIN(&inm->in6m_scq); - retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, + int retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); mtp->cst = (inm->in6m_scq.ifq_len > 0); MLD_PRINTF(("%s: enqueue record = %d\n", __func__, retval)); if (retval <= 0) { MLI_UNLOCK(mli); - retval *= -1; - goto done; - } else { - retval = 0; + return -retval; } /* @@ -2720,7 +2717,7 @@ mld_handle_state_change(struct in6_multi *inm, struct mld_ifinfo *mli, MLI_UNLOCK(mli); done: - return retval; + return 0; } /* @@ -3690,7 +3687,7 @@ mld_dispatch_packet(struct mbuf *m) * Check if the ifnet is still attached. 
*/ ifp = mld_restore_context(m); - if (ifp == NULL || !ifnet_is_attached(ifp, 0)) { + if (ifp == NULL || !ifnet_is_fully_attached(ifp)) { os_log_error(OS_LOG_DEFAULT, "%s: dropped 0x%llx as interface went away\n", __func__, (uint64_t)VM_KERNEL_ADDRPERM(m)); m_freem(m); diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index f05271df3..3a8fb23ad 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2023 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,8 +81,10 @@ #include #include +#include #include +#include #include #include #include @@ -223,6 +225,7 @@ static void nd6_purge_interface_llinfo(struct ifnet *); static int nd6_sysctl_drlist SYSCTL_HANDLER_ARGS; static int nd6_sysctl_prlist SYSCTL_HANDLER_ARGS; +static int nd6_sysctl_rtilist SYSCTL_HANDLER_ARGS; /* * Insertion and removal from llinfo_nd6 must be done with rnh_lock held. @@ -260,7 +263,11 @@ SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, - nd6_sysctl_prlist, "S,in6_defrouter", ""); + nd6_sysctl_prlist, "S,in6_prefix", ""); + +SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_RTILIST, nd6_rtilist, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + nd6_sysctl_rtilist, "S,in6_route_info", ""); SYSCTL_DECL(_net_inet6_ip6); @@ -708,7 +715,7 @@ nd6_options(union nd_opts *ndopts) case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { - nd6log(error, + nd6log(info, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type); /* XXX bark? */ @@ -1795,7 +1802,7 @@ nd6_timeout(void *arg) sarg.draining = 1; } nd6_service(&sarg); - nd6log3(debug, "%s: found %u, aging_lazy %u, aging %u, " + nd6log4(debug, "%s: found %u, aging_lazy %u, aging %u, " "sticky %u, killed %u\n", __func__, sarg.found, sarg.aging_lazy, sarg.aging, sarg.sticky, sarg.killed); /* re-arm the timer if there's work to do */ @@ -1824,7 +1831,7 @@ nd6_timeout(void *arg) } nd6_sched_timeout(&atv, leeway); } else if (nd6_debug) { - nd6log3(debug, "%s: not rescheduling timer\n", __func__); + nd6log4(debug, "%s: not rescheduling timer\n", __func__); } lck_mtx_unlock(rnh_lock); } @@ -1844,14 +1851,14 @@ nd6_sched_timeout(struct timeval *atv, struct timeval *ltv) /* see comments on top of this file */ if (nd6_timeout_run == 0) { if (ltv == NULL) { - nd6log3(debug, "%s: timer scheduled in " + nd6log4(debug, "%s: timer scheduled in " "T+%llus.%lluu (demand %d)\n", __func__, (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec, nd6_sched_timeout_want); nd6_fast_timer_on = TRUE; timeout(nd6_timeout, &nd6_fast_timer_on, tvtohz(atv)); } else { - nd6log3(debug, "%s: timer scheduled in " + nd6log4(debug, "%s: timer scheduled in " "T+%llus.%lluu with %llus.%lluu leeway " "(demand %d)\n", __func__, (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec, (uint64_t)ltv->tv_sec, @@ -1864,7 +1871,7 @@ nd6_sched_timeout(struct timeval *atv, struct timeval *ltv) nd6_sched_timeout_want = 0; } else if (nd6_timeout_run == 1 && ltv == NULL && nd6_fast_timer_on == FALSE) { - nd6log3(debug, "%s: fast timer scheduled in " + nd6log4(debug, "%s: fast timer scheduled in " "T+%llus.%lluu (demand %d)\n", __func__, (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec, nd6_sched_timeout_want); @@ -1874,12 +1881,12 @@ nd6_sched_timeout(struct timeval *atv, struct timeval *ltv) timeout(nd6_timeout, 
&nd6_fast_timer_on, tvtohz(atv)); } else { if (ltv == NULL) { - nd6log3(debug, "%s: not scheduling timer: " + nd6log4(debug, "%s: not scheduling timer: " "timers %d, fast_timer %d, T+%llus.%lluu\n", __func__, nd6_timeout_run, nd6_fast_timer_on, (uint64_t)atv->tv_sec, (uint64_t)atv->tv_usec); } else { - nd6log3(debug, "%s: not scheduling timer: " + nd6log4(debug, "%s: not scheduling timer: " "timers %d, fast_timer %d, T+%llus.%lluu " "with %llus.%lluu leeway\n", __func__, nd6_timeout_run, nd6_fast_timer_on, @@ -2194,8 +2201,8 @@ nd6_purge_interface_rti_entries(struct ifnet *ifp) * For that reason, installed ones must be inserted * at the tail and uninstalled ones at the head */ - TAILQ_REMOVE(&rti->nd_rti_router_list, dr, dr_entry); + if (dr->stateflags & NDDRF_INSTALLED) { TAILQ_INSERT_TAIL(&rti_tmp.nd_rti_router_list, dr, dr_entry); } else { @@ -3029,7 +3036,7 @@ nd6_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) error = in6_mc_join(ifp, &llsol, NULL, &in6m, 0); if (error) { - nd6log(error, "%s: failed to join " + nd6log0(error, "%s: failed to join " "%s (errno=%d)\n", if_name(ifp), ip6_sprintf(&llsol), error); } else { @@ -3749,6 +3756,8 @@ fail: } if (do_update) { + rt_lookup_qset_id(rt, false); + int route_ev_code = 0; if (llchange) { @@ -3880,6 +3889,7 @@ nd6_output_list(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, uint64_t timenow; rtentry_ref_t rtrele = NULL; struct nd_ifinfo *__single ndi = NULL; + drop_reason_t drop_reason = DROP_REASON_UNSPECIFIED; if (rt != NULL) { RT_LOCK_SPIN(rt); @@ -3930,6 +3940,7 @@ nd6_output_list(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, return error; } } else { + drop_reason = DROP_REASON_IP_NO_ROUTE; senderr(EHOSTUNREACH); } } @@ -3965,6 +3976,7 @@ nd6_output_list(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, ifa_remref(&ia6->ia_ifa); } if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { + drop_reason = DROP_REASON_IP_NO_ROUTE; senderr(EHOSTUNREACH); } goto sendpkt; @@ -3976,6 +3988,7 @@ nd6_output_list(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, /* If hint is now down, give up */ if (!(rt->rt_flags & RTF_UP)) { RT_UNLOCK(rt); + drop_reason = DROP_REASON_IP_NO_ROUTE; senderr(EHOSTUNREACH); } @@ -4021,6 +4034,7 @@ lookup: rtfree_locked(gwrt); } lck_mtx_unlock(rnh_lock); + drop_reason = DROP_REASON_IP_NO_ROUTE; senderr(EHOSTUNREACH); } VERIFY(gwrt != NULL); @@ -4073,6 +4087,7 @@ lookup: rtfree(rt); rt = NULL; /* "rtrele" == original "rt" */ + drop_reason = DROP_REASON_IP_NO_ROUTE; senderr(EHOSTUNREACH); } } @@ -4147,6 +4162,7 @@ lookup: ip6_sprintf(&dst->sin6_addr), (uint64_t)VM_KERNEL_ADDRPERM(ln), (uint64_t)VM_KERNEL_ADDRPERM(rt)); + drop_reason = DROP_REASON_IP6_MEM_ALLOC; senderr(EIO); /* XXX: good error? */ } lck_mtx_unlock(&ndi->lock); @@ -4176,7 +4192,7 @@ lookup: ND6_CACHE_STATE_TRANSITION(ln, ND6_LLINFO_DELAY); ln_setexpire(ln, timenow + nd6_delay); /* N.B.: we will re-arm the timer below. */ - _CASSERT(ND6_LLINFO_DELAY > ND6_LLINFO_INCOMPLETE); + static_assert(ND6_LLINFO_DELAY > ND6_LLINFO_INCOMPLETE); } /* @@ -4299,6 +4315,7 @@ sendpkt: /* discard the packet if IPv6 operation is disabled on the interface */ if (ifp->if_eflags & IFEF_IPV6_DISABLED) { error = ENETDOWN; /* better error? 
*/ + drop_reason = DROP_REASON_IP6_IF_IPV6_DISABLED; goto bad; } @@ -4316,6 +4333,7 @@ sendpkt: IN6_IS_ADDR_LOOPBACK(&ip6->ip6_dst))) { ip6stat.ip6s_badscope++; error = EADDRNOTAVAIL; + drop_reason = DROP_REASON_IP6_BAD_SCOPE; goto bad; } } @@ -4358,7 +4376,8 @@ sendpkt: bad: if (m0 != NULL) { - m_freem_list(m0); + m_drop_list(m0, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, drop_reason, NULL, 0); + m0 = NULL; } release: @@ -4511,7 +4530,7 @@ nd6_lookup_ipv6(ifnet_t ifp, const struct sockaddr_in6 *ip6_dest, sdl = SDL(route->rt_gateway); if (sdl->sdl_alen == 0) { /* this should be impossible, but we bark here for debugging */ - nd6log(error, "%s: route %s on %s%d sdl_alen == 0\n", __func__, + nd6log0(error, "%s: route %s on %s%d sdl_alen == 0\n", __func__, ip6_sprintf(&ip6_dest->sin6_addr), route->rt_ifp->if_name, route->rt_ifp->if_unit); result = EHOSTUNREACH; @@ -4523,6 +4542,17 @@ nd6_lookup_ipv6(ifnet_t ifp, const struct sockaddr_in6 *ip6_dest, release: if (route != NULL) { + /* Set qset id only if there are traffic rules. Else, for bridge + * use cases, the flag will be set and traffic rules won't be + * run on the downstream interface */ + if (result == 0 && ifp->if_eth_traffic_rule_count) { + uint64_t qset_id = rt_lookup_qset_id(route, true); + if (packet != NULL) { + packet->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QSET_ID_VALID; + packet->m_pkthdr.pkt_mpriv_qsetid = qset_id; + } + } + if (route == hint) { RT_REMREF_LOCKED(route); RT_UNLOCK(route); @@ -4784,8 +4814,9 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS p.refcnt = pr->ndpr_addrcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; - LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) - p.advrtrs++; + LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { + p.advrtrs++; + } error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) { NDPR_UNLOCK(pr); @@ -4835,8 +4866,9 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS p.refcnt = pr->ndpr_addrcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; - LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) - p.advrtrs++; + LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { + p.advrtrs++; + } error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) { NDPR_UNLOCK(pr); @@ -4867,6 +4899,108 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS return error; } +static int +nd6_sysctl_rtilist SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + struct nd_route_info *rti = NULL; + struct nd_defrouter *dr = NULL; + char pbuf[MAX_IPv6_STR_LEN]; + int error = 0; + + if (req->newptr != USER_ADDR_NULL) { + return EPERM; + } + + lck_mtx_lock(nd6_mutex); + if (proc_is64bit(req->p)) { + struct in6_route_info_64 d; + struct in6_defrouter_64 drx; + + bzero(&d, sizeof(d)); + + bzero(&drx, sizeof(drx)); + drx.rtaddr.sin6_family = AF_INET6; + drx.rtaddr.sin6_len = sizeof(drx.rtaddr); + + TAILQ_FOREACH(rti, &nd_rti_list, nd_rti_entry) { + d.prefix = rti->nd_rti_prefix; + d.prefixlen = rti->nd_rti_prefixlen; + d.defrtrs = 0; + TAILQ_FOREACH(dr, &rti->nd_rti_router_list, dr_entry) { + d.defrtrs++; + } + error = SYSCTL_OUT(req, &d, sizeof(d)); + if (error != 0) { + break; + } + + TAILQ_FOREACH(dr, &rti->nd_rti_router_list, dr_entry) { + drx.rtaddr.sin6_addr = dr->rtaddr; + if (in6_recoverscope(&drx.rtaddr, + &dr->rtaddr, dr->ifp) != 0) { + nd6log0(error, "scope error in default router " + "list (%s)\n", inet_ntop(AF_INET6, + &dr->rtaddr, pbuf, sizeof(pbuf))); + } + drx.flags = dr->flags; + drx.stateflags = dr->stateflags; + drx.rtlifetime = (u_short)dr->rtlifetime; + drx.expire = (int)nddr_getexpire(dr); + drx.if_index = dr->ifp->if_index; + error = 
SYSCTL_OUT(req, &drx, sizeof(drx)); + if (error != 0) { + break; + } + } + } + } else { + struct in6_route_info_32 d; + struct in6_defrouter_32 drx; + + bzero(&d, sizeof(d)); + + bzero(&drx, sizeof(drx)); + drx.rtaddr.sin6_family = AF_INET6; + drx.rtaddr.sin6_len = sizeof(drx.rtaddr); + + TAILQ_FOREACH(rti, &nd_rti_list, nd_rti_entry) { + d.prefix = rti->nd_rti_prefix; + d.prefixlen = rti->nd_rti_prefixlen; + d.defrtrs = 0; + TAILQ_FOREACH(dr, &rti->nd_rti_router_list, dr_entry) { + d.defrtrs++; + } + error = SYSCTL_OUT(req, &d, sizeof(d)); + if (error != 0) { + break; + } + + TAILQ_FOREACH(dr, &rti->nd_rti_router_list, dr_entry) { + drx.rtaddr.sin6_addr = dr->rtaddr; + if (in6_recoverscope(&drx.rtaddr, + &dr->rtaddr, dr->ifp) != 0) { + nd6log0(error, "scope error in default router " + "list (%s)\n", inet_ntop(AF_INET6, + &dr->rtaddr, pbuf, sizeof(pbuf))); + } + drx.flags = dr->flags; + drx.stateflags = dr->stateflags; + drx.rtlifetime = (u_short)dr->rtlifetime; + drx.expire = (int)nddr_getexpire(dr); + drx.if_index = dr->ifp->if_index; + error = SYSCTL_OUT(req, &drx, sizeof(drx)); + if (error != 0) { + break; + } + } + } + } + lck_mtx_unlock(nd6_mutex); + + return error; +} + void in6_ifaddr_set_dadprogress(struct in6_ifaddr *ia) { diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index bd9796a91..81b08b7bb 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -56,7 +56,13 @@ #ifndef _NETINET6_ND6_H_ #define _NETINET6_ND6_H_ + #include +#include +#include +#ifndef BSD_KERNEL_PRIVATE +#include +#endif #include /* see net/route.h, or net/if_inarp.h */ @@ -73,6 +79,7 @@ #include #include #include +#include struct llinfo_nd6 { /* @@ -385,6 +392,29 @@ struct in6_ndifreq_64 { }; #endif /* BSD_KERNEL_PRIVATE */ +struct in6_route_info { + struct in6_addr prefix; + u_int8_t prefixlen; + u_short defrtrs; /* number of default routers */ + /* struct in6_defrouter defrtr[] */ +} __attribute__((aligned(8))); + +#if defined(BSD_KERNEL_PRIVATE) +struct in6_route_info_32 { + struct in6_addr prefix; + u_int8_t prefixlen; + u_short defrtrs; /* number of default routers */ + /* struct in6_defrouter defrtr[] */ +}; + +struct in6_route_info_64 { + struct in6_addr prefix; + u_int8_t prefixlen; + u_short defrtrs; /* number of default routers */ + /* struct in6_defrouter defrtr[] */ +} __attribute__((aligned(8))); +#endif /* BSD_KERNEL_PRIVATE */ + /* Prefix status */ #define NDPRF_ONLINK 0x1 #define NDPRF_DETACHED 0x2 @@ -398,7 +428,7 @@ struct in6_ndifreq_64 { #define NDPRF_CLAT46 0x40000 #define CLAT46_COLLISION_COUNT_OFFSET 128 -#endif +#endif /* BSD_KERNEL_PRIVATE */ /* protocol constants */ #define MAX_RTR_SOLICITATION_DELAY 1 /* 1sec */ @@ -440,6 +470,7 @@ struct in6_ndifreq_64 { #define MAX_REACHABLE_TIME 3600000 /* msec */ #define REACHABLE_TIME 30000 /* msec */ #define RETRANS_TIMER 1000 /* msec */ +#define MAX_RA_RETRANS_TIMER 10000 /* msec */ #define MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */ #define MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */ #define DEF_TEMP_VALID_LIFETIME 604800 /* 1 week */ @@ -741,6 +772,7 @@ extern int nd6_optimistic_dad; #define nd6log(type, ...) do { if (nd6_debug >= 1) os_log_##type(OS_LOG_DEFAULT, ##__VA_ARGS__); } while (0) #define nd6log2(type, ...) do { if (nd6_debug >= 2) os_log_##type(OS_LOG_DEFAULT, ##__VA_ARGS__); } while (0) #define nd6log3(type, ...) 
do { if (nd6_debug >= 3) os_log_##type(OS_LOG_DEFAULT, ##__VA_ARGS__); } while (0) +#define nd6log4(type, ...) do { if (nd6_debug >= 4) os_log_##type(OS_LOG_DEFAULT, ##__VA_ARGS__); } while (0) #define ND6_OPTIMISTIC_DAD_LINKLOCAL (1 << 0) #define ND6_OPTIMISTIC_DAD_AUTOCONF (1 << 1) diff --git a/bsd/netinet6/nd6_nbr.c b/bsd/netinet6/nd6_nbr.c index f3bee999a..d81fdc3b6 100644 --- a/bsd/netinet6/nd6_nbr.c +++ b/bsd/netinet6/nd6_nbr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,6 +72,7 @@ #include #include +#include #include #include @@ -299,7 +300,7 @@ nd6_ns_input( } if (ip6->ip6_hlim != IPV6_MAXHLIM) { - nd6log(error, + nd6log0(error, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), if_name(ifp)); @@ -339,14 +340,14 @@ nd6_ns_input( } if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) { nd6log(info, "nd6_ns_input: NS packet from non-neighbor\n"); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_ND_STATE, NULL, 0); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_NS_FROM_NON_NEIGHBOR, NULL, 0); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log(info, "nd6_ns_input: bad NS target (multicast)\n"); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_ND_STATE, NULL, 0); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_NS_TO_MULTICAST, NULL, 0); goto bad; } @@ -356,7 +357,7 @@ nd6_ns_input( if (nd6_options(&ndopts) < 0) { nd6log(info, "nd6_ns_input: invalid ND option, ignored\n"); /* nd6_options have incremented stats */ - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_ND_STATE, NULL, 0); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_NS_BAD_ND_OPT, NULL, 0); goto bad; } @@ -470,7 +471,7 @@ nd6_ns_input( "nd6_ns_input: lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_ND_STATE, NULL, 0); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_NS_BAD_LLADDR_LEN, NULL, 0); goto bad; } @@ -478,7 +479,7 @@ nd6_ns_input( nd6log(info, "nd6_ns_input: duplicate IP6 address %s\n", ip6_sprintf(&saddr6)); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_ND_STATE, NULL, 0); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_NS_DUPLICATE_ADDRESS, NULL, 0); goto bad; } @@ -656,7 +657,7 @@ nd6_ns_output( im6o = ip6_allocmoptions(Z_NOWAIT); if (im6o == NULL) { - m_freem(m); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_MEM_ALLOC, NULL, 0); return; } @@ -710,7 +711,7 @@ nd6_ns_output( * Otherwise, we perform the source address selection as usual. */ struct ip6_hdr *__single hip6; /* hold ip6 */ - struct in6_addr *__single hsrc = NULL; + struct in6_addr hsrc = {}; /* Caller holds ref on this route */ if (ln != NULL) { @@ -723,9 +724,7 @@ nd6_ns_output( hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? 
*/ if (sizeof(*hip6) < ln->ln_hold->m_len) { - hsrc = &hip6->ip6_src; - } else { - hsrc = NULL; + memcpy(&hsrc, &hip6->ip6_src, sizeof(struct in6_addr)); } } /* Update probe count, if applicable */ @@ -737,9 +736,9 @@ nd6_ns_output( rtflags = ln->ln_rt->rt_flags; RT_UNLOCK(ln->ln_rt); } - if (hsrc != NULL && (ia = in6ifa_ifpwithaddr(ifp, hsrc)) && + if (!IN6_IS_ADDR_UNSPECIFIED(&hsrc) && (ia = in6ifa_ifpwithaddr(ifp, &hsrc)) && (ia->ia6_flags & IN6_IFF_OPTIMISTIC) == 0) { - src = hsrc; + src = &hsrc; } else { int error; struct sockaddr_in6 dst_sa; @@ -803,6 +802,7 @@ nd6_ns_output( src = &src_in; ip6oa.ip6oa_flags &= ~IP6OAF_BOUND_SRCADDR; } + ip6->ip6_src = *src; ip6_output_setsrcifscope(m, ifp->if_index, ia); nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); @@ -919,7 +919,7 @@ exit: return; bad: - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, drop_reason, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, drop_reason, NULL, 0); goto exit; } @@ -993,14 +993,14 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) nd6log(error, "nd6_na_input: invalid target address %s\n", ip6_sprintf(&taddr6)); - drop_reason = DROP_REASON_IP_DST_ADDR_NO_AVAIL; + drop_reason = DROP_REASON_IP6_NA_INVALID_TARGET; goto bad; } if (IN6_IS_ADDR_MULTICAST(&daddr6)) { if (is_solicited) { nd6log(error, "nd6_na_input: a solicited adv is multicasted\n"); - drop_reason = DROP_REASON_IP6_BAD_ND_STATE; + drop_reason = DROP_REASON_IP6_NA_DST_MULTICAST; goto bad; } } @@ -1024,7 +1024,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) "(if %d, NA packet %d)\n", ip6_sprintf(&taddr6), ifp->if_addrlen, lladdrlen - 2); - drop_reason = DROP_REASON_IP6_BAD_ND_STATE; + drop_reason = DROP_REASON_IP6_NA_BAD_LLADDR_LEN; goto bad; } } @@ -1047,12 +1047,12 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) */ if ((rt = nd6_lookup(&taddr6, 0, ifp, 0)) == NULL) { if (!ip6_forwarding || !nd6_prproxy) { - drop_reason = DROP_REASON_IP6_BAD_ND_STATE; + drop_reason = DROP_REASON_IP6_NA_NOT_CACHED_SCOPED; goto freeit; } if ((rt = nd6_lookup(&taddr6, 0, NULL, 0)) == NULL) { - drop_reason = DROP_REASON_IP6_BAD_ND_STATE; + drop_reason = DROP_REASON_IP6_NA_NOT_CACHED; goto freeit; } @@ -1089,7 +1089,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) (sdl = SDL(rt->rt_gateway)) == NULL) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - drop_reason = DROP_REASON_IP6_BAD_ND_STATE; + drop_reason = DROP_REASON_IP6_NA_MISSING_ROUTE; goto freeit; } @@ -1103,7 +1103,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) if (ifp->if_addrlen && !lladdr) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - drop_reason = DROP_REASON_IP6_BAD_ND_STATE; + drop_reason = DROP_REASON_IP6_NA_MISSING_LLADDR_OPT; goto freeit; } @@ -1134,6 +1134,8 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) ln_setexpire(ln, timenow + nd6_gctimer); } + rt_lookup_qset_id(rt, false); + /* * Enqueue work item to invoke callback for this * route entry @@ -1269,6 +1271,7 @@ nd6_na_input(struct mbuf *m, int off, int icmp6len) */ /* Enqueue work item to invoke callback for this route entry */ if (llchange) { + rt_lookup_qset_id(rt, false); route_event_enqueue_nwk_wq_entry(rt, NULL, ROUTE_LLENTRY_CHANGED, NULL, TRUE); } @@ -1493,7 +1496,7 @@ nd6_na_output( im6o = ip6_allocmoptions(Z_NOWAIT); if (im6o == NULL) { - m_freem(m); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_MEM_ALLOC, NULL, 0); return; } @@ -1521,7 +1524,7 @@ nd6_na_output( daddr6.s6_addr32[2] = 0; daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE; if 
(in6_setscope(&daddr6, ifp, NULL)) { - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_SCOPE, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_SCOPE, NULL, 0); goto exit; } @@ -1546,7 +1549,7 @@ nd6_na_output( nd6log(info, "nd6_na_output: source can't be " "determined: dst=%s, error=%d\n", ip6_sprintf(&dst_sa.sin6_addr), error); - m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_ND_STATE, NULL, 0); + m_drop_if(m, ifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_NA_UNKNOWN_SRC_ADDR, NULL, 0); goto exit; } ip6->ip6_src = *src; @@ -2039,7 +2042,8 @@ nd6_dad_timer(struct ifaddr *ifa) * becomes clear when a looped back probe is detected. */ nd6log0(info, - "%s: a looped back NS message is detected during DAD for %s. Another DAD probe is being sent on interface %s.\n", + "%s: a looped back NS message is detected during DAD for %s. " + "Another DAD probe is being sent on interface %s.\n", __func__, ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ia->ia_ifp)); /* @@ -2116,7 +2120,7 @@ nd6_dad_duplicated(struct ifaddr *ifa) } IFA_LOCK(&ia->ia_ifa); DAD_LOCK(dp); - nd6log(error, "%s: NS in/out/loopback=%d/%d/%d, NA in=%d\n", + nd6log(info, "%s: NS in/out/loopback=%d/%d/%d, NA in=%d\n", __func__, dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount, dp->dad_na_icount); candisable = FALSE; @@ -2373,7 +2377,7 @@ nd6_dad_na_input(struct mbuf *m, struct ifnet *ifp, struct in6_addr *taddr, if (ip6a && (ip6a->ip6a_flags & IP6A_HASEEN) != 0 && bcmp(ip6a->ip6a_ehsrc, lladdr, ETHER_ADDR_LEN) != 0) { IFA_UNLOCK(ifa); - nd6log(error, "%s: ignoring duplicate NA on %s " + nd6log0(info, "%s: ignoring duplicate NA on %s " "[eh_src != tgtlladdr]\n", __func__, if_name(ifp)); goto done; } @@ -2707,8 +2711,8 @@ nd6_alt_node_absent(struct ifnet *ifp, struct sockaddr_in6 *sin6, struct sockadd "for interface %s.\n", __func__, ip6_sprintf(&sin6->sin6_addr), ifp->if_xname); } else { - nd6log(error, "%s: Failed to delete host route to %s " - "for interface %s with error :%d.\n", __func__, + nd6log0(error, "%s: Failed to delete host route to %s " + "for interface %s with error: %d.\n", __func__, ip6_sprintf(&sin6->sin6_addr), ifp->if_xname, error); } diff --git a/bsd/netinet6/nd6_prproxy.c b/bsd/netinet6/nd6_prproxy.c index f78ba78ee..7038975b6 100644 --- a/bsd/netinet6/nd6_prproxy.c +++ b/bsd/netinet6/nd6_prproxy.c @@ -90,6 +90,7 @@ #include #include +#include #include #include diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index 783204fd6..0c0540fd2 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2024 Apple Inc. All rights reserved. + * Copyright (c) 2003-2025 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,9 +71,11 @@ #include #include +#include #include #include +#include #include #include #include @@ -250,6 +252,7 @@ nd6_rs_input( char *lladdr = NULL; int lladdrlen = 0; union nd_opts ndopts = {}; + drop_reason_t drop_reason = DROP_REASON_UNSPECIFIED; /* Expect 32-bit aligned data pointer on strict-align platforms */ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); @@ -261,10 +264,11 @@ nd6_rs_input( /* Sanity checks */ if (ip6->ip6_hlim != IPV6_MAXHLIM) { - nd6log(error, + nd6log0(error, "nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), if_name(ifp)); + drop_reason = DROP_REASON_IP6_BAD_HLIM; goto bad; } @@ -313,6 +317,7 @@ nd6_rs_input( "nd6_rs_input: lladdrlen mismatch for %s " "(if %d, RS packet %d)\n", ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2); + drop_reason = DROP_REASON_IP6_RS_BAD_LLADDR_LEN; goto bad; } @@ -324,7 +329,7 @@ freeit: bad: icmp6stat.icp6s_badrs++; - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, drop_reason, NULL, 0); } #define ND_OPT_LEN_TO_BYTE_SCALE 3 /* ND opt len is in units of 8 octets */ @@ -364,6 +369,7 @@ nd6_ra_input( u_int32_t advreachable; boolean_t rti_defrtr_processed = FALSE; boolean_t is_local_ra = FALSE; + drop_reason_t drop_reason = DROP_REASON_UNSPECIFIED; #if (DEVELOPMENT || DEBUG) if (ip6_accept_rtadv == 0) { @@ -406,17 +412,19 @@ nd6_ra_input( } if (ip6->ip6_hlim != IPV6_MAXHLIM) { - nd6log(error, + nd6log0(error, "nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(&ip6->ip6_src), ip6_sprintf(&ip6->ip6_dst), if_name(ifp)); + drop_reason = DROP_REASON_IP6_BAD_HLIM; goto bad; } if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) { - nd6log(error, + nd6log0(error, "nd6_ra_input: src %s is not link-local\n", ip6_sprintf(&saddr6)); + drop_reason = DROP_REASON_IP6_RA_NOT_LL; goto bad; } @@ -460,13 +468,21 @@ nd6_ra_input( } } if (nd_ra->nd_ra_retransmit) { - ndi->retrans = ntohl(nd_ra->nd_ra_retransmit); + u_int32_t retrans = ntohl(nd_ra->nd_ra_retransmit); + if (retrans < MAX_RA_RETRANS_TIMER) { + ndi->retrans = retrans; + } else { + nd6log0(info, "%s: ignoring retrans time of %u in RA from %s ;" + " Using default of %u", + __func__, + retrans, ip6_sprintf(&ip6->ip6_src), ndi->retrans); + } } if (nd_ra->nd_ra_curhoplimit) { if (ndi->chlim < nd_ra->nd_ra_curhoplimit) { ndi->chlim = nd_ra->nd_ra_curhoplimit; } else if (ndi->chlim != nd_ra->nd_ra_curhoplimit) { - nd6log(error, + nd6log0(error, "RA with a lower CurHopLimit sent from " "%s on %s (current = %d, received = %d). 
" "Ignored.\n", ip6_sprintf(&ip6->ip6_src), @@ -850,6 +866,7 @@ skip: "nd6_ra_input: lladdrlen mismatch for %s " "(if %d, RA packet %d)\n", ip6_sprintf(&saddr6), ifp->if_addrlen, lladdrlen - 2); + drop_reason = DROP_REASON_IP6_RA_BAD_LLADDR_LEN; goto bad; } @@ -874,7 +891,9 @@ skip: lck_mtx_unlock(nd6_mutex); freeit: - m_freem(m); + if (m) { + m_freem(m); + } if (dr) { NDDR_REMREF(dr); } @@ -888,6 +907,9 @@ freeit: return; bad: + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, drop_reason, NULL, 0); + m = NULL; + icmp6stat.icp6s_badra++; goto freeit; } @@ -929,6 +951,11 @@ defrouter_addreq(struct nd_defrouter *new, struct nd_route_info *rti, boolean_t int err; struct nd_ifinfo *ndi = ND_IFINFO(new->ifp); int rtflags = RTF_GATEWAY; + if (rti) { + nd6log(info, "%s: defrouter_addreq prefix %s scoped=%d", __func__, ip6_sprintf(&rti->nd_rti_prefix), scoped); + } else { + nd6log(info, "%s: defrouter_addreq scoped=%d", __func__, scoped); + } LCK_MTX_ASSERT(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); NDDR_LOCK_ASSERT_NOTHELD(new); @@ -939,6 +966,7 @@ defrouter_addreq(struct nd_defrouter *new, struct nd_route_info *rti, boolean_t NDDR_LOCK(new); if (new->stateflags & NDDRF_INSTALLED) { + nd6log(info, "%s: defrouter_addreq already installed", __func__); goto out; } if (new->ifp->if_ipv6_router_mode == IPV6_ROUTER_MODE_EXCLUSIVE) { @@ -1017,8 +1045,9 @@ defrouter_addreq(struct nd_defrouter *new, struct nd_route_info *rti, boolean_t new->rtaddr_mapped = gate.sin6_addr; new->stateflags |= NDDRF_MAPPED; - nd6log(info, "%s: Default router %s mapped " - "to ", if_name(new->ifp), ip6_sprintf(&new->rtaddr)); + nd6log(info, "%s: %s Default router %s mapped " + "to ", __func__, if_name(new->ifp), ip6_sprintf(&new->rtaddr)); + nd6log(info, "%s\n", ip6_sprintf(&new->rtaddr_mapped)); nd6log(info, "%s\n", ip6_sprintf(&new->rtaddr_mapped)); } } @@ -1037,7 +1066,7 @@ defrouter_addreq(struct nd_defrouter *new, struct nd_route_info *rti, boolean_t new->stateflags |= NDDRF_IFSCOPE; } } else { - nd6log(error, "%s: failed to add default router " + nd6log0(error, "%s: failed to add default router " "%s on %s scoped %d (errno = %d)\n", __func__, ip6_sprintf(&gate.sin6_addr), if_name(new->ifp), (ifscope != IFSCOPE_NONE), err); @@ -1165,7 +1194,7 @@ defrouter_delreq(struct nd_defrouter *dr, struct nd_route_info *rti) RT_UNLOCK(oldrt); rtfree(oldrt); } else if (err != ESRCH) { - nd6log(error, "%s: failed to delete default router " + nd6log0(error, "%s: failed to delete default router " "%s on %s scoped %d (errno = %d)\n", __func__, ip6_sprintf(&gate.sin6_addr), dr->ifp != NULL ? if_name(dr->ifp) : "ANY", (ifscope != IFSCOPE_NONE), err); @@ -1798,9 +1827,9 @@ defrouter_select(struct ifnet *ifp, struct nd_drhead *nd_router_listp) lck_mtx_lock(nd6_mutex); } else { /* this should not happen; warn for diagnosis */ - nd6log(error, "defrouter_select: more than one " - "default router is installed for interface :%s.\n", - if_name(installed_dr->ifp)); + nd6log0(error, "%s: more than one " + "default router is installed for interface: %s\n", + __func__, if_name(installed_dr->ifp)); NDDR_UNLOCK(dr); } } else { @@ -1932,10 +1961,18 @@ install_route: */ lck_mtx_unlock(nd6_mutex); if (installed_dr != selected_dr) { - nd6log(info, - "%s:%d: Found a better router for interface " - "%s. Installing new default route.\n", - __func__, __LINE__, if_name(ifp)); + if (rti) { + nd6log(info, + "%s:%d: Found a better router for interface " + "%s. 
Installing new default route: %s/%p\n", + __func__, __LINE__, if_name(ifp), + ip6_sprintf(&rti->nd_rti_prefix), &rti->nd_rti_prefix); + } else { + nd6log(info, + "%s:%d: Found a better router for interface " + "%s. Installing new default route. NO RTI\n", + __func__, __LINE__, if_name(ifp)); + } if (installed_dr != NULL) { defrouter_delreq(installed_dr, rti); } @@ -2081,7 +2118,7 @@ defrtrlist_update_common(struct nd_defrouter *new, struct nd_drhead *nd_router_l /* * preferred router may be changed, so relocate * this router. - * XXX: calling TAILQ_REMOVE directly is a bad manner. + * XXX: calling TAILQ_REMOVE directly is bad manners. * However, since defrtrlist_del() has many side * effects, we intentionally do so here. * defrouter_select() below will handle routing @@ -2106,7 +2143,7 @@ defrtrlist_update_common(struct nd_defrouter *new, struct nd_drhead *nd_router_l ndi->ndefrouters >= ip6_maxifdefrouters) { lck_mtx_unlock(&ndi->lock); nddr_free(n); - nd6log(error, "%s: ignoring router addition as we have hit the " + nd6log0(error, "%s: ignoring router addition as we have hit the " "max limit of %d for max default routers.\n", __func__, ip6_maxifdefrouters); return NULL; @@ -2344,7 +2381,7 @@ nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, if ((e = nd6_prefix_onlink_common(new, force_scoped, new->ndpr_ifp->if_index)) != 0) { - nd6log(error, "nd6_prelist_add: failed to make " + nd6log0(error, "nd6_prelist_add: failed to make " "the prefix %s/%d on-link %s on %s (errno=%d)\n", ip6_sprintf(&new->ndpr_prefix.sin6_addr), new->ndpr_plen, force_scoped ? "scoped" : @@ -2408,7 +2445,7 @@ prelist_remove(struct nd_prefix *pr) NDPR_UNLOCK(pr); lck_mtx_unlock(nd6_mutex); if ((error = nd6_prefix_offlink(pr)) != 0) { - nd6log(error, "prelist_remove: failed to make " + nd6log0(error, "prelist_remove: failed to make " "%s/%d offlink on %s, errno=%d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp), error); @@ -2435,7 +2472,7 @@ prelist_remove(struct nd_prefix *pr) err = nd6_prefix_offlink(tmp_pr); lck_mtx_lock(nd6_mutex); if (err != 0) { - nd6log(error, + nd6log0(error, "%s: failed to make %s/%d offlink on %s, " "errno=%d\n", __func__, ip6_sprintf(&tmp_pr->ndpr_prefix.sin6_addr), @@ -2444,14 +2481,14 @@ prelist_remove(struct nd_prefix *pr) err = nd6_prefix_onlink_scoped(tmp_pr, IFSCOPE_NONE); if (err != 0) { - nd6log(error, + nd6log0(error, "%s: failed to make %s/%d onlink on %s, errno=%d\n", __func__, ip6_sprintf(&tmp_pr->ndpr_prefix.sin6_addr), tmp_pr->ndpr_plen, if_name(tmp_pr->ndpr_ifp), err); } if (err != 0) { - nd6log(error, + nd6log0(error, "%s: error unscoping %s/%d from %s\n", __func__, ip6_sprintf(&tmp_pr->ndpr_prefix.sin6_addr), tmp_pr->ndpr_plen, if_name(tmp_pr->ndpr_ifp)); @@ -2573,7 +2610,7 @@ prelist_update( NDPR_UNLOCK(pr); if ((e = nd6_prefix_onlink(pr)) != 0) { - nd6log(error, + nd6log0(error, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " "(errno=%d)\n", @@ -3363,7 +3400,7 @@ pfxlist_onlink_check(void) pr->ndpr_raf_onlink) { NDPR_UNLOCK(pr); if ((error = nd6_prefix_onlink(pr)) != 0) { - nd6log(error, + nd6log0(error, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), @@ -3405,8 +3442,8 @@ pfxlist_onlink_check(void) err = ifnet_get_address_list_family_internal(NULL, &ifap, &addresses_count, AF_INET6, 0, M_NOWAIT, 0); if (err != 0 || ifap == NULL) { - nd6log(error, "%s: ifnet_get_address_list_family_internal " - "failed", __func__); + nd6log0(error, "%s: 
ifnet_get_address_list_family_internal " + "failed. err=%d", __func__, err); return; } for (i = 0; ifap[i]; i++) { @@ -3539,6 +3576,7 @@ nd6_prefix_sync(struct ifnet *ifp) { struct nd_prefix *__single pr, *__single opr; int err = 0; + uint64_t timenow; LCK_MTX_ASSERT(nd6_mutex, LCK_MTX_ASSERT_OWNED); @@ -3546,9 +3584,17 @@ nd6_prefix_sync(struct ifnet *ifp) return; } - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + + net_update_uptime(); + timenow = net_uptime(); + + LIST_FOREACH(pr, &nd_prefix, ndpr_entry) { NDPR_LOCK(pr); - if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { + if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { + NDPR_UNLOCK(pr); + continue; + } + if (pr->ndpr_expire != 0 && pr->ndpr_expire < timenow) { NDPR_UNLOCK(pr); continue; } @@ -3572,14 +3618,14 @@ nd6_prefix_sync(struct ifnet *ifp) err = nd6_prefix_offlink(opr); lck_mtx_lock(nd6_mutex); if (err != 0) { - nd6log(error, + nd6log0(error, "%s: failed to make %s/%d offlink on %s, " "errno=%d\n", __func__, ip6_sprintf(&opr->ndpr_prefix.sin6_addr), opr->ndpr_plen, if_name(opr->ndpr_ifp), err); } } else { - nd6log(error, + nd6log0(error, "%s: scoped %s/%d on %s has no matching unscoped prefix\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp)); @@ -3589,7 +3635,7 @@ nd6_prefix_sync(struct ifnet *ifp) err = nd6_prefix_offlink(pr); lck_mtx_lock(nd6_mutex); if (err != 0) { - nd6log(error, + nd6log0(error, "%s: failed to make %s/%d offlink on %s, errno=%d\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), err); @@ -3599,7 +3645,7 @@ nd6_prefix_sync(struct ifnet *ifp) if (opr != NULL) { err = nd6_prefix_onlink_scoped(opr, opr->ndpr_ifp->if_index); if (err != 0) { - nd6log(error, + nd6log0(error, "%s: failed to make %s/%d scoped onlink on %s, " "errno=%d\n", __func__, ip6_sprintf(&opr->ndpr_prefix.sin6_addr), @@ -3609,14 +3655,14 @@ nd6_prefix_sync(struct ifnet *ifp) err = nd6_prefix_onlink_scoped(pr, IFSCOPE_NONE); if (err != 0) { - nd6log(error, + nd6log0(error, "%s: failed to make %s/%d onlink on %s, errno=%d\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), err); } if (err != 0) { - nd6log(error, + nd6log0(error, "%s: error promoting %s/%d to %s from %s\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), @@ -3652,7 +3698,7 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, /* sanity check */ NDPR_LOCK(pr); if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { - nd6log(error, + nd6log0(error, "%s: %s/%d on %s scoped=%d is already on-link\n", __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), @@ -3795,7 +3841,7 @@ nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, NDPR_LOCK(pr); } else { NDPR_LOCK(pr); - nd6log(error, "nd6_prefix_onlink: failed to add route for a" + nd6log0(error, "nd6_prefix_onlink: failed to add route for a" " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%x," " scoped=%d, errno = %d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), @@ -3895,7 +3941,7 @@ nd6_prefix_offlink(struct nd_prefix *pr) /* sanity check */ NDPR_LOCK(pr); if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { - nd6log(error, + nd6log0(error, "nd6_prefix_offlink: %s/%d on %s scoped=%d is already " "off-link\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(pr->ndpr_ifp), @@ -3934,7 +3980,7 @@ nd6_prefix_offlink(struct nd_prefix *pr) RT_UNLOCK(rt); rtfree(rt); } else { - nd6log(error, + 
nd6log0(error, "nd6_prefix_offlink: failed to delete route: " "%s/%d on %s, scoped %d, (errno = %d)\n", ip6_sprintf(&sa6.sin6_addr), prefix_len, if_name(ifp), @@ -4105,11 +4151,11 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, } } else { if (!is_clat46) { - nd6log(error, "%s: no CGA available (%s)\n", - __func__, if_name(ifp)); + nd6log0(error, "%s: no CGA available (%s) err=%d\n", + __func__, if_name(ifp), error); } else { - nd6log(error, "%s: no CLAT46 available (%s)\n", - __func__, if_name(ifp)); + nd6log0(error, "%s: no CLAT46 available (%s) err=%d\n", + __func__, if_name(ifp), error); } goto done; } @@ -4159,7 +4205,7 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, } error = in6_update_ifa(ifp, &ifra, ifaupdate, &ia6); if (error != 0) { - nd6log(error, + nd6log0(error, "%s: failed to make ifaddr %s on %s (errno=%d)\n", __func__, ip6_sprintf(&ifra.ifra_addr.sin6_addr), if_name(ifp), error); @@ -4286,7 +4332,8 @@ again: ifaupdate = IN6_IFAUPDATE_NOWAIT | IN6_IFAUPDATE_DADDELAY; error = in6_update_ifa(ifp, &ifra, ifaupdate, &newia); if (error != 0) { - nd6log(error, "in6_tmpifadd: failed to add address.\n"); + nd6log0(error, "%s: failed to add address. err=%d\n", + __func__, error); return error; } VERIFY(newia != NULL); @@ -4298,7 +4345,7 @@ again: * We lost the race with another thread that has purged * ia0 address; in this case, purge the tmp addr as well. */ - nd6log(error, "in6_tmpifadd: no public address\n"); + nd6log0(error, "in6_tmpifadd: no public address\n"); VERIFY(!(ia0->ia6_flags & IN6_IFF_AUTOCONF)); IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); in6_purgeaddr(&newia->ia_ifa); diff --git a/bsd/netinet6/nd6_var.h b/bsd/netinet6/nd6_var.h index 8c810386a..621a0c3da 100644 --- a/bsd/netinet6/nd6_var.h +++ b/bsd/netinet6/nd6_var.h @@ -58,6 +58,10 @@ #define _NETINET6_ND6_VAR_H_ #ifdef BSD_KERNEL_PRIVATE +#include +#include +#include + struct nd_ifinfo { decl_lck_mtx_data(, lock); boolean_t initialized; /* Flag to see the entry is initialized */ diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index 3a5caac50..dc37c7332 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -101,6 +101,8 @@ #include #include +#include + #include #include #include @@ -261,7 +263,8 @@ rip6_input( SO_RECV_CONTROL_OPTS(last->in6p_socket)) { ret = ip6_savecontrol(last, m, &opts); if (ret != 0) { - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP_ENOBUFS, NULL, 0); + m = NULL; m_freem(opts); ip6stat.ip6s_delivered--; goto unlock; @@ -374,8 +377,7 @@ rip6_output( struct ip6_moptions *__single im6o = NULL; struct ifnet *__single oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ - int sotc = SO_TC_UNSPEC; - int netsvctype = _NET_SERVICE_TYPE_UNSPEC; + struct sock_cm_info sockcminfo; struct ip6_out_args ip6oa; int flags = IPV6_OUTARGS; struct sockaddr_in6 tmp; @@ -506,9 +508,11 @@ rip6_output( ip6oa.ip6oa_flags |= IP6OAF_ULTRA_CONSTRAINED_ALLOWED; } + sock_init_cm_info(&sockcminfo, so); + dst = &dstsock->sin6_addr; if (control) { - sotc = so_tc_from_control(control, &netsvctype); + sock_parse_cm_info(control, &sockcminfo); if ((error = ip6_setpktopts(control, &opt, in6p->in6p_outputopts, SOCK_PROTO(so))) != 0) { @@ -518,12 +522,8 @@ rip6_output( } else { optp = in6p->in6p_outputopts; } - if (sotc == SO_TC_UNSPEC) { - sotc = so->so_traffic_class; - netsvctype = so->so_netsvctype; - } - ip6oa.ip6oa_sotc = sotc; - ip6oa.ip6oa_netsvctype = netsvctype; + ip6oa.ip6oa_sotc = 
sockcminfo.sotc; + ip6oa.ip6oa_netsvctype = sockcminfo.netsvctype; /* * For an ICMPv6 packet, we should know its type and code @@ -805,7 +805,10 @@ rip6_output( oifp = NULL; } - set_packet_service_class(m, so, sotc, PKT_SCF_IPV6); + set_packet_service_class(m, so, sockcminfo.sotc, PKT_SCF_IPV6); + if (sockcminfo.tx_time) { + mbuf_set_tx_time(m, sockcminfo.tx_time); + } m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; m->m_pkthdr.pkt_flowid = in6p->inp_flowhash; m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | @@ -903,7 +906,7 @@ rip6_output( bad: if (m != NULL) { - m_drop(m, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, drop_reason, NULL, 0); + m_drop_if(m, oifp, DROPTAP_FLAG_DIR_OUT | DROPTAP_FLAG_L2_MISSING, drop_reason, NULL, 0); } freectl: diff --git a/bsd/netinet6/route6.c b/bsd/netinet6/route6.c index d237890df..765e7c844 100644 --- a/bsd/netinet6/route6.c +++ b/bsd/netinet6/route6.c @@ -90,7 +90,8 @@ route6_input(struct mbuf **mp, int *offp, int proto) if (ip6a->ip6a_flags & IP6A_SWAP) { ip6stat.ip6s_badoptions++; *mp = NULL; - m_freem(m); + m_drop(m, DROPTAP_FLAG_DIR_IN | DROPTAP_FLAG_L2_MISSING, DROP_REASON_IP6_BAD_OPTION, NULL, 0); + return IPPROTO_DONE; } } diff --git a/bsd/netinet6/udp6_output.c b/bsd/netinet6/udp6_output.c index f848f1cc3..33d6e8426 100644 --- a/bsd/netinet6/udp6_output.c +++ b/bsd/netinet6/udp6_output.c @@ -107,6 +107,8 @@ #include #include +#include + #include #include @@ -165,8 +167,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, int flags; struct sockaddr_in6 tmp; struct in6_addr storage; - int sotc = SO_TC_UNSPEC; - int netsvctype = _NET_SERVICE_TYPE_UNSPEC; + struct sock_cm_info sockcminfo; struct ip6_out_args ip6oa; struct flowadv *__single adv = &ip6oa.ip6oa_flowadv; struct socket *__single so = in6p->in6p_socket; @@ -249,8 +250,11 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, } #endif + sock_init_cm_info(&sockcminfo, so); + if (control) { - sotc = so_tc_from_control(control, &netsvctype); + sock_parse_cm_info(control, &sockcminfo); + if ((error = ip6_setpktopts(control, &opt, in6p->in6p_outputopts, IPPROTO_UDP)) != 0) { drop_reason = DROP_REASON_IP6_BAD_OPTION; @@ -262,12 +266,8 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, optp = in6p->in6p_outputopts; } - if (sotc == SO_TC_UNSPEC) { - sotc = so->so_traffic_class; - netsvctype = so->so_netsvctype; - } - ip6oa.ip6oa_sotc = sotc; - ip6oa.ip6oa_netsvctype = netsvctype; + ip6oa.ip6oa_sotc = sockcminfo.sotc; + ip6oa.ip6oa_netsvctype = sockcminfo.netsvctype; in6p->inp_sndinprog_cnt++; sndinprog_cnt_used = true; @@ -569,7 +569,10 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, /* Copy the cached route and take an extra reference */ in6p_route_copyout(in6p, &ro); - set_packet_service_class(m, so, sotc, PKT_SCF_IPV6); + set_packet_service_class(m, so, sockcminfo.sotc, PKT_SCF_IPV6); + if (sockcminfo.tx_time) { + mbuf_set_tx_time(m, sockcminfo.tx_time); + } m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB; m->m_pkthdr.pkt_flowid = in6p->inp_flowhash; @@ -631,15 +634,13 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, } if (error == 0 && nstat_collect) { - stats_functional_type ifnet_count_type = stats_functional_type_none; + stats_functional_type ifnet_count_type = stats_functional_type_unclassified; if (in6p->in6p_route.ro_rt != NULL) { ifnet_count_type = IFNET_COUNT_TYPE(in6p->in6p_route. 
ro_rt->rt_ifp); } - INP_ADD_STAT(in6p, ifnet_count_type, txpackets, 1); - INP_ADD_STAT(in6p, ifnet_count_type, txbytes, ulen); - inp_set_activity_bitmap(in6p); + INP_ADD_TXSTAT(in6p, ifnet_count_type, 1, ulen); } if (flowadv && (adv->code == FADV_FLOW_CONTROLLED || @@ -707,7 +708,9 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, * outgoing interface */ if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) { + ifnet_head_lock_shared(); outif = ifindex2ifnet[ip6oa.ip6oa_boundif]; + ifnet_head_done(); } else { outif = rt->rt_ifp; } diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index f67694701..4bc1101e7 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -224,9 +224,7 @@ udp6_append(struct inpcb *last, struct ip6_hdr *ip6, m_adj(n, off); if (nstat_collect) { stats_functional_type ifnet_count_type = IFNET_COUNT_TYPE(ifp); - INP_ADD_STAT(last, ifnet_count_type, rxpackets, 1); - INP_ADD_STAT(last, ifnet_count_type, rxbytes, n->m_pkthdr.len); - inp_set_activity_bitmap(last); + INP_ADD_RXSTAT(last, ifnet_count_type, 1, n->m_pkthdr.len); } so_recv_data_stat(last->in6p_socket, n, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, @@ -250,7 +248,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) struct mbuf *__single opts = NULL; int off = *offp; int plen, ulen, ret = 0; - stats_functional_type ifnet_count_type = stats_functional_type_none; + stats_functional_type ifnet_count_type = stats_functional_type_unclassified; struct sockaddr_in6 udp_in6; struct inpcbinfo *__single pcbinfo = &udbinfo; struct sockaddr_in6 fromsa; @@ -289,7 +287,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) /* destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) { IF_UDP_STATINC(ifp, port0); - drop_reason = DROP_REASON_IP_ILLEGAL_PORT; + drop_reason = DROP_REASON_IP6_ILLEGAL_PORT; goto bad; } @@ -297,7 +295,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) * Checksum extended UDP header and data. 
*/ if (udp6_input_checksum(m, uh, off, ulen)) { - drop_reason = DROP_REASON_IP_BAD_CHECKSUM; + drop_reason = DROP_REASON_IP6_BAD_UDP_CHECKSUM; goto bad; } @@ -546,6 +544,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) if ((m = m_pullup(m, off + sizeof(struct udphdr) + payload_len)) == NULL) { udpstat.udps_hdrops++; + drop_reason = DROP_REASON_UDP_PACKET_SHORTER_THAN_HEADER; goto bad; } /* @@ -560,6 +559,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) /* Check for NAT keepalive packet */ if (payload_len == 1 && *(u_int8_t*) ((caddr_t)uh + sizeof(struct udphdr)) == 0xFF) { + drop_reason = DROP_REASON_UDP_PACKET_SHORTER_THAN_HEADER; goto bad; } else if (payload_len == 4 && *(u_int32_t*)(void *) ((caddr_t)uh + sizeof(struct udphdr)) != 0) { @@ -632,6 +632,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) if (in_pcb_checkstate(in6p, WNT_RELEASE, 1) == WNT_STOPUSING) { udp_unlock(in6p->in6p_socket, 1, 0); IF_UDP_STATINC(ifp, cleanup); + drop_reason = DROP_REASON_UDP_PCB_GARBAGE_COLLECTED; goto bad; } @@ -651,9 +652,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) m_adj(m, off + sizeof(struct udphdr)); if (nstat_collect) { ifnet_count_type = IFNET_COUNT_TYPE(ifp); - INP_ADD_STAT(in6p, ifnet_count_type, rxpackets, 1); - INP_ADD_STAT(in6p, ifnet_count_type, rxbytes, m->m_pkthdr.len); - inp_set_activity_bitmap(in6p); + INP_ADD_RXSTAT(in6p, ifnet_count_type, 1, m->m_pkthdr.len); } so_recv_data_stat(in6p->in6p_socket, m, 0); if (sbappendaddr(&in6p->in6p_socket->so_rcv, diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index bacbe5c98..334b2a57a 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -653,13 +653,12 @@ key_init(struct protosw *pp, struct domain *dp __unused) VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED); - _CASSERT(PFKEY_ALIGN8(sizeof(struct sadb_msg)) <= _MHLEN); - _CASSERT(MAX_REPLAY_WINDOWS == MBUF_TC_MAX); + static_assert(PFKEY_ALIGN8(sizeof(struct sadb_msg)) <= _MHLEN); + static_assert(MAX_REPLAY_WINDOWS == MBUF_TC_MAX); - if (key_initialized) { + if (!os_atomic_cmpxchg(&key_initialized, 0, 1, relaxed)) { return; } - key_initialized = 1; for (i = 0; i < SPIHASHSIZE; i++) { LIST_INIT(&spihash[i]); @@ -2273,7 +2272,7 @@ key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp, if (idx == SADB_EXT_RESERVED) { len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); - MGETHDR(n, M_WAITOK, MT_DATA); // sadb_msg len < MHLEN - enforced by _CASSERT + MGETHDR(n, M_WAITOK, MT_DATA); // sadb_msg len < MHLEN - enforced by static_assert() if (!n) { goto fail; } diff --git a/bsd/nfs/gss/gss_krb5_mech.c b/bsd/nfs/gss/gss_krb5_mech.c index 93620cc02..3c23e8e29 100644 --- a/bsd/nfs/gss/gss_krb5_mech.c +++ b/bsd/nfs/gss/gss_krb5_mech.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -139,7 +140,7 @@ printmbuf(const char *str, mbuf_t mb, uint32_t offset, uint32_t len) } for (i = offset; len && i < mbuf_len(mb); i++) { const char *s = (cout % 8) ? " " : (cout % 16) ? 
" " : "\n"; - printf("%02x%s", ((uint8_t *)mbuf_data(mb))[i], s); + printf("%02x%s", (mtod(mb, uint8_t *))[i], s); len--; cout++; } @@ -387,7 +388,7 @@ gss_join_mbuf(mbuf_t head, mbuf_t body, mbuf_t tail) errno_t gss_prepend_mbuf(mbuf_t *chain, uint8_t *bytes, size_t size) { - uint8_t *data = mbuf_data(*chain); + uint8_t *data = mtod(*chain, uint8_t *); size_t leading = mbuf_leadingspace(*chain); size_t trailing = mbuf_trailingspace(*chain); size_t mlen = mbuf_len(*chain); @@ -402,7 +403,7 @@ gss_prepend_mbuf(mbuf_t *chain, uint8_t *bytes, size_t size) if (error) { return error; } - data = mbuf_data(*chain); + data = mtod(*chain, uint8_t *); memcpy(data, bytes, size); return 0; @@ -458,7 +459,7 @@ mbuf_walk(mbuf_t mbp, size_t offset, size_t len, size_t blocksize, int (*crypto_ /* Move to the start of the chain */ for (mb = mbp; mb && len > 0; mb = mbuf_next(mb)) { - ptr = mbuf_data(mb); + ptr = mtod(mb, uint8_t *); mlen = mbuf_len(mb); if (offset >= mlen) { /* Offset not yet reached */ @@ -530,7 +531,7 @@ mbuf_walk(mbuf_t mbp, size_t offset, size_t len, size_t blocksize, int (*crypto_ return error; } } - nptr = mbuf_data(nmb); + nptr = mtod(nmb, uint8_t *); memcpy(block + residue, nptr, offset); } len -= offset; diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index b27bb9158..30a3f1de5 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -169,7 +169,7 @@ extern int nfs_ticks; * Note that some of these structures come out of their own nfs zones. */ #define NFS_NODEALLOC 1024 -#define NFS_MNTALLOC 1024 +#define NFS_MNTALLOC 2048 #define NFS_SVCALLOC 512 #define NFS_ARGSVERSION_XDR 88 /* NFS mount args are in XDR format */ @@ -771,36 +771,36 @@ typedef struct nfserr_info { * NFS Common Errors */ #define NFSERR_INFO_COMMON \ - { "NFS_OK", NFS_OK, 0 }, \ - { "NFSERR_PERM", NFSERR_PERM, 1 }, \ - { "NFSERR_NOENT", NFSERR_NOENT, 2 }, \ - { "NFSERR_IO", NFSERR_IO, 3 }, \ - { "NFSERR_NXIO", NFSERR_NXIO, 4 }, \ - { "NFSERR_ACCES", NFSERR_ACCES, 5 }, \ - { "NFSERR_EXIST", NFSERR_EXIST, 6 }, \ - { "NFSERR_XDEV", NFSERR_XDEV, 7 }, \ - { "NFSERR_NODEV", NFSERR_NODEV, 8 }, \ - { "NFSERR_NOTDIR", NFSERR_NOTDIR, 9 }, \ - { "NFSERR_ISDIR", NFSERR_ISDIR, 10 }, \ - { "NFSERR_INVAL", NFSERR_INVAL, 11 }, \ - { "NFSERR_FBIG", NFSERR_FBIG, 12 }, \ - { "NFSERR_NOSPC", NFSERR_NOSPC, 13 }, \ - { "NFSERR_ROFS", NFSERR_ROFS, 14 }, \ - { "NFSERR_MLINK", NFSERR_MLINK, 15 }, \ - { "NFSERR_NAMETOL", NFSERR_NAMETOL, 16 }, \ - { "NFSERR_NOTEMPTY", NFSERR_NOTEMPTY, 17 }, \ - { "NFSERR_DQUOT", NFSERR_DQUOT, 18 }, \ - { "NFSERR_STALE", NFSERR_STALE, 19 }, \ - { "NFSERR_REMOTE", NFSERR_REMOTE, 20 }, \ - { "NFSERR_WFLUSH", NFSERR_WFLUSH, 21 }, \ - { "NFSERR_BADHANDLE", NFSERR_BADHANDLE, 22 }, \ - { "NFSERR_NOT_SYNC", NFSERR_NOT_SYNC, 23 }, \ - { "NFSERR_BAD_COOKIE", NFSERR_BAD_COOKIE, 24 }, \ - { "NFSERR_NOTSUPP", NFSERR_NOTSUPP, 25 }, \ - { "NFSERR_TOOSMALL", NFSERR_TOOSMALL, 26 }, \ - { "NFSERR_SERVERFAULT", NFSERR_SERVERFAULT, 27 }, \ - { "NFSERR_BADTYPE", NFSERR_BADTYPE, 28 }, \ - { "NFSERR_DELAY", NFSERR_DELAY, 29 } + { "NFS_OK", NFS_OK, 0 }, \ + { "ERR_PERM", NFSERR_PERM, 1 }, \ + { "ERR_NOENT", NFSERR_NOENT, 2 }, \ + { "ERR_IO", NFSERR_IO, 3 }, \ + { "ERR_NXIO", NFSERR_NXIO, 4 }, \ + { "ERR_ACCES", NFSERR_ACCES, 5 }, \ + { "ERR_EXIST", NFSERR_EXIST, 6 }, \ + { "ERR_XDEV", NFSERR_XDEV, 7 }, \ + { "ERR_NODEV", NFSERR_NODEV, 8 }, \ + { "ERR_NOTDIR", NFSERR_NOTDIR, 9 }, \ + { "ERR_ISDIR", NFSERR_ISDIR, 10 }, \ + { "ERR_INVAL", NFSERR_INVAL, 11 }, \ + { "ERR_FBIG", NFSERR_FBIG, 12 }, \ + { "ERR_NOSPC", NFSERR_NOSPC, 13 }, \ + { 
"ERR_ROFS", NFSERR_ROFS, 14 }, \ + { "ERR_MLINK", NFSERR_MLINK, 15 }, \ + { "ERR_NAMETOL", NFSERR_NAMETOL, 16 }, \ + { "ERR_NOTEMPTY", NFSERR_NOTEMPTY, 17 }, \ + { "ERR_DQUOT", NFSERR_DQUOT, 18 }, \ + { "ERR_STALE", NFSERR_STALE, 19 }, \ + { "ERR_REMOTE", NFSERR_REMOTE, 20 }, \ + { "ERR_WFLUSH", NFSERR_WFLUSH, 21 }, \ + { "ERR_BADHANDLE", NFSERR_BADHANDLE, 22 }, \ + { "ERR_NOT_SYNC", NFSERR_NOT_SYNC, 23 }, \ + { "ERR_BAD_COOKIE", NFSERR_BAD_COOKIE, 24 }, \ + { "ERR_NOTSUPP", NFSERR_NOTSUPP, 25 }, \ + { "ERR_TOOSMALL", NFSERR_TOOSMALL, 26 }, \ + { "ERR_SERVERFAULT", NFSERR_SERVERFAULT, 27 }, \ + { "ERR_BADTYPE", NFSERR_BADTYPE, 28 }, \ + { "ERR_DELAY", NFSERR_DELAY, 29 } #define NFSERR_INFO_COMMON_SIZE 30 @@ -808,48 +808,89 @@ typedef struct nfserr_info { * NFSv4 Errors */ #define NFSERR_INFO_V4 \ - { "ERR_SAME", NFSERR_SAME, 0 }, \ - { "ERR_DENIED", NFSERR_DENIED, 1 }, \ - { "ERR_EXPIRED", NFSERR_EXPIRED, 2 }, \ - { "ERR_LOCKED", NFSERR_LOCKED, 3 }, \ - { "ERR_GRACE", NFSERR_GRACE, 4 }, \ - { "ERR_FHEXPIRED", NFSERR_FHEXPIRED, 5 }, \ - { "ERR_SHARE_DENIED", NFSERR_SHARE_DENIED, 6 }, \ - { "ERR_WRONGSEC", NFSERR_WRONGSEC, 7 }, \ - { "ERR_CLID_INUSE", NFSERR_CLID_INUSE, 8 }, \ - { "ERR_RESOURCE", NFSERR_RESOURCE, 9 }, \ - { "ERR_MOVED", NFSERR_MOVED, 10 }, \ - { "ERR_NOFILEHANDLE", NFSERR_NOFILEHANDLE, 11 }, \ - { "ERR_MINOR_VERS_MISMATCH", NFSERR_MINOR_VERS_MISMATCH, 12 }, \ - { "ERR_STALE_CLIENTID", NFSERR_STALE_CLIENTID, 13 }, \ - { "ERR_STALE_STATEID", NFSERR_STALE_STATEID, 14 }, \ - { "ERR_OLD_STATEID", NFSERR_OLD_STATEID, 15 }, \ - { "ERR_BAD_STATEID", NFSERR_BAD_STATEID, 16 }, \ - { "ERR_BAD_SEQID", NFSERR_BAD_SEQID, 17 }, \ - { "ERR_NOT_SAME", NFSERR_NOT_SAME, 18 }, \ - { "ERR_LOCK_RANGE", NFSERR_LOCK_RANGE, 19 }, \ - { "ERR_SYMLINK", NFSERR_SYMLINK, 20 }, \ - { "ERR_RESTOREFH", NFSERR_RESTOREFH, 21 }, \ - { "ERR_LEASE_MOVED", NFSERR_LEASE_MOVED, 22 }, \ - { "ERR_ATTRNOTSUPP", NFSERR_ATTRNOTSUPP, 23 }, \ - { "ERR_NO_GRACE", NFSERR_NO_GRACE, 24 }, \ - { "ERR_RECLAIM_BAD", NFSERR_RECLAIM_BAD, 25 }, \ - { "ERR_RECLAIM_CONFLICT", NFSERR_RECLAIM_CONFLICT, 26 }, \ - { "ERR_BADXDR", NFSERR_BADXDR, 27 }, \ - { "ERR_LOCKS_HELD", NFSERR_LOCKS_HELD, 28 }, \ - { "ERR_OPENMODE", NFSERR_OPENMODE, 29 }, \ - { "ERR_BADOWNER", NFSERR_BADOWNER, 30 }, \ - { "ERR_BADCHAR", NFSERR_BADCHAR, 31 }, \ - { "ERR_BADNAME", NFSERR_BADNAME, 32 }, \ - { "ERR_BAD_RANGE", NFSERR_BAD_RANGE, 33 }, \ - { "ERR_LOCK_NOTSUPP", NFSERR_LOCK_NOTSUPP, 34 }, \ - { "ERR_OP_ILLEGAL", NFSERR_OP_ILLEGAL, 35 }, \ - { "ERR_DEADLOCK", NFSERR_DEADLOCK, 36 }, \ - { "ERR_FILE_OPEN", NFSERR_FILE_OPEN, 37 }, \ - { "ERR_ADMIN_REVOKED", NFSERR_ADMIN_REVOKED, 38 }, \ - { "ERR_CB_PATH_DOWN", NFSERR_CB_PATH_DOWN, 39 } + /* NFSv4 Errors */ \ + { "ERR_SAME", NFSERR_SAME, 0 }, \ + { "ERR_DENIED", NFSERR_DENIED, 1 }, \ + { "ERR_EXPIRED", NFSERR_EXPIRED, 2 }, \ + { "ERR_LOCKED", NFSERR_LOCKED, 3 }, \ + { "ERR_GRACE", NFSERR_GRACE, 4 }, \ + { "ERR_FHEXPIRED", NFSERR_FHEXPIRED, 5 }, \ + { "ERR_SHARE_DENIED", NFSERR_SHARE_DENIED, 6 }, \ + { "ERR_WRONGSEC", NFSERR_WRONGSEC, 7 }, \ + { "ERR_CLID_INUSE", NFSERR_CLID_INUSE, 8 }, \ + { "ERR_RESOURCE", NFSERR_RESOURCE, 9 }, \ + { "ERR_MOVED", NFSERR_MOVED, 10 }, \ + { "ERR_NOFILEHANDLE", NFSERR_NOFILEHANDLE, 11 }, \ + { "ERR_MINOR_VERS_MISMATCH", NFSERR_MINOR_VERS_MISMATCH, 12 }, \ + { "ERR_STALE_CLIENTID", NFSERR_STALE_CLIENTID, 13 }, \ + { "ERR_STALE_STATEID", NFSERR_STALE_STATEID, 14 }, \ + { "ERR_OLD_STATEID", NFSERR_OLD_STATEID, 15 }, \ + { "ERR_BAD_STATEID", NFSERR_BAD_STATEID, 16 }, \ + { 
"ERR_BAD_SEQID", NFSERR_BAD_SEQID, 17 }, \ + { "ERR_NOT_SAME", NFSERR_NOT_SAME, 18 }, \ + { "ERR_LOCK_RANGE", NFSERR_LOCK_RANGE, 19 }, \ + { "ERR_SYMLINK", NFSERR_SYMLINK, 20 }, \ + { "ERR_RESTOREFH", NFSERR_RESTOREFH, 21 }, \ + { "ERR_LEASE_MOVED", NFSERR_LEASE_MOVED, 22 }, \ + { "ERR_ATTRNOTSUPP", NFSERR_ATTRNOTSUPP, 23 }, \ + { "ERR_NO_GRACE", NFSERR_NO_GRACE, 24 }, \ + { "ERR_RECLAIM_BAD", NFSERR_RECLAIM_BAD, 25 }, \ + { "ERR_RECLAIM_CONFLICT", NFSERR_RECLAIM_CONFLICT, 26 }, \ + { "ERR_BADXDR", NFSERR_BADXDR, 27 }, \ + { "ERR_LOCKS_HELD", NFSERR_LOCKS_HELD, 28 }, \ + { "ERR_OPENMODE", NFSERR_OPENMODE, 29 }, \ + { "ERR_BADOWNER", NFSERR_BADOWNER, 30 }, \ + { "ERR_BADCHAR", NFSERR_BADCHAR, 31 }, \ + { "ERR_BADNAME", NFSERR_BADNAME, 32 }, \ + { "ERR_BAD_RANGE", NFSERR_BAD_RANGE, 33 }, \ + { "ERR_LOCK_NOTSUPP", NFSERR_LOCK_NOTSUPP, 34 }, \ + { "ERR_OP_ILLEGAL", NFSERR_OP_ILLEGAL, 35 }, \ + { "ERR_DEADLOCK", NFSERR_DEADLOCK, 36 }, \ + { "ERR_FILE_OPEN", NFSERR_FILE_OPEN, 37 }, \ + { "ERR_ADMIN_REVOKED", NFSERR_ADMIN_REVOKED, 38 }, \ + { "ERR_CB_PATH_DOWN", NFSERR_CB_PATH_DOWN, 39 } , \ + /* NFSv4.1 Errors */ \ + { "ERR_BADIOMODE", NFSERR_BADIOMODE, 40} , \ + { "ERR_BADLAYOUT", NFSERR_BADLAYOUT, 41 } , \ + { "ERR_BADSESSIONDIGEST", NFSERR_BADSESSIONDIGEST, 42 } , \ + { "ERR_BADSESSION", NFSERR_BADSESSION, 43 } , \ + { "ERR_BADSLOT", NFSERR_BADSLOT, 44 } , \ + { "ERR_COMPLETEALREADY", NFSERR_COMPLETEALREADY, 45 } , \ + { "ERR_NOTBNDTOSESS", NFSERR_NOTBNDTOSESS, 46 } , \ + { "ERR_DELEGALREADYWANT", NFSERR_DELEGALREADYWANT, 47 } , \ + { "ERR_BACKCHANBUSY", NFSERR_BACKCHANBUSY, 48 } , \ + { "ERR_LAYOUTTRYLATER", NFSERR_LAYOUTTRYLATER, 49 } , \ + { "ERR_LAYOUTUNAVAIL", NFSERR_LAYOUTUNAVAIL, 50 } , \ + { "ERR_NOMATCHLAYOUT", NFSERR_NOMATCHLAYOUT, 51 } , \ + { "ERR_RECALLCONFLICT", NFSERR_RECALLCONFLICT, 52 } , \ + { "ERR_UNKNLAYOUTTYPE", NFSERR_UNKNLAYOUTTYPE, 53 } , \ + { "ERR_SEQMISORDERED", NFSERR_SEQMISORDERED, 54 } , \ + { "ERR_SEQUENCEPOS", NFSERR_SEQUENCEPOS, 55 } , \ + { "ERR_REQTOOBIG", NFSERR_REQTOOBIG, 56 } , \ + { "ERR_REPTOOBIG", NFSERR_REPTOOBIG, 57 } , \ + { "ERR_REPTOOBIGTOCACHE", NFSERR_REPTOOBIGTOCACHE, 58 } , \ + { "ERR_RETRYUNCACHEDREP", NFSERR_RETRYUNCACHEDREP, 59 } , \ + { "ERR_UNSAFECOMPOUND", NFSERR_UNSAFECOMPOUND, 60 } , \ + { "ERR_TOOMANYOPS", NFSERR_TOOMANYOPS, 61 } , \ + { "ERR_OPNOTINSESS", NFSERR_OPNOTINSESS, 62 } , \ + { "ERR_HASHALGUNSUPP", NFSERR_HASHALGUNSUPP, 63 } , \ + { "ERR_CLIENTIDBUSY", NFSERR_CLIENTIDBUSY, 64 } , \ + { "ERR_PNFSIOHOLE", NFSERR_PNFSIOHOLE, 65 } , \ + { "ERR_SEQFALSERETRY", NFSERR_SEQFALSERETRY, 66 } , \ + { "ERR_BADHIGHSLOT", NFSERR_BADHIGHSLOT, 67 } , \ + { "ERR_DEADSESSION", NFSERR_DEADSESSION, 68 } , \ + { "ERR_ENCRALGUNSUPP", NFSERR_ENCRALGUNSUPP, 69 } , \ + { "ERR_PNFSNOLAYOUT", NFSERR_PNFSNOLAYOUT, 70 } , \ + { "ERR_NOTONLYOP", NFSERR_NOTONLYOP, 71 } , \ + { "ERR_WRONGCRED", NFSERR_WRONGCRED, 72 } , \ + { "ERR_WRONGTYPE", NFSERR_WRONGTYPE, 73 } , \ + { "ERR_DIRDELEGUNAVAIL", NFSERR_DIRDELEGUNAVAIL, 74 } , \ + { "ERR_REJECTDELEG", NFSERR_REJECTDELEG, 75 } , \ + { "ERR_RETURNCONFLICT", NFSERR_RETURNCONFLICT, 76 } , \ + { "ERR_DELEGREVOKED", NFSERR_DELEGREVOKED, 77 } -#define NFSERR_INFO_V4_SIZE 40 +#define NFSERR_INFO_V4_SIZE 40 +#define NFSERR_INFO_V41_SIZE 78 /* * XXX to allow amd to include nfs.h without nfsproto.h @@ -883,8 +924,8 @@ struct nfsclntstats { uint64_t nlm_test; uint64_t nlm_unlock; } nlmcnt; // NFSv3 only - uint64_t opcntv4[NFS_OP_COUNT]; - uint64_t cbopcntv4[NFS_OP_CB_COUNT]; + uint64_t opcntv4[NFS_V41_OP_COUNT]; + 
uint64_t cbopcntv4[NFS_V41_OP_CB_COUNT]; uint64_t rpcretries; uint64_t rpcrequests; uint64_t rpctimeouts; @@ -894,7 +935,7 @@ struct nfsclntstats { uint64_t pageouts; struct { uint64_t errs_common[NFSERR_INFO_COMMON_SIZE]; - uint64_t errs_v4[NFSERR_INFO_V4_SIZE]; + uint64_t errs_v4[NFSERR_INFO_V41_SIZE]; uint64_t errs_unknown; } nfs_errs; }; diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index 7bebbfc79..3acf8e233 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -78,6 +78,8 @@ #include #include +#include +#include #include #include @@ -144,7 +146,7 @@ rpc_gss_prepend_32(mbuf_t *mb, uint32_t value) uint32_t *data; #if 0 - data = mbuf_data(*mb); + data = mtod(*mb, uint32_t *); /* * If a wap token comes back and is not aligned * get a new buffer (which should be aligned) to put the @@ -166,7 +168,7 @@ rpc_gss_prepend_32(mbuf_t *mb, uint32_t value) return error; } - data = mbuf_data(*mb); + data = mtod(*mb, uint32_t *); *data = txdr_unsigned(value); return 0; @@ -193,7 +195,7 @@ rpc_gss_data_create(mbuf_t *mbp_head, uint32_t seqnum) if (error) { return error; } - data = mbuf_data(mb); + data = mtod(mb, uint8_t *); #if 0 /* Reserve space for prepending */ len = mbuf_maxlen(mb); @@ -201,7 +203,7 @@ rpc_gss_data_create(mbuf_t *mbp_head, uint32_t seqnum) printf("%s: data = %p, len = %d\n", __func__, data, (int)len); error = mbuf_setdata(mb, data + len, 0); if (error || mbuf_trailingspace(mb)) { - printf("%s: data = %p trailingspace = %d error = %d\n", __func__, mbuf_data(mb), (int)mbuf_trailingspace(mb), error); + printf("%s: data = %p trailingspace = %d error = %d\n", __func__, mtod(mb, caddr_t), (int)mbuf_trailingspace(mb), error); } #endif /* Reserve 16 words for prepending */ @@ -744,7 +746,7 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) } /* Get the wrap token (current mbuf in the chain starting at the current offset) */ - start = nmc->nmc_ptr - (caddr_t)mbuf_data(nmc->nmc_mcur); + start = nmc->nmc_ptr - mtod(nmc->nmc_mcur, caddr_t); /* split out the wrap token */ argsize = arglen; @@ -770,7 +772,7 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) /* Now replace the wrapped arguments with the unwrapped ones */ mbuf_setnext(prev_mbuf, reply_mbuf); nmc->nmc_mcur = reply_mbuf; - nmc->nmc_ptr = mbuf_data(reply_mbuf); + nmc->nmc_ptr = mtod(reply_mbuf, caddr_t); nmc->nmc_left = mbuf_len(reply_mbuf); /* @@ -1495,7 +1497,7 @@ nfs_gss_append_chain(struct nfsm_chain *nmc, mbuf_t mc) } nmc->nmc_mcur = tail; - nmc->nmc_ptr = (caddr_t) mbuf_data(tail) + mbuf_len(tail); + nmc->nmc_ptr = mtod(tail, caddr_t) + mbuf_len(tail); nmc->nmc_left = mbuf_trailingspace(tail); return 0; @@ -1517,7 +1519,7 @@ nfs_gss_nfsm_chain(struct nfsm_chain *nmc, mbuf_t mc) nmc->nmc_mhead = mc; nmc->nmc_mcur = tail; - nmc->nmc_ptr = (caddr_t) mbuf_data(tail) + mbuf_len(tail); + nmc->nmc_ptr = mtod(tail, caddr_t) + mbuf_len(tail); nmc->nmc_left = mbuf_trailingspace(tail); nmc->nmc_flags = 0; } diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index 5c34a1d12..c718980bd 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include @@ -812,7 +813,7 @@ nfsrv_readlink( nfsmerr_if(error); for (mp = mpath; mp; mp = mbuf_next(mp)) { - uio_addiov(auio, CAST_USER_ADDR_T((caddr_t)mbuf_data(mp)), mbuf_len(mp)); + uio_addiov(auio, CAST_USER_ADDR_T(mtod(mp, caddr_t)), mbuf_len(mp)); } error = nfsrv_fhtovp(&nfh, nd, &vp, &nx, &nxo); @@ -1018,7 +1019,7 @@ nfsrv_read( goto errorexit; } for (m = mread; 
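The nfs.h changes above keep per-error counters whose slot is carried in each { name, error, index } table entry, and widen the v4 arrays to the NFSv4.1 sizes. A self-contained sketch of that bookkeeping pattern (the structure and names here are illustrative, not the kernel's):

    #include <stdint.h>
    #include <stddef.h>

    struct err_info {
        const char *name;
        int         code;   /* wire error number */
        unsigned    index;  /* slot in the counter array */
    };

    #define ERR_TABLE_SIZE 3

    static const struct err_info err_table[ERR_TABLE_SIZE] = {
        { "NFS_OK",   0, 0 },
        { "ERR_PERM", 1, 1 },
        { "ERR_NOENT", 2, 2 },
    };

    struct err_stats {
        uint64_t counts[ERR_TABLE_SIZE];
        uint64_t unknown;
    };

    /* Bump the counter slot that matches a wire error, or the catch-all. */
    static void
    count_error(struct err_stats *st, int code)
    {
        for (size_t i = 0; i < ERR_TABLE_SIZE; i++) {
            if (err_table[i].code == code) {
                st->counts[err_table[i].index]++;
                return;
            }
        }
        st->unknown++;
    }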
m; m = mbuf_next(m)) { - uio_addiov(auio, CAST_USER_ADDR_T((caddr_t)mbuf_data(m)), mbuf_len(m)); + uio_addiov(auio, CAST_USER_ADDR_T(mtod(m, caddr_t)), mbuf_len(m)); } error = VNOP_READ(vp, auio, IO_NODELOCKED, ctx); } else { @@ -1419,7 +1420,7 @@ nfsrv_write( nfsmerr_if(error); for (m = nmreq->nmc_mcur; m; m = mbuf_next(m)) { if ((mlen = (int)mbuf_len(m)) > 0) { - uio_addiov(auio, CAST_USER_ADDR_T((caddr_t)mbuf_data(m)), mlen); + uio_addiov(auio, CAST_USER_ADDR_T(mtod(m, caddr_t)), mlen); } } /* @@ -1728,7 +1729,7 @@ loop1: if (!error) { for (m = nmreq->nmc_mhead; m; m = mbuf_next(m)) { if ((tlen = mbuf_len(m)) > 0) { - uio_addiov(auio, CAST_USER_ADDR_T((caddr_t)mbuf_data(m)), tlen); + uio_addiov(auio, CAST_USER_ADDR_T(mtod(m, caddr_t)), tlen); } } error = VNOP_WRITE(vp, auio, ioflags, ctx); @@ -3409,6 +3410,7 @@ nfsrv_link( struct nfs_export *nx; struct nfs_export_options *nxo; struct nfsm_chain *nmreq, nmrep; + const char *vname = NULL; error = 0; dpreattrerr = dpostattrerr = attrerr = ENOENT; @@ -3444,7 +3446,8 @@ nfsrv_link( goto out; } - NDINIT(&ni, CREATE, OP_LINK, LOCKPARENT, UIO_SYSSPACE, CAST_USER_ADDR_T(vnode_getname(vp)), ctx); + vname = vnode_getname(vp); + NDINIT(&ni, CREATE, OP_LINK, LOCKPARENT, UIO_SYSSPACE, CAST_USER_ADDR_T(vname), ctx); error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { error = nfsrv_namei(nd, ctx, &ni, &dnfh, &dirp, &nx, &nxo); @@ -3531,6 +3534,9 @@ out: vnode_put(dirp); dirp = NULL; } + if (vname) { + vnode_putname(vname); + } vnode_put(vp); vp = NULL; diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index f034f4007..79f90ac66 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -201,7 +201,7 @@ nfsrv_rephead( } if (siz < nfs_mbuf_minclsize) { /* leave space for lower level headers */ - tl = mbuf_data(mrep); + tl = mtod(mrep, u_int32_t *); tl += 80 / sizeof(*tl); /* XXX max_hdr? XXX */ mbuf_setdata(mrep, tl, 6 * NFSX_UNSIGNED); } @@ -297,7 +297,7 @@ nfsrv_send(struct nfsrv_sock *slp, mbuf_t nam, mbuf_t top) bzero(&msg, sizeof(msg)); if (nam && !sock_isconnected(so) && (slp->ns_sotype != SOCK_STREAM)) { - if ((sendnam = mbuf_data(nam))) { + if ((sendnam = SA(mtod(nam, caddr_t)))) { msg.msg_name = (caddr_t)sendnam; msg.msg_namelen = sendnam->sa_len; } @@ -441,7 +441,7 @@ nfsrv_rcv_locked(socket_t so, struct nfsrv_sock *slp, int waitflag) if (mp) { if (msg.msg_name && (mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &mhck) == 0)) { mbuf_setlen(mhck, nam.ss_len); - bcopy(&nam, mbuf_data(mhck), nam.ss_len); + bcopy(&nam, mtod(mhck, caddr_t), nam.ss_len); m = mhck; if (mbuf_setnext(m, mp)) { /* trouble... 
just drop it */ @@ -518,7 +518,7 @@ nfsrv_getstream(struct nfsrv_sock *slp, int waitflag) return 0; } m = slp->ns_raw; - mdata = mbuf_data(m); + mdata = mtod(m, caddr_t); mlen = mbuf_len(m); if (mlen >= NFSX_UNSIGNED) { bcopy(mdata, (caddr_t)&recmark, NFSX_UNSIGNED); @@ -531,7 +531,7 @@ nfsrv_getstream(struct nfsrv_sock *slp, int waitflag) while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) { while (mlen == 0) { m = mbuf_next(m); - cp2 = mbuf_data(m); + cp2 = mtod(m, caddr_t); mlen = mbuf_len(m); } *cp1++ = *cp2++; @@ -576,7 +576,7 @@ nfsrv_getstream(struct nfsrv_sock *slp, int waitflag) len = 0; m = slp->ns_raw; mlen = mbuf_len(m); - mdata = mbuf_data(m); + mdata = mtod(m, caddr_t); om = NULL; while (len < slp->ns_reclen) { if ((len + mlen) > slp->ns_reclen) { @@ -611,13 +611,13 @@ nfsrv_getstream(struct nfsrv_sock *slp, int waitflag) return EWOULDBLOCK; } mlen = mbuf_len(m); - mdata = mbuf_data(m); + mdata = mtod(m, caddr_t); } else { om = m; len += mlen; m = mbuf_next(m); mlen = mbuf_len(m); - mdata = mbuf_data(m); + mdata = mtod(m, caddr_t); } } slp->ns_raw = m; diff --git a/bsd/nfs/nfs_srvcache.c b/bsd/nfs/nfs_srvcache.c index 403698ada..9e793593e 100644 --- a/bsd/nfs/nfs_srvcache.c +++ b/bsd/nfs/nfs_srvcache.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -190,14 +191,14 @@ netaddr_match( switch (family) { case AF_INET: - inetaddr = mbuf_data(nam); + inetaddr = SIN(mtod(nam, caddr_t)); if ((inetaddr->sin_family == AF_INET) && (inetaddr->sin_addr.s_addr == haddr->had_inetaddr)) { return 1; } break; case AF_INET6: - inet6addr = mbuf_data(nam); + inet6addr = SIN6(mtod(nam, caddr_t)); if ((inet6addr->sin6_family == AF_INET6) && !bcmp(&inet6addr->sin6_addr, &haddr->had_inet6addr, sizeof(inet6addr->sin6_addr))) { return 1; @@ -334,7 +335,7 @@ loop: TAILQ_INSERT_TAIL(&nfsrv_reqcache_lruhead, rp, rc_lru); rp->rc_state = RC_INPROG; rp->rc_xid = nd->nd_retxid; - saddr = mbuf_data(nd->nd_nam); + saddr = SA(mtod(nd->nd_nam, caddr_t)); rp->rc_family = saddr->sa_family; switch (saddr->sa_family) { case AF_INET: diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index ced4254a6..f789044ea 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -152,7 +153,7 @@ nfs_dump_mbuf(const char *func, int lineno, const char *msg, mbuf_t mb) printf("%s:%d %s\n", func, lineno, msg); for (m = mb; m; m = mbuf_next(m)) { - hexdump(mbuf_data(m), mbuf_len(m)); + hexdump(mtod(m, void *), mbuf_len(m)); } } @@ -411,7 +412,7 @@ nfsm_chain_new_mbuf(struct nfsm_chain *nmc, size_t sizehint) /* do we have a current mbuf? 
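nfsrv_getstream() above parses the RPC-over-TCP record mark: a 4-byte big-endian word whose top bit flags the last fragment and whose low 31 bits give the fragment length (the send side builds it with htonl(0x80000000 | siz) in a later hunk). A small standalone sketch of encoding and decoding that mark, per RFC 5531 record marking (the helper names are made up):

    #include <stdint.h>
    #include <stdbool.h>
    #include <arpa/inet.h>   /* htonl / ntohl */

    #define RPC_LAST_FRAG 0x80000000u

    /* Build the 4-byte record mark that precedes an RPC fragment on TCP. */
    static uint32_t
    rpc_record_mark(uint32_t frag_len, bool last)
    {
        return htonl((last ? RPC_LAST_FRAG : 0) | (frag_len & 0x7fffffffu));
    }

    /* Split a received (network-order) record mark into its two fields. */
    static void
    rpc_parse_record_mark(uint32_t wire, uint32_t *frag_len, bool *last)
    {
        uint32_t host = ntohl(wire);

        *last = (host & RPC_LAST_FRAG) != 0;
        *frag_len = host & 0x7fffffffu;
    }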
*/ if (nmc->nmc_mcur) { /* first cap off current mbuf */ - mbuf_setlen(nmc->nmc_mcur, nmc->nmc_ptr - (caddr_t)mbuf_data(nmc->nmc_mcur)); + mbuf_setlen(nmc->nmc_mcur, nmc->nmc_ptr - mtod(nmc->nmc_mcur, caddr_t)); /* then append the new mbuf */ error = mbuf_setnext(nmc->nmc_mcur, mb); if (error) { @@ -422,7 +423,7 @@ nfsm_chain_new_mbuf(struct nfsm_chain *nmc, size_t sizehint) /* set up for using the new mbuf */ nmc->nmc_mcur = mb; - nmc->nmc_ptr = mbuf_data(mb); + nmc->nmc_ptr = mtod(mb, caddr_t); nmc->nmc_left = mbuf_trailingspace(mb); return 0; @@ -511,7 +512,7 @@ nfsm_chain_offset(struct nfsm_chain *nmc) for (mb = nmc->nmc_mhead; mb; mb = mbuf_next(mb)) { if (mb == nmc->nmc_mcur) { - return len + (nmc->nmc_ptr - (caddr_t) mbuf_data(mb)); + return len + (nmc->nmc_ptr - mtod(mb, caddr_t)); } len += mbuf_len(mb); } @@ -540,7 +541,7 @@ nfsm_chain_advance(struct nfsm_chain *nmc, size_t len) if (!mb) { return EBADRPC; } - nmc->nmc_ptr = mbuf_data(mb); + nmc->nmc_ptr = mtod(mb, caddr_t); nmc->nmc_left = mbuf_len(mb); } @@ -560,7 +561,7 @@ nfsm_chain_reverse(struct nfsm_chain *nmc, size_t len) size_t mlen, new_offset; int error = 0; - mlen = nmc->nmc_ptr - (caddr_t) mbuf_data(nmc->nmc_mcur); + mlen = nmc->nmc_ptr - mtod(nmc->nmc_mcur, caddr_t); if (len <= mlen) { nmc->nmc_ptr -= len; nmc->nmc_left += len; @@ -603,7 +604,7 @@ nfsm_chain_get_opaque_pointer_f(struct nfsm_chain *nmc, uint32_t len, u_char **p if (!mb) { break; } - nmc->nmc_ptr = mbuf_data(mb); + nmc->nmc_ptr = mtod(mb, caddr_t); nmc->nmc_left = mbuf_len(mb); } /* check if we've run out of data */ @@ -642,7 +643,7 @@ nfsm_chain_get_opaque_pointer_f(struct nfsm_chain *nmc, uint32_t len, u_char **p } /* the returned pointer will be the new mbuf's data pointer */ - *pptr = ptr = mbuf_data(mb); + *pptr = ptr = mtod(mb, u_char *); /* copy "left" bytes to the new mbuf */ bcopy(nmc->nmc_ptr, ptr, left); @@ -696,7 +697,7 @@ nfsm_chain_get_opaque_pointer_f(struct nfsm_chain *nmc, uint32_t len, u_char **p while (need && mb) { /* copy as much as we need/can */ - ptr = mbuf_data(mb); + ptr = mtod(mb, u_char *); mblen = mbuf_len(mb); cplen = MIN(mblen, need); if (cplen) { @@ -733,7 +734,7 @@ nfsm_chain_get_opaque_pointer_f(struct nfsm_chain *nmc, uint32_t len, u_char **p * just set nmc to point at whatever remains in that mbuf. */ nmc->nmc_mcur = mb; - nmc->nmc_ptr = mbuf_data(mb); + nmc->nmc_ptr = mtod(mb, caddr_t); nmc->nmc_left = mbuf_len(mb); /* move past any padding */ @@ -773,7 +774,7 @@ nfsm_chain_get_opaque_f(struct nfsm_chain *nmc, size_t len, u_char *buf) if (len) { mbuf_t mb = mbuf_next(nmc->nmc_mcur); nmc->nmc_mcur = mb; - nmc->nmc_ptr = mb ? mbuf_data(mb) : NULL; + nmc->nmc_ptr = mb ? mtod(mb, caddr_t) : NULL; nmc->nmc_left = mb ? mbuf_len(mb) : 0; } } @@ -822,7 +823,7 @@ nfsm_chain_get_uio(struct nfsm_chain *nmc, size_t len, uio_t uio) if (len) { mbuf_t mb = mbuf_next(nmc->nmc_mcur); nmc->nmc_mcur = mb; - nmc->nmc_ptr = mb ? mbuf_data(mb) : NULL; + nmc->nmc_ptr = mb ? mtod(mb, caddr_t) : NULL; nmc->nmc_left = mb ? 
mbuf_len(mb) : 0; } } @@ -1054,7 +1055,7 @@ nfsm_adj(mbuf_t mp, int len, int nul) mlen -= len; mbuf_setlen(m, mlen); if (nul > 0) { - cp = (caddr_t)mbuf_data(m) + mlen - nul; + cp = mtod(m, caddr_t) + mlen - nul; for (i = 0; i < nul; i++) { *cp++ = '\0'; } @@ -1076,7 +1077,7 @@ nfsm_adj(mbuf_t mp, int len, int nul) mlen = count; mbuf_setlen(m, count); if (nul > 0) { - cp = (caddr_t)mbuf_data(m) + mlen - nul; + cp = mtod(m, caddr_t) + mlen - nul; for (i = 0; i < nul; i++) { *cp++ = '\0'; } @@ -1115,7 +1116,7 @@ nfsm_chain_trim_data(struct nfsm_chain *nmc, int len, int *mlen) } /* trim current mbuf */ - data = mbuf_data(m); + data = mtod(m, caddr_t); dlen = mbuf_len(m); adjust = nmc->nmc_ptr - data; dlen -= adjust; @@ -2179,7 +2180,7 @@ nfsrv_export_lookup(struct nfs_export *nx, mbuf_t nam) /* Lookup in the export list first. */ if (nam != NULL) { - saddr = mbuf_data(nam); + saddr = SA(mtod(nam, caddr_t)); if (saddr->sa_family > AF_MAX) { /* Bogus sockaddr? Don't match anything. */ return NULL; @@ -2610,7 +2611,7 @@ nfsrv_update_user_stat(struct nfs_export *nx, struct nfsrv_descript *nd, uid_t u return; } - saddr = (struct sockaddr *)mbuf_data(nd->nd_nam); + saddr = SA(mtod(nd->nd_nam, caddr_t)); /* check address family before going any further */ if ((saddr->sa_family != AF_INET) && (saddr->sa_family != AF_INET6)) { diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c index 0a6510df7..817726679 100644 --- a/bsd/nfs/nfs_syscalls.c +++ b/bsd/nfs/nfs_syscalls.c @@ -75,6 +75,8 @@ */ #include +#include +#include #include #include #include @@ -1076,12 +1078,12 @@ nfssvc_nfsd(void) if (nfsrv_require_resv_port) { /* Check if source port is a reserved port */ in_port_t port = 0; - struct sockaddr *saddr = mbuf_data(nd->nd_nam); + struct sockaddr *saddr = mtod(nd->nd_nam, struct sockaddr*); if (saddr->sa_family == AF_INET) { - port = ntohs(((struct sockaddr_in*)saddr)->sin_port); + port = ntohs((SIN(saddr))->sin_port); } else if (saddr->sa_family == AF_INET6) { - port = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + port = ntohs((SIN6(saddr))->sin6_port); } if ((port >= IPPORT_RESERVED) && (nd->nd_procnum != NFSPROC_NULL)) { nd->nd_procnum = NFSPROC_NOOP; @@ -1177,7 +1179,7 @@ nfssvc_nfsd(void) if (slp->ns_sotype == SOCK_STREAM) { error = mbuf_prepend(&m, NFSX_UNSIGNED, MBUF_WAITOK); if (!error) { - *(u_int32_t*)mbuf_data(m) = htonl(0x80000000 | siz); + *mtod(m, u_int32_t *) = htonl(0x80000000 | siz); } } if (!error) { diff --git a/bsd/nfs/nfsm_subs.h b/bsd/nfs/nfsm_subs.h index 3f886a163..50a2cfd88 100644 --- a/bsd/nfs/nfsm_subs.h +++ b/bsd/nfs/nfsm_subs.h @@ -74,6 +74,8 @@ #ifdef __APPLE_API_PRIVATE #include +#include +#include int nfsm_chain_new_mbuf(struct nfsm_chain *, size_t); int nfsm_chain_add_opaque_f(struct nfsm_chain *, const u_char *, size_t); @@ -230,7 +232,7 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); do { \ (NMC)->nmc_mhead = (MB); \ (NMC)->nmc_mcur = (NMC)->nmc_mhead; \ - (NMC)->nmc_ptr = mbuf_data((NMC)->nmc_mcur); \ + (NMC)->nmc_ptr = mtod((NMC)->nmc_mcur, caddr_t); \ (NMC)->nmc_left = mbuf_trailingspace((NMC)->nmc_mcur); \ (NMC)->nmc_flags = 0; \ } while (0) @@ -250,7 +252,7 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); if ((E) || !(NMC)->nmc_mcur) break; \ /* cap off current mbuf */ \ mbuf_setlen((NMC)->nmc_mcur, \ - (NMC)->nmc_ptr - (caddr_t)mbuf_data((NMC)->nmc_mcur)); \ + (NMC)->nmc_ptr - (caddr_t)mtod((NMC)->nmc_mcur, caddr_t)); \ } while (0) /* make sure there's room for size bytes in current mbuf */ @@ -327,7 +329,7 @@ int 
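The nfssvc_nfsd() hunk pulls the source port out of the request's sockaddr via the SIN()/SIN6() casts and rejects non-NULL procedures whose port is outside the reserved range. A user-space sketch of the same check (IPPORT_RESERVED is 1024; the helper function is illustrative):

    #include <stdbool.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    #ifndef IPPORT_RESERVED
    #define IPPORT_RESERVED 1024
    #endif

    /*
     * Return true if the peer bound its socket to a reserved (privileged)
     * source port, i.e. a port below 1024, for either IPv4 or IPv6.
     */
    static bool
    from_reserved_port(const struct sockaddr *sa)
    {
        in_port_t port = 0;

        if (sa->sa_family == AF_INET) {
            port = ntohs(((const struct sockaddr_in *)(const void *)sa)->sin_port);
        } else if (sa->sa_family == AF_INET6) {
            port = ntohs(((const struct sockaddr_in6 *)(const void *)sa)->sin6_port);
        } else {
            return false;
        }
        return port != 0 && port < IPPORT_RESERVED;
    }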
nfsm_chain_trim_data(struct nfsm_chain *, int, int *); do { \ if (E) break; \ mbuf_setlen((NMC)->nmc_mcur, \ - (NMC)->nmc_ptr - (caddr_t)mbuf_data((NMC)->nmc_mcur)); \ + (NMC)->nmc_ptr - mtod((NMC)->nmc_mcur, caddr_t)); \ (NMC)->nmc_left = 0; \ } while (0) @@ -418,7 +420,7 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); break; \ } \ (NMC)->nmc_mcur = (NMC)->nmc_mhead = (H); \ - (NMC)->nmc_ptr = mbuf_data(H); \ + (NMC)->nmc_ptr = mtod(H, caddr_t); \ (NMC)->nmc_left = mbuf_len(H); \ } while (0) diff --git a/bsd/nfs/nfsproto.h b/bsd/nfs/nfsproto.h index f41656ddc..08a5a6d88 100644 --- a/bsd/nfs/nfsproto.h +++ b/bsd/nfs/nfsproto.h @@ -99,6 +99,9 @@ #define NFSRV_MAXDATA NFS_MAXDATA #define NFSRV_TCPSOCKBUF (2 * NFSRV_MAXDATA) +#define NFSV4_MINORVERSION 0 /* V4.0 Minor version */ +#define NFSV41_MINORVERSION 1 /* V4.1 Minor version */ + #define NFS4_CALLBACK_PROG 0x40000000 #define NFS4_CALLBACK_PROG_VERSION 1 @@ -125,7 +128,9 @@ #define NFSERR_STALE 70 #define NFSERR_REMOTE 71 /* Version 3 only */ #define NFSERR_WFLUSH 99 /* Version 2 only */ -#define NFSERR_BADHANDLE 10001 /* The rest Version 3 only */ + +/* NFSv3 specific errors */ +#define NFSERR_BADHANDLE 10001 #define NFSERR_NOT_SYNC 10002 #define NFSERR_BAD_COOKIE 10003 #define NFSERR_NOTSUPP 10004 @@ -135,7 +140,9 @@ #define NFSERR_JUKEBOX 10008 #define NFSERR_TRYLATER NFSERR_JUKEBOX #define NFSERR_DELAY NFSERR_JUKEBOX -#define NFSERR_SAME 10009 /* The rest Version 4 only */ + +/* NFSv4.0 specific errors */ +#define NFSERR_SAME 10009 #define NFSERR_DENIED 10010 #define NFSERR_EXPIRED 10011 #define NFSERR_LOCKED 10012 @@ -176,15 +183,57 @@ #define NFSERR_ADMIN_REVOKED 10047 #define NFSERR_CB_PATH_DOWN 10048 +/* NFSv4.1 specific errors */ +#define NFSERR_BADIOMODE 10049 +#define NFSERR_BADLAYOUT 10050 +#define NFSERR_BADSESSIONDIGEST 10051 +#define NFSERR_BADSESSION 10052 +#define NFSERR_BADSLOT 10053 +#define NFSERR_COMPLETEALREADY 10054 +#define NFSERR_NOTBNDTOSESS 10055 +#define NFSERR_DELEGALREADYWANT 10056 +#define NFSERR_BACKCHANBUSY 10057 +#define NFSERR_LAYOUTTRYLATER 10058 +#define NFSERR_LAYOUTUNAVAIL 10059 +#define NFSERR_NOMATCHLAYOUT 10060 +#define NFSERR_RECALLCONFLICT 10061 +#define NFSERR_UNKNLAYOUTTYPE 10062 +#define NFSERR_SEQMISORDERED 10063 +#define NFSERR_SEQUENCEPOS 10064 +#define NFSERR_REQTOOBIG 10065 +#define NFSERR_REPTOOBIG 10066 +#define NFSERR_REPTOOBIGTOCACHE 10067 +#define NFSERR_RETRYUNCACHEDREP 10068 +#define NFSERR_UNSAFECOMPOUND 10069 +#define NFSERR_TOOMANYOPS 10070 +#define NFSERR_OPNOTINSESS 10071 +#define NFSERR_HASHALGUNSUPP 10072 +#define NFSERR_CLIENTIDBUSY 10074 +#define NFSERR_PNFSIOHOLE 10075 +#define NFSERR_SEQFALSERETRY 10076 +#define NFSERR_BADHIGHSLOT 10077 +#define NFSERR_DEADSESSION 10078 +#define NFSERR_ENCRALGUNSUPP 10079 +#define NFSERR_PNFSNOLAYOUT 10080 +#define NFSERR_NOTONLYOP 10081 +#define NFSERR_WRONGCRED 10082 +#define NFSERR_WRONGTYPE 10083 +#define NFSERR_DIRDELEGUNAVAIL 10084 +#define NFSERR_REJECTDELEG 10085 +#define NFSERR_RETURNCONFLICT 10086 +#define NFSERR_DELEGREVOKED 10087 + #define NFSERR_STALEWRITEVERF 30001 /* Fake return for nfs_commit() */ #define NFSERR_DIRBUFDROPPED 30002 /* Fake return for nfs*_readdir_rpc() */ +#define NFSERR_REPLYFROMCACHE 30003 /* Fake return for nfs41_sequence_cb_get() */ +#define NFSERR_SEQSTATUSERR 30004 /* Fake return for nfs41_sequence_get() */ /* * For gss we would like to return EAUTH when we don't have or can't get credentials, * but some callers don't know what to do with it, so we define our own version * of EAUTH 
to be EACCES */ -#define NFSERR_EAUTH EACCES +#define NFSERR_EAUTH EACCES #define NFSERR_RETVOID 0x20000000 /* Return void, not error */ #define NFSERR_AUTHERR 0x40000000 /* Mark an authentication error */ @@ -219,6 +268,7 @@ #define NFS4_FHSIZE 128 #define NFS4_VERIFIER_SIZE 8 #define NFS4_OPAQUE_LIMIT 1024 +#define NFS4_SESSIONID_SIZE 16 /* variants for multiple versions */ #define NFSX_FH(V) (((V) == NFS_VER2) ? NFSX_V2FH : (NFSX_UNSIGNED + \ @@ -325,7 +375,8 @@ /* NFS CREATE types */ #define NFS_CREATE_UNCHECKED 0 #define NFS_CREATE_GUARDED 1 -#define NFS_CREATE_EXCLUSIVE 2 +#define NFS_CREATE_EXCLUSIVE 2 /* Deprecated in NFSv4.1. */ +#define NFS_CREATE_EXCLUSIVE4_1 3 /* New to NFSv4.1 */ /* Only define these if nfs_prot.h hasn't been included */ #ifndef NFS_PROGRAM @@ -349,7 +400,7 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, /* * NFS attribute management stuff */ -#define NFS_ATTR_BITMAP_LEN 2 +#define NFS_ATTR_BITMAP_LEN 3 #define NFS_BITMAP_SET(B, I) (((uint32_t *)(B))[(I)/32] |= 1U<<((I)%32)) #define NFS_BITMAP_CLR(B, I) (((uint32_t *)(B))[(I)/32] &= ~(1U<<((I)%32))) #define NFS_BITMAP_ISSET(B, I) (((uint32_t *)(B))[(I)/32] & (1U<<((I)%32))) @@ -440,220 +491,27 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #define NFS_FATTR_TIME_MODIFY 53 #define NFS_FATTR_TIME_MODIFY_SET 54 #define NFS_FATTR_MOUNTED_ON_FILEID 55 - -#define NFS4_ALL_ATTRIBUTES(A) \ - do { \ - /* required: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_SUPPORTED_ATTRS); \ - NFS_BITMAP_SET((A), NFS_FATTR_TYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_FH_EXPIRE_TYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_CHANGE); \ - NFS_BITMAP_SET((A), NFS_FATTR_SIZE); \ - NFS_BITMAP_SET((A), NFS_FATTR_LINK_SUPPORT); \ - NFS_BITMAP_SET((A), NFS_FATTR_SYMLINK_SUPPORT); \ - NFS_BITMAP_SET((A), NFS_FATTR_NAMED_ATTR); \ - NFS_BITMAP_SET((A), NFS_FATTR_FSID); \ - NFS_BITMAP_SET((A), NFS_FATTR_UNIQUE_HANDLES); \ - NFS_BITMAP_SET((A), NFS_FATTR_LEASE_TIME); \ - NFS_BITMAP_SET((A), NFS_FATTR_RDATTR_ERROR); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILEHANDLE); \ - /* optional: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_ACL); \ - NFS_BITMAP_SET((A), NFS_FATTR_ACLSUPPORT); \ - NFS_BITMAP_SET((A), NFS_FATTR_ARCHIVE); \ - NFS_BITMAP_SET((A), NFS_FATTR_CANSETTIME); \ - NFS_BITMAP_SET((A), NFS_FATTR_CASE_INSENSITIVE); \ - NFS_BITMAP_SET((A), NFS_FATTR_CASE_PRESERVING); \ - NFS_BITMAP_SET((A), NFS_FATTR_CHOWN_RESTRICTED); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILEID); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_AVAIL); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_FREE); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_TOTAL); \ - NFS_BITMAP_SET((A), NFS_FATTR_FS_LOCATIONS); \ - NFS_BITMAP_SET((A), NFS_FATTR_HIDDEN); \ - NFS_BITMAP_SET((A), NFS_FATTR_HOMOGENEOUS); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXFILESIZE); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXLINK); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXNAME); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXREAD); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXWRITE); \ - NFS_BITMAP_SET((A), NFS_FATTR_MIMETYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_MODE); \ - NFS_BITMAP_SET((A), NFS_FATTR_NO_TRUNC); \ - NFS_BITMAP_SET((A), NFS_FATTR_NUMLINKS); \ - NFS_BITMAP_SET((A), NFS_FATTR_OWNER); \ - NFS_BITMAP_SET((A), NFS_FATTR_OWNER_GROUP); \ - NFS_BITMAP_SET((A), NFS_FATTR_QUOTA_AVAIL_HARD); \ - NFS_BITMAP_SET((A), NFS_FATTR_QUOTA_AVAIL_SOFT); \ - NFS_BITMAP_SET((A), NFS_FATTR_QUOTA_USED); \ - NFS_BITMAP_SET((A), NFS_FATTR_RAWDEV); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_AVAIL); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_FREE); \ - 
NFS_BITMAP_SET((A), NFS_FATTR_SPACE_TOTAL); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_USED); \ - NFS_BITMAP_SET((A), NFS_FATTR_SYSTEM); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_ACCESS); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_ACCESS_SET); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_BACKUP); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_CREATE); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_DELTA); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_METADATA); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY_SET); \ - NFS_BITMAP_SET((A), NFS_FATTR_MOUNTED_ON_FILEID); \ - } while (0) - -#define NFS4_PER_OBJECT_ATTRIBUTES(A) \ - do { \ - /* required: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_TYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_CHANGE); \ - NFS_BITMAP_SET((A), NFS_FATTR_SIZE); \ - NFS_BITMAP_SET((A), NFS_FATTR_NAMED_ATTR); \ - NFS_BITMAP_SET((A), NFS_FATTR_FSID); \ - NFS_BITMAP_SET((A), NFS_FATTR_RDATTR_ERROR); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILEHANDLE); \ - /* optional: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_ACL); \ - NFS_BITMAP_SET((A), NFS_FATTR_ARCHIVE); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILEID); \ - NFS_BITMAP_SET((A), NFS_FATTR_HIDDEN); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXLINK); \ - NFS_BITMAP_SET((A), NFS_FATTR_MIMETYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_MODE); \ - NFS_BITMAP_SET((A), NFS_FATTR_NUMLINKS); \ - NFS_BITMAP_SET((A), NFS_FATTR_OWNER); \ - NFS_BITMAP_SET((A), NFS_FATTR_OWNER_GROUP); \ - NFS_BITMAP_SET((A), NFS_FATTR_RAWDEV); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_USED); \ - NFS_BITMAP_SET((A), NFS_FATTR_SYSTEM); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_ACCESS); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_BACKUP); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_CREATE); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_METADATA); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY); \ - NFS_BITMAP_SET((A), NFS_FATTR_MOUNTED_ON_FILEID); \ - } while (0) - -#define NFS4_PER_FS_ATTRIBUTES(A) \ - do { \ - /* required: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_SUPPORTED_ATTRS); \ - NFS_BITMAP_SET((A), NFS_FATTR_FH_EXPIRE_TYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_LINK_SUPPORT); \ - NFS_BITMAP_SET((A), NFS_FATTR_SYMLINK_SUPPORT); \ - NFS_BITMAP_SET((A), NFS_FATTR_UNIQUE_HANDLES); \ - NFS_BITMAP_SET((A), NFS_FATTR_LEASE_TIME); \ - /* optional: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_ACLSUPPORT); \ - NFS_BITMAP_SET((A), NFS_FATTR_CANSETTIME); \ - NFS_BITMAP_SET((A), NFS_FATTR_CASE_INSENSITIVE); \ - NFS_BITMAP_SET((A), NFS_FATTR_CASE_PRESERVING); \ - NFS_BITMAP_SET((A), NFS_FATTR_CHOWN_RESTRICTED); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_AVAIL); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_FREE); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_TOTAL); \ - NFS_BITMAP_SET((A), NFS_FATTR_FS_LOCATIONS); \ - NFS_BITMAP_SET((A), NFS_FATTR_HOMOGENEOUS); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXFILESIZE); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXNAME); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXREAD); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXWRITE); \ - NFS_BITMAP_SET((A), NFS_FATTR_NO_TRUNC); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_AVAIL); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_FREE); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_TOTAL); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_DELTA); \ - } while (0) - -#define NFS4_DEFAULT_ATTRIBUTES(A) \ - do { \ - /* required: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_SUPPORTED_ATTRS); \ - NFS_BITMAP_SET((A), NFS_FATTR_TYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_FH_EXPIRE_TYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_CHANGE); \ - NFS_BITMAP_SET((A), NFS_FATTR_SIZE); \ - NFS_BITMAP_SET((A), NFS_FATTR_LINK_SUPPORT); \ - 
NFS_BITMAP_SET((A), NFS_FATTR_SYMLINK_SUPPORT); \ - NFS_BITMAP_SET((A), NFS_FATTR_NAMED_ATTR); \ - NFS_BITMAP_SET((A), NFS_FATTR_FSID); \ - NFS_BITMAP_SET((A), NFS_FATTR_UNIQUE_HANDLES); \ - NFS_BITMAP_SET((A), NFS_FATTR_LEASE_TIME); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_RDATTR_ERROR); */ \ - /* NFS_BITMAP_SET((A), NFS_FATTR_FILEHANDLE); */ \ - /* optional: */ \ - /* NFS_BITMAP_SET((A), NFS_FATTR_ACL); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_ACLSUPPORT); \ - NFS_BITMAP_SET((A), NFS_FATTR_ARCHIVE); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_CANSETTIME); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_CASE_INSENSITIVE); \ - NFS_BITMAP_SET((A), NFS_FATTR_CASE_PRESERVING); \ - NFS_BITMAP_SET((A), NFS_FATTR_CHOWN_RESTRICTED); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILEID); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_AVAIL); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_FREE); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_TOTAL); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_FS_LOCATIONS); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_HIDDEN); \ - NFS_BITMAP_SET((A), NFS_FATTR_HOMOGENEOUS); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXFILESIZE); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXLINK); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXNAME); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXREAD); \ - NFS_BITMAP_SET((A), NFS_FATTR_MAXWRITE); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_MIMETYPE); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_MODE); \ - NFS_BITMAP_SET((A), NFS_FATTR_NO_TRUNC); \ - NFS_BITMAP_SET((A), NFS_FATTR_NUMLINKS); \ - NFS_BITMAP_SET((A), NFS_FATTR_OWNER); \ - NFS_BITMAP_SET((A), NFS_FATTR_OWNER_GROUP); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_QUOTA_AVAIL_HARD); */ \ - /* NFS_BITMAP_SET((A), NFS_FATTR_QUOTA_AVAIL_SOFT); */ \ - /* NFS_BITMAP_SET((A), NFS_FATTR_QUOTA_USED); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_RAWDEV); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_AVAIL); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_FREE); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_TOTAL); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_USED); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_SYSTEM); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_ACCESS); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_TIME_ACCESS_SET); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_BACKUP); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_CREATE); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_TIME_DELTA); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_METADATA); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY); \ - /* NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY_SET); */ \ - NFS_BITMAP_SET((A), NFS_FATTR_MOUNTED_ON_FILEID); \ - } while (0) - -/* - * NFSv4 WRITE RPCs contain partial GETATTR requests - only type, change, size, metadatatime and modifytime are requested. - * In such cases, we do not update the time stamp - but the requested attributes. 
- */ -#define NFS4_DEFAULT_WRITE_ATTRIBUTES(A) \ - do { \ - /* required: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_TYPE); \ - NFS_BITMAP_SET((A), NFS_FATTR_CHANGE); \ - NFS_BITMAP_SET((A), NFS_FATTR_SIZE); \ - /* optional: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_METADATA); \ - NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY); \ - } while (0) - -/* attributes requested when we want to do a "statfs" */ -#define NFS4_STATFS_ATTRIBUTES(A) \ - do { \ - /* optional: */ \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_AVAIL); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_FREE); \ - NFS_BITMAP_SET((A), NFS_FATTR_FILES_TOTAL); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_AVAIL); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_FREE); \ - NFS_BITMAP_SET((A), NFS_FATTR_SPACE_TOTAL); \ - } while (0) +#define NFS_FATTR_DIR_NOTIF_DELAY 56 +#define NFS_FATTR_DIRENT_NOTIF_DELAY 57 +#define NFS_FATTR_DACL 58 +#define NFS_FATTR_SACL 59 +#define NFS_FATTR_CHANGE_POLICY 60 +#define NFS_FATTR_FS_STATUS 61 +#define NFS_FATTR_FS_LAYOUT_TYPE 62 +#define NFS_FATTR_LAYOUT_HINT 63 +#define NFS_FATTR_LAYOUT_TYPE 64 +#define NFS_FATTR_LAYOUT_BLKSIZE 65 +#define NFS_FATTR_LAYOUT_ALIGNMENT 66 +#define NFS_FATTR_FS_LOCATIONS_INFO 67 +#define NFS_FATTR_MDSTHRESHOLD 68 +#define NFS_FATTR_RETENTION_GET 69 +#define NFS_FATTR_RETENTION_SET 70 +#define NFS_FATTR_RETENTEVT_GET 71 +#define NFS_FATTR_RETENTEVT_SET 72 +#define NFS_FATTR_RETENTION_HOLD 73 +#define NFS_FATTR_MODE_SET_MASKED 74 +#define NFS_FATTR_SUPPATTR_EXCLCREAT 75 +#define NFS_FATTR_FS_CHARSET_CAP 76 /* * NFS OPEN constants @@ -661,9 +519,11 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, /* open type */ #define NFS_OPEN_NOCREATE 0 #define NFS_OPEN_CREATE 1 + /* delegation space limit */ #define NFS_LIMIT_SIZE 1 #define NFS_LIMIT_BLOCKS 2 + /* access/deny modes */ #define NFS_OPEN_SHARE_ACCESS_NONE 0x00000000 #define NFS_OPEN_SHARE_ACCESS_READ 0x00000001 @@ -673,18 +533,50 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #define NFS_OPEN_SHARE_DENY_READ 0x00000001 #define NFS_OPEN_SHARE_DENY_WRITE 0x00000002 #define NFS_OPEN_SHARE_DENY_BOTH 0x00000003 + +/* flags for share_access field of OPEN4args */ +#define NFS_OPEN4_SHARE_ACCESS_WANT_DELEG_MASK 0x0FF00 +#define NFS_OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE 0x00000 +#define NFS_OPEN4_SHARE_ACCESS_WANT_READ_DELEG 0x00100 +#define NFS_OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG 0x00200 +#define NFS_OPEN4_SHARE_ACCESS_WANT_ANY_DELEG 0x00300 +#define NFS_OPEN4_SHARE_ACCESS_WANT_NO_DELEG 0x00400 +#define NFS_OPEN4_SHARE_ACCESS_WANT_CANCEL 0x00500 +#define NFS_OPEN4_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL 0x10000 +#define NFS_OPEN4_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED 0x20000 + /* delegation types */ #define NFS_OPEN_DELEGATE_NONE 0 #define NFS_OPEN_DELEGATE_READ 1 #define NFS_OPEN_DELEGATE_WRITE 2 +#define NFS_OPEN_DELEGATE_NONE_EXT 3 /* New to v4.1 */ + /* delegation claim types */ #define NFS_CLAIM_NULL 0 #define NFS_CLAIM_PREVIOUS 1 #define NFS_CLAIM_DELEGATE_CUR 2 #define NFS_CLAIM_DELEGATE_PREV 3 +#define NFS_CLAIM_FH 4 /* New to v4.1 */ +#define NFS_CLAIM_DELEG_CUR_FH 5 /* New to v4.1 */ +#define NFS_CLAIM_DELEG_PREV_FH 6 /* New to v4.1 */ + +/* why_no_delegation4 */ +#define NFS_WND4_NOT_WANTED 0 +#define NFS_WND4_CONTENTION 1 +#define NFS_WND4_RESOURCE 2 +#define NFS_WND4_NOT_SUPP_FTYPE 3 +#define NFS_WND4_WRITE_DELEG_NOT_SUPP_FTYPE 4 +#define NFS_WND4_NOT_SUPP_UPGRADE 5 +#define NFS_WND4_NOT_SUPP_DOWNGRADE 6 +#define NFS_WND4_CANCELLED 7 +#define NFS_WND4_IS_DIR 8 + /* open result flags */ 
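The nfsproto.h hunks extend the NFSv4 attribute numbers up to NFS_FATTR_FS_CHARSET_CAP (76) and grow NFS_ATTR_BITMAP_LEN from 2 to 3: each bitmap word holds 32 attribute bits, so any attribute numbered 64 or higher needs a third uint32_t. A sketch of the word/bit arithmetic the NFS_BITMAP_* macros perform (generic names, same layout):

    #include <stdint.h>
    #include <stdio.h>

    #define BITMAP_LEN 3   /* 3 x 32 bits covers attribute numbers 0..95 */

    #define BITMAP_SET(b, i)   ((b)[(i) / 32] |= 1u << ((i) % 32))
    #define BITMAP_ISSET(b, i) (((b)[(i) / 32] & (1u << ((i) % 32))) != 0)

    int
    main(void)
    {
        uint32_t attrs[BITMAP_LEN] = { 0, 0, 0 };

        BITMAP_SET(attrs, 4);    /* lands in word 0, bit 4  */
        BITMAP_SET(attrs, 55);   /* lands in word 1, bit 23 */
        BITMAP_SET(attrs, 75);   /* lands in word 2, bit 11 -- needs the 3rd word */

        for (unsigned w = 0; w < BITMAP_LEN; w++) {
            printf("word %u = 0x%08x\n", w, (unsigned)attrs[w]);
        }
        printf("attr 75 set: %d\n", BITMAP_ISSET(attrs, 75));
        return 0;
    }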
#define NFS_OPEN_RESULT_CONFIRM 0x00000002 #define NFS_OPEN_RESULT_LOCKTYPE_POSIX 0x00000004 +#define NFS_OPEN_RESULT_PRESERVE_UNLINKED 0x00000008 +#define NFS_OPEN_RESULT_MAY_NOTIFY_LOCK 0x00000020 + /* NFS lock types */ #define NFS_LOCK_TYPE_READ 1 #define NFS_LOCK_TYPE_WRITE 2 @@ -735,14 +627,61 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #define NFS_OP_VERIFY 37 #define NFS_OP_WRITE 38 #define NFS_OP_RELEASE_LOCKOWNER 39 + +/* NFSv4 opcodes */ +#define NFS_V4_OP_COUNT 40 + +/* NFSv4.1 opcodes */ +#define NFS_OP_BACKCHANNELCTL 40 +#define NFS_OP_BINDCONNTOSESS 41 +#define NFS_OP_EXCHANGEID 42 +#define NFS_OP_CREATESESSION 43 +#define NFS_OP_DESTROYSESSION 44 +#define NFS_OP_FREESTATEID 45 +#define NFS_OP_GETDIRDELEG 46 +#define NFS_OP_GETDEVINFO 47 +#define NFS_OP_GETDEVLIST 48 +#define NFS_OP_LAYOUTCOMMIT 49 +#define NFS_OP_LAYOUTGET 50 +#define NFS_OP_LAYOUTRETURN 51 +#define NFS_OP_SECINFONONAME 52 +#define NFS_OP_SEQUENCE 53 +#define NFS_OP_SETSSV 54 +#define NFS_OP_TESTSTATEID 55 +#define NFS_OP_WANTDELEG 56 +#define NFS_OP_DESTROYCLIENTID 57 +#define NFS_OP_RECLAIMCOMPL 58 + +/* NFSv4.1 opcodes */ +#define NFS_V41_OP_COUNT 59 + +/* illegal op code */ #define NFS_OP_ILLEGAL 10044 -#define NFS_OP_COUNT 40 /* NFSv4 callback opcodes */ #define NFS_OP_CB_GETATTR 3 #define NFS_OP_CB_RECALL 4 + +/* NFSv4 callback opcodes */ +#define NFS_V4_OP_CB_COUNT 5 + +/* NFSv4.1 callback opcodes */ +#define NFS_OP_CB_LAYOUTRECALL 5 +#define NFS_OP_CB_NOTIFY 6 +#define NFS_OP_CB_PUSHDELEG 7 +#define NFS_OP_CB_RECALLANY 8 +#define NFS_OP_CB_RECALLOBJAVAIL 9 +#define NFS_OP_CB_RECALLSLOT 10 +#define NFS_OP_CB_SEQUENCE 11 +#define NFS_OP_CB_WANTCANCELLED 12 +#define NFS_OP_CB_NOTIFYLOCK 13 +#define NFS_OP_CB_NOTIFYDEVID 14 + +/* NFSv4.1 callback opcodes */ +#define NFS_V41_OP_CB_COUNT 15 + +/* illegal op code */ #define NFS_OP_CB_ILLEGAL 10044 -#define NFS_OP_CB_COUNT 5 /* NFSv4 file handle type flags */ #define NFS_FH_PERSISTENT 0x00000000 @@ -751,6 +690,98 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #define NFS_FH_VOL_MIGRATION 0x00000004 #define NFS_FH_VOL_RENAME 0x00000008 +/* NFSv4.1 Constants */ + +/* BIND_CONN_TO_SESSION - Associate Connection with Session */ +#define NFS_CDFC4_FORE 0x1 +#define NFS_CDFC4_BACK 0x2 +#define NFS_CDFC4_FORE_OR_BOTH 0x3 +#define NFS_CDFC4_BACK_OR_BOTH 0x7 + +#define NFS_CDFS4_FORE 0x1 +#define NFS_CDFS4_BACK 0x2 +#define NFS_CDFS4_BOTH 0x3 + +/* EXCHANGE_ID - Instantiate Client ID */ +#define NFS_EXCHGID4_FLAG_SUPP_MOVED_REFER 0x00000001 +#define NFS_EXCHGID4_FLAG_SUPP_MOVED_MIGR 0x00000002 +#define NFS_EXCHGID4_FLAG_BIND_PRINC_STATEID 0x00000100 +#define NFS_EXCHGID4_FLAG_USE_NON_PNFS 0x00010000 +#define NFS_EXCHGID4_FLAG_USE_PNFS_MDS 0x00020000 +#define NFS_EXCHGID4_FLAG_USE_PNFS_DS 0x00040000 +#define NFS_EXCHGID4_FLAG_MASK_PNFS 0x00070000 +#define NFS_EXCHGID4_FLAG_UPD_CONFIRMED_REC_A 0x40000000 +#define NFS_EXCHGID4_FLAG_CONFIRMED_R 0x80000000 + +#define NFS_EXCHGID4_SP4_NONE 0 +#define NFS_EXCHGID4_SP4_MACH_CRED 1 +#define NFS_EXCHGID4_SP4_SSV 2 + +/* CREATE_SESSION - Create New Session and Confirm Client ID */ +#define NFS_CREATE_SESSION4_FLAG_PERSIST 0x00000001 +#define NFS_CREATE_SESSION4_FLAG_CONN_BACK_CHAN 0x00000002 +#define NFS_CREATE_SESSION4_FLAG_CONN_RDMA 0x00000004 + +/* GET_DIR_DELEGATION - Get a Directory Delegation */ +#define NFS_GDD4_OK 0 +#define NFS_GDD4_UNAVAIL 1 + +/* LAYOUTRETURN - Release Layout Information */ +#define NFS_LAYOUT4_RET_REC_FILE 1 +#define NFS_LAYOUT4_RET_REC_FSID 2 
+#define NFS_LAYOUT4_RET_REC_ALL 3 + +#define NFS_LAYOUTRETURN4_FILE NFS_LAYOUT4_RET_REC_FILE +#define NFS_LAYOUTRETURN4_FSID NFS_LAYOUT4_RET_REC_FSID +#define NFS_LAYOUTRETURN4_ALL NFS_LAYOUT4_RET_REC_ALL + +/* SECINFO_NO_NAME - Get Security on Unnamed Object */ +#define NFS_SECINFO_STYLE4_CURRENT_FH 0 +#define NFS_SECINFO_STYLE4_PARENT 1 + +/* SEQUENCE - Supply Per-Procedure Sequencing and Control */ +#define NFS_SEQ4_STATUS_CB_PATH_DOWN 0x00000001 +#define NFS_SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 +#define NFS_SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +#define NFS_SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED 0x00000008 +#define NFS_SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED 0x00000010 +#define NFS_SEQ4_STATUS_ADMIN_STATE_REVOKED 0x00000020 +#define NFS_SEQ4_STATUS_RECALLABLE_STATE_REVOKED 0x00000040 +#define NFS_SEQ4_STATUS_LEASE_MOVED 0x00000080 +#define NFS_SEQ4_STATUS_RESTART_RECLAIM_NEEDED 0x00000100 +#define NFS_SEQ4_STATUS_CB_PATH_DOWN_SESSION 0x00000200 +#define NFS_SEQ4_STATUS_BACKCHANNEL_FAULT 0x00000400 +#define NFS_SEQ4_STATUS_DEVID_CHANGED 0x00000800 +#define NFS_SEQ4_STATUS_DEVID_DELETED 0x00001000 + +/* CB_LAYOUTRECALL - Recall Layout from Client */ +#define NFS_LAYOUTRECALL4_FILE NFS_LAYOUT4_RET_REC_FILE +#define NFS_LAYOUTRECALL4_FSID NFS_LAYOUT4_RET_REC_FSID +#define NFS_LAYOUTRECALL4_ALL NFS_LAYOUT4_RET_REC_ALL + +/* CB_NOTIFY - Notify Client of Directory Changes */ +#define NFS_NOTIFY4_CHANGE_CHILD_ATTRS 0 +#define NFS_NOTIFY4_CHANGE_DIR_ATTRS 1 +#define NFS_NOTIFY4_REMOVE_ENTRY 2 +#define NFS_NOTIFY4_ADD_ENTRY 3 +#define NFS_NOTIFY4_RENAME_ENTRY 4 +#define NFS_NOTIFY4_CHANGE_COOKIE_VERIFIER 5 + +/* CB_RECALL_ANY - Keep Any N Recallable Objects */ +#define NFS_RCA4_TYPE_MASK_RDATA_DLG 0 +#define NFS_RCA4_TYPE_MASK_WDATA_DLG 1 +#define NFS_RCA4_TYPE_MASK_DIR_DLG 2 +#define NFS_RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define NFS_RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define NFS_RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define NFS_RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define NFS_RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define NFS_RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 + +/* CB_NOTIFY_DEVICEID - Notify Client of Device ID Changes */ +#define NFS_NOTIFY_DEVICEID4_CHANGE 1 +#define NFS_NOTIFY_DEVICEID4_DELETE 2 + /* * NFSv4 ACL constants */ @@ -759,11 +790,13 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #define NFS_ACL_SUPPORT_DENY_ACL 0x00000002 #define NFS_ACL_SUPPORT_AUDIT_ACL 0x00000004 #define NFS_ACL_SUPPORT_ALARM_ACL 0x00000008 + /* ACE types */ #define NFS_ACE_ACCESS_ALLOWED_ACE_TYPE 0x00000000 #define NFS_ACE_ACCESS_DENIED_ACE_TYPE 0x00000001 #define NFS_ACE_SYSTEM_AUDIT_ACE_TYPE 0x00000002 #define NFS_ACE_SYSTEM_ALARM_ACE_TYPE 0x00000003 + /* ACE flags */ #define NFS_ACE_FILE_INHERIT_ACE 0x00000001 #define NFS_ACE_DIRECTORY_INHERIT_ACE 0x00000002 @@ -773,6 +806,7 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #define NFS_ACE_FAILED_ACCESS_ACE_FLAG 0x00000020 #define NFS_ACE_IDENTIFIER_GROUP 0x00000040 #define NFS_ACE_INHERITED_ACE 0x00000080 + /* ACE mask flags */ #define NFS_ACE_READ_DATA 0x00000001 #define NFS_ACE_LIST_DIRECTORY 0x00000001 @@ -786,6 +820,8 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #define NFS_ACE_DELETE_CHILD 0x00000040 #define NFS_ACE_READ_ATTRIBUTES 0x00000080 #define NFS_ACE_WRITE_ATTRIBUTES 0x00000100 +#define NFS_ACE_WRITE_RETENTION 0x00000200 +#define NFS_ACE_WRITE_RETENTION_HOLD 0x00000400 #define NFS_ACE_DELETE 0x00010000 #define NFS_ACE_READ_ACL 0x00020000 #define NFS_ACE_WRITE_ACL 
0x00040000 @@ -795,6 +831,87 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, #define NFS_ACE_GENERIC_WRITE 0x00160106 #define NFS_ACE_GENERIC_EXECUTE 0x001200A0 +/* deviceid4 */ +#define NFS4_DEVICEID4_SIZE 16 + +/* retention_get */ +#define NFS_RET4_DURATION_INFINITE 0xffffffffffffffff + +/* na41_flag */ +#define NFS_ACL4_AUTO_INHERIT 0x00000001 +#define NFS_ACL4_PROTECTED 0x00000002 +#define NFS_ACL4_DEFAULTED 0x00000004 + +/* thi_hintset */ +#define NFS_TH4_READ_SIZE 0 +#define NFS_TH4_WRITE_SIZE 1 +#define NFS_TH4_READ_IOSIZE 2 +#define NFS_TH4_WRITE_IOSIZE 3 + +/* fs_locations_info4 */ +#define NFS_FSLI4BX_GFLAGS 0 +#define NFS_FSLI4BX_TFLAGS 1 + +#define NFS_FSLI4BX_CLSIMUL 2 +#define NFS_FSLI4BX_CLHANDLE 3 +#define NFS_FSLI4BX_CLFILEID 4 +#define NFS_FSLI4BX_CLWRITEVER 5 +#define NFS_FSLI4BX_CLCHANGE 6 +#define NFS_FSLI4BX_CLREADDIR 7 + +#define NFS_FSLI4BX_READRANK 8 +#define NFS_FSLI4BX_WRITERANK 9 +#define NFS_FSLI4BX_READORDER 10 +#define NFS_FSLI4BX_WRITEORDER 11 + +#define NFS_FSLI4GF_WRITABLE 0x01 +#define NFS_FSLI4GF_CUR_REQ 0x02 +#define NFS_FSLI4GF_ABSENT 0x04 +#define NFS_FSLI4GF_GOING 0x08 +#define NFS_FSLI4GF_SPLIT 0x10 + +#define NFS_FSLI4TF_RDMA 0x01 /* Bits defined within the transport flag byte */ + +#define NFS_FSLI4IF_VAR_SUB 0x00000001 /* Flag bits in fli_flags */ + +/* layouttype4 */ +#define NFS_LAYOUT4_NFSV4_1_FILES 0x1 +#define NFS_LAYOUT4_OSD2_OBJECTS 0x2 +#define NFS_LAYOUT4_BLOCK_VOLUME 0x3 + +/* layoutiomode4 */ +#define NFS_LAYOUTIOMODE4_READ 1 +#define NFS_LAYOUTIOMODE4_RW 2 +#define NFS_LAYOUTIOMODE4_ANY 3 + +/* fs4_status_type */ +#define NFS_STATUS4_FIXED 1 +#define NFS_STATUS4_UPDATED 2 +#define NFS_STATUS4_VERSIONED 3 +#define NFS_STATUS4_WRITABLE 4 +#define NFS_STATUS4_REFERRAL 5 + +/* nfsv4_1_file_layouthint4 */ +#define NFS_NFL4_UFLG_MASK 0x0000003F +#define NFS_NFL4_UFLG_DENSE 0x00000001 +#define NFS_NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 +#define NFS_NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 + +/* filelayout_hint_care4 */ +#define NFS_NFLH4_CARE_DENSE NFS_NFL4_UFLG_DENSE +#define NFS_NFLH4_CARE_COMMIT_THRU_MDS NFS_NFL4_UFLG_COMMIT_THRU_MDS +#define NFS_NFLH4_CARE_STRIPE_UNIT_SIZE 0x00000040 +#define NFS_NFLH4_CARE_STRIPE_COUNT 0x00000080 + +/* fs_charset_cap4 */ +#define NFS_FSCHARSET_CAP4_CONTAINS_NON_UTF8 0x1 +#define NFS_FSCHARSET_CAP4_ALLOWS_ONLY_UTF8 0x2 + +/* ssv_subkey4 */ +#define NFS_SSV4_SUBKEY_MIC_I2T 1 +#define NFS_SSV4_SUBKEY_MIC_T2I 2 +#define NFS_SSV4_SUBKEY_SEAL_I2T 3 +#define NFS_SSV4_SUBKEY_SEAL_T2I 4 /* * Quads are defined as arrays of 2 32-bit values to ensure dense packing diff --git a/bsd/pgo/profile_runtime_data.c b/bsd/pgo/profile_runtime_data.c index f6c506055..bd97e982f 100644 --- a/bsd/pgo/profile_runtime_data.c +++ b/bsd/pgo/profile_runtime_data.c @@ -1,11 +1,16 @@ #include +#ifndef __BUILDING_XNU_LIBRARY__ /* * This tells compiler_rt not to include userspace-specific stuff writing * profile data to a file. + * When building userspace unit-test we don't do that because we do want + * the normal file-saving coverage mechanism to work as usual in a */ int __llvm_profile_runtime = 0; +#endif /* __BUILDING_XNU_LIBRARY__ */ + /* compiler-rt requires this. It uses it to page-align * certain things inside its buffers. 
*/ diff --git a/bsd/pthread/pthread_shims.c b/bsd/pthread/pthread_shims.c index 3225d6f58..e5e58b569 100644 --- a/bsd/pthread/pthread_shims.c +++ b/bsd/pthread/pthread_shims.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -369,9 +370,12 @@ bsdthread_terminate(struct proc *p, struct bsdthread_terminate_args *uap, int32_ struct _bsdthread_terminate *bts = &uth->uu_save.uus_bsdthread_terminate; mach_port_name_t sem = (mach_port_name_t)uap->sema_or_ulock; mach_port_name_t thp = uap->port; + uint16_t tag = thread_get_tag(th); - if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) { + if (tag & THREAD_TAG_WORKQUEUE) { workq_thread_terminate(p, get_bsdthread_info(th)); + } else if (tag & THREAD_TAG_AIO_WORKQUEUE) { + return ENOTSUP; } /* @@ -562,6 +566,8 @@ static const struct pthread_callbacks_s pthread_callbacks = { .thread_bootstrap_return = pthread_bootstrap_return, .unix_syscall_return = unix_syscall_return, + .abandon_preemption_disable_measurement = abandon_preemption_disable_measurement, + .get_bsdthread_info = get_bsdthread_info, .thread_policy_set_internal = thread_policy_set_internal, .thread_policy_get = thread_policy_get, @@ -573,12 +579,12 @@ static const struct pthread_callbacks_s pthread_callbacks = { .current_map = _current_map, .thread_create_immovable = thread_create_immovable, - .thread_terminate_pinned = thread_terminate_pinned, + .thread_terminate_pinned = thread_terminate_immovable, .thread_resume = thread_resume, .kevent_workq_internal = kevent_workq_internal, - .convert_thread_to_port_pinned = convert_thread_to_port_pinned, + .convert_thread_to_port_pinned = convert_thread_to_port_immovable, .proc_get_stack_addr_hint = proc_get_stack_addr_hint, .proc_set_stack_addr_hint = proc_set_stack_addr_hint, diff --git a/bsd/pthread/pthread_workqueue.c b/bsd/pthread/pthread_workqueue.c index a0bb56c9b..45d9fc00f 100644 --- a/bsd/pthread/pthread_workqueue.c +++ b/bsd/pthread/pthread_workqueue.c @@ -4944,12 +4944,12 @@ workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags) } if (uth->uu_workq_thport == MACH_PORT_NULL) { - /* convert_thread_to_port_pinned() consumes a reference */ + /* convert_thread_to_port_immovable() consumes a reference */ thread_reference(th); - /* Convert to immovable/pinned thread port, but port is not pinned yet */ - ipc_port_t port = convert_thread_to_port_pinned(th); - /* Atomically, pin and copy out the port */ - uth->uu_workq_thport = ipc_port_copyout_send_pinned(port, get_task_ipcspace(proc_task(p))); + /* Convert to immovable thread port, then pin the entry */ + uth->uu_workq_thport = ipc_port_copyout_send_pinned( + convert_thread_to_port_immovable(th), + get_task_ipcspace(proc_task(p))); } /* Thread has been set up to run, arm its next workqueue quantum or disarm diff --git a/bsd/security/audit/audit_bsm.c b/bsd/security/audit/audit_bsm.c index 8dd19a851..05eff314f 100644 --- a/bsd/security/audit/audit_bsm.c +++ b/bsd/security/audit/audit_bsm.c @@ -1308,6 +1308,16 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) kau_write(rec, tok); } break; + case AUE_FUNMOUNT: + if (ARG_IS_VALID(kar, ARG_FD)) { + tok = au_to_arg32(2, "dir fd", ar->ar_arg_fd); + kau_write(rec, tok); + } + if (ARG_IS_VALID(kar, ARG_FFLAGS)) { + tok = au_to_arg32(3, "flags", ar->ar_arg_fflags); + kau_write(rec, tok); + } + break; case AUE_MSGCTL: ar->ar_event = audit_msgctl_to_event(ar->ar_arg_svipc_cmd); diff --git a/bsd/skywalk/channel/channel.c b/bsd/skywalk/channel/channel.c index b49177cb7..8ccbb2940 100644 --- 
a/bsd/skywalk/channel/channel.c +++ b/bsd/skywalk/channel/channel.c @@ -59,6 +59,8 @@ #include #include +#include + #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code)) struct ch_event_result { @@ -86,7 +88,7 @@ static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t, static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx, struct ch_ev_thresh *); static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *, - struct kern_channel *, struct nxbind *, struct proc *, int, int *); + struct nxbind *, struct proc *, int, int *); static void ch_disconnect(struct kern_channel *); static int ch_set_lowat_thresh(struct kern_channel *, enum txrx, struct sockopt *); @@ -179,10 +181,9 @@ SKMEM_TAG_DEFINE(skmem_tag_ch_key, SKMEM_TAG_CH_KEY); static void ch_redzone_init(void) { - _CASSERT(sizeof(__ch_umd_redzone_cookie) == - sizeof(((struct __metadata_preamble *)0)->mdp_redzone)); - _CASSERT(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble)); - _CASSERT(sizeof(struct __slot_desc) == 8); + static_assert(sizeof(__ch_umd_redzone_cookie) == sizeof(((struct __metadata_preamble *)0)->mdp_redzone)); + static_assert(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble)); + static_assert(sizeof(struct __slot_desc) == 8); /* Initialize random user red zone cookie values */ do { @@ -201,8 +202,8 @@ channel_init(void) SK_LOCK_ASSERT_HELD(); ASSERT(!__ch_inited); - _CASSERT(offsetof(struct __user_packet, pkt_qum) == 0); - _CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0); + static_assert(offsetof(struct __user_packet, pkt_qum) == 0); + static_assert(offsetof(struct __kern_packet, pkt_qum) == 0); ch_redzone_init(); @@ -299,8 +300,8 @@ csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql) struct ch_selinfo *csi = &kring->ckr_si; CSI_LOCK(csi); - SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) " - "si 0x%llx si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R", + SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) kr %s (%p) " + "si %p si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags); @@ -315,7 +316,7 @@ csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p, struct ch_selinfo *csi = &na->na_si[t]; CSI_LOCK(csi); - SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx si_flags 0x%x", + SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) si %p si_flags 0x%x", (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na), SK_KVA(&csi->csi_si), csi->csi_si.si_flags); @@ -380,12 +381,12 @@ csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay, struct ch_selinfo *csi = &kring->ckr_si; CSI_LOCK(csi); - SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) " - "si 0x%llx si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b", + SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) kr %s (%p) " + "si %p si_flags 0x%x nodelay %u kev %u sel %u hint 0x%x", (kring->ckr_tx == NR_TX) ? 
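The channel.c hunks below swap xnu's _CASSERT() macro for C11 static_assert() while keeping the same layout checks (a preamble embedded at offset 0, a slot descriptor that must stay 8 bytes). A minimal standalone example of that kind of compile-time layout check, with hypothetical structure names standing in for the skywalk ones:

    #include <assert.h>   /* static_assert (C11) */
    #include <stddef.h>   /* offsetof */
    #include <stdint.h>

    struct preamble {
        uint64_t redzone;
    };

    struct packet {
        struct preamble pkt_pre;   /* must stay the first member */
        uint32_t        pkt_flags;
    };

    struct slot_desc {
        uint32_t sd_md;
        uint32_t sd_flags;
    };

    /* Fail the build, not the run, if these layout assumptions ever change. */
    static_assert(offsetof(struct packet, pkt_pre) == 0,
        "preamble must be the first member of struct packet");
    static_assert(sizeof(struct slot_desc) == 8,
        "slot descriptor is assumed to be exactly 8 bytes");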
"W" : "R", KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay, - within_kevent, selwake, hint, CHAN_FILT_HINT_BITS); + within_kevent, selwake, hint); csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint); CSI_UNLOCK(csi); @@ -398,11 +399,11 @@ csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay, struct ch_selinfo *csi = &na->na_si[t]; CSI_LOCK(csi); - SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx " - "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b", + SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (%p) si %p " + "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%x", (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na), SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay, - within_kevent, selwake, hint, CHAN_FILT_HINT_BITS); + within_kevent, selwake, hint); switch (t) { case NR_RX: @@ -508,7 +509,7 @@ filt_chrwdetach(struct knote *kn, boolean_t write) si = &csi->csi_si; CSI_LOCK(csi); - SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s) " + SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s%s) " "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "", write ? "write" : "read", si->si_flags); @@ -551,8 +552,8 @@ filt_chrw(struct knote *kn, long hint, int events) #pragma unused(hint) #pragma unused(events) #endif - SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx " - "kn 0x%llx (%s%s) hint 0x%x", ch->ch_na->na_name, + SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p " + "kn %p (%s%s) hint 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "", (events == POLLOUT) ? "write" : "read", @@ -807,7 +808,7 @@ filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev) CSI_UNLOCK(csi); - SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s)", + SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s%s)", na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "", (ev == EVFILT_WRITE) ? 
"write" : "read"); @@ -909,9 +910,9 @@ filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev) struct ch_selinfo *csi; long hint = 0; - _CASSERT(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE); - _CASSERT(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT); - _CASSERT(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD); + static_assert(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE); + static_assert(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT); + static_assert(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD); ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL); @@ -936,7 +937,7 @@ filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev) /* on registration force an event */ hint |= CHAN_FILT_HINT_FLOW_ADV_UPD; } - SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)", + SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s)", ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn), "EVFILT_NW_CHANNEL"); return filt_chan_extended_common(kn, hint); @@ -963,7 +964,7 @@ filt_che_detach(struct knote *kn) CSI_UNLOCK(csi); lck_mtx_unlock(&ch->ch_lock); - SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)", + SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p kn %p (%s)", ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn), "EVFILT_NW_CHANNEL"); } @@ -983,9 +984,8 @@ filt_che_event(struct knote *kn, long hint) if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) { VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0); } - SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx hint 0x%b)", - ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint, - CHAN_FILT_HINT_BITS); + SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) ch %p hint 0x%lx)", + ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint); return filt_chan_extended_common(kn, hint); } @@ -1060,6 +1060,7 @@ int ch_kqfilter(struct kern_channel *ch, struct knote *kn, struct kevent_qos_s *kev) { + SK_LOG_VAR(char dbgbuf[CH_DBGBUF_SIZE]); int result; lck_mtx_lock(&ch->ch_lock); @@ -1067,8 +1068,8 @@ ch_kqfilter(struct kern_channel *ch, struct knote *kn, if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) || na_reject_channel(ch, ch->ch_na))) { - SK_ERR("%s(%d): channel is non-permissive, flags 0x%b", ch->ch_name, - ch->ch_pid, ch->ch_flags, CHANF_BITS); + SK_ERR("channel is non-permissive %s", + ch2str(ch, dbgbuf, sizeof(dbgbuf))); knote_set_error(kn, ENXIO); lck_mtx_unlock(&ch->ch_lock); return 0; @@ -1132,9 +1133,9 @@ ch_event_log(const char *prefix, const struct kern_channel *ch, struct proc *p, const struct nexus_adapter *na, int events, int revents) { - SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (0x%llx) ch 0x%llx %s(%d) " - "th 0x%llx ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na), - SK_KVA(ch), sk_proc_name_address(p), sk_proc_pid(p), + SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (%p) ch %p %s(%d) " + "th %p ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na), + SK_KVA(ch), sk_proc_name(p), sk_proc_pid(p), SK_KVA(current_thread()), events, revents); } #endif /* SK_LOG */ @@ -1205,7 +1206,7 @@ ch_event(struct kern_channel *ch, int events, void *wql, protect = sk_sync_protect(); /* update our work timestamp */ - na->na_work_ts = _net_uptime; + na->na_work_ts = net_uptime(); /* and make this channel eligible for draining again */ if (na->na_flags & NAF_DRAINING) { @@ -1493,8 +1494,6 @@ ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id) /* see comments in ch_open() */ if (cinfo->cinfo_nx_port != port) { continue; - } else if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) { - continue; } else 
if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY && ring_id != cinfo->cinfo_ch_ring_id && ring_id != CHANNEL_RING_ID_ANY) { @@ -1521,18 +1520,17 @@ ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port) uuid_string_t uuidstr; SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u", - sk_proc_name_address(p), sk_proc_pid(p), proc_uniqueid(p), + sk_proc_name(p), sk_proc_pid(p), proc_uniqueid(p), sk_uuid_unparse(p_uuid, uuidstr), port); } SK_LOG_ATTRIBUTE static void ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring, - uint32_t mode, const char *mode_bits, int err) + uint32_t mode, int err) { - SK_D("%s(%d) port %u ring %d mode 0x%b err %d", - sk_proc_name_address(p), sk_proc_pid(p), port, (int)ring, - mode, mode_bits, err); + SK_D("%s(%d) port %u ring %d mode 0x%x err %d", + sk_proc_name(p), sk_proc_pid(p), port, (int)ring, mode, err); } #endif /* SK_LOG */ @@ -1583,33 +1581,13 @@ ch_open(struct ch_init *init, struct proc *p, int fd, int *err) } } - /* "no copy" is valid only when at least one tx/rx mon flag is set */ - if (!(mode & CHMODE_MONITOR) && (mode & CHMODE_MONITOR_NO_COPY)) { - mode &= ~CHMODE_MONITOR_NO_COPY; - } - - if (mode & CHMODE_MONITOR) { - if ((*err = skywalk_priv_check_cred(p, cred, - PRIV_SKYWALK_OBSERVE_ALL)) != 0) { - goto done; - } - /* Don't allow non-root processes to monitor channels. */ - if (kauth_cred_issuser(cred) == 0) { - *err = EPERM; - goto done; - } - } - /* * Check with the nexus to see if the port is bound; if so, prepare * our nxbind structure that we'll need to pass down to the nexus * for it compare. If the caller provides a key, we take it over * and will free it ourselves (as part of freeing nxbind.) - * - * If this is a monitor channel, skip this altogether since the check - * for PRIV_SKYWALK_OBSERVE_ALL privilege has been done above. */ - if (!(mode & CHMODE_MONITOR) && !NX_ANONYMOUS_PROV(nx)) { + if (!NX_ANONYMOUS_PROV(nx)) { /* * -fbounds-safety: ci_key is user_addr_t (aka uint64_t), so * can't mark it as __sized_by. Forge it instead. @@ -1638,54 +1616,36 @@ ch_open(struct ch_init *init, struct proc *p, int fd, int *err) } /* - * There can only be one owner of {port,ring_id} tuple. Once - * owned, this can be made available among multiple monitors. + * There can only be one owner of {port,ring_id} tuple. * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over * all rings. Further attempts to own any or all of the rings * will be declined. * - * Multiple monitors are allowed to exist. If a channel has been - * bound to CHANNEL_RING_ID_ANY, any or all of its rings can be - * monitored. If an owning channel has been bound to an individual - * ring, only that ring can be monitored, either by specifying the - * equivalent ring_id or CHANNEL_RING_ID_ANY at monitor open time. - * * For example, assuming a 2-rings setup for port 'p': * * owner{p,-1} - * will allow: - * monitor{p,-1}, monitor{p,0}, monitor{p,1} * will not allow: * owner{p,-1}, owner{p,0}, owner{p,1} * * owner{p,0} * will allow: - * owner{p,1}, monitor{p,-1}, monitor{p,0} + * owner{p,1} * will not allow: - * owner{p,-1}, owner{p,0}, monitor{p,1} + * owner{p,-1}, owner{p,0} */ if ((ch0 = ch_find(nx, port, ring)) != NULL) { - SK_D("found ch0 0x%llx", SK_KVA(ch0)); - /* - * Unless this is a monitor channel, allow only at - * most one owner of the {port,ring_id} tuple. - */ - if (!(mode & CHMODE_MONITOR)) { + SK_D("found ch0 %p", SK_KVA(ch0)); #if SK_LOG - uuid_string_t uuidstr; - char *na_name = (ch0->ch_na != NULL) ? 
- ch0->ch_na->na_name : ""; + uuid_string_t uuidstr; + char *na_name = (ch0->ch_na != NULL) ? + ch0->ch_na->na_name : ""; - SK_DSC(p, "ch %s flags (0x%x) exists on port %d on " - "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port, - sk_uuid_unparse(nx->nx_uuid, uuidstr), - ch0->ch_name, ch0->ch_pid); + SK_PERR(p, "ch %s flags (0x%x) exists on port %d on " + "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port, + sk_uuid_unparse(nx->nx_uuid, uuidstr), + ch0->ch_name, ch0->ch_pid); #endif /* SK_LOG */ - *err = EBUSY; - goto done; - } - } else if (mode & CHMODE_MONITOR) { - *err = ENXIO; + *err = EBUSY; goto done; } @@ -1697,13 +1657,13 @@ ch_open(struct ch_init *init, struct proc *p, int fd, int *err) chr.cr_ring_id = ring; /* upon success, returns a channel with reference held */ - ch = ch_connect(nx, &chr, ch0, nxb, p, fd, err); + ch = ch_connect(nx, &chr, nxb, p, fd, err); done: #if SK_LOG if (__improbable(sk_verbose != 0)) { - ch_open_log2(p, port, ring, mode, CHMODE_BITS, *err); + ch_open_log2(p, port, ring, mode, *err); } #endif /* SK_LOG */ @@ -1749,7 +1709,7 @@ ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref, } /* upon success, returns a channel with reference held */ - ch = ch_connect(nx, chr, NULL, NULL, kernproc, -1, err); + ch = ch_connect(nx, chr, NULL, kernproc, -1, err); if (ch != NULL) { /* * nonxref channels don't hold any reference to the nexus, @@ -1778,13 +1738,12 @@ ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref, if (nx->nx_prov != NULL) { nxdom_prov_name = NX_DOM_PROV(nx)->nxdom_prov_name; } - SK_D("nx 0x%llx (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%b err %d", + SK_D("nx %p (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%x err %d", SK_KVA(nx), (nxdom_prov_name != NULL) ? nxdom_prov_name : "", (na_name != NULL) ? na_name : "", (int)chr->cr_port, (int)chr->cr_ring_id, - sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode, - CHMODE_BITS, *err); + sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode, *err); #endif /* SK_LOG */ done: @@ -1806,12 +1765,11 @@ ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special) const char *nxdom_prov_name = (ch->ch_nexus != NULL) ? 
NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : ""; - SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)", + SK_D("ch %p (%s:%s:\"%s\":%u:%d) uuid %s flags 0x%x", SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name, - ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); - SK_D(" UUID: %s", sk_uuid_unparse(ch->ch_info->cinfo_ch_id, - uuidstr)); - SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS); + ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id, + sk_uuid_unparse(ch->ch_info->cinfo_ch_id, uuidstr), + ch->ch_flags); #endif /* SK_LOG */ struct kern_nexus *nx = ch->ch_nexus; @@ -1977,11 +1935,11 @@ ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo, ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL)); if (ch_schema != NULL) { - SK_D("channel_schema at 0x%llx", SK_KVA(ch_schema)); + SK_D("channel_schema at %p", SK_KVA(ch_schema)); SK_D(" kern_name: \"%s\"", ch_schema->csm_kern_name); SK_D(" kern_uuid: %s", sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr)); - SK_D(" flags: 0x%b", ch_schema->csm_flags, CSM_BITS); + SK_D(" flags: 0x%x", ch_schema->csm_flags); SK_D(" tx_rings: %u [%u,%u]", ch_schema->csm_tx_rings, cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring); SK_D(" rx_rings: %u [%u,%u]", ch_schema->csm_rx_rings, @@ -2011,27 +1969,27 @@ ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo, SK_D(" nexusadv_ofs: 0x%llx", ch_schema->csm_nexusadv_ofs); } - SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)", + SK_D("ch %p (%s:%s:\"%s\":%u:%d)", SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name, nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id); SK_D(" ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr)); SK_D(" nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr)); - SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS); - SK_D(" task: 0x%llx %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask), - sk_proc_name_address(p), sk_proc_pid(p)); + SK_D(" flags: 0x%x", ch->ch_flags); + SK_D(" task: %p %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask), + sk_proc_name(p), sk_proc_pid(p)); SK_D(" txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value, ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ? "bytes" : "slots")); SK_D(" rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value, ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ? 
"bytes" : "slots")); - SK_D(" mmapref: 0x%llx", SK_KVA(ch->ch_mmap.ami_mapref)); + SK_D(" mmapref: %p", SK_KVA(ch->ch_mmap.ami_mapref)); SK_D(" mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base); - SK_D(" mapsize: 0x%llx (%llu KB)", + SK_D(" mapsize: %llu (%llu KB)", (uint64_t)cinfo->cinfo_mem_map_size, (uint64_t)cinfo->cinfo_mem_map_size >> 10); - SK_D(" memsize: 0x%llx (%llu KB)", + SK_D(" memsize: %llu (%llu KB)", (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10); SK_D(" offset: 0x%llx", (uint64_t)cinfo->cinfo_schema_offset); } @@ -2048,8 +2006,8 @@ ch_connect_log2(const struct kern_nexus *nx, int err) #endif /* SK_LOG */ static struct kern_channel * -ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0, - struct nxbind *nxb, struct proc *p, int fd, int *err) +ch_connect(struct kern_nexus *nx, struct chreq *chr, struct nxbind *nxb, + struct proc *p, int fd, int *err) { struct kern_nexus_domain_provider *nxdom_prov; struct kern_channel *ch = NULL; @@ -2136,11 +2094,11 @@ ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0, goto done; } - SK_D("%s(%d) %snexus port %u requested", sk_proc_name_address(p), - sk_proc_pid(p), reserved_port ? "[reserved] " : "", chr->cr_port); + SK_PDF(SK_VERB_CHANNEL, p, "%snexus port %u requested", + reserved_port ? "[reserved] " : "", chr->cr_port); if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov, - nx, ch, chr, ch0, nxb, p)) != 0) { + nx, ch, chr, nxb, p)) != 0) { goto done; } @@ -2501,7 +2459,7 @@ ch_free(struct kern_channel *ch) ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT | CHANF_IF_ADV))); lck_mtx_destroy(&ch->ch_lock, &channel_lock_group); - SK_DF(SK_VERB_MEM, "ch 0x%llx FREE", SK_KVA(ch)); + SK_DF(SK_VERB_MEM, "ch %p FREE", SK_KVA(ch)); ASSERT(ch->ch_info != NULL); zfree(ch_info_zone, ch->ch_info); ch->ch_info = NULL; @@ -2552,16 +2510,29 @@ ch_release(struct kern_channel *ch) return lastref; } -/* - * -fbounds-safety: Why is the arg void *? 
All callers pass struct kern_channel * - */ void -ch_dtor(struct kern_channel *arg) +ch_dtor(struct kern_channel *ch) { - struct kern_channel *ch = arg; - SK_LOCK(); ch_close(ch, TRUE); (void) ch_release_locked(ch); SK_UNLOCK(); } + +void +ch_update_upp_buf_stats(struct kern_channel *ch, struct kern_pbufpool *pp) +{ + uint64_t buf_inuse = pp->pp_u_bufinuse; + struct __user_channel_schema *csm = ch->ch_schema; + os_atomic_store(&csm->csm_upp_buf_inuse, buf_inuse, relaxed); +} + +SK_NO_INLINE_ATTRIBUTE +char * +ch2str(const struct kern_channel *ch, char *__counted_by(dsz)dst, size_t dsz) +{ + (void) sk_snprintf(dst, dsz, "%p %s flags 0x%b", + SK_KVA(ch), ch->ch_name, ch->ch_flags, CHANF_BITS); + + return dst; +} diff --git a/bsd/skywalk/channel/channel_kern.c b/bsd/skywalk/channel/channel_kern.c index 0e6b5532c..fc0d190b0 100644 --- a/bsd/skywalk/channel/channel_kern.c +++ b/bsd/skywalk/channel/channel_kern.c @@ -356,15 +356,15 @@ kern_channel_tx_refill_common(const kern_channel_ring_t hw_kring, goto out; } - if (__improbable(!IF_FULLY_ATTACHED(ifp))) { - SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached", + if (__improbable(!ifnet_is_fully_attached(ifp))) { + SK_ERR("hwna %p ifp %s (%p), interface not attached", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp)); rc = ENXIO; goto out; } if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) { - SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), " + SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna %p ifp %s (%p), " "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp)); rc = ENXIO; goto out; @@ -382,7 +382,7 @@ kern_channel_tx_refill_common(const kern_channel_ring_t hw_kring, if (__improbable(KR_DROP(hw_kring) || !NA_IS_ACTIVE(hw_kring->ckr_na))) { kr_exit(hw_kring); - SK_ERR("hw-kr 0x%llx stopped", SK_KVA(hw_kring)); + SK_ERR("hw-kr %p stopped", SK_KVA(hw_kring)); rc = ENXIO; goto out; } @@ -468,7 +468,7 @@ _kern_channel_flowadv_signal(struct flowadv_fcentry *fce, flow_adv_type_t type) struct nx_flowswitch *fsw; flow_adv_func_type_t flow_adv_func = NULL; - _CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token)); + static_assert(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token)); if (type == FLOW_ADV_SIGNAL_SUSPEND) { flow_adv_func = na_flowadv_set; @@ -482,7 +482,7 @@ _kern_channel_flowadv_signal(struct flowadv_fcentry *fce, flow_adv_type_t type) } else { LCK_RW_ASSERT(&fsw_ifp_to_fsw(ifp)->fsw_lock, LCK_RW_ASSERT_SHARED); } - if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) { + if (ifnet_is_fully_attached(ifp) == false || ifp->if_na == NULL) { goto done; } @@ -541,8 +541,8 @@ kern_channel_flowadv_set(struct flowadv_fcentry *fce) } void -kern_channel_flowadv_report_ce_event(struct flowadv_fcentry *fce, - uint32_t ce_cnt, uint32_t total_pkt_cnt) +kern_channel_flowadv_report_congestion_event(struct flowadv_fcentry *fce, + uint32_t congestion_cnt, uint32_t l4s_ce_cnt, uint32_t total_pkt_cnt) { const flowadv_token_t ch_token = fce->fce_flowsrc_token; const flowadv_token_t flow_token = fce->fce_flowid; @@ -553,10 +553,10 @@ kern_channel_flowadv_report_ce_event(struct flowadv_fcentry *fce, struct kern_channel *ch = NULL; struct nx_flowswitch *fsw; - _CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token)); + static_assert(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token)); SK_LOCK(); - if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) { + if (ifnet_is_fully_attached(ifp) == false || ifp->if_na == NULL) { goto done; } @@ -580,8 +580,8 @@ 
kern_channel_flowadv_report_ce_event(struct flowadv_fcentry *fce, if (ch != NULL) { if (ch->ch_na != NULL && - na_flowadv_report_ce_event(ch, flow_fidx, flow_token, - ce_cnt, total_pkt_cnt)) { + na_flowadv_report_congestion_event(ch, flow_fidx, flow_token, + congestion_cnt, l4s_ce_cnt, total_pkt_cnt)) { SK_DF(SK_VERB_FLOW_ADVISORY, "%s(%d) notified of flow update", ch->ch_name, ch->ch_pid); @@ -616,9 +616,9 @@ kern_channel_memstatus(struct proc *p, uint32_t status, return; } - SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b status %d", - sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch), - ch->ch_flags, CHANF_BITS, status); + SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%p flags 0x%x status %d", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(ch), + ch->ch_flags, status); /* serialize accesses against channel syscalls */ lck_mtx_lock(&ch->ch_lock); @@ -676,9 +676,9 @@ kern_channel_defunct(struct proc *p, struct kern_channel *ch) return; } - SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b", - sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch), - ch->ch_flags, CHANF_BITS); + SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%p flags 0x%x", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(ch), + ch->ch_flags); /* serialize accesses against channel syscalls */ lck_mtx_lock(&ch->ch_lock); diff --git a/bsd/skywalk/channel/channel_ring.c b/bsd/skywalk/channel/channel_ring.c index cba240e2e..ebae39b23 100644 --- a/bsd/skywalk/channel/channel_ring.c +++ b/bsd/skywalk/channel/channel_ring.c @@ -28,6 +28,7 @@ #include #include +#include #include static void kr_update_user_stats(struct __kern_channel_ring *, @@ -130,7 +131,7 @@ kr_enter(struct __kern_channel_ring *kr, boolean_t can_sleep) lck_spin_unlock(&kr->ckr_slock); (void) thread_block(THREAD_CONTINUE_NULL); SK_DF(SK_VERB_LOCKS, "waited for kr \"%s\" " - "(0x%llx) busy=%u", kr->ckr_name, + "(%p) busy=%u", kr->ckr_name, SK_KVA(kr), kr->ckr_busy); lck_spin_lock(&kr->ckr_slock); } @@ -142,7 +143,7 @@ kr_enter(struct __kern_channel_ring *kr, boolean_t can_sleep) done: lck_spin_unlock(&kr->ckr_slock); - SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right acquired", + SK_DF(SK_VERB_LOCKS, "kr \"%s\" (%p) right acquired", kr->ckr_name, SK_KVA(kr)); return 0; @@ -174,7 +175,7 @@ kr_exit(struct __kern_channel_ring *kr) lck_spin_unlock(&kr->ckr_slock); } - SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right released (%u waiters)", + SK_DF(SK_VERB_LOCKS, "kr \"%s\" (%p) right released (%u waiters)", kr->ckr_name, SK_KVA(kr), want); } @@ -191,7 +192,7 @@ kr_start(struct __kern_channel_ring *kr) kr_exit(kr); - SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) is started", + SK_DF(SK_VERB_LOCKS, "kr \"%s\" (%p) is started", kr->ckr_name, SK_KVA(kr)); } @@ -217,8 +218,8 @@ kr_stop(struct __kern_channel_ring *kr, uint32_t state) lck_spin_unlock(&kr->ckr_slock); SK_DF(SK_VERB_LOCKS, - "kr \"%s\" (0x%llx) krflags 0x%b is now stopped s=%u", - kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS, state); + "kr \"%s\" (0x%p) krflags 0x%x is now stopped s=%u", + kr->ckr_name, SK_KVA(kr), kr->ckr_flags, state); } static void @@ -321,6 +322,7 @@ kr_update_stats(struct __kern_channel_ring *kring, uint32_t slot_count, } now = net_uptime(); + stats->crs_last_update_net_uptime = now; if (__probable(kring->ckr_accumulate_start != 0)) { diff_secs = now - kring->ckr_accumulate_start; if (diff_secs >= kr_accumulate_interval) { @@ -383,10 +385,8 @@ kr_log_bad_ring(struct __kern_channel_ring *kring) slot_idx_t i; int errors = 0; - // XXX KASSERT nm_kr_tryget - SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b", kring->ckr_name, - 
SK_KVA(kring), kring->ckr_flags, CKRF_BITS); - // XXX probably wrong to trust userspace + SK_ERR("kr \"%s\" (0x%p) krflags 0x%x", kring->ckr_name, SK_KVA(kring), + kring->ckr_flags); if (ring->ring_head > lim) { errors++; @@ -414,11 +414,11 @@ kr_log_bad_ring(struct __kern_channel_ring *kring) if (errors != 0) { SK_ERR("total %d errors", errors); - SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b crash, " - "head %u -> %u tail %u -> %u", kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, ring->ring_head, - kring->ckr_rhead, kring->ckr_khead, - ring->ring_tail, kring->ckr_ktail); + SK_ERR("kr \"%s\" (0x%p) krflags 0x%x crash, " + "head %u/%u -> %u tail %u/%u -> %u", kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, ring->ring_head, + kring->ckr_rhead, kring->ckr_khead, ring->ring_tail, + kring->ckr_rtail, kring->ckr_ktail); } } #endif /* SK_LOG */ @@ -490,9 +490,9 @@ kr_txprologue(struct kern_channel *ch, struct __kern_channel_ring *kring, /* Internalize */ err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p); if (__improbable(err != 0)) { - SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped " + SK_ERR("%s(%d) kr \"%s\" (%p) slot %u dropped " "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u", - sk_proc_name_address(p), sk_proc_pid(p), + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), slot_idx, err, kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, @@ -538,9 +538,9 @@ kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring, kqum = pp_remove_upp_locked(pp, usd->sd_md_idx, &err); if (__improbable(err != 0)) { if (kqum != NULL) { - SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u " + SK_ERR("%s(%d) kr \"%s\" (%p) slot %u " "kqum %p, bad buflet chain", - sk_proc_name_address(p), sk_proc_pid(p), + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), slot_idx, SK_KVA(kqum)); *err_reason = @@ -548,10 +548,10 @@ kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring, goto done; } - SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u " + SK_ERR("%s(%d) kr \"%s\" (%p) slot %u " " unallocated packet %u kh %u kt %u | " "rh %u rt %u | h %u t %u", - sk_proc_name_address(p), sk_proc_pid(p), + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), slot_idx, usd->sd_md_idx, kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, @@ -573,9 +573,9 @@ kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring, /* Internalize */ err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p); if (__improbable(err != 0)) { - SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped " + SK_ERR("%s(%d) kr \"%s\" (%p) slot %u dropped " "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u", - sk_proc_name_address(p), sk_proc_pid(p), + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), slot_idx, err, kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, @@ -678,7 +678,7 @@ kr_txsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring, ckr_rtail = kring->ckr_rtail; SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, ckr_rtail, ring->ring_head, ring->ring_tail); @@ -711,11 +711,11 @@ kr_txsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring, return head; error: - SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | " - "rh %u rt %u | h %u t 
%u |", sk_proc_name_address(p), + SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | " + "rh %u rt %u | h %u t %u |", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead, - ckr_rtail, head, ring->ring_tail); + ckr_khead, ckr_ktail, kring->ckr_rhead, ckr_rtail, head, + ring->ring_tail); skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_TX_SYNC); @@ -744,7 +744,7 @@ kr_free_sync_prologue(struct __kern_channel_ring *kring, struct proc *p) ckr_rtail = kring->ckr_rtail; SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, ckr_rtail, ring->ring_head, ring->ring_tail); @@ -755,11 +755,11 @@ kr_free_sync_prologue(struct __kern_channel_ring *kring, struct proc *p) return head; error: - SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | " - "rh %u rt %u | h %u t %u |", sk_proc_name_address(p), + SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | " + "rh %u rt %u | h %u t %u |", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead, - ckr_rtail, head, ring->ring_tail); + ckr_khead, ckr_ktail, kring->ckr_rhead, ckr_rtail, head, + ring->ring_tail); skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_FREE_SYNC); return kring->ckr_num_slots; @@ -809,9 +809,9 @@ kr_rxprologue(struct kern_channel *ch, struct __kern_channel_ring *kring, * subtract byte counts from slots just given back to the kernel. */ if (kring->ckr_ready_bytes < *byte_count) { - SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes " - "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, + SK_ERR("%s(%d) kr \"%s\" (%p) inconsistent ready bytes " + "(%llu < %u) kh %u kt %u | rh %u rt %u | h %u t %u", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_ready_bytes, *byte_count, kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, kring->ckr_ring->ring_head, @@ -856,9 +856,9 @@ kr_rxprologue_nodetach(struct kern_channel *ch, * subtract byte counts from slots just given back to the kernel. 
*/ if (kring->ckr_ready_bytes < *byte_count) { - SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes " - "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, + SK_ERR("%s(%d) kr \"%s\" (%p) inconsistent ready bytes " + "(%llu < %u) kh %u kt %u | rh %u rt %u | h %u t %u", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_ready_bytes, *byte_count, kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, kring->ckr_ring->ring_head, @@ -866,7 +866,7 @@ kr_rxprologue_nodetach(struct kern_channel *ch, *err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES; #if (DEVELOPMENT || DEBUG) if (kr_disable_panic_on_sync_err == 0) { - panic("kr(0x%llx), inconsistent, head %u, ready %llu, " + panic("kr(%p), inconsistent, head %u, ready %llu, " "cnt %u", SK_KVA(kring), head, kring->ckr_ready_bytes, *byte_count); /* NOTREACHED */ @@ -908,9 +908,9 @@ kr_rxprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring, */ ASSERT(!KSD_VALID_METADATA(KR_KSD(kring, slot_idx))); if (SD_VALID_METADATA(usd)) { - SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not " + SK_ERR("%s(%d) kr \"%s\" (%p) slot %u not " "detached md %u kh %u kt %u | rh %u rt %u |" - " h %u t %u", sk_proc_name_address(p), + " h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), slot_idx, usd->sd_md_idx, kring->ckr_khead, kring->ckr_ktail, @@ -930,9 +930,9 @@ kr_rxprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring, * subtract byte counts from slots just given back to the kernel */ if (kring->ckr_ready_bytes < *byte_count) { - SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes " - "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, + SK_ERR("%s(%d) kr \"%s\" (%p) inconsistent ready bytes " + "(%llu < %u) kh %u kt %u | rh %u rt %u | h %u t %u", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_ready_bytes, *byte_count, kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, kring->ckr_ring->ring_head, @@ -999,7 +999,7 @@ kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring, ckr_ktail = kring->ckr_ktail; SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, ring->ring_head, ring->ring_tail); @@ -1033,7 +1033,7 @@ kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring, /* Update Rx dequeue timestamp */ if (slot_count > 0) { - kring->ckr_rx_dequeue_ts = _net_uptime; + kring->ckr_rx_dequeue_ts = net_uptime(); } /* update the kernel view of ring */ @@ -1041,11 +1041,10 @@ kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring, return head; error: - SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | " + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, ckr_khead, ckr_ktail, - kring->ckr_rhead, kring->ckr_rtail, + ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, ring->ring_head, ring->ring_tail); skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_RX_SYNC); @@ -1073,7 +1072,7 @@ 
kr_alloc_sync_prologue(struct __kern_channel_ring *kring, struct proc *p) head = ring->ring_head; SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, head, ring->ring_tail); @@ -1089,11 +1088,10 @@ kr_alloc_sync_prologue(struct __kern_channel_ring *kring, struct proc *p) return head; error: - SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | " + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, ckr_khead, ckr_ktail, - kring->ckr_rhead, kring->ckr_rtail, + ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, ring->ring_head, ring->ring_tail); skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_ALLOC_SYNC); @@ -1260,7 +1258,7 @@ kr_txsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring, *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead; SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, kring->ckr_ring->ring_head, @@ -1354,6 +1352,7 @@ kr_rxfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring, byte_count += kqum->qum_len; slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim); } + ch_update_upp_buf_stats(ch, pp); PP_UNLOCK(pp); kring->ckr_ready_bytes += byte_count; @@ -1410,7 +1409,7 @@ kr_rxsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring, *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead; SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, kring->ckr_ring->ring_head, @@ -1437,7 +1436,7 @@ kr_alloc_sync_finalize(struct __kern_channel_ring *kring, struct proc *p) SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | " "rh %u rt %u | h %u t %u | ws %u", - sk_proc_name_address(p), + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, kring->ckr_ring->ring_head, @@ -1461,7 +1460,7 @@ kr_free_sync_finalize(struct __kern_channel_ring *kring, struct proc *p) *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead; SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, kring->ckr_ring->ring_head, @@ -1485,7 +1484,7 @@ kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p) head = ring->ring_head; SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, head, ring->ring_tail); @@ -1509,9 +1508,9 @@ kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p) */ VERIFY(!KSD_VALID_METADATA(ksd)); if 
(__improbable(SD_VALID_METADATA(usd))) { - SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not " + SK_ERR("%s(%d) kr \"%s\" (%p) slot %u not " "detached md %u kh %u kt %u | rh %u rt %u |" - " h %u t %u", sk_proc_name_address(p), + " h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), slot_idx, usd->sd_md_idx, ckr_khead, ckr_ktail, kring->ckr_rhead, @@ -1528,11 +1527,10 @@ kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p) return head; error: - SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | " + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, ckr_khead, ckr_ktail, - kring->ckr_rhead, kring->ckr_rtail, + ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, ring->ring_head, ring->ring_tail); skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_EVENT_SYNC); @@ -1579,6 +1577,7 @@ kr_event_sync_finalize(struct kern_channel *ch, ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0); slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim); } + ch_update_upp_buf_stats(ch, pp); PP_UNLOCK(pp); /* just recalculate slot count using pointer arithmetic */ @@ -1594,7 +1593,7 @@ kr_event_sync_finalize(struct kern_channel *ch, *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead; SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | " - "rh %u rt %u | h %u t %u", sk_proc_name_address(p), + "rh %u rt %u | h %u t %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail, kring->ckr_ring->ring_head, @@ -1640,8 +1639,6 @@ kr_internalize_metadata(struct kern_channel *ch, struct __user_quantum *uqum; /* user source */ struct __user_packet *upkt; struct __kern_packet *kpkt; - const nexus_meta_type_t md_type = METADATA_TYPE(kqum); - const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum); uint32_t len = 0, bdoff, bdlim; uint16_t bcnt = 0, bmax, i; boolean_t dropped; @@ -1654,8 +1651,8 @@ kr_internalize_metadata(struct kern_channel *ch, */ ASSERT(kqum->qum_pp == kring->ckr_pp); - _CASSERT(sizeof(uqum->qum_com) == sizeof(kqum->qum_com)); - _CASSERT(sizeof(upkt->pkt_com) == sizeof(kpkt->pkt_com)); + static_assert(sizeof(uqum->qum_com) == sizeof(kqum->qum_com)); + static_assert(sizeof(upkt->pkt_com) == sizeof(kpkt->pkt_com)); uqum = __DECONST(struct __user_quantum *, kqum->qum_user); ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL); upkt = SK_PTR_ADDR_UPKT(uqum); @@ -1663,8 +1660,8 @@ kr_internalize_metadata(struct kern_channel *ch, DTRACE_SKYWALK3(internalize, struct __kern_channel_ring *, kring, struct __kern_packet *, kpkt, struct __user_packet *, upkt); - SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx uqum 0x%llx -> kqum 0x%llx", - sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring), + SK_DF(SK_VERB_MEM, "%s(%d) kring %p uqum %p -> kqum %p", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(kring), SK_KVA(uqum), SK_KVA(kqum)); /* check if it's dropped before we internalize it */ @@ -1681,55 +1678,32 @@ kr_internalize_metadata(struct kern_channel *ch, /* if marked as dropped, don't bother going further */ if (__improbable(dropped)) { - SK_ERR("%s(%d) kring 0x%llx dropped", - sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring)); + SK_ERR("%s(%d) kring %p dropped", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(kring)); err = ERANGE; goto done; } - switch (md_type) { - case 
NEXUS_META_TYPE_PACKET: - /* - * Internalize common packet metadata. - */ - _PKT_INTERNALIZE(upkt, kpkt); + /* + * Internalize common packet metadata. + */ + _PKT_INTERNALIZE(upkt, kpkt); - switch (md_subtype) { - case NEXUS_META_SUBTYPE_PAYLOAD: - /* sanitize link layer fields for payload mode */ - kpkt->pkt_link_flags = 0; - break; - default: - break; - } + if (__probable(ch != NULL)) { + _UUID_COPY(kpkt->pkt_flowsrc_id, + ch->ch_info->cinfo_ch_id); + } - if (__probable(ch != NULL)) { - _UUID_COPY(kpkt->pkt_flowsrc_id, - ch->ch_info->cinfo_ch_id); - } - - bcnt = upkt->pkt_bufs_cnt; - bmax = kpkt->pkt_bufs_max; - ASSERT(bmax == maxfrags); - if (__improbable((bcnt == 0) || (bcnt > bmax) || - (upkt->pkt_bufs_max != bmax))) { - SK_ERR("%s(%d) kring 0x%llx bad bufcnt %d, %d, %d", - sk_proc_name_address(p), sk_proc_pid(p), - SK_KVA(kring), bcnt, bmax, upkt->pkt_bufs_max); - err = ERANGE; - goto done; - } - break; - - case NEXUS_META_TYPE_QUANTUM: - ASSERT(maxfrags == 1); - bcnt = bmax = 1; - break; - - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); + bcnt = upkt->pkt_bufs_cnt; + bmax = kpkt->pkt_bufs_max; + ASSERT(bmax == maxfrags); + if (__improbable((bcnt == 0) || (bcnt > bmax) || + (upkt->pkt_bufs_max != bmax))) { + SK_ERR("%s(%d) kring %p bad bufcnt %d, %d, %d", + sk_proc_name(p), sk_proc_pid(p), + SK_KVA(kring), bcnt, bmax, upkt->pkt_bufs_max); + err = ERANGE; + goto done; } ASSERT(bcnt != 0); @@ -1740,9 +1714,9 @@ kr_internalize_metadata(struct kern_channel *ch, * Validate and internalize buflets. */ for (i = 0; i < bcnt; i++) { - _CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0); - _CASSERT(offsetof(struct __user_packet, pkt_qum) == 0); - _CASSERT(offsetof(struct __kern_quantum, qum_com) == 0); + static_assert(offsetof(struct __kern_packet, pkt_qum) == 0); + static_assert(offsetof(struct __user_packet, pkt_qum) == 0); + static_assert(offsetof(struct __kern_quantum, qum_com) == 0); PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf); ASSERT(kbuf != NULL); if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) { @@ -1778,8 +1752,8 @@ kr_internalize_metadata(struct kern_channel *ch, if (__improbable(!BUF_IN_RANGE(kbuf) || ubuf->buf_idx != kbuf->buf_idx)) { kbuf->buf_dlen = kbuf->buf_doff = 0; - SK_ERR("%s(%d) kring 0x%llx bad bufidx 0x%x, 0x%x", - sk_proc_name_address(p), sk_proc_pid(p), + SK_ERR("%s(%d) kring %p bad bufidx 0x%x, 0x%x", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(kring), kbuf->buf_idx, ubuf->buf_idx); err = ERANGE; goto done; @@ -1796,62 +1770,36 @@ kr_internalize_metadata(struct kern_channel *ch, pkbuf = kbuf; } - _CASSERT(offsetof(struct __kern_packet, pkt_length) == - offsetof(struct __kern_packet, pkt_qum.qum_len)); + static_assert(offsetof(struct __kern_packet, pkt_length) == offsetof(struct __kern_packet, pkt_qum.qum_len)); if (__improbable(kpkt->pkt_length != len)) { - SK_ERR("%s(%d) kring 0x%llx bad pktlen %d, %d", - sk_proc_name_address(p), sk_proc_pid(p), + SK_ERR("%s(%d) kring %p bad pktlen %d, %d", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(kring), kpkt->pkt_length, len); err = ERANGE; goto done; } - if ((err == 0) && (md_type == NEXUS_META_TYPE_PACKET)) { + if (err == 0) { bdlim = PP_BUF_SIZE_DEF(kqum->qum_pp); - switch (md_subtype) { - case NEXUS_META_SUBTYPE_RAW: - /* - * For a raw packet from user space we need to - * validate that headroom is sane and is in the - * first buflet. 
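/*
 * Illustrative sketch, not part of the imported diff: with the
 * metadata-type switch removed, kr_internalize_metadata() keeps only the
 * raw-packet sanity checks shown here -- the headroom must match the first
 * buflet's data offset, and headroom plus the link-layer header must still
 * fit inside that buflet.  A stripped-down, user-space version of the same
 * bounds check follows; the struct and field names are invented and stand
 * in for the kernel's packet/buflet fields.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_pkt {
	uint16_t headroom;   /* offset of the frame start in the first buffer */
	uint16_t l2_len;     /* link-layer header length */
};

static bool
validate_headroom(const struct fake_pkt *pkt, uint32_t buf_doff,
    uint32_t buf_dlim)
{
	/* Headroom must line up with where the data actually starts. */
	if (pkt->headroom != buf_doff) {
		return false;
	}
	/* Headroom plus the L2 header must stay inside the first buffer. */
	if ((uint32_t)pkt->headroom + pkt->l2_len >= buf_dlim) {
		return false;
	}
	return true;
}

int
main(void)
{
	struct fake_pkt ok  = { .headroom = 16,   .l2_len = 14 };
	struct fake_pkt bad = { .headroom = 2040, .l2_len = 14 };

	printf("ok:  %d\n", validate_headroom(&ok, 16, 2048));    /* prints 1 */
	printf("bad: %d\n", validate_headroom(&bad, 2040, 2048)); /* prints 0 */
	return 0;
}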
- */ - if (__improbable(kpkt->pkt_headroom != bdoff)) { - SK_ERR("%s(%d) kring 0x%llx bad headroom %d, %d", - sk_proc_name_address(p), sk_proc_pid(p), - SK_KVA(kring), kpkt->pkt_headroom, bdoff); - err = ERANGE; - goto done; - } - if (__improbable(kpkt->pkt_headroom + - kpkt->pkt_l2_len >= bdlim)) { - SK_ERR("%s(%d) kring 0x%llx bad headroom l2len %d, %d", - sk_proc_name_address(p), sk_proc_pid(p), - SK_KVA(kring), kpkt->pkt_l2_len, bdlim); - err = ERANGE; - goto done; - } - break; - case NEXUS_META_SUBTYPE_PAYLOAD: - /* - * For a payload packet from user space we need - * to validate that payload starts from 0 and L2 - * length is 0. - */ - if (__improbable((kpkt->pkt_headroom != 0) || - (kpkt->pkt_l2_len != 0))) { - SK_ERR("%s(%d) kring 0x%llx bad headroom " - "payload subtype %d headroom %d l2len %d", - sk_proc_name_address(p), sk_proc_pid(p), - SK_KVA(kring), SK_PTR_SUBTYPE(kpkt), - kpkt->pkt_headroom, kpkt->pkt_l2_len); - err = ERANGE; - goto done; - } - break; - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); + /* + * For a raw packet from user space we need to + * validate that headroom is sane and is in the + * first buflet. + */ + if (__improbable(kpkt->pkt_headroom != bdoff)) { + SK_ERR("%s(%d) kring %p bad headroom %d, %d", + sk_proc_name(p), sk_proc_pid(p), + SK_KVA(kring), kpkt->pkt_headroom, bdoff); + err = ERANGE; + goto done; + } + if (__improbable(kpkt->pkt_headroom + + kpkt->pkt_l2_len >= bdlim)) { + SK_ERR("%s(%d) kring %p bad headroom l2len %d, %d", + sk_proc_name(p), sk_proc_pid(p), + SK_KVA(kring), kpkt->pkt_l2_len, bdlim); + err = ERANGE; + goto done; } /* validate checksum offload properties */ @@ -1862,7 +1810,7 @@ kr_internalize_metadata(struct kern_channel *ch, start > kpkt->pkt_length || (stuff + sizeof(uint16_t)) > kpkt->pkt_length)) { SK_ERR("%s(%d) flags 0x%x start %u stuff %u " - "len %u", sk_proc_name_address(p), + "len %u", sk_proc_name(p), sk_proc_pid(p), kpkt->pkt_csum_flags, start, stuff, kpkt->pkt_length); err = ERANGE; @@ -1897,8 +1845,6 @@ kr_externalize_metadata_internal(struct __kern_channel_ring *kring, struct __user_quantum *uqum; /* user destination */ struct __user_packet *upkt; struct __kern_packet *kpkt; - const nexus_meta_type_t md_type = METADATA_TYPE(kqum); - const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum); uint32_t len = 0; uint16_t bcnt = 0, bmax, i; @@ -1910,8 +1856,8 @@ kr_externalize_metadata_internal(struct __kern_channel_ring *kring, ASSERT(kqum->qum_pp == kring->ckr_pp); ASSERT(kqum->qum_qflags & (QUM_F_FINALIZED | QUM_F_INTERNALIZED)); - _CASSERT(sizeof(kpkt->pkt_com) == sizeof(upkt->pkt_com)); - _CASSERT(sizeof(kqum->qum_com) == sizeof(uqum->qum_com)); + static_assert(sizeof(kpkt->pkt_com) == sizeof(upkt->pkt_com)); + static_assert(sizeof(kqum->qum_com) == sizeof(uqum->qum_com)); uqum = __DECONST(struct __user_quantum *, kqum->qum_user); ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL); upkt = SK_PTR_ADDR_UPKT(uqum); @@ -1919,8 +1865,8 @@ kr_externalize_metadata_internal(struct __kern_channel_ring *kring, DTRACE_SKYWALK3(externalize, struct __kern_channel_ring *, kring, struct __kern_packet *, kpkt, struct __user_packet *, upkt); - SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx kqum 0x%llx -> uqum 0x%llx", - sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring), + SK_DF(SK_VERB_MEM, "%s(%d) kring %p kqum %p -> uqum %p", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(kring), SK_KVA(kqum), SK_KVA(uqum)); /* @@ -1928,45 +1874,20 @@ kr_externalize_metadata_internal(struct __kern_channel_ring 
*kring, */ _QUM_EXTERNALIZE(kqum, uqum); - switch (md_type) { - case NEXUS_META_TYPE_PACKET: { - bcnt = kpkt->pkt_bufs_cnt; - bmax = kpkt->pkt_bufs_max; - ASSERT(bmax == maxfrags); - ASSERT(bcnt <= bmax); - /* - * Externalize common packet metadata. - */ - _PKT_EXTERNALIZE(kpkt, upkt); + bcnt = kpkt->pkt_bufs_cnt; + bmax = kpkt->pkt_bufs_max; + ASSERT(bmax == maxfrags); + ASSERT(bcnt <= bmax); + /* + * Externalize common packet metadata. + */ + _PKT_EXTERNALIZE(kpkt, upkt); - /* sanitize buflet count and limit (deconst) */ - _CASSERT(sizeof(upkt->pkt_bufs_max) == sizeof(uint16_t)); - _CASSERT(sizeof(upkt->pkt_bufs_cnt) == sizeof(uint16_t)); - *(uint16_t *)(uintptr_t)&upkt->pkt_bufs_max = bmax; - *(uint16_t *)(uintptr_t)&upkt->pkt_bufs_cnt = bcnt; - - switch (md_subtype) { - case NEXUS_META_SUBTYPE_PAYLOAD: - /* sanitize link layer fields for payload mode */ - upkt->pkt_headroom = 0; - upkt->pkt_link_flags = 0; - break; - default: - break; - } - break; - } - - case NEXUS_META_TYPE_QUANTUM: - ASSERT(maxfrags == 1); - bcnt = bmax = 1; - break; - - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - } + /* sanitize buflet count and limit (deconst) */ + static_assert(sizeof(upkt->pkt_bufs_max) == sizeof(uint16_t)); + static_assert(sizeof(upkt->pkt_bufs_cnt) == sizeof(uint16_t)); + *(uint16_t *)(uintptr_t)&upkt->pkt_bufs_max = bmax; + *(uint16_t *)(uintptr_t)&upkt->pkt_bufs_cnt = bcnt; ASSERT(bcnt != 0); /* @@ -1985,7 +1906,7 @@ kr_externalize_metadata_internal(struct __kern_channel_ring *kring, * Externalize buflets. */ for (i = 0; i < bcnt; i++) { - _CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0); + static_assert(offsetof(struct __kern_packet, pkt_qum) == 0); PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf); ASSERT(kbuf != NULL); @@ -2022,10 +1943,21 @@ kr_externalize_metadata_internal(struct __kern_channel_ring *kring, kqum->qum_qflags &= ~QUM_F_INTERNALIZED; } - void kr_externalize_metadata(struct __kern_channel_ring *kring, const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p) { kr_externalize_metadata_internal(kring, maxfrags, kqum, p); } + +SK_NO_INLINE_ATTRIBUTE +char * +kr2str(const struct __kern_channel_ring *kr, char *__counted_by(dsz)dst, + size_t dsz) +{ + (void) sk_snprintf(dst, dsz, "%p %s %s flags 0x%b", + SK_KVA(kr), kr->ckr_name, sk_ring2str(kr->ckr_tx), kr->ckr_flags, + CKRF_BITS); + + return dst; +} diff --git a/bsd/skywalk/channel/channel_syscalls.c b/bsd/skywalk/channel/channel_syscalls.c index 8ad8db2b7..32e8f3f01 100644 --- a/bsd/skywalk/channel/channel_syscalls.c +++ b/bsd/skywalk/channel/channel_syscalls.c @@ -37,6 +37,8 @@ #include #include +#include + static int chop_select(struct fileproc *, int, void *, vfs_context_t); static int chop_close(struct fileglob *, vfs_context_t); static int chop_kqfilter(struct fileproc *, struct knote *, struct kevent_qos_s *); @@ -144,7 +146,7 @@ __channel_open(struct proc *p, struct __channel_open_args *uap, int *retval) if (__improbable(uap->init == USER_ADDR_NULL || uap->init_len < sizeof(init))) { - SK_DSC(p, "EINVAL: init 0x%llx, init_len %u", SK_KVA(uap->init), + SK_PERR(p, "EINVAL: init %p, init_len %u", SK_KVA(uap->init), uap->init_len); err = EINVAL; goto done; @@ -152,49 +154,31 @@ __channel_open(struct proc *p, struct __channel_open_args *uap, int *retval) err = copyin(uap->init, (caddr_t)&init, sizeof(init)); if (__improbable(err != 0)) { - SK_DSC(p, "copyin err %u: init 0x%llx", err, SK_KVA(uap->init)); + SK_PERR(p, "copyin err %u: init %p", err, SK_KVA(uap->init)); goto done; } if 
(__improbable(init.ci_version != CHANNEL_INIT_CURRENT_VERSION)) { - SK_DSC(p, "ENOTSUP: init.ci_version %u != %u", init.ci_version, + SK_PERR(p, "ENOTSUP: init.ci_version %u != %u", init.ci_version, CHANNEL_INIT_CURRENT_VERSION); err = ENOTSUP; goto done; } else if (__improbable(uuid_is_null(init.ci_nx_uuid))) { - SK_DSC(p, "EINVAL: uuid_is_null"); + SK_PERR(p, "EINVAL: uuid_is_null"); err = EINVAL; goto done; } else if (__improbable((init.ci_key_len != 0 && init.ci_key == USER_ADDR_NULL) || (init.ci_key_len == 0 && init.ci_key != USER_ADDR_NULL))) { - SK_DSC(p, "EINVAL: ci_key_len %i, ci_key 0x%llx", + SK_PERR(p, "EINVAL: ci_key_len %i, ci_key %p", init.ci_key_len, SK_KVA(init.ci_key)); err = EINVAL; goto done; } - if ((init.ci_ch_mode & CHMODE_MONITOR) != 0) { - if (__improbable((init.ci_ch_mode & CHMODE_USER_PACKET_POOL) != 0)) { - SK_DSC(p, "EINVAL: PACKET_POOL not supported for MONITOR mode"); - err = EINVAL; - goto done; - } - if (__improbable((init.ci_ch_mode & CHMODE_EVENT_RING) != 0)) { - SK_DSC(p, "EINVAL: EVENT ring not supported for MONITOR mode"); - err = EINVAL; - goto done; - } - if (__improbable((init.ci_ch_mode & CHMODE_LOW_LATENCY) != 0)) { - SK_DSC(p, "EINVAL: low latency not supported for MONITOR mode"); - err = EINVAL; - goto done; - } - } - if ((init.ci_ch_mode & CHMODE_EVENT_RING) != 0) { if ((init.ci_ch_mode & CHMODE_USER_PACKET_POOL) == 0) { - SK_DSC(p, "EINVAL: PACKET_POOL is required for EVENT ring"); + SK_PERR(p, "EINVAL: PACKET_POOL is required for EVENT ring"); err = EINVAL; goto done; } @@ -212,28 +196,28 @@ __channel_open(struct proc *p, struct __channel_open_args *uap, int *retval) err = falloc_guarded(p, &fp, &fd, vfs_context_current(), &guard, GUARD_CLOSE | GUARD_DUP | GUARD_SOCKET_IPC | GUARD_FILEPORT | GUARD_WRITE); if (__improbable(err != 0)) { - SK_DSC(p, "falloc_guarded: %u", err); + SK_PERR(p, "falloc_guarded: %u", err); goto done; } keylen = init.ci_key_len; if (keylen != 0) { if (__improbable(keylen > NEXUS_MAX_KEY_LEN)) { - SK_DSC(p, "EINVAL: ci_key_len %u", keylen); + SK_PERR(p, "EINVAL: ci_key_len %u", keylen); err = EINVAL; goto done; } key = sk_alloc_data(keylen, Z_WAITOK, skmem_tag_ch_key); if (__improbable(key == NULL)) { - SK_DSC(p, "ENOMEM: ci_key_len %u", keylen); + SK_PERR(p, "ENOMEM: ci_key_len %u", keylen); err = ENOMEM; goto done; } err = copyin(init.ci_key, (caddr_t)key, keylen); if (__improbable(err != 0)) { - SK_DSC(p, "copyin err %u: ci_key 0x%llx, ci_key_len %u", + SK_PERR(p, "copyin err %u: ci_key %p, ci_key_len %u", err, SK_KVA(init.ci_key), keylen); goto done; } @@ -247,7 +231,7 @@ __channel_open(struct proc *p, struct __channel_open_args *uap, int *retval) /* in case not processed */ key = USER_ADDR_TO_PTR(init); ASSERT(err != 0); - SK_DSC(p, "ch_open nx_port %d err %u", + SK_PERR(p, "ch_open nx_port %d err %u", (int)init.ci_nx_port, err); goto done; } @@ -259,7 +243,7 @@ __channel_open(struct proc *p, struct __channel_open_args *uap, int *retval) init.ci_key = USER_ADDR_NULL; err = copyout(&init, uap->init, sizeof(init)); if (__improbable(err != 0)) { - SK_DSC(p, "copyout err %u: init 0x%llx", err, + SK_PERR(p, "copyout err %u: init %p", err, SK_KVA(uap->init)); goto done; } @@ -276,9 +260,8 @@ __channel_open(struct proc *p, struct __channel_open_args *uap, int *retval) *retval = fd; - SK_D("%s(%d) nx_port %d fd %d guard 0x%llx", - sk_proc_name_address(p), sk_proc_pid(p), (int)init.ci_nx_port, - fd, guard); + SK_D("%s(%d) nx_port %d fd %d %s", sk_proc_name(p), + sk_proc_pid(p), (int)init.ci_nx_port, fd, ch->ch_na->na_name); 
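/*
 * Illustrative sketch, not part of the imported diff: __channel_open()
 * above keeps the same argument-validation shape after the monitor-mode
 * paths are dropped -- reject a null or short init blob, require ci_key and
 * ci_key_len to agree, bound the key length, and only then allocate and
 * copy the key in.  The user-space sketch below mirrors that ordering with
 * invented names and a made-up MAX_KEY_LEN; it is not the kernel API.
 */
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define MAX_KEY_LEN 1024u   /* stand-in for NEXUS_MAX_KEY_LEN */

static int
import_key(const void *user_key, uint32_t key_len, uint8_t **out_key)
{
	uint8_t *key;

	*out_key = NULL;

	/* Length and pointer must agree: both set or both clear. */
	if ((key_len != 0 && user_key == NULL) ||
	    (key_len == 0 && user_key != NULL)) {
		return EINVAL;
	}
	if (key_len == 0) {
		return 0;               /* nothing to import */
	}
	if (key_len > MAX_KEY_LEN) {
		return EINVAL;          /* bound the length before allocating */
	}
	key = malloc(key_len);
	if (key == NULL) {
		return ENOMEM;
	}
	memcpy(key, user_key, key_len); /* copyin() in the kernel version */
	*out_key = key;
	return 0;
}

int
main(void)
{
	uint8_t *key = NULL;
	const uint8_t blob[4] = { 1, 2, 3, 4 };

	printf("mismatched args -> %d\n", import_key(NULL, 4, &key));           /* EINVAL */
	printf("valid key      -> %d\n", import_key(blob, sizeof(blob), &key)); /* 0 */
	free(key);
	return 0;
}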
done: if (key != NULL) { @@ -312,14 +295,14 @@ __channel_get_info(struct proc *p, struct __channel_get_info_args *uap, err = fp_get_ftype(p, uap->c, DTYPE_CHANNEL, ENODEV, &fp); if (__improbable(err != 0)) { - SK_DSC(p, "fp_get_ftype err %u", err); + SK_PERR(p, "fp_get_ftype err %u", err); return err; } ch = (struct kern_channel *__single)fp_get_data(fp); if (__improbable(uap->cinfo == USER_ADDR_NULL || uap->cinfolen < sizeof(struct ch_info))) { - SK_DSC(p, "EINVAL: cinfo 0x%llx, cinfolen %u", + SK_PERR(p, "EINVAL: cinfo %p, cinfolen %u", SK_KVA(uap->cinfo), uap->cinfolen); err = EINVAL; goto done; @@ -329,7 +312,7 @@ __channel_get_info(struct proc *p, struct __channel_get_info_args *uap, err = copyout(ch->ch_info, uap->cinfo, sizeof(struct ch_info)); lck_mtx_unlock(&ch->ch_lock); if (__improbable(err != 0)) { - SK_DSC(p, "copyout err %u: cinfo 0x%llx", err, + SK_PERR(p, "copyout err %u: cinfo %p", err, SK_KVA(uap->cinfo)); goto done; } @@ -349,8 +332,8 @@ channel_sync_log1(uint64_t verb, const char *sync, struct proc *p, const struct __kern_channel_ring *kring, ring_id_t i) { verb |= SK_VERB_SYNC; - SK_DF(verb, "%s(%d) pre: %s ring %u na \"%s\" (0x%llx) ch 0x%llx " - "th 0x%llx h %u kh %u", sk_proc_name_address(p), sk_proc_pid(p), + SK_DF(verb, "%s(%d) pre: %s ring %u na \"%s\" (%p) ch %p " + "th %p h %u kh %u", sk_proc_name(p), sk_proc_pid(p), sync, i, na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(current_thread()), kring->ckr_ring->ring_head, kring->ckr_khead); @@ -364,7 +347,7 @@ channel_sync_log2(uint64_t verb, const char *sync, struct proc *p, { verb |= SK_VERB_SYNC; SK_DF(verb, "%s(%d) post: %s ring %u na \"%s\" h %u kh %u", - sk_proc_name_address(p), sk_proc_pid(p), sync, i, na->na_name, + sk_proc_name(p), sk_proc_pid(p), sync, i, na->na_name, kring->ckr_ring->ring_head, kring->ckr_khead); } #endif /* SK_LOG */ @@ -391,7 +374,7 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) err = fp_get_ftype(p, uap->c, DTYPE_CHANNEL, ENODEV, &fp); if (__improbable(err != 0)) { - SK_DSC(p, "fp_get_ftype err %u", err); + SK_PERR(p, "fp_get_ftype err %u", err); return err; } ch = (struct kern_channel *__single)fp_get_data(fp); @@ -403,7 +386,7 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) flags = uap->flags; if (__improbable(mode != CHANNEL_SYNC_TX && mode != CHANNEL_SYNC_RX && mode != CHANNEL_SYNC_UPP)) { - SK_DSC(p, "EINVAL: mode %u", mode); + SK_PERR(p, "EINVAL: mode %u", mode); err = EINVAL; goto done; } @@ -411,14 +394,14 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) if (__improbable((ch->ch_flags & CHANF_USER_PACKET_POOL) == 0 && (flags & (CHANNEL_SYNCF_ALLOC | CHANNEL_SYNCF_FREE | CHANNEL_SYNCF_ALLOC_BUF)) != 0)) { - SK_DSC(p, "EINVAL: !CHANF_USER_PACKET_POOL with " + SK_PERR(p, "EINVAL: !CHANF_USER_PACKET_POOL with " "SYNCF_ALLOC/FREE"); err = EINVAL; goto done; } if (__improbable(ch->ch_flags & CHANF_DEFUNCT)) { - SK_DSC(p, "channel is defunct"); + SK_PERR(p, "channel is defunct"); err = ENXIO; goto done; } @@ -433,7 +416,7 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) ASSERT(NA_IS_ACTIVE(na)); if (__improbable(na_reject_channel(ch, na))) { - SK_DSC(p, "channel is non-permissive"); + SK_PERR(p, "channel is non-permissive"); err = ENXIO; goto done; } @@ -442,7 +425,7 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) protect = sk_sync_protect(); /* update our work timestamp */ - na->na_work_ts = _net_uptime; + na->na_work_ts = 
net_uptime(); /* and make this channel eligible for draining again */ if (na->na_flags & NAF_DRAINING) { @@ -486,7 +469,7 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) kr_log_bad_ring(kring); error = EFAULT; if (!err) { - SK_DSC(p, "EFAULT: " + SK_PERR(p, "EFAULT: " "kr_txsync_prologue()"); err = EFAULT; } @@ -496,7 +479,7 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) } else { error = EIO; if (!err) { - SK_DSC(p, "EIO: TX " + SK_PERR(p, "EIO: TX " "kring->ckr_na_sync()"); err = EIO; } @@ -519,7 +502,7 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) kr_log_bad_ring(kring); error = EFAULT; if (!err) { - SK_DSC(p, "EFAULT: " + SK_PERR(p, "EFAULT: " "kr_rxsync_prologue()"); err = EFAULT; } @@ -529,7 +512,7 @@ __channel_sync(struct proc *p, struct __channel_sync_args *uap, int *retval) } else { error = EIO; if (!err) { - SK_DSC(p, "EIO: " "RX " + SK_PERR(p, "EIO: " "RX " "kring->ckr_na_sync()"); err = EIO; } @@ -597,7 +580,7 @@ packet_pool_sync: kring->ckr_num_slots)) { kr_log_bad_ring(kring); if (!err) { - SK_DSC(p, + SK_PERR(p, "EFAULT: kr_alloc_sync_prologue()"); err = EFAULT; } @@ -606,7 +589,7 @@ packet_pool_sync: kr_alloc_sync_finalize(kring, p); } else { if (!err) { - SK_DSC(p, + SK_PERR(p, "EIO: ALLOC: ring->ckr_na_sync()"); err = EIO; } @@ -644,7 +627,7 @@ packet_pool_sync: kring->ckr_num_slots)) { kr_log_bad_ring(kring); if (!err) { - SK_DSC(p, + SK_PERR(p, "EFAULT: kr_free_sync_prologue()"); err = EFAULT; } @@ -653,7 +636,7 @@ packet_pool_sync: kr_free_sync_finalize(kring, p); } else { if (!err) { - SK_DSC(p, + SK_PERR(p, "EIO: FREE: ring->ckr_na_sync()"); err = EIO; } @@ -698,13 +681,13 @@ __channel_get_opt(struct proc *p, struct __channel_get_opt_args *uap, err = fp_get_ftype(p, uap->c, DTYPE_CHANNEL, ENODEV, &fp); if (err != 0) { - SK_DSC(p, "fp_get_ftype err %u", err); + SK_PERR(p, "fp_get_ftype err %u", err); return err; } ch = (struct kern_channel *__single)fp_get_data(fp); if (uap->aoptlen == USER_ADDR_NULL) { - SK_DSC(p, "EINVAL: uap->aoptlen == USER_ADDR_NULL"); + SK_PERR(p, "EINVAL: uap->aoptlen == USER_ADDR_NULL"); err = EINVAL; goto done; } @@ -712,7 +695,7 @@ __channel_get_opt(struct proc *p, struct __channel_get_opt_args *uap, if (uap->aoptval != USER_ADDR_NULL) { err = copyin(uap->aoptlen, &optlen, sizeof(optlen)); if (err != 0) { - SK_DSC(p, "copyin err %u: aoptlen 0x%llx", err, + SK_PERR(p, "copyin err %u: aoptlen %p", err, SK_KVA(uap->aoptlen)); goto done; } @@ -735,7 +718,7 @@ __channel_get_opt(struct proc *p, struct __channel_get_opt_args *uap, err = copyout(&optlen, uap->aoptlen, sizeof(optlen)); #if SK_LOG if (err != 0) { - SK_DSC(p, "copyout err %u: aoptlen 0x%llx", err, + SK_PERR(p, "copyout err %u: aoptlen %p", err, SK_KVA(uap->aoptlen)); } #endif @@ -761,7 +744,7 @@ __channel_set_opt(struct proc *p, struct __channel_set_opt_args *uap, err = fp_get_ftype(p, uap->c, DTYPE_CHANNEL, ENODEV, &fp); if (err != 0) { - SK_DSC(p, "fp_get_ftype err %u", err); + SK_PERR(p, "fp_get_ftype err %u", err); return err; } ch = (struct kern_channel *__single)fp_get_data(fp); @@ -775,11 +758,11 @@ __channel_set_opt(struct proc *p, struct __channel_set_opt_args *uap, lck_mtx_lock(&ch->ch_lock); if (__improbable(ch->ch_flags & (CHANF_CLOSING | CHANF_DEFUNCT))) { - SK_DSC(p, "channel is closing/defunct"); + SK_PERR(p, "channel is closing/defunct"); err = ENXIO; } else if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) || na_reject_channel(ch, ch->ch_na))) { - SK_DSC(p, 
"channel is non-permissive"); + SK_PERR(p, "channel is non-permissive"); err = ENXIO; } else { err = ch_set_opt(ch, &sopt); @@ -790,7 +773,7 @@ __channel_set_opt(struct proc *p, struct __channel_set_opt_args *uap, #if SK_LOG if (err != 0) { - SK_DSC(p, "ch_set_opt() err %u", err); + SK_PERR(p, "ch_set_opt() err %u", err); } #endif diff --git a/bsd/skywalk/channel/channel_var.h b/bsd/skywalk/channel/channel_var.h index c4f6911c9..a7c0d6de5 100644 --- a/bsd/skywalk/channel/channel_var.h +++ b/bsd/skywalk/channel/channel_var.h @@ -57,6 +57,7 @@ #define _SKYWALK_CHANNEL_CHANNELVAR_H_ #ifdef BSD_KERNEL_PRIVATE +#include #include #include #include @@ -228,7 +229,6 @@ struct chreq { uint32_t cr_pipe_id; /* in */ ring_id_t cr_ring_id; /* in */ ring_set_t cr_ring_set; /* out */ - ch_endpoint_t cr_real_endpoint; /* out */ ch_endpoint_t cr_endpoint; /* out */ mach_vm_size_t cr_memsize; /* out */ mach_vm_offset_t cr_memoffset; /* out */ @@ -327,8 +327,8 @@ struct __kern_channel_ring { slot_idx_t ckr_num_slots; /* # of slots */ uint32_t ckr_max_pkt_len;/* max pp pkt size */ uint32_t ckr_largest; /* largest packet seen */ - const slot_idx_t ckr_lim; /* ckr_num_slots - 1 */ - enum txrx ckr_tx; /* kind of ring (tx/rx/alloc/free) */ + const slot_idx_t ckr_lim; /* ckr_num_slots - 1 */ + enum txrx ckr_tx; /* kind of ring (tx/rx/alloc/free) */ volatile slot_idx_t ckr_khead; volatile slot_idx_t ckr_ktail; @@ -474,30 +474,10 @@ struct __kern_channel_ring { /* * Protects kring in the event of multiple writers; - * only used by flow switch and monitor. + * only used by flow switch. */ decl_lck_mtx_data(, ckr_qlock); -#if CONFIG_NEXUS_MONITOR - /* array of krings that are monitoring this kring */ - struct __kern_channel_ring **ckr_monitors; - uint32_t ckr_max_monitors; /* current size of the monitors array */ - uint32_t ckr_n_monitors; /* next unused entry in the monitor array */ - /* - * Monitors work by intercepting the sync and notify callbacks of the - * monitored krings. 
This is implemented by replacing the pointers - * above and saving the previous ones in mon_* pointers below - */ - int (*ckr_mon_sync)(struct __kern_channel_ring *kring, struct proc *, - uint32_t flags); - int (*ckr_mon_notify)(struct __kern_channel_ring *kring, struct proc *, - uint32_t flags); - - uint32_t ckr_mon_tail; /* last seen slot on rx */ - /* index of this ring in the monitored ring array */ - uint32_t ckr_mon_pos; -#endif /* CONFIG_NEXUS_MONITOR */ - uint32_t ckr_users; /* existing bindings for this ring */ /* ring flush rate limit */ @@ -507,19 +487,16 @@ struct __kern_channel_ring { #define CKR_TBR_TOKEN_INVALID INT64_MAX /* stats capturing errors */ - channel_ring_error_stats ckr_err_stats - __attribute__((aligned(sizeof(uint64_t)))); + channel_ring_error_stats ckr_err_stats __sk_aligned(64); /* stats capturing actual data movement (nexus provider's view) */ - channel_ring_stats ckr_stats - __attribute__((aligned(sizeof(uint64_t)))); + channel_ring_stats ckr_stats __sk_aligned(64); uint64_t ckr_accumulated_bytes; uint64_t ckr_accumulated_slots; uint64_t ckr_accumulate_start; /* in seconds */ /* stats capturing user activities per sync (user's view) */ - channel_ring_user_stats ckr_usr_stats - __attribute__((aligned(sizeof(uint64_t)))); + channel_ring_user_stats ckr_usr_stats __sk_aligned(64); uint64_t ckr_user_accumulated_bytes; uint64_t ckr_user_accumulated_slots; uint64_t ckr_user_accumulated_syncs; @@ -532,7 +509,7 @@ struct __kern_channel_ring { uint64_t ckr_rx_dequeue_ts; /* last timestamp when userspace dequeued */ uint64_t ckr_rx_enqueue_ts; /* last timestamp when kernel enqueued */ -} __attribute__((__aligned__(CHANNEL_CACHE_ALIGN_MAX))); +} __sk_aligned(CHANNEL_CACHE_ALIGN_MAX); #define KR_LOCK(_kr) \ lck_mtx_lock(&(_kr)->ckr_qlock) @@ -650,26 +627,24 @@ KR_SLOT_INDEX(const struct __kern_channel_ring *kr, } while (0) #define _USD_COPY(_src, _dst) do { \ - _CASSERT(sizeof (struct __user_slot_desc) == 8); \ + static_assert(sizeof(struct __user_slot_desc) == 8); \ sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \ } while (0) #define _USD_SWAP(_usd1, _usd2) do { \ - struct __user_slot_desc _tusd \ - __attribute((aligned(sizeof (uint64_t)))); \ + struct __user_slot_desc _tusd __sk_aligned(64); \ _USD_COPY(_usd1, &_tusd); \ _USD_COPY(_usd2, _usd1); \ _USD_COPY(&_tusd, _usd2); \ } while (0) #define _KSD_COPY(_src, _dst) do { \ - _CASSERT(sizeof (struct __kern_slot_desc) == 8); \ + static_assert(sizeof(struct __kern_slot_desc) == 8); \ sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \ } while (0) #define _KSD_SWAP(_ksd1, _ksd2) do { \ - struct __kern_slot_desc _tksd \ - __attribute((aligned(sizeof (uint64_t)))); \ + struct __kern_slot_desc _tksd __sk_aligned(64); \ _KSD_COPY(_ksd1, &_tksd); \ _KSD_COPY(_ksd2, _ksd1); \ _KSD_COPY(&_tksd, _ksd2); \ @@ -686,34 +661,17 @@ KR_SLOT_INDEX(const struct __kern_channel_ring *kr, } while (0) #define _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim) do { \ - struct __kern_quantum *_q = SK_PTR_ADDR_KQUM(_md); \ - switch (METADATA_TYPE(_q)) { \ - case NEXUS_META_TYPE_PACKET: { \ - struct __kern_packet *_p = \ - (struct __kern_packet *)(void *)(_md); \ - struct __kern_buflet *_kbft; \ - PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \ - (_addr) = __unsafe_forge_bidi_indexable(void *, \ - __DECONST(void *, _kbft->buf_addr), _kbft->buf_dlim); \ - (_objaddr) = __unsafe_forge_bidi_indexable(void *, \ - _kbft->buf_objaddr, _kbft->buf_dlim); \ - (_doff) = _kbft->buf_doff; \ - (_dlen) = 
_kbft->buf_dlen; \ - (_dlim) = _kbft->buf_dlim; \ - break; \ - } \ - default: \ - (_addr) = __unsafe_forge_bidi_indexable(void *, \ - __DECONST(void *, _q->qum_buf[0].buf_addr), \ - _q->qum_buf[0].buf_dlim); \ - (_objaddr) = __unsafe_forge_bidi_indexable(void *, \ - _q->qum_buf[0].buf_objaddr, \ - _q->qum_buf[0].buf_dlim); \ - (_doff) = _q->qum_buf[0].buf_doff; \ - (_dlen) = _q->qum_buf[0].buf_dlen; \ - (_dlim) = _q->qum_buf[0].buf_dlim; \ - break; \ - } \ + struct __kern_packet *_p = \ + (struct __kern_packet *)(void *)(_md); \ + struct __kern_buflet *_kbft; \ + PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \ + (_addr) = __unsafe_forge_bidi_indexable(void *, \ + __DECONST(void *, _kbft->buf_addr), _kbft->buf_dlim); \ + (_objaddr) = __unsafe_forge_bidi_indexable(void *, \ + _kbft->buf_objaddr, _kbft->buf_dlim); \ + (_doff) = _kbft->buf_doff; \ + (_dlen) = _kbft->buf_dlen; \ + (_dlim) = _kbft->buf_dlim; \ ASSERT((_addr) != NULL); \ ASSERT((_objaddr) != NULL); \ } while (0) @@ -852,6 +810,14 @@ extern void ch_retain_locked(struct kern_channel *); extern int ch_release(struct kern_channel *); extern int ch_release_locked(struct kern_channel *); extern void ch_dtor(struct kern_channel *); +extern void ch_update_upp_buf_stats(struct kern_channel *ch, + struct kern_pbufpool *pp); + +#if SK_LOG +#define CH_DBGBUF_SIZE 256 +extern char * ch2str(const struct kern_channel *na, char *__counted_by(dsz)dst, + size_t dsz); +#endif /* SK_LOG */ extern void csi_init(struct ch_selinfo *, boolean_t, uint64_t); extern void csi_destroy(struct ch_selinfo *); @@ -931,6 +897,8 @@ extern void kr_event_sync_finalize(struct kern_channel *ch, #if SK_LOG extern void kr_log_bad_ring(struct __kern_channel_ring *); +extern char * kr2str(const struct __kern_channel_ring *kr, + char *__counted_by(dsz)dst, size_t dsz); #else #define kr_log_bad_ring(_kr) do { ((void)0); } while (0) #endif /* SK_LOG */ diff --git a/bsd/skywalk/channel/kern_channel_event.c b/bsd/skywalk/channel/kern_channel_event.c index c4d33f125..e87d647c9 100644 --- a/bsd/skywalk/channel/kern_channel_event.c +++ b/bsd/skywalk/channel/kern_channel_event.c @@ -59,7 +59,7 @@ __notif_dest_by_ifp(struct __notif_dest *dest, const ifnet_t ifp) return EINVAL; } - if (!IF_FULLY_ATTACHED(ifp)) { + if (!ifnet_is_fully_attached(ifp)) { return ENXIO; } @@ -124,8 +124,7 @@ kern_channel_packet_event_notify(struct __notif_dest *dest, os_channel_event_type_t event_type, size_t event_dlen, uint8_t *__sized_by(event_dlen)event_data, uint32_t nx_port_id) { - char buf[CHANNEL_EVENT_MAX_LEN] - __attribute((aligned(sizeof(uint64_t)))); + char buf[CHANNEL_EVENT_MAX_LEN] __sk_aligned(64); struct __kern_channel_event *event = (struct __kern_channel_event *)(void *)buf; @@ -146,9 +145,9 @@ kern_channel_packet_event_notify(struct __notif_dest *dest, event->ev_dlen = (uint16_t)event_dlen; memcpy(event->ev_data, event_data, event_dlen); - SK_DF(SK_VERB_EVENTS, "%s[%d] kern_channel_event: %p dest_type: %hu len: %hu " + SK_DF(SK_VERB_EVENTS, "%s[%d] kern_channel_event: %p dest_type: %u len: %zu " "type: %u flags: %u res: %hu dlen: %hu", - dest->dest_desc, nx_port_id, event, event_dlen, + dest->dest_desc, nx_port_id, SK_KVA(event), dest->dest_type, event_dlen, event->ev_type, event->ev_flags, event->_reserved, event->ev_dlen); switch (dest->dest_type) { @@ -297,9 +296,8 @@ kern_channel_event_notify(struct __kern_channel_ring *kring) { ASSERT(kring->ckr_tx == NR_TX); - SK_DF(SK_VERB_EVENTS, "%s(%d) na \"%s\" (0x%llx) kr 0x%llx", - sk_proc_name_address(current_proc()), 
sk_proc_pid(current_proc()), - KRNA(kring)->na_name, SK_KVA(KRNA(kring)), SK_KVA(kring)); + SK_DF(SK_VERB_EVENTS, "na \"%s\" (%p) kr %p", KRNA(kring)->na_name, + SK_KVA(KRNA(kring)), SK_KVA(kring)); na_post_event(kring, TRUE, FALSE, FALSE, CHAN_FILT_HINT_CHANNEL_EVENT); } diff --git a/bsd/skywalk/channel/os_channel.h b/bsd/skywalk/channel/os_channel.h index 0d87a00ce..697ea330f 100644 --- a/bsd/skywalk/channel/os_channel.h +++ b/bsd/skywalk/channel/os_channel.h @@ -49,6 +49,7 @@ #define OS_CHANNEL_HAS_NUM_BUFFERS_ATTR 1 /* CHANNEL_ATTR_NUM_BUFFERS */ #define OS_CHANNEL_HAS_LARGE_PACKET 1 /* CHANNEL_ATTR_LARGE_BUF_SIZE and */ /* os_channel_large_packet_alloc() */ +#define OS_CHANNEL_HAS_BUFFER_STATS 1 /* os_channel_get_buffer_stats() */ /* Flow advisory table index */ typedef uint32_t flowadv_idx_t; @@ -59,8 +60,8 @@ typedef uint32_t flowadv_idx_t; */ typedef enum { CHANNEL_DIR_TX_RX, /* default: TX and RX ring(s) */ - CHANNEL_DIR_TX, /* (monitor) only TX ring(s) */ - CHANNEL_DIR_RX /* (monitor) only RX ring(s) */ + CHANNEL_DIR_TX, /* only TX ring(s) */ + CHANNEL_DIR_RX /* only RX ring(s) */ } ring_dir_t; /* @@ -108,15 +109,6 @@ typedef struct channel_ring_desc *channel_ring_t; typedef struct __slot_desc *channel_slot_t; typedef struct channel_attr *channel_attr_t; -/* - * Channel monitor types. - */ -typedef enum { - CHANNEL_MONITOR_OFF, /* default */ - CHANNEL_MONITOR_NO_COPY, /* zero-copy (delayed) mode */ - CHANNEL_MONITOR_COPY /* copy (immediate) mode */ -} channel_monitor_type_t; - /* * Channel threshold unit types. */ @@ -141,7 +133,7 @@ typedef enum { CHANNEL_ATTR_SLOT_META_SIZE, /* (g) metadata per slot (bytes) */ CHANNEL_ATTR_EXCLUSIVE, /* (g/s) bool: exclusive open */ CHANNEL_ATTR_NO_AUTO_SYNC, /* (g/s) bool: will do explicit sync */ - CHANNEL_ATTR_MONITOR, /* (g/s) see channel_monitor_type_t */ + CHANNEL_ATTR_UNUSED_1, /* unused */ CHANNEL_ATTR_TX_LOWAT_UNIT, /* (g/s) see channel_threshold_unit_t */ CHANNEL_ATTR_TX_LOWAT_VALUE, /* (g/s) transmit low-watermark */ CHANNEL_ATTR_RX_LOWAT_UNIT, /* (g/s) see channel_threshold_unit_t */ @@ -450,6 +442,11 @@ extern int os_channel_flow_admissible(const channel_ring_t ring, extern int os_channel_flow_adv_get_ce_count(const channel_ring_t chrd, uuid_t flow_id, const flowadv_idx_t flow_index, uint32_t *ce_cnt, uint32_t *pkt_cnt); + +#define AQM_CONGESTION_FEEDBACK 1 +extern int os_channel_flow_adv_get_feedback(const channel_ring_t chrd, + uuid_t flow_id, const flowadv_idx_t flow_index, uint32_t *congestion_cnt, + uint32_t *ce_cnt, uint32_t *pkt_cnt); /* * Allocate a packet from the channel's packet pool. * Returns 0 on success with the packet handle in packet arg. 
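 *
 * (Illustrative aside, not part of the original header comment: the
 * congestion-feedback accessor declared just above, next to the
 * AQM_CONGESTION_FEEDBACK define, might be polled by a caller that already
 * tracks its own flow identity, e.g.
 *
 *	uint32_t congestion = 0, ce = 0, pkts = 0;
 *	int err = os_channel_flow_adv_get_feedback(ring, flow_id, flow_idx,
 *	    &congestion, &ce, &pkts);
 *	if (err == 0 && pkts != 0) {
 *		// report the fraction of congestion-marked packets
 *		handle_congestion_ratio((double)congestion / (double)pkts);
 *	}
 *
 * where ring, flow_id, flow_idx and handle_congestion_ratio() are assumed
 * to be supplied by the caller; this is a sketch, not code from the
 * original change.)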
@@ -543,6 +540,10 @@ os_channel_buflet_alloc(const channel_t chd, buflet_t *bft); extern int os_channel_buflet_free(const channel_t chd, buflet_t ubft); + +extern int +os_channel_get_upp_buffer_stats(const channel_t chd, uint64_t *buffer_total, + uint64_t *buffer_inuse); __END_DECLS #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ #else /* KERNEL */ @@ -679,8 +680,8 @@ __private_extern__ errno_t kern_channel_slot_detach_packet_byidx( const kern_channel_ring_t kring, const uint32_t sidx, kern_packet_t ph); __private_extern__ void kern_channel_flowadv_clear(struct flowadv_fcentry *); __private_extern__ void kern_channel_flowadv_set(struct flowadv_fcentry *); -__private_extern__ void kern_channel_flowadv_report_ce_event( - struct flowadv_fcentry *, uint32_t, uint32_t); +__private_extern__ void kern_channel_flowadv_report_congestion_event( + struct flowadv_fcentry *, uint32_t, uint32_t, uint32_t); __private_extern__ void kern_channel_memstatus(struct proc *, uint32_t, struct kern_channel *); __private_extern__ void kern_channel_defunct(struct proc *, diff --git a/bsd/skywalk/channel/os_channel_private.h b/bsd/skywalk/channel/os_channel_private.h index 7ac554aed..7e84ccc01 100644 --- a/bsd/skywalk/channel/os_channel_private.h +++ b/bsd/skywalk/channel/os_channel_private.h @@ -161,6 +161,10 @@ struct __user_channel_schema { char csm_kern_name[CHANNEL_SCHEMA_KERN_NAME]; uuid_t csm_kern_uuid; + /* Number of UPP buffers in use and max */ + volatile uint64_t csm_upp_buf_inuse; + volatile uint64_t csm_upp_buf_total; + /* * The rest of the fields may be rearranged as needed, with * the expectation that CSM_CURRENT_VERSION be bumped up on @@ -254,7 +258,7 @@ struct __user_channel_schema { * to ensure that both kernel and libsystem_kernel are in sync, * as otherwise we'd assert due to version mismatch. 
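 *
 * (Illustrative aside, not part of the original comment: user code is
 * expected to reach the new csm_upp_buf_inuse/csm_upp_buf_total counters
 * through the accessor declared in os_channel.h rather than by reading the
 * schema directly; assuming the usual 0-on-success convention, that might
 * look like
 *
 *	uint64_t total = 0, inuse = 0;
 *	if (os_channel_get_upp_buffer_stats(chd, &total, &inuse) == 0) {
 *		printf("UPP buffers in use: %llu / %llu\n", inuse, total);
 *	}
 *
 * where chd is an already-opened channel_t. This is a sketch, not code
 * from the original change.)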
*/ -#define CSM_CURRENT_VERSION 18 +#define CSM_CURRENT_VERSION 19 /* valid values for csm_flags */ #define CSM_PRIV_MEM 0x1 /* private memory region */ @@ -449,7 +453,7 @@ struct __flowadv_entry { uint32_t fae_id_32[4]; uuid_t fae_id; /* flow ID from userspace stack */ }; - volatile uint32_t fae_ce_cnt; + volatile uint32_t fae_congestion_cnt; volatile uint32_t fae_pkt_cnt; volatile uint32_t fae_flags; /* flags FLOWADVF_* */ /* flow ID generated by flowswitch */ @@ -519,22 +523,19 @@ struct ch_init { guardid_t ci_guard; /* out: guard ID */ }; -#define CHMODE_MONITOR_TX 0x00000001 -#define CHMODE_MONITOR_RX 0x00000002 -#define CHMODE_MONITOR_NO_COPY 0x00000004 /* only if mon tx/rx is set */ +#define CHMODE_UNUSED_1 0x00000001 +#define CHMODE_UNUSED_2 0x00000002 +#define CHMODE_UNUSED_3 0x00000004 #define CHMODE_USER_PACKET_POOL 0x00000008 #define CHMODE_DEFUNCT_OK 0x00000010 #define CHMODE_FILTER 0x00000020 /* packet filter channel */ #define CHMODE_EVENT_RING 0x00000040 #define CHMODE_LOW_LATENCY 0x00000080 #define CHMODE_EXCLUSIVE 0x00000200 -#define CHMODE_MONITOR \ - (CHMODE_MONITOR_TX | CHMODE_MONITOR_RX) #ifdef KERNEL /* mask off userland-settable bits */ #define CHMODE_MASK \ - (CHMODE_MONITOR | CHMODE_MONITOR_NO_COPY | \ - CHMODE_USER_PACKET_POOL | CHMODE_FILTER | \ + (CHMODE_USER_PACKET_POOL | CHMODE_FILTER | \ CHMODE_DEFUNCT_OK | CHMODE_EVENT_RING | CHMODE_EXCLUSIVE | \ CHMODE_LOW_LATENCY) #define CHMODE_KERNEL 0x00001000 /* special, in-kernel */ @@ -644,7 +645,6 @@ struct channel_attr { uint32_t cha_meta_size; uint32_t cha_stats_size; uint32_t cha_exclusive; - uint32_t cha_monitor; uint32_t cha_key_len; void *cha_key; struct ch_ev_thresh cha_tx_lowat; diff --git a/bsd/skywalk/core/skywalk.c b/bsd/skywalk/core/skywalk.c index 0ce9f0773..ffc64ef55 100644 --- a/bsd/skywalk/core/skywalk.c +++ b/bsd/skywalk/core/skywalk.c @@ -30,6 +30,7 @@ #include /* for PE_parse_boot_argn */ #include /* for csproc_get_platform_binary */ #include +#include #if CONFIG_MACF #include #endif /* CONFIG_MACF */ @@ -50,11 +51,11 @@ static void skywalk_fini(void); static int sk_priv_chk(proc_t, kauth_cred_t, int); static int __sk_inited = 0; +uint64_t sk_verbose; + #if (DEVELOPMENT || DEBUG) size_t sk_copy_thres = SK_COPY_THRES; -uint64_t sk_verbose; #endif /* DEVELOPMENT || DEBUG */ -uint32_t sk_debug; uint64_t sk_features = #if SKYWALK SK_FEATURE_SKYWALK | @@ -68,9 +69,6 @@ uint64_t sk_features = #if CONFIG_NEXUS_FLOWSWITCH SK_FEATURE_NEXUS_FLOWSWITCH | #endif -#if CONFIG_NEXUS_MONITOR - SK_FEATURE_NEXUS_MONITOR | -#endif #if CONFIG_NEXUS_NETIF SK_FEATURE_NEXUS_NETIF | #endif @@ -191,7 +189,6 @@ int sk_netif_compat_rx_mbq_limit = SK_NETIF_COMPAT_RX_MBQ_LIMIT; uint32_t sk_netif_tx_mit = SK_NETIF_MIT_AUTO; uint32_t sk_netif_rx_mit = SK_NETIF_MIT_AUTO; char sk_ll_prefix[IFNAMSIZ] = "llw"; -uint32_t sk_rx_sync_packets = 1; uint32_t sk_channel_buflet_alloc = 0; uint32_t sk_netif_queue_stat_enable = 0; @@ -203,11 +200,10 @@ SYSCTL_NODE(_kern_skywalk, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_LOCKED, SYSCTL_OPAQUE(_kern_skywalk, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_LOCKED, &sk_features, sizeof(sk_features), "-", "Skywalk features"); -#if (DEVELOPMENT || DEBUG) SYSCTL_QUAD(_kern_skywalk, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, &sk_verbose, "Skywalk verbose mode"); -SYSCTL_UINT(_kern_skywalk, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, - &sk_debug, 0, "Skywalk debug mode"); + +#if (DEVELOPMENT || DEBUG) SYSCTL_LONG(_kern_skywalk, OID_AUTO, sk_copy_thres, CTLFLAG_RW | CTLFLAG_LOCKED, &sk_copy_thres, 
"Skywalk copy threshold"); static int __priv_check = 1; @@ -219,20 +215,14 @@ SYSCTL_UINT(_kern_skywalk, OID_AUTO, sk_cksum_tx, CTLFLAG_RW | CTLFLAG_LOCKED, &sk_cksum_tx, 0, "Advertise (and perform) outbound checksum offload"); SYSCTL_UINT(_kern_skywalk, OID_AUTO, sk_cksum_rx, CTLFLAG_RW | CTLFLAG_LOCKED, &sk_cksum_rx, 0, "Perform inbound checksum offload"); -SYSCTL_UINT(_kern_skywalk, OID_AUTO, sk_rx_sync_packets, CTLFLAG_RW | CTLFLAG_LOCKED, - &sk_rx_sync_packets, 0, "Enable RX sync packets"); SYSCTL_UINT(_kern_skywalk, OID_AUTO, chan_buf_alloc, CTLFLAG_RW | CTLFLAG_LOCKED, &sk_channel_buflet_alloc, 0, "channel buflet allocation (enable/disable)"); -#endif /* !DEVELOPMENT && !DEBUG */ -#if (DEVELOPMENT || DEBUG) uint32_t sk_inject_error_rmask = 0x3; SYSCTL_UINT(_kern_skywalk, OID_AUTO, inject_error_rmask, CTLFLAG_RW | CTLFLAG_LOCKED, &sk_inject_error_rmask, 0x3, ""); -#endif /* !DEVELOPMENT && !DEBUG */ -#if (DEVELOPMENT || DEBUG) static void skywalk_self_tests(void); #endif /* (DEVELOPMENT || DEBUG) */ @@ -248,10 +238,11 @@ static SKMEM_TAG_DEFINE(skmem_tag_dump, SKMEM_TAG_DUMP); static uint32_t sk_dump_buf_size; static char *__sized_by(sk_dump_buf_size) sk_dump_buf; -#define SK_DUMP_BUF_SIZE 2048 #define SK_DUMP_BUF_ALIGN 16 #endif /* (SK_LOG || DEVELOPMENT || DEBUG) */ +os_log_t sk_log_handle; + __startup_func void __sk_tag_make(const struct sk_tag_spec *spec) @@ -343,11 +334,12 @@ skywalk_init(void) VERIFY(!__sk_inited); - _CASSERT(sizeof(kern_packet_t) == sizeof(uint64_t)); - _CASSERT(sizeof(bitmap_t) == sizeof(uint64_t)); + static_assert(sizeof(kern_packet_t) == sizeof(uint64_t)); + static_assert(sizeof(bitmap_t) == sizeof(uint64_t)); + + sk_log_handle = os_log_create("com.apple.xnu", "skywalk"); #if (DEVELOPMENT || DEBUG) - PE_parse_boot_argn("sk_debug", &sk_debug, sizeof(sk_debug)); PE_parse_boot_argn("sk_verbose", &sk_verbose, sizeof(sk_verbose)); (void) PE_parse_boot_argn("sk_opp_defunct", &sk_opp_defunct, sizeof(sk_opp_defunct)); @@ -422,8 +414,6 @@ skywalk_init(void) sizeof(sk_fsw_gso_mtu)); (void) PE_parse_boot_argn("sk_fsw_max_bufs", &sk_fsw_max_bufs, sizeof(sk_fsw_max_bufs)); - (void) PE_parse_boot_argn("sk_rx_sync_packets", &sk_rx_sync_packets, - sizeof(sk_rx_sync_packets)); (void) PE_parse_boot_argn("sk_chan_buf_alloc", &sk_channel_buflet_alloc, sizeof(sk_channel_buflet_alloc)); (void) PE_parse_boot_argn("sk_guard", &sk_guard, sizeof(sk_guard)); @@ -579,11 +569,11 @@ sk_priv_chk(proc_t p, kauth_cred_t cred, int priv) #if SK_LOG if (__priv_check) { SK_DF(SK_VERB_PRIV, "%s(%d) insufficient privilege %d " - "(\"%s\") err %d", sk_proc_name_address(p), + "(\"%s\") err %d", sk_proc_name(p), sk_proc_pid(p), priv, pstr, ret); } else { SK_DF(SK_VERB_PRIV, "%s(%d) IGNORING missing privilege " - "%d (\"%s\") err %d", sk_proc_name_address(p), + "%d (\"%s\") err %d", sk_proc_name(p), sk_proc_pid(p), priv, pstr, ret); } #endif /* SK_LOG */ @@ -670,7 +660,7 @@ skywalk_nxctl_check_privileges(proc_t p, kauth_cred_t cred) #if (DEVELOPMENT || DEBUG) if (ret != 0) { SK_ERR("%s(%d) insufficient privilege to open nexus controller " - "err %d", sk_proc_name_address(p), sk_proc_pid(p), ret); + "err %d", sk_proc_name(p), sk_proc_pid(p), ret); } #endif /* !DEVELOPMENT && !DEBUG */ done: @@ -719,39 +709,28 @@ sk_uuid_unparse(const uuid_t uu, uuid_string_t out) * buffer's total length. * @param dumplen * length to be dumped. - * @param dst - * destination char buffer. sk_dump_buf would be used if dst is NULL. - * @param lim - * destination char buffer max length. Not used if dst is NULL. 
- * - * -fbounds-safety: Note that all callers of this function pass NULL and 0 for - * dst and lim, respectively. */ const char * -__counted_by(lim) -sk_dump(const char *label, const void *__sized_by(len) obj, int len, int dumplen, - char *__counted_by(lim) dst, int lim) +__counted_by(SK_DUMP_BUF_SIZE) +sk_dump(const char *label, const void *__sized_by(len) obj, int len, int dumplen) { int i, j, i0, n = 0; static char hex[] = "0123456789abcdef"; const char *p = obj; /* dump cursor */ uint32_t size; char *__sized_by(size) o; /* output position */ + const int lim = SK_DUMP_BUF_SIZE; + char* __counted_by(lim) dst = sk_dump_buf; + #define P_HI(x) hex[((x) & 0xf0) >> 4] #define P_LO(x) hex[((x) & 0xf)] #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') - if (dst == NULL) { - dst = sk_dump_buf; - lim = SK_DUMP_BUF_SIZE; - } else if (lim <= 0 || lim > len) { - dst = dst; - lim = len; /* rdar://117789233 */ - } + dumplen = MIN(len, dumplen); o = dst; size = lim; - n = scnprintf(o, lim, "%s 0x%llx len %d lim %d\n", label, + n = scnprintf(o, lim, "%s %p len %d lim %d\n", label, SK_KVA(p), len, lim); o += strbuflen(o, n); size -= n; @@ -785,7 +764,7 @@ sk_dump(const char *label, const void *__sized_by(len) obj, int len, int dumplen * "Safe" variant of proc_name_address(), meant to be used only for logging. */ const char * -sk_proc_name_address(struct proc *p) +sk_proc_name(struct proc *p) { if (p == PROC_NULL) { return "proc_null"; @@ -807,6 +786,34 @@ sk_proc_pid(struct proc *p) return proc_pid(p); } +const char * +sk_ntop(int af, const void *addr, char *__counted_by(addr_strlen)addr_str, + size_t addr_strlen) +{ + const char *__null_terminated str = NULL; + + addr_str[0] = '\0'; + + if (inp_log_privacy != 0) { + switch (af) { + case AF_INET: + strlcpy(addr_str, "", addr_strlen); + break; + case AF_INET6: + strlcpy(addr_str, "", addr_strlen); + break; + default: + VERIFY(0); + __builtin_unreachable(); + } + str = __unsafe_null_terminated_from_indexable(addr_str); + } else { + str = inet_ntop(af, addr, addr_str, (socklen_t)addr_strlen); + } + + return str; +} + const char * sk_sa_ntop(struct sockaddr *sa, char *__counted_by(addr_strlen)addr_str, size_t addr_strlen) @@ -817,12 +824,12 @@ sk_sa_ntop(struct sockaddr *sa, char *__counted_by(addr_strlen)addr_str, switch (sa->sa_family) { case AF_INET: - str = inet_ntop(AF_INET, &SIN(sa)->sin_addr.s_addr, + str = sk_ntop(AF_INET, &SIN(sa)->sin_addr.s_addr, addr_str, (socklen_t)addr_strlen); break; case AF_INET6: - str = inet_ntop(AF_INET6, &SIN6(sa)->sin6_addr, + str = sk_ntop(AF_INET6, &SIN6(sa)->sin6_addr, addr_str, (socklen_t)addr_strlen); break; @@ -893,12 +900,12 @@ skywalk_kill_process(struct proc *p, uint64_t reason_code) exit_reason = os_reason_create(OS_REASON_SKYWALK, reason_code); if (exit_reason == OS_REASON_NULL) { SK_ERR("%s(%d) unable to allocate memory for crash reason " - "0x%llX", sk_proc_name_address(p), sk_proc_pid(p), + "0x%llX", sk_proc_name(p), sk_proc_pid(p), reason_code); } else { exit_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; SK_ERR("%s(%d) aborted for reason 0x%llX", - sk_proc_name_address(p), sk_proc_pid(p), reason_code); + sk_proc_name(p), sk_proc_pid(p), reason_code); } psignal_try_thread_with_reason(p, current_thread(), SIGABRT, @@ -910,7 +917,7 @@ skywalk_kill_process(struct proc *p, uint64_t reason_code) #define SK_MASK_MAXLEN 80 /* maximum mask length */ #define SK_MEMCMP_MASK_VERIFY(t, l, lr) do { \ - _CASSERT(sizeof(t##_m) == SK_MASK_MAXLEN); \ + static_assert(sizeof(t##_m) == SK_MASK_MAXLEN); \ if 
((sk_memcmp_mask_##l##B(hdr1, hdr2, t##_m) != 0) ^ \ (skywalk_memcmp_mask_ref(hdr1, hdr2, t##_m, lr) != 0)) { \ panic_plain("\nbroken: " #t " using " \ @@ -1151,7 +1158,7 @@ skywalk_memcmp_mask_self_tests(void) }; /* validate flow entry mask (2-tuple) */ - _CASSERT(FKMASK_2TUPLE == (FKMASK_PROTO | FKMASK_SPORT)); + static_assert(FKMASK_2TUPLE == (FKMASK_PROTO | FKMASK_SPORT)); VERIFY(fk_mask_2tuple.fk_mask == FKMASK_2TUPLE); VERIFY(fk_mask_2tuple.fk_ipver == 0); VERIFY(fk_mask_2tuple.fk_proto == 0xff); @@ -1163,7 +1170,7 @@ skywalk_memcmp_mask_self_tests(void) VERIFY(fk_mask_2tuple.fk_dst._addr64[1] == 0); VERIFY(fk_mask_2tuple.fk_pad[0] == 0); - _CASSERT(FKMASK_3TUPLE == (FKMASK_2TUPLE | FKMASK_IPVER | FKMASK_SRC)); + static_assert(FKMASK_3TUPLE == (FKMASK_2TUPLE | FKMASK_IPVER | FKMASK_SRC)); VERIFY(fk_mask_3tuple.fk_mask == FKMASK_3TUPLE); VERIFY(fk_mask_3tuple.fk_ipver == 0xff); VERIFY(fk_mask_3tuple.fk_proto == 0xff); @@ -1175,7 +1182,7 @@ skywalk_memcmp_mask_self_tests(void) VERIFY(fk_mask_3tuple.fk_dst._addr64[1] == 0); VERIFY(fk_mask_3tuple.fk_pad[0] == 0); - _CASSERT(FKMASK_4TUPLE == (FKMASK_3TUPLE | FKMASK_DPORT)); + static_assert(FKMASK_4TUPLE == (FKMASK_3TUPLE | FKMASK_DPORT)); VERIFY(fk_mask_4tuple.fk_mask == FKMASK_4TUPLE); VERIFY(fk_mask_4tuple.fk_ipver == 0xff); VERIFY(fk_mask_4tuple.fk_proto == 0xff); @@ -1187,7 +1194,7 @@ skywalk_memcmp_mask_self_tests(void) VERIFY(fk_mask_4tuple.fk_dst._addr64[1] == 0); VERIFY(fk_mask_4tuple.fk_pad[0] == 0); - _CASSERT(FKMASK_5TUPLE == (FKMASK_4TUPLE | FKMASK_DST)); + static_assert(FKMASK_5TUPLE == (FKMASK_4TUPLE | FKMASK_DST)); VERIFY(fk_mask_5tuple.fk_mask == FKMASK_5TUPLE); VERIFY(fk_mask_5tuple.fk_ipver == 0xff); VERIFY(fk_mask_5tuple.fk_proto == 0xff); @@ -1199,7 +1206,7 @@ skywalk_memcmp_mask_self_tests(void) VERIFY(fk_mask_5tuple.fk_dst._addr64[1] == 0xffffffffffffffffULL); VERIFY(fk_mask_5tuple.fk_pad[0] == 0); - _CASSERT(FKMASK_IPFLOW1 == FKMASK_PROTO); + static_assert(FKMASK_IPFLOW1 == FKMASK_PROTO); VERIFY(fk_mask_ipflow1.fk_mask == FKMASK_IPFLOW1); VERIFY(fk_mask_ipflow1.fk_ipver == 0); VERIFY(fk_mask_ipflow1.fk_proto == 0xff); @@ -1211,7 +1218,7 @@ skywalk_memcmp_mask_self_tests(void) VERIFY(fk_mask_ipflow1.fk_dst._addr64[1] == 0); VERIFY(fk_mask_ipflow1.fk_pad[0] == 0); - _CASSERT(FKMASK_IPFLOW2 == (FKMASK_IPFLOW1 | FKMASK_IPVER | FKMASK_SRC)); + static_assert(FKMASK_IPFLOW2 == (FKMASK_IPFLOW1 | FKMASK_IPVER | FKMASK_SRC)); VERIFY(fk_mask_ipflow2.fk_mask == FKMASK_IPFLOW2); VERIFY(fk_mask_ipflow2.fk_ipver == 0xff); VERIFY(fk_mask_ipflow2.fk_proto == 0xff); @@ -1223,7 +1230,7 @@ skywalk_memcmp_mask_self_tests(void) VERIFY(fk_mask_ipflow2.fk_dst._addr64[1] == 0); VERIFY(fk_mask_ipflow2.fk_pad[0] == 0); - _CASSERT(FKMASK_IPFLOW3 == (FKMASK_IPFLOW2 | FKMASK_DST)); + static_assert(FKMASK_IPFLOW3 == (FKMASK_IPFLOW2 | FKMASK_DST)); VERIFY(fk_mask_ipflow3.fk_mask == FKMASK_IPFLOW3); VERIFY(fk_mask_ipflow3.fk_ipver == 0xff); VERIFY(fk_mask_ipflow3.fk_proto == 0xff); @@ -1410,10 +1417,10 @@ skywalk_self_tests(void) * 2nd section is reference target based on bcopy; * 3rd section is test target base on our stuff. 
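 *
 * (Illustrative aside, not part of the original comment: the masked
 * compares exercised by SK_MEMCMP_MASK_VERIFY() above are, semantically,
 * a byte-wise comparison under a mask, i.e. roughly
 *
 *	uint32_t diff = 0;
 *	for (size_t i = 0; i < len; i++) {
 *		diff |= (uint32_t)((a[i] ^ b[i]) & mask[i]);
 *	}
 *	// diff == 0 iff a and b agree on every bit selected by mask
 *
 * with the length-specialized sk_memcmp_mask variants performing the same
 * comparison over fixed-size headers. This is a sketch of the reference
 * semantics, not the optimized implementation.)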
*/ - _CASSERT(SK_COPY_LEN != 0 && (SK_COPY_LEN % 128) == 0); - _CASSERT((SK_COPY_LEN % 16) == 0); - _CASSERT((SK_DUMP_BUF_ALIGN % 16) == 0); - _CASSERT(SK_DUMP_BUF_SIZE >= (SK_DUMP_BUF_ALIGN + (SK_COPY_LEN * 3))); + static_assert(SK_COPY_LEN != 0 && (SK_COPY_LEN % 128) == 0); + static_assert((SK_COPY_LEN % 16) == 0); + static_assert((SK_DUMP_BUF_ALIGN % 16) == 0); + static_assert(SK_DUMP_BUF_SIZE >= (SK_DUMP_BUF_ALIGN + (SK_COPY_LEN * 3))); s1 = sk_dump_buf; if (!IS_P2ALIGNED(s1, SK_DUMP_BUF_ALIGN)) { @@ -1549,26 +1556,26 @@ skywalk_self_tests(void) bzero(sk_dump_buf, SK_DUMP_BUF_SIZE); /* Keep packet trace code in sync with ariadne plist */ - _CASSERT(SK_KTRACE_AON_IF_STATS == 0x8100004); + static_assert(SK_KTRACE_AON_IF_STATS == 0x8100004); - _CASSERT(SK_KTRACE_FSW_DEV_RING_FLUSH == 0x8110004); - _CASSERT(SK_KTRACE_FSW_USER_RING_FLUSH == 0x8110008); - _CASSERT(SK_KTRACE_FSW_FLOW_TRACK_RTT == 0x8110010); + static_assert(SK_KTRACE_FSW_DEV_RING_FLUSH == 0x8110004); + static_assert(SK_KTRACE_FSW_USER_RING_FLUSH == 0x8110008); + static_assert(SK_KTRACE_FSW_FLOW_TRACK_RTT == 0x8110010); - _CASSERT(SK_KTRACE_NETIF_RING_TX_REFILL == 0x8120004); - _CASSERT(SK_KTRACE_NETIF_HOST_ENQUEUE == 0x8120008); - _CASSERT(SK_KTRACE_NETIF_MIT_RX_INTR == 0x812000c); - _CASSERT(SK_KTRACE_NETIF_COMMON_INTR == 0x8120010); - _CASSERT(SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT == 0x8120014); - _CASSERT(SK_KTRACE_NETIF_RX_NOTIFY_FAST == 0x8120018); + static_assert(SK_KTRACE_NETIF_RING_TX_REFILL == 0x8120004); + static_assert(SK_KTRACE_NETIF_HOST_ENQUEUE == 0x8120008); + static_assert(SK_KTRACE_NETIF_MIT_RX_INTR == 0x812000c); + static_assert(SK_KTRACE_NETIF_COMMON_INTR == 0x8120010); + static_assert(SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT == 0x8120014); + static_assert(SK_KTRACE_NETIF_RX_NOTIFY_FAST == 0x8120018); - _CASSERT(SK_KTRACE_CHANNEL_TX_REFILL == 0x8130004); + static_assert(SK_KTRACE_CHANNEL_TX_REFILL == 0x8130004); - _CASSERT(SK_KTRACE_PKT_RX_DRV == 0x8140004); - _CASSERT(SK_KTRACE_PKT_RX_FSW == 0x8140008); - _CASSERT(SK_KTRACE_PKT_RX_CHN == 0x814000c); - _CASSERT(SK_KTRACE_PKT_TX_FSW == 0x8140040); - _CASSERT(SK_KTRACE_PKT_TX_AQM == 0x8140044); - _CASSERT(SK_KTRACE_PKT_TX_DRV == 0x8140048); + static_assert(SK_KTRACE_PKT_RX_DRV == 0x8140004); + static_assert(SK_KTRACE_PKT_RX_FSW == 0x8140008); + static_assert(SK_KTRACE_PKT_RX_CHN == 0x814000c); + static_assert(SK_KTRACE_PKT_TX_FSW == 0x8140040); + static_assert(SK_KTRACE_PKT_TX_AQM == 0x8140044); + static_assert(SK_KTRACE_PKT_TX_DRV == 0x8140048); } #endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/skywalk/core/skywalk_common.h b/bsd/skywalk/core/skywalk_common.h index b8dde86ed..73f294fb8 100644 --- a/bsd/skywalk/core/skywalk_common.h +++ b/bsd/skywalk/core/skywalk_common.h @@ -94,9 +94,9 @@ __END_DECLS ((((x) % (align)) == 0) ? 
(x) : ((x) + ((align) - ((x) % (align))))) /* compile time assert */ -#ifndef _CASSERT -#define _CASSERT(x) _Static_assert(x, "compile-time assertion failed") -#endif /* !_CASSERT */ +#ifndef static_assert +#define static_assert(x) _Static_assert(x, #x) +#endif /* !static_assert */ /* power of 2 address alignment */ #ifndef IS_P2ALIGNED diff --git a/bsd/skywalk/core/skywalk_proc_info.c b/bsd/skywalk/core/skywalk_proc_info.c index a0018b7c3..ca0bd051a 100644 --- a/bsd/skywalk/core/skywalk_proc_info.c +++ b/bsd/skywalk/core/skywalk_proc_info.c @@ -42,15 +42,6 @@ ch_mode_to_flags(uint32_t ch_mode) { uint32_t flags = 0; - if ((ch_mode & CHMODE_MONITOR_RX) != 0) { - flags |= PROC_CHANNEL_FLAGS_MONITOR_RX; - } - if ((ch_mode & CHMODE_MONITOR_TX) != 0) { - flags |= PROC_CHANNEL_FLAGS_MONITOR_TX; - } - if ((ch_mode & CHMODE_MONITOR_NO_COPY) != 0) { - flags |= PROC_CHANNEL_FLAGS_MONITOR_NO_COPY; - } if ((ch_mode & CHMODE_EXCLUSIVE) != 0) { flags |= PROC_CHANNEL_FLAGS_EXCLUSIVE; } diff --git a/bsd/skywalk/core/skywalk_var.h b/bsd/skywalk/core/skywalk_var.h index ff008e9b5..ff72939ec 100644 --- a/bsd/skywalk/core/skywalk_var.h +++ b/bsd/skywalk/core/skywalk_var.h @@ -111,8 +111,6 @@ #define SK_ATOMIC_TEST_AND_SET(p) (!os_atomic_cmpxchg((p), 0, 1, acq_rel)) #define SK_ATOMIC_CLEAR(p) os_atomic_store((p), 0, release) -extern uint32_t sk_debug; - /* * feature bits defined in os_skywalk_private.h */ @@ -351,7 +349,6 @@ typedef enum netif_mit_cfg { } netif_mit_cfg_t; extern uint32_t sk_netif_tx_mit; extern uint32_t sk_netif_rx_mit; -extern uint32_t sk_rx_sync_packets; extern uint32_t sk_channel_buflet_alloc; extern uint32_t sk_min_pool_size; extern uint32_t sk_netif_queue_stat_enable; @@ -504,15 +501,21 @@ extern boolean_t skywalk_check_platform_binary(proc_t); extern boolean_t skywalk_netif_direct_allowed(const char *); extern boolean_t skywalk_netif_direct_enabled(void); extern void sk_gen_guard_id(boolean_t, const uuid_t, guardid_t *); -extern char *__counted_by(sizeof(uuid_string_t)) sk_uuid_unparse(const uuid_t, uuid_string_t); +extern char *__counted_by(sizeof(uuid_string_t)) sk_uuid_unparse(const uuid_t, + uuid_string_t); #if SK_LOG -extern const char *__counted_by(lim) sk_dump(const char *label, - const void *__sized_by(len) obj, int len, int dumplen, - char *__counted_by(lim) dst, int lim); +#define SK_DUMP_BUF_SIZE 2048 +extern const char *__counted_by(SK_DUMP_BUF_SIZE) sk_dump(const char *label, + const void *__sized_by(len) obj, int len, int dumplen); extern const char *sk_proc_name_address(struct proc *); +extern const char *sk_proc_name(struct proc *); extern int sk_proc_pid(struct proc *); -extern const char *sk_sa_ntop(struct sockaddr *, char *__counted_by(addr_strlen), - size_t addr_strlen); + +/* skywalk ntop function that follows privacy (IP redaction) setting */ +extern const char * sk_ntop(int af, const void *addr, + char *__counted_by(addr_strlen)addr_str, size_t addr_strlen); +extern const char *sk_sa_ntop(struct sockaddr *sa, + char *__counted_by(addr_strlen)addr_str, size_t addr_strlen); extern const char *sk_memstatus2str(uint32_t); #endif /* SK_LOG */ diff --git a/bsd/skywalk/mem/skmem.c b/bsd/skywalk/mem/skmem.c index 40dc3d105..15bc6e35e 100644 --- a/bsd/skywalk/mem/skmem.c +++ b/bsd/skywalk/mem/skmem.c @@ -29,6 +29,7 @@ #include #include #include +#include #include /* @@ -174,8 +175,8 @@ static const struct skmem_region_params skmem_regions[SKMEM_REGIONS] = { .srp_id = SKMEM_REGION_UMD, .srp_cflags = SKMEM_REGION_CR_MMAPOK | SKMEM_REGION_CR_NOMAGAZINES, - .srp_md_type = 
NEXUS_META_TYPE_QUANTUM, - .srp_md_subtype = NEXUS_META_SUBTYPE_PAYLOAD, + .srp_md_type = NEXUS_META_TYPE_PACKET, + .srp_md_subtype = NEXUS_META_SUBTYPE_RAW, .srp_max_frags = 1, }, @@ -292,8 +293,8 @@ static const struct skmem_region_params skmem_regions[SKMEM_REGIONS] = { .srp_id = SKMEM_REGION_KMD, .srp_cflags = SKMEM_REGION_CR_NOMAGAZINES | SKMEM_REGION_CR_MEMTAG, - .srp_md_type = NEXUS_META_TYPE_QUANTUM, - .srp_md_subtype = NEXUS_META_SUBTYPE_PAYLOAD, + .srp_md_type = NEXUS_META_TYPE_PACKET, + .srp_md_subtype = NEXUS_META_SUBTYPE_RAW, .srp_max_frags = 1, }, [SKMEM_REGION_RXKMD] = { @@ -302,8 +303,8 @@ static const struct skmem_region_params skmem_regions[SKMEM_REGIONS] = { .srp_cflags = SKMEM_REGION_CR_NOMAGAZINES | SKMEM_REGION_CR_MEMTAG, .srp_r_obj_cnt = 0, - .srp_md_type = NEXUS_META_TYPE_QUANTUM, - .srp_md_subtype = NEXUS_META_SUBTYPE_PAYLOAD, + .srp_md_type = NEXUS_META_TYPE_PACKET, + .srp_md_subtype = NEXUS_META_SUBTYPE_RAW, .srp_max_frags = 1, }, [SKMEM_REGION_TXKMD] = { @@ -312,8 +313,8 @@ static const struct skmem_region_params skmem_regions[SKMEM_REGIONS] = { .srp_cflags = SKMEM_REGION_CR_NOMAGAZINES | SKMEM_REGION_CR_MEMTAG, .srp_r_obj_cnt = 0, - .srp_md_type = NEXUS_META_TYPE_QUANTUM, - .srp_md_subtype = NEXUS_META_SUBTYPE_PAYLOAD, + .srp_md_type = NEXUS_META_TYPE_PACKET, + .srp_md_subtype = NEXUS_META_SUBTYPE_RAW, .srp_max_frags = 1, }, @@ -543,7 +544,7 @@ skmem_sys_region_init(void) srp.srp_r_obj_size = SK_SYS_OBJSIZE_DEFAULT; skmem_region_params_config(&srp); - _CASSERT(SK_SYS_OBJSIZE_DEFAULT >= sizeof(skmem_sysctl)); + static_assert(SK_SYS_OBJSIZE_DEFAULT >= sizeof(skmem_sysctl)); sk_sys_region = skmem_region_create("global", &srp, NULL, NULL, NULL); if (sk_sys_region == NULL) { panic("failed to allocate global sysctls region"); @@ -593,12 +594,6 @@ skmem_get_sysctls_obj(size_t * size) return sk_sys_obj; } -/* for VM stats */ -extern unsigned int vm_page_free_count, vm_page_speculative_count, - vm_page_active_count, vm_page_inactive_count, vm_page_inactive_count, - vm_page_wire_count, vm_page_throttled_count, vm_lopage_free_count, - vm_page_purgeable_count, vm_page_purged_count; - #define SKMEM_WDT_DUMP_BUF_CHK() do { \ clen -= k; \ if (clen < 1) \ @@ -606,19 +601,6 @@ extern unsigned int vm_page_free_count, vm_page_speculative_count, c += k; \ } while (0) -/* - * The compiler doesn't know that snprintf() supports %b format - * specifier, so use our own wrapper to vsnprintf() here instead. - */ -#define skmem_snprintf(str, size, format, ...) 
({ \ - _Pragma("clang diagnostic push") \ - _Pragma("clang diagnostic ignored \"-Wformat-invalid-specifier\"") \ - _Pragma("clang diagnostic ignored \"-Wformat-extra-args\"") \ - _Pragma("clang diagnostic ignored \"-Wformat\"") \ - snprintf(str, size, format, ## __VA_ARGS__) \ - _Pragma("clang diagnostic pop"); \ -}) - __attribute__((noinline, cold, not_tail_called)) char * skmem_dump(struct skmem_region *skr) @@ -637,14 +619,13 @@ skmem_dump(struct skmem_region *skr) } c = skmem_dump_buf; - k = skmem_snprintf(c, clen, + k = sk_snprintf(c, clen, "Region %p\n" - " | Mode : 0x%b\n" + " | Mode : 0x%x\n" " | Memory : [%llu in use [%llu wired]] / [%llu total]\n" " | Transactions : [%llu segment allocs, %llu frees]\n\n", - skr, skr->skr_mode, SKR_MODE_BITS, skr->skr_meminuse, - skr->skr_w_meminuse, skr->skr_memtotal, skr->skr_alloc, - skr->skr_free); + skr, skr->skr_mode, skr->skr_meminuse, skr->skr_w_meminuse, + skr->skr_memtotal, skr->skr_alloc, skr->skr_free); SKMEM_WDT_DUMP_BUF_CHK(); if (skr->skr_mode & SKR_MODE_SLAB) { @@ -652,22 +633,21 @@ skmem_dump(struct skmem_region *skr) if ((skm = skr->skr_cache[i]) == NULL) { continue; } - k = skmem_snprintf(c, clen, "Cache %p\n" - " | Mode : 0x%b\n" + k = sk_snprintf(c, clen, "Cache %p\n" + " | Mode : 0x%x\n" " | Memory : [%llu in use] / [%llu total]\n" " | Transactions : [%llu alloc failures]\n" " | [%llu slab creates, %llu destroys]\n" " | [%llu slab allocs, %llu frees]\n\n", - skm, skm->skm_mode, SKM_MODE_BITS, - skm->skm_sl_bufinuse, skm->skm_sl_bufmax, - skm->skm_sl_alloc_fail, skm->skm_sl_create, - skm->skm_sl_destroy, skm->skm_sl_alloc, - skm->skm_sl_free); + skm, skm->skm_mode, skm->skm_sl_bufinuse, + skm->skm_sl_bufmax, skm->skm_sl_alloc_fail, + skm->skm_sl_create, skm->skm_sl_destroy, + skm->skm_sl_alloc, skm->skm_sl_free); SKMEM_WDT_DUMP_BUF_CHK(); } } - k = skmem_snprintf(c, clen, + k = sk_snprintf(c, clen, "VM Pages\n" " | Free : %u [%u speculative]\n" " | Active : %u\n" diff --git a/bsd/skywalk/mem/skmem_arena.c b/bsd/skywalk/mem/skmem_arena.c index 59d4762e9..febdd063e 100644 --- a/bsd/skywalk/mem/skmem_arena.c +++ b/bsd/skywalk/mem/skmem_arena.c @@ -99,6 +99,8 @@ #include #include +#include + static void skmem_arena_destroy(struct skmem_arena *); static void skmem_arena_teardown(struct skmem_arena *, boolean_t); static int skmem_arena_create_finalize(struct skmem_arena *); @@ -180,9 +182,9 @@ skmem_arena_sd_setup(const struct nexus_adapter *na, name = __unsafe_null_terminated_from_indexable(na->na_name); ksd_skr = skmem_region_create(name, &srp[ksd_type], NULL, NULL, NULL); if (ksd_skr == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to " - "create %s region", ar->ar_name, SK_KVA(ar), - ar->ar_flags, ARF_BITS, srp[ksd_type].srp_name); + SK_ERR("\"%s\" ar 0x%p flags 0x%x failed to create %s region", + ar->ar_name, SK_KVA(ar), ar->ar_flags, + srp[ksd_type].srp_name); err = ENOMEM; goto failed; } @@ -204,8 +206,8 @@ skmem_arena_sd_setup(const struct nexus_adapter *na, NULL, NULL, NULL, NULL, ar->ar_regions[ksd_type], SKMEM_CR_NOMAGAZINES); if (*cachep == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to create %s", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS, cname); + SK_ERR("\"%s\" ar %p flags 0x%x failed to create %s", + ar->ar_name, SK_KVA(ar), ar->ar_flags, cname); err = ENOMEM; goto failed; } @@ -285,8 +287,8 @@ skmem_arena_pp_setup(struct skmem_arena *ar, rx_pp = pp_create(name, srp, NULL, NULL, NULL, NULL, NULL, ppcreatef); if (rx_pp == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to create pp", - 
ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS); + SK_ERR("\"%s\" ar %p flags 0x%x failed to create pp", + ar->ar_name, SK_KVA(ar), ar->ar_flags); return false; } pp_retain(rx_pp); @@ -645,9 +647,9 @@ skmem_arena_create_for_nexus(const struct nexus_adapter *na, /* otherwise create it */ if ((ar->ar_regions[i] = skmem_region_create(name, &srp[i], NULL, NULL, NULL)) == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to " + SK_ERR("\"%s\" ar %p flags 0x%x failed to " "create %s region", ar->ar_name, SK_KVA(ar), - ar->ar_flags, ARF_BITS, srp[i].srp_name); + ar->ar_flags, srp[i].srp_name); goto failed; } } @@ -661,9 +663,8 @@ skmem_arena_create_for_nexus(const struct nexus_adapter *na, srp[SKMEM_REGION_SCHEMA].srp_c_obj_size, 0, NULL, NULL, NULL, NULL, ar->ar_regions[SKMEM_REGION_SCHEMA], SKMEM_CR_NOMAGAZINES)) == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to create %s", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS, - cname); + SK_ERR("\"%s\" ar %p flags 0x%x failed to create %s", + ar->ar_name, SK_KVA(ar), ar->ar_flags, cname); goto failed; } } @@ -677,8 +678,8 @@ skmem_arena_create_for_nexus(const struct nexus_adapter *na, srp[SKMEM_REGION_RING].srp_c_obj_size, 0, NULL, NULL, NULL, NULL, ar->ar_regions[SKMEM_REGION_RING], SKMEM_CR_NOMAGAZINES)) == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to create %s", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS, cname); + SK_ERR("\"%s\" ar %p flags 0x%x failed to create %s", + ar->ar_name, SK_KVA(ar), ar->ar_flags, cname); goto failed; } @@ -698,9 +699,8 @@ skmem_arena_create_for_nexus(const struct nexus_adapter *na, if ((obj = skmem_region_alloc(skr, &maddr, NULL, NULL, SKMEM_SLEEP, skr->skr_c_obj_size, &msize)) == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to alloc " - "stats", ar->ar_name, SK_KVA(ar), ar->ar_flags, - ARF_BITS); + SK_ERR("\"%s\" ar %p flags 0x%x failed to alloc stats", + ar->ar_name, SK_KVA(ar), ar->ar_flags); goto failed; } arn->arn_stats_obj = obj; @@ -723,9 +723,8 @@ skmem_arena_create_for_nexus(const struct nexus_adapter *na, if ((obj = skmem_region_alloc(skr, &maddr, NULL, NULL, SKMEM_SLEEP, skr->skr_c_obj_size, &msize)) == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to alloc " - "flowadv", ar->ar_name, SK_KVA(ar), ar->ar_flags, - ARF_BITS); + SK_ERR("\"%s\" ar %p flags 0x%x failed to alloc " + "flowadv", ar->ar_name, SK_KVA(ar), ar->ar_flags); goto failed; } /* XXX -fbounds-safety: should get the count elsewhere */ @@ -734,8 +733,8 @@ skmem_arena_create_for_nexus(const struct nexus_adapter *na, } if (skmem_arena_create_finalize(ar) != 0) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to finalize", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS); + SK_ERR("\"%s\" ar %p flags 0x%x failed to finalize", + ar->ar_name, SK_KVA(ar), ar->ar_flags); goto failed; } @@ -976,17 +975,11 @@ skmem_arena_create_for_necp(const char *name, if ((ar->ar_regions[SKMEM_REGION_USTATS] = skmem_region_create(name, srp_ustats, NULL, NULL, NULL)) == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to create %s region", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS, - srp_ustats->srp_name); goto failed; } if ((ar->ar_regions[SKMEM_REGION_KSTATS] = skmem_region_create(name, srp_kstats, NULL, NULL, NULL)) == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to create %s region", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS, - srp_kstats->srp_name); goto failed; } @@ -999,14 +992,10 @@ skmem_arena_create_for_necp(const char *name, srp_kstats->srp_c_obj_size, 0, necp_stats_ctor, NULL, NULL, NULL, 
ar->ar_regions[SKMEM_REGION_KSTATS], SKMEM_CR_NOMAGAZINES)) == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to create %s", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS, cname); goto failed; } if (skmem_arena_create_finalize(ar) != 0) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to finalize", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS); goto failed; } @@ -1052,6 +1041,8 @@ skmem_arena_create_for_necp(const char *name, return ar; failed: + SK_ERR("\"%s\" ar %p flags 0x%x failed to create %s region", + ar->ar_name, SK_KVA(ar), ar->ar_flags, srp_kstats->srp_name); AR_LOCK_ASSERT_HELD(ar); skmem_arena_destroy(ar); *perr = ENOMEM; @@ -1154,8 +1145,8 @@ skmem_arena_create_for_system(const char *name, int *perr) ASSERT(ars->ars_sysctls_objsize != 0); if (skmem_arena_create_finalize(ar) != 0) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to finalize", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS); + SK_ERR("\"%s\" ar %p flags 0x%x failed to finalize", + ar->ar_name, SK_KVA(ar), ar->ar_flags); goto failed; } @@ -1289,8 +1280,8 @@ skmem_arena_destroy(struct skmem_arena *ar) { AR_LOCK_ASSERT_HELD(ar); - SK_DF(SK_VERB_MEM_ARENA, "\"%s\" ar 0x%llx flags %b", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS); + SK_DF(SK_VERB_MEM_ARENA, "\"%s\" ar %p flags 0x%x", + ar->ar_name, SK_KVA(ar), ar->ar_flags); ASSERT(ar->ar_refcnt == 0); if (ar->ar_link.tqe_next != NULL || ar->ar_link.tqe_prev != NULL) { @@ -1397,9 +1388,8 @@ skmem_arena_create_finalize(struct skmem_arena *ar) */ ar->ar_ar = IOSKArenaCreate(reg, (IOSKCount)regcnt); if (ar->ar_ar == NULL) { - SK_ERR("\"%s\" ar 0x%llx flags %b failed to create " - "IOSKArena of %u regions", ar->ar_name, SK_KVA(ar), - ar->ar_flags, ARF_BITS, regcnt); + SK_ERR("\"%s\" ar %p flags 0x%x failed to create IOSKArena of" + "%u regions", ar->ar_name, SK_KVA(ar), ar->ar_flags, regcnt); err = ENOMEM; goto failed; } @@ -1713,9 +1703,9 @@ skmem_arena_mredirect(struct skmem_arena *ar, struct skmem_arena_mmap_info *ami, AR_UNLOCK(ar); SK_DF(((err != 0) ? 
SK_VERB_ERROR : SK_VERB_DEFAULT), - "%s(%d) \"%s\" ar 0x%llx flags %b inactive %u need_defunct %u " - "err %d", sk_proc_name_address(p), sk_proc_pid(p), ar->ar_name, - SK_KVA(ar), ar->ar_flags, ARF_BITS, !(ar->ar_flags & ARF_ACTIVE), + "%s(%d) \"%s\" ar %p flags 0x%x inactive %u need_defunct %u " + "err %d", sk_proc_name(p), sk_proc_pid(p), ar->ar_name, + SK_KVA(ar), ar->ar_flags, !(ar->ar_flags & ARF_ACTIVE), *need_defunct, err); return err; @@ -1729,8 +1719,8 @@ skmem_arena_defunct(struct skmem_arena *ar) { AR_LOCK(ar); - SK_DF(SK_VERB_MEM_ARENA, "\"%s\" ar 0x%llx flags 0x%b", ar->ar_name, - SK_KVA(ar), ar->ar_flags, ARF_BITS); + SK_DF(SK_VERB_MEM_ARENA, "\"%s\" ar %p flags 0x%x", ar->ar_name, + SK_KVA(ar), ar->ar_flags); if (ar->ar_flags & ARF_DEFUNCT) { AR_UNLOCK(ar); @@ -1892,16 +1882,16 @@ skmem_arena_create_region_log(struct skmem_arena *ar) switch (ar->ar_type) { case SKMEM_ARENA_TYPE_NEXUS: - SK_D("\"%s\" ar 0x%llx flags %b rx_pp 0x%llx tx_pp 0x%llu", - ar->ar_name, SK_KVA(ar), ar->ar_flags, ARF_BITS, + SK_D("\"%s\" ar %p flags 0x%x rx_pp %p tx_pp %p", + ar->ar_name, SK_KVA(ar), ar->ar_flags, SK_KVA(skmem_arena_nexus(ar)->arn_rx_pp), SK_KVA(skmem_arena_nexus(ar)->arn_tx_pp)); break; case SKMEM_ARENA_TYPE_NECP: case SKMEM_ARENA_TYPE_SYSTEM: - SK_D("\"%s\" ar 0x%llx flags %b", ar->ar_name, - SK_KVA(ar), ar->ar_flags, ARF_BITS); + SK_D("\"%s\" ar %p flags 0x%x", ar->ar_name, SK_KVA(ar), + ar->ar_flags); break; } @@ -2046,3 +2036,14 @@ skmem_arena_mib_get_sysctl SYSCTL_HANDLER_ARGS return error; } + +SK_NO_INLINE_ATTRIBUTE +char * +ar2str(const struct skmem_arena *ar, char *__counted_by(dsz)dst, + size_t dsz) +{ + (void) sk_snprintf(dst, dsz, "%p %s flags 0x%b", + SK_KVA(ar), ar->ar_name, ar->ar_flags, ARF_BITS); + + return dst; +} diff --git a/bsd/skywalk/mem/skmem_arena_var.h b/bsd/skywalk/mem/skmem_arena_var.h index f6241544b..7ff2cd614 100644 --- a/bsd/skywalk/mem/skmem_arena_var.h +++ b/bsd/skywalk/mem/skmem_arena_var.h @@ -266,6 +266,8 @@ extern void skmem_arena_get_stats(struct skmem_arena *, uint64_t *, extern mach_vm_offset_t skmem_arena_get_region_offset(struct skmem_arena *, skmem_region_id_t); extern void skmem_arena_reap(struct skmem_arena *, boolean_t); +extern char * ar2str(const struct skmem_arena *ar, char *__counted_by(dsz)dst, + size_t dsz); __END_DECLS #endif /* BSD_KERNEL_PRIVATE */ #endif /* _SKYWALK_MEM_SKMEMARENAVAR_H */ diff --git a/bsd/skywalk/mem/skmem_cache.c b/bsd/skywalk/mem/skmem_cache.c index bd1c8e721..c6c9e7187 100644 --- a/bsd/skywalk/mem/skmem_cache.c +++ b/bsd/skywalk/mem/skmem_cache.c @@ -31,6 +31,7 @@ #include /* for PE_parse_boot_argn */ #include /* for OSBacktrace */ #include /* for assert_wait */ +#include #include /* @@ -358,7 +359,7 @@ skmem_cache_pre_init(void) #endif /* KASAN */ skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX); skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size, - ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE); + ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE); } TAILQ_INIT(&skmem_cache_head); @@ -376,27 +377,27 @@ skmem_cache_init(void) struct skmem_magtype *mtp; uint32_t i; - _CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL); + static_assert(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL); - _CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES); - _CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT); - _CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT); - _CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH); - _CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC); - _CASSERT(SKM_MODE_CLEARONFREE == 
SCA_MODE_CLEARONFREE); - _CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO); + static_assert(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES); + static_assert(SKM_MODE_AUDIT == SCA_MODE_AUDIT); + static_assert(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT); + static_assert(SKM_MODE_BATCH == SCA_MODE_BATCH); + static_assert(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC); + static_assert(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE); + static_assert(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO); ASSERT(__skmem_cache_pre_inited); ASSERT(!__skmem_cache_inited); - _CASSERT(offsetof(struct skmem_bufctl, bc_addr) == offsetof(struct skmem_bufctl_audit, bc_addr)); - _CASSERT(offsetof(struct skmem_bufctl, bc_addrm) == offsetof(struct skmem_bufctl_audit, bc_addrm)); - _CASSERT(offsetof(struct skmem_bufctl, bc_slab) == offsetof(struct skmem_bufctl_audit, bc_slab)); - _CASSERT(offsetof(struct skmem_bufctl, bc_lim) == offsetof(struct skmem_bufctl_audit, bc_lim)); - _CASSERT(offsetof(struct skmem_bufctl, bc_flags) == offsetof(struct skmem_bufctl_audit, bc_flags)); - _CASSERT(offsetof(struct skmem_bufctl, bc_idx) == offsetof(struct skmem_bufctl_audit, bc_idx)); - _CASSERT(offsetof(struct skmem_bufctl, bc_usecnt) == offsetof(struct skmem_bufctl_audit, bc_usecnt)); - _CASSERT(sizeof(struct skmem_bufctl) == offsetof(struct skmem_bufctl_audit, bc_thread)); + static_assert(offsetof(struct skmem_bufctl, bc_addr) == offsetof(struct skmem_bufctl_audit, bc_addr)); + static_assert(offsetof(struct skmem_bufctl, bc_addrm) == offsetof(struct skmem_bufctl_audit, bc_addrm)); + static_assert(offsetof(struct skmem_bufctl, bc_slab) == offsetof(struct skmem_bufctl_audit, bc_slab)); + static_assert(offsetof(struct skmem_bufctl, bc_lim) == offsetof(struct skmem_bufctl_audit, bc_lim)); + static_assert(offsetof(struct skmem_bufctl, bc_flags) == offsetof(struct skmem_bufctl_audit, bc_flags)); + static_assert(offsetof(struct skmem_bufctl, bc_idx) == offsetof(struct skmem_bufctl_audit, bc_idx)); + static_assert(offsetof(struct skmem_bufctl, bc_usecnt) == offsetof(struct skmem_bufctl_audit, bc_usecnt)); + static_assert(sizeof(struct skmem_bufctl) == offsetof(struct skmem_bufctl_audit, bc_thread)); PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug)); skmem_debug &= SKMEM_DEBUG_MASK; @@ -678,7 +679,7 @@ skmem_cache_create(const char *name, size_t bufsize, size_t bufalign, * are mappable to user space (we can't leak kernel * addresses). 
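 *
 * (Illustrative aside, not part of the original comment: the compile-time
 * check below encodes the layout assumption that the freelist link is the
 * very first member of the object, roughly
 *
 *	struct skmem_obj {
 *		struct skmem_obj *mo_next;	// must stay at offset 0
 *		// ...
 *	};
 *	static_assert(offsetof(struct skmem_obj, mo_next) == 0);
 *
 * so a freed object can carry its own freelist linkage; the struct body
 * shown here is a sketch, not the real definition.)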
*/ - _CASSERT(offsetof(struct skmem_obj, mo_next) == 0); + static_assert(offsetof(struct skmem_obj, mo_next) == 0); VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK)); /* batching is currently not supported on pseudo regions */ @@ -807,8 +808,8 @@ skmem_cache_create(const char *name, size_t bufsize, size_t bufalign, TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link); SKMEM_CACHE_UNLOCK(); - SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b", - skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS); + SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm %p mode 0x%x", + skm->skm_name, SK_KVA(skm), skm->skm_mode); SK_DF(SK_VERB_MEM_CACHE, " bufsz %u bufalign %u chunksz %u objsz %u slabsz %u", (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign, @@ -890,7 +891,7 @@ skmem_cache_destroy(struct skmem_cache *skm) lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp); lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp); - SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx", + SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm %p", skm->skm_name, SK_KVA(skm)); /* callee releases reference */ @@ -1831,7 +1832,7 @@ skmem_cache_magazine_purge(struct skmem_cache *skm) SKM_SLAB_LOCK_ASSERT_NOTHELD(skm); - SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm)); + SK_DF(SK_VERB_MEM_CACHE, "skm %p", SK_KVA(skm)); for (cpuid = 0; cpuid < ncpu; cpuid++) { cp = &skm->skm_cpu_cache[cpuid]; @@ -1894,7 +1895,7 @@ skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg) SKM_CPU_UNLOCK(cp); } - SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d", + SK_DF(SK_VERB_MEM_CACHE, "skm %p chunksize %u magsize %d", SK_KVA(skm), (uint32_t)skm->skm_chunksize, SKMEM_CPU_CACHE(skm)->cp_magsize); } @@ -1925,7 +1926,7 @@ skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep) SKM_RESIZE_UNLOCK(skm); (void) thread_block(THREAD_CONTINUE_NULL); SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" " - "(0x%llx) busy=%u", skm->skm_name, + "(%p) busy=%u", skm->skm_name, SK_KVA(skm), skm->skm_rs_busy); SKM_RESIZE_LOCK(skm); } @@ -2079,7 +2080,7 @@ skmem_cache_hash_rescale(struct skmem_cache *skm) } SK_DF(SK_VERB_MEM_CACHE, - "skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm), + "skm %p old_size %u new_size %u [%u moved]", SK_KVA(skm), (uint32_t)old_size, (uint32_t)new_size, moved); SKM_SLAB_UNLOCK(skm); diff --git a/bsd/skywalk/mem/skmem_region.c b/bsd/skywalk/mem/skmem_region.c index f436e33be..f91828d92 100644 --- a/bsd/skywalk/mem/skmem_region.c +++ b/bsd/skywalk/mem/skmem_region.c @@ -115,6 +115,8 @@ #define _FN_KPRINTF /* don't redefine kprintf() */ #include /* for PE_parse_boot_argn */ +#include + static void skmem_region_destroy(struct skmem_region *skr); static void skmem_region_depopulate(struct skmem_region *); static int sksegment_cmp(const struct sksegment *, const struct sksegment *); @@ -252,88 +254,88 @@ skmem_region_init(void) { boolean_t randomize_seg_size; - _CASSERT(sizeof(bitmap_t) == sizeof(uint64_t)); - _CASSERT(BMAPSZ == (sizeof(bitmap_t) << 3)); - _CASSERT((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0); - _CASSERT(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL); + static_assert(sizeof(bitmap_t) == sizeof(uint64_t)); + static_assert(BMAPSZ == (sizeof(bitmap_t) << 3)); + static_assert((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0); + static_assert(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL); ASSERT(!__skmem_region_inited); /* enforce the ordering here */ - _CASSERT(SKMEM_REGION_GUARD_HEAD == 0); - _CASSERT(SKMEM_REGION_SCHEMA == 1); - _CASSERT(SKMEM_REGION_RING == 2); - 
_CASSERT(SKMEM_REGION_BUF_DEF == 3); - _CASSERT(SKMEM_REGION_BUF_LARGE == 4); - _CASSERT(SKMEM_REGION_RXBUF_DEF == 5); - _CASSERT(SKMEM_REGION_RXBUF_LARGE == 6); - _CASSERT(SKMEM_REGION_TXBUF_DEF == 7); - _CASSERT(SKMEM_REGION_TXBUF_LARGE == 8); - _CASSERT(SKMEM_REGION_UMD == 9); - _CASSERT(SKMEM_REGION_TXAUSD == 10); - _CASSERT(SKMEM_REGION_RXFUSD == 11); - _CASSERT(SKMEM_REGION_UBFT == 12); - _CASSERT(SKMEM_REGION_USTATS == 13); - _CASSERT(SKMEM_REGION_FLOWADV == 14); - _CASSERT(SKMEM_REGION_NEXUSADV == 15); - _CASSERT(SKMEM_REGION_SYSCTLS == 16); - _CASSERT(SKMEM_REGION_GUARD_TAIL == 17); - _CASSERT(SKMEM_REGION_KMD == 18); - _CASSERT(SKMEM_REGION_RXKMD == 19); - _CASSERT(SKMEM_REGION_TXKMD == 20); - _CASSERT(SKMEM_REGION_KBFT == 21); - _CASSERT(SKMEM_REGION_RXKBFT == 22); - _CASSERT(SKMEM_REGION_TXKBFT == 23); - _CASSERT(SKMEM_REGION_TXAKSD == 24); - _CASSERT(SKMEM_REGION_RXFKSD == 25); - _CASSERT(SKMEM_REGION_KSTATS == 26); - _CASSERT(SKMEM_REGION_INTRINSIC == 27); + static_assert(SKMEM_REGION_GUARD_HEAD == 0); + static_assert(SKMEM_REGION_SCHEMA == 1); + static_assert(SKMEM_REGION_RING == 2); + static_assert(SKMEM_REGION_BUF_DEF == 3); + static_assert(SKMEM_REGION_BUF_LARGE == 4); + static_assert(SKMEM_REGION_RXBUF_DEF == 5); + static_assert(SKMEM_REGION_RXBUF_LARGE == 6); + static_assert(SKMEM_REGION_TXBUF_DEF == 7); + static_assert(SKMEM_REGION_TXBUF_LARGE == 8); + static_assert(SKMEM_REGION_UMD == 9); + static_assert(SKMEM_REGION_TXAUSD == 10); + static_assert(SKMEM_REGION_RXFUSD == 11); + static_assert(SKMEM_REGION_UBFT == 12); + static_assert(SKMEM_REGION_USTATS == 13); + static_assert(SKMEM_REGION_FLOWADV == 14); + static_assert(SKMEM_REGION_NEXUSADV == 15); + static_assert(SKMEM_REGION_SYSCTLS == 16); + static_assert(SKMEM_REGION_GUARD_TAIL == 17); + static_assert(SKMEM_REGION_KMD == 18); + static_assert(SKMEM_REGION_RXKMD == 19); + static_assert(SKMEM_REGION_TXKMD == 20); + static_assert(SKMEM_REGION_KBFT == 21); + static_assert(SKMEM_REGION_RXKBFT == 22); + static_assert(SKMEM_REGION_TXKBFT == 23); + static_assert(SKMEM_REGION_TXAKSD == 24); + static_assert(SKMEM_REGION_RXFKSD == 25); + static_assert(SKMEM_REGION_KSTATS == 26); + static_assert(SKMEM_REGION_INTRINSIC == 27); - _CASSERT(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD); - _CASSERT(SREG_SCHEMA == SKMEM_REGION_SCHEMA); - _CASSERT(SREG_RING == SKMEM_REGION_RING); - _CASSERT(SREG_BUF_DEF == SKMEM_REGION_BUF_DEF); - _CASSERT(SREG_BUF_LARGE == SKMEM_REGION_BUF_LARGE); - _CASSERT(SREG_RXBUF_DEF == SKMEM_REGION_RXBUF_DEF); - _CASSERT(SREG_RXBUF_LARGE == SKMEM_REGION_RXBUF_LARGE); - _CASSERT(SREG_TXBUF_DEF == SKMEM_REGION_TXBUF_DEF); - _CASSERT(SREG_TXBUF_LARGE == SKMEM_REGION_TXBUF_LARGE); - _CASSERT(SREG_UMD == SKMEM_REGION_UMD); - _CASSERT(SREG_TXAUSD == SKMEM_REGION_TXAUSD); - _CASSERT(SREG_RXFUSD == SKMEM_REGION_RXFUSD); - _CASSERT(SREG_UBFT == SKMEM_REGION_UBFT); - _CASSERT(SREG_USTATS == SKMEM_REGION_USTATS); - _CASSERT(SREG_FLOWADV == SKMEM_REGION_FLOWADV); - _CASSERT(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV); - _CASSERT(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS); - _CASSERT(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL); - _CASSERT(SREG_KMD == SKMEM_REGION_KMD); - _CASSERT(SREG_RXKMD == SKMEM_REGION_RXKMD); - _CASSERT(SREG_TXKMD == SKMEM_REGION_TXKMD); - _CASSERT(SREG_KBFT == SKMEM_REGION_KBFT); - _CASSERT(SREG_RXKBFT == SKMEM_REGION_RXKBFT); - _CASSERT(SREG_TXKBFT == SKMEM_REGION_TXKBFT); - _CASSERT(SREG_TXAKSD == SKMEM_REGION_TXAKSD); - _CASSERT(SREG_RXFKSD == SKMEM_REGION_RXFKSD); - _CASSERT(SREG_KSTATS == 
SKMEM_REGION_KSTATS); + static_assert(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD); + static_assert(SREG_SCHEMA == SKMEM_REGION_SCHEMA); + static_assert(SREG_RING == SKMEM_REGION_RING); + static_assert(SREG_BUF_DEF == SKMEM_REGION_BUF_DEF); + static_assert(SREG_BUF_LARGE == SKMEM_REGION_BUF_LARGE); + static_assert(SREG_RXBUF_DEF == SKMEM_REGION_RXBUF_DEF); + static_assert(SREG_RXBUF_LARGE == SKMEM_REGION_RXBUF_LARGE); + static_assert(SREG_TXBUF_DEF == SKMEM_REGION_TXBUF_DEF); + static_assert(SREG_TXBUF_LARGE == SKMEM_REGION_TXBUF_LARGE); + static_assert(SREG_UMD == SKMEM_REGION_UMD); + static_assert(SREG_TXAUSD == SKMEM_REGION_TXAUSD); + static_assert(SREG_RXFUSD == SKMEM_REGION_RXFUSD); + static_assert(SREG_UBFT == SKMEM_REGION_UBFT); + static_assert(SREG_USTATS == SKMEM_REGION_USTATS); + static_assert(SREG_FLOWADV == SKMEM_REGION_FLOWADV); + static_assert(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV); + static_assert(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS); + static_assert(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL); + static_assert(SREG_KMD == SKMEM_REGION_KMD); + static_assert(SREG_RXKMD == SKMEM_REGION_RXKMD); + static_assert(SREG_TXKMD == SKMEM_REGION_TXKMD); + static_assert(SREG_KBFT == SKMEM_REGION_KBFT); + static_assert(SREG_RXKBFT == SKMEM_REGION_RXKBFT); + static_assert(SREG_TXKBFT == SKMEM_REGION_TXKBFT); + static_assert(SREG_TXAKSD == SKMEM_REGION_TXAKSD); + static_assert(SREG_RXFKSD == SKMEM_REGION_RXFKSD); + static_assert(SREG_KSTATS == SKMEM_REGION_KSTATS); - _CASSERT(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT); - _CASSERT(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK); - _CASSERT(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY); - _CASSERT(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY); - _CASSERT(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT); - _CASSERT(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC); - _CASSERT(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES); - _CASSERT(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE); - _CASSERT(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN); - _CASSERT(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT); - _CASSERT(SKR_MODE_GUARD == SREG_MODE_GUARD); - _CASSERT(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG); - _CASSERT(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK); - _CASSERT(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA); - _CASSERT(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO); - _CASSERT(SKR_MODE_THREADSAFE == SREG_MODE_THREADSAFE); - _CASSERT(SKR_MODE_SLAB == SREG_MODE_SLAB); - _CASSERT(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED); + static_assert(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT); + static_assert(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK); + static_assert(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY); + static_assert(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY); + static_assert(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT); + static_assert(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC); + static_assert(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES); + static_assert(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE); + static_assert(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN); + static_assert(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT); + static_assert(SKR_MODE_GUARD == SREG_MODE_GUARD); + static_assert(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG); + static_assert(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK); + static_assert(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA); + static_assert(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO); + static_assert(SKR_MODE_THREADSAFE == SREG_MODE_THREADSAFE); + static_assert(SKR_MODE_SLAB == SREG_MODE_SLAB); + static_assert(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED); (void) PE_parse_boot_argn("skmem_seg_size", 
&skmem_seg_size, sizeof(skmem_seg_size)); @@ -388,7 +390,7 @@ skmem_region_init(void) SKMEM_MIN_SEG_SIZE); VERIFY((skmem_usr_buf_seg_size % SKMEM_PAGE_SIZE) == 0); - SK_ERR("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], " + SK_D("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], " "usr_buf_seg_size %u", skmem_seg_size, skmem_md_seg_size, skmem_drv_buf_seg_size, skmem_drv_buf_seg_eff_size, skmem_usr_buf_seg_size); @@ -734,6 +736,7 @@ skmem_region_create(const char *name, struct skmem_region_params *srp, skr->skr_r_obj_cnt = srp->srp_r_obj_cnt; skr->skr_c_obj_size = srp->srp_c_obj_size; skr->skr_c_obj_cnt = srp->srp_c_obj_cnt; + skr->skr_memtotal = skr->skr_seg_size * srp->srp_seg_cnt; skr->skr_params.srp_md_type = srp->srp_md_type; skr->skr_params.srp_md_subtype = srp->srp_md_subtype; @@ -754,7 +757,7 @@ skmem_region_create(const char *name, struct skmem_region_params *srp, (void) snprintf(skr->skr_name, sizeof(skr->skr_name), "%s.%s.%s", SKMEM_REGION_PREFIX, srp->srp_name, name); - SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ", + SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr %p ", skr->skr_name, SK_KVA(skr)); /* sanity check */ @@ -879,10 +882,9 @@ skmem_region_create(const char *name, struct skmem_region_params *srp, if ((skr->skr_reg = IOSKRegionCreate(&skr->skr_regspec, (IOSKSize)skr->skr_seg_size, (IOSKCount)skr->skr_seg_max_cnt)) == NULL) { - SK_ERR("\%s\": [%u * %u] cflags 0x%b skr_reg failed", + SK_ERR("\%s\": [%u * %u] cflags 0x%x skr_reg failed", skr->skr_name, (uint32_t)skr->skr_seg_size, - (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags, - SKMEM_REGION_CR_BITS); + (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags); goto failed; } } @@ -897,10 +899,10 @@ skmem_region_create(const char *name, struct skmem_region_params *srp, SKMEM_REGION_UNLOCK(); SK_DF(SK_VERB_MEM_REGION, - " [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%b", + " [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%x", (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt, (uint32_t)skr->skr_c_obj_size, (uint32_t)skr->skr_c_obj_cnt, - skr->skr_cflags, SKMEM_REGION_CR_BITS); + skr->skr_cflags); return skr; @@ -921,7 +923,7 @@ skmem_region_destroy(struct skmem_region *skr) SKR_LOCK_ASSERT_HELD(skr); - SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx", + SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr %p", skr->skr_name, SK_KVA(skr)); /* @@ -1020,7 +1022,7 @@ void skmem_region_mirror(struct skmem_region *skr, struct skmem_region *mskr) { ASSERT(mskr != NULL); - SK_DF(SK_VERB_MEM_REGION, "skr master 0x%llx, slave 0x%llx ", + SK_DF(SK_VERB_MEM_REGION, "skr master %p, slave %p ", SK_KVA(skr), SK_KVA(mskr)); SKR_LOCK(skr); @@ -1295,16 +1297,15 @@ retry: ASSERT((mach_vm_address_t)addr == sg->sg_start); #if SK_LOG - SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx", + SK_DF(SK_VERB_MEM_REGION, "skr %p sg %p", SK_KVA(skr), SK_KVA(sg)); if (skr->skr_mirror == NULL || !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) { - SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)", + SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p)", sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end)); } else { - SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) mirrored", - sg->sg_index, SK_KVA(sg), SK_KVA(sg->sg_start), - SK_KVA(sg->sg_end)); + SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p) mirrored", + sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end)); } #endif /* SK_LOG */ @@ -1401,9 +1402,9 @@ skmem_region_mirror_alloc(struct skmem_region *skr, struct sksegment *sg0, addr = skmem_region_alloc_common(skr, sg, skr->skr_seg_size); #if SK_LOG - 
SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx", + SK_DF(SK_VERB_MEM_REGION, "skr %p sg %p", SK_KVA(skr), SK_KVA(sg)); - SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)", + SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p)", sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end)); #endif /* SK_LOG */ @@ -1459,16 +1460,16 @@ skmem_region_free(struct skmem_region *skr, void *addr, void *maddr) skr->skr_free++; #if SK_LOG - SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx", + SK_DF(SK_VERB_MEM_REGION, "skr %p sg %p", SK_KVA(skr), SK_KVA(sg)); if (skr->skr_mirror == NULL || !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) { - SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)", + SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p)", sg->sg_index, SK_KVA(addr), SK_KVA((uintptr_t)addr + skr->skr_seg_size)); } else { - SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) mirrored", - sg->sg_index, SK_KVA(sg), SK_KVA(addr), + SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p) mirrored", + sg->sg_index, SK_KVA(addr), SK_KVA((uintptr_t)addr + skr->skr_seg_size)); } #endif /* SK_LOG */ @@ -1487,7 +1488,7 @@ skmem_region_free(struct skmem_region *skr, void *addr, void *maddr) /* wake up any blocked threads waiting for a segment */ if (skr->skr_seg_waiters != 0) { SK_DF(SK_VERB_MEM_REGION, - "sg 0x%llx waking up %u waiters", SK_KVA(sg), + "sg %p waking up %u waiters", SK_KVA(sg), skr->skr_seg_waiters); skr->skr_seg_waiters = 0; wakeup(&skr->skr_seg_free); @@ -1552,7 +1553,7 @@ skmem_region_depopulate(struct skmem_region *skr) { struct sksegment *sg, *tsg; - SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ", + SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr %p ", skr->skr_name, SK_KVA(skr)); SKR_LOCK_ASSERT_HELD(skr); @@ -1611,9 +1612,8 @@ sksegment_create(struct skmem_region *skr, uint32_t i) /* claim it (clear bit) */ bit_clear(*bmap, i % BMAPSZ); - SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) 0x%b", i, - SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode, - SKR_MODE_BITS); + SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p) 0x%x", i, + SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode); return sg; } @@ -1641,9 +1641,8 @@ sksegment_destroy(struct skmem_region *skr, struct sksegment *sg) bmap = &skr->skr_seg_bmap[i / BMAPSZ]; ASSERT(!bit_test(*bmap, i % BMAPSZ)); - SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) 0x%b", - i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), - skr->skr_mode, SKR_MODE_BITS); + SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p) 0x%x", + i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode); /* * Undo what's done earlier at segment creation time. @@ -1725,9 +1724,6 @@ sksegment_freelist_insert(struct skmem_region *skr, struct sksegment *sg, sg->sg_md = NULL; sg->sg_start = sg->sg_end = 0; sg->sg_state = SKSEG_STATE_DETACHED; - - ASSERT(skr->skr_memtotal >= skr->skr_seg_size); - skr->skr_memtotal -= skr->skr_seg_size; } sg->sg_type = SKSEG_TYPE_FREE; @@ -1770,7 +1766,7 @@ sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg, if (__improbable(mtbf != 0 && !purging && (net_uptime_ms() % mtbf) == 0 && !(skmflag & SKMEM_PANIC))) { - SK_ERR("skr \"%s\" 0x%llx sg 0x%llx MTBF failure", + SK_ERR("skr \"%s\" %p sg %p MTBF failure", skr->skr_name, SK_KVA(skr), SK_KVA(sg)); net_update_uptime(); return NULL; @@ -1867,8 +1863,6 @@ sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg, sg->sg_state = IOSKBufferIsWired(sg->sg_md) ? 
SKSEG_STATE_MAPPED_WIRED : SKSEG_STATE_MAPPED; - skr->skr_memtotal += skr->skr_seg_size; - ASSERT(sg->sg_md != NULL); ASSERT(sg->sg_start != 0 && sg->sg_end != 0); @@ -2026,8 +2020,8 @@ skmem_region_hash_rescale(struct skmem_region *skr) } SK_DF(SK_VERB_MEM_REGION, - "skr 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skr), - (uint32_t)old_size, (uint32_t)new_size, moved); + "skr %p old_size %zu new_size %zu [%u moved]", SK_KVA(skr), + old_size, new_size, moved); SKR_UNLOCK(skr); @@ -2385,7 +2379,7 @@ skmem_region_mtbf_sysctl(struct sysctl_oid *oidp, void *arg1, int arg2, int changed, error; uint64_t newval; - _CASSERT(sizeof(skmem_region_mtbf) == sizeof(uint64_t)); + static_assert(sizeof(skmem_region_mtbf) == sizeof(uint64_t)); if ((error = sysctl_io_number(req, skmem_region_mtbf, sizeof(uint64_t), &newval, &changed)) == 0) { if (changed) { diff --git a/bsd/skywalk/mem/skmem_slab.c b/bsd/skywalk/mem/skmem_slab.c index 02d8ff331..9d340eaf5 100644 --- a/bsd/skywalk/mem/skmem_slab.c +++ b/bsd/skywalk/mem/skmem_slab.c @@ -31,6 +31,7 @@ #include /* for PE_parse_boot_argn */ #include /* for OSBacktrace */ #include /* for assert_wait */ +#include #include static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t); @@ -132,9 +133,9 @@ skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag) --chunks; } - SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx", + SK_DF(SK_VERB_MEM_CACHE, "skm %p sl %p", SK_KVA(skm), SK_KVA(sl)); - SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index, + SK_DF(SK_VERB_MEM_CACHE, " [%u] [%p-%p)", sl->sl_seg->sg_index, SK_KVA(slab), SK_KVA(slab + objsize)); return sl; @@ -164,9 +165,9 @@ skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl) ASSERT(sl->sl_refcnt == 0); - SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx", + SK_DF(SK_VERB_MEM_CACHE, "skm %p sl %p", SK_KVA(skm), SK_KVA(sl)); - SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index, + SK_DF(SK_VERB_MEM_CACHE, " [%u] [%p-%p)", sl->sl_seg->sg_index, SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize)); /* diff --git a/bsd/skywalk/mem/skmem_test.c b/bsd/skywalk/mem/skmem_test.c index 278700062..b3287b01f 100644 --- a/bsd/skywalk/mem/skmem_test.c +++ b/bsd/skywalk/mem/skmem_test.c @@ -71,7 +71,7 @@ struct skmt_thread_info { kern_packet_t sti_mpc; /* cloned packet */ thread_t sti_thread; /* thread instance */ boolean_t sti_nosleep; /* non-sleeping allocation */ -} __attribute__((aligned(CHANNEL_CACHE_ALIGN_MAX))); +} __sk_aligned(CHANNEL_CACHE_ALIGN_MAX); static struct skmt_thread_info *skmth_info; static uint32_t skmth_info_size; @@ -925,7 +925,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_eeo_ref != csum) { SK_ERR("pkt_copypkt_sum: csum_eeo_mismatch 0x%x, " - "0x%x, 0x%llx", csum_eeo_ref, csum, + "0x%x, %p", csum_eeo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb))); } VERIFY(csum_eeo_ref == csum); @@ -941,7 +941,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_eoe_ref != csum) { SK_ERR("pkt_copypkt_sum: csum_eoe_mismatch 0x%x, " - "0x%x, 0x%llx", csum_eoe_ref, csum, + "0x%x, %p", csum_eoe_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb))); } VERIFY(csum_eoe_ref == csum); @@ -956,7 +956,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_eoo_ref != csum) { SK_ERR("pkt_copypkt_sum: csum_eoo_mismatch 0x%x, " - "0x%x, 0x%llx", csum_eoo_ref, csum, + "0x%x, %p", csum_eoo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb))); } VERIFY(csum_eoo_ref == csum); 
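/*
 * Editorial aside, not part of the xnu-12377.1.9 patch: the csum_{e,o}{e,o}{e,o}
 * reference values in these skmem_test.c hunks exercise pkt_copypkt_sum() and
 * pkt_copyaddr_sum() across even/odd combinations of source offset, destination
 * alignment, and copy length.  The sketch below is a minimal, self-contained
 * RFC 1071-style ones'-complement sum showing why those parities matter: an odd
 * offset or length forces the 16-bit accumulation to handle a stray leading or
 * trailing byte, which is the usual failure mode for fused copy-and-checksum
 * routines.  The function name and the use of <stdint.h>/<stddef.h> are
 * illustrative assumptions for a standalone example, not kernel API.
 */
#include <stddef.h>
#include <stdint.h>

static uint16_t
example_in_cksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	/* accumulate 16-bit words; an odd "len" leaves one tail byte over */
	while (len > 1) {
		sum += ((uint32_t)buf[0] << 8) | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len == 1) {
		/* pad the odd tail byte with zero in the low-order position */
		sum += (uint32_t)buf[0] << 8;
	}

	/* fold the carries back into 16 bits (ones'-complement addition) */
	while (sum >> 16) {
		sum = (sum & 0xffff) + (sum >> 16);
	}
	return (uint16_t)~sum;
}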
@@ -971,7 +971,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_oeo_ref != csum) { SK_ERR("pkt_copypkt_sum: csum_oeo_mismatch 0x%x, " - "0x%x, 0x%llx", csum_oeo_ref, csum, + "0x%x, %p", csum_oeo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb))); } VERIFY(csum_oeo_ref == csum); @@ -985,7 +985,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_ooo_ref != csum) { SK_ERR("pkt_copypkt_sum: csum_ooo_mismatch 0x%x, " - "0x%x, 0x%llx", csum_ooo_ref, csum, + "0x%x, %p", csum_ooo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb))); } VERIFY(csum_ooo_ref == csum); @@ -999,7 +999,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_ooe_ref != csum) { SK_ERR("pkt_copypkt_sum: csum_ooe_mismatch 0x%x, " - "0x%x, 0x%llx", csum_ooe_ref, csum, + "0x%x, %p", csum_ooe_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb))); } VERIFY(csum_ooe_ref == csum); @@ -1014,7 +1014,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_ooe_ref != csum) { SK_ERR("pkt_copypkt_sum: csum_oee_mismatch 0x%x, " - "0x%x, 0x%llx", csum_oee_ref, csum, + "0x%x, %p", csum_oee_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb))); } VERIFY(csum_oee_ref == csum); @@ -1031,7 +1031,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_eee_ref != csum) { SK_ERR("pkt_copypkt_sum: csum_eee_mismatch 0x%x, " - "0x%x, 0x%llx", csum_eee_ref, csum, + "0x%x, %p", csum_eee_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb))); } VERIFY(csum_eee_ref == csum); @@ -1040,7 +1040,7 @@ skmem_packet_tests(uint32_t flags) csum = pkt_copyaddr_sum(ph_mb, 0, buffer, len - 1, TRUE, 0, NULL); if (csum_eeo_ref != csum) { SK_ERR("pkt_copyaddr_sum: csum_eeo_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_eeo_ref, + "0x%x, 0x%x, %p, %p", csum_eeo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(buffer)); } @@ -1050,7 +1050,7 @@ skmem_packet_tests(uint32_t flags) csum = pkt_copyaddr_sum(ph_mb, 0, buffer + 1, len - 1, TRUE, 0, NULL); if (csum_eoo_ref != csum) { SK_ERR("pkt_copyaddr_sum: csum_eoo_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_eoo_ref, + "0x%x, 0x%x, %p, %p", csum_eoo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(buffer)); } @@ -1059,7 +1059,7 @@ skmem_packet_tests(uint32_t flags) csum = pkt_copyaddr_sum(ph_mb, 0, buffer + 1, len - 2, TRUE, 0, NULL); if (csum_eoe_ref != csum) { SK_ERR("pkt_copyaddr_sum: csum_eoe_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_eoe_ref, + "0x%x, 0x%x, %p, %p", csum_eoe_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(buffer)); } @@ -1068,7 +1068,7 @@ skmem_packet_tests(uint32_t flags) csum = pkt_copyaddr_sum(ph_mb, 1, buffer + 1, len - 2, TRUE, 0, NULL); if (csum_ooe_ref != csum) { SK_ERR("pkt_copyaddr_sum: csum_ooe_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_ooe_ref, + "0x%x, 0x%x, %p, %p", csum_ooe_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(buffer)); } @@ -1077,7 +1077,7 @@ skmem_packet_tests(uint32_t flags) csum = pkt_copyaddr_sum(ph_mb, 1, buffer, len - 2, TRUE, 0, NULL); if (csum_oee_ref != csum) { SK_ERR("pkt_copyaddr_sum: csum_oee_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_oee_ref, + "0x%x, 0x%x, %p, %p", csum_oee_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(buffer)); } @@ -1086,7 +1086,7 @@ skmem_packet_tests(uint32_t flags) csum = pkt_copyaddr_sum(ph_mb, 1, buffer, len - 1, TRUE, 0, NULL); if (csum_oeo_ref != csum) { SK_ERR("pkt_copyaddr_sum: csum_oeo_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_oeo_ref, + "0x%x, 0x%x, %p, %p", csum_oeo_ref, 
csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(buffer)); } @@ -1095,7 +1095,7 @@ skmem_packet_tests(uint32_t flags) csum = pkt_copyaddr_sum(ph_mb, 1, buffer + 1, len - 1, TRUE, 0, NULL); if (csum_ooo_ref != csum) { SK_ERR("pkt_copyaddr_sum: csum_ooo_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_ooo_ref, + "0x%x, 0x%x, %p, %p", csum_ooo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(buffer)); } @@ -1105,7 +1105,7 @@ skmem_packet_tests(uint32_t flags) csum = pkt_copyaddr_sum(ph_mb, 0, buffer, len, TRUE, 0, NULL); if (csum_eee_ref != csum) { SK_ERR("pkt_copyaddr_sum: csum_eee_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_eee_ref, + "0x%x, 0x%x, %p, %p", csum_eee_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(buffer)); } @@ -1128,7 +1128,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_eee_ref != csum) { SK_ERR("pkt_mcopypkt_sum: csum_eee_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_eee_ref, + "0x%x, 0x%x, %p, %p", csum_eee_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(m)); } @@ -1145,7 +1145,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_eoe_ref != csum) { SK_ERR("pkt_mcopypkt_sum: csum_eoe_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_eoe_ref, + "0x%x, 0x%x, %p, %p", csum_eoe_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(m)); } @@ -1162,7 +1162,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_eoo_ref != csum) { SK_ERR("pkt_mcopypkt_sum: csum_eoo_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_eoo_ref, + "0x%x, 0x%x, %p, %p", csum_eoo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(m)); } @@ -1180,7 +1180,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_eeo_ref != csum) { SK_ERR("pkt_mcopypkt_sum: csum_eeo_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_eeo_ref, + "0x%x, 0x%x, %p, %p", csum_eeo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(m)); } @@ -1198,7 +1198,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_oeo_ref != csum) { SK_ERR("pkt_mcopypkt_sum: csum_oeo_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_oeo_ref, + "0x%x, 0x%x, %p, %p", csum_oeo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(m)); } @@ -1215,7 +1215,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_oee_ref != csum) { SK_ERR("pkt_mcopypkt_sum: csum_oee_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_oee_ref, + "0x%x, 0x%x, %p, %p", csum_oee_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(m)); } @@ -1232,7 +1232,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_ooe_ref != csum) { SK_ERR("pkt_mcopypkt_sum: csum_ooe_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_ooe_ref, + "0x%x, 0x%x, %p, %p", csum_ooe_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(m)); } @@ -1249,7 +1249,7 @@ skmem_packet_tests(uint32_t flags) VERIFY(__packet_finalize(ph_mb) == 0); if (csum_ooo_ref != csum) { SK_ERR("pkt_mcopypkt_sum: csum_ooo_mismatch " - "0x%x, 0x%x, 0x%llx, 0x%llx", csum_ooo_ref, + "0x%x, 0x%x, %p, %p", csum_ooo_ref, csum, SK_KVA(SK_PTR_ADDR_KQUM(ph_mb)), SK_KVA(m)); } @@ -1271,140 +1271,6 @@ skmem_packet_tests(uint32_t flags) ref_buffer = NULL; } -static void -skmem_quantum_tests(uint32_t flags) -{ - struct kern_pbufpool_init pp_init; - struct kern_pbufpool_memory_info pp_mem_info; - kern_pbufpool_t pp = NULL; - kern_packet_t *phary = NULL; - uint32_t phcnt = 0; - kern_packet_t ph = 0; - uint32_t i; - errno_t err; - - 
flags |= KBIF_QUANTUM; - - SK_ERR("flags 0x%x", flags); - - phary = (kern_packet_t *) kalloc_data(sizeof(kern_packet_t) * MAX_PH_ARY, - Z_WAITOK | Z_ZERO); - - bzero(&pp_init, sizeof(pp_init)); - pp_init.kbi_version = KERN_PBUFPOOL_CURRENT_VERSION; - pp_init.kbi_buf_seg_size = skmem_usr_buf_seg_size; - (void) snprintf((char *)pp_init.kbi_name, sizeof(pp_init.kbi_name), - "%s", "skmem_quantum_tests"); - pp_init.kbi_flags = (KBIF_QUANTUM | flags); - pp_init.kbi_packets = 64; - pp_init.kbi_bufsize = SKMEM_TEST_BUFSIZE; - pp_init.kbi_buflets = (64 * 2); - pp_init.kbi_ctx = NULL; - pp_init.kbi_ctx_retain = NULL; - pp_init.kbi_ctx_release = NULL; - - pp_init.kbi_max_frags = 4; - /* max_frags must be 1 for quantum type */ - VERIFY(kern_pbufpool_create(&pp_init, &pp, NULL) == EINVAL); - pp_init.kbi_max_frags = 1; - if ((flags & KBIF_QUANTUM) && (flags & KBIF_BUFFER_ON_DEMAND)) { - VERIFY(kern_pbufpool_create(&pp_init, &pp, NULL) == EINVAL); - goto done; - } - VERIFY(kern_pbufpool_create(&pp_init, &pp, NULL) == 0); - bzero(&pp_mem_info, sizeof(pp_mem_info)); - VERIFY(kern_pbufpool_get_memory_info(pp, &pp_mem_info) == 0); - VERIFY(pp_mem_info.kpm_flags & KPMF_EXTERNAL); - VERIFY(pp_mem_info.kpm_buflets >= pp_mem_info.kpm_packets); - VERIFY(pp_mem_info.kpm_packets >= 64); - VERIFY(pp_mem_info.kpm_packets <= MAX_PH_ARY); - VERIFY(pp_mem_info.kpm_max_frags == 1); - VERIFY(pp_mem_info.kpm_buflets >= 64); - VERIFY(pp_mem_info.kpm_bufsize == SKMEM_TEST_BUFSIZE); - VERIFY(kern_pbufpool_alloc(pp, 4, &ph) == EINVAL); - /* allocate and free one at a time */ - for (i = 0, phcnt = 0; i < pp_mem_info.kpm_packets; i++) { - boolean_t stop = FALSE; - /* - * This may fail if skmem_region_mtbf is set, or if - * the system is short on memory. Perform retries - * at this layer to get at least 64 packets. 
- */ - while ((err = kern_pbufpool_alloc_nosleep(pp, 1, &ph)) != 0) { - VERIFY(err == ENOMEM); - if (phcnt < 64) { - SK_ERR("retrying alloc for quantum %u", phcnt); - delay(250 * NSEC_PER_USEC); /* 1/4 sec */ - continue; - } - stop = TRUE; - break; - } - if (stop) { - break; - } - VERIFY(ph != 0); - VERIFY(kern_packet_get_data_length(ph) == 0); - VERIFY(kern_packet_get_buflet_count(ph) == 1); - phary[phcnt++] = ph; - } - VERIFY(phcnt >= 64); - for (i = 0; i < phcnt; i++) { - kern_pbufpool_free(pp, phary[i]); - phary[i] = 0; - } - /* allocate and free in batch */ - phcnt = pp_mem_info.kpm_packets; - for (;;) { - err = kern_pbufpool_alloc_batch_nosleep(pp, 1, phary, &phcnt); - VERIFY(err != EINVAL && err != ENOTSUP); - if (err == ENOMEM) { - phcnt = pp_mem_info.kpm_packets; - SK_ERR("retrying batch alloc for %u quantums", phcnt); - delay(250 * NSEC_PER_USEC); /* 1/4 sec */ - } else if (err == EAGAIN) { - SK_ERR("batch alloc for %u quantums only returned %u", - pp_mem_info.kpm_packets, phcnt); - break; - } else { - VERIFY(err == 0); - break; - } - } - VERIFY(phcnt > 0); - for (i = 0; i < phcnt; i++) { - VERIFY(phary[i] != 0); - VERIFY(kern_packet_get_data_length(phary[i]) == 0); - VERIFY(kern_packet_get_buflet_count(phary[i]) == 1); - } - kern_pbufpool_free_batch(pp, phary, phcnt); - /* allocate and free one at a time (blocking) */ - for (i = 0, phcnt = 0; i < pp_mem_info.kpm_packets; i++) { - VERIFY(kern_pbufpool_alloc(pp, 1, &ph) == 0); - VERIFY(ph != 0); - VERIFY(kern_packet_get_data_length(ph) == 0); - VERIFY(kern_packet_get_buflet_count(ph) == 1); - phary[phcnt++] = ph; - } - VERIFY(phcnt >= 64); - for (i = 0; i < phcnt; i++) { - kern_pbufpool_free(pp, phary[i]); - phary[i] = 0; - } - /* allocate and free in batch (blocking) */ - bzero(&skmt_alloccb_ctx, sizeof(skmt_alloccb_ctx)); - skmt_alloccb_ctx.stc_req = phcnt; - VERIFY(kern_pbufpool_alloc_batch_callback(pp, 1, phary, &phcnt, - skmem_test_alloccb, &skmt_alloccb_ctx) == 0); - VERIFY(skmt_alloccb_ctx.stc_idx == phcnt); - kern_pbufpool_free_batch(pp, phary, phcnt); - kern_pbufpool_destroy(pp); - pp = NULL; -done: - kfree_data(phary, sizeof(kern_packet_t) * MAX_PH_ARY); - phary = NULL; -} - static void skmem_basic_tests(void) { @@ -1550,71 +1416,6 @@ skmem_basic_tests(void) skmem_packet_tests(KBIF_VIRTUAL_DEVICE | KBIF_BUFFER_ON_DEMAND | TEST_OPTION_INHIBIT_CACHE); #endif - - /* check quantum KPIs */ - skmem_quantum_tests(0); - skmem_quantum_tests(KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_PERSISTENT); - skmem_quantum_tests(KBIF_PERSISTENT | KBIF_NO_MAGAZINES); - skmem_quantum_tests(KBIF_PERSISTENT | KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_PERSISTENT | KBIF_MONOLITHIC | - KBIF_USER_ACCESS); - skmem_quantum_tests(KBIF_PERSISTENT | KBIF_BUFFER_ON_DEMAND); - skmem_quantum_tests(KBIF_PERSISTENT | TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_PERSISTENT | KBIF_MONOLITHIC | - KBIF_BUFFER_ON_DEMAND); - skmem_quantum_tests(KBIF_PERSISTENT | KBIF_MONOLITHIC | - KBIF_USER_ACCESS | TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_PERSISTENT | KBIF_MONOLITHIC | - KBIF_BUFFER_ON_DEMAND | TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_MONOLITHIC | KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_MONOLITHIC | KBIF_USER_ACCESS); - skmem_quantum_tests(KBIF_MONOLITHIC | KBIF_USER_ACCESS | - KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_MONOLITHIC | KBIF_BUFFER_ON_DEMAND); - skmem_quantum_tests(KBIF_MONOLITHIC | KBIF_USER_ACCESS | - TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_MONOLITHIC | 
KBIF_BUFFER_ON_DEMAND | - TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_BUFFER_ON_DEMAND); - skmem_quantum_tests(KBIF_BUFFER_ON_DEMAND | KBIF_NO_MAGAZINES); - skmem_quantum_tests(KBIF_BUFFER_ON_DEMAND | KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_BUFFER_ON_DEMAND | TEST_OPTION_INHIBIT_CACHE); - - /* check quantum KPIs (vdev) */ - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_NO_MAGAZINES); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_PERSISTENT); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_PERSISTENT | - KBIF_MONOLITHIC | KBIF_USER_ACCESS); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_PERSISTENT | - KBIF_BUFFER_ON_DEMAND); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_PERSISTENT | - TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_PERSISTENT | - KBIF_MONOLITHIC | KBIF_BUFFER_ON_DEMAND); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_PERSISTENT | - KBIF_MONOLITHIC | KBIF_USER_ACCESS | TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_PERSISTENT | - KBIF_MONOLITHIC | KBIF_BUFFER_ON_DEMAND | TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_MONOLITHIC | - KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_MONOLITHIC | - KBIF_USER_ACCESS | KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_MONOLITHIC | - KBIF_USER_ACCESS); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_MONOLITHIC | - KBIF_BUFFER_ON_DEMAND); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_MONOLITHIC | - KBIF_USER_ACCESS | TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_MONOLITHIC | - KBIF_BUFFER_ON_DEMAND | TEST_OPTION_INHIBIT_CACHE); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_BUFFER_ON_DEMAND); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_BUFFER_ON_DEMAND | - KBIF_PHYS_CONTIGUOUS); - skmem_quantum_tests(KBIF_VIRTUAL_DEVICE | KBIF_BUFFER_ON_DEMAND | - TEST_OPTION_INHIBIT_CACHE); } static void diff --git a/bsd/skywalk/namespace/flowidns.c b/bsd/skywalk/namespace/flowidns.c index 998d5ba7a..130063f4d 100644 --- a/bsd/skywalk/namespace/flowidns.c +++ b/bsd/skywalk/namespace/flowidns.c @@ -52,6 +52,7 @@ #include #include #include +#include /* maximum number of flowID generation retries in case of collision */ #define FLOWIDNS_MAX_FLOWID_GEN_RETRY 5 @@ -228,16 +229,16 @@ flowidns_init() flowidns_domain_id_t domain; VERIFY(__flowidns_inited == 0); - _CASSERT(SFH_DOMAIN_IPSEC == FLOWIDNS_DOMAIN_IPSEC); - _CASSERT(SFH_DOMAIN_FLOWSWITCH == FLOWIDNS_DOMAIN_FLOWSWITCH); - _CASSERT(SFH_DOMAIN_INPCB == FLOWIDNS_DOMAIN_INPCB); - _CASSERT(SFH_DOMAIN_PF == FLOWIDNS_DOMAIN_PF); - _CASSERT(FLOWIDNS_DOMAIN_MIN == 0); + static_assert(SFH_DOMAIN_IPSEC == FLOWIDNS_DOMAIN_IPSEC); + static_assert(SFH_DOMAIN_FLOWSWITCH == FLOWIDNS_DOMAIN_FLOWSWITCH); + static_assert(SFH_DOMAIN_INPCB == FLOWIDNS_DOMAIN_INPCB); + static_assert(SFH_DOMAIN_PF == FLOWIDNS_DOMAIN_PF); + static_assert(FLOWIDNS_DOMAIN_MIN == 0); /* * FLOWIDNS_FLOWID_DOMAIN_{MASK, SHIFT} macros are based on below * assumption. 
*/ - _CASSERT(FLOWIDNS_DOMAIN_MAX == 3); + static_assert(FLOWIDNS_DOMAIN_MAX == 3); for (domain = FLOWIDNS_DOMAIN_MIN; domain <= FLOWIDNS_DOMAIN_MAX; domain++) { @@ -324,10 +325,8 @@ flowidns_dump_domain(struct sysctl_req *req, struct flowidns_domain *domain) record.sfr_af = fftn->fftn_flowkey.ffk_af; record.sfr_ipproto = fftn->fftn_flowkey.ffk_proto; record.sfr_protoid = fftn->fftn_flowkey.ffk_protoid; - _CASSERT(sizeof(fftn->fftn_flowkey.ffk_laddr) == - sizeof(record.sfr_laddr)); - _CASSERT(sizeof(fftn->fftn_flowkey.ffk_raddr) == - sizeof(record.sfr_raddr)); + static_assert(sizeof(fftn->fftn_flowkey.ffk_laddr) == sizeof(record.sfr_laddr)); + static_assert(sizeof(fftn->fftn_flowkey.ffk_raddr) == sizeof(record.sfr_raddr)); bcopy(&(fftn->fftn_flowkey.ffk_laddr), &record.sfr_laddr, sizeof(record.sfr_laddr)); bcopy(&(fftn->fftn_flowkey.ffk_raddr), &record.sfr_raddr, diff --git a/bsd/skywalk/namespace/netns.c b/bsd/skywalk/namespace/netns.c index 9401269b3..36bd875e8 100644 --- a/bsd/skywalk/namespace/netns.c +++ b/bsd/skywalk/namespace/netns.c @@ -66,6 +66,12 @@ static int __netns_inited = 0; #define PROTO_STR(proto) ((proto == IPPROTO_TCP) ? "tcp" : "udp") #define LEN_TO_AF(len) (((len == sizeof (struct in_addr)) ? \ AF_INET : AF_INET6)) +#define NS_PORT_ERR(_fmt, ...) do { \ + proc_t _p = current_proc(); \ + SK_ERR("%s(%d) port %u: " _fmt, sk_proc_name(_p), sk_proc_pid(_p), \ + port, ##__VA_ARGS__); \ +} while (0); + /* * Locking * Netns is currently protected by a global mutex, NETNS_LOCK. This lock is @@ -331,7 +337,7 @@ netns_ns_free(struct ns *namespace) NS_VERB_PROTO(namespace->ns_proto), "freeing %s ns for IP %s", PROTO_STR(namespace->ns_proto), - inet_ntop(LEN_TO_AF(namespace->ns_addr_len), + sk_ntop(LEN_TO_AF(namespace->ns_addr_len), namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str))); RB_FOREACH_SAFE(res, ns_reservation_tree, &namespace->ns_reservations, @@ -475,7 +481,7 @@ _netns_get_ns(uint32_t *__sized_by(addr_len)addr, uint8_t addr_len, uint8_t prot if (create && namespace == NULL) { SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto), "allocating %s ns for IP %s", - PROTO_STR(proto), inet_ntop(LEN_TO_AF(addr_len), addr, + PROTO_STR(proto), sk_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str, sizeof(tmp_ip_str))); NETNS_LOCK_CONVERT(); namespace = netns_ns_alloc(Z_WAITOK | Z_NOFAIL); @@ -568,7 +574,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) if (res == NULL) { SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto), "ERROR %s:%s:%d // flags 0x%x // OUT OF MEMORY", - inet_ntop(LEN_TO_AF(namespace->ns_addr_len), + sk_ntop(LEN_TO_AF(namespace->ns_addr_len), namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), port, flags); return ENOMEM; @@ -584,7 +590,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto), "pre: %s:%s:%d // flags 0x%x // refs %d sky, %d ls, " - "%d bsd %d pf", inet_ntop(LEN_TO_AF(namespace->ns_addr_len), + "%d bsd %d pf", sk_ntop(LEN_TO_AF(namespace->ns_addr_len), namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), port, flags, NETNS_REF_COUNT(res, NETNS_SKYWALK), @@ -615,7 +621,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) * listener wildcard entry for this * protocol/port number means this must fail. 
*/ - SK_ERR("ADDRINUSE: Duplicate wildcard"); + NS_PORT_ERR("ADDRINUSE: Duplicate wildcard"); err = EADDRINUSE; goto done; } @@ -636,7 +642,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) * namespace for this port means this * must fail. */ - SK_ERR("ADDRINUSE: Wildcard with non-wild."); + NS_PORT_ERR("ADDRINUSE: Wildcard with non-wild."); err = EADDRINUSE; goto done; } @@ -656,7 +662,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) * which Skywalk already has a wildcard * reservation. */ - SK_ERR("ADDRINUSE: BSD requesting Skywalk port"); + NS_PORT_ERR("ADDRINUSE: BSD requesting Skywalk port"); err = EADDRINUSE; goto done; } @@ -680,7 +686,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) (NETNS_REF_COUNT(skres, NETNS_SKYWALK) | NETNS_REF_COUNT(skres, NETNS_LISTENER)) != 0) { - SK_ERR("ADDRINUSE: BSD wildcard with non-wild."); + NS_PORT_ERR("ADDRINUSE: BSD wildcard with non-wild."); err = EADDRINUSE; goto done; } @@ -692,7 +698,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) /* check collision w/ BSD */ if (NETNS_REF_COUNT(res, NETNS_BSD) > 0 || NETNS_REF_COUNT(res, NETNS_PF) > 0) { - SK_ERR("ERROR - Skywalk got ADDRINUSE (w/ BSD)"); + NS_PORT_ERR("ERROR - Skywalk got ADDRINUSE (w/ BSD)"); err = EADDRINUSE; goto done; } @@ -739,7 +745,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) goto done; } } - SK_ERR("ERROR - Skywalk got ADDRINUSE (w/ SK connected flow)"); + NS_PORT_ERR("ERROR - Skywalk got ADDRINUSE (w/ SK connected flow)"); err = EADDRINUSE; } /* @@ -761,7 +767,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V4)], res, port) || _netns_is_port_used(netns_global_non_wild[ NETNS_NS_GLOBAL_IDX(proto, NETNS_ADDRLEN_V6)], res, port)) { - SK_ERR("ERROR - Listener got ADDRINUSE"); + NS_PORT_ERR("ERROR - Listener got ADDRINUSE"); err = EADDRINUSE; } break; @@ -770,7 +776,7 @@ _netns_reserve_common(struct ns *namespace, in_port_t port, uint32_t flags) case NETNS_PF: if (NETNS_REF_COUNT(res, NETNS_SKYWALK) > 0 || NETNS_REF_COUNT(res, NETNS_LISTENER) > 0) { - SK_ERR("ERROR - %s got ADDRINUSE", + NS_PORT_ERR("ERROR - %s got ADDRINUSE", ((flags & NETNS_OWNER_MASK) == NETNS_PF) ? 
"PF" : "BSD"); err = EADDRINUSE; @@ -795,7 +801,7 @@ done: NS_VERB_PROTO(namespace->ns_proto), "post: %s:%s:%d err %d // flags 0x%x // refs %d sky, " "%d ls, %d bsd %d pf", - inet_ntop(LEN_TO_AF(namespace->ns_addr_len), + sk_ntop(LEN_TO_AF(namespace->ns_addr_len), namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(namespace->ns_proto), port, err, flags, NETNS_REF_COUNT(res, NETNS_SKYWALK), @@ -833,7 +839,7 @@ _netns_release_common(struct ns *namespace, in_port_t port, uint32_t flags) SK_DF(NS_VERB_IP(namespace->ns_addr_len) | NS_VERB_PROTO(namespace->ns_proto), "ERROR %s:%s:%d // flags 0x%x // not found", - inet_ntop(LEN_TO_AF(namespace->ns_addr_len), + sk_ntop(LEN_TO_AF(namespace->ns_addr_len), namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(namespace->ns_proto), port, flags); VERIFY(res != NULL); @@ -842,7 +848,7 @@ _netns_release_common(struct ns *namespace, in_port_t port, uint32_t flags) SK_DF(NS_VERB_IP(namespace->ns_addr_len) | NS_VERB_PROTO(namespace->ns_proto), "%s:%s:%d // flags 0x%x // refs %d sky, %d ls, %d bsd, %d pf", - inet_ntop(LEN_TO_AF(namespace->ns_addr_len), + sk_ntop(LEN_TO_AF(namespace->ns_addr_len), namespace->ns_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(namespace->ns_proto), port, flags, NETNS_REF_COUNT(res, NETNS_SKYWALK), @@ -897,7 +903,7 @@ netns_clear_ifnet(struct ns_token *nstoken) SK_DF(NS_VERB_IP(nstoken->nt_addr_len) | NS_VERB_PROTO(nstoken->nt_proto), "%s:%s:%d // removed from ifnet %d", - inet_ntop(LEN_TO_AF(nstoken->nt_addr_len), + sk_ntop(LEN_TO_AF(nstoken->nt_addr_len), nstoken->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(nstoken->nt_proto), nstoken->nt_port, nstoken->nt_ifp->if_index); @@ -941,7 +947,7 @@ _netns_reserve_kpi_common(struct ns *ns, netns_token *token, SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto), "reserving %s:%s:%d // flags 0x%x // token %svalid", - inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str, + sk_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), hport, flags, NETNS_TOKEN_VALID(token) ? 
"" : "in"); @@ -965,7 +971,7 @@ _netns_reserve_kpi_common(struct ns *ns, netns_token *token, SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto), "%s:%s:%d // flags 0x%x -> 0x%x", - inet_ntop(LEN_TO_AF(nt->nt_addr_len), + sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(nt->nt_proto), @@ -1232,12 +1238,12 @@ netns_reserve(netns_token *token, uint32_t *__sized_by(addr_len)addr, } if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { - SK_ERR("netns doesn't support non TCP/UDP protocol"); + NS_PORT_ERR("netns doesn't support non TCP/UDP protocol"); return ENOTSUP; } SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto), - "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr, + "%s:%s:%d // flags 0x%x", sk_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port), flags); @@ -1264,10 +1270,11 @@ extern int tcp_use_randomport; int netns_reserve_ephemeral(netns_token *token, uint32_t *__sized_by(addr_len)addr, - uint8_t addr_len, uint8_t proto, in_port_t *port, uint32_t flags, + uint8_t addr_len, uint8_t proto, in_port_t *pport, uint32_t flags, struct ns_flow_info *nfi) { int err = 0; + SK_LOG_VAR(in_port_t port = *pport); in_port_t first = (in_port_t)ipport_firstauto; in_port_t last = (in_port_t)ipport_lastauto; in_port_t rand_port; @@ -1287,13 +1294,13 @@ netns_reserve_ephemeral(netns_token *token, uint32_t *__sized_by(addr_len)addr, } if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { - SK_ERR("netns doesn't support non TCP/UDP protocol"); + NS_PORT_ERR("netns doesn't support non TCP/UDP protocol"); return ENOTSUP; } SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto), - "%s:%s:%d // flags 0x%x", inet_ntop(LEN_TO_AF(addr_len), addr, - tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(*port), + "%s:%s:%d // flags 0x%x", sk_ntop(LEN_TO_AF(addr_len), addr, + tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), ntohs(port), flags); NETNS_LOCK_SPIN(); @@ -1308,7 +1315,7 @@ netns_reserve_ephemeral(netns_token *token, uint32_t *__sized_by(addr_len)addr, if (proto == IPPROTO_UDP) { if (UINT16_MAX - namespace->ns_n_reservations < NETNS_NS_UDP_EPHEMERAL_RESERVE) { - SK_ERR("UDP ephemeral port not available" + NS_PORT_ERR("UDP ephemeral port not available" "(less than 4096 UDP ports left)"); err = EADDRNOTAVAIL; NETNS_UNLOCK(); @@ -1353,7 +1360,7 @@ netns_reserve_ephemeral(netns_token *token, uint32_t *__sized_by(addr_len)addr, while (true) { if (n_last_port == 0) { - SK_ERR("ephemeral port search range includes 0"); + NS_PORT_ERR("ephemeral port search range includes 0"); err = EINVAL; break; } @@ -1383,14 +1390,14 @@ netns_reserve_ephemeral(netns_token *token, uint32_t *__sized_by(addr_len)addr, n_last_port = htons(last_port); if (last_port == rand_port || first == last) { - SK_ERR("couldn't find free ephemeral port"); + NS_PORT_ERR("couldn't find free ephemeral port"); err = EADDRNOTAVAIL; break; } } if (err == 0) { - *port = n_last_port; + *pport = n_last_port; if (count_up) { namespace->ns_last_ephemeral_port_up = last_port; } else { @@ -1439,7 +1446,7 @@ netns_release(netns_token *token) SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto), "releasing %s:%s:%d", - inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, + sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(proto), nt->nt_port); @@ -1496,9 +1503,9 @@ netns_change_addr(netns_token *token, uint32_t *__sized_by(addr_len)addr, proto = nt->nt_proto; #if SK_LOG - inet_ntop(LEN_TO_AF(nt->nt_addr_len), 
nt->nt_addr, + sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, tmp_ip_str_1, sizeof(tmp_ip_str_1)); - inet_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str_2, + sk_ntop(LEN_TO_AF(addr_len), addr, tmp_ip_str_2, sizeof(tmp_ip_str_2)); #endif /* SK_LOG */ SK_DF(NS_VERB_IP(addr_len) | NS_VERB_PROTO(proto), @@ -1528,7 +1535,8 @@ netns_change_addr(netns_token *token, uint32_t *__sized_by(addr_len)addr, nt->nt_flags))) { NETNS_LOCK_CONVERT(); netns_ns_cleanup(new_namespace); - SK_ERR("ERROR - reservation collision under new namespace"); + SK_ERR("port %u reservation collision under new namespace", + nt->nt_port); goto done; } @@ -1559,7 +1567,8 @@ netns_change_addr(netns_token *token, uint32_t *__sized_by(addr_len)addr, if ((err = _netns_reserve_common(global_namespace, nt->nt_port, nt->nt_flags)) != 0) { - SK_ERR("ERROR - reservation collision under new global namespace"); + SK_ERR("port %u - reservation collision under new global namespace", + nt->nt_port); /* XXX: Should not fail. Maybe assert instead */ goto done; } @@ -1582,13 +1591,13 @@ _netns_set_ifnet_internal(struct ns_token *nt, struct ifnet *ifp) NETNS_LOCK_ASSERT_HELD(); - if (ifp != NULL && ifnet_is_attached(ifp, 1)) { + if (ifp != NULL && ifnet_get_ioref(ifp)) { nt->nt_ifp = ifp; LIST_INSERT_HEAD(&ifp->if_netns_tokens, nt, nt_ifp_link); SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto), "%s:%s:%d // added to ifnet %d", - inet_ntop(LEN_TO_AF(nt->nt_addr_len), + sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(nt->nt_proto), nt->nt_port, ifp->if_index); @@ -1618,7 +1627,7 @@ netns_set_ifnet(netns_token *token, ifnet_t ifp) if (nt->nt_ifp == ifp) { SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto), "%s:%s:%d // ifnet already %d, exiting early", - inet_ntop(LEN_TO_AF(nt->nt_addr_len), + sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(nt->nt_proto), nt->nt_port, ifp ? 
ifp->if_index : -1); @@ -1672,10 +1681,10 @@ _netns_set_state(netns_token *token, uint32_t state) nt->nt_state |= state; SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto), - "%s:%s:%d // state 0x%b", - inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, + "%s:%s:%d // state 0x%x", + sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)), - PROTO_STR(nt->nt_proto), nt->nt_port, state, NETNS_STATE_BITS); + PROTO_STR(nt->nt_proto), nt->nt_port, state); NETNS_UNLOCK(); } @@ -1743,7 +1752,7 @@ netns_change_flags(netns_token *token, uint32_t set_flags, SK_DF(NS_VERB_IP(nt->nt_addr_len) | NS_VERB_PROTO(nt->nt_proto), "%s:%s:%d // flags 0x%x -> 0x%x", - inet_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, + sk_ntop(LEN_TO_AF(nt->nt_addr_len), nt->nt_addr, tmp_ip_str, sizeof(tmp_ip_str)), PROTO_STR(nt->nt_proto), nt->nt_port, nt->nt_flags, nt->nt_flags | set_flags & ~clear_flags); @@ -1768,7 +1777,7 @@ netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protoc return; } - if (fe->fe_flags & FLOWENTF_EXTRL_PORT) { + if (fe->fe_flags & (FLOWENTF_EXTRL_PORT | FLOWENTF_AOP_OFFLOAD)) { return; } @@ -1850,22 +1859,22 @@ netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protoc proc_name(nfi->nfi_owner_pid, pname, sizeof(pname)); if (protocol == PF_INET) { - inet_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr, + sk_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr, lbuf, sizeof(lbuf)); - inet_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr, + sk_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr, fbuf, sizeof(fbuf)); lport = nfi->nfi_laddr.sin.sin_port; fport = nfi->nfi_faddr.sin.sin_port; } else { - inet_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr, + sk_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr, lbuf, sizeof(lbuf)); - inet_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr, + sk_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr, fbuf, sizeof(fbuf)); lport = nfi->nfi_laddr.sin6.sin6_port; fport = nfi->nfi_faddr.sin6.sin6_port; } - os_log(OS_LOG_DEFAULT, + os_log(wake_packet_log_handle, "netns_local_port_scan_flow_entry: route is down %s %s:%u %s:%u ifp %s proc %s:%d", token->nt_proto == IPPROTO_TCP ? "tcp" : "udp", lbuf, ntohs(lport), fbuf, ntohs(fport), @@ -1889,22 +1898,22 @@ netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protoc proc_name(nfi->nfi_owner_pid, pname, sizeof(pname)); if (protocol == PF_INET) { - inet_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr, + sk_ntop(PF_INET, &nfi->nfi_laddr.sin.sin_addr, lbuf, sizeof(lbuf)); - inet_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr, + sk_ntop(PF_INET, &nfi->nfi_faddr.sin.sin_addr, fbuf, sizeof(fbuf)); lport = nfi->nfi_laddr.sin.sin_port; fport = nfi->nfi_faddr.sin.sin_port; } else { - inet_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr, + sk_ntop(PF_INET6, &nfi->nfi_laddr.sin6.sin6_addr.s6_addr, lbuf, sizeof(lbuf)); - inet_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr, + sk_ntop(PF_INET6, &nfi->nfi_faddr.sin6.sin6_addr, fbuf, sizeof(fbuf)); lport = nfi->nfi_laddr.sin6.sin6_port; fport = nfi->nfi_faddr.sin6.sin6_port; } - os_log(OS_LOG_DEFAULT, + os_log(wake_packet_log_handle, "netns_local_port_scan_flow_entry: no wake from sleep %s %s:%u %s:%u ifp %s proc %s:%d", token->nt_proto == IPPROTO_TCP ? 
"tcp" : "udp", lbuf, ntohs(lport), fbuf, ntohs(fport), @@ -1926,10 +1935,9 @@ netns_local_port_scan_flow_entry(struct flow_entry *fe, protocol_family_t protoc (void) if_ports_used_add_flow_entry(fe, token->nt_ifp->if_index, token->nt_flow_info, token->nt_flags); } else { - SK_ERR("%s: unknown owner port %u" + SK_ERR("unknown owner port %u" " nt_flags 0x%x ifindex %u nt_flow_info %p\n", - __func__, token->nt_port, - token->nt_flags, + token->nt_port, token->nt_flags, token->nt_ifp != NULL ? token->nt_ifp->if_index : 0, token->nt_flow_info); } @@ -1945,7 +1953,7 @@ netns_get_if_local_ports(ifnet_t ifp, protocol_family_t protocol, return; } /* Ensure that the interface is attached and won't detach */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { return; } fsw = fsw_ifp_to_fsw(ifp); @@ -1980,7 +1988,7 @@ netns_get_local_ports(ifnet_t ifp, protocol_family_t protocol, error = ifnet_list_get_all(IFNET_FAMILY_ANY, &ifp_list, &count); if (error != 0) { - os_log_error(OS_LOG_DEFAULT, + os_log_error(wake_packet_log_handle, "%s: ifnet_list_get_all() failed %d", __func__, error); return error; diff --git a/bsd/skywalk/namespace/netns.h b/bsd/skywalk/namespace/netns.h index 041dc9df1..b4a600726 100644 --- a/bsd/skywalk/namespace/netns.h +++ b/bsd/skywalk/namespace/netns.h @@ -296,8 +296,8 @@ netns_change_addr_in6(netns_token *token, struct in6_addr addr) /* Flags for change_flags */ /* - * Set when the reservation backs a socket with the SO_NOWAKEFROMSLEEP option - * set + * Set when the reservation backs a flow that should not have its port + * offloaded for network wake */ #define NETNS_NOWAKEFROMSLEEP 0x20 @@ -315,7 +315,12 @@ netns_change_addr_in6(netns_token *token, struct in6_addr addr) */ #define NETNS_REUSEPORT 0x100 +/* + * Set when the connection is marked as idle + */ +#define NETNS_CONNECTION_IDLE 0x200 + #define NETNS_CONFIGURATION_FLAGS (NETNS_NOWAKEFROMSLEEP | NETNS_RECVANYIF | \ - NETNS_EXTBGIDLE | NETNS_REUSEPORT) + NETNS_EXTBGIDLE | NETNS_REUSEPORT | NETNS_CONNECTION_IDLE) #endif /* !_SKYWALK_NAMESPACE_NETNS_H_ */ diff --git a/bsd/skywalk/namespace/protons.c b/bsd/skywalk/namespace/protons.c index ddd3b3d25..ede9d28fb 100644 --- a/bsd/skywalk/namespace/protons.c +++ b/bsd/skywalk/namespace/protons.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/bsd/skywalk/nexus/Makefile b/bsd/skywalk/nexus/Makefile index df6ea040a..709d6be7f 100644 --- a/bsd/skywalk/nexus/Makefile +++ b/bsd/skywalk/nexus/Makefile @@ -28,7 +28,7 @@ PRIVATE_DATAFILES = \ # Installs header file for Apple internal use for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders PRIVATE_KERNELFILES = \ - os_nexus.h + os_nexus.h nexus_ioctl.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow.c b/bsd/skywalk/nexus/flowswitch/flow/flow.c index d224d55d7..f7b605d76 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow.c @@ -91,8 +91,7 @@ flow_init(void) /* these are initialized in skywalk_init() */ VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX); VERIFY(sk_fadv_nchunks != 0); - _CASSERT(sizeof(*((struct flow_owner *)0)->fo_flowadv_bmap) == - sizeof(bitmap_t)); + static_assert(sizeof(*((struct flow_owner *)0)->fo_flowadv_bmap) == sizeof(bitmap_t)); sk_fab_size = (sk_fadv_nchunks * sizeof(bitmap_t)); if (sk_fab_cache == NULL) { diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c b/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c index 
84c8c133a..854c73fcb 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_agg.c @@ -98,15 +98,15 @@ struct flow_agg { #if __has_ptrcheck #define FLOW_AGG_CLEAR(_fa) do { \ - _CASSERT(sizeof(struct flow_agg) == 48); \ - _CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 40); \ + static_assert(sizeof(struct flow_agg) == 48); \ + static_assert(offsetof(struct flow_agg, fa_fix_pkt_sum) == 40); \ sk_zero_48(_fa); \ (_fa)->fa_fix_pkt_sum = 0; \ } while (0) #else #define FLOW_AGG_CLEAR(_fa) do { \ - _CASSERT(sizeof(struct flow_agg) == 40); \ - _CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32); \ + static_assert(sizeof(struct flow_agg) == 40); \ + static_assert(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32); \ sk_zero_32(_fa); \ (_fa)->fa_fix_pkt_sum = 0; \ } while (0) @@ -224,8 +224,8 @@ _pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input) bufcnt = kern_packet_get_buflet_count(ph); } - SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u", - sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d", + SK_DF(logflags, "%s(%d) %spkt %p plen %u", + sk_proc_name(p), sk_proc_pid(p), is_input ? "s":"d", SK_KVA(pkt), pkt->pkt_length); SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x", @@ -240,7 +240,7 @@ _pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input) for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) { SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf", __buflet_get_data_address(buf), - __buflet_get_data_length(buf), 128, NULL, 0)); + __buflet_get_data_length(buf), 128)); buf = kern_packet_get_next_buflet(ph, buf); } } @@ -259,8 +259,8 @@ _mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf) SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) | (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY))); - SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u", - sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m), + SK_DF(logflags, "%s(%d) dest mbuf %p pktlen %u", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(m), m->m_pkthdr.len); SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x", @@ -270,7 +270,7 @@ _mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf) /* Dump the first mbuf */ ASSERT(m_mtod_current(m) != NULL); SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf", - (uint8_t *)m_mtod_current(m), m->m_len, 128, NULL, 0)); + (uint8_t *)m_mtod_current(m), m->m_len, 128)); } #define mbuf_agg_log(_m, _p, _is_mbuf) do { \ @@ -287,8 +287,8 @@ _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf) (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY))); while (m != NULL) { - SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u", - sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m), + SK_DF(logflags, "%s(%d) dest mbuf %p pktlen %u", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(m), m->m_pkthdr.len); SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x", @@ -697,7 +697,7 @@ copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf, * assumption that the smallest flowswitch packet pool buffer should * be large enough to hold the IP and TCP headers in the first buflet. */ - _CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY); + static_assert(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY); SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX | (PKT_IS_MBUF(pkt) ? 
SK_VERB_COPY_MBUF : SK_VERB_COPY))); @@ -966,8 +966,8 @@ can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt, uint8_t *ip_hdr; ASSERT(fa->fa_sptr != NULL); - _CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE); - _CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE); + static_assert(sizeof(struct ip6_tcp_mask) == MASK_SIZE); + static_assert(sizeof(struct ip_tcp_mask) == MASK_SIZE); if (__improbable(pkt->pkt_length < MASK_SIZE)) { STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP); @@ -1466,7 +1466,7 @@ flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt, /* First time we append packets, need to set it to 1 */ spkt->pkt_seg_cnt = 1; } - _CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt)); + static_assert(sizeof(result) == sizeof(spkt->pkt_seg_cnt)); if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) { spkt->pkt_seg_cnt = result; } @@ -1479,7 +1479,7 @@ flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt, /* First time we append packets, need to set it to 1 */ smbuf->m_pkthdr.rx_seg_cnt = 1; } - _CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.rx_seg_cnt)); + static_assert(sizeof(result) == sizeof(smbuf->m_pkthdr.rx_seg_cnt)); if (!os_add_overflow(1, smbuf->m_pkthdr.rx_seg_cnt, &result)) { smbuf->m_pkthdr.rx_seg_cnt = result; } @@ -1723,7 +1723,7 @@ flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe, } SK_DF(SK_VERB_FLOW_TRACK, "flow_pkt_track failed (err %d)", err); __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt, - DROP_REASON_FSW_FLOW_TRACK_ERR, 0); + DROP_REASON_FSW_FLOW_TRACK_ERR, DROPTAP_FLAG_DIR_IN); continue; } @@ -1776,7 +1776,7 @@ flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe, SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp, plen); __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt, - DROP_REASON_FSW_GSO_NOMEM_PKT, 0); + DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN); continue; } if (bh_cnt < bh_cnt_tmp) { @@ -1808,7 +1808,7 @@ flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe, STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT); SK_ERR("buflet alloc failed (err %d)", err); __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt, - DROP_REASON_FSW_GSO_NOMEM_PKT, 0); + DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN); continue; } } @@ -1887,7 +1887,7 @@ non_agg: SK_ERR("packet alloc failed (err %d)", err); _free_dbuf_array(dpp, &dbuf_array); __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt, - DROP_REASON_FSW_GSO_NOMEM_PKT, 0); + DROP_REASON_FSW_GSO_NOMEM_PKT, DROPTAP_FLAG_DIR_IN); continue; } spkt = SK_PTR_ADDR_KPKT(sph); @@ -1980,7 +1980,8 @@ _finalize_smbuf(struct mbuf *smbuf) SK_NO_INLINE_ATTRIBUTE static void flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe, - struct pktq *rx_pkts, uint32_t rx_bytes, bool is_mbuf) + struct pktq *rx_pkts, struct mbufq *host_mq, + uint32_t rx_bytes, bool is_mbuf) { #define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt, _reason, _flags) do { \ drop_packets++; \ @@ -2497,11 +2498,10 @@ next: } /* - * Call fsw_host_sendup() with mbuf chain - * directly. + * Enqueue smbufs for caller to process. 
*/ mchain_agg_log(m_chain, kernproc, is_mbuf); - fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes); + mbufq_enqueue(host_mq, m_chain, smbuf, smbufs, bytes); if (__improbable(is_mbuf)) { STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs); @@ -2524,14 +2524,15 @@ next: void flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe, - struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags) + struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq, + uint32_t flags) { #pragma unused(flags) struct pktq dropped_pkts; bool is_mbuf; if (__improbable((flags & FLOW_PROC_FLAG_FRAGMENTS) != 0)) { - dp_flow_rx_process(fsw, fe, rx_pkts, rx_bytes, FLOW_PROC_FLAG_FRAGMENTS); + dp_flow_rx_process(fsw, fe, rx_pkts, rx_bytes, host_mq, FLOW_PROC_FLAG_FRAGMENTS); return; } @@ -2561,13 +2562,13 @@ flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe, !dlil_has_if_filter(fsw->fsw_ifp); } if (__improbable(!do_rx_agg)) { - fsw_host_rx(fsw, rx_pkts); + fsw_host_rx_enqueue_mbq(fsw, rx_pkts, host_mq); return; } if (__improbable(pktap_total_tap_count != 0)) { fsw_snoop(fsw, fe, rx_pkts, true); } - flow_rx_agg_host(fsw, fe, rx_pkts, rx_bytes, is_mbuf); + flow_rx_agg_host(fsw, fe, rx_pkts, host_mq, rx_bytes, is_mbuf); } else { /* channel flow */ if (__improbable(pktap_total_tap_count != 0)) { diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_classifier.c b/bsd/skywalk/nexus/flowswitch/flow/flow_classifier.c index ab6d42791..a4e1c8587 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_classifier.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_classifier.c @@ -35,10 +35,10 @@ #define CL_SKIP_ON(t) \ if (__improbable(t)) { \ - SK_ERR("%d: skip " #t, __LINE__); \ + SK_PERR(current_proc(), "%d: skip " #t, __LINE__); \ SK_ERR("%s %s", if_name(ifp), sk_dump("buf", \ pkt_buf + pkt->pkt_headroom, __packet_get_real_data_length(pkt), \ - MIN(128, bdlen), NULL, 0)); \ + MIN(128, bdlen))); \ error = ENOTSUP; \ goto done; \ } @@ -113,53 +113,53 @@ flow_pkt_classify(struct __kern_packet *pkt, struct ifnet *ifp, sa_family_t af, int error = 0; /* must be 16-bytes aligned due to use of sk_copy* below */ - _CASSERT((offsetof(struct __flow, flow_l3) % 16) == 0); - _CASSERT((offsetof(struct __flow, flow_ipv4_src) % 16) == 0); - _CASSERT((offsetof(struct __flow, flow_ipv6_src) % 16) == 0); - _CASSERT((offsetof(struct __flow, flow_l4) % 16) == 0); - _CASSERT((offsetof(struct __flow, flow_tcp_src) % 16) == 0); - _CASSERT((offsetof(struct __flow, flow_udp_src) % 16) == 0); - _CASSERT((offsetof(struct __flow, flow_esp_spi) % 16) == 0); + static_assert((offsetof(struct __flow, flow_l3) % 16) == 0); + static_assert((offsetof(struct __flow, flow_ipv4_src) % 16) == 0); + static_assert((offsetof(struct __flow, flow_ipv6_src) % 16) == 0); + static_assert((offsetof(struct __flow, flow_l4) % 16) == 0); + static_assert((offsetof(struct __flow, flow_tcp_src) % 16) == 0); + static_assert((offsetof(struct __flow, flow_udp_src) % 16) == 0); + static_assert((offsetof(struct __flow, flow_esp_spi) % 16) == 0); - _CASSERT(sizeof(struct __flow_l3_ipv4_addrs) == 8); - _CASSERT((offsetof(struct __flow_l3_ipv4_addrs, _dst) - + static_assert(sizeof(struct __flow_l3_ipv4_addrs) == 8); + static_assert((offsetof(struct __flow_l3_ipv4_addrs, _dst) - offsetof(struct __flow_l3_ipv4_addrs, _src)) == (offsetof(struct ip, ip_dst) - offsetof(struct ip, ip_src))); - _CASSERT(sizeof(struct __flow_l3_ipv6_addrs) == 32); - _CASSERT((offsetof(struct __flow_l3_ipv6_addrs, _dst) - + static_assert(sizeof(struct __flow_l3_ipv6_addrs) == 32); + 
static_assert((offsetof(struct __flow_l3_ipv6_addrs, _dst) - offsetof(struct __flow_l3_ipv6_addrs, _src)) == (offsetof(struct ip6_hdr, ip6_dst) - offsetof(struct ip6_hdr, ip6_src))); /* __flow_l4_tcp must mirror tcphdr for the first 16-bytes */ - _CASSERT(sizeof(struct __flow_l4_tcp) == 16); - _CASSERT((offsetof(struct __flow_l4_tcp, _dst) - + static_assert(sizeof(struct __flow_l4_tcp) == 16); + static_assert((offsetof(struct __flow_l4_tcp, _dst) - offsetof(struct __flow_l4_tcp, _src)) == (offsetof(struct tcphdr, th_dport) - offsetof(struct tcphdr, th_sport))); - _CASSERT((offsetof(struct __flow_l4_tcp, _seq) - + static_assert((offsetof(struct __flow_l4_tcp, _seq) - offsetof(struct __flow_l4_tcp, _src)) == (offsetof(struct tcphdr, th_seq) - offsetof(struct tcphdr, th_sport))); - _CASSERT((offsetof(struct __flow_l4_tcp, _ack) - + static_assert((offsetof(struct __flow_l4_tcp, _ack) - offsetof(struct __flow_l4_tcp, _src)) == (offsetof(struct tcphdr, th_ack) - offsetof(struct tcphdr, th_sport))); - _CASSERT((offsetof(struct __flow_l4_tcp, _flags) - + static_assert((offsetof(struct __flow_l4_tcp, _flags) - offsetof(struct __flow_l4_tcp, _src)) == (offsetof(struct tcphdr, th_flags) - offsetof(struct tcphdr, th_sport))); - _CASSERT((offsetof(struct __flow_l4_tcp, _win) - + static_assert((offsetof(struct __flow_l4_tcp, _win) - offsetof(struct __flow_l4_tcp, _src)) == (offsetof(struct tcphdr, th_win) - offsetof(struct tcphdr, th_sport))); /* ensure same offsets use for TCP and UDP */ - _CASSERT(sizeof(struct __flow_l4_udp) == 8); - _CASSERT(offsetof(struct __flow, flow_tcp_src) == + static_assert(sizeof(struct __flow_l4_udp) == 8); + static_assert(offsetof(struct __flow, flow_tcp_src) == offsetof(struct __flow, flow_udp_src)); - _CASSERT(offsetof(struct __flow, flow_tcp_dst) == + static_assert(offsetof(struct __flow, flow_tcp_dst) == offsetof(struct __flow, flow_udp_dst)); @@ -429,7 +429,7 @@ done: pkt->pkt_length, mtu, pkt->pkt_proto_seg_sz); SK_ERR("%s", sk_dump("buf", l3_hdr, cls_len, - 128, NULL, 0)); + 128)); error = EMSGSIZE; goto fail; } diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_entry.c b/bsd/skywalk/nexus/flowswitch/flow/flow_entry.c index 0a23bde46..45e9fabc3 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_entry.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_entry.c @@ -31,12 +31,15 @@ #include #include #include +#include +#include #include #include #include #include + struct flow_entry *fe_alloc(boolean_t); static void fe_free(struct flow_entry *); static int fe_id_cmp(const struct flow_entry *, const struct flow_entry *); @@ -188,8 +191,8 @@ flow_entry_calc_flowid(struct flow_entry *fe) struct flowidns_flow_key fk; bzero(&fk, sizeof(fk)); - _CASSERT(sizeof(fe->fe_key.fk_src) == sizeof(fk.ffk_laddr)); - _CASSERT(sizeof(fe->fe_key.fk_dst) == sizeof(fk.ffk_raddr)); + static_assert(sizeof(fe->fe_key.fk_src) == sizeof(fk.ffk_laddr)); + static_assert(sizeof(fe->fe_key.fk_dst) == sizeof(fk.ffk_raddr)); bcopy(&fe->fe_key.fk_src, &fk.ffk_laddr, sizeof(fk.ffk_laddr)); bcopy(&fe->fe_key.fk_dst, &fk.ffk_raddr, sizeof(fk.ffk_raddr)); @@ -211,11 +214,8 @@ flow_entry_add_child(struct flow_entry *parent_fe, struct flow_entry *child_fe) lck_rw_lock_exclusive(&parent_fe->fe_child_list_lock); if (parent_fe->fe_flags & FLOWENTF_NONVIABLE) { - SK_ERR("child entry add failed, parent fe \"%s\" non viable 0x%llx " - "flags 0x%b %s(%d)", fe_as_string(parent_fe, - dbgbuf, sizeof(dbgbuf)), SK_KVA(parent_fe), parent_fe->fe_flags, - FLOWENTF_BITS, parent_fe->fe_proc_name, - parent_fe->fe_pid); + 
SK_ERR("child entry add failed, parent fe \"%s\" non viable", + fe2str(parent_fe, dbgbuf, sizeof(dbgbuf))); lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock); return false; } @@ -224,11 +224,8 @@ flow_entry_add_child(struct flow_entry *parent_fe, struct flow_entry *child_fe) TAILQ_FOREACH_SAFE(fe, &parent_fe->fe_child_list, fe_child_link, tfe) { if (!fe_id_cmp(fe, child_fe)) { lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock); - SK_ERR("child entry \"%s\" already exists at fe 0x%llx " - "flags 0x%b %s(%d)", fe_as_string(fe, - dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags, - FLOWENTF_BITS, fe->fe_proc_name, - fe->fe_pid); + SK_ERR("child entry \"%s\" already exists", + fe2str(fe, dbgbuf, sizeof(dbgbuf))); return false; } @@ -367,33 +364,59 @@ flow_qset_select_dynamic(struct nx_flowswitch *fsw, struct flow_entry *fe, struct ifnet *ifp; uint64_t qset_id; struct nx_netif *nif; - boolean_t changed; int err; ifp = fsw->fsw_ifp; - changed = ifnet_sync_traffic_rule_genid(ifp, &fe->fe_tr_genid); - if (!changed && skip_if_no_change) { + if (ifp->if_traffic_rule_genid == fe->fe_tr_genid && skip_if_no_change) { return; } if (fe->fe_qset != NULL) { nx_netif_qset_release(&fe->fe_qset); ASSERT(fe->fe_qset == NULL); } - if (ifp->if_traffic_rule_count == 0) { + + /* + * Note: ifp can have either eth traffc rules or inet traffc rules + * and not both. + */ + if (ifp->if_eth_traffic_rule_count > 0) { + if (!fe->fe_route) { + return; + } + + struct flow_route *fr = fe->fe_route; + struct rtentry *rt = (fr->fr_flags & FLOWRTF_GATEWAY) + ? fr->fr_rt_gw : fr->fr_rt_dst; + if (!rt) { + return; + } + + /* If tr_genid is stale in the rtentry, run traffic rules again */ + ifnet_sync_traffic_rule_genid(ifp, &fe->fe_tr_genid); + if (rt->rt_tr_genid != fe->fe_tr_genid) { + rt_lookup_qset_id(rt, true); + } + + qset_id = rt->rt_qset_id; + } else if (ifp->if_inet_traffic_rule_count > 0) { + ifnet_sync_traffic_rule_genid(ifp, &fe->fe_tr_genid); + + err = convert_flowkey_to_inet_td(&fe->fe_key, &td); + ASSERT(err == 0); + err = nxctl_inet_traffic_rule_find_qset_id(ifp->if_xname, &td, &qset_id); + if (err != 0) { + DTRACE_SKYWALK3(qset__id__not__found, + struct nx_flowswitch *, fsw, + struct flow_entry *, fe, + struct ifnet_traffic_descriptor_inet *, &td); + return; + } + } else { DTRACE_SKYWALK2(no__rules, struct nx_flowswitch *, fsw, struct flow_entry *, fe); return; } - err = convert_flowkey_to_inet_td(&fe->fe_key, &td); - ASSERT(err == 0); - err = nxctl_inet_traffic_rule_find_qset_id(ifp->if_xname, &td, &qset_id); - if (err != 0) { - DTRACE_SKYWALK3(qset__id__not__found, - struct nx_flowswitch *, fsw, - struct flow_entry *, fe, - struct ifnet_traffic_descriptor_inet *, &td); - return; - } + DTRACE_SKYWALK4(qset__id__found, struct nx_flowswitch *, fsw, struct flow_entry *, fe, struct ifnet_traffic_descriptor_inet *, &td, uint64_t, qset_id); @@ -412,6 +435,7 @@ flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr) struct flow_entry *__single parent_fe = NULL; flowadv_idx_t fadv_idx = FLOWADV_IDX_NONE; struct nexus_adapter *dev_na; + struct nx_flowswitch *fsw; struct nx_netif *nif; int err; @@ -428,7 +452,8 @@ flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr) goto done; } - struct flow_mgr *fm = fo->fo_fsw->fsw_flow_mgr; + fsw = fo->fo_fsw; + struct flow_mgr *fm = fsw->fsw_flow_mgr; fe = flow_mgr_find_conflicting_fe(fm, &key); if (fe != NULL) { if ((fe->fe_flags & FLOWENTF_PARENT) && @@ -436,11 +461,8 @@ flow_entry_alloc(struct flow_owner *fo, struct 
nx_flow_req *req, int *perr) parent_fe = fe; fe = NULL; } else { - SK_ERR("entry \"%s\" already exists at fe 0x%llx " - "flags 0x%b %s(%d)", fe_as_string(fe, - dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags, - FLOWENTF_BITS, fe->fe_proc_name, - fe->fe_pid); + SK_ERR("entry \"%s\" already exists", + fe2str(fe, dbgbuf, sizeof(dbgbuf))); /* don't return it */ flow_entry_release(&fe); err = EEXIST; @@ -491,6 +513,9 @@ flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr) if (req->nfr_flags & NXFLOWREQF_NOWAKEFROMSLEEP) { fe->fe_flags |= FLOWENTF_NOWAKEFROMSLEEP; } + if (req->nfr_flags & NXFLOWREQF_CONNECTION_IDLE) { + fe->fe_flags |= FLOWENTF_CONNECTION_IDLE; + } fe->fe_port_reservation = req->nfr_port_reservation; req->nfr_port_reservation = NULL; if (req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV) { @@ -507,7 +532,7 @@ flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr) fe->fe_tx_process = dp_flow_tx_process; fe->fe_rx_process = dp_flow_rx_process; - dev_na = fo->fo_fsw->fsw_dev_ch->ch_na; + dev_na = fsw->fsw_dev_ch->ch_na; nif = NX_NETIF_PRIVATE(dev_na->na_nx); if (NX_LLINK_PROV(nif->nif_nx) && (fe->fe_key.fk_mask & (FKMASK_IPVER | FKMASK_PROTO | FKMASK_DST)) == @@ -519,7 +544,7 @@ flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr) } else { fe->fe_qset_select = FE_QSET_SELECT_DYNAMIC; fe->fe_qset_id = 0; - flow_qset_select_dynamic(fo->fo_fsw, fe, FALSE); + flow_qset_select_dynamic(fsw, fe, FALSE); } } else { fe->fe_qset_select = FE_QSET_SELECT_NONE; @@ -530,7 +555,7 @@ flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr) fe->fe_transport_protocol = req->nfr_transport_protocol; if (NX_FSW_TCP_RX_AGG_ENABLED() && - (fo->fo_fsw->fsw_nx->nx_prov->nxprov_params->nxp_max_frags > 1) && + (fsw->fsw_nx->nx_prov->nxprov_params->nxp_max_frags > 1) && (fe->fe_key.fk_proto == IPPROTO_TCP) && (fe->fe_key.fk_mask == FKMASK_5TUPLE)) { fe->fe_rx_process = flow_rx_agg_tcp; @@ -591,6 +616,37 @@ flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr) fe->fe_policy_id = req->nfr_policy_id; fe->fe_skip_policy_id = req->nfr_skip_policy_id; + *(struct nx_flowswitch **)(uintptr_t)&fe->fe_fsw = fsw; + fe->fe_pid = fo->fo_pid; + if (req->nfr_epid != -1 && req->nfr_epid != fo->fo_pid) { + fe->fe_epid = req->nfr_epid; + proc_name(fe->fe_epid, fe->fe_eproc_name, + sizeof(fe->fe_eproc_name)); + } else { + fe->fe_epid = -1; + } + + (void) snprintf(fe->fe_proc_name, sizeof(fe->fe_proc_name), "%s", + fo->fo_name); + + fe_stats_init(fe); + flow_stats_retain(fe->fe_stats); + req->nfr_flow_stats = fe->fe_stats; + fe->fe_rx_worker_tid = 0; + + if (req->nfr_flags & NXFLOWREQF_AOP_OFFLOAD) { + os_atomic_or(&fe->fe_flags, FLOWENTF_AOP_OFFLOAD, relaxed); + /* + * For TCP flows over AOP, we will always linger in the kernel. + * We do not do TCP Time-Wait in AOP. This is so that we can + * cleanup resources from AOP quickly. 
+ */ + if (req->nfr_ip_protocol == IPPROTO_TCP) { + os_atomic_or(&fe->fe_flags, FLOWENTF_WAIT_CLOSE, relaxed); + fe->fe_linger_wait = (2 * tcp_msl) / TCP_RETRANSHZ; + } + } + err = flow_mgr_flow_hash_mask_add(fm, fe->fe_key.fk_mask); ASSERT(err == 0); @@ -615,30 +671,7 @@ flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr) RB_INSERT(flow_entry_id_tree, &fo->fo_flow_entry_id_head, fe); flow_entry_retain(fe); /* one refcnt in id_tree */ - *(struct nx_flowswitch **)(uintptr_t)&fe->fe_fsw = fo->fo_fsw; - fe->fe_pid = fo->fo_pid; - if (req->nfr_epid != -1 && req->nfr_epid != fo->fo_pid) { - fe->fe_epid = req->nfr_epid; - proc_name(fe->fe_epid, fe->fe_eproc_name, - sizeof(fe->fe_eproc_name)); - } else { - fe->fe_epid = -1; - } - - (void) snprintf(fe->fe_proc_name, sizeof(fe->fe_proc_name), "%s", - fo->fo_name); - - fe_stats_init(fe); - flow_stats_retain(fe->fe_stats); - req->nfr_flow_stats = fe->fe_stats; - fe->fe_rx_worker_tid = 0; - -#if SK_LOG - SK_DF(SK_VERB_FLOW, "allocated entry \"%s\" fe 0x%llx flags 0x%b " - "[fo 0x%llx ]", fe_as_string(fe, dbgbuf, - sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags, FLOWENTF_BITS, - SK_KVA(fo)); -#endif /* SK_LOG */ + SK_D("fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf))); done: if (parent_fe != NULL) { @@ -649,6 +682,7 @@ done: flow_owner_flowadv_index_free(fo, fadv_idx); } if (fe != NULL) { + fe->fe_flags |= (FLOWENTF_TORN_DOWN | FLOWENTF_DESTROYED); flow_entry_release(&fe); } } @@ -656,15 +690,133 @@ done: return fe; } +/* + * Add an RX flow steering rule for the given flow entry. + * + * This function provides a high-level interface for configuring RX flow steering + * rules based on flow entry characteristics. It converts the flow key to a traffic + * descriptor and configures the underlying netif for hardware steering. 
+ * + * Parameters: + * fsw - The flowswitch instance + * fe - The flow entry to configure steering for + * + * Returns: + * 0 - Success + * ENOTSUP - RX flow steering not supported + * EINVAL - Invalid parameters + * ENXIO - Device unavailable + * Other - Provider-specific error codes + */ +int +flow_entry_add_rx_steering_rule(struct nx_flowswitch *fsw, struct flow_entry *fe) +{ + struct ifnet_traffic_descriptor_inet td; + struct kern_nexus *nx; + int err = 0; + + if (__improbable(fsw == NULL || fe == NULL)) { + SK_ERR("Invalid parameters: fsw=%p, fe=%p", SK_KVA(fsw), SK_KVA(fe)); + return EINVAL; + } + + /* RX steering is only for AOP offload flows */ + ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD); + + /* Check if device channel is available */ + if (__improbable(fsw->fsw_dev_ch == NULL)) { + SK_ERR("Device channel not available for RX flow steering"); + FSW_STATS_INC(FSW_STATS_RX_FS_ADD_FAILURE); + return ENXIO; + } + + nx = fsw->fsw_dev_ch->ch_na->na_nx; + if (__improbable(nx == NULL)) { + SK_ERR("Nexus not available for RX flow steering"); + FSW_STATS_INC(FSW_STATS_RX_FS_ADD_FAILURE); + return ENXIO; + } + + /* Convert flow key to traffic descriptor */ + memset(&td, 0, sizeof(struct ifnet_traffic_descriptor_inet)); + err = convert_flowkey_to_inet_td(&fe->fe_key, &td); + if (__improbable(err != 0)) { + SK_ERR("Failed to convert flow key to traffic descriptor (err %d)", err); + FSW_STATS_INC(FSW_STATS_RX_FS_ADD_FAILURE); + return err; + } + + /* Always set inbound flag for RX flow steering */ + td.inet_common.itd_flags = IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND; + + SK_DF(SK_VERB_NETIF, + "Adding RX flow steering rule: fsw=%p, fe=%p, flow_id=%u", + SK_KVA(fsw), SK_KVA(fe), fe->fe_flowid); + + /* Configure the RX flow steering rule */ + err = nx_netif_configure_rx_flow_steering(nx, fe->fe_flowid, + (struct ifnet_traffic_descriptor_common *)&td, + RX_FLOW_STEERING_ACTION_ADD_AOP); + + if (__improbable(err != 0)) { + FSW_STATS_INC(FSW_STATS_RX_FS_ADD_FAILURE); + SK_ERR("RX flow steering rule add failed (err %d)", err); + DTRACE_SKYWALK4(rx__flow__steering__rule__add__failed, + struct nx_flowswitch *, fsw, struct flow_entry *, fe, + uint32_t, fe->fe_flowid, int, err); + } else { + FSW_STATS_INC(FSW_STATS_RX_FS_ADD_SUCCESS); + SK_DF(SK_VERB_NETIF, + "Successfully added RX flow steering rule: flow_id=%u", + fe->fe_flowid); + DTRACE_SKYWALK3(rx__flow__steering__rule__add__success, + struct nx_flowswitch *, fsw, struct flow_entry *, fe, + uint32_t, fe->fe_flowid); + + /* Mark the flow entry as having RX steering configured */ + os_atomic_or(&fe->fe_flags, FLOWENTF_RX_STEERING, relaxed); + } + + return err; +} + +void +flow_entry_rx_steering_rule_cleanup(struct nx_flowswitch *fsw, struct flow_entry *fe) +{ + struct kern_nexus *nx = NULL; + int err = 0; + + ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD); + + /* + * We check for fsw->fsw_dev_ch here because the flow could be cleaned + * up after the flow-switch has detached. The race between flow-switch + * detach and flow cleanup is prevented because flow_entry_teardown() is + * called either with a SK_LOCK() or with fsw_detach_barrier_add(). 
+ */ + if (fsw->fsw_dev_ch != NULL) { + nx = fsw->fsw_dev_ch->ch_na->na_nx; + err = nx_netif_configure_rx_flow_steering(nx, + fe->fe_flowid, NULL, RX_FLOW_STEERING_ACTION_REMOVE_AOP); + if (err != 0) { + FSW_STATS_INC(FSW_STATS_RX_FS_REMOVE_FAILURE); + SK_ERR("rx flow steering cleanup failed (err %d)", err); + } else { + FSW_STATS_INC(FSW_STATS_RX_FS_REMOVE_SUCCESS); + } + } else { + FSW_STATS_INC(FSW_STATS_RX_FS_REMOVE_SKIPPED); + } +} + void flow_entry_teardown(struct flow_owner *fo, struct flow_entry *fe) { #if SK_LOG char dbgbuf[FLOWENTRY_DBGBUF_SIZE]; - SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b [fo 0x%llx] " - "non_via %d withdrawn %d", fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), - SK_KVA(fe), fe->fe_flags, FLOWENTF_BITS, SK_KVA(fo), - fe->fe_want_nonviable, fe->fe_want_withdraw); + SK_DF(SK_VERB_FLOW, "fe \"%s\" [fo %p] " + "non_via %d withdrawn %d", fe2str(fe, dbgbuf, sizeof(dbgbuf)), + SK_KVA(fo), fe->fe_want_nonviable, fe->fe_want_withdraw); #endif /* SK_LOG */ struct nx_flowswitch *fsw = fo->fo_fsw; @@ -745,6 +897,10 @@ flow_entry_destroy(struct flow_owner *fo, struct flow_entry *fe, bool nolinger, ASSERT(!(fe->fe_flags & FLOWENTF_DESTROYED)); os_atomic_or(&fe->fe_flags, FLOWENTF_DESTROYED, relaxed); + if (fe->fe_flags & FLOWENTF_RX_STEERING) { + fsw_rxstrc_insert(fe); + } + if (fe->fe_transport_protocol == IPPROTO_QUIC) { if (!nolinger && close_params != NULL) { /* @@ -785,14 +941,6 @@ flow_entry_release(struct flow_entry **pfe) struct flow_entry *fe = *pfe; ASSERT(fe != NULL); *pfe = NULL; /* caller lose reference */ -#if SK_LOG - if (__improbable(sk_verbose != 0)) { - char dbgbuf[FLOWENTRY_DBGBUF_SIZE]; - SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b", - fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), - fe->fe_flags, FLOWENTF_BITS); - } -#endif /* SK_LOG */ if (__improbable(os_ref_release(&fe->fe_refcnt) == 0)) { fe->fe_nx_port = NEXUS_PORT_ANY; @@ -826,7 +974,7 @@ flow_entry_dead_alloc(zalloc_flags_t how) fed = zalloc_flags(sk_fed_zone, how | Z_ZERO); if (fed != NULL) { - SK_DF(SK_VERB_MEM, "fed 0x%llx ALLOC", SK_KVA(fed)); + SK_DF(SK_VERB_MEM, "fed %p ALLOC", SK_KVA(fed)); } return fed; } @@ -834,7 +982,7 @@ flow_entry_dead_alloc(zalloc_flags_t how) void flow_entry_dead_free(struct flow_entry_dead *fed) { - SK_DF(SK_VERB_MEM, "fed 0x%llx FREE", SK_KVA(fed)); + SK_DF(SK_VERB_MEM, "fed %p FREE", SK_KVA(fed)); zfree(sk_fed_zone, fed); } @@ -930,22 +1078,33 @@ fe_stats_update(struct flow_entry *fe) } else { sf->sf_flags &= ~SFLOWF_NOWAKEFROMSLEEP; } + if (fe->fe_flags & FLOWENTF_AOP_OFFLOAD) { + sf->sf_flags |= SFLOWF_AOP_OFFLOAD; + } + if (fe->fe_flags & FLOWENTF_CONNECTION_IDLE) { + sf->sf_flags |= SFLOWF_CONNECTION_IDLE; + } else { + sf->sf_flags &= ~SFLOWF_CONNECTION_IDLE; + } sf->sf_bucket_idx = SFLOW_BUCKET_NONE; - sf->sf_ltrack.sft_state = fe->fe_ltrack.fse_state; - sf->sf_ltrack.sft_seq = fe->fe_ltrack.fse_seqlo; - sf->sf_ltrack.sft_max_win = fe->fe_ltrack.fse_max_win; - sf->sf_ltrack.sft_wscale = fe->fe_ltrack.fse_wscale; - sf->sf_rtrack.sft_state = fe->fe_rtrack.fse_state; - sf->sf_rtrack.sft_seq = fe->fe_rtrack.fse_seqlo; - sf->sf_rtrack.sft_max_win = fe->fe_rtrack.fse_max_win; + /* AOP offload flows are updated in NECP via shared memory with AOP */ + if (!(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) { + sf->sf_ltrack.sft_state = fe->fe_ltrack.fse_state; + sf->sf_ltrack.sft_seq = fe->fe_ltrack.fse_seqlo; + sf->sf_ltrack.sft_max_win = fe->fe_ltrack.fse_max_win; + sf->sf_ltrack.sft_wscale = fe->fe_ltrack.fse_wscale; + sf->sf_rtrack.sft_state = 
fe->fe_rtrack.fse_state; + sf->sf_rtrack.sft_seq = fe->fe_rtrack.fse_seqlo; + sf->sf_rtrack.sft_max_win = fe->fe_rtrack.fse_max_win; + } } void flow_entry_stats_get(struct flow_entry *fe, struct sk_stats_flow *sf) { - _CASSERT(sizeof(fe->fe_stats->fs_stats) == sizeof(*sf)); + static_assert(sizeof(fe->fe_stats->fs_stats) == sizeof(*sf)); fe_stats_update(fe); bcopy(&fe->fe_stats->fs_stats, sf, sizeof(*sf)); @@ -956,7 +1115,7 @@ fe_alloc(boolean_t can_block) { struct flow_entry *fe; - _CASSERT((offsetof(struct flow_entry, fe_key) % 16) == 0); + static_assert((offsetof(struct flow_entry, fe_key) % 16) == 0); fe = skmem_cache_alloc(sk_fe_cache, can_block ? SKMEM_SLEEP : SKMEM_NOSLEEP); @@ -978,7 +1137,7 @@ fe_alloc(boolean_t can_block) return NULL; } - SK_DF(SK_VERB_MEM, "fe 0x%llx ALLOC", SK_KVA(fe)); + SK_DF(SK_VERB_MEM, "fe %p ALLOC", SK_KVA(fe)); os_ref_init(&fe->fe_refcnt, &flow_entry_refgrp); @@ -1037,7 +1196,7 @@ fe_id_cmp(const struct flow_entry *a, const struct flow_entry *b) #if SK_LOG SK_NO_INLINE_ATTRIBUTE char * -fk_as_string(const struct flow_key *fk, char *__counted_by(dsz)dst, size_t dsz) +fk2str(const struct flow_key *fk, char *__counted_by(dsz)dst, size_t dsz) { int af; char src_s[MAX_IPv6_STR_LEN]; @@ -1045,30 +1204,29 @@ fk_as_string(const struct flow_key *fk, char *__counted_by(dsz)dst, size_t dsz) af = fk->fk_ipver == 4 ? AF_INET : AF_INET6; - (void) inet_ntop(af, &fk->fk_src, src_s, sizeof(src_s)); - (void) inet_ntop(af, &fk->fk_dst, dst_s, sizeof(dst_s)); + (void) sk_ntop(af, &fk->fk_src, src_s, sizeof(src_s)); + (void) sk_ntop(af, &fk->fk_dst, dst_s, sizeof(dst_s)); (void) snprintf(dst, dsz, - "ipver=%u,src=%s,dst=%s,proto=0x%02u,sport=%u,dport=%u " - "mask=%08x,hash=%08x", - fk->fk_ipver, src_s, dst_s, fk->fk_proto, ntohs(fk->fk_sport), - ntohs(fk->fk_dport), fk->fk_mask, flow_key_hash(fk)); + "ipver=%u,src=%s.%u,dst=%s.%u,proto=0x%02u mask=0x%08x,hash=0x%08x", + fk->fk_ipver, src_s, ntohs(fk->fk_sport), dst_s, ntohs(fk->fk_dport), + fk->fk_proto, fk->fk_mask, flow_key_hash(fk)); return dst; } SK_NO_INLINE_ATTRIBUTE char * -fe_as_string(const struct flow_entry *fe, char *__counted_by(dsz)dst, size_t dsz) +fe2str(const struct flow_entry *fe, char *__counted_by(dsz)dst, size_t dsz) { char keybuf[FLOWKEY_DBGBUF_SIZE]; /* just for debug message */ uuid_string_t uuidstr; - fk_as_string(&fe->fe_key, keybuf, sizeof(keybuf)); + fk2str(&fe->fe_key, keybuf, sizeof(keybuf)); - (void) snprintf(dst, dsz, - "fe 0x%llx proc %s nx_port %d flow_uuid %s %s tp_proto=0x%02u", - SK_KVA(fe), fe->fe_proc_name, (int)fe->fe_nx_port, - sk_uuid_unparse(fe->fe_uuid, uuidstr), + (void) sk_snprintf(dst, dsz, "%p proc %s(%d)%s nx_port %d flow_uuid %s" + " flags 0x%b %s tp_proto=0x%02u", SK_KVA(fe), fe->fe_proc_name, + fe->fe_pid, fe->fe_eproc_name, (int)fe->fe_nx_port, + sk_uuid_unparse(fe->fe_uuid, uuidstr), fe->fe_flags, FLOWENTF_BITS, keybuf, fe->fe_transport_protocol); return dst; diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_manager.c b/bsd/skywalk/nexus/flowswitch/flow/flow_manager.c index e54ae50f9..9c9f4b7ea 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_manager.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_manager.c @@ -346,7 +346,7 @@ flow_mgr_terminate(struct flow_mgr *fm) for (i = 0; i < fm->fm_owner_buckets_cnt; i++) { struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i); - SK_DF(SK_VERB_FLOW, "purging fob 0x%llx [%u]", SK_KVA(fob), i); + SK_DF(SK_VERB_FLOW, "purging fob %p [%u]", SK_KVA(fob), i); flow_owner_bucket_purge_all(fob); } @@ -369,7 +369,7 @@ 
flow_mgr_terminate(struct flow_mgr *fm) for (i = 0; i < fm->fm_route_buckets_cnt; i++) { struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i); - SK_DF(SK_VERB_FLOW, "purging frb 0x%llx [%u]", SK_KVA(frb), i); + SK_DF(SK_VERB_FLOW, "purging frb %p [%u]", SK_KVA(frb), i); flow_route_bucket_purge_all(frb); } @@ -876,12 +876,12 @@ flow_req_dump(char *desc, struct nx_flow_req *req) // unsanitized req, treat source and destination AF separately if (saddr->sa.sa_family == AF_INET) { sipver = IPVERSION; - (void) inet_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s, + (void) sk_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s, sizeof(src_s)); sport = ntohs(saddr->sin.sin_port); } else if (saddr->sa.sa_family == AF_INET6) { sipver = IPV6_VERSION; - (void) inet_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s, + (void) sk_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s, sizeof(src_s)); sport = ntohs(saddr->sin6.sin6_port); } else { @@ -890,12 +890,12 @@ flow_req_dump(char *desc, struct nx_flow_req *req) } if (daddr->sa.sa_family == AF_INET) { dipver = IPVERSION; - (void) inet_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s, + (void) sk_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s, sizeof(dst_s)); dport = ntohs(daddr->sin.sin_port); } else if (daddr->sa.sa_family == AF_INET6) { dipver = IPV6_VERSION; - (void) inet_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s, + (void) sk_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s, sizeof(dst_s)); dport = ntohs(daddr->sin6.sin6_port); } else { @@ -904,10 +904,10 @@ flow_req_dump(char *desc, struct nx_flow_req *req) } SK_DF(SK_VERB_FLOW, - "%s %s sipver=%u,dipver=%u,src=%s,dst=%s,proto=%d,sport=%u,dport=%d" - " nx_port=%u,flags 0x%b", desc, sk_uuid_unparse(req->nfr_flow_uuid, - uuid_s), sipver, dipver, src_s, dst_s, protocol, sport, dport, - req->nfr_nx_port, req->nfr_flags, NXFLOWREQF_BITS); + "%s %s sipver=%u,dipver=%u,src=%s.%u,dst=%s.%u,proto=%d " + "nx_port=%u,flags 0x%x", desc, sk_uuid_unparse(req->nfr_flow_uuid, + uuid_s), sipver, dipver, src_s, sport, dst_s, dport, protocol, + req->nfr_nx_port, req->nfr_flags); } #else #define flow_req_dump(str, req) do { ((void)0); } while (0) @@ -958,6 +958,7 @@ flow_mgr_flow_add(struct kern_nexus *nx, struct flow_mgr *fm, VERIFY((req->nfr_flags & NXFLOWREQF_FLOWADV) ^ (req->nfr_flowadv_idx == FLOWADV_IDX_NONE)); req->nfr_flowadv_idx = fe->fe_adv_idx; + req->nfr_flowid = fe->fe_flowid; flow_req_dump("added ", req); @@ -1072,7 +1073,7 @@ flow_mgr_get_frib_by_uuid(struct flow_mgr *fm, uuid_t fr_uuid) } u; uint64_t key; - _CASSERT(sizeof(u.uuid) == sizeof(u.u64)); + static_assert(sizeof(u.uuid) == sizeof(u.u64)); uuid_copy(u.uuid, fr_uuid); /* XOR fold UUID down to 4-bytes */ @@ -1116,7 +1117,7 @@ __flow_mgr_find_fe_by_key_prelog(struct flow_key *key) { SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "key %s", - fk_as_string(key, dbgbuf, sizeof(dbgbuf))); + fk2str(key, dbgbuf, sizeof(dbgbuf))); } SK_NO_INLINE_ATTRIBUTE @@ -1125,8 +1126,8 @@ __flow_mgr_find_fe_by_key_epilog(struct flow_entry *fe) { SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); if (fe != NULL) { - SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe 0x%llx \"%s\"", - SK_KVA(fe), fe_as_string(fe, dbgbuf, sizeof(dbgbuf))); + SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe \"%s\"", + fe2str(fe, dbgbuf, sizeof(dbgbuf))); } else { SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe not found"); } @@ -1159,7 +1160,7 @@ flow_mgr_find_fe_by_key(struct flow_mgr *fm, struct flow_key *key) hash = flow_key_hash(key); node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, 
key, hash); SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, - "[%d] mask=%08x hash %08x node 0x%llx", i, mask, hash, + "[%d] mask=%08x hash %08x node %p", i, mask, hash, SK_KVA(node)); if (node != NULL) { fe = __container_of(node, struct flow_entry, fe_cnode); diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_namespace.c b/bsd/skywalk/nexus/flowswitch/flow/flow_namespace.c index eb6df74e7..797e014f8 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_namespace.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_namespace.c @@ -45,7 +45,7 @@ */ int flow_namespace_create(union sockaddr_in_4_6 *laddr, uint8_t protocol, - netns_token *token, uint16_t nfr_flags, struct ns_flow_info *nfi) + netns_token *token, uint32_t nfr_flags, struct ns_flow_info *nfi) { sa_family_t af = laddr->sa.sa_family; uint32_t *addr; diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_owner.c b/bsd/skywalk/nexus/flowswitch/flow/flow_owner.c index e174ecbfd..90d262995 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_owner.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_owner.c @@ -71,7 +71,7 @@ flow_owner_buckets_alloc(size_t fob_cnt, size_t * fob_sz, size_t * tot_sz){ ASSERT(IS_P2ALIGNED(fob, cache_sz)); #endif - SK_DF(SK_VERB_MEM, "fob 0x%llx fob_cnt %zu fob_sz %zu " + SK_DF(SK_VERB_MEM, "fob %p fob_cnt %zu fob_sz %zu " "(total %zu bytes) ALLOC", SK_KVA(fob), fob_cnt, *fob_sz, fob_tot_sz); @@ -81,7 +81,7 @@ flow_owner_buckets_alloc(size_t fob_cnt, size_t * fob_sz, size_t * tot_sz){ void flow_owner_buckets_free(struct flow_owner_bucket *fob, size_t tot_sz) { - SK_DF(SK_VERB_MEM, "fob 0x%llx FREE", SK_KVA(fob)); + SK_DF(SK_VERB_MEM, "fob %p FREE", SK_KVA(fob)); sk_free_type_hash(KT_SK_FOB, tot_sz, fob); } @@ -109,7 +109,7 @@ flow_owner_bucket_destroy(struct flow_owner_bucket *fob) */ FOB_LOCK(fob); while (!RB_EMPTY(&fob->fob_owner_head)) { - SK_ERR("waiting for fob 0x%llx to go idle", SK_KVA(fob)); + SK_ERR("waiting for fob %p to go idle", SK_KVA(fob)); if (++(fob->fob_dtor_waiters) == 0) { /* wraparound */ fob->fob_dtor_waiters++; } @@ -353,8 +353,8 @@ flow_owner_alloc(struct flow_owner_bucket *fob, struct proc *p, struct flow_owner *fo; const pid_t pid = proc_pid(p); - _CASSERT(true == 1); - _CASSERT(false == 0); + static_assert(true == 1); + static_assert(false == 0); ASSERT(low_latency == true || low_latency == false); ASSERT(nx_port != NEXUS_PORT_ANY); FOB_LOCK_ASSERT_HELD(fob); @@ -413,8 +413,8 @@ flow_owner_alloc(struct flow_owner_bucket *fob, struct proc *p, *(struct nx_flowswitch **)(uintptr_t)&fo->fo_fsw = fsw; RB_INSERT(flow_owner_tree, &fob->fob_owner_head, fo); - SK_DF(SK_VERB_FLOW, "%s(%d) fob 0x%llx added fo 0x%llx " - "nx_port %d nx_port_pid_bound %d ll %d nx_port_na 0x%llx", + SK_DF(SK_VERB_FLOW, "%s(%d) fob %p added fo %p " + "nx_port %d nx_port_pid_bound %d ll %d nx_port_na %p", fo->fo_name, fo->fo_pid, SK_KVA(fob), SK_KVA(fo), (int)nx_port, nx_port_pid_bound, fo->fo_low_latency, SK_KVA(nx_port_na)); @@ -443,7 +443,7 @@ flow_owner_free(struct flow_owner_bucket *fob, struct flow_owner *fo) wakeup(&fob->fob_dtor_waiters); } - SK_DF(SK_VERB_FLOW, "%s(%d) fob 0x%llx removed fo 0x%llx nx_port %d", + SK_DF(SK_VERB_FLOW, "%s(%d) fob %p removed fo %p nx_port %d", fo->fo_name, fo->fo_pid, SK_KVA(fob), SK_KVA(fo), (int)fo->fo_nx_port); @@ -567,7 +567,7 @@ fo_alloc(boolean_t can_block) bzero(fo, sk_fo_size); - SK_DF(SK_VERB_MEM, "fo 0x%llx ALLOC", SK_KVA(fo)); + SK_DF(SK_VERB_MEM, "fo %p ALLOC", SK_KVA(fo)); return fo; } @@ -579,7 +579,7 @@ fo_free(struct flow_owner *fo) ASSERT(RB_EMPTY(&fo->fo_flow_entry_id_head)); 
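A brief aside on the _CASSERT -> static_assert conversions that run through these hunks (including the true/false checks in flow_owner_alloc above): static_assert is the language-level compile-time check, so a violated invariant fails the build with a readable diagnostic rather than going through a private macro. The single-argument form used in the patch is C23 (older clang/gcc accept it as an extension); strict C11 wants a message string, and the pre-C11 trick is a negative-size array. A standalone sketch under those assumptions, with MY_CASSERT as a generic illustration rather than XNU's actual macro:

/*
 * Hedged sketch: language-level static_assert vs. a classic
 * _CASSERT-style macro. MY_CASSERT is illustrative only.
 */
#include <assert.h>     /* static_assert in C11/C23 */
#include <stddef.h>

struct demo { char a; int b; };

/* C11 two-argument form: a message is required */
static_assert(sizeof(long long) >= 8, "long long is at least 64 bits");

/* Pre-C11 style: the typedef fails to compile when expr is false */
#define MY_CASSERT(expr) \
        typedef char my_cassert_failed[(expr) ? 1 : -1]

MY_CASSERT(offsetof(struct demo, a) == 0);

int main(void) { return 0; }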
ASSERT(fo->fo_flowadv_bmap == NULL); - SK_DF(SK_VERB_MEM, "fo 0x%llx FREE", SK_KVA(fo)); + SK_DF(SK_VERB_MEM, "fo %p FREE", SK_KVA(fo)); skmem_cache_free(sk_fo_cache, fo); } diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_route.c b/bsd/skywalk/nexus/flowswitch/flow/flow_route.c index 647ddd734..e4034e5c3 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_route.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_route.c @@ -73,6 +73,8 @@ #include #include +#include + extern struct rtstat_64 rtstat; static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock"); @@ -158,7 +160,7 @@ flow_route_buckets_alloc(size_t frb_cnt, size_t * frb_sz, size_t * tot_sz){ ASSERT(IS_P2ALIGNED(frb, cache_sz)); #endif - SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu " + SK_DF(SK_VERB_MEM, "frb %p frb_cnt %zu frb_sz %zu " "(total %zu bytes) ALLOC", SK_KVA(frb), frb_cnt, *frb_sz, frb_tot_sz); @@ -168,7 +170,7 @@ flow_route_buckets_alloc(size_t frb_cnt, size_t * frb_sz, size_t * tot_sz){ void flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz) { - SK_DF(SK_VERB_MEM, "frb 0x%llx FREE", SK_KVA(frb)); + SK_DF(SK_VERB_MEM, "frb %p FREE", SK_KVA(frb)); sk_free_type_hash(KT_SK_FRB, tot_sz, frb); } @@ -255,7 +257,7 @@ flow_route_id_buckets_alloc(size_t frib_cnt, size_t * frib_sz, size_t * tot_sz){ ASSERT(IS_P2ALIGNED(frib, cache_sz)); #endif /* !KASAN_CLASSIC */ - SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu " + SK_DF(SK_VERB_MEM, "frib %p frib_cnt %zu frib_sz %zu " "(total %zu bytes) ALLOC", SK_KVA(frib), frib_cnt, *frib_sz, frib_tot_sz); @@ -265,7 +267,7 @@ flow_route_id_buckets_alloc(size_t frib_cnt, size_t * frib_sz, size_t * tot_sz){ void flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz) { - SK_DF(SK_VERB_MEM, "frib 0x%llx FREE", SK_KVA(frib)); + SK_DF(SK_VERB_MEM, "frib %p FREE", SK_KVA(frib)); sk_free_type_hash(KT_SK_FRIB, tot_sz, frib); } @@ -318,14 +320,14 @@ fr_alloc(boolean_t cansleep) lck_mtx_init(&fr->fr_lock, &flow_route_lock_group, &flow_route_lock_attr); uuid_generate_random(fr->fr_uuid); - SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr)); + SK_DF(SK_VERB_MEM, "allocated fr %p", SK_KVA(fr)); return fr; } static void fr_free(struct flow_route *fr) { - SK_DF(SK_VERB_MEM, "freeing fr 0x%llx", SK_KVA(fr)); + SK_DF(SK_VERB_MEM, "freeing fr %p", SK_KVA(fr)); VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED)); VERIFY(fr->fr_usecnt == 0); @@ -502,7 +504,7 @@ flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_re * address of the gateway in fr_gaddr. 
*/ (void) sa_copy(rt->rt_gateway, &ss, NULL); - _CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss)); + static_assert(sizeof(fr->fr_gaddr) <= sizeof(ss)); bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr)); os_atomic_or(&fr->fr_flags, FLOWRTF_GATEWAY, relaxed); } else if (IS_DIRECT_HOSTROUTE(rt)) { @@ -606,7 +608,7 @@ flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, FR_LOCK(fr); err = flow_route_configure(fr, ifp, req); if (err != 0) { - SK_ERR("fr 0x%llx error re-configuring dst %s " + SK_ERR("fr %p error re-configuring dst %s " "on %s (err %d) [R]", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), ifp->if_xname, err); @@ -615,7 +617,7 @@ flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, } if (err == 0) { SK_DF(SK_VERB_FLOW_ROUTE, - "fr 0x%llx found for dst %s " "on %s [R,%u]", + "fr %p found for dst %s " "on %s [R,%u]", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt); } @@ -640,7 +642,7 @@ flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, FR_LOCK(fr); err = flow_route_configure(fr, ifp, req); if (err != 0) { - SK_ERR("fr 0x%llx error re-configuring dst %s " + SK_ERR("fr %p error re-configuring dst %s " "on %s (err %d) [W]", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), ifp->if_xname, err); @@ -649,7 +651,7 @@ flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, } if (err == 0) { SK_DF(SK_VERB_FLOW_ROUTE, - "fr 0x%llx found for dst %s on %s [W,%u]", + "fr %p found for dst %s on %s [W,%u]", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt); } @@ -689,7 +691,7 @@ flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, FR_LOCK(fr); if ((err = flow_route_configure(fr, ifp, req)) != 0) { - SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)", + SK_ERR("fr %p error configuring dst %s on %s (err %d)", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), ifp->if_xname, err); FR_UNLOCK(fr); @@ -727,7 +729,7 @@ flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, #endif /* DEBUG */ /* for the trees */ - _CASSERT(FLOW_ROUTE_MINREF == 2); + static_assert(FLOW_ROUTE_MINREF == 2); flow_route_retain(fr); flow_route_retain(fr); ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF); @@ -742,12 +744,12 @@ flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, if (!(fr->fr_flags & FLOWRTF_RESOLVED) && (err = fr_resolve(arg, fr, NULL)) != 0) { if (fr->fr_flags & FLOWRTF_GATEWAY) { - SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)", + SK_ERR("fr %p resolve %s gw %s on %s (err %d)", SK_KVA(fr), (err == EJUSTRETURN ? "pending" : "fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s, sizeof(dst_s)), ifp->if_xname, err); } else { - SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)", + SK_ERR("fr %p resolve %s dst %s on %s (err %d)", SK_KVA(fr), (err == EJUSTRETURN ? 
"pending" : "fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), ifp->if_xname, err); @@ -763,14 +765,14 @@ flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, #if SK_LOG if (fr->fr_flags & FLOWRTF_GATEWAY) { SK_DF(SK_VERB_FLOW_ROUTE, - "add fr 0x%llx %s -> %s via gw %s on %s", SK_KVA(fr), + "add fr %p %s -> %s via gw %s on %s", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)), ifp->if_xname); } else { SK_DF(SK_VERB_FLOW_ROUTE, - "add fr 0x%llx %s -> %s on %s", SK_KVA(fr), + "add fr %p %s -> %s on %s", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), ifp->if_xname); @@ -809,7 +811,7 @@ __flow_route_release(struct flow_route *fr, boolean_t renew) VERIFY(fr->fr_usecnt > 0); if (fr->fr_flags & FLOWRTF_ATTACHED) { if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1) && renew) { - fr->fr_expire = _net_uptime + flow_route_expire; + fr->fr_expire = net_uptime() + flow_route_expire; } } else { /* @@ -864,7 +866,7 @@ flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid, (fr->fr_expire > now && !early_expire && !(fr->fr_flags & FLOWRTF_DELETED)))) { lck_spin_unlock(&fr->fr_reflock); - SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx " + SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr %p " "refcnt %u expire %llu", SK_KVA(fr), fr->fr_usecnt, fr->fr_expire); continue; @@ -883,7 +885,7 @@ flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid, } FRIB_WLOCK_ASSERT_HELD(frib); - _CASSERT(FLOW_ROUTE_MINREF == 2); + static_assert(FLOW_ROUTE_MINREF == 2); ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF); RB_REMOVE(flow_route_tree, &frb->frb_head, fr); @@ -894,7 +896,7 @@ flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid, #if SK_LOG if (fr->fr_flags & FLOWRTF_GATEWAY) { SK_DF(SK_VERB_FLOW_ROUTE, - "remove fr 0x%llx %s -> %s via gw %s [exp %lld]", + "remove fr %p %s -> %s via gw %s [exp %lld]", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)), sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)), @@ -902,7 +904,7 @@ flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid, (int64_t)(fr->fr_expire - now)); } else { SK_DF(SK_VERB_FLOW_ROUTE, - "remove fr 0x%llx %s -> %s [exp %lld]", SK_KVA(fr), + "remove fr %p %s -> %s [exp %lld]", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)), sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)), (int64_t)(fr->fr_expire - now)); @@ -1144,7 +1146,7 @@ flow_route_ev_callback(struct eventhandler_entry_arg ee_arg, if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) { SK_DF(SK_VERB_FLOW_ROUTE, - "%s: fr 0x%llx eth_type 0x%x " + "%s: fr %p eth_type 0x%x " "eth_src %x:%x:%x:%x:%x:%x " "eth_dst %x:%x:%x:%x:%x:%x [%s])", fm->fm_name, SK_KVA(fr), @@ -1242,8 +1244,8 @@ flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst, if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) { err = EHOSTUNREACH; SK_ERR("route to %s has src address marked detaching " - "(err %d)", inet_ntop(AF_INET, - &SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err); + "(err %d)", sk_ntop(AF_INET, &SIN(dst)->sin_addr, + dst_s, sizeof(dst_s)), err); ifnet_lock_done(ifp); break; } @@ -1269,7 +1271,7 @@ flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst, } VERIFY(src_ifp == NULL); SK_ERR("src address to dst %s on %s not available " - "(err 
%d)", inet_ntop(AF_INET6, + "(err %d)", sk_ntop(AF_INET6, &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)), ifp->if_xname, err); break; @@ -1283,9 +1285,9 @@ flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst, err = ENETUNREACH; } SK_ERR("dst %s, src %s ifp %s != %s (err %d)", - inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr, + sk_ntop(AF_INET6, &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)), - inet_ntop(AF_INET6, &SIN6(src)->sin6_addr, + sk_ntop(AF_INET6, &SIN6(src)->sin6_addr, src_s, sizeof(src_s)), src_ifp->if_xname, ifp->if_xname, err); break; @@ -1371,13 +1373,13 @@ flow_route_cleanup(struct flow_route *fr) #if SK_LOG if (fr->fr_flags & FLOWRTF_GATEWAY) { SK_DF(SK_VERB_FLOW_ROUTE, - "clean fr 0x%llx %s -> %s via gw %s", SK_KVA(fr), + "clean fr %p %s -> %s via gw %s", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)), sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)), sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs))); } else if (fr->fr_flags & FLOWRTF_ONLINK) { SK_DF(SK_VERB_FLOW_ROUTE, - "clean fr 0x%llx %s -> %s", SK_KVA(fr), + "clean fr %p %s -> %s", SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)), sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds))); } diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_stats.c b/bsd/skywalk/nexus/flowswitch/flow/flow_stats.c index c92194eee..c5427c729 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_stats.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_stats.c @@ -73,8 +73,8 @@ flow_stats_alloc(boolean_t cansleep) { struct flow_stats *fs; - _CASSERT((offsetof(struct flow_stats, fs_stats) % 16) == 0); - _CASSERT((offsetof(struct sk_stats_flow, sf_key) % 16) == 0); + static_assert((offsetof(struct flow_stats, fs_stats) % 16) == 0); + static_assert((offsetof(struct sk_stats_flow, sf_key) % 16) == 0); /* XXX -fbounds-safety: fix after skmem merge */ fs = __unsafe_forge_bidi_indexable(struct flow_stats *, @@ -92,7 +92,7 @@ flow_stats_alloc(boolean_t cansleep) ASSERT(IS_P2ALIGNED(fs, 16)); bzero(fs, flow_stats_size); os_ref_init(&fs->fs_refcnt, &flow_stats_refgrp); - SK_DF(SK_VERB_MEM, "allocated fs 0x%llx", SK_KVA(fs)); + SK_DF(SK_VERB_MEM, "allocated fs %p", SK_KVA(fs)); return fs; } @@ -101,6 +101,6 @@ flow_stats_free(struct flow_stats *fs) { VERIFY(os_ref_get_count(&fs->fs_refcnt) == 0); - SK_DF(SK_VERB_MEM, "freeing fs 0x%llx", SK_KVA(fs)); + SK_DF(SK_VERB_MEM, "freeing fs %p", SK_KVA(fs)); skmem_cache_free(flow_stats_cache, fs); } diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_track.c b/bsd/skywalk/nexus/flowswitch/flow/flow_track.c index 9e00e8cd8..f5b56841d 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_track.c +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_track.c @@ -61,7 +61,7 @@ flow_track_tcp_get_wscale(struct flow_track *s, struct __kern_packet *pkt) uint8_t optlen, wscale = 0; const uint8_t *opt; - _CASSERT(sizeof(s->fse_flags) == sizeof(uint16_t)); + static_assert(sizeof(s->fse_flags) == sizeof(uint16_t)); ASSERT(hlen >= (int)sizeof(struct tcphdr)); opt = hdr + sizeof(struct tcphdr); @@ -174,11 +174,11 @@ flow_track_tcp_rtt(struct flow_entry *fe, boolean_t input, /* start a new RTT tracking session under sampling rate limit */ if (dst_last == 0 || - _net_uptime - dst_last > FLOWTRACK_RTT_SAMPLE_INTERVAL) { + net_uptime() - dst_last > FLOWTRACK_RTT_SAMPLE_INTERVAL) { if (ulen > 0 && dst->fse_rtt.frtt_timestamp == 0) { dst->fse_rtt.frtt_timestamp = mach_absolute_time(); - dst->fse_rtt.frtt_last = _net_uptime; + dst->fse_rtt.frtt_last = net_uptime(); dst->fse_rtt.frtt_seg_begin = seq; 
dst->fse_rtt.frtt_seg_end = seq + ulen; KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_START), @@ -509,8 +509,13 @@ flow_track_tcp(struct flow_entry *fe, struct flow_track *src, } } if (tcp_flags & TH_RST) { - src->fse_state = dst->fse_state = TCPS_TIME_WAIT; - ftflags |= FTF_WAITCLOSE; + /* + * Do not act on TCP RST with invalid sequence number per RFC 5961 + */ + if (SEQ_GEQ(orig_seq, src->fse_seqlo)) { + src->fse_state = dst->fse_state = TCPS_TIME_WAIT; + ftflags |= FTF_WAITCLOSE; + } } } else { if (dst->fse_state == TCPS_SYN_SENT && @@ -572,7 +577,7 @@ done: * If we're over the rate limit for outbound SYNs, drop packet. */ if (__improbable((ftflags & FTF_SYN_RLIM) != 0)) { - uint32_t now = (uint32_t)_net_uptime; + uint32_t now = (uint32_t)net_uptime(); if ((now - src->fse_syn_ts) > 1) { src->fse_syn_ts = now; src->fse_syn_cnt = 0; @@ -597,7 +602,7 @@ flow_track_tcp_want_abort(struct flow_entry *fe) struct flow_track *dst = &fe->fe_rtrack; if (fe->fe_key.fk_proto != IPPROTO_TCP || - (fe->fe_flags & FLOWENTF_ABORTED)) { + (fe->fe_flags & (FLOWENTF_ABORTED | FLOWENTF_AOP_OFFLOAD))) { goto done; } @@ -668,7 +673,7 @@ flow_track_stats(struct flow_entry *fe, uint64_t bytes, uint64_t packets, if (__probable(active)) { in_stat_set_activity_bitmap(&fe->fe_stats->fs_activity, - _net_uptime); + net_uptime()); } } @@ -678,33 +683,33 @@ flow_pkt_track(struct flow_entry *fe, struct __kern_packet *pkt, bool in) struct flow_track *src, *dst; int ret = 0; - _CASSERT(SFT_STATE_CLOSED == FT_STATE_CLOSED); - _CASSERT(SFT_STATE_LISTEN == FT_STATE_LISTEN); - _CASSERT(SFT_STATE_SYN_SENT == FT_STATE_SYN_SENT); - _CASSERT(SFT_STATE_SYN_RECEIVED == FT_STATE_SYN_RECEIVED); - _CASSERT(SFT_STATE_ESTABLISHED == FT_STATE_ESTABLISHED); - _CASSERT(SFT_STATE_CLOSE_WAIT == FT_STATE_CLOSE_WAIT); - _CASSERT(SFT_STATE_FIN_WAIT_1 == FT_STATE_FIN_WAIT_1); - _CASSERT(SFT_STATE_CLOSING == FT_STATE_CLOSING); - _CASSERT(SFT_STATE_LAST_ACK == FT_STATE_LAST_ACK); - _CASSERT(SFT_STATE_FIN_WAIT_2 == FT_STATE_FIN_WAIT_2); - _CASSERT(SFT_STATE_TIME_WAIT == FT_STATE_TIME_WAIT); - _CASSERT(SFT_STATE_NO_TRAFFIC == FT_STATE_NO_TRAFFIC); - _CASSERT(SFT_STATE_SINGLE == FT_STATE_SINGLE); - _CASSERT(SFT_STATE_MULTIPLE == FT_STATE_MULTIPLE); - _CASSERT(SFT_STATE_MAX == FT_STATE_MAX); + static_assert(SFT_STATE_CLOSED == FT_STATE_CLOSED); + static_assert(SFT_STATE_LISTEN == FT_STATE_LISTEN); + static_assert(SFT_STATE_SYN_SENT == FT_STATE_SYN_SENT); + static_assert(SFT_STATE_SYN_RECEIVED == FT_STATE_SYN_RECEIVED); + static_assert(SFT_STATE_ESTABLISHED == FT_STATE_ESTABLISHED); + static_assert(SFT_STATE_CLOSE_WAIT == FT_STATE_CLOSE_WAIT); + static_assert(SFT_STATE_FIN_WAIT_1 == FT_STATE_FIN_WAIT_1); + static_assert(SFT_STATE_CLOSING == FT_STATE_CLOSING); + static_assert(SFT_STATE_LAST_ACK == FT_STATE_LAST_ACK); + static_assert(SFT_STATE_FIN_WAIT_2 == FT_STATE_FIN_WAIT_2); + static_assert(SFT_STATE_TIME_WAIT == FT_STATE_TIME_WAIT); + static_assert(SFT_STATE_NO_TRAFFIC == FT_STATE_NO_TRAFFIC); + static_assert(SFT_STATE_SINGLE == FT_STATE_SINGLE); + static_assert(SFT_STATE_MULTIPLE == FT_STATE_MULTIPLE); + static_assert(SFT_STATE_MAX == FT_STATE_MAX); - _CASSERT(FT_STATE_CLOSED == TCPS_CLOSED); - _CASSERT(FT_STATE_LISTEN == TCPS_LISTEN); - _CASSERT(FT_STATE_SYN_SENT == TCPS_SYN_SENT); - _CASSERT(FT_STATE_SYN_RECEIVED == TCPS_SYN_RECEIVED); - _CASSERT(FT_STATE_ESTABLISHED == TCPS_ESTABLISHED); - _CASSERT(FT_STATE_CLOSE_WAIT == TCPS_CLOSE_WAIT); - _CASSERT(FT_STATE_FIN_WAIT_1 == TCPS_FIN_WAIT_1); - _CASSERT(FT_STATE_CLOSING == TCPS_CLOSING); - 
_CASSERT(FT_STATE_LAST_ACK == TCPS_LAST_ACK); - _CASSERT(FT_STATE_FIN_WAIT_2 == TCPS_FIN_WAIT_2); - _CASSERT(FT_STATE_TIME_WAIT == TCPS_TIME_WAIT); + static_assert(FT_STATE_CLOSED == TCPS_CLOSED); + static_assert(FT_STATE_LISTEN == TCPS_LISTEN); + static_assert(FT_STATE_SYN_SENT == TCPS_SYN_SENT); + static_assert(FT_STATE_SYN_RECEIVED == TCPS_SYN_RECEIVED); + static_assert(FT_STATE_ESTABLISHED == TCPS_ESTABLISHED); + static_assert(FT_STATE_CLOSE_WAIT == TCPS_CLOSE_WAIT); + static_assert(FT_STATE_FIN_WAIT_1 == TCPS_FIN_WAIT_1); + static_assert(FT_STATE_CLOSING == TCPS_CLOSING); + static_assert(FT_STATE_LAST_ACK == TCPS_LAST_ACK); + static_assert(FT_STATE_FIN_WAIT_2 == TCPS_FIN_WAIT_2); + static_assert(FT_STATE_TIME_WAIT == TCPS_TIME_WAIT); ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED); diff --git a/bsd/skywalk/nexus/flowswitch/flow/flow_var.h b/bsd/skywalk/nexus/flowswitch/flow/flow_var.h index 37f16b495..6e35932ad 100644 --- a/bsd/skywalk/nexus/flowswitch/flow/flow_var.h +++ b/bsd/skywalk/nexus/flowswitch/flow/flow_var.h @@ -250,7 +250,8 @@ typedef void (*flow_tx_action_t)(struct nx_flowswitch *fsw, struct flow_entry *f #define FLOW_PROC_FLAG_FRAGMENTS 0x0001 typedef void (*flow_rx_action_t)(struct nx_flowswitch *fsw, struct flow_entry *fe, - struct pktq *pkts, uint32_t rx_bytes, uint32_t flags); + struct pktq *pkts, uint32_t rx_bytes, struct mbufq *host_mq, + uint32_t flags); struct flow_entry { /**** Common Group ****/ @@ -349,13 +350,18 @@ struct flow_entry { uint8_t fe_demux_pattern_count; struct kern_flow_demux_pattern *__counted_by(fe_demux_pattern_count)fe_demux_patterns; uint8_t *__sized_by_or_null(FLOW_DEMUX_MAX_LEN) fe_demux_pkt_data; + + TAILQ_ENTRY(flow_entry) fe_rxstrc_link; }; /* valid values for fe_flags */ #define FLOWENTF_INITED 0x00000001 /* {src,dst} states initialized */ +#define FLOWENTF_AOP_OFFLOAD 0x00000002 /* AOP Offload flow */ +#define FLOWENTF_RX_STEERING 0x00000004 /* RX flow steering configured */ #define FLOWENTF_TRACK 0x00000010 /* enable state tracking */ #define FLOWENTF_CONNECTED 0x00000020 /* connected mode */ #define FLOWENTF_LISTENER 0x00000040 /* listener mode */ +#define FLOWENTF_RXSTRC_PENDING 0x00000080 /* Rx steering rule cleanup pending */ #define FLOWENTF_QOS_MARKING 0x00000100 /* flow can have qos marking */ #define FLOWENTF_LOW_LATENCY 0x00000200 /* low latency flow */ #define FLOWENTF_WAIT_CLOSE 0x00001000 /* defer free after close */ @@ -366,6 +372,7 @@ struct flow_entry { #define FLOWENTF_CHILD 0x00020000 /* child flow */ #define FLOWENTF_PARENT 0x00040000 /* parent flow */ #define FLOWENTF_NOWAKEFROMSLEEP 0x00080000 /* don't wake for this flow */ +#define FLOWENTF_CONNECTION_IDLE 0x00100000 /* connection is idle */ #define FLOWENTF_ABORTED 0x01000000 /* has sent RST to peer */ #define FLOWENTF_NONVIABLE 0x02000000 /* disabled; awaiting tear down */ #define FLOWENTF_WITHDRAWN 0x04000000 /* flow has been withdrawn */ @@ -375,9 +382,10 @@ struct flow_entry { #define FLOWENTF_LINGERING 0x80000000 /* destroyed and in linger list */ #define FLOWENTF_BITS \ - "\020\01INITED\05TRACK\06CONNECTED\07LISTNER\011QOS_MARKING" \ + "\020\01INITED\02AOP_OFFLOAD\03RX_STEERING\05TRACK\06CONNECTED\07LISTNER\011QOS_MARKING" \ "\012LOW_LATENCY\015WAIT_CLOSE\016CLOSE_NOTIFY\017EXT_PORT" \ - "\020EXT_PROTO\021EXT_FLOWID\031ABORTED\032NONVIABLE\033WITHDRAWN" \ + "\020EXT_PROTO\021EXT_FLOWID\024NOWAKEFROMSLEEP\025CONNECTION_IDLE" \ + "\031ABORTED\032NONVIABLE\033WITHDRAWN" \ "\034TORN_DOWN\035HALF_CLOSED\037DESTROYED\40LINGERING" 
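Aside on the FLOWENTF_BITS string updated above: it feeds the kernel printf "%b" formatter, where the leading byte is the numeric base (\020 = 16) and each subsequent entry is a 1-indexed bit position followed by the flag name, so \02AOP_OFFLOAD and \03RX_STEERING line up with the new 0x00000002 and 0x00000004 flags, and \024/\025 line up with bits 20 and 21 for NOWAKEFROMSLEEP and CONNECTION_IDLE. Below is a minimal user-space decoder sketch of that convention with a trimmed table; print_bits is an illustration of the format, not the kernel routine:

/*
 * Hedged sketch: decode a "%b"-style bit-name table such as
 * FLOWENTF_BITS. First byte = numeric base, then <bit><NAME> pairs
 * with 1-indexed bit positions. Illustrative only.
 */
#include <stdio.h>

static void print_bits(unsigned int v, const char *fmt)
{
    int base = *fmt++;                 /* \020 -> 16, \012 -> 10, ... */
    printf(base == 16 ? "0x%x" : "%u", v);

    int any = 0;
    while (*fmt) {
        int bit = *fmt++;              /* 1-indexed bit position */
        if (v & (1u << (bit - 1))) {
            putchar(any ? ',' : '<');
            any = 1;
            while (*fmt > ' ')         /* name runs until next control byte */
                putchar(*fmt++);
        } else {
            while (*fmt > ' ')
                fmt++;
        }
    }
    if (any)
        putchar('>');
    putchar('\n');
}

int main(void)
{
    /* 0x2 | 0x4 prints 0x6<AOP_OFFLOAD,RX_STEERING> with this trimmed table */
    print_bits(0x6, "\020\01INITED\02AOP_OFFLOAD\03RX_STEERING");
    return 0;
}

The output has the same shape as the "flags 0x%b" fields in the SK_ERR/SK_DF messages touched elsewhere in this patch.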
TAILQ_HEAD(flow_entry_linger_head, flow_entry); @@ -396,6 +404,8 @@ struct flow_entry_dead { } __sk_aligned(8); }; +TAILQ_HEAD(flow_entry_rxstrc_head, flow_entry); + /* * Minimum refcnt for a flow route entry to be considered as idle. */ @@ -647,10 +657,10 @@ static inline int flow_key_cmp_mask(const struct flow_key *match, const struct flow_key *key, const struct flow_key *mask) { - _CASSERT(FLOW_KEY_LEN == 48); - _CASSERT(FLOW_KEY_LEN == sizeof(struct flow_key)); - _CASSERT((sizeof(struct flow_entry) % 16) == 0); - _CASSERT((offsetof(struct flow_entry, fe_key) % 16) == 0); + static_assert(FLOW_KEY_LEN == 48); + static_assert(FLOW_KEY_LEN == sizeof(struct flow_key)); + static_assert((sizeof(struct flow_entry) % 16) == 0); + static_assert((offsetof(struct flow_entry, fe_key) % 16) == 0); /* local variables are __bidi_indexable with -fbounds-safety */ const struct flow_key *match_idx = match; @@ -948,7 +958,7 @@ extern int flow_mgr_flow_hash_mask_del(struct flow_mgr *fm, uint32_t mask); extern struct flow_entry * fe_alloc(boolean_t can_block); extern int flow_namespace_create(union sockaddr_in_4_6 *, uint8_t protocol, - netns_token *, uint16_t, struct ns_flow_info *); + netns_token *, uint32_t, struct ns_flow_info *); extern void flow_namespace_half_close(netns_token *token); extern void flow_namespace_withdraw(netns_token *); extern void flow_namespace_destroy(netns_token *); @@ -992,6 +1002,10 @@ extern struct flow_entry * flow_entry_alloc(struct flow_owner *fo, extern void flow_entry_teardown(struct flow_owner *, struct flow_entry *); extern void flow_entry_destroy(struct flow_owner *, struct flow_entry *, bool, void *); +extern int flow_entry_add_rx_steering_rule(struct nx_flowswitch *fsw, + struct flow_entry *fe); +extern void flow_entry_rx_steering_rule_cleanup(struct nx_flowswitch *, + struct flow_entry *); extern void flow_entry_retain(struct flow_entry *fe); extern void flow_entry_release(struct flow_entry **pfe); extern uint32_t flow_entry_refcnt(struct flow_entry *fe); @@ -1017,12 +1031,14 @@ extern void flow_track_abort_tcp( struct flow_entry *fe, extern void flow_track_abort_quic(struct flow_entry *fe, uint8_t *__counted_by(QUIC_STATELESS_RESET_TOKEN_SIZE)token); -extern void fsw_host_rx(struct nx_flowswitch *, struct pktq *); -extern void fsw_host_sendup(struct ifnet *, struct mbuf *, struct mbuf *, - uint32_t, uint32_t); +extern void fsw_host_rx_cb(struct nx_flowswitch *fsw, struct pktq *pktq); +extern void fsw_host_rx_enqueue_mbq(struct nx_flowswitch *fsw, struct pktq *pktq, + struct mbufq *host_mq); +extern void fsw_host_sendup(struct ifnet *ifp, struct mbufq *host_mq); extern void flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe, - struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags); + struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq, + uint32_t flags); extern void flow_route_init(void); extern void flow_route_fini(void); @@ -1062,8 +1078,8 @@ extern struct flow_stats *flow_stats_alloc(boolean_t cansleep); #if SK_LOG #define FLOWKEY_DBGBUF_SIZE 256 #define FLOWENTRY_DBGBUF_SIZE 512 -extern char *fk_as_string(const struct flow_key *fk, char *__counted_by(dsz)dst, size_t dsz); -extern char *fe_as_string(const struct flow_entry *fe, char *__counted_by(dsz)dst, size_t dsz); +extern char *fk2str(const struct flow_key *fk, char *__counted_by(dsz)dst, size_t dsz); +extern char *fe2str(const struct flow_entry *fe, char *__counted_by(dsz)dst, size_t dsz); #endif /* SK_LOG */ __END_DECLS #endif /* BSD_KERNEL_PRIVATE */ diff --git 
a/bsd/skywalk/nexus/flowswitch/fsw.c b/bsd/skywalk/nexus/flowswitch/fsw.c index 5f86440a9..63338aa5b 100644 --- a/bsd/skywalk/nexus/flowswitch/fsw.c +++ b/bsd/skywalk/nexus/flowswitch/fsw.c @@ -61,6 +61,9 @@ #include #include #include +#include + +#include #if (DEVELOPMENT || DEBUG) SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue, @@ -156,7 +159,7 @@ out: if ((*vpna) != NULL) { (*vpna)->vpna_up.na_private = ch; SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW, - "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" " + "vpna \"%s\" (%p) refs %u to fsw \"%s\" " "nx_port %d (err %d)", (*vpna)->vpna_up.na_name, SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount, cr_name, (int)(*vpna)->vpna_nx_port, err); @@ -377,7 +380,7 @@ fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna) * single threaded write to destination rings. */ if ((ifp->if_eflags & IFEF_TXSTART) == 0) { - SK_ERR("non TXSTART interface not supported ifp(0x%llx)", + SK_ERR("non TXSTART interface not supported ifp(%p)", SK_KVA(ifp)); return ENOTSUP; } @@ -475,7 +478,7 @@ fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna) __unsafe_null_terminated_from_indexable(fsw->fsw_reap_name)); error = fsw_netagent_register(fsw, ifp); - SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW, + SK_DF(error ? SK_VERB_ERROR : SK_VERB_DEFAULT, "fsw_netagent_register %s (family %u) (err %d)", if_name(ifp), ifp->if_family, error); @@ -535,8 +538,8 @@ fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna) skoid_destroy(&fsw->fsw_skoid); - SK_DF(SK_VERB_FSW, "%sdetached from %s (family %u)", - ((fsw->fsw_agent_session != NULL) ? "netagent" : ""), + SK_D("%sdetached from %s (family %u)", + ((fsw->fsw_agent_session != NULL) ? "netagent " : ""), if_name(ifp), ifp->if_family); if (hwna != NULL) { @@ -602,7 +605,7 @@ fsw_host_setup(struct nx_flowswitch *fsw) * single threaded write to destination rings. */ if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) { - SK_ERR("ifp(0x%llx): multiple rx rings(%d) not supported", + SK_ERR("ifp(%p): multiple rx rings(%d) not supported", SK_KVA(ifp), hwna->na_num_rx_rings); return ENOTSUP; } @@ -654,14 +657,14 @@ fsw_ctl_attach_log(const struct nx_spec_req *nsr, nustr = __unsafe_forge_null_terminated(const char *, sk_uuid_unparse(nsr->nsr_uuid, uuidstr)); } else if (nsr->nsr_flags & NXSPECREQ_IFP) { - nustr = tsnprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx", + nustr = tsnprintf((char *)uuidstr, sizeof(uuidstr), "%p", SK_KVA(nsr->nsr_ifp)); } else { nustr = __unsafe_null_terminated_from_indexable(nsr->nsr_name); } SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW, - "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d", + "nexus %p (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr, sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err); } @@ -977,13 +980,13 @@ done: if (nsr != NULL) { uuid_string_t ifuuidstr; SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW, - "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d", + "nexus %p (%s) if_uuid %s flags 0x%x err %d", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err); } else { SK_DF(err ? 
SK_VERB_ERROR : SK_VERB_FSW, - "nexus 0x%llx (%s) ANY err %d", SK_KVA(nx), + "nexus %p (%s) ANY err %d", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, err); } #endif /* SK_LOG */ @@ -1022,6 +1025,7 @@ fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p, struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx); struct nx_spec_req *__single nsr = data; struct nx_flow_req *__single req = data; + const task_t __single task = proc_task(p); boolean_t need_check; int error = 0; @@ -1069,6 +1073,13 @@ fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p, goto done; } } + + if (req->nfr_flags & NXFLOWREQF_AOP_OFFLOAD) { + if (!IOTaskHasEntitlement(task, "com.apple.private.network.aop2_offload")) { + error = EPERM; + goto done; + } + } break; default: @@ -1271,8 +1282,8 @@ out: char dbgbuf[FLOWENTRY_DBGBUF_SIZE]; SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol " "event %d with value %d and tcp sequence number %d", - fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), - protoctl_event_code, p_val->val, p_val->tcp_seq_number); + fe2str(fe, dbgbuf, sizeof(dbgbuf)), protoctl_event_code, + p_val->val, p_val->tcp_seq_number); #endif /* SK_LOG */ if ((error = netagent_update_flow_protoctl_event( fsw->fsw_agent_session, fe_uuid, protoctl_event_code, @@ -1449,7 +1460,7 @@ fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna, done: SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW, - "fsw 0x%llx nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu " + "fsw %p nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu " "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err); @@ -1480,7 +1491,7 @@ fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna) vpna->vpna_pid, nx_port, FALSE); SK_DF(SK_VERB_FSW, - "fsw 0x%llx nx_port %d pid %d pid_bound %u defunct %u " + "fsw %p nx_port %d pid %d pid_bound %u defunct %u " "purged %u", SK_KVA(fsw), (int)nx_port, vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct, purge_cnt); @@ -1546,14 +1557,14 @@ fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb, #if SK_LOG if (*vpna != NULL) { SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW, - "+++ vpna \"%s\" (0x%llx) <-> fsw 0x%llx " + "+++ vpna \"%s\" (%p) <-> fsw %p " "%sport %d refonly %u (err %d)", (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw), nx_fsw_dom_port_is_reserved(nx, nx_port) ? "[reserved] " : "", (int)nx_port, refonly, error); } else { SK_DF(error ? 
SK_VERB_ERROR : SK_VERB_FSW, - "+++ fsw 0x%llx nx_port %d refonly %u " + "+++ fsw %p nx_port %d refonly %u " "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error); } #endif /* SK_LOG */ @@ -1624,7 +1635,7 @@ fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna, return; } - SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (0x%llx) -!- fsw 0x%llx " + SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (%p) -!- fsw %p " "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna), SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct); @@ -1647,8 +1658,8 @@ fsw_port_na_activate(struct nx_flowswitch *fsw, SK_LOCK_ASSERT_HELD(); /* The following code relies on the static value asserted below */ - _CASSERT(FSW_VP_DEV == 0); - _CASSERT(FSW_VP_HOST == 1); + static_assert(FSW_VP_DEV == 0); + static_assert(FSW_VP_HOST == 1); ASSERT(NA_IS_ACTIVE(&vpna->vpna_up)); ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY); @@ -1680,7 +1691,7 @@ fsw_port_na_activate(struct nx_flowswitch *fsw, done: SK_DF(SK_VERB_FSW, - "fsw 0x%llx %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u", + "fsw %p %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u", SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port, vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt); @@ -2354,8 +2365,8 @@ fsw_read_boot_args(void) void fsw_init(void) { - _CASSERT(NX_FSW_CHUNK_FREE == (uint64_t)-1); - _CASSERT(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE); + static_assert(NX_FSW_CHUNK_FREE == (uint64_t) -1); + static_assert(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE); if (!__nx_fsw_inited) { fsw_read_boot_args(); @@ -2415,7 +2426,7 @@ fsw_alloc(zalloc_flags_t how) fsw->fsw_host_ch = NULL; fsw->fsw_closed_na_stats = nsfw; - SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw)); + SK_DF(SK_VERB_MEM, "fsw %p ALLOC", SK_KVA(fsw)); return fsw; } @@ -2490,6 +2501,7 @@ fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna, nx_prov->nxprov_params->nxp_ifindex = 0; /* free any flow entries in the deferred list */ fsw_linger_purge(fsw); + fsw_rxstrc_purge(fsw); } /* * If we are destroying the instance, release lock to let all @@ -2525,6 +2537,6 @@ fsw_free(struct nx_flowswitch *fsw) fsw->fsw_closed_na_stats = NULL; FSW_RWDESTROY(fsw); - SK_DF(SK_VERB_MEM, "fsw 0x%llx FREE", SK_KVA(fsw)); + SK_DF(SK_VERB_MEM, "fsw %p FREE", SK_KVA(fsw)); zfree(nx_fsw_zone, fsw); } diff --git a/bsd/skywalk/nexus/flowswitch/fsw_classq.c b/bsd/skywalk/nexus/flowswitch/fsw_classq.c index de77529a4..8e939d233 100644 --- a/bsd/skywalk/nexus/flowswitch/fsw_classq.c +++ b/bsd/skywalk/nexus/flowswitch/fsw_classq.c @@ -60,7 +60,7 @@ fsw_classq_teardown(struct nx_flowswitch *fsw, struct nexus_adapter *hostna) ASSERT(fsw->fsw_classq_enq_ptype == QP_PACKET); } /* flush the interface queues */ - if_qflush_snd(hostna->na_ifp, false); + if_qflush(hostna->na_ifp, hostna->na_ifp->if_snd); } struct mbuf * @@ -84,23 +84,18 @@ fsw_classq_kpkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt) m, 0, pkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(pkt), pkt->pkt_csum_tx_start_off); - _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) == - sizeof(pkt->pkt_flow_token)); - _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == - sizeof(pkt->pkt_flowsrc_token)); - _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == - sizeof(pkt->pkt_flowsrc_fidx)); - _CASSERT(sizeof(m->m_pkthdr.comp_gencnt) == - sizeof(pkt->pkt_comp_gencnt)); + static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(pkt->pkt_flow_token)); + static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(pkt->pkt_flowsrc_token)); + 
static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(pkt->pkt_flowsrc_fidx)); + static_assert(sizeof(m->m_pkthdr.comp_gencnt) == sizeof(pkt->pkt_comp_gencnt)); m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token; m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt; m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token; m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx; - SK_DF(SK_VERB_TX | SK_VERB_DUMP, "%s(%d) %s", - sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()), - sk_dump("buf", m_mtod_current(m), m->m_len, 128, NULL, 0)); + SK_PDF(SK_VERB_TX | SK_VERB_DUMP, current_proc(), "%s", + sk_dump("buf", m_mtod_current(m), m->m_len, 128)); if (__improbable((error != 0))) { if (m != NULL) { diff --git a/bsd/skywalk/nexus/flowswitch/fsw_dp.c b/bsd/skywalk/nexus/flowswitch/fsw_dp.c index a96c2def9..9f79bb89a 100644 --- a/bsd/skywalk/nexus/flowswitch/fsw_dp.c +++ b/bsd/skywalk/nexus/flowswitch/fsw_dp.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -126,7 +127,7 @@ static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ; #define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */ static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL; -#define NX_FSW_RX_STALL_THRES 10 /* seconds */ +#define NX_FSW_RX_STALL_THRES 0 /* seconds (0 = disable) */ static uint32_t fsw_rx_stall_thresh = NX_FSW_RX_STALL_THRES; #define NX_FSW_RX_STALL_DEFUNCT 1 /* defunct Rx-stalled channel (0 = disable) */ @@ -187,9 +188,9 @@ SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu, */ #define FSW_IP_REASS_FORCE_OFF 0 #define FSW_IP_REASS_FORCE_ON 1 -#define FSW_IP_REASS_NO_FORCE 2 +#define FSW_IP_REASS_AUTO 2 -uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE; +uint32_t fsw_ip_reass = FSW_IP_REASS_AUTO; static int fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS @@ -202,7 +203,7 @@ fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass), &new_value, &changed); if (error == 0 && changed != 0) { - if (new_value > FSW_IP_REASS_NO_FORCE) { + if (new_value > FSW_IP_REASS_AUTO) { return EINVAL; } fsw_ip_reass = new_value; @@ -261,6 +262,7 @@ static void fsw_purge_cache(struct nx_flowswitch *, boolean_t); static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t); static uint32_t fsw_process_deferred(struct nx_flowswitch *); static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *); +static void fsw_process_rxstrc(struct nx_flowswitch *); static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *, struct __kern_packet *); @@ -274,10 +276,10 @@ static int __fsw_dp_inited = 0; int fsw_dp_init(void) { - _CASSERT(FSW_VP_DEV == 0); - _CASSERT(FSW_VP_HOST == 1); - _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN); - _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT); + static_assert(FSW_VP_DEV == 0); + static_assert(FSW_VP_HOST == 1); + static_assert((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN); + static_assert((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT); ASSERT(!__fsw_dp_inited); @@ -310,7 +312,7 @@ dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq) uint32_t _len = KPKTQ_LEN(pktq); \ if (KPKTQ_EMPTY(pktq)) { \ ASSERT(_len == 0); \ - return; \ + break; \ } \ SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \ FSW_STATS_ADD(FSW_STATS_DROP, _len); \ @@ -332,6 +334,18 @@ dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq) dp_free_pktq(fsw, pktq); \ } while (0) +#define dp_drop_pkt_single_nofree(fsw, pkt, 
outgoing, _reason, _flags) do { \ + SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet"); \ + FSW_STATS_ADD(FSW_STATS_DROP, 1); \ + if (__probable(droptap_total_tap_count == 0)) { \ + break; \ + } \ + drop_func_t dropfunc; \ + dropfunc = (outgoing) ? droptap_output_packet : droptap_input_packet; \ + dropfunc(SK_PKT2PH(pkt), _reason, __func__, __LINE__, _flags, \ + fsw->fsw_ifp, (pkt)->pkt_qum.qum_pid, NULL, -1, NULL, 0, 0); \ +} while (0) + #define dp_drop_pkt_single(fsw, pkt, outgoing, _reason, _flags) do { \ SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop 1 packet"); \ FSW_STATS_ADD(FSW_STATS_DROP, 1); \ @@ -541,11 +555,11 @@ copy_packet_from_dev_log(struct __kern_packet *spkt, MD_BUFLET_ADDR_ABS(dpkt, daddr); pkt_len = __packet_get_real_data_length(dpkt); SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u", - sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length, + sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length, dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom, (uint32_t)dpkt->pkt_l2_len); SK_DF(logflags | SK_VERB_DUMP, "%s", - sk_dump("buf", daddr, pkt_len, 128, NULL, 0)); + sk_dump("buf", daddr, pkt_len, 128)); } #else #define copy_packet_from_dev_log(...) @@ -640,7 +654,7 @@ copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt, if (spkt->pkt_pflags & PKT_F_MBUF_DATA) { ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf); - mbuf_free(spkt->pkt_mbuf); + mbuf_freem(spkt->pkt_mbuf); KPKT_CLEAR_MBUF_DATA(spkt); } else { fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph); @@ -816,11 +830,10 @@ top: SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]); SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP, - "%s %s %s \"%s\" fe 0x%llx", + "%s %s %s \"%s\" fe %p", input ? "Rx" : "Tx", if_name(fsw->fsw_ifp), - sk_proc_name_address(current_proc()), - fk_as_string(&key, fkbuf, sizeof(fkbuf)), - SK_KVA(fe)); + sk_proc_name(current_proc()), + fk2str(&key, fkbuf, sizeof(fkbuf)), SK_KVA(fe)); return fe; } @@ -1003,9 +1016,15 @@ rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW, "Rx flow torn down"); flow_entry_release(&fe); - fe = NULL; + return NULL; } + if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) { + FSW_STATS_INC(FSW_STATS_RX_DISABLED); + SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW, + "Rx not allowed for this flow"); + flow_entry_release(&fe); + } return fe; } @@ -1554,25 +1573,25 @@ convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq, } void -fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail, - uint32_t cnt, uint32_t bytes) +fsw_host_sendup(struct ifnet *ifp, struct mbufq *host_mq) { struct ifnet_stat_increment_param s; + if (mbufq_empty(host_mq)) { + return; + } + bzero(&s, sizeof(s)); - s.packets_in = cnt; - s.bytes_in = bytes; - dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL); + s.packets_in = host_mq->count; + s.bytes_in = host_mq->bytes; + dlil_input_handler(ifp, mbufq_first(host_mq), mbufq_last(host_mq), &s, FALSE, NULL); } void -fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq) +fsw_host_rx_cb(struct nx_flowswitch *fsw, struct pktq *pktq) { - struct mbuf *__single m_head = NULL, *__single m_tail = NULL; - uint32_t cnt = 0, bytes = 0; ifnet_fsw_rx_cb_t __single cb; void *__single cb_arg; - boolean_t compat; ASSERT(!KPKTQ_EMPTY(pktq)); if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) { @@ -1587,6 +1606,19 @@ fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq) struct pktq *, pktq); } } +} + +void 
+fsw_host_rx_enqueue_mbq(struct nx_flowswitch *fsw, struct pktq *pktq, + struct mbufq *host_mq) +{ + struct mbuf *__single m_head = NULL, *__single m_tail = NULL; + uint32_t cnt = 0, bytes = 0; + boolean_t compat; + + if (KPKTQ_EMPTY(pktq)) { + return; + } /* All packets in the pktq must have the same type */ compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0); @@ -1601,7 +1633,8 @@ fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq) DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw); return; } - fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes); + + mbufq_enqueue(host_mq, m_head, m_tail, cnt, bytes); } void @@ -1615,7 +1648,7 @@ fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw, * a dequeue. */ if (r->ckr_rx_dequeue_ts >= r->ckr_rx_enqueue_ts) { - r->ckr_rx_enqueue_ts = _net_uptime; + r->ckr_rx_enqueue_ts = net_uptime(); } FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq)); dp_drop_pktq(fsw, pktq, 0, DROP_REASON_RX_DST_RING_FULL, __LINE__, @@ -1690,8 +1723,8 @@ flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx) if (__improbable(KR_DROP(r))) { FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE); - SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %0xllx %s drop mode", - r->ckr_name, SK_KVA(r)); + SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %p %s drop mode", + SK_KVA(r), r->ckr_name); return NULL; } @@ -1755,7 +1788,7 @@ dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) /* if flow was (or is going to be) marked as nonviable, drop it */ if (__improbable(fe->fe_want_nonviable || (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) { - SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable", + SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow %p non-viable", SK_KVA(fe)); return false; } @@ -1778,7 +1811,8 @@ dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) void dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, - struct pktq *rx_pkts, uint32_t rx_bytes, uint32_t flags) + struct pktq *rx_pkts, uint32_t rx_bytes, struct mbufq *host_mq, + uint32_t flags) { #pragma unused(flags) struct pktq dpkts; /* dst pool alloc'ed packets */ @@ -1815,7 +1849,8 @@ dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, * the enqueue path below. This path should only be hit * for the rare tcp fragmentation case. 
*/ - fsw_host_rx(fsw, rx_pkts); + + fsw_host_rx_enqueue_mbq(fsw, rx_pkts, host_mq); return; } @@ -1842,7 +1877,7 @@ dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, ASSERT(KPKTQ_EMPTY(&dpkts)); KPKTQ_CONCAT(&dropped_pkts, rx_pkts); FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); - SK_ERR("failed to alloc %u pkts for kr %s, 0x%llu", n_pkts, + SK_ERR("failed to alloc %u pkts for kr %s, %p", n_pkts, r->ckr_name, SK_KVA(r)); reason = DROP_REASON_FSW_PP_ALLOC_FAILED; line = __LINE__; @@ -1862,8 +1897,8 @@ dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, if (__improbable(buf_cnt == 0)) { KPKTQ_CONCAT(&dropped_pkts, rx_pkts); FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); - SK_ERR("failed to alloc %d buflets (err %d) for kr %s, " - "0x%llu", cnt, err, r->ckr_name, SK_KVA(r)); + SK_ERR("failed to alloc %d buflets (err %d) for kr %s %p", + cnt, err, r->ckr_name, SK_KVA(r)); reason = DROP_REASON_FSW_PP_ALLOC_FAILED; line = __LINE__; goto done; @@ -1938,7 +1973,7 @@ dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, dpkt = NULL; SK_ERR("failed to alloc %d " "buflets (err %d) for " - "kr %s, 0x%llu", cnt, err, + "kr %s, %p", cnt, err, r->ckr_name, SK_KVA(r)); break; } @@ -2016,7 +2051,7 @@ done: static inline void rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, - struct flow_entry_list *fes) + struct flow_entry_list *fes, struct mbufq *host_mq) { struct pktq rx_pkts; uint32_t rx_bytes; @@ -2044,7 +2079,7 @@ rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d", KPKTQ_LEN(&rx_pkts), fe, fe->fe_nx_port); /* flow related processing (default, agg, fpd, etc.) */ - fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, rx_proc_flags); + fe->fe_rx_process(fsw, fe, &rx_pkts, rx_bytes, host_mq, rx_proc_flags); } ASSERT(KPKTQ_EMPTY(&rx_pkts)); @@ -2053,31 +2088,64 @@ rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, } } +static void +dp_rx_process_low_power_wake(struct nx_flowswitch *fsw, struct flow_entry *fe) +{ + if (fe->fe_port_reservation == NULL || (fe->fe_flags & FLOWENTF_EXTRL_PORT) != 0) { + return; + } + if (fe->fe_key.fk_proto == IPPROTO_TCP && (fe->fe_flags & FLOWENTF_CONNECTION_IDLE)) { + os_log(wake_packet_log_handle, "dp_rx_process_low_power_wake LPW TCP connection idle"); + + if (flow_track_tcp_want_abort(fe)) { + os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY | FLOWENTF_WAIT_CLOSE, relaxed); + fe->fe_want_withdraw = 1; + flow_track_abort_tcp(fe, NULL, NULL); + } + } else { + if_exit_lpw(fsw->fsw_ifp, "dp_rx_process_low_power_wake LPW connection not idle"); + } +} + static inline void -dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) +dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct flow_entry *fe, struct __kern_packet *pkt) { /* * We only care about wake packets of flows that belong the flow switch * as wake packets for the host stack are handled by the host input * function */ -#if (DEBUG || DEVELOPMENT) - if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) { - /* - * This is a one shot command - */ - fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT; - pkt->pkt_pflags |= PKT_F_WAKE_PKT; +#if (DEBUG || DEVELOPMENT) + /* For testing only */ + if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) { + if (check_wake_pkt(fsw->fsw_ifp, pkt) == true) { + /* + * This is a one shot command + */ + fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT; + + pkt->pkt_pflags |= 
PKT_F_WAKE_PKT; + } } #endif /* (DEBUG || DEVELOPMENT) */ + if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) { if_ports_used_match_pkt(fsw->fsw_ifp, pkt); + + /* + * When a packet is received in LPW mode for an idle TCP connection, the connection + * is aborted immediately with a RST so the peer drops the connection at once + */ + if (if_is_lpw_enabled(fsw->fsw_ifp)) { + pkt->pkt_pflags |= __PKT_F_LPW; + dp_rx_process_low_power_wake(fsw, fe); + } } } static void -_fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq) +_fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq) { struct __kern_packet *__single pkt, *__single tpkt; struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); @@ -2088,10 +2156,15 @@ _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq) uint16_t line = 0; int err; uint64_t thread_id; + struct mbufq host_mq; + struct ifnet *ifp; + mbufq_init(&host_mq); KPKTQ_INIT(&host_pkts); KPKTQ_INIT(&dropped_pkts); + FSW_RLOCK(fsw); + if (__improbable(FSW_QUIESCED(fsw))) { DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw); KPKTQ_CONCAT(&dropped_pkts, pktq); @@ -2106,6 +2179,7 @@ _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq) goto done; } + ifp = fsw->fsw_ifp; thread_id = thread_tid(current_thread()); prev_fe = NULL; KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) { @@ -2149,7 +2223,7 @@ _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq) continue; } - dp_rx_process_wake_packet(fsw, pkt); + dp_rx_process_wake_packet(fsw, fe, pkt); rx_flow_batch_packets(&fes, fe, pkt, thread_id); prev_fe = fe; @@ -2157,16 +2231,20 @@ _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq) struct flow_entry *tfe = NULL; TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) { - rx_flow_process(fsw, fe, &fes); + rx_flow_process(fsw, fe, &fes, &host_mq); flow_entry_release(&fe); } if (!KPKTQ_EMPTY(&host_pkts)) { - fsw_host_rx(fsw, &host_pkts); + fsw_host_rx_cb(fsw, &host_pkts); + fsw_host_rx_enqueue_mbq(fsw, &host_pkts, &host_mq); } done: dp_drop_pktq(fsw, &dropped_pkts, 0, reason, line, 0); + FSW_RUNLOCK(fsw); + + fsw_host_sendup(ifp, &host_mq); } #if (DEVELOPMENT || DEBUG) @@ -2224,9 +2302,7 @@ fsw_rps_thread_cont(void *v, wait_result_t w) sk_protect_t protect; protect = sk_sync_protect(); - FSW_RLOCK(fsw); - _fsw_receive_locked(fsw, &pkts); - FSW_RUNLOCK(fsw); + _fsw_receive(fsw, &pkts); sk_sync_unprotect(protect); lck_mtx_lock(&frt->frt_lock); @@ -2401,13 +2477,13 @@ get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt) void fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq) { - FSW_RLOCK(fsw); #if (DEVELOPMENT || DEBUG) + FSW_RLOCK(fsw); if (fsw->fsw_rps_nthreads != 0) { struct __kern_packet *pkt, *tpkt; bitmap_t map = 0; - _CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1); + static_assert(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1); KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) { uint32_t id = get_rps_id(fsw, pkt); KPKTQ_REMOVE(pktq, pkt); @@ -2418,12 +2494,15 @@ fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq) i = bitmap_next(&map, i)) { fsw_rps_thread_schedule(fsw, i); } + FSW_RUNLOCK(fsw); } else #endif /* !DEVELOPMENT && !DEBUG */ { - _fsw_receive_locked(fsw, pktq); +#if (DEVELOPMENT || DEBUG) + FSW_RUNLOCK(fsw); +#endif /* !DEVELOPMENT && !DEBUG */ + _fsw_receive(fsw, pktq); } - FSW_RUNLOCK(fsw); } int @@ -2651,7 +2730,7 @@ dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp, (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)); } else if (error == ENOBUFS) { SK_DF(logflags, "%s(%d) packet allocation 
failure", - sk_proc_name_address(p), sk_proc_pid(p)); + sk_proc_name(p), sk_proc_pid(p)); } else if (error == 0) { ASSERT(dpkt != NULL); char *daddr; @@ -2660,14 +2739,15 @@ dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp, MD_BUFLET_ADDR_ABS(dpkt, daddr); pkt_len = __packet_get_real_data_length(dpkt); SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)", - sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length, + sk_proc_name(p), sk_proc_pid(p), spkt->pkt_length, dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom, (uint32_t)fsw->fsw_frame_headroom, (uint32_t)ifp->if_tx_headroom); SK_DF(logflags | SK_VERB_DUMP, "%s", - sk_dump("buf", daddr, pkt_len, 128, NULL, 0)); + sk_dump("buf", daddr, pkt_len, 128)); } else { - SK_DF(logflags, "%s(%d) error %d", error); + SK_DF(logflags, "%s(%d) error %d", sk_proc_name(p), + sk_proc_pid(p), error); } } #else @@ -2692,7 +2772,7 @@ fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt) /* Copy AQM metadata */ dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type; dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx; - _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0); + static_assert((offsetof(struct __flow, flow_src_id) % 8) == 0); _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id); _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid); dpkt->pkt_policy_id = spkt->pkt_policy_id; @@ -2770,12 +2850,9 @@ convert_pkt_to_mbuf(struct __kern_packet *pkt) struct mbuf *m = pkt->pkt_mbuf; /* pass additional metadata generated from flow parse/lookup */ - _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) == - sizeof(pkt->pkt_flow_token)); - _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == - sizeof(pkt->pkt_flowsrc_token)); - _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == - sizeof(pkt->pkt_flowsrc_fidx)); + static_assert(sizeof(m->m_pkthdr.pkt_flowid) == sizeof(pkt->pkt_flow_token)); + static_assert(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == sizeof(pkt->pkt_flowsrc_token)); + static_assert(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == sizeof(pkt->pkt_flowsrc_fidx)); m->m_pkthdr.pkt_svc = pkt->pkt_svc_class; m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto; m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token; @@ -2888,7 +2965,7 @@ classq_enqueue_flow_single(struct nx_flowswitch *fsw, } case QP_PACKET: { /* native interface */ /* ifnet_enqueue consumes packet */ - err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop); + err = ifnet_enqueue_pkt(ifp, ifp->if_snd, pkt, false, &pkt_drop); pkt = NULL; #if (DEVELOPMENT || DEBUG) if (__improbable(!pkt_drop)) { @@ -2961,7 +3038,7 @@ classq_enqueue_flow_chain(struct nx_flowswitch *fsw, } case QP_PACKET: { /* native interface */ /* ifnet_enqueue consumes packet */ - err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt, + err = ifnet_enqueue_pkt_chain(ifp, ifp->if_snd, pkt_head, pkt_tail, cnt, bytes, FALSE, &pkt_drop); pkt_head = NULL; #if (DEVELOPMENT || DEBUG) @@ -3174,8 +3251,8 @@ dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe) MD_BUFLET_ADDR_ABS(pkt, addr); SK_ERR("listener flow sends non-RST packet %s", - sk_dump(sk_proc_name_address(current_proc()), - addr, __packet_get_real_data_length(pkt), 128, NULL, 0)); + sk_dump(sk_proc_name(current_proc()), + addr, __packet_get_real_data_length(pkt), 128)); } pp_free_packet_single(pkt); } @@ -3197,15 +3274,15 @@ fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts, * activity on a foreground flow. 
*/ if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) { - ifp->if_fg_sendts = (uint32_t)_net_uptime; + ifp->if_fg_sendts = (uint32_t)net_uptime(); if (fg_ts != NULL) { - *fg_ts = _net_uptime; + *fg_ts = net_uptime(); } } if (pkt->pkt_pflags & PKT_F_REALTIME) { - ifp->if_rt_sendts = (uint32_t)_net_uptime; + ifp->if_rt_sendts = (uint32_t)net_uptime(); if (rt_ts != NULL) { - *rt_ts = _net_uptime; + *rt_ts = net_uptime(); } } } @@ -3347,22 +3424,15 @@ tx_process_continuous_ip_frag(struct nx_flowswitch *fsw, if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) { FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID); - SK_ERR("%s(%d) invalid zero fragment id", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc())); + SK_PERR(current_proc(), "invalid zero fragment id"); return NULL; } - SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, - "%s(%d) continuation frag, id %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), - pkt->pkt_flow_ip_frag_id); + SK_PDF(SK_VERB_FSW_DP | SK_VERB_TX, current_proc(), + "continuation frag, id %u", pkt->pkt_flow_ip_frag_id); if (__improbable(prev_fe == NULL || !prev_fe->fe_tx_is_cont_frag)) { - SK_ERR("%s(%d) unexpected continuation frag", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), + SK_PERR(current_proc(), "unexpected continuation frag %u", pkt->pkt_flow_ip_frag_id); FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); return NULL; @@ -3370,11 +3440,8 @@ tx_process_continuous_ip_frag(struct nx_flowswitch *fsw, if (__improbable(pkt->pkt_flow_ip_frag_id != prev_fe->fe_tx_frag_id)) { FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); - SK_ERR("%s(%d) wrong continuation frag id %u expecting %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), - pkt->pkt_flow_ip_frag_id, - prev_fe->fe_tx_frag_id); + SK_PERR(current_proc(), "wrong continuation frag id %u expecting %u", + pkt->pkt_flow_ip_frag_id, prev_fe->fe_tx_frag_id); return NULL; } @@ -3385,6 +3452,7 @@ static struct flow_entry * tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, struct flow_entry *prev_fe) { + SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); struct flow_entry *__single fe; fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe); @@ -3393,12 +3461,22 @@ tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, } if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) { - SK_RDERR(5, "Tx flow torn down"); + SK_RDERR(5, "Tx flow torn down %s", + fe2str(fe, dbgbuf, sizeof(dbgbuf))); FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN); flow_entry_release(&fe); goto done; } + if (__improbable(fe->fe_flags & FLOWENTF_AOP_OFFLOAD)) { + SK_RDERR(5, "Tx not allowed for this flow"); + SK_RDERR(5, "Tx not allowed for this flow %s", + fe2str(fe, dbgbuf, sizeof(dbgbuf))); + FSW_STATS_INC(FSW_STATS_TX_DISABLED); + flow_entry_release(&fe); + goto done; + } + _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1, null_func); @@ -3406,7 +3484,8 @@ tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, uuid_string_t flow_id_str, pkt_id_str; sk_uuid_unparse(fe->fe_uuid, flow_id_str); sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str); - SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str); + SK_ERR("pkt flow id %s != flow id %s, %s", pkt_id_str, + flow_id_str, fe2str(fe, dbgbuf, sizeof(dbgbuf))); flow_entry_release(&fe); FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID); } @@ -3440,9 +3519,9 @@ dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt) MD_BUFLET_ADDR_ABS(pkt, pkt_buf); pkt_len = __packet_get_real_data_length(pkt); - SK_DF(verb, 
"%s(%d) %s %s", sk_proc_name_address(current_proc()), + SK_DF(verb, "%s(%d) %s %s", sk_proc_name(current_proc()), sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf, pkt_len, - 128, NULL, 0)); + 128)); } #else /* !SK_LOG */ #define dp_tx_log_pkt(...) @@ -3552,8 +3631,16 @@ dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq) ASSERT(pkt != NULL); err = dp_copy_to_dev(fsw, spkt, pkt); if (__improbable(err != 0)) { - dp_drop_pkt_single(fsw, pkt, 1, DROP_REASON_FSW_PKT_COPY_FAILED, - DROPTAP_FLAG_L2_MISSING); + /* + * Copy to dev pool failed, so droptap should capture + * the source pkt because dev pkt might not have metadata + * or buffer filled out yet. Source pkt is freed by + * fsw_user_ring_flush, so defer the free to that. + */ + dp_drop_pkt_single_nofree(fsw, spkt, 1, + DROP_REASON_FSW_PKT_COPY_FAILED, DROPTAP_FLAG_L2_MISSING); + /* Free the dev pool packet */ + pp_free_packet_single(pkt); continue; } @@ -4222,6 +4309,8 @@ fsw_dp_ctor(struct nx_flowswitch *fsw) lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr); lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr); TAILQ_INIT(&fsw->fsw_linger_head); + lck_mtx_init(&fsw->fsw_rxstrc_lock, &nexus_lock_group, &nexus_lock_attr); + TAILQ_INIT(&fsw->fsw_rxstrc_head); fsw_name = tsnprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id); error = nx_advisory_alloc(nx, fsw_name, @@ -4251,7 +4340,7 @@ fsw_dp_ctor(struct nx_flowswitch *fsw) /* this must not fail */ VERIFY(fsw->fsw_reap_thread != NULL); - SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw)); + SK_DF(SK_VERB_MEM, "fsw %p ALLOC", SK_KVA(fsw)); return error; @@ -4313,6 +4402,7 @@ fsw_dp_dtor(struct nx_flowswitch *fsw) /* free any remaining flow entries in the linger list */ fsw_linger_purge(fsw); + fsw_rxstrc_purge(fsw); if (fsw->fsw_flow_mgr != NULL) { flow_mgr_destroy(fsw->fsw_flow_mgr); @@ -4330,9 +4420,7 @@ fsw_linger_insert(struct flow_entry *fe) { struct nx_flowswitch *fsw = fe->fe_fsw; SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); - SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b", - fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), - fe->fe_flags, FLOWENTF_BITS); + SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf))); net_update_uptime(); @@ -4342,7 +4430,7 @@ fsw_linger_insert(struct flow_entry *fe) ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING)); ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE); ASSERT(fe->fe_linger_wait != 0); - fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait); + fe->fe_linger_expire = (net_uptime() + fe->fe_linger_wait); os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed); lck_mtx_lock_spin(&fsw->fsw_linger_lock); @@ -4359,9 +4447,7 @@ fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head, struct flow_entry *fe) { SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); - SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b", - fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), - fe->fe_flags, FLOWENTF_BITS); + SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf))); ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); @@ -4398,11 +4484,79 @@ fsw_linger_purge(struct nx_flowswitch *fsw) lck_mtx_unlock(&fsw->fsw_linger_lock); } +void +fsw_rxstrc_insert(struct flow_entry *fe) +{ + struct nx_flowswitch *fsw = fe->fe_fsw; + SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); + SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf))); + + ASSERT(flow_entry_refcnt(fe) >= 1); + ASSERT(fe->fe_flags & 
FLOWENTF_TORN_DOWN); + ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); + ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD); + ASSERT(!(fe->fe_flags & FLOWENTF_RXSTRC_PENDING)); + os_atomic_or(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed); + + flow_entry_retain(fe); + + lck_mtx_lock_spin(&fsw->fsw_rxstrc_lock); + TAILQ_INSERT_TAIL(&fsw->fsw_rxstrc_head, fe, fe_rxstrc_link); + fsw->fsw_rxstrc_cnt++; + VERIFY(fsw->fsw_rxstrc_cnt != 0); + lck_mtx_unlock(&fsw->fsw_rxstrc_lock); + + fsw_reap_sched(fsw); +} + +static void +fsw_rxstrc_remove_internal(struct flow_entry_rxstrc_head *rxstrc_head, + struct flow_entry *fe) +{ + SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); + SK_DF(SK_VERB_FLOW, "fe \"%s\"", fe2str(fe, dbgbuf, sizeof(dbgbuf))); + + ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); + ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); + ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD); + ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING); + os_atomic_andnot(&fe->fe_flags, FLOWENTF_RXSTRC_PENDING, relaxed); + + TAILQ_REMOVE(rxstrc_head, fe, fe_rxstrc_link); + flow_entry_release(&fe); +} + +static void +fsw_rxstrc_remove(struct flow_entry *fe) +{ + struct nx_flowswitch *fsw = fe->fe_fsw; + + LCK_MTX_ASSERT(&fsw->fsw_rxstrc_lock, LCK_MTX_ASSERT_OWNED); + + fsw_rxstrc_remove_internal(&fsw->fsw_rxstrc_head, fe); + VERIFY(fsw->fsw_rxstrc_cnt != 0); + fsw->fsw_rxstrc_cnt--; +} + +void +fsw_rxstrc_purge(struct nx_flowswitch *fsw) +{ + struct flow_entry *fe, *tfe; + + lck_mtx_lock(&fsw->fsw_rxstrc_lock); + TAILQ_FOREACH_SAFE(fe, &fsw->fsw_rxstrc_head, fe_rxstrc_link, tfe) { + fsw_rxstrc_remove(fe); + } + ASSERT(fsw->fsw_rxstrc_cnt == 0); + ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head)); + lck_mtx_unlock(&fsw->fsw_rxstrc_lock); +} + static void fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw) { struct kern_nexus *nx; - uint64_t now = _net_uptime; + uint64_t now = net_uptime(); nx = fsw->fsw_nx; @@ -4440,7 +4594,7 @@ fsw_defunct_rx_stall_channel(struct nx_flowswitch *fsw) DTRACE_SKYWALK3(rx__stall, struct nx_flowswitch *, fsw, struct nexus_adapter *, na, struct __kern_channel_ring *, ring); FSW_STATS_INC(FSW_STATS_RX_STALL); - SK_ERR("Rx stall detected in proc %s(%llu) (%s): " + SK_ERR("Rx stall detected in proc %s(%d) (%s): " "elapsed %llu (s), now: %llu, enqueue: %llu, dequeue: %llu, " "defunct: %s", ch->ch_name, ch->ch_pid, fsw->fsw_ifp->if_xname, @@ -4521,7 +4675,7 @@ fsw_reap_thread_cont(void *v, wait_result_t wres) uint32_t fr_freed, fr_resid = 0; struct ifnet *ifp = fsw->fsw_ifp; uint64_t i = FSW_REAP_IVAL; - uint64_t now = _net_uptime; + uint64_t now = net_uptime(); uint64_t last; ASSERT(fsw->fsw_ifp != NULL); @@ -4537,13 +4691,18 @@ fsw_reap_thread_cont(void *v, wait_result_t wres) fe_freed = fsw_process_linger(fsw, &fe_aborted); /* - * Pass 3: prune idle flow routes. + * Pass 3: process any pending Rx steering rule cleanup flows + */ + fsw_process_rxstrc(fsw); + + /* + * Pass 4: prune idle flow routes. 
*/ fr_freed = flow_route_prune(fsw->fsw_flow_mgr, ifp, &fr_resid); /* - * Pass 4: prune flow table + * Pass 5: prune flow table * */ cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table); @@ -4633,13 +4792,15 @@ fsw_reap_thread_cont(void *v, wait_result_t wres) fsw->fsw_reap_last = now; } - /* Check for Rx stall condition every NX_FSW_RX_STALL_THRES seconds */ + /* Check for Rx stall condition every fsw_rx_stall_thresh seconds */ last = fsw->fsw_rx_stall_chk_last; - if (last != 0 && (now - last) >= NX_FSW_RX_STALL_THRES) { - fsw_defunct_rx_stall_channel(fsw); - fsw->fsw_rx_stall_chk_last = now; - } else if (__improbable(last == 0)) { - fsw->fsw_rx_stall_chk_last = now; + if (fsw_rx_stall_thresh != 0) { + if (last != 0 && (now - last) >= fsw_rx_stall_thresh) { + fsw_defunct_rx_stall_channel(fsw); + fsw->fsw_rx_stall_chk_last = now; + } else if (__improbable(last == 0)) { + fsw->fsw_rx_stall_chk_last = now; + } } nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t); @@ -4701,12 +4862,28 @@ fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low) /* uncrustify doesn't handle C blocks properly */ /* BEGIN IGNORE CODESTYLE */ nx_port_foreach(nx, ^(nexus_port_t p) { + boolean_t purge; struct nexus_adapter *na = nx_port_get_na(nx, p); - if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) { + + if (na == NULL) { + DTRACE_SKYWALK1(ch__drain__na__null, struct nexus_adapter *, na); return; } - boolean_t purge; + /* + * If NA is deactivated, no need to proceed further with channel drain. + * Note: fsw_vp_na_activate takes FSW_WLOCK before clearing the + * NAF_ACTIVE flag. + */ + if ((na->na_flags & NAF_ACTIVE) == 0) { + DTRACE_SKYWALK1(ch__drain__na__inactive, struct nexus_adapter *, na); + return; + } + + if (na->na_work_ts == 0 || na->na_rx_rings == NULL) { + DTRACE_SKYWALK1(ch__drain__na__invalid, struct nexus_adapter *, na); + return; + } /* * If some activity happened in the last FSW_DRAIN_CH_THRES @@ -4909,7 +5086,7 @@ fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort) struct flow_entry_linger_head linger_head = TAILQ_HEAD_INITIALIZER(linger_head); struct flow_entry *fe, *tfe; - uint64_t now = _net_uptime; + uint64_t now = net_uptime(); uint32_t i = 0, cnt = 0, freed = 0; ASSERT(fsw->fsw_ifp != NULL); @@ -4946,10 +5123,8 @@ fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort) flow_track_abort_tcp(fe, NULL, NULL); (*abort)++; SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); - SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx " - "flags 0x%b [RST]", fe_as_string(fe, dbgbuf, - sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags, - FLOWENTF_BITS); + SK_DF(SK_VERB_FLOW, "fe \"%s\" [RST]", + fe2str(fe, dbgbuf, sizeof(dbgbuf))); } /* @@ -4982,6 +5157,37 @@ fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort) return freed; } +static void +fsw_process_rxstrc(struct nx_flowswitch *fsw) +{ + struct flow_entry_rxstrc_head rxstrc_head = + TAILQ_HEAD_INITIALIZER(rxstrc_head); + struct flow_entry *fe, *tfe; + + /* + * We don't want to contend with the datapath, so move + * everything that's in the rxstrc list into a local list. + * This allows us to cleanup Rx steering rules or free the flow entry + * outside the lock. 
+ */ + lck_mtx_lock(&fsw->fsw_rxstrc_lock); + TAILQ_CONCAT(&rxstrc_head, &fsw->fsw_rxstrc_head, fe_rxstrc_link); + ASSERT(TAILQ_EMPTY(&fsw->fsw_rxstrc_head)); + fsw->fsw_rxstrc_cnt = 0; + lck_mtx_unlock(&fsw->fsw_rxstrc_lock); + + TAILQ_FOREACH_SAFE(fe, &rxstrc_head, fe_rxstrc_link, tfe) { + ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); + ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); + ASSERT(fe->fe_flags & FLOWENTF_RXSTRC_PENDING); + ASSERT(fe->fe_flags & FLOWENTF_AOP_OFFLOAD); + + flow_entry_rx_steering_rule_cleanup(fsw, fe); + fsw_rxstrc_remove_internal(&rxstrc_head, fe); + fe = NULL; + } +} + __attribute__((always_inline)) static inline void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph) diff --git a/bsd/skywalk/nexus/flowswitch/fsw_ethernet.c b/bsd/skywalk/nexus/flowswitch/fsw_ethernet.c index 76ca916d0..8c2bff6f0 100644 --- a/bsd/skywalk/nexus/flowswitch/fsw_ethernet.c +++ b/bsd/skywalk/nexus/flowswitch/fsw_ethernet.c @@ -37,6 +37,7 @@ #include #include #include +#include #define FSW_ETHER_LEN_PADDED 16 #define FSW_ETHER_PADDING (FSW_ETHER_LEN_PADDED - ETHER_HDR_LEN) @@ -83,13 +84,13 @@ fsw_ethernet_ctor(struct nx_flowswitch *fsw, struct flow_route *fr) htons(ETHERTYPE_IP) : htons(ETHERTYPE_IPV6)); /* const override */ - _CASSERT(sizeof(fr->fr_llhdr.flh_off) == sizeof(uint8_t)); - _CASSERT(sizeof(fr->fr_llhdr.flh_len) == sizeof(uint8_t)); + static_assert(sizeof(fr->fr_llhdr.flh_off) == sizeof(uint8_t)); + static_assert(sizeof(fr->fr_llhdr.flh_len) == sizeof(uint8_t)); *(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_off = 2; *(uint8_t *)(uintptr_t)&fr->fr_llhdr.flh_len = ETHER_HDR_LEN; SK_DF(SK_VERB_FLOW_ROUTE, - "fr 0x%llx eth_type 0x%x eth_src %x:%x:%x:%x:%x:%x", + "fr %p eth_type 0x%x eth_src %x:%x:%x:%x:%x:%x", SK_KVA(fr), ntohs(fr->fr_eth.ether_type), fr->fr_eth.ether_shost[0], fr->fr_eth.ether_shost[1], fr->fr_eth.ether_shost[2], fr->fr_eth.ether_shost[3], @@ -355,7 +356,7 @@ fsw_ethernet_resolve(struct nx_flowswitch *fsw, struct flow_route *fr, ETHER_ADDR_LEN) { err = EHOSTUNREACH; SK_ERR("invalid permanent route %s on %s" - "ln 0x%llx (err %d)", + "ln %p (err %d)", sk_sa_ntop(rt_key(tgt_rt), dst_s, sizeof(dst_s)), ifp->if_xname, SK_KVA(ln), err); @@ -415,7 +416,7 @@ fsw_ethernet_resolve(struct nx_flowswitch *fsw, struct flow_route *fr, RT_UNLOCK(tgt_rt); SK_DF(SK_VERB_FLOW_ROUTE, "soliciting for %s on %s" - "ln 0x%llx state %u", sk_sa_ntop(rt_key(tgt_rt), + "ln %p state %u", sk_sa_ntop(rt_key(tgt_rt), dst_s, sizeof(dst_s)), ifp->if_xname, SK_KVA(ln), ln->ln_state); @@ -498,9 +499,9 @@ fsw_ethernet_frame(struct nx_flowswitch *fsw, struct flow_route *fr, bcopy(&fr->fr_eth.ether_shost, &old_shost, ETHER_ADDR_LEN); fsw_ethernet_ctor(fsw, fr); - SK_ERR("fr 0x%llx source MAC address updated on %s, " + SK_ERR("fr %p source MAC address updated on %s, " "was %x:%x:%x:%x:%x:%x now %x:%x:%x:%x:%x:%x", - SK_KVA(fr), fsw->fsw_ifp, + SK_KVA(fr), if_name(fsw->fsw_ifp), old_shost[0], old_shost[1], old_shost[2], old_shost[3], old_shost[4], old_shost[5], @@ -509,7 +510,7 @@ fsw_ethernet_frame(struct nx_flowswitch *fsw, struct flow_route *fr, fr->fr_eth.ether_shost[4], fr->fr_eth.ether_shost[5]); } - _CASSERT(sizeof(fr->fr_eth_padded) == FSW_ETHER_LEN_PADDED); + static_assert(sizeof(fr->fr_eth_padded) == FSW_ETHER_LEN_PADDED); if ((fr->fr_flags & FLOWRTF_DST_LL_MCAST) != 0) { pkt->pkt_link_flags |= PKT_LINKF_MCAST; diff --git a/bsd/skywalk/nexus/flowswitch/fsw_flow.c b/bsd/skywalk/nexus/flowswitch/fsw_flow.c index 7158e9b43..107094224 100644 --- 
a/bsd/skywalk/nexus/flowswitch/fsw_flow.c +++ b/bsd/skywalk/nexus/flowswitch/fsw_flow.c @@ -29,6 +29,7 @@ #include #include #include +#include static void fsw_flow_route_ctor(void *, struct flow_route *); static int fsw_flow_route_resolve(void *, struct flow_route *, @@ -52,6 +53,7 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) struct proc *p; int pid = req0->nfr_pid; bool low_latency = ((req0->nfr_flags & NXFLOWREQF_LOW_LATENCY) != 0); + struct flow_entry *__single aop_fe = NULL; #if SK_LOG uuid_string_t uuidstr; #endif /* SK_LOG */ @@ -107,7 +109,7 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) if ((*error = msleep(&fob->fob_open_waiters, &fob->fob_lock, (PZERO + 1) | PSPIN, __FUNCTION__, NULL)) == EINTR) { SK_ERR("%s(%d) binding for uuid %s was interrupted", - sk_proc_name_address(p), pid, + sk_proc_name(p), pid, sk_uuid_unparse(req.nfr_flow_uuid, uuidstr)); ASSERT(fob->fob_open_waiters > 0); fob->fob_open_waiters--; @@ -118,7 +120,7 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) } if (__improbable((fob->fob_busy_flags & FOBF_DEAD) != 0)) { SK_ERR("%s(%d) binding for flow_uuid %s aborted due to " - "dead owner", sk_proc_name_address(p), pid, + "dead owner", sk_proc_name(p), pid, sk_uuid_unparse(req.nfr_flow_uuid, uuidstr)); *error = ENXIO; goto done; @@ -162,7 +164,7 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) &nx_port, &nxb, NULL)) != 0) { sk_free_data_sized_by(nxb.nxb_key, nxb.nxb_key_len); SK_ERR("%s(%d) failed to bind flow_uuid %s to a " - "nx_port (err %d)", sk_proc_name_address(p), + "nx_port (err %d)", sk_proc_name(p), pid, sk_uuid_unparse(req.nfr_flow_uuid, uuidstr), *error); nx_port = NEXUS_PORT_ANY; @@ -173,7 +175,7 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) nx_bound = TRUE; SK_DF(SK_VERB_FLOW, "%s(%d) flow_uuid %s associated with " - "ephemeral nx_port %d", sk_proc_name_address(p), + "ephemeral nx_port %d", sk_proc_name(p), pid, sk_uuid_unparse(req.nfr_flow_uuid, uuidstr), (int)nx_port); @@ -186,7 +188,7 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) 0 || fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) { SK_ERR("%s(%d) binding for flow_uuid %s aborted " - "(lost race)", sk_proc_name_address(p), + "(lost race)", sk_proc_name(p), pid, sk_uuid_unparse(req.nfr_flow_uuid, uuidstr)); *error = ENXIO; @@ -217,7 +219,7 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) uuid_generate_random(uuid_key); SK_DF(SK_VERB_FLOW, "%s(%d) flow_uuid %s associated " - "with nx_port %d", sk_proc_name_address(p), + "with nx_port %d", sk_proc_name(p), pid, sk_uuid_unparse(req.nfr_flow_uuid, uuidstr), (int)nx_port); } else { @@ -258,10 +260,8 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) if ((fe = flow_entry_find_by_uuid(fo, req.nfr_flow_uuid)) != NULL) { #if SK_LOG char dbgbuf[FLOWENTRY_DBGBUF_SIZE]; - SK_DSC(p, "flow uuid collision: \"%s\" already exists at " - "fe 0x%llx flags 0x%b %s(%d)", - fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), - fe->fe_flags, FLOWENTF_BITS, fe->fe_proc_name, fe->fe_pid); + SK_PERR(p, "flow uuid collision with fe \"%s\"", + fe2str(fe, dbgbuf, sizeof(dbgbuf))); #endif /* SK_LOG */ *error = EEXIST; flow_entry_release(&fe); @@ -281,20 +281,27 @@ fsw_flow_add(struct nx_flowswitch *fsw, struct nx_flow_req *req0, int *error) *error = flow_mgr_flow_add(nx, fm, fo, fsw->fsw_ifp, &req, 
fsw_flow_route_ctor, fsw_flow_route_resolve, fsw); - if (*error == 0) { - /* replace original request with our (modified) local copy */ - bcopy(&req, req0, sizeof(*req0)); + if (*error == 0) { SK_DF(SK_VERB_FLOW, "%s(%d) flow_uuid %s is now on " - "nx_port %d", sk_proc_name_address(p), pid, + "nx_port %d", sk_proc_name(p), pid, sk_uuid_unparse(req.nfr_flow_uuid, uuidstr), (int)nx_port); + + /* Lookup flow entry for RX steering if needed (before FOB unlock) */ + if (req.nfr_flags & NXFLOWREQF_AOP_OFFLOAD) { + aop_fe = flow_entry_find_by_uuid(fo, req.nfr_flow_uuid); + ASSERT(aop_fe); + } else { + /* replace original request with our (modified) local copy */ + bcopy(&req, req0, sizeof(*req0)); + } } done: if (__improbable(*error != 0)) { SK_ERR("%s(%d) failed to add flow_uuid %s (err %d)", - sk_proc_name_address(p), pid, + sk_proc_name(p), pid, sk_uuid_unparse(req.nfr_flow_uuid, uuidstr), *error); if (fo != NULL) { if (new_mapping) { @@ -327,9 +334,44 @@ done: } FOB_UNLOCK(fob); + /* Configure RX flow steering if flow was added successfully and AOP offload is requested */ + if (aop_fe != NULL) { + int rx_steering_err = flow_entry_add_rx_steering_rule(fsw, aop_fe); + if (rx_steering_err != 0) { + SK_ERR("%s(%d) failed to add RX steering rule for " + "flow_uuid %s (err %d)", sk_proc_name(p), pid, + sk_uuid_unparse(req.nfr_flow_uuid, uuidstr), + rx_steering_err); + flow_entry_release(&aop_fe); + aop_fe = NULL; + /* Clean up the flow since RX steering failed */ + fsw_flow_del(fsw, &req, true, NULL); + /* + * Release flow stats reference count for the additional reference + * that would be passed back to NECP client in successful flow creation. + * Since flow creation succeeded and stats were assigned to the request + * at flow_entry_alloc(), but we're now cleaning up the flow due to + * RX steering failure, we must release this reference as the caller + * should not receive flow stats for a flow that was cleaned up. + */ + if (req.nfr_flow_stats != NULL) { + flow_stats_release(req.nfr_flow_stats); + req.nfr_flow_stats = NULL; + } + *error = rx_steering_err; + fo = NULL; + } else { + /* replace original request with our (modified) local copy */ + bcopy(&req, req0, sizeof(*req0)); + flow_entry_release(&aop_fe); + aop_fe = NULL; + } + } + unbusy: proc_rele(p); p = PROC_NULL; + ASSERT(aop_fe == NULL); /* allow any pending detach to proceed */ fsw_detach_barrier_remove(fsw); @@ -469,22 +511,46 @@ fsw_flow_config(struct nx_flowswitch *fsw, struct nx_flow_req *req) goto done; } - /* right now only support NXFLOWREQF_NOWAKEFROMSLEEP config */ nt = fe->fe_port_reservation; - if (req->nfr_flags & NXFLOWREQF_NOWAKEFROMSLEEP) { - os_atomic_or(&fe->fe_flags, FLOWENTF_NOWAKEFROMSLEEP, relaxed); - netns_change_flags(&nt, NETNS_NOWAKEFROMSLEEP, 0); - } else { - os_atomic_andnot(&fe->fe_flags, FLOWENTF_NOWAKEFROMSLEEP, relaxed); - netns_change_flags(&nt, 0, NETNS_NOWAKEFROMSLEEP); - } -#if SK_LOG - char dbgbuf[FLOWENTRY_DBGBUF_SIZE]; - SK_DF(SK_VERB_FLOW, "%s: NOWAKEFROMSLEEP %d", - fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), - req->nfr_flags & NXFLOWREQF_NOWAKEFROMSLEEP ? 
1 : 0); -#endif /* SK_LOG */ + /* + * First handle the idle/reused connection flags + * + * Note: That we expect either connection idle/reused to be set or + * no wake from sleep to be set/cleared + */ + if (req->nfr_flags & (NXFLOWREQF_CONNECTION_IDLE | NXFLOWREQF_CONNECTION_REUSED)) { + if (req->nfr_flags & NXFLOWREQF_CONNECTION_IDLE) { + os_atomic_or(&fe->fe_flags, FLOWENTF_CONNECTION_IDLE, relaxed); + netns_change_flags(&nt, NETNS_CONNECTION_IDLE, 0); + } + if (req->nfr_flags & NXFLOWREQF_CONNECTION_REUSED) { + os_atomic_andnot(&fe->fe_flags, FLOWENTF_CONNECTION_IDLE, relaxed); + netns_change_flags(&nt, 0, NETNS_CONNECTION_IDLE); + } +#if SK_LOG + char dbgbuf[256]; + SK_DF(SK_VERB_FLOW, "%s: CONNECTION_IDLE %d CONNECTION_REUSE %d", + fe2str(fe, dbgbuf, sizeof(dbgbuf)), + req->nfr_flags & NXFLOWREQF_CONNECTION_IDLE ? 1 : 0, + req->nfr_flags & NXFLOWREQF_CONNECTION_REUSED ? 1 : 0); +#endif /* SK_LOG */ + } else { + /* right now only support NXFLOWREQF_NOWAKEFROMSLEEP config */ + if (req->nfr_flags & NXFLOWREQF_NOWAKEFROMSLEEP) { + os_atomic_or(&fe->fe_flags, FLOWENTF_NOWAKEFROMSLEEP, relaxed); + netns_change_flags(&nt, NETNS_NOWAKEFROMSLEEP, 0); + } else { + os_atomic_andnot(&fe->fe_flags, FLOWENTF_NOWAKEFROMSLEEP, relaxed); + netns_change_flags(&nt, 0, NETNS_NOWAKEFROMSLEEP); + } +#if SK_LOG + char dbgbuf[FLOWENTRY_DBGBUF_SIZE]; + SK_DF(SK_VERB_FLOW, "%s: NOWAKEFROMSLEEP %d", + fe2str(fe, dbgbuf, sizeof(dbgbuf)), + req->nfr_flags & NXFLOWREQF_NOWAKEFROMSLEEP ? 1 : 0); +#endif /* SK_LOG */ + } done: if (fe != NULL) { flow_entry_release(&fe); diff --git a/bsd/skywalk/nexus/flowswitch/fsw_ip_frag.c b/bsd/skywalk/nexus/flowswitch/fsw_ip_frag.c index 77621dfa1..498bb39a3 100644 --- a/bsd/skywalk/nexus/flowswitch/fsw_ip_frag.c +++ b/bsd/skywalk/nexus/flowswitch/fsw_ip_frag.c @@ -73,6 +73,7 @@ #include #include #include +#include #define IPFM_MAX_FRAGS_PER_QUEUE 128 /* RFC 791 64K/(512 min MTU) */ #define IPFM_MAX_QUEUES 1024 /* same as ip/ip6 */ @@ -179,6 +180,8 @@ static void ipf_free_pkt(struct ipf *f); static void ipfq_drain(struct fsw_ip_frag_mgr *mgr); static void ipfq_reap(struct fsw_ip_frag_mgr *mgr); static int ipfq_drain_sysctl SYSCTL_HANDLER_ARGS; +static struct mbuf *ipf_pkt2mbuf(struct fsw_ip_frag_mgr *, struct __kern_packet *); +static void ipf_icmp6_error_flag(struct mbuf *, int, int, int, int); void ipf_icmp_param_err(struct fsw_ip_frag_mgr *, struct __kern_packet *pkt, int param); void ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *, struct ipf *f); @@ -376,6 +379,27 @@ fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt, key.ipfk_len = IPFK_LEN_V6; key.ipfk_ident = ip6f->ip6f_ident; + /* + * https://tools.ietf.org/html/rfc8200#page-20 + * If the first fragment does not include all headers through an + * Upper-Layer header, then that fragment should be discarded and + * an ICMP Parameter Problem, Code 3, message should be sent to + * the source of the fragment, with the Pointer field set to zero. 
+ */ + if (fragoff == 0) { + struct __kern_packet *p = *pkt; + struct mbuf *m = ipf_pkt2mbuf(mgr, p); + if (__probable(m != NULL)) { + if (!ip6_pkt_has_ulp(m)) { + ipf_icmp6_error_flag(m, ICMP6_PARAM_PROB, + ICMP6_PARAMPROB_FIRSTFRAG_INCOMP_HDR, 0, 0); + return EINVAL; + } else { + mbuf_freem(m); + } + } + } + err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen, fragflag, nfrags, tlen); diff --git a/bsd/skywalk/nexus/flowswitch/fsw_netagent.c b/bsd/skywalk/nexus/flowswitch/fsw_netagent.c index 7d96ee01b..4b36db1dd 100644 --- a/bsd/skywalk/nexus/flowswitch/fsw_netagent.c +++ b/bsd/skywalk/nexus/flowswitch/fsw_netagent.c @@ -142,6 +142,9 @@ fsw_netagent_flow_add(struct nx_flowswitch *fsw, uuid_t flow_uuid, pid_t pid, if (cparams->reuse_port) { req.nfr_flags |= NXFLOWREQF_REUSEPORT; } + if (cparams->use_aop_offload) { + req.nfr_flags |= NXFLOWREQF_AOP_OFFLOAD; + } req.nfr_context = context; req.nfr_pid = pid; @@ -157,7 +160,7 @@ fsw_netagent_flow_add(struct nx_flowswitch *fsw, uuid_t flow_uuid, pid_t pid, goto done; } - _CASSERT(sizeof(struct necp_demux_pattern) == sizeof(struct flow_demux_pattern)); + static_assert(sizeof(struct necp_demux_pattern) == sizeof(struct flow_demux_pattern)); for (int i = 0; i < cparams->demux_pattern_count; i++) { memcpy(&req.nfr_flow_demux_patterns[i], &cparams->demux_patterns[i], sizeof(struct flow_demux_pattern)); @@ -193,7 +196,7 @@ fsw_netagent_flow_add(struct nx_flowswitch *fsw, uuid_t flow_uuid, pid_t pid, necp_create_nexus_assign_message(fsw->fsw_nx->nx_uuid, req.nfr_nx_port, fo->fo_key, sizeof(fo->fo_key), &local_endpoint, &remote_endpoint, NULL, req.nfr_flowadv_idx, - req.nfr_flow_stats, &assign_message_length); + req.nfr_flow_stats, req.nfr_flowid, &assign_message_length); if (assign_message != NULL) { req.nfr_flow_stats = NULL; @@ -295,7 +298,7 @@ fsw_netagent_event(u_int8_t event, uuid_t flow_uuid, pid_t pid, void *context, */ error = fsw_netagent_flow_del(fsw, flow_uuid, pid, (event == NETAGENT_EVENT_NEXUS_FLOW_REMOVE), context, - cparams); + cparams->u.close_token); break; default: @@ -312,8 +315,8 @@ fsw_netagent_register(struct nx_flowswitch *fsw, struct ifnet *ifp) struct netagent_nexus_agent agent; int error = 0; - _CASSERT(FLOWADV_IDX_NONE == UINT32_MAX); - _CASSERT(NECP_FLOWADV_IDX_INVALID == FLOWADV_IDX_NONE); + static_assert(FLOWADV_IDX_NONE == UINT32_MAX); + static_assert(NECP_FLOWADV_IDX_INVALID == FLOWADV_IDX_NONE); if (!fsw_netagent) { return 0; diff --git a/bsd/skywalk/nexus/flowswitch/fsw_var.h b/bsd/skywalk/nexus/flowswitch/fsw_var.h index bb3970ae6..fd1375b68 100644 --- a/bsd/skywalk/nexus/flowswitch/fsw_var.h +++ b/bsd/skywalk/nexus/flowswitch/fsw_var.h @@ -152,6 +152,8 @@ extern boolean_t fsw_detach_barrier_add(struct nx_flowswitch *fsw); extern void fsw_detach_barrier_remove(struct nx_flowswitch *fsw); extern void fsw_linger_insert(struct flow_entry *fsw); extern void fsw_linger_purge(struct nx_flowswitch *fsw); +extern void fsw_rxstrc_insert(struct flow_entry *fsw); +extern void fsw_rxstrc_purge(struct nx_flowswitch *fsw); extern void fsw_reap_sched(struct nx_flowswitch *fsw); extern int fsw_dev_input_netem_dequeue(void *handle, @@ -164,7 +166,7 @@ extern void dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, uint32_t flags); extern void dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, struct pktq *rx_pkts, uint32_t rx_bytes, - uint32_t flags); + struct mbufq *host_mq, uint32_t flags); #if (DEVELOPMENT || DEBUG) extern int fsw_rps_set_nthreads(struct nx_flowswitch* fsw, 
uint32_t n); diff --git a/bsd/skywalk/nexus/flowswitch/fsw_vp.c b/bsd/skywalk/nexus/flowswitch/fsw_vp.c index eaf4a233a..3bb55ba02 100644 --- a/bsd/skywalk/nexus/flowswitch/fsw_vp.c +++ b/bsd/skywalk/nexus/flowswitch/fsw_vp.c @@ -78,7 +78,7 @@ fsw_vp_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) ASSERT(na->na_type == NA_FLOWSWITCH_VP); - SK_DF(SK_VERB_FSW, "na \"%s\" (0x%llx) %s", na->na_name, + SK_DF(SK_VERB_FSW, "na \"%s\" (%p) %s", na->na_name, SK_KVA(na), na_activate_mode2str(mode)); /* @@ -96,7 +96,7 @@ fsw_vp_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) ret = fsw_port_na_activate(fsw, vpna, mode); if (ret != 0) { - SK_DF(SK_VERB_FSW, "na \"%s\" (0x%llx) %s err(%d)", + SK_DF(SK_VERB_FSW, "na \"%s\" (%p) %s err(%d)", na->na_name, SK_KVA(na), na_activate_mode2str(mode), ret); if (mode == NA_ACTIVATE_MODE_ON) { os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed); @@ -148,7 +148,7 @@ fsw_vp_na_dtor(struct nexus_adapter *na) SK_LOCK_ASSERT_HELD(); ASSERT(na->na_type == NA_FLOWSWITCH_VP); - SK_DF(SK_VERB_FSW, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na)); + SK_DF(SK_VERB_FSW, "na \"%s\" (%p)", na->na_name, SK_KVA(na)); if (fsw != NULL) { FSW_WLOCK(fsw); @@ -224,16 +224,14 @@ fsw_vp_na_txsync(struct __kern_channel_ring *kring, struct proc *p, kring->ckr_khead = kring->ckr_rhead; kring->ckr_ktail = SLOT_PREV(kring->ckr_rhead, kring->ckr_lim); error = ENODEV; - SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b in drop mode (err %d)", - kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, error); + SK_ERR("kr \"%s\" (%p) krflags 0x%x in drop mode (err %d)", + kring->ckr_name, SK_KVA(kring), kring->ckr_flags, error); } SK_DF(SK_VERB_FSW | SK_VERB_SYNC | SK_VERB_TX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0x%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, - flags); + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0x%x", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags); return error; } @@ -263,11 +261,11 @@ fsw_vp_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, os_atomic_thread_fence(seq_cst); SK_DF(SK_VERB_FSW | SK_VERB_SYNC | SK_VERB_RX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u " - "kh %u (was %u) rh %u flags 0x%x", sk_proc_name_address(p), + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u " + "kh %u (was %u) rh %u flags 0x%x", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, kring->ckr_ring_id, kring->ckr_khead, khead_prev, - kring->ckr_rhead, flags); + kring->ckr_ring_id, kring->ckr_khead, khead_prev, kring->ckr_rhead, + flags); return 0; } @@ -301,7 +299,7 @@ fsw_vp_na_special(struct nexus_adapter *na, struct kern_channel *ch, break; case NXSPEC_CMD_DISCONNECT: - ASSERT(na->na_channels > 0); + ASSERT(na->na_channels == 1); ASSERT(na->na_flags & NAF_SPEC_INIT); os_atomic_andnot(&na->na_flags, NAF_SPEC_INIT, relaxed); @@ -323,7 +321,7 @@ fsw_vp_na_special(struct nexus_adapter *na, struct kern_channel *ch, done: SK_DF(error ? 
SK_VERB_ERROR : SK_VERB_FSW, - "ch 0x%llx na \"%s\" (0x%llx) nx 0x%llx spec_cmd %u (err %d)", + "ch %p na \"%s\" (%p) nx %p spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na), SK_KVA(ch->ch_nexus), spec_cmd, error); @@ -436,10 +434,10 @@ fsw_vp_na_create(struct kern_nexus *nx, struct chreq *chr, struct proc *p, SK_DF(SK_VERB_FSW, "na_name: \"%s\"", na->na_name); SK_DF(SK_VERB_FSW, " UUID: %s", sk_uuid_unparse(na->na_uuid, uuidstr)); - SK_DF(SK_VERB_FSW, " nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_FSW, " nx: %p (\"%s\":\"%s\")", SK_KVA(na->na_nx), NX_DOM(na->na_nx)->nxdom_name, NX_DOM_PROV(na->na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_FSW, " flags: 0x%b", na->na_flags, NAF_BITS); + SK_DF(SK_VERB_FSW, " flags: 0x%x", na->na_flags); SK_DF(SK_VERB_FSW, " stats_type: %u", na->na_stats_type); SK_DF(SK_VERB_FSW, " flowadv_max: %u", na->na_flowadv_max); SK_DF(SK_VERB_FSW, " rings: tx %u rx %u af %u", @@ -474,7 +472,7 @@ fsw_vp_na_alloc(zalloc_flags_t how) { struct nexus_vp_adapter *vpna; - _CASSERT(offsetof(struct nexus_vp_adapter, vpna_up) == 0); + static_assert(offsetof(struct nexus_vp_adapter, vpna_up) == 0); vpna = zalloc_flags(na_vp_zone, how | Z_ZERO); if (vpna) { @@ -490,7 +488,7 @@ fsw_vp_na_free(struct nexus_adapter *na) struct nexus_vp_adapter *__single vpna = (struct nexus_vp_adapter *)(void *)na; ASSERT(vpna->vpna_up.na_refcount == 0); - SK_DF(SK_VERB_MEM, "vpna 0x%llx FREE", SK_KVA(vpna)); + SK_DF(SK_VERB_MEM, "vpna %p FREE", SK_KVA(vpna)); bzero(vpna, sizeof(*vpna)); zfree(na_vp_zone, vpna); } diff --git a/bsd/skywalk/nexus/flowswitch/nx_flowswitch.c b/bsd/skywalk/nexus/flowswitch/nx_flowswitch.c index 3dd67b054..370862fe8 100644 --- a/bsd/skywalk/nexus/flowswitch/nx_flowswitch.c +++ b/bsd/skywalk/nexus/flowswitch/nx_flowswitch.c @@ -100,7 +100,7 @@ static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *, static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t); static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *, struct kern_nexus *, struct kern_channel *, struct chreq *, - struct kern_channel *, struct nxbind *, struct proc *); + struct nxbind *, struct proc *); static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *, struct kern_nexus *, struct kern_channel *); static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *, @@ -290,10 +290,9 @@ nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov, const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj) { #pragma unused(nxdom_prov, nxp) - _CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE); - _CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE); + static_assert(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE); + static_assert(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE); - *(adj->adj_md_subtype) = NEXUS_META_SUBTYPE_PAYLOAD; *(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw); VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX); *(adj->adj_flowadv_max) = sk_max_flows; @@ -419,7 +418,7 @@ nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov, struct skmem_region_params srp[SKMEM_REGIONS]; SK_DF(SK_VERB_FSW, - "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx), + "nx %p (\"%s\":\"%s\") na \"%s\" (%p)", SK_KVA(nx), NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name, SK_KVA(na)); @@ -554,7 +553,7 @@ nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov, done: SK_DF(err ? 
SK_VERB_ERROR: SK_VERB_FSW, - "nexus 0x%llx (%s) cmd %d (err %d)", SK_KVA(nx), + "nexus %p (%s) cmd %d (err %d)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err); return err; } @@ -575,7 +574,7 @@ nx_fsw_prov_nx_ctor(struct kern_nexus *nx) ASSERT(nx->nx_arg == NULL); - SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name); + SK_D("nexus %p (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name); fsw = fsw_alloc(Z_WAITOK); nx->nx_arg = fsw; @@ -589,7 +588,7 @@ nx_fsw_prov_nx_ctor(struct kern_nexus *nx) FSW_WUNLOCK(fsw); - SK_D("create new fsw 0x%llx for nexus 0x%llx", + SK_D("create new fsw %p for nexus %p", SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx)); return 0; @@ -603,7 +602,7 @@ nx_fsw_prov_nx_dtor(struct kern_nexus *nx) SK_LOCK_ASSERT_HELD(); - SK_D("nexus 0x%llx (%s) fsw 0x%llx", SK_KVA(nx), + SK_D("nexus %p (%s) fsw %p", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw)); err = fsw_ctl_detach(nx, current_proc(), NULL); @@ -611,7 +610,7 @@ nx_fsw_prov_nx_dtor(struct kern_nexus *nx) ASSERT(fsw->fsw_dev_ch == NULL); ASSERT(fsw->fsw_host_ch == NULL); - SK_DF(SK_VERB_FSW, "marking fsw 0x%llx as free", SK_KVA(fsw)); + SK_DF(SK_VERB_FSW, "marking fsw %p as free", SK_KVA(fsw)); fsw_free(fsw); nx->nx_arg = NULL; } @@ -678,7 +677,7 @@ nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd, FSW_WUNLOCK(fsw); SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW, - "nx 0x%llx \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx), + "nx %p \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx), nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""), (int)port, first, (last - 1), error); @@ -733,7 +732,7 @@ nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port, FSW_WUNLOCK(fsw); SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW, - "nx 0x%llx \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx), + "nx %p \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx), nx->nx_prov->nxprov_params->nxp_name, (int)port, first, (last - 1), error); @@ -756,7 +755,7 @@ nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port) FSW_WUNLOCK(fsw); SK_DF(error ? 
SK_VERB_ERROR : SK_VERB_FSW, - "nx 0x%llx \"%s\" nx_port %d (err %d)", SK_KVA(nx), + "nx %p \"%s\" nx_port %d (err %d)", SK_KVA(nx), nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error); return error; @@ -765,7 +764,7 @@ nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port) static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, - struct kern_channel *ch0, struct nxbind *nxb, struct proc *p) + struct nxbind *nxb, struct proc *p) { #pragma unused(nxdom_prov) nexus_port_t port = chr->cr_port; @@ -784,12 +783,12 @@ nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, goto done; } - chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH; + chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH; ASSERT(port != NEXUS_PORT_ANY); (void) snprintf(chr->cr_name, sizeof(chr->cr_name), "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port); chr->cr_ring_set = RING_SET_DEFAULT; - err = na_connect(nx, ch, chr, ch0, nxb, p); + err = na_connect(nx, ch, chr, nxb, p); done: return err; @@ -802,9 +801,10 @@ nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov, #pragma unused(nxdom_prov) SK_LOCK_ASSERT_HELD(); - SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch), - SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, - ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); + SK_DF(SK_VERB_FSW, "channel %p -!- nexus %p (%s:\"%s\":%u:%d)", + SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name, + ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, + (int)ch->ch_info->cinfo_ch_ring_id); if (ch->ch_flags & CHANF_KERNEL) { na_disconnect_spec(nx, ch); @@ -862,7 +862,7 @@ nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov, na_defunct(nx, ch, ch->ch_na, locked); } - SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) err %d", + SK_D("%s(%d): ch %p -/- nx %p (%s:\"%s\":%u:%d) err %d", ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, @@ -885,13 +885,13 @@ nx_fsw_na_find_log(const struct chreq *chr, boolean_t create) { uuid_string_t uuidstr; - SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%b pipe_id %u " - "ring_id %d ring_set %u ep_type %u:%u create %u%s", + SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%x pipe_id %u " + "ring_id %d ring_set %u ep_type %u create %u%s", chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), - (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, chr->cr_pipe_id, - (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_real_endpoint, - chr->cr_endpoint, create, (strlcmp(chr->cr_name, - NX_FSW_NAME, sizeof(NX_FSW_NAME)) != 0) ? " (skipped)" : ""); + (int)chr->cr_port, chr->cr_mode, chr->cr_pipe_id, + (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_endpoint, create, + (strlcmp(chr->cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME)) != 0) ? 
+ " (skipped)" : ""); } #endif /* SK_LOG */ @@ -948,7 +948,7 @@ nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch, /* use reference held by nx_fsw_attach_vp above */ *na = &vpna->vpna_up; SK_DF(SK_VERB_FSW, - "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" nx_port %d", + "vpna \"%s\" (%p) refs %u to fsw \"%s\" nx_port %d", (*na)->na_name, SK_KVA(*na), (*na)->na_refcount, cr_name, (int)vpna->vpna_nx_port); } diff --git a/bsd/skywalk/nexus/flowswitch/nx_flowswitch.h b/bsd/skywalk/nexus/flowswitch/nx_flowswitch.h index c2d7f3fc9..352ca4f54 100644 --- a/bsd/skywalk/nexus/flowswitch/nx_flowswitch.h +++ b/bsd/skywalk/nexus/flowswitch/nx_flowswitch.h @@ -320,6 +320,9 @@ struct nx_flowswitch { uint32_t fsw_rps_nthreads; struct fsw_rps_thread *__counted_by(fsw_rps_nthreads)fsw_rps_threads; #endif /* !DEVELOPMENT && !DEBUG */ + decl_lck_mtx_data(, fsw_rxstrc_lock); + struct flow_entry_rxstrc_head fsw_rxstrc_head; + uint32_t fsw_rxstrc_cnt; }; #define NX_FSW_PRIVATE(_nx) ((struct nx_flowswitch *)(_nx)->nx_arg) diff --git a/bsd/skywalk/nexus/kpipe/nx_kernel_pipe.c b/bsd/skywalk/nexus/kpipe/nx_kernel_pipe.c index 5381dc034..21506a3c5 100644 --- a/bsd/skywalk/nexus/kpipe/nx_kernel_pipe.c +++ b/bsd/skywalk/nexus/kpipe/nx_kernel_pipe.c @@ -82,8 +82,8 @@ static int nx_kpipe_dom_bind_port(struct kern_nexus *, nexus_port_t *, struct nxbind *, void *); static int nx_kpipe_dom_unbind_port(struct kern_nexus *, nexus_port_t); static int nx_kpipe_dom_connect(struct kern_nexus_domain_provider *, - struct kern_nexus *, struct kern_channel *, struct chreq *, - struct kern_channel *, struct nxbind *, struct proc *); + struct kern_nexus *, struct kern_channel *, struct chreq *, struct nxbind *, + struct proc *); static void nx_kpipe_dom_disconnect(struct kern_nexus_domain_provider *, struct kern_nexus *, struct kern_channel *); static void nx_kpipe_dom_defunct(struct kern_nexus_domain_provider *, @@ -113,8 +113,8 @@ struct nxdom nx_kpipe_dom_s = { .nxdom_prov_head = STAILQ_HEAD_INITIALIZER(nx_kpipe_dom_s.nxdom_prov_head), .nxdom_type = NEXUS_TYPE_KERNEL_PIPE, - .nxdom_md_type = NEXUS_META_TYPE_QUANTUM, - .nxdom_md_subtype = NEXUS_META_SUBTYPE_PAYLOAD, + .nxdom_md_type = NEXUS_META_TYPE_PACKET, + .nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW, .nxdom_name = "kpipe", .nxdom_ports = { .nb_def = 1, @@ -314,14 +314,13 @@ nx_kpipe_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port) static int nx_kpipe_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, - struct kern_channel *ch0, struct nxbind *nxb, struct proc *p) + struct nxbind *nxb, struct proc *p) { #pragma unused(nxdom_prov) nexus_port_t port = chr->cr_port; int err = 0; - SK_DF(SK_VERB_KERNEL_PIPE, "port %d mode 0x%b", - (int)port, chr->cr_mode, CHMODE_BITS); + SK_DF(SK_VERB_KERNEL_PIPE, "port %d mode 0x%x", port, chr->cr_mode); SK_LOCK_ASSERT_HELD(); @@ -356,12 +355,12 @@ nx_kpipe_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, } chr->cr_ring_set = RING_SET_DEFAULT; - chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_KERNEL_PIPE; + chr->cr_endpoint = CH_ENDPOINT_KERNEL_PIPE; (void) snprintf(chr->cr_name, sizeof(chr->cr_name), "kpipe:%llu:%.*s", nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen, nx->nx_prov->nxprov_params->nxp_name); - err = na_connect(nx, ch, chr, ch0, nxb, p); + err = na_connect(nx, ch, chr, nxb, p); if (err == 0) { /* * Mark the kernel slot descriptor region as busy; this @@ -384,7 +383,7 @@ nx_kpipe_dom_disconnect(struct 
kern_nexus_domain_provider *nxdom_prov, #pragma unused(nxdom_prov, nx) SK_LOCK_ASSERT_HELD(); - SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch), + SK_D("channel %p -!- nexus %p (%s:\"%s\":%u:%d)", SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); @@ -458,7 +457,7 @@ nx_kpipe_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov, na_defunct(nx, ch, ch->ch_na, locked); - SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)", + SK_D("%s(%d): ch %p -/- nx %p (%s:\"%s\":%u:%d)", ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); @@ -509,7 +508,7 @@ nx_kpipe_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov, int err = 0; SK_DF(SK_VERB_KERNEL_PIPE, - "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx), + "nx %p (\"%s\":\"%s\") na \"%s\" (%p)", SK_KVA(nx), NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name, SK_KVA(na)); @@ -583,7 +582,7 @@ na_kpipe_alloc(zalloc_flags_t how) { struct nexus_kpipe_adapter *kna; - _CASSERT(offsetof(struct nexus_kpipe_adapter, kna_up) == 0); + static_assert(offsetof(struct nexus_kpipe_adapter, kna_up) == 0); kna = zalloc_flags(na_kpipe_zone, how | Z_ZERO); if (kna) { @@ -599,7 +598,7 @@ na_kpipe_free(struct nexus_adapter *na) struct nexus_kpipe_adapter *kna = (struct nexus_kpipe_adapter *)na; ASSERT(kna->kna_up.na_refcount == 0); - SK_DF(SK_VERB_MEM, "kna 0x%llx FREE", SK_KVA(kna)); + SK_DF(SK_VERB_MEM, "kna %p FREE", SK_KVA(kna)); bzero(kna, sizeof(*kna)); zfree(na_kpipe_zone, kna); } @@ -610,10 +609,9 @@ nx_kpipe_na_txsync(struct __kern_channel_ring *kring, struct proc *p, { #pragma unused(p) SK_DF(SK_VERB_KERNEL_PIPE | SK_VERB_SYNC | SK_VERB_TX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, - flags); + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0%x", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags); return nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM)); } @@ -624,9 +622,9 @@ nx_kpipe_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, { #pragma unused(p) SK_DF(SK_VERB_KERNEL_PIPE | SK_VERB_SYNC | SK_VERB_RX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0%x", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags); ASSERT(kring->ckr_rhead <= kring->ckr_lim); @@ -639,7 +637,7 @@ nx_kpipe_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) { ASSERT(na->na_type == NA_KERNEL_PIPE); - SK_DF(SK_VERB_KERNEL_PIPE, "na \"%s\" (0x%llx) %s", na->na_name, + SK_DF(SK_VERB_KERNEL_PIPE, "na \"%s\" (%p) %s", na->na_name, SK_KVA(na), na_activate_mode2str(mode)); switch (mode) { @@ -708,12 +706,11 @@ nx_kpipe_na_find(struct kern_nexus *nx, struct kern_channel *ch, #if SK_LOG uuid_string_t uuidstr; - SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u " - "ring_id %d ring_set %u ep_type %u:%u create %u%s", + SK_PDF(SK_VERB_KERNEL_PIPE, p, "name \"%s\" spec_uuid \"%s\" port %d " + "mode 0x%x pipe_id %u ring_id %d ring_set %u 
ep_type %u create %u%s", chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), - (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, - chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set, - chr->cr_real_endpoint, chr->cr_endpoint, create, + (int)chr->cr_port, chr->cr_mode, chr->cr_pipe_id, + (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_endpoint, create, (ep != CH_ENDPOINT_KERNEL_PIPE) ? " (skipped)" : ""); #endif /* SK_LOG */ @@ -779,15 +776,14 @@ nx_kpipe_na_find(struct kern_nexus *nx, struct kern_channel *ch, skmem_arena_nexus(na->na_arena)->arn_flowadv_obj != NULL); #if SK_LOG - SK_DF(SK_VERB_KERNEL_PIPE, "created kpipe adapter 0x%llx", SK_KVA(kna)); + SK_DF(SK_VERB_KERNEL_PIPE, "created kpipe adapter %p", SK_KVA(kna)); SK_DF(SK_VERB_KERNEL_PIPE, "na_name: \"%s\"", na->na_name); SK_DF(SK_VERB_KERNEL_PIPE, " UUID: %s", sk_uuid_unparse(na->na_uuid, uuidstr)); - SK_DF(SK_VERB_KERNEL_PIPE, " nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_KERNEL_PIPE, " nx: %p (\"%s\":\"%s\")", SK_KVA(na->na_nx), NX_DOM(na->na_nx)->nxdom_name, NX_DOM_PROV(na->na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_KERNEL_PIPE, " flags: 0x%b", - na->na_flags, NAF_BITS); + SK_DF(SK_VERB_KERNEL_PIPE, " flags: 0x%x", na->na_flags); SK_DF(SK_VERB_KERNEL_PIPE, " flowadv_max: %u", na->na_flowadv_max); SK_DF(SK_VERB_KERNEL_PIPE, " rings: tx %u rx %u", na_get_nrings(na, NR_TX), diff --git a/bsd/skywalk/nexus/kpipe/nx_kpipe_loopback.c b/bsd/skywalk/nexus/kpipe/nx_kpipe_loopback.c index 601cb982e..b442977e4 100644 --- a/bsd/skywalk/nexus/kpipe/nx_kpipe_loopback.c +++ b/bsd/skywalk/nexus/kpipe/nx_kpipe_loopback.c @@ -127,7 +127,7 @@ kplo_pre_connect(kern_nexus_provider_t nxprov, KPLO_VERIFY_CTX(kplo_nx_ctx, kern_nexus_get_context(nexus)); *ch_ctx = KPLO_GENERATE_CTX(channel); - SK_DF(SK_VERB_KERNEL_PIPE, "nx_port %u ch 0x%llx ch_ctx 0x%llx", + SK_DF(SK_VERB_KERNEL_PIPE, "nx_port %u ch %p ch_ctx 0x%llx", nexus_port, SK_KVA(channel), (uint64_t)(*ch_ctx)); error = kern_nexus_get_pbufpool(nexus, NULL, NULL); @@ -177,9 +177,9 @@ kplo_connected(kern_nexus_provider_t nxprov, kern_nexus_t nexus, KPLO_VERIFY_CTX(kplo_nx_ctx, kern_nexus_get_context(nexus)); KPLO_VERIFY_CTX(channel, kern_channel_get_context(channel)); - SK_DF(SK_VERB_KERNEL_PIPE, "channel 0x%llx", SK_KVA(channel)); - SK_DF(SK_VERB_KERNEL_PIPE, " RX_ring 0x%llx", SK_KVA(kplo_rxring)); - SK_DF(SK_VERB_KERNEL_PIPE, " TX_ring 0x%llx", SK_KVA(kplo_txring)); + SK_DF(SK_VERB_KERNEL_PIPE, "channel %p", SK_KVA(channel)); + SK_DF(SK_VERB_KERNEL_PIPE, " RX_ring %p", SK_KVA(kplo_rxring)); + SK_DF(SK_VERB_KERNEL_PIPE, " TX_ring %p", SK_KVA(kplo_txring)); KPLO_INJECT_ERROR(3); @@ -194,7 +194,7 @@ kplo_pre_disconnect(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(nxprov) KPLO_VERIFY_CTX(kplo_nx_ctx, kern_nexus_get_context(nexus)); KPLO_VERIFY_CTX(channel, kern_channel_get_context(channel)); - SK_DF(SK_VERB_KERNEL_PIPE, "called for channel 0x%llx", + SK_DF(SK_VERB_KERNEL_PIPE, "called for channel %p", SK_KVA(channel)); } @@ -205,7 +205,7 @@ kplo_disconnected(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(nxprov) KPLO_VERIFY_CTX(kplo_nx_ctx, kern_nexus_get_context(nexus)); KPLO_VERIFY_CTX(channel, kern_channel_get_context(channel)); - SK_DF(SK_VERB_KERNEL_PIPE, "called for channel 0x%llx", + SK_DF(SK_VERB_KERNEL_PIPE, "called for channel %p", SK_KVA(channel)); bzero(&kplo_tx_pp_info, sizeof(kplo_tx_pp_info)); kplo_tx_pp = kplo_rx_pp = NULL; @@ -232,7 +232,7 @@ kplo_ring_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus, } *ring_ctx = 
KPLO_GENERATE_CTX(ring); - SK_DF(SK_VERB_KERNEL_PIPE, "%s_ring 0x%llx ring_ctx 0x%llx, err(%d)", + SK_DF(SK_VERB_KERNEL_PIPE, "%s_ring %p ring_ctx 0x%llx, err(%d)", KPLO_WHICH_RING(ring), SK_KVA(ring), (uint64_t)(*ring_ctx), error); done: @@ -246,7 +246,7 @@ kplo_ring_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus, #pragma unused(nxprov) KPLO_VERIFY_CTX(kplo_nx_ctx, kern_nexus_get_context(nexus)); KPLO_VERIFY_CTX(ring, kern_channel_ring_get_context(ring)); - SK_DF(SK_VERB_KERNEL_PIPE, "%s_ring 0x%llx", + SK_DF(SK_VERB_KERNEL_PIPE, "%s_ring %p", KPLO_WHICH_RING(ring), SK_KVA(ring)); if (ring == kplo_txring) { @@ -276,7 +276,7 @@ kplo_slot_init(kern_nexus_provider_t nxprov, kern_nexus_t nexus, *pslot_ctx = KPLO_GENERATE_CTX(slot); *slot_prop_addr = NULL; SK_DF(SK_VERB_KERNEL_PIPE, - " slot 0x%llx id %u slot_ctx 0x%llx [%u]", + " slot %p id %u slot_ctx %p [%u]", SK_KVA(slot), slot_id, SK_KVA(*pslot_ctx), kplo_drv_slots); lck_mtx_unlock(&kplo_lock); @@ -298,7 +298,7 @@ kplo_slot_fini(kern_nexus_provider_t nxprov, kern_nexus_t nexus, lck_mtx_lock(&kplo_lock); KPLO_VERIFY_CTX(slot, ctx); - SK_DF(SK_VERB_KERNEL_PIPE, " slot 0x%llx id %u [%u]", + SK_DF(SK_VERB_KERNEL_PIPE, " slot %p id %u [%u]", SK_KVA(slot), slot_id, kplo_drv_slots); lck_mtx_unlock(&kplo_lock); } @@ -314,8 +314,8 @@ kplo_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, KPLO_VERIFY_CTX(kplo_nx_ctx, kern_nexus_get_context(nexus)); KPLO_VERIFY_CTX(ring, kern_channel_ring_get_context(ring)); SK_DF(SK_VERB_KERNEL_PIPE | SK_VERB_SYNC | SK_VERB_TX, - "called with ring \"%s\" krflags 0x%b flags 0x%x", - ring->ckr_name, ring->ckr_flags, CKRF_BITS, flags); + "called with ring \"%s\" krflags 0x%x flags 0x%x", + ring->ckr_name, ring->ckr_flags, flags); VERIFY(ring == kplo_txring); kern_channel_ring_t txkring = kplo_txring; @@ -333,7 +333,7 @@ kplo_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, KPLO_INJECT_ERROR(8); SK_DF(SK_VERB_KERNEL_PIPE | SK_VERB_SYNC | SK_VERB_TX, - "0x%llx: %s %x -> %s", SK_KVA(txkring), txkring->ckr_name, + "%p: %s %x -> %s", SK_KVA(txkring), txkring->ckr_name, flags, rxkring->ckr_name); SK_DF(SK_VERB_KERNEL_PIPE | SK_VERB_SYNC | SK_VERB_TX, "tx before: kh %3u kt %3u | h %3u t %3u", @@ -396,7 +396,7 @@ kplo_sync_tx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, if (kplo_dump_buf) { SK_DF(SK_VERB_KERNEL_PIPE | SK_VERB_DUMP, "%s", - sk_dump("buf", baddr + doff, dlen, 128, NULL, 0)); + sk_dump("buf", baddr + doff, dlen, 128)); } VERIFY(kern_buflet_set_data_offset(buf, 0) == 0); @@ -482,8 +482,8 @@ kplo_sync_rx(kern_nexus_provider_t nxprov, kern_nexus_t nexus, kern_channel_ring_t rxkring = ring; SK_DF(SK_VERB_KERNEL_PIPE | SK_VERB_SYNC | SK_VERB_RX, - "called with ring \"%s\" krflags 0x%b flags 0x%x", - ring->ckr_name, ring->ckr_flags, CKRF_BITS, flags); + "called with ring \"%s\" krflags 0x%x flags 0x%x", + ring->ckr_name, ring->ckr_flags, flags); KPLO_INJECT_ERROR(10); diff --git a/bsd/skywalk/nexus/monitor/Makefile b/bsd/skywalk/nexus/monitor/Makefile deleted file mode 100644 index b6f2b7aad..000000000 --- a/bsd/skywalk/nexus/monitor/Makefile +++ /dev/null @@ -1,46 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS = \ - -EXPINC_SUBDIRS = \ - -# Installs header file for user level - -# $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders -# 
$(DSTROOT)/usr/include/ -DATAFILES= \ - -# Installs header file for kernel extensions - -# $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers -# $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders -KERNELFILES= \ - -# Installs header file for Apple internal use in user level - -# $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders -PRIVATE_DATAFILES = \ - -# Installs header file for Apple internal use for kernel extensions - -# $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders -PRIVATE_KERNELFILES = \ - -INSTALL_MI_LIST = ${DATAFILES} - -INSTALL_MI_DIR = skywalk - -EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} - -EXPORT_MI_DIR = ${INSTALL_MI_DIR} - -# /System/Library/Frameworks/System.framework/PrivateHeaders -INSTALL_SF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} - -# /System/Library/Frameworks/Kernel.framework/PrivateHeaders -INSTALL_KF_MI_LCL_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/bsd/skywalk/nexus/monitor/nx_monitor.c b/bsd/skywalk/nexus/monitor/nx_monitor.c deleted file mode 100644 index 5be38f351..000000000 --- a/bsd/skywalk/nexus/monitor/nx_monitor.c +++ /dev/null @@ -1,1712 +0,0 @@ -/* - * Copyright (c) 2015-2021 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - * - * Monitors - * - * netmap monitors can be used to do monitoring of network traffic - * on another adapter, when the latter adapter is working in netmap mode. - * - * Monitors offer to userspace the same interface as any other netmap port, - * with as many pairs of netmap rings as the monitored adapter. - * However, only the rx rings are actually used. Each monitor rx ring receives - * the traffic transiting on both the tx and rx corresponding rings in the - * monitored adapter. During registration, the user can choose if she wants - * to intercept tx only, rx only, or both tx and rx traffic. - * - * If the monitor is not able to cope with the stream of frames, excess traffic - * will be dropped. - * - * If the monitored adapter leaves netmap mode, the monitor has to be restarted. - * - * Monitors can be either zero-copy or copy-based. - * - * Copy monitors see the frames before they are consumed: - * - * - For tx traffic, this is when the application sends them, before they are - * passed down to the adapter. - * - * - For rx traffic, this is when they are received by the adapter, before - * they are sent up to the application, if any (note that, if no - * application is reading from a monitored ring, the ring will eventually - * fill up and traffic will stop). - * - * Zero-copy monitors only see the frames after they have been consumed: - * - * - For tx traffic, this is after the slots containing the frames have been - * marked as free. Note that this may happen at a considerably delay after - * frame transmission, since freeing of slots is often done lazily. - * - * - For rx traffic, this is after the consumer on the monitored adapter - * has released them. In most cases, the consumer is a userspace - * application which may have modified the frame contents. - * - * Several copy monitors may be active on any ring. Zero-copy monitors, - * instead, need exclusive access to each of the monitored rings. This may - * change in the future, if we implement zero-copy monitor chaining. 
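/*
 * Illustrative sketch only: the overview above says the monitor's owner
 * chooses at registration time whether to intercept tx traffic, rx traffic,
 * or both, and whether to run zero-copy.  The MON_* values below are made-up
 * placeholders standing in for the CHMODE_MONITOR_TX / CHMODE_MONITOR_RX /
 * CHMODE_MONITOR_NO_COPY flags used further down in this file; this is not
 * the kernel's channel-request code.
 */
#include <stdbool.h>
#include <stdint.h>

#define MON_TX      0x1u  /* observe the monitored adapter's tx rings */
#define MON_RX      0x2u  /* observe the monitored adapter's rx rings */
#define MON_NO_COPY 0x4u  /* zero-copy: see frames only after they are consumed */

static uint32_t
mon_mode(bool want_tx, bool want_rx, bool zero_copy)
{
	uint32_t mode = 0;

	if (want_tx) {
		mode |= MON_TX;
	}
	if (want_rx) {
		mode |= MON_RX;
	}
	if (zero_copy) {
		mode |= MON_NO_COPY;  /* needs exclusive access to the monitored rings */
	}
	return mode;
}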
- * - */ - -#include -#include - -static int nx_mon_na_txsync(struct __kern_channel_ring *, struct proc *, - uint32_t); -static int nx_mon_na_rxsync(struct __kern_channel_ring *, struct proc *, - uint32_t); -static int nx_mon_na_krings_create(struct nexus_adapter *, - struct kern_channel *); -static void nx_mon_na_krings_delete(struct nexus_adapter *, - struct kern_channel *, boolean_t); -static uint32_t nx_mon_txrx2chmode(enum txrx); -static int nx_mon_kr_alloc(struct __kern_channel_ring *, uint32_t); -static void nx_mon_kr_dealloc(struct __kern_channel_ring *); -static int nx_mon_na_krings_locks(struct nexus_adapter *, - uint32_t[NR_TXRX], uint32_t[NR_TXRX]); -static void nx_mon_na_krings_unlock(struct nexus_adapter *, - const uint32_t[NR_TXRX], const uint32_t[NR_TXRX]); -static int nx_mon_enable(struct nexus_adapter *, int); -static void nx_mon_disable(struct nexus_adapter *); -static int nx_mon_add(struct __kern_channel_ring *, - struct __kern_channel_ring *, boolean_t); -static void nx_mon_del(struct __kern_channel_ring *, - struct __kern_channel_ring *, boolean_t); -static int nx_mon_na_activate_common(struct nexus_adapter *, - na_activate_mode_t, boolean_t); -static pkt_copy_from_pkt_t nx_mon_quantum_copy_64x; - -static int nx_mon_zcopy_parent_sync(struct __kern_channel_ring *, - struct proc *, uint32_t, enum txrx); -static int nx_mon_zcopy_na_activate(struct nexus_adapter *, na_activate_mode_t); -static void nx_mon_zcopy_na_dtor(struct nexus_adapter *); - -static void nx_mon_parent_sync(struct __kern_channel_ring *, struct proc *, - slot_idx_t, int); -static int nx_mon_na_activate(struct nexus_adapter *, na_activate_mode_t); -static void nx_mon_na_dtor(struct nexus_adapter *); - -/* - * monitors work by replacing the nm_sync() and possibly the - * nm_notify() callbacks in the monitored rings. 
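/*
 * Illustrative sketch of the interception pattern the comment above
 * describes: the first monitor saves the ring's original sync callback and
 * installs a wrapper; the last monitor to detach restores it.  The types and
 * helpers here are simplified stand-ins, not the kernel's
 * __kern_channel_ring or nx_mon_add()/nx_mon_del().
 */
#include <stddef.h>

struct ring;
typedef int (*sync_fn_t)(struct ring *);

struct ring {
	sync_fn_t sync;        /* callback the data path invokes */
	sync_fn_t saved_sync;  /* original callback, kept for restoration */
	unsigned  n_monitors;  /* number of attached monitors */
};

static int
monitor_sync_wrapper(struct ring *r)
{
	/* a real monitor would copy or swap the newly released slots here */
	return r->saved_sync(r);
}

static void
monitor_attach(struct ring *r)
{
	if (r->n_monitors++ == 0) {
		r->saved_sync = r->sync;          /* first monitor: intercept */
		r->sync = monitor_sync_wrapper;
	}
}

static void
monitor_detach(struct ring *r)
{
	if (--r->n_monitors == 0) {
		r->sync = r->saved_sync;          /* last monitor: restore */
		r->saved_sync = NULL;
	}
}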
- */ -static int nx_mon_zcopy_parent_txsync(struct __kern_channel_ring *, - struct proc *, uint32_t); -static int nx_mon_zcopy_parent_rxsync(struct __kern_channel_ring *, - struct proc *, uint32_t); -static int nx_mon_parent_txsync(struct __kern_channel_ring *, - struct proc *, uint32_t); -static int nx_mon_parent_rxsync(struct __kern_channel_ring *, - struct proc *, uint32_t); -static int nx_mon_parent_notify(struct __kern_channel_ring *, - struct proc *, uint32_t); - -static void nx_mon_dom_init(struct nxdom *); -static void nx_mon_dom_terminate(struct nxdom *); -static void nx_mon_dom_fini(struct nxdom *); -static int nx_mon_dom_bind_port(struct kern_nexus *, nexus_port_t *, - struct nxbind *, void *); -static int nx_mon_dom_unbind_port(struct kern_nexus *, nexus_port_t); -static int nx_mon_dom_connect(struct kern_nexus_domain_provider *, - struct kern_nexus *, struct kern_channel *, struct chreq *, - struct kern_channel *, struct nxbind *, struct proc *); -static void nx_mon_dom_disconnect(struct kern_nexus_domain_provider *, - struct kern_nexus *, struct kern_channel *); -static void nx_mon_dom_defunct(struct kern_nexus_domain_provider *, - struct kern_nexus *, struct kern_channel *, struct proc *); -static void nx_mon_dom_defunct_finalize(struct kern_nexus_domain_provider *, - struct kern_nexus *, struct kern_channel *, boolean_t); - -static int nx_mon_prov_init(struct kern_nexus_domain_provider *); -static int nx_mon_prov_params_adjust(const struct kern_nexus_domain_provider *, - const struct nxprov_params *, struct nxprov_adjusted_params *); -static int nx_mon_prov_params(struct kern_nexus_domain_provider *, - const uint32_t, const struct nxprov_params *, struct nxprov_params *, - struct skmem_region_params[SKMEM_REGIONS], uint32_t); -static int nx_mon_prov_mem_new(struct kern_nexus_domain_provider *, - struct kern_nexus *, struct nexus_adapter *); -static void nx_mon_prov_fini(struct kern_nexus_domain_provider *); - -static struct nexus_monitor_adapter *na_mon_alloc(zalloc_flags_t); -static void na_mon_free(struct nexus_adapter *); - -struct nxdom nx_monitor_dom_s = { - .nxdom_prov_head = - STAILQ_HEAD_INITIALIZER(nx_monitor_dom_s.nxdom_prov_head), - .nxdom_type = NEXUS_TYPE_MONITOR, - .nxdom_md_type = NEXUS_META_TYPE_QUANTUM, - .nxdom_md_subtype = NEXUS_META_SUBTYPE_PAYLOAD, - .nxdom_name = "monitor", - /* - * The following values don't really matter much, as a monitor - * isn't usable on its own; we just define them as non-zeroes. 
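/*
 * Illustrative sketch of the nb_def/nb_min/nb_max triples used throughout
 * this nxdom table: an unspecified request falls back to the default, and
 * anything else is kept within [min, max].  The helper is hypothetical and
 * does not reproduce nxprov_params_adjust(); it only spells out the range
 * semantics the initializers above rely on.
 */
#include <stdint.h>

struct nb_range {
	uint32_t nb_def;
	uint32_t nb_min;
	uint32_t nb_max;
};

static uint32_t
nb_range_apply(const struct nb_range *r, uint32_t requested)
{
	if (requested == 0) {
		return r->nb_def;      /* nothing requested: use the default */
	}
	if (requested < r->nb_min) {
		return r->nb_min;
	}
	if (requested > r->nb_max) {
		return r->nb_max;
	}
	return requested;
}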
- */ - .nxdom_ports = { - .nb_def = 1, - .nb_min = 1, - .nb_max = 1, - }, - .nxdom_tx_rings = { - .nb_def = 1, - .nb_min = 1, - .nb_max = 1, - }, - .nxdom_rx_rings = { - .nb_def = 1, - .nb_min = 1, - .nb_max = 1, - }, - .nxdom_tx_slots = { - .nb_def = 1, - .nb_min = 1, - .nb_max = 1, - }, - .nxdom_rx_slots = { - .nb_def = 1, - .nb_min = 1, - .nb_max = 1, - }, - .nxdom_buf_size = { - .nb_def = 64, - .nb_min = 64, - .nb_max = 64, - }, - .nxdom_large_buf_size = { - .nb_def = 0, - .nb_min = 0, - .nb_max = 0, - }, - .nxdom_meta_size = { - .nb_def = NX_METADATA_OBJ_MIN_SZ, - .nb_min = NX_METADATA_OBJ_MIN_SZ, - .nb_max = NX_METADATA_USR_MAX_SZ, - }, - .nxdom_stats_size = { - .nb_def = 0, - .nb_min = 0, - .nb_max = NX_STATS_MAX_SZ, - }, - .nxdom_pipes = { - .nb_def = 0, - .nb_min = 0, - .nb_max = 0, - }, - .nxdom_flowadv_max = { - .nb_def = 0, - .nb_min = 0, - .nb_max = NX_FLOWADV_MAX, - }, - .nxdom_nexusadv_size = { - .nb_def = 0, - .nb_min = 0, - .nb_max = NX_NEXUSADV_MAX_SZ, - }, - .nxdom_capabilities = { - .nb_def = NXPCAP_USER_CHANNEL, - .nb_min = NXPCAP_USER_CHANNEL, - .nb_max = NXPCAP_USER_CHANNEL, - }, - .nxdom_qmap = { - .nb_def = NEXUS_QMAP_TYPE_INVALID, - .nb_min = NEXUS_QMAP_TYPE_INVALID, - .nb_max = NEXUS_QMAP_TYPE_INVALID, - }, - .nxdom_max_frags = { - .nb_def = NX_PBUF_FRAGS_DEFAULT, - .nb_min = NX_PBUF_FRAGS_MIN, - .nb_max = NX_PBUF_FRAGS_DEFAULT, - }, - .nxdom_init = nx_mon_dom_init, - .nxdom_terminate = nx_mon_dom_terminate, - .nxdom_fini = nx_mon_dom_fini, - .nxdom_find_port = NULL, - .nxdom_port_is_reserved = NULL, - .nxdom_bind_port = nx_mon_dom_bind_port, - .nxdom_unbind_port = nx_mon_dom_unbind_port, - .nxdom_connect = nx_mon_dom_connect, - .nxdom_disconnect = nx_mon_dom_disconnect, - .nxdom_defunct = nx_mon_dom_defunct, - .nxdom_defunct_finalize = nx_mon_dom_defunct_finalize, -}; - -static struct kern_nexus_domain_provider nx_monitor_prov_s = { - .nxdom_prov_name = NEXUS_PROVIDER_MONITOR, - .nxdom_prov_flags = NXDOMPROVF_DEFAULT, - .nxdom_prov_cb = { - .dp_cb_init = nx_mon_prov_init, - .dp_cb_fini = nx_mon_prov_fini, - .dp_cb_params = nx_mon_prov_params, - .dp_cb_mem_new = nx_mon_prov_mem_new, - .dp_cb_config = NULL, - .dp_cb_nx_ctor = NULL, - .dp_cb_nx_dtor = NULL, - .dp_cb_nx_mem_info = NULL, /* not supported */ - .dp_cb_nx_mib_get = NULL, - }, -}; - -static SKMEM_TYPE_DEFINE(na_mon_zone, struct nexus_monitor_adapter); - -#define SKMEM_TAG_MONITORS "com.apple.skywalk.monitors" -static SKMEM_TAG_DEFINE(skmem_tag_monitors, SKMEM_TAG_MONITORS); - -static void -nx_mon_dom_init(struct nxdom *nxdom) -{ - SK_LOCK_ASSERT_HELD(); - ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED)); - - (void) nxdom_prov_add(nxdom, &nx_monitor_prov_s); -} - -static void -nx_mon_dom_terminate(struct nxdom *nxdom) -{ - struct kern_nexus_domain_provider *nxdom_prov, *tnxdp; - - STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head, - nxdom_prov_link, tnxdp) { - (void) nxdom_prov_del(nxdom_prov); - } -} - -static void -nx_mon_dom_fini(struct nxdom *nxdom) -{ -#pragma unused(nxdom) -} - -__attribute__((noreturn)) -static int -nx_mon_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port, - struct nxbind *nxb, void *info) -{ -#pragma unused(nx, nx_port, nxb, info) - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); -} - -__attribute__((noreturn)) -static int -nx_mon_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port) -{ -#pragma unused(nx, nx_port) - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); -} - -__attribute__((noreturn)) -static int -nx_mon_dom_connect(struct 
kern_nexus_domain_provider *nxdom_prov, - struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, - struct kern_channel *ch0, struct nxbind *nxb, struct proc *p) -{ -#pragma unused(nxdom_prov, nx, ch, chr, ch0, nxb, p) - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); -} - -__attribute__((noreturn)) -static void -nx_mon_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov, - struct kern_nexus *nx, struct kern_channel *ch) -{ -#pragma unused(nxdom_prov, nx, ch) - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); -} - -static void -nx_mon_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov, - struct kern_nexus *nx, struct kern_channel *ch, struct proc *p) -{ -#pragma unused(nxdom_prov, nx, ch, p) -} - -static void -nx_mon_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov, - struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked) -{ -#pragma unused(nxdom_prov, nx, ch, locked) -} - -static int -nx_mon_prov_init(struct kern_nexus_domain_provider *nxdom_prov) -{ -#pragma unused(nxdom_prov) - SK_D("initializing %s", nxdom_prov->nxdom_prov_name); - return 0; -} - -static int -nx_mon_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov, - const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj) -{ -#pragma unused(nxdom_prov, nxp, adj) - - return 0; -} - -static int -nx_mon_prov_params(struct kern_nexus_domain_provider *nxdom_prov, - const uint32_t req, const struct nxprov_params *nxp0, - struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS], - uint32_t pp_region_config_flags) -{ - struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom; - - return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp, - nxdom, nxdom, nxdom, pp_region_config_flags, - nx_mon_prov_params_adjust); -} - -static int -nx_mon_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov, - struct kern_nexus *nx, struct nexus_adapter *na) -{ -#pragma unused(nxdom_prov) - int err = 0; - - SK_DF(SK_VERB_MONITOR, - "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx), - NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name, - SK_KVA(na)); - - ASSERT(na->na_arena == NULL); - ASSERT(NX_USER_CHANNEL_PROV(nx)); - /* - * The underlying nexus adapter uses the same memory allocator - * as the monitored adapter; don't store the pp in the nexus. - * - * This means that clients calling kern_nexus_get_pbufpool() - * will get NULL, but this is fine since we don't expose the - * monitor to external kernel clients. 
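/*
 * Illustrative sketch of the caller-side consequence noted above: because
 * the monitor arena registers no packet buffer pool with the nexus, a kernel
 * client asking for one must tolerate an empty answer.  The types and the
 * accessor below are hypothetical stand-ins, not the real
 * kern_nexus_get_pbufpool() KPI.
 */
#include <errno.h>
#include <stddef.h>

struct pbufpool;                               /* placeholder for the pool type */
struct nexus_view { struct pbufpool *pool; };  /* monitors leave this NULL */

static int
client_get_pool(const struct nexus_view *nx, struct pbufpool **out)
{
	if (nx->pool == NULL) {
		*out = NULL;
		return ENXIO;          /* no pool is exposed for monitor nexuses */
	}
	*out = nx->pool;
	return 0;
}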
- */ - na->na_arena = skmem_arena_create_for_nexus(na, - NX_PROV(nx)->nxprov_region_params, NULL, NULL, FALSE, - FALSE, NULL, &err); - ASSERT(na->na_arena != NULL || err != 0); - - return err; -} - -static void -nx_mon_prov_fini(struct kern_nexus_domain_provider *nxdom_prov) -{ -#pragma unused(nxdom_prov) - SK_D("destroying %s", nxdom_prov->nxdom_prov_name); -} - -static struct nexus_monitor_adapter * -na_mon_alloc(zalloc_flags_t how) -{ - struct nexus_monitor_adapter *mna; - - _CASSERT(offsetof(struct nexus_monitor_adapter, mna_up) == 0); - - mna = zalloc_flags(na_mon_zone, how | Z_ZERO); - if (mna) { - mna->mna_up.na_type = NA_MONITOR; - mna->mna_up.na_free = na_mon_free; - } - return mna; -} - -static void -na_mon_free(struct nexus_adapter *na) -{ - struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na; - - ASSERT(mna->mna_up.na_refcount == 0); - SK_DF(SK_VERB_MEM, "mna 0x%llx FREE", SK_KVA(mna)); - bzero(mna, sizeof(*mna)); - zfree(na_mon_zone, mna); -} - -/* - * Functions common to both kind of monitors. - */ - -/* - * nm_sync callback for the monitor's own tx rings. - * This makes no sense and always returns error - */ -static int -nx_mon_na_txsync(struct __kern_channel_ring *kring, struct proc *p, - uint32_t flags) -{ -#pragma unused(kring, p, flags) - SK_DF(SK_VERB_MONITOR | SK_VERB_SYNC | SK_VERB_TX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, - flags); - return EIO; -} - -/* - * nm_sync callback for the monitor's own rx rings. - * Note that the lock in nx_mon_zcopy_parent_sync only protects - * writers among themselves. Synchronization between writers - * (i.e., nx_mon_zcopy_parent_txsync and nx_mon_zcopy_parent_rxsync) - * and readers (i.e., nx_mon_zcopy_parent_rxsync) relies on memory barriers. - */ -static int -nx_mon_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, - uint32_t flags) -{ -#pragma unused(p, flags) - SK_DF(SK_VERB_MONITOR | SK_VERB_SYNC | SK_VERB_RX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, - flags); - kring->ckr_khead = kring->ckr_rhead; - os_atomic_thread_fence(seq_cst); - return 0; -} - -/* - * na_krings_create callbacks for monitors. - * We could use the default netmap_hw_krings_zmon, but - * we don't need the nx_mbq. - */ -static int -nx_mon_na_krings_create(struct nexus_adapter *na, struct kern_channel *ch) -{ - ASSERT(na->na_type == NA_MONITOR); - return na_rings_mem_setup(na, FALSE, ch); -} - -/* na_krings_delete callback for monitors */ -static void -nx_mon_na_krings_delete(struct nexus_adapter *na, struct kern_channel *ch, - boolean_t defunct) -{ - ASSERT(na->na_type == NA_MONITOR); - na_rings_mem_teardown(na, ch, defunct); -} - -__attribute__((always_inline)) -static inline uint32_t -nx_mon_txrx2chmode(enum txrx t) -{ - return t == NR_RX ? 
CHMODE_MONITOR_RX : CHMODE_MONITOR_TX; -} - -/* allocate the monitors array in the monitored kring */ -static int -nx_mon_kr_alloc(struct __kern_channel_ring *kring, uint32_t n) -{ - struct __kern_channel_ring **nm; - - if (n <= kring->ckr_max_monitors) { - /* we already have more entries that requested */ - return 0; - } - - nm = sk_realloc_type_array(struct __kern_channel_ring *, - kring->ckr_max_monitors, n, kring->ckr_monitors, - Z_WAITOK, skmem_tag_monitors); - if (nm == NULL) { - return ENOMEM; - } - - kring->ckr_monitors = nm; - kring->ckr_max_monitors = n; - - return 0; -} - -/* deallocate the parent array in the parent adapter */ -static void -nx_mon_kr_dealloc(struct __kern_channel_ring *kring) -{ - if (kring->ckr_monitors != NULL) { - if (kring->ckr_n_monitors > 0) { - SK_ERR("freeing not empty monitor array for \"%s\" " - "(%u dangling monitors)!", kring->ckr_name, - kring->ckr_n_monitors); - } - sk_free_type_array(struct __kern_channel_ring *, - kring->ckr_max_monitors, kring->ckr_monitors); - kring->ckr_monitors = NULL; - kring->ckr_max_monitors = 0; - kring->ckr_n_monitors = 0; - } -} - -static int -nx_mon_na_krings_locks(struct nexus_adapter *na, - uint32_t qfirst[NR_TXRX], uint32_t qlast[NR_TXRX]) -{ - struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na; - struct nexus_adapter *pna = mna->mna_pna; - enum txrx t; - int err = 0; - - for_rx_tx(t) { - uint32_t i; - - if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) { - continue; - } - - qfirst[t] = qlast[t] = mna->mna_first[t]; - - /* synchronize with concurrently running nm_sync()s */ - for (i = mna->mna_first[t]; i < mna->mna_last[t]; i++) { - struct __kern_channel_ring *kring; - - /* the parent adapter's kring */ - kring = &NAKR(pna, t)[i]; - kr_stop(kring, KR_LOCKED); - qlast[t] = i + 1; - } - if (err != 0) { - break; - } - } - - return err; -} - -static void -nx_mon_na_krings_unlock(struct nexus_adapter *na, - const uint32_t qfirst[NR_TXRX], const uint32_t qlast[NR_TXRX]) -{ - struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na; - struct nexus_adapter *pna = mna->mna_pna; - enum txrx t; - - for_rx_tx(t) { - uint32_t i; - - if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) { - continue; - } - - /* synchronize with concurrently running nm_sync()s */ - for (i = qfirst[t]; i < qlast[t]; i++) { - struct __kern_channel_ring *kring; - - /* the parent adapter's kring */ - kring = &NAKR(pna, t)[i]; - kr_start(kring); - } - } -} - -static int -nx_mon_enable(struct nexus_adapter *na, boolean_t zcopy) -{ - struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na; - struct nexus_adapter *pna = mna->mna_pna; - struct skmem_arena_nexus *na_arena = skmem_arena_nexus(pna->na_arena); - uint32_t qfirst[NR_TXRX], qlast[NR_TXRX]; - enum txrx t; - int err = 0; - uint32_t i; - - ASSERT(!(na->na_flags & NAF_ACTIVE)); - - bzero(&qfirst, sizeof(qfirst)); - bzero(&qlast, sizeof(qlast)); - - /* - * Acquire the target kring(s). q{first,last}0 represent the - * target ring set. q{first,last} represent the ones that have - * been successfully acquired. In the event the acquisition - * fails, we must release any previously-acquired rings. 
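/*
 * Illustrative sketch of the acquire-or-roll-back pattern described above:
 * take the target rings one by one and, if any acquisition fails, release
 * everything taken so far before reporting the error.  ring_lock() and
 * ring_unlock() are made-up stand-ins for the kring start/stop primitives.
 */
#include <errno.h>
#include <stdbool.h>

#define NRINGS 4

static bool ring_busy[NRINGS];

static bool
ring_lock(int i)
{
	if (ring_busy[i]) {
		return false;
	}
	ring_busy[i] = true;
	return true;
}

static void
ring_unlock(int i)
{
	ring_busy[i] = false;
}

static int
rings_lock_range(int first, int last)
{
	for (int i = first; i < last; i++) {
		if (!ring_lock(i)) {
			while (i-- > first) {
				ring_unlock(i);  /* roll back previously-acquired rings */
			}
			return EBUSY;
		}
	}
	return 0;
}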
- */ - if ((err = nx_mon_na_krings_locks(na, qfirst, qlast)) != 0) { - goto unlock; - } - - ASSERT(na_arena->arn_rx_pp == na_arena->arn_tx_pp); - if (na_arena->arn_rx_pp->pp_max_frags > 1) { - VERIFY(na_arena->arn_rx_pp->pp_md_type == NEXUS_META_TYPE_PACKET); - mna->mna_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt; - } else { - if (na_arena->arn_rx_pp->pp_md_type == NEXUS_META_TYPE_PACKET) { - mna->mna_pkt_copy_from_pkt = pkt_copy_from_pkt; - } else { - mna->mna_pkt_copy_from_pkt = nx_mon_quantum_copy_64x; - } - } - - for_rx_tx(t) { - if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) { - continue; - } - - for (i = qfirst[t]; i < qlast[t]; i++) { - struct __kern_channel_ring *kring, *mkring; - - /* the parent adapter's kring */ - kring = &NAKR(pna, t)[i]; - mkring = &na->na_rx_rings[i]; - err = nx_mon_add(mkring, kring, zcopy); - if (err != 0) { - break; - } - } - if (err != 0) { - break; - } - } - - if (err == 0) { - os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed); - goto unlock; - } - - for_rx_tx(t) { - if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) { - continue; - } - - for (i = qfirst[t]; i < qlast[t]; i++) { - struct __kern_channel_ring *kring, *mkring; - - /* the parent adapter's kring */ - kring = &NAKR(pna, t)[i]; - mkring = &na->na_rx_rings[i]; - nx_mon_del(mkring, kring, FALSE); - } - } - ASSERT(!(na->na_flags & NAF_ACTIVE)); - -unlock: - nx_mon_na_krings_unlock(na, qfirst, qlast); - - SK_DF(err ? SK_VERB_ERROR : SK_VERB_MONITOR, - "%s (0x%llx): mode 0x%x txrings[%u,%u], rxrings[%u,%u] err %d", - na->na_name, SK_KVA(na), mna->mna_mode, qfirst[NR_TX], qlast[NR_TX], - qfirst[NR_RX], qlast[NR_RX], err); - - return err; -} - -static void -nx_mon_disable(struct nexus_adapter *na) -{ - struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na; - struct nexus_adapter *pna = mna->mna_pna; - uint32_t qfirst[NR_TXRX], qlast[NR_TXRX]; - enum txrx t; - int err; - uint32_t i; - - ASSERT(na->na_flags & NAF_ACTIVE); - - bzero(&qfirst, sizeof(qfirst)); - bzero(&qlast, sizeof(qlast)); - - /* blocking kring(s) acquisition; must not fail */ - err = nx_mon_na_krings_locks(na, qfirst, qlast); - ASSERT(err == 0); - mna->mna_pkt_copy_from_pkt = NULL; - for_rx_tx(t) { - if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) { - continue; - } - - for (i = qfirst[t]; i < qlast[t]; i++) { - struct __kern_channel_ring *kring, *mkring; - - kring = &NAKR(pna, t)[i]; - mkring = &na->na_rx_rings[i]; - nx_mon_del(mkring, kring, FALSE); - } - } - os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed); - - nx_mon_na_krings_unlock(na, qfirst, qlast); -} - -/* - * Add the monitor mkring to the list of monitors of kring. 
- * If this is the first monitor, intercept the callbacks - */ -static int -nx_mon_add(struct __kern_channel_ring *mkring, - struct __kern_channel_ring *kring, boolean_t zcopy) -{ - int error; - - /* make sure the monitor array exists and is big enough */ - error = nx_mon_kr_alloc(kring, kring->ckr_n_monitors + 1); - if (error != 0) { - return error; - } - - kring->ckr_monitors[kring->ckr_n_monitors] = mkring; - mkring->ckr_mon_pos = kring->ckr_n_monitors; - kring->ckr_n_monitors++; - if (kring->ckr_n_monitors == 1) { - /* this is the first monitor, intercept callbacks */ - SK_DF(SK_VERB_MONITOR, - "mkr \"%s\" (0x%llx) krflags 0x%b intercept callbacks " - "on kr \"%s\" (0x%llx) krflags 0x%b", mkring->ckr_name, - SK_KVA(mkring), mkring->ckr_flags, CKRF_BITS, - kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS); - kring->ckr_mon_sync = kring->ckr_na_sync; - /* - * zcopy monitors do not override nm_notify(), but - * we save the original one regardless, so that - * nx_mon_del() does not need to know the - * monitor type - */ - kring->ckr_mon_notify = kring->ckr_na_notify; - if (kring->ckr_tx == NR_TX) { - kring->ckr_na_sync = - (zcopy ? nx_mon_zcopy_parent_txsync : - nx_mon_parent_txsync); - } else { - kring->ckr_na_sync = - (zcopy ? nx_mon_zcopy_parent_rxsync : - nx_mon_parent_rxsync); - if (!zcopy) { - /* also intercept notify */ - kring->ckr_na_notify = nx_mon_parent_notify; - kring->ckr_mon_tail = kring->ckr_ktail; - } - } - } else { - SK_DF(SK_VERB_MONITOR, - "mkr \"%s\" (0x%llx) krflags 0x%b already intercept " - "callbacks on kr \"%s\" (0x%llx) krflags 0x%b, " - "%u monitors", mkring->ckr_name, SK_KVA(mkring), - mkring->ckr_flags, CKRF_BITS, kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, - kring->ckr_n_monitors); - } - return 0; -} - -/* - * Remove the monitor mkring from the list of monitors of kring. - * If this is the last monitor, restore the original callbacks - */ -static void -nx_mon_del(struct __kern_channel_ring *mkring, - struct __kern_channel_ring *kring, boolean_t all) -{ - ASSERT(kring->ckr_n_monitors != 0); - if (all) { - kring->ckr_n_monitors = 0; - } else { - kring->ckr_n_monitors--; - if (mkring->ckr_mon_pos != kring->ckr_n_monitors) { - kring->ckr_monitors[mkring->ckr_mon_pos] = - kring->ckr_monitors[kring->ckr_n_monitors]; - kring->ckr_monitors[mkring->ckr_mon_pos]->ckr_mon_pos = - mkring->ckr_mon_pos; - } - kring->ckr_monitors[kring->ckr_n_monitors] = NULL; - } - if (kring->ckr_n_monitors == 0) { - /* - * This was the last monitor, restore callbacks - * and delete monitor array. - */ - SK_DF(SK_VERB_MONITOR, - "restoring sync callback on kr \"%s\" (0x%llx) " - "krflags 0x%b", kring->ckr_name, SK_KVA(kring), - kring->ckr_flags, CKRF_BITS); - kring->ckr_na_sync = kring->ckr_mon_sync; - kring->ckr_mon_sync = NULL; - if (kring->ckr_tx == NR_RX) { - SK_DF(SK_VERB_MONITOR, - "restoring notify callback on kr \"%s\" (0x%llx) " - "krflags 0x%b", kring->ckr_name, SK_KVA(kring), - kring->ckr_flags, CKRF_BITS); - kring->ckr_na_notify = kring->ckr_mon_notify; - kring->ckr_mon_notify = NULL; - } - nx_mon_kr_dealloc(kring); - } else { - SK_DF(SK_VERB_MONITOR, - "NOT restoring callbacks on kr \"%s\" (0x%llx) " - "krflags 0x%b, %u monitors left", kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, - kring->ckr_n_monitors); - } -} - -/* - * This is called when the monitored adapter leaves skywalk mode (see - * na_unbind_channel). We need to notify the monitors that the monitored - * rings are gone. 
We do this by setting their mna->mna_pna to NULL. - * Note that the rings must be stopped when this happens, so no monitor - * ring callback can be active. - */ -void -nx_mon_stop(struct nexus_adapter *na) -{ - enum txrx t; - - SK_LOCK_ASSERT_HELD(); - - /* skip if this adapter has no allocated rings */ - if (na->na_tx_rings == NULL) { - return; - } - - na_disable_all_rings(na); - - for_rx_tx(t) { - uint32_t i; - - for (i = 0; i < na_get_nrings(na, t); i++) { - struct __kern_channel_ring *kring = &NAKR(na, t)[i]; - uint32_t j; - - for (j = 0; j < kring->ckr_n_monitors; j++) { - struct __kern_channel_ring *mkring = - kring->ckr_monitors[j]; - struct nexus_monitor_adapter *mna = - (struct nexus_monitor_adapter *) - KRNA(mkring); - - /* forget about this adapter */ - if (mna->mna_pna != NULL) { - ASSERT(na == mna->mna_pna); - (void) na_release_locked(mna->mna_pna); - mna->mna_pna = NULL; - } - } - - /* - * Remove all monitors and restore callbacks; - * this is important for nexus adapters that - * are linked to one another, e.g. pipe, since - * the callback changes on one adapter affects - * its peer during sync times. - */ - if (kring->ckr_n_monitors > 0) { - nx_mon_del(NULL, kring, TRUE); - } - - ASSERT(kring->ckr_monitors == NULL); - ASSERT(kring->ckr_max_monitors == 0); - ASSERT(kring->ckr_n_monitors == 0); - } - } - - na_enable_all_rings(na); -} - -/* - * Common functions for the na_activate() callbacks of both kind of - * monitors. - */ -static int -nx_mon_na_activate_common(struct nexus_adapter *na, na_activate_mode_t mode, - boolean_t zcopy) -{ - struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na; - struct nexus_adapter *pna = mna->mna_pna; - int err = 0; - - ASSERT(na->na_type == NA_MONITOR); - - SK_DF(SK_VERB_MONITOR, "na \"%s\" (0x%llx) %s zcopy %u", na->na_name, - SK_KVA(na), na_activate_mode2str(mode), zcopy); - - switch (mode) { - case NA_ACTIVATE_MODE_ON: - if (pna == NULL) { - /* parent left skywalk mode, fatal */ - SK_ERR("%s: internal error", na->na_name); - err = ENXIO; - } else { - err = nx_mon_enable(na, zcopy); - } - break; - - case NA_ACTIVATE_MODE_DEFUNCT: - break; - - case NA_ACTIVATE_MODE_OFF: - if (pna == NULL) { - SK_DF(SK_VERB_MONITOR, "%s: parent left skywalk mode, " - "nothing to restore", na->na_name); - } else { - nx_mon_disable(na); - } - break; - - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - } - - return err; -} - -/* - * Functions specific for zero-copy monitors. 
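/*
 * Illustrative sketch of the circular-index arithmetic the sync paths below
 * depend on: ring indices advance modulo the ring size, so a negative
 * difference means the interval wrapped around.  This mirrors the
 * rel_slots/busy computations in nx_mon_zcopy_parent_sync() and
 * nx_mon_parent_sync(); the helper itself is hypothetical.
 */
#include <stdint.h>

static uint32_t
ring_span(uint32_t from, uint32_t to, uint32_t num_slots)
{
	int32_t n = (int32_t)(to - from);

	if (n < 0) {
		n += (int32_t)num_slots;
	}
	return (uint32_t)n;
}

/*
 * Usage as in the code below (ckr_lim is used as the last valid slot index):
 *   rel_slots  = ring_span(beg, end, kring->ckr_num_slots);
 *   busy       = ring_span(mkring->ckr_khead, mkring->ckr_ktail,
 *                          mkring->ckr_num_slots);
 *   free_slots = mkring->ckr_lim - busy;
 */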
- */ - -/* - * Common function for both zero-copy tx and rx nm_sync() - * callbacks - */ -static int -nx_mon_zcopy_parent_sync(struct __kern_channel_ring *kring, struct proc *p, - uint32_t flags, enum txrx tx) -{ - struct __kern_channel_ring *mkring = kring->ckr_monitors[0]; - int rel_slots, free_slots, busy, sent = 0; - slot_idx_t beg, end, i; - const slot_idx_t lim = kring->ckr_lim; - const slot_idx_t mlim; - int error = 0; - - if (mkring == NULL) { - SK_RD(5, "NULL monitor on kr \"%s\" (0x%llx) krflags 0x%b", - kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS); - return 0; - } - - ASSERT(!KR_KERNEL_ONLY(kring)); - ASSERT(!KR_KERNEL_ONLY(mkring)); - - /* deconst */ - *(slot_idx_t *)(uintptr_t)&mlim = mkring->ckr_lim; - - /* get the relased slots (rel_slots) */ - if (tx == NR_TX) { - beg = kring->ckr_ktail; - error = kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags); - if (error) { - return error; - } - end = kring->ckr_ktail; - } else { /* NR_RX */ - beg = kring->ckr_khead; - end = kring->ckr_rhead; - } - - rel_slots = end - beg; - if (rel_slots < 0) { - rel_slots += kring->ckr_num_slots; - } - - if (!rel_slots) { - /* - * No released slots, but we still need - * to call rxsync if this is a rx ring - */ - goto out_rxsync; - } - - /* - * We need to lock the monitor receive ring, since it - * is the target of bot tx and rx traffic from the monitored - * adapter - */ - KR_LOCK(mkring); - /* get the free slots available on the monitor ring */ - i = mkring->ckr_ktail; - busy = i - mkring->ckr_khead; - if (busy < 0) { - busy += mkring->ckr_num_slots; - } - free_slots = mlim - busy; - - if (!free_slots) { - goto out; - } - - /* swap min(free_slots, rel_slots) slots */ - if (free_slots < rel_slots) { - beg += (rel_slots - free_slots); - if (beg >= kring->ckr_num_slots) { - beg -= kring->ckr_num_slots; - } - rel_slots = free_slots; - } - - sent = rel_slots; - for (; rel_slots; rel_slots--) { - /* - * Swap the slots. - * - * XXX: adi@apple.com -- this bypasses the slot attach/detach - * interface, and needs to be changed when monitor adopts the - * packet APIs. SD_SWAP() will perform a block copy of the - * swap, and will readjust the kernel slot descriptor's sd_user - * accordingly. - */ - SD_SWAP(KR_KSD(mkring, i), KR_USD(mkring, i), - KR_KSD(kring, beg), KR_USD(kring, beg)); - - SK_RD(5, "beg %u buf_idx %u", beg, - METADATA_IDX(KR_KSD(kring, beg)->sd_qum)); - - beg = SLOT_NEXT(beg, lim); - i = SLOT_NEXT(i, mlim); - } - os_atomic_thread_fence(seq_cst); - mkring->ckr_ktail = i; - -out: - KR_UNLOCK(mkring); - - if (sent) { - /* notify the new frames to the monitor */ - (void) mkring->ckr_na_notify(mkring, p, 0); - } - -out_rxsync: - if (tx == NR_RX) { - error = kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags); - } - - return error; -} - -/* - * Callback used to replace the ckr_na_sync callback in the monitored tx rings. 
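/*
 * Illustrative sketch of the zero-copy hand-off performed by SD_SWAP() in
 * the sync routine above: rather than copying frame contents, the released
 * slot's descriptor is exchanged with a free descriptor on the monitor ring.
 * The slot type below is a simplified stand-in for the kernel/user slot
 * descriptors.
 */
#include <stdint.h>

struct mon_slot {
	void     *buf;   /* backing buffer */
	uint32_t  len;   /* frame length */
};

static void
mon_slot_swap(struct mon_slot *monitored, struct mon_slot *monitor)
{
	struct mon_slot tmp = *monitored;

	*monitored = *monitor;   /* monitored ring gets the monitor's free slot */
	*monitor   = tmp;        /* monitor ring now owns the released frame */
}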
- */ -static int -nx_mon_zcopy_parent_txsync(struct __kern_channel_ring *kring, struct proc *p, - uint32_t flags) -{ - SK_DF(SK_VERB_MONITOR, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b flags 0x%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, flags); - return nx_mon_zcopy_parent_sync(kring, p, flags, NR_TX); -} - -/* callback used to replace the nm_sync callback in the monitored rx rings */ -static int -nx_mon_zcopy_parent_rxsync(struct __kern_channel_ring *kring, struct proc *p, - uint32_t flags) -{ - SK_DF(SK_VERB_MONITOR, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b flags 0x%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, flags); - return nx_mon_zcopy_parent_sync(kring, p, flags, NR_RX); -} - -static int -nx_mon_zcopy_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) -{ - return nx_mon_na_activate_common(na, mode, TRUE /* zcopy */); -} - -/* na_dtor callback for monitors */ -static void -nx_mon_zcopy_na_dtor(struct nexus_adapter *na) -{ - struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na; - struct nexus_adapter *pna = mna->mna_pna; - - SK_LOCK_ASSERT_HELD(); - ASSERT(na->na_type == NA_MONITOR); - - if (pna != NULL) { - (void) na_release_locked(pna); - mna->mna_pna = NULL; - } -} - -/* - * Functions specific for copy monitors. - */ - -static void -nx_mon_parent_sync(struct __kern_channel_ring *kring, struct proc *p, - slot_idx_t first_new, int new_slots) -{ - nexus_meta_type_t md_type = KRNA(kring)->na_md_type; - uint32_t j; - - for (j = 0; j < kring->ckr_n_monitors; j++) { - struct __kern_channel_ring *mkring = kring->ckr_monitors[j]; - slot_idx_t i, mlim, beg; - int free_slots, busy, sent = 0, m; - const slot_idx_t lim = kring->ckr_lim; - struct nexus_adapter *dst_na = KRNA(mkring); - struct nexus_monitor_adapter *mna = - (struct nexus_monitor_adapter *)dst_na; - uint32_t max_len = mkring->ckr_pp->pp_max_frags * - PP_BUF_SIZE_DEF(mkring->ckr_pp); - - /* - * src and dst adapters must share the same nexus; - * this test is done in nx_monitor_na_find(). This - * covers both buffer and metadata sizes. - */ - - mlim = mkring->ckr_lim; - - /* - * We need to lock the monitor receive ring, since it - * is the target of both tx and rx traffics from the - * monitored adapter. 
- */ - KR_LOCK(mkring); - /* get the free slots available on the monitor ring */ - i = mkring->ckr_ktail; - busy = i - mkring->ckr_khead; - if (busy < 0) { - busy += mkring->ckr_num_slots; - } - free_slots = mlim - busy; - - if (!free_slots) { - goto out; - } - - /* copy min(free_slots, new_slots) slots */ - m = new_slots; - beg = first_new; - if (free_slots < m) { - beg += (m - free_slots); - if (beg >= kring->ckr_num_slots) { - beg -= kring->ckr_num_slots; - } - m = free_slots; - } - - ASSERT(KRNA(mkring)->na_md_type == md_type); - - for (; m; m--) { - struct __kern_slot_desc *src_sd = KR_KSD(kring, beg); - struct __kern_slot_desc *dst_sd = KR_KSD(mkring, i); - struct __kern_packet *spkt, *dpkt; - kern_packet_t sph, dph; - uint32_t copy_len; - - if (!KSD_VALID_METADATA(src_sd)) { - goto skip; - } - - /* retreive packet handles from slot */ - spkt = src_sd->sd_pkt; - sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt), - METADATA_SUBTYPE(spkt)); - dpkt = dst_sd->sd_pkt; - dph = SK_PTR_ENCODE(dpkt, METADATA_TYPE(dpkt), - METADATA_SUBTYPE(dpkt)); - - ASSERT(METADATA_TYPE(spkt) == METADATA_TYPE(dpkt)); - - ASSERT(spkt->pkt_qum.qum_len <= (UINT32_MAX - 63)); - copy_len = spkt->pkt_qum.qum_len; - - /* round to a multiple of 64 */ - copy_len = (copy_len + 63) & ~63; - - if (__improbable(copy_len > max_len)) { - SK_RD(5, "kr \"%s\" -> mkr \"%s\": " - "truncating %u to %u", - kring->ckr_name, mkring->ckr_name, - (uint32_t)copy_len, max_len); - copy_len = max_len; - } - - /* copy buffers */ - mna->mna_pkt_copy_from_pkt(kring->ckr_tx, dph, 0, sph, - 0, copy_len, FALSE, 0, 0, FALSE); - - /* copy the associated meta data */ - _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum); - if (md_type == NEXUS_META_TYPE_PACKET) { - _PKT_COPY(spkt, dpkt); - ASSERT(dpkt->pkt_mbuf == NULL); - } - - ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) || - PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp)); - - sent++; - i = SLOT_NEXT(i, mlim); -skip: - beg = SLOT_NEXT(beg, lim); - } - os_atomic_thread_fence(seq_cst); - mkring->ckr_ktail = i; -out: - KR_UNLOCK(mkring); - - if (sent) { - /* notify the new frames to the monitor */ - (void) mkring->ckr_na_notify(mkring, p, 0); - } - } -} - -/* callback used to replace the nm_sync callback in the monitored tx rings */ -static int -nx_mon_parent_txsync(struct __kern_channel_ring *kring, struct proc *p, - uint32_t flags) -{ - slot_idx_t first_new; - int new_slots; - nexus_type_t nx_type = - kring->ckr_na->na_nxdom_prov->nxdom_prov_dom->nxdom_type; - - /* - * For user pipe nexus, txsync can also be initated from RX process - * context, hence user pipe tx ring should be accessed holding - * ckr_qlock. 
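/*
 * Illustrative recap of the copy-length computation above: the packet length
 * is rounded up to a multiple of 64 (the caller has already asserted that
 * qum_len <= UINT32_MAX - 63, so the round-up cannot overflow) and then
 * truncated to what the destination ring's buffers can hold.  The helper is
 * hypothetical; the constants mirror the code above.
 */
#include <stdint.h>

static uint32_t
mon_copy_len(uint32_t pkt_len, uint32_t max_len)
{
	uint32_t len = (pkt_len + 63u) & ~63u;   /* round up to a 64-byte multiple */

	if (len > max_len) {
		len = max_len;   /* truncate, as logged by the SK_RD(5, ...) above */
	}
	return len;
}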
- */ - if (nx_type == NEXUS_TYPE_USER_PIPE) { - KR_LOCK(kring); - } - - /* get the new slots */ - first_new = kring->ckr_khead; - new_slots = kring->ckr_rhead - first_new; - if (new_slots < 0) { - new_slots += kring->ckr_num_slots; - } - if (new_slots) { - nx_mon_parent_sync(kring, p, first_new, new_slots); - } - - if (nx_type == NEXUS_TYPE_USER_PIPE) { - KR_UNLOCK(kring); - } - - return kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags); -} - -/* callback used to replace the nm_sync callback in the monitored rx rings */ -static int -nx_mon_parent_rxsync(struct __kern_channel_ring *kring, struct proc *p, - uint32_t flags) -{ - slot_idx_t first_new; - int new_slots, error; - - /* get the new slots */ - error = kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags); - if (error) { - return error; - } - first_new = kring->ckr_mon_tail; - new_slots = kring->ckr_ktail - first_new; - if (new_slots < 0) { - new_slots += kring->ckr_num_slots; - } - if (new_slots) { - nx_mon_parent_sync(kring, p, first_new, new_slots); - } - kring->ckr_mon_tail = kring->ckr_ktail; - return 0; -} - -/* - * Callback used to replace the nm_notify() callback in the monitored rx rings - */ -static int -nx_mon_parent_notify(struct __kern_channel_ring *kring, struct proc *p, - uint32_t flags) -{ - int err = 0; - sk_protect_t protect = NULL; - - SK_DF(SK_VERB_MONITOR | SK_VERB_NOTIFY | - ((kring->ckr_tx == NR_TX) ? SK_VERB_TX : SK_VERB_RX), - "kr \"%s\" (0x%llx) krflags 0x%b flags 0x%x", kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, flags); - /* - * ?xsync callbacks have tryget called by their callers, - * but here we have to call it by ourself. If we can't - * acquire the exclusive sync right, skip the sync. - */ - if ((err = kr_enter(kring, FALSE)) == 0) { - protect = sk_sync_protect(); - nx_mon_parent_rxsync(kring, p, NA_SYNCF_FORCE_READ); - sk_sync_unprotect(protect); - kr_exit(kring); - } - /* in all cases (even error), we must invoke notify */ - kring->ckr_mon_notify(kring, p, (NA_NOTEF_MONITOR | flags)); - return err; -} - -static int -nx_mon_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) -{ - return nx_mon_na_activate_common(na, mode, FALSE /* no zcopy */); -} - -static void -nx_mon_na_dtor(struct nexus_adapter *na) -{ - struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na; - struct nexus_adapter *pna = mna->mna_pna; - - SK_LOCK_ASSERT_HELD(); - ASSERT(na->na_type == NA_MONITOR); - - if (pna != NULL) { - (void) na_release_locked(pna); - mna->mna_pna = NULL; - } -} - -/* check if chr is a request for a monitor adapter that we can satisfy */ -int -nx_monitor_na_find(struct kern_nexus *nx, struct kern_channel *ch, - struct chreq *chr, struct kern_channel *ch0, struct nxbind *nxb, - struct proc *p, struct nexus_adapter **na, boolean_t create) -{ -#pragma unused(ch) - boolean_t zcopy = !!(chr->cr_mode & CHMODE_MONITOR_NO_COPY); - struct nexus_adapter *pna = NULL; /* parent adapter */ - struct nexus_monitor_adapter *mna = NULL; - char monsuff[10] = ""; - struct chreq pchr; - uint32_t i; - int error; - enum txrx t; - - SK_LOCK_ASSERT_HELD(); - *na = NULL; - -#if SK_LOG - uuid_string_t uuidstr; - SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u " - "ring_id %d ring_set %u ep_type %u:%u ch0 0x%llx create %u%s", - chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), - (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, - chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set, - chr->cr_real_endpoint, chr->cr_endpoint, SK_KVA(ch0), create, - 
!(chr->cr_mode & CHMODE_MONITOR) ? " (skipped)" : ""); -#endif /* SK_LOG */ - - if (!(chr->cr_mode & CHMODE_MONITOR)) { - return 0; - } - - /* XXX: Don't allow user packet pool mode in monitor for now */ - if (chr->cr_mode & CHMODE_USER_PACKET_POOL) { - SK_ERR("User Packet pool mode not supported for monitor"); - return ENOTSUP; - } - - mna = na_mon_alloc(Z_WAITOK); - - ASSERT(mna->mna_up.na_type == NA_MONITOR); - ASSERT(mna->mna_up.na_free == na_mon_free); - - /* override the ring set since we're monitoring */ - chr->cr_ring_set = RING_SET_ALL; - - if (ch0 != NULL) { - /* - * We've been given the owning channel from ch_open(); - * use this as shortcut since otherwise we'd have to - * find it ourselves. - */ -#if (DEBUG || DEVELOPMENT) - ASSERT(!(ch0->ch_info->cinfo_ch_mode & CHMODE_MONITOR)); - ASSERT(ch0->ch_info->cinfo_nx_port == chr->cr_port); -#endif /* DEBUG || DEVELOPMENT */ - pna = ch0->ch_na; - na_retain_locked(pna); - } else { - /* - * First, try to find the adapter that we want to monitor - * We use the same chr, after we have turned off the monitor - * flags. In this way we can potentially monitor everything - * skywalk understands, except other monitors. - */ - memcpy(&pchr, chr, sizeof(pchr)); - pchr.cr_mode &= ~CHMODE_MONITOR; - error = na_find(ch, nx, &pchr, ch0, nxb, p, &pna, create); - if (error != 0) { - SK_ERR("parent lookup failed: %d", error); - return error; - } - } - ASSERT(pna != NULL); - SK_DF(SK_VERB_MONITOR, - "found parent: \"%s\" (0x%llx)", pna->na_name, SK_KVA(pna)); - - if (!NA_IS_ACTIVE(pna)) { - /* parent not in skywalk mode */ - /* - * XXX we can wait for the parent to enter skywalk mode, - * by intercepting its na_activate() callback (2014-03-16) - */ - SK_ERR("parent \"%s\" (0x%llx) not in skywalk mode", - pna->na_name, SK_KVA(pna)); - error = ENXIO; - goto put_out; - } else if (zcopy && NA_KERNEL_ONLY(pna)) { - /* - * Zero-copy mode requires the parent adapter to be - * created in a non-kernel-only mode. - */ - SK_ERR("parent \"%s\" (0x%llx) is in kernel-only mode", - pna->na_name, SK_KVA(pna)); - error = ENODEV; - goto put_out; - } - - /* grab all the rings we need in the parent */ - mna->mna_pna = pna; - error = na_interp_ringid(pna, chr->cr_ring_id, chr->cr_ring_set, - mna->mna_first, mna->mna_last); - if (error != 0) { - SK_ERR("ring_mode %u ring_id %d error %d", chr->cr_ring_set, - (int)chr->cr_ring_id, error); - goto put_out; - } - if (mna->mna_last[NR_TX] - mna->mna_first[NR_TX] == 1) { - (void) snprintf(monsuff, 10, "-%u", mna->mna_first[NR_TX]); - } - (void) snprintf(mna->mna_up.na_name, sizeof(mna->mna_up.na_name), - "%s%s/%s%s%s", pna->na_name, monsuff, zcopy ? "z" : "", - (chr->cr_mode & CHMODE_MONITOR_TX) ? "r" : "", - (chr->cr_mode & CHMODE_MONITOR_RX) ? "t" : ""); - uuid_generate_random(mna->mna_up.na_uuid); - - /* these don't apply to the monitor adapter */ - *(nexus_stats_type_t *)(uintptr_t)&mna->mna_up.na_stats_type = - NEXUS_STATS_TYPE_INVALID; - *(uint32_t *)(uintptr_t)&mna->mna_up.na_flowadv_max = 0; - - if (zcopy) { - /* - * Zero copy monitors need exclusive access - * to the monitored rings. 
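/* ------------------------------------------------------------------
 * Editorial sketch, not part of the xnu patch: the admission rule that
 * nx_monitor_na_find() enforces in the checks just below -- a zero-copy
 * monitor needs the ring to itself, and a copy monitor may share a ring
 * with other copy monitors but never with a zero-copy one (the real code
 * detects the latter by comparing na_activate against
 * nx_mon_zcopy_na_activate).  The enum and monitor_may_attach() below
 * are hypothetical.
 * ------------------------------------------------------------------ */
#include <assert.h>
#include <stdbool.h>

enum mon_kind { MON_NONE, MON_COPY, MON_ZCOPY };

/* existing: kind of monitor already on the ring (MON_NONE if empty) */
static bool
monitor_may_attach(enum mon_kind existing, enum mon_kind incoming)
{
	if (incoming == MON_ZCOPY)
		return existing == MON_NONE;   /* needs exclusive access */
	/* copy monitors coexist, but never with a zero-copy monitor */
	return existing != MON_ZCOPY;
}

int
main(void)
{
	assert(monitor_may_attach(MON_NONE, MON_ZCOPY));
	assert(!monitor_may_attach(MON_COPY, MON_ZCOPY));   /* EBUSY */
	assert(monitor_may_attach(MON_COPY, MON_COPY));
	assert(!monitor_may_attach(MON_ZCOPY, MON_COPY));   /* EBUSY */
	return 0;
}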
- */ - for_rx_tx(t) { - if (!(chr->cr_mode & nx_mon_txrx2chmode(t))) { - continue; - } - for (i = mna->mna_first[t]; - i < mna->mna_last[t]; i++) { - struct __kern_channel_ring *kring = - &NAKR(pna, t)[i]; - if (kring->ckr_n_monitors > 0) { - error = EBUSY; - SK_ERR("kr \"%s\" already monitored " - "by \"%s\"", kring->ckr_name, - kring->ckr_monitors[0]->ckr_name); - goto put_out; - } - } - } - mna->mna_up.na_activate = nx_mon_zcopy_na_activate; - mna->mna_up.na_dtor = nx_mon_zcopy_na_dtor; - /* - * To have zero copy, we need to use the same memory allocator - * as the monitored port. - */ - mna->mna_up.na_arena = pna->na_arena; - skmem_arena_retain((&mna->mna_up)->na_arena); - os_atomic_or(&mna->mna_up.na_flags, NAF_MEM_LOANED, relaxed); - } else { - /* normal monitors are incompatible with zero copy ones */ - for_rx_tx(t) { - if (!(chr->cr_mode & nx_mon_txrx2chmode(t))) { - continue; - } - for (i = mna->mna_first[t]; - i < mna->mna_last[t]; i++) { - struct __kern_channel_ring *kring = - &NAKR(pna, t)[i]; - if (kring->ckr_n_monitors > 0 && - KRNA(kring->ckr_monitors[0])-> - na_activate == nx_mon_zcopy_na_activate) { - error = EBUSY; - SK_ERR("kr \"%s\" is busy (zcopy)", - kring->ckr_name); - goto put_out; - } - } - } - mna->mna_up.na_activate = nx_mon_na_activate; - mna->mna_up.na_dtor = nx_mon_na_dtor; - /* - * allocate a new (private) allocator instance using the - * parent nexus configuration. - */ - if ((error = nx_monitor_prov_s.nxdom_prov_mem_new( - NX_DOM_PROV(nx), nx, &mna->mna_up)) != 0) { - ASSERT(mna->mna_up.na_arena == NULL); - goto put_out; - } - ASSERT(mna->mna_up.na_arena != NULL); - mna->mna_up.na_rxsync = nx_mon_na_rxsync; - } - *(nexus_meta_type_t *)(uintptr_t)&mna->mna_up.na_md_type = - pna->na_md_type; - *(nexus_meta_subtype_t *)(uintptr_t)&mna->mna_up.na_md_subtype = - pna->na_md_subtype; - - /* a do-nothing txsync: monitors cannot be used to inject packets */ - mna->mna_up.na_txsync = nx_mon_na_txsync; - mna->mna_up.na_rxsync = nx_mon_na_rxsync; - mna->mna_up.na_krings_create = nx_mon_na_krings_create; - mna->mna_up.na_krings_delete = nx_mon_na_krings_delete; - - /* - * We set the number of our na_rx_rings to be - * max(na_num_tx_rings, na_num_rx_rings) in the parent - */ - na_set_nrings(&mna->mna_up, NR_TX, na_get_nrings(pna, NR_TX)); - na_set_nrings(&mna->mna_up, NR_RX, na_get_nrings(pna, NR_RX)); - if (na_get_nrings(pna, NR_TX) > na_get_nrings(pna, NR_RX)) { - na_set_nrings(&mna->mna_up, NR_RX, na_get_nrings(pna, NR_TX)); - } - na_set_nslots(&mna->mna_up, NR_TX, na_get_nslots(pna, NR_TX)); - na_set_nslots(&mna->mna_up, NR_RX, na_get_nslots(pna, NR_RX)); - - na_attach_common(&mna->mna_up, nx, &nx_monitor_prov_s); - - /* remember the traffic directions we have to monitor */ - mna->mna_mode = (chr->cr_mode & CHMODE_MONITOR); - - /* keep the reference to the parent */ - *na = &mna->mna_up; - na_retain_locked(*na); - - /* sanity check: monitor and monitored adapters must share the nexus */ - ASSERT((*na)->na_nx == pna->na_nx); - -#if SK_LOG - SK_DF(SK_VERB_MONITOR, "created monitor adapter 0x%llx", SK_KVA(mna)); - SK_DF(SK_VERB_MONITOR, "na_name: \"%s\"", mna->mna_up.na_name); - SK_DF(SK_VERB_MONITOR, " UUID: %s", - sk_uuid_unparse(mna->mna_up.na_uuid, uuidstr)); - SK_DF(SK_VERB_MONITOR, " nx: 0x%llx (\"%s\":\"%s\")", - SK_KVA(mna->mna_up.na_nx), NX_DOM(mna->mna_up.na_nx)->nxdom_name, - NX_DOM_PROV(mna->mna_up.na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_MONITOR, " flags: 0x%b", - mna->mna_up.na_flags, NAF_BITS); - SK_DF(SK_VERB_MONITOR, " rings: tx %u rx %u", - 
na_get_nrings(&mna->mna_up, NR_TX), - na_get_nrings(&mna->mna_up, NR_RX)); - SK_DF(SK_VERB_MONITOR, " slots: tx %u rx %u", - na_get_nslots(&mna->mna_up, NR_TX), - na_get_nslots(&mna->mna_up, NR_RX)); -#if CONFIG_NEXUS_USER_PIPE - SK_DF(SK_VERB_MONITOR, " next_pipe: %u", mna->mna_up.na_next_pipe); - SK_DF(SK_VERB_MONITOR, " max_pipes: %u", mna->mna_up.na_max_pipes); -#endif /* CONFIG_NEXUS_USER_PIPE */ - SK_DF(SK_VERB_MONITOR, " mna_tx_rings: [%u,%u)", mna->mna_first[NR_TX], - mna->mna_last[NR_TX]); - SK_DF(SK_VERB_MONITOR, " mna_rx_rings: [%u,%u)", mna->mna_first[NR_RX], - mna->mna_last[NR_RX]); - SK_DF(SK_VERB_MONITOR, " mna_mode: %u", mna->mna_mode); -#endif /* SK_LOG */ - - return 0; - -put_out: - if (pna != NULL) { - (void) na_release_locked(pna); - pna = NULL; - } - NA_FREE(&mna->mna_up); - return error; -} - -static void -nx_mon_quantum_copy_64x(const enum txrx t, kern_packet_t dph, - const uint16_t doff, kern_packet_t sph, const uint16_t soff, - const uint32_t len, const boolean_t unused_arg1, - const uint16_t unused_arg2, const uint16_t unused_arg3, - const boolean_t unused_arg4) -{ - /* for function prototype parity with pkt_copy_from_pkt_t */ -#pragma unused(unused_arg1, unused_arg2, unused_arg3, unused_arg4) -#pragma unused(t, doff, soff) - struct __kern_quantum *dqum = SK_PTR_ADDR_KQUM(dph); - struct __kern_quantum *squm = SK_PTR_ADDR_KQUM(sph); - uint8_t *sbuf, *dbuf; - - ASSERT(METADATA_TYPE(squm) == NEXUS_META_TYPE_QUANTUM); - ASSERT(METADATA_TYPE(squm) == METADATA_TYPE(dqum)); - VERIFY(IS_P2ALIGNED(len, 64)); - - MD_BUFLET_ADDR(squm, sbuf); - MD_BUFLET_ADDR(dqum, dbuf); - VERIFY(IS_P2ALIGNED(dbuf, sizeof(uint64_t))); - - if (__probable(IS_P2ALIGNED(sbuf, sizeof(uint64_t)))) { - sk_copy64_64x((uint64_t *)(void *)sbuf, - (uint64_t *)(void *)dbuf, len); - } else { - bcopy(sbuf, dbuf, len); - } - /* - * This copy routine only copies to/from a buflet, so the length - * is guaranteed be <= the size of a buflet. - */ - VERIFY(len <= UINT16_MAX); - METADATA_SET_LEN(dqum, (uint16_t)len, 0); -} diff --git a/bsd/skywalk/nexus/monitor/nx_monitor.h b/bsd/skywalk/nexus/monitor/nx_monitor.h deleted file mode 100644 index 892fd1336..000000000 --- a/bsd/skywalk/nexus/monitor/nx_monitor.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2015-2016 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. - * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _SKYWALK_NEXUS_MONITOR_H_ -#define _SKYWALK_NEXUS_MONITOR_H_ - -#include - -#if CONFIG_NEXUS_MONITOR -struct nexus_monitor_adapter { - /* - * This is an overlay structure on nexus_adapter; - * make sure it contains 'up' as the first member. 
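/* ------------------------------------------------------------------
 * Editorial sketch, not part of the xnu patch: the "overlay" layout the
 * comment above depends on.  Because the base adapter is the first
 * member, a pointer to the derived struct and a pointer to its base
 * share an address, which is what makes the casts and the
 * __container_of()-style recovery used elsewhere in this patch legal.
 * base_adapter and monitor_adapter below are hypothetical stand-ins.
 * ------------------------------------------------------------------ */
#include <assert.h>
#include <stddef.h>

struct base_adapter {
	int refcount;
};

struct monitor_adapter {
	struct base_adapter up;   /* must stay the first member */
	struct base_adapter *parent;
};

int
main(void)
{
	struct monitor_adapter mna = { .up = { .refcount = 1 }, .parent = NULL };
	struct base_adapter *na = &mna.up;

	/* down-cast: recover the containing monitor adapter */
	struct monitor_adapter *back = (struct monitor_adapter *)
	    ((char *)na - offsetof(struct monitor_adapter, up));

	assert(offsetof(struct monitor_adapter, up) == 0);
	assert(back == &mna);
	return 0;
}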
- */ - struct nexus_adapter mna_up; - - struct nexus_adapter *mna_pna; - uint32_t mna_first[NR_TXRX]; - uint32_t mna_last[NR_TXRX]; - uint32_t mna_mode; - pkt_copy_from_pkt_t *mna_pkt_copy_from_pkt; -}; - -#define NEXUS_PROVIDER_MONITOR "com.apple.nexus.monitor" - -extern struct nxdom nx_monitor_dom_s; - -__BEGIN_DECLS -extern int nx_monitor_na_find(struct kern_nexus *, struct kern_channel *, - struct chreq *, struct kern_channel *, struct nxbind *, struct proc *, - struct nexus_adapter **, boolean_t); -extern void nx_mon_stop(struct nexus_adapter *); -__END_DECLS -#endif /* CONFIG_NEXUS_MONITOR */ -#endif /* _SKYWALK_NEXUS_MONITOR_H_ */ diff --git a/bsd/skywalk/nexus/netif/nx_netif.c b/bsd/skywalk/nexus/netif/nx_netif.c index d60124d12..258414ff1 100644 --- a/bsd/skywalk/nexus/netif/nx_netif.c +++ b/bsd/skywalk/nexus/netif/nx_netif.c @@ -84,6 +84,7 @@ #include #include #include +#include #define NX_NETIF_MAXRINGS NX_MAX_NUM_RING_PAIR #define NX_NETIF_MINSLOTS 2 /* XXX same as above */ @@ -122,8 +123,8 @@ static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *, struct nxbind *, void *); static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t); static int nx_netif_dom_connect(struct kern_nexus_domain_provider *, - struct kern_nexus *, struct kern_channel *, struct chreq *, - struct kern_channel *, struct nxbind *, struct proc *); + struct kern_nexus *, struct kern_channel *, struct chreq *, struct nxbind *, + struct proc *); static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *, struct kern_nexus *, struct kern_channel *); static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *, @@ -352,13 +353,13 @@ nx_netif_dom_init(struct nxdom *nxdom) SK_LOCK_ASSERT_HELD(); ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED)); - _CASSERT(NEXUS_PORT_NET_IF_DEV == 0); - _CASSERT(NEXUS_PORT_NET_IF_HOST == 1); - _CASSERT(NEXUS_PORT_NET_IF_CLIENT == 2); - _CASSERT(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE); - _CASSERT(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED); - _CASSERT(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO); - _CASSERT(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX); + static_assert(NEXUS_PORT_NET_IF_DEV == 0); + static_assert(NEXUS_PORT_NET_IF_HOST == 1); + static_assert(NEXUS_PORT_NET_IF_CLIENT == 2); + static_assert(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE); + static_assert(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED); + static_assert(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO); + static_assert(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX); (void) nxdom_prov_add(nxdom, &nx_netif_prov_s); @@ -560,8 +561,7 @@ nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov, PKT_MAX_PROTO_HEADER_SIZE); return EINVAL; } - _CASSERT(sizeof(struct __kern_netif_intf_advisory) == - NX_INTF_ADV_SIZE); + static_assert(sizeof(struct __kern_netif_intf_advisory) == NX_INTF_ADV_SIZE); *(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory); } done: @@ -592,7 +592,7 @@ nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov, boolean_t kernel_only; SK_DF(SK_VERB_NETIF, - "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx), + "nx %p (\"%s\":\"%s\") na \"%s\" (%p)", SK_KVA(nx), NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name, SK_KVA(na)); @@ -781,8 +781,7 @@ nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov, } case NXCFG_CMD_FLOW_ADD: case NXCFG_CMD_FLOW_DEL: { - _CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) == - offsetof(struct 
nx_flow_req, _nfr_common_field_end)); + static_assert(offsetof(struct nx_flow_req, _nfr_kernel_field_end) == offsetof(struct nx_flow_req, _nfr_common_field_end)); struct nx_flow_req nfr; bzero(&nfr, sizeof(nfr)); @@ -810,7 +809,7 @@ nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov, } done: SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF, - "nexus 0x%llx (%s) cmd %d err %d", SK_KVA(nx), + "nexus %p (%s) cmd %d err %d", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err); return err; } @@ -833,7 +832,7 @@ nx_netif_prov_nx_ctor(struct kern_nexus *nx) SK_LOCK_ASSERT_HELD(); ASSERT(nx->nx_arg == NULL); - SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name); + SK_D("nexus %p (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name); nx->nx_arg = nx_netif_alloc(Z_WAITOK); n = NX_NETIF_PRIVATE(nx); @@ -849,7 +848,7 @@ nx_netif_prov_nx_ctor(struct kern_nexus *nx) } } n->nif_nx = nx; - SK_D("create new netif 0x%llx for nexus 0x%llx", + SK_D("create new netif %p for nexus %p", SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx)); return 0; } @@ -861,7 +860,7 @@ nx_netif_prov_nx_dtor(struct kern_nexus *nx) SK_LOCK_ASSERT_HELD(); - SK_D("nexus 0x%llx (%s) netif 0x%llx", SK_KVA(nx), + SK_D("nexus %p (%s) netif %p", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n)); /* @@ -882,7 +881,7 @@ nx_netif_prov_nx_dtor(struct kern_nexus *nx) nxb_free(n->nif_host_nxb); n->nif_host_nxb = NULL; } - SK_DF(SK_VERB_NETIF, "marking netif 0x%llx as free", SK_KVA(n)); + SK_DF(SK_VERB_NETIF, "marking netif %p as free", SK_KVA(n)); nx_netif_free(n); nx->nx_arg = NULL; } @@ -1117,7 +1116,7 @@ nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port, } SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF, - "+++ netif 0x%llx nx_port %d, total %u active %u (err %d)", + "+++ netif %p nx_port %d, total %u active %u (err %d)", SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS, nx->nx_active_ports, error); @@ -1142,7 +1141,7 @@ nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port) static int nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, - struct kern_channel *ch0, struct nxbind *nxb, struct proc *p) + struct nxbind *nxb, struct proc *p) { #pragma unused(nxdom_prov) int err = 0; @@ -1188,7 +1187,7 @@ nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, } chr->cr_ring_set = RING_SET_DEFAULT; - chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_NET_IF; + chr->cr_endpoint = CH_ENDPOINT_NET_IF; (void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s", nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen, nx->nx_prov->nxprov_params->nxp_name); @@ -1196,7 +1195,7 @@ nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, if (ch->ch_flags & CHANF_KERNEL) { err = na_connect_spec(nx, ch, chr, p); } else { - err = na_connect(nx, ch, chr, ch0, nxb, p); + err = na_connect(nx, ch, chr, nxb, p); } if (err == 0) { @@ -1221,7 +1220,7 @@ nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov, #pragma unused(nxdom_prov) SK_LOCK_ASSERT_HELD(); - SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch), + SK_D("channel %p -!- nexus %p (%s:\"%s\":%u:%d)", SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); @@ -1290,7 +1289,7 @@ nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov, ifnet_decr_iorefcnt(ifp); ch->ch_na->na_ifp = NULL; } - 
SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)", + SK_D("%s(%d): ch %p -/- nx %p (%s:\"%s\":%u:%d)", ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); @@ -1307,7 +1306,7 @@ nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov, struct nexus_netif_adapter * na_netif_alloc(zalloc_flags_t how) { - _CASSERT(offsetof(struct nexus_netif_adapter, nifna_up) == 0); + static_assert(offsetof(struct nexus_netif_adapter, nifna_up) == 0); return zalloc_flags(na_netif_zone, how | Z_ZERO); } @@ -1318,7 +1317,7 @@ na_netif_free(struct nexus_adapter *na) struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na; SK_LOCK_ASSERT_HELD(); - SK_DF(SK_VERB_MEM, "nifna 0x%llx FREE", SK_KVA(nifna)); + SK_DF(SK_VERB_MEM, "nifna %p FREE", SK_KVA(nifna)); ASSERT(na->na_refcount == 0); ASSERT(nifna->nifna_tx_mit == NULL); @@ -1406,14 +1405,14 @@ done: if (nsr->nsr_flags & NXSPECREQ_UUID) { nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr); } else if (nsr->nsr_flags & NXSPECREQ_IFP) { - (void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx", + (void) snprintf((char *)uuidstr, sizeof(uuidstr), "%p", SK_KVA(nsr->nsr_ifp)); nustr = uuidstr; } else { nustr = nsr->nsr_name; } SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF, - "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d", + "nexus %p (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr, sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err); #endif /* SK_LOG */ @@ -1516,13 +1515,13 @@ nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr) if (nsr != NULL) { uuid_string_t ifuuidstr; SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF, - "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d", + "nexus %p (%s) if_uuid %s flags 0x%x err %d", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err); } else { SK_DF(err ? 
SK_VERB_ERROR : SK_VERB_NETIF, - "nexus 0x%llx (%s) err %d", SK_KVA(nx), + "nexus %p (%s) err %d", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, err); } #endif /* SK_LOG */ @@ -1714,7 +1713,7 @@ nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags) struct kern_nexus *nx = hwna->na_nx; /* update our work timestamp */ - hwna->na_work_ts = _net_uptime; + hwna->na_work_ts = net_uptime(); if (NX_LLINK_PROV(nx)) { nx_netif_llink_notify_all(nx, flags); @@ -1760,26 +1759,25 @@ nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p, ASSERT(ifp != NULL); SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, - flags); + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0%x", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags); - if (__improbable(!IF_FULLY_ATTACHED(ifp))) { - SK_ERR("kr 0x%llx ifp %s (0x%llx), interface not attached", + if (__improbable(!ifnet_is_fully_attached(ifp))) { + SK_ERR("kr %p ifp %s (%p), interface not attached", SK_KVA(kring), if_name(ifp), SK_KVA(ifp)); return ENXIO; } if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) { - SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr 0x%llx ifp %s (0x%llx), " + SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr %p ifp %s (%p), " "flow control ON", SK_KVA(kring), if_name(ifp), SK_KVA(ifp)); return ENXIO; } /* update our work timestamp */ - KRNA(kring)->na_work_ts = _net_uptime; + KRNA(kring)->na_work_ts = net_uptime(); sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) || !KR_KERNEL_ONLY(kring); @@ -1821,15 +1819,14 @@ nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, int ret; SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, - flags); + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0%x", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags); ASSERT(kring->ckr_rhead <= kring->ckr_lim); /* update our work timestamp */ - KRNA(kring)->na_work_ts = _net_uptime; + KRNA(kring)->na_work_ts = net_uptime(); ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) || kring->ckr_pending_intr != 0); @@ -1847,7 +1844,7 @@ nx_netif_na_dtor(struct nexus_adapter *na) SK_LOCK_ASSERT_HELD(); ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST); - SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na)); + SK_DF(SK_VERB_NETIF, "na \"%s\" (%p)", na->na_name, SK_KVA(na)); /* * If the finalizer callback hasn't been called for whatever @@ -1893,12 +1890,12 @@ nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p, SK_DF(SK_VERB_NETIF | SK_VERB_INTR | ((kring->ckr_tx == NR_RX) ? 
SK_VERB_RX : SK_VERB_TX), - "na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b", + "na \"%s\" (%p) kr \"%s\" (%p) krflags 0x%x", KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS); + SK_KVA(kring), kring->ckr_flags); /* update our work timestamp */ - KRNA(kring)->na_work_ts = _net_uptime; + KRNA(kring)->na_work_ts = net_uptime(); kring->ckr_pending_intr++; if (work_done != NULL) { @@ -2080,7 +2077,7 @@ nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) ASSERT(na->na_type == NA_NETIF_DEV); ASSERT(!(na->na_flags & NAF_HOST_ONLY)); - SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s [%s]", na->na_name, + SK_DF(SK_VERB_NETIF, "na \"%s\" (%p) %s [%s]", na->na_name, SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode)); switch (mode) { @@ -2275,7 +2272,7 @@ nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp) * * The ifnet in 'na_ifp' will be released by na_release_locked(). */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { if (!(ifp->if_refflags & IFRF_EMBRYONIC)) { ifp = NULL; retval = ENXIO; @@ -2301,7 +2298,7 @@ nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp) ASSERT(devna->na_ifp == NULL); } else { ASSERT(devna->na_private == NULL); - /* use I/O refcnt from ifnet_is_attached() */ + /* use I/O refcnt from ifnet_get_ioref() */ devna->na_ifp = ifp; } devna->na_activate = nx_netif_na_activate; @@ -2463,10 +2460,10 @@ nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp) SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name); SK_DF(SK_VERB_NETIF, " UUID: %s", sk_uuid_unparse(devna->na_uuid, uuidstr)); - SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_NETIF, " nx: %p (\"%s\":\"%s\")", SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name, NX_DOM_PROV(devna->na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_NETIF, " flags: 0x%b", devna->na_flags, NAF_BITS); + SK_DF(SK_VERB_NETIF, " flags: 0x%x", devna->na_flags); SK_DF(SK_VERB_NETIF, " flowadv_max: %u", devna->na_flowadv_max); SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u", na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX)); @@ -2476,16 +2473,15 @@ nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp) SK_DF(SK_VERB_NETIF, " next_pipe: %u", devna->na_next_pipe); SK_DF(SK_VERB_NETIF, " max_pipes: %u", devna->na_max_pipes); #endif /* CONFIG_NEXUS_USER_PIPE */ - SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]", - SK_KVA(ifp), ifp->if_xname, ifp->if_refio); + SK_DF(SK_VERB_NETIF, " ifp: %p %s [ioref %u]", + SK_KVA(ifp), ifp->if_xname, os_ref_get_count(&ifp->if_refio)); SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name); SK_DF(SK_VERB_NETIF, " UUID: %s", sk_uuid_unparse(hostna->na_uuid, uuidstr)); - SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_NETIF, " nx: %p (\"%s\":\"%s\")", SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name, NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_NETIF, " flags: 0x%b", - hostna->na_flags, NAF_BITS); + SK_DF(SK_VERB_NETIF, " flags: 0x%x", hostna->na_flags); SK_DF(SK_VERB_NETIF, " flowadv_max: %u", hostna->na_flowadv_max); SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u", na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX)); @@ -2495,8 +2491,8 @@ nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp) SK_DF(SK_VERB_NETIF, " next_pipe: %u", hostna->na_next_pipe); SK_DF(SK_VERB_NETIF, " max_pipes: %u", hostna->na_max_pipes); #endif /* CONFIG_NEXUS_USER_PIPE */ - SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]", - SK_KVA(ifp), ifp->if_xname, 
ifp->if_refio); + SK_DF(SK_VERB_NETIF, " ifp: %p %s [ioref %u]", + SK_KVA(ifp), ifp->if_xname, os_ref_get_count(&ifp->if_refio)); #endif /* SK_LOG */ err: @@ -2720,6 +2716,75 @@ nx_netif_notify_steering_info(struct nx_netif *nif, struct netif_qset *qset, return err; } +static void +configure_capab_rx_flow_steering(struct nx_netif *nif, + nxprov_capab_config_fn_t capab_fn) +{ + struct kern_nexus_capab_rx_flow_steering capab; + struct kern_nexus *nx = nif->nif_nx; + uint32_t capab_len; + int error; + + /* check/configure Rx flow steering */ + if ((nif->nif_ifp->if_xflags & IFXF_RX_FLOW_STEERING) == 0) { + return; + } + bzero(&capab, sizeof(capab)); + capab.kncrxfs_version = + KERN_NEXUS_CAPAB_RX_FLOW_STEERING_VERSION_1; + capab_len = sizeof(capab); + error = capab_fn(NX_PROV(nx), nx, + KERN_NEXUS_CAPAB_RX_FLOW_STEERING, &capab, &capab_len); + if (error != 0) { + DTRACE_SKYWALK2(rx__flow__steering__capab__error, + struct nx_netif *, nif, int, error); + return; + } + VERIFY(capab.kncrxfs_config != NULL); + VERIFY(capab.kncrxfs_prov_ctx != NULL); + nif->nif_rx_flow_steering.config_fn = capab.kncrxfs_config; + nif->nif_rx_flow_steering.prov_ctx = capab.kncrxfs_prov_ctx; + nif->nif_extended_capabilities |= NETIF_CAPAB_RX_FLOW_STEERING; +} + +static void +unconfigure_capab_rx_flow_steering(struct nx_netif *nif) +{ + if ((nif->nif_extended_capabilities & NETIF_CAPAB_RX_FLOW_STEERING) == 0) { + return; + } + bzero(&nif->nif_rx_flow_steering, sizeof(nif->nif_rx_flow_steering)); + nif->nif_extended_capabilities &= ~NETIF_CAPAB_RX_FLOW_STEERING; +} + +int +nx_netif_configure_rx_flow_steering(struct kern_nexus *nx, uint32_t id, + struct ifnet_traffic_descriptor_common *td, + rx_flow_steering_action_t action) +{ + struct netif_rx_flow_steering *rx_flow_steering = NULL; + struct nx_netif *nif; + int err = 0; + + if ((nx->nx_flags & NXF_CLOSED) != 0) { + return ENXIO; + } + + ASSERT(NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF); + nif = NX_NETIF_PRIVATE(nx); + + if ((nif->nif_extended_capabilities & NETIF_CAPAB_RX_FLOW_STEERING) == 0) { + return ENOTSUP; + } + + rx_flow_steering = &nif->nif_rx_flow_steering; + VERIFY(rx_flow_steering->prov_ctx != NULL); + VERIFY(rx_flow_steering->config_fn != NULL); + err = rx_flow_steering->config_fn(rx_flow_steering->prov_ctx, id, + td, action); + return err; +} + static void nx_netif_capabilities_init(struct nx_netif *nif) { @@ -2738,6 +2803,7 @@ nx_netif_capabilities_init(struct nx_netif *nif) } configure_capab_interface_advisory(nif, capab_fn); configure_capab_qset_extensions(nif, capab_fn); + configure_capab_rx_flow_steering(nif, capab_fn); } static void @@ -2745,6 +2811,7 @@ nx_netif_capabilities_fini(struct nx_netif *nif) { unconfigure_capab_interface_advisory(nif); unconfigure_capab_qset_extensions(nif); + unconfigure_capab_rx_flow_steering(nif); } static void @@ -2785,7 +2852,7 @@ na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp) ASSERT(devna != NULL); ASSERT(hostna != NULL); - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { VERIFY(0); /* NOTREACHED */ __builtin_unreachable(); @@ -2793,7 +2860,7 @@ na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp) ASSERT(devna->na_private == ifp); ASSERT(devna->na_ifp == NULL); - /* use I/O refcnt held by ifnet_is_attached() above */ + /* use I/O refcnt held by ifnet_get_ioref() above */ devna->na_ifp = devna->na_private; devna->na_private = NULL; @@ -2810,7 +2877,9 @@ na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp) 
nx_netif_capabilities_init(nif); nx_netif_agent_init(nif); (void) nxctl_inet_traffic_rule_get_count(ifp->if_xname, - &ifp->if_traffic_rule_count); + &ifp->if_inet_traffic_rule_count); + (void) nxctl_eth_traffic_rule_get_count(ifp->if_xname, + &ifp->if_eth_traffic_rule_count); nx_netif_verify_tso_config(nif); nx_netif_callbacks_init(nif); } @@ -2823,7 +2892,7 @@ nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp, struct nx_netif *nif = nifna->nifna_netif; struct kern_nexus *nx = nif->nif_nx; struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV); - uint64_t now = _net_uptime; + uint64_t now = net_uptime(); boolean_t purge; ASSERT(thres != 0); @@ -2926,7 +2995,7 @@ nx_netif_llw_detach_notify(void *arg) ch = ch_list[i]; p = proc_find(ch->ch_pid); if (p == NULL) { - SK_ERR("ch 0x%llx pid %d not found", SK_KVA(ch), ch->ch_pid); + SK_ERR("ch %p pid %d not found", SK_KVA(ch), ch->ch_pid); DTRACE_SKYWALK3(ch__pid__not__found, struct kern_nexus *, nx, struct kern_channel *, ch, pid_t, ch->ch_pid); ch_release(ch); @@ -3071,9 +3140,9 @@ nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch, done: SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF, - "ch 0x%llx from na \"%s\" (0x%llx) naflags %b nx 0x%llx " + "ch %p from na \"%s\" (%p) naflags %x nx %p " "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na), - na->na_flags, NAF_BITS, SK_KVA(ch->ch_nexus), spec_cmd, error); + na->na_flags, SK_KVA(ch->ch_nexus), spec_cmd, error); return error; } @@ -3100,12 +3169,11 @@ nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch, #if SK_LOG uuid_string_t uuidstr; - SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u " - "ring_id %d ring_set %u ep_type %u:%u create %u%s", + SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%x pipe_id %u " + "ring_id %d ring_set %u ep_type %u create %u%s", chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), - (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, - chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set, - chr->cr_real_endpoint, chr->cr_endpoint, create, + (int)chr->cr_port, chr->cr_mode, chr->cr_pipe_id, + (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_endpoint, create, (ep != CH_ENDPOINT_NET_IF) ? 
" (skipped)" : ""); #endif /* SK_LOG */ @@ -3222,7 +3290,7 @@ done: if (err) { SK_ERR("na not found, err(%d)", err); } else { - SK_DF(SK_VERB_NETIF, "found na 0x%llu", na); + SK_DF(SK_VERB_NETIF, "found na %p", SK_KVA(na)); } return err; } @@ -3286,7 +3354,7 @@ nx_netif_alloc(zalloc_flags_t how) NETIF_RWINIT(n); os_ref_init(&n->nif_refcnt, NULL); - SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n)); + SK_DF(SK_VERB_MEM, "netif %p", SK_KVA(n)); return n; } @@ -3298,7 +3366,7 @@ nx_netif_destroy(struct nx_netif *n) ASSERT(n->nif_host_nxb == NULL); ASSERT(os_ref_get_count(&n->nif_refcnt) == 0); nx_netif_llink_config_free(n); - SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n)); + SK_DF(SK_VERB_MEM, "netif %p", SK_KVA(n)); NETIF_RWDESTROY(n); zfree(nx_netif_zone, n); } @@ -3308,7 +3376,7 @@ nx_netif_release(struct nx_netif *n) { SK_LOCK_ASSERT_HELD(); - SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n), + SK_DF(SK_VERB_MEM, "netif %p, refcnt %d", SK_KVA(n), os_ref_get_count(&n->nif_refcnt)); if (os_ref_release(&n->nif_refcnt) == 0) { nx_netif_destroy(n); @@ -3323,7 +3391,7 @@ nx_netif_retain(struct nx_netif *n) /* retaining an object with a zero refcount is not allowed */ ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1); os_ref_retain(&n->nif_refcnt); - SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n), + SK_DF(SK_VERB_MEM, "netif %p, refcnt %d", SK_KVA(n), os_ref_get_count(&n->nif_refcnt)); } @@ -3375,12 +3443,9 @@ static errno_t nx_netif_interface_advisory_notify(void *kern_ctx, const struct ifnet_interface_advisory *advisory) { - _CASSERT(offsetof(struct ifnet_interface_advisory, version) == - offsetof(struct ifnet_interface_advisory, header.version)); - _CASSERT(offsetof(struct ifnet_interface_advisory, direction) == - offsetof(struct ifnet_interface_advisory, header.direction)); - _CASSERT(offsetof(struct ifnet_interface_advisory, _reserved) == - offsetof(struct ifnet_interface_advisory, header.interface_type)); + static_assert(offsetof(struct ifnet_interface_advisory, version) == offsetof(struct ifnet_interface_advisory, header.version)); + static_assert(offsetof(struct ifnet_interface_advisory, direction) == offsetof(struct ifnet_interface_advisory, header.direction)); + static_assert(offsetof(struct ifnet_interface_advisory, _reserved) == offsetof(struct ifnet_interface_advisory, header.interface_type)); if (__improbable(kern_ctx == NULL || advisory == NULL)) { return EINVAL; @@ -3532,7 +3597,7 @@ netif_receive(struct nexus_netif_adapter *nifna, } /* update our work timestamp */ - na->na_work_ts = _net_uptime; + na->na_work_ts = net_uptime(); if (nif->nif_filter_cnt > 0) { struct __kern_packet *__single fpkt_chain = NULL; @@ -3667,7 +3732,7 @@ consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end) } int -netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p, +netif_rx_notify(struct __kern_channel_ring *ring, struct proc *p, uint32_t flags) { struct nexus_adapter *hwna; @@ -3689,10 +3754,9 @@ netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p, if (err != 0) { /* not a serious error, so no need to be chatty here */ SK_DF(SK_VERB_FSW, - "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b " + "hwna \"%s\" (%p) kr \"%s\" (%p) krflags 0x%x " "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)), - ring->ckr_name, SK_KVA(ring), ring->ckr_flags, - CKRF_BITS, err); + ring->ckr_name, SK_KVA(ring), ring->ckr_flags, err); goto out; } if (__improbable(KR_DROP(ring))) { @@ -3719,7 +3783,7 @@ netif_rx_notify_default(struct __kern_channel_ring *ring, struct 
proc *p, if (__improbable(ring->ckr_khead == ktail)) { SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX, "how strange, interrupt with no packets on hwna " - "\"%s\" (0x%llx)", KRNA(ring)->na_name, SK_KVA(KRNA(ring))); + "\"%s\" (%p)", KRNA(ring)->na_name, SK_KVA(KRNA(ring))); goto put_out; } ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead, @@ -3746,78 +3810,22 @@ out: return err; } -int -netif_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p, - uint32_t flags) -{ -#pragma unused(p, flags) - sk_protect_t protect; - struct nexus_adapter *hwna; - struct nexus_pkt_stats stats = {0}; - uint32_t i, count; - int err = 0; - - KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_START), - SK_KVA(ring)); - - /* XXX - * sk_sync_protect() is not needed for this case because - * we are not using the dev ring. Unfortunately lots of - * macros used by fsw still require this. - */ - protect = sk_sync_protect(); - hwna = KRNA(ring); - count = na_get_nslots(hwna, NR_RX); - err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count); - if (__improbable(err != 0)) { - SK_ERR("nx_rx_sync_packets failed: %d", err); - DTRACE_SKYWALK2(rx__sync__packets__failed, - struct __kern_channel_ring *, ring, int, err); - goto out; - } - DTRACE_SKYWALK1(chain__count, uint32_t, count); - for (i = 0; i < count; i++) { - struct __kern_packet *pkt_chain; - - pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]); - ASSERT(pkt_chain != NULL); - netif_receive(NIFNA(KRNA(ring)), pkt_chain, &stats); - - if (ring->ckr_netif_mit_stats != NULL && - stats.nps_pkts != 0 && stats.nps_bytes != 0) { - ring->ckr_netif_mit_stats(ring, stats.nps_pkts, - stats.nps_bytes); - } - } -out: - sk_sync_unprotect(protect); - KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_END), - SK_KVA(ring), err); - return err; -} - - /* * Configure the NA to operate in a particular mode. */ static channel_ring_notify_t -netif_hwna_get_notify(struct __kern_channel_ring *ring, netif_mode_t mode) +netif_hwna_get_notify(netif_mode_t mode) { channel_ring_notify_t notify = NULL; - boolean_t has_sync_pkts = (sk_rx_sync_packets != 0 && - nx_has_rx_sync_packets(ring)); if (mode == NETIF_MODE_FSW) { - notify = (has_sync_pkts ? netif_rx_notify_fast : - netif_rx_notify_default); + notify = netif_rx_notify; } else if (mode == NETIF_MODE_LLW) { - notify = (has_sync_pkts ? 
netif_llw_rx_notify_fast : - netif_llw_rx_notify_default); + notify = netif_llw_rx_notify; } return notify; } - static uint32_t netif_mode_to_flag(netif_mode_t mode) { @@ -3844,7 +3852,7 @@ netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode, for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) { struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i]; - channel_ring_notify_t notify = netif_hwna_get_notify(kr, mode); + channel_ring_notify_t notify = netif_hwna_get_notify(mode); if (set) { kr->ckr_save_notify = kr->ckr_netif_notify; @@ -4136,22 +4144,23 @@ netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq, struct ifnet *ifp = hwna->na_ifp; uint32_t pkts_cnt; uint32_t bytes_cnt; + uint32_t sch_model = ifp->if_output_sched_model; + mbuf_svc_class_t svc; errno_t rc; ASSERT(ifp != NULL); - ASSERT(ifp->if_output_sched_model < IFNET_SCHED_MODEL_MAX); + ASSERT(IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)); ASSERT((pkt_limit != 0) && (byte_limit != 0)); if (ifcq == NULL) { ifcq = netif_get_default_ifcq(hwna); } - if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) { - rc = ifclassq_dequeue_sc(ifcq, (mbuf_svc_class_t)sc, - pkt_limit, byte_limit, &pkt_head, NULL, pkt_cnt, bytes, qset_idx); - } else { - rc = ifclassq_dequeue(ifcq, pkt_limit, byte_limit, - &pkt_head, NULL, pkt_cnt, bytes, qset_idx); - } + + svc = (sch_model & IFNET_SCHED_DRIVER_MANGED_MODELS) ? + (mbuf_svc_class_t)sc : MBUF_SC_UNSPEC; + rc = ifclassq_dequeue(ifcq, svc, pkt_limit, byte_limit, &pkt_head, NULL, + pkt_cnt, bytes, qset_idx); + ASSERT((rc == 0) || (rc == EAGAIN)); ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL)); @@ -4216,15 +4225,15 @@ netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit, goto out; } - if (__improbable(!IF_FULLY_ATTACHED(ifp))) { - SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached", + if (__improbable(!ifnet_is_fully_attached(ifp))) { + SK_ERR("hwna %p ifp %s (%p), interface not attached", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp)); rc = ENXIO; goto out; } if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) { - SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), " + SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna %p ifp %s (%p), " "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp)); rc = ENXIO; goto out; @@ -4243,7 +4252,7 @@ netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit, if (__improbable(KR_DROP(ring) || !NA_IS_ACTIVE(ring->ckr_na))) { - SK_ERR("hw-kr 0x%llx stopped", SK_KVA(ring)); + SK_ERR("hw-kr %p stopped", SK_KVA(ring)); rc = ENXIO; goto done; } @@ -4555,24 +4564,6 @@ kern_netif_qset_tx_queue_len(kern_netif_qset_t qset, uint32_t svc, bytes_cnt); } -void -kern_netif_set_qset_combined(kern_netif_qset_t qset) -{ - VERIFY(qset != NULL); - VERIFY(qset->nqs_ifcq != NULL); - - ifclassq_set_grp_combined(qset->nqs_ifcq, qset->nqs_idx); -} - -void -kern_netif_set_qset_separate(kern_netif_qset_t qset) -{ - VERIFY(qset != NULL); - VERIFY(qset->nqs_ifcq != NULL); - - ifclassq_set_grp_separated(qset->nqs_ifcq, qset->nqs_idx); -} - errno_t kern_nexus_netif_llink_add(struct kern_nexus *nx, struct kern_nexus_netif_llink_init *llink_init) diff --git a/bsd/skywalk/nexus/netif/nx_netif.h b/bsd/skywalk/nexus/netif/nx_netif.h index 609709b09..617efa4af 100644 --- a/bsd/skywalk/nexus/netif/nx_netif.h +++ b/bsd/skywalk/nexus/netif/nx_netif.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2023 Apple Inc. All rights reserved. + * Copyright (c) 2015-2025 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,7 +119,7 @@ struct netif_queue { void *nq_ctx; kern_packet_svc_class_t nq_svc; /* service class of TX queue */ uint16_t nq_flags; -}__attribute__((aligned(sizeof(uint64_t)))); +} __sk_aligned(64); /* values for nq_flags */ #define NETIF_QUEUE_EXT_INITED 0x0001 /* nxnpi_queue_init() succeeded */ @@ -196,7 +196,7 @@ struct netif_agent_flow { uuid_t naf_flow_uuid; uuid_t naf_bind_key; nexus_port_t naf_nx_port; - uint16_t naf_flags; + uint32_t naf_flags; pid_t naf_pid; union sockaddr_in_4_6 naf_daddr; union sockaddr_in_4_6 naf_saddr; @@ -204,15 +204,15 @@ struct netif_agent_flow { #define NIFNA(_na) (__container_of((_na), struct nexus_netif_adapter, nifna_up)) -/* nif_flags */ /* - * This is named differently from the flow classification rule - * (IPV6 ULA) because this gives us the flexibility of using - * different types of classification in the future. + * Values for nif_flags + * Used for describing the internal state of the nx_netif structure */ #define NETIF_FLAG_LOW_LATENCY 0x00000001 #define NETIF_FLAG_COMPAT 0x00000002 #define NETIF_FLAG_LLINK_INITIALIZED 0x00000004 +#define NETIF_FLAG_CHANGE_PENDING 0x00000008 + #define NETIF_IS_LOW_LATENCY(n) \ (((n)->nif_flags & NETIF_FLAG_LOW_LATENCY) != 0) #define NETIF_IS_COMPAT(n) \ @@ -248,12 +248,18 @@ typedef enum { /* nif capabilities */ #define NETIF_CAPAB_INTERFACE_ADVISORY 0x00000001 #define NETIF_CAPAB_QSET_EXTENSIONS 0x00000002 +#define NETIF_CAPAB_RX_FLOW_STEERING 0x00000004 struct netif_qset_extensions { kern_nexus_capab_qsext_notify_steering_info_fn_t qe_notify_steering_info; void *qe_prov_ctx; }; +struct netif_rx_flow_steering { + kern_nexus_capab_rx_flow_steering_config_fn_t config_fn; + void *prov_ctx; +}; + /* * nx_netif is a descriptor for a netif nexus instance. 
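/* ------------------------------------------------------------------
 * Editorial sketch, not part of the xnu patch: the capability handshake
 * that the new configure_capab_rx_flow_steering() follows -- the nexus
 * fills a versioned request, the provider answers with a callback plus
 * an opaque context, and the pair is stored (here, in a local struct
 * mirroring netif_rx_flow_steering) for later invocation.  All names in
 * the sketch (steering_capab, provider_query(), ...) are hypothetical.
 * ------------------------------------------------------------------ */
#include <stdio.h>
#include <string.h>

#define STEERING_CAPAB_VERSION_1 1

struct steering_capab {
	unsigned version;
	int (*config_fn)(void *prov_ctx, unsigned flow_id, int action);
	void *prov_ctx;
};

/* What a driver/provider would supply. */
static int
provider_config(void *prov_ctx, unsigned flow_id, int action)
{
	printf("%s: flow %u action %d\n", (const char *)prov_ctx,
	    flow_id, action);
	return 0;
}

static int
provider_query(struct steering_capab *capab)
{
	static char ctx[] = "fake-nic0";

	if (capab->version != STEERING_CAPAB_VERSION_1)
		return -1;                 /* ENOTSUP in the real code */
	capab->config_fn = provider_config;
	capab->prov_ctx = ctx;
	return 0;
}

int
main(void)
{
	struct steering_capab capab;
	struct steering_capab saved = { 0, NULL, NULL }; /* what the nexus keeps */

	memset(&capab, 0, sizeof(capab));
	capab.version = STEERING_CAPAB_VERSION_1;
	if (provider_query(&capab) == 0)
		saved = capab;             /* capability is now "configured" */
	if (saved.config_fn != NULL)
		(void)saved.config_fn(saved.prov_ctx, 7, 1);
	return 0;
}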
*/ @@ -325,6 +331,8 @@ struct nx_netif { void *nif_intf_adv_prov_ctx; struct netif_qset_extensions nif_qset_extensions; + + struct netif_rx_flow_steering nif_rx_flow_steering; #if (DEVELOPMENT || DEBUG) struct skoid nif_skoid; #endif /* !DEVELOPMENT && !DEBUG */ @@ -718,13 +726,9 @@ extern void nx_netif_vp_region_params_adjust(struct nexus_adapter *, extern void nx_netif_pktap_output(ifnet_t, int, struct __kern_packet *); -extern int netif_rx_notify_default(struct __kern_channel_ring *, +extern int netif_rx_notify(struct __kern_channel_ring *, struct proc *p, uint32_t); -extern int netif_rx_notify_fast(struct __kern_channel_ring *, - struct proc *p, uint32_t); -extern int netif_llw_rx_notify_default(struct __kern_channel_ring *, - struct proc *p, uint32_t); -extern int netif_llw_rx_notify_fast(struct __kern_channel_ring *, +extern int netif_llw_rx_notify(struct __kern_channel_ring *, struct proc *p, uint32_t); extern void netif_receive(struct nexus_netif_adapter *, struct __kern_packet *, struct nexus_pkt_stats *); @@ -765,6 +769,8 @@ extern void nx_netif_qset_release(struct netif_qset **); extern void nx_netif_llink_init(struct nx_netif *); extern void nx_netif_llink_fini(struct nx_netif *); extern struct netif_qset * nx_netif_find_qset(struct nx_netif *, uint64_t); +extern struct netif_qset * nx_netif_find_qset_with_pkt(struct ifnet *, + struct __kern_packet *); extern struct netif_qset * nx_netif_get_default_qset_noref(struct nx_netif *); extern int netif_qset_enqueue(struct netif_qset *, bool chain, struct __kern_packet *, struct __kern_packet *, uint32_t, uint32_t, @@ -782,6 +788,12 @@ extern int nx_netif_llink_remove(struct nx_netif *, kern_nexus_netif_llink_id_t); extern int nx_netif_notify_steering_info(struct nx_netif *, struct netif_qset *, struct ifnet_traffic_descriptor_common *, bool); + +/* + * Rx flow steering functions + */ +extern int nx_netif_configure_rx_flow_steering(struct kern_nexus *, uint32_t, + struct ifnet_traffic_descriptor_common *, rx_flow_steering_action_t); __END_DECLS #endif /* CONFIG_NEXUS_NETIF */ #include diff --git a/bsd/skywalk/nexus/netif/nx_netif_compat.c b/bsd/skywalk/nexus/netif/nx_netif_compat.c index ebdc49945..acec70b6a 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_compat.c +++ b/bsd/skywalk/nexus/netif/nx_netif_compat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2022 Apple Inc. All rights reserved. + * Copyright (c) 2015-2024 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -55,8 +55,9 @@ #include #include #include -#include #include +#include +#include static void na_netif_compat_finalize(struct nexus_netif_adapter *, struct ifnet *); @@ -161,7 +162,7 @@ static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_pool, SKMEM_TAG_NETIF_COMPAT_POOL void nx_netif_compat_init(struct nxdom *nxdom) { - _CASSERT(NETIF_COMPAT_MAX_MBUF_DATA_COPY <= NETIF_COMPAT_BUF_SIZE); + static_assert(NETIF_COMPAT_MAX_MBUF_DATA_COPY <= NETIF_COMPAT_BUF_SIZE); /* * We want nxprov_create() coming from userland to use the @@ -182,7 +183,7 @@ na_netif_compat_alloc(zalloc_flags_t how) { struct nexus_netif_compat_adapter *nca; - _CASSERT(offsetof(struct nexus_netif_compat_adapter, nca_up) == 0); + static_assert(offsetof(struct nexus_netif_compat_adapter, nca_up) == 0); nca = zalloc_flags(na_netif_compat_zone, how | Z_ZERO); if (nca) { @@ -234,7 +235,7 @@ nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg) f = NMB_GET_FLAGS(p); i = NMB_GET_INDEX(p); - SK_DF(SK_VERB_NETIF, "%s m 0x%llx txq %u i %u f 0x%x", + SK_DF(SK_VERB_NETIF, "%s m %p txq %u i %u f 0x%x", if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f); if (f & NMB_PROPF_TX_NOTIFY) { @@ -250,11 +251,11 @@ nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg) } /* TODO: adi@apple.com -- what to do? */ SK_ERR("Failed to clear TX_NOTIFY " - "m 0x%llx i %u err %d", SK_KVA(m), i, err); + "m %p i %u err %d", SK_KVA(m), i, err); } else { nx_netif_compat_tx_intr(ifp, NR_TX, txq, NULL); SK_DF(SK_VERB_NETIF | SK_VERB_INTR | SK_VERB_TX, - "%s TX irq m 0x%llx txq %u i %u f 0x%x", + "%s TX irq m %p txq %u i %u f 0x%x", if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f); STATS_INC(nifs, NETIF_STATS_TX_IRQ); } @@ -290,7 +291,7 @@ nx_netif_compat_ring_alloc(int how, int len, uint16_t idx) if (err == EBUSY) { /* try again */ continue; } - SK_ERR("Failed to initialize properties m 0x%llx " + SK_ERR("Failed to initialize properties m %p " "err %d", SK_KVA(m), err); m_freem(m); return NULL; @@ -301,7 +302,7 @@ nx_netif_compat_ring_alloc(int how, int len, uint16_t idx) break; } - SK_DF(SK_VERB_MEM, "alloc m 0x%llx size %u i %u", + SK_DF(SK_VERB_MEM, "alloc m %p size %u i %u", SK_KVA(m), (uint32_t)size, i); return m; @@ -327,7 +328,7 @@ nx_netif_compat_ring_free(struct mbuf *m) continue; } /* TODO: adi@apple.com -- what to do? 
*/ - SK_ERR("Failed to clear properties m 0x%llx err %d", + SK_ERR("Failed to clear properties m %p err %d", SK_KVA(m), err); } break; @@ -343,7 +344,7 @@ nx_netif_compat_tx_intr(struct ifnet *ifp, enum txrx t, uint32_t q, if (__improbable(!NA_IS_ACTIVE(na) || q >= na_get_nrings(na, t))) { if (q >= na_get_nrings(na, t)) { - SK_ERR("na \"%s\" (0x%llx) invalid q %u >= %u", + SK_ERR("na \"%s\" (%p) invalid q %u >= %u", na->na_name, SK_KVA(na), q, na_get_nrings(na, t)); } } else { @@ -404,7 +405,7 @@ nx_netif_compat_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) ASSERT(na->na_type == NA_NETIF_COMPAT_DEV); ASSERT(!(na->na_flags & NAF_HOST_ONLY)); - SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s", na->na_name, + SK_DF(SK_VERB_NETIF, "na \"%s\" (%p) %s", na->na_name, SK_KVA(na), na_activate_mode2str(mode)); nca = (struct nexus_netif_compat_adapter *)nifna; @@ -532,9 +533,9 @@ nx_netif_compat_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) nx_mbq_safe_init(kr, &kr->ckr_rx_queue, limit, &nexus_mbq_lock_group, &nexus_lock_attr); SK_DF(SK_VERB_NETIF, - "na \"%s\" (0x%llx) initialized kr \"%s\" " - "(0x%llx) krflags 0x%b", na->na_name, SK_KVA(na), - kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS); + "na \"%s\" (%p) initialized kr \"%s\" " + "(%p) krflags 0x%x", na->na_name, SK_KVA(na), + kr->ckr_name, SK_KVA(kr), kr->ckr_flags); } /* @@ -755,7 +756,7 @@ nx_netif_compat_tx_clean(struct netif_stats *nifs, } kring->ckr_ktail = SLOT_PREV(nm_i, lim); - SK_RDF(SK_VERB_NETIF, 10, "kr \"%s\" (0x%llx) tx completed [%u] -> " + SK_RDF(SK_VERB_NETIF, 10, "kr \"%s\" (%p) tx completed [%u] -> " "kh %u kt %u | rh %u rt %u", kring->ckr_name, SK_KVA(kring), n, kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail); @@ -827,7 +828,7 @@ nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring, * on the ring slot 'e': There is nothing to do. */ SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX, - "TX_NOTIFY already set at %u m 0x%llx kc %u ntc %u", + "TX_NOTIFY already set at %u m %p kc %u ntc %u", e, SK_KVA(m), khead, ntc); return; } @@ -841,11 +842,11 @@ nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring, continue; } /* TODO: adi@apple.com -- what to do? */ - SK_ERR("Failed to set TX_NOTIFY at %u m 0x%llx kh %u " + SK_ERR("Failed to set TX_NOTIFY at %u m %p kh %u " "ntc %u, err %d", e, SK_KVA(m), khead, ntc, err); } else { SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX, - "Request TX_NOTIFY at %u m 0x%llx kh %u ntc %u", + "Request TX_NOTIFY at %u m %p kh %u ntc %u", e, SK_KVA(m), khead, ntc); } break; @@ -860,10 +861,10 @@ nx_netif_compat_na_txsync_log(struct __kern_channel_ring *kring, struct proc *p, uint32_t flags, slot_idx_t nm_i) { SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0x%x " + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0x%x " "nm_i %u, kh %u kt %u | rh %u rt %u", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags, nm_i, kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail); } @@ -888,7 +889,7 @@ nx_netif_compat_na_txsync(struct __kern_channel_ring *kring, struct proc *p, STATS_INC(nifs, NETIF_STATS_TX_SYNC); /* update our work timestamp */ - na->na_work_ts = _net_uptime; + na->na_work_ts = net_uptime(); /* * First part: process new packets to send. 
@@ -916,13 +917,13 @@ nx_netif_compat_na_txsync(struct __kern_channel_ring *kring, struct proc *p, STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); SK_DF(SK_VERB_MEM, - "%s(%d) kr \"%s\" (0x%llx) " - "krflags 0x%b ckr_tx_pool[%u] " + "%s(%d) kr \"%s\" (%p) " + "krflags 0x%x ckr_tx_pool[%u] " "allocation failed", - sk_proc_name_address(p), + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, nm_i); + nm_i); /* * Here we could schedule a timer * which retries to replenish after @@ -1020,9 +1021,9 @@ static void nx_netif_compat_receive_log1(const struct __kern_channel_ring *kring, struct nx_mbq *q) { - SK_RD(10, "kr \"%s\" (0x%llx) krflags 0x%b FULL " - "(qlen %u qsize %llu), kc %u kt %u", kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, nx_mbq_len(q), + SK_RD(10, "kr \"%s\" (%p) krflags 0x%x FULL " + "(qlen %u qsize %zu), kc %u kt %u", kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, nx_mbq_len(q), nx_mbq_size(q), kring->ckr_khead, kring->ckr_ktail); } @@ -1032,10 +1033,10 @@ static void nx_netif_compat_receive_log2(const struct __kern_channel_ring *kring, struct nx_mbq *q, const struct ifnet_stat_increment_param *s) { - SK_RDF(SK_VERB_RX, 10, "kr \"%s\" (0x%llx) krflags 0x%b OK, " - "added %u packets %u bytes, now qlen %u qsize %llu", - kring->ckr_name, SK_KVA(kring), kring->ckr_flags, CKRF_BITS, - s->packets_in, s->bytes_in, nx_mbq_len(q), nx_mbq_size(q)); + SK_RDF(SK_VERB_RX, 10, "kr \"%s\" (%p) krflags 0x%x OK, " + "added %u packets %u bytes, now qlen %u qsize %zu", + kring->ckr_name, SK_KVA(kring), kring->ckr_flags, s->packets_in, + s->bytes_in, nx_mbq_len(q), nx_mbq_size(q)); } #endif /* SK_LOG */ @@ -1060,7 +1061,7 @@ nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head, errno_t err = 0; /* update our work timestamp */ - na->na_work_ts = _net_uptime; + na->na_work_ts = net_uptime(); if (__improbable(m_head == NULL)) { ASSERT(m_tail == NULL); @@ -1110,7 +1111,7 @@ nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head, * then here do: * * if (r >= na_get_nrings(na, NR_RX)) { - * SK_ERR("na \"%s\" (0x%llx) invalid r %u >= %u", + * SK_ERR("na \"%s\" (%p) invalid r %u >= %u", * na->na_name, SK_KVA(na), r, * na_get_nrings(na, NR_RX)); * } @@ -1208,10 +1209,10 @@ nx_netif_compat_na_rxsync_log(const struct __kern_channel_ring *kring, struct proc *p, uint32_t flags, slot_idx_t nm_i) { SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b " - "ring %u flags 0x%x nm_i %u kt %u", sk_proc_name_address(p), + "%s(%d) kr \"%s\" (%p) krflags 0x%x " + "ring %u flags 0x%x nm_i %u kt %u", sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, kring->ckr_ring_id, flags, nm_i, kring->ckr_ktail); + kring->ckr_ring_id, flags, nm_i, kring->ckr_ktail); } #endif /* SK_LOG */ @@ -1318,7 +1319,7 @@ nx_netif_compat_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, } /* update our work timestamp */ - na->na_work_ts = _net_uptime; + na->na_work_ts = net_uptime(); /* first empty slot in the receive ring */ nm_i = kring->ckr_ktail; @@ -1364,8 +1365,8 @@ nx_netif_compat_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, err = kern_pbufpool_alloc_batch_nosleep(pp, 1, kring->ckr_scratch, &ph_cnt); if (err == ENOMEM) { - SK_DF(SK_VERB_MEM, "%s(%p) failed to alloc %d pkts for kr " - "0x%llu", sk_proc_name_address(p), sk_proc_pid(p), ph_cnt, + SK_DF(SK_VERB_MEM, "%s(%d) failed to alloc %d pkts for kr %p", + sk_proc_name(p), sk_proc_pid(p), ph_cnt, 
SK_KVA(kring)); goto done; } @@ -1386,7 +1387,7 @@ nx_netif_compat_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, if (__improbable(mlen == 0 || h == NULL || h < (char *)mbuf_datastart(m) || h > (char *)m->m_data)) { STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); - SK_RD(5, "kr \"%s\" (0x%llx) m 0x%llx len %d" + SK_RD(5, "kr \"%s\" (%p) m %p len %d" "bad pkt_hdr", kring->ckr_name, SK_KVA(kring), SK_KVA(m), mlen); m_freem(m); @@ -1433,11 +1434,10 @@ nx_netif_compat_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, pkt->pkt_link_flags |= PKT_LINKF_ETHFCS; } if (mbuf_get_vlan_tag(m, &tag) == 0) { - (void) kern_packet_set_vlan_tag(SK_PKT2PH(pkt), tag, - FALSE); + (void) kern_packet_set_vlan_tag(SK_PKT2PH(pkt), tag); } SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX, - "kr \"%s\" (0x%llx) m 0x%llx idx %u slot_len %d", + "kr \"%s\" (%p) m %p idx %u slot_len %d", kring->ckr_name, SK_KVA(kring), SK_KVA(m), nm_i, mlen); if (__probable(attach_mbuf)) { @@ -1528,7 +1528,7 @@ nx_netif_compat_na_dtor(struct nexus_adapter *na) SK_LOCK_ASSERT_HELD(); - SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na)); + SK_DF(SK_VERB_NETIF, "na \"%s\" (%p)", na->na_name, SK_KVA(na)); /* * If the finalizer callback hasn't been called for whatever @@ -1591,7 +1591,7 @@ nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp) * * The ifnet in 'na_ifp' will be released by na_release_locked(). */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { if (!(ifp->if_refflags & IFRF_EMBRYONIC)) { ifp = NULL; retval = ENXIO; @@ -1615,7 +1615,7 @@ nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp) ASSERT(devna->na_ifp == NULL); } else { ASSERT(devna->na_private == NULL); - /* use I/O refcnt from ifnet_is_attached() */ + /* use I/O refcnt from ifnet_get_ioref() */ devna->na_ifp = ifp; } @@ -1752,10 +1752,10 @@ nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp) SK_DF(SK_VERB_NETIF, "na_name: \"%s\"", devna->na_name); SK_DF(SK_VERB_NETIF, " UUID: %s", sk_uuid_unparse(devna->na_uuid, uuidstr)); - SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_NETIF, " nx: %p (\"%s\":\"%s\")", SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name, NX_DOM_PROV(devna->na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_NETIF, " flags: 0x%b", devna->na_flags, NAF_BITS); + SK_DF(SK_VERB_NETIF, " flags: 0x%x", devna->na_flags); SK_DF(SK_VERB_NETIF, " flowadv_max: %u", devna->na_flowadv_max); SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u", na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX)); @@ -1765,16 +1765,15 @@ nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp) SK_DF(SK_VERB_NETIF, " next_pipe: %u", devna->na_next_pipe); SK_DF(SK_VERB_NETIF, " max_pipes: %u", devna->na_max_pipes); #endif /* CONFIG_NEXUS_USER_PIPE */ - SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]", - SK_KVA(ifp), ifp->if_xname, ifp->if_refio); + SK_DF(SK_VERB_NETIF, " ifp: %p %s [ioref %u]", + SK_KVA(ifp), ifp->if_xname, os_ref_get_count(&ifp->if_refio)); SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name); SK_DF(SK_VERB_NETIF, " UUID: %s", sk_uuid_unparse(hostna->na_uuid, uuidstr)); - SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_NETIF, " nx: %p (\"%s\":\"%s\")", SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name, NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_NETIF, " flags: 0x%b", - hostna->na_flags, NAF_BITS); + SK_DF(SK_VERB_NETIF, " flags: 0x%x", hostna->na_flags); SK_DF(SK_VERB_NETIF, " flowadv_max: %u", 
hostna->na_flowadv_max); SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u", na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX)); @@ -1784,8 +1783,8 @@ nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp) SK_DF(SK_VERB_NETIF, " next_pipe: %u", hostna->na_next_pipe); SK_DF(SK_VERB_NETIF, " max_pipes: %u", hostna->na_max_pipes); #endif /* CONFIG_NEXUS_USER_PIPE */ - SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]", SK_KVA(ifp), - ifp->if_xname, ifp->if_refio); + SK_DF(SK_VERB_NETIF, " ifp: %p %s [ioref %u]", SK_KVA(ifp), + ifp->if_xname, os_ref_get_count(&ifp->if_refio)); #endif /* SK_LOG */ err: @@ -1888,7 +1887,7 @@ nx_netif_compat_xmit_frame(struct nexus_adapter *na, struct mbuf *m, int ret = 0; if ((ret = mbuf_ring_cluster_activate(m)) != 0) { - panic("Failed to activate mbuf ring cluster 0x%llx (%d)", + panic("Failed to activate mbuf ring cluster %p (%d)", SK_KVA(m), ret); /* NOTREACHED */ __builtin_unreachable(); diff --git a/bsd/skywalk/nexus/netif/nx_netif_filter.c b/bsd/skywalk/nexus/netif/nx_netif_filter.c index bc16350a9..fec2d5877 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_filter.c +++ b/bsd/skywalk/nexus/netif/nx_netif_filter.c @@ -386,11 +386,11 @@ nx_netif_filter_set_enable(struct nx_netif *nif, boolean_t set) } lck_mtx_lock(&nif->nif_filter_lock); if (set) { - SK_DF(SK_VERB_FILTER, "%s: filter enabled, nif 0x%llx", + SK_DF(SK_VERB_FILTER, "%s: filter enabled, nif %p", if_name(nif->nif_ifp), SK_KVA(nif)); nif->nif_filter_flags |= NETIF_FILTER_FLAG_ENABLED; } else { - SK_DF(SK_VERB_FILTER, "%s: filter disabled, nif 0x%llx", + SK_DF(SK_VERB_FILTER, "%s: filter disabled, nif %p", if_name(nif->nif_ifp), SK_KVA(nif)); nif->nif_filter_flags &= ~NETIF_FILTER_FLAG_ENABLED; } diff --git a/bsd/skywalk/nexus/netif/nx_netif_filter_compat.c b/bsd/skywalk/nexus/netif/nx_netif_filter_compat.c index b9199ecce..02024d6e6 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_filter_compat.c +++ b/bsd/skywalk/nexus/netif/nx_netif_filter_compat.c @@ -268,7 +268,7 @@ nx_netif_compat_tx_dequeue(struct nexus_netif_adapter *nifna, * TODO: * The number of packets to move should be dependent on * the available ring space of the next filter. The limits - * should be adjusted at ifclassq_dequeue_common(). + * should be adjusted at ifclassq_dequeue(). */ nx_netif_filter_tx_mbuf_enqueue(nifna, head->cp_mbuf); } diff --git a/bsd/skywalk/nexus/netif/nx_netif_filter_native.c b/bsd/skywalk/nexus/netif/nx_netif_filter_native.c index 2f4d8d85e..e95e915c5 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_filter_native.c +++ b/bsd/skywalk/nexus/netif/nx_netif_filter_native.c @@ -269,7 +269,7 @@ nx_netif_native_tx_dequeue(struct nexus_netif_adapter *nifna, * TODO: * The number of packets to move should be dependent on * the available ring space of the next filter. The limits - * should be adjusted at ifclassq_dequeue_common(). + * should be adjusted at ifclassq_dequeue(). 
*/ nx_netif_filter_tx_pkt_enqueue(nifna, head->cp_kpkt); } diff --git a/bsd/skywalk/nexus/netif/nx_netif_filter_vp.c b/bsd/skywalk/nexus/netif/nx_netif_filter_vp.c index 03e67ee13..74aef5dde 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_filter_vp.c +++ b/bsd/skywalk/nexus/netif/nx_netif_filter_vp.c @@ -199,7 +199,7 @@ netif_filter_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed); } - SK_DF(SK_VERB_FILTER, "na \"%s\" (0x%llx) %s", na->na_name, + SK_DF(SK_VERB_FILTER, "na \"%s\" (%p) %s", na->na_name, SK_KVA(na), na_activate_mode2str(mode)); return 0; } @@ -509,7 +509,7 @@ netif_filter_na_dtor(struct nexus_adapter *na) nifna->nifna_netif = NULL; } NETIF_WUNLOCK(nif); - SK_DF(SK_VERB_FILTER, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na)); + SK_DF(SK_VERB_FILTER, "na \"%s\" (%p)", na->na_name, SK_KVA(na)); } int diff --git a/bsd/skywalk/nexus/netif/nx_netif_flow.c b/bsd/skywalk/nexus/netif/nx_netif_flow.c index fa524c2f1..4efbcea3f 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_flow.c +++ b/bsd/skywalk/nexus/netif/nx_netif_flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Apple Inc. All rights reserved. + * Copyright (c) 2019-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -147,7 +147,7 @@ netif_flow_ethertype_info(struct __kern_packet *pkt, } etype = ntohs(etype); - if (kern_packet_get_vlan_tag(SK_PKT2PH(pkt), &tag, NULL) == 0) { + if (kern_packet_get_vlan_tag(SK_PKT2PH(pkt), &tag) == 0) { DTRACE_SKYWALK2(hw__vlan, struct __kern_packet *, pkt, uint16_t, tag); } else if (etype == ETHERTYPE_VLAN) { @@ -783,7 +783,7 @@ nx_netif_flow_add(struct nx_netif *nif, nexus_port_t port, } STATS_INC(nifs, NETIF_STATS_VP_FLOW_ADD); lck_mtx_unlock(&nif->nif_flow_lock); - SK_DF(SK_VERB_VP, "flow add successful: if %s, nif 0x%llx", + SK_DF(SK_VERB_VP, "flow add successful: if %s, nif %p", if_name(nif->nif_ifp), SK_KVA(nif)); nx_netif_flow_log(nif, nf, TRUE); return 0; @@ -798,7 +798,7 @@ fail: } } lck_mtx_unlock(&nif->nif_flow_lock); - SK_ERR("flow add failed: if %s, nif 0x%llx, err %d", + SK_ERR("flow add failed: if %s, nif %p, err %d", if_name(nif->nif_ifp), SK_KVA(nif), err); return err; } @@ -829,7 +829,7 @@ nx_netif_flow_remove(struct nx_netif *nif, struct netif_flow *nf) STATS_INC(nifs, NETIF_STATS_VP_FLOW_REMOVE); lck_mtx_unlock(&nif->nif_flow_lock); - SK_DF(SK_VERB_VP, "flow remove: if %s, nif 0x%llx", + SK_DF(SK_VERB_VP, "flow remove: if %s, nif %p", if_name(nif->nif_ifp), SK_KVA(nif)); nx_netif_flow_log(nif, nf, FALSE); sk_free_type(struct netif_flow, nf); @@ -898,11 +898,11 @@ nx_netif_flow_set_enable(struct nx_netif *nif, boolean_t set) } lck_mtx_lock(&nif->nif_flow_lock); if (set) { - SK_DF(SK_VERB_VP, "%s: flow enable, nif 0x%llx", + SK_DF(SK_VERB_VP, "%s: flow enable, nif %p", if_name(nif->nif_ifp), SK_KVA(nif)); nif->nif_flow_flags |= NETIF_FLOW_FLAG_ENABLED; } else { - SK_DF(SK_VERB_VP, "%s: flow disable, nif 0x%llx", + SK_DF(SK_VERB_VP, "%s: flow disable, nif %p", if_name(nif->nif_ifp), SK_KVA(nif)); nif->nif_flow_flags &= ~NETIF_FLOW_FLAG_ENABLED; } diff --git a/bsd/skywalk/nexus/netif/nx_netif_gso.c b/bsd/skywalk/nexus/netif/nx_netif_gso.c index 6bd417a59..f5c1302ba 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_gso.c +++ b/bsd/skywalk/nexus/netif/nx_netif_gso.c @@ -157,7 +157,7 @@ netif_gso_check_netif_active(struct ifnet *ifp, struct mbuf *m, if (__improbable(!NA_IS_ACTIVE(hwna))) { STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE); SK_DF(SK_VERB_NETIF, - "\"%s\" (0x%llx) not in skywalk 
mode anymore", + "\"%s\" (%p) not in skywalk mode anymore", hwna->na_name, SK_KVA(hwna)); return ENXIO; } @@ -167,9 +167,9 @@ netif_gso_check_netif_active(struct ifnet *ifp, struct mbuf *m, if (__improbable(KR_DROP(kring))) { STATS_INC(nifs, NETIF_STATS_DROP_KRDROP_MODE); SK_DF(SK_VERB_NETIF, - "kr \"%s\" (0x%llx) krflags 0x%b or %s in drop mode", + "kr \"%s\" (%p) krflags 0x%x or %s in drop mode", kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, ifp->if_xname); + ifp->if_xname); return ENXIO; } *pp = kring->ckr_pp; @@ -190,25 +190,18 @@ netif_gso_send(struct ifnet *ifp, struct __kern_packet *head, struct nx_netif *nif = NA(ifp)->nifna_netif; struct netif_stats *nifs = &nif->nif_stats; struct netif_qset *__single qset = NULL; - uint64_t qset_id = 0; int error = 0; boolean_t dropped; - if (NX_LLINK_PROV(nif->nif_nx) && - ifp->if_traffic_rule_count > 0 && - nxctl_inet_traffic_rule_find_qset_id_with_pkt(ifp->if_xname, - head, &qset_id) == 0) { - qset = nx_netif_find_qset(nif, qset_id); - ASSERT(qset != NULL); - } + qset = nx_netif_find_qset_with_pkt(ifp, head); if (netif_chain_enqueue_enabled(ifp)) { dropped = false; if (qset != NULL) { head->pkt_qset_idx = qset->nqs_idx; - error = ifnet_enqueue_ifcq_pkt_chain(ifp, qset->nqs_ifcq, + error = ifnet_enqueue_pkt_chain(ifp, qset->nqs_ifcq, head, tail, count, bytes, false, &dropped); } else { - error = ifnet_enqueue_pkt_chain(ifp, head, tail, + error = ifnet_enqueue_pkt_chain(ifp, ifp->if_snd, head, tail, count, bytes, false, &dropped); } if (__improbable(dropped)) { @@ -230,10 +223,10 @@ netif_gso_send(struct ifnet *ifp, struct __kern_packet *head, dropped = false; if (qset != NULL) { pkt->pkt_qset_idx = qset->nqs_idx; - err = ifnet_enqueue_ifcq_pkt(ifp, qset->nqs_ifcq, + err = ifnet_enqueue_pkt(ifp, qset->nqs_ifcq, pkt, false, &dropped); } else { - err = ifnet_enqueue_pkt(ifp, pkt, false, &dropped); + err = ifnet_enqueue_pkt(ifp, ifp->if_snd, pkt, false, &dropped); } if (error == 0 && __improbable(err != 0)) { error = err; @@ -411,6 +404,12 @@ netif_gso_tcp_segment_mbuf(struct mbuf *m, struct ifnet *ifp, KPKTQ_INIT(&pktq_seg); n_bytes = total_len + (state->hlen * (n_pkts - 1)); + if (m->m_pkthdr.pkt_ext_flags & PKTF_EXT_QSET_ID_VALID) { + pkt_chain_head->pkt_pflags |= PKT_F_PRIV_HAS_QSET_ID; + pkt_chain_head->pkt_priv = + __unsafe_forge_single(void *, m->m_pkthdr.pkt_mpriv_qsetid); + } + error = netif_gso_send(ifp, pkt_chain_head, pkt_chain_tail, n_pkts, n_bytes); @@ -771,9 +770,9 @@ netif_gso_dispatch(struct ifnet *ifp, struct mbuf *m) void netif_gso_init(void) { - _CASSERT(CSUM_TO_GSO(~(CSUM_TSO_IPV4 | CSUM_TSO_IPV6)) == GSO_NONE); - _CASSERT(CSUM_TO_GSO(CSUM_TSO_IPV4) == GSO_TCP4); - _CASSERT(CSUM_TO_GSO(CSUM_TSO_IPV6) == GSO_TCP6); + static_assert(CSUM_TO_GSO(~(CSUM_TSO_IPV4 | CSUM_TSO_IPV6)) == GSO_NONE); + static_assert(CSUM_TO_GSO(CSUM_TSO_IPV4) == GSO_TCP4); + static_assert(CSUM_TO_GSO(CSUM_TSO_IPV6) == GSO_TCP6); netif_gso_functions[GSO_NONE] = nx_netif_host_output; netif_gso_functions[GSO_TCP4] = netif_gso_ipv4_tcp; netif_gso_functions[GSO_TCP6] = netif_gso_ipv6_tcp; diff --git a/bsd/skywalk/nexus/netif/nx_netif_host.c b/bsd/skywalk/nexus/netif/nx_netif_host.c index 8cba06998..322a1f5d9 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_host.c +++ b/bsd/skywalk/nexus/netif/nx_netif_host.c @@ -183,7 +183,7 @@ nx_netif_host_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) na->na_type == NA_NETIF_COMPAT_HOST); ASSERT(na->na_flags & NAF_HOST_ONLY); - SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s", na->na_name, + 
SK_DF(SK_VERB_NETIF, "na \"%s\" (%p) %s", na->na_name, SK_KVA(na), na_activate_mode2str(mode)); switch (mode) { @@ -255,10 +255,9 @@ nx_netif_host_krings_create(struct nexus_adapter *na, struct kern_channel *ch) NX_MBQ_NO_LIMIT, &nexus_mbq_lock_group, &nexus_lock_attr); SK_DF(SK_VERB_NETIF, - "na \"%s\" (0x%llx) initialized host kr \"%s\" " - "(0x%llx) krflags 0x%b", na->na_name, SK_KVA(na), - kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS); + "na \"%s\" (%p) initialized host kr \"%s\" " + "(%p) krflags 0x%x", na->na_name, SK_KVA(na), + kring->ckr_name, SK_KVA(kring), kring->ckr_flags); } } return ret; @@ -290,10 +289,10 @@ nx_netif_host_krings_delete(struct nexus_adapter *na, struct kern_channel *ch, kring = &NAKR(na, NR_RX)[i]; q = &kring->ckr_rx_queue; SK_DF(SK_VERB_NETIF, - "na \"%s\" (0x%llx) destroy host kr \"%s\" (0x%llx) " - "krflags 0x%b with qlen %u", na->na_name, SK_KVA(na), + "na \"%s\" (%p) destroy host kr \"%s\" (%p) " + "krflags 0x%x with qlen %u", na->na_name, SK_KVA(na), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, nx_mbq_len(q)); + nx_mbq_len(q)); nx_mbq_purge(q); if (!defunct) { nx_mbq_safe_destroy(q); @@ -444,12 +443,13 @@ nx_netif_host_output(struct ifnet *ifp, struct mbuf *m_chain) struct __kern_packet *pkt_chain_head, *pkt_chain_tail; struct netif_qset *__single qset = NULL; struct pktq pkt_q; - uint64_t qset_id; bool qset_id_valid = false; boolean_t pkt_drop = FALSE; uint32_t n_pkts = 0, n_bytes = 0; errno_t error = 0; + static_assert(sizeof(m_head->m_pkthdr.pkt_mpriv_qsetid) == sizeof(uint64_t)); + ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE); ASSERT(hostna->na_type == NA_NETIF_HOST); @@ -483,7 +483,7 @@ nx_netif_host_output(struct ifnet *ifp, struct mbuf *m_chain) } if (__improbable(!NA_IS_ACTIVE(hwna) || !NA_IS_ACTIVE(hostna))) { STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE); - SK_ERR("\"%s\" (0x%llx) not in skywalk mode anymore", + SK_ERR("\"%s\" (%p) not in skywalk mode anymore", hwna->na_name, SK_KVA(hwna)); error = ENXIO; pkt_drop = TRUE; @@ -496,9 +496,9 @@ nx_netif_host_output(struct ifnet *ifp, struct mbuf *m_chain) STATS_INC(nifs, NETIF_STATS_DROP_KRDROP_MODE); /* not a serious error, so no need to be chatty here */ SK_DF(SK_VERB_NETIF, - "kr \"%s\" (0x%llx) krflags 0x%b or %s in drop mode", + "kr \"%s\" (%p) krflags 0x%x or %s in drop mode", kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, ifp->if_xname); + ifp->if_xname); error = ENXIO; pkt_drop = TRUE; goto out; @@ -506,7 +506,7 @@ nx_netif_host_output(struct ifnet *ifp, struct mbuf *m_chain) if (__improbable(((unsigned)m_pktlen(m) + ifp->if_tx_headroom) > kring->ckr_max_pkt_len)) { /* too long for us */ STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); - SK_ERR("\"%s\" (0x%llx) from_host, drop packet size %u > %u", + SK_ERR("\"%s\" (%p) from_host, drop packet size %u > %u", hwna->na_name, SK_KVA(hwna), m_pktlen(m), kring->ckr_max_pkt_len); pkt_drop = TRUE; @@ -530,32 +530,32 @@ nx_netif_host_output(struct ifnet *ifp, struct mbuf *m_chain) nx_netif_pktap_output(ifp, af, kpkt); } } - if (NX_LLINK_PROV(nif->nif_nx) && - ifp->if_traffic_rule_count > 0 && - !qset_id_valid && - nxctl_inet_traffic_rule_find_qset_id_with_pkt(ifp->if_xname, - kpkt, &qset_id) == 0) { - qset_id_valid = true; - /* - * This always returns a qset because if the qset id - * is invalid the default qset is returned. 
- */ - qset = nx_netif_find_qset(nif, qset_id); - ASSERT(qset != NULL); + if (!qset_id_valid) { + if (m->m_pkthdr.pkt_ext_flags & PKTF_EXT_QSET_ID_VALID) { + kpkt->pkt_pflags |= PKT_F_PRIV_HAS_QSET_ID; + kpkt->pkt_priv = + __unsafe_forge_single(void *, m->m_pkthdr.pkt_mpriv_qsetid); + } + + qset = nx_netif_find_qset_with_pkt(ifp, kpkt); + if (qset != NULL) { + qset_id_valid = true; + } } + if (qset != NULL) { kpkt->pkt_qset_idx = qset->nqs_idx; } if (!netif_chain_enqueue_enabled(ifp)) { if (qset != NULL) { - error = ifnet_enqueue_ifcq_pkt(ifp, + error = ifnet_enqueue_pkt(ifp, qset->nqs_ifcq, kpkt, false, &pkt_drop); nx_netif_qset_release(&qset); } else { /* callee consumes packet */ - error = ifnet_enqueue_pkt(ifp, kpkt, false, &pkt_drop); + error = ifnet_enqueue_pkt(ifp, ifp->if_snd, kpkt, false, &pkt_drop); } if (pkt_drop) { @@ -589,13 +589,13 @@ out: pkt_chain_head = KPKTQ_FIRST(&pkt_q); pkt_chain_tail = KPKTQ_LAST(&pkt_q); if (qset != NULL) { - error = ifnet_enqueue_ifcq_pkt_chain(ifp, qset->nqs_ifcq, + error = ifnet_enqueue_pkt_chain(ifp, qset->nqs_ifcq, pkt_chain_head, pkt_chain_tail, n_pkts, n_bytes, false, &pkt_drop); nx_netif_qset_release(&qset); } else { /* callee consumes packet */ - error = ifnet_enqueue_pkt_chain(ifp, pkt_chain_head, pkt_chain_tail, - n_pkts, n_bytes, false, &pkt_drop); + error = ifnet_enqueue_pkt_chain(ifp, ifp->if_snd, pkt_chain_head, + pkt_chain_tail, n_pkts, n_bytes, false, &pkt_drop); } if (pkt_drop) { STATS_ADD(nifs, NETIF_STATS_TX_DROP_ENQ_AQM, n_pkts); @@ -668,7 +668,7 @@ nx_netif_mbuf_to_kpkt_log(struct __kern_packet *kpkt, uint32_t len, " hr %u l2 %u poff %u", len, kpkt->pkt_length, kpkt->pkt_headroom, kpkt->pkt_l2_len, poff); SK_DF(SK_VERB_HOST | SK_VERB_TX | SK_VERB_DUMP, "%s", - sk_dump("buf", baddr, pkt_len, 128, NULL, 0)); + sk_dump("buf", baddr, pkt_len, 128)); } #endif /* SK_LOG */ @@ -700,8 +700,8 @@ nx_netif_mbuf_to_kpkt(struct nexus_adapter *na, struct mbuf *m) if (__improbable(ph == 0)) { STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_PKT); SK_DF(SK_VERB_MEM, - "%s(%d) pp \"%s\" (0x%llx) has no more " - "packet for %s", sk_proc_name_address(current_proc()), + "%s(%d) pp \"%s\" (%p) has no more " + "packet for %s", sk_proc_name(current_proc()), sk_proc_pid(current_proc()), pp->pp_name, SK_KVA(pp), if_name(na->na_ifp)); return NULL; diff --git a/bsd/skywalk/nexus/netif/nx_netif_llink.c b/bsd/skywalk/nexus/netif/nx_netif_llink.c index e0854bb71..ebe6f0664 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_llink.c +++ b/bsd/skywalk/nexus/netif/nx_netif_llink.c @@ -134,7 +134,7 @@ nx_netif_qset_alloc(uint8_t nrxqs, uint8_t ntxqs) { struct netif_qset *qset; - _CASSERT(sizeof(struct netif_queue) % sizeof(uint64_t) == 0); + static_assert(sizeof(struct netif_queue) % sizeof(uint64_t) == 0); qset = sk_alloc_type_header_array(struct netif_qset, struct netif_queue, nrxqs + ntxqs, Z_WAITOK | Z_NOFAIL, nx_netif_tag_qset); @@ -264,14 +264,14 @@ nx_netif_qset_init(struct netif_qset *qset, struct netif_llink *llink, nx_netif_qset_setup_ifclassq(llink, qset); } - for (i = 0; i < qset->nqs_num_rx_queues; i++) { nx_netif_driver_queue_init(qset, NETIF_QSET_RX_QUEUE(qset, i), KPKT_SC_UNSPEC, true); } - if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) { + if (ifp->if_output_sched_model & IFNET_SCHED_DRIVER_MANGED_MODELS) { VERIFY(qset->nqs_num_tx_queues == _NETIF_QSET_MAX_TXQS); + VERIFY(IFNET_MODEL_IS_VALID(ifp->if_output_sched_model)); for (i = 0; i < qset->nqs_num_tx_queues; i++) { nx_netif_driver_queue_init(qset, NETIF_QSET_TX_QUEUE(qset, i), svc[i], 
false); @@ -465,13 +465,17 @@ nx_netif_llink_add(struct nx_netif *nif, llink = nx_netif_llink_create_locked(nif, llink_init); lck_rw_unlock_exclusive(&nif->nif_llink_lock); VERIFY(llink != NULL); + ASSERT((llink->nll_flags & NETIF_LLINK_FLAG_DEFAULT) == 0); err = nx_netif_llink_ext_init_queues(nif->nif_nx, llink); if (err != 0) { lck_rw_lock_exclusive(&nif->nif_llink_lock); nx_netif_llink_destroy_locked(nif, &llink); lck_rw_unlock_exclusive(&nif->nif_llink_lock); } else { - /* increment reference for the caller */ + /* + * Increment reference to keep the same pattern as default llink + * refcnt is 2 after this retain. + */ nx_netif_llink_retain(llink); *pllink = llink; } @@ -483,13 +487,14 @@ nx_netif_llink_remove(struct nx_netif *nif, kern_nexus_netif_llink_id_t llink_id) { bool llink_found = false; - struct netif_llink *__single llink; + struct netif_llink *__single llink, *__single llink_tmp; struct netif_stats *nifs = &nif->nif_stats; lck_rw_lock_exclusive(&nif->nif_llink_lock); STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) { if (llink->nll_link_id == llink_id) { llink_found = true; + llink_tmp = llink; break; } } @@ -499,8 +504,10 @@ nx_netif_llink_remove(struct nx_netif *nif, DTRACE_SKYWALK1(not__found, uint64_t, llink_id); return ENOENT; } - nx_netif_llink_ext_fini_queues(nif->nif_nx, llink); + ASSERT((llink_tmp->nll_flags & NETIF_LLINK_FLAG_DEFAULT) == 0); + nx_netif_llink_ext_fini_queues(nif->nif_nx, llink_tmp); lck_rw_lock_exclusive(&nif->nif_llink_lock); + nx_netif_llink_release(&llink_tmp); nx_netif_llink_destroy_locked(nif, &llink); lck_rw_unlock_exclusive(&nif->nif_llink_lock); return 0; @@ -596,7 +603,7 @@ netif_qset_enqueue(struct netif_qset *qset, bool chain, netif_ifp_inc_traffic_class_out_pkt(ifp, pkt_chain->pkt_svc_class, cnt, bytes); - err = ifnet_enqueue_pkt_chain(ifp, pkt_chain, tail, cnt, + err = ifnet_enqueue_pkt_chain(ifp, qset->nqs_ifcq, pkt_chain, tail, cnt, bytes, false, &pkt_drop); if (__improbable(err != 0)) { if ((err == EQFULL || err == EQSUSPENDED)) { @@ -616,7 +623,7 @@ netif_qset_enqueue(struct netif_qset *qset, bool chain, netif_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class, 1, pkt->pkt_length); - err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop); + err = ifnet_enqueue_pkt(ifp, qset->nqs_ifcq, pkt, false, &pkt_drop); if (__improbable(err != 0)) { if ((err == EQFULL || err == EQSUSPENDED)) { (*flowctl)++; @@ -780,6 +787,38 @@ def_qset: return qset; } +struct netif_qset * +nx_netif_find_qset_with_pkt(struct ifnet *ifp, struct __kern_packet *pkt) +{ + struct nx_netif *nif = NA(ifp)->nifna_netif; + struct netif_qset *__single qset = NULL; + uint64_t qset_id; + + if (NX_LLINK_PROV(nif->nif_nx)) { + /* + * Note: ifp can have either eth traffc rules or inet traffc rules + * and not both. 
+ */ + if (ifp->if_eth_traffic_rule_count) { + if (__probable(pkt->pkt_pflags & PKT_F_PRIV_HAS_QSET_ID)) { + qset = nx_netif_find_qset(nif, (uint64_t) pkt->pkt_priv); + ASSERT(qset != NULL); + } else if (nxctl_eth_traffic_rule_find_qset_id_with_pkt( + ifp->if_xname, pkt, &qset_id) == 0) { + qset = nx_netif_find_qset(nif, qset_id); + ASSERT(qset != NULL); + } + } else if (ifp->if_inet_traffic_rule_count > 0 && + nxctl_inet_traffic_rule_find_qset_id_with_pkt( + ifp->if_xname, pkt, &qset_id) == 0) { + qset = nx_netif_find_qset(nif, qset_id); + ASSERT(qset != NULL); + } + } + + return qset; +} + void nx_netif_llink_init(struct nx_netif *nif) { @@ -873,10 +912,6 @@ nx_netif_validate_llink_config(struct kern_nexus_netif_llink_init *init, SK_ERR("has more than one default qset"); return EINVAL; } - if (qsinit[i].nlqi_num_rxqs == 0) { - SK_ERR("num_rxqs == 0"); - return EINVAL; - } has_default_qset = true; } if (qsinit[i].nlqi_num_txqs == 0) { @@ -965,7 +1000,7 @@ nx_netif_llink_ext_init_queues(struct kern_nexus *nx, struct netif_llink *llink) qset->nqs_idx, qset->nqs_id, qset, &qset->nqs_ctx); if (err != 0) { STATS_INC(nifs, NETIF_STATS_LLINK_QSET_INIT_FAIL); - SK_ERR("nx: 0x%llx, qset: %d, qset init err %d", + SK_ERR("nx: %p, qset: %d, qset init err %d", SK_KVA(nx), qset->nqs_idx, err); goto out; } @@ -979,7 +1014,7 @@ nx_netif_llink_ext_init_queues(struct kern_nexus *nx, struct netif_llink *llink) i, false, drvq, &drvq->nq_ctx); if (err != 0) { STATS_INC(nifs, NETIF_STATS_LLINK_RXQ_INIT_FAIL); - SK_ERR("nx: 0x%llx qset: %d queue_init err %d", + SK_ERR("nx: %p qset: %d queue_init err %d", SK_KVA(nx), qset->nqs_idx, err); goto out; } @@ -993,7 +1028,7 @@ nx_netif_llink_ext_init_queues(struct kern_nexus *nx, struct netif_llink *llink) i, true, drvq, &drvq->nq_ctx); if (err != 0) { STATS_INC(nifs, NETIF_STATS_LLINK_TXQ_INIT_FAIL); - SK_ERR("nx: 0x%llx qset: %d queue_init err %d", + SK_ERR("nx: %p qset: %d queue_init err %d", SK_KVA(nx), qset->nqs_idx, err); goto out; } diff --git a/bsd/skywalk/nexus/netif/nx_netif_mit.c b/bsd/skywalk/nexus/netif/nx_netif_mit.c index c1d441a0c..f684b78a9 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_mit.c +++ b/bsd/skywalk/nexus/netif/nx_netif_mit.c @@ -30,8 +30,9 @@ #include #include #include -#include #include +#include +#include extern kern_return_t thread_terminate(thread_t); @@ -161,8 +162,7 @@ nx_netif_mit_init(struct nx_netif *nif, const struct ifnet *ifp, char oid_name_buf[24]; const char *__null_terminated oid_name = NULL; - _CASSERT(sizeof(mit_cfg_tbl_native_cellular) <= - sizeof(((struct nx_netif_mit *)0)->mit_tbl)); + static_assert(sizeof(mit_cfg_tbl_native_cellular) <= sizeof(((struct nx_netif_mit *)0)->mit_tbl)); lck_spin_init(&mit->mit_lock, kr->ckr_qlock_group, &channel_lock_attr); @@ -298,7 +298,7 @@ nx_netif_mit_init(struct nx_netif *nif, const struct ifnet *ifp, MIT_ADD_SKOID(2); MIT_ADD_SKOID(3); MIT_ADD_SKOID(4); - _CASSERT(NETIF_MIT_CFG_TBL_MAX_CFG == 5); + static_assert(NETIF_MIT_CFG_TBL_MAX_CFG == 5); #endif /* !DEVELOPMENT && !DEBUG */ } @@ -830,7 +830,7 @@ nx_netif_mit_stats(struct __kern_channel_ring *kr, uint64_t pkts, } SK_RDF(SK_VERB_NETIF_MIT, 2, "%s [%u]: pavg %u bavg %u " - "delay %llu usec", mit->mit_name, mit->mit_cfg_idx, + "delay %u usec", mit->mit_name, mit->mit_cfg_idx, mit->mit_packets_avg, mit->mit_bytes_avg, (mode == MIT_MODE_ADVANCED_STATIC ? 
0 : (mit->mit_tbl[mit->mit_cfg_idx].cfg_ival))); @@ -855,7 +855,7 @@ nx_netif_mit_stats(struct __kern_channel_ring *kr, uint64_t pkts, ASSERT(cfg_idx < mit->mit_cfg_idx_max); SK_DF(SK_VERB_NETIF_MIT, "%s [%u->%u]: pavg %u " - "bavg %u [mode %u->%u, delay %llu->%llu usec]", + "bavg %u [mode %u->%u, delay %u->%u usec]", mit->mit_name, mit->mit_cfg_idx, cfg_idx, mit->mit_packets_avg, mit->mit_bytes_avg, mit->mit_mode, mode, diff --git a/bsd/skywalk/nexus/netif/nx_netif_netagent.c b/bsd/skywalk/nexus/netif/nx_netif_netagent.c index ebb9dc426..8eddfccbc 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_netagent.c +++ b/bsd/skywalk/nexus/netif/nx_netif_netagent.c @@ -97,13 +97,13 @@ get_ipv6_ula(struct in6_addr *addr) } while (ipv6_ula_interface_id == 0); /* Return the generated address */ - _CASSERT(sizeof(buf) == sizeof(struct in6_addr)); + static_assert(sizeof(buf) == sizeof(struct in6_addr)); bcopy(buf, addr, sizeof(struct in6_addr)); #if SK_LOG char addrbuf[MAX_IPv6_STR_LEN]; SK_DF(SK_VERB_NETIF, "generated IPv6 address: %s", - inet_ntop(AF_INET6, addr, addrbuf, sizeof(addrbuf))); + sk_ntop(AF_INET6, addr, addrbuf, sizeof(addrbuf))); #endif /* SK_LOG */ } @@ -316,7 +316,7 @@ nx_netif_netagent_flow_bind(struct nx_netif *nif, struct nx_flow_req *nfr) nfr->nfr_proc = NULL; proc_rele(p); SK_ERR("%s(%d) failed to bind flow_uuid %s to a " - "nx_port (err %d)", sk_proc_name_address(p), + "nx_port (err %d)", sk_proc_name(p), pid, sk_uuid_unparse(nfr->nfr_flow_uuid, uuidstr), err); return err; @@ -351,7 +351,7 @@ static int nx_netif_netagent_check_flags(struct nx_netif *nif, struct nx_flow_req *nfr, boolean_t add) { - uint16_t flags = nfr->nfr_flags; + uint32_t flags = nfr->nfr_flags; if ((nif->nif_agent_flags & NETIF_AGENT_FLAG_ADDED) == 0) { SK_ERR("no agent added"); @@ -546,10 +546,10 @@ nx_netif_netagent_flow_add(struct nx_netif *nif, struct nx_flow_req *nfr) SK_DF(SK_VERB_NETIF, "flow type: IPv6 ULA"); SK_DF(SK_VERB_NETIF, "IPv6 local: %s", - inet_ntop(AF_INET6, &nfr->nfr_saddr.sin6.sin6_addr, + sk_ntop(AF_INET6, &nfr->nfr_saddr.sin6.sin6_addr, local, sizeof(local))); SK_DF(SK_VERB_NETIF, "IPv6 remote: %s", - inet_ntop(AF_INET6, &nfr->nfr_daddr.sin6.sin6_addr, + sk_ntop(AF_INET6, &nfr->nfr_daddr.sin6.sin6_addr, remote, sizeof(remote))); } #endif /* SK_LOG */ @@ -664,7 +664,7 @@ nx_netif_netagent_handle_interpose_flow_add(struct nx_netif *nif, message = necp_create_nexus_assign_message(nif->nif_nx->nx_uuid, nfr.nfr_nx_port, nfr.nfr_bind_key, sizeof(nfr.nfr_bind_key), - NULL, NULL, NULL, 0, NULL, &len); + NULL, NULL, NULL, 0, NULL, 0, &len); if (message == NULL) { (void) nx_netif_netagent_flow_del(nif, &nfr); return ENOMEM; @@ -699,7 +699,7 @@ nx_netif_netagent_handle_custom_ether_flow_add(struct nx_netif *nif, message = necp_create_nexus_assign_message(nif->nif_nx->nx_uuid, nfr.nfr_nx_port, nfr.nfr_bind_key, sizeof(nfr.nfr_bind_key), - NULL, NULL, &nfr.nfr_etheraddr, 0, NULL, &len); + NULL, NULL, &nfr.nfr_etheraddr, 0, NULL, 0, &len); if (message == NULL) { (void) nx_netif_netagent_flow_del(nif, &nfr); return ENOMEM; @@ -776,7 +776,7 @@ nx_netif_netagent_handle_ipv6_ula_flow_add(struct nx_netif *nif, message = necp_create_nexus_assign_message( zero_nx_uuid, NEXUS_PORT_ANY, NULL, 0, &local_endpoint, NULL, - &nfr.nfr_etheraddr, 0, NULL, &len); + &nfr.nfr_etheraddr, 0, NULL, 0, &len); } else { bzero(&remote_endpoint, sizeof(remote_endpoint)); SOCKADDR_COPY(&nfr.nfr_daddr.sin6, &remote_endpoint.u.sin6, @@ -785,7 +785,7 @@ nx_netif_netagent_handle_ipv6_ula_flow_add(struct nx_netif *nif, message = 
necp_create_nexus_assign_message( nif->nif_nx->nx_uuid, nfr.nfr_nx_port, nfr.nfr_bind_key, sizeof(nfr.nfr_bind_key), &local_endpoint, - &remote_endpoint, &nfr.nfr_etheraddr, 0, NULL, &len); + &remote_endpoint, &nfr.nfr_etheraddr, 0, NULL, 0, &len); } if (message == NULL) { /* This is a no-op for the listener flow */ @@ -885,8 +885,8 @@ nx_netif_agent_register(struct nx_netif *nif, uint32_t features) struct netagent_nexus_agent agent; int err = 0; - _CASSERT(FLOWADV_IDX_NONE == UINT32_MAX); - _CASSERT(NECP_FLOWADV_IDX_INVALID == FLOWADV_IDX_NONE); + static_assert(FLOWADV_IDX_NONE == UINT32_MAX); + static_assert(NECP_FLOWADV_IDX_INVALID == FLOWADV_IDX_NONE); if (!nif_netagent) { return ENOTSUP; diff --git a/bsd/skywalk/nexus/netif/nx_netif_poll.c b/bsd/skywalk/nexus/netif/nx_netif_poll.c index 3b22e473d..75ff316e4 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_poll.c +++ b/bsd/skywalk/nexus/netif/nx_netif_poll.c @@ -385,7 +385,7 @@ netif_rxpoll_compat_thread_cont(void *v, wait_result_t wres) * else hold an IO refcnt to prevent the interface * from being detached (will be released below.) */ - if (!ifnet_is_attached(ifp, 1)) { + if (!ifnet_get_ioref(ifp)) { lck_mtx_lock_spin(&ifp->if_poll_lock); break; } @@ -447,7 +447,7 @@ netif_rxpoll_compat_thread_cont(void *v, wait_result_t wres) if (ts != NULL) { uint64_t interval; - _CASSERT(IF_RXPOLL_INTERVALTIME_MIN >= (1ULL * 1000)); + static_assert(IF_RXPOLL_INTERVALTIME_MIN >= (1ULL * 1000)); net_timerusec(ts, &interval); ASSERT(interval <= UINT32_MAX); clock_interval_to_deadline((uint32_t)interval, NSEC_PER_USEC, diff --git a/bsd/skywalk/nexus/netif/nx_netif_util.c b/bsd/skywalk/nexus/netif/nx_netif_util.c index e5d46741e..0fb921b2c 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_util.c +++ b/bsd/skywalk/nexus/netif/nx_netif_util.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Apple Inc. All rights reserved. + * Copyright (c) 2019-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,79 +73,55 @@ SK_NO_INLINE_ATTRIBUTE static void fill_vlan_info(struct __kern_packet *fpkt) { - uint8_t *buf; - struct ether_vlan_header *evl; + struct mbuf *m; + struct __kern_packet *pkt; uint16_t tag; - boolean_t tag_in_pkt = FALSE; - if (fpkt->pkt_length < sizeof(*evl)) { - DTRACE_SKYWALK2(bad__len, struct __kern_packet *, fpkt, - uint32_t, fpkt->pkt_length); - return; - } - MD_BUFLET_ADDR_ABS(fpkt, buf); - buf += fpkt->pkt_headroom; - evl = (struct ether_vlan_header *)(void *)buf; - if (ntohs(evl->evl_encap_proto) == ETHERTYPE_VLAN) { - tag = ntohs(evl->evl_tag); - tag_in_pkt = TRUE; - DTRACE_SKYWALK1(tag__in__pkt, uint16_t, tag); - } else { - struct mbuf *m; - struct __kern_packet *pkt; + /* + * A filter packet must always have an mbuf or a packet + * attached. + */ + VERIFY((fpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0 || + (fpkt->pkt_pflags & PKT_F_PKT_DATA) != 0); + + if ((fpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) { + m = fpkt->pkt_mbuf; + VERIFY(m != NULL); + if (mbuf_get_vlan_tag(m, &tag) != 0) { + return; + } + DTRACE_SKYWALK1(tag__from__mbuf, uint16_t, tag); + kern_packet_set_vlan_tag(SK_PKT2PH(fpkt), tag); + } else if ((fpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) { + pkt = fpkt->pkt_pkt; + VERIFY(pkt != NULL); /* - * A filter packet must always have an mbuf or a packet - * attached. + * The attached packet could have an mbuf attached + * if it came from the compat path. 
*/ - VERIFY((fpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0 || - (fpkt->pkt_pflags & PKT_F_PKT_DATA) != 0); - - if ((fpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) { + if ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) { m = fpkt->pkt_mbuf; VERIFY(m != NULL); if (mbuf_get_vlan_tag(m, &tag) != 0) { return; } - DTRACE_SKYWALK1(tag__from__mbuf, uint16_t, tag); - } else if ((fpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) { - pkt = fpkt->pkt_pkt; - VERIFY(pkt != NULL); - - /* - * The attached packet could have an mbuf attached - * if it came from the compat path. - */ - if ((pkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) { - m = fpkt->pkt_mbuf; - VERIFY(m != NULL); - if (mbuf_get_vlan_tag(m, &tag) != 0) { - return; - } - DTRACE_SKYWALK1(tag__from__inner__mbuf, - uint16_t, tag); - } else { - /* - * XXX - * No native driver today fills in the vlan tag - * metadata. This code will work when the driver - * adds support for this. - */ - VERIFY((pkt->pkt_pflags & PKT_F_PKT_DATA) == 0); - if (__packet_get_vlan_tag(SK_PKT2PH(pkt), &tag, - NULL) != 0) { - return; - } - DTRACE_SKYWALK1(tag__from__pkt, uint16_t, tag); - } + DTRACE_SKYWALK1(tag__from__inner__mbuf, + uint16_t, tag); } else { - panic("filter packet has no mbuf or packet attached: " - "pkt_pflags 0x%llx\n", fpkt->pkt_pflags); - /* NOTREACHED */ - __builtin_unreachable(); + VERIFY((pkt->pkt_pflags & PKT_F_PKT_DATA) == 0); + if (__packet_get_vlan_tag(SK_PKT2PH(pkt), &tag) != 0) { + return; + } + DTRACE_SKYWALK1(tag__from__pkt, uint16_t, tag); } + kern_packet_set_vlan_tag(SK_PKT2PH(fpkt), tag); + } else { + panic("filter packet has no mbuf or packet attached: " + "pkt_pflags 0x%llx\n", fpkt->pkt_pflags); + /* NOTREACHED */ + __builtin_unreachable(); } - kern_packet_set_vlan_tag(SK_PKT2PH(fpkt), tag, tag_in_pkt); } static struct __kern_packet * @@ -684,6 +660,7 @@ nx_netif_pkt_to_pkt(struct nexus_netif_adapter *nifna, } if (type == NR_TX) { + dpkt->pkt_svc_class = pkt->pkt_svc_class; dpkt->pkt_flowsrc_type = pkt->pkt_flowsrc_type; dpkt->pkt_flow_token = pkt->pkt_flow_token; dpkt->pkt_policy_id = pkt->pkt_policy_id; diff --git a/bsd/skywalk/nexus/netif/nx_netif_vp.c b/bsd/skywalk/nexus/netif/nx_netif_vp.c index a082c67af..221be1d85 100644 --- a/bsd/skywalk/nexus/netif/nx_netif_vp.c +++ b/bsd/skywalk/nexus/netif/nx_netif_vp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Apple Inc. All rights reserved. + * Copyright (c) 2019-2025 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -223,10 +223,9 @@ netif_hwna_rx_get_pkts(struct __kern_channel_ring *ring, struct proc *p, (ring->ckr_flags & CKRF_HOST) != 0)); if (err != 0) { SK_DF(SK_VERB_VP, - "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b " + "hwna \"%s\" (%p) kr \"%s\" (%p) krflags 0x%x " "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)), - ring->ckr_name, SK_KVA(ring), ring->ckr_flags, - CKRF_BITS, err); + ring->ckr_name, SK_KVA(ring), ring->ckr_flags, err); STATS_INC(nifs, NETIF_STATS_VP_KR_ENTER_FAIL); return err; } @@ -245,7 +244,7 @@ netif_hwna_rx_get_pkts(struct __kern_channel_ring *ring, struct proc *p, ktail = ring->ckr_ktail; if (__improbable(ring->ckr_khead == ktail)) { SK_DF(SK_VERB_VP, - "spurious wakeup on hwna %s (0x%llx)", KRNA(ring)->na_name, + "spurious wakeup on hwna %s (%p)", KRNA(ring)->na_name, SK_KVA(KRNA(ring))); STATS_INC(nifs, NETIF_STATS_VP_SPURIOUS_NOTIFY); err = ENOENT; @@ -277,37 +276,7 @@ out: } int -netif_llw_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p, - uint32_t flags) -{ -#pragma unused (p, flags) - struct nexus_adapter *hwna; - uint32_t count; - int i, err; - - hwna = KRNA(ring); - count = na_get_nslots(hwna, NR_RX); - err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count); - if (__improbable(err != 0)) { - SK_ERR("nx_rx_sync_packets failed: %d", err); - DTRACE_SKYWALK2(rx__sync__packets__failed, - struct __kern_channel_ring *, ring, int, err); - return err; - } - DTRACE_SKYWALK1(chain__count, uint32_t, count); - for (i = 0; i < count; i++) { - struct __kern_packet *pkt_chain; - - pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]); - ASSERT(pkt_chain != NULL); - (void) nx_netif_demux(NIFNA(KRNA(ring)), pkt_chain, NULL, - NULL, NETIF_FLOW_SOURCE); - } - return 0; -} - -int -netif_llw_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p, +netif_llw_rx_notify(struct __kern_channel_ring *ring, struct proc *p, uint32_t flags) { int err; @@ -321,6 +290,28 @@ netif_llw_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p, NULL, NETIF_FLOW_SOURCE); } +static void +netif_change_pending(struct nx_netif *nif) +{ + SK_LOCK_ASSERT_HELD(); + while ((nif->nif_flags & NETIF_FLAG_CHANGE_PENDING) != 0) { + DTRACE_SKYWALK1(change__pending__wait, struct nx_netif *, nif); + (void) msleep(&nif->nif_flags, &sk_lock, (PZERO - 1), + __func__, NULL); + DTRACE_SKYWALK1(change__pending__wake, struct nx_netif *, nif); + } + nif->nif_flags |= NETIF_FLAG_CHANGE_PENDING; +} + +static void +netif_change_done(struct nx_netif *nif) +{ + SK_LOCK_ASSERT_HELD(); + ASSERT((nif->nif_flags & NETIF_FLAG_CHANGE_PENDING) != 0); + nif->nif_flags &= ~NETIF_FLAG_CHANGE_PENDING; + wakeup(&nif->nif_flags); +} + static errno_t netif_hwna_setup(struct nx_netif *nif) { @@ -331,10 +322,17 @@ netif_hwna_setup(struct nx_netif *nif) SK_LOCK_ASSERT_HELD(); ASSERT(NETIF_IS_LOW_LATENCY(nif)); + /* + * Because sk_lock is released within some functions below, we need + * this extra synchronization to ensure that netif_hwna_setup/ + * netif_hwna_teardown can run atomically. 
+ */ + netif_change_pending(nif); if (nif->nif_hw_ch != NULL) { nif->nif_hw_ch_refcnt++; SK_DF(SK_VERB_VP, "%s: hw channel already open, refcnt %d", if_name(nif->nif_ifp), nif->nif_hw_ch_refcnt); + netif_change_done(nif); return 0; } ASSERT(nif->nif_hw_ch_refcnt == 0); @@ -347,17 +345,19 @@ netif_hwna_setup(struct nx_netif *nif) err = 0; ch = ch_open_special(nx, &chr, FALSE, &err); if (ch == NULL) { - SK_ERR("%s: failed to open nx 0x%llx (err %d)", + SK_ERR("%s: failed to open nx %p (err %d)", if_name(nif->nif_ifp), SK_KVA(nx), err); + netif_change_done(nif); return err; } netif_hwna_set_mode(ch->ch_na, NETIF_MODE_LLW, NULL); na_start_spec(nx, ch); nif->nif_hw_ch_refcnt = 1; nif->nif_hw_ch = ch; - SK_DF(SK_VERB_VP, "%s: hw channel opened 0x%llx, %s:%s", + SK_DF(SK_VERB_VP, "%s: hw channel opened %p, %s:%s", if_name(nif->nif_ifp), SK_KVA(ch), NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name); + netif_change_done(nif); return 0; } @@ -370,12 +370,14 @@ netif_hwna_teardown(struct nx_netif *nif) SK_LOCK_ASSERT_HELD(); ASSERT(NETIF_IS_LOW_LATENCY(nif)); ASSERT(ch != NULL); + netif_change_pending(nif); if (--nif->nif_hw_ch_refcnt > 0) { SK_DF(SK_VERB_VP, "%s: hw channel still open, refcnt %d", if_name(nif->nif_ifp), nif->nif_hw_ch_refcnt); + netif_change_done(nif); return; } - SK_DF(SK_VERB_VP, "%s: hw channel closing 0x%llx, %s:%s", + SK_DF(SK_VERB_VP, "%s: hw channel closing %p, %s:%s", if_name(nif->nif_ifp), SK_KVA(ch), NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name); @@ -387,6 +389,7 @@ netif_hwna_teardown(struct nx_netif *nif) SK_DF(SK_VERB_VP, "%s: hw channel closed, %s:%s", if_name(nif->nif_ifp), NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name); + netif_change_done(nif); } static int @@ -468,7 +471,7 @@ netif_vp_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) } else { err = netif_vp_na_activate_off(na); } - SK_DF(SK_VERB_VP, "na \"%s\" (0x%llx) %s err %d", na->na_name, + SK_DF(SK_VERB_VP, "na \"%s\" (%p) %s err %d", na->na_name, SK_KVA(na), na_activate_mode2str(mode), err); return err; } @@ -602,7 +605,7 @@ netif_vp_send_pkt_chain_common(struct nexus_netif_adapter *dev_nifna, pkt = next; continue; } - err = ifnet_enqueue_pkt(ifp, p, FALSE, &drop); + err = ifnet_enqueue_pkt(ifp, ifp->if_snd, p, FALSE, &drop); } if (err != 0) { SK_ERR("enqueue failed: %d", err); @@ -948,7 +951,7 @@ netif_vp_na_dtor(struct nexus_adapter *na) nifna->nifna_netif = NULL; } NETIF_WUNLOCK(nif); - SK_DF(SK_VERB_VP, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na)); + SK_DF(SK_VERB_VP, "na \"%s\" (%p)", na->na_name, SK_KVA(na)); } int diff --git a/bsd/skywalk/nexus/nexus.c b/bsd/skywalk/nexus/nexus.c index c00df63a3..e2b9af06a 100644 --- a/bsd/skywalk/nexus/nexus.c +++ b/bsd/skywalk/nexus/nexus.c @@ -31,6 +31,8 @@ #include #include +#include + static uint32_t disable_nxctl_check = 0; #if (DEVELOPMENT || DEBUG) SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check, @@ -210,7 +212,7 @@ nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid, #if SK_LOG uuid_string_t uuidstr; - SK_D("nxctl 0x%llx UUID %s", SK_KVA(nxctl), + SK_D("nxctl %p UUID %s", SK_KVA(nxctl), sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr)); #endif /* SK_LOG */ @@ -235,9 +237,9 @@ nxctl_close(struct nxctl *nxctl) #if SK_LOG uuid_string_t uuidstr; - SK_D("nxctl 0x%llx UUID %s flags 0x%b", SK_KVA(nxctl), + SK_D("nxctl %p UUID %s flags 0x%x", SK_KVA(nxctl), sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr), - nxctl->nxctl_flags, NEXUSCTLF_BITS); + nxctl->nxctl_flags); #endif /* SK_LOG */ if 
(!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) { @@ -809,10 +811,10 @@ nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt) ASSERT(nbr.nb_port != NEXUS_PORT_ANY); (void) sooptcopyout(sopt, &nbr, sizeof(nbr)); - SK_D("nexus 0x%llx nxb 0x%llx port %u flags 0x%b pid %d " - "(uniqueid %llu) exec_uuid %s key 0x%llx key_len %u", + SK_D("nexus %p nxb %p port %u flags 0x%x pid %d " + "(uniqueid %llu) exec_uuid %s key %p key_len %u", SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags, - NXBF_BITS, nxb->nxb_pid, nxb->nxb_uniqueid, + nxb->nxb_pid, nxb->nxb_uniqueid, sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr), (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0, nxb->nxb_key_len); @@ -959,7 +961,7 @@ nxb_alloc(zalloc_flags_t how) struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO); if (nxb) { - SK_DF(SK_VERB_MEM, "nxb 0x%llx ALLOC", SK_KVA(nxb)); + SK_DF(SK_VERB_MEM, "nxb %p ALLOC", SK_KVA(nxb)); } return nxb; } @@ -967,7 +969,7 @@ nxb_alloc(zalloc_flags_t how) void nxb_free(struct nxbind *nxb) { - SK_DF(SK_VERB_MEM, "nxb 0x%llx key 0x%llx FREE", SK_KVA(nxb), + SK_DF(SK_VERB_MEM, "nxb %p key %p FREE", SK_KVA(nxb), (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0); if (nxb->nxb_key != NULL) { @@ -1190,7 +1192,7 @@ nxctl_free(struct nxctl *nxctl) ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED)); kauth_cred_unref(&nxctl->nxctl_cred); lck_mtx_destroy(&nxctl->nxctl_lock, &nexus_lock_group); - SK_D("nxctl 0x%llx FREE", SK_KVA(nxctl)); + SK_D("nxctl %p FREE", SK_KVA(nxctl)); if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) { zfree(nxctl_zone, nxctl); } @@ -1268,9 +1270,7 @@ nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch, SK_LOCK_ASSERT_HELD(); LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); - /* monitor channels aren't externally visible/usable, so ignore */ - if ((ch->ch_info->cinfo_ch_mode & CHMODE_MONITOR) || - (ch->ch_flags & CHANF_EXT_SKIP) || + if ((ch->ch_flags & CHANF_EXT_SKIP) || (nxprov->nxprov_ext.nxpi_pre_connect == NULL || nxprov->nxprov_ext.nxpi_connected == NULL)) { return 0; @@ -1284,9 +1284,8 @@ nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch, err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx, ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx); if (err != 0) { - SK_D("ch 0x%llx flags %b nx 0x%llx pre_connect " - "error %d", SK_KVA(ch), ch->ch_flags, - CHANF_BITS, SK_KVA(nx), err); + SK_D("ch %p flags %x nx %p pre_connect " + "error %d", SK_KVA(ch), ch->ch_flags, SK_KVA(nx), err); ch->ch_ctx = NULL; goto done; } @@ -1309,13 +1308,13 @@ nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch, err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch); if (err != 0) { - SK_D("ch 0x%llx flags %b nx 0x%llx connected error %d", - SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx), err); + SK_D("ch %p flags %x nx %p connected error %d", + SK_KVA(ch), ch->ch_flags, SK_KVA(nx), err); goto done; } os_atomic_or(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed); - SK_D("ch 0x%llx flags %b nx 0x%llx connected", - SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx)); + SK_D("ch %p flags %x nx %p connected", + SK_KVA(ch), ch->ch_flags, SK_KVA(nx)); done: @@ -1368,8 +1367,8 @@ nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch) nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch); os_atomic_andnot(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed); - SK_D("ch 0x%llx flags %b nx 0x%llx disconnected", - SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx)); + SK_D("ch %p flags %x nx %p disconnected", + SK_KVA(ch), ch->ch_flags, 
SK_KVA(nx)); /* We're done with this channel */ ch->ch_ctx = NULL; @@ -1397,9 +1396,8 @@ nxprov_create_common(struct nxctl *nxctl, uint32_t pp_region_config_flags; int i; - _CASSERT(sizeof(*init) == sizeof(nxprov->nxprov_ext)); - _CASSERT(sizeof(*init) >= - sizeof(struct kern_nexus_netif_provider_init)); + static_assert(sizeof(*init) == sizeof(nxprov->nxprov_ext)); + static_assert(sizeof(*init) >= sizeof(struct kern_nexus_netif_provider_init)); SK_LOCK_ASSERT_HELD(); ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL); @@ -1487,7 +1485,7 @@ nxprov_create_common(struct nxctl *nxctl, #if SK_LOG uuid_string_t uuidstr; - SK_D("nxprov 0x%llx UUID %s", SK_KVA(nxprov), + SK_D("nxprov %p UUID %s", SK_KVA(nxprov), sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr)); #endif /* SK_LOG */ @@ -1525,7 +1523,6 @@ nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg, break; case NEXUS_TYPE_KERNEL_PIPE: /* only for kernel */ - case NEXUS_TYPE_MONITOR: /* invalid */ default: *err = EINVAL; goto done; @@ -1592,7 +1589,6 @@ nxprov_create_kern(struct nxctl *nxctl, break; case NEXUS_TYPE_USER_PIPE: /* only for userland */ - case NEXUS_TYPE_MONITOR: /* invalid */ default: *err = EINVAL; goto done; @@ -1651,9 +1647,9 @@ nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked) #if SK_LOG uuid_string_t uuidstr; - SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov), + SK_D("nxprov %p UUID %s flags 0x%x", SK_KVA(nxprov), sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr), - nxprov->nxprov_flags, NXPROVF_BITS); + nxprov->nxprov_flags); #endif /* SK_LOG */ if (nxprov->nxprov_flags & NXPROVF_CLOSED) { @@ -1698,9 +1694,9 @@ nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked) #if SK_LOG uuid_string_t uuidstr; - SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov), + SK_D("nxprov %p UUID %s flags 0x%x", SK_KVA(nxprov), sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr), - nxprov->nxprov_flags, NXPROVF_BITS); + nxprov->nxprov_flags); #endif /* SK_LOG */ ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED); @@ -1760,7 +1756,7 @@ nxprov_free(struct kern_nexus_provider *nxprov) nxprov_params_free(nxprov->nxprov_params); nxprov->nxprov_params = NULL; ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED)); - SK_DF(SK_VERB_MEM, "nxprov 0x%llx FREE", SK_KVA(nxprov)); + SK_DF(SK_VERB_MEM, "nxprov %p FREE", SK_KVA(nxprov)); zfree(nxprov_zone, nxprov); } @@ -1817,7 +1813,7 @@ nxprov_params_alloc(zalloc_flags_t how) void nxprov_params_free(struct nxprov_params *nxp) { - SK_DF(SK_VERB_MEM, "nxp 0x%llx FREE", SK_KVA(nxp)); + SK_DF(SK_VERB_MEM, "nxp %p FREE", SK_KVA(nxp)); zfree(nxprov_params_zone, nxp); } @@ -1969,7 +1965,7 @@ nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid, nx_retain_locked(nx); /* one for the caller */ #if SK_LOG - SK_D("nexus 0x%llx (%s:%s) UUID %s", SK_KVA(nx), + SK_D("nexus %p (%s:%s) UUID %s", SK_KVA(nx), nxdom_prov->nxdom_prov_dom->nxdom_name, nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr)); #endif /* SK_LOG */ @@ -2073,10 +2069,9 @@ nx_close(struct kern_nexus *nx, boolean_t locked) } else { #if SK_LOG uuid_string_t uuidstr; - SK_D("nexus 0x%llx (%s:%s) UUID %s flags 0x%b", SK_KVA(nx), + SK_D("nexus %p (%s:%s) UUID %s flags 0x%x", SK_KVA(nx), NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name, - sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, - NXF_BITS); + sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags); #endif /* SK_LOG */ if (STAILQ_EMPTY(&nx->nx_ch_head)) { @@ -2118,8 +2113,8 @@ nx_detach(struct kern_nexus *nx) #if SK_LOG 
uuid_string_t uuidstr; - SK_D("nexus 0x%llx UUID %s flags 0x%b", SK_KVA(nx), - sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, NXF_BITS); + SK_D("nexus %p UUID %s flags 0x%x", SK_KVA(nx), + sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags); #endif /* SK_LOG */ /* Caller must hold extra refs, on top of the two in reg/global lists */ @@ -2167,10 +2162,10 @@ nx_advisory_alloc(struct kern_nexus *nx, const char *name, /* -fbounds-safety: why do we need maddr? */ void *__sized_by(msize) maddr = NULL; - _CASSERT(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t)); - _CASSERT((sizeof(struct sk_nexusadv) + + static_assert(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t)); + static_assert((sizeof(struct sk_nexusadv) + sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ); - _CASSERT((sizeof(struct netif_nexus_advisory) + + static_assert((sizeof(struct netif_nexus_advisory) + sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ); ASSERT(nx->nx_adv.nxv_reg == NULL); ASSERT(nx->nx_adv.nxv_adv == NULL); @@ -2252,7 +2247,7 @@ nx_free(struct kern_nexus *nx) ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head)); lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group); - SK_DF(SK_VERB_MEM, "nexus 0x%llx FREE", SK_KVA(nx)); + SK_DF(SK_VERB_MEM, "nexus %p FREE", SK_KVA(nx)); zfree(nx_zone, nx); } @@ -2333,11 +2328,11 @@ nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch) if ((err = nxprov->nxprov_ext.nxpi_ring_init( nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX), &kring->ckr_ctx)) != 0) { - SK_D("ch 0x%llx flags %b nx 0x%llx kr \"%s\" " - "(0x%llx) krflags %b ring_init error %d", - SK_KVA(ch), ch->ch_flags, CHANF_BITS, - SK_KVA(nx), kring->ckr_name, SK_KVA(kring), - kring->ckr_flags, CKRF_BITS, err); + SK_D("ch %p flags %x nx %p kr \"%s\" " + "(%p) krflags %x ring_init error %d", + SK_KVA(ch), ch->ch_flags, SK_KVA(nx), + kring->ckr_name, SK_KVA(kring), + kring->ckr_flags, err); kring->ckr_ctx = NULL; undo = TRUE; break; @@ -2460,9 +2455,9 @@ nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring) ASSERT(&slot[i] <= kring->ckr_ksds_last); if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring, &slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) { - SK_D("nx 0x%llx kr \"%s\" (0x%llx) krflags %b slot %u " + SK_D("nx %p kr \"%s\" (%p) krflags %x slot %u " "slot_init error %d", SK_KVA(nx), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, i, err); + SK_KVA(kring), kring->ckr_flags, i, err); break; } /* we don't want this to be used by client, so verify here */ @@ -2580,7 +2575,7 @@ nx_port_find(struct kern_nexus *nx, nexus_port_t first, } } - SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d (err %d)", SK_KVA(nx), + SK_DF(SK_VERB_NXPORT, "nx %p nx_port %d (err %d)", SK_KVA(nx), (int)*nx_port, err); return err; @@ -2592,19 +2587,18 @@ nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow) ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX); nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports); struct nx_port_info *ports; - size_t limit; - nexus_port_size_t i, num_ports, old_num_ports; + nexus_port_size_t limit, i, num_ports, old_num_ports; bitmap_t *bmap; ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0); ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0); - _CASSERT((sizeof(*bmap) * 8) == NX_PORT_CHUNK); + static_assert((sizeof(*bmap) * 8) == NX_PORT_CHUNK); ASSERT(powerof2(dom_port_max)); ASSERT(dom_port_max % NX_PORT_CHUNK == 0); old_num_ports = nx->nx_num_ports; num_ports = nx->nx_num_ports + grow; - 
limit = P2ROUNDUP(dom_port_max, NX_PORT_CHUNK); + limit = (nexus_port_size_t)P2ROUNDUP(dom_port_max, NX_PORT_CHUNK); if (num_ports > limit) { SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)", nx->nx_num_ports, grow, num_ports, limit); @@ -2644,7 +2638,7 @@ nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow) nx->nx_ports = ports; nx->nx_num_ports = num_ports; - SK_DF(SK_VERB_NXPORT, "!!! nx 0x%llx ports %u/%u, %u ports added", + SK_DF(SK_VERB_NXPORT, "!!! nx %p ports %u/%u, %u ports added", SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow); return 0; @@ -2750,7 +2744,7 @@ done: (*na)->na_nx_port = nx_port; } - SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d, ports %u/%u (err %d)", + SK_DF(SK_VERB_NXPORT, "nx %p nx_port %d, ports %u/%u (err %d)", SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err); @@ -2793,7 +2787,7 @@ nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port) //XXX wshen0123@apple.com --- try to shrink bitmap & nx_ports ??? - SK_DF(SK_VERB_NXPORT, "--- nx 0x%llx nx_port %d, ports %u/%u", + SK_DF(SK_VERB_NXPORT, "--- nx %p nx_port %d, ports %u/%u", SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports); } @@ -2845,7 +2839,7 @@ nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port, done: SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT, - "+++ nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx), + "+++ nx %p nx_port %d, ports %u/%u (err %d)", SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err); return err; @@ -2921,7 +2915,7 @@ nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port) done: SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT, - "--- nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx), + "--- nx %p nx_port %d, ports %u/%u (err %d)", SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err); return err; @@ -3213,6 +3207,7 @@ populate_ring_entries(struct __kern_channel_ring *__counted_by(last)kring, nexus_channel_ring_entry *__counted_by(entry_count)entries, uint32_t NX_FB_ARG entry_count) { + uint64_t now = net_uptime(); ring_id_t i; nexus_channel_ring_entry_t scan; struct __kern_channel_ring *ring; @@ -3229,6 +3224,8 @@ populate_ring_entries(struct __kern_channel_ring *__counted_by(last)kring, sizeof(scan->ncre_user_stats)); } else { scan->ncre_stats = ring->ckr_stats; + scan->ncre_stats.crs_seconds_since_last_update = now - + scan->ncre_stats.crs_last_update_net_uptime; scan->ncre_user_stats = ring->ckr_usr_stats; } scan->ncre_error_stats = ring->ckr_err_stats; @@ -3242,9 +3239,6 @@ nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags) { uint32_t flags = 0; - flags |= (ch_mode & CHMODE_MONITOR_TX) ? SCHF_MONITOR_TX : 0; - flags |= (ch_mode & CHMODE_MONITOR_RX) ? SCHF_MONITOR_RX : 0; - flags |= (ch_mode & CHMODE_MONITOR_NO_COPY) ? SCHF_MONITOR_NO_COPY : 0; flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0; flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0; flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0; diff --git a/bsd/skywalk/nexus/nexus_adapter.c b/bsd/skywalk/nexus/nexus_adapter.c index 1da748113..4c520a02a 100644 --- a/bsd/skywalk/nexus/nexus_adapter.c +++ b/bsd/skywalk/nexus/nexus_adapter.c @@ -54,12 +54,12 @@ */ #include #include -#include #include #include #include #include #include +#include static int na_krings_use(struct kern_channel *); static void na_krings_unuse(struct kern_channel *); @@ -181,13 +181,12 @@ na_init(void) * the expected limit and that it's properly aligned. 
This * check may be adjusted in future as needed. */ - _CASSERT(sizeof(struct nexus_mdata) <= 32 && - IS_P2ALIGNED(sizeof(struct nexus_mdata), 8)); - _CASSERT(sizeof(struct nexus_mdata) <= sizeof(struct __user_quantum)); + static_assert(sizeof(struct nexus_mdata) <= 32 && IS_P2ALIGNED(sizeof(struct nexus_mdata), 8)); + static_assert(sizeof(struct nexus_mdata) <= sizeof(struct __user_quantum)); /* see comments on nexus_meta_type_t */ - _CASSERT(NEXUS_META_TYPE_MAX == 3); - _CASSERT(NEXUS_META_SUBTYPE_MAX == 3); + static_assert(NEXUS_META_TYPE_MAX == 3); + static_assert(NEXUS_META_SUBTYPE_MAX == 3); ASSERT(!__na_inited); @@ -333,7 +332,7 @@ na_krings_use(struct kern_channel *ch) enum txrx t; uint32_t i; - SK_DF(SK_VERB_NA | SK_VERB_RING, "na \"%s\" (0x%llx) grabbing tx [%u,%u) rx [%u,%u)", + SK_DF(SK_VERB_NA | SK_VERB_RING, "na \"%s\" (%p) grabbing tx [%u,%u) rx [%u,%u)", na->na_name, SK_KVA(na), ch->ch_first[NR_TX], ch->ch_last[NR_TX], ch->ch_first[NR_RX], ch->ch_last[NR_RX]); @@ -348,9 +347,9 @@ na_krings_use(struct kern_channel *ch) if ((kring->ckr_flags & CKRF_EXCLUSIVE) || (kring->ckr_users && excl)) { SK_DF(SK_VERB_NA | SK_VERB_RING, - "kr \"%s\" (0x%llx) krflags 0x%b is busy", + "kr \"%s\" (%p) krflags 0x%x is busy", kring->ckr_name, SK_KVA(kring), - kring->ckr_flags, CKRF_BITS); + kring->ckr_flags); return EBUSY; } } @@ -386,7 +385,7 @@ na_krings_unuse(struct kern_channel *ch) uint32_t i; SK_DF(SK_VERB_NA | SK_VERB_RING, - "na \"%s\" (0x%llx) releasing tx [%u, %u) rx [%u, %u)", + "na \"%s\" (%p) releasing tx [%u, %u) rx [%u, %u)", na->na_name, SK_KVA(na), ch->ch_first[NR_TX], ch->ch_last[NR_TX], ch->ch_first[NR_RX], ch->ch_last[NR_RX]); @@ -459,18 +458,11 @@ na_bind_channel(struct nexus_adapter *na, struct kern_channel *ch, if (ch_mode & CHMODE_EXCLUSIVE) { os_atomic_or(&ch->ch_flags, CHANF_EXCLUSIVE, relaxed); } - /* - * Disallow automatic sync for monitor mode, since TX - * direction is disabled. 
- */ - if (ch_mode & CHMODE_MONITOR) { - os_atomic_or(&ch->ch_flags, CHANF_RXONLY, relaxed); - } if (!!(na->na_flags & NAF_USER_PKT_POOL) ^ !!(ch_mode & CHMODE_USER_PACKET_POOL)) { - SK_ERR("incompatible channel mode (0x%b), na_flags (0x%b)", - ch_mode, CHMODE_BITS, na->na_flags, NAF_BITS); + SK_ERR("incompatible channel mode (0x%x), na_flags (0x%x)", + ch_mode, na->na_flags); err = EINVAL; goto err; } @@ -557,6 +549,7 @@ na_bind_channel(struct nexus_adapter *na, struct kern_channel *ch, goto err_free_schema; } ch->ch_pp = rx_pp; + ch->ch_schema->csm_upp_buf_total = rx_pp->pp_kmd_region->skr_c_obj_cnt; } if (!NA_IS_ACTIVE(na)) { @@ -565,28 +558,28 @@ na_bind_channel(struct nexus_adapter *na, struct kern_channel *ch, goto err_release_pp; } - SK_D("activated \"%s\" adapter 0x%llx", na->na_name, + SK_DF(SK_VERB_NA, "activated \"%s\" adapter %p", na->na_name, SK_KVA(na)); - SK_D(" na_md_type: %u", na->na_md_type); - SK_D(" na_md_subtype: %u", na->na_md_subtype); + SK_DF(SK_VERB_NA, " na_md_type: %u", na->na_md_type); + SK_DF(SK_VERB_NA, " na_md_subtype: %u", na->na_md_subtype); } - SK_D("ch 0x%llx", SK_KVA(ch)); - SK_D(" ch_flags: 0x%b", ch->ch_flags, CHANF_BITS); + SK_DF(SK_VERB_NA, "ch %p", SK_KVA(ch)); + SK_DF(SK_VERB_NA, " ch_flags: 0x%x", ch->ch_flags); if (ch->ch_schema != NULL) { - SK_D(" ch_schema: 0x%llx", SK_KVA(ch->ch_schema)); + SK_DF(SK_VERB_NA, " ch_schema: %p", SK_KVA(ch->ch_schema)); } - SK_D(" ch_na: 0x%llx (chcnt %u)", SK_KVA(ch->ch_na), + SK_DF(SK_VERB_NA, " ch_na: %p (chcnt %u)", SK_KVA(ch->ch_na), ch->ch_na->na_channels); - SK_D(" ch_tx_rings: [%u,%u)", ch->ch_first[NR_TX], + SK_DF(SK_VERB_NA, " ch_tx_rings: [%u,%u)", ch->ch_first[NR_TX], ch->ch_last[NR_TX]); - SK_D(" ch_rx_rings: [%u,%u)", ch->ch_first[NR_RX], + SK_DF(SK_VERB_NA, " ch_rx_rings: [%u,%u)", ch->ch_first[NR_RX], ch->ch_last[NR_RX]); - SK_D(" ch_alloc_rings: [%u,%u)", ch->ch_first[NR_A], + SK_DF(SK_VERB_NA, " ch_alloc_rings: [%u,%u)", ch->ch_first[NR_A], ch->ch_last[NR_A]); - SK_D(" ch_free_rings: [%u,%u)", ch->ch_first[NR_F], + SK_DF(SK_VERB_NA, " ch_free_rings: [%u,%u)", ch->ch_first[NR_F], ch->ch_last[NR_F]); - SK_D(" ch_ev_rings: [%u,%u)", ch->ch_first[NR_EV], + SK_DF(SK_VERB_NA, " ch_ev_rings: [%u,%u)", ch->ch_first[NR_EV], ch->ch_last[NR_EV]); return 0; @@ -642,7 +635,7 @@ na_unbind_channel(struct kern_channel *ch) na_krings_unuse(ch); if (na->na_channels == 0) { /* last instance */ - SK_D("%s(%d): deleting last channel instance for %s", + SK_DF(SK_VERB_NA, "%s(%d): deleting last channel instance for %s", ch->ch_name, ch->ch_pid, na->na_name); /* @@ -675,7 +668,7 @@ na_unbind_channel(struct kern_channel *ch) na_unset_ringid(ch); /* reap the caches now (purge if adapter is idle) */ - skmem_arena_reap(na->na_arena, (na->na_channels == 0)); + skmem_arena_reap(na->na_arena, true); /* delete the csm */ if (ch->ch_schema != NULL) { @@ -704,14 +697,6 @@ na_teardown(struct nexus_adapter *na, struct kern_channel *ch, SK_LOCK_ASSERT_HELD(); LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED); -#if CONFIG_NEXUS_MONITOR - /* - * Walk through all the rings and tell any monitor - * that the port is going to exit Skywalk mode - */ - nx_mon_stop(na); -#endif /* CONFIG_NEXUS_MONITOR */ - /* * Deactive the adapter. 
*/ @@ -775,14 +760,10 @@ na_schema_alloc(struct kern_channel *ch) #undef ASSERT_COUNT_TYPES_MATCH /* see comments for struct __user_channel_schema */ - _CASSERT(offsetof(struct __user_channel_schema, csm_ver) == 0); - _CASSERT(offsetof(struct __user_channel_schema, csm_flags) == - sizeof(csm->csm_ver)); - _CASSERT(offsetof(struct __user_channel_schema, csm_kern_name) == - sizeof(csm->csm_ver) + sizeof(csm->csm_flags)); - _CASSERT(offsetof(struct __user_channel_schema, csm_kern_uuid) == - sizeof(csm->csm_ver) + sizeof(csm->csm_flags) + - sizeof(csm->csm_kern_name)); + static_assert(offsetof(struct __user_channel_schema, csm_ver) == 0); + static_assert(offsetof(struct __user_channel_schema, csm_flags) == sizeof(csm->csm_ver)); + static_assert(offsetof(struct __user_channel_schema, csm_kern_name) == sizeof(csm->csm_ver) + sizeof(csm->csm_flags)); + static_assert(offsetof(struct __user_channel_schema, csm_kern_uuid) == sizeof(csm->csm_ver) + sizeof(csm->csm_flags) + sizeof(csm->csm_kern_name)); SK_LOCK_ASSERT_HELD(); @@ -841,7 +822,7 @@ na_schema_alloc(struct kern_channel *ch) *(uint32_t *)(uintptr_t)&csm->csm_ver = CSM_CURRENT_VERSION; /* kernel version and executable UUID */ - _CASSERT(sizeof(csm->csm_kern_name) == _SYS_NAMELEN); + static_assert(sizeof(csm->csm_kern_name) == _SYS_NAMELEN); (void) strlcpy(csm->csm_kern_name, version, sizeof(csm->csm_kern_name)); @@ -1119,7 +1100,7 @@ na_attach_common(struct nexus_adapter *na, struct kern_nexus *nx, na->na_nx = nx; na->na_nxdom_prov = nxdom_prov; - SK_D("na 0x%llx nx 0x%llx nxtype %u ar 0x%llx", + SK_DF(SK_VERB_NA, "na %p nx %p nxtype %u ar %p", SK_KVA(na), SK_KVA(nx), nxdom_prov->nxdom_prov_dom->nxdom_type, SK_KVA(na->na_arena)); } @@ -1131,11 +1112,10 @@ na_post_event(struct __kern_channel_ring *kring, boolean_t nodelay, struct nexus_adapter *na = KRNA(kring); enum txrx t = kring->ckr_tx; - SK_DF(SK_VERB_EVENTS, - "%s(%d) na \"%s\" (0x%llx) kr 0x%llx kev %u sel %u hint 0x%b", - sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()), + SK_PDF(SK_VERB_EVENTS, current_proc(), + "na \"%s\" (%p) kr %p kev %u sel %u hint 0x%x", na->na_name, SK_KVA(na), SK_KVA(kring), within_kevent, selwake, - hint, CHAN_FILT_HINT_BITS); + hint); csi_selwakeup_one(kring, nodelay, within_kevent, selwake, hint); /* @@ -1155,13 +1135,13 @@ na_notify(struct __kern_channel_ring *kring, struct proc *p, uint32_t flags) #pragma unused(p) SK_DF(SK_VERB_NOTIFY | ((kring->ckr_tx == NR_TX) ? SK_VERB_TX : SK_VERB_RX), - "%s(%d) [%s] na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b " + "%s(%d) [%s] na \"%s\" (%p) kr \"%s\" (%p) krflags 0x%x " "flags 0x%x, kh %u kt %u | h %u t %u", - sk_proc_name_address(p), sk_proc_pid(p), + sk_proc_name(p), sk_proc_pid(p), (kring->ckr_tx == NR_TX) ? 
"W" : "R", KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring), - kring->ckr_flags, CKRF_BITS, flags, kring->ckr_khead, - kring->ckr_ktail, kring->ckr_rhead, kring->ckr_rtail); + kring->ckr_flags, flags, kring->ckr_khead, kring->ckr_ktail, + kring->ckr_rhead, kring->ckr_rtail); na_post_event(kring, (flags & NA_NOTEF_PUSH), (flags & NA_NOTEF_IN_KEVENT), TRUE, 0); @@ -1197,14 +1177,14 @@ na_update_config(struct nexus_adapter *na) na_get_nslots(na, NR_RX) == rxd) { return 0; /* nothing changed */ } - SK_D("stored config %s: txring %u x %u, rxring %u x %u", + SK_DF(SK_VERB_NA, "stored config %s: txring %u x %u, rxring %u x %u", na->na_name, na_get_nrings(na, NR_TX), na_get_nslots(na, NR_TX), na_get_nrings(na, NR_RX), na_get_nslots(na, NR_RX)); - SK_D("new config %s: txring %u x %u, rxring %u x %u", + SK_DF(SK_VERB_NA, "new config %s: txring %u x %u, rxring %u x %u", na->na_name, txr, txd, rxr, rxd); if (na->na_channels == 0) { - SK_D("configuration changed (but fine)"); + SK_DF(SK_VERB_NA, "configuration changed (but fine)"); na_set_nrings(na, NR_TX, txr); na_set_nslots(na, NR_TX, txd); na_set_nrings(na, NR_RX, rxr); @@ -1224,35 +1204,29 @@ na_kr_setup_netif_svc_map(struct nexus_adapter *na) ASSERT(na->na_type == NA_NETIF_DEV); num_tx_rings = na_get_nrings(na, NR_TX); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BK_SYS) == - NAKR_WMM_SC2RINGID(KPKT_SC_BK)); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BE) == - NAKR_WMM_SC2RINGID(KPKT_SC_RD)); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BE) == - NAKR_WMM_SC2RINGID(KPKT_SC_OAM)); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_AV) == - NAKR_WMM_SC2RINGID(KPKT_SC_RV)); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_AV) == - NAKR_WMM_SC2RINGID(KPKT_SC_VI)); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_VO) == - NAKR_WMM_SC2RINGID(KPKT_SC_CTL)); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_BK_SYS) == NAKR_WMM_SC2RINGID(KPKT_SC_BK)); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_BE) == NAKR_WMM_SC2RINGID(KPKT_SC_RD)); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_BE) == NAKR_WMM_SC2RINGID(KPKT_SC_OAM)); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_AV) == NAKR_WMM_SC2RINGID(KPKT_SC_RV)); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_AV) == NAKR_WMM_SC2RINGID(KPKT_SC_VI)); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_VO) == NAKR_WMM_SC2RINGID(KPKT_SC_CTL)); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BK) < NA_NUM_WMM_CLASSES); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_BE) < NA_NUM_WMM_CLASSES); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_VI) < NA_NUM_WMM_CLASSES); - _CASSERT(NAKR_WMM_SC2RINGID(KPKT_SC_VO) < NA_NUM_WMM_CLASSES); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_BK) < NA_NUM_WMM_CLASSES); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_BE) < NA_NUM_WMM_CLASSES); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_VI) < NA_NUM_WMM_CLASSES); + static_assert(NAKR_WMM_SC2RINGID(KPKT_SC_VO) < NA_NUM_WMM_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_BK_SYS) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_BK) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_BE) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_RD) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_OAM) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_AV) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_RV) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_VI) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_SIG) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_VO) < KPKT_SC_MAX_CLASSES); - _CASSERT(MBUF_SCIDX(KPKT_SC_CTL) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_BK_SYS) < KPKT_SC_MAX_CLASSES); + 
static_assert(MBUF_SCIDX(KPKT_SC_BK) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_BE) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_RD) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_OAM) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_AV) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_RV) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_VI) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_SIG) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_VO) < KPKT_SC_MAX_CLASSES); + static_assert(MBUF_SCIDX(KPKT_SC_CTL) < KPKT_SC_MAX_CLASSES); /* * we support the following 2 configurations: @@ -1607,14 +1581,6 @@ na_kr_create(struct nexus_adapter *na, boolean_t alloc_ctx) kring->ckr_finalize = NULL; break; #endif /* CONFIG_NEXUS_USER_PIPE */ -#if CONFIG_NEXUS_MONITOR - case NA_MONITOR: - ASSERT(!(na->na_flags & - NAF_USER_PKT_POOL)); - kring->ckr_prologue = kr_txprologue; - kring->ckr_finalize = NULL; - break; -#endif /* CONFIG_NEXUS_MONITOR */ default: if (na->na_flags & NAF_USER_PKT_POOL) { kring->ckr_prologue = @@ -1645,15 +1611,6 @@ na_kr_create(struct nexus_adapter *na, boolean_t alloc_ctx) kring->ckr_finalize = kr_rxfinalize; break; #endif /* CONFIG_NEXUS_USER_PIPE */ -#if CONFIG_NEXUS_MONITOR - case NA_MONITOR: - ASSERT(!(na->na_flags & - NAF_USER_PKT_POOL)); - kring->ckr_prologue = - kr_rxprologue_nodetach; - kring->ckr_finalize = kr_rxfinalize; - break; -#endif /* CONFIG_NEXUS_MONITOR */ default: if (na->na_flags & NAF_USER_PKT_POOL) { kring->ckr_prologue = @@ -1691,9 +1648,9 @@ na_kr_create(struct nexus_adapter *na, boolean_t alloc_ctx) "%s %s%u%s", na->na_name, sk_ring2str(t), i, ((kring->ckr_flags & CKRF_HOST) ? "^" : "")); SK_DF(SK_VERB_NA | SK_VERB_RING, - "kr \"%s\" (0x%llx) krflags 0x%b rh %u rt %u", + "kr \"%s\" (%p) krflags 0x%x rh %u rt %u", kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS, kring->ckr_rhead, kring->ckr_rtail); + kring->ckr_rhead, kring->ckr_rtail); kring->ckr_state = KR_READY; q_lck_grp = na_kr_q_lck_grp(t); s_lck_grp = na_kr_s_lck_grp(t); @@ -1893,7 +1850,7 @@ na_kr_setup(struct nexus_adapter *na, struct kern_channel *ch) if (ring != NULL) { SK_DF(SK_VERB_NA | SK_VERB_RING, - "kr 0x%llx (\"%s\") is already " + "kr %p (\"%s\") is already " "initialized", SK_KVA(kring), kring->ckr_name); continue; /* already created by somebody else */ @@ -1903,7 +1860,7 @@ na_kr_setup(struct nexus_adapter *na, struct kern_channel *ch) (ring = skmem_cache_alloc(arn->arn_ring_cache, SKMEM_NOSLEEP)) == NULL) { SK_ERR("Cannot allocate %s_ring for kr " - "0x%llx (\"%s\")", sk_ring2str(t), + "%p (\"%s\")", sk_ring2str(t), SK_KVA(kring), kring->ckr_name); goto cleanup; } @@ -1935,8 +1892,7 @@ na_kr_setup(struct nexus_adapter *na, struct kern_channel *ch) (roff[SKMEM_REGION_BUF_LARGE] - ring_off); *(mach_vm_offset_t *)(uintptr_t)&ring->ring_md_base = (roff[SKMEM_REGION_UMD] - ring_off); - _CASSERT(sizeof(uint16_t) == - sizeof(ring->ring_bft_size)); + static_assert(sizeof(uint16_t) == sizeof(ring->ring_bft_size)); if (roff[SKMEM_REGION_UBFT] != 0) { ASSERT(ar->ar_regions[SKMEM_REGION_UBFT] != NULL); @@ -1971,12 +1927,9 @@ na_kr_setup(struct nexus_adapter *na, struct kern_channel *ch) *(slot_idx_t *)(uintptr_t)&ring->ring_tail = kring->ckr_rtail; - _CASSERT(sizeof(uint32_t) == - sizeof(ring->ring_def_buf_size)); - _CASSERT(sizeof(uint32_t) == - sizeof(ring->ring_large_buf_size)); - _CASSERT(sizeof(uint16_t) == - sizeof(ring->ring_md_size)); + static_assert(sizeof(uint32_t) == 
sizeof(ring->ring_def_buf_size)); + static_assert(sizeof(uint32_t) == sizeof(ring->ring_large_buf_size)); + static_assert(sizeof(uint16_t) == sizeof(ring->ring_md_size)); *(uint32_t *)(uintptr_t)&ring->ring_def_buf_size = ar->ar_regions[SKMEM_REGION_BUF_DEF]->skr_c_obj_size; if (ar->ar_regions[SKMEM_REGION_BUF_LARGE] != NULL) { @@ -1999,15 +1952,15 @@ na_kr_setup(struct nexus_adapter *na, struct kern_channel *ch) } /* ring info */ - _CASSERT(sizeof(uint16_t) == sizeof(ring->ring_id)); - _CASSERT(sizeof(uint16_t) == sizeof(ring->ring_kind)); + static_assert(sizeof(uint16_t) == sizeof(ring->ring_id)); + static_assert(sizeof(uint16_t) == sizeof(ring->ring_kind)); *(uint16_t *)(uintptr_t)&ring->ring_id = (uint16_t)kring->ckr_ring_id; *(uint16_t *)(uintptr_t)&ring->ring_kind = (uint16_t)kring->ckr_tx; SK_DF(SK_VERB_NA | SK_VERB_RING, - "%s_ring at 0x%llx kr 0x%llx (\"%s\")", + "%s_ring at %p kr %p (\"%s\")", sk_ring2str(t), SK_KVA(ring), SK_KVA(kring), kring->ckr_name); SK_DF(SK_VERB_NA | SK_VERB_RING, @@ -2025,19 +1978,19 @@ na_kr_setup(struct nexus_adapter *na, struct kern_channel *ch) " sd_base: 0x%llx", (uint64_t)ring->ring_sd_base); SK_DF(SK_VERB_NA | SK_VERB_RING, - " h, t: %u, %u, %u", ring->ring_head, + " h, t: %u, %u", ring->ring_head, ring->ring_tail); SK_DF(SK_VERB_NA | SK_VERB_RING, - " md_size: %d", + " md_size: %llu", (uint64_t)ring->ring_md_size); /* make sure they're in synch */ - _CASSERT(NR_RX == CR_KIND_RX); - _CASSERT(NR_TX == CR_KIND_TX); - _CASSERT(NR_A == CR_KIND_ALLOC); - _CASSERT(NR_F == CR_KIND_FREE); - _CASSERT(NR_EV == CR_KIND_EVENT); - _CASSERT(NR_LBA == CR_KIND_LARGE_BUF_ALLOC); + static_assert(NR_RX == CR_KIND_RX); + static_assert(NR_TX == CR_KIND_TX); + static_assert(NR_A == CR_KIND_ALLOC); + static_assert(NR_F == CR_KIND_FREE); + static_assert(NR_EV == CR_KIND_EVENT); + static_assert(NR_LBA == CR_KIND_LARGE_BUF_ALLOC); skip_user_ring_setup: /* @@ -2056,7 +2009,7 @@ skip_user_ring_setup: SKMEM_NOSLEEP); if (ksds == NULL) { SK_ERR("Cannot allocate %s_ksds for kr " - "0x%llx (\"%s\")", sk_ring2str(t), + "%p (\"%s\")", sk_ring2str(t), SK_KVA(kring), kring->ckr_name); goto cleanup; } @@ -2080,7 +2033,7 @@ skip_user_ring_setup: !(na->na_flags & NAF_USER_PKT_POOL) && na_kr_populate_slots(kring) != 0) { SK_ERR("Cannot allocate buffers for kr " - "0x%llx (\"%s\")", SK_KVA(kring), + "%p (\"%s\")", SK_KVA(kring), kring->ckr_name); goto cleanup; } @@ -2289,7 +2242,6 @@ na_kr_populate_slots(struct __kern_channel_ring *kring) __builtin_unreachable(); case NEXUS_TYPE_USER_PIPE: - case NEXUS_TYPE_MONITOR: break; default: @@ -2305,7 +2257,7 @@ na_kr_populate_slots(struct __kern_channel_ring *kring) SKMEM_NOSLEEP)); if (kqum == NULL) { err = ENOMEM; - SK_ERR("ar 0x%llx (\"%s\") no more buffers " + SK_ERR("ar %p (\"%s\") no more buffers " "after %u of %u, err %d", SK_KVA(na->na_arena), na->na_arena->ar_name, i, nslots, err); goto cleanup; @@ -2323,13 +2275,13 @@ na_kr_populate_slots(struct __kern_channel_ring *kring) kqum, current_proc()); } - SK_DF(SK_VERB_MEM, " C ksd [%-3d, 0x%llx] kqum [%-3u, 0x%llx] " - " kbuf[%-3u, 0x%llx]", i, SK_KVA(ksd), METADATA_IDX(kqum), + SK_DF(SK_VERB_MEM, " C ksd [%-3d, %p] kqum [%-3u, %p] " + " kbuf[%-3u, %p]", i, SK_KVA(ksd), METADATA_IDX(kqum), SK_KVA(kqum), kqum->qum_buf[0].buf_idx, SK_KVA(&kqum->qum_buf[0])); if (!(kqum->qum_qflags & QUM_F_KERNEL_ONLY)) { - SK_DF(SK_VERB_MEM, " C usd [%-3d, 0x%llx] " - "uqum [%-3u, 0x%llx] ubuf[%-3u, 0x%llx]", + SK_DF(SK_VERB_MEM, " C usd [%-3d, %p] " + "uqum [%-3u, %p] ubuf[%-3u, %p]", (int)(usd ? 
usd->sd_md_idx : OBJ_IDX_NONE), SK_KVA(usd), METADATA_IDX(kqum), SK_KVA(kqum->qum_user), @@ -2340,7 +2292,7 @@ na_kr_populate_slots(struct __kern_channel_ring *kring) sidx = SLOT_NEXT(sidx, kring->ckr_lim); } - SK_DF(SK_VERB_NA | SK_VERB_RING, "ar 0x%llx (\"%s\") populated %u slots from idx %u", + SK_DF(SK_VERB_NA | SK_VERB_RING, "ar %p (\"%s\") populated %u slots from idx %u", SK_KVA(na->na_arena), na->na_arena->ar_name, nslots, start_idx); cleanup: @@ -2408,7 +2360,7 @@ na_kr_depopulate_slots(struct __kern_channel_ring *kring, */ if (upp && (kqum->qum_qflags & QUM_F_INTERNALIZED)) { if ((qum = pp_find_upp(pp, midx)) != NULL) { - panic("internalized packet 0x%llx in htbl", + panic("internalized packet %p in htbl", SK_KVA(qum)); /* NOTREACHED */ __builtin_unreachable(); @@ -2450,13 +2402,13 @@ na_kr_depopulate_slots(struct __kern_channel_ring *kring, /* detach packet from slot */ kqum->qum_ksd = NULL; - SK_DF(SK_VERB_MEM, " D ksd [%-3d, 0x%llx] kqum [%-3u, 0x%llx] " - " kbuf[%-3u, 0x%llx]", i, SK_KVA(ksd), + SK_DF(SK_VERB_MEM, " D ksd [%-3d, %p] kqum [%-3u, %p] " + " kbuf[%-3u, %p]", i, SK_KVA(ksd), METADATA_IDX(kqum), SK_KVA(kqum), kqum->qum_buf[0].buf_idx, SK_KVA(&kqum->qum_buf[0])); if (!(kqum->qum_qflags & QUM_F_KERNEL_ONLY)) { - SK_DF(SK_VERB_MEM, " D usd [%-3u, 0x%llx] " - "uqum [%-3u, 0x%llx] ubuf[%-3u, 0x%llx]", + SK_DF(SK_VERB_MEM, " D usd [%-3u, %p] " + "uqum [%-3u, %p] ubuf[%-3u, %p]", (int)(usd ? usd->sd_md_idx : OBJ_IDX_NONE), SK_KVA(usd), METADATA_IDX(kqum), SK_KVA(kqum->qum_user), @@ -2469,7 +2421,7 @@ na_kr_depopulate_slots(struct __kern_channel_ring *kring, } } - SK_DF(SK_VERB_NA | SK_VERB_RING, "ar 0x%llx (\"%s\") depopulated %u of %u slots", + SK_DF(SK_VERB_NA | SK_VERB_RING, "ar %p (\"%s\") depopulated %u of %u slots", SK_KVA(KRNA(kring)->na_arena), KRNA(kring)->na_arena->ar_name, j, n); } @@ -2557,7 +2509,7 @@ na_kr_drop(struct nexus_adapter *na, boolean_t drop) } if (error != 0) { - SK_ERR("na \"%s\" (0x%llx) kr \"%s\" (0x%llx) " + SK_ERR("na \"%s\" (%p) kr \"%s\" (%p) " "kr_enter failed %d", na->na_name, SK_KVA(na), kring->ckr_name, SK_KVA(kring), @@ -2565,10 +2517,9 @@ na_kr_drop(struct nexus_adapter *na, boolean_t drop) } else { kr_exit(kring); } - SK_D("na \"%s\" (0x%llx) kr \"%s\" (0x%llx) " - "krflags 0x%b", na->na_name, SK_KVA(na), - kring->ckr_name, SK_KVA(kring), kring->ckr_flags, - CKRF_BITS); + SK_DF(SK_VERB_NA, "na \"%s\" (%p) kr \"%s\" (%p) " + "krflags 0x%x", na->na_name, SK_KVA(na), + kring->ckr_name, SK_KVA(kring), kring->ckr_flags); } } } @@ -2654,7 +2605,7 @@ na_unlock_all_rings(struct nexus_adapter *na) int na_connect(struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, - struct kern_channel *ch0, struct nxbind *nxb, struct proc *p) + struct nxbind *nxb, struct proc *p) { struct nexus_adapter *__single na = NULL; mach_vm_size_t memsize = 0; @@ -2667,7 +2618,7 @@ na_connect(struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, SK_LOCK_ASSERT_HELD(); /* find the nexus adapter and return the reference */ - err = na_find(ch, nx, chr, ch0, nxb, p, &na, TRUE /* create */); + err = na_find(ch, nx, chr, nxb, p, &na, TRUE /* create */); if (err != 0) { ASSERT(na == NULL); goto done; @@ -2705,7 +2656,8 @@ na_connect(struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, if (!(skmem_arena_nexus(na->na_arena)->arn_mode & AR_NEXUS_MODE_EXTERNAL_PPOOL)) { - os_atomic_or(__DECONST(uint32_t *, &ch->ch_schema->csm_flags), CSM_PRIV_MEM, relaxed); + os_atomic_or(__DECONST(uint32_t *, &ch->ch_schema->csm_flags), + CSM_PRIV_MEM, 
relaxed); } err = skmem_arena_mmap(na->na_arena, p, &ch->ch_mmap); @@ -2717,11 +2669,10 @@ na_connect(struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, chr->cr_memsize = memsize; chr->cr_memoffset = ch->ch_schema_offset; - SK_D("%s(%d) ch 0x%llx <-> nx 0x%llx (%s:\"%s\":%d:%d) na 0x%llx " - "naflags %b", sk_proc_name_address(p), sk_proc_pid(p), - SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, - na->na_name, (int)chr->cr_port, (int)chr->cr_ring_id, SK_KVA(na), - na->na_flags, NAF_BITS); + SK_DF(SK_VERB_NA, "%s(%d) ch %p <-> nx %p (%s:\"%s\":%d:%d) na %p naflags 0x%x", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(ch), SK_KVA(nx), + NX_DOM_PROV(nx)->nxdom_prov_name, na->na_name, (int)chr->cr_port, + (int)chr->cr_ring_id, SK_KVA(na), na->na_flags); done: if (err != 0) { @@ -2753,11 +2704,11 @@ na_disconnect(struct kern_nexus *nx, struct kern_channel *ch) SK_LOCK_ASSERT_HELD(); - SK_D("ch 0x%llx -!- nx 0x%llx (%s:\"%s\":%u:%d) na 0x%llx naflags %b", + SK_DF(SK_VERB_NA, "ch %p -!- nx %p (%s:\"%s\":%u:%d) na %p naflags 0x%x", SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id, SK_KVA(ch->ch_na), - ch->ch_na->na_flags, NAF_BITS); + ch->ch_na->na_flags); /* destroy mapping and release references */ na_unbind_channel(ch); @@ -2803,12 +2754,11 @@ na_defunct(struct kern_nexus *nx, struct kern_channel *ch, } } - SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) " - "na 0x%llx naflags %b", ch->ch_name, ch->ch_pid, - SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, - na->na_name, ch->ch_info->cinfo_nx_port, - (int)ch->ch_info->cinfo_ch_ring_id, SK_KVA(na), - na->na_flags, NAF_BITS); + SK_DF(SK_VERB_NA, "%s(%d): ch %p -/- nx %p (%s:\"%s\":%u:%d) na %p naflags 0x%x", + ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx), + NX_DOM_PROV(nx)->nxdom_prov_name, na->na_name, + ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id, + SK_KVA(na), na->na_flags); if (!locked) { lck_mtx_unlock(&ch->ch_lock); @@ -2835,7 +2785,7 @@ na_connect_spec(struct kern_nexus *nx, struct kern_channel *ch, SK_LOCK_ASSERT_HELD(); - error = na_find(ch, nx, chr, NULL, NULL, kernproc, &na, TRUE); + error = na_find(ch, nx, chr, NULL, kernproc, &na, TRUE); if (error != 0) { goto done; } @@ -2885,11 +2835,10 @@ na_connect_spec(struct kern_nexus *nx, struct kern_channel *ch, skmem_arena_get_stats(na->na_arena, &memsize, NULL); chr->cr_memsize = memsize; - SK_D("%s(%d) ch 0x%llx <-> nx 0x%llx (%s:\"%s\":%d:%d) na 0x%llx " - "naflags %b", sk_proc_name_address(p), sk_proc_pid(p), - SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, - na->na_name, (int)chr->cr_port, (int)chr->cr_ring_id, SK_KVA(na), - na->na_flags, NAF_BITS); + SK_DF(SK_VERB_NA, "%s(%d) ch %p <-> nx %p (%s:\"%s\":%d:%d) na %p naflags 0x%x", + sk_proc_name(p), sk_proc_pid(p), SK_KVA(ch), SK_KVA(nx), + NX_DOM_PROV(nx)->nxdom_prov_name, na->na_name, (int)chr->cr_port, + (int)chr->cr_ring_id, SK_KVA(na), na->na_flags); done: if (error != 0) { @@ -2925,11 +2874,11 @@ na_disconnect_spec(struct kern_nexus *nx, struct kern_channel *ch) ASSERT(na != NULL); ASSERT(na->na_flags & NAF_SPEC_INIT); /* has been bound */ - SK_D("ch 0x%llx -!- nx 0x%llx (%s:\"%s\":%u:%d) na 0x%llx naflags %b", + SK_DF(SK_VERB_NA, "ch %p -!- nx %p (%s:\"%s\":%u:%d) na %p naflags 0x%x", SK_KVA(ch), SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, - na->na_name, ch->ch_info->cinfo_nx_port, - (int)ch->ch_info->cinfo_ch_ring_id, SK_KVA(na), - na->na_flags, NAF_BITS); + 
ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, + (int)ch->ch_info->cinfo_ch_ring_id, SK_KVA(ch->ch_na), + ch->ch_na->na_flags); /* take a reference for this routine */ na_retain_locked(na); @@ -3000,12 +2949,12 @@ na_stop_spec(struct kern_nexus *nx, struct kern_channel *ch) */ int na_find(struct kern_channel *ch, struct kern_nexus *nx, struct chreq *chr, - struct kern_channel *ch0, struct nxbind *nxb, struct proc *p, - struct nexus_adapter **na, boolean_t create) + struct nxbind *nxb, struct proc *p, struct nexus_adapter **na, + boolean_t create) { int error = 0; - _CASSERT(sizeof(chr->cr_name) == sizeof((*na)->na_name)); + static_assert(sizeof(chr->cr_name) == sizeof((*na)->na_name)); *na = NULL; /* default return value */ @@ -3023,13 +2972,6 @@ na_find(struct kern_channel *ch, struct kern_nexus *nx, struct chreq *chr, * !0 !NULL impossible */ -#if CONFIG_NEXUS_MONITOR - /* try to see if this is a monitor port */ - error = nx_monitor_na_find(nx, ch, chr, ch0, nxb, p, na, create); - if (error != 0 || *na != NULL) { - return error; - } -#endif /* CONFIG_NEXUS_MONITOR */ #if CONFIG_NEXUS_USER_PIPE /* try to see if this is a pipe port */ error = nx_upipe_na_find(nx, ch, chr, nxb, p, na, create); @@ -3070,7 +3012,7 @@ na_retain_locked(struct nexus_adapter *na) if (na != NULL) { #if SK_LOG uint32_t oref = os_atomic_inc_orig(&na->na_refcount, relaxed); - SK_DF(SK_VERB_REFCNT, "na \"%s\" (0x%llx) refcnt %u chcnt %u", + SK_DF(SK_VERB_REFCNT, "na \"%s\" (%p) refcnt %u chcnt %u", na->na_name, SK_KVA(na), oref + 1, na->na_channels); #else /* !SK_LOG */ os_atomic_inc(&na->na_refcount, relaxed); @@ -3089,7 +3031,7 @@ na_release_locked(struct nexus_adapter *na) ASSERT(na->na_refcount > 0); oref = os_atomic_dec_orig(&na->na_refcount, relaxed); if (oref > 1) { - SK_DF(SK_VERB_REFCNT, "na \"%s\" (0x%llx) refcnt %u chcnt %u", + SK_DF(SK_VERB_REFCNT, "na \"%s\" (%p) refcnt %u chcnt %u", na->na_name, SK_KVA(na), oref - 1, na->na_channels); return 0; } @@ -3111,7 +3053,7 @@ na_release_locked(struct nexus_adapter *na) na->na_arena = NULL; } - SK_DF(SK_VERB_MEM, "na \"%s\" (0x%llx) being freed", + SK_DF(SK_VERB_MEM, "na \"%s\" (%p) being freed", na->na_name, SK_KVA(na)); NA_FREE(na); @@ -3135,7 +3077,7 @@ static void na_pseudo_free(struct nexus_adapter *na) { ASSERT(na->na_refcount == 0); - SK_DF(SK_VERB_MEM, "na 0x%llx FREE", SK_KVA(na)); + SK_DF(SK_VERB_MEM, "na %p FREE", SK_KVA(na)); bzero(na, sizeof(*na)); zfree(na_pseudo_zone, na); } @@ -3146,9 +3088,9 @@ na_pseudo_txsync(struct __kern_channel_ring *kring, struct proc *p, { #pragma unused(kring, p, flags) SK_DF(SK_VERB_SYNC | SK_VERB_TX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0%x", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags); return 0; @@ -3160,10 +3102,9 @@ na_pseudo_rxsync(struct __kern_channel_ring *kring, struct proc *p, { #pragma unused(kring, p, flags) SK_DF(SK_VERB_SYNC | SK_VERB_RX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, - SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, - flags); + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u flags 0%x", + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, + SK_KVA(kring), kring->ckr_flags, kring->ckr_ring_id, flags); ASSERT(kring->ckr_rhead <= 
kring->ckr_lim); @@ -3173,7 +3114,7 @@ na_pseudo_rxsync(struct __kern_channel_ring *kring, struct proc *p, static int na_pseudo_activate(struct nexus_adapter *na, na_activate_mode_t mode) { - SK_D("na \"%s\" (0x%llx) %s", na->na_name, + SK_DF(SK_VERB_NA, "na \"%s\" (%p) %s", na->na_name, SK_KVA(na), na_activate_mode2str(mode)); switch (mode) { @@ -3278,20 +3219,20 @@ na_pseudo_create(struct kern_nexus *nx, struct chreq *chr, #if SK_LOG uuid_string_t uuidstr; - SK_D("na_name: \"%s\"", na->na_name); - SK_D(" UUID: %s", sk_uuid_unparse(na->na_uuid, uuidstr)); - SK_D(" nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_NA, "na_name: \"%s\"", na->na_name); + SK_DF(SK_VERB_NA, " UUID: %s", sk_uuid_unparse(na->na_uuid, uuidstr)); + SK_DF(SK_VERB_NA, " nx: %p (\"%s\":\"%s\")", SK_KVA(na->na_nx), NX_DOM(na->na_nx)->nxdom_name, NX_DOM_PROV(na->na_nx)->nxdom_prov_name); - SK_D(" flags: %b", na->na_flags, NAF_BITS); - SK_D(" flowadv_max: %u", na->na_flowadv_max); - SK_D(" rings: tx %u rx %u", + SK_DF(SK_VERB_NA, " flags: 0x%x", na->na_flags); + SK_DF(SK_VERB_NA, " flowadv_max: %u", na->na_flowadv_max); + SK_DF(SK_VERB_NA, " rings: tx %u rx %u", na_get_nrings(na, NR_TX), na_get_nrings(na, NR_RX)); - SK_D(" slots: tx %u rx %u", + SK_DF(SK_VERB_NA, " slots: tx %u rx %u", na_get_nslots(na, NR_TX), na_get_nslots(na, NR_RX)); #if CONFIG_NEXUS_USER_PIPE - SK_D(" next_pipe: %u", na->na_next_pipe); - SK_D(" max_pipes: %u", na->na_max_pipes); + SK_DF(SK_VERB_NA, " next_pipe: %u", na->na_next_pipe); + SK_DF(SK_VERB_NA, " max_pipes: %u", na->na_max_pipes); #endif /* CONFIG_NEXUS_USER_PIPE */ #endif /* SK_LOG */ @@ -3386,7 +3327,7 @@ na_flowadv_set(const struct kern_channel *ch, const flowadv_idx_t fe_idx, if (arn->arn_flowadv_obj != NULL) { struct __flowadv_entry *fae = &arn->arn_flowadv_obj[fe_idx]; - _CASSERT(sizeof(fae->fae_token) == sizeof(flow_token)); + static_assert(sizeof(fae->fae_token) == sizeof(flow_token)); /* * We cannot guarantee that the flow is still around by now, * so check if that's the case and let the caller know. @@ -3401,11 +3342,11 @@ na_flowadv_set(const struct kern_channel *ch, const flowadv_idx_t fe_idx, } if (suspend) { SK_DF(SK_VERB_FLOW_ADVISORY, "%s(%d) %s flow token 0x%x fidx %u " - "SUSPEND", sk_proc_name_address(current_proc()), + "SUSPEND", sk_proc_name(current_proc()), sk_proc_pid(current_proc()), fae_uuid_str, flow_token, fe_idx); } else { - SK_ERR("%s(%d) flow token 0x%llu fidx %u no longer around", - sk_proc_name_address(current_proc()), + SK_ERR("%s(%d) flow token 0x%x fidx %u no longer around", + sk_proc_name(current_proc()), sk_proc_pid(current_proc()), flow_token, fe_idx); } @@ -3435,7 +3376,7 @@ na_flowadv_clear(const struct kern_channel *ch, const flowadv_idx_t fe_idx, if (arn->arn_flowadv_obj != NULL) { struct __flowadv_entry *__single fae = &arn->arn_flowadv_obj[fe_idx]; - _CASSERT(sizeof(fae->fae_token) == sizeof(flow_token)); + static_assert(sizeof(fae->fae_token) == sizeof(flow_token)); /* * We cannot guarantee that the flow is still around by now, * so check if that's the case and let the caller know. 
@@ -3450,8 +3391,8 @@ na_flowadv_clear(const struct kern_channel *ch, const flowadv_idx_t fe_idx, } if (resume) { SK_DF(SK_VERB_FLOW_ADVISORY, "%s(%d) %s flow token 0x%x " - "fidx %u RESUME", ch->ch_name, ch->ch_pid, fae_uuid_str, flow_token, - fe_idx); + "fidx %u RESUME", ch->ch_name, ch->ch_pid, fae_uuid_str, + flow_token, fe_idx); } else { SK_ERR("%s(%d): flow token 0x%x fidx %u no longer around", ch->ch_name, ch->ch_pid, flow_token, fe_idx); @@ -3463,8 +3404,9 @@ na_flowadv_clear(const struct kern_channel *ch, const flowadv_idx_t fe_idx, } int -na_flowadv_report_ce_event(const struct kern_channel *ch, const flowadv_idx_t fe_idx, - const flowadv_token_t flow_token, uint32_t ce_cnt, uint32_t total_pkt_cnt) +na_flowadv_report_congestion_event(const struct kern_channel *ch, + const flowadv_idx_t fe_idx, const flowadv_token_t flow_token, + uint32_t congestion_cnt, __unused uint32_t l4s_ce_cnt, uint32_t total_pkt_cnt) { struct nexus_adapter *na = ch->ch_na; struct skmem_arena *ar = na->na_arena; @@ -3483,14 +3425,14 @@ na_flowadv_report_ce_event(const struct kern_channel *ch, const flowadv_idx_t fe if (arn->arn_flowadv_obj != NULL) { struct __flowadv_entry *__single fae = &arn->arn_flowadv_obj[fe_idx]; - _CASSERT(sizeof(fae->fae_token) == sizeof(flow_token)); + static_assert(sizeof(fae->fae_token) == sizeof(flow_token)); /* * We cannot guarantee that the flow is still around by now, * so check if that's the case and let the caller know. */ if ((added = (fae->fae_token == flow_token))) { ASSERT(fae->fae_flags & FLOWADVF_VALID); - fae->fae_ce_cnt += ce_cnt; + fae->fae_congestion_cnt += congestion_cnt; fae->fae_pkt_cnt += total_pkt_cnt; uuid_unparse(fae->fae_id, fae_uuid_str); } @@ -3516,8 +3458,8 @@ na_flowadv_event(struct __kern_channel_ring *kring) { ASSERT(kring->ckr_tx == NR_TX); - SK_DF(SK_VERB_EVENTS, "%s(%d) na \"%s\" (0x%llx) kr 0x%llx", - sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()), + SK_DF(SK_VERB_EVENTS, "%s(%d) na \"%s\" (%p) kr %p", + sk_proc_name(current_proc()), sk_proc_pid(current_proc()), KRNA(kring)->na_name, SK_KVA(KRNA(kring)), SK_KVA(kring)); na_post_event(kring, TRUE, FALSE, FALSE, CHAN_FILT_HINT_FLOW_ADV_UPD); @@ -3548,7 +3490,7 @@ na_packet_pool_free_sync(struct __kern_channel_ring *kring, struct proc *p, /* nothing to free */ if (__improbable(n == 0)) { SK_DF(SK_VERB_MEM | SK_VERB_SYNC, "%s(%d) kr \"%s\" %s", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, "nothing to free"); goto done; } @@ -3562,7 +3504,7 @@ na_packet_pool_free_sync(struct __kern_channel_ring *kring, struct proc *p, usd = KR_USD(kring, j); if (__improbable(!SD_VALID_METADATA(usd))) { - SK_ERR("bad slot %d 0x%llx", j, SK_KVA(ksd)); + SK_ERR("bad slot %d %p", j, SK_KVA(ksd)); ret = EINVAL; break; } @@ -3691,7 +3633,7 @@ na_packet_pool_alloc_sync_common(struct __kern_channel_ring *kring, struct proc ASSERT(!KR_KERNEL_ONLY(kring)); ASSERT(!PP_KERNEL_ONLY(pp)); - now = _net_uptime; + now = net_uptime(); if ((flags & NA_SYNCF_UPP_PURGE) != 0) { if (now - kring->ckr_sync_time >= na_upp_reap_interval) { kring->ckr_alloc_ws = na_upp_reap_min_pkts; @@ -3742,7 +3684,7 @@ na_packet_pool_alloc_sync_common(struct __kern_channel_ring *kring, struct proc err = alloc_packets(pp, kring->ckr_scratch, PP_HAS_BUFFER_ON_DEMAND(pp) && large, &ph_cnt); if (__improbable(ph_cnt == 0)) { - SK_ERR("kr 0x%llx failed to alloc %u packet s(%d)", + SK_ERR("kr %p failed to alloc %u packet s(%d)", SK_KVA(kring), ph_needed, err); 
kring->ckr_err_stats.cres_pkt_alloc_failures += ph_needed; } else { @@ -3826,7 +3768,7 @@ na_packet_pool_free_buf_sync(struct __kern_channel_ring *kring, struct proc *p, /* nothing to free */ if (__improbable(n == 0)) { SK_DF(SK_VERB_MEM | SK_VERB_SYNC, "%s(%d) kr \"%s\" %s", - sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, + sk_proc_name(p), sk_proc_pid(p), kring->ckr_name, "nothing to free"); goto done; } @@ -3839,7 +3781,7 @@ na_packet_pool_free_buf_sync(struct __kern_channel_ring *kring, struct proc *p, usd = KR_USD(kring, j); if (__improbable(!SD_VALID_METADATA(usd))) { - SK_ERR("bad slot %d 0x%llx", j, SK_KVA(ksd)); + SK_ERR("bad slot %d %p", j, SK_KVA(ksd)); ret = EINVAL; break; } @@ -3885,7 +3827,7 @@ na_packet_pool_alloc_buf_sync(struct __kern_channel_ring *kring, struct proc *p, ASSERT(!KR_KERNEL_ONLY(kring)); ASSERT(!PP_KERNEL_ONLY(pp)); - now = _net_uptime; + now = net_uptime(); if ((flags & NA_SYNCF_UPP_PURGE) != 0) { if (now - kring->ckr_sync_time >= na_upp_reap_interval) { kring->ckr_alloc_ws = na_upp_reap_min_pkts; @@ -3937,7 +3879,7 @@ na_packet_pool_alloc_buf_sync(struct __kern_channel_ring *kring, struct proc *p, SKMEM_NOSLEEP, false); if (bh_cnt == 0) { - SK_ERR("kr 0x%llx failed to alloc %u buflets(%d)", + SK_ERR("kr %p failed to alloc %u buflets(%d)", SK_KVA(kring), bh_needed, err); kring->ckr_err_stats.cres_pkt_alloc_failures += bh_needed; } @@ -3990,11 +3932,22 @@ na_drain(struct nexus_adapter *na, boolean_t purge) /* will be cleared on next channel sync */ if (!(os_atomic_or_orig(&na->na_flags, NAF_DRAINING, relaxed) & NAF_DRAINING) && NA_IS_ACTIVE(na)) { - SK_DF(SK_VERB_NA, "%s: %s na 0x%llx flags %b", + SK_DF(SK_VERB_NA, "%s: %s na %p flags 0x%x", na->na_name, (purge ? "purging" : "pruning"), - SK_KVA(na), na->na_flags, NAF_BITS); + SK_KVA(na), na->na_flags); /* reap (purge/prune) caches in the arena */ skmem_arena_reap(na->na_arena, purge); } } + +SK_NO_INLINE_ATTRIBUTE +char * +na2str(const struct nexus_adapter *na, char *__counted_by(dsz)dst, + size_t dsz) +{ + (void) sk_snprintf(dst, dsz, "%p %s flags 0x%b", + SK_KVA(na), na->na_name, na->na_flags, NAF_BITS); + + return dst; +} diff --git a/bsd/skywalk/nexus/nexus_adapter.h b/bsd/skywalk/nexus/nexus_adapter.h index 1754f9125..2926af473 100644 --- a/bsd/skywalk/nexus/nexus_adapter.h +++ b/bsd/skywalk/nexus/nexus_adapter.h @@ -76,9 +76,6 @@ typedef enum { #if CONFIG_NEXUS_KERNEL_PIPE NA_KERNEL_PIPE, /* struct nexus_kpipe_adapter */ #endif /* CONFIG_NEXUS_KERNEL_PIPE */ -#if CONFIG_NEXUS_MONITOR - NA_MONITOR, /* struct nexus_monitor_adapter */ -#endif /* CONFIG_NEXUS_MONITOR */ #if CONFIG_NEXUS_NETIF NA_NETIF_DEV, /* struct nexus_netif_adapter (dev) */ NA_NETIF_HOST, /* struct nexus_netif_adapter (host) */ @@ -113,7 +110,7 @@ struct nexus_pkt_stats { /* * The "struct nexus_adapter" contains all base fields needed to support * Nexus adapter operations. There are different types of Nexus adapters - * (upipe, kpipe, fsw, monitor, vp, ...) so a nexus_adapter is + * (upipe, kpipe, fsw, vp, ...) so a nexus_adapter is * always the first field in the derived type. 
*/ struct nexus_adapter { @@ -297,7 +294,7 @@ struct nexus_adapter { uint32_t flags); int (*na_rxsync)(struct __kern_channel_ring *kring, struct proc *, uint32_t flags); -#define NA_SYNCF_MONITOR 0x1 +#define NA_SYNCF_UNUSED_1 0x1 #define NA_SYNCF_FORCE_READ 0x2 #define NA_SYNCF_FORCE_RECLAIM 0x4 #define NA_SYNCF_NETIF 0x8 /* netif normal sync */ @@ -316,7 +313,7 @@ struct nexus_adapter { */ int (*na_notify)(struct __kern_channel_ring *kring, struct proc *, uint32_t flags); -#define NA_NOTEF_MONITOR 0x1 +#define NA_NOTEF_UNUSED_1 0x1 #define NA_NOTEF_IN_KEVENT 0x2 #define NA_NOTEF_CAN_SLEEP 0x4 /* OK to block in kr_enter() */ #define NA_NOTEF_NETIF 0x8 /* same as NA_SYNCF_NETIF */ @@ -643,14 +640,13 @@ extern int na_interp_ringid(struct nexus_adapter *, ring_id_t, ring_set_t, uint32_t[NR_TXRX], uint32_t[NR_TXRX]); extern struct kern_pbufpool *na_kr_get_pp(struct nexus_adapter *, enum txrx); -extern int na_find(struct kern_channel *, struct kern_nexus *, - struct chreq *, struct kern_channel *, struct nxbind *, - struct proc *, struct nexus_adapter **, boolean_t); +extern int na_find(struct kern_channel *, struct kern_nexus *, struct chreq *, + struct nxbind *, struct proc *, struct nexus_adapter **, boolean_t); extern void na_retain_locked(struct nexus_adapter *na); extern int na_release_locked(struct nexus_adapter *na); extern int na_connect(struct kern_nexus *, struct kern_channel *, - struct chreq *, struct kern_channel *, struct nxbind *, struct proc *); + struct chreq *, struct nxbind *, struct proc *); extern void na_disconnect(struct kern_nexus *, struct kern_channel *); extern void na_defunct(struct kern_nexus *, struct kern_channel *, struct nexus_adapter *, boolean_t); @@ -671,15 +667,20 @@ extern bool na_flowadv_set(const struct kern_channel *, const flowadv_idx_t, const flowadv_token_t); extern bool na_flowadv_clear(const struct kern_channel *, const flowadv_idx_t, const flowadv_token_t); -extern int na_flowadv_report_ce_event(const struct kern_channel *ch, +extern int na_flowadv_report_congestion_event(const struct kern_channel *ch, const flowadv_idx_t fe_idx, const flowadv_token_t flow_token, - uint32_t ce_cnt, uint32_t total_pkt_cnt); + uint32_t congestion_cnt, uint32_t ce_cnt, uint32_t total_pkt_cnt); extern void na_flowadv_event(struct __kern_channel_ring *); extern void na_post_event(struct __kern_channel_ring *, boolean_t, boolean_t, boolean_t, uint32_t); extern void na_drain(struct nexus_adapter *, boolean_t); +#if SK_LOG +#define NA_DBGBUF_SIZE 256 +extern char * na2str(const struct nexus_adapter *na, char *__counted_by(dsz)dst, size_t dsz); +#endif /* SK_LOG */ + __END_DECLS #endif /* BSD_KERNEL_PRIVATE */ #endif /* _SKYWALK_NEXUS_ADAPTER_H_ */ diff --git a/bsd/skywalk/nexus/nexus_ioctl.c b/bsd/skywalk/nexus/nexus_ioctl.c index 12bb2eb6f..782b42f3b 100644 --- a/bsd/skywalk/nexus/nexus_ioctl.c +++ b/bsd/skywalk/nexus/nexus_ioctl.c @@ -39,6 +39,7 @@ nxioctl_check_entitlement(u_long cmd) } switch (cmd) { case NXIOC_ADD_TRAFFIC_RULE_INET: + case NXIOC_ADD_TRAFFIC_RULE_ETH: case NXIOC_REMOVE_TRAFFIC_RULE: entitled = IOCurrentTaskHasEntitlement( NXCTL_TRAFFIC_RULE_WRITE_ENTITLEMENT); @@ -48,12 +49,30 @@ nxioctl_check_entitlement(u_long cmd) NXCTL_TRAFFIC_RULE_READ_ENTITLEMENT); break; default: - SK_ERR("invalid command %x", cmd); + SK_ERR("invalid command %lx", cmd); return ENOTSUP; } return entitled ? 
0 : EPERM; } +static int +_nxioctl(struct nxctl *nxctl, u_long cmd, caddr_t data, proc_t procp) +{ + switch (cmd) { + case NXIOC_ADD_TRAFFIC_RULE_INET: + return nxioctl_add_traffic_rule_inet(nxctl, data, procp); + case NXIOC_ADD_TRAFFIC_RULE_ETH: + return nxioctl_add_traffic_rule_eth(nxctl, data, procp); + case NXIOC_REMOVE_TRAFFIC_RULE: + return nxioctl_remove_traffic_rule(nxctl, data, procp); + case NXIOC_GET_TRAFFIC_RULES: + return nxioctl_get_traffic_rules(nxctl, data, procp); + default: + SK_ERR("invalid command %lx", cmd); + return ENOTSUP; + } +} + int nxioctl(struct nxctl *nxctl, u_long cmd, caddr_t data, proc_t procp) { @@ -62,15 +81,11 @@ nxioctl(struct nxctl *nxctl, u_long cmd, caddr_t data, proc_t procp) if ((err = nxioctl_check_entitlement(cmd)) != 0) { return err; } - switch (cmd) { - case NXIOC_ADD_TRAFFIC_RULE_INET: - return nxioctl_add_traffic_rule_inet(nxctl, data, procp); - case NXIOC_REMOVE_TRAFFIC_RULE: - return nxioctl_remove_traffic_rule(nxctl, data, procp); - case NXIOC_GET_TRAFFIC_RULES: - return nxioctl_get_traffic_rules(nxctl, data, procp); - default: - SK_ERR("invalid command %x", cmd); - return ENOTSUP; - } + return _nxioctl(nxctl, cmd, data, procp); +} + +int +nxioctl_kernel(nexus_controller_t ncd, u_long cmd, caddr_t data, proc_t procp) +{ + return _nxioctl(ncd->ncd_nxctl, cmd, data, procp); } diff --git a/bsd/skywalk/nexus/nexus_ioctl.h b/bsd/skywalk/nexus/nexus_ioctl.h index bf32eb665..8d17e3564 100644 --- a/bsd/skywalk/nexus/nexus_ioctl.h +++ b/bsd/skywalk/nexus/nexus_ioctl.h @@ -35,7 +35,7 @@ * included by code implementing the nexus controller ioctl logic, * in particular, the Skywalk kernel and libsyscall code. */ -#include + #include #include @@ -45,7 +45,9 @@ #endif /* !LIBSYSCALL_INTERFACE */ #else extern int nxioctl(struct nxctl *, u_long, caddr_t, proc_t); +extern int nxioctl_kernel(struct nexus_controller *, u_long, caddr_t, proc_t); extern int nxioctl_add_traffic_rule_inet(struct nxctl *, caddr_t, proc_t); +extern int nxioctl_add_traffic_rule_eth(struct nxctl *, caddr_t, proc_t); extern int nxioctl_remove_traffic_rule(struct nxctl *, caddr_t, proc_t); extern int nxioctl_get_traffic_rules(struct nxctl *, caddr_t, proc_t); #endif /* !KERNEL */ @@ -67,6 +69,16 @@ struct nxctl_add_traffic_rule_inet_iocargs { #define NXIOC_ADD_TRAFFIC_RULE_INET \ _IOWR('n', 1, struct nxctl_add_traffic_rule_inet_iocargs) +struct nxctl_add_traffic_rule_eth_iocargs { + char atre_ifname[IFNAMSIZ]; + struct ifnet_traffic_descriptor_eth atre_td; + struct ifnet_traffic_rule_action_steer atre_ra; + uint32_t atre_flags; + uuid_t atre_uuid; +}; +#define NXIOC_ADD_TRAFFIC_RULE_ETH \ + _IOWR('n', 4, struct nxctl_add_traffic_rule_eth_iocargs) + struct nxctl_remove_traffic_rule_iocargs { uuid_t rtr_uuid; }; @@ -90,6 +102,11 @@ struct nxctl_traffic_rule_inet_iocinfo { struct ifnet_traffic_descriptor_inet tri_td; struct ifnet_traffic_rule_action_steer tri_ra; }; +struct nxctl_traffic_rule_eth_iocinfo { + struct nxctl_traffic_rule_generic_iocinfo tre_common; + struct ifnet_traffic_descriptor_eth tre_td; + struct ifnet_traffic_rule_action_steer tre_ra; +}; struct nxctl_get_traffic_rules_iocargs { uint8_t gtr_type; uint32_t gtr_size; diff --git a/bsd/skywalk/nexus/nexus_kern.c b/bsd/skywalk/nexus/nexus_kern.c index 0ac059291..a64a234f0 100644 --- a/bsd/skywalk/nexus/nexus_kern.c +++ b/bsd/skywalk/nexus/nexus_kern.c @@ -31,7 +31,6 @@ #include #include #include -#include static STAILQ_HEAD(, nxdom) nexus_domains = STAILQ_HEAD_INITIALIZER(nexus_domains); @@ -107,9 +106,6 @@ 
nxdom_attach_all(void) #if CONFIG_NEXUS_NETIF nxdom_attach(&nx_netif_dom_s); #endif /* CONFIG_NEXUS_NETIF */ -#if CONFIG_NEXUS_MONITOR - nxdom_attach(&nx_monitor_dom_s); -#endif /* CONFIG_NEXUS_MONITOR */ /* ask domains to initialize */ STAILQ_FOREACH(nxdom, &nexus_domains, nxdom_link) @@ -177,31 +173,6 @@ nxdom_attach(struct nxdom *nxdom) case NEXUS_TYPE_KERNEL_PIPE: case NEXUS_TYPE_NET_IF: case NEXUS_TYPE_FLOW_SWITCH: - case NEXUS_TYPE_MONITOR: - break; - - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - } - - /* verify this is a valid metadata type */ - switch (nxdom->nxdom_md_type) { - case NEXUS_META_TYPE_QUANTUM: - case NEXUS_META_TYPE_PACKET: - break; - - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - } - - /* verify this is a valid metadata subtype */ - switch (nxdom->nxdom_md_subtype) { - case NEXUS_META_SUBTYPE_PAYLOAD: - case NEXUS_META_SUBTYPE_RAW: break; default: @@ -394,7 +365,7 @@ nxdom_prov_add(struct nxdom *nxdom, nxdom_prov_retain_locked(nxdom_prov); } - SK_D("nxdom_prov 0x%llx (%s) dom %s", + SK_D("nxdom_prov %p (%s) dom %s", SK_KVA(nxdom_prov), nxdom_prov->nxdom_prov_name, nxdom->nxdom_name); } else { @@ -419,7 +390,7 @@ nxdom_prov_del(struct kern_nexus_domain_provider *nxdom_prov) return; } - SK_D("nxdom_prov 0x%llx (%s:%s)", SK_KVA(nxdom_prov), nxdom->nxdom_name, + SK_D("nxdom_prov %p (%s:%s)", SK_KVA(nxdom_prov), nxdom->nxdom_name, nxdom_prov->nxdom_prov_name); /* keep the reference around for the detaching list (see below) */ @@ -448,9 +419,9 @@ nxdom_prov_del(struct kern_nexus_domain_provider *nxdom_prov) static void nxdom_del_provider_final(struct kern_nexus_domain_provider *nxdom_prov) { -#if (DEBUG || DEVELOPMENT) +#if SK_LOG struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom; -#endif /* DEBUG || DEVELOPMENT */ +#endif /* SK_LOG */ SK_LOCK_ASSERT_HELD(); @@ -458,7 +429,7 @@ nxdom_del_provider_final(struct kern_nexus_domain_provider *nxdom_prov) NXDOMPROVF_DETACHING)) == NXDOMPROVF_DETACHING); ASSERT(nxdom != NULL); - SK_D("nxdom_prov 0x%llx (%s:%s)", SK_KVA(nxdom_prov), nxdom->nxdom_name, + SK_D("nxdom_prov %p (%s:%s)", SK_KVA(nxdom_prov), nxdom->nxdom_name, nxdom_prov->nxdom_prov_name); nxdom_prov->nxdom_prov_flags &= ~NXDOMPROVF_DETACHING; @@ -550,7 +521,7 @@ kern_nexus_register_domain_provider(const nexus_type_t type, struct nxdom *nxdom; errno_t err = 0; - _CASSERT(sizeof(*init) == sizeof(nxdom_prov->nxdom_prov_ext)); + static_assert(sizeof(*init) == sizeof(nxdom_prov->nxdom_prov_ext)); if (type >= NEXUS_TYPE_MAX || dom_prov_uuid == NULL) { return EINVAL; @@ -721,7 +692,7 @@ nxa_alloc(zalloc_flags_t how) static void nxa_free(struct nexus_attr *nxa) { - SK_DF(SK_VERB_MEM, "nxa 0x%llx FREE", SK_KVA(nxa)); + SK_DF(SK_VERB_MEM, "nxa %p FREE", SK_KVA(nxa)); zfree(nxa_zone, nxa); } @@ -784,7 +755,7 @@ ncd_alloc(zalloc_flags_t how) static void ncd_free(struct nexus_controller *ncd) { - SK_DF(SK_VERB_MEM, "ncd 0x%llx FREE", SK_KVA(ncd)); + SK_DF(SK_VERB_MEM, "ncd %p FREE", SK_KVA(ncd)); zfree(ncd_zone, ncd); } @@ -846,9 +817,8 @@ nexus_controller_register_provider_validate_init_params( errno_t err = 0; struct kern_nexus_netif_provider_init *netif_init; - _CASSERT(__builtin_offsetof(struct kern_nexus_provider_init, - nxpi_version) == 0); - _CASSERT(sizeof(init->nxpi_version) == sizeof(uint32_t)); + static_assert(__builtin_offsetof(struct kern_nexus_provider_init, nxpi_version) == 0); + static_assert(sizeof(init->nxpi_version) == sizeof(uint32_t)); if (init == NULL) { return 0; @@ -864,6 +834,8 @@ 
nexus_controller_register_provider_validate_init_params( err = EINVAL; break; } + ASSERT(init->nxpi_rx_sync_packets == NULL); + ASSERT(init->nxpi_tx_sync_packets == NULL); /* * sync_{tx,rx} callbacks are required; the rest of the * callback pairs are optional, but must be symmetrical. @@ -1737,7 +1709,7 @@ nxdom_prov_free(struct kern_nexus_domain_provider *nxdom_prov) uuid_clear(nxdom_prov->nxdom_prov_uuid); nxdom_prov->nxdom_prov_dom = NULL; - SK_DF(SK_VERB_MEM, "nxdom_prov 0x%llx %s", SK_KVA(nxdom_prov), + SK_DF(SK_VERB_MEM, "nxdom_prov %p %s", SK_KVA(nxdom_prov), ((nxdom_prov->nxdom_prov_flags & NXDOMPROVF_EXT) ? "FREE" : "DESTROY")); if (nxdom_prov->nxdom_prov_flags & NXDOMPROVF_EXT) { @@ -1961,16 +1933,14 @@ nxprov_params_adjust(struct kern_nexus_domain_provider *nxdom_prov, if (NXDOM_MIN(nxdom_min, capabilities) != 0 && !(capabs & NXDOM_MIN(nxdom_min, capabilities))) { - SK_ERR("%s: caps 0x%b < min 0x%b", - nxdom_prov->nxdom_prov_name, capabs, NXPCAP_BITS, - NXDOM_MIN(nxdom_min, capabilities), NXPCAP_BITS); + SK_ERR("%s: caps 0x%x < min 0x%x", nxdom_prov->nxdom_prov_name, + capabs, NXDOM_MIN(nxdom_min, capabilities)); err = EINVAL; goto error; } else if (NXDOM_MAX(nxdom_max, capabilities) != 0 && (capabs & ~NXDOM_MAX(nxdom_max, capabilities))) { - SK_ERR("%s: caps 0x%b > max 0x%b", - nxdom_prov->nxdom_prov_name, capabs, NXPCAP_BITS, - NXDOM_MAX(nxdom_max, capabilities), NXPCAP_BITS); + SK_ERR("%s: caps 0x%x > max 0x%x", nxdom_prov->nxdom_prov_name, + capabs, NXDOM_MAX(nxdom_max, capabilities)); err = EINVAL; goto error; } @@ -2066,21 +2036,21 @@ nxprov_params_adjust(struct kern_nexus_domain_provider *nxdom_prov, nxp->nxp_rx_slots = rx_slots; nxp->nxp_large_buf_size = large_buf_size; - SK_D("nxdom \"%s\" (0x%llx) type %d", + SK_D("nxdom \"%s\" (%p) type %d", nxdom_prov->nxdom_prov_dom->nxdom_name, SK_KVA(nxdom_prov->nxdom_prov_dom), nxdom_prov->nxdom_prov_dom->nxdom_type); - SK_D("nxp \"%s\" (0x%llx) flags 0x%b", - nxp->nxp_name, SK_KVA(nxp), nxp->nxp_flags, NXPF_BITS); - SK_D(" req 0x%b rings %u/%u/%u/%u/%u slots %u/%u/%u/%u/%u buf %u " + SK_D("nxp \"%s\" (%p) flags 0x%x", + nxp->nxp_name, SK_KVA(nxp), nxp->nxp_flags); + SK_D(" req 0x%x rings %u/%u/%u/%u/%u slots %u/%u/%u/%u/%u buf %u " "type %u subtype %u stats %u flowadv_max %u nexusadv_size %u " - "capabs 0x%b pipes %u extensions %u max_frags %u headguard %u " - "tailguard %u large_buf %u", req, NXPREQ_BITS, tx_rings, rx_rings, + "capabs 0x%x pipes %u extensions %u max_frags %u headguard %u " + "tailguard %u large_buf %u", req, tx_rings, rx_rings, alloc_rings, free_rings, ev_rings, tx_slots, rx_slots, alloc_slots, free_slots, ev_slots, nxp->nxp_buf_size, nxp->nxp_md_type, nxp->nxp_md_subtype, stats_size, flowadv_max, nexusadv_size, - capabs, NXPCAP_BITS, nxp->nxp_pipes, nxp->nxp_extensions, - nxp->nxp_max_frags, srp[SKMEM_REGION_GUARD_HEAD].srp_r_obj_size * + capabs, nxp->nxp_pipes, nxp->nxp_extensions, nxp->nxp_max_frags, + srp[SKMEM_REGION_GUARD_HEAD].srp_r_obj_size * srp[SKMEM_REGION_GUARD_HEAD].srp_r_obj_cnt, srp[SKMEM_REGION_GUARD_TAIL].srp_r_obj_size * srp[SKMEM_REGION_GUARD_TAIL].srp_r_obj_cnt, @@ -2187,8 +2157,7 @@ nxprov_params_adjust(struct kern_nexus_domain_provider *nxdom_prov, /* flow advisory region size */ if (flowadv_max != 0) { - _CASSERT(NX_FLOWADV_DEFAULT * sizeof(struct __flowadv_entry) <= - SKMEM_MIN_SEG_SIZE); + static_assert(NX_FLOWADV_DEFAULT * sizeof(struct __flowadv_entry) <= SKMEM_MIN_SEG_SIZE); MUL(sizeof(struct __flowadv_entry), flowadv_max, &tmp1); srp[SKMEM_REGION_FLOWADV].srp_r_obj_size = tmp1; 
srp[SKMEM_REGION_FLOWADV].srp_r_obj_cnt = 1; diff --git a/bsd/skywalk/nexus/nexus_syscalls.c b/bsd/skywalk/nexus/nexus_syscalls.c index 3458f8d80..564b384d8 100644 --- a/bsd/skywalk/nexus/nexus_syscalls.c +++ b/bsd/skywalk/nexus/nexus_syscalls.c @@ -91,7 +91,7 @@ __nexus_open(struct proc *p, struct __nexus_open_args *uap, int *retval) if (__improbable(uap->init == USER_ADDR_NULL || uap->init_len < sizeof(init))) { - SK_DSC(p, "EINVAL: init %p, init_len %u", uap->init, + SK_PERR(p, "EINVAL: init 0x%llx, init_len %u", uap->init, uap->init_len); err = EINVAL; goto done; @@ -99,12 +99,12 @@ __nexus_open(struct proc *p, struct __nexus_open_args *uap, int *retval) err = copyin(uap->init, (caddr_t)&init, sizeof(init)); if (__improbable(err != 0)) { - SK_DSC(p, "copyin err %d, init 0x%llx", err, SK_KVA(uap->init)); + SK_PERR(p, "copyin err %d, init %p", err, SK_KVA(uap->init)); goto done; } if (__improbable(init.ni_version != NEXUSCTL_INIT_CURRENT_VERSION)) { - SK_DSC(p, "ENOTSUP: version %u != %u", init.ni_version, + SK_PERR(p, "ENOTSUP: version %u != %u", init.ni_version, NEXUSCTL_INIT_CURRENT_VERSION); err = ENOTSUP; goto done; @@ -117,14 +117,14 @@ __nexus_open(struct proc *p, struct __nexus_open_args *uap, int *retval) err = falloc_guarded(p, &fp, &fd, vfs_context_current(), &guard, GUARD_CLOSE | GUARD_DUP | GUARD_SOCKET_IPC | GUARD_FILEPORT | GUARD_WRITE); if (__improbable(err != 0)) { - SK_DSC(p, "falloc_guarded err %d", err); + SK_PERR(p, "falloc_guarded err %d", err); goto done; } nxctl = nxctl_create(p, fp, nxctl_uuid, &err); if (__improbable(nxctl == NULL)) { ASSERT(err != 0); - SK_DSC(p, "nxctl_create err %d", err); + SK_PERR(p, "nxctl_create err %d", err); goto done; } @@ -132,7 +132,7 @@ __nexus_open(struct proc *p, struct __nexus_open_args *uap, int *retval) init.ni_guard = guard; err = copyout(&init, uap->init, sizeof(init)); if (__improbable(err != 0)) { - SK_DSC(p, "copyout err %d, init 0x%llx", err, + SK_PERR(p, "copyout err %d, init %p", err, SK_KVA(uap->init)); goto done; } @@ -150,7 +150,7 @@ __nexus_open(struct proc *p, struct __nexus_open_args *uap, int *retval) *retval = fd; SK_D("%s(%d) fd %d guard 0x%llx", - sk_proc_name_address(p), sk_proc_pid(p), fd, guard); + sk_proc_name(p), sk_proc_pid(p), fd, guard); done: if (__improbable(err != 0)) { @@ -182,7 +182,7 @@ __nexus_register(struct proc *p, struct __nexus_register_args *uap, int *retval) if (__improbable(uap->reg == USER_ADDR_NULL || uap->reg_len < sizeof(reg) || uap->prov_uuid == USER_ADDR_NULL || uap->prov_uuid_len < sizeof(uuid_t))) { - SK_DSC(p, "EINVAL: reg 0x%llx, reg_len %u, prov_uuid 0x%llx, " + SK_PERR(p, "EINVAL: reg %p, reg_len %u, prov_uuid %p, " "prov_uuid_len %u", SK_KVA(uap->reg), uap->reg_len, SK_KVA(uap->prov_uuid), uap->prov_uuid_len); return EINVAL; @@ -190,25 +190,25 @@ __nexus_register(struct proc *p, struct __nexus_register_args *uap, int *retval) err = copyin(uap->reg, (caddr_t)®, sizeof(reg)); if (err != 0) { - SK_DSC(p, "copyin err %d, reg 0x%llx", err, SK_KVA(uap->reg)); + SK_PERR(p, "copyin err %d, reg %p", err, SK_KVA(uap->reg)); return err; } if (__improbable(reg.nxpreg_version != NXPROV_REG_CURRENT_VERSION)) { - SK_DSC(p, "EINVAL: version %u != %u", reg.nxpreg_version, + SK_PERR(p, "EINVAL: version %u != %u", reg.nxpreg_version, NXPROV_REG_CURRENT_VERSION); return EINVAL; } if (__improbable(reg.nxpreg_params.nxp_namelen == 0 || reg.nxpreg_params.nxp_namelen > sizeof(nexus_name_t))) { - SK_DSC(p, "EINVAL: namelen %u", reg.nxpreg_params.nxp_namelen); + SK_PERR(p, "EINVAL: namelen %u", 
reg.nxpreg_params.nxp_namelen); return EINVAL; } err = fp_get_ftype(p, uap->ctl, DTYPE_NEXUS, ENODEV, &fp); if (__improbable(err != 0)) { - SK_DSC(p, "fp_get_ftype: %d", err); + SK_PERR(p, "fp_get_ftype: %d", err); return err; } nxctl = (struct nxctl *)fp_get_data(fp); @@ -218,13 +218,13 @@ __nexus_register(struct proc *p, struct __nexus_register_args *uap, int *retval) lck_mtx_unlock(&nxctl->nxctl_lock); if (__improbable(nxprov == NULL)) { ASSERT(err != 0); - SK_DSC(p, "nxprov_create: %d", err); + SK_PERR(p, "nxprov_create: %d", err); goto done; } err = copyout(&nxprov->nxprov_uuid, uap->prov_uuid, sizeof(uuid_t)); if (__improbable(err != 0)) { - SK_DSC(p, "copyout err %d, prov_uuid 0x%llx", err, + SK_PERR(p, "copyout err %d, prov_uuid %p", err, SK_KVA(uap->prov_uuid)); goto done; } @@ -257,26 +257,26 @@ __nexus_deregister(struct proc *p, struct __nexus_deregister_args *uap, AUDIT_ARG(fd, uap->ctl); if (__improbable(uap->prov_uuid_len < sizeof(uuid_t))) { - SK_DSC(p, "EINVAL: prov_len %u < %u", uap->prov_uuid_len, + SK_PERR(p, "EINVAL: prov_len %u < %lu", uap->prov_uuid_len, sizeof(uuid_t)); return EINVAL; } err = copyin(uap->prov_uuid, (caddr_t)&nxprov_uuid, sizeof(uuid_t)); if (__improbable(err != 0)) { - SK_DSC(p, "copyin err %d, prov_uuid 0x%llx", err, + SK_PERR(p, "copyin err %d, prov_uuid %p", err, SK_KVA(uap->prov_uuid)); return err; } if (__improbable(uuid_is_null(nxprov_uuid))) { - SK_DSC(p, "EINVAL: uuid_is_null"); + SK_PERR(p, "EINVAL: uuid_is_null"); return EINVAL; } err = fp_get_ftype(p, uap->ctl, DTYPE_NEXUS, ENODEV, &fp); if (__improbable(err != 0)) { - SK_DSC(p, "fp_get_ftype: %d", err); + SK_PERR(p, "fp_get_ftype: %d", err); return err; } nxctl = (struct nxctl *)fp_get_data(fp); @@ -305,27 +305,27 @@ __nexus_create(struct proc *p, struct __nexus_create_args *uap, int *retval) if (__improbable(uap->prov_uuid_len < sizeof(uuid_t) || uap->nx_uuid_len < sizeof(uuid_t) || uap->nx_uuid == USER_ADDR_NULL)) { - SK_DSC(p, "EINVAL: prov_uuid_len %u, nx_uuid_len %u, " - "nx_uuid 0x%llx", uap->prov_uuid_len, uap->nx_uuid_len, + SK_PERR(p, "EINVAL: prov_uuid_len %u, nx_uuid_len %u, " + "nx_uuid %p", uap->prov_uuid_len, uap->nx_uuid_len, SK_KVA(uap->nx_uuid)); return EINVAL; } err = copyin(uap->prov_uuid, (caddr_t)&nxprov_uuid, sizeof(uuid_t)); if (__improbable(err != 0)) { - SK_DSC(p, "copyin err %d, prov_uuid 0x%llx", err, + SK_PERR(p, "copyin err %d, prov_uuid %p", err, SK_KVA(uap->prov_uuid)); return err; } if (__improbable(uuid_is_null(nxprov_uuid))) { - SK_DSC(p, "EINVAL: uuid_is_null"); + SK_PERR(p, "EINVAL: uuid_is_null"); return EINVAL; } err = fp_get_ftype(p, uap->ctl, DTYPE_NEXUS, ENODEV, &fp); if (__improbable(err != 0)) { - SK_DSC(p, "fp_get_ftype: %d", err); + SK_PERR(p, "fp_get_ftype: %d", err); return err; } nxctl = (struct nxctl *)fp_get_data(fp); @@ -336,12 +336,12 @@ __nexus_create(struct proc *p, struct __nexus_create_args *uap, int *retval) lck_mtx_unlock(&nxctl->nxctl_lock); if (__improbable(nx == NULL)) { ASSERT(err != 0); - SK_DSC(p, "nx_create: %d", err); + SK_PERR(p, "nx_create: %d", err); goto done; } err = copyout(&nx->nx_uuid, uap->nx_uuid, sizeof(uuid_t)); if (__improbable(err != 0)) { - SK_DSC(p, "copyout err %d, nx_uuid 0x%llx", err, + SK_PERR(p, "copyout err %d, nx_uuid %p", err, SK_KVA(uap->nx_uuid)); goto done; } @@ -370,26 +370,26 @@ __nexus_destroy(struct proc *p, struct __nexus_destroy_args *uap, int *retval) if (__improbable(uap->nx_uuid == USER_ADDR_NULL || uap->nx_uuid_len < sizeof(uuid_t))) { - SK_DSC(p, "EINVAL: nx_uuid 0x%llx, nx_uuid_len 
%u", + SK_PERR(p, "EINVAL: nx_uuid %p, nx_uuid_len %u", SK_KVA(uap->nx_uuid), uap->nx_uuid_len); return EINVAL; } err = copyin(uap->nx_uuid, (caddr_t)&nx_uuid, sizeof(uuid_t)); if (__improbable(err != 0)) { - SK_DSC(p, "copyin err %d, nx_uuid 0x%llx", err, + SK_PERR(p, "copyin err %d, nx_uuid %p", err, SK_KVA(uap->nx_uuid)); return err; } if (__improbable(uuid_is_null(nx_uuid))) { - SK_DSC(p, "EINVAL: uuid_is_null"); + SK_PERR(p, "EINVAL: uuid_is_null"); return EINVAL; } err = fp_get_ftype(p, uap->ctl, DTYPE_NEXUS, ENODEV, &fp); if (__improbable(err != 0)) { - SK_DSC(p, "fp_get_ftype: %d", err); + SK_PERR(p, "fp_get_ftype: %d", err); return err; } nxctl = (struct nxctl *)fp_get_data(fp); @@ -417,13 +417,13 @@ __nexus_get_opt(struct proc *p, struct __nexus_get_opt_args *uap, int *retval) err = fp_get_ftype(p, uap->ctl, DTYPE_NEXUS, ENODEV, &fp); if (__improbable(err != 0)) { - SK_DSC(p, "fp_get_ftype: %d", err); + SK_PERR(p, "fp_get_ftype: %d", err); return err; } nxctl = (struct nxctl *)fp_get_data(fp); if (__improbable(uap->aoptlen == USER_ADDR_NULL)) { - SK_DSC(p, "EINVAL: aoptlen == USER_ADDR_NULL"); + SK_PERR(p, "EINVAL: aoptlen == USER_ADDR_NULL"); err = EINVAL; goto done; } @@ -431,7 +431,7 @@ __nexus_get_opt(struct proc *p, struct __nexus_get_opt_args *uap, int *retval) if (uap->aoptval != USER_ADDR_NULL) { err = copyin(uap->aoptlen, &optlen, sizeof(optlen)); if (__improbable(err != 0)) { - SK_DSC(p, "copyin err %d, aoptlen 0x%llx", err, + SK_PERR(p, "copyin err %d, aoptlen %p", err, SK_KVA(uap->aoptlen)); goto done; } @@ -454,7 +454,7 @@ __nexus_get_opt(struct proc *p, struct __nexus_get_opt_args *uap, int *retval) err = copyout(&optlen, uap->aoptlen, sizeof(optlen)); #if SK_LOG if (__improbable(err != 0)) { - SK_DSC(p, "copyout err %d, aoptlen 0x%llx", err, + SK_PERR(p, "copyout err %d, aoptlen %p", err, SK_KVA(uap->aoptlen)); } #endif /* SK_LOG */ @@ -487,7 +487,7 @@ __nexus_set_opt(struct proc *p, struct __nexus_set_opt_args *uap, int *retval) err = fp_get_ftype(p, uap->ctl, DTYPE_NEXUS, ENODEV, &fp); if (__improbable(err != 0)) { - SK_DSC(p, "fp_get_ftype: %d", err); + SK_PERR(p, "fp_get_ftype: %d", err); return err; } nxctl = (struct nxctl *)fp_get_data(fp); diff --git a/bsd/skywalk/nexus/nexus_traffic_rule.c b/bsd/skywalk/nexus/nexus_traffic_rule.c index 2ec7fe6ca..d10333b79 100644 --- a/bsd/skywalk/nexus/nexus_traffic_rule.c +++ b/bsd/skywalk/nexus/nexus_traffic_rule.c @@ -25,116 +25,9 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include -#include -#include -#include -#include -#include -#include -/* - * Implementation of nexus traffic rules APIs. - */ - -struct nxctl_traffic_rule_type; -struct nxctl_traffic_rule; - -/* - * These callbacks need to be implemented for each rule type. - */ - -/* Validate user provided parameters. */ -typedef int (nxctl_traffic_rule_validate_cb_t)( - struct nxctl_traffic_rule_type *type, - const char *ifname, - struct ifnet_traffic_descriptor_common *td, - struct ifnet_traffic_rule_action *ra); -/* - * Each rule type has its own global structure for storing rules. - * These callbacks access this global structure. 
- */ -#define NTR_FIND_FLAG_EXACT 0x0001 -typedef int (nxctl_traffic_rule_find_cb_t)( - struct nxctl_traffic_rule_type *type, - const char *ifname, - struct ifnet_traffic_descriptor_common *td, - uint32_t flags, - struct nxctl_traffic_rule **ntrp); - -typedef int (nxctl_traffic_rule_find_by_uuid_cb_t)( - struct nxctl_traffic_rule_type *type, - uuid_t uuid, - struct nxctl_traffic_rule **ntrp); - -typedef void (nxctl_traffic_rule_link_cb_t)( - struct nxctl_traffic_rule *ntr); - -typedef void (nxctl_traffic_rule_unlink_cb_t)( - struct nxctl_traffic_rule *ntr); - -/* - * Notifies lower layers of the addition/removal of a rule. - * This is called outside of nxctl_traffic_rule_lock to avoid potential - * locking issues. - */ -#define NTR_NOTIFY_FLAG_ADD 0x0001 -#define NTR_NOTIFY_FLAG_REMOVE 0x0002 -typedef int (nxctl_traffic_rule_notify_cb_t)( - struct nxctl_traffic_rule *ntr, - uint32_t flags); - -/* - * Create/Destroy callbacks for a rule type. - */ -typedef int (nxctl_traffic_rule_create_cb_t)( - struct nxctl_traffic_rule_type *type, - const char *ifname, - struct ifnet_traffic_descriptor_common *td, - struct ifnet_traffic_rule_action *ra, - uint32_t flags, - struct nxctl_traffic_rule **ntrp); - -typedef void (nxctl_traffic_rule_destroy_cb_t)( - struct nxctl_traffic_rule *ntr); - -/* - * This is used for copying all rules for a type (including generic - * and type-specific info) to userspace. - */ -typedef int (nxctl_traffic_rule_get_all_cb_t)( - struct nxctl_traffic_rule_type *type, - uint32_t size, - uint32_t *count, - user_addr_t uaddr); - -struct nxctl_traffic_rule_type { - uint8_t ntrt_type; - nxctl_traffic_rule_validate_cb_t *ntrt_validate; - nxctl_traffic_rule_find_cb_t *ntrt_find; - nxctl_traffic_rule_find_by_uuid_cb_t *ntrt_find_by_uuid; - nxctl_traffic_rule_link_cb_t *ntrt_link; - nxctl_traffic_rule_unlink_cb_t *ntrt_unlink; - nxctl_traffic_rule_notify_cb_t *ntrt_notify; - nxctl_traffic_rule_create_cb_t *ntrt_create; - nxctl_traffic_rule_destroy_cb_t *ntrt_destroy; - nxctl_traffic_rule_get_all_cb_t *ntrt_get_all; - /* - * -fbounds-safety: Why is this void *? 
All usage of this was for - * struct nxctl_traffic_rule_inet_storage * - */ - struct nxctl_traffic_rule_inet_storage *ntrt_storage; -}; - -static nxctl_traffic_rule_validate_cb_t inet_traffic_rule_validate; -static nxctl_traffic_rule_find_cb_t inet_traffic_rule_find; -static nxctl_traffic_rule_find_by_uuid_cb_t inet_traffic_rule_find_by_uuid; -static nxctl_traffic_rule_link_cb_t inet_traffic_rule_link; -static nxctl_traffic_rule_unlink_cb_t inet_traffic_rule_unlink; -static nxctl_traffic_rule_notify_cb_t inet_traffic_rule_notify; -static nxctl_traffic_rule_create_cb_t inet_traffic_rule_create; -static nxctl_traffic_rule_destroy_cb_t inet_traffic_rule_destroy; -static nxctl_traffic_rule_get_all_cb_t inet_traffic_rule_get_all; +#include +#include static struct nxctl_traffic_rule_type nxctl_rule_types[] = { { @@ -148,102 +41,25 @@ static struct nxctl_traffic_rule_type nxctl_rule_types[] = { .ntrt_create = inet_traffic_rule_create, .ntrt_destroy = inet_traffic_rule_destroy, .ntrt_get_all = inet_traffic_rule_get_all, + .ntrt_get_count = inet_traffic_rule_get_count, + }, + { + .ntrt_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_ETH, + .ntrt_validate = eth_traffic_rule_validate, + .ntrt_find = eth_traffic_rule_find, + .ntrt_find_by_uuid = eth_traffic_rule_find_by_uuid, + .ntrt_link = eth_traffic_rule_link, + .ntrt_unlink = eth_traffic_rule_unlink, + .ntrt_notify = eth_traffic_rule_notify, + .ntrt_create = eth_traffic_rule_create, + .ntrt_destroy = eth_traffic_rule_destroy, + .ntrt_get_all = eth_traffic_rule_get_all, + .ntrt_get_count = eth_traffic_rule_get_count, }, }; #define NRULETYPES \ (sizeof(nxctl_rule_types)/sizeof(struct nxctl_traffic_rule_type)) -/* - * Generic traffic rule. - * Contains fields common to all traffic rules. - */ -#define NTR_FLAG_PERSIST 0x0001 -#define NTR_FLAG_ON_NXCTL_LIST 0x0002 -struct nxctl_traffic_rule { - struct nxctl_traffic_rule_type *ntr_type; - uint32_t ntr_flags; - os_refcnt_t ntr_refcnt; - uuid_t ntr_uuid; - char ntr_procname[NTR_PROCNAME_SZ]; - char ntr_ifname[IFNAMSIZ]; - SLIST_ENTRY(nxctl_traffic_rule) ntr_storage_link; -}; - -/* - * Inet-specific traffic rule. - */ -struct nxctl_traffic_rule_inet { - struct nxctl_traffic_rule ntri_common; - SLIST_ENTRY(nxctl_traffic_rule_inet) ntri_storage_link; - struct ifnet_traffic_descriptor_inet ntri_td; - struct ifnet_traffic_rule_action_steer ntri_ra; -}; - -/* - * Currently supported tuple types. - */ -#define ITDBIT(set, bit) (((set) != 0) ? 
(bit) : 0) -#define ITRM(proto, laddr, raddr, lport, rport) \ - (IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER | \ - ITDBIT(proto, IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO) | \ - ITDBIT(laddr, IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR) | \ - ITDBIT(raddr, IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR) | \ - ITDBIT(lport, IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT) | \ - ITDBIT(rport, IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT)) - -static uint8_t nxctl_inet_traffic_rule_masks[] = { - ITRM(1, 1, 1, 1, 1), - ITRM(1, 1, 1, 1, 0), - ITRM(1, 1, 1, 0, 1), - ITRM(1, 1, 1, 0, 0), - ITRM(1, 1, 0, 1, 1), - ITRM(1, 1, 0, 1, 0), - ITRM(1, 1, 0, 0, 1), - ITRM(1, 1, 0, 0, 0), - ITRM(1, 0, 1, 1, 1), - ITRM(1, 0, 1, 1, 0), - ITRM(1, 0, 1, 0, 1), - ITRM(1, 0, 1, 0, 0), - ITRM(1, 0, 0, 1, 1), - ITRM(1, 0, 0, 1, 0), - ITRM(1, 0, 0, 0, 1), - // ITRM(1, 0, 0, 0, 0), addr or port is required - ITRM(0, 1, 1, 1, 1), - ITRM(0, 1, 1, 1, 0), - ITRM(0, 1, 1, 0, 1), - ITRM(0, 1, 1, 0, 0), - ITRM(0, 1, 0, 1, 1), - ITRM(0, 1, 0, 1, 0), - ITRM(0, 1, 0, 0, 1), - ITRM(0, 1, 0, 0, 0), - ITRM(0, 0, 1, 1, 1), - ITRM(0, 0, 1, 1, 0), - ITRM(0, 0, 1, 0, 1), - ITRM(0, 0, 1, 0, 0), - ITRM(0, 0, 0, 1, 1), - ITRM(0, 0, 0, 1, 0), - ITRM(0, 0, 0, 0, 1), - // ITRM(0, 0, 0, 0, 0), -}; -#define NINETRULEMASKS \ - (sizeof(nxctl_inet_traffic_rule_masks)/sizeof(uint8_t)) - -/* Per-interface lists of traffic rules */ -SLIST_HEAD(nxctl_traffic_rule_inet_head, nxctl_traffic_rule_inet); -struct nxctl_traffic_rule_inet_if { - char rii_ifname[IFNAMSIZ]; - struct nxctl_traffic_rule_inet_head rii_lists[NINETRULEMASKS]; - uint32_t rii_count; - SLIST_ENTRY(nxctl_traffic_rule_inet_if) rii_link; -}; - -/* List of per-interface lists */ -SLIST_HEAD(nxctl_traffic_rule_inet_if_head, nxctl_traffic_rule_inet_if); -struct nxctl_traffic_rule_inet_storage { - struct nxctl_traffic_rule_inet_if_head ris_if_list; - uint32_t ris_count; -}; - /* Per-fd list kept at the nxctl */ SLIST_HEAD(nxctl_traffic_rule_head, nxctl_traffic_rule); struct nxctl_traffic_rule_storage { @@ -252,282 +68,44 @@ struct nxctl_traffic_rule_storage { }; static LCK_RW_DECLARE_ATTR(nxctl_traffic_rule_lock, &sk_lock_group, &sk_lock_attr); -#define NXTR_WLOCK() \ - lck_rw_lock_exclusive(&nxctl_traffic_rule_lock) -#define NXTR_WUNLOCK() \ - lck_rw_unlock_exclusive(&nxctl_traffic_rule_lock) -#define NXTR_RLOCK() \ - lck_rw_lock_shared(&nxctl_traffic_rule_lock) -#define NXTR_RUNLOCK() \ - lck_rw_unlock_shared(&nxctl_traffic_rule_lock) + +SK_INLINE_ATTRIBUTE +void +nxtr_wlock(void) +{ + lck_rw_lock_exclusive(&nxctl_traffic_rule_lock); +} + +SK_INLINE_ATTRIBUTE +void +nxtr_wunlock(void) +{ + lck_rw_unlock_exclusive(&nxctl_traffic_rule_lock); +} + +SK_INLINE_ATTRIBUTE +void +nxtr_rlock(void) +{ + lck_rw_lock_shared(&nxctl_traffic_rule_lock); +} + +SK_INLINE_ATTRIBUTE +void +nxtr_runlock(void) +{ + lck_rw_unlock_shared(&nxctl_traffic_rule_lock); +} static struct nxctl_traffic_rule_type *find_traffic_rule_type(uint8_t type); -static void retain_traffic_rule(struct nxctl_traffic_rule *ntr); -static void release_traffic_rule(struct nxctl_traffic_rule *ntr); static int remove_traffic_rule(struct nxctl *nxctl, uuid_t uuid, struct nxctl_traffic_rule **ntrp); -static boolean_t inet_v6addr_cmp(struct ifnet_ip_addr *a1, - struct ifnet_ip_addr *a2); static int notify_traffic_rule(struct nxctl_traffic_rule *ntr, uint32_t flags); #define NXCTL_TRAFFIC_RULE_TAG "com.apple.skywalk.nexus.traffic_rule" static kern_allocation_name_t nxctl_traffic_rule_tag; static struct nxctl_traffic_rule_type *inet_traffic_rule_type = NULL; - -/* - * If an interface attaches 
after rule(s) are added, this function is used - * retrieve the current rule count for that interface. - */ -int -nxctl_inet_traffic_rule_get_count(const char *ifname, uint32_t *count) -{ - struct nxctl_traffic_rule_inet_storage *rs; - struct nxctl_traffic_rule_inet_if *rif; - int err; - - NXTR_RLOCK(); - rs = inet_traffic_rule_type->ntrt_storage; - if (rs == NULL) { - err = ENOENT; - goto fail; - } - SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { - if (strlcmp(rif->rii_ifname, ifname, sizeof(rif->rii_ifname)) == 0) { - break; - } - } - if (rif == NULL) { - err = ENOENT; - goto fail; - } - *count = rif->rii_count; - NXTR_RUNLOCK(); - return 0; -fail: - NXTR_RUNLOCK(); - return err; -} - -/* - * Used for finding the qset id associated with a traffic descriptor. - */ -int -nxctl_inet_traffic_rule_find_qset_id(const char *ifname, - struct ifnet_traffic_descriptor_inet *td, uint64_t *qset_id) -{ - struct nxctl_traffic_rule_inet *__single ntri = NULL; - struct nxctl_traffic_rule *__single ntr = NULL; - int err; - - NXTR_RLOCK(); - ASSERT(inet_traffic_rule_type != NULL); - err = inet_traffic_rule_type->ntrt_find(inet_traffic_rule_type, ifname, - (struct ifnet_traffic_descriptor_common *)td, 0, &ntr); - if (err != 0) { - SK_ERR("rule find failed: %d", err); - goto fail; - } - ntri = __container_of(ntr, struct nxctl_traffic_rule_inet, ntri_common); - *qset_id = ntri->ntri_ra.ras_qset_id; - NXTR_RUNLOCK(); - return 0; -fail: - NXTR_RUNLOCK(); - return err; -} - -/* - * Based on flow_pkt_classify(). - * This function populates struct ifnet_traffic_descriptor_inet instead of struct __flow. - */ -static int -fill_inet_td(struct __kern_packet *pkt, struct ifnet_traffic_descriptor_inet *td) -{ - union { - volatile struct ip *__indexable _iph; - volatile struct ip6_hdr *__indexable _ip6; - } _l3; - #define iph _l3._iph - #define ip6 _l3._ip6 - union { - volatile struct tcphdr *_tcph; - volatile struct udphdr *_udph; - } _l4; - #define tcph _l4._tcph - #define udph _l4._udph - uint8_t *pkt_buf, *l3_hdr; - uint32_t bdlen, bdlim, bdoff, cls_len; - size_t pkt_len; - uint8_t ipv, l3hlen = 0; /* IP header length */ - uint16_t l3tlen = 0; /* total length of IP packet */ - uint8_t l4hlen = 0; /* TCP/UDP header length */ - uint16_t ulen = 0; /* user data length */ - int err; - - ASSERT(pkt->pkt_l2_len <= pkt->pkt_length); - pkt_len = pkt->pkt_length - pkt->pkt_l2_len; - - MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff); - cls_len = bdlim - bdoff; - cls_len -= pkt->pkt_l2_len; - cls_len = (uint32_t)MIN(cls_len, pkt_len); - VERIFY(pkt_len >= cls_len); - if (cls_len == 0) { - SK_ERR("cls_len == 0"); - err = EINVAL; - goto fail; - } - l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len; - iph = (volatile struct ip *)(void *)l3_hdr; - ipv = iph->ip_v; - - switch (ipv) { - case 4: - if (cls_len < sizeof(struct ip)) { - SK_ERR("cls_len < sizeof(struct ip) (%d < %d)", - cls_len, sizeof(struct ip)); - err = EINVAL; - goto fail; - } - l3hlen = (uint8_t)(iph->ip_hl << 2); - if (l3hlen < sizeof(struct ip)) { - SK_ERR("l3hlen < sizeof(struct ip) (%d < %d)", - l3hlen, sizeof(struct ip)); - err = EINVAL; - goto fail; - } - if (cls_len < l3hlen) { - SK_ERR("cls_len < l3hlen (%d < %d)", cls_len, l3hlen); - err = EINVAL; - goto fail; - } - l3tlen = ntohs(iph->ip_len); - if (l3tlen < l3hlen) { - SK_ERR("l3tlen < l3hlen (%d < %d)", l3tlen, l3hlen); - err = EINVAL; - goto fail; - } - if (pkt_len < l3tlen) { - SK_ERR("pkt_len < l3tlen (%d < %d)", pkt_len, l3tlen); - err = EINVAL; - goto fail; - } - td->inet_ipver = 
IPVERSION; - td->inet_proto = iph->ip_p; - bcopy(__DECONST(void *, &iph->ip_src), &td->inet_laddr.iia_v4addr, - sizeof(iph->ip_src)); - bcopy(__DECONST(void *, &iph->ip_dst), &td->inet_raddr.iia_v4addr, - sizeof(iph->ip_dst)); - break; - case 6: - l3hlen = sizeof(struct ip6_hdr); - if (cls_len < l3hlen) { - SK_ERR("cls_len < l3hlen (%d < %d)", cls_len, l3hlen); - err = EINVAL; - goto fail; - } - l3tlen = l3hlen + ntohs(ip6->ip6_plen); - if (pkt_len < l3tlen) { - SK_ERR("pkt_len < l3tlen (%d < %d)", pkt_len, l3tlen); - err = EINVAL; - goto fail; - } - td->inet_ipver = IPV6_VERSION; - td->inet_proto = ip6->ip6_nxt; - bcopy(__DECONST(void *, &ip6->ip6_src), &td->inet_laddr, - sizeof(ip6->ip6_src)); - bcopy(__DECONST(void *, &ip6->ip6_dst), &td->inet_raddr, - sizeof(ip6->ip6_dst)); - break; - default: - SK_ERR("ipv == %d", ipv); - err = EINVAL; - goto fail; - } - tcph = __DECONST(volatile struct tcphdr *, (volatile uint8_t *)iph + l3hlen); - ulen = (l3tlen - l3hlen); - if (td->inet_proto == IPPROTO_TCP) { - if (cls_len < l3hlen + sizeof(*tcph) || ulen < sizeof(*tcph)) { - SK_ERR("cls_len < l3hlen + sizeof(*tcph) || ulen < sizeof(*tcph) " - "(%d < %d + %d || %d < %d)", cls_len, l3hlen, sizeof(*tcph), - ulen, sizeof(*tcph)); - err = EINVAL; - goto fail; - } - l4hlen = (uint8_t)(tcph->th_off << 2); - if (l4hlen < sizeof(*tcph)) { - SK_ERR("l4hlen < sizeof(*tcph) (%d < %d)", l4hlen, sizeof(*tcph)); - err = EINVAL; - goto fail; - } - if (l4hlen > ulen) { - SK_ERR("l4hlen > ulen (%d > %d)", l4hlen, ulen); - err = EINVAL; - goto fail; - } - bcopy(__DECONST(void *, &tcph->th_sport), &td->inet_lport, - sizeof(td->inet_lport)); - bcopy(__DECONST(void *, &tcph->th_dport), &td->inet_rport, - sizeof(td->inet_rport)); - } else if (td->inet_proto == IPPROTO_UDP) { - if (cls_len < l3hlen + sizeof(*udph) || ulen < sizeof(*udph)) { - SK_ERR("cls_len < l3hlen + sizeof(*udph) || ulen < sizeof(*udph) " - "(%d < %d + %d || %d < %d)", cls_len, l3hlen, sizeof(*udph), - ulen, sizeof(*udph)); - err = EINVAL; - goto fail; - } - l4hlen = sizeof(*udph); - if (l4hlen > ulen) { - SK_ERR("l4hlen > ulen (%d > %d)", l4hlen, ulen); - err = EINVAL; - goto fail; - } - bcopy(__DECONST(void *, &udph->uh_sport), &td->inet_lport, - sizeof(td->inet_lport)); - bcopy(__DECONST(void *, &udph->uh_dport), &td->inet_rport, - sizeof(td->inet_rport)); - } else { - err = ENOTSUP; - goto fail; - } - - td->inet_common.itd_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_INET; - td->inet_common.itd_len = sizeof(*td); - td->inet_common.itd_flags = IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND | - IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND; - td->inet_mask |= (IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER | - IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO | - IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR | - IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR | - IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT | - IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT); - return 0; -fail: - DTRACE_SKYWALK5(classify__failed, struct ip *, iph, size_t, pkt_len, - uint8_t, pkt->pkt_l2_len, struct ifnet_traffic_descriptor_inet *, td, - int, err); - bzero(td, sizeof(*td)); - return err; - #undef iph - #undef ip6 - #undef tcph - #undef udph -} - -int -nxctl_inet_traffic_rule_find_qset_id_with_pkt(const char *ifname, - struct __kern_packet *pkt, uint64_t *qset_id) -{ - struct ifnet_traffic_descriptor_inet td; - int err; - - err = fill_inet_td(pkt, &td); - if (err != 0) { - return err; - } - return nxctl_inet_traffic_rule_find_qset_id(ifname, &td, qset_id); -} +static struct nxctl_traffic_rule_type *eth_traffic_rule_type = NULL; void 
nxctl_traffic_rule_init(void) @@ -541,6 +119,14 @@ nxctl_traffic_rule_init(void) inet_traffic_rule_type = find_traffic_rule_type(IFNET_TRAFFIC_DESCRIPTOR_TYPE_INET); ASSERT(inet_traffic_rule_type != NULL); + + ASSERT(eth_traffic_rule_type == NULL); + eth_traffic_rule_type = + find_traffic_rule_type(IFNET_TRAFFIC_DESCRIPTOR_TYPE_ETH); + ASSERT(eth_traffic_rule_type != NULL); + + inet_traffic_rule_init(nxctl_traffic_rule_tag); + eth_traffic_rule_init(nxctl_traffic_rule_tag); } void @@ -551,13 +137,7 @@ nxctl_traffic_rule_fini(void) nxctl_traffic_rule_tag = NULL; } inet_traffic_rule_type = NULL; -} - -static struct ifnet_ip_addr v6_zeros_addr = {0}; -static boolean_t -inet_v6addr_cmp(struct ifnet_ip_addr *a1, struct ifnet_ip_addr *a2) -{ - return memcmp(a1, a2, sizeof(*a1)) == 0; + eth_traffic_rule_type = NULL; } SK_NO_INLINE_ATTRIBUTE @@ -666,545 +246,8 @@ remove_traffic_rule_from_nxctl(struct nxctl *nxctl, lck_mtx_unlock(&nxctl->nxctl_lock); } -static int -inet_traffic_rule_validate(struct nxctl_traffic_rule_type *type, - const char *ifname, - struct ifnet_traffic_descriptor_common *td, - struct ifnet_traffic_rule_action *ra) -{ -#pragma unused(type) - char buf[IFNAMSIZ]; - int unit, i; - struct ifnet_traffic_descriptor_inet *tdi; - uint8_t mask = 0, ipver, proto; - - if (ifunit_extract(ifname, buf, sizeof(buf), &unit) < 0) { - SK_ERR("invalid ifname: %s", ifname); - return EINVAL; - } - if (td->itd_len != sizeof(*tdi)) { - SK_ERR("invalid td len: expected %d, actual %d", - sizeof(*tdi), td->itd_len); - return EINVAL; - } - if (td->itd_flags == 0 || - (td->itd_flags & - ~(IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND | - IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND)) != 0) { - SK_ERR("invalid td flags: 0x%x", td->itd_flags); - return EINVAL; - } - tdi = (struct ifnet_traffic_descriptor_inet *)td; - for (i = 0; i < NINETRULEMASKS; i++) { - if (tdi->inet_mask == nxctl_inet_traffic_rule_masks[i]) { - mask = tdi->inet_mask; - break; - } - } - if (mask == 0) { - SK_ERR("invalid inet mask: 0x%x", tdi->inet_mask); - return EINVAL; - } - ipver = tdi->inet_ipver; - if (ipver != IPVERSION && ipver != IPV6_VERSION) { - SK_ERR("invalid inet ipver: 0x%x", ipver); - return EINVAL; - } - proto = tdi->inet_proto; - if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { - SK_ERR("invalid inet proto: %d", proto); - return EINVAL; - } - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR) != 0) { - if (ipver == IPVERSION) { - if (tdi->inet_laddr.iia_v4addr == INADDR_ANY) { - SK_ERR("inet laddr v4 cannot be unspecified"); - return EINVAL; - } - } else { - if (inet_v6addr_cmp(&tdi->inet_laddr, &v6_zeros_addr)) { - SK_ERR("inet laddr v4 cannot be unspecified"); - return EINVAL; - } - } - } - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR) != 0) { - if (ipver == IPVERSION) { - if (tdi->inet_raddr.iia_v4addr == INADDR_ANY) { - SK_ERR("inet raddr v6 cannot be unspecified"); - return EINVAL; - } - } else { - if (inet_v6addr_cmp(&tdi->inet_raddr, &v6_zeros_addr)) { - SK_ERR("inet raddr v6 cannot be unspecified"); - return EINVAL; - } - } - } - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT) != 0) { - if (tdi->inet_lport == 0) { - SK_ERR("inet lport cannot be unspecified"); - return EINVAL; - } - } - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT) != 0) { - if (tdi->inet_rport == 0) { - SK_ERR("inet rport cannot be unspecified"); - return EINVAL; - } - } - if (ra->ra_len != sizeof(struct ifnet_traffic_rule_action_steer)) { - SK_ERR("invalid ra len: expected %d, actual %d", - sizeof(struct ifnet_traffic_rule_action_steer), ra->ra_len); 
- return EINVAL; - } - return 0; -} - SK_NO_INLINE_ATTRIBUTE -static struct nxctl_traffic_rule_inet_storage * -inet_traffic_rule_storage_create(void) -{ - struct nxctl_traffic_rule_inet_storage *rs; - - rs = sk_alloc_type(struct nxctl_traffic_rule_inet_storage, - Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); - SLIST_INIT(&rs->ris_if_list); - rs->ris_count = 0; - return rs; -} - -SK_NO_INLINE_ATTRIBUTE -static void -inet_traffic_rule_storage_destroy(struct nxctl_traffic_rule_inet_storage *rs) -{ - ASSERT(rs->ris_count == 0); - ASSERT(SLIST_EMPTY(&rs->ris_if_list)); - sk_free_type(struct nxctl_traffic_rule_inet_storage, rs); -} - -SK_NO_INLINE_ATTRIBUTE -static struct nxctl_traffic_rule_inet_if * -inet_traffic_rule_if_create(const char *ifname) -{ - struct nxctl_traffic_rule_inet_if *rif; - int i; - - rif = sk_alloc_type(struct nxctl_traffic_rule_inet_if, - Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); - for (i = 0; i < NINETRULEMASKS; i++) { - SLIST_INIT(&rif->rii_lists[i]); - } - strlcpy(rif->rii_ifname, ifname, sizeof(rif->rii_ifname)); - rif->rii_count = 0; - return rif; -} - -SK_NO_INLINE_ATTRIBUTE -static void -inet_traffic_rule_if_destroy(struct nxctl_traffic_rule_inet_if *rif) -{ - int i; - - for (i = 0; i < NINETRULEMASKS; i++) { - ASSERT(SLIST_EMPTY(&rif->rii_lists[i])); - } - ASSERT(rif->rii_count == 0); - sk_free_type(struct nxctl_traffic_rule_inet_if, rif); -} - -SK_NO_INLINE_ATTRIBUTE -static boolean_t -inet_traffic_rule_match(struct nxctl_traffic_rule_inet *ntri, const char *ifname, - uint32_t flags, struct ifnet_traffic_descriptor_inet *tdi) -{ - struct nxctl_traffic_rule *ntr = (struct nxctl_traffic_rule *)ntri; - struct ifnet_traffic_descriptor_inet *tdi0; - uint8_t mask; - boolean_t exact; - - VERIFY(strlcmp(ntr->ntr_ifname, ifname, sizeof(ntr->ntr_ifname)) == 0); - tdi0 = &ntri->ntri_td; - - exact = ((flags & NTR_FIND_FLAG_EXACT) != 0); - mask = tdi0->inet_mask & tdi->inet_mask; - if (exact) { - ASSERT(tdi0->inet_mask == tdi->inet_mask); - } - ASSERT((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER) != 0); - if (tdi0->inet_ipver != tdi->inet_ipver) { - DTRACE_SKYWALK2(ipver__mismatch, - uint8_t, tdi0->inet_ipver, uint8_t, tdi->inet_ipver); - return FALSE; - } - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO) != 0 && - tdi0->inet_proto != tdi->inet_proto) { - DTRACE_SKYWALK2(proto__mismatch, - uint8_t, tdi0->inet_proto, uint8_t, tdi->inet_proto); - return FALSE; - } - if (tdi0->inet_ipver == IPVERSION) { - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR) != 0 && - tdi0->inet_laddr.iia_v4addr != tdi->inet_laddr.iia_v4addr) { - DTRACE_SKYWALK2(v4laddr__mismatch, - in_addr_t, tdi0->inet_laddr.iia_v4addr, - in_addr_t, tdi->inet_laddr.iia_v4addr); - return FALSE; - } - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR) != 0 && - tdi0->inet_raddr.iia_v4addr != tdi->inet_raddr.iia_v4addr) { - DTRACE_SKYWALK2(v4raddr__mismatch, - in_addr_t, tdi0->inet_raddr.iia_v4addr, - in_addr_t, tdi->inet_raddr.iia_v4addr); - return FALSE; - } - } else { - ASSERT(tdi0->inet_ipver == IPV6_VERSION); - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR) != 0 && - !inet_v6addr_cmp(&tdi0->inet_laddr, &tdi->inet_laddr)) { - DTRACE_SKYWALK2(v6laddr__mismatch, - struct in6_addr *, &tdi0->inet_laddr, - struct in6_addr *, &tdi->inet_laddr); - return FALSE; - } - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR) != 0 && - !inet_v6addr_cmp(&tdi0->inet_raddr, &tdi->inet_raddr)) { - DTRACE_SKYWALK2(v6raddr__mismatch, - struct in6_addr *, &tdi0->inet_raddr, - struct in6_addr *, &tdi->inet_raddr); - return FALSE; - } - 
} - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT) != 0 && - tdi0->inet_lport != tdi->inet_lport) { - DTRACE_SKYWALK2(lport__mismatch, - uint8_t, tdi0->inet_lport, uint8_t, tdi->inet_lport); - return FALSE; - } - if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT) != 0 && - tdi0->inet_rport != tdi->inet_rport) { - DTRACE_SKYWALK2(rport__mismatch, - uint8_t, tdi0->inet_rport, uint8_t, tdi->inet_rport); - return FALSE; - } - return TRUE; -} - -static int -inet_traffic_rule_find(struct nxctl_traffic_rule_type *type, const char *ifname, - struct ifnet_traffic_descriptor_common *td, uint32_t flags, - struct nxctl_traffic_rule **ntrp) -{ - struct nxctl_traffic_rule_inet *ntri = NULL; - struct nxctl_traffic_rule_inet_storage *rs = type->ntrt_storage; - struct nxctl_traffic_rule_inet_if *rif; - struct ifnet_traffic_descriptor_inet *tdi = - (struct ifnet_traffic_descriptor_inet *)td; - int i; - - if (rs == NULL) { - return ENOENT; - } - SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { - if (strlcmp(rif->rii_ifname, ifname, sizeof(rif->rii_ifname)) != 0) { - continue; - } - for (i = 0; i < NINETRULEMASKS; i++) { - if ((flags & NTR_FIND_FLAG_EXACT) != 0 && - tdi->inet_mask != nxctl_inet_traffic_rule_masks[i]) { - continue; - } - SLIST_FOREACH(ntri, &rif->rii_lists[i], ntri_storage_link) { - if (inet_traffic_rule_match(ntri, ifname, flags, tdi)) { - *ntrp = (struct nxctl_traffic_rule *)ntri; - return 0; - } - } - } - } - return ENOENT; -} - -static int -inet_traffic_rule_find_by_uuid(struct nxctl_traffic_rule_type *type, - uuid_t uuid, struct nxctl_traffic_rule **ntrp) -{ - struct nxctl_traffic_rule_inet *ntri; - struct nxctl_traffic_rule *ntr; - struct nxctl_traffic_rule_inet_storage *rs = type->ntrt_storage; - struct nxctl_traffic_rule_inet_if *rif; - int i; - - if (rs == NULL) { - return ENOENT; - } - SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { - for (i = 0; i < NINETRULEMASKS; i++) { - SLIST_FOREACH(ntri, &rif->rii_lists[i], ntri_storage_link) { - ntr = &ntri->ntri_common; - if (uuid_compare(ntr->ntr_uuid, uuid) == 0) { - *ntrp = ntr; - return 0; - } - } - } - } - return ENOENT; -} - -static void -inet_update_ifnet_traffic_rule_count(const char *ifname, uint32_t count) -{ - struct ifnet *ifp; - - ifp = ifunit_ref(ifname); - if (ifp == NULL) { - DTRACE_SKYWALK1(ifname__not__found, char *, ifname); - return; - } - ifnet_update_traffic_rule_count(ifp, count); - ifnet_decr_iorefcnt(ifp); -} - -static void -inet_traffic_rule_link(struct nxctl_traffic_rule *ntr) -{ - struct nxctl_traffic_rule_type *type = ntr->ntr_type; - struct nxctl_traffic_rule_inet_storage *rs; - struct nxctl_traffic_rule_inet_if *rif; - struct nxctl_traffic_rule_inet *ntri = - (struct nxctl_traffic_rule_inet *)ntr; - struct nxctl_traffic_rule_inet_head *list = NULL; - int i; - char *__null_terminated ntr_ifname = NULL; - char *__null_terminated rii_ifname = NULL; - - if ((rs = type->ntrt_storage) == NULL) { - rs = inet_traffic_rule_storage_create(); - type->ntrt_storage = rs; - } - SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { - if (strbufcmp(rif->rii_ifname, ntr->ntr_ifname) == 0) { - break; - } - } - if (rif == NULL) { - ntr_ifname = __unsafe_null_terminated_from_indexable(ntr->ntr_ifname); - rif = inet_traffic_rule_if_create(ntr_ifname); - SLIST_INSERT_HEAD(&rs->ris_if_list, rif, rii_link); - } - for (i = 0; i < NINETRULEMASKS; i++) { - if (ntri->ntri_td.inet_mask == - nxctl_inet_traffic_rule_masks[i]) { - list = &rif->rii_lists[i]; - break; - } - } - retain_traffic_rule(ntr); - ASSERT(list != NULL); - 
SLIST_INSERT_HEAD(list, ntri, ntri_storage_link); - /* per-interface count */ - rif->rii_count++; - rii_ifname = __unsafe_null_terminated_from_indexable(rif->rii_ifname); - inet_update_ifnet_traffic_rule_count(rii_ifname, rif->rii_count); - - /* global count */ - rs->ris_count++; -} - -static void -inet_traffic_rule_unlink(struct nxctl_traffic_rule *ntr) -{ - struct nxctl_traffic_rule_inet_storage *rs; - struct nxctl_traffic_rule_inet_if *rif; - struct nxctl_traffic_rule_inet *ntri = - (struct nxctl_traffic_rule_inet *)ntr; - struct nxctl_traffic_rule_inet_head *list = NULL; - struct nxctl_traffic_rule_type *type; - int i; - char *__null_terminated rii_ifname = NULL; - - type = ntr->ntr_type; - rs = type->ntrt_storage; - ASSERT(rs != NULL); - SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { - if (strbufcmp(rif->rii_ifname, ntr->ntr_ifname) == 0) { - break; - } - } - ASSERT(rif != NULL); - for (i = 0; i < NINETRULEMASKS; i++) { - if (ntri->ntri_td.inet_mask == - nxctl_inet_traffic_rule_masks[i]) { - list = &rif->rii_lists[i]; - break; - } - } - ASSERT(list != NULL); - SLIST_REMOVE(list, ntri, nxctl_traffic_rule_inet, ntri_storage_link); - rif->rii_count--; - rii_ifname = __unsafe_null_terminated_from_indexable(rif->rii_ifname); - inet_update_ifnet_traffic_rule_count(rii_ifname, rif->rii_count); - - rs->ris_count--; - release_traffic_rule(ntr); - - if (rif->rii_count == 0) { - SLIST_REMOVE(&rs->ris_if_list, rif, nxctl_traffic_rule_inet_if, rii_link); - inet_traffic_rule_if_destroy(rif); - } - if (rs->ris_count == 0) { - type->ntrt_storage = NULL; - inet_traffic_rule_storage_destroy(rs); - } -} - -/* - * XXX - * This may need additional changes to ensure safety against detach/attach. - * This is not an issue for the first consumer of llink interfaces, cellular, - * which does not detach. 
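The link/unlink routines above follow a common ownership pattern: the storage list takes its own reference when a rule is linked, drops it on unlink, and the per-interface and global containers are torn down lazily once their counts reach zero. A compact userspace-style sketch of that pattern, with hypothetical names, a plain counter standing in for os_refcnt_t, and single-threaded access assumed for brevity:

#include <stdlib.h>
#include <assert.h>

struct demo_rule {
	int               refcnt;   /* stands in for os_refcnt_t */
	struct demo_rule *next;
};

struct demo_storage {
	struct demo_rule *head;
	unsigned          count;
};

static struct demo_storage *demo_rs; /* created lazily, freed when count hits 0 */

static void
demo_link(struct demo_rule *r)
{
	if (demo_rs == NULL) {
		demo_rs = calloc(1, sizeof(*demo_rs));
		assert(demo_rs != NULL);
	}
	r->refcnt++;               /* the list holds its own reference */
	r->next = demo_rs->head;
	demo_rs->head = r;
	demo_rs->count++;
}

static void
demo_unlink(struct demo_rule *r)
{
	struct demo_rule **pp;

	for (pp = &demo_rs->head; *pp != NULL && *pp != r; pp = &(*pp)->next) {
		;
	}
	assert(*pp == r);
	*pp = r->next;
	demo_rs->count--;
	if (--r->refcnt == 0) {
		free(r);           /* last reference gone: destroy the rule */
	}
	if (demo_rs->count == 0) {
		free(demo_rs);     /* tear down storage once it is empty */
		demo_rs = NULL;
	}
}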
- */ -static int -inet_traffic_rule_notify(struct nxctl_traffic_rule *ntr, uint32_t flags) -{ - struct ifnet *ifp; - struct nx_netif *nif; - struct netif_qset *__single qset = NULL; - struct nxctl_traffic_rule_inet *ntri; - int err = 0; - char *__null_terminated ntr_ifname = NULL; - - ntr_ifname = __unsafe_null_terminated_from_indexable(ntr->ntr_ifname); - ifp = ifunit_ref(ntr_ifname); - if (ifp == NULL) { - DTRACE_SKYWALK1(ifname__not__found, char *, ntr->ntr_ifname); - err = ENXIO; - goto done; - } - nif = NA(ifp)->nifna_netif; - if (!NX_LLINK_PROV(nif->nif_nx)) { - DTRACE_SKYWALK1(llink__not__enabled, struct ifnet *, ifp); - err = ENOTSUP; - goto done; - } - ntri = (struct nxctl_traffic_rule_inet *)ntr; - qset = nx_netif_find_qset(nif, ntri->ntri_ra.ras_qset_id); - err = nx_netif_notify_steering_info(nif, qset, - (struct ifnet_traffic_descriptor_common *)&ntri->ntri_td, - ((flags & NTR_NOTIFY_FLAG_ADD) != 0)); -done: - if (qset != NULL) { - nx_netif_qset_release(&qset); - } - if (ifp != NULL) { - ifnet_decr_iorefcnt(ifp); - } - return err; -} - -static int -inet_traffic_rule_create(struct nxctl_traffic_rule_type *type, - const char *ifname, struct ifnet_traffic_descriptor_common *td, - struct ifnet_traffic_rule_action *ra, uint32_t flags, - struct nxctl_traffic_rule **ntrp) -{ - struct nxctl_traffic_rule_inet *ntri; - struct nxctl_traffic_rule *ntr; - struct ifnet_traffic_descriptor_inet *tdi; - struct ifnet_traffic_rule_action_steer *ras; - - ntri = sk_alloc_type(struct nxctl_traffic_rule_inet, - Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); - ntr = &ntri->ntri_common; - - ntr->ntr_type = type; - ntr->ntr_flags = flags; - uuid_generate(ntr->ntr_uuid); - os_ref_init(&ntr->ntr_refcnt, NULL); - - strlcpy(ntr->ntr_ifname, ifname, sizeof(ntr->ntr_ifname)); - proc_selfname(ntr->ntr_procname, sizeof(ntr->ntr_procname)); - - tdi = __container_of(td, struct ifnet_traffic_descriptor_inet, inet_common); - ras = __container_of(ra, struct ifnet_traffic_rule_action_steer, ras_common); - bcopy(tdi, &ntri->ntri_td, sizeof(ntri->ntri_td)); - bcopy(ras, &ntri->ntri_ra, sizeof(ntri->ntri_ra)); - - *ntrp = ntr; - return 0; -} - -static void -inet_traffic_rule_destroy(struct nxctl_traffic_rule *ntr) -{ - struct nxctl_traffic_rule_inet *ntri; - - ASSERT(os_ref_get_count(&ntr->ntr_refcnt) == 0); - ntri = (struct nxctl_traffic_rule_inet *)ntr; - sk_free_type(struct nxctl_traffic_rule_inet, ntri); -} - -static void -convert_ntri_to_iocinfo(struct nxctl_traffic_rule_inet *ntri, - struct nxctl_traffic_rule_inet_iocinfo *info) -{ - struct nxctl_traffic_rule *ntr; - struct nxctl_traffic_rule_generic_iocinfo *ginfo; - - bzero(info, sizeof(*info)); - ntr = &ntri->ntri_common; - ginfo = &info->tri_common; - _CASSERT(sizeof(ntr->ntr_procname) == sizeof(ginfo->trg_procname)); - _CASSERT(sizeof(ntr->ntr_ifname) == sizeof(ginfo->trg_ifname)); - uuid_copy(ginfo->trg_uuid, ntr->ntr_uuid); - strbufcpy(ginfo->trg_procname, ntr->ntr_procname); - strbufcpy(ginfo->trg_ifname, ntr->ntr_ifname); - bcopy(&ntri->ntri_td, &info->tri_td, sizeof(info->tri_td)); - bcopy(&ntri->ntri_ra, &info->tri_ra, sizeof(info->tri_ra)); -} - -static int -inet_traffic_rule_get_all(struct nxctl_traffic_rule_type *type, uint32_t size, - uint32_t *count, user_addr_t uaddr) -{ - struct nxctl_traffic_rule_inet *ntri = NULL; - struct nxctl_traffic_rule_inet_storage *rs = type->ntrt_storage; - struct nxctl_traffic_rule_inet_if *rif; - struct nxctl_traffic_rule_inet_iocinfo info; - int i, err; - - if (size != sizeof(info)) { - SK_ERR("size: actual %d, 
expected %d", size, sizeof(info)); - return EINVAL; - } - if (rs == NULL) { - *count = 0; - return 0; - } - if (*count < rs->ris_count) { - SK_ERR("count: given %d, require: %d", *count, rs->ris_count); - return ENOBUFS; - } - SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { - for (i = 0; i < NINETRULEMASKS; i++) { - SLIST_FOREACH(ntri, &rif->rii_lists[i], ntri_storage_link) { - convert_ntri_to_iocinfo(ntri, &info); - err = copyout(&info, uaddr, sizeof(info)); - if (err != 0) { - SK_ERR("copyout failed: %d", err); - return err; - } - uaddr += sizeof(info); - } - } - } - *count = rs->ris_count; - return 0; -} - -SK_NO_INLINE_ATTRIBUTE -static void +void retain_traffic_rule(struct nxctl_traffic_rule *ntr) { #if (DEVELOPMENT || DEBUG) @@ -1216,7 +259,7 @@ retain_traffic_rule(struct nxctl_traffic_rule *ntr) } SK_NO_INLINE_ATTRIBUTE -static void +void release_traffic_rule(struct nxctl_traffic_rule *ntr) { #if (DEVELOPMENT || DEBUG) @@ -1225,7 +268,12 @@ release_traffic_rule(struct nxctl_traffic_rule *ntr) os_ref_count_t, count); #endif if (os_ref_release(&ntr->ntr_refcnt) == 0) { - ntr->ntr_type->ntrt_destroy(ntr); + struct nxctl_traffic_rule_type *type; + + type = find_traffic_rule_type(ntr->ntrt_type); + ASSERT(type); + + type->ntrt_destroy(ntr); } } @@ -1233,7 +281,12 @@ SK_NO_INLINE_ATTRIBUTE static int notify_traffic_rule(struct nxctl_traffic_rule *ntr, uint32_t flags) { - return ntr->ntr_type->ntrt_notify(ntr, flags); + struct nxctl_traffic_rule_type *type; + + type = find_traffic_rule_type(ntr->ntrt_type); + ASSERT(type); + + return type->ntrt_notify(ntr, flags); } static void @@ -1247,7 +300,13 @@ link_traffic_rule(struct nxctl *nxctl, struct nxctl_traffic_rule *ntr) if ((ntr->ntr_flags & NTR_FLAG_PERSIST) == 0) { add_traffic_rule_to_nxctl(nxctl, ntr); } - ntr->ntr_type->ntrt_link(ntr); + + struct nxctl_traffic_rule_type *type; + + type = find_traffic_rule_type(ntr->ntrt_type); + ASSERT(type); + + type->ntrt_link(ntr); } static void @@ -1256,7 +315,13 @@ unlink_traffic_rule(struct nxctl *nxctl, struct nxctl_traffic_rule *ntr) if ((ntr->ntr_flags & NTR_FLAG_PERSIST) == 0) { remove_traffic_rule_from_nxctl(nxctl, ntr); } - ntr->ntr_type->ntrt_unlink(ntr); + + struct nxctl_traffic_rule_type *type; + + type = find_traffic_rule_type(ntr->ntrt_type); + ASSERT(type); + + type->ntrt_unlink(ntr); } static int @@ -1268,7 +333,7 @@ find_traffic_rule_by_uuid(uuid_t uuid, struct nxctl_traffic_rule **ntrp) for (i = 0; i < NRULETYPES; i++) { ntrt = &nxctl_rule_types[i]; - err = ntrt->ntrt_find_by_uuid(ntrt, uuid, &ntr); + err = ntrt->ntrt_find_by_uuid(uuid, &ntr); if (err == 0) { ASSERT(ntr != NULL); *ntrp = ntr; @@ -1312,12 +377,23 @@ add_traffic_rule(struct nxctl *nxctl, const char *ifname, err = EINVAL; goto fail; } - err = type->ntrt_validate(type, ifname, td, ra); + for (int i = 0; i < NRULETYPES; i++) { + if (&nxctl_rule_types[i] != type) { + uint32_t count = 0; + err = nxctl_rule_types[i].ntrt_get_count(ifname, &count); + if (!(err == ENOENT || (err == 0 && count == 0))) { + SK_ERR("other types of rules are added to the same ifname"); + err = EINVAL; + goto fail; + } + } + } + err = type->ntrt_validate(ifname, td, ra); if (err != 0) { SK_ERR("rule validate failed: %d", err); goto fail; } - err = type->ntrt_find(type, ifname, td, NTR_FIND_FLAG_EXACT, &ntr); + err = type->ntrt_find(ifname, td, NTR_FIND_FLAG_EXACT, &ntr); if (err == 0) { SK_ERR("rule already exists"); ASSERT(ntr != NULL); @@ -1327,7 +403,7 @@ add_traffic_rule(struct nxctl *nxctl, const char *ifname, SK_ERR("rule find failed: %d", 
err); goto fail; } - err = type->ntrt_create(type, ifname, td, ra, flags, &ntr); + err = type->ntrt_create(ifname, td, ra, flags, &ntr); if (err != 0) { SK_ERR("rule create failed: %d", err); goto fail; @@ -1412,12 +488,29 @@ nxioctl_add_traffic_rule_inet(struct nxctl *nxctl, caddr_t data, proc_t procp) atri_ifname = __unsafe_null_terminated_from_indexable(args->atri_ifname); return add_traffic_rule_generic(nxctl, atri_ifname, - (struct ifnet_traffic_descriptor_common *)&args->atri_td, - (struct ifnet_traffic_rule_action *)&args->atri_ra, + &args->atri_td.inet_common, + &args->atri_ra.ras_common, convert_traffic_rule_ioc_flags(args->atri_flags), &args->atri_uuid); } +int +nxioctl_add_traffic_rule_eth(struct nxctl *nxctl, caddr_t data, proc_t procp) +{ +#pragma unused(procp) + struct nxctl_add_traffic_rule_eth_iocargs *args = + (struct nxctl_add_traffic_rule_eth_iocargs *)(void *)data; + char *__null_terminated atre_ifname = NULL; + + atre_ifname = __unsafe_null_terminated_from_indexable(args->atre_ifname); + + return add_traffic_rule_generic(nxctl, atre_ifname, + &args->atre_td.eth_common, + &args->atre_ra.ras_common, + convert_traffic_rule_ioc_flags(args->atre_flags), + &args->atre_uuid); +} + int nxioctl_remove_traffic_rule(struct nxctl *nxctl, caddr_t data, proc_t procp) { @@ -1455,7 +548,7 @@ nxioctl_get_traffic_rules(struct nxctl *nxctl, caddr_t data, proc_t procp) } uaddr = proc_is64bit(procp) ? args->gtr_buf64 : CAST_USER_ADDR_T(args->gtr_buf); - err = type->ntrt_get_all(type, args->gtr_size, &args->gtr_count, uaddr); + err = type->ntrt_get_all(args->gtr_size, &args->gtr_count, uaddr); if (err != 0) { goto fail; } diff --git a/bsd/skywalk/nexus/nexus_traffic_rule.h b/bsd/skywalk/nexus/nexus_traffic_rule.h new file mode 100644 index 000000000..833c67055 --- /dev/null +++ b/bsd/skywalk/nexus/nexus_traffic_rule.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2022 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SKYWALK_NEXUS_TRAFFIC_RULE_H_ +#define _SKYWALK_NEXUS_TRAFFIC_RULE_H_ + +#include + +__BEGIN_DECLS +struct nxctl_traffic_rule; + +/* + * These callbacks need to be implemented for each rule type. + */ + +/* Validate user provided parameters. 
*/ +typedef int (nxctl_traffic_rule_validate_cb_t)( + const char *ifname, + struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra); +/* + * Each rule type has its own global structure for storing rules. + * These callbacks access this global structure. + */ +#define NTR_FIND_FLAG_EXACT 0x0001 +typedef int (nxctl_traffic_rule_find_cb_t)( + const char *ifname, + struct ifnet_traffic_descriptor_common *td, + uint32_t flags, + struct nxctl_traffic_rule **ntrp); + +typedef int (nxctl_traffic_rule_find_by_uuid_cb_t)( + uuid_t uuid, + struct nxctl_traffic_rule **ntrp); + +typedef void (nxctl_traffic_rule_link_cb_t)( + struct nxctl_traffic_rule *ntr); + +typedef void (nxctl_traffic_rule_unlink_cb_t)( + struct nxctl_traffic_rule *ntr); + +/* + * Notifies lower layers of the addition/removal of a rule. + * This is called outside of nxctl_traffic_rule_lock to avoid potential + * locking issues. + */ +#define NTR_NOTIFY_FLAG_ADD 0x0001 +#define NTR_NOTIFY_FLAG_REMOVE 0x0002 +typedef int (nxctl_traffic_rule_notify_cb_t)( + struct nxctl_traffic_rule *ntr, + uint32_t flags); + +/* + * Callback for a rule type to get rule count. + */ +typedef int (nxctl_traffic_rule_get_count_cb_t)( + const char *ifname, uint32_t *count); + +/* + * Create/Destroy callbacks for a rule type. + */ +typedef int (nxctl_traffic_rule_create_cb_t)( + const char *ifname, + struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra, + uint32_t flags, + struct nxctl_traffic_rule **ntrp); + +typedef void (nxctl_traffic_rule_destroy_cb_t)( + struct nxctl_traffic_rule *ntr); + +/* + * This is used for copying all rules for a type (including generic + * and type-specific info) to userspace. + */ +typedef int (nxctl_traffic_rule_get_all_cb_t)( + uint32_t size, + uint32_t *count, + user_addr_t uaddr); + +struct nxctl_traffic_rule_type { + uint8_t ntrt_type; + nxctl_traffic_rule_validate_cb_t *ntrt_validate; + nxctl_traffic_rule_find_cb_t *ntrt_find; + nxctl_traffic_rule_find_by_uuid_cb_t *ntrt_find_by_uuid; + nxctl_traffic_rule_link_cb_t *ntrt_link; + nxctl_traffic_rule_unlink_cb_t *ntrt_unlink; + nxctl_traffic_rule_notify_cb_t *ntrt_notify; + nxctl_traffic_rule_create_cb_t *ntrt_create; + nxctl_traffic_rule_destroy_cb_t *ntrt_destroy; + nxctl_traffic_rule_get_all_cb_t *ntrt_get_all; + nxctl_traffic_rule_get_count_cb_t *ntrt_get_count; +}; + +/* + * Generic traffic rule. + * Contains fields common to all traffic rules. + */ +#define NTR_FLAG_PERSIST 0x0001 +#define NTR_FLAG_ON_NXCTL_LIST 0x0002 +struct nxctl_traffic_rule { + uint8_t ntrt_type; + uint32_t ntr_flags; + os_refcnt_t ntr_refcnt; + uuid_t ntr_uuid; + char ntr_procname[NTR_PROCNAME_SZ]; + char ntr_ifname[IFNAMSIZ]; + SLIST_ENTRY(nxctl_traffic_rule) ntr_storage_link; +}; + +#define ITDBIT(set, bit) (((set) != 0) ? (bit) : 0) + + +void nxtr_wlock(void); +void nxtr_wunlock(void); +void nxtr_rlock(void); +void nxtr_runlock(void); + +#define NXTR_WLOCK() nxtr_wlock() +#define NXTR_WUNLOCK() nxtr_wunlock() +#define NXTR_RLOCK() nxtr_rlock() +#define NXTR_RUNLOCK() nxtr_runlock() + +void retain_traffic_rule(struct nxctl_traffic_rule *ntr); +void release_traffic_rule(struct nxctl_traffic_rule *ntr); + +__END_DECLS + +#endif /* _SKYWALK_NEXUS_TRAFFIC_RULE_H_ */ diff --git a/bsd/skywalk/nexus/nexus_traffic_rule_eth.c b/bsd/skywalk/nexus/nexus_traffic_rule_eth.c new file mode 100644 index 000000000..b6e2847de --- /dev/null +++ b/bsd/skywalk/nexus/nexus_traffic_rule_eth.c @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2024 Apple Inc. 
All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include + +/* + * Eth-specific traffic rule. + */ +struct nxctl_traffic_rule_eth { + struct nxctl_traffic_rule ntre_common; + SLIST_ENTRY(nxctl_traffic_rule_eth) ntre_storage_link; + struct ifnet_traffic_descriptor_eth ntre_td; + struct ifnet_traffic_rule_action_steer ntre_ra; +}; + +/* + * Currently supported tuple types. + */ +#define ETRM(type, raddr) \ + ITDBIT(type, IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE) | \ + ITDBIT(raddr, IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR) + +static uint8_t nxctl_eth_traffic_rule_masks[] = { + ETRM(1, 0), + ETRM(0, 1), +}; +#define NETHRULEMASKS \ + (sizeof(nxctl_eth_traffic_rule_masks)/sizeof(uint8_t)) + +/* Per-interface lists of eth traffic rules */ +SLIST_HEAD(nxctl_traffic_rule_eth_head, nxctl_traffic_rule_eth); +struct nxctl_traffic_rule_eth_if { + char rei_ifname[IFNAMSIZ]; + struct nxctl_traffic_rule_eth_head rei_lists[NETHRULEMASKS]; + uint32_t rei_count; + SLIST_ENTRY(nxctl_traffic_rule_eth_if) rei_link; +}; + +/* List of per-interface lists */ +SLIST_HEAD(nxctl_traffic_rule_eth_if_head, nxctl_traffic_rule_eth_if); +struct nxctl_traffic_rule_eth_storage { + struct nxctl_traffic_rule_eth_if_head res_if_list; + uint32_t res_count; +}; + +static struct nxctl_traffic_rule_eth_storage *rs = NULL; +static kern_allocation_name_t nxctl_traffic_rule_tag = NULL; + +/* + * If an interface attaches after rule(s) are added, this function is used + * retrieve the current rule count for that interface. + */ +int +nxctl_eth_traffic_rule_get_count(const char *ifname, uint32_t *count) +{ + int err; + + NXTR_RLOCK(); + err = eth_traffic_rule_get_count(ifname, count); + NXTR_RUNLOCK(); + + return err; +} + +/* + * Used for finding the qset id associated with a ether type and ether remote addr. 
+ */ +int +nxctl_eth_traffic_rule_find_qset_id(const char *ifname, + uint16_t eth_type, ether_addr_t *eth_raddr, uint64_t *qset_id) +{ + struct nxctl_traffic_rule_eth *__single ntre = NULL; + struct nxctl_traffic_rule *__single ntr = NULL; + struct ifnet_traffic_descriptor_eth td = {0}; + int err; + + td.eth_common.itd_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_ETH; + td.eth_common.itd_len = sizeof(td); + td.eth_common.itd_flags = IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND | + IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND; + + td.eth_type = eth_type; + td.eth_mask = IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE; + + if (eth_raddr) { + bcopy(eth_raddr, &td.eth_raddr, ETHER_ADDR_LEN); + td.eth_mask |= IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR; + } + + NXTR_RLOCK(); + err = eth_traffic_rule_find(ifname, &td.eth_common, 0, &ntr); + if (err != 0) { + goto fail; + } + ntre = __container_of(ntr, struct nxctl_traffic_rule_eth, ntre_common); + *qset_id = ntre->ntre_ra.ras_qset_id; + NXTR_RUNLOCK(); + return 0; +fail: + NXTR_RUNLOCK(); + return err; +} + +static int +parse_eth_hdr(struct __kern_packet *pkt, uint16_t *eth_type, ether_addr_t *eth_raddr) +{ + volatile ether_header_t *_l2 = NULL; + uint8_t *pkt_buf, *l2_hdr; + uint32_t bdlen, bdlim, bdoff, cls_len; + int err; + + ASSERT(pkt->pkt_l2_len <= pkt->pkt_length); + + MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff); + cls_len = bdlim - bdoff; + cls_len = (uint32_t)MIN(cls_len, pkt->pkt_length); + VERIFY(pkt->pkt_length >= cls_len); + if (cls_len == 0) { + SK_ERR("cls_len == 0"); + err = EINVAL; + goto fail; + } + + l2_hdr = pkt_buf + pkt->pkt_headroom; + _l2 = (volatile ether_header_t *)(void *)l2_hdr; + + *eth_type = ntohs(_l2->ether_type); + bcopy(__DECONST(void *, &_l2->ether_dhost), eth_raddr, ETHER_ADDR_LEN); + + return 0; + +fail: + DTRACE_SKYWALK4(classify__failed, ether_header_t *, _l2, size_t, pkt->pkt_length, + uint8_t, pkt->pkt_l2_len, int, err); + return err; +} + +int +nxctl_eth_traffic_rule_find_qset_id_with_pkt(const char *ifname, + struct __kern_packet *pkt, uint64_t *qset_id) +{ + ether_addr_t eth_raddr; + uint16_t eth_type; + int err; + + err = parse_eth_hdr(pkt, ð_type, ð_raddr); + if (err != 0) { + return err; + } + return nxctl_eth_traffic_rule_find_qset_id(ifname, eth_type, ð_raddr, qset_id); +} + +void +eth_traffic_rule_init(kern_allocation_name_t rule_tag) +{ + ASSERT(nxctl_traffic_rule_tag == NULL); + nxctl_traffic_rule_tag = rule_tag; +} + +int +eth_traffic_rule_validate( + const char *ifname, + struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra) +{ + char buf[IFNAMSIZ]; + int unit, i; + struct ifnet_traffic_descriptor_eth *tdi; + + if (ifunit_extract(ifname, buf, sizeof(buf), &unit) < 0) { + SK_ERR("invalid ifname: %s", ifname); + return EINVAL; + } + if (td->itd_len != sizeof(*tdi)) { + SK_ERR("invalid td len: expected %lu, actual %d", + sizeof(*tdi), td->itd_len); + return EINVAL; + } + if (td->itd_flags == 0 || + (td->itd_flags & + ~(IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND | + IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND)) != 0) { + SK_ERR("invalid td flags: 0x%x", td->itd_flags); + return EINVAL; + } + tdi = (struct ifnet_traffic_descriptor_eth *)td; + for (i = 0; i < NETHRULEMASKS; i++) { + if (tdi->eth_mask == nxctl_eth_traffic_rule_masks[i]) { + break; + } + } + if (i == NETHRULEMASKS) { + SK_ERR("invalid eth mask: 0x%x", tdi->eth_mask); + return EINVAL; + } + if ((tdi->eth_mask & IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE) != 0) { + if (tdi->eth_type != ETHERTYPE_PAE && + tdi->eth_type != 
ETHERTYPE_WAI) { + SK_ERR("invalid eth type 0x%x", tdi->eth_type); + return EINVAL; + } + } + if ((tdi->eth_mask & IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR) != 0) { + ether_addr_t mac_zeros_addr = {0}; + if (!_ether_cmp(&tdi->eth_raddr, &mac_zeros_addr)) { + SK_ERR("eth raddr cannot be unspecified"); + return EINVAL; + } + } + if (ra->ra_len != sizeof(struct ifnet_traffic_rule_action_steer)) { + SK_ERR("invalid ra len: expected %lu, actual %d", + sizeof(struct ifnet_traffic_rule_action_steer), ra->ra_len); + return EINVAL; + } + return 0; +} + +SK_NO_INLINE_ATTRIBUTE +static void +eth_traffic_rule_storage_create(void) +{ + rs = sk_alloc_type(struct nxctl_traffic_rule_eth_storage, + Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); + SLIST_INIT(&rs->res_if_list); + rs->res_count = 0; + return; +} + +SK_NO_INLINE_ATTRIBUTE +static void +eth_traffic_rule_storage_destroy(void) +{ + ASSERT(rs->res_count == 0); + ASSERT(SLIST_EMPTY(&rs->res_if_list)); + sk_free_type(struct nxctl_traffic_rule_eth_storage, rs); +} + +SK_NO_INLINE_ATTRIBUTE +static struct nxctl_traffic_rule_eth_if * +eth_traffic_rule_if_create(const char *ifname) +{ + struct nxctl_traffic_rule_eth_if *rif; + int i; + + rif = sk_alloc_type(struct nxctl_traffic_rule_eth_if, + Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); + for (i = 0; i < NETHRULEMASKS; i++) { + SLIST_INIT(&rif->rei_lists[i]); + } + strlcpy(rif->rei_ifname, ifname, sizeof(rif->rei_ifname)); + rif->rei_count = 0; + return rif; +} + +SK_NO_INLINE_ATTRIBUTE +static void +eth_traffic_rule_if_destroy(struct nxctl_traffic_rule_eth_if *rif) +{ + int i; + + for (i = 0; i < NETHRULEMASKS; i++) { + ASSERT(SLIST_EMPTY(&rif->rei_lists[i])); + } + ASSERT(rif->rei_count == 0); + sk_free_type(struct nxctl_traffic_rule_eth_if, rif); +} + +SK_NO_INLINE_ATTRIBUTE +static boolean_t +eth_traffic_rule_match(struct nxctl_traffic_rule_eth *ntre, const char *ifname, + uint32_t flags, struct ifnet_traffic_descriptor_eth *tdi) +{ + struct nxctl_traffic_rule *ntr = (struct nxctl_traffic_rule *)ntre; + struct ifnet_traffic_descriptor_eth *tdi0; + uint8_t mask; + boolean_t exact; + + VERIFY(strlcmp(ntr->ntr_ifname, ifname, sizeof(ntr->ntr_ifname)) == 0); + tdi0 = &ntre->ntre_td; + + exact = ((flags & NTR_FIND_FLAG_EXACT) != 0); + mask = tdi0->eth_mask & tdi->eth_mask; + if (exact) { + ASSERT(tdi0->eth_mask == tdi->eth_mask); + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE) != 0 && + tdi0->eth_type != tdi->eth_type) { + DTRACE_SKYWALK2(eth_type__mismatch, + uint8_t, tdi0->eth_type, uint8_t, tdi->eth_type); + return FALSE; + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR) != 0 && + _ether_cmp(&tdi0->eth_raddr, &tdi->eth_raddr)) { + DTRACE_SKYWALK2(eth_raddr__mismatch, + ether_addr_t *, &tdi0->eth_raddr, + ether_addr_t *, &tdi->eth_raddr); + return FALSE; + } + return TRUE; +} + +int +eth_traffic_rule_find(const char *ifname, + struct ifnet_traffic_descriptor_common *td, uint32_t flags, + struct nxctl_traffic_rule **ntrp) +{ + struct nxctl_traffic_rule_eth *ntre = NULL; + struct nxctl_traffic_rule_eth_if *rif; + struct ifnet_traffic_descriptor_eth *tdi = + (struct ifnet_traffic_descriptor_eth *)td; + int i; + + if (rs == NULL) { + return ENOENT; + } + SLIST_FOREACH(rif, &rs->res_if_list, rei_link) { + if (strlcmp(rif->rei_ifname, ifname, sizeof(rif->rei_ifname)) != 0) { + continue; + } + for (i = 0; i < NETHRULEMASKS; i++) { + if ((flags & NTR_FIND_FLAG_EXACT) != 0 && + tdi->eth_mask != nxctl_eth_traffic_rule_masks[i]) { + continue; + } + SLIST_FOREACH(ntre, &rif->rei_lists[i], 
ntre_storage_link) { + if (eth_traffic_rule_match(ntre, ifname, flags, tdi)) { + *ntrp = (struct nxctl_traffic_rule *)ntre; + return 0; + } + } + } + } + return ENOENT; +} + +int +eth_traffic_rule_find_by_uuid( + uuid_t uuid, struct nxctl_traffic_rule **ntrp) +{ + struct nxctl_traffic_rule_eth *ntre; + struct nxctl_traffic_rule *ntr; + struct nxctl_traffic_rule_eth_if *rif; + int i; + + if (rs == NULL) { + return ENOENT; + } + SLIST_FOREACH(rif, &rs->res_if_list, rei_link) { + for (i = 0; i < NETHRULEMASKS; i++) { + SLIST_FOREACH(ntre, &rif->rei_lists[i], ntre_storage_link) { + ntr = &ntre->ntre_common; + if (uuid_compare(ntr->ntr_uuid, uuid) == 0) { + *ntrp = ntr; + return 0; + } + } + } + } + return ENOENT; +} + +static void +eth_update_ifnet_traffic_rule_count(const char *ifname, uint32_t count) +{ + struct ifnet *ifp; + + ifp = ifunit_ref(ifname); + if (ifp == NULL) { + DTRACE_SKYWALK1(ifname__not__found, char *, ifname); + return; + } + ifnet_update_eth_traffic_rule_count(ifp, count); + ifnet_decr_iorefcnt(ifp); +} + +void +eth_traffic_rule_link(struct nxctl_traffic_rule *ntr) +{ + struct nxctl_traffic_rule_eth_if *rif; + struct nxctl_traffic_rule_eth *ntre = + (struct nxctl_traffic_rule_eth *)ntr; + struct nxctl_traffic_rule_eth_head *list = NULL; + int i; + char *__null_terminated ntr_ifname = NULL; + char *__null_terminated rei_ifname = NULL; + + if (rs == NULL) { + eth_traffic_rule_storage_create(); + } + SLIST_FOREACH(rif, &rs->res_if_list, rei_link) { + if (strbufcmp(rif->rei_ifname, ntr->ntr_ifname) == 0) { + break; + } + } + if (rif == NULL) { + ntr_ifname = __unsafe_null_terminated_from_indexable(ntr->ntr_ifname); + rif = eth_traffic_rule_if_create(ntr_ifname); + SLIST_INSERT_HEAD(&rs->res_if_list, rif, rei_link); + } + for (i = 0; i < NETHRULEMASKS; i++) { + if (ntre->ntre_td.eth_mask == + nxctl_eth_traffic_rule_masks[i]) { + list = &rif->rei_lists[i]; + break; + } + } + retain_traffic_rule(ntr); + ASSERT(list != NULL); + SLIST_INSERT_HEAD(list, ntre, ntre_storage_link); + /* per-interface count */ + rif->rei_count++; + rei_ifname = __unsafe_null_terminated_from_indexable(rif->rei_ifname); + eth_update_ifnet_traffic_rule_count(rei_ifname, rif->rei_count); + + /* global count */ + rs->res_count++; +} + +void +eth_traffic_rule_unlink(struct nxctl_traffic_rule *ntr) +{ + struct nxctl_traffic_rule_eth_if *rif; + struct nxctl_traffic_rule_eth *ntre = + (struct nxctl_traffic_rule_eth *)ntr; + struct nxctl_traffic_rule_eth_head *list = NULL; + int i; + char *__null_terminated rei_ifname = NULL; + + ASSERT(rs != NULL); + SLIST_FOREACH(rif, &rs->res_if_list, rei_link) { + if (strbufcmp(rif->rei_ifname, ntr->ntr_ifname) == 0) { + break; + } + } + ASSERT(rif != NULL); + for (i = 0; i < NETHRULEMASKS; i++) { + if (ntre->ntre_td.eth_mask == + nxctl_eth_traffic_rule_masks[i]) { + list = &rif->rei_lists[i]; + break; + } + } + ASSERT(list != NULL); + SLIST_REMOVE(list, ntre, nxctl_traffic_rule_eth, ntre_storage_link); + rif->rei_count--; + rei_ifname = __unsafe_null_terminated_from_indexable(rif->rei_ifname); + eth_update_ifnet_traffic_rule_count(rei_ifname, rif->rei_count); + + rs->res_count--; + release_traffic_rule(ntr); + + if (rif->rei_count == 0) { + SLIST_REMOVE(&rs->res_if_list, rif, nxctl_traffic_rule_eth_if, rei_link); + eth_traffic_rule_if_destroy(rif); + } + if (rs->res_count == 0) { + eth_traffic_rule_storage_destroy(); + rs = NULL; + } +} + +/* + * XXX + * This may need additional changes to ensure safety against detach/attach. 
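The eth_traffic_rule_match() logic above compares only those descriptor fields whose bits are set in the intersection of the rule's mask and the lookup key's mask; a cleared bit acts as a wildcard. A small sketch of that mask-driven comparison, using hypothetical field names rather than the ifnet_traffic_descriptor_eth layout:

#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#define DEMO_MASK_TYPE  0x01  /* compare the ether type */
#define DEMO_MASK_RADDR 0x02  /* compare the remote MAC */

struct demo_eth_desc {
	uint8_t  mask;
	uint16_t ether_type;
	uint8_t  raddr[6];
};

/* A field participates in the match only when both sides asked for it. */
static bool
demo_eth_match(const struct demo_eth_desc *rule, const struct demo_eth_desc *key)
{
	uint8_t mask = rule->mask & key->mask;

	if ((mask & DEMO_MASK_TYPE) != 0 && rule->ether_type != key->ether_type) {
		return false;
	}
	if ((mask & DEMO_MASK_RADDR) != 0 &&
	    memcmp(rule->raddr, key->raddr, sizeof(rule->raddr)) != 0) {
		return false;
	}
	return true;
}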
+ */ +int +eth_traffic_rule_notify(struct nxctl_traffic_rule *ntr, uint32_t flags) +{ + struct ifnet *ifp; + struct nx_netif *nif; + struct netif_qset *__single qset = NULL; + struct nxctl_traffic_rule_eth *ntre; + int err = 0; + char *__null_terminated ntr_ifname = NULL; + + ntr_ifname = __unsafe_null_terminated_from_indexable(ntr->ntr_ifname); + ifp = ifunit_ref(ntr_ifname); + if (ifp == NULL) { + DTRACE_SKYWALK1(ifname__not__found, char *, ntr->ntr_ifname); + err = ENXIO; + goto done; + } + nif = NA(ifp)->nifna_netif; + if (!NX_LLINK_PROV(nif->nif_nx)) { + DTRACE_SKYWALK1(llink__not__enabled, struct ifnet *, ifp); + err = ENOTSUP; + goto done; + } + ntre = (struct nxctl_traffic_rule_eth *)ntr; + qset = nx_netif_find_qset(nif, ntre->ntre_ra.ras_qset_id); + err = nx_netif_notify_steering_info(nif, qset, + (struct ifnet_traffic_descriptor_common *)&ntre->ntre_td, + ((flags & NTR_NOTIFY_FLAG_ADD) != 0)); +done: + if (qset != NULL) { + nx_netif_qset_release(&qset); + } + if (ifp != NULL) { + ifnet_decr_iorefcnt(ifp); + } + return err; +} + +int +eth_traffic_rule_get_count(const char *ifname, uint32_t *count) +{ + struct nxctl_traffic_rule_eth_if *rif; + int err; + + if (rs == NULL) { + err = ENOENT; + goto fail; + } + SLIST_FOREACH(rif, &rs->res_if_list, rei_link) { + if (strlcmp(rif->rei_ifname, ifname, sizeof(rif->rei_ifname)) == 0) { + break; + } + } + if (rif == NULL) { + err = ENOENT; + goto fail; + } + *count = rif->rei_count; + return 0; +fail: + return err; +} + +int +eth_traffic_rule_create( + const char *ifname, struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra, uint32_t flags, + struct nxctl_traffic_rule **ntrp) +{ + struct nxctl_traffic_rule_eth *ntre; + struct nxctl_traffic_rule *ntr; + struct ifnet_traffic_descriptor_eth *tdi; + struct ifnet_traffic_rule_action_steer *ras; + + ntre = sk_alloc_type(struct nxctl_traffic_rule_eth, + Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); + ntr = &ntre->ntre_common; + + ntr->ntrt_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_ETH; + ntr->ntr_flags = flags; + uuid_generate(ntr->ntr_uuid); + os_ref_init(&ntr->ntr_refcnt, NULL); + + strlcpy(ntr->ntr_ifname, ifname, sizeof(ntr->ntr_ifname)); + proc_selfname(ntr->ntr_procname, sizeof(ntr->ntr_procname)); + + tdi = __container_of(td, struct ifnet_traffic_descriptor_eth, eth_common); + ras = __container_of(ra, struct ifnet_traffic_rule_action_steer, ras_common); + bcopy(tdi, &ntre->ntre_td, sizeof(ntre->ntre_td)); + bcopy(ras, &ntre->ntre_ra, sizeof(ntre->ntre_ra)); + + *ntrp = ntr; + return 0; +} + +void +eth_traffic_rule_destroy(struct nxctl_traffic_rule *ntr) +{ + struct nxctl_traffic_rule_eth *ntre; + + ASSERT(os_ref_get_count(&ntr->ntr_refcnt) == 0); + ntre = (struct nxctl_traffic_rule_eth *)ntr; + sk_free_type(struct nxctl_traffic_rule_eth, ntre); +} + +static void +convert_ntre_to_iocinfo(struct nxctl_traffic_rule_eth *ntre, + struct nxctl_traffic_rule_eth_iocinfo *info) +{ + struct nxctl_traffic_rule *ntr; + struct nxctl_traffic_rule_generic_iocinfo *ginfo; + + bzero(info, sizeof(*info)); + ntr = &ntre->ntre_common; + ginfo = &info->tre_common; + static_assert(sizeof(ntr->ntr_procname) == sizeof(ginfo->trg_procname)); + static_assert(sizeof(ntr->ntr_ifname) == sizeof(ginfo->trg_ifname)); + uuid_copy(ginfo->trg_uuid, ntr->ntr_uuid); + strbufcpy(ginfo->trg_procname, ntr->ntr_procname); + strbufcpy(ginfo->trg_ifname, ntr->ntr_ifname); + bcopy(&ntre->ntre_td, &info->tre_td, sizeof(info->tre_td)); + bcopy(&ntre->ntre_ra, &info->tre_ra, sizeof(info->tre_ra)); +} + +int 
+eth_traffic_rule_get_all(uint32_t size, + uint32_t *count, user_addr_t uaddr) +{ + struct nxctl_traffic_rule_eth *ntre = NULL; + struct nxctl_traffic_rule_eth_if *rif; + struct nxctl_traffic_rule_eth_iocinfo info; + int i, err; + + if (size != sizeof(info)) { + SK_ERR("size: actual %d, expected %lu", size, sizeof(info)); + return EINVAL; + } + if (rs == NULL) { + *count = 0; + return 0; + } + if (*count < rs->res_count) { + SK_ERR("count: given %d, require: %d", *count, rs->res_count); + return ENOBUFS; + } + SLIST_FOREACH(rif, &rs->res_if_list, rei_link) { + for (i = 0; i < NETHRULEMASKS; i++) { + SLIST_FOREACH(ntre, &rif->rei_lists[i], ntre_storage_link) { + convert_ntre_to_iocinfo(ntre, &info); + err = copyout(&info, uaddr, sizeof(info)); + if (err != 0) { + SK_ERR("copyout failed: %d", err); + return err; + } + uaddr += sizeof(info); + } + } + } + *count = rs->res_count; + return 0; +} diff --git a/bsd/skywalk/nexus/nexus_traffic_rule_eth.h b/bsd/skywalk/nexus/nexus_traffic_rule_eth.h new file mode 100644 index 000000000..8a63d45c2 --- /dev/null +++ b/bsd/skywalk/nexus/nexus_traffic_rule_eth.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SKYWALK_NEXUS_TRAFFIC_RULE_ETH_H_ +#define _SKYWALK_NEXUS_TRAFFIC_RULE_ETH_H_ + +#include + +__BEGIN_DECLS +void eth_traffic_rule_init(kern_allocation_name_t rule_tag); + +int eth_traffic_rule_validate( + const char *ifname, + struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra); + +int eth_traffic_rule_find(const char *ifname, + struct ifnet_traffic_descriptor_common *td, uint32_t flags, + struct nxctl_traffic_rule **ntrp); + +int eth_traffic_rule_find_by_uuid( + uuid_t uuid, struct nxctl_traffic_rule **ntrp); + +void eth_traffic_rule_link(struct nxctl_traffic_rule *ntr); + +void eth_traffic_rule_unlink(struct nxctl_traffic_rule *ntr); + +int eth_traffic_rule_notify(struct nxctl_traffic_rule *ntr, uint32_t flags); + +int eth_traffic_rule_get_count(const char *ifname, uint32_t *count); + +int eth_traffic_rule_create( + const char *ifname, struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra, uint32_t flags, + struct nxctl_traffic_rule **ntrp); + +void eth_traffic_rule_destroy(struct nxctl_traffic_rule *ntr); + +int eth_traffic_rule_get_all(uint32_t size, + uint32_t *count, user_addr_t uaddr); + +__END_DECLS + +#endif /* _SKYWALK_NEXUS_TRAFFIC_RULE_ETH_H_ */ diff --git a/bsd/skywalk/nexus/nexus_traffic_rule_inet.c b/bsd/skywalk/nexus/nexus_traffic_rule_inet.c new file mode 100644 index 000000000..cc1e76ea4 --- /dev/null +++ b/bsd/skywalk/nexus/nexus_traffic_rule_inet.c @@ -0,0 +1,915 @@ +/* + * Copyright (c) 2022 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +/* + * Inet-specific traffic rule. + */ +struct nxctl_traffic_rule_inet { + struct nxctl_traffic_rule ntri_common; + SLIST_ENTRY(nxctl_traffic_rule_inet) ntri_storage_link; + struct ifnet_traffic_descriptor_inet ntri_td; + struct ifnet_traffic_rule_action_steer ntri_ra; +}; + +/* + * Currently supported tuple types. 
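+ * Each entry in nxctl_inet_traffic_rule_masks below is built with
+ * ITRM(proto, laddr, raddr, lport, rport); the IP version bit is always
+ * implied.  For example, ITRM(1, 1, 0, 0, 1) describes a rule keyed on
+ * IP version, protocol, local address and remote port.  Combinations that
+ * name neither an address nor a port are intentionally left out.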
+ */ +#define ITRM(proto, laddr, raddr, lport, rport) \ + (IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER | \ + ITDBIT(proto, IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO) | \ + ITDBIT(laddr, IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR) | \ + ITDBIT(raddr, IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR) | \ + ITDBIT(lport, IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT) | \ + ITDBIT(rport, IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT)) + +static uint8_t nxctl_inet_traffic_rule_masks[] = { + ITRM(1, 1, 1, 1, 1), + ITRM(1, 1, 1, 1, 0), + ITRM(1, 1, 1, 0, 1), + ITRM(1, 1, 1, 0, 0), + ITRM(1, 1, 0, 1, 1), + ITRM(1, 1, 0, 1, 0), + ITRM(1, 1, 0, 0, 1), + ITRM(1, 1, 0, 0, 0), + ITRM(1, 0, 1, 1, 1), + ITRM(1, 0, 1, 1, 0), + ITRM(1, 0, 1, 0, 1), + ITRM(1, 0, 1, 0, 0), + ITRM(1, 0, 0, 1, 1), + ITRM(1, 0, 0, 1, 0), + ITRM(1, 0, 0, 0, 1), + // ITRM(1, 0, 0, 0, 0), addr or port is required + ITRM(0, 1, 1, 1, 1), + ITRM(0, 1, 1, 1, 0), + ITRM(0, 1, 1, 0, 1), + ITRM(0, 1, 1, 0, 0), + ITRM(0, 1, 0, 1, 1), + ITRM(0, 1, 0, 1, 0), + ITRM(0, 1, 0, 0, 1), + ITRM(0, 1, 0, 0, 0), + ITRM(0, 0, 1, 1, 1), + ITRM(0, 0, 1, 1, 0), + ITRM(0, 0, 1, 0, 1), + ITRM(0, 0, 1, 0, 0), + ITRM(0, 0, 0, 1, 1), + ITRM(0, 0, 0, 1, 0), + ITRM(0, 0, 0, 0, 1), + // ITRM(0, 0, 0, 0, 0), +}; +#define NINETRULEMASKS \ + (sizeof(nxctl_inet_traffic_rule_masks)/sizeof(uint8_t)) + +/* Per-interface lists of inet traffic rules */ +SLIST_HEAD(nxctl_traffic_rule_inet_head, nxctl_traffic_rule_inet); +struct nxctl_traffic_rule_inet_if { + char rii_ifname[IFNAMSIZ]; + struct nxctl_traffic_rule_inet_head rii_lists[NINETRULEMASKS]; + uint32_t rii_count; + SLIST_ENTRY(nxctl_traffic_rule_inet_if) rii_link; +}; + +/* List of per-interface lists */ +SLIST_HEAD(nxctl_traffic_rule_inet_if_head, nxctl_traffic_rule_inet_if); +struct nxctl_traffic_rule_inet_storage { + struct nxctl_traffic_rule_inet_if_head ris_if_list; + uint32_t ris_count; +}; + +static struct nxctl_traffic_rule_inet_storage *rs = NULL; +static kern_allocation_name_t nxctl_traffic_rule_tag = NULL; + +static boolean_t inet_v6addr_cmp(struct ifnet_ip_addr *a1, + struct ifnet_ip_addr *a2); + +/* + * If an interface attaches after rule(s) are added, this function is used + * retrieve the current rule count for that interface. + */ +int +nxctl_inet_traffic_rule_get_count(const char *ifname, uint32_t *count) +{ + int err; + + NXTR_RLOCK(); + err = inet_traffic_rule_get_count(ifname, count); + NXTR_RUNLOCK(); + + return err; +} + +/* + * Used for finding the qset id associated with a traffic descriptor. + */ +int +nxctl_inet_traffic_rule_find_qset_id(const char *ifname, + struct ifnet_traffic_descriptor_inet *td, uint64_t *qset_id) +{ + struct nxctl_traffic_rule_inet *__single ntri = NULL; + struct nxctl_traffic_rule *__single ntr = NULL; + int err; + + NXTR_RLOCK(); + err = inet_traffic_rule_find(ifname, &td->inet_common, 0, &ntr); + if (err != 0) { + goto fail; + } + ntri = __container_of(ntr, struct nxctl_traffic_rule_inet, ntri_common); + *qset_id = ntri->ntri_ra.ras_qset_id; + NXTR_RUNLOCK(); + return 0; +fail: + NXTR_RUNLOCK(); + return err; +} + +/* + * Based on flow_pkt_classify(). + * This function populates struct ifnet_traffic_descriptor_inet instead of struct __flow. 
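+ * It parses the IPv4/IPv6 and TCP/UDP headers from the packet's first
+ * buflet (past the L2 header), rejects truncated or malformed headers
+ * with EINVAL and non-TCP/UDP payloads with ENOTSUP, and fills in the
+ * ipver/proto/addr/port fields along with the full inet mask.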
+ */ +static int +fill_inet_td(struct __kern_packet *pkt, struct ifnet_traffic_descriptor_inet *td) +{ + union { + volatile struct ip *__indexable _iph; + volatile struct ip6_hdr *__indexable _ip6; + } _l3; + #define iph _l3._iph + #define ip6 _l3._ip6 + union { + volatile struct tcphdr *_tcph; + volatile struct udphdr *_udph; + } _l4; + #define tcph _l4._tcph + #define udph _l4._udph + uint8_t *pkt_buf, *l3_hdr; + uint32_t bdlen, bdlim, bdoff, cls_len; + size_t pkt_len; + uint8_t ipv, l3hlen = 0; /* IP header length */ + uint16_t l3tlen = 0; /* total length of IP packet */ + uint8_t l4hlen = 0; /* TCP/UDP header length */ + uint16_t ulen = 0; /* user data length */ + int err; + + ASSERT(pkt->pkt_l2_len <= pkt->pkt_length); + pkt_len = pkt->pkt_length - pkt->pkt_l2_len; + + MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff); + cls_len = bdlim - bdoff; + cls_len -= pkt->pkt_l2_len; + cls_len = (uint32_t)MIN(cls_len, pkt_len); + VERIFY(pkt_len >= cls_len); + if (cls_len == 0) { + SK_ERR("cls_len == 0"); + err = EINVAL; + goto fail; + } + l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len; + iph = (volatile struct ip *)(void *)l3_hdr; + ipv = iph->ip_v; + + switch (ipv) { + case 4: + if (cls_len < sizeof(struct ip)) { + SK_ERR("cls_len < sizeof(struct ip) (%d < %lu)", + cls_len, sizeof(struct ip)); + err = EINVAL; + goto fail; + } + l3hlen = (uint8_t)(iph->ip_hl << 2); + if (l3hlen < sizeof(struct ip)) { + SK_ERR("l3hlen < sizeof(struct ip) (%d < %lu)", + l3hlen, sizeof(struct ip)); + err = EINVAL; + goto fail; + } + if (cls_len < l3hlen) { + SK_ERR("cls_len < l3hlen (%d < %d)", cls_len, l3hlen); + err = EINVAL; + goto fail; + } + l3tlen = ntohs(iph->ip_len); + if (l3tlen < l3hlen) { + SK_ERR("l3tlen < l3hlen (%d < %d)", l3tlen, l3hlen); + err = EINVAL; + goto fail; + } + if (pkt_len < l3tlen) { + SK_ERR("pkt_len < l3tlen (%zu < %d)", pkt_len, l3tlen); + err = EINVAL; + goto fail; + } + td->inet_ipver = IPVERSION; + td->inet_proto = iph->ip_p; + bcopy(__DECONST(void *, &iph->ip_src), &td->inet_laddr.iia_v4addr, + sizeof(iph->ip_src)); + bcopy(__DECONST(void *, &iph->ip_dst), &td->inet_raddr.iia_v4addr, + sizeof(iph->ip_dst)); + break; + case 6: + l3hlen = sizeof(struct ip6_hdr); + if (cls_len < l3hlen) { + SK_ERR("cls_len < l3hlen (%d < %d)", cls_len, l3hlen); + err = EINVAL; + goto fail; + } + l3tlen = l3hlen + ntohs(ip6->ip6_plen); + if (pkt_len < l3tlen) { + SK_ERR("pkt_len < l3tlen (%zu < %d)", pkt_len, l3tlen); + err = EINVAL; + goto fail; + } + td->inet_ipver = IPV6_VERSION; + td->inet_proto = ip6->ip6_nxt; + bcopy(__DECONST(void *, &ip6->ip6_src), &td->inet_laddr, + sizeof(ip6->ip6_src)); + bcopy(__DECONST(void *, &ip6->ip6_dst), &td->inet_raddr, + sizeof(ip6->ip6_dst)); + break; + default: + SK_ERR("ipv == %d", ipv); + err = EINVAL; + goto fail; + } + tcph = __DECONST(volatile struct tcphdr *, (volatile uint8_t *)iph + l3hlen); + ulen = (l3tlen - l3hlen); + if (td->inet_proto == IPPROTO_TCP) { + if (cls_len < l3hlen + sizeof(*tcph) || ulen < sizeof(*tcph)) { + SK_ERR("cls_len < l3hlen + sizeof(*tcph) || ulen < sizeof(*tcph) " + "(%d < %d + %lu || %d < %lu)", cls_len, l3hlen, sizeof(*tcph), + ulen, sizeof(*tcph)); + err = EINVAL; + goto fail; + } + l4hlen = (uint8_t)(tcph->th_off << 2); + if (l4hlen < sizeof(*tcph)) { + SK_ERR("l4hlen < sizeof(*tcph) (%d < %lu)", l4hlen, sizeof(*tcph)); + err = EINVAL; + goto fail; + } + if (l4hlen > ulen) { + SK_ERR("l4hlen > ulen (%d > %d)", l4hlen, ulen); + err = EINVAL; + goto fail; + } + bcopy(__DECONST(void *, &tcph->th_sport), 
&td->inet_lport, + sizeof(td->inet_lport)); + bcopy(__DECONST(void *, &tcph->th_dport), &td->inet_rport, + sizeof(td->inet_rport)); + } else if (td->inet_proto == IPPROTO_UDP) { + if (cls_len < l3hlen + sizeof(*udph) || ulen < sizeof(*udph)) { + SK_ERR("cls_len < l3hlen + sizeof(*udph) || ulen < sizeof(*udph) " + "(%d < %d + %lu || %d < %lu)", cls_len, l3hlen, sizeof(*udph), + ulen, sizeof(*udph)); + err = EINVAL; + goto fail; + } + l4hlen = sizeof(*udph); + if (l4hlen > ulen) { + SK_ERR("l4hlen > ulen (%d > %d)", l4hlen, ulen); + err = EINVAL; + goto fail; + } + bcopy(__DECONST(void *, &udph->uh_sport), &td->inet_lport, + sizeof(td->inet_lport)); + bcopy(__DECONST(void *, &udph->uh_dport), &td->inet_rport, + sizeof(td->inet_rport)); + } else { + err = ENOTSUP; + goto fail; + } + + td->inet_common.itd_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_INET; + td->inet_common.itd_len = sizeof(*td); + td->inet_common.itd_flags = IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND | + IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND; + td->inet_mask |= (IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER | + IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO | + IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR | + IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR | + IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT | + IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT); + return 0; +fail: + DTRACE_SKYWALK5(classify__failed, struct ip *, iph, size_t, pkt_len, + uint8_t, pkt->pkt_l2_len, struct ifnet_traffic_descriptor_inet *, td, + int, err); + bzero(td, sizeof(*td)); + return err; + #undef iph + #undef ip6 + #undef tcph + #undef udph +} + +int +nxctl_inet_traffic_rule_find_qset_id_with_pkt(const char *ifname, + struct __kern_packet *pkt, uint64_t *qset_id) +{ + struct ifnet_traffic_descriptor_inet td; + int err; + + err = fill_inet_td(pkt, &td); + if (err != 0) { + return err; + } + return nxctl_inet_traffic_rule_find_qset_id(ifname, &td, qset_id); +} + +static struct ifnet_ip_addr v6_zeros_addr = {0}; +static boolean_t +inet_v6addr_cmp(struct ifnet_ip_addr *a1, struct ifnet_ip_addr *a2) +{ + return memcmp(a1, a2, sizeof(*a1)) == 0; +} + +void +inet_traffic_rule_init(kern_allocation_name_t rule_tag) +{ + ASSERT(nxctl_traffic_rule_tag == NULL); + nxctl_traffic_rule_tag = rule_tag; +} + +int +inet_traffic_rule_validate( + const char *ifname, + struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra) +{ + char buf[IFNAMSIZ]; + int unit, i; + struct ifnet_traffic_descriptor_inet *tdi; + uint8_t mask = 0, ipver, proto; + + if (ifunit_extract(ifname, buf, sizeof(buf), &unit) < 0) { + SK_ERR("invalid ifname: %s", ifname); + return EINVAL; + } + if (td->itd_len != sizeof(*tdi)) { + SK_ERR("invalid td len: expected %lu, actual %d", + sizeof(*tdi), td->itd_len); + return EINVAL; + } + if (td->itd_flags == 0 || + (td->itd_flags & + ~(IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND | + IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND)) != 0) { + SK_ERR("invalid td flags: 0x%x", td->itd_flags); + return EINVAL; + } + tdi = (struct ifnet_traffic_descriptor_inet *)td; + for (i = 0; i < NINETRULEMASKS; i++) { + if (tdi->inet_mask == nxctl_inet_traffic_rule_masks[i]) { + mask = tdi->inet_mask; + break; + } + } + if (mask == 0) { + SK_ERR("invalid inet mask: 0x%x", tdi->inet_mask); + return EINVAL; + } + ipver = tdi->inet_ipver; + if (ipver != IPVERSION && ipver != IPV6_VERSION) { + SK_ERR("invalid inet ipver: 0x%x", ipver); + return EINVAL; + } + proto = tdi->inet_proto; + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { + SK_ERR("invalid inet proto: %d", proto); + return EINVAL; + } + if ((mask & 
IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR) != 0) { + if (ipver == IPVERSION) { + if (tdi->inet_laddr.iia_v4addr == INADDR_ANY) { + SK_ERR("inet laddr v4 cannot be unspecified"); + return EINVAL; + } + } else { + if (inet_v6addr_cmp(&tdi->inet_laddr, &v6_zeros_addr)) { + SK_ERR("inet laddr v6 cannot be unspecified"); + return EINVAL; + } + } + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR) != 0) { + if (ipver == IPVERSION) { + if (tdi->inet_raddr.iia_v4addr == INADDR_ANY) { + SK_ERR("inet raddr v4 cannot be unspecified"); + return EINVAL; + } + } else { + if (inet_v6addr_cmp(&tdi->inet_raddr, &v6_zeros_addr)) { + SK_ERR("inet raddr v6 cannot be unspecified"); + return EINVAL; + } + } + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT) != 0) { + if (tdi->inet_lport == 0) { + SK_ERR("inet lport cannot be unspecified"); + return EINVAL; + } + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT) != 0) { + if (tdi->inet_rport == 0) { + SK_ERR("inet rport cannot be unspecified"); + return EINVAL; + } + } + if (ra->ra_len != sizeof(struct ifnet_traffic_rule_action_steer)) { + SK_ERR("invalid ra len: expected %lu, actual %d", + sizeof(struct ifnet_traffic_rule_action_steer), ra->ra_len); + return EINVAL; + } + return 0; +} + +SK_NO_INLINE_ATTRIBUTE +static void +inet_traffic_rule_storage_create(void) +{ + rs = sk_alloc_type(struct nxctl_traffic_rule_inet_storage, + Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); + SLIST_INIT(&rs->ris_if_list); + rs->ris_count = 0; + return; +} + +SK_NO_INLINE_ATTRIBUTE +static void +inet_traffic_rule_storage_destroy(void) +{ + ASSERT(rs->ris_count == 0); + ASSERT(SLIST_EMPTY(&rs->ris_if_list)); + sk_free_type(struct nxctl_traffic_rule_inet_storage, rs); +} + +SK_NO_INLINE_ATTRIBUTE +static struct nxctl_traffic_rule_inet_if * +inet_traffic_rule_if_create(const char *ifname) +{ + struct nxctl_traffic_rule_inet_if *rif; + int i; + + rif = sk_alloc_type(struct nxctl_traffic_rule_inet_if, + Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); + for (i = 0; i < NINETRULEMASKS; i++) { + SLIST_INIT(&rif->rii_lists[i]); + } + strlcpy(rif->rii_ifname, ifname, sizeof(rif->rii_ifname)); + rif->rii_count = 0; + return rif; +} + +SK_NO_INLINE_ATTRIBUTE +static void +inet_traffic_rule_if_destroy(struct nxctl_traffic_rule_inet_if *rif) +{ + int i; + + for (i = 0; i < NINETRULEMASKS; i++) { + ASSERT(SLIST_EMPTY(&rif->rii_lists[i])); + } + ASSERT(rif->rii_count == 0); + sk_free_type(struct nxctl_traffic_rule_inet_if, rif); +} + +SK_NO_INLINE_ATTRIBUTE +static boolean_t +inet_traffic_rule_match(struct nxctl_traffic_rule_inet *ntri, const char *ifname, + uint32_t flags, struct ifnet_traffic_descriptor_inet *tdi) +{ + struct nxctl_traffic_rule *ntr = (struct nxctl_traffic_rule *)ntri; + struct ifnet_traffic_descriptor_inet *tdi0; + uint8_t mask; + boolean_t exact; + + VERIFY(strlcmp(ntr->ntr_ifname, ifname, sizeof(ntr->ntr_ifname)) == 0); + tdi0 = &ntri->ntri_td; + + exact = ((flags & NTR_FIND_FLAG_EXACT) != 0); + mask = tdi0->inet_mask & tdi->inet_mask; + if (exact) { + ASSERT(tdi0->inet_mask == tdi->inet_mask); + } + ASSERT((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER) != 0); + if (tdi0->inet_ipver != tdi->inet_ipver) { + DTRACE_SKYWALK2(ipver__mismatch, + uint8_t, tdi0->inet_ipver, uint8_t, tdi->inet_ipver); + return FALSE; + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO) != 0 && + tdi0->inet_proto != tdi->inet_proto) { + DTRACE_SKYWALK2(proto__mismatch, + uint8_t, tdi0->inet_proto, uint8_t, tdi->inet_proto); + return FALSE; + } + if (tdi0->inet_ipver == IPVERSION) { + if ((mask &
IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR) != 0 && + tdi0->inet_laddr.iia_v4addr != tdi->inet_laddr.iia_v4addr) { + DTRACE_SKYWALK2(v4laddr__mismatch, + in_addr_t, tdi0->inet_laddr.iia_v4addr, + in_addr_t, tdi->inet_laddr.iia_v4addr); + return FALSE; + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR) != 0 && + tdi0->inet_raddr.iia_v4addr != tdi->inet_raddr.iia_v4addr) { + DTRACE_SKYWALK2(v4raddr__mismatch, + in_addr_t, tdi0->inet_raddr.iia_v4addr, + in_addr_t, tdi->inet_raddr.iia_v4addr); + return FALSE; + } + } else { + ASSERT(tdi0->inet_ipver == IPV6_VERSION); + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR) != 0 && + !inet_v6addr_cmp(&tdi0->inet_laddr, &tdi->inet_laddr)) { + DTRACE_SKYWALK2(v6laddr__mismatch, + struct in6_addr *, &tdi0->inet_laddr, + struct in6_addr *, &tdi->inet_laddr); + return FALSE; + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR) != 0 && + !inet_v6addr_cmp(&tdi0->inet_raddr, &tdi->inet_raddr)) { + DTRACE_SKYWALK2(v6raddr__mismatch, + struct in6_addr *, &tdi0->inet_raddr, + struct in6_addr *, &tdi->inet_raddr); + return FALSE; + } + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT) != 0 && + tdi0->inet_lport != tdi->inet_lport) { + DTRACE_SKYWALK2(lport__mismatch, + uint8_t, tdi0->inet_lport, uint8_t, tdi->inet_lport); + return FALSE; + } + if ((mask & IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT) != 0 && + tdi0->inet_rport != tdi->inet_rport) { + DTRACE_SKYWALK2(rport__mismatch, + uint8_t, tdi0->inet_rport, uint8_t, tdi->inet_rport); + return FALSE; + } + return TRUE; +} + +int +inet_traffic_rule_find(const char *ifname, + struct ifnet_traffic_descriptor_common *td, uint32_t flags, + struct nxctl_traffic_rule **ntrp) +{ + struct nxctl_traffic_rule_inet *ntri = NULL; + struct nxctl_traffic_rule_inet_if *rif; + struct ifnet_traffic_descriptor_inet *tdi = + (struct ifnet_traffic_descriptor_inet *)td; + int i; + + if (rs == NULL) { + return ENOENT; + } + SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { + if (strlcmp(rif->rii_ifname, ifname, sizeof(rif->rii_ifname)) != 0) { + continue; + } + for (i = 0; i < NINETRULEMASKS; i++) { + if ((flags & NTR_FIND_FLAG_EXACT) != 0 && + tdi->inet_mask != nxctl_inet_traffic_rule_masks[i]) { + continue; + } + SLIST_FOREACH(ntri, &rif->rii_lists[i], ntri_storage_link) { + if (inet_traffic_rule_match(ntri, ifname, flags, tdi)) { + *ntrp = (struct nxctl_traffic_rule *)ntri; + return 0; + } + } + } + } + return ENOENT; +} + +int +inet_traffic_rule_find_by_uuid( + uuid_t uuid, struct nxctl_traffic_rule **ntrp) +{ + struct nxctl_traffic_rule_inet *ntri; + struct nxctl_traffic_rule *ntr; + struct nxctl_traffic_rule_inet_if *rif; + int i; + + if (rs == NULL) { + return ENOENT; + } + SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { + for (i = 0; i < NINETRULEMASKS; i++) { + SLIST_FOREACH(ntri, &rif->rii_lists[i], ntri_storage_link) { + ntr = &ntri->ntri_common; + if (uuid_compare(ntr->ntr_uuid, uuid) == 0) { + *ntrp = ntr; + return 0; + } + } + } + } + return ENOENT; +} + +static void +inet_update_ifnet_traffic_rule_count(const char *ifname, uint32_t count) +{ + struct ifnet *ifp; + + ifp = ifunit_ref(ifname); + if (ifp == NULL) { + DTRACE_SKYWALK1(ifname__not__found, char *, ifname); + return; + } + ifnet_update_inet_traffic_rule_count(ifp, count); + ifnet_decr_iorefcnt(ifp); +} + +void +inet_traffic_rule_link(struct nxctl_traffic_rule *ntr) +{ + struct nxctl_traffic_rule_inet_if *rif; + struct nxctl_traffic_rule_inet *ntri = + (struct nxctl_traffic_rule_inet *)ntr; + struct nxctl_traffic_rule_inet_head *list = NULL; + int i; + char 
*__null_terminated ntr_ifname = NULL; + char *__null_terminated rii_ifname = NULL; + + if (rs == NULL) { + inet_traffic_rule_storage_create(); + } + SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { + if (strbufcmp(rif->rii_ifname, ntr->ntr_ifname) == 0) { + break; + } + } + if (rif == NULL) { + ntr_ifname = __unsafe_null_terminated_from_indexable(ntr->ntr_ifname); + rif = inet_traffic_rule_if_create(ntr_ifname); + SLIST_INSERT_HEAD(&rs->ris_if_list, rif, rii_link); + } + for (i = 0; i < NINETRULEMASKS; i++) { + if (ntri->ntri_td.inet_mask == + nxctl_inet_traffic_rule_masks[i]) { + list = &rif->rii_lists[i]; + break; + } + } + retain_traffic_rule(ntr); + ASSERT(list != NULL); + SLIST_INSERT_HEAD(list, ntri, ntri_storage_link); + /* per-interface count */ + rif->rii_count++; + rii_ifname = __unsafe_null_terminated_from_indexable(rif->rii_ifname); + inet_update_ifnet_traffic_rule_count(rii_ifname, rif->rii_count); + + /* global count */ + rs->ris_count++; +} + +void +inet_traffic_rule_unlink(struct nxctl_traffic_rule *ntr) +{ + struct nxctl_traffic_rule_inet_if *rif; + struct nxctl_traffic_rule_inet *ntri = + (struct nxctl_traffic_rule_inet *)ntr; + struct nxctl_traffic_rule_inet_head *list = NULL; + int i; + char *__null_terminated rii_ifname = NULL; + + ASSERT(rs != NULL); + SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { + if (strbufcmp(rif->rii_ifname, ntr->ntr_ifname) == 0) { + break; + } + } + ASSERT(rif != NULL); + for (i = 0; i < NINETRULEMASKS; i++) { + if (ntri->ntri_td.inet_mask == + nxctl_inet_traffic_rule_masks[i]) { + list = &rif->rii_lists[i]; + break; + } + } + ASSERT(list != NULL); + SLIST_REMOVE(list, ntri, nxctl_traffic_rule_inet, ntri_storage_link); + rif->rii_count--; + rii_ifname = __unsafe_null_terminated_from_indexable(rif->rii_ifname); + inet_update_ifnet_traffic_rule_count(rii_ifname, rif->rii_count); + + rs->ris_count--; + release_traffic_rule(ntr); + + if (rif->rii_count == 0) { + SLIST_REMOVE(&rs->ris_if_list, rif, nxctl_traffic_rule_inet_if, rii_link); + inet_traffic_rule_if_destroy(rif); + } + if (rs->ris_count == 0) { + inet_traffic_rule_storage_destroy(); + } +} + +/* + * XXX + * This may need additional changes to ensure safety against detach/attach. + * This is not an issue for the first consumer of llink interfaces, cellular, + * which does not detach. 
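+ * The notify path below requires the resolved qset to have been
+ * initialized (NETIF_QSET_FLAG_EXT_INITED) before steering information
+ * is pushed to the provider; otherwise it fails with ENXIO.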
+ */ +int +inet_traffic_rule_notify(struct nxctl_traffic_rule *ntr, uint32_t flags) +{ + struct ifnet *ifp; + struct nx_netif *nif; + struct netif_qset *__single qset = NULL; + struct nxctl_traffic_rule_inet *ntri; + int err = 0; + char *__null_terminated ntr_ifname = NULL; + + ntr_ifname = __unsafe_null_terminated_from_indexable(ntr->ntr_ifname); + ifp = ifunit_ref(ntr_ifname); + if (ifp == NULL) { + DTRACE_SKYWALK1(ifname__not__found, char *, ntr->ntr_ifname); + err = ENXIO; + goto done; + } + nif = NA(ifp)->nifna_netif; + if (!NX_LLINK_PROV(nif->nif_nx)) { + DTRACE_SKYWALK1(llink__not__enabled, struct ifnet *, ifp); + err = ENOTSUP; + goto done; + } + ntri = (struct nxctl_traffic_rule_inet *)ntr; + qset = nx_netif_find_qset(nif, ntri->ntri_ra.ras_qset_id); + if (qset == NULL || (qset->nqs_flags & NETIF_QSET_FLAG_EXT_INITED) == 0) { + DTRACE_SKYWALK1(qset__not__initialized, struct netif_qset *, qset); + err = ENXIO; + goto done; + } + err = nx_netif_notify_steering_info(nif, qset, + (struct ifnet_traffic_descriptor_common *)&ntri->ntri_td, + ((flags & NTR_NOTIFY_FLAG_ADD) != 0)); +done: + if (qset != NULL) { + nx_netif_qset_release(&qset); + } + if (ifp != NULL) { + ifnet_decr_iorefcnt(ifp); + } + return err; +} + +int +inet_traffic_rule_get_count(const char *ifname, uint32_t *count) +{ + struct nxctl_traffic_rule_inet_if *rif; + int err; + + if (rs == NULL) { + err = ENOENT; + goto fail; + } + SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { + if (strlcmp(rif->rii_ifname, ifname, sizeof(rif->rii_ifname)) == 0) { + break; + } + } + if (rif == NULL) { + err = ENOENT; + goto fail; + } + *count = rif->rii_count; + return 0; +fail: + return err; +} + +int +inet_traffic_rule_create( + const char *ifname, struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra, uint32_t flags, + struct nxctl_traffic_rule **ntrp) +{ + struct nxctl_traffic_rule_inet *ntri; + struct nxctl_traffic_rule *ntr; + struct ifnet_traffic_descriptor_inet *tdi; + struct ifnet_traffic_rule_action_steer *ras; + + ntri = sk_alloc_type(struct nxctl_traffic_rule_inet, + Z_WAITOK | Z_NOFAIL, nxctl_traffic_rule_tag); + ntr = &ntri->ntri_common; + + ntr->ntrt_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_INET; + ntr->ntr_flags = flags; + uuid_generate(ntr->ntr_uuid); + os_ref_init(&ntr->ntr_refcnt, NULL); + + strlcpy(ntr->ntr_ifname, ifname, sizeof(ntr->ntr_ifname)); + proc_selfname(ntr->ntr_procname, sizeof(ntr->ntr_procname)); + + tdi = __container_of(td, struct ifnet_traffic_descriptor_inet, inet_common); + ras = __container_of(ra, struct ifnet_traffic_rule_action_steer, ras_common); + bcopy(tdi, &ntri->ntri_td, sizeof(ntri->ntri_td)); + bcopy(ras, &ntri->ntri_ra, sizeof(ntri->ntri_ra)); + + *ntrp = ntr; + return 0; +} + +void +inet_traffic_rule_destroy(struct nxctl_traffic_rule *ntr) +{ + struct nxctl_traffic_rule_inet *ntri; + + ASSERT(os_ref_get_count(&ntr->ntr_refcnt) == 0); + ntri = (struct nxctl_traffic_rule_inet *)ntr; + sk_free_type(struct nxctl_traffic_rule_inet, ntri); +} + +static void +convert_ntri_to_iocinfo(struct nxctl_traffic_rule_inet *ntri, + struct nxctl_traffic_rule_inet_iocinfo *info) +{ + struct nxctl_traffic_rule *ntr; + struct nxctl_traffic_rule_generic_iocinfo *ginfo; + + bzero(info, sizeof(*info)); + ntr = &ntri->ntri_common; + ginfo = &info->tri_common; + static_assert(sizeof(ntr->ntr_procname) == sizeof(ginfo->trg_procname)); + static_assert(sizeof(ntr->ntr_ifname) == sizeof(ginfo->trg_ifname)); + uuid_copy(ginfo->trg_uuid, ntr->ntr_uuid); + strbufcpy(ginfo->trg_procname, 
ntr->ntr_procname); + strbufcpy(ginfo->trg_ifname, ntr->ntr_ifname); + bcopy(&ntri->ntri_td, &info->tri_td, sizeof(info->tri_td)); + bcopy(&ntri->ntri_ra, &info->tri_ra, sizeof(info->tri_ra)); +} + +int +inet_traffic_rule_get_all(uint32_t size, + uint32_t *count, user_addr_t uaddr) +{ + struct nxctl_traffic_rule_inet *ntri = NULL; + struct nxctl_traffic_rule_inet_if *rif; + struct nxctl_traffic_rule_inet_iocinfo info; + int i, err; + + if (size != sizeof(info)) { + SK_ERR("size: actual %u, expected %lu", size, sizeof(info)); + return EINVAL; + } + if (rs == NULL) { + *count = 0; + return 0; + } + if (*count < rs->ris_count) { + SK_ERR("count: given %d, require: %d", *count, rs->ris_count); + return ENOBUFS; + } + SLIST_FOREACH(rif, &rs->ris_if_list, rii_link) { + for (i = 0; i < NINETRULEMASKS; i++) { + SLIST_FOREACH(ntri, &rif->rii_lists[i], ntri_storage_link) { + convert_ntri_to_iocinfo(ntri, &info); + err = copyout(&info, uaddr, sizeof(info)); + if (err != 0) { + SK_ERR("copyout failed: %d", err); + return err; + } + uaddr += sizeof(info); + } + } + } + *count = rs->ris_count; + return 0; +} diff --git a/bsd/skywalk/nexus/nexus_traffic_rule_inet.h b/bsd/skywalk/nexus/nexus_traffic_rule_inet.h new file mode 100644 index 000000000..87bcf81dc --- /dev/null +++ b/bsd/skywalk/nexus/nexus_traffic_rule_inet.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SKYWALK_NEXUS_TRAFFIC_RULE_INET_H_ +#define _SKYWALK_NEXUS_TRAFFIC_RULE_INET_H_ + +#include + +__BEGIN_DECLS +void inet_traffic_rule_init(kern_allocation_name_t rule_tag); + +int inet_traffic_rule_validate(const char *ifname, + struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra); + +int inet_traffic_rule_find(const char *ifname, + struct ifnet_traffic_descriptor_common *td, uint32_t flags, + struct nxctl_traffic_rule **ntrp); + +int inet_traffic_rule_find_by_uuid(uuid_t uuid, + struct nxctl_traffic_rule **ntrp); + +void inet_traffic_rule_link(struct nxctl_traffic_rule *ntr); + +void inet_traffic_rule_unlink(struct nxctl_traffic_rule *ntr); + +int inet_traffic_rule_notify(struct nxctl_traffic_rule *ntr, uint32_t flags); + +int inet_traffic_rule_get_count(const char *ifname, uint32_t *count); + +int inet_traffic_rule_create( + const char *ifname, struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra, uint32_t flags, + struct nxctl_traffic_rule **ntrp); + +void inet_traffic_rule_destroy(struct nxctl_traffic_rule *ntr); + +int inet_traffic_rule_get_all(uint32_t size, + uint32_t *count, user_addr_t uaddr); + +__END_DECLS + +#endif /* _SKYWALK_NEXUS_TRAFFIC_RULE_INET_H_ */ diff --git a/bsd/skywalk/nexus/nexus_var.h b/bsd/skywalk/nexus/nexus_var.h index b95cb88a3..90e0fde74 100644 --- a/bsd/skywalk/nexus/nexus_var.h +++ b/bsd/skywalk/nexus/nexus_var.h @@ -399,8 +399,7 @@ struct nxdom { (struct kern_nexus *, nexus_port_t); int (*nxdom_connect) /* required */ (struct kern_nexus_domain_provider *, struct kern_nexus *, - struct kern_channel *, struct chreq *, struct kern_channel *, - struct nxbind *, struct proc *); + struct kern_channel *, struct chreq *, struct nxbind *, struct proc *); void (*nxdom_disconnect) /* required */ (struct kern_nexus_domain_provider *, struct kern_nexus *, struct kern_channel *); @@ -512,6 +511,11 @@ extern int nxctl_inet_traffic_rule_find_qset_id_with_pkt(const char *, extern int nxctl_inet_traffic_rule_find_qset_id(const char *, struct ifnet_traffic_descriptor_inet *, uint64_t *); extern int nxctl_inet_traffic_rule_get_count(const char *, uint32_t *); +extern int nxctl_eth_traffic_rule_find_qset_id_with_pkt(const char *, + struct __kern_packet *, uint64_t *); +extern int nxctl_eth_traffic_rule_find_qset_id(const char *, + uint16_t, ether_addr_t *, uint64_t *); +extern int nxctl_eth_traffic_rule_get_count(const char *, uint32_t *); extern int nxctl_get_opt(struct nxctl *, struct sockopt *); extern int nxctl_set_opt(struct nxctl *, struct sockopt *); extern void nxctl_retain(struct nxctl *); @@ -641,32 +645,6 @@ nx_tx_doorbell(struct __kern_channel_ring *kring, boolean_t async) kring, (async ? 
KERN_NEXUS_TXDOORBELLF_ASYNC_REFILL: 0)); } -__attribute__((always_inline)) -static inline int -nx_rx_sync_packets(struct __kern_channel_ring *kring, - uint64_t *__counted_by(*count)packets, uint32_t *count) -{ - struct kern_nexus_provider *nxprov = NX_PROV(KRNA(kring)->na_nx); - - ASSERT(kring->ckr_tx == NR_RX); - if (nxprov->nxprov_ext.nxpi_rx_sync_packets != NULL) { - return nxprov->nxprov_ext.nxpi_rx_sync_packets(nxprov, - KRNA(kring)->na_nx, kring, packets, count, 0); - } else { - return 0; - } -} - -__attribute__((always_inline)) -static inline boolean_t -nx_has_rx_sync_packets(struct __kern_channel_ring *kring) -{ - struct kern_nexus_provider *nxprov = NX_PROV(KRNA(kring)->na_nx); - - ASSERT(kring->ckr_tx == NR_RX); - return nxprov->nxprov_ext.nxpi_rx_sync_packets != NULL; -} - __attribute__((always_inline)) static __inline__ errno_t nx_tx_qset_notify(struct kern_nexus *nx, void *qset_ctx) diff --git a/bsd/skywalk/nexus/os_nexus.h b/bsd/skywalk/nexus/os_nexus.h index 62818b862..8879ce65d 100644 --- a/bsd/skywalk/nexus/os_nexus.h +++ b/bsd/skywalk/nexus/os_nexus.h @@ -81,12 +81,6 @@ typedef enum { NEXUS_TYPE_NET_IF, /* network interface (kernel) */ NEXUS_TYPE_FLOW_SWITCH, /* flow switch (user/kernel) */ #ifdef BSD_KERNEL_PRIVATE - /* - * Monitor nexus isn't directly usable on its own; we just - * need a type definition here for it to act as a pseudo - * domain provider. - */ - NEXUS_TYPE_MONITOR, /* monitor (user) */ NEXUS_TYPE_MAX, /* this needs to be last */ NEXUS_TYPE_UNDEFINED = -1, /* for kernel internal use */ #endif /* BSD_KERNEL_PRIVATE */ @@ -711,6 +705,8 @@ typedef enum { KERN_NEXUS_CAPAB_INTERFACE_ADVISORY = 1, /* extends queue set functionality: e.g. notify steering info */ KERN_NEXUS_CAPAB_QSET_EXTENSIONS, + /* Rx flow steering to support AOP offload traffic */ + KERN_NEXUS_CAPAB_RX_FLOW_STEERING, } kern_nexus_capab_t; typedef errno_t (*nxprov_capab_config_fn_t)(kern_nexus_provider_t nexus_prov, @@ -754,6 +750,19 @@ struct kern_nexus_capab_qset_extensions { kern_nexus_capab_qsext_notify_steering_info_fn_t cqe_notify_steering_info; }; + +#define KERN_NEXUS_CAPAB_RX_FLOW_STEERING_VERSION_1 1 +typedef errno_t (*kern_nexus_capab_rx_flow_steering_config_fn_t)( + void *provider_context, + uint32_t id, + struct ifnet_traffic_descriptor_common *td, + uint32_t action); +struct kern_nexus_capab_rx_flow_steering { + uint32_t kncrxfs_version; + void *kncrxfs_prov_ctx; + kern_nexus_capab_rx_flow_steering_config_fn_t kncrxfs_config; +}; + /* * Nexus provider init (version 1) */ @@ -771,8 +780,8 @@ struct kern_nexus_provider_init { nxprov_sync_tx_fn_t nxpi_sync_tx; /* required */ nxprov_sync_rx_fn_t nxpi_sync_rx; /* required */ nxprov_tx_doorbell_fn_t nxpi_tx_doorbell; /* required (netif) */ - nxprov_sync_packets_fn_t nxpi_rx_sync_packets; /* optional (netif) */ - nxprov_sync_packets_fn_t nxpi_tx_sync_packets; /* optional (netif) */ + nxprov_sync_packets_fn_t nxpi_rx_sync_packets; /* DO NOT USE (netif) */ + nxprov_sync_packets_fn_t nxpi_tx_sync_packets; /* DO NOT USE (netif) */ nxprov_capab_config_fn_t nxpi_config_capab; /* optional (netif) */ }; @@ -1074,10 +1083,6 @@ extern errno_t kern_nexus_netif_llink_remove(struct kern_nexus *, extern errno_t kern_netif_qset_tx_queue_len(kern_netif_qset_t, uint32_t, uint32_t *, uint32_t *); -extern void kern_netif_set_qset_combined(kern_netif_qset_t qset); - -extern void kern_netif_set_qset_separate(kern_netif_qset_t qset); - /* * Misc. 
*/ diff --git a/bsd/skywalk/nexus/os_nexus_private.h b/bsd/skywalk/nexus/os_nexus_private.h index 92b5487df..9f98187ed 100644 --- a/bsd/skywalk/nexus/os_nexus_private.h +++ b/bsd/skywalk/nexus/os_nexus_private.h @@ -392,7 +392,7 @@ struct nx_flow_req { union sockaddr_in_4_6 nfr_daddr; uint8_t nfr_ip_protocol; uint8_t nfr_transport_protocol; - uint16_t nfr_flags; + uint32_t nfr_flags; uuid_t nfr_flow_uuid; packet_svc_class_t nfr_svc_class; uuid_t nfr_euuid; @@ -405,6 +405,7 @@ struct nx_flow_req { uuid_t nfr_parent_flow_uuid; uint8_t nfr_flow_demux_count; struct flow_demux_pattern nfr_flow_demux_patterns[MAX_FLOW_DEMUX_PATTERN]; + uint32_t nfr_flowid; // below is reserved kernel-only fields union { #ifdef KERNEL @@ -438,26 +439,30 @@ struct nx_flow_req { }; /* valid flags for nfr_flags */ -#define NXFLOWREQF_TRACK 0x0001 /* enable state tracking */ -#define NXFLOWREQF_QOS_MARKING 0x0002 /* allow qos marking */ -#define NXFLOWREQF_FILTER 0x0004 /* interpose filter */ -#define NXFLOWREQF_CUSTOM_ETHER 0x0008 /* custom ethertype */ -#define NXFLOWREQF_IPV6_ULA 0x0010 /* ipv6 ula */ -#define NXFLOWREQF_LISTENER 0x0020 /* listener */ -#define NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION 0x0040 /* override system address selection */ -#define NXFLOWREQF_USE_STABLE_ADDRESS 0x0080 /* if override local, use stable address */ -#define NXFLOWREQF_FLOWADV 0x0100 /* allocate flow advisory */ -#define NXFLOWREQF_ASIS 0x0200 /* create flow as is in nfr */ -#define NXFLOWREQF_LOW_LATENCY 0x0400 /* low latency flow */ -#define NXFLOWREQF_NOWAKEFROMSLEEP 0x0800 /* Don't wake for traffic to this flow */ -#define NXFLOWREQF_REUSEPORT 0x1000 /* Don't wake for traffic to this flow */ -#define NXFLOWREQF_PARENT 0x4000 /* Parent flow */ +#define NXFLOWREQF_TRACK 0x00000001 /* enable state tracking */ +#define NXFLOWREQF_QOS_MARKING 0x00000002 /* allow qos marking */ +#define NXFLOWREQF_FILTER 0x00000004 /* interpose filter */ +#define NXFLOWREQF_CUSTOM_ETHER 0x00000008 /* custom ethertype */ +#define NXFLOWREQF_IPV6_ULA 0x00000010 /* ipv6 ula */ +#define NXFLOWREQF_LISTENER 0x00000020 /* listener */ +#define NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION 0x00000040 /* override system address selection */ +#define NXFLOWREQF_USE_STABLE_ADDRESS 0x00000080 /* if override local, use stable address */ +#define NXFLOWREQF_FLOWADV 0x00000100 /* allocate flow advisory */ +#define NXFLOWREQF_ASIS 0x00000200 /* create flow as is in nfr */ +#define NXFLOWREQF_LOW_LATENCY 0x00000400 /* low latency flow */ +#define NXFLOWREQF_NOWAKEFROMSLEEP 0x00000800 /* Don't wake for traffic to this flow */ +#define NXFLOWREQF_REUSEPORT 0x00001000 /* Don't wake for traffic to this flow */ +#define NXFLOWREQF_PARENT 0x00004000 /* Parent flow */ +#define NXFLOWREQF_AOP_OFFLOAD 0x00008000 /* AOP2 offload flow */ +#define NXFLOWREQF_CONNECTION_IDLE 0x00010000 /* connection is idle */ +#define NXFLOWREQF_CONNECTION_REUSED 0x00020000 /* connection is reused */ #define NXFLOWREQF_BITS \ - "\020\01TRACK\02QOS_MARKING\03FILTER\04CUSTOM_ETHER\05IPV6_ULA" \ - "\06LISTENER\07OVERRIDE_ADDRESS_SELECTION\010USE_STABLE_ADDRESS" \ - "\011ALLOC_FLOWADV\012ASIS\013LOW_LATENCY\014NOWAKEUPFROMSLEEP" \ - "\015REUSEPORT\017PARENT" + "\020\01TRACK\02QOS_MARKING\03FILTER\04CUSTOM_ETHER\05IPV6_ULA" \ + "\06LISTENER\07OVERRIDE_ADDRESS_SELECTION\010USE_STABLE_ADDRESS" \ + "\011ALLOC_FLOWADV\012ASIS\013LOW_LATENCY\014NOWAKEUPFROMSLEEP" \ + "\015REUSEPORT\017PARENT\020AOP_OFFLOAD\021CONNECTION_IDLE\022CONNECTION_REUSED" + struct flow_ip_addr { union { @@ -514,8 +519,8 @@ extern const 
struct flow_key fk_mask_ipflow2; extern const struct flow_key fk_mask_ipflow3; #define FLOW_KEY_CLEAR(_fk) do { \ - _CASSERT(FLOW_KEY_LEN == 48); \ - _CASSERT(FLOW_KEY_LEN == sizeof(struct flow_key)); \ + static_assert(FLOW_KEY_LEN == 48); \ + static_assert(FLOW_KEY_LEN == sizeof(struct flow_key)); \ sk_zero_48(_fk); \ } while (0) @@ -526,7 +531,8 @@ extern const struct flow_key fk_mask_ipflow3; NXFLOWREQF_CUSTOM_ETHER | NXFLOWREQF_IPV6_ULA | NXFLOWREQF_LISTENER | \ NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION | NXFLOWREQF_USE_STABLE_ADDRESS | \ NXFLOWREQF_FLOWADV | NXFLOWREQF_LOW_LATENCY | NXFLOWREQF_NOWAKEFROMSLEEP | \ - NXFLOWREQF_REUSEPORT | NXFLOWREQF_PARENT) + NXFLOWREQF_REUSEPORT | NXFLOWREQF_PARENT | NXFLOWREQF_AOP_OFFLOAD | \ + NXFLOWREQF_CONNECTION_IDLE | NXFLOWREQF_CONNECTION_REUSED) #define NXFLOWREQF_EXT_PORT_RSV 0x1000 /* external port reservation */ #define NXFLOWREQF_EXT_PROTO_RSV 0x2000 /* external proto reservation */ @@ -534,8 +540,7 @@ extern const struct flow_key fk_mask_ipflow3; static inline void nx_flow_req_internalize(struct nx_flow_req *req) { - _CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) == - offsetof(struct nx_flow_req, _nfr_common_field_end)); + static_assert(offsetof(struct nx_flow_req, _nfr_kernel_field_end) == offsetof(struct nx_flow_req, _nfr_common_field_end)); /* init kernel only fields */ bzero(&req->_nfr_opaque, sizeof(req->_nfr_opaque)); @@ -688,6 +693,8 @@ extern int __os_nexus_get_llink_info(const nexus_controller_t ncd, const uuid_t nx_uuid, const struct nx_llink_info_req *nlir, size_t len); extern int os_nexus_flow_set_wake_from_sleep(const uuid_t nx_uuid, const uuid_t flow_uuid, bool enable); +extern int os_nexus_flow_set_connection_idle(const uuid_t nx_uuid, + const uuid_t flow_uuid, bool enable); __END_DECLS #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/skywalk/nexus/upipe/nx_user_pipe.c b/bsd/skywalk/nexus/upipe/nx_user_pipe.c index 025dc2dad..4093558e4 100644 --- a/bsd/skywalk/nexus/upipe/nx_user_pipe.c +++ b/bsd/skywalk/nexus/upipe/nx_user_pipe.c @@ -90,8 +90,8 @@ static int nx_upipe_dom_bind_port(struct kern_nexus *, nexus_port_t *, struct nxbind *, void *); static int nx_upipe_dom_unbind_port(struct kern_nexus *, nexus_port_t); static int nx_upipe_dom_connect(struct kern_nexus_domain_provider *, - struct kern_nexus *, struct kern_channel *, struct chreq *, - struct kern_channel *, struct nxbind *, struct proc *); + struct kern_nexus *, struct kern_channel *, struct chreq *, struct nxbind *, + struct proc *); static void nx_upipe_dom_disconnect(struct kern_nexus_domain_provider *, struct kern_nexus *, struct kern_channel *); static void nx_upipe_dom_defunct(struct kern_nexus_domain_provider *, @@ -131,8 +131,8 @@ struct nxdom nx_upipe_dom_s = { .nxdom_prov_head = STAILQ_HEAD_INITIALIZER(nx_upipe_dom_s.nxdom_prov_head), .nxdom_type = NEXUS_TYPE_USER_PIPE, - .nxdom_md_type = NEXUS_META_TYPE_QUANTUM, - .nxdom_md_subtype = NEXUS_META_SUBTYPE_PAYLOAD, + .nxdom_md_type = NEXUS_META_TYPE_PACKET, + .nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW, .nxdom_name = "upipe", .nxdom_ports = { .nb_def = 2, @@ -334,7 +334,7 @@ nx_upipe_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov, int err = 0; SK_DF(SK_VERB_USER_PIPE, - "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx), + "nx %p (\"%s\":\"%s\") na \"%s\" (%p)", SK_KVA(nx), NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name, SK_KVA(na)); @@ -370,10 +370,10 @@ nx_upipe_prov_nx_ctor(struct kern_nexus *nx) SK_LOCK_ASSERT_HELD(); ASSERT(nx->nx_arg 
== NULL); - SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name); + SK_D("nexus %p (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name); nx->nx_arg = nx_upipe_alloc(Z_WAITOK); - SK_D("create new upipe 0x%llx for nexus 0x%llx", + SK_D("create new upipe %p for nexus %p", SK_KVA(NX_UPIPE_PRIVATE(nx)), SK_KVA(nx)); return 0; @@ -386,7 +386,7 @@ nx_upipe_prov_nx_dtor(struct kern_nexus *nx) SK_LOCK_ASSERT_HELD(); - SK_D("nexus 0x%llx (%s) upipe 0x%llx", SK_KVA(nx), + SK_D("nexus %p (%s) upipe %p", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(u)); if (u->nup_cli_nxb != NULL) { @@ -398,7 +398,7 @@ nx_upipe_prov_nx_dtor(struct kern_nexus *nx) u->nup_srv_nxb = NULL; } - SK_DF(SK_VERB_USER_PIPE, "marking upipe 0x%llx as free", SK_KVA(u)); + SK_DF(SK_VERB_USER_PIPE, "marking upipe %p as free", SK_KVA(u)); nx_upipe_free(u); nx->nx_arg = NULL; } @@ -408,7 +408,7 @@ na_upipe_alloc(zalloc_flags_t how) { struct nexus_upipe_adapter *pna; - _CASSERT(offsetof(struct nexus_upipe_adapter, pna_up) == 0); + static_assert(offsetof(struct nexus_upipe_adapter, pna_up) == 0); pna = zalloc_flags(na_upipe_zone, how | Z_ZERO); if (pna) { @@ -424,7 +424,7 @@ na_upipe_free(struct nexus_adapter *na) struct nexus_upipe_adapter *pna = (struct nexus_upipe_adapter *)na; ASSERT(pna->pna_up.na_refcount == 0); - SK_DF(SK_VERB_MEM, "pna 0x%llx FREE", SK_KVA(pna)); + SK_DF(SK_VERB_MEM, "pna %p FREE", SK_KVA(pna)); bzero(pna, sizeof(*pna)); zfree(na_upipe_zone, pna); } @@ -513,7 +513,7 @@ nx_upipe_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port) static int nx_upipe_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr, - struct kern_channel *ch0, struct nxbind *nxb, struct proc *p) + struct nxbind *nxb, struct proc *p) { #pragma unused(nxdom_prov) nexus_port_t port = chr->cr_port; @@ -549,22 +549,21 @@ nx_upipe_dom_connect(struct kern_nexus_domain_provider *nxdom_prov, } if (port == NEXUS_PORT_USER_PIPE_SERVER) { - chr->cr_real_endpoint = CH_ENDPOINT_USER_PIPE_MASTER; + chr->cr_endpoint = CH_ENDPOINT_USER_PIPE_MASTER; } else if (port == NEXUS_PORT_USER_PIPE_CLIENT) { - chr->cr_real_endpoint = CH_ENDPOINT_USER_PIPE_SLAVE; + chr->cr_endpoint = CH_ENDPOINT_USER_PIPE_SLAVE; } else { err = EINVAL; goto done; } - chr->cr_endpoint = chr->cr_real_endpoint; chr->cr_ring_set = RING_SET_DEFAULT; chr->cr_pipe_id = 0; (void) snprintf(chr->cr_name, sizeof(chr->cr_name), "upipe:%llu:%.*s", nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen, nx->nx_prov->nxprov_params->nxp_name); - err = na_connect(nx, ch, chr, ch0, nxb, p); + err = na_connect(nx, ch, chr, nxb, p); done: return err; } @@ -576,7 +575,7 @@ nx_upipe_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov, #pragma unused(nxdom_prov) SK_LOCK_ASSERT_HELD(); - SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch), + SK_D("channel %p -!- nexus %p (%s:\"%s\":%u:%d)", SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); @@ -684,7 +683,7 @@ nx_upipe_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov, na_defunct(nx, ch, pna->pna_parent, locked); } - SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)", + SK_D("%s(%d): ch %p -/- nx %p (%s:\"%s\":%u:%d)", ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name, ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id); @@ -801,12 +800,12 @@ 
nx_upipe_na_txsync(struct __kern_channel_ring *txkring, struct proc *p, int sent = 0, ret = 0; SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_TX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u " - "flags 0x%x -> kr \"%s\" (0x%llx) krflags 0x%b ring %u", - sk_proc_name_address(p), sk_proc_pid(p), txkring->ckr_name, - SK_KVA(txkring), txkring->ckr_flags, CKRF_BITS, - txkring->ckr_ring_id, flags, rxkring->ckr_name, SK_KVA(rxkring), - rxkring->ckr_flags, CKRF_BITS, rxkring->ckr_ring_id); + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u " + "flags 0x%x -> kr \"%s\" (%p) krflags 0x%x ring %u", + sk_proc_name(p), sk_proc_pid(p), txkring->ckr_name, + SK_KVA(txkring), txkring->ckr_flags, txkring->ckr_ring_id, flags, + rxkring->ckr_name, SK_KVA(rxkring), rxkring->ckr_flags, + rxkring->ckr_ring_id); /* * Serialize write access to the transmit ring, since another @@ -861,13 +860,13 @@ nx_upipe_na_txsync_locked(struct __kern_channel_ring *txkring, struct proc *p, SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %3u kt %3u | " - "rh %3u rt %3u [pre%s]", sk_proc_name_address(p), + "rh %3u rt %3u [pre%s]", sk_proc_name(p), sk_proc_pid(p), txkring->ckr_name, txkring->ckr_khead, txkring->ckr_ktail, txkring->ckr_rhead, txkring->ckr_rtail, rx ? "*" : ""); SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %3u kt %3u | " - "rh %3u rt %3u [pre%s]", sk_proc_name_address(p), + "rh %3u rt %3u [pre%s]", sk_proc_name(p), sk_proc_pid(p), rxkring->ckr_name, rxkring->ckr_khead, rxkring->ckr_ktail, rxkring->ckr_rhead, rxkring->ckr_rtail, rx ? "*" : ""); @@ -901,14 +900,14 @@ nx_upipe_na_txsync_locked(struct __kern_channel_ring *txkring, struct proc *p, SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\" -> new %u, kr \"%s\" " - "-> free %u", sk_proc_name_address(p), sk_proc_pid(p), + "-> free %u", sk_proc_name(p), sk_proc_pid(p), txkring->ckr_name, n, rxkring->ckr_name, m); /* rxring is full, or nothing to send? */ if (__improbable((sent = limit) == 0)) { SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\" -> %s%s", - sk_proc_name_address(p), sk_proc_pid(p), (n > m) ? + sk_proc_name(p), sk_proc_pid(p), (n > m) ? rxkring->ckr_name : txkring->ckr_name, ((n > m) ? "no room avail" : "no new slots"), (rx ? " (lost race, ok)" : "")); @@ -967,13 +966,13 @@ nx_upipe_na_txsync_locked(struct __kern_channel_ring *txkring, struct proc *p, done: SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %3u kt %3u | " - "rh %3u rt %3u [post%s]", sk_proc_name_address(p), + "rh %3u rt %3u [post%s]", sk_proc_name(p), sk_proc_pid(p), txkring->ckr_name, txkring->ckr_khead, txkring->ckr_ktail, txkring->ckr_rhead, txkring->ckr_rtail, rx ? "*" : ""); SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %3u kt %3u | " - "rh %3u rt %3u [post%s]", sk_proc_name_address(p), + "rh %3u rt %3u [post%s]", sk_proc_name(p), sk_proc_pid(p), rxkring->ckr_name, rxkring->ckr_khead, rxkring->ckr_ktail, rxkring->ckr_rhead, rxkring->ckr_rtail, rx ? 
"*" : ""); @@ -994,12 +993,12 @@ nx_upipe_na_rxsync(struct __kern_channel_ring *rxkring, struct proc *p, uint32_t r; SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_RX, - "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u " - "flags 0x%x <- kr \"%s\" (0x%llx) krflags 0x%b ring %u", - sk_proc_name_address(p), sk_proc_pid(p), rxkring->ckr_name, - SK_KVA(rxkring), rxkring->ckr_flags, CKRF_BITS, - rxkring->ckr_ring_id, flags, txkring->ckr_name, SK_KVA(txkring), - txkring->ckr_flags, CKRF_BITS, txkring->ckr_ring_id); + "%s(%d) kr \"%s\" (%p) krflags 0x%x ring %u " + "flags 0x%x <- kr \"%s\" (%p) krflags 0x%x ring %u", + sk_proc_name(p), sk_proc_pid(p), rxkring->ckr_name, + SK_KVA(rxkring), rxkring->ckr_flags, rxkring->ckr_ring_id, flags, + txkring->ckr_name, SK_KVA(txkring), txkring->ckr_flags, + txkring->ckr_ring_id); ASSERT(rxkring->ckr_owner == current_thread()); @@ -1031,7 +1030,7 @@ nx_upipe_na_rxsync(struct __kern_channel_ring *rxkring, struct proc *p, SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\" <- free %u, kr \"%s\" <- new %u", - sk_proc_name_address(p), sk_proc_pid(p), + sk_proc_name(p), sk_proc_pid(p), rxkring->ckr_name, m, txkring->ckr_name, n); /* @@ -1062,7 +1061,7 @@ nx_upipe_na_rxsync(struct __kern_channel_ring *rxkring, struct proc *p, * If we fail to get the kring lock, then don't worry because * there's already a transmit sync in progress to move packets. */ - if (__probable(n != 0 && m != 0 && (flags & NA_SYNCF_MONITOR) == 0)) { + if (__probable(n != 0 && m != 0)) { (void) kr_enter(txkring, TRUE); n = nx_upipe_na_txsync_locked(txkring, p, flags, &ret, TRUE); kr_exit(txkring); @@ -1079,7 +1078,7 @@ nx_upipe_na_rxsync(struct __kern_channel_ring *rxkring, struct proc *p, SK_DF(SK_VERB_USER_PIPE | SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %3u kt %3u | " "rh %3u rt %3u [rel %u new %u]", - sk_proc_name_address(p), sk_proc_pid(p), rxkring->ckr_name, + sk_proc_name(p), sk_proc_pid(p), rxkring->ckr_name, rxkring->ckr_khead, rxkring->ckr_ktail, rxkring->ckr_rhead, rxkring->ckr_rtail, r, n); @@ -1210,13 +1209,13 @@ nx_upipe_na_krings_create(struct nexus_adapter *na, struct kern_channel *ch) if (pna->pna_peer_ref) { /* case 1) above */ SK_DF(SK_VERB_USER_PIPE, - "0x%llx: case 1, create everything", SK_KVA(na)); + "%p: case 1, create everything", SK_KVA(na)); error = nx_upipe_na_rings_create(na, ch); } else { /* case 2) above */ /* recover the hidden rings */ SK_DF(SK_VERB_USER_PIPE, - "0x%llx: case 2, hidden rings", SK_KVA(na)); + "%p: case 2, hidden rings", SK_KVA(na)); for_rx_tx(t) { for (i = 0; i < na_get_nrings(na, t); i++) { NAKR(na, t)[i].ckr_ring = @@ -1274,7 +1273,7 @@ nx_upipe_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) SK_LOCK_ASSERT_HELD(); - SK_DF(SK_VERB_USER_PIPE, "na \"%s\" (0x%llx) %s", na->na_name, + SK_DF(SK_VERB_USER_PIPE, "na \"%s\" (%p) %s", na->na_name, SK_KVA(na), na_activate_mode2str(mode)); switch (mode) { @@ -1297,14 +1296,14 @@ nx_upipe_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) if (pna->pna_peer_ref) { SK_DF(SK_VERB_USER_PIPE, - "0x%llx: case 1.a or 2.a, nothing to do", SK_KVA(na)); + "%p: case 1.a or 2.a, nothing to do", SK_KVA(na)); return 0; } switch (mode) { case NA_ACTIVATE_MODE_ON: SK_DF(SK_VERB_USER_PIPE, - "0x%llx: case 1.b, drop peer", SK_KVA(na)); + "%p: case 1.b, drop peer", SK_KVA(na)); if (pna->pna_peer->pna_peer_ref) { pna->pna_peer->pna_peer_ref = FALSE; (void) na_release_locked(na); @@ -1313,7 +1312,7 @@ nx_upipe_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) 
case NA_ACTIVATE_MODE_OFF: SK_DF(SK_VERB_USER_PIPE, - "0x%llx: case 2.b, grab peer", SK_KVA(na)); + "%p: case 2.b, grab peer", SK_KVA(na)); if (!pna->pna_peer->pna_peer_ref) { na_retain_locked(na); pna->pna_peer->pna_peer_ref = TRUE; @@ -1362,7 +1361,7 @@ nx_upipe_na_krings_delete(struct nexus_adapter *na, struct kern_channel *ch, if (!pna->pna_peer_ref) { SK_DF(SK_VERB_USER_PIPE, - "0x%llx: case 2, kept alive by peer", SK_KVA(na)); + "%p: case 2, kept alive by peer", SK_KVA(na)); /* * If adapter is defunct (note the explicit test against * NAF_DEFUNCT, and not the "defunct" parameter passed in @@ -1380,7 +1379,7 @@ nx_upipe_na_krings_delete(struct nexus_adapter *na, struct kern_channel *ch, /* case 1) above */ SK_DF(SK_VERB_USER_PIPE, - "0x%llx: case 1, deleting everyhing", SK_KVA(na)); + "%p: case 1, deleting everyhing", SK_KVA(na)); ASSERT(na->na_channels == 0 || (na->na_flags & NAF_DEFUNCT)); @@ -1420,10 +1419,10 @@ nx_upipe_na_dtor(struct nexus_adapter *na) SK_LOCK_ASSERT_HELD(); - SK_DF(SK_VERB_USER_PIPE, "0x%llx", SK_KVA(na)); + SK_DF(SK_VERB_USER_PIPE, "%p", SK_KVA(na)); if (pna->pna_peer_ref) { SK_DF(SK_VERB_USER_PIPE, - "0x%llx: clean up peer 0x%llx", SK_KVA(na), + "%p: clean up peer %p", SK_KVA(na), SK_KVA(&pna->pna_peer->pna_up)); pna->pna_peer_ref = FALSE; (void) na_release_locked(&pna->pna_peer->pna_up); @@ -1438,7 +1437,7 @@ nx_upipe_na_dtor(struct nexus_adapter *na) ASSERT(u->nup_pna_users != 0); if (--u->nup_pna_users == 0) { ASSERT(u->nup_pna != NULL); - SK_DF(SK_VERB_USER_PIPE, "release parent: \"%s\" (0x%llx)", + SK_DF(SK_VERB_USER_PIPE, "release parent: \"%s\" (%p)", u->nup_pna->na_name, SK_KVA(u->nup_pna)); na_release_locked(u->nup_pna); u->nup_pna = NULL; @@ -1465,12 +1464,11 @@ nx_upipe_na_find(struct kern_nexus *nx, struct kern_channel *ch, #if SK_LOG uuid_string_t uuidstr; - SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u " - "ring_id %d ring_set %u ep_type %u:%u create %u%s", + SK_PDF(SK_VERB_USER_PIPE, p, "name \"%s\" spec_uuid \"%s\" port %d " + "mode 0x%x pipe_id %u ring_id %d ring_set %u ep_type %u create %u%s", chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), - (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, - chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set, - chr->cr_real_endpoint, chr->cr_endpoint, create, + (int)chr->cr_port, chr->cr_mode, chr->cr_pipe_id, + (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_endpoint, create, (ep != CH_ENDPOINT_USER_PIPE_MASTER && ep != CH_ENDPOINT_USER_PIPE_SLAVE) ? 
" (skipped)" : ""); #endif /* SK_LOG */ @@ -1503,7 +1501,7 @@ nx_upipe_na_find(struct kern_nexus *nx, struct kern_channel *ch, */ if ((pna = u->nup_pna) != NULL) { na_retain_locked(pna); /* for us */ - SK_DF(SK_VERB_USER_PIPE, "found parent: \"%s\" (0x%llx)", + SK_DF(SK_VERB_USER_PIPE, "found parent: \"%s\" (%p)", pna->na_name, SK_KVA(pna)); } else { /* callee will hold a reference for us upon success */ @@ -1515,7 +1513,7 @@ nx_upipe_na_find(struct kern_nexus *nx, struct kern_channel *ch, /* hold an extra reference for nx_upipe */ u->nup_pna = pna; na_retain_locked(pna); - SK_DF(SK_VERB_USER_PIPE, "created parent: \"%s\" (0x%llx)", + SK_DF(SK_VERB_USER_PIPE, "created parent: \"%s\" (%p)", pna->na_name, SK_KVA(pna)); } @@ -1654,16 +1652,15 @@ nx_upipe_na_find(struct kern_nexus *nx, struct kern_channel *ch, u->nup_pna_users += 2; #if SK_LOG - SK_DF(SK_VERB_USER_PIPE, "created master 0x%llx and slave 0x%llx", + SK_DF(SK_VERB_USER_PIPE, "created master %p and slave %p", SK_KVA(mna), SK_KVA(sna)); SK_DF(SK_VERB_USER_PIPE, "mna: \"%s\"", mna->pna_up.na_name); SK_DF(SK_VERB_USER_PIPE, " UUID: %s", sk_uuid_unparse(mna->pna_up.na_uuid, uuidstr)); - SK_DF(SK_VERB_USER_PIPE, " nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_USER_PIPE, " nx: %p (\"%s\":\"%s\")", SK_KVA(mna->pna_up.na_nx), NX_DOM(mna->pna_up.na_nx)->nxdom_name, NX_DOM_PROV(mna->pna_up.na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_USER_PIPE, " flags: 0x%b", - mna->pna_up.na_flags, NAF_BITS); + SK_DF(SK_VERB_USER_PIPE, " flags: 0x%x", mna->pna_up.na_flags); SK_DF(SK_VERB_USER_PIPE, " flowadv_max: %u", mna->pna_up.na_flowadv_max); SK_DF(SK_VERB_USER_PIPE, " rings: tx %u rx %u", @@ -1683,11 +1680,10 @@ nx_upipe_na_find(struct kern_nexus *nx, struct kern_channel *ch, SK_DF(SK_VERB_USER_PIPE, "sna: \"%s\"", sna->pna_up.na_name); SK_DF(SK_VERB_USER_PIPE, " UUID: %s", sk_uuid_unparse(sna->pna_up.na_uuid, uuidstr)); - SK_DF(SK_VERB_USER_PIPE, " nx: 0x%llx (\"%s\":\"%s\")", + SK_DF(SK_VERB_USER_PIPE, " nx: %p (\"%s\":\"%s\")", SK_KVA(sna->pna_up.na_nx), NX_DOM(sna->pna_up.na_nx)->nxdom_name, NX_DOM_PROV(sna->pna_up.na_nx)->nxdom_prov_name); - SK_DF(SK_VERB_USER_PIPE, " flags: 0x%b", - sna->pna_up.na_flags, NAF_BITS); + SK_DF(SK_VERB_USER_PIPE, " flags: 0x%x", sna->pna_up.na_flags); SK_DF(SK_VERB_USER_PIPE, " flowadv_max: %u", sna->pna_up.na_flowadv_max); SK_DF(SK_VERB_USER_PIPE, " rings: tx %u rx %u", @@ -1708,7 +1704,7 @@ nx_upipe_na_find(struct kern_nexus *nx, struct kern_channel *ch, found: - SK_DF(SK_VERB_USER_PIPE, "pipe_id %u role %s at 0x%llx", pipe_id, + SK_DF(SK_VERB_USER_PIPE, "pipe_id %u role %s at %p", pipe_id, (req->pna_role == CH_ENDPOINT_USER_PIPE_MASTER ? 
"master" : "slave"), SK_KVA(req)); if ((chr->cr_mode & CHMODE_DEFUNCT_OK) == 0) { @@ -1743,7 +1739,7 @@ nx_upipe_alloc(zalloc_flags_t how) u = zalloc_flags(nx_upipe_zone, how | Z_ZERO); if (u) { - SK_DF(SK_VERB_MEM, "upipe 0x%llx ALLOC", SK_KVA(u)); + SK_DF(SK_VERB_MEM, "upipe %p ALLOC", SK_KVA(u)); } return u; } @@ -1756,6 +1752,6 @@ nx_upipe_free(struct nx_upipe *u) ASSERT(u->nup_cli_nxb == NULL); ASSERT(u->nup_srv_nxb == NULL); - SK_DF(SK_VERB_MEM, "upipe 0x%llx FREE", SK_KVA(u)); + SK_DF(SK_VERB_MEM, "upipe %p FREE", SK_KVA(u)); zfree(nx_upipe_zone, u); } diff --git a/bsd/skywalk/os_skywalk_private.h b/bsd/skywalk/os_skywalk_private.h index 388f07eac..0ed117fb4 100644 --- a/bsd/skywalk/os_skywalk_private.h +++ b/bsd/skywalk/os_skywalk_private.h @@ -38,7 +38,7 @@ /* branch prediction helpers */ #include #define SK_ALIGN64_CASSERT(type, field) \ - _CASSERT((__builtin_offsetof(type, field) % sizeof (uint64_t)) == 0) + _Static_assert((__builtin_offsetof(type, field) % sizeof(uint64_t)) == 0, "incorrect alignment") #if !defined(KERNEL) || defined(BSD_KERNEL_PRIVATE) enum { @@ -46,7 +46,7 @@ enum { SK_FEATURE_DEVELOPMENT = 1ULL << 1, SK_FEATURE_DEBUG = 1ULL << 2, SK_FEATURE_NEXUS_FLOWSWITCH = 1ULL << 3, - SK_FEATURE_NEXUS_MONITOR = 1ULL << 4, + SK_FEATURE_NEXUS_UNUSED_4 = 1ULL << 4, SK_FEATURE_NEXUS_NETIF = 1ULL << 5, SK_FEATURE_NEXUS_USER_PIPE = 1ULL << 6, SK_FEATURE_NEXUS_KERNEL_PIPE = 1ULL << 7, @@ -144,7 +144,7 @@ enum { X(SK_VERB_SYNC, 9) /* 0x0000000000000200 */ \ X(SK_VERB_NOTIFY, 10) /* 0x0000000000000400 */ \ X(SK_VERB_INTR, 11) /* 0x0000000000000800 */ \ - X(SK_VERB_MONITOR, 12) /* 0x0000000000001000 */ \ + X(__SK_VERB_12, 12) /* 0x0000000000001000 */ \ X(SK_VERB_DEV, 13) /* 0x0000000000002000 */ \ X(SK_VERB_HOST, 14) /* 0x0000000000004000 */ \ X(SK_VERB_USER, 15) /* 0x0000000000008000 */ \ @@ -222,59 +222,51 @@ enum SK_VERB_FLAGS { #include #include #include +#include -#if (DEVELOPMENT || DEBUG) -#define SK_KVA(p) ((uint64_t)(p)) -#define SK_LOG 1 -#else -#define SK_KVA(p) ((uint64_t)VM_KERNEL_ADDRPERM(p)) -#define SK_LOG 0 -#endif /* !DEVELOPMENT && !DEBUG */ +#define SK_LOG 1 #if SK_LOG -#define SK_LOG_VAR(x) x -#else -#define SK_LOG_VAR(x) -#endif - -#define SK_INLINE_ATTRIBUTE __attribute__((always_inline)) -#define SK_NO_INLINE_ATTRIBUTE __attribute__((noinline)) -#define SK_LOG_ATTRIBUTE __attribute__((noinline, cold, not_tail_called)) - -#if SK_LOG -/* - * Because the compiler doesn't know about the %b format specifier, - * most warnings for _SK_D are disabled by pragma. - * - * XXX adi@apple.com: This means the compiler will not warn us about - * invalid parameters passed to kprintf(), so make sure to scrutinize - * any changes made to code using any logging macros defined below. - */ - extern uint64_t sk_verbose; -#define _SK_D(_flag, _fmt, ...) do { \ - if (__improbable(((_flag) && (sk_verbose & (_flag)) == (_flag)) || \ - (_flag) == SK_VERB_ERROR)) { \ - _Pragma("clang diagnostic push") \ - _Pragma("clang diagnostic ignored \"-Wformat-invalid-specifier\"") \ - _Pragma("clang diagnostic ignored \"-Wformat-extra-args\"") \ - _Pragma("clang diagnostic ignored \"-Wformat\"") \ - kprintf("SK[%u]: %-30s " _fmt "\n", \ - cpu_number(), __FUNCTION__, ##__VA_ARGS__); \ - _Pragma("clang diagnostic pop") \ - } \ +extern os_log_t sk_log_handle; + +#define SK_KVA(p) (void *)VM_KERNEL_ADDRHIDE(p) +#define SK_LOG_VAR(x) x + +#define _SK_LOG(_type, _flag, _fmt, ...) 
do { \ + if (__improbable(((_flag) == SK_VERB_DEFAULT) || \ + ((_flag) == SK_VERB_ERROR) || \ + ((sk_verbose & (_flag)) == (_flag)))) { \ + os_log_with_type(sk_log_handle, \ + (_flag) == SK_VERB_ERROR ? OS_LOG_TYPE_ERROR : _type, \ + "SK[%u]: %-30s " _fmt "\n", cpu_number(), __FUNCTION__, \ + ##__VA_ARGS__); \ + } \ } while (0) -#define SK_DF(_flag, _fmt, ...) _SK_D((uint64_t)_flag, _fmt, ##__VA_ARGS__) -#define SK_D(_fmt, ...) SK_DF(SK_VERB_DEFAULT, _fmt, ##__VA_ARGS__) -#define SK_ERR(_fmt, ...) SK_DF(SK_VERB_ERROR, _fmt, ##__VA_ARGS__) -#define SK_DSC(_p, _fmt, ...) SK_ERR("%s(%d): " _fmt, \ - sk_proc_name_address(_p), sk_proc_pid(_p), ##__VA_ARGS__) +/* error log (captured by default) */ +#define SK_ERR(_fmt, ...) _SK_LOG(OS_LOG_TYPE_ERROR, SK_VERB_ERROR, _fmt, ##__VA_ARGS__) + +/* error log with proc info (captured by default) */ +#define SK_PERR(_p, _fmt, ...) do { \ + SK_ERR("%s(%d): " _fmt, sk_proc_name(_p), sk_proc_pid(_p), ##__VA_ARGS__); \ +} while (0) + +/* default log (captured by default) */ +#define SK_D(_fmt, ...) _SK_LOG(OS_LOG_TYPE_DEFAULT, SK_VERB_DEFAULT, _fmt, ##__VA_ARGS__) + +/* debug log (enabled sk_verbose flag) */ +#define SK_DF(_flag, _fmt, ...) _SK_LOG(OS_LOG_TYPE_DEFAULT, (uint64_t)_flag, _fmt, ##__VA_ARGS__) + +/* SK_DF with proc info */ +#define SK_PDF(_flag, _p, _fmt, ...) do { \ + SK_DF(_flag, "%s(%d): " _fmt, sk_proc_name(_p), sk_proc_pid(_p), ##__VA_ARGS__); \ +} while (0) /* rate limited, lps indicates how many per second */ #define _SK_RD(_flag, _lps, _fmt, ...) do { \ static int __t0, __now, __cnt; \ - __now = (int)_net_uptime; \ + __now = (int)net_uptime(); \ if (__t0 != __now) { \ __t0 = __now; \ __cnt = 0; \ @@ -289,7 +281,22 @@ extern uint64_t sk_verbose; SK_RDF(SK_VERB_DEFAULT, _lps, _fmt, ##__VA_ARGS__) #define SK_RDERR(_lps, _fmt, ...) \ SK_RDF(SK_VERB_ERROR, _lps, _fmt, ##__VA_ARGS__) + +/* + * The compiler doesn't know that snprintf() supports %b format + * specifier, so use our own wrapper to vsnprintf() here instead. + */ +#define sk_snprintf(str, size, format, ...) ({ \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wformat-invalid-specifier\"") \ + _Pragma("clang diagnostic ignored \"-Wformat-extra-args\"") \ + _Pragma("clang diagnostic ignored \"-Wformat\"") \ + snprintf(str, size, format, ## __VA_ARGS__) \ + _Pragma("clang diagnostic pop"); \ +}) + #else /* !SK_LOG */ +#define SK_LOG_VAR(x) #define SK_DF(_flag, _fmt, ...) do { ((void)0); } while (0) #define SK_D(_fmt, ...) do { ((void)0); } while (0) #define SK_ERR(_fmt, ...) do { ((void)0); } while (0) @@ -299,6 +306,11 @@ extern uint64_t sk_verbose; #define SK_RDERR(_lps, _fmt, ...) do { ((void)0); } while (0) #endif /* ! 
SK_LOG */ +#define SK_INLINE_ATTRIBUTE __attribute__((always_inline)) +#define SK_NO_INLINE_ATTRIBUTE __attribute__((noinline)) +#define SK_LOG_ATTRIBUTE __attribute__((noinline, cold, not_tail_called)) + + #ifdef BSD_KERNEL_PRIVATE #include #include diff --git a/bsd/skywalk/os_stats_private.h b/bsd/skywalk/os_stats_private.h index 90d5000d2..a17361c1e 100644 --- a/bsd/skywalk/os_stats_private.h +++ b/bsd/skywalk/os_stats_private.h @@ -475,7 +475,8 @@ X(TCP_STATS_JOIN_RXMTS, "JoinAckReXmt", "\t%llu join ack retransmits\n") \ X(TCP_STATS_TAILLOSS_RTO, "TailLossTRO", "\t%llu RTO due to tail loss\n") \ X(TCP_STATS_RECOVERED_PKTS, "RecoveryPkt", "\t%llu recovered after loss\n") \ - X(TCP_STATS_NOSTRETCHACK, "NoStrechAck", "\t%llu disabled stretch ack algorithm on a connection\n") \ + /* ToDo - to be removed */ \ + X(TCP_STATS_NOSTRETCHACK, "NoStrechAck", "\t%llu disabled stretch ack algorithm on a connection\n") \ X(TCP_STATS_RESCUE_RXMT, "SACKRsqReXmt", "\t%llu SACK rescue retransmit\n") \ \ /* MPTCP Subflow selection stats */ \ @@ -805,6 +806,7 @@ X(FSW_STATS_RX_PKT_NOT_LISTENER, "RxPktNotListener", "\t\t%llu packet not for listener\n") \ X(FSW_STATS_RX_FLOW_IN_USE, "RxFlowInUse", "\t\t%llu flow in use\n") \ X(FSW_STATS_RX_STALL, "RxRingStall", "\t\t%llu Rx rings stalled\n") \ + X(FSW_STATS_RX_DISABLED, "RxDisabled", "\t\t%llu dropped, flow Rx disabled\n") \ /* Rx frag stats (fsw doesn't manage fragments on Tx) */ \ X(FSW_STATS_RX_FRAG_V4, "RxFragV4", "\t\t%llu total received ipv4 fragments\n") \ X(FSW_STATS_RX_FRAG_V6, "RxFragV6", "\t\t%llu total received ipv6 fragments\n") \ @@ -868,6 +870,7 @@ X(FSW_STATS_TX_COPY_PKT2MBUF, "TxCopyPktToMbuf", "\t\t%llu copied pkt -> mbuf\n") \ X(FSW_STATS_TX_COPY_SUM, "TxCopySum", "\t\t%llu copy+checksumed\n") \ X(FSW_STATS_TX_COPY_BAD_LEN, "TxCopyBadLen", "\t\t%llu dropped, bad packet length\n") \ + X(FSW_STATS_TX_DISABLED, "TxDisabled", "\t\t%llu dropped, flow tx disabled\n") \ \ /* Drop stats (generic bidirectional) */ \ X(FSW_STATS_DROP, "Drop", "\t%llu total dropped\n") \ @@ -905,7 +908,14 @@ /* FPD stats */ \ FSW_FPD_STATS(X) \ \ - X(__FSW_STATS_MAX, "", "end of flowswitch stats") + /* Rx Flow Steering stats */ \ + X(FSW_STATS_RX_FS_ADD_SUCCESS, "RxFSAddSuccess", "\t%llu rx flow steering add success\n") \ + X(FSW_STATS_RX_FS_REMOVE_SUCCESS, "RxFSRemoveSuccess", "\t%llu rx flow steering remove success\n") \ + X(FSW_STATS_RX_FS_ADD_FAILURE, "RxFSAddFailure", "\t%llu rx flow steering add failure\n") \ + X(FSW_STATS_RX_FS_REMOVE_FAILURE, "RxFSRemoveFailure", "\t%llu rx flow steering remove failure\n") \ + X(FSW_STATS_RX_FS_REMOVE_SKIPPED, "RxFSRemoveSkipped", "\t%llu rx flow steering remove skipped\n") \ + \ + X(__FSW_STATS_MAX, "", "end of flowswitch stats") /* END CSTYLED */ @@ -1021,26 +1031,28 @@ typedef struct { uint64_t crsu_total_slots_transferred; uint64_t crsu_total_bytes_transferred; uint64_t crsu_number_of_syncs; + uint64_t crsu_bytes_per_sync; + uint64_t crsu_bytes_per_sync_ma; uint32_t crsu_min_slots_transferred; uint32_t crsu_max_slots_transferred; uint32_t crsu_slots_per_sync; uint32_t crsu_slots_per_sync_ma; - uint64_t crsu_bytes_per_sync; - uint64_t crsu_bytes_per_sync_ma; - uint32_t __crsu_reserved[2]; } channel_ring_user_stats, *channel_ring_user_stats_t; typedef struct { uint64_t crs_total_slots_transferred; uint64_t crs_total_bytes_transferred; uint64_t crs_number_of_transfers; - uint32_t crs_min_slots_transferred; - uint32_t crs_max_slots_transferred; - uint32_t crs_slots_per_second; - uint32_t crs_slots_per_second_ma; + union { + 
uint64_t crs_last_update_net_uptime;/* used by kernel as timestamp */ + uint64_t crs_seconds_since_last_update;/* published to userspace as time lapsed */ + }; + uint64_t crs_slots_per_second; + uint64_t crs_slots_per_second_ma; uint64_t crs_bytes_per_second; uint64_t crs_bytes_per_second_ma; - uint32_t __crs_reserved[2]; + uint32_t crs_min_slots_transferred; + uint32_t crs_max_slots_transferred; } channel_ring_stats, *channel_ring_stats_t; struct netif_qstats { @@ -1120,9 +1132,6 @@ typedef struct { nexus_channel_ring_entry nce_ring_entries[__counted_by(nce_ring_count)]; /* tx followed by rx */ } nexus_channel_entry, *nexus_channel_entry_t; -#define SCHF_MONITOR_TX 0x00000001 -#define SCHF_MONITOR_RX 0x00000002 -#define SCHF_MONITOR_NO_COPY 0x00000004 #define SCHF_USER_PACKET_POOL 0x00000008 #define SCHF_DEFUNCT_OK 0x00000010 #define SCHF_EXCLUSIVE 0x00000020 @@ -1366,6 +1375,7 @@ struct sk_stats_flow { #define SFLOWF_TRACK 0x00000010 /* flow is tracked */ #define SFLOWF_CONNECTED 0x00000020 /* connected mode */ #define SFLOWF_LISTENER 0x00000040 /* listener mode */ +#define SFLOWF_AOP_OFFLOAD 0x00000080 /* AOP offloaded flow */ #define SFLOWF_QOS_MARKING 0x00000100 /* flow can have qos marking */ #define SFLOWF_BOUND_IP 0x00000200 /* src addr explicity bound */ #define SFLOWF_ONLINK 0x00000400 /* dst directly on the link */ @@ -1373,6 +1383,7 @@ struct sk_stats_flow { #define SFLOWF_WAIT_CLOSE 0x00001000 /* defer free after close */ #define SFLOWF_CLOSE_NOTIFY 0x00002000 /* notify NECP upon tear down */ #define SFLOWF_NOWAKEFROMSLEEP 0x00004000 /* don't wake for this flow */ +#define SFLOWF_CONNECTION_IDLE 0x00008000 /* connection is idle */ #define SFLOWF_ABORTED 0x01000000 /* has sent RST to peer */ #define SFLOWF_NONVIABLE 0x02000000 /* disabled; to be torn down */ #define SFLOWF_WITHDRAWN 0x04000000 /* flow has been withdrawn */ diff --git a/bsd/skywalk/os_sysctls_private.h b/bsd/skywalk/os_sysctls_private.h index b2994b110..8b1a27fdd 100644 --- a/bsd/skywalk/os_sysctls_private.h +++ b/bsd/skywalk/os_sysctls_private.h @@ -62,8 +62,7 @@ X(int32_t, path_mtu_discovery, 1) \ X(int32_t, local_slowstart_flightsize, 8) \ X(uint32_t, ecn_setup_percentage, 50) \ - X(int32_t, ecn_initiate_out, 0) \ - X(int32_t, ecn_negotiate_in, 0) \ + X(int32_t, ecn, 1) \ X(int32_t, packetchain, 50) \ X(int32_t, socket_unlocked_on_output, 1) \ X(int32_t, min_iaj_win, 16) \ @@ -97,22 +96,17 @@ X(int32_t, broken_peer_syn_rexmit_thres, 10) \ X(int32_t, pmtud_blackhole_detection, 1) \ X(uint32_t, pmtud_blackhole_mss, 1200) \ - X(int32_t, sendspace, 1448*256) \ - X(int32_t, recvspace, 1448*384) \ + X(int32_t, sendspace, 1024*128) \ + X(int32_t, recvspace, 1024*128) \ X(uint32_t, microuptime_init, 0) \ X(uint32_t, now_init, 0) \ X(uint32_t, challengeack_limit, 10) \ - X(int32_t, do_rfc5961, 1) \ X(int32_t, init_rtt_from_cache, 1) \ X(uint32_t, autotunereorder, 1) \ X(uint32_t, do_ack_compression, 1) \ X(uint32_t, ack_compression_rate, 5) \ - X(int32_t, do_better_lr, 1) \ X(int32_t, cubic_minor_fixes, 1) \ X(int32_t, cubic_rfc_compliant, 1) \ - X(int32_t, aggressive_rcvwnd_inc, 1) \ - X(int32_t, ack_strategy, 1) \ - X(int32_t, flow_control_response, 1) \ X(int32_t, randomize_timestamps, 1) \ X(uint32_t, ledbat_plus_plus, 1) \ X(uint32_t, use_ledbat, 0) \ @@ -141,6 +135,7 @@ #define SKMEM_SYSCTL_TCP_HAS_RACK 1 #define SKMEM_SYSCTL_TCP_HAS_L4S 1 #define SKMEM_SYSCTL_TCP_HAS_LINK_HEURISTICS 1 +#define SKMEM_SYSCTL_TCP_HAS_REFACTORED_ECN 1 /* * When adding a new type above, be sure to add a corresponding * printf format 
below. Clients use NW_SYSCTL_PRI_##type diff --git a/bsd/skywalk/packet/os_packet.h b/bsd/skywalk/packet/os_packet.h index 1f5b77a19..69f02e747 100644 --- a/bsd/skywalk/packet/os_packet.h +++ b/bsd/skywalk/packet/os_packet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Apple Inc. All rights reserved. + * Copyright (c) 2016-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -430,13 +430,13 @@ extern int os_packet_get_expiry_action(const packet_t, packet_expiry_action_t *) extern int os_packet_set_token(const packet_t, const void *, const uint16_t); extern int os_packet_get_packetid(const packet_t, packet_id_t *); extern int os_packet_set_packetid(const packet_t, packet_id_t *); -extern int os_packet_set_vlan_tag(const packet_t, const uint16_t, - const boolean_t); -extern int os_packet_get_vlan_tag(const packet_t, uint16_t *, boolean_t *); +extern int os_packet_set_vlan_tag(const packet_t, const uint16_t); +extern int os_packet_get_vlan_tag(const packet_t, uint16_t *); extern uint16_t os_packet_get_vlan_id(const uint16_t); extern uint8_t os_packet_get_vlan_priority(const uint16_t); #define HAS_OS_PACKET_GET_WAKE_FLAG 1 extern boolean_t os_packet_get_wake_flag(const packet_t); +extern void os_packet_set_wake_flag(const packet_t); #define HAS_OS_PACKET_KEEP_ALIVE 1 extern boolean_t os_packet_get_keep_alive(const packet_t); extern void os_packet_set_keep_alive(const packet_t, const boolean_t); @@ -756,14 +756,15 @@ extern errno_t kern_packet_set_token(const kern_packet_t, extern errno_t kern_packet_get_token(const kern_packet_t, void *__sized_by(*len), uint16_t *len); extern errno_t kern_packet_get_packetid(const kern_packet_t, packet_id_t *); -extern errno_t kern_packet_set_vlan_tag(const kern_packet_t, const uint16_t, - const boolean_t); -extern errno_t kern_packet_get_vlan_tag(const kern_packet_t, uint16_t *, - boolean_t *); +extern errno_t kern_packet_set_vlan_tag(const kern_packet_t, const uint16_t); +extern errno_t kern_packet_get_vlan_tag(const kern_packet_t, uint16_t *); extern uint16_t kern_packet_get_vlan_id(const uint16_t); extern uint8_t kern_packet_get_vlan_priority(const uint16_t); extern void kern_packet_set_wake_flag(const kern_packet_t); extern boolean_t kern_packet_get_wake_flag(const kern_packet_t); +extern void kern_packet_set_ulpn_flag(const kern_packet_t); +extern boolean_t kern_packet_get_ulpn_flag(const kern_packet_t); +extern boolean_t kern_packet_get_lpw_flag(const kern_packet_t); extern errno_t kern_packet_set_fpd_sequence_number(const kern_packet_t, uint32_t); extern errno_t kern_packet_set_fpd_context_id(const kern_packet_t, uint16_t); diff --git a/bsd/skywalk/packet/os_packet_private.h b/bsd/skywalk/packet/os_packet_private.h index a6cefcc4a..6898c6ceb 100644 --- a/bsd/skywalk/packet/os_packet_private.h +++ b/bsd/skywalk/packet/os_packet_private.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Apple Inc. All rights reserved. + * Copyright (c) 2016-2024 Apple Inc. All rights reserved. 
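
[Editor's note] The reworked logging block in the os_skywalk_private.h hunk above keeps the _SK_RD rate limiter but switches its clock from the _net_uptime global to net_uptime(): a static per-call-site counter is reset whenever the one-second bucket changes, and at most _lps messages per second get through. Below is a minimal user-space sketch of that pattern, with time(NULL) standing in for net_uptime() and the unseen tail of the macro assumed to compare the counter against the budget; it is an illustration, not the kernel macro.

#include <stdio.h>
#include <time.h>

/*
 * Emit at most `lps` messages per wall-clock second; everything else in
 * that second is silently dropped.  The statics mirror the macro's
 * __t0/__cnt per-call-site state.
 */
static int
rate_limited_log(int lps, const char *msg)
{
    static int t0, cnt;
    int now = (int)time(NULL);

    if (t0 != now) {        /* new one-second bucket: reset the counter */
        t0 = now;
        cnt = 0;
    }
    if (cnt++ >= lps) {     /* budget for this second exhausted */
        return 0;
    }
    printf("%s\n", msg);
    return 1;
}

Calling rate_limited_log(5, "...") from a tight loop prints at most five lines per second, which is the behavior SK_RDF/SK_RDERR rely on to keep hot paths from flooding the log.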
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -314,9 +314,9 @@ struct __user_buflet { #ifdef KERNEL #define BUF_CTOR(_buf, _baddr, _bidx, _dlim, _dlen, _doff, _nbaddr, _nbidx, _bflag) do { \ - _CASSERT(sizeof ((_buf)->buf_addr) == sizeof (mach_vm_address_t)); \ - _CASSERT(sizeof ((_buf)->buf_idx) == sizeof (obj_idx_t)); \ - _CASSERT(sizeof ((_buf)->buf_dlim) == sizeof (uint32_t)); \ + static_assert(sizeof ((_buf)->buf_addr) == sizeof (mach_vm_address_t)); \ + static_assert(sizeof ((_buf)->buf_idx) == sizeof (obj_idx_t)); \ + static_assert(sizeof ((_buf)->buf_dlim) == sizeof (uint32_t)); \ BUF_BADDR(_buf, _baddr); \ BUF_NBFT_ADDR(_buf, _nbaddr); \ BUF_BIDX(_buf, _bidx); \ @@ -464,7 +464,7 @@ struct __user_quantum { (_kqum)->qum_flow_id_val64[1] = 0; \ (_kqum)->qum_qflags = (_flags); \ (_kqum)->qum_len = (_len); \ - _CASSERT(sizeof(METADATA_IDX(_kqum)) == sizeof(obj_idx_t)); \ + static_assert(sizeof(METADATA_IDX(_kqum)) == sizeof(obj_idx_t)); \ *(obj_idx_t *)(uintptr_t)&METADATA_IDX(_kqum) = (_qidx); \ BUF_CTOR(&(_kqum)->qum_buf[0], (_baddr), (_bidx), (_dlim), 0, 0, 0, \ OBJ_IDX_NONE, 0); \ @@ -793,15 +793,15 @@ struct __user_packet { #define __PKT_F_PKT_DATA 0x0000010000000000ULL /* (K) */ #define PKT_F_PROMISC 0x0000020000000000ULL /* (U+K) */ #define PKT_F_OPT_VLTAG 0x0000040000000000ULL /* (U+K) */ -#define PKT_F_OPT_VLTAG_IN_PKT 0x0000080000000000ULL /* (U+K) */ +/* 0x0000080000000000ULL (reserved) */ #define __PKT_F_TX_PORT_DATA 0x0000100000000000ULL /* (K) */ #define PKT_F_OPT_EXP_ACTION 0x0000200000000000ULL /* (U+K) */ #define PKT_F_OPT_APP_METADATA 0x0000400000000000ULL /* (U+K) */ #define PKT_F_L4S 0x0000800000000000ULL /* (U+K) */ #define PKT_F_OPT_TX_TIMESTAMP 0x0001000000000000ULL /* (U+K) */ -/* 0x0002000000000000ULL */ -/* 0x0004000000000000ULL */ -/* 0x0008000000000000ULL */ +#define PKT_F_PRIV_HAS_QSET_ID 0x0002000000000000ULL /* (K) */ +#define PKT_F_ULPN 0x0004000000000000ULL /* (U+K) */ +#define __PKT_F_LPW 0x0008000000000000ULL /* (K) */ /* 0x0010000000000000ULL */ /* 0x0020000000000000ULL */ /* 0x0040000000000000ULL */ @@ -821,7 +821,7 @@ struct __user_packet { #define PKT_F_OPT_DATA \ (PKT_F_OPT_GROUP_START | PKT_F_OPT_GROUP_END | \ PKT_F_OPT_EXPIRE_TS | PKT_F_OPT_TOKEN | \ - PKT_F_OPT_VLTAG | PKT_F_OPT_VLTAG_IN_PKT | PKT_F_OPT_EXP_ACTION | \ + PKT_F_OPT_VLTAG | PKT_F_OPT_EXP_ACTION | \ PKT_F_OPT_APP_METADATA | PKT_F_OPT_TX_TIMESTAMP) #ifdef KERNEL @@ -831,7 +831,7 @@ struct __user_packet { #define PKT_F_USER_MASK \ (PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT | \ PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC | \ - PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S) + PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S | PKT_F_ULPN) /* * Aliases for kernel-only flags. See notes above. 
The ones marked @@ -907,8 +907,7 @@ struct __user_packet { #define SK_PTR_ADDR(_p) ((uint64_t)(_p) & SK_PTR_ADDR_MASK) #define SK_PTR_ADDR_ENC(_p) ((uint64_t)(_p) & SK_PTR_ADDR_MASK) -#define SK_PTR_ENCODE(_p, _t, _s) \ - (SK_PTR_ADDR_ENC(_p) | SK_PTR_TYPE_ENC(_t) | SK_PTR_SUBTYPE_ENC(_s)) +#define SK_PTR_ENCODE(_p, _t, _s) ((uint64_t)(_p)) #define SK_PTR_ADDR_UQUM(_ph) (__unsafe_forge_single(struct __user_quantum *, SK_PTR_ADDR(_ph))) #define SK_PTR_ADDR_UPKT(_ph) (__unsafe_forge_single(struct __user_packet *, SK_PTR_ADDR(_ph))) @@ -921,12 +920,6 @@ __BEGIN_DECLS extern struct mbuf *kern_packet_get_mbuf(const kern_packet_t); __END_DECLS #else /* !KERNEL */ -#if defined(LIBSYSCALL_INTERFACE) -__BEGIN_DECLS -extern void pkt_subtype_assert_fail(const packet_t, uint64_t, uint64_t); -extern void pkt_type_assert_fail(const packet_t, uint64_t); -__END_DECLS -#endif /* LIBSYSCALL_INTERFACE */ #endif /* !KERNEL */ #if defined(LIBSYSCALL_INTERFACE) || defined(BSD_KERNEL_PRIVATE) #include diff --git a/bsd/skywalk/packet/packet_common.h b/bsd/skywalk/packet/packet_common.h index a6a74ea67..c5b78d50f 100644 --- a/bsd/skywalk/packet/packet_common.h +++ b/bsd/skywalk/packet/packet_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Apple Inc. All rights reserved. + * Copyright (c) 2016-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,27 +67,8 @@ /* * Common. */ -#if (DEBUG || DEVELOPMENT) -#define PKT_SUBTYPE_ASSERT(_ph, _type, _subtype) do { \ - if (__improbable(SK_PTR_TYPE(_ph) != (uint64_t)(_type) || \ - SK_PTR_SUBTYPE(_ph) != (uint64_t)(_subtype))) { \ - pkt_subtype_assert_fail(_ph, _type, _subtype); \ - /* NOTREACHED */ \ - __builtin_unreachable(); \ - } \ -} while (0) - -#define PKT_TYPE_ASSERT(_ph, _type) do { \ - if (__improbable(SK_PTR_TYPE(_ph) != (uint64_t)(_type))) { \ - pkt_type_assert_fail(_ph, _type); \ - /* NOTREACHED */ \ - __builtin_unreachable(); \ - } \ -} while (0) -#else /* !DEBUG && !DEVELOPMENT */ -#define PKT_SUBTYPE_ASSERT(_ph, _type, _subtype) ((void)0) #define PKT_TYPE_ASSERT(_ph, _type) ((void)0) -#endif /* !DEBUG && !DEVELOPMENT */ +#define PKT_SUBTYPE_ASSERT(_ph, _type, _subtype) ((void)0) #define QUM_GET_NEXT_BUFLET(_qum, _pbuf, _buf) do { \ ASSERT((_pbuf) == NULL || (_pbuf) == (_qum)->qum_buf); \ @@ -417,7 +398,7 @@ __packet_opt_get_token(const struct __packet_opt *po, ttype = (uint8_t)po->__po_token_type; ASSERT(tlen <= PKT_OPT_MAX_TOKEN_SIZE); - _CASSERT((__builtin_offsetof(struct __packet_opt, __po_token) % 8) == 0); + static_assert((__builtin_offsetof(struct __packet_opt, __po_token) % 8) == 0); bcopy(po->__po_token, token, tlen); /* * -fbounds-safety: Updating *len should be fine because at this point @@ -459,7 +440,7 @@ __packet_opt_set_token(struct __packet_opt *po, const void *__sized_by(PKT_OPT_MAX_TOKEN_SIZE)token, const uint16_t len, const uint8_t type, volatile uint64_t *pflags) { - _CASSERT((__builtin_offsetof(struct __packet_opt, __po_token) % 8) == 0); + static_assert((__builtin_offsetof(struct __packet_opt, __po_token) % 8) == 0); if (len != 0) { if (token == NULL || len > PKT_OPT_MAX_TOKEN_SIZE || type == 0) { @@ -476,9 +457,9 @@ __packet_opt_set_token(struct __packet_opt *po, po->__po_token_type = type; *pflags |= PKT_F_OPT_TOKEN; } else { - _CASSERT(sizeof(po->__po_token_data[0]) == 8); - _CASSERT(sizeof(po->__po_token_data[1]) == 8); - _CASSERT(sizeof(po->__po_token) == 16); + static_assert(sizeof(po->__po_token_data[0]) == 8); + static_assert(sizeof(po->__po_token_data[1]) == 8); + 
static_assert(sizeof(po->__po_token) == 16); po->__po_token_data[0] = 0; po->__po_token_data[1] = 0; po->__po_token_len = 0; @@ -582,8 +563,7 @@ __packet_set_packetid(const uint64_t ph, const packet_id_t *pktid) __attribute__((always_inline)) static inline errno_t -__packet_get_vlan_tag(const uint64_t ph, uint16_t *vlan_tag, - boolean_t *tag_in_pkt) +__packet_get_vlan_tag(const uint64_t ph, uint16_t *vlan_tag) { #ifdef KERNEL struct __packet_opt *po = PKT_ADDR(ph)->pkt_com_opt; @@ -592,7 +572,7 @@ __packet_get_vlan_tag(const uint64_t ph, uint16_t *vlan_tag, #endif /* !KERNEL */ uint64_t pflags; - PKT_SUBTYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET, NEXUS_META_SUBTYPE_RAW); + PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); pflags = PKT_ADDR(ph)->pkt_pflags; if ((pflags & PKT_F_OPT_VLTAG) == 0) { return ENOENT; @@ -600,16 +580,12 @@ __packet_get_vlan_tag(const uint64_t ph, uint16_t *vlan_tag, if (vlan_tag != NULL) { *vlan_tag = po->__po_vlan_tag; } - if (tag_in_pkt != NULL) { - *tag_in_pkt = ((pflags & PKT_F_OPT_VLTAG_IN_PKT) != 0); - } return 0; } __attribute__((always_inline)) static inline errno_t -__packet_set_vlan_tag(const uint64_t ph, const uint16_t vlan_tag, - const boolean_t tag_in_pkt) +__packet_set_vlan_tag(const uint64_t ph, const uint16_t vlan_tag) { #ifdef KERNEL struct __packet_opt *po = PKT_ADDR(ph)->pkt_com_opt; @@ -617,13 +593,10 @@ __packet_set_vlan_tag(const uint64_t ph, const uint16_t vlan_tag, struct __packet_opt *po = &PKT_ADDR(ph)->pkt_com_opt; #endif /* !KERNEL */ - PKT_SUBTYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET, NEXUS_META_SUBTYPE_RAW); + PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); PKT_ADDR(ph)->pkt_pflags |= PKT_F_OPT_VLTAG; po->__po_vlan_tag = vlan_tag; - if (tag_in_pkt) { - PKT_ADDR(ph)->pkt_pflags |= PKT_F_OPT_VLTAG_IN_PKT; - } return 0; } @@ -689,7 +662,6 @@ __packet_set_app_metadata(const uint64_t ph, return 0; } -#ifdef KERNEL __attribute__((always_inline)) static inline void __packet_set_wake_flag(const uint64_t ph) @@ -697,7 +669,6 @@ __packet_set_wake_flag(const uint64_t ph) PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); PKT_ADDR(ph)->pkt_pflags |= PKT_F_WAKE_PKT; } -#endif __attribute__((always_inline)) static inline boolean_t @@ -706,6 +677,30 @@ __packet_get_wake_flag(const uint64_t ph) return (PKT_ADDR(ph)->pkt_pflags & PKT_F_WAKE_PKT) != 0; } +#ifdef KERNEL +__attribute__((always_inline)) +static inline void +__packet_set_ulpn_flag(const uint64_t ph) +{ + PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); + PKT_ADDR(ph)->pkt_pflags |= PKT_F_ULPN; +} +#endif + +__attribute__((always_inline)) +static inline boolean_t +__packet_get_ulpn_flag(const uint64_t ph) +{ + return (PKT_ADDR(ph)->pkt_pflags & PKT_F_ULPN) != 0; +} + +__attribute__((always_inline)) +static inline boolean_t +__packet_get_lpw_flag(const uint64_t ph) +{ + return (PKT_ADDR(ph)->pkt_pflags & __PKT_F_LPW) != 0; +} + __attribute__((always_inline)) static inline void __packet_set_keep_alive(const uint64_t ph, const boolean_t is_keep_alive) @@ -773,7 +768,7 @@ __packet_set_service_class(const uint64_t ph, const uint32_t sc) { int err = 0; - _CASSERT(sizeof(QUM_ADDR(ph)->qum_svc_class == sizeof(uint32_t))); + static_assert(sizeof(QUM_ADDR(ph)->qum_svc_class == sizeof(uint32_t))); switch (sc) { case PKT_SC_BE: @@ -804,7 +799,7 @@ __packet_get_service_class(const uint64_t ph) { uint32_t sc; - _CASSERT(sizeof(QUM_ADDR(ph)->qum_svc_class == sizeof(uint32_t))); + static_assert(sizeof(QUM_ADDR(ph)->qum_svc_class == sizeof(uint32_t))); switch (QUM_ADDR(ph)->qum_svc_class) { case PKT_SC_BE: /* most likely best effort 
*/ @@ -833,7 +828,7 @@ __attribute__((always_inline)) static inline errno_t __packet_set_comp_gencnt(const uint64_t ph, const uint32_t gencnt) { - _CASSERT(sizeof(PKT_ADDR(ph)->pkt_comp_gencnt == sizeof(uint32_t))); + static_assert(sizeof(PKT_ADDR(ph)->pkt_comp_gencnt == sizeof(uint32_t))); PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); PKT_ADDR(ph)->pkt_comp_gencnt = gencnt; @@ -845,7 +840,7 @@ __attribute__((always_inline)) static inline errno_t __packet_get_comp_gencnt(const uint64_t ph, uint32_t *pgencnt) { - _CASSERT(sizeof(PKT_ADDR(ph)->pkt_comp_gencnt == sizeof(uint32_t))); + static_assert(sizeof(PKT_ADDR(ph)->pkt_comp_gencnt == sizeof(uint32_t))); PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); if (pgencnt == NULL) { @@ -1032,7 +1027,7 @@ __attribute__((always_inline)) static inline uint8_t __packet_get_aggregation_type(const uint64_t ph) { - _CASSERT(sizeof(PKT_ADDR(ph)->pkt_aggr_type == sizeof(uint8_t))); + static_assert(sizeof(PKT_ADDR(ph)->pkt_aggr_type == sizeof(uint8_t))); PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); return PKT_ADDR(ph)->pkt_aggr_type; @@ -1073,35 +1068,21 @@ __packet_get_buflet_count(const uint64_t ph) { uint16_t bcnt = 0; - switch (SK_PTR_TYPE(ph)) { - case NEXUS_META_TYPE_PACKET: - bcnt = PKT_ADDR(ph)->pkt_bufs_cnt; + bcnt = PKT_ADDR(ph)->pkt_bufs_cnt; #ifdef KERNEL - VERIFY(bcnt != 0 || - PP_HAS_BUFFER_ON_DEMAND(PKT_ADDR(ph)->pkt_qum.qum_pp)); + VERIFY(bcnt != 0 || + PP_HAS_BUFFER_ON_DEMAND(PKT_ADDR(ph)->pkt_qum.qum_pp)); #else /* !KERNEL */ - /* - * Handle the case where the metadata region gets - * redirected to anonymous zero-filled pages at - * defunct time. There's always 1 buflet in the - * packet metadata, so pretend that's the count. - */ - if (__improbable(bcnt == 0)) { - bcnt = 1; - } -#endif /* !KERNEL */ - break; - case NEXUS_META_TYPE_QUANTUM: + /* + * Handle the case where the metadata region gets + * redirected to anonymous zero-filled pages at + * defunct time. There's always 1 buflet in the + * packet metadata, so pretend that's the count. + */ + if (__improbable(bcnt == 0)) { bcnt = 1; - break; - default: -#ifdef KERNEL - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); -#endif /* KERNEL */ - break; } +#endif /* !KERNEL */ return bcnt; } @@ -1187,38 +1168,23 @@ __packet_get_next_buflet(const uint64_t ph, const void *bprev0) void *bcur = NULL; #endif /* !KERNEL */ - switch (SK_PTR_TYPE(ph)) { - case NEXUS_META_TYPE_PACKET: { - uint32_t bcnt = PKT_ADDR(ph)->pkt_bufs_cnt; + uint32_t bcnt = PKT_ADDR(ph)->pkt_bufs_cnt; #ifdef KERNEL - ASSERT(bcnt != 0 || - PP_HAS_BUFFER_ON_DEMAND(PKT_ADDR(ph)->pkt_qum.qum_pp)); + ASSERT(bcnt != 0 || + PP_HAS_BUFFER_ON_DEMAND(PKT_ADDR(ph)->pkt_qum.qum_pp)); #else /* !KERNEL */ - /* - * Handle the case where the metadata region gets - * redirected to anonymous zero-filled pages at - * defunct time. There's always 1 buflet in the - * packet metadata, so pretend that's the count. - */ - if (__improbable(bcnt == 0)) { - bcnt = 1; - bprev = NULL; - } + /* + * Handle the case where the metadata region gets + * redirected to anonymous zero-filled pages at + * defunct time. There's always 1 buflet in the + * packet metadata, so pretend that's the count. 
+ */ + if (__improbable(bcnt == 0)) { + bcnt = 1; + bprev = NULL; + } #endif /* !KERNEL */ - PKT_GET_NEXT_BUFLET(PKT_ADDR(ph), bcnt, BLT_ADDR(bprev), bcur); - break; - } - case NEXUS_META_TYPE_QUANTUM: - QUM_GET_NEXT_BUFLET(QUM_ADDR(ph), BLT_ADDR(bprev), bcur); - break; - default: -#ifdef KERNEL - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); -#endif /* KERNEL */ - break; - } + PKT_GET_NEXT_BUFLET(PKT_ADDR(ph), bcnt, BLT_ADDR(bprev), bcur); return bcur; } @@ -1226,7 +1192,7 @@ __attribute__((always_inline)) static inline uint8_t __packet_get_segment_count(const uint64_t ph) { - _CASSERT(sizeof(PKT_ADDR(ph)->pkt_seg_cnt == sizeof(uint8_t))); + static_assert(sizeof(PKT_ADDR(ph)->pkt_seg_cnt == sizeof(uint8_t))); PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); return PKT_ADDR(ph)->pkt_seg_cnt; @@ -1236,7 +1202,7 @@ __attribute__((always_inline)) static inline void __packet_set_segment_count(const uint64_t ph, uint8_t segcount) { - _CASSERT(sizeof(PKT_ADDR(ph)->pkt_seg_cnt == sizeof(uint8_t))); + static_assert(sizeof(PKT_ADDR(ph)->pkt_seg_cnt == sizeof(uint8_t))); PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); PKT_ADDR(ph)->pkt_seg_cnt = segcount; @@ -1263,7 +1229,7 @@ __attribute__((always_inline)) static inline void __packet_get_tso_flags(const uint64_t ph, packet_tso_flags_t *flags) { - _CASSERT(sizeof(PKT_ADDR(ph)->pkt_proto_seg_sz == sizeof(uint16_t))); + static_assert(sizeof(PKT_ADDR(ph)->pkt_proto_seg_sz == sizeof(uint16_t))); PKT_TYPE_ASSERT(ph, NEXUS_META_TYPE_PACKET); *flags = PKT_ADDR(ph)->pkt_csum_flags & (PACKET_CSUM_TSO_FLAGS); @@ -1295,7 +1261,7 @@ __buflet_set_data_limit(const void *buf, const uint32_t dlim) /* full bounds checking will be performed during finalize */ if (__probable((uint32_t)dlim <= BLT_ADDR(buf)->buf_objlim)) { - _CASSERT(sizeof(BLT_ADDR(buf)->buf_dlim) == sizeof(uint32_t)); + static_assert(sizeof(BLT_ADDR(buf)->buf_dlim) == sizeof(uint32_t)); /* deconst */ *(uint32_t *)(uintptr_t)&BLT_ADDR(buf)->buf_dlim = dlim; return 0; @@ -1392,80 +1358,50 @@ __packet_finalize(const uint64_t ph) goto done; } - switch (SK_PTR_TYPE(ph)) { - case NEXUS_META_TYPE_PACKET: - if (__improbable(bdoff0 > UINT8_MAX)) { - err = ERANGE; + if (__improbable(bdoff0 > UINT8_MAX)) { + err = ERANGE; + goto done; + } + /* internalize headroom value from offset */ + PKT_ADDR(ph)->pkt_headroom = (uint8_t)bdoff0; + /* validate header offsets in packet */ +#ifndef KERNEL + /* Overwrite L2 len for raw packets from user space */ + PKT_ADDR(ph)->pkt_l2_len = 0; +#else /* !KERNEL */ + /* ensure that L3 >= L2 && L3 < bdlim */ + if (__improbable((PKT_ADDR(ph)->pkt_headroom + + PKT_ADDR(ph)->pkt_l2_len) >= bdlim0)) { + err = ERANGE; + goto done; + } +#endif /* KERNEL */ + + if (__improbable(PKT_ADDR(ph)->pkt_pflags & PKT_F_OPT_DATA)) { +#ifdef KERNEL + struct __packet_opt *po = PKT_ADDR(ph)->pkt_com_opt; +#else /* !KERNEL */ + struct __packet_opt *po = &PKT_ADDR(ph)->pkt_com_opt; +#endif /* !KERNEL */ + if ((PKT_ADDR(ph)->pkt_pflags & PKT_F_OPT_EXPIRE_TS) && + po->__po_expire_ts == 0) { + err = EINVAL; goto done; } - /* internalize headroom value from offset */ - PKT_ADDR(ph)->pkt_headroom = (uint8_t)bdoff0; - /* validate header offsets in packet */ - switch (SK_PTR_SUBTYPE(ph)) { - case NEXUS_META_SUBTYPE_RAW: -#ifndef KERNEL - /* Overwrite L2 len for raw packets from user space */ - PKT_ADDR(ph)->pkt_l2_len = 0; -#else /* !KERNEL */ - /* ensure that L3 >= L2 && L3 < bdlim */ - if (__improbable((PKT_ADDR(ph)->pkt_headroom + - PKT_ADDR(ph)->pkt_l2_len) >= bdlim0)) { - err = ERANGE; - goto done; - 
} -#endif /* KERNEL */ - break; - case NEXUS_META_SUBTYPE_PAYLOAD: - /* - * For payload packet there is no concept of headroom - * and L3 offset should always be 0 - */ - if (__improbable((PKT_ADDR(ph)->pkt_headroom != 0) || - (PKT_ADDR(ph)->pkt_l2_len != 0))) { - err = ERANGE; - goto done; - } - break; - default: -#ifdef KERNEL - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); -#endif /* KERNEL */ - break; + if ((PKT_ADDR(ph)->pkt_pflags & PKT_F_OPT_TOKEN) && + po->__po_token_len == 0) { + err = EINVAL; + goto done; } - - if (__improbable(PKT_ADDR(ph)->pkt_pflags & PKT_F_OPT_DATA)) { -#ifdef KERNEL - struct __packet_opt *po = PKT_ADDR(ph)->pkt_com_opt; -#else /* !KERNEL */ - struct __packet_opt *po = &PKT_ADDR(ph)->pkt_com_opt; -#endif /* !KERNEL */ - if ((PKT_ADDR(ph)->pkt_pflags & PKT_F_OPT_EXPIRE_TS) && - po->__po_expire_ts == 0) { - err = EINVAL; - goto done; - } - if ((PKT_ADDR(ph)->pkt_pflags & PKT_F_OPT_TOKEN) && - po->__po_token_len == 0) { - err = EINVAL; - goto done; - } - ASSERT(err == 0); - } - - /* - * NOTE: we don't need the validation for total packet length - * as checking if each buflet is in range and that - * (pkt_headroom == bdoff0), should cover this check. - */ - break; - - default: - /* nothing to do currently for quantum */ - break; + ASSERT(err == 0); } + /* + * NOTE: we don't need the validation for total packet length + * as checking if each buflet is in range and that + * (pkt_headroom == bdoff0), should cover this check. + */ + done: if (__probable(err == 0)) { QUM_ADDR(ph)->qum_len = len; @@ -1558,38 +1494,16 @@ __packet_finalize_with_mbuf(struct __kern_packet *pkt) } /* validate header offsets in packet */ - switch (METADATA_SUBTYPE(pkt)) { - case NEXUS_META_SUBTYPE_RAW: - if (__improbable((pkt->pkt_headroom != bdoff) || - (pkt->pkt_headroom >= bdlim))) { - err = ERANGE; - goto done; - } - if (__improbable((pkt->pkt_headroom + - pkt->pkt_l2_len) >= bdlim)) { - err = ERANGE; - goto done; - } - break; - - case NEXUS_META_SUBTYPE_PAYLOAD: - /* - * For payload packet there is no concept of headroom. 
- */ - if (__improbable((pkt->pkt_headroom != 0) || (bdoff != 0) || - (pkt->pkt_l2_len != 0))) { - err = ERANGE; - goto done; - } - break; - - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - break; + if (__improbable((pkt->pkt_headroom != bdoff) || + (pkt->pkt_headroom >= bdlim))) { + err = ERANGE; + goto done; + } + if (__improbable((pkt->pkt_headroom + + pkt->pkt_l2_len) >= bdlim)) { + err = ERANGE; + goto done; } - if (__improbable(pkt->pkt_pflags & PKT_F_OPT_DATA)) { struct __packet_opt *po = pkt->pkt_com_opt; @@ -1768,7 +1682,7 @@ __packet_get_tx_nx_port_id(const uint64_t ph, uint32_t *nx_port_id) nexus_port_t nx_port; uint16_t vpna_gencnt; - _CASSERT(sizeof(nx_port) == sizeof(uint16_t)); + static_assert(sizeof(nx_port) == sizeof(uint16_t)); err = __packet_get_tx_nx_port(ph, &nx_port, &vpna_gencnt); if (err == 0) { @@ -1873,7 +1787,7 @@ __buflet_set_data_address(const void *buf, const void *addr) /* full bounds checking will be performed during finalize */ if (__probable((uintptr_t)addr >= (uintptr_t)BLT_ADDR(buf)->buf_objaddr)) { - _CASSERT(sizeof(BLT_ADDR(buf)->buf_addr) == + static_assert(sizeof(BLT_ADDR(buf)->buf_addr) == sizeof(mach_vm_address_t)); /* deconst */ *(mach_vm_address_t *)(uintptr_t)&BLT_ADDR(buf)->buf_addr = @@ -1961,7 +1875,7 @@ __attribute__((always_inline)) static inline struct sksegment * __buflet_get_object_segment(const void *buf, kern_obj_idx_seg_t *idx) { - _CASSERT(sizeof(obj_idx_t) == sizeof(kern_obj_idx_seg_t)); + static_assert(sizeof(obj_idx_t) == sizeof(kern_obj_idx_seg_t)); if (idx != NULL) { *idx = BLT_ADDR(buf)->buf_ctl->bc_idx; @@ -2007,13 +1921,7 @@ __attribute__((always_inline)) static inline packet_trace_id_t __packet_get_trace_id(const uint64_t ph) { - switch (SK_PTR_TYPE(ph)) { - case NEXUS_META_TYPE_PACKET: - return PKT_ADDR(ph)->pkt_trace_id; - break; - default: - return 0; - } + return PKT_ADDR(ph)->pkt_trace_id; } __attribute__((always_inline)) diff --git a/bsd/skywalk/packet/packet_copy.c b/bsd/skywalk/packet/packet_copy.c index 6d796a69c..e1e00341d 100644 --- a/bsd/skywalk/packet/packet_copy.c +++ b/bsd/skywalk/packet/packet_copy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Apple Inc. All rights reserved. + * Copyright (c) 2017-2024 Apple Inc. All rights reserved. 
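
[Editor's note] With PKT_F_OPT_VLTAG_IN_PKT retired, the VLAN accessors above carry only the 16-bit 802.1Q TCI, and the copy routines that follow simply shuttle that value between mbufs and packets via mbuf_get_vlan_tag()/mbuf_set_vlan_tag(). A hedged sketch of how a caller might use the reduced two-argument API is below; only the extern prototypes come from os_packet.h, while the helper names, header path, and error handling are illustrative assumptions.

#include <skywalk/os_packet.h>   /* header path is an assumption */

/*
 * 802.1Q TCI layout: PCP in bits 15-13, DEI in bit 12, VLAN ID in bits 11-0.
 */
static errno_t
example_stamp_vlan(kern_packet_t ph, uint16_t vlan_id, uint8_t prio)
{
    uint16_t tci = (uint16_t)(((prio & 0x7) << 13) | (vlan_id & 0x0fff));

    /* single-argument form: no "tag already in packet" boolean anymore */
    return kern_packet_set_vlan_tag(ph, tci);
}

static void
example_read_vlan(kern_packet_t ph)
{
    uint16_t tci;

    if (kern_packet_get_vlan_tag(ph, &tci) == 0) {  /* ENOENT if untagged */
        uint16_t id   = kern_packet_get_vlan_id(tci);
        uint8_t  prio = kern_packet_get_vlan_priority(tci);
        /* ... use id/prio ... */
        (void)id;
        (void)prio;
    }
}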
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -30,17 +30,16 @@ #include #include -uint32_t copy_pkt_tx_time = 1; #if (DEVELOPMENT || DEBUG) + +/* per-packet logging is wasteful in release */ +#define COPY_LOG 1 + SYSCTL_NODE(_kern_skywalk, OID_AUTO, packet, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk packet"); int pkt_trailers = 0; /* for testing trailing bytes */ SYSCTL_INT(_kern_skywalk_packet, OID_AUTO, trailers, CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_trailers, 0, ""); - -SYSCTL_UINT(_kern_skywalk_packet, OID_AUTO, copy_pkt_tx_time, - CTLFLAG_RW | CTLFLAG_LOCKED, ©_pkt_tx_time, 0, - "copy tx time from pkt to mbuf"); #endif /* !DEVELOPMENT && !DEBUG */ @@ -101,7 +100,7 @@ pkt_copy_from_pkt(const enum txrx t, kern_packet_t dph, const uint16_t doff, uint8_t *sbaddr, *dbaddr; boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt); - _CASSERT(sizeof(csum) == sizeof(uint16_t)); + static_assert(sizeof(csum) == sizeof(uint16_t)); /* get buffer address from packet */ MD_BUFLET_ADDR_ABS(spkt, sbaddr); @@ -137,16 +136,17 @@ pkt_copy_from_pkt(const enum txrx t, kern_packet_t dph, const uint16_t doff, dpkt->pkt_csum_flags |= spkt->pkt_csum_flags & PACKET_CSUM_RX_FLAGS; } +#if COPY_LOG SK_DF(SK_VERB_COPY | SK_VERB_RX, "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? (len - start) : 0), csum, start); + sk_proc_name(current_proc()), sk_proc_pid(current_proc()), + len, (copysum ? (len - start) : 0), csum, start); SK_DF(SK_VERB_COPY | SK_VERB_RX, - " pkt 0x%llx doff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", + " pkt %p doff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(dpkt), doff, dpkt->pkt_csum_flags, (uint32_t)dpkt->pkt_csum_rx_start_off, (uint32_t)dpkt->pkt_csum_rx_value); +#endif break; case NR_TX: @@ -184,11 +184,13 @@ pkt_copy_from_pkt(const enum txrx t, kern_packet_t dph, const uint16_t doff, dpkt->pkt_csum_tx_start_off = 0; dpkt->pkt_csum_tx_stuff_off = 0; +#if COPY_LOG SK_DF(SK_VERB_COPY | SK_VERB_TX, "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u, flags %u", - sk_proc_name_address(current_proc()), + sk_proc_name(current_proc()), sk_proc_pid(current_proc()), len, (copysum ? (len - start) : 0), csum, start, dpkt->pkt_csum_flags); +#endif break; default: @@ -198,10 +200,12 @@ pkt_copy_from_pkt(const enum txrx t, kern_packet_t dph, const uint16_t doff, } METADATA_ADJUST_LEN(dpkt, len, doff); +#if COPY_LOG SK_DF(SK_VERB_COPY | SK_VERB_DUMP, "%s(%d) %s %s", - sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()), + sk_proc_name(current_proc()), sk_proc_pid(current_proc()), (t == NR_RX) ? "RX" : "TX", - sk_dump("buf", dbaddr, len, 128, NULL, 0)); + sk_dump("buf", dbaddr, len, 128)); +#endif } /* @@ -255,7 +259,7 @@ _pkt_copyaddr_sum(kern_packet_t sph, uint16_t soff, uint8_t *__sized_by(len)dbad while (len != 0) { PKT_GET_NEXT_BUFLET(spkt, sbcnt, sbufp, sbuf); if (__improbable(sbuf == NULL)) { - panic("%s: bad packet, 0x%llx [off %d, len %d]", + panic("%s: bad packet, %p [off %d, len %d]", __func__, SK_KVA(spkt), off0, len0); /* NOTREACHED */ __builtin_unreachable(); @@ -316,19 +320,13 @@ _pkt_copyaddr_sum(kern_packet_t sph, uint16_t soff, uint8_t *__sized_by(len)dbad } dbaddr += clen; - /* - * -fbounds-safety: the following 3 lines were moved up from - * after the if-block. None of these are modified in the - * if-block, so moving these up here shouldn't change the - * behavior. 
Also, updating len before updating sbaddr led to - * faster throughput than doing: dbaddr += clen; sbaddr += clen; + * Updating len before updating sbaddr led to faster throughput + * than doing: dbaddr += clen; sbaddr += clen; * len -= clen + odd; */ - sblen -= clen + odd; - len -= clen + odd; - ASSERT(sblen == 0 || len == 0); - + len -= clen; + sblen -= clen; sbaddr += clen; if (__probable(do_csum)) { @@ -338,7 +336,16 @@ _pkt_copyaddr_sum(kern_packet_t sph, uint16_t soff, uint8_t *__sized_by(len)dbad #else /* BYTE_ORDER != LITTLE_ENDIAN */ partial += (uint8_t)*sbaddr << 8; #endif /* BYTE_ORDER != LITTLE_ENDIAN */ - *dbaddr++ = *sbaddr++; + ASSERT(odd == 1); + /* + * -fbounds-safety: Not written as `*dbaddr++ = *sbaddr++` + * to avoid compiler bug (rdar://98749526). This + * bug is only fixed when using `bound-checks-new-checks`. + */ + *dbaddr = *sbaddr++; + dbaddr++; + len -= 1; + sblen -= 1; started_on_odd = !started_on_odd; } @@ -352,6 +359,7 @@ _pkt_copyaddr_sum(kern_packet_t sph, uint16_t soff, uint8_t *__sized_by(len)dbad */ sum = (sum >> 16) + (sum & 0xffff); } + ASSERT(sblen == 0 || len == 0); } if (odd_start) { @@ -753,11 +761,13 @@ pkt_copy_multi_buflet_from_pkt(const enum txrx t, kern_packet_t dph, dpkt->pkt_csum_tx_start_off = 0; dpkt->pkt_csum_tx_stuff_off = 0; +#if COPY_LOG SK_DF(SK_VERB_COPY | SK_VERB_TX, "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u, flags %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? (len - start) : 0), csum, start, dpkt->pkt_csum_flags); + sk_proc_name(current_proc()), sk_proc_pid(current_proc()), + len, (copysum ? (len - start) : 0), csum, start, + dpkt->pkt_csum_flags); +#endif break; default: @@ -811,9 +821,10 @@ pkt_copy_from_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, struct m_tag *ts_tag = NULL; uint32_t partial; uint16_t csum = 0; + uint16_t vlan = 0; uint8_t *baddr; - _CASSERT(sizeof(csum) == sizeof(uint16_t)); + static_assert(sizeof(csum) == sizeof(uint16_t)); /* get buffer address from packet */ MD_BUFLET_ADDR_ABS(pkt, baddr); @@ -846,21 +857,26 @@ pkt_copy_from_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, } else { m_copydata(m, moff, len, baddr); } + + if (mbuf_get_vlan_tag(m, &vlan) == 0) { + __packet_set_vlan_tag(ph, vlan); + } + +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_RX, current_proc(), + "RX len %u, copy+sum %u (csum 0x%04x), start %u", + len, (copysum ? (len - start) : 0), csum, start); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? 
(len - start) : 0), csum, start); - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - " mbuf 0x%llx csumf/rxstart/rxval 0x%x/%u/0x%04x", + " mbuf %p csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(m), m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start, (uint32_t)m->m_pkthdr.csum_rx_val); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", + " pkt %p poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(pkt), poff, pkt->pkt_csum_flags, (uint32_t)pkt->pkt_csum_rx_start_off, (uint32_t)pkt->pkt_csum_rx_value); +#endif break; case NR_TX: @@ -941,6 +957,9 @@ pkt_copy_from_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, if ((m->m_pkthdr.pkt_flags & PKTF_START_SEQ) != 0) { pkt->pkt_flow_tcp_seq = htonl(m->m_pkthdr.tx_start_seq); } + if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_LPW) != 0) { + pkt->pkt_pflags |= __PKT_F_LPW; + } if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_L4S) != 0) { pkt->pkt_pflags |= PKT_F_L4S; } @@ -976,16 +995,20 @@ pkt_copy_from_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, __packet_set_tx_timestamp(ph, *(uint64_t *)(ts_tag->m_tag_data)); } + if (mbuf_get_vlan_tag(m, &vlan) == 0) { + __packet_set_vlan_tag(ph, vlan); + } + +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_TX, current_proc(), + "TX len %u, copy+sum %u (csum 0x%04x), start %u", + len, (copysum ? (len - start) : 0), csum, start); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX, - "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? (len - start) : 0), csum, start); - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX, - " mbuf 0x%llx csumf/txstart/txstuff 0x%x/%u/%u", + " mbuf %p csumf/txstart/txstuff 0x%x/%u/%u", SK_KVA(m), m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_tx_start, (uint32_t)m->m_pkthdr.csum_tx_stuff); +#endif break; default: @@ -1001,10 +1024,10 @@ pkt_copy_from_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, __packet_set_link_multicast(ph); } - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s", - sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()), - (t == NR_RX) ? "RX" : "TX", - sk_dump("buf", baddr, len, 128, NULL, 0)); +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, current_proc(), "%s %s", + (t == NR_RX) ? "RX" : "TX", sk_dump("buf", baddr, len, 128)); +#endif } /* @@ -1198,9 +1221,10 @@ pkt_copy_multi_buflet_from_mbuf(const enum txrx t, kern_packet_t ph, struct m_tag *ts_tag = NULL; uint32_t partial; uint16_t csum = 0; + uint16_t vlan = 0; uint8_t *baddr; - _CASSERT(sizeof(csum) == sizeof(uint16_t)); + static_assert(sizeof(csum) == sizeof(uint16_t)); /* get buffer address from packet */ MD_BUFLET_ADDR_ABS(pkt, baddr); @@ -1234,21 +1258,26 @@ pkt_copy_multi_buflet_from_mbuf(const enum txrx t, kern_packet_t ph, } else { (void) m_copypkt_sum(m, moff, ph, poff, len, FALSE); } + + if (mbuf_get_vlan_tag(m, &vlan) == 0) { + __packet_set_vlan_tag(ph, vlan); + } + +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_RX, current_proc(), + "RX len %u, copy+sum %u (csum 0x%04x), start %u", + len, (copysum ? (len - start) : 0), csum, start); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? 
(len - start) : 0), csum, start); - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - " mbuf 0x%llx csumf/rxstart/rxval 0x%x/%u/0x%04x", + " mbuf %p csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(m), m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start, (uint32_t)m->m_pkthdr.csum_rx_val); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", + " pkt %p poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(pkt), poff, pkt->pkt_csum_flags, (uint32_t)pkt->pkt_csum_rx_start_off, (uint32_t)pkt->pkt_csum_rx_value); +#endif break; case NR_TX: @@ -1330,6 +1359,9 @@ pkt_copy_multi_buflet_from_mbuf(const enum txrx t, kern_packet_t ph, if ((m->m_pkthdr.pkt_flags & PKTF_START_SEQ) != 0) { pkt->pkt_flow_tcp_seq = htonl(m->m_pkthdr.tx_start_seq); } + if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_LPW) != 0) { + pkt->pkt_pflags |= __PKT_F_LPW; + } if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_L4S) != 0) { pkt->pkt_pflags |= PKT_F_L4S; } @@ -1365,16 +1397,20 @@ pkt_copy_multi_buflet_from_mbuf(const enum txrx t, kern_packet_t ph, __packet_set_tx_timestamp(ph, *(uint64_t *)(ts_tag->m_tag_data)); } + if (mbuf_get_vlan_tag(m, &vlan) == 0) { + __packet_set_vlan_tag(ph, vlan); + } + +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_TX, current_proc(), + "TX len %u, copy+sum %u (csum 0x%04x), start %u", + len, (copysum ? (len - start) : 0), csum, start); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX, - "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? (len - start) : 0), csum, start); - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX, - " mbuf 0x%llx csumf/txstart/txstuff 0x%x/%u/%u", + " mbuf %p csumf/txstart/txstuff 0x%x/%u/%u", SK_KVA(m), m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_tx_start, (uint32_t)m->m_pkthdr.csum_tx_stuff); +#endif break; default: @@ -1389,10 +1425,10 @@ pkt_copy_multi_buflet_from_mbuf(const enum txrx t, kern_packet_t ph, __packet_set_link_multicast(ph); } - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s", - sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()), - (t == NR_RX) ? "RX" : "TX", - sk_dump("buf", baddr, len, 128, NULL, 0)); +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, current_proc(), "%s %s", + (t == NR_RX) ? 
"RX" : "TX", sk_dump("buf", baddr, len, 128)); +#endif } static inline uint32_t @@ -1449,12 +1485,13 @@ pkt_copy_to_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, uint32_t partial = 0; uint32_t remaining_len = len, copied_len = 0; uint16_t csum = 0; + uint16_t vlan = 0; uint8_t *baddr; uint8_t *dp; boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt); ASSERT(len >= start); - _CASSERT(sizeof(csum) == sizeof(uint16_t)); + static_assert(sizeof(csum) == sizeof(uint16_t)); /* get buffer address from packet */ MD_BUFLET_ADDR_ABS(pkt, baddr); @@ -1514,7 +1551,7 @@ pkt_copy_to_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, } else { m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off; m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value; - _CASSERT(CSUM_RX_FULL_FLAGS == PACKET_CSUM_RX_FULL_FLAGS); + static_assert(CSUM_RX_FULL_FLAGS == PACKET_CSUM_RX_FULL_FLAGS); m->m_pkthdr.csum_flags |= pkt->pkt_csum_flags & PACKET_CSUM_RX_FULL_FLAGS; if (__improbable((pkt->pkt_csum_flags & PACKET_CSUM_PARTIAL) != 0)) { m->m_pkthdr.csum_flags |= CSUM_PARTIAL; @@ -1527,21 +1564,25 @@ pkt_copy_to_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, m->m_pkthdr.rx_seg_cnt = pkt->pkt_seg_cnt; + if (__packet_get_vlan_tag(ph, &vlan) == 0) { + mbuf_set_vlan_tag(m, vlan); + } + +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_RX, current_proc(), + "RX len %u, copy+sum %u (csum 0x%04x), start %u", + len, (copysum ? (len - start) : 0), csum, start); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? (len - start) : 0), csum, start); - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - " mbuf 0x%llx moff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", + " mbuf %p moff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(m), moff, m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start, (uint32_t)m->m_pkthdr.csum_rx_val); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", + " pkt %p poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(pkt), poff, pkt->pkt_csum_flags, (uint32_t)pkt->pkt_csum_rx_start_off, (uint32_t)pkt->pkt_csum_rx_value); +#endif break; case NR_TX: @@ -1605,11 +1646,13 @@ pkt_copy_to_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) { m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq); } + if ((pkt->pkt_pflags & __PKT_F_LPW) != 0) { + m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_LPW; + } if ((pkt->pkt_pflags & PKT_F_L4S) != 0) { m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S; } - if (__improbable(copy_pkt_tx_time != 0 && - (pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0)) { + if ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0) { struct m_tag *tag = NULL; tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM, sizeof(uint64_t), M_WAITOK, m); @@ -1621,16 +1664,20 @@ pkt_copy_to_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, m->m_pkthdr.necp_mtag.necp_policy_id = pkt->pkt_policy_id; m->m_pkthdr.necp_mtag.necp_skip_policy_id = pkt->pkt_skip_policy_id; + if (__packet_get_vlan_tag(ph, &vlan) == 0) { + mbuf_set_vlan_tag(m, vlan); + } + +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_TX, current_proc(), + "TX len %u, copy+sum %u (csum 0x%04x), start %u", + len, (copysum ? 
(len - start) : 0), csum, start); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX, - "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? (len - start) : 0), csum, start); - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX, - " pkt 0x%llx poff %u csumf/txstart/txstuff 0x%x/%u/%u", + " pkt %p poff %u csumf/txstart/txstuff 0x%x/%u/%u", SK_KVA(pkt), poff, pkt->pkt_csum_flags, (uint32_t)pkt->pkt_csum_tx_start_off, (uint32_t)pkt->pkt_csum_tx_stuff_off); +#endif break; default: @@ -1644,10 +1691,11 @@ pkt_copy_to_mbuf(const enum txrx t, kern_packet_t ph, const uint16_t poff, } else if (pkt->pkt_link_flags & PKT_LINKF_MCAST) { m->m_flags |= M_MCAST; } - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s", - sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()), +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, current_proc(), "%s %s", (t == NR_RX) ? "RX" : "TX", - sk_dump("buf", (uint8_t *)dp, m->m_len, 128, NULL, 0)); + sk_dump("buf", (uint8_t *)dp, m->m_len, 128)); +#endif } /* @@ -1669,12 +1717,13 @@ pkt_copy_multi_buflet_to_mbuf(const enum txrx t, kern_packet_t ph, uint32_t partial = 0; uint32_t remaining_len = len, copied_len = 0; uint16_t csum = 0; + uint16_t vlan = 0; uint8_t *baddr; uint8_t *dp; boolean_t do_sum = copysum && !PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt); ASSERT(len >= start); - _CASSERT(sizeof(csum) == sizeof(uint16_t)); + static_assert(sizeof(csum) == sizeof(uint16_t)); /* get buffer address from packet */ MD_BUFLET_ADDR_ABS(pkt, baddr); @@ -1730,7 +1779,7 @@ pkt_copy_multi_buflet_to_mbuf(const enum txrx t, kern_packet_t ph, } else { m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off; m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value; - _CASSERT(CSUM_RX_FULL_FLAGS == PACKET_CSUM_RX_FULL_FLAGS); + static_assert(CSUM_RX_FULL_FLAGS == PACKET_CSUM_RX_FULL_FLAGS); m->m_pkthdr.csum_flags |= pkt->pkt_csum_flags & PACKET_CSUM_RX_FULL_FLAGS; if (__improbable((pkt->pkt_csum_flags & PACKET_CSUM_PARTIAL) != 0)) { m->m_pkthdr.csum_flags |= CSUM_PARTIAL; @@ -1746,21 +1795,25 @@ pkt_copy_multi_buflet_to_mbuf(const enum txrx t, kern_packet_t ph, m->m_pkthdr.rx_seg_cnt = pkt->pkt_seg_cnt; + if (__packet_get_vlan_tag(ph, &vlan) == 0) { + mbuf_set_vlan_tag(m, vlan); + } + +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_RX, current_proc(), + "RX len %u, copy+sum %u (csum 0x%04x), start %u", + len, (copysum ? (len - start) : 0), csum, start); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - "%s(%d) RX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? 
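/*
 * Illustrative sketch, not part of the patch: the copy routines above now
 * mirror the 802.1Q tag and the low-power-wake hint between an mbuf and its
 * skywalk packet in both directions.  `ph`/`pkt` are assumed to describe the
 * same kern_packet_t and `m` the peer mbuf; every identifier used below is
 * taken from the surrounding hunks.
 */
static inline void
example_mirror_vlan_and_lpw(kern_packet_t ph, struct __kern_packet *pkt,
    struct mbuf *m, boolean_t to_mbuf)
{
	uint16_t vlan = 0;

	if (to_mbuf) {
		/* packet -> mbuf (copy-to-mbuf paths) */
		if (__packet_get_vlan_tag(ph, &vlan) == 0) {
			mbuf_set_vlan_tag(m, vlan);
		}
		if ((pkt->pkt_pflags & __PKT_F_LPW) != 0) {
			m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_LPW;
		}
	} else {
		/* mbuf -> packet (copy-from-mbuf paths) */
		if (mbuf_get_vlan_tag(m, &vlan) == 0) {
			__packet_set_vlan_tag(ph, vlan);
		}
		if ((m->m_pkthdr.pkt_ext_flags & PKTF_EXT_LPW) != 0) {
			pkt->pkt_pflags |= __PKT_F_LPW;
		}
	}
}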
(len - start) : 0), csum, start); - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - " mbuf 0x%llx moff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", + " mbuf %p moff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(m), moff, m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start, (uint32_t)m->m_pkthdr.csum_rx_val); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_RX, - " pkt 0x%llx poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", + " pkt %p poff %u csumf/rxstart/rxval 0x%x/%u/0x%04x", SK_KVA(pkt), poff, pkt->pkt_csum_flags, (uint32_t)pkt->pkt_csum_rx_start_off, (uint32_t)pkt->pkt_csum_rx_value); +#endif break; case NR_TX: ASSERT(len <= M16KCLBYTES); @@ -1822,11 +1875,13 @@ pkt_copy_multi_buflet_to_mbuf(const enum txrx t, kern_packet_t ph, if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) { m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq); } + if ((pkt->pkt_pflags & __PKT_F_LPW) != 0) { + m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_LPW; + } if ((pkt->pkt_pflags & PKT_F_L4S) != 0) { m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S; } - if (__improbable(copy_pkt_tx_time != 0 && - (pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0)) { + if ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0) { struct m_tag *tag = NULL; tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_AQM, sizeof(uint64_t), M_WAITOK, m); @@ -1836,16 +1891,20 @@ pkt_copy_multi_buflet_to_mbuf(const enum txrx t, kern_packet_t ph, } } + if (__packet_get_vlan_tag(ph, &vlan) == 0) { + mbuf_set_vlan_tag(m, vlan); + } + +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_TX, current_proc(), + "TX len %u, copy+sum %u (csum 0x%04x), start %u", + len, (copysum ? (len - start) : 0), csum, start); SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX, - "%s(%d) TX len %u, copy+sum %u (csum 0x%04x), start %u", - sk_proc_name_address(current_proc()), - sk_proc_pid(current_proc()), len, - (copysum ? (len - start) : 0), csum, start); - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_TX, - " pkt 0x%llx poff %u csumf/txstart/txstuff 0x%x/%u/%u", + " pkt %p poff %u csumf/txstart/txstuff 0x%x/%u/%u", SK_KVA(pkt), poff, pkt->pkt_csum_flags, (uint32_t)pkt->pkt_csum_tx_start_off, (uint32_t)pkt->pkt_csum_tx_stuff_off); +#endif break; default: @@ -1859,10 +1918,11 @@ pkt_copy_multi_buflet_to_mbuf(const enum txrx t, kern_packet_t ph, } else if (pkt->pkt_link_flags & PKT_LINKF_MCAST) { m->m_flags |= M_MCAST; } - SK_DF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, "%s(%d) %s %s", - sk_proc_name_address(current_proc()), sk_proc_pid(current_proc()), +#if COPY_LOG + SK_PDF(SK_VERB_COPY_MBUF | SK_VERB_DUMP, current_proc(), "%s %s", (t == NR_RX) ? "RX" : "TX", - sk_dump("buf", (uint8_t *)dp, m->m_len, 128, NULL, 0)); + sk_dump("buf", (uint8_t *)dp, m->m_len, 128)); +#endif } /* diff --git a/bsd/skywalk/packet/packet_kern.c b/bsd/skywalk/packet/packet_kern.c index 7d2ebbc6c..2f552935b 100644 --- a/bsd/skywalk/packet/packet_kern.c +++ b/bsd/skywalk/packet/packet_kern.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Apple Inc. All rights reserved. + * Copyright (c) 2016-2024 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,29 +32,6 @@ static int kern_packet_clone_internal(const kern_packet_t, kern_packet_t *, uint32_t, kern_packet_copy_mode_t); -#if (DEBUG || DEVELOPMENT) -__attribute__((noreturn)) -void -pkt_subtype_assert_fail(const kern_packet_t ph, uint64_t type, uint64_t subtype) -{ - panic("invalid packet handle 0x%llx (type %llu != %llu || " - "subtype %llu != %llu)", ph, SK_PTR_TYPE(ph), type, - SK_PTR_SUBTYPE(ph), subtype); - /* NOTREACHED */ - __builtin_unreachable(); -} - -__attribute__((noreturn)) -void -pkt_type_assert_fail(const kern_packet_t ph, uint64_t type) -{ - panic("invalid packet handle 0x%llx (type %llu != %llu)", - ph, SK_PTR_TYPE(ph), type); - /* NOTREACHED */ - __builtin_unreachable(); -} -#endif /* DEBUG || DEVELOPMENT */ - errno_t kern_packet_set_headroom(const kern_packet_t ph, const uint8_t headroom) { @@ -308,47 +285,31 @@ kern_packet_clear_flow_uuid(const kern_packet_t ph) void kern_packet_get_euuid(const kern_packet_t ph, uuid_t euuid) { - if (__probable(SK_PTR_TYPE(ph) == NEXUS_META_TYPE_PACKET)) { - uuid_copy(euuid, PKT_ADDR(ph)->pkt_policy_euuid); - } else { - uuid_clear(euuid); - } + uuid_copy(euuid, PKT_ADDR(ph)->pkt_policy_euuid); } void kern_packet_set_policy_id(const kern_packet_t ph, uint32_t policy_id) { - if (__probable(SK_PTR_TYPE(ph) == NEXUS_META_TYPE_PACKET)) { - PKT_ADDR(ph)->pkt_policy_id = policy_id; - } + PKT_ADDR(ph)->pkt_policy_id = policy_id; } uint32_t kern_packet_get_policy_id(const kern_packet_t ph) { - if (__probable(SK_PTR_TYPE(ph) == NEXUS_META_TYPE_PACKET)) { - return PKT_ADDR(ph)->pkt_policy_id; - } else { - return 0; - } + return PKT_ADDR(ph)->pkt_policy_id; } void kern_packet_set_skip_policy_id(const kern_packet_t ph, uint32_t skip_policy_id) { - if (__probable(SK_PTR_TYPE(ph) == NEXUS_META_TYPE_PACKET)) { - PKT_ADDR(ph)->pkt_skip_policy_id = skip_policy_id; - } + PKT_ADDR(ph)->pkt_skip_policy_id = skip_policy_id; } uint32_t kern_packet_get_skip_policy_id(const kern_packet_t ph) { - if (__probable(SK_PTR_TYPE(ph) == NEXUS_META_TYPE_PACKET)) { - return PKT_ADDR(ph)->pkt_skip_policy_id; - } else { - return 0; - } + return PKT_ADDR(ph)->pkt_skip_policy_id; } uint32_t @@ -513,17 +474,15 @@ kern_packet_get_packetid(const kern_packet_t ph, packet_id_t *pktid) } errno_t -kern_packet_set_vlan_tag(const kern_packet_t ph, const uint16_t tag, - const boolean_t tag_in_pkt) +kern_packet_set_vlan_tag(const kern_packet_t ph, const uint16_t tag) { - return __packet_set_vlan_tag(ph, tag, tag_in_pkt); + return __packet_set_vlan_tag(ph, tag); } errno_t -kern_packet_get_vlan_tag(const kern_packet_t ph, uint16_t *tag, - boolean_t *tag_in_pkt) +kern_packet_get_vlan_tag(const kern_packet_t ph, uint16_t *tag) { - return __packet_get_vlan_tag(ph, tag, tag_in_pkt); + return __packet_get_vlan_tag(ph, tag); } uint16_t @@ -557,6 +516,24 @@ kern_packet_get_wake_flag(const kern_packet_t ph) return __packet_get_wake_flag(ph); } +void +kern_packet_set_ulpn_flag(const kern_packet_t ph) +{ + return __packet_set_ulpn_flag(ph); +} + +boolean_t +kern_packet_get_ulpn_flag(const kern_packet_t ph) +{ + return __packet_get_ulpn_flag(ph); +} + +boolean_t +kern_packet_get_lpw_flag(const kern_packet_t ph) +{ + return __packet_get_lpw_flag(ph); +} + uint32_t kern_inet_checksum(const void *data, uint32_t len, uint32_t sum0) { @@ -589,7 +566,6 @@ kern_packet_clone_internal(const kern_packet_t ph1, kern_packet_t *ph2, int err; /* TODO: Add quantum support */ - VERIFY(SK_PTR_TYPE(ph1) == NEXUS_META_TYPE_PACKET); /* Source needs to be 
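/*
 * Hedged sketch of a hypothetical driver using the revised KPI surface shown
 * above: kern_packet_{set,get}_vlan_tag() no longer take a tag_in_pkt
 * argument, and the new kern_packet_get_lpw_flag() getter exposes the
 * low-power-wake hint.  The surrounding driver flow is assumed.
 */
static void
example_driver_tx_prepare(kern_packet_t ph, uint16_t tag)
{
	uint16_t readback = 0;

	(void)kern_packet_set_vlan_tag(ph, tag);
	if (kern_packet_get_vlan_tag(ph, &readback) == 0) {
		/* readback now holds the tag set above */
	}
	if (kern_packet_get_lpw_flag(ph)) {
		/* packet belongs to a low-power-wake flow */
	}
}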
finalized (not dropped) and with 1 buflet */ if ((p1->pkt_qum.qum_qflags & QUM_F_DROPPED) != 0 || @@ -705,7 +681,7 @@ kern_packet_clone_internal(const kern_packet_t ph1, kern_packet_t *ph2, /* Copy AQM metadata */ p2->pkt_flowsrc_type = p1->pkt_flowsrc_type; p2->pkt_flowsrc_fidx = p1->pkt_flowsrc_fidx; - _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0); + static_assert((offsetof(struct __flow, flow_src_id) % 8) == 0); _UUID_COPY(p2->pkt_flowsrc_id, p1->pkt_flowsrc_id); _UUID_COPY(p2->pkt_policy_euuid, p1->pkt_policy_euuid); p2->pkt_policy_id = p1->pkt_policy_id; diff --git a/bsd/skywalk/packet/packet_var.h b/bsd/skywalk/packet/packet_var.h index fc06ed2e6..5befd54ca 100644 --- a/bsd/skywalk/packet/packet_var.h +++ b/bsd/skywalk/packet/packet_var.h @@ -78,7 +78,7 @@ struct __kern_buflet_ext { } __attribute((packed)); #define KBUF_CTOR(_kbuf, _baddr, _bidxreg, _bc, _pp, _large) do { \ - _CASSERT(sizeof ((_kbuf)->buf_addr) == sizeof (mach_vm_address_t));\ + static_assert(sizeof ((_kbuf)->buf_addr) == sizeof (mach_vm_address_t));\ /* kernel variant (deconst) */ \ BUF_CTOR(_kbuf, _baddr, _bidxreg, (_large) ? PP_BUF_SIZE_LARGE(_pp) :\ PP_BUF_SIZE_DEF(_pp), 0, 0, (_kbuf)->buf_nbft_addr, \ @@ -94,7 +94,7 @@ struct __kern_buflet_ext { #define KBUF_EXT_CTOR(_kbuf, _ubuf, _baddr, _bidxreg, _bc, \ _bft_idx_reg, _pp, _large) do { \ ASSERT(_bft_idx_reg != OBJ_IDX_NONE); \ - _CASSERT(sizeof((_kbuf)->buf_flag) == sizeof(uint16_t)); \ + static_assert(sizeof((_kbuf)->buf_flag) == sizeof(uint16_t)); \ /* we don't set buf_nbft_addr here as during construction it */ \ /* is used by skmem batch alloc logic */ \ *__DECONST(uint16_t *, &(_kbuf)->buf_flag) = BUFLET_FLAG_EXTERNAL;\ @@ -180,13 +180,13 @@ struct __kern_buflet_ext { ASSERT((_skb)->buf_nbft_addr == 0); \ ASSERT((_skb)->buf_nbft_idx == OBJ_IDX_NONE); \ ASSERT(!((_dkb)->buf_flag & BUFLET_FLAG_EXTERNAL)); \ - _CASSERT(sizeof(struct __kern_buflet) == 50); \ + static_assert(sizeof(struct __kern_buflet) == 50); \ /* copy everything in the kernel buflet */ \ sk_copy64_40((uint64_t *)(void *)(_skb), (uint64_t *)(void *)(_dkb));\ ((uint64_t *)(void *)(_dkb))[5] = ((uint64_t *)(void *)(_skb))[5]; \ ((uint16_t *)(void *)(_dkb))[24] = ((uint16_t *)(void *)(_skb))[24]; \ ASSERT((_dkb)->buf_ctl == (_skb)->buf_ctl); \ - _CASSERT(sizeof((_dkb)->buf_flag) == sizeof(uint16_t)); \ + static_assert(sizeof((_dkb)->buf_flag) == sizeof(uint16_t)); \ *__DECONST(uint16_t *, &(_dkb)->buf_flag) &= ~BUFLET_FLAG_EXTERNAL;\ if (__probable((_dkb)->buf_ctl != NULL)) { \ skmem_bufctl_use(__DECONST(struct skmem_bufctl *, \ @@ -216,14 +216,14 @@ struct __kern_quantum { #define KQUM_CTOR(_kqum, _midx, _uqum, _pp, _qflags) do { \ ASSERT((uintptr_t)(_kqum) != (uintptr_t)(_uqum)); \ - _CASSERT(sizeof(METADATA_IDX(_kqum)) == sizeof(obj_idx_t)); \ + static_assert(sizeof(METADATA_IDX(_kqum)) == sizeof(obj_idx_t)); \ /* kernel variant (deconst) */ \ _KQUM_CTOR(_kqum, (PP_KERNEL_ONLY(_pp) ? 
\ QUM_F_KERNEL_ONLY : 0) | _qflags, 0, 0, OBJ_IDX_NONE, \ PP_BUF_SIZE_DEF((_pp)), _midx); \ - _CASSERT(NEXUS_META_TYPE_MAX <= UINT16_MAX); \ + static_assert(NEXUS_META_TYPE_MAX <= UINT16_MAX); \ METADATA_TYPE(_kqum) = (uint16_t)(_pp)->pp_md_type; \ - _CASSERT(NEXUS_META_SUBTYPE_MAX <= UINT16_MAX); \ + static_assert(NEXUS_META_SUBTYPE_MAX <= UINT16_MAX); \ METADATA_SUBTYPE(_kqum) = (uint16_t)(_pp)->pp_md_subtype; \ *(struct kern_pbufpool **)(uintptr_t)&(_kqum)->qum_pp = (_pp); \ *(struct __user_quantum **)(uintptr_t)&(_kqum)->qum_user = (_uqum); \ @@ -255,13 +255,13 @@ _UUID_MATCH(uuid_t u1, uuid_t u2) } #define _UUID_COPY(_dst, _src) do { \ - _CASSERT(sizeof (uuid_t) == 16); \ + static_assert(sizeof(uuid_t) == 16); \ sk_copy64_16((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \ } while (0) #define _UUID_CLEAR(_u) do { \ uint64_t *__dst = (uint64_t *)(void *)(_u); \ - _CASSERT(sizeof (uuid_t) == 16); \ + static_assert(sizeof(uuid_t) == 16); \ *(__dst++) = 0; /* qw[0] */ \ *(__dst) = 0; /* qw[1] */ \ } while (0) @@ -276,8 +276,8 @@ _UUID_MATCH(uuid_t u1, uuid_t u2) */ #define _QUM_COPY(_skq, _dkq) do { \ volatile uint16_t _sf = ((_dkq)->qum_qflags & QUM_F_SAVE_MASK); \ - _CASSERT(sizeof (_sf) == sizeof ((_dkq)->qum_qflags)); \ - _CASSERT(offsetof(struct __quantum, __q_flags) == 24); \ + static_assert(sizeof(_sf) == sizeof((_dkq)->qum_qflags)); \ + static_assert(offsetof(struct __quantum, __q_flags) == 24); \ /* copy everything above (and excluding) __q_flags */ \ sk_copy64_24((uint64_t *)(void *)&(_skq)->qum_com, \ (uint64_t *)(void *)&(_dkq)->qum_com); \ @@ -307,8 +307,8 @@ _UUID_MATCH(uuid_t u1, uuid_t u2) * after __q_flags. This macro is used only during externalize. */ #define _QUM_EXTERNALIZE(_kq, _uq) do { \ - _CASSERT(offsetof(struct __quantum, __q_flags) == 24); \ - _CASSERT(sizeof(METADATA_IDX(_uq)) == sizeof(obj_idx_t)); \ + static_assert(offsetof(struct __quantum, __q_flags) == 24); \ + static_assert(sizeof(METADATA_IDX(_uq)) == sizeof(obj_idx_t)); \ /* copy __quantum excluding qum_qflags */ \ sk_copy64_24((uint64_t *)(void *)&(_kq)->qum_com, \ (uint64_t *)(void *)&(_uq)->qum_com); \ @@ -475,8 +475,8 @@ struct __kern_packet { /* save packet flags since it might be wiped out */ \ volatile uint64_t __pflags = (_pflags); \ /* first wipe it clean */ \ - _CASSERT(sizeof(struct __packet_com) == 32); \ - _CASSERT(sizeof(struct __packet) == 32); \ + static_assert(sizeof(struct __packet_com) == 32); \ + static_assert(sizeof(struct __packet) == 32); \ sk_zero_32(&(_p)->pkt_com.__pkt_data[0]); \ /* then initialize */ \ (_p)->pkt_pflags = (__pflags); \ @@ -485,16 +485,15 @@ struct __kern_packet { #define _PKT_CTOR(_p, _pflags, _bufcnt, _maxfrags) do { \ _PKT_COM_INIT(_p, _pflags); \ - _CASSERT(sizeof ((_p)->pkt_bufs_max) == sizeof (uint16_t)); \ - _CASSERT(sizeof ((_p)->pkt_bufs_cnt) == sizeof (uint16_t)); \ + static_assert(sizeof((_p)->pkt_bufs_max) == sizeof(uint16_t)); \ + static_assert(sizeof((_p)->pkt_bufs_cnt) == sizeof(uint16_t)); \ /* deconst */ \ *(uint16_t *)(uintptr_t)&(_p)->pkt_bufs_max = (_maxfrags); \ *(uint16_t *)(uintptr_t)&(_p)->pkt_bufs_cnt = (_bufcnt); \ } while (0) #define KPKT_CLEAR_MBUF_PKT_DATA(_pk) do { \ - _CASSERT(offsetof(struct __kern_packet, pkt_mbuf) == \ - offsetof(struct __kern_packet, pkt_pkt)); \ + static_assert(offsetof(struct __kern_packet, pkt_mbuf) == offsetof(struct __kern_packet, pkt_pkt)); \ (_pk)->pkt_pflags &= ~(PKT_F_MBUF_MASK|PKT_F_PKT_MASK); \ /* the following also clears pkt_pkt */ \ (_pk)->pkt_mbuf = NULL; \ @@ -511,7 +510,7 @@ struct 
__kern_packet { } while (0) #define KPKT_CLEAR_FLOW_INIT(_fl) do { \ - _CASSERT(sizeof ((_fl)->flow_init_data) == 128); \ + static_assert(sizeof((_fl)->flow_init_data) == 128); \ sk_zero_128(&(_fl)->flow_init_data[0]); \ } while (0) @@ -568,9 +567,9 @@ struct __kern_packet { if (((_p)->pkt_pflags & PKT_F_TX_COMPL_DATA) == 0) { \ ASSERT((_p)->pkt_pflags & PKT_F_TX_COMPL_ALLOC); \ (_p)->pkt_pflags |= PKT_F_TX_COMPL_DATA; \ - _CASSERT(sizeof((_p)->pkt_tx_compl_data64) == 24); \ + static_assert(sizeof((_p)->pkt_tx_compl_data64) == 24); \ /* 32-bit compl_data should be in the union */ \ - _CASSERT(sizeof((_p)->pkt_tx_compl_data) <= 24); \ + static_assert(sizeof((_p)->pkt_tx_compl_data) <= 24); \ (_p)->pkt_tx_compl_data64[0] = 0; \ (_p)->pkt_tx_compl_data64[1] = 0; \ (_p)->pkt_tx_compl_data64[2] = 0; \ @@ -583,7 +582,7 @@ struct __kern_packet { */ #define _PKT_COPY_OPT_DATA(_skp, _dkp) do { \ if (__improbable(((_skp)->pkt_pflags & PKT_F_OPT_DATA) != 0)) { \ - _CASSERT(sizeof(struct __packet_opt) == 40); \ + static_assert(sizeof(struct __packet_opt) == 40); \ ASSERT((_skp)->pkt_pflags & PKT_F_OPT_ALLOC); \ sk_copy64_40((uint64_t *)(struct __packet_opt *__header_bidi_indexable)(_skp)->pkt_com_opt, \ (uint64_t *)(struct __packet_opt *__header_bidi_indexable)(_dkp)->pkt_com_opt); \ @@ -600,9 +599,9 @@ struct __kern_packet { * after __p_flags. */ #define _PKT_COPY(_skp, _dkp) do { \ - _CASSERT(sizeof(struct __packet) == 32); \ - _CASSERT(sizeof(struct __packet_com) == 32); \ - _CASSERT(offsetof(struct __packet, __p_flags) == 24); \ + static_assert(sizeof(struct __packet) == 32); \ + static_assert(sizeof(struct __packet_com) == 32); \ + static_assert(offsetof(struct __packet, __p_flags) == 24); \ /* copy __packet excluding pkt_pflags */ \ sk_copy64_24((uint64_t *)(struct __packet *__header_bidi_indexable)&(_skp)->pkt_com, \ (uint64_t *)(struct __packet *__header_bidi_indexable)&(_dkp)->pkt_com); \ @@ -632,9 +631,9 @@ struct __kern_packet { */ #define _PKT_INTERNALIZE(_up, _kp) do { \ volatile uint64_t _kf = ((_kp)->pkt_pflags & ~PKT_F_USER_MASK); \ - _CASSERT(sizeof(struct __packet) == 32); \ - _CASSERT(sizeof(struct __packet_com) == 32); \ - _CASSERT(offsetof(struct __packet, __p_flags) == 24); \ + static_assert(sizeof(struct __packet) == 32); \ + static_assert(sizeof(struct __packet_com) == 32); \ + static_assert(offsetof(struct __packet, __p_flags) == 24); \ /* copy __packet excluding pkt_pflags */ \ sk_copy64_24((uint64_t *)(void *)&(_up)->pkt_com, \ (uint64_t *)(void *)&(_kp)->pkt_com); \ @@ -642,7 +641,7 @@ struct __kern_packet { (_kp)->pkt_pflags = ((_up)->pkt_pflags & PKT_F_USER_MASK) | _kf;\ /* copy (internalize) __packet_opt if applicable */ \ if (__improbable(((_kp)->pkt_pflags & PKT_F_OPT_DATA) != 0)) { \ - _CASSERT(sizeof(struct __packet_opt) == 40); \ + static_assert(sizeof(struct __packet_opt) == 40); \ ASSERT((_kp)->pkt_pflags & PKT_F_OPT_ALLOC); \ sk_copy64_40((uint64_t *)(void *)&(_up)->pkt_com_opt, \ (uint64_t *)(struct __packet_opt *__header_bidi_indexable)(_kp)->pkt_com_opt); \ @@ -659,9 +658,9 @@ struct __kern_packet { * after __p_flags. This macro is used only during externalize. 
*/ #define _PKT_EXTERNALIZE(_kp, _up) do { \ - _CASSERT(sizeof(struct __packet) == 32); \ - _CASSERT(sizeof(struct __packet_com) == 32); \ - _CASSERT(offsetof(struct __packet, __p_flags) == 24); \ + static_assert(sizeof(struct __packet) == 32); \ + static_assert(sizeof(struct __packet_com) == 32); \ + static_assert(offsetof(struct __packet, __p_flags) == 24); \ /* copy __packet excluding pkt_pflags */ \ sk_copy64_24((uint64_t *)(void *)&(_kp)->pkt_com, \ (uint64_t *)(void *)&(_up)->pkt_com); \ @@ -669,20 +668,17 @@ struct __kern_packet { (_up)->pkt_pflags = ((_kp)->pkt_pflags & PKT_F_USER_MASK); \ /* copy (externalize) __packet_opt if applicable */ \ if (__improbable(((_kp)->pkt_pflags & PKT_F_OPT_DATA) != 0)) { \ - _CASSERT(sizeof(struct __packet_opt) == 40); \ + static_assert(sizeof(struct __packet_opt) == 40); \ ASSERT((_kp)->pkt_pflags & PKT_F_OPT_ALLOC); \ sk_copy64_40((uint64_t *)(struct __packet_opt *__header_bidi_indexable)(_kp)->pkt_com_opt, \ (uint64_t *)(void *)&(_up)->pkt_com_opt); \ } \ } while (0) -#define SK_PTR_ADDR_KQUM(_ph) __unsafe_forge_single(struct __kern_quantum *, \ - (SK_PTR_ADDR(_ph))) -#define SK_PTR_ADDR_KPKT(_ph) __unsafe_forge_single(struct __kern_packet *, \ - (SK_PTR_ADDR(_ph))) +#define SK_PTR_ADDR_KQUM(_ph) __unsafe_forge_single(struct __kern_quantum *, (_ph)) +#define SK_PTR_ADDR_KPKT(_ph) __unsafe_forge_single(struct __kern_packet *, (_ph)) #define SK_PTR_KPKT(_pa) ((struct __kern_packet *)(void *)(_pa)) -#define SK_PKT2PH(_pkt) \ - (SK_PTR_ENCODE((_pkt), METADATA_TYPE((_pkt)), METADATA_SUBTYPE((_pkt)))) +#define SK_PKT2PH(_pkt) ((uint64_t)(_pkt)) /* * Set the length of the data to various places: __user_slot_desc, @@ -696,43 +692,21 @@ struct __kern_packet { struct __kern_quantum *_q = \ (struct __kern_quantum *)(void *)(_md); \ _q->qum_len = (_len); \ - switch (METADATA_TYPE(_q)) { \ - case NEXUS_META_TYPE_PACKET: { \ - struct __kern_packet *_p = \ - (struct __kern_packet *)(void *)(_md); \ - struct __kern_buflet *_kbft; \ - PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \ - _kbft->buf_dlen = (_len); \ - _kbft->buf_doff = (_doff); \ - break; \ - } \ - default: \ - ASSERT(METADATA_TYPE(_q) == NEXUS_META_TYPE_QUANTUM); \ - _q->qum_buf[0].buf_dlen = (_len); \ - _q->qum_buf[0].buf_doff = (_doff); \ - break; \ - } \ + struct __kern_packet *_p = \ + (struct __kern_packet *)(void *)(_md); \ + struct __kern_buflet *_kbft; \ + PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \ + _kbft->buf_dlen = (_len); \ + _kbft->buf_doff = (_doff); \ } while (0) #define METADATA_ADJUST_LEN(_md, _len, _doff) do { \ - struct __kern_quantum *_q = \ - (struct __kern_quantum *)(void *)(_md); \ - switch (METADATA_TYPE(_q)) { \ - case NEXUS_META_TYPE_PACKET: { \ - struct __kern_packet *_p = \ - (struct __kern_packet *)(void *)(_md); \ - struct __kern_buflet *_kbft; \ - PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \ - _kbft->buf_dlen += (_len); \ - _kbft->buf_doff = (_doff); \ - break; \ - } \ - default: \ - ASSERT(METADATA_TYPE(_q) == NEXUS_META_TYPE_QUANTUM); \ - _q->qum_buf[0].buf_dlen += (_len); \ - _q->qum_buf[0].buf_doff = (_doff); \ - break; \ - } \ + struct __kern_packet *_p = \ + (struct __kern_packet *)(void *)(_md); \ + struct __kern_buflet *_kbft; \ + PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \ + _kbft->buf_dlen += (_len); \ + _kbft->buf_doff = (_doff); \ } while (0) __attribute__((always_inline)) @@ -849,8 +823,6 @@ typedef void (pkt_copy_to_mbuf_t)(const enum txrx, kern_packet_t, const boolean_t, const uint16_t); __BEGIN_DECLS -extern void 
pkt_subtype_assert_fail(const kern_packet_t, uint64_t, uint64_t); -extern void pkt_type_assert_fail(const kern_packet_t, uint64_t); extern pkt_copy_from_pkt_t pkt_copy_from_pkt; extern pkt_copy_from_pkt_t pkt_copy_multi_buflet_from_pkt; diff --git a/bsd/skywalk/packet/pbufpool.c b/bsd/skywalk/packet/pbufpool.c index 35b2df62a..40890014d 100644 --- a/bsd/skywalk/packet/pbufpool.c +++ b/bsd/skywalk/packet/pbufpool.c @@ -30,6 +30,7 @@ #include #include #include +#include static struct kern_pbufpool *pp_alloc(zalloc_flags_t); static void pp_free(struct kern_pbufpool *); @@ -112,105 +113,95 @@ static int __pp_inited = 0; int pp_init(void) { - _CASSERT(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC); - _CASSERT(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS); - _CASSERT(KPKT_SC_BK == MBUF_SC_BK); - _CASSERT(KPKT_SC_BE == MBUF_SC_BE); - _CASSERT(KPKT_SC_RD == MBUF_SC_RD); - _CASSERT(KPKT_SC_OAM == MBUF_SC_OAM); - _CASSERT(KPKT_SC_AV == MBUF_SC_AV); - _CASSERT(KPKT_SC_RV == MBUF_SC_RV); - _CASSERT(KPKT_SC_VI == MBUF_SC_VI); - _CASSERT(KPKT_SC_SIG == MBUF_SC_SIG); - _CASSERT(KPKT_SC_VO == MBUF_SC_VO); - _CASSERT(KPKT_SC_CTL == MBUF_SC_CTL); + static_assert(KPKT_SC_UNSPEC == MBUF_SC_UNSPEC); + static_assert(KPKT_SC_BK_SYS == MBUF_SC_BK_SYS); + static_assert(KPKT_SC_BK == MBUF_SC_BK); + static_assert(KPKT_SC_BE == MBUF_SC_BE); + static_assert(KPKT_SC_RD == MBUF_SC_RD); + static_assert(KPKT_SC_OAM == MBUF_SC_OAM); + static_assert(KPKT_SC_AV == MBUF_SC_AV); + static_assert(KPKT_SC_RV == MBUF_SC_RV); + static_assert(KPKT_SC_VI == MBUF_SC_VI); + static_assert(KPKT_SC_SIG == MBUF_SC_SIG); + static_assert(KPKT_SC_VO == MBUF_SC_VO); + static_assert(KPKT_SC_CTL == MBUF_SC_CTL); - _CASSERT(KPKT_SC_BK_SYS == PKT_SC_BK_SYS); - _CASSERT(KPKT_SC_BK == PKT_SC_BK); - _CASSERT(KPKT_SC_BE == PKT_SC_BE); - _CASSERT(KPKT_SC_RD == PKT_SC_RD); - _CASSERT(KPKT_SC_OAM == PKT_SC_OAM); - _CASSERT(KPKT_SC_AV == PKT_SC_AV); - _CASSERT(KPKT_SC_RV == PKT_SC_RV); - _CASSERT(KPKT_SC_VI == PKT_SC_VI); - _CASSERT(KPKT_SC_SIG == PKT_SC_SIG); - _CASSERT(KPKT_SC_VO == PKT_SC_VO); - _CASSERT(KPKT_SC_CTL == PKT_SC_CTL); - _CASSERT(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES); + static_assert(KPKT_SC_BK_SYS == PKT_SC_BK_SYS); + static_assert(KPKT_SC_BK == PKT_SC_BK); + static_assert(KPKT_SC_BE == PKT_SC_BE); + static_assert(KPKT_SC_RD == PKT_SC_RD); + static_assert(KPKT_SC_OAM == PKT_SC_OAM); + static_assert(KPKT_SC_AV == PKT_SC_AV); + static_assert(KPKT_SC_RV == PKT_SC_RV); + static_assert(KPKT_SC_VI == PKT_SC_VI); + static_assert(KPKT_SC_SIG == PKT_SC_SIG); + static_assert(KPKT_SC_VO == PKT_SC_VO); + static_assert(KPKT_SC_CTL == PKT_SC_CTL); + static_assert(KPKT_SC_MAX_CLASSES == MBUF_SC_MAX_CLASSES); - _CASSERT(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC); - _CASSERT(KPKT_TC_BE == MBUF_TC_BE); - _CASSERT(KPKT_TC_BK == MBUF_TC_BK); - _CASSERT(KPKT_TC_VI == MBUF_TC_VI); - _CASSERT(KPKT_TC_VO == MBUF_TC_VO); - _CASSERT(KPKT_TC_MAX == MBUF_TC_MAX); + static_assert(KPKT_TC_UNSPEC == MBUF_TC_UNSPEC); + static_assert(KPKT_TC_BE == MBUF_TC_BE); + static_assert(KPKT_TC_BK == MBUF_TC_BK); + static_assert(KPKT_TC_VI == MBUF_TC_VI); + static_assert(KPKT_TC_VO == MBUF_TC_VO); + static_assert(KPKT_TC_MAX == MBUF_TC_MAX); - _CASSERT(KPKT_TC_BE == PKT_TC_BE); - _CASSERT(KPKT_TC_BK == PKT_TC_BK); - _CASSERT(KPKT_TC_VI == PKT_TC_VI); - _CASSERT(KPKT_TC_VO == PKT_TC_VO); + static_assert(KPKT_TC_BE == PKT_TC_BE); + static_assert(KPKT_TC_BK == PKT_TC_BK); + static_assert(KPKT_TC_VI == PKT_TC_VI); + static_assert(KPKT_TC_VO == PKT_TC_VO); - _CASSERT(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS); - 
_CASSERT(PKT_SCVAL_BK == SCVAL_BK); - _CASSERT(PKT_SCVAL_BE == SCVAL_BE); - _CASSERT(PKT_SCVAL_RD == SCVAL_RD); - _CASSERT(PKT_SCVAL_OAM == SCVAL_OAM); - _CASSERT(PKT_SCVAL_AV == SCVAL_AV); - _CASSERT(PKT_SCVAL_RV == SCVAL_RV); - _CASSERT(PKT_SCVAL_VI == SCVAL_VI); - _CASSERT(PKT_SCVAL_VO == SCVAL_VO); - _CASSERT(PKT_SCVAL_CTL == SCVAL_CTL); + static_assert(PKT_SCVAL_BK_SYS == SCVAL_BK_SYS); + static_assert(PKT_SCVAL_BK == SCVAL_BK); + static_assert(PKT_SCVAL_BE == SCVAL_BE); + static_assert(PKT_SCVAL_RD == SCVAL_RD); + static_assert(PKT_SCVAL_OAM == SCVAL_OAM); + static_assert(PKT_SCVAL_AV == SCVAL_AV); + static_assert(PKT_SCVAL_RV == SCVAL_RV); + static_assert(PKT_SCVAL_VI == SCVAL_VI); + static_assert(PKT_SCVAL_VO == SCVAL_VO); + static_assert(PKT_SCVAL_CTL == SCVAL_CTL); /* * Assert that the value of common packet flags between mbuf and * skywalk packets match, and that they are in PKT_F_COMMON_MASK. */ - _CASSERT(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND); - _CASSERT(PKT_F_REALTIME == PKTF_SO_REALTIME); - _CASSERT(PKT_F_REXMT == PKTF_TCP_REXMT); - _CASSERT(PKT_F_LAST_PKT == PKTF_LAST_PKT); - _CASSERT(PKT_F_FLOW_ID == PKTF_FLOW_ID); - _CASSERT(PKT_F_FLOW_ADV == PKTF_FLOW_ADV); - _CASSERT(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ); - _CASSERT(PKT_F_TS_VALID == PKTF_TS_VALID); - _CASSERT(PKT_F_NEW_FLOW == PKTF_NEW_FLOW); - _CASSERT(PKT_F_START_SEQ == PKTF_START_SEQ); - _CASSERT(PKT_F_KEEPALIVE == PKTF_KEEPALIVE); - _CASSERT(PKT_F_WAKE_PKT == PKTF_WAKE_PKT); - _CASSERT(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME | - PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV | - PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW | - PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT)); + static_assert(PKT_F_BACKGROUND == PKTF_SO_BACKGROUND); + static_assert(PKT_F_REALTIME == PKTF_SO_REALTIME); + static_assert(PKT_F_REXMT == PKTF_TCP_REXMT); + static_assert(PKT_F_LAST_PKT == PKTF_LAST_PKT); + static_assert(PKT_F_FLOW_ID == PKTF_FLOW_ID); + static_assert(PKT_F_FLOW_ADV == PKTF_FLOW_ADV); + static_assert(PKT_F_TX_COMPL_TS_REQ == PKTF_TX_COMPL_TS_REQ); + static_assert(PKT_F_TS_VALID == PKTF_TS_VALID); + static_assert(PKT_F_NEW_FLOW == PKTF_NEW_FLOW); + static_assert(PKT_F_START_SEQ == PKTF_START_SEQ); + static_assert(PKT_F_KEEPALIVE == PKTF_KEEPALIVE); + static_assert(PKT_F_WAKE_PKT == PKTF_WAKE_PKT); + static_assert(PKT_F_COMMON_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_FLOW_ID | PKT_F_FLOW_ADV | PKT_F_TX_COMPL_TS_REQ | PKT_F_TS_VALID | PKT_F_NEW_FLOW | PKT_F_START_SEQ | PKT_F_KEEPALIVE | PKT_F_WAKE_PKT)); /* * Assert packet flags shared with userland. */ - _CASSERT(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME | - PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC | - PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S)); + static_assert(PKT_F_USER_MASK == (PKT_F_BACKGROUND | PKT_F_REALTIME | PKT_F_REXMT | PKT_F_LAST_PKT | PKT_F_OPT_DATA | PKT_F_PROMISC | PKT_F_TRUNCATED | PKT_F_WAKE_PKT | PKT_F_L4S | PKT_F_ULPN)); - _CASSERT(offsetof(struct __kern_quantum, qum_len) == - offsetof(struct __kern_packet, pkt_length)); + static_assert(offsetof(struct __kern_quantum, qum_len) == offsetof(struct __kern_packet, pkt_length)); /* * Due to the use of tagged pointer, we need the size of * the metadata preamble structure to be multiples of 16. * See SK_PTR_TAG() definition for details. 
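/*
 * Sketch (helper name hypothetical): the compile-time parity checks above pin
 * each PKT_F_* bit in PKT_F_COMMON_MASK to the numerically identical mbuf
 * PKTF_* bit, which is what lets the copy paths move the shared flags with a
 * single mask instead of translating them one by one.
 */
static inline void
example_copy_common_flags(const struct mbuf *m, struct __kern_packet *pkt)
{
	pkt->pkt_pflags |= (m->m_pkthdr.pkt_flags & PKT_F_COMMON_MASK);
}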
*/ - _CASSERT(sizeof(struct __metadata_preamble) != 0 && - (sizeof(struct __metadata_preamble) % 16) == 0); + static_assert(sizeof(struct __metadata_preamble) != 0 && (sizeof(struct __metadata_preamble) % 16) == 0); - _CASSERT(NX_PBUF_FRAGS_MIN == 1 && - NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT); + static_assert(NX_PBUF_FRAGS_MIN == 1 && NX_PBUF_FRAGS_MIN == NX_PBUF_FRAGS_DEFAULT); /* * Batch alloc/free requires linking the objects together; * make sure that the fields are at the same offset since * we cast the object to struct skmem_obj. */ - _CASSERT(offsetof(struct __metadata_preamble, _mdp_next) == - offsetof(struct skmem_obj, mo_next)); - _CASSERT(offsetof(struct __buflet, __buflet_next) == - offsetof(struct skmem_obj, mo_next)); + static_assert(offsetof(struct __metadata_preamble, _mdp_next) == offsetof(struct skmem_obj, mo_next)); + static_assert(offsetof(struct __buflet, __buflet_next) == offsetof(struct skmem_obj, mo_next)); SK_LOCK_ASSERT_HELD(); ASSERT(!__pp_inited); @@ -273,7 +264,7 @@ pp_free(struct kern_pbufpool *pp) pp_destroy(pp); PP_UNLOCK(pp); - SK_DF(SK_VERB_MEM, "pp 0x%llx FREE", SK_KVA(pp)); + SK_DF(SK_VERB_MEM, "pp %p FREE", SK_KVA(pp)); lck_mtx_destroy(&pp->pp_lock, &skmem_lock_grp); zfree(pp_zone, pp); } @@ -357,18 +348,7 @@ pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS], ASSERT(max_frags != 0); - switch (md_type) { - case NEXUS_META_TYPE_QUANTUM: - md_size = NX_METADATA_QUANTUM_SZ; - break; - case NEXUS_META_TYPE_PACKET: - md_size = NX_METADATA_PACKET_SZ(max_frags); - break; - default: - VERIFY(0); - /* NOTREACHED */ - __builtin_unreachable(); - } + md_size = NX_METADATA_PACKET_SZ(max_frags); switch (flags & PP_REGION_CONFIG_BUF_IODIR_BIDIR) { case PP_REGION_CONFIG_BUF_IODIR_IN: @@ -496,7 +476,6 @@ pp_regions_params_adjust(struct skmem_region_params srp_array[SKMEM_REGIONS], /* configure kernel buflet region */ if (config_buflet) { - ASSERT(md_type == NEXUS_META_TYPE_PACKET); /* * Ideally we want the number of buflets to be * "kmd_srp->srp_c_obj_cnt * (kmd_srp->srp_max_frags - 1)", @@ -553,53 +532,40 @@ pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum, ASSERT(bufcnt == 1 || PP_HAS_BUFFER_ON_DEMAND(pp)); /* construct {user,kernel} metadata */ - switch (pp->pp_md_type) { - case NEXUS_META_TYPE_PACKET: { - struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum); - struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum); - struct __packet_opt *__single opt; - struct __flow *__single flow; - struct __packet_compl *__single compl; - uint64_t pflags; + struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum); + struct __user_packet *upkt = SK_PTR_ADDR_UPKT(uqum); + struct __packet_opt *__single opt; + struct __flow *__single flow; + struct __packet_compl *__single compl; + uint64_t pflags; - if (raw) { - opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP); - flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP); - compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP); - pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC | - PKT_F_TX_COMPL_ALLOC); - } else { - ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) && - kpkt->pkt_com_opt != NULL); - opt = kpkt->pkt_com_opt; - ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) && - kpkt->pkt_flow != NULL); - flow = kpkt->pkt_flow; - ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) && - kpkt->pkt_tx_compl != NULL); - compl = kpkt->pkt_tx_compl; - pflags = kpkt->pkt_pflags; - } - /* will be adjusted below as part of allocating buffer(s) */ - _CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t)); - 
_CASSERT(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t)); - pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt); - pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max); + if (raw) { + opt = skmem_cache_alloc(pp_opt_cache, SKMEM_SLEEP); + flow = skmem_cache_alloc(pp_flow_cache, SKMEM_SLEEP); + compl = skmem_cache_alloc(pp_compl_cache, SKMEM_SLEEP); + pflags = (PKT_F_OPT_ALLOC | PKT_F_FLOW_ALLOC | + PKT_F_TX_COMPL_ALLOC); + } else { + ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) && + kpkt->pkt_com_opt != NULL); + opt = kpkt->pkt_com_opt; + ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) && + kpkt->pkt_flow != NULL); + flow = kpkt->pkt_flow; + ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) && + kpkt->pkt_tx_compl != NULL); + compl = kpkt->pkt_tx_compl; + pflags = kpkt->pkt_pflags; + } + /* will be adjusted below as part of allocating buffer(s) */ + static_assert(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t)); + static_assert(sizeof(kpkt->pkt_bufs_max) == sizeof(uint16_t)); + pbufs_cnt = __DECONST(uint16_t *, &kpkt->pkt_bufs_cnt); + pbufs_max = __DECONST(uint16_t *, &kpkt->pkt_bufs_max); - /* kernel (and user) packet */ - KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx, - upkt, pp, 0, pp->pp_max_frags, 0); - break; - } - default: - ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM); - VERIFY(bufcnt == 1); - /* TODO: point these to quantum's once they're defined */ - pbufs_cnt = pbufs_max = NULL; - /* kernel quantum */ - KQUM_CTOR(kqum, midx, uqum, pp, 0); - break; - } + /* kernel (and user) packet */ + KPKT_CTOR(kpkt, pflags, opt, flow, compl, midx, + upkt, pp, 0, pp->pp_max_frags, 0); kbuf = kqum->qum_buf; for (i = 0; i < bufcnt; i++) { @@ -631,7 +597,7 @@ pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum, kbuf = (kern_buflet_t)*blist; if (__improbable(kbuf == NULL)) { SK_DF(SK_VERB_MEM, "failed to get buflet," - " pp 0x%llx", SK_KVA(pp)); + " pp %p", SK_KVA(pp)); goto fail; } @@ -653,7 +619,7 @@ pp_metadata_construct(struct __kern_quantum *kqum, struct __user_quantum *uqum, ASSERT(!PP_KERNEL_ONLY(pp) || (kqum->qum_qflags & QUM_F_KERNEL_ONLY)); ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE); - SK_DF(SK_VERB_MEM, "pp 0x%llx pkt 0x%llx bufcnt %d buf 0x%llx", + SK_DF(SK_VERB_MEM, "pp %p pkt %p bufcnt %d buf %p", SK_KVA(pp), SK_KVA(kqum), bufcnt, SK_KVA(baddr)); return 0; @@ -781,47 +747,24 @@ pp_metadata_destruct_common(struct __kern_quantum *kqum, ASSERT(blist_def != NULL); ASSERT(blist_large != NULL); - switch (pp->pp_md_type) { - case NEXUS_META_TYPE_PACKET: { - struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum); + struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum); - ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp)); - ASSERT(kpkt->pkt_qum.qum_pp == pp); - ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type); - ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype); - ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE); - ASSERT(kpkt->pkt_qum.qum_ksd == NULL); - ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max); - ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags); - _CASSERT(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t)); - bufcnt = kpkt->pkt_bufs_cnt; - kbuf = &kqum->qum_buf[0]; - /* - * special handling for empty first buflet. 
- */ - first_buflet_empty = (kbuf->buf_addr == 0); - *__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0; - break; - } - default: - ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM); - ASSERT(kqum->qum_user != NULL || PP_KERNEL_ONLY(pp)); - ASSERT(kqum->qum_pp == pp); - ASSERT(METADATA_TYPE(kqum) == pp->pp_md_type); - ASSERT(METADATA_SUBTYPE(kqum) == pp->pp_md_subtype); - ASSERT(METADATA_IDX(kqum) != OBJ_IDX_NONE); - ASSERT(kqum->qum_ksd == NULL); - kbuf = &kqum->qum_buf[0]; - /* - * XXX: Special handling for quantum as we don't currently - * define bufs_{cnt,max} there. Given that we support at - * most only 1 buflet for now, check if buf_addr is non-NULL. - * See related code in pp_metadata_construct(). - */ - first_buflet_empty = (kbuf->buf_addr == 0); - bufcnt = first_buflet_empty ? 0 : 1; - break; - } + ASSERT(kpkt->pkt_user != NULL || PP_KERNEL_ONLY(pp)); + ASSERT(kpkt->pkt_qum.qum_pp == pp); + ASSERT(METADATA_TYPE(kpkt) == pp->pp_md_type); + ASSERT(METADATA_SUBTYPE(kpkt) == pp->pp_md_subtype); + ASSERT(METADATA_IDX(kpkt) != OBJ_IDX_NONE); + ASSERT(kpkt->pkt_qum.qum_ksd == NULL); + ASSERT(kpkt->pkt_bufs_cnt <= kpkt->pkt_bufs_max); + ASSERT(kpkt->pkt_bufs_max == pp->pp_max_frags); + static_assert(sizeof(kpkt->pkt_bufs_cnt) == sizeof(uint16_t)); + bufcnt = kpkt->pkt_bufs_cnt; + kbuf = &kqum->qum_buf[0]; + /* + * special handling for empty first buflet. + */ + first_buflet_empty = (kbuf->buf_addr == 0); + *__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = 0; /* * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t which is @@ -893,41 +836,30 @@ pp_metadata_destruct_common(struct __kern_quantum *kqum, /* if we're about to return this object to the slab, clean it up */ if (raw) { - switch (pp->pp_md_type) { - case NEXUS_META_TYPE_PACKET: { - struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum); - - ASSERT(kpkt->pkt_com_opt != NULL || - !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC)); - if (kpkt->pkt_com_opt != NULL) { - ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC); - skmem_cache_free(pp_opt_cache, - kpkt->pkt_com_opt); - kpkt->pkt_com_opt = NULL; - } - ASSERT(kpkt->pkt_flow != NULL || - !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC)); - if (kpkt->pkt_flow != NULL) { - ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC); - skmem_cache_free(pp_flow_cache, kpkt->pkt_flow); - kpkt->pkt_flow = NULL; - } - ASSERT(kpkt->pkt_tx_compl != NULL || - !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC)); - if (kpkt->pkt_tx_compl != NULL) { - ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC); - skmem_cache_free(pp_compl_cache, - kpkt->pkt_tx_compl); - kpkt->pkt_tx_compl = NULL; - } - kpkt->pkt_pflags = 0; - break; + ASSERT(kpkt->pkt_com_opt != NULL || + !(kpkt->pkt_pflags & PKT_F_OPT_ALLOC)); + if (kpkt->pkt_com_opt != NULL) { + ASSERT(kpkt->pkt_pflags & PKT_F_OPT_ALLOC); + skmem_cache_free(pp_opt_cache, + kpkt->pkt_com_opt); + kpkt->pkt_com_opt = NULL; } - default: - ASSERT(METADATA_TYPE(kqum) == NEXUS_META_TYPE_QUANTUM); - /* nothing to do for quantum (yet) */ - break; + ASSERT(kpkt->pkt_flow != NULL || + !(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC)); + if (kpkt->pkt_flow != NULL) { + ASSERT(kpkt->pkt_pflags & PKT_F_FLOW_ALLOC); + skmem_cache_free(pp_flow_cache, kpkt->pkt_flow); + kpkt->pkt_flow = NULL; } + ASSERT(kpkt->pkt_tx_compl != NULL || + !(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC)); + if (kpkt->pkt_tx_compl != NULL) { + ASSERT(kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC); + skmem_cache_free(pp_compl_cache, + kpkt->pkt_tx_compl); + kpkt->pkt_tx_compl = NULL; + } + kpkt->pkt_pflags = 0; } } @@ -1068,7 +1000,7 @@ pp_buflet_metadata_dtor(void *addr, 
void *arg) ASSERT(kbft->buf_ctl != NULL); KBUF_DTOR(kbft, usecnt); - SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", SK_KVA(pp), + SK_DF(SK_VERB_MEM, "pp %p buf %p usecnt %u", SK_KVA(pp), SK_KVA(objaddr), usecnt); if (__probable(usecnt == 0)) { skmem_cache_free(large ? PP_BUF_CACHE_LARGE(pp) : @@ -1175,16 +1107,10 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], ASSERT(def_buf_obj_size != 0); ASSERT(md_type > NEXUS_META_TYPE_INVALID && md_type <= NEXUS_META_TYPE_MAX); - if (md_type == NEXUS_META_TYPE_QUANTUM) { - ASSERT(max_frags == 1); - ASSERT(md_size >= - (METADATA_PREAMBLE_SZ + NX_METADATA_QUANTUM_SZ)); - } else { - ASSERT(max_frags >= 1); - ASSERT(md_type == NEXUS_META_TYPE_PACKET); - ASSERT(md_size >= (METADATA_PREAMBLE_SZ + - NX_METADATA_PACKET_SZ(max_frags))); - } + ASSERT(max_frags >= 1); + ASSERT(md_type == NEXUS_META_TYPE_PACKET); + ASSERT(md_size >= (METADATA_PREAMBLE_SZ + + NX_METADATA_PACKET_SZ(max_frags))); ASSERT(md_subtype > NEXUS_META_SUBTYPE_INVALID && md_subtype <= NEXUS_META_SUBTYPE_MAX); #endif /* DEBUG || DEVELOPMENT */ @@ -1243,14 +1169,14 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], if (umd_srp != NULL && (pp->pp_umd_region = skmem_region_create(name, umd_srp, NULL, NULL, NULL)) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create %s region", + SK_ERR("\"%s\" (%p) failed to create %s region", pp->pp_name, SK_KVA(pp), umd_srp->srp_name); goto failed; } if ((pp->pp_kmd_region = skmem_region_create(name, kmd_srp, NULL, NULL, NULL)) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create %s region", + SK_ERR("\"%s\" (%p) failed to create %s region", pp->pp_name, SK_KVA(pp), kmd_srp->srp_name); goto failed; } @@ -1276,7 +1202,7 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], if (PP_HAS_BUFFER_ON_DEMAND(pp) && !PP_KERNEL_ONLY(pp)) { if ((pp->pp_ubft_region = skmem_region_create(name, ubft_srp, NULL, NULL, NULL)) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create %s region", + SK_ERR("\"%s\" (%p) failed to create %s region", pp->pp_name, SK_KVA(pp), ubft_srp->srp_name); goto failed; } @@ -1285,7 +1211,7 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], if (PP_HAS_BUFFER_ON_DEMAND(pp)) { if ((pp->pp_kbft_region = skmem_region_create(name, kbft_srp, NULL, NULL, NULL)) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create %s region", + SK_ERR("\"%s\" (%p) failed to create %s region", pp->pp_name, SK_KVA(pp), kbft_srp->srp_name); goto failed; } @@ -1314,7 +1240,7 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], } if (pp->pp_kmd_cache == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache", + SK_ERR("\"%s\" (%p) failed to create \"%s\" cache", pp->pp_name, SK_KVA(pp), cname); goto failed; } @@ -1331,7 +1257,7 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], md_cflags); if (PP_KBFT_CACHE_DEF(pp) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache", + SK_ERR("\"%s\" (%p) failed to create \"%s\" cache", pp->pp_name, SK_KVA(pp), cname); goto failed; } @@ -1348,7 +1274,7 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], NULL, pp, pp->pp_kbft_region, md_cflags); if (PP_KBFT_CACHE_LARGE(pp) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to " + SK_ERR("\"%s\" (%p) failed to " "create \"%s\" cache", pp->pp_name, SK_KVA(pp), cname); goto failed; @@ -1358,7 +1284,7 @@ pp_create(const char *name, struct 
skmem_region_params srp_array[SKMEM_REGIONS], if ((PP_BUF_REGION_DEF(pp) = skmem_region_create(name, buf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp)) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create %s region", + SK_ERR("\"%s\" (%p) failed to create %s region", pp->pp_name, SK_KVA(pp), buf_srp->srp_name); goto failed; } @@ -1367,7 +1293,7 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], PP_BUF_REGION_LARGE(pp) = skmem_region_create(name, lbuf_srp, pp_buf_seg_ctor, pp_buf_seg_dtor, pp); if (PP_BUF_REGION_LARGE(pp) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create %s region", + SK_ERR("\"%s\" (%p) failed to create %s region", pp->pp_name, SK_KVA(pp), lbuf_srp->srp_name); goto failed; } @@ -1383,7 +1309,7 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], def_buf_obj_size, 0, NULL, NULL, NULL, pp, PP_BUF_REGION_DEF(pp), buf_def_cflags)) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache", + SK_ERR("\"%s\" (%p) failed to create \"%s\" cache", pp->pp_name, SK_KVA(pp), cname); goto failed; } @@ -1393,7 +1319,7 @@ pp_create(const char *name, struct skmem_region_params srp_array[SKMEM_REGIONS], if ((PP_BUF_CACHE_LARGE(pp) = skmem_cache_create(cache_name, lbuf_srp->srp_c_obj_size, 0, NULL, NULL, NULL, pp, PP_BUF_REGION_LARGE(pp), SKMEM_CR_NOMAGAZINES)) == NULL) { - SK_ERR("\"%s\" (0x%llx) failed to create \"%s\" cache", + SK_ERR("\"%s\" (%p) failed to create \"%s\" cache", pp->pp_name, SK_KVA(pp), cname); goto failed; } @@ -1985,60 +1911,49 @@ pp_metadata_init(struct __metadata_preamble *mdp, struct kern_pbufpool *pp, } /* (re)construct {user,kernel} metadata */ - switch (pp->pp_md_type) { - case NEXUS_META_TYPE_PACKET: { - struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum); - struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf; - uint16_t i; + struct __kern_packet *kpkt = SK_PTR_ADDR_KPKT(kqum); + struct __kern_buflet *kbuf = &kpkt->pkt_qum_buf; + uint16_t i; - /* sanitize flags */ - kpkt->pkt_pflags &= PKT_F_INIT_MASK; + /* sanitize flags */ + kpkt->pkt_pflags &= PKT_F_INIT_MASK; - ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) && - kpkt->pkt_com_opt != NULL); - ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) && - kpkt->pkt_flow != NULL); - ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) && - kpkt->pkt_tx_compl != NULL); + ASSERT((kpkt->pkt_pflags & PKT_F_OPT_ALLOC) && + kpkt->pkt_com_opt != NULL); + ASSERT((kpkt->pkt_pflags & PKT_F_FLOW_ALLOC) && + kpkt->pkt_flow != NULL); + ASSERT((kpkt->pkt_pflags & PKT_F_TX_COMPL_ALLOC) && + kpkt->pkt_tx_compl != NULL); + /* + * XXX: For now we always set PKT_F_FLOW_DATA; + * this is a no-op but done for consistency + * with the other PKT_F_*_DATA flags. + */ + kpkt->pkt_pflags |= PKT_F_FLOW_DATA; + + /* initialize kernel packet */ + KPKT_INIT(kpkt, QUM_F_INTERNALIZED); + + ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp)); + if (PP_HAS_BUFFER_ON_DEMAND(pp)) { + ASSERT(kbuf->buf_ctl == NULL); + ASSERT(kbuf->buf_addr == 0); /* - * XXX: For now we always set PKT_F_FLOW_DATA; - * this is a no-op but done for consistency - * with the other PKT_F_*_DATA flags. + * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t + * which is unsafe, so we just forge it here. 
*/ - kpkt->pkt_pflags |= PKT_F_FLOW_DATA; - - /* initialize kernel packet */ - KPKT_INIT(kpkt, QUM_F_INTERNALIZED); - - ASSERT(bufcnt || PP_HAS_BUFFER_ON_DEMAND(pp)); - if (PP_HAS_BUFFER_ON_DEMAND(pp)) { - ASSERT(kbuf->buf_ctl == NULL); - ASSERT(kbuf->buf_addr == 0); - /* - * -fbounds-safety: buf_nbft_addr is a mach_vm_address_t - * which is unsafe, so we just forge it here. - */ - kbuf = __unsafe_forge_single(struct __kern_buflet *, - __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr)); - } - /* initialize kernel buflet */ - for (i = 0; i < bufcnt; i++) { - ASSERT(kbuf != NULL); - KBUF_INIT(kbuf); - kbuf = __unsafe_forge_single(struct __kern_buflet *, - __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr)); - } - ASSERT((kbuf == NULL) || (bufcnt == 0)); - break; + kbuf = __unsafe_forge_single(struct __kern_buflet *, + __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr)); } - default: - ASSERT(pp->pp_md_type == NEXUS_META_TYPE_QUANTUM); - /* kernel quantum */ - KQUM_INIT(kqum, QUM_F_INTERNALIZED); - KBUF_INIT(&kqum->qum_buf[0]); - break; + /* initialize kernel buflet */ + for (i = 0; i < bufcnt; i++) { + ASSERT(kbuf != NULL); + KBUF_INIT(kbuf); + kbuf = __unsafe_forge_single(struct __kern_buflet *, + __DECONST(struct __kern_buflet *, kbuf->buf_nbft_addr)); } + ASSERT((kbuf == NULL) || (bufcnt == 0)); return kqum; } @@ -2274,52 +2189,43 @@ pp_metadata_fini(struct __kern_quantum *kqum, struct kern_pbufpool *pp, struct skmem_obj **blist_nocahce_large) { struct __metadata_preamble *mdp = METADATA_PREAMBLE(kqum); - ASSERT(SK_PTR_TAG(kqum) == 0); + struct __kern_packet *kpkt = SK_PTR_KPKT(kqum); - switch (pp->pp_md_type) { - case NEXUS_META_TYPE_PACKET: { - struct __kern_packet *kpkt = SK_PTR_KPKT(kqum); - - if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) { - __packet_perform_tx_completion_callbacks( - SK_PKT2PH(kpkt), NULL); - } - if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) { - ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0); - ASSERT(kpkt->pkt_mbuf != NULL); - ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL); - if (mp != NULL) { - ASSERT(*mp == NULL); - *mp = kpkt->pkt_mbuf; - } else { - m_freem(kpkt->pkt_mbuf); - } - KPKT_CLEAR_MBUF_DATA(kpkt); - } else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) { - ASSERT(kpkt->pkt_pkt != NULL); - ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL); - if (kpp != NULL) { - ASSERT(*kpp == NULL); - *kpp = kpkt->pkt_pkt; - } else { - /* can only recurse once */ - ASSERT((kpkt->pkt_pkt->pkt_pflags & - PKT_F_PKT_DATA) == 0); - pp_free_packet_single(kpkt->pkt_pkt); - } - KPKT_CLEAR_PKT_DATA(kpkt); - } - kpkt->pkt_pflags &= ~PKT_F_TRUNCATED; - ASSERT(kpkt->pkt_nextpkt == NULL); - ASSERT(kpkt->pkt_qum.qum_ksd == NULL); - ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0); - ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0); - break; + if ((kpkt->pkt_pflags & PKT_F_TX_COMPL_TS_REQ) != 0) { + __packet_perform_tx_completion_callbacks( + SK_PKT2PH(kpkt), NULL); } - default: - break; + if ((kpkt->pkt_pflags & PKT_F_MBUF_DATA) != 0) { + ASSERT((kpkt->pkt_pflags & PKT_F_PKT_DATA) == 0); + ASSERT(kpkt->pkt_mbuf != NULL); + ASSERT(kpkt->pkt_mbuf->m_nextpkt == NULL); + if (mp != NULL) { + ASSERT(*mp == NULL); + *mp = kpkt->pkt_mbuf; + } else { + m_freem(kpkt->pkt_mbuf); + } + KPKT_CLEAR_MBUF_DATA(kpkt); + } else if ((kpkt->pkt_pflags & PKT_F_PKT_DATA) != 0) { + ASSERT(kpkt->pkt_pkt != NULL); + ASSERT(kpkt->pkt_pkt->pkt_nextpkt == NULL); + if (kpp != NULL) { + ASSERT(*kpp == NULL); + *kpp = kpkt->pkt_pkt; + } else { + /* can only recurse once */ + 
ASSERT((kpkt->pkt_pkt->pkt_pflags & + PKT_F_PKT_DATA) == 0); + pp_free_packet_single(kpkt->pkt_pkt); + } + KPKT_CLEAR_PKT_DATA(kpkt); } + kpkt->pkt_pflags &= ~PKT_F_TRUNCATED; + ASSERT(kpkt->pkt_nextpkt == NULL); + ASSERT(kpkt->pkt_qum.qum_ksd == NULL); + ASSERT((kpkt->pkt_pflags & PKT_F_MBUF_MASK) == 0); + ASSERT((kpkt->pkt_pflags & PKT_F_PKT_MASK) == 0); if (__improbable(PP_HAS_BUFFER_ON_DEMAND(pp))) { pp_metadata_destruct_common(kqum, pp, FALSE, blist_def, blist_nocache_def, @@ -2575,7 +2481,7 @@ pp_alloc_buffer_common(const kern_pbufpool_t pp, struct skmem_obj_info *oi, #endif /* (DEVELOPMENT || DEBUG) */ if (__improbable(baddr == 0)) { - SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp 0x%llx", + SK_DF(SK_VERB_MEM, "failed to alloc buffer, pp %p", SK_KVA(pp)); return 0; } @@ -2732,7 +2638,7 @@ pp_free_buflet_common(const kern_pbufpool_t pp, kern_buflet_t kbft) ASSERT(kbft->buf_idx != OBJ_IDX_NONE); ASSERT(kbft->buf_ctl != NULL); KBUF_DTOR(kbft, usecnt); - SK_DF(SK_VERB_MEM, "pp 0x%llx buf 0x%llx usecnt %u", + SK_DF(SK_VERB_MEM, "pp %p buf %p usecnt %u", SK_KVA(pp), SK_KVA(objaddr), usecnt); if (__probable(usecnt == 0)) { skmem_cache_free(BUFLET_HAS_LARGE_BUF(kbft) ? diff --git a/bsd/skywalk/packet/pbufpool_kern.c b/bsd/skywalk/packet/pbufpool_kern.c index b132ff0f1..8462662b9 100644 --- a/bsd/skywalk/packet/pbufpool_kern.c +++ b/bsd/skywalk/packet/pbufpool_kern.c @@ -47,7 +47,6 @@ kern_pbufpool_create(const struct kern_pbufpool_init *init, struct skmem_region_params srp[SKMEM_REGIONS]; struct kern_pbufpool *pp = NULL; nexus_meta_type_t md_type; - nexus_meta_subtype_t md_subtype; uint32_t buf_cnt; uint16_t max_frags; uint32_t ppcreatef = PPCREATEF_EXTERNAL; @@ -84,8 +83,6 @@ kern_pbufpool_create(const struct kern_pbufpool_init *init, * XXX: adi@apple.com - to allow for "direct" channels from * user process to driver, we will need to revisit this. */ - md_subtype = ((md_type == NEXUS_META_TYPE_QUANTUM) ? 
- NEXUS_META_SUBTYPE_PAYLOAD : NEXUS_META_SUBTYPE_RAW); kernel_only = (md_type == NEXUS_META_TYPE_PACKET) && #if (DEVELOPMENT || DEBUG) !skywalk_netif_direct_enabled() && @@ -190,8 +187,8 @@ kern_pbufpool_create(const struct kern_pbufpool_init *init, } /* adjust region params */ - pp_regions_params_adjust(srp, md_type, md_subtype, pkt_cnt, max_frags, - init->kbi_bufsize, 0, buf_cnt, init->kbi_buf_seg_size, + pp_regions_params_adjust(srp, NEXUS_META_TYPE_PACKET, NEXUS_META_SUBTYPE_RAW, + pkt_cnt, max_frags, init->kbi_bufsize, 0, buf_cnt, init->kbi_buf_seg_size, pp_region_flags); /* diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index 2df7fda67..e7a2f3769 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -15,7 +15,8 @@ EXPINC_SUBDIRS = \ # Files that are public on macOS, but private on embedded EMBEDDED_PRIVATE_DATAFILES = \ disk.h dtrace.h dtrace_glue.h dtrace_impl.h fasttrap.h fasttrap_isa.h ioctl_compat.h kdebug.h \ - kern_control.h kernel_types.h proc_info.h protosw.h reboot.h ttychars.h ttydev.h ubc.h vnode.h + kern_control.h kern_event.h kernel_types.h proc_info.h protosw.h reboot.h sys_domain.h ttychars.h \ + ttydev.h ubc.h vnode.h ifeq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS) $(SUPPORTED_SIMULATOR_PLATFORMS),$(PLATFORM)),) EXTRA_DATAFILES = $(EMBEDDED_PRIVATE_DATAFILES) @@ -36,19 +37,19 @@ DATAFILES = $(sort \ errno.h ev.h event.h fcntl.h file.h filedesc.h \ fileport.h filio.h fsgetpath.h gmon.h \ ioccom.h ioctl.h \ - ipc.h kernel.h kern_event.h lctx.h loadable_fs.h lock.h lockf.h \ + ipc.h kernel.h lctx.h loadable_fs.h lock.h lockf.h \ kauth.h kdebug_signpost.h lockstat.h malloc.h \ mbuf.h mman.h mount.h msg.h msgbuf.h netport.h param.h paths.h pipe.h poll.h \ proc.h ptrace.h queue.h quota.h resource.h resourcevar.h \ sbuf.h posix_sem.h posix_shm.h random.h sdt.h\ select.h sem.h semaphore.h shm.h signal.h signalvar.h snapshot.h socket.h socketvar.h sockio.h stat.h stdio.h \ - sysctl.h syslimits.h syslog.h sys_domain.h termios.h time.h \ + sysctl.h syslimits.h syslog.h termios.h time.h \ timeb.h times.h trace.h tty.h ttycom.h \ ttydefaults.h types.h ucontext.h ucred.h uio.h un.h unistd.h unpcb.h \ user.h utfconv.h utsname.h vadvise.h vcmd.h \ vm.h vmmeter.h vmparam.h vnode_if.h vsock.h vstat.h wait.h xattr.h \ _select.h _structs.h _types.h _endian.h __endian.h domain.h \ - spawn.h timex.h commpage.h log_data.h \ + spawn.h timex.h commpage.h log_data.h endian.h \ $(EXTRA_DATAFILES)) # Installs header file for DriverKit drivers - @@ -74,13 +75,13 @@ PRIVATE_DATAFILES = $(sort \ csr.h \ decmpfs.h \ dirent_private.h \ - disk.h \ + disk_private.h \ disklabel.h \ domain.h \ event.h \ event_log.h \ event_private.h \ - fcntl.h \ + fcntl_private.h \ fileport.h \ fsctl.h \ fsevents.h \ @@ -94,13 +95,15 @@ PRIVATE_DATAFILES = $(sort \ kdebug.h \ kdebug_private.h \ kdebug_triage.h \ + kern_control_private.h \ kern_debug.h \ - kern_event.h \ + kern_event_private.h \ kern_memorystatus.h \ kern_memorystatus_freeze.h \ kern_overrides.h \ kern_sysctl.h \ mbuf.h \ + mem_acct_private.h \ mman.h \ monotonic.h \ persona.h \ @@ -124,13 +127,14 @@ PRIVATE_DATAFILES = $(sort \ spawn.h \ spawn_internal.h \ stackshot.h \ - sys_domain.h \ + sys_domain_private.h \ tree.h \ ulock.h \ unpcb.h \ ux_exception.h \ variant_internal.h \ vsock.h \ + vsock_private.h \ work_interval.h \ process_policy.h \ proc_uuid_policy.h \ @@ -184,10 +188,12 @@ PRIVATE_KERNELFILES = \ decmpfs.h \ dirent_private.h \ disktab.h \ + disk_private.h \ eventhandler.h \ event_log.h \ event_private.h \ fbt.h \ + fcntl_private.h \ 
fileport.h \ fsctl.h \ fslog.h \ @@ -196,6 +202,8 @@ PRIVATE_KERNELFILES = \ kpi_private.h \ ktrace.h \ kdebug_triage.h \ + kern_control_private.h \ + kern_event_private.h \ linker_set.h \ mach_swapon.h \ monotonic.h \ @@ -215,6 +223,7 @@ PRIVATE_KERNELFILES = \ socket_private.h \ sockio_private.h \ stackshot.h \ + sys_domain_private.h \ timeb.h times.h \ tprintf.h \ tty.h ttychars.h \ @@ -223,6 +232,7 @@ PRIVATE_KERNELFILES = \ variant_internal.h \ vfs_context.h \ vmmeter.h \ + vsock_private.h \ reason.h \ spawn_internal.h \ priv.h \ @@ -282,9 +292,10 @@ EXPORT_MI_DIR = sys # /usr/local/include INSTALL_MI_LCL_LIST = $(sort \ attr_private.h coalition_private.h code_signing.h codesign.h content_protection.h csr.h decmpfs.h dirent_private.h \ - event_log.h event_private.h fsevents.h fsgetpath_private.h guarded.h kdebug_private.h \ - kern_memorystatus.h preoslog.h proc_info_private.h reason.h resource_private.h \ - socket_private.h sockio_private.h stackshot.h work_interval.h ${EXTRA_PRIVATE_DATAFILES}) + disk_private.h event_log.h event_private.h fcntl_private.h fsevents.h fsgetpath_private.h guarded.h kas_info.h \ + kdebug_private.h kern_control_private.h kern_event_private.h kern_memorystatus.h mem_acct_private.h preoslog.h \ + proc_info_private.h reason.h resource_private.h socket_private.h sockio_private.h stackshot.h sys_domain_private.h \ + vsock_private.h work_interval.h ${EXTRA_PRIVATE_DATAFILES}) # /System/Library/Frameworks/System.framework/PrivateHeaders INSTALL_SF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} diff --git a/bsd/sys/_types/_graftdmg_un.h b/bsd/sys/_types/_graftdmg_un.h index 366336bea..5b3b02b66 100644 --- a/bsd/sys/_types/_graftdmg_un.h +++ b/bsd/sys/_types/_graftdmg_un.h @@ -43,6 +43,9 @@ #define SBC_STRICT_AUTH 0x0010 /* Strict authentication mode */ #define SBC_PRESERVE_GRAFT 0x0020 /* Preserve graft itself until unmount */ +/* Flag values for ungraftdmg */ +#define UNGRAFTDMG_NOFORCE 0x0000000000000002ULL /* Disallow ungraft if a non-dir vnode inside the graft is in use */ + typedef struct secure_boot_cryptex_args { u_int32_t sbc_version; u_int32_t sbc_4cc; diff --git a/bsd/sys/attr.h b/bsd/sys/attr.h index 3057f4afe..11fd4909c 100644 --- a/bsd/sys/attr.h +++ b/bsd/sys/attr.h @@ -53,9 +53,10 @@ /* Additional FSOPT values in attr_private.h */ #endif -#define FSOPT_ATTR_CMN_EXTENDED 0x00000020 +#define FSOPT_ATTR_CMN_EXTENDED 0x00000020 #define FSOPT_RETURN_REALDEV 0x00000200 #define FSOPT_NOFOLLOW_ANY 0x00000800 +#define FSOPT_RESOLVE_BENEATH 0x00001000 /* we currently aren't anywhere near this amount for a valid * fssearchblock.sizeofsearchparams1 or fssearchblock.sizeofsearchparams2 @@ -372,6 +373,9 @@ typedef struct vol_capabilities_attr { * * VOL_CAP_INT_PUNCHHOLE: When set, the volume supports the F_PUNCHHOLE * fcntl. + * + * VOL_CAP_INT_BARRIERFSYNC: When set, the volume supports the F_BARRIERFSYNC + * fcntl. 
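/*
 * Userland sketch, not part of the patch: probing the VOL_CAP_INT_BARRIERFSYNC
 * capability bit introduced just below before relying on the F_BARRIERFSYNC
 * fcntl, with a fallback to a full fsync().  The struct layout follows the
 * getattrlist(2) volume-capabilities convention; error handling is abbreviated.
 */
#include <sys/attr.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int
example_barrier_or_fsync(const char *volpath, int fd)
{
	struct attrlist al;
	struct {
		u_int32_t length;
		vol_capabilities_attr_t caps;
	} __attribute__((aligned(4), packed)) ab;

	memset(&al, 0, sizeof(al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.volattr = ATTR_VOL_INFO | ATTR_VOL_CAPABILITIES;

	if (getattrlist(volpath, &al, &ab, sizeof(ab), 0) == 0 &&
	    (ab.caps.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_BARRIERFSYNC) &&
	    (ab.caps.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_BARRIERFSYNC)) {
		return fcntl(fd, F_BARRIERFSYNC);       /* barrier only */
	}
	return fsync(fd);                               /* full durability */
}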
*/ #define VOL_CAP_INT_SEARCHFS 0x00000001 #define VOL_CAP_INT_ATTRLIST 0x00000002 @@ -399,6 +403,7 @@ typedef struct vol_capabilities_attr { #define VOL_CAP_INT_RENAME_SECLUDE 0x00200000 #define VOL_CAP_INT_ATTRIBUTION_TAG 0x00400000 #define VOL_CAP_INT_PUNCHHOLE 0x00800000 +#define VOL_CAP_INT_BARRIERFSYNC 0x01000000 typedef struct vol_attributes_attr { attribute_set_t validattr; diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index 9c866f3d2..7bb3f6efa 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -1078,6 +1078,33 @@ void buf_markstatic(buf_t bp); */ int buf_static(buf_t bp); +__options_decl(vnode_verify_kind_t, uint32_t, { + VK_HASH_NONE = 0x00, + VK_HASH_SHA3_256 = 0x01, + VK_HASH_SHA3_384 = 0x02, + VK_HASH_SHA3_512 = 0x03, +}); + +#define NUM_VERIFY_KIND 4 + +/*! + * @function buf_verify_enable + * @abstract Set up buf to retrieve hashes alongwith data. + * @param bp buf pointer. + * @param verify_kind specific algorithm to be used for the hash calculation. + * @return 0 if successful, error otherwise. + */ +errno_t buf_verify_enable(buf_t bp, vnode_verify_kind_t verify_kind); + +/*! + * @function buf_verifyptr + * @abstract Gets pointer to the buffer to store the hash calculated for the data. + * @param bp buf pointer. + * @param len pointer to uint32_t variable to store the length. + * @return Pointer to a buffer (of length passed in second argument), NULL if there is no hash needed. + */ +uint8_t * buf_verifyptr(buf_t bp, uint32_t *len); + /*! * @function bufattr_markiosched * @abstract Mark a buffer as belonging to an io scheduled mount point @@ -1207,6 +1234,30 @@ int bufattr_throttled(bufattr_t bap); */ int bufattr_willverify(bufattr_t bap); +/*! + * @function bufattr_verifykind + * @abstract Get type of hash requested. + * @param bap Buffer attribute to test. + * @return Values from the vnode_verify_kind_t enum. + */ +vnode_verify_kind_t bufattr_verifykind(bufattr_t bap); + +/*! + * @function bufattr_verifyptr + * @abstract Gets pointer to the buffer to store the hash calculated for the data. + * @param bap Buffer attribute to get pointer for. + * @param len pointer to uint32_t variable to store the length. + * @return Pointer to a buffer (of length passed in second argument), NULL if there is no hash needed. + */ +uint8_t * bufattr_verifyptr(bufattr_t bap, uint32_t *len); + +/*! + * @function bufattr_setverifyvalid + * @abstract Set the values stored in verify buffer as valid + * @param bap Buffer attribute to set valid. + */ +void bufattr_setverifyvalid(bufattr_t bap); + /*! * @function bufattr_passive * @abstract Check if a buffer is marked passive. @@ -1291,6 +1342,7 @@ buf_t buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t externa void buf_drop(buf_t); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h index e09112884..e221f5e5e 100644 --- a/bsd/sys/buf_internal.h +++ b/bsd/sys/buf_internal.h @@ -94,7 +94,11 @@ struct bufattr { uint64_t ba_cp_file_off; #endif uint64_t ba_flags; /* flags. 
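The buffer-verification KPIs declared above are easiest to see from the consumer side. Below is a rough sketch of how a storage driver's read-completion path might fill in the per-buffer hash, assuming the buffer was set up with buf_verify_enable(); driver_read_completed() and compute_sha3_256() are hypothetical names, the latter standing in for whatever digest routine the driver really uses.

#include <sys/buf.h>

/* Hypothetical digest helper; a real driver would call its own SHA3 code. */
extern void compute_sha3_256(const void *data, uint32_t len,
    uint8_t *out, uint32_t out_len);

static void
driver_read_completed(buf_t bp)
{
    bufattr_t bap = buf_attr(bp);
    uint32_t hash_len = 0;
    uint8_t *hash = bufattr_verifyptr(bap, &hash_len);

    if (hash == NULL) {
        return;                 /* no hash was requested for this buffer */
    }
    if (bufattr_verifykind(bap) == VK_HASH_SHA3_256) {
        compute_sha3_256((const void *)buf_dataptr(bp), buf_count(bp),
            hash, hash_len);
        bufattr_setverifyvalid(bap);   /* marks the hash as valid (BA_VERIFY_VALID) */
    }
}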
Some are only in-use on embedded devices */ - void *ba_verify_ctx; + union { + void *verify_ctx; + void *verify_ptr; /* only for metadata, B_CLUSTER not set */ + } ba_un; + vnode_verify_kind_t ba_verify_type; }; /* @@ -280,6 +284,7 @@ extern vm_offset_t buf_kernel_addrperm; #define BA_EXPEDITED_META_IO 0x00010000 /* metadata I/O which needs a high I/O tier */ #define BA_WILL_VERIFY 0x00020000 /* Cluster layer will verify data */ #define BA_ASYNC_VERIFY 0x00040000 /* Allowed to hand off to async threads */ +#define BA_VERIFY_VALID 0x00080000 /* Hash calculated by driver is valid */ #define GET_BUFATTR_IO_TIER(bap) ((bap->ba_flags & BA_IO_TIER_MASK) >> BA_IO_TIER_SHIFT) #define SET_BUFATTR_IO_TIER(bap, tier) \ @@ -339,6 +344,10 @@ void buf_setcpoff(buf_t, uint64_t); vnode_t buf_vnop_vnode(buf_t); +uint32_t get_num_bytes_for_verify_kind(vnode_verify_kind_t); +uint8_t * buf_verifyptr_with_size(buf_t bp, int size, uint32_t *len); +void buf_verify_free(buf_t bp); + __END_DECLS diff --git a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index 5ffc7b0c5..8b810050f 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -199,7 +199,16 @@ */ #define __exported __attribute__((__visibility__("default"))) #define __exported_push _Pragma("GCC visibility push(default)") +#ifndef __BUILDING_XNU_LIBRARY__ +#define __exported_push_hidden _Pragma("GCC visibility push(hidden)") #define __exported_pop _Pragma("GCC visibility pop") +#define __exported_hidden __private_extern__ +#else /* __BUILDING_XNU_LIBRARY__ */ +/* Don't hide symbols that the might be need to be used from outside */ +#define __exported_push_hidden +#define __exported_pop +#define __exported_hidden +#endif /* __BUILDING_XNU_LIBRARY__ */ /* __deprecated causes the compiler to produce a warning when encountering * code using the deprecated functionality. @@ -534,12 +543,14 @@ * for plain C (see also ). * * Attribute __unsafe_buffer_usage can be used to label functions that should be - * avoided as they may perform or otherwise introduce unsafe buffer - * manipulation operations. + * avoided as they may perform or otherwise introduce unsafe buffer manipulation + * operations. The attribute can also be attached to class/struct fields that + * are used in unsafe buffer manipulations. * - * Calls to such functions are flagged by -Wunsafe-buffer-usage, similarly to + * Calls to attribute annotated functions are flagged by -Wunsafe-buffer-usage, similar to * how unchecked buffer manipulation operations are flagged when observed - * by the compiler directly: + * by the compiler directly. Similarly, use of and assignment to the struct/class fields + * that have the attribute also get flagged by the compiler. * * // An unsafe function that needs to be avoided. 
* __unsafe_buffer_usage @@ -552,14 +563,30 @@ * int array[5]; * * // Direct unsafe buffer manipulation through subscript operator: - * array[idx] = 3; // warning [-Wunsafe-buffer-usage] + * array[idx] = 3; // warning: function introduces unsafe buffer manipulation [-Wunsafe-buffer-usage] * // Unsafe buffer manipulation through function foo(): - * foo(array, 5); // warning [-Wunsafe-buffer-usage] + * foo(array, 5); // warning: function introduces unsafe buffer manipulation [-Wunsafe-buffer-usage] * // Checked buffer manipulation, with bounds information automatically * // preserved for the purposes of runtime checks in standard library: * foo(array); // no warning * } * + * struct Reader { + * // Field involved in unsafe buffer manipulation + * __unsafe_buffer_usage + * void *ptr; + * + * __unsafe_buffer_usage + * size_t sz, count; + * }; + * + * void add_element(Reader rdr, int value) { + * if(rdr.count < rdr.sz) { // warning: unsafe buffer access [-Wunsafe-buffer-usage] + * rdr.ptr[rdr.count] = value; // warning: unsafe buffer access [-Wunsafe-buffer-usage] + * rdr.count++; // warning: unsafe buffer access [-Wunsafe-buffer-usage] + * } + * } + * * While annotating a function as __unsafe_buffer_usage has an effect similar * to annotating it as __deprecated, the __unsafe_buffer_usage attribute * should be used whenever the resulting warning needs to be controlled @@ -567,11 +594,12 @@ * don't attempt to achieve bounds safety this way) as opposed to -Wdeprecated * (enabled in most codebases). * - * The attribute does NOT suppress -Wunsafe-buffer-usage warnings inside - * the function's body; it simply introduces new warnings at each call site - * to help the developers avoid the function entirely. Most of the time - * it does not make sense to annotate a function as __unsafe_buffer_usage - * without providing the users with a safe alternative. + * The attribute suppresses all -Wunsafe-buffer-usage warnings inside the + * function's body as it is explictly marked as unsafe by the user and + * introduces new warnings at each call site to help the developers avoid the + * function entirely. Most of the time it does not make sense to annotate a + * function as __unsafe_buffer_usage without providing the users with a safe + * alternative. * * Pragmas __unsafe_buffer_usage_begin and __unsafe_buffer_usage_end * annotate a range of code as intentionally containing unsafe buffer @@ -585,9 +613,7 @@ * * These pragmas are NOT a way to mass-annotate functions with the attribute * __unsafe_buffer_usage. Functions declared within the pragma range - * do NOT get annotated automatically. In some rare situations it makes sense - * to do all three: put the attribute on the function, put pragmas inside - * the body of the function, and put pragmas around some call sites. + * do NOT get annotated automatically. */ #if __has_cpp_attribute(clang::unsafe_buffer_usage) #define __has_safe_buffers 1 @@ -1387,8 +1413,11 @@ * When unsupported, this macro is ignored and stack usage will not generate an * error. As such, this macro should only be used when stack usage may pose a * security concern rather than a functional issue. + * + * In user-space compilation the function decorated with this calls into a mock + * function and that uses the stack, so this needs to be disabled. 
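Taken together, the attribute and the pragma pair support a migration pattern: annotate the legacy routine so every new call site is flagged, and fence off the few call sites that have been audited by hand. A short sketch along those lines; copy_bytes(), audited_caller() and the scratch buffer are invented for illustration.

#include <stddef.h>
#include <sys/cdefs.h>

/* Legacy, bounds-unchecked helper: callers now get a -Wunsafe-buffer-usage
 * diagnostic, while (per the updated semantics described above) the body
 * itself is no longer diagnosed. */
__unsafe_buffer_usage
static void
copy_bytes(char *dst, const char *src, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        dst[i] = src[i];
    }
}

static char scratch[8];

static void
audited_caller(const char *src)
{
    /* This one call site was reviewed by hand, so the warning is
     * intentionally silenced with the pragma pair. */
    __unsafe_buffer_usage_begin
    copy_bytes(scratch, src, sizeof(scratch));
    __unsafe_buffer_usage_end
}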
*/ -#if __OPTIMIZE__ +#if __OPTIMIZE__ && !defined(__BUILDING_XNU_LIBRARY__) #define __SECURITY_STACK_DISALLOWED_PUSH \ _Pragma("clang diagnostic push") \ _Pragma("clang diagnostic error \"-Wframe-larger-than\"") diff --git a/bsd/sys/clonefile.h b/bsd/sys/clonefile.h index 85208510e..a73fa5a5a 100644 --- a/bsd/sys/clonefile.h +++ b/bsd/sys/clonefile.h @@ -30,10 +30,11 @@ #define _SYS_CLONEFILE_H_ /* Options for clonefile calls */ -#define CLONE_NOFOLLOW 0x0001 /* Don't follow symbolic links */ -#define CLONE_NOOWNERCOPY 0x0002 /* Don't copy ownership information from source */ -#define CLONE_ACL 0x0004 /* Copy access control lists from source */ -#define CLONE_NOFOLLOW_ANY 0x0008 /* Don't follow any symbolic links in the path */ +#define CLONE_NOFOLLOW 0x0001 /* Don't follow symbolic links */ +#define CLONE_NOOWNERCOPY 0x0002 /* Don't copy ownership information from source */ +#define CLONE_ACL 0x0004 /* Copy access control lists from source */ +#define CLONE_NOFOLLOW_ANY 0x0008 /* Don't follow any symbolic links in the path */ +#define CLONE_RESOLVE_BENEATH 0x0010 /* path must reside in the hierarchy beneath the starting directory */ #ifndef KERNEL diff --git a/bsd/sys/code_signing.h b/bsd/sys/code_signing.h index 961197801..184087dd6 100644 --- a/bsd/sys/code_signing.h +++ b/bsd/sys/code_signing.h @@ -45,6 +45,7 @@ typedef uint32_t code_signing_config_t; #define CS_CONFIG_GET_OUT_OF_MY_WAY (1 << 3) #define CS_CONFIG_INTEGRITY_SKIP (1 << 4) #define CS_CONFIG_RELAX_PROFILE_TRUST (1 << 5) +#define CS_CONFIG_DEV_MODE_POLICY (1 << 6) /* Config - Features */ #define CS_CONFIG_REM_SUPPORTED (1 << 25) @@ -91,6 +92,7 @@ typedef uint64_t image4_cs_trap_t; #define XNU_SUPPORTS_SECURE_CHANNEL_SHARED_PAGE 1 #define XNU_SUPPORTS_CSM_DEVICE_STATE 1 #define XNU_SUPPORTS_REGISTER_PROFILE 1 +#define XNU_SUPPORTS_RESEARCH_STATE 1 /* Forward declarations */ struct cs_blob; @@ -122,6 +124,7 @@ typedef struct _cs_profile_register_t { #if XNU_KERNEL_PRIVATE #include +#include #include #include @@ -134,6 +137,10 @@ typedef struct _cs_profile_register_t { /* Common developer mode state variable */ extern bool *developer_mode_enabled; +/* Common research mode state variables */ +extern bool research_mode_enabled; +extern bool extended_research_mode_enabled; + /** * This function is used to allocate code signing data which in some cases needs to * align to a page length. This is a frequent operation, and as a result, a common @@ -181,6 +188,23 @@ image4_get_object_spec_from_index( return obj_spec; } +/** + * Research modes are only allowed when we're using a virtual device, security research + * device or when we're using a dev-fused device. + */ +static inline bool +allow_research_modes(void) +{ + if (PE_vmm_present != 0) { + return true; + } else if ((PE_esdm_fuses & (1 << 0)) != 0) { + return true; + } else if (PE_i_can_has_debugger(NULL) == true) { + return true; + } + return false; +} + /** * Perform any initialization required for managing code signing state on the system. * This is called within XNU itself and doesn't need to be exported to anything external. @@ -242,6 +266,30 @@ disable_developer_mode(void); bool developer_mode_state(void); +/* + * Query the current state of research mode on the system. This call never traps into + * the monitor environment as the state is queried at boot and saved in read-only-late + * memory. + * + * This state can only ever be enabled on platforms which support the trusted execution + * monitor environment. 
The state requires research fusing and the use of a security + * research device. + */ +bool +research_mode_state(void); + +/* + * Query the current state of extended research mode on the system. This call never traps + * into the monitor environment as the state is queried at boot and saved in read-only-late + * memory. + * + * This state can only ever be enabled on platforms which support the trusted execution + * monitor environment. The state requires research fusing and the use of a security + * research device. + */ +bool +extended_research_mode_state(void); + /** * Attempt to enable restricted execution mode on the system. Not all systems support * restricted execution mode. If the call is successful, KERN_SUCCESS is returned, or @@ -470,7 +518,14 @@ get_jit_address_range_kdp( * address space from the monitor. */ kern_return_t -address_space_debugged( +address_space_debugged_state( + const proc_t process); + +/** + * Implements the same policy as address_space_debugged_state(), but returns + * with boolean semantics. + */ +bool is_address_space_debugged( const proc_t process); #if CODE_SIGNING_MONITOR @@ -622,6 +677,19 @@ csm_reconstitute_code_signature( vm_address_t *unneeded_addr, vm_size_t *unneeded_size); +/** + * Setup a nested address space object with the required base address and size for the + * nested region. The code signing monitor will enforce that code signature associations + * can only be made within this address region. + * + * This must be called before any associations can be made with the nested address space. + */ +kern_return_t +csm_setup_nested_address_space( + pmap_t pmap, + const vm_address_t region_addr, + const vm_size_t region_size); + /** * Associate a code signature with an address space for a specified region with the * monitor environment. The code signature can only be associated if it has been diff --git a/bsd/sys/code_signing_internal.h b/bsd/sys/code_signing_internal.h index 8f5366e4b..380eed0d4 100644 --- a/bsd/sys/code_signing_internal.h +++ b/bsd/sys/code_signing_internal.h @@ -191,6 +191,11 @@ kern_return_t CSM_PREFIX(reconstitute_code_signature)( vm_address_t *unneeded_addr, vm_size_t *unneeded_size); +kern_return_t CSM_PREFIX(setup_nested_address_space)( + pmap_t pmap, + const vm_address_t region_addr, + const vm_size_t region_size); + kern_return_t CSM_PREFIX(associate_code_signature)( pmap_t pmap, void *sig_obj, diff --git a/bsd/sys/codesign.h b/bsd/sys/codesign.h index 646a7c1a9..9da328fa1 100644 --- a/bsd/sys/codesign.h +++ b/bsd/sys/codesign.h @@ -58,6 +58,7 @@ #define CS_OPS_CLEAR_LV 15 /* clear the library validation flag */ #define CS_OPS_DER_ENTITLEMENTS_BLOB 16 /* get der entitlements blob */ #define CS_OPS_VALIDATION_CATEGORY 17 /* get process validation category */ +#define CS_OPS_CDHASH_WITH_INFO 18 /* get code directory hash with info */ #define CS_MAX_TEAMID_LEN 64 diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 82efb4fcc..dfec64c5e 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2014 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2025 Apple Computer, Inc. All rights reserved. 
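Since the comment above says is_address_space_debugged() applies the same policy as address_space_debugged_state() but with boolean semantics, one plausible shape for the wrapper is the sketch below; this is illustrative only, not necessarily how xnu implements it.

#include <stdbool.h>
#include <sys/code_signing.h>
#include <mach/kern_return.h>

bool
is_address_space_debugged(const proc_t process)
{
    /* Same policy decision, folded down to a yes/no answer so callers
     * do not have to interpret kern_return_t codes. */
    return address_space_debugged_state(process) == KERN_SUCCESS;
}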
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,10 +32,6 @@ #include #include -#ifdef XNU_KERNEL_PRIVATE -#include -#endif /* XNU_KERNEL_PRIVATE */ - /* * Definitions * @@ -180,15 +176,6 @@ typedef struct{ #define DK_LOCATION_INTERNAL 0x00000000 #define DK_LOCATION_EXTERNAL 0x00000001 -#ifdef KERNEL -#ifdef PRIVATE - -/* Definitions of option bits for dk_unmap_t */ -#define _DK_UNMAP_INITIALIZE 0x00000100 - -#endif /* PRIVATE */ -#endif /* KERNEL */ - #define DKIOCEJECT _IO('d', 21) #define DKIOCSYNCHRONIZE _IOW('d', 22, dk_synchronize_t) @@ -279,77 +266,13 @@ typedef struct{ #define DKIOCGETENCRYPTIONTYPE _IOR('d', 86, uint32_t) #define DKIOCISLOWPOWERMODE _IOR('d', 87, uint32_t) #define DKIOCGETIOMINSATURATIONBYTECOUNT _IOR('d', 88, uint32_t) - -#ifdef XNU_KERNEL_PRIVATE -typedef struct{ - boolean_t mi_mdev; /* Is this a memdev device? */ - boolean_t mi_phys; /* Physical memory? */ - uint32_t mi_base; /* Base page number of the device? */ - uint64_t mi_size; /* Size of the device (in ) */ -} dk_memdev_info_t; - -typedef dk_memdev_info_t memdev_info_t; - -#define DKIOCGETMEMDEVINFO _IOR('d', 90, dk_memdev_info_t) -#endif /* XNU_KERNEL_PRIVATE */ -#ifdef PRIVATE -typedef struct _dk_cs_pin { - dk_extent_t cp_extent; - int64_t cp_flags; -} _dk_cs_pin_t; -/* The following are modifiers to _DKIOCCSPINEXTENT/cp_flags operation */ -#define _DKIOCCSPINTOFASTMEDIA (0) /* Pin extent to the fast (SSD) media */ -#define _DKIOCCSPINFORHIBERNATION (1 << 0) /* Pin of hibernation file, content not preserved */ -#define _DKIOCCSPINDISCARDDENYLIST (1 << 1) /* Hibernation complete/error, stop denylist-ing */ -#define _DKIOCCSPINTOSLOWMEDIA (1 << 2) /* Pin extent to the slow (HDD) media */ -#define _DKIOCCSTEMPORARYPIN (1 << 3) /* Relocate, but do not pin, to indicated media */ -#define _DKIOCCSHIBERNATEIMGSIZE (1 << 4) /* Anticipate/Max size of the upcoming hibernate */ -#define _DKIOCCSPINFORSWAPFILE (1 << 5) /* Pin of swap file, content not preserved */ - -#define _DKIOCCSSETLVNAME _IOW('d', 198, char[256]) -#define _DKIOCCSPINEXTENT _IOW('d', 199, _dk_cs_pin_t) -#define _DKIOCCSUNPINEXTENT _IOW('d', 200, _dk_cs_pin_t) -#define _DKIOCGETMIGRATIONUNITBYTESIZE _IOR('d', 201, uint32_t) - -typedef struct _dk_cs_map { - dk_extent_t cm_extent; - uint64_t cm_bytes_mapped; -} _dk_cs_map_t; - -typedef struct _dk_cs_unmap { - dk_extent_t *extents; - uint32_t extentsCount; - uint32_t options; -} _dk_cs_unmap_t; - -#define _DKIOCCSMAP _IOWR('d', 202, _dk_cs_map_t) -// No longer used: _DKIOCCSSETFSVNODE (203) & _DKIOCCSGETFREEBYTES (204) -#define _DKIOCCSUNMAP _IOWR('d', 205, _dk_cs_unmap_t) - -typedef enum { - DK_APFS_ONE_DEVICE = 1, - DK_APFS_FUSION -} dk_apfs_flavour_t; - -#define DKIOCGETAPFSFLAVOUR _IOR('d', 91, dk_apfs_flavour_t) - -// Extent's offset and length returned in bytes -typedef struct dk_apfs_wbc_range { - dev_t dev; // Physical device for extents - uint32_t count; // Number of extents - dk_extent_t extents[2]; // Addresses are relative to device we return -} dk_apfs_wbc_range_t; - -#define DKIOCAPFSGETWBCRANGE _IOR('d', 92, dk_apfs_wbc_range_t) -#define DKIOCAPFSRELEASEWBCRANGE _IO('d', 93) - -#define DKIOCGETMAXSWAPWRITE _IOR('d', 94, uint64_t) - -#endif /* PRIVATE */ #endif /* KERNEL */ #ifdef PRIVATE -#define _DKIOCSETSTATIC _IO('d', 84) +/* See disk_private.h for additional ioctls */ +#ifndef MODULES_SUPPORTED +#include +#endif /* !MODULES_SUPPORTED */ #endif /* PRIVATE */ #endif /* _SYS_DISK_H_ */ diff --git a/bsd/sys/disk_private.h b/bsd/sys/disk_private.h new file mode 100644 index 
000000000..730d48ce0 --- /dev/null +++ b/bsd/sys/disk_private.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2025 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_DISK_PRIVATE_H_ +#define _SYS_DISK_PRIVATE_H_ + +#include + +#ifdef XNU_KERNEL_PRIVATE +#include +#endif /* XNU_KERNEL_PRIVATE */ + +#ifdef KERNEL + +/* Definitions of option bits for dk_unmap_t */ +#define _DK_UNMAP_INITIALIZE 0x00000100 + +#ifdef XNU_KERNEL_PRIVATE +typedef struct{ + boolean_t mi_mdev; /* Is this a memdev device? */ + boolean_t mi_phys; /* Physical memory? */ + uint32_t mi_base; /* Base page number of the device? 
*/ + uint64_t mi_size; /* Size of the device (in ) */ +} dk_memdev_info_t; + +typedef dk_memdev_info_t memdev_info_t; + +#define DKIOCGETMEMDEVINFO _IOR('d', 90, dk_memdev_info_t) +#endif /* XNU_KERNEL_PRIVATE */ +typedef struct _dk_cs_pin { + dk_extent_t cp_extent; + int64_t cp_flags; +} _dk_cs_pin_t; +/* The following are modifiers to _DKIOCCSPINEXTENT/cp_flags operation */ +#define _DKIOCCSPINTOFASTMEDIA (0) /* Pin extent to the fast (SSD) media */ +#define _DKIOCCSPINFORHIBERNATION (1 << 0) /* Pin of hibernation file, content not preserved */ +#define _DKIOCCSPINDISCARDDENYLIST (1 << 1) /* Hibernation complete/error, stop denylist-ing */ +#define _DKIOCCSPINTOSLOWMEDIA (1 << 2) /* Pin extent to the slow (HDD) media */ +#define _DKIOCCSTEMPORARYPIN (1 << 3) /* Relocate, but do not pin, to indicated media */ +#define _DKIOCCSHIBERNATEIMGSIZE (1 << 4) /* Anticipate/Max size of the upcoming hibernate */ +#define _DKIOCCSPINFORSWAPFILE (1 << 5) /* Pin of swap file, content not preserved */ + +#define _DKIOCCSSETLVNAME _IOW('d', 198, char[256]) +#define _DKIOCCSPINEXTENT _IOW('d', 199, _dk_cs_pin_t) +#define _DKIOCCSUNPINEXTENT _IOW('d', 200, _dk_cs_pin_t) +#define _DKIOCGETMIGRATIONUNITBYTESIZE _IOR('d', 201, uint32_t) + +typedef struct _dk_cs_map { + dk_extent_t cm_extent; + uint64_t cm_bytes_mapped; +} _dk_cs_map_t; + +typedef struct _dk_cs_unmap { + dk_extent_t *extents; + uint32_t extentsCount; + uint32_t options; +} _dk_cs_unmap_t; + +#define _DKIOCCSMAP _IOWR('d', 202, _dk_cs_map_t) +// No longer used: _DKIOCCSSETFSVNODE (203) & _DKIOCCSGETFREEBYTES (204) +#define _DKIOCCSUNMAP _IOWR('d', 205, _dk_cs_unmap_t) + +typedef enum { + DK_APFS_ONE_DEVICE = 1, + DK_APFS_FUSION +} dk_apfs_flavour_t; + +#define DKIOCGETAPFSFLAVOUR _IOR('d', 91, dk_apfs_flavour_t) + +// Extent's offset and length returned in bytes +typedef struct dk_apfs_wbc_range { + dev_t dev; // Physical device for extents + uint32_t count; // Number of extents + dk_extent_t extents[2]; // Addresses are relative to device we return +} dk_apfs_wbc_range_t; + +#define DKIOCAPFSGETWBCRANGE _IOR('d', 92, dk_apfs_wbc_range_t) +#define DKIOCAPFSRELEASEWBCRANGE _IO('d', 93) + +#define DKIOCGETMAXSWAPWRITE _IOR('d', 94, uint64_t) + +#endif /* KERNEL */ + +#define _DKIOCSETSTATIC _IO('d', 84) + +#endif /* _SYS_DISK_PRIVATE_H_ */ diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index e2adabbdd..299da44df 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -1418,7 +1418,6 @@ extern void dtrace_state_free(minor_t minor); /* * DTrace restriction checks */ -extern void dtrace_restriction_policy_load(void); extern boolean_t dtrace_is_restricted(void); extern boolean_t dtrace_are_restrictions_relaxed(void); extern boolean_t dtrace_fbt_probes_restricted(void); diff --git a/bsd/sys/endian.h b/bsd/sys/endian.h new file mode 100644 index 000000000..5254f34bf --- /dev/null +++ b/bsd/sys/endian.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1987, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)endian.h 8.1 (Berkeley) 6/11/93 + */ + +#ifndef _SYS_ENDIAN_H_ +#define _SYS_ENDIAN_H_ + +#include +#include +#include + +#ifndef DRIVERKIT +#include +#define __bswap16(x) __DARWIN_OSSwapInt16(x) +#define __bswap32(x) __DARWIN_OSSwapInt32(x) +#define __bswap64(x) __DARWIN_OSSwapInt64(x) +#else /* DRIVERKIT */ +#define __bswap16(x) __builtin_bswap16(x) +#define __bswap32(x) __builtin_bswap32(x) +#define __bswap64(x) __builtin_bswap64(x) +#endif /* DRIVERKIT */ + +/* + * General byte order swapping functions. + */ +#define bswap16(x) __bswap16(x) +#define bswap32(x) __bswap32(x) +#define bswap64(x) __bswap64(x) + +/* + * Macros to convert to a specific endianness. 
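The conversion macros below follow the usual BSD <sys/endian.h> naming (htobeNN/htoleNN with beNNtoh/leNNtoh inverses), and the beNNdec/beNNenc and leNNdec/leNNenc inlines further down read and write unaligned octet streams. A short sketch of parsing and rebuilding a wire-format header with them; the wire_hdr layout and function names are made up for illustration.

#include <sys/endian.h>
#include <stdint.h>

/* Hypothetical wire format: 2-byte big-endian type followed by a
 * 4-byte big-endian length, then the payload. */
struct wire_hdr {
    uint16_t type;
    uint32_t length;
};

static void
parse_hdr(const uint8_t buf[6], struct wire_hdr *out)
{
    out->type   = be16dec(buf);       /* no alignment requirement on buf */
    out->length = be32dec(buf + 2);
}

static void
emit_hdr(uint8_t buf[6], const struct wire_hdr *in)
{
    be16enc(buf, in->type);
    be32enc(buf + 2, in->length);
}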
+ */ +#if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN +#define htobe16(x) __bswap16((x)) +#define htobe32(x) __bswap32((x)) +#define htobe64(x) __bswap64((x)) +#define htole16(x) ((__uint16_t)(x)) +#define htole32(x) ((__uint32_t)(x)) +#define htole64(x) ((__uint64_t)(x)) + +#define be16toh(x) __bswap16((x)) +#define be32toh(x) __bswap32((x)) +#define be64toh(x) __bswap64((x)) +#define le16toh(x) ((__uint16_t)(x)) +#define le32toh(x) ((__uint32_t)(x)) +#define le64toh(x) ((__uint64_t)(x)) +#else /* __DARWIN_BYTE_ORDER != __DARWIN_LITTLE_ENDIAN */ +#define htobe16(x) ((__uint16_t)(x)) +#define htobe32(x) ((__uint32_t)(x)) +#define htobe64(x) ((__uint64_t)(x)) +#define htole16(x) __bswap16((x)) +#define htole32(x) __bswap32((x)) +#define htole64(x) __bswap64((x)) + +#define be16toh(x) ((__uint16_t)(x)) +#define be32toh(x) ((__uint32_t)(x)) +#define be64toh(x) ((__uint64_t)(x)) +#define le16toh(x) __bswap16((x)) +#define le32toh(x) __bswap32((x)) +#define le64toh(x) __bswap64((x)) +#endif /* __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN */ + +/* + * Routines to encode/decode big- and little-endian multi-octet values + * to/from an octet stream. + */ +static __inline __uint16_t +be16dec(const void *__sized_by(sizeof(__uint16_t)) pp) +{ + const __uint8_t *p = (const __uint8_t *)pp; + + return (__uint16_t)(p[0] << 8) | p[1]; +} + +static __inline __uint32_t +be32dec(const void *__sized_by(sizeof(__uint32_t)) pp) +{ + const __uint8_t *p = (const __uint8_t *)pp; + + return ((__uint32_t)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; +} + +static __inline __uint64_t +be64dec(const void *__sized_by(sizeof(__uint64_t)) pp) +{ + const __uint8_t *p = (const __uint8_t *)pp; + + return ((__uint64_t)be32dec(p) << 32) | be32dec(p + 4); +} + +static __inline __uint16_t +le16dec(const void *__sized_by(sizeof(__uint16_t)) pp) +{ + const __uint8_t *p = (const __uint8_t *)pp; + + return (__uint16_t)(p[1] << 8) | p[0]; +} + +static __inline __uint32_t +le32dec(const void *__sized_by(sizeof(__uint32_t)) pp) +{ + const __uint8_t *p = (const __uint8_t *)pp; + + return ((__uint32_t)p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; +} + +static __inline __uint64_t +le64dec(const void *__sized_by(sizeof(__uint64_t)) pp) +{ + const __uint8_t *p = (const __uint8_t *)pp; + + return ((__uint64_t)le32dec(p + 4) << 32) | le32dec(p); +} + +static __inline void +be16enc(void *__sized_by(sizeof(__uint16_t)) pp, __uint16_t u) +{ + __uint8_t *p = (__uint8_t *)pp; + + p[0] = (u >> 8) & 0xff; + p[1] = u & 0xff; +} + +static __inline void +be32enc(void *__sized_by(sizeof(__uint32_t)) pp, __uint32_t u) +{ + __uint8_t *p = (__uint8_t *)pp; + + p[0] = (u >> 24) & 0xff; + p[1] = (u >> 16) & 0xff; + p[2] = (u >> 8) & 0xff; + p[3] = u & 0xff; +} + +static __inline void +be64enc(void *__sized_by(sizeof(__uint64_t)) pp, __uint64_t u) +{ + __uint8_t *p = (__uint8_t *)pp; + + be32enc(p, (__uint32_t)(u >> 32)); + be32enc(p + 4, (__uint32_t)(u & 0xffffffffU)); +} + +static __inline void +le16enc(void *__sized_by(sizeof(__uint16_t)) pp, __uint16_t u) +{ + __uint8_t *p = (__uint8_t *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; +} + +static __inline void +le32enc(void *__sized_by(sizeof(__uint32_t)) pp, __uint32_t u) +{ + __uint8_t *p = (__uint8_t *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; + p[2] = (u >> 16) & 0xff; + p[3] = (u >> 24) & 0xff; +} + +static __inline void +le64enc(void *__sized_by(sizeof(__uint64_t)) pp, __uint64_t u) +{ + __uint8_t *p = (__uint8_t *)pp; + + le32enc(p, (__uint32_t)(u & 0xffffffffU)); + le32enc(p + 
4, (__uint32_t)(u >> 32)); +} + +#endif /* _SYS_ENDIAN_H_ */ diff --git a/bsd/sys/errno.h b/bsd/sys/errno.h index 10b7c2016..f800d9118 100644 --- a/bsd/sys/errno.h +++ b/bsd/sys/errno.h @@ -262,7 +262,12 @@ __END_DECLS #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EQFULL 106 /* Interface output queue is full */ -#define ELAST 106 /* Must be equal largest errno */ +#endif + +#define ENOTCAPABLE 107 /* Capabilities insufficient */ + +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL +#define ELAST 107 /* Must be equal largest errno */ #endif #ifdef KERNEL @@ -281,10 +286,11 @@ __END_DECLS #define ECVCERORR 256 #define ECVPERORR 512 #else /* BSD_KERNEL_PRIVATE */ -/* -5, -6 and -7 and -106 are reserved for kernel internal use */ +/* -5, -6 and -7 and -106 and -108 are reserved for kernel internal use */ #endif /* BSD_KERNEL_PRIVATE */ #ifdef PRIVATE #define EQSUSPENDED (-EQFULL) /* Output queue is suspended */ +#define EQCONGESTED (-108) /* Output queue is congested */ #endif /* PRIVATE */ #endif /* KERNEL */ #endif /* _SYS_ERRNO_H_ */ diff --git a/bsd/sys/fcntl.h b/bsd/sys/fcntl.h index ec43d1f58..0fceed904 100644 --- a/bsd/sys/fcntl.h +++ b/bsd/sys/fcntl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2022 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -128,10 +128,11 @@ #define O_TRUNC 0x00000400 /* truncate to zero length */ #define O_EXCL 0x00000800 /* error if already exists */ #define O_RESOLVE_BENEATH 0x00001000 /* only for open(2), same value as FMARK */ +#define O_UNIQUE 0x00002000 /* only for open(2), same value as FDEFER */ #ifdef KERNEL #define FMARK 0x00001000 /* mark during gc(), same value as O_RESOLVE_BENEATH */ -#define FDEFER 0x00002000 /* defer for next gc pass */ +#define FDEFER 0x00002000 /* defer for next gc pass, same value as O_UNIQUE */ #define FWASLOCKED 0x00004000 /* has or has had an advisory fcntl lock */ #define FHASLOCK FWASLOCKED /* obsolete compatibility name */ #endif @@ -209,14 +210,13 @@ #define AT_SYMLINK_FOLLOW 0x0040 /* Act on target of symlink */ #define AT_REMOVEDIR 0x0080 /* Path refers to directory */ #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL -#ifdef PRIVATE -#define AT_REMOVEDIR_DATALESS 0x0100 /* Remove a dataless directory without materializing first */ -#endif #define AT_REALDEV 0x0200 /* Return real device inodes resides on for fstatat(2) */ #define AT_FDONLY 0x0400 /* Use only the fd and Ignore the path for fstatat(2) */ #define AT_SYMLINK_NOFOLLOW_ANY 0x0800 /* Path should not contain any symlinks */ +#define AT_RESOLVE_BENEATH 0x2000 /* Path must reside in the hierarchy beneath the starting directory */ +#define AT_NODELETEBUSY 0x4000 /* Don't delete busy files */ #ifdef PRIVATE -#define AT_SYSTEM_DISCARDED 0x1000 /* Indicated file/folder was discarded by system */ +/* See fcntl_private.h for additional flags */ #endif #endif #endif @@ -307,19 +307,8 @@ #define F_THAW_FS 54 /* "thaw" all fs operations */ #define F_GLOBAL_NOCACHE 55 /* turn data caching off/on (globally) for this file */ -#ifdef PRIVATE -#define F_OPENFROM 56 /* SPI: open a file relative to fd (must be a dir) */ -#define F_UNLINKFROM 57 /* SPI: open a file relative to fd (must be a dir) */ -#define F_CHECK_OPENEVT 58 /* SPI: if a process is marked OPENEVT, or in O_EVTONLY on opens of this vnode */ -#endif /* PRIVATE */ - #define F_ADDSIGS 59 /* add detached signatures */ -#ifdef PRIVATE -/* Deprecated/Removed in 10.9 */ -#define F_MARKDEPENDENCY 60 /* this process hosts the device supporting the 
fs backing this fd */ -#endif - #define F_ADDFILESIGS 61 /* add signature from same file (used by dyld for shared libs) */ #define F_NODIRECT 62 /* used in conjunction with F_NOCACHE to indicate that DIRECT, synchonous writes */ @@ -340,14 +329,6 @@ /* See F_DUPFD_CLOEXEC below for 67 */ -#ifdef PRIVATE -#define F_SETSTATICCONTENT 68 /* - * indicate to the filesystem/storage driver that the content to be - * written is usually static. a nonzero value enables it, 0 disables it. - */ -#define F_MOVEDATAEXTENTS 69 /* Swap only the data associated with two files */ -#endif - #define F_SETBACKINGSTORE 70 /* Mark the file as being the backing store for another filesystem */ #define F_GETPATH_MTMINFO 71 /* return the full path of the FD, but error in specific mtmd circumstances */ @@ -365,27 +346,8 @@ #define F_FINDSIGS 78 /* Add detached code signatures (used by dyld for shared libs) */ -#ifdef PRIVATE -#define F_GETDEFAULTPROTLEVEL 79 /* Get the default protection level for the filesystem */ -#define F_MAKECOMPRESSED 80 /* Make the file compressed; truncate & toggle BSD bits */ -#define F_SET_GREEDY_MODE 81 /* - * indicate to the filesystem/storage driver that the content to be - * written should be written in greedy mode for additional speed at - * the cost of storage efficiency. A nonzero value enables it, 0 disables it. - */ - -#define F_SETIOTYPE 82 /* - * Use parameters to describe content being written to the FD. See - * flag definitions below for argument bits. - */ -#endif - #define F_ADDFILESIGS_FOR_DYLD_SIM 83 /* Add signature from same file, only if it is signed by Apple (used by dyld for simulator) */ -#ifdef PRIVATE -#define F_RECYCLE 84 /* Recycle vnode; debug/development builds only */ -#endif - #define F_BARRIERFSYNC 85 /* fsync + issue barrier to drive */ #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL @@ -396,13 +358,6 @@ #define F_OFD_SETLKWTIMEOUT 93 /* (as F_OFD_SETLKW but return if timeout) */ #endif -#ifdef PRIVATE -#define F_OFD_GETLKPID 94 /* get record locking information */ - -#define F_SETCONFINED 95 /* "confine" OFD to process */ -#define F_GETCONFINED 96 /* is-fd-confined? 
*/ -#endif - #define F_ADDFILESIGS_RETURN 97 /* Add signature from same file, return end offset in structure on success */ #define F_CHECK_LV 98 /* Check if Library Validation allows this Mach-O file to be mapped into the calling process */ @@ -423,19 +378,15 @@ #define F_SETLEASE_ARG(t, oc) ((t) | ((oc) << 2)) -#ifdef PRIVATE -#define F_ASSERT_BG_ACCESS 108 /* Assert background access to a file */ -#define F_RELEASE_BG_ACCESS 109 /* Release background access to a file */ -#endif // PRIVATE - #define F_TRANSFEREXTENTS 110 /* Transfer allocated extents beyond leof to a different file */ #define F_ATTRIBUTION_TAG 111 /* Based on flags, query/set/delete a file's attribution tag */ -#if PRIVATE #define F_NOCACHE_EXT 112 /* turn data caching off/on for this fd and relax size and alignment restrictions for write */ -#endif #define F_ADDSIGS_MAIN_BINARY 113 /* add detached signatures for main binary -- development only */ +#ifdef PRIVATE +/* See fcntl_private.h for additional command values */ +#endif /* PRIVATE */ // FS-specific fcntl()'s numbers begin at 0x00010000 and go up #define FCNTL_FS_SPECIFIC_BASE 0x00010000 @@ -449,7 +400,7 @@ /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ #if PRIVATE -#define FD_CLOFORK 2 /* close-on-fork flag */ +/* See fcntl_private.h for additional flags */ #endif /* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */ @@ -468,14 +419,6 @@ #define F_CONFINED 0x1000 /* fileglob cannot leave curproc */ #endif -#if PRIVATE -/* - * ISOCHRONOUS attempts to sustain a minimum platform-dependent throughput - * for the duration of the I/O delivered to the driver. - */ -#define F_IOTYPE_ISOCHRONOUS 0x0001 -#endif - /* * [XSI] The values used for l_whence shall be defined as described * in @@ -515,6 +458,9 @@ struct flock { }; #include +#ifdef KERNEL +#include +#endif #if __DARWIN_C_LEVEL >= __DARWIN_C_FULL /* @@ -525,6 +471,13 @@ struct flocktimeout { struct flock fl; /* flock passed for file locking */ struct timespec timeout; /* timespec struct for timeout */ }; + +#ifdef KERNEL +struct user32_flocktimeout { + struct flock fl; /* flock passed for file locking */ + struct user32_timespec timeout; /* timespec struct for timeout */ +}; +#endif /* KERNEL */ #endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) @@ -701,15 +654,6 @@ typedef struct fspecread { off_t fsr_length; /* IN: size of the region */ } fspecread_t; -#ifdef PRIVATE -/* fassertbgaccess_t used by F_ASSERT_BG_ACCESS */ -typedef struct fassertbgaccess { - unsigned int fbga_flags; /* unused */ - unsigned int reserved; /* (to maintain 8-byte alignment) */ - unsigned long long ttl; /* IN: time to live for the assertion (nanoseconds; continuous) */ -} fassertbgaccess_t; -#endif // PRIVATE - /* fattributiontag_t used by F_ATTRIBUTION_TAG */ #define ATTRIBUTION_NAME_MAX 255 typedef struct fattributiontag { @@ -761,38 +705,6 @@ struct log2phys { #define O_POPUP 0x80000000 /* force window to popup on open */ #define O_ALERT 0x20000000 /* small, clean popup window */ -#ifdef PRIVATE -/* - * SPI: Argument data for F_OPENFROM - */ -struct fopenfrom { - unsigned int o_flags; /* same as open(2) */ - mode_t o_mode; /* same as open(2) */ - char * o_pathname; /* relative pathname */ -}; - -#ifdef KERNEL -/* - * LP64 version of fopenfrom. Memory pointers - * grow when we're dealing with a 64-bit process. 
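The new user32_flocktimeout mirrors the user-visible struct flocktimeout above, presumably so the kernel can copy the argument in from 32-bit callers with the matching timespec layout. From user space the structure is used unchanged; a sketch of the timed OFD lock variant (F_OFD_SETLKWTIMEOUT, listed above), where lock_with_timeout() is an invented name:

#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

/* Sketch: take an OFD write lock on the whole file, giving up after
 * two seconds instead of blocking indefinitely. */
static int
lock_with_timeout(int fd)
{
    struct flocktimeout flt = {
        .fl = {
            .l_start  = 0,
            .l_len    = 0,          /* whole file */
            .l_type   = F_WRLCK,
            .l_whence = SEEK_SET,
        },
        .timeout = { .tv_sec = 2, .tv_nsec = 0 },
    };

    if (fcntl(fd, F_OFD_SETLKWTIMEOUT, &flt) == -1) {
        perror("F_OFD_SETLKWTIMEOUT");
        return -1;
    }
    return 0;
}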
- * - * WARNING - keep in sync with fopenfrom (above) - */ -struct user32_fopenfrom { - unsigned int o_flags; - mode_t o_mode; - user32_addr_t o_pathname; -}; - -struct user_fopenfrom { - unsigned int o_flags; - mode_t o_mode; - user_addr_t o_pathname; -}; -#endif /* KERNEL */ - -#endif /* PRIVATE */ - #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #ifndef KERNEL @@ -827,23 +739,6 @@ int creat(const char *, mode_t) __DARWIN_ALIAS_C(creat); int fcntl(int, int, ...) __DARWIN_ALIAS_C(fcntl); #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#ifdef PRIVATE -/* - * These definitions are retained temporarily for compatibility. - * If you want to use fileports, please use - * #include - * or - * #include - */ -#ifndef _FILEPORT_T -#define _FILEPORT_T -typedef __darwin_mach_port_t fileport_t; -#define FILEPORT_NULL ((fileport_t)0) -#endif /* _FILEPORT_T */ - -int fileport_makeport(int, fileport_t*); -int fileport_makefd(fileport_t); -#endif /* PRIVATE */ int openx_np(const char *, int, filesec_t); /* * data-protected non-portable open(2) : @@ -866,4 +761,8 @@ int filesec_unset_property(filesec_t, filesec_property_t) __OSX_AVAILABLE_ST __END_DECLS #endif +#if defined(PRIVATE) && !defined(MODULES_SUPPORTED) +#include +#endif /* PRIVATE && !MODULES_SUPPORTED */ + #endif /* !_SYS_FCNTL_H_ */ diff --git a/bsd/sys/fcntl_private.h b/bsd/sys/fcntl_private.h new file mode 100644 index 000000000..b65f1cef3 --- /dev/null +++ b/bsd/sys/fcntl_private.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ +/*- + * Copyright (c) 1983, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fcntl.h 8.3 (Berkeley) 1/21/94 + */ + + +#ifndef _SYS_FCNTL_PRIVATE_H_ +#define _SYS_FCNTL_PRIVATE_H_ + +#include +#include + +#if __DARWIN_C_LEVEL >= 200809L +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL +#define AT_REMOVEDIR_DATALESS 0x0100 /* Remove a dataless directory without materializing first */ +#define AT_SYSTEM_DISCARDED 0x1000 /* Indicated file/folder was discarded by system */ +#endif +#endif + +/* + * Constants used for fcntl(2) + */ + +/* command values */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define F_OPENFROM 56 /* SPI: open a file relative to fd (must be a dir) */ +#define F_UNLINKFROM 57 /* SPI: open a file relative to fd (must be a dir) */ +#define F_CHECK_OPENEVT 58 /* SPI: if a process is marked OPENEVT, or in O_EVTONLY on opens of this vnode */ + +/* Deprecated/Removed in 10.9 */ +#define F_MARKDEPENDENCY 60 /* this process hosts the device supporting the fs backing this fd */ + +#define F_SETSTATICCONTENT 68 /* + * indicate to the filesystem/storage driver that the content to be + * written is usually static. a nonzero value enables it, 0 disables it. + */ +#define F_MOVEDATAEXTENTS 69 /* Swap only the data associated with two files */ + +#define F_GETDEFAULTPROTLEVEL 79 /* Get the default protection level for the filesystem */ +#define F_MAKECOMPRESSED 80 /* Make the file compressed; truncate & toggle BSD bits */ +#define F_SET_GREEDY_MODE 81 /* + * indicate to the filesystem/storage driver that the content to be + * written should be written in greedy mode for additional speed at + * the cost of storage efficiency. A nonzero value enables it, 0 disables it. + */ + +#define F_SETIOTYPE 82 /* + * Use parameters to describe content being written to the FD. See + * flag definitions below for argument bits. 
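For the hint-style commands collected here the third fcntl argument is a small set of flag bits; F_SETIOTYPE, for example, takes the F_IOTYPE_* bits defined a little further down. A sketch of a writer asking for isochronous treatment of its I/O; mark_isochronous() is an invented name, the SPI itself is private and may change, and the include assumes the new header is installed (it is added to the /usr/local/include list by the Makefile change earlier in this patch).

#include <fcntl.h>
#include <sys/fcntl_private.h>

/* Sketch: hint that writes on this descriptor are isochronous media I/O,
 * per the F_SETIOTYPE description above. */
static void
mark_isochronous(int fd)
{
    (void)fcntl(fd, F_SETIOTYPE, F_IOTYPE_ISOCHRONOUS);
}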
+ */ + +#define F_RECYCLE 84 /* Recycle vnode; debug/development builds only */ + +#define F_OFD_GETLKPID 94 /* get record locking information */ + +#define F_SETCONFINED 95 /* "confine" OFD to process */ +#define F_GETCONFINED 96 /* is-fd-confined? */ + +#define F_ASSERT_BG_ACCESS 108 /* Assert background access to a file */ +#define F_RELEASE_BG_ACCESS 109 /* Release background access to a file */ + +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +/* file descriptor flags (F_GETFD, F_SETFD) */ +#define FD_CLOFORK 2 /* close-on-fork flag */ + +/* + * ISOCHRONOUS attempts to sustain a minimum platform-dependent throughput + * for the duration of the I/O delivered to the driver. + */ +#define F_IOTYPE_ISOCHRONOUS 0x0001 + +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +/* fassertbgaccess_t used by F_ASSERT_BG_ACCESS */ +typedef struct fassertbgaccess { + unsigned int fbga_flags; /* unused */ + unsigned int reserved; /* (to maintain 8-byte alignment) */ + unsigned long long ttl; /* IN: time to live for the assertion (nanoseconds; continuous) */ +} fassertbgaccess_t; + +/* + * SPI: Argument data for F_OPENFROM + */ +struct fopenfrom { + unsigned int o_flags; /* same as open(2) */ + mode_t o_mode; /* same as open(2) */ + char * o_pathname; /* relative pathname */ +}; + +#ifdef KERNEL +/* + * LP64 version of fopenfrom. Memory pointers + * grow when we're dealing with a 64-bit process. + * + * WARNING - keep in sync with fopenfrom (above) + */ +struct user32_fopenfrom { + unsigned int o_flags; + mode_t o_mode; + user32_addr_t o_pathname; +}; + +struct user_fopenfrom { + unsigned int o_flags; + mode_t o_mode; + user_addr_t o_pathname; +}; +#endif /* KERNEL */ + + +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +#endif /* !_SYS_FCNTL_PRIVATE_H_ */ diff --git a/bsd/sys/fsctl.h b/bsd/sys/fsctl.h index 77bc47851..b64fcc4b7 100644 --- a/bsd/sys/fsctl.h +++ b/bsd/sys/fsctl.h @@ -282,15 +282,16 @@ struct fsioc_cas_bsdflags { #define FSIOC_GRAFT_VERSION 2 /* Grafting flags */ -#define FSCTL_GRAFT_PRESERVE_MOUNT 0x0001 /* Preserve underlying mount until shutdown */ -#define FSCTL_GRAFT_ALTERNATE_SHARED_REGION 0x0002 /* Binaries within should use alternate shared region */ -#define FSCTL_GRAFT_SYSTEM_CONTENT 0x0004 /* Cryptex contains system content */ -#define FSCTL_GRAFT_PANIC_ON_AUTHFAIL 0x0008 /* On failure to authenticate, panic */ -#define FSCTL_GRAFT_STRICT_AUTH 0x0010 /* Strict authentication mode */ -#define FSCTL_GRAFT_PRESERVE_GRAFT 0x0020 /* Preserve graft itself until unmount */ +#define FSCTL_GRAFT_PRESERVE_MOUNT 0x0000000000000001ULL /* Preserve underlying mount until shutdown */ +#define FSCTL_GRAFT_ALTERNATE_SHARED_REGION 0x0000000000000002ULL /* Binaries within should use alternate shared region */ +#define FSCTL_GRAFT_SYSTEM_CONTENT 0x0000000000000004ULL /* Cryptex contains system content */ +#define FSCTL_GRAFT_PANIC_ON_AUTHFAIL 0x0000000000000008ULL /* On failure to authenticate, panic */ +#define FSCTL_GRAFT_STRICT_AUTH 0x0000000000000010ULL /* Strict authentication mode */ +#define FSCTL_GRAFT_PRESERVE_GRAFT 0x0000000000000020ULL /* Preserve graft itself until unmount */ /* Ungrafting flags */ -#define FSCTL_UNGRAFT_UNGRAFTALL 0x0001 /* Ungraft all currently grafted filesystems */ +#define FSCTL_UNGRAFT_UNGRAFTALL 0x0000000000000001ULL /* Ungraft all currently grafted filesystems */ +#define FSCTL_UNGRAFT_NOFORCE 0x0000000000000002ULL /* Disallow ungraft if a non-dir vnode inside the graft is in use */ #ifdef KERNEL diff --git a/bsd/sys/guarded.h 
b/bsd/sys/guarded.h index ca9bd1199..89347ca7e 100644 --- a/bsd/sys/guarded.h +++ b/bsd/sys/guarded.h @@ -166,6 +166,7 @@ enum guard_vn_exception_codes { #define kVNG_POLICY_EXC_CORPSE (1u << 3) #define kVNG_POLICY_SIGKILL (1u << 4) #define kVNG_POLICY_UPRINTMSG (1u << 5) +#define kVNG_POLICY_EXC_CORE (1u << 6) #if BSD_KERNEL_PRIVATE struct fileglob; diff --git a/bsd/sys/imageboot.h b/bsd/sys/imageboot.h index d4d364101..acbba91d3 100644 --- a/bsd/sys/imageboot.h +++ b/bsd/sys/imageboot.h @@ -43,7 +43,7 @@ int imageboot_format_is_valid(const char *root_path); int imageboot_mount_image(const char *root_path, int height, imageboot_type_t type); int imageboot_pivot_image(const char *image_path, imageboot_type_t type, const char *mount_path, const char *outgoing_root_path, const bool rooted_dmg, const bool skip_signature_check); -int imageboot_read_file_pageable(const char *path, void **bufp, size_t *bufszp); /* use kmem_free(kernel_map, ...) */ +int imageboot_read_file_pageable(const char *path, void **bufp, size_t *bufszp, bool no_softlimit); /* use kmem_free(kernel_map, ...) */ int imageboot_read_file(const char *path, void **bufp, size_t *bufszp, off_t *fsizep); int imageboot_read_file_from_offset(const char *path, off_t offset, void **bufp, size_t *bufszp); diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index 6da0e710b..c09791cab 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -157,10 +157,8 @@ struct image_params { #define IMGPF_PLUGIN_HOST_DISABLE_A_KEYS 0x00002000 /* process hosts plugins, disable ptr auth A keys */ #define IMGPF_HW_TPRO 0x00004000 /* HW support for read-only/read-write trusted paths */ #define IMGPF_HARDENED_HEAP 0x00008000 /* enable hardened-heap for the process */ -#define IMGPF_RESERVED_3 0x01000000 #define IMGPF_ROSETTA 0x10000000 /* load rosetta runtime */ #define IMGPF_ALT_ROSETTA 0x20000000 /* load alternative rosetta runtime */ -#define IMGPF_RESERVED_2 0x40000000 #define IMGPF_NOJOP 0x80000000 /* diff --git a/bsd/sys/kas_info.h b/bsd/sys/kas_info.h index a35f2bbac..1aaccf9c3 100644 --- a/bsd/sys/kas_info.h +++ b/bsd/sys/kas_info.h @@ -31,6 +31,7 @@ #include #include +#include /* * kas_info() ("Kernel Address Space Info") is a private interface that allows diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 22d7bfded..fabf5fa14 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -184,6 +184,8 @@ __BEGIN_DECLS #define DBG_MACH_MACHDEP_EXCP_SC_x86 0xAE // Machine Dependent System Calls on x86 #define DBG_MACH_MACHDEP_EXCP_SC_ARM 0xAF // Machine Dependent System Calls on arm #define DBG_MACH_VM_RECLAIM 0xB0 // Deferred Memory Reclamation +#define DBG_MACH_VM_LOCK_PERF 0xB1 // Performance of VM Locks +#define DBG_MACH_MEMINFO 0xB2 // General system memory information // Codes for DBG_MACH_IO #define DBC_MACH_IO_MMIO_READ 0x1 @@ -291,6 +293,10 @@ __BEGIN_DECLS #define MACH_SCHED_AST_CHECK 0x62 /* run ast check interrupt handler */ #define MACH_SCHED_PREEMPT_TIMER_ACTIVE 0x63 /* preempt timer is armed */ #define MACH_PROCESSOR_SHUTDOWN 0x64 /* processor was shut down */ +#define MACH_SCHED_PSET_BITMASKS 0x65 /* Migration, rotation, and recommendation bitmasks for each pset */ +#define MACH_SUSPEND_DRIVERKIT_USERSPACE 0x66 /* one driverkit process is suspended/unsuspended by iokit */ +#define MACH_SCHED_PREFERRED_PSET 0x67 /* Recommendation change for a thread group at a specific QoS */ +#define MACH_SCHED_ONCORE_PREEMPT 0x68 /* CLPC requested thread preemption */ /* Codes for Clutch/Edge Scheduler (DBG_MACH_SCHED_CLUTCH) */ #define 
MACH_SCHED_CLUTCH_ROOT_BUCKET_STATE 0x0 /* __unused */ @@ -309,6 +315,7 @@ __BEGIN_DECLS #define MACH_SCHED_EDGE_CLUSTER_SHARED_LOAD 0xc /* Per-cluster shared resource load */ #define MACH_SCHED_EDGE_RSRC_HEAVY_THREAD 0xd /* Resource heavy thread state */ #define MACH_SCHED_EDGE_SHARED_RSRC_MIGRATE 0xe /* Migrating a shared resource thread due to cluster load imbalance */ +#define MACH_SCHED_EDGE_STIR_THE_POT 0xf /* Rotate running threads on and off P-cores to share time and make roughly equal forward progress */ /* Codes for workgroup interval subsystem (DBG_MACH_WORKGROUP) */ #define WORKGROUP_INTERVAL_CREATE 0x0 /* work interval creation */ @@ -365,6 +372,7 @@ __BEGIN_DECLS #define DBG_VM_INFO8 0x112 #define DBG_VM_INFO9 0x113 #define DBG_VM_INFO10 0x114 +#define DBG_VM_INFO11 0x115 #define DBG_VM_UPL_PAGE_WAIT 0x120 #define DBG_VM_IOPL_PAGE_WAIT 0x121 @@ -386,6 +394,7 @@ __BEGIN_DECLS #define DBG_VM_UPL_REQUEST 0x133 #define DBG_VM_IOPL_REQUEST 0x134 #define DBG_VM_KERN_REQUEST 0x135 +#define DBG_VM_UPL_THROTTLE 0x136 #define DBG_VM_DATA_WRITE 0x140 #define DBG_VM_PRESSURE_LEVEL_CHANGE 0x141 @@ -394,6 +403,8 @@ __BEGIN_DECLS #define DBG_VM_MAP_LOOKUP_ENTRY_FAILURE 0x143 +#define DBG_VM_FAULT_DEACTIVATE_BEHIND 0x160 + /* * Codes for Working Set Measurement (DBG_MACH_WORKINGSET) */ @@ -538,6 +549,7 @@ __BEGIN_DECLS #define PMAP__IOMMU_GRANT_PAGE 0x1e #define PMAP__BATCH_UPDATE_CACHING 0x1f #define PMAP__COLLECT_CACHE_OPS 0x20 +#define PMAP__SET_SHARED_REGION 0x21 /* Codes for clock (DBG_MACH_CLOCK) */ #define MACH_EPOCH_CHANGE 0x0 /* wake epoch change */ @@ -663,6 +675,30 @@ __BEGIN_DECLS #define VM_RECLAIM_RESIZE 0x0a #define VM_RECLAIM_FLUSH 0x0b +#pragma mark System Memory Info Codes (DBG_MACH_MEMINFO) + +/* system memory state */ +#define DBG_MEMINFO_PGCNT1 0x01 +#define DBG_MEMINFO_PGCNT2 0x02 +#define DBG_MEMINFO_PGCNT3 0x03 +#define DBG_MEMINFO_PGCNT4 0x04 +#define DBG_MEMINFO_PGCNT5 0x05 +#define DBG_MEMINFO_PGCNT6 0x06 +#define DBG_MEMINFO_PGCNT7 0x07 +#define DBG_MEMINFO_PGCNT8 0x08 + +/* Page eviction statistics */ +#define DBG_MEMINFO_PGOUT1 0x11 +#define DBG_MEMINFO_PGOUT2 0x12 +#define DBG_MEMINFO_PGOUT3 0x13 +#define DBG_MEMINFO_PGOUT4 0x14 +#define DBG_MEMINFO_PGOUT5 0x15 +#define DBG_MEMINFO_PGOUT6 0x16 + +/* Page demand statistics */ +#define DBG_MEMINFO_DEMAND1 0x21 +#define DBG_MEMINFO_DEMAND2 0x22 + /* **** The Kernel Debug Sub Classes for Network (DBG_NETWORK) **** */ #define DBG_NETIP 1 /* Internet Protocol */ #define DBG_NETARP 2 /* Address Resolution Protocol */ @@ -1014,6 +1050,8 @@ __BEGIN_DECLS #define IMP_BOOST 0x11 /* Task boost level changed */ #define IMP_MSG 0x12 /* boosting message sent by donating task on donating port */ #define IMP_WATCHPORT 0x13 /* port marked as watchport, and boost was transferred to the watched task */ +#define IMP_THREAD_PROMOTE_ABOVE_TASK 0x15 /* Thread is turnstile boosted above task clamp */ +#define IMP_RUNAWAY_MITIGATION 0x16 /* Runaway mitigation status change */ #define IMP_TASK_SUPPRESSION 0x17 /* Task changed suppression behaviors */ #define IMP_TASK_APPTYPE 0x18 /* Task launched with apptype */ #define IMP_UPDATE 0x19 /* Requested -> effective calculation */ @@ -1021,7 +1059,10 @@ __BEGIN_DECLS #define IMP_DONOR_CHANGE 0x1B /* The iit_donor bit changed */ #define IMP_MAIN_THREAD_QOS 0x1C /* The task's main thread QoS was set */ #define IMP_SYNC_IPC_QOS 0x1D /* Sync IPC QOS override */ -/* DBG_IMPORTANCE subclasses 0x20 - 0x40 are reserved for task policy flavors */ +#define IMP_SET_GPU_ROLE 0x1E /* Update GPU Role */ 
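The new DBG_MACH_MEMINFO subclass gives the memory-state counters above their own event IDs; the MEMINFO_CODE() convenience macro added further down simply wraps KDBG_EVENTID(DBG_MACH, DBG_MACH_MEMINFO, code). A kernel-side sketch of emitting one such event, packing four counters per record; trace_meminfo_sample() and the *_pages arguments are placeholders for the real VM statistics.

#include <sys/kdebug.h>

static void
trace_meminfo_sample(uint64_t free_pages, uint64_t active_pages,
    uint64_t inactive_pages, uint64_t wired_pages)
{
    /* One event ID per counter group; PGCNT1 is used here as an example. */
    KDBG(KDBG_EVENTID(DBG_MACH, DBG_MACH_MEMINFO, DBG_MEMINFO_PGCNT1),
        free_pages, active_pages, inactive_pages, wired_pages);
}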
+#define IMP_QUERY_GPU_ROLE 0x1F /* Driver queries GPU Role */ + +/* DBG_IMPORTANCE subclasses 0x20 - 0x50 are reserved for task policy flavors */ /* thread and task attributes */ #define IMP_TASK_POLICY_DARWIN_BG 0x21 @@ -1062,6 +1103,8 @@ __BEGIN_DECLS #define IMP_TASK_POLICY_IOTIER_KEVENT_OVERRIDE 0x3F #define IMP_TASK_POLICY_WI_DRIVEN 0x40 +#define IMP_TASK_POLICY_RUNAWAY_MITIGATION 0x41 + /* Codes for IMP_ASSERTION */ #define IMP_HOLD 0x2 /* Task holds a boost assertion */ #define IMP_DROP 0x4 /* Task drops a boost assertion */ @@ -1203,6 +1246,8 @@ __BEGIN_DECLS #define COREDUETDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_COREDUET, code) #define POWERDDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_POWERD, code) +#define MEMINFO_CODE(code) KDBG_EVENTID(DBG_MACH, DBG_MACH_MEMINFO, code) + // VFS lookup events #define VFS_LOOKUP (FSDBG_CODE(DBG_FSRW,36)) #define VFS_LOOKUP_DONE (FSDBG_CODE(DBG_FSRW,39)) diff --git a/bsd/sys/kdebug_common.h b/bsd/sys/kdebug_common.h index 31226ccf3..dc4aab5ac 100644 --- a/bsd/sys/kdebug_common.h +++ b/bsd/sys/kdebug_common.h @@ -46,7 +46,7 @@ #endif // defined(__x86_64__) #define TRIAGE_EVENTS_PER_STORAGE_UNIT 128 -#define TRIAGE_MIN_STORAGE_UNITS_PER_CPU 1 +#define TRIAGE_MIN_STORAGE_UNITS_PER_CPU 2 #define TRACE_EVENTS_PER_STORAGE_UNIT 2048 #define TRACE_MIN_STORAGE_UNITS_PER_CPU 4 @@ -164,28 +164,17 @@ void kdebug_lck_init(void); int kdebug_storage_lock(struct kd_control *ctl); void kdebug_storage_unlock(struct kd_control *ctl, int intrs_en); -/* - * Disable wrapping and return true if trace wrapped, false otherwise. - */ -bool kdebug_disable_wrap(struct kd_control *ctl, kdebug_emit_filter_t *old_emit, - kdebug_live_flags_t *old_live_flags); +bool kdebug_storage_alloc( + struct kd_control *kd_ctrl_page, + struct kd_buffer *kd_data_page, + int cpu); -int create_buffers_triage(void); +void create_buffers_triage(void); int create_buffers(struct kd_control *ctl, struct kd_buffer *buf, vm_tag_t tag); void delete_buffers(struct kd_control *ctl, struct kd_buffer *buf); -void kernel_debug_write(struct kd_control *ctl, struct kd_buffer *buf, - struct kd_record kd_rec); - -int kernel_debug_read(struct kd_control *ctl, struct kd_buffer *buf, - user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, - uint32_t file_version); - -extern int RAW_file_written; -#define RAW_FLUSH_SIZE (2 * 1024 * 1024) - void commpage_update_kdebug_state(void); #endif /* BSD_SYS_KDEBUG_COMMON_H */ diff --git a/bsd/sys/kdebug_kernel.h b/bsd/sys/kdebug_kernel.h index 817d10d21..44df424ae 100644 --- a/bsd/sys/kdebug_kernel.h +++ b/bsd/sys/kdebug_kernel.h @@ -297,7 +297,7 @@ extern unsigned int kdebug_enable; do { \ if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ kernel_debug_flags((x), (uintptr_t)(a), (uintptr_t)(b), \ - (uintptr_t)(c), (uintptr_t)(d), KDBG_FLAG_NOPROCFILT); \ + (uintptr_t)(c), (uintptr_t)(d), KDBG_NON_PROCESS); \ } \ } while (0) #else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ @@ -420,11 +420,13 @@ void kernel_debug(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, void kernel_debug1(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); -#define KDBG_FLAG_FILTERED 0x01 -#define KDBG_FLAG_NOPROCFILT 0x02 +__options_decl(kdebug_emit_flags_t, uint64_t, { + KDBG_FILTER_ONLY = 0x01, + KDBG_NON_PROCESS = 0x02, +}); void kernel_debug_flags(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uint64_t flags); + uintptr_t arg3, uintptr_t arg4, kdebug_emit_flags_t flags); void 
kernel_debug_filtered(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4); diff --git a/bsd/sys/kdebug_private.h b/bsd/sys/kdebug_private.h index 171558fb4..824eefb4a 100644 --- a/bsd/sys/kdebug_private.h +++ b/bsd/sys/kdebug_private.h @@ -204,6 +204,7 @@ __BEGIN_DECLS // DBG_SECURITY private subclasses #define DBG_SEC_SSMA 0x02 +#define DBG_SEC_ERM 0x03 #define SKYWALKDBG_CODE(SubClass, code) KDBG_CODE(DBG_DLIL, SubClass, code) #define PPTDBG_CODE(SubClass, code) KDBG_CODE(DBG_PPT, SubClass, code) diff --git a/bsd/sys/kdebug_triage.h b/bsd/sys/kdebug_triage.h index 4167a467b..d750b4e58 100644 --- a/bsd/sys/kdebug_triage.h +++ b/bsd/sys/kdebug_triage.h @@ -23,8 +23,6 @@ #ifndef BSD_SYS_KDEBUG_TRIAGE_H #define BSD_SYS_KDEBUG_TRIAGE_H -void delete_buffers_triage(void); - #define KDBG_TRIAGE_CLASS_MASK (0xff000000) #define KDBG_TRIAGE_CLASS_OFFSET (24) #define KDBG_TRIAGE_CLASS_MAX (0xff) @@ -112,6 +110,9 @@ enum vm_subsys_error_codes { KDBG_TRIAGE_VM_ALLOCATE_KERNEL_VMMAPENTER_ERROR, KDBG_TRIAGE_VM_CODE_SIGNING, KDBG_TRIAGE_VM_FAULTS_DISABLED, + KDBG_TRIAGE_VM_IOPL_ON_EXEC_PAGE, + KDBG_TRIAGE_VM_EXEC_ON_IOPL_PAGE, + KDBG_TRIAGE_VM_UPL_WRITE_ON_EXEC_REGION, KDBG_TRIAGE_VM_MAX }; #define VM_MAX_TRIAGE_STRINGS (KDBG_TRIAGE_VM_MAX) diff --git a/bsd/sys/kern_control.h b/bsd/sys/kern_control.h index d667bae17..365a4afe1 100644 --- a/bsd/sys/kern_control.h +++ b/bsd/sys/kern_control.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004, 2012-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2004, 2012-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -147,58 +147,6 @@ struct sockaddr_ctl { u_int32_t sc_reserved[5]; }; -#ifdef PRIVATE - -struct xkctl_reg { - u_int32_t xkr_len; - u_int32_t xkr_kind; - u_int32_t xkr_id; - u_int32_t xkr_reg_unit; - u_int32_t xkr_flags; - u_int64_t xkr_kctlref; - u_int32_t xkr_recvbufsize; - u_int32_t xkr_sendbufsize; - u_int32_t xkr_lastunit; - u_int32_t xkr_pcbcount; - u_int64_t xkr_connect; - u_int64_t xkr_disconnect; - u_int64_t xkr_send; - u_int64_t xkr_send_list; - u_int64_t xkr_setopt; - u_int64_t xkr_getopt; - u_int64_t xkr_rcvd; - char xkr_name[MAX_KCTL_NAME]; -}; - -struct xkctlpcb { - u_int32_t xkp_len; - u_int32_t xkp_kind; - u_int64_t xkp_kctpcb; - u_int32_t xkp_unit; - u_int32_t xkp_kctlid; - u_int64_t xkp_kctlref; - char xkp_kctlname[MAX_KCTL_NAME]; -}; - -struct kctlstat { - u_int64_t kcs_reg_total __attribute__((aligned(8))); - u_int64_t kcs_reg_count __attribute__((aligned(8))); - u_int64_t kcs_pcbcount __attribute__((aligned(8))); - u_int64_t kcs_gencnt __attribute__((aligned(8))); - u_int64_t kcs_connections __attribute__((aligned(8))); - u_int64_t kcs_conn_fail __attribute__((aligned(8))); - u_int64_t kcs_send_fail __attribute__((aligned(8))); - u_int64_t kcs_send_list_fail __attribute__((aligned(8))); - u_int64_t kcs_enqueue_fail __attribute__((aligned(8))); - u_int64_t kcs_enqueue_fullsock __attribute__((aligned(8))); - u_int64_t kcs_bad_kctlref __attribute__((aligned(8))); - u_int64_t kcs_tbl_size_too_big __attribute__((aligned(8))); - u_int64_t kcs_enqdata_mb_alloc_fail __attribute__((aligned(8))); - u_int64_t kcs_enqdata_sbappend_fail __attribute__((aligned(8))); -}; - -#endif /* PRIVATE */ - #ifdef KERNEL #include @@ -238,29 +186,6 @@ typedef void * kern_ctl_ref; */ #define CTL_FLAG_REG_SOCK_STREAM 0x4 -#ifdef KERNEL_PRIVATE -/*! 
- * @defined CTL_FLAG_REG_EXTENDED - * @discussion This flag indicates that this kernel control utilizes the - * the extended fields within the kern_ctl_reg structure. - */ -#define CTL_FLAG_REG_EXTENDED 0x8 - -/*! - * @defined CTL_FLAG_REG_CRIT - * @discussion This flag indicates that this kernel control utilizes the - * the extended fields within the kern_ctl_reg structure. - */ -#define CTL_FLAG_REG_CRIT 0x10 - -/*! - * @defined CTL_FLAG_REG_SETUP - * @discussion This flag indicates that this kernel control utilizes the - * the setup callback field within the kern_ctl_reg structure. - */ -#define CTL_FLAG_REG_SETUP 0x20 -#endif /* KERNEL_PRIVATE */ - /* Data flags for controllers */ /*! * @defined CTL_DATA_NOWAKEUP @@ -279,16 +204,6 @@ typedef void * kern_ctl_ref; */ #define CTL_DATA_EOR 0x2 -#ifdef KERNEL_PRIVATE -/*! - * @defined CTL_DATA_CRIT - * @discussion This flag indicates the data is critical to the client - * and that it needs to be forced into the socket buffer - * by resizing it if needed. - */ -#define CTL_DATA_CRIT 0x4 -#endif /* KERNEL_PRIVATE */ - __BEGIN_DECLS /*! @@ -390,84 +305,20 @@ typedef errno_t (*ctl_getopt_func)(kern_ctl_ref kctlref, u_int32_t unit, void *u int opt, void *data, size_t *len); #ifdef KERNEL_PRIVATE -/*! - * @typedef ctl_rcvd_func - * @discussion The ctl_rcvd_func is called when the client reads data from - * the kernel control socket. The kernel control can use this callback - * in combination with ctl_getenqueuespace() to avoid overflowing - * the socket's receive buffer. When ctl_getenqueuespace() returns - * 0 or ctl_enqueuedata()/ctl_enqueuembuf() return ENOBUFS, the - * kernel control can wait until this callback is called before - * trying to enqueue the data again. - * @param kctlref The control ref of the kernel control. - * @param unit The unit number of the kernel control instance. - * @param unitinfo The user-defined private data initialized by the - * ctl_connect_func callback. - * @param flags The recv flags. See the recv(2) man page. +/* + * KERN_CTL_REG_OPAQUE means that kern_ctl_reg is an opaque structure + * in the public header, and redeclared in kern_control_private.h. */ -typedef void (*ctl_rcvd_func)(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, - int flags); - -/*! - * @typedef ctl_send_list_func - * @discussion The ctl_send_list_func is used to receive data sent from - * the client to the kernel control. - * @param kctlref The control ref of the kernel control. - * @param unit The unit number of the kernel control instance the client has - * connected to. - * @param unitinfo The user-defined private data initialized by the - * ctl_connect_func callback. - * @param m The data sent by the client to the kernel control in an - * mbuf packet chain. Your function is responsible for releasing - * mbuf packet chain. - * @param flags The flags specified by the client when calling - * send/sendto/sendmsg (MSG_OOB/MSG_DONTROUTE). - */ -typedef errno_t (*ctl_send_list_func)(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, - mbuf_t m, int flags); - -/*! - * @typedef ctl_bind_func - * @discussion The ctl_bind_func is an optional function that allows the client - * to set up their unitinfo prior to connecting. - * @param kctlref The control ref for the kernel control the client is - * binding to. - * @param sac The address used to connect to this control. The field sc_unit - * contains the unit number of the kernel control instance the client is - * binding to. 
If CTL_FLAG_REG_ID_UNIT was set when the kernel control - * was registered, sc_unit is the ctl_unit of the kern_ctl_reg structure. - * If CTL_FLAG_REG_ID_UNIT was not set when the kernel control was - * registered, sc_unit is the dynamically allocated unit number of - * the new kernel control instance that is used for this connection. - * @param unitinfo A placeholder for a pointer to the optional user-defined - * private data associated with this kernel control instance. This - * opaque info will be provided to the user when the rest of the - * callback routines are executed. For example, it can be used - * to pass a pointer to an instance-specific data structure in - * order for the user to keep track of the states related to this - * kernel control instance. - */ -typedef errno_t (*ctl_bind_func)(kern_ctl_ref kctlref, - struct sockaddr_ctl *sac, - void **unitinfo); - -/*! - * @typedef ctl_setup_func - * @discussion The ctl_setup_func is an optional function that allows the client - * to pick a unit number in the case that the caller hasn't specified one - * @param unit A placeholder for a pointer to the unit number that is selected with - * this kernel control instance - * @param unitinfo A placeholder for a pointer to the optional user-defined - * private data associated with this kernel control instance. This - * opaque info will be provided to the user when the rest of the - * callback routines are executed. For example, it can be used - * to pass a pointer to an instance-specific data structure in - * order for the user to keep track of the states related to this - * kernel control instance. - */ -typedef errno_t (*ctl_setup_func)(u_int32_t *unit, void **unitinfo); +#define KERN_CTL_REG_OPAQUE #endif /* KERNEL_PRIVATE */ - +#ifdef KERN_CTL_REG_OPAQUE +/*! + * @struct kern_ctl_reg + * @discussion This structure defines the properties of a kernel + * control being registered. + */ +struct kern_ctl_reg; +#else /*! * @struct kern_ctl_reg * @discussion This structure defines the properties of a kernel @@ -519,13 +370,8 @@ struct kern_ctl_reg { ctl_send_func ctl_send; ctl_setopt_func ctl_setopt; ctl_getopt_func ctl_getopt; -#ifdef KERNEL_PRIVATE - ctl_rcvd_func ctl_rcvd; /* Only valid if CTL_FLAG_REG_EXTENDED is set */ - ctl_send_list_func ctl_send_list;/* Only valid if CTL_FLAG_REG_EXTENDED is set */ - ctl_bind_func ctl_bind; - ctl_setup_func ctl_setup; -#endif /* KERNEL_PRIVATE */ }; +#endif /* KERN_CTL_REG_OPAQUE */ /*! * @function ctl_register @@ -595,43 +441,6 @@ errno_t errno_t ctl_enqueuembuf(kern_ctl_ref kctlref, u_int32_t unit, mbuf_t m, u_int32_t flags); -#ifdef PRIVATE -/*! - * @function ctl_enqueuembuf_list - * @discussion Send data stored in an mbuf packet chain from the kernel - * control to the client. The caller is responsible for freeing - * the mbuf chain if ctl_enqueuembuf returns an error. - * Not valid if ctl_flags contains CTL_FLAG_REG_SOCK_STREAM. - * @param kctlref The control reference of the kernel control. - * @param unit The unit number of the kernel control instance. - * @param m_list An mbuf chain containing the data to send to the client. - * @param flags Send flags. CTL_DATA_NOWAKEUP is - * the only supported flags. - * @param m_remain A pointer to the list of mbuf packets in the chain that - * could not be enqueued. - * @result 0 - Data was enqueued to be read by the client. - * EINVAL - Invalid parameters. - * ENOBUFS - The queue is full. 
- */ -errno_t -ctl_enqueuembuf_list(kern_ctl_ref kctlref, u_int32_t unit, mbuf_t m_list, - u_int32_t flags, mbuf_t *m_remain); - -/*! - * @function ctl_getenqueuepacketcount - * @discussion Retrieve the number of packets in the socket - * receive buffer. - * @param kctlref The control reference of the kernel control. - * @param unit The unit number of the kernel control instance. - * @param pcnt The address where to return the current count. - * @result 0 - Success; the packet count is returned to caller. - * EINVAL - Invalid parameters. - */ -errno_t -ctl_getenqueuepacketcount(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *pcnt); - -#endif /* PRIVATE */ - /*! * @function ctl_getenqueuespace * @discussion Retrieve the amount of space currently available for data to be sent @@ -661,28 +470,11 @@ ctl_getenqueuespace(kern_ctl_ref kctlref, u_int32_t unit, size_t *space); errno_t ctl_getenqueuereadable(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *difference); -#ifdef KERNEL_PRIVATE - -#include -#include - -/* - * internal structure maintained for each register controller - */ -struct ctl_cb; -struct kctl; -struct socket; -struct socket_info; - -void kctl_fill_socketinfo(struct socket *, struct socket_info *); - -u_int32_t ctl_id_by_name(const char *); -errno_t ctl_name_by_id(u_int32_t, char *__counted_by(maxsize), size_t maxsize); - -extern const u_int32_t ctl_maxunit; -#endif /* KERNEL_PRIVATE */ - __END_DECLS #endif /* KERNEL */ +#if defined(PRIVATE) && !defined(MODULES_SUPPORTED) +#include +#endif /* PRIVATE && !MODULES_SUPPORTED */ + #endif /* KPI_KERN_CONTROL_H */ diff --git a/bsd/sys/kern_control_private.h b/bsd/sys/kern_control_private.h new file mode 100644 index 000000000..035da6063 --- /dev/null +++ b/bsd/sys/kern_control_private.h @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*! + * @header kern_control.h + * This header defines an API to communicate between a kernel + * extension and a process outside of the kernel. 
+ */ + +#ifndef KPI_KERN_CONTROL_PRIVATE_H +#define KPI_KERN_CONTROL_PRIVATE_H + +#include + +struct xkctl_reg { + u_int32_t xkr_len; + u_int32_t xkr_kind; + u_int32_t xkr_id; + u_int32_t xkr_reg_unit; + u_int32_t xkr_flags; + u_int64_t xkr_kctlref; + u_int32_t xkr_recvbufsize; + u_int32_t xkr_sendbufsize; + u_int32_t xkr_lastunit; + u_int32_t xkr_pcbcount; + u_int64_t xkr_connect; + u_int64_t xkr_disconnect; + u_int64_t xkr_send; + u_int64_t xkr_send_list; + u_int64_t xkr_setopt; + u_int64_t xkr_getopt; + u_int64_t xkr_rcvd; + char xkr_name[MAX_KCTL_NAME]; +}; + +struct xkctlpcb { + u_int32_t xkp_len; + u_int32_t xkp_kind; + u_int64_t xkp_kctpcb; + u_int32_t xkp_unit; + u_int32_t xkp_kctlid; + u_int64_t xkp_kctlref; + char xkp_kctlname[MAX_KCTL_NAME]; +}; + +struct kctlstat { + u_int64_t kcs_reg_total __attribute__((aligned(8))); + u_int64_t kcs_reg_count __attribute__((aligned(8))); + u_int64_t kcs_pcbcount __attribute__((aligned(8))); + u_int64_t kcs_gencnt __attribute__((aligned(8))); + u_int64_t kcs_connections __attribute__((aligned(8))); + u_int64_t kcs_conn_fail __attribute__((aligned(8))); + u_int64_t kcs_send_fail __attribute__((aligned(8))); + u_int64_t kcs_send_list_fail __attribute__((aligned(8))); + u_int64_t kcs_enqueue_fail __attribute__((aligned(8))); + u_int64_t kcs_enqueue_fullsock __attribute__((aligned(8))); + u_int64_t kcs_bad_kctlref __attribute__((aligned(8))); + u_int64_t kcs_tbl_size_too_big __attribute__((aligned(8))); + u_int64_t kcs_enqdata_mb_alloc_fail __attribute__((aligned(8))); + u_int64_t kcs_enqdata_sbappend_fail __attribute__((aligned(8))); +}; + +#ifdef KERNEL + +#ifdef KERNEL_PRIVATE +/*! + * @defined CTL_FLAG_REG_EXTENDED + * @discussion This flag indicates that this kernel control utilizes the + * the extended fields within the kern_ctl_reg structure. + */ +#define CTL_FLAG_REG_EXTENDED 0x8 + +/*! + * @defined CTL_FLAG_REG_CRIT + * @discussion This flag indicates that this kernel control utilizes the + * the extended fields within the kern_ctl_reg structure. + */ +#define CTL_FLAG_REG_CRIT 0x10 + +/*! + * @defined CTL_FLAG_REG_SETUP + * @discussion This flag indicates that this kernel control utilizes the + * the setup callback field within the kern_ctl_reg structure. + */ +#define CTL_FLAG_REG_SETUP 0x20 + +/*! + * @defined CTL_DATA_CRIT + * @discussion This flag indicates the data is critical to the client + * and that it needs to be forced into the socket buffer + * by resizing it if needed. + */ +#define CTL_DATA_CRIT 0x4 +#endif /* KERNEL_PRIVATE */ + +__BEGIN_DECLS + +#ifdef KERNEL_PRIVATE +/*! + * @typedef ctl_rcvd_func + * @discussion The ctl_rcvd_func is called when the client reads data from + * the kernel control socket. The kernel control can use this callback + * in combination with ctl_getenqueuespace() to avoid overflowing + * the socket's receive buffer. When ctl_getenqueuespace() returns + * 0 or ctl_enqueuedata()/ctl_enqueuembuf() return ENOBUFS, the + * kernel control can wait until this callback is called before + * trying to enqueue the data again. + * @param kctlref The control ref of the kernel control. + * @param unit The unit number of the kernel control instance. + * @param unitinfo The user-defined private data initialized by the + * ctl_connect_func callback. + * @param flags The recv flags. See the recv(2) man page. + */ +typedef void (*ctl_rcvd_func)(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, + int flags); + +/*! 
+ * @typedef ctl_send_list_func + * @discussion The ctl_send_list_func is used to receive data sent from + * the client to the kernel control. + * @param kctlref The control ref of the kernel control. + * @param unit The unit number of the kernel control instance the client has + * connected to. + * @param unitinfo The user-defined private data initialized by the + * ctl_connect_func callback. + * @param m The data sent by the client to the kernel control in an + * mbuf packet chain. Your function is responsible for releasing + * mbuf packet chain. + * @param flags The flags specified by the client when calling + * send/sendto/sendmsg (MSG_OOB/MSG_DONTROUTE). + */ +typedef errno_t (*ctl_send_list_func)(kern_ctl_ref kctlref, u_int32_t unit, void *unitinfo, + mbuf_t m, int flags); + +/*! + * @typedef ctl_bind_func + * @discussion The ctl_bind_func is an optional function that allows the client + * to set up their unitinfo prior to connecting. + * @param kctlref The control ref for the kernel control the client is + * binding to. + * @param sac The address used to connect to this control. The field sc_unit + * contains the unit number of the kernel control instance the client is + * binding to. If CTL_FLAG_REG_ID_UNIT was set when the kernel control + * was registered, sc_unit is the ctl_unit of the kern_ctl_reg structure. + * If CTL_FLAG_REG_ID_UNIT was not set when the kernel control was + * registered, sc_unit is the dynamically allocated unit number of + * the new kernel control instance that is used for this connection. + * @param unitinfo A placeholder for a pointer to the optional user-defined + * private data associated with this kernel control instance. This + * opaque info will be provided to the user when the rest of the + * callback routines are executed. For example, it can be used + * to pass a pointer to an instance-specific data structure in + * order for the user to keep track of the states related to this + * kernel control instance. + */ +typedef errno_t (*ctl_bind_func)(kern_ctl_ref kctlref, + struct sockaddr_ctl *sac, + void **unitinfo); + +/*! + * @typedef ctl_setup_func + * @discussion The ctl_setup_func is an optional function that allows the client + * to pick a unit number in the case that the caller hasn't specified one + * @param unit A placeholder for a pointer to the unit number that is selected with + * this kernel control instance + * @param unitinfo A placeholder for a pointer to the optional user-defined + * private data associated with this kernel control instance. This + * opaque info will be provided to the user when the rest of the + * callback routines are executed. For example, it can be used + * to pass a pointer to an instance-specific data structure in + * order for the user to keep track of the states related to this + * kernel control instance. + */ +typedef errno_t (*ctl_setup_func)(u_int32_t *unit, void **unitinfo); +#endif /* KERNEL_PRIVATE */ + +#ifdef KERN_CTL_REG_OPAQUE +/*! + * @struct kern_ctl_reg + * @discussion This structure defines the properties of a kernel + * control being registered. + * @field ctl_name A Bundle ID string of up to MAX_KCTL_NAME bytes (including the ending zero). + * This string should not be empty. + * @field ctl_id The control ID may be dynamically assigned or it can be a + * 32-bit creator code assigned by DTS. + * For a DTS assigned creator code the CTL_FLAG_REG_ID_UNIT flag must be set. + * For a dynamically assigned control ID, do not set the CTL_FLAG_REG_ID_UNIT flag. 
+ * The value of the dynamically assigned control ID is set to this field + * when the registration succeeds. + * @field ctl_unit A separate unit number to register multiple units that + * share the same control ID with DTS assigned creator code when + * the CTL_FLAG_REG_ID_UNIT flag is set. + * This field is ignored for a dynamically assigned control ID. + * @field ctl_flags CTL_FLAG_PRIVILEGED and/or CTL_FLAG_REG_ID_UNIT. + * @field ctl_sendsize Override the default send size. If set to zero, + * the default send size will be used, and this default value + * is set to this field to be retrieved by the caller. + * @field ctl_recvsize Override the default receive size. If set to + * zero, the default receive size will be used, and this default value + * is set to this field to be retrieved by the caller. + * @field ctl_connect Specify the function to be called whenever a client + * connects to the kernel control. This field must be specified. + * @field ctl_disconnect Specify a function to be called whenever a + * client disconnects from the kernel control. + * @field ctl_send Specify a function to handle data send from the + * client to the kernel control. + * @field ctl_setopt Specify a function to handle set socket option + * operations for the kernel control. + * @field ctl_getopt Specify a function to handle get socket option + * operations for the kernel control. + */ +struct kern_ctl_reg { + /* control information */ + char ctl_name[MAX_KCTL_NAME]; + u_int32_t ctl_id; + u_int32_t ctl_unit; + + /* control settings */ + u_int32_t ctl_flags; + u_int32_t ctl_sendsize; + u_int32_t ctl_recvsize; + + /* Dispatch functions */ + ctl_connect_func ctl_connect; + ctl_disconnect_func ctl_disconnect; + ctl_send_func ctl_send; + ctl_setopt_func ctl_setopt; + ctl_getopt_func ctl_getopt; + ctl_rcvd_func ctl_rcvd; /* Only valid if CTL_FLAG_REG_EXTENDED is set */ + ctl_send_list_func ctl_send_list;/* Only valid if CTL_FLAG_REG_EXTENDED is set */ + ctl_bind_func ctl_bind; + ctl_setup_func ctl_setup; +}; +#endif /* KERN_CTL_REG_OPAQUE */ + +/*! + * @function ctl_enqueuembuf_list + * @discussion Send data stored in an mbuf packet chain from the kernel + * control to the client. The caller is responsible for freeing + * the mbuf chain if ctl_enqueuembuf returns an error. + * Not valid if ctl_flags contains CTL_FLAG_REG_SOCK_STREAM. + * @param kctlref The control reference of the kernel control. + * @param unit The unit number of the kernel control instance. + * @param m_list An mbuf chain containing the data to send to the client. + * @param flags Send flags. CTL_DATA_NOWAKEUP is + * the only supported flags. + * @param m_remain A pointer to the list of mbuf packets in the chain that + * could not be enqueued. + * @result 0 - Data was enqueued to be read by the client. + * EINVAL - Invalid parameters. + * ENOBUFS - The queue is full. + */ +errno_t +ctl_enqueuembuf_list(kern_ctl_ref kctlref, u_int32_t unit, mbuf_t m_list, + u_int32_t flags, mbuf_t *m_remain); + +/*! + * @function ctl_getenqueuepacketcount + * @discussion Retrieve the number of packets in the socket + * receive buffer. + * @param kctlref The control reference of the kernel control. + * @param unit The unit number of the kernel control instance. + * @param pcnt The address where to return the current count. + * @result 0 - Success; the packet count is returned to caller. + * EINVAL - Invalid parameters. 
+ */ +errno_t +ctl_getenqueuepacketcount(kern_ctl_ref kctlref, u_int32_t unit, u_int32_t *pcnt); + +#ifdef KERNEL_PRIVATE + +#include +#include + +/* + * internal structure maintained for each register controller + */ +struct ctl_cb; +struct kctl; +struct socket; +struct socket_info; + +void kctl_fill_socketinfo(struct socket *, struct socket_info *); + +u_int32_t ctl_id_by_name(const char *); +errno_t ctl_name_by_id(u_int32_t, char *__counted_by(maxsize), size_t maxsize); + +extern const u_int32_t ctl_maxunit; +#endif /* KERNEL_PRIVATE */ + +__END_DECLS +#endif /* KERNEL */ + +#endif /* KPI_KERN_CONTROL_PRIVATE_H */ diff --git a/bsd/sys/kern_event.h b/bsd/sys/kern_event.h index 9930d2d42..f4640849d 100644 --- a/bsd/sys/kern_event.h +++ b/bsd/sys/kern_event.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2021 Apple Inc. All rights reserved. + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -243,27 +243,6 @@ struct kev_vendor_code { */ #define SIOCGKEVVENDOR _IOWR('e', 4, struct kev_vendor_code) -#ifdef PRIVATE -struct xkevtpcb { - u_int32_t kep_len; - u_int32_t kep_kind; - u_int64_t kep_evtpcb; - u_int32_t kep_vendor_code_filter; - u_int32_t kep_class_filter; - u_int32_t kep_subclass_filter; -}; - -struct kevtstat { - u_int64_t kes_pcbcount __attribute__((aligned(8))); - u_int64_t kes_gencnt __attribute__((aligned(8))); - u_int64_t kes_badvendor __attribute__((aligned(8))); - u_int64_t kes_toobig __attribute__((aligned(8))); - u_int64_t kes_nomem __attribute__((aligned(8))); - u_int64_t kes_fullsock __attribute__((aligned(8))); - u_int64_t kes_posted __attribute__((aligned(8))); -}; -#endif /* PRIVATE */ - #ifdef KERNEL /*! * @define N_KEV_VECTORS @@ -327,27 +306,8 @@ errno_t kev_vendor_code_find(const char *vendor_string, u_int32_t *vendor_code); */ errno_t kev_msg_post(struct kev_msg *event_msg); -#ifdef PRIVATE -/* - * Internal version of kev_msg_post. Allows posting Apple vendor code kernel - * events. - */ -int kev_post_msg(struct kev_msg *event); -int kev_post_msg_nowait(struct kev_msg *event); - -LIST_HEAD(kern_event_head, kern_event_pcb); - -struct kern_event_pcb { - decl_lck_mtx_data(, evp_mtx); /* per-socket mutex */ - LIST_ENTRY(kern_event_pcb) evp_link; /* glue on list of all PCBs */ - struct socket *evp_socket; /* pointer back to socket */ - u_int32_t evp_vendor_code_filter; - u_int32_t evp_class_filter; - u_int32_t evp_subclass_filter; -}; - -#define sotoevpcb(so) ((struct kern_event_pcb *)((so)->so_pcb)) - -#endif /* PRIVATE */ #endif /* KERNEL */ +#if defined(PRIVATE) && !defined(MODULES_SUPPORTED) +#include +#endif /* PRIVATE && !MODULES_SUPPORTED */ #endif /* SYS_KERN_EVENT_H */ diff --git a/bsd/sys/kern_event_private.h b/bsd/sys/kern_event_private.h new file mode 100644 index 000000000..40f97bbd6 --- /dev/null +++ b/bsd/sys/kern_event_private.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */ +/*! + * @header kern_event.h + * This header defines in-kernel functions for generating kernel events as + * well as functions for receiving kernel events using a kernel event + * socket. + */ + +#ifndef SYS_KERN_EVENT_PRIVATE_H +#define SYS_KERN_EVENT_PRIVATE_H + +#include + +struct xkevtpcb { + u_int32_t kep_len; + u_int32_t kep_kind; + u_int64_t kep_evtpcb; + u_int32_t kep_vendor_code_filter; + u_int32_t kep_class_filter; + u_int32_t kep_subclass_filter; +}; + +struct kevtstat { + u_int64_t kes_pcbcount __attribute__((aligned(8))); + u_int64_t kes_gencnt __attribute__((aligned(8))); + u_int64_t kes_badvendor __attribute__((aligned(8))); + u_int64_t kes_toobig __attribute__((aligned(8))); + u_int64_t kes_nomem __attribute__((aligned(8))); + u_int64_t kes_fullsock __attribute__((aligned(8))); + u_int64_t kes_posted __attribute__((aligned(8))); +}; + +#ifdef KERNEL +/* + * Internal version of kev_msg_post. Allows posting Apple vendor code kernel + * events. 
+ */ +int kev_post_msg(struct kev_msg *event); +int kev_post_msg_nowait(struct kev_msg *event); + +LIST_HEAD(kern_event_head, kern_event_pcb); + +struct kern_event_pcb { + decl_lck_mtx_data(, evp_mtx); /* per-socket mutex */ + LIST_ENTRY(kern_event_pcb) evp_link; /* glue on list of all PCBs */ + struct socket *evp_socket; /* pointer back to socket */ + u_int32_t evp_vendor_code_filter; + u_int32_t evp_class_filter; + u_int32_t evp_subclass_filter; +}; + +#define sotoevpcb(so) ((struct kern_event_pcb *)((so)->so_pcb)) + +#endif /* KERNEL */ +#endif /* SYS_KERN_EVENT_PRIVATE_H */ diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index f09edf2b5..fb5aa97f8 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -233,6 +233,7 @@ typedef struct jetsam_snapshot_entry { uint64_t csflags; uint32_t cs_trust_level; uint64_t jse_neural_nofootprint_total_pages; + uint64_t jse_prio_start; /* absolute time process moved to current priority */ } memorystatus_jetsam_snapshot_entry_t; typedef struct jetsam_snapshot { @@ -247,9 +248,6 @@ typedef struct jetsam_snapshot { /* TODO - deprecate; see */ #define kMaxSnapshotEntries 192 -#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries -#define JETSAM_SNAPSHOT_TIMEOUT_SECS 30 - /* State */ #define kMemorystatusSuspended 0x001 #define kMemorystatusFrozen 0x002 @@ -287,7 +285,8 @@ typedef struct jetsam_snapshot { #define JETSAM_REASON_LOWSWAP 13 #define JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE 14 #define JETSAM_REASON_MEMORY_VMPAGEOUT_STARVATION 15 -#define JETSAM_REASON_MEMORY_LONGIDLE_EXIT 17 /* Skips 16 on purpose to make room for conclave limit reason */ +#define JETSAM_REASON_MEMORY_CONCLAVELIMIT 16 +#define JETSAM_REASON_MEMORY_LONGIDLE_EXIT 17 #define JETSAM_REASON_MEMORYSTATUS_MAX JETSAM_REASON_MEMORY_LONGIDLE_EXIT /* non-memorystatus jetsam reasons */ @@ -313,6 +312,7 @@ typedef enum { kMemorystatusKilledLowSwap = JETSAM_REASON_LOWSWAP, kMemorystatusKilledSustainedPressure = JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE, kMemorystatusKilledVMPageoutStarvation = JETSAM_REASON_MEMORY_VMPAGEOUT_STARVATION, + kMemorystatusKilledConclaveLimit = JETSAM_REASON_MEMORY_CONCLAVELIMIT, } memorystatus_kill_cause_t; /* @@ -379,11 +379,14 @@ __END_DECLS #define MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_NAMES 32 /* Get jetsam snapshot zprint names array */ #define MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_INFO 33 /* Get jetsam snapshot zprint zone info */ #define MEMORYSTATUS_CMD_GET_JETSAM_ZPRINT_MEMINFO 34 /* Get jetsam snapshot zprint wired memory info */ +#define MEMORYSTATUS_CMD_REARM_MEMLIMIT 36 /* Re-arm memory limit (EXC_RESOURCE) */ #define MEMORYSTATUS_CMD_GET_PRIORITY_LIST_V2 35 /* Get priority list with v2 struct */ #define MEMORYSTATUS_CMD_GET_KILL_COUNTS 37 /* Get kill counts */ +#define MEMORYSTATUS_CMD_GET_CONCLAVE_LIMIT 38 /* Get the conclave memory limit */ + /* Commands that act on a group of processes */ #define MEMORYSTATUS_CMD_GRP_SET_PROPERTIES 100 @@ -395,8 +398,12 @@ __END_DECLS #define MEMORYSTATUS_CMD_TEST_JETSAM_SORT 1001 /* Select priority band sort order */ -#define JETSAM_SORT_NOSORT 0 -#define JETSAM_SORT_DEFAULT 1 +__enum_decl(memorystatus_jetsam_sort_order_t, int, { + JETSAM_SORT_NONE = 0x0, /* No sort */ + JETSAM_SORT_LRU = 0x1, /* Sort by LRU by coalition leader */ + JETSAM_SORT_FOOTPRINT = 0x2, /* Sort by footprint by coalition leader */ + JETSAM_SORT_FOOTPRINT_NOCOAL = 0x3, /* Sort by footprint ignoring coalitions */ +}); #endif /* PRIVATE */ @@ -418,6 +425,9 @@ __END_DECLS #define 
MEMORYSTATUS_FLAGS_GRP_SET_FREEZE_PRIORITY 0x100 /* Set a new ordered list of freeze candidates */ #define MEMORYSTATUS_FLAGS_GRP_SET_DEMOTE_PRIORITY 0x200 /* Set a new ordered list of demote candidates */ + +#define MEMORYSTATUS_FLAGS_REARM_ACTIVE 0x400 /* Re-arm active limit */ +#define MEMORYSTATUS_FLAGS_REARM_INACTIVE 0x800 /* Re-arm inactive limit */ /* * For use with memorystatus_control: * MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT @@ -658,8 +668,9 @@ void memorystatus_knote_unregister(struct knote *kn); void memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal); void memorystatus_log_diag_threshold_exception(const int diag_threshold_value); void memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal); +void memorystatus_on_conclave_limit_exceeded(const int max_footprint_mb); void proc_memstat_skip(proc_t p, boolean_t set); -void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); +void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit, boolean_t *is_active, boolean_t *is_managed, boolean_t *has_assertion); #if __arm64__ void memorystatus_act_on_legacy_footprint_entitlement(proc_t p, boolean_t footprint_increase); @@ -671,19 +682,11 @@ void memorystatus_act_on_entitled_developer_task_limit(proc_t p); #endif /* CONFIG_MEMORYSTATUS */ int memorystatus_get_pressure_status_kdp(void); -int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index); #if CONFIG_JETSAM extern unsigned int memorystatus_swap_all_apps; -/* - * Wake up the memorystatus thread so it can do async kills. - * The memorystatus thread will keep killing until the system is - * considered healthy. - */ -void memorystatus_thread_wake(void); - void jetsam_on_ledger_cpulimit_exceeded(void); /* diff --git a/bsd/sys/kern_memorystatus_notify.h b/bsd/sys/kern_memorystatus_notify.h index 4d2e9523a..24ee6177e 100644 --- a/bsd/sys/kern_memorystatus_notify.h +++ b/bsd/sys/kern_memorystatus_notify.h @@ -39,7 +39,7 @@ extern vm_pressure_level_t memorystatus_vm_pressure_level; extern _Atomic bool memorystatus_hwm_candidates; -extern unsigned int memorystatus_sustained_pressure_maximum_band; +extern unsigned int memstat_sustained_pressure_max_pri; boolean_t memorystatus_warn_process(const proc_t p, boolean_t is_active, boolean_t is_fatal, boolean_t exceeded); @@ -54,8 +54,7 @@ int memorystatus_low_mem_privileged_listener(uint32_t op_flags); int memorystatus_send_pressure_note(int pid); boolean_t memorystatus_is_foreground_locked(proc_t p); boolean_t memorystatus_bg_pressure_eligible(proc_t p); -void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, - boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); +void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit, boolean_t *is_active, boolean_t *is_managed, boolean_t *has_assertion); void memorystatus_broadcast_jetsam_pressure( vm_pressure_level_t pressure_level); diff --git a/bsd/sys/kern_memorystatus_xnu.h b/bsd/sys/kern_memorystatus_xnu.h index d4b7f0983..f447218fb 100644 --- a/bsd/sys/kern_memorystatus_xnu.h +++ b/bsd/sys/kern_memorystatus_xnu.h @@ -39,6 +39,12 @@ __BEGIN_DECLS /* TODO: migrate other xnu-private interfaces from kern_memorystatus.h */ +/* + * Query if this process is state-managed by RunningBoard. 
+ */ +typedef struct proc * proc_t; +extern bool memorystatus_get_proc_is_managed(proc_t proc); + /* * Return the minimum number of available pages jetsam requires before it * begins killing non-idle processes. This is useful for some pageout diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h index e650b7288..26e9aa244 100644 --- a/bsd/sys/kpi_mbuf.h +++ b/bsd/sys/kpi_mbuf.h @@ -55,6 +55,23 @@ #define __NKE_API_DEPRECATED #endif /* PRIVATE */ +#ifdef KERNEL_PRIVATE +#include +#if __has_feature(attribute_unavailable_with_message) +#define __EXTENSION_ONLY_KPI_DEPRECATED_BY(REPLACEMENT) \ +__attribute__((__unavailable__( \ + "Only available outside of the kernel. Use " #REPLACEMENT " in the kernel instead."))) +#define __BOUNDS_SAFETY_DEPRECATED_BY(REPLACEMENT) +#else /*! __has_feature(attribute_unavailable_with_message) */ +#define __EXTENSION_ONLY_KPI_DEPRECATED_BY(REPLACEMENT) +#define __BOUNDS_SAFETY_DEPRECATED_BY(REPLACEMENT) +#endif /*! __has_feature(attribute_unavailable_with_message) */ +#else /* !KERNEL_PRIVATE */ +#define __EXTENSION_ONLY_KPI_DEPRECATED_BY(REPLACEMENT) +#define __BOUNDS_SAFETY_DEPRECATED_BY(REPLACEMENT) \ +__ptrcheck_unavailable_r(REPLACEMENT) +#endif /* !KERNEL_PRIVATE */ + #ifdef KERNEL_PRIVATE #include #endif /* KERNEL_PRIVATE */ @@ -284,6 +301,36 @@ struct mbuf_stat { __BEGIN_DECLS /* Data access */ + +/*! + * @function mbuf_data_len + * @discussion Returns a pointer to the start of data along with the data length in this mbuf. + * There may be additional data on chained mbufs. The data you're + * looking for may not be virtually contiguous if it spans more + * than one mbuf. In addition, data that is virtually contiguous + * might not be represented by physically contiguous pages; see + * further comments in `mbuf_data_to_physical'. + * If the data structure you want to access stradles multiple + * mbufs in a chain, the useable data length (returned by `*out_len') + * will be smaller than the expected size. + * In this case, either use `mbuf_pullup', which will create + * a new mbuf with the data structure in a congigous buffer, + * or alternatively copy the pieces of the data structure + * from the mbufs comprised by the chain into a separately allocated + * buffer with a sufficient capacity. + * Using `mbuf_pullup' has the advantage of not having to + * copy the data; however if the size of the requred data exceeds + * the maximal mbuf size, `mbuf_pullup' will fail, and free the chain. + * @param mbuf The mbuf. + * @param out_buf Pointer to the data buffer in this mbuf. + * @param out_len Pointer to the amount of available data in the buffer pointed to by `out_buf'. + * @result EINVAL if one of the parameters is NULL. + * ENOENT if the mbuf does not have valid data buffer. + * 0 if successful. + */ +extern errno_t mbuf_data_len(mbuf_t mbuf, void *__sized_by(*out_len) * out_buf, size_t *out_len) +__NKE_API_DEPRECATED; + /*! * @function mbuf_data * @discussion Returns a pointer to the start of data in this mbuf. @@ -291,20 +338,71 @@ __BEGIN_DECLS * looking for may not be virtually contiguous if it spans more * than one mbuf. In addition, data that is virtually contiguous * might not be represented by physically contiguous pages; see - * further comments in mbuf_data_to_physical. Use mbuf_len to - * determine the length of data available in this mbuf. 
If a data - * structure you want to access stradles two mbufs in a chain, - * either use mbuf_pullup to get the data contiguous in one mbuf - * or copy the pieces of data from each mbuf in to a contiguous - * buffer. Using mbuf_pullup has the advantage of not having to - * copy the data. On the other hand, if you don't make sure there - * is space in the mbuf, mbuf_pullup may fail and free the mbuf. + * further comments in `mbuf_data_to_physical'. + * To determine the usable length of the data available in this mbuf, + * use `mbuf_len', or replace the invocation of `mbuf_data' + * with `mbuf_data_len', which will return the length of available + * data along with the data pointer. + * If the data structure you want to access stradles multiple + * mbufs in a chain, the returned length will be smaller than + * the expected size. In this case, either use `mbuf_pullup', + * which will create an mbuf containing the data structure + * in a congigous buffer, or alternatively copy the pieces + * of the data structure from the mbufs comprised by the chain + * into a separately allocated buffer with a sufficient capacity. + * Using `mbuf_pullup' has the advantage of not having to + * copy the data; however if the size of the requred data exceeds + * the maximal mbuf size, `mbuf_pullup' will fail, and free the chain. + * @warning This function is NOT SAFE to use with `-fbounds-safety'. + * Use `mbuf_data_safe' or `mbuf_data_len' instead. + * Inside the kernel, the recommended replacement is `mtod'. * @param mbuf The mbuf. * @result A pointer to the data in the mbuf. */ -extern void *mbuf_data(mbuf_t mbuf) +extern void * __unsafe_indexable mbuf_data(mbuf_t mbuf) +__BOUNDS_SAFETY_DEPRECATED_BY('mbuf_data_safe, mbuf_data_len') __NKE_API_DEPRECATED; +/*! + * @function mbuf_data_safe + * @discussion Returns a checked pointer to the start of data in this mbuf. + * There may be additional data on chained mbufs. The data you're + * looking for may not be virtually contiguous if it spans more + * than one mbuf. In addition, data that is virtually contiguous + * might not be represented by physically contiguous pages; see + * further comments in `mbuf_data_to_physical'. + * To determine the usable length of the data available in this mbuf, + * use `mbuf_len', or replace the invocation of `mbuf_data_safe' + * with `mbuf_data_len', which will return the length of available + * data along with the data pointer. + * If the data structure you want to access stradles multiple + * mbufs in a chain, the useable data length (see above) will be + * smaller than the expected size. + * In this case, either use `mbuf_pullup', which will create + * a new mbuf with the data structure in a congigous buffer, + * or alternatively copy the pieces of the data structure + * from the mbufs comprised by the chain into a separately allocated + * buffer with a sufficient capacity. + * Using `mbuf_pullup' has the advantage of not having to + * copy the data; however if the size of the requred data exceeds + * the maximal mbuf size, `mbuf_pullup' will fail, and free the chain. + * @param mbuf The mbuf. + * @result A pointer to the data in the mbuf. + */ +static inline void * __header_indexable +mbuf_data_safe(mbuf_t mbuf) +{ + size_t len = 0; + void * __sized_by(len) buf = 0; + errno_t err; + err = mbuf_data_len(mbuf, &buf, &len); + if (err != 0) { + return 0; + } + return buf; +} +#define __KPI_MBUF_HAS_MBUF_DATA_SAFE (1) + /*! 
* @function mbuf_datastart * @discussion Returns the start of the space set aside for storing diff --git a/bsd/sys/linker_set.h b/bsd/sys/linker_set.h index 1b0dd523f..fce0f0160 100644 --- a/bsd/sys/linker_set.h +++ b/bsd/sys/linker_set.h @@ -120,7 +120,7 @@ * void const * __set_SET_sym_SYM __attribute__((section("__DATA_CONST,SET"))) = & SYM */ -/* Wrap entries in a type that can be blacklisted from KASAN */ +/* Wrap entries in a type that can be denylisted from KASAN */ struct linker_set_entry { void *ptr; } LINKER_SET_ENTRY_PACKED; diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h index d7449a18d..b286b75c4 100644 --- a/bsd/sys/mbuf.h +++ b/bsd/sys/mbuf.h @@ -101,6 +101,7 @@ * this is done when at least MINCLSIZE of data must be stored. */ #if CONFIG_MBUF_MCACHE +#include #define _MSIZESHIFT 8 /* 256 */ #define _MSIZE (1 << _MSIZESHIFT) /* size of an mbuf */ #else /* CONFIG_MBUF_MCACHE */ @@ -456,8 +457,8 @@ struct pkthdr { uint32_t comp_gencnt; uint32_t pkt_crumbs:16, pkt_compl_callbacks:8, - pkt_ext_flags:3, - pkt_unused:5; /* Currently unused - feel free to grab those 5 bits */ + pkt_ext_flags:6, + pkt_unused:2; /* Currently unused - feel free to grab those 2 bits */ /* * Module private scratch space (32-bit aligned), currently 16-bytes * large. Anything stored here is not guaranteed to survive across @@ -478,10 +479,14 @@ struct pkthdr { u_int64_t __mpriv64[2]; } __mpriv_u; } pkt_mpriv __attribute__((aligned(4))); -#define pkt_mpriv_hash pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val32 -#define pkt_mpriv_flags pkt_mpriv.__mpriv_u.__mpriv32[1].__mpriv32_u.__val32 -#define pkt_mpriv_srcid pkt_mpriv.__mpriv_u.__mpriv32[2].__mpriv32_u.__val32 -#define pkt_mpriv_fidx pkt_mpriv.__mpriv_u.__mpriv32[3].__mpriv32_u.__val32 +/* + * While qset_id takes 64 bits here, as upper 32 bits of qset_id are reserved + * currently, there is a scope to limit to 32 bits if other use cases need + * pkt_mpriv + */ +#define pkt_mpriv_qsetid pkt_mpriv.__mpriv_u.__mpriv64[0] +#define pkt_mpriv_srcid pkt_mpriv.__mpriv_u.__mpriv32[2].__mpriv32_u.__val32 +#define pkt_mpriv_fidx pkt_mpriv.__mpriv_u.__mpriv32[3].__mpriv32_u.__val32 }; /* @@ -567,6 +572,9 @@ struct pkthdr { #define PKTF_EXT_OUTPUT_SCOPE 0x1 /* outgoing packet has ipv6 address scope id */ #define PKTF_EXT_L4S 0x2 /* pkts is from a L4S connection */ #define PKTF_EXT_QUIC 0x4 /* flag to denote a QUIC packet */ +#define PKTF_EXT_QSET_ID_VALID 0x8 /* flag to denote if traffic rules are run */ +#define PKTF_EXT_ULPN 0x10 /* packet transitted coprocessor */ +#define PKTF_EXT_LPW 0x20 /* packet received in low power wake */ #define PKT_CRUMB_TS_COMP_REQ 0x0001 /* timestamp completion requested */ #define PKT_CRUMB_TS_COMP_CB 0x0002 /* timestamp callback called */ @@ -863,12 +871,6 @@ enum { * and internal data. */ -#if 1 -#define MCHECK(m) m_mcheck(m) -#else -#define MCHECK(m) -#endif - #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) @@ -890,10 +892,6 @@ union mcluster { char mcl_buf[MCLBYTES]; }; -#define MCLALLOC(p, how) ((p) = m_mclalloc(how)) - -#define MCLFREE(p) m_mclfree(p) - #define MCLGET(m, how) ((m) = m_mclget(m, how)) /* @@ -1070,7 +1068,7 @@ do { \ do { \ if (!(m->m_flags & MBUF_PKTHDR) || \ m->m_len < 0 || \ - m->m_len > ((njcl > 0) ? 
njclbytes : MBIGCLBYTES) || \ + m->m_len > njclbytes || \ m->m_type == MT_FREE || \ ((m->m_flags & M_EXT) != 0 && m->m_ext.ext_buf == NULL)) { \ panic_plain("Failed mbuf validity check: mbuf %p len %d " \ @@ -1204,6 +1202,46 @@ struct name { \ #define MBUFQ_ADD_CRUMB(_q, _m, _f) #endif /* (DEBUG || DEVELOPMENT) */ +struct mbufq { + MBUFQ_HEAD(counted_mbufq) mq; + uint32_t count; + uint32_t bytes; +}; + +inline void +mbufq_init(struct mbufq *q) +{ + MBUFQ_INIT(&q->mq); + q->bytes = q->count = 0; +} + +inline void +mbufq_enqueue(struct mbufq *q, struct mbuf *head, struct mbuf *tail, + uint32_t cnt, uint32_t bytes) +{ + MBUFQ_ENQUEUE_MULTI(&q->mq, head, tail); + q->count += cnt; + q->bytes += bytes; +} + +inline boolean_t +mbufq_empty(struct mbufq *q) +{ + return q->count == 0; +} + +inline struct mbuf* +mbufq_first(struct mbufq *q) +{ + return MBUFQ_FIRST(&q->mq); +} + +inline struct mbuf* +mbufq_last(struct mbufq *q) +{ + return MBUFQ_LAST(&q->mq); +} + #endif /* XNU_KERNEL_PRIVATE */ /* @@ -1441,6 +1479,9 @@ extern void m_freem(struct mbuf *) __XNU_INTERNAL(m_freem); extern void m_drop(mbuf_t, uint16_t, uint32_t, const char *, uint16_t); extern void m_drop_if(mbuf_t, struct ifnet *, uint16_t, uint32_t, const char *, uint16_t); extern void m_drop_list(mbuf_t, struct ifnet *, uint16_t, uint32_t, const char *, uint16_t); +extern void m_drop_extended(mbuf_t, struct ifnet *, char *, + uint16_t, uint32_t, const char *, uint16_t); + extern u_int64_t mcl_to_paddr(char *); extern void m_adj(struct mbuf *, int); extern void m_cat(struct mbuf *, struct mbuf *); @@ -1458,6 +1499,7 @@ extern struct mbuf *m_pullup(struct mbuf *, int); extern struct mbuf *m_split(struct mbuf *, int, int); extern void m_mclfree(caddr_t p); extern bool mbuf_class_under_pressure(struct mbuf *m); +extern int m_chain_capacity(const struct mbuf *m); /* * Accessors for the mbuf data range. @@ -1500,6 +1542,12 @@ m_has_mtype(const struct mbuf *m, int mtype_flags) return (1 << m->m_type) & mtype_flags; } +static inline int +m_capacity(const struct mbuf *m) +{ + return _MSIZE + ((m->m_flags & M_EXT) ? 
m->m_ext.ext_size : 0); +} + /* * On platforms which require strict alignment (currently for anything but * i386 or x86_64 or arm64), this macro checks whether the data pointer of an mbuf @@ -1610,8 +1658,6 @@ m_has_mtype(const struct mbuf *m, int mtype_flags) c == SCVAL_RV || c == SCVAL_VI || c == SCVAL_SIG || \ c == SCVAL_VO || SCVAL_CTL) -extern unsigned char *mbutl; /* start VA of mbuf pool */ -extern unsigned char *embutl; /* end VA of mbuf pool */ extern unsigned int nmbclusters; /* number of mapped clusters */ extern int njcl; /* # of jumbo clusters */ extern int njclbytes; /* size of a jumbo cluster */ @@ -1623,6 +1669,8 @@ extern int max_linkhdr; /* largest link-level header */ /* Use max_protohdr instead of _max_protohdr */ extern int max_protohdr; /* largest protocol header */ +extern uint32_t high_sb_max; + __private_extern__ unsigned int mbuf_default_ncl(uint64_t); __private_extern__ void mbinit(void); __private_extern__ struct mbuf *m_clattach(struct mbuf *, int, caddr_t __sized_by(extsize), @@ -1668,7 +1716,6 @@ __private_extern__ void m_align(struct mbuf *, int); __private_extern__ struct mbuf *m_normalize(struct mbuf *m); __private_extern__ void m_mchtype(struct mbuf *m, int t); -__private_extern__ void m_mcheck(struct mbuf *); __private_extern__ void m_copyback(struct mbuf *, int, int len, const void * __sized_by(len)); __private_extern__ struct mbuf *m_copyback_cow(struct mbuf *, int, int len, @@ -1771,7 +1818,6 @@ enum { }; /* Packet tag routines */ -__private_extern__ struct m_tag *m_tag_alloc(u_int32_t, u_int16_t, int, int); __private_extern__ struct m_tag *m_tag_create(u_int32_t, u_int16_t, int, int, struct mbuf *); __private_extern__ void m_tag_free(struct m_tag *); @@ -1795,7 +1841,6 @@ void m_tag_create_cookie(struct m_tag *); void mbuf_tag_init(void); -__private_extern__ void m_scratch_init(struct mbuf *); __private_extern__ u_int32_t m_scratch_get(struct mbuf *, u_int8_t **); __private_extern__ void m_classifier_init(struct mbuf *, uint32_t); @@ -1807,7 +1852,6 @@ __private_extern__ mbuf_svc_class_t m_service_class_from_val(u_int32_t); __private_extern__ int m_set_traffic_class(struct mbuf *, mbuf_traffic_class_t); __private_extern__ mbuf_traffic_class_t m_get_traffic_class(struct mbuf *); -__private_extern__ struct m_tag *m_tag_alloc(u_int32_t, u_int16_t, int, int); __private_extern__ void mbuf_tag_init(void); #define ADDCARRY(_x) do { \ @@ -1819,17 +1863,188 @@ __private_extern__ u_int16_t m_adj_sum16(struct mbuf *, u_int32_t, u_int32_t, u_int32_t, u_int32_t); __private_extern__ u_int16_t m_sum16(struct mbuf *, u_int32_t, u_int32_t); -__private_extern__ void m_set_ext(struct mbuf *, struct ext_ref *, - m_ext_free_func_t, caddr_t); +__private_extern__ void mbuf_set_tx_time(struct mbuf *m, uint64_t tx_time); + __private_extern__ struct ext_ref *m_get_rfa(struct mbuf *); __private_extern__ m_ext_free_func_t m_get_ext_free(struct mbuf *); -__private_extern__ caddr_t m_get_ext_arg(struct mbuf *); __private_extern__ void m_do_tx_compl_callback(struct mbuf *, struct ifnet *); __private_extern__ mbuf_tx_compl_func m_get_tx_compl_callback(u_int32_t); - - __END_DECLS + +/* START - the following can be moved to uipc_mbuf.c once we got rid of CONFIG_MBUF_MCACHE */ +typedef enum { + MC_MBUF = 0, /* Regular mbuf */ + MC_CL, /* Cluster */ + MC_BIGCL, /* Large (4KB) cluster */ + MC_16KCL, /* Jumbo (16KB) cluster */ + MC_MBUF_CL, /* mbuf + cluster */ + MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */ + MC_MBUF_16KCL, /* mbuf + jumbo (16KB) cluster */ + MC_MAX +} mbuf_class_t; + 
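A minimal usage sketch for the mbuf_data_len()/mbuf_data_safe() KPI documented in the kpi_mbuf.h hunk above, assuming a kernel-extension caller that needs a small header from the front of an mbuf chain. The struct my_hdr type and the read_my_hdr() wrapper are hypothetical; the error handling and the mbuf_pullup() fallback follow the behavior described in the header comments rather than a verified build.

#include <stdint.h>
#include <string.h>
#include <sys/kpi_mbuf.h>

struct my_hdr {                     /* hypothetical on-wire header */
	uint16_t type;
	uint16_t length;
};

static errno_t
read_my_hdr(mbuf_t *m, struct my_hdr *out)
{
	size_t len = 0;
	void *buf = NULL;
	errno_t err;

	/* One call returns both the data pointer and its usable length. */
	err = mbuf_data_len(*m, &buf, &len);
	if (err != 0) {
		return err;             /* EINVAL or ENOENT, per the KPI comment */
	}
	if (len < sizeof(*out)) {
		/*
		 * The header straddles mbufs; pull it into one contiguous
		 * buffer.  Per the comment above, mbuf_pullup() frees the
		 * chain on failure, so the caller must treat it as gone.
		 */
		err = mbuf_pullup(m, sizeof(*out));
		if (err != 0) {
			return err;
		}
		err = mbuf_data_len(*m, &buf, &len);
		if (err != 0) {
			return err;
		}
	}
	memcpy(out, buf, sizeof(*out));
	return 0;
}

Callers that only need a bounds-checked pointer can use mbuf_data_safe() instead, which wraps the same mbuf_data_len() call, as shown by its inline definition in the hunk above.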
+typedef struct { + mbuf_class_t mtbl_class; /* class type */ +#if CONFIG_MBUF_MCACHE + mcache_t *mtbl_cache; /* mcache for this buffer class */ + TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */ + mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */ +#endif + mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */ + u_int32_t mtbl_maxsize; /* maximum buffer size */ + int mtbl_minlimit; /* minimum allowed */ + int mtbl_maxlimit; /* maximum allowed */ + u_int32_t mtbl_wantpurge; /* purge during next reclaim */ + uint32_t mtbl_avgtotal; /* average total on iOS */ + u_int32_t mtbl_expand; /* worker should expand the class */ +} mbuf_table_t; + +/* + * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated + * atomically and stored in a per-CPU structure which is lock-free; this is + * done in order to avoid writing to the global mbstat data structure which + * would cause false sharing. During sysctl request for kern.ipc.mbstat, + * the statistics across all CPUs will be converged into the mbstat.m_mtypes + * array and returned to the application. Any updates for types greater or + * equal than MT_MAX would be done atomically to the mbstat; this slows down + * performance but is okay since the kernel uses only up to MT_MAX-1 while + * anything beyond that (up to type 255) is considered a corner case. + */ +typedef struct { + unsigned int cpu_mtypes[MT_MAX]; +} mbuf_mtypes_t; + +#define MBUF_CLASS_MIN MC_MBUF +#define MBUF_CLASS_MAX MC_MBUF_16KCL +#define MBUF_CLASS_LAST MC_16KCL + +#define MBUF_CLASS_COMPOSITE(c) \ + ((int)(c) > MBUF_CLASS_LAST) + +#define m_class(c) mbuf_table[c].mtbl_class +#define m_maxsize(c) mbuf_table[c].mtbl_maxsize +#define m_minlimit(c) mbuf_table[c].mtbl_minlimit +#define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit +#define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname +#define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size +#define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total +#define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree + +#define NELEM(a) (sizeof (a) / sizeof ((a)[0])) +#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */ + +/* + * This flag is set for all mbufs that come out of and into the composite + * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that + * are marked with such a flag have clusters attached to them, and will be + * treated differently when they are freed; instead of being placed back + * into the mbuf and cluster freelists, the composite mbuf + cluster objects + * are placed back into the appropriate composite cache's freelist, and the + * actual freeing is deferred until the composite objects are purged. At + * such a time, this flag will be cleared from the mbufs and the objects + * will be freed into their own separate freelists. + */ +#define EXTF_COMPOSITE 0x1 + +/* + * This flag indicates that the external cluster is read-only, i.e. it is + * or was referred to by more than one mbufs. Once set, this flag is never + * cleared. + */ +#define EXTF_READONLY 0x2 + +/* + * This flag indicates that the external cluster is paired with the mbuf. + * Pairing implies an external free routine defined which will be invoked + * when the reference count drops to the minimum at m_free time. This + * flag is never cleared. 
+ */ +#define EXTF_PAIRED 0x4 + +#define EXTF_MASK \ + (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED) + +#define MEXT_MINREF(m) ((m_get_rfa(m))->minref) +#define MEXT_REF(m) ((m_get_rfa(m))->refcnt) +#define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt) +#define MEXT_FLAGS(m) ((m_get_rfa(m))->flags) +#define MEXT_PRIV(m) ((m_get_rfa(m))->priv) +#define MEXT_PMBUF(m) ((m_get_rfa(m))->paired) +#define MBUF_IS_COMPOSITE(m) \ + (MEXT_REF(m) == MEXT_MINREF(m) && \ + (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE) + +/* + * This macro can be used to test if the mbuf is paired to an external + * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject + * is important, as EXTF_PAIRED alone is insufficient since it is immutable, + * and thus survives calls to m_free_paired. + */ +#define MBUF_IS_PAIRED(m) \ + (((m)->m_flags & M_EXT) && \ + (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \ + MEXT_PMBUF(m) == (m)) + +#define MBUF_CL_INIT(m, buf, rfa, ref, flag) \ + mext_init(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \ + ref, 0, flag, 0, NULL) + +#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \ + mext_init(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \ + ref, 0, flag, 0, NULL) + +#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \ + mext_init(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \ + ref, 0, flag, 0, NULL) + +#define MBSTAT_MTYPES_MAX \ + (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0])) + +#define mtype_stat_add(type, n) { \ + if ((unsigned)(type) < MT_MAX) { \ + mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \ + os_atomic_add(&mbs->cpu_mtypes[type], n, relaxed); \ + } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \ + os_atomic_add((int16_t *)&mbstat.m_mtypes[type], n, relaxed); \ + } \ +} + +#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n)) +#define mtype_stat_inc(t) mtype_stat_add(t, 1) +#define mtype_stat_dec(t) mtype_stat_sub(t, 1) +/* END - the following can be moved to uipc_mbuf.c once we got rid of CONFIG_MBUF_MCACHE */ + +#if CONFIG_MBUF_MCACHE +extern lck_mtx_t *const mbuf_mlock; +extern int nclusters; /* # of clusters for non-jumbo (legacy) sizes */ +extern unsigned char *mbutl; /* start VA of mbuf pool */ +extern unsigned int mb_memory_pressure_percentage; +extern struct mb_stat *mb_stat; +PERCPU_DECL(mbuf_mtypes_t, mbuf_mtypes); + +extern mbuf_table_t mbuf_table[]; + +extern void mbuf_mtypes_sync(void); +extern void mbuf_stat_sync(void); +extern void mbuf_table_init(void); +extern void m_incref(struct mbuf *m); +extern uint16_t m_decref(struct mbuf *m); +extern struct mbuf *m_get_common(int wait, short type, int hdr); +extern int m_free_paired(struct mbuf *m); +extern caddr_t m_get_ext_arg(struct mbuf *m); +extern int mbuf_watchdog_defunct_iterate(proc_t p, void *arg); +extern void m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free, + caddr_t ext_arg); +extern void mext_init(struct mbuf *m, void *__sized_by(size)buf, u_int size, + m_ext_free_func_t free, caddr_t free_arg, struct ext_ref *rfa, + u_int16_t min, u_int16_t ref, u_int16_t pref, u_int16_t flag, + u_int32_t priv, struct mbuf *pm); +extern int mbuf_get_class(struct mbuf *m); +extern void mbuf_init(struct mbuf *m, int pkthdr, int type); +extern void mbuf_mcheck(struct mbuf *m); +#endif /* CONFIG_MBUF_MCACHE */ + #endif /* XNU_KERNEL_PRIVATE */ #endif /* !_SYS_MBUF_H_ */ diff --git a/bsd/sys/mcache.h b/bsd/sys/mcache.h index 306dbc936..bf1d6881d 100644 --- a/bsd/sys/mcache.h +++ b/bsd/sys/mcache.h @@ -59,11 +59,6 @@ extern "C" { #define ASSERT(EX) ((void)0) 
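To make the per-CPU mbuf type statistics scheme described above concrete: a hedged sketch (not from the patch) of how the cpu_mtypes counters might be folded into a single array when kern.ipc.mbstat is read. The function name and the totals[] destination are hypothetical; percpu_foreach() and os_atomic_load() are assumed to behave as the usual xnu per-CPU iteration and relaxed-atomics primitives.

/*
 * Illustrative sketch, not part of xnu-12377.1.9: walk every CPU's
 * mbuf_mtypes_t copy (declared via PERCPU_DECL above) and sum the
 * relaxed-atomic counters into one array, mirroring the convergence a
 * sysctl handler for kern.ipc.mbstat would need to perform.
 */
static void
mbuf_mtypes_sum_sketch(unsigned int totals[MT_MAX])
{
	bzero(totals, MT_MAX * sizeof(totals[0]));
	percpu_foreach(mtypes, mbuf_mtypes) {
		for (int t = 0; t < MT_MAX; t++) {
			totals[t] += os_atomic_load(&mtypes->cpu_mtypes[t], relaxed);
		}
	}
}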
#endif -/* - * Compile time assert; this should be on its own someday. - */ -#define _CASSERT(x) _Static_assert(x, "compile-time assertion failed") - /* * Use CPU_CACHE_LINE_SIZE instead of MAX_CPU_CACHE_LINE_SIZE, unless * wasting space is of no concern. diff --git a/bsd/sys/mem_acct_private.h b/bsd/sys/mem_acct_private.h new file mode 100644 index 000000000..dce926c03 --- /dev/null +++ b/bsd/sys/mem_acct_private.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_MEM_ACCT_PRIVATE_H +#define _SYS_MEM_ACCT_PRIVATE_H + +#include + +#include + +#define MEM_ACCT_PEAK 1 /* reset/get peak value for a subsystem */ +#define MEM_ACCT_SOFT_LIMIT 2 /* set/get soft limit for a subsystem */ +#define MEM_ACCT_HARD_LIMIT 3 /* set/get hard limit for a subsystem */ +#define MEM_ACCT_ALLOCATED 4 /* set/get currently allocated memory for a subsystem */ +#define MEM_ACCT_SUBSYSTEMS 5 /* get all subsystem names */ +#define MEM_ACCT_ALL_SUBSYSTEM_STATISTICS 6 /* returns all statistics for all subsystems */ +#define MEM_ACCT_ALL_STATISTICS 7 /* returns all statistics for a specific subsystem */ + +#define MEM_ACCT_MAX 8 /* Current maximum number of accounting objects we allow */ + +#define MEM_ACCT_NAME_LENGTH 16 /* max size for subsystem name */ + +struct memacct_statistics { + uint64_t peak; + int64_t allocated; + uint64_t softlimit; + uint64_t hardlimit; + char ma_name[MEM_ACCT_NAME_LENGTH]; +}; + +#endif /* _SYS_MEM_ACCT_PRIVATE_H */ diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index f7bd45f79..30b3ec781 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -518,7 +518,8 @@ struct netfs_status { #define VQ_DESIRED_DISK 0x4000 /* the desired disk space */ #define VQ_FREE_SPACE_CHANGE 0x8000 /* free disk space has significantly changed */ #define VQ_PURGEABLE_SPACE_CHANGE 0x10000 /* purgeable disk space has significantly changed */ -#define VQ_FLAG20000 0x20000 /* placeholder */ +#define VQ_IDLE_PURGE_NOTIFY 0x20000 /* Above nearlowdisk and below desired disk space */ +#define VQ_FLAG40000 0x40000 /* placeholder */ #ifdef KERNEL @@ -1355,6 +1356,7 @@ boolean_t vfs_context_is_dataless_manipulator(vfs_context_t); boolean_t vfs_context_can_resolve_triggers(vfs_context_t); boolean_t vfs_context_can_break_leases(vfs_context_t); boolean_t vfs_context_skip_mtime_update(vfs_context_t ctx); +boolean_t vfs_context_allow_entitled_reserve_access(vfs_context_t ctx); void vfs_setmntsystem(mount_t mp); void vfs_setmntsystemdata(mount_t mp); void vfs_setmntswap(mount_t mp); @@ -1502,7 +1504,7 @@ OS_ENUM(graftdmg_type, uint32_t, GRAFTDMG_CRYPTEX_BOOT = 1, 
GRAFTDMG_CRYPTEX_PREBOOT = 2, GRAFTDMG_CRYPTEX_DOWNLEVEL = 3, - // Reserved: CRYPTEX1_AUTH_ENV_GENERIC = 4, + GRAFTDMG_CRYPTEX_AUTH_ENV_GENERIC = 4, // Reserved: CRYPTEX1_AUTH_ENV_GENERIC_SUPPLEMENTAL = 5, GRAFTDMG_CRYPTEX_PDI_NONCE = 6, GRAFTDMG_CRYPTEX_EFFECTIVE_AP = 7, @@ -1549,6 +1551,7 @@ int statfs(const char *, struct statfs *) __DARWIN_INODE64(statfs); int statfs64(const char *, struct statfs64 *) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_5, __MAC_10_6, __IPHONE_NA, __IPHONE_NA); #endif /* !__DARWIN_ONLY_64_BIT_INO_T */ int unmount(const char *, int); +int funmount(int, int) __OSX_AVAILABLE(16.0) __IOS_AVAILABLE(19.0) __TVOS_AVAILABLE(19.0) __WATCHOS_AVAILABLE(12.0); int getvfsbyname(const char *, struct vfsconf *); #if PRIVATE int pivot_root(const char *, const char *) __OSX_AVAILABLE(10.16); diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index 00681149c..781edd82b 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -145,7 +145,15 @@ struct nameidata { #define NAMEI_NOFOLLOW_ANY 0x1000 /* no symlinks allowed in the path */ #define NAMEI_ROOTDIR 0x2000 /* Limit lookup to ni_rootdir (similar to chroot) */ -#define NAMEI_RESOLVE_BENEATH 0x4000 /* path must reside in the hierarchy beneath the starting directory */ +#define NAMEI_RESOLVE_BENEATH 0x4000 /* path resolution must not escape the starting directory */ +#define NAMEI_NODOTDOT 0x8000 /* prevent '..' path traversal */ + +#define NAMEI_LOCAL 0x10000 /* prevent a path lookup into a network filesystem */ +#define NAMEI_NODEVFS 0x20000 /* prevent a path lookup into `devfs` filesystem */ +#define NAMEI_IMMOVABLE 0x40000 /* prevent a path lookup into a removable filesystem */ +#define NAMEI_NOXATTRS 0x80000 /* prevent a path lookup on named streams */ + +#define NAMEI_UNIQUE 0x100000 /* prevent a path lookup from succeeding on a vnode with multiple links */ #ifdef KERNEL /* @@ -184,7 +192,9 @@ struct nameidata { #define USEDVP 0x00400000 /* start the lookup at ndp.ni_dvp */ #define CN_VOLFSPATH 0x00800000 /* user path was a volfs style path */ #define CN_FIRMLINK_NOFOLLOW 0x01000000 /* Do not follow firm links */ -#define UNIONCREATED 0x02000000 /* union fs creation of vnode */ +#if NAMEDSTREAMS +#define MARKISSHADOW 0x02000000 /* only for getshadowfile() */ +#endif #if NAMEDRSRCFORK #define CN_WANTSRSRCFORK 0x04000000 #define CN_ALLOWRSRCFORK 0x08000000 @@ -250,6 +260,7 @@ int relookup(struct vnode *dvp, struct vnode **vpp, #if CONFIG_UNION_MOUNTS int lookup_traverse_union(vnode_t dvp, vnode_t *new_dvp, vfs_context_t ctx); #endif /* CONFIG_UNION_MOUNTS */ +int lookup_check_for_resolve_prefix(char *path, size_t pathbuflen, size_t len, uint32_t *resolve_flags, size_t *prefix_len); void lookup_compound_vnop_post_hook(int error, vnode_t dvp, vnode_t vp, struct nameidata *ndp, int did_create); void kdebug_lookup(struct vnode *dp, struct componentname *cnp); diff --git a/bsd/sys/paths.h b/bsd/sys/paths.h index 3a2f1bb05..538330184 100644 --- a/bsd/sys/paths.h +++ b/bsd/sys/paths.h @@ -40,5 +40,16 @@ #define _PATH_RSRCNAME "rsrc" #define _PATH_RSRCFORKSPEC "/..namedfork/rsrc" +/* Prefix Path Namespace */ +#define RESOLVE_NOFOLLOW_ANY 0x00000001 /* no symlinks allowed in path */ +#define RESOLVE_NODOTDOT 0x00000002 /* prevent '..' 
path traversal */ +#define RESOLVE_LOCAL 0x00000004 /* prevent a path lookup into a network filesystem */ +#define RESOLVE_NODEVFS 0x00000008 /* prevent a path lookup into `devfs` filesystem */ +#define RESOLVE_IMMOVABLE 0x00000010 /* prevent a path lookup into a removable filesystem */ +#define RESOLVE_UNIQUE 0x00000020 /* prevent a path lookup on a vnode with multiple links */ +#define RESOLVE_NOXATTRS 0x00000040 /* prevent a path lookup on named streams */ + +#define RESOLVE_VALIDMASK 0x0000007F + #endif /* __APPLE_API_PRIVATE */ #endif /* !_SYS_PATHS_H_ */ diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index 90b6a0957..9197a7eda 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -97,6 +97,7 @@ struct session; struct pgrp; struct proc; +struct proc_ident; /* Exported fields for kern sysctls */ struct extern_proc { @@ -226,6 +227,7 @@ struct extern_proc { #define P_DIRTY_LAUNCH_IN_PROGRESS 0x00000200 /* launch is in progress */ #define P_DIRTY_DEFER_ALWAYS 0x00000400 /* defer going to idle-exit after every dirty->clean transition. * For legacy jetsam policy only. This is the default with the other policies.*/ +#define P_DIRTY_SHUTDOWN_ON_CLEAN 0x00000800 /* process should shutdown on going clean */ #define P_DIRTY_IS_DIRTY (P_DIRTY | P_DIRTY_SHUTDOWN) #define P_DIRTY_IDLE_EXIT_ENABLED (P_DIRTY_TRACK|P_DIRTY_ALLOW_IDLE_EXIT) @@ -249,23 +251,6 @@ extern bool proc_is_third_party_debuggable_driver(proc_t p); #endif /* XNU_KERNEL_PRIVATE */ -#if KERNEL_PRIVATE -/* - * Identify a process uniquely. - * proc_ident's fields match 1-1 with those in struct proc. - */ -struct proc_ident { - uint64_t p_uniqueid; - pid_t p_pid; - int p_idversion; -}; - -/* obtain a proc_ident from a proc_ref */ -extern struct proc_ident proc_ident(proc_t p); -#else -struct proc_ident; -#endif /* KERNEL_PRIVATE */ - /* * __unsafe_indexable is a workaround for * rdar://88409003 (PredefinedExpr trips C string detection) @@ -306,8 +291,38 @@ void proc_selfname(char * buf, int size); /* find a process with a given pid. This comes with a reference which needs to be dropped by proc_rele */ extern proc_t proc_find(int pid); -/* find a process with a given process identity */ -extern proc_t proc_find_ident(struct proc_ident const *i); +/* + * Function: proc_find_ident + * + * Description: Obtain a proc ref from the provided proc_ident. + * + * Returns: + * - Non-null proc_t on success + * - PROC_NULL on error + */ +extern proc_t proc_find_ident(const proc_ident_t i); +#ifdef KERNEL_PRIVATE +/* + * Function: proc_find_ident_validated + * + * Description: Obtain a proc ref from the provided proc_ident. + * + * Returns: + * - 0 on Success + * - EINVAL: When the provided arguments are invalid (NULL) + * - ESTALE: The process exists but is currently a zombie and has not been reaped + * via wait(). Callers may choose to handle this edge case as a non-error. + * - ESRCH: When the lookup or validation fails otherwise. The process + * described by the identifier no longer exists. 
+ */ +extern errno_t proc_find_ident_validated(const proc_ident_t i, proc_t *out); +/* compare a proc_ident to a proc ref */ +extern bool proc_ident_equal_ref(proc_ident_t ident, proc_t proc); +/* compare a proc_ident to another proc_ident */ +extern bool proc_ident_equal(proc_ident_t ident, proc_ident_t other); +/* compare a proc_ident to an audit_token_t */ +extern bool proc_ident_equal_token(proc_ident_t ident, audit_token_t token); +#endif /* KERNEL_PRIVATE */ /* find a process with a given audit token */ extern proc_t proc_find_audit_token(const audit_token_t token); /* returns a handle to current process which is referenced. The reference needs to be dropped with proc_rele */ diff --git a/bsd/sys/proc_info.h b/bsd/sys/proc_info.h index 0c7b92fc3..718a69fcf 100644 --- a/bsd/sys/proc_info.h +++ b/bsd/sys/proc_info.h @@ -352,6 +352,11 @@ struct proc_threadwithpathinfo { struct vnode_info_path pvip; }; +struct proc_archinfo { + cpu_type_t p_cputype; + cpu_subtype_t p_cpusubtype; +}; + /* * Socket */ @@ -764,6 +769,9 @@ struct channel_fdinfo { #define PROC_PID_RUSAGE 16 #define PROC_PID_RUSAGE_SIZE 0 +#define PROC_PIDARCHINFO 19 +#define PROC_PIDARCHINFO_SIZE (sizeof(struct proc_archinfo)) + #ifdef PRIVATE /* Additional PROC_PID values in proc_info_private.h */ #endif /* PRIVATE */ @@ -837,6 +845,7 @@ struct channel_fdinfo { #define PROC_DIRTY_DEFER 0x4 #define PROC_DIRTY_LAUNCH_IN_PROGRESS 0x8 #define PROC_DIRTY_DEFER_ALWAYS 0x10 +#define PROC_DIRTY_SHUTDOWN_ON_CLEAN 0x20 /* proc_get_dirty() flags */ #define PROC_DIRTY_TRACKED 0x1 diff --git a/bsd/sys/proc_info_private.h b/bsd/sys/proc_info_private.h index fc35b7bbf..ab56b8134 100644 --- a/bsd/sys/proc_info_private.h +++ b/bsd/sys/proc_info_private.h @@ -60,11 +60,6 @@ struct proc_bsdinfowithuniqid { struct proc_uniqidentifierinfo p_uniqidentifier; }; -struct proc_archinfo { - cpu_type_t p_cputype; - cpu_subtype_t p_cpusubtype; -}; - struct proc_pidcoalitioninfo { uint64_t coalition_id[COALITION_NUM_TYPES]; uint64_t reserved1; @@ -123,8 +118,16 @@ struct proc_delegated_signal_info { #define PROC_FLAG_APPLICATION 0x1000000 /* Process is an application */ #define PROC_FLAG_IOS_APPLICATION PROC_FLAG_APPLICATION /* Process is an application */ #define PROC_FLAG_ROSETTA 0x2000000 /* Process is running translated under Rosetta */ -#define PROC_FLAG_SEC_ENABLED 0x4000000 -#define PROC_FLAG_SEC_BYPASS_ENABLED 0x8000000 + +/* + * Security config. + * These flags are currently folded inside pbi_flags, but per-feature policies should + * likely move elsewhere. 
+ */ +#define PROC_FLAG_SEC_ENABLED 0x04000000 +#define PROC_FLAG_SEC_BYPASS_ENABLED 0x08000000 +#define PROC_FLAG_HARDENED_HEAP_ENABLED 0x10000000 +#define PROC_FLAG_TPRO_ENABLED 0x20000000 /* keep in sync with KQ_* in sys/eventvar.h */ #define PROC_KQUEUE_WORKQ 0x0040 @@ -147,9 +150,7 @@ struct kevent_extinfo { #define PROC_PIDT_BSDINFOWITHUNIQID_SIZE \ (sizeof(struct proc_bsdinfowithuniqid)) -#define PROC_PIDARCHINFO 19 -#define PROC_PIDARCHINFO_SIZE \ - (sizeof(struct proc_archinfo)) +/* PROC_PIDARCHINFO defined in sys/proc_info.h */ #define PROC_PIDCOALITIONINFO 20 #define PROC_PIDCOALITIONINFO_SIZE (sizeof(struct proc_pidcoalitioninfo)) diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index c4e0254b5..4a5465fbe 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -428,6 +428,8 @@ struct proc { uint32_t p_pth_tsd_offset; /* offset from pthread_t to TSD for new threads */ user_addr_t p_stack_addr_hint; /* stack allocation hint for wq threads */ struct workqueue *_Atomic p_wqptr; /* workq ptr */ + struct workq_aio_s *_Atomic p_aio_wqptr; /* aio_workq ptr */ + struct timeval p_start; /* starting time */ void * p_rcall; @@ -587,6 +589,7 @@ struct proc { #define P_VFS_IOPOLICY_ALTLINK 0x0400 #define P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE 0x0800 #define P_VFS_IOPOLICY_SUPPORT_LONG_PATHS 0x1000 +#define P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS 0x2000 #define P_VFS_IOPOLICY_INHERITED_MASK \ (P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY | \ @@ -604,7 +607,8 @@ struct proc { #define P_VFS_IOPOLICY_VALID_MASK \ (P_VFS_IOPOLICY_INHERITED_MASK | \ - P_VFS_IOPOLICY_ALLOW_LOW_SPACE_WRITES) + P_VFS_IOPOLICY_ALLOW_LOW_SPACE_WRITES | \ + P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS) /* process creation arguments */ #define PROC_CREATE_FORK 0 /* independent child (running) */ @@ -733,7 +737,7 @@ struct user64_extern_proc { }; #endif /* KERNEL */ -#pragma GCC visibility push(hidden) +__exported_push_hidden extern struct vfs_context vfs_context0; @@ -750,6 +754,26 @@ extern unsigned int proc_shutdown_exitcount; #define NO_PID 100000 extern lck_mtx_t proc_list_mlock; +#ifdef XNU_KERNEL_PRIVATE +/* + * Identify a process uniquely. + * proc_ident's fields match 1-1 with those in struct proc. + */ +#define PROC_IDENT_PID_BIT_COUNT 28 +struct proc_ident { + uint64_t p_uniqueid; + pid_t + may_exit : 1, + may_exec : 1, + reserved : 2, + p_pid : PROC_IDENT_PID_BIT_COUNT; + int p_idversion; +}; +_Static_assert(sizeof(pid_t) == 4, "proc_ident assumes a 32-bit pid_t"); +_Static_assert(PID_MAX < (1 << PROC_IDENT_PID_BIT_COUNT), "proc_ident assumes PID_MAX requires less than 28bits"); +_Static_assert(NO_PID < (1 << PROC_IDENT_PID_BIT_COUNT), "proc_ident assumes NO_PID requires less than 28bits"); +#endif + #define BSD_SIMUL_EXECS 33 /* 32 , allow for rounding */ #define BSD_PAGEABLE_SIZE_PER_EXEC (NCARGS + PAGE_SIZE + PAGE_SIZE) /* page for apple vars, page for executable header */ extern int execargs_cache_size; @@ -776,14 +800,17 @@ LIST_HEAD(proclist, proc); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. 
*/ -#if CONFIG_COREDUMP +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP extern const char * defaultcorefiledir; extern const char * defaultdrivercorefiledir; extern char corefilename[MAXPATHLEN + 1]; extern char drivercorefilename[MAXPATHLEN + 1]; extern int do_coredump; extern int sugid_coredump; -#endif +#if CONFIG_UCOREDUMP +extern int do_ucoredump; +#endif /* CONFIG_UCOREDUMP */ +#endif /* CONFIG_COREDUMP || CONFIG_UCOREDUMP */ __options_decl(cloneproc_flags_t, uint32_t, { CLONEPROC_SPAWN = 0, @@ -816,9 +843,9 @@ extern void proc_update_creds_onproc(struct proc *, kauth_cred_t cred); extern kauth_cred_t proc_ucred_locked(proc_t p); extern kauth_cred_t proc_ucred_smr(proc_t p); extern kauth_cred_t proc_ucred_unsafe(proc_t p) __exported; -#if CONFIG_COREDUMP -__private_extern__ int proc_core_name(const char *format, const char *name, uid_t uid, pid_t pid, - char *cr_name, size_t cr_name_len); +#if CONFIG_COREDUMP || CONFIG_UCOREDUMP +__private_extern__ int proc_core_name(const char *format, const char *name, + uid_t uid, pid_t pid, char *cr_name, size_t cr_name_len); #endif /* proc_best_name_for_pid finds a process with a given pid and copies its best name of * the executable (32-byte name if it exists, otherwise the 16-byte name) to @@ -826,7 +853,8 @@ __private_extern__ int proc_core_name(const char *format, const char *name, uid_ */ extern void proc_best_name_for_pid(int pid, char * buf, int size); extern int isinferior(struct proc *, struct proc *); -__private_extern__ struct proc *pzfind(pid_t); /* Find zombie by id. */ +__private_extern__ bool pzfind(pid_t); /* Check zombie by pid. */ +__private_extern__ bool pzfind_unique(pid_t, uint64_t); /* Check zombie by uniqueid. */ __private_extern__ struct proc *proc_find_zombref(pid_t); /* Find zombie by id. */ __private_extern__ struct proc *proc_find_zombref_locked(pid_t); /* Find zombie by id. */ __private_extern__ void proc_drop_zombref(struct proc * p); /* Drop zombie ref. 
*/ @@ -1031,6 +1059,16 @@ extern void proc_childrenwalk(proc_t p, proc_iterate_fn_t callout, void *arg); extern void proc_rebootscan(proc_iterate_fn_t callout, void *arg, proc_iterate_fn_t filterfn, void *filterarg); +/* + * Construct a proc_ident from a proc_t + */ +extern struct proc_ident proc_ident_with_policy(proc_t p, proc_ident_validation_policy_t policy); + +/* + * Validate that a particular policy bit is set + */ +extern bool proc_ident_has_policy(const proc_ident_t ident, enum proc_ident_validation_policy policy); + pid_t dtrace_proc_selfpid(void); pid_t dtrace_proc_selfppid(void); uid_t dtrace_proc_selfruid(void); @@ -1053,10 +1091,12 @@ bool proc_ignores_node_permissions(proc_t proc); /* * @func no_paging_space_action + * * @brief React to compressor/swap exhaustion + * * @returns true if the low-swap note should be sent */ -extern bool no_paging_space_action(void); +extern bool no_paging_space_action(uint32_t cause); -#pragma GCC visibility pop +__exported_pop #endif /* !_SYS_PROC_INTERNAL_H_ */ diff --git a/bsd/sys/proc_ro.h b/bsd/sys/proc_ro.h index 4a582649b..f7c8be45d 100644 --- a/bsd/sys/proc_ro.h +++ b/bsd/sys/proc_ro.h @@ -29,6 +29,7 @@ #ifndef _SYS_PROC_RO_H_ #define _SYS_PROC_RO_H_ +#include #include #include #include @@ -92,7 +93,7 @@ struct proc_ro { struct task_filter_ro_data task_filters; #endif uint32_t t_flags_ro; /* RO-protected task flags (see osfmk/kern/task.h) */ - uint32_t task_control_port_options; + task_control_port_options_t task_control_port_options; }); }; diff --git a/bsd/sys/protosw.h b/bsd/sys/protosw.h index 5dddd2ec8..97750cf0e 100644 --- a/bsd/sys/protosw.h +++ b/bsd/sys/protosw.h @@ -280,8 +280,6 @@ struct protosw { void (*pr_init) /* initialization hook */ (struct protosw *, struct domain *); void (*pr_drain)(void); /* flush any excess space possible */ - int (*pr_sysctl) /* sysctl for protocol */ - (int *, u_int, void *, size_t *, void *, size_t); int (*pr_lock) /* lock function for protocol */ (struct socket *so, int refcnt, void *debug); int (*pr_unlock) /* unlock for protocol */ @@ -299,6 +297,9 @@ struct protosw { void (*pr_copy_last_owner) /* copy last socket from listener */ (struct socket *so, struct socket *head); + + /* Memory Accounting instance for this subsystem. */ + struct mem_acct *pr_mem_acct; }; /* @@ -549,13 +550,6 @@ struct pr_usrreqs { /* Values for pru_flags */ #define PRUF_OLD 0x10000000 /* added via net_add_proto */ - -#ifdef BSD_KERNEL_PRIVATE -/* - * For faster access than net_uptime(), bypassing the initialization. 
- */ -extern u_int64_t _net_uptime; -#endif /* BSD_KERNEL_PRIVATE */ #endif /* XNU_KERNEL_PRIVATE */ __BEGIN_DECLS @@ -615,12 +609,6 @@ extern int net_del_proto(int, int, struct domain *) __XNU_INTERNAL(net_del_proto); extern int net_add_proto_old(struct protosw_old *, struct domain_old *); extern int net_del_proto_old(int, int, struct domain_old *); -extern void net_update_uptime(void); -extern void net_update_uptime_with_time(const struct timeval *); -extern uint64_t net_uptime(void); -extern uint64_t net_uptime_ms(void); -extern uint64_t net_uptime_us(void); -extern void net_uptime2timeval(struct timeval *); extern struct protosw *pffindproto(int family, int protocol, int type) __XNU_INTERNAL(pffindproto); #else diff --git a/bsd/sys/pthread_shims.h b/bsd/sys/pthread_shims.h index 4afb3d86d..eb505a47c 100644 --- a/bsd/sys/pthread_shims.h +++ b/bsd/sys/pthread_shims.h @@ -214,7 +214,9 @@ typedef const struct pthread_callbacks_s { void (*thread_exception_return)(void); void (*thread_bootstrap_return)(void); - void *__unused_was_absolutetime_to_microtime; + /* osfmk/kern/cpu_data.h */ + void (*abandon_preemption_disable_measurement)(void); + void *__unused_was_thread_set_workq_pri; void *__unused_was_thread_set_workq_qos; diff --git a/bsd/sys/reason.h b/bsd/sys/reason.h index e6db1358c..a8cdfc0cf 100644 --- a/bsd/sys/reason.h +++ b/bsd/sys/reason.h @@ -89,7 +89,7 @@ int exit_with_mach_exception(struct proc *p, exception_info_t exception, uint32_ #if CONFIG_EXCLAVES int exit_with_exclave_exception(struct proc *p, exception_info_t exception, uint32_t flags); #endif -void exit_with_mach_exception_using_ast(exception_info_t exception, uint32_t flags); +void exit_with_mach_exception_using_ast(exception_info_t exception, uint32_t flags, bool fatal); #else /* XNU_KERNEL_PRIVATE */ @@ -155,11 +155,15 @@ void os_reason_set_description_data(os_reason_t cur_reason, uint32_t type, void #define OS_REASON_CORERC 41 #define OS_REASON_SELF_RESTRICT 42 #define OS_REASON_ARKIT 43 +#define OS_REASON_CAMERA 44 +#define OS_REASON_BACKBOARD 45 +#define OS_REASON_POWEREXCEPTIONS 46 +#define OS_REASON_SECINIT 47 /* * Update whenever new OS_REASON namespaces are added. 
*/ -#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_ARKIT +#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_SECINIT #define OS_REASON_BUFFER_MAX_SIZE 5120 @@ -294,6 +298,7 @@ int terminate_with_payload(int pid, uint32_t reason_namespace, uint64_t reason_c #define EXEC_EXIT_REASON_SET_DYLD_INFO 17 #define EXEC_EXIT_REASON_MACHINE_THREAD 18 #define EXEC_EXIT_REASON_BAD_PSATTR 19 +#define EXEC_EXIT_REASON_NOX86EXEC 20 #define EXEC_EXIT_REASON_MAP_EXEC_FAILURE 21 /* * guard reasons diff --git a/bsd/sys/reboot.h b/bsd/sys/reboot.h index b6e786144..7df9f50dd 100644 --- a/bsd/sys/reboot.h +++ b/bsd/sys/reboot.h @@ -167,6 +167,7 @@ __END_DECLS #if KERNEL_PRIVATE __BEGIN_DECLS int get_system_inshutdown(void); +int get_system_inuserspacereboot(void); __END_DECLS #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index beac1241f..dc1871bc8 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -575,6 +575,7 @@ struct proc_rlimit_control_wakeupmon { #define IOPOL_TYPE_VFS_SKIP_MTIME_UPDATE 8 #define IOPOL_TYPE_VFS_ALLOW_LOW_SPACE_WRITES 9 #define IOPOL_TYPE_VFS_DISALLOW_RW_FOR_O_EVTONLY 10 +#define IOPOL_TYPE_VFS_ENTITLED_RESERVE_ACCESS 14 /* scope */ #define IOPOL_SCOPE_PROCESS 0 @@ -625,6 +626,9 @@ struct proc_rlimit_control_wakeupmon { #define IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_DEFAULT 0 #define IOPOL_VFS_NOCACHE_WRITE_FS_BLKSIZE_ON 1 +#define IOPOL_VFS_ENTITLED_RESERVE_ACCESS_OFF 0 +#define IOPOL_VFS_ENTITLED_RESERVE_ACCESS_ON 1 + #endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL */ #ifndef KERNEL diff --git a/bsd/sys/resource_private.h b/bsd/sys/resource_private.h index 6288f1409..84c5657d2 100644 --- a/bsd/sys/resource_private.h +++ b/bsd/sys/resource_private.h @@ -118,12 +118,28 @@ __END_DECLS #endif /* !defined(KERNEL) */ -/* Additional private parameters to getpriority()/setpriority( */ +/* Additional private parameters to getpriority()/setpriority() */ #define PRIO_DARWIN_GPU 5 /* Second argument is a PID */ -#define PRIO_DARWIN_GPU_ALLOW 0x1 -#define PRIO_DARWIN_GPU_DENY 0x2 +__enum_decl(darwin_gpu_role_t, uint8_t, { + /* GPU Role unmanaged, default value at task start */ + PRIO_DARWIN_GPU_UNKNOWN = 0x0, + /* existing allow state for compatibility */ + PRIO_DARWIN_GPU_ALLOW = 0x1, + /* GPU access is denied by Runningboard */ + PRIO_DARWIN_GPU_DENY = 0x2, + /* Allowed to use GPU at Background priority, not visible to user, prioritizes running most-efficiently */ + PRIO_DARWIN_GPU_BACKGROUND = 0x3, + /* GPU used for non-visible-UI long-running progress-bar workloads, balances between sustainable thermals and perf */ + PRIO_DARWIN_GPU_UTILITY = 0x4, + /* Renders visible UI, known to be a non-focal app */ + PRIO_DARWIN_GPU_UI_NON_FOCAL = 0x5, + /* Renders visible UI, unknown focality */ + PRIO_DARWIN_GPU_UI = 0x6, + /* Renders visible UI, is part of a focal app */ + PRIO_DARWIN_GPU_UI_FOCAL = 0x7, +}); #define PRIO_DARWIN_ROLE 6 /* Second argument is a PID */ @@ -134,9 +150,11 @@ __END_DECLS #define PRIO_DARWIN_ROLE_UI_NON_FOCAL 0x4 /* On screen, non-focal UI */ #define PRIO_DARWIN_ROLE_TAL_LAUNCH 0x5 /* Throttled-launch (for OS X TAL resume) */ #define PRIO_DARWIN_ROLE_DARWIN_BG 0x6 /* Throttled for running in the background */ +#define PRIO_DARWIN_ROLE_USER_INIT 0x7 /* Off-screen doing user-initiated work */ #define PRIO_DARWIN_GAME_MODE 7 /* Second argument is a PID */ #define PRIO_DARWIN_CARPLAY_MODE 8 /* Second argument is a PID */ +#define PRIO_DARWIN_RUNAWAY_MITIGATION 9 /* Second argument is a PID */ #define PRIO_DARWIN_GAME_MODE_OFF 0x0 #define 
PRIO_DARWIN_GAME_MODE_ON 0x1 @@ -144,6 +162,9 @@ __END_DECLS #define PRIO_DARWIN_CARPLAY_MODE_OFF 0x0 #define PRIO_DARWIN_CARPLAY_MODE_ON 0x1 +#define PRIO_DARWIN_RUNAWAY_MITIGATION_OFF 0x0 +#define PRIO_DARWIN_RUNAWAY_MITIGATION_ON 0x1 + /* * Flags for I/O monitor control. */ diff --git a/bsd/sys/signal.h b/bsd/sys/signal.h index 25f2e353f..9ecad9af6 100644 --- a/bsd/sys/signal.h +++ b/bsd/sys/signal.h @@ -166,6 +166,7 @@ union sigval { #define SIGEV_NONE 0 /* No async notification */ #define SIGEV_SIGNAL 1 /* aio - completion notification */ #define SIGEV_THREAD 3 /* [NOTIMP] [RTS] call notification function */ +#define SIGEV_KEVENT 4 /* Generate a kevent */ #ifndef KERNEL struct sigevent { diff --git a/bsd/sys/signalvar.h b/bsd/sys/signalvar.h index 4473091ee..82cb185c8 100644 --- a/bsd/sys/signalvar.h +++ b/bsd/sys/signalvar.h @@ -247,6 +247,7 @@ void psignal_sigkill_try_thread_with_reason(struct proc *p, struct thread *th cpu_type_t process_cpu_type(struct proc * core_proc); cpu_type_t process_cpu_subtype(struct proc * core_proc); +int is_coredump_eligible(struct proc *); int coredump(struct proc *p, uint32_t reserve_mb, int coredump_flags); void set_thread_exit_reason(void *th, void *reason, boolean_t proc_locked); diff --git a/bsd/sys/snapshot.h b/bsd/sys/snapshot.h index ae218bc90..32d0456cc 100644 --- a/bsd/sys/snapshot.h +++ b/bsd/sys/snapshot.h @@ -51,12 +51,16 @@ int fs_snapshot_rename(int, const char *, const char *, uint32_t) __OSX_AVAILABL #endif /* !KERNEL */ /* fs_snapshot_mount() supported flags */ +#define SNAPSHOT_MNT_RESERVED1 0x00000001 /* same as MNT_RDONLY */ +#define SNAPSHOT_MNT_NOEXEC 0x00000004 /* same as MNT_NOEXEC */ #define SNAPSHOT_MNT_NOSUID 0x00000008 /* same as MNT_NOSUID */ #define SNAPSHOT_MNT_NODEV 0x00000010 /* same as MNT_NODEV */ #define SNAPSHOT_MNT_DONTBROWSE 0x00100000 /* same as MNT_DONTBROWSE */ #define SNAPSHOT_MNT_IGNORE_OWNERSHIP 0x00200000 /* same as MNT_IGNORE_OWNERSHIP */ #define SNAPSHOT_MNT_NOFOLLOW 0x08000000 /* same as MNT_NOFOLLOW */ +#define SNAPSHOT_MNT_VALIDMASK 0x0830001d + #ifndef KERNEL int fs_snapshot_mount(int, const char *, const char *, uint32_t) __OSX_AVAILABLE(10.12) __IOS_AVAILABLE(10.0) __TVOS_AVAILABLE(10.0) __WATCHOS_AVAILABLE(3.0); diff --git a/bsd/sys/socket_private.h b/bsd/sys/socket_private.h index 5d39a73cc..cf5c61bb5 100644 --- a/bsd/sys/socket_private.h +++ b/bsd/sys/socket_private.h @@ -258,6 +258,8 @@ #define SO_APPLICATION_ID 0x1133 /* ID of attributing app - so_application_id_t */ /* 0x1134 is SO_BINDTODEVICE, see socket.h */ #define SO_MARK_DOMAIN_INFO_SILENT 0x1135 /* Domain information should be silently withheld */ +#define SO_MAX_PACING_RATE 0x1136 /* Define per-socket maximum pacing rate in bytes/sec */ +#define SO_CONNECTION_IDLE 0x1137 /* Connection is idle (int) */ struct so_mark_cellfallback_uuid_args { uuid_t flow_uuid; @@ -571,6 +573,7 @@ struct user32_sa_endpoints { #define SCM_TIMESTAMP_CONTINUOUS 0x07 /* timestamp (uint64_t) */ #define SCM_MPKL_SEND_INFO 0x08 /* send info for multi-layer packet logging (struct so_mpkl_send_info) */ #define SCM_MPKL_RECV_INFO 0x09 /* receive info for multi-layer packet logging (struct so_mpkl_recv_info */ +#define SCM_TXTIME 0x10 /* Set expected transmit time in absolute-time nanoseconds */ #ifdef KERNEL_PRIVATE /* diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index dd9e59c22..1435253e7 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -102,12 +102,6 @@ struct sockutil; /* strings for sleep message: */ extern char netio[], netcon[], 
netcls[]; -#define SOCKET_CACHE_ON -#define SO_CACHE_FLUSH_INTERVAL 1 /* Seconds */ -#define SO_CACHE_TIME_LIMIT (120/SO_CACHE_FLUSH_INTERVAL) /* Seconds */ -#define SO_CACHE_MAX_FREE_BATCH 50 -#define MAX_CACHED_SOCKETS 512 -#define TEMPDEBUG 0 #endif /* KERNEL_PRIVATE */ #ifdef PRIVATE @@ -227,9 +221,6 @@ struct socket { kauth_cred_t so_cred; /* cred of who opened the socket */ /* NB: generation count must not be first; easiest to make it last. */ so_gen_t so_gencnt; /* generation count */ - STAILQ_ENTRY(socket) so_cache_ent; /* socache entry */ - caddr_t so_saved_pcb; /* Saved pcb when cacheing */ - u_int64_t cache_timestamp; /* time socket was cached */ uint32_t so_eventmask; /* event mask */ pid_t last_pid; /* pid of most recent accessor */ @@ -281,7 +272,7 @@ struct socket { #define SOF1_PRECONNECT_DATA 0x00000020 /* request for preconnect data */ #define SOF1_EXTEND_BK_IDLE_WANTED 0x00000040 /* option set */ #define SOF1_EXTEND_BK_IDLE_INPROG 0x00000080 /* socket */ -#define SOF1_CACHED_IN_SOCK_LAYER 0x00000100 /* bundled with inpcb and tcpcb */ +/* UNUSED */ #define SOF1_TFO_REWIND 0x00000200 /* rewind mptcp meta data */ #define SOF1_CELLFALLBACK 0x00000400 /* Initiated by cell fallback */ #define SOF1_QOSMARKING_ALLOWED 0x00000800 /* policy allows DSCP map */ @@ -304,6 +295,7 @@ struct socket { #define SOF1_TRACKER_NON_APP_INITIATED 0x10000000 /* Tracker connection is non-app initiated */ #define SOF1_APPROVED_APP_DOMAIN 0x20000000 /* Connection is for an approved associated app domain */ #define SOF1_DOMAIN_INFO_SILENT 0x40000000 /* Maintain silence on any domain information */ +#define SOF1_DOMAIN_MATCHED_POLICY 0x80000000 /* Domain was used for policy evaluation */ uint32_t so_upcallusecount; /* number of upcalls in progress */ int so_usecount; /* refcounting of socket use */ @@ -685,11 +677,12 @@ struct sf_buf { } #define SB_MB_CHECK(sb) do { \ - if (((sb)->sb_mb != NULL && \ - (sb)->sb_cc == 0) || \ + if (((sb)->sb_mb != NULL && (sb)->sb_cc == 0 && m_length((sb)->sb_mb) != 0) || \ ((sb)->sb_mb == NULL && (sb)->sb_cc > 0)) \ - panic("corrupt so_rcv: sb_mb %p sb_cc %d\n", \ - (sb)->sb_mb, (sb)->sb_cc); \ + panic("corrupt so_rcv (%s:%d): sb_mb %p m_len: %d m_type: %u sb_cc %u sb_ctl %u\n", \ + __func__, __LINE__, \ + (sb)->sb_mb, (sb)->sb_mb != NULL ? m_length((sb)->sb_mb) : 0, \ + (sb)->sb_mb != NULL ? (sb)->sb_mb->m_type : 0, (sb)->sb_cc, (sb)->sb_ctl); \ } while (0) #define SODEFUNCTLOG(fmt, ...) 
do { \ @@ -737,8 +730,6 @@ struct so_procinfo { extern uint32_t sb_max; extern so_gen_t so_gencnt; extern int socket_debug; -extern int sosendjcl; -extern int sosendjcl_ignore_capab; extern int sodefunctlog; extern int sothrottlelog; extern int sorestrictrecv; @@ -861,7 +852,6 @@ extern int sbappendstream_rcvdemux(struct socket *so, struct mbuf *m); #if MPTCP extern int sbappendmptcpstream_rcv(struct sockbuf *sb, struct mbuf *m); #endif /* MPTCP */ -extern void sbcheck(struct sockbuf *sb); extern void sblastmbufchk(struct sockbuf *, const char *); extern void sblastrecordchk(struct sockbuf *, const char *); extern struct mbuf *sbcreatecontrol(caddr_t __sized_by(size) p, int size, int type, int level); @@ -928,7 +918,7 @@ extern void sbunlock(struct sockbuf *sb, boolean_t keeplocked); extern int soaccept(struct socket *so, struct sockaddr **nam); extern int soacceptlock(struct socket *so, struct sockaddr **nam, int dolock); extern int soacceptfilter(struct socket *so, struct socket *head); -extern struct socket *soalloc(int waitok, int dom, int type); +extern struct socket *soalloc(void); extern int sobindlock(struct socket *so, struct sockaddr *nam, int dolock); extern int soclose(struct socket *so); extern int soclose_locked(struct socket *so); @@ -1020,6 +1010,9 @@ typedef struct tracker_metadata_short { char domain_owner[TRACKER_DOMAIN_SHORT_MAX + 1]; } tracker_metadata_short_t; +// metadata will be filled out by the lookup. +// Set the SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT flag in the metadata to request that the +// entry be extended. extern int tracker_lookup(uuid_t app_uuid, struct sockaddr *, tracker_metadata_t *metadata); /* @@ -1061,8 +1054,7 @@ extern void so_update_tx_data_stats(struct socket *, uint32_t, uint32_t); extern void set_packet_service_class(struct mbuf *, struct socket *, mbuf_svc_class_t, u_int32_t); -extern int so_tos_from_control(struct mbuf *); -extern int so_tc_from_control(struct mbuf *, int *); +extern int ip_tos_from_control(struct mbuf *); extern mbuf_svc_class_t so_tc2msc(int); extern int so_svc2tc(mbuf_svc_class_t); @@ -1091,7 +1083,6 @@ extern int so_wait_for_if_feedback(struct socket *); extern int soopt_getm(struct sockopt *sopt, struct mbuf **mp); extern int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m); extern int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m); -extern boolean_t so_cache_timer(void); extern void mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len); extern void mptcp_preproc_sbdrop(struct socket *, struct mbuf *, unsigned int); @@ -1141,6 +1132,7 @@ enum so_tracker_attribute { #define SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED 0x00000001 #define SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER 0x00000002 #define SO_TRACKER_ATTRIBUTE_FLAGS_DOMAIN_SHORT 0x00000004 +#define SO_TRACKER_ATTRIBUTE_FLAGS_EXTENDED_TIMEOUT 0x00000008 #ifndef KERNEL #define SO_TRACKER_TRANSPARENCY_VERSION 3 diff --git a/bsd/sys/sockio_private.h b/bsd/sys/sockio_private.h index e841b594a..39bce7c61 100644 --- a/bsd/sys/sockio_private.h +++ b/bsd/sys/sockio_private.h @@ -127,6 +127,12 @@ #define SIOCGPOINTOPOINTMDNS _IOWR('i', 95, struct ifreq) /* mDNS on point-to-point interface */ #define SIOCSPOINTOPOINTMDNS _IOW('i', 95, struct ifreq) /* mDNS on point-to-point interface */ +#define SIOCGINBANDWAKEPKT _IOWR('i', 96, struct ifreq) /* inband wake packet tagging (int) */ +#define SIOCSINBANDWAKEPKT _IOW('i', 96, struct ifreq) /* inband wake packet tagging (int) */ + +#define SIOCGLOWPOWERWAKE _IOWR('i', 97, struct ifreq) /* low power wake mode (int) 
*/ +#define SIOCSLOWPOWERWAKE _IOW('i', 97, struct ifreq) /* low power wake mode (int) */ + #ifdef KERNEL_PRIVATE #define SIOCSDRVSPEC32 _IOW('i', 123, struct ifdrv32) /* set driver-specific * parameters */ @@ -204,7 +210,6 @@ #define SIOCSIFNETSIGNATURE _IOWR('i', 174, struct if_nsreq) #define SIOCGIFNETSIGNATURE _IOWR('i', 175, struct if_nsreq) -#define SIOCGECNMODE _IOWR('i', 176, struct ifreq) #define SIOCSECNMODE _IOW('i', 177, struct ifreq) #define SIOCSIFORDER _IOWR('i', 178, struct if_order) @@ -295,4 +300,9 @@ #define SIOCSIFCONGESTEDLINK _IOW('i', 226, struct ifreq) /* ifr_intval */ #define SIOCGIFCONGESTEDLINK _IOWR('i', 226, struct ifreq) /* ifr_intval */ +#define SIOCSIFISCOMPANIONLINK _IOW('i', 227, struct ifreq) /* marks interface as a companion link interface */ + +#define SIOCSIFL4S _IOW('i', 228, struct ifreq) /* Set L4S enablement state (Enable or Disable) */ +#define SIOCGIFL4S _IOWR('i', 228, struct ifreq) + #endif /* !_SYS_SOCKIO_PRIVATE_H_ */ diff --git a/bsd/sys/spawn_internal.h b/bsd/sys/spawn_internal.h index d9c867612..c988f8167 100644 --- a/bsd/sys/spawn_internal.h +++ b/bsd/sys/spawn_internal.h @@ -258,6 +258,8 @@ typedef struct _posix_spawnattr { uint32_t psa_kqworkloop_soft_limit; /* kqworkloop soft limit */ uint32_t psa_kqworkloop_hard_limit; /* kqworkloop hard limit */ + uint32_t psa_conclave_mem_limit; /* conclave hard memory limit (in MB) */ + /* * NOTE: Extensions array pointers must stay at the end so that * everything above this point stays the same size on different bitnesses @@ -286,6 +288,8 @@ __options_decl(posix_spawn_secflag_options, uint16_t, { POSIX_SPAWN_SECFLAG_EXPLICIT_CHECK_ENFORCE = 0x80, POSIX_SPAWN_SECFLAG_EXPLICIT_DISABLE_INHERIT = 0x100, POSIX_SPAWN_SECFLAG_EXPLICIT_ENABLE_INHERIT = 0x200, + POSIX_SPAWN_SECFLAG_EXPLICIT_REQUIRE_ENABLE = 0x400, + POSIX_SPAWN_SECFLAG_EXPLICIT_ENABLE_PURE_DATA = 0x800, }); /* @@ -303,7 +307,7 @@ __options_decl(posix_spawn_secflag_options, uint16_t, { */ #define POSIX_SPAWN_JETSAM_MEMLIMIT_ACTIVE_FATAL 0x04 /* if set, limit is fatal when the process is active */ #define POSIX_SPAWN_JETSAM_MEMLIMIT_INACTIVE_FATAL 0x08 /* if set, limit is fatal when the process is inactive */ - +#define POSIX_SPAWN_JETSAM_REALTIME_AUDIO 0x10 /* if set, avoid expensive memory telemetry while audio is playing */ /* * Flags set based on posix_spawnattr_set_jetsam_ttr_np(). @@ -356,7 +360,8 @@ __options_decl(posix_spawn_secflag_options, uint16_t, { #define POSIX_SPAWN_PROC_TYPE_MASK 0x00000F00 #define POSIX_SPAWN_PROC_TYPE_APP_DEFAULT 0x00000100 -#define POSIX_SPAWN_PROC_TYPE_APP_TAL 0x00000200 /* unused */ +#define POSIX_SPAWN_PROC_TYPE_APP_NONUI 0x00000200 +#define POSIX_SPAWN_PROC_TYPE_APP_TAL POSIX_SPAWN_PROC_TYPE_APP_NONUI /* old name */ #define POSIX_SPAWN_PROC_TYPE_DAEMON_STANDARD 0x00000300 #define POSIX_SPAWN_PROC_TYPE_DAEMON_INTERACTIVE 0x00000400 diff --git a/bsd/sys/stdio.h b/bsd/sys/stdio.h index 089d9e1bc..bf60d7784 100644 --- a/bsd/sys/stdio.h +++ b/bsd/sys/stdio.h @@ -37,6 +37,7 @@ #define RENAME_EXCL 0x00000004 #define RENAME_RESERVED1 0x00000008 #define RENAME_NOFOLLOW_ANY 0x00000010 +#define RENAME_RESOLVE_BENEATH 0x00000020 #endif #ifndef KERNEL diff --git a/bsd/sys/sys_domain.h b/bsd/sys/sys_domain.h index a0bda7358..ed0c73017 100644 --- a/bsd/sys/sys_domain.h +++ b/bsd/sys/sys_domain.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005, 2012, 2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2005, 2012, 2014, 2025 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -34,10 +34,6 @@ #include #include -#ifdef KERNEL_PRIVATE -#include -#endif /* KERNEL_PRIVATE */ - /* Kernel Events Protocol */ #define SYSPROTO_EVENT 1 /* kernel events protocol */ @@ -53,26 +49,8 @@ struct sockaddr_sys { u_int32_t ss_reserved[7]; /* reserved to the protocol use */ }; -#ifdef PRIVATE -struct xsystmgen { - u_int32_t xg_len; /* length of this structure */ - u_int64_t xg_count; /* number of PCBs at this time */ - u_int64_t xg_gen; /* generation count at this time */ - u_int64_t xg_sogen; /* current socket generation count */ -}; -#endif /* PRIVATE */ - -#ifdef KERNEL_PRIVATE - -extern struct domain *systemdomain; - -SYSCTL_DECL(_net_systm); - -/* built in system domain protocols init function */ -__BEGIN_DECLS -void kern_event_init(struct domain *); -void kern_control_init(struct domain *); -__END_DECLS +#if defined(PRIVATE) && !defined(MODULES_SUPPORTED) +#include #endif /* KERNEL_PRIVATE */ #endif /* _SYSTEM_DOMAIN_H_ */ diff --git a/bsd/sys/sys_domain_private.h b/bsd/sys/sys_domain_private.h new file mode 100644 index 000000000..d91b8ed91 --- /dev/null +++ b/bsd/sys/sys_domain_private.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#ifndef _SYSTEM_DOMAIN_PRIVATE_H_ +#define _SYSTEM_DOMAIN_PRIVATE_H_ + +#include + +#ifdef KERNEL_PRIVATE +#include +#endif /* KERNEL_PRIVATE */ + +struct xsystmgen { + u_int32_t xg_len; /* length of this structure */ + u_int64_t xg_count; /* number of PCBs at this time */ + u_int64_t xg_gen; /* generation count at this time */ + u_int64_t xg_sogen; /* current socket generation count */ +}; + +#ifdef KERNEL_PRIVATE + +extern struct domain *systemdomain; + +SYSCTL_DECL(_net_systm); + +/* built in system domain protocols init function */ +__BEGIN_DECLS +void kern_event_init(struct domain *); +void kern_control_init(struct domain *); +__END_DECLS +#endif /* KERNEL_PRIVATE */ + +#endif /* _SYSTEM_DOMAIN_PRIVATE_H_ */ diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index 4f0b291ae..9dc917dea 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -168,7 +168,8 @@ struct ctlname { #if XNU_KERNEL_PRIVATE #define CTLFLAG_PERMANENT 0x00200000 /* permanent sysctl_oid */ #endif -#define CTLFLAG_EXPERIMENT 0x00100000 /* Allows writing w/ the trial experiment entitlement. */ +#define CTLFLAG_EXPERIMENT 0x00100000 /* Allows read/write w/ the trial experiment entitlement. */ +#define CTLFLAG_LEGACY_EXPERIMENT 0x00080000 /* Allows writing w/ the legacy trial experiment entitlement. */ /* * USE THIS instead of a hardwired number from the categories below @@ -530,16 +531,20 @@ __END_DECLS SYSCTL_OID(parent, nbr, name, access, \ ptr, arg, handler, fmt, descr) +#pragma mark Trial Experiments + /* * The EXPERIMENT macros below expose values for on-device experimentation (A/B testing) via Trial. - * These values will be set shortly after boot by the KRExperiments framework based on any - * active experiments on the device. - * Values exposed via these macros are still normal sysctls and can be set by the superuser in the - * development or debug kernel. However, on the release kernel they can ONLY be set by processes - * with the com.apple.private.write-kr-experiment-factors entitlement. - * In addition, for numeric types, special macros are provided that enforce a valid range for the value (inclusive) - * to ensure that an errant experiment can't set a totally unexpected value. These macros also track which - * values have been modified via sycstl(3) so that they can be inspected with the showexperiments lldb macro. + * These values will be set shortly after boot by triald based on any active experiments on the + * device. Values exposed via these macros are still normal sysctls and can be set by the + * superuser in the development or debug kernel. However, on the release kernel they can ONLY be + * set by processes with the com.apple.private.kernel.read-write-trial-experiment-factors + * entitlement. + * + * For numeric types, special macros are provided that enforce a valid range for the value + * (inclusive) to ensure that an errant experiment can't set a totally unexpected value. These + * macros also track which values have been modified via sycstl(3) so that they can be inspected + * with the showexperiments lldb macro. 
*/ struct experiment_spec { @@ -568,7 +573,57 @@ int experiment_factor_##experiment_factor_typename##_handler SYSCTL_HANDLER_ARGS experiment_factor_numeric_types #undef X -#define __EXPERIMENT_FACTOR_SPEC(parent, name, p, min, max) \ +#define __EXPERIMENT_FACTOR_SPEC(name, p, min, max) \ + struct experiment_spec _experiment_##name = { \ + .ptr = p, \ + .min_value = min, \ + .max_value = max, \ + .original_value = 0, \ + .modified = false \ + } + +#define EXPERIMENT_FACTOR_UINT(name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(unsigned int), "must be integer sized"); \ + SYSCTL_PROC(_kern_trial, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &_experiment_##name, 1, &experiment_factor_uint_handler, "IU", descr); + +#define EXPERIMENT_FACTOR_INT(name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(int), "must be integer sized"); \ + SYSCTL_PROC(_kern_trial, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &_experiment_##name, 1, &experiment_factor_int_handler, "I", descr); + +#define EXPERIMENT_FACTOR_ULONG(name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(unsigned long), "must be long sized"); \ + SYSCTL_PROC(_kern_trial, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &_experiment_##name, 1, &experiment_factor_ulong_handler, "LU", descr); + +#define EXPERIMENT_FACTOR_LONG(name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(long), "must be long sized"); \ + SYSCTL_PROC(_kern_trial, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &_experiment_##name, 1, &experiment_factor_long_handler, "L", descr); + +#define EXPERIMENT_FACTOR_UINT64(name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(uint64_t), "must be 8 bytes"); \ + SYSCTL_PROC(_kern_trial, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &_experiment_##name, 1, &experiment_factor_uint64_handler, "QU", descr); + +#define EXPERIMENT_FACTOR_INT64(name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_SPEC(name, ptr, min, max); \ + _Static_assert(sizeof(*(ptr)) == sizeof(int64_t), "must be 8 bytes"); \ + SYSCTL_PROC(_kern_trial, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &_experiment_##name, 1, &experiment_factor_int64_handler, "Q", descr); + +/* + * Calls an user provided handler to read / write this factor. + * Entitlement checking will still be done by sysctl, but it's the callers responsibility to validate any new values. + * This factor will not be printed out via the showexperiments lldb macro. 
+ */ +#define EXPERIMENT_FACTOR_PROC(access, ptr, arg, handler, fmt, descr) \ + _Static_assert(arg != 1, "arg can not be 1"); \ + SYSCTL_PROC(_kern_trial, OID_AUTO, name, access | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, ptr, arg, handler, fmt, descr); + +/* Legacy factors */ + +#define __EXPERIMENT_FACTOR_LEGACY_SPEC(parent, name, p, min, max) \ struct experiment_spec experiment_##parent##_##name = { \ .ptr = p, \ .min_value = min, \ @@ -577,44 +632,44 @@ experiment_factor_numeric_types .modified = false \ } -#define EXPERIMENT_FACTOR_UINT(parent, name, ptr, min, max, descr) \ - __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ +#define EXPERIMENT_FACTOR_LEGACY_UINT(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_LEGACY_SPEC(parent, name, ptr, min, max); \ _Static_assert(sizeof(*(ptr)) == sizeof(unsigned int), "must be integer sized"); \ - SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_uint_handler, "IU", descr); + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_uint_handler, "IU", descr); -#define EXPERIMENT_FACTOR_INT(parent, name, ptr, min, max, descr) \ - __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ +#define EXPERIMENT_FACTOR_LEGACY_INT(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_LEGACY_SPEC(parent, name, ptr, min, max); \ _Static_assert(sizeof(*(ptr)) == sizeof(int), "must be integer sized"); \ - SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_int_handler, "I", descr); + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_int_handler, "I", descr); -#define EXPERIMENT_FACTOR_ULONG(parent, name, ptr, min, max, descr) \ - __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ +#define EXPERIMENT_FACTOR_LEGACY_ULONG(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_LEGACY_SPEC(parent, name, ptr, min, max); \ _Static_assert(sizeof(*(ptr)) == sizeof(unsigned long), "must be long sized"); \ - SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_ulong_handler, "LU", descr); + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_ulong_handler, "LU", descr); -#define EXPERIMENT_FACTOR_LONG(parent, name, ptr, min, max, descr) \ - __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ +#define EXPERIMENT_FACTOR_LEGACY_LONG(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_LEGACY_SPEC(parent, name, ptr, min, max); \ _Static_assert(sizeof(*(ptr)) == sizeof(long), "must be long sized"); \ - SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_long_handler, "L", descr); + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_long_handler, "L", descr); -#define EXPERIMENT_FACTOR_UINT64(parent, name, ptr, min, max, descr) \ - __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ +#define 
EXPERIMENT_FACTOR_LEGACY_UINT64(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_LEGACY_SPEC(parent, name, ptr, min, max); \ _Static_assert(sizeof(*(ptr)) == sizeof(uint64_t), "must be 8 bytes"); \ - SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_uint64_handler, "QU", descr); + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_uint64_handler, "QU", descr); -#define EXPERIMENT_FACTOR_INT64(parent, name, ptr, min, max, descr) \ - __EXPERIMENT_FACTOR_SPEC(parent, name, ptr, min, max); \ +#define EXPERIMENT_FACTOR_LEGACY_INT64(parent, name, ptr, min, max, descr) \ + __EXPERIMENT_FACTOR_LEGACY_SPEC(parent, name, ptr, min, max); \ _Static_assert(sizeof(*(ptr)) == sizeof(int64_t), "must be 8 bytes"); \ - SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_int64_handler, "Q", descr); + SYSCTL_PROC(parent, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, &experiment_##parent##_##name, 1, &experiment_factor_int64_handler, "Q", descr); /* * Calls an user provided handler to read / write this factor. * Entitlement checking will still be done by sysctl, but it's the callers responsibility to validate any new values. * This factor will not be printed out via the showexperiments lldb macro. */ -#define EXPERIMENT_FACTOR_PROC(parent, name, access, ptr, arg, handler, fmt, descr) \ +#define EXPERIMENT_FACTOR_LEGACY_PROC(parent, name, access, ptr, arg, handler, fmt, descr) \ _Static_assert(arg != 1, "arg can not be 1"); \ - SYSCTL_PROC(parent, OID_AUTO, name, access | CTLFLAG_ANYBODY | CTLFLAG_EXPERIMENT, ptr, arg, handler, fmt, descr); + SYSCTL_PROC(parent, OID_AUTO, name, access | CTLFLAG_ANYBODY | CTLFLAG_LEGACY_EXPERIMENT, ptr, arg, handler, fmt, descr); #ifdef XNU_KERNEL_PRIVATE /* @@ -651,6 +706,11 @@ SYSCTL_DECL(_debug_test); #ifdef PRIVATE SYSCTL_DECL(_kern_bridge); SYSCTL_DECL(_hw_features); +SYSCTL_DECL(_kern_trial); +#endif + +#if BSD_KERNEL_PRIVATE +SYSCTL_DECL(_kern_memorystatus); #endif #if defined(BSD_KERNEL_PRIVATE) && SKYWALK diff --git a/bsd/sys/ubc_internal.h b/bsd/sys/ubc_internal.h index b3ab03dee..56f044452 100644 --- a/bsd/sys/ubc_internal.h +++ b/bsd/sys/ubc_internal.h @@ -259,7 +259,7 @@ struct cs_blob *ubc_get_cs_blobs(vnode_t); struct cs_blob *ubc_get_cs_supplement(vnode_t); #endif void ubc_get_cs_mtime(vnode_t, struct timespec *); -int ubc_cs_getcdhash(vnode_t, off_t, unsigned char *); +int ubc_cs_getcdhash(vnode_t, off_t, unsigned char *, uint8_t*); kern_return_t ubc_cs_blob_allocate(vm_offset_t *, vm_size_t *); void ubc_cs_blob_deallocate(vm_offset_t, vm_size_t); boolean_t ubc_cs_is_range_codesigned(vnode_t, mach_vm_offset_t, mach_vm_size_t); diff --git a/bsd/sys/user.h b/bsd/sys/user.h index f29b7b93d..9516e4fe6 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -403,11 +403,11 @@ typedef struct uthread * uthread_t; #define UT_ATIME_UPDATE 0x00002000 /* don't update atime for files accessed by this thread */ #define UT_NSPACE_FORCEDATALESSFAULTS 0x00004000 /* thread always materializes dataless files */ #define UT_LP64 0x00010000 /* denormalized P_LP64 bit from proc */ -#define UT_FS_BLKSIZE_NOCACHE_WRITES 0x00020000 /* thread wants sub pagesize directIO writes */ +#define UT_FS_ENTITLED_RESERVE_ACCESS 0x00020000 
/* thread's FS allocations should come from the entitled reserve */ #define UT_SKIP_MTIME_UPDATE 0x00040000 /* don't update mtime for files modified by this thread */ #define UT_SKIP_MTIME_UPDATE_IGNORE 0x00080000 /* ignore the process's mtime update policy when the policy is not enabled for this thread */ - #define UT_SUPPORT_LONG_PATHS 0x00100000 /* support long paths in syscalls used by this thread */ +#define UT_IGNORE_NODE_PERMISSIONS 0x00200000 /* thread should ignore node permissions */ #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index 423587b7f..ba5bfafae 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -781,23 +781,25 @@ struct vnode_attr { /* * Flags for va_dataprotect_flags */ -#define VA_DP_RAWENCRYPTED 0x0001 -#define VA_DP_RAWUNENCRYPTED 0x0002 -#define VA_DP_AUTHENTICATE 0x0004 +#define VA_DP_RAWENCRYPTED 0x0001 +#define VA_DP_RAWUNENCRYPTED 0x0002 +#define VA_DP_AUTHENTICATE 0x0004 +#define VA_DP_MINIMUM_PROTECTION 0x0008 #endif /* * Flags for va_vaflags. */ -#define VA_UTIMES_NULL 0x010000 /* utimes argument was NULL */ -#define VA_EXCLUSIVE 0x020000 /* exclusive create request */ -#define VA_NOINHERIT 0x040000 /* Don't inherit ACLs from parent */ -#define VA_NOAUTH 0x080000 -#define VA_64BITOBJIDS 0x100000 /* fileid/linkid/parentid are 64 bit */ -#define VA_REALFSID 0x200000 /* Return real fsid */ -#define VA_USEFSID 0x400000 /* Use fsid from filesystem */ -#define VA_FILESEC_ACL 0x800000 /* ACL is interior to filesec */ +#define VA_UTIMES_NULL 0x0010000 /* utimes argument was NULL */ +#define VA_EXCLUSIVE 0x0020000 /* exclusive create request */ +#define VA_NOINHERIT 0x0040000 /* Don't inherit ACLs from parent */ +#define VA_NOAUTH 0x0080000 +#define VA_64BITOBJIDS 0x0100000 /* fileid/linkid/parentid are 64 bit */ +#define VA_REALFSID 0x0200000 /* Return real fsid */ +#define VA_USEFSID 0x0400000 /* Use fsid from filesystem */ +#define VA_FILESEC_ACL 0x0800000 /* ACL is interior to filesec */ +#define VA_VAFILEID 0x1000000 /* Verify fileid and fsid */ /* * Modes. Some values same as Ixxx entries from inode.h for now. @@ -836,14 +838,17 @@ extern int vttoif_tab[]; #define REVOKEALL 0x0001 /* vnop_revoke: revoke all aliases */ /* VNOP_REMOVE/unlink flags */ -#define VNODE_REMOVE_NODELETEBUSY 0x0001 /* Don't delete busy files (Carbon) */ +#define VNODE_REMOVE_NODELETEBUSY 0x0001 /* Don't delete busy files */ #define VNODE_REMOVE_SKIP_NAMESPACE_EVENT 0x0002 /* Do not upcall to userland handlers */ #define VNODE_REMOVE_NO_AUDIT_PATH 0x0004 /* Do not audit the path */ #define VNODE_REMOVE_DATALESS_DIR 0x0008 /* Special handling for removing a dataless directory without materialization */ #ifdef BSD_KERNEL_PRIVATE #define VNODE_REMOVE_NOFOLLOW_ANY 0x0010 -#endif +#endif /* BSD_KERNEL_PRIVATE */ #define VNODE_REMOVE_SYSTEM_DISCARDED 0x0020 /* Update speculative telemetry with SYSTEM_DISCARDED use state (Default USER_DISCARDED use state) */ +#ifdef BSD_KERNEL_PRIVATE +#define VNODE_REMOVE_RESOLVE_BENEATH 0x0040 /* path must reside in the hierarchy beneath the starting directory */ +#endif /* BSD_KERNEL_PRIVATE */ /* VNOP_READDIR flags: */ #define VNODE_READDIR_EXTENDED 0x0001 /* use extended directory entries */ @@ -2524,6 +2529,16 @@ task_t vfs_context_task(vfs_context_t ctx); */ int vnode_isauthfs(vnode_t vp); +/*! + * @function vnode_hasmultipath + * @abstract Determine if the given vnode has multiple paths. + * @discussion This function needs to be called with an iocount held on the + * given vnode. + * @param vp The vnode to examine. 
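As a quick compile-time sanity sketch (illustrative, not part of the patch), the zero-padded va_vaflags constants above keep their original values and the new VA_VAFILEID bit does not collide with them:

_Static_assert(VA_UTIMES_NULL == 0x010000 && VA_FILESEC_ACL == 0x800000,
    "re-padded va_vaflags values are unchanged");
_Static_assert((VA_VAFILEID & (VA_UTIMES_NULL | VA_EXCLUSIVE | VA_NOINHERIT |
    VA_NOAUTH | VA_64BITOBJIDS | VA_REALFSID | VA_USEFSID | VA_FILESEC_ACL)) == 0,
    "VA_VAFILEID occupies a previously unused bit");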
+ * @result Non-zero to indicate that the vnode has multiple paths. Zero otherwise. + */ +int vnode_hasmultipath(vnode_t vp); + #endif /* KERNEL_PRIVATE */ #ifdef BSD_KERNEL_PRIVATE @@ -2536,7 +2551,7 @@ int vn_stat_noauth(struct vnode *vp, void * sb, kauth_filesec_t *xsec, int i int vaccess(mode_t file_mode, uid_t uid, gid_t gid, mode_t acc_mode, kauth_cred_t cred); int check_mountedon(dev_t dev, enum vtype type, int *errorp); -int vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash); +int vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash, uint8_t *type); void vnode_reclaim(vnode_t); vnode_t current_workingdir(void); void *vnode_vfsfsprivate(vnode_t); @@ -2551,6 +2566,7 @@ int vnode_makeimode(int, int); enum vtype vnode_iftovt(int); int vnode_vttoif(enum vtype); int vnode_isshadow(vnode_t); +int vnode_getfromid(int, uint64_t, vfs_context_t, int, vnode_t *); boolean_t vnode_on_reliable_media(vnode_t); /* * Indicate that a file has multiple hard links. VFS will always call diff --git a/bsd/sys/vnode_if.h b/bsd/sys/vnode_if.h index f35545697..61882fa45 100644 --- a/bsd/sys/vnode_if.h +++ b/bsd/sys/vnode_if.h @@ -794,6 +794,8 @@ enum { VFS_RENAME_DATALESS = 0x00000008, /* used by sys/stdio for RENAME_NOFOLLOW_ANY */ VFS_RENAME_RESERVED1 = 0x00000010, + /* used by sys/stdio for RENAME_RESOLVE_BENEATH */ + VFS_RENAME_RESERVED2 = 0x00000020, VFS_RENAME_FLAGS_MASK = (VFS_RENAME_SECLUDE | VFS_RENAME_SWAP | VFS_RENAME_EXCL), @@ -1821,10 +1823,12 @@ __options_decl(vnode_verify_flags_t, uint32_t, { VNODE_VERIFY_CONTEXT_ALLOC = 1, VNODE_VERIFY_WITH_CONTEXT = 2, VNODE_VERIFY_CONTEXT_FREE = 4, + VNODE_VERIFY_PRECOMPUTED = 8, }); #define VNODE_VERIFY_DEFAULT VNODE_VERIFY_DEFAULT #define VNODE_VERIFY_WITH_CONTEXT VNODE_VERIFY_WITH_CONTEXT +#define VNODE_VERIFY_PRECOMPUTED VNODE_VERIFY_PRECOMPUTED struct vnop_verify_args { struct vnodeop_desc *a_desc; @@ -1836,6 +1840,7 @@ struct vnop_verify_args { void **a_verify_ctxp; vnode_verify_flags_t a_flags; vfs_context_t a_context; + vnode_verify_kind_t *a_verifykind; /* vnode_verify_kind_t defined in sys/buf.h */ }; /*! @@ -1847,17 +1852,33 @@ struct vnop_verify_args { * @param vp The vnode for which data is to be verified. * @param foffset Offset (in bytes) at which region to be verified starts. * @param buf buffer containing file data at foffset. If this is NULL, then only the verification block size is - * being requested. - * @param bufsize size of data buffer to be verified. + * being requested. When VNODE_VERIFY_PRECOMPUTED is set, this buffer is for the precomputed verification + * data. + * @param bufsize size of data buffer to be verified. For VNODE_VERIFY_CONTEXT_ALLOC, this specifies the length of the region + * of the file beginning at f_offset that needs verification and the context should be allocated for (f_offset, f_offset + bufsize) * @param verifyblksize pointer to size of verification block size in use for this file. If the verification block size is 0, * no verification will be performed. The verification block size can be any value which is a power of two upto 128KiB. * @param verify_ctxp context for verification to allocated by the FS and used in verification. + * * @param flags modifier flags. + * if no flags are set (VNODE_VERIFY_DEFAULT), one or both of a_buf and a_verifyblksize is passed. Verification is only required + * if a_buf is passed. 
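A hedged caller sketch for the vnode_hasmultipath() KPI declared above; per its discussion an iocount must be held across the call, so this example (function name invented) takes and drops one explicitly:

static bool
example_vnode_has_multiple_paths(vnode_t vp)
{
	bool multi = false;

	if (vnode_getwithref(vp) == 0) {        /* take the required iocount */
		multi = (vnode_hasmultipath(vp) != 0);
		vnode_put(vp);                  /* release it after reading the answer */
	}
	return multi;
}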
In each of the flag values, a_verifyblocksize must be returned if it is set + * For all flag values, the operation to be performed is specified by the value of the flag and the corresponding + * arguments that the operation requires will be set. + * VNODE_VERIFY_CONTEXT_ALLOC : f_offset, bufsize and verify_ctxp. + * VNODE_VERIFY_WITH_CONTEXT : f_offset, buf, bufsize, verify_ctxp + * VNODE_VERIFY_PRECOMPUTED : f_offset, buf, bufsize + * VNODE_VERIFY_CONTEXT_FREE verify_ctxp + * * @param ctx Context to authenticate for verify request; currently often set to NULL. - * @return 0 for success, else an error code. + * @param verifykind Additional information on kind of data to be verified. for example if a specific type of hash function is required. + * Only types defined for vnode_verify_kind_t are supported. + * @return 0 for success, else an error code. For VNODE_VERIFY_PRECOMPUTED, an error return of EAGAIN indicates + * that the Filesystem would like to fallback to VNODE_VERIFY_WITH_CONTEXT. + * */ #ifdef XNU_KERNEL_PRIVATE -extern errno_t VNOP_VERIFY(vnode_t, off_t, uint8_t *, size_t, size_t *, void **, vnode_verify_flags_t, vfs_context_t); +extern errno_t VNOP_VERIFY(vnode_t, off_t, uint8_t *, size_t, size_t *, void **, vnode_verify_flags_t, vfs_context_t, vnode_verify_kind_t *); #endif /* XNU_KERNEL_PRIVATE */ #endif // defined(__APPLE_API_UNSTABLE) diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index 6cd5782cb..f8affa8c4 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -321,6 +321,7 @@ struct vnode { /* v_ext_flags (8 bits) */ #define VE_LINKCHANGE 0x01 #define VE_LINKCHANGEWAIT 0x02 +#define VE_NOT_HARDLINK 0x04 /* * This structure describes vnode data which is specific to a file descriptor. @@ -456,7 +457,7 @@ struct ostat; /* bdevvp moved to vnode.h as private KPI */ void cvtstat(struct stat *st, struct ostat *ost); void vprint(const char *label, struct vnode *vp); - +void vprint_path(const char *label, struct vnode *vp); __private_extern__ int set_package_extensions_table(user_addr_t data, int nentries, int maxwidth); #if CONFIG_MACF diff --git a/bsd/sys/vsock_private.h b/bsd/sys/vsock_private.h new file mode 100644 index 000000000..6609a4cc3 --- /dev/null +++ b/bsd/sys/vsock_private.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
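To make the flag contract above concrete, a hypothetical filesystem-side skeleton (every examplefs_* helper is invented, and the a_* field names follow the vnop_verify_args layout shown earlier) that honours the VNODE_VERIFY_PRECOMPUTED mode and its EAGAIN fallback:

static int
examplefs_vnop_verify(struct vnop_verify_args *ap)
{
	/* Precomputed digests handed in by the caller: either check them against
	 * what examplefs keeps on disk, or decline with EAGAIN so the caller
	 * retries with VNODE_VERIFY_WITH_CONTEXT on the mapped file data. */
	if (ap->a_flags & VNODE_VERIFY_PRECOMPUTED) {
		if (!examplefs_has_block_digests(ap->a_vp)) {
			return EAGAIN;
		}
		return examplefs_compare_block_digests(ap->a_vp, ap->a_foffset,
		           ap->a_buf, ap->a_bufsize);
	}

	/* The remaining modes (size-only query, CONTEXT_ALLOC, WITH_CONTEXT and
	 * CONTEXT_FREE) would be handled here as the documentation describes. */
	return ENOTSUP;
}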
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _VSOCK_PRIVATE_H_ +#define _VSOCK_PRIVATE_H_ + +#include +#include + +__BEGIN_DECLS + +#define VSOCK_PROTO_STANDARD 0 +#define VSOCK_PROTO_PRIVATE 1 +#define VSOCK_PROTO_MAX 2 + +__END_DECLS + +#endif /* _VSOCK_PRIVATE_H_ */ diff --git a/bsd/sys/vsock_transport.h b/bsd/sys/vsock_transport.h index c584aed4d..6601454cd 100644 --- a/bsd/sys/vsock_transport.h +++ b/bsd/sys/vsock_transport.h @@ -36,7 +36,7 @@ __BEGIN_DECLS #include #include -#include +#include #define VSOCK_MAX_PACKET_SIZE 65536 @@ -58,6 +58,7 @@ struct vsock_address { }; struct vsock_transport { + uint16_t protocol; void *provider; int (*get_cid)(void *provider, uint32_t *cid); int (*attach_socket)(void *provider); @@ -70,7 +71,7 @@ extern int vsock_add_transport(struct vsock_transport *transport); extern int vsock_remove_transport(struct vsock_transport *transport); extern int vsock_reset_transport(struct vsock_transport *transport); extern int vsock_put_message(struct vsock_address src, struct vsock_address dst, - enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m); + enum vsock_operation op, uint32_t buf_alloc, uint32_t fwd_cnt, mbuf_t m, uint16_t protocol); __END_DECLS diff --git a/bsd/sys/work_interval.h b/bsd/sys/work_interval.h index b5b72f2ab..3d5e78ccb 100644 --- a/bsd/sys/work_interval.h +++ b/bsd/sys/work_interval.h @@ -306,16 +306,15 @@ int work_interval_leave(void); #define WORK_INTERVAL_WORKLOAD_ID_HAS_ID (1u << 0) #define WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED (1u << 1) #define WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL (1u << 2) +/* Work interval is allowed to provide complexity values per frame as part of {start, update, finish} calls */ +#define WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED (1u << 3) /* Flags allowed to be passed in from userspace as part of kern_work_interval_set_workload_id() */ -#define WORK_INTERVAL_SET_WORKLOAD_ID_FLAGS_MASK (WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL | WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED) +#define WORK_INTERVAL_SET_WORKLOAD_ID_FLAGS_MASK (WORK_INTERVAL_WORKLOAD_ID_RT_CRITICAL | WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED | WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED) #ifdef XNU_KERNEL_PRIVATE - /* Marker that workinterval was joined before workload ID was set */ #define WORK_INTERVAL_WORKLOAD_ID_ALREADY_JOINED (1u << 31) -/* Work interval is allowed to provide complexity values per frame as part of {start, update, finish} calls */ -#define WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED (1u << 30) #endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/sys/xattr.h b/bsd/sys/xattr.h index ca64d1407..9fb79d256 100644 --- a/bsd/sys/xattr.h +++ b/bsd/sys/xattr.h @@ -48,7 +48,8 @@ #define XATTR_SHOWCOMPRESSION 0x0020 /* Options for pathname based xattr calls */ -#define XATTR_NOFOLLOW_ANY 0x0040 /* Don't follow any symbolic links in the path */ +#define XATTR_NOFOLLOW_ANY 0x0040 /* Don't follow any symbolic links in the path */ +#define XATTR_RESOLVE_BENEATH 0x0080 /* path must reside in the hierarchy beneath the starting directory */ #define XATTR_MAXNAMELEN 127 diff --git a/bsd/tests/bsd_tests.c b/bsd/tests/bsd_tests.c index b0ca129f9..fd33ac15a 100644 --- a/bsd/tests/bsd_tests.c +++ b/bsd/tests/bsd_tests.c @@ -52,7 +52,7 @@ extern kern_return_t arm_cpu_capabilities_legacy_test(void); extern kern_return_t pmap_test(void); #endif /* defined(__arm64__) */ kern_return_t ipi_test(void); -#if 
defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) extern kern_return_t ctrr_test(void); #endif #if __ARM_PAN_AVAILABLE__ @@ -75,7 +75,7 @@ struct xnupost_test bsd_post_tests[] = { * This will be addressed in a future change. */ #else -#if defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) XNUPOST_TEST_CONFIG_BASIC(ctrr_test), #endif #endif diff --git a/bsd/tests/ctrr_test_sysctl.c b/bsd/tests/ctrr_test_sysctl.c index ffa15504f..d8e24a95a 100644 --- a/bsd/tests/ctrr_test_sysctl.c +++ b/bsd/tests/ctrr_test_sysctl.c @@ -26,9 +26,11 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include +#include -#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) +#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) extern kern_return_t ctrr_test(void); static int @@ -40,10 +42,14 @@ sysctl_run_ctrr_test(__unused struct sysctl_oid *oidp, __unused void *arg1, __un if (error || !changed) { return error; } - return ctrr_test(); + kern_return_t kr = ctrr_test(); + if (kr != KERN_SUCCESS || T_TESTRESULT != T_STATE_PASS) { + return EDEVERR; + } + return 0; } SYSCTL_PROC(_kern, OID_AUTO, run_ctrr_test, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_run_ctrr_test, "I", ""); -#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */ +#endif /* (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) */ diff --git a/bsd/tests/pmap_test_sysctl.c b/bsd/tests/pmap_test_sysctl.c index 1de95d1b6..8d4491160 100644 --- a/bsd/tests/pmap_test_sysctl.c +++ b/bsd/tests/pmap_test_sysctl.c @@ -39,6 +39,7 @@ extern uint64_t test_pmap_page_protect_overhead(unsigned int, unsigned int); #if CONFIG_SPTM extern kern_return_t test_pmap_huge_pv_list(unsigned int, unsigned int); extern kern_return_t test_pmap_reentrance(unsigned int); +extern kern_return_t test_surt(unsigned int); #endif static int @@ -178,7 +179,6 @@ SYSCTL_PROC(_kern, OID_AUTO, pmap_page_protect_overhead_test, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_test_pmap_page_protect_overhead, "-", ""); #if CONFIG_SPTM - static int sysctl_test_pmap_huge_pv_list(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -213,4 +213,33 @@ SYSCTL_PROC(_kern, OID_AUTO, pmap_reentrance_test, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_test_pmap_reentrance, "I", ""); -#endif +#if __ARM64_PMAP_SUBPAGE_L1__ +extern unsigned int surt_list_len(void); +static int +sysctl_surt_list_len(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + unsigned int len = surt_list_len(); + return SYSCTL_OUT(req, &len, sizeof(len)); +} + +SYSCTL_PROC(_kern, OID_AUTO, surt_list_len, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_surt_list_len, "I", ""); + +static int +sysctl_test_surt(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + unsigned int num_surts; + int error, changed; + error = sysctl_io_number(req, 0, sizeof(num_surts), &num_surts, &changed); + if (error || !changed) { + return error; + } + return test_surt(num_surts); +} + +SYSCTL_PROC(_kern, OID_AUTO, surt_test, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_test_surt, "I", ""); +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ +#endif /* CONFIG_SPTM */ diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index 
35659d338..0c5551176 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -106,6 +106,7 @@ #include #include #include +#include #include #include @@ -1369,7 +1370,6 @@ vfs_context_can_break_leases(vfs_context_t ctx) bool vfs_context_allow_fs_blksize_nocache_write(vfs_context_t ctx) { - uthread_t uth; thread_t t; proc_t p; @@ -1377,11 +1377,6 @@ vfs_context_allow_fs_blksize_nocache_write(vfs_context_t ctx) return false; } - uth = get_bsdthread_info(t); - if (uth && (uth->uu_flag & UT_FS_BLKSIZE_NOCACHE_WRITES)) { - return true; - } - p = (proc_t)get_bsdthreadtask_info(t); if (p && (os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_NOCACHE_WRITE_FS_BLKSIZE)) { return true; @@ -1417,6 +1412,30 @@ vfs_context_skip_mtime_update(vfs_context_t ctx) return false; } +boolean_t +vfs_context_allow_entitled_reserve_access(vfs_context_t ctx) +{ + thread_t t; + uthread_t uth; + proc_t p; + + if ((ctx == NULL) || (t = VFS_CONTEXT_GET_THREAD(ctx)) == NULL) { + return false; + } + + uth = get_bsdthread_info(t); + if (uth && (os_atomic_load(&uth->uu_flag, relaxed) & UT_FS_ENTITLED_RESERVE_ACCESS)) { + return true; + } + + p = (proc_t)get_bsdthreadtask_info(t); + if (p && (os_atomic_load(&p->p_vfs_iopolicy, relaxed) & P_VFS_IOPOLICY_ENTITLED_RESERVE_ACCESS)) { + return true; + } + + return false; +} + /* * vfs_context_proc * @@ -1855,10 +1874,9 @@ boolean_t vnode_isonssd(vnode_t vp) { if (vp) { - if (vp->v_mount) { - if (vp->v_mount->mnt_kern_flag & MNTK_SSD) { - return TRUE; - } + mount_t mp = vp->v_mount; + if (mp && disk_conditioner_mount_is_ssd(mp)) { + return TRUE; } } return FALSE; @@ -6207,17 +6225,23 @@ struct vnop_verify_args { void **a_verify_ctxp; int a_flags; vfs_context_t a_context; + vnode_verifY_kind_t *a_verifykind; }; #endif errno_t VNOP_VERIFY(struct vnode *vp, off_t foffset, uint8_t *buf, size_t bufsize, size_t *verify_block_size, void **verify_ctxp, vnode_verify_flags_t flags, - vfs_context_t ctx) + vfs_context_t ctx, vnode_verify_kind_t *verify_kind) { int _err; struct vnop_verify_args a; + assert(!(flags & VNODE_VERIFY_CONTEXT_ALLOC) || ((foffset >= 0) && bufsize)); + assert(!(flags & (VNODE_VERIFY_CONTEXT_FREE | VNODE_VERIFY_WITH_CONTEXT)) || verify_ctxp); + assert(!(flags & (VNODE_VERIFY_PRECOMPUTED | VNODE_VERIFY_WITH_CONTEXT)) || + ((foffset >= 0) && buf && bufsize)); + if (ctx == NULL) { ctx = vfs_context_kernel(); } @@ -6230,6 +6254,10 @@ VNOP_VERIFY(struct vnode *vp, off_t foffset, uint8_t *buf, size_t bufsize, a.a_flags = flags; a.a_verify_ctxp = verify_ctxp; a.a_context = ctx; + if (verify_kind != NULL) { + *verify_kind = VK_HASH_NONE; + } + a.a_verifykind = verify_kind; _err = (*vp->v_op[vnop_verify_desc.vdesc_offset])(&a); DTRACE_FSINFO(verify, vnode_t, vp); diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c index ab25b4464..24971cad1 100644 --- a/bsd/vfs/vfs_attrlist.c +++ b/bsd/vfs/vfs_attrlist.c @@ -92,7 +92,7 @@ struct _attrlist_buf { static int -attrlist_build_path(vnode_t vp, char **outbuf, int *outbuflen, int *outpathlen, int flags) +attrlist_build_path(vnode_t vp, char **outbuf, int *outbuflen, int *outpathlen, char *prefix, int prefix_len, int flags) { proc_t p = vfs_context_proc(vfs_context_current()); int retlen = 0; @@ -114,8 +114,14 @@ attrlist_build_path(vnode_t vp, char **outbuf, int *outbuflen, int *outpathlen, buf = kalloc_data(buflen, Z_WAITOK | Z_ZERO | Z_NOFAIL); } + /* Add the resolve prefix if provided */ + if (prefix && prefix_len) { + assert(prefix_len + 1 <= buflen); + strlcpy(buf, prefix, prefix_len + 1); + } + /* call 
build_path making sure NOT to use the cache-only behavior */ - err = build_path(vp, buf, buflen, &retlen, flags, vfs_context_current()); + err = build_path(vp, buf + prefix_len, buflen - prefix_len, &retlen, flags, vfs_context_current()); } while (err == ENOSPC && proc_support_long_paths(p) && (buflen *= 2) && buflen <= MAXLONGPATHLEN); if (err == 0) { if (outbuf) { @@ -125,7 +131,7 @@ attrlist_build_path(vnode_t vp, char **outbuf, int *outbuflen, int *outpathlen, *outbuflen = buflen; } if (outpathlen) { - *outpathlen = retlen - 1; + *outpathlen = retlen + prefix_len - 1; } } return err; @@ -1007,9 +1013,17 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, VATTR_INIT(&va); VFSATTR_INIT(&vs); vs.f_vol_name = NULL; - mnt = vp->v_mount; attr_max_buffer = proc_support_long_paths(vfs_context_proc(ctx)) ? ATTR_MAX_BUFFER_LONGPATHS : ATTR_MAX_BUFFER; + mnt = vp->v_mount; + + /* Check for invalid or dead mounts. */ + if (!mnt || mnt == dead_mountp) { + /* This condition can only be true for fgetattrlist */ + error = EBADF; + VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: volume attributes requested on dead mount."); + goto out; + } /* Check for special packing semantics */ return_valid = (alp->commonattr & ATTR_CMN_RETURNED_ATTRS); @@ -1908,11 +1922,12 @@ attr_pack_common(vfs_context_t ctx, mount_t mp, vnode_t vp, struct attrlist *alp } } if (alp->commonattr & ATTR_CMN_FNDRINFO) { - size_t fisize = 32; + size_t fisize = lmax(lmin(32, abp->allocated - (abp->fixedcursor - abp->base)), 0); error = 0; if (vp && !is_bulk) { uio_t auio; + size_t fialloc = fisize; UIO_STACKBUF(uio_buf, 1); if ((auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, @@ -1921,10 +1936,10 @@ attr_pack_common(vfs_context_t ctx, mount_t mp, vnode_t vp, struct attrlist *alp goto out; } uio_addiov(auio, CAST_USER_ADDR_T(abp->fixedcursor), - fisize); + fialloc); /* fisize may be reset to 0 after this call */ error = vn_getxattr(vp, XATTR_FINDERINFO_NAME, auio, - &fisize, XATTR_NOSECURITY, ctx); + &fialloc, XATTR_NOSECURITY, ctx); uio_free(auio); /* @@ -1936,12 +1951,12 @@ attr_pack_common(vfs_context_t ctx, mount_t mp, vnode_t vp, struct attrlist *alp ((error == ENOATTR) || (error == ENOENT) || (error == ENOTSUP) || (error == EPERM))) { VFS_DEBUG(ctx, vp, "ATTRLIST - No system.finderinfo attribute, returning zeroes"); - bzero(abp->fixedcursor, 32); + bzero(abp->fixedcursor, fisize); error = 0; } if (error == 0) { - abp->fixedcursor += 32; + abp->fixedcursor += roundup(fisize, 4); abp->actual.commonattr |= ATTR_CMN_FNDRINFO; } else if (!return_valid) { goto out; @@ -1955,11 +1970,12 @@ attr_pack_common(vfs_context_t ctx, mount_t mp, vnode_t vp, struct attrlist *alp } } else if (VATTR_IS_SUPPORTED(vap, va_finderinfo)) { bcopy(&vap->va_finderinfo[0], abp->fixedcursor, fisize); - abp->fixedcursor += fisize; + abp->fixedcursor += roundup(fisize, 4); + abp->actual.commonattr |= ATTR_CMN_FNDRINFO; } else if (!return_valid || pack_invalid) { bzero(abp->fixedcursor, fisize); - abp->fixedcursor += fisize; + abp->fixedcursor += roundup(fisize, 4); } } if (alp->commonattr & ATTR_CMN_OWNERID) { @@ -2665,7 +2681,7 @@ struct _attrlist_paths { static errno_t calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, ssize_t *varsizep, struct _attrlist_paths *pathsp, const char **vnamep, - const char **cnpp, ssize_t *cnlp) + const char **cnpp, ssize_t *cnlp, char *pathbuf) { int error = 0; @@ -2716,7 +2732,19 @@ calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, if (vp && (alp->commonattr & 
ATTR_CMN_FULLPATH)) { int pathlen; int buflen; - int err = attrlist_build_path(vp, &(pathsp->fullpathptr), &buflen, &pathlen, 0); + int err; + uint32_t resolve_flags = 0; + size_t perfix_len = 0; + + if (pathbuf) { + err = lookup_check_for_resolve_prefix(pathbuf, PATHBUFLEN, PATHBUFLEN, &resolve_flags, &perfix_len); + if (err) { + error = err; + goto out; + } + } + + err = attrlist_build_path(vp, &(pathsp->fullpathptr), &buflen, &pathlen, pathbuf, (int)perfix_len, 0); if (err) { error = err; goto out; @@ -2733,7 +2761,7 @@ calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, if (vp && (alp->forkattr & ATTR_CMNEXT_RELPATH)) { int pathlen; int buflen; - int err = attrlist_build_path(vp, &(pathsp->relpathptr), &buflen, &pathlen, BUILDPATH_VOLUME_RELATIVE); + int err = attrlist_build_path(vp, &(pathsp->relpathptr), &buflen, &pathlen, NULL, 0, BUILDPATH_VOLUME_RELATIVE); if (err) { error = err; goto out; @@ -2750,7 +2778,7 @@ calc_varsize(vnode_t vp, struct attrlist *alp, struct vnode_attr *vap, if (vp && (alp->forkattr & ATTR_CMNEXT_NOFIRMLINKPATH)) { int pathlen; int buflen; - int err = attrlist_build_path(vp, &(pathsp->REALpathptr), &buflen, &pathlen, BUILDPATH_NO_FIRMLINK); + int err = attrlist_build_path(vp, &(pathsp->REALpathptr), &buflen, &pathlen, NULL, 0, BUILDPATH_NO_FIRMLINK); if (err) { error = err; goto out; @@ -2788,7 +2816,8 @@ out: static errno_t vfs_attr_pack_internal(mount_t mp, vnode_t vp, uio_t auio, struct attrlist *alp, uint64_t options, struct vnode_attr *vap, __unused void *fndesc, - vfs_context_t ctx, int is_bulk, enum vtype vtype, ssize_t fixedsize) + vfs_context_t ctx, int is_bulk, enum vtype vtype, ssize_t fixedsize, + char *pathbuf) { struct _attrlist_buf ab; struct _attrlist_paths apaths = {.fullpathptr = NULL, .fullpathlen = 0, .fullpathbuflen = 0, @@ -2885,7 +2914,7 @@ vfs_attr_pack_internal(mount_t mp, vnode_t vp, uio_t auio, struct attrlist *alp, /* * Compute variable-space requirements. */ - error = calc_varsize(vp, alp, vap, &varsize, &apaths, &vname, &cnp, &cnl); + error = calc_varsize(vp, alp, vap, &varsize, &apaths, &vname, &cnp, &cnl, pathbuf); if (error) { goto out; } @@ -3168,7 +3197,7 @@ vfs_attr_pack_ext(mount_t mp, vnode_t vp, uio_t uio, struct attrlist *alp, uint6 error = vfs_attr_pack_internal(mp, vp, uio, alp, options | FSOPT_REPORT_FULLSIZE, vap, NULL, ctx, 1, v_type, - fixedsize); + fixedsize, NULL); if (mp) { vap->va_uid = ouid; @@ -3188,6 +3217,26 @@ vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, return vfs_attr_pack_ext(NULL, vp, uio, alp, options, vap, fndesc, ctx); } +/* + * Attributes used by the non-blocking version of {,f}statfs_ext(), + * which can be satisfied without calling into the file system back + * end. + */ +#define FAST_STATFS_CMN_ATTRS \ + (ATTR_CMN_RETURNED_ATTRS | \ + ATTR_CMN_FSID /* f_fsid */ ) + +#define FAST_STATFS_VOL_ATTRS \ + (ATTR_VOL_INFO | \ + ATTR_VOL_FSTYPE /* f_type */ | \ + ATTR_VOL_MOUNTPOINT /* f_mntonname */ | \ + ATTR_VOL_MOUNTFLAGS /* f_flags */ | \ + ATTR_VOL_MOUNTEDDEVICE /* f_mntfromname */ | \ + ATTR_VOL_FSTYPENAME /* f_fstypename */ | \ + ATTR_VOL_FSSUBTYPE /* f_fssubtype */ | \ + ATTR_VOL_MOUNTEXTFLAGS /* f_flags_ext */ | \ + ATTR_VOL_OWNER /* f_owner */ ) + /* * Obtain attribute information about a filesystem object. 
* @@ -3200,7 +3249,8 @@ vfs_attr_pack(vnode_t vp, uio_t uio, struct attrlist *alp, uint64_t options, static int getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, user_addr_t attributeBuffer, size_t bufferSize, uint64_t options, - enum uio_seg segflg, char* authoritative_name, struct ucred *file_cred) + enum uio_seg segflg, char* authoritative_name, struct ucred *file_cred, + char *pathbuf) { struct vnode_attr *va; kauth_action_t action; @@ -3216,6 +3266,15 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, // must be true for fork attributes to be used as new common attributes const int use_fork = (options & FSOPT_ATTR_CMN_EXTENDED) != 0; + /* + * Check to see if this is a fast-statfs operation. + */ + const int is_fast_statfs = + (alp->volattr != 0 && alp->fileattr == 0 && + alp->dirattr == 0 && alp->forkattr == 0 && + (alp->volattr & ~FAST_STATFS_VOL_ATTRS) == 0 && + (alp->commonattr & ~FAST_STATFS_CMN_ATTRS) == 0); + if (bufferSize < sizeof(uint32_t)) { return ERANGE; } @@ -3247,7 +3306,15 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, (options & FSOPT_NOFOLLOW) ? "no":"", vp->v_name); #if CONFIG_MACF - error = mac_vnode_check_getattrlist(ctx, vp, alp, options); + /* + * If we're doing a fast-statfs operation, gate it on the same + * capability as a regular statfs(). + */ + if (is_fast_statfs) { + error = mac_mount_check_stat(ctx, vp->v_mount); + } else { + error = mac_vnode_check_getattrlist(ctx, vp, alp, options); + } if (error) { goto out; } @@ -3400,7 +3467,7 @@ getattrlist_internal(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, } error = vfs_attr_pack_internal(vp->v_mount, vp, auio, alp, options, va, NULL, ctx, - 0, vtype, fixedsize); + 0, vtype, fixedsize, pathbuf); out: if (va_name) { @@ -3451,7 +3518,7 @@ fgetattrlist(proc_t p, struct fgetattrlist_args *uap, __unused int32_t *retval) uap->bufferSize, uap->options, (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : \ UIO_USERSPACE32), NULL, - fp->fp_glob->fg_cred); + fp->fp_glob->fg_cred, NULL); out_vnode_put: vnode_put(vp); @@ -3485,6 +3552,9 @@ getattrlistat_internal(vfs_context_t ctx, user_addr_t path, if (options & FSOPT_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (options & FSOPT_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } error = nameiat(&nd, fd); @@ -3495,7 +3565,7 @@ getattrlistat_internal(vfs_context_t ctx, user_addr_t path, vp = nd.ni_vp; error = getattrlist_internal(ctx, vp, alp, attributeBuffer, - bufferSize, options, segflg, NULL, NOCRED); + bufferSize, options, segflg, NULL, NOCRED, nd.ni_pathbuf); /* Retain the namei reference until the getattrlist completes. 
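For illustration of what the fast path above serves, a hypothetical userspace request that stays within FAST_STATFS_CMN_ATTRS and FAST_STATFS_VOL_ATTRS, and which this change therefore gates with mac_mount_check_stat() just like a plain statfs():

#include <sys/attr.h>
#include <unistd.h>

/* Illustrative only: every requested bit is inside the fast-statfs masks. */
static int
example_fast_statfs(const char *path, void *buf, size_t bufsize)
{
	struct attrlist al = {
		.bitmapcount = ATTR_BIT_MAP_COUNT,
		.commonattr  = ATTR_CMN_RETURNED_ATTRS | ATTR_CMN_FSID,
		.volattr     = ATTR_VOL_INFO | ATTR_VOL_FSTYPE | ATTR_VOL_MOUNTPOINT |
		               ATTR_VOL_MOUNTFLAGS | ATTR_VOL_MOUNTEDDEVICE,
	};

	return getattrlist(path, &al, buf, bufsize, 0);
}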
*/ nameidone(&nd); @@ -4012,7 +4082,7 @@ readdirattr(vnode_t dvp, struct fd_vn_data *fvd, uio_t auio, CAST_USER_ADDR_T(kern_attr_buf), kern_attr_buf_siz, options | FSOPT_REPORT_FULLSIZE, UIO_SYSSPACE, CAST_DOWN_EXPLICIT(char *, name_buffer), - NOCRED); + NOCRED, NULL); nameidone(&nd); @@ -4633,6 +4703,11 @@ setattrlist_internal(vnode_t vp, struct setattrlist_args *uap, proc_t p, vfs_con if (al.commonattr & ATTR_CMN_DATA_PROTECT_FLAGS) { ATTR_UNPACK(va.va_dataprotect_class); VATTR_SET_ACTIVE(&va, va_dataprotect_class); +#if CONFIG_MACF + if ((error = mac_vnode_check_dataprotect_set(ctx, vp, &va.va_dataprotect_class))) { + goto out; + } +#endif } /* volume */ @@ -4811,6 +4886,9 @@ setattrlist(proc_t p, struct setattrlist_args *uap, __unused int32_t *retval) if (uap->options & FSOPT_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (uap->options & FSOPT_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = namei(&nd)) != 0) { goto out; } @@ -4856,6 +4934,9 @@ setattrlistat(proc_t p, struct setattrlistat_args *uap, __unused int32_t *retval if (uap->options & FSOPT_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (uap->options & FSOPT_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = nameiat(&nd, uap->fd)) != 0) { goto out; } diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 3b495c3b6..06c0c20ef 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -183,6 +183,22 @@ typedef struct { fs_buffer_cache_gc_callout_t fs_callouts[FS_BUFFER_CACHE_GC_CALLOUTS_MAX_SIZE] = { {NULL, NULL} }; +static const uint32_t num_bytes_for_verify_kind[NUM_VERIFY_KIND] = { + [VK_HASH_NONE] = 0, + [VK_HASH_SHA3_256] = 32, + [VK_HASH_SHA3_384] = 48, + [VK_HASH_SHA3_512] = 64, +}; + +uint32_t +get_num_bytes_for_verify_kind(vnode_verify_kind_t verify_kind) +{ + if (verify_kind < NUM_VERIFY_KIND) { + return num_bytes_for_verify_kind[verify_kind]; + } + return 0; +} + static __inline__ int buf_timestamp(void) { @@ -627,6 +643,153 @@ bufattr_willverify(bufattr_t bap) return 0; } +vnode_verify_kind_t +bufattr_verifykind(bufattr_t bap) +{ + return bap->ba_verify_type; +} + +void +bufattr_setverifyvalid(bufattr_t bap) +{ + assert(bap->ba_verify_type); + bap->ba_flags |= BA_VERIFY_VALID; +} + +uint8_t * +buf_verifyptr_with_size(buf_t bp, int verify_size, uint32_t *len) +{ + upl_t upl; + vnode_t vp; + mount_t mp; + uint32_t num_bytes; + uint8_t *buf; + uint32_t size; + + if (!len) { + return NULL; + } + + *len = 0; + if (!(os_atomic_load(&bp->b_attr.ba_verify_type, relaxed))) { + return NULL; + } + + vp = bp->b_vp; + if (vp) { + mp = vp->v_mount; + } else { + mp = NULL; + } + + num_bytes = get_num_bytes_for_verify_kind(bp->b_attr.ba_verify_type); + + if (!(bp->b_flags & B_CLUSTER)) { + if (bp->b_attr.ba_un.verify_ptr && bp->b_bcount && vp) { + if (vnode_isspec(bp->b_vp)) { + *len = (bp->b_bcount / vp->v_specsize) * num_bytes; + } else if (mp && mp->mnt_devblocksize) { + *len = (bp->b_bcount / mp->mnt_devblocksize) * num_bytes; + } else { + return NULL; + } + return bp->b_attr.ba_un.verify_ptr; + } + return NULL; + } + + if (!(bp->b_attr.ba_flags & BA_WILL_VERIFY)) { + return NULL; + } + + upl = bp->b_upl; + if (!(upl && vp && mp && mp->mnt_devblocksize)) { + return NULL; + } + + buf = upl_fs_verify_buf(upl, &size); + if (!(buf && size && len && num_bytes)) { + return NULL; + } + + if (!verify_size) { + verify_size = bp->b_bcount; + } + *len = (verify_size / mp->mnt_devblocksize) * num_bytes; + assert(*len <= size); + + if (bp->b_uploffset == 0) { + return buf; 
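As an aside on the sizing arithmetic in buf_verifyptr_with_size() above (buf_verify_enable() further down uses the same formula), a small worked example assuming 4 KiB device blocks:

/* Illustrative numbers only: a 128 KiB buffer (b_bcount = 131072) on a device
 * with mnt_devblocksize = 4096, using VK_HASH_SHA3_256 (32 bytes per block),
 * needs (131072 / 4096) * 32 = 1024 bytes of verification data. */
static uint32_t
example_verify_data_len(uint32_t bcount, uint32_t devblocksize,
    vnode_verify_kind_t kind)
{
	return (bcount / devblocksize) * get_num_bytes_for_verify_kind(kind);
}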
+ } else { + uint32_t start = (bp->b_uploffset / mp->mnt_devblocksize) * num_bytes; + + assert((start + *len) <= size); + return buf + start; + } +} + +uint8_t * +buf_verifyptr(buf_t bp, uint32_t *len) +{ + return buf_verifyptr_with_size(bp, 0, len); +} + +uint8_t * +bufattr_verifyptr(bufattr_t bap, uint32_t *len) +{ + return buf_verifyptr_with_size(__container_of(bap, struct buf, b_attr), 0, len); +} + +errno_t +buf_verify_enable(buf_t bp, vnode_verify_kind_t verify_type) +{ + uint32_t num_bytes; + + if ((bp->b_flags & B_CLUSTER) || !(bp->b_bcount)) { + return EINVAL; + } + + if (vnode_isspec(bp->b_vp)) { + num_bytes = (bp->b_bcount / bp->b_vp->v_specsize) * get_num_bytes_for_verify_kind(verify_type); + } else if (bp->b_vp->v_mount && bp->b_vp->v_mount->mnt_devblocksize) { + num_bytes = (bp->b_bcount / bp->b_vp->v_mount->mnt_devblocksize) * get_num_bytes_for_verify_kind(verify_type); + } else { + return EINVAL; + } + + uint8_t *verify_ptr = kalloc_data(num_bytes, Z_WAITOK | Z_ZERO | Z_NOFAIL); + if (os_atomic_cmpxchg(&bp->b_attr.ba_verify_type, 0, verify_type, acq_rel)) { + assert(bp->b_attr.ba_un.verify_ptr == NULL); + bp->b_attr.ba_un.verify_ptr = verify_ptr; + } else { + kfree_data(verify_ptr, num_bytes); + } + + return 0; +} + +void +buf_verify_free(buf_t bp) +{ + if ((bp->b_flags & B_CLUSTER) || !(bp->b_bcount)) { + return; + } + + if (os_atomic_load(&bp->b_attr.ba_verify_type, relaxed)) { + uint32_t num_bytes; + + if (vnode_isspec(bp->b_vp)) { + num_bytes = (bp->b_bcount / bp->b_vp->v_specsize) * get_num_bytes_for_verify_kind(bp->b_attr.ba_verify_type); + } else if (bp->b_vp->v_mount && bp->b_vp->v_mount->mnt_devblocksize) { + num_bytes = (bp->b_bcount / bp->b_vp->v_mount->mnt_devblocksize) * get_num_bytes_for_verify_kind(bp->b_attr.ba_verify_type); + } else { + return; + } + kfree_data(bp->b_attr.ba_un.verify_ptr, num_bytes); + os_atomic_store(&bp->b_attr.ba_verify_type, 0, release); + } +} + errno_t buf_error(buf_t bp) { @@ -2846,6 +3009,8 @@ buf_brelse(buf_t bp) } } + buf_verify_free(bp); + /* * If it's locked, don't report an error; try again later. */ @@ -4359,10 +4524,12 @@ biodone_done: vm_offset_t buf_kernel_addrperm_addr(void * addr) { + addr = (void *) VM_KERNEL_STRIP_PTR(addr); + if ((vm_offset_t)addr == 0) { return 0; } else { - return (vm_offset_t)addr + buf_kernel_addrperm; + return ML_ADDRPERM((vm_offset_t)addr, buf_kernel_addrperm); } } diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index c20902a9f..d92702773 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -1789,7 +1789,6 @@ retry: NAME_CACHE_LOCK_SHARED(); locked = true; } - ndp->ni_flag &= ~(NAMEI_TRAILINGSLASH); dmp = dp->v_mount; vid = dp->v_id; @@ -1851,6 +1850,15 @@ retry: if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') { cnp->cn_flags |= ISDOTDOT; + + /* if dp is the starting directory and RESOLVE_BENEATH, we should break */ + if ((ndp->ni_flag & NAMEI_RESOLVE_BENEATH) && (dp == ndp->ni_usedvp)) { + break; + } + /* Break if '..' path traversal is prohibited */ + if (ndp->ni_flag & NAMEI_NODOTDOT) { + break; + } } #if NAMEDRSRCFORK @@ -1862,6 +1870,11 @@ retry: if ((ndp->ni_pathlen == sizeof(_PATH_RSRCFORKSPEC)) && (cp[1] == '.' && cp[2] == '.') && bcmp(cp, _PATH_RSRCFORKSPEC, sizeof(_PATH_RSRCFORKSPEC)) == 0) { + /* Break if path lookup on named streams is prohibited. */ + if (ndp->ni_flag & NAMEI_NOXATTRS) { + break; + } + /* Skip volfs file systems that don't support native streams. 
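Tying the per-buffer verification KPI above together, a hypothetical producer (function name invented) that reserves digest space with buf_verify_enable(), fills it through buf_verifyptr(), and publishes it with bufattr_setverifyvalid():

static void
example_attach_block_digests(buf_t bp)
{
	uint32_t len = 0;
	uint8_t *digests;

	/* Only the first successful caller attaches storage; a losing racer's
	 * allocation is freed and the existing pointer is kept. */
	if (buf_verify_enable(bp, VK_HASH_SHA3_256) != 0) {
		return;
	}
	digests = buf_verifyptr(bp, &len);
	if (digests != NULL && len != 0) {
		/* ... fill digests[0 .. len) with one 32-byte hash per device
		 * block, then mark the data usable for verification ... */
		bufattr_setverifyvalid(buf_attr(bp));
	}
}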
*/ if ((dmp != NULL) && (dmp->mnt_flag & MNT_DOVOLFS) && @@ -1985,13 +1998,12 @@ skiprsrcfork: * for them before checking the cache. */ if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { + if ((cnp->cn_flags & ISLASTCN) && !vnode_isdir(dp)) { + break; + } vp = dp; vvid = vid; } else if ((cnp->cn_flags & ISDOTDOT)) { - /* if dp is the starting directory and RESOLVE_BENEATH, we should break */ - if ((ndp->ni_flag & NAMEI_RESOLVE_BENEATH) && (dp == ndp->ni_usedvp)) { - break; - } /* * If this is a chrooted process, we need to check if * the process is trying to break out of its chrooted @@ -2168,6 +2180,24 @@ skiprsrcfork: } #endif /* CONFIG_TRIGGERS */ + if ((ndp->ni_flag & NAMEI_LOCAL) && !(vp->v_mount->mnt_flag & MNT_LOCAL)) { + /* Prevent a path lookup from ever crossing into a network filesystem */ + vp = NULL; + break; + } + + if ((ndp->ni_flag & NAMEI_NODEVFS) && (vnode_tag(vp) == VT_DEVFS)) { + /* Prevent a path lookup into `devfs` filesystem */ + vp = NULL; + break; + } + + if ((ndp->ni_flag & NAMEI_IMMOVABLE) && (vp->v_mount->mnt_flag & MNT_REMOVABLE) && !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) { + /* prevent a path lookup into a removable filesystem */ + vp = NULL; + break; + } + if (!(locked || vid_is_same(vp, vvid))) { vp = NULL; break; diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 2c6146719..806e747ac 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -197,6 +197,9 @@ static void cluster_iostate_wait(struct clios *iostate, u_int target, const char static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags); +static int cluster_handle_split_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, + u_int io_size, int rounded_size, int local_flags, int (*callback)(buf_t, void *), void *callback_arg); + static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference); static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference); @@ -320,6 +323,18 @@ uint32_t throttle_max_iosize = (128 * 1024); SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, ""); +uint32_t split_pgin = 1; +uint32_t split_all_pgin = 1; +uint32_t split_all_pgin_equal = 0; +uint32_t split_pgin_headio = 0; + +SYSCTL_INT(_kern, OID_AUTO, split_pagein_io, CTLFLAG_RW | CTLFLAG_LOCKED, &split_pgin, 0, ""); +#if DEVELOPMENT || DEBUG +SYSCTL_INT(_kern, OID_AUTO, split_pagein_io_all, CTLFLAG_RW | CTLFLAG_LOCKED, &split_all_pgin, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, split_pagein_io_equal, CTLFLAG_RW | CTLFLAG_LOCKED, &split_all_pgin_equal, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, split_pagein_do_headio, CTLFLAG_RW | CTLFLAG_LOCKED, &split_pgin_headio, 0, ""); +#endif + struct verify_buf { TAILQ_ENTRY(verify_buf) vb_entry; buf_t vb_cbp; @@ -342,7 +357,7 @@ static struct verify_buf verify_bufs[MAX_VERIFY_THREADS * MAX_REQUESTS_PER_THREA */ static int verify_in_flight = 0; -#if defined(XNU_TARGET_OS_IOS) +#if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_XR) #define NUM_DEFAULT_THREADS 2 #elif defined(XNU_TARGET_OS_OSX) #define NUM_DEFAULT_THREADS 4 @@ -877,6 +892,96 @@ enqueue_buf_for_verify(buf_t cbp, void *callback_arg) } } +static int +cluster_handle_verification(buf_t cbp_head, vnode_t vp, upl_t upl, int upl_offset, int transaction_size, int error) +{ + off_t start_off = cbp_head->b_clfoffset; + void *verify_ctx = cbp_head->b_attr.ba_un.verify_ctx; + caddr_t 
verify_buf = NULL; + uint32_t verify_length = transaction_size; + vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE; + int verify_error = EAGAIN; + + assert(cbp_head->b_attr.ba_flags & BA_WILL_VERIFY); + + cbp_head->b_attr.ba_un.verify_ctx = NULL; + if (error) { + goto free_context; + } + + /* + * If we don't have a precomputed hash, we make a single call to both + * verify and free the context. If we have a precomputed hash, then we + * make two separate calls - one to verify the hash and the second one to + * free. If the filesystem returns EAGAIN we fall back to the non + * precomputed hash case. + */ + if (cbp_head->b_attr.ba_verify_type && cbp_head->b_attr.ba_flags & BA_VERIFY_VALID) { + verify_buf = (caddr_t)buf_verifyptr_with_size(cbp_head, transaction_size, &verify_length); + verify_flags = VNODE_VERIFY_WITH_CONTEXT | VNODE_VERIFY_PRECOMPUTED; + + if (verify_buf && verify_length) { + verify_error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, + NULL, &verify_ctx, verify_flags, NULL, NULL); + } else { + verify_error = EAGAIN; + } + + verify_buf = NULL; + verify_length = transaction_size; + verify_flags = VNODE_VERIFY_CONTEXT_FREE; + } + + if (verify_error != EAGAIN) { + error = verify_error; + } else { + vm_offset_t vaddr; + + /* + * Map it in. + * + * ubc_upl_map_range unfortunately cannot handle concurrent map + * requests for the same UPL and returns failures when it can't + * map. The map exclusive mechanism enforces mutual exclusion + * for concurrent requests. + */ + verify_error = 0; + os_atomic_inc(&verify_in_flight, relaxed); + upl_set_map_exclusive(upl); + error = ubc_upl_map_range(upl, upl_offset, round_page(transaction_size), VM_PROT_DEFAULT, &vaddr); + if (error) { + upl_clear_map_exclusive(upl); + printf("ubc_upl_map_range returned error %d upl = %p, upl_offset = %d, size = %d", + error, upl, (int)upl_offset, (int)round_page(transaction_size)); + error = EIO; + if (os_atomic_dec_orig(&verify_in_flight, relaxed) == 0) { + panic("verify_in_flight underflow"); + } + } else { + verify_buf = (caddr_t)vaddr; + verify_flags |= VNODE_VERIFY_WITH_CONTEXT; + } + } + +free_context: + verify_error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, + NULL, &verify_ctx, verify_flags, NULL, NULL); + if (!error) { + error = verify_error; + } + + if (verify_buf) { + (void)ubc_upl_unmap_range(upl, upl_offset, round_page(transaction_size)); + upl_clear_map_exclusive(upl); + verify_buf = NULL; + if (os_atomic_dec_orig(&verify_in_flight, relaxed) == 0) { + panic("verify_in_flight underflow"); + } + } + + return error; +} + static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp) { @@ -936,7 +1041,7 @@ cluster_iodone(buf_t bp, void *callback_arg) async = cluster_verify_threads && (os_atomic_load(&cbp_head->b_attr.ba_flags, acquire) & BA_ASYNC_VERIFY); - assert(!async || cbp_head->b_attr.ba_verify_ctx); + assert(!async || cbp_head->b_attr.ba_un.verify_ctx); if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) { lck_mtx_lock_spin(&cl_transaction_mtxp); @@ -1016,7 +1121,6 @@ cluster_iodone_finish(buf_t cbp_head, void *callback_arg) buf_t real_bp; vnode_t vp; struct clios *iostate; - void *verify_ctx; error = 0; total_size = 0; @@ -1083,54 +1187,8 @@ cluster_iodone_finish(buf_t cbp_head, void *callback_arg) cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp); } - verify_ctx = cbp_head->b_attr.ba_verify_ctx; - cbp_head->b_attr.ba_verify_ctx = NULL; - if (verify_ctx) 
{ - vnode_verify_flags_t verify_flags = VNODE_VERIFY_CONTEXT_FREE; - caddr_t verify_buf = NULL; - off_t start_off = cbp_head->b_clfoffset; - size_t verify_length = transaction_size; - vm_offset_t vaddr; - - if (!error) { - /* - * Map it in. - * - * ubc_upl_map_range unfortunately cannot handle concurrent map - * requests for the same UPL and returns failures when it can't - * map. The map exclusive mechanism enforces mutual exclusion - * for concurrent requests. - */ - os_atomic_inc(&verify_in_flight, relaxed); - upl_set_map_exclusive(upl); - error = ubc_upl_map_range(upl, upl_offset, round_page(transaction_size), VM_PROT_DEFAULT, &vaddr); - if (error) { - upl_clear_map_exclusive(upl); - printf("ubc_upl_map_range returned error %d upl = %p, upl_offset = %d, size = %d", - error, upl, (int)upl_offset, (int)round_page(transaction_size)); - error = EIO; - if (os_atomic_dec_orig(&verify_in_flight, relaxed) == 0) { - panic("verify_in_flight underflow"); - } - } else { - verify_buf = (caddr_t)vaddr; - verify_flags |= VNODE_VERIFY_WITH_CONTEXT; - } - } - - int verify_error = VNOP_VERIFY(vp, start_off, (uint8_t *)verify_buf, verify_length, 0, &verify_ctx, verify_flags, NULL); - if (!error) { - error = verify_error; - } - - if (verify_buf) { - (void)ubc_upl_unmap_range(upl, upl_offset, round_page(transaction_size)); - upl_clear_map_exclusive(upl); - verify_buf = NULL; - if (os_atomic_dec_orig(&verify_in_flight, relaxed) == 0) { - panic("verify_in_flight underflow"); - } - } + if (cbp_head->b_attr.ba_un.verify_ctx) { + error = cluster_handle_verification(cbp_head, vp, upl, upl_offset, transaction_size, error); } else if (cbp_head->b_attr.ba_flags & BA_WILL_VERIFY) { error = EBADMSG; } @@ -1303,7 +1361,7 @@ cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block } error = VNOP_VERIFY(vp, start_off, NULL, length, - &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL); + &verify_block_size, &verify_ctx, VNODE_VERIFY_CONTEXT_ALLOC, NULL, NULL); assert(!(error && verify_ctx)); @@ -1322,7 +1380,7 @@ cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block os_atomic_dec(&cluster_verify_threads, relaxed); } } - cbp_head->b_attr.ba_verify_ctx = verify_ctx; + cbp_head->b_attr.ba_un.verify_ctx = verify_ctx; /* * At least one thread is busy (at the time we * checked), so we can let it get queued for @@ -1330,12 +1388,12 @@ cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset, size_t verify_block * this wrong. 
*/ if (os_atomic_load(&verify_in_flight, relaxed)) { - /* This flag and the setting of ba_verify_ctx needs to be ordered */ + /* This flag and the setting of ba_un.verify_ctx needs to be ordered */ os_atomic_or(&cbp_head->b_attr.ba_flags, BA_ASYNC_VERIFY, release); } } } else { - cbp_head->b_attr.ba_verify_ctx = NULL; + cbp_head->b_attr.ba_un.verify_ctx = NULL; } cbp_head->b_validend = zero_offset; @@ -1471,6 +1529,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no mount_t mp; size_t verify_block_size = 0; vm_offset_t upl_end_offset; + vnode_verify_kind_t verify_kind = VK_HASH_NONE; boolean_t need_EOT = FALSE; /* @@ -1538,7 +1597,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no /* See if we can do cluster verification (pageins and aligned reads) */ if ((flags & CL_PAGEIN || cluster_verify_threads) && !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) && - (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) && + (VNOP_VERIFY(vp, f_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL, &verify_kind) == 0) && verify_block_size) { if (verify_block_size != PAGE_SIZE) { verify_block_size = 0; @@ -1549,16 +1608,22 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } /* * For reads, only allow cluster verification if f_offset - * and upl_offset are both page aligned. If they are not - * page aligned, leave it to the filesystem to do verification - * Furthermore, the size also has to be aligned to page size. - * Strictly speaking the alignments need to be for verify_block_size + * and upl_offset are both page aligned. Additionally, for direct reads, + * require that the length of the write also be page aligned. + * If they are not page aligned, leave it to the filesystem to do verification. + * Strictly speaking, the alignments need to be for verify_block_size * but since the only verify_block_size that is currently supported * is page size, we check against page alignment. */ if (verify_block_size && !(flags & CL_PAGEIN) && - ((f_offset & PAGE_MASK) || (upl_offset & PAGE_MASK) || (non_rounded_size & PAGE_MASK))) { + ((f_offset & PAGE_MASK) || (upl_offset & PAGE_MASK) || + ((flags & CL_DIRECT_IO) && (non_rounded_size & PAGE_MASK)))) { verify_block_size = 0; + verify_kind = VK_HASH_NONE; + } + if (verify_block_size && verify_kind && !upl_has_fs_verify_info(upl)) { + upl_set_fs_verify_info(upl, + (upl_adjusted_size(upl, PAGE_MASK) / mp->mnt_devblocksize) * get_num_bytes_for_verify_kind(verify_kind)); } } } else { @@ -1707,7 +1772,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no create_cached_upl: ubc_create_upl_kernel(vp, cached_upl_f_offset, cached_upl_size, &cached_upl, &cached_pl, UPL_SET_LITE | UPL_WILL_MODIFY, VM_KERN_MEMORY_FILE); - if (upl_has_wired_pages(cached_upl)) { + if (cached_upl && upl_has_wired_pages(cached_upl)) { /* * Pages in this UPL would contain stale data after our direct write * (which is intended to overwrite these pages on disk). 
The UPL is @@ -2224,6 +2289,9 @@ create_cached_upl: } if (verify_block_size) { cbp->b_attr.ba_flags |= BA_WILL_VERIFY; + if (verify_kind) { + cbp->b_attr.ba_verify_type = verify_kind; + } } cbp->b_lblkno = lblkno; @@ -2668,6 +2736,133 @@ cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); } +#define SPLIT_PAGEIN_MAX_IOSIZE 32768 + +/* + * Do a big pagein request as multiple I/Os - the first I/O will be for + * SPLIT_PAGEIN_MAX_IOSIZE (32K)sized which includes the page that the caused + * the fault and then i/o will be initiated for the remaining. + */ +static int +cluster_handle_split_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, + u_int io_size, int rounded_size, int local_flags, int (*callback)(buf_t, void *), void *callback_arg) +{ + upl_page_info_t *pl = ubc_upl_pageinfo(upl); + const off_t start_f_offset = f_offset; + const upl_offset_t start_upl_offset = upl_offset; + const int start_pg = upl_offset >> PAGE_SHIFT; + const int last_pg = ((upl_offset + rounded_size) >> PAGE_SHIFT) - 1; + u_int split_io_size = SPLIT_PAGEIN_MAX_IOSIZE; + u_int head_io_size = 0; + int retval = 0; + int error = 0; + int pg; + + assert(SPLIT_PAGEIN_MAX_IOSIZE >= (2 * PAGE_SIZE)); + + for (pg = start_pg; (pg <= last_pg) && !(upl_page_is_needed(pl, pg)); pg++) { + ; + } + + /* + * The global variables affecting behaviour + * split_all_pgin -> Split pageins even if we don't find the needed page. + * split_pgin_headio -> for a pagein in which there is a head calculated, + * do the head i/o or not. + * + * split_all_pgin_equal -> split the entire bug request into equal sized small i/os of 32K. + * + * Whichever way the i/o is split, the i/o for the needed page always happens first and then we decide + * whether we have to do i/o for the head and then if we need to issue equal sized i/o. + * + * By default we are set up to do only the i/o for the needed page, followed by a "unsplit" tail. 
+ */ + if ((pg > start_pg) && (pg <= last_pg)) { + head_io_size = ((pg - start_pg) * PAGE_SIZE); + + if (head_io_size < SPLIT_PAGEIN_MAX_IOSIZE) { + head_io_size = 0; + } else if (!split_all_pgin) { + goto out; + } else if ((rounded_size - head_io_size) <= SPLIT_PAGEIN_MAX_IOSIZE) { + head_io_size = (rounded_size - SPLIT_PAGEIN_MAX_IOSIZE); + } else { + head_io_size &= ~(SPLIT_PAGEIN_MAX_IOSIZE - 1); + } + + assertf(io_size > head_io_size, "io_size is %d, head_io_size = %d", io_size, head_io_size); + + if (head_io_size) { + upl_offset += head_io_size; + f_offset += head_io_size; + io_size -= head_io_size; + + if (!split_pgin_headio) { + if (local_flags & CL_COMMIT) { + ubc_upl_abort_range(upl, start_upl_offset, head_io_size, + UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); + } + head_io_size = 0; + } + + split_io_size = MIN(SPLIT_PAGEIN_MAX_IOSIZE, io_size); + } + + assertf(io_size >= split_io_size, "io_size is %d, split_io_size = %d", io_size, split_io_size); + } else if ((pg > last_pg) && !split_all_pgin) { + goto out; + } + + /* This is the 32K i/o for the "needed" page */ + retval = cluster_io(vp, upl, upl_offset, f_offset, split_io_size, + local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); + + io_size -= split_io_size; + + if (io_size) { + upl_offset += split_io_size; + f_offset += split_io_size; + } else if (head_io_size) { + io_size = head_io_size; + head_io_size = 0; + upl_offset = start_upl_offset; + f_offset = start_f_offset; + } + + while (io_size) { + if (split_all_pgin_equal && (io_size > SPLIT_PAGEIN_MAX_IOSIZE)) { + split_io_size = SPLIT_PAGEIN_MAX_IOSIZE; + } else { + split_io_size = io_size; + } + + assertf(io_size >= split_io_size, "io_size is %d, split_io_size = %d", io_size, split_io_size); + + /* We have to issue this i/o anyway even if we get an error from any of the previous ones */ + error = cluster_io(vp, upl, upl_offset, f_offset, split_io_size, + local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); + if (!retval) { + retval = error; + } + + io_size -= split_io_size; + + if ((io_size == 0) && head_io_size) { + io_size = head_io_size; + head_io_size = 0; + upl_offset = start_upl_offset; + f_offset = start_f_offset; + } else if (io_size) { + upl_offset += split_io_size; + f_offset += split_io_size; + } + } + + return retval; +out: + return cluster_io(vp, upl, upl_offset, f_offset, io_size, + local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); +} int cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, @@ -2733,6 +2928,11 @@ cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offse size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); } + if ((io_size > SPLIT_PAGEIN_MAX_IOSIZE) && vnode_isonssd(vp) && split_pgin) { + return cluster_handle_split_pagein(vp, upl, upl_offset, f_offset, io_size, + rounded_size, local_flags, callback, callback_arg); + } + retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); @@ -4826,7 +5026,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file * is performed in the File system. 
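A worked walk-through of cluster_handle_split_pagein() above under the default tunables (split_pgin = 1, split_all_pgin = 1, split_all_pgin_equal = 0, split_pgin_headio = 0); the request size and faulting page are chosen only for illustration:

/*
 * Example: a 256 KiB pagein whose needed (faulting) page sits 160 KiB into
 * the UPL.
 *   - head_io_size computes to 160 KiB: at least 32 KiB and already a
 *     32 KiB multiple, so it is kept as the head.
 *   - split_pgin_headio == 0, so the 160 KiB head range is aborted instead
 *     of being read, and head_io_size drops back to 0.
 *   - first I/O:  32 KiB at offset 160 KiB, covering the needed page.
 *   - second I/O: the remaining 64 KiB at offset 192 KiB, issued as a single
 *     "unsplit" tail because split_all_pgin_equal == 0.
 */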
*/ size_t verify_block_size = 0; - if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) { + if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL, NULL) == 0) /* && verify_block_size */) { for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) { if (!upl_valid_page(pl, uio_last)) { break; @@ -5162,6 +5362,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, upl_t upl = NULL; upl_page_info_t *pl; off_t max_io_size; + size_t verify_block_size = 0; vm_offset_t upl_offset, vector_upl_offset = 0; upl_size_t upl_size = 0, vector_upl_size = 0; vm_size_t upl_needed_size; @@ -5196,6 +5397,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, int vector_upl_index = 0; upl_t vector_upl = NULL; cl_direct_read_lock_t *lock = NULL; + uint32_t verify_mask = 0; assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT); @@ -5269,6 +5471,14 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, uio_acct = uio_duplicate(uio); } + retval = VNOP_VERIFY(vp, 0, NULL, 0, &verify_block_size, NULL, VNODE_VERIFY_DEFAULT, NULL, NULL); + if (retval) { + verify_block_size = 0; + } else if (verify_block_size) { + assert((verify_block_size & (verify_block_size - 1)) == 0); + verify_mask = verify_block_size - 1; + } + next_dread: io_req_size = *read_length; iov_base = uio_curriovbase(uio); @@ -5306,6 +5516,18 @@ next_dread: misaligned = 1; } + if (verify_block_size && !misaligned && ((uio_offset(uio) & verify_mask) || (uio_resid(uio) & verify_mask))) { + /* + * If the offset is not aligned to the verification block size + * or the size is not aligned to the verification block size, + * we simply send this through the cached i/o path as that is + * what the Filesystem will end up doing anyway i.e. it will + * read all the remaining data in order to verify it and then + * discard the data it has read. + */ + misaligned = 1; + } + max_io_size = filesize - uio->uio_offset; /* diff --git a/bsd/vfs/vfs_cprotect.c b/bsd/vfs/vfs_cprotect.c index c791bf494..53bffd97e 100644 --- a/bsd/vfs/vfs_cprotect.c +++ b/bsd/vfs/vfs_cprotect.c @@ -123,7 +123,7 @@ cpx_alloc(size_t key_len, bool needs_ctx) assert(cpx_size(key_len) <= PAGE_SIZE); kmem_alloc(kernel_map, (vm_offset_t *)&cpx, PAGE_SIZE, - KMA_DATA | KMA_NOFAIL, VM_KERN_MEMORY_FILE); + KMA_KOBJECT | KMA_NOFAIL | KMA_ZERO, VM_KERN_MEMORY_FILE); //mark the page as protectable, since kmem_alloc succeeded. cpx->cpx_flags |= CPX_WRITE_PROTECTABLE; #else diff --git a/bsd/vfs/vfs_exclave_fs.c b/bsd/vfs/vfs_exclave_fs.c index 4bc855b71..34c275cb0 100644 --- a/bsd/vfs/vfs_exclave_fs.c +++ b/bsd/vfs/vfs_exclave_fs.c @@ -52,6 +52,7 @@ struct open_vnode { vnode_t vp; dev_t dev; uint64_t file_id; + uint32_t fstag; uint32_t open_count; #if (DEVELOPMENT || DEBUG) uint32_t flags; @@ -279,6 +280,35 @@ is_fs_writeable(uint32_t fs_tag) return (fs_tag == EFT_EXCLAVE) || (fs_tag == EFT_EXCLAVE_MAIN); } +/* + * Check if an ancestor of base_vp is a registered base dir. 
+ */ +static bool +is_parent_registered(vnode_t base_vp) +{ + vnode_t vp = base_vp->v_parent; + + while (vp != NULLVP) { + int i; + registered_fs_tag_t *rft; + for (i = 0; i <= rft_hashmask; i++) { + registered_tags_head_t *head = registered_tags_hash + i; + LIST_FOREACH(rft, head, link) { + if (rft->vp == vp) { + printf("vfs_exclave_fs: vnode [%s] has an ancestor which is a registered base_dir [%s], fstag %d\n", + base_vp->v_name ? base_vp->v_name : "no-name", + vp->v_name ? vp->v_name : "no-name", rft->fstag); + return true; + } + } + } + vp = vp->v_parent; + } + + return false; +} + + /* * Set a base directory for the given fs tag. */ @@ -312,22 +342,9 @@ set_base_dir(uint32_t fs_tag, vnode_t vp, fsioc_graft_info_t *graft_info, bool i goto out; } - /* - * make sure that a writable fs does not share a dev_t with another non writable fs (and vice versa) - * since writable vnodes are opened RW whereas non writable fs vnodes - * are opened RO - */ - int i; - bool is_writable_fs_tag = is_fs_writeable(fs_tag); - for (i = 0; i <= rft_hashmask; i++) { - registered_tags_head_t *head = registered_tags_hash + i; - LIST_FOREACH(rft, head, link) { - if ((is_fs_writeable(rft->fstag) != is_writable_fs_tag) && rft->dev == dev) { - printf("tag %u has same device 0x%x as tag %u\n", fs_tag, rft->fstag, dev); - error = EBUSY; - goto out; - } - } + if (is_parent_registered(vp)) { + error = EBUSY; + goto out; } rft = kalloc_type(registered_fs_tag_t, Z_WAITOK | Z_ZERO); @@ -500,12 +517,6 @@ vfs_exclave_fs_register(uint32_t fs_tag, vnode_t vp) return ENXIO; } -#if !defined(XNU_TARGET_OS_OSX) - if (fs_tag == EFT_EXCLAVE_MAIN) { - return ENOTSUP; - } -#endif - vnode_vfsname(vp, vfs_name); if (strcmp(vfs_name, "apfs")) { return ENOTSUP; @@ -584,7 +595,6 @@ vfs_exclave_fs_register_path(uint32_t fs_tag, const char *base_path) static void release_open_vnodes(registered_fs_tag_t *base_dir) { - dev_t dev; int i; lck_mtx_lock(&open_vnodes_mtx); @@ -593,27 +603,11 @@ release_open_vnodes(registered_fs_tag_t *base_dir) goto done; } - dev = base_dir->dev; - - if (num_tags_registered > 1) { - /* skip release if another base dir has the same device */ - for (i = 0; i <= rft_hashmask; i++) { - registered_tags_head_t *rfthead = registered_tags_hash + i; - registered_fs_tag_t *rft; - - LIST_FOREACH(rft, rfthead, link) { - if ((rft != base_dir) && (rft->dev == dev)) { - goto done; - } - } - } - } - for (i = 0; i < open_vnodes_hashmask + 1; i++) { struct open_vnode *entry, *temp_entry; LIST_FOREACH_SAFE(entry, &open_vnodes_hashtbl[i], chain, temp_entry) { - if (entry->dev != dev) { + if (entry->fstag != base_dir->fstag) { continue; } while (entry->open_count) { @@ -931,6 +925,7 @@ increment_vnode_open_count(vnode_t vp, registered_fs_tag_t *base_dir, uint64_t f entry->vp = vp; entry->dev = base_dir->dev; entry->file_id = file_id; + entry->fstag = base_dir->fstag; LIST_INSERT_HEAD(list, entry, chain); num_open_vnodes++; } diff --git a/bsd/vfs/vfs_exclave_fs.h b/bsd/vfs/vfs_exclave_fs.h index 6c8adcf93..886b1a874 100644 --- a/bsd/vfs/vfs_exclave_fs.h +++ b/bsd/vfs/vfs_exclave_fs.h @@ -51,6 +51,7 @@ typedef struct { #define EXCLAVE_FS_SYNC_OP_UBC 2 #define EXCLAVE_FS_REGISTER_ENTITLEMENT "com.apple.private.vfs.exclave-fs-register" +#define EXCLAVE_FS_LIST_ENTITLEMENT "com.apple.private.vfs.exclave-fs-list" int vfs_exclave_fs_start(void); void vfs_exclave_fs_stop(void); diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index f06b64c59..3db9b925d 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -108,7 +108,7 @@ 
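is_parent_registered() above simply walks v_parent pointers and compares each ancestor against every entry in the registered-tag hash. A toy userspace version of the same walk, with a parent-linked node type and a flat array standing in for the hash table:

/*
 * Toy ancestor walk: follow parent pointers and test each ancestor against a
 * set of registered base directories.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	const char  *name;
	struct node *parent;
};

static bool
has_registered_ancestor(const struct node *vp, const struct node *registered[], size_t nreg)
{
	for (const struct node *p = vp->parent; p != NULL; p = p->parent) {
		for (size_t i = 0; i < nreg; i++) {
			if (registered[i] == p) {
				printf("%s has registered ancestor %s\n", vp->name, p->name);
				return true;
			}
		}
	}
	return false;
}

int
main(void)
{
	struct node root = { "root", NULL };
	struct node base = { "base_dir", &root };
	struct node child = { "child", &base };
	const struct node *registered[] = { &base };

	return has_registered_ancestor(&child, registered, 1) ? 0 : 1;
}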
#if CONFIG_VOLFS -static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx); +static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx, vnode_t rdvp); #define MAX_VOLFS_RESTARTS 5 #endif @@ -127,9 +127,29 @@ static int lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *nd extern lck_rw_t rootvnode_rw_lock; -#define RESOLVE_NOFOLLOW_ANY 0x00000001 #define RESOLVE_CHECKED 0x80000000 -static int lookup_check_for_resolve_prefix(char *path, size_t pathbuflen, size_t len, uint32_t *resolve_flags, size_t *prefix_len); + +static KALLOC_HEAP_DEFINE(KHEAP_VFS_NAMEI, "vfs_namei", KHEAP_ID_DATA_BUFFERS); + +/* namei allocation/free methods */ + +__typed_allocators_ignore_push + +static void * +namei_alloc(size_t size) +{ + assert(size <= MAXLONGPATHLEN); + return kheap_alloc(KHEAP_VFS_NAMEI, size, Z_WAITOK_ZERO_NOFAIL); +} + +static void +namei_free(void *addr, size_t size) +{ + assert(size <= MAXLONGPATHLEN); + kheap_free(KHEAP_VFS_NAMEI, addr, size); +} + +__typed_allocators_ignore_pop /* * Convert a pathname into a pointer to a locked inode. @@ -267,7 +287,7 @@ retry_copy: } } - cnp->cn_pnbuf = zalloc(ZV_NAMEI); + cnp->cn_pnbuf = namei_alloc(MAXPATHLEN); cnp->cn_flags |= HASBUF; cnp->cn_pnlen = MAXPATHLEN; bytes_copied = 0; @@ -275,17 +295,13 @@ retry_copy: goto retry_copy; } else if (error == ENAMETOOLONG && (cnp->cn_flags & HASBUF) && (cnp->cn_pnlen * 2) <= MAXLONGPATHLEN && proc_support_long_paths(p)) { - if (cnp->cn_pnlen == MAXPATHLEN) { - /* First time we arrive here, the buffer came from ZV_NAMEI */ - zfree(ZV_NAMEI, cnp->cn_pnbuf); - } else { - kfree_data(cnp->cn_pnbuf, cnp->cn_pnlen); - } + /* First time we arrive here, the buffer came from namei_alloc */ + namei_free(cnp->cn_pnbuf, cnp->cn_pnlen); resolve_error = 0; cnp->cn_pnlen *= 2; - cnp->cn_pnbuf = kalloc_data(cnp->cn_pnlen, Z_WAITOK | Z_ZERO | Z_NOFAIL); + cnp->cn_pnbuf = namei_alloc(cnp->cn_pnlen); bytes_copied = 0; goto retry_copy; @@ -317,6 +333,26 @@ retry_copy: cnp->cn_pnlen -= resolve_prefix_len; ndp->ni_pathlen -= resolve_prefix_len; resolve_prefix_len = 0; + + /* Update ndp with the resolve flags */ + if (resolve_flags & RESOLVE_NODOTDOT) { + ndp->ni_flag |= NAMEI_NODOTDOT; + } + if (resolve_flags & RESOLVE_LOCAL) { + ndp->ni_flag |= NAMEI_LOCAL; + } + if (resolve_flags & RESOLVE_NODEVFS) { + ndp->ni_flag |= NAMEI_NODEVFS; + } + if (resolve_flags & RESOLVE_IMMOVABLE) { + ndp->ni_flag |= NAMEI_IMMOVABLE; + } + if (resolve_flags & RESOLVE_UNIQUE) { + ndp->ni_flag |= NAMEI_UNIQUE; + } + if (resolve_flags & RESOLVE_NOXATTRS) { + ndp->ni_flag |= NAMEI_NOXATTRS; + } } } @@ -351,18 +387,26 @@ retry_copy: char * realpath; size_t realpathlen; int realpath_err; + vnode_t rdvp = NULLVP; /* Attempt to resolve a legacy volfs style pathname. */ realpathlen = MAXPATHLEN; do { - if (realpathlen == MAXPATHLEN) { - realpath = zalloc(ZV_NAMEI); - } else { - /* - * To be consistent with the behavior of openbyid_np, which always supports - * long paths, do not gate our support on proc_support_long_paths either. - */ - realpath = kalloc_data(realpathlen, Z_WAITOK | Z_ZERO | Z_NOFAIL); + /* + * To be consistent with the behavior of openbyid_np, which always supports + * long paths, do not gate our support on proc_support_long_paths either. 
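The retry_copy logic above starts with a MAXPATHLEN buffer from namei_alloc() and, when the copy fails with ENAMETOOLONG and the caller supports long paths, doubles the buffer up to MAXLONGPATHLEN and tries again. A self-contained sketch of that grow-and-retry pattern, with stand-in limits and a stubbed copy routine:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PATH_INITIAL	1024	/* stand-in for MAXPATHLEN */
#define PATH_LONG_MAX	8192	/* stand-in for MAXLONGPATHLEN */

/* stub: fails with ENAMETOOLONG when the destination is too small */
static int
copy_path(const char *src, char *dst, size_t dstlen)
{
	if (strlen(src) + 1 > dstlen) {
		return ENAMETOOLONG;
	}
	strcpy(dst, src);
	return 0;
}

static char *
copy_path_growing(const char *src, size_t *lenp)
{
	size_t len = PATH_INITIAL;
	char *buf = malloc(len);

	while (buf != NULL) {
		if (copy_path(src, buf, len) == 0) {
			*lenp = len;
			return buf;
		}
		free(buf);
		if (len * 2 > PATH_LONG_MAX) {	/* same cap the kernel applies */
			return NULL;
		}
		len *= 2;
		buf = malloc(len);
	}
	return NULL;
}

int
main(void)
{
	size_t len = 0;
	char *p = copy_path_growing("/a/rather/ordinary/path", &len);

	if (p != NULL) {
		printf("copied into %zu-byte buffer: %s\n", len, p);
		free(p);
	}
	return 0;
}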
+ */ + realpath = namei_alloc(realpathlen); + + if (fdt_flag_test(&p->p_fd, FD_CHROOT)) { + proc_dirs_lock_shared(p); + if (fdt_flag_test(&p->p_fd, FD_CHROOT)) { + rdvp = p->p_fd.fd_rdir; + if (vnode_get(rdvp)) { + rdvp = NULLVP; + } + } + proc_dirs_unlock_shared(p); } /* * We only error out on the ENAMETOOLONG cases where we know that @@ -370,23 +414,21 @@ retry_copy: * realpathlen characters. In other failure cases, we may be dealing with a path * that legitimately looks like /.vol/1234/567 and is not meant to be translated */ - if ((realpath_err = vfs_getrealpath(&cnp->cn_pnbuf[6], realpath, realpathlen, ctx))) { - if (realpathlen == MAXPATHLEN) { - zfree(ZV_NAMEI, realpath); - } else { - kfree_data(realpath, realpathlen); - } + realpath_err = vfs_getrealpath(&cnp->cn_pnbuf[6], realpath, realpathlen, ctx, rdvp); + + if (rdvp) { + vnode_put(rdvp); + rdvp = NULLVP; + } + if (realpath_err) { + namei_free(realpath, realpathlen); if (realpath_err == ENOSPC || realpath_err == ENAMETOOLONG) { error = ENAMETOOLONG; } } else { size_t tmp_len; if (cnp->cn_flags & HASBUF) { - if (cnp->cn_pnlen == MAXPATHLEN) { - zfree(ZV_NAMEI, cnp->cn_pnbuf); - } else { - kfree_data(cnp->cn_pnbuf, cnp->cn_pnlen); - } + namei_free(cnp->cn_pnbuf, cnp->cn_pnlen); } cnp->cn_pnbuf = realpath; cnp->cn_pnlen = (int)realpathlen; @@ -469,7 +511,7 @@ retry_copy: /* Absolute paths are never allowed in NAMEI_RESOLVE_BENEATH */ lck_rw_unlock_shared(&rootvnode_rw_lock); proc_dirs_unlock_shared(p); - error = EACCES; + error = ENOTCAPABLE; goto error_out; } dp = ndp->ni_rootdir; @@ -563,6 +605,10 @@ retry_copy: * Check for symbolic link */ if ((cnp->cn_flags & ISSYMLINK) == 0) { + if ((ndp->ni_flag & NAMEI_UNIQUE) && ndp->ni_vp && vnode_hasmultipath(ndp->ni_vp)) { + error = ENOTCAPABLE; + goto out_drop; + } if (startdir_with_usecount) { vnode_rele(startdir_with_usecount); startdir_with_usecount = NULLVP; @@ -651,11 +697,7 @@ error_out: if ((cnp->cn_flags & HASBUF)) { cnp->cn_flags &= ~HASBUF; - if (cnp->cn_pnlen == MAXPATHLEN) { - zfree(ZV_NAMEI, cnp->cn_pnbuf); - } else { - kfree_data(cnp->cn_pnbuf, cnp->cn_pnlen); - } + namei_free(cnp->cn_pnbuf, cnp->cn_pnlen); } cnp->cn_pnbuf = NULL; ndp->ni_vp = NULLVP; @@ -714,7 +756,7 @@ namei_compound_available(vnode_t dp, struct nameidata *ndp) return 0; } -static int +int lookup_check_for_resolve_prefix(char *path, size_t pathbuflen, size_t len, uint32_t *resolve_flags, size_t *prefix_len) { int error = 0; @@ -744,7 +786,7 @@ lookup_check_for_resolve_prefix(char *path, size_t pathbuflen, size_t len, uint3 path[pathbuflen - 1] = '\0'; unsigned long flag_val = strtoul(flag, &next, 10); path[pathbuflen - 1] = last_char; - if (next[0] != '/' || (flag_val & ~(RESOLVE_NOFOLLOW_ANY))) { + if (next[0] != '/' || (flag_val & ~RESOLVE_VALIDMASK)) { error = EINVAL; goto out; } @@ -1007,6 +1049,15 @@ lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int */ if ((ndp->ni_flag & NAMEI_TRAILINGSLASH)) { if (dp->v_type != VDIR) { +#if CONFIG_MACF + /* + * Prevent the information disclosure on the vnode + */ + if (mac_vnode_check_stat(ctx, NOCRED, dp) == EPERM) { + error = EPERM; + goto out; + } +#endif /* CONFIG_MACF */ error = ENOTDIR; goto out; } @@ -1318,10 +1369,21 @@ dirloop: goto returned_from_lookup_path; } +#if NAMEDRSRCFORK + /* return ENOTCAPABLE if path lookup on named streams is prohibited. */ + if ((ndp->ni_flag & NAMEI_NOXATTRS) && + (ndp->ni_pathlen == sizeof(_PATH_RSRCFORKSPEC)) && + (ndp->ni_next[1] == '.' 
&& ndp->ni_next[2] == '.') && + bcmp(ndp->ni_next, _PATH_RSRCFORKSPEC, sizeof(_PATH_RSRCFORKSPEC)) == 0) { + error = ENOTCAPABLE; + goto bad; + } +#endif /* NAMEDRSRCFORK */ + /* * Handle "..": three special cases. * 1. if at starting directory (e.g. the cwd/usedvp) - * and RESOLVE_BENEATH, then return EACCES. + * and RESOLVE_BENEATH, then return ENOTCAPABLE. * 2. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. @@ -1331,9 +1393,14 @@ dirloop: * .. in the other file system. */ if ((cnp->cn_flags & ISDOTDOT)) { - /* if dp is the starting directory and RESOLVE_BENEATH, we should return EACCES */ + /* if dp is the starting directory and RESOLVE_BENEATH, we should return ENOTCAPABLE */ if ((ndp->ni_flag & NAMEI_RESOLVE_BENEATH) && (dp == ndp->ni_usedvp)) { - error = EACCES; + error = ENOTCAPABLE; + goto bad; + } + /* return ENOTCAPABLE if '..' path traversal is prohibited */ + if ((ndp->ni_flag & NAMEI_NODOTDOT)) { + error = ENOTCAPABLE; goto bad; } /* @@ -1424,6 +1491,15 @@ unionlookup: ndp->ni_vp = NULLVP; if (dp->v_type != VDIR) { +#if CONFIG_MACF + /* + * Prevent the information disclosure on the vnode + */ + if (mac_vnode_check_stat(ctx, NOCRED, dp) == EPERM) { + error = EPERM; + goto lookup_error; + } +#endif /* CONFIG_MACF */ error = ENOTDIR; goto lookup_error; } @@ -1625,7 +1701,7 @@ lookup_traverse_union(vnode_t dvp, vnode_t *new_dvp, vfs_context_t ctx) return 0; } - path = zalloc_flags(ZV_NAMEI, Z_WAITOK | Z_NOFAIL); + path = namei_alloc(MAXPATHLEN); /* * Walk back up to the mountpoint following the @@ -1669,7 +1745,7 @@ lookup_traverse_union(vnode_t dvp, vnode_t *new_dvp, vfs_context_t ctx) nameidone(&nd); done: if (path) { - zfree(ZV_NAMEI, path); + namei_free(path, MAXPATHLEN); } return error; } @@ -1738,6 +1814,21 @@ restart: break; // don't traverse into a forced unmount } + if ((ndp->ni_flag & NAMEI_LOCAL) && !(mp->mnt_flag & MNT_LOCAL)) { + /* Prevent a path lookup from ever crossing into a network filesystem */ + error = ENOTCAPABLE; + goto out; + } + if ((ndp->ni_flag & NAMEI_NODEVFS) && (strcmp(mp->mnt_vfsstat.f_fstypename, "devfs") == 0)) { + /* Prevent a path lookup into `devfs` filesystem */ + error = ENOTCAPABLE; + goto out; + } + if ((ndp->ni_flag & NAMEI_IMMOVABLE) && (mp->mnt_flag & MNT_REMOVABLE) && !(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) { + /* Prevent a path lookup into a removable filesystem */ + error = ENOTCAPABLE; + goto out; + } if (vfs_busy(mp, vbusyflags)) { mount_dropcrossref(mp, dp, 0); @@ -1833,6 +1924,11 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, bool *new_dp_has_i bool dp_has_iocount = false; if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { +#if CONFIG_MACF + if (mac_vnode_check_stat(ctx, NOCRED, ndp->ni_vp) == EPERM) { + return EPERM; + } +#endif /* CONFIG_MACF */ return ELOOP; } #if CONFIG_MACF @@ -1848,13 +1944,12 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, bool *new_dp_has_i if (need_newpathbuf) { if (!(cnp->cn_flags & HASBUF) || cnp->cn_pnlen == MAXPATHLEN) { - cp = zalloc(ZV_NAMEI); cplen = MAXPATHLEN; } else { assert(proc_support_long_paths(vfs_context_proc(ctx))); - cp = kalloc_data(cnp->cn_pnlen, Z_WAITOK | Z_ZERO); cplen = cnp->cn_pnlen; } + cp = namei_alloc(cplen); } else { cp = cnp->cn_pnbuf; } @@ -1891,11 +1986,7 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, bool *new_dp_has_i if (error) { if (need_newpathbuf) { - if (cplen == MAXPATHLEN) { - zfree(ZV_NAMEI, cp); - } else { - kfree_data(cp, cplen); - } + namei_free(cp, cplen); } 
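The NAMEI_NODOTDOT check and the mount-crossing checks added above all fail the lookup with ENOTCAPABLE when a restriction is violated. A toy sketch of those per-step checks, using hypothetical POLICY_* bits and a simplified mount description (the real NAMEI_IMMOVABLE test also exempts virtual devices, and the real errno comes from the kernel headers):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define POLICY_NODOTDOT		0x1	/* reject ".." components       */
#define POLICY_LOCAL		0x2	/* reject non-local filesystems */
#define POLICY_NODEVFS		0x4	/* reject crossing into devfs   */
#define POLICY_IMMOVABLE	0x8	/* reject removable media       */

#ifndef ENOTCAPABLE
#define ENOTCAPABLE EPERM		/* placeholder where userspace errno.h lacks it */
#endif

struct toy_mount {
	const char *fstype;
	bool        local;
	bool        removable;
};

static int
check_component(const char *name, unsigned policy)
{
	if ((policy & POLICY_NODOTDOT) && strcmp(name, "..") == 0) {
		return ENOTCAPABLE;
	}
	return 0;
}

static int
check_mount_crossing(const struct toy_mount *mp, unsigned policy)
{
	if ((policy & POLICY_LOCAL) && !mp->local) {
		return ENOTCAPABLE;
	}
	if ((policy & POLICY_NODEVFS) && strcmp(mp->fstype, "devfs") == 0) {
		return ENOTCAPABLE;
	}
	if ((policy & POLICY_IMMOVABLE) && mp->removable) {
		return ENOTCAPABLE;
	}
	return 0;
}

int
main(void)
{
	struct toy_mount nfs = { "nfs", false, false };

	printf("'..' under NODOTDOT: %d\n", check_component("..", POLICY_NODOTDOT));
	printf("nfs under LOCAL: %d\n", check_mount_crossing(&nfs, POLICY_LOCAL));
	return 0;
}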
return error; } @@ -1908,11 +1999,7 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, bool *new_dp_has_i cnp->cn_pnlen = cplen; if ((cnp->cn_flags & HASBUF)) { - if (tmplen == MAXPATHLEN) { - zfree(ZV_NAMEI, tmppn); - } else { - kfree_data(tmppn, tmplen); - } + namei_free(tmppn, tmplen); } else { cnp->cn_flags |= HASBUF; } @@ -1943,10 +2030,10 @@ lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, bool *new_dp_has_i * Check if symbolic link restarts us at the root */ if (*(cnp->cn_nameptr) == '/') { - /* return EACCES if resolve beneath and the symlink restarts at root */ + /* return ENOTCAPABLE if resolve beneath and the symlink restarts at root */ if (ndp->ni_flag & NAMEI_RESOLVE_BENEATH) { vnode_put(dp); /* ALWAYS have a dvp for a symlink */ - return EACCES; + return ENOTCAPABLE; } while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; @@ -2093,11 +2180,7 @@ nameidone(struct nameidata *ndp) ndp->ni_cnd.cn_pnbuf = NULL; ndp->ni_cnd.cn_flags &= ~HASBUF; - if (ndp->ni_cnd.cn_pnlen == MAXPATHLEN) { - zfree(ZV_NAMEI, tmp); - } else { - kfree_data(tmp, ndp->ni_cnd.cn_pnlen); - } + namei_free(tmp, ndp->ni_cnd.cn_pnlen); } } @@ -2244,7 +2327,7 @@ vfs_getbyid(fsid_t *fsid, ino64_t ino, vnode_t *vpp, vfs_context_t ctx) */ #if CONFIG_VOLFS static int -vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx) +vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx, vnode_t rdvp) { vnode_t vp; struct mount *mp = NULL; @@ -2310,6 +2393,33 @@ vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_ } realpath[0] = '\0'; + /* Check for and fail if the path is not under the chroot */ + if (rdvp != NULLVP) { + int is_subdir = 0; + vnode_t pvp = NULLVP; + + /* Get the parent if vp is not a directory */ + if (!vnode_isdir(vp) && !(pvp = vnode_getparent(vp))) { + error = EINVAL; + vnode_put(vp); + goto out; + } + + /* Check if a given directory vp/pvp is a subdirectory of rdvp */ + error = vnode_issubdir(pvp ? pvp : vp, rdvp, &is_subdir, ctx); + if (pvp) { + vnode_put(pvp); + } + if (error || !is_subdir) { + if (!error) { + /* Path is not under the chroot */ + error = EINVAL; + } + vnode_put(vp); + goto out; + } + } + /* Get the absolute path to this vnode. 
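The chroot check above relies on vnode_issubdir() to confirm the resolved vnode sits beneath the process root before the /.vol translation is accepted. A rough userspace analogue using realpath(3) and a prefix comparison, which ignores the mount- and race-related subtleties the vnode-level check handles:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool
is_beneath(const char *root, const char *target)
{
	char rroot[PATH_MAX], rtarget[PATH_MAX];

	if (realpath(root, rroot) == NULL || realpath(target, rtarget) == NULL) {
		return false;
	}
	if (strcmp(rroot, "/") == 0) {
		return true;		/* everything is beneath the real root */
	}
	size_t rlen = strlen(rroot);
	return strncmp(rroot, rtarget, rlen) == 0 &&
	    (rtarget[rlen] == '/' || rtarget[rlen] == '\0');
}

int
main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s root target\n", argv[0]);
		return 2;
	}
	printf("%s\n", is_beneath(argv[1], argv[2]) ? "beneath" : "outside");
	return 0;
}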
*/ error = build_path(vp, realpath, (int)bufsize, &length, 0, ctx); vnode_put(vp); diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index ecf3fabf0..77a16c1a2 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -608,7 +609,7 @@ vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags) if ((vp->v_usecount != 0) && ((vp->v_usecount - vp->v_kusecount) != 0)) { ret = 1; if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) { - vprint("vnode_umount_preflight - busy vnode", vp); + vprint_path("vnode_umount_preflight - busy vnode", vp); } else { return ret; } @@ -618,7 +619,7 @@ vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags) if (vp->v_iocount > 0) { ret = 1; if (print_busy_vnodes && ((flags & FORCECLOSE) == 0)) { - vprint("vnode_umount_preflight - busy vnode", vp); + vprint_path("vnode_umount_preflight - busy vnode", vp); } else { return ret; } @@ -2228,7 +2229,7 @@ found_alias: nvp->v_specinfo->si_opencount = 0; nvp->v_specinfo->si_initted = 0; nvp->v_specinfo->si_throttleable = 0; - nvp->v_specinfo->si_devbsdunit = LOWPRI_MAX_NUM_DEV; + nvp->v_specinfo->si_devbsdunit = LOWPRI_MAX_NUM_DEV - 1; SPECHASH_LOCK(); @@ -2343,13 +2344,12 @@ vnode_ref_ext(vnode_t vp, int fmode, int flags) /* * if you are the owner of drain/termination, can acquire usecount */ - if ((flags & VNODE_REF_FORCE) == 0) { - if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { - if (vp->v_owner != current_thread()) { - error = ENOENT; - goto out; - } - } + if (((flags & VNODE_REF_FORCE) == 0) && + ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) && + !(vp->v_lflag & VL_OPSCHANGE) && + (vp->v_owner != current_thread())) { + error = ENOENT; + goto out; } /* Enable atomic ops on v_usecount without the vnode lock */ @@ -2966,12 +2966,12 @@ loop: continue; } + vnode_unlock(vp); /* log vnodes blocking unforced unmounts */ if (print_busy_vnodes && first_try && ((flags & FORCECLOSE) == 0)) { - vprint("vflush - busy vnode", vp); + vprint_path("vflush - busy vnode", vp); } - vnode_unlock(vp); mount_lock(mp); busy++; } @@ -3610,17 +3610,39 @@ int prtactive = 0; /* 1 => print out reclaim of active vnodes */ static const char *typename[] = { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" }; -void -vprint(const char *label, struct vnode *vp) +static void +vprint_internal(const char *label, struct vnode *vp, bool with_path) { char sbuf[64]; if (label != NULL) { printf("%s: ", label); } - printf("name %s type %s, usecount %d, writecount %d\n", - vp->v_name, typename[vp->v_type], - vp->v_usecount, vp->v_writecount); + + if (with_path) { + char const *path = NULL; + char *vn_path = NULL; + vm_size_t vn_pathlen = MAXPATHLEN; + + vn_path = zalloc(ZV_NAMEI); + if (vn_getpath(vp, vn_path, (int*)&vn_pathlen) == 0) { + path = vn_path; + } else { + path = "(get vnode path failed)"; + } + + printf("name %s, type %s, usecount %d, writecount %d, path %s\n", + vp->v_name, typename[vp->v_type], + vp->v_usecount, vp->v_writecount, path); + + if (vn_path) { + zfree(ZV_NAMEI, vn_path); + } + } else { + printf("name %s, type %s, usecount %d, writecount %d\n", + vp->v_name, typename[vp->v_type], + vp->v_usecount, vp->v_writecount); + } sbuf[0] = '\0'; if (vp->v_flag & VROOT) { strlcat(sbuf, "|VROOT", sizeof(sbuf)); @@ -3641,10 +3663,22 @@ vprint(const char *label, struct vnode *vp) strlcat(sbuf, "|VALIASED", sizeof(sbuf)); } if (sbuf[0] != '\0') { - printf("vnode flags (%s\n", &sbuf[1]); + printf("vnode flags 
(%s)\n", &sbuf[1]); } } +void +vprint(const char *label, struct vnode *vp) +{ + vprint_internal(label, vp, false); +} + +void +vprint_path(const char *label, struct vnode *vp) +{ + vprint_internal(label, vp, true); +} + static int vn_getpath_flags_to_buildpath_flags(int flags) { @@ -3780,9 +3814,9 @@ vn_getpath_no_firmlink(struct vnode *vp, char *pathbuf, int *len) } int -vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash) +vn_getcdhash(struct vnode *vp, off_t offset, unsigned char *cdhash, uint8_t *type) { - return ubc_cs_getcdhash(vp, offset, cdhash); + return ubc_cs_getcdhash(vp, offset, cdhash, type); } @@ -6162,6 +6196,7 @@ vnode_drop_internal(vnode_t vp, bool locked) return vp; } + vnode_lock_convert(vp); vnode_list_lock(); /* @@ -6182,7 +6217,7 @@ vnode_drop_internal(vnode_t vp, bool locked) #if CONFIG_MACF struct label *tmpl = mac_vnode_label(vp); - vp->v_label = NULL; + os_atomic_store(&vp->v_label, NULL, release); #endif /* CONFIG_MACF */ vnode_unlock(vp); @@ -6798,6 +6833,7 @@ vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags) panic("vnode reclaim in progress"); } vp->v_lflag |= VL_TERMINATE; + vp->v_lflag &= ~VL_OPSCHANGE; vn_clearunionwait(vp, 1); @@ -7180,9 +7216,11 @@ vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); } - if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) { - vp->v_flag |= VISUNION; +#if NAMEDSTREAMS + if (cnp->cn_flags & MARKISSHADOW) { + vp->v_flag |= VISSHADOW; } +#endif } if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { /* @@ -7345,8 +7383,6 @@ vnode_initialize(uint32_t __unused flavor, uint32_t size, void *data, vnode_t *v * vnode_create_internal. */ vnode_lock_spin(*vpp); - VNASSERT(((*vpp)->v_iocount == 1), *vpp, - ("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount)); VNASSERT(((*vpp)->v_usecount == 0), *vpp, ("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount)); VNASSERT(((*vpp)->v_lflag & VL_DEAD), *vpp, @@ -8206,6 +8242,10 @@ dot_underbar_check_paired_vnode(struct componentname *cnp, vnode_t vp, int error = 0; bool dvp_needs_put = false; + if (cnp->cn_namelen <= 2 || cnp->cn_nameptr[0] != '.' || cnp->cn_nameptr[1] != '_') { + return 0; + } + if (!dvp) { if ((dvp = vnode_getparent(vp)) == NULLVP) { return 0; @@ -8240,7 +8280,7 @@ dot_underbar_check_paired_vnode(struct componentname *cnp, vnode_t vp, int vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved) { -#if !CONFIG_MACF +#if (!CONFIG_MACF && !NAMEDRSRCFORK) #pragma unused(cnp) #endif int error = 0; @@ -8259,14 +8299,26 @@ vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_cont if (!error) { error = mac_vnode_check_unlink(ctx, dvp, vp, cnp); #if CONFIG_APPLEDOUBLE - if (!error && !(NATIVE_XATTR(dvp)) && - (cnp->cn_namelen > (sizeof("._a") - 1)) && - cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '_') { + if (!error && !NATIVE_XATTR(dvp)) { error = dot_underbar_check_paired_vnode(cnp, vp, dvp, ctx); } #endif /* CONFIG_APPLEDOUBLE */ } #endif /* MAC */ + + /* authorize file's resource fork */ +#if NAMEDRSRCFORK + if (!error && cnp && (cnp->cn_flags & CN_WANTSRSRCFORK)) { + /* If CN_WANTSRSRCFORK is set, that implies that 'dvp' is the base file and 'vp' is the namedstream file */ +#if CONFIG_MACF + error = mac_vnode_check_deleteextattr(ctx, dvp, XATTR_RESOURCEFORK_NAME); +#endif /* MAC */ + if (!error) { + error = vnode_authorize(dvp, NULL, KAUTH_VNODE_WRITE_EXTATTRIBUTES, ctx); + } + } +#endif /* NAMEDRSRCFORK */ + if (!error) { error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); } @@ -8337,9 +8389,7 @@ vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs } } #if CONFIG_APPLEDOUBLE - if (fmode & (FWRITE | O_TRUNC) && !(NATIVE_XATTR(vp)) && - (cnp->cn_namelen > (sizeof("._a") - 1)) && - cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '_') { + if (fmode & (FWRITE | O_TRUNC) && !NATIVE_XATTR(vp)) { error = dot_underbar_check_paired_vnode(cnp, vp, NULLVP, ctx); if (error) { return error; @@ -8348,6 +8398,39 @@ vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs #endif /* CONFIG_APPLEDOUBLE */ #endif + /* authorize file's resource fork */ +#if NAMEDRSRCFORK + if (cnp && (cnp->cn_flags & CN_WANTSRSRCFORK)) { + /* If CN_WANTSRSRCFORK is set, that implies that 'pvp' is the base file and 'vp' is the namedstream file */ + vnode_t pvp = vnode_getparent(vp); + if (pvp == NULLVP) { + return ENOENT; + } + +#if CONFIG_MACF + error = mac_vnode_check_getextattr(ctx, pvp, XATTR_RESOURCEFORK_NAME, NULL); + if (error) { + vnode_put(pvp); + return error; + } +#endif /* MAC */ + + action = 0; + if (fmode & FREAD) { + action |= KAUTH_VNODE_READ_EXTATTRIBUTES; + } + if (fmode & (FWRITE | O_TRUNC)) { + action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; + } + error = vnode_authorize(pvp, NULL, action, ctx); + if (error) { + vnode_put(pvp); + return error; + } + vnode_put(pvp); + } +#endif /* NAMEDRSRCFORK */ + /* compute action to be authorized */ action = 0; if (fmode & FREAD) { @@ -8399,6 +8482,7 @@ vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *v #endif /* Creation case */ int error; + kauth_action_t action = KAUTH_VNODE_ADD_FILE; if (cnp->cn_ndp == NULL) { panic("NULL cn_ndp"); @@ -8415,6 +8499,21 @@ vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *v } } + /* authorize file's resource fork */ +#if NAMEDRSRCFORK + if (cnp && (cnp->cn_flags & CN_WANTSRSRCFORK)) { + /* If CN_WANTSRSRCFORK is set, that implies that 'dvp' is the base file and 'vp' is the namedstream file */ +#if CONFIG_MACF + error = mac_vnode_check_setextattr(ctx, dvp, XATTR_RESOURCEFORK_NAME, NULL); + if (error) { + return error; + } +#endif /* MAC */ + + action |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; + } +#endif /* NAMEDRSRCFORK */ + #if CONFIG_MACF error = mac_vnode_check_create(ctx, dvp, cnp, vap); if (error) { @@ -8422,7 +8521,7 @@ vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *v } #endif /* CONFIG_MACF */ - return vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx); + return vnode_authorize(dvp, NULL, action, ctx); } int @@ -8483,15 +8582,11 @@ vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct co error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp); } #if CONFIG_APPLEDOUBLE - if (!error && !(NATIVE_XATTR(fdvp)) && - 
fcnp->cn_namelen > (sizeof("._a") - 1) && - fcnp->cn_nameptr[0] == '.' && fcnp->cn_nameptr[1] == '_') { + if (!error && !NATIVE_XATTR(fdvp)) { error = dot_underbar_check_paired_vnode(fcnp, fvp, fdvp, ctx); } /* Currently no Filesystem that does not support native xattrs supports rename swap */ - if (!error && swap && !(NATIVE_XATTR(tdvp)) && - (tcnp->cn_namelen > (sizeof("._a") - 1)) && - (tcnp->cn_nameptr[0] == '.') && (tcnp->cn_nameptr[1] == '_')) { + if (!error && swap && !NATIVE_XATTR(tdvp)) { error = dot_underbar_check_paired_vnode(tcnp, tvp, tdvp, ctx); } #endif /* CONFIG_APPLEDOUBLE */ @@ -9178,6 +9273,18 @@ vauth_dir_ingroup(vauth_ctx vcp, int *ismember, int idontknow) return error; } +static int +vfs_context_ignores_node_permissions(vfs_context_t ctx) +{ + if (proc_ignores_node_permissions(vfs_context_proc(ctx))) { + return 1; + } + if (get_bsdthread_info(vfs_context_thread(ctx))->uu_flag & UT_IGNORE_NODE_PERMISSIONS) { + return 1; + } + return 0; +} + /* * Test the posix permissions in (vap) to determine whether (credential) * may perform (action) @@ -9226,7 +9333,7 @@ vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) * Processes with the appropriate entitlement can marked themselves as * ignoring file/directory permissions if they own it. */ - if (!owner_ok && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + if (!owner_ok && vfs_context_ignores_node_permissions(vcp->ctx)) { owner_ok = 1; } @@ -9424,7 +9531,7 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) switch (eval.ae_result) { case KAUTH_RESULT_DENY: - if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + if (vauth_file_owner(vcp) && vfs_context_ignores_node_permissions(vcp->ctx)) { KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp); return 0; } @@ -9496,7 +9603,7 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) } switch (eval.ae_result) { case KAUTH_RESULT_DENY: - if (vauth_dir_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + if (vauth_dir_owner(vcp) && vfs_context_ignores_node_permissions(vcp->ctx)) { KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp); return 0; } @@ -9623,7 +9730,7 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r switch (eval.ae_result) { case KAUTH_RESULT_DENY: - if (vauth_file_owner(vcp) && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + if (vauth_file_owner(vcp) && vfs_context_ignores_node_permissions(vcp->ctx)) { KAUTH_DEBUG("%p Override DENY due to entitlement", vcp->vp); return 0; } @@ -9817,7 +9924,7 @@ vnode_authorize_checkimmutable(mount_t mp, vauth_ctx vcp, } else { owner = vauth_file_owner(vcp); } - if (owner && proc_ignores_node_permissions(vfs_context_proc(vcp->ctx))) { + if (owner && vfs_context_ignores_node_permissions(vcp->ctx)) { error = vnode_immutable(vap, append, 1); } } @@ -13323,3 +13430,62 @@ vnode_rdadvise(vnode_t vp, off_t offset, int len, vfs_context_t ctx) return VNOP_IOCTL(vp, F_RDADVISE, (caddr_t)&ra_struct, 0, ctx); } + +int +vnode_hasmultipath(vnode_t vp) +{ + struct vnode_attr va; + bool is_local_volume = !!(vp->v_mount->mnt_flag & MNT_LOCAL); + bool link_locked = false; + int has_multipath = 0; + int error; + + /* + * If the volume doesn't support directory hard link then the directory + * can't be a hard link. 
+ */ + if ((vp->v_type == VDIR) && is_local_volume && + !(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) { + goto out; + } + + vnode_link_lock(vp); + link_locked = true; + + if (is_local_volume && (vp->v_ext_flag & VE_NOT_HARDLINK)) { + goto out; + } + + /* + * Not all file systems adopt vnode_setmultipath() to mark a vnode is + * hard link (VISHARDLINK) so we need to call into the file system to get + * the link count attributes to determine if the vnode has multiple paths. + */ + VATTR_INIT(&va); + VATTR_WANTED(&va, va_nlink); + VATTR_WANTED(&va, va_dirlinkcount); + + error = vnode_getattr(vp, &va, vfs_context_current()); + if (error) { + goto out; + } + + if ((vp->v_type == VDIR) && VATTR_IS_SUPPORTED(&va, va_dirlinkcount)) { + has_multipath = (va.va_dirlinkcount > 1); + } else if (VATTR_IS_SUPPORTED(&va, va_nlink)) { + has_multipath = (va.va_nlink > 1); + } + + if (has_multipath == 0) { + vnode_lock_spin(vp); + vp->v_ext_flag |= VE_NOT_HARDLINK; + vnode_unlock(vp); + } + +out: + if (link_locked) { + vnode_link_unlock(vp); + } + + return has_multipath; +} diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index fe34d1bb3..a2ce29ce8 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -729,57 +729,92 @@ graftdmg(__unused proc_t p, struct graftdmg_args *uap, __unused int32_t *retval) graftdmg_args_un kern_gda = {}; int error = 0; secure_boot_cryptex_args_t *sbc_args = NULL; + bool graft_on_parent = (ua_mountdir == USER_ADDR_NULL); vnode_t cryptex_vp = NULLVP; - vnode_t mounton_vp = NULLVP; struct nameidata nd = {}; vfs_context_t ctx = vfs_context_current(); +#if CONFIG_MACF + vnode_t parent_vp = NULLVP; +#endif if (!IOTaskHasEntitlement(vfs_context_task(ctx), GRAFTDMG_ENTITLEMENT)) { return EPERM; } + // Copy graftargs in, if provided. error = copyin(ua_graftargs, &kern_gda, sizeof(graftdmg_args_un)); if (error) { return error; } - // Copy mount dir in, if provided. - if (ua_mountdir != USER_ADDR_NULL) { - // Acquire vnode for mount-on path + // Convert fd to vnode. + error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp); + if (error) { + return error; + } + + if (vnode_isdir(cryptex_vp)) { + error = EISDIR; + goto graftout; + } + +#if CONFIG_MACF + if (graft_on_parent) { + // Grafting on Cryptex file parent directory, need to get its vp for MAC check. + parent_vp = vnode_getparent(cryptex_vp); + if (parent_vp == NULLVP) { + error = ENOENT; + goto graftout; + } + } +#endif + + if (!graft_on_parent) { NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1), UIO_USERSPACE, ua_mountdir, ctx); error = namei(&nd); if (error) { - return error; + goto graftout; } - mounton_vp = nd.ni_vp; } - // Convert fd to vnode. - error = vnode_getfromfd(ctx, ua_dmgfd, &cryptex_vp); +#if CONFIG_MACF + vnode_t macf_vp = graft_on_parent ? parent_vp : nd.ni_vp; + error = mac_graft_check_graft(ctx, macf_vp); if (error) { goto graftout; } +#endif if (ua_grafttype == 0 || ua_grafttype > GRAFTDMG_CRYPTEX_MAX) { error = EINVAL; } else { sbc_args = &kern_gda.sbc_args; - error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, cryptex_vp, mounton_vp); + error = graft_secureboot_cryptex(ua_grafttype, sbc_args, ctx, + cryptex_vp, graft_on_parent ? 
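vnode_hasmultipath() above asks the filesystem for link counts because not every filesystem marks hard-linked vnodes. From userspace, the same question for a regular file is just st_nlink; directories need the separate directory-link count the kernel requests via va_dirlinkcount, since a plain link count on a directory also reflects its children:

#include <stdio.h>
#include <sys/stat.h>

int
main(int argc, char *argv[])
{
	struct stat st;

	if (argc != 2) {
		fprintf(stderr, "usage: %s file\n", argv[0]);
		return 2;
	}
	if (stat(argv[1], &st) != 0) {
		perror(argv[1]);
		return 1;
	}
	if (!S_ISREG(st.st_mode)) {
		fprintf(stderr, "%s: not a regular file\n", argv[1]);
		return 1;
	}
	printf("%s has %llu link(s): %s\n", argv[1],
	    (unsigned long long)st.st_nlink,
	    st.st_nlink > 1 ? "multiple paths" : "single path");
	return 0;
}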
NULLVP : nd.ni_vp); } +#if CONFIG_MACF + if (!error) { + mac_graft_notify_graft(ctx, macf_vp); + } +#endif + graftout: - if (cryptex_vp) { +#if CONFIG_MACF + if (parent_vp != NULLVP) { + vnode_put(parent_vp); + parent_vp = NULLVP; + } +#endif + if (cryptex_vp != NULLVP) { vnode_put(cryptex_vp); cryptex_vp = NULLVP; } - if (mounton_vp) { - vnode_put(mounton_vp); - mounton_vp = NULLVP; - } - if (ua_mountdir != USER_ADDR_NULL) { + if (nd.ni_vp != NULLVP) { + vnode_put(nd.ni_vp); nameidone(&nd); } @@ -795,8 +830,7 @@ ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *ret { int error = 0; user_addr_t ua_mountdir = uap->mountdir; - fsioc_ungraft_fs_t ugfs; - vnode_t mounton_vp = NULLVP; + fsioc_ungraft_fs_t ugfs = {}; struct nameidata nd = {}; vfs_context_t ctx = vfs_context_current(); @@ -804,11 +838,13 @@ ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *ret return EPERM; } - if (uap->flags != 0 || ua_mountdir == USER_ADDR_NULL) { + if (ua_mountdir == USER_ADDR_NULL) { return EINVAL; } - ugfs.ungraft_flags = 0; + if (uap->flags & UNGRAFTDMG_NOFORCE) { + ugfs.ungraft_flags |= FSCTL_UNGRAFT_NOFORCE; + } // Acquire vnode for mount-on path NDINIT(&nd, LOOKUP, OP_MOUNT, (FOLLOW | AUDITVNPATH1), @@ -818,12 +854,30 @@ ungraftdmg(__unused proc_t p, struct ungraftdmg_args *uap, __unused int32_t *ret if (error) { return error; } - mounton_vp = nd.ni_vp; + + if (!vnode_isdir(nd.ni_vp)) { + error = ENOTDIR; + goto ungraftout; + } + +#if CONFIG_MACF + error = mac_graft_check_ungraft(ctx, nd.ni_vp); + if (error) { + goto ungraftout; + } +#endif // Call into the FS to perform the ungraft - error = VNOP_IOCTL(mounton_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx); + error = VNOP_IOCTL(nd.ni_vp, FSIOC_UNGRAFT_FS, (caddr_t)&ugfs, 0, ctx); - vnode_put(mounton_vp); +#if CONFIG_MACF + if (!error) { + mac_graft_notify_ungraft(ctx, nd.ni_vp); + } +#endif + +ungraftout: + vnode_put(nd.ni_vp); nameidone(&nd); return error; @@ -2769,6 +2823,53 @@ unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval) return safedounmount(mp, flags, ctx); } +int +funmount(__unused proc_t p, struct funmount_args *uap, __unused int32_t *retval) +{ + int error; + vnode_t vp; + struct mount *mp; + vfs_context_t ctx; + + AUDIT_ARG(fd, uap->fd); + AUDIT_ARG(fflags, uap->flags); + + /* + * If the process has the entitlement, use the kernel's context when + * performing lookup on the mount path as the process might lack proper + * permission to access the directory. + */ + ctx = IOCurrentTaskHasEntitlement(ROLE_ACCOUNT_UNMOUNT_ENTITLEMENT) ? + vfs_context_kernel() : vfs_context_current(); + + error = vnode_getfromfd(ctx, uap->fd, &vp); + if (error) { + return error; + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vnode_put(vp); + return EINVAL; + } + mp = vnode_mount(vp); + +#if CONFIG_MACF + error = mac_mount_check_umount(ctx, mp); + if (error != 0) { + vnode_put(vp); + return error; + } +#endif + mount_ref(mp, 0); + vnode_put(vp); + + /* safedounmount consumes the mount ref */ + return safedounmount(mp, uap->flags, ctx); +} + int vfs_unmountbyfsid(fsid_t *fsid, int flags, vfs_context_t ctx) { @@ -2937,6 +3038,12 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) mount_generation++; name_cache_unlock(); + /* + * Make sure there are no one in the mount iterations or lookup. 
+ * Drain makes 'mnt_iterref' -ve so on error exit we need to ensure that + * 'mnt_iterref' is reset back to 0 by calling mount_iterreset(). + */ + mount_iterdrain(mp); lck_rw_lock_exclusive(&mp->mnt_rwlock); if (withref != 0) { @@ -2948,6 +3055,7 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) if ((mp->mnt_flag & MNT_RDONLY) == 0) { error = VFS_SYNC(mp, MNT_WAIT, ctx); if (error) { + mount_iterreset(mp); mount_lock(mp); mp->mnt_kern_flag &= ~MNTK_UNMOUNT; mp->mnt_lflag &= ~MNT_LUNMOUNT; @@ -2968,6 +3076,7 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) } error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags); if ((forcedunmount == 0) && error) { + mount_iterreset(mp); mount_lock(mp); mp->mnt_kern_flag &= ~MNTK_UNMOUNT; mp->mnt_lflag &= ~MNT_LUNMOUNT; @@ -2975,9 +3084,6 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) goto out; } - /* make sure there are no one in the mount iterations or lookup */ - mount_iterdrain(mp); - error = VFS_UNMOUNT(mp, flags, ctx); if (error) { mount_iterreset(mp); @@ -4115,6 +4221,72 @@ vnode_getfromfd(vfs_context_t ctx, int fd, vnode_t *vpp) return error; } +int +vnode_getfromid(int volfs_id, uint64_t objid, vfs_context_t ctx, int realfsid, vnode_t *vpp) +{ + int error = 0; + vnode_t vp = NULLVP; + struct mount *mp = NULL; + + if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) { + error = ENOTSUP; /* unexpected failure */ + return ENOTSUP; + } + +#if CONFIG_UNION_MOUNTS +unionget: +#endif /* CONFIG_UNION_MOUNTS */ + if (objid == 2) { + struct vfs_attr vfsattr; + int use_vfs_root = TRUE; + + VFSATTR_INIT(&vfsattr); + VFSATTR_WANTED(&vfsattr, f_capabilities); + if (!realfsid && + vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 && + VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { + if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) && + (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) { + use_vfs_root = FALSE; + } + } + + if (use_vfs_root) { + error = VFS_ROOT(mp, &vp, ctx); + } else { + error = VFS_VGET(mp, objid, &vp, ctx); + } + } else { + error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx); + } + +#if CONFIG_UNION_MOUNTS + if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) { + /* + * If the fileid isn't found and we're in a union + * mount volume, then see if the fileid is in the + * mounted-on volume. + */ + struct mount *tmp = mp; + mp = vnode_mount(tmp->mnt_vnodecovered); + vfs_unbusy(tmp); + if (vfs_busy(mp, LK_NOWAIT) == 0) { + goto unionget; + } + } else { + vfs_unbusy(mp); + } +#else + vfs_unbusy(mp); +#endif /* CONFIG_UNION_MOUNTS */ + + if (!error) { + *vpp = vp; + } + + return error; +} + /* * Wrapper function around namei to start lookup from a directory * specified by a file descriptor ni_dirfd. 
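vnode_getfromid() above is the kernel half of resolving a (volfs id, object id) pair back to a file, the same machinery fsgetpath(2) and openbyid_np sit on. A userspace sketch of the round trip, assuming the macOS fsgetpath(2) prototype from <sys/fsgetpath.h>; stat/statfs supply the inode number and fsid:

#include <stdint.h>
#include <stdio.h>
#include <sys/attr.h>
#include <sys/fsgetpath.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>

int
main(int argc, char *argv[])
{
	struct stat st;
	struct statfs sfs;
	char path[1024];

	if (argc != 2) {
		fprintf(stderr, "usage: %s path\n", argv[0]);
		return 2;
	}
	if (stat(argv[1], &st) != 0 || statfs(argv[1], &sfs) != 0) {
		perror(argv[1]);
		return 1;
	}
	ssize_t len = fsgetpath(path, sizeof(path), &sfs.f_fsid, (uint64_t)st.st_ino);
	if (len < 0) {
		perror("fsgetpath");
		return 1;
	}
	printf("fsid/inode resolved back to: %s\n", path);
	return 0;
}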
@@ -5189,7 +5361,7 @@ open_dprotected_np(__unused proc_t p, struct open_dprotected_np_args *uap, int32 static int openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode, - int fd, enum uio_seg segflg, int *retval) + int fd, enum uio_seg segflg, int *retval, uint64_t *objidp, fsid_t *fsidp) { struct filedesc *fdp = &vfs_context_proc(ctx)->p_fd; struct { @@ -5210,11 +5382,23 @@ openat_internal(vfs_context_t ctx, user_addr_t path, int flags, int mode, cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; VATTR_SET(vap, va_mode, cmode & ACCESSPERMS); + /* Check for fileid and fsid authentication */ + if (objidp || fsidp) { + if (!objidp || !fsidp) { + error = EINVAL; + goto out; + } + VATTR_SET(vap, va_flags, VA_VAFILEID); + VATTR_SET(vap, va_fileid, *objidp); + VATTR_SET(vap, va_fsid64, *fsidp); + } + NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, segflg, path, ctx); error = open1at(ctx, ndp, flags, vap, NULL, NULL, retval, fd, AUTH_OPEN_NOAUTHFD); +out: kfree_type(typeof(*__open_data), __open_data); return error; @@ -5232,7 +5416,7 @@ open_nocancel(__unused proc_t p, struct open_nocancel_args *uap, int32_t *retval) { return openat_internal(vfs_context_current(), uap->path, uap->flags, - uap->mode, AT_FDCWD, UIO_USERSPACE, retval); + uap->mode, AT_FDCWD, UIO_USERSPACE, retval, NULL, NULL); } int @@ -5240,7 +5424,7 @@ openat_nocancel(__unused proc_t p, struct openat_nocancel_args *uap, int32_t *retval) { return openat_internal(vfs_context_current(), uap->path, uap->flags, - uap->mode, uap->fd, UIO_USERSPACE, retval); + uap->mode, uap->fd, UIO_USERSPACE, retval, NULL, NULL); } int @@ -5263,6 +5447,8 @@ vfs_context_can_open_by_id(vfs_context_t ctx) OPEN_BY_ID_ENTITLEMENT); } +#define MAX_OPENBYID_NP_RETRIES 10 + /* * openbyid_np: open a file given a file system id and a file system object id * the hfs file system object id is an fsobj_id_t {uint32, uint32} @@ -5291,7 +5477,9 @@ openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval) { fsid_t fsid; uint64_t objid; + int fd; int error; + int retry_count = 0; char *buf = NULL; int buflen = MAXPATHLEN; int pathlen = 0; @@ -5313,6 +5501,13 @@ openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval) AUDIT_ARG(value32, fsid.val[0]); AUDIT_ARG(value64, objid); +retry: + fd = -1; + error = 0; + buf = NULL; + pathlen = 0; + buflen = MAXPATHLEN; + /*resolve path from fsis, objid*/ do { buf = kalloc_data(buflen + 1, Z_WAITOK); @@ -5336,10 +5531,25 @@ openbyid_np(__unused proc_t p, struct openbyid_np_args *uap, int *retval) buf[pathlen] = 0; error = openat_internal( - ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, retval); + ctx, (user_addr_t)buf, uap->oflags, 0, AT_FDCWD, UIO_SYSSPACE, &fd, &objid, &fsid); kfree_data(buf, buflen + 1); + /* Ensure the correct file is opened */ + if (error == ERECYCLE) { + if (retry_count < MAX_OPENBYID_NP_RETRIES) { + retry_count += 1; + goto retry; + } else { + printf("openbyid_np() retry limit due to ERECYCLE reached\n"); + error = ENOENT; + } + } + + if (!error) { + *retval = fd; + } + return error; } @@ -5735,6 +5945,12 @@ retry: vp = dvp = lvp = NULLVP; NDINIT(&nd, LOOKUP, OP_LOOKUP, AUDITVNPATH1 | follow, segflg, path, ctx); + if (flag & AT_SYMLINK_NOFOLLOW_ANY) { + nd.ni_flag |= NAMEI_NOFOLLOW_ANY; + } + if (flag & AT_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } error = nameiat(&nd, fd1); if (error) { @@ -5832,6 +6048,10 @@ retry: (void)mac_vnode_notify_link(ctx, vp, dvp, &nd.ni_cnd); #endif + vnode_lock_spin(vp); + 
vp->v_ext_flag &= ~VE_NOT_HARDLINK; + vnode_unlock(vp); + assert(locked_vp == vp); vnode_link_unlock(locked_vp); locked_vp = NULLVP; @@ -5959,7 +6179,7 @@ link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval) int linkat(__unused proc_t p, struct linkat_args *uap, __unused int32_t *retval) { - if (uap->flag & ~AT_SYMLINK_FOLLOW) { + if (uap->flag & ~(AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) { return EINVAL; } @@ -6197,6 +6417,7 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp, int retry_count = 0; int cn_flags; int nofollow_any = 0; + int resolve_beneath = 0; cn_flags = LOCKPARENT; if (!(unlink_flags & VNODE_REMOVE_NO_AUDIT_PATH)) { @@ -6206,6 +6427,10 @@ unlinkat_internal(vfs_context_t ctx, int fd, vnode_t start_dvp, nofollow_any = NAMEI_NOFOLLOW_ANY; unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY; } + if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) { + resolve_beneath = NAMEI_RESOLVE_BENEATH; + unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH; + } /* If a starting dvp is passed, it trumps any fd passed. */ if (start_dvp) { cn_flags |= USEDVP; @@ -6234,7 +6459,7 @@ retry: NDINIT(ndp, DELETE, OP_UNLINK, cn_flags, segflg, path_arg, ctx); ndp->ni_dvp = start_dvp; - ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any; + ndp->ni_flag |= NAMEI_COMPOUNDREMOVE | nofollow_any | resolve_beneath; cnp = &ndp->ni_cnd; continue_lookup: @@ -6502,18 +6727,25 @@ unlinkat(__unused proc_t p, struct unlinkat_args *uap, __unused int32_t *retval) { int unlink_flags = 0; - if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED)) { + if (uap->flag & ~(AT_REMOVEDIR | AT_REMOVEDIR_DATALESS | AT_SYMLINK_NOFOLLOW_ANY | AT_SYSTEM_DISCARDED | AT_RESOLVE_BENEATH | AT_NODELETEBUSY)) { return EINVAL; } if (uap->flag & AT_SYMLINK_NOFOLLOW_ANY) { unlink_flags |= VNODE_REMOVE_NOFOLLOW_ANY; } + if (uap->flag & AT_RESOLVE_BENEATH) { + unlink_flags |= VNODE_REMOVE_RESOLVE_BENEATH; + } if (uap->flag & AT_SYSTEM_DISCARDED) { unlink_flags |= VNODE_REMOVE_SYSTEM_DISCARDED; } + if (uap->flag & AT_NODELETEBUSY) { + unlink_flags |= VNODE_REMOVE_NODELETEBUSY; + } + if (uap->flag & (AT_REMOVEDIR | AT_REMOVEDIR_DATALESS)) { if (uap->flag & AT_REMOVEDIR_DATALESS) { unlink_flags |= VNODE_REMOVE_DATALESS_DIR; @@ -7027,6 +7259,9 @@ faccessat_internal(vfs_context_t ctx, int fd, user_addr_t path, int amode, if (flag & AT_SYMLINK_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (flag & AT_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } #if NAMEDRSRCFORK /* access(F_OK) calls are allowed for resource forks. 
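Each of the *at() entry points touched above follows the same two-step pattern: reject any flag bit outside the supported mask with EINVAL, then translate the accepted AT_* bits into the corresponding internal VNODE_REMOVE_*/NAMEI_* flags. A small sketch of that pattern with purely hypothetical flag values:

#include <errno.h>
#include <stdio.h>

#define MY_AT_REMOVEDIR		0x01	/* hypothetical user-visible flags */
#define MY_AT_NOFOLLOW_ANY	0x02
#define MY_AT_RESOLVE_BENEATH	0x04
#define MY_AT_SUPPORTED		(MY_AT_REMOVEDIR | MY_AT_NOFOLLOW_ANY | MY_AT_RESOLVE_BENEATH)

#define REM_DIR			0x10	/* hypothetical internal flags */
#define REM_NOFOLLOW_ANY	0x20
#define REM_RESOLVE_BENEATH	0x40

static int
translate_at_flags(unsigned at_flags, unsigned *out)
{
	if (at_flags & ~MY_AT_SUPPORTED) {
		return EINVAL;			/* unknown bit: fail early */
	}

	unsigned internal = 0;
	if (at_flags & MY_AT_REMOVEDIR) {
		internal |= REM_DIR;
	}
	if (at_flags & MY_AT_NOFOLLOW_ANY) {
		internal |= REM_NOFOLLOW_ANY;
	}
	if (at_flags & MY_AT_RESOLVE_BENEATH) {
		internal |= REM_RESOLVE_BENEATH;
	}
	*out = internal;
	return 0;
}

int
main(void)
{
	unsigned internal;

	printf("known flags -> %d\n", translate_at_flags(MY_AT_RESOLVE_BENEATH, &internal));
	printf("unknown flag -> %d\n", translate_at_flags(0x80, &internal));
	return 0;
}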
*/ @@ -7084,7 +7319,7 @@ int faccessat(__unused proc_t p, struct faccessat_args *uap, __unused int32_t *retval) { - if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) { + if (uap->flag & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) { return EINVAL; } @@ -7131,6 +7366,9 @@ fstatat_internal(vfs_context_t ctx, user_addr_t path, user_addr_t ub, if (flag & AT_SYMLINK_NOFOLLOW_ANY) { ndp->ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (flag & AT_RESOLVE_BENEATH) { + ndp->ni_flag |= NAMEI_RESOLVE_BENEATH; + } #if NAMEDRSRCFORK int is_namedstream = 0; @@ -7417,7 +7655,7 @@ lstat64_extended(__unused proc_t p, struct lstat64_extended_args *uap, __unused int fstatat(__unused proc_t p, struct fstatat_args *uap, __unused int32_t *retval) { - if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) { + if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) { return EINVAL; } @@ -7429,7 +7667,7 @@ int fstatat64(__unused proc_t p, struct fstatat64_args *uap, __unused int32_t *retval) { - if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY)) { + if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_REALDEV | AT_FDONLY | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) { return EINVAL; } @@ -7831,6 +8069,9 @@ chmodat(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap, if (flag & AT_SYMLINK_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (flag & AT_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = nameiat(&nd, fd))) { return error; } @@ -7964,7 +8205,7 @@ chmod(__unused proc_t p, struct chmod_args *uap, __unused int32_t *retval) int fchmodat(__unused proc_t p, struct fchmodat_args *uap, __unused int32_t *retval) { - if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY)) { + if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) { return EINVAL; } @@ -8138,6 +8379,9 @@ fchownat_internal(vfs_context_t ctx, int fd, user_addr_t path, uid_t uid, if (flag & AT_SYMLINK_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (flag & AT_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } error = nameiat(&nd, fd); if (error) { @@ -8169,7 +8413,7 @@ lchown(__unused proc_t p, struct lchown_args *uap, __unused int32_t *retval) int fchownat(__unused proc_t p, struct fchownat_args *uap, __unused int32_t *retval) { - if (uap->flag & ~AT_SYMLINK_NOFOLLOW) { + if (uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_SYMLINK_NOFOLLOW_ANY | AT_RESOLVE_BENEATH)) { return EINVAL; } @@ -8838,6 +9082,9 @@ clonefile_internal(vnode_t fvp, boolean_t data_read_authorised, int dst_dirfd, if (flags & CLONE_NOFOLLOW_ANY) { tondp->ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (flags & CLONE_RESOLVE_BENEATH) { + tondp->ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = nameiat(tondp, dst_dirfd))) { kfree_type(struct nameidata, tondp); @@ -9052,7 +9299,7 @@ clonefileat(__unused proc_t p, struct clonefileat_args *uap, /* Check that the flags are valid. 
*/ if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL | - CLONE_NOFOLLOW_ANY)) { + CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) { return EINVAL; } @@ -9066,6 +9313,9 @@ clonefileat(__unused proc_t p, struct clonefileat_args *uap, if (uap->flags & CLONE_NOFOLLOW_ANY) { ndp->ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (uap->flags & CLONE_RESOLVE_BENEATH) { + ndp->ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = nameiat(ndp, uap->src_dirfd))) { kfree_type(struct nameidata, ndp); @@ -9094,7 +9344,7 @@ fclonefileat(__unused proc_t p, struct fclonefileat_args *uap, /* Check that the flags are valid. */ if (uap->flags & ~(CLONE_NOFOLLOW | CLONE_NOOWNERCOPY | CLONE_ACL | - CLONE_NOFOLLOW_ANY)) { + CLONE_NOFOLLOW_ANY | CLONE_RESOLVE_BENEATH)) { return EINVAL; } @@ -9128,11 +9378,11 @@ out: static int rename_submounts_callback(mount_t mp, void *arg) { + char *prefix = (char *)arg; + int prefix_len = (int)strlen(prefix); int error = 0; - mount_t pmp = (mount_t)arg; - int prefix_len = (int)strlen(pmp->mnt_vfsstat.f_mntonname); - if (strncmp(mp->mnt_vfsstat.f_mntonname, pmp->mnt_vfsstat.f_mntonname, prefix_len) != 0) { + if (strncmp(mp->mnt_vfsstat.f_mntonname, prefix, prefix_len) != 0) { return 0; } @@ -9172,11 +9422,12 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, int do_retry; int retry_count; int mntrename; + int dirrename; int need_event; int need_kpath2; int has_listeners; const char *oname = NULL; - char *from_name = NULL, *to_name = NULL; + char *old_dirpath = NULL, *from_name = NULL, *to_name = NULL; char *from_name_no_firmlink = NULL, *to_name_no_firmlink = NULL; int from_len = 0, to_len = 0; int from_len_no_firmlink = 0, to_len_no_firmlink = 0; @@ -9195,6 +9446,7 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, int continuing = 0; vfs_rename_flags_t flags = uflags & VFS_RENAME_FLAGS_MASK; int32_t nofollow_any = 0; + int32_t resolve_beneath = 0; /* carving out a chunk for structs that are too big to be on stack. */ struct { struct nameidata from_node, to_node; @@ -9213,19 +9465,22 @@ retry: fdvp = tdvp = NULL; fvap = tvap = NULL; mnt_fvp = NULLVP; - mntrename = FALSE; + mntrename = dirrename = FALSE; vn_authorize_skipped = FALSE; if (uflags & RENAME_NOFOLLOW_ANY) { nofollow_any = NAMEI_NOFOLLOW_ANY; } + if (uflags & RENAME_RESOLVE_BENEATH) { + resolve_beneath = NAMEI_RESOLVE_BENEATH; + } NDINIT(fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1, segflg, from, ctx); - fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any; + fromnd->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath; NDINIT(tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK, segflg, to, ctx); - tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any; + tond->ni_flag = NAMEI_COMPOUNDRENAME | nofollow_any | resolve_beneath; continue_lookup: if ((fromnd->ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { @@ -9237,6 +9492,9 @@ continue_lookup: if (fvp && fvp->v_type == VDIR) { tond->ni_cnd.cn_flags |= WILLBEDIR; +#if defined(XNU_TARGET_OS_OSX) + dirrename = TRUE; +#endif } } @@ -9595,8 +9853,6 @@ continue_lookup: retry_count += 1; } } - vnode_link_unlock(fvp); - locked_vp = NULLVP; goto out1; } } @@ -9611,6 +9867,38 @@ continue_lookup: oname = fvp->v_name; oparent = fvp->v_parent; + /* + * If renaming a directory, stash its path which we need later when + * updating the 'f_mntonname' of sub mounts. 
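The directory-rename path above stashes the old directory path so rename_submounts_callback() can later rewrite the f_mntonname of every mount sitting underneath it. A string-only sketch of that prefix rewrite, with a fixed-size buffer standing in for f_mntonname and a boundary check so /foo does not also match /foobar:

#include <stdio.h>
#include <string.h>

#define MNTNAMELEN	1024	/* stand-in for the MAXPATHLEN-sized f_mntonname */

static int
rename_submount(char mntonname[MNTNAMELEN], const char *oldpfx, const char *newpfx)
{
	size_t oldlen = strlen(oldpfx);

	if (strncmp(mntonname, oldpfx, oldlen) != 0 ||
	    (mntonname[oldlen] != '/' && mntonname[oldlen] != '\0')) {
		return 0;			/* not mounted under the renamed directory */
	}

	char tail[MNTNAMELEN];
	strlcpy(tail, mntonname + oldlen, sizeof(tail));
	snprintf(mntonname, MNTNAMELEN, "%s%s", newpfx, tail);
	return 1;
}

int
main(void)
{
	char mnt[MNTNAMELEN] = "/Volumes/Data/olddir/sub/mnt";

	if (rename_submount(mnt, "/Volumes/Data/olddir", "/Volumes/Data/newdir")) {
		printf("updated mount-on name: %s\n", mnt);
	}
	return 0;
}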
+ */ + if (dirrename) { + int pathlen = MAXPATHLEN; + + old_dirpath = zalloc(ZV_NAMEI); + error = vn_getpath_fsenter(fvp, old_dirpath, &pathlen); + if (error) { + /* + * Process that supports long path (opt-in to IO policy + * IOPOL_TYPE_VFS_SUPPORT_LONG_PATHS) can have directory with path + * length up to MAXLONGPATHLEN (8192). Since max path length in + * mount's 'f_mntonname' is MAXPATHLEN (1024), this means the + * directory can't be the parent of the sub mounts so we can just + * silently drop the error and skip the check to update the + * 'f_mntonname' of sub mounts. + */ + if (error == ENOSPC) { + dirrename = false; + error = 0; + if (old_dirpath) { + zfree(ZV_NAMEI, old_dirpath); + old_dirpath = NULL; + } + } else { + goto out1; + } + } + } + skipped_lookup: #if CONFIG_FILE_LEASES /* Lease break needed for source's parent dir? */ @@ -9813,7 +10101,8 @@ skipped_lookup: } /* Update f_mntonname of sub mounts */ - vfs_iterate(0, rename_submounts_callback, (void *)mp); + vfs_iterate(0, rename_submounts_callback, + (void *)mp->mnt_vfsstat.f_mntonname); /* append name to prefix */ maxlen = MAXPATHLEN - (int)(pathend - mp->mnt_vfsstat.f_mntonname); @@ -9826,7 +10115,15 @@ skipped_lookup: vfs_unbusy(mp); vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL); + } else if (dirrename) { + /* + * If we renamed a directory, we need to check if there is any sub + * mount(s) mounted under the directory. If so, then we need to update + * the sub mount's f_mntonname path. + */ + vfs_iterate(0, rename_submounts_callback, (void *)old_dirpath); } + /* * fix up name & parent pointers. note that we first * check that fvp has the same name/parent pointers it @@ -9862,6 +10159,11 @@ out1: } } } + if (locked_vp) { + assert(locked_vp == fvp); + vnode_link_unlock(locked_vp); + locked_vp = NULLVP; + } if (to_name != NULL) { RELEASE_PATH(to_name); to_name = NULL; @@ -9878,6 +10180,10 @@ out1: RELEASE_PATH(from_name_no_firmlink); from_name_no_firmlink = NULL; } + if (old_dirpath != NULL) { + zfree(ZV_NAMEI, old_dirpath); + old_dirpath = NULL; + } if (holding_mntlock) { mount_unlock_renames(locked_mp); mount_drop(locked_mp, 0); @@ -9933,7 +10239,7 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) int renameatx_np(__unused proc_t p, struct renameatx_np_args *uap, __unused int32_t *retval) { - if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY)) { + if (uap->flags & ~(RENAME_SECLUDE | RENAME_EXCL | RENAME_SWAP | RENAME_NOFOLLOW_ANY | RENAME_RESOLVE_BENEATH)) { return EINVAL; } @@ -10174,6 +10480,7 @@ rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath, int restart_flag; int nofollow_any = 0; + int resolve_beneath = 0; __rmdir_data = kalloc_type(typeof(*__rmdir_data), Z_WAITOK); ndp = &__rmdir_data->nd; @@ -10182,6 +10489,10 @@ rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath, nofollow_any = NAMEI_NOFOLLOW_ANY; unlink_flags &= ~VNODE_REMOVE_NOFOLLOW_ANY; } + if (unlink_flags & VNODE_REMOVE_RESOLVE_BENEATH) { + resolve_beneath = NAMEI_RESOLVE_BENEATH; + unlink_flags &= ~VNODE_REMOVE_RESOLVE_BENEATH; + } /* * This loop exists to restart rmdir in the unlikely case that two @@ -10191,7 +10502,7 @@ rmdirat_internal(vfs_context_t ctx, int fd, user_addr_t dirpath, do { NDINIT(ndp, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1, segflg, dirpath, ctx); - ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any; + ndp->ni_flag = NAMEI_COMPOUNDRMDIR | nofollow_any | resolve_beneath; continue_lookup: restart_flag = 0; vap = NULL; @@ -13099,7 +13410,8 @@ 
unlock: case FSIOC_EXCLAVE_FS_GET_BASE_DIRS: { exclave_fs_get_base_dirs_t *get_base_dirs = ((exclave_fs_get_base_dirs_t *)data); exclave_fs_base_dir_t *dirs = NULL; - if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT)) { + if (!IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_REGISTER_ENTITLEMENT) && + !IOTaskHasEntitlement(vfs_context_task(ctx), EXCLAVE_FS_LIST_ENTITLEMENT)) { error = EPERM; break; } @@ -13333,6 +13645,9 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) if (uap->options & XATTR_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (uap->options & XATTR_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = namei(&nd))) { return error; @@ -13413,7 +13728,7 @@ fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval) UIO_STACKBUF(uio_buf, 1); if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT | - XATTR_NOFOLLOW_ANY)) { + XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) { return EINVAL; } @@ -13516,6 +13831,9 @@ setxattr(proc_t p, struct setxattr_args *uap, int *retval) if (uap->options & XATTR_NOFOLLOW_ANY) { sactx->nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (uap->options & XATTR_RESOLVE_BENEATH) { + sactx->nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = namei(&sactx->nd))) { goto out; @@ -13562,7 +13880,7 @@ fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval) UIO_STACKBUF(uio_buf, 1); if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT | - XATTR_NOFOLLOW_ANY)) { + XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) { return EINVAL; } @@ -13650,6 +13968,9 @@ removexattr(proc_t p, struct removexattr_args *uap, int *retval) if (uap->options & XATTR_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (uap->options & XATTR_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = namei(&nd))) { return error; @@ -13690,7 +14011,7 @@ fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval) #endif if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT | - XATTR_NOFOLLOW_ANY)) { + XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) { return EINVAL; } @@ -13753,6 +14074,9 @@ listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval) if (uap->options & XATTR_NOFOLLOW_ANY) { nd.ni_flag |= NAMEI_NOFOLLOW_ANY; } + if (uap->options & XATTR_RESOLVE_BENEATH) { + nd.ni_flag |= NAMEI_RESOLVE_BENEATH; + } if ((error = namei(&nd))) { return error; @@ -13791,7 +14115,7 @@ flistxattr(proc_t p, struct flistxattr_args *uap, user_ssize_t *retval) UIO_STACKBUF(uio_buf, 1); if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT | - XATTR_NOFOLLOW_ANY)) { + XATTR_NOFOLLOW_ANY | XATTR_RESOLVE_BENEATH)) { return EINVAL; } @@ -13825,7 +14149,6 @@ fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid, vm_size_t bufsize, caddr_t buf, uint32_t options, int *pathlen) { int error; - struct mount *mp = NULL; vnode_t vp; int length; int bpflags; @@ -13841,58 +14164,7 @@ fsgetpath_internal(vfs_context_t ctx, int volfs_id, uint64_t objid, } retry: - if ((mp = mount_lookupby_volfsid(volfs_id, 1)) == NULL) { - error = ENOTSUP; /* unexpected failure */ - return ENOTSUP; - } - -#if CONFIG_UNION_MOUNTS -unionget: -#endif /* CONFIG_UNION_MOUNTS */ - if (objid == 2) { - struct vfs_attr vfsattr; - int use_vfs_root = TRUE; - - VFSATTR_INIT(&vfsattr); - VFSATTR_WANTED(&vfsattr, f_capabilities); - if (!(options & FSOPT_ISREALFSID) && - vfs_getattr(mp, &vfsattr, vfs_context_kernel()) == 0 && - 
VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) { - if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS) && - (vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_VOL_GROUPS)) { - use_vfs_root = FALSE; - } - } - - if (use_vfs_root) { - error = VFS_ROOT(mp, &vp, ctx); - } else { - error = VFS_VGET(mp, objid, &vp, ctx); - } - } else { - error = VFS_VGET(mp, (ino64_t)objid, &vp, ctx); - } - -#if CONFIG_UNION_MOUNTS - if (error == ENOENT && (mp->mnt_flag & MNT_UNION)) { - /* - * If the fileid isn't found and we're in a union - * mount volume, then see if the fileid is in the - * mounted-on volume. - */ - struct mount *tmp = mp; - mp = vnode_mount(tmp->mnt_vnodecovered); - vfs_unbusy(tmp); - if (vfs_busy(mp, LK_NOWAIT) == 0) { - goto unionget; - } - } else { - vfs_unbusy(mp); - } -#else - vfs_unbusy(mp); -#endif /* CONFIG_UNION_MOUNTS */ - + error = vnode_getfromid(volfs_id, objid, ctx, options & FSOPT_ISREALFSID, &vp); if (error) { return error; } @@ -14463,13 +14735,19 @@ out: * made. */ static int __attribute__((noinline)) -snapshot_create(int dirfd, user_addr_t name, __unused uint32_t flags, +snapshot_create(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) { vnode_t rvp, snapdvp; int error; struct nameidata *ndp; + /* No flags are currently defined */ + if (flags) { + printf("snapshot_create: Invalid flags passed 0x%x\n", flags); + return EINVAL; + } + ndp = kalloc_type(struct nameidata, Z_WAITOK); error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, CREATE, @@ -14516,13 +14794,19 @@ out: * delete the snapshot. */ static int __attribute__((noinline)) -snapshot_delete(int dirfd, user_addr_t name, __unused uint32_t flags, +snapshot_delete(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) { vnode_t rvp, snapdvp; int error; struct nameidata *ndp; + /* No flags are currently defined */ + if (flags) { + printf("snapshot_delete: Invalid flags passed 0x%x\n", flags); + return EINVAL; + } + ndp = kalloc_type(struct nameidata, Z_WAITOK); error = vnode_get_snapshot(dirfd, &rvp, &snapdvp, name, ndp, DELETE, @@ -14550,7 +14834,7 @@ out: * Marks the filesystem to revert to the given snapshot on next mount. 
*/ static int __attribute__((noinline)) -snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags, +snapshot_revert(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) { int error; @@ -14561,6 +14845,12 @@ snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags, caddr_t name_buf; size_t name_len; + /* No flags are currently defined */ + if (flags) { + printf("snapshot_revert: Invalid flags passed 0x%x\n", flags); + return EINVAL; + } + error = vnode_getfromfd(ctx, dirfd, &rvp); if (error) { return error; @@ -14643,7 +14933,7 @@ snapshot_revert(int dirfd, user_addr_t name, __unused uint32_t flags, */ static int __attribute__((noinline)) snapshot_rename(int dirfd, user_addr_t old, user_addr_t new, - __unused uint32_t flags, vfs_context_t ctx) + uint32_t flags, vfs_context_t ctx) { vnode_t rvp, snapdvp; int error, i; @@ -14657,6 +14947,12 @@ snapshot_rename(int dirfd, user_addr_t old, user_addr_t new, struct nameidata to_node; } * __rename_data; + /* No flags are currently defined */ + if (flags) { + printf("snapshot_rename: Invalid flags passed 0x%x\n", flags); + return EINVAL; + } + __rename_data = kalloc_type(typeof(*__rename_data), Z_WAITOK); fromnd = &__rename_data->from_node; tond = &__rename_data->to_node; @@ -14744,7 +15040,7 @@ out: */ static int __attribute__((noinline)) snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory, - __unused user_addr_t mnt_data, __unused uint32_t flags, vfs_context_t ctx) + __unused user_addr_t mnt_data, uint32_t flags, vfs_context_t ctx) { mount_t mp; vnode_t rvp, snapdvp, snapvp, vp, pvp; @@ -14757,6 +15053,12 @@ snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory, struct nameidata dirnd; } * __snapshot_mount_data; + /* Check for invalid flags */ + if (flags & ~SNAPSHOT_MNT_VALIDMASK) { + printf("snapshot_mount: Invalid flags passed 0x%x\n", flags); + return EINVAL; + } + __snapshot_mount_data = kalloc_type(typeof(*__snapshot_mount_data), Z_WAITOK); snapndp = &__snapshot_mount_data->snapnd; dirndp = &__snapshot_mount_data->dirnd; @@ -14774,6 +15076,9 @@ snapshot_mount(int dirfd, user_addr_t name, user_addr_t directory, } /* Convert snapshot_mount flags to mount flags */ + if (flags & SNAPSHOT_MNT_NOEXEC) { + mount_flags |= MNT_NOEXEC; + } if (flags & SNAPSHOT_MNT_NOSUID) { mount_flags |= MNT_NOSUID; } @@ -14845,7 +15150,7 @@ out: * Marks the filesystem to root from the given snapshot on next boot. */ static int __attribute__((noinline)) -snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags, +snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) { int error; @@ -14856,6 +15161,12 @@ snapshot_root(int dirfd, user_addr_t name, __unused uint32_t flags, caddr_t name_buf; size_t name_len; + /* No flags are currently defined */ + if (flags) { + printf("snapshot_root: Invalid flags passed 0x%x\n", flags); + return EINVAL; + } + error = vnode_getfromfd(ctx, dirfd, &rvp); if (error) { return error; diff --git a/bsd/vfs/vfs_unicode.c b/bsd/vfs/vfs_unicode.c index 8166556a2..c91f2fd02 100644 --- a/bsd/vfs/vfs_unicode.c +++ b/bsd/vfs/vfs_unicode.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2016-2023 Apple, Inc. All rights reserved. + * Copyright (C) 2016-2025 Apple, Inc. All rights reserved. * Some portions covered by other copyrights, listed below. *--- * Copyright (C) 2016 and later: Unicode, Inc. and others. 
@@ -50,8 +50,8 @@ int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerC void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]) { - version[0] = 15; - version[1] = 1; + version[0] = 16; + version[1] = 0; version[2] = 0; version[3] = 0; return; diff --git a/bsd/vfs/vfs_unicode_data.h b/bsd/vfs/vfs_unicode_data.h index 03bb1cccf..efdcdb83f 100644 --- a/bsd/vfs/vfs_unicode_data.h +++ b/bsd/vfs/vfs_unicode_data.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Apple, Inc. All rights reserved. + * Copyright (c) 2016-2025 Apple, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -197,14 +197,14 @@ enum { /* Start generated data. */ /* hiCount: 806, nfTrieHi size 1612 */ -/* midCount: 147, nfTrieMid size 4704 */ -/* loCount: 437, nfTrieLo size 13984 */ -/* u16InvMasksIndex: 135, nfU16InvMasks size 270 */ +/* midCount: 153, nfTrieMid size 4896 */ +/* loCount: 448, nfTrieLo size 14336 */ +/* u16InvMasksIndex: 139, nfU16InvMasks size 278 */ /* u16Seq2Index: 773, nfU16Seq2 size 3092 */ /* u16Seq3Index: 222, nfU16Seq3 size 1332 */ /* u16SeqMiscOffset: 198, nfU16SeqMisc size 396 */ -/* u32CharIndex: 927, nfU32Char size 3708 */ -/* u32SeqMiscOffset: 87, nfU32SeqMisc size 348 */ +/* u32CharIndex: 954, nfU32Char size 3816 */ +/* u32SeqMiscOffset: 151, nfU32SeqMisc size 604 */ /* basicCFCount: 1280, nfBasicCF size 2560 */ static uint16_t nfTrieHi[806] = { @@ -228,19 +228,19 @@ static uint16_t nfTrieHi[806] = { /* 0x100:0x10000 */ 0xC039, 0xC03A, 0xC03B, 0xC03C, 0xC03D, 0xC03E, 0x0000, 0xC03F, 0xC040, 0xC041, 0xC042, 0xC043, 0xC044, 0xC045, 0xC046, 0xC047, /* 0x110:0x11000 */ 0xC048, 0xC049, 0xC04A, 0xC04B, 0xC04C, 0xC04D, 0xC04E, 0xC04F, 0xC050, 0xC051, 0xC052, 0xC053, 0xC054, 0xC055, 0xC056, 0xC057, /* 0x120:0x12000 */ 0x0000, 0x0000, 0x0000, 0xC058, 0xC059, 0xC05A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC05B, - /* 0x130:0x13000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0xC05C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, - /* 0x140:0x14000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xC05D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + /* 0x130:0x13000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0xC05C, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x140:0x14000 */ 0x0000, 0x0000, 0x0000, 0xC05D, 0x0000, 0x0000, 0xC05E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, /* 0x150:0x15000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, - /* 0x160:0x16000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xC05E, 0xC05F, 0xFFFF, 0xFFFF, 0xC060, 0xC061, + /* 0x160:0x16000 */ 0xFFFF, 0xC05F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xC060, 0xC061, 0xFFFF, 0xC062, 0xC063, 0xC064, /* 0x170:0x17000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0x180:0x18000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC062, 0x0000, 0x0000, 0x0000, 0x0000, 0xC063, 0xC064, 0xFFFF, 0xFFFF, + /* 0x180:0x18000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC065, 0x0000, 0x0000, 0x0000, 0x0000, 0xC066, 0xC067, 0xFFFF, 0xFFFF, /* 0x190:0x19000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, - /* 0x1A0:0x1A000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC065, - /* 0x1B0:0x1B000 */ 0x0000, 0xC066, 0xC067, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC068, 0xFFFF, 0xFFFF, 0xFFFF, - /* 0x1C0:0x1C000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC069, - /* 0x1D0:0x1D000 */ 0xC06A, 0xC06B, 0xC06C, 0xC06D, 0xC06E, 0xC06F, 0xC070, 0xC071, 0x0000, 0x0000, 0xC072, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC073, - /* 0x1E0:0x1E000 */ 0xC074, 0xC075, 0xC076, 0xFFFF, 0xC077, 0xFFFF, 0xFFFF, 0xC078, 0xC079, 0xC07A, 0xFFFF, 0xFFFF, 0xC07B, 0xC07C, 0xC07D, 0xFFFF, - /* 0x1F0:0x1F000 */ 0xC07E, 0xC07F, 0xC080, 0x0000, 0x0000, 0x0000, 0xC081, 0xC082, 0xC083, 0x0000, 0xC084, 0xC085, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + /* 0x1A0:0x1A000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC068, + /* 0x1B0:0x1B000 */ 0x0000, 0xC069, 0xC06A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC06B, 0xFFFF, 0xFFFF, 0xFFFF, + /* 0x1C0:0x1C000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC06C, 0x0000, 0xC06D, 0xC06E, + /* 0x1D0:0x1D000 */ 0xC06F, 0xC070, 0xC071, 0xC072, 0xC073, 0xC074, 0xC075, 0xC076, 0x0000, 0x0000, 0xC077, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC078, + /* 0x1E0:0x1E000 */ 0xC079, 0xC07A, 0xC07B, 0xFFFF, 0xC07C, 0xC07D, 0xFFFF, 0xC07E, 0xC07F, 0xC080, 0xFFFF, 0xFFFF, 0xC081, 0xC082, 0xC083, 0xFFFF, + /* 0x1F0:0x1F000 */ 0xC084, 0xC085, 0xC086, 0x0000, 0x0000, 0x0000, 0xC087, 0xC088, 0xC089, 0x0000, 0xC08A, 0xC08B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, /* 0x200:0x20000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0x210:0x21000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0x220:0x22000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, @@ -251,17 +251,17 @@ static uint16_t nfTrieHi[806] = { /* 0x270:0x27000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0x280:0x28000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0x290:0x29000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0x2A0:0x2A000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC086, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0x2B0:0x2B000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC087, 0xC088, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0x2C0:0x2C000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC089, 0x0000, + /* 0x2A0:0x2A000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC08C, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x2B0:0x2B000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC08D, 0xC08E, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x2C0:0x2C000 */ 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC08F, 0x0000, /* 0x2D0:0x2D000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0x2E0:0x2E000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC08A, 0x0000, 0x0000, 0xC08B, 0xFFFF, - /* 0x2F0:0x2F000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC08C, 0xC08D, 0xC08E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + /* 0x2E0:0x2E000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC090, 0x0000, 0x0000, 0xC091, 0xFFFF, + /* 0x2F0:0x2F000 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC092, 0xC093, 0xC094, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, /* 0x300:0x30000 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0x310:0x31000 */ 0x0000, 0x0000, 0x0000, 0xC08F, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - /* 0x320:0x32000 */ 0x0000, 0x0000, 0x0000, 0xC090, 0xC091, 0xC092 + /* 0x310:0x31000 */ 0x0000, 0x0000, 0x0000, 0xC095, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + /* 0x320:0x32000 */ 0x0000, 0x0000, 0x0000, 0xC096, 0xC097, 0xC098 }; -static uint16_t nfTrieMid[147][16] = { +static uint16_t nfTrieMid[153][16] = { /* 0x000 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC000, 0xC001, 0xC002, 0xC003 }, /* 0x001 */ { 0xC004, 0xC005, 0xC006, 0xC007, 0xC008, 0xC009, 0xC00A, 0xC00B, 0x0000, 0x0000, 0xC00C, 0xC00D, 0xC00E, 0xC00F, 0xC010, 0xC011 }, /* 0x002 */ { 0xC012, 0xC013, 0xC014, 0xC015, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, @@ -295,7 +295,7 @@ static uint16_t nfTrieMid[147][16] = { /* 0x01E */ { 0x0000, 0x0000, 0xC0C1, 0xC0C2, 0x0000, 0x0000, 0xC0C3, 0x0000, 0xC0C4, 0xC0C5, 0xC0C6, 0x0000, 0xC0C7, 0x0000, 0x0000, 0x0000 }, /* 0x01F */ { 0xC0C8, 0x0000, 0xC0C9, 0x0000, 0xC0CA, 0x0000, 0xC0CB, 0xC0CC, 0xC0CD, 0x0000, 0xC0CE, 0x0000, 0x0000, 0x0000, 0xC0CF, 0x0000 }, /* 0x020 */ { 0x0000, 0x0000, 0xC0D0, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x021 */ { 0x0000, 0x0000, 0xAE0C, 0xFFFF, 0xAE05, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC0D1, 0xC0D2, 0x0000, 0x0000, 0x0000 }, + /* 0x021 */ { 0x0000, 0x0000, 0xAE2C, 0xFFFF, 0xAE05, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC0D1, 0xC0D2, 0x0000, 0x0000, 0x0000 }, /* 0x022 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xC0D3, 0x0000, 0x0000 }, /* 0x023 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE09, 0x0000, 0xAE37, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x024 */ { 0xC0D4, 0xC0D5, 0xC0D6, 0x0000, 0x0000, 0x0000, 0xC0D7, 0xC0D8, 0xC0D9, 0xC0DA, 0xC0DB, 0xC0DC, 0xC0DD, 0xC0DE, 0xC0DF, 0xC0E0 }, @@ -324,93 +324,99 @@ static uint16_t nfTrieMid[147][16] = { /* 0x03B */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xAE2B, 0x0000, 0x0000, 0x0000, 0xAE0F, 0xC13A, 0xAE22 }, /* 0x03C */ { 0x0000, 0x0000, 0xAE4F, 0x0000, 0xAE05, 0x0000, 0x0000, 0xC13B, 0x0000, 0xAE02, 0x0000, 0x0000, 0xAE50, 0xAE30, 0xFFFF, 0xFFFF }, /* 0x03D */ { 0xC13C, 0xC13D, 0xC13E, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE32, 0xAE2C, 0xC13F, 0xC140, 0xC141, 0x0000, 0xAE22 }, - /* 0x03E */ { 0x0000, 0x0000, 0xAE12, 0x0000, 0x0000, 0x0000, 0xAE3D, 0xC142, 0xC143, 0xC144, 0xAE0E, 0xAE51, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x03F */ { 0x0000, 0x0000, 0x0000, 0xAE0C, 0x0000, 0xAE30, 0xAE12, 0xFFFF, 0xAE37, 0x0000, 0x0000, 0xAE52, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x040 */ { 0xAE53, 0x0000, 0x0000, 0xAE54, 0x0000, 0xAE37, 0x0000, 0x0000, 0x0000, 0xAE04, 0xAE19, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xAE55 }, - /* 0x041 */ { 0x0000, 0xAE56, 0x0000, 0xAE57, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xAE58, 0x0000, 0xAE44, 0x0000, 0x0000 }, - /* 0x042 */ { 0xC145, 0xAE59, 0x0000, 0xC146, 0xAE2D, 0xAE2D, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xC147, 0xAE0C }, - /* 0x043 */ { 0x0000, 0x0000, 0x0000, 0xAE5A, 0x0000, 0xAE5B, 0x0000, 0xAE5C, 0x0000, 0xAE5D, 0xAE5E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x044 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2D, 0xFFFF, 0xFFFF, 0xFFFF, 0xC148, 0xC149, 0xC14A, 0xC14B, 0x0000, 0x0000, 0x0000, 0xAE5F }, - /* 0x045 */ { 0x0000, 0x0000, 0xC14C, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x046 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xAE04, 0x0000, 0x0000, 0xC14D, 0xAE03, 0xFFFF, 0xFFFF, 0xFFFF, 0xC14E }, - /* 0x047 */ { 0x0000, 0x0000, 0xAE12, 0x0000, 0xC14F, 0xC150, 0xFFFF, 0x0000, 0xC151, 0xFFFF, 0xFFFF, 0x0000, 0xAE22, 0xFFFF, 0x0000, 0xAE0C }, - /* 0x048 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xC152, 0xAE44, 0x0000, 0xC153, 0x0000, 0xC154, 0xC155, 0xC156, 0xAE60, 0x0000, 0xAE2D, 0xAE2C }, - /* 0x049 */ { 0xC157, 0x0000, 0xC158, 0xC159, 0xAE12, 0x0000, 0x0000, 0xC15A, 0x0000, 0x0000, 0x0000, 0x0000, 0xC15B, 0x0000, 0xAE21, 0xAE01 }, - /* 0x04A */ { 0x0000, 0xAE0E, 0x0000, 0xC15C, 0xAE03, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE61, 0xAE02, 0xAE2C, 0x0000, 0x0000, 0x0000, 0xC15D, 0xAE2C }, - /* 0x04B */ { 0xAE06, 0xAE07, 0xAE08, 0xC15E, 0xC15F, 0xAE62, 0xC160, 0xC161, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x04C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xC162, 0xC163, 0xAE03, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xC164, 0xC165, 0xAE2C, 0xFFFF, 0xFFFF }, - /* 0x04D */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xC166, 0xC167, 0xAE32, 0xFFFF, 0xFFFF }, - /* 0x04E */ { 0x0000, 0x0000, 0x0000, 0xC168, 0xAE01, 0xAE2C, 0xAE2B, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xC169, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x04F */ { 0x0000, 0xAE63, 0xC16A, 0x0000, 0xAE0C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x050 */ { 0x0000, 0x0000, 0x0000, 0xC16B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC16C, 0xC16D, 0x0000, 0x0000, 0x0000, 0xAE64 }, - /* 0x051 */ { 0xAE65, 0xAE66, 0x0000, 0xC16E, 0xC16F, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE67, 0x0000, 0x0000, 0xAE67, 0xC170, 0xFFFF }, - /* 0x052 */ { 0x0000, 0x0000, 0x0000, 0xC171, 0xC172, 0x0000, 0x0000, 0x0000, 0x0000, 0xC173, 0xAE42, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2D }, - /* 0x053 */ { 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x054 */ { 0xAE08, 0x0000, 0x0000, 0xC174, 0xAE30, 0x0000, 0xAE2B, 0x0000, 0x0000, 0xAE44, 0xAE68, 0xAE0C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x055 */ { 0xAE69, 0x0000, 0x0000, 0xAE6A, 0xC175, 0xAE2C, 0xAE6B, 0x0000, 0xAE04, 
0xC176, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x03E */ { 0x0000, 0x0000, 0xAE12, 0x0000, 0x0000, 0x0000, 0xAE51, 0xC142, 0xC143, 0xC144, 0xAE0E, 0xAE52, 0xC145, 0x0000, 0xC146, 0xAE2E }, + /* 0x03F */ { 0x0000, 0x0000, 0x0000, 0xAE0C, 0x0000, 0xAE30, 0xAE12, 0xFFFF, 0xAE37, 0x0000, 0x0000, 0xAE53, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x040 */ { 0xAE54, 0x0000, 0x0000, 0xAE55, 0x0000, 0xAE37, 0x0000, 0x0000, 0x0000, 0xAE04, 0xAE19, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xAE56 }, + /* 0x041 */ { 0x0000, 0xAE57, 0x0000, 0xAE58, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xAE59, 0x0000, 0xAE44, 0x0000, 0x0000 }, + /* 0x042 */ { 0xC147, 0xAE5A, 0x0000, 0xC148, 0xAE2D, 0xAE2D, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xC149, 0xAE0C }, + /* 0x043 */ { 0x0000, 0x0000, 0x0000, 0xAE5B, 0x0000, 0xAE5C, 0x0000, 0xAE5D, 0x0000, 0xAE5E, 0xAE5F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x044 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2D, 0xFFFF, 0xFFFF, 0xFFFF, 0xC14A, 0xC14B, 0xC14C, 0xC14D, 0x0000, 0x0000, 0x0000, 0xAE60 }, + /* 0x045 */ { 0x0000, 0x0000, 0xC14E, 0xAE2C, 0x0000, 0xC14F, 0xC150, 0x0000, 0xAE61, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x046 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xAE04, 0x0000, 0x0000, 0xC151, 0xAE03, 0xAE20, 0xFFFF, 0xFFFF, 0xC152 }, + /* 0x047 */ { 0x0000, 0x0000, 0xAE12, 0x0000, 0xC153, 0xC154, 0xFFFF, 0x0000, 0xC155, 0xFFFF, 0xFFFF, 0x0000, 0xAE22, 0xFFFF, 0x0000, 0xAE0C }, + /* 0x048 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xC156, 0xAE44, 0x0000, 0xC157, 0x0000, 0xC158, 0xC159, 0xC15A, 0xAE62, 0x0000, 0xAE2D, 0xAE2C }, + /* 0x049 */ { 0xC15B, 0x0000, 0xC15C, 0xC15D, 0xAE12, 0x0000, 0x0000, 0xC15E, 0x0000, 0x0000, 0x0000, 0x0000, 0xC15F, 0x0000, 0xAE21, 0xAE01 }, + /* 0x04A */ { 0x0000, 0xAE0E, 0x0000, 0xC160, 0xAE03, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE63, 0xAE02, 0xAE2C, 0x0000, 0x0000, 0x0000, 0xC161, 0xAE2C }, + /* 0x04B */ { 0xAE06, 0xAE07, 0xAE08, 0xC162, 0xC163, 0xAE64, 0xC164, 0xC165, 0xC166, 0xC167, 0x0000, 0xAE37, 0xC168, 0xC169, 0xAE65, 0xFFFF }, + /* 0x04C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xC16A, 0xC16B, 0xAE03, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xC16C, 0xC16D, 0xAE2C, 0xFFFF, 0xFFFF }, + /* 0x04D */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xC16E, 0xC16F, 0xAE32, 0xFFFF, 0xFFFF }, + /* 0x04E */ { 0x0000, 0x0000, 0x0000, 0xC170, 0xAE01, 0xAE2C, 0xAE2B, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xC171, 0xAE2C, 0x0000, 0xAE2E, 0xFFFF }, + /* 0x04F */ { 0x0000, 0xAE66, 0xC172, 0x0000, 0xAE0C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x050 */ { 0x0000, 0x0000, 0x0000, 0xC173, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC174, 0xC175, 0x0000, 0x0000, 0x0000, 0xAE67 }, + /* 0x051 */ { 0xAE68, 0xAE69, 0x0000, 0xC176, 0xC177, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE6A, 0x0000, 0x0000, 0xAE6A, 0xC178, 0xFFFF }, + /* 0x052 */ { 0x0000, 0x0000, 0x0000, 0xC179, 0xC17A, 0x0000, 0x0000, 0x0000, 0x0000, 0xC17B, 0xAE42, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2D }, + /* 0x053 */ { 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xAE03, 0xAE2C }, + /* 0x054 */ { 0xAE08, 0x0000, 0x0000, 0xC17C, 0xAE30, 0x0000, 0xAE2B, 0x0000, 0x0000, 0xAE44, 0xAE6B, 0xAE0C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x055 */ { 0xAE6C, 0x0000, 0x0000, 0xAE6D, 0xC17D, 0xAE2C, 0xAE6E, 0x0000, 0xAE04, 0xC17E, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF }, /* 0x056 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xAE2D }, - /* 0x057 */ { 0x0000, 0xAE18, 0x0000, 0xAE33, 0xC177, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE0F, 0x0000, 0x0000, 0x0000, 0xAE6C }, + /* 0x057 */ { 0x0000, 0xAE18, 0x0000, 0xAE33, 0xC17F, 0xAE05, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE0F, 0x0000, 0x0000, 0x0000, 0xAE6F }, /* 0x058 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, /* 0x059 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE04, 0xAE01, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x05A */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, /* 0x05B */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE42 }, - /* 0x05C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE30, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x05D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE0C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x05E */ { 0x0000, 0x0000, 0x0000, 0xAE2D, 0x0000, 0xAE04, 0xAE15, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE04, 0xAE2C, 0x0000, 0xAE32, 0xC178 }, - /* 0x05F */ { 0x0000, 0x0000, 0x0000, 0xC179, 0xAE30, 0xAE3B, 0xAE0E, 0xAE6D, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x060 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC17A, 0xC17B, 0x0000, 0x0000, 0x0000, 0xAE05, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x061 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE00, 0x0000, 0x0000, 0x0000, 0xAE39, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE01, 0xC17C }, - /* 0x062 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE12 }, - /* 0x063 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE30, 0xFFFF, 0xFFFF }, - /* 0x064 */ { 0xAE2D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x065 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE6E }, - /* 0x066 */ { 0x0000, 0x0000, 0xAE42, 0xAE6F, 0xFFFF, 0xAE70, 0xAE71, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x067 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE22 }, - /* 0x068 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE05, 0xAE2B, 0xAE2D, 0xC17D, 0xAE2E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x069 */ { 0x0000, 0x0000, 0xAE32, 0x0000, 0xAE0C, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2E, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x06A */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE30 }, - /* 0x06B */ { 0x0000, 0x0000, 0xAE72, 0x0000, 0x0000, 0xC17E, 0xC17F, 0xC180, 0xC181, 0x0000, 0xC182, 0xC183, 0xC184, 0x0000, 0xAE05, 0xFFFF }, - /* 0x06C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xC185, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xAE2E, 0x0000, 0xAE2E }, - /* 0x06D */ { 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0xAE0C, 0x0000, 0xAE2D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x06E */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE35, 0x0000, 0x0000, 0x0000, 0xAE17, 0xAE73, 0xAE74, 0xAE75, 0x0000, 0x0000, 0x0000 }, - /* 0x06F */ { 0xAE76, 0xAE77, 0x0000, 0xAE78, 0xAE79, 0xAE18, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x070 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE5B, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x071 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE7A, 0x0000, 0x0000, 0x0000 }, - /* 0x072 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE22, 0xAE3E, 0xAE21, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x073 */ { 0x0000, 0xAE04, 0xAE7B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x074 */ { 0xC186, 0xC187, 0xC188, 0x0000, 0x0000, 0x0000, 0xAE32, 0xFFFF, 0xC189, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x075 */ { 0x0000, 0x0000, 0xAE2B, 0xC18A, 0xAE15, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x076 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xC18B, 0xFFFF, 0x0000, 0x0000, 0xC18C, 0xAE57 }, - /* 0x077 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xC18D, 0xAE2C }, - /* 0x078 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE7C, 0xAE04 }, - /* 0x079 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE7D, 0xC18E, 0xFFFF, 0xFFFF }, - /* 0x07A */ { 0xC18F, 0xC190, 0xC191, 0x0000, 0xC192, 0xAE15, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x07B */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE21, 0x0000, 0x0000, 0x0000, 0xAE01, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x07C */ { 0xAE21, 0x0000, 0x0000, 0xAE32, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x07D */ { 0xAE75, 0x0000, 0xAE7E, 0xAE7F, 0xAE80, 0xAE81, 0xAE82, 0xAE83, 0xAE3B, 0xAE22, 0xAE84, 0xAE22, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE03 }, - /* 0x07E */ { 0x0000, 0x0000, 0xAE22, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2E, 0xAE04, 0xAE21, 0xAE21, 0xAE21, 0x0000, 0xAE30 }, - /* 0x07F */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE32, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE0B, 0x0000 }, - /* 0x080 */ { 0xAE42, 0x0000, 0x0000, 0xAE22, 0xAE2D, 0xAE03, 0xAE30, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x081 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE58, 0xAE2B, 0xAE2B }, - /* 0x082 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE41, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2C, 0xAE22, 0xAE0F }, - /* 0x083 */ { 0xAE22, 0x0000, 0x0000, 0x0000, 0xAE12, 0xAE2C, 0x0000, 0x0000, 0xAE12, 0x0000, 0xAE32, 0xAE03, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x084 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2E, 0xAE32, 0xAE2B, 0xAE2D, 0x0000, 0x0000, 0xAE02, 0xAE85, 0xAE22, 0xAE2D, 0xAE2D }, - /* 0x085 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE45, 
0x0000, 0x0000, 0xAE05, 0xFFFF, 0xFFFF, 0xAE2C }, - /* 0x086 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF }, - /* 0x087 */ { 0x0000, 0x0000, 0x0000, 0xAE2C, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x088 */ { 0x0000, 0xAE32, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x089 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE03, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x08A */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE0F, 0x0000 }, - /* 0x08B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE32, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x08C */ { 0xC193, 0xC194, 0xC195, 0xC196, 0xC197, 0xC198, 0xC199, 0xC19A, 0xC19B, 0xC19C, 0xC19D, 0xC19E, 0xC19F, 0xC1A0, 0xC1A1, 0xC1A2 }, - /* 0x08D */ { 0xC1A3, 0xC1A4, 0xC1A5, 0xC1A6, 0xC1A7, 0xC1A8, 0xC1A9, 0xC1AA, 0xC1AB, 0xC1AC, 0xC1AD, 0xC1AE, 0xC1AF, 0xC1B0, 0xC1B1, 0xC1B2 }, - /* 0x08E */ { 0xC1B3, 0xC1B4, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x08F */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE05, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x090 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x091 */ { 0xAE86, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x092 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF } + /* 0x05C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE30, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x05D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE05 }, + /* 0x05E */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE0C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x05F */ { 0x0000, 0x0000, 0xC180, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x060 */ { 0x0000, 0x0000, 0x0000, 0xAE2D, 0x0000, 0xAE04, 0xAE15, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE04, 0xAE2C, 0x0000, 0xAE32, 0xC181 }, + /* 0x061 */ { 0x0000, 0x0000, 0x0000, 0xC182, 0xAE30, 0xAE3B, 0xAE0E, 0xAE70, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x062 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xC183, 0xAE2C, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x063 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xC184, 0xC185, 0x0000, 0x0000, 0x0000, 0xAE05, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x064 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE00, 0x0000, 0x0000, 0x0000, 0xAE39, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE01, 0xC186 }, + /* 0x065 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE12 }, + /* 0x066 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE30, 0xFFFF, 0xAE71 }, + 
/* 0x067 */ { 0xAE2D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x068 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE72 }, + /* 0x069 */ { 0x0000, 0x0000, 0xAE42, 0xAE73, 0xFFFF, 0xAE74, 0xAE75, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x06A */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE22 }, + /* 0x06B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE05, 0xAE2B, 0xAE2D, 0xC187, 0xAE2E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x06C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2C }, + /* 0x06D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x06E */ { 0x0000, 0x0000, 0xAE32, 0x0000, 0xAE0C, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2E, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x06F */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE30 }, + /* 0x070 */ { 0x0000, 0x0000, 0xAE76, 0x0000, 0x0000, 0xC188, 0xC189, 0xC18A, 0xC18B, 0x0000, 0xC18C, 0xC18D, 0xC18E, 0x0000, 0xAE05, 0xFFFF }, + /* 0x071 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xC18F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xAE2E, 0x0000, 0xAE2E }, + /* 0x072 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE0C, 0x0000, 0xAE2D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x073 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE35, 0x0000, 0x0000, 0x0000, 0xAE17, 0xAE77, 0xAE78, 0xAE79, 0x0000, 0x0000, 0x0000 }, + /* 0x074 */ { 0xAE7A, 0xAE7B, 0x0000, 0xAE7C, 0xAE7D, 0xAE18, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x075 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE5C, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x076 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE7E, 0x0000, 0x0000, 0x0000 }, + /* 0x077 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE22, 0xAE3E, 0xAE21, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x078 */ { 0x0000, 0xAE04, 0xAE7F, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x079 */ { 0xC190, 0xC191, 0xC192, 0x0000, 0x0000, 0x0000, 0xAE32, 0xFFFF, 0xC193, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x07A */ { 0x0000, 0x0000, 0xAE2B, 0xC194, 0xAE15, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x07B */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xC195, 0xFFFF, 0x0000, 0x0000, 0xC196, 0xAE58 }, + /* 0x07C */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xC197, 0xAE2C }, + /* 0x07D */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xC198, 0xAE00 }, + /* 0x07E */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE80, 0xAE04 }, + /* 0x07F */ { 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE81, 0xC199, 0xFFFF, 0xFFFF }, + /* 0x080 */ { 0xC19A, 0xC19B, 0xC19C, 0x0000, 0xC19D, 0xAE15, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x081 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE21, 0x0000, 0x0000, 0x0000, 0xAE01, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x082 */ { 0xAE21, 0x0000, 0x0000, 0xAE32, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x083 */ { 0xAE79, 0x0000, 0xAE82, 0xAE83, 0xAE84, 0xAE85, 0xAE86, 0xAE87, 0xAE3B, 0xAE22, 0xAE88, 0xAE22, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE03 }, + /* 0x084 */ { 0x0000, 0x0000, 0xAE22, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2E, 0xAE04, 0xAE21, 0xAE21, 0xAE21, 0x0000, 0xAE30 }, + /* 0x085 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE32, 0xFFFF, 0xFFFF, 0xFFFF, 0xAE0B, 0x0000 }, + /* 0x086 */ { 0xAE42, 0x0000, 0x0000, 0xAE22, 0xAE2D, 0xAE03, 0xAE30, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x087 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE59, 0xAE2B, 0xAE2B }, + /* 0x088 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE41, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2C, 0xAE22, 0xAE0F }, + /* 0x089 */ { 0xAE22, 0x0000, 0x0000, 0x0000, 0xAE12, 0xAE2C, 0x0000, 0x0000, 0xAE12, 0x0000, 0xAE32, 0xAE22, 0xAE03, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x08A */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2E, 0xAE32, 0xAE2B, 0xAE58, 0x0000, 0x0000, 0x0000, 0xAE89, 0xAE47, 0xAE2C, 0xAE2D }, + /* 0x08B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE45, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE2C }, + /* 0x08C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF }, + /* 0x08D */ { 0x0000, 0x0000, 0x0000, 0xAE2C, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x08E */ { 0x0000, 0xAE32, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x08F */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE03, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x090 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE0F, 0x0000 }, + /* 0x091 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAE32, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x092 */ { 0xC19E, 0xC19F, 0xC1A0, 0xC1A1, 0xC1A2, 0xC1A3, 0xC1A4, 0xC1A5, 0xC1A6, 0xC1A7, 0xC1A8, 0xC1A9, 0xC1AA, 0xC1AB, 0xC1AC, 0xC1AD }, + /* 0x093 */ { 0xC1AE, 0xC1AF, 0xC1B0, 0xC1B1, 0xC1B2, 0xC1B3, 0xC1B4, 0xC1B5, 0xC1B6, 0xC1B7, 0xC1B8, 0xC1B9, 0xC1BA, 0xC1BB, 0xC1BC, 0xC1BD }, + /* 0x094 */ { 0xC1BE, 0xC1BF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x095 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAE05, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x096 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x097 */ { 0xAE8A, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x098 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF } }; -static uint16_t nfTrieLo[437][16] = { +static uint16_t nfTrieLo[448][16] = { /* 0x000 */ { 0xB000, 0xB001, 0xB002, 0xB003, 0xB004, 0xB005, 0x0000, 0xB006, 0xB007, 0xB008, 0xB009, 0xB00A, 0xB00B, 0xB00C, 0xB00D, 0xB00E }, /* 0x001 */ { 0x0000, 0xB00F, 0xB010, 0xB011, 0xB012, 0xB013, 0xB014, 0x0000, 0x0000, 0xB015, 0xB016, 0xB017, 0xB018, 0xB019, 0x0000, 0xB81A }, /* 0x002 */ { 0xB01B, 0xB01C, 0xB01D, 0xB01E, 0xB01F, 0xB020, 0x0000, 0xB021, 0xB022, 0xB023, 0xB024, 0xB025, 0xB026, 0xB027, 0xB028, 0xB029 }, @@ -483,7 +489,7 @@ static uint16_t nfTrieLo[437][16] = { /* 0x045 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, /* 0x046 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF }, /* 0x047 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADDC, 0xADDC, 0xADDC, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF }, - /* 0x048 */ { 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xADE6, 0xADDC, 0xADDC, 0xADDC, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, + /* 0x048 */ { 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xADE6, 0xADE6, 0xADDC, 0xADDC, 0xADDC, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, /* 0x049 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADDC }, /* 0x04A */ { 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, /* 0x04B */ { 0xADE6, 0xADE6, 0x0000, 0xADDC, 0xADE6, 0xADE6, 0xADDC, 0xADE6, 0xADE6, 0xADDC, 0xADE6, 0xADE6, 0xADE6, 0xADDC, 0xADDC, 0xADDC }, @@ -551,17 +557,17 @@ static uint16_t nfTrieLo[437][16] = { /* 0x089 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB173, 0x0000, 0xB174, 0x0000, 0xB175, 0x0000, 0xB176, 0x0000, 0xB177, 0x0000 }, /* 0x08A */ { 0x0000, 0x0000, 0xB178, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x08B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB179, 0x0000, 0xB17A, 0x0000, 0x0000 }, - /* 0x08C */ { 0xB17B, 0xB17C, 0x0000, 0xB17D, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x08C */ { 0xB17B, 0xB17C, 0x0000, 0xB17D, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000 }, /* 0x08D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADDC, 0xADE6, 0xADE6, 0xADE6 }, - /* 0x08E */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF }, + /* 0x08E */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x08F */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x090 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x091 */ { 0x0000, 0x0000, 0xAD09, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 
0x0000, 0x0000, 0x0000 }, /* 0x092 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD07, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x093 */ { 0xE86C, 0xE86D, 0xE86E, 0xE86F, 0xE870, 0xE871, 0xE872, 0xE873, 0xE874, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x094 */ { 0xE875, 0xE876, 0xE877, 0xE878, 0xE879, 0xE87A, 0xE87B, 0xE87C, 0xE87D, 0xE87E, 0xE87F, 0xE880, 0xE881, 0xE882, 0xE883, 0xE884 }, - /* 0x095 */ { 0xE885, 0xE886, 0xE887, 0xE888, 0xE889, 0xE88A, 0xE88B, 0xE88C, 0xE88D, 0xE88E, 0xE88F, 0xE890, 0xE891, 0xE892, 0xE893, 0xE894 }, - /* 0x096 */ { 0xE895, 0xE896, 0xE897, 0xE898, 0xE899, 0xE89A, 0xE89B, 0xE89C, 0xE89D, 0xE89E, 0xE89F, 0xFFFF, 0xFFFF, 0xE8A0, 0xE8A1, 0xE8A2 }, + /* 0x093 */ { 0xE86C, 0xE86D, 0xE86E, 0xE86F, 0xE870, 0xE871, 0xE872, 0xE873, 0xE874, 0xE875, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x094 */ { 0xE876, 0xE877, 0xE878, 0xE879, 0xE87A, 0xE87B, 0xE87C, 0xE87D, 0xE87E, 0xE87F, 0xE880, 0xE881, 0xE882, 0xE883, 0xE884, 0xE885 }, + /* 0x095 */ { 0xE886, 0xE887, 0xE888, 0xE889, 0xE88A, 0xE88B, 0xE88C, 0xE88D, 0xE88E, 0xE88F, 0xE890, 0xE891, 0xE892, 0xE893, 0xE894, 0xE895 }, + /* 0x096 */ { 0xE896, 0xE897, 0xE898, 0xE899, 0xE89A, 0xE89B, 0xE89C, 0xE89D, 0xE89E, 0xE89F, 0xE8A0, 0xFFFF, 0xFFFF, 0xE8A1, 0xE8A2, 0xE8A3 }, /* 0x097 */ { 0xADE6, 0xADE6, 0xADE6, 0x0000, 0xAD01, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADE6, 0xADE6, 0xADDC, 0xADDC, 0xADDC, 0xADDC }, /* 0x098 */ { 0xADE6, 0x0000, 0xAD01, 0xAD01, 0xAD01, 0xAD01, 0xAD01, 0xAD01, 0xAD01, 0x0000, 0x0000, 0x0000, 0x0000, 0xADDC, 0x0000, 0x0000 }, /* 0x099 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, @@ -583,7 +589,7 @@ static uint16_t nfTrieLo[437][16] = { /* 0x0A9 */ { 0xC050, 0xC051, 0xC052, 0xC053, 0xC054, 0xC055, 0xC056, 0xC057, 0xB205, 0xB206, 0xB207, 0xB208, 0xB209, 0xB20A, 0xB20B, 0xB20C }, /* 0x0AA */ { 0xC058, 0xC059, 0xC05A, 0xC05B, 0xC05C, 0xC05D, 0xC05E, 0xC05F, 0xC060, 0xC061, 0xC062, 0xC063, 0xC064, 0xC065, 0xC066, 0xC067 }, /* 0x0AB */ { 0xC068, 0xC069, 0xC06A, 0xC06B, 0xB20D, 0xB20E, 0xB20F, 0xB210, 0xC06C, 0xC06D, 0xC06E, 0xC06F, 0xC070, 0xC071, 0xC072, 0xC073 }, - /* 0x0AC */ { 0xC074, 0xC075, 0xB211, 0xB212, 0xB213, 0xB214, 0xB215, 0xB216, 0xB217, 0xB218, 0xE8A3, 0x0000, 0xE8A4, 0x0000, 0xE8A5, 0x0000 }, + /* 0x0AC */ { 0xC074, 0xC075, 0xB211, 0xB212, 0xB213, 0xB214, 0xB215, 0xB216, 0xB217, 0xB218, 0xE8A4, 0x0000, 0xE8A5, 0x0000, 0xE8A6, 0x0000 }, /* 0x0AD */ { 0xB219, 0xB21A, 0xC076, 0xC077, 0xC078, 0xC079, 0xC07A, 0xC07B, 0xB21B, 0xB21C, 0xC07C, 0xC07D, 0xC07E, 0xC07F, 0xC080, 0xC081 }, /* 0x0AE */ { 0xB21D, 0xB21E, 0xC082, 0xC083, 0xC084, 0xC085, 0xFFFF, 0xFFFF, 0xB21F, 0xB220, 0xC086, 0xC087, 0xC088, 0xC089, 0xFFFF, 0xFFFF }, /* 0x0AF */ { 0xB221, 0xB222, 0xC08A, 0xC08B, 0xC08C, 0xC08D, 0xC08E, 0xC08F, 0xB223, 0xB224, 0xC090, 0xC091, 0xC092, 0xC093, 0xC094, 0xC095 }, @@ -605,9 +611,9 @@ static uint16_t nfTrieLo[437][16] = { /* 0x0BF */ { 0x0000, 0xADE6, 0x0000, 0x0000, 0x0000, 0xAD01, 0xAD01, 0xADE6, 0xADDC, 0xADE6, 0xAD01, 0xAD01, 0xADDC, 0xADDC, 0xADDC, 0xADDC }, /* 0x0C0 */ { 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, /* 0x0C1 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x03A9, 0x0000, 0x0000, 0x0000, 0x004B, 0xB273, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x0C2 */ { 0x0000, 0x0000, 0xE8A6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x0C3 */ { 0xE8A7, 0xE8A8, 0xE8A9, 0xE8AA, 0xE8AB, 0xE8AC, 0xE8AD, 0xE8AE, 0xE8AF, 0xE8B0, 0xE8B1, 0xE8B2, 0xE8B3, 0xE8B4, 0xE8B5, 0xE8B6 }, - /* 0x0C4 */ { 0x0000, 0x0000, 0x0000, 0xE8B7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x0C2 */ { 0x0000, 0x0000, 0xE8A7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x0C3 */ { 0xE8A8, 0xE8A9, 0xE8AA, 0xE8AB, 0xE8AC, 0xE8AD, 0xE8AE, 0xE8AF, 0xE8B0, 0xE8B1, 0xE8B2, 0xE8B3, 0xE8B4, 0xE8B5, 0xE8B6, 0xE8B7 }, + /* 0x0C4 */ { 0x0000, 0x0000, 0x0000, 0xE8B8, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, /* 0x0C5 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB274, 0xB275, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x0C6 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB276, 0x0000 }, /* 0x0C7 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB277, 0xB278, 0xB279 }, @@ -620,22 +626,22 @@ static uint16_t nfTrieLo[437][16] = { /* 0x0CE */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB294, 0xB295, 0xB296, 0xB297 }, /* 0x0CF */ { 0xB298, 0xB299, 0xB29A, 0xB29B, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB29C, 0xB29D, 0xB29E, 0xB29F, 0x0000, 0x0000 }, /* 0x0D0 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x3008, 0x3009, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x0D1 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE8B8, 0xE8B9, 0xE8BA, 0xE8BB, 0xE8BC, 0xE8BD, 0xE8BE, 0xE8BF, 0xE8C0, 0xE8C1 }, - /* 0x0D2 */ { 0xE8C2, 0xE8C3, 0xE8C4, 0xE8C5, 0xE8C6, 0xE8C7, 0xE8C8, 0xE8C9, 0xE8CA, 0xE8CB, 0xE8CC, 0xE8CD, 0xE8CE, 0xE8CF, 0xE8D0, 0xE8D1 }, + /* 0x0D1 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE8B9, 0xE8BA, 0xE8BB, 0xE8BC, 0xE8BD, 0xE8BE, 0xE8BF, 0xE8C0, 0xE8C1, 0xE8C2 }, + /* 0x0D2 */ { 0xE8C3, 0xE8C4, 0xE8C5, 0xE8C6, 0xE8C7, 0xE8C8, 0xE8C9, 0xE8CA, 0xE8CB, 0xE8CC, 0xE8CD, 0xE8CE, 0xE8CF, 0xE8D0, 0xE8D1, 0xE8D2 }, /* 0x0D3 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB2A0, 0x0000, 0x0000, 0x0000 }, - /* 0x0D4 */ { 0xE8D2, 0xE8D3, 0xE8D4, 0xE8D5, 0xE8D6, 0xE8D7, 0xE8D8, 0xE8D9, 0xE8DA, 0xE8DB, 0xE8DC, 0xE8DD, 0xE8DE, 0xE8DF, 0xE8E0, 0xE8E1 }, - /* 0x0D5 */ { 0xE8E2, 0xE8E3, 0xE8E4, 0xE8E5, 0xE8E6, 0xE8E7, 0xE8E8, 0xE8E9, 0xE8EA, 0xE8EB, 0xE8EC, 0xE8ED, 0xE8EE, 0xE8EF, 0xE8F0, 0xE8F1 }, - /* 0x0D6 */ { 0xE8F2, 0xE8F3, 0xE8F4, 0xE8F5, 0xE8F6, 0xE8F7, 0xE8F8, 0xE8F9, 0xE8FA, 0xE8FB, 0xE8FC, 0xE8FD, 0xE8FE, 0xE8FF, 0xE900, 0xE901 }, - /* 0x0D7 */ { 0xE902, 0x0000, 0xE903, 0xE904, 0xE905, 0x0000, 0x0000, 0xE906, 0x0000, 0xE907, 0x0000, 0xE908, 0x0000, 0xE909, 0xE90A, 0xE90B }, - /* 0x0D8 */ { 0xE90C, 0x0000, 0xE90D, 0x0000, 0x0000, 0xE90E, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE90F, 0xE910 }, - /* 0x0D9 */ { 0xE911, 0x0000, 0xE912, 0x0000, 0xE913, 0x0000, 0xE914, 0x0000, 0xE915, 0x0000, 0xE916, 0x0000, 0xE917, 0x0000, 0xE918, 0x0000 }, - /* 0x0DA */ { 0xE919, 0x0000, 0xE91A, 0x0000, 0xE91B, 0x0000, 0xE91C, 0x0000, 0xE91D, 0x0000, 0xE91E, 0x0000, 0xE91F, 0x0000, 0xE920, 0x0000 }, - /* 0x0DB */ { 0xE921, 0x0000, 0xE922, 0x0000, 0xE923, 0x0000, 0xE924, 0x0000, 
0xE925, 0x0000, 0xE926, 0x0000, 0xE927, 0x0000, 0xE928, 0x0000 }, - /* 0x0DC */ { 0xE929, 0x0000, 0xE92A, 0x0000, 0xE92B, 0x0000, 0xE92C, 0x0000, 0xE92D, 0x0000, 0xE92E, 0x0000, 0xE92F, 0x0000, 0xE930, 0x0000 }, - /* 0x0DD */ { 0xE931, 0x0000, 0xE932, 0x0000, 0xE933, 0x0000, 0xE934, 0x0000, 0xE935, 0x0000, 0xE936, 0x0000, 0xE937, 0x0000, 0xE938, 0x0000 }, - /* 0x0DE */ { 0xE939, 0x0000, 0xE93A, 0x0000, 0xE93B, 0x0000, 0xE93C, 0x0000, 0xE93D, 0x0000, 0xE93E, 0x0000, 0xE93F, 0x0000, 0xE940, 0x0000 }, - /* 0x0DF */ { 0xE941, 0x0000, 0xE942, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE943, 0x0000, 0xE944, 0x0000, 0xADE6 }, - /* 0x0E0 */ { 0xADE6, 0xADE6, 0xE945, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x0D4 */ { 0xE8D3, 0xE8D4, 0xE8D5, 0xE8D6, 0xE8D7, 0xE8D8, 0xE8D9, 0xE8DA, 0xE8DB, 0xE8DC, 0xE8DD, 0xE8DE, 0xE8DF, 0xE8E0, 0xE8E1, 0xE8E2 }, + /* 0x0D5 */ { 0xE8E3, 0xE8E4, 0xE8E5, 0xE8E6, 0xE8E7, 0xE8E8, 0xE8E9, 0xE8EA, 0xE8EB, 0xE8EC, 0xE8ED, 0xE8EE, 0xE8EF, 0xE8F0, 0xE8F1, 0xE8F2 }, + /* 0x0D6 */ { 0xE8F3, 0xE8F4, 0xE8F5, 0xE8F6, 0xE8F7, 0xE8F8, 0xE8F9, 0xE8FA, 0xE8FB, 0xE8FC, 0xE8FD, 0xE8FE, 0xE8FF, 0xE900, 0xE901, 0xE902 }, + /* 0x0D7 */ { 0xE903, 0x0000, 0xE904, 0xE905, 0xE906, 0x0000, 0x0000, 0xE907, 0x0000, 0xE908, 0x0000, 0xE909, 0x0000, 0xE90A, 0xE90B, 0xE90C }, + /* 0x0D8 */ { 0xE90D, 0x0000, 0xE90E, 0x0000, 0x0000, 0xE90F, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE910, 0xE911 }, + /* 0x0D9 */ { 0xE912, 0x0000, 0xE913, 0x0000, 0xE914, 0x0000, 0xE915, 0x0000, 0xE916, 0x0000, 0xE917, 0x0000, 0xE918, 0x0000, 0xE919, 0x0000 }, + /* 0x0DA */ { 0xE91A, 0x0000, 0xE91B, 0x0000, 0xE91C, 0x0000, 0xE91D, 0x0000, 0xE91E, 0x0000, 0xE91F, 0x0000, 0xE920, 0x0000, 0xE921, 0x0000 }, + /* 0x0DB */ { 0xE922, 0x0000, 0xE923, 0x0000, 0xE924, 0x0000, 0xE925, 0x0000, 0xE926, 0x0000, 0xE927, 0x0000, 0xE928, 0x0000, 0xE929, 0x0000 }, + /* 0x0DC */ { 0xE92A, 0x0000, 0xE92B, 0x0000, 0xE92C, 0x0000, 0xE92D, 0x0000, 0xE92E, 0x0000, 0xE92F, 0x0000, 0xE930, 0x0000, 0xE931, 0x0000 }, + /* 0x0DD */ { 0xE932, 0x0000, 0xE933, 0x0000, 0xE934, 0x0000, 0xE935, 0x0000, 0xE936, 0x0000, 0xE937, 0x0000, 0xE938, 0x0000, 0xE939, 0x0000 }, + /* 0x0DE */ { 0xE93A, 0x0000, 0xE93B, 0x0000, 0xE93C, 0x0000, 0xE93D, 0x0000, 0xE93E, 0x0000, 0xE93F, 0x0000, 0xE940, 0x0000, 0xE941, 0x0000 }, + /* 0x0DF */ { 0xE942, 0x0000, 0xE943, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE944, 0x0000, 0xE945, 0x0000, 0xADE6 }, + /* 0x0E0 */ { 0xADE6, 0xADE6, 0xE946, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x0E1 */ { 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAD09 }, /* 0x0E2 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADDA, 0xADE4, 0xADE8, 0xADDE, 0xADE0, 0xADE0 }, /* 0x0E3 */ { 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB2A1, 0x0000, 0xB2A2, 0x0000 }, @@ -648,26 +654,26 @@ static uint16_t nfTrieLo[437][16] = { /* 0x0EA */ { 0xB2C6, 0x0000, 0xB2C7, 0x0000, 0x0000, 0xB2C8, 0x0000, 0xB2C9, 0x0000, 0xB2CA, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x0EB */ { 0xB2CB, 0xB2CC, 0x0000, 0xB2CD, 0xB2CE, 0x0000, 0xB2CF, 0xB2D0, 0x0000, 0xB2D1, 0xB2D2, 0x0000, 0xB2D3, 0xB2D4, 0x0000, 0x0000 }, /* 0x0EC */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xB2D5, 0x0000, 0x0000, 0xB2D6, 
0xB2D7, 0xB2D8, 0xB2D9, 0x0000, 0x0000, 0x0000, 0xB2DA, 0x0000 }, - /* 0x0ED */ { 0xE946, 0x0000, 0xE947, 0x0000, 0xE948, 0x0000, 0xE949, 0x0000, 0xE94A, 0x0000, 0xE94B, 0x0000, 0xE94C, 0x0000, 0xE94D, 0x0000 }, - /* 0x0EE */ { 0xE94E, 0x0000, 0xE94F, 0x0000, 0xE950, 0x0000, 0xE951, 0x0000, 0xE952, 0x0000, 0xE953, 0x0000, 0xE954, 0x0000, 0xE955, 0x0000 }, - /* 0x0EF */ { 0xE956, 0x0000, 0xE957, 0x0000, 0xE958, 0x0000, 0xE959, 0x0000, 0xE95A, 0x0000, 0xE95B, 0x0000, 0xE95C, 0x0000, 0x0000, 0xADE6 }, + /* 0x0ED */ { 0xE947, 0x0000, 0xE948, 0x0000, 0xE949, 0x0000, 0xE94A, 0x0000, 0xE94B, 0x0000, 0xE94C, 0x0000, 0xE94D, 0x0000, 0xE94E, 0x0000 }, + /* 0x0EE */ { 0xE94F, 0x0000, 0xE950, 0x0000, 0xE951, 0x0000, 0xE952, 0x0000, 0xE953, 0x0000, 0xE954, 0x0000, 0xE955, 0x0000, 0xE956, 0x0000 }, + /* 0x0EF */ { 0xE957, 0x0000, 0xE958, 0x0000, 0xE959, 0x0000, 0xE95A, 0x0000, 0xE95B, 0x0000, 0xE95C, 0x0000, 0xE95D, 0x0000, 0x0000, 0xADE6 }, /* 0x0F0 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000 }, - /* 0x0F1 */ { 0xE95D, 0x0000, 0xE95E, 0x0000, 0xE95F, 0x0000, 0xE960, 0x0000, 0xE961, 0x0000, 0xE962, 0x0000, 0xE963, 0x0000, 0xE964, 0x0000 }, - /* 0x0F2 */ { 0xE965, 0x0000, 0xE966, 0x0000, 0xE967, 0x0000, 0xE968, 0x0000, 0xE969, 0x0000, 0xE96A, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6 }, + /* 0x0F1 */ { 0xE95E, 0x0000, 0xE95F, 0x0000, 0xE960, 0x0000, 0xE961, 0x0000, 0xE962, 0x0000, 0xE963, 0x0000, 0xE964, 0x0000, 0xE965, 0x0000 }, + /* 0x0F2 */ { 0xE966, 0x0000, 0xE967, 0x0000, 0xE968, 0x0000, 0xE969, 0x0000, 0xE96A, 0x0000, 0xE96B, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6 }, /* 0x0F3 */ { 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x0F4 */ { 0x0000, 0x0000, 0xE96B, 0x0000, 0xE96C, 0x0000, 0xE96D, 0x0000, 0xE96E, 0x0000, 0xE96F, 0x0000, 0xE970, 0x0000, 0xE971, 0x0000 }, - /* 0x0F5 */ { 0x0000, 0x0000, 0xE972, 0x0000, 0xE973, 0x0000, 0xE974, 0x0000, 0xE975, 0x0000, 0xE976, 0x0000, 0xE977, 0x0000, 0xE978, 0x0000 }, - /* 0x0F6 */ { 0xE979, 0x0000, 0xE97A, 0x0000, 0xE97B, 0x0000, 0xE97C, 0x0000, 0xE97D, 0x0000, 0xE97E, 0x0000, 0xE97F, 0x0000, 0xE980, 0x0000 }, - /* 0x0F7 */ { 0xE981, 0x0000, 0xE982, 0x0000, 0xE983, 0x0000, 0xE984, 0x0000, 0xE985, 0x0000, 0xE986, 0x0000, 0xE987, 0x0000, 0xE988, 0x0000 }, - /* 0x0F8 */ { 0xE989, 0x0000, 0xE98A, 0x0000, 0xE98B, 0x0000, 0xE98C, 0x0000, 0xE98D, 0x0000, 0xE98E, 0x0000, 0xE98F, 0x0000, 0xE990, 0x0000 }, - /* 0x0F9 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE991, 0x0000, 0xE992, 0x0000, 0xE993, 0xE994, 0x0000 }, - /* 0x0FA */ { 0xE995, 0x0000, 0xE996, 0x0000, 0xE997, 0x0000, 0xE998, 0x0000, 0x0000, 0x0000, 0x0000, 0xE999, 0x0000, 0xE99A, 0x0000, 0x0000 }, - /* 0x0FB */ { 0xE99B, 0x0000, 0xE99C, 0x0000, 0x0000, 0x0000, 0xE99D, 0x0000, 0xE99E, 0x0000, 0xE99F, 0x0000, 0xE9A0, 0x0000, 0xE9A1, 0x0000 }, - /* 0x0FC */ { 0xE9A2, 0x0000, 0xE9A3, 0x0000, 0xE9A4, 0x0000, 0xE9A5, 0x0000, 0xE9A6, 0x0000, 0xE9A7, 0xE9A8, 0xE9A9, 0xE9AA, 0xE9AB, 0x0000 }, - /* 0x0FD */ { 0xE9AC, 0xE9AD, 0xE9AE, 0xE9AF, 0xE9B0, 0x0000, 0xE9B1, 0x0000, 0xE9B2, 0x0000, 0xE9B3, 0x0000, 0xE9B4, 0x0000, 0xE9B5, 0x0000 }, - /* 0x0FE */ { 0xE9B6, 0x0000, 0xE9B7, 0x0000, 0xE9B8, 0xE9B9, 0xE9BA, 0xE9BB, 0x0000, 0xE9BC, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x0FF */ { 0xE9BD, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xE9BE, 0x0000, 0xE9BF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF, 0xFFFF }, - /* 0x100 */ { 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xE9C0, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x0F4 */ { 0x0000, 0x0000, 0xE96C, 0x0000, 0xE96D, 0x0000, 0xE96E, 0x0000, 0xE96F, 0x0000, 0xE970, 0x0000, 0xE971, 0x0000, 0xE972, 0x0000 }, + /* 0x0F5 */ { 0x0000, 0x0000, 0xE973, 0x0000, 0xE974, 0x0000, 0xE975, 0x0000, 0xE976, 0x0000, 0xE977, 0x0000, 0xE978, 0x0000, 0xE979, 0x0000 }, + /* 0x0F6 */ { 0xE97A, 0x0000, 0xE97B, 0x0000, 0xE97C, 0x0000, 0xE97D, 0x0000, 0xE97E, 0x0000, 0xE97F, 0x0000, 0xE980, 0x0000, 0xE981, 0x0000 }, + /* 0x0F7 */ { 0xE982, 0x0000, 0xE983, 0x0000, 0xE984, 0x0000, 0xE985, 0x0000, 0xE986, 0x0000, 0xE987, 0x0000, 0xE988, 0x0000, 0xE989, 0x0000 }, + /* 0x0F8 */ { 0xE98A, 0x0000, 0xE98B, 0x0000, 0xE98C, 0x0000, 0xE98D, 0x0000, 0xE98E, 0x0000, 0xE98F, 0x0000, 0xE990, 0x0000, 0xE991, 0x0000 }, + /* 0x0F9 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xE992, 0x0000, 0xE993, 0x0000, 0xE994, 0xE995, 0x0000 }, + /* 0x0FA */ { 0xE996, 0x0000, 0xE997, 0x0000, 0xE998, 0x0000, 0xE999, 0x0000, 0x0000, 0x0000, 0x0000, 0xE99A, 0x0000, 0xE99B, 0x0000, 0x0000 }, + /* 0x0FB */ { 0xE99C, 0x0000, 0xE99D, 0x0000, 0x0000, 0x0000, 0xE99E, 0x0000, 0xE99F, 0x0000, 0xE9A0, 0x0000, 0xE9A1, 0x0000, 0xE9A2, 0x0000 }, + /* 0x0FC */ { 0xE9A3, 0x0000, 0xE9A4, 0x0000, 0xE9A5, 0x0000, 0xE9A6, 0x0000, 0xE9A7, 0x0000, 0xE9A8, 0xE9A9, 0xE9AA, 0xE9AB, 0xE9AC, 0x0000 }, + /* 0x0FD */ { 0xE9AD, 0xE9AE, 0xE9AF, 0xE9B0, 0xE9B1, 0x0000, 0xE9B2, 0x0000, 0xE9B3, 0x0000, 0xE9B4, 0x0000, 0xE9B5, 0x0000, 0xE9B6, 0x0000 }, + /* 0x0FE */ { 0xE9B7, 0x0000, 0xE9B8, 0x0000, 0xE9B9, 0xE9BA, 0xE9BB, 0xE9BC, 0x0000, 0xE9BD, 0x0000, 0xE9BE, 0xE9BF, 0x0000, 0xFFFF, 0xFFFF }, + /* 0x0FF */ { 0xE9C0, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xE9C1, 0x0000, 0xE9C2, 0x0000, 0xE9C3, 0x0000, 0xE9C4, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x100 */ { 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0xE9C5, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x101 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x102 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF }, /* 0x103 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000 }, @@ -679,11 +685,11 @@ static uint16_t nfTrieLo[437][16] = { /* 0x109 */ { 0xADE6, 0x0000, 0xADE6, 0xADE6, 0xADDC, 0x0000, 0x0000, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6 }, /* 0x10A */ { 0x0000, 0xADE6, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, /* 0x10B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x10C */ { 0xE9C1, 0xE9C2, 0xE9C3, 0xE9C4, 0xE9C5, 0xE9C6, 0xE9C7, 0xE9C8, 0xE9C9, 0xE9CA, 0xE9CB, 0xE9CC, 0xE9CD, 0xE9CE, 0xE9CF, 0xE9D0 }, - /* 0x10D */ { 0xE9D1, 0xE9D2, 0xE9D3, 0xE9D4, 0xE9D5, 0xE9D6, 0xE9D7, 0xE9D8, 0xE9D9, 0xE9DA, 0xE9DB, 0xE9DC, 0xE9DD, 0xE9DE, 0xE9DF, 0xE9E0 }, - /* 0x10E */ { 0xE9E1, 0xE9E2, 0xE9E3, 0xE9E4, 0xE9E5, 0xE9E6, 0xE9E7, 0xE9E8, 0xE9E9, 0xE9EA, 0xE9EB, 0xE9EC, 0xE9ED, 0xE9EE, 0xE9EF, 0xE9F0 }, - /* 0x10F */ { 0xE9F1, 0xE9F2, 0xE9F3, 0xE9F4, 0xE9F5, 0xE9F6, 0xE9F7, 0xE9F8, 0xE9F9, 0xE9FA, 0xE9FB, 0xE9FC, 0xE9FD, 0xE9FE, 
0xE9FF, 0xEA00 }, - /* 0x110 */ { 0xEA01, 0xEA02, 0xEA03, 0xEA04, 0xEA05, 0xEA06, 0xEA07, 0xEA08, 0xEA09, 0xEA0A, 0xEA0B, 0xEA0C, 0xEA0D, 0xEA0E, 0xEA0F, 0xEA10 }, + /* 0x10C */ { 0xE9C6, 0xE9C7, 0xE9C8, 0xE9C9, 0xE9CA, 0xE9CB, 0xE9CC, 0xE9CD, 0xE9CE, 0xE9CF, 0xE9D0, 0xE9D1, 0xE9D2, 0xE9D3, 0xE9D4, 0xE9D5 }, + /* 0x10D */ { 0xE9D6, 0xE9D7, 0xE9D8, 0xE9D9, 0xE9DA, 0xE9DB, 0xE9DC, 0xE9DD, 0xE9DE, 0xE9DF, 0xE9E0, 0xE9E1, 0xE9E2, 0xE9E3, 0xE9E4, 0xE9E5 }, + /* 0x10E */ { 0xE9E6, 0xE9E7, 0xE9E8, 0xE9E9, 0xE9EA, 0xE9EB, 0xE9EC, 0xE9ED, 0xE9EE, 0xE9EF, 0xE9F0, 0xE9F1, 0xE9F2, 0xE9F3, 0xE9F4, 0xE9F5 }, + /* 0x10F */ { 0xE9F6, 0xE9F7, 0xE9F8, 0xE9F9, 0xE9FA, 0xE9FB, 0xE9FC, 0xE9FD, 0xE9FE, 0xE9FF, 0xEA00, 0xEA01, 0xEA02, 0xEA03, 0xEA04, 0xEA05 }, + /* 0x110 */ { 0xEA06, 0xEA07, 0xEA08, 0xEA09, 0xEA0A, 0xEA0B, 0xEA0C, 0xEA0D, 0xEA0E, 0xEA0F, 0xEA10, 0xEA11, 0xEA12, 0xEA13, 0xEA14, 0xEA15 }, /* 0x111 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xFFFF, 0xFFFF }, /* 0x112 */ { 0xAC00, 0xAC00, 0xAC00, 0xAC00, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, /* 0x113 */ { 0x8C48, 0x66F4, 0x8ECA, 0x8CC8, 0x6ED1, 0x4E32, 0x53E5, 0x9F9C, 0x9F9C, 0x5951, 0x91D1, 0x5587, 0x5948, 0x61F6, 0x7669, 0x7F85 }, @@ -708,157 +714,168 @@ static uint16_t nfTrieLo[437][16] = { /* 0x126 */ { 0x4FAE, 0x50E7, 0x514D, 0x52C9, 0x52E4, 0x5351, 0x559D, 0x5606, 0x5668, 0x5840, 0x58A8, 0x5C64, 0x5C6E, 0x6094, 0x6168, 0x618E }, /* 0x127 */ { 0x61F2, 0x654F, 0x65E2, 0x6691, 0x6885, 0x6D77, 0x6E1A, 0x6F22, 0x716E, 0x722B, 0x7422, 0x7891, 0x793E, 0x7949, 0x7948, 0x7950 }, /* 0x128 */ { 0x7956, 0x795D, 0x798D, 0x798E, 0x7A40, 0x7A81, 0x7BC0, 0x7DF4, 0x7E09, 0x7E41, 0x7F72, 0x8005, 0x81ED, 0x8279, 0x8279, 0x8457 }, - /* 0x129 */ { 0x8910, 0x8996, 0x8B01, 0x8B39, 0x8CD3, 0x8D08, 0x8FB6, 0x9038, 0x96E3, 0x97FF, 0x983B, 0x6075, 0xE211, 0x8218, 0xFFFF, 0xFFFF }, + /* 0x129 */ { 0x8910, 0x8996, 0x8B01, 0x8B39, 0x8CD3, 0x8D08, 0x8FB6, 0x9038, 0x96E3, 0x97FF, 0x983B, 0x6075, 0xE216, 0x8218, 0xFFFF, 0xFFFF }, /* 0x12A */ { 0x4E26, 0x51B5, 0x5168, 0x4F80, 0x5145, 0x5180, 0x52C7, 0x52FA, 0x559D, 0x5555, 0x5599, 0x55E2, 0x585A, 0x58B3, 0x5944, 0x5954 }, /* 0x12B */ { 0x5A62, 0x5B28, 0x5ED2, 0x5ED9, 0x5F69, 0x5FAD, 0x60D8, 0x614E, 0x6108, 0x618E, 0x6160, 0x61F2, 0x6234, 0x63C4, 0x641C, 0x6452 }, /* 0x12C */ { 0x6556, 0x6674, 0x6717, 0x671B, 0x6756, 0x6B79, 0x6BBA, 0x6D41, 0x6EDB, 0x6ECB, 0x6F22, 0x701E, 0x716E, 0x77A7, 0x7235, 0x72AF }, /* 0x12D */ { 0x732A, 0x7471, 0x7506, 0x753B, 0x761D, 0x761F, 0x76CA, 0x76DB, 0x76F4, 0x774A, 0x7740, 0x78CC, 0x7AB1, 0x7BC0, 0x7C7B, 0x7D5B }, /* 0x12E */ { 0x7DF4, 0x7F3E, 0x8005, 0x8352, 0x83EF, 0x8779, 0x8941, 0x8986, 0x8996, 0x8ABF, 0x8AF8, 0x8ACB, 0x8B01, 0x8AFE, 0x8AED, 0x8B39 }, - /* 0x12F */ { 0x8B8A, 0x8D08, 0x8F38, 0x9072, 0x9199, 0x9276, 0x967C, 0x96E3, 0x9756, 0x97DB, 0x97FF, 0x980B, 0x983B, 0x9B12, 0x9F9C, 0xE212 }, - /* 0x130 */ { 0xE213, 0xE214, 0x3B9D, 0x4018, 0x4039, 0xE215, 0xE216, 0xE217, 0x9F43, 0x9F8E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x12F */ { 0x8B8A, 0x8D08, 0x8F38, 0x9072, 0x9199, 0x9276, 0x967C, 0x96E3, 0x9756, 0x97DB, 0x97FF, 0x980B, 0x983B, 0x9B12, 0x9F9C, 0xE217 }, + /* 0x130 */ { 0xE218, 0xE219, 0x3B9D, 0x4018, 0x4039, 0xE21A, 0xE21B, 0xE21C, 0x9F43, 0x9F8E, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, /* 0x131 */ { 0xBADB, 0xBADC, 0xBADD, 0xC8DA, 0xC8DB, 0xBADE, 0xBADF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 
0xFFFF }, /* 0x132 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xBAE0, 0xBAE1, 0xBAE2, 0xBAE3, 0xBAE4, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xB2E5, 0xAD1A, 0xB2E6 }, /* 0x133 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xB2E7, 0xB2E8, 0xC0DC, 0xC0DD, 0xB2E9, 0xB2EA }, /* 0x134 */ { 0xB2EB, 0xB2EC, 0xB2ED, 0xB2EE, 0xB2EF, 0xB2F0, 0xB2F1, 0xFFFF, 0xB2F2, 0xB2F3, 0xB2F4, 0xB2F5, 0xB2F6, 0xFFFF, 0xB2F7, 0xFFFF }, /* 0x135 */ { 0xB2F8, 0xB2F9, 0xFFFF, 0xB2FA, 0xB2FB, 0xFFFF, 0xB2FC, 0xB2FD, 0xB2FE, 0xB2FF, 0xB300, 0xB301, 0xB302, 0xB303, 0xB304, 0x0000 }, /* 0x136 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADE6, 0xADE6 }, - /* 0x137 */ { 0x0000, 0xEA18, 0xEA19, 0xEA1A, 0xEA1B, 0xEA1C, 0xEA1D, 0xEA1E, 0xEA1F, 0xEA20, 0xEA21, 0xEA22, 0xEA23, 0xEA24, 0xEA25, 0xEA26 }, - /* 0x138 */ { 0xEA27, 0xEA28, 0xEA29, 0xEA2A, 0xEA2B, 0xEA2C, 0xEA2D, 0xEA2E, 0xEA2F, 0xEA30, 0xEA31, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x137 */ { 0x0000, 0xEA1D, 0xEA1E, 0xEA1F, 0xEA20, 0xEA21, 0xEA22, 0xEA23, 0xEA24, 0xEA25, 0xEA26, 0xEA27, 0xEA28, 0xEA29, 0xEA2A, 0xEA2B }, + /* 0x138 */ { 0xEA2C, 0xEA2D, 0xEA2E, 0xEA2F, 0xEA30, 0xEA31, 0xEA32, 0xEA33, 0xEA34, 0xEA35, 0xEA36, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x139 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADDC, 0xFFFF, 0xFFFF }, /* 0x13A */ { 0xADDC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, /* 0x13B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x13C */ { 0xEA32, 0xEA33, 0xEA34, 0xEA35, 0xEA36, 0xEA37, 0xEA38, 0xEA39, 0xEA3A, 0xEA3B, 0xEA3C, 0xEA3D, 0xEA3E, 0xEA3F, 0xEA40, 0xEA41 }, - /* 0x13D */ { 0xEA42, 0xEA43, 0xEA44, 0xEA45, 0xEA46, 0xEA47, 0xEA48, 0xEA49, 0xEA4A, 0xEA4B, 0xEA4C, 0xEA4D, 0xEA4E, 0xEA4F, 0xEA50, 0xEA51 }, - /* 0x13E */ { 0xEA52, 0xEA53, 0xEA54, 0xEA55, 0xEA56, 0xEA57, 0xEA58, 0xEA59, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x13F */ { 0xEA5A, 0xEA5B, 0xEA5C, 0xEA5D, 0xEA5E, 0xEA5F, 0xEA60, 0xEA61, 0xEA62, 0xEA63, 0xEA64, 0xEA65, 0xEA66, 0xEA67, 0xEA68, 0xEA69 }, - /* 0x140 */ { 0xEA6A, 0xEA6B, 0xEA6C, 0xEA6D, 0xEA6E, 0xEA6F, 0xEA70, 0xEA71, 0xEA72, 0xEA73, 0xEA74, 0xEA75, 0xEA76, 0xEA77, 0xEA78, 0xEA79 }, - /* 0x141 */ { 0xEA7A, 0xEA7B, 0xEA7C, 0xEA7D, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x142 */ { 0xEA7E, 0xEA7F, 0xEA80, 0xEA81, 0xEA82, 0xEA83, 0xEA84, 0xEA85, 0xEA86, 0xEA87, 0xEA88, 0xFFFF, 0xEA89, 0xEA8A, 0xEA8B, 0xEA8C }, - /* 0x143 */ { 0xEA8D, 0xEA8E, 0xEA8F, 0xEA90, 0xEA91, 0xEA92, 0xEA93, 0xEA94, 0xEA95, 0xEA96, 0xEA97, 0xFFFF, 0xEA98, 0xEA99, 0xEA9A, 0xEA9B }, - /* 0x144 */ { 0xEA9C, 0xEA9D, 0xEA9E, 0xFFFF, 0xEA9F, 0xEAA0, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x145 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xADDC, 0x0000, 0xADE6 }, - /* 0x146 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xADE6, 0xAD01, 0xADDC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAD09 }, - /* 0x147 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADDC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x148 */ { 0xEAA1, 0xEAA2, 0xEAA3, 0xEAA4, 0xEAA5, 
0xEAA6, 0xEAA7, 0xEAA8, 0xEAA9, 0xEAAA, 0xEAAB, 0xEAAC, 0xEAAD, 0xEAAE, 0xEAAF, 0xEAB0 }, - /* 0x149 */ { 0xEAB1, 0xEAB2, 0xEAB3, 0xEAB4, 0xEAB5, 0xEAB6, 0xEAB7, 0xEAB8, 0xEAB9, 0xEABA, 0xEABB, 0xEABC, 0xEABD, 0xEABE, 0xEABF, 0xEAC0 }, - /* 0x14A */ { 0xEAC1, 0xEAC2, 0xEAC3, 0xEAC4, 0xEAC5, 0xEAC6, 0xEAC7, 0xEAC8, 0xEAC9, 0xEACA, 0xEACB, 0xEACC, 0xEACD, 0xEACE, 0xEACF, 0xEAD0 }, - /* 0x14B */ { 0xEAD1, 0xEAD2, 0xEAD3, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x14C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x14D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xADE6, 0xADE6, 0x0000, 0xFFFF, 0xFFFF }, - /* 0x14E */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xADDC, 0xADDC, 0xADDC }, - /* 0x14F */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADDC, 0xADDC, 0xADE6, 0xADE6, 0xADE6, 0xADDC, 0xADE6, 0xADDC, 0xADDC, 0xADDC }, - /* 0x150 */ { 0xADDC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x151 */ { 0x0000, 0x0000, 0xADE6, 0xADDC, 0xADE6, 0xADDC, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x152 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF }, - /* 0x153 */ { 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAD09 }, - /* 0x154 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF000, 0x0000, 0xF003, 0x0000, 0x0000, 0x0000 }, - /* 0x155 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF006, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x156 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x157 */ { 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x158 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF009, 0xF00C }, - /* 0x159 */ { 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD09, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x15A */ { 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x15B */ { 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x15C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x15D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD07, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x15E */ { 0x0000, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xAD07, 0xAD07, 0x0000, 0x0000, 0x0000 }, - /* 0x15F */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xF00F, 0xF012, 0xAD09, 0xFFFF, 0xFFFF }, - /* 0x160 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 
0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x161 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x162 */ { 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x163 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0xADE6, 0x0000 }, - /* 0x164 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF015, 0xF018, 0x0000, 0xF01B, 0x0000 }, - /* 0x165 */ { 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x166 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xF01E, 0xF021, 0x0000, 0x0000, 0x0000, 0xAD09 }, - /* 0x167 */ { 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x168 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09 }, - /* 0x169 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x16A */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x16B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x16C */ { 0xEAD4, 0xEAD5, 0xEAD6, 0xEAD7, 0xEAD8, 0xEAD9, 0xEADA, 0xEADB, 0xEADC, 0xEADD, 0xEADE, 0xEADF, 0xEAE0, 0xEAE1, 0xEAE2, 0xEAE3 }, - /* 0x16D */ { 0xEAE4, 0xEAE5, 0xEAE6, 0xEAE7, 0xEAE8, 0xEAE9, 0xEAEA, 0xEAEB, 0xEAEC, 0xEAED, 0xEAEE, 0xEAEF, 0xEAF0, 0xEAF1, 0xEAF2, 0xEAF3 }, - /* 0x16E */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0xF024, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xAD09, 0xAD09, 0x0000 }, - /* 0x16F */ { 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x170 */ { 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x171 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x172 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x173 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x174 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09 }, - /* 0x175 */ { 0x0000, 0x0000, 0xAD07, 0x0000, 0xAD09, 0xAD09, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x176 */ { 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x177 */ { 0x0000, 0xAD09, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x178 */ { 0xAD01, 0xAD01, 0xAD01, 0xAD01, 0xAD01, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 
0x179 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x17A */ { 0xEAF4, 0xEAF5, 0xEAF6, 0xEAF7, 0xEAF8, 0xEAF9, 0xEAFA, 0xEAFB, 0xEAFC, 0xEAFD, 0xEAFE, 0xEAFF, 0xEB00, 0xEB01, 0xEB02, 0xEB03 }, - /* 0x17B */ { 0xEB04, 0xEB05, 0xEB06, 0xEB07, 0xEB08, 0xEB09, 0xEB0A, 0xEB0B, 0xEB0C, 0xEB0D, 0xEB0E, 0xEB0F, 0xEB10, 0xEB11, 0xEB12, 0xEB13 }, - /* 0x17C */ { 0xAD06, 0xAD06, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x17D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xAD01, 0x0000 }, - /* 0x17E */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF027, 0xF02A }, - /* 0x17F */ { 0xF02D, 0xF031, 0xF035, 0xF039, 0xF03D, 0xADD8, 0xADD8, 0xAD01, 0xAD01, 0xAD01, 0x0000, 0x0000, 0x0000, 0xADE2, 0xADD8, 0xADD8 }, - /* 0x180 */ { 0xADD8, 0xADD8, 0xADD8, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC }, - /* 0x181 */ { 0xADDC, 0xADDC, 0xADDC, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADDC, 0xADDC, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x182 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000 }, - /* 0x183 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF041, 0xF044, 0xF047, 0xF04B, 0xF04F }, - /* 0x184 */ { 0xF053, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x185 */ { 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x186 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, - /* 0x187 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, - /* 0x188 */ { 0xADE6, 0xADE6, 0xFFFF, 0xADE6, 0xADE6, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x189 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xADE6 }, - /* 0x18A */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF }, - /* 0x18B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xFFFF }, - /* 0x18C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, - /* 0x18D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE8, 0xADE8, 0xADDC, 0xADE6 }, - /* 0x18E */ { 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x18F */ { 0xEB14, 0xEB15, 0xEB16, 0xEB17, 0xEB18, 0xEB19, 0xEB1A, 0xEB1B, 0xEB1C, 0xEB1D, 0xEB1E, 0xEB1F, 0xEB20, 0xEB21, 0xEB22, 0xEB23 }, - /* 0x190 */ { 0xEB24, 0xEB25, 0xEB26, 0xEB27, 0xEB28, 0xEB29, 0xEB2A, 0xEB2B, 0xEB2C, 0xEB2D, 0xEB2E, 0xEB2F, 0xEB30, 0xEB31, 0xEB32, 0xEB33 }, - /* 0x191 */ { 0xEB34, 0xEB35, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, - /* 0x192 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xAD07, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, - /* 0x193 */ { 0x4E3D, 0x4E38, 0x4E41, 0xE336, 0x4F60, 0x4FAE, 0x4FBB, 0x5002, 0x507A, 0x5099, 0x50E7, 0x50CF, 0x349E, 0xE337, 0x514D, 0x5154 }, - /* 0x194 */ { 0x5164, 0x5177, 0xE338, 0x34B9, 0x5167, 0x518D, 0xE339, 0x5197, 0x51A4, 0x4ECC, 0x51AC, 0x51B5, 0xE33A, 0x51F5, 0x5203, 0x34DF }, - /* 0x195 */ { 0x523B, 0x5246, 0x5272, 0x5277, 0x3515, 0x52C7, 0x52C9, 0x52E4, 0x52FA, 0x5305, 0x5306, 0x5317, 0x5349, 0x5351, 0x535A, 0x5373 }, - /* 0x196 */ { 0x537D, 0x537F, 0x537F, 0x537F, 0xE33B, 0x7070, 0x53CA, 0x53DF, 0xE33C, 0x53EB, 0x53F1, 0x5406, 0x549E, 0x5438, 0x5448, 0x5468 }, - /* 0x197 */ { 0x54A2, 0x54F6, 0x5510, 0x5553, 0x5563, 0x5584, 0x5584, 0x5599, 0x55AB, 0x55B3, 0x55C2, 0x5716, 0x5606, 0x5717, 0x5651, 0x5674 }, - /* 0x198 */ { 0x5207, 0x58EE, 0x57CE, 0x57F4, 0x580D, 0x578B, 0x5832, 0x5831, 0x58AC, 0xE33D, 0x58F2, 0x58F7, 0x5906, 0x591A, 0x5922, 0x5962 }, - /* 0x199 */ { 0xE33E, 0xE33F, 0x59EC, 0x5A1B, 0x5A27, 0x59D8, 0x5A66, 0x36EE, 0x36FC, 0x5B08, 0x5B3E, 0x5B3E, 0xE340, 0x5BC3, 0x5BD8, 0x5BE7 }, - /* 0x19A */ { 0x5BF3, 0xE341, 0x5BFF, 0x5C06, 0x5F53, 0x5C22, 0x3781, 0x5C60, 0x5C6E, 0x5CC0, 0x5C8D, 0xE342, 0x5D43, 0xE343, 0x5D6E, 0x5D6B }, - /* 0x19B */ { 0x5D7C, 0x5DE1, 0x5DE2, 0x382F, 0x5DFD, 0x5E28, 0x5E3D, 0x5E69, 0x3862, 0xE344, 0x387C, 0x5EB0, 0x5EB3, 0x5EB6, 0x5ECA, 0xE345 }, - /* 0x19C */ { 0x5EFE, 0xE346, 0xE347, 0x8201, 0x5F22, 0x5F22, 0x38C7, 0xE348, 0xE349, 0x5F62, 0x5F6B, 0x38E3, 0x5F9A, 0x5FCD, 0x5FD7, 0x5FF9 }, - /* 0x19D */ { 0x6081, 0x393A, 0x391C, 0x6094, 0xE34A, 0x60C7, 0x6148, 0x614C, 0x614E, 0x614C, 0x617A, 0x618E, 0x61B2, 0x61A4, 0x61AF, 0x61DE }, - /* 0x19E */ { 0x61F2, 0x61F6, 0x6210, 0x621B, 0x625D, 0x62B1, 0x62D4, 0x6350, 0xE34B, 0x633D, 0x62FC, 0x6368, 0x6383, 0x63E4, 0xE34C, 0x6422 }, - /* 0x19F */ { 0x63C5, 0x63A9, 0x3A2E, 0x6469, 0x647E, 0x649D, 0x6477, 0x3A6C, 0x654F, 0x656C, 0xE34D, 0x65E3, 0x66F8, 0x6649, 0x3B19, 0x6691 }, - /* 0x1A0 */ { 0x3B08, 0x3AE4, 0x5192, 0x5195, 0x6700, 0x669C, 0x80AD, 0x43D9, 0x6717, 0x671B, 0x6721, 0x675E, 0x6753, 0xE34E, 0x3B49, 0x67FA }, - /* 0x1A1 */ { 0x6785, 0x6852, 0x6885, 0xE34F, 0x688E, 0x681F, 0x6914, 0x3B9D, 0x6942, 0x69A3, 0x69EA, 0x6AA8, 0xE350, 0x6ADB, 0x3C18, 0x6B21 }, - /* 0x1A2 */ { 0xE351, 0x6B54, 0x3C4E, 0x6B72, 0x6B9F, 0x6BBA, 0x6BBB, 0xE352, 0xE353, 0xE354, 0x6C4E, 0xE355, 0x6CBF, 0x6CCD, 0x6C67, 0x6D16 }, - /* 0x1A3 */ { 0x6D3E, 0x6D77, 0x6D41, 0x6D69, 0x6D78, 0x6D85, 0xE356, 0x6D34, 0x6E2F, 0x6E6E, 0x3D33, 0x6ECB, 0x6EC7, 0xE357, 0x6DF9, 0x6F6E }, - /* 0x1A4 */ { 0xE358, 0xE359, 0x6FC6, 0x7039, 0x701E, 0x701B, 0x3D96, 0x704A, 0x707D, 0x7077, 0x70AD, 0xE35A, 0x7145, 0xE35B, 0x719C, 0xE35C }, - /* 0x1A5 */ { 0x7228, 0x7235, 0x7250, 0xE35D, 0x7280, 0x7295, 0xE35E, 0xE35F, 0x737A, 0x738B, 0x3EAC, 0x73A5, 0x3EB8, 0x3EB8, 0x7447, 0x745C }, - /* 0x1A6 */ { 0x7471, 0x7485, 0x74CA, 0x3F1B, 0x7524, 0xE360, 0x753E, 0xE361, 0x7570, 0xE362, 0x7610, 0xE363, 0xE364, 0xE365, 0x3FFC, 0x4008 }, - /* 0x1A7 */ { 0x76F4, 0xE366, 0xE367, 0xE368, 0xE369, 0x771E, 0x771F, 0x771F, 0x774A, 0x4039, 0x778B, 0x4046, 0x4096, 0xE36A, 0x784E, 0x788C }, - /* 0x1A8 */ { 0x78CC, 0x40E3, 0xE36B, 0x7956, 0xE36C, 0xE36D, 0x798F, 0x79EB, 0x412F, 0x7A40, 0x7A4A, 0x7A4F, 0xE36E, 0xE36F, 0xE370, 0x7AEE }, - /* 0x1A9 */ { 0x4202, 0xE371, 0x7BC6, 0x7BC9, 0x4227, 0xE372, 0x7CD2, 0x42A0, 0x7CE8, 0x7CE3, 0x7D00, 
0xE373, 0x7D63, 0x4301, 0x7DC7, 0x7E02 }, - /* 0x1AA */ { 0x7E45, 0x4334, 0xE374, 0xE375, 0x4359, 0xE376, 0x7F7A, 0xE377, 0x7F95, 0x7FFA, 0x8005, 0xE378, 0xE379, 0x8060, 0xE37A, 0x8070 }, - /* 0x1AB */ { 0xE37B, 0x43D5, 0x80B2, 0x8103, 0x440B, 0x813E, 0x5AB5, 0xE37C, 0xE37D, 0xE37E, 0xE37F, 0x8201, 0x8204, 0x8F9E, 0x446B, 0x8291 }, - /* 0x1AC */ { 0x828B, 0x829D, 0x52B3, 0x82B1, 0x82B3, 0x82BD, 0x82E6, 0xE380, 0x82E5, 0x831D, 0x8363, 0x83AD, 0x8323, 0x83BD, 0x83E7, 0x8457 }, - /* 0x1AD */ { 0x8353, 0x83CA, 0x83CC, 0x83DC, 0xE381, 0xE382, 0xE383, 0x452B, 0x84F1, 0x84F3, 0x8516, 0xE384, 0x8564, 0xE385, 0x455D, 0x4561 }, - /* 0x1AE */ { 0xE386, 0xE387, 0x456B, 0x8650, 0x865C, 0x8667, 0x8669, 0x86A9, 0x8688, 0x870E, 0x86E2, 0x8779, 0x8728, 0x876B, 0x8786, 0x45D7 }, - /* 0x1AF */ { 0x87E1, 0x8801, 0x45F9, 0x8860, 0x8863, 0xE388, 0x88D7, 0x88DE, 0x4635, 0x88FA, 0x34BB, 0xE389, 0xE38A, 0x46BE, 0x46C7, 0x8AA0 }, - /* 0x1B0 */ { 0x8AED, 0x8B8A, 0x8C55, 0xE38B, 0x8CAB, 0x8CC1, 0x8D1B, 0x8D77, 0xE38C, 0xE38D, 0x8DCB, 0x8DBC, 0x8DF0, 0xE38E, 0x8ED4, 0x8F38 }, - /* 0x1B1 */ { 0xE38F, 0xE390, 0x9094, 0x90F1, 0x9111, 0xE391, 0x911B, 0x9238, 0x92D7, 0x92D8, 0x927C, 0x93F9, 0x9415, 0xE392, 0x958B, 0x4995 }, - /* 0x1B2 */ { 0x95B7, 0xE393, 0x49E6, 0x96C3, 0x5DB2, 0x9723, 0xE394, 0xE395, 0x4A6E, 0x4A76, 0x97E0, 0xE396, 0x4AB2, 0xE397, 0x980B, 0x980B }, - /* 0x1B3 */ { 0x9829, 0xE398, 0x98E2, 0x4B33, 0x9929, 0x99A7, 0x99C2, 0x99FE, 0x4BCE, 0xE399, 0x9B12, 0x9C40, 0x9CFD, 0x4CCE, 0x4CED, 0x9D67 }, - /* 0x1B4 */ { 0xE39A, 0x4CF8, 0xE39B, 0xE39C, 0xE39D, 0x9EBB, 0x4D56, 0x9EF9, 0x9EFE, 0x9F05, 0x9F0F, 0x9F16, 0x9F3B, 0xE39E, 0xFFFF, 0xFFFF } + /* 0x13C */ { 0xEA37, 0xEA38, 0xEA39, 0xEA3A, 0xEA3B, 0xEA3C, 0xEA3D, 0xEA3E, 0xEA3F, 0xEA40, 0xEA41, 0xEA42, 0xEA43, 0xEA44, 0xEA45, 0xEA46 }, + /* 0x13D */ { 0xEA47, 0xEA48, 0xEA49, 0xEA4A, 0xEA4B, 0xEA4C, 0xEA4D, 0xEA4E, 0xEA4F, 0xEA50, 0xEA51, 0xEA52, 0xEA53, 0xEA54, 0xEA55, 0xEA56 }, + /* 0x13E */ { 0xEA57, 0xEA58, 0xEA59, 0xEA5A, 0xEA5B, 0xEA5C, 0xEA5D, 0xEA5E, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x13F */ { 0xEA5F, 0xEA60, 0xEA61, 0xEA62, 0xEA63, 0xEA64, 0xEA65, 0xEA66, 0xEA67, 0xEA68, 0xEA69, 0xEA6A, 0xEA6B, 0xEA6C, 0xEA6D, 0xEA6E }, + /* 0x140 */ { 0xEA6F, 0xEA70, 0xEA71, 0xEA72, 0xEA73, 0xEA74, 0xEA75, 0xEA76, 0xEA77, 0xEA78, 0xEA79, 0xEA7A, 0xEA7B, 0xEA7C, 0xEA7D, 0xEA7E }, + /* 0x141 */ { 0xEA7F, 0xEA80, 0xEA81, 0xEA82, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x142 */ { 0xEA83, 0xEA84, 0xEA85, 0xEA86, 0xEA87, 0xEA88, 0xEA89, 0xEA8A, 0xEA8B, 0xEA8C, 0xEA8D, 0xFFFF, 0xEA8E, 0xEA8F, 0xEA90, 0xEA91 }, + /* 0x143 */ { 0xEA92, 0xEA93, 0xEA94, 0xEA95, 0xEA96, 0xEA97, 0xEA98, 0xEA99, 0xEA9A, 0xEA9B, 0xEA9C, 0xFFFF, 0xEA9D, 0xEA9E, 0xEA9F, 0xEAA0 }, + /* 0x144 */ { 0xEAA1, 0xEAA2, 0xEAA3, 0xFFFF, 0xEAA4, 0xEAA5, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x145 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x146 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xF003, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x147 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xADDC, 0x0000, 0xADE6 }, + /* 0x148 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xADE6, 0xAD01, 0xADDC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAD09 }, + /* 
0x149 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADDC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x14A */ { 0xEAA6, 0xEAA7, 0xEAA8, 0xEAA9, 0xEAAA, 0xEAAB, 0xEAAC, 0xEAAD, 0xEAAE, 0xEAAF, 0xEAB0, 0xEAB1, 0xEAB2, 0xEAB3, 0xEAB4, 0xEAB5 }, + /* 0x14B */ { 0xEAB6, 0xEAB7, 0xEAB8, 0xEAB9, 0xEABA, 0xEABB, 0xEABC, 0xEABD, 0xEABE, 0xEABF, 0xEAC0, 0xEAC1, 0xEAC2, 0xEAC3, 0xEAC4, 0xEAC5 }, + /* 0x14C */ { 0xEAC6, 0xEAC7, 0xEAC8, 0xEAC9, 0xEACA, 0xEACB, 0xEACC, 0xEACD, 0xEACE, 0xEACF, 0xEAD0, 0xEAD1, 0xEAD2, 0xEAD3, 0xEAD4, 0xEAD5 }, + /* 0x14D */ { 0xEAD6, 0xEAD7, 0xEAD8, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x14E */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x14F */ { 0xEAD9, 0xEADA, 0xEADB, 0xEADC, 0xEADD, 0xEADE, 0xEADF, 0xEAE0, 0xEAE1, 0xEAE2, 0xEAE3, 0xEAE4, 0xEAE5, 0xEAE6, 0xEAE7, 0xEAE8 }, + /* 0x150 */ { 0xEAE9, 0xEAEA, 0xEAEB, 0xEAEC, 0xEAED, 0xEAEE, 0xFFFF, 0xFFFF, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000 }, + /* 0x151 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xADE6, 0xADE6, 0x0000, 0xFFFF, 0xFFFF }, + /* 0x152 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xADDC, 0xADDC, 0xADDC }, + /* 0x153 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADDC, 0xADDC, 0xADE6, 0xADE6, 0xADE6, 0xADDC, 0xADE6, 0xADDC, 0xADDC, 0xADDC }, + /* 0x154 */ { 0xADDC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x155 */ { 0x0000, 0x0000, 0xADE6, 0xADDC, 0xADE6, 0xADDC, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x156 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF }, + /* 0x157 */ { 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xAD09 }, + /* 0x158 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF006, 0x0000, 0xF009, 0x0000, 0x0000, 0x0000 }, + /* 0x159 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF00C, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x15A */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x15B */ { 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x15C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF00F, 0xF012 }, + /* 0x15D */ { 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD09, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x15E */ { 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x15F */ { 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x160 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x161 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0xAD07, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x162 */ { 0x0000, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xAD07, 0xAD07, 0x0000, 0x0000, 0x0000 }, + /* 0x163 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xF015, 0xF018, 0xAD09, 0xFFFF, 0xFFFF }, + /* 0x164 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x165 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x166 */ { 0x0000, 0x0000, 0x0000, 0xF01B, 0x0000, 0xF01E, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xF021, 0xFFFF }, + /* 0x167 */ { 0x0000, 0xF024, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x168 */ { 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xF027, 0xFFFF, 0xF02A, 0xF02D, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0xAD09, 0xAD09 }, + /* 0x169 */ { 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x16A */ { 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x16B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0xADE6, 0x0000 }, + /* 0x16C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF030, 0xF033, 0x0000, 0xF036, 0x0000 }, + /* 0x16D */ { 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x16E */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xF039, 0xF03C, 0x0000, 0x0000, 0x0000, 0xAD09 }, + /* 0x16F */ { 0xAD07, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x170 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09 }, + /* 0x171 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x172 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x173 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xAD07, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x174 */ { 0xEAEF, 0xEAF0, 0xEAF1, 0xEAF2, 0xEAF3, 0xEAF4, 0xEAF5, 0xEAF6, 0xEAF7, 0xEAF8, 0xEAF9, 0xEAFA, 0xEAFB, 0xEAFC, 0xEAFD, 0xEAFE }, + /* 0x175 */ { 0xEAFF, 0xEB00, 0xEB01, 0xEB02, 0xEB03, 0xEB04, 0xEB05, 0xEB06, 0xEB07, 0xEB08, 0xEB09, 0xEB0A, 0xEB0B, 0xEB0C, 0xEB0D, 0xEB0E }, + /* 0x176 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0xF03F, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xAD09, 0xAD09, 0x0000 }, + /* 0x177 */ { 0x0000, 0x0000, 0x0000, 0xAD07, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x178 */ { 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x179 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x17A */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x17B */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x17C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09 }, + /* 0x17D */ { 0x0000, 0x0000, 0xAD07, 0x0000, 0xAD09, 0xAD09, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x17E */ { 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x17F */ { 0x0000, 0xAD09, 0xAD09, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x180 */ { 0x0000, 0xF042, 0xF045, 0xF048, 0xF04B, 0xF04E, 0xF051, 0xF055, 0xF059, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xAD09 }, + /* 0x181 */ { 0xAD01, 0xAD01, 0xAD01, 0xAD01, 0xAD01, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x182 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x183 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF05D, 0xF060, 0xF063, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x184 */ { 0xEB0F, 0xEB10, 0xEB11, 0xEB12, 0xEB13, 0xEB14, 0xEB15, 0xEB16, 0xEB17, 0xEB18, 0xEB19, 0xEB1A, 0xEB1B, 0xEB1C, 0xEB1D, 0xEB1E }, + /* 0x185 */ { 0xEB1F, 0xEB20, 0xEB21, 0xEB22, 0xEB23, 0xEB24, 0xEB25, 0xEB26, 0xEB27, 0xEB28, 0xEB29, 0xEB2A, 0xEB2B, 0xEB2C, 0xEB2D, 0xEB2E }, + /* 0x186 */ { 0xAD06, 0xAD06, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x187 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xAD01, 0x0000 }, + /* 0x188 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF067, 0xF06A }, + /* 0x189 */ { 0xF06D, 0xF071, 0xF075, 0xF079, 0xF07D, 0xADD8, 0xADD8, 0xAD01, 0xAD01, 0xAD01, 0x0000, 0x0000, 0x0000, 0xADE2, 0xADD8, 0xADD8 }, + /* 0x18A */ { 0xADD8, 0xADD8, 0xADD8, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC }, + /* 0x18B */ { 0xADDC, 0xADDC, 0xADDC, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADDC, 0xADDC, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x18C */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000 }, + /* 0x18D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xF081, 0xF084, 0xF087, 0xF08B, 0xF08F }, + /* 0x18E */ { 0xF093, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x18F */ { 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x190 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, + /* 0x191 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, + /* 
0x192 */ { 0xADE6, 0xADE6, 0xFFFF, 0xADE6, 0xADE6, 0xFFFF, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x193 */ { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xADE6 }, + /* 0x194 */ { 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF }, + /* 0x195 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xFFFF }, + /* 0x196 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6 }, + /* 0x197 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE8, 0xADE8, 0xADDC, 0xADE6 }, + /* 0x198 */ { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADDC }, + /* 0x199 */ { 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xADDC, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x19A */ { 0xEB2F, 0xEB30, 0xEB31, 0xEB32, 0xEB33, 0xEB34, 0xEB35, 0xEB36, 0xEB37, 0xEB38, 0xEB39, 0xEB3A, 0xEB3B, 0xEB3C, 0xEB3D, 0xEB3E }, + /* 0x19B */ { 0xEB3F, 0xEB40, 0xEB41, 0xEB42, 0xEB43, 0xEB44, 0xEB45, 0xEB46, 0xEB47, 0xEB48, 0xEB49, 0xEB4A, 0xEB4B, 0xEB4C, 0xEB4D, 0xEB4E }, + /* 0x19C */ { 0xEB4F, 0xEB50, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + /* 0x19D */ { 0x0000, 0x0000, 0x0000, 0x0000, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xADE6, 0xAD07, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }, + /* 0x19E */ { 0x4E3D, 0x4E38, 0x4E41, 0xE351, 0x4F60, 0x4FAE, 0x4FBB, 0x5002, 0x507A, 0x5099, 0x50E7, 0x50CF, 0x349E, 0xE352, 0x514D, 0x5154 }, + /* 0x19F */ { 0x5164, 0x5177, 0xE353, 0x34B9, 0x5167, 0x518D, 0xE354, 0x5197, 0x51A4, 0x4ECC, 0x51AC, 0x51B5, 0xE355, 0x51F5, 0x5203, 0x34DF }, + /* 0x1A0 */ { 0x523B, 0x5246, 0x5272, 0x5277, 0x3515, 0x52C7, 0x52C9, 0x52E4, 0x52FA, 0x5305, 0x5306, 0x5317, 0x5349, 0x5351, 0x535A, 0x5373 }, + /* 0x1A1 */ { 0x537D, 0x537F, 0x537F, 0x537F, 0xE356, 0x7070, 0x53CA, 0x53DF, 0xE357, 0x53EB, 0x53F1, 0x5406, 0x549E, 0x5438, 0x5448, 0x5468 }, + /* 0x1A2 */ { 0x54A2, 0x54F6, 0x5510, 0x5553, 0x5563, 0x5584, 0x5584, 0x5599, 0x55AB, 0x55B3, 0x55C2, 0x5716, 0x5606, 0x5717, 0x5651, 0x5674 }, + /* 0x1A3 */ { 0x5207, 0x58EE, 0x57CE, 0x57F4, 0x580D, 0x578B, 0x5832, 0x5831, 0x58AC, 0xE358, 0x58F2, 0x58F7, 0x5906, 0x591A, 0x5922, 0x5962 }, + /* 0x1A4 */ { 0xE359, 0xE35A, 0x59EC, 0x5A1B, 0x5A27, 0x59D8, 0x5A66, 0x36EE, 0x36FC, 0x5B08, 0x5B3E, 0x5B3E, 0xE35B, 0x5BC3, 0x5BD8, 0x5BE7 }, + /* 0x1A5 */ { 0x5BF3, 0xE35C, 0x5BFF, 0x5C06, 0x5F53, 0x5C22, 0x3781, 0x5C60, 0x5C6E, 0x5CC0, 0x5C8D, 0xE35D, 0x5D43, 0xE35E, 0x5D6E, 0x5D6B }, + /* 0x1A6 */ { 0x5D7C, 0x5DE1, 0x5DE2, 0x382F, 0x5DFD, 0x5E28, 0x5E3D, 0x5E69, 0x3862, 0xE35F, 0x387C, 0x5EB0, 0x5EB3, 0x5EB6, 0x5ECA, 0xE360 }, + /* 0x1A7 */ { 0x5EFE, 0xE361, 0xE362, 0x8201, 0x5F22, 0x5F22, 0x38C7, 0xE363, 0xE364, 0x5F62, 0x5F6B, 0x38E3, 0x5F9A, 0x5FCD, 0x5FD7, 0x5FF9 }, + /* 0x1A8 */ { 0x6081, 0x393A, 0x391C, 0x6094, 0xE365, 0x60C7, 0x6148, 0x614C, 0x614E, 0x614C, 0x617A, 0x618E, 0x61B2, 0x61A4, 0x61AF, 0x61DE }, + /* 0x1A9 */ { 0x61F2, 0x61F6, 0x6210, 0x621B, 0x625D, 0x62B1, 0x62D4, 0x6350, 0xE366, 0x633D, 0x62FC, 0x6368, 0x6383, 0x63E4, 0xE367, 0x6422 }, + /* 0x1AA */ { 0x63C5, 0x63A9, 0x3A2E, 0x6469, 0x647E, 
0x649D, 0x6477, 0x3A6C, 0x654F, 0x656C, 0xE368, 0x65E3, 0x66F8, 0x6649, 0x3B19, 0x6691 }, + /* 0x1AB */ { 0x3B08, 0x3AE4, 0x5192, 0x5195, 0x6700, 0x669C, 0x80AD, 0x43D9, 0x6717, 0x671B, 0x6721, 0x675E, 0x6753, 0xE369, 0x3B49, 0x67FA }, + /* 0x1AC */ { 0x6785, 0x6852, 0x6885, 0xE36A, 0x688E, 0x681F, 0x6914, 0x3B9D, 0x6942, 0x69A3, 0x69EA, 0x6AA8, 0xE36B, 0x6ADB, 0x3C18, 0x6B21 }, + /* 0x1AD */ { 0xE36C, 0x6B54, 0x3C4E, 0x6B72, 0x6B9F, 0x6BBA, 0x6BBB, 0xE36D, 0xE36E, 0xE36F, 0x6C4E, 0xE370, 0x6CBF, 0x6CCD, 0x6C67, 0x6D16 }, + /* 0x1AE */ { 0x6D3E, 0x6D77, 0x6D41, 0x6D69, 0x6D78, 0x6D85, 0xE371, 0x6D34, 0x6E2F, 0x6E6E, 0x3D33, 0x6ECB, 0x6EC7, 0xE372, 0x6DF9, 0x6F6E }, + /* 0x1AF */ { 0xE373, 0xE374, 0x6FC6, 0x7039, 0x701E, 0x701B, 0x3D96, 0x704A, 0x707D, 0x7077, 0x70AD, 0xE375, 0x7145, 0xE376, 0x719C, 0xE377 }, + /* 0x1B0 */ { 0x7228, 0x7235, 0x7250, 0xE378, 0x7280, 0x7295, 0xE379, 0xE37A, 0x737A, 0x738B, 0x3EAC, 0x73A5, 0x3EB8, 0x3EB8, 0x7447, 0x745C }, + /* 0x1B1 */ { 0x7471, 0x7485, 0x74CA, 0x3F1B, 0x7524, 0xE37B, 0x753E, 0xE37C, 0x7570, 0xE37D, 0x7610, 0xE37E, 0xE37F, 0xE380, 0x3FFC, 0x4008 }, + /* 0x1B2 */ { 0x76F4, 0xE381, 0xE382, 0xE383, 0xE384, 0x771E, 0x771F, 0x771F, 0x774A, 0x4039, 0x778B, 0x4046, 0x4096, 0xE385, 0x784E, 0x788C }, + /* 0x1B3 */ { 0x78CC, 0x40E3, 0xE386, 0x7956, 0xE387, 0xE388, 0x798F, 0x79EB, 0x412F, 0x7A40, 0x7A4A, 0x7A4F, 0xE389, 0xE38A, 0xE38B, 0x7AEE }, + /* 0x1B4 */ { 0x4202, 0xE38C, 0x7BC6, 0x7BC9, 0x4227, 0xE38D, 0x7CD2, 0x42A0, 0x7CE8, 0x7CE3, 0x7D00, 0xE38E, 0x7D63, 0x4301, 0x7DC7, 0x7E02 }, + /* 0x1B5 */ { 0x7E45, 0x4334, 0xE38F, 0xE390, 0x4359, 0xE391, 0x7F7A, 0xE392, 0x7F95, 0x7FFA, 0x8005, 0xE393, 0xE394, 0x8060, 0xE395, 0x8070 }, + /* 0x1B6 */ { 0xE396, 0x43D5, 0x80B2, 0x8103, 0x440B, 0x813E, 0x5AB5, 0xE397, 0xE398, 0xE399, 0xE39A, 0x8201, 0x8204, 0x8F9E, 0x446B, 0x8291 }, + /* 0x1B7 */ { 0x828B, 0x829D, 0x52B3, 0x82B1, 0x82B3, 0x82BD, 0x82E6, 0xE39B, 0x82E5, 0x831D, 0x8363, 0x83AD, 0x8323, 0x83BD, 0x83E7, 0x8457 }, + /* 0x1B8 */ { 0x8353, 0x83CA, 0x83CC, 0x83DC, 0xE39C, 0xE39D, 0xE39E, 0x452B, 0x84F1, 0x84F3, 0x8516, 0xE39F, 0x8564, 0xE3A0, 0x455D, 0x4561 }, + /* 0x1B9 */ { 0xE3A1, 0xE3A2, 0x456B, 0x8650, 0x865C, 0x8667, 0x8669, 0x86A9, 0x8688, 0x870E, 0x86E2, 0x8779, 0x8728, 0x876B, 0x8786, 0x45D7 }, + /* 0x1BA */ { 0x87E1, 0x8801, 0x45F9, 0x8860, 0x8863, 0xE3A3, 0x88D7, 0x88DE, 0x4635, 0x88FA, 0x34BB, 0xE3A4, 0xE3A5, 0x46BE, 0x46C7, 0x8AA0 }, + /* 0x1BB */ { 0x8AED, 0x8B8A, 0x8C55, 0xE3A6, 0x8CAB, 0x8CC1, 0x8D1B, 0x8D77, 0xE3A7, 0xE3A8, 0x8DCB, 0x8DBC, 0x8DF0, 0xE3A9, 0x8ED4, 0x8F38 }, + /* 0x1BC */ { 0xE3AA, 0xE3AB, 0x9094, 0x90F1, 0x9111, 0xE3AC, 0x911B, 0x9238, 0x92D7, 0x92D8, 0x927C, 0x93F9, 0x9415, 0xE3AD, 0x958B, 0x4995 }, + /* 0x1BD */ { 0x95B7, 0xE3AE, 0x49E6, 0x96C3, 0x5DB2, 0x9723, 0xE3AF, 0xE3B0, 0x4A6E, 0x4A76, 0x97E0, 0xE3B1, 0x4AB2, 0xE3B2, 0x980B, 0x980B }, + /* 0x1BE */ { 0x9829, 0xE3B3, 0x98E2, 0x4B33, 0x9929, 0x99A7, 0x99C2, 0x99FE, 0x4BCE, 0xE3B4, 0x9B12, 0x9C40, 0x9CFD, 0x4CCE, 0x4CED, 0x9D67 }, + /* 0x1BF */ { 0xE3B5, 0x4CF8, 0xE3B6, 0xE3B7, 0xE3B8, 0x9EBB, 0x4D56, 0x9EF9, 0x9EFE, 0x9F05, 0x9F0F, 0x9F16, 0x9F3B, 0xE3B9, 0xFFFF, 0xFFFF } }; -static uint16_t nfU16InvMasks[135] = { +static uint16_t nfU16InvMasks[139] = { /* 0x000 */ 0x7800, 0xFFE0, 0x4000, 0xFFFC, 0x8000, 0xF800, 0x6010, 0x0006, 0x0200, 0x0030, 0x7811, 0x003F, 0xFF80, 0x4011, 0x0004, 0xFFFE, /* 0x010 */ 0x01FC, 0x6011, 0xFF00, 0x3813, 0x38E7, 0x3C00, 0xFF7E, 0x2000, 0x0002, 0x007F, 0x9F9F, 0xFFF1, 0x000F, 0x0011, 0x0380, 0xD004, /* 0x020 */ 0xFFE3, 0x0001, 0xF000, 
0x0829, 0x0050, 0x0C00, 0xC200, 0xC280, 0x80C2, 0x00C2, 0x0080, 0xE000, 0xFC00, 0xFE00, 0xFFF0, 0xFFF2, - /* 0x030 */ 0xFFC0, 0x000E, 0xC000, 0x3800, 0x1C00, 0x0020, 0x000C, 0x0040, 0xDF40, 0x7F00, 0x8080, 0x0400, 0x001F, 0x7FF0, 0x07FF, 0x8181, + /* 0x030 */ 0xFFC0, 0x000E, 0xC000, 0x3800, 0x1C00, 0x0020, 0x000C, 0x0040, 0xDF40, 0x7F00, 0x8080, 0x0400, 0x001F, 0x7FC0, 0x07FF, 0x8181, /* 0x040 */ 0xFF81, 0x0780, 0xFFF8, 0x0007, 0x0003, 0x0008, 0xF080, 0x6000, 0x0303, 0xE303, 0xC1FF, 0x1000, 0x4800, 0x0078, 0x0070, 0x1FF0, - /* 0x050 */ 0x00F0, 0xE404, 0xF802, 0x02C0, 0x6E40, 0x07C8, 0x7000, 0x7C00, 0x0F00, 0x0110, 0x01C0, 0x00C0, 0x00F8, 0xE1FC, 0x01FF, 0x03F8, - /* 0x060 */ 0xDFF8, 0x4280, 0x1F7E, 0x1800, 0x7FF8, 0x0D80, 0x0090, 0x0300, 0x0100, 0x0480, 0x4B80, 0x0240, 0x7FFC, 0x1F00, 0x9010, 0xFFFB, - /* 0x070 */ 0xFFD8, 0xFF0F, 0x0180, 0x219B, 0x1400, 0x0010, 0x1840, 0x2020, 0x8400, 0x03A0, 0x3000, 0xF81F, 0x9080, 0x0060, 0x0169, 0xF508, - /* 0x080 */ 0x157B, 0x5569, 0x0869, 0xA108, 0x0411, 0x3FC0, 0xFFFD + /* 0x050 */ 0x00F0, 0x7FF0, 0xE404, 0xF802, 0x02C0, 0x6E40, 0x07C8, 0x7000, 0x7C00, 0x0F00, 0x0110, 0x01C0, 0x00C0, 0x00F8, 0xE1FC, 0x01FF, + /* 0x060 */ 0x03F8, 0x3FC0, 0xDFF8, 0x4280, 0x1F7E, 0xFFF9, 0x1800, 0x7FF8, 0x0D80, 0x0090, 0x0300, 0x0100, 0x0480, 0x4B80, 0x0240, 0x7FFC, + /* 0x070 */ 0x1F00, 0x7FFF, 0x9010, 0xFFFB, 0xFFD8, 0xFF0F, 0x0180, 0x219B, 0x1400, 0x0010, 0x1840, 0x2020, 0x8400, 0x03A0, 0x3000, 0xF81F, + /* 0x080 */ 0x9080, 0x0060, 0x0169, 0xF508, 0x157B, 0x5569, 0x0869, 0xA108, 0x0411, 0x3F80, 0xFFFD }; static uint16_t nfU16Seq2[773][2] = { /* 0x000 */ {0x0041, 0x0300}, {0x0041, 0x0301}, {0x0041, 0x0302}, {0x0041, 0x0303}, {0x0041, 0x0308}, {0x0041, 0x030A}, @@ -1064,7 +1081,7 @@ static uint16_t nfU16SeqMisc[198] = { /* 0x0B0 */ 0x0300, 0x0345, 0x0004, 0x03A9, 0x0313, 0x0301, 0x0345, 0x0004, 0x03A9, 0x0314, 0x0301, 0x0345, 0x0004, 0x03A9, 0x0313, 0x0342, /* 0x0C0 */ 0x0345, 0x0004, 0x03A9, 0x0314, 0x0342, 0x0345 }; -static int32_t nfU32Char[927] = { +static int32_t nfU32Char[954] = { /* 0x000 */ 0x00501, 0x00503, 0x00505, 0x00507, 0x00509, 0x0050B, 0x0050D, 0x0050F, /* 0x008 */ 0x00511, 0x00513, 0x00515, 0x00517, 0x00519, 0x0051B, 0x0051D, 0x0051F, /* 0x010 */ 0x00521, 0x00523, 0x00525, 0x00527, 0x00529, 0x0052B, 0x0052D, 0x0052F, @@ -1079,121 +1096,133 @@ static int32_t nfU32Char[927] = { /* 0x058 */ 0x02D1A, 0x02D1B, 0x02D1C, 0x02D1D, 0x02D1E, 0x02D1F, 0x02D20, 0x02D21, /* 0x060 */ 0x02D22, 0x02D23, 0x02D24, 0x02D25, 0x02D27, 0x02D2D, 0x013F0, 0x013F1, /* 0x068 */ 0x013F2, 0x013F3, 0x013F4, 0x013F5, 0x00432, 0x00434, 0x0043E, 0x00441, - /* 0x070 */ 0x00442, 0x00442, 0x0044A, 0x00463, 0x0A64B, 0x010D0, 0x010D1, 0x010D2, - /* 0x078 */ 0x010D3, 0x010D4, 0x010D5, 0x010D6, 0x010D7, 0x010D8, 0x010D9, 0x010DA, - /* 0x080 */ 0x010DB, 0x010DC, 0x010DD, 0x010DE, 0x010DF, 0x010E0, 0x010E1, 0x010E2, - /* 0x088 */ 0x010E3, 0x010E4, 0x010E5, 0x010E6, 0x010E7, 0x010E8, 0x010E9, 0x010EA, - /* 0x090 */ 0x010EB, 0x010EC, 0x010ED, 0x010EE, 0x010EF, 0x010F0, 0x010F1, 0x010F2, - /* 0x098 */ 0x010F3, 0x010F4, 0x010F5, 0x010F6, 0x010F7, 0x010F8, 0x010F9, 0x010FA, - /* 0x0A0 */ 0x010FD, 0x010FE, 0x010FF, 0x01EFB, 0x01EFD, 0x01EFF, 0x0214E, 0x02170, - /* 0x0A8 */ 0x02171, 0x02172, 0x02173, 0x02174, 0x02175, 0x02176, 0x02177, 0x02178, - /* 0x0B0 */ 0x02179, 0x0217A, 0x0217B, 0x0217C, 0x0217D, 0x0217E, 0x0217F, 0x02184, - /* 0x0B8 */ 0x024D0, 0x024D1, 0x024D2, 0x024D3, 0x024D4, 0x024D5, 0x024D6, 0x024D7, - /* 0x0C0 */ 0x024D8, 0x024D9, 0x024DA, 0x024DB, 0x024DC, 0x024DD, 0x024DE, 0x024DF, - 
/* 0x0C8 */ 0x024E0, 0x024E1, 0x024E2, 0x024E3, 0x024E4, 0x024E5, 0x024E6, 0x024E7, - /* 0x0D0 */ 0x024E8, 0x024E9, 0x02C30, 0x02C31, 0x02C32, 0x02C33, 0x02C34, 0x02C35, - /* 0x0D8 */ 0x02C36, 0x02C37, 0x02C38, 0x02C39, 0x02C3A, 0x02C3B, 0x02C3C, 0x02C3D, - /* 0x0E0 */ 0x02C3E, 0x02C3F, 0x02C40, 0x02C41, 0x02C42, 0x02C43, 0x02C44, 0x02C45, - /* 0x0E8 */ 0x02C46, 0x02C47, 0x02C48, 0x02C49, 0x02C4A, 0x02C4B, 0x02C4C, 0x02C4D, - /* 0x0F0 */ 0x02C4E, 0x02C4F, 0x02C50, 0x02C51, 0x02C52, 0x02C53, 0x02C54, 0x02C55, - /* 0x0F8 */ 0x02C56, 0x02C57, 0x02C58, 0x02C59, 0x02C5A, 0x02C5B, 0x02C5C, 0x02C5D, - /* 0x100 */ 0x02C5E, 0x02C5F, 0x02C61, 0x0026B, 0x01D7D, 0x0027D, 0x02C68, 0x02C6A, - /* 0x108 */ 0x02C6C, 0x00251, 0x00271, 0x00250, 0x00252, 0x02C73, 0x02C76, 0x0023F, - /* 0x110 */ 0x00240, 0x02C81, 0x02C83, 0x02C85, 0x02C87, 0x02C89, 0x02C8B, 0x02C8D, - /* 0x118 */ 0x02C8F, 0x02C91, 0x02C93, 0x02C95, 0x02C97, 0x02C99, 0x02C9B, 0x02C9D, - /* 0x120 */ 0x02C9F, 0x02CA1, 0x02CA3, 0x02CA5, 0x02CA7, 0x02CA9, 0x02CAB, 0x02CAD, - /* 0x128 */ 0x02CAF, 0x02CB1, 0x02CB3, 0x02CB5, 0x02CB7, 0x02CB9, 0x02CBB, 0x02CBD, - /* 0x130 */ 0x02CBF, 0x02CC1, 0x02CC3, 0x02CC5, 0x02CC7, 0x02CC9, 0x02CCB, 0x02CCD, - /* 0x138 */ 0x02CCF, 0x02CD1, 0x02CD3, 0x02CD5, 0x02CD7, 0x02CD9, 0x02CDB, 0x02CDD, - /* 0x140 */ 0x02CDF, 0x02CE1, 0x02CE3, 0x02CEC, 0x02CEE, 0x02CF3, 0x0A641, 0x0A643, - /* 0x148 */ 0x0A645, 0x0A647, 0x0A649, 0x0A64B, 0x0A64D, 0x0A64F, 0x0A651, 0x0A653, - /* 0x150 */ 0x0A655, 0x0A657, 0x0A659, 0x0A65B, 0x0A65D, 0x0A65F, 0x0A661, 0x0A663, - /* 0x158 */ 0x0A665, 0x0A667, 0x0A669, 0x0A66B, 0x0A66D, 0x0A681, 0x0A683, 0x0A685, - /* 0x160 */ 0x0A687, 0x0A689, 0x0A68B, 0x0A68D, 0x0A68F, 0x0A691, 0x0A693, 0x0A695, - /* 0x168 */ 0x0A697, 0x0A699, 0x0A69B, 0x0A723, 0x0A725, 0x0A727, 0x0A729, 0x0A72B, - /* 0x170 */ 0x0A72D, 0x0A72F, 0x0A733, 0x0A735, 0x0A737, 0x0A739, 0x0A73B, 0x0A73D, - /* 0x178 */ 0x0A73F, 0x0A741, 0x0A743, 0x0A745, 0x0A747, 0x0A749, 0x0A74B, 0x0A74D, - /* 0x180 */ 0x0A74F, 0x0A751, 0x0A753, 0x0A755, 0x0A757, 0x0A759, 0x0A75B, 0x0A75D, - /* 0x188 */ 0x0A75F, 0x0A761, 0x0A763, 0x0A765, 0x0A767, 0x0A769, 0x0A76B, 0x0A76D, - /* 0x190 */ 0x0A76F, 0x0A77A, 0x0A77C, 0x01D79, 0x0A77F, 0x0A781, 0x0A783, 0x0A785, - /* 0x198 */ 0x0A787, 0x0A78C, 0x00265, 0x0A791, 0x0A793, 0x0A797, 0x0A799, 0x0A79B, - /* 0x1A0 */ 0x0A79D, 0x0A79F, 0x0A7A1, 0x0A7A3, 0x0A7A5, 0x0A7A7, 0x0A7A9, 0x00266, - /* 0x1A8 */ 0x0025C, 0x00261, 0x0026C, 0x0026A, 0x0029E, 0x00287, 0x0029D, 0x0AB53, - /* 0x1B0 */ 0x0A7B5, 0x0A7B7, 0x0A7B9, 0x0A7BB, 0x0A7BD, 0x0A7BF, 0x0A7C1, 0x0A7C3, - /* 0x1B8 */ 0x0A794, 0x00282, 0x01D8E, 0x0A7C8, 0x0A7CA, 0x0A7D1, 0x0A7D7, 0x0A7D9, - /* 0x1C0 */ 0x0A7F6, 0x013A0, 0x013A1, 0x013A2, 0x013A3, 0x013A4, 0x013A5, 0x013A6, - /* 0x1C8 */ 0x013A7, 0x013A8, 0x013A9, 0x013AA, 0x013AB, 0x013AC, 0x013AD, 0x013AE, - /* 0x1D0 */ 0x013AF, 0x013B0, 0x013B1, 0x013B2, 0x013B3, 0x013B4, 0x013B5, 0x013B6, - /* 0x1D8 */ 0x013B7, 0x013B8, 0x013B9, 0x013BA, 0x013BB, 0x013BC, 0x013BD, 0x013BE, - /* 0x1E0 */ 0x013BF, 0x013C0, 0x013C1, 0x013C2, 0x013C3, 0x013C4, 0x013C5, 0x013C6, - /* 0x1E8 */ 0x013C7, 0x013C8, 0x013C9, 0x013CA, 0x013CB, 0x013CC, 0x013CD, 0x013CE, - /* 0x1F0 */ 0x013CF, 0x013D0, 0x013D1, 0x013D2, 0x013D3, 0x013D4, 0x013D5, 0x013D6, - /* 0x1F8 */ 0x013D7, 0x013D8, 0x013D9, 0x013DA, 0x013DB, 0x013DC, 0x013DD, 0x013DE, - /* 0x200 */ 0x013DF, 0x013E0, 0x013E1, 0x013E2, 0x013E3, 0x013E4, 0x013E5, 0x013E6, - /* 0x208 */ 0x013E7, 0x013E8, 0x013E9, 0x013EA, 0x013EB, 0x013EC, 0x013ED, 0x013EE, - /* 0x210 */ 0x013EF, 
0x242EE, 0x2284A, 0x22844, 0x233D5, 0x25249, 0x25CD0, 0x27ED3, - /* 0x218 */ 0x0FF41, 0x0FF42, 0x0FF43, 0x0FF44, 0x0FF45, 0x0FF46, 0x0FF47, 0x0FF48, - /* 0x220 */ 0x0FF49, 0x0FF4A, 0x0FF4B, 0x0FF4C, 0x0FF4D, 0x0FF4E, 0x0FF4F, 0x0FF50, - /* 0x228 */ 0x0FF51, 0x0FF52, 0x0FF53, 0x0FF54, 0x0FF55, 0x0FF56, 0x0FF57, 0x0FF58, - /* 0x230 */ 0x0FF59, 0x0FF5A, 0x10428, 0x10429, 0x1042A, 0x1042B, 0x1042C, 0x1042D, - /* 0x238 */ 0x1042E, 0x1042F, 0x10430, 0x10431, 0x10432, 0x10433, 0x10434, 0x10435, - /* 0x240 */ 0x10436, 0x10437, 0x10438, 0x10439, 0x1043A, 0x1043B, 0x1043C, 0x1043D, - /* 0x248 */ 0x1043E, 0x1043F, 0x10440, 0x10441, 0x10442, 0x10443, 0x10444, 0x10445, - /* 0x250 */ 0x10446, 0x10447, 0x10448, 0x10449, 0x1044A, 0x1044B, 0x1044C, 0x1044D, - /* 0x258 */ 0x1044E, 0x1044F, 0x104D8, 0x104D9, 0x104DA, 0x104DB, 0x104DC, 0x104DD, - /* 0x260 */ 0x104DE, 0x104DF, 0x104E0, 0x104E1, 0x104E2, 0x104E3, 0x104E4, 0x104E5, - /* 0x268 */ 0x104E6, 0x104E7, 0x104E8, 0x104E9, 0x104EA, 0x104EB, 0x104EC, 0x104ED, - /* 0x270 */ 0x104EE, 0x104EF, 0x104F0, 0x104F1, 0x104F2, 0x104F3, 0x104F4, 0x104F5, - /* 0x278 */ 0x104F6, 0x104F7, 0x104F8, 0x104F9, 0x104FA, 0x104FB, 0x10597, 0x10598, - /* 0x280 */ 0x10599, 0x1059A, 0x1059B, 0x1059C, 0x1059D, 0x1059E, 0x1059F, 0x105A0, - /* 0x288 */ 0x105A1, 0x105A3, 0x105A4, 0x105A5, 0x105A6, 0x105A7, 0x105A8, 0x105A9, - /* 0x290 */ 0x105AA, 0x105AB, 0x105AC, 0x105AD, 0x105AE, 0x105AF, 0x105B0, 0x105B1, - /* 0x298 */ 0x105B3, 0x105B4, 0x105B5, 0x105B6, 0x105B7, 0x105B8, 0x105B9, 0x105BB, - /* 0x2A0 */ 0x105BC, 0x10CC0, 0x10CC1, 0x10CC2, 0x10CC3, 0x10CC4, 0x10CC5, 0x10CC6, - /* 0x2A8 */ 0x10CC7, 0x10CC8, 0x10CC9, 0x10CCA, 0x10CCB, 0x10CCC, 0x10CCD, 0x10CCE, - /* 0x2B0 */ 0x10CCF, 0x10CD0, 0x10CD1, 0x10CD2, 0x10CD3, 0x10CD4, 0x10CD5, 0x10CD6, - /* 0x2B8 */ 0x10CD7, 0x10CD8, 0x10CD9, 0x10CDA, 0x10CDB, 0x10CDC, 0x10CDD, 0x10CDE, - /* 0x2C0 */ 0x10CDF, 0x10CE0, 0x10CE1, 0x10CE2, 0x10CE3, 0x10CE4, 0x10CE5, 0x10CE6, - /* 0x2C8 */ 0x10CE7, 0x10CE8, 0x10CE9, 0x10CEA, 0x10CEB, 0x10CEC, 0x10CED, 0x10CEE, - /* 0x2D0 */ 0x10CEF, 0x10CF0, 0x10CF1, 0x10CF2, 0x118C0, 0x118C1, 0x118C2, 0x118C3, - /* 0x2D8 */ 0x118C4, 0x118C5, 0x118C6, 0x118C7, 0x118C8, 0x118C9, 0x118CA, 0x118CB, - /* 0x2E0 */ 0x118CC, 0x118CD, 0x118CE, 0x118CF, 0x118D0, 0x118D1, 0x118D2, 0x118D3, - /* 0x2E8 */ 0x118D4, 0x118D5, 0x118D6, 0x118D7, 0x118D8, 0x118D9, 0x118DA, 0x118DB, - /* 0x2F0 */ 0x118DC, 0x118DD, 0x118DE, 0x118DF, 0x16E60, 0x16E61, 0x16E62, 0x16E63, - /* 0x2F8 */ 0x16E64, 0x16E65, 0x16E66, 0x16E67, 0x16E68, 0x16E69, 0x16E6A, 0x16E6B, - /* 0x300 */ 0x16E6C, 0x16E6D, 0x16E6E, 0x16E6F, 0x16E70, 0x16E71, 0x16E72, 0x16E73, - /* 0x308 */ 0x16E74, 0x16E75, 0x16E76, 0x16E77, 0x16E78, 0x16E79, 0x16E7A, 0x16E7B, - /* 0x310 */ 0x16E7C, 0x16E7D, 0x16E7E, 0x16E7F, 0x1E922, 0x1E923, 0x1E924, 0x1E925, - /* 0x318 */ 0x1E926, 0x1E927, 0x1E928, 0x1E929, 0x1E92A, 0x1E92B, 0x1E92C, 0x1E92D, - /* 0x320 */ 0x1E92E, 0x1E92F, 0x1E930, 0x1E931, 0x1E932, 0x1E933, 0x1E934, 0x1E935, - /* 0x328 */ 0x1E936, 0x1E937, 0x1E938, 0x1E939, 0x1E93A, 0x1E93B, 0x1E93C, 0x1E93D, - /* 0x330 */ 0x1E93E, 0x1E93F, 0x1E940, 0x1E941, 0x1E942, 0x1E943, 0x20122, 0x2063A, - /* 0x338 */ 0x2051C, 0x2054B, 0x291DF, 0x20A2C, 0x20B63, 0x214E4, 0x216A8, 0x216EA, - /* 0x340 */ 0x219C8, 0x21B18, 0x21DE4, 0x21DE6, 0x22183, 0x2A392, 0x22331, 0x22331, - /* 0x348 */ 0x232B8, 0x261DA, 0x226D4, 0x22B0C, 0x22BF1, 0x2300A, 0x233C3, 0x2346D, - /* 0x350 */ 0x236A3, 0x238A7, 0x23A8D, 0x21D0B, 0x23AFA, 0x23CBC, 0x23D1E, 0x23ED1, - /* 0x358 */ 0x23F5E, 0x23F8E, 0x20525, 0x24263, 
0x243AB, 0x24608, 0x24735, 0x24814, - /* 0x360 */ 0x24C36, 0x24C92, 0x2219F, 0x24FA1, 0x24FB8, 0x25044, 0x250F3, 0x250F2, - /* 0x368 */ 0x25119, 0x25133, 0x2541D, 0x25626, 0x2569A, 0x256C5, 0x2597C, 0x25AA7, - /* 0x370 */ 0x25AA7, 0x25BAB, 0x25C80, 0x25F86, 0x26228, 0x26247, 0x262D9, 0x2633E, - /* 0x378 */ 0x264DA, 0x26523, 0x265A8, 0x2335F, 0x267A7, 0x267B5, 0x23393, 0x2339C, - /* 0x380 */ 0x26B3C, 0x26C36, 0x26D6B, 0x26CD5, 0x273CA, 0x26F2C, 0x26FB1, 0x270D2, - /* 0x388 */ 0x27667, 0x278AE, 0x27966, 0x27CA8, 0x27F2F, 0x20804, 0x208DE, 0x285D2, - /* 0x390 */ 0x285ED, 0x2872E, 0x28BFA, 0x28D77, 0x29145, 0x2921A, 0x2940A, 0x29496, - /* 0x398 */ 0x295B6, 0x29B30, 0x2A0CE, 0x2A105, 0x2A20E, 0x2A291, 0x2A600 + /* 0x070 */ 0x00442, 0x00442, 0x0044A, 0x00463, 0x0A64B, 0x01C8A, 0x010D0, 0x010D1, + /* 0x078 */ 0x010D2, 0x010D3, 0x010D4, 0x010D5, 0x010D6, 0x010D7, 0x010D8, 0x010D9, + /* 0x080 */ 0x010DA, 0x010DB, 0x010DC, 0x010DD, 0x010DE, 0x010DF, 0x010E0, 0x010E1, + /* 0x088 */ 0x010E2, 0x010E3, 0x010E4, 0x010E5, 0x010E6, 0x010E7, 0x010E8, 0x010E9, + /* 0x090 */ 0x010EA, 0x010EB, 0x010EC, 0x010ED, 0x010EE, 0x010EF, 0x010F0, 0x010F1, + /* 0x098 */ 0x010F2, 0x010F3, 0x010F4, 0x010F5, 0x010F6, 0x010F7, 0x010F8, 0x010F9, + /* 0x0A0 */ 0x010FA, 0x010FD, 0x010FE, 0x010FF, 0x01EFB, 0x01EFD, 0x01EFF, 0x0214E, + /* 0x0A8 */ 0x02170, 0x02171, 0x02172, 0x02173, 0x02174, 0x02175, 0x02176, 0x02177, + /* 0x0B0 */ 0x02178, 0x02179, 0x0217A, 0x0217B, 0x0217C, 0x0217D, 0x0217E, 0x0217F, + /* 0x0B8 */ 0x02184, 0x024D0, 0x024D1, 0x024D2, 0x024D3, 0x024D4, 0x024D5, 0x024D6, + /* 0x0C0 */ 0x024D7, 0x024D8, 0x024D9, 0x024DA, 0x024DB, 0x024DC, 0x024DD, 0x024DE, + /* 0x0C8 */ 0x024DF, 0x024E0, 0x024E1, 0x024E2, 0x024E3, 0x024E4, 0x024E5, 0x024E6, + /* 0x0D0 */ 0x024E7, 0x024E8, 0x024E9, 0x02C30, 0x02C31, 0x02C32, 0x02C33, 0x02C34, + /* 0x0D8 */ 0x02C35, 0x02C36, 0x02C37, 0x02C38, 0x02C39, 0x02C3A, 0x02C3B, 0x02C3C, + /* 0x0E0 */ 0x02C3D, 0x02C3E, 0x02C3F, 0x02C40, 0x02C41, 0x02C42, 0x02C43, 0x02C44, + /* 0x0E8 */ 0x02C45, 0x02C46, 0x02C47, 0x02C48, 0x02C49, 0x02C4A, 0x02C4B, 0x02C4C, + /* 0x0F0 */ 0x02C4D, 0x02C4E, 0x02C4F, 0x02C50, 0x02C51, 0x02C52, 0x02C53, 0x02C54, + /* 0x0F8 */ 0x02C55, 0x02C56, 0x02C57, 0x02C58, 0x02C59, 0x02C5A, 0x02C5B, 0x02C5C, + /* 0x100 */ 0x02C5D, 0x02C5E, 0x02C5F, 0x02C61, 0x0026B, 0x01D7D, 0x0027D, 0x02C68, + /* 0x108 */ 0x02C6A, 0x02C6C, 0x00251, 0x00271, 0x00250, 0x00252, 0x02C73, 0x02C76, + /* 0x110 */ 0x0023F, 0x00240, 0x02C81, 0x02C83, 0x02C85, 0x02C87, 0x02C89, 0x02C8B, + /* 0x118 */ 0x02C8D, 0x02C8F, 0x02C91, 0x02C93, 0x02C95, 0x02C97, 0x02C99, 0x02C9B, + /* 0x120 */ 0x02C9D, 0x02C9F, 0x02CA1, 0x02CA3, 0x02CA5, 0x02CA7, 0x02CA9, 0x02CAB, + /* 0x128 */ 0x02CAD, 0x02CAF, 0x02CB1, 0x02CB3, 0x02CB5, 0x02CB7, 0x02CB9, 0x02CBB, + /* 0x130 */ 0x02CBD, 0x02CBF, 0x02CC1, 0x02CC3, 0x02CC5, 0x02CC7, 0x02CC9, 0x02CCB, + /* 0x138 */ 0x02CCD, 0x02CCF, 0x02CD1, 0x02CD3, 0x02CD5, 0x02CD7, 0x02CD9, 0x02CDB, + /* 0x140 */ 0x02CDD, 0x02CDF, 0x02CE1, 0x02CE3, 0x02CEC, 0x02CEE, 0x02CF3, 0x0A641, + /* 0x148 */ 0x0A643, 0x0A645, 0x0A647, 0x0A649, 0x0A64B, 0x0A64D, 0x0A64F, 0x0A651, + /* 0x150 */ 0x0A653, 0x0A655, 0x0A657, 0x0A659, 0x0A65B, 0x0A65D, 0x0A65F, 0x0A661, + /* 0x158 */ 0x0A663, 0x0A665, 0x0A667, 0x0A669, 0x0A66B, 0x0A66D, 0x0A681, 0x0A683, + /* 0x160 */ 0x0A685, 0x0A687, 0x0A689, 0x0A68B, 0x0A68D, 0x0A68F, 0x0A691, 0x0A693, + /* 0x168 */ 0x0A695, 0x0A697, 0x0A699, 0x0A69B, 0x0A723, 0x0A725, 0x0A727, 0x0A729, + /* 0x170 */ 0x0A72B, 0x0A72D, 0x0A72F, 0x0A733, 0x0A735, 0x0A737, 0x0A739, 0x0A73B, + 
/* 0x178 */ 0x0A73D, 0x0A73F, 0x0A741, 0x0A743, 0x0A745, 0x0A747, 0x0A749, 0x0A74B, + /* 0x180 */ 0x0A74D, 0x0A74F, 0x0A751, 0x0A753, 0x0A755, 0x0A757, 0x0A759, 0x0A75B, + /* 0x188 */ 0x0A75D, 0x0A75F, 0x0A761, 0x0A763, 0x0A765, 0x0A767, 0x0A769, 0x0A76B, + /* 0x190 */ 0x0A76D, 0x0A76F, 0x0A77A, 0x0A77C, 0x01D79, 0x0A77F, 0x0A781, 0x0A783, + /* 0x198 */ 0x0A785, 0x0A787, 0x0A78C, 0x00265, 0x0A791, 0x0A793, 0x0A797, 0x0A799, + /* 0x1A0 */ 0x0A79B, 0x0A79D, 0x0A79F, 0x0A7A1, 0x0A7A3, 0x0A7A5, 0x0A7A7, 0x0A7A9, + /* 0x1A8 */ 0x00266, 0x0025C, 0x00261, 0x0026C, 0x0026A, 0x0029E, 0x00287, 0x0029D, + /* 0x1B0 */ 0x0AB53, 0x0A7B5, 0x0A7B7, 0x0A7B9, 0x0A7BB, 0x0A7BD, 0x0A7BF, 0x0A7C1, + /* 0x1B8 */ 0x0A7C3, 0x0A794, 0x00282, 0x01D8E, 0x0A7C8, 0x0A7CA, 0x00264, 0x0A7CD, + /* 0x1C0 */ 0x0A7D1, 0x0A7D7, 0x0A7D9, 0x0A7DB, 0x0019B, 0x0A7F6, 0x013A0, 0x013A1, + /* 0x1C8 */ 0x013A2, 0x013A3, 0x013A4, 0x013A5, 0x013A6, 0x013A7, 0x013A8, 0x013A9, + /* 0x1D0 */ 0x013AA, 0x013AB, 0x013AC, 0x013AD, 0x013AE, 0x013AF, 0x013B0, 0x013B1, + /* 0x1D8 */ 0x013B2, 0x013B3, 0x013B4, 0x013B5, 0x013B6, 0x013B7, 0x013B8, 0x013B9, + /* 0x1E0 */ 0x013BA, 0x013BB, 0x013BC, 0x013BD, 0x013BE, 0x013BF, 0x013C0, 0x013C1, + /* 0x1E8 */ 0x013C2, 0x013C3, 0x013C4, 0x013C5, 0x013C6, 0x013C7, 0x013C8, 0x013C9, + /* 0x1F0 */ 0x013CA, 0x013CB, 0x013CC, 0x013CD, 0x013CE, 0x013CF, 0x013D0, 0x013D1, + /* 0x1F8 */ 0x013D2, 0x013D3, 0x013D4, 0x013D5, 0x013D6, 0x013D7, 0x013D8, 0x013D9, + /* 0x200 */ 0x013DA, 0x013DB, 0x013DC, 0x013DD, 0x013DE, 0x013DF, 0x013E0, 0x013E1, + /* 0x208 */ 0x013E2, 0x013E3, 0x013E4, 0x013E5, 0x013E6, 0x013E7, 0x013E8, 0x013E9, + /* 0x210 */ 0x013EA, 0x013EB, 0x013EC, 0x013ED, 0x013EE, 0x013EF, 0x242EE, 0x2284A, + /* 0x218 */ 0x22844, 0x233D5, 0x25249, 0x25CD0, 0x27ED3, 0x0FF41, 0x0FF42, 0x0FF43, + /* 0x220 */ 0x0FF44, 0x0FF45, 0x0FF46, 0x0FF47, 0x0FF48, 0x0FF49, 0x0FF4A, 0x0FF4B, + /* 0x228 */ 0x0FF4C, 0x0FF4D, 0x0FF4E, 0x0FF4F, 0x0FF50, 0x0FF51, 0x0FF52, 0x0FF53, + /* 0x230 */ 0x0FF54, 0x0FF55, 0x0FF56, 0x0FF57, 0x0FF58, 0x0FF59, 0x0FF5A, 0x10428, + /* 0x238 */ 0x10429, 0x1042A, 0x1042B, 0x1042C, 0x1042D, 0x1042E, 0x1042F, 0x10430, + /* 0x240 */ 0x10431, 0x10432, 0x10433, 0x10434, 0x10435, 0x10436, 0x10437, 0x10438, + /* 0x248 */ 0x10439, 0x1043A, 0x1043B, 0x1043C, 0x1043D, 0x1043E, 0x1043F, 0x10440, + /* 0x250 */ 0x10441, 0x10442, 0x10443, 0x10444, 0x10445, 0x10446, 0x10447, 0x10448, + /* 0x258 */ 0x10449, 0x1044A, 0x1044B, 0x1044C, 0x1044D, 0x1044E, 0x1044F, 0x104D8, + /* 0x260 */ 0x104D9, 0x104DA, 0x104DB, 0x104DC, 0x104DD, 0x104DE, 0x104DF, 0x104E0, + /* 0x268 */ 0x104E1, 0x104E2, 0x104E3, 0x104E4, 0x104E5, 0x104E6, 0x104E7, 0x104E8, + /* 0x270 */ 0x104E9, 0x104EA, 0x104EB, 0x104EC, 0x104ED, 0x104EE, 0x104EF, 0x104F0, + /* 0x278 */ 0x104F1, 0x104F2, 0x104F3, 0x104F4, 0x104F5, 0x104F6, 0x104F7, 0x104F8, + /* 0x280 */ 0x104F9, 0x104FA, 0x104FB, 0x10597, 0x10598, 0x10599, 0x1059A, 0x1059B, + /* 0x288 */ 0x1059C, 0x1059D, 0x1059E, 0x1059F, 0x105A0, 0x105A1, 0x105A3, 0x105A4, + /* 0x290 */ 0x105A5, 0x105A6, 0x105A7, 0x105A8, 0x105A9, 0x105AA, 0x105AB, 0x105AC, + /* 0x298 */ 0x105AD, 0x105AE, 0x105AF, 0x105B0, 0x105B1, 0x105B3, 0x105B4, 0x105B5, + /* 0x2A0 */ 0x105B6, 0x105B7, 0x105B8, 0x105B9, 0x105BB, 0x105BC, 0x10CC0, 0x10CC1, + /* 0x2A8 */ 0x10CC2, 0x10CC3, 0x10CC4, 0x10CC5, 0x10CC6, 0x10CC7, 0x10CC8, 0x10CC9, + /* 0x2B0 */ 0x10CCA, 0x10CCB, 0x10CCC, 0x10CCD, 0x10CCE, 0x10CCF, 0x10CD0, 0x10CD1, + /* 0x2B8 */ 0x10CD2, 0x10CD3, 0x10CD4, 0x10CD5, 0x10CD6, 0x10CD7, 0x10CD8, 0x10CD9, + /* 0x2C0 */ 0x10CDA, 
0x10CDB, 0x10CDC, 0x10CDD, 0x10CDE, 0x10CDF, 0x10CE0, 0x10CE1, + /* 0x2C8 */ 0x10CE2, 0x10CE3, 0x10CE4, 0x10CE5, 0x10CE6, 0x10CE7, 0x10CE8, 0x10CE9, + /* 0x2D0 */ 0x10CEA, 0x10CEB, 0x10CEC, 0x10CED, 0x10CEE, 0x10CEF, 0x10CF0, 0x10CF1, + /* 0x2D8 */ 0x10CF2, 0x10D70, 0x10D71, 0x10D72, 0x10D73, 0x10D74, 0x10D75, 0x10D76, + /* 0x2E0 */ 0x10D77, 0x10D78, 0x10D79, 0x10D7A, 0x10D7B, 0x10D7C, 0x10D7D, 0x10D7E, + /* 0x2E8 */ 0x10D7F, 0x10D80, 0x10D81, 0x10D82, 0x10D83, 0x10D84, 0x10D85, 0x118C0, + /* 0x2F0 */ 0x118C1, 0x118C2, 0x118C3, 0x118C4, 0x118C5, 0x118C6, 0x118C7, 0x118C8, + /* 0x2F8 */ 0x118C9, 0x118CA, 0x118CB, 0x118CC, 0x118CD, 0x118CE, 0x118CF, 0x118D0, + /* 0x300 */ 0x118D1, 0x118D2, 0x118D3, 0x118D4, 0x118D5, 0x118D6, 0x118D7, 0x118D8, + /* 0x308 */ 0x118D9, 0x118DA, 0x118DB, 0x118DC, 0x118DD, 0x118DE, 0x118DF, 0x16E60, + /* 0x310 */ 0x16E61, 0x16E62, 0x16E63, 0x16E64, 0x16E65, 0x16E66, 0x16E67, 0x16E68, + /* 0x318 */ 0x16E69, 0x16E6A, 0x16E6B, 0x16E6C, 0x16E6D, 0x16E6E, 0x16E6F, 0x16E70, + /* 0x320 */ 0x16E71, 0x16E72, 0x16E73, 0x16E74, 0x16E75, 0x16E76, 0x16E77, 0x16E78, + /* 0x328 */ 0x16E79, 0x16E7A, 0x16E7B, 0x16E7C, 0x16E7D, 0x16E7E, 0x16E7F, 0x1E922, + /* 0x330 */ 0x1E923, 0x1E924, 0x1E925, 0x1E926, 0x1E927, 0x1E928, 0x1E929, 0x1E92A, + /* 0x338 */ 0x1E92B, 0x1E92C, 0x1E92D, 0x1E92E, 0x1E92F, 0x1E930, 0x1E931, 0x1E932, + /* 0x340 */ 0x1E933, 0x1E934, 0x1E935, 0x1E936, 0x1E937, 0x1E938, 0x1E939, 0x1E93A, + /* 0x348 */ 0x1E93B, 0x1E93C, 0x1E93D, 0x1E93E, 0x1E93F, 0x1E940, 0x1E941, 0x1E942, + /* 0x350 */ 0x1E943, 0x20122, 0x2063A, 0x2051C, 0x2054B, 0x291DF, 0x20A2C, 0x20B63, + /* 0x358 */ 0x214E4, 0x216A8, 0x216EA, 0x219C8, 0x21B18, 0x21DE4, 0x21DE6, 0x22183, + /* 0x360 */ 0x2A392, 0x22331, 0x22331, 0x232B8, 0x261DA, 0x226D4, 0x22B0C, 0x22BF1, + /* 0x368 */ 0x2300A, 0x233C3, 0x2346D, 0x236A3, 0x238A7, 0x23A8D, 0x21D0B, 0x23AFA, + /* 0x370 */ 0x23CBC, 0x23D1E, 0x23ED1, 0x23F5E, 0x23F8E, 0x20525, 0x24263, 0x243AB, + /* 0x378 */ 0x24608, 0x24735, 0x24814, 0x24C36, 0x24C92, 0x2219F, 0x24FA1, 0x24FB8, + /* 0x380 */ 0x25044, 0x250F3, 0x250F2, 0x25119, 0x25133, 0x2541D, 0x25626, 0x2569A, + /* 0x388 */ 0x256C5, 0x2597C, 0x25AA7, 0x25AA7, 0x25BAB, 0x25C80, 0x25F86, 0x26228, + /* 0x390 */ 0x26247, 0x262D9, 0x2633E, 0x264DA, 0x26523, 0x265A8, 0x2335F, 0x267A7, + /* 0x398 */ 0x267B5, 0x23393, 0x2339C, 0x26B3C, 0x26C36, 0x26D6B, 0x26CD5, 0x273CA, + /* 0x3A0 */ 0x26F2C, 0x26FB1, 0x270D2, 0x27667, 0x278AE, 0x27966, 0x27CA8, 0x27F2F, + /* 0x3A8 */ 0x20804, 0x208DE, 0x285D2, 0x285ED, 0x2872E, 0x28BFA, 0x28D77, 0x29145, + /* 0x3B0 */ 0x2921A, 0x2940A, 0x29496, 0x295B6, 0x29B30, 0x2A0CE, 0x2A105, 0x2A20E, + /* 0x3B8 */ 0x2A291, 0x2A600 }; -static int32_t nfU32SeqMisc[87] = { - /* 0x000 */ 0x00002, 0x11099, 0x110BA, 0x00002, 0x1109B, 0x110BA, 0x00002, 0x110A5, - /* 0x008 */ 0x110BA, 0x00002, 0x11131, 0x11127, 0x00002, 0x11132, 0x11127, 0x00002, - /* 0x010 */ 0x11347, 0x1133E, 0x00002, 0x11347, 0x11357, 0x00002, 0x114B9, 0x114BA, - /* 0x018 */ 0x00002, 0x114B9, 0x114B0, 0x00002, 0x114B9, 0x114BD, 0x00002, 0x115B8, - /* 0x020 */ 0x115AF, 0x00002, 0x115B9, 0x115AF, 0x00002, 0x11935, 0x11930, 0x00002, - /* 0x028 */ 0x1D157, 0x1D165, 0x00002, 0x1D158, 0x1D165, 0x00003, 0x1D158, 0x1D165, - /* 0x030 */ 0x1D16E, 0x00003, 0x1D158, 0x1D165, 0x1D16F, 0x00003, 0x1D158, 0x1D165, - /* 0x038 */ 0x1D170, 0x00003, 0x1D158, 0x1D165, 0x1D171, 0x00003, 0x1D158, 0x1D165, - /* 0x040 */ 0x1D172, 0x00002, 0x1D1B9, 0x1D165, 0x00002, 0x1D1BA, 0x1D165, 0x00003, - /* 0x048 */ 0x1D1B9, 0x1D165, 0x1D16E, 0x00003, 0x1D1BA, 
0x1D165, 0x1D16E, 0x00003, - /* 0x050 */ 0x1D1B9, 0x1D165, 0x1D16F, 0x00003, 0x1D1BA, 0x1D165, 0x1D16F +static int32_t nfU32SeqMisc[151] = { + /* 0x000 */ 0x00002, 0x105D2, 0x00307, 0x00002, 0x105DA, 0x00307, 0x00002, 0x11099, + /* 0x008 */ 0x110BA, 0x00002, 0x1109B, 0x110BA, 0x00002, 0x110A5, 0x110BA, 0x00002, + /* 0x010 */ 0x11131, 0x11127, 0x00002, 0x11132, 0x11127, 0x00002, 0x11347, 0x1133E, + /* 0x018 */ 0x00002, 0x11347, 0x11357, 0x00002, 0x11382, 0x113C9, 0x00002, 0x11384, + /* 0x020 */ 0x113BB, 0x00002, 0x1138B, 0x113C2, 0x00002, 0x11390, 0x113C9, 0x00002, + /* 0x028 */ 0x113C2, 0x113C2, 0x00002, 0x113C2, 0x113B8, 0x00002, 0x113C2, 0x113C9, + /* 0x030 */ 0x00002, 0x114B9, 0x114BA, 0x00002, 0x114B9, 0x114B0, 0x00002, 0x114B9, + /* 0x038 */ 0x114BD, 0x00002, 0x115B8, 0x115AF, 0x00002, 0x115B9, 0x115AF, 0x00002, + /* 0x040 */ 0x11935, 0x11930, 0x00002, 0x1611E, 0x1611E, 0x00002, 0x1611E, 0x16129, + /* 0x048 */ 0x00002, 0x1611E, 0x1611F, 0x00002, 0x16129, 0x1611F, 0x00002, 0x1611E, + /* 0x050 */ 0x16120, 0x00003, 0x1611E, 0x1611E, 0x1611F, 0x00003, 0x1611E, 0x16129, + /* 0x058 */ 0x1611F, 0x00003, 0x1611E, 0x1611E, 0x16120, 0x00002, 0x16D67, 0x16D67, + /* 0x060 */ 0x00002, 0x16D63, 0x16D67, 0x00003, 0x16D63, 0x16D67, 0x16D67, 0x00002, + /* 0x068 */ 0x1D157, 0x1D165, 0x00002, 0x1D158, 0x1D165, 0x00003, 0x1D158, 0x1D165, + /* 0x070 */ 0x1D16E, 0x00003, 0x1D158, 0x1D165, 0x1D16F, 0x00003, 0x1D158, 0x1D165, + /* 0x078 */ 0x1D170, 0x00003, 0x1D158, 0x1D165, 0x1D171, 0x00003, 0x1D158, 0x1D165, + /* 0x080 */ 0x1D172, 0x00002, 0x1D1B9, 0x1D165, 0x00002, 0x1D1BA, 0x1D165, 0x00003, + /* 0x088 */ 0x1D1B9, 0x1D165, 0x1D16E, 0x00003, 0x1D1BA, 0x1D165, 0x1D16E, 0x00003, + /* 0x090 */ 0x1D1B9, 0x1D165, 0x1D16F, 0x00003, 0x1D1BA, 0x1D165, 0x1D16F }; static uint16_t nfBasicCF[1280] = { /* 0x000 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index b419ac805..b3b0d2cec 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -385,7 +385,7 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap, vnode_t boolean_t need_vnop_open; boolean_t batched; boolean_t ref_failed; - int nretries = 0; + int nretries = 0, max_retries = 10; again: vp = NULL; @@ -441,7 +441,7 @@ again: /* open calls are allowed for resource forks. 
*/ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif - if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0 && (origcnflags & FOLLOW) != 0) { + if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0 && (origcnflags & FOLLOW) != 0 && (fmode & O_SYMLINK) == 0) { ndp->ni_cnd.cn_flags |= FOLLOW; } if (fmode & O_NOFOLLOW_ANY) { @@ -449,9 +449,13 @@ again: ndp->ni_flag |= NAMEI_NOFOLLOW_ANY; } if (fmode & O_RESOLVE_BENEATH) { - /* will return EACCES if relative path does not reside in the hierarchy beneath the starting directory */ + /* will return ENOTCAPABLE if path resolution escapes the starting directory */ ndp->ni_flag |= NAMEI_RESOLVE_BENEATH; } + if (fmode & O_UNIQUE) { + /* will return ENOTCAPABLE if target vnode has multiple links */ + ndp->ni_flag |= NAMEI_UNIQUE; + } continue_create_lookup: if ((error = namei(ndp))) { @@ -571,9 +575,13 @@ continue_create_lookup: ndp->ni_flag |= NAMEI_NOFOLLOW_ANY; } if (fmode & O_RESOLVE_BENEATH) { - /* will return EACCES if relative path does not reside in the hierarchy beneath the starting directory */ + /* will return ENOTCAPABLE if path resolution escapes the starting directory */ ndp->ni_flag |= NAMEI_RESOLVE_BENEATH; } + if (fmode & O_UNIQUE) { + /* will return ENOTCAPABLE if target vnode has multiple links */ + ndp->ni_flag |= NAMEI_UNIQUE; + } /* Do a lookup, possibly going directly to filesystem for compound operation */ do { @@ -625,6 +633,28 @@ continue_create_lookup: panic("Haven't cleaned up adequately in vn_open_auth()"); } + /* Verify that the vnode returned from namei() has the expected fileid and fsid */ + if (VATTR_IS_ACTIVE(vap, va_flags) && ISSET(vap->va_flags, VA_VAFILEID)) { + vnode_t tmp_vp; + + if (!VATTR_IS_ACTIVE(vap, va_fsid64) || !VATTR_IS_ACTIVE(vap, va_fileid)) { + error = EINVAL; + goto bad; + } + + error = vnode_getfromid(vap->va_fsid64.val[0], vap->va_fileid, ctx, FSOPT_ISREALFSID, &tmp_vp); + if (error) { + goto bad; + } + + if (tmp_vp != vp) { + vnode_put(tmp_vp); + error = ERECYCLE; + goto bad; + } + vnode_put(tmp_vp); + } + /* * Expect to use this code for filesystems without compound VNOPs, for the root * of a filesystem, which can't be "looked up" in the sense of VNOP_LOOKUP(), @@ -750,14 +780,21 @@ bad: * may possibly be blcoking other threads from running. * * We start yielding the CPU after some number of - * retries for increasing durations. Note that this is - * still a loop without an exit condition. + * retries for increasing durations. */ nretries += 1; + if (nretries > max_retries) { + printf("%s: reached max_retries error %d need_vnop_open %d " + "fmode 0x%x ref_failed %d\n", + __func__, error, need_vnop_open, *fmodep, ref_failed); + goto out; + } if (nretries > RETRY_NO_YIELD_COUNT) { - /* Every hz/100 secs is 10 msecs ... 
*/ - tsleep(&nretries, PVFS, "vn_open_auth_retry", - MIN((nretries * (hz / 100)), hz)); + struct timespec to; + + to.tv_sec = 0; + to.tv_nsec = nretries * 10 * NSEC_PER_MSEC; + msleep(&nretries, (lck_mtx_t *)0, PVFS, "vn_open_auth_retry", &to); } nameidone(ndp); goto again; @@ -1499,7 +1536,7 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6 VATTR_WANTED(&va, va_iosize); /* lower layers will synthesise va_total_alloc from va_data_size if required */ VATTR_WANTED(&va, va_total_alloc); - if (xsec != NULL) { + if (xsec != NULL && !vnode_isnamedstream(vp)) { VATTR_WANTED(&va, va_uuuid); VATTR_WANTED(&va, va_guuid); VATTR_WANTED(&va, va_acl); diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index 7cd5f62b5..c3aabc378 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -78,7 +78,7 @@ static int shadow_sequence; #define SHADOW_NAME_FMT ".vfs_rsrc_stream_%p%08x%p" #define SHADOW_DIR_FMT ".vfs_rsrc_streams_%p%x" -#define SHADOW_DIR_CONTAINER "/var/run" +#define SHADOW_DIR_CONTAINER "/private/var/run" #define MAKE_SHADOW_NAME(VP, NAME) \ snprintf((NAME), sizeof((NAME)), (SHADOW_NAME_FMT), \ @@ -767,7 +767,7 @@ retry_create: MAKE_SHADOW_NAME(vp, tmpname); bzero(&cn, sizeof(cn)); cn.cn_nameiop = LOOKUP; - cn.cn_flags = ISLASTCN; + cn.cn_flags = ISLASTCN | MARKISSHADOW; cn.cn_context = context; cn.cn_pnbuf = tmpname; cn.cn_pnlen = sizeof(tmpname); @@ -1096,6 +1096,70 @@ default_removenamedstream(vnode_t vp, const char *name, vfs_context_t context) return default_removexattr(vp, XATTR_RESOURCEFORK_NAME, 0, context); } +static bool +is_shadow_dir_valid(vnode_t parent_sdvp, vnode_t sdvp, vfs_context_t kernelctx) +{ + struct vnode_attr va; + uint32_t tmp_fsid; + bool is_valid = false; + + /* Make sure it's in fact a directory */ + if (sdvp->v_type != VDIR) { + goto out; + } + + /* Obtain the fsid for what should be the /private/var/run directory. */ + VATTR_INIT(&va); + VATTR_WANTED(&va, va_fsid); + if (VNOP_GETATTR(parent_sdvp, &va, kernelctx) != 0 || + !VATTR_IS_SUPPORTED(&va, va_fsid)) { + goto out; + } + + tmp_fsid = va.va_fsid; + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_uid); + VATTR_WANTED(&va, va_gid); + VATTR_WANTED(&va, va_mode); + VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_dirlinkcount); + VATTR_WANTED(&va, va_acl); + /* Provide defaults for attrs that may not be supported */ + va.va_dirlinkcount = 1; + va.va_acl = (kauth_acl_t) KAUTH_FILESEC_NONE; + + if (VNOP_GETATTR(sdvp, &va, kernelctx) != 0 || + !VATTR_IS_SUPPORTED(&va, va_uid) || + !VATTR_IS_SUPPORTED(&va, va_gid) || + !VATTR_IS_SUPPORTED(&va, va_mode) || + !VATTR_IS_SUPPORTED(&va, va_fsid)) { + goto out; + } + + /* + * Make sure its what we want: + * - owned by root + * - not writable by anyone + * - on same file system as /private/var/run + * - not a hard-linked directory + * - no ACLs (they might grant write access) + */ + if ((va.va_uid != 0) || (va.va_gid != 0) || + (va.va_mode & (S_IWUSR | S_IRWXG | S_IRWXO)) || + (va.va_fsid != tmp_fsid) || + (va.va_dirlinkcount != 1) || + (va.va_acl != (kauth_acl_t) KAUTH_FILESEC_NONE)) { + goto out; + } + + /* If we get here, then the shadow dir is valid. 
*/ + is_valid = true; + +out: + return is_valid; +} + static int get_shadow_dir(vnode_t *sdvpp) { @@ -1104,40 +1168,23 @@ get_shadow_dir(vnode_t *sdvpp) struct componentname cn; struct vnode_attr va; char tmpname[80]; - uint32_t tmp_fsid; int error; vfs_context_t kernelctx = vfs_context_kernel(); - bzero(tmpname, sizeof(tmpname)); - MAKE_SHADOW_DIRNAME(rootvnode, tmpname); /* - * Look up the shadow directory to ensure that it still exists. - * By looking it up, we get an iocounted dvp to use, and avoid some coherency issues - * in caching it when multiple threads may be trying to manipulate the pointers. - * * Make sure to use the kernel context. We want a singular view of * the shadow dir regardless of chrooted processes. */ - error = vnode_lookup(tmpname, 0, &sdvp, kernelctx); - if (error == 0) { - /* - * If we get here, then we have successfully looked up the shadow dir, - * and it has an iocount from the lookup. Return the vp in the output argument. - */ - *sdvpp = sdvp; - return 0; - } - /* In the failure case, no iocount is acquired */ - sdvp = NULLVP; - bzero(tmpname, sizeof(tmpname)); /* - * Obtain the vnode for "/var/run" directory using the kernel + * Obtain the vnode for "/private/var/run" directory using the kernel * context. * * This is defined in the SHADOW_DIR_CONTAINER macro */ - if (vnode_lookup(SHADOW_DIR_CONTAINER, 0, &dvp, kernelctx) != 0) { + error = vnode_lookup(SHADOW_DIR_CONTAINER, VNODE_LOOKUP_NOFOLLOW_ANY, &dvp, + kernelctx); + if (error) { error = ENOTSUP; goto out; } @@ -1147,7 +1194,50 @@ get_shadow_dir(vnode_t *sdvpp) * 'dvp' below suggests the parent directory so * we only need to provide the leaf entry name */ + bzero(tmpname, sizeof(tmpname)); MAKE_SHADOW_DIR_LEAF(rootvnode, tmpname); + + /* + * Look up the shadow directory to ensure that it still exists. + * By looking it up, we get an iocounted sdvp to use, and avoid some + * coherency issues in caching it when multiple threads may be trying to + * manipulate the pointers. + */ + error = vnode_lookupat(tmpname, VNODE_LOOKUP_NOFOLLOW, &sdvp, kernelctx, dvp); + if (error == 0) { + if (is_shadow_dir_valid(dvp, sdvp, kernelctx)) { + /* + * If we get here, then we have successfully looked up the shadow + * dir, and it has an iocount from the lookup. Return the vp in the + * output argument. + */ + goto out; + } + + /* + * Lookup returned us something that is not a valid shadow dir. + * Remove it and proceed with recreating the shadow dir. 
+ */ + bzero(&cn, sizeof(cn)); + cn.cn_nameiop = DELETE; + cn.cn_flags = ISLASTCN; + cn.cn_context = kernelctx; + cn.cn_pnbuf = tmpname; + cn.cn_pnlen = sizeof(tmpname); + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_namelen = (int)strlen(tmpname); + + error = VNOP_REMOVE(dvp, sdvp, &cn, 0, kernelctx); + if (error) { + error = ENOTSUP; + goto out; + } + + vnode_put(sdvp); + } + + /* In the failure case, no iocount is acquired */ + sdvp = NULLVP; bzero(&cn, sizeof(cn)); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN; @@ -1176,53 +1266,8 @@ get_shadow_dir(vnode_t *sdvpp) if (error == EEXIST) { /* loser has to look up directory */ error = VNOP_LOOKUP(dvp, &sdvp, &cn, kernelctx); - if (error == 0) { - /* Make sure its in fact a directory */ - if (sdvp->v_type != VDIR) { - goto baddir; - } - /* Obtain the fsid for /var/run directory */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_fsid); - if (VNOP_GETATTR(dvp, &va, kernelctx) != 0 || - !VATTR_IS_SUPPORTED(&va, va_fsid)) { - goto baddir; - } - tmp_fsid = va.va_fsid; - - VATTR_INIT(&va); - VATTR_WANTED(&va, va_uid); - VATTR_WANTED(&va, va_gid); - VATTR_WANTED(&va, va_mode); - VATTR_WANTED(&va, va_fsid); - VATTR_WANTED(&va, va_dirlinkcount); - VATTR_WANTED(&va, va_acl); - /* Provide defaults for attrs that may not be supported */ - va.va_dirlinkcount = 1; - va.va_acl = (kauth_acl_t) KAUTH_FILESEC_NONE; - - if (VNOP_GETATTR(sdvp, &va, kernelctx) != 0 || - !VATTR_IS_SUPPORTED(&va, va_uid) || - !VATTR_IS_SUPPORTED(&va, va_gid) || - !VATTR_IS_SUPPORTED(&va, va_mode) || - !VATTR_IS_SUPPORTED(&va, va_fsid)) { - goto baddir; - } - /* - * Make sure its what we want: - * - owned by root - * - not writable by anyone - * - on same file system as /var/run - * - not a hard-linked directory - * - no ACLs (they might grant write access) - */ - if ((va.va_uid != 0) || (va.va_gid != 0) || - (va.va_mode & (S_IWUSR | S_IRWXG | S_IRWXO)) || - (va.va_fsid != tmp_fsid) || - (va.va_dirlinkcount != 1) || - (va.va_acl != (kauth_acl_t) KAUTH_FILESEC_NONE)) { - goto baddir; - } + if (error == 0 && is_shadow_dir_valid(dvp, sdvp, kernelctx) == false) { + goto baddir; } } out: @@ -1468,27 +1513,8 @@ typedef struct attr_info { #define SWAP64(x) OSSwapBigToHostInt64((x)) -/* - * sysctl stuff - */ -static int vfs_xattr_doubleagent_enabled = 1; -SYSCTL_DECL(_vfs_generic); -SYSCTL_INT(_vfs_generic, OID_AUTO, xattr_doubleagent_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vfs_xattr_doubleagent_enabled, 0, ""); - static int get_doubleagentd_port(mach_port_t *doubleagentd_port); -/* - * VFS default xattr functions - */ -static int default_getxattr_vfs(vnode_t vp, const char *name, uio_t uio, size_t *size, int options, - vfs_context_t context); -static int default_setxattr_vfs(vnode_t vp, const char *name, uio_t uio, int options, - vfs_context_t context); -static int default_listxattr_vfs(vnode_t vp, uio_t uio, size_t *size, int options, - vfs_context_t context); -static int default_removexattr_vfs(vnode_t vp, const char *name, int options, - vfs_context_t context); - /* * DoubleAgent default xattr functions */ @@ -1509,131 +1535,15 @@ static u_int32_t emptyfinfo[8] = {0}; /* * Local support routines */ -static void close_xattrfile(struct fileglob *xfg, bool have_iocount, bool drop_iocount, vfs_context_t context); +static void close_xattrfile(struct fileglob *xfg, bool have_iocount, bool drop_iocount, vfs_context_t context); static int open_xattrfile(vnode_t vp, int fileflags, struct fileglob **xfgp, - int64_t *file_sizep, vfs_context_t context); + int64_t *file_sizep, bool *created_xattr_filep, 
vfs_context_t context); -static int create_xattrfile(vnode_t xvp, u_int32_t fileid, vfs_context_t context); +static void remove_xattrfile(struct fileglob *xfg, vnode_t xvp, vfs_context_t context); -static void remove_xattrfile(struct fileglob *xfg, vnode_t xvp, vfs_context_t context); +static int make_xattrfile_port(struct fileglob *xfg, ipc_port_t *portp); -static int get_xattrinfo(vnode_t xvp, int setting, attr_info_t *ainfop, vfs_context_t context); - -static void rel_xattrinfo(attr_info_t *ainfop); - -static int write_xattrinfo(attr_info_t *ainfop); - -static void init_empty_resource_fork(rsrcfork_header_t * rsrcforkhdr); - -static int make_xattrfile_port(struct fileglob *xfg, ipc_port_t *portp); - -#if BYTE_ORDER == LITTLE_ENDIAN -static void swap_adhdr(apple_double_header_t *adh); -static void swap_attrhdr(attr_header_t *ah, attr_info_t* info); - -#else -#define swap_adhdr(x) -#define swap_attrhdr(x, y) -#endif - -static int check_and_swap_attrhdr(attr_header_t *ah, attr_info_t* ainfop); -static int shift_data_down(vnode_t xvp, off_t start, size_t len, off_t delta, vfs_context_t context); -static int shift_data_up(vnode_t xvp, off_t start, size_t len, off_t delta, vfs_context_t context); - - -/* - * Sanity check and swap the header of an AppleDouble file. Assumes the buffer - * is in big endian (as it would exist on disk). Verifies the following: - * - magic field - * - version field - * - number of entries - * - that each entry fits within the file size - * - * If the header is invalid, ENOATTR is returned. - * - * NOTE: Does not attempt to validate the extended attributes header that - * may be embedded in the Finder Info entry. - */ -static int -check_and_swap_apple_double_header(attr_info_t *ainfop) -{ - int i, j; - u_int32_t header_end; - u_int32_t entry_end; - size_t rawsize; - apple_double_header_t *header; - - rawsize = ainfop->rawsize; - header = (apple_double_header_t *) ainfop->rawdata; - - /* Is the file big enough to contain an AppleDouble header? */ - if (rawsize < offsetof(apple_double_header_t, entries)) { - return ENOATTR; - } - - /* Swap the AppleDouble header fields to native order */ - header->magic = SWAP32(header->magic); - header->version = SWAP32(header->version); - header->numEntries = SWAP16(header->numEntries); - - /* Sanity check the AppleDouble header fields */ - if (header->magic != ADH_MAGIC || - header->version != ADH_VERSION || - header->numEntries < 1 || - header->numEntries > 15) { - return ENOATTR; - } - - /* Calculate where the entries[] array ends */ - header_end = offsetof(apple_double_header_t, entries) + - header->numEntries * sizeof(apple_double_entry_t); - - /* Is the file big enough to contain the AppleDouble entries? */ - if (rawsize < header_end) { - return ENOATTR; - } - - /* Swap and sanity check each AppleDouble entry */ - for (i = 0; i < header->numEntries; i++) { - /* Swap the per-entry fields to native order */ - header->entries[i].type = SWAP32(header->entries[i].type); - header->entries[i].offset = SWAP32(header->entries[i].offset); - header->entries[i].length = SWAP32(header->entries[i].length); - - entry_end = header->entries[i].offset + header->entries[i].length; - - /* - * Does the entry's content start within the header itself, - * did the addition overflow, or does the entry's content - * extend past the end of the file? 
- */ - if (header->entries[i].offset < header_end || - entry_end < header->entries[i].offset || - entry_end > ainfop->filesize) { - return ENOATTR; - } - - /* - * Does the current entry's content overlap with a previous - * entry's content? - * - * Yes, this is O(N**2), and there are more efficient algorithms - * for testing pairwise overlap of N ranges when N is large. - * But we have already ensured N < 16, and N is almost always 2. - * So there's no point in using a more complex algorithm. - */ - - for (j = 0; j < i; j++) { - if (entry_end > header->entries[j].offset && - header->entries[j].offset + header->entries[j].length > header->entries[i].offset) { - return ENOATTR; - } - } - } - - return 0; -} /* * Retrieve the data of an extended attribute. @@ -1645,14 +1555,12 @@ default_getxattr(vnode_t vp, const char *name, uio_t uio, size_t *size, mach_port_t port; int error; - if (vfs_xattr_doubleagent_enabled && - get_doubleagentd_port(&port) == 0) { + if (get_doubleagentd_port(&port) == 0) { error = default_getxattr_doubleagent(vp, name, uio, size, options, context, port); ipc_port_release_send(port); } else { - error = default_getxattr_vfs(vp, name, uio, size, options, - context); + error = ENOATTR; } return error; } @@ -1667,13 +1575,12 @@ default_setxattr(vnode_t vp, const char *name, uio_t uio, int options, mach_port_t port; int error; - if (vfs_xattr_doubleagent_enabled && - get_doubleagentd_port(&port) == 0) { + if (get_doubleagentd_port(&port) == 0) { error = default_setxattr_doubleagent(vp, name, uio, options, context, port); ipc_port_release_send(port); } else { - error = default_setxattr_vfs(vp, name, uio, options, context); + error = ENOATTR; } return error; } @@ -1688,13 +1595,12 @@ default_removexattr(vnode_t vp, const char *name, __unused int options, mach_port_t port; int error; - if (vfs_xattr_doubleagent_enabled && - get_doubleagentd_port(&port) == 0) { + if (get_doubleagentd_port(&port) == 0) { error = default_removexattr_doubleagent(vp, name, options, context, port); ipc_port_release_send(port); } else { - error = default_removexattr_vfs(vp, name, options, context); + error = ENOATTR; } return error; } @@ -1709,871 +1615,16 @@ default_listxattr(vnode_t vp, uio_t uio, size_t *size, __unused int options, mach_port_t port; int error; - if (vfs_xattr_doubleagent_enabled && - get_doubleagentd_port(&port) == 0) { + if (get_doubleagentd_port(&port) == 0) { error = default_listxattr_doubleagent(vp, uio, size, options, context, port); ipc_port_release_send(port); } else { - error = default_listxattr_vfs(vp, uio, size, options, context); + error = 0; } return error; } -/* - * Retrieve the data of an extended attribute. - * (VFS implementation). - */ -static int -default_getxattr_vfs(vnode_t vp, const char *name, uio_t uio, size_t *size, - __unused int options, vfs_context_t context) -{ - vnode_t xvp = NULL; - struct fileglob *xfg = NULL; - attr_info_t ainfo; - attr_header_t *header; - attr_entry_t *entry; - u_int8_t *attrdata; - u_int32_t datalen; - size_t namelen; - int isrsrcfork; - int fileflags; - int i; - int error; - - fileflags = FREAD | O_SHLOCK; - isrsrcfork = strncmp(name, XATTR_RESOURCEFORK_NAME, - sizeof(XATTR_RESOURCEFORK_NAME)) == 0; - - if ((error = open_xattrfile(vp, fileflags, &xfg, NULL, context))) { - return error; - } - xvp = fg_get_data(xfg); - - if ((error = get_xattrinfo(xvp, 0, &ainfo, context))) { - close_xattrfile(xfg, true, true, context); - return error; - } - - /* Get the Finder Info. 
*/ - if (strncmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { - if (ainfo.finderinfo == NULL || ainfo.emptyfinderinfo) { - error = ENOATTR; - } else if (uio == NULL) { - *size = FINDERINFOSIZE; - error = 0; - } else if (uio_offset(uio) != 0) { - error = EINVAL; - } else if (uio_resid(uio) < FINDERINFOSIZE) { - error = ERANGE; - } else { - attrdata = (u_int8_t*)ainfo.filehdr + ainfo.finderinfo->offset; - error = uiomove((caddr_t)attrdata, FINDERINFOSIZE, uio); - } - goto out; - } - - /* Read the Resource Fork. */ - if (isrsrcfork) { - if (!vnode_isreg(vp)) { - error = EPERM; - } else if (ainfo.rsrcfork == NULL) { - error = ENOATTR; - } else if (uio == NULL) { - *size = (size_t)ainfo.rsrcfork->length; - } else { - uio_setoffset(uio, uio_offset(uio) + ainfo.rsrcfork->offset); - error = VNOP_READ(xvp, uio, 0, context); - if (error == 0) { - uio_setoffset(uio, uio_offset(uio) - ainfo.rsrcfork->offset); - } - } - goto out; - } - - if (ainfo.attrhdr == NULL || ainfo.attr_entry == NULL) { - error = ENOATTR; - goto out; - } - if (uio_offset(uio) != 0) { - error = EINVAL; - goto out; - } - error = ENOATTR; - namelen = strlen(name) + 1; - header = ainfo.attrhdr; - entry = ainfo.attr_entry; - /* - * Search for attribute name in the header. - */ - for (i = 0; i < header->num_attrs && ATTR_VALID(entry, ainfo); i++) { - if (strncmp((const char *)entry->name, name, namelen) == 0) { - datalen = entry->length; - if (uio == NULL) { - *size = datalen; - error = 0; - break; - } - if (uio_resid(uio) < (user_ssize_t)datalen) { - error = ERANGE; - break; - } - if (entry->offset + datalen < ATTR_MAX_HDR_SIZE) { - attrdata = ((u_int8_t *)header + entry->offset); - error = uiomove((caddr_t)attrdata, datalen, uio); - } else { - uio_setoffset(uio, entry->offset); - error = VNOP_READ(xvp, uio, 0, context); - uio_setoffset(uio, 0); - } - break; - } - entry = ATTR_NEXT(entry); - } -out: - rel_xattrinfo(&ainfo); - close_xattrfile(xfg, true, true, context); - - return error; -} - -/* - * Set the data of an extended attribute. - * (VFS implementation). - */ -static int __attribute__((noinline)) -default_setxattr_vfs(vnode_t vp, const char *name, uio_t uio, int options, vfs_context_t context) -{ - vnode_t xvp = NULL; - struct fileglob *xfg = NULL; - attr_info_t ainfo; - attr_header_t *header; - attr_entry_t *entry; - attr_entry_t *lastentry; - u_int8_t *attrdata; - size_t datalen; - size_t entrylen; - size_t datafreespace; - int namelen; - int found = 0; - int i; - int splitdata; - int fileflags; - int error; - char finfo[FINDERINFOSIZE]; - - datalen = uio_resid(uio); - if (datalen > XATTR_MAXSIZE) { - return E2BIG; - } - namelen = (int)strlen(name) + 1; - if (namelen > UINT8_MAX) { - return EINVAL; - } - entrylen = ATTR_ENTRY_LENGTH(namelen); - - /* - * By convention, Finder Info that is all zeroes is equivalent to not - * having a Finder Info EA. So if we're trying to set the Finder Info - * to all zeroes, then delete it instead. If a file didn't have an - * AppleDouble file before, this prevents creating an AppleDouble file - * with no useful content. - * - * If neither XATTR_CREATE nor XATTR_REPLACE were specified, we check - * for all zeroes Finder Info before opening the AppleDouble file. - * But if either of those options were specified, we need to open the - * AppleDouble file to see whether there was already Finder Info (so we - * can return an error if needed); this case is handled further below. - * - * NOTE: this copies the Finder Info data into the "finfo" local. 
- */ - if (strncmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { - /* - * TODO: check the XATTR_CREATE and XATTR_REPLACE flags. - * That means we probably have to open_xattrfile and get_xattrinfo. - */ - if (uio_offset(uio) != 0) { - return EINVAL; - } - - if (datalen != FINDERINFOSIZE) { - return ERANGE; - } - - error = uiomove(finfo, (int)datalen, uio); - if (error) { - return error; - } - if ((options & (XATTR_CREATE | XATTR_REPLACE)) == 0 && - bcmp(finfo, emptyfinfo, FINDERINFOSIZE) == 0) { - error = default_removexattr(vp, name, 0, context); - if (error == ENOATTR) { - error = 0; - } - return error; - } - } - -start: - /* - * Open the file locked since setting an attribute - * can change the layout of the Apple Double file. - */ - fileflags = FREAD | FWRITE | O_EXLOCK; - if ((error = open_xattrfile(vp, O_CREAT | fileflags, &xfg, NULL, context))) { - return error; - } - xvp = fg_get_data(xfg); - - if ((error = get_xattrinfo(xvp, ATTR_SETTING, &ainfo, context))) { - close_xattrfile(xfg, true, true, context); - return error; - } - - /* Set the Finder Info. */ - if (strncmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { - if (ainfo.finderinfo && !ainfo.emptyfinderinfo) { - /* attr exists and "create" was specified? */ - if (options & XATTR_CREATE) { - error = EEXIST; - goto out; - } - } else { - /* attr doesn't exists and "replace" was specified? */ - if (options & XATTR_REPLACE) { - error = ENOATTR; - goto out; - } - } - if (options != 0 && bcmp(finfo, emptyfinfo, FINDERINFOSIZE) == 0) { - /* - * Setting the Finder Info to all zeroes is equivalent to - * removing it. Close the xattr file and let - * default_removexattr do the work (including deleting - * the xattr file if there are no other xattrs). - * - * Note that we have to handle the case where the - * Finder Info was already all zeroes, and we ignore - * ENOATTR. - * - * The common case where options == 0 was handled above. - */ - rel_xattrinfo(&ainfo); - close_xattrfile(xfg, true, true, context); - error = default_removexattr(vp, name, 0, context); - if (error == ENOATTR) { - error = 0; - } - return error; - } - if (ainfo.finderinfo) { - attrdata = (u_int8_t *)ainfo.filehdr + ainfo.finderinfo->offset; - bcopy(finfo, attrdata, datalen); - ainfo.iosize = sizeof(attr_header_t); - error = write_xattrinfo(&ainfo); - goto out; - } - error = ENOATTR; - goto out; - } - - /* Write the Resource Fork. */ - if (strncmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0) { - off_t endoffset; - - if (!vnode_isreg(vp)) { - error = EPERM; - goto out; - } - /* Make sure we have a rsrc fork pointer.. */ - if (ainfo.rsrcfork == NULL) { - error = ENOATTR; - goto out; - } - if (ainfo.rsrcfork) { - if (ainfo.rsrcfork->length != 0) { - if (options & XATTR_CREATE) { - /* attr exists, and create specified ? */ - error = EEXIST; - goto out; - } - } else { - /* Zero length AD rsrc fork */ - if (options & XATTR_REPLACE) { - /* attr doesn't exist (0-length), but replace specified ? 
*/ - error = ENOATTR; - goto out; - } - } - } else { - /* We can't do much if we somehow didn't get an AD rsrc pointer */ - error = ENOATTR; - goto out; - } - - endoffset = uio_resid(uio) + uio_offset(uio); /* new size */ - if (endoffset > UINT32_MAX || endoffset < 0) { - error = EINVAL; - goto out; - } - uio_setoffset(uio, uio_offset(uio) + ainfo.rsrcfork->offset); - error = VNOP_WRITE(xvp, uio, 0, context); - if (error) { - goto out; - } - uio_setoffset(uio, uio_offset(uio) - ainfo.rsrcfork->offset); - if (endoffset > ainfo.rsrcfork->length) { - ainfo.rsrcfork->length = (u_int32_t)endoffset; - ainfo.iosize = sizeof(attr_header_t); - error = write_xattrinfo(&ainfo); - goto out; - } - goto out; - } - - if (datalen > ATTR_MAX_SIZE) { - return E2BIG; /* EINVAL instead ? */ - } - - if (ainfo.attrhdr == NULL) { - error = ENOATTR; - goto out; - } - header = ainfo.attrhdr; - entry = ainfo.attr_entry; - - /* Check if data area crosses the maximum header size. */ - if ((header->data_start + header->data_length + entrylen + datalen) > ATTR_MAX_HDR_SIZE) { - splitdata = 1; /* do data I/O separately */ - } else { - splitdata = 0; - } - - /* - * See if attribute already exists. - */ - for (i = 0; i < header->num_attrs && ATTR_VALID(entry, ainfo); i++) { - if (strncmp((const char *)entry->name, name, namelen) == 0) { - found = 1; - break; - } - entry = ATTR_NEXT(entry); - } - - if (found) { - if (options & XATTR_CREATE) { - error = EEXIST; - goto out; - } - if (datalen == entry->length) { - if (splitdata) { - uio_setoffset(uio, entry->offset); - error = VNOP_WRITE(xvp, uio, 0, context); - uio_setoffset(uio, 0); - if (error) { - printf("setxattr: VNOP_WRITE error %d\n", error); - } - } else { - attrdata = (u_int8_t *)header + entry->offset; - error = uiomove((caddr_t)attrdata, (int)datalen, uio); - if (error) { - goto out; - } - ainfo.iosize = ainfo.attrhdr->data_start + ainfo.attrhdr->data_length; - error = write_xattrinfo(&ainfo); - if (error) { - printf("setxattr: write_xattrinfo error %d\n", error); - } - } - goto out; - } else { - /* - * Brute force approach - just remove old entry and set new entry. - */ - found = 0; - rel_xattrinfo(&ainfo); - close_xattrfile(xfg, true, true, context); - error = default_removexattr(vp, name, options, context); - if (error) { - return error; - } - /* Clear XATTR_REPLACE option since we just removed the attribute. */ - options &= ~XATTR_REPLACE; - goto start; /* start over */ - } - } else { - if (!ATTR_VALID(entry, ainfo)) { - error = ENOSPC; - goto out; - } - } - - if (options & XATTR_REPLACE) { - error = ENOATTR; /* nothing there to replace */ - goto out; - } - /* Check if header size limit has been reached. */ - if ((header->data_start + entrylen) > ATTR_MAX_HDR_SIZE) { - error = ENOSPC; - goto out; - } - - datafreespace = header->total_size - (header->data_start + header->data_length); - - /* Check if we need more space. */ - if ((datalen + entrylen) > datafreespace) { - size_t growsize; - - growsize = roundup((datalen + entrylen) - datafreespace, ATTR_BUF_SIZE); - - /* Clip roundup size when we can still fit in ATTR_MAX_HDR_SIZE. */ - if (!splitdata && (header->total_size + growsize) > ATTR_MAX_HDR_SIZE) { - growsize = ATTR_MAX_HDR_SIZE - header->total_size; - } - - ainfo.filesize += growsize; - error = vnode_setsize(xvp, ainfo.filesize, 0, context); - if (error) { - printf("setxattr: VNOP_TRUNCATE error %d\n", error); - } - if (error) { - goto out; - } - - /* - * Move the resource fork out of the way. 
- */ - if (ainfo.rsrcfork) { - if (ainfo.rsrcfork->length != 0) { - shift_data_down(xvp, - ainfo.rsrcfork->offset, - ainfo.rsrcfork->length, - growsize, context); - } - ainfo.rsrcfork->offset += growsize; - } - ainfo.finderinfo->length += growsize; - header->total_size += growsize; - } - - /* Make space for a new entry. */ - if (splitdata) { - shift_data_down(xvp, - header->data_start, - header->data_length, - entrylen, context); - } else { - bcopy((u_int8_t *)header + header->data_start, - (u_int8_t *)header + header->data_start + entrylen, - header->data_length); - } - header->data_start += entrylen; - - /* Fix up entry data offsets. */ - lastentry = entry; - for (entry = ainfo.attr_entry; entry != lastentry && ATTR_VALID(entry, ainfo); entry = ATTR_NEXT(entry)) { - entry->offset += entrylen; - } - - /* - * If the attribute data area is entirely within - * the header buffer, then just update the buffer, - * otherwise we'll write it separately to the file. - */ - if (splitdata) { - off_t offset; - - /* Write new attribute data after the end of existing data. */ - offset = header->data_start + header->data_length; - uio_setoffset(uio, offset); - error = VNOP_WRITE(xvp, uio, 0, context); - uio_setoffset(uio, 0); - if (error) { - printf("setxattr: VNOP_WRITE error %d\n", error); - goto out; - } - } else { - attrdata = (u_int8_t *)header + header->data_start + header->data_length; - - error = uiomove((caddr_t)attrdata, (int)datalen, uio); - if (error) { - printf("setxattr: uiomove error %d\n", error); - goto out; - } - } - - /* Create the attribute entry. */ - lastentry->length = (u_int32_t)datalen; - lastentry->offset = header->data_start + header->data_length; - lastentry->namelen = (u_int8_t)namelen; - lastentry->flags = 0; - bcopy(name, &lastentry->name[0], namelen); - - /* Update the attributes header. */ - header->num_attrs++; - header->data_length += datalen; - - if (splitdata) { - /* Only write the entries, since the data was written separately. */ - ainfo.iosize = ainfo.attrhdr->data_start; - } else { - /* The entry and data are both in the header; write them together. */ - ainfo.iosize = ainfo.attrhdr->data_start + ainfo.attrhdr->data_length; - } - error = write_xattrinfo(&ainfo); - if (error) { - printf("setxattr: write_xattrinfo error %d\n", error); - } - -out: - rel_xattrinfo(&ainfo); - close_xattrfile(xfg, true, true, context); - - /* Touch the change time if we changed an attribute. */ - if (error == 0) { - struct vnode_attr va; - - /* Re-write the mtime to cause a ctime change. */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_modify_time); - if (vnode_getattr(vp, &va, context) == 0) { - VATTR_INIT(&va); - VATTR_SET(&va, va_modify_time, va.va_modify_time); - (void) vnode_setattr(vp, &va, context); - } - } - - post_event_if_success(vp, error, NOTE_ATTRIB); - - return error; -} - - -/* - * Remove an extended attribute. - * (VFS implementation). 
- */ -static int -default_removexattr_vfs(vnode_t vp, const char *name, __unused int options, vfs_context_t context) -{ - vnode_t xvp = NULL; - struct fileglob *xfg = NULL; - attr_info_t ainfo; - attr_header_t *header; - attr_entry_t *entry; - attr_entry_t *oldslot; - u_int8_t *attrdata; - u_int32_t dataoff; - size_t datalen; - size_t entrylen; - int namelen; - int found = 0, lastone = 0; - int i; - int splitdata; - int attrcount = 0; - int isrsrcfork; - int fileflags; - int error; - - fileflags = FREAD | FWRITE | O_EXLOCK; - isrsrcfork = strncmp(name, XATTR_RESOURCEFORK_NAME, - sizeof(XATTR_RESOURCEFORK_NAME)) == 0; - - if ((error = open_xattrfile(vp, fileflags, &xfg, NULL, context))) { - return error; - } - xvp = fg_get_data(xfg); - - if ((error = get_xattrinfo(xvp, 0, &ainfo, context))) { - close_xattrfile(xfg, true, true, context); - return error; - } - if (ainfo.attrhdr) { - attrcount += ainfo.attrhdr->num_attrs; - } - if (ainfo.rsrcfork) { - ++attrcount; - } - if (ainfo.finderinfo && !ainfo.emptyfinderinfo) { - ++attrcount; - } - - /* Clear the Finder Info. */ - if (strncmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { - if (ainfo.finderinfo == NULL || ainfo.emptyfinderinfo) { - error = ENOATTR; - goto out; - } - /* On removal of last attribute the ._ file is removed. */ - if (--attrcount == 0) { - goto out; - } - attrdata = (u_int8_t *)ainfo.filehdr + ainfo.finderinfo->offset; - bzero((caddr_t)attrdata, FINDERINFOSIZE); - error = write_xattrinfo(&ainfo); - goto out; - } - - /* Clear the Resource Fork. */ - if (isrsrcfork) { - if (!vnode_isreg(vp)) { - error = EPERM; - goto out; - } - if (ainfo.rsrcfork == NULL || ainfo.rsrcfork->length == 0) { - error = ENOATTR; - goto out; - } - /* On removal of last attribute the ._ file is removed. */ - if (--attrcount == 0) { - goto out; - } - /* - * XXX - * If the resource fork isn't the last AppleDouble - * entry then the space needs to be reclaimed by - * shifting the entries after the resource fork. - */ - if ((ainfo.rsrcfork->offset + ainfo.rsrcfork->length) == ainfo.filesize) { - ainfo.filesize -= ainfo.rsrcfork->length; - error = vnode_setsize(xvp, ainfo.filesize, 0, context); - } - if (error == 0) { - ainfo.rsrcfork->length = 0; - ainfo.iosize = sizeof(attr_header_t); - error = write_xattrinfo(&ainfo); - } - goto out; - } - - if (ainfo.attrhdr == NULL) { - error = ENOATTR; - goto out; - } - namelen = (int)strlen(name) + 1; - header = ainfo.attrhdr; - entry = ainfo.attr_entry; - - /* - * See if this attribute exists. - */ - for (i = 0; i < header->num_attrs && ATTR_VALID(entry, ainfo); i++) { - if (strncmp((const char *)entry->name, name, namelen) == 0) { - found = 1; - if ((i + 1) == header->num_attrs) { - lastone = 1; - } - break; - } - entry = ATTR_NEXT(entry); - } - if (!found) { - error = ENOATTR; - goto out; - } - /* On removal of last attribute the ._ file is removed. */ - if (--attrcount == 0) { - goto out; - } - - datalen = entry->length; - dataoff = entry->offset; - entrylen = ATTR_ENTRY_LENGTH(namelen); - if ((header->data_start + header->data_length) > ATTR_MAX_HDR_SIZE) { - splitdata = 1; - } else { - splitdata = 0; - } - - /* Remove the attribute entry. */ - if (!lastone) { - bcopy((u_int8_t *)entry + entrylen, (u_int8_t *)entry, - ((size_t)header + header->data_start) - ((size_t)entry + entrylen)); - } - - /* Adjust the attribute data. 
*/ - if (splitdata) { - shift_data_up(xvp, - header->data_start, - dataoff - header->data_start, - entrylen, - context); - if (!lastone) { - shift_data_up(xvp, - dataoff + datalen, - (header->data_start + header->data_length) - (dataoff + datalen), - datalen + entrylen, - context); - } - /* XXX write zeros to freed space ? */ - ainfo.iosize = ainfo.attrhdr->data_start - entrylen; - } else { - bcopy((u_int8_t *)header + header->data_start, - (u_int8_t *)header + header->data_start - entrylen, - dataoff - header->data_start); - if (!lastone) { - bcopy((u_int8_t *)header + dataoff + datalen, - (u_int8_t *)header + dataoff - entrylen, - (header->data_start + header->data_length) - (dataoff + datalen)); - } - bzero(((u_int8_t *)header + header->data_start + header->data_length) - (datalen + entrylen), (datalen + entrylen)); - ainfo.iosize = ainfo.attrhdr->data_start + ainfo.attrhdr->data_length; - } - - /* Adjust the header values and entry offsets. */ - header->num_attrs--; - header->data_start -= entrylen; - header->data_length -= datalen; - - oldslot = entry; - entry = ainfo.attr_entry; - for (i = 0; i < header->num_attrs && ATTR_VALID(entry, ainfo); i++) { - entry->offset -= entrylen; - if (entry >= oldslot) { - entry->offset -= datalen; - } - entry = ATTR_NEXT(entry); - } - error = write_xattrinfo(&ainfo); - if (error) { - printf("removexattr: write_xattrinfo error %d\n", error); - } -out: - rel_xattrinfo(&ainfo); - - /* When there are no more attributes remove the ._ file. */ - if (attrcount == 0) { - remove_xattrfile(xfg, xvp, context); - } else { - close_xattrfile(xfg, true, true, context); - } - /* Touch the change time if we changed an attribute. */ - if (error == 0) { - struct vnode_attr va; - - /* Re-write the mtime to cause a ctime change. */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_modify_time); - if (vnode_getattr(vp, &va, context) == 0) { - VATTR_INIT(&va); - VATTR_SET(&va, va_modify_time, va.va_modify_time); - (void) vnode_setattr(vp, &va, context); - } - } - - post_event_if_success(vp, error, NOTE_ATTRIB); - - return error; -} - - -/* - * Retrieve the list of extended attribute names. - * (VFS implementation). - */ -static int -default_listxattr_vfs(vnode_t vp, uio_t uio, size_t *size, __unused int options, vfs_context_t context) -{ - vnode_t xvp = NULL; - struct fileglob *xfg = NULL; - attr_info_t ainfo; - attr_entry_t *entry; - int i, count; - int error; - - /* - * We do not zero "*size" here as we don't want to stomp a size set when - * VNOP_LISTXATTR processed any native EAs. That size is initially zeroed by the - * system call layer, up in listxattr or flistxattr. - */ - - if ((error = open_xattrfile(vp, FREAD | O_SHLOCK, &xfg, NULL, context))) { - if (error == ENOATTR) { - error = 0; - } - return error; - } - xvp = fg_get_data(xfg); - - if ((error = get_xattrinfo(xvp, 0, &ainfo, context))) { - if (error == ENOATTR) { - error = 0; - } - close_xattrfile(xfg, true, true, context); - return error; - } - - /* Check for Finder Info. */ - if (ainfo.finderinfo && !ainfo.emptyfinderinfo) { - if (uio == NULL) { - *size += sizeof(XATTR_FINDERINFO_NAME); - } else if (uio_resid(uio) < (user_ssize_t)sizeof(XATTR_FINDERINFO_NAME)) { - error = ERANGE; - goto out; - } else { - error = uiomove(XATTR_FINDERINFO_NAME, - sizeof(XATTR_FINDERINFO_NAME), uio); - if (error) { - error = ERANGE; - goto out; - } - } - } - - /* Check for Resource Fork. 
*/ - if (vnode_isreg(vp) && ainfo.rsrcfork) { - if (uio == NULL) { - *size += sizeof(XATTR_RESOURCEFORK_NAME); - } else if (uio_resid(uio) < (user_ssize_t)sizeof(XATTR_RESOURCEFORK_NAME)) { - error = ERANGE; - goto out; - } else { - error = uiomove(XATTR_RESOURCEFORK_NAME, - sizeof(XATTR_RESOURCEFORK_NAME), uio); - if (error) { - error = ERANGE; - goto out; - } - } - } - - /* Check for attributes. */ - if (ainfo.attrhdr) { - count = ainfo.attrhdr->num_attrs; - for (i = 0, entry = ainfo.attr_entry; i < count && ATTR_VALID(entry, ainfo); i++) { - if (xattr_protected((const char *)entry->name) || - ((entry->namelen < XATTR_MAXNAMELEN) && - (entry->name[entry->namelen] == '\0') && - (xattr_validatename((const char *)entry->name) != 0))) { - entry = ATTR_NEXT(entry); - continue; - } - if (uio == NULL) { - *size += entry->namelen; - entry = ATTR_NEXT(entry); - continue; - } - if (uio_resid(uio) < entry->namelen) { - error = ERANGE; - break; - } - error = uiomove((caddr_t) entry->name, entry->namelen, uio); - if (error) { - if (error != EFAULT) { - error = ERANGE; - } - break; - } - entry = ATTR_NEXT(entry); - } - } -out: - rel_xattrinfo(&ainfo); - close_xattrfile(xfg, true, true, context); - - return error; -} - static int get_doubleagentd_port(mach_port_t *doubleagentd_port) { @@ -2618,7 +1669,7 @@ default_getxattr_doubleagent(vnode_t vp, const char *name, uio_t uio, isrsrcfork = strncmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0; - if ((error = open_xattrfile(vp, fileflags, &xfg, &fsize, context))) { + if ((error = open_xattrfile(vp, fileflags, &xfg, &fsize, NULL, context))) { goto out; } xvp = fg_get_data(xfg); @@ -2710,7 +1761,7 @@ default_listxattr_doubleagent(vnode_t vp, uio_t uio, size_t *size, * flistxattr(). */ - if ((error = open_xattrfile(vp, FREAD | O_SHLOCK, &xfg, &fsize, + if ((error = open_xattrfile(vp, FREAD | O_SHLOCK, &xfg, &fsize, NULL, context))) { if (error == ENOATTR) { error = 0; @@ -2799,6 +1850,8 @@ default_setxattr_doubleagent(vnode_t vp, const char *name, uio_t uio, int64_t fsize; kern_return_t kr; bool have_iocount = true; + bool created_xattr_file = false; + bool removed_xattr_file = false; datalen = uio_resid(uio); if (datalen > XATTR_MAXSIZE) { @@ -2869,8 +1922,8 @@ default_setxattr_doubleagent(vnode_t vp, const char *name, uio_t uio, * can change the layout of the Apple Double file. */ fileflags = FREAD | FWRITE | O_EXLOCK; - if ((error = open_xattrfile(vp, O_CREAT | fileflags, &xfg, - &fsize, context))) { + if ((error = open_xattrfile(vp, O_CREAT | fileflags, &xfg, &fsize, + &created_xattr_file, context))) { goto out; } xvp = fg_get_data(xfg); @@ -2913,7 +1966,22 @@ default_setxattr_doubleagent(vnode_t vp, const char *name, uio_t uio, out: if (xfg != NULL) { - close_xattrfile(xfg, have_iocount, true, context); + /* + * In case we have just created the AppleDouble file, and DoubleAgent + * couldn't allocate space for the xattr, remove it so we won't leave + * an uninitialized AppleDouble file. + */ + if (error && created_xattr_file) { + /* remove_xattrfile() assumes we have an iocount on the vnode */ + if (vnode_getwithref(xvp) == 0) { + remove_xattrfile(xfg, xvp, context); + removed_xattr_file = true; + } + } + /* remove_xattrfile() would call close_xattrfile already */ + if (!removed_xattr_file) { + close_xattrfile(xfg, have_iocount, true, context); + } } /* Touch the change time if we changed an attribute. 
*/ @@ -2960,7 +2028,7 @@ default_removexattr_doubleagent(vnode_t vp, const char *name, isrsrcfork = strncmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0; - if ((error = open_xattrfile(vp, fileflags, &xfg, &fsize, context))) { + if ((error = open_xattrfile(vp, fileflags, &xfg, &fsize, NULL, context))) { goto out; } xvp = fg_get_data(xfg); @@ -3023,7 +2091,7 @@ out: static int open_xattrfile(vnode_t vp, int fileflags, struct fileglob **xfgp, - int64_t *file_sizep, vfs_context_t context) + int64_t *file_sizep, bool *created_xattr_filep, vfs_context_t context) { extern const struct fileops vnops; /* XXX */ vnode_t xvp = NULLVP; @@ -3145,6 +2213,9 @@ lookup: } else { xvp = nd->ni_vp; created_xattr_file = true; + if (created_xattr_filep) { + *created_xattr_filep = true; + } } } nameidone(nd); @@ -3196,34 +2267,6 @@ lookup: } referenced = 1; - /* - * If create was requested, make sure file header exists. - * This is only done in the non-DoubleAgent case. - * XXX And will be garbage-collected in due time. - */ - if (!vfs_xattr_doubleagent_enabled && (fileflags & O_CREAT) != 0) { - VATTR_INIT(va); - VATTR_WANTED(va, va_data_size); - VATTR_WANTED(va, va_fileid); - VATTR_WANTED(va, va_nlink); - if ((error = vnode_getattr(xvp, va, context)) != 0) { - error = EPERM; - goto out; - } - - /* If the file is empty then add a default header. */ - if (va->va_data_size == 0) { - /* Don't adopt hard-linked "._" files. */ - if (VATTR_IS_SUPPORTED(va, va_nlink) && va->va_nlink > 1) { - error = EPERM; - goto out; - } - if ((error = create_xattrfile(xvp, (u_int32_t)va->va_fileid, context))) { - goto out; - } - } - } - /* * Allocate a file object for the referenced vnode. * This file object now owns the vnode reference, @@ -3411,679 +2454,6 @@ out: } } -/* - * Read in and parse the AppleDouble header and entries, and the extended - * attribute header and entries if any. Populates the fields of ainfop - * based on the headers and entries found. - * - * The basic idea is to: - * - Read in up to ATTR_MAX_HDR_SIZE bytes of the start of the file. All - * AppleDouble entries, the extended attribute header, and extended - * attribute entries must lie within this part of the file; the rest of - * the AppleDouble handling code assumes this. Plus it allows us to - * somewhat optimize by doing a smaller number of larger I/Os. - * - Swap and sanity check the AppleDouble header (including the AppleDouble - * entries). - * - Find the Finder Info and Resource Fork entries, if any. - * - If we're going to be writing, try to make sure the Finder Info entry has - * room to store the extended attribute header, plus some space for extended - * attributes. - * - Swap and sanity check the extended attribute header and entries (if any). - */ -static int -get_xattrinfo(vnode_t xvp, int setting, attr_info_t *ainfop, vfs_context_t context) -{ - uio_t auio = NULL; - void * buffer = NULL; - apple_double_header_t *filehdr; - struct vnode_attr va; - size_t iosize = 0; - int i; - int error; - - bzero(ainfop, sizeof(attr_info_t)); - ainfop->filevp = xvp; - ainfop->context = context; - VATTR_INIT(&va); - VATTR_WANTED(&va, va_data_size); - VATTR_WANTED(&va, va_fileid); - if ((error = vnode_getattr(xvp, &va, context))) { - goto bail; - } - ainfop->filesize = va.va_data_size; - - /* When setting attributes, allow room for the header to grow. 
*/ - if (setting) { - iosize = ATTR_MAX_HDR_SIZE; - } else { - iosize = MIN(ATTR_MAX_HDR_SIZE, ainfop->filesize); - } - - if (iosize == 0 || iosize < sizeof(apple_double_header_t)) { - error = ENOATTR; - goto bail; - } - - ainfop->iosize = iosize; - buffer = kalloc_data(iosize, Z_WAITOK | Z_ZERO); - if (buffer == NULL) { - error = ENOMEM; - goto bail; - } - - auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); - uio_addiov(auio, (uintptr_t)buffer, iosize); - - /* Read the file header. */ - error = VNOP_READ(xvp, auio, 0, context); - if (error) { - goto bail; - } - ainfop->rawsize = iosize - uio_resid(auio); - ainfop->rawdata = (u_int8_t *)buffer; - - filehdr = (apple_double_header_t *)buffer; - - error = check_and_swap_apple_double_header(ainfop); - if (error) { - goto bail; - } - - ainfop->filehdr = filehdr; /* valid AppleDouble header */ - - /* rel_xattrinfo is responsible for freeing the header buffer */ - buffer = NULL; - - /* Find the Finder Info and Resource Fork entries, if any */ - for (i = 0; i < filehdr->numEntries; ++i) { - if (filehdr->entries[i].type == AD_FINDERINFO && - filehdr->entries[i].length >= FINDERINFOSIZE) { - /* We found the Finder Info entry. */ - ainfop->finderinfo = &filehdr->entries[i]; - - /* At this point check_and_swap_apple_double_header() call above - * verified that all apple double entires are valid: - * they point somewhere within the file. - * - * Now for finderinfo make sure that the fixed portion - * is within the buffer we read in. - */ - if (((ainfop->finderinfo->offset + FINDERINFOSIZE) > ainfop->finderinfo->offset) && - ((ainfop->finderinfo->offset + FINDERINFOSIZE) <= ainfop->rawsize)) { - /* - * Is the Finder Info "empty" (all zeroes)? If so, - * we'll pretend like the Finder Info extended attribute - * does not exist. - */ - if (bcmp((u_int8_t*)ainfop->filehdr + ainfop->finderinfo->offset, emptyfinfo, sizeof(emptyfinfo)) == 0) { - ainfop->emptyfinderinfo = 1; - } - } else { - error = ENOATTR; - goto bail; - } - } - if (filehdr->entries[i].type == AD_RESOURCE) { - /* - * Ignore zero-length resource forks when getting. If setting, - * we need to remember the resource fork entry so it can be - * updated once the new content has been written. - */ - if (filehdr->entries[i].length == 0 && !setting) { - continue; - } - - /* - * Check to see if any "empty" resource fork is ours (i.e. is ignorable). - * - * The "empty" resource headers we created have a system data tag of: - * "This resource fork intentionally left blank " - */ - if (filehdr->entries[i].length == sizeof(rsrcfork_header_t) && !setting) { - uio_t rf_uio; - u_int8_t systemData[64]; - int rf_err; - - - /* Read the system data which starts at byte 16 */ - rf_uio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); - uio_addiov(rf_uio, (uintptr_t)systemData, sizeof(systemData)); - uio_setoffset(rf_uio, filehdr->entries[i].offset + 16); - rf_err = VNOP_READ(xvp, rf_uio, 0, context); - uio_free(rf_uio); - - if (rf_err != 0 || - bcmp(systemData, RF_EMPTY_TAG, sizeof(RF_EMPTY_TAG)) == 0) { - continue; /* skip this resource fork */ - } - } - ainfop->rsrcfork = &filehdr->entries[i]; - if (i != (filehdr->numEntries - 1)) { - printf("get_xattrinfo: resource fork not last entry\n"); - ainfop->readonly = 1; - } - continue; - } - } - - /* - * See if this file looks like it is laid out correctly to contain - * extended attributes. 
If so, then do the following: - * - * - If we're going to be writing, try to make sure the Finder Info - * entry has room to store the extended attribute header, plus some - * space for extended attributes. - * - * - Swap and sanity check the extended attribute header and entries - * (if any). - */ - if (filehdr->numEntries == 2 && - ainfop->finderinfo == &filehdr->entries[0] && - ainfop->rsrcfork == &filehdr->entries[1] && - ainfop->finderinfo->offset == offsetof(apple_double_header_t, finfo)) { - attr_header_t *attrhdr; - attrhdr = (attr_header_t *)filehdr; - /* - * If we're going to be writing, try to make sure the Finder - * Info entry has room to store the extended attribute header, - * plus some space for extended attributes. - */ - if (setting && ainfop->finderinfo->length == FINDERINFOSIZE) { - size_t delta; - size_t writesize; - - delta = ATTR_BUF_SIZE - (filehdr->entries[0].offset + FINDERINFOSIZE); - if (ainfop->rsrcfork && filehdr->entries[1].length) { - /* Make some room before existing resource fork. */ - shift_data_down(xvp, - filehdr->entries[1].offset, - filehdr->entries[1].length, - delta, context); - writesize = sizeof(attr_header_t); - } else { - /* We are in case where existing resource fork of length 0, try to create a new, empty resource fork. */ - rsrcfork_header_t *rsrcforkhdr; - - /* Do we have enough space in the header buffer for empty resource fork */ - if (filehdr->entries[1].offset + delta + sizeof(rsrcfork_header_t) > ainfop->iosize) { - /* we do not have space, bail for now */ - error = ENOATTR; - goto bail; - } - - vnode_setsize(xvp, filehdr->entries[1].offset + delta, 0, context); - - /* Steal some space for an empty RF header. */ - delta -= sizeof(rsrcfork_header_t); - - bzero(&attrhdr->appledouble.pad[0], delta); - rsrcforkhdr = (rsrcfork_header_t *)((char *)filehdr + filehdr->entries[1].offset + delta); - - /* Fill in Empty Resource Fork Header. */ - init_empty_resource_fork(rsrcforkhdr); - - filehdr->entries[1].length = sizeof(rsrcfork_header_t); - writesize = ATTR_BUF_SIZE; - } - filehdr->entries[0].length += delta; - filehdr->entries[1].offset += delta; - - /* Fill in Attribute Header. */ - attrhdr->magic = ATTR_HDR_MAGIC; - attrhdr->debug_tag = (u_int32_t)va.va_fileid; - attrhdr->total_size = filehdr->entries[1].offset; - attrhdr->data_start = sizeof(attr_header_t); - attrhdr->data_length = 0; - attrhdr->reserved[0] = 0; - attrhdr->reserved[1] = 0; - attrhdr->reserved[2] = 0; - attrhdr->flags = 0; - attrhdr->num_attrs = 0; - - /* Push out new header */ - uio_reset(auio, 0, UIO_SYSSPACE, UIO_WRITE); - uio_addiov(auio, (uintptr_t)filehdr, writesize); - - swap_adhdr(filehdr); /* to big endian */ - swap_attrhdr(attrhdr, ainfop); /* to big endian */ - error = VNOP_WRITE(xvp, auio, 0, context); - swap_adhdr(filehdr); /* back to native */ - /* The attribute header gets swapped below. */ - } - } - /* - * Swap and sanity check the extended attribute header and - * entries (if any). The Finder Info content must be big enough - * to include the extended attribute header; if not, we just - * ignore it. - * - * Note that we're passing the offset + length (i.e. the end) - * of the Finder Info instead of rawsize to validate_attrhdr. - * This ensures that all extended attributes lie within the - * Finder Info content according to the AppleDouble entry. - * - * Sets ainfop->attrhdr and ainfop->attr_entry if a valid - * header was found. 
- */ - if (ainfop->finderinfo && - ainfop->finderinfo == &filehdr->entries[0] && - ainfop->finderinfo->length >= (sizeof(attr_header_t) - sizeof(apple_double_header_t))) { - attr_header_t *attrhdr = (attr_header_t*)filehdr; - - if (ainfop->finderinfo->offset != offsetof(apple_double_header_t, finfo)) { - error = ENOATTR; - goto bail; - } - - if ((error = check_and_swap_attrhdr(attrhdr, ainfop)) == 0) { - ainfop->attrhdr = attrhdr; /* valid attribute header */ - /* First attr_entry starts immediately following attribute header */ - ainfop->attr_entry = (attr_entry_t *)&attrhdr[1]; - } - } - - error = 0; -bail: - if (auio != NULL) { - uio_free(auio); - } - kfree_data(buffer, iosize); - return error; -} - - -static int -create_xattrfile(vnode_t xvp, u_int32_t fileid, vfs_context_t context) -{ - attr_header_t *xah; - rsrcfork_header_t *rsrcforkhdr; - void * buffer; - uio_t auio; - int rsrcforksize; - int error; - - buffer = kalloc_data(ATTR_BUF_SIZE, Z_WAITOK | Z_ZERO); - - xah = (attr_header_t *)buffer; - auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE); - uio_addiov(auio, (uintptr_t)buffer, ATTR_BUF_SIZE); - rsrcforksize = sizeof(rsrcfork_header_t); - rsrcforkhdr = (rsrcfork_header_t *) ((char *)buffer + ATTR_BUF_SIZE - rsrcforksize); - - /* Fill in Apple Double Header. */ - xah->appledouble.magic = SWAP32(ADH_MAGIC); - xah->appledouble.version = SWAP32(ADH_VERSION); - xah->appledouble.numEntries = SWAP16(2); - xah->appledouble.entries[0].type = SWAP32(AD_FINDERINFO); - xah->appledouble.entries[0].offset = SWAP32(offsetof(apple_double_header_t, finfo)); - xah->appledouble.entries[0].length = SWAP32(ATTR_BUF_SIZE - offsetof(apple_double_header_t, finfo) - rsrcforksize); - xah->appledouble.entries[1].type = SWAP32(AD_RESOURCE); - xah->appledouble.entries[1].offset = SWAP32(ATTR_BUF_SIZE - rsrcforksize); - xah->appledouble.entries[1].length = SWAP32(rsrcforksize); - bcopy(ADH_MACOSX, xah->appledouble.filler, sizeof(xah->appledouble.filler)); - - /* Fill in Attribute Header. */ - xah->magic = SWAP32(ATTR_HDR_MAGIC); - xah->debug_tag = SWAP32(fileid); - xah->total_size = SWAP32(ATTR_BUF_SIZE - rsrcforksize); - xah->data_start = SWAP32(sizeof(attr_header_t)); - - /* Fill in Empty Resource Fork Header. */ - init_empty_resource_fork(rsrcforkhdr); - - /* Push it out. */ - error = VNOP_WRITE(xvp, auio, IO_UNIT, context); - - /* Did we write out the full uio? 
*/ - if (uio_resid(auio) > 0) { - error = ENOSPC; - } - - uio_free(auio); - kfree_data(buffer, ATTR_BUF_SIZE); - - return error; -} - -static void -init_empty_resource_fork(rsrcfork_header_t * rsrcforkhdr) -{ - bzero(rsrcforkhdr, sizeof(rsrcfork_header_t)); - rsrcforkhdr->fh_DataOffset = SWAP32(RF_FIRST_RESOURCE); - rsrcforkhdr->fh_MapOffset = SWAP32(RF_FIRST_RESOURCE); - rsrcforkhdr->fh_MapLength = SWAP32(RF_NULL_MAP_LENGTH); - rsrcforkhdr->mh_DataOffset = SWAP32(RF_FIRST_RESOURCE); - rsrcforkhdr->mh_MapOffset = SWAP32(RF_FIRST_RESOURCE); - rsrcforkhdr->mh_MapLength = SWAP32(RF_NULL_MAP_LENGTH); - rsrcforkhdr->mh_Types = SWAP16(RF_NULL_MAP_LENGTH - 2 ); - rsrcforkhdr->mh_Names = SWAP16(RF_NULL_MAP_LENGTH); - rsrcforkhdr->typeCount = SWAP16(-1); - bcopy(RF_EMPTY_TAG, rsrcforkhdr->systemData, sizeof(RF_EMPTY_TAG)); -} - -static void -rel_xattrinfo(attr_info_t *ainfop) -{ - kfree_data_addr(ainfop->filehdr); - bzero(ainfop, sizeof(attr_info_t)); -} - -static int -write_xattrinfo(attr_info_t *ainfop) -{ - uio_t auio; - int error; - - auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE); - uio_addiov(auio, (uintptr_t)ainfop->filehdr, ainfop->iosize); - - swap_adhdr(ainfop->filehdr); - if (ainfop->attrhdr != NULL) { - swap_attrhdr(ainfop->attrhdr, ainfop); - } - - error = VNOP_WRITE(ainfop->filevp, auio, 0, ainfop->context); - - swap_adhdr(ainfop->filehdr); - if (ainfop->attrhdr != NULL) { - swap_attrhdr(ainfop->attrhdr, ainfop); - } - uio_free(auio); - - return error; -} - -#if BYTE_ORDER == LITTLE_ENDIAN -/* - * Endian swap apple double header - */ -static void -swap_adhdr(apple_double_header_t *adh) -{ - int count; - int i; - - count = (adh->magic == ADH_MAGIC) ? adh->numEntries : SWAP16(adh->numEntries); - - adh->magic = SWAP32(adh->magic); - adh->version = SWAP32(adh->version); - adh->numEntries = SWAP16(adh->numEntries); - - for (i = 0; i < count; i++) { - adh->entries[i].type = SWAP32(adh->entries[i].type); - adh->entries[i].offset = SWAP32(adh->entries[i].offset); - adh->entries[i].length = SWAP32(adh->entries[i].length); - } -} - -/* - * Endian swap extended attributes header - */ -static void -swap_attrhdr(attr_header_t *ah, attr_info_t* info) -{ - attr_entry_t *ae; - int count; - int i; - - count = (ah->magic == ATTR_HDR_MAGIC) ? ah->num_attrs : SWAP16(ah->num_attrs); - - ah->magic = SWAP32(ah->magic); - ah->debug_tag = SWAP32(ah->debug_tag); - ah->total_size = SWAP32(ah->total_size); - ah->data_start = SWAP32(ah->data_start); - ah->data_length = SWAP32(ah->data_length); - ah->flags = SWAP16(ah->flags); - ah->num_attrs = SWAP16(ah->num_attrs); - - ae = (attr_entry_t *)(&ah[1]); - for (i = 0; i < count && ATTR_VALID(ae, *info); i++, ae = ATTR_NEXT(ae)) { - ae->offset = SWAP32(ae->offset); - ae->length = SWAP32(ae->length); - ae->flags = SWAP16(ae->flags); - } -} -#endif - -/* - * Validate and swap the attributes header contents, and each attribute's - * attr_entry_t. - * - * Note: Assumes the caller has verified that the Finder Info content is large - * enough to contain the attr_header structure itself. Therefore, we can - * swap the header fields before sanity checking them. 
- */ -static int -check_and_swap_attrhdr(attr_header_t *ah, attr_info_t *ainfop) -{ - attr_entry_t *ae; - u_int8_t *buf_end; - u_int32_t end; - int count; - int i; - uint32_t total_header_size; - uint32_t total_data_size; - - if (ah == NULL) { - return EINVAL; - } - - if (SWAP32(ah->magic) != ATTR_HDR_MAGIC) { - return EINVAL; - } - - /* Swap the basic header fields */ - ah->magic = SWAP32(ah->magic); - ah->debug_tag = SWAP32(ah->debug_tag); - ah->total_size = SWAP32(ah->total_size); - ah->data_start = SWAP32(ah->data_start); - ah->data_length = SWAP32(ah->data_length); - ah->flags = SWAP16(ah->flags); - ah->num_attrs = SWAP16(ah->num_attrs); - - /* - * Make sure the total_size fits within the Finder Info area, and the - * extended attribute data area fits within total_size. - */ - end = ah->data_start + ah->data_length; - if (ah->total_size > ainfop->finderinfo->offset + ainfop->finderinfo->length || - ah->data_start < sizeof(attr_header_t) || - end < ah->data_start || - end > ah->total_size) { - return EINVAL; - } - - /* - * Make sure each of the attr_entry_t's fits within total_size. - */ - buf_end = ainfop->rawdata + ah->data_start; - if (buf_end > ainfop->rawdata + ainfop->rawsize) { - return EINVAL; - } - count = ah->num_attrs; - if (count > 256) { - return EINVAL; - } - ae = (attr_entry_t *)(&ah[1]); - - total_header_size = sizeof(attr_header_t); - total_data_size = 0; - for (i = 0; i < count; i++) { - /* Make sure the fixed-size part of this attr_entry_t fits. */ - if ((u_int8_t *) &ae[1] > buf_end) { - return EINVAL; - } - - /* Make sure the variable-length name fits */ - if (&ae->name[ae->namelen] > buf_end) { - return EINVAL; - } - - /* Make sure that namelen is matching name's real length, namelen included NUL */ - if (strnlen((const char *)ae->name, ae->namelen) != ae->namelen - 1) { - return EINVAL; - } - - /* Swap the attribute entry fields */ - ae->offset = SWAP32(ae->offset); - ae->length = SWAP32(ae->length); - ae->flags = SWAP16(ae->flags); - - /* Make sure the attribute content fits and points to the data part */ - end = ae->offset + ae->length; - if (end < ae->offset || end > ah->total_size) { - return EINVAL; - } - - /* Make sure entry points to data section and not header */ - if (ae->offset < ah->data_start || end > ah->data_start + ah->data_length) { - return EINVAL; - } - - /* We verified namelen is ok above, so add this entry's size to a total */ - if (os_add_overflow(total_header_size, ATTR_ENTRY_LENGTH(ae->namelen), &total_header_size)) { - return EINVAL; - } - - /* We verified that entry's length is within data section, so add it to running size total */ - if (os_add_overflow(total_data_size, ae->length, &total_data_size)) { - return EINVAL; - } - - ae = ATTR_NEXT(ae); - } - - - /* make sure data_start is actually after all the xattr key entries */ - if (ah->data_start < total_header_size) { - return EINVAL; - } - - /* make sure all entries' data length add to header's idea of data length */ - if (total_data_size != ah->data_length) { - return EINVAL; - } - - return 0; -} - -// -// "start" & "end" are byte offsets in the file. -// "to" is the byte offset we want to move the -// data to. "to" should be > "start". -// -// we do the copy backwards to avoid problems if -// there's an overlap. 
-// -static int -shift_data_down(vnode_t xvp, off_t start, size_t len, off_t delta, vfs_context_t context) -{ - int ret, iolen; - size_t chunk, orig_chunk; - char *buff; - off_t pos; - kauth_cred_t ucred = vfs_context_ucred(context); - proc_t p = vfs_context_proc(context); - - if (delta == 0 || len == 0) { - return 0; - } - - chunk = 4096; - if (len < chunk) { - chunk = len; - } - orig_chunk = chunk; - - buff = kalloc_data(chunk, Z_WAITOK); - if (buff == NULL) { - return ENOMEM; - } - - for (pos = start + len - chunk; pos >= start; pos -= chunk) { - ret = vn_rdwr(UIO_READ, xvp, buff, (int)chunk, pos, UIO_SYSSPACE, IO_NODELOCKED | IO_NOAUTH, ucred, &iolen, p); - if (iolen != 0) { - printf("xattr:shift_data: error reading data @ %lld (read %d of %lu) (%d)\n", - pos, ret, chunk, ret); - break; - } - - ret = vn_rdwr(UIO_WRITE, xvp, buff, (int)chunk, pos + delta, UIO_SYSSPACE, IO_NODELOCKED | IO_NOAUTH, ucred, &iolen, p); - if (iolen != 0) { - printf("xattr:shift_data: error writing data @ %lld (wrote %d of %lu) (%d)\n", - pos + delta, ret, chunk, ret); - break; - } - - if ((pos - (off_t)chunk) < start) { - chunk = pos - start; - - if (chunk == 0) { // we're all done - break; - } - } - } - - kfree_data(buff, orig_chunk); - return 0; -} - - -static int -shift_data_up(vnode_t xvp, off_t start, size_t len, off_t delta, vfs_context_t context) -{ - int ret, iolen; - size_t chunk, orig_chunk; - char *buff; - off_t pos; - off_t end; - kauth_cred_t ucred = vfs_context_ucred(context); - proc_t p = vfs_context_proc(context); - - if (delta == 0 || len == 0) { - return 0; - } - - chunk = 4096; - if (len < chunk) { - chunk = len; - } - orig_chunk = chunk; - end = start + len; - - buff = kalloc_data(chunk, Z_WAITOK); - if (buff == NULL) { - return ENOMEM; - } - - for (pos = start; pos < end; pos += chunk) { - ret = vn_rdwr(UIO_READ, xvp, buff, (int)chunk, pos, UIO_SYSSPACE, IO_NODELOCKED | IO_NOAUTH, ucred, &iolen, p); - if (iolen != 0) { - printf("xattr:shift_data: error reading data @ %lld (read %d of %lu) (%d)\n", - pos, ret, chunk, ret); - break; - } - - ret = vn_rdwr(UIO_WRITE, xvp, buff, (int)chunk, pos - delta, UIO_SYSSPACE, IO_NODELOCKED | IO_NOAUTH, ucred, &iolen, p); - if (iolen != 0) { - printf("xattr:shift_data: error writing data @ %lld (wrote %d of %lu) (%d)\n", - pos + delta, ret, chunk, ret); - break; - } - - if ((pos + (off_t)chunk) > end) { - chunk = end - pos; - - if (chunk == 0) { // we're all done - break; - } - } - } - - kfree_data(buff, orig_chunk); - return 0; -} - static int make_xattrfile_port(struct fileglob *fg, ipc_port_t *portp) { diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index dc64da15b..d36964982 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -105,10 +105,11 @@ #include #include -#if DEVELOPMENT || DEBUG #include /* for c_segment_info */ #include /* for vm_compressor_serialize_segment_debug_info() */ -#endif +#include /* for vm_chead_select_t */ +#include +#include #include #include @@ -136,6 +137,18 @@ SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, #if DEVELOPMENT || DEBUG +extern int vm_object_cache_evict_all(void); +static int +sysctl_vm_object_cache_evict SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, req) + (void) vm_object_cache_evict_all(); + return 0; +} + +SYSCTL_PROC(_vm, OID_AUTO, object_cache_evict, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, + 0, 0, &sysctl_vm_object_cache_evict, "I", ""); + static int sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS { @@ -311,7 +324,51 @@ extern int 
apple_protect_pager_data_request_debug; SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, ""); extern unsigned int vm_object_copy_delayed_paging_wait_disable; -EXPERIMENT_FACTOR_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, ""); +EXPERIMENT_FACTOR_LEGACY_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, ""); + +__enum_closed_decl(vm_submap_test_op, uint32_t, { + vsto_make_submap = 1, /* make submap from entries in current_map() + * at start..end, offset ignored */ + vsto_remap_submap = 2, /* map in current_map() at start..end, + * from parent address submap_base_address + * and submap address offset */ + vsto_end +}); + +static int +sysctl_vm_submap_test_ctl SYSCTL_HANDLER_ARGS +{ + int error; + struct { + vm_submap_test_op op; + mach_vm_address_t submap_base_address; + mach_vm_address_t start; + mach_vm_address_t end; + mach_vm_address_t offset; + } args; + if (req->newlen != sizeof(args)) { + return EINVAL; + } + error = SYSCTL_IN(req, &args, sizeof(args)); + if (error) { + return error; + } + + switch (args.op) { + case vsto_make_submap: + vm_map_testing_make_sealed_submap(current_map(), args.start, args.end); + break; + case vsto_remap_submap: + vm_map_testing_remap_submap(current_map(), + args.submap_base_address, args.start, args.end, args.offset); + break; + default: + return EINVAL; + } + + return 0; +} +SYSCTL_PROC(_vm, OID_AUTO, submap_test_ctl, CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, &sysctl_vm_submap_test_ctl, "-", ""); #if __arm64__ /* These are meant to support the page table accounting unit test. */ @@ -396,6 +453,20 @@ SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor #endif /* DEVELOPMENT || DEBUG */ #endif /* PAGE_SLEEP_WITH_INHERITOR */ +#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 +extern uint32_t vm_cheads; +extern vm_chead_select_t vm_chead_select; +extern boolean_t vm_chead_rehint; +#if DEVELOPMENT || DEBUG +SYSCTL_UINT(_vm, OID_AUTO, compressor_heads, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cheads, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, compressor_head_select, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_select, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, compressor_head_rehint, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_rehint, 0, ""); +#endif /* DEVELOPMENT || DEBUG */ +EXPERIMENT_FACTOR_UINT(compressor_heads, &vm_cheads, 1, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT, ""); +EXPERIMENT_FACTOR_UINT(compressor_head_select, &vm_chead_select, CSEL_MIN, CSEL_MAX, ""); +EXPERIMENT_FACTOR_INT(compressor_head_rehint, &vm_chead_rehint, 0, 1, ""); +#endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */ + /* * Sysctl's related to data/stack execution. See osfmk/vm/vm_map.c */ @@ -905,7 +976,8 @@ SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED * dyld will then check what's mapped at that address. * * If the shared region is empty, dyld will then attempt to map the shared - * cache file in the shared region via the shared_region_map_np() system call. + * cache file in the shared region via the shared_region_map_and_slide_2_np() + * system call. * * If something's already mapped in the shared region, dyld will check if it * matches the shared cache it would like to use for that process. 
@@ -931,7 +1003,7 @@ shared_region_check_np( vm_shared_region_t shared_region; mach_vm_offset_t start_address = 0; int error = 0; - kern_return_t kr; + kern_return_t kr = KERN_FAILURE; task_t task = current_task(); SHARED_REGION_TRACE_DEBUG( @@ -949,7 +1021,7 @@ shared_region_check_np( return 0; } - /* retrieve the current tasks's shared region */ + /* retrieve the current task's shared region */ shared_region = vm_shared_region_get(task); if (shared_region != NULL) { /* @@ -962,47 +1034,59 @@ shared_region_check_np( vm_shared_region_set(task, NULL); } else { /* retrieve address of its first mapping... */ - kr = vm_shared_region_start_address(shared_region, &start_address, task); + kr = vm_shared_region_start_address(shared_region, &start_address); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] " "check_np(0x%llx) " - "vm_shared_region_start_address() failed\n", + "vm_shared_region_start_address() returned 0x%x\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + proc_getpid(p), p->p_comm, + (uint64_t)uap->start_address, kr)); + error = ENOMEM; + } + if (error == 0) { + /* Insert the shared region submap and various bits of debug info into the task. */ + kr = vm_shared_region_update_task(task, shared_region, start_address); + if (kr != KERN_SUCCESS) { + SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] " + "check_np(0x%llx) " + "vm_shared_update_task() returned 0x%x\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + proc_getpid(p), p->p_comm, + (uint64_t)uap->start_address, kr)); + + error = ENOMEM; + } + } +#if __has_feature(ptrauth_calls) + /* + * Remap any section of the shared library that + * has authenticated pointers into private memory. + */ + if ((error == 0) && (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS)) { + SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] " + "check_np(0x%llx) " + "vm_shared_region_auth_remap() failed\n", (void *)VM_KERNEL_ADDRPERM(current_thread()), proc_getpid(p), p->p_comm, (uint64_t)uap->start_address)); error = ENOMEM; - } else { -#if __has_feature(ptrauth_calls) - /* - * Remap any section of the shared library that - * has authenticated pointers into private memory. - */ - if (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS) { - SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] " - "check_np(0x%llx) " - "vm_shared_region_auth_remap() failed\n", - (void *)VM_KERNEL_ADDRPERM(current_thread()), - proc_getpid(p), p->p_comm, - (uint64_t)uap->start_address)); - error = ENOMEM; - } + } #endif /* __has_feature(ptrauth_calls) */ - - /* ... 
and give it to the caller */ - if (error == 0) { - error = copyout(&start_address, - (user_addr_t) uap->start_address, - sizeof(start_address)); - if (error != 0) { - SHARED_REGION_TRACE_ERROR( - ("shared_region: %p [%d(%s)] " - "check_np(0x%llx) " - "copyout(0x%llx) error %d\n", - (void *)VM_KERNEL_ADDRPERM(current_thread()), - proc_getpid(p), p->p_comm, - (uint64_t)uap->start_address, (uint64_t)start_address, - error)); - } + /* Give the start address to the caller */ + if (error == 0) { + error = copyout(&start_address, + (user_addr_t) uap->start_address, + sizeof(start_address)); + if (error != 0) { + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] " + "check_np(0x%llx) " + "copyout(0x%llx) error %d\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + proc_getpid(p), p->p_comm, + (uint64_t)uap->start_address, (uint64_t)start_address, + error)); } } } @@ -1088,7 +1172,7 @@ shared_region_map_and_slide_setup( boolean_t is_driverkit = task_is_driver(current_task()); SHARED_REGION_TRACE_DEBUG( - ("shared_region: %p [%d(%s)] -> map\n", + ("shared_region: %p [%d(%s)] -> map_and_slide_setup\n", (void *)VM_KERNEL_ADDRPERM(current_thread()), proc_getpid(p), p->p_comm)); @@ -1121,7 +1205,7 @@ shared_region_map_and_slide_setup( } /* get the process's shared region (setup in vm_map_exec()) */ - shared_region = vm_shared_region_trim_and_get(current_task()); + shared_region = vm_shared_region_get(current_task()); *shared_region_ptr = shared_region; if (shared_region == NULL) { SHARED_REGION_TRACE_ERROR( @@ -1527,6 +1611,10 @@ done: *sr_file_mappings = NULL; *shared_region_ptr = NULL; } + SHARED_REGION_TRACE_DEBUG( + ("shared_region: %p [%d(%s)] map_and_slide_setup <- %d\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + proc_getpid(p), p->p_comm, error)); return error; } @@ -1859,6 +1947,12 @@ shared_region_map_and_slide_2_np( files_count = uap->files_count; mappings_count = uap->mappings_count; + SHARED_REGION_TRACE_DEBUG( + ("shared_region: %p [%d(%s)] -> map_and_slide(0x%llx)\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + proc_getpid(p), p->p_comm, + (uint64_t)uap->mappings_u)); + if (files_count == 0) { SHARED_REGION_TRACE_INFO( ("shared_region: %p [%d(%s)] map(): " @@ -1915,6 +2009,10 @@ shared_region_map_and_slide_2_np( */ kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files); if (kr != KERN_SUCCESS) { + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] copyin() returned 0x%x\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + proc_getpid(p), p->p_comm, kr)); goto done; } @@ -1924,6 +2022,10 @@ shared_region_map_and_slide_2_np( mappings_count, mappings); if (__improbable(kr != KERN_SUCCESS)) { + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] sanitize() returned 0x%x\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + proc_getpid(p), p->p_comm, kr)); kr = vm_sanitize_get_kr(kr); goto done; } @@ -1994,6 +2096,13 @@ shared_region_map_and_slide_2_np( done: kfree_data(shared_files, files_count * sizeof(shared_files[0])); kfree_data(mappings, mappings_count * sizeof(mappings[0])); + + SHARED_REGION_TRACE_DEBUG( + ("shared_region: %p [%d(%s)] map_and_slide(0x%llx) <- 0x%x\n", + (void *)VM_KERNEL_ADDRPERM(current_thread()), + proc_getpid(p), p->p_comm, + (uint64_t)uap->mappings_u, kr)); + return kr; } @@ -2365,6 +2474,19 @@ SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED, SCALABLE_COUNTER_DECLARE(vm_page_grab_count); SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages 
grabbed"); +SCALABLE_COUNTER_DECLARE(vm_page_grab_count_kern); +SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_kern, vm_page_grab_count_kern, "Total pages grabbed (kernel)"); +SCALABLE_COUNTER_DECLARE(vm_page_grab_count_iopl); +SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_iopl, vm_page_grab_count_iopl, "Total pages grabbed (iopl)"); +SCALABLE_COUNTER_DECLARE(vm_page_grab_count_upl); +SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_upl, vm_page_grab_count_upl, "Total pages grabbed (upl)"); + + +#if DEVELOPMENT || DEBUG +SCALABLE_COUNTER_DECLARE(vm_page_deactivate_behind_count); +SYSCTL_SCALABLE_COUNTER(_vm, pages_deactivated_behind, vm_page_deactivate_behind_count, + "Number of pages deactivated behind"); +#endif #if DEVELOPMENT || DEBUG #if __ARM_MIXED_PAGE_SIZE__ @@ -2473,8 +2595,11 @@ SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, /* counts of pages prefaulted when entering a memory object */ extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout; +extern int64_t vm_prefault_nb_no_page, vm_prefault_nb_wrong_page; SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, ""); SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, ""); +SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_no_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_no_page, ""); +SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_wrong_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_wrong_page, ""); #if defined (__x86_64__) extern unsigned int vm_clump_promote_threshold; @@ -2657,13 +2782,13 @@ SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all, extern uint32_t vm_reclaim_buffer_count; extern uint64_t vm_reclaim_gc_epoch; extern uint64_t vm_reclaim_gc_reclaim_count; +extern uint64_t vm_reclaim_sampling_period_abs; +extern uint64_t vm_reclaim_sampling_period_ns; +extern bool vm_reclaim_debug; #if XNU_TARGET_OS_IOS extern uint64_t vm_reclaim_max_threshold; #else /* !XNU_TARGET_OS_IOS */ -extern bool vm_reclaim_debug; extern bool vm_reclaim_enabled; -extern uint64_t vm_reclaim_sampling_period_ns; -extern uint64_t vm_reclaim_sampling_period_abs; extern uint32_t vm_reclaim_autotrim_pct_normal; extern uint32_t vm_reclaim_autotrim_pct_pressure; extern uint32_t vm_reclaim_autotrim_pct_critical; @@ -2682,6 +2807,9 @@ SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch, SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count, "Number of times the global GC thread has reclaimed from a buffer"); +SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug, + CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0, + "Debug logs for vm.reclaim"); #if XNU_TARGET_OS_IOS SYSCTL_QUAD(_vm_reclaim, OID_AUTO, max_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_max_threshold, @@ -2691,9 +2819,6 @@ SYSCTL_QUAD(_vm_reclaim, OID_AUTO, max_threshold, SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0, "Whether deferred memory reclamation is enabled on this system"); -SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug, - CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0, - "Whether vm.reclaim debug logs are enabled"); SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0, "Percentage of a task's lifetime max phys_footprint that must be reclaimable " @@ -2719,6 +2844,7 @@ SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold, 
"The number of sampling periods between accounting updates that may elapse " "before the buffer is considered \"abandoned\""); +#endif /* XNU_TARGET_OS_IOS */ static int sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS @@ -2738,10 +2864,9 @@ sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS } SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns, - CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "I", + CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "QU", "Interval (nanoseconds) at which to sample the minimum buffer size and " "consider trimming excess"); -#endif /* XNU_TARGET_OS_IOS */ #endif /* DEVELOPMENT || DEBUG */ #endif /* CONFIG_DEFERRED_RECLAIM */ @@ -3094,8 +3219,14 @@ extern int vm_protect_privileged_from_untrusted; SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, ""); extern uint64_t vm_copied_on_read; +extern uint64_t vm_copied_on_read_kernel_map; +extern uint64_t vm_copied_on_read_platform_map; SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, ""); +SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_kernel_map, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_kernel_map, ""); +SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_platform_map, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_platform_map, ""); extern int vm_shared_region_count; extern int vm_shared_region_peak; @@ -3208,6 +3339,10 @@ SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED, SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_shadow_skipped, ""); +extern uint64_t vm_object_upl_throttle_cnt; +SYSCTL_QUAD(_vm, OID_AUTO, object_upl_throttle_cnt, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_object_upl_throttle_cnt, + "The number of times in which a UPL write was throttled due to pageout starvation"); SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED, @@ -3381,6 +3516,15 @@ SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_A extern uint64_t cluster_direct_write_wired; SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, ""); +extern uint64_t vm_object_pageout_not_on_queue; +extern uint64_t vm_object_pageout_not_pageable; +extern uint64_t vm_object_pageout_pageable; +extern uint64_t vm_object_pageout_active_local; +SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_on_queue, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_on_queue, ""); +SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_pageable, ""); +SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_pageable, ""); +SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_active_local, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_active_local, ""); + #if DEVELOPMENT || DEBUG @@ -3598,7 +3742,58 @@ out: } SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", ""); +static int +systctl_vm_reset_tag SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error; + int tag; + kern_return_t kr; + /* Need to be root */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EPERM; + } + error = SYSCTL_IN(req, &tag, sizeof(tag)); + if (error) { + return error; + } + + if (tag > VM_MAX_TAG_VALUE) { + return EINVAL; + } + + kr = 
vm_tag_reset_peak((vm_tag_t)tag); + + return mach_to_bsd_errno(kr); +} + +SYSCTL_PROC(_vm, OID_AUTO, reset_tag, + CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, &systctl_vm_reset_tag, "I", ""); + +static int +systctl_vm_reset_all_tags SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + /* Only reset the values if the sysctl is a write */ + if (!req->newptr) { + return EINVAL; + } + + /* Need to be root */ + if (!kauth_cred_issuser(kauth_cred_get())) { + return EPERM; + } + + vm_tag_reset_all_peaks(); + + return 0; +} + +SYSCTL_PROC(_vm, OID_AUTO, reset_all_tags, + CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED, + 0, 0, &systctl_vm_reset_all_tags, "I", ""); #endif /* DEVELOPMENT || DEBUG */ diff --git a/config/BSDKernel.arm.exports b/config/BSDKernel.arm.exports index 899009b21..3e226a6d6 100644 --- a/config/BSDKernel.arm.exports +++ b/config/BSDKernel.arm.exports @@ -1,5 +1,6 @@ _file_vnode _mbuf_data +_mbuf_data_len _mbuf_len _mbuf_next _mbuf_nextpkt diff --git a/config/BSDKernel.arm64.exports b/config/BSDKernel.arm64.exports index 899009b21..3e226a6d6 100644 --- a/config/BSDKernel.arm64.exports +++ b/config/BSDKernel.arm64.exports @@ -1,5 +1,6 @@ _file_vnode _mbuf_data +_mbuf_data_len _mbuf_len _mbuf_next _mbuf_nextpkt diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index d41d29fff..6d16d3391 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -444,6 +444,8 @@ _msleep0 _nanotime _nanouptime _nd6_lookup_ipv6 +_net_aop_register_provider +_net_aop_deregister_provider _net_init_add _nop_access _nop_advlock diff --git a/config/BSDKernel.x86_64.exports b/config/BSDKernel.x86_64.exports index 0d534ae6d..081bfc9a7 100644 --- a/config/BSDKernel.x86_64.exports +++ b/config/BSDKernel.x86_64.exports @@ -1,5 +1,6 @@ _in6_cksum:_inet6_cksum _mbuf_data +_mbuf_data_len _mbuf_inet6_cksum _mbuf_len _mbuf_next diff --git a/config/IOKit.arm.exports b/config/IOKit.arm.exports index 99a73b9dd..6456fe613 100644 --- a/config/IOKit.arm.exports +++ b/config/IOKit.arm.exports @@ -118,6 +118,7 @@ __ZN18IOMemoryDescriptor12setOwnershipEP4taskim __ZN18IOMemoryDescriptor12setPurgeableEmPm __ZN18IOMemoryDescriptor12withSubRangeEPS_mm11IODirection __ZN18IOMemoryDescriptor13getPageCountsEPmS0_ +__ZN18IOMemoryDescriptor13getPageCountsEPmS0_S0_ __ZN18IOMemoryDescriptor14initWithRangesEP14IOVirtualRangem11IODirectionP4taskb __ZN18IOMemoryDescriptor15initWithAddressEPvm11IODirection __ZN18IOMemoryDescriptor15initWithAddressEjm11IODirectionP4task diff --git a/config/IOKit.arm64.exports b/config/IOKit.arm64.exports index cf62add29..2caa4c2f0 100644 --- a/config/IOKit.arm64.exports +++ b/config/IOKit.arm64.exports @@ -110,6 +110,7 @@ __ZN18IOMemoryDescriptor11withOptionsEPvjjP4taskjP8IOMapper __ZN18IOMemoryDescriptor12setOwnershipEP4taskij __ZN18IOMemoryDescriptor12setPurgeableEjPj __ZN18IOMemoryDescriptor13getPageCountsEPyS0_ +__ZN18IOMemoryDescriptor13getPageCountsEPyS0_S0_ __ZN18IOMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper __ZN18IOMemoryDescriptor16performOperationEjyy __ZN18IOMemoryDescriptor16withAddressRangeEyyjP4task diff --git a/config/IOKit.exports b/config/IOKit.exports index f70f20f11..03822d86c 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -4,6 +4,23 @@ __ZN12IODMACommand8DispatchE5IORPC __ZN9IOService16StringFromReturnEiPP8OSStringPFiP15OSMetaClassBase5IORPCE + +_IOCircularDataQueueCopyCurrent +_IOCircularDataQueueCopyLatest +_IOCircularDataQueueCopyMemoryDescriptor +_IOCircularDataQueueCopyNext 
+_IOCircularDataQueueCopyPrevious +_IOCircularDataQueueCreateWithEntries +_IOCircularDataQueueDestroy +_IOCircularDataQueueEnqueue +_IOCircularDataQueueGetCurrent +_IOCircularDataQueueGetLatest +_IOCircularDataQueueGetNext +_IOCircularDataQueueGetPrevious +_IOCircularDataQueueIsCurrentDataValid +_IOCircularDataQueueSetCursorLatest + + _IOAlignmentToSize _IOBSDNameMatching _IOBSDRegistryEntryForDeviceTree @@ -117,6 +134,7 @@ _PE_cpu_signal _PE_cpu_start:_PE_cpu_start_from_kext _PE_enter_debugger _PE_halt_restart +_PE_boot_args _PE_parse_boot_argn _PE_parse_boot_arg_str @@ -562,6 +580,8 @@ __ZN14IOPMrootDomain5startEP9IOService __ZN14IOPMrootDomain9MetaClassC1Ev __ZN14IOPMrootDomain9MetaClassC2Ev __ZN14IOPMrootDomain9constructEv +__ZN14IOPMrootDomain9isAOTModeEv +__ZN14IOPMrootDomain9isLPWModeEv __ZN14IOPMrootDomain9metaClassE __ZN14IOPMrootDomainC1EPK11OSMetaClass __ZN14IOPMrootDomainC1Ev @@ -1528,6 +1548,7 @@ __ZN9IOService20getDeviceMemoryCountEv __ZN9IOService20powerOverrideOffPrivEv __ZN9IOService20unlockForArbitrationEv __ZN9IOService20ClientCrashed_InvokeE5IORPCP15OSMetaClassBasePFiS2_PS_yE +__ZN9IOService20getDesiredPowerStateEv __ZN9IOService21CopyProperties_InvokeE5IORPCP15OSMetaClassBasePFiS2_PP12OSDictionaryE __ZN9IOService21SearchProperty_InvokeE5IORPCP15OSMetaClassBasePFiS2_PKcS4_yPP8OSObjectE __ZN9IOService21getClientWithCategoryEPK8OSSymbol @@ -1999,3 +2020,7 @@ __ZN18IOMemoryDescriptor16getMapperOptionsEv __ZN18IOMemoryDescriptor16setMapperOptionsEt __ZN14IOPMrootDomain20copyWakeReasonStringEPcm + +__ZN23IOMultiMemoryDescriptor12setPurgeableEjPj +__ZN23IOMultiMemoryDescriptor18getPhysicalSegmentEyPyj +__ZN23IOMultiMemoryDescriptor5doMapEP7_vm_mapPyjyy diff --git a/config/IOKit.x86_64.exports b/config/IOKit.x86_64.exports index c7a42640d..ddc371548 100644 --- a/config/IOKit.x86_64.exports +++ b/config/IOKit.x86_64.exports @@ -113,6 +113,7 @@ __ZN18IOMemoryDescriptor11withOptionsEPvjjP4taskjP8IOMapper __ZN18IOMemoryDescriptor12setOwnershipEP4taskij __ZN18IOMemoryDescriptor12setPurgeableEjPj __ZN18IOMemoryDescriptor13getPageCountsEPyS0_ +__ZN18IOMemoryDescriptor13getPageCountsEPyS0_S0_ __ZN18IOMemoryDescriptor15initWithOptionsEPvjjP4taskjP8IOMapper __ZN18IOMemoryDescriptor16performOperationEjyy __ZN18IOMemoryDescriptor16withAddressRangeEyyjP4task diff --git a/config/Kasan_enabled.arm.exports b/config/Kasan_enabled.arm.exports index 39a1160e7..260afdf93 100644 --- a/config/Kasan_enabled.arm.exports +++ b/config/Kasan_enabled.arm.exports @@ -117,6 +117,14 @@ ___asan_strlcat ___asan_strncat ___asan_strlen ___asan_strnlen +___asan_strcmp +___asan_strncmp +___asan_strlcmp +___asan_strbufcmp +___asan_strcasecmp +___asan_strncasecmp +___asan_strlcasecmp +___asan_strbufcasecmp ___ubsan_handle_add_overflow ___ubsan_handle_add_overflow_abort ___ubsan_handle_builtin_unreachable diff --git a/config/Kasan_enabled.arm64.exports b/config/Kasan_enabled.arm64.exports index 2b49581dd..f9c51871f 100644 --- a/config/Kasan_enabled.arm64.exports +++ b/config/Kasan_enabled.arm64.exports @@ -87,6 +87,14 @@ ___asan_strlcat ___asan_strncat ___asan_strlen ___asan_strnlen +___asan_strcmp +___asan_strncmp +___asan_strlcmp +___asan_strbufcmp +___asan_strcasecmp +___asan_strncasecmp +___asan_strlcasecmp +___asan_strbufcasecmp ___ubsan_handle_add_overflow ___ubsan_handle_add_overflow_abort ___ubsan_handle_builtin_unreachable diff --git a/config/Kasan_enabled.x86_64.exports b/config/Kasan_enabled.x86_64.exports index 9bee6a479..3e3a7c617 100644 --- a/config/Kasan_enabled.x86_64.exports +++ 
b/config/Kasan_enabled.x86_64.exports @@ -117,6 +117,14 @@ ___asan_strlcat ___asan_strncat ___asan_strlen ___asan_strnlen +___asan_strcmp +___asan_strncmp +___asan_strlcmp +___asan_strbufcmp +___asan_strcasecmp +___asan_strncasecmp +___asan_strlcasecmp +___asan_strbufcasecmp ___ubsan_handle_add_overflow ___ubsan_handle_add_overflow_abort ___ubsan_handle_builtin_unreachable diff --git a/config/Kcov_enabled.exports b/config/Kcov_enabled.exports index b0b8f093c..aa3b0e895 100644 --- a/config/Kcov_enabled.exports +++ b/config/Kcov_enabled.exports @@ -3,3 +3,12 @@ ___sanitizer_cov_trace_pc_guard ___sanitizer_cov_trace_pc_guard_init ___sanitizer_cov_trace_pc_indirect ___sanitizer_cov_pcs_init +___sanitizer_cov_trace_cmp1 +___sanitizer_cov_trace_cmp2 +___sanitizer_cov_trace_cmp4 +___sanitizer_cov_trace_cmp8 +___sanitizer_cov_trace_const_cmp1 +___sanitizer_cov_trace_const_cmp2 +___sanitizer_cov_trace_const_cmp4 +___sanitizer_cov_trace_const_cmp8 +___sanitizer_cov_trace_switch \ No newline at end of file diff --git a/config/Libkern.exports b/config/Libkern.exports index 24b4f2039..366d2e498 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -683,6 +683,7 @@ __os_log_internal _adler32 _atoi _amfi_interface_register +_amfi_core_entitlements_register _bcmp _bcopy _bcopy_phys diff --git a/config/MASTER b/config/MASTER index e9170fde8..2ffae3ab7 100644 --- a/config/MASTER +++ b/config/MASTER @@ -56,7 +56,6 @@ options INET # # options MACH # Standard Mach features # -options MACH_FLIPC # Fast-Local IPC # options LOOP # loopback support # options VLAN # # options BOND # # @@ -86,7 +85,6 @@ options REMOTE_VIF # # options SKYWALK # # options CONFIG_NEXUS_USER_PIPE # # options CONFIG_NEXUS_KERNEL_PIPE # # -options CONFIG_NEXUS_MONITOR # # options CONFIG_NEXUS_FLOWSWITCH # # options CONFIG_NEXUS_NETIF # # options CONFIG_MBUF_MCACHE # mbufs use mcache # @@ -105,7 +103,6 @@ options SENDFILE # sendfile # options NETWORKING # networking layer # options CONFIG_FSE # file system events # options CONFIG_IMAGEBOOT # local image boot # -options CONFIG_MBUF_JUMBO # jumbo cluster pool # options CONFIG_IMAGEBOOT_IMG4 # authenticate image with AppleImage4 # options CONFIG_IMAGEBOOT_CHUNKLIST # authenticate image with a chunk list # @@ -449,10 +446,15 @@ options CONFIG_PROC_UUID_POLICY # options CONFIG_ECC_LOGGING # # -# Application core dumps +# Application core dumps dumped synchronously by xnu # options CONFIG_COREDUMP # +# +# Application core dumps dumped in userland via corpses +# +options CONFIG_UCOREDUMP # + # # Vnode guards # @@ -644,6 +646,9 @@ options DEVELOPMENT # dev kernel # # DEBUG kernel options DEBUG # general debugging code # +# RELEASE define for release builds +options RELEASE # + options MACH_BSD # BSD subsystem on top of Mach # options IOKIT # # @@ -677,7 +682,6 @@ options CONFIG_SCHED_RT_ALLOW # options CONFIG_SCHED_IDLE_IN_PLACE # options CONFIG_SCHED_SFI # -options CONFIG_PROB_GZALLOC # options CONFIG_SCHED_DEFERRED_AST # options CONFIG_PREADOPT_TG # @@ -790,6 +794,3 @@ options CONFIG_EXT_PANICLOG # # support for collecting statistics on task suspension options CONFIG_TASK_SUSPEND_STATS # - -# Support for non-fatal Branch Target Identification exception reporting and logging -options CONFIG_BTI_TELEMETRY # diff --git a/config/MASTER.arm b/config/MASTER.arm index 11ef761de..4586f44fb 100644 --- a/config/MASTER.arm +++ b/config/MASTER.arm @@ -17,7 +17,7 @@ # -------- ----- -- --------------- # # KERNEL_BASE = [ arm xsmall msgb_small config_embedded config_enforce_signed_code 
config_code_signature_reconstitution config_darkboot ARM_EXTRAS_BASE ] -# KERNEL_RELEASE = [ KERNEL_BASE ] +# KERNEL_RELEASE = [ KERNEL_BASE release ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug config_ext_paniclog ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_waitq_stats config_workloop_debug config_ext_paniclog ] # BSD_BASE = [ mach_bsd psynch config_proc_uuid_policy config_imageboot config_imageboot_img4 ] @@ -28,8 +28,7 @@ # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE config_union_mounts fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE config_union_mounts fdesc ] -# NFS_DEV = [ nfsserver ] -# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_monitor config_nexus_flowswitch config_nexus_netif ] +# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_flowswitch config_nexus_netif ] # SKYWALK_RELEASE = [ SKYWALK_BASE ] # SKYWALK_DEV = [ SKYWALK_BASE ] # SKYWALK_DEBUG = [ SKYWALK_BASE ] @@ -64,16 +63,16 @@ # SCHED_DEBUG = [ SCHED_BASE ] # VM_BASE = [ vm_pressure_events jetsam memorystatus config_code_decryption config_cs_validation_bitmap ] # VM_RELEASE = [ VM_BASE ] -# VM_DEV = [ VM_BASE dynamic_codesigning pgzalloc ] -# VM_DEBUG = [ VM_BASE dynamic_codesigning pgzalloc ] +# VM_DEV = [ VM_BASE dynamic_codesigning ] +# VM_DEBUG = [ VM_BASE dynamic_codesigning ] # SECURITY_BASE = [ config_macf config_secure_bsd_root ] # SECURITY_RELEASE = [ SECURITY_BASE ] # SECURITY_DEV = [ SECURITY_BASE config_setuid config_kas_info ] # SECURITY_DEBUG = [ SECURITY_BASE config_setuid config_kas_info ] # BASE = [ MULTIPATH VPN ] -# RELEASE = [ BASE KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY_RELEASE ] -# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] -# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] +# RELEASE = [ BASE KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY_RELEASE ] +# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] +# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] # ###################################################################### # diff --git a/config/MASTER.arm64 b/config/MASTER.arm64 index 15c35b8e6..9c8cf8ccd 100644 --- a/config/MASTER.arm64 +++ b/config/MASTER.arm64 @@ -17,19 +17,18 @@ # -------- ----- -- --------------- # # KERNEL_BASE = [ arm64 config_pmap_ppl xsmall msgb_small config_embedded config_enforce_signed_code config_code_signature_reconstitution config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ] -# KERNEL_RELEASE = [ KERNEL_BASE ] +# KERNEL_RELEASE = [ KERNEL_BASE release ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug config_proc_resource_limits config_ext_paniclog ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert 
config_xnupost config_waitq_stats config_workloop_debug config_proc_resource_limits config_ext_paniclog ] # BSD_BASE = [ mach_bsd psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] -# BSD_DEV = [ BSD_BASE config_netboot config_imgsrc_access config_coredump pgo config_vnguard ] -# BSD_DEBUG = [ BSD_BASE config_netboot config_imgsrc_access config_coredump pgo config_vnguard ] +# BSD_DEV = [ BSD_BASE config_netboot config_imgsrc_access config_coredump config_ucoredump pgo config_vnguard ] +# BSD_DEBUG = [ BSD_BASE config_netboot config_imgsrc_access config_coredump config_ucoredump pgo config_vnguard ] # FILESYS_BASE = [ devfs fifo fs_compression config_protect config_mnt_rootsnap config_triggers config_fse routefs namedstreams config_dataless_files bindfs] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE config_union_mounts fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE config_union_mounts fdesc ] -# NFS_DEV = [ nfsserver ] -# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_monitor config_nexus_flowswitch config_nexus_netif ] +# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_flowswitch config_nexus_netif ] # SKYWALK_RELEASE = [ SKYWALK_BASE ] # SKYWALK_DEV = [ SKYWALK_BASE ] # SKYWALK_DEBUG = [ SKYWALK_BASE ] @@ -69,18 +68,18 @@ # SCHED_DEBUG = [ SCHED_BASE config_sched_rt_allow ] # VM_BASE = [ vm_pressure_events jetsam memorystatus config_code_decryption phantom_cache config_secluded_memory config_cs_validation_bitmap config_deferred_reclaim config_map_ranges ] # VM_RELEASE = [ VM_BASE ] -# VM_DEV = [ VM_BASE dynamic_codesigning pgzalloc] -# VM_DEBUG = [ VM_BASE dynamic_codesigning pgzalloc] +# VM_DEV = [ VM_BASE dynamic_codesigning ] +# VM_DEBUG = [ VM_BASE dynamic_codesigning ] # VM_KASAN = [ VM_BASE dynamic_codesigning kernel_tagging kernel_tbi config_kasan config_ubsan config_kcov config_ksancov ] # SECURITY_BASE = [ config_macf kernel_integrity config_secure_bsd_root ] # SECURITY_RELEASE = [ SECURITY_BASE ] # SECURITY_DEV = [ SECURITY_BASE config_setuid config_kas_info ] # SECURITY_DEBUG = [ SECURITY_BASE config_setuid config_kas_info ] # BASE = [ MULTIPATH VPN ] -# RELEASE = [ BASE KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY_RELEASE ] -# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] -# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] -# KASAN = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_KASAN SECURITY_DEV ] +# RELEASE = [ BASE KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY_RELEASE ] +# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] +# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG 
PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] +# KASAN = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_KASAN SECURITY_DEV ] # ###################################################################### # diff --git a/config/MASTER.arm64.BridgeOS b/config/MASTER.arm64.BridgeOS index 0a09ff704..3f289b7d0 100644 --- a/config/MASTER.arm64.BridgeOS +++ b/config/MASTER.arm64.BridgeOS @@ -17,7 +17,7 @@ # -------- ----- -- --------------- # # KERNEL_BASE = [ arm64 config_pmap_ppl xsmall msgb_small config_embedded config_enforce_signed_code config_code_signature_reconstitution config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ] -# KERNEL_RELEASE = [ KERNEL_BASE ] +# KERNEL_RELEASE = [ KERNEL_BASE release ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_waitq_stats config_workloop_debug ] # BSD_BASE = [ mach_bsd psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ] @@ -28,7 +28,6 @@ # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE config_union_mounts fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE config_union_mounts fdesc ] -# NFS_DEV = [ nfsserver ] # NETWORKING = [ inet tcpdrop_synfin bpfilter if_bridge traffic_mgt dummynet ah_all_crypto if_fake if_redirect ] # NETWORKING_RELEASE = [ NETWORKING ] # NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless kctl_test ] @@ -60,18 +59,18 @@ # SCHED_DEBUG = [ SCHED_BASE ] # VM_BASE = [ vps_dynamic_prio vm_pressure_events jetsam memorystatus config_code_decryption phantom_cache config_secluded_memory config_cs_validation_bitmap config_deferred_reclaim config_map_ranges ] # VM_RELEASE = [ VM_BASE ] -# VM_DEV = [ VM_BASE dynamic_codesigning pgzalloc ] -# VM_DEBUG = [ VM_BASE dynamic_codesigning pgzalloc ] +# VM_DEV = [ VM_BASE dynamic_codesigning ] +# VM_DEBUG = [ VM_BASE dynamic_codesigning ] # VM_KASAN = [ VM_BASE dynamic_codesigning config_kasan config_ubsan config_kcov config_ksancov kernel_tagging kernel_tbi ] # SECURITY_BASE = [ config_macf kernel_integrity config_secure_bsd_root ] # SECURITY_RELEASE = [ SECURITY_BASE ] # SECURITY_DEV = [ SECURITY_BASE config_setuid config_kas_info ] # SECURITY_DEBUG = [ SECURITY_BASE config_setuid config_kas_info ] # BASE = [ MULTIPATH VPN ] -# RELEASE = [ BASE KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY_RELEASE ] -# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] -# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] -# KASAN = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_KASAN SECURITY_DEV ] +# RELEASE = [ BASE KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY_RELEASE ] +# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] +# DEBUG = [ 
BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] +# KASAN = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_KASAN SECURITY_DEV ] # ###################################################################### # diff --git a/config/MASTER.arm64.MacOSX b/config/MASTER.arm64.MacOSX index 2690d5afc..b6f861b14 100644 --- a/config/MASTER.arm64.MacOSX +++ b/config/MASTER.arm64.MacOSX @@ -20,10 +20,10 @@ # ARM_EXTRAS_BASE = [ config_pmap_ppl ] #endif # KERNEL_BASE = [ arm64 medium msgb_large config_requires_u32_munging config_delay_idle_sleep config_proc_udata_storage config_uexc config_darkboot ARM_EXTRAS_BASE ] -# KERNEL_RELEASE = [ KERNEL_BASE ] +# KERNEL_RELEASE = [ KERNEL_BASE release ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug config_proc_resource_limits config_ext_paniclog ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_waitq_stats config_workloop_debug config_proc_resource_limits config_ext_paniclog ] -# BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot psynch config_proc_uuid_policy config_coredump pgo config_personas ] +# BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot psynch config_proc_uuid_policy config_coredump config_ucoredump pgo config_personas ] # BSD_RELEASE = [ BSD_BASE ] # BSD_DEV = [ BSD_BASE config_vnguard ] # BSD_DEBUG = [ BSD_BASE config_vnguard ] @@ -32,11 +32,11 @@ # FILESYS_DEV = [ FILESYS_BASE config_iocount_trace ] # FILESYS_DEBUG = [ FILESYS_BASE config_iocount_trace ] # NFS = [ nfsserver ] -# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_monitor config_nexus_flowswitch config_nexus_netif ] +# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_flowswitch config_nexus_netif ] # SKYWALK_RELEASE = [ SKYWALK_BASE ] # SKYWALK_DEV = [ SKYWALK_BASE ] # SKYWALK_DEBUG = [ SKYWALK_BASE ] -# NETWORKING = [ inet bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk config_mbuf_jumbo if_bridge MULTIPATH if_fake if_redirect remote_vif ] +# NETWORKING = [ inet bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk if_bridge MULTIPATH if_fake if_redirect remote_vif ] # NETWORKING_RELEASE = [ NETWORKING ] # NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless kctl_test ] # NETWORKING_DEBUG = [ NETWORKING_DEV ] @@ -84,7 +84,7 @@ # VM_EXTRA_DEV = [ ] #else # VM_EXTRA = [ ] -# VM_EXTRA_DEV = [ pgzalloc ] +# VM_EXTRA_DEV = [ ] #endif /* MASTER_CONFIG_ENABLE_KERNEL_TAG && MASTER_CONFIG_ENABLE_SPTM */ # VM_BASE = [ vm_pressure_events memorystatus config_code_decryption encrypted_swap config_deferred_reclaim VM_EXTRA ] # VM_RELEASE = [ VM_BASE ] @@ -122,8 +122,6 @@ options CONFIG_KERNEL_INTEGRITY # options CONFIG_MACF_LAZY_VNODE_LABELS # Turn on labels, don't preallocate -options CONFIG_HYPERVISOR_PUBLIC # unrestricted entitlement for hypervisor - options CONFIG_RESLIDE_SHARED_CACHE # options CONFIG_KERNEL_TBI # options CONFIG_KERNEL_TAGGING # diff --git a/config/MASTER.arm64.WatchOS b/config/MASTER.arm64.WatchOS index 0033e1fa6..bd1cf6396 100644 --- a/config/MASTER.arm64.WatchOS +++ b/config/MASTER.arm64.WatchOS @@ -17,20 +17,19 @@ # -------- ----- -- --------------- # # ARM_EXTRAS_BASE = [ config_pmap_ppl ] -# 
KERNEL_BASE = [ arm64 config_pmap_ppl xsmall msgb_small config_embedded config_enforce_signed_code config_code_signature_reconstitution config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ] -# KERNEL_RELEASE = [ KERNEL_BASE ] +# KERNEL_BASE = [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_code_signature_reconstitution config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ] +# KERNEL_RELEASE = [ KERNEL_BASE release ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug config_proc_resource_limits config_ext_paniclog ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_waitq_stats config_workloop_debug config_proc_resource_limits config_ext_paniclog ] # BSD_BASE = [ mach_bsd psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] -# BSD_DEV = [ BSD_BASE config_netboot config_imgsrc_access config_coredump pgo config_vnguard ] -# BSD_DEBUG = [ BSD_BASE config_netboot config_imgsrc_access config_coredump pgo config_vnguard ] +# BSD_DEV = [ BSD_BASE config_netboot config_imgsrc_access config_coredump config_ucoredump pgo config_vnguard ] +# BSD_DEBUG = [ BSD_BASE config_netboot config_imgsrc_access config_coredump config_ucoredump pgo config_vnguard ] # FILESYS_BASE = [ devfs fifo fs_compression config_protect config_mnt_rootsnap config_triggers config_fse routefs namedstreams config_dataless_files bindfs] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE config_union_mounts fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE config_union_mounts fdesc ] -# NFS_DEV = [ nfsserver ] -# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_monitor config_nexus_flowswitch config_nexus_netif ] +# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_flowswitch config_nexus_netif ] # SKYWALK_RELEASE = [ SKYWALK_BASE ] # SKYWALK_DEV = [ SKYWALK_BASE ] # SKYWALK_DEBUG = [ SKYWALK_BASE ] @@ -73,7 +72,7 @@ # VM_EXTRA_DEV = [ ] #else # VM_EXTRA = [ ] -# VM_EXTRA_DEV = [ pgzalloc ] +# VM_EXTRA_DEV = [ ] #endif /* MASTER_CONFIG_ENABLE_KERNEL_TAG && MASTER_CONFIG_ENABLE_SPTM */ # VM_BASE = [ vm_pressure_events jetsam memorystatus config_code_decryption phantom_cache config_secluded_memory config_cs_validation_bitmap config_deferred_reclaim config_map_ranges freeze VM_EXTRA ] # VM_RELEASE = [ VM_BASE ] @@ -86,9 +85,10 @@ # SECURITY_DEBUG = [ SECURITY_BASE config_setuid config_kas_info ] # BASE = [ MULTIPATH VPN ] # RELEASE = [ BASE KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY_RELEASE ] -# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] -# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] -# KASAN = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_KASAN SECURITY_DEV ] +# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] +# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG 
FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] +# KASAN = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_KASAN SECURITY_DEV ] +# SPTM = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] # ###################################################################### # diff --git a/config/MASTER.arm64.iPhoneOS b/config/MASTER.arm64.iPhoneOS index 651699f0c..da53364da 100644 --- a/config/MASTER.arm64.iPhoneOS +++ b/config/MASTER.arm64.iPhoneOS @@ -18,22 +18,21 @@ # # ARM_EXTRAS_BASE = [ config_pmap_ppl ] # KERNEL_BASE = [ arm64 xsmall msgb_small config_embedded config_enforce_signed_code config_code_signature_reconstitution config_requires_u32_munging config_darkboot ARM_EXTRAS_BASE ] -# KERNEL_RELEASE = [ KERNEL_BASE ] +# KERNEL_RELEASE = [ KERNEL_BASE release ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug config_proc_resource_limits config_ext_paniclog ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_waitq_stats config_workloop_debug config_proc_resource_limits config_ext_paniclog ] # BSD_EXTRAS_BASE = [ ] -# BSD_EXTRAS_DEV = [ config_coredump ] -# BSD_EXTRAS_DEBUG = [ config_coredump ] +# BSD_EXTRAS_DEV = [ config_coredump config_ucoredump ] +# BSD_EXTRAS_DEBUG = [ config_coredump config_ucoredump ] # BSD_BASE = [ mach_bsd psynch config_proc_uuid_policy config_personas config_imageboot config_imageboot_img4 BSD_EXTRAS_BASE ] -# BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] +# BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel config_imgsrc_access ] # BSD_DEV = [ BSD_BASE config_netboot config_imgsrc_access pgo config_vnguard rosetta BSD_EXTRAS_DEV ] # BSD_DEBUG = [ BSD_BASE config_netboot config_imgsrc_access pgo config_vnguard rosetta BSD_EXTRAS_DEBUG ] # FILESYS_BASE = [ devfs fifo fs_compression config_protect config_mnt_rootsnap config_triggers config_fse routefs namedstreams config_dataless_files bindfs] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE fdesc config_union_mounts ] # FILESYS_DEBUG = [ FILESYS_BASE fdesc config_union_mounts ] -# NFS_DEV = [ nfsserver ] -# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_monitor config_nexus_flowswitch config_nexus_netif ] +# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_flowswitch config_nexus_netif ] # SKYWALK_RELEASE = [ SKYWALK_BASE ] # SKYWALK_DEV = [ SKYWALK_BASE ] # SKYWALK_DEBUG = [ SKYWALK_BASE ] @@ -87,7 +86,7 @@ # VM_EXTRA_DEV = [ ] #else # VM_EXTRA = [ ] -# VM_EXTRA_DEV = [ pgzalloc ] +# VM_EXTRA_DEV = [ ] #endif /* MASTER_CONFIG_ENABLE_KERNEL_TAG && MASTER_CONFIG_ENABLE_SPTM */ # VM_BASE = [ vps_dynamic_prio vm_pressure_events jetsam freeze memorystatus config_code_decryption phantom_cache config_secluded_memory config_cs_validation_bitmap config_deferred_reclaim config_map_ranges VM_EXTRA ] # VM_RELEASE = [ VM_BASE ] @@ -100,9 +99,10 @@ # SECURITY_DEBUG = [ SECURITY_BASE config_setuid config_kas_info ] # BASE = [ MULTIPATH VPN ] # RELEASE = [ BASE KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING_RELEASE PF_RELEASE IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY_RELEASE ] -# DEVELOPMENT = [ BASE 
KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] -# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] -# KASAN = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV NFS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_KASAN MACH_DEV SCHED_DEV VM_KASAN SECURITY_DEV ] +# DEVELOPMENT = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] +# DEBUG = [ BASE KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING_DEBUG PF_DEBUG IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY_DEBUG ] +# KASAN = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_KASAN MACH_DEV SCHED_DEV VM_KASAN SECURITY_DEV ] +# SPTM = [ BASE KERNEL_DEV BSD_DEV FILESYS_DEV SKYWALK_DEV NETWORKING_DEV PF_DEV IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY_DEV ] # ###################################################################### # diff --git a/config/MASTER.x86_64 b/config/MASTER.x86_64 index a0446076c..ca0de1613 100644 --- a/config/MASTER.x86_64 +++ b/config/MASTER.x86_64 @@ -17,10 +17,10 @@ # -------- ----- -- --------------- # # KERNEL_BASE = [ intel medium msgb_large config_requires_u32_munging config_delay_idle_sleep config_proc_udata_storage vsprintf ] -# KERNEL_RELEASE = [ KERNEL_BASE ] +# KERNEL_RELEASE = [ KERNEL_BASE release ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug config_proc_resource_limits ] # KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_waitq_stats config_workloop_debug config_proc_resource_limits ] -# BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot config_imageboot_chunklist psynch config_proc_uuid_policy config_coredump pgo config_personas ] +# BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_netboot config_imageboot config_imageboot_chunklist psynch config_proc_uuid_policy config_coredump config_ucoredump pgo config_personas ] # BSD_RELEASE = [ BSD_BASE ] # BSD_DEV = [ BSD_BASE config_vnguard ] # BSD_DEBUG = [ BSD_BASE config_vnguard ] @@ -29,11 +29,11 @@ # FILESYS_DEV = [ FILESYS_BASE config_iocount_trace ] # FILESYS_DEBUG = [ FILESYS_BASE config_iocount_trace ] # NFS = [ nfsserver ] -# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_monitor config_nexus_flowswitch config_nexus_netif ] +# SKYWALK_BASE = [ skywalk config_nexus_user_pipe config_nexus_kernel_pipe config_nexus_flowswitch config_nexus_netif ] # SKYWALK_RELEASE = [ SKYWALK_BASE ] # SKYWALK_DEV = [ SKYWALK_BASE ] # SKYWALK_DEBUG = [ SKYWALK_BASE ] -# NETWORKING = [ inet bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk config_mbuf_jumbo if_bridge MULTIPATH if_fake if_redirect remote_vif config_mbuf_mcache ] +# NETWORKING = [ inet bpfilter dummynet traffic_mgt sendfile ah_all_crypto bond vlan gif stf ifnet_input_chk if_bridge MULTIPATH if_fake if_redirect remote_vif ] # NETWORKING_RELEASE = [ NETWORKING ] # NETWORKING_DEV = [ NETWORKING_RELEASE packet_mangler if_headless kctl_test ] # NETWORKING_DEBUG = [ NETWORKING_DEV ] @@ -62,9 +62,9 @@ # SCHED_DEV = [ SCHED_BASE ] # SCHED_DEBUG = [ SCHED_BASE ] # VM_BASE = [ vm_pressure_events 
memorystatus dynamic_codesigning config_code_decryption encrypted_swap config_deferred_reclaim ] -# VM_DEBUG = [ VM_BASE pgzalloc ] -# VM_DEV = [ VM_BASE pgzalloc ] -# VM_RELEASE = [ VM_BASE pgzalloc ] +# VM_DEBUG = [ VM_BASE ] +# VM_DEV = [ VM_BASE ] +# VM_RELEASE = [ VM_BASE ] # VM_KASAN = [ VM_BASE config_kasan config_ubsan config_kcov config_ksancov ] # SECURITY = [ config_macf config_audit config_csr config_arcade config_setuid config_kas_info ] # BASE = [ PF VPN SECURITY NFS ] diff --git a/config/Private.arm64.exports b/config/Private.arm64.exports index 126605f19..8b96ac110 100644 --- a/config/Private.arm64.exports +++ b/config/Private.arm64.exports @@ -46,11 +46,14 @@ _sched_perfcontrol_register_callbacks _sched_perfcontrol_update_recommended_cores _sched_perfcontrol_update_recommended_cores_reason _sched_perfcontrol_update_powered_cores +_sched_perfcontrol_check_oncore_thread_preemption _sched_perfcontrol_thread_group_get_name _sched_perfcontrol_thread_group_recommend _sched_perfcontrol_thread_group_preferred_clusters_set _sched_perfcontrol_edge_matrix_get _sched_perfcontrol_edge_matrix_set +_sched_perfcontrol_edge_matrix_by_qos_get +_sched_perfcontrol_edge_matrix_by_qos_set _sched_perfcontrol_edge_cpu_rotation_bitmasks_set _sched_perfcontrol_edge_cpu_rotation_bitmasks_get _sched_perfcontrol_update_callback_deadline @@ -61,6 +64,7 @@ _socd_client_reinit _socd_client_trace _thread_group_join_io_storage _thread_group_join_perf_controller +_thread_group_join_cellular _ml_cpu_init_completed _ml_cpu_signal _ml_cpu_signal_deferred @@ -79,7 +83,6 @@ _ml_page_protection_type _ml_static_ptovirt _ml_static_mfree _ml_update_cluster_wfe_recommendation -_ml_paddr_is_exclaves_owned _mach_bridge_recv_timestamps _mach_bridge_init_timestamp _mach_bridge_set_params @@ -118,6 +121,8 @@ __ZN26IOUnifiedAddressTranslator17getPageTableEntryEy __ZN26IOUnifiedAddressTranslator18setClientContextIDEjb __ZN26IOUnifiedAddressTranslator21removeClientContextIDEv __ZN26IOUnifiedAddressTranslator19isPageFaultExpectedEyj +__ZN26IOUnifiedAddressTranslator7getModeEv +__ZN26IOUnifiedAddressTranslator29getFirmwareAddressSpaceHandleEv __ZN26IOUnifiedAddressTranslator22registerTaskForServiceEP4taskP9IOService __ZN26IOUnifiedAddressTranslator23createMappingInApertureEjP18IOMemoryDescriptorjym __ZN26IOUnifiedAddressTranslator23getTotalPageTableMemoryEv diff --git a/config/Private.exports b/config/Private.exports index 0a59d734e..dfe4e76f9 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -106,12 +106,14 @@ __ZN24IOPerfControlWorkContext10gMetaClassE __ZN24IOPerfControlWorkContext10superClassE __ZTV24IOPerfControlWorkContext __ZN19IOPerfControlClient10copyClientEP9IOServicey +__ZN19IOPerfControlClient23copyClientForDeviceTypeEP9IOServiceyNS_14IOPCDeviceTypeE __ZN19IOPerfControlClient10gMetaClassE __ZN19IOPerfControlClient10superClassE __ZN19IOPerfControlClient10workSubmitEP9IOServicePNS_14WorkSubmitArgsE __ZN19IOPerfControlClient14registerDeviceEP9IOServiceS1_ __ZN19IOPerfControlClient15copyWorkContextEv __ZN19IOPerfControlClient16unregisterDeviceEP9IOServiceS1_ +__ZN19IOPerfControlClient18querySubmitterRoleEP9IOServiceP4taskPj __ZN19IOPerfControlClient18workEndWithContextEP9IOServiceP8OSObjectPNS_11WorkEndArgsEb __ZN19IOPerfControlClient18workSubmitAndBeginEP9IOServicePNS_14WorkSubmitArgsEPNS_13WorkBeginArgsE __ZN19IOPerfControlClient20workBeginWithContextEP9IOServiceP8OSObjectPNS_13WorkBeginArgsE @@ -273,7 +275,7 @@ _apple_encrypted_archive_interface_register _assert_wait_deadline_with_leeway 
_assert_wait_timeout_with_leeway _audio_active -_address_space_debugged +_address_space_debugged_state _b_to_q _backtrace _backtrace_user @@ -287,11 +289,15 @@ _buf_create_shadow _buf_kernel_addrperm_addr _buf_setfilter _buf_shadow +_buf_verify_enable +_buf_verifyptr _bufattr_alloc _bufattr_cpoff _bufattr_cpx _bufattr_dup _bufattr_free +_bufattr_verifykind +_bufattr_verifyptr _bufattr_greedymode _bufattr_isochronous _bufattr_markgreedymode @@ -307,6 +313,7 @@ _bufattr_quickcomplete _bufattr_rawencrypted _bufattr_setcpoff _bufattr_setcpx +_bufattr_setverifyvalid _bufattr_throttled _bufattr_willverify _cache_lookup_ext @@ -441,6 +448,7 @@ _exclaves_register_xrt_hosted_callbacks _exclaves_sensor_start _exclaves_sensor_stop _exclaves_sensor_status +_exclaves_sensor_tick_rate _exclaves_display_healthcheck_rate _ext_paniclog_handle_set_active _ext_paniclog_handle_set_inactive @@ -458,6 +466,7 @@ _garbage_collect_provisioning_profiles _generate_file_permissions_guard_exception _get_aiotask _get_system_inshutdown +_get_system_inuserspacereboot _gpu_accumulate_time _gpu_describe _gpu_fceiling_cb_register @@ -479,6 +488,7 @@ _ifnet_enable_output _ifnet_enqueue _ifnet_flowid _ifnet_get_delegate +_ifnet_get_inband_wake_packet_tagging _ifnet_get_inuse_address_list _ifnet_get_keepalive_offload_frames _ifnet_get_local_ports @@ -505,6 +515,8 @@ _ifnet_report_issues _ifnet_set_bandwidths _ifnet_set_delegate _ifnet_set_idle_flags +_ifnet_set_inband_wake_packet_tagging +_ifnet_set_low_power_wake _ifnet_set_latencies _ifnet_set_link_quality _ifnet_set_management @@ -520,7 +532,10 @@ _ifnet_get_unsent_bytes _ifnet_get_buffer_status _ifnet_normalise_unsent_data _ifnet_set_low_power_mode +_ifnet_set_rx_flow_steering +_ifnet_get_rx_flow_steering _ifnet_notify_tcp_keepalive_offload_timeout +_ifnet_enable_cellular_thread_group _in6_localaddr _in6addr_any _in6addr_local @@ -539,6 +554,7 @@ _ipc_port_release_send _ipc_port_reset_thread_attr _ipf_addv4_internal _ipf_addv6_internal +_is_address_space_debugged _kalloc_data:_kalloc_data_external _kalloc_shared_data:_kalloc_shared_data_external _kalloc_type_impl:_kalloc_type_impl_external @@ -697,6 +713,7 @@ _kern_packet_set_timestamp _kern_packet_set_token _kern_packet_set_traffic_class _kern_packet_set_transport_header_offset +_kern_packet_set_ulpn_flag _kern_packet_set_vlan_tag _kern_packet_get_timestamp_requested _kern_packet_get_tx_completion_status @@ -757,7 +774,6 @@ _ktriage_register_subsystem_strings _ktriage_unregister_subsystem_strings _kx_qsort _linesw -_localnode_id _lockd_shutdown _lockd_request _log @@ -832,6 +848,7 @@ _ml_io_read16 _ml_io_read32 _ml_io_read64 _ml_io_read8 +_ml_io_read_cpu_reg _ml_io_reset_timeouts _ml_io_reset_timeouts_phys _ml_io_write @@ -839,15 +856,6 @@ _ml_io_write16 _ml_io_write32 _ml_io_write64 _ml_io_write8 -_mnl_instantiate -_mnl_register -_mnl_msg_alloc -_mnl_msg_complete -_mnl_msg_free -_mnl_msg_to_node -_mnl_msg_from_node -_mnl_set_link_state -_mnl_terminate _net_add_domain:_net_add_domain_old _net_add_proto:_net_add_proto_old _net_del_domain:_net_del_domain_old @@ -856,6 +864,7 @@ _net_domain_contains_hostname _netboot_root _nfs_register_hooks _nfs_unregister_hooks +_nxioctl_kernel _os_reason_create _os_reason_alloc_buffer_noblock _os_reason_get_kcdata_descriptor @@ -879,6 +888,8 @@ _current_persona_get _persona_put _pffinddomain:_pffinddomain_old _pffindproto:_pffindproto_old +_pktap_input_packet +_pktap_output_packet _code_signing_configuration _disable_code_signing_feature _set_compilation_service_cdhash @@ -908,7 +919,10 
@@ _port_name_to_task:_port_name_to_task_external _port_name_to_thread _post_sys_powersource _proc_best_name -_proc_ident +_proc_find_ident_validated +_proc_ident_equal_ref +_proc_ident_equal +_proc_ident_equal_token _proc_csflags _proc_fdlist _proc_get_filter_message_flag @@ -981,6 +995,8 @@ _rootvp _rsr_bump_version _rsr_check_vnode _rsr_get_version +_research_mode_state +_extended_research_mode_state _sane_size _sbappendaddr _sbappendrecord @@ -1100,6 +1116,7 @@ _vfs_context_get_special_port _vfs_context_set_special_port _vfs_context_is_dataless_manipulator _vfs_context_skip_mtime_update +_vfs_context_allow_entitled_reserve_access _vfs_setdevvp _vfs_devvp _vfs_get_thread_fs_private @@ -1127,6 +1144,7 @@ _vm_map_page_shift _vm_map_page_size _vm_map_round_page_mask _vm_map_trunc_page_mask +_vm_map_kernel_max_simple_mappable_size _vm_page_wire_count _vnode_hold _vnode_drop @@ -1237,6 +1255,7 @@ _vnode_setautocandidate _vnode_setdirty _vnode_setfastdevicecandidate _vnode_setnoflush +_vnode_hasmultipath _vslock _vsunlock _vfs_isswapmount diff --git a/config/libTightbeam.exports b/config/libTightbeam.exports index a3a3f3828..e37ff0fe3 100644 --- a/config/libTightbeam.exports +++ b/config/libTightbeam.exports @@ -23,7 +23,6 @@ _tb_service_connection_message_construct _tb_service_connection_message_destruct _tb_transport_call_message_handler _tb_transport_get_context -_tb_transport_message_buffer_copy _tb_transport_message_buffer_wrap_buffer _tb_transport_set_message_handler _tb_transport_set_message_handler_f diff --git a/doc/debugging/extensible_paniclog.md b/doc/debugging/extensible_paniclog.md new file mode 100644 index 000000000..6d3ae4cde --- /dev/null +++ b/doc/debugging/extensible_paniclog.md @@ -0,0 +1,389 @@ +# Extensible Paniclog + +This documentation discusses the API and features of the extensible paniclog in XNU's panic flow. + +## Overview + +With this feature we want to provide an infrastructure for kexts / dexts to insert their system state into the paniclog. Currently there is no way of knowing the kext or dext state unless we take a full coredump. With this feature, they can drop relevant state information that will end up in the paniclog and can be used to triage panics. + +## UUID ↔ buffer data mapping + +All clients who adopt this infrastructure will have to use a UUID that maps to a format of the buffer data. Clients will have to provide a mapping that specifies how to decode the data. This mapping will be used to decode the data in DumpPanic or a tool integrated into MPT. + +## IOKit APIs + +Source Code: `iokit/IOKit/IOExtensiblePaniclog.h` + +```c +static bool createWithUUID(uuid_t uuid, const char *data_id, uint32_t max_len, ext_paniclog_create_options_t options, IOExtensiblePaniclog **out); +``` + +This is the first API that is called by a kext to initialize an IOExtensiblePaniclog instance. It takes a UUID, data_id, max len, and options as input and emits an instance in the out pointer. The data id takes a short description of the buffer and the maximum length is 32 bytes. + +```c +int setActive(); +int setInactive(); +``` + +These functions are called to make an IOExtensiblePaniclog instance active or inactive. An instance is collected and put into the panic file only if it's active. It's ignored in the panic path if it's inactive. + +```c +int insertData(void *addr, uint32_t len); +``` + +This function inserts the data pointed to by addr into the IOExtensiblePaniclog instance. It will copy the data into the buffer from offset 0. 
+ +```c +int appendData(void *addr, uint32_t len); +``` + +This function appends the data pointed to by addr into the IOExtensiblePaniclog instance. It will position the data after the previous insert or append. + +```c +void *claimBuffer(); +``` + +This function returns the buffer of the IOExtensiblePaniclog instance. This function also sets the used length of the handle to the max length. The entire buffer is copied out when the system panics after this function call. yieldBuffer() has to be called before using insertData() or appendData(). + +```c +int yieldBuffer(uint32_t used_len); +``` + +This function is called to yield the buffer and set the used_len for the buffer. + +```c +int setUsedLen(uint32_t used_len) +``` + +This function is called to set the used len of the buffer. + +## DriverKit APIs + +Source Code: `iokit/DriverKit/IOExtensiblePaniclog.iig` + +```cpp +static kern_return_t Create(OSData *uuid, OSString *data_id, uint32_t max_len, IOExtensiblePaniclog **out); +``` + +This is the first API that is called by a dext to initialize an IOExtensiblePaniclog instance. It takes a UUID, data_id, and max len as input and emits an instance in the out pointer. The data id is a short description of the buffer; its maximum length is 32 bytes. + +```cpp +kern_return_t SetActive(); +kern_return_t SetInactive(); +``` + +These functions are called to make an IOExtensiblePaniclog instance active or inactive. An instance is collected and put into the panic file only if it's active. It's ignored in the panic path if it's inactive. + +```cpp +kern_return_t InsertData(OSData *data); +``` + +This function inserts the data contained in the OSData object into the IOExtensiblePaniclog instance. It will copy the data into the buffer from offset 0. + +```cpp +kern_return_t AppendData(OSData *data); +``` + +This function appends the data contained in the OSData object into the IOExtensiblePaniclog instance. It will position the data after the previous insert or append. + +```cpp +kern_return_t ClaimBuffer(uint64_t *addr, uint64_t *len); +``` + +This function is called to get a pointer to the ext paniclog buffer. After this function is called, the user is responsible for copying data into the buffer. The entire buffer is copied when a system panics. After claiming the buffer, YieldBuffer() has to be called to set the used_len of the buffer before calling InsertData() or AppendData(). + +```cpp +kern_return_t YieldBuffer(uint32_t used_len); +``` + +This function is called to yield the buffer and set the used_len for the buffer. + +```cpp +kern_return_t SetUsedLen(uint32_t used_len); +``` + +This function is called to set the used len of the buffer. + +## Low-Level Kernel APIs + +Source Code: `osfmk/kern/ext_paniclog.h` + +### ExtensiblePaniclog Handle Struct + +```c +typedef struct ext_paniclog_handle { + LIST_ENTRY(ext_paniclog_handle) handles; + uuid_t uuid; + char data_id[MAX_DATA_ID_SIZE]; + void *buf_addr; + uint32_t max_len; + uint32_t used_len; + ext_paniclog_create_options_t options; + ext_paniclog_flags_t flags; + uint8_t active; +} ext_paniclog_handle_t; +``` + +Handles are used in XNU to manage buffer lifecycles, prevent nested panics during access from the panic path, and keep the API durable and extensible. The primary reason for using handles is to allow XNU to oversee the entire buffer lifecycle: by tracking the buffer's state and managing its deallocation, XNU can avoid issues that may arise during panic scenarios.
+ +```c +ext_paniclog_handle_t *ext_paniclog_handle_alloc_with_uuid(uuid_t uuid, const char *data_id, uint32_t max_len, ext_paniclog_create_options_t); +``` + +This function is called to allocate a handle with a buffer of the specified length; this handle is used as input for all subsequent operations. It takes a UUID, data_id, max len, and options as input. The data id is a short description of the buffer; its maximum length is 32 bytes. This function returns a handle on success and NULL on failure. + +```c +int ext_paniclog_handle_set_active(ext_paniclog_handle_t *handle); +``` + +This function sets the handle as active. In the active state, this buffer will get picked up by the panic path and put into the panic file. + +```c +int ext_paniclog_handle_set_inactive(ext_paniclog_handle_t *handle); +``` + +This function sets the handle as inactive. + +```c +void ext_paniclog_handle_free(ext_paniclog_handle_t *handle) +``` + +This function deallocates all the memory that was allocated in the alloc function. The handle must be valid, and this function should only be called after handle_alloc has been called. + +```c +int ext_paniclog_insert_data(ext_paniclog_handle_t *handle, void *addr, size_t len) +``` + +This function is called to copy data from a caller buffer into the handle buffer. It takes a previously allocated handle, the address of the buffer, and the length of the buffer, and returns 0 on success or a negative value on failure. + +```c +int ext_paniclog_append_data(ext_paniclog_handle_t *handle, void *addr, uint32_t len); +``` + +This function is called to append to the data that is already present in the buffer. + +```c +void *ext_paniclog_get_buffer(ext_paniclog_handle_t *handle) +``` + +This function is called to get a pointer to the ext paniclog buffer. To modify the buffer after getting the pointer, use `ext_paniclog_claim_buffer()`. + +```c +void *ext_paniclog_claim_buffer(ext_paniclog_handle_t *handle); +``` + +This function is called to get a pointer to the ext paniclog buffer. After this function is called, the user is responsible for copying data into the buffer. The entire buffer is copied when a system panics. After claiming the buffer, `ext_paniclog_yield_buffer()` has to be called to set the `used_len` of the buffer before calling `ext_paniclog_insert_data()` or `ext_paniclog_append_data()`. + +```c +int ext_paniclog_yield_buffer(ext_paniclog_handle_t *handle, uint32_t used_len); +``` + +This function is called to yield the buffer and set the used_len for the buffer. + +```c +int ext_paniclog_set_used_len(ext_paniclog_handle_t *handle, uint32_t used_len); +``` + +This function is called to set the used len of the buffer. + +## panic_with_data APIs + +```c +void panic_with_data(uuid_t uuid, void *addr, uint32_t len, uint64_t debugger_options_mask, const char *format, ...); +``` + +This function is called when a kernel client is panicking and wants to insert data into the extensible panic log. We treat this as a special case and put this data at the start of the extensible panic log region. The client has to supply the UUID to decode the buffer that is pushed to the paniclog. + +```c +int panic_with_data(char *uuid, void *addr, uint32_t len, uint32_t flags, const char *msg); +``` + +This provides the same functionality as panic_with_data() for userspace clients.
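+
+Putting the low-level pieces together, the following is a minimal, hypothetical sketch of how a kernel subsystem might register a small state buffer using the handle API above. The UUID string, the "my_subsys" names, and the buffer contents are placeholders, not part of the API.
+
+```c
+static ext_paniclog_handle_t *my_subsys_handle;
+
+static void
+my_subsys_paniclog_init(void)
+{
+	uuid_t uuid;
+	char state[64] = "my_subsys: idle";
+
+	/* Placeholder UUID; a real client would use its registered UUID. */
+	uuid_parse("00000000-0000-0000-0000-000000000000", uuid);
+
+	my_subsys_handle = ext_paniclog_handle_alloc_with_uuid(uuid, "My subsys state",
+	    sizeof(state), EXT_PANICLOG_OPTIONS_NONE);
+	if (my_subsys_handle == NULL) {
+		return;
+	}
+
+	/* Copy the current state into the handle buffer and mark it collectable. */
+	ext_paniclog_insert_data(my_subsys_handle, state, strlen(state));
+	ext_paniclog_handle_set_active(my_subsys_handle);
+}
+```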
+ +## Special Options + +### `EXT_PANICLOG_OPTIONS_ADD_SEPARATE_KEY` + +If the `EXT_PANICLOG_OPTIONS_ADD_SEPARATE_KEY` option is set when creating an ExtensiblePaniclog handle, the Data ID / buffer data (key / value) pair will be added directly to the paniclog instead of under the "ExtensiblePaniclog" key. + +## Implementation + +### Estimating the panic log size + +We want to add the utilization metrics of the panic log to the panic.ips file. This will give us an idea of the percentage of the panic log we currently use and how big each section in the panic log is. We will use this data to estimate how big the other log section usually is and ensure that we leave enough space for this section when inserting the extensible panic log. We will cut off the extensible panic log if we cannot fit all the buffers into the free region. + +### Registering a buffer + Writing data to the buffer + +We have APIs exposed at different layers so that a client can use whatever suits it best. In DriverKit and IOKit cases, they call the `createWithUUID` or `Create` methods to create an IOExtensiblePaniclog instance and use that instance to insert or append data to a buffer. + +Lower level clients use `ext_paniclog_handle_alloc_with_uuid` to allocate a handle and use that handle to insert data using `ext_paniclog_insert_data` and `ext_paniclog_append_data` functions. + +When a kernel client is panicking, it has the option to call `panic_with_data()`, which just takes a UUID, buffer address and length. This API makes sure that we copy this data in to the extensible panic log. + +### Insert data into the extended panic log + +Current structure of the panic log is as follows: + +``` +------------------------- +- Panic Header - +------------------------- +- - +- Panic Log - +- - +------------------------- +- - +- Stack shots - +- - +------------------------- +- - +- Other Log - +- - +------------------------- +- Misc Data - +------------------------- +- - +- - +- Free - +- - +- - +------------------------- +``` + +We want to use the free part of the panic log to insert the extensible panic log. After we insert the stackshots, we calculate and see how much space we have in the panic log to insert the extensible panic log. These calculations will use the data that we collect from our utilization metrics and leave out space for the other log section. We then go through the ext_paniclog linked list and start inserting the buffers into the panic log region until we fill out size we calculated. After this, we move onto inserting data into the other log section. + +## Format / structure of the extensible panic log: + +``` ++---------+------------+---------+---------+------------+------------+---------+---------+---------+-----------+------------+----------+ +| | | | | | | | | | | | | +|Version | No of logs | UUID 1 | Flags 1 | Data ID 1 | Data len 1 | Data 1 | UUID 2 | Flags 2 | Data ID 2 | Data len 2 | Data 2 | +| | | | | | | | | | | | | ++---------+------------+---------+---------+------------+------------+---------+---------+---------+-----------+------------+----------+ +``` + +## Extract and format the extensible panic log into the panic.ips file + +In DumpPanic, we will extract this data from the panic log region and format it to be readable. We can group the data according to uuid and sort it with the data_id of the data. An example of the extensible panic log data in the panic.ips file shown below. 
+ +``` +{ + "ExtensiblePanicLog": { + "": [ + { + "DataID": "0x1" + "data" : + }, + { + "DataID": "0x2" + "data" : + } + ], + "": [ + { + "DataID": "0x1" + "data" : + }, + { + "DataID": "0x2" + "data" : + } + ], + }, + "SeparateFieldDataID1": "Separate buffer value here 1", + "SeparateFieldDataID2": "Separate buffer value here 2", +} +``` + +Notice that there are two fields below ExtensiblePanicLog in the panic.ips example above. If you were to pass the option `EXT_PANICLOG_CREATE_OPTIONS_ADD_SEPARATE_KEY` to the handle create function, DumpPanic would process that handle as seen above, by adding it as a field directly to the panic file instead of including it in the ExtensiblePanicLog field. + +## Example code + +### IOKit Example + +#### Creating the handle + +```c +char uuid_string_1[] = "E2070C7E-A1C3-41DF-ABA4-B9921DACCD87"; +bool res; +kern_return_t ret; + +uuid_t uuid_1; +uuid_parse(uuid_string_1, uuid_1); + +res = IOExtensiblePaniclog::createWithUUID(uuid_1, "Lha ops 1", 1024, EXT_PANICLOG_OPTIONS_NONE, &paniclog_handle_1); +if (res == false) { + DEBUG_LOG ("Failed to create ext paniclog handle: %d\n", res); +} + +DEBUG_LOG("Created panic log handle 1 with UUID: %s\n", uuid_string_1); + +char uuid_string_2[] = "28245A8F-04CA-4932-8A38-E6C159FD9C92"; +uuid_t uuid_2; +uuid_parse(uuid_string_2, uuid_2); +res = IOExtensiblePaniclog::createWithUUID(uuid_2, "Lha ops 2", 1024, EXT_PANICLOG_OPTIONS_NONE, &paniclog_handle_2); +if (res == false) { + DEBUG_LOG ("Failed to create ext paniclog handle: %d\n", res); +} + +DEBUG_LOG("Created panic log handle 2 with UUID: %s\n", uuid_string_2); +``` + +#### Inserting the data + +```c +DEBUG_LOG ("%s\n", __FUNCTION__); +char buff[1024] = {0}; +snprintf(buff, 1024, "HW access Dir: %u Type: %u Address: %llu\n", input->direction, input->type, input->address); + +char buff1[1024] = {0}; + +paniclog_handle_1->insertData(buff, (uint32_t)strlen(buff)); +paniclog_handle_1->setActive(); + +paniclog_handle_2->insertData(input, sizeof(HardwareAccessParameters)); +paniclog_handle_2->setActive(); +``` + +### DriverKit Example + +#### Creating the handle + +```cpp +OSData *uuid_data = OSData::withBytes(&uuid_3[0], sizeof(uuid_t)); +if (!uuid_data) { + IOLog("Data was not created\n"); + return NULL; +} + +OSString *data_id = OSString::withCString("DriverKit OP 1"); + +ret = IOExtensiblePaniclog::Create(uuid_data, data_id, 64, kIOExtensiblePaniclogOptionsNone, &paniclog_handle_3); +if (ret != kIOReturnSuccess) { + IOLog("Failed to create paniclog handle 3\n"); + return NULL; +} +IOLog("EXT_PANICLOG: Created panic log handle 3 with UUID: %s\n", uuid_string_3); +``` + +#### Inserting the data + +```cpp +ret = paniclog_handle_3->ClaimBuffer(&addr, &len); +if (ret != kIOReturnSuccess) { + IOLog("EXT_PANICLOG: Failed to claim buffer. Ret: %x\n", ret); + return NULL; +} + +IOLog("EXT_PANICLOG: Got buffer address %llu, %llu", addr, len); + +buff1 = (char *)addr; + +IOLog("EXT_PANICLOG: Ignoring write for now"); +memcpy(buff1, buff, strlen(buff)); + +paniclog_handle_3->YieldBuffer((uint32_t)strlen(buff)); + +paniclog_handle_3->SetActive(); +``` + diff --git a/doc/lifecycle/startup.md b/doc/lifecycle/startup.md index 484c29e5b..14e2469f3 100644 --- a/doc/lifecycle/startup.md +++ b/doc/lifecycle/startup.md @@ -56,7 +56,11 @@ tables, ... 
Available hooks are: ### Rank usage -- Rank 1: `TUNABLE`, `TUNABLE_WRITEABLE` +- Rank 1: + - All uses of `TUNABLE`, `TUNABLE_WRITEABLE` + - CSR configuration from DeviceTree or boot-args + - CTRR configuration from DeviceTree + - SMR initialization - Middle: globals that require complex initialization (e.g. SFI classes). @@ -285,7 +289,7 @@ interrupts or preemption enabled may begin enforcement. ### Rank usage - Rank 1: Initialize some BSD globals -- Middle: Initialize some early BSD subsystems +- Middle: Initialize some early BSD subsystems and tightbeam runtime `STARTUP_SUB_EXCLAVES` @@ -298,7 +302,6 @@ Early exclaves initialization. ### Rank usage - Rank 1: Determine run-time support for exclaves -- Middle: Initialize tightbeam runtime `STARTUP_SUB_LOCKDOWN` diff --git a/doc/mach_ipc/guard_exceptions.md b/doc/mach_ipc/guard_exceptions.md index 836f0aeba..6691b9768 100644 --- a/doc/mach_ipc/guard_exceptions.md +++ b/doc/mach_ipc/guard_exceptions.md @@ -119,7 +119,7 @@ hitting such a bug usually is a sign of port right mismanagement. ### `kGUARD_EXC_MOD_REFS` 0x00000002 -- **ReportCrash Name**: OVER\_DEALLOC or MOD\_REFS, +- **ReportCrash Name**: `OVER_DEALLOC` or `MOD_REFS`, - **Target meaning**: the mach port name the incorrect operation targets, - **Payload meaning**: - `0x0100000000000000`: a `deallocate` function, @@ -136,7 +136,7 @@ sign of port-right mismanagement. ### `kGUARD_EXC_INVALID_OPTIONS` 0x00000003 -- **ReportCrash Name**: INVALID\_OPTIONS, +- **ReportCrash Name**: `INVALID_OPTIONS`, - **Target meaning**: the message ID of a rejected message via the legacy `mach_msg_trap()` or zero, - **Payload meaning**: the kernel sanitized (see `ipc_current_user_policy()`) @@ -148,7 +148,7 @@ There are several policies dictating the shape of options passed to calls of the ### `kGUARD_EXC_SET_CONTEXT` 0x00000004 -- **ReportCrash Name**: SET\_CONTEXT, +- **ReportCrash Name**: `SET_CONTEXT`, - **Target meaning**: the mach port name the incorrect operation targets, - **Payload meaning**: the value of the context guarding the Mach Port. @@ -163,8 +163,11 @@ This is usually a sign of port right mismanagement. ### `kGUARD_EXC_THREAD_SET_STATE` 0x00000005 - **ReportCrash Name**: N/A, -- **Target meaning**: always zero, -- **Payload meaning**: always zero. +- **Target meaning**: exception flavor, +- **Payload meaning**: + - `0x0100000000000000`: tss called from userspace exception handler, + - `0x0200000000000000`: tss with flavor that modifies cpu registers, + - `0x0300000000000000`: tss called from fatal PAC exception. This exception is thrown when a process is trying to use the `thread_set_state()` interface, or any interface leading to it (such as trying @@ -175,8 +178,8 @@ is disallowed by policy for this process. ### `kGUARD_EXC_EXCEPTION_BEHAVIOR_ENFORCE` 0x00000006 - **ReportCrash Name**: N/A, -- **Target meaning**: always zero, -- **Payload meaning**: always zero. +- **Target meaning**: the new exception behavior, +- **Payload meaning**: the exception mask. This exception is thrown when a process is trying to register an exception port for a behavior not using a task identity port, and that this is disallowed by @@ -205,10 +208,31 @@ as `mach_port_unguard()` on a port that isn't guarded. This is usually a sign of port right mismanagement. +### `kGUARD_EXC_KOBJECT_REPLY_PORT_SEMANTICS` 0x00000009 + +- **ReportCrash Name**: KOBJECT\_REPLY\_PORT\_SEMANTICS, +- **Target meaning**: the mach port name the incorrect operation targets, +- **Payload meaning**: always zero. 
+ +This exception is thrown when a hardened process is trying to send a message +to a kobject port without using an `IOT_REPLY_PORT` to receive the reply. + +### `kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS` 0x0000000a + +- **ReportCrash Name**: `REQUIRE_REPLY_PORT_SEMANTICS`, +- **Target meaning**: the mach port name the incorrect operation targets, +- **Payload meaning**: 1 if the port is a bootstrap port, 0 otherwise. + +This exception is thrown when a caller is violating the reply port semantics in +a process where this is disallowed by policy. This is used to gather telemetry +around violators pending enforcement in a future release. + +This is usually a sign of a programming mistake (violation of the reply port +semantics rules). ### `kGUARD_EXC_INCORRECT_GUARD` 0x00000010 -- **ReportCrash Name**: INCORRECT\_GUARD, +- **ReportCrash Name**: `INCORRECT_GUARD`, - **Target meaning**: the mach port name the incorrect operation targets, - **Payload meaning**: the value of the context guarding the Mach Port. @@ -221,9 +245,9 @@ This is usually a sign of port right mismanagement. ### `kGUARD_EXC_IMMOVABLE` 0x00000020 -- **ReportCrash Name**: ILLEGAL\_MOVE, +- **ReportCrash Name**: `ILLEGAL_MOVE`, - **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. +- **Payload meaning**: (target port type << 32) | disposition. This exception is thrown when a process is attempting to move a port right, and this has been disallowed by policy for this port type and process. @@ -240,9 +264,37 @@ point, and that this is likely going to be phased out in favor of tracking reply ports at the port type level, this is left mostly undocumented on purpose. +### `kGUARD_EXC_INVALID_NOTIFICATION_REQ` 0x00000041 + +- **ReportCrash Name**: INVALID\_NOTIFICATION\_REQ, +- **Target meaning**: IOT_ port type that you are trying to arm the notification on +- **Payload meaning**: The type of notification you were registering for + +This exception is thrown when a process is trying to arm a notification +on a port type that disallows such requests. + + +### `kGUARD_EXC_INVALID_MPO_ENTITLEMENT` 0x00000042 + +- **ReportCrash Name**: `INVALID_MPO_ENTITLEMENT`, +- **Target meaning**: The `mpo_flags_t` that were passed into `mach_port_construct` + +This exception is thrown when you try to construct a mach port type that is disallowed +for your process based on entitlements. + +### `kGUARD_EXC_DESCRIPTOR_VIOLATION` 0x00000043 + +- **ReportCrash Name**: `DESCRIPTOR_VIOLATION`, +- **Target meaning**: The IPC space policy. +- **Payload meaning**: + - `(violation_type << 56) | aux` : the violation's type, among with associated metadata + +This exception is thrown when a process attempts to violate any +Mach message descriptor policies. + ### `kGUARD_EXC_MSG_FILTERED` 0x00000080 -- **ReportCrash Name**: MSG\_FILTERED, +- **ReportCrash Name**: `MSG_FILTERED`, - **Target meaning**: the mach port name the incorrect operation targets, - **Payload meaning**: the message ID of the filtered message. @@ -270,9 +322,17 @@ exceptions in the Mach IPC and VM world. This is not a supported configuration. ### `kGUARD_EXC_INVALID_RIGHT` 0x00000100 -- **ReportCrash Name**: INVALID\_RIGHT, +- **ReportCrash Name**: `INVALID_RIGHT`, - **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. 
+- **Payload meaning**: + - `0x01 << 56` : `ipc_port_translate_receive` failed, + - `(0x02 << 56) | (right << 32) | ie_bits` : `ipc_right_delta` failed, + - `(0x03 << 56) | ie_bits` : `ipc_right_destruct` failed, + - `(0x04 << 56) | (reason << 32) | ie_bits` : `ipc_right_copyin` failed, + - `(0x05 << 56) | ie_bits` : `ipc_right_dealloc` failed, + - `(0x06 << 56) | (otype << 32) | io_type` : `ipc_right_deallocate_kernel` failed, + - `(0x07 << 56) | ie_bits` : invalid port in `ipc_object_translate_port_pset`, + - `(0x08 << 56) | ie_bits` : invalid pset in `ipc_object_translate_port_pset`. This exception is thrown when an operation is targetting a port which rights do not match the caller's expectations. Examples of such mistakes are: @@ -289,7 +349,7 @@ This is usually a sign of port right mismanagement. ### `kGUARD_EXC_INVALID_NAME` 0x00000200 -- **ReportCrash Name**: INVALID\_NAME, +- **ReportCrash Name**: `INVALID_NAME`, - **Target meaning**: the mach port name the incorrect operation targets, - **Payload meaning**: always zero. @@ -304,9 +364,12 @@ This is usually a sign of port right mismanagement. ### `kGUARD_EXC_INVALID_VALUE` 0x00000400 -- **ReportCrash Name**: INVALID\_VALUE, +- **ReportCrash Name**: `INVALID_VALUE`, - **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. +- **Payload meaning**: + - `(0x01 << 56) | (type << 32) | size` : invalid trailer in `mach_port_peek`, + - `(0x02 << 56) | (right << 32) | (delta << 16) | ie_bits` : `ipc_right_delta` failed, + - `(0x03 << 56) | (srdelta << 32) | ie_bits` : `ipc_right_destruct` failed, This exception is thrown when: @@ -323,7 +386,7 @@ Mach IPC interfaces. ### `kGUARD_EXC_INVALID_ARGUMENT` 0x00000800 -- **ReportCrash Name**: INVALID\_ARGUMENT, +- **ReportCrash Name**: `INVALID_ARGUMENT`, - **Target meaning**: the mach port name the incorrect operation targets, - **Payload meaning**: the correct value of the context guarding the Mach Port. @@ -335,9 +398,13 @@ This is usually a sign of port right mismanagement. ### `kGUARD_EXC_KERN_FAILURE` 0x00004000 -- **ReportCrash Name**: KERN\_FAILURE, -- **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. +- **ReportCrash Name**: `KERN_FAILURE`, +- **Target meaning**: always zero, +- **Payload meaning**: + - `0x0100000000000000`: task other than launchd arm pd on service ports, + - `0x0200000000000000`: not using IOT_NOTIFICATION_PORT for pd notification, + - `0x0300000000000000`: notification port not owned by launchd, + - `0x0400000000000000`: register multiple pd notification. This exception is thrown when a caller is trying to request a port-destroyed notification that is disallowed by system policy. This should really have been @@ -348,9 +415,9 @@ This is usually a sign of port right mismanagement. ### `kGUARD_EXC_SEND_INVALID_REPLY` 0x00010000 -- **ReportCrash Name**: SEND\_INVALID\_REPLY, +- **ReportCrash Name**: `SEND_INVALID_REPLY`, - **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. +- **Payload meaning**: (reply port ie bits << 32) | disposition. This exception is thrown when a caller is trying to send a message whose reply port (the `msgh_local_port` field of a Mach message) violates policies around @@ -361,9 +428,12 @@ This is usually a sign of port right mismanagement. 
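+
+The payload encodings listed throughout this document follow a common convention: a reason code in the top byte and entry- or right-specific metadata in the remaining bits. As an illustration only (not an API), a crash-log consumer could split a payload like this:
+
+```c
+/*
+ * Illustrative helpers: split a guard-exception payload into the
+ * top-byte reason code and the lower 56 bits of metadata.
+ */
+static inline uint8_t
+guard_payload_reason(uint64_t payload)
+{
+	return (uint8_t)(payload >> 56);
+}
+
+static inline uint64_t
+guard_payload_metadata(uint64_t payload)
+{
+	return payload & ((1ULL << 56) - 1);
+}
+```
+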
### `kGUARD_EXC_SEND_INVALID_RIGHT` 0x00020000 -- **ReportCrash Name**: SEND\_INVALID\_RIGHT, +- **ReportCrash Name**: `SEND_INVALID_RIGHT`, - **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. +- **Payload meaning**: + - `(0x01 << 56) | disposition`: copyin port descriptor failed, + - `(0x02 << 56) | disposition`: copyin ool port descriptor failed, + - `(0x03 << 56) | disposition`: copyin guarded port descriptor failed, This exception is thrown when a caller is trying to send a message where one of the port descriptors denotes a right that doesn't match the requested @@ -375,9 +445,9 @@ This is usually a sign of port right mismanagement. ### `kGUARD_EXC_SEND_INVALID_VOUCHER` 0x00040000 -- **ReportCrash Name**: SEND\_INVALID\_VOUCHER, +- **ReportCrash Name**: `SEND_INVALID_VOUCHER`, - **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. +- **Payload meaning**: disposition of the voucher port. This exception is thrown when a caller is trying to send a message whose voucher port (the `msgh_voucher_port` field of a Mach message) violates policies around @@ -388,7 +458,7 @@ This is usually a sign of port right mismanagement. ### `kGUARD_EXC_RCV_INVALID_NAME` 0x00080000 -- **ReportCrash Name**: RCV\_INVALID\_NAME, +- **ReportCrash Name**: `RCV_INVALID_NAME`, - **Target meaning**: the mach port name the incorrect operation targets, - **Payload meaning**: always zero. @@ -406,7 +476,7 @@ crash log, and the process will keep going). ### `kGUARD_EXC_RCV_GUARDED_DESC` 0x00100000 -- **ReportCrash Name**: RCV\_GUARDED\_DESC, +- **ReportCrash Name**: `RCV_GUARDED_DESC`, - **Target meaning**: the mach port name the incorrect operation targets, - **Payload meaning**: always zero. @@ -440,11 +510,41 @@ reply port on iOS. It is currently a soft crash to collect telemetry before the actual enforcement. +### `kGUARD_EXC_OOL_PORT_ARRAY_CREATION` 0x00100003 + +- **ReportCrash Name**: N/A, +- **Target meaning**: always zero. +- **Payload meaning**: always zero. + +This is telemetry for processes creating a port with flag +MPO_CONNECTION_PORT_WITH_PORT_ARRAY without an entitlement + + +### `kGUARD_EXC_MOVE_PROVISIONAL_REPLY_PORT` 0x00100004 + +- **ReportCrash Name**: N/A, +- **Target meaning**: the mach port name of the provisional reply port, +- **Payload meaning**: always zero. + +This exception is thrown when a process opted for enhanced security v2 moves +the receive right of a provisional reply port out of its ipc space. + +### `kGUARD_EXC_REPLY_PORT_SINGLE_SO_RIGHT` 0x00100005 + +- **ReportCrash Name**: N/A, +- **Target meaning**: the mach port name of the reply port, +- **Payload meaning**: the copyin reason. + +This exception is thrown when a process attempts to create more than +one single send-once right for a reply port. Reply ports are not allowed +to extend more than one single send-once right at any given moment. + + ### `kGUARD_EXC_MOD_REFS_NON_FATAL` 0x00200000 -- **ReportCrash Name**: OVERDEALLOC\_SOFT, +- **ReportCrash Name**: `OVERDEALLOC_SOFT`, - **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. +- **Payload meaning**: same as `kGUARD_EXC_MOD_REFS`. This is the same as `kGUARD_EXC_MOD_REFS`, except that this is delivered as a soft error. @@ -452,24 +552,10 @@ soft error. ### `kGUARD_EXC_IMMOVABLE_NON_FATAL` 0x00400000 -- **ReportCrash Name**: ILLEGALMOVE\_SOFT. +- **ReportCrash Name**: `ILLEGALMOVE_SOFT`. 
- **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. +- **Payload meaning**: same as `kGUARD_EXC_IMMOVABLE`. This is the same as `kGUARD_EXC_IMMOVABLE`, except that this is delivered as a soft error. - -### `kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS` 0x00800000 - -- **ReportCrash Name**: REQUIRE\_REPLY\_PORT\_SEMANTICS, -- **Target meaning**: the mach port name the incorrect operation targets, -- **Payload meaning**: always zero. - -This exception is thrown when a caller is violating the reply port semantics in -a process where this is disallowed by policy. This is used to gather telemetry -around violators pending enforcement in a future release. - -This is usually a sign of a programming mistake (violation of the reply port -semantics rules). - diff --git a/doc/mach_ipc/ipc_security_concepts.md b/doc/mach_ipc/ipc_security_concepts.md new file mode 100644 index 000000000..11a55f759 --- /dev/null +++ b/doc/mach_ipc/ipc_security_concepts.md @@ -0,0 +1,116 @@ +Mach IPC Security concepts +========================== + +This documentation describes various security concepts in this +subsystem. Each section covers a single concept, addressing topics such as +motivation, design of the feature, and implementation details that are +important. + + +## IPC space policy + +### Motivation and design + +Over time our IPC policies have grown in complexity and depend on several +parameters, to name a few: being a simulated process, a platform binary, or +having browser entitlements. + +As a result, a notion of IPC space policy exists that projects the various +system policies into a single enum per IPC space. This policy is an inherent +immutable property of an IPC space which allows its value to be queried without +holding any locks. + + +### Implementation details + +The source of truth for IPC policies is the `struct ipc_space::is_policy` field, +which can be accessed with the `ipc_space_policy()` accessor. + +This field is computed when a task IPC space is enabled (in +`ipc_task_enable()`), and is immutable for the lifetime of this space. In +addition to that, the field is dPACed in order to be resilient to early memory +corruption primitives. + +For convenience, the policy bits of a space are injected in other enums +(such as `mach_msg_options64_t`). The `IPC_SPACE_POLICY_BASE()` macro helps +form types that extend the space policy. + + +## Pinned Entries + +### Motivation and design + +Certain kinds of send rights have a well-understood lifecycle on the system, +during which there must always be an extant send right alive for the port. +Obvious examples of this are task or thread control ports which must have +a live send right in their corresponding IPC space while the task or thread +they reference is alive. + +In order to catch port management issues that could lead to various confused +deputy issues, the Mach IPC subsystem provides a notion of pinned send rights. +A pinned send right is a property of a Mach IPC entry, which denotes that this +entry must always have at least one extant send right alive. + +Pinning can be undone in two ways: + +- when a port receive right is destroyed, pinning is no longer effective, + and entries will be automatically unpinned as part of the dead-name check; +- unpinning can be explicitly requested by the kernel. + + +### When and how to use pinned rights?
+ +Pinned rights were designed to protect `mach_task_self()` and +`_pthread_mach_thread_self_direct()` which can lead to grave security bugs when +port lifecycle management mistakes are made. The bracketing there is very +simple: + +- task ports are never unpinned; +- thread ports are unpinned when the thread terminates. + + +There might be other ports on the system which can use this facility, however +they must have the right shape: either the port dying (the receive right being +destroyed) is an adequate way to unpin the entry, or there must be a clearly +identified kernel path that can unpin the entry without any confusion with other +ports. + +Adding unpinning paths that can't verify that the port being unpinned is +"theirs" would lead to weakening this feature and would reintroduce avenues +to confuse the system due to port mismanagement bugs. + + +### Implementation details + +Pinning is denoted by the `IE_BITS_PINNED_SEND` bit +of the `struct ipc_entry::ie_bits` field. + +IPC entries gain this bit the first time the kernel calls +`ipc_port_copyout_send_pinned()` for a given port and IPC space. + +When the `IE_BITS_PINNED_SEND` is set, then the `MACH_PORT_TYPE_SEND` bit must +be set too, with the `IE_BITS_UREFS()` for this entry being at least 1. + +In order to respect that pinning is ignored immediately when a port becomes +dead, enforcing `IE_BITS_PINNED_SEND` semantics must be done under the space +lock, either right after a dead-name conversion check happened +(`ipc_right_check()` has been called) or by checking explicitly that the port +is still active (`ip_active()` returns true) when a dead-name conversion isn't +desirable. + + +### Usage and enforcement + +Task and thread control ports are pinned for all processes within the +owning IPC space of the task in question, for all processes on the system. + +The `ipc_control_port_options` boot-arg determines the reaction of the system to +violations of pinning: + +- hardened processes and above have hard enforcement of pinning rules (violating + the rules terminates the process); +- other processes have a soft enforcement: violating pinning rules returns a + `KERN_INVALID_CAPABILITY` error and generates a non fatal guard exception. + + + diff --git a/doc/mach_ipc/port_types.md b/doc/mach_ipc/port_types.md new file mode 100644 index 000000000..3a6f1e5d6 --- /dev/null +++ b/doc/mach_ipc/port_types.md @@ -0,0 +1,164 @@ +Mach IPC Port Types +========================== + +This document is not a tutorial on or encouragement to write new mach code, but +serves as documentation for darwin engineers to map security policies to ports +used in the lower layers of the OS. See the warning/disclaimer at +https://at.apple.com/dont_write_new_mach_code and come talk to us in +#help-darwin before using the knowledge contained here. + +# Port Types +A port can have exactly one of the following types, and we describe what each +port means from the userspace perspective, including what you are/aren't allowed +to do with a particular port and how to construct it. The source of truth for +security policies in the kernel is the `ipc_policy_array` in `ipc_policy.c` + +### IOT_PORT_SET +#### Creation +- `mach_port_allocate(... 
MACH_PORT_RIGHT_PORT_SET ...)` +#### Behavior/Usage +- Allows you to monitor an entire set of ports for messages at once, similar to +`select(2)` +#### Security Restrictions +- No special security restrictions on this port + +### IOT_PORT +#### Creation +- Your standard port that you obtain through the port allocation APIs if you +don't pass any special flags. +#### Behavior/Usage +- Sends traditional mach messages and is generally associated with "raw mach" - +new code generally should not use these. +#### Security Restrictions +- No special security restrictions on this port + +### IOT_SERVICE_PORT +#### Creation +- pass `MPO_SERVICE_PORT` to `mach_port_construct` +#### Behavior/Usage +- Used by `launchd` as the port which drives the launch-on-demand behavior of +services/daemons on the system. Clients lookup the service port for some service +using `bootstrap_lookup` and then can form a connection with that service. - +`launchd` reclaims these ports when the process owning the service port is +killed so that the service port is always active. +#### Security Restrictions +- This is the "hardened" version of the service port which has various security +policies such as immovabile receive right and enforced reply port semantics. - +`launchd` enforces that all platform binary services use this version of the +service port (as opposed to the `WEAK` version below), and allows third parties +to opt into this following the completion of rdar://137633308. See the `launchd` +documentation for more details about how to opt into this restriction - The +kernel will enforce that platform restrictions binaries receive this version of the service +port following completion of rdar://133304899. + +### IOT_WEAK_SERVICE_PORT +#### Creation +- pass `MPO_WEAK_SERVICE_PORT` to `mach_port_construct` +#### Behavior/Usage +- Same feature set and usage as `IOT_SERVICE_PORT` above, the only difference is +the associated security policy. +#### Security Restrictions +- No security hardening. Launchd entirely controls which processes get the weak +vs. "strong" service ports, and the kernel will eventually enforce that launchd +has created the right kind of port for hardened processes. See `launchd` +documentation for more info. + +### IOT_CONNECTION_PORT +#### Creation +- pass `MPO_CONNECTION_PORT` to `mach_port_construct` +#### Behavior/Usage +- A connection port models an established connection between two parties, +commonly between a client and a service, but it's also generalizable to peer +connections. +#### Security Restrictions +- We assume that the handshake mechanism to create the connection between these +two parties is sufficiently hardened, so the security boundary we want to +protect here is that the connection and its replies are contained between the +two parties. To accomplish this the connection port is marked as immovable +receive and requires reply port semantics, both of which combined kills man in +the middle attacks at this layer. + +### IOT_CONNECTION_PORT_WITH_PORT_ARRAY +#### Creation +- pass `MPO_CONNECTION_PORT_WITH_PORT_ARRAY` to `mach_port_construct` +#### Behavior/Usage +- Serves as a connection port, however does not have the mechanics/behaviors +of `IOT_CONNECTION_PORT`. Unlike other port types, this type is allowed to +receive out-of-line port array descriptors (`MACH_MSG_OOL_PORTS_DESCRIPTOR`) +in platform binaries. For enhanced security v2, it's the only port type that +is allowed to receive `MACH_MSG_OOL_PORTS_DESCRIPTOR`. 
+- In addition to that, we enforce the following restrictions on messages with +`MACH_MSG_OOL_PORTS_DESCRIPTOR` descriptor (also, only for platform binaries): + 1. a message cannot carry more than ONE single OOL port array. + 2. the only allowed disposition is `MACH_MSG_TYPE_COPY_SEND`. + +#### Security Restrictions +- Can only be created by binaries that have the +`com.apple.developer.allow-connection-port-with-port-array` entitlement. + +### IOT_EXCEPTION_PORT +#### Creation +- pass `MPO_EXCEPTION_PORT` to `mach_port_construct` +#### Behavior/Usage +- Used in mach exception handling, if you have the entitlement +`com.apple.security.only-one-exception-port` then you must use the +`task_register_hardened_exception_handler` workflow which only accepts this type +of port. Otherwise the historical, non-hardened exception handling workflow +using `task_set_exception_ports` accepts either a standard `IOT_PORT` or an +`IOT_EXCEPTION_PORT`. +#### Security Restrictions +- This port must be immovable receive when using the hardened exception flow to +ensure that exceptions cannot be diverted and handled/stalled outside of the +process generating them. + +## Reply Ports +- If your port type requires reply port semantics, then you must use one of the +following reply port types as the reply field (`msgh_local_port`) when sending a +message to `p`. See [reply port +defense](ipc_security_concepts.md#reply-port-defense) for more details. + +### IOT_REPLY_PORT +#### Creation +- pass `MPO_REPLY_PORT` to `mach_port_construct` +#### Behavior/Usage +- A reply port exists in your space, and you use it to receive replies from +clients in response to messages that you send them. It's intended to model +synchronous IPC where you send a message to a peer and expect a single message +in response. +#### Security Restrictions +- Reply ports guarantee that a reply comes back to you from the person you are +messaging - in other words, it counters person-in-the-middle attacks. It +accomplishes this by using send-once rights to ensure that *exactly* one reply +from the destination will come back to your reply port, which is marked as +immovable receive, and the send right must be a make-send-once so that it cannot +be moved after its creation. + +### IOT_SPECIAL_REPLY_PORT +#### Creation +- Created by the kernel: libxpc and dispatch call into +`thread_get_special_reply_port` to obtain the send/receive right for this +thread-specific port. +#### Behavior/Usage +- Has the same properties as a reply port above, but this is used by libxpc and +dispatch to provide turnstile/importance inheritance capabilities. +#### Security Restrictions +- same as reply ports above + +### IOT_PROVISIONAL_REPLY_PORT +#### Creation +- pass `MPO_PROVISIONAL_REPLY_PORT` to `mach_port_construct` +#### Behavior/Usage +- This has the mechanics of a normal `IOT_PORT` in that it has no special +behaviors/usage/restrictions, but it counts as reply port for the purposes of +enforced reply port semantics. +#### Security Restrictions +- None, this requires special entitlements in platform restrictions binaries. + +# Violations of Port Security Policies + +If you violate the security policies outlined above, expect to receive a +`mach_port_exc_guard` exception, which will either emit telemetry or fatally +crash your process depending on the enforcement level of the security violation. +See [List of fatal Mach IPC guard +exceptions](guard_exceptions.md#List-of-fatal-Mach-Port-Guard-Exceptions) or +at.apple.com/exc-guard for more details. 
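+
+## Example: Constructing a Typed Port (Illustrative)
+
+The snippet below is an illustrative userspace sketch, not code taken from xnu,
+of how one of the typed ports above is created by passing the corresponding
+`MPO_*` flag to `mach_port_construct()`. It assumes the `MPO_REPLY_PORT` flag
+named in this document is available in the SDK you build against.
+
+```c
+#include <mach/mach.h>
+#include <mach/mach_port.h>
+
+/* Create a receive right whose kernel port type will be IOT_REPLY_PORT. */
+static mach_port_t
+create_reply_port(void)
+{
+	mach_port_options_t opts = {
+		.flags = MPO_REPLY_PORT,    /* flag named above; assumed present in <mach/port.h> */
+	};
+	mach_port_t name = MACH_PORT_NULL;
+
+	kern_return_t kr = mach_port_construct(mach_task_self(), &opts, 0, &name);
+	return (kr == KERN_SUCCESS) ? name : MACH_PORT_NULL;
+}
+```
+
+The same pattern applies to the other creation flags listed above (for example
+`MPO_CONNECTION_PORT` or `MPO_SERVICE_PORT`, the latter being reserved to
+`launchd` as described earlier); only the flag changes, the construction call
+does not.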
diff --git a/doc/observability/coalitions.md b/doc/observability/coalitions.md new file mode 100644 index 000000000..d7be7f5b8 --- /dev/null +++ b/doc/observability/coalitions.md @@ -0,0 +1,134 @@ +# Coalitions + +A look at coalitions and how they're implemented in XNU. + + +## Overview + +A coalition is a group of related processes. + +Currently there are two types of coalition: *resource coalitions* and *jetsam coalitions*. Each process on the system is a member of one of each. + +Coalition membership is preserved across `fork`, `exec` and `posix_spawn`, and cannot be changed after a process is created. + +`launchd` is responsible for creating, destroying and reaping coalitions (via `coalition_create`, `coalition_terminate` and `coalition_reap`). It's also responsible for spawning process into specific coalitions via `posix_spawn`+`posix_spawnattr_setcoalition_np`. These interfaces are for `launchd`'s use only (or unit testing XNU itself). Fundamentally, a coalition is intended to represent a `launchd` job. + + +Each coalition has a 64-bit ID (both resource and jetsam coalitions share the same ID space). ID 1 includes the `kernel_task` and `launchd`. + +Coalition IDs are not re-used (they are assigned monotonically by the kernel). However, since empty coalitions are valid, `launchd` tries to keep a single coalition per loaded service and spawn into an existing coalition rather than create a new one each time the service restarts. Apps usually get a fresh coalition on each launch. + +Each process occupies a *role* in its coalition: one process is tagged as the `COALITION_TASKROLE_LEADER`, XPC services have `COALITION_TASKROLE_XPC`, and app extensions have `COALITION_TASKROLE_EXT`, etc. + +`launchd` additionally stores a name string for each coalition; this can be obtained via XPC. This comes from the `Label` plist key / bundle ID / App name depending on the type of process. Jetsam coalitions, unlike resource coalitions, have a short/pretty name passed into `DEVELOPMENT` kernels. This is for quick debugging of thread groups and must not be used for any decisions on device. + +## Resource Coalitions + +Resource coalitions are used by things like Activity Monitor to aggregate CPU usage, energy consumption, I/O etc. The idea is we can make statements like 'Safari is using 50% CPU' even if Safari has 10 different processes using 5% CPU each. + +Every few minutes, powerlog samples the resource usage of each coalition on the system. This data ultimately feeds into the Battery UI that's displayed to users. + +We also use resource coalitions to drive the 'significant energy usage' report in macOS. + +Unlike jetsam coalitions, App extensions (which usually have a different vendor to their host app) are spawned into their own resource coalition for separate tracking. + +To query resources used by a given coalition, call `coalition_info_resource_usage`. From the command line you can use `coalitionctl show`. + +### Ledgers + +A ledger is a lightweight key-value store that we use to track various metrics. Ledgers are created from a template, which determines the set of keys (which we call 'entries'). For example, each task has a ledger containing entries like `cpu_time` and `wired_mem`. + +Entries are essentially 64-bit counters that increment monotonically (which is done via `ledger_credit*`). 
However some fields like `wired_mem` can logically decrease over time; this is implemented by incrementing a second 64-bit counter called 'debit' (via `ledger_debit*`) so that the overall `wired_mem` usage at a moment in time can be computed via `credit - debit` (sometimes called the 'balance'). + +Since ledgers are fundamentally lock-free, it's possible that readers may see bogus values. For example, if one thread writes entry A and then entry B, a reader thread might only see the update to A. Therefore computing metrics from multiple entries should be done with care. + +Ledgers also help us implement resource limits like 'this process should use no more than 10 seconds of CPU time in a 20 second period'. This is implemented via timers that 'refill' ledger debit periodically (see `ledger_refill`). Each credit/debit operation checks if the new balance exceeds the desired limit and invokes appropriate callbacks (e.g to kill the task). + +Each thread, task, and resource coalition has its own separate ledger. Thread ledgers currently *only* store `cpu_time`. Task ledgers have many other entries like memory usage, I/O, etc. Resource coalitions have a ledger that's instantiated from the task ledger template. Confusingly there's also a `resource_monitor_ledger` attached to each coalition which has a single `logical_writes` entry. + +When `cpu_time` is updated (at context switch), both the task and thread ledger is incremented. Therefore the `cpu_time` in the task's ledger is always equal to the sum of its threads, plus that of any threads that have exited. On the other hand, the coalition's ledger is only updated when a task dies. At this point, all entries in the task's ledger are added into the coalition's ledger (see the `ledger_rollup` call in `i_coal_resource_remove_task`). + +### Gathering resource usage + +Calculating the current resource usage of a coalition is a surprisingly tricky process, since data is stored in a number of places. The logic for this lives in `coalition_resource_usage_internal`. + +Fundamentally the goal is to sum the resources used by all tasks in the coalition, plus that of any dead tasks. + +We create a temporary ledger, into which we sum the coalition's ledger (which holds resources from dead tasks), then iterate alive tasks and sum their ledgers too. + +Some data like energy usage is tracked by `recount` (see [recount.md](doc/observability/recount.md)) rather than ledgers, so we sum this information in a similar manner. + +This is all done while holding the `coalition_lock`, which ensures we don't double count if a task dies while we iterate. Nothing stops a task updating its own ledger while we iterate, and while this could result in bogus data, increments are generally small enough that this is benign. + +### CPU time and energy billing + +Through the magic of Mach vouchers, XNU can track CPU time and energy consumed *on behalf of* other processes. + +For example, suppose a task sends an XPC request to `launchd`, which does some computation and sends a response. We would like to track the computation between request and response as done by the task itself rather than `launchd`. + +This information is surfaced in the following members of `struct coalition_resource_usage`: + +* `{cpu_time,energy}_billed_to_others` stores the amount of `cpu_time`/`energy` that *we* performed *on behalf of* other coalitions. +* `{cpu_time,energy}_billed_to_me` stores the amount of `cpu_time`/`energy` that *other* coalitions performed for us. 
+* `cpu_time`/`energy` stores the 'raw' amount of time/energy that was consumed by our processes. + +Therefore, the logical amount of `cpu_time`/`energy` that a coalition has consumed can be computed by `cpu_time + cpu_time_billed_to_me - cpu_time_billed_to_others` (and likewise for `energy`). + +Note that these fields are summed per-task, so in the case where `x` amount of `cpu_time` is billed between two tasks in the same coalition, one task's `cpu_time_billed_to_me` will increment by `x`, and the other task's `cpu_time_billed_to_others` will *also* increment by `x`. Therefore the coalition's billing-related fields will increment despite no work being done outside of the coalition itself. In other words, both `*_billed_to_{me,others}` must be jointly considered for accurate accounting (as is done by the expression above). + +We also surface `gpu_energy`/`gpu_energy_billed_to_{me,others}` but this is updated by the GPU kext rather than XNU. We expose the following KPIs for this purpose: + +* `current_energy_id`: returns a unique 'energy ID' for the current task's resource coalition (exposed as an opaque ID) +* `task_id_token_to_energy_id`: produces an energy ID from a particular task port name +* `energy_id_report_energy`: looks up the resource coalition from energy ID, and increments `gpu_energy`/`gpu_energy_billed_to_{me,others}` as appropriate. + +ANE billing information is not yet captured: [rdar://122812962](rdar://122812962) (Capture ANE energy billed to me and others per coalition). + +### Bank objects + +The above fields are tracked by 'bank' objects attached to vouchers, which are sent along with Mach IPC messages. A full treatment of vouchers is beyond the scope of this document, but using our previous 'task doing XPC to `launchd`' example, the mechanism can be summarized as follows: + +When `launchd` receives the message, it 'adopts' the voucher which binds the bank to its thread (see `thread->ith_voucher`). When `launchd` is done, the voucher is un-bound. Note that voucher tracking is implemented in libxpc/libdispatch so anyone using those libraries gets this automatically. + +Bank objects are split into two halves: + +A *bank account* is essentially just a small ledger with entries for `cpu_time` and `energy`. When a thread adopts a voucher with a bank attribute, we point to this ledger in `thread->t_bankledger` (see `bank_swap_thread_bank_ledger`). Then, any updates to the task's `cpu_time` or `energy` are *also* added onto `thread->t_bankledger`. + +A *bank task* is an object that provides linkage between a task and its associated *bank accounts*. For example, if we do an IPC to 3 other tasks, we'd like to link ourselves to those 3 separate bank accounts. The `bank_task->bt_accounts_to_{pay,charge}` lists track precisely this information. + +Each task is essentially 1-1 with a `bank_task`. However, a `bank_task` may outlive its associated task in case the task dies but is still being billed by someone else. + +When a *bank account* is deallocated (due to vouchers getting released, meaning tasks have completed their work), we rollup/sum the accumulated `cpu_time`/`energy` from the bank account as follows: + +* The 'bank merchant' (returning to the previous example, this would be `launchd`) has its `*billed_to_others` task ledger entry incremented by the amount stored in the bank's ledger +* The 'bank holder' (this would be the other task) has its `*billed_to_me` task ledger entry incremented by the amount stored in the bank's ledger. 
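+
+As an aside, here is an illustrative userspace sketch (not code from xnu) of how
+the billed fields end up being consumed, applying the logical-consumption formula
+from earlier. It assumes the private `coalition_info_resource_usage()` wrapper
+named in this document, with the signature and `struct coalition_resource_usage`
+field names used above:
+
+```c
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/coalition.h>   /* private header; assumed to declare the wrapper and struct */
+
+/* Logical CPU time for a coalition: raw time, plus time others did on our
+ * behalf, minus time we did on behalf of others. */
+static void
+print_logical_cpu_time(uint64_t coalition_id)
+{
+	struct coalition_resource_usage cru;
+	memset(&cru, 0, sizeof(cru));
+
+	/* Private interface; signature assumed as described in this document. */
+	if (coalition_info_resource_usage(coalition_id, &cru, sizeof(cru)) != 0) {
+		perror("coalition_info_resource_usage");
+		return;
+	}
+
+	int64_t logical = (int64_t)cru.cpu_time
+	    + (int64_t)cru.cpu_time_billed_to_me
+	    - (int64_t)cru.cpu_time_billed_to_others;
+
+	/* Can transiently appear to go backwards across samples; see the Bugs section below. */
+	printf("logical cpu_time: %lld\n", (long long)logical);
+}
+```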
+ +The logic for this lives in `bank_account_dealloc_with_sync`/`bank_rollup_chit_to_task`. + +### Bugs + +As described above, on-behalf-of accounting is only added to the task ledgers when bank accounts/vouchers are deallocated. If the voucher has been open for a long time, this may be a large amount of `cpu_time`/`energy` in one instant. + +Since `coalition_info_resource_usage` does not try to iterate outstanding bank accounts, this means callers may observe very large increments in `{cpu_time,energy}_billed_to_{me,others}`, especially in the presence of voucher leaks. + +On the other hand, the raw `cpu_time`/`energy` values increment pretty much continuously. As a result, the logical consumption `cpu_time + cpu_time_billed_to_me - cpu_time_billed_to_others` may appear to go backwards when `*_others` is incremented by a large amount (and likewise for `energy`). + +In other words, a process/coalition may appear to *bill out* more cpu_time/energy than it actually consumed in a given period: rdar://92275084 (In coalition_resource_usage, energy_billed_to_others > energy, resulting in a negative number for 'billed_energy'). That'd be like going on a business trip and expensing that fancy dinner you had with your friends two months ago... + +Note that the inequality `cpu_time_billed_to_others <= cpu_time` still holds at any given instant (as reported by `coalition_info_resource_usage`), but this is *not* true when looking at deltas between two samples. And likewise, each field increments monotonically, but the overall quantity `cpu_time + cpu_time_billed_to_me - cpu_time_billed_to_others` does not. + +Another 'bug' is that if a task dies, and is then subsequently billed for some work by another task, this information is dropped on the floor. This is NTBF but in an ideal world we would track this on the coalition. + +## Jetsam Coalitions + +Each process is also a member of a jetsam coalition. + +This is designed to encapsulate 'an app and all its subprocesses'. For example, App extensions are spawned into separate *resource* coalitions from their host app, but inherit the host app's *jetsam* coalition. XPC services/App extensions can opt out of this via the `_AbandonCoalition` key in `Info.plist`. + +The primary function of jetsam coalitions is to aggregate memory usage across entire applications (app level footprint). When jetsam needs to reclaim memory, it tries to kill processes associated with the most memory-intensive visible app. + +The scheduler and CLPC also look at jetsam coalitions to determine which processes are P-core eligible. In particular, the thread group of a jetsam coalition led by a P-core capable process will be allowed to use P-cores. This could have been its own coalition type but the rules matched the existing jetsam coalition. + + + +When in Game Mode, jetsam coalitions are used to throttle non-game apps. diff --git a/doc/scheduler/sched_clutch_edge.md b/doc/scheduler/sched_clutch_edge.md index e52121d95..96a6550b5 100644 --- a/doc/scheduler/sched_clutch_edge.md +++ b/doc/scheduler/sched_clutch_edge.md @@ -199,7 +199,7 @@ In order to choose a cluster & processor for a runnable thread, the edge schedul **Edge Scheduler Edge Matrix** -The Edge scheduler maintains a thread migration graph where each node represents a cluster and each directional edge represents the likelihood of migrating threads across that edge. 
Each graph edge encodes the following attributes: +The Edge scheduler maintains a thread migration graph for each QoS level, where each node in the graph represents a cluster and each directed edge represents the likelihood of migrating threads (of the QoS) across that edge. Each graph edge encodes the following attributes: ``` typedef union sched_clutch_edge { @@ -215,7 +215,7 @@ typedef union sched_clutch_edge { ``` The `sce_migration_allowed` & `sce_steal_allowed` flags indicate if threads are allowed to be migrated & stolen across the edge. The `sce_migration_weight` is a measure of the scheduling latency delta that should exist between the source and destination nodes (i.e. clusters) for the thread to be migrated. The per-cluster scheduling latency metric is described in the next section. -The performance controller can dynamically update the weights and properties of the edge matrix dynamically to change the width of the system for performance and efficiency reasons. +The performance controller can update the weights and properties of the edge matrix dynamically to change the width of the system for performance and efficiency reasons. **Edge Scheduler Cluster Scheduling Latency Metric** @@ -282,5 +282,6 @@ This policy distributes the threads so that they spread across all available clu This policy distributes threads so that the threads first fill up all the capacity on the preferred cluster and its homogeneous peers before spilling to different core type. The current implementation defines capacity based on the number of CPUs in the cluster; so a cluster's shared resource is considered full if there are "n" runnable + running shared resource threads on the cluster with n cpus. This policy is different from the default scheduling policy of the edge scheduler since this always tries to fill up the native clusters to capacity even when non-native clusters might be idle. #### Long Running Workload AMP Round Robining +The Edge scheduler implements a policy called "stir-the-pot" to round-robin long-running workload threads across clusters of various types, with the goal of ensuring those threads make roughly equal progress over time. This is essential for the performance of statically partitioned, multi-threaded workloads with NCPUs threads, as otherwise the threads on slower cores would become stragglers and the workload would lose out on a maximum amount of parallelism. +The scheduler implements stir-the-pot at the cadence of the quantum expiration and works by swapping a thread expiring its quantum with a thread on the opposite core type which has already expired quantum there. The swap itself occurs by having the slower core send its thread to the P-core involved in the swap, preempting the P-core thread which then spills down onto the newly available slow core based on the normal Edge migration policy. In order to reduce the chance of picking the same CPUs over and over unfairly for stir-the-pot, the swap selection scheme rotates the offset at which it begins the search for a candidate CPU of the opposite type, leading to a fair distribution on average. -The Edge scheduler implements a policy to round robining long running workload threads across clusters of various types to ensure that all threads of the workload make equal progress aka "stir-the-pot". This is essential for performance of workloads that statically partition work among ncpu threads. 
The scheduler invokes this mechanism when a thread expires a quantum on a non-preferred cluster (most likely due to migration/spilling from the preferred cluster). The scheduler recognizes this (via `AST_QUANTUM` and `AST_REBALANCE` being set) and enqueues it on a cluster native to the preferred cluster. On the next scheduling event for that cluster, the CPU will pickup this thread and spill/migrate the thread previously running onto the non-preferred cluster. In order to make sure all clusters native to the preferred cluster are euqally subject to this round-robining, the scheduler maintains a `scbg_amp_rebalance_last_chosen` value per sched_clutch_bucket_group (which represents all threads of a workload at the same QoS level). diff --git a/doc/vm/memorystatus.md b/doc/vm/memorystatus.md index 92eb11f30..05f4a8f68 100644 --- a/doc/vm/memorystatus.md +++ b/doc/vm/memorystatus.md @@ -35,13 +35,13 @@ The memorystatus code lives on the BSD side of xnu. It's comprised of the follow - `bsd/kern/kern_memorystatus_policy.c` Contains the policy decisions around when to perform which action. - `bsd/kern/kern_memorystatus_freeze.c` - Implementation of the freezer. See `doc/memorystatus/freezer.md` for details. + Implementation of the freezer. See `doc/vm/freezer.md` for details. - `bsd/kern/kern_memorystatus.c` Contains mechanical code to implement the kill and swap actions. Should not contain any policy (that should be in `bsd/kern/kern_memorystatus_policy.c`), but that's a recent refactor so is a bit of a WIP. - `bsd/kern/kern_memorystatus_notify.c` - Contains both the policy and mechanical bits to send out memory pressure notifications. See `doc/memorystatus/notify.md` + Contains both the policy and mechanical bits to send out memory pressure notifications. See `doc/vm/memorystatus_notify.md` And the following headers: - `bsd/kern/kern_memorystatus_internal.h` @@ -55,7 +55,7 @@ And the following headers: The memorystatus subsystem is designed around a central health check. All of the fields in this health check are defined in the `memorystatus_system_health_t` struct. See `bsd/kern/kern_memorystatus_internal.h` for the struct definition. -Most of the monitoring and actions taken by the memorystatus subsystem happen in the `memorystatus_thread` (`bsd/kern/kern_memorystatus.c`). However, there are some synchronous actions that happen on other threads. See `doc/memorystatus/kill.md` for more documentation on specific kill types. +Most of the monitoring and actions taken by the memorystatus subsystem happen in the `memorystatus_thread` (`bsd/kern/kern_memorystatus.c`). However, there are some synchronous actions that happen on other threads. See `doc/vm/memorystatus_kills.md` for more documentation on specific kill types. Whenever it's woken up the memorystatus thread does the following: 1. Fill in the system health state by calling `memorystatus_health_check`) @@ -75,7 +75,7 @@ The memorystatus subsystem has 210 priority levels. Every process in the system Each priority level is tracked as a TAILQ linked list . There is one global array, `memstat_bucket`, containing all of these TAILQ lists. A process's priority is tracked in the proc structure (See `bsd/sys/proc_internal.h`). `p_memstat_effective_priority` stores the proc's current jetsam priority, and `p_memstat_list` stores the TAILQ linkage. All lists are protected by the `proc_list_mlock` (Yes this is bad for scalability. Ideally we'd use finer grain locking or at least not share the global lock with the scheduler. 
See [rdar://36390487](rdar://36390487)) . -Many kill types kill in ascending jetsam priority level. See `doc/memorystatus/kill.md` for more details. +Many kill types kill in ascending jetsam priority level. See `doc/vm/memorystatus_kills.md` for more details. The jetsam band is either asserted by [RunningBoard](https://stashweb.sd.apple.com/projects/COREOS/repos/runningboard/browse) (apps and runningboard managed daemons) or determined by the jetsam priority set in the [JetsamProperties](https://stashweb.sd.apple.com/projects/COREOS/repos/jetsamproperties/browse) database. For reference, here are some of the band numbers: @@ -142,7 +142,7 @@ This section lists the threads that comprise the memorystatus subsystem. More de ### VM\_memorystatus\_1 -This is the jetsam thread. It's responsible for running the system health check and performing most jetsam kills (see `doc/memorystatus/kill.md` for a kill breakdown). +This is the jetsam thread. It's responsible for running the system health check and performing most jetsam kills (see `doc/vm/memorystatus_kills.md` for a kill breakdown). It's woken up via a call to `memorystatus_thread_wake` whenever any subsystem determines we're running low on a monitored resource. The wakeup is blind and the thread will immediately do a health check to determine what's wrong with the system. @@ -150,7 +150,7 @@ NB: There are technically three memorystatus threads: `VM_memorystatus_1`, `VM_m ### VM\_freezer -This is the freezer thread. It's responsible for freezing processes under memory pressure and demoting processes when the freezer is full. See `doc/memorystatus/freeze.md` for more details on the freezer. +This is the freezer thread. It's responsible for freezing processes under memory pressure and demoting processes when the freezer is full. See `doc/vm/freezer.md` for more details on the freezer. It's woken up by issuing a `thread_wakeup` call to the `memorystatus_freeze_wakeup` global. This is done in `memorystatus_pages_update` if `memorystatus_freeze_thread_should_run` returns true. It's also done whenever `memorystatus_on_inactivity` runs. 
diff --git a/doc/vm/memorystatus_kills.md b/doc/vm/memorystatus_kills.md index 2f5f91ef8..72b1f24cb 100644 --- a/doc/vm/memorystatus_kills.md +++ b/doc/vm/memorystatus_kills.md @@ -19,12 +19,14 @@ More information on each kill type is provided below | `JETSAM_REASON_MEMORY_FCTHRASHING` | `MEMORYSTATUS_KILL_TOP_PROCESS` | `memorystatus_thread` | No | | `JETSAM_REASON_MEMORY_PERPROCESSLIMIT` | N/A | thread that went over the process' memory limit | No | | `JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE` | N/A | thread that disabled the freezer | Yes | -| `JETSAM_REASON_MEMORY_IDLE_EXIT` | N/A | `vm_pressure_thread` | No | +| `JETSAM_REASON_MEMORY_IDLE_EXIT` | `MEMORYSTATUS_KILL_IDLE` | `vm_pressure_thread` | No | | `JETSAM_REASON_ZONE_MAP_EXHAUSTION` | `MEMORYSTATUS_KILL_TOP_PROCESS` | `memorystatus_thread` or thread in a zalloc | No | | `JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING` | `MEMORYSTATUS_KILL_TOP_PROCESS` | `memorystatus_thread` | No | -| `JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE` | `MEMORYSTATUS_KILL_TOP_PROCESS` | `memorystatus_thread` or thread in swapin | No | +| `JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE` | `MEMORYSTATUS_KILL_TOP_PROCESS` / `MEMORYSTATUS_NO_PAGING_SPACE` | `memorystatus_thread` | No | | `JETSAM_REASON_LOWSWAP` | `MEMORYSTATUS_KILL_SUSPENDED_SWAPPABLE` or `MEMORYSTATUS_KILL_SWAPPABLE` | `memorystatus_thread` | Yes | +| `JETSAM_REASON_MEMORY_VMPAGEOUT_STARVATION` | `MEMORYSTATUS_KILL_TOP_PROCESS` | `memorystatus_thread` | Yes | | `JETSAM_REASON_MEMORY_SUSTAINED_PRESSURE` | N/A | `vm_pressure_thread` | No | +| `JETSAM_REASON_MEMORY_CONCLAVELIMIT` | N/A | thread that went over the process' memory limit | No | ### JETSAM\_REASON\_MEMORY\_HIGHWATER @@ -75,9 +77,9 @@ See `kill_all_frozen_processes` in `bsd/kern/kern_memorystatus_freeze.c` for the ### JETSAM\_REASON\_MEMORY\_IDLE\_EXIT -These are idle kills. +The process was terminated while idle (i.e. clean or assertion-less). On iOS, this occurs whenever the available page count falls below `kern.memorystatus.available_pages_idle`. -On macOS, when the memory pressure level escalates above normal, the memorystatus notification thread calls `memorystatus_idle_exit_from_VM` to kill 1 idle daemon. Note that daemons must opt in to pressured exit on macOS. +On macOS, when the memory pressure level escalates above normal, the memorystatus notification thread calls `memstat_kill_idle_process()` to kill 1 idle daemon per second, up to a maximum of 100 daemons per normal \-\> pressure transition. These daemons must be opted in to idle-exit and must be clean. Note that apps may also appear in the idle band when they are app-napped, but are not eligible to be killed via this mechanism. ### JETSAM\_REASON\_ZONE\_MAP\_EXHAUSTION @@ -91,9 +93,11 @@ NB: These thresholds are very old and have probably not scaled well with current ### JETSAM\_REASON\_MEMORY\_VMCOMPRESSOR\_SPACE\_SHORTAGE -The compressor is at or near either the segment or compressed pages limit. See `vm_compressor_low_on_space` in `osfmk/vm/vm_compressor.c`. The `memorystatus_thread` will kill in ascending jetsam priority order until the space shortage is relieved. +The compressor is at or near either the segment or compressed pages limit. + +On iOS, the `memorystatus_thread` will kill in ascending jetsam priority order until the space shortage is relieved. 
If the compressor hits one of these limits while swapping in a segment, it will perform these kills synchronously on the thread doing the swapin. This can happen on app swap or freezer enabled systems. + +On macOS, the `memorystatus_thread` will instead perform a "no-paging-space" action, which entails either killing the largest process if it has over half of the compressed pages on the system, or invoking a process's voluntary `pcontrol` action (one of: kill, suspend, or throttle). The no-paging-space action is only performed on one process every 5 seconds. If all `pcontrol` actions have been completed and the system is still out of paging space, a notification is sent to put up the "Out of Application Memory" dialog, asking the user to Force Quit an application. ### JETSAM\_REASON\_LOWSWAP @@ -107,6 +111,10 @@ The memorystatus notification thread schedules a thread call to perform these ki Many system services (especially dasd) check the pressure level before doing work, so it's not good for the system to be at the warning level indefinitely. +### JETSAM\_REASON\_MEMORY\_CONCLAVELIMIT + +This behaves similarly to `JETSAM_REASON_MEMORY_PERPROCESSLIMIT`, except that it occurs when the process's corresponding conclave crosses its memory limit. The exclave memory use is reported in the `conclave_mem` ledger, and is not added to the process footprint. This is not currently implemented. + ## Picking an action diff --git a/doc/vm/memorystatus_notify.md b/doc/vm/memorystatus_notify.md index 46efdddf1..6af08142b 100644 --- a/doc/vm/memorystatus_notify.md +++ b/doc/vm/memorystatus_notify.md @@ -18,7 +18,7 @@ Processes may subscribe to notifications regarding memory limits. | Type | Knote Flags | Dispatch Source Mask | Description | | -------- | --------------------------------------- | --------------------------------------------- | ----------------------------------------------------------------------------------- | -| WARN | `NOTE_MEMORYSTATUS_PROC_LIMIT_WARN` | `DISPATCH_MEMORYPRESSURE_PROC_LIMIT_WARN` | Process is within 100 MB of its memory limit. | +| WARN | `NOTE_MEMORYSTATUS_PROC_LIMIT_WARN` | `DISPATCH_MEMORYPRESSURE_PROC_LIMIT_WARN` | Process has reached 80% of its memory limit. | | CRITICAL | `NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL` | `DISPATCH_MEMORYPRESSURE_PROC_LIMIT_CRITICAL` | Process has violated memory limit. Only sent if the memory limit is non-fatal/soft. | ## Memory Pressure diff --git a/doc/vm/pageout_scan.md b/doc/vm/pageout_scan.md new file mode 100644 index 000000000..66578b51c --- /dev/null +++ b/doc/vm/pageout_scan.md @@ -0,0 +1,231 @@ +# Pageout Scan + +The design of Mach VM's paging algorithm (implemented in `vm_pageout_scan()`). + +## Start/Stop Conditions + +When a thread needs a free page it calls `vm_page_grab[_options]()`. If the +system is running low on free pages for use (i.e. +`vm_page_free_count < vm_page_free_reserved`), the faulting thread will block in +`vm_page_wait()`. A subset of privileged (`TH_OPT_VMPRIV`) VM threads may +continue grabbing "reserved" pages without blocking. + +Whenever a page is grabbed and the free page count is nearing its floor +(`vm_page_free_count < vm_page_free_min`), a wakeup is immediately issued to +the pageout thread (`VM_pageout_scan`). `VM_pageout_scan` is +responsible for freeing clean pages and choosing dirty pages to evict so that +incoming page demand can be satisfied. + +The pageout thread will continue scanning for pages to evict until all of the +following conditions are met: +1. 
The free page count has reached its target (`vm_page_free_count >= +vm_page_free_target`)\* +2. there are no privileged threads waiting for pages (indicated by +`vm_page_free_wanted_privileged`) +3. there are no unprivileged threads waiting for pages (indicated by +`vm_page_free_wanted`) + +\*Invariant: `vm_page_free_target > vm_page_free_min > vm_page_free_reserved` + +## A Note on Complexity +The state machine is complex and can be difficult to predict. This document +serves as a high-level overview of the algorithm. Even seemingly minor +changes to tuning can result in drastic behavioral differences when +the system is pushed to the extreme. + +## Contribution Guidelines (Internal) + +1. The `return_from_scan` label is the only spot where `vm_pageout_scan()` +will stop. A single exit path makes for readability and understandability. Try +to keep it that way. +2. Try to reduce the use of backwards `goto`s. Great care has been taken to +remove these patterns. Don't regress readability! A to-be-completed +[refactor](https://stashweb.sd.apple.com/projects/COREOS/repos/xnu/pull-requests/21219/overview) +removes the remaining backwards `goto`s. +3. Be wary of 2nd order effects. For example: + - How might a bias towards paging anonymous memory affect jetsam? Too many + file backed pages may preclude jetsam from running and leave the system + unresponsive because of constant pageout/compressor activity + - How will varying compression ratios change the effectiveness of the + pageout algorithm? A bias towards anonymous pages may result in quicker + exhaustion of the compressor pool and increased memory pressure from the + resident compressed pages. + +It is critical that the pageout thread not block except as dictated by its +state machine (e.g. to yield VM locks, to wait until the free page pool is +depleted). Be very wary of introducing any new synchronization dependencies +outside of the VM. + +## The Pageout Algorithm +This section documents xnu's page eviction algorithm (`VM_pageout_scan`). It is broken into 5 "phases." + +### Phase 1 - Initialization & Rapid Reclamation +* Initialize the relevant page targets that will guide the algorithm +(`vps_init_page_targets()`). This determines how much anonymous memory and +speculative memory to keep around. Look at the refactor #2 for a more cohesive +collection of all the target page threshold calculations. +* Initialize the Flow Control machine to its default state (`FCS_IDLE`). +* Reclaim "cheap" memory from any other subsystems. These must be fast and non-blocking. + - `pmap_release_pages_fast()` + +**Note**: Phase 2 - 5 comprise the "FOR" loop in PageoutScan. The PageQ lock +(`vm_page_queue_lock`) is held for most of this loop. + +### Phase 2 +Check to see if we need to drop the PageQ lock: +- We have been holding for quite some time. The compressor/compactor + may need it. +- Drop the lock, free any pages we might have accumulated (usually + after a few iterations through the loop) +- Wake up the compactor and try to retake the lock. If the compactor + needed it, it would have grabbed it and we might block. +- We need a vm-object lock but another thread is holding it. That thread + may also need the PageQ lock. +- Drop the PageQ lock for 10us and try again. +- Another thread (usually the NVMe driver) is waiting for the PageQ lock so + it can free some pages back to the VM. Yield the PageQ lock and see if that + helps. + +General Page Q management: +1. 
Check for overflow secluded pages (secluded count > secluded target) to push + to the active queue. +2. Deactivate a single page. This deactivated page should "balance" the reactivated + or reclaimed page that we remove from one of the inactive/anonymous queues below. +3. Are we done? (`return_from_scan`)? +4. Check for: + - "ripe" purgeable vm-object. + - a speculative queue to age + - a vm-object in the object cache to evict +5. If we found any actions to take in step 4, repeat Phase 2. Else, continue + to Phase 3. + +### Phase 3 +The following page queues are eligible to be reclaimed from: +- Inactive Queue: deactivated file-backed pages +- Speculative Queue: file-backed pages which have never been activated. These + are generally generated by read-ahead. +- Anonymous Queue: deactivated anonymous pages +- Cleaned Queue: File backed pages that have been "cleaned" by writing their + contents back to disk and are now reclaimable. This queue is no longer used. + +1. Update the file cache targets. (TODO: how?) +2. Check the Flow Control state machine to evaluate if we should block to + allow the rest of the system to make forward progress. + - If the queues of interest are all empty, block for 50ms. There is nothing + `pageout_scan` can do, but the other VM threads may be able to make progress. + - If we have evaluated a significant number of pages without making *any* + progress (reactivations or frees), block for 1ms. + - If the compressor queues are full ("throttled"): + - `FCS_IDLE`: There are plenty of file-backed pages, bias the loop towards reclaiming these + - `FCS_DELAYED`: If the deadlock-detection period has elapsed then wakeup + the garbage collector, increase the reclamation target by 100, and + change state to `FCS_DEADLOCK_DETECTED`. Else, block. + - `FCS_DEADLOCK_DETECTED`: If the reclamation target is met, change state + back to `FCS_DELAYED`. Else, restart from Phase 2. + +### Phase 4 +We must now choose a "victim" page to attempt to reclaim. If a candidate page +has been referenced since deactivation, it will be reactivated (barring +certain "force-reclaim" conditions). + +1. Look for clean or speculative pages (unless we specifically want an + anonymous one). +2. On non-app-swap systems (macOS), look for a "self-donated" page. +3. Look for a background page. On Intel systems, we heavily bias towards + background pages during dark-wake mode to ensure background tasks (e.g. + Software Update) do not disrupt the user's normal working set. +4. Look for 2 anonymous pages for every 1 file-backed page.\* This ratio comes + from the days of spinning disks and software compression, where re-faulting a + file-backed page was roughly twice as costly as an anonymous one. +5. If steps 1-4 could not find an unreferenced page, restart from Phase 2. + +\* Certain extreme conditions may cause the 2:1 ratio to be ignored: + - The file-cache has fallen below its minimum size -> choose anonymous + - The number of inactive file-backed pages is less than 50% of all + file-backed pages -> choose anonymous + - The free page count is dangerously low (compression may require free pages + to compress into) -> choose file-backed + +### Phase 5 +We have found a victim page, and will now attempt to reclaim it. "Freed" pages +are placed on a thread-local free queue to be freed to the global free queue +in batches during Phase 2. + +1. Pull the page off of its current queue. +2. *Try* to take the vm-object lock corresponding to the victim page. 
Note + that this is an inversion of the typical lock ordering (vm-object -> + page-queues). As such, `pageout_scan` cannot block if the lock is currently + held by another thread. If it cannot take the vm-object lock, then identify + another potential victim page via Phase 4 and tell the system that a + "privileged" thread wants its vm-object lock (precluding other threads + from taking the lock until the privileged thread has had an opportunity + to take it), drop the PageQ lock, pause for 10µs, and restart from Phase 2. +3. Evaluate the page's current state: + - `busy`: this page is being transiently operated on by another thread, + place it back on its queue and restart from Phase 2. + - `free_when_done`/`cleaning`: this page is about to be freed by another + thread. Skip it and restart from Phase 2. + - `error`/`absent`/`pager==NULL`/`object==NULL`: this page can be freed + without any cleaning. Free the page. + - `purgeable(empty)`: object has already been purged, free the page. + - `purgeable(volatile)`: We'll purge this object wholesale once it is ripe, + so compressing it now isn't worth the work. Skip this page and restart + from Phase 2. +4. Check (with the pmap) if the page has been modified or referenced. +5. If the page has been referenced since we identified it as a victim, consider + reactivating it. If we have consecutively re-activated a sufficient number + of pages, then reclaim the page anyway to ensure forward progress is made.\* + On embedded systems, a sufficient number of these forced reclamations will + trigger jetsams. Pages which were first faulted by real-time threads are + exempted from these forced reclamations to prevent audio glitches. +6. Disconnect the page from all page-table and virtual mappings. If it is + anonymous, leave a breadcrumb in the page table entry for memory accounting + purposes. +7. If the page is clean, free it. +8. Otherwise, the page is dirty and needs to be "cleaned" before it can be reclaimed. + Place it on the relevant pageout queue (i.e. compressor for anonymous and external + for file-backed) and wake up the relevant VM thread. +9. Restart from Phase 2. + +\* This can happen when the working set turns over rapidly or the system is +seriously overcommitted. In such cases, we can't rely on the LRU approximation +to identify "good" victims and need to reclaim whatever we can find. + +## Historical Experiments + +### Latency-based Jetsam +By placing a "fake" page in the active page queue with an associated timestamp, +we can track the rate of paging by measuring how long it takes for the page to +be identified as a victim by `pageout_scan`. A rapid paging rate indicates +that the system cannot keep up with memory demand via paging alone. In such +cases, jetsams would be invoked directly by `pageout_scan` to free larger +amounts of memory and reduce demand. + +Experiments with this implementation highlighted that many iterations of +`pageout_scan` are required before the latency-detection mechanism will +trigger. The delay imposed by these LPF-characteristics was often larger than +the existing page-shortage mechanism and regressed use cases like Camera launch. +Further, performing kills directly on the pageout thread added significant +latency. + +Re-introducing the paging-rate measurement without the jetsam-trigger may be +worthwhile for diagnosing system health. 
+ +### Dynamic Scheduling Priority +In theory, a misbehaving low-priority thread can generate lots of page demand, +invoking `pageout_scan` to run at a very high priority (91). Thus, the low-priority +thread can effectively preempt higher-priority user threads and starve them of +the core(s) used by the VM thread(s). This can be mitigated by using +propagating the priority of threads waiting on free pages to `pageout_scan`, +allowing `pageout_scan` to only run at a priority as high as its highest waiter. + +This approach was enabled on low core-count devices (i.e. watches) for 1-2 +years. However, it eventually appeared to contribute to audio glitches and had +to be disabled. + +In general, *any* page-wait (even short ones) can be catastrophic for latency +sensitive/real-time threads, especially if those threads will also have to +wait for an I/O to complete after the page-wait. By slowing the preemptive +paging done without any waiters (at `pageout_scan`'s now low base priority), +the likelihood of page-waits increases. + diff --git a/iokit/DriverKit/IOService.iig b/iokit/DriverKit/IOService.iig index 92bcce5cb..418087128 100644 --- a/iokit/DriverKit/IOService.iig +++ b/iokit/DriverKit/IOService.iig @@ -57,10 +57,13 @@ enum { kIOServicePowerCapabilityOff = 0x00000000, kIOServicePowerCapabilityOn = 0x00000002, kIOServicePowerCapabilityLow = 0x00010000, + kIOServicePowerCapabilityLPW = 0x00020000, }; enum { - _kIOPMWakeEventSource = 0x00000001, + _kIOPMWakeEventSource = 0x00000001, + _kIOPMWakeEventFullWake = 0x00000002, + _kIOPMWakeEventPossibleFullWake = 0x00000004, }; // values for OSNumber kIOSystemStateHaltDescriptionKey:kIOSystemStateHaltDescriptionHaltStateKey @@ -69,6 +72,20 @@ enum { kIOServiceHaltStateRestart = 0x00000002, }; +// Bitfields for CreatePMAssertion +enum { + /*! kIOServicePMAssertionCPUBit + * When set, PM kernel will prefer to leave the CPU and core hardware + * running in "Dark Wake" state, instead of sleeping. + */ + kIOServicePMAssertionCPUBit = 0x001, + + /*! kIOServicePMAssertionForceFullWakeupBit + * When set, the system will immediately do a full wakeup after going to sleep. + */ + kIOServicePMAssertionForceFullWakeupBit = 0x800, +}; + /*! * @class IOService * @@ -491,6 +508,7 @@ public: const char * StringFromReturn( IOReturn retval) LOCALONLY; + #endif /* PRIVATE_WIFI_ONLY */ /*! @function RemoveProperty @@ -522,12 +540,11 @@ public: * @function StateNotificationItemCreate * @abstract Create a state notification item. * @param itemName name of the item. - * @param schema dictionary describing behaviors for the item. Keys are defined in - * IOKitKeys.h kIOStateNotification* + * @param value initial value of the item. Can be set to NULL. * @return kIOReturnSuccess on success. See IOReturn.h for error codes. */ virtual kern_return_t - StateNotificationItemCreate(OSString * itemName, OSDictionary * schema); + StateNotificationItemCreate(OSString * itemName, OSDictionary * value); /*! * @function StateNotificationItemSet @@ -549,6 +566,31 @@ public: virtual kern_return_t StateNotificationItemCopy(OSString * itemName, OSDictionary ** value); + /*! + * @function CreatePMAssertion + * @abstract Create a power management assertion. + * @param assertionBits Bit masks including all the flavors that require to be asserted. + * @param assertionID pointer that will contain the unique identifier of the created + * power assertion. + * @param synced indicates if the assertion must prevent an imminent sleep transition. 
+ * When set to true, and if a system sleep is irreversible, the call will return + * kIOReturnBusy, in which case the assertion is not created. Only + * kIOServicePMAssertionCPUBit is valid for assertionBits if sleepSafe is set to + * true. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + CreatePMAssertion(uint32_t assertionBits, uint64_t * assertionID, bool synced); + + /*! + * @function ReleasePMAssertion + * @abstract Release a previously created power management assertion. + * @param assertionID the assertion ID returned by CreatePMAssertion. + * @return kIOReturnSuccess on success. See IOReturn.h for error codes. + */ + virtual kern_return_t + ReleasePMAssertion(uint64_t assertionID); + private: virtual void Stop_async( diff --git a/iokit/DriverKit/queue_implementation.h b/iokit/DriverKit/queue_implementation.h index 59ff8f4b5..37119b70c 100644 --- a/iokit/DriverKit/queue_implementation.h +++ b/iokit/DriverKit/queue_implementation.h @@ -539,7 +539,7 @@ re_queue_tail(queue_t que, queue_entry_t elt) &((elt)->field) != (head); \ elt = _nelt, _nelt = qe_element((elt)->field.next, typeof(*(elt)), field)) \ -#ifdef XNU_KERNEL_PRIVATE +#if (defined(XNU_KERNEL_PRIVATE) || SCHED_TEST_HARNESS) /* Dequeue an element from head, or return NULL if the queue is empty */ #define qe_dequeue_head(head, type, field) ({ \ @@ -595,7 +595,7 @@ re_queue_tail(queue_t que, queue_entry_t elt) _tmp_element; \ }) -#endif /* XNU_KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE || SCHED_TEST_HARNESS */ /* * Macro: QUEUE_HEAD_INITIALIZER() diff --git a/iokit/Exclaves/Exclaves.cpp b/iokit/Exclaves/Exclaves.cpp index 9c600625c..9e3ff3614 100644 --- a/iokit/Exclaves/Exclaves.cpp +++ b/iokit/Exclaves/Exclaves.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include "../Kernel/IOServicePrivate.h" #include @@ -659,6 +660,69 @@ IOExclaveANEUpcallHandler(uint64_t id, struct IOExclaveANEUpcallArgs *args, bool return ret; } +IOReturn +IOExclaveLPWUpcallHandler(struct IOExclaveLPWUpcallArgs *args) +{ + IOReturn ret; + + if (!args) { + return kIOReturnBadArgument; + } + + IOPMrootDomain *rootDomain = IOService::getPMRootDomain(); + if (!rootDomain) { + return kIOReturnInternalError; + } + + switch (args->type) { + case kIOExclaveLPWUpcallTypeCreateAssertion: + return IOExclaveLPWCreateAssertion(&args->data.createassertion.id_out, "Exclave LPW assertion"); + case kIOExclaveLPWUpcallTypeReleaseAssertion: + return IOExclaveLPWReleaseAssertion(args->data.releaseassertion.id); + case kIOExclaveLPWUpcallTypeRequestRunMode: + ret = rootDomain->requestRunMode(args->data.requestrunmode.runmode_mask); + break; + default: + return kIOReturnBadArgument; + } + + return ret; +} + +IOReturn +IOExclaveLPWCreateAssertion(uint64_t *id_out, const char *desc) +{ + IOPMDriverAssertionID assertionID; + + IOPMrootDomain *rootDomain = IOService::getPMRootDomain(); + if (!rootDomain) { + return kIOReturnInternalError; + } + + assertionID = rootDomain->createPMAssertion( + kIOPMDriverAssertionCPUBit | kIOPMDriverAssertionForceWakeupBit, + kIOPMDriverAssertionLevelOn, + rootDomain, + desc); + if (assertionID == 0) { + return kIOReturnInternalError; + } + + *id_out = (uint64_t) assertionID; + return kIOReturnSuccess; +} + +IOReturn +IOExclaveLPWReleaseAssertion(uint64_t id) +{ + IOPMrootDomain *rootDomain = IOService::getPMRootDomain(); + if (!rootDomain) { + return kIOReturnInternalError; + } + + return rootDomain->releasePMAssertion(id); +} + /* IOService exclave methods */ #endif /* 
CONFIG_EXCLAVES */ diff --git a/iokit/Exclaves/Exclaves.h b/iokit/Exclaves/Exclaves.h index 96c92770e..76f86e941 100644 --- a/iokit/Exclaves/Exclaves.h +++ b/iokit/Exclaves/Exclaves.h @@ -36,6 +36,8 @@ #include #include +#include + #ifdef __cplusplus #include @@ -134,6 +136,27 @@ struct IOExclaveANEUpcallArgs { }; }; +enum IOExclaveLPWUpcallType { + kIOExclaveLPWUpcallTypeCreateAssertion, + kIOExclaveLPWUpcallTypeReleaseAssertion, + kIOExclaveLPWUpcallTypeRequestRunMode, +}; + +struct IOExclaveLPWUpcallArgs { + enum IOExclaveLPWUpcallType type; + union { + struct { + uint64_t id_out; + } createassertion; + struct { + uint64_t id; + } releaseassertion; + struct { + uint64_t runmode_mask; + } requestrunmode; + } data; +}; + /* * Exclave upcall handlers * @@ -145,6 +168,10 @@ bool IOExclaveLockWorkloop(uint64_t id, bool lock); bool IOExclaveAsyncNotificationUpcallHandler(uint64_t id, struct IOExclaveAsyncNotificationUpcallArgs *args); bool IOExclaveMapperOperationUpcallHandler(uint64_t id, struct IOExclaveMapperOperationUpcallArgs *args); bool IOExclaveANEUpcallHandler(uint64_t id, struct IOExclaveANEUpcallArgs *args, bool *result); +IOReturn IOExclaveLPWUpcallHandler(struct IOExclaveLPWUpcallArgs *args); + +IOReturn IOExclaveLPWCreateAssertion(uint64_t *id_out, const char *desc); +IOReturn IOExclaveLPWReleaseAssertion(uint64_t id); /* Test support */ diff --git a/iokit/IOKit/IOBSD.h b/iokit/IOKit/IOBSD.h index cc5a1d1d7..543f394ab 100644 --- a/iokit/IOKit/IOBSD.h +++ b/iokit/IOKit/IOBSD.h @@ -66,6 +66,12 @@ extern void IOBSDLowSpaceUnlinkKernelCore(void); */ extern boolean_t IOCurrentTaskHasEntitlement(const char * entitlement); extern boolean_t IOTaskHasEntitlement(task_t task, const char *entitlement); + +/* + * IOTaskHasEntitlementAsBooleanOrObject returns true if the entitlement is true boolean, or + * a non-NULL object. + */ +extern boolean_t IOTaskHasEntitlementAsBooleanOrObject(task_t task, const char *entitlement); extern boolean_t IOVnodeHasEntitlement(struct vnode *vnode, int64_t off, const char *entitlement); extern boolean_t IOVnodeGetBooleanEntitlement( struct vnode *vnode, @@ -79,6 +85,8 @@ extern char * IOTaskGetEntitlement(task_t task, const char * entitlement); */ extern char *IOVnodeGetEntitlement(struct vnode *vnode, int64_t offset, const char *entitlement); +extern boolean_t IOTaskGetIntegerEntitlement(task_t task, const char *entitlement, uint64_t *value); + /* * Tests that the entitlement is present and has matching value */ @@ -98,6 +106,13 @@ extern struct IOPolledFileIOVars * gIOPolledCoreFileVars; extern kern_return_t gIOPolledCoreFileOpenRet; extern IOPolledCoreFileMode_t gIOPolledCoreFileMode; +extern bool IOPMIsAOTMode(void); +extern bool IOPMIsLPWMode(void); +enum { + kIOPMNetworkStackFullWakeFlag = 0x000000001, +}; +extern void IOPMNetworkStackFullWake(uint64_t flags, const char * reason); + #ifdef __cplusplus } #endif diff --git a/iokit/IOKit/IOBufferMemoryDescriptor.h b/iokit/IOKit/IOBufferMemoryDescriptor.h index c67e27e34..25eb1ae35 100644 --- a/iokit/IOKit/IOBufferMemoryDescriptor.h +++ b/iokit/IOKit/IOBufferMemoryDescriptor.h @@ -123,7 +123,7 @@ public: #ifdef KERNEL_PRIVATE /* * Create an IOBufferMemoryDescriptor with guard pages on each side of the buffer allocation. - * @param inTask The task the buffer will be allocated in. + * @param inTask The task the buffer will be allocated in. Pass NULL to allocate unmapped memory. * @param options Options for the IOBufferMemoryDescriptor. See inTaskWithOptions for a description of available options. 
* Some options are not available when using guard pages. Specifically, physically contiguous memory and pageable memory * options are not supported. If these options are used, this will fail to create the memory descriptor and return NULL. @@ -196,7 +196,7 @@ public: /*! @function inTaskWithOptions * @abstract Creates a memory buffer with memory descriptor for that buffer. * @discussion Added in Mac OS X 10.2, this method allocates a memory buffer with a given size and alignment in the task's address space specified, and returns a memory descriptor instance representing the memory. It is recommended that memory allocated for I/O or sharing via mapping be created via IOBufferMemoryDescriptor. Options passed with the request specify the kind of memory to be allocated - pageablity and sharing are specified with option bits. This function may block and so should not be called from interrupt level or while a simple lock is held. - * @param inTask The task the buffer will be allocated in. + * @param inTask The task the buffer will be allocated in. Pass NULL to allocate unmapped memory. * @param options Options for the allocation:
* kIODirectionOut, kIODirectionIn - set the direction of the I/O transfer.
* kIOMemoryPhysicallyContiguous - pass to request memory be physically contiguous. This option is heavily discouraged. The request may fail if memory is fragmented, may cause large amounts of paging activity, and may take a very long time to execute.
@@ -220,7 +220,7 @@ public: /*! @function inTaskWithOptions * @abstract Creates a memory buffer with memory descriptor for that buffer. * @discussion Added in Mac OS X 10.2, this method allocates a memory buffer with a given size and alignment in the task's address space specified, and returns a memory descriptor instance representing the memory. It is recommended that memory allocated for I/O or sharing via mapping be created via IOBufferMemoryDescriptor. Options passed with the request specify the kind of memory to be allocated - pageablity and sharing are specified with option bits. This function may block and so should not be called from interrupt level or while a simple lock is held. - * @param inTask The task the buffer will be allocated in. + * @param inTask The task the buffer will be allocated in. Pass NULL to allocate unmapped memory. * @param options Options for the allocation:
* kIODirectionOut, kIODirectionIn - set the direction of the I/O transfer.
* kIOMemoryPhysicallyContiguous - pass to request memory be physically contiguous. This option is heavily discouraged. The request may fail if memory is fragmented, may cause large amounts of paging activity, and may take a very long time to execute.
diff --git a/iokit/IOKit/IOCircularDataQueue.h b/iokit/IOKit/IOCircularDataQueue.h new file mode 100644 index 000000000..51e0a48cd --- /dev/null +++ b/iokit/IOKit/IOCircularDataQueue.h @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef IOGCCircularDataQueue_h +#define IOGCCircularDataQueue_h + +#if KERNEL +#include +#else +#include +#endif + +__BEGIN_DECLS + +struct IOCircularDataQueue; +typedef struct IOCircularDataQueue IOCircularDataQueue; + +// IOCircularDataQueueCreate* options +typedef enum __OS_ENUM_ATTR IOCircularDataQueueCreateOptions { + kIOCircularDataQueueCreateConsumer = 0x00000001, + kIOCircularDataQueueCreateProducer = 0x00000002 +} IOCircularDataQueueCreateOptions; + +#if KERNEL + +/*! + * @function IOCircularDataQueueCreateWithEntries + * @abstract Function that creates a new IOCircularDataQueue instance with the specified number of entries of the given + * size. + * @discussion This method will create a new IOCircularDataQueue instance with enough capacity for numEntries of + * entrySize each. It does account for the IOCircularDataQueueEntryHeader overhead for each entry. Note that the + * numEntries and entrySize are simply used to determine the data region size. They do not actually restrict the number + * of enqueues to the queue since its a circular buffer and will eventually overwrite old data. At any time, the + * allocated data region can hold a maximum of numEntries queue entries.
This method allocates a new + * IOCircularDataQueue instance with the given numEntries and entrySize parameters. + * @param options IOCircularDataQueueCreateOptions. + * @param numEntries Number of entries to allocate space for. + * @param entrySize Size of each entry. + * @param pQueue Pointer to a queue handle. On return, this holds a handle to the newly allocated queue. + * @return + * - `kIOReturnSuccess` if the queue was succesfully intialized. + * - `kIOReturnBadArgument` if the parameters passed were invalid. + * - `kIOReturnNoMemory` if there was a memory allocation failure. + */ +IOReturn IOCircularDataQueueCreateWithEntries(IOCircularDataQueueCreateOptions options, uint32_t numEntries, uint32_t entrySize, IOCircularDataQueue **pQueue); + +/*! + * @function IOCircularDataQueueCopyMemoryDescriptor + * @abstract Returns a reference to the IOMemoryDescriptor for the queue's memory. + * @discussion Returns a reference to the IOMemoryDescriptor for the queue's memory. + * @param queue Queue handle. On return, this holds a handle to the newly allocated queue. + * @return IOMemoryDescriptor reference + */ +IOMemoryDescriptor * IOCircularDataQueueCopyMemoryDescriptor(IOCircularDataQueue *queue); + +#else /* KERNEL */ + +/*! + * @function IOCircularDataQueueCreateWithConnection + * @abstract Function that creates a new IOCircularDataQueue instance with an open IOUserClient connection. + * @discussion This method will create a new IOCircularDataQueue instance with a queue owned by the IOUserClient + * instance passed in. The memory and queue attributes are created from the IOMemoryDescriptor returned by the IOUC + * returned by clientMemoryForType() with the memoryType parameter passed to this function. This memory descriptor must + * be one returned by the kernel api IOCircularDataQueueCopyMemoryDescriptor(). + * @param options IOCircularDataQueueCreateOptions. + * @param connect An open IOUserClient connection created by the caller with IOServiceOpen(). The connection must be + * valid while the queue is in use. + * @param memoryType memoryType argument that will passed to the IOUC clientMemoryForType() function to obtain the queue memory. + * @param pQueue Pointer to a queue handle. On return, this holds a handle to the newly allocated queue. + * @return + * - `kIOReturnSuccess` if the queue was succesfully intialized. + * - `kIOReturnBadArgument` if the parameters passed were invalid. + * - `kIOReturnNoMemory` if there was a memory allocation failure. + */ +IOReturn IOCircularDataQueueCreateWithConnection(IOCircularDataQueueCreateOptions options, io_connect_t connect, uint32_t memoryType, IOCircularDataQueue **pQueue); + +#endif /* !KERNEL */ + +/*! + * @function IOCircularDataQueueDestroy + * @abstract Function that destroys a previously created IOCircularDataQueue instance (created with + * IOCircularDataQueueCreateWithEntries). + * @param pQueue Pointer to the queue handle. + * @return + * - `kIOReturnSuccess` if the queue was succesfully destroyed. + * - `kIOReturnBadArgument` if an invalid queue was provided. + */ +IOReturn IOCircularDataQueueDestroy(IOCircularDataQueue **pQueue); + + +/*! + * @function IOCircularDataQueueEnqueue + * @abstract Enqueues a new entry on the queue. + * @discussion This method adds a new data entry of dataSize to the queue. It sets the size parameter of the entry + * pointed to by the write index and copies the memory pointed to by the data parameter in place in the queue. Once + * that is done, it moves the write index to the next index. 
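+ *
+ * For example, a minimal producer-side sketch (kernel side; error handling is omitted and the
+ * payload struct is a placeholder, not part of this header):
+ *
+ *     struct sample { uint64_t timestamp; uint32_t value; };            // placeholder payload
+ *     IOCircularDataQueue *q = NULL;
+ *     struct sample s = { mach_absolute_time(), 42 };
+ *
+ *     if (IOCircularDataQueueCreateWithEntries(kIOCircularDataQueueCreateProducer,
+ *             16, sizeof(struct sample), &q) == kIOReturnSuccess) {
+ *         (void) IOCircularDataQueueEnqueue(q, &s, sizeof(s));
+ *         // publish IOCircularDataQueueCopyMemoryDescriptor(q) to consumers, e.g. via an IOUserClient
+ *     }
+ *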
+ * @param queue Handle to the queue.
+ * @param data Pointer to the data to be added to the queue.
+ * @param dataSize Size of the data pointed to by data.
+ * @return
+ * - `kIOReturnSuccess` on success.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - `kIOReturnBadArgument` if an invalid queue was provided.
+ * - `kIOReturnBusy` if another thread is enqueueing concurrently.
+ * - `kIOReturnUnsupported` if the queue has not been configured to support fixed size entries. Variable size is
+ * currently not supported.
+ * - Other values indicate an error.
+ */
+IOReturn IOCircularDataQueueEnqueue(IOCircularDataQueue *queue, const void *data, size_t dataSize);
+
+/*!
+ * @function IOCircularDataQueueGetLatest
+ * Access the latest entry data and update the cursor position to the latest. No copy is made of the data.
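+ *
+ * A minimal consumer-side sketch (the returned pointer refers to shared queue memory, so the read
+ * is validated afterwards; 'consume' is a placeholder for caller logic):
+ *
+ *     void *data = NULL;
+ *     size_t size = 0;
+ *
+ *     if (IOCircularDataQueueGetLatest(queue, &data, &size) == kIOReturnSuccess) {
+ *         consume(data, size);                                          // placeholder
+ *         if (IOCircularDataQueueIsCurrentDataValid(queue) != kIOReturnSuccess) {
+ *             // the entry may have been overwritten while it was read; discard the result
+ *         }
+ *     }
+ *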
+ * Caller is supposed to call IOCircularDataQueueIsCurrentDataValid() to check data integrity after reading the data is
+ * complete.
+ * @param queue Handle to the queue.
+ * @param data A pointer to the data memory region for the latest entry data in the queue.
+ * @param size A pointer to the size of the data parameter. On return, this contains the actual size of the data
+ * pointed to by data param.
+ * @return
+ * - `kIOReturnSuccess` if the cursor position was updated.
+ * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - `kIOReturnBadArgument` if an invalid queue was provided.
+ * - `kIOReturnTimeout` if the reader timed out when trying to read. This is possible if the writer overwrites the
+ * latest index a reader is about to read. The function times out if the read is unsuccessful after multiple retries.
+ * - Other values indicate an error.
+ */
+IOReturn IOCircularDataQueueGetLatest(IOCircularDataQueue *queue, void **data, size_t *size);
+
+/*!
+ * @function IOCircularDataQueueCopyLatest
+ * Access the latest entry data and copy it into the provided buffer. Also update the cursor position to the latest. On a
+ * successful return, the function guarantees that the latest data was successfully copied. In this case there is no
+ * need to call IOCircularDataQueueIsCurrentDataValid() after reading the data is complete, since the function returns a
+ * copy which cannot be overwritten by the writer.
+ * @param queue Handle to the queue.
+ * @param data Pointer to memory into which the latest data from the queue is copied. Lifetime of this memory is
+ * controlled by the caller.
+ * @param size Size of the data buffer provided for copying. On return, this contains the actual size of the data
+ * pointed to by data param.
+ * @return
+ * - `kIOReturnSuccess` if the cursor position was updated to latest and the data was successfully copied.
+ * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue.
+ * - `kIOReturnBadArgument` if the buffer provided to copy the data is NULL or if an invalid queue was provided.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - `kIOReturnTimeout` if the reader timed out when trying to copy the latest data. This is possible if the writer
+ * overwrites the latest index a reader is about to copy. The function times out if the copy is unsuccessful after
+ * multiple retries.
+ * - Other values indicate an error.
+ *
+ */
+IOReturn IOCircularDataQueueCopyLatest(IOCircularDataQueue *queue, void *data, size_t *size);
+
+/*!
+ * @function IOCircularDataQueueGetNext
+ * Access the data at the next cursor position and updates the cursor position to the next. No copy is made of the data.
+ *
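+ * A typical catch-up loop (minimal sketch; assumes the cursor was already positioned, e.g. with
+ * IOCircularDataQueueSetCursorLatest() or a prior IOCircularDataQueueGetLatest(); 'consume' is a
+ * placeholder for caller logic):
+ *
+ *     void *data = NULL;
+ *     size_t size = 0;
+ *     IOReturn kr;
+ *
+ *     while ((kr = IOCircularDataQueueGetNext(queue, &data, &size)) == kIOReturnSuccess) {
+ *         consume(data, size);                                          // placeholder
+ *     }
+ *     if (kr == kIOReturnOverrun) {
+ *         // the reader fell behind the writer; resynchronize with IOCircularDataQueueGetLatest()
+ *     } else if (kr == kIOReturnUnderrun) {
+ *         // caught up with the latest entry; nothing newer to read yet
+ *     }
+ *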
Caller is supposed to call IOCircularDataQueueIsCurrentDataValid() to check data integrity after reading the + * data is complete. + * @param queue Handle to the queue. + * @param data A pointer to the data memory region for the next entry data in the queue. + * @param size A pointer to the size of the data parameter. On return, this contains the actual size of the data + * pointed to by data param. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to the latest. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnUnderrun` if the cursor has reached the latest available data. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer in + * the queue's buffer. Call IOCircularDataQueueGetLatest to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueGetNext(IOCircularDataQueue *queue, void **data, size_t *size); + +/*! + * @function IOCircularDataQueueCopyNext + * Access the data at the next cursor position and copy into the provided buffer. Also update the cursor position to the + * next. On a successful return, the function gaurantees that the next entry data was successfully copied. In this case + * there is no need to call IOCircularDataQueueIsCurrentDataValid() after reading the data is complete, since the + * function returns a copy which cannot be overwritten by the writer. + * @param queue Handle to the queue. + * @param data Pointer to memory into which the next data from the queue is copied. Lifetime of this memory is + * controlled by the caller. + * @param size Size of the data buffer provided for copying. On return, this contains the actual size of the data + * pointed to by data param. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to next and the data was successfully copied. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnUnderrun` if the cursor has reached the latest available data. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer in + * the queue's buffer. Call IOCircularDataQueueCopyLatest to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueCopyNext(IOCircularDataQueue *queue, void *data, size_t *size); + +/*! + * @function IOCircularDataQueueGetPrevious + * Access the data at the previous cursor position and updates the cursor position to the previous. No copy is made of + * the data.
Caller is supposed to call IOCircularDataQueueIsCurrentDataValid() to check data integrity after + * reading the data is complete. + * @param queue Handle to the queue. + * @param data A pointer to the data memory region for the previous entry data in the queue. + * @param size A pointer to the size of the data parameter. On return, this contains the actual size of the data + * pointed to by data param. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to the previous. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer in + * the queue's buffer. Call IOCircularDataQueueGetLatest to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueGetPrevious(IOCircularDataQueue *queue, void **data, size_t *size); + +/*! + * @function IOCircularDataQueueCopyPrevious + * Access the data at the previous cursor position and copy into the provided buffer. Also update the cursor position to + * the previous. On a successful return, the function gaurantees that the previous entry data was successfully copied. + * In this case there is no need to call IOCircularDataQueueIsCurrentDataValid() after reading the data is complete, + * since the function returns a copy which cannot be overwritten by the writer. + * @param queue Handle to the queue. + * @param data Pointer to memory into which the previous data is copied. Lifetime of this memory is controlled by the + * caller. + * @param size Size of the data buffer provided for copying. On return, this contains the actual size of the data + * pointed to by data param. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to the previous and the data was successfully copied. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer in + * the queue's buffer. Call IOCircularDataQueueCopyLatest to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueCopyPrevious(IOCircularDataQueue *queue, void *data, size_t *size); + +/*! + * @function IOCircularDataQueueIsCurrentDataValid + * Verify if the data at the current cursor position is the same as the data when the cursor was first updated to this + * position. Call this function after having read the data at the current cursor position from the queue, since the + * queue entry could potentially have been overwritten by the writer while being read by the caller.
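+ *
+ * For example (minimal sketch; re-reads from the latest entry if the zero-copy read was
+ * overwritten, with 'consume' as a placeholder for caller logic):
+ *
+ *     void *data = NULL;
+ *     size_t size = 0;
+ *
+ *     if (IOCircularDataQueueGetCurrent(queue, &data, &size) == kIOReturnSuccess) {
+ *         consume(data, size);                                          // placeholder
+ *         if (IOCircularDataQueueIsCurrentDataValid(queue) == kIOReturnOverrun) {
+ *             // the entry was overwritten mid-read; fall back to the latest entry
+ *             if (IOCircularDataQueueGetLatest(queue, &data, &size) == kIOReturnSuccess) {
+ *                 consume(data, size);                                  // placeholder
+ *             }
+ *         }
+ *     }
+ *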
+ * @param queue Handle to the queue. + * @return + * - `kIOReturnSuccess` if the data at the cursor position is unchanged. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer the same and is + * potentially overwritten. Call IOCircularDataQueueGetLatest to get the latest data and cursor position. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnBadArgument` if an invalid param was passed. + * - `kIOReturnBadMedia` if the queueMemory is corrupted. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueIsCurrentDataValid(IOCircularDataQueue *queue); + +/*! + * @function IOCircularDataQueueSetCursorLatest + * Set the current cursor position to the latest entry in the queue. This only updates the cursor and does not read the + * data from the queue. If nothing has been enqueued into the queue yet, this returns an error. + * @param queue Handle to the queue. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to the latest. + * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue since there is no latest entry. + * - `kIOReturnAborted` if the queue is in an irrecoverable state. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueSetCursorLatest(IOCircularDataQueue *queue); + +/*! + * @function IOCircularDataQueueGetCurrent + * Access the data at the current cursor position. The cursor position is unchanged. No copy is made of the data.
+ * Caller is supposed to call IOCircularDataQueueIsCurrentDataValid() to check data integrity after reading the data is + * complete. + * @param queue Handle to the queue. + * @param data A pointer to the data memory region for the next entry data in the queue. + * @param size A pointer to the size of the data parameter. On return, this contains the actual size of the data + * pointed to by data param. + * @return + * - `kIOReturnSuccess` if the cursor position was updated. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue hence there is no entry at the current + * position.. + * - `kIOReturnOverrun` if the entry at the current cursor position is no longer in + * the queue's buffer. Call IOCircularDataQueueGetLatest to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueGetCurrent(IOCircularDataQueue *queue, void **data, size_t *size); + +/*! + * @function IOCircularDataQueueCopyCurrent + * Access the data at the current cursor position and copy into the provided buffer. The cursor position is unchanged. + * If successful, function gaurantees that the data returned is always valid, hence no need to call + * IOCircularDataQueueIsCurrentDataValid(). + * @param queue Handle to the queue. + * @param data Pointer to memory into which the previous data is copied. Lifetime of this memory is controlled by the + * caller. + * @param size Size of the data buffer provided for copying. On return, this contains the actual size of the data + * pointed to by data param. + * @return + * - `kIOReturnSuccess` if the cursor position was updated. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue hence there is no entry at the current + * position.. + * - `kIOReturnOverrun` if the entry at the current cursor position is no longer in + * the queue's buffer. Call IOCircularDataQueueCopyLatest to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueCopyCurrent(IOCircularDataQueue *queue, void *data, size_t *size); + +/*! + * @function IOCircularDataQueueGetLatestWithBlock + * Access the latest entry data, also update the cursor position to the latest. Calls the provided block with the data + * at the cursor position. No copy is made of the data.
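+ *
+ * For example (minimal sketch; the data pointer passed to the block is only valid inside the
+ * block, and 'consume' is a placeholder for caller logic):
+ *
+ *     IOReturn kr = IOCircularDataQueueGetLatestWithBlock(queue,
+ *         ^(const void *data, size_t size) {
+ *             consume(data, size);                                      // placeholder
+ *         });
+ *     if (kr != kIOReturnSuccess) {
+ *         // nothing enqueued yet, or the entry was overwritten while the block ran
+ *     }
+ *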
Optionally the caller can call + * IOCircularDataQueueIsCurrentDataValid() to check data integrity after reading the data is complete in the block. + * Additionally the function also returns an error if the data has been overwritten after the block completion + * @param queue Handle to the queue. + * @param handler Block to call + * -param data Pointer to the latest data in the queue that the block is called with. + * -param size Size of the data pointed to by data that the block is called with. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to the latest. + * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - `kIOReturnBadArgument` if an invalid queue was provided. + * - `kIOReturnAborted` if the queue was reset. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueGetLatestWithBlock(IOCircularDataQueue * queue, void (^handler)(const void *data, size_t size)); + +/*! + * @function IOCircularDataQueueGetNextWithBlock + * Access the data at the next cursor position and updates the cursor position to the next. Calls the provided block + * with the data at the cursor position. No copy is made of the data.
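+ *
+ * For example (minimal sketch; drains newer entries until the cursor catches up with the writer,
+ * with 'consume' as a placeholder for caller logic):
+ *
+ *     IOReturn kr;
+ *     do {
+ *         kr = IOCircularDataQueueGetNextWithBlock(queue,
+ *             ^(const void *data, size_t size) {
+ *                 consume(data, size);                                  // placeholder
+ *             });
+ *     } while (kr == kIOReturnSuccess);
+ *     // kIOReturnUnderrun: caught up; kIOReturnOverrun: fell behind, resynchronize via GetLatest
+ *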
Optionally the caller can call + * IOCircularDataQueueIsCurrentDataValid() to check data integrity after reading the data is complete in the block. + * Additionally the function also returns an error if the data has been overwritten after the block completion. + * @param queue Handle to the queue. + * @param handler Block to call + * -param data A pointer to the data memory region for the next entry data in the queue that the block is called with. + * -param size Size of the data pointed to by data that the block is called with. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to next. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnUnderrun` if the cursor has reached the latest available data. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer in + * the queue's buffer. Call IOCircularDataQueueGetLatest to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueGetNextWithBlock(IOCircularDataQueue * queue, void (^handler)(const void *data, size_t size)); + +/*! + * @function IOCircularDataQueueGetPreviousWithBlock + * Access the data at the previous cursor position and updates the cursor position to the previous. Calls the provided + * block with the data at the cursor position. No copy is made of the data.
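+ *
+ * For example (minimal sketch; walks backwards over up to 'maxEntries' of the most recent entries,
+ * newest first; 'maxEntries' and 'consume' are placeholders):
+ *
+ *     (void) IOCircularDataQueueGetLatestWithBlock(queue, ^(const void *data, size_t size) {
+ *         consume(data, size);                                          // placeholder
+ *     });
+ *     for (uint32_t i = 1; i < maxEntries; i++) {
+ *         if (IOCircularDataQueueGetPreviousWithBlock(queue, ^(const void *data, size_t size) {
+ *                 consume(data, size);                                  // placeholder
+ *             }) != kIOReturnSuccess) {
+ *             break;                                                    // oldest retained entry reached, or an error
+ *         }
+ *     }
+ *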
Optionally the caller can call + * IOCircularDataQueueIsCurrentDataValid() to check data integrity after reading the data is complete in the block. + * Additionally the function also returns an error if the data has been overwritten after the block completion. + * @param queue Handle to the queue. + * @param handler Block to call + * -param data A pointer to the data memory region for the previous entry data in the queue that the block is called + * with. + * -param size Size of the data pointed to by data that the block is called with. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to previous. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnUnderrun` if the entry at the cursor position is no longer in + * the queue's buffer. Call IOCircularDataQueueGetLatest to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ +IOReturn IOCircularDataQueueGetPreviousWithBlock(IOCircularDataQueue * queue, void (^handler)(const void *data, size_t size)); + +__END_DECLS + +#endif /* IOGCCircularDataQueue_h */ diff --git a/iokit/IOKit/IOCircularDataQueueImplementation.h b/iokit/IOKit/IOCircularDataQueueImplementation.h new file mode 100644 index 000000000..625a2e478 --- /dev/null +++ b/iokit/IOKit/IOCircularDataQueueImplementation.h @@ -0,0 +1,1918 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +__BEGIN_DECLS + +/*! + * @header IOCircularDataQueueMemory + * + * This header contains the memory layout for a circular data queue. + * + * A circular data queue supports a single producer and zero or more consumers. + * + * + * The producer does not wait for consumers to read the data. If a + * consumer falls behind, it will miss data. + * + * The queue can be configured to support either fixed or variable sized + * entries. + * Currently only fixed is supported. 
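+ *
+ * With fixed-size entries, entry i is located at a fixed offset from the first entry. A sketch of
+ * the addressing used throughout this file (entryDataSize already includes the per-entry header):
+ *
+ *     IOCircularDataQueueEntryHeader *entry = (IOCircularDataQueueEntryHeader *)
+ *         ((uint8_t *)&queueMemory->entries[0] + (index * entryDataSize));
+ *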
+ */ + +/* + * Fixed sized entry circular queue + * + +------------+ + | Queue | + | Header | + +------------+ <--- First Data Entry + |Entry Header| + +------------+ + | | + | Entry | + | Data | + | | + +------------+ <--- Second Data Entry + |Entry Header| + +------------+ + | | + | | + | ... | + | ... | + | | + | | + | | + +------------+ <--- Last Data Entry + |Entry Header| + +------------+ + | | + | Entry | + | Data | + | | + +------------+ + | + */ + +#if defined(__STDC_VERSION__) && __STDC_VERSION__ < 201112L + #define _STATIC_ASSERT_OVERLOADED_MACRO(_1, _2, NAME, ...) NAME + #define static_assert(...) _STATIC_ASSERT_OVERLOADED_MACRO(__VA_ARGS__, _static_assert_2_args, _static_assert_1_arg)(__VA_ARGS__) + + #define _static_assert_2_args(ex, str) _Static_assert((ex), str) + #define _static_assert_1_arg(ex) _Static_assert((ex), #ex) +#endif + +#define HEADER_16BYTE_ALIGNED 1 // do the entry and entry headers need to be 16 byte aligned for perf/correctness ? + +/*! + * @typedef IOCircularDataQueueEntryHeaderInfo + * @abstract The state of an entry in the circular data queue. The state is part of each entry header in the queue. + * @discussion The entry state has the sequence number, data size, generation and current state of the entry. The state + * is read/updated atomically. + * @field seqNum A unique sequence number for this entry. The sequence number is monotonically increased on each enqueue + * to the queue. Each entry in the queue has a unique sequence number. + * @field dataSize The size of the data at this entry. + * @field generation The queue generation which is copied from the queue header memory into the entry state on an + * enqueue. + * @field `_reserved` Unused + * @field wrStatus Represents if the queue entry is currently being written to or not. + */ + +#define IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_SIZE 1 +#define IOCIRCULARDATAQUEUE_ENTRY_STATE_GENERATION_SIZE 30 +#define IOCIRCULARDATAQUEUE_ENTRY_STATE_DATATSIZE_SIZE 32 +#define IOCIRCULARDATAQUEUE_ENTRY_STATE_SEQNUM_SIZE 64 +// #define IOCIRCULARDATAQUEUE_ENTRY_STATE_RESERVED_SIZE 1 +#define IOCIRCULARDATAQUEUE_ENTRY_STATE_RESERVED_SIZE \ + ((8 * sizeof(__uint128_t)) - IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_SIZE \ + - IOCIRCULARDATAQUEUE_ENTRY_STATE_GENERATION_SIZE - IOCIRCULARDATAQUEUE_ENTRY_STATE_DATATSIZE_SIZE \ + - IOCIRCULARDATAQUEUE_ENTRY_STATE_SEQNUM_SIZE) + +typedef union { + __uint128_t val; + struct { + __uint128_t seqNum : IOCIRCULARDATAQUEUE_ENTRY_STATE_SEQNUM_SIZE; // Sequence Number + __uint128_t dataSize : IOCIRCULARDATAQUEUE_ENTRY_STATE_DATATSIZE_SIZE; // datasize + __uint128_t generation : IOCIRCULARDATAQUEUE_ENTRY_STATE_GENERATION_SIZE; // generation + __uint128_t _reserved : IOCIRCULARDATAQUEUE_ENTRY_STATE_RESERVED_SIZE; // reserved, currently not used + __uint128_t wrStatus : IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_SIZE; // queue writing status + } fields; +} IOCircularDataQueueEntryHeaderInfo; + +#define IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_INPROGRESS (1) +#define IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_COMPLETE (0) + +static_assert(IOCIRCULARDATAQUEUE_ENTRY_STATE_RESERVED_SIZE > 0, "unexpected reserved field size"); + +/*! + * @typedef IOCircularDataQueueEntryHeader + * @abstract An entry in the circular data queue. The entry header is written at the beginning of each entry in the + * queue. + * @discussion The entry has the current state, sentinel, followed by the data at the enty. + * @field info The info of the queue entry. 
This includes the size, sequence number, generation and write status of the + * data at this entry. + * @field sentinel unique value written to the queue entry. This is copied from the sentinel in the queue header memory + * when an entry is written. + * @field data Represents the beginning of the data region. The address of the data field is a pointer to the start of + * the data region. + */ +typedef struct { + union { + volatile _Atomic __uint128_t headerInfoVal; + IOCircularDataQueueEntryHeaderInfo __headerInfo; // for clarity, unused + }; + volatile uint64_t sentinel; + uint64_t _pad; // pad for 16 byte aligment of data that follows +#if HEADER_16BYTE_ALIGNED + uint8_t data[16]; // Entry data begins. Aligned to 16 bytes. +#else + uint8_t data[8]; // Entry data begins. Aligned to 8 bytes. +#endif +} IOCircularDataQueueEntryHeader; + +#if HEADER_16BYTE_ALIGNED +#define CIRCULAR_DATA_QUEUE_ENTRY_HEADER_SIZE (sizeof(IOCircularDataQueueEntryHeader) - 16) +#else +#define CIRCULAR_DATA_QUEUE_ENTRY_HEADER_SIZE (sizeof(IOCircularDataQueueEntryHeader) - 8) +#endif + +/*! + * @typedef IOCircularDataQueueState + * @abstract The current state of the circular data queue. + * @discussion The queue state is part of the queue memory header. It has the current sequence number, next writing + * index, generation and current reset and writing state off the queue. The queue state is read/updated atomically. + * @field seqNum A monotonically increasing sequence number which is incremented for each enqueue. + * @field wrIndex The next write position into the queue. + * @field generation The generation of the queue. It is a monotonically increasing number, which is incremented on each + * queue reset. + * @field rstStatus The queue reset state. The bit is set if the queue is currently being reset. + * @field wrStatus The queue writing state. The bit is set if an enqueue is in progress. + */ +// Fahad : I dont think we need a reset bit, since we are doing everything in one atomic op. + +#define IOCIRCULARDATAQUEUE_STATE_WRITE_SIZE 1 +#define IOCIRCULARDATAQUEUE_STATE_RESET_SIZE 1 +#define IOCIRCULARDATAQUEUE_STATE_GENERATION_SIZE 30 +#define IOCIRCULARDATAQUEUE_STATE_WRITEINDEX_SIZE 32 +#define IOCIRCULARDATAQUEUE_STATE_SEQNUM_SIZE 64 +//#define IOCIRCULARDATAQUEUE_STATE_RESERVED_SIZE \ +// ((8 * sizeof(__uint128_t)) - IOCIRCULARDATAQUEUE_STATE_WRITE_SIZE \ +// - IOCIRCULARDATAQUEUE_STATE_GENERATION_SIZE - IOCIRCULARDATAQUEUE_STATE_WRITEINDEX_SIZE \ +// - IOCIRCULARDATAQUEUE_STATE_SEQNUM_SIZE) + +typedef union { + __uint128_t val; + struct { + __uint128_t seqNum : IOCIRCULARDATAQUEUE_STATE_SEQNUM_SIZE; // Sequence Number + __uint128_t wrIndex : IOCIRCULARDATAQUEUE_STATE_WRITEINDEX_SIZE; // write index + __uint128_t generation : IOCIRCULARDATAQUEUE_STATE_GENERATION_SIZE; // generation + // Fahad: We may not need reset. 
+ __uint128_t rstStatus : IOCIRCULARDATAQUEUE_STATE_RESET_SIZE; // queue reset status + // __uint128_t _rsvd : IOCIRCULARDATAQUEUE_STATE_RESERVED_SIZE; // reserved + __uint128_t wrStatus : IOCIRCULARDATAQUEUE_STATE_WRITE_SIZE; // queue writing status + } fields; +} IOCircularDataQueueState; + +#define IOCIRCULARDATAQUEUE_STATE_WRITE_INPROGRESS (1) +#define IOCIRCULARDATAQUEUE_STATE_WRITE_COMPLETE (0) +#define IOCIRCULARDATAQUEUE_STATE_RESET_INPROGRESS (1) +#define IOCIRCULARDATAQUEUE_STATE_RESET_COMPLETE (0) + +// #define IOCircularDataQueueStateGeneration (((uint32_t)1 << 30) - 1) +#define IOCIRCULARDATAQUEUE_STATE_GENERATION_MAX (((uint32_t)1 << 30)) + +// static_assert(IOCIRCULARDATAQUEUE_STATE_RESERVED_SIZE > 0, +// "unexpected reserved field size"); + +static_assert(IOCIRCULARDATAQUEUE_STATE_GENERATION_SIZE == IOCIRCULARDATAQUEUE_ENTRY_STATE_GENERATION_SIZE, + "mismatched generation sizes"); +static_assert(IOCIRCULARDATAQUEUE_STATE_SEQNUM_SIZE == IOCIRCULARDATAQUEUE_ENTRY_STATE_SEQNUM_SIZE, + "mismatched sequenece number sizes"); + +/*! + * @typedef IOCircularDataQueueMemory + * @abstract The queue memory header present at the start of queue shared memory region. + * @discussion The queue memory header contains the queue info and state and is followed by the data region of the + * queue. + * @field sentinel unique value when the queue was created. + * @field allocMemSize the allocated memory size of the queue including the queue header and the entries + * @field memorySize the memory size of the queue excluding the queue header + * @field entryDataSize size of each entry in the queue including the entry header. The size is a multiple of 8 bytes + * @field dataSize size of each entry in the queue excluding the entry header. + * @field numEntries the number of fixed entries in the queue + * @field `_padding` memory padding for alingment. + * @field state the current state of the queue. + * @field entries Represents the beginning of the data region. The address of the data field is a pointer to the start + * of the queue data region. + */ + +typedef struct IOCircularDataQueueMemory { + uint64_t sentinel; + uint64_t _padding; // since we want it to be 16 bytes aligned below this + union { + volatile _Atomic __uint128_t queueStateVal; // needs to be 16 bytes aligned. + IOCircularDataQueueState __queueState; // for clarity, unused + }; + IOCircularDataQueueEntryHeader entries[1]; // Entries begin. Aligned to 16 bytes. +} IOCircularDataQueueMemory; + +#define CIRCULAR_DATA_QUEUE_MEMORY_HEADER_SIZE \ + (sizeof(IOCircularDataQueueMemory) - sizeof(IOCircularDataQueueEntryHeader)) + +/*! + * @typedef IOCircularDataQueueMemoryCursor + * @abstract The circular data queue cursor struct. + * @discussion This struct represents a readers reference to a position in the queue. Each client holds an instance of + * this in its process indicating its current reading position in the queue. The cursor holds uniqely identifying + * information for the queue entry. + * @field generation the generation for the entry data at the position in the queue. This generation is only changed + * when the queue is reset. + * @field position the position in the queue the cursor is at + * @field sequenceNum The unique number for the data at the cursor position. The sequence number is unique for each + * entry in the queue. + * + */ +typedef struct IOCircularDataQueueMemoryCursor { + uint32_t generation; // uint32_t seems a little excessive right now, since we dont expect these many resets. but + // lets leave it for now. 
+ uint32_t position; + uint64_t sequenceNum; +} IOCircularDataQueueMemoryCursor; + + +/*! + * @typedef IOCircularDataQueueDescription + * @abstract The circular data queue header shadow struct. + * @discussion This struct represents the queue header shadow. Each client has a copy of this struct in its process . + * This is used to detect any memory corruption of the shared memory queue header. This struct needs to be shared from + * the creator of the queue to the clients via an out of band mechanism. + * @field sentinel unique value written to the queue header memory and each queue entry. + * @field allocMemSize the allocated memory size of the queue including the queue header + * @field entryDataSize size of each entry in the queue including the entry header. The size is a multiple of 8 bytes + * @field memorySize the memory size of the queue excluding the queue header + * @field numEntries the number of fixed entries in the queue + * IOCircularDataQueueDescription + */ +typedef struct IOCircularDataQueueDescription { + uint64_t sentinel; + uint32_t allocMemSize; // total allocated size of the queue including the queue header. + uint32_t entryDataSize; // size of each queue entry including the per entry header. + uint32_t memorySize; // memory size of the queue (excluding the queue header) + uint32_t numEntries; + uint32_t dataSize; // the client provided data size excluding the per entry header. + uint32_t padding; +} IOCircularDataQueueDescription; + +#define kIOCircularQueueDescriptionKey "IOCircularQueueDescription" + + +#if !KERNEL +/* + * IORound and IOTrunc convenience functions, in the spirit + * of vm's round_page() and trunc_page(). + */ +#define IORound(value, multiple) ((((value) + (multiple)-1) / (multiple)) * (multiple)) + +#define IONew(type, count) (type *)calloc(count, sizeof(type)) +#define IODelete(p, type, count) free(p) + +// libkern/os/base.h +#if __has_feature(ptrauth_calls) +#include +#define OS_PTRAUTH_SIGNED_PTR(type) __ptrauth(ptrauth_key_process_independent_data, 1, ptrauth_string_discriminator(type)) +#define OS_PTRAUTH_SIGNED_PTR_AUTH_NULL(type) __ptrauth(ptrauth_key_process_independent_data, 1, ptrauth_string_discriminator(type), "authenticates-null-values") +#define OS_PTRAUTH_DISCRIMINATOR(str) ptrauth_string_discriminator(str) +#define __ptrauth_only +#else // __has_feature(ptrauth_calls) +#define OS_PTRAUTH_SIGNED_PTR(type) +#define OS_PTRAUTH_SIGNED_PTR_AUTH_NULL(type) +#define OS_PTRAUTH_DISCRIMINATOR(str) 0 +#define __ptrauth_only __unused +#endif // __has_feature(ptrauth_calls) +#endif /* !KERNEL */ + +#pragma mark - Debugging + +#define QUEUE_FORMAT "Queue(%" PRIu64 " gen:%" PRIu64 " pos:%" PRIu64 " next:%" PRIu64 ")" +#define QUEUE_ARGS(q) q->guard, q->generation, q->fixed.latestIndex, q->fixed.writingIndex + +#define CURSOR_FORMAT "Cursor(%p gen:%" PRIu64 " pos:%" PRIu64 ")" +#define CURSOR_ARGS(c) c, c->generation, c->position + +#define ENTRY_FORMAT "Entry(%" PRIu64 " gen:%" PRIu64 " pos:%" PRIu64 ")" +#define ENTRY_ARGS(e) e->guard, e->generation, e->position + +#if 1 +#define queue_debug_error(fmt, ...) +#define queue_debug_note(fmt, ...) +#define queue_debug_trace(fmt, ...) +#else +#define queue_debug_error(fmt, ...) \ + { \ + os_log_debug(LOG_QUEUE, "#ERROR %s:%d %s " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ + } +#define queue_debug_note(fmt, ...) \ + { \ + os_log_debug(LOG_QUEUE, "#NOTE %s:%d %s " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ + } +#define queue_debug_trace(fmt, ...) 
\ + { \ + os_log_debug(LOG_QUEUE, "#TRACE %s:%d %s " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ + } +#endif + +#if HEADER_16BYTE_ALIGNED +static_assert(offsetof(IOCircularDataQueueEntryHeader, data) % sizeof(__uint128_t) == 0, + "IOCircularDataQueueEntryHeader.data is not 16-byte aligned!"); +#else +static_assert(offsetof(IOCircularDataQueueEntryHeader, data) % sizeof(uint64_t) == 0, + "IOCircularDataQueueEntryHeader.data is not 8-byte aligned!"); +#endif + +static_assert(sizeof(IOCircularDataQueueState) == sizeof(__uint128_t), "Unexpected padding"); +static_assert(offsetof(IOCircularDataQueueMemory, queueStateVal) % sizeof(__uint128_t) == 0, + "IOCircularDataQueueMemory.entries is not 16-byte aligned!"); + +#if HEADER_16BYTE_ALIGNED +static_assert(offsetof(IOCircularDataQueueMemory, entries) % sizeof(__uint128_t) == 0, + "IOCircularDataQueueMemory.entries is not 16-byte aligned!"); +#else +static_assert(offsetof(IOCircularDataQueueMemory, entries) % sizeof(uint64_t) == 0, + "IOCircularDataQueueMemory.entries is not 8-byte aligned!"); +#endif + +/*! + * @typedef IOCircularDataQueue + * @abstract A fixed entry size circular queue that supports multiple concurrent readers and a single writer. + * @discussion The queue currently supports fixed size entries. The queue memory size is configured at init when the + * number of entries and size of each entry is specifiied and cannot be resized later. Since the queue is a circular + * buffer, the writer can potentially overwrite an entry while a reader is still reading it. The queue provides facility + * to check for data integrity after reading the entry is complete. There is no support for sending notifications to + * readers when data is enqueued into an empty queue by the writer. The queue supports a "pull model" for reading data + * from the queue. The queue can be used for passing data from user space to kernel and vice-versa. + * @field queueHeaderShadow The queue header shadow + * @field queueCursor The queue cursor + * @field isQueueMemoryAllocated Represents if the queue memory is allocated or if the queue uses a previously + * created queue memory region. 
+ * @field queueMemory Pointer to the queue shared memory region + */ +typedef struct IOCircularDataQueue { + IOCircularDataQueueMemoryCursor queueCursor; + IOCircularDataQueueMemory * OS_PTRAUTH_SIGNED_PTR("IOCircularDataQueue.queueMemory") queueMemory; + IOCircularDataQueueDescription queueHeaderShadow; +#if KERNEL + IOBufferMemoryDescriptor * OS_PTRAUTH_SIGNED_PTR("IOCircularDataQueue.iomd") iomd; +#else /* KERNEL */ + io_connect_t connect; + uint32_t memoryType; +#endif /* !KERNEL */ +} IOCircularDataQueue; + + +#if defined(__arm64__) && !KERNEL +#define ATTR_LSE2 __attribute__((target("lse2"))) +#else +#define ATTR_LSE2 +#endif /* defined(__arm64__) && !KERNEL */ + +#pragma mark - Queue + +static bool ATTR_LSE2 +_isQueueMemoryCorrupted(IOCircularDataQueue *queue) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + + const size_t queueSentinel = queueMemory->sentinel; + if (os_unlikely(queueSentinel != queueHeaderShadow->sentinel)) { + return true; + } + return false; +} + +inline static bool ATTR_LSE2 +_isCursorPositionInvalid(IOCircularDataQueue *queue) +{ +// IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + IOCircularDataQueueMemoryCursor const *cursor = &queue->queueCursor; + + if (os_unlikely(cursor->position >= queueHeaderShadow->numEntries)) { + return true; + } + + return false; +} + +inline __unused static bool ATTR_LSE2 +_isEntryOutOfBounds(IOCircularDataQueue *queue, IOCircularDataQueueEntryHeader *entry) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; +// IOCircularDataQueueMemoryCursor const *cursor = &queue->queueCursor; + + bool ret = false; + IOCircularDataQueueEntryHeader *firstEntry = (IOCircularDataQueueEntryHeader *)(&queueMemory->entries[0]); + IOCircularDataQueueEntryHeader *lastEntry + = (IOCircularDataQueueEntryHeader *)(uintptr_t)((uint8_t *)&queueMemory->entries[0] + + ((queueHeaderShadow->numEntries - 1) * queueHeaderShadow->entryDataSize)); + + // SANITY CHECK - Final check to ensure the 'entry' pointer is + // within the queueMemory allocation before we begin writing. + if (os_unlikely(entry < firstEntry || entry > lastEntry)) { + ret = true; + } + + return ret; +} + + +#if !KERNEL +/*! + * @function isQueueMemoryValid + * Verify if the queue header shadow matches the queue header in shared memory. + * @param queue Handle to the queue. + * @return `true` if the queue header shadow matches the queue header in shared memory, else `false`. + * + */ + +static bool ATTR_LSE2 +isQueueMemoryValid(IOCircularDataQueue *queue) +{ + return _isQueueMemoryCorrupted(queue) == false; +} +#endif /* KERNEL */ + +/*! + * @function destroyQueueMem + * @abstract Function that destroys a previously created IOCircularDataQueueMemory instance. + * @param queue Handle to the queue. + * @return + * - `kIOReturnSuccess` if the queue was succesfully destroyed. + * - `kIOReturnBadArgument` if an invalid queue was provided. 
+ */ + +static IOReturn ATTR_LSE2 +destroyQueueMem(IOCircularDataQueue *queue) +{ + IOReturn ret = kIOReturnBadArgument; + if (queue != NULL) { +#if KERNEL + OSSafeReleaseNULL(queue->iomd); +#else /* !KERNEL */ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + if (queueMemory) { + ret = IOConnectUnmapMemory(queue->connect, queue->memoryType, + mach_task_self(), (mach_vm_address_t) queueMemory); +// assert(KERN_SUCCESS == ret); + queue->queueMemory = NULL; + } +#endif + ret = kIOReturnSuccess; + } + + return ret; +} + +static IOReturn ATTR_LSE2 +_reset(IOCircularDataQueue *queue) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + + if (queueMemory == NULL || queueHeaderShadow == NULL) { + return kIOReturnBadArgument; + } + + const size_t queueEntryDataSize = queueHeaderShadow->entryDataSize; + if (!queueEntryDataSize) { + return kIOReturnUnsupported; + } + + IOCircularDataQueueState currState; + currState.val = atomic_load_explicit(&queueMemory->queueStateVal, memory_order_acquire); + + if (os_unlikely(currState.fields.wrStatus & IOCIRCULARDATAQUEUE_STATE_WRITE_INPROGRESS)) { + // Another thread is modifying the queue + return kIOReturnBusy; + } + + uint32_t currGeneration = currState.fields.generation; + uint32_t newGen = (currGeneration + 1) % IOCIRCULARDATAQUEUE_STATE_GENERATION_MAX; + + IOCircularDataQueueState newState; + newState.fields.generation = newGen; + newState.fields.wrIndex = 0; + newState.fields.seqNum = UINT64_MAX; // since we first increment the seq num on an enqueue. + + if (!atomic_compare_exchange_strong(&queueMemory->queueStateVal, &currState.val, newState.val)) { + return kIOReturnBusy; + } + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + queue_debug_trace("Reset " QUEUE_FORMAT, QUEUE_ARGS(queueMemory)); + return kIOReturnSuccess; +} + +/*! + * @function _enqueueInternal + * @abstract Internal function for enqueuing a new entry on the queue. + * @discussion This method adds a new data entry of dataSize to the queue. It sets the size parameter of the entry + * pointed to by the tail value and copies the memory pointed to by the data parameter in place in the queue. Once that + * is done, it moves the tail to the next available location. When attempting to add a new entry towards the end of the + * queue and there isn't enough space at the end, it wraps back to the beginning.
+ * @param queue Handle to the queue. + * @param data Pointer to the data to be added to the queue. + * @param dataSize Size of the data pointed to by data. + * @param earlyExitForTesting ealy exit flag used for testing only. + * @return + * - `kIOReturnSuccess` on success. + * - Other values indicate an error. + */ + +static IOReturn ATTR_LSE2 +_enqueueInternal(IOCircularDataQueue *queue, + const void *data, + size_t dataSize, + int earlyExitForTesting) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; +// IOCircularDataQueueMemoryCursor const *cursor = &queue->queueCursor; + + if (queueMemory == NULL || data == NULL || dataSize == 0 || queueHeaderShadow == NULL) { + return kIOReturnBadArgument; + } + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + if (os_unlikely(dataSize > queueHeaderShadow->dataSize)) { + return kIOReturnBadArgument; + } + + const size_t queueEntryDataSize = queueHeaderShadow->entryDataSize; + + if (!queueEntryDataSize) { + return kIOReturnUnsupported; + } + + const size_t queueAllocMemSize = queueHeaderShadow->allocMemSize; + const uint32_t queueNumEntries = queueHeaderShadow->numEntries; + + // Do not allow instruction re-ordering prior to the header check. + os_compiler_barrier(); + + IOCircularDataQueueState currState; + currState.val = atomic_load_explicit(&queueMemory->queueStateVal, memory_order_acquire); + + if (os_unlikely(currState.fields.wrStatus & IOCIRCULARDATAQUEUE_STATE_WRITE_INPROGRESS)) { + // Another thread is modifying the queue + return kIOReturnBusy; + } + + // size_t queueEntriesBufferSize = queueMemory->allocMemSize - CIRCULAR_DATA_QUEUE_MEMORY_HEADER_SIZE; + uint32_t writeIndex = currState.fields.wrIndex; + uint64_t nextWriteIndex = (writeIndex + 1) % queueNumEntries; + uint64_t nextSeqNum = currState.fields.seqNum + 1; + if (os_unlikely(nextSeqNum == UINT64_MAX)) { + // End of the world. How many enqueues are you trying to do !!! +// abort(); + return kIOReturnOverrun; + } + + __auto_type entry + = (IOCircularDataQueueEntryHeader *)(uintptr_t)((uint8_t *)&queueMemory->entries[0] + (writeIndex * queueEntryDataSize)); + // printf("entry=%p\n", (void *)entry); + + // SANITY CHECK - Final check to ensure the 'entry' pointer is + // within the queueMemory allocation before we begin writing. + if (os_unlikely((uint8_t *)entry < (uint8_t *)(&queueMemory->entries[0]) + || (uint8_t *)entry >= (uint8_t *)queueMemory + queueAllocMemSize)) { + return kIOReturnBadArgument; + } + + // if (os_unlikely(_isEntryOutOfBounds(queueHeaderShadow, queueMemory, entry) )) { + // ret = kIOReturnBadArgument; + // break; + // } + + os_compiler_barrier(); + + // All checks passed. Set the write bit. + + IOCircularDataQueueState newState = currState; + newState.fields.wrStatus = IOCIRCULARDATAQUEUE_STATE_WRITE_INPROGRESS; + // lets not change the writeIndex and seq num here. + // newState.fields.wrIndex = nextWriteIndex; + // newState.fields.seqNum = currState.fields.seqNum + 1; // its ok even if we ever rollover UINT64_MAX!! 
+ + if (!atomic_compare_exchange_strong(&queueMemory->queueStateVal, &currState.val, newState.val)) { + // someone else is modifying the queue + return kIOReturnBusy; + } + + // Update the entry header info + IOCircularDataQueueEntryHeaderInfo enHeaderInfo; + enHeaderInfo.val = 0; + enHeaderInfo.fields.wrStatus = IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_INPROGRESS; + enHeaderInfo.fields.generation = currState.fields.generation; + // enHeaderInfo.fields.seqNum = newState.fields.seqNum; + enHeaderInfo.fields.seqNum = nextSeqNum; + enHeaderInfo.fields.dataSize = dataSize; + atomic_store_explicit(&entry->headerInfoVal, enHeaderInfo.val, memory_order_release); + + entry->sentinel = queueHeaderShadow->sentinel; + memcpy(entry->data, data, dataSize); + enHeaderInfo.fields.wrStatus = IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_COMPLETE; + atomic_store_explicit(&entry->headerInfoVal, enHeaderInfo.val, memory_order_release); + + IOCircularDataQueueState finalState = newState; + finalState.fields.wrStatus = IOCIRCULARDATAQUEUE_STATE_WRITE_COMPLETE; + // Lets actually update the write index and seq num + finalState.fields.wrIndex = nextWriteIndex; + finalState.fields.seqNum = nextSeqNum; + atomic_store_explicit(&queueMemory->queueStateVal, finalState.val, memory_order_release); + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + return kIOReturnSuccess; +} + +/*! + * @function enqueueQueueMem + * @abstract Enqueues a new entry on the queue. + * @discussion This method adds a new data entry of dataSize to the queue. It sets the size parameter of the entry + * pointed to by the write index and copies the memory pointed to by the data parameter in place in the queue. Once + * that is done, it moves the write index to the next index. + * @param queue Handle to the queue. + * @param data Pointer to the data to be added to the queue. + * @param dataSize Size of the data pointed to by data. + * @return + * - `kIOReturnSuccess` on success. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - `kIOReturnBadArgument` if an invalid queue was provided. + * - `kIOReturnBusy` if another thread is enqueing concurrently + * - `kIOReturnUnsupported` if the queue has not been configured to support fixed size entries. Variable size is + * currently not supported + * - Other values indicate an error. + */ + +static IOReturn ATTR_LSE2 +enqueueQueueMem(IOCircularDataQueue *queue, + const void *data, + size_t dataSize) +{ + return _enqueueInternal(queue, data, dataSize, 0); +} + +/*! + * @function isDataEntryValidInQueueMem + * Verify if the data at the cursor position is still valid. Call this function after having read the data from the + * queue, since the buffer could potentially have been overwritten while being read.
+ * @param queue Handle to the queue. + * @return + * - `kIOReturnSuccess` if the data at the cursor position was valid. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer valid and is + * potentially overwritten. Call getLatestInQueueMem to get the latest data and cursor position. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnBadArgument` if an invalid param was passed. + * - `kIOReturnBadMedia` if the queueMemory is corrupted. + * + */ + +static IOReturn ATTR_LSE2 +isDataEntryValidInQueueMem(IOCircularDataQueue *queue) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + IOCircularDataQueueMemoryCursor const *cursor = &queue->queueCursor; + + if (os_unlikely(queueMemory == NULL || queueHeaderShadow == NULL)) { + return kIOReturnBadArgument; + } + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + if (os_unlikely(_isCursorPositionInvalid(queue))) { + return kIOReturnBadArgument; + } + + IOCircularDataQueueState currState; + currState.val = atomic_load_explicit(&queueMemory->queueStateVal, memory_order_acquire); + + // Fahad: We may remove this filed since we don't actually use it. Instead just use generation check below. + if (os_unlikely(currState.fields.rstStatus & IOCIRCULARDATAQUEUE_STATE_RESET_INPROGRESS)) { + // Another thread is resetting the queue + return kIOReturnBusy; + } + + uint32_t queueGeneration = currState.fields.generation; + if (queueGeneration != cursor->generation) { + // return kIOReturnOverrun; + return kIOReturnAborted; + } + + const size_t queueAllocMemSize = queueHeaderShadow->allocMemSize; + const size_t queueEntryDataSize = queueHeaderShadow->entryDataSize; + __auto_type entry = (IOCircularDataQueueEntryHeader *)(uintptr_t)((uint8_t *)&queueMemory->entries[0] + + (cursor->position * queueEntryDataSize)); + + // SANITY CHECK - Final check to ensure the 'entry' pointer is + // within the queueMemory entries buffer before we begin writing. + if (os_unlikely((uint8_t *)entry < (uint8_t *)(&queueMemory->entries[0]) + || (uint8_t *)entry >= (uint8_t *)queueMemory + queueAllocMemSize)) { + queue_debug_error("Out of Bounds! " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, QUEUE_ARGS(queueMemory), + CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadArgument; + } + + os_compiler_barrier(); + + if (os_unlikely(entry->sentinel != queueHeaderShadow->sentinel)) { + queue_debug_error("entry->sentinel != queueMemory->sentinel " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadMedia; + } + + IOCircularDataQueueEntryHeaderInfo enHeaderInfo; + enHeaderInfo.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + uint32_t entryGeneration = enHeaderInfo.fields.generation; + if (os_unlikely(entryGeneration != queueGeneration)) { + queue_debug_note("entryGeneration != queueGeneration " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnOverrun; + } + + if (os_unlikely(enHeaderInfo.fields.wrStatus == IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_INPROGRESS + || enHeaderInfo.fields.seqNum != cursor->sequenceNum)) { + return kIOReturnOverrun; + } + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + return kIOReturnSuccess; +} + +/*! 
+ * @function setCursorLatestInQueueMem + * Set the current cursor position to the latest entry in the queue. This only updates the cursor and does not read the + * data from the queue. If nothing has been enqueued into the queue yet, this returns an error. + * @param queue Handle to the queue. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to the latest. + * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue since there is no latest entry. + * - `kIOReturnAborted` if the queue is in an irrecoverable state. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ + +static IOReturn ATTR_LSE2 +setCursorLatestInQueueMem(IOCircularDataQueue *queue) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + IOCircularDataQueueMemoryCursor *cursor = &queue->queueCursor; + + if (queueMemory == NULL || queueHeaderShadow == NULL) { + return kIOReturnBadArgument; + } + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + const size_t queueAllocMemSize = queueHeaderShadow->allocMemSize; + const size_t queueEntryDataSize = queueHeaderShadow->entryDataSize; + + IOCircularDataQueueState currState; + currState.val = atomic_load_explicit(&queueMemory->queueStateVal, memory_order_acquire); + + if (os_unlikely(currState.fields.rstStatus & IOCIRCULARDATAQUEUE_STATE_RESET_INPROGRESS)) { + // Another thread is resetting the queue + return kIOReturnBusy; + } + + if (os_unlikely(currState.fields.seqNum == UINT64_MAX)) { + // Nothing has ever been written to the queue yet. + return kIOReturnUnderrun; + } + + uint32_t queueGeneration = currState.fields.generation; + uint32_t readIndex + = (currState.fields.wrIndex > 0) ? (currState.fields.wrIndex - 1) : (queueHeaderShadow->numEntries - 1); + + __auto_type entry + = (IOCircularDataQueueEntryHeader *)(uintptr_t)((uint8_t *)&queueMemory->entries[0] + (readIndex * queueEntryDataSize)); + + // SANITY CHECK - Final check to ensure the 'entry' pointer is + // within the queueMemory entries buffer before we begin writing. + if (os_unlikely((uint8_t *)entry < (uint8_t *)(&queueMemory->entries[0]) + || (uint8_t *)entry >= (uint8_t *)queueMemory + queueAllocMemSize)) { + queue_debug_error("Out of Bounds! 
" QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, QUEUE_ARGS(queueMemory), + CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnAborted; + } + + os_compiler_barrier(); + + if (os_unlikely(entry->sentinel != queueHeaderShadow->sentinel)) { + queue_debug_error("entry->sentinel != queueMemory->sentinel " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadMedia; + } + + IOCircularDataQueueEntryHeaderInfo enHeaderInfo; + enHeaderInfo.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + uint32_t entryGeneration = enHeaderInfo.fields.generation; + if (os_unlikely(entryGeneration != queueGeneration)) { + queue_debug_note("entryGeneration != queueGeneration " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnAborted; + } + + cursor->position = readIndex; + cursor->generation = entryGeneration; + cursor->sequenceNum = enHeaderInfo.fields.seqNum; + + return kIOReturnSuccess; +} + +static IOReturn ATTR_LSE2 +_getLatestInQueueMemInternal(IOCircularDataQueue *queue, + void **data, + size_t *size, + bool copyMem) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + IOCircularDataQueueMemoryCursor *cursor = &queue->queueCursor; + + IOReturn ret = kIOReturnTimeout; + if (queueMemory == NULL || data == NULL || size == NULL || queueHeaderShadow == NULL) { + return kIOReturnBadArgument; + } + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + const size_t kNumRetries = 5; // Number of retries if the latest index data gets overwritten by a writer. + size_t retry = kNumRetries; + const size_t queueAllocMemSize = queueHeaderShadow->allocMemSize; + const size_t queueEntryDataSize = queueHeaderShadow->entryDataSize; + size_t inSize; + + inSize = *size; + do { + *size = 0; + retry--; + IOCircularDataQueueState currState; + currState.val = atomic_load_explicit(&queueMemory->queueStateVal, memory_order_consume); + + if (os_unlikely(currState.fields.rstStatus & IOCIRCULARDATAQUEUE_STATE_RESET_INPROGRESS)) { + // Another thread is resetting the queue + return kIOReturnBusy; + } + + if (os_unlikely(currState.fields.seqNum == UINT64_MAX)) { + // Nothing has ever been written to the queue yet. + return kIOReturnUnderrun; + } + + uint32_t queueGeneration = currState.fields.generation; + uint32_t readIndex + = (currState.fields.wrIndex > 0) ? (currState.fields.wrIndex - 1) : (queueHeaderShadow->numEntries - 1); + + __auto_type entry = (IOCircularDataQueueEntryHeader *)(uintptr_t)((uint8_t *)&queueMemory->entries[0] + + (readIndex * queueEntryDataSize)); + + // SANITY CHECK - Final check to ensure the 'entry' pointer is + // within the queueMemory entries buffer before we begin writing. + if (os_unlikely((uint8_t *)entry < (uint8_t *)(&queueMemory->entries[0]) + || (uint8_t *)entry >= (uint8_t *)queueMemory + queueAllocMemSize)) { + queue_debug_error("Out of Bounds! 
" QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadArgument; + } + + os_compiler_barrier(); + + if (os_unlikely(entry->sentinel != queueHeaderShadow->sentinel)) { + queue_debug_error("entry->sentinel != queueMemory->sentinel " QUEUE_FORMAT " " CURSOR_FORMAT + " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadMedia; + } + + IOCircularDataQueueEntryHeaderInfo enHeaderInfo; + enHeaderInfo.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + uint32_t entryGeneration = enHeaderInfo.fields.generation; + /* Since the time we read the queue header, was the queue + * - reset + * - the entry is being overwritten + * - the entry was overwritten and hence the seq numbers don't match anymore. + * + * Lets retry in such a case + */ + if (os_unlikely(entryGeneration != queueGeneration + || enHeaderInfo.fields.wrStatus == IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_INPROGRESS + || currState.fields.seqNum != enHeaderInfo.fields.seqNum)) { + continue; + } + + cursor->position = readIndex; + cursor->generation = entryGeneration; + cursor->sequenceNum = enHeaderInfo.fields.seqNum; + + if (os_unlikely(enHeaderInfo.fields.dataSize > queueHeaderShadow->entryDataSize)) { + ret = kIOReturnOverrun; + break; + } + *size = enHeaderInfo.fields.dataSize; + + if (!copyMem) { + *data = entry->data; + ret = kIOReturnSuccess; + break; // break out, we're done + } else { + if (os_unlikely(enHeaderInfo.fields.dataSize > inSize)) { + return kIOReturnOverrun; + } + memcpy(*data, entry->data, enHeaderInfo.fields.dataSize); + // Lets re-verify after the memcpy if the buffer is/has been overwritten. + + IOCircularDataQueueEntryHeaderInfo enHeaderInfoAfter; + enHeaderInfoAfter.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + // Did something change ? + if (enHeaderInfo.val == enHeaderInfoAfter.val) { + ret = kIOReturnSuccess; + break; + } else { + // we failed so we'll retry. + *size = 0; + } + } + } while (retry); + + if ((kIOReturnSuccess == ret) && os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + return ret; +} + +/*! + * @function getLatestInQueueMem + * Access the latest entry data, also update the cursor position to the latest. No copy is made of the data.
Caller
+ * is supposed to call isDataEntryValidInQueueMem() to check data integrity after reading the data is complete.
+ * @param queue Handle to the queue.
+ * @param data A pointer to the data memory region for the latest entry data in the queue.
+ * @param size A pointer to the size of the data parameter. On return, this contains the actual size of the data
+ * pointed to by data param.
+ * @return
+ * - `kIOReturnSuccess` if the cursor position was updated.
+ * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - `kIOReturnBadArgument` if an invalid queue was provided.
+ * - `kIOReturnTimeout` if the reader timed out when trying to read. This is possible if the writer overwrites the
+ * latest index a reader is about to read. The function times out if the read is unsuccessful after multiple retries.
+ * - Other values indicate an error.
+ *
+ */
+
+static IOReturn ATTR_LSE2
+getLatestInQueueMem(IOCircularDataQueue *queue,
+    void **data,
+    size_t *size)
+{
+	return _getLatestInQueueMemInternal(queue, data, size, false);
+}
+
+/*!
+ * @function copyLatestInQueueMem
+ * Access the latest entry data and copy it into the provided buffer. Also update the cursor position to the latest.
+ * The function guarantees that the returned data is always valid, so there is no need to call
+ * isDataEntryValidInQueueMem().
+ * @param queue Handle to the queue.
+ * @param data Pointer to memory into which the latest data from the queue is copied. Lifetime of this memory is
+ * controlled by the caller.
+ * @param size Size of the data buffer provided for copying. On return, this contains the actual size of the data
+ * pointed to by data param.
+ * @return
+ * - `kIOReturnSuccess` if the cursor position was updated.
+ * - `kIOReturnUnderrun` if nothing has ever been enqueued into the queue.
+ * - `kIOReturnBadArgument` if the buffer provided to copy the data is NULL or if an invalid queue was provided.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - `kIOReturnTimeout` if the reader timed out when trying to copy the latest data. This is possible if the writer
+ * overwrites the latest index a reader is about to copy. The function times out if the copy is unsuccessful after
+ * multiple retries.
+ * - Other values indicate an error.
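+ *
+ * Illustrative call through the public wrapper, assuming a caller-owned buffer (`buf` and `len` are
+ * example names only):
+ *
+ *     uint8_t buf[256];
+ *     size_t len = sizeof(buf);
+ *     if (IOCircularDataQueueCopyLatest(queue, buf, &len) == kIOReturnSuccess) {
+ *         // 'buf' now holds a stable copy of the newest entry and 'len' is its actual size.
+ *     }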
+ * + */ + +static IOReturn ATTR_LSE2 +copyLatestInQueueMem(IOCircularDataQueue *queue, + void *data, + size_t *size) +{ + return _getLatestInQueueMemInternal(queue, &data, size, true); +} + +static IOReturn ATTR_LSE2 +_getNextInQueueMemInternal(IOCircularDataQueue *queue, + void **data, + size_t *size, + bool copyMem) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + IOCircularDataQueueMemoryCursor *cursor = &queue->queueCursor; + + IOReturn ret = kIOReturnError; + size_t inSize; + + if (queueMemory == NULL || data == NULL || size == NULL || queueHeaderShadow == NULL) { + return kIOReturnBadArgument; + } + + inSize = *size; + *size = 0; + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + if (os_unlikely(_isCursorPositionInvalid(queue))) { + return kIOReturnAborted; + } + + const size_t queueAllocMemSize = queueHeaderShadow->allocMemSize; + const size_t queueEntryDataSize = queueHeaderShadow->entryDataSize; + + IOCircularDataQueueState currState; + currState.val = atomic_load_explicit(&queueMemory->queueStateVal, memory_order_acquire); + + if (os_unlikely(currState.fields.rstStatus & IOCIRCULARDATAQUEUE_STATE_RESET_INPROGRESS)) { + // Another thread is resetting the queue + return kIOReturnBusy; + } + + uint32_t queueGeneration = currState.fields.generation; + + // was the queue reset ? + if (os_unlikely(cursor->generation != queueGeneration || cursor->sequenceNum > currState.fields.seqNum)) { + return kIOReturnAborted; + } + + if (os_unlikely(currState.fields.seqNum == UINT64_MAX)) { + // Nothing has ever been written to the queue yet. + return kIOReturnUnderrun; + } + + // nothing new written or an active write is in progress for the next entry. + if (os_unlikely(cursor->sequenceNum == currState.fields.seqNum + || ((cursor->sequenceNum + 1) == currState.fields.seqNum + && currState.fields.wrStatus == IOCIRCULARDATAQUEUE_STATE_WRITE_INPROGRESS))) { + return kIOReturnUnderrun; + } + + uint32_t nextIndex = (cursor->position + 1) % queueHeaderShadow->numEntries; + __auto_type entry + = (IOCircularDataQueueEntryHeader *)(uintptr_t)((uint8_t *)&queueMemory->entries[0] + (nextIndex * queueEntryDataSize)); + + // SANITY CHECK - Final check to ensure the 'entry' pointer is + // within the queueMemory entries buffer before we begin writing. + if (os_unlikely((uint8_t *)entry < (uint8_t *)(&queueMemory->entries[0]) + || (uint8_t *)entry >= (uint8_t *)queueMemory + queueAllocMemSize)) { + queue_debug_error("Out of Bounds! 
" QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, QUEUE_ARGS(queueMemory), + CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadArgument; + } + + os_compiler_barrier(); + + if (os_unlikely(entry->sentinel != queueHeaderShadow->sentinel)) { + queue_debug_error("entry->sentinel != queueMemory->sentinel " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadMedia; + } + + IOCircularDataQueueEntryHeaderInfo enHeaderInfo; + enHeaderInfo.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + uint32_t entryGeneration = enHeaderInfo.fields.generation; + if (os_unlikely(entryGeneration != queueGeneration)) { + queue_debug_note("entryGeneration != queueGeneration " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnAborted; + } + + // is the entry currently being written to or has the cursor fallen too far behind and the cursor is no longer + // valid. + if (os_unlikely(enHeaderInfo.fields.wrStatus == IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_INPROGRESS + || enHeaderInfo.fields.seqNum != cursor->sequenceNum + 1)) { + return kIOReturnOverrun; + } + + cursor->position = nextIndex; + cursor->generation = entryGeneration; + cursor->sequenceNum = enHeaderInfo.fields.seqNum; + + if (os_unlikely(enHeaderInfo.fields.dataSize > queueHeaderShadow->entryDataSize)) { + return kIOReturnOverrun; + } + *size = enHeaderInfo.fields.dataSize; + + if (!copyMem) { + *data = entry->data; + ret = kIOReturnSuccess; + } else { + if (os_unlikely(enHeaderInfo.fields.dataSize > inSize)) { + return kIOReturnOverrun; + } + memcpy(*data, entry->data, enHeaderInfo.fields.dataSize); + // Lets re-verify after the memcpy if the buffer is/has been overwritten. + + IOCircularDataQueueEntryHeaderInfo enHeaderInfoAfter; + enHeaderInfoAfter.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + // Did something change, while we were memcopying ? + if (enHeaderInfo.val == enHeaderInfoAfter.val) { + ret = kIOReturnSuccess; + } else { + // while we were memcopying, the writer wrapped around and is writing into our index. or the queue got reset + *size = 0; + ret = kIOReturnOverrun; + } + } + + if ((kIOReturnSuccess == ret) && os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + return ret; +} + +/*! + * @function getNextInQueueMem + * Access the data at the next cursor position and updates the cursor position to the next. No copy is made of the data. + *
Caller is supposed to call isDataEntryValidInQueueMem() to check data integrity after reading the data is
+ * complete.
+ * @param queue Handle to the queue.
+ * @param data A pointer to the data memory region for the next entry data in the queue.
+ * @param size A pointer to the size of the data parameter. On return, this contains the actual size of the data
+ * pointed to by data param.
+ * @return
+ * - `kIOReturnSuccess` if the cursor position was updated.
+ * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue.
+ * - `kIOReturnUnderrun` if the cursor has reached the latest available data.
+ * - `kIOReturnOverrun` if the entry at the cursor position is no longer in
+ * the queue's buffer. Call getLatestInQueueMem to get the latest data and cursor position.
+ * - `kIOReturnBadArgument` if an invalid argument is passed.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - Other values indicate an error.
+ *
+ */
+
+static IOReturn ATTR_LSE2
+getNextInQueueMem(IOCircularDataQueue *queue,
+    void **data,
+    size_t *size)
+{
+	return _getNextInQueueMemInternal(queue, data, size, false);
+}
+
+/*!
+ * @function copyNextInQueueMem
+ * Access the data at the next cursor position and copy it into the provided buffer. Also update the cursor position to
+ * the next. If successful, the function guarantees that the returned data is always valid, so there is no need to call
+ * isDataEntryValidInQueueMem().
+ * @param queue Handle to the queue.
+ * @param data Pointer to memory into which the next data from the queue is copied. Lifetime of this memory is
+ * controlled by the caller.
+ * @param size Size of the data buffer provided for copying. On return, this contains the actual size of the data
+ * pointed to by data param.
+ * @return
+ * - `kIOReturnSuccess` if the cursor position was updated.
+ * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue.
+ * - `kIOReturnUnderrun` if the cursor has reached the latest available data.
+ * - `kIOReturnOverrun` if the entry at the cursor position is no longer in
+ * the queue's buffer. Call getLatestInQueueMem to get the latest data and cursor position.
+ * - `kIOReturnBadArgument` if an invalid argument is passed.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - Other values indicate an error.
+ *
+ */
+
+static IOReturn ATTR_LSE2
+copyNextInQueueMem(IOCircularDataQueue *queue,
+    void *data,
+    size_t *size)
+{
+	return _getNextInQueueMemInternal(queue, &data, size, true);
+}
+
+/*!
+ * @function getPrevInQueueMem
+ * Access the data at the previous cursor position and update the cursor position to the previous. No copy is made of
+ * the data.
Caller is supposed to call isDataEntryValidInQueueMem() to check data integrity after reading the data + * is complete. + * @param queue Handle to the queue. + * @param data A pointer to the data memory region for the previous entry data in the queue. + * @param size A pointer to the size of the data parameter. On return, this contains the actual size of the data + * pointed to by data param. + * @return + * - `kIOReturnSuccess` if the cursor position was updated to the previous. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer in + * the queue's buffer. Call getLatestInQueueMem to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. + * + */ + +static IOReturn ATTR_LSE2 +_getPrevInQueueMemInternal(IOCircularDataQueue *queue, + void **data, + size_t *size, + bool copyMem) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + IOCircularDataQueueMemoryCursor *cursor = &queue->queueCursor; + size_t inSize; + + IOReturn ret = kIOReturnError; + if (queueMemory == NULL || data == NULL || size == NULL || queueHeaderShadow == NULL) { + return kIOReturnBadArgument; + } + + inSize = *size; + *size = 0; + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + if (os_unlikely(_isCursorPositionInvalid(queue))) { + return kIOReturnAborted; + } + + const size_t queueAllocMemSize = queueHeaderShadow->allocMemSize; + const size_t queueEntryDataSize = queueHeaderShadow->entryDataSize; + + IOCircularDataQueueState currState; + currState.val = atomic_load_explicit(&queueMemory->queueStateVal, memory_order_acquire); + + if (os_unlikely(currState.fields.rstStatus & IOCIRCULARDATAQUEUE_STATE_RESET_INPROGRESS)) { + // Another thread is resetting the queue + return kIOReturnBusy; + } + + uint32_t queueGeneration = currState.fields.generation; + + // was the queue reset ? + if (os_unlikely(cursor->generation != queueGeneration || cursor->sequenceNum > currState.fields.seqNum)) { + return kIOReturnAborted; + } + + if (os_unlikely(currState.fields.seqNum == UINT64_MAX)) { + // Nothing has ever been written to the queue yet. + return kIOReturnUnderrun; + } + + uint32_t prevIndex = (cursor->position == 0) ? (queueHeaderShadow->numEntries - 1) : (cursor->position - 1); + __auto_type entry + = (IOCircularDataQueueEntryHeader *)(uintptr_t)((uint8_t *)&queueMemory->entries[0] + (prevIndex * queueEntryDataSize)); + + // SANITY CHECK - Final check to ensure the 'entry' pointer is + // within the queueMemory entries buffer before we begin writing. + if (os_unlikely((uint8_t *)entry < (uint8_t *)(&queueMemory->entries[0]) + || (uint8_t *)entry >= (uint8_t *)queueMemory + queueAllocMemSize)) { + queue_debug_error("Out of Bounds! " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, QUEUE_ARGS(queueMemory), + CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadArgument; + } + + os_compiler_barrier(); + + IOCircularDataQueueEntryHeaderInfo enHeaderInfo; + enHeaderInfo.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + // is the entry currently being written to or this is the newest entry that was just written. 
+ if (os_unlikely(enHeaderInfo.fields.wrStatus == IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_INPROGRESS + || enHeaderInfo.fields.seqNum > cursor->sequenceNum)) { + return kIOReturnOverrun; + } + + uint32_t entryGeneration = enHeaderInfo.fields.generation; + if (os_unlikely(entryGeneration != queueGeneration)) { + queue_debug_note("entryGeneration != queueGeneration " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnOverrun; + } + + // the sentinel has been corrupted. + if (os_unlikely(entry->sentinel != queueHeaderShadow->sentinel)) { + queue_debug_error("entry->sentinel != queueMemory->sentinel " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadMedia; + } + + cursor->position = prevIndex; + cursor->generation = entryGeneration; + cursor->sequenceNum = enHeaderInfo.fields.seqNum; + + if (os_unlikely(enHeaderInfo.fields.dataSize > queueHeaderShadow->entryDataSize)) { + return kIOReturnOverrun; + } + *size = enHeaderInfo.fields.dataSize; + ret = kIOReturnSuccess; + + if (!copyMem) { + *data = entry->data; + } else { + if (os_unlikely(enHeaderInfo.fields.dataSize > inSize)) { + return kIOReturnOverrun; + } + memcpy(*data, entry->data, enHeaderInfo.fields.dataSize); + // Lets re-verify after the memcpy if the buffer is/has been overwritten. + + IOCircularDataQueueEntryHeaderInfo enHeaderInfoAfter; + enHeaderInfoAfter.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + // Did something change, while we were memcopying ? + if (enHeaderInfo.val != enHeaderInfoAfter.val) { + // while we were memcopying, the writer wrapped around and is writing into our index. or the queue got reset + *size = 0; + ret = kIOReturnOverrun; + } + } + + if ((kIOReturnSuccess == ret) && os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + return ret; +} + +static IOReturn ATTR_LSE2 +getPrevInQueueMem(IOCircularDataQueue *queue, + void **data, + size_t *size) +{ + return _getPrevInQueueMemInternal(queue, data, size, false); +} + +/*! + * @function copyPrevInQueueMem + * Access the data at the previous cursor position and copy into the provided buffer. Also update the cursor position to + * the previous. If successful, function gaurantees that the data returned is always valid, hence no need to call + * isDataEntryValidInQueueMem(). + * @param queue Handle to the queue. + * @param data Pointer to memory into which the previous data is copied. Lifetime of this memory is controlled by the + * caller. + * @param size Size of the data buffer provided for copying. On return, this contains the actual size of the data + * pointed to by data param. + * @return + * - `kIOReturnSuccess` if the cursor position was updated. + * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue. + * - `kIOReturnOverrun` if the entry at the cursor position is no longer in + * the queue's buffer. Call getLatestInQueueMem to get the latest data and cursor position. + * - `kIOReturnBadArgument` if an invalid argument is passsed. + * - `kIOReturnBadMedia` if the queue shared memory has been compromised. + * - Other values indicate an error. 
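+ *
+ * Sketch of walking backwards through recent entries with the public wrapper (buffer names are
+ * illustrative; the loop stops on the first non-success return, e.g. `kIOReturnOverrun`):
+ *
+ *     uint8_t buf[256];
+ *     size_t len = sizeof(buf);
+ *     while (IOCircularDataQueueCopyPrevious(queue, buf, &len) == kIOReturnSuccess) {
+ *         // ... process the older entry in 'buf' ...
+ *         len = sizeof(buf); // size is in/out, so reset it before the next call
+ *     }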
+ * + */ + +static IOReturn ATTR_LSE2 +copyPrevInQueueMem(IOCircularDataQueue *queue, + void *data, + size_t *size) +{ + return _getPrevInQueueMemInternal(queue, &data, size, true); +} + +static IOReturn ATTR_LSE2 +_getCurrentInQueueMemInternal(IOCircularDataQueue *queue, + void **data, + size_t *size, + bool copyMem) +{ + IOCircularDataQueueMemory *queueMemory = queue->queueMemory; + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + IOCircularDataQueueMemoryCursor const *cursor = &queue->queueCursor; + + size_t inSize; + + if (queueMemory == NULL || data == NULL || size == NULL || queueHeaderShadow == NULL) { + return kIOReturnBadArgument; + } + + inSize = *size; + *size = 0; + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + if (os_unlikely(_isCursorPositionInvalid(queue))) { + return kIOReturnAborted; + } + + const size_t queueAllocMemSize = queueHeaderShadow->allocMemSize; + const size_t queueEntryDataSize = queueHeaderShadow->entryDataSize; + + IOCircularDataQueueState currState; + currState.val = atomic_load_explicit(&queueMemory->queueStateVal, memory_order_acquire); + + if (os_unlikely(currState.fields.rstStatus & IOCIRCULARDATAQUEUE_STATE_RESET_INPROGRESS)) { + // Another thread is resetting the queue + return kIOReturnBusy; + } + + uint32_t queueGeneration = currState.fields.generation; + + // was the queue reset ? + if (os_unlikely(cursor->generation != queueGeneration || cursor->sequenceNum > currState.fields.seqNum)) { + return kIOReturnAborted; + } + + if (os_unlikely(currState.fields.seqNum == UINT64_MAX)) { + // Nothing has ever been written to the queue yet. + return kIOReturnUnderrun; + } + + __auto_type entry = (IOCircularDataQueueEntryHeader *)(uintptr_t)((uint8_t *)&queueMemory->entries[0] + + (cursor->position * queueEntryDataSize)); + + // SANITY CHECK - Final check to ensure the 'entry' pointer is + // within the queueMemory entries buffer before we begin writing. + if (os_unlikely((uint8_t *)entry < (uint8_t *)(&queueMemory->entries[0]) + || (uint8_t *)entry >= (uint8_t *)queueMemory + queueAllocMemSize)) { + queue_debug_error("Out of Bounds! " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, QUEUE_ARGS(queueMemory), + CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadArgument; + } + + os_compiler_barrier(); + + if (os_unlikely(entry->sentinel != queueHeaderShadow->sentinel)) { + queue_debug_error("entry->sentinel != queueMemory->sentinel " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnBadMedia; + } + + IOCircularDataQueueEntryHeaderInfo enHeaderInfo; + enHeaderInfo.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + uint32_t entryGeneration = enHeaderInfo.fields.generation; + if (os_unlikely(entryGeneration != queueGeneration)) { + queue_debug_note("entryGeneration != queueGeneration " QUEUE_FORMAT " " CURSOR_FORMAT " " ENTRY_FORMAT, + QUEUE_ARGS(queueMemory), CURSOR_ARGS(cursor), ENTRY_ARGS(entry)); + return kIOReturnAborted; + } + + // is the entry currently being written to or has the cursor fallen too far behind and the cursor is no longer + // valid. 
+ if (os_unlikely(enHeaderInfo.fields.wrStatus == IOCIRCULARDATAQUEUE_ENTRY_STATE_WRITE_INPROGRESS + || enHeaderInfo.fields.seqNum != cursor->sequenceNum)) { + return kIOReturnOverrun; + } + + if (os_unlikely(enHeaderInfo.fields.dataSize > queueHeaderShadow->entryDataSize)) { + return kIOReturnOverrun; + } + *size = enHeaderInfo.fields.dataSize; + + if (!copyMem) { + *data = entry->data; + } else { + if (os_unlikely(enHeaderInfo.fields.dataSize > inSize)) { + return kIOReturnOverrun; + } + memcpy(*data, entry->data, enHeaderInfo.fields.dataSize); + // Lets re-verify after the memcpy if the buffer is/has been overwritten. + + IOCircularDataQueueEntryHeaderInfo enHeaderInfoAfter; + enHeaderInfoAfter.val = atomic_load_explicit(&entry->headerInfoVal, memory_order_acquire); + // Did something change, while we were memcopying ? + if (enHeaderInfo.val != enHeaderInfoAfter.val) { + // while we were memcopying, the writer wrapped around and is writing into our index. or the queue got reset + *size = 0; + return kIOReturnBusy; + } + } + + if (os_unlikely(_isQueueMemoryCorrupted(queue))) { + return kIOReturnBadMedia; + } + + return kIOReturnSuccess; +} + +/*! + * @function getCurrentInQueueMem + * Access the data at the current cursor position. The cursor position is unchanged. No copy is made of the data.
+ * Caller is supposed to call isDataEntryValidInQueueMem() to check data integrity after reading the data is complete.
+ * @param queue Handle to the queue.
+ * @param data A pointer to the data memory region for the current entry data in the queue.
+ * @param size A pointer to the size of the data parameter. On return, this contains the actual size of the data
+ * pointed to by data param.
+ * @return
+ * - `kIOReturnSuccess` if the data at the current cursor position was returned.
+ * - `kIOReturnAborted` if the cursor has become invalid, possibly due to a reset of the queue.
+ * - `kIOReturnOverrun` if the entry at the cursor position is no longer in
+ * the queue's buffer. Call getLatestInQueueMem to get the latest data and cursor position.
+ * - `kIOReturnBadArgument` if an invalid argument is passed.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - Other values indicate an error.
+ *
+ */
+
+static IOReturn ATTR_LSE2
+getCurrentInQueueMem(IOCircularDataQueue *queue,
+    void **data,
+    size_t *size)
+{
+	return _getCurrentInQueueMemInternal(queue, data, size, false);
+}
+
+/*!
+ * @function copyCurrentInQueueMem
+ * Access the data at the current cursor position and copy it into the provided buffer. The cursor position is
+ * unchanged. If successful, the function guarantees that the returned data is always valid, so there is no need to call
+ * isDataEntryValidInQueueMem().
+ * @param queue Handle to the queue.
+ * @param data Pointer to memory into which the current entry's data is copied. Lifetime of this memory is controlled by
+ * the caller.
+ * @param size Size of the data buffer provided for copying. On return, this contains the actual size of the data
+ * pointed to by data param.
+ * @return
+ * - `kIOReturnSuccess` if the data at the current cursor position was copied.
+ * - `kIOReturnAborted` if the cursor has become invalid.
+ * - `kIOReturnOverrun` if the entry at the cursor position is no longer in
+ * the queue's buffer. Call getLatestInQueueMem to get the latest data and cursor position.
+ * - `kIOReturnBadArgument` if an invalid argument is passed.
+ * - `kIOReturnBadMedia` if the queue shared memory has been compromised.
+ * - Other values indicate an error.
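+ *
+ * Illustrative call through the public wrapper (`buf` and `len` are example names only):
+ *
+ *     uint8_t buf[256];
+ *     size_t len = sizeof(buf);
+ *     IOReturn rc = IOCircularDataQueueCopyCurrent(queue, buf, &len);
+ *     if (rc == kIOReturnOverrun) {
+ *         // The entry under the cursor has been overwritten; re-sync with
+ *         // IOCircularDataQueueSetCursorLatest() and read again from the newest entry.
+ *     }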
+ * + */ + +static IOReturn ATTR_LSE2 +copyCurrentInQueueMem(IOCircularDataQueue *queue, + void *data, + size_t *size) +{ + return _getCurrentInQueueMemInternal(queue, &data, size, true); +} + + +/* API */ + +static void ATTR_LSE2 +_initCursor(IOCircularDataQueue *queue) +{ + // Invalidate the cursor + IOCircularDataQueueMemoryCursor *cursor = &queue->queueCursor; + cursor->generation = UINT32_MAX; + cursor->position = UINT32_MAX; + cursor->sequenceNum = UINT64_MAX; +} + +#if KERNEL + +IOReturn ATTR_LSE2 +IOCircularDataQueueCreateWithEntries(IOCircularDataQueueCreateOptions options, uint32_t numEntries, uint32_t entrySize, IOCircularDataQueue **pQueue) +{ + IOCircularDataQueueMemory *queueMemory; + IOReturn ret; + + if (!pQueue) { + return kIOReturnBadArgument; + } + *pQueue = NULL; + if (!numEntries || !entrySize) { + return kIOReturnBadArgument; + } + + uint64_t sentinel = 0xA5A5A5A5A5A5A5A5; + +#if HEADER_16BYTE_ALIGNED + size_t entryRoundedDataSize = IORound(entrySize, sizeof(__uint128_t)); +#else + size_t entryRoundedDataSize = IORound(entrySize, sizeof(UInt64)); +#endif + size_t entryDataSize = entryRoundedDataSize + CIRCULAR_DATA_QUEUE_ENTRY_HEADER_SIZE; + size_t entriesSize = numEntries * (entryDataSize); + size_t totalSize = entriesSize + CIRCULAR_DATA_QUEUE_MEMORY_HEADER_SIZE; + + if (os_unlikely(numEntries > UINT32_MAX - 1 + || entryRoundedDataSize > (UINT32_MAX - sizeof(IOCircularDataQueueEntryHeader)) + || entryDataSize > UINT32_MAX || totalSize > UINT32_MAX)) { + return kIOReturnBadArgument; + } + + IOCircularDataQueue *queue = IONew(IOCircularDataQueue, 1); + if (!queue) { + return kIOReturnNoMemory; + } + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + + OSData * desc; + queue->iomd = IOBufferMemoryDescriptor::inTaskWithOptions( + kernel_task, kIOMemoryDirectionOutIn | kIOMemoryKernelUserShared, totalSize, page_size); + if (os_unlikely(queue->iomd == NULL)) { + ret = kIOReturnNoMemory; + goto error; + } + queueMemory = (IOCircularDataQueueMemory *)queue->iomd->getBytesNoCopy(); + queue->queueMemory = queueMemory; + queueMemory->sentinel = queueHeaderShadow->sentinel = sentinel; + + queueHeaderShadow->allocMemSize = (uint32_t)totalSize; + queueHeaderShadow->entryDataSize + = (uint32_t)entryDataSize; // totalSize check above gaurantess this will not overflow UINT32_MAX. + queueHeaderShadow->numEntries = numEntries; + queueHeaderShadow->dataSize = entrySize; // the client requested fixed entry size. 
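+	/*
+	 * Illustrative numbers for the sizing recorded above (example values only): with numEntries = 8
+	 * and entrySize = 20, the payload is rounded up to the 8- or 16-byte boundary selected at compile
+	 * time, entryDataSize adds CIRCULAR_DATA_QUEUE_ENTRY_HEADER_SIZE on top of that, entriesSize is
+	 * numEntries * entryDataSize, and totalSize adds CIRCULAR_DATA_QUEUE_MEMORY_HEADER_SIZE.
+	 */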
+ queueHeaderShadow->memorySize = (uint32_t)entriesSize; + + desc = OSData::withBytes(queueHeaderShadow, sizeof(*queueHeaderShadow)); + queue->iomd->setSharingContext(kIOCircularQueueDescriptionKey, desc); + + IOCircularDataQueueState newState; + newState.val = 0; + newState.fields.seqNum = UINT64_MAX; + atomic_store_explicit(&queueMemory->queueStateVal, newState.val, memory_order_release); + + ret = _reset(queue); + if (ret != kIOReturnSuccess) { + goto error; + } + + _initCursor(queue); + *pQueue = queue; + return kIOReturnSuccess; + + +error: + IOCircularDataQueueDestroy(&queue); + return ret; +} + +IOMemoryDescriptor * ATTR_LSE2 +IOCircularDataQueueCopyMemoryDescriptor(IOCircularDataQueue *queue) +{ + IOMemoryDescriptor * md; + md = queue->iomd; + if (md) { + md->retain(); + } + return md; +} + +#else /* KERNEL */ + +#if defined(__arm64__) && defined(__LP64__) +#include +#endif /* defined(__arm64__) */ + +IOReturn ATTR_LSE2 +IOCircularDataQueueCreateWithConnection(IOCircularDataQueueCreateOptions options, io_connect_t connect, uint32_t memoryType, IOCircularDataQueue **pQueue) +{ + if (!pQueue) { + return kIOReturnBadArgument; + } + *pQueue = NULL; + +#if defined(__arm64__) && defined(__LP64__) + if (0 == (kHasFeatLSE2 & _get_cpu_capabilities())) { + return kIOReturnUnsupported; + } +#else + return kIOReturnUnsupported; +#endif /* defined(__arm64__) */ + + uint64_t sentinel = 0xA5A5A5A5A5A5A5A5; + + IOCircularDataQueue *queue = IONew(IOCircularDataQueue, 1); + if (!queue) { + return kIOReturnNoMemory; + } + IOCircularDataQueueDescription *queueHeaderShadow = &queue->queueHeaderShadow; + + queue->connect = connect; + queue->memoryType = memoryType; + + io_struct_inband_t inband_output; + mach_msg_type_number_t inband_outputCnt; + mach_vm_address_t map_address; + mach_vm_size_t map_size; + IOReturn ret; + + inband_outputCnt = sizeof(inband_output); + + ret = io_connect_map_shared_memory(connect, memoryType, mach_task_self(), + &map_address, &map_size, + /* flags */ 0, + (char *) kIOCircularQueueDescriptionKey, + inband_output, + &inband_outputCnt); + + printf("%x, %lx, 0x%llx, 0x%llx\n", inband_outputCnt, sizeof(IOCircularDataQueueDescription), map_address, map_size); + + assert(sizeof(IOCircularDataQueueDescription) == inband_outputCnt); + memcpy(queueHeaderShadow, inband_output, sizeof(IOCircularDataQueueDescription)); + printf("sentinel %qx\n", queueHeaderShadow->sentinel); + assert(queueHeaderShadow->allocMemSize == map_size); + queue->queueMemory = (IOCircularDataQueueMemory *) map_address; + + if (!isQueueMemoryValid(queue)) { + IOCircularDataQueueDestroy(&queue); + return kIOReturnBadArgument; + } + + _initCursor(queue); + *pQueue = queue; + + return ret; +} + +#endif /* !KERNEL */ + +IOReturn ATTR_LSE2 +IOCircularDataQueueDestroy(IOCircularDataQueue **pQueue) +{ + IOCircularDataQueue * queue; + IOReturn ret = kIOReturnSuccess; + + if (!pQueue) { + return kIOReturnBadArgument; + } + queue = *pQueue; + if (queue) { + ret = destroyQueueMem(queue); + IODelete(queue, IOCircularDataQueue, 1); + *pQueue = NULL; + } + return ret; +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueEnqueue(IOCircularDataQueue *queue, const void *data, size_t dataSize) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return enqueueQueueMem(queue, data, dataSize); +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueGetLatest(IOCircularDataQueue *queue, void **data, size_t *size) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return getLatestInQueueMem(queue, data, size); +} + +IOReturn ATTR_LSE2 
+IOCircularDataQueueCopyLatest(IOCircularDataQueue *queue, void *data, size_t *size) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return copyLatestInQueueMem(queue, data, size); +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueGetNext(IOCircularDataQueue *queue, void **data, size_t *size) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return getNextInQueueMem(queue, data, size); +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueCopyNext(IOCircularDataQueue *queue, void *data, size_t *size) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return copyNextInQueueMem(queue, data, size); +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueGetPrevious(IOCircularDataQueue *queue, void **data, size_t *size) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return getPrevInQueueMem(queue, data, size); +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueCopyPrevious(IOCircularDataQueue *queue, void *data, size_t *size) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return copyPrevInQueueMem(queue, data, size); +} + +// IOReturn +//IOCircularDataQueueGetLatestWithBlock(IOCircularDataQueue *queue, void (^handler)(void * data, size_t size)) +//{ +// if (!queue) { +// return kIOReturnBadArgument; +// } +// +//// return getPrevInQueueMem(queue->queueMemory, (IOCircularDataQueueDescription *) +///&queue->queueHeaderShadow, (IOCircularDataQueueMemoryCursor *) &queue->queueCursor, data, size); +//} +// + +IOReturn ATTR_LSE2 +IOCircularDataQueueIsCurrentDataValid(IOCircularDataQueue *queue) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return isDataEntryValidInQueueMem(queue); +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueSetCursorLatest(IOCircularDataQueue *queue) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return setCursorLatestInQueueMem(queue); +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueGetCurrent(IOCircularDataQueue *queue, void **data, size_t *size) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return getCurrentInQueueMem(queue, data, size); +} + +IOReturn ATTR_LSE2 +IOCircularDataQueueCopyCurrent(IOCircularDataQueue *queue, void *data, size_t *size) +{ + if (!queue) { + return kIOReturnBadArgument; + } + + return copyCurrentInQueueMem(queue, data, size); +} + +__END_DECLS diff --git a/iokit/IOKit/IOHibernatePrivate.h b/iokit/IOKit/IOHibernatePrivate.h index da70d3ef0..84d2d5d22 100644 --- a/iokit/IOKit/IOHibernatePrivate.h +++ b/iokit/IOKit/IOHibernatePrivate.h @@ -110,6 +110,10 @@ typedef struct { uint64_t dram_base; uint64_t dram_size; + /* Start and end of managed memory. */ + uint64_t managed_phys_start; + uint64_t managed_phys_end; + /** * Starting physical address of the Device Tree. 
* @@ -603,11 +607,6 @@ hibernate_pin_swap(boolean_t begin); kern_return_t hibernate_processor_setup(IOHibernateImageHeader * header); -void -hibernate_gobble_pages(uint32_t gobble_count, uint32_t free_page_time); -void -hibernate_free_gobble_pages(void); - void hibernate_vm_lock_queues(void); void diff --git a/iokit/IOKit/IOKitDebug.h b/iokit/IOKit/IOKitDebug.h index b2425dcd9..eefb3c49d 100644 --- a/iokit/IOKit/IOKitDebug.h +++ b/iokit/IOKit/IOKitDebug.h @@ -113,6 +113,7 @@ enum { | kIOSleepWakeWdogOff | kIOKextSpinDump | kIOWaitQuietPanics + | kIOLogExclaves }; enum { @@ -143,13 +144,14 @@ enum { kIODKDisableCDHashChecking = 0x00004000ULL, kIODKDisableEntitlementChecking = 0x00008000ULL, kIODKDisableCheckInTokenVerification = 0x00010000ULL, + kIODKDisableIOPMSystemOffPhase2Allow = 0x00020000ULL, }; #if XNU_KERNEL_PRIVATE #define DKLOG(fmt, args...) { IOLog("DK: " fmt, ## args); } #define DKS "%s-0x%qx" -#define DKN(s) s->getName(), s->getRegistryEntryID() +#define DKN(s) s ? s->getName() : "NO-NAME", s ? s->getRegistryEntryID() : UINT64_MAX #ifdef IOKITDEBUG #define DEBUG_INIT_VALUE IOKITDEBUG diff --git a/iokit/IOKit/IOKitKeysPrivate.h b/iokit/IOKit/IOKitKeysPrivate.h index 962e8ba85..6cc73fa05 100644 --- a/iokit/IOKit/IOKitKeysPrivate.h +++ b/iokit/IOKit/IOKitKeysPrivate.h @@ -125,6 +125,11 @@ enum { #define kIOWaitQuietPanicsEntitlement "com.apple.private.security.waitquiet-panics" #define kIOSystemStateEntitlement "com.apple.private.iokit.systemstate" +#define kIOMemoryDescriptorSharingContextKey "IOMemoryDescriptorSharingContext" + +// Entitlement allows io_connect_map_shared_memory to map writable in user space +#define kIOMapSharedMemoryWritableEntitlement "com.apple.private.iokit.sharedmemory.writable" + // Entitlement allows a DK driver to publish services to other dexts, using the // standard IOKit registerService() or DriverKit RegisterService() api. // Those client dexts must have an entitlement specified by the @@ -153,6 +158,8 @@ enum { #define kIOExclaveAssignedKey "exclave-assigned" #define kIOExclaveProxyKey "IOExclaveProxy" +#define kIOPMAOTAllowKey "IOPMAOTAllow" +#define kIOPMSystemOffPhase2AllowKey "IOPMSystemOffPhase2Allow" // IONVRAMSystemVariableList: // "one-time-boot-command" - Needed for diags customer install flows @@ -183,4 +190,7 @@ enum { "SystemAudioVolumeSaved" +// Uniform Type Identifiers supported by a service +#define kIOUniformTypeIdentifiersKey "UniformTypeIdentifiers" + #endif /* ! 
_IOKIT_IOKITKEYSPRIVATE_H */ diff --git a/iokit/IOKit/IOKitServer.h b/iokit/IOKit/IOKitServer.h index 9252998d0..a2b2fd88c 100644 --- a/iokit/IOKit/IOKitServer.h +++ b/iokit/IOKit/IOKitServer.h @@ -121,6 +121,9 @@ extern "C" { #endif /* __cplusplus */ #include +#if MACH_KERNEL_PRIVATE +#include +#endif /* MACH_KERNEL_PRIVATE */ /* * Functions in iokit:IOUserClient.cpp @@ -133,13 +136,15 @@ typedef IOMachPort * io_kobject_t; typedef struct IOMachPort * io_kobject_t; #endif -extern void iokit_add_reference( io_object_t obj, ipc_kobject_type_t type ); +extern void iokit_add_reference( io_object_t obj ); -extern ipc_port_t iokit_port_for_object(io_object_t obj, - ipc_kobject_type_t type, ipc_kobject_t * kobj); +extern ipc_port_t iokit_port_make_send_for_object( io_object_t obj, + ipc_kobject_type_t type ); -extern kern_return_t iokit_client_died( io_object_t obj, - ipc_port_t port, ipc_kobject_type_t type, mach_port_mscount_t * mscount ); +extern void iokit_ident_no_senders( ipc_port_t port, mach_port_mscount_t mscount ); +extern void iokit_object_no_senders( ipc_port_t port, mach_port_mscount_t mscount ); +extern void iokit_connect_no_senders( ipc_port_t port, mach_port_mscount_t mscount ); +extern void iokit_uext_no_senders( ipc_port_t port, mach_port_mscount_t mscount ); extern kern_return_t iokit_client_memory_for_type( @@ -149,28 +154,40 @@ iokit_client_memory_for_type( vm_address_t * address, vm_size_t * size ); +/* + * Re-externs from and for iokit/... + * + * Note: these are safe because IOKitServer.h is used from osfmk/... + * context and will fail to build if they diverge. + */ + +extern mach_port_t ipc_port_make_send_mqueue(mach_port_t) __result_use_check; +extern mach_port_t ipc_port_copy_send_mqueue(mach_port_t) __result_use_check; +extern void ipc_port_release_send(ipc_port_t port); + +extern bool ipc_kobject_is_mscount_current_locked(ipc_port_t port, mach_port_mscount_t mscount); +extern ipc_kobject_t ipc_kobject_get_locked(ipc_port_t port, ipc_kobject_type_t type); +extern void ipc_kobject_enable(ipc_port_t, ipc_kobject_t, ipc_kobject_type_t); +extern ipc_kobject_t ipc_kobject_disable(ipc_port_t, ipc_kobject_type_t); +extern mach_port_t ipc_kobject_make_send(mach_port_t, ipc_kobject_t, ipc_kobject_type_t) __result_use_check; +extern mach_port_t ipc_kobject_copy_send(mach_port_t, ipc_kobject_t, ipc_kobject_type_t) __result_use_check; + /* * Functions in osfmk:iokit_rpc.c */ +extern void iokit_lock_port(ipc_port_t port); +extern void iokit_unlock_port(ipc_port_t port); + extern ipc_port_t iokit_alloc_object_port( io_kobject_t obj, ipc_kobject_type_t type ); -extern void iokit_remove_object_port( ipc_port_t port, - ipc_kobject_type_t type ); -extern kern_return_t iokit_destroy_object_port( ipc_port_t port, - ipc_kobject_type_t type ); +extern void iokit_destroy_object_port( ipc_port_t port, ipc_kobject_type_t type ); extern ipc_kobject_type_t iokit_port_type(ipc_port_t port); extern mach_port_name_t iokit_make_send_right( task_t task, io_object_t obj, ipc_kobject_type_t type ); -extern mach_port_t ipc_kobject_make_send(mach_port_t, ipc_kobject_t, ipc_kobject_type_t) __result_use_check; -extern mach_port_t ipc_kobject_copy_send(mach_port_t, ipc_kobject_t, ipc_kobject_type_t) __result_use_check; -extern mach_port_t ipc_port_make_send_mqueue(mach_port_t) __result_use_check; -extern mach_port_t ipc_port_copy_send_mqueue(mach_port_t) __result_use_check; -extern void ipc_port_release_send(ipc_port_t port); - extern io_object_t iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t 
type); extern kern_return_t iokit_mod_send_right( task_t task, mach_port_name_t name, mach_port_delta_t delta ); @@ -180,13 +197,8 @@ extern io_object_t iokit_lookup_object_with_port_name(mach_port_name_t name, ipc extern io_object_t iokit_lookup_connect_ref_current_task(mach_port_name_t name); extern io_object_t iokit_lookup_uext_ref_current_task(mach_port_name_t name); -extern void iokit_retain_port( ipc_port_t port ); -extern void iokit_release_port( ipc_port_t port ); extern void iokit_release_port_send( ipc_port_t port ); -extern void iokit_lock_port(ipc_port_t port); -extern void iokit_unlock_port(ipc_port_t port); - extern kern_return_t iokit_lookup_raw_current_task(mach_port_name_t name, ipc_kobject_type_t type, ipc_port_t *port); #ifndef MACH_KERNEL_PRIVATE diff --git a/iokit/IOKit/IOLib.h b/iokit/IOKit/IOLib.h index 559807e81..c8e3fef6b 100644 --- a/iokit/IOKit/IOLib.h +++ b/iokit/IOKit/IOLib.h @@ -208,7 +208,7 @@ __IOMallocAligned_internal( } #define IOMallocAligned(size, alignment) \ - __IOMallocAligned_internal(KHEAP_DATA_BUFFERS, size, alignment, Z_WAITOK) + __IOMallocAligned_internal(GET_KEXT_KHEAP_DATA(), size, alignment, Z_WAITOK) #else /* XNU_KERNEL_PRIVATE */ @@ -288,8 +288,11 @@ void IOFreePageable(void * address, vm_size_t size); #if XNU_KERNEL_PRIVATE -#define IOMallocData(size) __IOMalloc_internal(KHEAP_DATA_BUFFERS, size, Z_WAITOK) -#define IOMallocZeroData(size) __IOMalloc_internal(KHEAP_DATA_BUFFERS, size, Z_ZERO) +#define IOMallocData(size) __IOMalloc_internal(GET_KEXT_KHEAP_DATA(), size, Z_WAITOK) +#define IOMallocZeroData(size) __IOMalloc_internal(GET_KEXT_KHEAP_DATA(), size, Z_ZERO) + +#define IOMallocDataSharable(size) __IOMalloc_internal(KHEAP_DATA_SHARED, size, Z_WAITOK) +#define IOMallocZeroDataSharable() __IOMalloc_internal(KHEAP_DATA_SHARED, size, Z_ZERO) #else /* XNU_KERNEL_PRIVATE */ @@ -307,6 +310,20 @@ void * IOMallocData(vm_size_t size) __attribute__((alloc_size(1))); * @result Pointer to the allocated memory, or zero on failure. */ void * IOMallocZeroData(vm_size_t size) __attribute__((alloc_size(1))); +/*! @function IOMallocDataSharable + * @abstract Allocates wired memory in the kernel map, from a separate section meant for pure data that meant to be shared. + * @discussion Same as IOMalloc except that this function should be used for allocating pure data. + * @param size Size of the memory requested. + * @result Pointer to the allocated memory, or zero on failure. */ +void * IOMallocDataSharable(vm_size_t size) __attribute__((alloc_size(1))); + +/*! @function IOMallocZeroDataSharable + * @abstract Allocates wired memory in the kernel map, from a separate section meant for pure data bytes that don't contain pointers and meant to be shared. + * @discussion Same as IOMallocDataSharable except that the memory returned is zeroed. + * @param size Size of the memory requested. + * @result Pointer to the allocated memory, or zero on failure. */ +void * IOMallocZeroDataSharable(vm_size_t size) __attribute__((alloc_size(1))); + #endif /* !XNU_KERNEL_PRIVATE */ /*! @function IOFreeData @@ -316,6 +333,13 @@ void * IOMallocZeroData(vm_size_t size) __attribute__((alloc_size(1))); * @param size Size of the memory allocated. It is acceptable to pass 0 size for a NULL address. */ void IOFreeData(void * address, vm_size_t size); +/*! @function IOFreeDataSharable + * @abstract Frees memory allocated with IOMallocDataSharable or IOMallocZeroDataSharable. 
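+ *
+ * A minimal allocate/free pairing sketch (the size and variable names are illustrative only):
+ *
+ *     size_t sz = 4096;
+ *     void *shared = IOMallocDataSharable(sz);
+ *     if (shared != NULL) {
+ *         // ... fill the shared data buffer ...
+ *         IOFreeDataSharable(shared, sz);
+ *     }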
+ * @discussion This function frees memory allocated with IOMallocDataSharable/IOMallocZeroDataSharable, it may block and so should not be called from interrupt level or while a simple lock is held. + * @param address Virtual address of the allocated memory. Passing NULL here is acceptable. + * @param size Size of the memory allocated. It is acceptable to pass 0 size for a NULL address. */ +void IOFreeDataSharable(void * address, vm_size_t size); + #define IONewData(type, count) \ ((type *)IOMallocData(IOMallocArraySize(0, sizeof(type), count))) diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h index 17b16acbf..f4bfd17e9 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -306,7 +306,7 @@ public: vm_tag_t _kernelTag; vm_tag_t _userTag; int16_t _dmaReferences; - uint16_t _internalFlags; + uint16_t _internalIOMDFlags; kern_allocation_name_t _mapName; protected: #else /* XNU_KERNEL_PRIVATE */ @@ -391,6 +391,20 @@ public: IOReturn getPageCounts( IOByteCount * residentPageCount, IOByteCount * dirtyPageCount); +#if KERNEL_PRIVATE +#define IOMEMORYDESCRIPTOR_GETPAGECOUNTS_SUPPORTS_SWAPPED 1 +#endif +/*! @function getPageCounts + * @abstract Retrieve the number of resident, dirty, and swapped pages encompassed by an IOMemoryDescriptor. + * @param residentPageCount - If non-null, a pointer to a byte count that will return the number of resident pages encompassed by this IOMemoryDescriptor. + * @param dirtyPageCount - If non-null, a pointer to a byte count that will return the number of resident, dirty pages encompassed by this IOMemoryDescriptor. + * @param swappedPageCount - If non-null, a pointer to a byte count that will return the number of swapped pages encompassed by this IOMemoryDescriptor. + * @result An IOReturn code. */ + + IOReturn getPageCounts( IOByteCount * residentPageCount, + IOByteCount * dirtyPageCount, + IOByteCount * swappedPageCount ); + /*! @function performOperation * @abstract Perform an operation on the memory descriptor's memory. * @discussion This method performs some operation on a range of the memory descriptor's memory. When a memory descriptor's memory is not mapped, it should be more efficient to use this method than mapping the memory to perform the operation virtually. @@ -846,13 +860,25 @@ public: * @discussion This method returns the context for the memory descriptor. The context is not interpreted by IOMemoryDescriptor. * @result The context, returned with an additional retain to be released by the caller. */ OSObject * copyContext(void) const; +#ifdef XNU_KERNEL_PRIVATE + OSObject * copyContext(const OSSymbol * key) const; + OSObject * copyContext(const char * key) const; + OSObject * copySharingContext(const char * key) const; +#endif /* XNU_KERNEL_PRIVATE */ /*! @function setContext * @abstract Set a context object for the memory descriptor. The context is not interpreted by IOMemoryDescriptor. * @discussion The context is retained, and will be released when the memory descriptor is freed or when a new context object is set. 
*/ void setContext(OSObject * context); -#endif +#ifdef XNU_KERNEL_PRIVATE + void setContext(const OSSymbol * key, OSObject * context); + void setContext(const char * key, OSObject * context); + void setSharingContext(const char * key, OSObject * context); + bool hasSharingContext(void); + +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* KERNEL_PRIVATE */ protected: virtual void addMapping( @@ -1155,7 +1181,8 @@ public: static IOReturn memoryReferenceGetPageCounts( IOMemoryReference * ref, IOByteCount * residentPageCount, - IOByteCount * dirtyPageCount); + IOByteCount * dirtyPageCount, + IOByteCount * swappedPageCount); static uint64_t memoryReferenceGetDMAMapLength( IOMemoryReference * ref, diff --git a/iokit/IOKit/IOMultiMemoryDescriptor.h b/iokit/IOKit/IOMultiMemoryDescriptor.h index 4a56dbe8a..1912f9509 100644 --- a/iokit/IOKit/IOMultiMemoryDescriptor.h +++ b/iokit/IOKit/IOMultiMemoryDescriptor.h @@ -32,6 +32,11 @@ #include #include +#if KERNEL_PRIVATE +#define IOMULTIMEMORYDESCRIPTOR_EXPORTED 1 +#endif /* KERNEL_PRIVATE */ + + /*! @class IOMultiMemoryDescriptor : public IOMemoryDescriptor * @abstract The IOMultiMemoryDescriptor object describes a memory area made up of several other IOMemoryDescriptors. * @discussion The IOMultiMemoryDescriptor object represents multiple ranges of memory, specified as an ordered list of IOMemoryDescriptors. The descriptors are chained end-to-end to make up a single contiguous buffer. */ @@ -121,6 +126,17 @@ public: IOReturn getPageCounts(IOByteCount * residentPageCount, IOByteCount * dirtyPageCount); +/*! @function getPageCounts + * @abstract Retrieve the number of resident, dirty, and swapped pages encompassed by an IOMemoryDescriptor. + * @param residentPageCount - If non-null, a pointer to a byte count that will return the number of resident pages encompassed by this IOMemoryDescriptor. + * @param dirtyPageCount - If non-null, a pointer to a byte count that will return the number of resident, dirty pages encompassed by this IOMemoryDescriptor. + * @param swappedPageCount - If non-null, a pointer to a byte count that will return the number of swapped pages encompassed by this IOMemoryDescriptor. + * @result An IOReturn code. 
*/ + + IOReturn getPageCounts( IOByteCount * residentPageCount, + IOByteCount * dirtyPageCount, + IOByteCount * swappedPageCount ); + virtual uint64_t getPreparationID( void ) APPLE_KEXT_OVERRIDE; #define IOMULTIMEMORYDESCRIPTOR_SUPPORTS_GETPAGECOUNTS 1 diff --git a/iokit/IOKit/IONVRAM.h b/iokit/IOKit/IONVRAM.h index 7c89293a2..6a5a56dff 100644 --- a/iokit/IOKit/IONVRAM.h +++ b/iokit/IOKit/IONVRAM.h @@ -111,7 +111,7 @@ private: IOReturn syncInternal(bool rateLimit); bool safeToSync(void); - + IOReturn clearTestVars(const uuid_t guid); public: virtual bool init(IORegistryEntry *old, const IORegistryPlane *plane) APPLE_KEXT_OVERRIDE; virtual bool start(IOService * provider) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IOPolledInterface.h b/iokit/IOKit/IOPolledInterface.h index 45ffe050d..9fefb9a91 100644 --- a/iokit/IOKit/IOPolledInterface.h +++ b/iokit/IOKit/IOPolledInterface.h @@ -257,7 +257,7 @@ kern_open_file_for_direct_io(const char * name, void kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, off_t write_offset, void * addr, size_t write_length, - off_t discard_offset, off_t discard_end, bool unlink); + off_t discard_offset, off_t discard_end, off_t set_file_size, bool unlink); int kern_write_file(struct kern_direct_file_io_ref_t * ref, off_t offset, void * addr, size_t len, int ioflag); int diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h index 876028365..5ea49a619 100644 --- a/iokit/IOKit/IOService.h +++ b/iokit/IOKit/IOService.h @@ -1676,11 +1676,12 @@ public: static void iokitDaemonLaunched(); void resetRematchProperties(); bool hasUserServer() const; - static void userSpaceWillReboot(); + static void setWillUserspaceReboot(); + static bool getWillUserspaceReboot(); static void userSpaceDidReboot(); kern_return_t CopyProperties_Local(OSDictionary ** properties); - IOStateNotificationItem * stateNotificationItemCopy(OSString * itemName, OSDictionary * schema); + IOStateNotificationItem * stateNotificationItemCopy(OSString * itemName, OSDictionary * initialValue); kern_return_t stateNotificationListenerAdd(OSArray * items, IOStateNotificationListenerRef * outRef, IOStateNotificationHandler handler); @@ -2096,6 +2097,13 @@ public: UInt32 getPowerState( void ); +/*! @function getDesiredPowerState + * @abstract Determines a device's desired power state. + * @discussion A device's "desired power state" is updated at the start of each power state transition (e.g. transition from state 1 to state 0, or state 0 to state 2). + * @result The desired power state's index into the device's power state array. */ + + UInt32 getDesiredPowerState( void ); + /*! @function setPowerState * @abstract Requests a power managed driver to change the power state of its device. * @discussion A power managed driver must override setPowerState to take part in system power management. After a driver is registered with power management, the system uses setPowerState to power the device off and on for system sleep and wake. 
@@ -2255,6 +2263,7 @@ public: bool getBlockingDriverCall(thread_t *thread, const void **callMethod); void cancelIdlePowerDown(IOService * service); void cancelIdlePowerDownSync( void ); + bool currentOrPendingPowerState(uint32_t state); protected: bool tellClientsWithResponse( int messageType ); @@ -2388,6 +2397,10 @@ private: IOReturn configureSimplePowerReport(IOReportConfigureAction action, void *result ); IOReturn updateSimplePowerReport( IOReportConfigureAction action, void *result, void *destination ); void waitForPMDriverCall( IOService * target = NULL ); + void addPMDriverClass(uint64_t driverClass); +#if DEBUG || DEVELOPMENT + void __patchProperties(void); +#endif friend class IOUserServer; #endif /* XNU_KERNEL_PRIVATE */ diff --git a/iokit/IOKit/IOSubMemoryDescriptor.h b/iokit/IOKit/IOSubMemoryDescriptor.h index fae29ba4e..48e5ccf22 100644 --- a/iokit/IOKit/IOSubMemoryDescriptor.h +++ b/iokit/IOKit/IOSubMemoryDescriptor.h @@ -119,6 +119,17 @@ public: IOReturn getPageCounts(IOByteCount * residentPageCount, IOByteCount * dirtyPageCount); + +/*! @function getPageCounts + * @abstract Retrieve the number of resident, dirty, and swapped pages encompassed by an IOMemoryDescriptor. + * @param residentPageCount - If non-null, a pointer to a byte count that will return the number of resident pages encompassed by this IOMemoryDescriptor. + * @param dirtyPageCount - If non-null, a pointer to a byte count that will return the number of resident, dirty pages encompassed by this IOMemoryDescriptor. + * @param swappedPageCount - If non-null, a pointer to a byte count that will return the number of swapped pages encompassed by this IOMemoryDescriptor. + * @result An IOReturn code. */ + + IOReturn getPageCounts( IOByteCount * residentPageCount, + IOByteCount * dirtyPageCount, + IOByteCount * swappedPageCount ); }; #endif /* !_IOSUBMEMORYDESCRIPTOR_H */ diff --git a/iokit/IOKit/IOUserServer.h b/iokit/IOKit/IOUserServer.h index b678c328c..7844db7c5 100644 --- a/iokit/IOKit/IOUserServer.h +++ b/iokit/IOKit/IOUserServer.h @@ -104,6 +104,7 @@ typedef uint64_t IOTrapMessageBuffer[256]; #include #include #include +#include #include class IOUserServer; class OSUserMetaClass; @@ -125,8 +126,10 @@ struct OSObjectUserVars { bool willTerminate; bool didTerminate; bool serverDied; + bool instantiated; bool started; bool stopped; + bool needStop; bool userServerPM; bool willPower; bool powerState; @@ -134,6 +137,9 @@ struct OSObjectUserVars { bool deferredRegisterService; uint32_t powerOverride; IOLock * uvarsLock; + OSDictionary * originalProperties; + OSArray * pmAssertions; + OSArray * pmAssertionsSynced; }; extern IOLock * gIOUserServerLock; @@ -178,6 +184,10 @@ public: kern_allocation_name_t fAllocationName; task_t fOwningTask; os_reason_t fTaskCrashReason; + bool fPageout; + bool fSuspended; + bool fAOTAllow; + bool fSystemOffPhase2Allow; public: @@ -213,12 +223,15 @@ public: IOReturn serviceClose(IOService * provider, IOService * client); IOReturn serviceJoinPMTree(IOService * service); IOReturn serviceSetPowerState(IOService * controllingDriver, IOService * service, IOPMPowerFlags flags, IOPMPowerStateIndex powerState); + IOReturn serviceCreatePMAssertion(IOService * service, uint32_t assertionBits, uint64_t * assertionID, bool synced); + IOReturn serviceReleasePMAssertion(IOService * service, uint64_t assertionID); IOReturn serviceNewUserClient(IOService * service, task_t owningTask, void * securityID, uint32_t type, OSDictionary * properties, IOUserClient ** handler); IOReturn 
serviceNewUserClient(IOService * service, task_t owningTask, void * securityID, uint32_t type, OSDictionary * properties, OSSharedPtr& handler); IOReturn exit(const char * reason); IOReturn kill(const char * reason); + void serverAck(void); bool serviceMatchesCheckInToken(IOUserServerCheckInToken *token); bool checkEntitlements(IOService * provider, IOService * dext); @@ -231,8 +244,9 @@ public: void setDriverKitUUID(OSKext *kext); void setDriverKitStatistics(OSKext *kext); IOReturn setCheckInToken(IOUserServerCheckInToken *token); - void systemPower(bool powerOff, bool hibernate); - void systemHalt(int howto); + void systemPower(uint8_t systemState, bool hibernate); + void systemSuspend(void); + void systemHalt(int howto); static void powerSourceChanged(bool acAttached); bool checkPMReady(); @@ -249,8 +263,8 @@ public: OSObjectUserVars * varsForObject(OSObject * obj); LIBKERN_RETURNS_NOT_RETAINED IODispatchQueue * queueForObject(OSObject * obj, uint64_t msgid); - static ipc_port_t copySendRightForObject(OSObject * object, natural_t /* ipc_kobject_type_t */ type); - static OSObject * copyObjectForSendRight(ipc_port_t port, natural_t /* ipc_kobject_type_t */ type); + static ipc_port_t copySendRightForObject(OSObject * object, ipc_kobject_type_t type); + static OSObject * copyObjectForSendRight(ipc_port_t port, ipc_kobject_type_t type); IOReturn copyOutObjects(IORPCMessageMach * mach, IORPCMessage * message, size_t size, bool consume); @@ -270,6 +284,7 @@ public: static void beginLeakingObjects(); bool isPlatformDriver(); int getCSValidationCategory(); + void pageout(); }; typedef void (*IOUserServerCheckInCancellationHandler)(class IOUserServerCheckInToken*, void*); @@ -340,7 +355,6 @@ private: private: IOUserServerCheckInToken::State fState; - size_t fPendingCount; const OSSymbol * fServerName; const OSSymbol * fExecutableName; OSNumber * fServerTag; diff --git a/iokit/IOKit/Makefile b/iokit/IOKit/Makefile index fb12bf3b1..43245e297 100644 --- a/iokit/IOKit/Makefile +++ b/iokit/IOKit/Makefile @@ -53,7 +53,8 @@ INSTALL_IF_MI_LCL_LIST += \ IOLocksPrivate.h IOStatistics.h \ AppleKeyStoreInterface.h \ IOReportTypes.h IOKernelReportStructs.h \ - IOReportMacros.h IOInterruptAccounting.h + IOReportMacros.h IOInterruptAccounting.h \ + IOCircularDataQueue.h IOCircularDataQueueImplementation.h INSTALL_MI_DIR = . diff --git a/iokit/IOKit/perfcontrol/IOPerfControl.h b/iokit/IOKit/perfcontrol/IOPerfControl.h index c0bcb2d3b..1385c5b03 100644 --- a/iokit/IOKit/perfcontrol/IOPerfControl.h +++ b/iokit/IOKit/perfcontrol/IOPerfControl.h @@ -42,6 +42,25 @@ public: */ static IOPerfControlClient *copyClient(IOService *driver, uint64_t maxWorkCapacity); + __enum_decl(IOPCDeviceType, uint8_t, { + IOPCDeviceTypeUnknown = 0x0, + IOPCDeviceTypeGPU = 0x1, + IOPCDeviceTypeANE = 0x2, + IOPCDeviceTypeMSR = 0x3, + IOPCDeviceTypeStorage = 0x4, + IOPCDeviceTypeMax = 0x5, + }); +/*! + * @function copyClientForDeviceType + * @abstract Return a retained reference to a client object, to be released by the driver. It may be + * shared with other drivers in the system. + * @param driver The device driver that will be using this interface. + * @param maxWorkCapacity The maximum number of concurrent work items supported by the device driver. + * @param deviceType The type of device that this driver controls. Unknown is fine to use for devices not listed. + * @returns An instance of IOPerfControlClient. 
+ */ + static IOPerfControlClient *copyClientForDeviceType(IOService *driver, uint64_t maxWorkCapacity, IOPCDeviceType deviceType); + /*! * @function registerDevice * @abstract Inform the system that work will be dispatched to a device in the future. @@ -239,6 +258,19 @@ public: */ void workUpdateWithContext(IOService *device, OSObject *context, WorkUpdateArgs *args = nullptr); +/*! + * @function querySubmitterRole + * @abstract Reports the current role configured on the submitting task by app lifecycle management policy + * for this type of device. May be queried before submit to inform which policies should apply to this work. + * @param device The device that will submit the work. Some platforms require device to be a + * specific subclass of IOService. + * @note Must use the copyClientForDeviceType init to convey the type of device to query the role of. + * GPU role enums are found in sys/resource_private.h and are configured via PRIO_DARWIN_GPU. + */ + IOReturn querySubmitterRole(IOService *device, task_t submitting_task, uint32_t* role_out); + +#define PERFCONTROL_SUPPORTS_SUBMITTER_ROLE 1 + /* * Callers should always use the CURRENT version so that the kernel can detect both older * and newer structure layouts. New callbacks should always be added at the end of the @@ -275,7 +307,7 @@ public: uint64_t target_thread_group_id; void *target_thread_group_data; - PerfDeviceID device_type; + PerfDeviceID device_type; /* device-type determined by CLPC */ uint32_t instance_id; bool resource_accounting; }; @@ -349,6 +381,9 @@ public: } private: + + void setDeviceType(IOPCDeviceType deviceType); + struct WorkTableEntry { struct thread_group *thread_group; coalition_t coal; @@ -369,6 +404,7 @@ private: inline uint64_t tokenToGlobalUniqueToken(uint64_t token); void accountResources(coalition_t coal, PerfControllerInterface::PerfDeviceID device_type, PerfControllerInterface::ResourceAccounting *resources); + IOPCDeviceType deviceType; /* device-type provided by client via copyClientForDeviceType */ uint8_t driverIndex; IOPerfControlClientShared *shared; WorkTableEntry *workTable; diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index bce97b8ca..a13d0bf6a 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -370,7 +370,17 @@ enum { * When set, driver is informing PM that it is holding the network * interface up to do TCPKeepAlive */ - kIOPMDriverAssertionNetworkKeepAliveActiveBit = 0x200 + kIOPMDriverAssertionNetworkKeepAliveActiveBit = 0x200, + + /*! kIOPMDriverAssertionForceWakeupBit + * When set, the system will immediately wake up the CPU after going to sleep. + */ + kIOPMDriverAssertionForceWakeupBit = 0x400, + + /*! kIOPMDriverAssertionForceFullWakeupBit + * When set, the system will immediately do a full wakeup after going to sleep. + */ + kIOPMDriverAssertionForceFullWakeupBit = 0x800, }; /* kIOPMAssertionsDriverKey @@ -968,7 +978,8 @@ enum { kIOPMSystemCapabilityCPU = 0x01, kIOPMSystemCapabilityGraphics = 0x02, kIOPMSystemCapabilityAudio = 0x04, - kIOPMSystemCapabilityNetwork = 0x08 + kIOPMSystemCapabilityNetwork = 0x08, + kIOPMSystemCapabilityAOT = 0x10, }; #endif /* ! 
_IOKIT_IOPM_H */ diff --git a/iokit/IOKit/pwr_mgt/IOPMLibDefs.h b/iokit/IOKit/pwr_mgt/IOPMLibDefs.h index 814d99829..3e8d31c43 100644 --- a/iokit/IOKit/pwr_mgt/IOPMLibDefs.h +++ b/iokit/IOKit/pwr_mgt/IOPMLibDefs.h @@ -45,5 +45,6 @@ #define kPMSetDisplayPowerOn 15 #define kPMSetDisplayState 16 #define kPMRequestIdleSleepRevert 17 +#define kPMSetLDMHibernationDisable 18 -#define kNumPMMethods 18 +#define kNumPMMethods 19 diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index 8bc6f7248..3d8616b58 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -25,11 +25,21 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _IOKIT_IOPMPRIVATE_H -#define _IOKIT_IOPMPRIVATE_H +#pragma once + #include +// Supported power states. +enum IOPMRootDomainPowerState { + OFF_STATE = 0, + RESTART_STATE = 1, + SLEEP_STATE = 2, + AOT_STATE = 3, + ON_STATE = 4, + NUM_POWER_STATES +}; + /* @constant kIOPMEventTypeIntermediateFlag * @abstract This bit indicates the event is an intermediate event * which must occur within a major system power event. @@ -1062,5 +1072,3 @@ enum { kIOPMPerformanceWarning = 100 }; - -#endif /* _IOKIT_IOPMPRIVATE_H */ diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h b/iokit/IOKit/pwr_mgt/RootDomain.h index e4bf1206a..65dd76ea2 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -120,6 +120,11 @@ enum { #define kIOPMRootDomainLidCloseCString "LidClose" #define kIOPMRootDomainBatPowerCString "BatPower" +/* + * String constants to use as keys for a dictionary passed to IOPMRootDomain::claimSystemShutdownEvent + */ +#define kIOPMRootDomainShutdownTime "IOPMShutdownTime" + /* * Supported Feature bitfields for IOPMrootDomain::publishFeature() */ @@ -129,6 +134,13 @@ enum { kIOPMSupportedOnUPS = (1 << 2) }; +/* + * Supported run mode bitfields for IOPMrootDomain::requestRunMode() + */ +enum { + kIOPMRunModeFullWake = UINT64_MAX, +}; + typedef IOReturn (*IOPMSettingControllerCallback) (OSObject *target, const OSSymbol *type, OSObject *val, uintptr_t refcon); @@ -406,11 +418,19 @@ public: */ IOReturn restartWithStackshot(); +#ifdef KERNEL_PRIVATE IOReturn setWakeTime(uint64_t wakeContinuousTime); + bool isAOTMode(void); + bool isLPWMode(void); +#endif /* KERNEL_PRIVATE */ #if XNU_KERNEL_PRIVATE + IOReturn _setWakeTime(uint64_t wakeContinuousTime); IOReturn acquireDriverKitMatchingAssertion(); void releaseDriverKitMatchingAssertion(); + IOReturn acquireDriverKitSyncedAssertion(IOService * from, IOPMDriverAssertionID * assertionID); + void releaseDriverKitSyncedAssertion(IOPMDriverAssertionID assertionID); + int32_t considerRunMode(IOService * service, uint64_t pmDriverClass); #endif void copyWakeReasonString( char * outBuf, size_t bufSize ); @@ -521,6 +541,7 @@ public: void handleSetDisplayPowerOn(bool powerOn); void willNotifyPowerChildren( IOPMPowerStateIndex newPowerState ); + void willNotifyInterested( IOPMPowerStateIndex newPowerState ); IOReturn setMaintenanceWakeCalendar( const IOPMCalendarStruct * calendar ); @@ -587,6 +608,7 @@ public: bool async = false); void copyShutdownReasonString( char * outBuf, size_t bufSize ); + void copyShutdownTime(uint64_t *time); void lowLatencyAudioNotify(uint64_t time, boolean_t state); #if HIBERNATION @@ -606,6 +628,14 @@ public: uint32_t getWatchdogTimeout(); void deleteStackshot(); + IOReturn createPMAssertionSafe( + IOPMDriverAssertionID *assertionID, + IOPMDriverAssertionType whichAssertionsBits, + IOPMDriverAssertionLevel assertionLevel, + 
IOService *ownerService, + const char *ownerDescription); + IOReturn requestRunMode(uint64_t runModeMask); + IOReturn handleRequestRunMode(uint64_t runModeMask); private: friend class PMSettingObject; friend class RootDomainUserClient; @@ -630,6 +660,7 @@ private: OSPtr wrangler; OSPtr wranglerIdleSettings; + OSPtr commandGate; IOLock *featuresDictLock;// guards supportedFeatures IOLock *wakeEventLock; IOPMPowerStateQueue *pmPowerStateQueue; @@ -704,8 +735,8 @@ private: }; uint32_t _systemMessageClientMask; -// Power state and capability change transitions. - enum { + // Power state and capability change transitions. + enum SystemTransitionType { kSystemTransitionNone = 0, kSystemTransitionSleep = 1, kSystemTransitionWake = 2, @@ -713,6 +744,12 @@ private: kSystemTransitionNewCapClient = 4 } _systemTransitionType; + // Update the current systemTransitionType and wakeup any waiters blocking on transitions. + void setSystemTransitionTypeGated(SystemTransitionType type); + + // Block until no system transitions are in progress and the current power state has reached at a minimum state. + void waitForSystemTransitionToMinPowerState(IOPMRootDomainPowerState state); + unsigned int systemBooting :1; unsigned int systemShutdown :1; unsigned int systemDarkWake :1; @@ -743,6 +780,7 @@ private: unsigned int sleepTimerMaintenance :1; unsigned int sleepToStandby :1; unsigned int lowBatteryCondition :1; + unsigned int ldmHibernateDisable :1; unsigned int hibernateDisabled :1; unsigned int hibernateRetry :1; unsigned int wranglerTickled :1; @@ -846,11 +884,13 @@ private: clock_sec_t _aotWakeTimeUTC; uint64_t _aotTestTime; uint64_t _aotTestInterval; + uint64_t _aotEndTime; uint32_t _aotPendingFlags; public: - IOPMAOTMetrics * _aotMetrics; - uint8_t _aotMode; + IOPMAOTMetrics * _aotMetrics; + uint32_t _aotMode; private: + uint32_t _aotLingerTime; uint8_t _aotNow; uint8_t _aotTasksSuspended; uint8_t _aotTimerScheduled; @@ -859,23 +899,23 @@ private: uint64_t _aotWakeTimeContinuous; uint64_t _aotWakePreWindow; uint64_t _aotWakePostWindow; - uint64_t _aotLingerTime; + uint64_t _aotRunMode; size_t _driverKitMatchingAssertionCount; IOPMDriverAssertionID _driverKitMatchingAssertion; + size_t _driverKitSyncedAssertionCount; bool aotShouldExit(bool software); void aotExit(bool cps); void aotEvaluate(IOTimerEventSource * timer); public: - bool isAOTMode(void); -private: // -- AOT enum { kTasksSuspendUnsuspended = 0, kTasksSuspendSuspended = 1, kTasksSuspendNoChange = -1, }; +private: bool updateTasksSuspend(int newTasksSuspended, int newAOTTasksSuspended); int findSuspendedPID(uint32_t pid, uint32_t *outRefCount); @@ -958,6 +998,7 @@ private: int phase, uint32_t * hibMode ); void evaluateSystemSleepPolicyEarly( void ); void evaluateSystemSleepPolicyFinal( void ); + void setLockdownModeHibernation(uint32_t status); #endif /* HIBERNATION */ bool latchDisplayWranglerTickle( bool latch ); diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp index f780ba274..8f72a60a6 100644 --- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp +++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp @@ -73,6 +73,12 @@ enum{ kInternalFlagInit = 0x00000008, kInternalFlagHasPointers = 0x00000010, kInternalFlagGuardPages = 0x00000020, + /** + * Should the IOBMD behave as if it has no kernel mapping for the + * underlying buffer? Note that this does not necessarily imply the + * existence (or non-existence) of a kernel mapping. 
+ */ + kInternalFlagAsIfUnmapped = 0x00000040, }; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -190,6 +196,7 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( bool mapped = false; bool withCopy = false; bool mappedOrShared = false; + bool noSoftLimit = false; if (!capacity) { return false; @@ -267,8 +274,19 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( return false; } - if ((inTask != kernel_task) && !(options & kIOMemoryPageable)) { - return false; + if (inTask) { + if ((inTask != kernel_task) && !(options & kIOMemoryPageable)) { + // Cannot create non-pageable memory in user tasks + return false; + } + } else { + // Not passing a task implies the memory should not be mapped (or, at + // least, should behave as if it were not mapped) + _internalFlags |= kInternalFlagAsIfUnmapped; + + // Disable the soft-limit since the mapping, if any, will not escape the + // IOBMD. + noSoftLimit = true; } bzero(&mapSpec, sizeof(mapSpec)); @@ -326,7 +344,7 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( } } _buffer = (void *) IOKernelAllocateWithPhysicalRestrict(kheap, - capacity, highestMask, alignment, contig); + capacity, highestMask, alignment, contig, noSoftLimit); } else if (_internalFlags & kInternalFlagGuardPages) { vm_offset_t address = 0; kern_return_t kr; @@ -341,6 +359,10 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( kma_flags = (kma_flags_t) (kma_flags | KMA_DATA_SHARED); } + if (noSoftLimit) { + kma_flags = (kma_flags_t)(kma_flags | KMA_NOSOFTLIMIT); + } + alignMask = (1UL << log2up((uint32_t) alignment)) - 1; kr = kernel_memory_allocate(kernel_map, &address, capacity + page_size * 2, alignMask, kma_flags, @@ -367,13 +389,20 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( #endif } #endif /* defined(__x86_64__) */ - } else if (alignment > 1) { + } else { + zalloc_flags_t zflags = Z_ZERO_VM_TAG_BT_BIT; + if (noSoftLimit) { + zflags = (zalloc_flags_t)(zflags | Z_NOSOFTLIMIT); + } + /* BEGIN IGNORE CODESTYLE */ __typed_allocators_ignore_push - _buffer = IOMallocAligned_internal(kheap, capacity, alignment, - Z_ZERO_VM_TAG_BT_BIT); - } else { - _buffer = IOMalloc_internal(kheap, capacity, Z_ZERO_VM_TAG_BT_BIT); + if (alignment > 1) { + _buffer = IOMallocAligned_internal(kheap, capacity, alignment, + zflags); + } else { + _buffer = IOMalloc_internal(kheap, capacity, zflags); + } __typed_allocators_ignore_pop /* END IGNORE CODESTYLE */ } @@ -397,9 +426,6 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( if (!withCopy) { mapTask = inTask; } - if (NULL == inTask) { - inTask = kernel_task; - } } else if (options & kIOMapCacheMask) { // Prefetch each page to put entries into the pmap volatile UInt8 * startAddr = (UInt8 *)_buffer; @@ -413,11 +439,16 @@ IOBufferMemoryDescriptor::initWithPhysicalMask( } } - _ranges.v64->address = (mach_vm_address_t) pgz_decode(_buffer, _capacity); + _ranges.v64->address = (mach_vm_address_t) _buffer; _ranges.v64->length = _capacity; - if (!super::initWithOptions(_ranges.v64, 1, 0, - inTask, iomdOptions, /* System mapper */ NULL)) { + if (!super::initWithOptions( + /* buffers */ _ranges.v64, /* count */ 1, /* offset */ 0, + // Since we handle all "unmapped" behavior internally and our superclass + // requires a task, default all unbound IOBMDs to the kernel task. 
+ /* task */ inTask ?: kernel_task, + /* options */ iomdOptions, + /* System mapper */ NULL)) { return false; } @@ -853,6 +884,10 @@ IOBufferMemoryDescriptor::appendBytes(const void * bytes, vm_size_t withLength) void * IOBufferMemoryDescriptor::getBytesNoCopy() { + if (__improbable(_internalFlags & kInternalFlagAsIfUnmapped)) { + return NULL; + } + if (kIOMemoryTypePhysical64 == (_flags & kIOMemoryTypeMask)) { return _buffer; } else { @@ -871,6 +906,10 @@ IOBufferMemoryDescriptor::getBytesNoCopy(vm_size_t start, vm_size_t withLength) { IOVirtualAddress address; + if (__improbable(_internalFlags & kInternalFlagAsIfUnmapped)) { + return NULL; + } + if ((start + withLength) < start) { return NULL; } diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp index 02d05e56e..459ea6fdd 100644 --- a/iokit/Kernel/IOCatalogue.cpp +++ b/iokit/Kernel/IOCatalogue.cpp @@ -1116,11 +1116,13 @@ IOCatalogue::startMatching( const OSSymbol * moduleName ) OSSharedPtr dextPersonalities = kext->copyPersonalitiesArray(); if (!dextPersonalities) { + IORWLockUnlock(lock); return false; } servicesToTerminate = OSArray::withCapacity(1); if (!servicesToTerminate) { + IORWLockUnlock(lock); return false; } diff --git a/iokit/Kernel/IOCircularDataQueue.cpp b/iokit/Kernel/IOCircularDataQueue.cpp new file mode 100644 index 000000000..18a37bdee --- /dev/null +++ b/iokit/Kernel/IOCircularDataQueue.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#include +#include +#include "IOKitKernelInternal.h" + +#include diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index 69a1738c8..71b395db9 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -594,7 +594,15 @@ IOHibernateSystemSleep(void) enum { setFileRound = 1024 * 1024ULL }; setFileSizeMin = ((setFileSizeMin + setFileRound) & ~(setFileRound - 1)); +#if defined(__arm64__) + // setFileSizeMin was our guess but if free disk space allows, + // open a file sized up for no compression and all memory saved, + // but leave at least kIOHibernateDiskFreeSpace bytes free on disk + setFileSizeMax = ptoa_64(vars->page_list->page_count); + setFileSizeMax = setFileSizeMax & ~(setFileRound - 1); +#else setFileSizeMax = setFileSizeMin; +#endif HIBLOG("hibernate_page_list_setall preflight pageCount %d est comp %qd setfilemin %qd setfilemax %qd min %qd\n", pageCount, (100ULL * gIOHibernateCompression) >> 8, setFileSizeMin, setFileSizeMax, vars->fileMinSize); @@ -700,7 +708,7 @@ IOHibernateSystemSleep(void) if (kIOHibernateOptionProgress & gIOHibernateCurrentHeader->options) { vars->videoAllocSize = kVideoMapSize; if (KERN_SUCCESS != kmem_alloc(kernel_map, &vars->videoMapping, vars->videoAllocSize, - (kma_flags_t)(KMA_PAGEABLE | KMA_DATA), VM_KERN_MEMORY_IOKIT)) { + (kma_flags_t)(KMA_PAGEABLE | KMA_DATA_SHARED), VM_KERN_MEMORY_IOKIT)) { vars->videoMapping = 0; } } diff --git a/iokit/Kernel/IOKitKernelInternal.h b/iokit/Kernel/IOKitKernelInternal.h index e94b6919e..b33a663dc 100644 --- a/iokit/Kernel/IOKitKernelInternal.h +++ b/iokit/Kernel/IOKitKernelInternal.h @@ -69,7 +69,8 @@ IOKernelAllocateWithPhysicalRestrict( mach_vm_size_t size, mach_vm_address_t maxPhys, mach_vm_size_t alignment, - bool contiguous); + bool contiguous, + bool noSoftLimit); void IOKernelFreePhysical( kalloc_heap_t kheap, @@ -172,7 +173,7 @@ struct IOMemoryDescriptorReserved { vm_tag_t kernelTag; vm_tag_t userTag; task_t creator; - OSObject * contextObject; + OSPtr contextObjects; }; #if defined(__x86_64__) @@ -246,6 +247,8 @@ IOReturn IORemoveServicePlatformActions(IOService * service); void IOCPUSleepKernel(void); void IOPlatformActionsInitialize(void); +void IOServicePHSystemAOT(int isAOT); + class IOSystemStateNotification : public IOService { OSDeclareDefaultStructors(IOSystemStateNotification); @@ -255,4 +258,6 @@ public: virtual bool serializeProperties(OSSerialize * serialize) const APPLE_KEXT_OVERRIDE; }; +extern class IOPMrootDomain * gIOPMRootDomain; + #endif /* ! 
_IOKIT_KERNELINTERNAL_H */ diff --git a/iokit/Kernel/IOLib.cpp b/iokit/Kernel/IOLib.cpp index aeb76730e..3778780ea 100644 --- a/iokit/Kernel/IOLib.cpp +++ b/iokit/Kernel/IOLib.cpp @@ -195,7 +195,7 @@ IOLibInit(void) kIOPageableMapSize, VM_MAP_CREATE_PAGEABLE, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, - (kms_flags_t)(KMS_PERMANENT | KMS_DATA | KMS_NOFAIL), + (kms_flags_t)(KMS_PERMANENT | KMS_DATA | KMS_NOFAIL | KMS_NOSOFTLIMIT), VM_KERN_MEMORY_IOKIT).kmr_submap; gIOKitPageableMap.address = gIOKitPageableFixedRange.min_address; @@ -643,7 +643,7 @@ void * IOMallocAligned_external( vm_size_t size, vm_size_t alignment) { - return IOMallocAligned_internal(KHEAP_DATA_BUFFERS, size, alignment, + return IOMallocAligned_internal(GET_KEXT_KHEAP_DATA(), size, alignment, Z_VM_TAG_BT_BIT); } @@ -652,7 +652,7 @@ IOFreeAligned( void * address, vm_size_t size) { - IOFreeAligned_internal(KHEAP_DATA_BUFFERS, address, size); + IOFreeAligned_internal(GET_KEXT_KHEAP_DATA(), address, size); } __typed_allocators_ignore_pop @@ -710,7 +710,8 @@ IOKernelAllocateWithPhysicalRestrict( mach_vm_size_t size, mach_vm_address_t maxPhys, mach_vm_size_t alignment, - bool contiguous) + bool contiguous, + bool noSoftLimit) { kern_return_t kr; mach_vm_address_t address; @@ -745,6 +746,10 @@ IOKernelAllocateWithPhysicalRestrict( options = (kma_flags_t) (options | KMA_DATA_SHARED); } + if (noSoftLimit) { + options = (kma_flags_t) (options | KMA_NOSOFTLIMIT); + } + adjustedSize = size; contiguous = (contiguous && (adjustedSize > page_size)) || (alignment > page_size); @@ -781,14 +786,21 @@ IOKernelAllocateWithPhysicalRestrict( address = 0; } } else { + zalloc_flags_t zflags = Z_WAITOK; + + if (noSoftLimit) { + zflags = (zalloc_flags_t)(zflags | Z_NOSOFTLIMIT); + } + adjustedSize += alignMask; if (adjustedSize < size) { return 0; } + /* BEGIN IGNORE CODESTYLE */ __typed_allocators_ignore_push // allocator implementation allocationAddress = (mach_vm_address_t) kheap_alloc(kheap, - adjustedSize, Z_VM_TAG_BT(Z_WAITOK, VM_KERN_MEMORY_IOKIT)); + adjustedSize, Z_VM_TAG_BT(zflags, VM_KERN_MEMORY_IOKIT)); __typed_allocators_ignore_pop /* END IGNORE CODESTYLE */ @@ -851,7 +863,7 @@ IOMallocContiguous(vm_size_t size, vm_size_t alignment, /* Do we want a physical address? 
*/ if (!physicalAddress) { address = IOKernelAllocateWithPhysicalRestrict(KHEAP_DEFAULT, - size, 0 /*maxPhys*/, alignment, true); + size, 0 /*maxPhys*/, alignment, true, false /* noSoftLimit */); } else { do { IOBufferMemoryDescriptor * bmd; @@ -945,7 +957,7 @@ static kern_return_t IOMallocPageableCallback(vm_map_t map, void * _ref) { struct IOMallocPageableRef * ref = (struct IOMallocPageableRef *) _ref; - kma_flags_t flags = (kma_flags_t)(KMA_PAGEABLE | KMA_DATA); + kma_flags_t flags = (kma_flags_t)(KMA_PAGEABLE | KMA_DATA_SHARED); return kmem_alloc( map, &ref->address, ref->size, flags, ref->tag ); } @@ -1080,13 +1092,16 @@ IOFreePageable(void * address, vm_size_t size) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +__typed_allocators_ignore_push + void * IOMallocData_external( vm_size_t size); void * IOMallocData_external(vm_size_t size) { - return IOMalloc_internal(KHEAP_DATA_BUFFERS, size, Z_VM_TAG_BT_BIT); + return IOMalloc_internal(GET_KEXT_KHEAP_DATA(), size, Z_VM_TAG_BT_BIT); } void * @@ -1095,15 +1110,23 @@ IOMallocZeroData_external( void * IOMallocZeroData_external(vm_size_t size) { - return IOMalloc_internal(KHEAP_DATA_BUFFERS, size, Z_ZERO_VM_TAG_BT_BIT); + return IOMalloc_internal(GET_KEXT_KHEAP_DATA(), size, Z_ZERO_VM_TAG_BT_BIT); } void IOFreeData(void * address, vm_size_t size) { - return IOFree_internal(KHEAP_DATA_BUFFERS, address, size); + return IOFree_internal(GET_KEXT_KHEAP_DATA(), address, size); } +void +IOFreeDataSharable(void * address, vm_size_t size) +{ + return IOFree_internal(KHEAP_DATA_SHARED, address, size); +} + +__typed_allocators_ignore_pop + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ __typed_allocators_ignore_push // allocator implementation @@ -1484,6 +1507,7 @@ _IOLogv(const char *format, va_list ap, void *caller) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, format, ap, caller); #pragma clang diagnostic pop diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index 61e46b775..68c6f33cb 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -281,11 +281,6 @@ getAddrLenForInd( addr = cur.address; len = cur.length; } -#if CONFIG_PROB_GZALLOC - if (task == kernel_task) { - addr = pgz_decode(addr, len); - } -#endif /* CONFIG_PROB_GZALLOC */ } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -875,11 +870,11 @@ IOMemoryDescriptorMapAlloc(vm_map_t map, void * _ref) vmk_flags.vm_tag = ref->tag; /* - * Mapping memory into the kernel_map using IOMDs use a dedicated range. + * Mapping memory into the kernel_map using IOMDs use the data range. * Memory being mapped should not contain kernel pointers. 
*/ if (map == kernel_map) { - vmk_flags.vmkf_range_id = KMEM_RANGE_ID_IOKIT; + vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA; } err = mach_vm_map_kernel(map, &addr, size, @@ -1584,24 +1579,26 @@ IOReturn IOGeneralMemoryDescriptor::memoryReferenceGetPageCounts( IOMemoryReference * ref, IOByteCount * residentPageCount, - IOByteCount * dirtyPageCount) + IOByteCount * dirtyPageCount, + IOByteCount * swappedPageCount) { IOReturn err; IOMemoryEntry * entries; - unsigned int resident, dirty; - unsigned int totalResident, totalDirty; + UInt64 resident, dirty, swapped; + UInt64 totalResident, totalDirty, totalSwapped; - totalResident = totalDirty = 0; + totalResident = totalDirty = totalSwapped = 0; err = kIOReturnSuccess; entries = ref->entries + ref->count; while (entries > &ref->entries[0]) { entries--; - err = mach_memory_entry_get_page_counts(entries->entry, &resident, &dirty); + err = mach_memory_entry_get_page_counts(entries->entry, &resident, &dirty, &swapped); if (KERN_SUCCESS != err) { break; } totalResident += resident; totalDirty += dirty; + totalSwapped += swapped; } if (residentPageCount) { @@ -1610,6 +1607,9 @@ IOGeneralMemoryDescriptor::memoryReferenceGetPageCounts( if (dirtyPageCount) { *dirtyPageCount = totalDirty; } + if (swappedPageCount) { + *swappedPageCount = totalSwapped; + } return err; } @@ -2475,10 +2475,10 @@ IOMemoryDescriptor::getFlags(void) } OSObject * -IOMemoryDescriptor::copyContext(void) const +IOMemoryDescriptor::copyContext(const OSSymbol * key) const { - if (reserved) { - OSObject * context = reserved->contextObject; + if (reserved && reserved->contextObjects) { + OSObject * context = reserved->contextObjects->getObject(key); if (context) { context->retain(); } @@ -2488,8 +2488,31 @@ IOMemoryDescriptor::copyContext(void) const } } +OSObject * +IOMemoryDescriptor::copyContext(const char * key) const +{ + OSSharedPtr sym = OSSymbol::withCString(key); + return copyContext(sym.get()); +} + +OSObject * +IOMemoryDescriptor::copySharingContext(const char * key) const +{ + OSObject * context = NULL; + OSObject * obj = copyContext(kIOMemoryDescriptorSharingContextKey); + OSDictionary * dict = OSDynamicCast(OSDictionary, obj); + if (dict) { + context = dict->getObject(key); + if (context) { + context->retain(); + } + } + OSSafeReleaseNULL(obj); + return context; +} + void -IOMemoryDescriptor::setContext(OSObject * obj) +IOMemoryDescriptor::setContext(const OSSymbol * key, OSObject * obj) { if (this->reserved == NULL && obj == NULL) { // No existing object, and no object to set @@ -2498,17 +2521,56 @@ IOMemoryDescriptor::setContext(OSObject * obj) IOMemoryDescriptorReserved * reserved = getKernelReserved(); if (reserved) { - OSObject * oldObject = reserved->contextObject; - if (oldObject && OSCompareAndSwapPtr(oldObject, NULL, &reserved->contextObject)) { - oldObject->release(); + if (NULL == reserved->contextObjects) { + reserved->contextObjects = OSDictionary::withCapacity(2); } - if (obj != NULL) { - obj->retain(); - reserved->contextObject = obj; + if (obj) { + reserved->contextObjects->setObject(key, obj); + } else { + reserved->contextObjects->removeObject(key); } } } +void +IOMemoryDescriptor::setContext(const char * key, OSObject * obj) +{ + OSSharedPtr sym = OSSymbol::withCString(key); + setContext(sym.get(), obj); +} + +OSObject * +IOMemoryDescriptor::copyContext(void) const +{ + return copyContext((const OSSymbol *) kOSBooleanFalse); +} +enum { + kIOMemoryDescriptorInternalFlagsSharing = 0x0001, +}; + +void +IOMemoryDescriptor::setSharingContext(const char * 
key, OSObject * obj) +{ + OSSharedPtr sym = OSSymbol::withCString(key); + OSSharedPtr dict = OSDictionary::withCapacity(1); + + dict->setObject(sym.get(), obj); + setContext(kIOMemoryDescriptorSharingContextKey, dict.get()); + OSBitOrAtomic16(kIOMemoryDescriptorInternalFlagsSharing, &_internalIOMDFlags); +} + +bool +IOMemoryDescriptor::hasSharingContext(void) +{ + return 0 != (kIOMemoryDescriptorInternalFlagsSharing & _internalIOMDFlags); +} + +void +IOMemoryDescriptor::setContext(OSObject * obj) +{ + setContext((const OSSymbol *) kOSBooleanFalse, obj); +} + #ifndef __LP64__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" @@ -2728,10 +2790,7 @@ IOMemoryDescriptor::cleanKernelReserved( IOMemoryDescriptorReserved * reserved ) reserved->creator = NULL; } - if (reserved->contextObject) { - reserved->contextObject->release(); - reserved->contextObject = NULL; - } + reserved->contextObjects = NULL; } IOMemoryDescriptorReserved * @@ -3846,10 +3905,10 @@ IOMemoryDescriptor::getDMAMapLength(uint64_t * offset) return length; } - IOReturn IOMemoryDescriptor::getPageCounts( IOByteCount * residentPageCount, - IOByteCount * dirtyPageCount ) + IOByteCount * dirtyPageCount, + IOByteCount * swappedPageCount ) { IOReturn err = kIOReturnNotReady; @@ -3862,14 +3921,14 @@ IOMemoryDescriptor::getPageCounts( IOByteCount * residentPageCount, LOCK; } if (_memRef) { - err = IOGeneralMemoryDescriptor::memoryReferenceGetPageCounts(_memRef, residentPageCount, dirtyPageCount); + err = IOGeneralMemoryDescriptor::memoryReferenceGetPageCounts(_memRef, residentPageCount, dirtyPageCount, swappedPageCount); } else { IOMultiMemoryDescriptor * mmd; IOSubMemoryDescriptor * smd; if ((smd = OSDynamicCast(IOSubMemoryDescriptor, this))) { - err = smd->getPageCounts(residentPageCount, dirtyPageCount); + err = smd->getPageCounts(residentPageCount, dirtyPageCount, swappedPageCount); } else if ((mmd = OSDynamicCast(IOMultiMemoryDescriptor, this))) { - err = mmd->getPageCounts(residentPageCount, dirtyPageCount); + err = mmd->getPageCounts(residentPageCount, dirtyPageCount, swappedPageCount); } } if (kIOMemoryThreadSafe & _flags) { @@ -3879,6 +3938,13 @@ IOMemoryDescriptor::getPageCounts( IOByteCount * residentPageCount, return err; } +IOReturn +IOMemoryDescriptor::getPageCounts( IOByteCount * residentPageCount, + IOByteCount * dirtyPageCount ) +{ + return getPageCounts(residentPageCount, dirtyPageCount, NULL); +} + #if defined(__arm64__) extern "C" void dcache_incoherent_io_flush64(addr64_t pa, unsigned int count, unsigned int remaining, unsigned int *res); @@ -5791,23 +5857,6 @@ IOMemoryDescriptor::createMappingInTask( mapping = new IOMemoryMap; -#if 136275805 - /* - * XXX: Redundantly check the mapping size here so that failure stack traces - * are more useful. This has no functional value but is helpful because - * telemetry traps can currently only capture the last five calls and - * so we want to trap as shallow as possible in a select few cases - * where we anticipate issues. - * - * When telemetry collection is complete, this will be removed. 
- */ - if (__improbable(mapping && !vm_map_is_map_size_valid( - get_task_map(intoTask), length, /* no_soft_limit */ false))) { - mapping->release(); - mapping = NULL; - } -#endif /* 136275805 */ - if (mapping && !mapping->init( intoTask, atAddress, options, offset, length )) { diff --git a/iokit/Kernel/IOMultiMemoryDescriptor.cpp b/iokit/Kernel/IOMultiMemoryDescriptor.cpp index 4b81aaf49..eeb44f4d5 100644 --- a/iokit/Kernel/IOMultiMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMultiMemoryDescriptor.cpp @@ -414,21 +414,22 @@ IOMultiMemoryDescriptor::setOwnership( task_t newOwner, IOReturn IOMultiMemoryDescriptor::getPageCounts(IOByteCount * pResidentPageCount, - IOByteCount * pDirtyPageCount) + IOByteCount * pDirtyPageCount, IOByteCount * pSwappedPageCount) { IOReturn err; - IOByteCount totalResidentPageCount, totalDirtyPageCount; - IOByteCount residentPageCount, dirtyPageCount; + IOByteCount totalResidentPageCount, totalDirtyPageCount, totalSwappedPageCount; + IOByteCount residentPageCount, dirtyPageCount, swappedPageCount; err = kIOReturnSuccess; - totalResidentPageCount = totalDirtyPageCount = 0; + totalResidentPageCount = totalDirtyPageCount = totalSwappedPageCount = 0; for (unsigned index = 0; index < _descriptorsCount; index++) { - err = _descriptors[index]->getPageCounts(&residentPageCount, &dirtyPageCount); + err = _descriptors[index]->getPageCounts(&residentPageCount, &dirtyPageCount, &swappedPageCount); if (kIOReturnSuccess != err) { break; } totalResidentPageCount += residentPageCount; totalDirtyPageCount += dirtyPageCount; + totalSwappedPageCount += swappedPageCount; } if (pResidentPageCount) { @@ -437,10 +438,20 @@ IOMultiMemoryDescriptor::getPageCounts(IOByteCount * pResidentPageCount, if (pDirtyPageCount) { *pDirtyPageCount = totalDirtyPageCount; } + if (pSwappedPageCount) { + *pSwappedPageCount = totalSwappedPageCount; + } return err; } +IOReturn +IOMultiMemoryDescriptor::getPageCounts(IOByteCount * pResidentPageCount, + IOByteCount * pDirtyPageCount) +{ + return getPageCounts(pResidentPageCount, pDirtyPageCount, NULL); +} + uint64_t IOMultiMemoryDescriptor::getPreparationID( void ) { diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp index 9fb5a8865..ea6a958f9 100644 --- a/iokit/Kernel/IONVRAM.cpp +++ b/iokit/Kernel/IONVRAM.cpp @@ -53,9 +53,10 @@ class IONVRAMV3Handler; #define MAX_VAR_NAME_SIZE 63 -#define kNVRAMBankSizeKey "nvram-bank-size" -#define kNVRAMBankCountKey "nvram-bank-count" -#define kNVRAMCurrentBankKey "nvram-current-bank" +#define kNVRAMBankSizeKey "nvram-bank-size" +#define kNVRAMBankCountKey "nvram-bank-count" +#define kNVRAMCurrentBankKey "nvram-current-bank" +#define kNVRAMClearTestVarKey "clear-test-vars" #define kCurrentGenerationCountKey "Generation" #define kCurrentNVRAMVersionKey "Version" @@ -67,16 +68,31 @@ class IONVRAMV3Handler; #define MIN_SYNC_NOW_INTERVAL 15*60 /* Minimum 15 Minutes interval mandated */ +enum IONVRAMLogging { + kIONVRAMNoLogs = 0, + kIONVRAMInfoLogs = 1, + kIONVRAMErrorLogs = 2, + kIONVRAMDataHexDump = 4 +}; + +#define IS_LOG_BIT_SET(level) ((gNVRAMLogging & (level)) != 0) + #if defined(DEBUG) || defined(DEVELOPMENT) #define DEBUG_IFERROR(err, fmt, args...) \ ({ \ - if ((err != kIOReturnSuccess) || gNVRAMLogging) \ + if ((err != kIOReturnSuccess) || IS_LOG_BIT_SET(kIONVRAMErrorLogs)) \ + IOLog("%s:%s:%u - " fmt, __FILE_NAME__, __FUNCTION__, __LINE__, ##args); \ +}) + +#define DEBUG_INFO_IF(log, fmt, args...) 
\ +({ \ + if ((log) && IS_LOG_BIT_SET(kIONVRAMInfoLogs)) \ IOLog("%s:%s:%u - " fmt, __FILE_NAME__, __FUNCTION__, __LINE__, ##args); \ }) #define DEBUG_INFO(fmt, args...) \ ({ \ - if (gNVRAMLogging) \ + if (IS_LOG_BIT_SET(kIONVRAMInfoLogs)) \ IOLog("%s:%s:%u - " fmt, __FILE_NAME__, __FUNCTION__, __LINE__, ##args); \ }) @@ -88,6 +104,7 @@ class IONVRAMV3Handler; #define DEBUG_IFERROR(err, fmt, args...) (void)NULL #define DEBUG_INFO(fmt, args...) (void)NULL #define DEBUG_ALWAYS(fmt, args...) (void)NULL +#define DEBUG_INFO_IF(fmt, args...) (void)NULL #endif #define DEBUG_ERROR DEBUG_ALWAYS @@ -152,12 +169,12 @@ class IONVRAMV3Handler; // RST = Reset, Obliterate // RD = Read // DEL = Delete -#define ENT_MOD_RST ((1 << kIONVRAMOperationWrite) | (1 << kIONVRAMOperationDelete) | (1 << kIONVRAMOperationObliterate) | (1 << kIONVRAMOperationReset)) -#define ENT_MOD_RD ((1 << kIONVRAMOperationRead) | (1 << kIONVRAMOperationWrite) | (1 << kIONVRAMOperationDelete)) -#define ENT_MOD ((1 << kIONVRAMOperationWrite) | (1 << kIONVRAMOperationDelete)) -#define ENT_RST ((1 << kIONVRAMOperationObliterate) | (1 << kIONVRAMOperationReset)) -#define ENT_RD ((1 << kIONVRAMOperationRead)) -#define ENT_DEL ((1 << kIONVRAMOperationDelete)) +#define OP_RD ((1 << kIONVRAMOperationRead)) +#define OP_RST ((1 << kIONVRAMOperationObliterate) | (1 << kIONVRAMOperationReset)) +#define OP_DEL ((1 << kIONVRAMOperationDelete)) +#define OP_MOD ((1 << kIONVRAMOperationWrite) | OP_DEL) +#define OP_MOD_RD (OP_RD | OP_MOD) +#define OP_MOD_RST (OP_MOD | OP_RST) enum NVRAMVersion { kNVRAMVersionUnknown, @@ -182,7 +199,8 @@ UUID_DEFINE(gAppleWifiGuid, 0x36, 0xC2, 0x8A, 0xB5, 0x65, 0x66, 0x4C, 0x50, 0x9E // Prefix for kernel-only variables #define KERNEL_ONLY_VAR_NAME_PREFIX "krn." -static TUNABLE(bool, gNVRAMLogging, "nvram-log", false); +static TUNABLE(uint8_t, gNVRAMLogging, "nvram-log", kIONVRAMNoLogs); +static TUNABLE(bool, gRestoreBoot, "-restore", false); static bool gInternalBuild = false; // IONVRAMSystemVariableListInternal: @@ -274,6 +292,8 @@ union VariablePermission { uint64_t SystemReadHidden :1; uint64_t FullAccess :1; uint64_t InternalOnly :1; + uint64_t TestingOnly :1; + uint64_t RestoreModifyOnly :1; uint64_t Reserved:57; } Bits; uint64_t Uint64; @@ -316,13 +336,20 @@ VariablePermissionEntry gVariablePermissions[] = { {"darkboot", .p.Bits.UserWrite = 1}, {"nonce-seeds", .p.Bits.KernelOnly = 1}, #endif /* !defined(__x86_64__) */ + {"darwin-init-system", .p.Bits.RestoreModifyOnly = 1}, // Variables used for testing permissions - {"testSysReadHidden", .p.Bits.SystemReadHidden = 1}, - {"testKernelOnly", .p.Bits.KernelOnly = 1}, - {"testResetOnlyDel", .p.Bits.ResetNVRAMOnlyDelete = 1}, - {"testNeverDel", .p.Bits.NeverAllowedToDelete = 1}, - {"testUserWrite", .p.Bits.UserWrite = 1}, - {"testRootReq", .p.Bits.RootRequired = 1}, + {"testSysReadHidden", .p.Bits.SystemReadHidden = 1, + .p.Bits.TestingOnly = 1}, + {"testKernelOnly", .p.Bits.KernelOnly = 1, + .p.Bits.TestingOnly = 1}, + {"testResetOnlyDel", .p.Bits.ResetNVRAMOnlyDelete = 1, + .p.Bits.TestingOnly = 1}, + {"testNeverDel", .p.Bits.NeverAllowedToDelete = 1, + .p.Bits.TestingOnly = 1}, + {"testUserWrite", .p.Bits.UserWrite = 1, + .p.Bits.TestingOnly = 1}, + {"testRootReq", .p.Bits.RootRequired = 1, + .p.Bits.TestingOnly = 1}, {"reclaim-int", .p.Bits.InternalOnly = 1}, {nullptr, {.Bits.FullAccess = 1}} // Default access }; @@ -337,26 +364,32 @@ typedef struct { // variable-guid pair entries that require entitlement check to do specified nvram operations static const 
VariableEntitlementEntry gVariableEntitlements[] = { - {ENT_MOD_RST, &gAppleNVRAMGuid, "ownership-warning", "com.apple.private.iokit.ddl-write"}, - {ENT_MOD, &gAppleSystemVariableGuid, "BluetoothInfo", "com.apple.private.iokit.nvram-bluetooth"}, - {ENT_MOD, &gAppleSystemVariableGuid, "BluetoothUHEDevices", "com.apple.private.iokit.nvram-bluetooth"}, - {ENT_MOD, &gAppleNVRAMGuid, "bluetoothExternalDongleFailed", "com.apple.private.iokit.nvram-bluetooth"}, - {ENT_MOD, &gAppleNVRAMGuid, "bluetoothInternalControllerInfo", "com.apple.private.iokit.nvram-bluetooth"}, - {ENT_RD, &gAppleSystemVariableGuid, "current-network", "com.apple.private.security.nvram.wifi-psks"}, - {ENT_RD, &gAppleWifiGuid, "current-network", "com.apple.private.security.nvram.wifi-psks"}, - {ENT_RD, &gAppleSystemVariableGuid, "preferred-networks", "com.apple.private.security.nvram.wifi-psks"}, - {ENT_RD, &gAppleWifiGuid, "preferred-networks", "com.apple.private.security.nvram.wifi-psks"}, - {ENT_RD, &gAppleSystemVariableGuid, "preferred-count", "com.apple.private.security.nvram.wifi-psks"}, - {ENT_RD, &gAppleWifiGuid, "preferred-count", "com.apple.private.security.nvram.wifi-psks"}, + {OP_MOD_RST, &gAppleNVRAMGuid, "ownership-warning", "com.apple.private.iokit.ddl-write"}, + {OP_MOD, &gAppleSystemVariableGuid, "BluetoothInfo", "com.apple.private.iokit.nvram-bluetooth"}, + {OP_MOD, &gAppleSystemVariableGuid, "BluetoothUHEDevices", "com.apple.private.iokit.nvram-bluetooth"}, + {OP_MOD, &gAppleNVRAMGuid, "bluetoothExternalDongleFailed", "com.apple.private.iokit.nvram-bluetooth"}, + {OP_MOD, &gAppleNVRAMGuid, "bluetoothInternalControllerInfo", "com.apple.private.iokit.nvram-bluetooth"}, + {OP_RD, &gAppleSystemVariableGuid, "current-network", "com.apple.private.security.nvram.wifi-psks"}, + {OP_RD, &gAppleWifiGuid, "current-network", "com.apple.private.security.nvram.wifi-psks"}, + {OP_RD, &gAppleSystemVariableGuid, "preferred-networks", "com.apple.private.security.nvram.wifi-psks"}, + {OP_RD, &gAppleWifiGuid, "preferred-networks", "com.apple.private.security.nvram.wifi-psks"}, + {OP_RD, &gAppleSystemVariableGuid, "preferred-count", "com.apple.private.security.nvram.wifi-psks"}, + {OP_RD, &gAppleWifiGuid, "preferred-count", "com.apple.private.security.nvram.wifi-psks"}, + {OP_MOD_RD, &gAppleSystemVariableGuid, "fmm-mobileme-token-FMM", "com.apple.private.security.nvram.fmm"}, + {OP_MOD_RD, &gAppleNVRAMGuid, "fmm-mobileme-token-FMM", "com.apple.private.security.nvram.fmm"}, + {OP_MOD_RD, &gAppleSystemVariableGuid, "fmm-mobileme-token-FMM-BridgeHasAccount", "com.apple.private.security.nvram.fmm"}, + {OP_MOD_RD, &gAppleNVRAMGuid, "fmm-mobileme-token-FMM-BridgeHasAccount", "com.apple.private.security.nvram.fmm"}, + {OP_MOD_RD, &gAppleSystemVariableGuid, "fmm-computer-name", "com.apple.private.security.nvram.fmm"}, + {OP_MOD_RD, &gAppleNVRAMGuid, "fmm-computer-name", "com.apple.private.security.nvram.fmm"}, // Variables used for testing entitlement - {ENT_MOD_RST, &gAppleNVRAMGuid, "testEntModRst", "com.apple.private.iokit.testEntModRst"}, - {ENT_MOD_RST, &gAppleSystemVariableGuid, "testEntModRstSys", "com.apple.private.iokit.testEntModRst"}, - {ENT_RST, &gAppleNVRAMGuid, "testEntRst", "com.apple.private.iokit.testEntRst"}, - {ENT_RST, &gAppleSystemVariableGuid, "testEntRstSys", "com.apple.private.iokit.testEntRst"}, - {ENT_RD, &gAppleNVRAMGuid, "testEntRd", "com.apple.private.iokit.testEntRd"}, - {ENT_RD, &gAppleSystemVariableGuid, "testEntRdSys", "com.apple.private.iokit.testEntRd"}, - {ENT_DEL, &gAppleNVRAMGuid, "testEntDel", 
"com.apple.private.iokit.testEntDel"}, - {ENT_DEL, &gAppleSystemVariableGuid, "testEntDelSys", "com.apple.private.iokit.testEntDel"}, + {OP_MOD_RST, &gAppleNVRAMGuid, "testEntModRst", "com.apple.private.iokit.testEntModRst"}, + {OP_MOD_RST, &gAppleSystemVariableGuid, "testEntModRstSys", "com.apple.private.iokit.testEntModRst"}, + {OP_RST, &gAppleNVRAMGuid, "testEntRst", "com.apple.private.iokit.testEntRst"}, + {OP_RST, &gAppleSystemVariableGuid, "testEntRstSys", "com.apple.private.iokit.testEntRst"}, + {OP_RD, &gAppleNVRAMGuid, "testEntRd", "com.apple.private.iokit.testEntRd"}, + {OP_RD, &gAppleSystemVariableGuid, "testEntRdSys", "com.apple.private.iokit.testEntRd"}, + {OP_DEL, &gAppleNVRAMGuid, "testEntDel", "com.apple.private.iokit.testEntDel"}, + {OP_DEL, &gAppleSystemVariableGuid, "testEntDelSys", "com.apple.private.iokit.testEntDel"}, {0, &UUID_NULL, nullptr, nullptr} }; @@ -509,7 +542,7 @@ parseVariableName(const OSSymbol *key, uuid_t *guidResult, const char **nameResu * @param systemActive boolean to indicate if it has system partition size > 0 */ static void -translateGUID(const uuid_t varGuid, const char *variableName, uuid_t destGuid, bool systemActive) +translateGUID(const uuid_t varGuid, const char *variableName, uuid_t destGuid, bool systemActive, bool log) { if (varGuid == nullptr || variableName == nullptr || destGuid == nullptr) { DEBUG_ERROR("nullptr passed as an argument\n"); @@ -520,20 +553,20 @@ translateGUID(const uuid_t varGuid, const char *variableName, uuid_t destGuid, b if (systemActive) { if (variableInAllowList(variableName)) { - DEBUG_INFO("Using system GUID due to allow list\n"); + DEBUG_INFO_IF(log, "Using system GUID due to allow list\n"); uuid_copy(destGuid, gAppleSystemVariableGuid); } else if (systemGuid) { - DEBUG_INFO("System GUID used\n"); + DEBUG_INFO_IF(log, "System GUID used\n"); uuid_copy(destGuid, gAppleSystemVariableGuid); } else { - DEBUG_INFO("Use given guid\n"); + DEBUG_INFO_IF(log, "Use given guid\n"); uuid_copy(destGuid, varGuid); } } else if (systemGuid) { - DEBUG_INFO("Overriding to Apple guid\n"); + DEBUG_INFO_IF(log, "Overriding to Apple guid\n"); uuid_copy(destGuid, gAppleNVRAMGuid); } else { - DEBUG_INFO("Use given guid\n"); + DEBUG_INFO_IF(log, "Use given guid\n"); uuid_copy(destGuid, varGuid); } } @@ -553,7 +586,7 @@ translateGUID(const uuid_t varGuid, const char *variableName, uuid_t destGuid, b * @return false if varName/varGuid/veChecked was NULL or if entitlement check returned false */ static bool -verifyVarEntitlement(const uuid_t varGuid, const char *varName, IONVRAMOperation op, bool systemActive, bool *veChecked) +verifyVarEntitlement(const uuid_t varGuid, const char *varName, IONVRAMOperation op, bool systemActive, bool *veChecked, bool log) { if (varGuid == nullptr || varName == nullptr || veChecked == nullptr) { DEBUG_ERROR("nullptr passed as an argument\n"); @@ -564,7 +597,7 @@ verifyVarEntitlement(const uuid_t varGuid, const char *varName, IONVRAMOperation const VariableEntitlementEntry *entry; *veChecked = false; - translateGUID(varGuid, varName, translatedGuid, systemActive); + translateGUID(varGuid, varName, translatedGuid, systemActive, log); entry = gVariableEntitlements; while ((entry != nullptr) && (entry->varName != nullptr)) { @@ -572,7 +605,7 @@ verifyVarEntitlement(const uuid_t varGuid, const char *varName, IONVRAMOperation // check if task entitlement check is required for this operation if (entry->checkOp & (1 << op)) { *veChecked = true; - DEBUG_INFO("Checking entitlement %s for %s for operation %s\n", 
entry->varEntitlement, varName, getNVRAMOpString(op)); + DEBUG_INFO_IF(log, "Checking entitlement %s for %s for operation %s\n", entry->varEntitlement, varName, getNVRAMOpString(op)); return IOCurrentTaskHasEntitlement(entry->varEntitlement); } break; @@ -584,7 +617,7 @@ verifyVarEntitlement(const uuid_t varGuid, const char *varName, IONVRAMOperation } static bool -kernelOnlyVar(const uuid_t varGuid, const char *varName) +kernelOnlyVar(const char *varName) { if (strncmp(varName, KERNEL_ONLY_VAR_NAME_PREFIX, sizeof(KERNEL_ONLY_VAR_NAME_PREFIX) - 1) == 0) { return true; @@ -594,29 +627,34 @@ kernelOnlyVar(const uuid_t varGuid, const char *varName) } static bool -verifyPermission(IONVRAMOperation op, const uuid_t varGuid, const char *varName, const bool systemActive) +verifyPermission(IONVRAMOperation op, const uuid_t varGuid, const char *varName, const bool systemActive, bool log) { VariablePermission perm; bool kernel, varEntitled, writeEntitled = false, readEntitled = false, allowList, systemGuid = false, systemEntitled = false, systemInternalEntitled = false, systemAllow, systemReadHiddenAllow = false; bool admin = false; bool ok = false; - if (verifyVarEntitlement(varGuid, varName, op, systemActive, &varEntitled) == false) { + if (verifyVarEntitlement(varGuid, varName, op, systemActive, &varEntitled, log) == false) { goto exit; } perm = getVariablePermission(varName); kernel = current_task() == kernel_task; - - if (perm.Bits.KernelOnly || kernelOnlyVar(varGuid, varName)) { - DEBUG_INFO("KernelOnly access for %s, kernel=%d\n", varName, kernel); + if (perm.Bits.KernelOnly || kernelOnlyVar(varName)) { + DEBUG_INFO_IF(log, "KernelOnly access for %s, kernel=%d\n", varName, kernel); ok = kernel; goto exit; } + if (perm.Bits.RestoreModifyOnly && (OP_MOD & (1 << op))) { + DEBUG_INFO_IF(log, "RestoreModifyOnly access for %s, gRestoreBoot=%d\n", varName, gRestoreBoot); + ok = gRestoreBoot; + goto exit; + } + if (perm.Bits.InternalOnly && !gInternalBuild) { - DEBUG_INFO("InternalOnly access for %s, gInternalBuild=%d\n", varName, gInternalBuild); + DEBUG_INFO_IF(log, "InternalOnly access for %s, gInternalBuild=%d\n", varName, gInternalBuild); goto exit; } @@ -648,7 +686,7 @@ verifyPermission(IONVRAMOperation op, const uuid_t varGuid, const char *varName, DEBUG_ERROR("Allowed write to system region when NOT entitled for %s\n", varName); } } else if (varEntitled) { - DEBUG_INFO("Allowed write to system region using variable specific entitlement for %s\n", varName); + DEBUG_INFO_IF(log, "Allowed write to system region using variable specific entitlement for %s\n", varName); } else if (!systemAllow) { DEBUG_ERROR("Not entitled for system region writes for %s\n", varName); break; @@ -662,13 +700,13 @@ verifyPermission(IONVRAMOperation op, const uuid_t varGuid, const char *varName, case kIONVRAMOperationObliterate: case kIONVRAMOperationReset: if (perm.Bits.NeverAllowedToDelete) { - DEBUG_INFO("Never allowed to delete %s\n", varName); + DEBUG_INFO_IF(log, "Never allowed to delete %s\n", varName); break; } else if ((op == kIONVRAMOperationObliterate) && perm.Bits.ResetNVRAMOnlyDelete) { - DEBUG_INFO("Not allowed to obliterate %s\n", varName); + DEBUG_INFO_IF(log, "Not allowed to obliterate %s\n", varName); break; } else if ((op == kIONVRAMOperationDelete) && perm.Bits.ResetNVRAMOnlyDelete) { - DEBUG_INFO("Only allowed to delete %s via NVRAM reset\n", varName); + DEBUG_INFO_IF(log, "Only allowed to delete %s via NVRAM reset\n", varName); break; } @@ -679,7 +717,7 @@ verifyPermission(IONVRAMOperation op, 
const uuid_t varGuid, const char *varName, DEBUG_ERROR("Allowed delete to system region when NOT entitled for %s\n", varName); } } else if (varEntitled) { - DEBUG_INFO("Allowed delete to system region using variable specific entitlement for %s\n", varName); + DEBUG_INFO_IF(log, "Allowed delete to system region using variable specific entitlement for %s\n", varName); } else if (!systemAllow) { DEBUG_ERROR("Not entitled for system region deletes for %s\n", varName); break; @@ -694,8 +732,8 @@ verifyPermission(IONVRAMOperation op, const uuid_t varGuid, const char *varName, } exit: - DEBUG_INFO("Permission for %s of %s %s: I=%d kern=%d, adm=%d, wE=%d, rE=%d, sG=%d, sEd=%d, sIEd=%d, sRHA=%d, UW=%d, vE=%d\n", getNVRAMOpString(op), varName, ok ? "granted" : "denied", - gInternalBuild, kernel, admin, writeEntitled, readEntitled, systemGuid, systemEntitled, systemInternalEntitled, systemReadHiddenAllow, perm.Bits.UserWrite, varEntitled); + DEBUG_INFO_IF(log, "Permission for %s of %s %s: I=%d R=%d kern=%d, adm=%d, wE=%d, rE=%d, sG=%d, sEd=%d, sIEd=%d, sRHA=%d, UW=%d, vE=%d\n", getNVRAMOpString(op), varName, ok ? "granted" : "denied", + gInternalBuild, gRestoreBoot, kernel, admin, writeEntitled, readEntitled, systemGuid, systemEntitled, systemInternalEntitled, systemReadHiddenAllow, perm.Bits.UserWrite, varEntitled); return ok; } @@ -708,7 +746,7 @@ verifyPermission(IONVRAMOperation op, const OSSymbol *canonicalKey, const bool s parseVariableName(canonicalKey->getCStringNoCopy(), &varGuid, &varName); - return verifyPermission(op, varGuid, varName, systemActive); + return verifyPermission(op, varGuid, varName, systemActive, true); } static bool @@ -772,6 +810,32 @@ exit: return; } +static void +dumpData(const char *name, OSSharedPtr propData) +{ + if (!IS_LOG_BIT_SET(kIONVRAMDataHexDump) || !propData) { + return; + } + + uint8_t *dataBuf = (uint8_t *)propData->getBytesNoCopy(); + size_t propDataSize = propData->getLength(); + + if (dataBuf == nullptr || propDataSize == 0) { + return; + } + + IOLog("%s:%s:%u - %s: ", __FILE_NAME__, __FUNCTION__, __LINE__, name); + for (size_t i = 0; i < propDataSize; i++) { + // if printable character, use that + if ((dataBuf[i] >= 0x20 && dataBuf[i] <= 0x7e) && dataBuf[i] != '%') { + IOLog("%c", dataBuf[i]); + } else { + IOLog("%%%02x", dataBuf[i]); + } + } + IOLog("\n"); +} + // ************************** IODTNVRAMPlatformNotifier **************************** // private IOService based class for passing notifications to IODTNVRAM @@ -1571,7 +1635,6 @@ IODTNVRAM::dictionaryWithProperties(void) const uuid_t varGuid; const char * varName; IOReturn status; - require_action(_format, exit, DEBUG_ERROR("Handler not initialized yet\n")); status = _format->getVarDict(localVarDict); @@ -1588,7 +1651,7 @@ IODTNVRAM::dictionaryWithProperties(void) const parseVariableName(canonicalKey, &varGuid, &varName); if ((uuid_compare(varGuid, gAppleSystemVariableGuid) == 0) && - verifyPermission(kIONVRAMOperationRead, varGuid, varName, _format->getSystemPartitionActive())) { + verifyPermission(kIONVRAMOperationRead, varGuid, varName, _format->getSystemPartitionActive(), false)) { OSSharedPtr returnKey = OSSymbol::withCString(varName); returnDict->setObject(returnKey.get(), localVarDict->getObject(canonicalKey)); } @@ -1608,7 +1671,7 @@ IODTNVRAM::dictionaryWithProperties(void) const continue; } - if (verifyPermission(kIONVRAMOperationRead, varGuid, varName, _format->getSystemPartitionActive())) { + if (verifyPermission(kIONVRAMOperationRead, varGuid, varName, 
_format->getSystemPartitionActive(), false)) { OSSharedPtr returnKey = OSSymbol::withCString(varName); returnDict->setObject(returnKey.get(), localVarDict->getObject(canonicalKey)); } @@ -1698,12 +1761,12 @@ IODTNVRAM::copyPropertyWithGUIDAndName(const uuid_t guid, const char *name) cons status = _format->getVarDict(localVarDict); require_noerr_action(status, exit, DEBUG_ERROR("Failed to get variable dictionary\n")); - if (!verifyPermission(kIONVRAMOperationRead, guid, name, _format->getSystemPartitionActive())) { + if (!verifyPermission(kIONVRAMOperationRead, guid, name, _format->getSystemPartitionActive(), true)) { DEBUG_INFO("Not privileged\n"); goto exit; } - translateGUID(guid, name, newGuid, _format->getSystemPartitionActive()); + translateGUID(guid, name, newGuid, _format->getSystemPartitionActive(), true); canonicalKey = keyWithGuidAndCString(newGuid, name); @@ -1773,6 +1836,29 @@ IODTNVRAM::getProperty(const char *aKey) const return theObject.get(); } +IOReturn +IODTNVRAM::clearTestVars(const uuid_t guid) +{ + const VariablePermissionEntry *entry; + IOReturn ret = kIOReturnSuccess; + uuid_t newGuid; + + entry = gVariablePermissions; + require_action(gInternalBuild, exit, DEBUG_INFO("Internal build only\n")); + require_action(_format != nullptr, exit, (ret = kIOReturnNotReady, DEBUG_ERROR("Handler not initialized yet\n"))); + + while ((entry != nullptr) && (entry->name != nullptr)) { + if (entry->p.Bits.TestingOnly) { + translateGUID(guid, entry->name, newGuid, _format->getSystemPartitionActive(), true); + ret = _format->setVariable(newGuid, entry->name, nullptr); + } + entry++; + } + +exit: + return ret; +} + IOReturn IODTNVRAM::setPropertyWithGUIDAndName(const uuid_t guid, const char *name, OSObject *anObject) { @@ -1788,6 +1874,11 @@ IODTNVRAM::setPropertyWithGUIDAndName(const uuid_t guid, const char *name, OSObj require_action(_format != nullptr, exit, (ret = kIOReturnNotReady, DEBUG_ERROR("Handler not initialized yet\n"))); + if (strncmp(name, kNVRAMClearTestVarKey, sizeof(kNVRAMClearTestVarKey)) == 0) { + ret = clearTestVars(guid); + goto exit; + } + deletePropertyKey = strncmp(name, kIONVRAMDeletePropertyKey, sizeof(kIONVRAMDeletePropertyKey)) == 0; deletePropertyKeyWRet = strncmp(name, kIONVRAMDeletePropertyKeyWRet, sizeof(kIONVRAMDeletePropertyKeyWRet)) == 0; syncNowPropertyKey = strncmp(name, kIONVRAMSyncNowPropertyKey, sizeof(kIONVRAMSyncNowPropertyKey)) == 0; @@ -1839,7 +1930,7 @@ IODTNVRAM::setPropertyWithGUIDAndName(const uuid_t guid, const char *name, OSObj goto exit; } - if (!verifyPermission(kIONVRAMOperationWrite, guid, name, _format->getSystemPartitionActive())) { + if (!verifyPermission(kIONVRAMOperationWrite, guid, name, _format->getSystemPartitionActive(), true)) { DEBUG_INFO("Not privileged\n"); ret = kIOReturnNotPrivileged; goto exit; @@ -1887,8 +1978,10 @@ IODTNVRAM::setPropertyWithGUIDAndName(const uuid_t guid, const char *name, OSObj } if (propObject != nullptr) { - propDataSize = (OSDynamicPtrCast(propObject))->getLength(); - record_system_event(SYSTEM_EVENT_TYPE_INFO, SYSTEM_EVENT_SUBSYSTEM_NVRAM, "write", "%s as data with size %#x", name, ((OSData *)propObject.get())->getLength()); + OSSharedPtr propData = OSDynamicPtrCast(propObject); + propDataSize = propData->getLength(); + record_system_event(SYSTEM_EVENT_TYPE_INFO, SYSTEM_EVENT_SUBSYSTEM_NVRAM, "write", "%s as data with size %zu", name, propDataSize); + dumpData(name, propData); } #if defined(XNU_TARGET_OS_OSX) @@ -1922,7 +2015,7 @@ IODTNVRAM::setPropertyWithGUIDAndName(const uuid_t guid, const 
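// The dumpData() helper above (gated by the kIONVRAMDataHexDump log bit) logs
// NVRAM variable payloads with printable ASCII passed through verbatim and any
// other byte -- including '%' itself -- escaped as '%' plus two lowercase hex
// digits. The sketch below is a hypothetical userspace companion that reverses
// that encoding when pulling such lines back out of the system log; the
// function name and the use of the C++ standard library are illustrative only
// and not part of the patch.
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <string>
#include <vector>

static std::vector<uint8_t>
decodeNVRAMHexDump(const std::string &logged)
{
    std::vector<uint8_t> bytes;

    for (size_t i = 0; i < logged.size(); i++) {
        if (logged[i] == '%' && i + 2 < logged.size() &&
            isxdigit((unsigned char)logged[i + 1]) &&
            isxdigit((unsigned char)logged[i + 2])) {
            // Escaped byte: '%' followed by two hex digits.
            bytes.push_back((uint8_t)strtoul(logged.substr(i + 1, 2).c_str(), nullptr, 16));
            i += 2;
        } else {
            // Printable byte (0x20-0x7e, other than '%') logged as-is.
            bytes.push_back((uint8_t)logged[i]);
        }
    }
    return bytes;
}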
char *name, OSObj if (remove == false) { DEBUG_INFO("Adding object\n"); - translateGUID(guid, name, newGuid, _format->getSystemPartitionActive()); + translateGUID(guid, name, newGuid, _format->getSystemPartitionActive(), true); ret = _format->setVariable(newGuid, name, propObject.get()); } else { DEBUG_INFO("Removing object\n"); @@ -1979,13 +2072,13 @@ IODTNVRAM::removePropertyWithGUIDAndName(const uuid_t guid, const char *name) DEBUG_INFO("name=%s\n", name); require_action(_format != nullptr, exit, (ret = kIOReturnNotReady, DEBUG_ERROR("Handler not initialized yet\n"))); - if (!verifyPermission(kIONVRAMOperationDelete, guid, name, _format->getSystemPartitionActive())) { + if (!verifyPermission(kIONVRAMOperationDelete, guid, name, _format->getSystemPartitionActive(), true)) { DEBUG_INFO("Not privileged\n"); ret = kIOReturnNotPrivileged; goto exit; } - translateGUID(guid, name, newGuid, _format->getSystemPartitionActive()); + translateGUID(guid, name, newGuid, _format->getSystemPartitionActive(), true); ret = _format->setVariable(newGuid, name, nullptr); if (ret != kIOReturnSuccess) { diff --git a/iokit/Kernel/IONVRAMCHRPHandler.cpp b/iokit/Kernel/IONVRAMCHRPHandler.cpp index d9c38dde0..cbd1607d5 100644 --- a/iokit/Kernel/IONVRAMCHRPHandler.cpp +++ b/iokit/Kernel/IONVRAMCHRPHandler.cpp @@ -363,7 +363,7 @@ IONVRAMCHRPHandler::flush(const uuid_t guid, IONVRAMOperation op) clear = ((flushSystem && (uuid_compare(varGuid, gAppleSystemVariableGuid) == 0)) || (flushCommon && (uuid_compare(varGuid, gAppleSystemVariableGuid) != 0))) && - verifyPermission(op, varGuid, varName, getSystemPartitionActive()); + verifyPermission(op, varGuid, varName, getSystemPartitionActive(), true); if (clear) { DEBUG_INFO("Clearing entry for %s:%s\n", uuidString, varName); diff --git a/iokit/Kernel/IONVRAMV3Handler.cpp b/iokit/Kernel/IONVRAMV3Handler.cpp index 5e3f589ca..5c0d1adca 100644 --- a/iokit/Kernel/IONVRAMV3Handler.cpp +++ b/iokit/Kernel/IONVRAMV3Handler.cpp @@ -375,7 +375,7 @@ IONVRAMV3Handler::flush(const uuid_t guid, IONVRAMOperation op) clear = ((flushSystem && (uuid_compare(varGuid, gAppleSystemVariableGuid) == 0)) || (flushCommon && (uuid_compare(varGuid, gAppleSystemVariableGuid) != 0))) && - verifyPermission(op, varGuid, varName, getSystemPartitionActive()); + verifyPermission(op, varGuid, varName, getSystemPartitionActive(), true); if (clear) { DEBUG_INFO("Clearing entry for %s:%s\n", uuidString, varName); diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index 2d924eb14..d0f78f6fb 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include "IOKitKernelInternal.h" #if HIBERNATION #include @@ -68,6 +69,7 @@ #include #include #include +#include #include #include @@ -188,7 +190,8 @@ enum { kPowerEventPublishSleepWakeUUID, // 13 kPowerEventSetDisplayPowerOn, // 14 kPowerEventPublishWakeType, // 15 - kPowerEventAOTEvaluate // 16 + kPowerEventAOTEvaluate, // 16 + kPowerEventRunModeRequest // 17 }; // For evaluatePolicy() @@ -333,15 +336,6 @@ enum { kWranglerPowerStateMax = 4 }; -enum { - OFF_STATE = 0, - RESTART_STATE = 1, - SLEEP_STATE = 2, - AOT_STATE = 3, - ON_STATE = 4, - NUM_POWER_STATES -}; - const char * getPowerStateString( uint32_t state ) { @@ -623,6 +617,7 @@ struct timeval gIOLastUserSleepTime; static char gWakeReasonString[128]; static char gBootReasonString[80]; static char gShutdownReasonString[80]; +static uint64_t gShutdownTime; static bool gWakeReasonSysctlRegistered = false; 
static bool gBootReasonSysctlRegistered = false; static bool gShutdownReasonSysctlRegistered = false; @@ -1378,6 +1373,26 @@ SYSCTL_PROC(_kern, OID_AUTO, shutdownreason, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_shutdownreason, "A", "shutdownreason"); +// This value is meant to represent the last time the device shut down +// in a unit of the PMU driver's choosing see rdar://138590268 for details +static int +sysctl_shutdowntime SYSCTL_HANDLER_ARGS +{ + uint64_t shutdownTime = 0; + + if (gRootDomain && gShutdownReasonSysctlRegistered) { + gRootDomain->copyShutdownTime(&shutdownTime); + } else { + return ENOENT; + } + + return SYSCTL_OUT(req, &shutdownTime, sizeof(shutdownTime)); +} + +SYSCTL_PROC(_kern, OID_AUTO, shutdowntime, + CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + NULL, 0, sysctl_shutdowntime, "Q", "shutdowntime"); + static int sysctl_targettype SYSCTL_HANDLER_ARGS { @@ -1400,6 +1415,69 @@ SYSCTL_PROC(_hw, OID_AUTO, targettype, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, sysctl_targettype, "A", "targettype"); +static SECURITY_READ_ONLY_LATE(char*) jetsam_properties_product_type_string = NULL; +static SECURITY_READ_ONLY_LATE(size_t) jetsam_properties_product_type_string_len = 0; + +/* + * SecureDTLookupEntry() is only guaranteed to work before PE_init_iokit(), + * so we load the jetsam_properties_product_type string (if available) in a startup handler. + */ +__startup_func +static void +sysctl_load_jetsam_properties_product_type(void) +{ + DTEntry node; + void const *value = NULL; + unsigned int size = 0; + + if (kSuccess != SecureDTLookupEntry(nullptr, "/product", &node)) { + return; + } + + if (kSuccess != SecureDTGetProperty(node, "jetsam-properties-product-type", (void const **) &value, &size)) { + return; + } + + if (size == 0) { + return; + } + + jetsam_properties_product_type_string = (char *) zalloc_permanent(size, ZALIGN_NONE); + if (jetsam_properties_product_type_string == NULL) { + return; + } + + memcpy(jetsam_properties_product_type_string, value, size); + jetsam_properties_product_type_string_len = size; +} +STARTUP(SYSCTL, STARTUP_RANK_MIDDLE, sysctl_load_jetsam_properties_product_type); + +static int +sysctl_jetsam_properties_product_type SYSCTL_HANDLER_ARGS +{ + if (jetsam_properties_product_type_string != NULL) { + return SYSCTL_OUT(req, jetsam_properties_product_type_string, jetsam_properties_product_type_string_len); + } + + IOService * root; + OSSharedPtr obj; + OSData * data; + char tt[32]; + + tt[0] = '\0'; + root = IOService::getServiceRoot(); + if (root && (obj = root->copyProperty(gIODTTargetTypeKey))) { + if ((data = OSDynamicCast(OSData, obj.get()))) { + strlcpy(tt, (const char *) data->getBytesNoCopy(), sizeof(tt)); + } + } + return sysctl_io_string(req, tt, 0, 0, NULL); +} + +SYSCTL_PROC(_hw, OID_AUTO, jetsam_properties_product_type, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + NULL, 0, sysctl_jetsam_properties_product_type, "A", "jetsam_properties_product_type"); + static SYSCTL_INT(_debug, OID_AUTO, noidle, CTLFLAG_RW, &gNoIdleFlag, 0, ""); static SYSCTL_INT(_debug, OID_AUTO, swd_sleep_timeout, CTLFLAG_RW, &gSwdSleepTimeout, 0, ""); static SYSCTL_INT(_debug, OID_AUTO, swd_wake_timeout, CTLFLAG_RW, &gSwdWakeTimeout, 0, ""); @@ -1429,6 +1507,8 @@ sysctl_aotmetrics SYSCTL_HANDLER_ARGS return sysctl_io_opaque(req, gRootDomain->_aotMetrics, sizeof(IOPMAOTMetrics), NULL); } +TUNABLE_DT_WRITEABLE(uint32_t, gAOTMode, "/product/iopm", + "aot-mode", "aot_mode", 0, 
TUNABLE_DT_NONE); static SYSCTL_PROC(_kern, OID_AUTO, aotmetrics, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, NULL, 0, sysctl_aotmetrics, "S,IOPMAOTMetrics", ""); @@ -1504,6 +1584,15 @@ static SYSCTL_PROC(_kern, OID_AUTO, aotmode, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, NULL, 0, sysctl_aotmode, "I", ""); +TUNABLE_DT(uint32_t, gAOTLingerTimeMS, "/product/iopm", + "aot-linger-time-ms", "aot_linger_time_ms", 800, TUNABLE_DT_NONE); + +// Low Power Wake tunables +TUNABLE_DT_WRITEABLE(uint64_t, gLPWFlags, "/product/iopm", + "low-power-wake", "low_power_wake", false, TUNABLE_DT_NONE); +static SYSCTL_QUAD(_kern, OID_AUTO, lowpowerwake, CTLFLAG_RW | CTLFLAG_LOCKED, + &gLPWFlags, "Low Power Wake"); + //****************************************************************************** static OSSharedPtr gIOPMSettingAutoWakeCalendarKey; @@ -1605,6 +1694,10 @@ IOPMrootDomain::start( IOService * nub ) PE_parse_boot_argn("haltmspanic", &gHaltTimeMaxPanic, sizeof(gHaltTimeMaxPanic)); PE_parse_boot_argn("haltmslog", &gHaltTimeMaxLog, sizeof(gHaltTimeMaxLog)); + _aotMode = gAOTMode; + _aotLingerTime = gAOTLingerTimeMS; + _aotMetrics = _aotMode ? IOMallocType(IOPMAOTMetrics) : NULL; + // read noidle setting from Device Tree if (PE_get_default("no-idle", &gNoIdleFlag, sizeof(gNoIdleFlag))) { DLOG("Setting gNoIdleFlag to %u from device tree\n", gNoIdleFlag); @@ -1741,6 +1834,9 @@ IOPMrootDomain::start( IOService * nub ) PMinit(); // creates gIOPMWorkLoop gIOPMWorkLoop = getIOPMWorkloop(); + commandGate = IOCommandGate::commandGate(gIOPMWorkLoop); + gIOPMWorkLoop->addEventSource(commandGate.get()); + // Create IOPMPowerStateQueue used to queue external power // events, and to handle those events on the PM work loop. pmPowerStateQueue = IOPMPowerStateQueue::PMPowerStateQueue( @@ -1748,7 +1844,6 @@ IOPMrootDomain::start( IOService * nub ) &IOPMrootDomain::dispatchPowerEvent)); gIOPMWorkLoop->addEventSource(pmPowerStateQueue); - _aotMode = 0; _aotTimerES = IOTimerEventSource::timerEventSource(this, OSMemberFunctionCast(IOTimerEventSource::Action, this, &IOPMrootDomain::aotEvaluate)); @@ -2047,6 +2142,27 @@ exit: return return_value; } +#if HIBERNATION +// MARK: - +// MARK: setLockdownModeHibernation +// *************************************************************************** +void +IOPMrootDomain::setLockdownModeHibernation(uint32_t status) +{ + if (!gIOPMWorkLoop->inGate()) { + gIOPMWorkLoop->runAction( + OSMemberFunctionCast(IOWorkLoop::Action, this, + &IOPMrootDomain::setLockdownModeHibernation), + this, (void *)(uintptr_t) status); + return; + } + + ldmHibernateDisable = status; + DLOG("ldmHibernateDisable %d\n", status); + setProperty("IOPMLDMHibernationDisable", status); +} +#endif + // MARK: - // MARK: Aggressiveness @@ -2827,6 +2943,15 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) notifierThread = current_thread(); switch (getPowerState()) { case SLEEP_STATE: { + if (kIOPMDriverAssertionLevelOn == getPMAssertionLevel(kIOPMDriverAssertionForceWakeupBit)) { + IOLog("accelerate wake for assertion\n"); + setWakeTime(mach_continuous_time()); + } + if (kIOPMDriverAssertionLevelOn == getPMAssertionLevel(kIOPMDriverAssertionForceFullWakeupBit)) { + // Note: The scheduled RTC wakeup will trigger a full wake. 
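// A minimal userspace sketch for reading the two 64-bit sysctls registered
// above: kern.shutdowntime (whose units are PMU-defined and relayed verbatim
// by the kernel) and kern.lowpowerwake (backed by the gLPWFlags tunable). It
// assumes a standard libc sysctlbyname(); error handling beyond the
// return-value check is omitted.
#include <cstdint>
#include <cstdio>
#include <sys/sysctl.h>

static int
readU64Sysctl(const char *name, uint64_t *out)
{
    size_t len = sizeof(*out);
    // 0 on success; -1 with errno set (e.g. ENOENT) if the OID is unavailable.
    return sysctlbyname(name, out, &len, nullptr, 0);
}

int
main(void)
{
    uint64_t shutdownTime = 0, lpwFlags = 0;

    if (readU64Sysctl("kern.shutdowntime", &shutdownTime) == 0) {
        printf("last shutdown time (PMU units): %llu\n", (unsigned long long)shutdownTime);
    }
    if (readU64Sysctl("kern.lowpowerwake", &lpwFlags) == 0) {
        printf("low power wake flags: 0x%llx\n", (unsigned long long)lpwFlags);
    }
    return 0;
}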
+ scheduleImmediateDebugWake(); + } + if (kPMCalendarTypeInvalid != _aotWakeTimeCalendar.selector) { secs = 0; microsecs = 0; @@ -2895,7 +3020,6 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) if (!_aotLastWakeTime) { gIOLastUserSleepTime = gIOLastSleepTime; } - gIOLastWakeTime.tv_sec = 0; gIOLastWakeTime.tv_usec = 0; gIOLastSleepAbsTime = now; @@ -3007,9 +3131,11 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) if (_aotTestTime) { if (_aotWakeTimeUTC <= secs) { - _aotTestTime = _aotTestTime + _aotTestInterval; + _aotTestTime = mach_continuous_time() + _aotTestInterval; + } + if (_aotTestTime < _aotEndTime) { + _setWakeTime(_aotTestTime); } - setWakeTime(_aotTestTime); } } @@ -3168,9 +3294,13 @@ IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) // and before the changePowerStateWithTagToPriv() call below. WAKEEVENT_LOCK(); aotShouldExit(false); + unsigned long newState = getRUN_STATE(); + if (AOT_STATE == newState) { + _aotRunMode = gLPWFlags; + } WAKEEVENT_UNLOCK(); - changePowerStateWithTagToPriv(getRUN_STATE(), kCPSReasonWake); + changePowerStateWithTagToPriv(newState, kCPSReasonWake); break; } #if !__i386__ && !__x86_64__ @@ -4176,34 +4306,25 @@ IOPMrootDomain::scheduleImmediateDebugWake( void ) } //****************************************************************************** -// willNotifyPowerChildren +// willNotifyInterest // -// Called after all interested drivers have all acknowledged the power change, -// but before any power children is informed. Dispatched though a thread call, -// so it is safe to perform work that might block on a sleeping disk. PM state -// machine (not thread) will block w/o timeout until this function returns. +// Called after all priority clients have all acknowledged the power change, +// but before any interested drivers and any power children are informed. +// Dispatched though a thread call, so it is safe to perform work that might block on a +// sleeping disk. PM state machine (not thread) will block w/o timeout until this function returns. 
//****************************************************************************** void -IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState ) +IOPMrootDomain::willNotifyInterested( IOPMPowerStateIndex newPowerState ) { if (SLEEP_STATE == newPowerState) { - notifierThread = current_thread(); - if (updateTasksSuspend(kTasksSuspendSuspended, kTasksSuspendNoChange)) { - AbsoluteTime deadline; - - clock_interval_to_deadline(10, kSecondScale, &deadline); -#if defined(XNU_TARGET_OS_OSX) - vm_pageout_wait(AbsoluteTime_to_scalar(&deadline)); -#endif /* defined(XNU_TARGET_OS_OSX) */ - } - _aotReadyToFullWake = false; #if 0 if (_aotLingerTime) { - uint64_t deadline; + uint64_t interval, deadline; IOLog("aot linger no return\n"); - clock_absolutetime_interval_to_deadline(_aotLingerTime, &deadline); + nanoseconds_to_absolutetime(_aotLingerTime * NSEC_PER_MSEC, &interval); + clock_absolutetime_interval_to_deadline(interval, &deadline); clock_delay_until(deadline); } #endif @@ -4221,19 +4342,44 @@ IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState ) _aotLastWakeTime = 0; bzero(_aotMetrics, sizeof(IOPMAOTMetrics)); if (kIOPMAOTModeCycle & _aotMode) { - clock_interval_to_absolutetime_interval(60, kSecondScale, &_aotTestInterval); + clock_interval_to_absolutetime_interval(10, kSecondScale, &_aotTestInterval); _aotTestTime = mach_continuous_time() + _aotTestInterval; - setWakeTime(_aotTestTime); + AbsoluteTime endInterval; + clock_interval_to_absolutetime_interval(60, kSecondScale, &endInterval); + _aotEndTime = mach_continuous_time() + endInterval; + _setWakeTime(_aotTestTime); } - uint32_t lingerSecs; - if (!PE_parse_boot_argn("aotlinger", &lingerSecs, sizeof(lingerSecs))) { - lingerSecs = 0; - } - clock_interval_to_absolutetime_interval(lingerSecs, kSecondScale, &_aotLingerTime); clock_interval_to_absolutetime_interval(2000, kMillisecondScale, &_aotWakePreWindow); clock_interval_to_absolutetime_interval(1100, kMillisecondScale, &_aotWakePostWindow); } + if (updateTasksSuspend(kTasksSuspendSuspended, kTasksSuspendNoChange)) { + IOLog("PMRD: tasks suspend\n"); + AbsoluteTime deadline; + + clock_interval_to_deadline(10, kSecondScale, &deadline); +#if defined(XNU_TARGET_OS_OSX) + vm_pageout_wait(AbsoluteTime_to_scalar(&deadline)); +#endif /* defined(XNU_TARGET_OS_OSX) */ + } + } +} + +//****************************************************************************** +// willNotifyPowerChildren +// +// Called after all interested drivers have all acknowledged the power change, +// but before any power children are informed. +// Dispatched though a thread call, so it is safe to perform work that might block on a +// sleeping disk. PM state machine (not thread) will block w/o timeout until this function returns. +//****************************************************************************** + +void +IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState ) +{ + if (SLEEP_STATE == newPowerState) { + notifierThread = current_thread(); + #if HIBERNATION // Adjust watchdog for IOHibernateSystemSleep int defaultTimeout = getWatchdogTimeout(); @@ -6166,7 +6312,7 @@ IOPMrootDomain::overrideOurPowerChange( #if HIBERNATION && defined(__arm64__) if (lowBatteryCondition && (desiredPowerState < currentPowerState)) { - if (!ml_is_secure_hib_supported()) { + if (!ml_is_secure_hib_supported() || ldmHibernateDisable) { // If hibernation is unsupported, reject sleep requests to avoid // racing with system shutdown. 
*inOutChangeFlags |= kIOPMNotDone; @@ -6248,21 +6394,22 @@ IOPMrootDomain::handleOurPowerChangeStart( if (changeFlags & kIOPMSynchronize) { if (newPowerState == ON_STATE) { if (changeFlags & kIOPMSyncNoChildNotify) { - _systemTransitionType = kSystemTransitionNewCapClient; + setSystemTransitionTypeGated(kSystemTransitionNewCapClient); } else { - _systemTransitionType = kSystemTransitionCapability; + setSystemTransitionTypeGated(kSystemTransitionCapability); } } } // 2. Going to sleep (cancellation still possible). else if (newPowerState < currentPowerState) { - _systemTransitionType = kSystemTransitionSleep; + setSystemTransitionTypeGated(kSystemTransitionSleep); } // 3. Woke from (idle or demand) sleep. else if (!systemBooting && (changeFlags & kIOPMSelfInitiated) && (newPowerState > currentPowerState)) { - _systemTransitionType = kSystemTransitionWake; + setSystemTransitionTypeGated(kSystemTransitionWake); + _desiredCapability = kIOPMSystemCapabilityCPU | kIOPMSystemCapabilityNetwork; // Early exit from dark wake to full (e.g. LID open) @@ -6304,7 +6451,7 @@ IOPMrootDomain::handleOurPowerChangeStart( if ((kSystemTransitionCapability == _systemTransitionType) && (_pendingCapability == _currentCapability)) { // Cancel the PM state change. - _systemTransitionType = kSystemTransitionNone; + setSystemTransitionTypeGated(kSystemTransitionNone); *inOutChangeFlags |= kIOPMNotDone; } if (__builtin_popcount(_pendingCapability) < @@ -6413,7 +6560,7 @@ IOPMrootDomain::handleOurPowerChangeStart( // Clear stats about sleep if (AOT_STATE == newPowerState) { - _pendingCapability = 0; + _pendingCapability = kIOPMSystemCapabilityAOT; } if (AOT_STATE == currentPowerState) { @@ -6503,6 +6650,38 @@ IOPMrootDomain::handleOurPowerChangeStart( } } +void +IOPMrootDomain::setSystemTransitionTypeGated(SystemTransitionType type) +{ + assert(gIOPMWorkLoop->inGate()); + _systemTransitionType = type; + commandGate->commandWakeup(&_systemTransitionType); +} + +void +IOPMrootDomain::waitForSystemTransitionToMinPowerState(IOPMRootDomainPowerState state) +{ + while (true) { + IOReturn ret = gIOPMWorkLoop->runActionBlock(^{ + // Block until all in progress transitions have completed. + while (_systemTransitionType != kSystemTransitionNone) { + commandGate->commandSleep(&_systemTransitionType); + } + + // Check the current power state. + if (getPowerState() >= state) { + return kIOReturnSuccess; + } + + return kIOReturnError; + }); + + if (ret == kIOReturnSuccess) { + break; + } + } +} + void IOPMrootDomain::handleOurPowerChangeDone( IOService * service, @@ -6512,7 +6691,7 @@ IOPMrootDomain::handleOurPowerChangeDone( IOPMPowerChangeFlags changeFlags ) { if (kSystemTransitionNewCapClient == _systemTransitionType) { - _systemTransitionType = kSystemTransitionNone; + setSystemTransitionTypeGated(kSystemTransitionNone); return; } @@ -6677,7 +6856,8 @@ IOPMrootDomain::handleOurPowerChangeDone( tracePoint( kIOPMTracePointSystemUp ); } - _systemTransitionType = kSystemTransitionNone; + setSystemTransitionTypeGated(kSystemTransitionNone); + _systemMessageClientMask = 0; toldPowerdCapWillChange = false; @@ -7519,6 +7699,29 @@ IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options, break; #endif + if (_driverKitMatchingAssertionCount != 0 || _driverKitSyncedAssertionCount != 0) { + err = kPMCPUAssertion; + break; + } + + // Check for any dexts currently being added to the PM tree. Sleeping while + // this is in flight can cause IOServicePH to timeout. 
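// A condensed restatement (with illustrative names) of the reordered checks in
// checkSystemSleepAllowed() above: DriverKit matching/synced assertions and
// IOServicePH PM-tree readiness are now evaluated before the low-battery and
// thermal early-out, so an in-flight dext match or a held synced assertion can
// veto sleep, while low battery or a thermal emergency still forces sleep once
// those DriverKit checks pass. This is a sketch of the ordering only; the
// remaining checks and the embedded full-wake kick are elided.
enum SleepVeto { kVetoNone, kVetoCPUAssertion, kVetoDKNotReady };

static SleepVeto
evaluateDriverKitSleepVetoes(bool dkAssertionsHeld, bool pmTreeReady,
    bool lowBatteryOrThermalEmergency)
{
    if (dkAssertionsHeld) {
        return kVetoCPUAssertion;       // kPMCPUAssertion in the patch
    }
    if (!pmTreeReady) {
        return kVetoDKNotReady;         // kPMDKNotReady in the patch
    }
    if (lowBatteryOrThermalEmergency) {
        return kVetoNone;               // always sleep in these states
    }
    // ... further checks (CPU assertions, sleep disables, etc.) elided ...
    return kVetoNone;
}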
+ if (!IOServicePH::checkPMReady()) { +#if !defined(XNU_TARGET_OS_OSX) + if (!(lowBatteryCondition || thermalWarningState || thermalEmergencyState)) { + // 116893363: kPMDKNotReady sleep cancellations often leaves embedded devices + // in dark wake for long periods of time, which causes issues as apps were + // already informed of sleep during the f->9 transition. As a temporary + // measure, always full wake if we hit this specific condition. + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDarkWakeActivityTickle); + } +#endif + err = kPMDKNotReady; + break; + } + if (lowBatteryCondition || thermalWarningState || thermalEmergencyState) { break; // always sleep on low battery or when in thermal warning/emergency state } @@ -7532,26 +7735,6 @@ IOPMrootDomain::checkSystemSleepAllowed( IOOptionBits options, break; } - if (_driverKitMatchingAssertionCount != 0) { - err = kPMCPUAssertion; - break; - } - - // Check for any dexts currently being added to the PM tree. Sleeping while - // this is in flight can cause IOServicePH to timeout. - if (!IOServicePH::checkPMReady()) { -#if !defined(XNU_TARGET_OS_OSX) - // 116893363: kPMDKNotReady sleep cancellations often leaves embedded devices - // in dark wake for long periods of time, which causes issues as apps were - // already informed of sleep during the f->9 transition. As a temporary - // measure, always full wake if we hit this specific condition. - pmPowerStateQueue->submitPowerEvent( - kPowerEventPolicyStimulus, - (void *) kStimulusDarkWakeActivityTickle); -#endif - err = kPMDKNotReady; - break; - } if (getPMAssertionLevel( kIOPMDriverAssertionCPUBit ) == kIOPMDriverAssertionLevelOn) { @@ -7641,6 +7824,27 @@ IOPMrootDomain::checkSystemCanAbortIdleSleep( void ) return idleSleepRevertible && abortableSleepType; } +//****************************************************************************** +// considerRunMode +// consider the driver for AOT power on via the runmode mask +//****************************************************************************** + +int32_t +IOPMrootDomain::considerRunMode(IOService * service, uint64_t pmDriverClass) +{ + int32_t promote; + + if ((0 == _aotRunMode) || (service == this)) { + // neutral + return 0; + } + promote = (0 != (_aotRunMode & pmDriverClass)) ? 1 : -1; + if (promote > 0) { + IOLog("IOPMRD: %s 0x%llx runmode to %s\n", service->getName(), pmDriverClass, (promote < 0) ? 
"OFF" : "ON"); + } + return promote; +} + //****************************************************************************** // attemptIdleSleepAbort //****************************************************************************** @@ -7818,8 +8022,42 @@ IOPMrootDomain::isAOTMode() return _aotNow; } +bool +IOPMrootDomain::isLPWMode() +{ + return gLPWFlags && currentOrPendingPowerState(AOT_STATE); +} + +bool +IOPMIsAOTMode(void) +{ + return gIOPMRootDomain && gIOPMRootDomain->isAOTMode(); +} +bool +IOPMIsLPWMode(void) +{ + return gIOPMRootDomain && gIOPMRootDomain->isLPWMode(); +} + +void +IOPMNetworkStackFullWake(uint64_t flags, const char * reason) +{ + assert(kIOPMNetworkStackFullWakeFlag == flags); + assert(gIOPMRootDomain); + gIOPMRootDomain->claimSystemWakeEvent(gIOPMRootDomain, kIOPMWakeEventAOTExit, reason, NULL); +} + IOReturn IOPMrootDomain::setWakeTime(uint64_t wakeContinuousTime) +{ + if (kIOPMAOTModeCycle & _aotMode) { + return kIOReturnSuccess; + } + return _setWakeTime(wakeContinuousTime); +} + +IOReturn +IOPMrootDomain::_setWakeTime(uint64_t wakeContinuousTime) { clock_sec_t nowsecs, wakesecs; clock_usec_t nowmicrosecs, wakemicrosecs; @@ -7910,6 +8148,7 @@ IOPMrootDomain::aotExit(bool cps) ASSERT_GATED(); _aotNow = false; + _aotRunMode = 0; _aotReadyToFullWake = false; if (_aotTimerScheduled) { _aotTimerES->cancelTimeout(); @@ -7982,8 +8221,8 @@ IOPMrootDomain::aotEvaluate(IOTimerEventSource * timer) void IOPMrootDomain::adjustPowerState( bool sleepASAP ) { - DEBUG_LOG("adjustPowerState %s, asap %d, idleSleepEnabled %d\n", - getPowerStateString((uint32_t) getPowerState()), sleepASAP, idleSleepEnabled); + DEBUG_LOG("adjustPowerState %s, asap %d, idleSleepEnabled %d, _aotNow %d\n", + getPowerStateString((uint32_t) getPowerState()), sleepASAP, idleSleepEnabled, _aotNow); ASSERT_GATED(); @@ -7999,11 +8238,7 @@ IOPMrootDomain::adjustPowerState( bool sleepASAP ) && !_aotTimerScheduled && (kIOPMWakeEventAOTPossibleExit == (kIOPMWakeEventAOTPossibleFlags & _aotPendingFlags))) { _aotTimerScheduled = true; - if (_aotLingerTime) { - _aotTimerES->setTimeout(_aotLingerTime); - } else { - _aotTimerES->setTimeout(800, kMillisecondScale); - } + _aotTimerES->setTimeout(_aotLingerTime, kMillisecondScale); } WAKEEVENT_UNLOCK(); if (exitNow) { @@ -8259,6 +8494,11 @@ IOPMrootDomain::dispatchPowerEvent( aotEvaluate(NULL); } break; + case kPowerEventRunModeRequest: + DLOG("power event %u args %p 0x%llx\n", event, OBFUSCATE(arg0), arg1); + // arg1 == runModeMask + handleRequestRunMode(arg1); + break; } } @@ -8455,7 +8695,7 @@ IOPMrootDomain::handlePowerNotification( UInt32 msg ) if (msg & kIOPMPowerEmergency) { DLOG("Received kIOPMPowerEmergency"); #if HIBERNATION && defined(__arm64__) - if (!ml_is_secure_hib_supported()) { + if (!ml_is_secure_hib_supported() || ldmHibernateDisable) { // Wait for the next low battery notification if the system state is // in transition. 
if ((_systemTransitionType == kSystemTransitionNone) && @@ -8465,6 +8705,7 @@ IOPMrootDomain::handlePowerNotification( UInt32 msg ) lowBatteryCondition = true; // Notify userspace to initiate system shutdown + DLOG("Initiating userspace shutdown ml_is_secure_hib_supported %d lockdownMode %d", ml_is_secure_hib_supported(), ldmHibernateDisable); messageClients(kIOPMMessageRequestSystemShutdown); } } else { @@ -9014,7 +9255,9 @@ IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) if (!systemBooting && (0 == idleSleepPreventersCount())) { if (!wrangler) { - changePowerStateWithTagToPriv(getRUN_STATE(), kCPSReasonEvaluatePolicy); + if (kStimulusNoIdleSleepPreventers != stimulus) { + changePowerStateWithTagToPriv(getRUN_STATE(), kCPSReasonEvaluatePolicy); + } if (idleSleepEnabled) { #if defined(XNU_TARGET_OS_OSX) && !DISPLAY_WRANGLER_PRESENT if (!extraSleepDelay && !idleSleepTimerPending && !gNoIdleFlag) { @@ -10617,6 +10860,12 @@ IOPMrootDomain::createPMAssertion( serviceName, ownerDescription); } #endif /* (DEVELOPMENT || DEBUG) */ + + const bool waitForWakeup = (whichAssertionBits & kIOPMDriverAssertionForceWakeupBit); + if (waitForWakeup) { + waitForSystemTransitionToMinPowerState(AOT_STATE); + } + return newAssertion; } else { return 0; @@ -10728,6 +10977,75 @@ IOPMrootDomain::releaseDriverKitMatchingAssertion() }); } +IOReturn +IOPMrootDomain::acquireDriverKitSyncedAssertion(IOService * from, IOPMDriverAssertionID * assertionID) +{ + return gIOPMWorkLoop->runActionBlock(^{ + if (kSystemTransitionSleep == _systemTransitionType && !idleSleepRevertible) { + // system going to sleep + return kIOReturnBusy; + } + // createPMAssertion is asynchronous. + // we must also set _driverKitSyncedAssertionCount under the PM workloop lock so that we can cancel sleep immediately + // only kIOPMDriverAssertionCPUBit is used for "synced" assertion + *assertionID = createPMAssertion(kIOPMDriverAssertionCPUBit, kIOPMDriverAssertionLevelOn, this, from->getName()); + if (*assertionID != kIOPMUndefinedDriverAssertionID) { + _driverKitSyncedAssertionCount++; + return kIOReturnSuccess; + } else { + return kIOReturnBusy; + } + }); +} + +void +IOPMrootDomain::releaseDriverKitSyncedAssertion(IOPMDriverAssertionID assertionID) +{ + gIOPMWorkLoop->runActionBlock(^{ + if (_driverKitSyncedAssertionCount != 0) { + _driverKitSyncedAssertionCount--; + releasePMAssertion(assertionID); + } else { + panic("Over-release of driverkit synced assertion"); + } + return kIOReturnSuccess; + }); +} + + +IOReturn +IOPMrootDomain::createPMAssertionSafe( + IOPMDriverAssertionID *assertionID, + IOPMDriverAssertionType whichAssertionBits, + IOPMDriverAssertionLevel assertionLevel, + IOService *ownerService, + const char *ownerDescription) +{ + IOReturn ret; + IOPMDriverAssertionID __block id; + + if (!assertionID) { + return kIOReturnBadArgument; + } + + // Grab workloop to check current transition + ret = gIOPMWorkLoop->runActionBlock(^{ + if (_systemTransitionType == kSystemTransitionSleep) { + return kIOReturnBusy; + } + id = createPMAssertion(whichAssertionBits, assertionLevel, ownerService, ownerDescription); + return id ? 
kIOReturnSuccess : kIOReturnError; + }); + + if (ret == kIOReturnSuccess) { + *assertionID = id; + } else if (ret == kIOReturnBusy && (kIOLogPMRootDomain & gIOKitDebug)) { + DLOG("assertion denied due to ongoing sleep transition (%s)\n", ownerDescription); + } + + return ret; +} + bool IOPMrootDomain::serializeProperties( OSSerialize * s ) const { @@ -10856,6 +11174,14 @@ IOPMrootDomain::copyShutdownReasonString( char * outBuf, size_t bufSize ) WAKEEVENT_UNLOCK(); } +void +IOPMrootDomain::copyShutdownTime( uint64_t * time ) +{ + WAKEEVENT_LOCK(); + *time = gShutdownTime; + WAKEEVENT_UNLOCK(); +} + //****************************************************************************** // acceptSystemWakeEvents // @@ -10970,19 +11296,15 @@ IOPMrootDomain::claimSystemWakeEvent( IOOptionBits aotFlags = 0; bool needAOTEvaluate = FALSE; - if (kIOPMAOTModeAddEventFlags & _aotMode) { + if ((kIOPMAOTModeAddEventFlags & _aotMode) && (!flags || (flags == kIOPMWakeEventSource))) { + flags |= kIOPMWakeEventAOTExit; + // Only allow lingering in AOT_STATE for the two wake reasons used for the wrist raise gesture. - if (strcmp("AOP.OutboxNotEmpty", reason) && strcmp("spu_gesture", reason)) { - flags |= kIOPMWakeEventAOTExit; + if (!strcmp("AOP.OutboxNotEmpty", reason) || !strcmp("spu_gesture", reason)) { + flags &= ~kIOPMWakeEventAOTExit; } } -#if DEVELOPMENT || DEBUG - if (_aotLingerTime && !strcmp("rtc", reason)) { - flags |= kIOPMWakeEventAOTPossibleExit; - } -#endif /* DEVELOPMENT || DEBUG */ - #if defined(XNU_TARGET_OS_OSX) && !DISPLAY_WRANGLER_PRESENT // Publishing the WakeType is serialized by the PM work loop if (!strcmp("rtc", reason) && (_nextScheduledAlarmType != NULL)) { @@ -11134,7 +11456,7 @@ IOPMrootDomain::claimSystemShutdownEvent( IOService * device, IOOptionBits flags, const char * reason, - __unused OSObject * details ) + OSObject * details ) { if (!device || !reason) { return; @@ -11157,10 +11479,57 @@ IOPMrootDomain::claimSystemShutdownEvent( } strlcat(gShutdownReasonString, reason, sizeof(gShutdownReasonString)); + if (details) { + OSDictionary *dict = OSDynamicCast(OSDictionary, details); + if (dict) { + OSSharedPtr sharedKey = OSString::withCString(kIOPMRootDomainShutdownTime); + if (sharedKey) { + OSNumber *num = OSDynamicCast(OSNumber, dict->getObject(sharedKey.get())); + if (num) { + gShutdownTime = (uint64_t)(num->unsigned64BitValue()); + } + } + } + } + gShutdownReasonSysctlRegistered = true; WAKEEVENT_UNLOCK(); } +//****************************************************************************** +// requestRunMode +// +// For clients to request a LPW run mode. Only full wake is supported currently. 
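// A usage sketch for the synced-assertion pair added above, from a
// hypothetical in-kernel caller. MyDriver and doLatencySensitiveWork() are
// illustrative only; the pattern simply shows that
// acquireDriverKitSyncedAssertion() fails with kIOReturnBusy once a
// non-revertible sleep transition has started, and that every successful
// acquire must be balanced by exactly one releaseDriverKitSyncedAssertion()
// (an over-release panics).
IOReturn
MyDriver::runWhileAwake(void)
{
    IOPMDriverAssertionID assertionID = kIOPMUndefinedDriverAssertionID;
    IOReturn ret = getPMRootDomain()->acquireDriverKitSyncedAssertion(this, &assertionID);

    if (ret != kIOReturnSuccess) {
        return ret;             // system is headed to sleep; retry after wake
    }

    doLatencySensitiveWork();   // CPU assertion is held across this call

    getPMRootDomain()->releaseDriverKitSyncedAssertion(assertionID);
    return kIOReturnSuccess;
}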
+//****************************************************************************** + +IOReturn +IOPMrootDomain::requestRunMode(uint64_t runModeMask) +{ + // We only support requesting full wake at the moment + if (runModeMask == kIOPMRunModeFullWake) { + pmPowerStateQueue->submitPowerEvent(kPowerEventRunModeRequest, NULL, runModeMask); + return kIOReturnSuccess; + } + return kIOReturnUnsupported; +} + +IOReturn +IOPMrootDomain::handleRequestRunMode(uint64_t runModeMask) +{ + // TODO: Replace with run mode logic when implemented + IOReturn ret = kIOReturnUnsupported; + + // We only support requesting full wake at the moment + if (runModeMask == kIOPMRunModeFullWake) { + // A simple CPS should suffice for now + changePowerStateWithTagToPriv(ON_STATE, kCPSReasonEvaluatePolicy); + ret = kIOReturnSuccess; + } + + DLOG("%s: mask %llx ret %x\n", __func__, runModeMask, ret); + return ret; +} + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // MARK: - @@ -11835,8 +12204,8 @@ OSDefineMetaClassAndFinalStructors(IORootParent, IOService) static IOPMPowerState patriarchPowerStates[2] = { - {1, 0, ON_POWER, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - {1, 0, ON_POWER, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {.version = kIOPMPowerStateVersion1, .outputPowerCharacter = ON_POWER }, + {.version = kIOPMPowerStateVersion1, .outputPowerCharacter = ON_POWER } }; void diff --git a/iokit/Kernel/IOPerfControl.cpp b/iokit/Kernel/IOPerfControl.cpp index 7b4633e6b..cdd499222 100644 --- a/iokit/Kernel/IOPerfControl.cpp +++ b/iokit/Kernel/IOPerfControl.cpp @@ -147,16 +147,38 @@ IOPerfControlClient::free() super::free(); } +void +IOPerfControlClient::setDeviceType(IOPCDeviceType newDeviceType) +{ + if (newDeviceType >= IOPCDeviceTypeMax) { + panic("unknown device type %d", newDeviceType); + } + + if (deviceType != IOPCDeviceTypeUnknown) { + panic("deviceType already set to %d", deviceType); + } + + deviceType = newDeviceType; +} + + IOPerfControlClient * -IOPerfControlClient::copyClient(IOService *driver, uint64_t maxWorkCapacity) +IOPerfControlClient::copyClientForDeviceType(IOService *driver, uint64_t maxWorkCapacity, IOPCDeviceType deviceType) { IOPerfControlClient *client = new IOPerfControlClient; if (!client || !client->init(driver, maxWorkCapacity)) { panic("could not create IOPerfControlClient"); } + client->setDeviceType(deviceType); return client; } +IOPerfControlClient * +IOPerfControlClient::copyClient(IOService *driver, uint64_t maxWorkCapacity) +{ + return copyClientForDeviceType(driver, maxWorkCapacity, IOPCDeviceTypeUnknown); +} + /* Convert the per driver token into a globally unique token for the performance * controller's consumption. This is achieved by setting the driver's unique * index onto the high order bits. 
The performance controller is shared between @@ -768,6 +790,30 @@ IOPerfControlClient::workEndWithContext(IOService *device, OSObject *context, Wo #endif } +IOReturn +IOPerfControlClient::querySubmitterRole(IOService *device, task_t clientTask, uint32_t* role_out) +{ + IOReturn result = kIOReturnNotFound; + + uint32_t role; + + switch (deviceType) { + case IOPCDeviceTypeGPU: + role = task_get_gpu_role(clientTask); + + KDBG(IMPORTANCE_CODE(IMP_QUERY_GPU_ROLE, 0), role); + + *role_out = role; + + result = kIOReturnSuccess; + break; + default: + result = kIOReturnNotFound; + } + + return result; +} + IOReturn IOPerfControlClient::registerPerformanceController(PerfControllerInterface *pci) { diff --git a/iokit/Kernel/IOPlatformActions.cpp b/iokit/Kernel/IOPlatformActions.cpp index 057012e55..483f30fb3 100644 --- a/iokit/Kernel/IOPlatformActions.cpp +++ b/iokit/Kernel/IOPlatformActions.cpp @@ -136,6 +136,7 @@ extern "C" kern_return_t IOCPURunPlatformQuiesceActions(void) { assert(preemption_enabled() == false); + cpu_event_debug_log(PLATFORM_QUIESCE, 0); return iocpu_run_platform_actions(&gActionQueues[kQueueQuiesce], 0, 0U - 1, NULL, NULL, NULL, PLATFORM_ACTION_FLAGS_ALLOW_NESTED_CALLOUTS); } @@ -144,6 +145,7 @@ extern "C" kern_return_t IOCPURunPlatformActiveActions(void) { assert(preemption_enabled() == false); + cpu_event_debug_log(PLATFORM_ACTIVE, 0); ml_hibernate_active_pre(); kern_return_t result = iocpu_run_platform_actions(&gActionQueues[kQueueActive], 0, 0U - 1, NULL, NULL, NULL, PLATFORM_ACTION_FLAGS_ALLOW_NESTED_CALLOUTS); @@ -157,6 +159,7 @@ IOCPURunPlatformHaltRestartActions(uint32_t message) if (!gActionQueues[kQueueHaltRestart].next) { return kIOReturnNotReady; } + cpu_event_debug_log(PLATFORM_HALT_RESTART, 0); return iocpu_run_platform_actions(&gActionQueues[kQueueHaltRestart], 0, 0U - 1, (void *)(uintptr_t) message, NULL, NULL, PLATFORM_ACTION_FLAGS_ALLOW_NESTED_CALLOUTS); } @@ -173,6 +176,7 @@ IOCPURunPlatformPanicActions(uint32_t message, uint32_t details) if (!verbose_panic_flow_logging) { platform_action_flags = PLATFORM_ACTION_FLAGS_NO_LOGGING; } + cpu_event_debug_log(PLATFORM_PANIC, 0); return iocpu_run_platform_actions(&gActionQueues[kQueuePanic], 0, 0U - 1, (void *)(uintptr_t) message, (void *)(uintptr_t) details, NULL, platform_action_flags); } @@ -190,6 +194,7 @@ IOCPURunPlatformPanicSyncAction(void *addr, uint32_t offset, uint32_t len) if (!gActionQueues[kQueuePanic].next) { return kIOReturnNotReady; } + cpu_event_debug_log(PLATFORM_PANIC_SYNC, 0); return iocpu_run_platform_actions(&gActionQueues[kQueuePanic], 0, 0U - 1, (void *)(uintptr_t)(kPEPanicSync), &context, NULL, FALSE); } @@ -197,6 +202,7 @@ IOCPURunPlatformPanicSyncAction(void *addr, uint32_t offset, uint32_t len) void IOPlatformActionsPreSleep(void) { + cpu_event_debug_log(PLATFORM_PRE_SLEEP, 0); iocpu_run_platform_actions(&gActionQueues[kQueueSleep], 0, 0U - 1, NULL, NULL, NULL, PLATFORM_ACTION_FLAGS_ALLOW_NESTED_CALLOUTS); } @@ -204,6 +210,7 @@ IOPlatformActionsPreSleep(void) void IOPlatformActionsPostResume(void) { + cpu_event_debug_log(PLATFORM_POST_RESUME, 0); iocpu_run_platform_actions(&gActionQueues[kQueueWake], 0, 0U - 1, NULL, NULL, NULL, PLATFORM_ACTION_FLAGS_ALLOW_NESTED_CALLOUTS); } diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index d0149a6e8..b0e14a2bd 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -967,6 +967,19 @@ PEGetPlatformEpoch(void) } } +#if defined(__arm64__) +__attribute__((noinline)) +static void 
+force_hard_hang_if_transaction_pending(void) +{ + /* + * Intentionally force a hang if all CPUs cannot complete it + * so that we get an AP watchdog hang *here* instead of later in the panic flow. + */ + arm64_sync_tlb(true); +} +#endif // defined(__arm64__) + /* Handle necessary platform specific actions prior to panic */ void PEInitiatePanic(void) @@ -977,7 +990,7 @@ PEInitiatePanic(void) * collection flow rather than hanging late in panic (see rdar://58062030) */ flush_mmu_tlb_entries_async(0, PAGE_SIZE, PAGE_SIZE, true, true); - arm64_sync_tlb(true); + force_hard_hang_if_transaction_pending(); #endif // defined(__arm64__) } diff --git a/iokit/Kernel/IOPolledInterface.cpp b/iokit/Kernel/IOPolledInterface.cpp index 4de5d3b93..e4b507ad0 100644 --- a/iokit/Kernel/IOPolledInterface.cpp +++ b/iokit/Kernel/IOPolledInterface.cpp @@ -789,7 +789,7 @@ IOPolledFileClose(IOPolledFileIOVars ** pVars, if (vars->fileRef) { kern_close_file_for_direct_io(vars->fileRef, write_offset, addr, write_length, - discard_offset, discard_end, unlink); + discard_offset, discard_end, vars->fileSizeMin, unlink); vars->fileRef = NULL; } if (vars->fileExtents) { diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index 39f62c755..395e5ecf7 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,7 @@ enum{ }; #include "IOServicePrivate.h" +#include "IOServicePMPrivate.h" #include "IOKitKernelInternal.h" // take lockForArbitration before LOCKNOTIFY @@ -115,7 +117,7 @@ OSDefineMetaClassAndStructors(IOServiceCompatibility, IOService) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ static IOPlatformExpert * gIOPlatform; -static class IOPMrootDomain * gIOPMRootDomain; +class IOPMrootDomain * gIOPMRootDomain; const IORegistryPlane * gIOServicePlane; const IORegistryPlane * gIOPowerPlane; const OSSymbol * gIODeviceMemoryKey; @@ -257,7 +259,7 @@ static int gNumWaitingThreads; static IOLock * gIOServiceBusyLock; bool gCPUsRunning; bool gIOKitWillTerminate; -bool gInUserspaceReboot; +atomic_bool gInUserspaceReboot; #define kIOServiceRootMediaParentInvalid ((IOService *) -1UL) #if NO_KEXTD @@ -395,6 +397,9 @@ requireMaxCpuDelay(IOService * service, UInt32 ns, UInt32 delayType); static IOReturn setLatencyHandler(UInt32 delayType, IOService * target, bool enable); +static bool IOServiceMatchingNotificationHandlerToBlock(void * target __unused, + void * refCon, IOService * newService, IONotifier * notifier); + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ IOCoreAnalyticsSendEventProc gIOCoreAnalyticsSendEventProc; @@ -425,9 +430,10 @@ OSArray * fMatchingWork; OSArray * fMatchingDelayed; IOService * fSystemPowerAckTo; uint32_t fSystemPowerAckRef; -uint8_t fSystemOff; -uint8_t fUserServerOff; -uint8_t fWaitingUserServers; +IOService * fSystemPowerAckTo2; +uint8_t fSystemState = kIOServiceSystemStateOn; +uint8_t fUserServerSystemState; +bool fWaitingUserServers; thread_call_t fUserServerAckTimer; void lock(); @@ -435,12 +441,16 @@ void unlock(); void init(IOPMrootDomain * root); +void systemPowerChange(uint8_t newState, + IOService * ackTo, + uint32_t ackRef, + uint32_t * pMaxWaitForReply); IOReturn systemPowerChange( void * target, void * refCon, UInt32 messageType, IOService * service, void * messageArgument, vm_size_t argSize); - +IOReturn rootWillChangeTo(IOPMPowerFlags flags, unsigned long state); bool matchingStart(IOService * service); void 
matchingEnd(IOService * service); void userServerAckTimerExpired(void *, void *); @@ -783,7 +793,7 @@ IOService::start( IOService * provider ) void IOService::stop( IOService * provider ) { - if (reserved->uvars && reserved->uvars->started && reserved->uvars->userServer) { + if (reserved->uvars && reserved->uvars->userServer) { reserved->uvars->userServer->serviceStop(this, provider); } } @@ -2274,19 +2284,8 @@ IOService::registerInterest(const OSSymbol * typeOfInterest, IOServiceInterestHandlerBlock handler) { IONotifier * notify; - void * block; - - block = Block_copy(handler); - if (!block) { - return NULL; - } - - notify = registerInterest(typeOfInterest, &IOServiceInterestHandlerToBlock, NULL, block); - - if (!notify) { - Block_release(block); - } + notify = registerInterest(typeOfInterest, &IOServiceInterestHandlerToBlock, NULL, handler); return notify; } @@ -2304,7 +2303,11 @@ IOService::registerInterestForNotifier( IONotifier *svcNotify, const OSSymbol * notify->handler = handler; notify->target = target; - notify->ref = ref; + if (handler == &IOServiceInterestHandlerToBlock) { + notify->ref = Block_copy(ref); + } else { + notify->ref = ref; + } if ((typeOfInterest != gIOGeneralInterest) && (typeOfInterest != gIOBusyInterest) @@ -2745,6 +2748,7 @@ IOService::terminatePhase1( IOOptionBits options ) if (startPhase2) { retain(); lockForArbitration(); + __state[1] |= kIOServiceTermPhase2ReadyState; scheduleTerminatePhase2(options); unlockForArbitration(); release(); @@ -2817,7 +2821,7 @@ IOService::scheduleTerminatePhase2( IOOptionBits options ) (uintptr_t) __state[1], (uintptr_t) options); - if (__state[1] & kIOServiceTermPhase1State) { + if (0 == (__state[1] & kIOServiceTermPhase2ReadyState)) { return; } @@ -3214,6 +3218,8 @@ IOService::terminateWorker( IOOptionBits options ) if (doPhase2) { doPhase2 = (0 != (kIOServiceInactiveState & victim->__state[0])); if (doPhase2) { + victim->__state[1] |= kIOServiceTermPhase2ReadyState; + uint64_t regID1 = victim->getRegistryEntryID(); IOServiceTrace( IOSERVICE_TERM_TRY_PHASE2, @@ -4214,7 +4220,7 @@ IOService::probeCandidates( OSOrderedSet * matches ) IOLog("%s(0x%qx): matching deferred by %s%s\n", getName(), getRegistryEntryID(), symbol ? symbol->getCStringNoCopy() : "", - gInUserspaceReboot ? " in userspace reboot" : ""); + IOService::getWillUserspaceReboot() ? " in userspace reboot" : ""); // rematching will occur after the IOKit daemon loads all plists } IOLockUnlock(gJobsLock); @@ -4231,8 +4237,20 @@ IOService::probeCandidates( OSOrderedSet * matches ) inst->getRetainCount()); } #endif - if (!started && inst->propertyExists(gIOServiceMatchDeferredKey)) { - matchDeferred = true; + if (!started) { + if (inst->propertyExists(gIOServiceMatchDeferredKey)) { + matchDeferred = true; + } else if (inst->reserved->uvars && inst->reserved->uvars->userServer && !inst->reserved->uvars->instantiated && + (0 != (__state[1] & kIOServiceNeedConfigState))) { + // Start failed with no object instantiation + // Dext will be rematched as this nub got re-registered + // Do not start the next candidate, otherwise the rematched dext might not be able to replace it + OSString * bundleID = OSDynamicCast(OSString, inst->getProperty(gIOModuleIdentifierKey)); + IOLog("%s(0x%qx): stop matching as %s will be relaunched\n", + getName(), getRegistryEntryID(), + bundleID ? 
bundleID->getCStringNoCopy() : "(null)"); + matchDeferred = true; + } } } } @@ -4345,8 +4363,14 @@ IOService::willShutdown() OSKext::willShutdown(); } +bool +IOService::getWillUserspaceReboot() +{ + return os_atomic_load(&gInUserspaceReboot, relaxed); +} + void -IOService::userSpaceWillReboot() +IOService::setWillUserspaceReboot() { IOLockLock(gJobsLock); #if !NO_KEXTD @@ -4383,16 +4407,14 @@ IOService::userSpaceWillReboot() } } #endif - gInUserspaceReboot = true; + os_atomic_store(&gInUserspaceReboot, true, relaxed); IOLockUnlock(gJobsLock); } void IOService::userSpaceDidReboot() { - IOLockLock(gJobsLock); - gInUserspaceReboot = false; - IOLockUnlock(gJobsLock); + os_atomic_store(&gInUserspaceReboot, false, relaxed); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -4410,6 +4432,9 @@ IOServicePH::init(IOPMrootDomain * root) assert(fRootNotifier); + gIOUserResources->PMinit(); + root->registerInterestedDriver(gIOUserResources); + fUserServerAckTimer = thread_call_allocate(&IOServicePH::userServerAckTimerExpired, (thread_call_param_t)NULL); } @@ -4463,8 +4488,10 @@ IOServicePH::serverAck(IOUserServer * server) uint32_t idx; IOService * ackTo; uint32_t ackToRef; + OSArray * notifyServers; ackTo = NULL; + notifyServers = NULL; lock(); if (server && fUserServersWait) { idx = fUserServersWait->getNextIndexOfObject(server, 0); @@ -4480,6 +4507,9 @@ IOServicePH::serverAck(IOUserServer * server) ackToRef = fSystemPowerAckRef; fSystemPowerAckTo = NULL; if (ackTo) { + if (ackTo == gIOUserResources) { + notifyServers = OSArray::withArray(fUserServers); + } thread_call_cancel(fUserServerAckTimer); } } @@ -4488,8 +4518,22 @@ IOServicePH::serverAck(IOUserServer * server) } unlock(); - if (ackTo) { - DKLOG("allowPowerChange\n"); + if (ackTo == gIOUserResources) { + // suspend DK processes all at once since they can talk amongst themselves + // after SetPowerState() + if (notifyServers) { + notifyServers->iterateObjects(^bool (OSObject * obj) { + IOUserServer * us; + us = (typeof(us))obj; + us->systemSuspend(); + return false; + }); + OSSafeReleaseNULL(notifyServers); + } + DKLOG("allowPowerChange(2)\n"); + IOService::getPMRootDomain()->acknowledgePowerChange(gIOUserResources); + } else if (ackTo) { + DKLOG("allowPowerChange(1)\n"); ackTo->allowPowerChange((uintptr_t) ackToRef); } } @@ -4501,7 +4545,7 @@ IOServicePH::matchingStart(IOService * service) bool assertionActive = gIOPMRootDomain->acquireDriverKitMatchingAssertion() == kIOReturnSuccess; lock(); - bool matchNow = !fSystemOff && assertionActive; + bool matchNow = (kIOServiceSystemStateOn == fSystemState) && assertionActive; if (matchNow) { idx = fMatchingWork->getNextIndexOfObject(service, 0); if (idx == -1U) { @@ -4550,20 +4594,20 @@ IOServicePH::matchingEnd(IOService * service) } - if ((fUserServerOff != fSystemOff) && fUserServers->getCount()) { - if (fSystemOff) { + if ((fUserServerSystemState != fSystemState) && fUserServers->getCount()) { + if (IsIOServiceSystemStateOff(fSystemState)) { if (0 == fMatchingWork->getCount()) { fUserServersWait = OSArray::withArray(fUserServers); notifyServers = OSArray::withArray(fUserServers); - fUserServerOff = fSystemOff; + fUserServerSystemState = fSystemState; } } else { notifyServers = OSArray::withArray(fUserServers); - fUserServerOff = fSystemOff; + fUserServerSystemState = fSystemState; } } - if (!fSystemOff && fMatchingDelayed) { + if ((kIOServiceSystemStateOn == fSystemState) && fMatchingDelayed) { deferredMatches = fMatchingDelayed; fMatchingDelayed = NULL; } @@ 
-4574,13 +4618,14 @@ IOServicePH::matchingEnd(IOService * service) uint32_t sleepType = 0; uint32_t standbyTimer = 0; bool hibernate = false; - if (fSystemOff && IOService::getPMRootDomain()->getSystemSleepType(&sleepType, &standbyTimer) == kIOReturnSuccess) { + if (IsIOServiceSystemStateOff(fSystemState) + && IOService::getPMRootDomain()->getSystemSleepType(&sleepType, &standbyTimer) == kIOReturnSuccess) { hibernate = (sleepType == kIOPMSleepTypeHibernate); } notifyServers->iterateObjects(^bool (OSObject * obj) { IOUserServer * us; us = (typeof(us))obj; - us->systemPower(fSystemOff, hibernate); + us->systemPower(fSystemState, hibernate); return false; }); OSSafeReleaseNULL(notifyServers); @@ -4694,6 +4739,58 @@ IOServicePH::serverSlept(void) TUNABLE(uint32_t, dk_power_state_timeout_ms, "dk_power_state_timeout_ms", 30000); +// +// Handle system changes: +// +// kIOServiceSystemStateOn +// kIOServiceSystemStateAOT +// kIOServiceSystemStateOffPhase1 - non-DK user space suspension +// kIOServiceSystemStateOffPhase2 - DK user space suspension +// + +void +IOServicePH::systemPowerChange(uint8_t newState, + IOService * ackTo, + uint32_t ackRef, + uint32_t * pMaxWaitForReply) +{ + AbsoluteTime deadline; + + IOLog("IOServicePH::systemPowerChange to 0x%x\n", newState); + + switch (newState) { + case kIOServiceSystemStateOffPhase1: + case kIOServiceSystemStateOffPhase2: + + lock(); + DKLOG("arming ack timer, %u ms\n", dk_power_state_timeout_ms); + clock_interval_to_deadline(dk_power_state_timeout_ms, kMillisecondScale, &deadline); + fSystemState = newState; + fSystemPowerAckRef = ackRef; + fSystemPowerAckTo = ackTo; + thread_call_enter_delayed(fUserServerAckTimer, deadline); + unlock(); + matchingEnd(NULL); + + *pMaxWaitForReply = dk_power_state_timeout_ms * 2 * 1000; + break; + + case kIOServiceSystemStateAOT: + case kIOServiceSystemStateOn: + + lock(); + fSystemState = newState; + unlock(); + matchingEnd(NULL); + *pMaxWaitForReply = 0; + break; + + default: + assert(false); + break; + } +} + IOReturn IOServicePH::systemPowerChange( void * target, @@ -4702,52 +4799,29 @@ IOServicePH::systemPowerChange( void * messageArgument, vm_size_t argSize) { IOReturn ret; - IOUserServer * us; IOPMSystemCapabilityChangeParameters * params; - AbsoluteTime deadline; - - us = NULL; switch (messageType) { case kIOMessageSystemCapabilityChange: - params = (typeof params)messageArgument; - if (kIODKLogPM & gIODKDebug) { - IOLog("IOServicePH::kIOMessageSystemCapabilityChange: %s%s 0x%x->0x%x\n", - params->changeFlags & kIOPMSystemCapabilityWillChange ? "will" : "", - params->changeFlags & kIOPMSystemCapabilityDidChange ? "did" : "", - params->fromCapabilities, - params->toCapabilities); - } + IOLog("IOServicePH::kIOMessageSystemCapabilityChange: %s%s 0x%x->0x%x\n", + params->changeFlags & kIOPMSystemCapabilityWillChange ? "will" : "", + params->changeFlags & kIOPMSystemCapabilityDidChange ? 
"did" : "", + params->fromCapabilities, + params->toCapabilities); if ((params->changeFlags & kIOPMSystemCapabilityWillChange) && - (params->fromCapabilities & kIOPMSystemCapabilityCPU) && - ((params->toCapabilities & kIOPMSystemCapabilityCPU) == 0)) { - lock(); - DKLOG("arming ack timer, %u ms\n", dk_power_state_timeout_ms); - clock_interval_to_deadline(dk_power_state_timeout_ms, kMillisecondScale, &deadline); - fSystemOff = true; - fSystemPowerAckRef = params->notifyRef; - fSystemPowerAckTo = service; - thread_call_enter_delayed(fUserServerAckTimer, deadline); - unlock(); - - matchingEnd(NULL); - - params->maxWaitForReply = dk_power_state_timeout_ms * 2 * 1000; + (params->fromCapabilities & (kIOPMSystemCapabilityCPU | kIOPMSystemCapabilityAOT)) && + ((params->toCapabilities & (kIOPMSystemCapabilityCPU | kIOPMSystemCapabilityAOT)) == 0)) { + systemPowerChange(kIOServiceSystemStateOffPhase1, service, params->notifyRef, ¶ms->maxWaitForReply); ret = kIOReturnSuccess; } else if ((params->changeFlags & kIOPMSystemCapabilityWillChange) && - ((params->fromCapabilities & kIOPMSystemCapabilityCPU) == 0) && - (params->toCapabilities & kIOPMSystemCapabilityCPU)) { - lock(); - fSystemOff = false; - unlock(); - - matchingEnd(NULL); - - params->maxWaitForReply = 0; - ret = kIOReturnSuccess; + (0 != ((params->fromCapabilities ^ params->toCapabilities) + & (kIOPMSystemCapabilityCPU | kIOPMSystemCapabilityAOT)))) { + systemPowerChange((params->toCapabilities & kIOPMSystemCapabilityCPU) ? kIOServiceSystemStateOn : kIOServiceSystemStateAOT, + service, params->notifyRef, ¶ms->maxWaitForReply); + ret = kIOReturnSuccess; } else { params->maxWaitForReply = 0; ret = kIOReturnSuccess; @@ -4762,6 +4836,29 @@ IOServicePH::systemPowerChange( return ret; } +IOReturn +IOServicePH::rootWillChangeTo(IOPMPowerFlags flags, unsigned long state) +{ + IOReturn ret; + uint32_t maxWaitForReply = kIOPMAckImplied; + + if (kIOPMSleepCapability & flags) { + systemPowerChange(kIOServiceSystemStateOffPhase2, gIOUserResources, 0, &maxWaitForReply); + } else if (kIOPMAOTCapability & flags) { + systemPowerChange(kIOServiceSystemStateAOT, NULL, 0, &maxWaitForReply); + } else if (kIOPMPowerOn & flags) { + systemPowerChange(kIOServiceSystemStateOn, NULL, 0, &maxWaitForReply); + } + ret = maxWaitForReply; + return ret; +} + +bool +IOSystemStateAOT(void) +{ + return kIOServiceSystemStateAOT == IOServicePH::fSystemState; +} + bool IOServicePH::checkPMReady(void) { @@ -4785,6 +4882,29 @@ IOServicePH::checkPMReady(void) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +#if DEBUG || DEVELOPMENT +void +IOService::__patchProperties(void) +{ +#if 0 + if (!strcmp("AppleCentauriManager", getName())) { + setProperty(kIOPMAOTAllowKey, kIOPMDriverClassNetwork, 64); + } + if (!strcmp("CentauriControl", getName())) { + setProperty(kIOPMAOTAllowKey, kIOPMDriverClassNetwork, 64); + } + if (!strcmp("CentauriAlpha", getName())) { + setProperty(kIOPMAOTAllowKey, kIOPMDriverClassNetwork, 64); + } + if (!strcmp("CentauriBeta", getName())) { + setProperty(kIOPMAOTAllowKey, kIOPMDriverClassNetwork, 64); + } +#endif +} +#endif /* DEBUG || DEVELOPMENT */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + /* * Start a previously attached & probed instance, * called on exporting object instance @@ -4967,6 +5087,10 @@ IOService::startCandidate( IOService * service ) clock_get_uptime(&startTime); } +#if DEBUG || DEVELOPMENT + service->__patchProperties(); +#endif /* DEBUG || DEVELOPMENT */ + ok = 
service->start(this); if (recordTime) { @@ -5315,6 +5439,9 @@ IOService::setRootMedia(IOService * root) UNLOCKNOTIFY(); if (unhide) { + if (root) { + root->addPMDriverClass(kIOPMDriverClassStorage); + } publishHiddenMedia(root); } } @@ -5328,6 +5455,9 @@ IOService::canTerminateForReplacement(IOService * client) assert(kIOServiceRootMediaParentInvalid != gIOServiceRootMediaParent); + if (!gIOServiceHideIOMedia) { + return false; + } if (!client->propertyExists(gIOPrimaryDriverTerminateOptionsKey)) { return false; } @@ -6309,6 +6439,8 @@ IOService::setNotification( if (handler == &_IOServiceMatchingNotificationHandler) { notify->compatHandler = ((_IOServiceMatchingNotificationHandlerRef *)ref)->handler; notify->ref = ((_IOServiceMatchingNotificationHandlerRef *)ref)->ref; + } else if (handler == &IOServiceMatchingNotificationHandlerToBlock) { + notify->ref = Block_copy(ref); } else { notify->ref = ref; } @@ -6506,20 +6638,9 @@ IOService::addMatchingNotification( IOServiceMatchingNotificationHandlerBlock handler) { IONotifier * notify; - void * block; - - block = Block_copy(handler); - if (!block) { - return NULL; - } notify = addMatchingNotification(type, matching, - &IOServiceMatchingNotificationHandlerToBlock, NULL, block, priority); - - if (!notify) { - Block_release(block); - } - + &IOServiceMatchingNotificationHandlerToBlock, NULL, handler, priority); return notify; } @@ -7299,6 +7420,16 @@ IOUserResources::matchPropertyTable( OSDictionary * table ) return IOResourcesMatchPropertyTable(this, table); } +IOReturn +IOUserResources::powerStateWillChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) +{ + assert(service == getPMRootDomain()); + if (service != getPMRootDomain()) { + return kIOReturnSuccess; + } + return IOServicePH::rootWillChangeTo(flags, state); +} + // -- void diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index fe8894089..3e912d105 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -1044,9 +1045,35 @@ IOService::addPowerChild3( IOPMRequest * request ) PM_LOG("%s: addPowerChild3 not in power plane\n", getName()); } + if (child) { + OSNumber * num; + OSObject * obj = child->copyProperty(kIOPMAOTAllowKey); + if ((num = OSDynamicCast(OSNumber, obj))) { + child->addPMDriverClass(num->unsigned64BitValue()); + if (child->reserved->uvars && child->reserved->uvars->userServer) { + child->reserved->uvars->userServer->addPMDriverClass(num->unsigned64BitValue()); + } + } + OSSafeReleaseNULL(obj); + } + connection->release(); } +bool +IOService::currentOrPendingPowerState(uint32_t state) +{ + return (fCurrentPowerState == state) || (fHeadNotePowerState == state); +} + +void +IOService::addPMDriverClass(uint64_t driverClass) +{ + if (pwrMgt) { + fPMDriverClass |= driverClass; + } +} + #ifndef __LP64__ //********************************************************************************* // [deprecated] setPowerParent @@ -1953,6 +1980,21 @@ IOService::handlePowerDomainWillChangeTo( IOPMRequest * request ) // power flags should account for this power drop. 
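The addPowerChild3() change above reads the new kIOPMAOTAllowKey property as an OSNumber of driver-class bits and records it on both the child and, for dexts, the child's user server. A small standalone model of that pickup follows; the struct fields and the optional stand in for the OSNumber property and are not real IOKit types.

#include <cstdint>
#include <optional>

struct UserServer { uint64_t driverClass = 0; };

struct Child {
    std::optional<uint64_t> aotAllow;        // stand-in for the kIOPMAOTAllowKey OSNumber
    uint64_t                driverClass = 0;
    UserServer *            userServer  = nullptr;
};

// Model of the addPowerChild3() hook: the property value is a driver-class
// bitmask recorded on the child and, when present, on its user server too.
static void noteAOTAllow(Child &child)
{
    if (!child.aotAllow) {
        return;
    }
    child.driverClass |= *child.aotAllow;
    if (child.userServer) {
        child.userServer->driverClass |= *child.aotAllow;
    }
}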
if (parentChangeFlags & kIOPMDomainPowerDrop) { + if (fPMDriverClass && (0 == (kIOPMDriverClassDone & fPMDriverClass))) { + // on first power drop, propagate driver class to its parents, + // so they can come on if the driver is selected to be on + // by considerRunMode() + IOService * parent = whichParent; + while (true) { + parent = (IOService *) parent->getParentEntry(gIOPowerPlane); + if (parent == getPMRootDomain()) { + break; + } + parent->addPMDriverClass(fPMDriverClass); + parent = (IOService *) parent->getParentEntry(gIOPowerPlane); + } + fPMDriverClass |= kIOPMDriverClassDone; + } setParentInfo(parentPowerFlags, whichParent, true); } @@ -2001,7 +2043,6 @@ IOService::powerDomainDidChangeTo( //********************************************************************************* // [private] handlePowerDomainDidChangeTo //********************************************************************************* - void IOService::handlePowerDomainDidChangeTo( IOPMRequest * request ) { @@ -2084,8 +2125,16 @@ IOService::handlePowerDomainDidChangeTo( IOPMRequest * request ) myChangeFlags = kIOPMParentInitiated | kIOPMDomainDidChange | (parentChangeFlags & kIOPMRootBroadcastFlags); - if (kIOPMAOTPower & fPowerStates[maxPowerState].inputPowerFlags) { - IOLog("aotPS %s0x%qx[%ld]\n", getName(), getRegistryEntryID(), maxPowerState); + if (kIOPMAOTPower & fParentsCurrentPowerFlags) { + if (kIOPMAOTPower & fPowerStates[maxPowerState].inputPowerFlags) { + if (gLPWFlags && reserved->uvars && reserved->uvars->userServer) { + reserved->uvars->userServer->pageout(); + } + } + } + + if (getPMRootDomain()->isAOTMode()) { + IOLog("aotPS[%ld] %s0x%qx\n", maxPowerState, getName(), getRegistryEntryID()); } result = startPowerChange( @@ -3556,6 +3605,21 @@ IOService::getPowerState( void ) return (UInt32) fCurrentPowerState; } +//********************************************************************************* +// [public] getDesiredPowerState +// +//********************************************************************************* + +UInt32 +IOService::getDesiredPowerState( void ) +{ + if (!initialized) { + return kPowerStateZero; + } + + return (UInt32) fDesiredPowerState; +} + #ifndef __LP64__ //********************************************************************************* // [deprecated] systemWake @@ -4158,6 +4222,10 @@ IOService::pmDriverCallout( IOService * from, break; case kDriverCallInformPreChange: + if (from == getPMRootDomain()) { + getPMRootDomain()->willNotifyInterested(from->fHeadNotePowerState); + } + OS_FALLTHROUGH; case kDriverCallInformPostChange: from->driverInformPowerChange(); break; @@ -7652,11 +7720,34 @@ IOService::driverMaxCapabilityForDomainState( IOPMPowerFlags domainState ) { IOPMDriverCallEntry callEntry; IOPMPowerStateIndex powerState = kPowerStateZero; + int32_t promote; - if (assertPMDriverCall(&callEntry, kIOPMDriverCallMethodMaxCapabilityForDomainState)) { - powerState = maxCapabilityForDomainState(domainState); - deassertPMDriverCall(&callEntry); + promote = getPMRootDomain()->considerRunMode(this, fPMDriverClass); + + if ((promote < 0) && (0 == (kIOPMAOTPower & domainState))) { + return kPowerStateZero; } + + if (!assertPMDriverCall(&callEntry, kIOPMDriverCallMethodMaxCapabilityForDomainState)) { + return kPowerStateZero; + } + + if ((promote > 0) && (0 != (kIOPMPowerOn & domainState))) { + IOPMPowerFlags newDomainState = (domainState & ~kIOPMPowerOn) | kIOPMAOTPower; + powerState = maxCapabilityForDomainState(newDomainState); + } + + if (kPowerStateZero == powerState) { + 
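handlePowerDomainWillChangeTo() above pushes a service's accumulated driver-class bits up the power plane exactly once, on the first power drop, and then sets kIOPMDriverClassDone so later drops skip the walk. A standalone model of that accumulate-then-propagate-once pattern is below; the class constants are copied from the IOServicePMPrivate.h hunk later in this patch, while the simple parent chain stands in for the power-plane traversal (the real code skips the intervening IOPowerConnection nodes).

#include <cstdint>

// Copied from the enum added to IOServicePMPrivate.h in this patch.
enum : uint64_t {
    kClassStorage   = 0x00000010,
    kClassNetwork   = 0x00000020,
    kClassDriverKit = 0x00000040,
    kClassDone      = 1ull << 63,   // "already propagated" sentinel
};

struct Node {
    uint64_t driverClass = 0;
    Node *   parent      = nullptr;     // stand-in for the power-plane parent
};

static void addDriverClass(Node &n, uint64_t cls) { n.driverClass |= cls; }

// Model of the first-power-drop hook: push the child's classes to every
// ancestor below the root, then mark the work done so it never repeats.
static void propagateOnFirstDrop(Node &child, const Node *root)
{
    if (child.driverClass && !(child.driverClass & kClassDone)) {
        for (Node *p = child.parent; p && p != root; p = p->parent) {
            addDriverClass(*p, child.driverClass);
        }
        child.driverClass |= kClassDone;
    }
}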
powerState = maxCapabilityForDomainState(domainState); + } + + if ((promote > 0) && (kPowerStateZero == powerState) + && (0 != (kIOPMAOTPower & domainState))) { + IOPMPowerFlags newDomainState = (domainState & ~kIOPMAOTPower) | kIOPMPowerOn; + powerState = maxCapabilityForDomainState(newDomainState); + } + deassertPMDriverCall(&callEntry); + return powerState; } diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h index e3e1a85cb..c1b42bb42 100644 --- a/iokit/Kernel/IOServicePMPrivate.h +++ b/iokit/Kernel/IOServicePMPrivate.h @@ -368,6 +368,9 @@ private: IOPMActions PMActions; + uint64_t PMDriverClass; +#define fPMDriverClass pwrMgt->PMDriverClass + // Serialize IOServicePM state for debug output. IOReturn gatedSerialize( OSSerialize * s ) const; virtual bool serialize( OSSerialize * s ) const APPLE_KEXT_OVERRIDE; @@ -599,6 +602,16 @@ enum { kIOPMDriverCallMethodInitialPowerStateForDomainState = 7 }; + +enum { + kIOPMDriverClassStorage = 0x00000010, + kIOPMDriverClassNetwork = 0x00000020, + kIOPMDriverClassDriverKit = 0x00000040, + kIOPMDriverClassDone = (1ULL << 63), +}; + +extern uint64_t gLPWFlags; + //****************************************************************************** // PM Statistics & Diagnostics //****************************************************************************** diff --git a/iokit/Kernel/IOServicePrivate.h b/iokit/Kernel/IOServicePrivate.h index fab488c2f..5806a0699 100644 --- a/iokit/Kernel/IOServicePrivate.h +++ b/iokit/Kernel/IOServicePrivate.h @@ -69,8 +69,7 @@ enum { kIOServiceRematchOnDetach = 0x00008000, kIOServiceUserUnhidden = 0x00004000, -// kIOServiceX1 = 0x00004000, -// kIOServiceX2 = 0x00002000, + kIOServiceTermPhase2ReadyState = 0x00002000, // kIOServiceX3 = 0x00001000, // kIOServiceX4 = 0x00000800, // kIOServiceX5 = 0x00000400, @@ -220,6 +219,7 @@ public: IOUserClient ** handler) APPLE_KEXT_OVERRIDE; virtual IOWorkLoop * getWorkLoop() const APPLE_KEXT_OVERRIDE; virtual bool matchPropertyTable( OSDictionary * table ) APPLE_KEXT_OVERRIDE; + virtual IOReturn powerStateWillChangeTo(IOPMPowerFlags flags, unsigned long state, IOService * service) APPLE_KEXT_OVERRIDE; }; class _IOOpenServiceIterator : public OSIterator @@ -272,4 +272,15 @@ extern const OSSymbol * gIOConsoleSessionAuditIDKey; extern const OSSymbol * gIOConsoleSessionOnConsoleKey; extern const OSSymbol * gIOConsoleSessionSecureInputPIDKey; +extern "C" bool +IOSystemStateAOT(void); + +enum { + kIOServiceSystemStateOffPhase1 = (1U << 4) | 1, + kIOServiceSystemStateOffPhase2 = (1U << 4) | 2, + kIOServiceSystemStateAOT = (2U << 4), + kIOServiceSystemStateOn = (3U << 4) +#define IsIOServiceSystemStateOff(state) ((state) < kIOServiceSystemStateAOT) +}; + #endif /* ! 
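The reworked driverMaxCapabilityForDomainState() completed just above consults considerRunMode() and then tries up to three variants of the domain flags in order: AOT substituted for full power when the driver is being promoted, the unmodified flags, and finally full power substituted for AOT if the first two produced power state zero. The sketch below restates that ordering as standalone C++; the flag values are illustrative, the callback stands in for maxCapabilityForDomainState(), and the sign convention for promote is inferred from the guards in the hunk (the assertPMDriverCall bracketing is omitted).

#include <cstdint>
#include <functional>

enum : uint64_t { kFlagPowerOn = 1u << 1, kFlagAOTPower = 1u << 2 };   // stand-ins
constexpr unsigned kPowerStateZero = 0;

static unsigned
pickPowerState(int promote, uint64_t domain,
    const std::function<unsigned(uint64_t)> &maxCapFor)
{
    // Demoted drivers stay off unless the domain offers AOT power.
    if (promote < 0 && !(domain & kFlagAOTPower)) {
        return kPowerStateZero;
    }
    unsigned ps = kPowerStateZero;
    // Promoted drivers are first asked what they can do on AOT power alone.
    if (promote > 0 && (domain & kFlagPowerOn)) {
        ps = maxCapFor((domain & ~kFlagPowerOn) | kFlagAOTPower);
    }
    // Fall back to the unmodified domain state.
    if (ps == kPowerStateZero) {
        ps = maxCapFor(domain);
    }
    // Last resort for promoted drivers in an AOT-only domain: pretend full power.
    if (promote > 0 && ps == kPowerStateZero && (domain & kFlagAOTPower)) {
        ps = maxCapFor((domain & ~kFlagAOTPower) | kFlagPowerOn);
    }
    return ps;
}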
_IOKIT_IOSERVICEPRIVATE_H */ diff --git a/iokit/Kernel/IOSharedDataQueue.cpp b/iokit/Kernel/IOSharedDataQueue.cpp index 2b8ee80df..8b1cfd3cd 100644 --- a/iokit/Kernel/IOSharedDataQueue.cpp +++ b/iokit/Kernel/IOSharedDataQueue.cpp @@ -103,7 +103,7 @@ IOSharedDataQueue::initWithCapacity(UInt32 size) } kr = kmem_alloc(kernel_map, (vm_offset_t *)&dataQueue, allocSize, - (kma_flags_t)(KMA_DATA | KMA_ZERO), IOMemoryTag(kernel_map)); + (kma_flags_t)(KMA_DATA_SHARED | KMA_ZERO), IOMemoryTag(kernel_map)); if (kr != KERN_SUCCESS) { return false; } diff --git a/iokit/Kernel/IOSubMemoryDescriptor.cpp b/iokit/Kernel/IOSubMemoryDescriptor.cpp index 07ef3782b..adc51b602 100644 --- a/iokit/Kernel/IOSubMemoryDescriptor.cpp +++ b/iokit/Kernel/IOSubMemoryDescriptor.cpp @@ -241,3 +241,10 @@ IOSubMemoryDescriptor::getPageCounts(IOByteCount * residentPageCount, { return _parent->getPageCounts(residentPageCount, dirtyPageCount); } + +IOReturn +IOSubMemoryDescriptor::getPageCounts(IOByteCount * residentPageCount, + IOByteCount * dirtyPageCount, IOByteCount * swappedPageCount) +{ + return _parent->getPageCounts(residentPageCount, dirtyPageCount, swappedPageCount); +} diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index e1d0d4b97..e3c0911f3 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -141,30 +141,26 @@ extern "C" { struct IOMachPortHashList; -static_assert(IKOT_MAX_TYPE <= 255); - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ // IOMachPort maps OSObjects to ports, avoiding adding an ivar to OSObject. -class IOMachPort : public OSObject +class IOMachPort final : public OSObject { OSDeclareDefaultStructors(IOMachPort); public: - mach_port_mscount_t mscount; - IOLock lock; + bool hashed; SLIST_ENTRY(IOMachPort) link; ipc_port_t port; OSObject* XNU_PTRAUTH_SIGNED_PTR("IOMachPort.object") object; - static IOMachPort* withObjectAndType(OSObject *obj, ipc_kobject_type_t type); + static IOMachPort* withObject(OSObject *obj); - static IOMachPortHashList* bucketForObject(OSObject *obj, - ipc_kobject_type_t type); + static IOMachPortHashList* bucketForObject(OSObject *obj); static LIBKERN_RETURNS_NOT_RETAINED IOMachPort* portForObjectInBucket(IOMachPortHashList *bucket, OSObject *obj, ipc_kobject_type_t type); - static bool noMoreSendersForObject( OSObject * obj, - ipc_kobject_type_t type, mach_port_mscount_t * mscount ); + static IOMachPort *noMoreSenders( ipc_port_t port, + ipc_kobject_type_t type, mach_port_mscount_t mscount ); static void releasePortForObject( OSObject * obj, ipc_kobject_type_t type ); @@ -172,6 +168,35 @@ public: io_object_t obj, ipc_kobject_type_t type ); virtual void free() APPLE_KEXT_OVERRIDE; + + void + makePort(ipc_kobject_type_t type) + { + port = iokit_alloc_object_port(this, type); + } + + void + adoptPort(IOMachPort *other, ipc_kobject_type_t type) + { + port = other->port; + ipc_kobject_enable(port, this, IKOT_IOKIT_CONNECT); + other->port = NULL; + } + + void + disablePort(ipc_kobject_type_t type) + { + __assert_only ipc_kobject_t kobj; + kobj = ipc_kobject_disable(port, type); + assert(kobj == this); + } + + template + inline T * + getAs() const + { + return OSDynamicCast(T, object); + } }; #define super OSObject @@ -203,7 +228,7 @@ IOMachPortInitialize(void) } IOMachPortHashList* -IOMachPort::bucketForObject(OSObject *obj, ipc_kobject_type_t type ) +IOMachPort::bucketForObject(OSObject *obj) { return &gIOMachPortHash[os_hash_kernel_pointer(obj) % PORT_HASH_SIZE]; } @@ -222,90 +247,84 @@ 
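The getAs() accessor declared inside the new IOMachPort class above has visibly lost its template parameter list to angle-bracket stripping in this copy of the patch (the emptied #include directives nearby suffered the same damage and are left as found). From the surviving body it presumably reads roughly as follows; the restored `template <typename T>` line and the concrete argument in the usage comment are inferences, not quotations from the original source.

template <typename T>
inline T *
getAs() const
{
    // The wrapped OSObject is downcast on demand; NULL when the type does not match.
    return OSDynamicCast(T, object);
}

// Used by the per-type no-senders handlers later in this diff, e.g. (presumably):
//     if (IOUserClient *uc = machPort->getAs<IOUserClient>()) { ... }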
IOMachPort::portForObjectInBucket(IOMachPortHashList *bucket, OSObject *obj, ipc } IOMachPort* -IOMachPort::withObjectAndType(OSObject *obj, ipc_kobject_type_t type) +IOMachPort::withObject(OSObject *obj) { IOMachPort *machPort = NULL; machPort = new IOMachPort; - if (__improbable(machPort && !machPort->init())) { - OSSafeReleaseNULL(machPort); - return NULL; - } - + release_assert(machPort->init()); machPort->object = obj; - machPort->port = iokit_alloc_object_port(machPort, type); - IOLockInlineInit(&machPort->lock); obj->taggedRetain(OSTypeID(OSCollection)); - machPort->mscount++; return machPort; } -bool -IOMachPort::noMoreSendersForObject( OSObject * obj, - ipc_kobject_type_t type, mach_port_mscount_t * mscount ) +IOMachPort * +IOMachPort::noMoreSenders( ipc_port_t port, ipc_kobject_type_t type, + mach_port_mscount_t mscount ) { - IOMachPort *machPort = NULL; - IOUserClient *uc; - OSAction *action; - bool destroyed = true; - - IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type); - - obj->retain(); + IOUserClient *uc = NULL; + IOMachPort *machPort; + bool destroyed; lck_mtx_lock(gIOObjectPortLock); - machPort = IOMachPort::portForObjectInBucket(bucket, obj, type); + iokit_lock_port(port); + machPort = (IOMachPort *)ipc_kobject_get_locked(port, type); + destroyed = ipc_kobject_is_mscount_current_locked(port, mscount); + iokit_unlock_port(port); - if (machPort) { - destroyed = (machPort->mscount <= *mscount); - if (!destroyed) { - *mscount = machPort->mscount; - lck_mtx_unlock(gIOObjectPortLock); - } else { - if ((IKOT_IOKIT_CONNECT == type) && (uc = OSDynamicCast(IOUserClient, obj))) { - uc->noMoreSenders(); - } - SLIST_REMOVE(bucket, machPort, IOMachPort, link); - - IOLockLock(&machPort->lock); - iokit_remove_object_port(machPort->port, type); - machPort->object = NULL; - IOLockUnlock(&machPort->lock); - - lck_mtx_unlock(gIOObjectPortLock); - - OS_ANALYZER_SUPPRESS("77508635") OSSafeReleaseNULL(machPort); - - obj->taggedRelease(OSTypeID(OSCollection)); - } - } else { + if (machPort == NULL) { lck_mtx_unlock(gIOObjectPortLock); + return NULL; } - if ((IKOT_UEXT_OBJECT == type) && (action = OSDynamicCast(OSAction, obj))) { - action->Aborted(); + assert(machPort->port == port); + + if (destroyed) { + if (machPort->hashed) { + IOMachPortHashList *bucket; + + bucket = IOMachPort::bucketForObject(machPort->object); + machPort->hashed = false; + SLIST_REMOVE(bucket, machPort, IOMachPort, link); + } + + machPort->disablePort(type); + + if (IKOT_IOKIT_CONNECT == type) { + uc = machPort->getAs(); + } } - if (IKOT_UEXT_OBJECT == type && IOUserServer::shouldLeakObjects()) { - // Leak object - obj->retain(); + if (uc) { + uc->noMoreSenders(); } - obj->release(); + lck_mtx_unlock(gIOObjectPortLock); - return destroyed; + if (IKOT_UEXT_OBJECT == type) { + if (OSAction *action = machPort->getAs()) { + action->Aborted(); + } + + if (IOUserServer::shouldLeakObjects()) { + // Leak object + machPort->object->retain(); + } + } + + return destroyed ? 
machPort : NULL; } void -IOMachPort::releasePortForObject( OSObject * obj, - ipc_kobject_type_t type ) +IOMachPort::releasePortForObject( OSObject * obj, ipc_kobject_type_t type ) { + bool destroyed = false; IOMachPort *machPort; IOService *service; - IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type); + IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj); assert(IKOT_IOKIT_CONNECT != type); @@ -314,90 +333,64 @@ IOMachPort::releasePortForObject( OSObject * obj, machPort = IOMachPort::portForObjectInBucket(bucket, obj, type); if (machPort - && (type == IKOT_IOKIT_OBJECT) - && (service = OSDynamicCast(IOService, obj)) - && !service->machPortHoldDestroy()) { - obj->retain(); + && ((type != IKOT_IOKIT_OBJECT) + || !(service = OSDynamicCast(IOService, obj)) + || !service->machPortHoldDestroy())) { + machPort->hashed = false; SLIST_REMOVE(bucket, machPort, IOMachPort, link); + machPort->disablePort(type); + destroyed = true; + } - IOLockLock(&machPort->lock); - iokit_remove_object_port(machPort->port, type); - machPort->object = NULL; - IOLockUnlock(&machPort->lock); + lck_mtx_unlock(gIOObjectPortLock); - lck_mtx_unlock(gIOObjectPortLock); - - OS_ANALYZER_SUPPRESS("77508635") OSSafeReleaseNULL(machPort); - - obj->taggedRelease(OSTypeID(OSCollection)); - obj->release(); - } else { - lck_mtx_unlock(gIOObjectPortLock); + if (destroyed) { + machPort->release(); } } void IOUserClient::destroyUserReferences( OSObject * obj ) { - IOMachPort *machPort; - bool destroyPort; + IOMachPort *machPort = NULL; + OSObject *mappings = NULL; IOMachPort::releasePortForObject( obj, IKOT_IOKIT_OBJECT ); - // panther, 3160200 - // IOMachPort::releasePortForObject( obj, IKOT_IOKIT_CONNECT ); - - obj->retain(); - IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, IKOT_IOKIT_CONNECT); - IOMachPortHashList *mappingBucket = NULL; + IOUserClient * uc = OSDynamicCast(IOUserClient, obj); + IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj); lck_mtx_lock(gIOObjectPortLock); - IOUserClient * uc = OSDynamicCast(IOUserClient, obj); - if (uc && uc->mappings) { - mappingBucket = IOMachPort::bucketForObject(uc->mappings, IKOT_IOKIT_CONNECT); - } - machPort = IOMachPort::portForObjectInBucket(bucket, obj, IKOT_IOKIT_CONNECT); if (machPort == NULL) { lck_mtx_unlock(gIOObjectPortLock); - goto end; + return; } + machPort->hashed = false; SLIST_REMOVE(bucket, machPort, IOMachPort, link); - obj->taggedRelease(OSTypeID(OSCollection)); + machPort->disablePort(IKOT_IOKIT_CONNECT); - destroyPort = true; if (uc) { + mappings = uc->mappings; + uc->mappings = NULL; + uc->noMoreSenders(); - if (uc->mappings) { - uc->mappings->taggedRetain(OSTypeID(OSCollection)); - SLIST_INSERT_HEAD(mappingBucket, machPort, link); - IOLockLock(&machPort->lock); - machPort->object = uc->mappings; - IOLockUnlock(&machPort->lock); + if (mappings) { + IOMachPort *newPort; - lck_mtx_unlock(gIOObjectPortLock); - - OSSafeReleaseNULL(uc->mappings); - destroyPort = false; + newPort = IOMachPort::withObject(mappings); + newPort->adoptPort(machPort, IKOT_IOKIT_CONNECT); } } - if (destroyPort) { - IOLockLock(&machPort->lock); - iokit_remove_object_port(machPort->port, IKOT_IOKIT_CONNECT); - machPort->object = NULL; - IOLockUnlock(&machPort->lock); + lck_mtx_unlock(gIOObjectPortLock); - lck_mtx_unlock(gIOObjectPortLock); - OS_ANALYZER_SUPPRESS("77508635") OSSafeReleaseNULL(machPort); - } - -end: - OSSafeReleaseNULL(obj); + OSSafeReleaseNULL(mappings); + machPort->release(); } mach_port_name_t @@ -413,10 +406,11 @@ 
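IOMachPort::noMoreSenders() above now asks the port itself whether the make-send count carried by the notification is still current, instead of tracking a private mscount, and only a current notification unhashes the wrapper and disables the kobject binding before the type-specific cleanup runs. Below is a standalone sketch of that decision; plain booleans stand in for ipc_kobject_get_locked() and ipc_kobject_is_mscount_current_locked(), and no real IPC APIs are used.

struct PortState {
    bool hasKObject;       // stand-in: the locked lookup found our IOMachPort
    bool mscountCurrent;   // stand-in: no new send rights were minted since the notification
    bool hashed;           // still reachable through the object hash bucket
};

enum class Action { Ignore, Teardown };

// Model of the "destroyed" path: a stale notification (more send rights exist
// again) is ignored; a current one unhashes the wrapper and disables the port,
// after which the per-type handlers (connect/object/ident/uext) do their cleanup.
static Action onNoSenders(PortState &p)
{
    if (!p.hasKObject || !p.mscountCurrent) {
        return Action::Ignore;
    }
    if (p.hashed) {
        p.hashed = false;   // SLIST_REMOVE from the bucket in the real code
    }
    // disablePort(type) happens here in the real code.
    return Action::Teardown;
}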
IOMachPort::free( void ) if (port) { iokit_destroy_object_port(port, iokit_port_type(port)); } - IOLockInlineDestroy(&lock); + object->taggedRelease(OSTypeID(OSCollection)); super::free(); } + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ static bool @@ -573,7 +567,7 @@ iokit_port_object_description(io_object_t obj, kobject_description_t desc) // for retain and release. #ifndef __clang_analyzer__ void -iokit_add_reference( io_object_t obj, natural_t type ) +iokit_add_reference( io_object_t obj ) { if (!obj) { return; @@ -660,18 +654,18 @@ iokit_kobject_retain(io_kobject_t machPort) } io_object_t -iokit_copy_object_for_consumed_kobject(LIBKERN_CONSUMED io_kobject_t machPort, natural_t type) +iokit_copy_object_for_consumed_kobject(LIBKERN_CONSUMED io_kobject_t machPort) { - io_object_t result; + io_object_t result; assert(OSDynamicCast(IOMachPort, machPort)); - IOLockLock(&machPort->lock); + /* + * IOMachPort::object is never nil-ed, so this just borrows its port + * reference to make new rights. + */ result = machPort->object; - if (result) { - iokit_add_reference(result, type); - } - IOLockUnlock(&machPort->lock); + iokit_add_reference(result); machPort->release(); return result; } @@ -693,76 +687,98 @@ IOUserClient::finalizeUserReferences(OSObject * obj) } ipc_port_t -iokit_port_for_object( io_object_t obj, ipc_kobject_type_t type, ipc_kobject_t * kobj ) +iokit_port_make_send_for_object( io_object_t obj, ipc_kobject_type_t type ) { IOMachPort *machPort = NULL; - ipc_port_t port = NULL; + ipc_port_t port = NULL; - IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj, type); + IOMachPortHashList *bucket = IOMachPort::bucketForObject(obj); lck_mtx_lock(gIOObjectPortLock); machPort = IOMachPort::portForObjectInBucket(bucket, obj, type); if (__improbable(machPort == NULL)) { - machPort = IOMachPort::withObjectAndType(obj, type); - if (__improbable(machPort == NULL)) { - goto end; - } + machPort = IOMachPort::withObject(obj); + machPort->makePort(type); + machPort->hashed = true; SLIST_INSERT_HEAD(bucket, machPort, link); - } else { - machPort->mscount++; } - iokit_retain_port(machPort->port); - port = machPort->port; + port = ipc_kobject_make_send( machPort->port, machPort, type ); -end: - if (kobj) { - *kobj = machPort; - } lck_mtx_unlock(gIOObjectPortLock); return port; } -kern_return_t -iokit_client_died( io_object_t obj, ipc_port_t /* port */, - ipc_kobject_type_t type, mach_port_mscount_t * mscount ) +/* + * Handle the No-More_Senders notification generated from a device port destroy. + * Since there are no longer any tasks which hold a send right to this device + * port a NMS notification has been generated. 
+ */ + +void +iokit_ident_no_senders( ipc_port_t port, mach_port_mscount_t mscount ) { - IOUserClient * client; - IOMemoryMap * map; - IOUserNotification * notify; - IOUserServerCheckInToken * token; - IOUserUserClient * uc; + IOMachPort *machPort; - if (!IOMachPort::noMoreSendersForObject( obj, type, mscount )) { - return kIOReturnNotReady; + machPort = IOMachPort::noMoreSenders(port, IKOT_IOKIT_IDENT, mscount); + + if (machPort) { + if (IOUserServerCheckInToken *token = + machPort->getAs()) { + token->cancel(); + } + machPort->release(); } +} - switch (type) { - case IKOT_IOKIT_CONNECT: - if ((client = OSDynamicCast( IOUserClient, obj ))) { +void +iokit_object_no_senders( ipc_port_t port, mach_port_mscount_t mscount ) +{ + IOMachPort *machPort; + + machPort = IOMachPort::noMoreSenders(port, IKOT_IOKIT_OBJECT, mscount); + + if (machPort) { + if (IOMemoryMap *map = machPort->getAs()) { + map->taskDied(); + } else if (IOUserNotification *notify = + machPort->getAs()) { + notify->setNotification( NULL ); + } + machPort->release(); + } +} + +void +iokit_connect_no_senders( ipc_port_t port, mach_port_mscount_t mscount ) +{ + IOMachPort *machPort; + + machPort = IOMachPort::noMoreSenders(port, IKOT_IOKIT_CONNECT, mscount); + + if (machPort) { + if (IOUserClient *client = machPort->getAs()) { IOStatisticsClientCall(); IORWLockWrite(&client->lock); client->clientDied(); IORWLockUnlock(&client->lock); } - break; - case IKOT_IOKIT_OBJECT: - if ((map = OSDynamicCast( IOMemoryMap, obj ))) { - map->taskDied(); - } else if ((notify = OSDynamicCast( IOUserNotification, obj ))) { - notify->setNotification( NULL ); - } - break; - case IKOT_IOKIT_IDENT: - if ((token = OSDynamicCast( IOUserServerCheckInToken, obj ))) { - token->cancel(); - } - break; - case IKOT_UEXT_OBJECT: - if ((uc = OSDynamicCast(IOUserUserClient, obj))) { + machPort->release(); + } +} + +void +iokit_uext_no_senders( ipc_port_t port, mach_port_mscount_t mscount ) +{ + IOMachPort *machPort; + + machPort = IOMachPort::noMoreSenders(port, IKOT_UEXT_OBJECT, mscount); + + if (machPort) { + if (IOUserClient *uc = machPort->getAs()) { IOService *provider = NULL; uc->lockForArbitration(); provider = uc->getProvider(); @@ -773,10 +789,8 @@ iokit_client_died( io_object_t obj, ipc_port_t /* port */, uc->setTerminateDefer(provider, false); OSSafeReleaseNULL(provider); } - break; + machPort->release(); } - - return kIOReturnSuccess; } }; /* extern "C" */ @@ -1033,7 +1047,12 @@ IOServiceUserNotification::handler( void * ref, } if (sendPing) { - port = iokit_port_for_object( this, IKOT_IOKIT_OBJECT, NULL ); + /* + * This right will be consumed when the message we form below + * is sent by kernel_mach_msg_send_with_builder_internal(), + * because we make the disposition for the right move-send. 
+ */ + port = iokit_port_make_send_for_object( this, IKOT_IOKIT_OBJECT ); payloadSize = sizeof(PingMsgUdata) - sizeof(OSAsyncReference64) + msgReferenceSize; msgSize = (mach_msg_size_t)(sizeof(PingMsgKdata) + payloadSize); @@ -1047,7 +1066,7 @@ IOServiceUserNotification::handler( void * ref, hdr->msgh_local_port = port; hdr->msgh_bits = MACH_MSGH_BITS( MACH_MSG_TYPE_COPY_SEND /*remote*/, - MACH_MSG_TYPE_MAKE_SEND /*local*/); + MACH_MSG_TYPE_MOVE_SEND /*local*/); hdr->msgh_size = msgSize; hdr->msgh_id = kOSNotificationMessageID; @@ -1061,10 +1080,6 @@ IOServiceUserNotification::handler( void * ref, bcopy( msgReference, udata->notifyHeader.reference, msgReferenceSize ); }); - if (port) { - iokit_release_port( port ); - } - if ((KERN_SUCCESS != kr) && !ipcLogged) { ipcLogged = true; IOLog("%s: kernel_mach_msg_send (0x%x)\n", __PRETTY_FUNCTION__, kr ); @@ -1213,8 +1228,13 @@ IOServiceMessageUserNotification::handler( void * ref, } mach_msg_size_t payloadSize = thisMsgSize - sizeof(PingMsgKdata); - providerPort = iokit_port_for_object( provider, IKOT_IOKIT_OBJECT, NULL ); - thisPort = iokit_port_for_object( this, IKOT_IOKIT_OBJECT, NULL ); + /* + * These rights will be consumed when the message we form below + * is sent by kernel_mach_msg_send_with_builder_internal(), + * because we make the disposition for the rights move-send. + */ + providerPort = iokit_port_make_send_for_object( provider, IKOT_IOKIT_OBJECT ); + thisPort = iokit_port_make_send_for_object( this, IKOT_IOKIT_OBJECT ); kr = kernel_mach_msg_send_with_builder_internal(1, payloadSize, MACH_SEND_KERNEL_IMPORTANCE, MACH_MSG_TIMEOUT_NONE, NULL, @@ -1226,17 +1246,18 @@ IOServiceMessageUserNotification::handler( void * ref, hdr->msgh_remote_port = remotePort; hdr->msgh_local_port = thisPort; - hdr->msgh_bits = MACH_MSGH_BITS_COMPLEX - | MACH_MSGH_BITS( + hdr->msgh_bits = MACH_MSGH_BITS_SET( MACH_MSG_TYPE_COPY_SEND /*remote*/, - MACH_MSG_TYPE_MAKE_SEND /*local*/); + MACH_MSG_TYPE_MOVE_SEND /*local*/, + MACH_MSG_TYPE_NONE /*voucher*/, + MACH_MSGH_BITS_COMPLEX); hdr->msgh_size = thisMsgSize; hdr->msgh_id = kOSNotificationMessageID; /* body.msgh_descriptor_count is set automatically after the closure */ port_desc[0].name = providerPort; - port_desc[0].disposition = MACH_MSG_TYPE_MAKE_SEND; + port_desc[0].disposition = MACH_MSG_TYPE_MOVE_SEND; port_desc[0].type = MACH_MSG_PORT_DESCRIPTOR; /* End of kernel processed data */ @@ -1261,13 +1282,6 @@ IOServiceMessageUserNotification::handler( void * ref, } }); - if (thisPort) { - iokit_release_port( thisPort ); - } - if (providerPort) { - iokit_release_port( providerPort ); - } - if (kr == MACH_SEND_NO_BUFFER) { return kIOReturnNoMemory; } @@ -1300,6 +1314,8 @@ OSDefineMetaClassAndAbstractStructors( IOUserClient, IOService ) IOLock * gIOUserClientOwnersLock; +static TUNABLE(bool, gEnforcePowerEntitlement, "enforce-power-entitlement", false); + static_assert(offsetof(IOUserClient, __opaque_end) - offsetof(IOUserClient, __opaque_start) == sizeof(void *) * 9, "ABI check: Opaque ivars for IOUserClient must be 9 void * big"); @@ -1317,7 +1333,6 @@ IOUserClient::initialize( void ) IOTrackingQueueCollectUser(IOServiceMessageUserNotification::gMetaClass.getTracking()); IOTrackingQueueCollectUser(IOServiceUserNotification::gMetaClass.getTracking()); IOTrackingQueueCollectUser(IOUserClient::gMetaClass.getTracking()); - IOTrackingQueueCollectUser(IOMachPort::gMetaClass.getTracking()); #endif /* IOTRACKING */ } @@ -1572,6 +1587,10 @@ IOUserClient::copyClientEntitlement( task_t task, } proc_t proc = 
(proc_t)get_bsdtask_info(task); + if (proc == NULL) { + return NULL; + } + kern_return_t ret = amfi->OSEntitlements.copyEntitlementAsOSObjectWithProc( proc, entitlement, @@ -2057,7 +2076,7 @@ IOUserClient::mapClientMemory64( err = clientMemoryForType((UInt32) type, &options, &memory ); - if (memory && (kIOReturnSuccess == err)) { + if ((kIOReturnSuccess == err) && memory && !memory->hasSharingContext()) { FAKE_STACK_FRAME(getMetaClass()); options = (options & ~kIOMapUserOptionsMask) @@ -3150,7 +3169,6 @@ is_io_service_add_notification_old( matching, port, &ref, 1, notification ); } - static kern_return_t internal_io_service_add_interest_notification( io_object_t _service, @@ -3171,6 +3189,21 @@ internal_io_service_add_interest_notification( err = kIOReturnNoResources; if ((sym = OSSymbol::withCString( type_of_interest ))) { do { +#if XNU_PLATFORM_WatchOS + if (sym == gIOAppPowerStateInterest && + !(IOCurrentTaskHasEntitlement("com.apple.private.power.notifications") || IOCurrentTaskHasEntitlement("com.apple.private.power.notifications-temp"))) { + OSString * taskName = IOCopyLogNameForPID(proc_selfpid()); + IOLog("IORegisterForSystemPower called by %s without \"com.apple.private.power.notifications\" entitlement\n", + taskName ? taskName->getCStringNoCopy() : "???"); + OSSafeReleaseNULL(taskName); + + if (gEnforcePowerEntitlement) { + err = kIOReturnNotPermitted; + continue; + } + } +#endif // XNU_PLATFORM_WatchOS + userNotify = new IOServiceMessageUserNotification; if (userNotify && !userNotify->init( port, kIOServiceMessageNotificationType, @@ -4667,6 +4700,98 @@ is_io_connect_set_notification_port_64( return ret; } + +/* Routine io_connect_map_shared_memory */ +kern_return_t +is_io_connect_map_shared_memory +( + io_connect_t connection, + uint32_t memory_type, + task_t into_task, + mach_vm_address_t *address, + mach_vm_size_t *size, + uint32_t map_flags, + io_name_t property_name, + io_struct_inband_t inband_output, + mach_msg_type_number_t *inband_outputCnt +) +{ + IOReturn err; + IOMemoryMap * map = NULL; + IOOptionBits options = 0; + IOMemoryDescriptor * memory = NULL; + + CHECK( IOUserClient, connection, client ); + + if (!into_task) { + return kIOReturnBadArgument; + } + if (client->sharedInstance + || (into_task != current_task())) { + return kIOReturnUnsupported; + } + + IOStatisticsClientCall(); + + client->ipcEnter(client->defaultLocking ? 
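The watchOS hunk above gates gIOAppPowerStateInterest registrations behind a log-then-optionally-deny check driven by the new enforce-power-entitlement boot-arg (gEnforcePowerEntitlement, default false). The sketch below models that soft-enforcement pattern; hasEntitlement() and the caller string are placeholders, not real kernel interfaces.

#include <cstdio>

static bool gEnforce = false;   // stands in for the enforce-power-entitlement tunable

// Placeholder for the entitlement check; the patch accepts either
// com.apple.private.power.notifications or ...notifications-temp.
static bool hasEntitlement(const char * /*name*/) { return false; }

static bool allowPowerInterest(const char *caller)
{
    if (hasEntitlement("com.apple.private.power.notifications") ||
        hasEntitlement("com.apple.private.power.notifications-temp")) {
        return true;
    }
    // Always log the missing entitlement ...
    std::printf("IORegisterForSystemPower called by %s without entitlement\n", caller);
    // ... but only fail the registration once enforcement is switched on.
    return !gEnforce;
}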
kIPCLockWrite : kIPCLockNone); + + err = client->clientMemoryForType(memory_type, &options, &memory ); + + if (memory && (kIOReturnSuccess == err)) { + OSObject * context = memory->copySharingContext(property_name); + OSData * desc; + if (!(desc = OSDynamicCast(OSData, context))) { + err = kIOReturnNotReady; + } else { + if (!(kIOMapReadOnly & options) + && !IOCurrentTaskHasEntitlement(kIOMapSharedMemoryWritableEntitlement)) { + err = kIOReturnNotPermitted; + } else if (desc->getLength() > *inband_outputCnt) { + err = kIOReturnOverrun; + } else { + memcpy(inband_output, desc->getBytesNoCopy(), desc->getLength()); + *inband_outputCnt = desc->getLength(); + } + OSSafeReleaseNULL(context); + } + if (kIOReturnSuccess == err) { + FAKE_STACK_FRAME(client->getMetaClass()); + + options = (options & ~kIOMapUserOptionsMask) + | (map_flags & kIOMapUserOptionsMask) + | kIOMapAnywhere; + map = memory->createMappingInTask( into_task, 0, options ); + + FAKE_STACK_FRAME_END(); + if (!map) { + err = kIOReturnNotReadable; + } + } + memory->release(); + } + + if (map) { + *address = map->getAddress(); + if (size) { + *size = map->getSize(); + } + // keep it with the user client + IOLockLock( gIOObjectPortLock); + if (NULL == client->mappings) { + client->mappings = OSSet::withCapacity(2); + } + if (client->mappings) { + client->mappings->setObject( map); + } + IOLockUnlock( gIOObjectPortLock); + map->release(); + err = kIOReturnSuccess; + } + + client->ipcExit(client->defaultLocking ? kIPCLockWrite : kIPCLockNone); + + return err; +} /* Routine io_connect_map_memory_into_task */ kern_return_t is_io_connect_map_memory_into_task @@ -6425,13 +6550,29 @@ is_io_device_tree_entry_exists_with_name( boolean_t *exists ) { OSCollectionIterator *iter; + IORegistryEntry *entry; + io_name_t namebuf; + const char *entryname; + const char *propname; if (main_port != main_device_port) { return kIOReturnNotPrivileged; } - iter = IODTFindMatchingEntries(IORegistryEntry::getRegistryRoot(), kIODTRecursive, name); - *exists = iter && iter->getNextObject(); + if ((propname = strchr(name, ':'))) { + propname++; + strlcpy(namebuf, name, propname - name); + entryname = namebuf; + } else { + entryname = name; + } + + iter = IODTFindMatchingEntries(IORegistryEntry::getRegistryRoot(), kIODTRecursive, entryname); + if (iter && (entry = (IORegistryEntry *) iter->getNextObject())) { + *exists = !propname || entry->propertyExists(propname); + } else { + *exists = FALSE; + } OSSafeReleaseNULL(iter); return kIOReturnSuccess; diff --git a/iokit/Kernel/IOUserServer.cpp b/iokit/Kernel/IOUserServer.cpp index 86f51a157..13aa54ad7 100644 --- a/iokit/Kernel/IOUserServer.cpp +++ b/iokit/Kernel/IOUserServer.cpp @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -60,7 +61,9 @@ #include #include #include +#include #include "IOKitKernelInternal.h" +#include "IOServicePMPrivate.h" /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -1800,7 +1803,7 @@ IOServiceNotificationDispatchSource::Create_Impl( inst->ivars->notifier = IOService::addMatchingNotification(gIOMatchedNotification, matching, 0 /*priority*/, ^bool (IOService * newService, IONotifier * notifier) { bool notifyReady = false; - IONotifier * interest; + IONotifier * interest = NULL; OSObject * serverName; bool okToUse; @@ -1841,35 +1844,45 @@ IOServiceNotificationDispatchSource::Create_Impl( IOLockLock(inst->ivars->lock); notifyReady = (0 == inst->ivars->pending[kIOServiceNotificationTypeMatched]->getCount()); 
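is_io_device_tree_entry_exists_with_name() above now accepts an optional ":property" suffix: the text before the first colon names the device-tree entry, and the text after it names a property that must also exist on the matched entry. A standalone model of the split follows, using std::string in place of io_name_t/strchr/strlcpy; the example node and property names in the comment are illustrative only.

#include <string>
#include <utility>

// Returns {entry name, property name}; the property is empty when no ':' is
// present, mirroring the propname == NULL case in the MIG routine.
static std::pair<std::string, std::string> splitEntrySpec(const std::string &name)
{
    const auto colon = name.find(':');
    if (colon == std::string::npos) {
        return { name, std::string() };
    }
    return { name.substr(0, colon), name.substr(colon + 1) };
}

// splitEntrySpec("chosen:boot-args") -> {"chosen", "boot-args"}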
inst->ivars->pending[kIOServiceNotificationTypeMatched]->setObject(newService); + bool needInterest = (NULL == inst->ivars->interestNotifiers->getObject((const OSSymbol *) newService)); IOLockUnlock(inst->ivars->lock); - interest = newService->registerInterest(gIOGeneralInterest, - ^IOReturn (uint32_t messageType, IOService * provider, - void * messageArgument, size_t argSize) { - IONotifier * interest; - bool notifyReady = false; + if (needInterest) { + interest = newService->registerInterest(gIOGeneralInterest, + ^IOReturn (uint32_t messageType, IOService * provider, + void * messageArgument, size_t argSize) { + IONotifier * interest; + bool notifyReady = false; - switch (messageType) { - case kIOMessageServiceIsTerminated: - IOLockLock(inst->ivars->lock); - notifyReady = (0 == inst->ivars->pending[kIOServiceNotificationTypeTerminated]->getCount()); - inst->ivars->pending[kIOServiceNotificationTypeTerminated]->setObject(provider); - if (inst->ivars->interestNotifiers != NULL) { - interest = (typeof(interest))inst->ivars->interestNotifiers->getObject((const OSSymbol *) newService); - assert(interest); - interest->remove(); - inst->ivars->interestNotifiers->removeObject((const OSSymbol *) newService); + // after the notifier remove, IOServiceNotificationDispatchSource::free + // will not wait for this code to complete + if (!inst->taggedTryRetain(NULL)) { + return kIOReturnSuccess; } - IOLockUnlock(inst->ivars->lock); - break; - default: - break; - } - if (notifyReady && inst->ivars->action) { - inst->ServiceNotificationReady(inst->ivars->action); - } - return kIOReturnSuccess; - }); + + switch (messageType) { + case kIOMessageServiceIsTerminated: + IOLockLock(inst->ivars->lock); + notifyReady = (0 == inst->ivars->pending[kIOServiceNotificationTypeTerminated]->getCount()); + inst->ivars->pending[kIOServiceNotificationTypeTerminated]->setObject(provider); + if (inst->ivars->interestNotifiers != NULL) { + interest = (typeof(interest))inst->ivars->interestNotifiers->getObject((const OSSymbol *) newService); + assert(interest); + interest->remove(); + inst->ivars->interestNotifiers->removeObject((const OSSymbol *) newService); + } + IOLockUnlock(inst->ivars->lock); + break; + default: + break; + } + if (notifyReady && inst->ivars->action) { + inst->ServiceNotificationReady(inst->ivars->action); + } + inst->release(); + return kIOReturnSuccess; + }); + } if (interest) { IOLockLock(inst->ivars->lock); inst->ivars->interestNotifiers->setObject((const OSSymbol *) newService, interest); @@ -1966,6 +1979,7 @@ IOServiceNotificationDispatchSource::free() for (uint32_t idx = 0; idx < kIOServiceNotificationTypeCount; idx++) { OSSafeReleaseNULL(ivars->pending[idx]); } + OSSafeReleaseNULL(ivars->action); if (ivars->lock) { IOLockFree(ivars->lock); ivars->lock = NULL; @@ -2917,6 +2931,7 @@ IOUserServer::objectInstantiate(OSObject * obj, IORPC rpc, IORPCMessage * messag return kIOReturnError; } IOLockLock(service->reserved->uvars->userServer->fLock); + service->reserved->uvars->instantiated = true; userMeta = (typeof(userMeta))service->reserved->uvars->userServer->fClasses->getObject(str); IOLockUnlock(service->reserved->uvars->userServer->fLock); } @@ -3666,25 +3681,13 @@ IORPCMessageFromMachReply(IORPCMessageMach * msg) ipc_port_t IOUserServer::copySendRightForObject(OSObject * object, ipc_kobject_type_t type) { - ipc_port_t port; - ipc_port_t sendPort = NULL; - ipc_kobject_t kobj; - - port = iokit_port_for_object(object, type, &kobj); - if (port) { - sendPort = ipc_kobject_make_send(port, kobj, type); - 
iokit_release_port(port); - } - - return sendPort; + return iokit_port_make_send_for_object(object, type); } OSObject * IOUserServer::copyObjectForSendRight(ipc_port_t port, ipc_kobject_type_t type) { - OSObject * object; - object = iokit_lookup_io_object(port, type); - return object; + return iokit_lookup_io_object(port, type); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -4031,6 +4034,7 @@ IOUserServer::finalize(IOOptionBits options) IOService * nextService; IOService * provider; bool started = false; + bool instantiated = false; nextService = (IOService *) obj; if (kIODKLogSetup & gIODKDebug) { @@ -4044,6 +4048,7 @@ IOUserServer::finalize(IOOptionBits options) } (void)::acknowledgeSetPowerState(nextService); started = nextService->reserved->uvars->started; + instantiated = nextService->reserved->uvars->instantiated; nextService->reserved->uvars->serverDied = true; serviceDidStop(nextService, provider); @@ -4051,11 +4056,20 @@ IOUserServer::finalize(IOOptionBits options) provider->resetRematchProperties(); } if (started) { - nextService->terminate(terminateFlags); + IOService * provider = nextService; + while ((provider = provider->getProvider())) { + if (-1U != services->getNextIndexOfObject(provider, 0)) { + break; + } + } + if (!provider) { + // this service is the root of the set, so only terminate it + nextService->terminate(terminateFlags); + } } } - if (!started) { - DKLOG(DKS "::terminate(" DKS ") server exit before start()\n", DKN(this), DKN(nextService)); + if (!started || !instantiated) { + DKLOG(DKS "::terminate(" DKS ") server exit before start() instantiated %d\n", DKN(this), DKN(nextService), instantiated); serviceStop(nextService, NULL); } return false; @@ -4278,6 +4292,11 @@ IOUserServer::clientClose(void) if (policy == kOSDextCrashPolicyReboot && allowPanic) { panic("Driver %s has crashed too many times\n", getName()); } + + IOPMrootDomain *rootDomain = IOService::getPMRootDomain(); + if (rootDomain) { + rootDomain->requestRunMode(kIOPMRunModeFullWake); + } } terminate(); @@ -4579,6 +4598,12 @@ IOUserServer::getTargetAndTrapForIndex( IOService **targetP, UInt32 index ) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +void +IOUserServer::pageout() +{ + fPageout = 1; +} + IOReturn IOUserServer::serviceAttach(IOService * service, IOService * provider) { @@ -4595,6 +4620,7 @@ IOUserServer::serviceAttach(IOService * service, IOService * provider) vars->userServer = this; vars->userServer->retain(); vars->uvarsLock = IOLockAlloc(); + vars->originalProperties = service->dictionaryWithProperties(); IOLockLock(fLock); if (-1U == fServices->getNextIndexOfObject(service, 0)) { fServices->setObject(service); @@ -4834,6 +4860,10 @@ static IOPMPowerState .capabilityFlags = kIOPMLowPower, .outputPowerCharacter = kIOPMLowPower, .inputPowerRequirement = kIOPMLowPower}, + { .version = kIOPMPowerStateVersion1, + .capabilityFlags = kIOPMAOTPower, + .outputPowerCharacter = kIOPMAOTPower, + .inputPowerRequirement = kIOPMAOTPower}, { .version = kIOPMPowerStateVersion1, .capabilityFlags = kIOPMPowerOn, .outputPowerCharacter = kIOPMPowerOn, @@ -4841,7 +4871,7 @@ static IOPMPowerState }; enum { - kUserServerMaxPowerState = 2 + kUserServerMaxPowerState = 3 }; IOReturn @@ -4871,8 +4901,24 @@ IOUserServer::serviceJoinPMTree(IOService * service) if (props->getObject(kIOPMResetPowerStateOnWakeKey) == kOSBooleanTrue) { service->setProperty(kIOPMResetPowerStateOnWakeKey, kOSBooleanTrue); } + fAOTAllow |= (NULL != 
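The finalize() change above terminates only the root services of a dying user server: a started service is skipped when some ancestor in its provider chain is itself a member of the server's service set, since terminating that root will tear the rest down. Below is a standalone model over a simple child-to-provider map; the integer handles and container choices are stand-ins for the registry objects.

#include <map>
#include <set>

using Service = int;   // stand-in handle

// True when no proper ancestor of s (walking the provider chain) is also in the
// owned set, i.e. s is a root of the set and the one finalize() should terminate.
static bool isRootOfSet(Service s,
    const std::map<Service, Service> &provider,   // child -> provider
    const std::set<Service> &owned)               // services of this user server
{
    for (auto it = provider.find(s); it != provider.end(); it = provider.find(it->second)) {
        if (owned.count(it->second)) {
            return false;   // an owned ancestor will be terminated instead
        }
    }
    return true;
}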
props->getObject(kIOPMAOTAllowKey)); + if (!(kIODKDisableIOPMSystemOffPhase2Allow & gIODKDebug)) { + fSystemOffPhase2Allow |= (NULL != props->getObject(kIOPMSystemOffPhase2AllowKey)); + } OSSafeReleaseNULL(props); } + if (fAOTAllow) { + IOService * dtparent = service; + while (dtparent && !dtparent->inPlane(gIODTPlane)) { + dtparent = dtparent->getProvider(); + } + if (dtparent) { + uint32_t one = 1; + OSData * data = OSData::withBytes(&one, sizeof(one)); + dtparent->setProperty(kIOPMAOTPowerKey, data); + OSSafeReleaseNULL(data); + } + } service->PMinit(); ret = service->registerPowerDriver(this, sPowerStates, sizeof(sPowerStates) / sizeof(sPowerStates[0])); assert(kIOReturnSuccess == ret); @@ -4946,29 +4992,6 @@ IOUserServer::serviceSetPowerState(IOService * controllingDriver, IOService * se IOLockLock(fLock); if (service->reserved->uvars) { if (!fSystemOff && !(kIODKDisablePM & gIODKDebug)) { - OSDictionary * wakeDescription; - OSObject * prop; - char wakeReasonString[128]; - - wakeDescription = OSDictionary::withCapacity(4); - if (wakeDescription) { - wakeReasonString[0] = 0; - getPMRootDomain()->copyWakeReasonString(wakeReasonString, sizeof(wakeReasonString)); - - if (wakeReasonString[0]) { - prop = OSString::withCString(&wakeReasonString[0]); - wakeDescription->setObject(gIOSystemStateWakeDescriptionWakeReasonKey, prop); - OSSafeReleaseNULL(prop); - } -#if defined(__arm__) || defined(__arm64__) - prop = OSNumber::withNumber(ml_get_conttime_offset(), sizeof(uint64_t) * CHAR_BIT); - wakeDescription->setObject(gIOSystemStateWakeDescriptionContinuousTimeOffsetKey, prop); - OSSafeReleaseNULL(prop); -#endif /* defined(__arm__) || defined(__arm64__) */ - getSystemStateNotificationService()->StateNotificationItemSet(gIOSystemStateWakeDescriptionKey, wakeDescription); - OSSafeReleaseNULL(wakeDescription); - } - service->reserved->uvars->willPower = true; service->reserved->uvars->willPowerState = state; service->reserved->uvars->controllingDriver = controllingDriver; @@ -4980,10 +5003,31 @@ IOUserServer::serviceSetPowerState(IOService * controllingDriver, IOService * se IOLockUnlock(fLock); if (sendIt) { + uint32_t driverFlags = (uint32_t) flags; if (kIODKLogPM & gIODKDebug) { - DKLOG(DKS "::serviceSetPowerState(%ld) %d\n", DKN(service), state, fSystemPowerAck); + DKLOG(DKS "::serviceSetPowerState(%ld, 0x%x) %d\n", DKN(service), state, driverFlags, fSystemPowerAck); } - ret = service->SetPowerState((uint32_t) flags); +#if DEBUG || DEVELOPMENT + bool pageout = false; + uint64_t pageincount = 0; + if (gLPWFlags) { + pageout = fPageout; + if (pageout) { + fPageout = false; + DKLOG(DKS " pageout\n", DKN(service)); + pageincount = vm_task_evict_shared_cache(fOwningTask); + } + } +#endif /* DEBUG || DEVELOPMENT */ + + ret = service->SetPowerState(driverFlags); + +#if DEBUG || DEVELOPMENT + if (pageout) { + DKLOG(DKS " state %ld pageins %qd\n", DKN(service), state, vm_task_pageins(fOwningTask) - pageincount); + } +#endif /* DEBUG || DEVELOPMENT */ + if (kIOReturnSuccess == ret) { return 20 * 1000 * 1000; } else { @@ -5039,7 +5083,7 @@ IOUserServer::powerStateDidChangeTo(IOPMPowerFlags flags, unsigned long state, I IOLockUnlock(fLock); if (pmAck) { - IOServicePH::serverAck(this); + serverAck(); } return kIOPMAckImplied; @@ -5070,6 +5114,106 @@ IOUserServer::checkPMReady() return ready; } +IOReturn +IOUserServer::serviceCreatePMAssertion(IOService * service, uint32_t assertionBits, uint64_t * assertionID, bool synced) +{ + IOReturn ret = kIOReturnSuccess; + + *assertionID = 
kIOPMUndefinedDriverAssertionID; + + if (!service->reserved->uvars || service->reserved->uvars->userServer != this) { + return kIOReturnError; + } + + if (!service->reserved->uvars->userServerPM) { + // Cannot create PM assertion unless joined PM tree + return kIOReturnNotReady; + } + + // Check to make sure the bits are allowed + uint32_t userAllowedBits = kIOPMDriverAssertionCPUBit | + kIOPMDriverAssertionForceFullWakeupBit; + if (synced) { + userAllowedBits = kIOPMDriverAssertionCPUBit; + } + if (0 == (assertionBits & ~userAllowedBits)) { + if (synced) { + ret = getPMRootDomain()->acquireDriverKitSyncedAssertion(service, assertionID); + assert(ret != kIOReturnSuccess || *assertionID != kIOPMUndefinedDriverAssertionID); + } else { + *assertionID = getPMRootDomain()->createPMAssertion(assertionBits, + kIOPMDriverAssertionLevelOn, + getPMRootDomain(), + service->getName()); + if (!*assertionID) { + ret = kIOReturnInternalError; + } + } + } else { + ret = kIOReturnBadArgument; + } + if (*assertionID != kIOPMUndefinedDriverAssertionID) { + IOLockLock(fLock); + OSNumber * assertionIDNumber = OSNumber::withNumber(*assertionID, 64); + OSArray ** pmAssertions = (synced ? &service->reserved->uvars->pmAssertionsSynced : &service->reserved->uvars->pmAssertions); + if (!*pmAssertions) { + *pmAssertions = OSArray::withCapacity(1); + } + (*pmAssertions)->setObject(assertionIDNumber); + assertionIDNumber->release(); + IOLockUnlock(fLock); + } + + return ret; +} + +IOReturn +IOUserServer::serviceReleasePMAssertion(IOService * service, IOPMDriverAssertionID assertionID) +{ + kern_return_t ret = kIOReturnSuccess; + bool synced = false; + + bool (^findAndRemoveAssertionID)(OSArray *) = ^(OSArray * assertions) { + unsigned index; + if (!assertions) { + return false; + } + for (index = 0; index < assertions->getCount(); index++) { + OSNumber * theID = (OSNumber *)assertions->getObject(index); + if (theID->unsigned64BitValue() == assertionID) { + break; + } + } + if (index == assertions->getCount()) { + return false; + } + assertions->removeObject(index); + return true; + }; + + if (!service->reserved->uvars || !service->reserved->uvars->userServer) { + return kIOReturnError; + } + + IOLockLock(fLock); + if (findAndRemoveAssertionID(service->reserved->uvars->pmAssertionsSynced)) { + synced = true; + } else if (!findAndRemoveAssertionID(service->reserved->uvars->pmAssertions)) { + ret = kIOReturnNotFound; + } + IOLockUnlock(fLock); + + if (ret == kIOReturnSuccess) { + if (synced) { + getPMRootDomain()->releaseDriverKitSyncedAssertion(assertionID); + } else { + getPMRootDomain()->releasePMAssertion(assertionID); + } + } + + return ret; +} + kern_return_t IOService::JoinPMTree_Impl(void) { @@ -5118,7 +5262,7 @@ IOService::ChangePowerState_Impl( changePowerStateToPriv(1); break; case kIOServicePowerCapabilityOn: - changePowerStateToPriv(2); + changePowerStateToPriv(kUserServerMaxPowerState); break; default: return kIOReturnBadArgument; @@ -5184,12 +5328,19 @@ IOService::Create_Impl( if (provider != this) { return kIOReturnUnsupported; } + if (reserved == NULL || reserved->uvars == NULL) { + return kIOReturnUnsupported; + } ret = kIOReturnUnsupported; inst = NULL; service = NULL; - prop = copyProperty(propertiesKey); + prop = reserved->uvars->originalProperties->getObject(propertiesKey); + if (!prop) { + return kIOReturnBadArgument; + } + prop->retain(); properties = OSDynamicCast(OSDictionary, prop); if (!properties) { ret = kIOReturnBadArgument; @@ -5309,7 +5460,7 @@ IOService::SearchProperty_Impl( } return 
kIOReturnSuccess; }); - if (object || !(kIORegistryIterateParents & options)) { + if (object || !(kIORegistryIterateParents & regOptions)) { break; } } @@ -5432,15 +5583,55 @@ IOService::GetBusyState_Impl(uint32_t *busyState) return kIOReturnSuccess; } +kern_return_t +IOService::CreatePMAssertion_Impl(uint32_t assertionBits, uint64_t * assertionID, bool synced) +{ + *assertionID = kIOPMUndefinedDriverAssertionID; + + if (!reserved->uvars || !reserved->uvars->userServer) { + return kIOReturnError; + } + + return reserved->uvars->userServer->serviceCreatePMAssertion(this, assertionBits, assertionID, synced); +} + +kern_return_t +IOService::ReleasePMAssertion_Impl(uint64_t assertionID) +{ + if (!reserved->uvars || !reserved->uvars->userServer) { + return kIOReturnError; + } + + return reserved->uvars->userServer->serviceReleasePMAssertion(this, assertionID); +} + void -IOUserServer::systemPower(bool powerOff, bool hibernate) +IOUserServer::serverAck(void) +{ + IOServicePH::serverAck(this); +} + +void +IOUserServer::systemSuspend() +{ + if (fSystemOff && !fSuspended) { + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SUSPEND_DRIVERKIT_USERSPACE) | DBG_FUNC_START, + task_pid(fOwningTask)); + task_suspend_internal(fOwningTask); + DKLOG(DKS " did task_suspend_internal\n", DKN(this)); + fSuspended = true; + } +} + +void +IOUserServer::systemPower(uint8_t systemState, bool hibernate) { OSArray * services; { OSDictionary * sleepDescription; OSObject * prop; - sleepDescription = OSDictionary::withCapacity(4); + sleepDescription = OSDictionary::withCapacity(3); if (sleepDescription) { prop = getPMRootDomain()->copyProperty(kRootDomainSleepReasonKey); if (prop) { @@ -5463,6 +5654,30 @@ IOUserServer::systemPower(bool powerOff, bool hibernate) OSSafeReleaseNULL(sleepDescription); } } + if (!IsIOServiceSystemStateOff(systemState)) { + OSDictionary * wakeDescription; + OSObject * prop; + char wakeReasonString[128]; + + wakeDescription = OSDictionary::withCapacity(2); + if (wakeDescription) { + wakeReasonString[0] = 0; + getPMRootDomain()->copyWakeReasonString(wakeReasonString, sizeof(wakeReasonString)); + + if (wakeReasonString[0]) { + prop = OSString::withCString(&wakeReasonString[0]); + wakeDescription->setObject(gIOSystemStateWakeDescriptionWakeReasonKey, prop); + OSSafeReleaseNULL(prop); + } +#if defined(__arm__) || defined(__arm64__) + prop = OSNumber::withNumber(ml_get_conttime_offset(), sizeof(uint64_t) * CHAR_BIT); + wakeDescription->setObject(gIOSystemStateWakeDescriptionContinuousTimeOffsetKey, prop); + OSSafeReleaseNULL(prop); +#endif /* defined(__arm__) || defined(__arm64__) */ + getSystemStateNotificationService()->StateNotificationItemSet(gIOSystemStateWakeDescriptionKey, wakeDescription); + OSSafeReleaseNULL(wakeDescription); + } + } IOLockLock(fLock); @@ -5479,11 +5694,17 @@ IOUserServer::systemPower(bool powerOff, bool hibernate) return allPowerStates; }); - if (kIODKLogPM & gIODKDebug) { - DKLOG(DKS "::powerOff(%d) %d\n", DKN(this), powerOff, allPowerStates); - } + // figure what phase this DK server process will be suspended in, + // and make sure its power changes complete before suspension - if (powerOff) { + bool effectiveOff = IsIOServiceSystemStateOff(systemState) && !allPowerStates; + effectiveOff |= ((kIOServiceSystemStateOffPhase1 == systemState) && !fSystemOffPhase2Allow); + effectiveOff |= (kIOServiceSystemStateOffPhase2 == systemState); + effectiveOff |= ((kIOServiceSystemStateAOT == systemState) && !fAOTAllow); + + DKLOG(DKS "::systemPower(0x%x) effective %d current %d\n", DKN(this), 
systemState, !effectiveOff, allPowerStates != 0); + + if (effectiveOff) { fSystemPowerAck = allPowerStates; if (!fSystemPowerAck) { fSystemOff = true; @@ -5491,7 +5712,7 @@ IOUserServer::systemPower(bool powerOff, bool hibernate) IOLockUnlock(fLock); if (!fSystemPowerAck) { - IOServicePH::serverAck(this); + serverAck(); } else { if (services) { services->iterateObjects(^bool (OSObject * obj) { @@ -5507,8 +5728,17 @@ IOUserServer::systemPower(bool powerOff, bool hibernate) }); } } - } else { + } else if (fSystemOff) { fSystemOff = false; + + if (fSuspended) { + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SUSPEND_DRIVERKIT_USERSPACE) | DBG_FUNC_END, + task_pid(fOwningTask)); + task_resume_internal(fOwningTask); + DKLOG(DKS " did task_resume_internal\n", DKN(this)); + fSuspended = false; + } + IOLockUnlock(fLock); if (services) { services->iterateObjects(^bool (OSObject * obj) { @@ -5527,11 +5757,13 @@ IOUserServer::systemPower(bool powerOff, bool hibernate) return false; }); } + } else { + IOLockUnlock(fLock); + serverAck(); } OSSafeReleaseNULL(services); } - void IOUserServer::systemHalt(int howto) { @@ -5617,17 +5849,39 @@ IOReturn IOUserServer::serviceStarted(IOService * service, IOService * provider, bool result) { IOReturn ret; + bool needStop = false; DKLOG(DKS "::start(" DKS ") %s\n", DKN(service), DKN(provider), result ? "ok" : "fail"); if (!result) { + if (!service->reserved->uvars->instantiated && provider) { + // Object instantiation did not happen. This can happen if, + // 1. Dext crashed, in which case the user server has been terminated when the task is marked as corpse + // 2. Kernel IOService failed start, and it did not attempt Start + // A rematch should be attempted for 1, not 2 + bool shouldReRegister = true; + if (lockForArbitration()) { + shouldReRegister = (__state[0] & kIOServiceInactiveState); + unlockForArbitration(); + } + if (shouldReRegister) { + provider->registerService(kIOServiceAsynchronous); + } + } ret = kIOReturnSuccess; return ret; } ret = serviceJoinPMTree(service); + IOLockLock(service->reserved->uvars->uvarsLock); service->reserved->uvars->started = true; + needStop = service->reserved->uvars->needStop; + IOLockUnlock(service->reserved->uvars->uvarsLock); + if (needStop) { + serviceStop(service, provider); + return kIOReturnSuccess; + } if (service->reserved->uvars->deferredRegisterService) { service->registerService(kIOServiceAsynchronous | kIOServiceDextRequirePowerForMatching); @@ -5704,7 +5958,20 @@ IOUserServer::serviceStop(IOService * service, IOService *provider) IOReturn ret; uint32_t idx; bool pmAck; - OSObjectUserVars * uvars; + bool deferred = false; + OSObjectUserVars * uvars = service->reserved->uvars; + + IOLockLock(uvars->uvarsLock); + if (!uvars->started) { + // started will be set, at a later point + uvars->needStop = true; + deferred = true; + } + IOLockUnlock(uvars->uvarsLock); + if (deferred) { + return kIOReturnSuccess; + } + pmAck = false; IOLockLock(fLock); idx = fServices->getNextIndexOfObject(service, 0); @@ -5731,7 +5998,6 @@ IOUserServer::serviceStop(IOService * service, IOService *provider) OSSafeReleaseNULL(serviceArray); OSSafeReleaseNULL(serviceArrayObj); - uvars = service->reserved->uvars; uvars->stopped = true; uvars->powerState = 0; @@ -5753,7 +6019,7 @@ IOUserServer::serviceStop(IOService * service, IOService *provider) } IOLockUnlock(fLock); if (pmAck) { - IOServicePH::serverAck(this); + serverAck(); } if (-1U == idx) { @@ -5765,8 +6031,23 @@ IOUserServer::serviceStop(IOService * service, IOService *provider) (void) 
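systemPower() above reduces the per-phase system state to a single effectiveOff decision: a server with no powered services goes down in either off phase, phase 1 additionally takes down every server that did not opt into kIOPMSystemOffPhase2AllowKey, phase 2 takes down all DriverKit servers, and AOT takes down servers without kIOPMAOTAllowKey. The boolean is restated below as standalone C++; the numeric state values repeat the encoding this patch adds to IOServicePrivate.h.

#include <cstdint>

enum : uint8_t {   // same encoding as the IOServicePrivate.h hunk in this patch
    kOffPhase1 = 0x11, kOffPhase2 = 0x12, kAOT = 0x20, kOn = 0x30,
};

static bool isOff(uint8_t s) { return s < kAOT; }   // mirrors IsIOServiceSystemStateOff()

static bool effectiveOff(uint8_t state, bool anyPoweredService,
    bool offPhase2Allow, bool aotAllow)
{
    bool off = isOff(state) && !anyPoweredService;
    off |= (state == kOffPhase1) && !offPhase2Allow;   // not opted into phase 2: stop now
    off |= (state == kOffPhase2);                      // phase 2 suspends every DK server
    off |= (state == kAOT) && !aotAllow;               // AOT keeps only opted-in servers up
    return off;
}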
service->deRegisterInterestedDriver(this); if (uvars->userServerPM) { + IOPMrootDomain * rootDomain = getPMRootDomain(); service->PMstop(); service->acknowledgeSetPowerState(); + if (uvars->pmAssertions) { + uvars->pmAssertions->iterateObjects(^(OSObject * obj) { + rootDomain->releasePMAssertion(((OSNumber *)obj)->unsigned64BitValue()); + return false; + }); + OSSafeReleaseNULL(uvars->pmAssertions); + } + if (uvars->pmAssertionsSynced) { + uvars->pmAssertionsSynced->iterateObjects(^(OSObject * obj) { + rootDomain->releaseDriverKitSyncedAssertion(((OSNumber *)obj)->unsigned64BitValue()); + return false; + }); + OSSafeReleaseNULL(uvars->pmAssertionsSynced); + } } if (kIODKLogSetup & gIODKDebug) { DKLOG(DKS "::serviceStop(" DKS ", " DKS ")\n", DKN(this), DKN(service), DKN(provider)); @@ -5801,6 +6082,7 @@ IOUserServer::serviceFree(IOService * service) } OSSafeReleaseNULL(uvars->userServer); IOLockFree(uvars->uvarsLock); + OSSafeReleaseNULL(service->reserved->uvars->originalProperties); IOFreeType(service->reserved->uvars, OSObjectUserVars); } @@ -6384,7 +6666,7 @@ IOUserServerCheckInToken::complete() ret = kIOReturnSuccess; } - if (fState == kIOUserServerCheckInPending && --fPendingCount == 0) { + if (fState == kIOUserServerCheckInPending) { fState = kIOUserServerCheckInComplete; if (gDriverKitLaunches != NULL) { // Remove pending launch from list, if we have not shut down yet. @@ -6424,7 +6706,6 @@ IOUserServerCheckInToken::init(const OSSymbol * serverName, OSNumber * serverTag } fState = kIOUserServerCheckInPending; - fPendingCount = 1; fKextBundleID = NULL; fNeedDextDec = false; @@ -6669,7 +6950,6 @@ IOUserServerCheckInToken::findExistingToken(const OSSymbol * serverName) const OSSymbol * tokenServerName = token->fServerName; if (tokenServerName->isEqualTo(serverName)) { assert(token->fState == kIOUserServerCheckInPending); - token->fPendingCount++; result = token; result->retain(); } @@ -6978,7 +7258,6 @@ class IOStateNotificationItem : public OSObject public: virtual bool init() override; - OSDictionary * fSchema; OSDictionary * fValue; OSSet * fListeners; }; @@ -7051,8 +7330,19 @@ IOSystemStateNotification::initialize(void) gIOSystemStateWakeDescriptionWakeReasonKey = OSSymbol::withCStringNoCopy(kIOSystemStateWakeDescriptionWakeReasonKey); gIOSystemStateWakeDescriptionContinuousTimeOffsetKey = OSSymbol::withCStringNoCopy(kIOSystemStateWakeDescriptionContinuousTimeOffsetKey); +#if defined(__arm__) || defined(__arm64__) + // Make ml_get_conttime_offset available before systemPower + OSDictionary * wakeDescription = OSDictionary::withCapacity(1); + OSObject * prop = OSNumber::withNumber(ml_get_conttime_offset(), sizeof(uint64_t) * CHAR_BIT); + wakeDescription->setObject(gIOSystemStateWakeDescriptionContinuousTimeOffsetKey, prop); + ret = me->StateNotificationItemCreate(gIOSystemStateWakeDescriptionKey, wakeDescription); + OSSafeReleaseNULL(prop); + OSSafeReleaseNULL(wakeDescription); + assert(kIOReturnSuccess == ret); +#else /* !defined(__arm__) && !defined(__arm64__) */ ret = me->StateNotificationItemCreate(gIOSystemStateWakeDescriptionKey, NULL); assert(kIOReturnSuccess == ret); +#endif /* defined(__arm__) || defined(__arm64__) */ gIOSystemStateHaltDescriptionKey = (OSString *)OSSymbol::withCStringNoCopy(kIOSystemStateHaltDescriptionKey); gIOSystemStateHaltDescriptionHaltStateKey = OSSymbol::withCStringNoCopy(kIOSystemStateHaltDescriptionHaltStateKey); @@ -7104,7 +7394,6 @@ IOSystemStateNotification::setProperties(OSObject * properties) { kern_return_t kr; OSDictionary * dict; - 
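serviceStop() and serviceStarted() above now coordinate through a pair of flags under uvarsLock so that a stop racing ahead of start is deferred and replayed once start lands. A minimal standalone model of that handshake, using std::mutex in place of the IOLock and plain booleans for the uvars fields:

#include <mutex>

struct UserVars {
    std::mutex lock;
    bool started  = false;
    bool needStop = false;
};

// Model of serviceStop(): returns true if the real stop work should run now,
// false if it has been deferred to serviceStarted().
static bool tryStop(UserVars &uv)
{
    std::lock_guard<std::mutex> g(uv.lock);
    if (!uv.started) {
        uv.needStop = true;   // serviceStarted() will notice and stop for us
        return false;
    }
    return true;
}

// Model of serviceStarted() on the success path: returns true when a deferred
// stop must run immediately after start completes.
static bool onStarted(UserVars &uv)
{
    std::lock_guard<std::mutex> g(uv.lock);
    uv.started = true;
    return uv.needStop;
}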
OSDictionary * schema; OSDictionary * value; OSString * itemName; @@ -7117,9 +7406,12 @@ IOSystemStateNotification::setProperties(OSObject * properties) return kIOReturnNotPermitted; } - if ((schema = OSDynamicCast(OSDictionary, dict->getObject(kIOStateNotificationItemCreateKey)))) { - itemName = OSDynamicCast(OSString, schema->getObject(kIOStateNotificationNameKey)); - kr = StateNotificationItemCreate(itemName, schema); + if ((value = OSDynamicCast(OSDictionary, dict->getObject(kIOStateNotificationItemCreateKey)))) { + itemName = OSDynamicCast(OSString, value->getObject(kIOStateNotificationNameKey)); + itemName->retain(); + value->removeObject(kIOStateNotificationNameKey); + kr = StateNotificationItemCreate(itemName, value); + itemName->release(); } else if ((value = OSDynamicCast(OSDictionary, dict->getObject(kIOStateNotificationItemSetKey)))) { itemName = OSDynamicCast(OSString, value->getObject(kIOStateNotificationNameKey)); itemName->retain(); @@ -7146,7 +7438,7 @@ IOService::CopySystemStateNotificationService_Impl(IOService ** outService) } IOStateNotificationItem * -IOService::stateNotificationItemCopy(OSString * itemName, OSDictionary * schema) +IOService::stateNotificationItemCopy(OSString * itemName, OSDictionary * initialValue) { IOServiceStateChangeVars * ivars = reserved->svars; @@ -7163,13 +7455,10 @@ IOService::stateNotificationItemCopy(OSString * itemName, OSDictionary * schema) item->init(); item->fListeners = OSSet::withCapacity(16); - if (schema) { - schema->retain(); - } else { - schema = OSDictionary::withCapacity(8); + if (initialValue) { + initialValue->retain(); + item->fValue = initialValue; } - schema->setObject(kIOStateNotificationNameKey, name); - item->fSchema = schema; ivars->fItems->setObject(name, item); } IOLockUnlock(ivars->fLock); @@ -7180,11 +7469,11 @@ IOService::stateNotificationItemCopy(OSString * itemName, OSDictionary * schema) } kern_return_t -IOService::StateNotificationItemCreate_Impl(OSString * itemName, OSDictionary * schema) +IOService::StateNotificationItemCreate_Impl(OSString * itemName, OSDictionary * value) { IOStateNotificationItem * item; - item = stateNotificationItemCopy(itemName, schema); + item = stateNotificationItemCopy(itemName, value); if (!item) { return kIOReturnNoMemory; } @@ -7196,20 +7485,27 @@ IOService::StateNotificationItemCreate_Impl(OSString * itemName, OSDictionary * kern_return_t IOService::StateNotificationItemSet_Impl(OSString * itemName, OSDictionary * value) { + kern_return_t ret = kIOReturnSuccess; IOServiceStateChangeVars * ivars = reserved->svars; - OSSet * listeners; - IOStateNotificationItem * item; + OSSet * listeners = NULL; + IOStateNotificationItem * item; value->retain(); IOLockLock(ivars->fLock); - item = (typeof(item))ivars->fItems->getObject(itemName); - OSSafeReleaseNULL(item->fValue); - item->fValue = value; - listeners = NULL; - if (item->fListeners->getCount()) { - listeners = OSSet::withSet(item->fListeners); - } + do { + item = (typeof(item))ivars->fItems->getObject(itemName); + if (!item) { + ret = kIOReturnNotFound; + value->release(); + break; + } + OSSafeReleaseNULL(item->fValue); + item->fValue = value; + if (item->fListeners->getCount()) { + listeners = OSSet::withSet(item->fListeners); + } + } while (false); IOLockUnlock(ivars->fLock); if (listeners) { @@ -7223,7 +7519,7 @@ IOService::StateNotificationItemSet_Impl(OSString * itemName, OSDictionary * val OSSafeReleaseNULL(listeners); } - return kIOReturnSuccess; + return ret; } kern_return_t diff --git 
a/iokit/Kernel/RootDomainUserClient.cpp b/iokit/Kernel/RootDomainUserClient.cpp index 88561f9fd..839d76ca6 100644 --- a/iokit/Kernel/RootDomainUserClient.cpp +++ b/iokit/Kernel/RootDomainUserClient.cpp @@ -241,6 +241,28 @@ RootDomainUserClient::secureAttemptIdleSleepAbort( return ret; } +IOReturn +RootDomainUserClient::secureSetLockdownModeHibernation( + uint32_t status) +{ +#if HIBERNATION + int admin_priv = 0; + IOReturn ret; + + ret = clientHasPrivilege(fOwningTask, kIOClientPrivilegeAdministrator); + admin_priv = (kIOReturnSuccess == ret); + + if (admin_priv && fOwner) { + fOwner->setLockdownModeHibernation(status); + } else { + ret = kIOReturnNotPrivileged; + } + return kIOReturnSuccess; +#else + return kIOReturnError; +#endif +} + IOReturn RootDomainUserClient::clientClose( void ) { @@ -417,6 +439,15 @@ RootDomainUserClient::externalMethod(uint32_t selector, IOExternalMethodArgument .allowAsync = false, .checkEntitlement = NULL, }, + [kPMSetLDMHibernationDisable] = { + .function = &RootDomainUserClient::externalMethodDispatched, + .checkScalarInputCount = 1, + .checkStructureInputSize = 0, + .checkScalarOutputCount = 0, + .checkStructureOutputSize = 0, + .allowAsync = false, + .checkEntitlement = NULL, + }, }; return dispatchExternalMethod(selector, args, dispatchArray, sizeof(dispatchArray) / sizeof(dispatchArray[0]), this, NULL); @@ -531,6 +562,10 @@ RootDomainUserClient::externalMethodDispatched(OSObject * target, void * referen (uint32_t *) &arguments->scalarOutput[0]); break; + case kPMSetLDMHibernationDisable: + ret = me->secureSetLockdownModeHibernation((uint32_t)arguments->scalarInput[0]); + break; + default: // bad selector diff --git a/iokit/Kernel/RootDomainUserClient.h b/iokit/Kernel/RootDomainUserClient.h index 569a75738..c1bb0b958 100644 --- a/iokit/Kernel/RootDomainUserClient.h +++ b/iokit/Kernel/RootDomainUserClient.h @@ -71,6 +71,8 @@ private: IOReturn secureAttemptIdleSleepAbort( uint32_t *outReverted); + IOReturn secureSetLockdownModeHibernation( uint32_t status); + public: virtual IOReturn clientClose( void ) APPLE_KEXT_OVERRIDE; diff --git a/iokit/Tests/TestIOMemoryDescriptor.cpp b/iokit/Tests/TestIOMemoryDescriptor.cpp index 9b84f8aef..5c6e30fef 100644 --- a/iokit/Tests/TestIOMemoryDescriptor.cpp +++ b/iokit/Tests/TestIOMemoryDescriptor.cpp @@ -1156,6 +1156,19 @@ IOMemoryDescriptorTest(int newValue) panic("prepare() fail 0x%x", kr); break; } + + IOByteCount resident, dirty, swapped; + kr = md->getPageCounts(&resident, &dirty, &swapped); + if (kIOReturnSuccess != kr) { + panic("unable to getExtendedPageCounts"); + break; + } + IOLog("Page Counts: %llu resident, %llu dirty, %llu swapped\n", + resident, dirty, swapped); + if (swapped != 0) { + panic("Swapped page count is not 0 for prepared descriptor %llu", swapped); + } + for (idx = 0; idx < size; idx += sizeof(uint32_t)) { offidx = (typeof(offidx))(idx + mapoffset + srcoffset); if ((srcsize <= ptoa(5)) && (srcsize > ptoa(2)) && !(page_mask & srcoffset)) { diff --git a/iokit/Tests/TestServices/TestIODataQueues.cpp b/iokit/Tests/TestServices/TestIODataQueues.cpp new file mode 100644 index 000000000..70ff70396 --- /dev/null +++ b/iokit/Tests/TestServices/TestIODataQueues.cpp @@ -0,0 +1,79 @@ +#include +#include "TestIODataQueues.h" + +#if DEVELOPMENT || DEBUG + +OSDefineMetaClassAndStructors(TestIODataQueues, IOService); + +OSDefineMetaClassAndStructors(TestIODataQueuesUserClient, IOUserClient2022); + +bool +TestIODataQueues::start(IOService * provider) +{ + OSString * str = 
OSString::withCStringNoCopy("TestIODataQueuesUserClient"); + bool ok = IOService::start(provider); + if (ok && str != NULL) { + IOReturn ret; + ret = IOCircularDataQueueCreateWithEntries(kIOCircularDataQueueCreateProducer, 128, 16, &fCDQueue); + assert(kIOReturnSuccess == ret); + ret = IOCircularDataQueueEnqueue(fCDQueue, "hello", sizeof("hello")); + assert(kIOReturnSuccess == ret); + + setProperty(gIOUserClientClassKey, str); + registerService(); + } + OSSafeReleaseNULL(str); + return ok; +} + + +IOReturn +TestIODataQueuesUserClient::clientClose() +{ + if (!isInactive()) { + terminate(); + } + return kIOReturnSuccess; +} + +bool +TestIODataQueuesUserClient::start(IOService * provider) +{ + bool ok = IOUserClient2022::start(provider); + if (!ok) { + return false; + } + fTestIODataQueues = OSRequiredCast(TestIODataQueues, provider); + + setProperty(kIOUserClientDefaultLockingKey, kOSBooleanTrue); + setProperty(kIOUserClientDefaultLockingSetPropertiesKey, kOSBooleanTrue); + setProperty(kIOUserClientDefaultLockingSingleThreadExternalMethodKey, kOSBooleanTrue); + setProperty(kIOUserClientEntitlementsKey, "com.apple.iokit.TestIODataQueues"); + + return true; +} + +IOReturn +TestIODataQueuesUserClient::clientMemoryForType(UInt32 type, + IOOptionBits * koptions, + IOMemoryDescriptor ** kmemory) +{ + IOReturn ret = kIOReturnSuccess; + + *kmemory = IOCircularDataQueueCopyMemoryDescriptor(fTestIODataQueues->fCDQueue); + assert(*kmemory); + *koptions = kIOMapReadOnly; + + return ret; +} + +IOReturn +TestIODataQueuesUserClient::externalMethod(uint32_t selector, IOExternalMethodArgumentsOpaque * args) +{ + static const IOExternalMethodDispatch2022 dispatchArray[] = { + }; + + return dispatchExternalMethod(selector, args, dispatchArray, sizeof(dispatchArray) / sizeof(dispatchArray[0]), this, NULL); +} + +#endif /* DEVELOPMENT || DEBUG */ diff --git a/iokit/Tests/TestServices/TestIODataQueues.h b/iokit/Tests/TestServices/TestIODataQueues.h new file mode 100644 index 000000000..bdeed1425 --- /dev/null +++ b/iokit/Tests/TestServices/TestIODataQueues.h @@ -0,0 +1,36 @@ +#ifndef _IOKIT_TESTIOSERVICEUSERNOTIFICATION_H_ +#define _IOKIT_TESTIOSERVICEUSERNOTIFICATION_H_ + +#include +#include +#include + +#if DEVELOPMENT || DEBUG + +class TestIODataQueues : public IOService { + OSDeclareDefaultStructors(TestIODataQueues); + friend class TestIODataQueuesUserClient; + + IOCircularDataQueue * fCDQueue; + +public: + virtual bool start(IOService *provider) override; +}; + +class TestIODataQueuesUserClient : public IOUserClient2022 { + OSDeclareDefaultStructors(TestIODataQueuesUserClient); + + TestIODataQueues * fTestIODataQueues; + +public: + virtual bool start(IOService * provider) override; + virtual IOReturn clientClose() override; + virtual IOReturn externalMethod(uint32_t selector, IOExternalMethodArgumentsOpaque * args) override; + virtual IOReturn clientMemoryForType(UInt32 type, + IOOptionBits * koptions, + IOMemoryDescriptor ** kmemory) override; +}; + +#endif /* DEVELOPMENT || DEBUG */ + +#endif /* _IOKIT_TESTIOSERVICEUSERNOTIFICATION_H_ */ diff --git a/iokit/Tests/TestServices/TestIOServiceUserNotification.cpp b/iokit/Tests/TestServices/TestIOServiceUserNotification.cpp index 2e7f6d930..63ea5ff57 100644 --- a/iokit/Tests/TestServices/TestIOServiceUserNotification.cpp +++ b/iokit/Tests/TestServices/TestIOServiceUserNotification.cpp @@ -1,10 +1,10 @@ +#if DEVELOPMENT || DEBUG #include "TestIOServiceUserNotification.h" #include #include #include #include - -#if DEVELOPMENT || DEBUG +#include 
"../../Kernel/IOServicePrivate.h" OSDefineMetaClassAndStructors(TestIOServiceUserNotification, IOService); @@ -17,12 +17,92 @@ TestIOServiceUserNotification::start(IOService * provider) bool ret = IOService::start(provider); if (ret && str != NULL) { setProperty(gIOUserClientClassKey, str); + fUserNotifications = OSArray::withCapacity(1); + fLock = IOLockAlloc(); registerService(); } OSSafeReleaseNULL(str); return ret; } +void +TestIOServiceUserNotification::free() +{ + if (fLock) { + IOLockFree(fLock); + fLock = NULL; + } + OSSafeReleaseNULL(fUserNotifications); + IOService::free(); +} + +void +TestIOServiceUserNotification::registerUserNotification(OSObject * notification) +{ + IOLockLock(fLock); + // Proactively trim the list to avoid holding too many objects + trimUserNotificationsLocked(); + assert(fUserNotifications->getNextIndexOfObject(notification, 0) == -1); + fUserNotifications->setObject(notification); + IOLockUnlock(fLock); +} + +void +TestIOServiceUserNotification::trimUserNotificationsLocked() +{ + OSArray * remaining = OSArray::withCapacity(1); + if (!remaining) { + return; + } + fUserNotifications->iterateObjects(^(OSObject * obj) { + if (obj->getRetainCount() != 1) { + remaining->setObject(obj); + } + return false; + }); + fUserNotifications->release(); + fUserNotifications = remaining; +} + +size_t +TestIOServiceUserNotification::getUserNotificationLeakCount() +{ + size_t count = 0; + IOLockLock(fLock); + trimUserNotificationsLocked(); + count = fUserNotifications->getCount(); + IOLockUnlock(fLock); + return count; +} + +bool +TestIOServiceUserNotificationUserClient::start(IOService * provider) +{ + if (!IOUserClient::start(provider)) { + return false; + } + fProvider = OSDynamicCast(TestIOServiceUserNotification, provider); + assert(fProvider); + return true; +} + +IONotifier * +TestIOServiceUserNotificationUserClient::registerInterest(const OSSymbol * typeOfInterest, + IOServiceInterestHandler handler, + void * target, + void * ref) +{ + IONotifier * notify = IOService::registerInterest(typeOfInterest, handler, target, ref); + + // No straightforward way to make sure registerInterest is called from the test app + // Could check if handler is _ZN32IOServiceMessageUserNotification8_handlerEPvS0_jP9IOServiceS0_m + // But still cannot rule out other user process regisering interest + OSObject * obj = (OSObject *)target; + // Just panic the system if target isn't OSObject + fProvider->registerUserNotification(obj); + + return notify; +} IOReturn TestIOServiceUserNotificationUserClient::clientClose() @@ -37,7 +117,11 @@ IOReturn TestIOServiceUserNotificationUserClient::externalMethod(uint32_t selector, IOExternalMethodArguments * args, IOExternalMethodDispatch * dispatch, OSObject * target, void * reference) { - registerService(); + if (selector == 0) { + registerService(); + } else if (selector == 1 && args->scalarOutputCount >= 1) { + args->scalarOutput[0] = fProvider->getUserNotificationLeakCount(); + } return kIOReturnSuccess; } diff --git a/iokit/Tests/TestServices/TestIOServiceUserNotification.h b/iokit/Tests/TestServices/TestIOServiceUserNotification.h index 740a8eb4c..78a7c8e63 100644 --- a/iokit/Tests/TestServices/TestIOServiceUserNotification.h +++ b/iokit/Tests/TestServices/TestIOServiceUserNotification.h @@ -11,15 +11,30 @@ class TestIOServiceUserNotification : public IOService { public: virtual bool start(IOService *provider) override; + virtual void free() override; + + void registerUserNotification(OSObject * notification); + void 
trimUserNotificationsLocked(void); + size_t getUserNotificationLeakCount(void); + +private: + OSArray * fUserNotifications; + IOLock * fLock; }; class TestIOServiceUserNotificationUserClient : public IOUserClient { OSDeclareDefaultStructors(TestIOServiceUserNotificationUserClient); public: + bool start(IOService * provider) override; + IONotifier * registerInterest(const OSSymbol * typeOfInterest, + IOServiceInterestHandler handler, + void * target, void * ref = NULL) override; virtual IOReturn clientClose() override; IOReturn externalMethod(uint32_t selector, IOExternalMethodArguments * args, IOExternalMethodDispatch * dispatch, OSObject * target, void * reference) override; +private: + TestIOServiceUserNotification * fProvider; }; #endif /* DEVELOPMENT || DEBUG */ diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp index 86e94c35c..a77da13a3 100644 --- a/iokit/bsddev/IOKitBSDInit.cpp +++ b/iokit/bsddev/IOKitBSDInit.cpp @@ -1569,6 +1569,10 @@ IOTaskHasStringEntitlement(task_t task, const char *entitlement, const char *val } proc_t proc = (proc_t)get_bsdtask_info(task); + if (proc == NULL) { + return false; + } + kern_return_t ret = amfi->OSEntitlements.queryEntitlementStringWithProc( proc, entitlement, @@ -1600,6 +1604,10 @@ IOTaskHasEntitlement(task_t task, const char *entitlement) } proc_t proc = (proc_t)get_bsdtask_info(task); + if (proc == NULL) { + return false; + } + kern_return_t ret = amfi->OSEntitlements.queryEntitlementBooleanWithProc( proc, entitlement); @@ -1611,6 +1619,49 @@ IOTaskHasEntitlement(task_t task, const char *entitlement) return false; } +extern "C" boolean_t +IOTaskGetIntegerEntitlement(task_t task, const char *entitlement, uint64_t *value) +{ + void *entitlement_object = NULL; + + if (task == NULL) { + task = current_task(); + } + + /* Validate input arguments */ + if (task == kernel_task || entitlement == NULL || value == NULL) { + return false; + } + proc_t proc = (proc_t)get_bsdtask_info(task); + + if (proc == NULL) { + return false; + } + + kern_return_t ret = amfi->OSEntitlements.copyEntitlementAsOSObjectWithProc( + proc, + entitlement, + &entitlement_object); + + if (ret != KERN_SUCCESS) { + return false; + } + assert(entitlement_object != NULL); + + OSObject *os_object = (OSObject*)entitlement_object; + OSNumber *os_number = OSDynamicCast(OSNumber, os_object); + + boolean_t has_entitlement = os_number != NULL; + if (has_entitlement) { + *value = os_number->unsigned64BitValue(); + } + + /* Free the OSObject which was given to us */ + OSSafeReleaseNULL(os_object); + + return has_entitlement; +} + extern "C" OS_ALWAYS_INLINE char* IOCurrentTaskGetEntitlement(const char *entitlement) { @@ -1633,6 +1684,10 @@ IOTaskGetEntitlement(task_t task, const char *entitlement) } proc_t proc = (proc_t)get_bsdtask_info(task); + if (proc == NULL) { + return NULL; + } + kern_return_t ret = amfi->OSEntitlements.copyEntitlementAsOSObjectWithProc( proc, entitlement, @@ -1655,6 +1710,51 @@ IOTaskGetEntitlement(task_t task, const char *entitlement) return return_value; } +extern "C" boolean_t +IOTaskHasEntitlementAsBooleanOrObject(task_t task, const char *entitlement) +{ + if (task == NULL) { + task = current_task(); + } + + /* Validate input arguments */ + if (task == kernel_task || entitlement == NULL) { + return false; + } + proc_t proc = (proc_t)get_bsdtask_info(task); + + if (proc == NULL) { + return false; + } + + kern_return_t ret = amfi->OSEntitlements.queryEntitlementBooleanWithProc( + proc, + entitlement); + if (ret == KERN_SUCCESS) { + return 
true; + } + + /* Check for the presence of an object */ + void *entitlement_object = NULL; + ret = amfi->OSEntitlements.copyEntitlementAsOSObjectWithProc( + proc, + entitlement, + &entitlement_object); + if (ret != KERN_SUCCESS) { + return false; + } + assert(entitlement_object != NULL); + + OSObject *os_object = (OSObject*)entitlement_object; + + bool not_false_entitlement = (os_object != kOSBooleanFalse); + + /* Free the OSObject which was given to us */ + OSSafeReleaseNULL(os_object); + + return not_false_entitlement; +} + extern "C" boolean_t IOVnodeHasEntitlement(vnode_t vnode, int64_t off, const char *entitlement) { diff --git a/iokit/conf/Makefile.template b/iokit/conf/Makefile.template index 2b3b97464..02dfec79c 100644 --- a/iokit/conf/Makefile.template +++ b/iokit/conf/Makefile.template @@ -225,6 +225,10 @@ $(COMPONENT).filelist: $(OBJS) $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist +ifeq ($(XNU_LibAllFiles),1) +LIBOBJS := $(OBJS) +endif + $(COMPONENT).libfilelist: $(LIBOBJS) @$(LOG_LDFILELIST) "lib$(COMPONENT)" $(_v)for obj in ${LIBOBJS}; do \ diff --git a/iokit/conf/files b/iokit/conf/files index ab12bab6e..ac2aa950f 100644 --- a/iokit/conf/files +++ b/iokit/conf/files @@ -98,6 +98,7 @@ iokit/Kernel/IOUserClient.cpp optional iokitcpp iokit/Kernel/IOKitDebug.cpp optional iokitcpp iokit/Kernel/IODataQueue.cpp optional iokitcpp iokit/Kernel/IOSharedDataQueue.cpp optional iokitcpp +iokit/Kernel/IOCircularDataQueue.cpp optional iokitcpp # IOKit tests iokit/Tests/Tests.cpp optional iokitcpp @@ -109,6 +110,7 @@ iokit/Tests/TestServices/TestIOConnectMapMemoryPortLeak45265408.cpp optional iok iokit/Tests/TestServices/TestIODeviceMemoryRosetta.cpp optional iokitcpp iokit/Tests/TestServices/TestIOUserClient2022Entitlements.cpp optional iokitcpp iokit/Tests/TestServices/TestIOServiceUserNotification.cpp optional iokitcpp +iokit/Tests/TestServices/TestIODataQueues.cpp optional iokitcpp iokit/Kernel/IOStatistics.cpp optional iokitcpp iokit/Kernel/IOInterruptAccounting.cpp optional iokitcpp diff --git a/libkdd/kcdata.h b/libkdd/kcdata.h index 88ea57984..38d4f701c 100644 --- a/libkdd/kcdata.h +++ b/libkdd/kcdata.h @@ -243,6 +243,8 @@ #include #include + + #define KCDATA_DESC_MAXLEN 32 /* including NULL byte at end */ #define KCDATA_FLAGS_STRUCT_PADDING_MASK 0xf @@ -488,7 +490,7 @@ struct kcdata_type_definition { #define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ #define STACKSHOT_KCCONTAINER_TASK 0x903u #define STACKSHOT_KCCONTAINER_THREAD 0x904u -#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ +#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2, task_snapshot_v3 */ #define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ #define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ #define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* dyld_shared_cache_loadinfo */ @@ -565,6 +567,10 @@ struct kcdata_type_definition { #define STACKSHOT_KCTYPE_EXCLAVE_TEXTLAYOUT_SEGMENTS 0x954u /* struct exclave_textlayout_segment_v2 */ #define STACKSHOT_KCTYPE_KERN_EXCLAVES_CRASH_THREADINFO 0x955u /* struct thread_crash_exclaves_info */ #define STACKSHOT_KCTYPE_LATENCY_INFO_CPU 0x956u /* struct stackshot_latency_cpu */ +#define STACKSHOT_KCTYPE_TASK_EXEC_META 0x957u /* struct task_exec_meta */ +#define STACKSHOT_KCTYPE_TASK_MEMORYSTATUS 0x958u /* struct task_memorystatus_snapshot */ +#define STACKSHOT_KCTYPE_LATENCY_INFO_BUFFER 0x95au /* struct stackshot_latency_buffer */ + 
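
The two entitlement helpers added to IOKitBSDInit.cpp earlier in this patch, IOTaskGetIntegerEntitlement() and IOTaskHasEntitlementAsBooleanOrObject(), share one calling convention: a NULL task selects current_task(), the kernel_task is always rejected, and the AMFI query/copy interface performs the actual lookup. The following is a minimal, hedged usage sketch only; the entitlement keys and the caller function are hypothetical and are not part of the patch.

#include <stdbool.h>
#include <stdint.h>
#include <mach/mach_types.h>    /* task_t, boolean_t */

/* Prototypes as defined in the IOKitBSDInit.cpp hunk above (extern "C"). */
boolean_t IOTaskGetIntegerEntitlement(task_t task, const char *entitlement, uint64_t *value);
boolean_t IOTaskHasEntitlementAsBooleanOrObject(task_t task, const char *entitlement);

static bool
caller_allowed_with_limit(uint64_t *limit_out)
{
	/* NULL task means current_task(); a boolean-true entitlement or any
	 * non-kOSBooleanFalse object form is accepted. */
	if (!IOTaskHasEntitlementAsBooleanOrObject(NULL, "com.example.hypothetical.allowed")) {
		return false;
	}
	/* Optional numeric companion entitlement; *limit_out is left untouched
	 * when the key is absent or is not an OSNumber. */
	(void) IOTaskGetIntegerEntitlement(NULL, "com.example.hypothetical.limit", limit_out);
	return true;
}
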
struct stack_snapshot_frame32 { uint32_t lr; @@ -703,12 +709,33 @@ enum task_snapshot_flags { kTaskDyldCompactInfoTriedFault = 0x1000000000, kTaskWqExceededCooperativeThreadLimit = 0x2000000000, kTaskWqExceededActiveConstrainedThreadLimit = 0x4000000000, + kTaskRunawayMitigation = 0x8000000000, + kTaskIsActive = 0x10000000000, + kTaskIsManaged = 0x20000000000, + kTaskHasAssertion = 0x40000000000, }; // Note: Add any new flags to kcdata.py (ts_ss_flags) enum task_transition_type { kTaskIsTerminated = 0x1,// Past LPEXIT }; +/* See kcdata_private.h for more flag definitions */ +enum task_exec_flags : uint64_t { + kTaskExecTranslated = 0x01, /* Task is running under translation (eg, Rosetta) */ + kTaskExecHardenedHeap = 0x02, /* Task has the hardened heap security feature */ + kTaskExecReserved00 = 0x04, + kTaskExecReserved01 = 0x08, + kTaskExecReserved02 = 0x10, + kTaskExecReserved03 = 0x20 +}; + +/* metadata about a task that is fixed at spawn/exec time */ +struct task_exec_meta { + uint64_t tem_flags; /* task_exec_flags */ +} __attribute__((packed)); + + + enum thread_snapshot_flags { /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */ kHasDispatchSerial = 0x4, @@ -962,6 +989,27 @@ struct task_snapshot_v2 { char ts_p_comm[32]; } __attribute__ ((packed)); +struct task_snapshot_v3 { + uint64_t ts_unique_pid; + uint64_t ts_ss_flags; + uint64_t ts_user_time_in_terminated_threads; + uint64_t ts_system_time_in_terminated_threads; + uint64_t ts_p_start_sec; + uint64_t ts_task_size; + uint64_t ts_max_resident_size; + uint32_t ts_suspend_count; + uint32_t ts_faults; + uint32_t ts_pageins; + uint32_t ts_cow_faults; + uint32_t ts_was_throttled; + uint32_t ts_did_throttle; + uint32_t ts_latency_qos; + int32_t ts_pid; + char ts_p_comm[32]; + uint32_t ts_uid; + uint32_t ts_gid; +} __attribute__ ((packed)); + struct transitioning_task_snapshot { uint64_t tts_unique_pid; uint64_t tts_ss_flags; @@ -986,6 +1034,13 @@ struct task_delta_snapshot_v2 { uint32_t tds_latency_qos; } __attribute__ ((packed)); +struct task_memorystatus_snapshot { + int32_t tms_current_memlimit; + int32_t tms_effectivepriority; + int32_t tms_requestedpriority; + int32_t tms_assertionpriority; +} __attribute__ ((packed)); + #define KCDATA_INVALID_CS_TRUST_LEVEL 0xffffffff struct stackshot_task_codesigning_info { uint64_t csflags; @@ -1137,6 +1192,14 @@ struct stackshot_latency_cpu { uint64_t intercluster_buf_used; } __attribute__((packed)); +/* only collected if STACKSHOT_COLLECTS_LATENCY_INFO is set to !0 */ +struct stackshot_latency_buffer { + int32_t cluster_type; + uint64_t size; + uint64_t used; + uint64_t overhead; +} __attribute__ ((packed)); + /* only collected if STACKSHOT_COLLECTS_LATENCY_INFO is set to !0 */ struct stackshot_latency_task { uint64_t task_uniqueid; @@ -1300,6 +1363,10 @@ struct crashinfo_mb { uint64_t data[64]; } __attribute__((packed)); +struct crashinfo_task_security_config { + uint32_t task_security_config; /* struct task_security_config */ +} __attribute__((packed)); + #define MAX_CRASHINFO_SIGNING_ID_LEN 64 #define MAX_CRASHINFO_TEAM_ID_LEN 32 @@ -1379,6 +1446,10 @@ struct crashinfo_mb { #define TASK_CRASHINFO_JIT_ADDRESS_RANGE 0x840 /* struct crashinfo_jit_address_range */ #define TASK_CRASHINFO_MB 0x841 /* struct crashinfo_mb */ #define TASK_CRASHINFO_CS_AUXILIARY_INFO 0x842 /* uint64_t */ +#define TASK_CRASHINFO_RLIM_CORE 0x843 /* rlim_t */ +#define TASK_CRASHINFO_CORE_ALLOWED 0x844 /* uint8_t */ +#define TASK_CRASHINFO_TASK_SECURITY_CONFIG 0x845 /* struct 
task_security_config */ + #define TASK_CRASHINFO_END KCDATA_TYPE_BUFFER_END diff --git a/libkdd/kcdtypes.c b/libkdd/kcdtypes.c index 650a18ee9..db04de1e5 100644 --- a/libkdd/kcdtypes.c +++ b/libkdd/kcdtypes.c @@ -336,22 +336,24 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s case STACKSHOT_KCTYPE_TASK_SNAPSHOT: { i = 0; - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_unique_pid); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_ss_flags); - _SUBTYPE_TRUNC(KC_ST_UINT64, struct task_snapshot_v2, ts_user_time_in_terminated_threads, "ts_user_time_in_terminated_thre"); - _SUBTYPE_TRUNC(KC_ST_UINT64, struct task_snapshot_v2, ts_system_time_in_terminated_threads, "ts_system_time_in_terminated_th"); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_p_start_sec); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_task_size); - _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v2, ts_max_resident_size); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_suspend_count); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_faults); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_pageins); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_cow_faults); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_was_throttled); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_did_throttle); - _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v2, ts_latency_qos); - _SUBTYPE(KC_ST_INT32, struct task_snapshot_v2, ts_pid); - _SUBTYPE_ARRAY(KC_ST_CHAR, struct task_snapshot_v2, ts_p_comm, 32); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v3, ts_unique_pid); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v3, ts_ss_flags); + _SUBTYPE_TRUNC(KC_ST_UINT64, struct task_snapshot_v3, ts_user_time_in_terminated_threads, "ts_user_time_in_terminated_thre"); + _SUBTYPE_TRUNC(KC_ST_UINT64, struct task_snapshot_v3, ts_system_time_in_terminated_threads, "ts_system_time_in_terminated_th"); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v3, ts_p_start_sec); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v3, ts_task_size); + _SUBTYPE(KC_ST_UINT64, struct task_snapshot_v3, ts_max_resident_size); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_suspend_count); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_faults); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_pageins); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_cow_faults); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_was_throttled); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_did_throttle); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_latency_qos); + _SUBTYPE(KC_ST_INT32, struct task_snapshot_v3, ts_pid); + _SUBTYPE_ARRAY(KC_ST_CHAR, struct task_snapshot_v3, ts_p_comm, 32); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_uid); + _SUBTYPE(KC_ST_UINT32, struct task_snapshot_v3, ts_gid); setup_type_definition(retval, type_id, i, "task_snapshot"); break; } @@ -1013,6 +1015,16 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s setup_type_definition(retval, type_id, 1, "cs_auxiliary_info"); break; } + case TASK_CRASHINFO_RLIM_CORE: { + setup_subtype_description(&subtypes[0], KC_ST_UINT64, 0, "rlim_core"); + setup_type_definition(retval, type_id, 1, "rlim_core"); + break; + } + case TASK_CRASHINFO_CORE_ALLOWED: { + setup_subtype_description(&subtypes[0], KC_ST_UINT8, 0, "core_allowed"); + setup_type_definition(retval, type_id, 1, "core_allowed"); + break; + } case EXIT_REASON_SNAPSHOT: { _SUBTYPE(KC_ST_UINT32, struct exit_reason_snapshot, ers_namespace); 
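
The kcdtypes.c hunk above re-registers STACKSHOT_KCTYPE_TASK_SNAPSHOT against task_snapshot_v3, which only appends ts_uid and ts_gid after the existing task_snapshot_v2 layout, so consumers that still expect the v2 fields keep parsing the common prefix of the same payload. A small, hedged compile-time check of that layout assumption, using the packed structs declared in kcdata.h above (the include path is assumed):

#include <assert.h>    /* static_assert (C11) */
#include <stddef.h>    /* offsetof */
#include <stdint.h>
#include "kcdata.h"    /* assumed include path for the structs shown above */

/* The new fields must land exactly where the v2 layout ends, otherwise the
 * shared STACKSHOT_KCTYPE_TASK_SNAPSHOT payload would not be prefix-compatible. */
static_assert(offsetof(struct task_snapshot_v3, ts_uid) == sizeof(struct task_snapshot_v2),
    "task_snapshot_v3 must append ts_uid/ts_gid after the v2 layout");
static_assert(sizeof(struct task_snapshot_v3) == sizeof(struct task_snapshot_v2) + 2 * sizeof(uint32_t),
    "task_snapshot_v3 adds exactly two 32-bit fields");

Both assertions hold because every field in both structs is fixed-width and the structs are declared packed.
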
_SUBTYPE(KC_ST_UINT64, struct exit_reason_snapshot, ers_code); @@ -1180,7 +1192,32 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s setup_type_definition(retval, type_id, i, "stackshot_latency_thread"); break; } - + case STACKSHOT_KCTYPE_LATENCY_INFO_CPU: { + i = 0; + _SUBTYPE(KC_ST_INT32, struct stackshot_latency_cpu, cpu_number); + _SUBTYPE(KC_ST_INT32, struct stackshot_latency_cpu, cluster_type); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, init_latency_mt); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, workqueue_latency_mt); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, total_latency_mt); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, total_cycles); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, total_instrs); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, tasks_processed); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, threads_processed); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, faulting_time_mt); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, total_buf); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_cpu, intercluster_buf_used); + setup_type_definition(retval, type_id, i, "stackshot_latency_cpu"); + break; + } + case STACKSHOT_KCTYPE_LATENCY_INFO_BUFFER: { + i = 0; + _SUBTYPE(KC_ST_INT32, struct stackshot_latency_buffer, cluster_type); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_buffer, size); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_buffer, used); + _SUBTYPE(KC_ST_UINT64, struct stackshot_latency_buffer, overhead); + setup_type_definition(retval, type_id, i, "stackshot_latency_buffer"); + break; + } case TASK_CRASHINFO_KERNEL_TRIAGE_INFO_V1: { i = 0; _SUBTYPE_ARRAY(KC_ST_CHAR, struct kernel_triage_info_v1, triage_string1, MAX_TRIAGE_STRING_LEN); @@ -1191,7 +1228,6 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s setup_type_definition(retval, type_id, i, "kernel_triage_info_v1"); break; } - default: retval = NULL; break; diff --git a/libkern/amfi/amfi.c b/libkern/amfi/amfi.c index f97dad660..58df36b0f 100644 --- a/libkern/amfi/amfi.c +++ b/libkern/amfi/amfi.c @@ -2,7 +2,8 @@ #include #include -SECURITY_READ_ONLY_LATE(const amfi_t *) amfi = NULL; +SECURITY_READ_ONLY_LATE(const amfi_t*) amfi = NULL; +SECURITY_READ_ONLY_LATE(const CEKernelAPI_t*) libCoreEntitlements = NULL; void amfi_interface_register(const amfi_t *mfi) @@ -12,3 +13,12 @@ amfi_interface_register(const amfi_t *mfi) } amfi = mfi; } + +void +amfi_core_entitlements_register(const CEKernelAPI_t *implementation) +{ + if (libCoreEntitlements) { + panic("libCoreEntitlements interface already set"); + } + libCoreEntitlements = implementation; +} diff --git a/libkern/c++/OSData.cpp b/libkern/c++/OSData.cpp index 287708037..648c0b106 100644 --- a/libkern/c++/OSData.cpp +++ b/libkern/c++/OSData.cpp @@ -78,7 +78,7 @@ OSData::initWithCapacity(unsigned int inCapacity) if (capacity) { OSCONTAINER_ACCUMSIZE(-(size_t)capacity); /* can't use kfree() as we need to pass Z_MAY_COPYINMAP */ - __kheap_realloc(KHEAP_DATA_BUFFERS, data, capacity, 0, + __kheap_realloc(GET_KEXT_KHEAP_DATA(), data, capacity, 0, Z_VM_TAG_BT(Z_WAITOK_ZERO | Z_FULLSIZE | Z_MAY_COPYINMAP, VM_KERN_MEMORY_LIBKERN), (void *)&this->data); data = nullptr; @@ -89,7 +89,7 @@ OSData::initWithCapacity(unsigned int inCapacity) * Nothing to change */ } else { - kr = kalloc_ext(KHEAP_DATA_BUFFERS, inCapacity, + kr = kalloc_ext(GET_KEXT_KHEAP_DATA(), inCapacity, Z_VM_TAG_BT(Z_WAITOK_ZERO | Z_FULLSIZE | 
Z_MAY_COPYINMAP, VM_KERN_MEMORY_LIBKERN), (void *)&this->data); @@ -225,7 +225,7 @@ OSData::free() { if ((capacity != EXTERNAL) && data && capacity) { /* can't use kfree() as we need to pass Z_MAY_COPYINMAP */ - __kheap_realloc(KHEAP_DATA_BUFFERS, data, capacity, 0, + __kheap_realloc(GET_KEXT_KHEAP_DATA(), data, capacity, 0, Z_VM_TAG_BT(Z_WAITOK_ZERO | Z_FULLSIZE | Z_MAY_COPYINMAP, VM_KERN_MEMORY_LIBKERN), (void *)&this->data); OSCONTAINER_ACCUMSIZE( -((size_t)capacity)); @@ -284,7 +284,7 @@ OSData::ensureCapacity(unsigned int newCapacity) return capacity; } - kr = krealloc_ext((void *)KHEAP_DATA_BUFFERS, data, capacity, finalCapacity, + kr = krealloc_ext(GET_KEXT_KHEAP_DATA(), data, capacity, finalCapacity, Z_VM_TAG_BT(Z_WAITOK_ZERO | Z_FULLSIZE | Z_MAY_COPYINMAP, VM_KERN_MEMORY_LIBKERN), (void *)&this->data); @@ -319,7 +319,7 @@ OSData::clipForCopyout() * address stable. */ if (length >= msg_ool_size_small && newCapacity < capacity) { - kr = krealloc_ext((void *)KHEAP_DATA_BUFFERS, + kr = krealloc_ext(GET_KEXT_KHEAP_DATA(), data, capacity, newCapacity, Z_VM_TAG_BT(Z_WAITOK_ZERO | Z_FULLSIZE | Z_MAY_COPYINMAP, VM_KERN_MEMORY_LIBKERN), (void *)&this->data); diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp index d15eb69bd..9252d101a 100644 --- a/libkern/c++/OSKext.cpp +++ b/libkern/c++/OSKext.cpp @@ -50,7 +50,9 @@ extern "C" { #include #include #include +#include #include +#include #include @@ -550,7 +552,7 @@ static uint32_t sKextAccountsCount; /********************************************************************* * sKextLoggingLock protects the logging variables declared immediately below. **********/ -static IOLock * sKextLoggingLock = NULL; +__static_testable IOLock * sKextLoggingLock = NULL; static const OSKextLogSpec kDefaultKernelLogFilter = kOSKextLogBasicLevel | kOSKextLogVerboseFlagsMask; @@ -1036,10 +1038,10 @@ OSKext::initialize(void) sKeepSymbols = true; } #endif /* CONFIG_DTRACE */ -#if KASAN_DYNAMIC_BLACKLIST +#if KASAN_DYNAMIC_DENYLIST /* needed for function lookup */ sKeepSymbols = true; -#endif +#endif /* KASAN_DYNAMIC_DENYLIST */ /* * Should we panic when the SystemKC is not linked against the @@ -1220,7 +1222,7 @@ OSKext::removeKextBootstrap(void) #if __arm__ || __arm64__ /* Free the memory that was set up by iBoot. */ -#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) +#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) && !defined(KERNEL_INTEGRITY_PV_CTRR) /* We cannot free the KLD segment with CTRR enabled as it contains text and * is covered by the contiguous rorgn. 
*/ @@ -1591,14 +1593,28 @@ finish: /********************************************************************* *********************************************************************/ /* static */ + bool OSKext::driverkitEnabled(void) { -#if XNU_TARGET_OS_WATCH - return false; -#else //!XNU_TARGET_OS_WATCH + #if XNU_TARGET_OS_WATCH + /* + * Driverkit support is available on watchOS only if the device + * tree has the "supports-driverkit" property in its "/product" node + */ + DTEntry entry; + void const *prop = NULL; + unsigned int prop_size; + + if (kSuccess != SecureDTLookupEntry(NULL, "/product", &entry)) { + return false; + } + if (kSuccess != SecureDTGetProperty(entry, "supports-driverkit", &prop, &prop_size)) { + return false; + } + #endif /* XNU_TARGET_OS_WATCH */ + return true; -#endif //XNU_TARGET_OS_WATCH } /********************************************************************* @@ -1607,12 +1623,12 @@ OSKext::driverkitEnabled(void) bool OSKext::iokitDaemonAvailable(void) { -#if !XNU_TARGET_OS_IOS && !XNU_TARGET_OS_OSX +#if !XNU_TARGET_OS_IOS && !XNU_TARGET_OS_OSX && !XNU_TARGET_OS_WATCH int notused; if (PE_parse_boot_argn("-restore", ¬used, sizeof(notused))) { return false; } -#endif //!XNU_TARGET_OS_IOS && !XNU_TARGET_OS_OSX +#endif /* !XNU_TARGET_OS_IOS && !XNU_TARGET_OS_OSX && !XNU_TARGET_OS_WATCH */ return driverkitEnabled(); } @@ -1679,10 +1695,10 @@ finish: } void -OSKext::willUserspaceReboot(void) +OSKext::setWillUserspaceReboot(void) { OSKext::willShutdown(); - IOService::userSpaceWillReboot(); + IOService::setWillUserspaceReboot(); gIOCatalogue->terminateDriversForUserspaceReboot(); } @@ -1704,6 +1720,12 @@ OSKext::resetAfterUserspaceReboot(void) IORecursiveLockUnlock(sKextLock); } +extern "C" int +OSKextIsInUserspaceReboot(void) +{ + return IOService::getWillUserspaceReboot(); +} + extern "C" void OSKextResetAfterUserspaceReboot(void) { @@ -3823,7 +3845,7 @@ OSKext::extractMkext2FileData( if (KERN_SUCCESS != kmem_alloc(kernel_map, (vm_offset_t*)&uncompressedDataBuffer, fullSize, - KMA_DATA, VM_KERN_MEMORY_OSKEXT)) { + KMA_DATA_SHARED, VM_KERN_MEMORY_OSKEXT)) { /* How's this for cheesy? The kernel is only asked to extract * kext plists so we tailor the log messages. */ @@ -4220,7 +4242,7 @@ OSKext::serializeLogInfo( logInfoLength = serializer->getLength(); kmem_result = kmem_alloc(kernel_map, (vm_offset_t *)&buffer, round_page(logInfoLength), - KMA_DATA, VM_KERN_MEMORY_OSKEXT); + KMA_DATA_SHARED, VM_KERN_MEMORY_OSKEXT); if (kmem_result != KERN_SUCCESS) { OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | @@ -10727,7 +10749,7 @@ OSKext::handleRequest( /* This kmem_alloc sets the return value of the function. 
*/ kmem_result = kmem_alloc(kernel_map, (vm_offset_t *)&buffer, - round_page(responseLength), KMA_DATA, VM_KERN_MEMORY_OSKEXT); + round_page(responseLength), KMA_DATA_SHARED, VM_KERN_MEMORY_OSKEXT); if (kmem_result != KERN_SUCCESS) { OSKextLog(/* kext */ NULL, kOSKextLogErrorLevel | @@ -16413,7 +16435,7 @@ OSKext::updateLoadedKextSummaries(void) sLoadedKextSummariesAllocSize = 0; } result = kmem_alloc(kernel_map, (vm_offset_t *)&summaryHeaderAlloc, size, - KMA_DATA, VM_KERN_MEMORY_OSKEXT); + KMA_NONE, VM_KERN_MEMORY_OSKEXT); if (result != KERN_SUCCESS) { goto finish; } @@ -16978,13 +17000,13 @@ static int sysctl_willuserspacereboot (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { - int new_value = 0, old_value = 0, changed = 0; + int new_value = 0, old_value = get_system_inuserspacereboot(), changed = 0; int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed); if (error) { return error; } if (changed) { - OSKext::willUserspaceReboot(); + OSKext::setWillUserspaceReboot(); } return 0; } diff --git a/libkern/c++/OSObject.cpp b/libkern/c++/OSObject.cpp index cd00373ba..b8db91f55 100644 --- a/libkern/c++/OSObject.cpp +++ b/libkern/c++/OSObject.cpp @@ -136,8 +136,8 @@ OSObject::taggedTryRetain(const void *tag) const #else /* DEBUG */ // @@@ gvdl: eventually need to make this panic optional // based on a boot argument i.e. debug= boot flag - panic("OSObject::refcount: " - "About to wrap the reference count, reference leak?"); + panic("OSObject(%p)::refcount: " + "About to wrap the reference count, reference leak?", this); #endif /* !DEBUG */ } } @@ -152,7 +152,7 @@ void OSObject::taggedRetain(const void *tag) const { if (!taggedTryRetain(tag)) { - panic("OSObject::refcount: Attempting to retain a freed object"); + panic("OSObject(%p)::refcount: Attempting to retain a freed object", this); } } @@ -195,8 +195,8 @@ OSObject::taggedRelease(const void *tag, const int when) const #else /* DEBUG */ // @@@ gvdl: eventually need to make this panic optional // based on a boot argument i.e. 
debug= boot flag - panic("OSObject::refcount: %s", - "About to unreference a pegged object, reference leak?"); + panic("OSObject(%p)::refcount: %s", + "About to unreference a pegged object, reference leak?", this); #endif /* !DEBUG */ } } @@ -220,8 +220,8 @@ OSObject::taggedRelease(const void *tag, const int when) const // xxx - any code in the kernel could trip this, // xxx - and it applies as noted to all collections, not just the registry if ((UInt16) actualCount < (actualCount >> 16)) { - panic("A kext releasing a(n) %s has corrupted the registry.", - getClassName(this)); + panic("A kext releasing a(n) %s %p has corrupted the registry.", + getClassName(this), this); } // Check for a 'free' condition and that if we are first through diff --git a/libkern/c++/OSSerialize.cpp b/libkern/c++/OSSerialize.cpp index 7ea472987..b483621c7 100644 --- a/libkern/c++/OSSerialize.cpp +++ b/libkern/c++/OSSerialize.cpp @@ -227,7 +227,7 @@ OSSerialize::initWithCapacity(unsigned int inCapacity) // into user space (the primary use of the OSSerialize object) kmr = kmem_alloc_guard(kernel_map, inCapacity, /* mask */ 0, - (kma_flags_t)(KMA_ZERO | KMA_DATA), OSSerialize_guard()); + (kma_flags_t)(KMA_ZERO | KMA_DATA_SHARED), OSSerialize_guard()); if (kmr.kmr_return == KERN_SUCCESS) { data = (char *)kmr.kmr_ptr; diff --git a/libkern/c++/OSSymbol.cpp b/libkern/c++/OSSymbol.cpp index 1652cf2a8..9d4e08424 100644 --- a/libkern/c++/OSSymbol.cpp +++ b/libkern/c++/OSSymbol.cpp @@ -453,7 +453,7 @@ OSSymbol::withCString(const char *cString) return symbol; } -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* * Empirically, symbols which string is from the rorgn part of the * kernel are asked about all the time. @@ -465,7 +465,7 @@ OSSymbol::withCString(const char *cString) * from baseline (~6k), but avoiding the string copies saves about 60k. 
*/ permanent = rorgn_contains((vm_offset_t)cString, key.smrk_len + 1, false); -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ /* * can't use OSString::initWithCString* because it calls @@ -511,9 +511,9 @@ OSSymbol::withCStringNoCopy(const char *cString) return symbol; } -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) permanent = rorgn_contains((vm_offset_t)cString, key.smrk_len + 1, false); -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ auto newSymb = OSMakeShared(); diff --git a/libkern/conf/Makefile.template b/libkern/conf/Makefile.template index f0c8a72fd..6ef0911b6 100644 --- a/libkern/conf/Makefile.template +++ b/libkern/conf/Makefile.template @@ -160,6 +160,10 @@ $(COMPONENT).filelist: $(OBJS) $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist +ifeq ($(XNU_LibAllFiles),1) +LIBOBJS := $(OBJS) +endif + $(COMPONENT).libfilelist: $(LIBOBJS) @$(LOG_LDFILELIST) "lib$(COMPONENT)" $(_v)for obj in ${LIBOBJS}; do \ diff --git a/libkern/firehose/firehose_types_private.h b/libkern/firehose/firehose_types_private.h index ede7e150e..0e5d83f0a 100644 --- a/libkern/firehose/firehose_types_private.h +++ b/libkern/firehose/firehose_types_private.h @@ -76,6 +76,7 @@ OS_ENUM(firehose_stream, uint8_t, firehose_stream_signpost = 4, firehose_stream_memory_wifi = 5, firehose_stream_memory_baseband = 6, + firehose_stream_metric = 7, _firehose_stream_max, _firehose_stream_disabled = (uint8_t)-1, @@ -94,6 +95,7 @@ OS_ENUM(firehose_tracepoint_namespace, uint8_t, firehose_tracepoint_namespace_metadata = 0x05, firehose_tracepoint_namespace_signpost = 0x06, firehose_tracepoint_namespace_loss = 0x07, + firehose_tracepoint_namespace_metric = 0x08, ); /*! @@ -283,6 +285,34 @@ OS_OPTIONS(_firehose_tracepoint_flags_signpost, uint16_t, _firehose_tracepoint_flags_signpost_has_name = 0x8000, ); +/*! + * @enum _firehose_tracepoint_type_metric_t + * + * @abstract + * Types of Metric tracepoints (namespace metric). + */ +OS_ENUM(_firehose_tracepoint_type_metric, firehose_tracepoint_type_t, + _firehose_tracepoint_type_metric_integer = 0x00, + _firehose_tracepoint_type_metric_double = 0x01, + ); + +/*! + * @enum firehose_tracepoint_flags_metric_t + * + * @abstract + * Flags for Metric tracepoints (namespace metric). + * + * When flags are shared with the log type, they should have the same values. 
+ */ +OS_OPTIONS(_firehose_tracepoint_flags_metric, uint16_t, + // shared with log + _firehose_tracepoint_flags_metric_has_private_data = 0x0100, + _firehose_tracepoint_flags_metric_has_subsystem = 0x0200, + _firehose_tracepoint_flags_metric_has_rules = 0x0400, + _firehose_tracepoint_flags_metric_has_oversize = 0x0800, + _firehose_tracepoint_flags_metric_has_context_data = 0x1000, + ); + /* MIG firehose push reply structure */ typedef struct firehose_push_reply_s { uint64_t fpr_mem_flushed_pos; diff --git a/libkern/kxld/kxld_util.c b/libkern/kxld/kxld_util.c index e5e889598..a53b7d165 100644 --- a/libkern/kxld/kxld_util.c +++ b/libkern/kxld/kxld_util.c @@ -160,7 +160,7 @@ kxld_calloc(size_t size) void * ptr = NULL; #if KERNEL - ptr = kheap_alloc_tag(KHEAP_DEFAULT, size, Z_WAITOK | Z_ZERO, + ptr = kheap_alloc_tag(GET_KEXT_KHEAP_DATA(), size, Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_OSKEXT); #else ptr = calloc(1, size); @@ -182,7 +182,7 @@ kxld_alloc(size_t size) void * ptr = NULL; #if KERNEL - ptr = kheap_alloc_tag(KHEAP_DEFAULT, size, Z_WAITOK | Z_ZERO, + ptr = kheap_alloc_tag(GET_KEXT_KHEAP_DATA(), size, Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_OSKEXT); #else ptr = malloc(size); @@ -208,7 +208,7 @@ kxld_page_alloc_untracked(size_t size) size = round_page(size); #if KERNEL - ptr = kheap_alloc_tag(KHEAP_DEFAULT, size, Z_WAITOK | Z_ZERO, + ptr = kheap_alloc_tag(GET_KEXT_KHEAP_DATA(), size, Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_OSKEXT); #else /* !KERNEL */ ptr = calloc(1, size); diff --git a/libkern/libkern/amfi/amfi.h b/libkern/libkern/amfi/amfi.h index 61e2fc328..67c259b97 100644 --- a/libkern/libkern/amfi/amfi.h +++ b/libkern/libkern/amfi/amfi.h @@ -32,9 +32,12 @@ #include #include #include +#include +#include #define KERN_AMFI_INTERFACE_VERSION 7 #define KERN_AMFI_SUPPORTS_DATA_ALLOC 2 +#define KERN_AMFI_SUPPORTS_CORE_ENTITLEMENTS_V2 1 #pragma mark Forward Declarations struct proc; @@ -269,6 +272,12 @@ __BEGIN_DECLS */ extern const amfi_t * amfi; +/*! + * @const amfi + * The AMFI interface that was registered. + */ +extern const CEKernelAPI_t *libCoreEntitlements; + /*! * @function amfi_interface_register * Registers the AMFI kext interface for use within the kernel proper. @@ -284,6 +293,21 @@ OS_EXPORT OS_NONNULL1 void amfi_interface_register(const amfi_t *mfi); +/*! + * @function amfi_core_entitlements_register + * Registers the CoreEntitlements_V2 implementation for use within the kernel. + * + * @param implementation + * The implementation to register. + * + * @discussion + * This routine may only be called once and must be called before late-const has + * been applied to kernel memory. 
+ */ +OS_EXPORT OS_NONNULL1 +void +amfi_core_entitlements_register(const CEKernelAPI_t *implementation); + __END_DECLS #endif // __AMFI_H diff --git a/libkern/libkern/c++/OSKext.h b/libkern/libkern/c++/OSKext.h index 50d2a87ce..406be04a8 100644 --- a/libkern/libkern/c++/OSKext.h +++ b/libkern/libkern/c++/OSKext.h @@ -395,13 +395,14 @@ public: static OSPtr copyKexts(void); static OSReturn removeKextBootstrap(void); static void willShutdown(void);// called by IOPMrootDomain on shutdown - static void willUserspaceReboot(void); + static void setWillUserspaceReboot(void); static void resetAfterUserspaceReboot(void); static void reportOSMetaClassInstances( const char * kextIdentifier, OSKextLogSpec msgLogSpec); static void OSKextLogDriverKitInfoLoad(OSKext *kext); static bool iokitDaemonAvailable(void); + static bool driverkitEnabled(void); #endif /* XNU_KERNEL_PRIVATE */ private: @@ -606,7 +607,6 @@ private: /* Sync with user space. */ static OSReturn pingIOKitDaemon(void); - static bool driverkitEnabled(void); /* Getting info about loaded kexts (kextstat). */ @@ -945,6 +945,7 @@ public: } }; +extern "C" int OSKextIsInUserspaceReboot(void); extern "C" void OSKextResetAfterUserspaceReboot(void); #endif /* !_LIBKERN_OSKEXT_H */ diff --git a/libkern/libkern/section_keywords.h b/libkern/libkern/section_keywords.h index cb3a9e72c..0410aa5c2 100644 --- a/libkern/libkern/section_keywords.h +++ b/libkern/libkern/section_keywords.h @@ -44,6 +44,7 @@ #define SECURITY_SECTION_NAME "__const" #define SECURITY_SEGMENT_SECTION_NAME "__DATA,__const" +#ifndef __BUILDING_XNU_LIBRARY__ #define __security_const_early const #define __security_const_late __attribute__((section(SECURITY_SEGMENT_SECTION_NAME))) #define __security_read_write @@ -53,6 +54,17 @@ #define MARK_AS_HIBERNATE_DATA __attribute__((section("__HIB, __data"))) #define MARK_AS_HIBERNATE_DATA_CONST_LATE __attribute__((section("__HIB, __const"))) #endif /* HIBERNATION */ + +#else /* __BUILDING_XNU_LIBRARY__ */ +/* Special segments are not used when building for user-mode */ +#define __security_const_early +#define __security_const_late +#define __security_read_write +#define MARK_AS_HIBERNATE_TEXT +#define MARK_AS_HIBERNATE_DATA +#define MARK_AS_HIBERNATE_DATA_CONST_LATE +#endif /* __BUILDING_XNU_LIBRARY__ */ + #endif /* __arm64__ || __x86_64__ */ #ifndef __security_const_early diff --git a/libkern/os/atomic_private.h b/libkern/os/atomic_private.h index c3a1536e6..ce1079ec6 100644 --- a/libkern/os/atomic_private.h +++ b/libkern/os/atomic_private.h @@ -36,7 +36,7 @@ * This file defines nicer (terser and safer) wrappers for C11's . * * @discussion - * @see xnu.git::doc/atomics.md which provides more extensive documentation + * @see xnu.git::doc/primitives/atomics.md which provides more extensive documentation * about this header. * * Note that some of the macros defined in this file may be overridden by @@ -49,7 +49,7 @@ * * - the os_atomic-specific `dependency` memory ordering that is used to * document intent to a carry a data or address dependency. - * See doc/atomics.md for more information. + * See doc/primitives/atomics.md for more information. * * - a compiler barrier: compiler_acquire, compiler_release, compiler_acq_rel * without a corresponding memory fence. 
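
The section_keywords.h change above makes the security and hibernation section attributes compile away when __BUILDING_XNU_LIBRARY__ is set, so the same declarations build both as locked-down kernel globals and as ordinary globals in the user-mode test library. A hedged sketch of what that means for a single declaration; the variable name is hypothetical, and the attribute expansion mirrors the header as shown in this patch:

#include <libkern/section_keywords.h>

/*
 * Kernel build: __security_const_late places the symbol in __DATA,__const,
 * where it becomes read-only once late-const lockdown is applied.
 * __BUILDING_XNU_LIBRARY__ build: the macro expands to nothing, leaving a
 * plain mutable global that user-mode unit tests can set up directly.
 */
__security_const_late const char *example_policy_name;
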
diff --git a/libkern/os/base.h b/libkern/os/base.h index 52dd7dce9..6ce6ffacb 100644 --- a/libkern/os/base.h +++ b/libkern/os/base.h @@ -71,7 +71,11 @@ #define OS_WEAK __attribute__((__weak__)) #define OS_WEAK_IMPORT __attribute__((__weak_import__)) #define OS_NOINLINE __attribute__((__noinline__)) +#ifndef __BUILDING_XNU_LIBRARY__ #define OS_ALWAYS_INLINE __attribute__((__always_inline__)) +#else /* __BUILDING_XNU_LIBRARY__ */ +#define OS_ALWAYS_INLINE +#endif /* __BUILDING_XNU_LIBRARY__ */ #define OS_TRANSPARENT_UNION __attribute__((__transparent_union__)) #define OS_ALIGNED(n) __attribute__((__aligned__((n)))) #define OS_FORMAT_PRINTF(x, y) __attribute__((__format__(printf,x,y))) @@ -380,4 +384,21 @@ typedef void (^os_block_t)(void); #define OS_COUNTED_BY(N) __counted_by(N) #define OS_SIZED_BY(N) __sized_by(N) +#if XNU_KERNEL_PRIVATE +#if __BUILDING_XNU_LIBRARY__ +// These are used to mark functions which should normally be static but +// should be callable from the tester +#define __static_testable +// inline makes the functions not visible +#define __inline_testable +// This marks a function which could be overriden by a mock in a unit-tester +#define __mockable __attribute__((noinline)) +#else // __BUILDING_XNU_LIBRARY__ +#define __static_testable static +#define __inline_testable inline +#define __mockable +#endif // __BUILDING_XNU_LIBRARY__ + +#endif // XNU_KERNEL_PRIVATE + #endif // __OS_BASE__ diff --git a/libkern/os/hash.h b/libkern/os/hash.h index 698303a64..e1f961ef7 100644 --- a/libkern/os/hash.h +++ b/libkern/os/hash.h @@ -119,11 +119,43 @@ os_hash_jenkins(const void *data, size_t length) static inline uint32_t os_hash_kernel_pointer(const void *pointer) { - uintptr_t key = (uintptr_t)pointer >> 4; + uintptr_t key = (uintptr_t)((intptr_t)pointer << 16) >> 20; key *= 0x5052acdb; return (uint32_t)key ^ __builtin_bswap32((uint32_t)key); } +/*! + * @function os_hash_uint64 + * + * @brief + * Hashes a 64 bit number. + * + * @discussion + * This is a really cheap and fast mixer. + * + * This should be not used for untrusted values from userspace, + * or cases when the pointer is somehow under the control of userspace. + * + * See https://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html + * + * @param u64 + * The value to hash + * + * @returns + * The hash for this integer. + */ +static inline uint32_t +os_hash_uint64(uint64_t u64) +{ + u64 ^= (u64 >> 31); + u64 *= 0x7fb5d329728ea185ull; + u64 ^= (u64 >> 27); + u64 *= 0x81dadef4bc2dd44dull; + u64 ^= (u64 >> 33); + + return (uint32_t)u64; +} + __END_DECLS #endif // PRIVATE diff --git a/libkern/os/log.c b/libkern/os/log.c index 5f56c782b..26af99db9 100644 --- a/libkern/os/log.c +++ b/libkern/os/log.c @@ -708,6 +708,7 @@ _os_log_to_msgbuf_internal(const char *format, va_list args, uint64_t timestamp, #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" va_copy(args_copy, args); newlogline = vprintf_log_locked(format, args_copy, addcr); va_end(args_copy); @@ -868,7 +869,7 @@ _os_log_internal_driverKit(void *dso, os_log_t log, uint8_t type, const char *fm return 0; } -__attribute__((noinline, not_tail_called)) void +__attribute__((noinline, not_tail_called)) __mockable void os_log_with_args(os_log_t oslog, os_log_type_t type, const char *fmt, va_list args, void *addr) { @@ -1291,6 +1292,7 @@ save_pattern(char buf[static TESTBUFLEN], uint32_t *crc, const char *fmt, ...) 
va_start(va, fmt); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" int n = vscnprintf(buf, TESTBUFLEN, fmt, va); #pragma clang diagnostic pop va_end(va); diff --git a/libkern/os/log_encode.c b/libkern/os/log_encode.c index 6e2e17eb9..40dbf5d65 100644 --- a/libkern/os/log_encode.c +++ b/libkern/os/log_encode.c @@ -464,6 +464,18 @@ log_encode_fmt(os_log_context_t ctx, const char *format, va_list args) // Skipping field width, libtrace takes care of it. break; + case 'e': + case 'E': + case 'a': // float hex + case 'A': // float hex upper + case 'g': + case 'f': + // floats are always promoted to doubles + value.f = va_arg(args, double); + err = log_encode_fmt_arg(&value.f, sizeof(value.f), OSLF_CMD_TYPE_SCALAR, ctx); + done = true; + break; + default: return EINVAL; } diff --git a/libkern/os/log_encode_types.h b/libkern/os/log_encode_types.h index abf9dca6f..fe40843e6 100644 --- a/libkern/os/log_encode_types.h +++ b/libkern/os/log_encode_types.h @@ -87,6 +87,7 @@ union os_log_fmt_types_u { ptrdiff_t pd; long l; long long ll; + double f; }; typedef struct os_log_fmt_hdr_s { diff --git a/libkern/os/log_queue.c b/libkern/os/log_queue.c index 1a0db78b2..e2efab754 100644 --- a/libkern/os/log_queue.c +++ b/libkern/os/log_queue.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "log_queue.h" #include "log_mem.h" @@ -93,6 +94,7 @@ typedef struct { thread_call_t lq_mem_handler; size_t lq_cnt_mem_active; size_t lq_cnt_mem_avail; + size_t lq_cnt_mem_max; size_t lq_cnt_mem_meta_avail; _Atomic lq_req_state_t lq_req_state; void *lq_req_mem; @@ -128,6 +130,11 @@ extern bool os_log_disabled(void); TUNABLE(size_t, lq_bootarg_size_order, "lq_size_order", LQ_DEFAULT_SZ_ORDER); TUNABLE(size_t, lq_bootarg_nslots, "lq_nslots", LQ_MAX_LM_SLOTS); +atomic_size_t lq_max_slots = LQ_MAX_LM_SLOTS; +#if DEVELOPMENT || DEBUG +SYSCTL_UINT(_debug, OID_AUTO, log_queue_max_slots, CTLFLAG_RW, (unsigned int *)&lq_max_slots, 0, ""); +#endif + SCALABLE_COUNTER_DEFINE(log_queue_cnt_received); SCALABLE_COUNTER_DEFINE(log_queue_cnt_rejected_fh); SCALABLE_COUNTER_DEFINE(log_queue_cnt_queued); @@ -387,12 +394,32 @@ log_queue_mem_init(log_queue_t lq, size_t idx, void *buf, size_t buflen) LQ_MIN_LOG_SZ_ORDER, LQ_MAX_LOG_SZ_ORDER); } +void +log_queue_set_max_slots(size_t max_slots) +{ + max_slots = MAX(max_slots, LQ_MIN_ALLOCATED_LM_SLOTS); + assert(max_slots <= LQ_MAX_LM_SLOTS); + assert(max_slots >= LQ_MIN_ALLOCATED_LM_SLOTS); + atomic_store_explicit((atomic_size_t *)&lq_max_slots, max_slots, memory_order_relaxed); +} + +static size_t +log_queue_mem_max_slots(void) +{ + size_t max_slots = atomic_load_explicit((atomic_size_t *)&lq_max_slots, memory_order_relaxed); + max_slots = MAX(max_slots, LQ_MIN_ALLOCATED_LM_SLOTS); + assert(max_slots <= LQ_MAX_LM_SLOTS); + assert(max_slots >= LQ_MIN_ALLOCATED_LM_SLOTS); + return max_slots; +} + static int log_queue_mem_free_slot(log_queue_t lq) { assert(LQ_MEM_ENABLED(lq, 0)); + assert(lq->lq_cnt_mem_max <= LQ_MAX_LM_SLOTS); - for (int i = LQ_MIN_ALLOCATED_LM_SLOTS; i < LQ_MAX_LM_SLOTS; i++) { + for (int i = LQ_MIN_ALLOCATED_LM_SLOTS; i < lq->lq_cnt_mem_max; i++) { if (!LQ_MEM_ENABLED(lq, i)) { return i; } @@ -518,8 +545,11 @@ log_queue_mem_reconfigure(log_queue_t lq) static boolean_t log_queue_needs_memory(log_queue_t lq, boolean_t new_suspend) { + // Store the current upper bound before potentially growing the queue. 
+ lq->lq_cnt_mem_max = log_queue_mem_max_slots(); + if (new_suspend || log_queue_low_mem(lq)) { - return lq->lq_cnt_mem_active < LQ_MAX_LM_SLOTS; + return lq->lq_cnt_mem_active < lq->lq_cnt_mem_max; } return false; } @@ -643,11 +673,12 @@ log_queue_add(log_payload_t lp, const uint8_t *lp_data) __startup_func static size_t -log_queue_init_memory(log_queue_t lq, size_t lm_count) +log_queue_init_memory(log_queue_t lq) { - assert(lm_count <= LQ_MAX_LM_SLOTS); + lq->lq_cnt_mem_max = log_queue_mem_max_slots(); + assert(lq->lq_cnt_mem_max <= LQ_MAX_LM_SLOTS); - for (size_t i = 0; i < lm_count; i++) { + for (size_t i = 0; i < lq->lq_cnt_mem_max; i++) { void *buf = log_queue_buffer_alloc(lq->lq_mem_size); if (!buf) { return i; @@ -657,7 +688,7 @@ log_queue_init_memory(log_queue_t lq, size_t lm_count) log_queue_mem_enable(lq, i); } - return lm_count; + return lq->lq_cnt_mem_max; } __startup_func @@ -679,6 +710,7 @@ oslog_init_log_queues(void) lq_bootarg_nslots = MAX(lq_bootarg_nslots, LQ_MIN_ALLOCATED_LM_SLOTS); lq_bootarg_nslots = MIN(lq_bootarg_nslots, LQ_MAX_LM_SLOTS); + log_queue_set_max_slots(lq_bootarg_nslots); lq_low_mem_limit = MAX(1 << (lq_bootarg_size_order - LQ_LOW_MEM_SCALE), 1024); @@ -688,7 +720,7 @@ oslog_init_log_queues(void) lq->lq_mem_size_order = lq_bootarg_size_order; lq->lq_mem_size = round_page(logmem_required_size(lq->lq_mem_size_order, LQ_MIN_LOG_SZ_ORDER)); lq->lq_mem_handler = thread_call_allocate(log_queue_memory_handler, (thread_call_param_t)lq); - slot_count += log_queue_init_memory(lq, lq_bootarg_nslots); + slot_count += log_queue_init_memory(lq); STAILQ_INIT(&lq->lq_log_list); STAILQ_INIT(&lq->lq_dispatch_list); lq->lq_ready = true; diff --git a/libkern/os/log_queue.h b/libkern/os/log_queue.h index 2ee0822b2..c07c62d60 100644 --- a/libkern/os/log_queue.h +++ b/libkern/os/log_queue.h @@ -27,5 +27,6 @@ #include "log_internal.h" bool log_queue_log(log_payload_t, const void *, bool); +void log_queue_set_max_slots(size_t); #endif /* log_queue */ diff --git a/libkern/os/refcnt.c b/libkern/os/refcnt.c index 485a2758c..369988ff3 100644 --- a/libkern/os/refcnt.c +++ b/libkern/os/refcnt.c @@ -63,9 +63,9 @@ os_ref_panic_underflow(void *rc) __abortlike static void -os_ref_panic_overflow(void *rc) +os_ref_panic_overflow(os_ref_atomic_t *rc) { - panic("os_refcnt: overflow (rc=%p)", rc); + panic("os_refcnt: overflow (rc=%p, count=%u, max=%u)", rc, os_atomic_load(rc, relaxed), OS_REFCNT_MAX_COUNT); __builtin_unreachable(); } @@ -74,7 +74,7 @@ static void os_ref_panic_retain(os_ref_atomic_t *rc) { if (os_atomic_load(rc, relaxed) >= OS_REFCNT_MAX_COUNT) { - panic("os_refcnt: overflow (rc=%p)", rc); + os_ref_panic_overflow(rc); } else { panic("os_refcnt: attempted resurrection (rc=%p)", rc); } diff --git a/libkern/os/refcnt_internal.h b/libkern/os/refcnt_internal.h index a0f9eff43..b7edf7cb0 100644 --- a/libkern/os/refcnt_internal.h +++ b/libkern/os/refcnt_internal.h @@ -452,6 +452,22 @@ os_ref_release_live_mask(os_ref_atomic_t *rc, uint32_t b, struct os_refgrp *grp) os_ref_release_live_raw_mask(rc, b, grp); } +static inline uint32_t +os_ref_release_last_raw_mask(os_ref_atomic_t *rc, uint32_t b, struct os_refgrp *grp) +{ + uint32_t val = os_ref_release_barrier_mask_internal(rc, 1u << b, grp); + if (__improbable(val >> b != 0)) { + os_ref_panic_last(rc); + } + return val; +} + +static inline void +os_ref_release_last_mask(os_ref_atomic_t *rc, uint32_t b, struct os_refgrp *grp) +{ + os_ref_release_last_raw_mask(rc, b, grp); +} + #if !OS_REFCNT_DEBUG /* remove the group argument for 
non-debug */ #define os_ref_init_count_mask(rc, b, grp, init_c, init_b) (os_ref_init_count_mask)(rc, b, NULL, init_c, init_b) @@ -462,9 +478,11 @@ os_ref_release_live_mask(os_ref_atomic_t *rc, uint32_t b, struct os_refgrp *grp) #define os_ref_release_mask(rc, b, grp) (os_ref_release_mask)((rc), (b), NULL) #define os_ref_release_relaxed_mask(rc, b, grp) (os_ref_release_relaxed_mask)((rc), (b), NULL) #define os_ref_release_raw_mask(rc, b, grp) (os_ref_release_raw_mask)((rc), (b), NULL) -#define os_ref_release_relaxed_raw_mask(rc, b, grp) (os_ref_release_relaxed_raw_mask)((rc), (b), NULL) +#define os_ref_release_raw_relaxed_mask(rc, b, grp) (os_ref_release_raw_relaxed_mask)((rc), (b), NULL) #define os_ref_release_live_raw_mask(rc, b, grp) (os_ref_release_live_raw_mask)((rc), (b), NULL) #define os_ref_release_live_mask(rc, b, grp) (os_ref_release_live_mask)((rc), (b), NULL) +#define os_ref_release_last_raw_mask(rc, b, grp) (os_ref_release_last_raw_mask)((rc), (b), NULL) +#define os_ref_release_last_mask(rc, b, grp) (os_ref_release_last_mask)((rc), (b), NULL) #endif #pragma GCC visibility pop diff --git a/libsa/conf/Makefile.template b/libsa/conf/Makefile.template index bb36037ec..35344540f 100644 --- a/libsa/conf/Makefile.template +++ b/libsa/conf/Makefile.template @@ -78,6 +78,10 @@ $(COMPONENT).filelist: $(OBJS) $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist +ifeq ($(XNU_LibAllFiles),1) +LIBOBJS := $(OBJS) +endif + $(COMPONENT).libfilelist: $(LIBOBJS) @$(LOG_LDFILELIST) "lib$(COMPONENT)" $(_v)for obj in ${LIBOBJS}; do \ diff --git a/libsyscall/mach/mach_vm.c b/libsyscall/mach/mach_vm.c index 365d9a94b..779f7a31c 100644 --- a/libsyscall/mach/mach_vm.c +++ b/libsyscall/mach/mach_vm.c @@ -38,6 +38,7 @@ #include #undef _mach_vm_user_ #include +#include #include "stack_logging_internal.h" @@ -413,3 +414,315 @@ vm_purgable_control( control, state); } + +kern_return_t +mach_vm_update_pointers_with_remote_tags( + mach_port_name_t target, + mach_vm_offset_list_t in_pointer_list, + mach_msg_type_number_t in_pointer_listCnt, + mach_vm_offset_list_t out_pointer_list, + mach_msg_type_number_t *out_pointer_listCnt) +{ + return _kernelrpc_mach_vm_update_pointers_with_remote_tags(target, in_pointer_list, in_pointer_listCnt, out_pointer_list, out_pointer_listCnt); +} + +/* + * The tag descriptions provided here are primarily exposed via vmmap(1) + * and footprint(1). The tag descriptions displayed by these tools must be + * human-readable and conform to a maximum length of 24 characters in order + * to fit within vmmap(1)'s type name column. i.e. 123456789012345678901234 + */ +static const char *vm_tag_descriptions[VM_MEMORY_COUNT] = { + /* vmmap also uses "shared memory" */ + /* maximum width indicator 123456789012345678901234 */ + [0] = "Untagged", + [VM_MEMORY_MALLOC] = "Malloc Metadata", + [VM_MEMORY_MALLOC_SMALL] = "Malloc Small", + [VM_MEMORY_MALLOC_LARGE] = "Malloc Large", + [VM_MEMORY_MALLOC_HUGE] = "Malloc Huge", + [VM_MEMORY_SBRK] = "SBRK", + [VM_MEMORY_REALLOC] = "Malloc Realloc", + [VM_MEMORY_MALLOC_TINY] = "Malloc Tiny", + [VM_MEMORY_MALLOC_LARGE_REUSABLE] = "Malloc Large (Reusable)", + [VM_MEMORY_MALLOC_LARGE_REUSED] = "Malloc Large (Reused)", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_ANALYSIS_TOOL] = "Performance Tool Data", + [VM_MEMORY_MALLOC_NANO] = "Malloc Nano", + [VM_MEMORY_MALLOC_MEDIUM] = "Malloc Medium", + [VM_MEMORY_MALLOC_PROB_GUARD] = "Malloc Prob. 
Guard", + [14] = "VM_MEMORY_14", + [15] = "VM_MEMORY_15", + [16] = "VM_MEMORY_16", + [17] = "VM_MEMORY_17", + [18] = "VM_MEMORY_18", + [19] = "VM_MEMORY_19", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_MACH_MSG] = "Mach Message", + [VM_MEMORY_IOKIT] = "IOKit", + [22] = "VM_MEMORY_22", + [23] = "VM_MEMORY_23", + [24] = "VM_MEMORY_24", + [25] = "VM_MEMORY_25", + [26] = "VM_MEMORY_26", + [27] = "VM_MEMORY_27", + [28] = "VM_MEMORY_28", + [29] = "VM_MEMORY_29", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_STACK] = "Stack", + [VM_MEMORY_GUARD] = "Guard", + [VM_MEMORY_SHARED_PMAP] = "Shared Pmap", + [VM_MEMORY_DYLIB] = "Dylib", + [VM_MEMORY_OBJC_DISPATCHERS] = "ObjC Dispatching Code", + [VM_MEMORY_UNSHARED_PMAP] = "Unshared Pmap", + [VM_MEMORY_LIBCHANNEL] = "Channel Library", + [37] = "VM_MEMORY_37", + [38] = "VM_MEMORY_38", + [39] = "VM_MEMORY_39", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_APPKIT] = "AppKit", + [VM_MEMORY_FOUNDATION] = "Foundation", + [VM_MEMORY_COREGRAPHICS] = "CoreGraphics", + [VM_MEMORY_CORESERVICES] = "CoreServices", + [VM_MEMORY_JAVA] = "Java", + [VM_MEMORY_COREDATA] = "CoreData", + [VM_MEMORY_COREDATA_OBJECTIDS] = "CoreData Object IDs", + [47] = "VM_MEMORY_47", + [48] = "VM_MEMORY_48", + [49] = "VM_MEMORY_49", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_ATS] = "ATS (Font Support)", + [VM_MEMORY_LAYERKIT] = "CoreAnimation", + [VM_MEMORY_CGIMAGE] = "CG Image", + [VM_MEMORY_TCMALLOC] = "WebKit Malloc", + [VM_MEMORY_COREGRAPHICS_DATA] = "CG Raster Data", + [VM_MEMORY_COREGRAPHICS_SHARED] = "CG Shared Images", + [VM_MEMORY_COREGRAPHICS_FRAMEBUFFERS] = "CG Frame Buffers", + [VM_MEMORY_COREGRAPHICS_BACKINGSTORES] = "CG Backing Stores", + [VM_MEMORY_COREGRAPHICS_XALLOC] = "CG Xalloc", + [59] = "VM_MEMORY_59", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_DYLD] = "Dyld Private Memory", + [VM_MEMORY_DYLD_MALLOC] = "Dyld Malloc Memory", + [VM_MEMORY_SQLITE] = "SQLite Page Cache", + [VM_MEMORY_WEBASSEMBLY] = "WebAssembly Memory", + [VM_MEMORY_JAVASCRIPT_JIT_EXECUTABLE_ALLOCATOR] = "JS JIT Generated Code", + [VM_MEMORY_JAVASCRIPT_JIT_REGISTER_FILE] = "JS VM Register File", + [VM_MEMORY_GLSL] = "OpenGL GLSL", + [VM_MEMORY_OPENCL] = "OpenCL", + [VM_MEMORY_COREIMAGE] = "CoreImage", + [VM_MEMORY_WEBCORE_PURGEABLE_BUFFERS] = "WebCore Purgeable Data", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_IMAGEIO] = "Image IO", + [VM_MEMORY_COREPROFILE] = "CoreProfile", + [VM_MEMORY_ASSETSD] = "Assets Library", + [VM_MEMORY_OS_ALLOC_ONCE] = "OS Alloc Once", + [VM_MEMORY_LIBDISPATCH] = "Dispatch Continuations", + [VM_MEMORY_ACCELERATE] = "Accelerate Framework", + [VM_MEMORY_COREUI] = "CoreUI Image Data", + [VM_MEMORY_COREUIFILE] = "CoreUI Image File", + [VM_MEMORY_GENEALOGY] = "Activity Tracing", + [VM_MEMORY_RAWCAMERA] = "RawCamera", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_CORPSEINFO] = "Process Corpse Info", + [VM_MEMORY_ASL] = "Apple System Log", + [VM_MEMORY_SWIFT_RUNTIME] = "Swift Runtime", + [VM_MEMORY_SWIFT_METADATA] = "Swift Metadata", + [VM_MEMORY_DHMM] = "DHMM", + [VM_MEMORY_DFR] = "DFR", + [VM_MEMORY_SCENEKIT] = "SceneKit", + [VM_MEMORY_SKYWALK] = "Skywalk Networking", + [VM_MEMORY_IOSURFACE] = "IOSurface", + [VM_MEMORY_LIBNETWORK] = "Libnetwork", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_AUDIO] = "Audio", + [VM_MEMORY_VIDEOBITSTREAM] = "Video Bitstream", + [VM_MEMORY_CM_XPC] = 
"CoreMedia XPC", + [VM_MEMORY_CM_RPC] = "CoreMedia RPC", + [VM_MEMORY_CM_MEMORYPOOL] = "CoreMedia Memory Pool", + [VM_MEMORY_CM_READCACHE] = "CoreMedia Read Cache", + [VM_MEMORY_CM_CRABS] = "CoreMedia HTTP Cache", + [VM_MEMORY_QUICKLOOK_THUMBNAILS] = "QuickLook Thumbnails", + [VM_MEMORY_ACCOUNTS] = "Accounts Framework", + [VM_MEMORY_SANITIZER] = "Sanitizer", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_IOACCELERATOR] = "IOAccelerator", + [VM_MEMORY_CM_REGWARP] = "CoreMedia Capture Data", + [VM_MEMORY_EAR_DECODER] = "EAR Speech Decoder", + [VM_MEMORY_COREUI_CACHED_IMAGE_DATA] = "CoreUI Cache Image Data", + [VM_MEMORY_COLORSYNC] = "ColorSync", + [VM_MEMORY_BTINFO] = "Simulated Crash Data", + [VM_MEMORY_CM_HLS] = "CoreMedia HLS", + [107] = "VM_MEMORY_107", + [108] = "VM_MEMORY_108", + [109] = "VM_MEMORY_109", + /* maximum width indicator 123456789012345678901234 */ + [110] = "VM_MEMORY_110", + [111] = "VM_MEMORY_111", + [112] = "VM_MEMORY_112", + [113] = "VM_MEMORY_113", + [114] = "VM_MEMORY_114", + [115] = "VM_MEMORY_115", + [116] = "VM_MEMORY_116", + [117] = "VM_MEMORY_117", + [118] = "VM_MEMORY_118", + [119] = "VM_MEMORY_119", + /* maximum width indicator 123456789012345678901234 */ + [120] = "VM_MEMORY_120", + [121] = "VM_MEMORY_121", + [122] = "VM_MEMORY_122", + [123] = "VM_MEMORY_123", + [124] = "VM_MEMORY_124", + [125] = "VM_MEMORY_125", + [126] = "VM_MEMORY_126", + [127] = "VM_MEMORY_127", + [128] = "VM_MEMORY_128", + [129] = "VM_MEMORY_129", + /* maximum width indicator 123456789012345678901234 */ + [130] = "VM_MEMORY_130", + [131] = "VM_MEMORY_131", + [132] = "VM_MEMORY_132", + [133] = "VM_MEMORY_133", + [134] = "VM_MEMORY_134", + [135] = "VM_MEMORY_135", + [136] = "VM_MEMORY_136", + [137] = "VM_MEMORY_137", + [138] = "VM_MEMORY_138", + [139] = "VM_MEMORY_139", + /* maximum width indicator 123456789012345678901234 */ + [140] = "VM_MEMORY_140", + [141] = "VM_MEMORY_141", + [142] = "VM_MEMORY_142", + [143] = "VM_MEMORY_143", + [144] = "VM_MEMORY_144", + [145] = "VM_MEMORY_145", + [146] = "VM_MEMORY_146", + [147] = "VM_MEMORY_147", + [148] = "VM_MEMORY_148", + [149] = "VM_MEMORY_149", + /* maximum width indicator 123456789012345678901234 */ + [150] = "VM_MEMORY_150", + [151] = "VM_MEMORY_151", + [152] = "VM_MEMORY_152", + [153] = "VM_MEMORY_153", + [154] = "VM_MEMORY_154", + [155] = "VM_MEMORY_155", + [156] = "VM_MEMORY_156", + [157] = "VM_MEMORY_157", + [158] = "VM_MEMORY_158", + [159] = "VM_MEMORY_159", + /* maximum width indicator 123456789012345678901234 */ + [160] = "VM_MEMORY_160", + [161] = "VM_MEMORY_161", + [162] = "VM_MEMORY_162", + [163] = "VM_MEMORY_163", + [164] = "VM_MEMORY_164", + [165] = "VM_MEMORY_165", + [166] = "VM_MEMORY_166", + [167] = "VM_MEMORY_167", + [168] = "VM_MEMORY_168", + [169] = "VM_MEMORY_169", + /* maximum width indicator 123456789012345678901234 */ + [170] = "VM_MEMORY_170", + [171] = "VM_MEMORY_171", + [172] = "VM_MEMORY_172", + [173] = "VM_MEMORY_173", + [174] = "VM_MEMORY_174", + [175] = "VM_MEMORY_175", + [176] = "VM_MEMORY_176", + [177] = "VM_MEMORY_177", + [178] = "VM_MEMORY_178", + [179] = "VM_MEMORY_179", + /* maximum width indicator 123456789012345678901234 */ + [180] = "VM_MEMORY_180", + [181] = "VM_MEMORY_181", + [182] = "VM_MEMORY_182", + [183] = "VM_MEMORY_183", + [184] = "VM_MEMORY_184", + [185] = "VM_MEMORY_185", + [186] = "VM_MEMORY_186", + [187] = "VM_MEMORY_187", + [188] = "VM_MEMORY_188", + [189] = "VM_MEMORY_189", + /* maximum width indicator 123456789012345678901234 */ + [190] = "VM_MEMORY_190", + [191] = 
"VM_MEMORY_191", + [192] = "VM_MEMORY_192", + [193] = "VM_MEMORY_193", + [194] = "VM_MEMORY_194", + [195] = "VM_MEMORY_195", + [196] = "VM_MEMORY_196", + [197] = "VM_MEMORY_197", + [198] = "VM_MEMORY_198", + [199] = "VM_MEMORY_199", + /* maximum width indicator 123456789012345678901234 */ + [200] = "VM_MEMORY_200", + [201] = "VM_MEMORY_201", + [202] = "VM_MEMORY_202", + [203] = "VM_MEMORY_203", + [204] = "VM_MEMORY_204", + [205] = "VM_MEMORY_205", + [206] = "VM_MEMORY_206", + [207] = "VM_MEMORY_207", + [208] = "VM_MEMORY_208", + [209] = "VM_MEMORY_209", + /* maximum width indicator 123456789012345678901234 */ + [210] = "VM_MEMORY_210", + [211] = "VM_MEMORY_211", + [212] = "VM_MEMORY_212", + [213] = "VM_MEMORY_213", + [214] = "VM_MEMORY_214", + [215] = "VM_MEMORY_215", + [216] = "VM_MEMORY_216", + [217] = "VM_MEMORY_217", + [218] = "VM_MEMORY_218", + [219] = "VM_MEMORY_219", + /* maximum width indicator 123456789012345678901234 */ + [220] = "VM_MEMORY_220", + [221] = "VM_MEMORY_221", + [222] = "VM_MEMORY_222", + [223] = "VM_MEMORY_223", + [224] = "VM_MEMORY_224", + [225] = "VM_MEMORY_225", + [226] = "VM_MEMORY_226", + [227] = "VM_MEMORY_227", + [228] = "VM_MEMORY_228", + [229] = "VM_MEMORY_229", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_ROSETTA] = "Rosetta Generic", + [VM_MEMORY_ROSETTA_THREAD_CONTEXT] = "Rosetta Thread Context", + [VM_MEMORY_ROSETTA_INDIRECT_BRANCH_MAP] = "Rosetta IndirectBranch", + [VM_MEMORY_ROSETTA_RETURN_STACK] = "Rosetta Return Stack", + [VM_MEMORY_ROSETTA_EXECUTABLE_HEAP] = "Rosetta JIT", + [VM_MEMORY_ROSETTA_USER_LDT] = "Rosetta User LDT", + [VM_MEMORY_ROSETTA_ARENA] = "Rosetta Arena", + [237] = "VM_MEMORY_237", + [238] = "VM_MEMORY_238", + [VM_MEMORY_ROSETTA_10] = "Rosetta Tag 10", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_APPLICATION_SPECIFIC_1] = "App-Specific Tag 1", + [VM_MEMORY_APPLICATION_SPECIFIC_2] = "App-Specific Tag 2", + [VM_MEMORY_APPLICATION_SPECIFIC_3] = "App-Specific Tag 3", + [VM_MEMORY_APPLICATION_SPECIFIC_4] = "App-Specific Tag 4", + [VM_MEMORY_APPLICATION_SPECIFIC_5] = "App-Specific Tag 5", + [VM_MEMORY_APPLICATION_SPECIFIC_6] = "App-Specific Tag 6", + [VM_MEMORY_APPLICATION_SPECIFIC_7] = "App-Specific Tag 7", + [VM_MEMORY_APPLICATION_SPECIFIC_8] = "App-Specific Tag 8", + [VM_MEMORY_APPLICATION_SPECIFIC_9] = "App-Specific Tag 9", + /* maximum width indicator 123456789012345678901234 */ + [VM_MEMORY_APPLICATION_SPECIFIC_10] = "App-Specific Tag 10", + [VM_MEMORY_APPLICATION_SPECIFIC_11] = "App-Specific Tag 11", + [VM_MEMORY_APPLICATION_SPECIFIC_12] = "App-Specific Tag 12", + [VM_MEMORY_APPLICATION_SPECIFIC_13] = "App-Specific Tag 13", + [VM_MEMORY_APPLICATION_SPECIFIC_14] = "App-Specific Tag 14", + [VM_MEMORY_APPLICATION_SPECIFIC_15] = "App-Specific Tag 15", + [VM_MEMORY_APPLICATION_SPECIFIC_16] = "App-Specific Tag 16", +}; + +const char * +mach_vm_tag_describe(unsigned int tag) +{ + if (tag < VM_MEMORY_COUNT) { + return vm_tag_descriptions[tag]; + } + return "Invalid Tag (?)"; +} diff --git a/libsyscall/mach/vm_reclaim.c b/libsyscall/mach/vm_reclaim.c index cc0e5f4db..10b6ecaf7 100644 --- a/libsyscall/mach/vm_reclaim.c +++ b/libsyscall/mach/vm_reclaim.c @@ -36,14 +36,17 @@ #include #include #include +#include +#include #include #include #undef _mach_vm_user_ #include #include +#include #include #include -#include +#include #include @@ -61,32 +64,6 @@ _Static_assert(VM_RECLAIM_MAX_CAPACITY <= UINT32_MAX, "Max capacity must fit in mach_vm_reclaim_count_t"); -static uint64_t 
kAccountingThreshold; - -static bool -update_accounting(mach_vm_reclaim_ring_t ring_buffer, int64_t size) -{ - ring_buffer->va_in_buffer += size; - if ((ring_buffer->va_in_buffer > ring_buffer->last_accounting_given_to_kernel && - ring_buffer->va_in_buffer - ring_buffer->last_accounting_given_to_kernel > kAccountingThreshold) || - (ring_buffer->last_accounting_given_to_kernel > ring_buffer->va_in_buffer && - ring_buffer->last_accounting_given_to_kernel - ring_buffer->va_in_buffer > kAccountingThreshold)) { - /* - * The caller should call mach_vm_reclaim_update_kernel_accounting. - * We store the value that they will give to the kernel here while we hold the lock. - * Technically it's out of sync with what the kernel has seen, but - * that will be rectified once the caller makes the mach_vm_reclaim_update_kernel_accounting call. - * If we forced this value to be in sync with the kernel's value - * all callers would start calling mach_vm_reclaim_update_kernel_accounting until one of them - * finishes & we'd have to take the ringbuffer lock again in - * mach_vm_reclaim_update_kernel_accounting. - */ - ring_buffer->last_accounting_given_to_kernel = ring_buffer->va_in_buffer; - return true; - } - return false; -} - static inline struct mach_vm_reclaim_entry_s construct_entry( mach_vm_address_t start_addr, @@ -128,9 +105,10 @@ mach_vm_reclaim_ring_allocate( mach_vm_reclaim_count_t initial_capacity, mach_vm_reclaim_count_t max_capacity) { - kAccountingThreshold = vm_page_size; kern_return_t kr; mach_vm_address_t vm_addr = 0; + uint64_t sampling_period_abs; + if (ring_out == NULL || max_capacity < initial_capacity || initial_capacity == 0 || max_capacity == 0) { return VM_RECLAIM_INVALID_ARGUMENT; @@ -141,16 +119,17 @@ mach_vm_reclaim_ring_allocate( *ring_out = NULL; kr = mach_vm_deferred_reclamation_buffer_allocate(mach_task_self(), - &vm_addr, initial_capacity, max_capacity); + &vm_addr, &sampling_period_abs, initial_capacity, max_capacity); if (kr == ERR_SUCCESS) { - mach_vm_reclaim_ring_t ringbuffer = + mach_vm_reclaim_ring_t ring = (mach_vm_reclaim_ring_t)vm_addr; - - ringbuffer->va_in_buffer = 0; - ringbuffer->last_accounting_given_to_kernel = 0; - ringbuffer->len = initial_capacity; - ringbuffer->max_len = max_capacity; - *ring_out = ringbuffer; + ring->last_sample_abs = mach_absolute_time(); + ring->reclaimable_bytes = 0; + ring->reclaimable_bytes_min = 0; + ring->len = initial_capacity; + ring->max_len = max_capacity; + ring->sampling_period_abs = sampling_period_abs; + *ring_out = ring; } return kr; } @@ -160,19 +139,26 @@ mach_vm_reclaim_ring_resize( mach_vm_reclaim_ring_t ring, mach_vm_reclaim_count_t capacity) { - kern_return_t kr; + mach_error_t err; + mach_vm_size_t bytes_reclaimed = 0; + if (ring == NULL) { return VM_RECLAIM_INVALID_RING; } if (capacity == 0 || capacity > ring->max_len) { return VM_RECLAIM_INVALID_CAPACITY; } - kr = mach_vm_deferred_reclamation_buffer_resize(mach_task_self(), - capacity); - if (kr == KERN_SUCCESS) { + + err = mach_vm_deferred_reclamation_buffer_resize(mach_task_self(), + capacity, &bytes_reclaimed); + if (err == ERR_SUCCESS) { ring->len = capacity; + /* Reset the accounting now that we've flushed the buffer */ + ring->last_sample_abs = mach_absolute_time(); } - return kr; + size_t reclaimable_bytes = os_atomic_sub(&ring->reclaimable_bytes, bytes_reclaimed, relaxed); + os_atomic_min(&ring->reclaimable_bytes_min, reclaimable_bytes, relaxed); + return err; } mach_vm_reclaim_count_t @@ -195,7 +181,6 @@ mach_vm_reclaim_try_enter( bool 
*should_update_kernel_accounting) { mach_vm_reclaim_id_t tail = 0, head = 0, original_tail = 0, busy = 0; - mach_vm_reclaim_indices_t indices = &ring->indices; mach_vm_reclaim_entry_t entries = ring->entries; uint64_t buffer_len = (uint64_t)ring->len; *should_update_kernel_accounting = false; @@ -218,8 +203,8 @@ mach_vm_reclaim_try_enter( *id = VM_RECLAIM_ID_NULL; if (requested_id == VM_RECLAIM_ID_NULL) { - tail = os_atomic_load_wide(&indices->tail, relaxed); - head = os_atomic_load_wide(&indices->head, relaxed); + tail = os_atomic_load_wide(&ring->tail, relaxed); + head = os_atomic_load_wide(&ring->head, relaxed); if (tail % buffer_len == head % buffer_len && tail > head) { /* Buffer is full */ @@ -233,10 +218,10 @@ mach_vm_reclaim_try_enter( struct mach_vm_reclaim_entry_s entry = construct_entry(region_start, size32, action); entries[tail % buffer_len] = entry; os_atomic_thread_fence(seq_cst); // tail increment can not be seen before the entry is cleared in the buffer - os_atomic_inc(&indices->tail, relaxed); + os_atomic_inc(&ring->tail, relaxed); *id = tail; } else { - head = os_atomic_load_wide(&indices->head, relaxed); + head = os_atomic_load_wide(&ring->head, relaxed); if (requested_id < head) { /* * This is just a fast path for the case where the buffer has wrapped. @@ -246,16 +231,16 @@ mach_vm_reclaim_try_enter( return VM_RECLAIM_SUCCESS; } /* Attempt to move tail to idx */ - original_tail = os_atomic_load_wide(&indices->tail, relaxed); + original_tail = os_atomic_load_wide(&ring->tail, relaxed); _assert("mach_vm_reclaim_mark_free_with_id", requested_id < original_tail, original_tail); - os_atomic_store_wide(&indices->tail, requested_id, relaxed); + os_atomic_store_wide(&ring->tail, requested_id, relaxed); os_atomic_thread_fence(seq_cst); // Our write to tail must happen before our read of busy - busy = os_atomic_load_wide(&indices->busy, relaxed); + busy = os_atomic_load_wide(&ring->busy, relaxed); if (requested_id < busy) { /* Kernel is acting on this entry. Undo. */ - os_atomic_store_wide(&indices->tail, original_tail, relaxed); + os_atomic_store_wide(&ring->tail, original_tail, relaxed); return VM_RECLAIM_SUCCESS; } @@ -269,16 +254,23 @@ mach_vm_reclaim_try_enter( /* Tail increment can not be seen before the entry is set in the buffer */ os_atomic_thread_fence(seq_cst); /* Reset tail. 
*/ - os_atomic_store_wide(&indices->tail, original_tail, relaxed); + os_atomic_store_wide(&ring->tail, original_tail, relaxed); *id = requested_id; } - *should_update_kernel_accounting = update_accounting(ring, region_size); + + size_t reclaimable_bytes = os_atomic_add(&ring->reclaimable_bytes, region_size, relaxed); + os_atomic_min(&ring->reclaimable_bytes_min, reclaimable_bytes, relaxed); + + uint64_t now = mach_absolute_time(); + if (now - ring->last_sample_abs >= ring->sampling_period_abs) { + *should_update_kernel_accounting = true; + } return VM_RECLAIM_SUCCESS; } mach_vm_reclaim_error_t mach_vm_reclaim_try_cancel( - mach_vm_reclaim_ring_t ring_buffer, + mach_vm_reclaim_ring_t ring, mach_vm_reclaim_id_t id, mach_vm_address_t region_start, mach_vm_size_t region_size, @@ -286,12 +278,11 @@ mach_vm_reclaim_try_cancel( mach_vm_reclaim_state_t *state, bool *should_update_kernel_accounting) { - mach_vm_reclaim_indices_t indices = &ring_buffer->indices; - mach_vm_reclaim_entry_t entries = ring_buffer->entries; - uint64_t buffer_len = (uint64_t)ring_buffer->len; + mach_vm_reclaim_entry_t entries = ring->entries; + uint64_t buffer_len = (uint64_t)ring->len; uint64_t head = 0, busy = 0, original_tail = 0; - if (ring_buffer == NULL) { + if (ring == NULL) { return VM_RECLAIM_INVALID_RING; } if (id == VM_RECLAIM_ID_NULL) { @@ -310,7 +301,7 @@ mach_vm_reclaim_try_cancel( return VM_RECLAIM_INVALID_REGION_SIZE; } - head = os_atomic_load_wide(&indices->head, relaxed); + head = os_atomic_load_wide(&ring->head, relaxed); if (id < head) { /* * This is just a fast path for the case where the buffer has wrapped. @@ -333,19 +324,19 @@ mach_vm_reclaim_try_cancel( } /* Attempt to move tail to idx */ - original_tail = os_atomic_load_wide(&indices->tail, relaxed); + original_tail = os_atomic_load_wide(&ring->tail, relaxed); _assert("mach_vm_reclaim_mark_used", id < original_tail, original_tail); - os_atomic_store_wide(&indices->tail, id, relaxed); + os_atomic_store_wide(&ring->tail, id, relaxed); /* Our write to tail must happen before our read of busy */ os_atomic_thread_fence(seq_cst); - busy = os_atomic_load_wide(&indices->busy, relaxed); + busy = os_atomic_load_wide(&ring->busy, relaxed); if (id < busy) { /* * This entry is in the process of being reclaimed. It is * never safe to re-use while in this state. */ - os_atomic_store_wide(&indices->tail, original_tail, relaxed); + os_atomic_store_wide(&ring->tail, original_tail, relaxed); *state = VM_RECLAIM_BUSY; return VM_RECLAIM_SUCCESS; } @@ -359,9 +350,15 @@ mach_vm_reclaim_try_cancel( /* tail increment can not be seen before the entry is cleared in the buffer */ os_atomic_thread_fence(seq_cst); /* Reset tail. 
*/ - os_atomic_store_wide(&indices->tail, original_tail, relaxed); + os_atomic_store_wide(&ring->tail, original_tail, relaxed); - *should_update_kernel_accounting = update_accounting(ring_buffer, -(int64_t)region_size); + size_t reclaimable_bytes = os_atomic_sub(&ring->reclaimable_bytes, region_size, relaxed); + os_atomic_min(&ring->reclaimable_bytes_min, reclaimable_bytes, relaxed); + + uint64_t now = mach_absolute_time(); + if (now - ring->last_sample_abs >= ring->sampling_period_abs) { + *should_update_kernel_accounting = true; + } *state = VM_RECLAIM_UNRECLAIMED; return VM_RECLAIM_SUCCESS; } @@ -379,9 +376,8 @@ mach_vm_reclaim_query_state( if (id == VM_RECLAIM_ID_NULL) { return VM_RECLAIM_INVALID_ID; } - mach_vm_reclaim_indices_t indices = &ring->indices; - mach_vm_reclaim_id_t head = os_atomic_load_wide(&indices->head, relaxed); + mach_vm_reclaim_id_t head = os_atomic_load_wide(&ring->head, relaxed); if (id < head) { switch (action) { case VM_RECLAIM_FREE: @@ -396,7 +392,7 @@ mach_vm_reclaim_query_state( return VM_RECLAIM_SUCCESS; } - mach_vm_reclaim_id_t busy = os_atomic_load_wide(&indices->busy, relaxed); + mach_vm_reclaim_id_t busy = os_atomic_load_wide(&ring->busy, relaxed); if (id < busy) { *state = VM_RECLAIM_BUSY; } else { @@ -408,8 +404,21 @@ mach_vm_reclaim_query_state( mach_vm_reclaim_error_t mach_vm_reclaim_update_kernel_accounting(const mach_vm_reclaim_ring_t ring) { - return mach_vm_deferred_reclamation_buffer_update_reclaimable_bytes(current_task(), - ring->va_in_buffer); + mach_error_t err; + uint64_t bytes_reclaimed = 0; + uint64_t now, last_sample; + + os_atomic_rmw_loop(&ring->last_sample_abs, last_sample, now, relaxed, { + now = mach_absolute_time(); + if (now - last_sample < ring->sampling_period_abs) { + os_atomic_rmw_loop_give_up(return VM_RECLAIM_SUCCESS; ); + } + }); + err = mach_vm_reclaim_update_kernel_accounting_trap(current_task(), + &bytes_reclaimed); + size_t reclaimable_bytes = os_atomic_sub(&ring->reclaimable_bytes, bytes_reclaimed, relaxed); + os_atomic_min(&ring->reclaimable_bytes_min, reclaimable_bytes, relaxed); + return err; } bool @@ -434,17 +443,218 @@ mach_vm_reclaim_ring_capacity(mach_vm_reclaim_ring_t ring, mach_vm_reclaim_count mach_vm_reclaim_error_t mach_vm_reclaim_ring_flush( - mach_vm_reclaim_ring_t ring_buffer, + mach_vm_reclaim_ring_t ring, mach_vm_reclaim_count_t num_entries_to_reclaim) { - if (ring_buffer == NULL) { + mach_vm_size_t bytes_reclaimed; + mach_error_t err; + if (ring == NULL) { return VM_RECLAIM_INVALID_RING; } if (num_entries_to_reclaim == 0) { return VM_RECLAIM_INVALID_ARGUMENT; } - return mach_vm_deferred_reclamation_buffer_flush(mach_task_self(), num_entries_to_reclaim); + err = mach_vm_deferred_reclamation_buffer_flush(mach_task_self(), + num_entries_to_reclaim, &bytes_reclaimed); + if (err == ERR_SUCCESS) { + size_t reclaimable_bytes = os_atomic_sub(&ring->reclaimable_bytes, bytes_reclaimed, relaxed); + os_atomic_min(&ring->reclaimable_bytes_min, reclaimable_bytes, release); + } + return err; +} + +mach_vm_reclaim_error_t +mach_vm_reclaim_get_rings_for_task( + task_read_t task, + mach_vm_reclaim_ring_ref_t refs_out, + mach_vm_reclaim_count_t *count_inout) +{ + /* + * Technically, we could support multiple rings per task. But for now, we + * only have one - so this is kind of a weird-looking shim that fakes that + * behavior at the libsyscall layer to make things easier in case anything + * changes. 
+ */ + + kern_return_t kr; + mach_vm_address_t addr; + mach_vm_size_t size; + + if (count_inout == NULL) { + return VM_RECLAIM_INVALID_ARGUMENT; + } + + kr = mach_vm_deferred_reclamation_buffer_query(task, &addr, &size); + + if (kr != KERN_SUCCESS) { + switch (kr) { + case KERN_NOT_SUPPORTED: + return VM_RECLAIM_NOT_SUPPORTED; + case KERN_INVALID_ARGUMENT: + case KERN_INVALID_TASK: + case KERN_INVALID_ADDRESS: + return VM_RECLAIM_INVALID_ARGUMENT; + default: + return kr; + } + } + + /* Size query. If addr == NULL, it doesn't have a ring */ + if (refs_out == NULL) { + *count_inout = addr ? 1 : 0; + return KERN_SUCCESS; + } + + if (addr) { + if (*count_inout >= 1) { + refs_out->addr = addr; + refs_out->size = size; + } + *count_inout = 1; + } else { + *count_inout = 0; + } + + return KERN_SUCCESS; +} + +static mach_vm_reclaim_error_t +verify_ring_allocation_size(mach_vm_address_t addr, mach_vm_size_t size) +{ + if (size < offsetof(struct mach_vm_reclaim_ring_s, entries)) { + return VM_RECLAIM_INVALID_RING; + } + + mach_vm_reclaim_ring_t ring = (mach_vm_reclaim_ring_t) addr; + mach_vm_size_t supposed_size = + offsetof(struct mach_vm_reclaim_ring_s, entries) + + (ring->max_len * sizeof(struct mach_vm_reclaim_entry_s)); + + /* store allocation size in ring->_unused so that we can free it later */ + ring->_unused = size; + + return (supposed_size <= size) ? VM_RECLAIM_SUCCESS : VM_RECLAIM_INVALID_RING; +} + +mach_vm_reclaim_error_t +mach_vm_reclaim_ring_copy( + task_read_t task, + mach_vm_reclaim_ring_ref_t ref, + mach_vm_reclaim_ring_copy_t *ring_out) +{ + mach_vm_address_t address; + vm_prot_t curprot = VM_PROT_DEFAULT; + vm_prot_t maxprot = VM_PROT_DEFAULT; + kern_return_t kr = mach_vm_remap( + mach_task_self(), + &address, + ref->size, + 0, + VM_FLAGS_ANYWHERE, + task, + ref->addr, + TRUE, + &curprot, + &maxprot, + VM_INHERIT_DEFAULT); + + switch (kr) { + case KERN_INVALID_TASK: + case KERN_INVALID_ADDRESS: + case KERN_INVALID_ARGUMENT: + return VM_RECLAIM_INVALID_ARGUMENT; + case KERN_SUCCESS: + break; + default: + return kr; + } + + kr = verify_ring_allocation_size(address, ref->size); + if (kr != VM_RECLAIM_SUCCESS) { + return kr; + } + + *ring_out = address; + return VM_RECLAIM_SUCCESS; +} + +mach_vm_reclaim_error_t +mach_vm_reclaim_copied_ring_free( + mach_vm_reclaim_ring_copy_t *cring) +{ + kern_return_t kr; + mach_vm_reclaim_ring_t ring = (mach_vm_reclaim_ring_t) *cring; + + kr = mach_vm_deallocate( + mach_task_self(), + (mach_vm_address_t) *cring, + ring->_unused); + + if (kr == KERN_SUCCESS) { + *cring = NULL; + } + + return kr; +} + +mach_vm_reclaim_error_t +mach_vm_reclaim_copied_ring_query( + mach_vm_reclaim_ring_copy_t *ring_copy, + mach_vm_reclaim_region_t regions_out, + mach_vm_reclaim_count_t *count_inout) +{ + mach_vm_reclaim_id_t head, tail, idx, entry_idx; + mach_vm_reclaim_entry_t entry; + mach_vm_reclaim_count_t count; + mach_vm_reclaim_ring_t ring = (mach_vm_reclaim_ring_t) *ring_copy; + + if (ring == NULL) { + return VM_RECLAIM_INVALID_RING; + } + + if (count_inout == NULL) { + return VM_RECLAIM_INVALID_ARGUMENT; + } + + head = os_atomic_load_wide(&ring->head, relaxed); + tail = os_atomic_load_wide(&ring->tail, relaxed); + + if (tail < head) { + *count_inout = 0; + return VM_RECLAIM_SUCCESS; + } + + count = (mach_vm_reclaim_count_t) (tail - head); + + /* Query size */ + if (regions_out == NULL) { + *count_inout = count; + return VM_RECLAIM_SUCCESS; + } + + count = (count < *count_inout) ? 
count : *count_inout; + + for (idx = 0; idx < count; idx++) { + entry_idx = (head + idx) % ring->len; + if (entry_idx > ring->max_len) { + /* + * Make sure we don't accidentally read outside of the mapped region + * due to a malformed ring + */ + *count_inout = (mach_vm_reclaim_count_t) idx; + return VM_RECLAIM_INVALID_CAPACITY; + } + entry = &ring->entries[entry_idx]; + regions_out->vmrr_addr = entry->address; + regions_out->vmrr_size = entry->size; + regions_out->vmrr_behavior = entry->behavior; + regions_out++; + } + + *count_inout = count; + + return VM_RECLAIM_SUCCESS; } #endif /* defined(__LP64__) */ diff --git a/libsyscall/wrappers/_libkernel_init.c b/libsyscall/wrappers/_libkernel_init.c index 506d7ecdc..64864fa9d 100644 --- a/libsyscall/wrappers/_libkernel_init.c +++ b/libsyscall/wrappers/_libkernel_init.c @@ -112,7 +112,7 @@ __libkernel_init_late(_libkernel_late_init_config_t config) { if (config->version >= 1) { #if SYSTEM_VERSION_COMPAT_ENABLED -#if TARGET_OS_OSX && !defined(__i386__) +#if SYSTEM_VERSION_COMPAT_HAS_MODE_MACOSX if (config->enable_system_version_compat) { /* enable the version compatibility shim for this process (macOS only) */ @@ -122,6 +122,7 @@ __libkernel_init_late(_libkernel_late_init_config_t config) system_version_compat_mode = SYSTEM_VERSION_COMPAT_MODE_MACOSX; +#if SYSTEM_VERSION_COMPAT_NEEDS_SYSCTL /* * tell the kernel the shim is enabled for this process so it can shim any * necessary sysctls @@ -129,7 +130,12 @@ __libkernel_init_late(_libkernel_late_init_config_t config) int enable = 1; __sysctlbyname("kern.system_version_compat", strlen("kern.system_version_compat"), NULL, NULL, &enable, sizeof(enable)); - } else if ((config->version >= 2) && config->enable_ios_version_compat) { +#endif /* SYSTEM_VERSION_COMPAT_NEEDS_SYSCTL */ + } +#endif /* SYSTEM_VERSION_COMPAT_HAS_MODE_MACOSX */ +#if SYSTEM_VERSION_COMPAT_HAS_MODE_IOS + if (!config->enable_system_version_compat && + (config->version >= 2) && config->enable_ios_version_compat) { /* enable the iOS ProductVersion compatibility shim for this process */ /* first hook up the shims we reference from open{at}() */ @@ -143,8 +149,7 @@ __libkernel_init_late(_libkernel_late_init_config_t config) * don't need to inform the kernel that this app has the SystemVersion shim enabled. 
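The new inspection entry points (mach_vm_reclaim_get_rings_for_task and mach_vm_reclaim_copied_ring_query) share a query-then-fill convention: a NULL output pointer asks only for the count, and a second call supplies storage. A minimal sketch of a caller, assuming the private mach_vm_reclaim_* types and prototypes from this patch are in scope.

/*
 * Sketch: count the reclaim rings of a task.  refs_out == NULL is a size
 * query only; today the per-task "list" holds at most one ring.
 */
#include <mach/mach.h>

static mach_vm_reclaim_error_t
count_reclaim_rings(task_read_t task, mach_vm_reclaim_count_t *count_out)
{
	mach_vm_reclaim_count_t count = 0;

	mach_vm_reclaim_error_t err =
	    mach_vm_reclaim_get_rings_for_task(task, NULL, &count);
	if (err == KERN_SUCCESS) {
		*count_out = count;	/* 0 or 1 with the current shim */
	}
	return err;
}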
*/ } -#endif /* TARGET_OS_OSX && !defined(__i386__) */ - +#endif /* SYSTEM_VERSION_COMPAT_HAS_MODE_IOS */ #endif /* SYSTEM_VERSION_COMPAT_ENABLED */ #if POSIX_SPAWN_FILTERING_ENABLED diff --git a/libsyscall/wrappers/exclaves.c b/libsyscall/wrappers/exclaves.c index 5293879fa..eda9742b4 100644 --- a/libsyscall/wrappers/exclaves.c +++ b/libsyscall/wrappers/exclaves.c @@ -238,6 +238,28 @@ exclaves_sensor_status(mach_port_t sensor_port, uint64_t flags, (uintptr_t) sensor_status, 0, 0, 0, 0); } +kern_return_t +exclaves_indicator_min_on_time(mach_port_t port, uint64_t flags, + uint64_t *camera_indicator, uint64_t *mic_indicator, uint64_t *faceid_indicator) +{ + if ((camera_indicator == NULL) || (mic_indicator == NULL) || (faceid_indicator == NULL)) { + return KERN_INVALID_ARGUMENT; + } + + struct exclaves_indicator_deadlines indicator = { + .version = 1 + }; + + const uint32_t opf = EXCLAVES_CTL_OP_AND_FLAGS(SENSOR_MIN_ON_TIME, 0); + kern_return_t kr = EXCLAVES_CTL_TRAP(port, opf, flags, (mach_vm_address_t)&indicator, sizeof(indicator), 0, 0, 0); + if (kr == KERN_SUCCESS) { + *camera_indicator = indicator.camera_indicator; + *mic_indicator = indicator.mic_indicator; + *faceid_indicator = indicator.faceid_indicator; + } + return kr; +} + kern_return_t exclaves_notification_create(__unused mach_port_t port, const char *name, uint64_t *notification_id) @@ -260,3 +282,27 @@ exclaves_notification_create(__unused mach_port_t port, const char *name, } return kr; } + +kern_return_t +exclaves_aoe_setup(__unused mach_port_t port, uint8_t *num_message, + uint8_t *num_worker) +{ + /* Hack for now. */ + const uint32_t opf = EXCLAVES_CTL_OP_AND_FLAGS(AOE_SETUP, 0); + return EXCLAVES_CTL_TRAP(port, opf, 0, (uintptr_t)num_message, 0, 0, 0, + (uintptr_t)num_worker); +} + +kern_return_t +exclaves_aoe_work_loop(__unused mach_port_t port) +{ + const uint32_t opf = EXCLAVES_CTL_OP_AND_FLAGS(AOE_WORK_LOOP, 0); + return EXCLAVES_CTL_TRAP(port, opf, 0, 0, 0, 0, 0, 0); +} + +kern_return_t +exclaves_aoe_message_loop(__unused mach_port_t port) +{ + const uint32_t opf = EXCLAVES_CTL_OP_AND_FLAGS(AOE_MESSAGE_LOOP, 0); + return EXCLAVES_CTL_TRAP(port, opf, 0, 0, 0, 0, 0, 0); +} diff --git a/libsyscall/wrappers/getiopolicy_np.c b/libsyscall/wrappers/getiopolicy_np.c index 01d462b49..000ac15b8 100644 --- a/libsyscall/wrappers/getiopolicy_np.c +++ b/libsyscall/wrappers/getiopolicy_np.c @@ -39,7 +39,6 @@ getiopolicy_np(int iotype, int scope) iop_param.iop_iotype = iotype; error = __iopolicysys(IOPOL_CMD_GET, &iop_param); if (error != 0) { - errno = error; policy = -1; goto exit; } diff --git a/libsyscall/wrappers/skywalk/os_channel.c b/libsyscall/wrappers/skywalk/os_channel.c index 18f634192..04553d21b 100644 --- a/libsyscall/wrappers/skywalk/os_channel.c +++ b/libsyscall/wrappers/skywalk/os_channel.c @@ -283,13 +283,11 @@ os_channel_init_ring(struct channel_ring_desc *chrd, SK_ABORT("Channel schema not valid"); /* NOTREACHED */ __builtin_unreachable(); - } else if (!(md_type == NEXUS_META_TYPE_QUANTUM || - md_type == NEXUS_META_TYPE_PACKET)) { + } else if (md_type != NEXUS_META_TYPE_PACKET) { SK_ABORT_WITH_CAUSE("Metadata type unknown", md_type); /* NOTREACHED */ __builtin_unreachable(); - } else if (!(md_subtype == NEXUS_META_SUBTYPE_PAYLOAD || - md_subtype == NEXUS_META_SUBTYPE_RAW)) { + } else if (md_subtype != NEXUS_META_SUBTYPE_RAW) { SK_ABORT_WITH_CAUSE("Metadata subtype unknown", md_subtype); /* NOTREACHED */ __builtin_unreachable(); @@ -326,88 +324,74 @@ _initialize_metadata_address(const channel_ring_t chrd, int i; 
struct __user_buflet *ubft0; const struct __user_channel_ring *ring = chrd->chrd_ring; + struct __user_buflet *ubft, *pbft; + struct __user_packet *p = (struct __user_packet *)q; + uint16_t bcnt = p->pkt_bufs_cnt; + uint16_t bmax = p->pkt_bufs_max; - switch (chrd->chrd_md_type) { - case NEXUS_META_TYPE_PACKET: { - struct __user_buflet *ubft, *pbft; - struct __user_packet *p = (struct __user_packet *)q; - uint16_t bcnt = p->pkt_bufs_cnt; - uint16_t bmax = p->pkt_bufs_max; - - _CASSERT(sizeof(p->pkt_qum_buf.buf_addr) == - sizeof(mach_vm_address_t)); - /* - * In the event of a defunct, we'd be accessing zero-filled - * memory and end up with 0 for bcnt or bmax. - */ - if (__improbable((bcnt == 0) || (bmax == 0))) { - if (!_CHANNEL_RING_IS_DEFUNCT(chrd)) { - SK_ABORT("bad bufcnt"); - /* NOTREACHED */ - __builtin_unreachable(); - } - return 0; - } - _PKT_BUFCNT_VERIFY(chrd, bcnt, bmax); - _CH_PKT_GET_FIRST_BUFLET(p, ubft, chrd, ring); - if (__improbable(ubft == NULL)) { - SK_ABORT("bad packet: no buflet"); + _Static_assert(sizeof(p->pkt_qum_buf.buf_addr) == + sizeof(mach_vm_address_t), "invalid buffer size"); + /* + * In the event of a defunct, we'd be accessing zero-filled + * memory and end up with 0 for bcnt or bmax. + */ + if (__improbable((bcnt == 0) || (bmax == 0))) { + if (!_CHANNEL_RING_IS_DEFUNCT(chrd)) { + SK_ABORT("bad bufcnt"); /* NOTREACHED */ __builtin_unreachable(); } - /* - * special handling for empty packet buflet. - */ - if (__improbable(p->pkt_qum_buf.buf_idx == OBJ_IDX_NONE)) { - *__DECONST(mach_vm_address_t *, - &p->pkt_qum_buf.buf_addr) = 0; - *__DECONST(mach_vm_address_t *, - &p->pkt_qum_buf.buf_nbft_addr) = - (mach_vm_address_t)ubft; - } - ubft0 = ubft; - for (i = 0; (i < bcnt) && (ubft != NULL); i++) { - pbft = ubft; - if (__probable(pbft->buf_idx != OBJ_IDX_NONE)) { - *(mach_vm_address_t *)(uintptr_t) - &(pbft->buf_addr) = _CHANNEL_RING_BUF(chrd, - ring, pbft); - } else { - *(mach_vm_address_t *)(uintptr_t) - &(pbft->buf_addr) = NULL; - } - if (pbft->buf_nbft_idx != OBJ_IDX_NONE) { - ubft = _CHANNEL_RING_BFT(chrd, ring, - pbft->buf_nbft_idx); - } else { - ubft = NULL; - } - *__DECONST(mach_vm_address_t *, &pbft->buf_nbft_addr) = - (mach_vm_address_t)ubft; - } - if (__improbable(pbft->buf_nbft_idx != OBJ_IDX_NONE)) { - if (!_CHANNEL_RING_IS_DEFUNCT(chrd)) { - SK_ABORT("non terminating buflet chain"); - /* NOTREACHED */ - __builtin_unreachable(); - } - return 0; - } - if (__improbable(i != bcnt)) { - SK_ABORT_WITH_CAUSE("invalid buflet count", bcnt); - /* NOTREACHED */ - __builtin_unreachable(); - } - break; + return 0; } - default: - ubft0 = &q->qum_buf[0]; - _CASSERT(sizeof(q->qum_buf[0].buf_addr) == - sizeof(mach_vm_address_t)); - /* immutable: compute pointers from the index */ - *(mach_vm_address_t *)(uintptr_t)&ubft0->buf_addr = - _CHANNEL_RING_BUF(chrd, ring, ubft0); - break; + _PKT_BUFCNT_VERIFY(chrd, bcnt, bmax); + _CH_PKT_GET_FIRST_BUFLET(p, ubft, chrd, ring); + if (__improbable(ubft == NULL)) { + SK_ABORT("bad packet: no buflet"); + /* NOTREACHED */ + __builtin_unreachable(); + } + /* + * special handling for empty packet buflet. 
+ */ + if (__improbable(p->pkt_qum_buf.buf_idx == OBJ_IDX_NONE)) { + *__DECONST(mach_vm_address_t *, + &p->pkt_qum_buf.buf_addr) = 0; + *__DECONST(mach_vm_address_t *, + &p->pkt_qum_buf.buf_nbft_addr) = + (mach_vm_address_t)ubft; + } + ubft0 = ubft; + for (i = 0; (i < bcnt) && (ubft != NULL); i++) { + pbft = ubft; + if (__probable(pbft->buf_idx != OBJ_IDX_NONE)) { + *(mach_vm_address_t *)(uintptr_t) + &(pbft->buf_addr) = _CHANNEL_RING_BUF(chrd, + ring, pbft); + } else { + *(mach_vm_address_t *)(uintptr_t) + &(pbft->buf_addr) = NULL; + } + if (pbft->buf_nbft_idx != OBJ_IDX_NONE) { + ubft = _CHANNEL_RING_BFT(chrd, ring, + pbft->buf_nbft_idx); + } else { + ubft = NULL; + } + *__DECONST(mach_vm_address_t *, &pbft->buf_nbft_addr) = + (mach_vm_address_t)ubft; + } + if (__improbable(pbft->buf_nbft_idx != OBJ_IDX_NONE)) { + if (!_CHANNEL_RING_IS_DEFUNCT(chrd)) { + SK_ABORT("non terminating buflet chain"); + /* NOTREACHED */ + __builtin_unreachable(); + } + return 0; + } + if (__improbable(i != bcnt)) { + SK_ABORT_WITH_CAUSE("invalid buflet count", bcnt); + /* NOTREACHED */ + __builtin_unreachable(); } /* return address and offset of the first buffer */ @@ -506,18 +490,6 @@ os_channel_create_extended(const uuid_t uuid, const nexus_port_t port, } init.ci_ch_mode |= CHMODE_EVENT_RING; } - if (cha->cha_monitor != 0) { - if (dir == CHANNEL_DIR_TX_RX) { - init.ci_ch_mode |= CHMODE_MONITOR; - } else if (dir == CHANNEL_DIR_TX) { - init.ci_ch_mode |= CHMODE_MONITOR_TX; - } else if (dir == CHANNEL_DIR_RX) { - init.ci_ch_mode |= CHMODE_MONITOR_RX; - } - if (cha->cha_monitor == CHANNEL_MONITOR_NO_COPY) { - init.ci_ch_mode |= CHMODE_MONITOR_NO_COPY; - } - } if (cha->cha_filter != 0) { init.ci_ch_mode |= CHMODE_FILTER; } @@ -1114,19 +1086,10 @@ os_channel_set_slot_properties(const channel_ring_t chrd, */ q = _SLOT_METADATA(chrd, ring, idx); q->qum_len = prop->sp_len; - switch (chrd->chrd_md_type) { - case NEXUS_META_TYPE_PACKET: { - struct __user_packet *p = (struct __user_packet *)q; - /* No multi-buflet support for slot based interface */ - p->pkt_qum_buf.buf_dlen = prop->sp_len; - p->pkt_qum_buf.buf_doff = 0; - break; - } - default: - q->qum_buf[0].buf_dlen = prop->sp_len; - q->qum_buf[0].buf_doff = 0; - break; - } + struct __user_packet *p = (struct __user_packet *)q; + /* No multi-buflet support for slot based interface */ + p->pkt_qum_buf.buf_dlen = prop->sp_len; + p->pkt_qum_buf.buf_doff = 0; } else if (!_CHANNEL_RING_IS_DEFUNCT(chrd)) { /* slot is out of bounds */ SK_ABORT_WITH_CAUSE("Index out of bounds in ssp", idx); @@ -1290,8 +1253,17 @@ os_channel_flow_admissible(const channel_ring_t chrd, uuid_t flow_id, } int -os_channel_flow_adv_get_ce_count(const channel_ring_t chrd, uuid_t flow_id, - const flowadv_idx_t flow_index, uint32_t *ce_cnt, uint32_t *pkt_cnt) +os_channel_flow_adv_get_ce_count(__unused const channel_ring_t chrd, + __unused uuid_t flow_id, __unused const flowadv_idx_t flow_index, + __unused uint32_t *ce_cnt, __unused uint32_t *pkt_cnt) +{ + return 0; +} + +int +os_channel_flow_adv_get_feedback(const channel_ring_t chrd, uuid_t flow_id, + const flowadv_idx_t flow_index, uint32_t *congestion_cnt, + __unused uint32_t *ce_cnt, uint32_t *pkt_cnt) { const struct __user_channel_ring *ring = chrd->chrd_ring; const struct channel *chd = chrd->chrd_channel; @@ -1321,7 +1293,7 @@ os_channel_flow_adv_get_ce_count(const channel_ring_t chrd, uuid_t flow_id, return ENOENT; } - *ce_cnt = fe->fae_ce_cnt; + *congestion_cnt = fe->fae_congestion_cnt; *pkt_cnt = fe->fae_pkt_cnt; return 0; } @@ -1397,17 
+1369,6 @@ os_channel_attr_set(const channel_attr_t cha, const channel_attr_type_t type, } break; - case CHANNEL_ATTR_MONITOR: - switch (value) { - case CHANNEL_MONITOR_OFF: - case CHANNEL_MONITOR_NO_COPY: - case CHANNEL_MONITOR_COPY: - cha->cha_monitor = (uint32_t)value; - goto done; - } - err = EINVAL; - break; - case CHANNEL_ATTR_TX_LOWAT_UNIT: case CHANNEL_ATTR_RX_LOWAT_UNIT: switch (value) { @@ -1538,10 +1499,6 @@ os_channel_attr_get(const channel_attr_t cha, const channel_attr_type_t type, *value = 1; break; - case CHANNEL_ATTR_MONITOR: - *value = cha->cha_monitor; - break; - case CHANNEL_ATTR_TX_LOWAT_UNIT: *value = cha->cha_tx_lowat.cet_unit; break; @@ -1660,15 +1617,6 @@ os_channel_info2attr(struct channel *chd, channel_attr_t cha) void *cha_key = cha->cha_key; uint32_t caps; - _CASSERT((uint32_t)NEXUS_META_TYPE_INVALID == (uint32_t)CHANNEL_NEXUS_META_TYPE_INVALID); - _CASSERT((uint32_t)NEXUS_META_TYPE_QUANTUM == (uint32_t)CHANNEL_NEXUS_META_TYPE_QUANTUM); - _CASSERT((uint32_t)NEXUS_META_TYPE_PACKET == (uint32_t)CHANNEL_NEXUS_META_TYPE_PACKET); - _CASSERT((uint32_t)NEXUS_META_SUBTYPE_INVALID == - (uint32_t)CHANNEL_NEXUS_META_SUBTYPE_INVALID); - _CASSERT((uint32_t)NEXUS_META_SUBTYPE_PAYLOAD == - (uint32_t)CHANNEL_NEXUS_META_SUBTYPE_PAYLOAD); - _CASSERT((uint32_t)NEXUS_META_SUBTYPE_RAW == (uint32_t)CHANNEL_NEXUS_META_SUBTYPE_RAW); - bzero(cha, sizeof(*cha)); cha->cha_tx_rings = CHD_PARAMS(chd)->nxp_tx_rings; cha->cha_rx_rings = CHD_PARAMS(chd)->nxp_rx_rings; @@ -1684,13 +1632,6 @@ os_channel_info2attr(struct channel *chd, channel_attr_t cha) cha->cha_nexus_defunct_ok = !!(cinfo->cinfo_ch_mode & CHMODE_DEFUNCT_OK); cha->cha_nexusadv_size = CHD_PARAMS(chd)->nxp_nexusadv_size; - if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) { - cha->cha_monitor = - (cinfo->cinfo_ch_mode & CHMODE_MONITOR_NO_COPY) ? - CHANNEL_MONITOR_NO_COPY : CHANNEL_MONITOR_COPY; - } else { - cha->cha_monitor = CHANNEL_MONITOR_OFF; - } cha->cha_key_len = cha_key_len; cha->cha_key = cha_key; cha->cha_tx_lowat = cinfo->cinfo_tx_lowat; @@ -2021,25 +1962,23 @@ os_channel_purge_packet_alloc_ring_common(const channel_t chd, bool large) * defunct, we'd be accessing zero-filled memory; this is fine * since we ignore all changes made to region at that time. */ - if (chrd->chrd_md_type == NEXUS_META_TYPE_PACKET) { - struct __user_packet *p = (struct __user_packet *)q; - uint16_t bcnt = p->pkt_bufs_cnt; - uint16_t bmax = p->pkt_bufs_max; + struct __user_packet *p = (struct __user_packet *)q; + uint16_t bcnt = p->pkt_bufs_cnt; + uint16_t bmax = p->pkt_bufs_max; - if (__improbable((bcnt == 0) || (bmax == 0))) { - if (!_CHANNEL_RING_IS_DEFUNCT(chrd)) { - SK_ABORT("pkt pool purge, bad bufcnt"); - /* NOTREACHED */ - __builtin_unreachable(); - } else { - return ENXIO; - } + if (__improbable((bcnt == 0) || (bmax == 0))) { + if (!_CHANNEL_RING_IS_DEFUNCT(chrd)) { + SK_ABORT("pkt pool purge, bad bufcnt"); + /* NOTREACHED */ + __builtin_unreachable(); + } else { + return ENXIO; } - /* - * alloc ring will not have multi-buflet packets. - */ - _PKT_BUFCNT_VERIFY(chrd, bcnt, 1); } + /* + * alloc ring will not have multi-buflet packets. + */ + _PKT_BUFCNT_VERIFY(chrd, bcnt, 1); *(mach_vm_address_t *) (uintptr_t)&q->qum_buf[0].buf_addr = _CHANNEL_RING_BUF(chrd, ring, &q->qum_buf[0]); idx = _CHANNEL_RING_NEXT(ring, idx); @@ -2418,3 +2357,13 @@ os_channel_buflet_free(const channel_t chd, buflet_t ubft) return __improbable(_CHANNEL_RING_IS_DEFUNCT(chrd)) ? 
ENXIO : 0; } + +int +os_channel_get_upp_buffer_stats(const channel_t chd, uint64_t *buffer_total, + uint64_t *buffer_inuse) +{ + struct __user_channel_schema *csm = CHD_SCHEMA(chd); + *buffer_total = csm->csm_upp_buf_total; + *buffer_inuse = csm->csm_upp_buf_inuse; + return 0; +} diff --git a/libsyscall/wrappers/skywalk/os_nexus.c b/libsyscall/wrappers/skywalk/os_nexus.c index c83a953e2..cf8a6a556 100644 --- a/libsyscall/wrappers/skywalk/os_nexus.c +++ b/libsyscall/wrappers/skywalk/os_nexus.c @@ -233,6 +233,33 @@ add_traffic_rule_inet(const nexus_controller_t ncd, return 0; } +static int +add_traffic_rule_eth(const nexus_controller_t ncd, + const char *ifname, const struct ifnet_traffic_descriptor_eth *td, + const struct ifnet_traffic_rule_action_steer *ra, const uint32_t flags, + uuid_t *rule_uuid) +{ + struct nxctl_add_traffic_rule_eth_iocargs args; + int err; + + bzero(&args, sizeof(args)); + if (ifname != NULL) { + (void) strlcpy(args.atre_ifname, ifname, IFNAMSIZ); + } + bcopy(td, &args.atre_td, sizeof(args.atre_td)); + bcopy(ra, &args.atre_ra, sizeof(args.atre_ra)); + + if ((flags & NXCTL_ADD_TRAFFIC_RULE_FLAG_PERSIST) != 0) { + args.atre_flags |= NXIOC_ADD_TRAFFIC_RULE_FLAG_PERSIST; + } + err = ioctl(ncd->ncd_fd, NXIOC_ADD_TRAFFIC_RULE_ETH, &args); + if (err < 0) { + return errno; + } + bcopy(&args.atre_uuid, rule_uuid, sizeof(args.atre_uuid)); + return 0; +} + int os_nexus_controller_add_traffic_rule(const nexus_controller_t ncd, const char *ifname, const struct ifnet_traffic_descriptor_common *td, @@ -258,6 +285,16 @@ os_nexus_controller_add_traffic_rule(const nexus_controller_t ncd, (const struct ifnet_traffic_rule_action_steer *)ra, flags, rule_uuid); } + case IFNET_TRAFFIC_DESCRIPTOR_TYPE_ETH: { + if (td->itd_len != + sizeof(struct ifnet_traffic_descriptor_eth)) { + return EINVAL; + } + return add_traffic_rule_eth(ncd, ifname, + (const struct ifnet_traffic_descriptor_eth *)td, + (const struct ifnet_traffic_rule_action_steer *)ra, + flags, rule_uuid); + } default: return ENOTSUP; } @@ -280,27 +317,51 @@ os_nexus_controller_remove_traffic_rule(const nexus_controller_t ncd, return 0; } +static boolean_t +rule_iterate(struct nxctl_traffic_rule_generic_iocinfo *ginfo, + struct ifnet_traffic_descriptor_common *td, + struct ifnet_traffic_rule_action *ra, + nexus_traffic_rule_iterator_t itr, void *itr_arg) +{ + struct nexus_traffic_rule_info itr_info; + + bzero(&itr_info, sizeof(itr_info)); + itr_info.nri_rule_uuid = &ginfo->trg_uuid; + itr_info.nri_owner = ginfo->trg_procname; + itr_info.nri_ifname = ginfo->trg_ifname; + itr_info.nri_td = td; + itr_info.nri_ra = ra; + + if (!itr(itr_arg, &itr_info)) { + return false; + } + return true; +} + static void inet_rule_iterate(void *buf, uint32_t count, nexus_traffic_rule_iterator_t itr, void *itr_arg) { struct nxctl_traffic_rule_inet_iocinfo *info = buf; - struct nxctl_traffic_rule_generic_iocinfo *ginfo; - struct nexus_traffic_rule_info itr_info; - uint32_t c; - for (c = 0; c < count; c++) { - bzero(&itr_info, sizeof(itr_info)); - ginfo = &info->tri_common; - itr_info.nri_rule_uuid = &ginfo->trg_uuid; - itr_info.nri_owner = ginfo->trg_procname; - itr_info.nri_ifname = ginfo->trg_ifname; - itr_info.nri_td = - (struct ifnet_traffic_descriptor_common *)&info->tri_td; - itr_info.nri_ra = - (struct ifnet_traffic_rule_action *)&info->tri_ra; + for (uint32_t c = 0; c < count; c++) { + if (!rule_iterate(&info->tri_common, &info->tri_td.inet_common, + &info->tri_ra.ras_common, itr, itr_arg)) { + break; + } + info++; + } +} - if (!itr(itr_arg, 
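A short sketch for the new UPP buffer statistics accessor above, assuming `chd` is an open Skywalk channel and the private os_channel prototypes from this file are visible.

/*
 * Sketch: report user packet pool buffer usage; the accessor itself
 * always returns 0.
 */
#include <stdint.h>
#include <stdio.h>

static void
log_upp_buffer_usage(const channel_t chd)
{
	uint64_t total = 0, inuse = 0;

	(void)os_channel_get_upp_buffer_stats(chd, &total, &inuse);
	printf("upp buffers: %llu in use of %llu\n",
	    (unsigned long long)inuse, (unsigned long long)total);
}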
&itr_info)) { +static void +eth_rule_iterate(void *buf, uint32_t count, + nexus_traffic_rule_iterator_t itr, void *itr_arg) +{ + struct nxctl_traffic_rule_eth_iocinfo *info = buf; + + for (uint32_t c = 0; c < count; c++) { + if (!rule_iterate(&info->tre_common, &info->tre_td.eth_common, + &info->tre_ra.ras_common, itr, itr_arg)) { break; } info++; @@ -319,6 +380,9 @@ static struct traffic_rule_type traffic_rule_types[] = { {IFNET_TRAFFIC_DESCRIPTOR_TYPE_INET, sizeof(struct nxctl_traffic_rule_inet_iocinfo), NTRDEFAULTCOUNT, inet_rule_iterate}, + {IFNET_TRAFFIC_DESCRIPTOR_TYPE_ETH, + sizeof(struct nxctl_traffic_rule_eth_iocinfo), + NTRDEFAULTCOUNT, eth_rule_iterate}, }; #define NTRTYPES (sizeof(traffic_rule_types)/sizeof(struct traffic_rule_type)) @@ -476,3 +540,15 @@ __os_nexus_get_llink_info(const nexus_controller_t ncd, const uuid_t nx_uuid, return __nexus_set_opt(ncd->ncd_fd, NXOPT_NEXUS_CONFIG, &ncr, sizeof(ncr)); } + +int +os_nexus_flow_set_connection_idle(const uuid_t nx_uuid, const uuid_t flow_uuid, + bool enable) +{ + struct nx_flow_req nfr = {0}; + memcpy(nfr.nfr_flow_uuid, flow_uuid, sizeof(uuid_t)); + nfr.nfr_flags = enable ? NXFLOWREQF_CONNECTION_IDLE : + NXFLOWREQF_CONNECTION_REUSED; + + return __os_nexus_config_flow(nx_uuid, &nfr); +} diff --git a/libsyscall/wrappers/skywalk/os_packet.c b/libsyscall/wrappers/skywalk/os_packet.c index 720c03bdb..fb0bfb872 100644 --- a/libsyscall/wrappers/skywalk/os_packet.c +++ b/libsyscall/wrappers/skywalk/os_packet.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2022 Apple Inc. All rights reserved. + * Copyright (c) 2015-2024 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -39,27 +39,6 @@ #error "LIBSYSCALL_INTERFACE not defined" #endif /* !LIBSYSCALL_INTERFACE */ -#if (DEBUG || DEVELOPMENT) -__attribute__((noreturn)) -void -pkt_subtype_assert_fail(const packet_t ph, uint64_t type __unused, - uint64_t subtype __unused) -{ - SK_ABORT_WITH_CAUSE("invalid packet subtype", ph); - /* NOTREACHED */ - __builtin_unreachable(); -} - -__attribute__((noreturn)) -void -pkt_type_assert_fail(const packet_t ph, uint64_t type __unused) -{ - SK_ABORT_WITH_CAUSE("invalid packet type", ph); - /* NOTREACHED */ - __builtin_unreachable(); -} -#endif /* DEBUG || DEVELOPMENT */ - int os_packet_set_headroom(const packet_t ph, const uint8_t headroom) { @@ -309,16 +288,15 @@ os_packet_set_packetid(const packet_t ph, packet_id_t *pktid) } int -os_packet_set_vlan_tag(const packet_t ph, const uint16_t tag, - const boolean_t tag_in_pkt) +os_packet_set_vlan_tag(const packet_t ph, const uint16_t tag) { - return __packet_set_vlan_tag(ph, tag, tag_in_pkt); + return __packet_set_vlan_tag(ph, tag); } int -os_packet_get_vlan_tag(const packet_t ph, uint16_t *tag, boolean_t *tag_in_pkt) +os_packet_get_vlan_tag(const packet_t ph, uint16_t *tag) { - return __packet_get_vlan_tag(ph, tag, tag_in_pkt); + return __packet_get_vlan_tag(ph, tag); } uint16_t @@ -346,6 +324,12 @@ os_packet_get_wake_flag(const packet_t ph) return __packet_get_wake_flag(ph); } +void +os_packet_set_wake_flag(const packet_t ph) +{ + __packet_set_wake_flag(ph); +} + boolean_t os_packet_get_keep_alive(const packet_t ph) { diff --git a/libsyscall/wrappers/spawn/posix_spawn.c b/libsyscall/wrappers/spawn/posix_spawn.c index dda096d97..d5a000d5f 100644 --- a/libsyscall/wrappers/spawn/posix_spawn.c +++ b/libsyscall/wrappers/spawn/posix_spawn.c @@ -185,6 +185,9 @@ __posix_spawnattr_init(struct _posix_spawnattr *psattrp) psattrp->psa_kqworkloop_soft_limit = 0; 
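The VLAN accessors above no longer take or report the tag_in_pkt flag. A hedged sketch of the updated call pattern, assuming `src` and `dst` are valid packet_t handles from a Skywalk channel slot and the private os_packet prototypes are visible.

/*
 * Sketch: copy a VLAN tag between packets with the revised two-argument
 * accessors; both calls return 0 on success.
 */
#include <stdint.h>

static int
copy_vlan_tag(const packet_t src, const packet_t dst)
{
	uint16_t tag = 0;
	int err = os_packet_get_vlan_tag(src, &tag);

	if (err != 0) {
		return err;	/* non-zero, e.g. when no tag is set */
	}
	return os_packet_set_vlan_tag(dst, tag);
}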
psattrp->psa_kqworkloop_hard_limit = 0; + /* Default is no conclave memory limit */ + psattrp->psa_conclave_mem_limit = 0; + psattrp->psa_crash_behavior = 0; psattrp->psa_crash_behavior_deadline = 0; psattrp->psa_launch_type = 0; @@ -937,7 +940,31 @@ posix_spawnattr_set_use_sec_transition_shims_np(posix_spawnattr_t *attr, uint32_ psattr = *(_posix_spawnattr_t *)attr; sec_flags = (posix_spawn_secflag_options)(flags); - sec_flags |= POSIX_SPAWN_SECFLAG_EXPLICIT_ENABLE; + + if (!(sec_flags & POSIX_SPAWN_SECFLAG_EXPLICIT_DISABLE) && + !(sec_flags & POSIX_SPAWN_SECFLAG_EXPLICIT_REQUIRE_ENABLE)) { + /* + * For a long time we've had this unconditional setting + * of POSIX_SPAWN_SECFLAG_EXPLICIT_ENABLE whenever this + * function is called. This setting makes little sense + * in face of a request to explicitly disable (in fact, that's + * a combo that is explicitly refused by the kernel) and + * completely defeats the purpose of EXPLICIT_REQUIRE_ENABLE. + * To not risk breaking test environments that may incorrectly + * rely on this behavior, we single out the DISABLE and EXPLICIT_REQUIRE cases + * and proceed otherwise setting the flag. + */ + sec_flags |= POSIX_SPAWN_SECFLAG_EXPLICIT_ENABLE; + } + + /* + * Inheritance used to be the internal default, so we maintain legacy + * behavior in this API, as Xcode and internal tests expect. + */ + if (!(sec_flags & POSIX_SPAWN_SECFLAG_EXPLICIT_DISABLE_INHERIT)) { + sec_flags |= POSIX_SPAWN_SECFLAG_EXPLICIT_ENABLE_INHERIT; + } + psattr->psa_sec_flags = (uint16_t)sec_flags; return 0; @@ -1912,9 +1939,19 @@ posix_spawn_file_actions_addinherit_np(posix_spawn_file_actions_t *file_actions, return 0; } +/* + * Deprecated alias of posix_spawn_file_actions_addchdir + */ +int +posix_spawn_file_actions_addchdir_np( + posix_spawn_file_actions_t * __restrict file_actions, + const char * __restrict path) +{ + return posix_spawn_file_actions_addchdir(file_actions, path); +} /* - * posix_spawn_file_actions_addchdir_np + * posix_spawn_file_actions_addchdir * * Description: Add a chdir action to the object referenced by 'file_actions' * that will cause the current working directory to attempt to be changed @@ -1932,7 +1969,7 @@ posix_spawn_file_actions_addinherit_np(posix_spawn_file_actions_t *file_actions, * EINVAL The value specified by file_actions is invalid. */ int -posix_spawn_file_actions_addchdir_np( +posix_spawn_file_actions_addchdir( posix_spawn_file_actions_t * __restrict file_actions, const char * __restrict path) { @@ -1967,9 +2004,18 @@ posix_spawn_file_actions_addchdir_np( return 0; } +/* + * Deprecated alias for posix_spawn_file_actions_addfchdir + */ +int +posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *file_actions, + int filedes) +{ + return posix_spawn_file_actions_addfchdir(file_actions, filedes); +} /* - * posix_spawn_file_actions_fchdir_np + * posix_spawn_file_actions_addfchdir * * Description: Add a fchdir action to the object referenced by 'file_actions' * that will cause the current working directory to attempt to be changed @@ -1988,7 +2034,7 @@ posix_spawn_file_actions_addchdir_np( * EINVAL The value specified by file_actions is invalid. 
*/ int -posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *file_actions, +posix_spawn_file_actions_addfchdir(posix_spawn_file_actions_t *file_actions, int filedes) { _posix_spawn_file_actions_t *psactsp; @@ -2829,6 +2875,23 @@ posix_spawnattr_set_kqworklooplimit_ext(posix_spawnattr_t * __restrict attr, return 0; } +int +posix_spawnattr_set_conclavememlimit_ext(posix_spawnattr_t * __restrict attr, + uint32_t conclave_limit) +{ + _posix_spawnattr_t psattr; + + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + psattr = *(_posix_spawnattr_t *)attr; + + psattr->psa_conclave_mem_limit = conclave_limit; + + return 0; +} + /* * posix_spawnattr_set_jetsam_ttr_np * diff --git a/libsyscall/wrappers/spawn/spawn.h b/libsyscall/wrappers/spawn/spawn.h index b69b0d20b..59de066a7 100644 --- a/libsyscall/wrappers/spawn/spawn.h +++ b/libsyscall/wrappers/spawn/spawn.h @@ -69,6 +69,12 @@ int posix_spawnp(pid_t * __restrict, const char * __restrict, char *const __argv[__restrict], char *const __envp[__restrict]) __API_AVAILABLE(macos(10.5), ios(2.0)); +int posix_spawn_file_actions_addchdir(posix_spawn_file_actions_t *, + const char * __restrict) __API_AVAILABLE(macos(16.0)) __SPI_AVAILABLE(ios(19.0), tvos(19.0), watchos(12.0), visionos(3.0), bridgeos(10.0)); + +int posix_spawn_file_actions_addfchdir(posix_spawn_file_actions_t *, + int) __API_AVAILABLE(macos(16.0)) __SPI_AVAILABLE(ios(19.0), tvos(19.0), watchos(12.0), visionos(3.0), bridgeos(10.0)); + int posix_spawn_file_actions_addclose(posix_spawn_file_actions_t *, int) __API_AVAILABLE(macos(10.5), ios(2.0)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); int posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *, int, @@ -174,10 +180,10 @@ int posix_spawn_file_actions_addinherit_np(posix_spawn_file_actions_t *, int) __API_AVAILABLE(macos(10.7), ios(4.3)) __SPI_AVAILABLE(watchos(2.0), tvos(9.0), bridgeos(1.0)); int posix_spawn_file_actions_addchdir_np(posix_spawn_file_actions_t *, - const char * __restrict) __API_AVAILABLE(macos(10.15)) __SPI_AVAILABLE(ios(13.0), tvos(13.0), watchos(6.0), bridgeos(4.0)); + const char * __restrict) __API_DEPRECATED("posix_spawn_file_actions_addchdir(3) has replaced posix_spawn_file_actions_addchdir_np(3)", macos(10.15, 16.0)) __SPI_DEPRECATED("posix_spawn_file_actions_addchdir(3) has replaced posix_spawn_file_actions_addchdir_np(3)", ios(13.0, 19.0), tvos(13.0, 19.0), watchos(6.0, 12.0), visionos(1.0, 3.0), bridgeos(4.0, 10.0)); int posix_spawn_file_actions_addfchdir_np(posix_spawn_file_actions_t *, - int) __API_AVAILABLE(macos(10.15)) __SPI_AVAILABLE(ios(13.0), tvos(13.0), watchos(6.0), bridgeos(4.0)); + int) __API_DEPRECATED("posix_spawn_file_actions_addfchdir(3) has replaced posix_spawn_file_actions_addfchdir_np(3)", macos(10.15, 16.0)) __SPI_DEPRECATED("posix_spawn_file_actions_addfchdir(3) has replaced posix_spawn_file_actions_addfchdir_np(3)", ios(13.0, 19.0), tvos(13.0, 19.0), watchos(6.0, 12.0), visionos(1.0, 3.0), bridgeos(4.0, 10.0)); __END_DECLS diff --git a/libsyscall/wrappers/spawn/spawn_private.h b/libsyscall/wrappers/spawn/spawn_private.h index 4ce2e4fb0..f00f37991 100644 --- a/libsyscall/wrappers/spawn/spawn_private.h +++ b/libsyscall/wrappers/spawn/spawn_private.h @@ -47,6 +47,7 @@ int posix_spawnattr_setjetsam(posix_spawnattr_t * __restrict attr, short flags, int priority, int memlimit) __API_UNAVAILABLE(macos) __API_AVAILABLE(ios(5.0)); #endif /* (TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR) */ +// All memory limits are in MiB. 
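Callers migrating off the deprecated _np spellings can switch names one-for-one; a minimal sketch of the un-suffixed chdir file action declared above, with abbreviated error handling.

/*
 * Sketch: spawn a child with its working directory changed via the new
 * posix_spawn_file_actions_addchdir(); the _np alias now forwards here.
 */
#include <sys/types.h>
#include <spawn.h>

extern char **environ;

static int
spawn_in_directory(pid_t *pid, const char *dir, char *const argv[])
{
	posix_spawn_file_actions_t fa;
	int err = posix_spawn_file_actions_init(&fa);

	if (err != 0) {
		return err;
	}
	err = posix_spawn_file_actions_addchdir(&fa, dir);	/* new spelling */
	if (err == 0) {
		err = posix_spawn(pid, argv[0], &fa, NULL, argv, environ);
	}
	(void)posix_spawn_file_actions_destroy(&fa);
	return err;
}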
int posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr, short flags, int priority, int memlimit_active, int memlimit_inactive) __API_AVAILABLE(macos(10.11), ios(9.0)); @@ -66,6 +67,9 @@ int posix_spawnattr_set_filedesclimit_ext(posix_spawnattr_t * __restrict att int posix_spawnattr_set_kqworklooplimit_ext(posix_spawnattr_t * __restrict attr, uint32_t kqwl_soft_limit, uint32_t kqwl_hard_limit) __API_AVAILABLE(macos(14.3), ios(17.4), tvos(17.4), watchos(10.4)); +int posix_spawnattr_set_conclavememlimit_ext(posix_spawnattr_t * __restrict attr, + uint32_t conclave_limit) __API_AVAILABLE(ios(19.0), macos(16.0), tvos(19.0), watchos(12.0)); + int posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr, int count, mach_port_t portarray[]) __API_AVAILABLE(macos(10.9), ios(6.0)); diff --git a/libsyscall/wrappers/statfs_ext.c b/libsyscall/wrappers/statfs_ext.c index c9d6ec4b3..81b7f3516 100644 --- a/libsyscall/wrappers/statfs_ext.c +++ b/libsyscall/wrappers/statfs_ext.c @@ -141,16 +141,20 @@ __statfs_ext_impl(const char *path, int fd, struct statfs *buf, int flags) return -1; } - /* Simply wrap statfs() or fstatfs() if no option is provided */ - if (flags == 0) { - return __statfs_ext_default(path, fd, buf); - } - /* Retrieve filesystem statistics with extended options */ if (flags & STATFS_EXT_NOBLOCK) { ret = __statfs_ext_noblock(path, fd, buf); } + /* + * Fall back to statfs()/fstatfs() if: + * 1. No options are provided. + * 2. __statfs_ext_noblock() returns EINVAL. + */ + if ((flags == 0) || (ret == -1 && errno == EINVAL)) { + ret = __statfs_ext_default(path, fd, buf); + } + return ret; } diff --git a/libsyscall/wrappers/system-version-compat-support.h b/libsyscall/wrappers/system-version-compat-support.h index 31c67ef33..2310c0a86 100644 --- a/libsyscall/wrappers/system-version-compat-support.h +++ b/libsyscall/wrappers/system-version-compat-support.h @@ -32,9 +32,41 @@ #if TARGET_OS_OSX && !defined(__i386__) #define SYSTEM_VERSION_COMPAT_ENABLED 1 +#define SYSTEM_VERSION_COMPAT_HAS_MODE_MACOSX 1 +#define SYSTEM_VERSION_COMPAT_HAS_MODE_IOS 1 +#define SYSTEM_VERSION_COMPAT_NEEDS_SYSCTL 1 +#define SYSTEM_VERSION_COMPAT_SHIM_OS_CRYPTEX 0 #endif +#if defined(RC_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) +#ifndef SYSTEM_VERSION_COMPAT_ENABLED +#define SYSTEM_VERSION_COMPAT_ENABLED 1 +#endif + +/* Force enabling macOS mode */ +#ifdef SYSTEM_VERSION_COMPAT_HAS_MODE_MACOSX +#undef SYSTEM_VERSION_COMPAT_HAS_MODE_MACOSX +#endif +#define SYSTEM_VERSION_COMPAT_HAS_MODE_MACOSX 1 + +#ifndef SYSTEM_VERSION_COMPAT_HAS_MODE_IOS +#define SYSTEM_VERSION_COMPAT_HAS_MODE_IOS 0 +#endif + +/* Force disabling sysctl submission */ +#ifdef SYSTEM_VERSION_COMPAT_NEEDS_SYSCTL +#undef SYSTEM_VERSION_COMPAT_NEEDS_SYSCTL +#endif +#define SYSTEM_VERSION_COMPAT_NEEDS_SYSCTL 0 + +/* Force shimming path from OS cryptex */ +#ifdef SYSTEM_VERSION_COMPAT_SHIM_OS_CRYPTEX +#undef SYSTEM_VERSION_COMPAT_SHIM_OS_CRYPTEX +#endif +#define SYSTEM_VERSION_COMPAT_SHIM_OS_CRYPTEX 1 +#endif /* defined(RC_EXPERIMENTAL_SYSTEM_VERSION_COMPAT) */ + #if SYSTEM_VERSION_COMPAT_ENABLED typedef enum system_version_compat_mode { SYSTEM_VERSION_COMPAT_MODE_DISABLED = 0, diff --git a/libsyscall/wrappers/system-version-compat.c b/libsyscall/wrappers/system-version-compat.c index 8964ba9c2..1960eeced 100644 --- a/libsyscall/wrappers/system-version-compat.c +++ b/libsyscall/wrappers/system-version-compat.c @@ -40,7 +40,13 @@ #define COMPAT_SUFFIX_IOS "" #define SYSTEM_VERSION_PLIST_FILENAME "SystemVersion.plist" -#define 
SYSTEM_VERSION_PLIST_PATH ("/System/Library/CoreServices/" SYSTEM_VERSION_PLIST_FILENAME) +#define SYSTEM_VERSION_PLIST_PATH "/System/Library/CoreServices/" SYSTEM_VERSION_PLIST_FILENAME + +#if TARGET_OS_OSX +#define SYSTEM_VERSION_PLIST_OS_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes/OS" SYSTEM_VERSION_PLIST_PATH +#else +#define SYSTEM_VERSION_PLIST_OS_CRYPTEX_PATH "/private/preboot/Cryptexes/OS" SYSTEM_VERSION_PLIST_PATH +#endif /* TARGET_OS_OSX */ #define SYSTEM_VERSION_COMPAT_PLIST_FILENAME(platform_prefix, compat_suffix) (platform_prefix "SystemVersion" compat_suffix ".plist") @@ -49,6 +55,7 @@ #define SYSTEM_VERSION_COMPAT_PLIST_FILENAMELEN(platform_prefix, compat_suffix) strlen(SYSTEM_VERSION_COMPAT_PLIST_FILENAME(platform_prefix, compat_suffix)) #define SYSTEM_VERSION_PLIST_PATHLEN strlen(SYSTEM_VERSION_PLIST_PATH) +#define SYSTEM_VERSION_PLIST_OS_CRYPTEX_PATHLEN strlen(SYSTEM_VERSION_PLIST_OS_CRYPTEX_PATH) extern system_version_compat_mode_t system_version_compat_mode; @@ -154,14 +161,28 @@ _system_version_compat_open_shim(int opened_fd, int openat_fd, const char *orig_ } } - /* Check to see whether the path matches SYSTEM_VERSION_PLIST_PATH */ + bool path_needs_shim = false; size_t newpathlen = strnlen(new_path, MAXPATHLEN); - if (newpathlen != SYSTEM_VERSION_PLIST_PATHLEN) { - errno = stashed_errno; - return opened_fd; + + /* Check to see whether the path matches SYSTEM_VERSION_PLIST_PATH */ + if (newpathlen == SYSTEM_VERSION_PLIST_PATHLEN && + strncmp(new_path, SYSTEM_VERSION_PLIST_PATH, newpathlen) == 0) { + path_needs_shim = true; } - if (strncmp(new_path, SYSTEM_VERSION_PLIST_PATH, SYSTEM_VERSION_PLIST_PATHLEN) != 0) { +#if SYSTEM_VERSION_COMPAT_SHIM_OS_CRYPTEX + /* Check to see whether the path matches SYSTEM_VERSION_PLIST_OS_CRYPTEX_PATH */ + if (newpathlen == SYSTEM_VERSION_PLIST_OS_CRYPTEX_PATHLEN && + strncmp(new_path, SYSTEM_VERSION_PLIST_OS_CRYPTEX_PATH, newpathlen) == 0) { + path_needs_shim = true; + + /* Redirect to the system volume path */ + orig_path = SYSTEM_VERSION_PLIST_PATH; + path_str_len = SYSTEM_VERSION_PLIST_PATHLEN; + } +#endif /* SYSTEM_VERSION_COMPAT_SHIM_OS_CRYPTEX */ + + if (!path_needs_shim) { errno = stashed_errno; return opened_fd; } diff --git a/libsyscall/wrappers/utimensat.c b/libsyscall/wrappers/utimensat.c index a98d50672..b5b29ff67 100644 --- a/libsyscall/wrappers/utimensat.c +++ b/libsyscall/wrappers/utimensat.c @@ -139,6 +139,9 @@ utimensat(int fd, const char *path, const struct timespec _times_in[2], int flag if (flags & AT_SYMLINK_NOFOLLOW_ANY) { flags_out |= FSOPT_NOFOLLOW_ANY; } + if (flags & AT_RESOLVE_BENEATH) { + flags_out |= FSOPT_RESOLVE_BENEATH; + } return setattrlistat(fd, path, &a, ×_out, attrbuf_size, flags_out); } diff --git a/makedefs/MakeInc.cmd b/makedefs/MakeInc.cmd index 86eabaf62..c9575f38f 100644 --- a/makedefs/MakeInc.cmd +++ b/makedefs/MakeInc.cmd @@ -374,6 +374,14 @@ endif # behave similarly to externally compiled commands # +_function_filter_out_vmapple_sptm_build_config = $(if \ + $(and \ + $(filter SPTM,$(call function_extract_kernel_config_from_build_config,$(1))), \ + $(filter VMAPPLE,$(call function_extract_machine_config_from_build_config,$(1))) \ + ) \ + ,,$(1) \ + ) + # $(1) is an expanded kernel config from a TARGET_CONFIGS_UC tuple # $(2) is an expanded arch config from a TARGET_CONFIGS_UC tuple # $(3) is an expanded machine config from a TARGET_CONFIGS_UC tuple @@ -400,11 +408,21 @@ _function_create_build_configs_do_expand = $(call _function_create_buil ) \ ) +# $(1) is an un-expanded kernel config 
from a TARGET_CONFIGS_UC tuple +# $(2) is an un-expanded arch config from a TARGET_CONFIGS_UC tuple +# $(3) is an un-expanded machine config from a TARGET_CONFIGS_UC tuple, that may be multiplexed (e.g. ConfigA&ConfigB) +# This function splits any multiplexed machine configs into separate items. +_function_create_build_configs_do_expand_with_muxed_machine_config = $(foreach machine_config, $(subst &, ,$(3)), \ + $(call _function_filter_out_vmapple_sptm_build_config,\ + $(call _function_create_build_configs_do_expand,$(1),$(2),$(machine_config)) \ + ) \ + ) + # $(1) is an un-expanded TARGET_CONFIGS_UC list, which must be consumed # 3 elements at a time function_create_build_configs = $(sort \ $(strip \ - $(call _function_create_build_configs_do_expand, \ + $(call _function_create_build_configs_do_expand_with_muxed_machine_config, \ $(word 1,$(1)), \ $(word 2,$(1)), \ $(word 3,$(1)), \ @@ -439,9 +457,21 @@ _function_create_alias_configs_do_expand = $(call _function_create_alias_con $(4) \ ) +# $(1) is an un-expanded kernel config from a TARGET_CONFIGS_UC tuple +# $(2) is an un-expanded arch config from a TARGET_CONFIGS_UC tuple +# $(3) is an un-expanded machine config from a TARGET_CONFIGS_UC tuple, that may be multiplexed (e.g. ConfigA&ConfigB) +# $(4) is an expanded SoC platform config from a TARGET_CONFIGS_ALIASES_UC tuple, +# which should be an alias of $(3) +# This function splits any multiplexed machine configs into separate items. +_function_create_alias_configs_do_expand_with_muxed_machine_config = $(foreach machine_config, $(subst &, ,$(3)), \ + $(call _function_filter_out_vmapple_sptm_build_config,\ + $(call _function_create_alias_configs_do_expand,$(1),$(2),$(machine_config),$(4)) \ + ) \ + ) + function_create_alias_configs = $(sort \ $(strip \ - $(call _function_create_alias_configs_do_expand, \ + $(call _function_create_alias_configs_do_expand_with_muxed_machine_config, \ $(word 1,$(1)), \ $(word 2,$(1)), \ $(word 3,$(1)), \ @@ -510,6 +540,17 @@ function_substitute_word_with_replacement = $(strip $(if $(2), \ ) \ ) +# $(1) is a string of form "arch;platform(&platform)*", where multiple platforms are separated by the "&" delimiter. +# Output is the arch. +function_get_arch = $(word 1,$(subst ;, ,$(1))) + +# $(1) is a string of form "arch;platform(&platform)*", where multiple platforms are separated by the "&" delimiter. +# Output is a space separated list of the platforms. +function_get_platforms = $(subst &, ,$(word 2,$(subst ;, ,$(1)))) + +# $(1) is a string of form "arch;platform(&platform)*", where multiple platforms are separated by the "&" delimiter. +function_parse_product_configs = $(foreach platform,$(call function_get_platforms,$(1)),$(call function_get_arch,$(1));$(platform);) + # You can't assign a variable to an empty space without these # shenanigans empty := diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index d8c180878..cccb411ec 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -1,6 +1,6 @@ # -*- mode: makefile;-*- # -# Copyright (C) 1999-2023 Apple Inc. All rights reserved. +# Copyright (C) 1999-2025 Apple Inc. All rights reserved. # # MakeInc.def contains global definitions for building, # linking, and installing files. 
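For illustration only, not part of the imported patch: the MakeInc.cmd helpers added above (function_get_arch, function_get_platforms, function_parse_product_configs) split a product string of the documented "arch;platform(&platform)*" form into one "arch;platform;" entry per platform. A minimal GNU Make sketch, assuming makedefs/MakeInc.cmd is included and using a hypothetical multiplexed config string (the arch and SoC names here are placeholders, not taken from the device map):

# Illustrative sketch only -- hypothetical input, assumes MakeInc.cmd is included.
DEMO_CONFIG := arm64;t8132&vmapple
# Evaluated at parse time, so no recipe/tab handling is needed.
$(info arch:      $(call function_get_arch,$(DEMO_CONFIG)))
$(info platforms: $(call function_get_platforms,$(DEMO_CONFIG)))
$(info parsed:    $(call function_parse_product_configs,$(DEMO_CONFIG)))
# Expected (sketch): "arm64", "t8132 vmapple", and "arm64;t8132; arm64;vmapple;" --
# the last form is what function_lookup_product now feeds into TARGET_CONFIGS.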
@@ -31,7 +31,8 @@ SUPPORTED_ARM64_MACHINE_CONFIGS = NONE else SUPPORTED_ARM64_MACHINE_CONFIGS = T6000 T6020 T6030 T6031 T6041 T8101 T8103 T8112 T8122 T8132 VMAPPLE -SPTM_ENABLED_SOCS_MacOSX = t6041 t8132 +SPTM_ENABLED_SOCS_MacOSX = t8112 t6020 t8122 t6030 t6031 t8132 t6041 + endif @@ -51,15 +52,17 @@ ifndef CURRENT_MACHINE_CONFIG_LC export CURRENT_MACHINE_CONFIG_LC := $(shell printf "%s" "$(CURRENT_MACHINE_CONFIG)" | $(TR) A-Z a-z) endif +ARM64_EXTRA_BUILD_FLAGS = + # Enable BTI by default for all ARM64 targets BTI_BUILD = 1 # -mkernel contains BTI by default ifeq ($(BTI_BUILD),1) - BTI_BUILD_FLAGS = -DXNU_BUILT_WITH_BTI + ARM64_EXTRA_BUILD_FLAGS += -DXNU_BUILT_WITH_BTI else - BTI_BUILD_FLAGS = -fno-branch-target-identification -UXNU_BUILT_WITH_BTI + ARM64_EXTRA_BUILD_FLAGS += -fno-branch-target-identification -UXNU_BUILT_WITH_BTI endif @@ -80,6 +83,7 @@ MACHINE_FLAGS_ARM64_T6031 = -DARM64_BOARD_CONFIG_T6031 -mcpu=apple-m3 MACHINE_FLAGS_ARM64_T6041 = -DARM64_BOARD_CONFIG_T6041 -mcpu=apple-m4 MACHINE_FLAGS_ARM64_T8122 = -DARM64_BOARD_CONFIG_T8122_T8130 -mcpu=apple-m3 MACHINE_FLAGS_ARM64_T8132 = -DARM64_BOARD_CONFIG_T8132 -mcpu=apple-m4 + MACHINE_FLAGS_ARM64_VMAPPLE = -DARM64_BOARD_CONFIG_VMAPPLE -march=armv8.5a+sme2 @@ -126,6 +130,11 @@ DEFINES = -DAPPLE -DKERNEL -DKERNEL_PRIVATE -DXNU_KERNEL_PRIVATE \ -DCURRENT_MACHINE_CONFIG_LC=$(CURRENT_MACHINE_CONFIG_LC) \ $(CONFIG_DEFINES) $(SEED_DEFINES) +# Append experimental definition +ifneq ($(RC_EXPERIMENTAL_SYSTEM_VERSION_COMPAT),) +DEFINES += -DXNU_EXPERIMENTAL_SYSTEM_VERSION_COMPAT +endif + # Enable caching with `make CCACHE=ccache` # This intentionally does not override $(CC) because that will confuse # utilities like mig. @@ -197,6 +206,11 @@ WARNFLAGS_STD := $(WARNFLAGS_STD) \ -Wno-error=c99-designator \ -Wno-error=reorder-init-list \ -Wno-error=switch-default \ + -Wno-error=four-char-constants \ + -Wno-error=suggest-override \ + -Wno-error=suggest-destructor-override \ + -Wno-error=tautological-value-range-compare \ + -Wno-error=null-pointer-subtraction \ -Wno-deprecated-volatile \ -Wno-error=incompatible-function-pointer-types-strict \ -Wno-cast-function-type-strict @@ -268,6 +282,9 @@ ifeq ($(RC_ProjectName),xnu_libraries) BUILD_STATIC_LINK := 1 BUILD_XNU_LIBRARY := 1 RC_NONARCH_CFLAGS += -D__BUILDING_XNU_LIBRARY__=1 +ifneq ($(XNU_LibFlavour),) +RC_NONARCH_CFLAGS += -D__BUILDING_XNU_LIB_$(XNU_LibFlavour)__=1 +endif endif ifneq ($(filter ARM ARM64,$(CURRENT_ARCH_CONFIG)),) @@ -362,23 +379,44 @@ CFLAGS_X86_64 = -Dx86_64 -DX86_64 -D__X86_64__ -DLP64 \ CFLAGS_X86_64H = $(CFLAGS_X86_64) -LARGE_MEMORY_DEFINE=-UARM_LARGE_MEMORY -ARM64_PLKSEG_ADDR =0xfffffff004004000 -ARM64_LINK_ADDR =0xfffffff007004000 +# By default, all ARM64 targets use the small memory config +ARM64_LARGE_MEMORY = 0 +ARM64_PLKSEG_ADDR = 0xfffffff004004000 +ARM64_LINK_ADDR = 0xfffffff007004000 # Use ARM_LARGE_MEMORY config for all MacOSX targets. 
ifneq ($(filter $(PLATFORM),MacOSX),) -LARGE_MEMORY_DEFINE=-DARM_LARGE_MEMORY=1 -ARM64_PLKSEG_ADDR =0xfffffe0004004000 -ARM64_LINK_ADDR =0xfffffe0007004000 +ARM64_LARGE_MEMORY = 1 +ARM64_PLKSEG_ADDR = 0xfffffe0004004000 +ARM64_LINK_ADDR = 0xfffffe0007004000 endif -CFLAGS_ARM64 = -Darm64 -DARM64 -D__ARM64__ -DLP64 -DPAGE_SIZE_FIXED -DVM_KERNEL_LINK_ADDRESS=$(ARM64_LINK_ADDR) \ - $(LARGE_MEMORY_DEFINE) -momit-leaf-frame-pointer -fno-strict-aliasing -D__API__=v4 -mkernel \ - $(BTI_BUILD_FLAGS) +ifeq ($(ARM64_LARGE_MEMORY),1) + ARM64_EXTRA_BUILD_FLAGS += -DARM_LARGE_MEMORY=1 +else + ARM64_EXTRA_BUILD_FLAGS += -UARM_LARGE_MEMORY +endif -CXXFLAGS_ARM64 = $(BTI_BUILD_FLAGS) +ifeq ($(ARM64_LARGE_MEMORY_KERNONLY),1) + ARM64_EXTRA_BUILD_FLAGS += -DARM_LARGE_MEMORY_KERNONLY=1 +else + ARM64_EXTRA_BUILD_FLAGS += -UARM_LARGE_MEMORY_KERNONLY +endif + + + +ifeq ($(BUILD_XNU_LIBRARY),1) +# __static_testable makes some static function not static and generate these warnings +CFLAGS_KERNEL = -Wno-missing-prototypes +else +CFLAGS_KERNEL = -mkernel +endif + +CFLAGS_ARM64 = -Darm64 -DARM64 -D__ARM64__ -DLP64 -DPAGE_SIZE_FIXED -DVM_KERNEL_LINK_ADDRESS=$(ARM64_LINK_ADDR) \ + $(ARM64_EXTRA_BUILD_FLAGS) -mno-implicit-sme -momit-leaf-frame-pointer -fno-strict-aliasing -D__API__=v4 $(CFLAGS_KERNEL) + +CXXFLAGS_ARM64 = $(ARM64_EXTRA_BUILD_FLAGS) CFLAGS_RELEASEX86_64 = -O2 CFLAGS_DEVELOPMENTX86_64 = -O2 @@ -592,7 +630,7 @@ KASAN_LIGHT=0 HWASAN_INSTRUMENT_STACK=1 endif -KASAN_BLACKLIST=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC) +KASAN_DENYLIST=$(OBJROOT)/san/kasan-denylist-$(CURRENT_ARCH_CONFIG_LC) # To calculate the kasan offset, subtract the lowest KVA to sanitize, shifted right by KASAN_SCALE_$INSTRUMENTATION bits, # from the base address of the kasan shadow area, (e.g. for x86_64 solve the following equation: @@ -613,7 +651,7 @@ KASAN_OFFSET=$($(addsuffix $(CURRENT_ARCH_CONFIG),KASAN_OFFSET_)) KASAN_SCALE_TBI=4 CFLAGS_KASAN_INSTRUMENTATION_TBI = -DKASAN_TBI=1 -DKASAN_SCALE=$(KASAN_SCALE_TBI) \ -fsanitize=kernel-hwaddress \ - -fsanitize-ignorelist=$(KASAN_BLACKLIST) \ + -fsanitize-ignorelist=$(KASAN_DENYLIST) \ -mllvm -hwasan-recover=0 \ -mllvm -hwasan-mapping-offset=$(KASAN_OFFSET) \ -mllvm -hwasan-instrument-atomics=1 \ @@ -629,7 +667,7 @@ CFLAGS_KASAN_INSTRUMENTATION_CLASSIC = -DKASAN_CLASSIC=1 -DKASAN_SCALE=$(KASAN_S -fsanitize=address \ -mllvm -asan-globals-live-support \ -mllvm -asan-mapping-offset=$(KASAN_OFFSET) \ - -fsanitize-ignorelist=$(KASAN_BLACKLIST) + -fsanitize-ignorelist=$(KASAN_DENYLIST) CFLAGS_KASANARM64 += $(CFLAGS_KASAN_INSTRUMENTATION_TBI) CFLAGS_KASANX86_64 += $(CFLAGS_KASAN_INSTRUMENTATION_CLASSIC) @@ -683,15 +721,15 @@ ifeq ($(PLATFORM),iPhoneOS) UBSAN_MINIMAL_RUNTIME := DEVELOPMENT DEBUG ifneq ($(filter $(CURRENT_KERNEL_CONFIG), $(UBSAN_MINIMAL_RUNTIME)),) -# This is (unfortunately) intentional. Currently the "kasan" blacklist, which folds both +# This is (unfortunately) intentional. Currently the "kasan" denylist, which folds both # ubsan and kasan specific files, is generated for all builds during the -# setup phase. The blacklist file itself is divided per-sanitizer, so won't +# setup phase. The denylist file itself is divided per-sanitizer, so won't # affect the UBSAN build outside of the entries that are legitimately # intended for it. 
-UBSAN_BLACKLIST=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC) +UBSAN_DENYLIST=$(OBJROOT)/san/kasan-denylist-$(CURRENT_ARCH_CONFIG_LC) UBSAN_CHECKS = signed-integer-overflow -UBSAN_RUNTIME = -fsanitize-minimal-runtime -fsanitize-ignorelist=$(UBSAN_BLACKLIST) +UBSAN_RUNTIME = -fsanitize-minimal-runtime -fsanitize-ignorelist=$(UBSAN_DENYLIST) UBSAN_CHECKS_TRAP = UBSAN_CHECKS_FATAL = UBSAN_DISABLED = @@ -709,13 +747,13 @@ CFLAGS_GEN += $(foreach x,$(UBSAN_DISABLED),-fno-sanitize=$(x)) ifeq ($(KSANCOV),1) # Enable SanitizerCoverage instrumentation in xnu SAN = 1 -KCOV_BLACKLIST := $(OBJROOT)/san/kcov-blacklist-$(CURRENT_ARCH_CONFIG_LC) -KCOV_CFLAGS := -fsanitize-coverage=trace-pc-guard -fsanitize-coverage-ignorelist=$(KCOV_BLACKLIST) +KCOV_DENYLIST := $(OBJROOT)/san/kcov-denylist-$(CURRENT_ARCH_CONFIG_LC) +KCOV_CFLAGS := -fsanitize-coverage=trace-pc-guard,trace-cmp -fsanitize-coverage-ignorelist=$(KCOV_DENYLIST) CFLAGS_GEN += $(KCOV_CFLAGS) -DKSANCOV=1 endif ifeq ($(SAN),1) -CFLAGS_GEN += -fsanitize-ignorelist=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC) +CFLAGS_GEN += -fsanitize-ignorelist=$(OBJROOT)/san/kasan-denylist-$(CURRENT_ARCH_CONFIG_LC) endif # Any extra flags that get passed at the command line during build. @@ -740,7 +778,13 @@ CFLAGS = $(CFLAGS_GEN) \ OTHER_CXXFLAGS = -CXXFLAGS_GEN = -std=gnu++2b -fsized-deallocation -fapple-kext $(OTHER_CXXFLAGS) +ifeq ($(BUILD_XNU_LIBRARY),1) +CXXFLAGS_KERNEL = -fno-exceptions -Wno-missing-prototypes +else +CXXFLAGS_KERNEL = -fapple-kext +endif + +CXXFLAGS_GEN = -std=gnu++2b -fsized-deallocation $(CXXFLAGS_KERNEL) $(OTHER_CXXFLAGS) CXXFLAGS = $(CXXFLAGS_GEN) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),CXXFLAGS_)) \ @@ -1051,11 +1095,6 @@ LDFILES_KERNEL_ONLY = $(TARGET)/all-kpi.exp $(TARGET)/all-alias.exp $(TARGET)/sy LD_KERNEL_LIBS = -lcc_kext LD_KERNEL_ARCHIVES = $(LDFLAGS_KERNEL_SDK) -lfirehose_kernel -# Link binary support library -ifneq ($(KDKROOT),) - LDFLAGS_KERNEL_ONLY += -rdynamic -Wl,-force_load,$(KDKROOT)/System/Library/KernelSupport/lib$(CURRENT_MACHINE_CONFIG).os.$(CURRENT_KERNEL_CONFIG).a -endif - # # Derive SPTM/TXM and Exclaves enablement from the EDM properties. # @@ -1064,6 +1103,7 @@ PLATFORM_NORMALIZED := $(shell echo $(PLATFORM) | tr '[:upper:]' '[:lower:]') SPTM_ENABLED := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT EnableSPTMTXM FROM Targets WHERE KernelPlatform IS \"$(CURRENT_MACHINE_CONFIG_LC)\" AND SDKPlatform IS \"$(PLATFORM_NORMALIZED)\" AND EnableSPTMTXM == 1) + EXCLAVES_ENABLED := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT HascL4 FROM Targets WHERE KernelPlatform IS \"$(CURRENT_MACHINE_CONFIG_LC)\" AND SDKPlatform IS \"$(PLATFORM_NORMALIZED)\" AND HascL4 == 1) endif # !EMBEDDED_DEVICE_MAP @@ -1071,6 +1111,7 @@ ifeq ($(CURRENT_KERNEL_CONFIG),SPTM) SPTM_ENABLED = 1 endif # SPTM + ifneq ($(filter $(CURRENT_MACHINE_CONFIG_LC),$(SPTM_ENABLED_SOCS_$(PLATFORM))),) SPTM_ENABLED = 1 endif @@ -1082,10 +1123,12 @@ ifeq ($(SPTM_ENABLED),1) endif # !SPTM_ENABLED ifeq ($(EXCLAVES_ENABLED),1) - DOCONF_EXCLAVES := -exclaves + DOCONF_EXCLAVES := -exclaves endif # !EXCLAVES_ENABLED +ifeq ($(CURRENT_ARCH_CONFIG),ARM64) TIGHTBEAM_EXPORTS := $(SRCROOT)/config/libTightbeam.exports +endif # # DTrace support @@ -1107,14 +1150,15 @@ INCFLAGS_IMPORT = $(patsubst %, -I$(OBJROOT)/EXPORT_HDRS/%, $(COMPONENT_IMPORT_L INCFLAGS_EXTERN = -I$(SRCROOT)/EXTERNAL_HEADERS INCFLAGS_GEN = -I$(SRCROOT)/$(COMPONENT) -I$(OBJROOT)/EXPORT_HDRS/$(COMPONENT) INCFLAGS_LOCAL = -I. 
-INCFLAGS_SDK = -I$(SDKROOT)/usr/local/include/kernel -I$(SDKROOT)/$(KPINCDIR)/AppleFeatures +INCFLAGS_KERNEL = -I$(SDKROOT)/usr/local/include/kernel +INCFLAGS_SDK = -I$(SDKROOT)/usr/local/include/ -I$(SDKROOT)/$(KPINCDIR)/AppleFeatures INCFLAGS_PLATFORM = -I$(SDKROOT)/$(KPINCDIR)/platform ifneq ($(KDKROOT),) INCFLAGS_SDK += -I$(KDKROOT)/$(KPINCDIR) INCFLAGS_PLATFORM += -I$(KDKROOT)/$(KINCDIR)/platform -I$(KDKROOT)/$(KPINCDIR)/platform endif -INCFLAGS = $(INCFLAGS_LOCAL) $(INCFLAGS_GEN) $(INCFLAGS_IMPORT) $(INCFLAGS_EXTERN) $(INCFLAGS_MAKEFILE) $(INCFLAGS_SDK) $(INCFLAGS_PLATFORM) +INCFLAGS = $(INCFLAGS_LOCAL) $(INCFLAGS_GEN) $(INCFLAGS_IMPORT) $(INCFLAGS_EXTERN) $(INCFLAGS_MAKEFILE) $(INCFLAGS_KERNEL) $(INCFLAGS_SDK) $(INCFLAGS_PLATFORM) # # Default MIGFLAGS @@ -1319,19 +1363,20 @@ KERNEL_ONLY_GUARDS_UNIFDEF = # Test-only guards should be stripped from all exported headers TEST_ONLY_GUARDS_UNIFDEF = -USCHED_TEST_HARNESS -SPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) -SINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -UDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) -DKPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -DDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) -DKINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -DDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) -EKPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -DEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) -EKINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -UDRIVERKIT -DEXCLAVEKIT -UEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) -ECPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -UEXCLAVEKIT -DEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) -ECINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -UDRIVERKIT -UEXCLAVEKIT -DEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) -KPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DKERNEL_PRIVATE -DKERNEL -DPRIVATE -UDRIVERKIT -UXNU_LIBCXX_SDKROOT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) -LIBCXXINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DKERNEL_PRIVATE -DKERNEL -DPRIVATE -UDRIVERKIT -DXNU_LIBCXX_SDKROOT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) -KINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -DKERNEL -UPRIVATE -UDRIVERKIT -UEXCLAVEKIT 
-UEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) -PDATA_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DPRIVATE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) -DATA_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UPRIVATE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) +SPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +SINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -UDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +SFPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +DKPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -DDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +DKINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -DDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +EKPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -DEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +EKINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -UDRIVERKIT -DEXCLAVEKIT -UEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +ECPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -UDRIVERKIT -UEXCLAVEKIT -DEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +ECINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -UDRIVERKIT -UEXCLAVEKIT -DEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(KERNEL_ONLY_GUARDS_UNIFDEF) $(TEST_ONLY_GUARDS_UNIFDEF) +KPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UMODULES_SUPPORTED -DKERNEL_PRIVATE -DKERNEL -DPRIVATE -UDRIVERKIT -UXNU_LIBCXX_SDKROOT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) +LIBCXXINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UMODULES_SUPPORTED -DKERNEL_PRIVATE -DKERNEL -DPRIVATE -UDRIVERKIT -DXNU_LIBCXX_SDKROOT -UEXCLAVEKIT -UEXCLAVECORE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) +KINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UMODULES_SUPPORTED -UKERNEL_PRIVATE -DKERNEL -UPRIVATE -UDRIVERKIT -UEXCLAVEKIT -UEXCLAVECORE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) 
+PDATA_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -DPRIVATE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) +DATA_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DMODULES_SUPPORTED -UPRIVATE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ $(TEST_ONLY_GUARDS_UNIFDEF) # # Compononent Header file destinations diff --git a/makedefs/MakeInc.kernel b/makedefs/MakeInc.kernel index 3939409d9..a544e650f 100644 --- a/makedefs/MakeInc.kernel +++ b/makedefs/MakeInc.kernel @@ -131,6 +131,7 @@ ifeq ($(DO_CTFMERGE),1) @$(LOG_CTFCONVERT) "$(@F)" $(_v)$(CTFCONVERT) -c -l xnu -u xnu -o $@ $(TARGET)/$(KERNEL_FILE_NAME).dSYM/Contents/Resources/DWARF/$(KERNEL_FILE_NAME) endif + $(_v)$(TOUCH) $@ # Strip the kernel and merge in the CTF $(TARGET)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped.noctf $(TARGET)/$(KERNEL_FILE_NAME).dSYM $(TARGET)/$(KERNEL_FILE_NAME).ctf @@ -180,10 +181,10 @@ endif $(_v)$(TOUCH) $@ ifeq ($(BUILD_XNU_LIBRARY),1) -$(TARGET)/lib$(KERNEL_FILE_NAME).a: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).libfilelist)) nonlto.o version.o .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) +$(TARGET)/lib$(KERNEL_FILE_NAME).a: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).libfilelist)) nonlto.o version.o lastkerneldataconst.o .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) @$(LOG_LIBTOOL) "$(@F)" - $(_v)$(CAT) $(filter %.libfilelist,$+) < /dev/null > link.filelist - $(_v)$(LIBTOOL) -static -csD -filelist link.filelist -o $@ + $(_v)$(CAT) $(filter %.libfilelist,$+) < /dev/null > archive.filelist + $(_v)$(LIBTOOL) -static -csD -filelist archive.filelist version.o lastkerneldataconst.o -o $@ $(_v)$(LN) $(call function_convert_build_config_to_objdir,$(CURRENT_BUILD_CONFIG))/lib$(KERNEL_FILE_NAME).a $(OBJROOT)/lib$(KERNEL_FILE_NAME).a endif @@ -235,9 +236,15 @@ else $(_v)$(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_ONLY) -filelist link.filelist $(filter %.o,$+) -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES) endif +ifeq ($(RC_ProjectName),xnu_libraries) +$(TARGET)/compile_commands.json: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).libfilelist)) + $(_v)files="$$(sed -e 's/$$/.json/' $(filter %.libfilelist,$+))"; \ + sed -e '1s/^/[\'$$'\n''/' -e '$$s/,$$/\'$$'\n'']/' $$files > compile_commands.json +else $(TARGET)/compile_commands.json: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) $(_v)files="$$(sed -e 's/$$/.json/' $(filter %.filelist,$+))"; \ sed -e '1s/^/[\'$$'\n''/' -e '$$s/,$$/\'$$'\n'']/' $$files > compile_commands.json +endif # for now, rename LASTDATA_CONST to LAST on static kernel cache builds EXTRA_KC_LINKARGS = -Wl,-rename_segment,__LASTDATA_CONST,__LAST diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index 3290dd5d8..4b8a60f71 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -307,8 +307,8 @@ $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_IF_MI_LCL_GEN_FILES),1,ipincmi INSTALL_SF_MI_LCL_FILES = $(addprefix $(DSTROOT)/$(SPINCDIR)/$(INSTALL_MI_DIR)/, $(sort $(INSTALL_SF_MI_LCL_LIST))) INSTALL_SF_MI_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(SPINCDIR)/$(INSTALL_MI_DIR)/, $(sort $(INSTALL_SF_MI_LCL_GEN_LIST))) -$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_SF_MI_LCL_FILES),,spincmidir,$(SPINCFRAME_UNIFDEF))) 
-$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_SF_MI_LCL_GEN_FILES),1,spincmigendir,$(SPINCFRAME_UNIFDEF))) +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_SF_MI_LCL_FILES),,spincmidir,$(SFPINCFRAME_UNIFDEF))) +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_SF_MI_LCL_GEN_FILES),1,spincmigendir,$(SFPINCFRAME_UNIFDEF))) ifeq ($(DRIVERKIT),1) INSTALL_DRIVERKIT_MI_LCL_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MI_DIR)/, $(INSTALL_DRIVERKIT_MI_LCL_LIST)) @@ -397,8 +397,8 @@ $(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_MODULEMAP_MD_LCL_FILES),,mmpin INSTALL_SF_MD_LCL_FILES = $(addprefix $(DSTROOT)/$(SPINCDIR)/$(INSTALL_MD_DIR)/, $(sort $(INSTALL_SF_MD_LCL_LIST))) INSTALL_SF_MD_LCL_GEN_FILES = $(addprefix $(DSTROOT)/$(SPINCDIR)/$(INSTALL_MD_DIR)/, $(sort $(INSTALL_SF_MD_LCL_GEN_LIST))) -$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_SF_MD_LCL_FILES),,spincdir,$(SPINCFRAME_UNIFDEF))) -$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_SF_MD_LCL_GEN_FILES),1,spincgendir,$(SPINCFRAME_UNIFDEF))) +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_SF_MD_LCL_FILES),,spincdir,$(SFPINCFRAME_UNIFDEF))) +$(eval $(call INSTALLHDRS_RULE_template,$(INSTALL_SF_MD_LCL_GEN_FILES),1,spincgendir,$(SFPINCFRAME_UNIFDEF))) ifeq ($(DRIVERKIT),1) INSTALL_DRIVERKIT_MD_LCL_FILES = $(addprefix $(DSTROOT)/$(DRIVERKITLCLDIR)/$(INSTALL_MD_DIR)/, $(INSTALL_DRIVERKIT_MD_LCL_LIST)) diff --git a/makedefs/MakeInc.top b/makedefs/MakeInc.top index f85a166a1..964d52266 100644 --- a/makedefs/MakeInc.top +++ b/makedefs/MakeInc.top @@ -163,13 +163,13 @@ DEVICEMAP_PRODUCT_SOC_MAPPINGS := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPAT # use embedded_device_map endif -# Map a product like "n75" to "arm;t8002" +# Map a product like "n75" to "arm;t8002", or potentially multiple "soc;platform" strings, if the product has multiple platforms. 
# $(1) is a product name in lower case -function_lookup_product = $(call function_substitute_word_with_replacement, \ +function_lookup_product = $(call function_parse_product_configs,$(call function_substitute_word_with_replacement, \ $(1), \ $(DEVICEMAP_PRODUCT_SOC_MAPPINGS), \ unknown_arch_for_$(1);unknown_platform_for_$(1) \ - ) + )) ifneq ($(PLATFORM),MacOSX) ifneq ($(EMBEDDED_DEVICE_MAP),) @@ -206,7 +206,7 @@ TARGET_CONFIGS_ALIASES := $(foreach my_devicemap_config,$(foreach my_product_con else ifeq (xnu_headers_driverkit,$(RC_ProjectName)) # generate TARGET_CONFIGS for all kernel configs for B&I TARGET_CONFIGS = DEVELOPMENT arm64 DEFAULT DEVELOPMENT X86_64 DEFAULT -else ifneq ($(filter %_release_embedded,$(MAKECMDGOALS)),) +else ifneq ($(filter %_release_embedded,$(MAKECMDGOALS))$(filter %_release_embedded_nohdrs,$(MAKECMDGOALS)),) # generate TARGET_CONFIGS for RELEASE kernel configs and products in the device map TARGET_CONFIGS := $(foreach my_devicemap_config,$(foreach my_arch_config,$(ARCH_CONFIGS_EMBEDDED),$(foreach my_product_config,$(DEVICEMAP_PRODUCTS_$(my_arch_config)),$(call function_lookup_product,$(my_product_config)))),$(foreach my_kernel_config,RELEASE,$(my_kernel_config) $(subst ;, ,$(my_devicemap_config)))) TARGET_CONFIGS += $(EXTRA_TARGET_CONFIGS_RELEASE) @@ -221,7 +221,7 @@ else ifneq ($(filter %_embedded,$(MAKECMDGOALS)),) TARGET_CONFIGS := $(foreach my_devicemap_config,$(foreach my_arch_config,$(ARCH_CONFIGS_EMBEDDED),$(foreach my_product_config,$(DEVICEMAP_PRODUCTS_$(my_arch_config)),$(call function_lookup_product,$(my_product_config)))),$(foreach my_kernel_config,$(KERNEL_CONFIGS_EMBEDDED),$(my_kernel_config) $(subst ;, ,$(my_devicemap_config)))) TARGET_CONFIGS += $(foreach my_kernel_config,$(KERNEL_CONFIGS_EMBEDDED),$(EXTRA_TARGET_CONFIGS_$(my_kernel_config))) TARGET_CONFIGS_ALIASES := $(foreach my_devicemap_config,$(foreach my_arch_config,$(ARCH_CONFIGS_EMBEDDED),$(foreach my_product_config,$(DEVICEMAP_PRODUCTS_$(my_arch_config)),$(call function_lookup_product_alias,$(my_product_config)))),$(foreach my_kernel_config,$(KERNEL_CONFIGS_EMBEDDED),$(my_kernel_config) $(subst ;, ,$(my_devicemap_config)))) -else ifneq ($(filter %_release_desktop,$(MAKECMDGOALS)),) +else ifneq ($(filter %_release_desktop,$(MAKECMDGOALS))$(filter %_release_desktop_nohdrs,$(MAKECMDGOALS)),) # generate TARGET_CONFIGS for B&I release builds TARGET_CONFIGS := $(foreach my_kern_config, RELEASE, $(foreach my_arch_config, $(ARCH_CONFIGS_DESKTOP), $(foreach my_machine_config, $(MACHINE_CONFIGS), $(my_kern_config) $(my_arch_config) $(my_machine_config)))) TARGET_CONFIGS += $(foreach my_devicemap_config,$(foreach my_arch_config,$(ARCH_CONFIGS_EMBEDDED),$(foreach my_product_config,$(DEVICEMAP_PRODUCTS_OSX_$(my_arch_config)),$(call function_lookup_product,$(my_product_config)))),$(foreach my_kernel_config,RELEASE,$(my_kernel_config) $(subst ;, ,$(my_devicemap_config)))) @@ -627,7 +627,7 @@ final_touch_config_timestamps: config_install_bootstrap install_config_primary i # Aggregate install targets, which install everything appropriate for the current build alias/make target # -.PHONY: install +.PHONY: install install_nohdrs ifneq ($(filter $(RC_ProjectName),xnu_debug),) install: install_kernels @@ -648,11 +648,12 @@ export INSTALLHDRS_SKIP_HOST=YES export EXPORTHDRS_SKIP_EXCLAVES=YES else -install: installhdrs install_textfiles install_config install_kernels install_aliases +install_nohdrs: install_textfiles install_config install_kernels install_aliases +install: installhdrs install_nohdrs endif 
-.PHONY: install_embedded install_release_embedded install_development_embedded -.PHONY: install_desktop install_release_desktop install_development_desktop +.PHONY: install_embedded install_release_embedded install_development_embedded install_release_embedded_nohdrs +.PHONY: install_desktop install_release_desktop install_development_desktop install_release_desktop_nohdrs # By default, all kernel files, headers, text files, and pseudo-kexts are installed install_embedded install_release_embedded install_desktop install_release_desktop: install @@ -660,6 +661,9 @@ install_embedded install_release_embedded install_desktop install_release_deskto # These special configs only install the kernel files install_development_embedded install_development_desktop: install_kernels install_aliases +# These install release kernels, text files, and pseudo-kexts, but no headers +install_release_embedded_nohdrs install_release_desktop_nohdrs: install_nohdrs + .PHONY: install_kernels final_touch_kernel_timestamps install_aliases install_kernels: build_install_primary_bootstrap build_install_non_primary_bootstrap final_touch_kernel_timestamps diff --git a/osfmk/UserNotification/KUNCUserNotifications.c b/osfmk/UserNotification/KUNCUserNotifications.c index 8cde4e86f..a032d77d6 100644 --- a/osfmk/UserNotification/KUNCUserNotifications.c +++ b/osfmk/UserNotification/KUNCUserNotifications.c @@ -66,6 +66,7 @@ static void UNDReply_no_senders(ipc_port_t port, mach_port_mscount_t mscount); IPC_KOBJECT_DEFINE(IKOT_UND_REPLY, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = UNDReply_no_senders); @@ -187,7 +188,7 @@ KUNCGetNotificationID(void) reply = kalloc_type(struct UNDReply, Z_WAITOK | Z_ZERO | Z_NOFAIL); reply->self_port = ipc_kobject_alloc_port((ipc_kobject_t)reply, - IKOT_UND_REPLY, IPC_KOBJECT_ALLOC_NSREQUEST); + IKOT_UND_REPLY, IPC_KOBJECT_ALLOC_NONE); lck_mtx_init(&reply->lock, &UNDLckGrp, LCK_ATTR_NULL); reply->userLandNotificationKey = -1; reply->inprogress = FALSE; diff --git a/osfmk/arm/arm_features.inc b/osfmk/arm/arm_features.inc index 147e8fb05..2174becb4 100644 --- a/osfmk/arm/arm_features.inc +++ b/osfmk/arm/arm_features.inc @@ -32,6 +32,7 @@ * below based on the MSR that will be used to populate the data. 
*/ + /* Features from: ID_AA64ISAR0_EL1 */ ARM_FEATURE_FLAG(FEAT_CRC32); ARM_FEATURE_FLAG(FEAT_FlagM); @@ -73,6 +74,7 @@ ARM_FEATURE_FLAG(FEAT_RPRES); ARM_FEATURE_FLAG(FEAT_CSSC); ARM_FEATURE_FLAG(FEAT_HBC); + /* Features from: ID_AA64MMFR0_EL1 */ ARM_FEATURE_FLAG(FEAT_ECV); diff --git a/osfmk/arm/arm_init.c b/osfmk/arm/arm_init.c index 02ab93c12..2f207bd8c 100644 --- a/osfmk/arm/arm_init.c +++ b/osfmk/arm/arm_init.c @@ -153,8 +153,15 @@ MACHINE_TIMEOUT_DEV_WRITEABLE(stackshot_interrupt_masked_timeout, "sshot-interru #define XCALL_ACK_TIMEOUT_NS ((uint64_t) 6000000000) uint64_t xcall_ack_timeout_abstime; -boot_args const_boot_args __attribute__((section("__DATA, __const"))); -boot_args *BootArgs __attribute__((section("__DATA, __const"))); +#ifndef __BUILDING_XNU_LIBRARY__ +#define BOOTARGS_SECTION_ATTR __attribute__((section("__DATA, __const"))) +#else /* __BUILDING_XNU_LIBRARY__ */ +/* Special segments are not used when building for user-mode */ +#define BOOTARGS_SECTION_ATTR +#endif /* __BUILDING_XNU_LIBRARY__ */ + +boot_args const_boot_args BOOTARGS_SECTION_ATTR; +boot_args *BootArgs BOOTARGS_SECTION_ATTR; TUNABLE(uint32_t, arm_diag, "diag", 0); #ifdef APPLETYPHOON @@ -326,20 +333,6 @@ arm_auxkc_init(void *mh, void *base) #endif /* defined(HAS_APPLE_PAC) */ } -/* - * Routine: arm_setup_pre_sign - * Function: Perform HW initialization that must happen ahead of the first PAC sign - * operation. - */ -static void -arm_setup_pre_sign(void) -{ -#if __arm64__ - /* DATA TBI, if enabled, affects the number of VA bits that contain the signature */ - arm_set_kernel_tbi(); -#endif /* __arm64 */ -} - /* * Routine: arm_init * Function: Runs on the boot CPU, once, on entry from iBoot. @@ -357,8 +350,6 @@ arm_init( DTEntry chosen = NULL; unsigned int dt_entry_size = 0; - arm_setup_pre_sign(); - arm_slide_rebase_and_sign_image(); /* If kernel integrity is supported, use a constant copy of the boot args. */ @@ -386,24 +377,40 @@ arm_init( configure_misc_apple_boot_args(); configure_misc_apple_regs(true); -#if (DEVELOPMENT || DEBUG) - unsigned long const *platform_stall_ptr = NULL; +#if HAS_UPSI_FAILURE_INJECTION + /* UPSI (Universal Panic and Stall Injection) Logic + * iBoot/XNU are both configured for failure injection at specific stages + * The injected failure and stage is populated through EDT properties by iBoot + * + * iBoot populates the EDT properties for XNU based upon PMU scratch bits + * This is done because the EDT is available sooner in XNU than the PMU Kext + */ + uint64_t const *upsi_info = NULL; + /* Not usable TUNABLE here because TUNABLEs are parsed at a later point. */ if (SecureDTLookupEntry(NULL, "/chosen", &chosen) != kSuccess) { panic("%s: Unable to find 'chosen' DT node", __FUNCTION__); } - // Not usable TUNABLE here because TUNABLEs are parsed at a later point. 
- if (SecureDTGetProperty(chosen, "xnu_platform_stall", (void const **)&platform_stall_ptr, + /* Check if there is a requested injection stage */ + if (SecureDTGetProperty(chosen, "injection_stage", (void const **)&upsi_info, &dt_entry_size) == kSuccess) { - xnu_platform_stall_value = *platform_stall_ptr; + assert3u(dt_entry_size, ==, 8); + xnu_upsi_injection_stage = *upsi_info; } - platform_stall_panic_or_spin(PLATFORM_STALL_XNU_LOCATION_ARM_INIT); + /* Check if there is a requested injection action */ + if (SecureDTGetProperty(chosen, "injection_action", (void const **)&upsi_info, + &dt_entry_size) == kSuccess) { + assert3u(dt_entry_size, ==, 8); + xnu_upsi_injection_action = *upsi_info; + } + + check_for_failure_injection(XNU_STAGE_ARM_INIT); chosen = NULL; // Force a re-lookup later on since VM addresses are not final at this point dt_entry_size = 0; -#endif +#endif // HAS_UPSI_FAILURE_INJECTION { @@ -784,7 +791,7 @@ arm_init_cpu( PE_arm_debug_enable_trace(should_kprintf); #endif /* DEVELOPMENT || DEBUG */ -#if KERNEL_INTEGRITY_KTRR || KERNEL_INTEGRITY_CTRR +#if KERNEL_INTEGRITY_KTRR || KERNEL_INTEGRITY_CTRR || KERNEL_INTEGRITY_PV_CTRR rorgn_validate_core(); #endif diff --git a/osfmk/arm/commpage/commpage.c b/osfmk/arm/commpage/commpage.c index 591ab6c20..3da898b79 100644 --- a/osfmk/arm/commpage/commpage.c +++ b/osfmk/arm/commpage/commpage.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -229,6 +230,8 @@ commpage_populate(void) #if __arm64__ *((uint8_t*)(_COMM_PAGE_APT_MSG_POLICY + _COMM_PAGE_RW_OFFSET)) = apt_msg_policy(); #endif + + commpage_set_erm_active(extended_research_mode_state()); } #define COMMPAGE_TEXT_SEGMENT "__TEXT_EXEC" @@ -386,6 +389,8 @@ _get_commpage_text_priv_address(void) } #if defined(__arm64__) + + /** * Initializes all commpage entries and sysctls for EL0 visible features in ID_AA64ISAR0_EL1 */ @@ -553,6 +558,7 @@ commpage_init_arm_optional_features_isar2(void) } } + /** * Initializes all commpage entries and sysctls for EL0 visible features in ID_AA64MMFR0_EL1 */ @@ -657,6 +663,9 @@ commpage_init_arm_optional_features_pfr1(uint64_t *commpage_bits) static void commpage_init_arm_optional_features_pfr2(__unused uint64_t *commpage_bits) { + uint64_t pfr2 __unused = __builtin_arm_rsr64("ID_AA64PFR2_EL1"); + + } /** @@ -1050,3 +1059,20 @@ commpage_update_apt_active(bool active) uint8_t *slot = (uint8_t *)(void *)(_COMM_PAGE_APT_ACTIVE + _COMM_PAGE_RW_OFFSET); os_atomic_store(slot, active ? 1 : 0, relaxed); } + +/* + * set the Extended Research Mode active indicator + */ +void +commpage_set_erm_active(bool active) +{ + if (startup_phase < STARTUP_SUB_LOCKDOWN) { + uint8_t *slot = (uint8_t *)(void *)(_COMM_PAGE_SECURITY_RESEARCH_DEVICE_ERM_ACTIVE + _COMM_PAGE_RW_OFFSET); + os_atomic_store(slot, active ? 1 : 0, relaxed); + } +#if DEVELOPMENT || DEBUG + else { + kprintf("ERROR can't set ERM bit at startup_phase 0x%x. 
Action is ignored\n", startup_phase); + } +#endif +} diff --git a/osfmk/arm/commpage/commpage.h b/osfmk/arm/commpage/commpage.h index 7d7e8c024..747baa708 100644 --- a/osfmk/arm/commpage/commpage.h +++ b/osfmk/arm/commpage/commpage.h @@ -52,5 +52,8 @@ extern void commpage_update_dof(boolean_t enabled); extern void commpage_update_dyld_flags(uint64_t value); extern uint32_t commpage_is_in_pfz64(addr64_t addr); extern void commpage_update_apt_active(bool active); +#if defined(PRIVATE) +extern void commpage_set_erm_active(bool active); +#endif #endif /* _ARM_COMMPAGE_H */ diff --git a/osfmk/arm/commpage/commpage_asm.s b/osfmk/arm/commpage/commpage_asm.s index da0bbba9b..f5a477fce 100644 --- a/osfmk/arm/commpage/commpage_asm.s +++ b/osfmk/arm/commpage/commpage_asm.s @@ -300,7 +300,7 @@ Lset_new_tail: Ltrylock_enqueue_exit: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _pfz_trylock_and_enqueue /* Non-preemptible helper routine to FIFO dequeue: * void *pfz_trylock_and_dequeue(OSFifoQueueHead *__list, size_t __offset, uint32_t *lock_addr); @@ -350,7 +350,7 @@ Lreturn_head: Ltrylock_dequeue_exit: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _pfz_trylock_and_dequeue /* Preemptible functions */ @@ -410,7 +410,7 @@ Lenqueue_take_delayed_preemption_upon_success: Lenqueue_success: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _pfz_enqueue /* * void *pfz_dequeue(OSFifoQueueHead *__list, size_t __offset); @@ -466,7 +466,7 @@ Ldequeue_clear_monitor: Ldequeue_success: mov x0, x9 // Move x9 (where result was stored earlier) to x0 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _pfz_dequeue /* void preempt_self(void) @@ -493,7 +493,7 @@ _preempt_self: ldp x0, x1, [sp], #16 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _preempt_self /* * void backoff(uint32_t *lock_addr); @@ -525,4 +525,4 @@ Lend_backoff: clrex POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _backoff diff --git a/osfmk/arm/cpu_capabilities.h b/osfmk/arm/cpu_capabilities.h index 83b0c6d7a..ef8f76c38 100644 --- a/osfmk/arm/cpu_capabilities.h +++ b/osfmk/arm/cpu_capabilities.h @@ -381,6 +381,12 @@ _Static_assert((_COMM_PAGE64_BASE_ADDRESS >= _COMM_PAGE64_NESTING_START) && #define _COMM_PAGE_APT_ACTIVE (_COMM_PAGE_START_ADDRESS+0x341) // uint8_t for APT active status (infrequently mutated) +#if defined(PRIVATE) +#define _COMM_PAGE_SECURITY_RESEARCH_DEVICE_ERM_ACTIVE (_COMM_PAGE_START_ADDRESS+0x342) // uint8_t for ERM active status (set at boot time) +#else +#define _COMM_PAGE_RESERVED_0 (_COMM_PAGE_START_ADDRESS+0x342) +#endif + #define _COMM_PAGE_END (_COMM_PAGE_START_ADDRESS+0xfff) // end of common page #if defined(__LP64__) diff --git a/osfmk/arm/cpu_capabilities_public.h b/osfmk/arm/cpu_capabilities_public.h index e7d0410af..fe37a7ad8 100644 --- a/osfmk/arm/cpu_capabilities_public.h +++ b/osfmk/arm/cpu_capabilities_public.h @@ -114,10 +114,11 @@ #define CAP_BIT_FP_SyncExceptions 73 + /* Legacy definitions for backwards compatibility */ #define CAP_BIT_CRC32 CAP_BIT_FEAT_CRC32 /* Total number of FEAT bits. 
*/ -#define CAP_BIT_NB 74 +#define CAP_BIT_NB 80 #endif /* _ARM_CPU_CAPABILITIES_PUBLIC_H */ diff --git a/osfmk/arm/cpu_common.c b/osfmk/arm/cpu_common.c index ebf6e4816..73fa14343 100644 --- a/osfmk/arm/cpu_common.c +++ b/osfmk/arm/cpu_common.c @@ -211,6 +211,23 @@ cpu_idle_tickle(void) (void) ml_set_interrupts_enabled(intr); } +/* + * Routine: cpu_set_perfcontrol_timer + * + */ +void +cpu_set_perfcontrol_timer(uint64_t now, uint64_t timeout_ticks) +{ + assert(ml_get_interrupts_enabled() == FALSE); + processor_t processor = current_processor(); + if (timeout_ticks == EndOfAllTime) { + running_timer_cancel(processor, RUNNING_TIMER_PERFCONTROL); + } else { + uint64_t deadline = now + timeout_ticks; + running_timer_enter(processor, RUNNING_TIMER_PERFCONTROL, NULL, deadline, now); + } +} + static void cpu_handle_xcall(cpu_data_t *cpu_data_ptr) { @@ -222,25 +239,25 @@ cpu_handle_xcall(cpu_data_t *cpu_data_ptr) * added SIGPxcall to the pending mask, but hasn't yet assigned the call params.*/ if (cpu_data_ptr->cpu_xcall_p0 != NULL && cpu_data_ptr->cpu_xcall_p1 != NULL) { xfunc = ptrauth_auth_function(cpu_data_ptr->cpu_xcall_p0, ptrauth_key_function_pointer, cpu_data_ptr); - INTERRUPT_MASKED_DEBUG_START(xfunc, DBG_INTR_TYPE_IPI); + ml_interrupt_masked_debug_start(xfunc, DBG_INTR_TYPE_IPI); xparam = cpu_data_ptr->cpu_xcall_p1; cpu_data_ptr->cpu_xcall_p0 = NULL; cpu_data_ptr->cpu_xcall_p1 = NULL; os_atomic_thread_fence(acq_rel); os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPxcall, relaxed); xfunc(xparam); - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } if (cpu_data_ptr->cpu_imm_xcall_p0 != NULL && cpu_data_ptr->cpu_imm_xcall_p1 != NULL) { xfunc = ptrauth_auth_function(cpu_data_ptr->cpu_imm_xcall_p0, ptrauth_key_function_pointer, cpu_data_ptr); - INTERRUPT_MASKED_DEBUG_START(xfunc, DBG_INTR_TYPE_IPI); + ml_interrupt_masked_debug_start(xfunc, DBG_INTR_TYPE_IPI); xparam = cpu_data_ptr->cpu_imm_xcall_p1; cpu_data_ptr->cpu_imm_xcall_p0 = NULL; cpu_data_ptr->cpu_imm_xcall_p1 = NULL; os_atomic_thread_fence(acq_rel); os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPxcallImm, relaxed); xfunc(xparam); - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } } @@ -402,7 +419,7 @@ cpu_signal_internal(cpu_data_t *target_proc, /* We'll mandate that only IPIs meant to kick a core out of idle may ever be deferred. */ if (defer) { - assert(signal == SIGPnop); + assert(signal == SIGPnop || signal == SIGPdeferred); } if ((signal == SIGPxcall) || (signal == SIGPxcallImm)) { @@ -519,16 +536,20 @@ cpu_signal(cpu_data_t *target_proc, } kern_return_t -cpu_signal_deferred(cpu_data_t *target_proc) +cpu_signal_deferred(cpu_data_t *target_proc, cpu_signal_t signal) { - return cpu_signal_internal(target_proc, SIGPnop, NULL, NULL, TRUE); + return cpu_signal_internal(target_proc, signal, NULL, NULL, TRUE); } void -cpu_signal_cancel(cpu_data_t *target_proc) +cpu_signal_cancel(cpu_data_t *target_proc, cpu_signal_t signal) { - /* TODO: Should we care about the state of a core as far as squashing deferred IPIs goes? 
*/ - if (!(target_proc->cpu_signal & SIGPdisabled)) { + cpu_signal_t current_signals; + + current_signals = os_atomic_andnot(&target_proc->cpu_signal, signal, acq_rel); + + + if (!(current_signals & SIGPdisabled)) { #if defined(HAS_IPI) if (gFastIPI) { ml_cpu_signal_retract(target_proc->cpu_phys_id); @@ -586,17 +607,17 @@ cpu_signal_handler_internal(boolean_t disable_signal) while (cpu_signal & ~SIGPdisabled) { if (cpu_signal & SIGPdebug) { os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPdebug, acquire); - INTERRUPT_MASKED_DEBUG_START(DebuggerXCall, DBG_INTR_TYPE_IPI); + ml_interrupt_masked_debug_start(DebuggerXCall, DBG_INTR_TYPE_IPI); DebuggerXCall(cpu_data_ptr->cpu_int_state); - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } #if KPERF if (cpu_signal & SIGPkppet) { os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPkppet, acquire); extern void kperf_signal_handler(void); - INTERRUPT_MASKED_DEBUG_START(kperf_signal_handler, DBG_INTR_TYPE_IPI); + ml_interrupt_masked_debug_start(kperf_signal_handler, DBG_INTR_TYPE_IPI); kperf_signal_handler(); - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } #endif /* KPERF */ if (cpu_signal & (SIGPxcall | SIGPxcallImm)) { @@ -604,15 +625,18 @@ cpu_signal_handler_internal(boolean_t disable_signal) } if (cpu_signal & SIGPast) { os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPast, acquire); - INTERRUPT_MASKED_DEBUG_START(ast_check, DBG_INTR_TYPE_IPI); + ml_interrupt_masked_debug_start(ast_check, DBG_INTR_TYPE_IPI); ast_check(current_processor()); - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } if (cpu_signal & SIGPTimerLocal) { os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPTimerLocal, acquire); - INTERRUPT_MASKED_DEBUG_START(timer_queue_expire_local, DBG_INTR_TYPE_IPI); + ml_interrupt_masked_debug_start(timer_queue_expire_local, DBG_INTR_TYPE_IPI); timer_queue_expire_local(current_processor()); - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); + } + if (cpu_signal & SIGPdeferred) { + os_atomic_andnot(&cpu_data_ptr->cpu_signal, SIGPdeferred, acquire); } cpu_signal = os_atomic_or(&cpu_data_ptr->cpu_signal, 0, acquire); diff --git a/osfmk/arm/cpu_data.h b/osfmk/arm/cpu_data.h index 60c931a7a..743117b84 100644 --- a/osfmk/arm/cpu_data.h +++ b/osfmk/arm/cpu_data.h @@ -48,6 +48,8 @@ __ASSUME_PTR_ABI_SINGLE_BEGIN + +#ifndef __BUILDING_XNU_LIB_UNITTEST__ static inline __attribute__((const)) thread_t current_thread_fast(void) { @@ -67,6 +69,10 @@ current_thread_fast(void) return __unsafe_forge_single(thread_t, __builtin_arm_mrc(15, 0, 13, 0, 4)); #endif } +#else /* __BUILDING_XNU_LIB_UNITTEST__ */ +__attribute__((const)) thread_t current_thread_fast(void); +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ + /* * The "volatile" flavor of current_thread() is intended for use by diff --git a/osfmk/arm/cpu_data_internal.h b/osfmk/arm/cpu_data_internal.h index 8700d4d23..673f79896 100644 --- a/osfmk/arm/cpu_data_internal.h +++ b/osfmk/arm/cpu_data_internal.h @@ -66,8 +66,10 @@ static_assert(sizeof(cpumap_t) * CHAR_BIT >= MAX_CPUS, "cpumap_t bitvector is to #define CPUWINDOWS_BASE (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) #define CPUWINDOWS_TOP (CPUWINDOWS_BASE + (MAX_CPUS * CPUWINDOWS_MAX * ARM_PGBYTES)) +#ifndef __BUILDING_XNU_LIBRARY__ /* in user-mode kernel addresses are low */ static_assert((CPUWINDOWS_BASE >= VM_MIN_KERNEL_ADDRESS) && ((CPUWINDOWS_TOP - 1) <= VM_MAX_KERNEL_ADDRESS), "CPU copy windows too large for CPUWINDOWS_BASE_MASK value"); +#endif typedef struct cpu_data_entry { void 
*cpu_data_paddr; /* Cpu data physical address */ @@ -126,6 +128,7 @@ __options_closed_decl(cpu_signal_t, unsigned int, { SIGPkppet = 0x00000100U, /* Request kperf PET handler */ SIGPxcallImm = 0x00000200U, /* Send a cross-call, fail if already pending */ SIGPTimerLocal = 0x00000400U, /* Update the decrementer via timer_queue_expire_local */ + SIGPdeferred = 0x00000800U, /* Scheduler deferred IPI to wake core */ SIGPdisabled = 0x80000000U, /* Signal disabled */ }); @@ -157,9 +160,7 @@ typedef struct cpu_data { bool cpu_hibernate; /* This cpu is currently hibernating the system */ bool cpu_running; bool cluster_master; -#if ERET_IS_NOT_CONTEXT_SYNCHRONIZING bool sync_on_cswitch; -#endif /* ERET_IS_NOT_CONTEXT_SYNCHRONIZING */ /* true if processor_start() or processor_exit() is operating on this CPU */ bool in_state_transition; @@ -227,6 +228,7 @@ typedef struct cpu_data { unsigned int cpu_sleep_token_last; cluster_type_t cpu_cluster_type; + uint32_t cpu_cluster_id; uint32_t cpu_l2_id; uint32_t cpu_l2_size; @@ -313,12 +315,4 @@ extern void cpu_data_init(cpu_data_t *cpu_data_ptr); extern void cpu_data_register(cpu_data_t *cpu_data_ptr); extern cpu_data_t *processor_to_cpu_datap( processor_t processor); -#if __arm64__ -typedef struct sysreg_restore { - uint64_t tcr_el1; -} sysreg_restore_t; - -extern sysreg_restore_t sysreg_restore; -#endif /* __arm64__ */ - #endif /* ARM_CPU_DATA_INTERNAL */ diff --git a/osfmk/arm/cpu_internal.h b/osfmk/arm/cpu_internal.h index 83cc70ec4..717b8e145 100644 --- a/osfmk/arm/cpu_internal.h +++ b/osfmk/arm/cpu_internal.h @@ -50,10 +50,12 @@ extern kern_return_t cpu_signal( void *p1); extern kern_return_t cpu_signal_deferred( - cpu_data_t *target); + cpu_data_t *target, + cpu_signal_t signal); extern void cpu_signal_cancel( - cpu_data_t *target); + cpu_data_t *target, + cpu_signal_t signal); extern bool cpu_has_SIGPdebug_pending(void); diff --git a/osfmk/arm/cpu_topology.h b/osfmk/arm/cpu_topology.h index 03798879f..4f873b368 100644 --- a/osfmk/arm/cpu_topology.h +++ b/osfmk/arm/cpu_topology.h @@ -39,34 +39,6 @@ #include __BEGIN_DECLS -/*! - * @typedef ml_topology_cpu_t - * @brief Describes one CPU core in the topology. - * - * @field cpu_id Logical CPU ID: 0, 1, 2, 3, 4, ... - * Dynamically assigned by XNU so it might not match EDT. No holes. - * @field phys_id Physical CPU ID (EDT: reg). Same as MPIDR[15:0], i.e. - * (cluster_id << 8) | core_number_within_cluster - * @field cluster_id Logical Cluster ID: 0, 1, 2, 3, 4, ... - * Dynamically assigned by XNU so it might not match EDT. No holes. - * @field die_id Die ID (EDT: die-id) - * @field cluster_type The type of CPUs found in this cluster. - * @field l2_cache_size Size of the L2 cache, in bytes. 0 if unknown or not present. - * @field l2_cache_id l2-cache-id property read from EDT. - * @field l3_cache_size Size of the L3 cache, in bytes. 0 if unknown or not present. - * @field l3_cache_id l3-cache-id property read from EDT. - * @field cpu_IMPL_regs IO-mapped virtual address of cpuX_IMPL (implementation-defined) register block. - * @field cpu_IMPL_pa Physical address of cpuX_IMPL register block. - * @field cpu_IMPL_len Length of cpuX_IMPL register block. - * @field cpu_UTTDBG_regs IO-mapped virtual address of cpuX_UTTDBG register block. - * @field cpu_UTTDBG_pa Physical address of cpuX_UTTDBG register block, if set in DT, else zero - * @field cpu_UTTDBG_len Length of cpuX_UTTDBG register block, if set in DT, else zero - * @field coresight_regs IO-mapped virtual address of CoreSight debug register block. 
- * @field coresight_pa Physical address of CoreSight register block. - * @field coresight_len Length of CoreSight register block. - * @field die_cluster_id Physical cluster ID within the local die (EDT: die-cluster-id) - * @field cluster_core_id Physical core ID within the local cluster (EDT: cluster-core-id) - */ typedef struct ml_topology_cpu { unsigned int cpu_id; uint32_t phys_id; @@ -91,25 +63,6 @@ typedef struct ml_topology_cpu { unsigned int cluster_core_id; } ml_topology_cpu_t; -/*! - * @typedef ml_topology_cluster_t - * @brief Describes one cluster in the topology. - * - * @field cluster_id Cluster ID (EDT: cluster-id) - * @field cluster_type The type of CPUs found in this cluster. - * @field num_cpus Total number of usable CPU cores in this cluster. - * @field first_cpu_id The cpu_id of the first CPU in the cluster. - * @field cpu_mask A bitmask representing the cpu_id's that belong to the cluster. Example: - * If the cluster contains CPU4 and CPU5, cpu_mask will be 0x30. - * @field die_id Die ID. - * @field die_cluster_id Physical cluster ID within the local die (EDT: die-cluster-id) - * @field acc_IMPL_regs IO-mapped virtual address of acc_IMPL (implementation-defined) register block. - * @field acc_IMPL_pa Physical address of acc_IMPL register block. - * @field acc_IMPL_len Length of acc_IMPL register block. - * @field cpm_IMPL_regs IO-mapped virtual address of cpm_IMPL (implementation-defined) register block. - * @field cpm_IMPL_pa Physical address of cpm_IMPL register block. - * @field cpm_IMPL_len Length of cpm_IMPL register block. - */ typedef struct ml_topology_cluster { unsigned int cluster_id; cluster_type_t cluster_type; @@ -131,28 +84,6 @@ typedef struct ml_topology_cluster { // headers are compatible with the running kernel #define CPU_TOPOLOGY_VERSION 1 -/*! - * @typedef ml_topology_info_t - * @brief Describes the CPU topology for all APs in the system. Populated from EDT and read-only at runtime. - * @discussion This struct only lists CPU cores that are considered usable by both iBoot and XNU. Some - * physically present CPU cores may be considered unusable due to configuration options like - * the "cpus=" boot-arg. Cores that are disabled in hardware will not show up in EDT at all, so - * they also will not be present in this struct. - * - * @field version Version of the struct (set to CPU_TOPOLOGY_VERSION). - * @field num_cpus Total number of usable CPU cores. - * @field max_cpu_id The highest usable logical CPU ID. - * @field num_clusters Total number of AP CPU clusters on the system (usable or not). - * @field max_cluster_id The highest cluster ID found in EDT. - * @field cpus List of |num_cpus| entries. - * @field clusters List of |num_clusters| entries. - * @field boot_cpu Points to the |cpus| entry for the boot CPU. - * @field boot_cluster Points to the |clusters| entry which contains the boot CPU. - * @field chip_revision Silicon revision reported by iBoot, which comes from the - * SoC-specific fuse bits. See CPU_VERSION_xx macros for definitions. - * @field cluster_power_down Set to 1 if there exists at least one cluster on the system that can be - * power-gated at runtime. 
- */ typedef struct ml_topology_info { unsigned int version; unsigned int num_cpus; diff --git a/osfmk/arm/cpuid.c b/osfmk/arm/cpuid.c index f68d0e48e..a24c15ae2 100644 --- a/osfmk/arm/cpuid.c +++ b/osfmk/arm/cpuid.c @@ -175,8 +175,8 @@ cpuid_get_cpufamily(void) case CPU_PART_FIRESTORM_JADE_DIE: case CPU_PART_ICESTORM_JADE_CHOP: case CPU_PART_ICESTORM_JADE_DIE: - case CPU_PART_FIRESTORM: - case CPU_PART_ICESTORM: + case CPU_PART_FIRESTORM_SICILY: + case CPU_PART_ICESTORM_SICILY: case CPU_PART_FIRESTORM_TONGA: case CPU_PART_ICESTORM_TONGA: cpufamily = CPUFAMILY_ARM_FIRESTORM_ICESTORM; @@ -187,8 +187,8 @@ cpuid_get_cpufamily(void) case CPU_PART_AVALANCHE_RHODES_CHOP: case CPU_PART_BLIZZARD_RHODES_DIE: case CPU_PART_AVALANCHE_RHODES_DIE: - case CPU_PART_BLIZZARD: - case CPU_PART_AVALANCHE: + case CPU_PART_BLIZZARD_ELLIS: + case CPU_PART_AVALANCHE_ELLIS: cpufamily = CPUFAMILY_ARM_BLIZZARD_AVALANCHE; break; case CPU_PART_EVEREST: @@ -255,10 +255,10 @@ cpuid_get_cpusubfamily(void) case CPU_PART_TEMPEST: case CPU_PART_LIGHTNING: case CPU_PART_THUNDER: - case CPU_PART_FIRESTORM: - case CPU_PART_ICESTORM: - case CPU_PART_BLIZZARD: - case CPU_PART_AVALANCHE: + case CPU_PART_FIRESTORM_SICILY: + case CPU_PART_ICESTORM_SICILY: + case CPU_PART_BLIZZARD_ELLIS: + case CPU_PART_AVALANCHE_ELLIS: case CPU_PART_SAWTOOTH: case CPU_PART_EVEREST: cpusubfamily = CPUSUBFAMILY_ARM_HP; diff --git a/osfmk/arm/cpuid.h b/osfmk/arm/cpuid.h index 714244f83..4596693e7 100644 --- a/osfmk/arm/cpuid.h +++ b/osfmk/arm/cpuid.h @@ -158,11 +158,13 @@ typedef union { /* M10 e-Core (ARMv8 architecture) */ #define CPU_PART_THUNDER_M10 0x26 -/* H13 e-Core */ -#define CPU_PART_ICESTORM 0x20 +/* H13P e-Core */ +#define CPU_PART_ICESTORM 0x20 /* Prefer CPU_PART_ICESTORM_SICILY. */ +#define CPU_PART_ICESTORM_SICILY 0x20 -/* H13 p-Core */ -#define CPU_PART_FIRESTORM 0x21 +/* H13P p-Core */ +#define CPU_PART_FIRESTORM 0x21 /* Prefer CPU_PART_FIRESTORM_SICILY. */ +#define CPU_PART_FIRESTORM_SICILY 0x21 /* H13G e-Core */ #define CPU_PART_ICESTORM_TONGA 0x22 @@ -178,11 +180,13 @@ typedef union { #define CPU_PART_FIRESTORM_JADE_CHOP 0x25 #define CPU_PART_FIRESTORM_JADE_DIE 0x29 -/* H14 e-Core */ -#define CPU_PART_BLIZZARD 0x30 +/* H14P e-Core */ +#define CPU_PART_BLIZZARD 0x30 /* Prefer CPU_PART_BLIZZARD_ELLIS. */ +#define CPU_PART_BLIZZARD_ELLIS 0x30 -/* H14 p-Core */ -#define CPU_PART_AVALANCHE 0x31 +/* H14P p-Core */ +#define CPU_PART_AVALANCHE 0x31 /* Prefer CPU_PART_AVALANCHE_ELLIS. 
*/ +#define CPU_PART_AVALANCHE_ELLIS 0x31 /* H14G e-Core */ #define CPU_PART_BLIZZARD_STATEN 0x32 @@ -257,6 +261,8 @@ typedef union { + + /* Cache type identification */ /* Supported Cache Types */ diff --git a/osfmk/arm/data.s b/osfmk/arm/data.s index 5b8bd2e90..ef165e35e 100644 --- a/osfmk/arm/data.s +++ b/osfmk/arm/data.s @@ -95,7 +95,7 @@ LEXT(vfptrash_data) #if __arm64__ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #if CONFIG_SPTM .section __DATA_SPTM, __const .align 14 @@ -120,7 +120,7 @@ LEXT(ropagetable_begin) #endif #else LEXT(ropagetable_begin) -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ LEXT(ropagetable_end) diff --git a/osfmk/arm/io_map.c b/osfmk/arm/io_map.c index a8a92832e..44c8a2b79 100644 --- a/osfmk/arm/io_map.c +++ b/osfmk/arm/io_map.c @@ -76,7 +76,9 @@ io_map_init(void) VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL, VM_KERN_MEMORY_IOKIT).kmr_submap; } +#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* io map is not supported in unit-tests */ STARTUP(KMEM, STARTUP_RANK_LAST, io_map_init); +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ /* * Allocate and map memory for devices that may need to be mapped before @@ -124,7 +126,7 @@ io_map( kma_flags_t kmaflags = KMA_NOFAIL | KMA_PAGEABLE; if (unmappable) { - kmaflags |= KMA_DATA; + kmaflags |= KMA_DATA_SHARED; } else { kmaflags |= KMA_PERMANENT; } diff --git a/osfmk/arm/locks.h b/osfmk/arm/locks.h index 32f558405..bf65a02bf 100644 --- a/osfmk/arm/locks.h +++ b/osfmk/arm/locks.h @@ -106,6 +106,7 @@ extern bool has_lock_pv; #define wait_for_event() __builtin_arm_wfe() +#ifndef __BUILDING_XNU_LIB_UNITTEST__ #if SCHED_HYGIENE_DEBUG #define lock_disable_preemption_for_thread(t) ({ \ thread_t __dpft_thread = (t); \ @@ -128,6 +129,9 @@ extern bool has_lock_pv; os_atomic_store(__dpft_countp, *__dpft_countp + 1, compiler_acq_rel); \ }) #endif /* SCHED_HYGIENE_DEBUG */ +#else /* __BUILDING_XNU_LIB_UNITTEST__ */ +extern void lock_disable_preemption_for_thread(thread_t); +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ #define lock_enable_preemption() enable_preemption() #define lock_preemption_level_for_thread(t) get_preemption_level_for_thread(t) #define lock_preemption_disabled_for_thread(t) (get_preemption_level_for_thread(t) != 0) diff --git a/osfmk/arm/machine_cpu.h b/osfmk/arm/machine_cpu.h index ee63a25d4..757c59f7f 100644 --- a/osfmk/arm/machine_cpu.h +++ b/osfmk/arm/machine_cpu.h @@ -46,6 +46,7 @@ extern void cpu_doshutdown(void (*doshutdown)(processor_t), processor_t processo extern void cpu_idle(void); extern void cpu_idle_exit(boolean_t from_reset) __attribute__((noreturn)); extern void cpu_idle_tickle(void); +extern void cpu_set_perfcontrol_timer(uint64_t now, uint64_t timeout_ticks); extern void cpu_machine_idle_init(boolean_t from_boot); diff --git a/osfmk/arm/machine_cpuid.c b/osfmk/arm/machine_cpuid.c index d6c9a1cf9..128d5f6c8 100644 --- a/osfmk/arm/machine_cpuid.c +++ b/osfmk/arm/machine_cpuid.c @@ -34,7 +34,7 @@ static arm_mvfp_info_t cpuid_mvfp_info; static arm_debug_info_t cpuid_debug_info; -uint32_t +MARK_AS_FIXUP_TEXT uint32_t machine_read_midr(void) { uint64_t midr; diff --git a/osfmk/arm/machine_routines.h b/osfmk/arm/machine_routines.h index 7d8e08a8b..fa282f6e2 100644 --- a/osfmk/arm/machine_routines.h +++ 
b/osfmk/arm/machine_routines.h @@ -101,46 +101,39 @@ void siq_cpu_init(void); #ifdef XNU_KERNEL_PRIVATE +char ml_get_current_core_type(void); + /* did this interrupt context interrupt userspace? */ bool ml_did_interrupt_userspace(void); -/* Clear interrupt spin debug state for thread */ +#if SCHED_HYGIENE_DEBUG +void _ml_interrupt_masked_debug_start(uintptr_t handler_addr, int type); +void _ml_interrupt_masked_debug_end(void); +#endif /* SCHED_HYGIENE_DEBUG */ + +static inline void +ml_interrupt_masked_debug_start(void *handler_addr, int type) +{ +#if SCHED_HYGIENE_DEBUG + if (static_if(sched_debug_interrupt_disable)) { + _ml_interrupt_masked_debug_start((uintptr_t)handler_addr, type); + } +#else /* !SCHED_HYGIENE_DEBUG */ +#pragma unused(handler_addr, type) +#endif /* SCHED_HYGIENE_DEBUG */ +} + +static inline void +ml_interrupt_masked_debug_end(void) +{ +#if SCHED_HYGIENE_DEBUG + if (static_if(sched_debug_interrupt_disable)) { + _ml_interrupt_masked_debug_end(); + } +#endif /* SCHED_HYGIENE_DEBUG */ +} #if SCHED_HYGIENE_DEBUG -void mt_cur_cpu_cycles_instrs_speculative(uint64_t *cycles, uint64_t *instrs); - -#if CONFIG_CPU_COUNTERS -#define INTERRUPT_MASKED_DEBUG_CAPTURE_PMC(thread) \ - if (static_if(sched_debug_pmc)) { \ - mt_cur_cpu_cycles_instrs_speculative(&thread->machine.intmask_cycles, \ - &thread->machine.intmask_instr); \ - } -#else /* CONFIG_CPU_COUNTERS */ -#define INTERRUPT_MASKED_DEBUG_CAPTURE_PMC(thread) -#endif /* !CONFIG_CPU_COUNTERS */ - -#define INTERRUPT_MASKED_DEBUG_START(handler_addr, type) \ -do { \ - if (static_if(sched_debug_interrupt_disable) && os_atomic_load(&interrupt_masked_timeout, relaxed) > 0) { \ - thread_t thread = current_thread(); \ - thread->machine.int_type = type; \ - thread->machine.int_handler_addr = (uintptr_t)VM_KERNEL_STRIP_UPTR(handler_addr); \ - thread->machine.inthandler_timestamp = ml_get_sched_hygiene_timebase(); \ - INTERRUPT_MASKED_DEBUG_CAPTURE_PMC(thread); \ - thread->machine.int_vector = (uintptr_t)NULL; \ - } \ -} while (0) - -#define INTERRUPT_MASKED_DEBUG_END() \ -do { \ - if (static_if(sched_debug_interrupt_disable) && os_atomic_load(&interrupt_masked_timeout, relaxed) > 0) { \ - thread_t thread = current_thread(); \ - ml_handle_interrupt_handler_duration(thread); \ - thread->machine.inthandler_timestamp = 0; \ - thread->machine.inthandler_abandon = false; \ - } \ -} while (0) - void ml_irq_debug_start(uintptr_t handler, uintptr_t vector); void ml_irq_debug_end(void); void ml_irq_debug_abandon(void); @@ -151,22 +144,18 @@ void ml_spin_debug_clear_self(void); void ml_handle_interrupts_disabled_duration(thread_t thread); void ml_handle_stackshot_interrupt_disabled_duration(thread_t thread); void ml_handle_interrupt_handler_duration(thread_t thread); - -#else /* SCHED_HYGIENE_DEBUG */ - -#define INTERRUPT_MASKED_DEBUG_START(handler_addr, type) -#define INTERRUPT_MASKED_DEBUG_END() - #endif /* SCHED_HYGIENE_DEBUG */ extern bool ml_snoop_thread_is_on_core(thread_t thread); extern boolean_t ml_is_quiescing(void); extern void ml_set_is_quiescing(boolean_t); extern uint64_t ml_get_booter_memory_size(void); -#endif + +#endif /* XNU_KERNEL_PRIVATE */ /* Type for the Time Base Enable function */ typedef void (*time_base_enable_t)(cpu_id_t cpu_id, boolean_t enable); + #if defined(PEXPERT_KERNEL_PRIVATE) || defined(MACH_KERNEL_PRIVATE) /* Type for the Processor Cache Dispatch function */ typedef void (*cache_dispatch_t)(cpu_id_t cpu_id, unsigned int select, unsigned int param0, unsigned int param1); @@ -267,9 +256,10 @@ ex_cb_action_t 
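The hunk above replaces the INTERRUPT_MASKED_DEBUG_START/END macros with ml_interrupt_masked_debug_start()/ml_interrupt_masked_debug_end(): thin static inline wrappers that check a static_if gate and call out-of-line _ml_* helpers only when SCHED_HYGIENE_DEBUG is built in. A minimal sketch of that wrapper shape follows, using a plain boolean and a preprocessor flag in place of xnu's static_if/SCHED_HYGIENE_DEBUG machinery; every name in it is an illustrative stand-in.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the build-time config and the runtime static_if gate. */
#define DEBUG_FEATURE_COMPILED_IN 1
static bool debug_feature_enabled = true;

#if DEBUG_FEATURE_COMPILED_IN
/* Out-of-line slow paths, only present when the feature is compiled in. */
static void
_debug_start(uintptr_t handler_addr, int type)
{
    printf("interrupt-masked start: handler=%p type=%d\n", (void *)handler_addr, type);
}

static void
_debug_end(void)
{
    printf("interrupt-masked end\n");
}
#endif

/* Thin inline wrappers: call sites stay free of #if blocks, and when the
 * feature is compiled out the wrappers reduce to empty inline functions. */
static inline void
debug_start(void *handler_addr, int type)
{
#if DEBUG_FEATURE_COMPILED_IN
    if (debug_feature_enabled) {
        _debug_start((uintptr_t)handler_addr, type);
    }
#else
    (void)handler_addr;
    (void)type;
#endif
}

static inline void
debug_end(void)
{
#if DEBUG_FEATURE_COMPILED_IN
    if (debug_feature_enabled) {
        _debug_end();
    }
#endif
}

static void
fake_interrupt_handler(void)
{
    printf("handling interrupt\n");
}

int
main(void)
{
    debug_start((void *)fake_interrupt_handler, 1);
    fake_interrupt_handler();
    debug_end();
    return 0;
}

Compared with the removed macros, inline functions give the compiler a single expansion point and real argument type checking, while still costing nothing when the gate is closed.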
ex_cb_invoke( vm_offset_t far); typedef enum { - CLUSTER_TYPE_SMP = 0, - CLUSTER_TYPE_E = 1, - CLUSTER_TYPE_P = 2, + CLUSTER_TYPE_INVALID = -1, + CLUSTER_TYPE_SMP = 0, + CLUSTER_TYPE_E = 1, + CLUSTER_TYPE_P = 2, MAX_CPU_TYPES, } cluster_type_t; @@ -369,7 +359,9 @@ struct ml_processor_info { uint32_t log_id; uint32_t l2_access_penalty; /* unused */ uint32_t cluster_id; + cluster_type_t cluster_type; + uint32_t l2_cache_id; uint32_t l2_cache_size; uint32_t l3_cache_id; @@ -1060,33 +1052,35 @@ typedef void (*sched_perfcontrol_deadline_passed_t)(uint64_t deadline); * Context Switch Callout * * Parameters: - * event - The perfcontrol_event for this callout - * cpu_id - The CPU doing the context switch - * timestamp - The timestamp for the context switch - * flags - Flags for other relevant information - * offcore - perfcontrol_data structure for thread going off-core - * oncore - perfcontrol_data structure for thread going on-core - * cpu_counters - perfcontrol_cpu_counters for the CPU doing the switch + * event - The perfcontrol_event for this callout + * cpu_id - The CPU doing the context switch + * timestamp - The timestamp for the context switch + * flags - Flags for other relevant information + * offcore - perfcontrol_data structure for thread going off-core + * oncore - perfcontrol_data structure for thread going on-core + * cpu_counters - perfcontrol_cpu_counters for the CPU doing the switch + * timeout_ticks - Per core timer timeout */ typedef void (*sched_perfcontrol_csw_t)( perfcontrol_event event, uint32_t cpu_id, uint64_t timestamp, uint32_t flags, struct perfcontrol_thread_data *offcore, struct perfcontrol_thread_data *oncore, - struct perfcontrol_cpu_counters *cpu_counters, __unused void *unused); + struct perfcontrol_cpu_counters *cpu_counters, uint64_t *timeout_ticks); /* * Thread State Update Callout * * Parameters: - * event - The perfcontrol_event for this callout - * cpu_id - The CPU doing the state update - * timestamp - The timestamp for the state update - * flags - Flags for other relevant information - * thr_data - perfcontrol_data structure for the thread being updated + * event - The perfcontrol_event for this callout + * cpu_id - The CPU doing the state update + * timestamp - The timestamp for the state update + * flags - Flags for other relevant information + * thr_data - perfcontrol_data structure for the thread being updated + * timeout_ticks - Per core timer timeout */ typedef void (*sched_perfcontrol_state_update_t)( perfcontrol_event event, uint32_t cpu_id, uint64_t timestamp, uint32_t flags, - struct perfcontrol_thread_data *thr_data, __unused void *unused); + struct perfcontrol_thread_data *thr_data, uint64_t *timeout_ticks); /* * Thread Group Blocking Relationship Callout @@ -1112,6 +1106,19 @@ typedef void (*sched_perfcontrol_thread_group_blocked_t)( typedef void (*sched_perfcontrol_thread_group_unblocked_t)( thread_group_data_t unblocked_tg, thread_group_data_t unblocking_tg, uint32_t flags, perfcontrol_state_t unblocked_thr_state); +/* + * Per core timer expired callout + * + * Parameters: + * now - Current time + * flags - Flags for other relevant information + * cpu_id - The CPU for which the timer expired + * timeout_ticks - Per core timer timeout + */ +typedef void (*sched_perfcontrol_running_timer_expire_t)( + uint64_t now, uint32_t flags, uint32_t cpu_id, uint64_t *timeout_ticks); + + /* * Callers should always use the CURRENT version so that the kernel can detect both older * and newer structure layouts. 
New callbacks should always be added at the end of the @@ -1120,17 +1127,18 @@ typedef void (*sched_perfcontrol_thread_group_unblocked_t)( * to reset callbacks to their default in-kernel values. */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_0 (0) /* up-to oncore */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_1 (1) /* up-to max_runnable_latency */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_2 (2) /* up-to work_interval_notify */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_3 (3) /* up-to thread_group_deinit */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_4 (4) /* up-to deadline_passed */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_5 (5) /* up-to state_update */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_6 (6) /* up-to thread_group_flags_update */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_7 (7) /* up-to work_interval_ctl */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_8 (8) /* up-to thread_group_unblocked */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_9 (9) /* allows CLPC to specify resource contention flags */ -#define SCHED_PERFCONTROL_CALLBACKS_VERSION_CURRENT SCHED_PERFCONTROL_CALLBACKS_VERSION_6 +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_0 (0) /* up-to oncore */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_1 (1) /* up-to max_runnable_latency */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_2 (2) /* up-to work_interval_notify */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_3 (3) /* up-to thread_group_deinit */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_4 (4) /* up-to deadline_passed */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_5 (5) /* up-to state_update */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_6 (6) /* up-to thread_group_flags_update */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_7 (7) /* up-to work_interval_ctl */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_8 (8) /* up-to thread_group_unblocked */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_9 (9) /* allows CLPC to specify resource contention flags */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_10 (10) /* allows CLPC to register a per core timer callback */ +#define SCHED_PERFCONTROL_CALLBACKS_VERSION_CURRENT SCHED_PERFCONTROL_CALLBACKS_VERSION_10 struct sched_perfcontrol_callbacks { unsigned long version; /* Use SCHED_PERFCONTROL_CALLBACKS_VERSION_CURRENT */ @@ -1148,6 +1156,7 @@ struct sched_perfcontrol_callbacks { sched_perfcontrol_work_interval_ctl_t work_interval_ctl; sched_perfcontrol_thread_group_blocked_t thread_group_blocked; sched_perfcontrol_thread_group_unblocked_t thread_group_unblocked; + sched_perfcontrol_running_timer_expire_t running_timer_expire; }; typedef struct sched_perfcontrol_callbacks *sched_perfcontrol_callbacks_t; @@ -1186,16 +1195,37 @@ extern void sched_perfcontrol_thread_group_preferred_clusters_set(void *machine_ /* * Edge Scheduler-CLPC Interface * - * sched_perfcontrol_edge_matrix_get()/sched_perfcontrol_edge_matrix_set() + * sched_perfcontrol_edge_matrix_by_qos_get()/sched_perfcontrol_edge_matrix_by_qos_set() * - * The Edge scheduler uses edges between clusters to define the likelihood of migrating threads - * across clusters. The edge config between any two clusters defines the edge weight and whether - * migation and steal operations are allowed across that edge. The getter and setter allow CLPC - * to query and configure edge properties between various clusters on the platform. + * For each QoS, the Edge scheduler uses edges between clusters to define the likelihood of + * migrating threads of that QoS across the clusters. 
The edge config between any two clusters + * defines the edge weight and whether migration and steal operations are allowed across that + * edge. The getter and setter allow CLPC to query and configure edge properties between various + * clusters on the platform. + * + * The edge_matrix is a flattened array of dimension num_psets X num_psets X num_classes, where + * num_classes equals PERFCONTROL_CLASS_MAX and the scheduler will map perfcontrol classes onto + * QoS buckets. For perfcontrol classes lacking an equivalent QoS bucket, the "set" operation is + * a no-op, and the "get" operation returns zeroed edges. */ -extern void sched_perfcontrol_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, uint64_t flags, uint64_t matrix_order); -extern void sched_perfcontrol_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, uint64_t flags, uint64_t matrix_order); +extern void sched_perfcontrol_edge_matrix_by_qos_get(sched_clutch_edge *edge_matrix, bool *edge_requested, uint64_t flags, uint64_t num_psets, uint64_t num_classes); +extern void sched_perfcontrol_edge_matrix_by_qos_set(sched_clutch_edge *edge_matrix, bool *edge_changed, uint64_t flags, uint64_t num_psets, uint64_t num_classes); + +/* + * sched_perfcontrol_edge_matrix_get()/sched_perfcontrol_edge_matrix_set() + * + * Legacy interface for getting/setting the edge config properties, which determine the edge + * weight and whether steal and migration are allowed between any two clusters. Since the + * edge matrix has a per-QoS dimension, sched_perfcontrol_edge_matrix_set() sets the + * configuration to be the same across all QoSes. sched_perfcontrol_edge_matrix_get() reads + * the edge matrix setting from the highest QoS (fixed priority). + * + * Superseded by sched_perfcontrol_edge_matrix_by_qos_get()/sched_perfcontrol_edge_matrix_by_qos_set() + */ + +extern void sched_perfcontrol_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_requested, uint64_t flags, uint64_t matrix_order); +extern void sched_perfcontrol_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changed, uint64_t flags, uint64_t matrix_order); /* * sched_perfcontrol_edge_cpu_rotation_bitmasks_get()/sched_perfcontrol_edge_cpu_rotation_bitmasks_set() @@ -1467,20 +1497,6 @@ bool ml_is_secure_hib_supported(void); bool ml_task_uses_1ghz_timebase(const task_t task); #endif /* XNU_KERNEL_PRIVATE */ -#ifdef KERNEL_PRIVATE -/** - * Given a physical address, return whether that address is owned by the secure - * world. - * - * @note This does not include memory shared between XNU and the secure world. - * - * @param paddr The physical address to check. - * - * @return True if the physical address is owned and being used exclusively by - * the secure world, false otherwise.
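The comment above describes the per-QoS edge matrix as a flattened num_psets x num_psets x num_classes array. Below is a short sketch of the indexing that implies, with made-up dimensions, a toy edge struct, and a local flat_index() helper; none of these are xnu definitions, and the choice of which dimension varies fastest is an assumption of the sketch.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative dimensions only; the real values come from the platform. */
#define NUM_PSETS   3
#define NUM_CLASSES 4

/* Toy stand-in for sched_clutch_edge: a weight plus migrate/steal permissions. */
typedef struct {
    uint8_t weight;
    uint8_t migrate_allowed;
    uint8_t steal_allowed;
} edge_t;

static edge_t edge_matrix[NUM_PSETS * NUM_PSETS * NUM_CLASSES];

/* Row-major flattening: source pset, then destination pset, then class. */
static inline size_t
flat_index(size_t src_pset, size_t dst_pset, size_t cls)
{
    return (src_pset * NUM_PSETS + dst_pset) * NUM_CLASSES + cls;
}

int
main(void)
{
    /* Allow stealing from pset 0 to pset 1 for class 2 only. */
    edge_t *e = &edge_matrix[flat_index(0, 1, 2)];
    e->weight = 1;
    e->steal_allowed = 1;

    printf("edge(0 -> 1, class 2): weight=%u steal=%u migrate=%u\n",
        e->weight, e->steal_allowed, e->migrate_allowed);
    return 0;
}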
- */ -bool ml_paddr_is_exclaves_owned(vm_offset_t paddr); -#endif /* KERNEL_PRIVATE */ diff --git a/osfmk/arm/machine_routines_apple.c b/osfmk/arm/machine_routines_apple.c index 4b1d95042..b0a12ed59 100644 --- a/osfmk/arm/machine_routines_apple.c +++ b/osfmk/arm/machine_routines_apple.c @@ -40,6 +40,10 @@ #include #include +#if APPLEVIRTUALPLATFORM +#include +#endif + #if defined(APPLEAVALANCHE) && defined(ARM64_BOARD_CONFIG_T6020) #define CHECK_RDAR_145882231 1 @@ -261,3 +265,4 @@ ml_non_arm64e_user_jop_pid(void) } #endif /* HAS_APPLE_PAC */ + diff --git a/osfmk/arm/machine_routines_common.c b/osfmk/arm/machine_routines_common.c index b27942013..84fb4da71 100644 --- a/osfmk/arm/machine_routines_common.c +++ b/osfmk/arm/machine_routines_common.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -124,7 +125,7 @@ sched_perfcontrol_csw_default( __unused perfcontrol_event event, __unused uint32_t cpu_id, __unused uint64_t timestamp, __unused uint32_t flags, __unused struct perfcontrol_thread_data *offcore, __unused struct perfcontrol_thread_data *oncore, - __unused struct perfcontrol_cpu_counters *cpu_counters, __unused void *unused) + __unused struct perfcontrol_cpu_counters *cpu_counters, __unused uint64_t *timeout_ticks) { } @@ -132,7 +133,7 @@ static void sched_perfcontrol_state_update_default( __unused perfcontrol_event event, __unused uint32_t cpu_id, __unused uint64_t timestamp, __unused uint32_t flags, __unused struct perfcontrol_thread_data *thr_data, - __unused void *unused) + __unused uint64_t *timeout_ticks) { } @@ -150,6 +151,12 @@ sched_perfcontrol_thread_group_unblocked_default( { } +static void +sched_perfcontrol_running_timer_expire_default( + __unused uint64_t now, __unused uint32_t flags, __unused uint32_t cpu_id, __unused uint64_t *timeout_ticks) +{ +} + sched_perfcontrol_offcore_t sched_perfcontrol_offcore = sched_perfcontrol_offcore_default; sched_perfcontrol_context_switch_t sched_perfcontrol_switch = sched_perfcontrol_switch_default; sched_perfcontrol_oncore_t sched_perfcontrol_oncore = sched_perfcontrol_oncore_default; @@ -164,6 +171,7 @@ sched_perfcontrol_csw_t sched_perfcontrol_csw = sched_pe sched_perfcontrol_state_update_t sched_perfcontrol_state_update = sched_perfcontrol_state_update_default; sched_perfcontrol_thread_group_blocked_t sched_perfcontrol_thread_group_blocked = sched_perfcontrol_thread_group_blocked_default; sched_perfcontrol_thread_group_unblocked_t sched_perfcontrol_thread_group_unblocked = sched_perfcontrol_thread_group_unblocked_default; +sched_perfcontrol_running_timer_expire_t sched_perfcontrol_running_timer_expire = sched_perfcontrol_running_timer_expire_default; boolean_t sched_perfcontrol_thread_shared_rsrc_flags_enabled = false; void @@ -218,6 +226,10 @@ sched_perfcontrol_register_callbacks(sched_perfcontrol_callbacks_t callbacks, un sched_perfcontrol_thread_shared_rsrc_flags_enabled = true; } + if (callbacks->version >= SCHED_PERFCONTROL_CALLBACKS_VERSION_10) { + sched_perfcontrol_running_timer_expire = callbacks->running_timer_expire; + } + if (callbacks->version >= SCHED_PERFCONTROL_CALLBACKS_VERSION_7) { if (callbacks->work_interval_ctl != NULL) { sched_perfcontrol_work_interval_ctl = callbacks->work_interval_ctl; @@ -430,13 +442,14 @@ machine_switch_perfcontrol_context(perfcontrol_event event, machine_switch_populate_perfcontrol_thread_data(&oncore, new, new_thread_same_pri_latency); machine_switch_populate_perfcontrol_cpu_counters(&cpu_counters); + uint64_t timeout_ticks = 0; #if CONFIG_CPU_COUNTERS 
uint64_t counters[MT_CORE_NFIXED]; bool ctrs_enabled = perfcontrol_callout_counters_begin(counters); #endif /* CONFIG_CPU_COUNTERS */ sched_perfcontrol_csw(event, cpu_id, timestamp, flags, - &offcore, &oncore, &cpu_counters, NULL); + &offcore, &oncore, &cpu_counters, &timeout_ticks); #if CONFIG_CPU_COUNTERS if (ctrs_enabled) { perfcontrol_callout_counters_end(counters, PERFCONTROL_CALLOUT_CONTEXT); @@ -450,6 +463,9 @@ machine_switch_perfcontrol_context(perfcontrol_event event, if (sched_perfcontrol_thread_shared_rsrc_flags_enabled) { sched_perfcontrol_thread_flags_update(old, &offcore, SHARED_RSRC_POLICY_AGENT_PERFCTL_CSW); } + if (timeout_ticks != 0) { + cpu_set_perfcontrol_timer(timestamp, timeout_ticks); + } #endif /* CONFIG_SCHED_EDGE */ } } @@ -467,13 +483,14 @@ machine_switch_perfcontrol_state_update(perfcontrol_event event, uint32_t cpu_id = (uint32_t)cpu_number(); struct perfcontrol_thread_data data; machine_switch_populate_perfcontrol_thread_data(&data, thread, 0); + uint64_t timeout_ticks = 0; #if CONFIG_CPU_COUNTERS uint64_t counters[MT_CORE_NFIXED]; bool ctrs_enabled = perfcontrol_callout_counters_begin(counters); #endif /* CONFIG_CPU_COUNTERS */ sched_perfcontrol_state_update(event, cpu_id, timestamp, flags, - &data, NULL); + &data, &timeout_ticks); #if CONFIG_CPU_COUNTERS if (ctrs_enabled) { perfcontrol_callout_counters_end(counters, PERFCONTROL_CALLOUT_STATE_UPDATE); @@ -490,6 +507,9 @@ machine_switch_perfcontrol_state_update(perfcontrol_event event, } else { assert(data.thread_flags_mask == 0); } + if (timeout_ticks != 0) { + cpu_set_perfcontrol_timer(timestamp, timeout_ticks); + } #endif /* CONFIG_SCHED_EDGE */ } @@ -693,6 +713,17 @@ machine_thread_group_unblocked(struct thread_group *unblocked_tg, #endif /* CONFIG_THREAD_GROUPS */ +void +machine_perfcontrol_running_timer_expire(uint64_t now, + uint32_t flags, + int cpu_id, + uint64_t *timeout_ticks) +{ + if (sched_perfcontrol_running_timer_expire != sched_perfcontrol_running_timer_expire_default) { + sched_perfcontrol_running_timer_expire(now, flags, cpu_id, timeout_ticks); + } +} + void machine_max_runnable_latency(uint64_t bg_max_latency, uint64_t default_max_latency, @@ -751,67 +782,14 @@ machine_perfcontrol_deadline_passed(uint64_t deadline) } } -#if SCHED_HYGIENE_DEBUG - -__options_decl(int_mask_hygiene_flags_t, uint8_t, { - INT_MASK_BASE = 0x00, - INT_MASK_FROM_HANDLER = 0x01, - INT_MASK_IS_STACKSHOT = 0x02, -}); - /* - * ml_spin_debug_reset() - * Reset the timestamp on a thread that has been unscheduled - * to avoid false alarms. Alarm will go off if interrupts are held - * disabled for too long, starting from now. - * - * Call ml_get_timebase() directly to prevent extra overhead on newer - * platforms that's enabled in DEVELOPMENT kernel configurations. + * Get a character representing the current thread's type of CPU core. 
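machine_switch_perfcontrol_context()/machine_switch_perfcontrol_state_update() above now hand the callout a timeout_ticks out-parameter and arm a per-core timer via cpu_set_perfcontrol_timer() when it comes back nonzero, with expirations delivered through the running_timer_expire callback registered at callbacks version 10. The sketch below models that request/expire round trip in plain user-space C; the function names, the tick bookkeeping, and the decision not to re-arm are all illustrative assumptions, not the CLPC implementation.

#include <stdint.h>
#include <stdio.h>

/* Client-side callback: may ask for a per-core timer by writing *timeout_ticks. */
static void
client_on_context_switch(uint32_t cpu_id, uint64_t now, uint64_t *timeout_ticks)
{
    (void)cpu_id;
    (void)now;
    *timeout_ticks = 1000;          /* "check back in 1000 ticks" */
}

/* Client-side expiry handler: may re-arm by writing *timeout_ticks again. */
static void
client_on_timer_expire(uint32_t cpu_id, uint64_t now, uint64_t *timeout_ticks)
{
    printf("cpu %u timer fired at %llu\n", cpu_id, (unsigned long long)now);
    *timeout_ticks = 0;             /* leave zero to not re-arm */
}

/* Kernel-side glue, modeled on machine_switch_perfcontrol_context(): call the
 * callout with a zeroed out-parameter and only arm the timer if it was set. */
static uint64_t armed_deadline;     /* 0 == not armed */

static void
kernel_context_switch(uint32_t cpu_id, uint64_t now)
{
    uint64_t timeout_ticks = 0;
    client_on_context_switch(cpu_id, now, &timeout_ticks);
    if (timeout_ticks != 0) {
        armed_deadline = now + timeout_ticks;
    }
}

static void
kernel_tick(uint32_t cpu_id, uint64_t now)
{
    if (armed_deadline != 0 && now >= armed_deadline) {
        uint64_t timeout_ticks = 0;
        armed_deadline = 0;
        client_on_timer_expire(cpu_id, now, &timeout_ticks);
        if (timeout_ticks != 0) {
            armed_deadline = now + timeout_ticks;
        }
    }
}

int
main(void)
{
    kernel_context_switch(0, 100);  /* callout requests a timer 1000 ticks out */
    kernel_tick(0, 1100);           /* deadline reached: expiry callback runs */
    return 0;
}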
*/ -void -ml_spin_debug_reset(thread_t thread) +char +ml_get_current_core_type(void) { - if (thread->machine.intmask_timestamp) { - thread->machine.intmask_timestamp = ml_get_sched_hygiene_timebase(); - INTERRUPT_MASKED_DEBUG_CAPTURE_PMC(thread); - } -} + const thread_t thread = current_thread(); -/* - * ml_spin_debug_clear() - * Clear the timestamp and cycle/instruction counts on a thread that - * has been unscheduled to avoid false alarms - */ -void -ml_spin_debug_clear(thread_t thread) -{ - thread->machine.intmask_timestamp = 0; - thread->machine.intmask_cycles = 0; - thread->machine.intmask_instr = 0; -} - -/* - * ml_spin_debug_clear_self() - * Clear the timestamp on the current thread to prevent - * false alarms - */ -void -ml_spin_debug_clear_self(void) -{ - ml_spin_debug_clear(current_thread()); -} - -#ifndef KASAN - -/* - * Get a character representing the provided thread's kind of CPU. - */ -#if !CONFIG_CPU_COUNTERS -__unused -#endif // !CONFIG_CPU_COUNTERS -static char -__ml_interrupts_disabled_cpu_kind(thread_t thread) -{ #if __AMP__ processor_t processor = thread->last_processor; if (!processor) { @@ -831,23 +809,87 @@ __ml_interrupts_disabled_cpu_kind(thread_t thread) #endif // !__AMP__ } -#define EXTRA_INFO_STRING_SIZE 256 -#define LOW_FREQ_THRESHOLD_MHZ 500 -#define HIGH_CPI_THRESHOLD 3 +#if SCHED_HYGIENE_DEBUG + +__options_decl(int_mask_hygiene_flags_t, uint8_t, { + INT_MASK_BASE = 0x00, + INT_MASK_FROM_HANDLER = 0x01, + INT_MASK_IS_STACKSHOT = 0x02, +}); + +/* + * ml_spin_debug_reset() + * Reset the timestamp on a thread that has been unscheduled + * to avoid false alarms. Alarm will go off if interrupts are held + * disabled for too long, starting from now. + */ +void +ml_spin_debug_reset(thread_t thread) +{ + const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS | ML_TIMEOUT_PMC_FLAGS; + + kern_timeout_restart(&thread->machine.int_timeout, flags); +} + +/* + * ml_spin_debug_clear() + * Clear the timestamp and cycle/instruction counts on a thread that + * has been unscheduled to avoid false alarms + */ +void +ml_spin_debug_clear(thread_t thread) +{ + kern_timeout_override(&thread->machine.int_timeout); +} + +/* + * ml_spin_debug_clear_self() + * Clear the timestamp on the current thread to prevent + * false alarms + */ +void +ml_spin_debug_clear_self(void) +{ + ml_spin_debug_clear(current_thread()); +} + +void +_ml_interrupt_masked_debug_start(uintptr_t handler_addr, int type) +{ + const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS | ML_TIMEOUT_PMC_FLAGS; + const thread_t thread = current_thread(); + + thread->machine.int_type = type; + thread->machine.int_handler_addr = (uintptr_t)VM_KERNEL_STRIP_UPTR(handler_addr); + thread->machine.int_vector = (uintptr_t)NULL; + kern_timeout_start(&thread->machine.int_timeout, flags); +} + +void +_ml_interrupt_masked_debug_end(void) +{ + const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS; + const thread_t thread = current_thread(); + + kern_timeout_end(&thread->machine.int_timeout, flags); + if (os_atomic_load(&interrupt_masked_timeout, relaxed) > 0) { + ml_handle_interrupt_handler_duration(thread); + } + os_compiler_barrier(); + thread->machine.int_type = 0; + thread->machine.int_handler_addr = (uintptr_t)NULL; + thread->machine.int_vector = (uintptr_t)NULL; +} + +#ifndef KASAN + +#define PREFIX_STRING_SIZE 256 static void -__ml_trigger_interrupts_disabled_handle(thread_t thread, uint64_t start, uint64_t now, uint64_t timeout, int_mask_hygiene_flags_t flags) +__ml_trigger_interrupts_disabled_handle(thread_t thread, 
uint64_t timeout, int_mask_hygiene_flags_t int_flags) { - mach_timebase_info_data_t timebase; - clock_timebase_info(&timebase); - bool is_int_handler = flags & INT_MASK_FROM_HANDLER; - bool is_stackshot = flags & INT_MASK_IS_STACKSHOT; - - const uint64_t time_elapsed = now - start; - const uint64_t time_elapsed_ns = (time_elapsed * timebase.numer) / timebase.denom; - #if __AMP__ - if (is_stackshot && interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) { + if (int_flags == INT_MASK_IS_STACKSHOT && interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) { /* * If there are no recommended performance cores, we double the timeout to compensate * for the difference in time it takes Stackshot to run on efficiency cores, and then @@ -865,6 +907,7 @@ __ml_trigger_interrupts_disabled_handle(thread_t thread, uint64_t start, uint64_ } } if (cpu > max_cpu) { + uint64_t time_elapsed = kern_timeout_gross_duration(&thread->machine.int_timeout); if (time_elapsed < timeout * 2) { return; } @@ -872,53 +915,33 @@ __ml_trigger_interrupts_disabled_handle(thread_t thread, uint64_t start, uint64_ } #endif /* __AMP__ */ - uint64_t current_cycles = 0, current_instrs = 0; - -#if CONFIG_CPU_COUNTERS - if (static_if(sched_debug_pmc)) { - mt_cur_cpu_cycles_instrs_speculative(¤t_cycles, ¤t_instrs); - } -#endif // CONFIG_CPU_COUNTERS - - const uint64_t cycles_elapsed = current_cycles - thread->machine.intmask_cycles; - const uint64_t instrs_elapsed = current_instrs - thread->machine.intmask_instr; - if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_PANIC) { - const uint64_t timeout_ns = ((timeout * debug_cpu_performance_degradation_factor) * timebase.numer) / timebase.denom; - char extra_info_string[EXTRA_INFO_STRING_SIZE] = { '\0' }; -#if CONFIG_CPU_COUNTERS - if (static_if(sched_debug_pmc)) { - const uint64_t time_elapsed_us = time_elapsed_ns / 1000; - const uint64_t average_freq_mhz = cycles_elapsed / time_elapsed_us; - const uint64_t average_cpi_whole = cycles_elapsed / instrs_elapsed; - const uint64_t average_cpi_fractional = ((cycles_elapsed * 100) / instrs_elapsed) % 100; - bool high_cpi = average_cpi_whole >= HIGH_CPI_THRESHOLD; - char core_kind = __ml_interrupts_disabled_cpu_kind(thread); - bool low_mhz = average_freq_mhz < LOW_FREQ_THRESHOLD_MHZ; + char prefix_string[PREFIX_STRING_SIZE] = { '\0' }; - snprintf(extra_info_string, EXTRA_INFO_STRING_SIZE, - ", %sfreq = %llu MHz, %sCPI = %llu.%llu, CPU kind = %c", - low_mhz ? "low " : "", - average_freq_mhz, - high_cpi ? "high " : "", - average_cpi_whole, - average_cpi_fractional, - core_kind); - } -#endif // CONFIG_CPU_COUNTERS - - if (is_int_handler) { - panic("Processing of an interrupt (type = %u, handler address = %p, vector = %p) " - "took %llu nanoseconds (start = %llu, now = %llu, timeout = %llu ns%s)", - thread->machine.int_type, (void *)thread->machine.int_handler_addr, (void *)thread->machine.int_vector, - time_elapsed_ns, start, now, timeout_ns, extra_info_string); + if (int_flags & INT_MASK_FROM_HANDLER) { + snprintf(prefix_string, PREFIX_STRING_SIZE, + "Processing of an interrupt (type = %u, handler address = %p, vector = %p) " + "timed out:", thread->machine.int_type, + (void *)thread->machine.int_handler_addr, + (void *)thread->machine.int_vector); + } else if (int_flags & INT_MASK_IS_STACKSHOT) { + snprintf(prefix_string, PREFIX_STRING_SIZE, + "Stackshot duration timed out:"); } else { - panic("%s for %llu nanoseconds (start = %llu, now = %llu, timeout = %llu ns%s)", - is_stackshot ? 
"Stackshot disabled interrupts" : "Interrupts held disabled", - time_elapsed_ns, start, now, timeout_ns, extra_info_string); + snprintf(prefix_string, PREFIX_STRING_SIZE, + "Interrupts held disabled timed out:"); } + kern_timeout_try_panic(KERN_TIMEOUT_INTERRUPT, thread->machine.int_type, + &thread->machine.int_timeout, prefix_string, timeout); } else if (interrupt_masked_debug_mode == SCHED_HYGIENE_MODE_TRACE) { - if (is_int_handler) { + uint64_t time_elapsed = kern_timeout_gross_duration(&thread->machine.int_timeout); + uint64_t cycles_elapsed; + uint64_t instrs_elapsed; + + kern_timeout_cycles_instrs(&thread->machine.int_timeout, + &cycles_elapsed, &instrs_elapsed); + + if (int_flags != INT_MASK_BASE) { static const uint32_t interrupt_handled_dbgid = MACHDBG_CODE(DBG_MACH_SCHED, MACH_INT_HANDLED_EXPIRED); DTRACE_SCHED3(interrupt_handled_dbgid, uint64_t, time_elapsed, @@ -938,40 +961,48 @@ __ml_trigger_interrupts_disabled_handle(thread_t thread, uint64_t start, uint64_ #endif // !defined(KASAN) static inline void -__ml_handle_interrupts_disabled_duration(thread_t thread, uint64_t timeout, bool is_int_handler) +__ml_handle_interrupts_disabled_duration(thread_t thread, uint64_t timeout, int_mask_hygiene_flags_t int_flags) { + const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS; + if (timeout == 0) { return; // 0 means timeout disabled. } - uint64_t start = is_int_handler ? thread->machine.inthandler_timestamp : thread->machine.intmask_timestamp; - if (start != 0) { - uint64_t now = ml_get_sched_hygiene_timebase(); - if (interrupt_masked_debug_mode && - ((now - start) > timeout * debug_cpu_performance_degradation_factor) && - !thread->machine.inthandler_abandon) { - /* - * Disable the actual panic for KASAN due to the overhead of KASAN itself, leave the rest of the - * mechanism enabled so that KASAN can catch any bugs in the mechanism itself. - */ + kern_timeout_end(&thread->machine.int_timeout, flags); + + if (__improbable(interrupt_masked_debug_mode && + kern_timeout_gross_duration(&thread->machine.int_timeout) + >= timeout * debug_cpu_performance_degradation_factor)) { + /* + * Disable the actual panic for KASAN due to the overhead of KASAN itself, leave the rest of the + * mechanism enabled so that KASAN can catch any bugs in the mechanism itself. + */ #ifndef KASAN - __ml_trigger_interrupts_disabled_handle(thread, start, now, timeout, is_int_handler); + __ml_trigger_interrupts_disabled_handle(thread, timeout, int_flags); #endif - } - - if (is_int_handler) { - uint64_t const duration = now - start; - /* - * No need for an atomic add, the only thread modifying - * this is ourselves. Other threads querying will just see - * either the old or the new value. (This will also just - * resolve to regular loads and stores on relevant - * platforms.) - */ - uint64_t const old_duration = os_atomic_load_wide(&thread->machine.int_time_mt, relaxed); - os_atomic_store_wide(&thread->machine.int_time_mt, old_duration + duration, relaxed); - } } + + if (int_flags != INT_MASK_BASE) { + uint64_t const duration = kern_timeout_gross_duration(&thread->machine.int_timeout); + /* + * No need for an atomic add, the only thread modifying + * this is ourselves. Other threads querying will just see + * either the old or the new value. (This will also just + * resolve to regular loads and stores on relevant + * platforms.) 
+ */ + uint64_t const old_duration = os_atomic_load(&thread->machine.int_time_mt, relaxed); + os_atomic_store(&thread->machine.int_time_mt, old_duration + duration, relaxed); + } + + /* + * There are some circumstances where interrupts will be disabled + * outside of the KPIs and then re-enabled, so we don't want to reuse + * an old start time in that case (which will blow up with timeout + * exceeded), so we just unconditionally reset the start time here. + */ + kern_timeout_override(&thread->machine.int_timeout); } void @@ -999,14 +1030,14 @@ ml_handle_interrupt_handler_duration(thread_t thread) void ml_irq_debug_start(uintptr_t handler, uintptr_t vector) { - INTERRUPT_MASKED_DEBUG_START(handler, DBG_INTR_TYPE_OTHER); + ml_interrupt_masked_debug_start((void *)handler, DBG_INTR_TYPE_OTHER); current_thread()->machine.int_vector = (uintptr_t)VM_KERNEL_STRIP_PTR(vector); } void ml_irq_debug_end() { - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } /* @@ -1022,24 +1053,20 @@ ml_irq_debug_abandon(void) { assert(!ml_get_interrupts_enabled()); - thread_t t = current_thread(); - if (t->machine.inthandler_timestamp != 0) { - t->machine.inthandler_abandon = true; - } + thread_t thread = current_thread(); + kern_timeout_override(&thread->machine.int_timeout); } -#endif // SCHED_HYGIENE_DEBUG -#if SCHED_HYGIENE_DEBUG -__attribute__((noinline)) static void ml_interrupt_masked_debug_timestamp(thread_t thread) { - thread->machine.intmask_timestamp = ml_get_sched_hygiene_timebase(); - INTERRUPT_MASKED_DEBUG_CAPTURE_PMC(thread); -} -#endif + const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS | ML_TIMEOUT_PMC_FLAGS; -boolean_t + kern_timeout_start(&thread->machine.int_timeout, flags); +} +#endif /* SCHED_HYGIENE_DEBUG */ + +__mockable boolean_t ml_set_interrupts_enabled_with_debug(boolean_t enable, boolean_t __unused debug) { thread_t thread; @@ -1063,9 +1090,6 @@ ml_set_interrupts_enabled_with_debug(boolean_t enable, boolean_t __unused debug) } else { ml_handle_interrupts_disabled_duration(thread); } - thread->machine.intmask_timestamp = 0; - thread->machine.intmask_cycles = 0; - thread->machine.intmask_instr = 0; } #endif // SCHED_HYGIENE_DEBUG if (get_preemption_level() == 0) { diff --git a/osfmk/arm/misc_protos.h b/osfmk/arm/misc_protos.h index 44c314f46..6a0fa868d 100644 --- a/osfmk/arm/misc_protos.h +++ b/osfmk/arm/misc_protos.h @@ -52,10 +52,7 @@ extern void arm_vm_prot_init(boot_args *args); extern void arm_vm_prot_finalize(boot_args *args); #if __arm64__ -extern void arm_set_kernel_tbi(void); - void __attribute__((__noreturn__)) _was_in_userspace(void); - #endif /* __arm64__ */ extern kern_return_t DebuggerXCallEnter(boolean_t, bool); @@ -109,21 +106,18 @@ extern void arm_get_matrix_cpu_state(struct arm_matrix_cpu_state *cpu_state); * Indicate during a context-switch event that we have updated some CPU * state which requires a later context-sync event. * - * When the CPU is configured to speculate across eret, this function sets a - * flag that will trigger an explicit isb instruction sometime before the - * upcoming eret instruction. - * - * Otherwise, the eret instruction itself is always synchronizing, and - * this function is an empty stub which serves only as documentation. + * Sets a per-CPU flag indicating the processor context needs synchronizing. + * This is done to defer synchronization until returning from an exception. If + * synchronization is needed before that, call arm_context_switch_sync(). 
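The interrupt-masked diagnostics in the hunks above are rebuilt on a kern_timeout object (kern_timeout_start/end/gross_duration/override, as they appear in this patch) instead of raw timestamps stored on the thread. The toy below models that start/end/measure/reset shape with a plain struct and clock_gettime(); it is a user-space stand-in for illustration, not the xnu kern_timeout API.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Toy "timeout" record: a start timestamp plus the last measured duration. */
typedef struct {
    uint64_t start_ns;      /* 0 means "not started / overridden" */
    uint64_t duration_ns;
} toy_timeout_t;

static uint64_t
now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static void
toy_timeout_start(toy_timeout_t *t)
{
    t->start_ns = now_ns();
    t->duration_ns = 0;
}

static void
toy_timeout_end(toy_timeout_t *t)
{
    if (t->start_ns != 0) {
        t->duration_ns = now_ns() - t->start_ns;
    }
}

/* Analogous to kern_timeout_override(): forget the start time so a stale
 * measurement cannot trip a later timeout check. */
static void
toy_timeout_override(toy_timeout_t *t)
{
    t->start_ns = 0;
    t->duration_ns = 0;
}

int
main(void)
{
    const uint64_t limit_ns = 1000000;  /* 1 ms budget for the critical section */
    toy_timeout_t t;

    toy_timeout_start(&t);
    /* ...the interrupt handler body would run here... */
    toy_timeout_end(&t);

    if (t.duration_ns > limit_ns) {
        printf("budget exceeded: %llu ns\n", (unsigned long long)t.duration_ns);
    }
    toy_timeout_override(&t);
    return 0;
}

Centralizing the start/end/duration bookkeeping in one object is what lets the patch delete the per-field intmask_timestamp/intmask_cycles/intmask_instr handling scattered through the old macros.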
*/ -#if ERET_IS_NOT_CONTEXT_SYNCHRONIZING extern void arm_context_switch_requires_sync(void); -#else -static inline void -arm_context_switch_requires_sync(void) -{ -} -#endif /* ERET_IS_NOT_CONTEXT_SYNCHRONIZING */ + +/** + * Synchronize context switch state immediately. Clears the dirty flag used by + * arm_context_switch_requires_sync(). If the context switch state has already + * been synchronized, does nothing. + */ +extern void arm_context_switch_sync(void); #if __has_feature(ptrauth_calls) extern boolean_t arm_user_jop_disabled(void); diff --git a/osfmk/arm/model_dep.c b/osfmk/arm/model_dep.c index cedc8a9aa..ed61cf6b3 100644 --- a/osfmk/arm/model_dep.c +++ b/osfmk/arm/model_dep.c @@ -100,6 +100,7 @@ #include #endif + #if MACH_KDP void kdp_trap(unsigned int, struct arm_saved_state *); #endif @@ -108,7 +109,7 @@ void kdp_trap(unsigned int, struct arm_saved_state *); * Increment the PANICLOG_VERSION if you change the format of the panic * log in any way. */ -#define PANICLOG_VERSION 14 +#define PANICLOG_VERSION 15 static struct kcdata_descriptor kc_panic_data; extern char iBoot_version[]; @@ -174,7 +175,7 @@ extern uint64_t roots_installed; #define FP_MAX_NUM_TO_EVALUATE (50) /* Timeout for all processors responding to debug crosscall */ -MACHINE_TIMEOUT(debug_ack_timeout, "debug-ack", 240000, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL); +MACHINE_TIMEOUT_ALWAYS_ENABLED(debug_ack_timeout, "debug-ack", 240000, MACHINE_TIMEOUT_UNIT_TIMEBASE); /* Forward functions definitions */ void panic_display_times(void); @@ -203,11 +204,8 @@ uint64_t PE_nvram_stashed_x86_macos_slide = UINT64_MAX; #endif -/* - * Backtrace a single frame. - */ static void -print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker, +do_print_backtrace_internal(pmap_t pmap, vm_offset_t topfp, const char *cur_marker, boolean_t is_64_bit, boolean_t print_kexts_in_backtrace) { unsigned int i = 0; @@ -218,9 +216,6 @@ print_one_backtrace(pmap_t pmap, vm_offset_t topfp, const char *cur_marker, vm_offset_t raddrs[FP_MAX_NUM_TO_EVALUATE] = { 0 }; bool dump_kernel_stack = (fp >= VM_MIN_KERNEL_ADDRESS); -#if defined(HAS_APPLE_PAC) - fp = (addr64_t)ptrauth_strip((void *)fp, ptrauth_key_frame_pointer); -#endif do { if ((fp == 0) || ((fp & FP_ALIGNMENT_MASK) != 0)) { break; @@ -322,6 +317,7 @@ panic_display_tpidrs(void) } + static void panic_display_hung_cpus_help(void) { @@ -337,12 +333,27 @@ panic_display_hung_cpus_help(void) unsigned i, retry; for (i = 0; i < info->num_cpus; i++) { + ml_topology_cpu_t *cpu = &info->cpus[i]; + char cluster_name[16], cluster_letter; + + switch (cpu->cluster_type) { + case CLUSTER_TYPE_E: + cluster_letter = 'E'; + break; + case CLUSTER_TYPE_P: + cluster_letter = 'P'; + break; + default: + cluster_letter = '?'; + } + snprintf(cluster_name, sizeof(cluster_name), "%cACC%d", cluster_letter, cpu->cluster_id); + if (!PE_cpu_power_check_kdp(i)) { - paniclog_append_noflush("CORE %u is offline, skipping\n", i); + paniclog_append_noflush("CORE %u [%s] is offline, skipping\n", i, cluster_name); continue; } - if (info->cpus[i].cpu_UTTDBG_regs) { - volatile uint64_t *pcsr = (volatile uint64_t*)(info->cpus[i].cpu_UTTDBG_regs + pcsr_offset); + if (cpu->cpu_UTTDBG_regs) { + volatile uint64_t *pcsr = (volatile uint64_t*)(cpu->cpu_UTTDBG_regs + pcsr_offset); volatile uint32_t *pcsrTrigger = (volatile uint32_t*)pcsr; uint64_t pc = 0; @@ -357,7 +368,7 @@ panic_display_hung_cpus_help(void) if (pc >> 48) { pc |= 0xffff000000000000ull; } - paniclog_append_noflush("CORE %u recently retired instr at 0x%016llx\n", 
i, pc); + paniclog_append_noflush("CORE %u [%s] recently retired instr at 0x%016llx\n", i, cluster_name, pc); } } } @@ -419,16 +430,52 @@ panic_report_exclaves_stackshot(void) } #endif /* CONFIG_EXCLAVES */ +__attribute__((always_inline)) +static inline void +print_backtrace_internal(thread_t thread, bool filesetKC) +{ + uintptr_t cur_fp = (uintptr_t)__builtin_frame_address(0); + const char *nohilite_thread_marker = "\t"; + +#if defined(HAS_APPLE_PAC) + cur_fp = (addr64_t)ptrauth_strip((void *)cur_fp, ptrauth_key_frame_pointer); +#endif + + if (cur_fp < VM_MAX_KERNEL_ADDRESS) { + paniclog_append_noflush("Panicked thread: %p, backtrace: 0x%llx, tid: %llu\n", + thread, (addr64_t)cur_fp, thread_tid(thread)); +#if __LP64__ + do_print_backtrace_internal(kernel_pmap, cur_fp, nohilite_thread_marker, TRUE, filesetKC); +#else + do_print_backtrace_internal(kernel_pmap, cur_fp, nohilite_thread_marker, FALSE, filesetKC); +#endif + } else { + paniclog_append_noflush("Could not print panicked thread backtrace:" + "frame pointer outside kernel vm.\n"); + } +} + +static bool +is_filesetKC(void) +{ + kc_format_t kc_format; + bool filesetKC = false; + + __unused bool result = PE_get_primary_kc_format(&kc_format); + assert(result == true); + filesetKC = kc_format == KCFormatFileset; + return filesetKC; +} + + static void -do_print_all_backtraces(const char *message, uint64_t panic_options, const char *panic_initiator) +do_print_all_panic_info(const char *message, uint64_t panic_options, const char *panic_initiator) { int logversion = PANICLOG_VERSION; thread_t cur_thread = current_thread(); - uintptr_t cur_fp; task_t task; struct proc *proc; int print_vnodes = 0; - const char *nohilite_thread_marker = "\t"; /* end_marker_bytes set to 200 for printing END marker + stackshot summary info always */ int bytes_traced = 0, bytes_remaining = 0, end_marker_bytes = 200; @@ -436,27 +483,17 @@ do_print_all_backtraces(const char *message, uint64_t panic_options, const char uint64_t bytes_used = 0ULL; int err = 0; char *stackshot_begin_loc = NULL; - kc_format_t kc_format; - bool filesetKC = false; + bool filesetKC = is_filesetKC(); uint32_t panic_initiator_len = 0; #if CONFIG_EXT_PANICLOG uint32_t ext_paniclog_bytes = 0; #endif -#if defined(__arm64__) - __asm__ volatile ("add %0, xzr, fp":"=r"(cur_fp)); -#else -#error Unknown architecture. 
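print_backtrace_internal() above drops the hand-written `add %0, xzr, fp` asm in favor of __builtin_frame_address(0) and strips the pointer-authentication bits from the frame pointer before walking it. A small user-space sketch of that frame walk follows; the {previous fp, return address} frame layout and the four-frame limit are assumptions of the sketch (build with -fno-omit-frame-pointer), and the ptrauth strip is only compiled on arm64e-style toolchains.

#include <stdint.h>
#include <stdio.h>

#if defined(__has_feature)
#if __has_feature(ptrauth_calls)
#include <ptrauth.h>
#define USE_PTRAUTH_STRIP 1
#endif
#endif

/* The usual AArch64/x86-64 frame layout: { previous fp, return address }. */
struct frame {
    struct frame *prev;
    void         *ret;
};

__attribute__((noinline))
static void
print_backtrace(void)
{
    void *fp = __builtin_frame_address(0);
#ifdef USE_PTRAUTH_STRIP
    /* On arm64e the frame pointer can carry PAC bits; strip them before use. */
    fp = ptrauth_strip(fp, ptrauth_key_frame_pointer);
#endif
    struct frame *f = (struct frame *)fp;
    for (int i = 0; i < 4 && f != NULL; i++) {
        printf("frame %d: fp=%p ret=%p\n", i, (void *)f, f->ret);
        /* Stacks grow down, so a well-formed parent frame sits at a higher address. */
        if ((uintptr_t)f->prev <= (uintptr_t)f) {
            break;
        }
        f = f->prev;
    }
}

int
main(void)
{
    print_backtrace();
    return 0;
}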
-#endif if (panic_bt_depth != 0) { return; } panic_bt_depth++; - __unused bool result = PE_get_primary_kc_format(&kc_format); - assert(result == true); - filesetKC = kc_format == KCFormatFileset; - /* Truncate panic string to 1200 bytes */ paniclog_append_noflush("Debugger message: %.1200s\n", message); if (debug_enabled) { @@ -578,6 +615,8 @@ do_print_all_backtraces(const char *message, uint64_t panic_options, const char panic_display_zalloc(); panic_display_hung_cpus_help(); panic_display_tpidrs(); + + panic_display_pvhs_locked(); panic_display_pvh_to_lock(); panic_display_last_pc_lr(); @@ -668,18 +707,10 @@ do_print_all_backtraces(const char *message, uint64_t panic_options, const char paniclog_append_noflush("\n"); } - if (cur_fp < VM_MAX_KERNEL_ADDRESS) { - paniclog_append_noflush("Panicked thread: %p, backtrace: 0x%llx, tid: %llu\n", - cur_thread, (addr64_t)cur_fp, thread_tid(cur_thread)); -#if __LP64__ - print_one_backtrace(kernel_pmap, cur_fp, nohilite_thread_marker, TRUE, filesetKC); -#else - print_one_backtrace(kernel_pmap, cur_fp, nohilite_thread_marker, FALSE, filesetKC); -#endif - } else { - paniclog_append_noflush("Could not print panicked thread backtrace:" - "frame pointer outside kernel vm.\n"); - } + print_backtrace_internal(cur_thread, filesetKC); + + paniclog_append_noflush("\n"); + dump_cpu_event_log(&paniclog_append_noflush); paniclog_append_noflush("\n"); if (filesetKC) { @@ -815,10 +846,10 @@ do_print_all_backtraces(const char *message, uint64_t panic_options, const char } /* - * Entry to print_all_backtraces is serialized by the debugger lock + * Entry to print_all_panic_info is serialized by the debugger lock */ static void -print_all_backtraces(const char *message, uint64_t panic_options, const char *panic_initiator) +print_all_panic_info(const char *message, uint64_t panic_options, const char *panic_initiator) { unsigned int initial_not_in_kdp = not_in_kdp; @@ -833,13 +864,19 @@ print_all_backtraces(const char *message, uint64_t panic_options, const char *pa * not_in_kdp. */ not_in_kdp = 0; - do_print_all_backtraces(message, panic_options, panic_initiator); + do_print_all_panic_info(message, panic_options, panic_initiator); not_in_kdp = initial_not_in_kdp; cpu_data_ptr->PAB_active = FALSE; } +void +print_curr_backtrace(void) +{ + print_backtrace_internal(current_thread(), is_filesetKC()); +} + void panic_display_times() { @@ -936,7 +973,7 @@ SavePanicInfo( PanicInfoSaved = TRUE; - print_all_backtraces(message, panic_options, panic_initiator); + print_all_panic_info(message, panic_options, panic_initiator); assert(panic_info->eph_panic_log_len != 0); panic_info->eph_other_log_len = PE_get_offset_into_panic_region(debug_buf_ptr) - panic_info->eph_other_log_offset; @@ -1089,8 +1126,8 @@ DebuggerXCallEnter( if (ret == KERN_SUCCESS) { os_atomic_inc(&debugger_sync, relaxed); os_atomic_inc(&debug_cpus_spinning, relaxed); - } else if (proceed_on_sync_failure) { - kprintf("cpu_signal failed in DebuggerXCallEnter\n"); + } else { + kprintf("%s: cpu_signal failed. cpu=%d ret=%d proceed=%d\n", __func__, cpu, ret, proceed_on_sync_failure); } } @@ -1146,6 +1183,7 @@ DebuggerXCallEnter( kprintf("%s>found CPU %d offline, debugger_sync=%d\n", __FUNCTION__, cpu, dbg_sync_count); continue; } + kprintf("%s>Debugger synch pending on cpu %d\n", __FUNCTION__, cpu); timeout_cpu = cpu; #if CONFIG_SPTM if (proceed_on_sync_failure) { @@ -1358,7 +1396,7 @@ DebuggerXCall( * we reset the timestamp so as to avoid hitting the interrupt timeout assert(). 
*/ if ((serialmode & SERIALMODE_OUTPUT) || trap_is_stackshot) { - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } /* @@ -1400,11 +1438,11 @@ DebuggerXCall( * an event, which could be a panic. */ abandon_preemption_disable_measurement(); -#endif /* SCHED_HYGIENE_DEBUG */ if ((serialmode & SERIALMODE_OUTPUT) || trap_is_stackshot) { - INTERRUPT_MASKED_DEBUG_START(current_thread()->machine.int_handler_addr, current_thread()->machine.int_type); + ml_interrupt_masked_debug_start((void *)current_thread()->machine.int_handler_addr, current_thread()->machine.int_type); } +#endif /* SCHED_HYGIENE_DEBUG */ #if defined(__arm64__) current_thread()->machine.kpcb = NULL; diff --git a/osfmk/arm/pmap/pmap.c b/osfmk/arm/pmap/pmap.c index a1fd78dfa..f337dc95c 100644 --- a/osfmk/arm/pmap/pmap.c +++ b/osfmk/arm/pmap/pmap.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -87,9 +88,9 @@ #include #include -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #include -#endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #include @@ -110,10 +111,11 @@ #include #endif /* HIBERNATION */ -#ifdef __ARM64_PMAP_SUBPAGE_L1__ -#define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t)) -#else -#define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES) +#define PMAP_L1_MAX_ENTRY (ARM_PTE_T1_REGION_MASK(TCR_EL1_BOOT) >> ARM_TT_L1_SHIFT) +#define PMAP_ROOT_ALLOC_SIZE ((PMAP_L1_MAX_ENTRY + 1) * sizeof(tt_entry_t)) + +#ifndef __ARM64_PMAP_SUBPAGE_L1__ +_Static_assert(ARM_PGBYTES == PMAP_ROOT_ALLOC_SIZE, "Unexpected L1 Size"); #endif #if __ARM_VMSA__ != 8 @@ -289,6 +291,7 @@ const struct page_table_attr pmap_pt_attr_4k = { .pta_page_size = 4096, .pta_pagezero_size = 4096, .pta_page_shift = 12, + .pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB), }; const struct page_table_attr pmap_pt_attr_16k = { @@ -309,6 +312,7 @@ const struct page_table_attr pmap_pt_attr_16k = { .pta_page_size = 16384, .pta_pagezero_size = 16384, .pta_page_shift = 14, + .pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB), }; #if __ARM_16K_PG__ @@ -421,7 +425,7 @@ const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM struct pmap kernel_pmap_store MARK_AS_PMAP_DATA; const pmap_t kernel_pmap = &kernel_pmap_store; -static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */ +__static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */ MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0); MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0); @@ -529,7 +533,7 @@ SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE; MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0); SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0; SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0; -SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap; +SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap; #if !HAS_16BIT_ASID SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1; static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA; @@ -1032,6 +1036,13 @@ PMAP_SUPPORT_PROTOTYPES( vm_offset_t offset, vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX); +PMAP_SUPPORT_PROTOTYPES( + kern_return_t, + pmap_set_shared_region, (pmap_t grand, + 
pmap_t subord, + addr64_t vstart, + uint64_t size), PMAP_SET_SHARED_REGION_INDEX); + PMAP_SUPPORT_PROTOTYPES( vm_map_offset_t, pmap_nest, (pmap_t grand, @@ -1378,6 +1389,7 @@ const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = { [PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal, [PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal, [PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal, + [PMAP_SET_SHARED_REGION_INDEX] = pmap_set_shared_region_internal, [PMAP_NEST_INDEX] = pmap_nest_internal, [PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal, [PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal, @@ -1924,9 +1936,40 @@ pmap_map_bd_with_options( { pt_entry_t mem_attr; + if (__improbable(start & PAGE_MASK)) { + panic("%s: start 0x%lx is not page aligned", __func__, start); + } + + if (__improbable(end & PAGE_MASK)) { + panic("%s: end 0x%lx is not page aligned", __func__, end); + } + + if (__improbable(!gDramBase || !gDramSize)) { + panic("%s: gDramBase/gDramSize not initialized", __func__); + } + + const bool first_page_is_dram = is_dram_addr(start); + for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) { + if (first_page_is_dram != is_dram_addr(pa)) { + panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM", + __func__, pa, first_page_is_dram ? "is not" : "is"); + } + } + switch (options & PMAP_MAP_BD_MASK) { case PMAP_MAP_BD_WCOMB: - mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); + if (is_dram_addr(start)) { + mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); + } else { +#if HAS_FEAT_XS + mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS); +#else /* HAS_FEAT_XS */ + mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED); +#endif /* HAS_FEAT_XS */ +#if DEBUG || DEVELOPMENT + pmap_wcrt_on_non_dram_count_increment_atomic(); +#endif /* DEBUG || DEVELOPMENT */ + } mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY); break; case PMAP_MAP_BD_POSTED: @@ -2220,17 +2263,10 @@ pmap_bootstrap( lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL); #if XNU_MONITOR - -#if DEVELOPMENT || DEBUG - PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable)); +#if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT + pmap_ppl_disable = ml_unsafe_kernel_text(); #endif -#if CONFIG_CSR_FROM_DT - if (csr_unsafe_kernel_text) { - pmap_ppl_disable = true; - } -#endif /* CONFIG_CSR_FROM_DT */ - #endif /* XNU_MONITOR */ #if DEVELOPMENT || DEBUG @@ -2266,10 +2302,6 @@ pmap_bootstrap( kernel_pmap->is_rosetta = FALSE; #endif -#if ARM_PARAMETERIZED_PMAP - kernel_pmap->pmap_pt_attr = native_pt_attr; -#endif /* ARM_PARAMETERIZED_PMAP */ - kernel_pmap->nested_region_addr = 0x0ULL; kernel_pmap->nested_region_size = 0x0ULL; kernel_pmap->nested_region_unnested_table_bitmap = NULL; @@ -2312,12 +2344,6 @@ pmap_bootstrap( */ pmap_data_bootstrap(); - /** - * Bootstrap any necessary UAT data structures and values needed from the device tree. - */ - uat_bootstrap(); - - /** * Bootstrap any necessary SART data structures and values needed from the device tree. 
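pmap_map_bd_with_options() above now panics unless the range is page aligned and sits entirely on one side of the DRAM boundary, because the memory attribute it picks (write-combined vs. posted/combined/reordered) depends on whether the first page is DRAM. Below is a self-contained sketch of that validation, with a hypothetical DRAM window, an is_dram() predicate, and a fixed 16 KiB page size standing in for the kernel's values.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 16384u
#define PAGE_MASK (PAGE_SIZE - 1)

/* Hypothetical DRAM window; the kernel derives this from gDramBase/gDramSize. */
static const uint64_t dram_base = 0x800000000ull;
static const uint64_t dram_size = 0x200000000ull;

static bool
is_dram(uint64_t pa)
{
    return pa >= dram_base && pa < dram_base + dram_size;
}

/* Validate a physical range the way the patch does before mapping it: both
 * ends page aligned, and every page on the same side of the DRAM boundary
 * as the first one. */
static void
validate_bd_range(uint64_t start, uint64_t end)
{
    if (start & PAGE_MASK) {
        fprintf(stderr, "start 0x%llx not page aligned\n", (unsigned long long)start);
        abort();
    }
    if (end & PAGE_MASK) {
        fprintf(stderr, "end 0x%llx not page aligned\n", (unsigned long long)end);
        abort();
    }
    const bool first_is_dram = is_dram(start);
    for (uint64_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) {
        if (is_dram(pa) != first_is_dram) {
            fprintf(stderr, "range crosses DRAM boundary at 0x%llx\n",
                (unsigned long long)pa);
            abort();
        }
    }
    printf("range [0x%llx, 0x%llx) ok, %s\n", (unsigned long long)start,
        (unsigned long long)end, first_is_dram ? "DRAM" : "non-DRAM");
}

int
main(void)
{
    validate_bd_range(dram_base, dram_base + 4 * PAGE_SIZE);
    return 0;
}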
*/ @@ -2465,7 +2491,7 @@ pmap_lockdown_kc(void) cur_pa += ARM_PGBYTES; cur_va += ARM_PGBYTES; } -#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) +#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) extern uint64_t ctrr_ro_test; extern uint64_t ctrr_nx_test; pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)}; @@ -2594,7 +2620,7 @@ pmap_virtual_space( } -boolean_t +__mockable boolean_t pmap_virtual_region( unsigned int region_select, vm_map_offset_t *startp, @@ -2602,7 +2628,7 @@ pmap_virtual_region( ) { boolean_t ret = FALSE; -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) if (region_select == 0) { /* * In this config, the bootstrap mappings should occupy their own L2 @@ -2626,9 +2652,9 @@ pmap_virtual_region( ret = TRUE; } #endif -#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */ +#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */ #if defined(ARM_LARGE_MEMORY) - /* For large memory systems with no KTRR/CTRR such as virtual machines */ + /* For large memory systems with no KTRR/CTRR */ if (region_select == 0) { *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK; *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK); @@ -2684,7 +2710,7 @@ pmap_virtual_region( ret = TRUE; } #endif /* defined(ARM_LARGE_MEMORY) */ -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ return ret; } @@ -2829,7 +2855,7 @@ pmap_init( * structures for pages we allocate to be page tables in * pmap_expand(). */ - _vm_object_allocate(mem_size, pmap_object); + _vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL); pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; /* @@ -2955,7 +2981,25 @@ pmap_assert_free(ppnum_t ppnum) __func__, (uint64_t)pa, first_ptep, type, pmap); } } -#endif +#endif /* MACH_ASSERT */ + +inline void +pmap_recycle_page(ppnum_t pn) +{ + const bool is_freed = pmap_verify_free(pn); + + if (__improbable(!is_freed)) { + /* + * There is a redundancy here, but we are going to panic anyways, + * and ASSERT_PMAP_FREE traces useful information. So, we keep this + * behavior. 
+ */ +#if MACH_ASSERT + pmap_assert_free(pn); +#endif /* MACH_ASSERT */ + panic("%s: page 0x%llx is referenced", __func__, (unsigned long long)ptoa(pn)); + } +} static vm_size_t @@ -2964,7 +3008,8 @@ pmap_root_alloc_size(pmap_t pmap) #pragma unused(pmap) const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); unsigned int root_level = pt_attr_root_level(pt_attr); - return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t); + const uint64_t index = pt_attr_va_valid_mask(pt_attr); + return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t); } @@ -3171,7 +3216,7 @@ pmap_create_fail: return PMAP_NULL; } -pmap_t +__mockable pmap_t pmap_create_options( ledger_t ledger, vm_map_size_t size, @@ -3423,7 +3468,7 @@ pmap_destroy_internal( #endif } -void +__mockable void pmap_destroy( pmap_t pmap) { @@ -4442,7 +4487,7 @@ done: return eva; } -void +__mockable void pmap_remove_options( pmap_t pmap, vm_map_address_t start, @@ -5939,7 +5984,18 @@ wimg_to_pte(unsigned int wimg, pmap_paddr_t pa) pte |= ARM_PTE_NX | ARM_PTE_PNX; break; case VM_WIMG_RT: - pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT); + if (is_dram_addr(pa)) { + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT); + } else { +#if HAS_FEAT_XS + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS); +#else /* HAS_FEAT_XS */ + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED); +#endif /* HAS_FEAT_XS */ +#if DEBUG || DEVELOPMENT + pmap_wcrt_on_non_dram_count_increment_atomic(); +#endif /* DEBUG || DEVELOPMENT */ + } pte |= ARM_PTE_NX | ARM_PTE_PNX; break; case VM_WIMG_POSTED: @@ -5968,7 +6024,18 @@ wimg_to_pte(unsigned int wimg, pmap_paddr_t pa) pte |= ARM_PTE_NX | ARM_PTE_PNX; break; case VM_WIMG_WCOMB: - pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); + if (is_dram_addr(pa)) { + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); + } else { +#if HAS_FEAT_XS + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS); +#else /* HAS_FEAT_XS */ + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED); +#endif /* HAS_FEAT_XS */ +#if DEBUG || DEVELOPMENT + pmap_wcrt_on_non_dram_count_increment_atomic(); +#endif /* DEBUG || DEVELOPMENT */ + } pte |= ARM_PTE_NX | ARM_PTE_PNX; break; case VM_WIMG_WTHRU: @@ -6163,7 +6230,7 @@ pmap_enter_options_internal( pa &= ARM_PTE_PAGE_MASK; if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) { -#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) +#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) extern vm_offset_t ctrr_test_page; if (__probable(v != ctrr_test_page)) #endif @@ -7184,7 +7251,7 @@ coredumpok( return FALSE; } spte = *pte_p; - return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT); + return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT; } #endif @@ -7848,11 +7915,18 @@ MARK_AS_PMAP_TEXT static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr) { if (pmap != kernel_pmap) { - cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap; - cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ? 
- NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap); - cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr; - cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size; + pmap_t nested_pmap = pmap->nested_pmap; + cpu_data_ptr->cpu_nested_pmap = nested_pmap; + if (nested_pmap != NULL) { + cpu_data_ptr->cpu_nested_pmap_attr = pmap_get_pt_attr(nested_pmap); + /** + * Obtain the full shared region bounds from the nested pmap. If the top-level pmap + * hasn't been fully nested yet, its bounds may not yet be configured, or may be in the + * process of being configured on another core. + */ + cpu_data_ptr->cpu_nested_region_addr = nested_pmap->nested_region_addr; + cpu_data_ptr->cpu_nested_region_size = nested_pmap->nested_region_size; + } #if __ARM_MIXED_PAGE_SIZE__ cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap)); #endif @@ -9041,8 +9115,8 @@ pmap_set_nested_internal( #endif /* XNU_MONITOR */ /** - * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give - * this pmap its own nested pmap. + * Ensure that a (potentially concurrent) call to pmap_set_shared_region() hasn't tried + * to give this pmap its own nested pmap. */ if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) { panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap); @@ -9051,7 +9125,7 @@ pmap_set_nested_internal( pmap_get_pt_ops(pmap)->free_id(pmap); } -void +__mockable void pmap_set_nested( pmap_t pmap) { @@ -9577,79 +9651,30 @@ pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_ */ #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1) -/* - * kern_return_t pmap_nest(grand, subord, vstart, size) - * - * grand = the pmap that we will nest subord into - * subord = the pmap that goes into the grand - * vstart = start of range in pmap to be inserted - * size = Size of nest area (up to 16TB) - * - * Inserts a pmap into another. This is used to implement shared segments. - * - */ - /** - * Embeds a range of mappings from one pmap ('subord') into another ('grand') - * by inserting the twig-level TTEs from 'subord' directly into 'grand'. - * This function operates in 3 main phases: - * 1. Bookkeeping to ensure tracking structures for the nested region are set up. - * 2. Expansion of subord to ensure the required leaf-level page table pages for - * the mapping range are present in subord. - * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately - * contains pointers to subord's leaf-level pagetable pages for the specified - * VA range. + * Establishes the pmap associated with a shared region as the nested pmap + * for a top-level user pmap. * - * This function may return early due to pending AST_URGENT preemption; if so - * it will indicate the need to be re-entered. - * - * @param grand pmap to insert the TTEs into. Must be a user pmap. - * @param subord pmap from which to extract the TTEs. Must be a nested pmap. - * @param vstart twig-aligned virtual address for the beginning of the nesting range - * @param size twig-aligned size of the nesting range - * @param vrestart the twig-aligned starting address of the current call. May contain - * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above. - * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to - * KERN_RESOURCE_SHORTAGE on allocation failure. 
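[Editor's note: illustrative sketch, not part of the imported diff.] The restart value threaded through pmap_nest_internal() packs a phase flag, PMAP_NEST_GRAND (defined above as 0x1), into bit 0 of an otherwise twig-aligned virtual address, which is why the surrounding code masks with ~PMAP_NEST_GRAND before comparing against the nesting range. A minimal standalone model of that encoding; the constant name and the sample address are illustrative only.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NEST_GRAND 0x1ULL  /* models PMAP_NEST_GRAND: phase kept in bit 0 */

    static uint64_t
    encode_restart(uint64_t vaddr, bool copy_into_grand_phase)
    {
        assert((vaddr & NEST_GRAND) == 0);  /* twig aligned, so bit 0 is free */
        return vaddr | (copy_into_grand_phase ? NEST_GRAND : 0);
    }

    int
    main(void)
    {
        uint64_t cookie = encode_restart(0x180000000ULL, true);
        uint64_t resume_va = cookie & ~NEST_GRAND;
        bool grand_phase = (cookie & NEST_GRAND) != 0;
        printf("resume at 0x%llx, skip to TTE-copy phase: %d\n",
            (unsigned long long)resume_va, grand_phase);
        return 0;
    }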
- * - * @return the virtual address at which to restart the operation, possibly including - * PMAP_NEST_GRAND to indicate the phase at which to restart. If - * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed. + * @param grand The top-level user pmap + * @param subord The pmap to be set as [grand]'s nested pmap + * @param vstart The base VA of the region to be nested. + * @param size The size (in bytes) of the region to be nested. */ -MARK_AS_PMAP_TEXT vm_map_offset_t -pmap_nest_internal( +MARK_AS_PMAP_TEXT kern_return_t +pmap_set_shared_region_internal( pmap_t grand, pmap_t subord, addr64_t vstart, - uint64_t size, - vm_map_offset_t vrestart, - kern_return_t *krp) + uint64_t size) { - kern_return_t kr = KERN_FAILURE; - vm_map_offset_t vaddr; - tt_entry_t *stte_p; - tt_entry_t *gtte_p; + addr64_t vend; uint64_t nested_region_unnested_table_bitmap_size; unsigned int* nested_region_unnested_table_bitmap = NULL; - uint64_t new_nested_region_unnested_table_bitmap_size; - unsigned int* new_nested_region_unnested_table_bitmap = NULL; - int expand_options = 0; - bool deref_subord = true; - bool grand_locked = false; + kern_return_t kr = KERN_SUCCESS; - addr64_t vend; - if (__improbable(os_add_overflow(vstart, size, &vend))) { - panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size); - } - if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) || - ((vrestart & ~PMAP_NEST_GRAND) < vstart))) { - panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__, - (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend); - } - - assert(krp != NULL); validate_pmap_mutable(grand); validate_pmap(subord); + #if XNU_MONITOR /* * Ordering is important here. validate_pmap() has already ensured subord is a @@ -9666,30 +9691,23 @@ pmap_nest_internal( panic("%s: invalid subordinate pmap %p", __func__, subord); } - const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); - if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) { - panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand); - } - -#if XNU_MONITOR - expand_options |= PMAP_TT_ALLOCATE_NOWAIT; -#endif - - if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) & - (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) { - panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx", - grand, vstart, size, (unsigned long long)vrestart); - } - if (__improbable(subord->type != PMAP_TYPE_NESTED)) { panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type); } - if (__improbable(grand->type != PMAP_TYPE_USER)) { - panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type); + const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); + if (__improbable(os_add_overflow(vstart, size, &vend))) { + panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size); + } + if (__improbable(((size | vstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) { + panic("%s: pmap %p unaligned set_shared_region request 0x%llx, 0x%llx", + __func__, grand, vstart, size); + } + if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) { + panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand); } - if (subord->nested_region_unnested_table_bitmap == NULL) { + if (os_atomic_load(&subord->nested_region_unnested_table_bitmap, acquire) == NULL) { 
nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1; /** @@ -9720,7 +9738,7 @@ pmap_nest_internal( kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT); if (kr != KERN_SUCCESS) { - goto nest_cleanup; + goto done; } assert(pa); @@ -9734,7 +9752,7 @@ pmap_nest_internal( if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) { kr = KERN_ABORTED; - goto nest_cleanup; + goto done; } if (subord->nested_region_unnested_table_bitmap == NULL) { @@ -9743,107 +9761,163 @@ pmap_nest_internal( subord->nested_region_size = (mach_vm_offset_t) size; /** - * Ensure that the rest of the subord->nested_region_* fields are - * initialized and visible before setting the nested_region_unnested_table_bitmap + * Use a store-release operation to ensure that the rest of the subord->nested_region_* + * fields are initialized and visible before setting the nested_region_unnested_table_bitmap * field (which is used as the flag to say that the rest are initialized). */ - __builtin_arm_dmb(DMB_ISHST); - subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap; + os_atomic_store(&subord->nested_region_unnested_table_bitmap, nested_region_unnested_table_bitmap, release); nested_region_unnested_table_bitmap = NULL; } pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - if (nested_region_unnested_table_bitmap != NULL) { -#if XNU_MONITOR - pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE); -#else - kfree_data(nested_region_unnested_table_bitmap, - nested_region_unnested_table_bitmap_size * sizeof(unsigned int)); -#endif - nested_region_unnested_table_bitmap = NULL; - } } + if (__improbable(!os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst))) { + panic("%s: attempt to nest pmap %p into pmap %p which already has a nested pmap %p", + __func__, subord, grand, grand->nested_pmap); + } /** - * Ensure subsequent reads of the subord->nested_region_* fields don't get - * speculated before their initialization. + * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand + * into a nested pmap, which would then produce multiple levels of nesting. 
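[Editor's note: illustrative sketch, not part of the imported diff.] The hunk above replaces the explicit DMB_ISHST barrier plus plain store with a store-release on subord->nested_region_unnested_table_bitmap, paired with the load-acquire earlier in pmap_set_shared_region_internal(), so the bitmap pointer itself serves as the "nested_region_* fields are initialized" flag. A standalone C11 model of that publish/observe pattern; the struct and field names here are invented for the example.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct region {
        uint64_t addr;                 /* plain fields written first ...  */
        uint64_t size;
        _Atomic(uint32_t *) bitmap;    /* ... then published with release */
    };

    static uint32_t storage[4];

    static void
    publish(struct region *r)
    {
        r->addr = 0x180000000ULL;
        r->size = 0x40000000ULL;
        /* Release: the writes above are visible before the pointer is. */
        atomic_store_explicit(&r->bitmap, storage, memory_order_release);
    }

    static void
    observe(const char *when, struct region *r)
    {
        /* Acquire: once the pointer is seen, addr/size are seen too. */
        uint32_t *bm = atomic_load_explicit(&r->bitmap, memory_order_acquire);
        if (bm == NULL) {
            printf("%s: not published yet\n", when);
        } else {
            printf("%s: region [0x%llx, +0x%llx) ready\n", when,
                (unsigned long long)r->addr, (unsigned long long)r->size);
        }
    }

    int
    main(void)
    {
        static struct region r;   /* zero-initialized: bitmap == NULL */
        observe("before", &r);
        publish(&r);
        observe("after", &r);
        return 0;
    }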
*/ - __builtin_arm_dmb(DMB_ISHLD); - - if ((subord->nested_region_addr + subord->nested_region_size) < vend) { - uint64_t new_size; + if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) { + panic("%s: attempt to nest into non-USER pmap %p", __func__, grand); + } +done: + if (nested_region_unnested_table_bitmap != NULL) { +#if XNU_MONITOR + pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE); +#else + kfree_data(nested_region_unnested_table_bitmap, + nested_region_unnested_table_bitmap_size * sizeof(unsigned int)); +#endif nested_region_unnested_table_bitmap = NULL; - nested_region_unnested_table_bitmap_size = 0ULL; - new_size = vend - subord->nested_region_addr; + } - new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1; - new_nested_region_unnested_table_bitmap_size <<= 1; + if (kr != KERN_SUCCESS) { +#if XNU_MONITOR + os_atomic_dec(&subord->nested_count, relaxed); +#endif + pmap_destroy_internal(subord); + } - if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) { - panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, " - "grand=%p, subord=%p, vstart=0x%llx, size=%llx", - __func__, new_nested_region_unnested_table_bitmap_size, - grand, subord, vstart, size); + return kr; +} + +__mockable void +pmap_set_shared_region( + pmap_t grand, + pmap_t subord, + addr64_t vstart, + uint64_t size) +{ + kern_return_t kr = KERN_SUCCESS; + + PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START, + VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size); + + pmap_verify_preemptible(); +#if XNU_MONITOR + do { + kr = pmap_set_shared_region_ppl(grand, subord, vstart, size); + if (kr == KERN_RESOURCE_SHORTAGE) { + pmap_alloc_page_for_ppl(0); + } else if ((kr != KERN_SUCCESS) && (kr != KERN_ABORTED)) { + panic("%s: unexpected return code 0x%x from pmap_set_shared_region_ppl", + __func__, kr); } + } while (kr != KERN_SUCCESS); + + pmap_ledger_check_balance(grand); + pmap_ledger_check_balance(subord); +#else + /** + * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because + * we have verified preemptibility. Therefore, pmap_set_shared_region_internal() + * will wait for a page or a lock instead of bailing out as in the PPL flavor. + */ + kr = pmap_set_shared_region_internal(grand, subord, vstart, size); + assert3u(kr, ==, KERN_SUCCESS); +#endif + + PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END); +} + +/** + * Embeds a range of mappings from one pmap ('subord') into another ('grand') + * by inserting the twig-level TTEs from 'subord' directly into 'grand'. + * This function operates in 3 main phases: + * 1. Bookkeeping to ensure tracking structures for the nested region are set up. + * 2. Expansion of subord to ensure the required leaf-level page table pages for + * the mapping range are present in subord. + * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately + * contains pointers to subord's leaf-level pagetable pages for the specified + * VA range. + * + * This function may return early due to pending AST_URGENT preemption; if so + * it will indicate the need to be re-entered. + * + * @note This function requires that [subord] has already been associated with + * [grand] through a call to pmap_set_shared_region(). + * + * @param grand pmap to insert the TTEs into. Must be a user pmap. + * @param subord pmap from which to extract the TTEs. 
Must be a nested pmap. + * @param vstart twig-aligned virtual address for the beginning of the nesting range + * @param size twig-aligned size of the nesting range + * @param vrestart the twig-aligned starting address of the current call. May contain + * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above. + * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to + * KERN_RESOURCE_SHORTAGE on allocation failure. + * + * @return the virtual address at which to restart the operation, possibly including + * PMAP_NEST_GRAND to indicate the phase at which to restart. If + * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed. + */ +MARK_AS_PMAP_TEXT vm_map_offset_t +pmap_nest_internal( + pmap_t grand, + pmap_t subord, + addr64_t vstart, + uint64_t size, + vm_map_offset_t vrestart, + kern_return_t *krp) +{ + kern_return_t kr = KERN_FAILURE; + vm_map_offset_t vaddr; + tt_entry_t *stte_p; + tt_entry_t *gtte_p; + int expand_options = 0; + bool grand_locked = false; + + addr64_t vend; + if (__improbable(os_add_overflow(vstart, size, &vend))) { + panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size); + } + if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) || + ((vrestart & ~PMAP_NEST_GRAND) < vstart))) { + panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__, + (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend); + } + + assert(krp != NULL); + validate_pmap_mutable(grand); + validate_pmap(subord); + + const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); + + if (__improbable(subord != grand->nested_pmap)) { + panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p", + __func__, subord, grand, grand->nested_pmap); + } #if XNU_MONITOR - pmap_paddr_t pa = 0; - - if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) { - panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, " - "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx", - __FUNCTION__, new_nested_region_unnested_table_bitmap_size, - grand, subord, vstart, new_size); - } - - kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT); - - if (kr != KERN_SUCCESS) { - goto nest_cleanup; - } - - assert(pa); - - new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa); -#else - new_nested_region_unnested_table_bitmap = kalloc_data( - new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int), - Z_WAITOK | Z_ZERO); + expand_options |= PMAP_TT_ALLOCATE_NOWAIT; #endif - if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) { - kr = KERN_ABORTED; - goto nest_cleanup; - } - if (subord->nested_region_size < new_size) { - bcopy(subord->nested_region_unnested_table_bitmap, - new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size * sizeof(unsigned int)); - nested_region_unnested_table_bitmap_size = subord->nested_region_unnested_table_bitmap_size; - nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap; - subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap; - subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size; - subord->nested_region_size = new_size; - new_nested_region_unnested_table_bitmap = NULL; - } - pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - if (nested_region_unnested_table_bitmap != NULL) { -#if 
XNU_MONITOR - pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE); -#else - kfree_data(nested_region_unnested_table_bitmap, - nested_region_unnested_table_bitmap_size * sizeof(unsigned int)); -#endif - nested_region_unnested_table_bitmap = NULL; - } - if (new_nested_region_unnested_table_bitmap != NULL) { -#if XNU_MONITOR - pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE); -#else - kfree_data(new_nested_region_unnested_table_bitmap, - new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)); -#endif - new_nested_region_unnested_table_bitmap = NULL; - } + if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) & + (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) { + panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx", + grand, vstart, size, (unsigned long long)vrestart); } if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) { @@ -9851,20 +9925,16 @@ pmap_nest_internal( goto nest_cleanup; } - if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) { - /** - * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand - * into a nested pmap, which would then produce multiple levels of nesting. - */ - if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) { - panic("%s: attempt to nest into non-USER pmap %p", __func__, grand); - } + if (__improbable((subord->nested_region_addr + subord->nested_region_size) < vend) || + (subord->nested_region_addr > vstart)) { + panic("%s: attempt to nest [0x%llx, 0x%llx) in pmap %p outside nested pmap %p bounds [0x%llx, 0x%llx)\n", + __func__, vstart, vend, grand, subord, subord->nested_region_addr, subord->nested_region_addr + subord->nested_region_size); + } + if (grand->nested_region_size == 0) { /* * If this is grand's first nesting operation, keep the reference on subord. * It will be released by pmap_destroy_internal() when grand is destroyed. */ - deref_subord = false; - if (!subord->nested_bounds_set) { /* * We are nesting without the shared regions bounds @@ -9877,8 +9947,12 @@ pmap_nest_internal( subord->nested_no_bounds_refcnt++; } - if (__improbable(vstart < subord->nested_region_addr || - vend > (subord->nested_region_addr + subord->nested_region_size))) { + /** + * Ensure that we won't exceed the nested_region_unnested_table bitmap bounds established + * in pmap_set_shared_region_internal(). 
+ */ + if (__improbable((vstart < subord->nested_region_addr) || + (vend > (subord->nested_region_addr + subord->nested_region_size)))) { panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))", __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr, (void *) (subord->nested_region_addr + subord->nested_region_size)); @@ -9887,9 +9961,7 @@ pmap_nest_internal( grand->nested_region_addr = vstart; grand->nested_region_size = (mach_vm_offset_t) size; } else { - if (__improbable(grand->nested_pmap != subord)) { - panic("pmap_nest() pmap %p has a nested pmap", grand); - } else if (__improbable(grand->nested_region_addr > vstart)) { + if (__improbable(grand->nested_region_addr > vstart)) { panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand); } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) { grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size); @@ -10026,32 +10098,10 @@ nest_cleanup: *krp = kr; } #endif - if (nested_region_unnested_table_bitmap != NULL) { -#if XNU_MONITOR - pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE); -#else - kfree_data(nested_region_unnested_table_bitmap, - nested_region_unnested_table_bitmap_size * sizeof(unsigned int)); -#endif - } - if (new_nested_region_unnested_table_bitmap != NULL) { -#if XNU_MONITOR - pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE); -#else - kfree_data(new_nested_region_unnested_table_bitmap, - new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)); -#endif - } - if (deref_subord) { -#if XNU_MONITOR - os_atomic_dec(&subord->nested_count, relaxed); -#endif - pmap_destroy_internal(subord); - } return vrestart; } -kern_return_t +__mockable kern_return_t pmap_nest( pmap_t grand, pmap_t subord, @@ -10366,7 +10416,7 @@ unnest_subord_done: return vrestart; } -kern_return_t +__mockable kern_return_t pmap_unnest_options( pmap_t grand, addr64_t vaddr, @@ -10402,7 +10452,6 @@ pmap_adjust_unnest_parameters( return TRUE; /* to get to log_unnest_badness()... */ } -#if PMAP_FORK_NEST /** * Perform any necessary pre-nesting of the parent's shared region at fork() * time. @@ -10411,20 +10460,12 @@ pmap_adjust_unnest_parameters( * * @param old_pmap The pmap of the parent task. * @param new_pmap The pmap of the child task. - * @param nesting_start An output parameter that is updated with the start - * address of the range that was pre-nested - * @param nesting_end An output parameter that is updated with the end - * address of the range that was pre-nested * * @return KERN_SUCCESS if the pre-nesting was succesfully completed. * KERN_INVALID_ARGUMENT if the arguments were not valid. */ kern_return_t -pmap_fork_nest( - pmap_t old_pmap, - pmap_t new_pmap, - vm_map_offset_t *nesting_start, - vm_map_offset_t *nesting_end) +pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap) { if (old_pmap == NULL || new_pmap == NULL) { return KERN_INVALID_ARGUMENT; @@ -10432,25 +10473,16 @@ pmap_fork_nest( if (old_pmap->nested_pmap == NULL) { return KERN_SUCCESS; } - pmap_nest(new_pmap, + /** + * Obtain the full shared region bounds from the nested pmap. If old_pmap + * hasn't been fully nested yet, its bounds may not yet be configured. 
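[Editor's note: illustrative sketch, not part of the imported diff.] With this change, shared-region setup becomes a two-step protocol: pmap_set_shared_region() first records the nested pmap and its full bounds on the top-level pmap, and only afterwards may pmap_nest() copy twig-level TTEs for sub-ranges (pmap_nest_internal() above now panics if the association is missing or different). The caller-side ordering sketch below uses the signatures introduced in this diff but invented variable names, and is not compilable outside an xnu build.

    /* 1. Associate the shared-region pmap and establish its full bounds
     *    (task_pmap, sr_pmap, sr_base and sr_size are placeholders). */
    pmap_set_shared_region(task_pmap, sr_pmap, sr_base, sr_size);

    /* 2. Nest one or more twig-aligned sub-ranges within those bounds;
     *    this may be called repeatedly for different sub-ranges. */
    kern_return_t kr = pmap_nest(task_pmap, sr_pmap, nest_start, nest_size);
    assert(kr == KERN_SUCCESS);

    /* The reference taken on sr_pmap is dropped when task_pmap is
     * destroyed, as the pmap_nest_internal() comment above notes. */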
+ */ + pmap_set_shared_region(new_pmap, old_pmap->nested_pmap, - old_pmap->nested_region_addr, - old_pmap->nested_region_size); - assertf(new_pmap->nested_pmap == old_pmap->nested_pmap && - new_pmap->nested_region_addr == old_pmap->nested_region_addr && - new_pmap->nested_region_size == old_pmap->nested_region_size, - "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)", - new_pmap->nested_pmap, - new_pmap->nested_region_addr, - new_pmap->nested_region_size, - old_pmap->nested_pmap, - old_pmap->nested_region_addr, - old_pmap->nested_region_size); - *nesting_start = old_pmap->nested_region_addr; - *nesting_end = *nesting_start + old_pmap->nested_region_size; + old_pmap->nested_pmap->nested_region_addr, + old_pmap->nested_pmap->nested_region_size); return KERN_SUCCESS; } -#endif /* PMAP_FORK_NEST */ /* * disable no-execute capability on @@ -12911,6 +12943,7 @@ pmap_user_va_size(pmap_t pmap) + bool pmap_in_ppl(void) { @@ -13217,34 +13250,41 @@ pmap_query_trust_cache( return ret; } -MARK_AS_PMAP_DATA bool ppl_developer_mode_set = false; +MARK_AS_PMAP_DATA uint8_t ppl_developer_mode_set = 0; MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false; MARK_AS_PMAP_TEXT void pmap_toggle_developer_mode_internal( bool state) { - bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed); +#if PMAP_CS_INCLUDE_INTERNAL_CODE + /* + * On internal builds, we may call into the PPL twice in order to enable developer + * mode during early boot and during data migration. The latter does not happen for + * non-internal builds, and thus those only need to support a single transition to + * enabling developer mode. + */ + const uint8_t epoch_enable = 2; +#else + const uint8_t epoch_enable = 1; +#endif /* - * Only the following state transitions are allowed: - * -- not set --> false - * -- not set --> true - * -- true --> false - * -- true --> true - * -- false --> false - * - * We never allow false --> true transitions. + * We don't really care if the state is false -- in that case, the transition can + * happen as many times as needed. However, we still need to increment whenever we + * set the state as such. This is partly because we need to track whether we have + * actually resolved the state or not, and also because we expect developer mode + * to only be enabled during the first or second (internal-only) call into this + * function. */ - bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed); + uint8_t epoch = os_atomic_inc_orig(&ppl_developer_mode_set, relaxed); - if ((current == false) && (state == true) && state_set) { - panic("PMAP_CS: attempted to enable developer mode incorrectly"); + if (state == os_atomic_load(&ppl_developer_mode_storage, relaxed)) { + return; + } else if ((state == true) && (epoch >= epoch_enable)) { + panic("PMAP_CS: enabling developer mode incorrectly [%u]", epoch); } - /* We're going to update the developer mode state, so update this first */ - os_atomic_store(&ppl_developer_mode_set, true, relaxed); - /* Update the developer mode state on the system */ os_atomic_store(&ppl_developer_mode_storage, state, relaxed); } diff --git a/osfmk/arm/pmap/pmap.h b/osfmk/arm/pmap/pmap.h index 30683048f..f22e16ad0 100644 --- a/osfmk/arm/pmap/pmap.h +++ b/osfmk/arm/pmap/pmap.h @@ -241,20 +241,20 @@ typedef struct pmap_cpu_data pmap_cpu_data_t; * This indicates (roughly) where there is free space for the VM * to use for the heap; this does not need to be precise. 
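[Editor's note: illustrative sketch, not part of the imported diff.] In the pmap_toggle_developer_mode_internal() hunk above, ppl_developer_mode_set changes from a "was it ever set" flag to a call counter: disabling is always allowed, but enabling is only accepted on the first call (or the first two when PMAP_CS_INCLUDE_INTERNAL_CODE is built in), and any later enable attempt panics. A standalone model of that gate; EPOCH_ENABLE and the function names are stand-ins, and exit() models the panic.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define EPOCH_ENABLE 1   /* the diff uses 2 on internal builds */

    static _Atomic uint8_t dev_mode_set;      /* how many calls so far */
    static _Atomic bool    dev_mode_storage;  /* effective state       */

    static void
    toggle_developer_mode(bool state)
    {
        uint8_t epoch = atomic_fetch_add_explicit(&dev_mode_set, 1,
            memory_order_relaxed);

        if (state == atomic_load_explicit(&dev_mode_storage,
            memory_order_relaxed)) {
            return;                       /* no transition needed */
        } else if (state && epoch >= EPOCH_ENABLE) {
            fprintf(stderr, "late enable rejected (epoch %u)\n", epoch);
            exit(1);                      /* models the panic */
        }
        atomic_store_explicit(&dev_mode_storage, state, memory_order_relaxed);
    }

    int
    main(void)
    {
        toggle_developer_mode(true);   /* first call: enable accepted */
        toggle_developer_mode(false);  /* disable: always accepted    */
        toggle_developer_mode(true);   /* late enable: rejected       */
        return 0;
    }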
*/ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #if defined(ARM_LARGE_MEMORY) #define KERNEL_PMAP_HEAP_RANGE_START (VM_MIN_KERNEL_AND_KEXT_ADDRESS+ARM_TT_L1_SIZE) #else /* defined(ARM_LARGE_MEMORY) */ #define KERNEL_PMAP_HEAP_RANGE_START VM_MIN_KERNEL_AND_KEXT_ADDRESS #endif /* defined(ARM_LARGE_MEMORY) */ -#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ #if defined(ARM_LARGE_MEMORY) /* For large memory systems with no KTRR/CTRR such as virtual machines */ #define KERNEL_PMAP_HEAP_RANGE_START (VM_MIN_KERNEL_AND_KEXT_ADDRESS+ARM_TT_L1_SIZE) #else #define KERNEL_PMAP_HEAP_RANGE_START LOW_GLOBAL_BASE_ADDRESS #endif -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ /** * For setups where the VM page size does not match the hardware page size (the @@ -737,7 +737,9 @@ void pmap_abandon_measurement(void); #define PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX 109 #define PMAP_IMAGE4_MONITOR_TRAP_INDEX 110 -#define PMAP_COUNT 111 +#define PMAP_SET_SHARED_REGION_INDEX 111 + +#define PMAP_COUNT 112 /** @@ -814,7 +816,6 @@ extern boolean_t pmap_is_monitor(ppnum_t pn); */ extern void pmap_static_allocations_done(void); - #ifdef KASAN #define PPL_STACK_SIZE (PAGE_SIZE << 2) #else /* KASAN */ @@ -849,6 +850,7 @@ extern void pmap_static_allocations_done(void); #endif /* XNU_MONITOR */ + /* * Indicates that we are done mutating sensitive state in the system, and that * the pmap may now restrict access as dictated by system security policy. @@ -888,11 +890,24 @@ extern kern_return_t pmap_test_text_corruption(pmap_paddr_t); #endif /* #ifndef ASSEMBLER */ #if __ARM_KERNEL_PROTECT__ + +/* + * The (full/uncontracted) size of the kernel address space. + */ +#define KERN_ADDRESS_SPACE_SIZE (ARM_PTE_T1_REGION_MASK(TCR_EL1_BOOT) + 1) + +/* + * Size of the kernel protect region portion of the address space. This region will be unmapped in + * EL0. + */ +#define KERN_PROTECT_REGION_SIZE (KERN_ADDRESS_SPACE_SIZE / 2ULL) + /* * The exception vector mappings start at the middle of the kernel page table * range (so that the EL0 mapping can be located at the base of the range). */ -#define ARM_KERNEL_PROTECT_EXCEPTION_START ((~((ARM_TT_ROOT_SIZE + ARM_TT_ROOT_INDEX_MASK) / 2ULL)) + 1ULL) +#define ARM_KERNEL_PROTECT_EXCEPTION_START (0ULL - KERN_PROTECT_REGION_SIZE) + #endif /* __ARM_KERNEL_PROTECT__ */ #endif /* #ifndef _ARM_PMAP_H_ */ diff --git a/osfmk/arm/pmap/pmap_data.c b/osfmk/arm/pmap/pmap_data.c index a0ee8d957..8a4ac8349 100644 --- a/osfmk/arm/pmap/pmap_data.c +++ b/osfmk/arm/pmap/pmap_data.c @@ -554,6 +554,11 @@ vm_size_t pmap_compute_io_filters(void); void pmap_load_io_filters(void); #endif /* HAS_GUARDED_IO_FILTER */ +#if DEBUG || DEVELOPMENT +/* Track number of instances a WC/RT mapping request is converted to Device-GRE. 
*/ +static _Atomic unsigned int pmap_wcrt_on_non_dram_count = 0; +#endif /* DEBUG || DEVELOPMENT */ + /** * This function is called once during pmap_bootstrap() to allocate and * initialize many of the core data structures that are implemented in this @@ -3603,10 +3608,9 @@ pmap_compute_io_rgns(void) __func__, i, ranges[i].addr, ranges[i].len); } - if (((ranges[i].addr <= gPhysBase) && (rgn_end > gPhysBase)) || - ((ranges[i].addr < avail_end) && (rgn_end >= avail_end)) || - ((ranges[i].addr > gPhysBase) && (rgn_end < avail_end))) { - panic("%s: %u addr 0x%llx length 0x%llx overlaps physical memory", + if (!(ranges[i].wimg & PMAP_IO_RANGE_NOT_IO) && + !(ranges[i].addr >= avail_end || rgn_end <= gPhysBase)) { + panic("%s: I/O %u addr 0x%llx length 0x%llx overlaps physical memory", __func__, i, ranges[i].addr, ranges[i].len); } @@ -4338,3 +4342,25 @@ pmap_flush_noncoherent_page(pmap_paddr_t paddr __unused) panic("%s called on unsupported configuration", __func__); #endif /* HAS_DC_INCPA */ } + +#if DEBUG || DEVELOPMENT +/** + * Get the value of the WC/RT on non-DRAM mapping request counter. + * + * @return The value of the counter. + */ +unsigned int +pmap_wcrt_on_non_dram_count_get() +{ + return os_atomic_load(&pmap_wcrt_on_non_dram_count, relaxed); +} + +/** + * Atomically increment the WC/RT on non-DRAM mapping request counter. + */ +void +pmap_wcrt_on_non_dram_count_increment_atomic() +{ + os_atomic_inc(&pmap_wcrt_on_non_dram_count, relaxed); +} +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/arm/pmap/pmap_data.h b/osfmk/arm/pmap/pmap_data.h index 0657906ee..8a87f85a7 100644 --- a/osfmk/arm/pmap/pmap_data.h +++ b/osfmk/arm/pmap/pmap_data.h @@ -151,10 +151,12 @@ pai_to_pvh(unsigned int pai) * type needs to be checked before dereferencing the pointer to determine which * pointer type to dereference as. */ -#define PVH_TYPE_NULL 0x0UL -#define PVH_TYPE_PVEP 0x1UL -#define PVH_TYPE_PTEP 0x2UL -#define PVH_TYPE_PTDP 0x3UL +__enum_closed_decl(pvh_type_t, uint8_t, { + PVH_TYPE_NULL = 0b00, + PVH_TYPE_PVEP = 0b01, + PVH_TYPE_PTEP = 0b10, + PVH_TYPE_PTDP = 0b11, +}); #define PVH_TYPE_MASK (0x3UL) @@ -262,7 +264,7 @@ pai_to_pvh(unsigned int pai) * been mapped into a non-coherent coprocessor address space and requires a * cache flush operation once all mappings have been removed. */ -#define PVH_FLAG_FLUSH_NEEDED (1ULL << 52) +#define PVH_FLAG_FLUSH_NEEDED (1ULL << 54) /** * Marking a pv_head_table entry with any bit in this mask denotes that this page @@ -342,7 +344,7 @@ pvh_unlock(unsigned int index) * otherwise. */ static inline bool -pvh_test_type(pv_entry_t **pvh, vm_offset_t type) +pvh_test_type(pv_entry_t **pvh, pvh_type_t type) { return ((*(vm_offset_t *)pvh) & PVH_TYPE_MASK) == type; } @@ -1943,16 +1945,34 @@ typedef struct pmap_io_range { uint64_t len; /* Strong DSB required for pages in this range. */ - #define PMAP_IO_RANGE_STRONG_SYNC (1UL << 31) + #define PMAP_IO_RANGE_STRONG_SYNC (1U << 31) /* Corresponds to memory carved out by bootloader. 
*/ - #define PMAP_IO_RANGE_CARVEOUT (1UL << 30) + #define PMAP_IO_RANGE_CARVEOUT (1U << 30) /* Pages in this range need to be included in the hibernation image */ - #define PMAP_IO_RANGE_NEEDS_HIBERNATING (1UL << 29) + #define PMAP_IO_RANGE_NEEDS_HIBERNATING (1U << 29) /* Mark the range as 'owned' by a given subsystem */ - #define PMAP_IO_RANGE_OWNED (1UL << 28) + #define PMAP_IO_RANGE_OWNED (1U << 28) + + /** + * Denotes a range that is *not* to be treated as an I/O range that + * needs to be mapped, but only to decorate arbitrary physical + * memory ranges (including of managed memory) with extra + * flags. I.e. this allows tagging of "ordinary" managed memory + * pages with flags like `PMAP_IO_RANGE_PROHIBIT_HIB_WRITE`, or + * informing the SPTM that some (nominally) managed memory pages are + * unavailable for some reason. + * + * Notably, `pmap_find_io_attr()`, and anything else that uses + * `pmap_io_range`s for denoting to-be-mapped I/O ranges, ignores + * entries with this flag. + */ + #define PMAP_IO_RANGE_NOT_IO (1U << 27) + + /* Pages in this range may never be written during hibernation restore. */ + #define PMAP_IO_RANGE_PROHIBIT_HIB_WRITE (1U << 26) /** * Lower 16 bits treated as pp_attr_t, upper 16 bits contain additional @@ -2019,4 +2039,8 @@ extern void pmap_cpu_data_init_internal(unsigned int); */ extern void pmap_flush_noncoherent_page(pmap_paddr_t paddr); +#if DEBUG || DEVELOPMENT +extern unsigned int pmap_wcrt_on_non_dram_count_get(void); +extern void pmap_wcrt_on_non_dram_count_increment_atomic(void); +#endif /* DEBUG || DEVELOPMENT */ #endif /* _ARM_PMAP_PMAP_DATA_H_ */ diff --git a/osfmk/arm/pmap/pmap_misc.c b/osfmk/arm/pmap/pmap_misc.c index 3666c4a10..dacf3b5f4 100644 --- a/osfmk/arm/pmap/pmap_misc.c +++ b/osfmk/arm/pmap/pmap_misc.c @@ -45,9 +45,9 @@ pmap_abandon_measurement(void) #if SCHED_HYGIENE_DEBUG struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); uint64_t istate = pmap_interrupts_disable(); - if (pcpu->pdp_start.pds_mach_time != 0) { - pcpu->pdp_abandon = true; - } + + kern_timeout_override(&pcpu->pdp_timeout); + pmap_interrupts_restore(istate); #endif /* SCHED_HYGIENE_DEBUG */ } diff --git a/osfmk/arm/pmap/pmap_ppl_interface.c b/osfmk/arm/pmap/pmap_ppl_interface.c index c99edb209..39f92e955 100644 --- a/osfmk/arm/pmap/pmap_ppl_interface.c +++ b/osfmk/arm/pmap/pmap_ppl_interface.c @@ -34,7 +34,7 @@ * contains the ppl_handler_table, as well as a few PPL-only entry/exit helper * functions. * - * See doc/ppl.md for more information about how these PPL entry points work. + * See doc/arm/PPL.md for more information about how these PPL entry points work. */ #include @@ -59,7 +59,7 @@ static uint64_t pmap_ppl_pages_returned_to_kernel_count_total = 0; * generates the code for the _ppl() variant which is what is used to jump into * the PPL. * - * See doc/ppl.md for more information about how these PPL entry points work. + * See doc/arm/PPL.md for more information about how these PPL entry points work. */ #if XNU_MONITOR diff --git a/osfmk/arm/pmap/pmap_pt_geometry.h b/osfmk/arm/pmap/pmap_pt_geometry.h index 5d44e72bd..fc64830ee 100644 --- a/osfmk/arm/pmap/pmap_pt_geometry.h +++ b/osfmk/arm/pmap/pmap_pt_geometry.h @@ -85,7 +85,7 @@ struct page_table_ops { * differences between stage 1 and stage 2 page tables. This allows one set of * code to seamlessly handle the differences between various address space * layouts as well as stage 1 vs stage 2 page tables on the fly. See - * doc/arm_pmap.md for more details. 
+ * doc/arm/arm_pmap.md for more details. * * Instead of accessing the fields in this structure directly, it is recommended * to use the page table attribute getter functions defined below. @@ -142,6 +142,12 @@ struct page_table_attr { * virtual address. */ const uint64_t pta_page_shift; + + /** + * Mask of significant address bits. This is the mask needed to address the + * virtual page number portion of the VA. + */ + const uint64_t pta_va_valid_mask; }; typedef struct page_table_attr pt_attr_t; @@ -477,6 +483,12 @@ pt_attr_leaf_level(const pt_attr_t * const pt_attr) return pt_attr_twig_level(pt_attr) + 1; } +/* Significant address bits in PTE */ +static inline uint64_t +pt_attr_va_valid_mask(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_va_valid_mask; +} /** * Return the index into a specific level of page table for a given virtual @@ -489,7 +501,8 @@ pt_attr_leaf_level(const pt_attr_t * const pt_attr) static inline unsigned int ttn_index(const pt_attr_t * const pt_attr, vm_map_address_t addr, unsigned int pt_level) { - const uint64_t index_unshifted = addr & pt_attr_ln_index_mask(pt_attr, pt_level); + const uint64_t addr_masked = addr & pt_attr_va_valid_mask(pt_attr); + const uint64_t index_unshifted = addr_masked & pt_attr_ln_index_mask(pt_attr, pt_level); return (unsigned int)(index_unshifted >> pt_attr_ln_shift(pt_attr, pt_level)); } diff --git a/osfmk/arm/preemption_disable.c b/osfmk/arm/preemption_disable.c index 46d7cd695..fe303028b 100644 --- a/osfmk/arm/preemption_disable.c +++ b/osfmk/arm/preemption_disable.c @@ -113,7 +113,7 @@ _enable_preemption_write_count(thread_t thread, unsigned int count) * * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\ */ -OS_ALWAYS_INLINE +OS_ALWAYS_INLINE __mockable void _disable_preemption(void) { @@ -146,7 +146,7 @@ _disable_preemption(void) * without taking measurements (and later potentially triggering * actions on those). */ -OS_ALWAYS_INLINE +OS_ALWAYS_INLINE __mockable void _disable_preemption_without_measurements(void) { @@ -184,7 +184,7 @@ _enable_preemption_underflow(void) * * /!\ Breaking inlining causes zalloc to be roughly 10% slower /!\ */ -OS_ALWAYS_INLINE +OS_ALWAYS_INLINE __mockable void _enable_preemption(void) { @@ -196,8 +196,10 @@ _enable_preemption(void) } #if SCHED_HYGIENE_DEBUG - if (__improbable(count == SCHED_HYGIENE_MARKER + 1)) { - return _collect_preemption_disable_measurement(); + if (improbable_static_if(sched_debug_preemption_disable)) { + if (__improbable(count == SCHED_HYGIENE_MARKER + 1)) { + return _collect_preemption_disable_measurement(); + } } #endif /* SCHED_HYGIENE_DEBUG */ @@ -260,19 +262,14 @@ struct _preemption_disable_pcpu PERCPU_DATA(_preemption_disable_pcpu_data); * Interrupts must be disabled when calling this function, * but the assertion has been elided as this is on the fast path. 
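[Editor's note: illustrative sketch, not part of the imported diff.] The new pta_va_valid_mask field and the ttn_index() change above make page-table indexing mask a VA down to its significant bits before extracting the per-level index, and PMAP_ROOT_ALLOC_SIZE / pmap_root_alloc_size() derive the root-table size from the same mask rather than from ARM_PGBYTES. The standalone arithmetic model below uses invented mask/shift values that only show the shape of the computation, not the real TCR-derived constants.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Invented values for illustration only. */
    #define VA_VALID_MASK 0x0000FFFFFFFFFFFFULL  /* 48 significant VA bits */
    #define L1_INDEX_MASK 0x0000FFF000000000ULL
    #define L1_SHIFT      36

    static unsigned int
    table_index(uint64_t va)
    {
        /* Strip non-address bits first, then pull out this level's field. */
        uint64_t masked = va & VA_VALID_MASK;
        return (unsigned int)((masked & L1_INDEX_MASK) >> L1_SHIFT);
    }

    int
    main(void)
    {
        /* Root table must cover every index reachable through the mask. */
        uint64_t max_index = VA_VALID_MASK >> L1_SHIFT;
        uint64_t root_bytes = (max_index + 1) * sizeof(uint64_t);

        printf("root table: %" PRIu64 " entries, %" PRIu64 " bytes\n",
            max_index + 1, root_bytes);
        printf("L1 index of 0xFFF0001234567890: %u\n",
            table_index(0xFFF0001234567890ULL));
        return 0;
    }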
*/ +OS_ALWAYS_INLINE static void _preemption_disable_snap_start(void) { struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); - pcpu->pdp_abandon = false; - pcpu->pdp_start.pds_mach_time = ml_get_sched_hygiene_timebase(); - pcpu->pdp_start.pds_int_mach_time = recount_current_processor_interrupt_duration_mach(); -#if CONFIG_CPU_COUNTERS - if (static_if(sched_debug_pmc)) { - mt_cur_cpu_cycles_instrs_speculative(&pcpu->pdp_start.pds_cycles, - &pcpu->pdp_start.pds_instrs); - } -#endif /* CONFIG_CPU_COUNTERS */ + const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS | ML_TIMEOUT_PMC_FLAGS | TF_SAMPLE_INTERRUPT_TIME | TF_BACKTRACE; + + kern_timeout_start(&pcpu->pdp_timeout, flags); } /* @@ -280,24 +277,21 @@ _preemption_disable_snap_start(void) * End a measurement window for the current CPU's preemption disable timeout, * using the snapshot started by _preemption_disable_snap_start(). * -* @param start An out-parameter for the starting snapshot, -* captured while interrupts are disabled. -* -* @param now An out-parameter for the current times, +* @param top An out-parameter for the current times, * captured at the same time as the start and with interrupts disabled. +* * This is meant for computing a delta. * Even with @link sched_hygiene_debug_pmc , the PMCs will not be read. * This allows their (relatively expensive) reads to happen only if the time threshold has been violated. * * @return Whether to abandon the current measurement due to a call to abandon_preemption_disable_measurement(). */ +OS_ALWAYS_INLINE static bool -_preemption_disable_snap_end( - struct _preemption_disable_snap *start, - struct _preemption_disable_snap *now) +_preemption_disable_snap_end(kern_timeout_t *top) { struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); - + const timeout_flags_t flags = ML_TIMEOUT_TIMEBASE_FLAGS | TF_SAMPLE_INTERRUPT_TIME; const bool int_masked_debug = false; const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug); /* @@ -313,31 +307,18 @@ _preemption_disable_snap_end( * grabbed time. With interrupts disabled we don't care much about * the order.) */ + kern_timeout_end(&pcpu->pdp_timeout, flags); - *start = pcpu->pdp_start; - uint64_t now_time = ml_get_sched_hygiene_timebase(); - now->pds_mach_time = now_time; - now->pds_int_mach_time = recount_current_processor_interrupt_duration_mach(); - const bool abandon = pcpu->pdp_abandon; const uint64_t max_duration = os_atomic_load(&pcpu->pdp_max_mach_duration, relaxed); - - pcpu->pdp_start.pds_mach_time = 0; - - /* - * Don't need to reset (or even save) pdp_abandon here: - * abandon_preemption_disable_measurement is a no-op anyway - * if pdp_start.pds_mach_time == 0 (which we just set), and it - * will stay that way until the next call to - * _collect_preemption_disable_measurement. 
- */ - ml_set_interrupts_enabled_with_debug(istate, int_masked_debug); - if (__probable(!abandon)) { - const int64_t gross_duration = now_time - start->pds_mach_time; - if (__improbable(gross_duration > max_duration)) { - os_atomic_store(&pcpu->pdp_max_mach_duration, gross_duration, relaxed); - } + const uint64_t gross_duration = kern_timeout_gross_duration(&pcpu->pdp_timeout); + if (__improbable(gross_duration > max_duration)) { + os_atomic_store(&pcpu->pdp_max_mach_duration, gross_duration, relaxed); } - return abandon; + + *top = pcpu->pdp_timeout; + ml_set_interrupts_enabled_with_debug(istate, int_masked_debug); + + return gross_duration == 0; } OS_NOINLINE @@ -346,7 +327,7 @@ _prepare_preemption_disable_measurement(void) { thread_t thread = current_thread(); - if (thread->machine.inthandler_timestamp == 0) { + if (thread->machine.int_handler_addr == 0) { /* * Only prepare a measurement if not currently in an interrupt * handler. @@ -378,64 +359,34 @@ OS_NOINLINE void _collect_preemption_disable_measurement(void) { - struct _preemption_disable_snap start = { 0 }; - struct _preemption_disable_snap now = { 0 }; - const bool abandon = _preemption_disable_snap_end(&start, &now); + kern_timeout_t to; + const bool abandon = _preemption_disable_snap_end(&to); if (__improbable(abandon)) { goto out; } - int64_t const gross_duration = now.pds_mach_time - start.pds_mach_time; - uint64_t const threshold = os_atomic_load(&sched_preemption_disable_threshold_mt, relaxed); + const uint64_t gross_duration = kern_timeout_gross_duration(&to); + const uint64_t threshold = os_atomic_load(&sched_preemption_disable_threshold_mt, relaxed); if (__improbable(threshold > 0 && gross_duration >= threshold)) { /* * Double check that the time spent not handling interrupts is over the threshold. */ - int64_t const interrupt_duration = now.pds_int_mach_time - start.pds_int_mach_time; - int64_t const net_duration = gross_duration - interrupt_duration; + const int64_t net_duration = kern_timeout_net_duration(&to); + uint64_t average_cpi_whole, average_cpi_fractional; + assert3u(net_duration, >=, 0); if (net_duration < threshold) { goto out; } - uint64_t average_freq = 0; - uint64_t average_cpi_whole = 0; - uint64_t average_cpi_fractional = 0; - -#if CONFIG_CPU_COUNTERS - if (static_if(sched_debug_pmc)) { - /* - * We're getting these values a bit late, but getting them - * is a bit expensive, so we take the slight hit in - * accuracy for the reported values (which aren't very - * stable anyway). 
- */ - const bool int_masked_debug = false; - const bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug); - mt_cur_cpu_cycles_instrs_speculative(&now.pds_cycles, &now.pds_instrs); - ml_set_interrupts_enabled_with_debug(istate, int_masked_debug); - const uint64_t cycles_elapsed = now.pds_cycles - start.pds_cycles; - const uint64_t instrs_retired = now.pds_instrs - start.pds_instrs; - - uint64_t duration_ns; - absolutetime_to_nanoseconds(gross_duration, &duration_ns); - - average_freq = cycles_elapsed / (duration_ns / 1000); - average_cpi_whole = cycles_elapsed / instrs_retired; - average_cpi_fractional = - ((cycles_elapsed * 100) / instrs_retired) % 100; - } -#endif /* CONFIG_CPU_COUNTERS */ - if (__probable(sched_preemption_disable_debug_mode == SCHED_HYGIENE_MODE_PANIC)) { - panic("preemption disable timeout exceeded: %llu >= %llu mt ticks (start: %llu, now: %llu, gross: %llu, inttime: %llu), " - "freq = %llu MHz, CPI = %llu.%llu", - net_duration, threshold, start.pds_mach_time, now.pds_mach_time, - gross_duration, interrupt_duration, - average_freq, average_cpi_whole, average_cpi_fractional); + kern_timeout_try_panic(KERN_TIMEOUT_PREEMPTION, 0, &to, + "preemption disable timeout exceeded:", threshold); } + kern_timeout_cpi(&to, &average_cpi_whole, &average_cpi_fractional); + DTRACE_SCHED4(mach_preemption_expired, uint64_t, net_duration, uint64_t, gross_duration, uint64_t, average_cpi_whole, uint64_t, average_cpi_fractional); KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PREEMPTION_EXPIRED), net_duration, gross_duration, average_cpi_whole, average_cpi_fractional); @@ -457,13 +408,9 @@ out: void abandon_preemption_disable_measurement(void) { - const bool int_masked_debug = false; - bool istate = ml_set_interrupts_enabled_with_debug(false, int_masked_debug); struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); - if (pcpu->pdp_start.pds_mach_time != 0) { - pcpu->pdp_abandon = true; - } - ml_set_interrupts_enabled_with_debug(istate, int_masked_debug); + + kern_timeout_override(&pcpu->pdp_timeout); } /* Inner part of disable_preemption_without_measuerments() */ @@ -476,7 +423,7 @@ _do_disable_preemption_without_measurements(void) * that we didn't really care. */ struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); - pcpu->pdp_abandon = true; + kern_timeout_override(&pcpu->pdp_timeout); } /** @@ -528,6 +475,12 @@ sched_perfcontrol_abandon_preemption_disable_measurement(void) #else /* SCHED_HYGIENE_DEBUG */ +void +abandon_preemption_disable_measurement(void) +{ + // No-op. Function is exported, so needs to be defined +} + void sched_perfcontrol_abandon_preemption_disable_measurement(void) { diff --git a/osfmk/arm/preemption_disable_internal.h b/osfmk/arm/preemption_disable_internal.h index a15ed3e85..47d224bbe 100644 --- a/osfmk/arm/preemption_disable_internal.h +++ b/osfmk/arm/preemption_disable_internal.h @@ -27,48 +27,28 @@ */ #include +#include #include #include #include #pragma once -#if CONFIG_SPTM /* - * This header is only meant for the PPL and SPTM to access the preemption disable data structure directly. + * This header is only meant for the PPL and SPTM pmap to access the preemption disable data + * structure directly for the purposes of managing timeouts. */ -#endif /* CONFIG_SPTM */ /** * Track time and other counters during a preemption disabled window, * when `SCHED_HYGIENE` is configured. 
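[Editor's note: illustrative sketch, not part of the imported diff.] The preemption_disable.c rewrite above folds the hand-rolled snapshot bookkeeping (pds_mach_time, pds_int_mach_time, pdp_abandon and the optional PMC reads) into the new kern_timeout_* helpers, but the shape of the measurement is unchanged: start a window when preemption is disabled, compute the gross duration when it is re-enabled, keep the per-CPU maximum, compare against a threshold, and allow the window to be abandoned. The POSIX userspace model below illustrates only that flow; it is not the kern_timeout API.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct window {
        uint64_t start_ns;   /* 0 means no measurement in flight */
        uint64_t max_ns;     /* worst duration seen so far       */
        bool     abandoned;  /* models the abandon/override path */
    };

    static uint64_t
    now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
    }

    static void
    window_start(struct window *w)
    {
        w->abandoned = false;
        w->start_ns = now_ns();
    }

    static void
    window_end(struct window *w, uint64_t threshold_ns)
    {
        if (w->start_ns == 0 || w->abandoned) {
            w->start_ns = 0;
            return;                     /* nothing to report */
        }
        uint64_t gross = now_ns() - w->start_ns;
        if (gross > w->max_ns) {
            w->max_ns = gross;
        }
        if (threshold_ns != 0 && gross >= threshold_ns) {
            printf("window exceeded threshold: %llu ns\n",
                (unsigned long long)gross);
        }
        w->start_ns = 0;
    }

    int
    main(void)
    {
        struct window w = {0};
        window_start(&w);
        window_end(&w, 1);   /* tiny threshold, so it reports */
        printf("max seen: %llu ns\n", (unsigned long long)w.max_ns);
        return 0;
    }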
*/ struct _preemption_disable_pcpu { - /** - * A snapshot of times and counters relevant to preemption disable measurement. - */ - struct _preemption_disable_snap { - /* The time when preemption was disabled, in Mach time units. */ - uint64_t pds_mach_time; - /* The amount of time spent in interrupts by the current CPU, in Mach time units. */ - uint64_t pds_int_mach_time; -#if CONFIG_CPU_COUNTERS - /* The number of cycles elapsed on this CPU. */ - uint64_t pds_cycles; - /* The number of instructions seen by this CPU. */ - uint64_t pds_instrs; -#endif /* CONFIG_CPU_COUNTERS */ - } - /* At the start of the preemption disabled window. */ - pdp_start; + /* Timeout structure to track this preemption disable. */ + kern_timeout_t pdp_timeout; /* The maximum duration seen by this CPU, in Mach time units. */ _Atomic uint64_t pdp_max_mach_duration; - /* - * Whether to abandon the measurement on this CPU, - * due to a call to abandon_preemption_disable_measurement(). - */ - bool pdp_abandon; }; PERCPU_DECL(struct _preemption_disable_pcpu, _preemption_disable_pcpu_data); diff --git a/osfmk/arm/task.h b/osfmk/arm/task.h index a69e9d0a2..9e77380c5 100644 --- a/osfmk/arm/task.h +++ b/osfmk/arm/task.h @@ -75,6 +75,8 @@ #define TASK_ADDITIONS_PAC #endif +#define TASK_ADDITIONS_HW_AND_EMULATION + #define TASK_ADDITIONS_UEXC uint64_t uexc[4]; @@ -90,6 +92,7 @@ #define MACHINE_TASK \ void * XNU_PTRAUTH_SIGNED_PTR("task.task_debug") task_debug; \ TASK_ADDITIONS_PAC \ + TASK_ADDITIONS_HW_AND_EMULATION \ \ TASK_ADDITIONS_UEXC \ TASK_ADDITIONS_X18 \ diff --git a/osfmk/arm/thread.h b/osfmk/arm/thread.h index bddbdb5d2..843eee4aa 100644 --- a/osfmk/arm/thread.h +++ b/osfmk/arm/thread.h @@ -68,7 +68,10 @@ #include #include #include +#if SCHED_HYGIENE_DEBUG +#include #endif +#endif /* MACH_KERNEL_PRIVATE */ struct perfcontrol_state { uint64_t opaque[8] __attribute__((aligned(8))); @@ -92,6 +95,10 @@ typedef arm_kernel_context_t machine_thread_kernel_state; #endif +#if HAVE_MACHINE_THREAD_MATRIX_STATE +#define UMATRIX_PTRAUTH XNU_PTRAUTH_SIGNED_PTR("machine_thread.umatrix_hdr") +#endif + /* * Machine Thread Structure */ @@ -114,9 +121,9 @@ struct machine_thread { #if HAVE_MACHINE_THREAD_MATRIX_STATE union { - arm_state_hdr_t *umatrix_hdr; + arm_state_hdr_t *UMATRIX_PTRAUTH umatrix_hdr; #if HAS_ARM_FEAT_SME - arm_sme_saved_state_t *usme; /* pointer to user SME state */ + arm_sme_saved_state_t *UMATRIX_PTRAUTH usme; /* pointer to user SME state */ #endif }; #endif /* HAVE_MACHINE_THREAD_MATRIX_STATE */ @@ -134,12 +141,7 @@ struct machine_thread { uint64_t reserved5; #if SCHED_HYGIENE_DEBUG - uint64_t intmask_timestamp; /* timestamp of when interrupts were manually masked */ - uint64_t inthandler_timestamp; /* timestamp of when interrupt handler started */ - uint64_t intmask_cycles; /* cycle count snapshot of when interrupts were masked */ - uint64_t intmask_instr; /* instruction count snapshot of when interrupts were masked */ - bool inthandler_abandon; /* whether to abandon the current measurement */ - + kern_timeout_t int_timeout; /* for interrupt disabled timeout mechanism */ unsigned int int_type; /* interrupt type of the interrupt that was processed */ uintptr_t int_handler_addr; /* slid, ptrauth-stripped virtual address of the interrupt handler */ uintptr_t int_vector; /* IOInterruptVector */ @@ -162,21 +164,26 @@ struct machine_thread { uint16_t exception_trace_code; bool reserved7; bool reserved8; + bool reserved9; #if defined(HAS_APPLE_PAC) uint64_t rop_pid; uint64_t jop_pid; #else - uint64_t reserved9; 
uint64_t reserved10; + uint64_t reserved11; #endif - uint64_t reserved11; + uint64_t reserved12; #if HAS_ARM_FEAT_SME uint64_t tpidr2_el0; #else - uint64_t reserved12; + uint64_t reserved13; #endif + + uint64_t reserved14; + + bool reserved15; }; #endif diff --git a/osfmk/arm64/Makefile b/osfmk/arm64/Makefile index 020ff7f41..31958f78c 100644 --- a/osfmk/arm64/Makefile +++ b/osfmk/arm64/Makefile @@ -54,6 +54,7 @@ EXPORT_MD_LIST = \ tlb.h \ pal_hibernate.h + # These headers will be available with #include EXPORT_MD_DIR = arm64 diff --git a/osfmk/arm64/amcc_rorgn.h b/osfmk/arm64/amcc_rorgn.h index 72d0e2b89..70872e0d3 100644 --- a/osfmk/arm64/amcc_rorgn.h +++ b/osfmk/arm64/amcc_rorgn.h @@ -35,61 +35,16 @@ __BEGIN_DECLS -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) extern vm_offset_t ctrr_begin, ctrr_end; -#if CONFIG_CSR_FROM_DT -extern bool csr_unsafe_kernel_text; -#endif /* CONFIG_CSR_FROM_DT */ - -#if DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT -extern bool rorgn_disable; -#else -#define rorgn_disable false -#endif /* DEVELOPMENT || DEBUG */ - void rorgn_stash_range(void); void rorgn_lockdown(void); bool rorgn_contains(vm_offset_t addr, vm_size_t size, bool defval); void rorgn_validate_core(void); -#if KERNEL_CTRR_VERSION >= 3 -#define CTXR_XN_DISALLOW_ALL \ - /* Execute Masks for EL2&0 */ \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_EL2_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_EL0TGE1_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_GL2_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_GL0TGE1_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_MMUOFF_shift) | \ - /* Execute Masks for EL1&0 when Stage2 Translation is disabled */ \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_EL1_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_EL0TGE0_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_GL1_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_GL0TGE0_shift) - -#define CTXR_XN_KERNEL \ - /* Execute Masks for EL2&0 */ \ - (CTXR3_XN_disallow_outside << CTXR3_x_CTL_EL2_XN_EL2_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_EL0TGE1_shift) | \ - (CTXR3_XN_disallow_outside << CTXR3_x_CTL_EL2_XN_GL2_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_GL0TGE1_shift) | \ - (CTXR3_XN_disallow_outside << CTXR3_x_CTL_EL2_XN_MMUOFF_shift) | \ - /* Execute Masks for EL1&0 when Stage2 Translation is disabled */ \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_EL1_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_EL0TGE0_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_GL1_shift) | \ - (CTXR3_XN_disallow_inside << CTXR3_x_CTL_EL2_XN_GL0TGE0_shift) -#endif /* KERNEL_CTRR_VERSION >= 3 */ - -#else - -#if CONFIG_CSR_FROM_DT -#define csr_unsafe_kernel_text false -#endif /* CONFIG_CSR_FROM_DT */ - -#define rorgn_disable false -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ __END_DECLS diff --git a/osfmk/arm64/apt.c b/osfmk/arm64/apt.c new file mode 100644 index 000000000..5a21f5cc6 --- /dev/null +++ b/osfmk/arm64/apt.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// fixme: rdar://114299113 tracks resolving the supportlib issue with hwtrace features + + +bool +apt_allocate_va_buffer(__unused size_t allocation_size, vm_map_offset_t *__unused ret_mapped_addr, upl_t *__unused ret_upl); +bool +apt_allocate_va_buffer(__unused size_t allocation_size, vm_map_offset_t *__unused ret_mapped_addr, upl_t *__unused ret_upl) +{ + return false; +} + +void +apt_free_va_buffer(__unused size_t allocation_size, __unused vm_map_offset_t mapped_addr, __unused upl_t upl); +void +apt_free_va_buffer(__unused size_t allocation_size, __unused vm_map_offset_t mapped_addr, __unused upl_t upl) +{ +} diff --git a/osfmk/arm64/arm_vm_init.c b/osfmk/arm64/arm_vm_init.c index 6b7316564..eb20109a8 100644 --- a/osfmk/arm64/arm_vm_init.c +++ b/osfmk/arm64/arm_vm_init.c @@ -66,7 +66,7 @@ static_assert((KERNEL_PMAP_HEAP_RANGE_START & ~ARM_TT_ROOT_OFFMASK) > ARM_KERNEL * We must have enough space in the TTBR1_EL1 range to create the EL0 mapping of * the exception vectors. */ -static_assert((((~ARM_KERNEL_PROTECT_EXCEPTION_START) + 1) * 2ULL) <= (ARM_TT_ROOT_SIZE + ARM_TT_ROOT_INDEX_MASK)); +static_assert((KERN_PROTECT_REGION_SIZE * 2ULL) <= KERN_ADDRESS_SPACE_SIZE); #endif /* __ARM_KERNEL_PROTECT__ */ #define ARM_DYNAMIC_TABLE_XN (ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN) @@ -457,7 +457,7 @@ alloc_ptpage(boolean_t map_static) { vm_offset_t vaddr; -#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) +#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) map_static = FALSE; #endif @@ -619,7 +619,7 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte) * Walk the target page table to find the PTE for the given virtual * address. Allocate any page table pages needed to do this. 
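These arm_vm_init.c hunks replace the open-coded level-1 index arithmetic, (vaddr & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT, with the L1_TABLE_T1_INDEX() macro parameterized on TCR_EL1_BOOT. As a minimal standalone sketch of the mask-and-shift extraction being abstracted (the shift and index width below are placeholders, not the real ARM_TT_* geometry):

#include <stdint.h>
#include <stdio.h>

/*
 * Placeholder geometry for illustration only: a level-1 entry covering
 * bits 38:30 of the virtual address (1GB per entry, 512 entries). The
 * kernel derives the real values from its ARM_TT_* definitions and the
 * boot-time TCR configuration.
 */
#define L1_SHIFT      30
#define L1_INDEX_BITS 9
#define L1_INDEX_MASK ((((uint64_t)1 << L1_INDEX_BITS) - 1) << L1_SHIFT)

static inline uint64_t
l1_table_index(uint64_t vaddr)
{
	/* Select the index bits, then shift them down to a table offset. */
	return (vaddr & L1_INDEX_MASK) >> L1_SHIFT;
}

int
main(void)
{
	uint64_t va = 0x0000000fc0200000ULL;
	printf("VA 0x%llx -> L1 index %llu\n",
	    (unsigned long long)va, (unsigned long long)l1_table_index(va));
	return 0;
}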
*/ - l1_ttep = ttp + ((vaddr & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + l1_ttep = ttp + L1_TABLE_T1_INDEX(vaddr, TCR_EL1_BOOT); l1_tte = *l1_ttep; if (l1_tte == ARM_TTE_EMPTY) { @@ -666,7 +666,7 @@ arm_vm_map(tt_entry_t * root_ttp, vm_offset_t vaddr, pt_entry_t pte) *ptep = pte; } -#endif // __ARM_KERNEL_PROTECT || XNU_MONITOR +#endif /* __ARM_KERNEL_PROTECT__ || XNU_MONITOR */ #if __ARM_KERNEL_PROTECT__ @@ -681,7 +681,7 @@ static void arm_vm_kernel_el0_map(vm_offset_t vaddr, pt_entry_t pte) { /* Calculate where vaddr will be in the EL1 kernel page tables. */ - vm_offset_t kernel_pmap_vaddr = vaddr - ((ARM_TT_ROOT_INDEX_MASK + ARM_TT_ROOT_SIZE) / 2ULL); + vm_offset_t kernel_pmap_vaddr = vaddr - KERN_PROTECT_REGION_SIZE; arm_vm_map(cpu_tte, kernel_pmap_vaddr, pte); } @@ -716,7 +716,7 @@ arm_vm_kernel_pte(vm_offset_t vaddr) pt_entry_t * ptep = NULL; pt_entry_t pte = 0; - ttep = ttp + ((vaddr & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + ttep = ttp + L1_TABLE_T1_INDEX(vaddr, TCR_EL1_BOOT); tte = *ttep; assert(tte & ARM_TTE_VALID); @@ -824,7 +824,7 @@ arm_vm_expand_kernel_el0_mappings(void) } #endif /* __ARM_KERNEL_PROTECT__ */ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) extern void bootstrap_instructions; /* @@ -858,7 +858,7 @@ arm_replace_identity_map(void) */ l1_ptp_phys = kvtophys((vm_offset_t)&bootstrap_pagetables); l1_ptp_virt = (tt_entry_t *)phystokv(l1_ptp_phys); - tte1 = &l1_ptp_virt[L1_TABLE_INDEX(paddr)]; + tte1 = &l1_ptp_virt[L1_TABLE_T1_INDEX(paddr, TCR_EL1_BOOT)]; l2_ptp_virt = L2_TABLE_VA(tte1); l2_ptp_phys = (*tte1) & ARM_TTE_TABLE_MASK; @@ -887,7 +887,7 @@ arm_replace_identity_map(void) ARM_PTE_AP(AP_RONA) | ARM_PTE_NX; } -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ tt_entry_t *arm_kva_to_tte(vm_offset_t); @@ -895,7 +895,7 @@ tt_entry_t * arm_kva_to_tte(vm_offset_t va) { tt_entry_t *tte1, *tte2; - tte1 = cpu_tte + L1_TABLE_INDEX(va); + tte1 = cpu_tte + L1_TABLE_T1_INDEX(va, TCR_EL1_BOOT); tte2 = L2_TABLE_VA(tte1) + L2_TABLE_INDEX(va); return tte2; @@ -1560,7 +1560,7 @@ arm_vm_physmap_init(boot_args *args) arm_vm_physmap_slide(temp_ptov_table, gVirtBase, segLOWEST - gVirtBase, AP_RWNA, 0); // kext bootstrap segments -#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) +#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) && !defined(KERNEL_INTEGRITY_PV_CTRR) /* __KLD,__text is covered by the rorgn */ arm_vm_physmap_slide(temp_ptov_table, segKLDB, segSizeKLD, AP_RONA, 0); #endif @@ -1575,11 +1575,11 @@ arm_vm_physmap_init(boot_args *args) keep_linkedit = TRUE; } #endif /* CONFIG_DTRACE */ -#if KASAN_DYNAMIC_BLACKLIST - /* KASAN's dynamic blacklist needs to query the LINKEDIT segment at runtime. As such, the +#if KASAN_DYNAMIC_DENYLIST + /* KASAN's dynamic denylist needs to query the LINKEDIT segment at runtime. As such, the * kext bootstrap code will not jettison LINKEDIT on kasan kernels, so don't bother to relocate it. 
*/ keep_linkedit = TRUE; -#endif +#endif /* KASAN_DYNAMIC_DENYLIST */ if (!keep_linkedit) { // Kernel LINKEDIT arm_vm_physmap_slide(temp_ptov_table, segLINKB, segSizeLINK, AP_RWNA, 0); @@ -1686,7 +1686,7 @@ arm_vm_prot_finalize(boot_args * args __unused) #endif /* __ARM_KERNEL_PROTECT__ */ #if XNU_MONITOR -#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) +#if !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) && !defined(KERNEL_INTEGRITY_PV_CTRR) /* __KLD,__text is covered by the rorgn */ for (vm_offset_t va = segKLDB; va < (segKLDB + segSizeKLD); va += ARM_PGBYTES) { pt_entry_t *pte = arm_kva_to_pte(va); @@ -1722,7 +1722,7 @@ arm_vm_prot_finalize(boot_args * args __unused) } #endif /* XNU_MONITOR */ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* * __LAST,__pinst should no longer be executable. */ @@ -1751,48 +1751,6 @@ arm_vm_prot_finalize(boot_args * args __unused) flush_mmu_tlb(); } -/* - * TBI (top-byte ignore) is an ARMv8 feature for ignoring the top 8 bits of - * address accesses. It can be enabled separately for TTBR0 (user) and - * TTBR1 (kernel). - */ -void -arm_set_kernel_tbi(void) -{ -#if !__ARM_KERNEL_PROTECT__ && CONFIG_KERNEL_TBI - uint64_t old_tcr, new_tcr; - - old_tcr = new_tcr = get_tcr(); - /* - * For kernel configurations that require TBI support on - * PAC systems, we enable DATA TBI only. - */ - new_tcr |= TCR_TBI1_TOPBYTE_IGNORED; - new_tcr |= TCR_TBID1_ENABLE; - - if (old_tcr != new_tcr) { - set_tcr(new_tcr); - sysreg_restore.tcr_el1 = new_tcr; - } -#endif /* !__ARM_KERNEL_PROTECT__ && CONFIG_KERNEL_TBI */ -} - -static void -arm_set_user_tbi(void) -{ -#if !__ARM_KERNEL_PROTECT__ - uint64_t old_tcr, new_tcr; - - old_tcr = new_tcr = get_tcr(); - new_tcr |= TCR_TBI0_TOPBYTE_IGNORED; - - if (old_tcr != new_tcr) { - set_tcr(new_tcr); - sysreg_restore.tcr_el1 = new_tcr; - } -#endif /* !__ARM_KERNEL_PROTECT__ */ -} - /* * Initialize and enter blank (invalid) page tables in a L1 translation table for a given VA range. 
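The removed arm_set_kernel_tbi()/arm_set_user_tbi() helpers enabled top-byte ignore, which lets software carry a tag in bits 63:56 of a data address while the hardware ignores them during translation. A minimal user-space sketch of the tagging and stripping arithmetic only; enabling TBI through TCR_EL1 is a privileged operation and is omitted, and the names and example address here are illustrative:

#include <stdint.h>
#include <stdio.h>

/* With TBI, bits 63:56 of a data address are ignored by the MMU, so a
 * tag can be stored there and stripped before any address comparison. */
#define TAG_SHIFT 56
#define TAG_MASK  (0xffULL << TAG_SHIFT)

static inline uint64_t
ptr_with_tag(uint64_t addr, uint8_t tag)
{
	return (addr & ~TAG_MASK) | ((uint64_t)tag << TAG_SHIFT);
}

static inline uint64_t
ptr_strip_tag(uint64_t addr)
{
	return addr & ~TAG_MASK;
}

int
main(void)
{
	uint64_t p = 0x0000000102030400ULL;
	uint64_t tagged = ptr_with_tag(p, 0x5a);
	printf("tagged=0x%016llx stripped=0x%016llx\n",
	    (unsigned long long)tagged,
	    (unsigned long long)ptr_strip_tag(tagged));
	return 0;
}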
* @@ -1819,7 +1777,7 @@ init_ptpages(tt_entry_t *tt, vm_map_address_t start, vm_map_address_t end, bool tt_entry_t *l1_tte; vm_offset_t ptpage_vaddr; - l1_tte = tt + ((start & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + l1_tte = tt + L1_TABLE_T1_INDEX(start, TCR_EL1_BOOT); while (start < end) { if (*l1_tte == ARM_TTE_EMPTY) { @@ -1973,7 +1931,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) */ first_avail_phys = avail_start = args->topOfKernelData; -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) arm_replace_identity_map(); #endif @@ -2136,7 +2094,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) * LOW_GLOBAL_BASE_ADDRESS + 2MB */ va_l1 = va_l2 = LOW_GLOBAL_BASE_ADDRESS; - cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(va_l1, TCR_EL1_BOOT); cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); ptpage_vaddr = alloc_ptpage(TRUE); *cpu_l2_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN; @@ -2172,8 +2130,6 @@ arm_vm_init(uint64_t memory_size, boot_args * args) mt_early_init(); #endif /* CONFIG_CPU_COUNTERS */ - arm_set_user_tbi(); - arm_vm_physmap_init(args); set_mmu_ttb_alternate(cpu_ttep & TTBR_BADDR_MASK); @@ -2251,7 +2207,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) } dynamic_memory_begin = ROUND_TWIG(dynamic_memory_begin); -#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) +#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) // reserve a 32MB region without permission overrides to use later for a CTRR unit test { extern vm_offset_t ctrr_test_page; @@ -2259,7 +2215,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) ctrr_test_page = dynamic_memory_begin; dynamic_memory_begin += ARM_TT_L2_SIZE; - cpu_l1_tte = cpu_tte + ((ctrr_test_page & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(ctrr_test_page, TCR_EL1_BOOT); assert((*cpu_l1_tte) & ARM_TTE_VALID); cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((ctrr_test_page & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); assert((*cpu_l2_tte) == ARM_TTE_EMPTY); @@ -2267,7 +2223,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) bzero(new_tte, ARM_PGBYTES); *cpu_l2_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; } -#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */ +#endif /* (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) */ #if XNU_MONITOR for (vm_offset_t cur = (vm_offset_t)pmap_stacks_start; cur < (vm_offset_t)pmap_stacks_end; cur += ARM_PGBYTES) { arm_vm_map(cpu_tte, cur, ARM_PTE_EMPTY); @@ -2295,7 +2251,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) va_l1_end += round_page(args->Video.v_height * args->Video.v_rowBytes); va_l1_end = (va_l1_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL; - cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(va_l1, TCR_EL1_BOOT); while (va_l1 < va_l1_end) { va_l2 = va_l1; @@ -2330,7 +2286,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) cpu_l1_tte++; } -#if defined(KERNEL_INTEGRITY_KTRR) || 
defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* * In this configuration, the bootstrap mappings (arm_vm_init) and * the heap mappings occupy separate L1 regions. Explicitly set up @@ -2347,7 +2303,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) /* For large memory systems with no KTRR/CTRR such as virtual machines */ init_ptpages(cpu_tte, KERNEL_PMAP_HEAP_RANGE_START & ~ARM_TT_L1_OFFMASK, VM_MAX_KERNEL_ADDRESS, FALSE, ARM_DYNAMIC_TABLE_XN | ARM_TTE_TABLE_AP(ARM_TTE_TABLE_AP_USER_NA)); #endif -#endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* * Initialize l3 page table pages : @@ -2357,7 +2313,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) va_l1 = (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - PE_EARLY_BOOT_VA; va_l1_end = VM_MAX_KERNEL_ADDRESS; - cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(va_l1, TCR_EL1_BOOT); while (va_l1 < va_l1_end) { va_l2 = va_l1; diff --git a/osfmk/arm64/asm.h b/osfmk/arm64/asm.h index 4a477da7a..84deb3f38 100644 --- a/osfmk/arm64/asm.h +++ b/osfmk/arm64/asm.h @@ -172,6 +172,7 @@ #endif .endmacro + /** * Push a stack frame. * diff --git a/osfmk/arm64/bcopy.s b/osfmk/arm64/bcopy.s index cc1055a5b..d94fb506a 100644 --- a/osfmk/arm64/bcopy.s +++ b/osfmk/arm64/bcopy.s @@ -180,7 +180,7 @@ L_forwardCleanup: stp x12,x13,[x3, #32] stp x14,x15,[x3, #48] POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _memcpy /***************************************************************************** * forward small copy * @@ -206,7 +206,7 @@ L_forwardSmallCopy: subs x2, x2, #1 b.ne 1b 2: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _memcpy /***************************************************************************** * Reverse copy engines * @@ -273,7 +273,7 @@ L_reverseCleanup: stp x12,x13,[x0, #16] // In the forward copy, we need to compute the stp x14,x15,[x0] // address of these stores, but here we already POP_FRAME // have a pointer to the start of the buffer. - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _memcpy /***************************************************************************** * reverse small copy * @@ -291,9 +291,9 @@ L_reverseSmallCopy: subs x2, x2, #1 b.ne 1b 2: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _memcpy L_return: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _memcpy diff --git a/osfmk/arm64/bti_telemetry.c b/osfmk/arm64/bti_telemetry.c deleted file mode 100644 index 6fbbbb3ec..000000000 --- a/osfmk/arm64/bti_telemetry.c +++ /dev/null @@ -1,549 +0,0 @@ -/* - * Copyright (c) 2022 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * BTI Telemetry is a debug feature intended to support a safer, slow rollout - * of ARMv8.5's Branch Target Indication in and across the kernel. - * Telemetry mode converts normally fatal BTI exceptions into non-fatal, - * analytics generating events. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_BTI_TELEMETRY -#define TAG "[bti_telemetry] " - -/* ~* Module Configuration *~ */ -/** - * Enable reporting via CoreAnalytics in addition to local gathering. - */ -#define BTI_TELEMETRY_USE_CORE_ANALYTICS (1) - -typedef struct bti_telemetry_record { - SPLAY_ENTRY(bti_telemetry_record) link; - - /** Slid address at which the exception was thrown */ - uintptr_t faulting_address; - - /** The raw BTYPE for this exception */ - uint8_t branch_type; -} bti_telemetry_record_s; - -/* ~* Core Analytics *~ */ -CA_EVENT(arm_bti_exceptions, - CA_INT, branch_type, - CA_INT, faulting_offset, - CA_STATIC_STRING(CA_UUID_LEN), faulting_uuid); - -/* ~* Splay tree *~ */ -static int -bti_telemetry_record_compare(bti_telemetry_record_s *a1_r, - bti_telemetry_record_s *a2_r) -{ - /* Compare on fault address */ - if (a1_r->faulting_address > a2_r->faulting_address) { - return 1; - } else if (a1_r->faulting_address < a2_r->faulting_address) { - return -1; - } - - /* Same address but different BTI exception type? */ - if (a1_r->branch_type > a2_r->branch_type) { - return 1; - } else if (a1_r->branch_type < a2_r->branch_type) { - return -1; - } else { - return 0; - } -} - -SPLAY_HEAD(bti_telemetry_tree, bti_telemetry_record); -// These functions generated by SPLAY_PROTOTYPE but are currently unused -__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_NEXT( - struct bti_telemetry_tree *head, struct bti_telemetry_record *elm); -__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_SEARCH( - struct bti_telemetry_tree *head, struct bti_telemetry_record *elm); -__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_MIN_MAX( - struct bti_telemetry_tree *head, int val); -SPLAY_PROTOTYPE(bti_telemetry_tree, - bti_telemetry_record, - link, - bti_telemetry_record_compare); -SPLAY_GENERATE(bti_telemetry_tree, - bti_telemetry_record, - link, - bti_telemetry_record_compare); - -/* ~* Globals *~ */ -/* Lock which protects the event submission queue */ -static LCK_GRP_DECLARE(bti_telemetry_lock_grp, "bti_telemetry_lock"); -static LCK_SPIN_DECLARE(bti_telemetry_lock, &bti_telemetry_lock_grp); - -/* - * Since BTI exceptions are, naturally, caught in an exception context, it is - * not safe to allocate or do other complex behaviors like calling into - * CoreAnalytics. To solve this, we use a short submission ring buffer which - * collects records for processing on the submission thread. 
- * - * This ring buffer is locked by BTI_TELEMETRY_LOCK. - */ -#define RECORD_SUBMISSION_BUFFER_LENGTH (16) -static bti_telemetry_record_s record_submission_buffer[RECORD_SUBMISSION_BUFFER_LENGTH]; -static size_t rsb_rd_idx; -static size_t rsb_wr_idx; -static size_t rsb_count; -static bool rsb_is_draining; - -/** - * For local telemetry and deduplication, we store hit records in a splay tree. - * We use a splay here for performance reasons since BTI exceptions exhibit a - * degree of temporal locality. - */ -static struct bti_telemetry_tree telemetry_splay_tree; - -/** - * Flag indicating whether this CPU is currently trying to acquire the - * telemetry lock or has already acquired the lock. - * This is used as a deadlock avoidance mechanism. - */ -static uint8_t PERCPU_DATA(per_cpu_telemetry_lock_blocked); - -/** - * Thread which is responsible for clearing the submission buffer by submitting - * to CoreAnalytics and the local tree. - */ -static struct thread_call *drain_record_submission_buffer_callout; - -/* ~* Implementation *~ */ -/** - * Enqueue SRC into the record submission buffer. Returns TRUE if successful, - * false otherwise. BTI_TELEMETRY_LOCK must be held during this operation. - */ -static bool -rsb_enqueue_locked(bti_telemetry_record_s *src) -{ - if (rsb_count == RECORD_SUBMISSION_BUFFER_LENGTH) { - return false; - } - - rsb_count += 1; - bti_telemetry_record_s *dst = record_submission_buffer + rsb_wr_idx; - memcpy(dst, src, sizeof(bti_telemetry_record_s)); - rsb_wr_idx = (rsb_wr_idx + 1) % RECORD_SUBMISSION_BUFFER_LENGTH; - - return true; -} - -/** - * Try and acquire a spin lock in an interrupt-deadlock safe way. - * - * This function differs from the standard lck_spin_try_lock function in that it - * will block if the lock is expected to be acquired *eventually* but will not - * block if it detects that the lock will never be acquired (such as when the - * current CPU owns the lock, which can happen if a BTI exception is taken while - * handling a telemetry operation under the lock). - */ -static inline bool OS_WARN_RESULT -safe_telemetry_lock_try_lock(void) -{ - uint8_t *telemetry_lock_blocked = NULL; - - /* - * Disable preemption to ensure that our block signal always corresponds - * to the CPU we're actually running on. - * - * If we did not disable preemption, there is a case where we may mark that - * we are trying to acquire the lock on core A, get approved, get preempted, - * get rescheduled on core B, and then take the lock there. If we then take - * a BTI exception on core B while handling the original exception (ex. we - * take an IRQ and a BTI exception is generated there), we may re-enter on - * core B, (incorrectly) see that we are not blocked, try to acquire the - * lock, and ultimately deadlock. - */ - disable_preemption(); - - telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked); - if (!os_atomic_cmpxchg(telemetry_lock_blocked, 0, 1, relaxed)) { - /* - * This CPU has already acquired/is blocked on the telemetry lock. - * Attempting to acquire again on this CPU will deadlock. Refuse the - * operation. - */ - enable_preemption(); - return false; - } - - /* We've been approved to acquire the lock on this core! */ - lck_spin_lock(&bti_telemetry_lock); - return true; -} - -/** - * Attempts to acquire the telemetry lock and panic if it cannot be acquired. 
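The safe_telemetry_lock_try_lock() routine refuses to take the lock when the current CPU already holds or is waiting on it, so a nested exception on that CPU cannot self-deadlock. A rough user-space analogue using C11 atomics, with a thread-local flag standing in for the per-CPU flag and preemption control omitted (all names here are illustrative, not XNU APIs):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* An atomic_flag spinlock stands in for the lck_spin lock. */
static atomic_flag telemetry_lock = ATOMIC_FLAG_INIT;
static _Thread_local atomic_uchar lock_blocked;

static bool
safe_try_lock(void)
{
	unsigned char expected = 0;

	/* Refuse re-entry: if this context already holds (or is spinning
	 * on) the lock, a second acquisition could never succeed. */
	if (!atomic_compare_exchange_strong(&lock_blocked, &expected, 1)) {
		return false;
	}
	while (atomic_flag_test_and_set_explicit(&telemetry_lock,
	    memory_order_acquire)) {
		/* spin */
	}
	return true;
}

static void
safe_unlock(void)
{
	atomic_flag_clear_explicit(&telemetry_lock, memory_order_release);
	/* Clear the re-entry flag only after the lock has been dropped. */
	atomic_store(&lock_blocked, 0);
}

int
main(void)
{
	if (safe_try_lock()) {
		/* A nested attempt from the same context is refused. */
		printf("nested acquire refused: %s\n",
		    safe_try_lock() ? "no" : "yes");
		safe_unlock();
	}
	return 0;
}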
- */ -static void -safe_telemetry_lock_lock(void) -{ - if (!safe_telemetry_lock_try_lock()) { - panic("Unexpectedly could not acquire telemetry lock (nested acquire will deadlock)"); - } -} - -/** - * Unlock telemetry lock after being locked with safe_telemetry_lock_try_lock - */ -static inline void -safe_telemetry_lock_unlock(void) -{ - uint8_t *telemetry_lock_blocked = NULL; - - lck_spin_unlock(&bti_telemetry_lock); - /* - * Clear the block only AFTER having dropped the lock so that we can't - * hit a really narrow deadlock race where we get interrupted between - * clearing the block and dropping the lock. - */ - telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked); - os_atomic_store(telemetry_lock_blocked, (uint8_t)0, relaxed); - - /* Finally, reenable preemption as this thread is now safe to move */ - enable_preemption(); -} - -/** - * Get the UUID and __TEXT_EXEC based offset of ADDR into its respective - * binary image. Copy each into UUID and OFFSET. Returns negative on error. - * - * Acquires a sleeping lock, do not call while interrupts are disabled. - */ -static int -get_uuid_and_text_offset_for_addr(uintptr_t addr, uuid_t *uuid, size_t *offset) -{ - kernel_mach_header_t *mh = NULL; - kernel_segment_command_t *seg_text_exec = NULL; - void *mh_uuid = NULL; - unsigned long mh_uuid_len = 0; - - if (!(mh = OSKextKextForAddress((void *)addr))) { - return -1; - } - - if (!(seg_text_exec = getsegbynamefromheader(mh, "__TEXT_EXEC"))) { - return -2; - } - - if (!(mh_uuid = getuuidfromheader(mh, &mh_uuid_len))) { - return -3; - } - - if (mh_uuid_len != sizeof(*uuid)) { - return -4; - } - - memcpy(uuid, mh_uuid, sizeof(*uuid)); - *offset = addr - seg_text_exec->vmaddr; - - return 0; -} - -static void __unused -dump_telemetry_record(bti_telemetry_record_s *record, - uuid_string_t uuid_str, - size_t offset) -{ - printf( - TAG "Unexpected BTI exception (pc=0x%08lx, BTYPE=%d)\n" - TAG "\t\n", - record->faulting_address, record->branch_type, - uuid_str, offset); -} - -/** - * Thread call which drains the record submission buffer. - * There must be no more than one instance of this thread running at a time. - */ -static void -drain_record_submission_buffer_thread_call(__unused thread_call_param_t p0, - __unused thread_call_param_t p1) -{ - size_t drain_count = 0; - size_t drain_rd_idx = 0; - bti_telemetry_record_s *record_allocations[RECORD_SUBMISSION_BUFFER_LENGTH]; - - /* - * We never expect for the submission thread to be scheduled while another - * handler is suspended above it (acquiring disables preemption) or while - * another submission thread is suspended above it (only one submission - * thread should ever be running). Thus, failing to acquire the lock - * indicates that something is seriously wrong. - */ - safe_telemetry_lock_lock(); - - if (rsb_is_draining) { - panic("Unexpectedly found multiple concurrent drains!"); - } - rsb_is_draining = true; - - /* - * Iteratively drain the submission queue until no entries remain. - * Drops and reacquires the telemetry lock. - */ - while ((drain_count = rsb_count)) { - /* LOCKED IN */ - drain_rd_idx = rsb_rd_idx; - safe_telemetry_lock_unlock(); - - /* - * It is safe to read these entries based on snapshots of DRAIN_COUNT - * and DRAIN_RD_IDX without holding the lock because all of the records' - * writes will have already become visible due to the lock's store - * release on the enqueue side. We may miss some records in this pass if - * they enqueue after the snapshot but we'll just pick them up in the - * next loop iteration. 
Additionally, since only one instance of this - * function will be running at a time, we don't need to worry about - * duplicate allocations/work. - */ - - for (size_t i = 0; i < drain_count; i++) { - /* Create persistent copies of the entries in the RSB */ - size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH; - bti_telemetry_record_s *record_i = record_submission_buffer + rsb_i; - - bti_telemetry_record_s *new_record = - kalloc_type(bti_telemetry_record_s, Z_WAITOK | Z_NOFAIL); - - memcpy(new_record, record_i, sizeof(bti_telemetry_record_s)); - record_allocations[i] = new_record; - } - - safe_telemetry_lock_lock(); - /* Insert all draining entries into the splay */ - for (size_t i = 0; i < drain_count; i++) { - bti_telemetry_record_s *duplicate = SPLAY_INSERT(bti_telemetry_tree, - &telemetry_splay_tree, - record_allocations[i]); - if (duplicate) { - /* - * Since we scan both the RSB and the splay tree before - * submitting a record, we never expect to have multiple - * instances of any record. If this occurs, it's a bug! - */ - panic("Unexpected duplicate splay entry!"); - } - } - - /* Dequeue the entries from the RSB */ - rsb_rd_idx = - (rsb_rd_idx + drain_count) % RECORD_SUBMISSION_BUFFER_LENGTH; - rsb_count -= drain_count; - safe_telemetry_lock_unlock(); - - /* Report entries */ - for (size_t i = 0; i < drain_count; i++) { - int result = 0; - uuid_t uuid; - uuid_string_t uuid_str; - size_t offset = 0; - bti_telemetry_record_s *record_i = record_allocations[i]; - - if ((result = get_uuid_and_text_offset_for_addr( - record_i->faulting_address, - &uuid, &offset)) < 0) { - /* - * We couldn't get the required data for symbolication for some - * odd reason. Report a NULL UUID and the address raw so we can - * track these invalid events. - */ - memset(&uuid, 0x00, sizeof(uuid)); - offset = VM_KERNEL_UNSLIDE(record_i->faulting_address); - } - uuid_unparse(uuid, uuid_str); - - /* Print events to the console for local debug */ - dump_telemetry_record(record_i, uuid_str, offset); - -#if BTI_TELEMETRY_USE_CORE_ANALYTICS - /* Report to CoreAnalytics */ - ca_event_t ca_event = CA_EVENT_ALLOCATE(arm_bti_exceptions); - CA_EVENT_TYPE(arm_bti_exceptions) * event_data = ca_event->data; - - event_data->branch_type = record_i->branch_type; - event_data->faulting_offset = offset; - strlcpy(event_data->faulting_uuid, uuid_str, CA_UUID_LEN); - - CA_EVENT_SEND(ca_event); -#endif /* BTI_TELEMETRY_USE_CORE_ANALYTICS */ - } - - safe_telemetry_lock_lock(); - /* LOCKED OUT */ - } - - /* Done for now, if submitters have entries they'll need to call again. */ - rsb_is_draining = false; - safe_telemetry_lock_unlock(); -} - -__startup_func -void -bti_telemetry_init(void) -{ - printf(TAG "bti_telemetry_init\n"); - SPLAY_INIT(&telemetry_splay_tree); - - drain_record_submission_buffer_callout = thread_call_allocate_with_options( - drain_record_submission_buffer_thread_call, NULL, - THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE); - - if (!drain_record_submission_buffer_callout) { - panic("Failed to allocate drain callout!"); - } -} - -/** - * Submit RECORD to the submission queue. Returns TRUE if the record was - * ingested (either enqueued or dupe'd), FALSE otherwise. - */ -static bool -submit_telemetry_record(bti_telemetry_record_s *record) -{ - bool did_ingest = true; - bool should_flush_submission_buffer = false; - bti_telemetry_record_s *splay_found_record = NULL; - if (!safe_telemetry_lock_try_lock()) { - /* - * Failed to acquire the lock! - * We're likely in a nested exception. 
Since we can't safely do anything - * else with the record, just drop it. - */ - return false; - } - - /* First, scan the submission queue for matching, queued records */ - for (size_t i = 0; i < rsb_count; i++) { - size_t rsb_i = (rsb_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH; - bti_telemetry_record_s *record_i = record_submission_buffer + rsb_i; - if (bti_telemetry_record_compare(record, record_i) == 0) { - /* Match, no need to report again. */ - goto DONE_LOCKED; - } - } - - /* Next, try for a record in the splay */ - splay_found_record = SPLAY_FIND(bti_telemetry_tree, - &telemetry_splay_tree, - record); - if (splay_found_record) { - /* Match, no need to report again. */ - goto DONE_LOCKED; - } - - /* - * If we haven't hit anywhere, this means we have a new event that needs to - * be enqueued for reporting. - */ - did_ingest = rsb_enqueue_locked(record); - should_flush_submission_buffer = did_ingest && !rsb_is_draining; - -DONE_LOCKED: - safe_telemetry_lock_unlock(); - - if (should_flush_submission_buffer) { - /* - * We submitted a new entry while the drain thread was either exiting or - * not running. Queue a new flush. Multiple calls here before the drain - * starts running will not result in multiple calls being queued due to - * THREAD_CALL_OPTIONS_ONCE. - */ - thread_call_enter(drain_record_submission_buffer_callout); - } - - return did_ingest; -} - -/** Convert a BTI exception frame into a telemetry record */ -static void -generate_telemetry_record(arm_saved_state_t *state, - bti_telemetry_record_s *record) -{ - uintptr_t pc = 0; - uint64_t esr = 0; - - pc = get_saved_state_pc(state); - esr = get_saved_state_esr(state); - - /* Generate the exception record */ - record->branch_type = (uint8_t)(esr & ISS_BTI_BTYPE_MASK); - record->faulting_address = pc; -} - -/* - * Try and recover from a BTI exception. Returns true if we are able to recover, - * false otherwise. - */ -static bool -recover_from_bti_exception(arm_saved_state_t *state) -{ - /* - * Since BTI raises on a mismatched PSTATE.BTYPE, we can simply clear BTYPE - * and directly return from the exception to continue executing as if - * the exception never happened. 
- */ - uint32_t psr = get_saved_state_cpsr(state); - psr &= ~PSR_BTYPE_MASK; - set_saved_state_cpsr(state, psr); - - return true; -} - -bool -bti_telemetry_handle_exception(arm_saved_state_t *state) -{ - bti_telemetry_record_s record = { 0 }; - - /* Generate the telemetry record and hand it to the submission thread */ - generate_telemetry_record(state, &record); - (void)submit_telemetry_record(&record); - - /* Recover and prepare to keep executing */ - return recover_from_bti_exception(state); -} - -#endif /* CONFIG_BTI_TELEMETRY */ diff --git a/osfmk/arm64/bzero.s b/osfmk/arm64/bzero.s index a7abca2cb..0d3934bd8 100644 --- a/osfmk/arm64/bzero.s +++ b/osfmk/arm64/bzero.s @@ -56,7 +56,7 @@ ___bzero: eor x1, x1, x1 mov x3, x0 cmp x2, #128 - b.cc L_memsetSmall + b.cc L_bzeroSmall /***************************************************************************** * Large buffer zero engine * @@ -86,7 +86,23 @@ L_bzeroLarge: stp x1, x1, [x3, #32] stp x1, x1, [x3, #48] POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _bzero + +/***************************************************************************** + * Small buffer store engine * + *****************************************************************************/ + +0: str x1, [x3],#8 +L_bzeroSmall: + subs x2, x2, #8 + b.cs 0b + adds x2, x2, #8 + b.eq 2f +1: strb w1, [x3],#1 + subs x2, x2, #1 + b.ne 1b +2: POP_FRAME + ARM64_STACK_EPILOG _bzero /***************************************************************************** * memset entrypoint * @@ -136,7 +152,7 @@ L_memsetLarge: stp x1, x1, [x3, #32] stp x1, x1, [x3, #48] POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _memset /***************************************************************************** * Small buffer store engine * @@ -152,5 +168,5 @@ L_memsetSmall: subs x2, x2, #1 b.ne 1b 2: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _memset diff --git a/osfmk/arm64/caches_asm.s b/osfmk/arm64/caches_asm.s index 1a45a0a85..1588830ae 100644 --- a/osfmk/arm64/caches_asm.s +++ b/osfmk/arm64/caches_asm.s @@ -80,7 +80,7 @@ L_ipui_loop: bl EXT(InvalidatePoU_Icache) #endif POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(InvalidatePoU_IcacheRegion) /* * Obtains cache physical layout information required for way/set @@ -348,7 +348,7 @@ LEXT(CleanPoC_DcacheRegion_Force) bl EXT(CleanPoC_DcacheRegion_Force_nopreempt) bl EXT(_enable_preemption) POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(CleanPoC_DcacheRegion_Force) #else ARM64_PROLOG b EXT(CleanPoC_DcacheRegion_internal) diff --git a/osfmk/arm64/copyio.c b/osfmk/arm64/copyio.c index d99723709..4952ffb85 100644 --- a/osfmk/arm64/copyio.c +++ b/osfmk/arm64/copyio.c @@ -90,6 +90,7 @@ typedef enum { USER_ACCESS_WRITE } user_access_direction_t; + static inline void user_access_enable(__unused user_access_direction_t user_access_direction, pmap_t __unused pmap) { @@ -110,7 +111,7 @@ user_access_disable(__unused user_access_direction_t user_access_direction, pmap } -#define WRAP_COPYIO_PAN(_dir, _map, _op) \ +#define WRAP_COPYIO_PAN(_dir, _map, _op) \ ({ \ int _ret; \ user_access_enable(_dir, (_map)->pmap); \ @@ -219,10 +220,10 @@ copy_validate_kernel_addr(uintptr_t kernel_addr, vm_size_t nbytes) (void *)kernel_addr, nbytes); } - bool in_kva = (VM_KERNEL_STRIP_UPTR(kernel_addr) >= VM_MIN_KERNEL_ADDRESS) && - (VM_KERNEL_STRIP_UPTR(kernel_addr_last) <= VM_MAX_KERNEL_ADDRESS); - bool in_physmap = (VM_KERNEL_STRIP_UPTR(kernel_addr) >= physmap_base) && - (VM_KERNEL_STRIP_UPTR(kernel_addr_last) <= physmap_end); + bool in_kva = 
(VM_KERNEL_STRIP_PTR(kernel_addr) >= VM_MIN_KERNEL_ADDRESS) && + (VM_KERNEL_STRIP_PTR(kernel_addr_last) <= VM_MAX_KERNEL_ADDRESS); + bool in_physmap = (VM_KERNEL_STRIP_PTR(kernel_addr) >= physmap_base) && + (VM_KERNEL_STRIP_PTR(kernel_addr_last) <= physmap_end); if (__improbable(!(in_kva || in_physmap))) { panic("%s(%p, %lu) - kaddr not in kernel", __func__, diff --git a/osfmk/arm64/cpc_arm64_events.c b/osfmk/arm64/cpc_arm64_events.c index 83f237eb4..7e4e02c03 100644 --- a/osfmk/arm64/cpc_arm64_events.c +++ b/osfmk/arm64/cpc_arm64_events.c @@ -113,7 +113,7 @@ cpc_event_allowed( static const struct cpc_event_list _known_cpmu_events = { #if defined(ARM64_BOARD_CONFIG_T6000) - .cel_event_count = 60, + .cel_event_count = 59, .cel_events = { { .cev_selector = 0x0000, .cev_name = "NONE" }, { .cev_selector = 0x0001, .cev_name = "RETIRE_UOP" }, @@ -124,7 +124,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x0008, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x000a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x000b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x000d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, + { .cev_selector = 0x0051, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x0052, .cev_name = "SCHEDULE_UOP" }, { .cev_selector = 0x006c, .cev_name = "INTERRUPT_PENDING" }, { .cev_selector = 0x0070, .cev_name = "MAP_STALL_DISPATCH" }, @@ -140,7 +140,6 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x008f, .cev_name = "INST_BRANCH_RET" }, { .cev_selector = 0x0090, .cev_name = "INST_BRANCH_TAKEN" }, { .cev_selector = 0x0093, .cev_name = "INST_BRANCH_INDIR" }, - { .cev_selector = 0x0094, .cev_name = "INST_BRANCH_COND" }, { .cev_selector = 0x0095, .cev_name = "INST_INT_LD" }, { .cev_selector = 0x0096, .cev_name = "INST_INT_ST" }, { .cev_selector = 0x0097, .cev_name = "INST_INT_ALU" }, @@ -163,7 +162,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x00bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x00c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x00c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x00c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x00c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x00c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x00c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x00c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -188,7 +187,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x0008, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x000a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x000b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x000d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, + { .cev_selector = 0x0051, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x006c, .cev_name = "INTERRUPT_PENDING" }, { .cev_selector = 0x0070, .cev_name = "MAP_STALL_DISPATCH" }, { .cev_selector = 0x0075, .cev_name = "MAP_REWIND" }, @@ -203,7 +202,6 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x008f, .cev_name = "INST_BRANCH_RET" }, { .cev_selector = 0x0090, .cev_name = "INST_BRANCH_TAKEN" }, { .cev_selector = 0x0093, .cev_name = "INST_BRANCH_INDIR" }, - { .cev_selector = 0x0094, .cev_name = "INST_BRANCH_COND" }, { .cev_selector = 0x0095, .cev_name = "INST_INT_LD" }, { .cev_selector = 
0x0096, .cev_name = "INST_INT_ST" }, { .cev_selector = 0x0097, .cev_name = "INST_INT_ALU" }, @@ -212,6 +210,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x009a, .cev_name = "INST_SIMD_ALU" }, { .cev_selector = 0x009b, .cev_name = "INST_LDST" }, { .cev_selector = 0x009c, .cev_name = "INST_BARRIER" }, + { .cev_selector = 0x009f, .cev_name = "INST_SIMD_ALU_VEC" }, { .cev_selector = 0x00a0, .cev_name = "L1D_TLB_ACCESS" }, { .cev_selector = 0x00a1, .cev_name = "L1D_TLB_MISS" }, { .cev_selector = 0x00a2, .cev_name = "L1D_CACHE_MISS_ST" }, @@ -226,7 +225,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x00bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x00c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x00c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x00c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x00c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x00c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x00c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x00c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -240,7 +239,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x00e6, .cev_name = "LD_NT_UOP" }, }, #elif defined(ARM64_BOARD_CONFIG_T6030) - .cel_event_count = 62, + .cel_event_count = 65, .cel_events = { { .cev_selector = 0x0000, .cev_name = "NONE" }, { .cev_selector = 0x0001, .cev_name = "RETIRE_UOP" }, @@ -252,7 +251,6 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x008f, .cev_name = "INST_BRANCH_RET" }, { .cev_selector = 0x0090, .cev_name = "INST_BRANCH_TAKEN" }, { .cev_selector = 0x0093, .cev_name = "INST_BRANCH_INDIR" }, - { .cev_selector = 0x0094, .cev_name = "INST_BRANCH_COND" }, { .cev_selector = 0x0095, .cev_name = "INST_INT_LD" }, { .cev_selector = 0x0096, .cev_name = "INST_INT_ST" }, { .cev_selector = 0x0097, .cev_name = "INST_INT_ALU" }, @@ -261,10 +259,11 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x009a, .cev_name = "INST_SIMD_ALU" }, { .cev_selector = 0x009b, .cev_name = "INST_LDST" }, { .cev_selector = 0x009c, .cev_name = "INST_BARRIER" }, + { .cev_selector = 0x009f, .cev_name = "INST_SIMD_ALU_VEC" }, { .cev_selector = 0x00bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x00c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x00c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x00c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x00c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x00c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x00c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x00c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -284,13 +283,16 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x027c, .cev_name = "MAP_INT_UOP" }, { .cev_selector = 0x027d, .cev_name = "MAP_LDST_UOP" }, { .cev_selector = 0x027e, .cev_name = "MAP_SIMD_UOP" }, + { .cev_selector = 0x0283, .cev_name = "SCHEDULE_UOP_ANY" }, + { .cev_selector = 0x0290, .cev_name = "LDST_UNIT_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0291, .cev_name = "LDST_UNIT_WAITING_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0351, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x0404, .cev_name = "L1I_TLB_FILL" }, { .cev_selector = 0x0405, 
.cev_name = "L1D_TLB_FILL" }, { .cev_selector = 0x0407, .cev_name = "MMU_TABLE_WALK_INSTRUCTION" }, { .cev_selector = 0x0408, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x040a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x040b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x040d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, { .cev_selector = 0x05a0, .cev_name = "L1D_TLB_ACCESS" }, { .cev_selector = 0x05a1, .cev_name = "L1D_TLB_MISS" }, { .cev_selector = 0x05a2, .cev_name = "L1D_CACHE_MISS_ST" }, @@ -306,7 +308,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x05e6, .cev_name = "LD_NT_UOP" }, }, #elif defined(ARM64_BOARD_CONFIG_T6031) - .cel_event_count = 62, + .cel_event_count = 65, .cel_events = { { .cev_selector = 0x0000, .cev_name = "NONE" }, { .cev_selector = 0x0001, .cev_name = "RETIRE_UOP" }, @@ -318,7 +320,6 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x008f, .cev_name = "INST_BRANCH_RET" }, { .cev_selector = 0x0090, .cev_name = "INST_BRANCH_TAKEN" }, { .cev_selector = 0x0093, .cev_name = "INST_BRANCH_INDIR" }, - { .cev_selector = 0x0094, .cev_name = "INST_BRANCH_COND" }, { .cev_selector = 0x0095, .cev_name = "INST_INT_LD" }, { .cev_selector = 0x0096, .cev_name = "INST_INT_ST" }, { .cev_selector = 0x0097, .cev_name = "INST_INT_ALU" }, @@ -327,10 +328,11 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x009a, .cev_name = "INST_SIMD_ALU" }, { .cev_selector = 0x009b, .cev_name = "INST_LDST" }, { .cev_selector = 0x009c, .cev_name = "INST_BARRIER" }, + { .cev_selector = 0x009f, .cev_name = "INST_SIMD_ALU_VEC" }, { .cev_selector = 0x00bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x00c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x00c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x00c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x00c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x00c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x00c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x00c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -350,13 +352,16 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x027c, .cev_name = "MAP_INT_UOP" }, { .cev_selector = 0x027d, .cev_name = "MAP_LDST_UOP" }, { .cev_selector = 0x027e, .cev_name = "MAP_SIMD_UOP" }, + { .cev_selector = 0x0283, .cev_name = "SCHEDULE_UOP_ANY" }, + { .cev_selector = 0x0290, .cev_name = "LDST_UNIT_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0291, .cev_name = "LDST_UNIT_WAITING_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0351, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x0404, .cev_name = "L1I_TLB_FILL" }, { .cev_selector = 0x0405, .cev_name = "L1D_TLB_FILL" }, { .cev_selector = 0x0407, .cev_name = "MMU_TABLE_WALK_INSTRUCTION" }, { .cev_selector = 0x0408, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x040a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x040b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x040d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, { .cev_selector = 0x05a0, .cev_name = "L1D_TLB_ACCESS" }, { .cev_selector = 0x05a1, .cev_name = "L1D_TLB_MISS" }, { .cev_selector = 0x05a2, .cev_name = "L1D_CACHE_MISS_ST" }, @@ -372,20 +377,33 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x05e6, .cev_name = "LD_NT_UOP" }, }, #elif 
defined(ARM64_BOARD_CONFIG_T6041) - .cel_event_count = 62, + .cel_event_count = 102, .cel_events = { { .cev_selector = 0x0000, .cev_name = "NONE" }, + { .cev_selector = 0x0003, .cev_name = "ARM_L1D_CACHE_REFILL" }, + { .cev_selector = 0x0004, .cev_name = "ARM_L1D_CACHE" }, { .cev_selector = 0x0008, .cev_name = "INST_ALL" }, + { .cev_selector = 0x0010, .cev_name = "ARM_BR_MIS_PRED" }, { .cev_selector = 0x0011, .cev_name = "CORE_ACTIVE_CYCLE" }, + { .cev_selector = 0x0012, .cev_name = "ARM_BR_PRED" }, { .cev_selector = 0x0021, .cev_name = "INST_BRANCH" }, { .cev_selector = 0x0022, .cev_name = "BRANCH_MISPRED_NONSPEC" }, + { .cev_selector = 0x0023, .cev_name = "ARM_STALL_FRONTEND" }, + { .cev_selector = 0x0024, .cev_name = "ARM_STALL_BACKEND" }, + { .cev_selector = 0x0039, .cev_name = "ARM_L1D_CACHE_LMISS_RD" }, { .cev_selector = 0x003a, .cev_name = "RETIRE_UOP" }, { .cev_selector = 0x003b, .cev_name = "MAP_UOP" }, + { .cev_selector = 0x003c, .cev_name = "ARM_STALL" }, + { .cev_selector = 0x003d, .cev_name = "ARM_STALL_SLOT_BACKEND" }, + { .cev_selector = 0x003e, .cev_name = "ARM_STALL_SLOT_FRONTEND" }, + { .cev_selector = 0x003f, .cev_name = "ARM_STALL_SLOT" }, + { .cev_selector = 0x0040, .cev_name = "ARM_L1D_CACHE_RD" }, { .cev_selector = 0x0182, .cev_name = "MAP_DISPATCH_BUBBLE_IC" }, { .cev_selector = 0x0183, .cev_name = "MAP_DISPATCH_BUBBLE_ITLB" }, { .cev_selector = 0x01d4, .cev_name = "L1I_TLB_MISS_DEMAND" }, { .cev_selector = 0x01d6, .cev_name = "MAP_DISPATCH_BUBBLE" }, { .cev_selector = 0x01de, .cev_name = "FETCH_RESTART" }, + { .cev_selector = 0x01e1, .cev_name = "MAP_DISPATCH_BUBBLE_SLOT" }, { .cev_selector = 0x026c, .cev_name = "INTERRUPT_PENDING" }, { .cev_selector = 0x0270, .cev_name = "MAP_STALL_DISPATCH" }, { .cev_selector = 0x0275, .cev_name = "MAP_REWIND" }, @@ -393,13 +411,35 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x027c, .cev_name = "MAP_INT_UOP" }, { .cev_selector = 0x027d, .cev_name = "MAP_LDST_UOP" }, { .cev_selector = 0x027e, .cev_name = "MAP_SIMD_UOP" }, + { .cev_selector = 0x0283, .cev_name = "SCHEDULE_UOP_ANY" }, + { .cev_selector = 0x0285, .cev_name = "MAP_INT_SME_UOP" }, + { .cev_selector = 0x0286, .cev_name = "SME_ENGINE_SM_ENABLE" }, + { .cev_selector = 0x0287, .cev_name = "SME_ENGINE_SM_ZA_ENABLE" }, + { .cev_selector = 0x0288, .cev_name = "SME_ENGINE_ZA_ENABLED_SM_DISABLED" }, + { .cev_selector = 0x028c, .cev_name = "LDST_UNIT_WAITING_SME_ENGINE_INST_QUEUE_FULL" }, + { .cev_selector = 0x028e, .cev_name = "SCHEDULE_WAITING_SME_ENGINE_REG_DATA" }, + { .cev_selector = 0x028f, .cev_name = "LDST_UNIT_WAITING_SME_ENGINE_MEM_DATA" }, + { .cev_selector = 0x0290, .cev_name = "LDST_UNIT_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0291, .cev_name = "LDST_UNIT_WAITING_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0294, .cev_name = "LD_UNIT_WAITING_YOUNG_L1D_CACHE_MISS" }, + { .cev_selector = 0x02ad, .cev_name = "MAP_RECOVERY" }, + { .cev_selector = 0x02ae, .cev_name = "MAP_STALL_NONRECOVERY" }, + { .cev_selector = 0x0351, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x0404, .cev_name = "L1I_TLB_FILL" }, { .cev_selector = 0x0405, .cev_name = "L1D_TLB_FILL" }, { .cev_selector = 0x0407, .cev_name = "MMU_TABLE_WALK_INSTRUCTION" }, { .cev_selector = 0x0408, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x040a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x040b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x040d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, + { .cev_selector = 0x0508, .cev_name 
= "LDST_SME_XPG_UOP" }, + { .cev_selector = 0x0529, .cev_name = "INST_SME_ENGINE_PACKING_FUSED" }, + { .cev_selector = 0x052c, .cev_name = "LD_BLOCKED_BY_SME_LDST" }, + { .cev_selector = 0x052e, .cev_name = "ST_BARRIER_BLOCKED_BY_SME_LDST" }, + { .cev_selector = 0x0573, .cev_name = "LD_SME_NT_UOP" }, + { .cev_selector = 0x0574, .cev_name = "ST_SME_NT_UOP" }, + { .cev_selector = 0x0575, .cev_name = "LD_SME_NORMAL_UOP" }, + { .cev_selector = 0x0576, .cev_name = "ST_SME_NORMAL_UOP" }, + { .cev_selector = 0x0577, .cev_name = "LDST_SME_PRED_INACTIVE" }, { .cev_selector = 0x05a0, .cev_name = "L1D_TLB_ACCESS" }, { .cev_selector = 0x05a1, .cev_name = "L1D_TLB_MISS" }, { .cev_selector = 0x05a2, .cev_name = "L1D_CACHE_MISS_ST" }, @@ -427,10 +467,15 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x089a, .cev_name = "INST_SIMD_ALU" }, { .cev_selector = 0x089b, .cev_name = "INST_LDST" }, { .cev_selector = 0x089c, .cev_name = "INST_BARRIER" }, + { .cev_selector = 0x089f, .cev_name = "INST_SIMD_ALU_VEC" }, + { .cev_selector = 0x08a0, .cev_name = "INST_SME_ENGINE_SCALARFP" }, + { .cev_selector = 0x08a1, .cev_name = "INST_SME_ENGINE_LD" }, + { .cev_selector = 0x08a2, .cev_name = "INST_SME_ENGINE_ST" }, + { .cev_selector = 0x08a3, .cev_name = "INST_SME_ENGINE_ALU" }, { .cev_selector = 0x08bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x08c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x08c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x08c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x08c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x08c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x08c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x08c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -438,7 +483,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x4006, .cev_name = "L1I_CACHE_MISS_DEMAND" }, }, #elif defined(ARM64_BOARD_CONFIG_T8101) - .cel_event_count = 60, + .cel_event_count = 59, .cel_events = { { .cev_selector = 0x0000, .cev_name = "NONE" }, { .cev_selector = 0x0001, .cev_name = "RETIRE_UOP" }, @@ -449,7 +494,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x0008, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x000a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x000b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x000d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, + { .cev_selector = 0x0051, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x0052, .cev_name = "SCHEDULE_UOP" }, { .cev_selector = 0x006c, .cev_name = "INTERRUPT_PENDING" }, { .cev_selector = 0x0070, .cev_name = "MAP_STALL_DISPATCH" }, @@ -465,7 +510,6 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x008f, .cev_name = "INST_BRANCH_RET" }, { .cev_selector = 0x0090, .cev_name = "INST_BRANCH_TAKEN" }, { .cev_selector = 0x0093, .cev_name = "INST_BRANCH_INDIR" }, - { .cev_selector = 0x0094, .cev_name = "INST_BRANCH_COND" }, { .cev_selector = 0x0095, .cev_name = "INST_INT_LD" }, { .cev_selector = 0x0096, .cev_name = "INST_INT_ST" }, { .cev_selector = 0x0097, .cev_name = "INST_INT_ALU" }, @@ -488,7 +532,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x00bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x00c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x00c1, .cev_name 
= "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x00c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x00c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x00c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x00c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x00c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -502,7 +546,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x00e6, .cev_name = "LD_NT_UOP" }, }, #elif defined(ARM64_BOARD_CONFIG_T8103) - .cel_event_count = 60, + .cel_event_count = 59, .cel_events = { { .cev_selector = 0x0000, .cev_name = "NONE" }, { .cev_selector = 0x0001, .cev_name = "RETIRE_UOP" }, @@ -513,7 +557,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x0008, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x000a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x000b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x000d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, + { .cev_selector = 0x0051, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x0052, .cev_name = "SCHEDULE_UOP" }, { .cev_selector = 0x006c, .cev_name = "INTERRUPT_PENDING" }, { .cev_selector = 0x0070, .cev_name = "MAP_STALL_DISPATCH" }, @@ -529,7 +573,6 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x008f, .cev_name = "INST_BRANCH_RET" }, { .cev_selector = 0x0090, .cev_name = "INST_BRANCH_TAKEN" }, { .cev_selector = 0x0093, .cev_name = "INST_BRANCH_INDIR" }, - { .cev_selector = 0x0094, .cev_name = "INST_BRANCH_COND" }, { .cev_selector = 0x0095, .cev_name = "INST_INT_LD" }, { .cev_selector = 0x0096, .cev_name = "INST_INT_ST" }, { .cev_selector = 0x0097, .cev_name = "INST_INT_ALU" }, @@ -552,7 +595,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x00bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x00c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x00c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x00c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x00c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x00c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x00c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x00c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -577,7 +620,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x0008, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x000a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x000b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x000d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, + { .cev_selector = 0x0051, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x006c, .cev_name = "INTERRUPT_PENDING" }, { .cev_selector = 0x0070, .cev_name = "MAP_STALL_DISPATCH" }, { .cev_selector = 0x0075, .cev_name = "MAP_REWIND" }, @@ -592,7 +635,6 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x008f, .cev_name = "INST_BRANCH_RET" }, { .cev_selector = 0x0090, .cev_name = "INST_BRANCH_TAKEN" }, { .cev_selector = 0x0093, .cev_name = "INST_BRANCH_INDIR" }, - { .cev_selector = 0x0094, .cev_name = "INST_BRANCH_COND" }, { .cev_selector = 0x0095, .cev_name = "INST_INT_LD" }, { .cev_selector = 0x0096, .cev_name = "INST_INT_ST" }, { .cev_selector = 0x0097, .cev_name = "INST_INT_ALU" }, @@ -601,6 +643,7 
@@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x009a, .cev_name = "INST_SIMD_ALU" }, { .cev_selector = 0x009b, .cev_name = "INST_LDST" }, { .cev_selector = 0x009c, .cev_name = "INST_BARRIER" }, + { .cev_selector = 0x009f, .cev_name = "INST_SIMD_ALU_VEC" }, { .cev_selector = 0x00a0, .cev_name = "L1D_TLB_ACCESS" }, { .cev_selector = 0x00a1, .cev_name = "L1D_TLB_MISS" }, { .cev_selector = 0x00a2, .cev_name = "L1D_CACHE_MISS_ST" }, @@ -615,7 +658,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x00bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x00c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x00c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x00c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x00c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x00c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x00c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x00c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -629,7 +672,7 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x00e6, .cev_name = "LD_NT_UOP" }, }, #elif defined(ARM64_BOARD_CONFIG_T8122_T8130) - .cel_event_count = 62, + .cel_event_count = 65, .cel_events = { { .cev_selector = 0x0000, .cev_name = "NONE" }, { .cev_selector = 0x0001, .cev_name = "RETIRE_UOP" }, @@ -641,7 +684,6 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x008f, .cev_name = "INST_BRANCH_RET" }, { .cev_selector = 0x0090, .cev_name = "INST_BRANCH_TAKEN" }, { .cev_selector = 0x0093, .cev_name = "INST_BRANCH_INDIR" }, - { .cev_selector = 0x0094, .cev_name = "INST_BRANCH_COND" }, { .cev_selector = 0x0095, .cev_name = "INST_INT_LD" }, { .cev_selector = 0x0096, .cev_name = "INST_INT_ST" }, { .cev_selector = 0x0097, .cev_name = "INST_INT_ALU" }, @@ -650,10 +692,11 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x009a, .cev_name = "INST_SIMD_ALU" }, { .cev_selector = 0x009b, .cev_name = "INST_LDST" }, { .cev_selector = 0x009c, .cev_name = "INST_BARRIER" }, + { .cev_selector = 0x009f, .cev_name = "INST_SIMD_ALU_VEC" }, { .cev_selector = 0x00bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x00c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x00c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x00c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x00c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x00c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x00c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x00c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, @@ -673,13 +716,16 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x027c, .cev_name = "MAP_INT_UOP" }, { .cev_selector = 0x027d, .cev_name = "MAP_LDST_UOP" }, { .cev_selector = 0x027e, .cev_name = "MAP_SIMD_UOP" }, + { .cev_selector = 0x0283, .cev_name = "SCHEDULE_UOP_ANY" }, + { .cev_selector = 0x0290, .cev_name = "LDST_UNIT_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0291, .cev_name = "LDST_UNIT_WAITING_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0351, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x0404, .cev_name = "L1I_TLB_FILL" }, { .cev_selector = 0x0405, .cev_name = "L1D_TLB_FILL" }, { .cev_selector = 0x0407, .cev_name = "MMU_TABLE_WALK_INSTRUCTION" }, { 
.cev_selector = 0x0408, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x040a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x040b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x040d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, { .cev_selector = 0x05a0, .cev_name = "L1D_TLB_ACCESS" }, { .cev_selector = 0x05a1, .cev_name = "L1D_TLB_MISS" }, { .cev_selector = 0x05a2, .cev_name = "L1D_CACHE_MISS_ST" }, @@ -695,20 +741,33 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x05e6, .cev_name = "LD_NT_UOP" }, }, #elif defined(ARM64_BOARD_CONFIG_T8132) - .cel_event_count = 62, + .cel_event_count = 102, .cel_events = { { .cev_selector = 0x0000, .cev_name = "NONE" }, + { .cev_selector = 0x0003, .cev_name = "ARM_L1D_CACHE_REFILL" }, + { .cev_selector = 0x0004, .cev_name = "ARM_L1D_CACHE" }, { .cev_selector = 0x0008, .cev_name = "INST_ALL" }, + { .cev_selector = 0x0010, .cev_name = "ARM_BR_MIS_PRED" }, { .cev_selector = 0x0011, .cev_name = "CORE_ACTIVE_CYCLE" }, + { .cev_selector = 0x0012, .cev_name = "ARM_BR_PRED" }, { .cev_selector = 0x0021, .cev_name = "INST_BRANCH" }, { .cev_selector = 0x0022, .cev_name = "BRANCH_MISPRED_NONSPEC" }, + { .cev_selector = 0x0023, .cev_name = "ARM_STALL_FRONTEND" }, + { .cev_selector = 0x0024, .cev_name = "ARM_STALL_BACKEND" }, + { .cev_selector = 0x0039, .cev_name = "ARM_L1D_CACHE_LMISS_RD" }, { .cev_selector = 0x003a, .cev_name = "RETIRE_UOP" }, { .cev_selector = 0x003b, .cev_name = "MAP_UOP" }, + { .cev_selector = 0x003c, .cev_name = "ARM_STALL" }, + { .cev_selector = 0x003d, .cev_name = "ARM_STALL_SLOT_BACKEND" }, + { .cev_selector = 0x003e, .cev_name = "ARM_STALL_SLOT_FRONTEND" }, + { .cev_selector = 0x003f, .cev_name = "ARM_STALL_SLOT" }, + { .cev_selector = 0x0040, .cev_name = "ARM_L1D_CACHE_RD" }, { .cev_selector = 0x0182, .cev_name = "MAP_DISPATCH_BUBBLE_IC" }, { .cev_selector = 0x0183, .cev_name = "MAP_DISPATCH_BUBBLE_ITLB" }, { .cev_selector = 0x01d4, .cev_name = "L1I_TLB_MISS_DEMAND" }, { .cev_selector = 0x01d6, .cev_name = "MAP_DISPATCH_BUBBLE" }, { .cev_selector = 0x01de, .cev_name = "FETCH_RESTART" }, + { .cev_selector = 0x01e1, .cev_name = "MAP_DISPATCH_BUBBLE_SLOT" }, { .cev_selector = 0x026c, .cev_name = "INTERRUPT_PENDING" }, { .cev_selector = 0x0270, .cev_name = "MAP_STALL_DISPATCH" }, { .cev_selector = 0x0275, .cev_name = "MAP_REWIND" }, @@ -716,13 +775,35 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x027c, .cev_name = "MAP_INT_UOP" }, { .cev_selector = 0x027d, .cev_name = "MAP_LDST_UOP" }, { .cev_selector = 0x027e, .cev_name = "MAP_SIMD_UOP" }, + { .cev_selector = 0x0283, .cev_name = "SCHEDULE_UOP_ANY" }, + { .cev_selector = 0x0285, .cev_name = "MAP_INT_SME_UOP" }, + { .cev_selector = 0x0286, .cev_name = "SME_ENGINE_SM_ENABLE" }, + { .cev_selector = 0x0287, .cev_name = "SME_ENGINE_SM_ZA_ENABLE" }, + { .cev_selector = 0x0288, .cev_name = "SME_ENGINE_ZA_ENABLED_SM_DISABLED" }, + { .cev_selector = 0x028c, .cev_name = "LDST_UNIT_WAITING_SME_ENGINE_INST_QUEUE_FULL" }, + { .cev_selector = 0x028e, .cev_name = "SCHEDULE_WAITING_SME_ENGINE_REG_DATA" }, + { .cev_selector = 0x028f, .cev_name = "LDST_UNIT_WAITING_SME_ENGINE_MEM_DATA" }, + { .cev_selector = 0x0290, .cev_name = "LDST_UNIT_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0291, .cev_name = "LDST_UNIT_WAITING_OLD_L1D_CACHE_MISS" }, + { .cev_selector = 0x0294, .cev_name = "LD_UNIT_WAITING_YOUNG_L1D_CACHE_MISS" }, + { .cev_selector = 0x02ad, .cev_name = "MAP_RECOVERY" }, + { .cev_selector = 0x02ae, 
.cev_name = "MAP_STALL_NONRECOVERY" }, + { .cev_selector = 0x0351, .cev_name = "SCHEDULE_EMPTY" }, { .cev_selector = 0x0404, .cev_name = "L1I_TLB_FILL" }, { .cev_selector = 0x0405, .cev_name = "L1D_TLB_FILL" }, { .cev_selector = 0x0407, .cev_name = "MMU_TABLE_WALK_INSTRUCTION" }, { .cev_selector = 0x0408, .cev_name = "MMU_TABLE_WALK_DATA" }, { .cev_selector = 0x040a, .cev_name = "L2_TLB_MISS_INSTRUCTION" }, { .cev_selector = 0x040b, .cev_name = "L2_TLB_MISS_DATA" }, - { .cev_selector = 0x040d, .cev_name = "MMU_VIRTUAL_MEMORY_FAULT_NONSPEC" }, + { .cev_selector = 0x0508, .cev_name = "LDST_SME_XPG_UOP" }, + { .cev_selector = 0x0529, .cev_name = "INST_SME_ENGINE_PACKING_FUSED" }, + { .cev_selector = 0x052c, .cev_name = "LD_BLOCKED_BY_SME_LDST" }, + { .cev_selector = 0x052e, .cev_name = "ST_BARRIER_BLOCKED_BY_SME_LDST" }, + { .cev_selector = 0x0573, .cev_name = "LD_SME_NT_UOP" }, + { .cev_selector = 0x0574, .cev_name = "ST_SME_NT_UOP" }, + { .cev_selector = 0x0575, .cev_name = "LD_SME_NORMAL_UOP" }, + { .cev_selector = 0x0576, .cev_name = "ST_SME_NORMAL_UOP" }, + { .cev_selector = 0x0577, .cev_name = "LDST_SME_PRED_INACTIVE" }, { .cev_selector = 0x05a0, .cev_name = "L1D_TLB_ACCESS" }, { .cev_selector = 0x05a1, .cev_name = "L1D_TLB_MISS" }, { .cev_selector = 0x05a2, .cev_name = "L1D_CACHE_MISS_ST" }, @@ -750,10 +831,15 @@ static const struct cpc_event_list _known_cpmu_events = { { .cev_selector = 0x089a, .cev_name = "INST_SIMD_ALU" }, { .cev_selector = 0x089b, .cev_name = "INST_LDST" }, { .cev_selector = 0x089c, .cev_name = "INST_BARRIER" }, + { .cev_selector = 0x089f, .cev_name = "INST_SIMD_ALU_VEC" }, + { .cev_selector = 0x08a0, .cev_name = "INST_SME_ENGINE_SCALARFP" }, + { .cev_selector = 0x08a1, .cev_name = "INST_SME_ENGINE_LD" }, + { .cev_selector = 0x08a2, .cev_name = "INST_SME_ENGINE_ST" }, + { .cev_selector = 0x08a3, .cev_name = "INST_SME_ENGINE_ALU" }, { .cev_selector = 0x08bf, .cev_name = "L1D_CACHE_MISS_LD_NONSPEC" }, { .cev_selector = 0x08c0, .cev_name = "L1D_CACHE_MISS_ST_NONSPEC" }, { .cev_selector = 0x08c1, .cev_name = "L1D_TLB_MISS_NONSPEC" }, - { .cev_selector = 0x08c4, .cev_name = "ST_MEMORY_ORDER_VIOLATION_NONSPEC" }, + { .cev_selector = 0x08c4, .cev_name = "ST_MEM_ORDER_VIOL_LD_NONSPEC" }, { .cev_selector = 0x08c5, .cev_name = "BRANCH_COND_MISPRED_NONSPEC" }, { .cev_selector = 0x08c6, .cev_name = "BRANCH_INDIR_MISPRED_NONSPEC" }, { .cev_selector = 0x08c8, .cev_name = "BRANCH_RET_INDIR_MISPRED_NONSPEC" }, diff --git a/osfmk/arm64/cpu.c b/osfmk/arm64/cpu.c index 8d2cad37e..b58e28818 100644 --- a/osfmk/arm64/cpu.c +++ b/osfmk/arm64/cpu.c @@ -114,10 +114,6 @@ extern void typhoon_return_from_wfi(void); extern void arm64_retention_wfi(void); #endif -sysreg_restore_t sysreg_restore __attribute__((section("__DATA, __const"))) = { - .tcr_el1 = TCR_EL1_BOOT, -}; - // wfi - wfi mode // 0 : disabled // 1 : normal diff --git a/osfmk/arm64/cswitch.s b/osfmk/arm64/cswitch.s index 04bfce82f..e75c9e7fe 100644 --- a/osfmk/arm64/cswitch.s +++ b/osfmk/arm64/cswitch.s @@ -211,15 +211,11 @@ .macro set_process_dependent_keys_and_sync_context thread, new_key, tmp_key, cpudatap, wsync -#if defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) || defined(HAS_APPLE_PAC) +#if defined(HAS_APPLE_PAC) ldr \cpudatap, [\thread, ACT_CPUDATAP] -#endif /* defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) || defined(HAS_APPLE_PAC) */ +#endif /* defined(HAS_APPLE_PAC) */ -#if defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) - ldrb \wsync, [\cpudatap, CPU_SYNC_ON_CSWITCH] -#else /* defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) */ mov 
\wsync, #0 -#endif #if CSWITCH_ROP_KEYS ldr \new_key, [\thread, TH_ROP_PID] @@ -244,15 +240,16 @@ Lskip_rop_keys_\@: Lskip_jop_keys_\@: #endif /* CSWITCH_JOP_KEYS */ - cbz \wsync, 1f - isb sy + cbnz \wsync, Lsync_now_\@ + b 1f + +Lsync_now_\@: + isb sy #if HAS_PARAVIRTUALIZED_PAC 1: /* guests need to clear the sync flag even after skipping the isb, in case they synced via hvc instead */ #endif -#if defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) strb wzr, [\cpudatap, CPU_SYNC_ON_CSWITCH] -#endif 1: .endmacro diff --git a/osfmk/arm64/dbgwrap.c b/osfmk/arm64/dbgwrap.c index 7e7eeb8fe..18193a471 100644 --- a/osfmk/arm64/dbgwrap.c +++ b/osfmk/arm64/dbgwrap.c @@ -104,7 +104,7 @@ dbgwrap_status_t ml_dbgwrap_halt_cpu(int cpu_index, uint64_t timeout_ns) { cpu_data_t *cdp = cpu_datap(cpu_index); - if ((cdp == NULL) || (cdp->coresight_base[CORESIGHT_UTT] == 0)) { + if ((cdp == NULL) || (cdp->coresight_base[CORESIGHT_UTT] == 0) || (cdp->coresight_base[CORESIGHT_ED] == 0)) { return DBGWRAP_ERR_UNSUPPORTED; } @@ -120,6 +120,23 @@ ml_dbgwrap_halt_cpu(int cpu_index, uint64_t timeout_ns) return DBGWRAP_ERR_INPROGRESS; } + /* Accessing EDPRSR or DBGWRAP when cluster is powered off would result in LLC bus error. Hence, check + * the cluster power status to make sure these PIO registers are accessible.*/ + if (!PE_cpu_power_check_kdp(cpu_index)) { + return DBGWRAP_WARN_CPU_OFFLINE; + } + + /* Ensure memory-mapped coresight registers can be written */ + *((volatile uint32_t *)(cdp->coresight_base[CORESIGHT_ED] + ARM_DEBUG_OFFSET_DBGLAR)) = ARM_DBG_LOCK_ACCESS_KEY; + + /* A core that is not fully powered (e.g. idling in wfi) can still be halted; the dbgwrap + * register and certain coresight registers such EDPRSR are in the always-on domain and their + * values are retained over ACC power down. And the OS lock defaults to being set but we clear + * it first thing when CPU is up, so use that to detect the offline state of individual CPU. */ + if (*((volatile uint32_t *)(cdp->coresight_base[CORESIGHT_ED] + EDPRSR_REG_OFFSET)) & EDPRSR_OSLK) { + return DBGWRAP_WARN_CPU_OFFLINE; + } + volatile dbgwrap_reg_t *dbgWrapReg = (volatile dbgwrap_reg_t *)(cdp->coresight_base[CORESIGHT_UTT] + DBGWRAP_REG_OFFSET); if (ml_dbgwrap_cpu_is_halted(cpu_index)) { @@ -232,19 +249,11 @@ ml_dbgwrap_halt_cpu_with_state(int cpu_index, uint64_t timeout_ns, dbgwrap_threa return DBGWRAP_ERR_UNSUPPORTED; } - /* Ensure memory-mapped coresight registers can be written */ - *((volatile uint32_t *)(cdp->coresight_base[CORESIGHT_ED] + ARM_DEBUG_OFFSET_DBGLAR)) = ARM_DBG_LOCK_ACCESS_KEY; - dbgwrap_status_t status = ml_dbgwrap_halt_cpu(cpu_index, timeout_ns); - /* A core that is not fully powered (e.g. idling in wfi) can still be halted; the dbgwrap - * register and certain coresight registers such EDPRSR are in the always-on domain. - * However, EDSCR/EDITR are not in the always-on domain and will generate a parity abort - * on read. EDPRSR can be safely read in all cases, and the OS lock defaults to being set - * but we clear it first thing, so use that to detect the offline state. 
*/ - if (*((volatile uint32_t *)(cdp->coresight_base[CORESIGHT_ED] + EDPRSR_REG_OFFSET)) & EDPRSR_OSLK) { + if (status == DBGWRAP_WARN_CPU_OFFLINE) { bzero(state, sizeof(*state)); - return DBGWRAP_WARN_CPU_OFFLINE; + return status; } uint32_t instr; diff --git a/osfmk/arm64/exception_asm.h b/osfmk/arm64/exception_asm.h index fa5f8cdfd..799d22c46 100644 --- a/osfmk/arm64/exception_asm.h +++ b/osfmk/arm64/exception_asm.h @@ -274,7 +274,7 @@ Lspill_registers_poison_continue_\@: .ifnb \options_register Lspill_registers_skip_elr_far_\@: .endif /* options_register != NONE */ - str w21, [x0, SS64_ESR] + str x21, [x0, SS64_ESR] str w23, [x0, SS64_CPSR] .endmacro diff --git a/osfmk/arm64/genassym.c b/osfmk/arm64/genassym.c index 918cf0cfa..7995de7f3 100644 --- a/osfmk/arm64/genassym.c +++ b/osfmk/arm64/genassym.c @@ -156,6 +156,7 @@ main(int argc, #endif #if HAS_ARM_FEAT_SME DECLARE("ACT_UMATRIX_HDR", offsetof(struct thread, machine.umatrix_hdr)); + DECLARE("ACT_UMATRIX_HDR_DIVERSIFIER", ptrauth_string_discriminator("machine_thread.umatrix_hdr")); #endif /* HAS_ARM_FEAT_SME */ DECLARE("TH_CTH_SELF", offsetof(struct thread, machine.cthread_self)); DECLARE("ACT_PREEMPT_CNT", offsetof(struct thread, machine.preemption_count)); @@ -352,8 +353,6 @@ main(int argc, DECLARE("BA_TOP_OF_KERNEL_DATA", offsetof(struct boot_args, topOfKernelData)); DECLARE("BA_BOOT_FLAGS", offsetof(struct boot_args, bootFlags)); - DECLARE("SR_RESTORE_TCR_EL1", offsetof(struct sysreg_restore, tcr_el1)); - #if XNU_MONITOR DECLARE("PMAP_CPU_DATA_INFLIGHT_PMAP", offsetof(struct pmap_cpu_data, inflight_pmap)); DECLARE("PMAP_CPU_DATA_PPL_STATE", offsetof(struct pmap_cpu_data, ppl_state)); @@ -380,9 +379,7 @@ main(int argc, #endif /* defined(HAS_APPLE_PAC) */ -#if ERET_IS_NOT_CONTEXT_SYNCHRONIZING DECLARE("CPU_SYNC_ON_CSWITCH", offsetof(cpu_data_t, sync_on_cswitch)); -#endif /* ERET_IS_NOT_CONTEXT_SYNCHRONIZING */ #if HIBERNATION DECLARE("HIBHDR_STACKOFFSET", offsetof(IOHibernateImageHeader, restore1StackOffset)); @@ -402,7 +399,6 @@ main(int argc, #endif - #if CONFIG_SPTM && (DEVELOPMENT || DEBUG) DECLARE("PANIC_LOCKDOWN_INITIATOR_STATE_INITIATOR_PC", offsetof(struct panic_lockdown_initiator_state, initiator_pc)); DECLARE("PANIC_LOCKDOWN_INITIATOR_STATE_INITIATOR_SP", offsetof(struct panic_lockdown_initiator_state, initiator_sp)); diff --git a/osfmk/arm64/hibernate_arm64.c b/osfmk/arm64/hibernate_arm64.c index 56bfa5f91..595ff072f 100644 --- a/osfmk/arm64/hibernate_arm64.c +++ b/osfmk/arm64/hibernate_arm64.c @@ -186,23 +186,48 @@ hibernate_page_list_setall_machine(hibernate_page_list_t * page_list, #endif /* XNU_MONITOR */ if (!preflight) { - // mark the stack as unavailable for clobbering during restore; - // we won't actually save it because we mark these pages as free - // in hibernate_page_list_set_volatile + /* + * mark the stack as unavailable for clobbering during restore; + * we won't actually save it because we mark these pages as free + * in hibernate_page_list_set_volatile + */ hibernate_set_page_state(page_list, page_list_wired, stack_first_page, stack_page_count, kIOHibernatePageStateWiredSave); #if XNU_MONITOR - // Mark the PPL stack as not needing to be saved. Any PPL memory that is - // excluded from the image will need to be explicitly checked for in - // pmap_check_ppl_hashed_flag_all(). That function ensures that all - // PPL pages are contained within the image (so any memory explicitly - // not being saved, needs to be removed from the check). + /* + * Mark the PPL stack as not needing to be saved. 
Any PPL memory that is + * excluded from the image will need to be explicitly checked for in + * pmap_check_ppl_hashed_flag_all(). That function ensures that all + * PPL pages are contained within the image (so any memory explicitly + * not being saved, needs to be removed from the check). + */ hibernate_set_page_state(page_list, page_list_wired, atop_64(pmap_stacks_start_pa), pmap_stack_page_count, kIOHibernatePageStateFree); #endif /* XNU_MONITOR */ + +#if CONFIG_SPTM + /* + * Pages for which a hibernate-io-range explicitly prohibits + * hibernation restore to write to them must not be + * clobbered. They also will not be saved, because + * hibernate_page_list_set_volatile() will mark them + * appropriately as well. + */ + bool (^exclude)(pmap_io_range_t const *) = + ^bool (pmap_io_range_t const *range) { + if (range->wimg & PMAP_IO_RANGE_PROHIBIT_HIB_WRITE) { + /* No-op if page not in any bitmap (i.e. not managed DRAM). */ + hibernate_set_page_state(page_list, page_list_wired, + range->addr >> PAGE_SHIFT, range->len >> PAGE_SHIFT, + kIOHibernatePageStateWiredSave); + } + return true; + }; + pmap_range_iterate(exclude); +#endif /* CONFIG_SPTM */ } *pagesOut += stack_page_count; @@ -219,8 +244,10 @@ hibernate_page_list_set_volatile(hibernate_page_list_t * page_list, { vm_offset_t page, count; - // hibernation restore runs on the interrupt stack, - // so we need to make sure we don't save it + /* + * hibernation restore runs on the interrupt stack, + * so we need to make sure we don't save it + */ pal_hib_get_stack_pages(&page, &count); hibernate_set_page_state(page_list, page_list_wired, page, count, @@ -228,7 +255,22 @@ hibernate_page_list_set_volatile(hibernate_page_list_t * page_list, *pagesOut -= count; #if CONFIG_SPTM - /** + /* + * Pages that are explicitly prohibited to be restored by a + * pmap-io-range must also not be saved. + */ + bool (^exclude)(pmap_io_range_t const *) = ^bool (pmap_io_range_t const * range) { + if (range->wimg & PMAP_IO_RANGE_PROHIBIT_HIB_WRITE) { + /* No-op if page not in any bitmap (i.e. not managed DRAM). */ + hibernate_set_page_state(page_list, page_list_wired, + range->addr >> PAGE_SHIFT, range->len >> PAGE_SHIFT, + kIOHibernatePageStateFree); + } + return true; + }; + pmap_range_iterate(exclude); + + /* * On SPTM-based systems, parts of the CTRR-protected regions will be * loaded from disk by iBoot instead of being loaded from the hibernation * image for security reasons. 
Because those regions are being loaded from @@ -241,8 +283,7 @@ hibernate_page_list_set_volatile(hibernate_page_list_t * page_list, for (size_t i = 0; i < SPTMArgs->hib_metadata->num_iboot_loaded_ranges; ++i) { const hib_phys_range_t *range = &SPTMArgs->hib_metadata->iboot_loaded_ranges[i]; hibernate_set_page_state(page_list, page_list_wired, - range->first_page, range->page_count, - kIOHibernatePageStateFree); + range->first_page, range->page_count, kIOHibernatePageStateFree); *pagesOut -= range->page_count; } #endif /* CONFIG_SPTM */ diff --git a/osfmk/arm64/hibernate_restore.c b/osfmk/arm64/hibernate_restore.c index 748754726..aee58ac37 100644 --- a/osfmk/arm64/hibernate_restore.c +++ b/osfmk/arm64/hibernate_restore.c @@ -130,6 +130,11 @@ pal_hib_decompress_page(void *src, void *dst, void *scratch, unsigned int compre uint32_t reserved3:14; } result = { .status = ~0u }; __asm__ volatile ("wkdmd %0, %1" : "=r"(result): "r"(dst), "0"(wkdmSrc)); + +#if defined APPLEH16 || defined APPLEACC8 + __builtin_arm_dmb(DMB_ISH); +#endif /* defined APPLEH16 || defined APPLEACC8 */ + HIB_ASSERT(result.status == 0); } diff --git a/osfmk/arm64/iofilter_asm.s b/osfmk/arm64/iofilter_asm.s index 3598067eb..5a8f55f4d 100644 --- a/osfmk/arm64/iofilter_asm.s +++ b/osfmk/arm64/iofilter_asm.s @@ -52,5 +52,5 @@ LEXT(io_filter_vtop) tst x9, #0x1 // Check PAR_EL1.F to see if translation was successful. csel x0, x10, xzr, eq // If translation was successful return PA, else 0. - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(io_filter_vtop) #endif /* HAS_GUARDED_IO_FILTER && !CONFIG_SPTM */ diff --git a/osfmk/arm64/locore.s b/osfmk/arm64/locore.s index 48ba1b743..0a36e841b 100644 --- a/osfmk/arm64/locore.s +++ b/osfmk/arm64/locore.s @@ -42,6 +42,13 @@ #include #endif +// If __ARM_KERNEL_PROTECT__, eret is preceeded by an ISB before returning to userspace. +// Otherwise, use BIT_ISB_PENDING flag to track that we need to issue an isb before eret if needed. +#if defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) && !__ARM_KERNEL_PROTECT__ +#define ERET_NEEDS_ISB 1 +#define BIT_ISB_PENDING 0 +#endif /* defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) && !__ARM_KERNEL_PROTECT__ */ + #if XNU_MONITOR && !CONFIG_SPTM /* * CHECK_EXCEPTION_RETURN_DISPATCH_PPL @@ -267,10 +274,14 @@ Lbegin_panic_lockdown_continue_\@: b.ne Lvalid_stack_\@ // ...validate the stack pointer mrs x0, SP_EL0 // Get SP_EL0 mrs x1, TPIDR_EL1 // Get thread pointer - cbnz x1, Ltest_kstack_\@ // Can only continue if TPIDR_EL1 is set -0: - wfe - b 0b // Can't do much else but wait here for debugger. 
+ /* + * Check for either a NULL TPIDR or a NULL kernel stack, both of which + * are expected in early boot, but will cause recursive faults if not + * handled specially, + */ + cbz x1, Lcorrupt_stack_\@ + ldr x2, [x1, TH_KSTACKPTR] + cbz x2, Lcorrupt_stack_\@ Ltest_kstack_\@: LOAD_KERN_STACK_TOP dst=x2, src=x1, tmp=x3 // Get top of kernel stack sub x3, x2, KERNEL_STACK_SIZE // Find bottom of kernel stack @@ -730,12 +741,15 @@ el1_sp1_serror_vector_long: mrs x1, TPIDR_EL1 // Load the thread register + #if HAS_ARM_FEAT_SME str x2, [sp, SS64_X2] // current_thread()->machine.umatrix_hdr == NULL: this thread has never // executed smstart, so no SME state to save - ldr x2, [x1, ACT_UMATRIX_HDR] + add x0, x1, ACT_UMATRIX_HDR + ldr x2, [x0] cbz x2, 1f + AUTDA_DIVERSIFIED x2, address=x0, diversifier=ACT_UMATRIX_HDR_DIVERSIFIER mrs x0, SVCR str x0, [x2, SME_SVCR] @@ -939,19 +953,8 @@ TRAP_UNWIND_DIRECTIVES ARM64_JUMP_TARGET mrs x1, ESR_EL1 // Load exception syndrome mrs x2, FAR_EL1 // Load fault address - - /* At this point, the LR contains the value of ELR_EL1. In the case of an - * instruction prefetch abort, this will be the faulting pc, which we know - * to be invalid. This will prevent us from backtracing through the - * exception if we put it in our stack frame, so we load the LR from the - * exception saved state instead. - */ - and w6, w1, #(ESR_EC_MASK) - lsr w6, w6, #(ESR_EC_SHIFT) - mov w4, #(ESR_EC_IABORT_EL1) - cmp w6, w4 - b.eq Lfleh_sync_load_lr -Lvalid_link_register: + mrs lr, ELR_EL1 + /* NB: lr might not be a valid address (e.g. instruction abort). */ PUSH_FRAME #if CONFIG_SPTM @@ -989,10 +992,6 @@ Lfleh_synchronous_continue: mov x28, xzr // Don't need to check PFZ if there are ASTs b exception_return_dispatch -Lfleh_sync_load_lr: - ldr lr, [x0, SS64_LR] - b Lvalid_link_register - #if CONFIG_SPTM Lfleh_synchronous_ool_check_exception_el1: /* Save off arguments needed for sleh_sync as we may clobber */ @@ -1045,10 +1044,10 @@ Lblocked_user_sync_exception: TRAP_UNWIND_DIRECTIVES /* * User space took a sync exception after panic lockdown had been initiated. - * The system is going to panic soon, so let's just re-enable FIQs and wait - * for debugger sync. + * The system is going to panic soon, so let's just re-enable interrupts and + * wait for debugger sync. */ - msr DAIFClr, #DAIFSC_FIQF + msr DAIFClr, #(DAIFSC_STANDARD_DISABLE) 0: wfe b 0b @@ -1191,6 +1190,8 @@ UNWIND_EPILOGUE .text .align 2 fleh_invalid_stack: + TRAP_UNWIND_PROLOGUE + TRAP_UNWIND_DIRECTIVES ARM64_JUMP_TARGET #if CONFIG_SPTM /* @@ -1217,10 +1218,13 @@ fleh_invalid_stack: PUSH_FRAME bl EXT(sleh_invalid_stack) // Shouldn't return! b . + UNWIND_EPILOGUE .text .align 2 fleh_synchronous_sp1: + TRAP_UNWIND_PROLOGUE + TRAP_UNWIND_DIRECTIVES ARM64_JUMP_TARGET #if CONFIG_SPTM /* @@ -1251,10 +1255,28 @@ Lfleh_synchronous_sp1_skip_panic_lockdown: mrs x1, ESR_EL1 mrs x2, FAR_EL1 #endif /* CONFIG_SPTM */ - + /* + * If we got here before we have a kernel thread or kernel stack (e.g. + * still on init_thread) and we try to panic(), we'll end up in an infinite + * nested exception, so just stop here instead to preserve the call stack. + */ + mrs x9, TPIDR_EL1 + cbz x9, 0f + ldr x9, [x9, TH_KSTACKPTR] + cbz x9, 0f PUSH_FRAME bl EXT(sleh_synchronous_sp1) b . +0: + PUSH_FRAME + bl EXT(el1_sp1_synchronous_vector_long_invalid_kstack) + b . 
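The added checks above guard the SP1 synchronous-exception paths during early boot: before falling into code that needs a valid kernel stack (and may panic), they verify both that TPIDR_EL1 holds a thread pointer and that the thread's kernel stack pointer is non-NULL, and otherwise park the core in a WFE loop rather than take a recursive fault. A minimal C sketch of that decision follows; the struct layout and helper name are hypothetical stand-ins, and the authoritative logic is the assembly above.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-in for the thread field behind TH_KSTACKPTR. */
struct thread_min {
	uintptr_t kstackptr;
};

/* Hypothetical: returns the thread pointer held in TPIDR_EL1, or NULL. */
extern struct thread_min *current_thread_from_tpidr(void);

static inline bool
sp1_handler_can_run_c_code(void)
{
	struct thread_min *th = current_thread_from_tpidr();

	/* No thread yet (very early boot): calling panic() would recurse. */
	if (th == NULL) {
		return false;
	}
	/* Thread exists but has no kernel stack yet: same problem. */
	if (th->kstackptr == 0) {
		return false;
	}
	return true;
}

/* Callers that get 'false' back park the CPU (the assembly spins in WFE)
 * so a watchdog or attached debugger can collect state instead. */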
+ UNWIND_EPILOGUE + +LEXT(el1_sp1_synchronous_vector_long_invalid_kstack) +0: + wfe + b 0b // Spin for watchdog .text .align 2 @@ -1564,18 +1586,19 @@ Lexception_return_restore_registers: mrs x5, FPCR CMSR FPCR, x5, x4, 1 1: - + mov x5, #0 #if HAS_ARM_FEAT_SME - mrs x2, SPSR_EL1 and x2, x2, #(PSR64_MODE_EL_MASK) cmp x2, #(PSR64_MODE_EL0) // SPSR_EL1.M != EL0: no SME state to restore bne Lno_sme_saved_state mrs x3, TPIDR_EL1 - ldr x2, [x3, ACT_UMATRIX_HDR] + add x3, x3, ACT_UMATRIX_HDR + ldr x2, [x3] cbz x2, Lno_sme_saved_state + AUTDA_DIVERSIFIED x2, address=x3, diversifier=ACT_UMATRIX_HDR_DIVERSIFIER ldr x3, [x2, SME_SVCR] msr SVCR, x3 @@ -1615,6 +1638,35 @@ Lno_sme_saved_state: Lskip_restore_neon_saved_state: #endif + + // If sync_on_cswitch and ERET is not a CSE, issue an ISB now. Unconditionally clear the + // sync_on_cswitch flag. + mrs x1, TPIDR_EL1 + ldr x1, [x1, ACT_CPUDATAP] + + // Redefined for backporting. +#if defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) && !__ARM_KERNEL_PROTECT__ + ldrb w2, [x1, CPU_SYNC_ON_CSWITCH] +#if ERET_NEEDS_ISB + // Set the bit, but don't sync, it will be synced shortly after this. + orr x5, x5, x2, lsl #(BIT_ISB_PENDING) +#else + cbz w2, 1f + // Last chance, sync now. + isb sy +1: +#endif /* ERET_NEEDS_ISB */ +#endif /* defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) && !__ARM_KERNEL_PROTECT__ */ + strb wzr, [x1, CPU_SYNC_ON_CSWITCH] + + +#if ERET_NEEDS_ISB + // Apply any pending isb from earlier. + tbz x5, #(BIT_ISB_PENDING), Lskip_eret_isb + isb sy +Lskip_eret_isb: +#endif /* ERET_NEEDS_ISB */ + /* Restore arm_saved_state64 */ // Skip x0, x1 - we're using them @@ -1668,7 +1720,10 @@ Lskip_restore_neon_saved_state: msr TTBR0_EL1, x18 mov x18, #0 - /* We don't need an ISB here, as the eret is synchronizing. */ +#if defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) + isb sy +#endif /* defined(ERET_IS_NOT_CONTEXT_SYNCHRONIZING) */ + Lskip_ttbr1_switch: #endif /* __ARM_KERNEL_PROTECT__ */ @@ -2277,7 +2332,7 @@ Lskip_preemption_check_sptmhook: ldp x20, x21, [sp], #0x10 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_sptm_pre_entry_hook) .align 2 .globl EXT(_sptm_post_exit_hook) @@ -2367,7 +2422,7 @@ Lsptm_skip_ast_taken_sptmhook: /* Return. 
*/ ldp x20, x21, [sp], 0x10 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_sptm_post_exit_hook) #endif /* CONFIG_SPTM */ #if CONFIG_SPTM && (DEVELOPMENT || DEBUG) diff --git a/osfmk/arm64/loose_ends.c b/osfmk/arm64/loose_ends.c index aab4b16a7..9f1264ffc 100644 --- a/osfmk/arm64/loose_ends.c +++ b/osfmk/arm64/loose_ends.c @@ -33,9 +33,11 @@ #include #include #include +#include #include #include #include +#include #include #if !MACH_KDP #include @@ -63,7 +65,6 @@ #define BCOPY_PHYS_SRC_IS_USER(flags) (((flags) & (cppvPsrc | cppvKmap)) == 0) #define BCOPY_PHYS_DST_IS_USER(flags) (((flags) & (cppvPsnk | cppvKmap)) == 0) - static kern_return_t bcopy_phys_internal(addr64_t src, addr64_t dst, vm_size_t bytes, int flags) { @@ -78,7 +79,6 @@ bcopy_phys_internal(addr64_t src, addr64_t dst, vm_size_t bytes, int flags) addr64_t end __assert_only; kern_return_t res = KERN_SUCCESS; - if (!BCOPY_PHYS_SRC_IS_USER(flags)) { assert(!__improbable(os_add_overflow(src, bytes, &end))); } @@ -374,20 +374,26 @@ ml_phys_read_data(pmap_paddr_t paddr, int size) } #ifdef ML_IO_TIMEOUTS_ENABLED - bool istate, timeread = false; - uint64_t sabs, eabs; + bool istate, use_timeout = false; + kern_timeout_t timeout; uint32_t report_phy_read_delay = os_atomic_load(&report_phy_read_delay_to, relaxed); uint32_t const trace_phy_read_delay = os_atomic_load(&trace_phy_read_delay_to, relaxed); if (__improbable(report_phy_read_delay != 0)) { istate = ml_set_interrupts_enabled_with_debug(false, false); - sabs = ml_get_timebase(); - timeread = true; + + kern_timeout_start(&timeout, TF_NONSPEC_TIMEBASE | TF_SAMPLE_PMC); + use_timeout = true; + + mmio_track_t *mmiot = PERCPU_GET(mmio_tracker); + mmiot->mmio_start_mt = kern_timeout_start_time(&timeout); + mmiot->mmio_paddr = paddr; + mmiot->mmio_vaddr = 0; } #ifdef ML_IO_SIMULATE_STRETCHED_ENABLED - if (__improbable(timeread && simulate_stretched_io)) { - sabs -= simulate_stretched_io; + if (__improbable(use_timeout && simulate_stretched_io)) { + kern_timeout_stretch(&timeout, simulate_stretched_io); } #endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */ #endif /* ML_IO_TIMEOUTS_ENABLED */ @@ -439,13 +445,14 @@ ml_phys_read_data(pmap_paddr_t paddr, int size) } #ifdef ML_IO_TIMEOUTS_ENABLED - if (__improbable(timeread)) { - eabs = ml_get_timebase(); + if (__improbable(use_timeout)) { + kern_timeout_end(&timeout, TF_NONSPEC_TIMEBASE); + uint64_t duration = kern_timeout_gross_duration(&timeout); - iotrace(IOTRACE_PHYS_READ, 0, addr, size, result, sabs, eabs - sabs); + iotrace(IOTRACE_PHYS_READ, 0, addr, size, result, kern_timeout_start_time(&timeout), duration); - if (__improbable((eabs - sabs) > report_phy_read_delay)) { - DTRACE_PHYSLAT4(physread, uint64_t, (eabs - sabs), + if (__improbable(duration > report_phy_read_delay)) { + DTRACE_PHYSLAT4(physread, uint64_t, duration, uint64_t, addr, uint32_t, size, uint64_t, result); uint64_t override = 0; @@ -466,22 +473,23 @@ ml_phys_read_data(pmap_paddr_t paddr, int size) } } - if (__improbable((eabs - sabs) > report_phy_read_delay)) { + if (__improbable(duration > report_phy_read_delay)) { if (phy_read_panic && (machine_timeout_suspended() == FALSE)) { + char str[128]; const uint64_t hi = (uint64_t)(result >> 64); const uint64_t lo = (uint64_t)(result); - uint64_t nsec = 0; - absolutetime_to_nanoseconds(eabs - sabs, &nsec); - panic("Read from physical addr 0x%llx took %llu ns, " - "result: 0x%016llx%016llx (start: %llu, end: %llu), ceiling: %llu", - (unsigned long long)addr, nsec, hi, lo, sabs, eabs, - (uint64_t)report_phy_read_delay); + + 
snprintf(str, sizeof(str), + "Read from physical addr 0x%llx (result: 0x%016llx%016llx) timed out:", + (unsigned long long)addr, hi, lo); + kern_timeout_try_panic(KERN_TIMEOUT_MMIO, paddr, &timeout, str, + report_phy_read_delay); } } - if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) { + if (__improbable(trace_phy_read_delay > 0 && duration > trace_phy_read_delay)) { KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_PHYS_READ), - (eabs - sabs), sabs, addr, result); + duration, kern_timeout_start_time(&timeout), addr, result); } ml_set_interrupts_enabled_with_debug(istate, false); @@ -599,20 +607,26 @@ ml_phys_write_data(pmap_paddr_t paddr, uint128_t data, int size) } #ifdef ML_IO_TIMEOUTS_ENABLED - bool istate, timewrite = false; - uint64_t sabs, eabs; + bool istate, use_timeout = false; + kern_timeout_t timeout; uint32_t report_phy_write_delay = os_atomic_load(&report_phy_write_delay_to, relaxed); uint32_t const trace_phy_write_delay = os_atomic_load(&trace_phy_write_delay_to, relaxed); if (__improbable(report_phy_write_delay != 0)) { istate = ml_set_interrupts_enabled_with_debug(false, false); - sabs = ml_get_timebase(); - timewrite = true; + + kern_timeout_start(&timeout, TF_NONSPEC_TIMEBASE | TF_SAMPLE_PMC); + use_timeout = true; + + mmio_track_t *mmiot = PERCPU_GET(mmio_tracker); + mmiot->mmio_start_mt = kern_timeout_start_time(&timeout); + mmiot->mmio_paddr = paddr; + mmiot->mmio_vaddr = 0; } #ifdef ML_IO_SIMULATE_STRETCHED_ENABLED - if (__improbable(timewrite && simulate_stretched_io)) { - sabs -= simulate_stretched_io; + if (__improbable(use_timeout && simulate_stretched_io)) { + kern_timeout_stretch(&timeout, simulate_stretched_io); } #endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */ #endif /* ML_IO_TIMEOUTS_ENABLED */ @@ -659,17 +673,19 @@ ml_phys_write_data(pmap_paddr_t paddr, uint128_t data, int size) } #ifdef ML_IO_TIMEOUTS_ENABLED - if (__improbable(timewrite)) { - eabs = ml_get_timebase(); + if (__improbable(use_timeout)) { + kern_timeout_end(&timeout, TF_NONSPEC_TIMEBASE); + uint64_t duration = kern_timeout_gross_duration(&timeout); - iotrace(IOTRACE_PHYS_WRITE, 0, paddr, size, data, sabs, eabs - sabs); + iotrace(IOTRACE_PHYS_WRITE, 0, paddr, size, data, kern_timeout_start_time(&timeout), duration); - if (__improbable((eabs - sabs) > report_phy_write_delay)) { - DTRACE_PHYSLAT4(physwrite, uint64_t, (eabs - sabs), + if (__improbable(duration > report_phy_write_delay)) { + DTRACE_PHYSLAT4(physwrite, uint64_t, duration, uint64_t, paddr, uint32_t, size, uint64_t, data); uint64_t override = 0; override_io_timeouts(0, paddr, NULL, &override); + if (override != 0) { #if SCHED_HYGIENE_DEBUG /* @@ -685,22 +701,23 @@ ml_phys_write_data(pmap_paddr_t paddr, uint128_t data, int size) } } - if (__improbable((eabs - sabs) > report_phy_write_delay)) { + if (__improbable(duration > report_phy_write_delay)) { if (phy_write_panic && (machine_timeout_suspended() == FALSE)) { + char str[128]; const uint64_t hi = (uint64_t)(data >> 64); const uint64_t lo = (uint64_t)(data); - uint64_t nsec = 0; - absolutetime_to_nanoseconds(eabs - sabs, &nsec); - panic("Write from physical addr 0x%llx took %llu ns, " - "data: 0x%016llx%016llx (start: %llu, end: %llu), ceiling: %llu", - (unsigned long long)paddr, nsec, hi, lo, sabs, eabs, - (uint64_t)report_phy_write_delay); + + snprintf(str, sizeof(str), + "Write to physical addr 0x%llx (data: 0x%016llx%016llx) timed out:", + (unsigned long long)paddr, hi, lo); + kern_timeout_try_panic(KERN_TIMEOUT_MMIO, paddr, &timeout, str, + 
report_phy_write_delay); } } - if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) { + if (__improbable(trace_phy_write_delay > 0 && duration > trace_phy_write_delay)) { KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_PHYS_WRITE), - (eabs - sabs), sabs, paddr, data); + duration, kern_timeout_start_time(&timeout), paddr, data); } ml_set_interrupts_enabled_with_debug(istate, false); diff --git a/osfmk/arm64/lowmem_vectors.c b/osfmk/arm64/lowmem_vectors.c index eeeef42cf..0e77368fb 100644 --- a/osfmk/arm64/lowmem_vectors.c +++ b/osfmk/arm64/lowmem_vectors.c @@ -73,7 +73,11 @@ lowglo lowGlo __attribute__ ((aligned(PAGE_MAX_SIZE))) = { .lgPmapMemPagesize = (uint64_t)sizeof(struct vm_page), .lgPmapMemFromArrayMask = VM_PAGE_PACKED_FROM_ARRAY, .lgPmapMemPackedShift = VM_PAGE_PACKED_PTR_SHIFT, +#ifndef __BUILDING_XNU_LIB_UNITTEST__ .lgPmapMemPackedBaseAddr = VM_PAGE_PACKED_PTR_BASE, +#else + .lgPmapMemPackedBaseAddr = 0, /* not a compile-time constant when building for unit-test */ +#endif .lgPmapMemStartAddr = -1, .lgPmapMemEndAddr = -1, .lgPmapMemFirstppnum = -1, diff --git a/osfmk/arm64/lz4_decode_arm64.s b/osfmk/arm64/lz4_decode_arm64.s index 632776e95..d5b3b2bfd 100644 --- a/osfmk/arm64/lz4_decode_arm64.s +++ b/osfmk/arm64/lz4_decode_arm64.s @@ -81,7 +81,7 @@ .macro clear_frame_and_return ldp fp, lr, [sp], #16 - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _lz4_decode_asm .endm // copy_1x16 SOURCE_ADDR DESTINATION_ADDR diff --git a/osfmk/arm64/lz4_encode_arm64.s b/osfmk/arm64/lz4_encode_arm64.s index 1c5a51e8c..6b9ec30f4 100644 --- a/osfmk/arm64/lz4_encode_arm64.s +++ b/osfmk/arm64/lz4_encode_arm64.s @@ -393,7 +393,7 @@ L_done: // clear frame ldp fp, lr, [sp], #16 - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _lz4_encode_2gb L_revert_x9_and_done: sub x9, x9, #1 diff --git a/osfmk/arm64/machine_routines.c b/osfmk/arm64/machine_routines.c index 911b6ef42..4ea7174fb 100644 --- a/osfmk/arm64/machine_routines.c +++ b/osfmk/arm64/machine_routines.c @@ -52,6 +52,8 @@ #include #include #include +#include +#include #include #include #include @@ -66,7 +68,7 @@ #include #endif /* HIBERNATION */ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #include #endif @@ -75,6 +77,7 @@ #include #endif /* CONFIG_SPTM */ +#include #include /** @@ -417,7 +420,7 @@ get_tcr(void) return value; } -boolean_t +__mockable boolean_t ml_get_interrupts_enabled(void) { uint64_t value; @@ -438,7 +441,7 @@ get_mmu_ttb(void) return value; } -uint32_t +MARK_AS_FIXUP_TEXT uint32_t get_arm_cpu_version(void) { uint32_t value = machine_read_midr(); @@ -544,6 +547,8 @@ ml_is_secure_hib_supported(void) return false; } +static void ml_release_deferred_pages(void); + void machine_lockdown(void) { @@ -559,6 +564,7 @@ machine_lockdown(void) #endif arm_vm_prot_finalize(PE_state.bootArgs); + ml_release_deferred_pages(); #if CONFIG_KERNEL_INTEGRITY #if KERNEL_INTEGRITY_WT @@ -594,7 +600,7 @@ machine_lockdown(void) #endif /* SCHED_HYGIENE_DEBUG */ enable_preemption(); #else -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* KTRR * * Lock physical KTRR region. KTRR region is read-only. 
Memory outside @@ -602,7 +608,7 @@ machine_lockdown(void) */ rorgn_lockdown(); -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ #endif /* CONFIG_SPTM */ #if XNU_MONITOR @@ -924,7 +930,7 @@ void machine_signal_idle_deferred( processor_t processor) { - cpu_signal_deferred(processor_to_cpu_datap(processor)); + cpu_signal_deferred(processor_to_cpu_datap(processor), SIGPdeferred); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_DEFERRED_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0); } @@ -932,7 +938,7 @@ void machine_signal_idle_cancel( processor_t processor) { - cpu_signal_cancel(processor_to_cpu_datap(processor)); + cpu_signal_cancel(processor_to_cpu_datap(processor), SIGPdeferred); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_REMOTE_CANCEL_AST), processor->cpu_id, 0 /* nop */, 0, 0, 0); } @@ -1102,6 +1108,7 @@ ml_cluster_power_override(unsigned int *flag) PE_parse_boot_argn("cluster_power", flag, sizeof(*flag)); } + static void ml_read_chip_revision(unsigned int *rev __unused) { @@ -1354,7 +1361,7 @@ ml_map_cpu_pio(void) } } -unsigned int +__mockable unsigned int ml_get_cpu_count(void) { return topology_info.num_cpus; @@ -1505,7 +1512,7 @@ ml_get_max_die_id(void) void ml_lockdown_init() { -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) rorgn_stash_range(); #endif } @@ -1652,9 +1659,7 @@ ml_processor_register(ml_processor_info_t *in_processor_info, pset = pset_find(in_processor_info->cluster_id, NULL); kprintf("[%d]%s>pset_find(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, in_processor_info->cluster_id, pset ? pset->pset_id : -1); if (pset == NULL) { - pset_cluster_type_t pset_cluster_type = cluster_type_to_pset_cluster_type(this_cpu_datap->cpu_cluster_type); - pset_node_t pset_node = cluster_type_to_pset_node(this_cpu_datap->cpu_cluster_type); - pset = pset_create(pset_node, pset_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id); + pset = pset_create(this_cpu_datap->cpu_cluster_type, this_cpu_datap->cpu_cluster_id, this_cpu_datap->cpu_cluster_id); assert(pset != PROCESSOR_SET_NULL); #if __AMP__ kprintf("[%d]%s>pset_create(cluster_id=%d) returned pset %d\n", current_processor()->cpu_id, __FUNCTION__, this_cpu_datap->cpu_cluster_id, pset->pset_id); @@ -2041,6 +2046,45 @@ ml_physaddr_in_bootkc_range(vm_offset_t physaddr) } #endif /* defined(CONFIG_SPTM) */ +/* + * List of ml_static_mfree()'d pages that have been freed before + * physical aperture sliding has taken place. If sliding has not + * occurred yet, ml_static_mfree() will create pages, but not add them + * to the free page queue yet. If it did, code that e.g. calls + * pmap_page_alloc() could get a page back whose physical aperture + * will later be slid, potentially leaving dangling pointers pointing + * to the old kva of the page behind. + * + * Such errors are hard to avoid and hard to debug, so instead we + * queue pages in this dedicated list, and release all accumulated + * pages into the regular free queue all at once right after phys + * aperture sliding took place in arm_vm_prot_finalize(). 
+ */ +static +vm_page_list_t ml_static_mfree_pre_slide_list; + +/* + * Indicates whether we still need ml_static_mfree() to queue up pages + * in ml_static_free_pre_slide_list. If not, ml_static_mfree() + * directly releases newly created pages into the free queue instead. + */ +static +bool ml_static_mfree_queue_up = true; + +/* + * Release all pages queued up by ml_static_mfree() to the free queue. + * This should be called after physical aperture sliding has taken + * place (i.e. in arm_vm_prot_finalize()), to indicate that the + * physical aperture is now stable, and subsequently ml_static_mfree() + * can directly release pages into the free queue instead. + */ +static void +ml_release_deferred_pages(void) +{ + vm_page_free_list(ml_static_mfree_pre_slide_list.vmpl_head, false); + ml_static_mfree_queue_up = false; +} + /* * Routine: ml_static_mfree * Function: @@ -2091,7 +2135,14 @@ ml_static_mfree( paddr_cur = ptoa(ppn); - vm_page_create_canonical(ppn); + if (__probable(!ml_static_mfree_queue_up)) { + vm_page_create_canonical(ppn); + } else { + vm_page_t m = vm_page_create(ppn, true, Z_WAITOK); + + vm_page_list_push(&ml_static_mfree_pre_slide_list, m); + } + freed_pages++; #if defined(CONFIG_SPTM) if (ml_physaddr_in_bootkc_range(paddr_cur)) @@ -2907,6 +2958,12 @@ ml_hibernate_active_pre(void) if (kIOHibernateStateWakingFromHibernate == gIOHibernateState) { hibernate_rebuild_vm_structs(); + +#if CONFIG_SPTM + /* Tell the pmap that hibernation restoration has started. */ + extern secure_hmac_hib_state_t pmap_hibernation_state; + pmap_hibernation_state = SECURE_HMAC_HIB_RESTORE; +#endif /* CONFIG_SPTM */ } #endif /* HIBERNATION */ } @@ -3059,18 +3116,6 @@ ml_get_backtrace_pc(struct arm_saved_state *state) } -bool -ml_paddr_is_exclaves_owned(vm_offset_t paddr) -{ -#if CONFIG_SPTM - const sptm_frame_type_t type = sptm_get_frame_type(paddr); - return type == SK_DEFAULT || type == SK_IO; // SK_SHARED_R[OW] are not exclusively exclaves frames -#else - #pragma unused(paddr) - return false; -#endif /* CONFIG_SPTM */ -} - /** * Panic because an ARM saved-state accessor expected user saved-state but was * passed non-user saved-state. @@ -3119,3 +3164,41 @@ ml_task_post_signature_processing_hook(__unused task_t task) } + +#if DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT +static bool SECURITY_READ_ONLY_LATE(_unsafe_kernel_text_initialized) = false; +static bool SECURITY_READ_ONLY_LATE(_unsafe_kernel_text) = false; + +__mockable bool +ml_unsafe_kernel_text(void) +{ + assert(_unsafe_kernel_text_initialized); + return _unsafe_kernel_text; +} + +__startup_func +static void +ml_unsafe_kernel_text_init(void) +{ + /* Grab the values written by iBoot. */ + + DTEntry entry; + const void *value; + unsigned int size; + if (SecureDTLookupEntry(0, "/chosen", &entry) == kSuccess && + SecureDTGetProperty(entry, "kernel-ctrr-to-be-enabled", &value, &size) == kSuccess && + size == sizeof(int)) { + _unsafe_kernel_text_initialized = true; + _unsafe_kernel_text = (0 == *(const int *)value); + } +} +STARTUP(TUNABLES, STARTUP_RANK_FIRST, ml_unsafe_kernel_text_init); + +#else /* DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT */ +bool +ml_unsafe_kernel_text(void) +{ + /* Kernel text is never writable under these configs. 
*/ + return false; +} +#endif /* DEVELOPMENT || DEBUG || CONFIG_DTRACE || CONFIG_CSR_FROM_DT */ diff --git a/osfmk/arm64/machine_routines_asm.s b/osfmk/arm64/machine_routines_asm.s index 535ff7a29..6ac282ad6 100644 --- a/osfmk/arm64/machine_routines_asm.s +++ b/osfmk/arm64/machine_routines_asm.s @@ -77,6 +77,10 @@ Lcre_start_\@: COPYIO_RECOVER_TABLE_SYM copyio_recover_table +.macro COPYIO_STACK_PROLOG +0: + ARM64_STACK_PROLOG +.endmacro #if defined(HAS_APPLE_PAC) @@ -459,7 +463,8 @@ L_mmu_kvtop_wpreflight_invalid: .align 2 copyio_error: POP_FRAME // Return the error populated in x0 - ARM64_STACK_EPILOG // by the exception handler + // by the exception handler + ARM64_STACK_EPILOG #if CONFIG_XNUPOST /* @@ -484,6 +489,99 @@ LEXT(arm64_panic_lockdown_test_copyio_fault_pc) #endif /* CONFIG_XNUPOST */ +/* + * We have several different _bcopy{in|out} implementations, with slightly different + * recovery models. This macro provides a common backbone for all of them, so that + * we don't risk (implementation/optimization) differences among them. + */ +.macro BCOPY_IMPL src, dst, len, end_label + // \src: Source pointer + // \dst: Destination pointer + // \len: Length + // \end_label: Label to jump to at the end of the copy + + /* If len is less than 256 bytes, do 16 bytewise copy */ + cmp \len, #256 + b.lt 2f + sub \len, \len, #256 + /* 256 bytes at a time */ +1: + /* 0-64 bytes */ + ldp x3, x4, [\src] + stp x3, x4, [\dst] + ldp x5, x6, [\src, #16] + stp x5, x6, [\dst, #16] + ldp x3, x4, [\src, #32] + stp x3, x4, [\dst, #32] + ldp x5, x6, [\src, #48] + stp x5, x6, [\dst, #48] + + /* 64-128 bytes */ + ldp x3, x4, [\src, #64] + stp x3, x4, [\dst, #64] + ldp x5, x6, [\src, #80] + stp x5, x6, [\dst, #80] + ldp x3, x4, [\src, #96] + stp x3, x4, [\dst, #96] + ldp x5, x6, [\src, #112] + stp x5, x6, [\dst, #112] + + /* 128-192 bytes */ + ldp x3, x4, [\src, #128] + stp x3, x4, [\dst, #128] + ldp x5, x6, [\src, #144] + stp x5, x6, [\dst, #144] + ldp x3, x4, [\src, #160] + stp x3, x4, [\dst, #160] + ldp x5, x6, [\src, #176] + stp x5, x6, [\dst, #176] + + /* 192-256 bytes */ + ldp x3, x4, [\src, #192] + stp x3, x4, [\dst, #192] + ldp x5, x6, [\src, #208] + stp x5, x6, [\dst, #208] + ldp x3, x4, [\src, #224] + stp x3, x4, [\dst, #224] + ldp x5, x6, [\src, #240] + stp x5, x6, [\dst, #240] + + add \src, \src, #256 + add \dst, \dst, #256 + + subs \len, \len, #256 + b.ge 1b + + /* Fixup the len and test for completion */ + adds \len, \len, #256 + b.eq \end_label + +2: + /* If len is less than 16 bytes, just do a bytewise copy */ + cmp \len, #16 + b.lt 4f + sub \len, \len, #16 + +3: + /* 16 bytes at a time */ + ldp x3, x4, [\src], #16 + stp x3, x4, [\dst], #16 + subs \len, \len, #16 + b.ge 3b + + /* Fixup the len and test for completion */ + adds \len, \len, #16 + b.eq \end_label + +4: /* Bytewise */ + subs \len, \len, #1 + ldrb w3, [\src], #1 + strb w3, [\dst], #1 + b.hi 4b + + //Fallthrough if copy finishes here. 
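The BCOPY_IMPL backbone above copies in three tiers: 256-byte unrolled ldp/stp blocks, then 16-byte pairs, then a bytewise tail, falling through once the length reaches zero. A rough, purely illustrative C equivalent of that strategy is sketched below; it deliberately ignores the copyio recovery ranges and register scrubbing that the real assembly depends on.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative C restatement of the tiered copy loop: big chunks first,
 * then 16-byte steps, then single bytes.
 */
static void
tiered_copy(uint8_t *dst, const uint8_t *src, size_t len)
{
	/* 256 bytes at a time (the assembly unrolls this into ldp/stp pairs). */
	while (len >= 256) {
		memcpy(dst, src, 256);
		src += 256;
		dst += 256;
		len -= 256;
	}
	/* 16 bytes at a time. */
	while (len >= 16) {
		memcpy(dst, src, 16);
		src += 16;
		dst += 16;
		len -= 16;
	}
	/* Bytewise tail. */
	while (len > 0) {
		*dst++ = *src++;
		len--;
	}
}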
+.endm + /* * int _bcopyin(const user_addr_t src, char *dst, vm_size_t len) */ @@ -491,83 +589,13 @@ LEXT(arm64_panic_lockdown_test_copyio_fault_pc) .align 2 .globl EXT(_bcopyin) LEXT(_bcopyin) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME - COPYIO_RECOVER_RANGE 5f - /* If len is less than 256 bytes, do 16 bytewise copy */ - cmp x2, #256 - b.lt 2f - sub x2, x2, #256 - /* 256 bytes at a time */ -1: - /* 0-64 bytes */ - ldp x3, x4, [x0] - stp x3, x4, [x1] - ldp x5, x6, [x0, #16] - stp x5, x6, [x1, #16] - ldp x3, x4, [x0, #32] - stp x3, x4, [x1, #32] - ldp x5, x6, [x0, #48] - stp x5, x6, [x1, #48] + COPYIO_RECOVER_RANGE _bcopyin_end - /* 64-128 bytes */ - ldp x3, x4, [x0, #64] - stp x3, x4, [x1, #64] - ldp x5, x6, [x0, #80] - stp x5, x6, [x1, #80] - ldp x3, x4, [x0, #96] - stp x3, x4, [x1, #96] - ldp x5, x6, [x0, #112] - stp x5, x6, [x1, #112] + BCOPY_IMPL x0, x1, x2, _bcopyin_end - /* 128-192 bytes */ - ldp x3, x4, [x0, #128] - stp x3, x4, [x1, #128] - ldp x5, x6, [x0, #144] - stp x5, x6, [x1, #144] - ldp x3, x4, [x0, #160] - stp x3, x4, [x1, #160] - ldp x5, x6, [x0, #176] - stp x5, x6, [x1, #176] - - /* 192-256 bytes */ - ldp x3, x4, [x0, #192] - stp x3, x4, [x1, #192] - ldp x5, x6, [x0, #208] - stp x5, x6, [x1, #208] - ldp x3, x4, [x0, #224] - stp x3, x4, [x1, #224] - ldp x5, x6, [x0, #240] - stp x5, x6, [x1, #240] - - add x0, x0, #256 - add x1, x1, #256 - - subs x2, x2, #256 - b.ge 1b - /* Fixup the len and test for completion */ - adds x2, x2, #256 - b.eq 5f -2: - /* If len is less than 16 bytes, just do a bytewise copy */ - cmp x2, #16 - b.lt 4f - sub x2, x2, #16 -3: - /* 16 bytes at a time */ - ldp x3, x4, [x0], #16 - stp x3, x4, [x1], #16 - subs x2, x2, #16 - b.ge 3b - /* Fixup the len and test for completion */ - adds x2, x2, #16 - b.eq 5f -4: /* Bytewise */ - subs x2, x2, #1 - ldrb w3, [x0], #1 - strb w3, [x1], #1 - b.hi 4b -5: +_bcopyin_end: mov x0, xzr /* * x3, x4, x5 and x6 now contain user-controlled values which may be used to form @@ -580,7 +608,7 @@ LEXT(_bcopyin) mov x5, xzr mov x6, xzr POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_bcopyin) #if CONFIG_DTRACE /* @@ -590,7 +618,7 @@ LEXT(_bcopyin) .align 2 .globl EXT(dtrace_nofault_copy8) LEXT(dtrace_nofault_copy8) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE 1f ldrb w8, [x0] @@ -598,7 +626,7 @@ LEXT(dtrace_nofault_copy8) strb w8, [x1] mov x0, #0 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(dtrace_nofault_copy8) /* * int dtrace_nofault_copy16(const char *src, uint32_t *dst) @@ -607,7 +635,7 @@ LEXT(dtrace_nofault_copy8) .align 2 .globl EXT(dtrace_nofault_copy16) LEXT(dtrace_nofault_copy16) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE 1f ldrh w8, [x0] @@ -615,7 +643,7 @@ LEXT(dtrace_nofault_copy16) strh w8, [x1] mov x0, #0 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(dtrace_nofault_copy16) #endif /* CONFIG_DTRACE */ @@ -632,7 +660,7 @@ LEXT(dtrace_nofault_copy32) #endif .globl EXT(_copyin_atomic32) LEXT(_copyin_atomic32) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE 1f ldr w8, [x0] @@ -645,7 +673,7 @@ LEXT(_copyin_atomic32) * C wrapper. So, no need to zero it out here. 
*/ POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_copyin_atomic32) /* * int _copyin_atomic32_wait_if_equals(const user_addr_t src, uint32_t value) @@ -654,7 +682,7 @@ LEXT(_copyin_atomic32) .align 2 .globl EXT(_copyin_atomic32_wait_if_equals) LEXT(_copyin_atomic32_wait_if_equals) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE 2f ldxr w8, [x0] @@ -672,7 +700,7 @@ LEXT(_copyin_atomic32_wait_if_equals) * C wrapper. So, no need to zero it out here. */ POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_copyin_atomic32_wait_if_equals) /* @@ -687,7 +715,7 @@ LEXT(dtrace_nofault_copy64) #endif .globl EXT(_copyin_atomic64) LEXT(_copyin_atomic64) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE Lcopyin_atomic64_common ldr x8, [x0] @@ -700,7 +728,7 @@ Lcopyin_atomic64_common: * C wrapper. So, no need to zero it out here. */ POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_copyin_atomic64) /* @@ -710,14 +738,14 @@ Lcopyin_atomic64_common: .align 2 .globl EXT(_copyout_atomic32) LEXT(_copyout_atomic32) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE 1f str w0, [x1] 1: mov x0, #0 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_copyout_atomic32) /* * int _copyout_atomic64(uint64_t u64, user_addr_t dst) @@ -726,15 +754,14 @@ LEXT(_copyout_atomic32) .align 2 .globl EXT(_copyout_atomic64) LEXT(_copyout_atomic64) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE 1f str x0, [x1] 1: mov x0, #0 POP_FRAME - ARM64_STACK_EPILOG - + ARM64_STACK_EPILOG EXT(_copyout_atomic64) /* * int _bcopyout(const char *src, user_addr_t dst, vm_size_t len) @@ -743,85 +770,16 @@ LEXT(_copyout_atomic64) .align 2 .globl EXT(_bcopyout) LEXT(_bcopyout) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME - COPYIO_RECOVER_RANGE 5f - /* If len is less than 256 bytes, do 16 bytewise copy */ - cmp x2, #256 - b.lt 2f - sub x2, x2, #256 - /* 256 bytes at a time */ -1: - /* 0-64 bytes */ - ldp x3, x4, [x0] - stp x3, x4, [x1] - ldp x5, x6, [x0, #16] - stp x5, x6, [x1, #16] - ldp x3, x4, [x0, #32] - stp x3, x4, [x1, #32] - ldp x5, x6, [x0, #48] - stp x5, x6, [x1, #48] + COPYIO_RECOVER_RANGE _bcopyout_end - /* 64-128 bytes */ - ldp x3, x4, [x0, #64] - stp x3, x4, [x1, #64] - ldp x5, x6, [x0, #80] - stp x5, x6, [x1, #80] - ldp x3, x4, [x0, #96] - stp x3, x4, [x1, #96] - ldp x5, x6, [x0, #112] - stp x5, x6, [x1, #112] + BCOPY_IMPL x0, x1, x2, _bcopyout_end - /* 128-192 bytes */ - ldp x3, x4, [x0, #128] - stp x3, x4, [x1, #128] - ldp x5, x6, [x0, #144] - stp x5, x6, [x1, #144] - ldp x3, x4, [x0, #160] - stp x3, x4, [x1, #160] - ldp x5, x6, [x0, #176] - stp x5, x6, [x1, #176] - - /* 192-256 bytes */ - ldp x3, x4, [x0, #192] - stp x3, x4, [x1, #192] - ldp x5, x6, [x0, #208] - stp x5, x6, [x1, #208] - ldp x3, x4, [x0, #224] - stp x3, x4, [x1, #224] - ldp x5, x6, [x0, #240] - stp x5, x6, [x1, #240] - - add x0, x0, #256 - add x1, x1, #256 - subs x2, x2, #256 - b.ge 1b - /* Fixup the len and test for completion */ - adds x2, x2, #256 - b.eq 5f -2: - /* If len is less than 16 bytes, just do a bytewise copy */ - cmp x2, #16 - b.lt 4f - sub x2, x2, #16 -3: - /* 16 bytes at a time */ - ldp x3, x4, [x0], #16 - stp x3, x4, [x1], #16 - subs x2, x2, #16 - b.ge 3b - /* Fixup the len and test for completion */ - adds x2, x2, #16 - b.eq 5f -4: /* Bytewise */ - subs x2, x2, #1 - ldrb w3, [x0], #1 - strb w3, [x1], #1 - b.hi 4b -5: +_bcopyout_end: mov x0, #0 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_bcopyout) /* * 
int _bcopyinstr( @@ -834,7 +792,7 @@ LEXT(_bcopyout) .align 2 .globl EXT(_bcopyinstr) LEXT(_bcopyinstr) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE Lcopyinstr_done mov x4, #0 // x4 - total bytes copied @@ -875,7 +833,7 @@ Lcopyinstr_done: mov x4, xzr mov x5, xzr POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(_bcopyinstr) /* * int copyinframe(const vm_address_t frame_addr, char *kernel_addr, bool is64bit) @@ -898,7 +856,7 @@ Lcopyinstr_done: .align 2 .globl EXT(copyinframe) LEXT(copyinframe) - ARM64_STACK_PROLOG + COPYIO_STACK_PROLOG PUSH_FRAME COPYIO_RECOVER_RANGE Lcopyinframe_done cbnz w2, Lcopyinframe64 // Check frame size @@ -932,7 +890,7 @@ Lcopyinframe_valid: Lcopyinframe_done: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(copyinframe) /* @@ -993,7 +951,7 @@ LEXT(hw_lck_ticket_reserve_orig_allow_invalid) 7: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(hw_lck_ticket_reserve_orig_allow_invalid) 9: /* invalid */ #if !defined(__ARM_ARCH_8_2__) @@ -1001,7 +959,7 @@ LEXT(hw_lck_ticket_reserve_orig_allow_invalid) #endif mov w0, #0 POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(hw_lck_ticket_reserve_orig_allow_invalid) /* * uint32_t arm_debug_read_dscr(void) @@ -1080,6 +1038,13 @@ vm_sleep_individual_cpu: orr x9, x9, x10 msr CPU_OVRD, x9 isb +#else + // Mask timer IRQs before entering WFI + mrs x9, ACNTHV_CTL_EL2 + mov x10, #(~ACNTHV_CTL_EL2_EN_MASK) + and x9, x9, x10 + msr ACNTHV_CTL_EL2, x9 + isb #endif is_deep_sleep: #endif @@ -1143,6 +1108,11 @@ Lwfi_inst: dsb sy isb sy wfi +#if NO_CPU_OVRD + // Clear any spurious IPIs received during CPU shutdown + mov x9, #0x1 + msr S3_5_C15_C1_1, x9 +#endif b Lwfi_inst /* @@ -1163,7 +1133,7 @@ LEXT(arm64_force_wfi_clock_gate) #endif POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(arm64_force_wfi_clock_gate) #if HAS_RETENTION_STATE @@ -1233,7 +1203,7 @@ LEXT(arm64_replace_bootstack) mrs x4, DAIF // Load current DAIF; use x4 as pinst may trash x1-x3 msr DAIFSet, #(DAIFSC_STANDARD_DISABLE) // Disable all asynchronous exceptions // Set SP_EL1 to exception stack -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) mov x1, lr bl EXT(pinst_spsel_1) mov lr, x1 @@ -1244,7 +1214,7 @@ LEXT(arm64_replace_bootstack) msr SPSel, #0 msr DAIF, x4 // Restore interrupt state POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(arm64_replace_bootstack) #ifdef MONITOR /* @@ -1400,7 +1370,7 @@ Lintr_enabled_panic: brk #0 Lintr_enabled_str: /* - * Please see the "Signing spilled register state" section of doc/pac.md + * Please see the "Signing spilled register state" section of doc/arm/pac.md * for an explanation of why this is bad and how it should be fixed. */ .asciz "Signed thread state manipulated with interrupts enabled" @@ -1500,11 +1470,37 @@ LEXT(fill32_nt) #if defined(HAS_APPLE_PAC) +/* + * vm_offset_t ml_addrperm_pacga(vm_offset_t addr) + * + * Permutes a 64bit address to a random 64bit value. Lowest + * bit is forced to 1, to distinguish from a NULL pointer. + * + * Expected to be called only with non static kernel addresses. + * + * Should only be called with canonicalized kernel addresses, no PAC + * signature, no TAGS. 
+ */ + .text + .align 2 + .globl EXT(ml_addrperm_pacga) +LEXT(ml_addrperm_pacga) + ARM64_PROLOG + mov w17, #PACGA_TAG_ADDRPERM + pacga x16, x17, x0 + /* Force the output to not be NULL, so debug can tell them apart */ + orr x16, x16, #0x1 + mov w17, #(0x10 | PACGA_TAG_ADDRPERM) + /* pacga puts the output in the top 32bits */ + pacga x0, x17, x0 + /* combine the two outputs into a 64bit value, with lowest bit set to 1 */ + orr x0, x16, x0, lsr #32 + ret /* * ptrauth_utils_sign_blob_generic(const void * ptr, size_t len_bytes, uint64_t data, int flags) * - * See "Signing arbitrary data blobs" of doc/pac.md + * See "Signing arbitrary data blobs" of doc/arm/pac.md */ .text .align 2 @@ -1567,7 +1563,7 @@ Lepilogue_cookie: Lsign_ret: POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(ptrauth_utils_sign_blob_generic) #endif // defined(HAS_APPLE_PAC) diff --git a/osfmk/arm64/memcmp_zero.s b/osfmk/arm64/memcmp_zero.s index 0c0dca62c..7439146e5 100644 --- a/osfmk/arm64/memcmp_zero.s +++ b/osfmk/arm64/memcmp_zero.s @@ -53,7 +53,7 @@ .macro ClearFrameAndReturn ldp fp, lr, [sp], #16 - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _memcmp_zero_ptr_aligned .endm /***************************************************************************** diff --git a/osfmk/arm64/monotonic_arm64.c b/osfmk/arm64/monotonic_arm64.c index b3309aa39..f4354bccc 100644 --- a/osfmk/arm64/monotonic_arm64.c +++ b/osfmk/arm64/monotonic_arm64.c @@ -367,6 +367,20 @@ static uint16_t uncore_active_ctrs = 0; static_assert(sizeof(uncore_active_ctrs) * CHAR_BIT >= UNCORE_NCTRS, "counter mask should fit the full range of counters"); +#if UPMU_9BIT_SELECTORS +static uint16_t uncore_selectors_bit9 = 0; +#endif /* UPMU_9BIT_SELECTORS */ + +static uint64_t +_upmcr0_value(void) +{ +#if UPMU_9BIT_SELECTORS + return (uint64_t)uncore_selectors_bit9 << 36 | (uint64_t)uncore_active_ctrs; +#else /* UPMU_9BIT_SELECTORS */ + return uncore_active_ctrs; +#endif /* !UPMU_9BIT_SELECTORS */ +} + /* * mt_uncore_enabled is true when any uncore counters are active. */ @@ -826,7 +840,7 @@ uncmon_init_locked_l(unsigned int monid) */ CTRL_REG_SET("S3_7_C15_C5_4", uncmon_get_pmi_mask(monid)); uncmon_set_counting_locked_l(monid, - mt_uncore_enabled ? uncore_active_ctrs : 0); + mt_uncore_enabled ? _upmcr0_value() : 0); } #if UNCORE_PER_CLUSTER @@ -840,7 +854,7 @@ uncmon_init_locked_r(unsigned int monid) *(uint64_t *)(acc_impl[monid] + upmpcm_off) = uncmon_get_pmi_mask(monid); uncmon_set_counting_locked_r(monid, - mt_uncore_enabled ? uncore_active_ctrs : 0); + mt_uncore_enabled ? 
_upmcr0_value() : 0); } #endif /* UNCORE_PER_CLUSTER */ @@ -974,7 +988,7 @@ uncore_add(struct monotonic_config *config, uint32_t *ctr_out) return EBUSY; } - uint8_t selector = (uint8_t)config->event; + uint16_t selector = (uint16_t)config->event; uint32_t available = ~uncore_active_ctrs & config->allowed_ctr_mask; if (available == 0) { @@ -1026,7 +1040,11 @@ uncore_add(struct monotonic_config *config, uint32_t *ctr_out) uint32_t ctr = __builtin_ffsll(available) - 1; uncore_active_ctrs |= UINT64_C(1) << ctr; - uncore_config.uc_events.uce_ctrs[ctr] = selector; + uncore_config.uc_events.uce_ctrs[ctr] = (uint8_t)selector; +#if UPMU_9BIT_SELECTORS + uncore_selectors_bit9 &= ~(1 << ctr); + uncore_selectors_bit9 |= ((selector >> 8) & 1) << ctr; +#endif /* UPMU_9BIT_SELECTORS */ uint64_t cpu_mask = UINT64_MAX; if (config->cpu_mask != 0) { cpu_mask = config->cpu_mask; @@ -1109,6 +1127,9 @@ uncore_reset(void) } uncore_active_ctrs = 0; +#if UPMU_9BIT_SELECTORS + uncore_selectors_bit9 = 0; +#endif /* UPMU_9BIT_SELECTORS */ memset(&uncore_config, 0, sizeof(uncore_config)); if (mt_owns_counters()) { @@ -1151,7 +1172,7 @@ uncmon_set_enabled_l_locked(unsigned int monid, bool enable) if (enable) { uncmon_init_locked_l(monid); uncmon_program_events_locked_l(monid); - uncmon_set_counting_locked_l(monid, uncore_active_ctrs); + uncmon_set_counting_locked_l(monid, _upmcr0_value()); } else { uncmon_set_counting_locked_l(monid, 0); } @@ -1171,7 +1192,7 @@ uncmon_set_enabled_r_locked(unsigned int monid, bool enable) if (enable) { uncmon_init_locked_r(monid); uncmon_program_events_locked_r(monid); - uncmon_set_counting_locked_r(monid, uncore_active_ctrs); + uncmon_set_counting_locked_r(monid, _upmcr0_value()); } else { uncmon_set_counting_locked_r(monid, 0); } diff --git a/osfmk/arm64/pac_asm.h b/osfmk/arm64/pac_asm.h index ca26d254f..d5038a942 100644 --- a/osfmk/arm64/pac_asm.h +++ b/osfmk/arm64/pac_asm.h @@ -186,7 +186,7 @@ #define PACGA_TAG_THREAD 0b0010 #define PACGA_TAG_IRG 0b0011 #define PACGA_TAG_HV 0b0100 -#define PACGA_TAG_5 0b0101 +#define PACGA_TAG_ADDRPERM 0b0101 #define PACGA_TAG_6 0b0110 #define PACGA_TAG_7 0b0111 #define PACGA_TAG_8 0b1000 diff --git a/osfmk/arm64/pcb.c b/osfmk/arm64/pcb.c index 7f7630dca..a11dff59a 100644 --- a/osfmk/arm64/pcb.c +++ b/osfmk/arm64/pcb.c @@ -290,6 +290,15 @@ machine_switch_pmap_and_extended_context(thread_t old, thread_t new) * a pending kernel TLB or cache maintenance instruction. */ __builtin_arm_dsb(DSB_ISH); + + /* + * An ISB is needed for similar userspace reasons to the DSB above. Unlike the DSB + * case, the context synchronization needs to happen on the CPU the 'old' thread will + * later be scheduled on. We can rely on the fact that when 'old' is later scheduled, + * whatever thread it is replacing will go through this function as 'old' and will + * issue this ISB on its behalf. + */ + arm_context_switch_requires_sync(); } @@ -354,7 +363,6 @@ machine_thread_on_core_allow_invalid(thread_t thread) * from anything but a thread, zeroed or freed memory. 
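ml_addrperm_pacga, added in the earlier hunk, derives a 64-bit permuted value from two PACGA computations (tagged with the PACGA_TAG_ADDRPERM value renamed above): PACGA places its code in the upper 32 bits, so the routine keeps one result as the high half, shifts the second result, computed with modifier 0x10 | PACGA_TAG_ADDRPERM, down into the low half, and forces bit 0 so the output can never look like a NULL pointer. Below is a rough C rendering of that combining step only; pacga_stub() is a placeholder mixing function standing in for the instruction and the APGA key, used purely to show the data flow.

#include <stdint.h>
#include <stdio.h>

#define PACGA_TAG_ADDRPERM 0x5u

/*
 * Stand-in for the PACGA instruction: a keyed-hash-like mix whose result
 * occupies the top 32 bits of the return value, as the architectural PACGA
 * does.  The mixing here is arbitrary and for illustration only.
 */
static uint64_t
pacga_stub(uint64_t modifier, uint64_t value)
{
    uint64_t x = value ^ (modifier * 0x9E3779B97F4A7C15ull);
    x ^= x >> 33; x *= 0xFF51AFD7ED558CCDull; x ^= x >> 33;
    return x & 0xFFFFFFFF00000000ull;      /* code lives in the top 32 bits */
}

/* Mirror of the combining scheme used by ml_addrperm_pacga. */
static uint64_t
addrperm(uint64_t addr)
{
    uint64_t hi = pacga_stub(PACGA_TAG_ADDRPERM, addr) | 0x1;        /* force bit 0 */
    uint64_t lo = pacga_stub(0x10 | PACGA_TAG_ADDRPERM, addr) >> 32; /* low half */
    return hi | lo;    /* 64-bit permuted value, never zero */
}

int
main(void)
{
    uint64_t va = 0xFFFFFE0012345678ull;
    printf("%#llx -> %#llx\n", (unsigned long long)va,
        (unsigned long long)addrperm(va));
    return 0;
}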
*/ assert(get_preemption_level() > 0); - thread = pgz_decode_allow_invalid(thread, ZONE_ID_THREAD); if (thread == THREAD_NULL) { return false; } @@ -1421,13 +1429,20 @@ machine_csv(__unused cpuvn_e cve) return 0; } -#if ERET_IS_NOT_CONTEXT_SYNCHRONIZING void arm_context_switch_requires_sync() { current_cpu_datap()->sync_on_cswitch = 1; } -#endif + +void +arm_context_switch_sync() +{ + if (__improbable(current_cpu_datap()->sync_on_cswitch != 0)) { + __builtin_arm_isb(ISB_SY); + current_cpu_datap()->sync_on_cswitch = 0; + } +} #if __has_feature(ptrauth_calls) boolean_t diff --git a/osfmk/arm64/pinst.s b/osfmk/arm64/pinst.s index 2dcb17308..74f0911e9 100644 --- a/osfmk/arm64/pinst.s +++ b/osfmk/arm64/pinst.s @@ -109,7 +109,7 @@ _pinst_set_sctlr: #endif /* defined(KERNEL_INTEGRITY_KTRR) */ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) .text .section __LAST,__pinst @@ -129,5 +129,5 @@ _pinst_spsel_1: check_instruction x2, x3, __pinst_spsel_1, 0xd65f03c0d50041bf b __pinst_spsel_1 -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ diff --git a/osfmk/arm64/platform_tests.c b/osfmk/arm64/platform_tests.c index 612a3353b..36d10ce3f 100644 --- a/osfmk/arm64/platform_tests.c +++ b/osfmk/arm64/platform_tests.c @@ -60,6 +60,8 @@ #include #include #include +#include +#include #include #include #include @@ -68,6 +70,7 @@ #include #include #include +#include #include #include @@ -83,15 +86,18 @@ #include #include #include +#include #include #include +#include -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #include -#endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #include +kern_return_t arm64_backtrace_test(void); kern_return_t arm64_lock_test(void); kern_return_t arm64_munger_test(void); kern_return_t arm64_pan_test(void); @@ -100,7 +106,7 @@ kern_return_t arm64_late_pan_test(void); #include kern_return_t arm64_ropjop_test(void); #endif -#if defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) kern_return_t ctrr_test(void); kern_return_t ctrr_test_cpu(void); #endif @@ -124,9 +130,11 @@ volatile char pan_fault_value = 0; kern_return_t arm64_panic_lockdown_test(void); #endif /* CONFIG_SPTM */ + #include kern_return_t arm64_speculation_guard_test(void); + #include #define LOCK_TEST_ITERATIONS 50 #define LOCK_TEST_SETUP_TIMEOUT_SEC 15 @@ -310,7 +318,7 @@ lt_stress_ticket_lock() uint cpuid = cpu_number(); - kprintf("%s>cpu %d starting\n", __FUNCTION__, cpuid); + kprintf("%s>cpu %u starting\n", __FUNCTION__, cpuid); lck_ticket_lock(<_ticket_lock, <_ticket_grp); lt_counter++; @@ -320,7 +328,7 @@ lt_stress_ticket_lock() /* Wait until all test threads have finished any binding */ while (lt_counter < lt_target_done_threads) { if (mach_absolute_time() > lt_setup_timeout) { - kprintf("%s>cpu %d noticed that we exceeded setup timeout of %d seconds during initial setup phase (only %d out of %d threads checked in)", + kprintf("%s>cpu %u noticed that we exceeded setup timeout of %d seconds during initial setup phase (only %u out of %u 
threads checked in)", __FUNCTION__, cpuid, LOCK_TEST_SETUP_TIMEOUT_SEC, lt_counter, lt_target_done_threads); return; } @@ -340,13 +348,13 @@ lt_stress_ticket_lock() */ while (lt_counter < 2 * lt_target_done_threads) { if (mach_absolute_time() > lt_setup_timeout) { - kprintf("%s>cpu %d noticed that we exceeded setup timeout of %d seconds during secondary setup phase (only %d out of %d threads checked in)", + kprintf("%s>cpu %u noticed that we exceeded setup timeout of %d seconds during secondary setup phase (only %u out of %u threads checked in)", __FUNCTION__, cpuid, LOCK_TEST_SETUP_TIMEOUT_SEC, lt_counter - lt_target_done_threads, lt_target_done_threads); return; } } - kprintf("%s>cpu %d started\n", __FUNCTION__, cpuid); + kprintf("%s>cpu %u started\n", __FUNCTION__, cpuid); while (lt_counter < limit) { lck_ticket_lock(<_ticket_lock, <_ticket_grp); @@ -359,7 +367,7 @@ lt_stress_ticket_lock() lt_stress_local_counters[cpuid] = local_counter; - kprintf("%s>final counter %d cpu %d incremented the counter %d times\n", __FUNCTION__, lt_counter, cpuid, local_counter); + kprintf("%s>final counter %u cpu %u incremented the counter %u times\n", __FUNCTION__, lt_counter, cpuid, local_counter); } #endif @@ -690,31 +698,33 @@ lt_bound_thread(void *arg, wait_result_t wres __unused) } static void -lt_e_thread(void *arg, wait_result_t wres __unused) +lt_cluster_bound_thread(void *arg, char cluster_type) { void (*func)(void) = (void (*)(void))arg; thread_t thread = current_thread(); - thread_soft_bind_cluster_type(thread, 'e'); + kern_return_t kr = thread_soft_bind_cluster_type(thread, cluster_type); + if (kr != KERN_SUCCESS) { + kprintf("%s>failed to bind to cluster type %c\n", __FUNCTION__, cluster_type); + } func(); OSIncrementAtomic((volatile SInt32*) <_done_threads); } +static void +lt_e_thread(void *arg, wait_result_t wres __unused) +{ + lt_cluster_bound_thread(arg, 'e'); +} + + static void lt_p_thread(void *arg, wait_result_t wres __unused) { - void (*func)(void) = (void (*)(void))arg; - - thread_t thread = current_thread(); - - thread_soft_bind_cluster_type(thread, 'p'); - - func(); - - OSIncrementAtomic((volatile SInt32*) <_done_threads); + lt_cluster_bound_thread(arg, 'p'); } static void @@ -1421,38 +1431,39 @@ arm64_munger_test() return 0; } -#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) +#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) SECURITY_READ_ONLY_LATE(uint64_t) ctrr_ro_test; uint64_t ctrr_nx_test = 0xd65f03c0; /* RET */ volatile uint64_t ctrr_exception_esr; vm_offset_t ctrr_test_va; vm_offset_t ctrr_test_page; +atomic_bool ctrr_test_in_progress; kern_return_t ctrr_test(void) { processor_t p; - boolean_t ctrr_disable = FALSE; - PE_parse_boot_argn("-unsafe_kernel_text", &ctrr_disable, sizeof(ctrr_disable)); - -#if CONFIG_CSR_FROM_DT - if (csr_unsafe_kernel_text) { - ctrr_disable = TRUE; - } -#endif /* CONFIG_CSR_FROM_DT */ - - if (ctrr_disable) { - T_LOG("Skipping CTRR test when -unsafe_kernel_text boot-arg present"); + /* + * The test uses some globals and also a specific reserved VA region, so it + * can't run concurrently. This might otherwise happen via the sysctl + * interface. 
+ */ + bool expected = false; + if (!atomic_compare_exchange_strong_explicit(&ctrr_test_in_progress, + &expected, true, + memory_order_acq_rel, memory_order_relaxed)) { + T_FAIL("Can't run multiple CTRR tests at once"); return KERN_SUCCESS; } + T_LOG("Running CTRR test."); for (p = processor_list; p != NULL; p = p->processor_list) { thread_bind(p); thread_block(THREAD_CONTINUE_NULL); - T_LOG("Running CTRR test on cpu %d\n", p->cpu_id); + T_LOG("Running CTRR test on CPU %d\n", p->cpu_id); ctrr_test_cpu(); } @@ -1460,6 +1471,9 @@ ctrr_test(void) thread_bind(PROCESSOR_NULL); thread_block(THREAD_CONTINUE_NULL); + T_PASS("Done running CTRR test on all CPUs"); + atomic_store_explicit(&ctrr_test_in_progress, false, memory_order_release); + return KERN_SUCCESS; } @@ -1507,7 +1521,7 @@ ctrr_test_nx_fault_handler(arm_saved_state_t * state) // Disable KASAN checking for CTRR tests as the test VA doesn't have a shadow mapping /* test CTRR on a cpu, caller to bind thread to desired cpu */ -/* ctrr_test_page was reserved during bootstrap process */ +/* ctrr_test_page was reserved during bootstrap process if no SPTM */ NOKASAN kern_return_t ctrr_test_cpu(void) { @@ -1517,23 +1531,59 @@ ctrr_test_cpu(void) kern_return_t kr; uint64_t prot = 0; extern vm_offset_t virtual_space_start; + extern vm_offset_t rorgn_begin; + extern vm_offset_t rorgn_end; - /* ctrr read only region = [rorgn_begin_va, rorgn_end_va) */ - -#if (KERNEL_CTRR_VERSION == 3) - const uint64_t rorgn_lwr = __builtin_arm_rsr64("S3_0_C11_C0_2"); - const uint64_t rorgn_upr = __builtin_arm_rsr64("S3_0_C11_C0_3"); -#else /* (KERNEL_CTRR_VERSION == 3) */ - const uint64_t rorgn_lwr = __builtin_arm_rsr64("S3_4_C15_C2_3"); - const uint64_t rorgn_upr = __builtin_arm_rsr64("S3_4_C15_C2_4"); -#endif /* (KERNEL_CTRR_VERSION == 3) */ - vm_offset_t rorgn_begin_va = phystokv(rorgn_lwr); - vm_offset_t rorgn_end_va = phystokv(rorgn_upr) + 0x1000; vm_offset_t ro_test_va = (vm_offset_t)&ctrr_ro_test; vm_offset_t nx_test_va = (vm_offset_t)&ctrr_nx_test; + bool ctrr_enabled = !ml_unsafe_kernel_text(); - T_EXPECT(rorgn_begin_va <= ro_test_va && ro_test_va < rorgn_end_va, "Expect ro_test_va to be inside the CTRR region"); - T_EXPECT((nx_test_va < rorgn_begin_va) ^ (nx_test_va >= rorgn_end_va), "Expect nx_test_va to be outside the CTRR region"); +#if CONFIG_SPTM + if (/* DISABLES CODE */ (1)) { + T_SKIP("Skipping CTRR test because testing under SPTM is not supported yet"); + return KERN_SUCCESS; + } +#endif + +#if defined(KERNEL_INTEGRITY_PV_CTRR) + if (rorgn_begin == 0 && rorgn_end == 0) { + // Under paravirtualized CTRR, it's possible that we want CTRR to be + // enabled but we're running under an older host that doesn't support + // it. + ctrr_enabled = false; + T_LOG("Treating paravirtualized CTRR as disabled due to lack of support"); + } +#endif + + // The CTRR read-only region is the physical address range [rorgn_begin, rorgn_end]. + // rorgn_end will be one byte short of a page boundary. 
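The ctrr_test() change above protects the test globals and the reserved VA with a single-flight flag: the first caller flips ctrr_test_in_progress from false to true with an acquire/release compare-exchange, later callers bail out, and the winner clears the flag with a release store when done. The same pattern in portable C11 is sketched below for reference; the names here are illustrative and not kernel APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Same idea as ctrr_test_in_progress above: only one invocation may run. */
static atomic_bool test_in_progress;

static bool
try_enter_test(void)
{
    bool expected = false;
    return atomic_compare_exchange_strong_explicit(&test_in_progress,
        &expected, true, memory_order_acq_rel, memory_order_relaxed);
}

static void
leave_test(void)
{
    atomic_store_explicit(&test_in_progress, false, memory_order_release);
}

int
main(void)
{
    if (!try_enter_test()) {
        puts("test already running");
        return 1;
    }
    puts("running test body");
    leave_test();
    return 0;
}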
+ if (ctrr_enabled) { + T_EXPECT(rorgn_begin != 0, "Expect rorgn_begin to be set when CTRR enabled"); + T_EXPECT_GE_ULONG(rorgn_end, rorgn_begin, "Expect rorgn_end to be >= rorgn_begin when CTRR enabled"); + + pmap_paddr_t ro_test_pa = kvtophys_nofail(ro_test_va); + pmap_paddr_t nx_test_pa = kvtophys_nofail(nx_test_va); + + T_EXPECT(rorgn_begin <= ro_test_pa && ro_test_pa <= rorgn_end, "Expect ro_test_pa to be inside the CTRR region"); + T_EXPECT((nx_test_pa < rorgn_begin) ^ (nx_test_pa > rorgn_end), "Expect nx_test_pa to be outside the CTRR region"); + } else { + T_EXPECT_EQ_ULONG(rorgn_begin, 0, "Expect rorgn_begin to be unset when CTRR disabled"); + T_EXPECT_EQ_ULONG(rorgn_end, 0, "Expect rorgn_end to be unset when CTRR disabled"); + T_LOG("Skipping region check because CTRR is disabled"); + } + + if (ctrr_enabled) { + T_LOG("Expect no faults when reading CTRR region to verify correct programming of CTRR limits"); + for (pmap_paddr_t page_pa = rorgn_begin; page_pa <= rorgn_end; page_pa += PAGE_SIZE) { + vm_offset_t page_va = phystokv(page_pa); + for (vm_offset_t va = page_va; va < page_va + PAGE_SIZE; va += 8) { + volatile uint64_t x = *(uint64_t *)va; + (void) x; /* read for side effect only */ + } + } + } else { + T_LOG("Skipping read test because CTRR is disabled"); + } ro_pn = pmap_find_phys(kernel_pmap, ro_test_va); nx_pn = pmap_find_phys(kernel_pmap, nx_test_va); @@ -1541,6 +1591,7 @@ ctrr_test_cpu(void) T_LOG("test virtual page: %p, ctrr_ro_test: %p, ctrr_nx_test: %p, ro_pn: %x, nx_pn: %x ", (void *)ctrr_test_page, &ctrr_ro_test, &ctrr_nx_test, ro_pn, nx_pn); + T_ASSERT(ctrr_test_page != 0, "Expect ctrr_test_page to be initialized"); prot = pmap_get_arm64_prot(kernel_pmap, ctrr_test_page); T_EXPECT(~prot & ARM_TTE_VALID, "Expect ctrr_test_page to be unmapped"); @@ -1568,9 +1619,13 @@ ctrr_test_cpu(void) // ensure write permission fault at expected level // data abort handler will set ctrr_exception_esr when ctrr_test_va takes a permission fault - T_EXPECT(ESR_EC(ctrr_exception_esr) == ESR_EC_DABORT_EL1, "Data Abort from EL1 expected"); - T_EXPECT(ISS_DA_FSC(ESR_ISS(ctrr_exception_esr)) == FSC_PERMISSION_FAULT_L3, "Permission Fault Expected"); - T_EXPECT(ESR_ISS(ctrr_exception_esr) & ISS_DA_WNR, "Write Fault Expected"); + if (ctrr_enabled) { + T_EXPECT(ESR_EC(ctrr_exception_esr) == ESR_EC_DABORT_EL1, "Data Abort from EL1 expected"); + T_EXPECT(ISS_DA_FSC(ESR_ISS(ctrr_exception_esr)) == FSC_PERMISSION_FAULT_L3, "Permission Fault Expected"); + T_EXPECT(ESR_ISS(ctrr_exception_esr) & ISS_DA_WNR, "Write Fault Expected"); + } else { + T_EXPECT(ctrr_exception_esr == 0, "No fault expected with CTRR disabled"); + } ctrr_test_va = 0; ctrr_exception_esr = 0; @@ -1580,11 +1635,12 @@ ctrr_test_cpu(void) kr = pmap_enter(kernel_pmap, ctrr_test_page, nx_pn, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE, PMAP_MAPPING_TYPE_INFER); + T_EXPECT(kr == KERN_SUCCESS, "Expect pmap_enter of RX mapping to succeed"); // assert entire mmu prot path (Hierarchical protection model) is NOT XN prot = pmap_get_arm64_prot(kernel_pmap, ctrr_test_page); - T_EXPECT(ARM_PTE_EXTRACT_AP(prot) == AP_RONA && (~prot & ARM_PTE_PNX), "Mapping is EL1 ROX"); + T_EXPECT(ARM_PTE_EXTRACT_AP(prot) == AP_RONA && (~prot & ARM_PTE_PNX), "Mapping is EL1 ROX (prot=0x%lx)", prot); ctrr_test_va = ctrr_test_page + (nx_test_va & PAGE_MASK); #if __has_feature(ptrauth_calls) @@ -1600,24 +1656,26 @@ ctrr_test_cpu(void) ctrr_nx_test_ptr(); ml_expect_fault_end(); - // TODO: ensure execute permission fault at expected 
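The expectations above inspect the recorded ESR to confirm that the write to the CTRR-protected page faulted the intended way: a data abort taken at EL1, a level-3 permission fault, with the WnR bit indicating a write. A small sketch of that decode follows; the field positions and numeric encodings used here are the architectural ones from the Arm ARM, not values defined in this diff.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Field encodings per the Arm architecture (assumed, not taken from this diff). */
#define ESR_EC(esr)        (((esr) >> 26) & 0x3Full)
#define ESR_ISS(esr)       ((esr) & 0x1FFFFFFull)
#define EC_DABORT_SAME_EL  0x25ull      /* data abort taken without EL change */
#define ISS_DA_FSC(iss)    ((iss) & 0x3Full)
#define FSC_PERM_L3        0x0Full      /* level-3 permission fault */
#define ISS_DA_WNR         (1ull << 6)  /* write, not read */

/* Does this ESR describe an EL1 write hitting a read-only level-3 mapping? */
static bool
is_el1_ro_write_fault(uint64_t esr)
{
    uint64_t iss = ESR_ISS(esr);
    return ESR_EC(esr) == EC_DABORT_SAME_EL &&
           ISS_DA_FSC(iss) == FSC_PERM_L3 &&
           (iss & ISS_DA_WNR) != 0;
}

int
main(void)
{
    uint64_t esr = (EC_DABORT_SAME_EL << 26) | ISS_DA_WNR | FSC_PERM_L3;
    printf("write permission fault: %s\n",
        is_el1_ro_write_fault(esr) ? "yes" : "no");
    return 0;
}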
level - T_EXPECT(ESR_EC(ctrr_exception_esr) == ESR_EC_IABORT_EL1, "Instruction abort from EL1 Expected"); - T_EXPECT(ISS_DA_FSC(ESR_ISS(ctrr_exception_esr)) == FSC_PERMISSION_FAULT_L3, "Permission Fault Expected"); + if (ctrr_enabled) { + // FIXME: rdar://143430725 (xnu support for paravirtualized CTXR) + // Without FEAT_XNX support on the host side, we cannot test kernel execution outside CTXR regions. +#if !defined(KERNEL_INTEGRITY_PV_CTRR) + // TODO: ensure execute permission fault at expected level + T_EXPECT(ESR_EC(ctrr_exception_esr) == ESR_EC_IABORT_EL1, "Instruction abort from EL1 Expected"); + T_EXPECT(ISS_DA_FSC(ESR_ISS(ctrr_exception_esr)) == FSC_PERMISSION_FAULT_L3, "Permission Fault Expected"); +#endif /* !defined(KERNEL_INTEGRITY_PV_CTRR) */ + } else { + T_EXPECT(ctrr_exception_esr == 0, "No fault expected with CTRR disabled"); + } ctrr_test_va = 0; ctrr_exception_esr = 0; pmap_remove(kernel_pmap, ctrr_test_page, ctrr_test_page + PAGE_SIZE); - T_LOG("Expect no faults when reading CTRR region to verify correct programming of CTRR limits"); - for (vm_offset_t addr = rorgn_begin_va; addr < rorgn_end_va; addr += 8) { - volatile uint64_t x = *(uint64_t *)addr; - (void) x; /* read for side effect only */ - } - return KERN_SUCCESS; } -#endif /* defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) */ +#endif /* (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) */ /** @@ -1644,6 +1702,7 @@ assert_uniprocessor(void) #if CONFIG_SPTM volatile uint8_t xnu_post_panic_lockdown_did_fire = false; typedef uint64_t (panic_lockdown_helper_fcn_t)(uint64_t raw); +typedef bool (panic_lockdown_precondition_fcn_t)(void); typedef bool (panic_lockdown_recovery_fcn_t)(arm_saved_state_t *); /* SP0 vector tests */ @@ -1659,7 +1718,6 @@ extern panic_lockdown_helper_fcn_t arm64_panic_lockdown_test_ldr_auth_fail; extern panic_lockdown_helper_fcn_t arm64_panic_lockdown_test_fpac; extern panic_lockdown_helper_fcn_t arm64_panic_lockdown_test_copyio; extern uint8_t arm64_panic_lockdown_test_copyio_fault_pc; -extern panic_lockdown_helper_fcn_t arm64_panic_lockdown_test_bti_telemetry; extern int gARM_FEAT_FPACCOMBINE; @@ -1678,6 +1736,7 @@ typedef struct arm64_panic_lockdown_test_case { const char *name; panic_lockdown_helper_fcn_t *func; uint64_t arg; + panic_lockdown_precondition_fcn_t *precondition; esr_exception_class_t expected_ec; bool check_fs; fault_status_t expected_fs; @@ -1858,6 +1917,7 @@ panic_lockdown_pacda_get_invalid_ptr(void) return (uint64_t)unsigned_ptr; } + kern_return_t arm64_panic_lockdown_test(void) { @@ -1993,6 +2053,12 @@ arm64_panic_lockdown_test(void) size_t test_count = sizeof(tests) / sizeof(*tests); for (size_t i = 0; i < test_count; i++) { + if (tests[i].precondition && + !tests[i].precondition()) { + T_LOG("%s skipped due to precondition check", tests[i].name); + continue; + } + panic_lockdown_expect_test( "Exceptions unmasked", &tests[i], @@ -2021,6 +2087,8 @@ arm64_panic_lockdown_test(void) + + #if HAS_SPECRES /*** CPS RCTX ***/ @@ -2346,6 +2414,7 @@ arm64_bti_test(void) #endif /* BTI_ENFORCED */ + /** * Test the speculation guards * We can't easily ensure that the guards actually behave correctly under @@ -2448,3 +2517,46 @@ arm64_speculation_guard_test(void) return KERN_SUCCESS; } + + +extern void arm64_brk_lr_gpr(void); +extern void arm64_brk_lr_fault(void); + +static NOKASAN bool +arm64_backtrace_test_fault_handler(arm_saved_state_t * state) +{ + /* Similar setup to backtrace_kernel_sysctl() */ + const unsigned int bt_len = 
24; + const size_t bt_size = sizeof(uint8_t) * bt_len; + uint8_t *bt = kalloc_data(bt_size, Z_WAITOK | Z_ZERO); + backtrace_info_t packed_info = BTI_NONE; + + /* Call the backtrace function */ + backtrace_packed(BTP_KERN_OFFSET_32, bt, bt_size, NULL, &packed_info); + + add_saved_state_pc(state, 4); + return true; +} + +/** + * Make sure EL1 fleh doesn't push a bogus stack frame when LR is being used as + * a GPR in the caller. + * + * This test writes a GPR-like value into LR that is >4GB away from any kernel + * address and tries to run backtrace_packed() from a sync handler. + * backtrace_packed() has an invariant that all addresses in the stack frame are + * within 4GB of the kernel text. + */ +kern_return_t +arm64_backtrace_test(void) +{ + ml_expect_fault_pc_begin(arm64_backtrace_test_fault_handler, (uintptr_t)&arm64_brk_lr_fault); + arm64_brk_lr_gpr(); + ml_expect_fault_end(); + +#if CONFIG_SPTM && (DEVELOPMENT || DEBUG) + /* Reset the debug data so it can be filled later if needed */ + debug_panic_lockdown_initiator_state.initiator_pc = 0; +#endif /* CONFIG_SPTM && (DEVELOPMENT || DEBUG) */ + return KERN_SUCCESS; +} diff --git a/osfmk/arm64/platform_tests_asm.s b/osfmk/arm64/platform_tests_asm.s index afe07225e..81fbd52fd 100644 --- a/osfmk/arm64/platform_tests_asm.s +++ b/osfmk/arm64/platform_tests_asm.s @@ -29,6 +29,42 @@ #include #include +.macro SAVE_CALLEE_REGISTERS + stp x19, x20, [sp, #-(16 * 10)]! + stp x21, x22, [sp, #0x10] + stp x23, x24, [sp, #0x20] + stp x25, x26, [sp, #0x30] + stp x27, x28, [sp, #0x40] + stp x29, x30, [sp, #0x50] + stp q4, q5, [sp, #0x60] + stp q6, q7, [sp, #0x80] +.endmacro + +.macro LOAD_CALLEE_REGISTERS + ldp x21, x22, [sp, #0x10] + ldp x23, x24, [sp, #0x20] + ldp x25, x26, [sp, #0x30] + ldp x27, x28, [sp, #0x40] + ldp x29, x30, [sp, #0x50] + ldp q4, q5, [sp, #0x60] + ldp q6, q7, [sp, #0x80] + ldp x19, x20, [sp], #(16*10) +.endmacro + + +/** + * Raise a sync exception while LR is being used as a GPR. + */ + .globl EXT(arm64_brk_lr_fault) + .globl EXT(arm64_brk_lr_gpr) +LEXT(arm64_brk_lr_gpr) + ARM64_PROLOG + stp lr, xzr, [sp, #-0x10]! + mov lr, #0x80 +LEXT(arm64_brk_lr_fault) + brk 0xC470 + ldp lr, xzr, [sp], 0x10 + ret #if CONFIG_SPTM .text @@ -97,19 +133,6 @@ LEXT(arm64_panic_lockdown_test_fpac) ret #endif /* __ARM_ARCH_8_6__ */ -#if BTI_ENFORCED && CONFIG_BTI_TELEMETRY - .globl EXT(arm64_panic_lockdown_test_bti_telemetry) -LEXT(arm64_panic_lockdown_test_bti_telemetry) - ARM64_PROLOG - /* - * Trigger a BTI exception on the first instruction *after* the landing pad. - */ -0: - nop - adr x0, 0b - br x0 -#endif /* BTI_ENFORCED && CONFIG_BTI_TELEMETRY */ - /* * SP1 Panic Lockdown Tests * @@ -120,28 +143,6 @@ LEXT(arm64_panic_lockdown_test_bti_telemetry) * callee saved registers here. */ -.macro SAVE_CALLEE_REGISTERS - stp x19, x20, [sp, #-(16 * 10)]! 
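arm64_backtrace_test() above relies on the documented invariant that an offset-packed backtrace can only represent return addresses within 4GB of the kernel text, which is exactly what an LR reused as a general-purpose register violates. The sketch below is a hedged model of that invariant, not the real backtrace_packed(); backtrace_offset32() and its frame struct are hypothetical, illustration-only names.

#include <stdint.h>
#include <stdio.h>

/* Minimal model of an AArch64 frame record: {previous fp, saved lr}. */
struct frame {
    struct frame *fp;
    uint64_t      lr;
};

/*
 * Pack return addresses as 32-bit offsets from a text base, as an
 * offset-packed walk must: anything more than 4GB away from the base cannot
 * be represented, so the walk stops there instead of emitting junk.
 */
static unsigned
backtrace_offset32(const struct frame *fp, uint64_t text_base,
    uint32_t *out, unsigned max)
{
    unsigned n = 0;
    while (fp != NULL && n < max) {
        uint64_t delta = fp->lr - text_base;   /* wraps if lr < base */
        if (delta >= (1ull << 32)) {
            break;     /* LR held a GPR-like value, not a return address */
        }
        out[n++] = (uint32_t)delta;
        fp = fp->fp;
    }
    return n;
}

int
main(void)
{
    uint64_t base = 0xFFFFFE0000000000ull;
    struct frame f1 = { NULL, 0x80 };              /* LR reused as a GPR */
    struct frame f0 = { &f1, base + 0x1234 };
    uint32_t bt[8];
    unsigned n = backtrace_offset32(&f0, base, bt, 8);
    printf("captured %u frame(s), first offset %#x\n", n, n ? bt[0] : 0u);
    return 0;
}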
- stp x21, x22, [sp, #0x10] - stp x23, x24, [sp, #0x20] - stp x25, x26, [sp, #0x30] - stp x27, x28, [sp, #0x40] - stp x29, x30, [sp, #0x50] - stp q4, q5, [sp, #0x60] - stp q6, q7, [sp, #0x80] -.endmacro - -.macro LOAD_CALLEE_REGISTERS - ldp x21, x22, [sp, #0x10] - ldp x23, x24, [sp, #0x20] - ldp x25, x26, [sp, #0x30] - ldp x27, x28, [sp, #0x40] - ldp x29, x30, [sp, #0x50] - ldp q4, q5, [sp, #0x60] - ldp q6, q7, [sp, #0x80] - ldp x19, x20, [sp], #(16*10) -.endmacro - /** * arm64_panic_lockdown_test_sp1_invalid_stack * @@ -176,7 +177,7 @@ LEXT(arm64_panic_lockdown_test_sp1_invalid_stack_handler) /* Return 1 to indicate success */ mov x0, #1 LOAD_CALLEE_REGISTERS - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(arm64_panic_lockdown_test_sp1_invalid_stack) /** * arm64_panic_lockdown_test_sp1_exception_in_vector @@ -198,7 +199,7 @@ LEXT(arm64_panic_lockdown_test_sp1_exception_in_vector_handler) /* Return 1 to indicate success */ mov x0, #1 LOAD_CALLEE_REGISTERS - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(arm64_panic_lockdown_test_sp1_exception_in_vector) #endif /* CONFIG_SPTM */ @@ -224,7 +225,7 @@ LEXT(arm64_bti_test_call_shim) blr x0 #endif /* __has_feature(ptrauth_calls) */ POP_FRAME - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG EXT(arm64_bti_test_call_shim) .globl EXT(arm64_bti_test_func_with_no_landing_pad) LEXT(arm64_bti_test_func_with_no_landing_pad) @@ -257,3 +258,5 @@ LEXT(arm64_bti_test_func_with_pac_landing_pad) retab #endif /* __has_feature(ptrauth_returns) */ #endif /* BTI_ENFORCED */ + + diff --git a/osfmk/arm64/proc_reg.h b/osfmk/arm64/proc_reg.h index 955da0840..2b6d7231c 100644 --- a/osfmk/arm64/proc_reg.h +++ b/osfmk/arm64/proc_reg.h @@ -80,14 +80,22 @@ #include #endif +#if !CONFIG_SPTM /* * Processor registers for ARM */ #if __ARM_42BIT_PA_SPACE__ -/* For now, force the issue! */ -/* We need more VA space for the identity map to bootstrap the MMU */ +/** + * On PPL, the identity map requires a smaller T0SZ value because DRAM starts + * at a PA not mappable by only 3 bits in L1 table on platforms with 42-bit + * PA space. On SPTM, this is overcome by boot with a smaller T0SZ and resize + * to the __ARM64_PMAP_SUBPAGE_L1__ T0SZ when the identity map is no longer + * used. 
+ */ #undef __ARM64_PMAP_SUBPAGE_L1__ +#undef __ARM64_PMAP_KERN_SUBPAGE_L1__ #endif /* __ARM_42BIT_PA_SPACE__ */ +#endif /* !CONFIG_SPTM */ /* For arm platforms, create one pset per cluster */ #define MAX_PSETS MAX_CPU_CLUSTERS @@ -551,9 +559,7 @@ #if HAS_ARM_FEAT_SME // 60 EnTP2 Enable TPIDR2_EL0 at EL0 -#define SCTLR_OTHER (1ULL << 60) -#else -#define SCTLR_OTHER (0) +#define SCTLR_TP2_ENABLED (1ULL << 60) #endif #define SCTLR_EPAN_ENABLED (1ULL << 57) @@ -587,6 +593,7 @@ // 35 BT0 PACIxSP acts as a BTI C landing pad rather than BTI JC at EL0 #define SCTLR_BT0_ENABLED (1ULL << 35) + // 26 UCI User Cache Instructions #define SCTLR_UCI_ENABLED (1ULL << 26) @@ -699,12 +706,20 @@ #define SCTLR_BT_DEFAULT 0 #endif /* BTI_ENFORCED */ +#if HAS_ARM_FEAT_SME +#define SCTLR_TP2_DEFAULT SCTLR_TP2_ENABLED +#else +#define SCTLR_TP2_DEFAULT 0 +#endif + +#define SCTLR_OTHER 0 + #define SCTLR_EL1_REQUIRED \ (SCTLR_RESERVED | SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \ SCTLR_I_ENABLED | SCTLR_SED_DISABLED | SCTLR_CP15BEN_ENABLED | SCTLR_BT_DEFAULT | \ SCTLR_SA0_ENABLED | SCTLR_SA_ENABLED | SCTLR_C_ENABLED | SCTLR_M_ENABLED | \ SCTLR_EPAN_DEFAULT | SCTLR_EIS_DEFAULT | SCTLR_EOS_DEFAULT | SCTLR_DSSBS_DEFAULT | \ - SCTLR_PAC_KEYS_DEFAULT | SCTLR_OTHER) + SCTLR_PAC_KEYS_DEFAULT | SCTLR_TP2_DEFAULT | SCTLR_OTHER) #define SCTLR_EL1_OPTIONAL \ (SCTLR_EPAN_OPTIONAL) @@ -712,6 +727,7 @@ #define SCTLR_EL1_DEFAULT \ (SCTLR_EL1_REQUIRED | SCTLR_EL1_OPTIONAL) + /* * Coprocessor Access Control Register (CPACR) * @@ -867,6 +883,7 @@ */ #define TCR_T0SZ_SHIFT 0ULL +#define TCR_T0SZ_MASK 0x3FULL #define TCR_TSZ_BITS 6ULL #define TCR_TSZ_MASK ((1ULL << TCR_TSZ_BITS) - 1ULL) @@ -902,6 +919,7 @@ #endif #define TCR_T1SZ_SHIFT 16ULL +#define TCR_T1SZ_MASK 0x3FULL #define TCR_A1_ASID1 (1ULL << 22ULL) #define TCR_EPD1_TTBR1_DISABLED (1ULL << 23ULL) @@ -974,6 +992,7 @@ #define TCR_EL1_EXTRA 0 + /* * Multiprocessor Affinity Register (MPIDR_EL1) * @@ -1030,24 +1049,27 @@ #endif /* __ARM64_PMAP_SUBPAGE_L1__ */ #endif /* __ARM_16K_PG__ */ -#if defined(APPLE_ARM64_ARCH_FAMILY) -/* T0SZ must be the same as T1SZ */ -#define T1SZ_BOOT T0SZ_BOOT -#else /* defined(APPLE_ARM64_ARCH_FAMILY) */ +#if __ARM64_PMAP_SUBPAGE_L1__ && CONFIG_SPTM +#define T0SZ_EARLY_BOOT 17ULL +#endif /*__ARM64_PMAP_SUBPAGE_L1__ && CONFIG_SPTM */ + +#if HAS_ARM_INDEPENDENT_TNSZ #ifdef __ARM_16K_PG__ -#if __ARM64_PMAP_SUBPAGE_L1__ +#if __ARM64_PMAP_KERN_SUBPAGE_L1__ #define T1SZ_BOOT 25ULL -#else /* !__ARM64_PMAP_SUBPAGE_L1__ */ +#else /* !__ARM64_PMAP_KERN_SUBPAGE_L1__ */ #define T1SZ_BOOT 17ULL -#endif /* !__ARM64_PMAP_SUBPAGE_L1__ */ +#endif /* !__ARM64_PMAP_KERN_SUBPAGE_L1__ */ #else /* __ARM_16K_PG__ */ -#if __ARM64_PMAP_SUBPAGE_L1__ +#if __ARM64_PMAP_KERN_SUBPAGE_L1__ #define T1SZ_BOOT 26ULL -#else /* __ARM64_PMAP_SUBPAGE_L1__ */ +#else /* __ARM64_PMAP_KERN_SUBPAGE_L1__ */ #define T1SZ_BOOT 25ULL -#endif /*__ARM64_PMAP_SUBPAGE_L1__*/ +#endif /*__ARM64_PMAP_KERN_SUBPAGE_L1__*/ #endif /* __ARM_16K_PG__ */ -#endif /* defined(APPLE_ARM64_ARCH_FAMILY) */ +#else /* HAS_ARM_INDEPENDENT_TNSZ */ +#define T1SZ_BOOT T0SZ_BOOT +#endif /* HAS_ARM_INDEPENDENT_TNSZ */ #if __ARM_42BIT_PA_SPACE__ #define TCR_IPS_VALUE TCR_IPS_42BITS @@ -1075,20 +1097,282 @@ TCR_TBI0_TOPBYTE_IGNORED | (TCR_TBID0_ENABLE) | TCR_E0PD_VALUE | \ TCR_EL1_DTBI | TCR_EL1_ASID | TCR_EL1_EXTRA) +#if __ARM64_PMAP_SUBPAGE_L1__ && CONFIG_SPTM +#define TCR_EL1_BASE_BOOT \ + (TCR_IPS_VALUE | TCR_SH0_OUTER | TCR_ORGN0_WRITEBACK | \ + TCR_IRGN0_WRITEBACK | (T0SZ_EARLY_BOOT << 
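TTBR_SELECTOR above names VA bit 55, which the MMU uses to choose between TTBR0 and TTBR1 during a translation table walk, and the T0SZ/T1SZ fields in the TCR macros bound how much of each half is translated: a region of 2^(64 - TxSZ) bytes. A small C illustration of both relationships follows, using only the shifts and masks defined in this header; the example TCR value is made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TTBR_SELECTOR   (1ull << 55)    /* VA bit that picks the table base */
#define TCR_T0SZ_SHIFT  0
#define TCR_T1SZ_SHIFT  16
#define TCR_TSZ_MASK    0x3Full

/* true => the walk uses TTBR1 (upper half), false => TTBR0 (lower half). */
static bool
uses_ttbr1(uint64_t va)
{
    return (va & TTBR_SELECTOR) != 0;
}

/* Bytes translated by the selected TTBR: 2^(64 - TxSZ); assumes 0 < TxSZ < 64. */
static uint64_t
region_size(uint64_t tcr, bool ttbr1)
{
    unsigned shift = ttbr1 ? TCR_T1SZ_SHIFT : TCR_T0SZ_SHIFT;
    uint64_t txsz = (tcr >> shift) & TCR_TSZ_MASK;
    return 1ull << (64 - txsz);
}

int
main(void)
{
    uint64_t tcr = (25ull << TCR_T1SZ_SHIFT) | 17ull;   /* T1SZ=25, T0SZ=17 */
    uint64_t va = 0xFFFFFE0011223344ull;
    bool hi = uses_ttbr1(va);
    printf("VA %#llx -> %s, region 0x%llx bytes\n",
        (unsigned long long)va, hi ? "TTBR1" : "TTBR0",
        (unsigned long long)region_size(tcr, hi));
    return 0;
}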
TCR_T0SZ_SHIFT) | \ + TCR_SH1_OUTER | TCR_ORGN1_WRITEBACK | \ + TCR_IRGN1_WRITEBACK | (TCR_TG1_GRANULE_SIZE) | \ + TCR_TBI0_TOPBYTE_IGNORED | (TCR_TBID0_ENABLE) | TCR_E0PD_VALUE | \ + TCR_EL1_DTBI | TCR_EL1_ASID | TCR_EL1_EXTRA) +#endif /* __ARM64_PMAP_SUBPAGE_L1__ && CONFIG_SPTM */ + #if __ARM_KERNEL_PROTECT__ #define TCR_EL1_BOOT (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT) | (TCR_TG0_GRANULE_SIZE)) #define T1SZ_USER (T1SZ_BOOT + 1) #define TCR_EL1_USER (TCR_EL1_BASE | (T1SZ_USER << TCR_T1SZ_SHIFT) | (TCR_TG0_GRANULE_SIZE)) #else +#if CONFIG_SPTM +#if __ARM64_PMAP_SUBPAGE_L1__ +#define TCR_EL1_BOOT (TCR_EL1_BASE_BOOT | (T1SZ_BOOT << TCR_T1SZ_SHIFT) | (TCR_TG0_GRANULE_SIZE)) +#define TCR_EL1_FINAL (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT) | (TCR_TG0_GRANULE_SIZE)) +#else /* !__ARM64_PMAP_SUBPAGE_L1__ */ #define TCR_EL1_BOOT (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT) | (TCR_TG0_GRANULE_SIZE)) +#define TCR_EL1_FINAL TCR_EL1_BOOT +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ +#else /* !CONFIG_SPTM */ +#define TCR_EL1_BOOT (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT) | (TCR_TG0_GRANULE_SIZE)) +#endif /* CONFIG_SPTM */ #endif /* __ARM_KERNEL_PROTECT__ */ #define TCR_EL1_4KB (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT) | (TCR_TG0_GRANULE_4KB)) #define TCR_EL1_16KB (TCR_EL1_BASE | (T1SZ_BOOT << TCR_T1SZ_SHIFT) | (TCR_TG0_GRANULE_16KB)) +/* + * Bit 55 of the VA is used to select which TTBR to use during a translation table walk. + */ +#define TTBR_SELECTOR (1ULL << 55) +/* + * Hypervisor Fine-Grained Read Trap Register (HFGRTR) + */ + +#define HFGRTR_AMAIR2_SHIFT 63 +#define HFGRTR_AMAIR2 (1ULL << HFGRTR_AMAIR2_SHIFT) +#define HFGRTR_MAIR2_SHIFT 62 +#define HFGRTR_MAIR2 (1ULL << HFGRTR_MAIR2_SHIFT) +#define HFGRTR_S2POR_SHIFT 61 +#define HFGRTR_S2POR (1ULL << HFGRTR_S2POR_SHIFT) +#define HFGRTR_POR_EL1_SHIFT 60 +#define HFGRTR_POR_EL1 (1ULL << HFGRTR_POR_EL1_SHIFT) +#define HFGRTR_POR_EL0_SHIFT 59 +#define HFGRTR_POR_EL0 (1ULL << HFGRTR_POR_EL0_SHIFT) +#define HFGRTR_PIR_SHIFT 58 +#define HFGRTR_PIR (1ULL << HFGRTR_PIR_SHIFT) +#define HFGRTR_PIRE0_SHIFT 57 +#define HFGRTR_PIRE0 (1ULL << HFGRTR_PIRE0_SHIFT) +#define HFGRTR_RCWMASK_SHIFT 56 +#define HFGRTR_RCWMASK (1ULL << HFGRTR_RCWMASK_SHIFT) +#define HFGRTR_TPIDR2_SHIFT 55 +#define HFGRTR_TPIDR2 (1ULL << HFGRTR_TPIDR2_SHIFT) +#define HFGRTR_SMPRI_SHIFT 54 +#define HFGRTR_SMPRI (1ULL << HFGRTR_SMPRI_SHIFT) +#define HFGRTR_GCS_EL1_SHIFT 53 +#define HFGRTR_GCS_EL1 (1ULL << HFGRTR_GCS_EL1_SHIFT) +#define HFGRTR_GCS_EL0_SHIFT 52 +#define HFGRTR_GCS_EL0 (1ULL << HFGRTR_GCS_EL0_SHIFT) +#define HFGRTR_ACCDATA_SHIFT 50 +#define HFGRTR_ACCDATA (1ULL << HFGRTR_ACCDATA_SHIFT) +#define HFGRTR_ERXADDR_SHIFT 49 +#define HFGRTR_ERXADDR (1ULL << HFGRTR_ERXADDR_SHIFT) +#define HFGRTR_ERXPFGCDN_SHIFT 48 +#define HFGRTR_ERXPFGCDN (1ULL << HFGRTR_ERXPFGCDN_SHIFT) +#define HFGRTR_ERXPFGCTL_SHIFT 47 +#define HFGRTR_ERXPFGCTL (1ULL << HFGRTR_ERXPFGCTL_SHIFT) +#define HFGRTR_ERXPFGF_SHIFT 46 +#define HFGRTR_ERXPFGF (1ULL << HFGRTR_ERXPFGF_SHIFT) +#define HFGRTR_ERXMISC_SHIFT 45 +#define HFGRTR_ERXMISC (1ULL << HFGRTR_ERXMISC_SHIFT) +#define HFGRTR_ERXSTATUS_SHIFT 44 +#define HFGRTR_ERXSTATUS (1ULL << HFGRTR_ERXSTATUS_SHIFT) +#define HFGRTR_ERXCTLR_SHIFT 43 +#define HFGRTR_ERXCTLR (1ULL << HFGRTR_ERXCTLR_SHIFT) +#define HFGRTR_ERXFR_SHIFT 42 +#define HFGRTR_ERXFR (1ULL << HFGRTR_ERXFR_SHIFT) +#define HFGRTR_ERRSELR_SHIFT 41 +#define HFGRTR_ERRSELR (1ULL << HFGRTR_ERRSELR_SHIFT) +#define HFGRTR_ERRIDR_SHIFT 40 +#define HFGRTR_ERRIDR (1ULL << HFGRTR_ERRIDR_SHIFT) 
+#define HFGRTR_ICC_IGRPEN_SHIFT 39 +#define HFGRTR_ICC_IGRPEN (1ULL << HFGRTR_ICC_IGRPEN_SHIFT) +#define HFGRTR_VBAR_SHIFT 38 +#define HFGRTR_VBAR (1ULL << HFGRTR_VBAR_SHIFT) +#define HFGRTR_TTBR1_SHIFT 37 +#define HFGRTR_TTBR1 (1ULL << HFGRTR_TTBR1_SHIFT) +#define HFGRTR_TTBR0_SHIFT 36 +#define HFGRTR_TTBR0 (1ULL << HFGRTR_TTBR0_SHIFT) +#define HFGRTR_TPIDR_EL0_SHIFT 35 +#define HFGRTR_TPIDR_EL0 (1ULL << HFGRTR_TPIDR_EL0_SHIFT) +#define HFGRTR_TPIDRRO_SHIFT 34 +#define HFGRTR_TPIDRRO (1ULL << HFGRTR_TPIDRRO_SHIFT) +#define HFGRTR_TPIDR_EL1_SHIFT 33 +#define HFGRTR_TPIDR_EL1 (1ULL << HFGRTR_TPIDR_EL1_SHIFT) +#define HFGRTR_TCR_SHIFT 32 +#define HFGRTR_TCR (1ULL << HFGRTR_TCR_SHIFT) +#define HFGRTR_SCXTNUM_EL0_SHIFT 31 +#define HFGRTR_SCXTNUM_EL0 (1ULL << HFGRTR_SCXTNUM_EL0_SHIFT) +#define HFGRTR_SCXTNUM_EL1_SHIFT 30 +#define HFGRTR_SCXTNUM_EL1 (1ULL << HFGRTR_SCXTNUM_EL1_SHIFT) +#define HFGRTR_SCTLR_SHIFT 29 +#define HFGRTR_SCTLR (1ULL << HFGRTR_SCTLR_SHIFT) +#define HFGRTR_REVIDR_SHIFT 28 +#define HFGRTR_REVIDR (1ULL << HFGRTR_REVIDR_SHIFT) +#define HFGRTR_PAR_SHIFT 27 +#define HFGRTR_PAR (1ULL << HFGRTR_PAR_SHIFT) +#define HFGRTR_MPIDR_SHIFT 26 +#define HFGRTR_MPIDR (1ULL << HFGRTR_MPIDR_SHIFT) +#define HFGRTR_MIDR_SHIFT 25 +#define HFGRTR_MIDR (1ULL << HFGRTR_MIDR_SHIFT) +#define HFGRTR_MAIR_SHIFT 24 +#define HFGRTR_MAIR (1ULL << HFGRTR_MAIR_SHIFT) +#define HFGRTR_LORSA_SHIFT 23 +#define HFGRTR_LORSA (1ULL << HFGRTR_LORSA_SHIFT) +#define HFGRTR_LORN_SHIFT 22 +#define HFGRTR_LORN (1ULL << HFGRTR_LORN_SHIFT) +#define HFGRTR_LORID_SHIFT 21 +#define HFGRTR_LORID (1ULL << HFGRTR_LORID_SHIFT) +#define HFGRTR_LOREA_SHIFT 20 +#define HFGRTR_LOREA (1ULL << HFGRTR_LOREA_SHIFT) +#define HFGRTR_LORC_SHIFT 19 +#define HFGRTR_LORC (1ULL << HFGRTR_LORC_SHIFT) +#define HFGRTR_ISR_SHIFT 18 +#define HFGRTR_ISR (1ULL << HFGRTR_ISR_SHIFT) +#define HFGRTR_FAR_SHIFT 17 +#define HFGRTR_FAR (1ULL << HFGRTR_FAR_SHIFT) +#define HFGRTR_ESR_SHIFT 16 +#define HFGRTR_ESR (1ULL << HFGRTR_ESR_SHIFT) +#define HFGRTR_DCZID_SHIFT 15 +#define HFGRTR_DCZID (1ULL << HFGRTR_DCZID_SHIFT) +#define HFGRTR_CTR_SHIFT 14 +#define HFGRTR_CTR (1ULL << HFGRTR_CTR_SHIFT) +#define HFGRTR_CSSELR_SHIFT 13 +#define HFGRTR_CSSELR (1ULL << HFGRTR_CSSELR_SHIFT) +#define HFGRTR_CPACR_SHIFT 12 +#define HFGRTR_CPACR (1ULL << HFGRTR_CPACR_SHIFT) +#define HFGRTR_CONTEXTIDR_SHIFT 11 +#define HFGRTR_CONTEXTIDR (1ULL << HFGRTR_CONTEXTIDR_SHIFT) +#define HFGRTR_CLIDR_SHIFT 10 +#define HFGRTR_CLIDR (1ULL << HFGRTR_CLIDR_SHIFT) +#define HFGRTR_CCSIDR_SHIFT 9 +#define HFGRTR_CCSIDR (1ULL << HFGRTR_CCSIDR_SHIFT) +#define HFGRTR_APIBKEY_SHIFT 8 +#define HFGRTR_APIBKEY (1ULL << HFGRTR_APIBKEY_SHIFT) +#define HFGRTR_APIAKEY_SHIFT 7 +#define HFGRTR_APIAKEY (1ULL << HFGRTR_APIAKEY_SHIFT) +#define HFGRTR_APGAKEY_SHIFT 6 +#define HFGRTR_APGAKEY (1ULL << HFGRTR_APGAKEY_SHIFT) +#define HFGRTR_APDBKEY_SHIFT 5 +#define HFGRTR_APDBKEY (1ULL << HFGRTR_APDBKEY_SHIFT) +#define HFGRTR_APDAKEY_SHIFT 4 +#define HFGRTR_APDAKEY (1ULL << HFGRTR_APDAKEY_SHIFT) +#define HFGRTR_AMAIR_SHIFT 3 +#define HFGRTR_AMAIR (1ULL << HFGRTR_AMAIR_SHIFT) +#define HFGRTR_AIDR_SHIFT 2 +#define HFGRTR_AIDR (1ULL << HFGRTR_AIDR_SHIFT) +#define HFGRTR_AFSR1_SHIFT 1 +#define HFGRTR_AFSR1 (1ULL << HFGRTR_AFSR1_SHIFT) +#define HFGRTR_AFSR0_SHIFT 0 +#define HFGRTR_AFSR0 (1ULL << HFGRTR_AFSR0_SHIFT) + +/* + * Hypervisor Fine-Grained Write Trap Register (HFGWTR) + */ + +#define HFGWTR_AMAIR2_SHIFT 63 +#define HFGWTR_AMAIR2 (1ULL << HFGWTR_AMAIR2_SHIFT) +#define HFGWTR_MAIR2_SHIFT 62 +#define 
HFGWTR_MAIR2 (1ULL << HFGWTR_MAIR2_SHIFT) +#define HFGWTR_S2POR_SHIFT 61 +#define HFGWTR_S2POR (1ULL << HFGWTR_S2POR_SHIFT) +#define HFGWTR_POR_EL1_SHIFT 60 +#define HFGWTR_POR_EL1 (1ULL << HFGWTR_POR_EL1_SHIFT) +#define HFGWTR_POR_EL0_SHIFT 59 +#define HFGWTR_POR_EL0 (1ULL << HFGWTR_POR_EL0_SHIFT) +#define HFGWTR_PIR_SHIFT 58 +#define HFGWTR_PIR (1ULL << HFGWTR_PIR_SHIFT) +#define HFGWTR_PIRE0_SHIFT 57 +#define HFGWTR_PIRE0 (1ULL << HFGWTR_PIRE0_SHIFT) +#define HFGWTR_RCWMASK_SHIFT 56 +#define HFGWTR_RCWMASK (1ULL << HFGWTR_RCWMASK_SHIFT) +#define HFGWTR_TPIDR2_SHIFT 55 +#define HFGWTR_TPIDR2 (1ULL << HFGWTR_TPIDR2_SHIFT) +#define HFGWTR_SMPRI_SHIFT 54 +#define HFGWTR_SMPRI (1ULL << HFGWTR_SMPRI_SHIFT) +#define HFGWTR_GCS_EL1_SHIFT 53 +#define HFGWTR_GCS_EL1 (1ULL << HFGWTR_GCS_EL1_SHIFT) +#define HFGWTR_GCS_EL0_SHIFT 52 +#define HFGWTR_GCS_EL0 (1ULL << HFGWTR_GCS_EL0_SHIFT) +#define HFGWTR_ACCDATA_SHIFT 50 +#define HFGWTR_ACCDATA (1ULL << HFGWTR_ACCDATA_SHIFT) +#define HFGWTR_ERXADDR_SHIFT 49 +#define HFGWTR_ERXADDR (1ULL << HFGWTR_ERXADDR_SHIFT) +#define HFGWTR_ERXPFGCDN_SHIFT 48 +#define HFGWTR_ERXPFGCDN (1ULL << HFGWTR_ERXPFGCDN_SHIFT) +#define HFGWTR_ERXPFGCTL_SHIFT 47 +#define HFGWTR_ERXPFGCTL (1ULL << HFGWTR_ERXPFGCTL_SHIFT) +#define HFGWTR_ERXMISC_SHIFT 45 +#define HFGWTR_ERXMISC (1ULL << HFGWTR_ERXMISC_SHIFT) +#define HFGWTR_ERXSTATUS_SHIFT 44 +#define HFGWTR_ERXSTATUS (1ULL << HFGWTR_ERXSTATUS_SHIFT) +#define HFGWTR_ERXCTLR_SHIFT 43 +#define HFGWTR_ERXCTLR (1ULL << HFGWTR_ERXCTLR_SHIFT) +#define HFGWTR_ERRSELR_SHIFT 41 +#define HFGWTR_ERRSELR (1ULL << HFGWTR_ERRSELR_SHIFT) +#define HFGWTR_ICC_IGRPEN_SHIFT 39 +#define HFGWTR_ICC_IGRPEN (1ULL << HFGWTR_ICC_IGRPEN_SHIFT) +#define HFGWTR_VBAR_SHIFT 38 +#define HFGWTR_VBAR (1ULL << HFGWTR_VBAR_SHIFT) +#define HFGWTR_TTBR1_SHIFT 37 +#define HFGWTR_TTBR1 (1ULL << HFGWTR_TTBR1_SHIFT) +#define HFGWTR_TTBR0_SHIFT 36 +#define HFGWTR_TTBR0 (1ULL << HFGWTR_TTBR0_SHIFT) +#define HFGWTR_TPIDR_EL0_SHIFT 35 +#define HFGWTR_TPIDR_EL0 (1ULL << HFGWTR_TPIDR_EL0_SHIFT) +#define HFGWTR_TPIDRRO_SHIFT 34 +#define HFGWTR_TPIDRRO (1ULL << HFGWTR_TPIDRRO_SHIFT) +#define HFGWTR_TPIDR_EL1_SHIFT 33 +#define HFGWTR_TPIDR_EL1 (1ULL << HFGWTR_TPIDR_EL1_SHIFT) +#define HFGWTR_TCR_SHIFT 32 +#define HFGWTR_TCR (1ULL << HFGWTR_TCR_SHIFT) +#define HFGWTR_SCXTNUM_EL0_SHIFT 31 +#define HFGWTR_SCXTNUM_EL0 (1ULL << HFGWTR_SCXTNUM_EL0_SHIFT) +#define HFGWTR_SCXTNUM_EL1_SHIFT 30 +#define HFGWTR_SCXTNUM_EL1 (1ULL << HFGWTR_SCXTNUM_EL1_SHIFT) +#define HFGWTR_SCXTNUM_SHIFT 30 +#define HFGWTR_SCXTNUM (1ULL << HFGWTR_SCXTNUM_SHIFT) +#define HFGWTR_SCTLR_SHIFT 29 +#define HFGWTR_SCTLR (1ULL << HFGWTR_SCTLR_SHIFT) +#define HFGWTR_PAR_SHIFT 27 +#define HFGWTR_PAR (1ULL << HFGWTR_PAR_SHIFT) +#define HFGWTR_MAIR_SHIFT 24 +#define HFGWTR_MAIR (1ULL << HFGWTR_MAIR_SHIFT) +#define HFGWTR_LORSA_SHIFT 23 +#define HFGWTR_LORSA (1ULL << HFGWTR_LORSA_SHIFT) +#define HFGWTR_LORN_SHIFT 22 +#define HFGWTR_LORN (1ULL << HFGWTR_LORN_SHIFT) +#define HFGWTR_LOREA_SHIFT 20 +#define HFGWTR_LOREA (1ULL << HFGWTR_LOREA_SHIFT) +#define HFGWTR_LORC_SHIFT 19 +#define HFGWTR_LORC (1ULL << HFGWTR_LORC_SHIFT) +#define HFGWTR_FAR_SHIFT 17 +#define HFGWTR_FAR (1ULL << HFGWTR_FAR_SHIFT) +#define HFGWTR_ESR_SHIFT 16 +#define HFGWTR_ESR (1ULL << HFGWTR_ESR_SHIFT) +#define HFGWTR_CSSELR_SHIFT 13 +#define HFGWTR_CSSELR (1ULL << HFGWTR_CSSELR_SHIFT) +#define HFGWTR_CPACR_SHIFT 12 +#define HFGWTR_CPACR (1ULL << HFGWTR_CPACR_SHIFT) +#define HFGWTR_CONTEXTIDR_SHIFT 11 +#define HFGWTR_CONTEXTIDR (1ULL << 
HFGWTR_CONTEXTIDR_SHIFT) +#define HFGWTR_APIBKEY_SHIFT 8 +#define HFGWTR_APIBKEY (1ULL << HFGWTR_APIBKEY_SHIFT) +#define HFGWTR_APIAKEY_SHIFT 7 +#define HFGWTR_APIAKEY (1ULL << HFGWTR_APIAKEY_SHIFT) +#define HFGWTR_APGAKEY_SHIFT 6 +#define HFGWTR_APGAKEY (1ULL << HFGWTR_APGAKEY_SHIFT) +#define HFGWTR_APDBKEY_SHIFT 5 +#define HFGWTR_APDBKEY (1ULL << HFGWTR_APDBKEY_SHIFT) +#define HFGWTR_APDAKEY_SHIFT 4 +#define HFGWTR_APDAKEY (1ULL << HFGWTR_APDAKEY_SHIFT) +#define HFGWTR_AMAIR_SHIFT 3 +#define HFGWTR_AMAIR (1ULL << HFGWTR_AMAIR_SHIFT) +#define HFGWTR_AFSR1_SHIFT 1 +#define HFGWTR_AFSR1 (1ULL << HFGWTR_AFSR1_SHIFT) +#define HFGWTR_AFSR0_SHIFT 0 +#define HFGWTR_AFSR0 (1ULL << HFGWTR_AFSR0_SHIFT) + /* * Monitor Debug System Control Register (MDSCR) */ @@ -1122,19 +1406,242 @@ #define MDSCR_SS_SHIFT 0 #define MDSCR_SS (1ULL << MDSCR_SS_SHIFT) +/* + * Hypervisor Debug Fine-Grained Read Trap Register (HDFGRTR_EL2) + */ +#define HDFGRTR_PMBIDR_SHIFT 63 +#define HDFGRTR_PMBIDR (1ULL << HDFGRTR_PMBIDR_SHIFT) +#define HDFGRTR_PMSNEVFR_SHIFT 62 +#define HDFGRTR_PMSNEVFR (1ULL << HDFGRTR_PMSNEVFR_SHIFT) +#define HDFGRTR_BRBDATA_SHIFT 61 +#define HDFGRTR_BRBDATA (1ULL << HDFGRTR_BRBDATA_SHIFT) +#define HDFGRTR_BRBCTL_SHIFT 60 +#define HDFGRTR_BRBCTL (1ULL << HDFGRTR_BRBCTL_SHIFT) +#define HDFGRTR_BRBIDR_SHIFT 59 +#define HDFGRTR_BRBIDR (1ULL << HDFGRTR_BRBIDR_SHIFT) +#define HDFGRTR_PMCEID_SHIFT 58 +#define HDFGRTR_PMCEID (1ULL << HDFGRTR_PMCEID_SHIFT) +#define HDFGRTR_PMUSERENR_SHIFT 57 +#define HDFGRTR_PMUSERENR (1ULL << HDFGRTR_PMUSERENR_SHIFT) +#define HDFGRTR_TRBTRG_SHIFT 56 +#define HDFGRTR_TRBTRG (1ULL << HDFGRTR_TRBTRG_SHIFT) +#define HDFGRTR_TRBSR_SHIFT 55 +#define HDFGRTR_TRBSR (1ULL << HDFGRTR_TRBSR_SHIFT) +#define HDFGRTR_TRBPTR_SHIFT 54 +#define HDFGRTR_TRBPTR (1ULL << HDFGRTR_TRBPTR_SHIFT) +#define HDFGRTR_TRBMAR_SHIFT 53 +#define HDFGRTR_TRBMAR (1ULL << HDFGRTR_TRBMAR_SHIFT) +#define HDFGRTR_TRBLIMITR_SHIFT 52 +#define HDFGRTR_TRBLIMITR (1ULL << HDFGRTR_TRBLIMITR_SHIFT) +#define HDFGRTR_TRBIDR_SHIFT 51 +#define HDFGRTR_TRBIDR (1ULL << HDFGRTR_TRBIDR_SHIFT) +#define HDFGRTR_TRBBASER_SHIFT 50 +#define HDFGRTR_TRBBASER (1ULL << HDFGRTR_TRBBASER_SHIFT) +#define HDFGRTR_TRCVICTLR_SHIFT 48 +#define HDFGRTR_TRCVICTLR (1ULL << HDFGRTR_TRCVICTLR_SHIFT) +#define HDFGRTR_TRCSTATR_SHIFT 47 +#define HDFGRTR_TRCSTATR (1ULL << HDFGRTR_TRCSTATR_SHIFT) +#define HDFGRTR_TRCSSCSR_SHIFT 46 +#define HDFGRTR_TRCSSCSR (1ULL << HDFGRTR_TRCSSCSR_SHIFT) +#define HDFGRTR_TRCSEQSTR_SHIFT 45 +#define HDFGRTR_TRCSEQSTR (1ULL << HDFGRTR_TRCSEQSTR_SHIFT) +#define HDFGRTR_TRCPRGCTLR_SHIFT 44 +#define HDFGRTR_TRCPRGCTLR (1ULL << HDFGRTR_TRCPRGCTLR_SHIFT) +#define HDFGRTR_TRCOSLSR_SHIFT 43 +#define HDFGRTR_TRCOSLSR (1ULL << HDFGRTR_TRCOSLSR_SHIFT) +#define HDFGRTR_TRCIMSPEC_SHIFT 41 +#define HDFGRTR_TRCIMSPEC (1ULL << HDFGRTR_TRCIMSPEC_SHIFT) +#define HDFGRTR_TRCID_SHIFT 40 +#define HDFGRTR_TRCID (1ULL << HDFGRTR_TRCID_SHIFT) +#define HDFGRTR_TRCCNTVR_SHIFT 37 +#define HDFGRTR_TRCCNTVR (1ULL << HDFGRTR_TRCCNTVR_SHIFT) +#define HDFGRTR_TRCCLAIM_SHIFT 36 +#define HDFGRTR_TRCCLAIM (1ULL << HDFGRTR_TRCCLAIM_SHIFT) +#define HDFGRTR_TRCAUXCTLR_SHIFT 35 +#define HDFGRTR_TRCAUXCTLR (1ULL << HDFGRTR_TRCAUXCTLR_SHIFT) +#define HDFGRTR_TRCAUTHSTATUS_SHIFT 34 +#define HDFGRTR_TRCAUTHSTATUS (1ULL << HDFGRTR_TRCAUTHSTATUS_SHIFT) +#define HDFGRTR_TRC_SHIFT 33 +#define HDFGRTR_TRC (1ULL << HDFGRTR_TRC_SHIFT) +#define HDFGRTR_PMSLATFR_SHIFT 32 +#define HDFGRTR_PMSLATFR (1ULL << HDFGRTR_PMSLATFR_SHIFT) +#define HDFGRTR_PMSIRR_SHIFT 31 
+#define HDFGRTR_PMSIRR (1ULL << HDFGRTR_PMSIRR_SHIFT) +#define HDFGRTR_PMSIDR_SHIFT 30 +#define HDFGRTR_PMSIDR (1ULL << HDFGRTR_PMSIDR_SHIFT) +#define HDFGRTR_PMSICR_SHIFT 29 +#define HDFGRTR_PMSICR (1ULL << HDFGRTR_PMSICR_SHIFT) +#define HDFGRTR_PMSFCR_SHIFT 28 +#define HDFGRTR_PMSFCR (1ULL << HDFGRTR_PMSFCR_SHIFT) +#define HDFGRTR_PMSEVFR_SHIFT 27 +#define HDFGRTR_PMSEVFR (1ULL << HDFGRTR_PMSEVFR_SHIFT) +#define HDFGRTR_PMSCR_SHIFT 26 +#define HDFGRTR_PMSCR (1ULL << HDFGRTR_PMSCR_SHIFT) +#define HDFGRTR_PMBSR_SHIFT 25 +#define HDFGRTR_PMBSR (1ULL << HDFGRTR_PMBSR_SHIFT) +#define HDFGRTR_PMBPTR_SHIFT 24 +#define HDFGRTR_PMBPTR (1ULL << HDFGRTR_PMBPTR_SHIFT) +#define HDFGRTR_PMBLIMITR_SHIFT 23 +#define HDFGRTR_PMBLIMITR (1ULL << HDFGRTR_PMBLIMITR_SHIFT) +#define HDFGRTR_PMMIR_SHIFT 22 +#define HDFGRTR_PMMIR (1ULL << HDFGRTR_PMMIR_SHIFT) +#define HDFGRTR_PMSELR_SHIFT 19 +#define HDFGRTR_PMSELR (1ULL << HDFGRTR_PMSELR_SHIFT) +#define HDFGRTR_PMOVS_SHIFT 18 +#define HDFGRTR_PMOVS (1ULL << HDFGRTR_PMOVS_SHIFT) +#define HDFGRTR_PMINTEN_SHIFT 17 +#define HDFGRTR_PMINTEN (1ULL << HDFGRTR_PMINTEN_SHIFT) +#define HDFGRTR_PMCNTEN_SHIFT 16 +#define HDFGRTR_PMCNTEN (1ULL << HDFGRTR_PMCNTEN_SHIFT) +#define HDFGRTR_PMCCNTR_SHIFT 15 +#define HDFGRTR_PMCCNTR (1ULL << HDFGRTR_PMCCNTR_SHIFT) +#define HDFGRTR_PMCCFILTR_SHIFT 14 +#define HDFGRTR_PMCCFILTR (1ULL << HDFGRTR_PMCCFILTR_SHIFT) +#define HDFGRTR_PMEVTYPER_SHIFT 13 +#define HDFGRTR_PMEVTYPER (1ULL << HDFGRTR_PMEVTYPER_SHIFT) +#define HDFGRTR_PMEVCNTR_SHIFT 12 +#define HDFGRTR_PMEVCNTR (1ULL << HDFGRTR_PMEVCNTR_SHIFT) +#define HDFGRTR_OSDLR_SHIFT 11 +#define HDFGRTR_OSDLR (1ULL << HDFGRTR_OSDLR_SHIFT) +#define HDFGRTR_OSECCR_SHIFT 10 +#define HDFGRTR_OSECCR (1ULL << HDFGRTR_OSECCR_SHIFT) +#define HDFGRTR_OSLSR_SHIFT 9 +#define HDFGRTR_OSLSR (1ULL << HDFGRTR_OSLSR_SHIFT) +#define HDFGRTR_DBGPRCR_SHIFT 7 +#define HDFGRTR_DBGPRCR (1ULL << HDFGRTR_DBGPRCR_SHIFT) +#define HDFGRTR_DBGAUTHSTATUS_SHIFT 6 +#define HDFGRTR_DBGAUTHSTATUS (1ULL << HDFGRTR_DBGAUTHSTATUS_SHIFT) +#define HDFGRTR_DBGCLAIM_SHIFT 5 +#define HDFGRTR_DBGCLAIM (1ULL << HDFGRTR_DBGCLAIM_SHIFT) +#define HDFGRTR_MDSCR_SHIFT 4 +#define HDFGRTR_MDSCR (1ULL << HDFGRTR_MDSCR_SHIFT) +#define HDFGRTR_DBGWVR_SHIFT 3 +#define HDFGRTR_DBGWVR (1ULL << HDFGRTR_DBGWVR_SHIFT) +#define HDFGRTR_DBGWCR_SHIFT 2 +#define HDFGRTR_DBGWCR (1ULL << HDFGRTR_DBGWCR_SHIFT) +#define HDFGRTR_DBGBVR_SHIFT 1 +#define HDFGRTR_DBGBVR (1ULL << HDFGRTR_DBGBVR_SHIFT) +#define HDFGRTR_DBGBCR_SHIFT 0 +#define HDFGRTR_DBGBCR (1ULL << HDFGRTR_DBGBCR_SHIFT) + +/* + * Hypervisor Debug Fine-Grained Write Trap Register (HDFGWTR_EL2) + */ +#define HDFGWTR_PMSNEVFR_SHIFT 62 +#define HDFGWTR_PMSNEVFR (1ULL << HDFGWTR_PMSNEVFR_SHIFT) +#define HDFGWTR_BRBDATA_SHIFT 61 +#define HDFGWTR_BRBDATA (1ULL << HDFGWTR_BRBDATA_SHIFT) +#define HDFGWTR_BRBCTL_SHIFT 60 +#define HDFGWTR_BRBCTL (1ULL << HDFGWTR_BRBCTL_SHIFT) +#define HDFGWTR_PMUSERENR_SHIFT 57 +#define HDFGWTR_PMUSERENR (1ULL << HDFGWTR_PMUSERENR_SHIFT) +#define HDFGWTR_TRBTRG_SHIFT 56 +#define HDFGWTR_TRBTRG (1ULL << HDFGWTR_TRBTRG_SHIFT) +#define HDFGWTR_TRBSR_SHIFT 55 +#define HDFGWTR_TRBSR (1ULL << HDFGWTR_TRBSR_SHIFT) +#define HDFGWTR_TRBPTR_SHIFT 54 +#define HDFGWTR_TRBPTR (1ULL << HDFGWTR_TRBPTR_SHIFT) +#define HDFGWTR_TRBMAR_SHIFT 53 +#define HDFGWTR_TRBMAR (1ULL << HDFGWTR_TRBMAR_SHIFT) +#define HDFGWTR_TRBLIMITR_SHIFT 52 +#define HDFGWTR_TRBLIMITR (1ULL << HDFGWTR_TRBLIMITR_SHIFT) +#define HDFGWTR_TRBBASER_SHIFT 50 +#define HDFGWTR_TRBBASER (1ULL << HDFGWTR_TRBBASER_SHIFT) 
+#define HDFGWTR_TRFCR_SHIFT 49 +#define HDFGWTR_TRFCR (1ULL << HDFGWTR_TRFCR_SHIFT) +#define HDFGWTR_TRCVICTLR_SHIFT 48 +#define HDFGWTR_TRCVICTLR (1ULL << HDFGWTR_TRCVICTLR_SHIFT) +#define HDFGWTR_TRCSSCSR_SHIFT 46 +#define HDFGWTR_TRCSSCSR (1ULL << HDFGWTR_TRCSSCSR_SHIFT) +#define HDFGWTR_TRCSEQSTR_SHIFT 45 +#define HDFGWTR_TRCSEQSTR (1ULL << HDFGWTR_TRCSEQSTR_SHIFT) +#define HDFGWTR_TRCPRGCTLR_SHIFT 44 +#define HDFGWTR_TRCPRGCTLR (1ULL << HDFGWTR_TRCPRGCTLR_SHIFT) +#define HDFGWTR_TRCOSLAR_SHIFT 42 +#define HDFGWTR_TRCOSLAR (1ULL << HDFGWTR_TRCOSLAR_SHIFT) +#define HDFGWTR_TRCIMSPEC_SHIFT 41 +#define HDFGWTR_TRCIMSPEC (1ULL << HDFGWTR_TRCIMSPEC_SHIFT) +#define HDFGWTR_TRCCNTVR_SHIFT 37 +#define HDFGWTR_TRCCNTVR (1ULL << HDFGWTR_TRCCNTVR_SHIFT) +#define HDFGWTR_TRCCLAIM_SHIFT 36 +#define HDFGWTR_TRCCLAIM (1ULL << HDFGWTR_TRCCLAIM_SHIFT) +#define HDFGWTR_TRCAUXCTLR_SHIFT 35 +#define HDFGWTR_TRCAUXCTLR (1ULL << HDFGWTR_TRCAUXCTLR_SHIFT) +#define HDFGWTR_TRC_SHIFT 33 +#define HDFGWTR_TRC (1ULL << HDFGWTR_TRC_SHIFT) +#define HDFGWTR_PMSLATFR_SHIFT 32 +#define HDFGWTR_PMSLATFR (1ULL << HDFGWTR_PMSLATFR_SHIFT) +#define HDFGWTR_PMSIRR_SHIFT 31 +#define HDFGWTR_PMSIRR (1ULL << HDFGWTR_PMSIRR_SHIFT) +#define HDFGWTR_PMSICR_SHIFT 29 +#define HDFGWTR_PMSICR (1ULL << HDFGWTR_PMSICR_SHIFT) +#define HDFGWTR_PMSFCR_SHIFT 28 +#define HDFGWTR_PMSFCR (1ULL << HDFGWTR_PMSFCR_SHIFT) +#define HDFGWTR_PMSEVFR_SHIFT 27 +#define HDFGWTR_PMSEVFR (1ULL << HDFGWTR_PMSEVFR_SHIFT) +#define HDFGWTR_PMSCR_SHIFT 26 +#define HDFGWTR_PMSCR (1ULL << HDFGWTR_PMSCR_SHIFT) +#define HDFGWTR_PMBSR_SHIFT 25 +#define HDFGWTR_PMBSR (1ULL << HDFGWTR_PMBSR_SHIFT) +#define HDFGWTR_PMBPTR_SHIFT 24 +#define HDFGWTR_PMBPTR (1ULL << HDFGWTR_PMBPTR_SHIFT) +#define HDFGWTR_PMBLIMITR_SHIFT 23 +#define HDFGWTR_PMBLIMITR (1ULL << HDFGWTR_PMBLIMITR_SHIFT) +#define HDFGWTR_PMCR_SHIFT 21 +#define HDFGWTR_PMCR (1ULL << HDFGWTR_PMCR_SHIFT) +#define HDFGWTR_PMSWINC_SHIFT 20 +#define HDFGWTR_PMSWINC (1ULL << HDFGWTR_PMSWINC_SHIFT) +#define HDFGWTR_PMSELR_SHIFT 19 +#define HDFGWTR_PMSELR (1ULL << HDFGWTR_PMSELR_SHIFT) +#define HDFGWTR_PMOVS_SHIFT 18 +#define HDFGWTR_PMOVS (1ULL << HDFGWTR_PMOVS_SHIFT) +#define HDFGWTR_PMINTEN_SHIFT 17 +#define HDFGWTR_PMINTEN (1ULL << HDFGWTR_PMINTEN_SHIFT) +#define HDFGWTR_PMCNTEN_SHIFT 16 +#define HDFGWTR_PMCNTEN (1ULL << HDFGWTR_PMCNTEN_SHIFT) +#define HDFGWTR_PMCCNTR_SHIFT 15 +#define HDFGWTR_PMCCNTR (1ULL << HDFGWTR_PMCCNTR_SHIFT) +#define HDFGWTR_PMCCFILTR_SHIFT 14 +#define HDFGWTR_PMCCFILTR (1ULL << HDFGWTR_PMCCFILTR_SHIFT) +#define HDFGWTR_PMEVTYPER_SHIFT 13 +#define HDFGWTR_PMEVTYPER (1ULL << HDFGWTR_PMEVTYPER_SHIFT) +#define HDFGWTR_PMEVCNTR_SHIFT 12 +#define HDFGWTR_PMEVCNTR (1ULL << HDFGWTR_PMEVCNTR_SHIFT) +#define HDFGWTR_OSDLR_SHIFT 11 +#define HDFGWTR_OSDLR (1ULL << HDFGWTR_OSDLR_SHIFT) +#define HDFGWTR_OSECCR_SHIFT 10 +#define HDFGWTR_OSECCR (1ULL << HDFGWTR_OSECCR_SHIFT) +#define HDFGWTR_OSLAR_SHIFT 8 +#define HDFGWTR_OSLAR (1ULL << HDFGWTR_OSLAR_SHIFT) +#define HDFGWTR_DBGPRCR_SHIFT 7 +#define HDFGWTR_DBGPRCR (1ULL << HDFGWTR_DBGPRCR_SHIFT) +#define HDFGWTR_DBGCLAIM_SHIFT 5 +#define HDFGWTR_DBGCLAIM (1ULL << HDFGWTR_DBGCLAIM_SHIFT) +#define HDFGWTR_MDSCR_SHIFT 4 +#define HDFGWTR_MDSCR (1ULL << HDFGWTR_MDSCR_SHIFT) +#define HDFGWTR_DBGWVR_SHIFT 3 +#define HDFGWTR_DBGWVR (1ULL << HDFGWTR_DBGWVR_SHIFT) +#define HDFGWTR_DBGWCR_SHIFT 2 +#define HDFGWTR_DBGWCR (1ULL << HDFGWTR_DBGWCR_SHIFT) +#define HDFGWTR_DBGBVR_SHIFT 1 +#define HDFGWTR_DBGBVR (1ULL << HDFGWTR_DBGBVR_SHIFT) +#define 
HDFGWTR_DBGBCR_SHIFT 0 +#define HDFGWTR_DBGBCR (1ULL << HDFGWTR_DBGBCR_SHIFT) + /* * Translation Table Base Register (TTBR) * - * 63 48 47 x x-1 0 - * +--------+------------------+------+ - * | ASID | Base Address | zero | - * +--------+------------------+------+ + * 63 48 47 x x-1 1 0 + * +--------+------------------+------+---+ + * | ASID | Base Address | zero |CnP| + * +--------+------------------+------+---+ * */ #define TTBR_ASID_SHIFT 48 #define TTBR_ASID_MASK 0xffff000000000000 -#define TTBR_BADDR_MASK 0x0000ffffffffffff +#define TTBR_BADDR_MASK 0x0000fffffffffffe +#define TTBR_CNP 0x0000000000000001 /* * Memory Attribute Indirection Register @@ -1243,7 +1750,7 @@ #if HAS_UCNORMAL_MEM || APPLEVIRTUALPLATFORM #define CACHE_ATTRINDX_RT CACHE_ATTRINDX_WRITECOMB #else -#define CACHE_ATTRINDX_RT CACHE_ATTRINDX_DISABLE +#define CACHE_ATTRINDX_RT CACHE_ATTRINDX_POSTED_COMBINED_REORDERED #endif /* HAS_UCNORMAL_MEM || APPLEVIRTUALPLATFORM */ @@ -1324,32 +1831,28 @@ #define ARM_16K_TT_L1_SIZE 0x0000001000000000ULL /* size of area covered by a tte */ #define ARM_16K_TT_L1_OFFMASK 0x0000000fffffffffULL /* offset within an L1 entry */ #define ARM_16K_TT_L1_SHIFT 36 /* page descriptor shift */ -#if __ARM64_PMAP_SUBPAGE_L1__ && __ARM_16K_PG__ -/* This config supports 512GB per TTBR. */ -#define ARM_16K_TT_L1_INDEX_MASK 0x0000007000000000ULL /* mask for getting index into L1 table from virtual address */ -#else /* __ARM64_PMAP_SUBPAGE_L1__ */ -#define ARM_16K_TT_L1_INDEX_MASK 0x00007ff000000000ULL /* mask for getting index into L1 table from virtual address */ -#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ +#define ARM_16K_TT_L1_INDEX_MASK 0x00007ff000000000ULL /* 4K L1 */ #define ARM_4K_TT_L1_SIZE 0x0000000040000000ULL /* size of area covered by a tte */ #define ARM_4K_TT_L1_OFFMASK 0x000000003fffffffULL /* offset within an L1 entry */ #define ARM_4K_TT_L1_SHIFT 30 /* page descriptor shift */ -#if __ARM64_PMAP_SUBPAGE_L1__ && !__ARM_16K_PG__ -/* This config supports 256GB per TTBR. */ -#define ARM_4K_TT_L1_INDEX_MASK 0x0000003fc0000000ULL /* mask for getting index into L1 table from virtual address */ -#else /* __ARM64_PMAP_SUBPAGE_L1__ */ -/* IPA[38:30] mask for getting index into L1 table from virtual address */ + #define ARM_4K_TT_L1_INDEX_MASK 0x0000007fc0000000ULL +/* + * Enable concatenated tables if: + * 1. We have a 42-bit PA, and + * 2. Either we're using 4k pages or mixed mode is supported. 
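The updated TTBR diagram above carves bit 0 out of the base-address field for the CnP (common-not-private) hint, which is why TTBR_BADDR_MASK now ends in 0xe and TTBR_CNP is 0x1. A short decode using those masks is shown below; the TTBR value in main() is made up for the example.

#include <stdint.h>
#include <stdio.h>

#define TTBR_ASID_SHIFT 48
#define TTBR_ASID_MASK  0xffff000000000000ull
#define TTBR_BADDR_MASK 0x0000fffffffffffeull
#define TTBR_CNP        0x0000000000000001ull

/* Split a TTBR value into its fields, per the layout described above. */
static void
decode_ttbr(uint64_t ttbr)
{
    unsigned asid  = (unsigned)((ttbr & TTBR_ASID_MASK) >> TTBR_ASID_SHIFT);
    uint64_t baddr = ttbr & TTBR_BADDR_MASK;     /* table base address */
    int      cnp   = (int)(ttbr & TTBR_CNP);     /* common-not-private hint */
    printf("asid=%u base=%#llx cnp=%d\n", asid,
        (unsigned long long)baddr, cnp);
}

int
main(void)
{
    decode_ttbr((0x1234ull << TTBR_ASID_SHIFT) | 0x800004000ull | TTBR_CNP);
    return 0;
}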
+ */ #if __ARM_42BIT_PA_SPACE__ +#if !__ARM_16K_PG__ || __ARM_MIXED_PAGE_SIZE__ /* IPA[39:30] mask for getting index into L1 concatenated table from virtual address */ #define ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK 0x000000ffc0000000ULL +#endif /* !__ARM_16K_PG__ || __ARM_MIXED_PAGE_SIZE__ */ #endif /* __ARM_42BIT_PA_SPACE__ */ -#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ /* some sugar for getting pointers to page tables and entries */ - -#define L1_TABLE_INDEX(va) (((va) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT) +#define L1_TABLE_T1_INDEX(va, tcr) (((va) & ARM_PTE_T1_REGION_MASK(tcr)) >> ARM_TT_L1_SHIFT) #define L2_TABLE_INDEX(va) (((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT) #define L3_TABLE_INDEX(va) (((va) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT) @@ -1583,6 +2086,16 @@ #define ARM_TTE_TYPE_BLOCK 0x0000000000000000ULL /* block entry type */ #define ARM_TTE_TYPE_L3BLOCK 0x0000000000000002ULL +/* Base AttrIndx transforms */ +#define ARM_TTE_ATTRINDXSHIFT (2) +#define ARM_TTE_ATTRINDXBITS (0x7ULL) +#define ARM_TTE_ATTRINDX(x) (((x) & ARM_TTE_ATTRINDXBITS) << ARM_TTE_ATTRINDXSHIFT) /* memory attributes index */ +#define ARM_TTE_EXTRACT_ATTRINDX(x) (((x) >> ARM_TTE_ATTRINDXSHIFT) & ARM_TTE_ATTRINDXBITS) /* extract memory attributes index */ +#define ARM_TTE_ATTRINDXMASK ARM_TTE_ATTRINDX(ARM_TTE_ATTRINDXBITS) /* mask memory attributes index */ +#define ARM_TTE_ATTRINDX_AIE(x) 0ULL +#define ARM_TTE_ATTRINDXMASK_AIE 0ULL +#define ARM_TTE_EXTRACT_ATTRINDX_AIE(x) 0ULL + #ifdef __ARM_16K_PG__ /* * Note that L0/L1 block entries are disallowed for the 16KB granule size; what @@ -1607,8 +2120,8 @@ #define ARM_TTE_BLOCK_AP(x) ((x)<> TCR_T0SZ_SHIFT) & TCR_T0SZ_MASK) +#define ARM_PTE_T1SZ(TCR) (((TCR) >> TCR_T1SZ_SHIFT) & TCR_T1SZ_MASK) +#define ARM_PTE_REGION_MASK(SZ) ((1ULL << (64 - (SZ))) - 1) #define ARM_TTE_PA_MASK 0x0000fffffffff000ULL +/* Handle Page table address bits in a TCR-aware way. */ +#define ARM_PTE_T0_REGION_MASK(TCR) (ARM_PTE_REGION_MASK(ARM_PTE_T0SZ(TCR))) +#define ARM_PTE_T1_REGION_MASK(TCR) (ARM_PTE_REGION_MASK(ARM_PTE_T1SZ(TCR))) + /* * L3 Page table entries * @@ -1748,9 +2285,9 @@ #define ARM_PTE_APMASK (0x3ULL << 6) /* mask access protections */ #define ARM_PTE_EXTRACT_AP(x) (((x) >> 6) & 0x3ULL) /* extract access protections from PTE */ -#define ARM_PTE_ATTRINDX(x) (uint64_t)((x) << 2) /* memory attributes index */ -#define ARM_PTE_ATTRINDXMASK (0x7ULL << 2) /* mask memory attributes index */ -#define ARM_PTE_EXTRACT_ATTRINDX(x) (((x) >> 2) & 0x7ULL) /* extract memory attributes index */ +#define ARM_PTE_ATTRINDX(x) (uint64_t)(ARM_TTE_ATTRINDX_AIE(x) | ARM_TTE_ATTRINDX(x)) /* memory attributes index */ +#define ARM_PTE_ATTRINDXMASK (ARM_TTE_ATTRINDXMASK_AIE | ARM_TTE_ATTRINDXMASK) /* mask memory attributes index */ +#define ARM_PTE_EXTRACT_ATTRINDX(x) (ARM_TTE_EXTRACT_ATTRINDX_AIE(x) | ARM_TTE_EXTRACT_ATTRINDX(x)) /* extract memory attributes index */ #define ARM_PTE_SH(x) ((x) << 8) /* access shared */ #define ARM_PTE_SHMASK (0x3ULL << 8) /* mask access shared */ @@ -1801,11 +2338,30 @@ #define ARM_PTE_WIRED_MASK 0x0400000000000000ULL /* software wired mask */ #define ARM_PTE_WRITEABLE 0x0800000000000000ULL /* value for software writeable bit */ -#define ARM_PTE_WRITABLE ARM_PTE_WRITEABLE #define ARM_PTE_WRITEABLE_MASK 0x0800000000000000ULL /* software writeable mask */ +#define ARM_PTE_WRITABLE ARM_PTE_WRITEABLE +/** Software use PTE bits which the kernel actually uses. 
*/ #define ARM_PTE_SW_RESERVED_MASK (ARM_PTE_WIRED_MASK | ARM_PTE_WRITEABLE_MASK) +/** + * PTE bits which must be set to zero by software when the PTE is valid. + */ +#define ARM_PTE_RESERVED_MASK \ + (~(ARM_PTE_TYPE_MASK | \ + ARM_PTE_ATTRINDXMASK | \ + ARM_PTE_NS_MASK | \ + ARM_PTE_APMASK | \ + ARM_PTE_SHMASK | \ + ARM_PTE_AFMASK | \ + ARM_PTE_NG_MASK | \ + ARM_PTE_PAGE_MASK | \ + ARM_PTE_GP_MASK | \ + ARM_PTE_HINT_MASK | \ + ARM_PTE_PNXMASK | \ + ARM_PTE_NXMASK | \ + ARM_PTE_SW_RESERVED_MASK)) + #define ARM_PTE_BOOT_PAGE_BASE \ (ARM_PTE_TYPE_VALID | ARM_PTE_SH(SH_OUTER_MEMORY) | \ ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | ARM_PTE_AF) @@ -2052,6 +2608,17 @@ typedef enum { #define ISS_DA_FNV_SHIFT 10 #define ISS_DA_FNV (0x1 << ISS_DA_FNV_SHIFT) +#define ISS_DA_ISV_SHIFT 24 +#define ISS_DA_ISV (0x1 << ISS_DA_ISV_SHIFT) + +#define ISS_DA_SAS_MASK 0x3 +#define ISS_DA_SAS_SHIFT 22 +#define ISS_DA_SAS(x) (((x) >> ISS_DA_SAS_SHIFT) & ISS_DA_SAS_MASK) + +#define ISS_DA_SRT_MASK 0x1f +#define ISS_DA_SRT_SHIFT 16 +#define ISS_DA_SRT(x) (((x) >> ISS_DA_SRT_SHIFT) & ISS_DA_SRT_MASK) + #define ISS_DA_EA_SHIFT 9 #define ISS_DA_EA (0x1 << ISS_DA_EA_SHIFT) @@ -2313,6 +2880,8 @@ typedef enum { #define MIDR_BRAVA_ACCP (0x055 << MIDR_EL1_PNUM_SHIFT) + + /* * Apple-ISA-Extensions ID Register. */ @@ -2344,6 +2913,7 @@ typedef enum { + /* * ID_AA64ISAR0_EL1 - AArch64 Instruction Set Attribute Register 0 * @@ -2474,6 +3044,7 @@ typedef enum { * +------+------+------+------+-------+-------+------+ */ + #define ID_AA64ISAR2_EL1_CSSC_OFFSET 52 #define ID_AA64ISAR2_EL1_CSSC_MASK (0xfull << ID_AA64ISAR2_EL1_CSSC_OFFSET) #define ID_AA64ISAR2_EL1_CSSC_EN (1ull << ID_AA64ISAR2_EL1_CSSC_OFFSET) @@ -2490,6 +3061,7 @@ typedef enum { #define ID_AA64ISAR2_EL1_WFxT_MASK (0xfull << ID_AA64ISAR2_EL1_WFxT_OFFSET) #define ID_AA64ISAR2_EL1_WFxT_EN (1ull << ID_AA64ISAR2_EL1_WFxT_OFFSET) + /* * ID_AA64MMFR0_EL1 - AArch64 Memory Model Feature Register 0 * 63 60 59 56 55 48 47 44 43 40 39 36 35 32 31 28 27 24 23 20 19 16 15 12 11 8 7 4 3 0 @@ -2516,6 +3088,10 @@ typedef enum { #define ID_AA64MMFR2_EL1_VARANGE_OFFSET 16 #define ID_AA64MMFR2_EL1_VARANGE_MASK (0xfull << ID_AA64MMFR2_EL1_VARANGE_OFFSET) +#define ID_AA64MMFR2_EL1_CNP_OFFSET 0 +#define ID_AA64MMFR2_EL1_CNP_MASK (0xfull << ID_AA64MMFR2_EL1_CNP_OFFSET) +#define ID_AA64MMFR2_EL1_CNP_EN (1ull << ID_AA64MMFR2_EL1_CNP_OFFSET) + /* * ID_AA64PFR0_EL1 - AArch64 Processor Feature Register 0 * 63 60 59 56 55 52 51 48 47 44 43 40 39 36 35 32 31 28 27 24 23 20 19 16 15 12 11 8 7 4 3 0 @@ -2578,6 +3154,7 @@ typedef enum { + /* * ID_AA64MMFR1_EL1 - AArch64 Memory Model Feature Register 1 * @@ -2604,6 +3181,7 @@ typedef enum { * +------+------+--------+--------+------+--------+--------+------+-------+--------+--------+---------+--------+------+ */ + #define ID_AA64SMFR0_EL1_SMEver_OFFSET 56 #define ID_AA64SMFR0_EL1_SMEver_MASK (0xfull << ID_AA64SMFR0_EL1_SMEver_OFFSET) #define ID_AA64SMFR0_EL1_SMEver_SME (0ull << ID_AA64SMFR0_EL1_SMEver_OFFSET) @@ -2622,6 +3200,7 @@ typedef enum { #define ID_AA64SMFR0_EL1_I16I32_EN (0x5ull << ID_AA64SMFR0_EL1_I16I32_OFFSET) + #define ID_AA64SMFR0_EL1_I8I32_OFFSET 36 #define ID_AA64SMFR0_EL1_I8I32_MASK (0xfull << ID_AA64SMFR0_EL1_I8I32_OFFSET) #define ID_AA64SMFR0_EL1_I8I32_EN (0xfull << ID_AA64SMFR0_EL1_I8I32_OFFSET) @@ -2687,6 +3266,10 @@ typedef enum { #define CTR_EL0_L1Ip_MASK (3ULL << CTR_EL0_L1Ip_OFFSET) +#define ACNTHV_CTL_EL2 S3_1_C15_C7_4 +#define ACNTHV_CTL_EL2_EN_OFFSET 0 +#define ACNTHV_CTL_EL2_EN_MASK (1ULL << ACNTHV_CTL_EL2_EN_OFFSET) + 
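The new ISS_DA_ISV/SAS/SRT macros above expose the data-abort instruction syndrome: when ISV is set, SAS encodes the access size as 2^SAS bytes and SRT names the general-purpose register being transferred. The minimal decode below is built on the same field positions; the example syndrome value is fabricated for illustration.

#include <stdint.h>
#include <stdio.h>

/* Field positions mirror the ISS_DA_* definitions added above. */
#define ISS_DA_ISV      (1u << 24)             /* instruction syndrome valid */
#define ISS_DA_SAS(x)   (((x) >> 22) & 0x3u)   /* access size code */
#define ISS_DA_SRT(x)   (((x) >> 16) & 0x1Fu)  /* transfer register number */

/*
 * When ISV is set, the data-abort syndrome describes the faulting access:
 * the size is 2^SAS bytes and SRT is the Xn register being transferred.
 */
static void
describe_access(uint32_t iss)
{
    if (!(iss & ISS_DA_ISV)) {
        puts("no valid instruction syndrome");
        return;
    }
    printf("%u-byte access via x%u\n", 1u << ISS_DA_SAS(iss), ISS_DA_SRT(iss));
}

int
main(void)
{
    /* e.g. a 4-byte load/store through x7: ISV=1, SAS=2, SRT=7 */
    describe_access(ISS_DA_ISV | (2u << 22) | (7u << 16));
    return 0;
}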
#ifdef __ASSEMBLER__ /* @@ -3027,4 +3610,8 @@ nop #endif +#if HAS_ESB +#define DISR_A_SHIFT 31 +#define DISR_A (1ULL << DISR_A_SHIFT) +#endif #endif /* _ARM64_PROC_REG_H_ */ diff --git a/osfmk/arm64/sleh.c b/osfmk/arm64/sleh.c index 4c4d55c4f..55e2acf4e 100644 --- a/osfmk/arm64/sleh.c +++ b/osfmk/arm64/sleh.c @@ -93,10 +93,6 @@ -#ifdef CONFIG_BTI_TELEMETRY -#include -#endif /* CONFIG_BTI_TELEMETRY */ - #ifndef __arm64__ #error Should only be compiling for arm64. #endif @@ -170,7 +166,7 @@ static void handle_uncategorized(arm_saved_state_t *); static void handle_kernel_breakpoint(arm_saved_state_t *, uint64_t); -static void handle_breakpoint(arm_saved_state_t *, uint64_t) __dead2; +static void handle_user_breakpoint(arm_saved_state_t *, uint64_t) __dead2; typedef void (*abort_inspector_t)(uint32_t, fault_status_t *, vm_prot_t *); static void inspect_instruction_abort(uint32_t, fault_status_t *, vm_prot_t *); @@ -281,6 +277,26 @@ extern unsigned int gFastIPI; static arm_saved_state64_t *original_faulting_state = NULL; +/* + * A self-restrict mode describes which (if any, or several) special permissive + * modes are active at the time of a fault. This, in part, determines how the + * fault will be handled. + */ +__options_closed_decl(self_restrict_mode_t, unsigned int, { + /* None of the special modes are active. */ + SELF_RESTRICT_NONE = 0U, + + /* + * Any of the other more specific modes, this should be active if any other + * mode is active. + */ + SELF_RESTRICT_ANY = (1U << 0), + + /* Reserved */ + + /* Reserved */ +}); + TUNABLE(bool, fp_exceptions_enabled, "-fp_exceptions", false); @@ -455,6 +471,7 @@ is_table_walk_error(fault_status_t status) + static inline int is_servicible_fault(fault_status_t status, uint64_t esr) { @@ -532,6 +549,7 @@ arm64_platform_error(arm_saved_state_t *state, uint64_t esr, vm_offset_t far, pl } } + void panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) { @@ -557,6 +575,7 @@ panic_with_thread_kernel_state(const char *msg, arm_saved_state_t *ss) } + panic_plain("%s at pc 0x%016llx, lr 0x%016llx (saved state: %p%s)\n" "\t x0: 0x%016llx x1: 0x%016llx x2: 0x%016llx x3: 0x%016llx\n" "\t x4: 0x%016llx x5: 0x%016llx x6: 0x%016llx x7: 0x%016llx\n" @@ -615,6 +634,7 @@ thread_exception_return() thread->machine.exception_trace_code = 0; } + #if KASAN_TBI kasan_unpoison_curstack(true); #endif /* KASAN_TBI */ @@ -844,6 +864,7 @@ sleh_synchronous(arm_context_t *context, uint64_t esr, vm_offset_t far, __unused ml_set_interrupts_enabled(TRUE); } + switch (class) { case ESR_EC_SVC_64: if (!is_saved_state64(state) || !is_user) { @@ -926,7 +947,7 @@ sleh_synchronous(arm_context_t *context, uint64_t esr, vm_offset_t far, __unused __builtin_unreachable(); case ESR_EC_BKPT_AARCH32: - handle_breakpoint(state, esr); + handle_user_breakpoint(state, esr); __builtin_unreachable(); case ESR_EC_BRK_AARCH64: @@ -939,13 +960,13 @@ sleh_synchronous(arm_context_t *context, uint64_t esr, vm_offset_t far, __unused handle_kernel_breakpoint(state, esr); break; } else { - handle_breakpoint(state, esr); + handle_user_breakpoint(state, esr); __builtin_unreachable(); } case ESR_EC_BKPT_REG_MATCH_EL0: if (FSC_DEBUG_FAULT == ISS_SSDE_FSC(esr)) { - handle_breakpoint(state, esr); + handle_user_breakpoint(state, esr); } panic("Unsupported Class %u event code. 
state=%p class=%u esr=%llu far=%p", class, state, class, esr, (void *)far); @@ -1019,12 +1040,6 @@ sleh_synchronous(arm_context_t *context, uint64_t esr, vm_offset_t far, __unused break; } #endif /* CONFIG_XNUPOST */ -#ifdef CONFIG_BTI_TELEMETRY - if (bti_telemetry_handle_exception(state)) { - /* Telemetry has accepted and corrected the exception, continue */ - break; - } -#endif /* CONFIG_BTI_TELEMETRY */ handle_bti_fail(state, esr); __builtin_unreachable(); @@ -1085,6 +1100,7 @@ sleh_synchronous(arm_context_t *context, uint64_t esr, vm_offset_t far, __unused panic("synchronous exception changed preemption level from %d to %d", preemption_level, sleh_get_preemption_level()); } #endif + } /* @@ -1180,14 +1196,6 @@ handle_uncategorized(arm_saved_state_t *state) } #if __has_feature(ptrauth_calls) -static const uint16_t PTRAUTH_TRAP_START = 0xC470; -static inline bool -brk_comment_is_ptrauth(uint16_t comment) -{ - return comment >= PTRAUTH_TRAP_START && - comment <= PTRAUTH_TRAP_START + ptrauth_key_asdb; -} - static inline const char * ptrauth_key_to_string(ptrauth_key key) { @@ -1283,7 +1291,7 @@ xnu_hard_trap_handle_breakpoint(void *tstate, uint16_t comment) KERNEL_BRK_DESCRIPTOR_DEFINE(ptrauth_desc, .type = TRAP_TELEMETRY_TYPE_KERNEL_BRK_PTRAUTH, .base = PTRAUTH_TRAP_START, - .max = PTRAUTH_TRAP_START + ptrauth_key_asdb, + .max = PTRAUTH_TRAP_END, .options = BRK_TELEMETRY_OPTIONS_FATAL_DEFAULT, .handle_breakpoint = ptrauth_handle_brk_trap); #endif @@ -1329,7 +1337,7 @@ handle_kernel_breakpoint(arm_saved_state_t *state, uint64_t esr) const struct kernel_brk_descriptor *desc; const char *msg = NULL; - desc = find_brk_descriptor_by_comment(comment); + desc = find_kernel_brk_descriptor_by_comment(comment); if (!desc) { goto brk_out; @@ -1361,10 +1369,15 @@ brk_out: if (msg == NULL) { kernel_panic_reason_t pr = PERCPU_GET(panic_reason); - msg = tsnprintf(pr->buf, sizeof(pr->buf), - "Break 0x%04X instruction exception from kernel. " - "Panic (by design)", - comment); + if (comment == CLANG_ARM_TRAP_BOUND_CHK) { + msg = tsnprintf(pr->buf, sizeof(pr->buf), + "Bounds safety trap"); + } else { + msg = tsnprintf(pr->buf, sizeof(pr->buf), + "Break 0x%04X instruction exception from kernel. " + "Panic (by design)", + comment); + } } panic_with_thread_kernel_state(msg, state); @@ -1372,19 +1385,88 @@ brk_out: #undef MSG_FMT } +/* + * Similar in spirit to kernel_brk_descriptor, but with less flexible semantics: + * each descriptor defines a `brk` label range for use from userspace. + * When used, system policy may decide to kill the calling process without giving them opportunity to + * catch the exception or continue execution from a signal handler. + * This is used to enforce security boundaries: userspace code may use this mechanism + * to reliably terminate when internal inconsistencies are detected. + * Note that we don't invariably terminate without giving the process a say: we might only enforce + * such a policy if a security feature is enabled, for example. 
+ */ +typedef struct user_brk_label_range_descriptor { + uint16_t base; + uint16_t max; +} user_brk_label_range_descriptor_t; + +const user_brk_label_range_descriptor_t user_brk_descriptors[] = { +#if __has_feature(ptrauth_calls) + /* PAC failures detected in data by userspace */ + { + /* Use the exact same label range as kernel PAC */ + .base = PTRAUTH_TRAP_START, + .max = PTRAUTH_TRAP_END, + }, +#endif /* __has_feature(ptrauth_calls) */ + /* Available for use by system libraries when detecting disallowed conditions */ + { + /* Note this uses the same range as the kernel-specific XNU_HARD_TRAP range */ + .base = 0xB000, + .max = 0xBFFF, + } +}; +const int user_brk_descriptor_count = sizeof(user_brk_descriptors) / sizeof(user_brk_descriptors[0]); + +const static inline user_brk_label_range_descriptor_t * +find_user_brk_descriptor_by_comment(uint16_t comment) +{ + for (int desc_idx = 0; desc_idx < user_brk_descriptor_count; desc_idx++) { + const user_brk_label_range_descriptor_t* des = &user_brk_descriptors[desc_idx]; + if (comment >= des->base && comment <= des->max) { + return des; + } + } + + return NULL; +} + static void -handle_breakpoint(arm_saved_state_t *state, uint64_t esr __unused) +handle_user_breakpoint(arm_saved_state_t *state, uint64_t esr __unused) { exception_type_t exception = EXC_BREAKPOINT; mach_exception_data_type_t codes[2] = {EXC_ARM_BREAKPOINT}; mach_msg_type_number_t numcodes = 2; + if (ESR_EC(esr) == ESR_EC_BRK_AARCH64) { + /* + * Consult the trap labels we know about to decide whether userspace + * should be given the opportunity to handle the exception. + */ + uint16_t brk_label = ISS_BRK_COMMENT(esr); + const struct user_brk_label_range_descriptor* descriptor = find_user_brk_descriptor_by_comment(brk_label); + /* + * Note it's no problem if we don't recognize the label. + * In this case we'll just go through normal exception delivery. + */ + if (descriptor != NULL) { + exception |= EXC_MAY_BE_UNRECOVERABLE_BIT; + #if __has_feature(ptrauth_calls) - if (ESR_EC(esr) == ESR_EC_BRK_AARCH64 && - brk_comment_is_ptrauth(ISS_BRK_COMMENT(esr))) { - exception |= EXC_PTRAUTH_BIT; - } + /* + * We have additional policy specifically for PAC violations. + * To make the rest of the code easier to follow, don't set + * EXC_MAY_BE_UNRECOVERABLE_BIT here and just set EXC_PTRAUTH_BIT instead. + * Conceptually a PAC failure is absolutely 'maybe unrecoverable', but it's + * not really worth excising the discrepency from the plumbing. + */ + if (descriptor->base == PTRAUTH_TRAP_START) { + exception &= ~(EXC_MAY_BE_UNRECOVERABLE_BIT); + exception |= EXC_PTRAUTH_BIT; + } #endif /* __has_feature(ptrauth_calls) */ + } + } codes[1] = get_saved_state_pc(state); exception_triage(exception, codes, numcodes); @@ -1533,11 +1615,12 @@ user_fault_matches_pac_error_code(vm_offset_t fault_addr, uint64_t pc, bool data * potentially-compromised process try to handle the exception, it will be killed * by the kernel and a crash report will be generated. 
*/ -static bool +static self_restrict_mode_t user_fault_in_self_restrict_mode(thread_t thread __unused) { + self_restrict_mode_t out = SELF_RESTRICT_NONE; - return false; + return out; } static void @@ -1756,7 +1839,7 @@ handle_sw_step_debug(arm_saved_state_t *state) } #if MACH_ASSERT -TUNABLE_WRITEABLE(int, panic_on_jit_guard, "panic_on_jit_guard", 0); +TUNABLE_WRITEABLE(self_restrict_mode_t, panic_on_jit_guard, "panic_on_jit_guard", SELF_RESTRICT_NONE); #endif /* MACH_ASSERT */ static void @@ -1900,7 +1983,8 @@ handle_user_abort(arm_saved_state_t *state, uint64_t esr, vm_offset_t fault_addr } #endif /* __has_feature(ptrauth_calls) */ - if (user_fault_in_self_restrict_mode(thread) && + const self_restrict_mode_t self_restrict_mode = user_fault_in_self_restrict_mode(thread); + if ((self_restrict_mode != SELF_RESTRICT_NONE) && task_is_jit_exception_fatal(get_threadtask(thread))) { int flags = PX_KTRIAGE; exception_info_t info = { @@ -1911,11 +1995,29 @@ handle_user_abort(arm_saved_state_t *state, uint64_t esr, vm_offset_t fault_addr }; #if MACH_ASSERT + /* + * Case: panic_on_jit_guard=1. Catch an early process creation TPRO issue causing + * rdar://129742083. Only panic during early process creation (1 thread, few syscalls + * issued) to avoid spurious panics. + */ + const self_restrict_mode_t self_restrict_panic_mask = panic_on_jit_guard & self_restrict_mode; + bool should_panic = ((self_restrict_panic_mask == SELF_RESTRICT_ANY) && + (current_task()->thread_count == 1) && + (thread->syscalls_unix < 24)); + + /* + * Modes other than ANY will force panic, skipping checks that were done in the ANY case, + * but allowing us to filter on a more specific scenario (e.g. TPRO, JIT, etc). This is + * meant to catch a TPRO issue causing rdar://145703251. Restrict to KERN_PROTECTION_FAILURE + * only to avoid failures from the more frequent case of KERN_INVALID_ADDRESS that aren't + * of interest for that radar. + */ + should_panic |= (codes[0] == KERN_PROTECTION_FAILURE) + && ((self_restrict_panic_mask & ~SELF_RESTRICT_ANY) != 0); + printf("\nGUARD_REASON_JIT exc %d codes=<0x%llx,0x%llx> syscalls %d task %p thread %p va 0x%lx code 0x%x type 0x%x esr 0x%llx\n", exc, codes[0], codes[1], thread->syscalls_unix, current_task(), thread, fault_addr, fault_code, fault_type, esr); - if (panic_on_jit_guard && - current_task()->thread_count == 1 && - thread->syscalls_unix < 24) { + if (should_panic) { panic("GUARD_REASON_JIT exc %d codes=<0x%llx,0x%llx> syscalls %d task %p thread %p va 0x%lx code 0x%x type 0x%x esr 0x%llx state %p j %d t %d s user 0x%llx (0x%llx) jb 0x%llx (0x%llx)", exc, codes[0], codes[1], thread->syscalls_unix, current_task(), thread, fault_addr, fault_code, fault_type, esr, state, 0, 0, 0ull, 0ull, @@ -2034,6 +2136,7 @@ handle_kernel_abort(arm_saved_state_t *state, uint64_t esr, vm_offset_t fault_ad (void)expected_fault_handler; #endif /* CONFIG_XNUPOST */ + #if CONFIG_DTRACE if (is_vm_fault(fault_code) && thread->t_dtrace_inprobe) { /* Executing under dtrace_probe? */ if (dtrace_tally_fault(fault_addr)) { /* Should a fault under dtrace be ignored? 
*/ @@ -2539,9 +2642,9 @@ sleh_fiq(arm_saved_state_t *state) #endif /* defined(HAS_IPI) */ #if MONOTONIC_FIQ if (type == DBG_INTR_TYPE_PMI) { - INTERRUPT_MASKED_DEBUG_START(mt_fiq, DBG_INTR_TYPE_PMI); + ml_interrupt_masked_debug_start(mt_fiq, DBG_INTR_TYPE_PMI); mt_fiq(getCpuDatap(), pmcr0, upmsr); - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } else #endif /* MONOTONIC_FIQ */ { @@ -2559,9 +2662,9 @@ sleh_fiq(arm_saved_state_t *state) * We can easily thread it through, but not bothering for the * moment (AArch32 doesn't either). */ - INTERRUPT_MASKED_DEBUG_START(rtclock_intr, DBG_INTR_TYPE_TIMER); + ml_interrupt_masked_debug_start(rtclock_intr, DBG_INTR_TYPE_TIMER); rtclock_intr(TRUE); - INTERRUPT_MASKED_DEBUG_END(); + ml_interrupt_masked_debug_end(); } #if APPLEVIRTUALPLATFORM @@ -2695,7 +2798,7 @@ sleh_invalid_stack(arm_context_t *context, uint64_t esr __unused, vm_offset_t fa panic_with_thread_kernel_state("Invalid kernel stack pointer (probable overflow).", &context->ss); } - panic_with_thread_kernel_state("Invalid kernel stack pointer (probable corruption).", &context->ss); + panic_with_thread_kernel_state("Invalid kernel stack pointer (probable corruption or early boot).", &context->ss); } @@ -2763,7 +2866,7 @@ sleh_panic_lockdown_should_initiate_el1_sp0_sync(uint64_t esr, uint64_t elr, */ #if HAS_TELEMETRY_KERNEL_BRK const struct kernel_brk_descriptor *desc; - desc = find_brk_descriptor_by_comment(ISS_BRK_COMMENT(esr)); + desc = find_kernel_brk_descriptor_by_comment(ISS_BRK_COMMENT(esr)); if (desc && desc->options.recoverable) { /* * We matched a breakpoint and it's recoverable, skip lockdown. @@ -2792,6 +2895,7 @@ sleh_panic_lockdown_should_initiate_el1_sp0_sync(uint64_t esr, uint64_t elr, } + /* * Heuristic: if FAR != XPAC(FAR), the pointer was likely corrupted * due to PAC. 
@@ -2821,12 +2925,8 @@ sleh_panic_lockdown_should_initiate_el1_sp0_sync(uint64_t esr, uint64_t elr, } case ESR_EC_BTI_FAIL: { - /* Kernel BTI exceptions are recoverable only in telemetry mode */ -#ifdef CONFIG_BTI_TELEMETRY - return false; -#else + /* Kernel BTI exceptions are always fatal */ return true; -#endif /* CONFIG_BTI_TELEMETRY */ } default: { diff --git a/osfmk/arm64/sptm/arm_init_sptm.c b/osfmk/arm64/sptm/arm_init_sptm.c index bdb9e76dc..45af39ac6 100644 --- a/osfmk/arm64/sptm/arm_init_sptm.c +++ b/osfmk/arm64/sptm/arm_init_sptm.c @@ -157,8 +157,15 @@ MACHINE_TIMEOUT_DEV_WRITEABLE(stackshot_interrupt_masked_timeout, "sshot-interru #define XCALL_ACK_TIMEOUT_NS ((uint64_t) 6000000000) uint64_t xcall_ack_timeout_abstime; -boot_args const_boot_args __attribute__((section("__DATA, __const"))); -boot_args *BootArgs __attribute__((section("__DATA, __const"))); +#ifndef __BUILDING_XNU_LIBRARY__ +#define BOOTARGS_SECTION_ATTR __attribute__((section("__DATA, __const"))) +#else /* __BUILDING_XNU_LIBRARY__ */ +/* Special segments are not used when building for user-mode */ +#define BOOTARGS_SECTION_ATTR +#endif /* __BUILDING_XNU_LIBRARY__ */ + +boot_args const_boot_args BOOTARGS_SECTION_ATTR; +boot_args *BootArgs BOOTARGS_SECTION_ATTR; /** * The SPTM provides a second set of boot arguments, on top of those @@ -730,24 +737,40 @@ arm_init(boot_args *args, sptm_bootstrap_args_xnu_t *sptm_boot_args) configure_misc_apple_boot_args(); configure_misc_apple_regs(true); -#if (DEVELOPMENT || DEBUG) - unsigned long const *platform_stall_ptr = NULL; +#if HAS_UPSI_FAILURE_INJECTION + /* UPSI (Universal Panic and Stall Injection) Logic + * iBoot/XNU are both configured for failure injection at specific stages + * The injected failure and stage is populated through EDT properties by iBoot + * + * iBoot populates the EDT properties for XNU based upon PMU scratch bits + * This is done because the EDT is available sooner in XNU than the PMU Kext + */ + uint64_t const *upsi_info = NULL; + /* Not usable TUNABLE here because TUNABLEs are parsed at a later point. */ if (SecureDTLookupEntry(NULL, "/chosen", &chosen) != kSuccess) { panic("%s: Unable to find 'chosen' DT node", __FUNCTION__); } - // Not usable TUNABLE here because TUNABLEs are parsed at a later point. 
- if (SecureDTGetProperty(chosen, "xnu_platform_stall", (void const **)&platform_stall_ptr, + /* Check if there is a requested injection stage */ + if (SecureDTGetProperty(chosen, "injection_stage", (void const **)&upsi_info, &dt_entry_size) == kSuccess) { - xnu_platform_stall_value = *platform_stall_ptr; + assert3u(dt_entry_size, ==, 8); + xnu_upsi_injection_stage = *upsi_info; } - platform_stall_panic_or_spin(PLATFORM_STALL_XNU_LOCATION_ARM_INIT); + /* Check if there is a requested injection action */ + if (SecureDTGetProperty(chosen, "injection_action", (void const **)&upsi_info, + &dt_entry_size) == kSuccess) { + assert3u(dt_entry_size, ==, 8); + xnu_upsi_injection_action = *upsi_info; + } + + check_for_failure_injection(XNU_STAGE_ARM_INIT); chosen = NULL; // Force a re-lookup later on since VM addresses are not final at this point dt_entry_size = 0; -#endif +#endif // HAS_UPSI_FAILURE_INJECTION #if HAS_ARM_FEAT_SME (void)PE_parse_boot_argn("enable_sme", &enable_sme, sizeof(enable_sme)); @@ -874,6 +897,57 @@ arm_init(boot_args *args, sptm_bootstrap_args_xnu_t *sptm_boot_args) #endif /* __ARM_PAN_AVAILABLE__ */ + /** + * Check SPTM feature flag for ARM_LARGE_MEMORY irrespective of XNU + * definition to detect mismatch in cases where ARM_LARGE_MEMORY is + * defined in SPTM but not in XNU and vice versa. + */ + const uint64_t sptm_is_large_memory = SPTMArgs->feature_flags & SPTM_FEATURE_LARGE_MEMORY; + const uint64_t sptm_is_large_memory_kernonly = SPTMArgs->feature_flags & SPTM_FEATURE_LARGE_MEMORY_KERNONLY; +#if ARM_LARGE_MEMORY + const uint64_t xnu_is_large_memory = SPTM_FEATURE_LARGE_MEMORY; +#if ARM_LARGE_MEMORY_KERNONLY + const uint64_t xnu_is_large_memory_kernonly = SPTM_FEATURE_LARGE_MEMORY_KERNONLY; +#else /* ARM_LARGE_MEMORY_KERNONLY */ + const uint64_t xnu_is_large_memory_kernonly = 0; +#endif /* ARM_LARGE_MEMORY_KERNONLY */ +#else /* ARM_LARGE_MEMORY */ + const uint64_t xnu_is_large_memory = 0; + const uint64_t xnu_is_large_memory_kernonly = 0; +#endif /* ARM_LARGE_MEMORY */ + + if (sptm_is_large_memory != xnu_is_large_memory) { + panic("Mismatch of ARM_LARGE_MEMORY in SPTM (%#llx)/XNU (%#llx)", sptm_is_large_memory, xnu_is_large_memory); + } + + if (sptm_is_large_memory_kernonly != xnu_is_large_memory_kernonly) { + panic("Mismatch of ARM_LARGE_MEMORY_KERNONLY in SPTM (%#llx)/XNU (%#llx)", + sptm_is_large_memory_kernonly, xnu_is_large_memory_kernonly); + } + + /* + * gPhysBase/Size only represent kernel-managed memory. These globals represent + * the actual DRAM base address and size as reported by iBoot through the + * device tree. + */ + unsigned long const *dram_base; + unsigned long const *dram_size; + if (SecureDTLookupEntry(NULL, "/chosen", &chosen) != kSuccess) { + panic("%s: Unable to find 'chosen' DT node", __FUNCTION__); + } + + if (SecureDTGetProperty(chosen, "dram-base", (void const **)&dram_base, &dt_entry_size) != kSuccess) { + panic("%s: Unable to find 'dram-base' entry in the 'chosen' DT node", __FUNCTION__); + } + + if (SecureDTGetProperty(chosen, "dram-size", (void const **)&dram_size, &dt_entry_size) != kSuccess) { + panic("%s: Unable to find 'dram-size' entry in the 'chosen' DT node", __FUNCTION__); + } + + gDramBase = *dram_base; + gDramSize = *dram_size; + pmap_first_pnum = (ppnum_t)atop(gDramBase); + arm_vm_init(xmaxmem, args); if (debug_boot_arg) { @@ -971,29 +1045,6 @@ arm_init(boot_args *args, sptm_bootstrap_args_xnu_t *sptm_boot_args) pal_hib_init(); #endif /* HIBERNATION */ - /* - * gPhysBase/Size only represent kernel-managed memory. 
These globals represent - * the actual DRAM base address and size as reported by iBoot through the - * device tree. - */ - unsigned long const *dram_base; - unsigned long const *dram_size; - if (SecureDTLookupEntry(NULL, "/chosen", &chosen) != kSuccess) { - panic("%s: Unable to find 'chosen' DT node", __FUNCTION__); - } - - if (SecureDTGetProperty(chosen, "dram-base", (void const **)&dram_base, &dt_entry_size) != kSuccess) { - panic("%s: Unable to find 'dram-base' entry in the 'chosen' DT node", __FUNCTION__); - } - - if (SecureDTGetProperty(chosen, "dram-size", (void const **)&dram_size, &dt_entry_size) != kSuccess) { - panic("%s: Unable to find 'dram-size' entry in the 'chosen' DT node", __FUNCTION__); - } - - gDramBase = *dram_base; - gDramSize = *dram_size; - pmap_first_pnum = (ppnum_t)atop(gDramBase); - /* * Initialize the stack protector for all future calls * to C code. Since kernel_bootstrap() eventually @@ -1432,6 +1483,12 @@ arm_vm_prot_init(__unused boot_args * args) vm_kernelcache_top = end_kern; } +static void +arm_vm_slide_region(vm_offset_t phys_start, size_t size) +{ + sptm_slide_region(phys_start, (unsigned int)(size >> PAGE_SHIFT)); +} + /* * return < 0 for a < b * 0 for a == b @@ -1460,7 +1517,7 @@ arm_vm_prot_finalize(boot_args * args __unused) */ /* Slide KLDDATA */ - sptm_slide_region(segKLDDATAB, (unsigned int)(segSizeKLDDATA >> PAGE_SHIFT)); + arm_vm_slide_region(segKLDDATAB, segSizeKLDDATA); /* * Replace the boot CPU's stacks with properly-guarded dynamically allocated stacks. @@ -1471,7 +1528,7 @@ arm_vm_prot_finalize(boot_args * args __unused) arm64_replace_bootstack(&BootCpuData); /* Slide early-boot data */ - sptm_slide_region(segBOOTDATAB, (unsigned int)(segSizeBOOTDATA >> PAGE_SHIFT)); + arm_vm_slide_region(segBOOTDATAB, segSizeBOOTDATA); /* Slide linkedit, unless otherwise requested */ bool keep_linkedit = false; @@ -1481,22 +1538,22 @@ arm_vm_prot_finalize(boot_args * args __unused) keep_linkedit = true; } #endif /* CONFIG_DTRACE */ -#if KASAN_DYNAMIC_BLACKLIST - /* KASAN's dynamic blacklist needs to query the LINKEDIT segment at runtime. As such, the +#if KASAN_DYNAMIC_DENYLIST + /* KASAN's dynamic denylist needs to query the LINKEDIT segment at runtime. As such, the * kext bootstrap code will not jettison LINKEDIT on kasan kernels, so don't bother to relocate it. 
*/ keep_linkedit = true; -#endif +#endif /* KASAN_DYNAMIC_DENYLIST */ if (!keep_linkedit) { - sptm_slide_region(segLINKB, (unsigned int)(segSizeLINK >> PAGE_SHIFT)); + arm_vm_slide_region(segLINKB, segSizeLINK); if (segSizePLKLINKEDIT) { /* Prelinked kernel LINKEDIT */ - sptm_slide_region(segPLKLINKEDITB, (unsigned int)(segSizePLKLINKEDIT >> PAGE_SHIFT)); + arm_vm_slide_region(segPLKLINKEDITB, segSizePLKLINKEDIT); } } /* Slide prelinked kernel plists */ - sptm_slide_region(segPRELINKINFOB, (unsigned int)(segSizePRELINKINFO >> PAGE_SHIFT)); + arm_vm_slide_region(segPRELINKINFOB, segSizePRELINKINFO); /* * Free the portion of memory that precedes the first usable region, known @@ -1534,7 +1591,7 @@ alloc_ptpage(sptm_pt_level_t level, bool map_static) { pmap_paddr_t paddr = 0; -#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) +#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) map_static = FALSE; #endif @@ -1593,7 +1650,7 @@ init_image_offsets(size_t debug_header_entry, vm_image_offsets *offsets) #define ARM64_PHYSMAP_SLIDE_MASK (ARM64_PHYSMAP_SLIDE_RANGE - 1) void -arm_vm_init(uint64_t memory_size, boot_args * args) +arm_vm_init(uint64_t memory_size_override, boot_args * args) { vm_map_address_t va_l1, va_l1_end; tt_entry_t *cpu_l1_tte; @@ -1628,18 +1685,21 @@ arm_vm_init(uint64_t memory_size, boot_args * args) /* Obtain total memory size, including non-managed memory */ mem_actual = args->memSizeActual ? args->memSizeActual : mem_size; - - if ((memory_size != 0) && (mem_size > memory_size)) { - mem_size = memory_size; - max_mem_actual = memory_size; + if ((memory_size_override != 0) && (mem_size > memory_size_override)) { + { + mem_size = memory_size_override; + } + max_mem_actual = memory_size_override; } else { max_mem_actual = mem_actual; } +#if !defined(ARM_LARGE_MEMORY) /* Make sure the system does not have more physical memory than what can be mapped */ if (mem_size >= ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 2)) { panic("Unsupported memory configuration %lx", mem_size); } +#endif /* !defined(ARM_LARGE_MEMORY) */ physmap_base = SPTMArgs->physmap_base; physmap_end = static_memory_end = SPTMArgs->physmap_end; @@ -1822,11 +1882,19 @@ arm_vm_init(uint64_t memory_size, boot_args * args) */ use_xnu_restricted = false; + + #endif /* XNU_TARGET_OS_OSX */ } - sane_size = mem_size - (avail_start - gPhysBase); - max_mem = mem_size; + + if (memory_size_override && memory_size_override < mem_size) { + max_mem = memory_size_override; + sane_size = memory_size_override - (avail_start - gPhysBase); + } else { + max_mem = mem_size; + sane_size = mem_size - (avail_start - gPhysBase); + } // vm_kernel_slide is set by arm_init()->arm_slide_rebase_and_sign_image() vm_kernel_slid_base = segLOWESTTEXT; vm_kernel_stext = segTEXTB; @@ -1880,7 +1948,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) va_l1_end += round_page(args->Video.v_height * args->Video.v_rowBytes); va_l1_end = (va_l1_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL; - cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + cpu_l1_tte = cpu_tte + L1_TABLE_T1_INDEX(va_l1, TCR_EL1_BOOT); while (va_l1 < va_l1_end) { va_l2 = va_l1; @@ -1921,7 +1989,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) va_l1 = (VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - PE_EARLY_BOOT_VA; va_l1_end = VM_MAX_KERNEL_ADDRESS; - cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + cpu_l1_tte = cpu_tte + 
L1_TABLE_T1_INDEX(va_l1, TCR_EL1_BOOT); while (va_l1 < va_l1_end) { va_l2 = va_l1; diff --git a/osfmk/arm64/sptm/pmap/pmap.c b/osfmk/arm64/sptm/pmap/pmap.c index e55861e24..92c3180ce 100644 --- a/osfmk/arm64/sptm/pmap/pmap.c +++ b/osfmk/arm64/sptm/pmap/pmap.c @@ -90,9 +90,9 @@ #include #include #include -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #include -#endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #include @@ -112,7 +112,16 @@ #include #endif /* HIBERNATION */ +#ifdef __ARM64_PMAP_SUBPAGE_L1__ +/** + * Different from PPL, PMAP_ROOT_ALLOC_SIZE for subpage L1 devices is 128 bytes + * rather than 64 bytes, due to the metadata SPTM needs to track the subpage L1 + * tables. + */ +#define PMAP_ROOT_ALLOC_SIZE SUBPAGE_USER_ROOT_TABLE_SIZE +#else #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES) +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0])) @@ -302,6 +311,28 @@ const struct page_table_attr pmap_pt_attr_4k = { .pta_page_size = 4096, .pta_page_shift = 12, .geometry_id = SPTM_PT_GEOMETRY_4K, + .pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB), +}; + +const struct page_table_attr pmap_pt_attr_16k_kern = { + .pta_level_info = pmap_table_level_info_16k, + .pta_root_level = PMAP_TT_L1_LEVEL, + .pta_commpage_level = PMAP_TT_L2_LEVEL, + .pta_max_level = PMAP_TT_L3_LEVEL, + .pta_ops = &native_pt_ops, + .ap_ro = ARM_PTE_AP(AP_RORO), + .ap_rw = ARM_PTE_AP(AP_RWRW), + .ap_rona = ARM_PTE_AP(AP_RONA), + .ap_rwna = ARM_PTE_AP(AP_RWNA), + .ap_xn = ARM_PTE_PNX | ARM_PTE_NX, + .ap_x = ARM_PTE_PNX, +#if __ARM_MIXED_PAGE_SIZE__ + .pta_tcr_value = TCR_EL1_16KB, +#endif /* __ARM_MIXED_PAGE_SIZE__ */ + .pta_page_size = 16384, + .pta_page_shift = 14, + .geometry_id = SPTM_PT_GEOMETRY_16K_KERN, + .pta_va_valid_mask = ARM_PTE_T1_REGION_MASK(TCR_EL1_16KB), }; const struct page_table_attr pmap_pt_attr_16k = { @@ -322,6 +353,7 @@ const struct page_table_attr pmap_pt_attr_16k = { .pta_page_size = 16384, .pta_page_shift = 14, .geometry_id = SPTM_PT_GEOMETRY_16K, + .pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB), }; #if __ARM_16K_PG__ @@ -482,7 +514,7 @@ const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM struct pmap kernel_pmap_store MARK_AS_PMAP_DATA; const pmap_t kernel_pmap = &kernel_pmap_store; -static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */ +__static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */ MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0); queue_head_t map_pmap_list MARK_AS_PMAP_DATA; @@ -571,7 +603,7 @@ SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE; MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0); SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0; -SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap; +SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap; #if !HAS_16BIT_ASID static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA; static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0}; @@ -583,7 +615,7 @@ static uint16_t last_allocated_asid = 0; SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_default_table; -//SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_default_table; 
+SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_default_table; #if __ARM_MIXED_PAGE_SIZE__ SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_4k_table; //SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_4k_table; @@ -647,7 +679,6 @@ pte_is_compressed(pt_entry_t pte, pt_entry_t *ptep) } \ } while(0) - /** * Updated wired-mapping accountings in the PTD and ledger. * @@ -821,12 +852,12 @@ static kern_return_t pmap_expand( static void pmap_remove_range(pmap_t, vm_map_address_t, vm_map_address_t); -static tt_entry_t *pmap_tt1_allocate(pmap_t, uint8_t); +static tt_entry_t *pmap_tt1_allocate(pmap_t, vm_size_t, uint8_t); -static void pmap_tt1_deallocate(pmap_t, tt_entry_t *); +static void pmap_tt1_deallocate(pmap_t, tt_entry_t *, vm_size_t); static kern_return_t pmap_tt_allocate( - pmap_t, tt_entry_t **, unsigned int, unsigned int); + pmap_t, tt_entry_t **, pt_desc_t **, unsigned int, unsigned int); const unsigned int arm_hardware_page_size = ARM_PGBYTES; const unsigned int arm_pt_desc_size = sizeof(pt_desc_t); @@ -850,8 +881,12 @@ static boolean_t arm_clear_fast_fault( pt_entry_t *pte_p, pp_attr_t attrs_to_clear); -static void pmap_trim_self(pmap_t pmap); -static void pmap_trim_subord(pmap_t subord); +static void pmap_tte_deallocate( + pmap_t pmap, + vm_offset_t va_start, + tt_entry_t *ttep, + unsigned int level, + bool pmap_locked); /* @@ -1367,6 +1402,10 @@ pmap_map( return virt; } +#if HAS_SPTM_SYSCTL +bool disarm_protected_io = false; +#endif /* HAS_SPTM_SYSCTL */ + /** * Force the permission of a PTE to be kernel RO if a page has XNU_PROTECTED_IO type. * @@ -1379,6 +1418,15 @@ static inline pt_entry_t pmap_force_pte_kernel_ro_if_protected_io(pmap_paddr_t paddr, pt_entry_t tmplate) { +#if HAS_SPTM_SYSCTL + if (__improbable(disarm_protected_io)) { + /* Make sure disarm_protected_io is read before its counterpart in SPTM */ + os_atomic_thread_fence(acquire); + return tmplate; + } + +#endif /* HAS_SPTM_SYSCTL */ + /** * When requesting RW mappings to an XNU_PROTECTED_IO frame, downgrade * the mapping to RO. This is required because IOKit relies on this @@ -1416,9 +1464,40 @@ pmap_map_bd_with_options( vm_offset_t paddr; pt_entry_t mem_attr; + if (__improbable(start & PAGE_MASK)) { + panic("%s: start 0x%lx is not page aligned", __func__, start); + } + + if (__improbable(end & PAGE_MASK)) { + panic("%s: end 0x%lx is not page aligned", __func__, end); + } + + if (__improbable(!gDramBase || !gDramSize)) { + panic("%s: gDramBase/gDramSize not initialized", __func__); + } + + bool first_page_is_dram = is_dram_addr(start); + for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) { + if (first_page_is_dram != is_dram_addr(pa)) { + panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM", + __func__, pa, first_page_is_dram ? 
"is not" : "is"); + } + } + switch (options & PMAP_MAP_BD_MASK) { case PMAP_MAP_BD_WCOMB: - mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); + if (is_dram_addr(start)) { + mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); + } else { +#if HAS_FEAT_XS + mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS); +#else /* HAS_FEAT_XS */ + mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED); +#endif /* HAS_FEAT_XS */ +#if DEBUG || DEVELOPMENT + pmap_wcrt_on_non_dram_count_increment_atomic(); +#endif /* DEBUG || DEVELOPMENT */ + } mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY); break; case PMAP_MAP_BD_POSTED: @@ -1599,6 +1678,13 @@ pmap_get_arm64_prot( return effective_prot_bits; } +/** + * Helper macros for accessing the "unnested" and "in-progress" bits in + * pmap->nested_region_unnested_table_bitmap. + */ +#define UNNEST_BIT(index) ((index) * 2) +#define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1) + /* * Bootstrap the system enough to run with virtual memory. * @@ -1639,7 +1725,7 @@ pmap_bootstrap( * Initialize the kernel pmap. */ #if ARM_PARAMETERIZED_PMAP - kernel_pmap->pmap_pt_attr = native_pt_attr; + kernel_pmap->pmap_pt_attr = &pmap_pt_attr_16k_kern; #endif /* ARM_PARAMETERIZED_PMAP */ #if HAS_APPLE_PAC kernel_pmap->disable_jop = 0; @@ -1655,10 +1741,6 @@ pmap_bootstrap( kernel_pmap->is_rosetta = FALSE; #endif -#if ARM_PARAMETERIZED_PMAP - kernel_pmap->pmap_pt_attr = native_pt_attr; -#endif /* ARM_PARAMETERIZED_PMAP */ - kernel_pmap->nested_region_addr = 0x0ULL; kernel_pmap->nested_region_size = 0x0ULL; kernel_pmap->nested_region_unnested_table_bitmap = NULL; @@ -1666,7 +1748,13 @@ pmap_bootstrap( kernel_pmap->asid = 0; + /** + * The kernel pmap lock is no longer needed; init it and then destroy it to + * place it in a known-invalid state that will cause any attempt to use it + * to fail. + */ pmap_lock_init(kernel_pmap); + pmap_lock_destroy(kernel_pmap); pmap_max_asids = SPTMArgs->num_asids; @@ -1679,11 +1767,6 @@ pmap_bootstrap( * */ pmap_data_bootstrap(); - /** - * Bootstrap any necessary UAT data structures and values needed from the device tree. - */ - uat_bootstrap(); - /** * Don't make any assumptions about the alignment of avail_start before this * point (i.e., pmap_data_bootstrap() performs allocations). @@ -1771,6 +1854,11 @@ pmap_bootstrap( (void)sptm_features_available(SPTM_FEATURE_SYSREG, &sptm_sysreg_available); #endif /* (DEVELOPMENT || DEBUG) */ +#if __ARM64_PMAP_SUBPAGE_L1__ + /* Initialize the Subpage User Root Table subsystem. 
*/ + surt_init(); +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + /* Signal that the pmap has been bootstrapped */ pmap_bootstrapped = true; } @@ -1825,6 +1913,15 @@ pmap_create_commpage_table(vm_map_address_t rw_va, vm_map_address_t ro_va, commpage_table_pa = new_table; } + pt_desc_t *ptdp = ptd_alloc(temp_commpage_pmap, PMAP_PAGE_ALLOCATE_NOWAIT); + assert(ptdp); + + const unsigned int pai = pa_index(new_table); + locked_pvh_t locked_pvh = pvh_lock(pai); + pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP); + + ptd_info_init(ptdp, temp_commpage_pmap, pt_attr_align_va(pt_attr, i, rw_va), i + 1, NULL); + sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr); sptm_retype(new_table, XNU_DEFAULT, XNU_PAGE_TABLE_COMMPAGE, retype_params); @@ -1833,6 +1930,13 @@ pmap_create_commpage_table(vm_map_address_t rw_va, vm_map_address_t ro_va, sptm_map_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, i, rw_va), (sptm_pt_level_t)i, table_tte); + + ptd_info_finalize(ptdp); + + /* The PTD's assoicated pmap temp_commpage_pmap is to be destroyed, so set it to NULL here. */ + ptdp->pmap = NULL; + + pvh_unlock(&locked_pvh); } /* @@ -1863,6 +1967,7 @@ pmap_create_commpage_table(vm_map_address_t rw_va, vm_map_address_t ro_va, } + /* Unmap the commpage table here so that it won't be deallocated by pmap_destroy(). */ sptm_unmap_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, pt_attr_commpage_level(pt_attr), rw_va), (sptm_pt_level_t)pt_attr_commpage_level(pt_attr)); pmap_destroy(temp_commpage_pmap); @@ -1902,18 +2007,18 @@ pmap_prepare_commpages(void) * this page. */ commpage_default_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS, - commpage_data_pa, commpage_ro_data_pa, commpage_text_pa, 0); + commpage_data_pa, commpage_ro_data_pa, commpage_text_pa, PMAP_CREATE_64BIT); /* * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the * SPTM, if we ever need to support arm64_32 processes in the SPTM. 
- * - * commpage32_default_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS, - * commpage_data_pa, commpage_ro_data_pa, 0, 0); */ + commpage32_default_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS, + commpage_data_pa, commpage_ro_data_pa, 0, 0); + #if __ARM_MIXED_PAGE_SIZE__ commpage_4k_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS, - commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_FORCE_4K_PAGES); + commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_64BIT | PMAP_CREATE_FORCE_4K_PAGES); /* * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the @@ -1944,7 +2049,7 @@ pmap_virtual_region( ) { boolean_t ret = FALSE; -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) if (region_select == 0) { /* * In this config, the bootstrap mappings should occupy their own L2 @@ -1968,9 +2073,9 @@ pmap_virtual_region( ret = TRUE; } #endif -#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */ +#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */ #if defined(ARM_LARGE_MEMORY) - /* For large memory systems with no KTRR/CTRR such as virtual machines */ + /* For large memory systems with no KTRR/CTRR */ if (region_select == 0) { *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK; *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK); @@ -2026,7 +2131,7 @@ pmap_virtual_region( ret = TRUE; } #endif /* defined(ARM_LARGE_MEMORY) */ -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ return ret; } @@ -2066,16 +2171,16 @@ pmap_is_bad_ram(__unused ppnum_t ppn) static void initialize_ram_ranges(void) { - pmap_paddr_t first = first_avail; + __assert_only pmap_paddr_t first = first_avail; pmap_paddr_t end = avail_end; assert(first <= end); assert(first == (first & ~PAGE_MASK)); assert(end == (end & ~PAGE_MASK)); - avail_page_count = atop(end - first); need_ram_ranges_init = false; + avail_page_count = atop(end - first_avail); } unsigned int @@ -2175,14 +2280,14 @@ pmap_init( * structures for pages we allocate to be page tables in * pmap_expand(). */ - _vm_object_allocate(mem_size, pmap_object); + _vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL); pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; /* * Initialize the TXM VM object in the same way as the * PMAP VM object. */ - _vm_object_allocate(mem_size, txm_vm_object); + _vm_object_allocate(mem_size, txm_vm_object, VM_MAP_SERIAL_SPECIAL); txm_vm_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; /* @@ -2223,100 +2328,23 @@ pmap_verify_free(ppnum_t ppnum) return pvh_test_type(pai_to_pvh(pai), PVH_TYPE_NULL); } -#if MACH_ASSERT -/** - * Verify that a given physical page contains no mappings (outside of the - * default physical aperture mapping) and if it does, then panic. - * - * @note It's recommended to use pmap_verify_free() directly when operating in - * the PPL since the PVH lock isn't getting grabbed here (due to this code - * normally being called from outside of the PPL, and the pv_head_table - * can't be modified outside of the PPL). - * - * @param ppnum Physical page number to check there are no mappings to. 
- */ -void -pmap_assert_free(ppnum_t ppnum) + +#if __ARM64_PMAP_SUBPAGE_L1__ +static inline bool +pmap_user_root_size_matches_subpage_l1(vm_size_t root_size) { - const pmap_paddr_t pa = ptoa(ppnum); - - /* Only mappings to kernel-managed physical memory are tracked. */ - if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) { - return; - } - - const unsigned int pai = pa_index(pa); - const uintptr_t pvh = pai_to_pvh(pai); - - /** - * This function is always called from outside of the PPL. Because of this, - * the PVH entry can't be locked. This function is generally only called - * before the VM reclaims a physical page and shouldn't be creating new - * mappings. Even if a new mapping is created while parsing the hierarchy, - * the worst case is that the system will panic in another way, and we were - * already about to panic anyway. - */ - - /** - * Since pmap_verify_free() returned false, that means there is at least one - * mapping left. Let's get some extra info on the first mapping we find to - * dump in the panic string (the common case is that there is one spare - * mapping that was never unmapped). - */ - pt_entry_t *first_ptep = PT_ENTRY_NULL; - - if (pvh_test_type(pvh, PVH_TYPE_PTEP)) { - first_ptep = pvh_ptep(pvh); - } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) { - pv_entry_t *pvep = pvh_pve_list(pvh); - - /* Each PVE can contain multiple PTEs. Let's find the first one. */ - for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) { - first_ptep = pve_get_ptep(pvep, pve_ptep_idx); - if (first_ptep != PT_ENTRY_NULL) { - break; - } - } - - /* The PVE should have at least one valid PTE. */ - assert(first_ptep != PT_ENTRY_NULL); - } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) { - panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)", - __func__, (void*)pvh, pai); - } else { - /** - * The mapping disappeared between here and the pmap_verify_free() call. - * The only way that can happen is if the VM was racing this call with - * a call that unmaps PTEs. Operations on this page should not be - * occurring at the same time as this check, and unfortunately we can't - * lock the PVH entry to prevent it, so just panic instead. - */ - panic("%s: Mapping was detected but is now gone. Is the VM racing this " - "call with an operation that unmaps PTEs? PVH %p (pai: %d)", - __func__, (void*)pvh, pai); - } - - /* Panic with a unique string identifying the first bad mapping and owner. */ - { - /* First PTE is mapped by the main CPUs. */ - pmap_t pmap = ptep_get_pmap(first_ptep); - const char *type = (pmap == kernel_pmap) ? "Kernel" : "User"; - - panic("%s: Found at least one mapping to %#llx. 
First PTEP (%p) is a " - "%s CPU mapping (pmap: %p)", - __func__, (uint64_t)pa, first_ptep, type, pmap); - } + return root_size == 8 * sizeof(tt_entry_t); } -#endif - +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ static vm_size_t pmap_root_alloc_size(pmap_t pmap) { #pragma unused(pmap) const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); - unsigned int root_level = pt_attr_root_level(pt_attr); - return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t); + const unsigned int root_level = pt_attr_root_level(pt_attr); + const uint64_t index = pt_attr_va_valid_mask(pt_attr); + return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t); } /* @@ -2395,6 +2423,11 @@ pmap_create_options_internal( p->nx_enabled = true; p->is_64bit = is_64bit; + + if (!is_64bit) { + sptm_root_flags |= SPTM_ROOT_PT_FLAG_ARM64_32; + } + p->nested_pmap = PMAP_NULL; p->type = PMAP_TYPE_USER; @@ -2424,9 +2457,20 @@ pmap_create_options_internal( panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)pmap_root_size); } +#if __ARM64_PMAP_SUBPAGE_L1__ + /** + * Identify the case where the root qualifies for SURT, and update the + * root size to the TTEs + the SPTM metadata, reflecting the actual + * space taken by this subpage root table. + */ + if (!(flags & PMAP_CREATE_NESTED) && pmap_user_root_size_matches_subpage_l1(pmap_root_size)) { + pmap_root_size = SUBPAGE_USER_ROOT_TABLE_SIZE; + } +#endif + pmap_lock_init(p); - p->tte = pmap_tt1_allocate(p, sptm_root_flags); + p->tte = pmap_tt1_allocate(p, pmap_root_size, sptm_root_flags); if (!(p->tte)) { local_kr = KERN_RESOURCE_SHORTAGE; goto tt1_alloc_fail; @@ -2442,10 +2486,7 @@ pmap_create_options_internal( p->nested_region_size = 0x0ULL; p->nested_region_unnested_table_bitmap = NULL; - p->nested_has_no_bounds_ref = false; - p->nested_no_bounds_refcnt = 0; - p->nested_bounds_set = false; - + p->associated_vm_map_serial_id = VM_MAP_SERIAL_NONE; #if MACH_ASSERT p->pmap_pid = 0; @@ -2606,8 +2647,7 @@ pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, vm_map_addres } /* Remove the TTE. */ - pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE); - pmap_tte_deallocate(pmap, va, ttep, level); + pmap_tte_deallocate(pmap, va, ttep, level, false); } } @@ -2665,11 +2705,11 @@ pmap_destroy_internal( * 1) All prior PTE removals required to empty the pmap have completed and * been synchronized with DSB, *except* the commpage removal which doesn't * involve pages that can ever be retyped. Subsequent calls not already - * in the retype epoch will no longer observe these mappings. + * in the pmap epoch will no longer observe these mappings. * 2) The pmap now has a zero refcount, so in a correctly functioning system * no further mappings will be requested for it. 
*/ - pmap_retype_epoch_prepare_drain(); + pmap_epoch_prepare_drain(); if (!is_stage2_pmap) { pmap_unmap_commpage(pmap); @@ -2679,9 +2719,7 @@ pmap_destroy_internal( queue_remove(&map_pmap_list, pmap, pmap_t, pmaps); pmap_simple_unlock(&pmaps_lock); - pmap_retype_epoch_drain(); - - pmap_trim_self(pmap); + pmap_epoch_drain(); /* * Free the memory maps, then the @@ -2690,7 +2728,35 @@ pmap_destroy_internal( pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pmap->min, pt_attr_root_level(pt_attr)); if (pmap->tte) { - pmap_tt1_deallocate(pmap, pmap->tte); + vm_size_t pmap_root_size = pmap_root_alloc_size(pmap); +#if __ARM64_PMAP_SUBPAGE_L1__ + /** + * Like in the allocation path, identify the case where the root table + * qualifies for SURT. + */ + if (pmap_user_root_size_matches_subpage_l1(pmap_root_size)) { + /** + * Nested tables cannot use SURT, so the allocated size has to be + * PAGE_SIZE. + */ + if (pmap_is_nested(pmap)) { + pmap_root_size = PAGE_SIZE; + } else { + /** + * Note: with SPTM, the kernel pmap is never supposed to be + * destroyed because the SPTM relies on the existence of the + * kernel root table. Also, the commpage-typed pmap doesn't + * exist. Not only is the pmap associated with a commpage + * table transient and destroyed right after the commpage + * table is setup, but also the pmap is just a plain + * PMAP_TYPE_USER typed pmap. + */ + assert(pmap->type == PMAP_TYPE_USER); + pmap_root_size = SUBPAGE_USER_ROOT_TABLE_SIZE; + } + } +#endif + pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_size); pmap->tte = (tt_entry_t *) NULL; pmap->ttep = 0; } @@ -2706,8 +2772,9 @@ pmap_destroy_internal( pmap_check_ledgers(pmap); - if (pmap->nested_region_unnested_table_bitmap) { - bitmap_free(pmap->nested_region_unnested_table_bitmap, pmap->nested_region_size >> pt_attr_twig_shift(pt_attr)); + if ((pmap->type == PMAP_TYPE_NESTED) && (pmap->nested_region_unnested_table_bitmap != NULL)) { + bitmap_free(pmap->nested_region_unnested_table_bitmap, + (pmap->nested_region_size >> (pt_attr_twig_shift(pt_attr) - 1))); } pmap_lock_destroy(pmap); @@ -2763,44 +2830,122 @@ get_sptm_pt_type(pmap_t pmap) } static tt_entry_t * -pmap_tt1_allocate(pmap_t pmap, uint8_t sptm_root_flags) +pmap_tt1_allocate(pmap_t pmap, vm_size_t size, uint8_t sptm_root_flags) { pmap_paddr_t pa = 0; const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); const bool is_stage2_pmap = false; + /** + * Allocate the entire page for root-level page table unless it is subpage + * L1 table, where size will be exactly PMAP_ROOT_ALLOC_SIZE. + */ + if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) { + size = PAGE_SIZE; + } + +#if __ARM64_PMAP_SUBPAGE_L1__ + /** + * At this moment, the allocation size is smaller than the page size only + * when it is a subpage L1 table. We will try to allocate a root table + * from the SURTs (SUbpage Root Tables). + */ + const bool use_surt = (size < PAGE_SIZE); + if (use_surt) { + /* It has to be a user pmap. */ + assert(pmap->type == PMAP_TYPE_USER); + + /** + * Subpage stage 2 root table is not supported. This is guaranteed by + * the stage 2 pmaps using a different pmap geometry than the stage + * 1 pmaps. + */ + assert(!is_stage2_pmap); + + /* Try allocating a SURT from the SURT page queue. */ + pa = surt_try_alloc(); + + /* If there is one SURT available, call SPTM to claim the SURT. 
*/ + if (pa) { + sptm_surt_alloc(surt_page_pa_from_surt_pa(pa), + surt_index_from_surt_pa(pa), + pt_attr->geometry_id, + sptm_root_flags, + pmap->asid); + + /* We don't need to allocate a new page, so skip to the end. */ + goto ptt1a_done; + } + } +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + + /** + * Either the root table size is not suitable for SURT or SURT is out of + * tables. In either case, a page needs to be allocated. + */ const kern_return_t ret = pmap_page_alloc(&pa, PMAP_PAGE_NOZEROFILL); + /* No page is allocated, so return 0 to signal failure. */ if (ret != KERN_SUCCESS) { return (tt_entry_t *)0; } /** - * Drain the epochs to ensure any lingering batched operations that may have taken - * an in-flight reference to this page are complete. + * Drain the epochs to ensure any lingering batched operations that may have + * taken an in-flight reference to this page are complete. */ - pmap_retype_epoch_prepare_drain(); + pmap_epoch_prepare_drain(); assert(pa); - /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size. - * Depending on the device, this can vary between 512b and 16K. */ - OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count)); - pmap_tt_ledger_credit(pmap, PAGE_SIZE); +#if __ARM64_PMAP_SUBPAGE_L1__ + if (use_surt) { + sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; - sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; - retype_params.attr_idx = pt_attr->geometry_id; - retype_params.flags = sptm_root_flags; - if (is_stage2_pmap) { - retype_params.vmid = pmap->vmid; - } else { - retype_params.asid = pmap->asid; + pmap_epoch_drain(); + + /** + * The allocated page is retyped to XNU_SUBPAGE_USER_ROOT_TABLES as the + * container of the SURTs. + */ + sptm_retype(pa, XNU_DEFAULT, XNU_SUBPAGE_USER_ROOT_TABLES, retype_params); + + /** + * Before we add the page to the SURT page queue, claim the first SURT + * for ourselves. This is safe since we are the only one accessing this + * page at this moment. + */ + sptm_surt_alloc(pa, 0, pt_attr->geometry_id, sptm_root_flags, pmap->asid); + + /** + * Add the newly allocated SURT page to the page queue. + */ + surt_feed_page_with_first_table_allocated(pa); + } else +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + { + sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; + retype_params.attr_idx = pt_attr->geometry_id; + retype_params.flags = sptm_root_flags; + if (is_stage2_pmap) { + retype_params.vmid = pmap->vmid; + } else { + retype_params.asid = pmap->asid; + } + + pmap_epoch_drain(); + + sptm_retype(pa, XNU_DEFAULT, is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE : XNU_USER_ROOT_TABLE, + retype_params); } - pmap_retype_epoch_drain(); - - sptm_retype(pa, XNU_DEFAULT, is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE : XNU_USER_ROOT_TABLE, - retype_params); +#if __ARM64_PMAP_SUBPAGE_L1__ +ptt1a_done: +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size. + * Depending on the device, this can vary between 512b and 16K. */ + OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? 
&inuse_kernel_tteroot_count : &inuse_user_tteroot_count)); + pmap_tt_ledger_credit(pmap, size); return (tt_entry_t *) phystokv(pa); } @@ -2808,85 +2953,146 @@ pmap_tt1_allocate(pmap_t pmap, uint8_t sptm_root_flags) static void pmap_tt1_deallocate( pmap_t pmap, - tt_entry_t *tt) + tt_entry_t *tt, + vm_size_t size) { pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)tt); const bool is_stage2_pmap = false; - const sptm_frame_type_t page_type = is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE : - pmap->type == PMAP_TYPE_NESTED ? XNU_SHARED_ROOT_TABLE : XNU_USER_ROOT_TABLE; + + /** + * Free the entire page unless it is subpage L1 table, where size will be + * exactly PMAP_ROOT_ALLOC_SIZE. + */ + if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) { + size = PAGE_SIZE; + } + +#if __ARM64_PMAP_SUBPAGE_L1__ + /** + * At this moment, the free size is smaller than the page size only + * when it is a subpage L1 table. We will try to free the root table + * from the SURT page. + */ + const bool use_surt = (size < PAGE_SIZE); + if (use_surt) { + /* It has to be a user pmap. */ + assert(pmap->type == PMAP_TYPE_USER); + + /* Subpage stage 2 root table is not supported. */ + assert(!is_stage2_pmap); + + /* Before we do anything in pmap, tell SPTM that the SURT is free. */ + sptm_surt_free(surt_page_pa_from_surt_pa(pa), + surt_index_from_surt_pa(pa)); + + /** + * Make sure the SURT bitmap update is not reordered before the SPTM + * rw guard release. + */ + os_atomic_thread_fence(release); + + /** + * Free the SURT in pmap scope, if surt_free() returns false, there + * are still other SURTs on the page. In such case, do not retype + * or free the page; just skip to the end to finish accounting. + */ + if (!surt_free(pa)) { + goto ptt1d_done; + } + + /** + * Make sure the SURT bitmap read is not reordered after the SPTM + * rw guard exclusive acquire in the retype case. + */ + os_atomic_thread_fence(acquire); + } +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + + sptm_frame_type_t page_type; +#if __ARM64_PMAP_SUBPAGE_L1__ + if (use_surt) { + page_type = XNU_SUBPAGE_USER_ROOT_TABLES; + } else +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + if (is_stage2_pmap) { + page_type = XNU_STAGE2_ROOT_TABLE; + } else if (pmap->type == PMAP_TYPE_NESTED) { + page_type = XNU_SHARED_ROOT_TABLE; + } else { + page_type = XNU_USER_ROOT_TABLE; + } sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; - sptm_retype(pa, page_type, XNU_DEFAULT, retype_params); - pmap_page_free(pa); + sptm_retype(pa & ~PAGE_MASK, page_type, XNU_DEFAULT, retype_params); + pmap_page_free(pa & ~PAGE_MASK); - OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count)); - pmap_tt_ledger_debit(pmap, PAGE_SIZE); +#if __ARM64_PMAP_SUBPAGE_L1__ +ptt1d_done: +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + OSAddAtomic(-(int32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count)); + pmap_tt_ledger_debit(pmap, size); } MARK_AS_PMAP_TEXT static kern_return_t pmap_tt_allocate( pmap_t pmap, tt_entry_t **ttp, + pt_desc_t **ptdp_out, unsigned int level, unsigned int options) { pmap_paddr_t pa; - *ttp = NULL; + const unsigned int alloc_flags = + (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0; - if (*ttp == NULL) { - const unsigned int alloc_flags = - (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0; - - /* Allocate a VM page to be used as the page table. 
*/ - if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) { - return KERN_RESOURCE_SHORTAGE; - } - - pt_desc_t *ptdp = ptd_alloc(pmap, alloc_flags); - if (ptdp == NULL) { - pmap_page_free(pa); - return KERN_RESOURCE_SHORTAGE; - } - - unsigned int pai = pa_index(pa); - locked_pvh_t locked_pvh = pvh_lock(pai); - assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL), "%s: non-empty PVH %p", - __func__, (void*)locked_pvh.pvh); - - /** - * Drain the epochs to ensure any lingering batched operations that may have taken - * an in-flight reference to this page are complete. - */ - pmap_retype_epoch_prepare_drain(); - - if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) { - OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count)); - } else { - OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count)); - } - - pmap_tt_ledger_credit(pmap, PAGE_SIZE); - - PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE); - - pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP); - pvh_unlock(&locked_pvh); - - sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; - retype_params.level = (sptm_pt_level_t)level; - - /** - * SPTM TODO: To reduce the cost of draining and retyping, consider caching freed page table pages - * in a small per-CPU bucket and reusing them in preference to calling pmap_page_alloc() above. - */ - pmap_retype_epoch_drain(); - - sptm_retype(pa, XNU_DEFAULT, get_sptm_pt_type(pmap), retype_params); - - *ttp = (tt_entry_t *)phystokv(pa); + /* Allocate a VM page to be used as the page table. */ + if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) { + return KERN_RESOURCE_SHORTAGE; } - assert(*ttp); + pt_desc_t *ptdp = ptd_alloc(pmap, alloc_flags); + if (ptdp == NULL) { + pmap_page_free(pa); + return KERN_RESOURCE_SHORTAGE; + } + + unsigned int pai = pa_index(pa); + locked_pvh_t locked_pvh = pvh_lock(pai); + assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL), "%s: non-empty PVH %p", + __func__, (void*)locked_pvh.pvh); + + /** + * Drain the epochs to ensure any lingering batched operations that may have taken + * an in-flight reference to this page are complete. + */ + pmap_epoch_prepare_drain(); + + if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) { + OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count)); + } else { + OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count)); + } + + pmap_tt_ledger_credit(pmap, PAGE_SIZE); + + PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE); + + pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP); + pvh_unlock(&locked_pvh); + + sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; + retype_params.level = (sptm_pt_level_t)level; + + /** + * SPTM TODO: To reduce the cost of draining and retyping, consider caching freed page table pages + * in a small per-CPU bucket and reusing them in preference to calling pmap_page_alloc() above. + */ + pmap_epoch_drain(); + + sptm_retype(pa, XNU_DEFAULT, get_sptm_pt_type(pmap), retype_params); + + *ptdp_out = ptdp; + *ttp = (tt_entry_t *)phystokv(pa); return KERN_SUCCESS; } @@ -2972,42 +3178,12 @@ pmap_tte_check_refcounts( * synchronize it against the disconnect operation. If that removal caused the * refcount to reach zero, the pagetable page could be freed before the disconnect * operation is finished using the relevant pagetable descriptor. - * Address these cases by waiting until all CPUs have been observed to not be - * executing pmap_disconnect(). 
+ * Address these cases by draining the epochs to ensure other cores are no longer + * consuming the page table we're preparing to delete. */ if (remove_leaf_table) { - bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)]; - const int max_cpu = ml_get_max_cpu_number(); - bitmap_full(&active_disconnects[0], max_cpu + 1); - bool inflight_disconnect; - - /* - * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated - * ahead of any prior PTE load which may have observed the effect of a - * concurrent disconnect operation. An acquire fence is required for this; - * a load-acquire operation is insufficient. - */ - os_atomic_thread_fence(acquire); - do { - inflight_disconnect = false; - for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1); - i >= 0; - i = bitmap_next(&active_disconnects[0], i)) { - const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i); - if (cpu_data == NULL) { - continue; - } - if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) { - __builtin_arm_wfe(); - inflight_disconnect = true; - continue; - } - os_atomic_clear_exclusive(); - bitmap_clear(&active_disconnects[0], (unsigned int)i); - } - } while (inflight_disconnect); - /* Ensure the refcount is observed after any observation of inflight_disconnect */ - os_atomic_thread_fence(acquire); + pmap_epoch_prepare_drain(); + pmap_epoch_drain(); refcnt = sptm_get_page_table_refcnt(tte_to_pa(tte)); } @@ -3099,7 +3275,6 @@ pmap_tte_trim( vm_offset_t va_start, tt_entry_t *ttep) { - pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE); assert(ttep != NULL); const tt_entry_t tte = *ttep; const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap); @@ -3112,8 +3287,6 @@ pmap_tte_trim( const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr); sptm_unnest_region(pmap->ttep, pmap->nested_pmap->ttep, va_start, (pt_attr_twig_size(pt_attr) * page_ratio) >> pt_attr->pta_page_shift); - pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE); - pmap_tte_check_refcounts(pmap, tte, pt_attr_twig_level(pt_attr)); } @@ -3122,22 +3295,25 @@ pmap_tte_trim( * * @note If the TTE to clear out points to a leaf table, then that leaf table * must have a mapping refcount of zero before the TTE can be removed. - * @note This function expects to be called with pmap locked exclusive, and will - * return with pmap unlocked. + * @note If locked_pvh is non-NULL, this function expects to be called with + * the PVH lock held and will return with it unlocked. Otherwise it + * expects pmap to be locked exclusive, and will return with pmap unlocked. * * @param pmap The pmap containing the page table whose TTE is being removed. * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance. * @param ttep Pointer to the TTE that should be cleared out. * @param level The level of the page table that contains the TTE to be removed. + * @param pmap_locked If true, the caller holds an exclusive pmap lock which should + * be dropped after removing the table entry. 
*/ static void pmap_tte_remove( pmap_t pmap, vm_offset_t va_start, tt_entry_t *ttep, - unsigned int level) + unsigned int level, + bool pmap_locked) { - pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE); assert(ttep != NULL); const tt_entry_t tte = *ttep; @@ -3148,7 +3324,9 @@ pmap_tte_remove( sptm_unmap_table(pmap->ttep, pt_attr_align_va(pmap_get_pt_attr(pmap), level, va_start), (sptm_pt_level_t)level); - pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE); + if (pmap_locked) { + pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE); + } pmap_tte_check_refcounts(pmap, tte, level); } @@ -3161,8 +3339,9 @@ pmap_tte_remove( * * @note If the table getting deallocated is a leaf table, then that leaf table * must have a mapping refcount of zero before getting deallocated. - * @note This function expects to be called with pmap locked exclusive and will - * return with pmap unlocked. + * @note If locked_pvh is non-NULL, this function expects to be called with + * the PVH lock held and will return with it unlocked. Otherwise it + * expects pmap to be locked exclusive, and will return with pmap unlocked. * * @param pmap The pmap that owns the page table to be deallocated. * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance. @@ -3171,19 +3350,18 @@ pmap_tte_remove( * table to be removed. The deallocated page table will be a * `level` + 1 table (so if `level` is 2, then an L3 table will be * deleted). + * @param pmap_locked If true, the caller holds an exclusive pmap lock which should + * be dropped after removing the table entry. */ -void +static void pmap_tte_deallocate( pmap_t pmap, vm_offset_t va_start, tt_entry_t *ttep, - unsigned int level) + unsigned int level, + bool pmap_locked) { - tt_entry_t tte; - - pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE); - - tte = *ttep; + tt_entry_t tte = *ttep; if (tte_get_ptd(tte)->pmap != pmap) { panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p", @@ -3193,8 +3371,8 @@ pmap_tte_deallocate( assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep, (unsigned long long)tte); - /* pmap_tte_remove() will drop the pmap lock */ - pmap_tte_remove(pmap, va_start, ttep, level); + /* pmap_tte_remove() will drop the pmap lock if necessary. */ + pmap_tte_remove(pmap, va_start, ttep, level, pmap_locked); pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1); } @@ -3272,16 +3450,16 @@ pmap_remove_range_options( */ disable_preemption(); /** - * Enter the retype epoch for the batched unmap operation. This is necessary because we + * Enter the pmap epoch for the batched unmap operation. This is necessary because we * cannot reasonably hold the PVH locks for all pages mapped by the region during this * call, so a concurrent pmap_page_protect() operation against one of those pages may * race this call. That should be perfectly fine as far as the PTE updates are concerned, * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result * if it does not first drain our epoch. 
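[Editorial note: several hunks in this patch replace the old "retype epoch" and the per-CPU inflight_disconnect flag with a single pmap epoch that readers enter around batched SPTM calls and that writers drain before retyping a page or freeing a page table. The XNU implementation is not reproduced here; this is only a minimal sketch of the same enter/exit/drain idea. MAX_CPUS, cpu_id(), and the per-CPU flag array are assumptions for the example, and it presumes preemption is disabled across enter/exit, as the pmap code does.]

#include <stdatomic.h>
#include <stdbool.h>

#define MAX_CPUS 64

static _Atomic bool epoch_active[MAX_CPUS];

extern unsigned cpu_id(void);   /* hypothetical: index of the calling CPU */

/* Reader side: bracket a batched operation that consumes mappings. */
static void
epoch_enter(void)
{
    atomic_store_explicit(&epoch_active[cpu_id()], true, memory_order_seq_cst);
}

static void
epoch_exit(void)
{
    atomic_store_explicit(&epoch_active[cpu_id()], false, memory_order_release);
}

/*
 * Writer side: wait until every CPU that was inside the epoch has left it,
 * so no one is still consuming the mappings about to be retyped or the
 * table about to be freed. A real implementation would yield or use WFE
 * rather than spin, and would snapshot which CPUs were active at entry.
 */
static void
epoch_drain(void)
{
    for (unsigned i = 0; i < MAX_CPUS; i++) {
        while (atomic_load_explicit(&epoch_active[i], memory_order_acquire)) {
            /* spin */
        }
    }
}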
*/ - pmap_retype_epoch_enter(); + pmap_epoch_enter(); sptm_unmap_region(pmap->ttep, va, num_mappings, sptm_flags); - pmap_retype_epoch_exit(); + pmap_epoch_exit(); sptm_pte_t *prev_ptes = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes; for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) { @@ -3476,13 +3654,13 @@ pmap_remove_options_internal( * 1) All prior PTE removals required to produce refcnt == 0 have * completed and been synchronized for all observers by DSB, and the * relevant PV list entries removed. Subsequent calls not already in the - * retype epoch will no longer observe these mappings. + * pmap epoch will no longer observe these mappings. * 2) We now hold the pmap lock exclusive, so there will be no further attempt * to enter mappings in this page table before it is unmapped. */ - pmap_retype_epoch_prepare_drain(); - pmap_retype_epoch_drain(); - pmap_tte_deallocate(pmap, start, tte_p, pt_attr_twig_level(pt_attr)); + pmap_epoch_prepare_drain(); + pmap_epoch_drain(); + pmap_tte_deallocate(pmap, start, tte_p, pt_attr_twig_level(pt_attr), true); unlock = false; // pmap_tte_deallocate() has dropped the lock } } @@ -3878,7 +4056,7 @@ pmap_multipage_op_submit_disjoint(unsigned int pending_disjoint_entries, pmap_tl * can't be freed. The epoch still protects mappings for any prior page in * the batch, whose PV locks are no longer held. */ - pmap_retype_epoch_exit(); + pmap_epoch_exit(); enable_preemption(); if (flush_range->pending_region_entries != 0) { flush_range->processed_entries += flush_range->pending_disjoint_entries; @@ -3963,7 +4141,7 @@ pmap_multipage_op_add_page( if (pending_disjoint_entries == 0) { disable_preemption(); /** - * Enter the retype epoch while we gather the disjoint update arguments + * Enter the pmap epoch while we gather the disjoint update arguments * and issue the SPTM call. Since this operation may cover multiple physical * pages, we may construct the argument array and invoke the SPTM without holding * all relevant PVH locks or pmap locks. We therefore need to record that we are @@ -3971,7 +4149,7 @@ pmap_multipage_op_add_page( * not attempt to retype the underlying pages and pmap_remove() does not attempt * to free the page tables used for these mappings without first draining our epoch. */ - pmap_retype_epoch_enter(); + pmap_epoch_enter(); flush_range->pending_disjoint_entries = 1; } else { /** @@ -4028,7 +4206,7 @@ pmap_multipage_op_submit_region(pmap_tlb_flush_range_t *flush_range) assert(get_preemption_level() > 0); pmap_assert_locked(flush_range->ptfr_pmap, PMAP_LOCK_SHARED); /** - * If there are any pending disjoint entries, we're already in a retype epoch. + * If there are any pending disjoint entries, we're already in a pmap epoch. * For disjoint entries, we need to hold the epoch during the entire time we * construct the disjoint ops array because those ops may point to some arbitrary * pmap and we need to ensure the relevant page tables and even the pmap itself @@ -4040,14 +4218,14 @@ pmap_multipage_op_submit_region(pmap_tlb_flush_range_t *flush_range) * flight. 
*/ if (flush_range->pending_disjoint_entries == 0) { - pmap_retype_epoch_enter(); + pmap_epoch_enter(); } const sptm_return_t sptm_return = sptm_update_region(flush_range->ptfr_pmap->ttep, flush_range->pending_region_start, flush_range->pending_region_entries, PERCPU_GET(pmap_sptm_percpu)->sptm_templates_pa, SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | SPTM_UPDATE_DEFER_TLBI); if (flush_range->pending_disjoint_entries == 0) { - pmap_retype_epoch_exit(); + pmap_epoch_exit(); } enable_preemption(); if (flush_range->pending_disjoint_entries != 0) { @@ -4121,7 +4299,8 @@ pmap_multipage_op_submit(pmap_tlb_flush_range_t *flush_range) * This is an internal-only flag that indicates the caller of pmap_page_protect_options_with_flush_range() * is removing/updating all mappings in preparation for a retype operation. In this case * pmap_page_protect_options() will assume (and assert) that the PVH lock for the physical page is held - * by the calller, and will perform the necessary retype epoch drain prior to returning. + * by the calller, and will perform the necessary pmap epoch drain and retype the page back to XNU_DEFAULT + * prior to returning. */ #define PMAP_OPTIONS_PPO_PENDING_RETYPE 0x80000000 _Static_assert(PMAP_OPTIONS_PPO_PENDING_RETYPE & PMAP_OPTIONS_RESERVED_MASK, @@ -4135,9 +4314,13 @@ _Static_assert(PMAP_OPTIONS_PPO_PENDING_RETYPE & PMAP_OPTIONS_RESERVED_MASK, * @param prot The permission to lower to. * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed. * PMAP_OPTIONS_PPO_PENDING_RETYPE indicates the PVH lock for ppnum is - * already locked and a retype epoch drain shold be performed. + * already locked and a pmap epoch drain shold be performed, along with + * retyping [ppnum] back to XNU_DEFAULT. * PMAP_OPTIONS_COMPRESSOR indicates the function is called by the * VM compressor. + * PMAP_OPTIONS_RETYPE requests the [ppnum] be retyped back to XNU_DEFAULT, + * along with an epoch drain; like PMAP_OPTIONS_PPO_PENDING_RETYPE but without + * the PVH lock being held by the caller. * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked * by the caller. This is an input/output parameter which may be updated * to reflect a new PV head value to be passed to a later call to pvh_unlock(). @@ -4236,6 +4419,7 @@ pmap_page_protect_options_with_flush_range( pvh_assert_locked(pai); bool pvh_lock_sleep_mode_needed = false; + bool clear_epoch = false; /* * PVH should be locked before accessing per-CPU data, as we're relying on the lock @@ -4263,17 +4447,8 @@ pmap_page_protect_options_with_flush_range( sptm_ptds = sptm_pcpu->sptm_ptds; \ sptm_ptd_info = sptm_pcpu->sptm_ptd_info; \ if (remove) { \ - os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed); \ - /* \ - * Ensure the store to inflight_disconnect will be observed before any of the - * ensuing PTE/refcount stores in this function. This flag is used to avoid - * a race in which the VM may clear a pmap's mappings and destroy the pmap on - * another CPU, in between this function's clearing a PTE and dropping the - * corresponding pagetable refcount. That can lead to a panic if the - * destroying thread observes a non-zero refcount. For this we need a store- - * store barrier; a store-release operation would not be sufficient. 
- */ \ - os_atomic_thread_fence(release); \ + clear_epoch = true; \ + pmap_epoch_enter(); \ } \ } while (0) @@ -4320,13 +4495,9 @@ pmap_page_protect_options_with_flush_range( while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) { if (__improbable(pvh_lock_sleep_mode_needed)) { assert((num_mappings == 0) && (num_skipped_mappings == 0)); - if (remove) { - /** - * Clear the in-flight disconnect indicator for the current CPU, as we've - * already submitted any prior pending SPTM operations, and we're about to - * briefly re-enable preemption which may cause this thread to be migrated. - */ - os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release); + if (clear_epoch) { + pmap_epoch_exit(); + clear_epoch = false; } /** * Undo the explicit preemption disable done in the last call to PPO_PER_CPU_INIT(). @@ -4610,8 +4781,8 @@ protect_skip_pve: } } - if (remove) { - os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release); + if (clear_epoch) { + pmap_epoch_exit(); } /** @@ -4626,21 +4797,31 @@ protect_skip_pve: /* if we removed a bunch of entries, take care of them now */ if (remove) { /** - * If we (or our caller as indicated by PMAP_OPTIONS_PPO_PENDING_RETYPE) will - * be retyping the page, we need to drain the epochs to ensure that concurrent - * calls to batched operations such as pmap_remove() and the various multipage - * attribute update functions have finished consuming mappings of this page. + * If a retype is going to be needed here and/or by our caller, drain + * the epochs to ensure that concurrent calls to batched operations such as + * pmap_remove() and the various multipage attribute update functions have + * finished consuming mappings of this page. */ - const bool needs_retyping = pmap_prepare_unmapped_page_for_retype(phys); - if ((options & PMAP_OPTIONS_PPO_PENDING_RETYPE) && !needs_retyping) { + bool retype_needed = false; + sptm_frame_type_t frame_type = XNU_DEFAULT; + if (options & (PMAP_OPTIONS_PPO_PENDING_RETYPE | PMAP_OPTIONS_RETYPE)) { /** - * pmap_prepare_unmapped_page_for_retype() will only return true if - * the page belongs to a certain set of types that need to be auto- - * retyped back to XNU_DEFAULT when they are unmapped. But if the - * caller indicated that it's going to retype the page, we need - * to drain the epochs regardless of the current page type. + * If the frame type isn't currently XNU_DEFAULT, retype it back either + * to satisfy the caller's request (PMAP_OPTIONS_RETYPE) or to ensure + * the caller's subsequent retype will work as not all non-default types + * can be directly retyped to one another without going through XNU_DEFAULT. */ - pmap_retype_epoch_prepare_drain(); + frame_type = sptm_get_frame_type(phys); + retype_needed = (frame_type != XNU_DEFAULT); + } + /** + * If the caller is indicating that it will subsequently retype the page + * by passing PMAP_OPTIONS_PPO_PENDING_RETYPE, then we'll need to drain the epochs + * regardless of current frame type to prepare for the caller's retype. + */ + const bool drain_needed = retype_needed || !!(options & PMAP_OPTIONS_PPO_PENDING_RETYPE); + if (__improbable(drain_needed)) { + pmap_epoch_prepare_drain(); } if (new_pve_p != PV_ENTRY_NULL) { pvh_update_head(&local_locked_pvh, new_pve_p, PVH_TYPE_PVEP); @@ -4651,10 +4832,12 @@ protect_skip_pve: pvh_update_head(&local_locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL); } - /* If removing the last mapping to a specially-protected page, retype the page back to XNU_DEFAULT. 
*/ - const bool retype_needed = pmap_retype_unmapped_page(phys); - if ((options & PMAP_OPTIONS_PPO_PENDING_RETYPE) && !retype_needed) { - pmap_retype_epoch_drain(); + if (__improbable(drain_needed)) { + pmap_epoch_drain(); + } + if (__improbable(retype_needed)) { + const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; + sptm_retype(phys, frame_type, XNU_DEFAULT, retype_params); } } @@ -4905,7 +5088,7 @@ pmap_protect_options_internal( * Removing "NX" would grant "execute" access immediately, bypassing any * checks VM might want to do in its soft fault path. * pmap_protect() and co. are not allowed to increase access permissions, - * except in the PMAP_PROTECT_OPTIONS_IMMEDIATE internal-only case. + * except in the PMAP_OPTIONS_PROTECT_IMMEDIATE internal-only case. * Therefore, if we are not explicitly clearing execute permissions, inherit * the existing permissions. */ @@ -4923,20 +5106,16 @@ pmap_protect_options_internal( * PMAP_OPTIONS_PROTECT_IMMEDIATE is an internal-only option that's intended to * provide a "backdoor" to allow normally write-protected compressor pages to be * be temporarily written without triggering expensive write faults. - * SPTM TODO: Given the intended use of this flag, we may be able to relax some - * of our assumptions below when it comes to ref/mod accounting, and we may be - * able to avoid holding the PVH lock across the SPTM mapping operation and the - * ref/mod updates. This will be important if we move to a batched SPTM mapping - * API. */ - if (force_write) { + while (force_write) { if (spte == ARM_PTE_EMPTY) { spte = os_atomic_load(pte_p, relaxed); } + const pt_entry_t prev_pte = spte; - /* A concurrent remove or disconnect may have cleared the PTE. */ + /* A concurrent disconnect may have cleared the PTE. */ if (__improbable(!pte_is_valid(spte))) { - goto pmap_protect_insert_mapping; + break; } /* Inherit permissions and "was_writeable" from the template. */ @@ -4950,25 +5129,38 @@ pmap_protect_options_internal( locked_pvh_t locked_pvh; if (pa_valid(pa)) { locked_pvh = pvh_lock(pai); + + /** + * The VM may concurrently call pmap_disconnect() on the compressor + * page in question, e.g. if relocating the page to satisfy a precious + * allocation. Now that we hold the PVH lock, re-check the PTE and + * restart the loop if it's different from the value we read before + * we held the lock. + */ + if (__improbable(os_atomic_load(pte_p, relaxed) != prev_pte)) { + pvh_unlock(&locked_pvh); + spte = ARM_PTE_EMPTY; + continue; + } ppattr_modify_bits(pai, PP_ATTR_REFFAULT | PP_ATTR_MODFAULT, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED); } __assert_only const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, va, spte); - /* - * We don't expect the VM to be concurrently removing these compressor mappings. - * If it does for some reason, we can check for SPTM_MAP_FLUSH_PENDING and continue - * the main loop. + /** + * We don't expect the VM to be concurrently calling pmap_remove() against these + * compressor mappings. If it does for some reason, that could cause the above + * call to return either SPTM_SUCCESS or SPTM_MAP_FLUSH_PENDING. 
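[Editorial note: the force_write loop above uses a common "optimistic read, verify under lock, retry" shape: sample the PTE without the PVH lock, take the lock, and restart if the value changed in the meantime (for example because of a concurrent pmap_disconnect()). A generic sketch of that shape, with placeholder types and a pthread mutex standing in for the PVH lock:]

#include <stdatomic.h>
#include <stdint.h>
#include <pthread.h>

typedef uint64_t pte_t;

static _Atomic pte_t slot;
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Apply fn to the slot only if it has not changed between the unlocked
 * sample and acquiring the lock; otherwise restart with the fresh value.
 */
static void
update_checked(pte_t (*fn)(pte_t))
{
    for (;;) {
        pte_t sampled = atomic_load_explicit(&slot, memory_order_relaxed);
        pthread_mutex_lock(&slot_lock);
        if (atomic_load_explicit(&slot, memory_order_relaxed) != sampled) {
            pthread_mutex_unlock(&slot_lock);
            continue;   /* raced with another updater: retry */
        }
        atomic_store_explicit(&slot, fn(sampled), memory_order_relaxed);
        pthread_mutex_unlock(&slot_lock);
        return;
    }
}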
*/ - assert((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)); + assert3u(sptm_status, ==, SPTM_MAP_VALID); if (pa_valid(pa)) { pvh_unlock(&locked_pvh); } + break; } -pmap_protect_insert_mapping: #endif /* DEVELOPMENT || DEBUG */ va += pmap_page_size; @@ -4982,17 +5174,17 @@ pmap_protect_insert_mapping: ++num_mappings; if (num_mappings == SPTM_MAPPING_LIMIT) { /** - * Enter the retype epoch for the batched update operation. This is necessary because we + * Enter the pmap epoch for the batched update operation. This is necessary because we * cannot reasonably hold the PVH locks for all pages mapped by the region during this * call, so a concurrent pmap_page_protect() operation against one of those pages may * race this call. That should be perfectly fine as far as the PTE updates are concerned, * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result * if it does not first drain our epoch. */ - pmap_retype_epoch_enter(); + pmap_epoch_enter(); sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_PERMS_AND_WAS_WRITABLE); - pmap_retype_epoch_exit(); + pmap_epoch_exit(); need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings); /* Temporarily re-enable preemption to allow any urgent ASTs to be processed. */ @@ -5007,10 +5199,10 @@ pmap_protect_insert_mapping: /* This won't happen in the force_write case as we should never increment num_mappings. */ if (num_mappings != 0) { - pmap_retype_epoch_enter(); + pmap_epoch_enter(); sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_PERMS_AND_WAS_WRITABLE); - pmap_retype_epoch_exit(); + pmap_epoch_exit(); need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings); } @@ -5213,6 +5405,261 @@ pmap_enter( return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, mapping_type); } +/** + * Helper function for determining the frame type that will be required for a physical page given + * a set of mapping constraints. + * + * @param pmap The address space in which the page will be mapped. + * @param pte The fully-configured page table entry, including permissions and output address, that + * will be used for the mapping. + * @param vaddr The virtual address that will be mapped using [pte] + * @param options Extra mapping options that would be passed to pmap_enter() when performing the mapping + * @param mapping_type The mapping type enum that would be passed to pmap_enter() when performing the mapping + * @param prev_frame_type Output param that will store the existing frame type for the physical page + * mapped by [pte]. 
As an optimization, this will only be queried if [*new_frame_type] + * is determined to be something other than XNU_DEFAULT, otherwise it will be assumed + * to be XNU_DEFAULT + * @param new_frame_type Output param that will store the new frame type that will be required for the + * physical page mapped by [pte] + */ +static inline void +pmap_frame_type_for_pte( + pmap_t pmap __assert_only, + pt_entry_t pte, + vm_map_address_t vaddr __assert_only, + unsigned int options, + pmap_mapping_type_t mapping_type, + sptm_frame_type_t *prev_frame_type, + sptm_frame_type_t *new_frame_type) +{ + const pmap_paddr_t paddr = pte_to_pa(pte) & ~PAGE_MASK; + assert(prev_frame_type != NULL); + assert(new_frame_type != NULL); + *prev_frame_type = *new_frame_type = XNU_DEFAULT; + + /* + * If the caller specified a mapping type of PMAP_MAPPINGS_TYPE_INFER, then we + * keep the existing logic of deriving the SPTM frame type from the XPRR permissions. + * + * If the caller specified another mapping type, we simply follow that. This refactor was + * needed for the XNU_KERNEL_RESTRICTED work, and it also allows us to be more precise at + * what we want. It's better to let the caller specify the mapping type rather than use the + * permissions for that. + * + * In the future, we should move entirely to use pmap_mapping_type_t; see rdar://114886323. + */ + if (mapping_type != PMAP_MAPPING_TYPE_INFER) { + switch (mapping_type) { + case PMAP_MAPPING_TYPE_DEFAULT: + *new_frame_type = (sptm_frame_type_t)mapping_type; + break; + case PMAP_MAPPING_TYPE_ROZONE: + assert(((pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + pt_attr_page_size(pmap_get_pt_attr(pmap))))); + *new_frame_type = (sptm_frame_type_t)mapping_type; + break; + case PMAP_MAPPING_TYPE_RESTRICTED: + if (use_xnu_restricted) { + *new_frame_type = (sptm_frame_type_t)mapping_type; + } else { + *new_frame_type = XNU_DEFAULT; + } + break; + default: + panic("invalid mapping type: %d", mapping_type); + } + } else if (__improbable(pte_to_xprr_perm(pte) == XPRR_USER_JIT_PERM)) { + /* + * Always check for XPRR_USER_JIT_PERM before we check for anything else. When using + * RWX permissions, the only allowed type is XNU_USER_JIT, regardless of any other + * flags which the VM may have provided. + * + * TODO: Assert that the PMAP_OPTIONS_XNU_USER_DEBUG flag isn't set when entering + * this case. We can't do this for now because this might trigger on some macOS + * systems where applications use MAP_JIT with RW/RX permissions, and then later + * switch to RWX (which will cause a switch to XNU_USER_JIT from XNU_USER_DEBUG + * but the VM will still have PMAP_OPTIONS_XNU_USER_DEBUG set). If the VM can + * catch this case, and remove PMAP_OPTIONS_XNU_USER_DEBUG when an application + * switches to RWX, then we can start asserting this requirement. + */ + *new_frame_type = XNU_USER_JIT; + } else if (__improbable(options & PMAP_OPTIONS_XNU_USER_DEBUG)) { + /* + * Both XNU_USER_DEBUG and XNU_USER_EXEC allow RX permissions. Given that, we must + * test for PMAP_OPTIONS_XNU_USER_DEBUG before we test for XNU_USER_EXEC since the + * XNU_USER_DEBUG type overlays the XNU_USER_EXEC type. 
+ */ + *new_frame_type = XNU_USER_DEBUG; + } else if (pte_to_xprr_perm(pte) == XPRR_USER_RX_PERM) { + *new_frame_type = XNU_USER_EXEC; + } else if ((pte_to_xprr_perm(pte) == XPRR_USER_RW_PERM) || + (pte_was_writeable(pte) && (pte_to_xprr_perm(pte) == XPRR_USER_RO_PERM))) { + /** + * Allow retyping from user executable types (except XNU_USER_DEBUG, which already + * allows user RW mappings) back to XNU_DEFAULT if a writable mapping is requested. + * Our retype logic will disconnect all existing mappings, so future attempts to + * execute these pages will fault, retype back to exec, and go back through any + * needed CS validation. For all other current frame types, just leave the previous + * and new frame types unchanged; for most other types attempting to add a user RW + * mapping is a bug and we should just let the SPTM throw a violation. + */ + const sptm_frame_type_t cur_frame_type = sptm_get_frame_type(paddr); + if (__improbable(sptm_type_is_user_executable(cur_frame_type) && + (cur_frame_type != XNU_USER_DEBUG))) { + *prev_frame_type = cur_frame_type; + } + } + + if (__improbable(*new_frame_type != XNU_DEFAULT)) { + *prev_frame_type = sptm_get_frame_type(paddr); + } +} + +/* + * Construct a PTE (and the physical page attributes) for the given virtual to + * physical mapping. + * + * @param pmap The pmap representing the address space for which to construct + * the mapping. + * @param pa The physical address to be mapped by the new PTE. + * @param prot Access permissions to apply to the new PTE. + * @param fault_type The type of access fault that is triggering the request + * to construct the new PTE. + * @param wired Whether the new PTE should have the wired bit set. + * @param pp_attr_bits Output parameter that will return the physical page attributes + * to apply to pp_attr_table for the new mapping. + * + * This function has no side effects and is safe to call while attempting a + * pmap_enter transaction. 
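[Editorial note: pmap_frame_type_for_pte() above encodes a strict precedence when inferring the frame type from permissions: an RWX (USER_JIT) mapping always wins, then the debug option, then plain user RX, and everything else stays XNU_DEFAULT. A condensed decision helper showing just that ordering; the enums and the option flag below are local simplifications that merely reuse the names from the patch, not the SPTM definitions.]

typedef enum { XNU_DEFAULT, XNU_USER_JIT, XNU_USER_DEBUG, XNU_USER_EXEC } frame_type_t;
typedef enum { PERM_OTHER, PERM_USER_RX, PERM_USER_RWX } xprr_perm_t;

#define OPT_USER_DEBUG 0x1u

/* Order matters: RWX beats the debug option, which beats plain RX. */
static frame_type_t
frame_type_for_perm(xprr_perm_t perm, unsigned options)
{
    if (perm == PERM_USER_RWX) {
        return XNU_USER_JIT;     /* RWX (MAP_JIT) always selects the JIT type */
    }
    if (options & OPT_USER_DEBUG) {
        return XNU_USER_DEBUG;   /* the debug type overlays the plain executable type */
    }
    if (perm == PERM_USER_RX) {
        return XNU_USER_EXEC;
    }
    return XNU_DEFAULT;
}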
+ */ +MARK_AS_PMAP_TEXT static pt_entry_t +pmap_construct_pte( + const pmap_t pmap, + pmap_paddr_t pa, + vm_prot_t prot, + vm_prot_t fault_type, + boolean_t wired, + uint16_t *pp_attr_bits /* OUTPUT */ + ) +{ + const pt_attr_t* const pt_attr = pmap_get_pt_attr(pmap); + bool set_NX = false, set_XO = false; + pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID; + assert(pp_attr_bits != NULL); + *pp_attr_bits = 0; + + if (wired) { + pte |= ARM_PTE_WIRED; + } + +#if DEVELOPMENT || DEBUG + if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled) +#else + if ((prot & VM_PROT_EXECUTE)) +#endif + { + set_NX = false; + } else { + set_NX = true; + } + + if (prot == VM_PROT_EXECUTE) { + set_XO = true; + + } + + if (set_NX) { + pte |= pt_attr_leaf_xn(pt_attr); + } else { + if (pmap == kernel_pmap) { + pte |= ARM_PTE_NX; + } else { + pte |= pt_attr_leaf_x(pt_attr); + } + } + + if (pmap == kernel_pmap) { +#if __ARM_KERNEL_PROTECT__ + pte |= ARM_PTE_NG; +#endif /* __ARM_KERNEL_PROTECT__ */ + if (prot & VM_PROT_WRITE) { + pte |= ARM_PTE_AP(AP_RWNA); + *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED; + } else { + pte |= ARM_PTE_AP(AP_RONA); + *pp_attr_bits |= PP_ATTR_REFERENCED; + } + } else { + if (pmap->type != PMAP_TYPE_NESTED) { + pte |= ARM_PTE_NG; + } + if (prot & VM_PROT_WRITE) { + assert(pmap->type != PMAP_TYPE_NESTED); + if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) { + if (fault_type & VM_PROT_WRITE) { + pte |= pt_attr_leaf_rw(pt_attr); + *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED; + } else { + pte |= pt_attr_leaf_ro(pt_attr); + /* + * Mark the page as MODFAULT so that a subsequent write + * may be handled through arm_fast_fault(). + */ + *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT; + pte_set_was_writeable(pte, true); + } + } else { + pte |= pt_attr_leaf_rw(pt_attr); + *pp_attr_bits |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED); + } + } else { + if (set_XO) { + pte |= pt_attr_leaf_rona(pt_attr); + } else { + pte |= pt_attr_leaf_ro(pt_attr); + } + *pp_attr_bits |= PP_ATTR_REFERENCED; + } + } + + pte |= ARM_PTE_AF; + return pte; +} + +/** + * This function allows the VM to query whether a mapping operation will result in a page being + * retyped, without actually performing the mapping operation. It's useful for the VM to know + * this when performing up-front page validation under the VM object lock. 
+ * + * @param pmap The address space in which the mapping will occur + * @param vaddr The virtual address that will be mapped + * @param pn The physical page number to be mapped by [vaddr] + * @param prot The permissions to be used for the mapping + * @param options The extra mapping options that would be passed to pmap_enter() if the + * mapping operation were performed + * @param mapping_type The mapping type enum that would be passed to pmap_enter() if the + * mapping operation were performed + * + * @return True if the mapping operation would produce a retype of the page at [pn], + * False otherwise + */ +bool +pmap_will_retype( + pmap_t pmap, + vm_map_address_t vaddr, + ppnum_t pn, + vm_prot_t prot, + unsigned int options, + pmap_mapping_type_t mapping_type) +{ + const pmap_paddr_t paddr = ptoa(pn); + uint16_t pp_attr_bits; + pt_entry_t pte = pmap_construct_pte(pmap, paddr, prot, prot, false, &pp_attr_bits); + sptm_frame_type_t prev_frame_type, new_frame_type; + pmap_frame_type_for_pte(pmap, pte, vaddr, options, mapping_type, &prev_frame_type, &new_frame_type); + + return new_frame_type != prev_frame_type; +} + /* * Attempt to update a PTE constructed by pmap_enter_options(). * @@ -5260,68 +5707,10 @@ pmap_enter_pte( assert(get_preemption_level() > 0); const pmap_paddr_t pa = pte_to_pa(new_pte) & ~PAGE_MASK; - sptm_frame_type_t prev_frame_type = XNU_DEFAULT; - sptm_frame_type_t new_frame_type = XNU_DEFAULT; + sptm_frame_type_t prev_frame_type; + sptm_frame_type_t new_frame_type; - /* - * If the caller specified a mapping type of PMAP_MAPPINGS_TYPE_INFER, then we - * keep the existing logic of deriving the SPTM frame type from the XPRR permissions. - * - * If the caller specified another mapping type, we simply follow that. This refactor was - * needed for the XNU_KERNEL_RESTRICTED work, and it also allows us to be more precise at - * what we want. It's better to let the caller specify the mapping type rather than use the - * permissions for that. - * - * In the future, we should move entirely to use pmap_mapping_type_t; see rdar://114886323. - */ - if (mapping_type != PMAP_MAPPING_TYPE_INFER) { - switch (mapping_type) { - case PMAP_MAPPING_TYPE_DEFAULT: - new_frame_type = (sptm_frame_type_t)mapping_type; - break; - case PMAP_MAPPING_TYPE_ROZONE: - assert(((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pmap_get_pt_attr(pmap))))); - new_frame_type = (sptm_frame_type_t)mapping_type; - break; - case PMAP_MAPPING_TYPE_RESTRICTED: - if (use_xnu_restricted) { - new_frame_type = (sptm_frame_type_t)mapping_type; - } else { - new_frame_type = XNU_DEFAULT; - } - break; - default: - panic("invalid mapping type: %d", mapping_type); - } - } else if (__improbable(pte_to_xprr_perm(new_pte) == XPRR_USER_JIT_PERM)) { - /* - * Always check for XPRR_USER_JIT_PERM before we check for anything else. When using - * RWX permissions, the only allowed type is XNU_USER_JIT, regardless of any other - * flags which the VM may have provided. - * - * TODO: Assert that the PMAP_OPTIONS_XNU_USER_DEBUG flag isn't set when entering - * this case. We can't do this for now because this might trigger on some macOS - * systems where applications use MAP_JIT with RW/RX permissions, and then later - * switch to RWX (which will cause a switch to XNU_USER_JIT from XNU_USER_DEBUG - * but the VM will still have PMAP_OPTIONS_XNU_USER_DEBUG set). If the VM can - * catch this case, and remove PMAP_OPTIONS_XNU_USER_DEBUG when an application - * switches to RWX, then we can start asserting this requirement. 
- */ - new_frame_type = XNU_USER_JIT; - } else if (__improbable(options & PMAP_OPTIONS_XNU_USER_DEBUG)) { - /* - * Both XNU_USER_DEBUG and XNU_USER_EXEC allow RX permissions. Given that, we must - * test for PMAP_OPTIONS_XNU_USER_DEBUG before we test for XNU_USER_EXEC since the - * XNU_USER_DEBUG type overlays the XNU_USER_EXEC type. - */ - new_frame_type = XNU_USER_DEBUG; - } else if (pte_to_xprr_perm(new_pte) == XPRR_USER_RX_PERM) { - new_frame_type = XNU_USER_EXEC; - } - - if (__improbable(new_frame_type != XNU_DEFAULT)) { - prev_frame_type = sptm_get_frame_type(pa); - } + pmap_frame_type_for_pte(pmap, new_pte, v, options, mapping_type, &prev_frame_type, &new_frame_type); if (__improbable(new_frame_type != prev_frame_type)) { /** @@ -5346,10 +5735,35 @@ pmap_enter_pte( sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; /* Reload the existing frame type, as pmap_page_protect_options() may have changed it back to XNU_DEFAULT. */ prev_frame_type = sptm_get_frame_type(pa); - sptm_retype(pa, prev_frame_type, new_frame_type, retype_params); + if (new_frame_type != prev_frame_type) { + sptm_retype(pa, prev_frame_type, new_frame_type, retype_params); + } } + if (pmap->type == PMAP_TYPE_NESTED) { + /** + * Enter the epoch before we check the unnesting state of the leaf page table, so that a + * concurrent pmap_unnest() operation can guarantee that we either observe the unnested + * table state and install a non-global mapping, or have finished installing a global mapping + * before it marks all existing mappings as non-global. + */ + pmap_epoch_enter(); + vm_map_offset_t nested_region_size = os_atomic_load(&pmap->nested_region_size, acquire); + if (nested_region_size && (v >= pmap->nested_region_addr) && (v < (pmap->nested_region_addr + nested_region_size))) { + assert(pmap->nested_region_addr != 0); + assert(pmap->nested_region_unnested_table_bitmap != NULL); + unsigned int index = (unsigned int)((v - pmap->nested_region_addr) >> + pt_attr_twig_shift(pmap_get_pt_attr(pmap))); + + if ((bitmap_test(pmap->nested_region_unnested_table_bitmap, UNNEST_IN_PROGRESS_BIT(index)))) { + new_pte |= ARM_PTE_NG; + } + } + } const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, v, new_pte); + if (pmap->type == PMAP_TYPE_NESTED) { + pmap_epoch_exit(); + } if (__improbable((sptm_status != SPTM_SUCCESS) && (sptm_status != SPTM_MAP_VALID))) { /* * We should always undo our previous retype, even if the SPTM returned SPTM_MAP_FLUSH_PENDING as @@ -5411,7 +5825,18 @@ wimg_to_pte(unsigned int wimg, pmap_paddr_t pa) pte |= ARM_PTE_NX | ARM_PTE_PNX; break; case VM_WIMG_RT: - pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT); + if (is_dram_addr(pa)) { + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT); + } else { +#if HAS_FEAT_XS + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS); +#else /* HAS_FEAT_XS */ + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED); +#endif /* HAS_FEAT_XS */ +#if DEBUG || DEVELOPMENT + pmap_wcrt_on_non_dram_count_increment_atomic(); +#endif /* DEBUG || DEVELOPMENT */ + } pte |= ARM_PTE_NX | ARM_PTE_PNX; break; case VM_WIMG_POSTED: @@ -5440,7 +5865,18 @@ wimg_to_pte(unsigned int wimg, pmap_paddr_t pa) pte |= ARM_PTE_NX | ARM_PTE_PNX; break; case VM_WIMG_WCOMB: - pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); + if (is_dram_addr(pa)) { + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB); + } else { +#if HAS_FEAT_XS + pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS); +#else /* HAS_FEAT_XS */ + pte = 
ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED); +#endif /* HAS_FEAT_XS */ +#if DEBUG || DEVELOPMENT + pmap_wcrt_on_non_dram_count_increment_atomic(); +#endif /* DEBUG || DEVELOPMENT */ + } pte |= ARM_PTE_NX | ARM_PTE_PNX; break; case VM_WIMG_WTHRU: @@ -5463,118 +5899,6 @@ wimg_to_pte(unsigned int wimg, pmap_paddr_t pa) return pte; } - -/* - * Construct a PTE (and the physical page attributes) for the given virtual to - * physical mapping. - * - * This function has no side effects and is safe to call so that it is safe to - * call while attempting a pmap_enter transaction. - */ -MARK_AS_PMAP_TEXT static pt_entry_t -pmap_construct_pte( - const pmap_t pmap, - vm_map_address_t va, - pmap_paddr_t pa, - vm_prot_t prot, - vm_prot_t fault_type, - boolean_t wired, - const pt_attr_t* const pt_attr, - uint16_t *pp_attr_bits /* OUTPUT */ - ) -{ - bool set_NX = false, set_XO = false; - pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID; - assert(pp_attr_bits != NULL); - *pp_attr_bits = 0; - - if (wired) { - pte |= ARM_PTE_WIRED; - } - -#if DEVELOPMENT || DEBUG - if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled) -#else - if ((prot & VM_PROT_EXECUTE)) -#endif - { - set_NX = false; - } else { - set_NX = true; - } - - if (prot == VM_PROT_EXECUTE) { - set_XO = true; - - } - - if (set_NX) { - pte |= pt_attr_leaf_xn(pt_attr); - } else { - if (pmap == kernel_pmap) { - pte |= ARM_PTE_NX; - } else { - pte |= pt_attr_leaf_x(pt_attr); - } - } - - if (pmap == kernel_pmap) { -#if __ARM_KERNEL_PROTECT__ - pte |= ARM_PTE_NG; -#endif /* __ARM_KERNEL_PROTECT__ */ - if (prot & VM_PROT_WRITE) { - pte |= ARM_PTE_AP(AP_RWNA); - *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED; - } else { - pte |= ARM_PTE_AP(AP_RONA); - *pp_attr_bits |= PP_ATTR_REFERENCED; - } - } else { - if (pmap->type != PMAP_TYPE_NESTED) { - pte |= ARM_PTE_NG; - } else if ((pmap->nested_region_unnested_table_bitmap) - && (va >= pmap->nested_region_addr) - && (va < (pmap->nested_region_addr + pmap->nested_region_size))) { - unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr)); - - if ((pmap->nested_region_unnested_table_bitmap) - && bitmap_test(pmap->nested_region_unnested_table_bitmap, index)) { - pte |= ARM_PTE_NG; - } - } - if (prot & VM_PROT_WRITE) { - assert(pmap->type != PMAP_TYPE_NESTED); - if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) { - if (fault_type & VM_PROT_WRITE) { - pte |= pt_attr_leaf_rw(pt_attr); - *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED; - } else { - pte |= pt_attr_leaf_ro(pt_attr); - /* - * Mark the page as MODFAULT so that a subsequent write - * may be handled through arm_fast_fault(). 
- */ - *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT; - pte_set_was_writeable(pte, true); - } - } else { - pte |= pt_attr_leaf_rw(pt_attr); - *pp_attr_bits |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED); - } - } else { - if (set_XO) { - pte |= pt_attr_leaf_rona(pt_attr); - } else { - pte |= pt_attr_leaf_ro(pt_attr); - } - *pp_attr_bits |= PP_ATTR_REFERENCED; - } - } - - pte |= ARM_PTE_AF; - return pte; -} - MARK_AS_PMAP_TEXT kern_return_t pmap_enter_options_internal( pmap_t pmap, @@ -5593,7 +5917,6 @@ pmap_enter_options_internal( bool committed = false; kern_return_t kr = KERN_SUCCESS; uint16_t pp_attr_bits; - volatile uint16_t *wiredcnt = NULL; pv_free_list_t *local_pv_free; validate_pmap_mutable(pmap); @@ -5629,7 +5952,7 @@ pmap_enter_options_internal( pa &= ARM_PTE_PAGE_MASK; if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) { -#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST) +#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST) extern vm_offset_t ctrr_test_page; if (__probable(v != ctrr_test_page)) #endif @@ -5684,11 +6007,6 @@ pmap_enter_options_internal( pmap_remove_range(pmap, v, v + PAGE_SIZE); } - if (pmap != kernel_pmap) { - ptd_info_t *ptd_info = ptep_get_info(pte_p); - wiredcnt = &ptd_info->wiredcnt; - } - while (!committed) { pt_entry_t spte = ARM_PTE_EMPTY; pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS; @@ -5705,8 +6023,8 @@ pmap_enter_options_internal( * which needs to happen at every iteration of the commit loop in case we * previously dropped the pmap lock. */ - pt_entry_t pte = pmap_construct_pte(pmap, v, pa, - ((options & PMAP_OPTIONS_MAP_TPRO) ? VM_PROT_RORW_TP : prot), fault_type, wired, pt_attr, &pp_attr_bits); + pt_entry_t pte = pmap_construct_pte(pmap, pa, + ((options & PMAP_OPTIONS_MAP_TPRO) ? VM_PROT_RORW_TP : prot), fault_type, wired, &pp_attr_bits); if (pa_valid(pa)) { unsigned int pai; @@ -5740,6 +6058,11 @@ pmap_enter_options_internal( if (__improbable(allocation_required && (local_pv_free->count < 2))) { pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL}; int new_allocated_pves = 0; + volatile uint16_t *wiredcnt = NULL; + if (pmap != kernel_pmap) { + ptd_info_t *ptd_info = ptep_get_info(pte_p); + wiredcnt = &ptd_info->wiredcnt; + } while (new_allocated_pves < 2) { local_pv_free = &pmap_get_cpu_data()->pv_free; @@ -6044,9 +6367,9 @@ pmap_change_wiring_internal( pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); sptm_pcpu->sptm_templates[0] = (*pte_p & ~ARM_PTE_WIRED) | new_wiring; - pmap_retype_epoch_enter(); + pmap_epoch_enter(); sptm_update_region(pmap->ttep, v, 1, sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_SW_WIRED); - pmap_retype_epoch_exit(); + pmap_epoch_exit(); prev_pte = os_atomic_load(&sptm_pcpu->sptm_prev_ptes[0], relaxed); enable_preemption(); @@ -6303,8 +6626,8 @@ pmap_init_pte_page( * pmap_enter_options_internal() for an example. * * @param pmap The pmap for which to ensure mapping space is present. - * @param v The virtual address for which to ensure mapping space is present - * in [pmap]. + * @param vaddr The virtual address for which to ensure mapping space is present + * in [pmap]. * @param options Flags to pass to pmap_tt_allocate() if a new table needs to be * allocated. The only valid option is PMAP_OPTIONS_NOWAIT, which * specifies that the allocation must not block. 
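[Editorial note: the ref/mod handling in pmap_construct_pte() above is a software dirty-bit scheme: when a writable mapping is requested but the triggering fault was not a write, the PTE is entered read-only with "was_writeable" remembered and PP_ATTR_MODFAULT set, so the first real write is upgraded cheaply in arm_fast_fault(). The sketch below illustrates only that handshake in isolation; the field names and the boolean representation are illustrative, not the pmap's encoding.]

#include <stdbool.h>

struct soft_pte {
    bool writable;      /* write permission currently granted in the entry */
    bool wants_write;   /* the VM asked for a writable mapping */
    bool referenced;
    bool modified;
};

/* Initial entry: only grant write immediately if the faulting access was a write. */
static void
enter_mapping(struct soft_pte *p, bool prot_write, bool fault_was_write)
{
    p->referenced  = true;
    p->wants_write = prot_write;
    p->writable    = prot_write && fault_was_write;
    p->modified    = p->writable;
}

/* Later write fault: upgrade in place instead of taking a full VM fault. */
static bool
fast_write_fault(struct soft_pte *p)
{
    if (!p->wants_write || p->writable) {
        return false;   /* not a deferred-write entry; let the normal path handle it */
    }
    p->writable = true;
    p->modified = true;
    return true;
}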
@@ -6326,7 +6649,7 @@ pmap_expand( if (__improbable((vaddr < pmap->min) || (vaddr >= pmap->max))) { return KERN_INVALID_ADDRESS; } - pmap_paddr_t pa; + pmap_paddr_t table_pa = pmap->ttep; const uint64_t pmap_page_size = pt_attr_page_size(pt_attr); const uint64_t table_align_mask = (PAGE_SIZE / pmap_page_size) - 1; unsigned int ttlevel = pt_attr_root_level(pt_attr); @@ -6334,8 +6657,6 @@ pmap_expand( tt_entry_t *ttep; tt_entry_t old_tte = ARM_TTE_EMPTY; - pa = 0x0ULL; - for (; ttlevel < level; ttlevel++) { /** * If the previous iteration didn't allocate a new table, obtain the table from the previous TTE. @@ -6346,7 +6667,8 @@ pmap_expand( */ if (table_ttep == NULL) { assert(tte_is_valid_table(old_tte)); - table_ttep = (tt_entry_t*)phystokv(old_tte & ARM_TTE_TABLE_MASK); + table_pa = old_tte & ARM_TTE_TABLE_MASK; + table_ttep = (tt_entry_t*)phystokv(table_pa); } vm_map_address_t v = pt_attr_align_va(pt_attr, ttlevel, vaddr); @@ -6362,18 +6684,30 @@ pmap_expand( table_ttep = NULL; if (!tte_is_valid_table(old_tte)) { tt_entry_t new_tte, *new_ttep; - while (pmap_tt_allocate(pmap, &new_ttep, ttlevel + 1, options | PMAP_PAGE_NOZEROFILL) != KERN_SUCCESS) { + pt_desc_t *new_ptdp; + while (pmap_tt_allocate(pmap, &new_ttep, &new_ptdp, ttlevel + 1, options | PMAP_PAGE_NOZEROFILL) != KERN_SUCCESS) { if (options & PMAP_OPTIONS_NOWAIT) { return KERN_RESOURCE_SHORTAGE; } VM_PAGE_WAIT(); } - /* Grab the pmap lock to ensure we don't try to concurrently map different tables at the same TTE. */ - pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE); + assert(pa_valid(table_pa)); + /** + * Grab the lower-level table's PVH lock to ensure we don't try to concurrently map different + * tables at the same TTE. + */ + locked_pvh_t locked_pvh = pvh_lock(pa_index(table_pa)); old_tte = os_atomic_load(ttep, relaxed); if (!tte_is_valid_table(old_tte)) { + /** + * This call must be issued prior to sptm_map_table() so that the page table's + * PTD info is valid by the time the new table becomes visible in the paging + * hierarchy. sptm_map_table() is expected to issue a barrier that effectively + * guarantees the PTD update will be visible to concurrent observers as soon as + * the new table becomes visible in the paging hierarchy. + */ pmap_init_pte_page(pmap, (pt_entry_t *) new_ttep, v, ttlevel + 1, FALSE); - pa = kvtophys_nofail((vm_offset_t)new_ttep); + pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)new_ttep); /* * If the table is going to map a kernel RO zone VA region, then we must * upgrade its SPTM type to XNU_PAGE_TABLE_ROZONE. The SPTM's type system @@ -6392,6 +6726,15 @@ pmap_expand( sptm_map_table(pmap->ttep, v, (sptm_pt_level_t)ttlevel, new_tte); PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)), VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), new_tte); + + /** + * Now that we've fully mapped the table, do final initialization of PTD + * state, which includes dropping the wired count to allow future reclamation + * of the page table page. + */ + ptd_info_finalize(new_ptdp); + + table_pa = pa; /** * If we need to set up multiple TTEs mapping different parts of the same page * (e.g. 
because we're carving multiple 4K page tables out of a 16K native page, @@ -6400,10 +6743,9 @@ pmap_expand( */ table_ttep = new_ttep + ((((uintptr_t)ttep / sizeof(tt_entry_t)) & table_align_mask) * (pmap_page_size / sizeof(tt_entry_t))); - pa = 0x0ULL; new_ttep = (tt_entry_t *)NULL; } - pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE); + pvh_unlock(&locked_pvh); if (new_ttep != (tt_entry_t *)NULL) { pmap_tt_deallocate(pmap, new_ttep, ttlevel + 1); @@ -6517,7 +6859,7 @@ coredumpok( return FALSE; } spte = *pte_p; - return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT); + return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT; } #endif @@ -7774,13 +8116,11 @@ arm_clear_fast_fault( const pt_desc_t * const ptdp = ptep_get_ptd(pte_p); const pmap_t pmap = ptdp->pmap; - const vm_map_address_t va = ptd_get_va(ptdp, pte_p); - - assert(va >= pmap->min && va < pmap->max); tmplate = spte; if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) { + assert(pmap); { if (pmap == kernel_pmap) { tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA)); @@ -7795,6 +8135,7 @@ arm_clear_fast_fault( pte_set_was_writeable(tmplate, false); attrs_to_set |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED); } else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) { + assert(pmap); tmplate = spte | ARM_PTE_AF; { @@ -7805,6 +8146,9 @@ arm_clear_fast_fault( assert(spte != ARM_PTE_EMPTY); if (spte != tmplate) { + const vm_map_address_t va = ptd_get_va(ptdp, pte_p); + assert(va >= pmap->min && va < pmap->max); + sptm_ops[num_mappings].root_pt_paddr = pmap->ttep; sptm_ops[num_mappings].vaddr = va; sptm_ops[num_mappings].pte_template = tmplate; @@ -8190,11 +8534,12 @@ pmap_map_cpu_windows_copy_internal( panic("%s: out of windows", __func__); } - pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX; + const pmap_paddr_t paddr = ptoa(pn); + pte = pa_to_pte(paddr) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX; #if __ARM_KERNEL_PROTECT__ pte |= ARM_PTE_NG; #endif /* __ARM_KERNEL_PROTECT__ */ - pte |= wimg_to_pte(wimg_bits, ptoa(pn)); + pte |= wimg_to_pte(wimg_bits, paddr); if (prot & VM_PROT_WRITE) { pte |= ARM_PTE_AP(AP_RWNA); @@ -8363,15 +8708,12 @@ pmap_trim_range( /* Iterate over the range, trying to remove TTEs. */ for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += (pt_attr_twig_size(pt_attr) * page_ratio)) { - pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE); - tte_p = pmap_tte(pmap, cur); if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) { - /* pmap_tte_deallocate()/pmap_tte_trim() will drop the pmap lock */ if ((pmap->type == PMAP_TYPE_NESTED) && (sptm_get_page_table_refcnt(tte_to_pa(*tte_p)) == 0)) { /* Deallocate for the nested map. */ - pmap_tte_deallocate(pmap, cur, tte_p, pt_attr_twig_level(pt_attr)); + pmap_tte_deallocate(pmap, cur, tte_p, pt_attr_twig_level(pt_attr), false); } else if (pmap->type == PMAP_TYPE_USER) { /** * Just remove for the parent map. If the leaf table pointed @@ -8384,8 +8726,6 @@ pmap_trim_range( } else { panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type); } - } else { - pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE); } } } @@ -8400,6 +8740,12 @@ pmap_trim_range( * * Attempts to trim the shared region page tables down to only cover the given * range in subord and grand. 
+ * + * This function assumes that trimming of [subord] happens exactly once, against + * a temporary [grand] pmap, and that it happens before [subord] is ever actually + * nested in a real task pmap. Unlike its PPL predecessor (which can't trust its + * callers), the SPTM implementation therefore does not do any refcounting to + * track top-level pmaps that may have nested tables outside the trimmed range. */ MARK_AS_PMAP_TEXT void pmap_trim_internal( @@ -8422,8 +8768,6 @@ pmap_trim_internal( __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); - pmap_lock(subord, PMAP_LOCK_EXCLUSIVE); - if (__improbable(subord->type != PMAP_TYPE_NESTED)) { panic("%s: subord is of non-nestable type 0x%hhx, " "grand=%p, subord=%p, vstart=%p, size=%#llx", @@ -8442,121 +8786,34 @@ pmap_trim_internal( __func__, grand, subord, (void*)vstart, size); } - if (__improbable((size != 0) && - ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) { + if (__improbable((vstart < grand->nested_region_addr) || + (vend > (grand->nested_region_addr + grand->nested_region_size)))) { panic("%s: grand range not in nested region, " "grand=%p, subord=%p, vstart=%p, size=%#llx", __func__, grand, subord, (void*)vstart, size); } + const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr); + adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio; + vm_map_offset_t true_end = vend; - if (!grand->nested_has_no_bounds_ref) { - assert(subord->nested_bounds_set); + os_atomic_store(&subord->nested_region_true_start, vstart & ~adjust_offmask, relaxed); - if (!grand->nested_bounds_set) { - /* Inherit the bounds from subord. */ - grand->nested_region_true_start = subord->nested_region_true_start; - grand->nested_region_true_end = subord->nested_region_true_end; - grand->nested_bounds_set = true; - } - - pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - return; + if (__improbable(os_add_overflow(true_end, adjust_offmask, &true_end))) { + panic("%s: padded true end wraps around, " + "grand=%p, subord=%p, vstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, size); } - if ((!subord->nested_bounds_set) && size) { - const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr); - adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio; - - subord->nested_region_true_start = vstart; - subord->nested_region_true_end = vend; - subord->nested_region_true_start &= ~adjust_offmask; - - if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) { - panic("%s: padded true end wraps around, " - "grand=%p, subord=%p, vstart=%p, size=%#llx", - __func__, grand, subord, (void*)vstart, size); - } - - subord->nested_region_true_end &= ~adjust_offmask; - subord->nested_bounds_set = true; - } - - if (subord->nested_bounds_set) { - /* Inherit the bounds from subord. */ - grand->nested_region_true_start = subord->nested_region_true_start; - grand->nested_region_true_end = subord->nested_region_true_end; - grand->nested_bounds_set = true; - - /* If we know the bounds, we can trim the pmap. */ - grand->nested_has_no_bounds_ref = false; - pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - } else { - /* Don't trim if we don't know the bounds. 
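[Editorial note: the trim bounds computed in this and the following hunk are just the requested nested range rounded outward to whole twig-table boundaries (adjust_offmask), with an overflow check on the padded end. A minimal worked version of that rounding; the 32MB twig span in the comment is an example value, not a claim about any particular page configuration.]

#include <stdint.h>
#include <stdbool.h>

/*
 * Round [start, end) outward to whole table boundaries. Returns false on
 * address wrap, mirroring the "padded true end wraps around" panic above.
 */
static bool
round_to_table_bounds(uint64_t start, uint64_t end, uint64_t table_offmask,
    uint64_t *true_start, uint64_t *true_end)
{
    uint64_t padded_end = end + table_offmask;
    if (padded_end < end) {
        return false;
    }
    *true_start = start & ~table_offmask;
    *true_end   = padded_end & ~table_offmask;
    return true;
}

/*
 * Example: with a 32MB twig span (offmask 0x1FFFFFF), the range
 * 0x101000000..0x103500000 rounds out to 0x100000000..0x104000000.
 */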
*/ - pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - return; - } + os_atomic_store(&subord->nested_region_true_end, true_end & ~adjust_offmask, relaxed); + os_atomic_store(&grand->nested_region_true_start, subord->nested_region_true_start, relaxed); + os_atomic_store(&grand->nested_region_true_end, subord->nested_region_true_end, relaxed); /* Trim grand to only cover the given range. */ pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start); pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size)); - - /* Try to trim subord. */ - pmap_trim_subord(subord); -} - -MARK_AS_PMAP_TEXT static void -pmap_trim_self(pmap_t pmap) -{ - if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) { - /* If we have a no bounds ref, we need to drop it. */ - pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED); - pmap->nested_has_no_bounds_ref = false; - boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set; - vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start; - vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end; - pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED); - - if (nested_bounds_set) { - pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start); - pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size)); - } - /* - * Try trimming the nested pmap, in case we had the - * last reference. - */ - pmap_trim_subord(pmap->nested_pmap); - } -} - -/* - * pmap_trim_subord(grand, subord) - * - * grand = pmap that we have nested subord in - * subord = nested pmap we are attempting to trim - * - * Trims subord if possible - */ -MARK_AS_PMAP_TEXT static void -pmap_trim_subord(pmap_t subord) -{ - bool contract_subord = false; - - pmap_lock(subord, PMAP_LOCK_EXCLUSIVE); - - subord->nested_no_bounds_refcnt--; - - if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) { - /* If this was the last no bounds reference, trim subord. */ - contract_subord = true; - } - - pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - - if (contract_subord) { - pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start); - pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size); - } + pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start); + pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size); } void @@ -8588,6 +8845,121 @@ pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_ return res; } +typedef struct { + void *locations[SPTM_BATCHED_OPS_LIMIT]; + unsigned int index; + uint64_t jop_key; +} pmap_batch_sign_user_ptr_state_t; + +static pmap_batch_sign_user_ptr_state_t PERCPU_DATA(percpu_pmap_batch_sign_user_ptr_state); + +/** + * Accumulates a user pointer signing request, and calls into SPTM to sign + * them as it sees fit or is told to do so. If an SPTM call is made, + * this function copies the signed pointers to their respective locations. + * + * @note This function will disable preemption when called for the first + * time or for the first time after a submission to SPTM. It enables + * preemption after a submission is made. + * + * @note The caller can force the submission of accumulated ops so far by + * passing a NULL location pointer. 
+ * + * @note The jop_key argument is expected to be consistent throughout a + * batch. This function will panic if it detects the jop_key passed + * in is inconsistent with the other ops in the batch. + * + * @param location The destination where the signed pointer will be copied + * to. The caller can pass a NULL pointer to force an SPTM + * submission of the accumulated signing ops so far. In + * such case, the rest of the argument list is ignored. + * @param value The pointer to be signed. + * @param key The key used to sign the pointer. + * @param discriminator The discriminator used to sign the pointer. + * @param jop_key The JOP key used to sign the pointer. + * + * @return true if an SPTM call was made. Otherwise false. + */ +bool +pmap_batch_sign_user_ptr(void *location, void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key) +{ + bool submitted_to_sptm = false; + + /* Disable preemption to access percpu data. */ + disable_preemption(); + + pmap_batch_sign_user_ptr_state_t *state = PERCPU_GET(percpu_pmap_batch_sign_user_ptr_state); + void **locations = state->locations; + pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); + sptm_user_pointer_op_t *sptm_user_pointer_ops = (sptm_user_pointer_op_t *) sptm_pcpu->sptm_user_pointer_ops; + uintptr_t *sptm_values = (uintptr_t *) sptm_pcpu->sptm_prev_ptes; + + if (state->index != 0) { + /* Avoid leaking preemption counts by offsetting the disable at the beginning of this function. */ + enable_preemption(); + + /* Disabled preemption is still expected. */ + assert(!preemption_enabled()); + } + + assert(state->index < SPTM_BATCHED_OPS_LIMIT); + + /* Stash a pointer signing op if a copy location is supplied. */ + if (location != NULL) { + locations[state->index] = location; + sptm_user_pointer_ops[state->index].value = (uintptr_t)value; + sptm_user_pointer_ops[state->index].key = key; + sptm_user_pointer_ops[state->index].discriminator = discriminator; + + if (state->index == 0) { + state->jop_key = jop_key; + } else { + assert(state->jop_key == jop_key); + } + + state->index = state->index + 1; + } + + /** + * Submit the stashed ops on this cpu to SPTM when: + * 1. there are SPTM_BATCHED_OPS_LIMIT ops accumulated on the cpu, or + * 2. the caller asks us to submit whatever we have accumulated by + * passing in a NULL location argument. + */ + if (state->index == SPTM_BATCHED_OPS_LIMIT || location == NULL) { + if (__probable(state->index > 0)) { + const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE); + + uint64_t saved_jop_state = ml_enable_user_jop_key(state->jop_key); + sptm_batch_sign_user_pointer(sptm_pcpu->sptm_user_pointer_ops_pa, state->index, state->jop_key); + ml_disable_user_jop_key(state->jop_key, saved_jop_state); + + ml_set_interrupts_enabled(current_intr_state); + + for (unsigned int i = 0; i < state->index; i++) { + memcpy(locations[i], &(sptm_values[i]), sizeof(sptm_values[i])); + } + + state->index = 0; + state->jop_key = 0; + submitted_to_sptm = true; + } + } + + /** + * There is a slight difference between using submitted_to_sptm and + * state->index here. We need to take care of the case when there is + * no op accumulated but a NULL location passed in, where submitted_to_sptm + * will be false and leak a preemption count. 
+ */ + if (state->index == 0) { + assert(submitted_to_sptm || (location == NULL)); + enable_preemption(); + } + + return submitted_to_sptm; +} + void * pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key) { @@ -8610,31 +8982,126 @@ pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_ } #endif /* HAS_APPLE_PAC */ -/* - * kern_return_t pmap_nest(grand, subord, vstart, size) - * - * grand = the pmap that we will nest subord into - * subord = the pmap that goes into the grand - * vstart = start of range in pmap to be inserted - * size = Size of nest area (up to 16TB) - * - * Inserts a pmap into another. This is used to implement shared segments. +/** + * Establishes the pmap associated with a shared region as the nested pmap + * for a top-level user pmap. * + * @param grand The top-level user pmap + * @param subord The pmap to be set as [grand]'s nested pmap + * @param vstart The base VA of the region to be nested. + * @param size The size (in bytes) of the region to be nested. */ +void +pmap_set_shared_region( + pmap_t grand, + pmap_t subord, + addr64_t vstart, + uint64_t size) +{ + addr64_t vend; + + PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START, + VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size); + + if (__improbable(os_add_overflow(vstart, size, &vend))) { + panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size); + } + + validate_pmap_mutable(grand); + validate_pmap(subord); + os_ref_retain_raw(&subord->ref_count, &pmap_refgrp); + + const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); + if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) { + panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand); + } + + if (__improbable(((size | vstart) & + (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) { + panic("%s: pmap %p unaligned nesting request 0x%llx, 0x%llx", + __func__, grand, vstart, size); + } + + if (__improbable(subord->type != PMAP_TYPE_NESTED)) { + panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type); + } + + if (__improbable(grand->type != PMAP_TYPE_USER)) { + panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type); + } + + if (subord->nested_region_size == 0) { + /** + * Since subord->nested_region_size is 0, this is the first time subord is being + * associated with a top-level pmap. We therefore need to take a few extra steps to + * ensure the shared region is properly configured. This initial setup step is expected + * to be issued by the VM layer against a temporary grand pmap before any other pmap + * is allowed to associate with subord, so synchronization is not needed here to prevent + * concurrent initialization. + */ + sptm_configure_shared_region(subord->ttep, vstart, size >> pt_attr->pta_page_shift); + + /** + * Since this is the first time subord is being associated with a top-level pmap, ensure + * its nested region is fully expanded to L3 so that all relevant L3 tables can later be + * inserted into top-level pmaps via pmap_nest(). Note that pmap_remove() will never + * dynamically free L3 tables from nested pmaps. However, some of these tables may be + * freed by a later call to pmap_trim(). 
+ */ + vm_map_offset_t vaddr = vstart; + while (vaddr < vend) { + const tt_entry_t *const stte_p = pmap_tte(subord, vaddr); + if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) { + __assert_only kern_return_t kr; + kr = pmap_expand(subord, vaddr, 0, pt_attr_leaf_level(pt_attr)); + assert3u(kr, ==, KERN_SUCCESS); + } + vaddr += pt_attr_twig_size(pt_attr); + } + + const uint64_t nested_region_unnested_table_bits = (size >> (pt_attr_twig_shift(pt_attr) - 1)); + if (__improbable((nested_region_unnested_table_bits > UINT_MAX))) { + panic("%s: bitmap allocation size %llu will truncate, " + "grand=%p, subord=%p, vstart=0x%llx, size=%llx", + __func__, nested_region_unnested_table_bits, + grand, subord, vstart, size); + } + + subord->nested_region_unnested_table_bitmap = bitmap_alloc((uint) nested_region_unnested_table_bits); + subord->nested_region_addr = vstart; + subord->nested_region_size = (mach_vm_offset_t)size; + } + + if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) { + grand->nested_region_addr = vstart; + grand->nested_region_size = (mach_vm_offset_t)size; + assert3u(grand->nested_region_addr, ==, subord->nested_region_addr); + assert3u(grand->nested_region_size, ==, subord->nested_region_size); + pmap_txm_acquire_exclusive_lock(grand); + pmap_txm_acquire_shared_lock(subord); + sptm_set_shared_region(grand->ttep, subord->ttep); + pmap_txm_release_shared_lock(subord); + pmap_txm_release_exclusive_lock(grand); + } else { + panic("%s: pmap %p already has a nested pmap %p", __func__, grand, grand->nested_pmap); + } + + PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END); +} /** * Embeds a range of mappings from one pmap ('subord') into another ('grand') * by inserting the twig-level TTEs from 'subord' directly into 'grand'. - * This function operates in 3 main phases: - * 1. Bookkeeping to ensure tracking structures for the nested region are set up. - * 2. Expansion of subord to ensure the required leaf-level page table pages for - * the mapping range are present in subord. - * 3. Expansion of grand to ensure the required twig-level page table pages for + * This function operates in 2 main phases: + * 1. Expands grand to ensure the required twig-level page table pages for * the mapping range are present in grand. - * 4. Invoke sptm_nest_region() to copy the relevant TTEs from subord to grand. + * 2. Invokes sptm_nest_region() to copy the relevant TTEs from subord to grand. * - * This function may return early due to pending AST_URGENT preemption; if so - * it will indicate the need to be re-entered. + * @note This function requires that pmap_set_shared_region() has already been + * called for the [grand, subord] pair. + * + * @note The VA region defined by vstart and vsize must lie entirely within the + * VA region established by the previous call to pmap_set_shared_region(). * * @param grand pmap to insert the TTEs into. Must be a user pmap. * @param subord pmap from which to extract the TTEs. Must be a nested pmap. 
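A minimal sketch of the calling order the two-step API above implies: pmap_set_shared_region() is issued once to associate the shared-region pmap with a top-level pmap, after which pmap_nest() copies twig-level TTEs for a range inside that region. The wrapper name is hypothetical and nesting the full region in one call is an assumption; only pmap_set_shared_region(), pmap_nest(), and their argument order come from the surrounding code.

/*
 * Hypothetical VM-layer caller (illustrative sketch only): associate the
 * shared-region pmap with a task pmap exactly once, then nest a twig-aligned
 * range that lies entirely within the previously established region.
 */
static kern_return_t
example_attach_shared_region(pmap_t task_pmap, pmap_t shared_pmap,
    addr64_t sr_base, uint64_t sr_size)
{
	/*
	 * One-time association. On the shared pmap's first use this also
	 * performs SPTM shared-region configuration and L3 expansion.
	 */
	pmap_set_shared_region(task_pmap, shared_pmap, sr_base, sr_size);

	/*
	 * Copy the twig-level TTEs for the nested range into task_pmap.
	 * The range here happens to be the whole region; any sub-range
	 * within [sr_base, sr_base + sr_size) would also be valid.
	 */
	return pmap_nest(task_pmap, shared_pmap, sr_base, sr_size);
}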
@@ -8652,11 +9119,7 @@ pmap_nest_internal( { kern_return_t kr = KERN_SUCCESS; vm_map_offset_t vaddr; - tt_entry_t *stte_p; tt_entry_t *gtte_p; - bitmap_t *nested_region_unnested_table_bitmap; - int expand_options = 0; - bool deref_subord = true; addr64_t vend; if (__improbable(os_add_overflow(vstart, size, &vend))) { @@ -8665,102 +9128,23 @@ pmap_nest_internal( validate_pmap_mutable(grand); validate_pmap(subord); - os_ref_retain_raw(&subord->ref_count, &pmap_refgrp); const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand); - if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) { - panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand); - } if (__improbable(((size | vstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) { - panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx", - grand, vstart, size); + panic("%s: pmap %p unaligned nesting request 0x%llx, 0x%llx", + __func__, grand, vstart, size); } - if (__improbable(subord->type != PMAP_TYPE_NESTED)) { - panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type); + if (__improbable(subord != grand->nested_pmap)) { + panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p", + __func__, subord, grand, grand->nested_pmap); } - if (__improbable(grand->type != PMAP_TYPE_USER)) { - panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type); - } - - /** - * Use an acquire barrier to ensure that subsequent loads of nested_region_* fields are not - * speculated ahead of the load of nested_region_unnested_table_bitmap, so that if we observe a non-NULL - * nested_region_unnested_table_bitmap then we can be sure the other fields have been initialized as well. - */ - if (os_atomic_load(&subord->nested_region_unnested_table_bitmap, acquire) == NULL) { - uint64_t nested_region_unnested_table_bits = size >> pt_attr_twig_shift(pt_attr); - - if (__improbable((nested_region_unnested_table_bits > UINT_MAX))) { - panic("%s: bitmap allocation size %llu will truncate, " - "grand=%p, subord=%p, vstart=0x%llx, size=%llx", - __func__, nested_region_unnested_table_bits, - grand, subord, vstart, size); - } - - nested_region_unnested_table_bitmap = bitmap_alloc((uint) nested_region_unnested_table_bits); - - pmap_lock(subord, PMAP_LOCK_EXCLUSIVE); - if (subord->nested_region_unnested_table_bitmap == NULL) { - subord->nested_region_addr = vstart; - subord->nested_region_size = (mach_vm_offset_t) size; - sptm_configure_shared_region(subord->ttep, vstart, size >> pt_attr->pta_page_shift); - - /** - * Ensure that the rest of the subord->nested_region_* fields are - * initialized and visible before setting the nested_region_unnested_table_bitmap - * field (which is used as the flag to say that the rest are initialized). 
- */ - os_atomic_store(&subord->nested_region_unnested_table_bitmap, nested_region_unnested_table_bitmap, release); - nested_region_unnested_table_bitmap = NULL; - } - pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - if (nested_region_unnested_table_bitmap != NULL) { - bitmap_free(nested_region_unnested_table_bitmap, nested_region_unnested_table_bits); - } - } - - assertf(subord->nested_region_addr == vstart, "%s: pmap %p nested region addr 0x%llx doesn't match vstart 0x%llx", - __func__, subord, (unsigned long long)subord->nested_region_addr, (unsigned long long)vstart); - assertf(subord->nested_region_size == size, "%s: pmap %p nested region size 0x%llx doesn't match size 0x%llx", - __func__, subord, (unsigned long long)subord->nested_region_size, (unsigned long long)size); - - pmap_lock(subord, PMAP_LOCK_EXCLUSIVE); - - if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) { - /* - * If this is grand's first nesting operation, keep the reference on subord. - * It will be released by pmap_destroy_internal() when grand is destroyed. - */ - deref_subord = false; - - if (!subord->nested_bounds_set) { - /* - * We are nesting without the shared regions bounds - * being known. We'll have to trim the pmap later. - */ - grand->nested_has_no_bounds_ref = true; - subord->nested_no_bounds_refcnt++; - } - - grand->nested_region_addr = vstart; - grand->nested_region_size = (mach_vm_offset_t) size; - } else { - if (__improbable(grand->nested_pmap != subord)) { - panic("pmap_nest() pmap %p has a nested pmap", grand); - } else if (__improbable(grand->nested_region_addr > vstart)) { - panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand); - } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) { - grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size); - } - } - - vaddr = vstart; - if (vaddr < subord->nested_region_true_start) { - vaddr = subord->nested_region_true_start; + addr64_t true_start = vstart; + if (true_start < subord->nested_region_true_start) { + true_start = subord->nested_region_true_start; } addr64_t true_end = vend; @@ -8768,40 +9152,13 @@ pmap_nest_internal( true_end = subord->nested_region_true_end; } - while (vaddr < true_end) { - stte_p = pmap_tte(subord, vaddr); - if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) { - pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr)); - - if (kr != KERN_SUCCESS) { - pmap_lock(grand, PMAP_LOCK_EXCLUSIVE); - goto done; - } - - pmap_lock(subord, PMAP_LOCK_EXCLUSIVE); - } - vaddr += pt_attr_twig_size(pt_attr); - } - - /* - * copy TTEs from subord pmap into grand pmap - */ - - vaddr = (vm_map_offset_t) vstart; - if (vaddr < subord->nested_region_true_start) { - vaddr = subord->nested_region_true_start; - } - - pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE); - pmap_lock(grand, PMAP_LOCK_EXCLUSIVE); + /* Ensure grand is expanded to L2 so that sptm_nest_region() can copy L3 entries from subord. 
*/ + vaddr = (vm_map_offset_t) true_start; while (vaddr < true_end) { gtte_p = pmap_tte(grand, vaddr); if (gtte_p == PT_ENTRY_NULL) { - pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE); - kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr)); - pmap_lock(grand, PMAP_LOCK_EXCLUSIVE); + kr = pmap_expand(grand, vaddr, 0, pt_attr_twig_level(pt_attr)); if (kr != KERN_SUCCESS) { goto done; @@ -8811,31 +9168,7 @@ pmap_nest_internal( vaddr += pt_attr_twig_size(pt_attr); } - vaddr = (vm_map_offset_t) vstart; - - /* - * It is possible to have a preempted nest operation execute concurrently - * with a trim operation that sets nested_region_true_start. In this case, - * update the nesting bounds. This is useful both as a performance - * optimization and to prevent an attempt to nest a just-trimmed TTE, - * which will trigger an SPTM violation. - * Note that pmap_trim() may concurrently update grand's bounds as we are - * making these checks, but in that case pmap_trim_range() has not yet - * been called on grand and will wait for us to drop grand's lock, so it - * should see any TTEs we've nested here and clear them appropriately. - */ - if (vaddr < subord->nested_region_true_start) { - vaddr = subord->nested_region_true_start; - } - if (vaddr < grand->nested_region_true_start) { - vaddr = grand->nested_region_true_start; - } - if (true_end > subord->nested_region_true_end) { - true_end = subord->nested_region_true_end; - } - if (true_end > grand->nested_region_true_end) { - true_end = grand->nested_region_true_end; - } + vaddr = (vm_map_offset_t) true_start; while (vaddr < true_end) { /* @@ -8846,20 +9179,11 @@ pmap_nest_internal( if (vlim > true_end) { vlim = true_end; } - pmap_txm_acquire_exclusive_lock(grand); - pmap_txm_acquire_shared_lock(subord); sptm_nest_region(grand->ttep, subord->ttep, vaddr, (vlim - vaddr) >> pt_attr->pta_page_shift); - pmap_txm_release_shared_lock(subord); - pmap_txm_release_exclusive_lock(grand); vaddr = vlim; } done: - pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE); - if (deref_subord) { - pmap_destroy_internal(subord); - } - return kr; } @@ -8914,9 +9238,6 @@ pmap_unnest( * them non-global. * 2. Calling the SPTM to clear the twig-level TTEs for the address range in grand. * - * This function may return early due to pending AST_URGENT preemption; if so - * it will indicate the need to be re-entered. - * * @param grand pmap from which to unnest mappings * @param vaddr twig-aligned virtual address for the beginning of the nested range * @param size twig-aligned size of the nested range @@ -8951,13 +9272,14 @@ pmap_unnest_options_internal( (unsigned long long)vaddr, (unsigned long long)size); } - if (__improbable(grand->nested_pmap == NULL)) { + struct pmap * const subord = grand->nested_pmap; + if (__improbable(subord == NULL)) { panic("%s: %p has no nested pmap", __func__, grand); } true_end = vend; - if (true_end > grand->nested_pmap->nested_region_true_end) { - true_end = grand->nested_pmap->nested_region_true_end; + if (true_end > subord->nested_region_true_end) { + true_end = subord->nested_region_true_end; } if ((option & PMAP_UNNEST_CLEAN) == 0) { @@ -8965,59 +9287,63 @@ pmap_unnest_options_internal( panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend); } - /* - * SPTM TODO: I suspect we may be able to hold the nested pmap lock shared here. - * We would need to use atomic_bitmap_set below where we currently use bitmap_test + bitmap_set. 
- * The risk is that a concurrent pmap_enter() against the nested pmap could observe the relevant - * bit in the nested region bitmap to be clear, but could then create the (global) mapping after - * we've made our SPTM sweep below to set NG. In that case we could end up with a mix of global - * and non-global mappings for the same VA region and thus a TLB conflict. I'm uncertain if the - * VM would allow these operation to happen concurrently. Even if it does, we could still do - * something fancier here such as waiting for concurrent pmap_enter() to drain after updating - * the bitmap. - */ - pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE); - - disable_preemption(); - pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); - unsigned int num_mappings = 0; start = vaddr; - if (start < grand->nested_pmap->nested_region_true_start) { - start = grand->nested_pmap->nested_region_true_start; + if (start < subord->nested_region_true_start) { + start = subord->nested_region_true_start; } start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr)); max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr)); for (current_index = start_index, addr = start; current_index < max_index; current_index++) { - pt_entry_t *bpte, *cpte; - vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr); - bpte = pmap_pte(grand->nested_pmap, addr); + bool unnested = bitmap_test(subord->nested_region_unnested_table_bitmap, UNNEST_BIT(current_index)); + os_atomic_thread_fence(acquire); + if (!unnested) { + atomic_bitmap_set((_Atomic bitmap_t*)subord->nested_region_unnested_table_bitmap, + UNNEST_IN_PROGRESS_BIT(current_index), memory_order_relaxed); + /* + * Issue a store-load barrier to ensure the UNNEST_IN_PROGRESS bit is visible to any pmap_enter() + * operation that enters the epoch after this point. + */ + os_atomic_thread_fence(seq_cst); + pmap_epoch_prepare_drain(); + pmap_epoch_drain(); - if (!bitmap_test(grand->nested_pmap->nested_region_unnested_table_bitmap, current_index)) { + unsigned int num_mappings = 0; + disable_preemption(); + pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); /* * We've marked the 'twig' region as being unnested. Every mapping entered within * the nested pmap in this region will now be marked non-global. */ - bitmap_set(grand->nested_pmap->nested_region_unnested_table_bitmap, current_index); - for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) { - pt_entry_t spte = os_atomic_load(cpte, relaxed); - - if (pte_is_valid(spte)) { - spte |= ARM_PTE_NG; - } - + while (addr < vlim) { addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO); - sptm_pcpu->sptm_templates[num_mappings] = spte; + sptm_pcpu->sptm_templates[num_mappings] = ARM_PTE_NG; ++num_mappings; if (num_mappings == SPTM_MAPPING_LIMIT) { - pmap_retype_epoch_enter(); - sptm_update_region(grand->nested_pmap->ttep, start, num_mappings, - sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG); - pmap_retype_epoch_exit(); + pmap_epoch_enter(); + /** + * It's technically possible (though highly unlikely) for subord to + * be concurrently trimmed, so re-check the bounds within the epoch to + * avoid potentially issuing an SPTM operation against a deleted leaf + * page table. 
This assumes the following: + * 1) The pmap_trim() code path always issues a barrier and an epoch + * drain in between updating subord's true bounds and actually + * trimming subord, effectively purging any operation here which + * may be using stale bounds. + * 2) The true bounds, if set, will always be twig-aligned, thus + * the region we operate on here can never span the starting or + * ending bounds. + */ + if ((start >= subord->nested_region_true_start) && + (start < subord->nested_region_true_end)) { + sptm_update_region(subord->ttep, start, num_mappings, + sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG); + } + pmap_epoch_exit(); enable_preemption(); num_mappings = 0; start = addr; @@ -9025,33 +9351,25 @@ pmap_unnest_options_internal( sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); } } - } - /** - * The SPTM does not allow region updates to span multiple leaf page tables, so request - * any remaining updates up to vlim before moving to the next page table page. - */ - if (num_mappings != 0) { - pmap_retype_epoch_enter(); - sptm_update_region(grand->nested_pmap->ttep, start, num_mappings, - sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG); - pmap_retype_epoch_exit(); + /** + * The SPTM does not allow region updates to span multiple leaf page tables, so request + * any remaining updates up to vlim before moving to the next page table page. + */ + if (num_mappings != 0) { + pmap_epoch_enter(); + if ((start >= subord->nested_region_true_start) && + (start < subord->nested_region_true_end)) { + sptm_update_region(subord->ttep, start, num_mappings, + sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG); + } + pmap_epoch_exit(); + } enable_preemption(); - num_mappings = 0; - disable_preemption(); - sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); + atomic_bitmap_set((_Atomic bitmap_t*)subord->nested_region_unnested_table_bitmap, + UNNEST_BIT(current_index), memory_order_release); } addr = start = vlim; } - - if (num_mappings != 0) { - pmap_retype_epoch_enter(); - sptm_update_region(grand->nested_pmap->ttep, start, num_mappings, - sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG); - pmap_retype_epoch_exit(); - } - - enable_preemption(); - pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE); } /* @@ -9059,14 +9377,12 @@ pmap_unnest_options_internal( */ addr = vaddr; - pmap_lock(grand, PMAP_LOCK_EXCLUSIVE); - - if (addr < grand->nested_pmap->nested_region_true_start) { - addr = grand->nested_pmap->nested_region_true_start; + if (addr < subord->nested_region_true_start) { + addr = subord->nested_region_true_start; } - if (true_end > grand->nested_pmap->nested_region_true_end) { - true_end = grand->nested_pmap->nested_region_true_end; + if (true_end > subord->nested_region_true_end) { + true_end = subord->nested_region_true_end; } while (addr < true_end) { @@ -9074,11 +9390,9 @@ pmap_unnest_options_internal( if (vlim > true_end) { vlim = true_end; } - sptm_unnest_region(grand->ttep, grand->nested_pmap->ttep, addr, (vlim - addr) >> pt_attr->pta_page_shift); + sptm_unnest_region(grand->ttep, subord->ttep, addr, (vlim - addr) >> pt_attr->pta_page_shift); addr = vlim; } - - pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE); } kern_return_t @@ -9108,7 +9422,6 @@ pmap_adjust_unnest_parameters( return TRUE; /* to get to log_unnest_badness()... */ } -#if PMAP_FORK_NEST /** * Perform any necessary pre-nesting of the parent's shared region at fork() * time. @@ -9117,20 +9430,12 @@ pmap_adjust_unnest_parameters( * * @param old_pmap The pmap of the parent task. * @param new_pmap The pmap of the child task. 
- * @param nesting_start An output parameter that is updated with the start - * address of the range that was pre-nested - * @param nesting_end An output parameter that is updated with the end - * address of the range that was pre-nested * * @return KERN_SUCCESS if the pre-nesting was succesfully completed. * KERN_INVALID_ARGUMENT if the arguments were not valid. */ kern_return_t -pmap_fork_nest( - pmap_t old_pmap, - pmap_t new_pmap, - vm_map_offset_t *nesting_start, - vm_map_offset_t *nesting_end) +pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap) { if (old_pmap == NULL || new_pmap == NULL) { return KERN_INVALID_ARGUMENT; @@ -9138,25 +9443,12 @@ pmap_fork_nest( if (old_pmap->nested_pmap == NULL) { return KERN_SUCCESS; } - pmap_nest(new_pmap, + pmap_set_shared_region(new_pmap, old_pmap->nested_pmap, old_pmap->nested_region_addr, old_pmap->nested_region_size); - assertf(new_pmap->nested_pmap == old_pmap->nested_pmap && - new_pmap->nested_region_addr == old_pmap->nested_region_addr && - new_pmap->nested_region_size == old_pmap->nested_region_size, - "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)", - new_pmap->nested_pmap, - new_pmap->nested_region_addr, - new_pmap->nested_region_size, - old_pmap->nested_pmap, - old_pmap->nested_region_addr, - old_pmap->nested_region_size); - *nesting_start = old_pmap->nested_region_addr; - *nesting_end = *nesting_start + old_pmap->nested_region_size; return KERN_SUCCESS; } -#endif /* PMAP_FORK_NEST */ /* * disable no-execute capability on @@ -9260,7 +9552,7 @@ pmap_cache_attributes( if (!pa_valid(paddr)) { pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr); - return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg; + return (io_rgn == NULL || io_rgn->signature == 'SKIO') ? VM_WIMG_IO : io_rgn->wimg; } result = VM_WIMG_DEFAULT; @@ -9794,7 +10086,7 @@ pmap_batch_set_cache_attributes_internal( if (__improbable(!pvh_try_lock_success(&locked_pvh))) { assert(preemption_disabled); const sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index); - pmap_retype_epoch_exit(); + pmap_epoch_exit(); enable_preemption(); preemption_disabled = false; if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) { @@ -9853,7 +10145,7 @@ pmap_batch_set_cache_attributes_internal( preemption_disabled = true; disable_preemption(); /** - * Enter the retype epoch while we gather the disjoint update arguments + * Enter the pmap epoch while we gather the disjoint update arguments * and issue the SPTM call. Since this operation may cover multiple physical * pages, we may construct the argument array and invoke the SPTM without holding * all relevant PVH locks, we need to record that we are collecting and modifying @@ -9861,7 +10153,7 @@ pmap_batch_set_cache_attributes_internal( * underlying pages and pmap_remove() does not attempt to free the page tables * used for these mappings without first draining our epoch. */ - pmap_retype_epoch_enter(); + pmap_epoch_enter(); sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); sptm_ops = (sptm_update_disjoint_multipage_op_t *) sptm_pcpu->sptm_ops; @@ -9890,7 +10182,7 @@ pmap_batch_set_cache_attributes_internal( * can't be freed. The epoch still protects mappings for any prior page in * the batch, whose PV locks are no longer held. 
*/ - pmap_retype_epoch_exit(); + pmap_epoch_exit(); /** * Balance out the explicit disable_preemption() made either at the beginning of * the function or on a prior iteration of the loop that placed the PVH lock in @@ -9928,7 +10220,7 @@ pmap_batch_set_cache_attributes_internal( if (pmap_is_sptm_update_cache_attr_ops_pending(state)) { assert(preemption_disabled); sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index); - pmap_retype_epoch_exit(); + pmap_epoch_exit(); if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) { tlb_flush_pass_needed = true; } @@ -9940,7 +10232,7 @@ pmap_batch_set_cache_attributes_internal( enable_preemption(); } else if (preemption_disabled) { - pmap_retype_epoch_exit(); + pmap_epoch_exit(); enable_preemption(); } @@ -10213,7 +10505,7 @@ pmap_insert_commpage_internal( if (pmap_is_64bit(pmap)) { commpage_table = commpage_4k_table; } else { - panic("32-bit commpage not currently supported for SPTM configurations"); + panic("32-bit 4k commpage not currently supported for SPTM configurations"); //commpage_table = commpage32_4k_table; } } else if (pmap_page_size != 16384) { @@ -10224,8 +10516,7 @@ pmap_insert_commpage_internal( if (pmap_is_64bit(pmap)) { commpage_table = commpage_default_table; } else { - panic("32-bit commpage not currently supported for SPTM configurations"); - //commpage_table = commpage32_default_table; + commpage_table = commpage32_default_table; } } @@ -11194,6 +11485,7 @@ pmap_user_va_size(pmap_t pmap) } + bool pmap_in_ppl(void) { @@ -11490,7 +11782,7 @@ pmap_txm_allocate_page(void) thread_vm_privileged = set_vm_privilege(true); /* Allocate a page from the VM free list */ - int grab_options = VM_PAGE_GRAB_OPTIONS_NONE; + vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE; while ((page = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) { VM_PAGE_WAIT(); } @@ -11569,7 +11861,29 @@ pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t valu const sptm_frame_type_t frame_type = sptm_get_frame_type(pa); if (frame_type == XNU_PROTECTED_IO) { - sptm_iofilter_protected_write(pa, value, width); + bool is_hibernating = false; + if (__improbable(is_hibernating)) { + /** + * Default set to NO_PANICKING_DOMAIN and not to INVALID_DOMAIN since + * INVALID_DOMAIN is set for panic in dispatch logic itself. + */ + sptm_domain_t panic_source = NO_PANICKING_DOMAIN; + (void)sptm_panic_source(&panic_source); + + /** + * If panic_source is invalid (NO_PANICKING_DOMAIN: sptm_panic_source() failed + * or no panic occurred) OR if the panic_source is XNU_DOMAIN, then use the + * hibernation-specific write. + */ + if (panic_source == NO_PANICKING_DOMAIN || panic_source == XNU_DOMAIN) { + sptm_hib_iofilter_protected_write(pa, value, width); + } else { + /* Panic source is valid (panic occurred) and not XNU_DOMAIN */ + sptm_iofilter_protected_write(pa, value, width); + } + } else { + sptm_iofilter_protected_write(pa, value, width); + } } else { /* Mappings is valid but not specified by I/O filter. However, we still try * accessing the address from kernel mode. 
This allows addresses that are not @@ -11773,7 +12087,9 @@ pmap_test_fault_handler(arm_saved_state_t * state) fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr)); if ((class == ESR_EC_DABORT_EL1) && - ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) { + ((fsc == FSC_PERMISSION_FAULT_L3) + || (fsc == FSC_ACCESS_FLAG_FAULT_L3) + || (fsc == FSC_TRANSLATION_FAULT_L0))) { pmap_test_took_fault = true; /* return to the instruction immediately after the call to NX page */ set_saved_state_pc(state, get_saved_state_pc(state) + 4); @@ -12144,11 +12460,25 @@ pmap_test_test_config(unsigned int flags) #if defined(ARM_LARGE_MEMORY) #define PMAP_TEST_LARGE_MEMORY_VA 64 * (1ULL << 40) /* 64 TB */ +#if !defined(ARM_LARGE_MEMORY_KERNONLY) T_LOG("Create new wired mapping in the extended address space enabled by ARM_LARGE_MEMORY."); pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER); pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, true, true); pmap_remove(pmap, PMAP_TEST_LARGE_MEMORY_VA, PMAP_TEST_LARGE_MEMORY_VA + pmap_page_size); +#else /* !defined(ARM_LARGE_MEMORY_KERNONLY) */ + /* Using kernel-only large memory. Make sure user pmap will fail. */ + T_LOG("Expect wired mapping to fault in ARM_LARGE_MEMORY when using KERNONLY."); + + /* The mapping should be rejected, it's outside of T0SZ */ + const kern_return_t kr = pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa, + VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER); + T_QUIET; T_ASSERT_NE_INT(kr, KERN_SUCCESS, NULL); + + /* Addressing outside of T0SZ should result in a L0 xlate fault */ + const bool did_fault = pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, false, false); + T_QUIET; T_ASSERT(did_fault, NULL); +#endif /* !defined(ARM_LARGE_MEMORY_KERNONLY) */ #endif /* ARM_LARGE_MEMORY */ T_LOG("Remove the wired mapping, so we can tear down the test map."); diff --git a/osfmk/arm64/sptm/pmap/pmap.h b/osfmk/arm64/sptm/pmap/pmap.h index 1e116bd78..f8b6a256d 100644 --- a/osfmk/arm64/sptm/pmap/pmap.h +++ b/osfmk/arm64/sptm/pmap/pmap.h @@ -172,7 +172,6 @@ struct page_table_attr; struct pmap_cpu_data { unsigned int cpu_number; bool copywindow_strong_sync[CPUWINDOWS_MAX]; - bool inflight_disconnect; pv_free_list_t pv_free; pv_entry_t *pv_free_spill_marker; }; @@ -199,20 +198,20 @@ typedef struct pmap_cpu_data pmap_cpu_data_t; * This indicates (roughly) where there is free space for the VM * to use for the heap; this does not need to be precise. 
*/ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #if defined(ARM_LARGE_MEMORY) #define KERNEL_PMAP_HEAP_RANGE_START (VM_MIN_KERNEL_AND_KEXT_ADDRESS+ARM_TT_L1_SIZE) #else /* defined(ARM_LARGE_MEMORY) */ #define KERNEL_PMAP_HEAP_RANGE_START VM_MIN_KERNEL_AND_KEXT_ADDRESS #endif /* defined(ARM_LARGE_MEMORY) */ -#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ #if defined(ARM_LARGE_MEMORY) /* For large memory systems with no KTRR/CTRR such as virtual machines */ #define KERNEL_PMAP_HEAP_RANGE_START (VM_MIN_KERNEL_AND_KEXT_ADDRESS+ARM_TT_L1_SIZE) #else #define KERNEL_PMAP_HEAP_RANGE_START LOW_GLOBAL_BASE_ADDRESS #endif -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ /** * For setups where the VM page size does not match the hardware page size (the @@ -337,19 +336,18 @@ struct pmap { queue_chain_t pmaps; /* Information representing the "nested" (shared) region in this pmap. */ - struct pmap *nested_pmap; vm_map_address_t nested_region_addr; vm_map_offset_t nested_region_size; vm_map_offset_t nested_region_true_start; vm_map_offset_t nested_region_true_end; - bitmap_t *nested_region_unnested_table_bitmap; + union { + struct pmap *nested_pmap; + bitmap_t *nested_region_unnested_table_bitmap; + }; /* PMAP reference count */ os_ref_atomic_t ref_count; - /* Number of pmaps that nested this pmap without bounds set. */ - uint32_t nested_no_bounds_refcnt; - union { /** * Represents the address space identifier (ASID) for this pmap. @@ -401,12 +399,6 @@ struct pmap { /* Whether this pmap represents a 64-bit address space. */ bool is_64bit; - /* Nested a pmap when the bounds were not set. */ - bool nested_has_no_bounds_ref; - - /* The nesting bounds have been set. */ - bool nested_bounds_set; - #if HAS_APPLE_PAC bool disable_jop; #else @@ -428,6 +420,9 @@ struct pmap { uint32_t reserved7[4]; void *reserved8; uint8_t reserved9; + + /* The ID of the vm_map that this pmap is backing, if any */ + vm_map_serial_t associated_vm_map_serial_id; }; #define PMAP_VASID(pmap) ((pmap)->asid) @@ -473,6 +468,7 @@ extern void pmap_gc(void); #if HAS_APPLE_PAC extern void * pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t data, uint64_t jop_key); extern void * pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t data, uint64_t jop_key); +extern bool pmap_batch_sign_user_ptr(void *location, void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key); #endif /* HAS_APPLE_PAC */ /** @@ -543,6 +539,7 @@ extern boolean_t pmap_bootloader_page(ppnum_t pn); extern boolean_t pmap_is_empty(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end); + #define ARM_PMAP_MAX_OFFSET_DEFAULT 0x01 #define ARM_PMAP_MAX_OFFSET_MIN 0x02 #define ARM_PMAP_MAX_OFFSET_MAX 0x04 diff --git a/osfmk/arm64/sptm/pmap/pmap_data.c b/osfmk/arm64/sptm/pmap/pmap_data.c index 298a616b1..5c6cabefe 100644 --- a/osfmk/arm64/sptm/pmap/pmap_data.c +++ b/osfmk/arm64/sptm/pmap/pmap_data.c @@ -268,6 +268,24 @@ SECURITY_READ_ONLY_LATE(pmap_paddr_t) sptm_cpu_iommu_scratch_end = 0; /* Prototypes used by pmap_data_bootstrap(). 
*/ void pmap_cpu_data_array_init(void); +#if __ARM64_PMAP_SUBPAGE_L1__ +/* A list of subpage user root table page tracking structures. */ +queue_head_t surt_list; + +/** + * A mutex protecting surt_list related operations. + */ +decl_lck_mtx_data(, surt_lock); + +/* Is the SURT subsystem initialized? */ +bool surt_ready = false; +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + +#if DEBUG || DEVELOPMENT +/* Track number of instances a WC/RT mapping request is converted to Device-GRE. */ +static _Atomic unsigned int pmap_wcrt_on_non_dram_count = 0; +#endif /* DEBUG || DEVELOPMENT */ + /** * This function is called once during pmap_bootstrap() to allocate and * initialize many of the core data structures that are implemented in this @@ -496,7 +514,7 @@ pmap_page_alloc(pmap_paddr_t *ppa, unsigned options) * If we're only allocating a single page, just grab one off the VM's * global page free list. */ - int grab_options = VM_PAGE_GRAB_OPTIONS_NONE; + vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE; while ((mem = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) { if (options & PMAP_PAGE_ALLOCATE_NOWAIT) { break; @@ -1601,8 +1619,6 @@ pmap_remove_pv( if (pvh_test_type(locked_pvh->pvh, PVH_TYPE_NULL)) { pvh_set_flags(locked_pvh, 0); - const pmap_paddr_t pa = pai_to_pa(pai); - pmap_prepare_unmapped_page_for_retype(pa); pp_attr_t attrs_to_clear = 0; if (is_internal) { attrs_to_clear |= PP_ATTR_INTERNAL; @@ -1613,8 +1629,6 @@ pmap_remove_pv( if (attrs_to_clear != 0) { ppattr_modify_bits(pai, attrs_to_clear, 0); } - /* If removing the last mapping to a specially-protected page, retype the page back to XNU_DEFAULT. */ - pmap_retype_unmapped_page(pa); } *is_internal_p = is_internal; @@ -1808,6 +1822,14 @@ ptd_alloc(pmap_t pmap, unsigned int alloc_flags) return NULL; } + /** + * For PTDs that are linked to pmaps, initialize the wired count to 1 + * to prevent pmap_remove() from concurrently attempting to free a + * newly-installed page table page while it is still being initialized. + * This wired reference will be atomically dropped in ptd_info_init() + * once page table initialization is complete. + */ + ptdp->ptd_info->wiredcnt = 1; ptdp->pmap = pmap; pmap_tt_ledger_credit(pmap, sizeof(*ptdp)); @@ -1846,15 +1868,12 @@ ptd_deallocate(pt_desc_t *ptdp) } /** - * In address spaces where the VM page size is larger than the underlying - * hardware page size, one page table descriptor (PTD) object can represent - * multiple page tables. Some fields (like the reference counts) still need to - * be tracked on a per-page-table basis. Because of this, those values are - * stored in a separate array of ptd_info_t objects within the PTD where there's - * one ptd_info_t for every page table a single PTD can manage. - * - * This function initializes the correct ptd_info_t field within a PTD based on - * the page table it's representing. + * This function initializes the VA within a PTD based on the page table it's + * representing. This function must be called before a newly-allocated page + * table is installed via sptm_map_table(), as other threads will be able to + * use that page table as soon as it is installed and will expect valid PTD + * info at that point. It is assumed that sptm_map_table() will issue barriers + * which effectively guarantee the ordering of these updates. * * @param ptdp Pointer to the PTD object which contains the ptd_info_t field to * update. Must match up with the `pmap` and `ptep` parameters. 
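A minimal sketch of the page-table installation handshake implied by the wired-count changes above, assuming an alloc_flags value of 0 and omitting the SPTM publication call; the wrapper name is hypothetical and ptd_info_init()'s parameter order is assumed, while ptd_alloc() and ptd_info_finalize() come from this file.

/*
 * Illustrative sketch: the initial wired reference taken in ptd_alloc() keeps
 * pmap_remove() from freeing a newly allocated page table until
 * ptd_info_finalize() drops it after the table has been published.
 */
static pt_desc_t *
example_install_page_table(pmap_t pmap, vm_map_address_t va,
    unsigned int level, pt_entry_t *ptep)
{
	pt_desc_t *ptdp = ptd_alloc(pmap, 0);   /* wiredcnt starts at 1 */
	if (ptdp == NULL) {
		return NULL;
	}

	/* Record the table's VA before other threads can observe the table. */
	ptd_info_init(ptdp, pmap, va, level, ptep);

	/*
	 * The sptm_map_table() call that actually publishes the table would go
	 * here; its barriers order the ptd_info updates above (call omitted).
	 */

	/* Drop the initial wired reference; the table may now be reclaimed. */
	ptd_info_finalize(ptdp);
	return ptdp;
}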
@@ -1895,6 +1914,28 @@ ptd_info_init( ptdp->va = (vm_offset_t) va & ~pt_attr_ln_pt_offmask(pt_attr, level - 1); } +/** + * Performs final initialization of a newly-allocated page table descriptor. + * This function effectively marks the linked page table as eligible for deallocation + * and should therefore be called once initialization and mapping of the page table is + * complete. + * + * @param ptdp Pointer to the PTD object which contains the ptd_info_t field to + * finalize + */ +void +ptd_info_finalize(pt_desc_t *ptdp) +{ + /** + * Atomically drop the wired count (previously initialized to 1) with + * release ordering to ensure all prior page table initialization is visible + * to any subsequent pmap operation that attempts to operate on the PTD. + */ + __assert_only unsigned short prev_refcnt = + os_atomic_dec_orig(&ptdp->ptd_info->wiredcnt, release); + assert3u(prev_refcnt, >, 0); +} + /** * Credit a specific ledger entry within the passed in pmap's ledger object. * @@ -2044,8 +2085,14 @@ pmap_find_io_attr(pmap_paddr_t paddr) const int cmp = cmp_io_rgns(&wanted_range, &io_attr_table[middle]); if (cmp == 0) { - /* Success! Found the wanted I/O range. */ - return &io_attr_table[middle]; + pmap_io_range_t const *range = &io_attr_table[middle]; + if (!(range->wimg & PMAP_IO_RANGE_NOT_IO)) { + /* Success! Found the wanted I/O range. */ + return &io_attr_table[middle]; + } else { + /* Ranges may not overlap, so we're not going to find anything. */ + break; + } } else if (begin == end) { /* We've checked every range and didn't find a match. */ break; @@ -2061,6 +2108,25 @@ pmap_find_io_attr(pmap_paddr_t paddr) return NULL; } +/** + * Iterate over all pmap-io-ranges, call the given step function on + * each of them, returning prematurely if the step function returns + * false. + * + * @param step The step function applied to each range. If it returns + * false, iteration stops. + */ + +void +pmap_range_iterate(bool (^step)(pmap_io_range_t const *)) +{ + for (size_t i = 0; i < num_io_rgns; i++) { + if (!step(&io_attr_table[i])) { + return; + } + } +} + /** * Initialize the pmap per-CPU data structure for a single CPU. This is called * once for each CPU in the system, on the CPU whose per-cpu data needs to be @@ -2085,6 +2151,7 @@ pmap_cpu_data_init_internal(unsigned int cpu_number) /* Setup per-cpu fields used when calling into the SPTM. */ pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); assert(((uintptr_t)sptm_pcpu & (PMAP_SPTM_PCPU_ALIGN - 1)) == 0); + sptm_pcpu->sptm_user_pointer_ops_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_user_pointer_ops); sptm_pcpu->sptm_ops_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_ops); sptm_pcpu->sptm_templates_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_templates); sptm_pcpu->sptm_paddrs_pa = kvtophys_nofail((vm_offset_t)sptm_pcpu->sptm_paddrs); @@ -2227,3 +2294,305 @@ pmap_is_page_free(pmap_paddr_t paddr) */ return sptm_frame_is_last_mapping(paddr, SPTM_REFCOUNT_NONE); } + +#if MACH_ASSERT +/** + * Verify that a given physical page contains no mappings (outside of the + * default physical aperture mapping) and if it does, then panic. + * + * @note It's recommended to use pmap_verify_free() directly when operating in + * the PPL since the PVH lock isn't getting grabbed here (due to this code + * normally being called from outside of the PPL, and the pv_head_table + * can't be modified outside of the PPL). + * + * @param ppnum Physical page number to check there are no mappings to. 
+ */ +void +pmap_assert_free(ppnum_t ppnum) +{ + const pmap_paddr_t pa = ptoa(ppnum); + + /* Only mappings to kernel-managed physical memory are tracked. */ + if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) { + return; + } + + const unsigned int pai = pa_index(pa); + const uintptr_t pvh = pai_to_pvh(pai); + + /** + * This function is always called from outside of the PPL. Because of this, + * the PVH entry can't be locked. This function is generally only called + * before the VM reclaims a physical page and shouldn't be creating new + * mappings. Even if a new mapping is created while parsing the hierarchy, + * the worst case is that the system will panic in another way, and we were + * already about to panic anyway. + */ + + /** + * Since pmap_verify_free() returned false, that means there is at least one + * mapping left. Let's get some extra info on the first mapping we find to + * dump in the panic string (the common case is that there is one spare + * mapping that was never unmapped). + */ + pt_entry_t *first_ptep = PT_ENTRY_NULL; + + if (pvh_test_type(pvh, PVH_TYPE_PTEP)) { + first_ptep = pvh_ptep(pvh); + } else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) { + pv_entry_t *pvep = pvh_pve_list(pvh); + + /* Each PVE can contain multiple PTEs. Let's find the first one. */ + for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) { + first_ptep = pve_get_ptep(pvep, pve_ptep_idx); + if (first_ptep != PT_ENTRY_NULL) { + break; + } + } + + /* The PVE should have at least one valid PTE. */ + assert(first_ptep != PT_ENTRY_NULL); + } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) { + panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)", + __func__, (void*)pvh, pai); + } else { + /** + * The mapping disappeared between here and the pmap_verify_free() call. + * The only way that can happen is if the VM was racing this call with + * a call that unmaps PTEs. Operations on this page should not be + * occurring at the same time as this check, and unfortunately we can't + * lock the PVH entry to prevent it, so just panic instead. + */ + panic("%s: Mapping was detected but is now gone. Is the VM racing this " + "call with an operation that unmaps PTEs? PVH %p (pai: %d)", + __func__, (void*)pvh, pai); + } + + /* Panic with a unique string identifying the first bad mapping and owner. */ + { + /* First PTE is mapped by the main CPUs. */ + pmap_t pmap = ptep_get_pmap(first_ptep); + const char *type = (pmap == kernel_pmap) ? "Kernel" : "User"; + + panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a " + "%s CPU mapping (pmap: %p)", + __func__, (uint64_t)pa, first_ptep, type, pmap); + } +} +#endif /* MACH_ASSERT */ + +inline void +pmap_recycle_page(ppnum_t pn) +{ + const bool is_freed = pmap_is_page_free(ptoa(pn)); + + if (__improbable(!is_freed)) { + /* + * There is a redundancy here, but we are going to panic anyways, + * and ASSERT_PMAP_FREE traces useful information. So, we keep this + * behavior. 
+ */ +#if MACH_ASSERT + pmap_assert_free(pn); +#endif /* MACH_ASSERT */ + panic("%s: page 0x%llx is referenced", __func__, (unsigned long long)ptoa(pn)); + } + + const pmap_paddr_t paddr = ptoa(pn); + const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr); + if (__improbable(pmap_type_requires_retype_on_recycle(frame_type))) { + const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; + sptm_retype(paddr, frame_type, XNU_DEFAULT, retype_params); + } +} + +#if __ARM64_PMAP_SUBPAGE_L1__ +/* A structure tracking the state of a SURT page. */ +typedef struct { + /* The PA of the SURT page. */ + pmap_paddr_t surt_page_pa; + + /* A bitmap tracking the allocation status of the SURTs in the page. */ + bitmap_t surt_page_free_bitmap[SUBPAGE_USER_ROOT_TABLE_INDEXES / (sizeof(bitmap_t) * 8)]; + + /* A queue chain chaining all the tracking structures together. */ + queue_chain_t surt_chain; +} surt_page_t; + +/** + * Initialize the SURT subsystem. + * + * @note Expected to be called when pmap is being bootstrapped, before a user + * pmap is created. + */ +void +surt_init() +{ + if (__improbable(surt_ready)) { + panic("%s: initializing the SURT subsystem while it has already been initialized", __func__); + } + + queue_init(&surt_list); + lck_mtx_init(&surt_lock, &pmap_lck_grp, LCK_ATTR_NULL); + + /* A plain write is okay only in single-core early bootstrapping. */ + surt_ready = true; +} + +/** + * Lock the SURT lock. + */ +static inline void +surt_lock_lock() +{ + assert(surt_ready); + lck_mtx_lock(&surt_lock); +} + +/** + * Unlock the SURT lock. + */ +static inline void +surt_lock_unlock() +{ + lck_mtx_unlock(&surt_lock); +} + +/** + * Try to find a SURT from the SURT page queue. + * + * @note This function doesn't block. If a SURT is not found, the caller is + * responsible for allocating a page and feed it to the SURT subsystem. + * + * @return the PA of the SURT if one is found, 0 otherwise. + */ +pmap_paddr_t +surt_try_alloc() +{ + surt_lock_lock(); + pmap_paddr_t surt_pa = 0ULL; + + /* Look for a free table on existing SURT pages. */ + surt_page_t *surt_page; + qe_foreach_element(surt_page, &surt_list, surt_chain) { + const int first_available_index = bitmap_lsb_first(&surt_page->surt_page_free_bitmap[0], SUBPAGE_USER_ROOT_TABLE_INDEXES); + if (first_available_index >= 0) { + surt_pa = surt_pa_from_surt_page_pa_and_index(surt_page->surt_page_pa, (uint8_t) first_available_index); + bitmap_clear(&surt_page->surt_page_free_bitmap[0], first_available_index); + break; + } + } + + /** + * Either return a non-zero PA of the found SURT or zero. A zero return + * value indicates the caller should allocate a new SURT page + */ + surt_lock_unlock(); + return surt_pa; +} + +/** + * Free the SURT at a physical address. + * + * @return True if the SURT page has no allocated SURT and has been removed + * from the queue so that the caller can repurpose the page. False + * otherwise. + */ +bool +surt_free(pmap_paddr_t surt_pa) +{ + if (__improbable(surt_pa & (SUBPAGE_USER_ROOT_TABLE_SIZE - 1))) { + panic("%s: surt_pa %p is expected to be %u-byte aligned", + __func__, (void *)surt_pa, (unsigned int) SUBPAGE_USER_ROOT_TABLE_SIZE); + } + + surt_lock_lock(); + const uint8_t surt_index = (uint8_t) ((surt_pa & PAGE_MASK) / SUBPAGE_USER_ROOT_TABLE_SIZE); + + /* Look for a free table on existing SURT pages. 
*/ + surt_page_t *surt_page; + qe_foreach_element_safe(surt_page, &surt_list, surt_chain) { + if (surt_page->surt_page_pa == surt_page_pa_from_surt_pa(surt_pa)) { + /* Mark the SURT as free. */ + bitmap_set(&surt_page->surt_page_free_bitmap[0], surt_index); + + /* If the entire SURT page is free, remove it from the page queue. */ + if (bitmap_is_full(&surt_page->surt_page_free_bitmap[0], SUBPAGE_USER_ROOT_TABLE_INDEXES)) { + remqueue(&surt_page->surt_chain); + + /* Done with the page queue so unlock it before freeing surt_page. */ + surt_lock_unlock(); + kfree_type(surt_page_t, surt_page); + return true; + } else { + surt_lock_unlock(); + return false; + } + } + } + + panic("%s: no matching surt_page_t found for surt_pa: %p", __func__, (void *)surt_pa); +} + +/** + * Add a SURT page to the SURT page queue, with its SURT at index 0 allocated. + * + * @note Designed this way so that the caller can call into SPTM for SURT + * allocation before the page is seen by the other threads in the + * system. + * + * @param surt_page_pa The phyiscal address of the SURT page. + */ +void +surt_feed_page_with_first_table_allocated(pmap_paddr_t surt_page_pa) +{ + surt_page_t *surt_page = kalloc_type(surt_page_t, Z_ZERO | Z_WAITOK); + + if (__improbable(surt_page_pa & PAGE_MASK)) { + panic("%s: surt_page_pa %p is expected to be page aligned", __func__, (void *)surt_page_pa); + } + + surt_lock_lock(); + surt_page->surt_page_pa = surt_page_pa; + bitmap_full(&surt_page->surt_page_free_bitmap[0], SUBPAGE_USER_ROOT_TABLE_INDEXES); + bitmap_clear(&surt_page->surt_page_free_bitmap[0], 0); + enqueue_head(&surt_list, &surt_page->surt_chain); + surt_lock_unlock(); +} + +unsigned int +surt_list_len() +{ + unsigned int len = 0; + + surt_lock_lock(); + __unused surt_page_t *surt_page; + qe_foreach_element(surt_page, &surt_list, surt_chain) { + len = len + 1; + } + surt_lock_unlock(); + return len; +} +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + +#if DEBUG || DEVELOPMENT +/** + * Get the value of the WC/RT on non-DRAM mapping request counter. + * + * @return The value of the counter. + */ +unsigned int +pmap_wcrt_on_non_dram_count_get() +{ + return os_atomic_load(&pmap_wcrt_on_non_dram_count, relaxed); +} + +/** + * Atomically increment the WC/RT on non-DRAM mapping request counter. + */ +void +pmap_wcrt_on_non_dram_count_increment_atomic() +{ + os_atomic_inc(&pmap_wcrt_on_non_dram_count, relaxed); +} +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/arm64/sptm/pmap/pmap_data.h b/osfmk/arm64/sptm/pmap/pmap_data.h index 6f6244f58..00ff1e5cf 100644 --- a/osfmk/arm64/sptm/pmap/pmap_data.h +++ b/osfmk/arm64/sptm/pmap/pmap_data.h @@ -174,10 +174,12 @@ pai_to_pvh(unsigned int pai) * type needs to be checked before dereferencing the pointer to determine which * pointer type to dereference as. */ -#define PVH_TYPE_NULL 0x0UL -#define PVH_TYPE_PVEP 0x1UL -#define PVH_TYPE_PTEP 0x2UL -#define PVH_TYPE_PTDP 0x3UL +__enum_closed_decl(pvh_type_t, uint8_t, { + PVH_TYPE_NULL = 0b00, + PVH_TYPE_PVEP = 0b01, + PVH_TYPE_PTEP = 0b10, + PVH_TYPE_PTDP = 0b11, +}); #define PVH_TYPE_MASK (0x3UL) @@ -474,7 +476,7 @@ pvh_lock_enter_sleep_mode(locked_pvh_t *locked_pvh) * otherwise. */ static inline bool -pvh_test_type(uintptr_t pvh, uintptr_t type) +pvh_test_type(uintptr_t pvh, pvh_type_t type) { return (pvh & PVH_TYPE_MASK) == type; } @@ -989,7 +991,7 @@ pve_remove(locked_pvh_t *locked_pvh, pv_entry_t **pvepp, pv_entry_t *pvep) * the fields were within the same structure. */ typedef struct { - /* + /** * For non-leaf pagetables, should be 0. 
* For leaf pagetables, should reflect the number of wired entries. * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU @@ -1094,21 +1096,23 @@ typedef struct pt_desc { * This structure is intended to be embedded in the pmap per-CPU data object, * and is meant to be used for situations in which the caller needs to ensure * that potentially sensitive concurrent SPTM operations have completed on other - * CPUs prior to retyping a page. If these sensitive operations haven't completed - * when the retype occurs, and they happen to involve the page being retyped - * (either directly or through mappings thereof), an SPTM violation panic may - * result. + * CPUs prior to an operation (such as a retype) that requires page or mapping + * state to be stable. When draining these concurrent operations, the caller + * is also expected to have already taken steps to ensure the page/mapping + * state requirements will be visible to any concurrent pmap operation initiated + * after the drain operation is begun, so that only previously-initiated + * operations will need to be purged. */ typedef struct { /** * Critical section sequence number of the local CPU. A value of zero - * indicates that no retype epoch critical section is currently active on + * indicates that no pmap epoch critical section is currently active on * the CPU. */ uint64_t local_seq; /** - * The sequence number to use the next time a retype epoch critical section + * The sequence number to use the next time a pmap epoch critical section * is entered on the local CPU. This should monotonically increase. */ uint64_t next_seq; @@ -1124,7 +1128,7 @@ typedef struct { uint64_t remote_seq[MAX_CPUS]; /** - * Flags used to track the state of an active retype epoch drain operation + * Flags used to track the state of an active pmap epoch drain operation * on the local CPU. */ @@ -1133,7 +1137,7 @@ typedef struct { * local CPU by sampling remote CPU epoch states into the remote_seq array. * This must be set before the drain operation can be performed. */ - #define PMAP_RETYPE_EPOCH_PREPARED (1 << 0) + #define PMAP_EPOCH_PREPARED (1 << 0) /** * This flag indicates that one or more remote CPUs had a non-zero retype @@ -1142,9 +1146,9 @@ typedef struct { * be in a critical section in which prior mapping state for the page to * be retyped may have been observed, so we can skip the drain operation. */ - #define PMAP_RETYPE_EPOCH_DRAIN_REQUIRED (1 << 1) + #define PMAP_EPOCH_DRAIN_REQUIRED (1 << 1) uint8_t flags; -} pmap_retype_epoch_t; +} pmap_epoch_t; #define PMAP_SPTM_PCPU_ALIGN (8192) @@ -1161,6 +1165,9 @@ typedef struct { */ void *sptm_iommu_scratch; + /* Accumulator for batched user pointer SPTM ops, to avoid excessive stack usage. */ + sptm_user_pointer_op_t sptm_user_pointer_ops[SPTM_MAPPING_LIMIT]; + /* Accumulator for batched disjoint SPTM ops, to avoid excessive stack usage. */ sptm_disjoint_op_t sptm_ops[SPTM_MAPPING_LIMIT]; @@ -1172,6 +1179,9 @@ typedef struct { sptm_paddr_t sptm_paddrs[SPTM_MAPPING_LIMIT]; }; + /* Base PA of user pointer ops array, for passing the ops into the SPTM. */ + pmap_paddr_t sptm_user_pointer_ops_pa; + /* Base PA of ops array, for passing the ops into the SPTM. */ pmap_paddr_t sptm_ops_pa; @@ -1192,8 +1202,8 @@ typedef struct { #define PMAP_SPTM_FLAG_ALTACCT (0x2) uint8_t sptm_acct_flags[SPTM_MAPPING_LIMIT]; - /* Retype epoch tracking structure. */ - pmap_retype_epoch_t retype_epoch; + /* pmap epoch tracking structure. */ + pmap_epoch_t pmap_epoch; /* Guest virtual machine dispatch structure. 
*/ sptm_guest_dispatch_t sptm_guest_dispatch; @@ -1958,54 +1968,57 @@ ppattr_test_modfault(unsigned int pai) } /** - * Retype epoch operations: + * pmap epoch operations: * - * The retype epoch facility provides an SMR/RCU-like mechanism by which the SPTM pmap - * can ensure all CPUs have observed updated mapping state before retyping a physical page. + * The pmap epoch facility provides an SMR/RCU-like mechanism by which the SPTM pmap + * can ensure all CPUs have observed updated mapping state before performing an operation + * such as a retype which requires that no other operations be in-flight against the + * prior mapping state. * * There are certain cases in which the pmap, while issuing an SPTM call that modifies - * mappings, cannot hold locks such as the PVH lock which would prevent the page from - * being concurrently retyped. This is particularly true for batched operations such - * as pmap_remove(), phys_attribute_clear_range(), and pmap_batch_set_cache_attributes(). - * In these cases, the pmap may call pmap_retype_epoch_enter() to note that it is - * performing such a sensitive operation on the local CPU. It must then call - * pmap_retype_epoch_exit() upon completion of the sensitive operation. + * mappings, cannot hold locks such as the PVH lock which would prevent the mapped page + * from being concurrently retyped. This is particularly true for batched operations + * such as pmap_remove(), phys_attribute_clear_range(), and pmap_batch_set_cache_attributes(). + * In these cases, the pmap may call pmap_epoch_enter() to note that it is performing such + * a sensitive operation on the local CPU. It must then call pmap_epoch_exit() upon + * completion of the sensitive operation. While retyping is the most common case that + * requires epoch synchronization, there are a few other cases as well, such as marking + * a leaf page table as unnested so that all subsequent mappings in it will be non-global. * - * Then, for any instance in which the pmap needs to retype a page without being - * otherwise guaranteed (e.g. by VM layer locking or the existing page type) that such - * a sensitive operation is not in progress on some other CPU, it must drain these + * For any instance in which the pmap needs to retype a page (or otherwise alter mapping + * policy) without being guaranteed (e.g. by VM layer locking or the existing page type) + * that such a sensitive operation is not in progress on some other CPU, it must drain these * sensitive operations from other CPUs. Specifically, it must ensure that any - * sensitive operation which may have observed prior mapping state of the page that - * is to be retyped has completed. This is accomplished by first calling - * pmap_retype_epoch_prepare_drain() to record the initial retype epoch state of - * all CPUs, followed by pmap_retype_epoch_drain() to ensure all remote CPUs are - * either not in an epoch or have advanced beyond the initially recorded epoch. - * These are exposed as two separate functions in order to allow the calling CPU - * to do other work between calling pmap_retype_epoch_prepare_drain() and - * pmap_retype_epoch_drain(), as a best-effort attempt to minimize time wasted - * spinning in pmap_retype_epoch_drain(). + * sensitive operation which may have observed mapping state under the prior mapping policy + * has completed. 
This is accomplished by first calling pmap_epoch_prepare_drain() to + * record the initial pmap epoch state of all CPUs, followed by pmap_epoch_drain() to ensure + * all remote CPUs are either not in an epoch or have advanced beyond the initially recorded + * epoch. These are exposed as two separate functions in order to allow the calling CPU to + * do other work between calling pmap_epoch_prepare_drain() and pmap_epoch_drain(), as a + * best-effort attempt to minimize time wasted spinning in pmap_epoch_drain(). * - * When draining the retype epoch, the following assumptions must hold true: + * When draining the epoch, the following assumptions must hold true: * - * 1) The calling thread must guarantee that prior updates needed to bring the page - * into the correct mapping state for retyping have already been performed and made - * globally visible using the appropriate barriers. In most cases this means that - * all existing mappings of the page must have been removed. For any alterations - * of mapping state, global visibility is conveniently already guaranteed by the - * DSBs that are architecturally required to synchronize PTE updates and the TLBIs - * that follow them. + * 1) The calling thread must guarantee that prior updates needed to apply the new mapping + * policy have already been performed and made globally visible using the appropriate + * barriers. In the most common (retype) case, this means all existing mappings of the + * page must have been removed. For any alterations of mapping state, global visibility is + * conveniently already guaranteed by the DSBs that are architecturally required to + * synchronize PTE updates and the TLBIs that follow them. * - * 2) The calling thread must have some means of ensuring the new mappings cannot - * be added for the page that would bring it out of the correct state for retyping. - * This is typically done by holding the PVH lock and/or the exclusive pmap lock - * such that pmap_enter() cannot concurrently execute against the page. + * 2) For operations that require exclusive in-flight page references such as retyping, + * the calling thread must have some means of ensuring that new mappings cannot be added + * for the page that would bring it out of the correct state for the operation, or that + * would cause an SPTM violation due to a shared/exclusive in-flight reference conflict. + * For retyping this is typically done by holding the PVH lock such that pmap_enter() + * cannot concurrently execute against the page. * * 3) The calling thread must not perform any operation which requires preemptibility - * between calling pmap_retype_epoch_prepare_drain() and pmap_retype_epoch_drain(). + * between calling pmap_epoch_prepare_drain() and pmap_epoch_drain(). */ /** - * Enter the retype epoch on the local CPU to indicate an in-progress SPTM operation + * Enter the pmap epoch on the local CPU to indicate an in-progress SPTM operation * that may be sensitive to a concurrent retype operation on another CPU. * * @note This function increments the thread's preemption disable count and returns @@ -2015,17 +2028,17 @@ ppattr_test_modfault(unsigned int pai) * the epoch update relative to ensuing SPTM accesses. 
*/ static inline void -pmap_retype_epoch_enter(void) +pmap_epoch_enter(void) { mp_disable_preemption(); - pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch; + pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch; assert(!preemption_enabled()); - /* Must not already been in a retype epoch on this CPU. */ - assert(retype_epoch->local_seq == 0); - retype_epoch->local_seq = ++retype_epoch->next_seq; + /* Must not already be in a pmap epoch on this CPU. */ + assert(pmap_epoch->local_seq == 0); + pmap_epoch->local_seq = ++pmap_epoch->next_seq; /* Unsigned 64-bit per-CPU integer should never overflow on any human timescale. */ - assert(retype_epoch->local_seq != 0); + assert(pmap_epoch->local_seq != 0); /** * Issue a store-load barrier to ensure that remote observers of any ensuing @@ -2035,25 +2048,25 @@ pmap_retype_epoch_enter(void) } /** - * Exit the retype epoch on the local CPU to indicate completion of an SPTM operation + * Exit the pmap epoch on the local CPU to indicate completion of an SPTM operation * that may be sensitive to a concurrent retype operation on another CPU. * * @note This function must be called with preemption disabled and will decrement * the current thread's preemption disable count. */ static inline void -pmap_retype_epoch_exit(void) +pmap_epoch_exit(void) { - pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch; + pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch; assert(!preemption_enabled()); - assert(retype_epoch->local_seq == retype_epoch->next_seq); + assert(pmap_epoch->local_seq == pmap_epoch->next_seq); /** * Clear the sequence using a store-release operation to ensure that prior * SPTM modifications will be visible to remote observers before the absence * of an epoch is visible. */ - os_atomic_store(&retype_epoch->local_seq, 0, release); + os_atomic_store(&pmap_epoch->local_seq, 0, release); mp_enable_preemption(); } @@ -2065,7 +2078,7 @@ pmap_retype_epoch_exit(void) static inline bool pmap_in_epoch(void) { - return !preemption_enabled() && (PERCPU_GET(pmap_sptm_percpu)->retype_epoch.local_seq != 0); + return !preemption_enabled() && (PERCPU_GET(pmap_sptm_percpu)->pmap_epoch.local_seq != 0); } /** @@ -2088,30 +2101,30 @@ pmap_in_epoch(void) * thread_fence) before calling this function. */ static inline void -pmap_retype_epoch_prepare_drain(void) +pmap_epoch_prepare_drain(void) { mp_disable_preemption(); - pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch; - assert(retype_epoch->flags == 0); + pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch; + assert(pmap_epoch->flags == 0); unsigned int i = 0; - uint8_t flags = PMAP_RETYPE_EPOCH_PREPARED; + uint8_t flags = PMAP_EPOCH_PREPARED; /* Sample each CPU's epoch state. */ percpu_foreach(pmap_pcpu, pmap_sptm_percpu) { const uint64_t remote_epoch = - os_atomic_load(&pmap_pcpu->retype_epoch.local_seq, relaxed); - retype_epoch->remote_seq[i] = remote_epoch; + os_atomic_load(&pmap_pcpu->pmap_epoch.local_seq, relaxed); + pmap_epoch->remote_seq[i] = remote_epoch; /** * If the remote CPU has an active epoch, make a note to ourselves that * we'll need to drain it.
*/ if (remote_epoch != 0) { - flags |= PMAP_RETYPE_EPOCH_DRAIN_REQUIRED; + flags |= PMAP_EPOCH_DRAIN_REQUIRED; } ++i; } - retype_epoch->flags = flags; + pmap_epoch->flags = flags; /** * Issue a load-load barrier to ensure subsequent drain or retype operations will @@ -2122,12 +2135,12 @@ pmap_retype_epoch_prepare_drain(void) /** * Ensure that all CPUs have advanced beyond any active epoch that was recorded in the - * most recent call to pmap_retype_epoch_prepare_drain(). + * most recent call to pmap_epoch_prepare_drain(). * * @note This function expects to be called with preemption disabled and will decrement * the current thread's preemption disable count. * - * @note pmap_retype_epoch_prepare_drain() must have been called on the local CPU + * @note pmap_epoch_prepare_drain() must have been called on the local CPU * prior to calling this function. This function will return immediately if * this prior call did not observe any active epochs on remote CPUs. * @@ -2135,28 +2148,28 @@ pmap_retype_epoch_prepare_drain(void) * retype operation is not speculated ahead of the epoch sampling. */ static inline void -pmap_retype_epoch_drain(void) +pmap_epoch_drain(void) { assert(!preemption_enabled()); - pmap_retype_epoch_t *retype_epoch = &PERCPU_GET(pmap_sptm_percpu)->retype_epoch; - const uint8_t flags = retype_epoch->flags; - assert(flags & PMAP_RETYPE_EPOCH_PREPARED); - retype_epoch->flags = 0; - if (!(flags & PMAP_RETYPE_EPOCH_DRAIN_REQUIRED)) { + pmap_epoch_t *pmap_epoch = &PERCPU_GET(pmap_sptm_percpu)->pmap_epoch; + const uint8_t flags = pmap_epoch->flags; + assert(flags & PMAP_EPOCH_PREPARED); + pmap_epoch->flags = 0; + if (!(flags & PMAP_EPOCH_DRAIN_REQUIRED)) { mp_enable_preemption(); return; } unsigned int i = 0; percpu_foreach(pmap_pcpu, pmap_sptm_percpu) { - if (retype_epoch->remote_seq[i] != 0) { - assert((pmap_pcpu->retype_epoch.local_seq == 0) || - (pmap_pcpu->retype_epoch.local_seq >= retype_epoch->remote_seq[i])); + if (pmap_epoch->remote_seq[i] != 0) { + assert((pmap_pcpu->pmap_epoch.local_seq == 0) || + (pmap_pcpu->pmap_epoch.local_seq >= pmap_epoch->remote_seq[i])); /** * If the remote CPU was in an epoch, WFE-spin until it either exits the epoch * or advances to a new epoch. */ - while ((os_atomic_load_exclusive(&pmap_pcpu->retype_epoch.local_seq, relaxed) == - retype_epoch->remote_seq[i])) { + while ((os_atomic_load_exclusive(&pmap_pcpu->pmap_epoch.local_seq, relaxed) == + pmap_epoch->remote_seq[i])) { __builtin_arm_wfe(); } /* Clear the monitor if we exclusive-loaded a value that didn't require WFE. */ @@ -2166,7 +2179,7 @@ pmap_retype_epoch_drain(void) } mp_enable_preemption(); /** - * Issue a load-load barrier to ensure subsequent retype operations will + * Issue a load-load barrier to ensure subsequent accesses to sensitive state will * not be speculated ahead of the sampling we just did. */ os_atomic_thread_fence(acquire); @@ -2174,70 +2187,16 @@ pmap_retype_epoch_drain(void) /** * Helper to determine whether a frame type is one that requires automatic - * retyping (by the pmap layer) back to XNU_DEFAULT when all mappings of the - * page are gone. + * retyping (by the pmap layer) back to XNU_DEFAULT when the page is about + * to be recycled by the VM layer. * * @return true if the type requires auto-retyping, false otherwise. 
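A minimal usage sketch of how the epoch primitives above are meant to compose, since the enter/exit and prepare/drain pairings are easy to get wrong. This is not part of the patch: the two functions below are hypothetical, and the sketch assumes the PVH lock helpers (pvh_lock()/pvh_unlock()), pa_index(), sptm_get_frame_type() and the sptm_retype() invocation behave as in their other uses in this file.

/*
 * Hypothetical sketch -- not part of xnu-12377.1.9.
 *
 * A batched SPTM operation that cannot hold the PVH lock brackets its
 * mapping updates with the epoch, so that a concurrent retype (or other
 * policy change) can wait for it to drain.
 */
static void
example_batched_sptm_op(void)
{
        pmap_epoch_enter();             /* disables preemption, publishes local_seq */
        /* ... issue the batched SPTM mapping updates for this CPU ... */
        pmap_epoch_exit();              /* store-release of local_seq, re-enables preemption */
}

/*
 * A caller that needs mapping state to be stable (here, a retype back to
 * XNU_DEFAULT) first makes the new state globally visible and blocks new
 * mappings via the PVH lock, then drains previously-initiated epochs.
 */
static void
example_retype_to_default(pmap_paddr_t pa)
{
        pvh_lock(pa_index(pa));                 /* keeps pmap_enter() away from this page */
        const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
        if (pmap_type_requires_retype_on_recycle(frame_type)) {
                pmap_epoch_prepare_drain();     /* sample remote local_seq values */
                /* ... other non-preemptible work can overlap with the wait here ... */
                pmap_epoch_drain();             /* WFE-spin until sampled epochs retire */
                sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
                sptm_retype(pa & ~PAGE_MASK, frame_type, XNU_DEFAULT, retype_params);
        }
        pvh_unlock(pa_index(pa));
}

The prepare/drain split exists only so the caller can do useful work between the two calls, as the surrounding comments describe.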
*/ static inline bool -pmap_type_requires_retype_on_unmap(sptm_frame_type_t frame_type) +pmap_type_requires_retype_on_recycle(sptm_frame_type_t frame_type) { - return (frame_type == XNU_USER_EXEC) || (frame_type == XNU_USER_DEBUG) || - (frame_type == XNU_USER_JIT) || (frame_type == XNU_ROZONE) || - (frame_type == XNU_KERNEL_RESTRICTED); -} - - -/** - * If necessary, prepare a physical page for being retyped back to XNU_DEFAULT - * after the last CPU mapping has been removed. This is only needed for pages of - * certain special types such as the various executable types and the kernel RO - * zone type. - * - * @note The PVH lock for the physical page that is getting a new mapping - * registered must already be held. - * - * @param pa The physical address of the recently-unmapped page. - * - * @return true if the page will need to be retyped, false otherwise. - */ -static inline bool -pmap_prepare_unmapped_page_for_retype(pmap_paddr_t pa) -{ - pvh_assert_locked(pa_index(pa)); - const sptm_frame_type_t frame_type = sptm_get_frame_type(pa); - if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) { - pmap_retype_epoch_prepare_drain(); - return true; - } - return false; -} - -/** - * If necessary, retype a physical page back to XNU_DEFAULT after the last CPU - * mapping has been removed. This is only needed for pages of certain special - * types such as the various executable types, the kernel RO zone type, - * and XNU_KERNEL_RESTRICTED. - * - * @note The PVH lock for the physical page that is getting a new mapping - * registered must already be held. - * - * @param pa The physical address of the recently-unmapped page. - * - * @return true if the page needed to be retyped, false otherwise. - */ -static inline bool -pmap_retype_unmapped_page(pmap_paddr_t pa) -{ - pvh_assert_locked(pa_index(pa)); - const sptm_frame_type_t frame_type = sptm_get_frame_type(pa); - if (__improbable(pmap_type_requires_retype_on_unmap(frame_type))) { - sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; - pmap_retype_epoch_drain(); - sptm_retype(pa & ~PAGE_MASK, frame_type, XNU_DEFAULT, retype_params); - return true; - } - return false; + return sptm_type_is_user_executable(frame_type) || + (frame_type == XNU_ROZONE) || (frame_type == XNU_KERNEL_RESTRICTED); } static inline boolean_t @@ -2367,6 +2326,7 @@ extern pt_desc_t *ptd_alloc(pmap_t, unsigned int); extern void ptd_deallocate(pt_desc_t *); extern void ptd_info_init( pt_desc_t *, pmap_t, vm_map_address_t, unsigned int, pt_entry_t *); +extern void ptd_info_finalize(pt_desc_t *); extern kern_return_t pmap_ledger_credit(pmap_t, int, ledger_amount_t); extern kern_return_t pmap_ledger_debit(pmap_t, int, ledger_amount_t); @@ -2382,11 +2342,14 @@ extern void validate_pmap_mutable_internal(const volatile struct pmap *, const c #define validate_pmap_mutable(x) validate_pmap_mutable_internal(x, __func__) /** - * This structure describes a SPTM-owned I/O range. + * This structure describes a SPTM-owned physical memory range. * - * @note This doesn't necessarily have to represent "I/O" only, this can also - * represent non-kernel-managed DRAM (e.g., iBoot carveouts). Any physical - * address region that isn't considered "kernel-managed" is fair game. + * @note This doesn't necessarily have to represent "I/O" only, this + * can also represent non-kernel-managed DRAM (e.g., iBoot + * carveouts). In some special cases, this can also represent + * kernel-managed DRAM, when adding flags for special behavior + * (e.g. 
the range being off limits for hibtext). Such ranges + * must be marked with the PMAP_IO_RANGE_NOT_IO flag. * * @note The layout of this structure needs to map 1-to-1 with the pmap-io-range * device tree nodes. Astris (through the LowGlobals) also depends on the @@ -2402,16 +2365,34 @@ typedef struct pmap_io_range { uint64_t len; /* Strong DSB required for pages in this range. */ - #define PMAP_IO_RANGE_STRONG_SYNC (1UL << 31) + #define PMAP_IO_RANGE_STRONG_SYNC (1U << 31) /* Corresponds to memory carved out by bootloader. */ - #define PMAP_IO_RANGE_CARVEOUT (1UL << 30) + #define PMAP_IO_RANGE_CARVEOUT (1U << 30) - /* Pages in this range need to be included in the hibernation image */ - #define PMAP_IO_RANGE_NEEDS_HIBERNATING (1UL << 29) + /* Pages in this range need to be included in the hibernation image. */ + #define PMAP_IO_RANGE_NEEDS_HIBERNATING (1U << 29) - /* Mark the range as 'owned' by a given subsystem */ - #define PMAP_IO_RANGE_OWNED (1UL << 28) + /* Mark the range as 'owned' by a given subsystem. */ + #define PMAP_IO_RANGE_OWNED (1U << 28) + + /** + * Denotes a range that is *not* to be treated as an I/O range that + * needs to be mapped, but only to decorate arbitrary physical + * memory ranges (including managed memory) with extra + * flags. I.e. this allows tagging of "ordinary" managed memory + * pages with flags like `PMAP_IO_RANGE_PROHIBIT_HIB_WRITE`, or + * informing the SPTM that some (nominally) managed memory pages are + * unavailable for some reason. + * + * Notably, `pmap_find_io_attr()`, and anything else that uses + * `pmap_io_range`s for denoting to-be-mapped I/O ranges, ignores + * entries with this flag. + */ + #define PMAP_IO_RANGE_NOT_IO (1U << 27) + + /* Pages in this range may never be written during hibernation restore. */ + #define PMAP_IO_RANGE_PROHIBIT_HIB_WRITE (1U << 26) /** * Lower 16 bits treated as pp_attr_t, upper 16 bits contain additional @@ -2428,6 +2409,8 @@ _Static_assert(sizeof(pmap_io_range_t) == 24, "unexpected size for pmap_io_range extern pmap_io_range_t* pmap_find_io_attr(pmap_paddr_t); +extern void pmap_range_iterate(bool (^step) (pmap_io_range_t const *)); + /** * This structure describes a sub-page-size I/O region owned by SPTM but the kernel can write to. * @@ -2452,3 +2435,57 @@ typedef struct pmap_io_filter_entry { _Static_assert(sizeof(pmap_io_filter_entry_t) == 8, "unexpected size for pmap_io_filter_entry_t"); extern void pmap_cpu_data_init_internal(unsigned int); + +/** + * Convert a SURT PA to the containing SURT page's PA. + * + * @param surt_pa The SURT's physical address. + * + * @return The containing SURT page's PA. + */ +static inline pmap_paddr_t +surt_page_pa_from_surt_pa(pmap_paddr_t surt_pa) +{ + return surt_pa & ~PAGE_MASK; +} + +/** + * Given a SURT PA, get its index in the containing SURT page. + * + * @param surt_pa The PA of the SURT. + * + * @return The index of the SURT in the containing SURT page. + */ +static inline uint8_t +surt_index_from_surt_pa(pmap_paddr_t surt_pa) +{ + return (uint8_t)((surt_pa & PAGE_MASK) / SUBPAGE_USER_ROOT_TABLE_SIZE); +} + +/** + * Given a SURT page PA and an index, compute the PA of the associated SURT. + * + * @param surt_page_pa The PA of the SURT page. + * @param index The index of the SURT in the SURT page. + * + * @return The computed PA of the SURT.
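A small worked example of the SURT address helpers, not part of the patch. The concrete numbers assume a 16 KiB page (PAGE_MASK == 0x3FFF) and SUBPAGE_USER_ROOT_TABLE_SIZE == 0x1000 purely for illustration, and the check function is hypothetical; it also uses surt_pa_from_surt_page_pa_and_index(), whose body follows immediately below.

/*
 * Hypothetical sketch -- not part of xnu-12377.1.9.
 *
 * With the illustrative sizes above:
 *   surt_pa                                              == 0x800043000
 *   surt_page_pa_from_surt_pa(surt_pa)                   == 0x800040000
 *   surt_index_from_surt_pa(surt_pa)                     == 3   (0x3000 / 0x1000)
 *   surt_pa_from_surt_page_pa_and_index(0x800040000, 3)  == 0x800043000
 *
 * i.e. the helpers decompose a SURT PA into (SURT page PA, index) and back.
 */
static inline void
surt_helpers_roundtrip_check(pmap_paddr_t surt_pa)
{
        const pmap_paddr_t surt_page_pa = surt_page_pa_from_surt_pa(surt_pa);
        const uint8_t index = surt_index_from_surt_pa(surt_pa);
        assert(surt_pa_from_surt_page_pa_and_index(surt_page_pa, index) == surt_pa);
}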
+ */ +static inline pmap_paddr_t +surt_pa_from_surt_page_pa_and_index(pmap_paddr_t surt_page_pa, uint8_t index) +{ + assert((surt_page_pa & PAGE_MASK) == 0); + return surt_page_pa + index * SUBPAGE_USER_ROOT_TABLE_SIZE; +} + +#if __ARM64_PMAP_SUBPAGE_L1__ +extern void surt_init(void); +extern pmap_paddr_t surt_try_alloc(void); +extern bool surt_free(pmap_paddr_t surt_pa); +extern void surt_feed_page_with_first_table_allocated(pmap_paddr_t surt_page_pa); +extern unsigned int surt_list_len(void); +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ + +#if DEBUG || DEVELOPMENT +extern unsigned int pmap_wcrt_on_non_dram_count_get(void); +extern void pmap_wcrt_on_non_dram_count_increment_atomic(void); +#endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/arm64/sptm/pmap/pmap_internal.h b/osfmk/arm64/sptm/pmap/pmap_internal.h index 6218a5cb6..3fbd90dc9 100644 --- a/osfmk/arm64/sptm/pmap/pmap_internal.h +++ b/osfmk/arm64/sptm/pmap/pmap_internal.h @@ -83,9 +83,6 @@ extern bool sptm_stability_hacks; extern void pmap_remove_range_options( pmap_t, vm_map_address_t, vm_map_address_t, int); -extern void pmap_tte_deallocate( - pmap_t, vm_offset_t, tt_entry_t *, unsigned int); - #if defined(PVH_FLAG_EXEC) extern void pmap_set_ptov_ap(unsigned int, unsigned int, boolean_t); #endif /* defined(PVH_FLAG_EXEC) */ @@ -187,23 +184,22 @@ static inline void pmap_assert_locked(__unused pmap_t pmap, __unused pmap_lock_mode_t mode) { #if MACH_ASSERT + if (pmap == kernel_pmap) { + return; + } if (__improbable(sptm_stability_hacks)) { mode = PMAP_LOCK_EXCLUSIVE; } switch (mode) { case PMAP_LOCK_SHARED: - if (pmap != kernel_pmap) { - LCK_RW_ASSERT(&pmap->rwlock, LCK_RW_ASSERT_SHARED); - } + LCK_RW_ASSERT(&pmap->rwlock, LCK_RW_ASSERT_SHARED); break; case PMAP_LOCK_EXCLUSIVE: LCK_RW_ASSERT(&pmap->rwlock, LCK_RW_ASSERT_EXCLUSIVE); break; case PMAP_LOCK_HELD: - if (pmap != kernel_pmap) { - LCK_RW_ASSERT(&pmap->rwlock, LCK_RW_ASSERT_HELD); - } + LCK_RW_ASSERT(&pmap->rwlock, LCK_RW_ASSERT_HELD); break; default: panic("%s: Unknown pmap_lock_mode. pmap=%p, mode=%d", __FUNCTION__, pmap, mode); @@ -228,7 +224,7 @@ pmap_assert_locked_any(__unused pmap_t pmap) * Acquire a pmap object's reader/writer lock as either shared (read-only) or * exclusive (read/write). * - * @note If this function is called to request shared acquisition of the kernel pmap + * @note If this function is called to request acquisition of the kernel pmap * lock, the lock will not be acquired as a performance optimization. See the * the explanation in the function body for why this is safe to do. * @@ -238,32 +234,26 @@ pmap_assert_locked_any(__unused pmap_t pmap) static inline void pmap_lock(pmap_t pmap, pmap_lock_mode_t mode) { + /** + * The pmap lock is only held exclusive for removal of a leaf-level + * page table during pmap_remove(), to prevent concurrent mapping + * into the to-be-deleted table. + * The kernel pmap does not participate in the above, as table + * removal is only done for user pmaps. + * Since the kernel pmap never requires exclusive locking, it's + * also pointless to use shared locking and we can therefore elide + * any acquisition of the kernel pmap lock. + */ + if (pmap == kernel_pmap) { + return; + } if (__improbable(sptm_stability_hacks)) { mode = PMAP_LOCK_EXCLUSIVE; } switch (mode) { case PMAP_LOCK_SHARED: - /** - * There are three cases in which we hold the pmap lock exclusive: - * 1) Removal of a leaf-level page table during pmap_remove(), - * to prevent concurrent mapping into the to-be-deleted table. 
- * 2) Nesting/unnesting of a region of one pmap into another, to - * both concurrent nesting and concurrent mapping into the nested - * region. - * 3) Installing a new page table during pmap_expand(), to prevent - * another thread from concurrently expanding the same pmap at - * the same location. - * Of the above, the kernel pmap only participates in 3) (nesting - * and table removal are only done for user pmaps). Because the - * exclusive lock in case 3) above is only meant to synchronize - * against other instances of case 3), we can effectively elide - * shared holders of the kernel pmap because there is no case in - * which shared<>exclusive locking of the kernel pmap matters. - */ - if (pmap != kernel_pmap) { - lck_rw_lock_shared(&pmap->rwlock); - } + lck_rw_lock_shared(&pmap->rwlock); break; case PMAP_LOCK_EXCLUSIVE: lck_rw_lock_exclusive(&pmap->rwlock); @@ -285,6 +275,9 @@ pmap_lock(pmap_t pmap, pmap_lock_mode_t mode) static inline bool pmap_try_lock(pmap_t pmap, pmap_lock_mode_t mode) { + if (pmap == kernel_pmap) { + return true; + } bool ret = false; if (__improbable(sptm_stability_hacks)) { @@ -293,11 +286,7 @@ pmap_try_lock(pmap_t pmap, pmap_lock_mode_t mode) switch (mode) { case PMAP_LOCK_SHARED: - if (pmap != kernel_pmap) { - ret = lck_rw_try_lock_shared(&pmap->rwlock); - } else { - ret = true; - } + ret = lck_rw_try_lock_shared(&pmap->rwlock); break; case PMAP_LOCK_EXCLUSIVE: ret = lck_rw_try_lock_exclusive(&pmap->rwlock); @@ -338,15 +327,16 @@ pmap_lock_shared_to_exclusive(pmap_t pmap) static inline void pmap_unlock(pmap_t pmap, pmap_lock_mode_t mode) { + if (pmap == kernel_pmap) { + return; + } if (__improbable(sptm_stability_hacks)) { mode = PMAP_LOCK_EXCLUSIVE; } switch (mode) { case PMAP_LOCK_SHARED: - if (pmap != kernel_pmap) { - lck_rw_unlock_shared(&pmap->rwlock); - } + lck_rw_unlock_shared(&pmap->rwlock); break; case PMAP_LOCK_EXCLUSIVE: lck_rw_unlock_exclusive(&pmap->rwlock); diff --git a/osfmk/arm64/sptm/pmap/pmap_misc.c b/osfmk/arm64/sptm/pmap/pmap_misc.c index e264e494f..08a19f90e 100644 --- a/osfmk/arm64/sptm/pmap/pmap_misc.c +++ b/osfmk/arm64/sptm/pmap/pmap_misc.c @@ -44,10 +44,10 @@ pmap_abandon_measurement(void) { #if SCHED_HYGIENE_DEBUG struct _preemption_disable_pcpu *pcpu = PERCPU_GET(_preemption_disable_pcpu_data); - const boolean_t istate = ml_set_interrupts_enabled(FALSE); - if (pcpu->pdp_start.pds_mach_time != 0) { - pcpu->pdp_abandon = true; - } - ml_set_interrupts_enabled(istate); + const bool istate = ml_set_interrupts_enabled_with_debug(false, false); + + kern_timeout_override(&pcpu->pdp_timeout); + + ml_set_interrupts_enabled_with_debug(istate, false); #endif /* SCHED_HYGIENE_DEBUG */ } diff --git a/osfmk/arm64/sptm/pmap/pmap_ppl_interface.c b/osfmk/arm64/sptm/pmap/pmap_ppl_interface.c index 1167e0bdb..dc8b66bd7 100644 --- a/osfmk/arm64/sptm/pmap/pmap_ppl_interface.c +++ b/osfmk/arm64/sptm/pmap/pmap_ppl_interface.c @@ -34,7 +34,7 @@ * contains the ppl_handler_table, as well as a few PPL-only entry/exit helper * functions. * - * See doc/ppl.md for more information about how these PPL entry points work. + * See doc/arm/PPL.md for more information about how these PPL entry points work. */ #include @@ -50,7 +50,7 @@ * generates the code for the _ppl() variant which is what is used to jump into * the PPL. * - * See doc/ppl.md for more information about how these PPL entry points work. + * See doc/arm/PPL.md for more information about how these PPL entry points work. 
*/ PMAP_SUPPORT_PROTOTYPES( diff --git a/osfmk/arm64/sptm/pmap/pmap_pt_geometry.h b/osfmk/arm64/sptm/pmap/pmap_pt_geometry.h index 6e8a9a608..f28416654 100644 --- a/osfmk/arm64/sptm/pmap/pmap_pt_geometry.h +++ b/osfmk/arm64/sptm/pmap/pmap_pt_geometry.h @@ -85,7 +85,7 @@ struct page_table_ops { * differences between stage 1 and stage 2 page tables. This allows one set of * code to seamlessly handle the differences between various address space * layouts as well as stage 1 vs stage 2 page tables on the fly. See - * doc/arm_pmap.md for more details. + * doc/arm/arm_pmap.md for more details. * * Instead of accessing the fields in this structure directly, it is recommended * to use the page table attribute getter functions defined below. @@ -141,6 +141,12 @@ struct page_table_attr { * SPTM page table geometry index. */ const uint8_t geometry_id; + + /** + * Mask of significant address bits. This is the mask needed to address the + * virtual page number portion of the VA. + */ + const uint64_t pta_va_valid_mask; }; typedef struct page_table_attr pt_attr_t; @@ -485,6 +491,12 @@ pt_attr_leaf_level(const pt_attr_t * const pt_attr) return pt_attr_twig_level(pt_attr) + 1; } +/* Significant address bits in PTE */ +static inline uint64_t +pt_attr_va_valid_mask(const pt_attr_t * const pt_attr) +{ + return pt_attr->pta_va_valid_mask; +} /** * Return the index into a specific level of page table for a given virtual @@ -497,7 +509,8 @@ pt_attr_leaf_level(const pt_attr_t * const pt_attr) static inline unsigned int ttn_index(const pt_attr_t * const pt_attr, vm_map_address_t addr, unsigned int pt_level) { - const uint64_t index_unshifted = addr & pt_attr_ln_index_mask(pt_attr, pt_level); + const uint64_t addr_masked = addr & pt_attr_va_valid_mask(pt_attr); + const uint64_t index_unshifted = addr_masked & pt_attr_ln_index_mask(pt_attr, pt_level); return (unsigned int)(index_unshifted >> pt_attr_ln_shift(pt_attr, pt_level)); } diff --git a/osfmk/arm64/sptm/sptm.h b/osfmk/arm64/sptm/sptm.h index 6a7460050..ebcca7526 100644 --- a/osfmk/arm64/sptm/sptm.h +++ b/osfmk/arm64/sptm/sptm.h @@ -137,3 +137,18 @@ sptm_get_page_table_refcnt(sptm_paddr_t table_paddr) return refcnt; } + +/** + * Convenience function for determining whether a frame type allows userspace + * executable mapping permissions. + * + * @param frame_type the frame type to query + * + * @return True If [frame_type] allows userspace mappings with executable + * privileges, false otherwise. + */ +static inline bool +sptm_type_is_user_executable(sptm_frame_type_t frame_type) +{ + return (frame_type == XNU_USER_EXEC) || (frame_type == XNU_USER_DEBUG) || (frame_type == XNU_USER_JIT); +} diff --git a/osfmk/arm64/start.s b/osfmk/arm64/start.s index da9002e46..53350e013 100644 --- a/osfmk/arm64/start.s +++ b/osfmk/arm64/start.s @@ -130,7 +130,7 @@ LEXT(reset_vector) msr OSLAR_EL1, xzr msr DAIFSet, #(DAIFSC_ALL) // Disable all interrupts -#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) +#if !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) // Set low reset vector before attempting any loads adrp x0, EXT(LowExceptionVectorBase)@page add x0, x0, EXT(LowExceptionVectorBase)@pageoff @@ -279,7 +279,7 @@ LEXT(LowExceptionVectorBase) b . 
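A short sketch of what the new pta_va_valid_mask buys in ttn_index(), not part of the patch. The helper below is hypothetical; it only asserts that non-address bits in a VA (e.g. tag or sign-extension bits above the valid VA range) no longer perturb the computed table index.

/*
 * Hypothetical sketch -- not part of xnu-12377.1.9.
 *
 * Because ttn_index() now masks the VA with pt_attr_va_valid_mask() before
 * extracting the per-level index, a VA with all non-address bits set must
 * resolve to the same slot as its canonical form.
 */
static inline void
ttn_index_mask_check(const pt_attr_t * const pt_attr, vm_map_address_t addr)
{
        const unsigned int level = pt_attr_leaf_level(pt_attr);
        const vm_map_address_t decorated = addr | ~pt_attr_va_valid_mask(pt_attr);
        assert(ttn_index(pt_attr, decorated, level) == ttn_index(pt_attr, addr, level));
}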
.align 12, 0 -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* * Provide a global symbol so that we can narrow the V=P mapping to cover * this page during arm_vm_init. @@ -288,7 +288,7 @@ LEXT(LowExceptionVectorBase) .globl EXT(bootstrap_instructions) LEXT(bootstrap_instructions) -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ .align 2 .globl EXT(resume_idle_cpu) LEXT(resume_idle_cpu) @@ -305,13 +305,13 @@ LEXT(start_cpu) .align 2 start_cpu: -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) // This is done right away in reset vector for pre-KTRR devices // Set low reset vector now that we are in the KTRR-free zone adrp x0, EXT(LowExceptionVectorBase)@page add x0, x0, EXT(LowExceptionVectorBase)@pageoff MSR_VBAR_EL1_X0 -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ // x20 set to BootArgs phys address // x21 set to cpu data phys address @@ -337,7 +337,7 @@ start_cpu: // Set SP_EL1 to exception stack -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) mov x1, lr bl EXT(pinst_spsel_1) mov lr, x1 @@ -370,7 +370,7 @@ start_cpu: * arg5 - Scratch register */ .macro create_l1_table_entry - and $3, $0, #(ARM_TT_L1_INDEX_MASK) + and $3, $0, #(ARM_PTE_T1_REGION_MASK(TCR_EL1_BOOT)) lsr $3, $3, #(ARM_TT_L1_SHIFT) // Get index in L1 table for L2 table lsl $3, $3, #(TTE_SHIFT) // Convert index into pointer offset add $3, $1, $3 // Get L1 entry pointer @@ -500,7 +500,7 @@ LEXT(start_first_cpu) sub x0, x0, x23 // Set SP_EL1 to exception stack -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) bl EXT(pinst_spsel_1) #else msr SPSel, #1 @@ -651,16 +651,15 @@ common_start: #endif // Set the translation control register. - adrp x0, EXT(sysreg_restore)@page // Load TCR value from the system register restore structure - add x0, x0, EXT(sysreg_restore)@pageoff - ldr x1, [x0, SR_RESTORE_TCR_EL1] + MOV64 x1, TCR_EL1_BOOT MSR_TCR_EL1_X1 + /* Set up translation table base registers. * TTBR0 - V=P table @ top of kernel * TTBR1 - KVA table @ top of kernel + 1 page */ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* Note that for KTRR configurations, the V=P map will be modified by * arm_vm_init.c. 
*/ diff --git a/osfmk/arm64/strncmp.s b/osfmk/arm64/strncmp.s index 3a1a7a74b..f92010135 100644 --- a/osfmk/arm64/strncmp.s +++ b/osfmk/arm64/strncmp.s @@ -50,7 +50,7 @@ .macro ClearFrameAndReturn ldp fp, lr, [sp], #16 - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG _strncmp .endm #include "../mach/arm/vm_param.h" diff --git a/osfmk/arm64/strnlen.s b/osfmk/arm64/strnlen.s index 7da2b320c..00b9f60e2 100644 --- a/osfmk/arm64/strnlen.s +++ b/osfmk/arm64/strnlen.s @@ -48,9 +48,9 @@ mov fp, sp .endm -.macro ClearFrameAndReturn +.macro ClearFrameAndReturn label:req ldp fp, lr, [sp], #16 - ARM64_STACK_EPILOG + ARM64_STACK_EPILOG \label .endm /***************************************************************************** @@ -76,6 +76,7 @@ _strnlen: tst x1, x1 b.mi _strlen b.eq L_maxlenIsZero +L_strnlen_frame: EstablishFrame // Load the 16-byte aligned vector containing the start of the string. and x2, x0, #-16 @@ -117,7 +118,7 @@ _strnlen: sub x0, x2, x0 add x1, x1, #16 add x0, x0, x1 - ClearFrameAndReturn + ClearFrameAndReturn L_strnlen_frame L_maxlenIsZero: mov x0, #0 @@ -136,7 +137,7 @@ L_foundNUL: cmp x1, x3 // if NUL occurs before maxlen bytes csel x1, x1, x3, cc // return strlen, else maxlen add x0, x0, x1 - ClearFrameAndReturn + ClearFrameAndReturn L_strnlen_frame /***************************************************************************** * strlen entrypoint * @@ -199,4 +200,4 @@ _strlen: fmov w2, s1 sub x0, x1, x0 add x0, x0, x2 - ClearFrameAndReturn + ClearFrameAndReturn _strlen diff --git a/osfmk/conf/Makefile.template b/osfmk/conf/Makefile.template index c26210967..949bd1a45 100644 --- a/osfmk/conf/Makefile.template +++ b/osfmk/conf/Makefile.template @@ -21,7 +21,7 @@ SFLAGS+= -include meta_features.h ifeq ($(KSANCOV),1) # Don't instrument functions called by the ksancov runtime. SanitizeCoverage does -# not support blacklists, so exclude the whole file. +# not support denylists, so exclude the whole file. machine_routines.o_CFLAGS_RM = $(KCOV_CFLAGS) machine_routines_common.o_CFLAGS_RM = $(KCOV_CFLAGS) pcb_native.o_CFLAGS_RM = $(KCOV_CFLAGS) @@ -72,7 +72,8 @@ COMP_SUBDIRS = \ %MACHDEP -vm_tests.o_CFLAGS_ADD += -O0 -g +# Ease debuggability of VM test code as much as possible. 
+vm_tests.o_CFLAGS_ADD += -g # # vm_sanitize/vm_sanitize_error_compat use the UBSan verifier to detect @@ -475,6 +476,10 @@ $(COMPONENT).filelist: $(OBJS) $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist +ifeq ($(XNU_LibAllFiles),1) +LIBOBJS := $(OBJS) +endif + $(COMPONENT).libfilelist: $(LIBOBJS) @$(LOG_LDFILELIST) "lib$(COMPONENT)" $(_v)for obj in ${LIBOBJS}; do \ diff --git a/osfmk/conf/files b/osfmk/conf/files index 000c62782..eb2fbad57 100644 --- a/osfmk/conf/files +++ b/osfmk/conf/files @@ -31,7 +31,6 @@ OPTIONS/hibernation optional hibernation OPTIONS/crypto optional crypto OPTIONS/kdebug optional kdebug OPTIONS/mach_assert optional mach_assert -OPTIONS/mach_flipc optional mach_flipc OPTIONS/mach_kdp optional mach_kdp OPTIONS/config_serial_kdp optional config_serial_kdp OPTIONS/mach_ldebug optional mach_ldebug @@ -104,7 +103,6 @@ osfmk/ipc/ipc_right.c standard osfmk/ipc/ipc_space.c standard osfmk/ipc/ipc_service_port.c standard osfmk/ipc/ipc_voucher.c standard -osfmk/ipc/flipc.c optional mach_flipc osfmk/ipc/mach_debug.c standard osfmk/ipc/mach_kernelrpc.c standard osfmk/ipc/mach_msg.c standard @@ -129,6 +127,7 @@ osfmk/kern/ecc_logging.c optional config_ecc_logging osfmk/kern/energy_perf.c standard osfmk/kern/exception.c standard osfmk/kern/exclaves.c standard +osfmk/kern/exclaves_aoe.c standard osfmk/kern/exclaves_boot.c standard osfmk/kern/exclaves_conclave.c standard osfmk/kern/exclaves_driverkit.c standard @@ -164,7 +163,6 @@ osfmk/kern/lock_ptr.c standard osfmk/kern/lock_rw.c standard osfmk/kern/lock_ticket.c standard osfmk/kern/locks.c standard -osfmk/kern/mach_node.c standard osfmk/kern/machine.c standard osfmk/kern/mk_sp.c standard osfmk/kern/mk_timer.c standard @@ -181,9 +179,11 @@ osfmk/kern/sched_average.c standard osfmk/kern/sched_amp.c standard osfmk/kern/sched_amp_common.c standard #endif -osfmk/kern/sched_dualq.c standard osfmk/kern/sched_clutch.c optional config_clutch +osfmk/kern/sched_common.c standard +osfmk/kern/sched_dualq.c standard osfmk/kern/sched_prim.c standard +osfmk/kern/sched_rt.c standard osfmk/kern/sfi.c standard osfmk/kern/smr.c standard osfmk/kern/stack.c standard @@ -207,6 +207,7 @@ osfmk/kern/thread_group.c standard osfmk/kern/thread_policy.c standard osfmk/kern/thread_test_context.c optional development osfmk/kern/thread_test_context.c optional debug +osfmk/kern/timeout.c standard osfmk/kern/timer.c standard osfmk/kern/timer_call.c standard osfmk/kern/turnstile.c standard diff --git a/osfmk/conf/files.arm64 b/osfmk/conf/files.arm64 index 4806f2053..a28935dcd 100644 --- a/osfmk/conf/files.arm64 +++ b/osfmk/conf/files.arm64 @@ -35,8 +35,8 @@ osfmk/arm64/lz4_decode_arm64.s standard osfmk/arm64/lz4_encode_arm64.s standard osfmk/arm64/amcc_rorgn_ppl.c optional nos_arm_asm config_pmap_ppl osfmk/arm64/amcc_rorgn_ppl_amcc.c optional nos_arm_asm config_pmap_ppl -osfmk/arm64/amcc_rorgn_ppl_ctrr3.c optional nos_arm_asm config_pmap_ppl osfmk/arm64/amcc_rorgn_common.c optional nos_arm_asm +osfmk/arm64/amcc_rorgn_pv_ctrr.c optional nos_arm_asm osfmk/arm64/bcopy.s standard osfmk/arm64/bzero.s standard osfmk/arm/caches.c standard @@ -117,6 +117,4 @@ osfmk/arm64/sme.c standard osfmk/arm64/static_if.c standard osfmk/arm64/corecrypto/sha256_compress_arm64.s standard -osfmk/arm64/bti_telemetry.c optional config_bti_telemetry - osfmk/arm/counter.c standard diff --git a/osfmk/console/serial_protos.h b/osfmk/console/serial_protos.h index bfc2190eb..b4b6b74b4 100644 --- a/osfmk/console/serial_protos.h +++ b/osfmk/console/serial_protos.h 
@@ -40,6 +40,7 @@ extern "C" { #endif #include +#include void serial_keyboard_init(void); void serial_keyboard_start(void) __dead2; @@ -76,7 +77,7 @@ extern uint32_t serialmode; #endif extern uint32_t cons_ops_index; -extern const uint32_t nconsops; +extern __security_const_early uint32_t nconsops; /* disable_serial_output disables kprintf() *and* unbuffered panic output. */ extern bool disable_serial_output; diff --git a/osfmk/corpses/corpse.c b/osfmk/corpses/corpse.c index 7f9c12b10..3df73a438 100644 --- a/osfmk/corpses/corpse.c +++ b/osfmk/corpses/corpse.c @@ -138,6 +138,7 @@ #include #include #include +#include #if CONFIG_MACF #include diff --git a/osfmk/device/device.defs b/osfmk/device/device.defs index b03ddbc87..16088726d 100644 --- a/osfmk/device/device.defs +++ b/osfmk/device/device.defs @@ -229,14 +229,17 @@ routine io_registry_entry_get_parent_iterator( out iterator : io_object_t ); -skip; -/* was routine io_service_open - service : io_object_t; - in owningTask : task_t; - in connect_type : uint32_t; - out connection : io_connect_t +/* >was< routine io_service_open */ +routine io_connect_map_shared_memory( + connection : io_connect_t; + in memory_type : uint32_t; + in into_task : task_t; + inout address : mach_vm_address_t; + inout size : mach_vm_size_t; + in flags : uint32_t; + in property_name : io_name_t; + out inband_output : io_struct_inband_t, CountInOut ); -*/ routine io_service_close( connection : io_connect_t diff --git a/osfmk/device/device_init.c b/osfmk/device/device_init.c index 0630db7ff..a9bfa04c0 100644 --- a/osfmk/device/device_init.c +++ b/osfmk/device/device_init.c @@ -62,21 +62,7 @@ * Initialize device service as part of kernel task. */ -#include -#include - -#include -#include -#include - -#include #include -#include -#include -#include -#include - -#include #include static SECURITY_READ_ONLY_LATE(void *) main_device_kobject; diff --git a/osfmk/device/device_types.h b/osfmk/device/device_types.h index 48e8bef2a..d6be1d1a6 100644 --- a/osfmk/device/device_types.h +++ b/osfmk/device/device_types.h @@ -72,6 +72,7 @@ #include #ifdef MACH_KERNEL_PRIVATE #include +#include #endif #include @@ -129,7 +130,7 @@ typedef io_object_t io_connect_t; typedef io_object_t io_ident_t; typedef io_object_t uext_object_t; -extern void iokit_add_reference( io_object_t obj, natural_t type ); +extern void iokit_add_reference( io_object_t obj ); extern void iokit_remove_reference( io_object_t obj ); extern void iokit_remove_connect_reference( io_object_t obj ); extern void iokit_port_object_description(io_object_t obj, kobject_description_t desc); @@ -144,7 +145,7 @@ extern ipc_port_t iokit_make_connect_port( io_connect_t obj ); extern ipc_port_t iokit_make_ident_port( io_ident_t obj ); extern void iokit_kobject_retain(io_kobject_t machPort); -extern io_object_t iokit_copy_object_for_consumed_kobject(io_kobject_t machPort, natural_t type); +extern io_object_t iokit_copy_object_for_consumed_kobject(io_kobject_t machPort); #else diff --git a/osfmk/device/iokit_rpc.c b/osfmk/device/iokit_rpc.c index 7e9403143..7b26a45ff 100644 --- a/osfmk/device/iokit_rpc.c +++ b/osfmk/device/iokit_rpc.c @@ -31,12 +31,10 @@ #include #include #include -//#include #include #include /* spl definitions */ -#include #include #include @@ -69,29 +67,30 @@ #define EXTERN #define MIGEXTERN -static void -iokit_no_senders( ipc_port_t port, mach_port_mscount_t mscount ); - /* * Lifetime: * - non lazy port with no-more senders - * - can be destroyed by iokit_destroy_object_port - * + * - the object is not stable 
because of IOUserClient::destroyUserReferences(), + * which can kill the port even when there are outstanding send rights. */ IPC_KOBJECT_DEFINE(IKOT_IOKIT_IDENT, - .iko_op_no_senders = iokit_no_senders); + .iko_op_movable_send = true, + .iko_op_no_senders = iokit_ident_no_senders); IPC_KOBJECT_DEFINE(IKOT_IOKIT_OBJECT, - .iko_op_no_senders = iokit_no_senders); + .iko_op_movable_send = true, + .iko_op_no_senders = iokit_object_no_senders); IPC_KOBJECT_DEFINE(IKOT_IOKIT_CONNECT, - .iko_op_no_senders = iokit_no_senders); + .iko_op_no_senders = iokit_connect_no_senders); IPC_KOBJECT_DEFINE(IKOT_UEXT_OBJECT, - .iko_op_no_senders = iokit_no_senders); + .iko_op_movable_send = true, + .iko_op_no_senders = iokit_uext_no_senders, + .iko_op_label_free = ipc_kobject_label_free); /* * Lookup a device by its port. * Doesn't consume the naked send right; produces a device reference. */ -io_object_t +EXTERN io_object_t iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t type) { io_object_t obj = NULL; @@ -102,44 +101,38 @@ iokit_lookup_io_object(ipc_port_t port, ipc_kobject_type_t type) } ip_mq_lock(port); - if (ip_active(port)) { - kobj = ipc_kobject_get_locked(port, type); - if (kobj) { - iokit_kobject_retain(kobj); - } + kobj = ipc_kobject_get_locked(port, type); + if (kobj) { + iokit_kobject_retain(kobj); } ip_mq_unlock(port); if (kobj) { - obj = iokit_copy_object_for_consumed_kobject(kobj, type); + obj = iokit_copy_object_for_consumed_kobject(kobj); } return obj; } MIGEXTERN io_object_t -iokit_lookup_object_port( - ipc_port_t port) +iokit_lookup_object_port(ipc_port_t port) { return iokit_lookup_io_object(port, IKOT_IOKIT_OBJECT); } MIGEXTERN io_object_t -iokit_lookup_connect_port( - ipc_port_t port) +iokit_lookup_connect_port(ipc_port_t port) { return iokit_lookup_io_object(port, IKOT_IOKIT_CONNECT); } MIGEXTERN io_object_t -iokit_lookup_ident_port( - ipc_port_t port) +iokit_lookup_ident_port(ipc_port_t port) { return iokit_lookup_io_object(port, IKOT_IOKIT_IDENT); } MIGEXTERN io_object_t -iokit_lookup_uext_object_port( - ipc_port_t port) +iokit_lookup_uext_object_port(ipc_port_t port) { return iokit_lookup_io_object(port, IKOT_UEXT_OBJECT); } @@ -165,7 +158,7 @@ iokit_lookup_object_in_space_with_port_name(mach_port_name_t name, ipc_kobject_t } ip_mq_unlock(port); if (kobj) { - obj = iokit_copy_object_for_consumed_kobject(kobj, type); + obj = iokit_copy_object_for_consumed_kobject(kobj); } } } @@ -215,14 +208,14 @@ iokit_lookup_raw_current_task(mach_port_name_t name, ipc_kobject_type_t type, ip */ if (MACH_PORT_VALID(name)) { kr = ipc_typed_port_copyin_send(current_space(), name, - IKOT_UNKNOWN, &port); + IOT_ANY, &port); if (kr != KERN_SUCCESS || !IP_VALID(port)) { return kIOReturnNotFound; } - if (type != IKOT_UNKNOWN && ip_kotype(port) != type) { - ipc_typed_port_release_send(port, IKOT_UNKNOWN); + if (type != IOT_ANY && ip_type(port) != type) { + ipc_typed_port_release_send(port, IOT_ANY); return kIOReturnBadArgument; } @@ -233,18 +226,6 @@ iokit_lookup_raw_current_task(mach_port_name_t name, ipc_kobject_type_t type, ip return kIOReturnNotFound; } -EXTERN void -iokit_retain_port( ipc_port_t port ) -{ - ipc_port_reference( port ); -} - -EXTERN void -iokit_release_port( ipc_port_t port ) -{ - ipc_port_release( port ); -} - EXTERN void iokit_release_port_send( ipc_port_t port ) { @@ -259,44 +240,30 @@ iokit_release_port_send( ipc_port_t port ) static ipc_port_t iokit_make_port_of_type(io_object_t obj, ipc_kobject_type_t type) { - ipc_port_t port; - ipc_port_t sendPort; - ipc_kobject_t kobj; 
+ ipc_port_t sendPort = IP_NULL; - if (obj == NULL) { - return IP_NULL; + if (obj != NULL) { + sendPort = iokit_port_make_send_for_object(obj, type); + iokit_remove_reference( obj ); } - port = iokit_port_for_object(obj, type, &kobj); - if (port) { - sendPort = ipc_kobject_make_send( port, kobj, type ); - iokit_release_port( port ); - } else { - sendPort = IP_NULL; - } - - iokit_remove_reference( obj ); - return sendPort; } MIGEXTERN ipc_port_t -iokit_make_object_port( - io_object_t obj ) +iokit_make_object_port( io_object_t obj ) { return iokit_make_port_of_type(obj, IKOT_IOKIT_OBJECT); } MIGEXTERN ipc_port_t -iokit_make_connect_port( - io_object_t obj ) +iokit_make_connect_port( io_object_t obj ) { return iokit_make_port_of_type(obj, IKOT_IOKIT_CONNECT); } MIGEXTERN ipc_port_t -iokit_make_ident_port( - io_object_t obj ) +iokit_make_ident_port( io_object_t obj ) { return iokit_make_port_of_type(obj, IKOT_IOKIT_IDENT); } @@ -305,56 +272,51 @@ EXTERN ipc_port_t iokit_alloc_object_port( io_kobject_t obj, ipc_kobject_type_t type ) { /* Allocate port, keeping a reference for it. */ - ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_NSREQUEST; - if (type == IKOT_IOKIT_CONNECT) { - options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; - } + ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_NONE; + ipc_object_label_t label = IPC_OBJECT_LABEL(type); + if (type == IKOT_UEXT_OBJECT) { - ipc_label_t label = IPC_LABEL_DEXT; - return ipc_kobject_alloc_labeled_port((ipc_kobject_t) obj, type, label, options); - } else { - return ipc_kobject_alloc_port((ipc_kobject_t) obj, type, options); + label = ipc_kobject_label_alloc(IKOT_UEXT_OBJECT, + IPC_LABEL_DEXT, IP_NULL); } + return ipc_kobject_alloc_port(obj, label, options); } EXTERN void -iokit_remove_object_port( ipc_port_t port, ipc_kobject_type_t type ) +iokit_lock_port( ipc_port_t port ) { - ipc_kobject_disable(port, type); + ip_mq_lock(port); } -EXTERN kern_return_t +EXTERN void +iokit_unlock_port( ipc_port_t port ) +{ + ip_mq_unlock(port); +} + +EXTERN void iokit_destroy_object_port( ipc_port_t port, ipc_kobject_type_t type ) { - ipc_kobject_dealloc_port(port, 0, type); - return KERN_SUCCESS; + ipc_kobject_dealloc_port(port, IPC_KOBJECT_NO_MSCOUNT, type); } EXTERN ipc_kobject_type_t iokit_port_type(ipc_port_t port) { - return ip_kotype(port); + return ip_type(port); } EXTERN mach_port_name_t iokit_make_send_right( task_t task, io_object_t obj, ipc_kobject_type_t type ) { - ipc_port_t port; ipc_port_t sendPort; mach_port_name_t name = 0; - ipc_kobject_t kobj; if (obj == NULL) { return MACH_PORT_NULL; } - port = iokit_port_for_object( obj, type, &kobj ); - if (port) { - sendPort = ipc_kobject_make_send( port, kobj, type ); - iokit_release_port( port ); - } else { - sendPort = IP_NULL; - } + sendPort = iokit_port_make_send_for_object( obj, type ); if (IP_VALID( sendPort )) { kern_return_t kr; @@ -383,60 +345,6 @@ iokit_mod_send_right( task_t task, mach_port_name_t name, mach_port_delta_t delt return mach_port_mod_refs( task->itk_space, name, MACH_PORT_RIGHT_SEND, delta ); } -/* - * Handle the No-More_Senders notification generated from a device port destroy. - * Since there are no longer any tasks which hold a send right to this device - * port a NMS notification has been generated. - */ - -static void -iokit_no_senders( ipc_port_t port, mach_port_mscount_t mscount ) -{ - io_object_t obj = NULL; - io_kobject_t kobj = NULL; - ipc_kobject_type_t type = IKOT_NONE; - - // convert a port to io_object_t. 
- if (IP_VALID(port)) { - ip_mq_lock(port); - if (ip_active(port)) { - type = ip_kotype( port ); - assert((IKOT_IOKIT_OBJECT == type) - || (IKOT_IOKIT_CONNECT == type) - || (IKOT_IOKIT_IDENT == type) - || (IKOT_UEXT_OBJECT == type)); - kobj = ipc_kobject_get_locked(port, type); - if (kobj) { - iokit_kobject_retain(kobj); - } - } - ip_mq_unlock(port); - if (kobj) { - // IKOT_IOKIT_OBJECT since iokit_remove_reference() follows - obj = iokit_copy_object_for_consumed_kobject(kobj, IKOT_IOKIT_OBJECT); - } - } - - if (obj) { - while (iokit_client_died( obj, port, type, &mscount ) != KERN_SUCCESS) { - kern_return_t kr; - - /* Re-request no-senders notifications on the port (if still active) */ - kr = ipc_kobject_nsrequest(port, mscount + 1, &mscount); - if (kr != KERN_FAILURE) { - break; - } - /* - * port has no outstanding rights or pending make-sends, - * and the notification would fire recursively, try again. - */ - } - - iokit_remove_reference( obj ); - } -} - - kern_return_t iokit_label_dext_task(task_t task) { @@ -463,7 +371,7 @@ iokit_clear_registered_ports( if (!IP_VALID(port)) { continue; } - type = ip_kotype( port ); + type = ip_type( port ); if ((IKOT_IOKIT_OBJECT == type) || (IKOT_IOKIT_CONNECT == type) || (IKOT_IOKIT_IDENT == type) diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index ec709b31f..ab30d3ef1 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -1656,6 +1656,12 @@ print_tasks_user_threads(task_t task) } } +void +print_curr_backtrace(void) +{ + /* Not implemented for i386 */ +} + void print_thread_num_that_crashed(task_t task) { diff --git a/osfmk/i386/commpage/commpage.c b/osfmk/i386/commpage/commpage.c index 2e80670c3..082a622ea 100644 --- a/osfmk/i386/commpage/commpage.c +++ b/osfmk/i386/commpage/commpage.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -652,6 +653,9 @@ commpage_populate( void ) asb_kaddr %= (kernel_max - kernel_min); asb_kaddr += kernel_min; commpage_update(_COMM_PAGE_ASB_TARGET_KERN_ADDRESS, &asb_kaddr, sizeof(asb_kaddr)); + + vm_map_seal(commpage32_map, true /* nested_pmap */); + vm_map_seal(commpage64_map, true /* nested_pmap */); } /* Fill in the common routines during kernel initialization. @@ -705,6 +709,9 @@ commpage_text_populate( void ) if (next > _COMM_PAGE_TEXT_END) { panic("commpage text overflow: next=0x%08x, commPagePtr=%p", next, commPagePtr); } + + vm_map_seal(commpage_text32_map, true /* nested_pmap */); + vm_map_seal(commpage_text64_map, true /* nested_pmap */); } /* Update commpage nanotime information. 
diff --git a/osfmk/i386/cpu.c b/osfmk/i386/cpu.c index 70dc7d59d..a3e951604 100644 --- a/osfmk/i386/cpu.c +++ b/osfmk/i386/cpu.c @@ -283,3 +283,9 @@ processor_to_datastring(const char *prefix, processor_t target_processor) return (const char *)&printBuf[0]; } + +void +abandon_preemption_disable_measurement(void) +{ + /* stub for libpthread */ +} diff --git a/osfmk/i386/cpu_topology.c b/osfmk/i386/cpu_topology.c index 9379226e6..a4d2a194c 100644 --- a/osfmk/i386/cpu_topology.c +++ b/osfmk/i386/cpu_topology.c @@ -192,7 +192,7 @@ cpu_topology_sort(int ncpus) aset->pset = processor_pset(master_processor); } else { pset_cluster_id++; - aset->pset = pset_create(pset_node_root(), PSET_SMP, pset_cluster_id, pset_cluster_id); + aset->pset = pset_create(CLUSTER_TYPE_SMP, pset_cluster_id, pset_cluster_id); if (aset->pset == PROCESSOR_SET_NULL) { panic("cpu_topology_start: pset_create"); } diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c index e3a4e6886..ad7eecb04 100644 --- a/osfmk/i386/i386_vm_init.c +++ b/osfmk/i386/i386_vm_init.c @@ -825,8 +825,8 @@ i386_vm_init(uint64_t maxmem, vm_lopage_lowater = vm_lopage_free_limit / 16; } - vm_lopage_refill = TRUE; - vm_lopage_needed = TRUE; + vm_lopage_refill = true; + vm_lopage_needed = true; } } diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index 4519787ea..0e5918bbd 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -1437,3 +1437,10 @@ void ml_task_post_signature_processing_hook(__unused task_t task) { } + +bool +ml_unsafe_kernel_text(void) +{ + /* No text lockdown on x86. */ + return true; +} diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index 07cb3f016..1888ce56b 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -532,7 +532,6 @@ machine_thread_on_core_allow_invalid(thread_t thread) * from anything but a thread, zeroed or freed memory. 
*/ assert(get_preemption_level() > 0); - thread = pgz_decode_allow_invalid(thread, ZONE_ID_THREAD); if (thread == THREAD_NULL) { return false; } diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index 4a4a62e6d..34a01c044 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -858,6 +858,14 @@ machine_switch_perfcontrol_state_update(__unused perfcontrol_event event, { } +void +machine_perfcontrol_running_timer_expire(__unused uint64_t now, + __unused uint32_t flags, + __unused int cpu_id, + __unused uint64_t *timeout_ticks) +{ +} + void active_rt_threads(boolean_t active) { diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c index 43a6d053d..6f8a9b37e 100644 --- a/osfmk/i386/pmap_x86_common.c +++ b/osfmk/i386/pmap_x86_common.c @@ -81,6 +81,23 @@ pmap_commpage_size_min(__unused pmap_t pmap) return NBPDE; } +void +pmap_set_shared_region( + pmap_t grand __unused, + pmap_t subord __unused, + addr64_t vstart __unused, + uint64_t size __unused) +{ +} + +kern_return_t +pmap_fork_nest( + pmap_t old_pmap __unused, + pmap_t new_pmap __unused) +{ + return KERN_SUCCESS; +} + /* * kern_return_t pmap_nest(grand, subord, va_start, size) * diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index e3bfc8c1c..4a229676c 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -614,7 +614,7 @@ handle_kernel_breakpoint( if (out_comment) { *out_comment = comment; } - desc = find_brk_descriptor_by_comment(comment); + desc = find_kernel_brk_descriptor_by_comment(comment); if (!desc) { return false; diff --git a/osfmk/ipc/flipc.c b/osfmk/ipc/flipc.c deleted file mode 100644 index 3a2617486..000000000 --- a/osfmk/ipc/flipc.c +++ /dev/null @@ -1,640 +0,0 @@ -/* - * Copyright (c) 2015-2020 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* File: ipc/flipc.h - * Author: Dean Reece - * Date: 2016 - * - * Implementation of fast local ipc (flipc). 
- */ - - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#pragma pack(4) - - -/*** FLIPC Internal Implementation (private to flipc.c) ***/ - -ZONE_DEFINE_TYPE(flipc_port_zone, "flipc ports", - struct flipc_port, ZC_ZFREE_CLEARMEM); - -/* Get the mnl_name associated with local ipc_port . - * Returns MNL_NAME_NULL if is invalid or not a flipc port. - */ -static inline mnl_name_t -mnl_name_from_port(ipc_port_t lport) -{ - mnl_name_t name = MNL_NAME_NULL; - - if (IP_VALID(lport)) { - flipc_port_t fport = lport->ip_messages.imq_fport; - if (FPORT_VALID(fport)) { - name = fport->obj.name; - } - } - return name; -} - - -/* Lookup the ipc_port associated with mnl_name . - * Returns IP_NULL if is invalid or not a known mnl object. - */ -static inline ipc_port_t -mnl_name_to_port(mnl_name_t name) -{ - ipc_port_t lport = IP_NULL; - - if (MNL_NAME_VALID(name)) { - flipc_port_t fport = (flipc_port_t)mnl_obj_lookup(name); - if (FPORT_VALID(fport)) { - lport = fport->lport; - } - } - return lport; -} - - -/* flipc_port_create() is called to convert a regular mach port into a - * flipc port (i.e., the port has one or more rights off-node). - * must be locked on entry and is not unlocked on return. - */ -static kern_return_t -flipc_port_create(ipc_port_t lport, mach_node_t node, mnl_name_t name) -{ - /* Ensure parameters are valid and not already linked */ - assert(IP_VALID(lport)); - assert(MACH_NODE_VALID(node)); - assert(MNL_NAME_VALID(name)); - assert(!FPORT_VALID(lport->ip_messages.imq_fport)); - - /* Allocate and initialize a flipc port */ - flipc_port_t fport = zalloc_flags(flipc_port_zone, Z_WAITOK | Z_ZERO); - if (!FPORT_VALID(fport)) { - return KERN_RESOURCE_SHORTAGE; - } - fport->obj.name = name; - fport->hostnode = node; - if (node == localnode) { - fport->state = FPORT_STATE_PRINCIPAL; - } else { - fport->state = FPORT_STATE_PROXY; - } - - /* Link co-structures (lport is locked) */ - fport->lport = lport; - lport->ip_messages.imq_fport = fport; - - /* Add fport to the name hash table; revert link if insert fails */ - kern_return_t kr = mnl_obj_insert((mnl_obj_t)fport); - if (kr != KERN_SUCCESS) { - lport->ip_messages.imq_fport = FPORT_NULL; - fport->lport = IP_NULL; - zfree(flipc_port_zone, fport); - } - - return kr; -} - - -/* flipc_port_destroy() is called to convert a flipc port back to a - * local-only ipc port (i.e., the port has no remaining off-node rights). - * This will dispose of any undelivered flipc messages, generating NAKs if - * needed. must be locked on entry and is not unlocked on return. 
- */ -static void -flipc_port_destroy(ipc_port_t lport) -{ - /* Ensure parameter is valid, and linked to an fport with a valid name */ - assert(IP_VALID(lport)); - ipc_mqueue_t port_mq = &lport->ip_messages; - flipc_port_t fport = port_mq->imq_fport; - assert(FPORT_VALID(fport)); - assert(MNL_NAME_VALID(fport->obj.name)); - - /* Dispose of any undelivered messages */ - int m = port_mq->imq_msgcount; - if (m > 0) { - ipc_kmsg_t kmsg; -#if DEBUG - printf("flipc: destroying %p with %d undelivered msgs\n", lport, m); -#endif - - /* Logic was lifted from ipc_mqueue_select_on_thread() */ - while (m--) { - kmsg = ipc_kmsg_queue_first(&port_mq->imq_messages); - assert(kmsg != IKM_NULL); - ipc_kmsg_rmqueue(&port_mq->imq_messages, kmsg); - if (fport->state == FPORT_STATE_PRINCIPAL) { - flipc_msg_ack(kmsg->ikm_node, port_mq, FALSE); - } - ipc_mqueue_release_msgcount(port_mq); - port_mq->imq_seqno++; - } - } - - /* Remove from name hash table, unlink co-structures, and free fport */ - mnl_obj_remove(fport->obj.name); - lport->ip_messages.imq_fport = FPORT_NULL; - fport->lport = IP_NULL; - zfree(flipc_port_zone, fport); -} - - -/* - * Routine: flipc_msg_size_from_kmsg(ipc_kmsg_t kmsg) - * Purpose: - * Compute the size of the buffer needed to hold the translated flipc - * message. All identifiers are converted to flipc_names which are 64b. - * If this node's pointers are a different size, we have to allow for - * expansion of the descriptors as appropriate. - * Conditions: - * Nothing locked. - * Returns: - * size of the message as it would be sent over the flipc link. - */ -static mach_msg_size_t -flipc_msg_size_from_kmsg(ipc_kmsg_t kmsg) -{ - mach_msg_size_t fsize = ikm_header(kmsg)->msgh_size; - - if (ikm_header(kmsg)->msgh_bits & MACH_MSGH_BITS_COMPLEX) { - PE_enter_debugger("flipc_msg_size_from_kmsg(): Complex messages not supported."); - } - - return fsize; -} - - -/* Translate a kmsg into a flipc msg suitable to transmit over the mach node - * link. All in-line rights and objects are similarly processed. If the msg - * moves a receive right, then queued messages may need to be moved as a - * result, causing this function to ultimately be recursive. 
- */ -static kern_return_t -mnl_msg_from_kmsg(ipc_kmsg_t kmsg, mnl_msg_t *fmsgp) -{ - if (ikm_header(kmsg)->msgh_bits & MACH_MSGH_BITS_COMPLEX) { - printf("mnl_msg_from_kmsg(): Complex messages not supported."); - return KERN_FAILURE; - } - - mach_msg_size_t fsize = flipc_msg_size_from_kmsg(kmsg); - - mnl_msg_t fmsg = mnl_msg_alloc(fsize, 0); - - if (fmsg == MNL_MSG_NULL) { - return KERN_RESOURCE_SHORTAGE; - } - - /* Setup flipc message header */ - fmsg->sub = MACH_NODE_SUB_FLIPC; - fmsg->cmd = FLIPC_CMD_IPCMESSAGE; - fmsg->node_id = localnode_id; // Message is from us - fmsg->qos = 0; // not used - fmsg->size = fsize; // Payload size (does NOT include mnl_msg header) - fmsg->object = ikm_header(kmsg)->msgh_remote_port->ip_messages.imq_fport->obj.name; - - /* Copy body of message */ - bcopy((const void*)ikm_header(kmsg), (void*)MNL_MSG_PAYLOAD(fmsg), fsize); - - // Convert port fields - mach_msg_header_t *mmsg = (mach_msg_header_t*)MNL_MSG_PAYLOAD(fmsg); - mmsg->msgh_remote_port = (mach_port_t)fmsg->object; - mmsg->msgh_local_port = (mach_port_t) - mnl_name_from_port(mmsg->msgh_local_port); - mmsg->msgh_voucher_port = (mach_port_name_t)MNL_NAME_NULL; - - *fmsgp = (mnl_msg_t)fmsg; - - return KERN_SUCCESS; -} - - -/* lifted from ipc_mig.c:mach_msg_send_from_kernel_proper() */ -static mach_msg_return_t -mach_msg_send_from_remote_kernel(mach_msg_header_t *msg, - mach_msg_size_t send_size, - mach_node_t node) -{ - ipc_kmsg_t kmsg; - mach_msg_return_t mr; - - mr = ipc_kmsg_get_from_kernel(msg, send_size, &kmsg); - if (mr != MACH_MSG_SUCCESS) { - return mr; - } - - mr = ipc_kmsg_copyin_from_kernel(kmsg); - if (mr != MACH_MSG_SUCCESS) { - ipc_kmsg_free(kmsg); - return mr; - } - - kmsg->ikm_node = node; // node that needs to receive message ack - mr = ipc_kmsg_send(kmsg, - MACH_SEND_KERNEL_DEFAULT, - MACH_MSG_TIMEOUT_NONE); - if (mr != MACH_MSG_SUCCESS) { - ipc_kmsg_destroy(kmsg, IPC_KMSG_DESTROY_ALL); - } - - return mr; -} - - -/* Translate a flipc msg into a kmsg and post it to the appropriate - * port. is the node that originated the message, not necessarily the - * node we received it from. This will block if the receiving port is full. - */ -static mach_msg_return_t -flipc_cmd_ipc(mnl_msg_t fmsg, - mach_node_t node, - uint32_t flags __unused) -{ - mach_msg_header_t *mmsg; - - // Convert flipc message into mach message in place to avoid alloc/copy - mmsg = (mach_msg_header_t*)MNL_MSG_PAYLOAD(fmsg); - mmsg->msgh_size = fmsg->size; - mmsg->msgh_remote_port = mnl_name_to_port(fmsg->object); - mmsg->msgh_local_port = mnl_name_to_port((mnl_name_t)mmsg->msgh_local_port); - mmsg->msgh_voucher_port = (mach_port_name_t)MACH_PORT_NULL; - mmsg->msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0); - // unchanged: msgh_id - - return mach_msg_send_from_remote_kernel(mmsg, fmsg->size, node); -} - - -/* Called when an ACKMESSAGE packet is received. indicates - * the flipc name of the port holding the messages to be acknowledged. - * indicates the number of messages being acked for this node:port. 
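mnl_msg_from_kmsg() copies the message body and then rewrites the port fields in the copy so that only node-portable 64-bit names travel over the link, with the voucher dropped entirely. A small illustrative sketch of that rewrite step; the header layouts and the lookup_wire_name() helper are simplified stand-ins, not the real kmsg or mnl types:

    #include <stdint.h>

    typedef uint64_t wire_name_t;
    #define WIRE_NAME_NULL ((wire_name_t)0)

    struct local_header { void *remote_port; void *local_port; void *voucher_port; uint32_t size; };
    struct wire_header  { wire_name_t remote; wire_name_t local; wire_name_t voucher; uint32_t size; };

    /* tiny registry standing in for mnl_name_from_port(): pointer -> wire name */
    struct mapping { void *port; wire_name_t name; };
    static struct mapping registry[8];

    static wire_name_t
    lookup_wire_name(void *port)
    {
        if (port == NULL) {
            return WIRE_NAME_NULL;
        }
        for (int i = 0; i < 8; i++) {
            if (registry[i].port == port) {
                return registry[i].name;
            }
        }
        return WIRE_NAME_NULL;     /* unknown or local-only port */
    }

    static void
    translate_header(const struct local_header *in, struct wire_header *out)
    {
        out->size    = in->size;
        out->remote  = lookup_wire_name(in->remote_port);  /* destination by name */
        out->local   = lookup_wire_name(in->local_port);   /* reply port by name  */
        out->voucher = WIRE_NAME_NULL;                     /* vouchers do not travel */
    }

    int
    main(void)
    {
        int dest;                                          /* fake local port object */
        registry[0] = (struct mapping){ &dest, 0x1001 };

        struct local_header in  = { .remote_port = &dest, .local_port = NULL, .size = 32 };
        struct wire_header  out;
        translate_header(&in, &out);
        return out.remote == 0x1001 ? 0 : 1;
    }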
- */ -static void -flipc_cmd_ack(flipc_ack_msg_t fmsg, - mach_node_t node __unused, - uint32_t flags __unused) -{ - unsigned int msg_count = fmsg->msg_count; - thread_t thread = current_thread(); - boolean_t kick = FALSE; - - flipc_port_t fport = (flipc_port_t)mnl_obj_lookup(fmsg->mnl.object); - - ipc_port_t lport = fport->lport; - ip_mq_lock(lport); // Revisit the lock when enabling flipc - - ipc_mqueue_t lport_mq = &lport->ip_messages; - - assert(fport->peek_count >= msg_count); // Can't ack what we haven't peeked! - - while (msg_count--) { - ipc_mqueue_select_on_thread_locked(lport_mq, NULL, 0, thread); - fport->peek_count--; - kick |= ipc_kmsg_delayed_destroy(thread->ith_kmsg); - } - - ip_mq_unlock(lport); - - if (kick) { - ipc_kmsg_reap_delayed(); - } -} - - - -/*** FLIPC Node Managment Functions (called by mach node layer) ***/ - - -/* flipc_node_prepare() is called by mach node layer when a remote node is - * registered by a link driver, or when the bootstrap port changes for the - * local node. This is the flipc layer's opportunity to initialize per-node - * flipc state, and to convert the node's bootstrap port into a flipc port. - * Note that the node is not yet in the mach node table. - * Returns KERN_SUCCESS on success; otherwise node is not prepared. - */ -kern_return_t -flipc_node_prepare(mach_node_t node) -{ - kern_return_t kr; - - assert(MACH_NODE_VALID(node)); - ipc_port_t bs_port = node->bootstrap_port; - assert(IP_VALID(bs_port)); - - ip_mq_lock(bs_port); - - kr = flipc_port_create(bs_port, - node, - MNL_NAME_BOOTSTRAP(node->info.node_id)); - ip_mq_unlock(bs_port); - - return kr; -} - - -/* flipc_node_retire() is called by mach node layer when a remote node is - * terminated by a link driver, or when the local node's bootstrap port - * becomes invalid. This is the flipc layer's opportunity to free per-node - * flipc state, and to revert the node's bootstrap port to a local ipc port. - * must be locked by the caller. - * Returns KERN_SUCCESS on success. - */ -kern_return_t -flipc_node_retire(mach_node_t node) -{ - if (!MACH_NODE_VALID(node)) { - return KERN_NODE_DOWN; - } - - ipc_port_t bs_port = node->bootstrap_port; - if (IP_VALID(bs_port)) { - ip_mq_lock(bs_port); // Revisit the lock when enabling flipc - flipc_port_destroy(bs_port); - ip_mq_unlock(bs_port); - } - - return KERN_SUCCESS; -} - - -/*** FLIPC Message Functions (called by mach node layer) ***/ - - -/* The node layer calls flipc_msg_to_remote_node() to fetch the next message - * for . This function will block until a message is available or the - * node is terminated, in which case it returns MNL_MSG_NULL. 
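flipc_cmd_ack() relies on a simple credit invariant: a message can only be acknowledged after it has been peeked and handed to the link driver, so the per-port peek count never goes negative. A tiny standalone model of that accounting; the names here are illustrative, not the xnu fields:

    #include <assert.h>
    #include <stdint.h>

    struct port_credits {
        uint32_t peeked;    /* messages handed to the link, awaiting ack */
    };

    /* transmit side: one message peeked and shipped to the remote node */
    static void
    credits_on_peek(struct port_credits *pc)
    {
        pc->peeked++;
    }

    /* ack side: the remote node acknowledged 'count' messages for this port */
    static void
    credits_on_ack(struct port_credits *pc, uint32_t count)
    {
        assert(pc->peeked >= count);   /* can't ack what was never peeked */
        pc->peeked -= count;
    }

    int
    main(void)
    {
        struct port_credits pc = { 0 };
        credits_on_peek(&pc);
        credits_on_peek(&pc);
        credits_on_ack(&pc, 2);        /* balanced */
        return (int)pc.peeked;         /* 0 */
    }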
- */ -mnl_msg_t -flipc_msg_to_remote_node(mach_node_t to_node, - uint32_t flags __unused) -{ - mach_port_seqno_t msgoff; - ipc_kmsg_t kmsg = IKM_NULL; - mnl_msg_t fmsg = MNL_MSG_NULL; - - assert(to_node != localnode); - assert(get_preemption_level() == 0); - - struct waitq *pset_waitq = &to_node->proxy_port_set->ips_wqset.wqset_q; - ipc_mqueue_t port_mq = IMQ_NULL; - - while (!to_node->dead) { - /* Fetch next message from proxy port */ - ipc_mqueue_receive(pset_waitq, MACH_PEEK_MSG, 0, 0, THREAD_ABORTSAFE); - - thread_t thread = current_thread(); - if (thread->ith_state == MACH_PEEK_READY) { - port_mq = thread->ith_peekq; - thread->ith_peekq = IMQ_NULL; - } else { - panic("Unexpected thread state %d after ipc_mqueue_receive()", - thread->ith_state); - } - - assert(get_preemption_level() == 0); - - flipc_port_t fport = port_mq->imq_fport; - - if (FPORT_VALID(fport)) { - msgoff = port_mq->imq_fport->peek_count; - - ipc_mqueue_peek_locked(port_mq, &msgoff, NULL, NULL, NULL, &kmsg); - if (kmsg != IKM_NULL) { - port_mq->imq_fport->peek_count++; - } - - /* Clean up outstanding prepost on port_mq. - * This also unlocks port_mq. - */ - ipc_mqueue_release_peek_ref(port_mq); - assert(get_preemption_level() == 0); - - /* DANGER: The code below must be allowed to allocate so it can't - * run under the protection of the imq_lock, but that leaves mqueue - * open for business for a small window before we examine kmsg. - * This SHOULD be OK, since we are the only thread looking. - */ - if (kmsg != IKM_NULL) { - mnl_msg_from_kmsg(kmsg, (mnl_msg_t*)&fmsg); - } - } else { - /* Must be from the control_port, which is not a flipc port */ - assert(!FPORT_VALID(port_mq->imq_fport)); - - /* This is a simplified copy of ipc_mqueue_select_on_thread() */ - kmsg = ipc_kmsg_queue_first(&port_mq->imq_messages); - assert(kmsg != IKM_NULL); - ipc_kmsg_rmqueue(&port_mq->imq_messages, kmsg); - ipc_mqueue_release_msgcount(port_mq); - counter_inc(¤t_task()->messages_received); - ip_release(to_node->control_port); // Should derive ref from port_mq - - /* We just pass the kmsg payload as the fmsg. - * flipc_msg_free() will notice and free the kmsg properly. - */ - mach_msg_header_t *hdr = ikm_header(kmsg); - fmsg = (mnl_msg_t)(&hdr[1]); - /* Stash kmsg pointer just before fmsg */ - *(ipc_kmsg_t*)((vm_offset_t)fmsg - sizeof(vm_offset_t)) = kmsg; - } - - if (MNL_MSG_VALID(fmsg)) { - break; - } - } - assert(MNL_MSG_VALID(fmsg)); - return fmsg; -} - - -/* The mach node layer calls this to deliver an incoming message. It is the - * responsibility of the caller to release the received message buffer after - * return. - */ -void -flipc_msg_from_node(mach_node_t from_node __unused, - mnl_msg_t msg, - uint32_t flags) -{ - /* Note that if flipc message forwarding is supported, the from_node arg - * may not match fmsg->node_id. The former is the node from which we - * received the message; the latter is the node that originated the - * message. We use the originating node, which is where the ack goes. 
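The control-port branch above avoids a copy by handing out the kmsg payload directly and stashing the owning kmsg pointer in the pointer-sized slot just before it, so that the free path can recover and release the right object. A self-contained sketch of that stash-before-payload trick, with an explicit prefix allocation instead of reusing a kmsg header:

    #include <stdlib.h>
    #include <assert.h>

    /* Allocate a payload of 'size' bytes with a hidden owner pointer stored
     * immediately in front of it. */
    static void *
    payload_alloc(size_t size, void *owner)
    {
        void **block = malloc(sizeof(void *) + size);
        if (block == NULL) {
            return NULL;
        }
        block[0] = owner;                 /* stash owner just before the payload */
        return (void *)&block[1];         /* caller only ever sees the payload   */
    }

    /* Recover the stashed owner from a payload pointer. */
    static void *
    payload_owner(void *payload)
    {
        return ((void **)payload)[-1];
    }

    static void
    payload_free(void *payload)
    {
        free((void **)payload - 1);       /* free the block, prefix included */
    }

    int
    main(void)
    {
        int owner;                                  /* stands in for the kmsg */
        void *payload = payload_alloc(64, &owner);
        assert(payload != NULL);
        assert(payload_owner(payload) == &owner);   /* free path finds the owner */
        payload_free(payload);
        return 0;
    }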
- */ - assert(msg->sub == MACH_NODE_SUB_FLIPC); - mach_node_t node = mach_node_for_id_locked(msg->node_id, FALSE, FALSE); - MACH_NODE_UNLOCK(node); - - switch (msg->cmd) { - case FLIPC_CMD_IPCMESSAGE: - flipc_cmd_ipc(msg, node, flags); - break; - - case FLIPC_CMD_ACKMESSAGE: - case FLIPC_CMD_NAKMESSAGE: - flipc_cmd_ack((flipc_ack_msg_t)msg, node, flags); - break; - - default: -#if DEBUG - PE_enter_debugger("flipc_incoming(): Invalid command"); -#endif - break; - } -} - - -/* The node layer calls flipc_msg_free() to dispose of sent messages that - * originated in the FLIPC layer. This allows us to repurpose the payload - * of an ack or nak kmsg as a flipc message to avoid a copy - we detect - * such messages here and free them appropriately. - */ -void -flipc_msg_free(mnl_msg_t msg, - uint32_t flags) -{ - switch (msg->cmd) { - case FLIPC_CMD_ACKMESSAGE: // Flipc msg is a kmsg in disguise... - case FLIPC_CMD_NAKMESSAGE: // Convert back to kmsg for disposal - ipc_kmsg_free(*(ipc_kmsg_t*)((vm_offset_t)msg - sizeof(vm_offset_t))); - break; - - default: // Flipc msg is not a kmsg in disguise; dispose of normally - mnl_msg_free(msg, flags); - break; - } -} - - -/*** FLIPC Message Functions (called by mach ipc subsystem) ***/ - -/* Ack's one message sent to from . A new kmsg is allocated - * and filled in as an ack, then posted to the node's contol port. This will - * wake the link driver (if sleeping) and cause the ack to be included with - * normal IPC traffic. - * - * This function immediately returns if or is invalid, so it - * is safe & quick to call speculatively. - * - * Called from mach ipc_mqueue.c when a flipc-originated message is consumed. - */ -void -flipc_msg_ack(mach_node_t node, - ipc_mqueue_t mqueue, - boolean_t delivered) -{ - flipc_port_t fport = mqueue->imq_fport; - - assert(FPORT_VALID(fport)); - assert(MACH_NODE_VALID(node)); - - mnl_name_t name = MNL_NAME_NULL; - mach_node_id_t nid = HOST_LOCAL_NODE; - ipc_port_t ack_port = IP_NULL; - - ip_mq_lock(fport->lport); - name = fport->obj.name; - ip_mq_unlock(fport->lport); - - if (!MNL_NAME_VALID(name)) { - return; - } - - MACH_NODE_LOCK(node); - if (node->active) { - nid = node->info.node_id; - ack_port = node->control_port; - } - MACH_NODE_UNLOCK(node); - - if (!IP_VALID(ack_port) || !MACH_NODE_ID_VALID(nid)) { - return; - } - - /* We have a valid node id & obj name, and a port to send the ack to. */ - ipc_kmsg_t kmsg = ipc_kmsg_alloc(sizeof(struct flipc_ack_msg), IPC_KMSG_ALLOC_KERNEL); - assert((unsigned long long)kmsg >= 4ULL);//!= IKM_NULL); - mach_msg_header_t *msg = ikm_header(kmsg); - - /* Fill in the mach_msg_header struct */ - msg->msgh_bits = MACH_MSGH_BITS_SET(0, 0, 0, 0); - msg->msgh_size = sizeof(msg); - msg->msgh_remote_port = ack_port; - msg->msgh_local_port = MACH_PORT_NULL; - msg->msgh_voucher_port = MACH_PORT_NULL; - msg->msgh_id = FLIPC_CMD_ID; - - /* Fill in the flipc_ack_msg struct */ - flipc_ack_msg_t fmsg = (flipc_ack_msg_t)(&msg[1]); - fmsg->resend_to = HOST_LOCAL_NODE; - fmsg->msg_count = 1; // Might want to coalesce acks to a node/name pair - - /* Fill in the mnl_msg struct */ - fmsg->mnl.sub = MACH_NODE_SUB_FLIPC; - fmsg->mnl.cmd = delivered ? 
FLIPC_CMD_ACKMESSAGE : FLIPC_CMD_NAKMESSAGE; - fmsg->mnl.qos = 0; // Doesn't do anything yet - fmsg->mnl.flags = 0; - fmsg->mnl.node_id = nid; - fmsg->mnl.object = name; - fmsg->mnl.options = 0; - fmsg->mnl.size = sizeof(struct flipc_ack_msg) - sizeof(struct mnl_msg); - -#if (0) - mach_msg_return_t mmr; - ipc_mqueue_t ack_mqueue; - - ip_mq_lock(ack_port); // Revisit the lock when enabling flipc - ack_mqueue = &ack_port->ip_messages; - - /* ipc_mqueue_send() unlocks ack_mqueue */ - mmr = ipc_mqueue_send_locked(ack_mqueue, kmsg, 0, 0); -#else - kern_return_t kr; - kr = ipc_kmsg_send(kmsg, - MACH_SEND_KERNEL_DEFAULT, - MACH_MSG_TIMEOUT_NONE); -#endif -} diff --git a/osfmk/ipc/flipc.h b/osfmk/ipc/flipc.h deleted file mode 100644 index d924f9cf9..000000000 --- a/osfmk/ipc/flipc.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2015-2020 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * File: ipc/flipc.h - * Author: Dean Reece - * Date: 2016 - * - * Definitions for fast local ipc (flipc). - */ - -#ifndef _IPC_FLIPC_H_ -#define _IPC_FLIPC_H_ - -#if MACH_KERNEL_PRIVATE && MACH_FLIPC - -#include -#include -#include - -__BEGIN_DECLS - - -/*** FLIPC Port Declarations ***/ - -/* A FLIPC port (flipc_port_t) is a companion structure to ipc_port_t. - * Any ipc_port object that is known to the flipc layer has one of these - * structures to maintain the state of the port with respect to flipc. - * When a port reverts to a purely local object (all rights for the port exist - * on a single node) the flipc port companion structure will be de-allocated. 
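In flipc_msg_ack() the mnl size field counts only the payload that follows the generic mnl header, which is why it is set to sizeof(struct flipc_ack_msg) - sizeof(struct mnl_msg). A compile-time check of that convention using simplified stand-in structs (the field layout here is illustrative; the packed attribute mirrors the real flipc_ack_msg declaration):

    #include <stdint.h>

    /* simplified stand-ins: a generic header followed by an ack-specific payload */
    struct hdr {
        uint8_t  sub, cmd;
        uint16_t flags;
        uint32_t node_id;
        uint64_t object;
        uint32_t size;          /* bytes after the header, not including it */
    };

    struct ack_msg {
        struct hdr mnl;
        int32_t    resend_to;
        uint8_t    msg_count;
    } __attribute__((__packed__));

    /* the advertised payload size must be exactly the bytes after the header */
    _Static_assert(sizeof(struct ack_msg) - sizeof(struct hdr) ==
        sizeof(int32_t) + sizeof(uint8_t),
        "size convention: payload size excludes the generic header");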
- */ - -typedef struct flipc_port { - struct mnl_obj obj; // Necessary to be in mnl_name_table[] - ipc_port_t lport; // The associated local ipc_port - mach_node_t hostnode; // Node holding the recieve right - uint32_t peek_count; // How many kmsgs in mq have been peeked - uint32_t state:3;// See FPORT_STATE_* defines below -} *flipc_port_t; - -#define FPORT_NULL ((flipc_port_t) 0UL) -#define FPORT_VALID(fport) ((fport) != FPORT_NULL) - -#define FPORT_STATE_INIT (0) // Port is being initialized -#define FPORT_STATE_PROXY (1) // Principal is on another node -#define FPORT_STATE_PRINCIPAL (2) // Principal is on this node -#define FPORT_STATE_PREPRINCIPAL (3) // Principal moving to this node -#define FPORT_STATE_POSTPRINCIPAL (4) // Principal moving to other node -#define FPORT_STATE_DEAD (5) // Port is being destroyed - - -/*** FLIPC Node Managment Declarations (used by mach node layer) ***/ - -extern mach_node_id_t localnode_id; // This node's FLIPC id. - -/* flipc_node_prepare() is called by mach node layer when a remote node is - * registered by a link driver. This is the flipc layer's opportunity to - * convert it to a flipc port and hook it into any appropriate structures. - * Note that the node is not yet in the mach node table. Returns KERN_SUCCESS - * on success; otherwise node is not prepared and cannot be used. - */ -kern_return_t flipc_node_prepare(mach_node_t node); - -/* flipc_node_retire() is called by mach node layer when a remote node is - * terminated by a link driver. This is the flipc layer's opportunity to - * convert it back to a local port and unhook it into any structures. - * Returns KERN_SUCCESS on success. - */ -kern_return_t flipc_node_retire(mach_node_t node); - - -/*** FLIPC Message Declarations (used by mach node layer) ***/ - -/* Definition for a flipc ack/nak message. These messages are sent to the - * originating node of a message to ack or nak the message. Ack'd messages - * are destroyed by the originating node (to avoid duplicate delivery). Nak'd - * messages are re-sent to the node specified in (used when a - * receive right moves to a different node). These messages are queued onto - * the originating node's control_port and sent along with other ipc traffic. - */ - -typedef struct flipc_ack_msg { - struct mnl_msg mnl; // Flipc message starts with mnl message - mach_node_id_t resend_to; // Node ID for resends (if NAK) - uint8_t msg_count; // Number of msgs being ackd/nakd -} __attribute__((__packed__)) * flipc_ack_msg_t; - -#define FLIPC_CMD_ID (0x43504952UL) // msgh_id "RIPC" for FLIPC msgs -#define FLIPC_CMD_IPCMESSAGE (1) // IPC Msg: is sender; is dest port -#define FLIPC_CMD_ACKMESSAGE (2) // is port being ack'd -#define FLIPC_CMD_NAKMESSAGE (3) // is port being nak'd - - -/* The node layer calls flipc_msg_to_remote_node() to fetch the next message - * for . This function will block until a message is available or the - * node is terminated, in which case it returns MNL_MSG_NULL. - */ -mnl_msg_t flipc_msg_to_remote_node(mach_node_t to_node, - uint32_t flags); - -/* The node layer calls flipc_msg_to_remote_node() to post the next message - * from . This function will block until a message is available - * or the node is terminated, in which case it returns MNL_MSG_NULL. - */ -void flipc_msg_from_node(mach_node_t from_node, - mnl_msg_t msg_arg, - uint32_t flags); - -/* The node layer calls flipc_msg_free() to dispose of sent messages that - * originated in the FLIPC layer. 
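The flipc_port above keeps its lifecycle state in a 3-bit field, which comfortably holds the six FPORT_STATE_* values (0 through 5). A small sketch of that encoding with a compile-time check; the state values are copied from the removed header, the surrounding struct is a stand-in:

    #include <stdint.h>

    enum fport_state {
        FPORT_STATE_INIT          = 0,
        FPORT_STATE_PROXY         = 1,
        FPORT_STATE_PRINCIPAL     = 2,
        FPORT_STATE_PREPRINCIPAL  = 3,
        FPORT_STATE_POSTPRINCIPAL = 4,
        FPORT_STATE_DEAD          = 5,
    };

    struct port_state {
        uint32_t state:3;      /* 3 bits hold 0..7, enough for six states */
    };

    _Static_assert(FPORT_STATE_DEAD < (1u << 3),
        "every FPORT_STATE_* value must fit in the 3-bit field");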
- */ -void flipc_msg_free(mnl_msg_t msg, - uint32_t flags); - - -/*** FLIPC Message Declarations (used by mach ipc subsystem) ***/ - -/* Ack a message sent by to . A new kmsg is allocated and - * filled in as an ack (or nak if is false), then posted to the - * node's contol port. This will wake the link driver (if sleeping) and cause - * the ack to be included with normal IPC traffic. - * - * This function immediately returns if or is invalid, so it - * is safe & quick to call speculatively. - * - * Called from mach ipc_mqueue.c when a flipc-originated message is consumed. - */ -void flipc_msg_ack(mach_node_t node, - ipc_mqueue_t mqueue, - boolean_t delivered); - - -__END_DECLS - -#endif // MACH_KERNEL_PRIVATE -#endif // _IPC_FLIPC_H_ diff --git a/osfmk/ipc/ipc_entry.c b/osfmk/ipc/ipc_entry.c index 82f97c148..4622f6ce7 100644 --- a/osfmk/ipc/ipc_entry.c +++ b/osfmk/ipc/ipc_entry.c @@ -94,6 +94,59 @@ ipc_entry_table_count_max(void) return ipc_entry_table_size_to_count(CONFIG_IPC_TABLE_ENTRIES_SIZE_MAX); } +/* + * Routine: ipc_entry_name_mask + * Purpose: + * Ensure a mach port name has the default ipc entry + * generation bits set. This can be used to ensure that + * a name passed in by user space matches names generated + * by the kernel. + * Conditions: + * None. + * Returns: + * 'name' input with default generation bits masked or added + * as appropriate. + */ +mach_port_name_t +ipc_entry_name_mask(mach_port_name_t name) +{ + return name | MACH_PORT_MAKE(0, IE_BITS_ROLL_MASK); +} + +/* + * Restart a generation counter with the specified bits for the rollover point. + * There are 4 different rollover points: + * bits rollover period + * 0 0 64 + * 0 1 32 + * 1 0 22 + * 1 1 16 + */ +static ipc_entry_bits_t +ipc_entry_make_gen(ipc_entry_bits_t ie_bits, ipc_space_t space) +{ + ipc_entry_bits_t roll_bits; + + roll_bits = random_bool_gen_bits(&space->is_prng, + space->is_entropy, IS_ENTROPY_CNT, IE_BITS_ROLL_BITS); + + roll_bits <<= __builtin_ctz(IE_BITS_ROLL_MASK); + return (ie_bits & IE_BITS_GEN_MASK) | roll_bits; +} + +static ipc_entry_bits_t +ipc_entry_next_gen(ipc_entry_bits_t oldgen, ipc_space_t space) +{ + ipc_entry_bits_t roll = oldgen & IE_BITS_ROLL_MASK; + ipc_entry_bits_t delta = IE_BITS_GEN_ONE + (roll << IE_BITS_ROLL_BITS); + ipc_entry_bits_t newgen; + + if (os_add_overflow(oldgen, delta, &newgen)) { + newgen = ipc_entry_make_gen(0, space); + } + return newgen; +} + /* * Routine: ipc_entry_lookup * Purpose: @@ -228,11 +281,7 @@ ipc_entry_claim( * Initialize the new entry: increment gencount and reset * rollover point if it rolled over, and clear ie_request. */ - gen = ipc_entry_new_gen(entry->ie_bits); - if (__improbable(ipc_entry_gen_rolled(entry->ie_bits, gen))) { - ipc_entry_bits_t roll = ipc_space_get_rollpoint(space); - gen = ipc_entry_new_rollpoint(roll); - } + gen = ipc_entry_next_gen(entry->ie_bits, space); entry->ie_bits = gen; entry->ie_request = IE_REQ_NONE; entry->ie_object = object; @@ -243,7 +292,7 @@ ipc_entry_claim( * the table isn't allowed to grow big enough. * (See comment in ipc/ipc_table.h.) 
*/ - new_name = MACH_PORT_MAKE(first_free, gen); + new_name = MACH_PORT_MAKE(first_free, IE_BITS_GEN(gen)); assert(MACH_PORT_VALID(new_name)); *namep = new_name; *entryp = entry; @@ -418,7 +467,7 @@ ipc_entry_alloc_name( prev_entry); } - entry->ie_bits = gen; + entry->ie_bits = ipc_entry_make_gen(gen, space); entry->ie_request = IE_REQ_NONE; *entryp = entry; @@ -793,29 +842,3 @@ no_space: is_write_unlock(space); return KERN_NO_SPACE; } - - -/* - * Routine: ipc_entry_name_mask - * Purpose: - * Ensure a mach port name has the default ipc entry - * generation bits set. This can be used to ensure that - * a name passed in by user space matches names generated - * by the kernel. - * Conditions: - * None. - * Returns: - * 'name' input with default generation bits masked or added - * as appropriate. - */ -mach_port_name_t -ipc_entry_name_mask(mach_port_name_t name) -{ -#ifndef NO_PORT_GEN - static mach_port_name_t null_name = MACH_PORT_MAKE(0, IE_BITS_GEN_MASK + IE_BITS_GEN_ONE); - return name | null_name; -#else - static mach_port_name_t null_name = MACH_PORT_MAKE(0, ~(IE_BITS_GEN_MASK + IE_BITS_GEN_ONE)); - return name & ~null_name; -#endif -} diff --git a/osfmk/ipc/ipc_entry.h b/osfmk/ipc/ipc_entry.h index f1ff02464..9669168f9 100644 --- a/osfmk/ipc/ipc_entry.h +++ b/osfmk/ipc/ipc_entry.h @@ -77,6 +77,8 @@ #include +#include + /* * Spaces hold capabilities for ipc_object_t's. * Each ipc_entry_t records a capability. Most capabilities have @@ -128,6 +130,8 @@ struct ipc_entry { }; }; +typedef struct bool_gen *ipc_entry_prng_t; + #define IPC_ENTRY_TABLE_MIN 32 #define IPC_ENTRY_TABLE_PERIOD 16 KALLOC_ARRAY_TYPE_DECL(ipc_entry_table, struct ipc_entry); @@ -140,97 +144,29 @@ KALLOC_ARRAY_TYPE_DECL(ipc_entry_table, struct ipc_entry); #define IE_BITS_TYPE_MASK 0x001f0000 /* 5 bits of capability type */ #define IE_BITS_TYPE(bits) ((bits) & IE_BITS_TYPE_MASK) -#define IE_BITS_EXTYPE_MASK 0x00200000 /* 1 bit for extended capability */ +#define IE_BITS_EXTYPE_MASK 0x00e00000 /* 3 bit for extended capability */ +#define IE_BITS_EX_RECEIVE 0x00200000 /* entry used to be a receive right */ +#define IE_BITS_PINNED_SEND 0x00400000 /* last send right can't be destroyed */ +#define IE_BITS_IMMOVABLE_SEND 0x00800000 /* send right can't be moved */ -#ifndef NO_PORT_GEN -#define IE_BITS_GEN_MASK 0xff000000 /* 8 bits for generation */ -#define IE_BITS_GEN(bits) ((bits) & IE_BITS_GEN_MASK) +#define IE_BITS_ROLL_MASK 0x03000000 /* 2 bits for rollover period */ +#define IE_BITS_GEN_MASK 0xfc000000 /* 6 bits for generation */ +#define IE_BITS_GEN(bits) (((bits) & IE_BITS_GEN_MASK) | IE_BITS_ROLL_MASK) #define IE_BITS_GEN_ONE 0x04000000 /* low bit of generation */ -#define IE_BITS_ROLL_POS 22 /* LSB pos of generation rollover */ #define IE_BITS_ROLL_BITS 2 /* number of generation rollover bits */ -#define IE_BITS_ROLL_MASK (((1 << IE_BITS_ROLL_BITS) - 1) << IE_BITS_ROLL_POS) -#define IE_BITS_ROLL(bits) ((((bits) & IE_BITS_ROLL_MASK) << 8) ^ IE_BITS_GEN_MASK) +#define IE_BITS_GEN_INIT IE_BITS_GEN_MASK +#define IE_BITS_RIGHT_MASK 0x00ffffff /* relevant to the right */ -/* - * Restart a generation counter with the specified bits for the rollover point. 
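The replacement generation scheme above keeps 6 generation bits plus 2 per-entry roll bits and advances the generation by (1 + roll) steps on each reuse, so different entries wrap after 64, 32, 22 or 16 reuses. A standalone simulation of that arithmetic using the new mask values; the helper names are hypothetical, and overflow of the 32-bit word stands in for the os_add_overflow() check:

    #include <stdio.h>
    #include <stdint.h>

    #define IE_BITS_ROLL_MASK 0x03000000u   /* 2 bits selecting the rollover period */
    #define IE_BITS_GEN_MASK  0xfc000000u   /* 6 bits of generation */
    #define IE_BITS_GEN_ONE   0x04000000u   /* low bit of the generation field */
    #define IE_BITS_ROLL_BITS 2

    /* restart the counter: generation 0, keep only the chosen roll bits */
    static uint32_t
    make_gen(uint32_t roll_value)
    {
        return (roll_value << 24) & IE_BITS_ROLL_MASK;
    }

    /* advance by (1 + roll) generations; report when the 32-bit word overflows */
    static int
    next_gen(uint32_t oldgen, uint32_t *newgen)
    {
        uint32_t roll  = oldgen & IE_BITS_ROLL_MASK;
        uint32_t delta = IE_BITS_GEN_ONE + (roll << IE_BITS_ROLL_BITS);

        return __builtin_add_overflow(oldgen, delta, newgen);
    }

    int
    main(void)
    {
        for (uint32_t roll = 0; roll < 4; roll++) {
            uint32_t gen = make_gen(roll), next;
            unsigned period = 1;                 /* the restart value itself */

            while (!next_gen(gen, &next)) {
                gen = next;
                period++;
            }
            printf("roll=%u -> period %u\n", roll, period);  /* 64, 32, 22, 16 */
        }
        return 0;
    }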
- * There are 4 different rollover points: - * bits rollover period - * 0 0 64 - * 0 1 48 - * 1 0 32 - * 1 1 16 - */ -static inline ipc_entry_bits_t -ipc_entry_new_rollpoint( - ipc_entry_bits_t rollbits) -{ - rollbits = (rollbits << IE_BITS_ROLL_POS) & IE_BITS_ROLL_MASK; - ipc_entry_bits_t newgen = IE_BITS_GEN_MASK + IE_BITS_GEN_ONE; - return newgen | rollbits; -} - -/* - * Get the next gencount, modulo the entry's rollover point. If the sum rolls over, - * the caller should re-start the generation counter with a different rollpoint. - */ -static inline ipc_entry_bits_t -ipc_entry_new_gen( - ipc_entry_bits_t oldgen) -{ - ipc_entry_bits_t sum = (oldgen + IE_BITS_GEN_ONE) & IE_BITS_GEN_MASK; - ipc_entry_bits_t roll = oldgen & IE_BITS_ROLL_MASK; - ipc_entry_bits_t newgen = (sum % IE_BITS_ROLL(oldgen)) | roll; - return newgen; -} - -/* Determine if a gencount has rolled over or not. */ -static inline boolean_t -ipc_entry_gen_rolled( - ipc_entry_bits_t oldgen, - ipc_entry_bits_t newgen) -{ - return (oldgen & IE_BITS_GEN_MASK) > (newgen & IE_BITS_GEN_MASK); -} - -#else -#define IE_BITS_GEN_MASK 0 -#define IE_BITS_GEN(bits) 0 -#define IE_BITS_GEN_ONE 0 -#define IE_BITS_ROLL_POS 0 -#define IE_BITS_ROLL_MASK 0 -#define IE_BITS_ROLL(bits) (bits) - -static inline ipc_entry_bits_t -ipc_entry_new_rollpoint( - ipc_entry_bits_t rollbits) -{ - return 0; -} - -static inline ipc_entry_bits_t -ipc_entry_new_gen( - ipc_entry_bits_t oldgen) -{ - return 0; -} - -static inline boolean_t -ipc_entry_gen_rolled( - ipc_entry_bits_t oldgen, - ipc_entry_bits_t newgen) -{ - return FALSE; -} - -#endif /* !USE_PORT_GEN */ - -#define IE_BITS_RIGHT_MASK 0x007fffff /* relevant to the right */ /* * Exported interfaces */ extern unsigned int ipc_entry_table_count_max(void) __pure2; +/* mask on/off default entry generation bits */ +extern mach_port_name_t ipc_entry_name_mask( + mach_port_name_t name); + /* Search for entry in a space by name */ extern ipc_entry_t ipc_entry_lookup( ipc_space_t space, @@ -279,7 +215,4 @@ extern kern_return_t ipc_entry_grow_table( ipc_space_t space, ipc_table_elems_t target_size); -/* mask on/off default entry generation bits */ -extern mach_port_name_t ipc_entry_name_mask( - mach_port_name_t name); #endif /* _IPC_IPC_ENTRY_H_ */ diff --git a/osfmk/ipc/ipc_eventlink.c b/osfmk/ipc/ipc_eventlink.c index 194edbe8b..319a01949 100644 --- a/osfmk/ipc/ipc_eventlink.c +++ b/osfmk/ipc/ipc_eventlink.c @@ -47,8 +47,6 @@ #include #include -#include - static KALLOC_TYPE_DEFINE(ipc_eventlink_zone, struct ipc_eventlink_base, KT_DEFAULT); @@ -125,6 +123,7 @@ port_name_to_eventlink( struct ipc_eventlink **ipc_eventlink_ptr); IPC_KOBJECT_DEFINE(IKOT_EVENTLINK, + .iko_op_movable_send = true, .iko_op_no_senders = ipc_eventlink_no_senders); /* @@ -176,8 +175,8 @@ ipc_eventlink_initialize( for (int i = 0; i < 2; i++) { struct ipc_eventlink *ipc_eventlink = &(ipc_eventlink_base->elb_eventlink[i]); - ipc_eventlink->el_port = ipc_kobject_alloc_port((ipc_kobject_t)ipc_eventlink, - IKOT_EVENTLINK, IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); + ipc_eventlink->el_port = ipc_kobject_alloc_port(ipc_eventlink, + IKOT_EVENTLINK, IPC_KOBJECT_ALLOC_MAKE_SEND); /* ipc_kobject_alloc_port never fails */ ipc_eventlink->el_thread = THREAD_NULL; ipc_eventlink->el_sync_counter = 0; @@ -321,7 +320,8 @@ ipc_eventlink_destroy_internal( splx(s); /* Destroy the local eventlink port */ - ipc_kobject_dealloc_port(ipc_eventlink_port, 0, IKOT_EVENTLINK); + ipc_kobject_dealloc_port(ipc_eventlink_port, IPC_KOBJECT_NO_MSCOUNT, + 
IKOT_EVENTLINK); /* Drops port reference */ /* Clear the remote eventlink port without destroying it */ @@ -1005,7 +1005,7 @@ convert_port_to_eventlink_locked( kern_return_t kr = KERN_INVALID_CAPABILITY; struct ipc_eventlink *ipc_eventlink = IPC_EVENTLINK_NULL; - if (ip_active(port) && ip_kotype(port) == IKOT_EVENTLINK) { + if (ip_active(port) && ip_type(port) == IKOT_EVENTLINK) { ipc_eventlink = ipc_kobject_get_raw(port, IKOT_EVENTLINK); if (ipc_eventlink) { ipc_eventlink_reference(ipc_eventlink); diff --git a/osfmk/ipc/ipc_hash.c b/osfmk/ipc/ipc_hash.c index f71a43bfb..52f38a0a9 100644 --- a/osfmk/ipc/ipc_hash.c +++ b/osfmk/ipc/ipc_hash.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include diff --git a/osfmk/ipc/ipc_importance.c b/osfmk/ipc/ipc_importance.c index 3ddf97409..f9bff39f1 100644 --- a/osfmk/ipc/ipc_importance.c +++ b/osfmk/ipc/ipc_importance.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -2294,8 +2294,7 @@ ipc_importance_check_circularity( /* port (== base) is in limbo */ - require_ip_active(port); - assert(ip_in_limbo(port)); + ipc_release_assert(ip_in_limbo(port)); assert(!took_base_ref); base = dest; @@ -2304,7 +2303,7 @@ ipc_importance_check_circularity( /* base is in transit or in limbo */ - require_ip_active(base); + ipc_release_assert(ip_is_moving(base)); assert(base->ip_receiver_name == MACH_PORT_NULL); next = ip_get_destination(base); ip_mq_unlock(base); @@ -2329,23 +2328,15 @@ ipc_importance_check_circularity( ipc_port_multiple_unlock(); not_circular: - /* port is in limbo */ - require_ip_active(port); - assert(ip_in_limbo(port)); - /* Port is being enqueued in a kmsg, remove the watchport boost in order to push on destination port */ watchport_elem = ipc_port_clear_watchport_elem_internal(port); /* Check if the port is being enqueued as a part of sync bootstrap checkin */ - if (dest->ip_specialreply && dest->ip_sync_bootstrap_checkin) { + if (ip_is_special_reply_port(dest) && dest->ip_sync_bootstrap_checkin) { port->ip_sync_bootstrap_checkin = 1; } - ip_reference(dest); - - /* port transitions to IN-TRANSIT state */ - assert(port->ip_receiver_name == MACH_PORT_NULL); - port->ip_destination = dest; + ipc_port_mark_in_transit(port, dest); /* must have been in limbo or still bound to a task */ assert(port->ip_tempowner != 0); @@ -2394,8 +2385,7 @@ not_circular: /* port is in transit */ - require_ip_active(dest); - assert(ip_in_transit(dest)); + ipc_release_assert(ip_in_transit(dest)); assert(dest->ip_tempowner == 0); next = ip_get_destination(dest); @@ -2560,7 +2550,7 @@ ipc_importance_send( mach_voucher_attr_value_handle_array_size_t val_count; ipc_voucher_t voucher; - assert(ip_kotype(voucher_port) == IKOT_VOUCHER); + assert(ip_type(voucher_port) == IKOT_VOUCHER); voucher = (ipc_voucher_t)ipc_kobject_get_raw(voucher_port, IKOT_VOUCHER); diff --git a/osfmk/ipc/ipc_init.c b/osfmk/ipc/ipc_init.c index 73ecf8a70..332a258ed 100644 --- a/osfmk/ipc/ipc_init.c +++ b/osfmk/ipc/ipc_init.c @@ -97,7 +97,6 @@ #include #include #include -#include #include #include @@ -112,18 +111,6 @@ const vm_size_t ipc_kmsg_max_vm_space = ((IPC_KERNEL_COPY_MAP_SIZE * 7) / 8); #define IPC_KERNEL_MAP_SIZE (CONFIG_IPC_KERNEL_MAP_SIZE << 20) -/* Note: Consider Developer Mode when changing the default. 
*/ -#if XNU_TARGET_OS_OSX -#define IPC_CONTROL_PORT_OPTIONS_DEFAULT (ICP_OPTIONS_IMMOVABLE_1P_HARD | ICP_OPTIONS_PINNED_1P_HARD) -#else -#define IPC_CONTROL_PORT_OPTIONS_DEFAULT (ICP_OPTIONS_IMMOVABLE_ALL_HARD | \ - ICP_OPTIONS_PINNED_1P_HARD | \ - ICP_OPTIONS_PINNED_3P_SOFT) -#endif - -TUNABLE(ipc_control_port_options_t, ipc_control_port_options, - "ipc_control_port_options", IPC_CONTROL_PORT_OPTIONS_DEFAULT); - LCK_GRP_DECLARE(ipc_lck_grp, "ipc"); LCK_ATTR_DECLARE(ipc_lck_attr, 0, 0); @@ -152,15 +139,10 @@ __startup_func static void ipc_init(void) { - kern_return_t kr; - /* create special spaces */ - kr = ipc_space_create_special(&ipc_space_kernel); - assert(kr == KERN_SUCCESS); - - kr = ipc_space_create_special(&ipc_space_reply); - assert(kr == KERN_SUCCESS); + ipc_space_kernel = ipc_space_create_special(); + ipc_space_reply = ipc_space_create_special(); /* initialize modules with hidden data structures */ @@ -168,22 +150,6 @@ ipc_init(void) arcade_init(); #endif - bool pinned_control_port_enabled_1p = !!(ipc_control_port_options & ICP_OPTIONS_1P_PINNED); - bool immovable_control_port_enabled_1p = !!(ipc_control_port_options & ICP_OPTIONS_1P_IMMOVABLE); - - bool pinned_control_port_enabled_3p = !!(ipc_control_port_options & ICP_OPTIONS_3P_PINNED); - bool immovable_control_port_enabled_3p = !!(ipc_control_port_options & ICP_OPTIONS_3P_IMMOVABLE); - - if (pinned_control_port_enabled_1p && !immovable_control_port_enabled_1p) { - kprintf("Invalid ipc_control_port_options boot-arg: pinned control port cannot be enabled without immovability enforcement. Ignoring 1p pinning boot-arg."); - ipc_control_port_options &= ~ICP_OPTIONS_1P_PINNED; - } - - if (pinned_control_port_enabled_3p && !immovable_control_port_enabled_3p) { - kprintf("Invalid ipc_control_port_options boot-arg: pinned control port cannot be enabled without immovability enforcement. Ignoring 3p pinning boot-arg."); - ipc_control_port_options &= ~ICP_OPTIONS_3P_PINNED; - } - ipc_kernel_map = kmem_suballoc(kernel_map, &ipc_kernel_range.min_address, IPC_KERNEL_MAP_SIZE, VM_MAP_CREATE_PAGEABLE, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL, @@ -201,4 +167,6 @@ ipc_init(void) ipc_host_init(); ux_handler_init(); } +#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* unittests don't support creating submap in kernel_map */ STARTUP(MACH_IPC, STARTUP_RANK_LAST, ipc_init); +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ diff --git a/osfmk/ipc/ipc_init.h b/osfmk/ipc/ipc_init.h deleted file mode 100644 index 0559a4952..000000000 --- a/osfmk/ipc/ipc_init.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * HISTORY - * - * Revision 1.1.1.1 1998/09/22 21:05:29 wsanchez - * Import of Mac OS X kernel (~semeria) - * - * Revision 1.1.1.1 1998/03/07 02:26:15 wsanchez - * Import of OSF Mach kernel (~mburg) - * - * Revision 1.1.6.1 1994/09/23 02:07:56 ezf - * change marker to not FREE - * [1994/09/22 21:29:04 ezf] - * - * Revision 1.1.2.4 1993/07/22 16:16:03 rod - * Add ANSI prototypes. CR #9523. - * [1993/07/22 13:29:57 rod] - * - * Revision 1.1.2.3 1993/06/07 22:10:25 jeffc - * CR9176 - ANSI C violations: trailing tokens on CPP - * directives, extra semicolons after decl_ ..., asm keywords - * [1993/06/07 19:01:24 jeffc] - * - * Revision 1.1.2.2 1993/06/02 23:31:04 jeffc - * Added to OSF/1 R1.3 from NMK15.0. - * [1993/06/02 21:09:31 jeffc] - * - * Revision 1.1 1992/09/30 02:28:50 robert - * Initial revision - * - * $EndLog$ - */ -/* CMU_HIST */ -/* - * Revision 2.4 91/05/14 16:32:45 mrt - * Correcting copyright - * - * Revision 2.3 91/02/05 17:21:42 mrt - * Changed to new Mach copyright - * [91/02/01 15:45:16 mrt] - * - * Revision 2.2 90/06/02 14:49:59 rpd - * Created for new IPC. - * [90/03/26 20:55:26 rpd] - * - */ -/* CMU_ENDHIST */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ -/* - * File: ipc/ipc_init.h - * Author: Rich Draves - * Date: 1989 - * - * Declarations of functions to initialize the IPC system. 
- */ - -#ifndef _IPC_IPC_INIT_H_ -#define _IPC_IPC_INIT_H_ - -/* These boot-args decide whether control port is immovable and/or pinned */ -__options_decl(ipc_control_port_options_t, uint32_t, { - ICP_OPTIONS_NONE = 0x00, - - /* Must be in the same order as task_control_port_options_t (mach/task_info.h) */ - ICP_OPTIONS_PINNED_1P_SOFT = 0x01, - ICP_OPTIONS_PINNED_1P_HARD = 0x02, - ICP_OPTIONS_IMMOVABLE_1P_SOFT = 0x04, - ICP_OPTIONS_IMMOVABLE_1P_HARD = 0x08, - - ICP_OPTIONS_PINNED_3P_SOFT = 0x10, - ICP_OPTIONS_PINNED_3P_HARD = 0x20, - ICP_OPTIONS_IMMOVABLE_3P_SOFT = 0x40, - ICP_OPTIONS_IMMOVABLE_3P_HARD = 0x80, - - ICP_OPTIONS_PINNED_ALL_HARD = ICP_OPTIONS_PINNED_1P_HARD | ICP_OPTIONS_PINNED_3P_HARD, - ICP_OPTIONS_PINNED_ALL_SOFT = ICP_OPTIONS_PINNED_1P_SOFT | ICP_OPTIONS_PINNED_3P_SOFT, - - ICP_OPTIONS_IMMOVABLE_ALL_HARD = ICP_OPTIONS_IMMOVABLE_1P_HARD | ICP_OPTIONS_IMMOVABLE_3P_HARD, - ICP_OPTIONS_IMMOVABLE_ALL_SOFT = ICP_OPTIONS_IMMOVABLE_1P_SOFT | ICP_OPTIONS_IMMOVABLE_3P_SOFT, -}); - -#define ICP_OPTIONS_1P_MASK (ICP_OPTIONS_PINNED_1P_HARD | \ - ICP_OPTIONS_PINNED_1P_SOFT | \ - ICP_OPTIONS_IMMOVABLE_1P_HARD | \ - ICP_OPTIONS_IMMOVABLE_1P_SOFT) - -#define ICP_OPTIONS_1P_PINNED (ICP_OPTIONS_PINNED_1P_HARD | ICP_OPTIONS_PINNED_1P_SOFT) -#define ICP_OPTIONS_1P_IMMOVABLE (ICP_OPTIONS_IMMOVABLE_1P_HARD | ICP_OPTIONS_IMMOVABLE_1P_SOFT) - -#define ICP_OPTIONS_3P_MASK (ICP_OPTIONS_PINNED_3P_HARD | \ - ICP_OPTIONS_PINNED_3P_SOFT | \ - ICP_OPTIONS_IMMOVABLE_3P_HARD | \ - ICP_OPTIONS_IMMOVABLE_3P_SOFT) - -#define ICP_OPTIONS_3P_PINNED (ICP_OPTIONS_PINNED_3P_HARD | ICP_OPTIONS_PINNED_3P_SOFT) -#define ICP_OPTIONS_3P_IMMOVABLE (ICP_OPTIONS_IMMOVABLE_3P_HARD | ICP_OPTIONS_IMMOVABLE_3P_SOFT) - -#define ICP_OPTIONS_3P_SHIFT 4 - -#endif /* _IPC_IPC_INIT_H_ */ diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index 8381fbdd6..3361c94fc 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -117,11 +117,6 @@ #include #include -#if MACH_FLIPC -#include -#include -#endif - #include #include @@ -131,7 +126,6 @@ #include #include -#include #include #if __has_feature(ptrauth_calls) @@ -189,8 +183,6 @@ ZONE_DEFINE_ID(ZONE_ID_IPC_KMSG, "ipc kmsgs", struct ipc_kmsg, KALLOC_TYPE_VAR_DEFINE(KT_IPC_KMSG_KDATA_OOL, mach_msg_base_t, mach_msg_kdescriptor_t, KT_DEFAULT); -static TUNABLE(bool, enforce_strict_reply, "ipc_strict_reply", false); - #pragma mark ipc_kmsg layout and accessors @@ -413,7 +405,7 @@ ipc_kmsg_set_voucher_port( mach_msg_type_name_t type) { if (IP_VALID(voucher_port)) { - assert(ip_kotype(voucher_port) == IKOT_VOUCHER); + assert(ip_type(voucher_port) == IKOT_VOUCHER); } kmsg->ikm_voucher_port = voucher_port; kmsg->ikm_voucher_type = type; @@ -1444,7 +1436,7 @@ ipc_kmsg_set_qos( kr = KERN_SUCCESS; if (IP_VALID(special_reply_port) && - special_reply_port->ip_specialreply && + ip_is_special_reply_port(special_reply_port) && !ip_is_kobject(dest_port) && MACH_MSGH_BITS_LOCAL(hdr->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) { boolean_t sync_bootstrap_checkin = !!(options & MACH_SEND_SYNC_BOOTSTRAP_CHECKIN); @@ -1512,82 +1504,6 @@ ipc_kmsg_link_reply_context_locked( return; } -static kern_return_t -ipc_kmsg_validate_reply_port_locked( - ipc_port_t reply_port, - mach_msg_option64_t options) -{ - ip_mq_lock_held(reply_port); - - if (!ip_active(reply_port)) { - /* - * Ideally, we would enforce that the reply receive right is - * active, but asynchronous XPC cancellation destroys the - * receive right, so we just have to return success here. 
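The removed ipc_control_port_options handling treated pinning as dependent on immovability: a pinned control port without immovability enforcement was rejected and the pinning bits were cleared. A small sketch of that dependency check, reusing the 1p flag values from the deleted header above (only the first-party case is modeled here):

    #include <stdio.h>
    #include <stdint.h>

    #define ICP_OPTIONS_PINNED_1P_SOFT     0x01u
    #define ICP_OPTIONS_PINNED_1P_HARD     0x02u
    #define ICP_OPTIONS_IMMOVABLE_1P_SOFT  0x04u
    #define ICP_OPTIONS_IMMOVABLE_1P_HARD  0x08u
    #define ICP_OPTIONS_1P_PINNED    (ICP_OPTIONS_PINNED_1P_HARD | ICP_OPTIONS_PINNED_1P_SOFT)
    #define ICP_OPTIONS_1P_IMMOVABLE (ICP_OPTIONS_IMMOVABLE_1P_HARD | ICP_OPTIONS_IMMOVABLE_1P_SOFT)

    /* pinning only makes sense on top of immovability; drop it otherwise */
    static uint32_t
    sanitize_icp_options(uint32_t opts)
    {
        if ((opts & ICP_OPTIONS_1P_PINNED) && !(opts & ICP_OPTIONS_1P_IMMOVABLE)) {
            fprintf(stderr, "pinning requires immovability; ignoring 1p pinning\n");
            opts &= ~ICP_OPTIONS_1P_PINNED;
        }
        return opts;
    }

    int
    main(void)
    {
        /* pinned-hard without any immovable bit: pinning gets stripped */
        uint32_t opts = sanitize_icp_options(ICP_OPTIONS_PINNED_1P_HARD);
        printf("sanitized options: 0x%02x\n", opts);   /* prints 0x00 */
        return 0;
    }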
- */ - return KERN_SUCCESS; - } - - if (options & MACH_SEND_MSG) { - /* - * If the rely port is active, then it should not be - * in-transit, and the receive right should be in the caller's - * IPC space. - */ - if (!ip_in_space(reply_port, current_task()->itk_space)) { - return KERN_INVALID_CAPABILITY; - } - - /* - * A port used as a reply port in an RPC should have exactly 1 - * extant send-once right which we either just made or are - * moving as part of the IPC. - */ - if (reply_port->ip_sorights != 1) { - return KERN_INVALID_CAPABILITY; - } - /* - * XPC uses an extra send-right to keep the name of the reply - * right around through cancellation. That makes it harder to - * enforce a particular semantic kere, so for now, we say that - * you can have a maximum of 1 send right (in addition to your - * send once right). In the future, it would be great to lock - * this down even further. - */ - if (reply_port->ip_srights > 1) { - return KERN_INVALID_CAPABILITY; - } - - /* - * The sender can also specify that the receive right should - * be immovable. Note that this check only applies to - * send-only operations. Combined send/receive or rcv-only - * operations can specify an immovable receive right by - * opt-ing into guarded descriptors (MACH_RCV_GUARDED_DESC) - * and using the MACH_MSG_STRICT_REPLY options flag. - */ - if (MACH_SEND_REPLY_IS_IMMOVABLE(options)) { - if (!reply_port->ip_immovable_receive) { - return KERN_INVALID_CAPABILITY; - } - } - } - - /* - * don't enforce this yet: need a better way of indicating the - * receiver wants this... - */ -#if 0 - if (MACH_RCV_WITH_IMMOVABLE_REPLY(options)) { - if (!reply_port->ip_immovable_receive) { - return KERN_INVALID_CAPABILITY; - } - } -#endif /* 0 */ - - return KERN_SUCCESS; -} - /* * Routine: ipc_kmsg_validate_reply_context_locked * Purpose: @@ -1617,7 +1533,7 @@ ipc_kmsg_validate_reply_context_locked( if (voucher == IPC_VOUCHER_NULL || !MACH_PORT_VALID(voucher_name)) { if ((option & MACH_SEND_KERNEL) == 0) { mach_port_guard_exception(voucher_name, - (MPG_FLAGS_STRICT_REPLY_INVALID_VOUCHER | dest_ctx), + MPG_PAYLOAD(MPG_FLAGS_STRICT_REPLY_INVALID_VOUCHER, dest_ctx), kGUARD_EXC_STRICT_REPLY); } return MACH_SEND_INVALID_CONTEXT; @@ -1631,7 +1547,8 @@ ipc_kmsg_validate_reply_context_locked( if (dest_ctx != persona_id) { if ((option & MACH_SEND_KERNEL) == 0) { mach_port_guard_exception(voucher_name, - (MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA | ((((uint64_t)persona_id << 32) & MPG_FLAGS_STRICT_REPLY_MASK) | dest_ctx)), + MPG_PAYLOAD(MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA, + persona_id, dest_ctx), kGUARD_EXC_STRICT_REPLY); } return MACH_SEND_INVALID_CONTEXT; @@ -1694,6 +1611,7 @@ typedef struct { mach_msg_type_name_t reply_type; ipc_port_t reply_port; ipc_copyin_cleanup_t reply_cleanup; + ipc_entry_bits_t reply_bits; /* for debugging purpose */ mach_port_name_t voucher_name; mach_msg_type_name_t voucher_type; @@ -1701,7 +1619,6 @@ typedef struct { ipc_copyin_cleanup_t voucher_cleanup; ipc_table_index_t dest_request; - ipc_policy_violation_id_t reply_port_semantics_violation; } ikm_copyinhdr_state_t; /* @@ -1714,7 +1631,7 @@ typedef struct { static mach_msg_return_t ipc_kmsg_copyin_header_validate( ipc_kmsg_t kmsg, - mach_msg_option64_t options, + __unused mach_msg_option64_t options, ikm_copyinhdr_state_t *st) { mach_msg_header_t *msg = ikm_header(kmsg); @@ -1803,21 +1720,9 @@ ipc_kmsg_copyin_header_validate( } } - if (enforce_strict_reply && - MACH_SEND_WITH_STRICT_REPLY(options) && - (!MACH_PORT_VALID(st->reply_name) || - 
!MACH_MSG_TYPE_PORT_ANY_SEND_ONCE(st->reply_type))) { - /* - * The caller cannot enforce a reply context with an invalid - * reply port name, or a non-send_once reply disposition. - */ - mach_port_guard_exception(st->reply_name, - (MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_DISP | st->reply_type), - kGUARD_EXC_STRICT_REPLY); - return MACH_SEND_INVALID_REPLY; - } - if (MACH_PORT_VALID(st->reply_name) && st->reply_name == st->voucher_name) { + /* Special case where the voucher name == reply name */ + st->reply_bits = -1; return MACH_SEND_INVALID_REPLY; } @@ -1977,7 +1882,10 @@ ipc_kmsg_copyin_header_rights( ipc_entry_t voucher_entry = IE_NULL; mach_msg_type_name_t dest_type; ipc_object_copyin_flags_t dest_xtra; - kern_return_t kr; + kern_return_t kr = KERN_SUCCESS; + /* for service port immovability violation */ + ipc_port_t violation_port = MACH_PORT_NULL; + mach_port_name_t violation_name = 0; is_write_lock(space); if (__improbable(!is_active(space))) { @@ -2002,7 +1910,7 @@ ipc_kmsg_copyin_header_rights( if (voucher_entry == IE_NULL || (voucher_entry->ie_bits & MACH_PORT_TYPE_SEND) == 0 || - ip_kotype(voucher_entry->ie_port) != IKOT_VOUCHER) { + ip_type(voucher_entry->ie_port) != IKOT_VOUCHER) { is_write_unlock(space); return MACH_SEND_INVALID_VOUCHER; } @@ -2026,6 +1934,9 @@ ipc_kmsg_copyin_header_rights( } else { reply_entry = ipc_entry_lookup(space, st->reply_name); } + if (reply_entry != IE_NULL) { + st->reply_bits = reply_entry->ie_bits; + } if (__improbable(reply_entry == IE_NULL || (reply_entry->ie_bits & MACH_PORT_TYPE_PORT_RIGHTS) == 0)) { is_write_unlock(space); @@ -2033,8 +1944,7 @@ ipc_kmsg_copyin_header_rights( } if (__improbable(!ipc_right_copyin_check_reply(space, - st->reply_name, reply_entry, st->reply_type, dest_entry, - &st->reply_port_semantics_violation))) { + st->reply_name, reply_entry, st->reply_type))) { is_write_unlock(space); return MACH_SEND_INVALID_REPLY; } @@ -2052,18 +1962,16 @@ ipc_kmsg_copyin_header_rights( dest_type = ipc_kmsg_copyin_dest_disposition(st, &dest_xtra); kr = ipc_right_copyin(space, st->dest_name, dest_type, - IPC_OBJECT_COPYIN_FLAGS_ALLOW_REPLY_MAKE_SEND_ONCE | - IPC_OBJECT_COPYIN_FLAGS_ALLOW_REPLY_MOVE_SEND_ONCE | IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND | - IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE | - dest_xtra, dest_entry, + dest_xtra, IPC_COPYIN_KMSG_DESTINATION, dest_entry, &st->dest_port, &st->dest_cleanup, NULL); if (kr == KERN_SUCCESS) { assert(IP_VALID(st->dest_port)); assert(!IP_VALID(st->dest_cleanup.icc_release_port)); } else { ipc_space_unlock(space); - return MACH_SEND_INVALID_DEST; + kr = MACH_SEND_INVALID_DEST; + goto send_telemetry; } /* @@ -2073,7 +1981,7 @@ ipc_kmsg_copyin_header_rights( st->voucher_port = st->dest_port; } else if (st->voucher_name) { kr = ipc_right_copyin(space, st->voucher_name, st->voucher_type, - IPC_OBJECT_COPYIN_FLAGS_NONE, voucher_entry, + IPC_OBJECT_COPYIN_FLAGS_NONE, IPC_COPYIN_KMSG_VOUCHER, voucher_entry, &st->voucher_port, &st->voucher_cleanup, NULL); ipc_release_assert(kr == KERN_SUCCESS); @@ -2084,8 +1992,7 @@ ipc_kmsg_copyin_header_rights( st->reply_port = st->dest_port; } else if (MACH_PORT_VALID(st->reply_name)) { kr = ipc_right_copyin(space, st->reply_name, st->reply_type, - IPC_OBJECT_COPYIN_FLAGS_ALLOW_REPLY_MAKE_SEND_ONCE | - IPC_OBJECT_COPYIN_FLAGS_DEADOK, reply_entry, + IPC_OBJECT_COPYIN_FLAGS_DEADOK, IPC_COPYIN_KMSG_REPLY, reply_entry, &st->reply_port, &st->reply_cleanup, NULL); /* @@ -2112,7 +2019,32 @@ ipc_kmsg_copyin_header_rights( is_write_unlock(space); - return KERN_SUCCESS; 
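The persona-mismatch path above used to pack its diagnostic payload by hand, placing the persona id in the upper 32 bits of the guard payload next to the destination context and a flags constant; the hunk replaces that with the MPG_PAYLOAD() helper, whose definition is not part of this diff. A sketch of the hand-packing being replaced, with purely illustrative constants in place of the real MPG_* values:

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative values; the real MPG_* constants live in xnu headers */
    #define FLAGS_MISMATCHED_PERSONA 0xf000000000000000ull
    #define STRICT_REPLY_MASK        0x0fffffff00000000ull

    /* pack: flags bits | persona id (upper word, masked) | dest context (lower word) */
    static uint64_t
    pack_guard_payload(uint32_t persona_id, uint32_t dest_ctx)
    {
        return FLAGS_MISMATCHED_PERSONA |
               (((uint64_t)persona_id << 32) & STRICT_REPLY_MASK) |
               (uint64_t)dest_ctx;
    }

    int
    main(void)
    {
        printf("payload = 0x%016llx\n",
            (unsigned long long)pack_guard_payload(501, 7));
        return 0;
    }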
+send_telemetry: + if (IP_VALID(st->dest_port) && + ip_type(st->dest_port) == IOT_SERVICE_PORT && + st->dest_type == MACH_MSG_TYPE_MOVE_RECEIVE) { + violation_port = st->dest_port; + violation_name = st->dest_name; + } else if (IP_VALID(st->voucher_port) && + ip_type(st->voucher_port) == IOT_SERVICE_PORT && + st->voucher_type == MACH_MSG_TYPE_MOVE_RECEIVE) { + violation_port = st->voucher_port; + violation_name = st->voucher_name; + } else if (IP_VALID(st->reply_port) && + ip_type(st->reply_port) == IOT_SERVICE_PORT && + st->reply_type == MACH_MSG_TYPE_MOVE_RECEIVE) { + violation_port = st->reply_port; + violation_name = st->reply_name; + } + + if (violation_port && + !task_is_initproc(space->is_task) && + !ipc_space_has_telemetry_type(space, IS_HAS_SERVICE_PORT_TELEMETRY)) { + ipc_stash_policy_violations_telemetry(IPCPV_MOVE_SERVICE_PORT, + violation_port, violation_name); + } + + return kr; } /* @@ -2138,7 +2070,6 @@ ipc_kmsg_copyin_header_rights( * MACH_SEND_INVALID_REPLY Can't copyin reply port. * (Either KERN_INVALID_NAME or KERN_INVALID_RIGHT.) */ - static mach_msg_return_t ipc_kmsg_copyin_header( ipc_kmsg_t kmsg, @@ -2148,7 +2079,6 @@ ipc_kmsg_copyin_header( { mach_msg_option64_t options = *option64p; ikm_copyinhdr_state_t st = { }; - struct mach_service_port_info sp_info = {}; bool needboost = false; kern_return_t kr; @@ -2159,11 +2089,12 @@ ipc_kmsg_copyin_header( if (__improbable(kr != KERN_SUCCESS)) { if (kr == MACH_SEND_INVALID_VOUCHER) { - mach_port_guard_exception(st.voucher_name, 0, + mach_port_guard_exception(st.voucher_name, st.voucher_type, kGUARD_EXC_SEND_INVALID_VOUCHER); } if (kr == MACH_SEND_INVALID_REPLY) { - mach_port_guard_exception(st.reply_name, 0, + mach_port_guard_exception(st.reply_name, + MPG_PAYLOAD(MPG_FLAGS_NONE, st.reply_bits, st.reply_type), kGUARD_EXC_SEND_INVALID_REPLY); } ipc_kmsg_copyin_header_cleanup(&st); @@ -2193,16 +2124,6 @@ ipc_kmsg_copyin_header( st.dest_type = ipc_object_copyin_type(st.dest_type); st.reply_type = ipc_object_copyin_type(st.reply_type); -#if CONFIG_SERVICE_PORT_INFO - /* - * Service name is later used in CA telemetry in case of reply port security semantics violations. - */ - if (ip_active(st.dest_port) && st.dest_port->ip_service_port) { - assert(st.dest_port->ip_splabel); - ipc_service_port_label_get_info(st.dest_port->ip_splabel, &sp_info); - } -#endif /* CONFIG_SERVICE_PORT_INFO */ - if (!ip_active(st.dest_port) || (ip_is_kobject(st.dest_port) && ip_in_space(st.dest_port, ipc_space_kernel))) { @@ -2214,7 +2135,6 @@ ipc_kmsg_copyin_header( * * See: ipc_object_copyin(). */ - assert(ip_kotype(st.dest_port) != IKOT_TIMER); kmsg->ikm_flags |= IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND; } @@ -2271,56 +2191,6 @@ ipc_kmsg_copyin_header( ipc_kmsg_copyin_header_cleanup(&st); - if (enforce_strict_reply && MACH_SEND_WITH_STRICT_REPLY(options) && - IP_VALID(st.msg->msgh_local_port)) { - /* - * We've already validated that the reply disposition is a - * [make/move] send-once. Ideally, we should enforce that the - * reply port is also not dead, but XPC asynchronous - * cancellation can make the reply port dead before we - * actually make it to the mach_msg send. - * - * Here, we ensure that if we have a non-dead reply port, then - * the reply port's receive right should not be in-transit, - * and should live in the caller's IPC space. 
- */ - ipc_port_t rport = st.msg->msgh_local_port; - ip_mq_lock(rport); - kr = ipc_kmsg_validate_reply_port_locked(rport, options); - ip_mq_unlock(rport); - if (kr != KERN_SUCCESS) { - /* - * no descriptors have been copied in yet, but the - * full header has been copied in: clean it up - */ - ipc_kmsg_clean_header(kmsg); - if ((options & MACH_SEND_KERNEL) == 0) { - mach_port_guard_exception(st.reply_name, - (MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_PORT | kr), - kGUARD_EXC_STRICT_REPLY); - } - return MACH_SEND_INVALID_REPLY; - } - } - - if (st.reply_port_semantics_violation) { - /* Currently rate limiting it to sucess paths only. */ - task_t task = current_task_early(); - if (task && st.reply_port_semantics_violation == IPCPV_REPLY_PORT_SEMANTICS) { - task_lock(task); - if (!task_has_reply_port_telemetry(task)) { - /* Crash report rate limited to once per task per host. */ - mach_port_guard_exception(st.reply_name, 0, - kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS); - task_set_reply_port_telemetry(task); - } - task_unlock(task); - } - - ipc_stash_policy_violations_telemetry(st.reply_port_semantics_violation, - &sp_info, st.msg->msgh_id); - } - return MACH_MSG_SUCCESS; } @@ -2362,11 +2232,13 @@ ipc_kmsg_copyin_port_descriptor( result_disp = ipc_object_copyin_type(user_disp); if (MACH_PORT_VALID(name)) { - kr = ipc_object_copyin(space, name, user_disp, - kmsg->ikm_flags, NULL, &port); + kr = ipc_object_copyin(space, name, user_disp, kmsg->ikm_flags, + IPC_COPYIN_KMSG_PORT_DESCRIPTOR, NULL, &port); if (kr != KERN_SUCCESS) { if (kr == KERN_INVALID_RIGHT) { - mach_port_guard_exception(name, 0, kGUARD_EXC_SEND_INVALID_RIGHT); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_SEND_INVALID_RIGHT_PORT, user_disp), + kGUARD_EXC_SEND_INVALID_RIGHT); } return MACH_SEND_INVALID_RIGHT; } @@ -2558,7 +2430,8 @@ ipc_kmsg_copyin_ool_ports_descriptor( vm_map_t map, ipc_space_t space, ipc_port_t dest_port, - ipc_kmsg_t kmsg) + ipc_kmsg_t kmsg, + mach_msg_option64_t options) { mach_msg_type_name_t user_disp = dsc->disposition; mach_msg_size_t count = dsc->count; @@ -2566,10 +2439,26 @@ ipc_kmsg_copyin_ool_ports_descriptor( mach_port_array_t array = NULL; mach_port_name_t *names; mach_vm_size_t names_size; + ipc_space_policy_t current_policy; result_disp = ipc_object_copyin_type(user_disp); names_size = count * sizeof(mach_port_name_t); + /* + * For enhanced v2 binaries, we restrict sending OOL + * port array with any disposition besdies COPY_SEND. 
+ */ + current_policy = ipc_convert_msg_options_to_space(options); + if (ool_port_array_enforced && + ipc_should_apply_policy(current_policy, IPC_POLICY_ENHANCED_V2) && + (user_disp != MACH_MSG_TYPE_COPY_SEND)) { + mach_port_guard_exception(current_policy, + MPG_PAYLOAD(MPG_FLAGS_INVALID_OPTIONS_OOL_DISP, user_disp), + kGUARD_EXC_DESCRIPTOR_VIOLATION); + + return MACH_SEND_INVALID_OPTIONS; + } + if (count) { array = mach_port_array_alloc(count, Z_WAITOK | Z_SPRAYQTN); @@ -2596,8 +2485,8 @@ ipc_kmsg_copyin_ool_ports_descriptor( continue; } - kr = ipc_object_copyin(space, name, user_disp, - kmsg->ikm_flags, NULL, &port); + kr = ipc_object_copyin(space, name, user_disp, kmsg->ikm_flags, + IPC_COPYIN_KMSG_OOL_PORT_ARRAY_DESCRIPTOR, NULL, &port); if (kr != KERN_SUCCESS) { for (mach_msg_size_t j = 0; j < i; j++) { @@ -2609,7 +2498,9 @@ ipc_kmsg_copyin_ool_ports_descriptor( mach_port_array_free(array, count); if (kr == KERN_INVALID_RIGHT) { - mach_port_guard_exception(name, 0, kGUARD_EXC_SEND_INVALID_RIGHT); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_SEND_INVALID_RIGHT_OOL_PORT, user_disp), + kGUARD_EXC_SEND_INVALID_RIGHT); } return MACH_SEND_INVALID_RIGHT; } @@ -2693,11 +2584,13 @@ ipc_kmsg_copyin_guarded_port_descriptor( result_disp = ipc_object_copyin_type(user_disp); if (MACH_PORT_VALID(name)) { - kr = ipc_object_copyin(space, name, user_disp, - kmsg->ikm_flags, dsc, &port); + kr = ipc_object_copyin(space, name, user_disp, kmsg->ikm_flags, + IPC_COPYIN_KMSG_GUARDED_PORT_DESCRIPTOR, dsc, &port); if (kr != KERN_SUCCESS) { if (kr == KERN_INVALID_RIGHT) { - mach_port_guard_exception(name, 0, kGUARD_EXC_SEND_INVALID_RIGHT); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_SEND_INVALID_RIGHT_GUARDED, user_disp), + kGUARD_EXC_SEND_INVALID_RIGHT); } return MACH_SEND_INVALID_RIGHT; } @@ -2874,6 +2767,16 @@ ipc_kmsg_measure_descriptors_from_user( if (!ipc_kmsg_user_desc_type_is_valid(dtype, options)) { return MACH_SEND_INVALID_TYPE; } + + if (dtype == MACH_MSG_OOL_PORTS_DESCRIPTOR) { + /* + * No need to check for int overflow here, since due to kmsg + * restrictions and sanitization, it's not possible to have + * more than 2**32-1 arrays. + */ + send_uctx->send_dsc_port_arrays_count++; + } + dsize = ikm_user_desc_size(dtype, isU64); if (dsize == USER_DESC_SIZE_MAX) { mask |= bit; @@ -2909,7 +2812,6 @@ ipc_kmsg_measure_descriptors_from_user( * MACH_SEND_MSG_TOO_SMALL Body is too small for types/data. * MACH_SEND_INVALID_RT_OOL_SIZE OOL Buffer too large for RT * MACH_MSG_INVALID_RT_DESCRIPTOR Dealloc and RT are incompatible - * MACH_SEND_NO_GRANT_DEST Dest port doesn't accept ports in body */ static mach_msg_return_t @@ -2917,7 +2819,8 @@ ipc_kmsg_copyin_body( ipc_kmsg_t kmsg, mach_msg_send_uctx_t *send_uctx, ipc_space_t space, - vm_map_t map) + vm_map_t map, + mach_msg_option64_t options) { mach_msg_type_number_t dsc_count = send_uctx->send_dsc_count; vm_size_t psize = send_uctx->send_dsc_vm_size; @@ -2944,15 +2847,6 @@ ipc_kmsg_copyin_body( } } - /* - * Receive right of a libxpc connection port is moved as a part of kmsg's body - * 1. from a client to a service during connection etsablishment. - * 2. back to the client on service's death or port deallocation. - * - * Any other attempt to move this receive right is not allowed. 
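The new check above rejects out-of-line port arrays whose disposition is anything other than COPY_SEND when the sending task has opted into the stricter enhanced-v2 IPC policy. A standalone sketch of that gating logic; the policy and disposition constants are hypothetical stand-ins for the Mach ones, and the boolean mirrors the ool_port_array_enforced gate referenced in the hunk:

    #include <stdbool.h>
    #include <stdio.h>

    enum disposition { DISP_COPY_SEND, DISP_MOVE_SEND, DISP_MOVE_RECEIVE };
    enum policy      { POLICY_LEGACY, POLICY_ENHANCED_V2 };

    static bool ool_port_array_enforced = true;   /* mirrors the tunable gate */

    /* returns true when the descriptor may be copied in */
    static bool
    ool_port_array_disposition_allowed(enum policy pol, enum disposition disp)
    {
        if (ool_port_array_enforced &&
            pol == POLICY_ENHANCED_V2 &&
            disp != DISP_COPY_SEND) {
            return false;   /* caller raises a guard exception and fails the send */
        }
        return true;
    }

    int
    main(void)
    {
        printf("%d\n", ool_port_array_disposition_allowed(POLICY_ENHANCED_V2, DISP_MOVE_SEND)); /* 0 */
        printf("%d\n", ool_port_array_disposition_allowed(POLICY_ENHANCED_V2, DISP_COPY_SEND)); /* 1 */
        printf("%d\n", ool_port_array_disposition_allowed(POLICY_LEGACY,      DISP_MOVE_SEND)); /* 1 */
        return 0;
    }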
- */ - kmsg->ikm_flags |= IPC_OBJECT_COPYIN_FLAGS_ALLOW_CONN_IMMOVABLE_RECEIVE; - for (mach_msg_size_t copied_in_dscs = 0; copied_in_dscs < dsc_count; copied_in_dscs++) { mach_msg_kdescriptor_t *kdesc = &kbase->msgb_dsc_array[copied_in_dscs]; mach_msg_return_t mr; @@ -2969,7 +2863,7 @@ ipc_kmsg_copyin_body( break; case MACH_MSG_OOL_PORTS_DESCRIPTOR: mr = ipc_kmsg_copyin_ool_ports_descriptor(&kdesc->kdesc_port_array, - map, space, dest_port, kmsg); + map, space, dest_port, kmsg, options); break; case MACH_MSG_GUARDED_PORT_DESCRIPTOR: mr = ipc_kmsg_copyin_guarded_port_descriptor(&kdesc->kdesc_guarded_port, @@ -3234,7 +3128,7 @@ ipc_kmsg_copyin_from_user( hdr->msgh_id); if (hdr->msgh_bits & MACH_MSGH_BITS_COMPLEX) { - mr = ipc_kmsg_copyin_body(kmsg, send_uctx, space, map); + mr = ipc_kmsg_copyin_body(kmsg, send_uctx, space, map, options); } return mr; @@ -3396,7 +3290,6 @@ ipc_kmsg_copyout_header( uint32_t entries_held = 0; boolean_t need_write_lock = FALSE; - ipc_object_copyout_flags_t reply_copyout_options = IPC_OBJECT_COPYOUT_FLAGS_NONE; kern_return_t kr; assert(IP_VALID(dest)); @@ -3453,17 +3346,18 @@ handle_reply_again: /* Handle reply port. */ if (IP_VALID(reply)) { ipc_port_t reply_subst = IP_NULL; + ipc_object_label_t label; ipc_entry_t entry; - ip_mq_lock_check_aligned(reply); + label = ip_mq_lock_check_aligned(reply); /* Is the reply port still active and allowed to be copied out? */ - if (!ip_active(reply) || - !ip_label_check(space, reply, reply_type, - &reply_copyout_options, &reply_subst)) { + if (!io_state_active(label.io_state) || + !ip_label_check_or_substitute(space, reply, &label, + reply_type, &reply_subst)) { /* clear the context value */ reply->ip_reply_context = 0; - ip_mq_unlock(reply); + ip_mq_unlock_label_put(reply, &label); assert(reply_subst == IP_NULL); release_reply_port = reply; @@ -3478,6 +3372,8 @@ handle_reply_again: * port is unlocked, its right consumed * space is unlocked */ + /* control ports need to be immovable and don't belong here */ + release_assert(!ip_is_tt_control_port(reply_subst)); assert(reply_type == MACH_MSG_TYPE_PORT_SEND); msg->msgh_local_port = reply = reply_subst; goto handle_reply_again; @@ -3508,16 +3404,6 @@ handle_reply_again: * matching context (voucher). */ if (enforce_strict_reply && MACH_RCV_WITH_STRICT_REPLY(option) && IP_VALID(voucher)) { - if (ipc_kmsg_validate_reply_port_locked(reply, option) != KERN_SUCCESS) { - /* if the receiver isn't happy with the reply port: fail the receive. 
*/ - assert(!ip_is_pinned(reply)); - ipc_entry_dealloc(space, ip_to_object(reply), - reply_name, entry); - ip_mq_unlock(reply); - is_write_unlock(space); - ip_release(reply); - return MACH_RCV_INVALID_REPLY; - } ipc_kmsg_link_reply_context_locked(reply, voucher); } else { /* @@ -3529,10 +3415,10 @@ handle_reply_again: reply->ip_reply_context = 0; } - kr = ipc_right_copyout(space, reply, reply_type, - IPC_OBJECT_COPYOUT_FLAGS_NONE, reply_name, entry, - NULL); - assert(kr == KERN_SUCCESS); + ip_label_put(reply, &label); + ipc_right_copyout_any_send(space, reply, reply_type, + IPC_OBJECT_COPYOUT_FLAGS_NONE, reply_name, entry); + kr = KERN_SUCCESS; /* reply port is unlocked */ } else { reply_name = CAST_MACH_PORT_TO_NAME(reply); @@ -3569,9 +3455,11 @@ done_with_reply: ipc_kmsg_clear_voucher_port(kmsg); if ((option & MACH_RCV_VOUCHER) != 0) { + ipc_object_label_t label; ipc_entry_t entry; - ip_mq_lock_check_aligned(voucher); + label = ip_mq_lock_check_aligned(voucher); + ipc_release_assert(label.io_type == IKOT_VOUCHER); if (ipc_right_reverse(space, voucher, &voucher_name, &entry)) { @@ -3583,10 +3471,12 @@ done_with_reply: } /* space is locked and active */ - assert(ip_kotype(voucher) == IKOT_VOUCHER); - kr = ipc_right_copyout(space, voucher, - MACH_MSG_TYPE_MOVE_SEND, IPC_OBJECT_COPYOUT_FLAGS_NONE, - voucher_name, entry, NULL); + assert(label.io_type == IKOT_VOUCHER); + ip_label_put(voucher, &label); + ipc_right_copyout_any_send(space, voucher, + MACH_MSG_TYPE_MOVE_SEND, + IPC_OBJECT_COPYOUT_FLAGS_NONE, + voucher_name, entry); /* voucher port is unlocked */ } else { voucher_type = MACH_MSGH_BITS_ZERO; @@ -3908,13 +3798,7 @@ ipc_kmsg_deflate_port_descriptor( return ipc_kmsg_deflate_put(udesc_end, &udesc); } -#if 0 /* done to avoid merge conflicts, will be cleaned up with RDAR_91262248 */ -} -extern const char *proc_best_name(struct proc *proc); -static mach_msg_descriptor_t * - -#endif static mach_msg_return_t ipc_kmsg_copyout_ool_descriptor( mach_msg_ool_descriptor_t *dsc, @@ -3983,7 +3867,8 @@ ipc_kmsg_copyout_ool_descriptor( } rcv_addr = rounded_addr; - kr = vm_map_copy_overwrite(map, rcv_addr, copy, size, FALSE); + kr = vm_map_copy_overwrite(map, rcv_addr, copy, size, + FALSE); } } else { kr = vm_map_copyout_size(map, &rcv_addr, copy, size); @@ -5171,22 +5056,6 @@ ipc_kmsg_copyin_from_kernel( mach_msg_size_t count = kbase->msgb_dsc_count; mach_msg_kdescriptor_t *kdesc = kbase->msgb_dsc_array; - /* - * Check if the remote port accepts ports in the body. 
- */ - if (remote->ip_no_grant) { - for (mach_msg_size_t i = 0; i < count; i++) { - switch (mach_msg_kdescriptor_type(&kdesc[i])) { - case MACH_MSG_PORT_DESCRIPTOR: - case MACH_MSG_OOL_PORTS_DESCRIPTOR: - case MACH_MSG_GUARDED_PORT_DESCRIPTOR: - /* no descriptors have been copied in yet */ - ipc_kmsg_clean_header(kmsg); - return MACH_SEND_NO_GRANT_DEST; - } - } - } - for (mach_msg_size_t i = 0; i < count; i++) { switch (mach_msg_kdescriptor_type(&kdesc[i])) { case MACH_MSG_PORT_DESCRIPTOR: { @@ -5338,11 +5207,6 @@ retry: */ if (!ip_active(port)) { ip_mq_unlock(port); -#if MACH_FLIPC - if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port->ip_messages.imq_fport)) { - flipc_msg_ack(kmsg->ikm_node, &port->ip_messages, FALSE); - } -#endif if (did_importance) { /* * We're going to pretend we delivered this message @@ -5361,7 +5225,6 @@ retry: } if (ip_in_space(port, ipc_space_kernel)) { - require_ip_active(port); port->ip_messages.imq_seqno++; ip_mq_unlock(port); @@ -5462,11 +5325,6 @@ retry: * as a successful delivery (like we do for an inactive port). */ if (error == MACH_SEND_INVALID_DEST) { -#if MACH_FLIPC - if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port->ip_messages.imq_fport)) { - flipc_msg_ack(kmsg->ikm_node, &port->ip_messages, FALSE); - } -#endif ip_release(port); /* JMM - Future: release right, not just ref */ ipc_kmsg_destroy(kmsg, IPC_KMSG_DESTROY_SKIP_REMOTE); KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_END, MACH_SEND_INVALID_DEST); @@ -5479,11 +5337,6 @@ retry: * pseudo-receive on error conditions. We need to just treat * the message as a successful delivery. */ -#if MACH_FLIPC - if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port->ip_messages.imq_fport)) { - flipc_msg_ack(kmsg->ikm_node, &port->ip_messages, FALSE); - } -#endif ip_release(port); /* JMM - Future: release right, not just ref */ ipc_kmsg_destroy(kmsg, IPC_KMSG_DESTROY_SKIP_REMOTE); KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_END, error); @@ -5595,7 +5448,7 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg, mach_msg_option64_t option) mach_msg_header_t *msg; mach_msg_trailer_t *trailer; - int kotype = 0; + int dest_type = 0; uint32_t msg_size = 0; uint64_t msg_flags = KMSG_TRACE_FLAG_TRACED; uint32_t num_ports = 0; @@ -5725,15 +5578,15 @@ ipc_kmsg_trace_send(ipc_kmsg_t kmsg, mach_msg_option64_t option) msg_flags |= KMSG_TRACE_FLAG_DSTQFULL; } - kotype = ip_kotype(dst_port); + dest_type = ip_type(dst_port); ip_mq_unlock(dst_port); - switch (kotype) { + switch (dest_type) { case IKOT_SEMAPHORE: msg_flags |= KMSG_TRACE_FLAG_SEMA; break; - case IKOT_TIMER: + case IOT_TIMER_PORT: case IKOT_CLOCK: msg_flags |= KMSG_TRACE_FLAG_TIMER; break; diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index 702c90282..0b6c8aa9d 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -147,9 +147,6 @@ struct ipc_kmsg { ipc_port_t XNU_PTRAUTH_SIGNED_PTR("kmsg.ikm_voucher_port") ikm_voucher_port; /* voucher port carried */ struct ipc_importance_elem *ikm_importance; /* inherited from */ queue_chain_t ikm_inheritance; /* inherited from link */ -#if MACH_FLIPC - struct mach_node *ikm_node; /* originating node - needed for ack */ -#endif uint16_t ikm_aux_size; /* size reserved for auxiliary data */ ipc_kmsg_keep_alive_t ikm_keep_alive; /* only used for IKM_TYPE_ALL_INLINED */ uint8_t __ikm_padding; @@ -411,11 +408,6 @@ extern void ipc_kmsg_clear_voucher_port( extern mach_msg_size_t ipc_kmsg_validate_signature( ipc_kmsg_t kmsg) __result_use_check; -#define 
moved_provisional_reply_port(port_type, port) \ - (port_type == MACH_MSG_TYPE_MOVE_RECEIVE && IP_VALID(port) && ip_is_provisional_reply_port(port)) \ - -extern void send_prp_telemetry(int msgh_id); - #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) extern void ipc_kmsg_trace_send( ipc_kmsg_t kmsg, diff --git a/osfmk/ipc/ipc_mqueue.c b/osfmk/ipc/ipc_mqueue.c index dbcf62030..5e075c88d 100644 --- a/osfmk/ipc/ipc_mqueue.c +++ b/osfmk/ipc/ipc_mqueue.c @@ -93,30 +93,27 @@ #include #include -#if MACH_FLIPC -#include -#endif - #ifdef __LP64__ #include #endif #include -extern char *proc_name_address(void *p); +const bool ipc_mqueue_full; /* address is event for queue space */ -int ipc_mqueue_full; /* address is event for queue space */ -int ipc_mqueue_rcv; /* address is event for message arrival */ +KALLOC_TYPE_DEFINE(mqueue_zone, struct ipc_mqueue, KT_DEFAULT); /* forward declarations */ static void ipc_mqueue_receive_results(wait_result_t result); -#if MACH_FLIPC -static void ipc_mqueue_peek_on_thread_locked( - ipc_mqueue_t port_mq, - mach_msg_option64_t option, - thread_t thread); -#endif /* MACH_FLIPC */ +static void ipc_mqueue_select_on_thread_locked( + ipc_mqueue_t mqueue, + mach_msg_option64_t option64, + thread_t thread); + +/* Clear a message count reservation */ +static void ipc_mqueue_release_msgcount( + ipc_mqueue_t mqueue); /* Deliver message to message queue or waiting receiver */ static void ipc_mqueue_post( @@ -237,7 +234,7 @@ ipc_mqueue_add_locked( thread_t th; th = waitq_wakeup64_identify_locked(wqset, IPC_MQUEUE_RECEIVE, - THREAD_AWAKENED, WAITQ_KEEP_LOCKED); + WAITQ_KEEP_LOCKED); /* port and pset still locked, thread not runnable */ if (th == THREAD_NULL) { @@ -265,19 +262,6 @@ ipc_mqueue_add_locked( * go look for another thread that can. */ if (th->ith_state != MACH_RCV_IN_PROGRESS) { -#if MACH_FLIPC - if (th->ith_state == MACH_PEEK_IN_PROGRESS) { - /* - * wakeup the peeking thread, but - * continue to loop over the threads - * waiting on the port's mqueue to see - * if there are any actual receivers - */ - ipc_mqueue_peek_on_thread_locked(port_mqueue, - th->ith_option, th); - } -#endif /* MACH_FLIPC */ - waitq_resume_identified_thread(wqset, th, THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT); continue; @@ -323,10 +307,6 @@ ipc_mqueue_add_locked( */ ipc_kmsg_rmqueue(kmsgq, kmsg); -#if MACH_FLIPC - mach_node_t node = kmsg->ikm_node; -#endif - ipc_mqueue_release_msgcount(port_mqueue); th->ith_kmsg = kmsg; @@ -334,12 +314,6 @@ ipc_mqueue_add_locked( waitq_resume_identified_thread(wqset, th, THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT); - -#if MACH_FLIPC - if (MACH_NODE_VALID(node) && FPORT_VALID(port_mqueue->imq_fport)) { - flipc_msg_ack(node, port_mqueue, TRUE); - } -#endif } return KERN_SUCCESS; @@ -354,14 +328,14 @@ ipc_mqueue_add_locked( bool ipc_port_has_klist(ipc_port_t port) { - return !port->ip_specialreply && + return !ip_is_special_reply_port(port) && port->ip_sync_link_state == PORT_SYNC_LINK_ANY; } static inline struct klist * ipc_object_klist(ipc_object_t object) { - if (io_otype(object) == IOT_PORT) { + if (io_is_any_port(object)) { ipc_port_t port = ip_object_to_port(object); return ipc_port_has_klist(port) ? 
&port->ip_klist : NULL; @@ -415,9 +389,9 @@ ipc_mqueue_changed( knote_vanish(klist, is_active(space)); } - if (io_otype(object) == IOT_PORT) { + if (io_is_any_port(object)) { ipc_port_t port = ip_object_to_port(object); - if (!port->ip_specialreply) { + if (!ip_is_special_reply_port(port)) { ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL); } @@ -609,7 +583,7 @@ ipc_mqueue_override_send_locked( * There is no need to pass reserved preposts because this will * never prepost to anyone */ -void +static void ipc_mqueue_release_msgcount(ipc_mqueue_t port_mq) { ipc_port_t port = ip_from_mq(port_mq); @@ -693,7 +667,7 @@ ipc_mqueue_post( thread_t receiver; receiver = waitq_wakeup64_identify_locked(waitq, - IPC_MQUEUE_RECEIVE, THREAD_AWAKENED, WAITQ_KEEP_LOCKED); + IPC_MQUEUE_RECEIVE, WAITQ_KEEP_LOCKED); /* waitq still locked, thread not runnable */ if (receiver == THREAD_NULL) { @@ -737,26 +711,6 @@ ipc_mqueue_post( break; } -#if MACH_FLIPC - /* - * If a thread is attempting a "peek" into the message queue - * (MACH_PEEK_IN_PROGRESS), then we enqueue the message and set the - * thread running. A successful peek is essentially the same as - * message delivery since the peeking thread takes responsibility - * for delivering the message and (eventually) removing it from - * the mqueue. Only one thread can successfully use the peek - * facility on any given port, so we exit the waitq loop after - * encountering such a thread. - */ - if (receiver->ith_state == MACH_PEEK_IN_PROGRESS && mqueue->imq_msgcount > 0) { - ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg); - ipc_mqueue_peek_on_thread_locked(mqueue, receiver->ith_option, receiver); - waitq_resume_identified_thread(waitq, receiver, - THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT); - break; /* Message was posted, so break out of loop */ - } -#endif /* MACH_FLIPC */ - /* * If the receiver waited with a facility not directly related * to Mach messaging, then it isn't prepared to get handed the @@ -798,20 +752,12 @@ ipc_mqueue_post( !(receiver->ith_option & MACH_RCV_LARGE)) { receiver->ith_kmsg = kmsg; receiver->ith_seqno = mqueue->imq_seqno++; -#if MACH_FLIPC - mach_node_t node = kmsg->ikm_node; -#endif + waitq_resume_identified_thread(waitq, receiver, THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT); /* we didn't need our reserved spot in the queue */ ipc_mqueue_release_msgcount(mqueue); - -#if MACH_FLIPC - if (MACH_NODE_VALID(node) && FPORT_VALID(mqueue->imq_fport)) { - flipc_msg_ack(node, mqueue, TRUE); - } -#endif break; } @@ -886,11 +832,6 @@ ipc_mqueue_receive_results(wait_result_t saved_wait_result) return; case MACH_MSG_SUCCESS: return; -#if MACH_FLIPC - case MACH_PEEK_READY: - return; -#endif /* MACH_FLIPC */ - default: panic("ipc_mqueue_receive_results: strange ith_state %d", self->ith_state); } @@ -900,10 +841,8 @@ ipc_mqueue_receive_results(wait_result_t saved_wait_result) } } -void -ipc_mqueue_receive_continue( - __unused void *param, - wait_result_t wresult) +static void +ipc_mqueue_receive_continue(__unused void *param, wait_result_t wresult) { ipc_mqueue_receive_results(wresult); mach_msg_receive_continue(); /* hard-coded for now */ @@ -1000,11 +939,6 @@ ipc_mqueue_receive_on_thread_and_unlock( * * Might drop the pset lock temporarily. 
*/ -#if MACH_FLIPC - if (option64 & MACH64_PEEK_MSG) { - wqs_flags |= WQS_PREPOST_PEEK; - } -#endif /* MACH_FLIPC */ port_wq = waitq_set_first_prepost(&pset->ips_wqset, wqs_flags); /* Returns with port locked */ @@ -1032,16 +966,8 @@ ipc_mqueue_receive_on_thread_and_unlock( } if (port) { -#if MACH_FLIPC - if (option64 & MACH64_PEEK_MSG) { - ipc_mqueue_peek_on_thread_locked(&port->ip_messages, - option64, thread); - } else -#endif /* MACH_FLIPC */ - { - ipc_mqueue_select_on_thread_locked(&port->ip_messages, - option64, thread); - } + ipc_mqueue_select_on_thread_locked(&port->ip_messages, + option64, thread); ip_mq_unlock(port); return THREAD_NOT_WAITING; } @@ -1069,11 +995,6 @@ ipc_mqueue_receive_on_thread_and_unlock( } thread->ith_state = MACH_RCV_IN_PROGRESS; -#if MACH_FLIPC - if (option64 & MACH64_PEEK_MSG) { - thread->ith_state = MACH_PEEK_IN_PROGRESS; - } -#endif /* MACH_FLIPC */ if (option64 & MACH_RCV_TIMEOUT) { clock_interval_to_deadline(rcv_timeout, 1000 * NSEC_PER_USEC, &deadline); @@ -1149,40 +1070,6 @@ ipc_mqueue_receive_on_thread_and_unlock( return wresult; } -#if MACH_FLIPC -/* - * Routine: ipc_mqueue_peek_on_thread_locked - * Purpose: - * A receiver discovered that there was a message on the queue - * before he had to block. Tell a thread about the message queue, - * but don't pick off any messages. - * Conditions: - * port_mq locked - * at least one message on port_mq's message queue - * - * Returns: (on thread->ith_state) - * MACH_PEEK_READY ith_peekq contains a message queue - */ -void -ipc_mqueue_peek_on_thread_locked( - ipc_mqueue_t port_mq, - __assert_only mach_msg_option64_t option64, - thread_t thread) -{ - assert(option64 & MACH64_PEEK_MSG); - assert(ipc_kmsg_queue_first(&port_mq->imq_messages) != IKM_NULL); - - /* - * Take a reference on the mqueue's associated port: - * the peeking thread will be responsible to release this reference - */ - ip_validate(ip_from_mq(port_mq)); - ip_reference(ip_from_mq(port_mq)); - thread->ith_peekq = port_mq; - thread->ith_state = MACH_PEEK_READY; -} -#endif /* MACH_FLIPC */ - /* * Routine: ipc_mqueue_select_on_thread_locked * Purpose: @@ -1199,7 +1086,7 @@ ipc_mqueue_peek_on_thread_locked( * MACH_MSG_SUCCESS Actually selected a message for ourselves. * MACH_RCV_TOO_LARGE May or may not have pull it, but it is large */ -void +static void ipc_mqueue_select_on_thread_locked( ipc_mqueue_t port_mq, mach_msg_option64_t options, @@ -1243,11 +1130,6 @@ ipc_mqueue_select_on_thread_locked( } ipc_kmsg_rmqueue(&port_mq->imq_messages, kmsg); -#if MACH_FLIPC - if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port_mq->imq_fport)) { - flipc_msg_ack(kmsg->ikm_node, port_mq, TRUE); - } -#endif ipc_mqueue_release_msgcount(port_mq); thread->ith_seqno = port_mq->imq_seqno++; thread->ith_kmsg = kmsg; @@ -1344,76 +1226,6 @@ out: } -/* - * Routine: ipc_mqueue_peek - * Purpose: - * Peek at a (non-set) message queue to see if it has a message - * matching the sequence number provided (if zero, then the - * first message in the queue) and return vital info about the - * message. - * - * Conditions: - * The ipc_mqueue_t is unlocked. - * Locks may be held by callers, so this routine cannot block. - * Caller holds reference on the message queue. 
- */ -unsigned -ipc_mqueue_peek(ipc_mqueue_t mq, - mach_port_seqno_t * seqnop, - mach_msg_size_t * msg_sizep, - mach_msg_id_t * msg_idp, - mach_msg_max_trailer_t * msg_trailerp, - ipc_kmsg_t *kmsgp) -{ - ipc_port_t port = ip_from_mq(mq); - unsigned res; - - ip_mq_lock(port); - - res = ipc_mqueue_peek_locked(mq, seqnop, msg_sizep, msg_idp, - msg_trailerp, kmsgp); - - ip_mq_unlock(port); - return res; -} - -#if MACH_FLIPC -/* - * Routine: ipc_mqueue_release_peek_ref - * Purpose: - * Release the reference on an mqueue's associated port which was - * granted to a thread in ipc_mqueue_peek_on_thread (on the - * MACH64_PEEK_MSG thread wakeup path). - * - * Conditions: - * The ipc_mqueue_t should be locked on entry. - * The ipc_mqueue_t will be _unlocked_ on return - * (and potentially invalid!) - * - */ -void -ipc_mqueue_release_peek_ref(ipc_mqueue_t mqueue) -{ - ipc_port_t port = ip_from_mq(mqueue); - - ip_mq_lock_held(port); - - /* - * clear any preposts this mq may have generated - * (which would cause subsequent immediate wakeups) - */ - waitq_clear_prepost_locked(&port->ip_waitq); - - ip_mq_unlock(port); - - /* - * release the port reference: we need to do this outside the lock - * because we might be holding the last port reference! - **/ - ip_release(port); -} -#endif /* MACH_FLIPC */ - /* * Routine: ipc_mqueue_destroy_locked * Purpose: @@ -1444,17 +1256,6 @@ ipc_mqueue_destroy_locked(ipc_mqueue_t mqueue, waitq_link_list_t *free_l) THREAD_RESTART, WAITQ_WAKEUP_DEFAULT); } -#if MACH_FLIPC - ipc_kmsg_t kmsg; - - cqe_foreach_element_safe(kmsg, &mqueue->imq_messages, ikm_link) { - if (MACH_NODE_VALID(kmsg->ikm_node) && - FPORT_VALID(mqueue->imq_fport)) { - flipc_msg_ack(kmsg->ikm_node, mqueue, TRUE); - } - } -#endif - /* * Move messages from the specified queue to the per-thread * clean/drain queue while we have the mqueue lock. @@ -1589,7 +1390,7 @@ ipc_mqueue_copyin( } else { io_unlock(object); /* guard exception if we never held the receive right in this entry */ - if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) { + if ((bits & IE_BITS_EX_RECEIVE) == 0) { mach_port_guard_exception(name, 0, kGUARD_EXC_RCV_INVALID_NAME); } return MACH_RCV_INVALID_NAME; diff --git a/osfmk/ipc/ipc_mqueue.h b/osfmk/ipc/ipc_mqueue.h index 161df1baf..74f63e594 100644 --- a/osfmk/ipc/ipc_mqueue.h +++ b/osfmk/ipc/ipc_mqueue.h @@ -73,7 +73,6 @@ #include #include #include -#include #include #include @@ -81,12 +80,6 @@ #include -/* this type doesn't exist and is only used to do math */ -struct ipc_object_waitq { - struct ipc_object iowq_object; - struct waitq iowq_waitq; -}; - typedef struct ipc_mqueue { circle_queue_head_t imq_messages; mach_port_seqno_t imq_seqno; @@ -98,12 +91,10 @@ typedef struct ipc_mqueue { * in ipc_mqueue. 
*/ uint32_t imq_context; -#if MACH_FLIPC - struct flipc_port *imq_fport; // Null for local port, or ptr to flipc port -#endif + union { /* - * Special Reply Ports (ip_specialreply == true): + * Special Reply Ports (ip_type() == IOT_SPECIAL_REPLY_PORT): * only use imq_srp_owner_thread * * Ports, based on ip_sync_link_state, use: @@ -125,8 +116,7 @@ typedef struct ipc_mqueue { #define imq_full(mq) ((mq)->imq_msgcount >= (mq)->imq_qlimit) #define imq_full_kernel(mq) ((mq)->imq_msgcount >= MACH_PORT_QLIMIT_KERNEL) -extern int ipc_mqueue_full; -// extern int ipc_mqueue_rcv; +extern const bool ipc_mqueue_full; #define IPC_MQUEUE_FULL CAST_EVENT64_T(&ipc_mqueue_full) #define IPC_MQUEUE_RECEIVE NO_EVENT64 @@ -182,17 +172,6 @@ extern wait_result_t ipc_mqueue_receive_on_thread_and_unlock( int interruptible, thread_t thread); -/* Continuation routine for message receive */ -extern void ipc_mqueue_receive_continue( - void *param, - wait_result_t wresult); - -/* Select a message from a queue and try to post it to ourself */ -extern void ipc_mqueue_select_on_thread_locked( - ipc_mqueue_t port_mq, - mach_msg_option64_t option64, - thread_t thread); - /* Peek into a messaqe queue to see if there are messages */ extern unsigned ipc_mqueue_peek( ipc_mqueue_t mqueue, @@ -211,16 +190,6 @@ extern unsigned ipc_mqueue_peek_locked( mach_msg_max_trailer_t *msg_trailerp, ipc_kmsg_t *kmsgp); -#if MACH_FLIPC -/* Release an mqueue/port reference that was granted by MACH64_PEEK_MSG */ -extern void ipc_mqueue_release_peek_ref( - ipc_mqueue_t mqueue); -#endif /* MACH_FLIPC */ - -/* Clear a message count reservation */ -extern void ipc_mqueue_release_msgcount( - ipc_mqueue_t port_mq); - /* Change a queue limit */ extern void ipc_mqueue_set_qlimit_locked( ipc_mqueue_t mqueue, @@ -238,8 +207,7 @@ extern mach_msg_return_t ipc_mqueue_copyin( ipc_object_t *objectp); /* Safe to use the klist ptr */ -extern bool -ipc_port_has_klist( +extern bool ipc_port_has_klist( ipc_port_t port); #endif /* _IPC_IPC_MQUEUE_H_ */ diff --git a/osfmk/ipc/ipc_notify.c b/osfmk/ipc/ipc_notify.c index 7616bce42..68f7bfc6e 100644 --- a/osfmk/ipc/ipc_notify.c +++ b/osfmk/ipc/ipc_notify.c @@ -64,103 +64,89 @@ */ #include -#include #include -#include #include #include -#include +#include -/* - * Routine: ipc_notify_port_deleted - * Purpose: - * Send a port-deleted notification. - * Conditions: - * Nothing locked. - * Consumes a ref/soright for port. +/*! + * @abstract + * Perform a check on whether forming a notification message + * to the specified notification port can be elided. + * + * @discussion + * This is racy but helps avoiding costly messages to be formed + * just to be destroyed because the notification port is already + * dead. + * + * This happens quite a lot during ipc_space_terminate(): all + * receive rights are destroyed first, then other ports. + * This avoids sending notifications to receive rights in that + * space reliably. */ - -void -ipc_notify_port_deleted( - ipc_port_t port, - mach_port_name_t name) +static inline bool +ipc_notify_should_send(ipc_port_t notification_port) { - (void)mach_notify_port_deleted(port, name); - /* send-once right consumed */ + return ip_active(notification_port); } -/* - * Routine: ipc_notify_send_possible - * Purpose: - * Send a send-possible notification. - * Conditions: - * Nothing locked. - * Consumes a ref/soright for port. 
- */ - void -ipc_notify_send_possible( - ipc_port_t port, - mach_port_name_t name) +ipc_notify_dead_name(ipc_port_t port, mach_port_name_t name) { - (void)mach_notify_send_possible(port, name); - /* send-once right consumed */ + if (ipc_notify_should_send(port)) { + (void)mach_notify_dead_name(port, name); + /* send-once right consumed */ + } else { + ipc_port_release_sonce(port); + } } -/* - * Routine: ipc_notify_port_destroyed - * Purpose: - * Send a port-destroyed notification. - * Conditions: - * Nothing locked. - * Consumes a ref/soright for port. - * Consumes a ref for right, which should be a receive right - * prepped for placement into a message. (In-transit, - * or in-limbo if a circularity was detected.) - */ +void +ipc_notify_send_possible(ipc_port_t port, mach_port_name_t name) +{ + if (ipc_notify_should_send(port)) { + (void)mach_notify_send_possible(port, name); + /* send-once right consumed */ + } else { + ipc_port_release_sonce(port); + } +} void -ipc_notify_port_destroyed( - ipc_port_t port, - ipc_port_t right) +ipc_notify_port_deleted(ipc_port_t port, mach_port_name_t name) +{ + if (ipc_notify_should_send(port)) { + (void)mach_notify_port_deleted(port, name); + /* send-once right consumed */ + } else { + ipc_port_release_sonce(port); + } +} + +void +ipc_notify_port_destroyed(ipc_port_t port, ipc_port_t right) { mach_notify_port_destroyed(port, right); /* send-once and receive rights consumed */ } -/* - * Routine: ipc_notify_no_senders_prepare - * Purpose: - * Prepare for consuming a no senders notification - * when the port send right count just hit 0. - * Conditions: - * The port is locked. - * - * For kobjects (ns_is_kobject), the `ns_notify` port has a reference. - * For regular ports, the `ns_notify` has an outstanding send once right. - * Returns: - * A token that must be passed to ipc_notify_no_senders_emit. - */ ipc_notify_nsenders_t -ipc_notify_no_senders_prepare( - ipc_port_t port) +ipc_notify_no_senders_prepare(ipc_port_t port) { ipc_notify_nsenders_t req = { }; + ipc_object_type_t type = ip_type(port); ip_mq_lock_held(port); - if (port->ip_nsrequest == IP_KOBJECT_NSREQUEST_ARMED) { - port->ip_nsrequest = IP_NULL; - - if (ip_active(port)) { + if (io_is_kobject_type(type)) { + if (ip_active(port) && ipc_policy(type)->pol_notif_no_senders) { + ip_reference(port); req.ns_notify = port; req.ns_mscount = port->ip_mscount; req.ns_is_kobject = true; - } else { - /* silently consume the port-ref */ - ip_release_live(port); } } else if (port->ip_nsrequest) { + ipc_release_assert(ipc_policy(type)->pol_notif_no_senders); req.ns_notify = port->ip_nsrequest; req.ns_mscount = port->ip_mscount; req.ns_is_kobject = false; @@ -171,64 +157,36 @@ ipc_notify_no_senders_prepare( return req; } -/* - * Routine: ipc_notify_no_senders - * Purpose: - * Send a no-senders notification. - * Conditions: - * Nothing locked. - * Consumes a ref/soright for port. - */ - void -ipc_notify_no_senders( - ipc_port_t port, - mach_port_mscount_t mscount, - boolean_t kobject) +ipc_notify_no_senders_mqueue(ipc_port_t port, mach_port_mscount_t mscount) { - if (kobject) { - ipc_kobject_notify_no_senders(port, mscount); - } else { + if (ipc_notify_should_send(port)) { (void)mach_notify_no_senders(port, mscount); /* send-once right consumed */ + } else { + ipc_port_release_sonce(port); } } -/* - * Routine: ipc_notify_no_senders_consume - * Purpose: - * Consume a no-senders notification. - * Conditions: - * Nothing locked. - * Consumes a ref/soright for port. 
- */ - void -ipc_notify_no_senders_consume( - ipc_notify_nsenders_t nsrequest) +ipc_notify_no_senders_kobject(ipc_port_t port, mach_port_mscount_t mscount) { - if (nsrequest.ns_notify) { - if (nsrequest.ns_is_kobject) { - ip_release(nsrequest.ns_notify); - } else { - ipc_port_release_sonce(nsrequest.ns_notify); - } + if (ipc_notify_should_send(port)) { + ipc_policy(port)->pol_kobject_no_senders(port, mscount); } + ip_release(port); } -/* - * Routine: ipc_notify_send_once_and_unlock - * Purpose: - * Send a send-once notification. - * Conditions: - * Port is locked. - * Consumes a ref/soright for port. - */ - void -ipc_notify_send_once_and_unlock( - ipc_port_t port) +ipc_notify_send_once_and_unlock(ipc_port_t port) { + /* + * clear any reply context: + * no one will be sending the response b/c we are destroying + * the single, outstanding send once right. + */ + port->ip_reply_context = 0; + if (!ip_active(port)) { ipc_port_release_sonce_and_unlock(port); } else if (ip_in_space(port, ipc_space_kernel)) { @@ -241,21 +199,3 @@ ipc_notify_send_once_and_unlock( } /* send-once right consumed */ } - -/* - * Routine: ipc_notify_dead_name - * Purpose: - * Send a dead-name notification. - * Conditions: - * Nothing locked. - * Consumes a ref/soright for port. - */ - -void -ipc_notify_dead_name( - ipc_port_t port, - mach_port_name_t name) -{ - (void)mach_notify_dead_name(port, name); - /* send-once right consumed */ -} diff --git a/osfmk/ipc/ipc_notify.h b/osfmk/ipc/ipc_notify.h index 19a53dee3..b60c36be3 100644 --- a/osfmk/ipc/ipc_notify.h +++ b/osfmk/ipc/ipc_notify.h @@ -68,63 +68,202 @@ #include +__BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN #pragma GCC visibility push(hidden) + typedef struct ipc_notify_nsenders { ipc_port_t ns_notify; mach_port_mscount_t ns_mscount; boolean_t ns_is_kobject; } ipc_notify_nsenders_t; -/* - * Exported interfaces + +/*! + * @abstract + * Send a dead-name notification. + * + * @discussion + * A dead-name notification is sent when the port being monitored + * has its receive right destroyed. + * + * Conditions: + * - Nothing locked. + * - Consumes a ref/soright for @c notify. + * + * @param notify The port receiving the notification. + * @param name The name for the port whose receive right has been + * destroyed. */ - -/* Send a port-deleted notification */ -extern void ipc_notify_port_deleted( - ipc_port_t port, +extern void ipc_notify_dead_name( + ipc_port_t notify, mach_port_name_t name); -/* Send a send-possible notification */ +/*! + * @abstract + * Send a send-possible notification. + * + * @discussion + * A send-possible notification is sent when the port being monitored + * has a message queue that becomes non full, and messages can be sent + * to it without blocking. + * + * This consumes the dead-name/send-possible notification slot for this port. + * + * Conditions: + * - Nothing locked. + * - Consumes a ref/soright for @c notify. + * + * @param notify The port receiving the notification. + * @param name The name for the port which can now receive messages + * without blocking. + */ extern void ipc_notify_send_possible( - ipc_port_t port, + ipc_port_t notify, mach_port_name_t name); -/* Send a port-destroyed notification */ +/*! + * @abstract + * Send a port-deleted notification. + * + * @discussion + * A port-deleted notification is sent whenever the last send(-once) right + * which has an active dead-name/send-possible notification armed is removed + * from the space. + * + * Conditions: + * - Nothing locked. + * - Consumes a ref/soright for notify. 
+ * + * @param notify The port receiving the notification. + * @param name The name for the port which has been removed from the + * space. + */ +extern void ipc_notify_port_deleted( + ipc_port_t notify, + mach_port_name_t name); + +/*! + * @abstract + * Send a port-destroyed notification. + * + * @discussion + * A port-destroyed notification allows for a task to get a receive right + * back instead of it being destroyed. + * + * Conditions: + * - Nothing locked. + * - Consumes a ref/soright for @c notify. + * - Consumes a ref for @c right, which should be a receive right + * prepped for placement into a message. (In-transit, or in-limbo if + * a circularity was detected.) + * + * @param notify The port receiving the notification. + * @param right The receive right being sent back. + */ extern void ipc_notify_port_destroyed( - ipc_port_t port, + ipc_port_t notify, ipc_port_t right); -/* Send a no-senders notification */ -extern void ipc_notify_no_senders( +/*! + * @abstract + * Send a no-senders notification to a regular message queue port. + * + * @discussion + * Condition: + * - Nothing locked. + * - Consumes a ref/soright for @c notify. + * + * @param notify The port receiving the notification. + * @param mscount The make-send count at the time this notification + * was sent (it can be used to synchronize new rights + * being made by the client concurrently). + */ +extern void ipc_notify_no_senders_mqueue( ipc_port_t notify, - mach_port_mscount_t mscount, - boolean_t kobject); + mach_port_mscount_t mscount); +/*! + * @abstract + * Send a no-senders notification to a kobject port. + * + * @discussion + * Condition: + * - Nothing locked. + * - Consumes a port reference to @c notify. + * + * @param notify The port receiving the notification, + * which is also the port for which the notification + * is being emitted. + * @param mscount The make-send count at the time this notification + * was sent (it can be used to synchronize new rights + * being made by the client concurrently). + */ +extern void ipc_notify_no_senders_kobject( + ipc_port_t notify, + mach_port_mscount_t mscount); + +/*! + * @abstract + * Prepare for consuming a no-senders notification + * when the port send right count just hit 0. + * + * @discussion + * This allows for a two-phase prepare/emit because sending the no-senders + * notification requires no lock to be held. + * + * @c ipc_notify_no_senders_emit() must be called on the value returned + * by this function. + * + * Conditions: + * - @c port is locked. + * + * For kobjects (ns_is_kobject), the `ns_notify` port has a port reference. + * For regular ports, the `ns_notify` has an outstanding send once right. + * + * @returns + * A token that must be passed to ipc_notify_no_senders_emit. + */ extern ipc_notify_nsenders_t ipc_notify_no_senders_prepare( ipc_port_t port); +/*! + * @abstract + * Emits a no-senders notification that was prepared by + * @c ipc_notify_no_senders_prepare(). + */ static inline void ipc_notify_no_senders_emit(ipc_notify_nsenders_t nsrequest) { - if (nsrequest.ns_notify) { - ipc_notify_no_senders(nsrequest.ns_notify, - nsrequest.ns_mscount, nsrequest.ns_is_kobject); + if (!nsrequest.ns_notify) { + /* nothing to do */ + } else if (nsrequest.ns_is_kobject) { + ipc_notify_no_senders_kobject(nsrequest.ns_notify, + nsrequest.ns_mscount); + } else { + ipc_notify_no_senders_mqueue(nsrequest.ns_notify, + nsrequest.ns_mscount); } } -extern void ipc_notify_no_senders_consume( - ipc_notify_nsenders_t nsrequest); - /* Send a send-once notification */ +/*! 
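/*
 * Illustrative sketch (not part of this patch): the intended two-phase use
 * of the no-senders interfaces documented above. The prepare step runs
 * under the port lock; the emit step runs only after the lock is dropped,
 * since sending the notification requires no locks to be held.
 * example_no_senders() and the trigger condition are hypothetical.
 */
static void
example_no_senders(ipc_port_t port)
{
	ipc_notify_nsenders_t nsrequest;

	ip_mq_lock(port);
	/* e.g. after observing the port's send-right count drop to zero */
	nsrequest = ipc_notify_no_senders_prepare(port);
	ip_mq_unlock(port);

	/* nothing locked: deliver (or silently consume) the notification */
	ipc_notify_no_senders_emit(nsrequest);
}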
+ * @abstract + * Send a send-once notification. + * + * @discussion + * A send-once notification is sent when a send-once right to @c port is being + * destroyed without any message having been sent to it. + * + * Conditions: + * - @c port is locked. + * - Consumes a ref/soright for @c port. + */ extern void ipc_notify_send_once_and_unlock( ipc_port_t port); -/* Send a dead-name notification */ -extern void ipc_notify_dead_name( - ipc_port_t port, - mach_port_name_t name); #pragma GCC visibility pop +__ASSUME_PTR_ABI_SINGLE_END __END_DECLS #endif /* _IPC_IPC_NOTIFY_H_ */ diff --git a/osfmk/ipc/ipc_object.c b/osfmk/ipc/ipc_object.c index f026bef49..8a84771b0 100644 --- a/osfmk/ipc/ipc_object.c +++ b/osfmk/ipc/ipc_object.c @@ -98,7 +98,6 @@ #include static struct mpsc_daemon_queue ipc_object_deallocate_queue; -SECURITY_READ_ONLY_LATE(zone_t) ipc_object_zones[IOT_NUMBER]; /* * In order to do lockfree lookups in the IPC space, we combine two schemes: @@ -125,26 +124,21 @@ SECURITY_READ_ONLY_LATE(zone_t) ipc_object_zones[IOT_NUMBER]; */ #define IPC_OBJECT_ZC_BASE (ZC_ZFREE_CLEARMEM | ZC_SEQUESTER) -ZONE_INIT(&ipc_object_zones[IOT_PORT], - "ipc ports", sizeof(struct ipc_port), - IPC_OBJECT_ZC_BASE | ZC_CACHING, ZONE_ID_IPC_PORT, NULL); +ZONE_DEFINE_ID(ZONE_ID_IPC_PORT, "ipc ports", struct ipc_port, + IPC_OBJECT_ZC_BASE | ZC_CACHING); -ZONE_INIT(&ipc_object_zones[IOT_PORT_SET], - "ipc port sets", sizeof(struct ipc_pset), - IPC_OBJECT_ZC_BASE, ZONE_ID_IPC_PORT_SET, NULL); +ZONE_DEFINE_ID(ZONE_ID_IPC_PORT_SET, "ipc port sets", struct ipc_pset, + IPC_OBJECT_ZC_BASE); __attribute__((noinline)) static void -ipc_object_free(unsigned int otype, ipc_object_t object, bool last_ref) +ipc_object_free(ipc_object_t object) { - if (last_ref) { - if (otype == IOT_PORT) { - ipc_port_finalize(ip_object_to_port(object)); - } else { - ipc_pset_finalize(ips_object_to_pset(object)); - } + if (io_is_any_port(object)) { + ipc_port_free(ip_object_to_port(object)); + } else { + ipc_pset_free(ips_object_to_pset(object)); } - zfree(ipc_object_zones[otype], object); } __attribute__((noinline)) @@ -169,7 +163,7 @@ ipc_object_deallocate_queue_invoke(mpsc_queue_chain_t e, assert(dq == &ipc_object_deallocate_queue); os_atomic_store(&wq->waitq_defer.mpqc_next, NULL, relaxed); - ipc_object_free(io_otype(io), io, true); + ipc_object_free(io); } void @@ -190,7 +184,7 @@ ipc_object_reference( ipc_object_t io) { static_assert(sizeof(os_ref_atomic_t) == sizeof(io->io_references)); - os_ref_retain_raw((os_ref_atomic_t *)&io->io_references, NULL); + os_ref_retain_raw(&io->io_references, NULL); } /* @@ -207,9 +201,9 @@ ipc_object_release( assert(get_preemption_level() == 0); #endif - if (os_ref_release_raw((os_ref_atomic_t *)&io->io_references, NULL) == 0) { + if (os_ref_release_raw(&io->io_references, NULL) == 0) { /* Free the object */ - ipc_object_free(io_otype(io), io, true); + ipc_object_free(io); } } @@ -223,9 +217,9 @@ void ipc_object_release_safe( ipc_object_t io) { - if (os_ref_release_raw((os_ref_atomic_t *)&io->io_references, NULL) == 0) { + if (os_ref_release_raw(&io->io_references, NULL) == 0) { if (get_preemption_level() == 0) { - ipc_object_free(io_otype(io), io, true); + ipc_object_free(io); } else { ipc_object_free_safe(io); } @@ -242,7 +236,7 @@ void ipc_object_release_live( ipc_object_t io) { - os_ref_release_live_raw((os_ref_atomic_t *)&io->io_references, NULL); + os_ref_release_live_raw(&io->io_references, NULL); } /* @@ -324,10 +318,13 @@ ipc_object_translate_port_pset( /* space is read-locked and active */ if 
((port_entry->ie_bits & MACH_PORT_TYPE_RECEIVE) == 0) { - bool guard = !(port_entry->ie_bits & MACH_PORT_TYPE_EX_RECEIVE); + bool guard = !(port_entry->ie_bits & IE_BITS_EX_RECEIVE); + is_read_unlock(space); if (guard) { - mach_port_guard_exception(port_name, 0, + mach_port_guard_exception(port_name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_TRANSLATE_PORT, + port_entry->ie_bits), kGUARD_EXC_INVALID_RIGHT); } return KERN_INVALID_RIGHT; @@ -335,7 +332,9 @@ ipc_object_translate_port_pset( if ((pset_entry->ie_bits & MACH_PORT_TYPE_PORT_SET) == 0) { is_read_unlock(space); - mach_port_guard_exception(pset_name, 0, + mach_port_guard_exception(pset_name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_TRANSLATE_PSET, + pset_entry->ie_bits), kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; } @@ -385,6 +384,8 @@ ipc_object_alloc_dead( { ipc_entry_t entry; kern_return_t kr; + mach_port_type_t type = MACH_PORT_TYPE_DEAD_NAME; + mach_port_urefs_t urefs = 1; kr = ipc_entry_alloc(space, IPC_OBJECT_NULL, namep, &entry); if (kr != KERN_SUCCESS) { @@ -392,22 +393,21 @@ ipc_object_alloc_dead( } /* space is write-locked */ - /* null object, MACH_PORT_TYPE_DEAD_NAME, 1 uref */ + ipc_entry_init(space, IPC_OBJECT_NULL, type, entry, urefs, *namep); - entry->ie_bits |= MACH_PORT_TYPE_DEAD_NAME | 1; - ipc_entry_modified(space, *namep, entry); is_write_unlock(space); return KERN_SUCCESS; } /* - * Routine: ipc_object_alloc + * Routine: ipc_object_alloc_entry * Purpose: - * Allocate an object. + * Allocate an unitialized entry for a new object. + * That entry must later be initialized by `ipc_entry_init` after + * the object has been initialized * Conditions: * Nothing locked. * The space is write locked on successful return. - * The caller doesn't get a reference for the object. * Returns: * KERN_SUCCESS The object is allocated. * KERN_INVALID_TASK The space is dead. @@ -415,53 +415,28 @@ ipc_object_alloc_dead( */ kern_return_t -ipc_object_alloc( - ipc_space_t space, - ipc_object_type_t otype, - mach_port_type_t type, - mach_port_urefs_t urefs, - mach_port_name_t *namep, - ipc_object_t *objectp) +ipc_object_alloc_entry( + ipc_space_t space, + ipc_object_t object, + mach_port_name_t *namep, + ipc_entry_t *entry) { - ipc_object_t object; - ipc_entry_t entry; kern_return_t kr; - assert(otype < IOT_NUMBER); - assert((type & MACH_PORT_TYPE_ALL_RIGHTS) == type); - assert(type != MACH_PORT_TYPE_NONE); - assert(urefs <= MACH_PORT_UREFS_MAX); + kr = ipc_entry_alloc(space, object, namep, entry); - object = io_alloc(otype, Z_WAITOK | Z_ZERO | Z_NOFAIL); - os_atomic_init(&object->io_bits, io_makebits(otype)); - os_atomic_init(&object->io_references, 1); /* for entry, not caller */ - - *namep = CAST_MACH_PORT_TO_NAME(object); - kr = ipc_entry_alloc(space, object, namep, &entry); - if (kr != KERN_SUCCESS) { - ipc_object_free(otype, object, false); - return kr; - } - /* space is write-locked */ - - entry->ie_bits |= type | urefs; - ipc_entry_modified(space, *namep, entry); - - *objectp = object; - return KERN_SUCCESS; + return kr; } /* - * Routine: ipc_object_alloc_name + * Routine: ipc_object_alloc_entry_with_name * Purpose: - * Allocate an object, with a specific name. + * Allocate an uninitalized entry for a new object, with a specific name. + * That entry must later be initialized by `ipc_entry_init` after + * the object has been initialized * Conditions: - * Nothing locked. If successful, the object is returned locked. - * The caller doesn't get a reference for the object. 
- * - * finish_init() must call an ipc_*_init function - * that will return the object locked (using IPC_PORT_INIT_LOCKED, - * or SYNC_POLICY_INIT_LOCKED, or equivalent). + * Nothing locked. + * The space is write locked on successful return. * * Returns: * KERN_SUCCESS The object is allocated. @@ -470,53 +445,20 @@ ipc_object_alloc( */ kern_return_t -ipc_object_alloc_name( - ipc_space_t space, - ipc_object_type_t otype, - mach_port_type_t type, - mach_port_urefs_t urefs, - mach_port_name_t name, - ipc_object_t *objectp, - void (^finish_init)(ipc_object_t)) +ipc_object_alloc_entry_with_name( + ipc_space_t space, + mach_port_name_t name, + ipc_entry_t *entry) { - ipc_object_t object; - ipc_entry_t entry; kern_return_t kr; - assert(otype < IOT_NUMBER); - assert((type & MACH_PORT_TYPE_ALL_RIGHTS) == type); - assert(type != MACH_PORT_TYPE_NONE); - assert(urefs <= MACH_PORT_UREFS_MAX); - - object = io_alloc(otype, Z_WAITOK | Z_ZERO | Z_NOFAIL); - os_atomic_init(&object->io_bits, io_makebits(otype)); - os_atomic_init(&object->io_references, 1); /* for entry, not caller */ - - kr = ipc_entry_alloc_name(space, name, &entry); - if (kr != KERN_SUCCESS) { - ipc_object_free(otype, object, false); - return kr; - } - /* space is write-locked */ - - if (ipc_right_inuse(entry)) { + kr = ipc_entry_alloc_name(space, name, entry); + if (kr == KERN_SUCCESS && ipc_right_inuse(*entry)) { is_write_unlock(space); - ipc_object_free(otype, object, false); - return KERN_NAME_EXISTS; + kr = KERN_NAME_EXISTS; } - entry->ie_bits |= type | urefs; - entry->ie_object = object; - - finish_init(object); - /* object is locked */ - io_lock_held(object); - - ipc_entry_modified(space, name, entry); - is_write_unlock(space); - - *objectp = object; - return KERN_SUCCESS; + return kr; } /* Routine: ipc_object_validate @@ -530,7 +472,7 @@ ipc_object_validate( ipc_object_t object, ipc_object_type_t type) { - if (type != IOT_PORT_SET) { + if (io_is_any_port_type(type)) { ip_validate(object); } else { ips_validate(object); @@ -586,6 +528,7 @@ ipc_object_copyin( mach_port_name_t name, mach_msg_type_name_t msgt_name, ipc_object_copyin_flags_t copyin_flags, + ipc_copyin_op_t copyin_reason, mach_msg_guarded_port_descriptor_t *gdesc, ipc_port_t *portp) { @@ -594,20 +537,9 @@ ipc_object_copyin( ipc_entry_t entry; kern_return_t kr; - ipc_object_copyin_flags_t copyin_mask = IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND - | IPC_OBJECT_COPYIN_FLAGS_ALLOW_CONN_IMMOVABLE_RECEIVE; + ipc_object_copyin_flags_t copyin_mask = IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND; copyin_mask = (copyin_flags & copyin_mask) | IPC_OBJECT_COPYIN_FLAGS_DEADOK; - /* - * We allow moving of immovable receive right of a service port when it is from launchd. 
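/*
 * Illustrative sketch (not part of this patch): ipc_object_copyin() now
 * takes an ipc_copyin_op_t "reason" alongside the copyin flags, mirroring
 * the kmsg call sites updated earlier in this diff. The values shown are
 * the ones visible in this patch; the wrapper itself is hypothetical and
 * error handling is elided.
 */
static kern_return_t
example_copyin_send_right(ipc_space_t space, mach_port_name_t name,
    ipc_port_t *portp)
{
	return ipc_object_copyin(space, name, MACH_MSG_TYPE_COPY_SEND,
	           IPC_OBJECT_COPYIN_FLAGS_NONE,
	           IPC_COPYIN_KMSG_PORT_DESCRIPTOR, /* the copyin reason */
	           NULL,                            /* no guarded descriptor */
	           portp);
}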
- */ - task_t task = current_task_early(); -#ifdef MACH_BSD - if (task && proc_isinitproc(get_bsdtask_info(task))) { - copyin_mask |= IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_RECEIVE; - } -#endif - /* * Could first try a read lock when doing * MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND, @@ -622,17 +554,22 @@ ipc_object_copyin( } /* space is write-locked and active */ - kr = ipc_right_copyin(space, name, msgt_name, copyin_mask, entry, - portp, &icc, &icrc); + kr = ipc_right_copyin(space, name, msgt_name, copyin_mask, copyin_reason, + entry, portp, &icc, &icrc); is_write_unlock(space); - if (moved_provisional_reply_port(msgt_name, icc.icc_deleted_port)) { - send_prp_telemetry(-1); - } - ipc_right_copyin_cleanup_destroy(&icc, name); ipc_right_copyin_rcleanup_destroy(&icrc); + if (IP_VALID(*portp) && + ip_type(*portp) == IOT_SERVICE_PORT && + msgt_name == MACH_MSG_TYPE_MOVE_RECEIVE && + !task_is_initproc(space->is_task) && + !ipc_space_has_telemetry_type(space, IS_HAS_SERVICE_PORT_TELEMETRY)) { + ipc_stash_policy_violations_telemetry(IPCPV_MOVE_SERVICE_PORT, + *portp, name); + } + return kr; } @@ -669,24 +606,26 @@ ipc_object_copyin_from_kernel( ipc_port_t port, mach_msg_type_name_t msgt_name) { + ipc_object_label_t label; assert(IP_VALID(port)); switch (msgt_name) { case MACH_MSG_TYPE_MOVE_RECEIVE: - ip_mq_lock(port); - require_ip_active(port); - if (ip_in_a_space(port)) { + label = ip_mq_lock_label_get(port); + + if (label.io_state == IO_STATE_IN_SPACE) { assert(ip_in_space(port, ipc_space_kernel)); - assert(port->ip_immovable_receive == 0); - /* relevant part of ipc_port_clear_receiver */ - port->ip_mscount = 0; - - /* port transtions to IN-LIMBO state */ - port->ip_receiver_name = MACH_PORT_NULL; - port->ip_destination = IP_NULL; + /* + * Ports in kernel are never hooked to a pset, + * so we don't need to pass a waitq linkage free list. 
+ */ + ipc_port_mark_in_limbo(port, &label, NULL); + ip_mq_unlock(port); + } else { + ipc_release_assert(io_state_in_limbo(label.io_state)); + ip_mq_unlock_label_put(port, &label); } - ip_mq_unlock(port); break; case MACH_MSG_TYPE_COPY_SEND: @@ -703,8 +642,7 @@ ipc_object_copyin_from_kernel( ip_mq_lock(port); if (ip_active(port)) { assert(ip_in_a_space(port)); - assert((ip_in_space(port, ipc_space_kernel)) || - (port->ip_receiver->is_node_id != HOST_LOCAL_NODE)); + assert(ip_in_space(port, ipc_space_kernel)); port->ip_mscount++; } @@ -930,6 +868,7 @@ ipc_object_copyout( mach_port_name_t *namep) { struct knote *kn = current_thread()->ith_knote; + ipc_object_label_t label; mach_port_name_t name; ipc_entry_t entry; kern_return_t kr; @@ -963,17 +902,13 @@ ipc_object_copyout( continue; } - ip_mq_lock_check_aligned(port); - if (!ip_active(port)) { - ip_mq_unlock(port); - is_write_unlock(space); - kr = KERN_INVALID_CAPABILITY; - goto out; - } + label = ip_mq_lock_check_aligned(port); /* Don't actually copyout rights we aren't allowed to */ - if (!ip_label_check(space, port, msgt_name, &flags, &port_subst)) { - ip_mq_unlock(port); + if (!io_state_active(label.io_state) || + !ip_label_check_or_substitute(space, port, &label, + msgt_name, &port_subst)) { + ip_mq_unlock_label_put(port, &label); is_write_unlock(space); assert(port_subst == IP_NULL); kr = KERN_INVALID_CAPABILITY; @@ -1010,11 +945,17 @@ ipc_object_copyout( ipc_entry_claim(space, ip_to_object(port), &name, &entry); } - kr = ipc_right_copyout(space, port, msgt_name, - flags, name, entry, gdesc); + if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) { + ipc_right_copyout_recv_and_unlock_space(space, port, &label, + name, entry, gdesc); + } else { + ip_label_put(port, &label); + ipc_right_copyout_any_send(space, port, msgt_name, + flags, name, entry); + is_write_unlock(space); + } /* port is unlocked */ - is_write_unlock(space); out: if (kr == KERN_SUCCESS) { @@ -1052,6 +993,7 @@ ipc_object_copyout_name( mach_msg_type_name_t msgt_name, mach_port_name_t name) { + ipc_object_label_t label; mach_port_name_t oname; ipc_entry_t oentry; ipc_entry_t entry; @@ -1070,7 +1012,7 @@ ipc_object_copyout_name( } /* space is write-locked and active */ - ip_mq_lock_check_aligned(port); + label = ip_mq_lock_check_aligned(port); /* * Don't actually copyout rights we aren't allowed to @@ -1078,8 +1020,9 @@ ipc_object_copyout_name( * In particular, kolabel-ed ports do not allow callers * to pick the name they end up with. 
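/*
 * Illustrative sketch (not part of this patch): the label-aware locking
 * pattern used by the copyout paths in this diff. The label is obtained
 * together with the port lock, consulted for state, and then either put
 * back explicitly or consumed by an *_unlock_label_put variant before the
 * lock is dropped. example_port_is_active() is a hypothetical helper.
 */
static bool
example_port_is_active(ipc_port_t port)
{
	ipc_object_label_t label;
	bool active;

	label  = ip_mq_lock_check_aligned(port);
	active = io_state_active(label.io_state);
	ip_mq_unlock_label_put(port, &label);

	return active;
}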
*/ - if (!ip_active(port) || ip_is_kolabeled(port)) { - ip_mq_unlock(port); + if (!io_state_active(label.io_state) || + (io_is_kobject_type(label.io_type) && label.iol_kobject)) { + ip_mq_unlock_label_put(port, &label); if (!ipc_right_inuse(entry)) { ipc_entry_dealloc(space, IPC_OBJECT_NULL, name, entry); } @@ -1092,7 +1035,7 @@ ipc_object_copyout_name( if ((msgt_name != MACH_MSG_TYPE_PORT_SEND_ONCE) && ipc_right_reverse(space, port, &oname, &oentry)) { if (name != oname) { - ip_mq_unlock(port); + ip_mq_unlock_label_put(port, &label); if (!ipc_right_inuse(entry)) { ipc_entry_dealloc(space, IPC_OBJECT_NULL, name, entry); } @@ -1103,7 +1046,7 @@ ipc_object_copyout_name( assert(entry == oentry); assert(entry->ie_bits & MACH_PORT_TYPE_SEND_RECEIVE); } else if (ipc_right_inuse(entry)) { - ip_mq_unlock(port); + ip_mq_unlock_label_put(port, &label); is_write_unlock(space); return KERN_NAME_EXISTS; } else { @@ -1111,14 +1054,14 @@ ipc_object_copyout_name( entry->ie_port = port; } -#if IMPORTANCE_INHERITANCE - /* - * We are slamming a receive right into the space, without - * first having been enqueued on a port destined there. So, - * we have to arrange to boost the task appropriately if this - * port has assertions (and the task wants them). - */ if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) { +#if IMPORTANCE_INHERITANCE + /* + * We are slamming a receive right into the space, without + * first having been enqueued on a port destined there. So, + * we have to arrange to boost the task appropriately if this + * port has assertions (and the task wants them). + */ if (space->is_task != TASK_NULL) { task_imp = space->is_task->task_imp_base; if (ipc_importance_task_is_any_receiver_type(task_imp)) { @@ -1131,15 +1074,16 @@ ipc_object_copyout_name( /* take port out of limbo */ port->ip_tempowner = 0; - } #endif /* IMPORTANCE_INHERITANCE */ - - kr = ipc_right_copyout(space, port, msgt_name, - IPC_OBJECT_COPYOUT_FLAGS_NONE, name, entry, NULL); - - /* port is unlocked */ - is_write_unlock(space); + ipc_right_copyout_recv_and_unlock_space(space, port, &label, + name, entry, NULL); + } else { + ip_label_put(port, &label); + ipc_right_copyout_any_send(space, port, msgt_name, + IPC_OBJECT_COPYOUT_FLAGS_NONE, name, entry); + is_write_unlock(space); + } #if IMPORTANCE_INHERITANCE /* @@ -1151,7 +1095,7 @@ ipc_object_copyout_name( } #endif /* IMPORTANCE_INHERITANCE */ - return kr; + return KERN_SUCCESS; } /* @@ -1209,8 +1153,6 @@ ipc_object_copyout_dest( } case MACH_MSG_TYPE_PORT_SEND_ONCE: { - assert(port->ip_sorights > 0); - if (ip_in_space(port, space)) { /* quietly consume the send-once right */ ip_sorights_dec(port); @@ -1242,6 +1184,32 @@ ipc_object_copyout_dest( *namep = name; } +void +ipc_object_unpin( + ipc_space_t space, + ipc_port_t port) +{ + mach_port_name_t name; + ipc_entry_t entry; + + if (IP_VALID(port)) { + is_write_lock(space); + ip_mq_lock(port); + + if (is_active(space) && + ipc_right_reverse(space, port, &name, &entry) && + (entry->ie_bits & IE_BITS_PINNED_SEND)) { + assert(entry->ie_bits & MACH_PORT_TYPE_SEND); + entry->ie_bits &= ~IE_BITS_PINNED_SEND; + + ipc_entry_modified(space, name, entry); + } + + ip_mq_unlock(port); + is_write_unlock(space); + } +} + static_assert(offsetof(struct ipc_object_waitq, iowq_waitq) == offsetof(struct ipc_port, ip_waitq)); static_assert(offsetof(struct ipc_object_waitq, iowq_waitq) == @@ -1286,47 +1254,29 @@ ipc_object_validate_preflight_panic(ipc_object_t io) bool ipc_object_lock_allow_invalid(ipc_object_t orig_io) { - struct waitq *orig_wq = 
io_waitq(orig_io); - struct waitq *wq = pgz_decode_allow_invalid(orig_wq, ZONE_ID_ANY); + struct waitq *wq = io_waitq(orig_io); switch (zone_id_for_element(wq, sizeof(*wq))) { case ZONE_ID_IPC_PORT: case ZONE_ID_IPC_PORT_SET: break; default: -#if CONFIG_PROB_GZALLOC - if (orig_wq != wq) { - /* - * The element was PGZ protected, and the translation - * returned another type than port or port-set, or - * ZONE_ID_INVALID (wq is NULL). - * - * We have to allow this skew, and assumed the slot - * has held a now freed port/port-set. - */ - return false; - } -#endif /* CONFIG_PROB_GZALLOC */ ipc_object_validate_preflight_panic(orig_io); } if (__probable(waitq_lock_allow_invalid(wq))) { ipc_object_t io = io_from_waitq(wq); - ipc_object_validate(io, io_otype(io)); -#if CONFIG_PROB_GZALLOC - if (__improbable(wq != orig_wq && - wq != pgz_decode_allow_invalid(orig_wq, ZONE_ID_ANY))) { - /* - * This object is no longer held in the slot, - * whatever this object is, it's not the droid - * we're looking for. Pretend we failed the lock. - */ - waitq_unlock(wq); - return false; - } -#endif /* CONFIG_PROB_GZALLOC */ + ipc_object_validate(io, io_type(io)); return true; } return false; } + +__attribute__((always_inline)) +void +ipc_object_unlock(ipc_object_t object) +{ + release_assert(!object->io_label_lock); + io_unlock_nocheck(object); +} diff --git a/osfmk/ipc/ipc_object.h b/osfmk/ipc/ipc_object.h index daf10c30e..4bdb29e18 100644 --- a/osfmk/ipc/ipc_object.h +++ b/osfmk/ipc/ipc_object.h @@ -72,12 +72,14 @@ #ifndef _IPC_IPC_OBJECT_H_ #define _IPC_IPC_OBJECT_H_ +#include #include #include #include #include #include #include +#include #include #include #include @@ -85,124 +87,387 @@ __BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN #pragma GCC visibility push(hidden) -typedef natural_t ipc_object_refs_t; /* for ipc/ipc_object.h */ typedef natural_t ipc_object_bits_t; -typedef natural_t ipc_object_type_t; __options_closed_decl(ipc_object_copyout_flags_t, uint32_t, { IPC_OBJECT_COPYOUT_FLAGS_NONE = 0x0, IPC_OBJECT_COPYOUT_FLAGS_PINNED = 0x1, - IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK = 0x2, }); __options_closed_decl(ipc_object_copyin_flags_t, uint16_t, { IPC_OBJECT_COPYIN_FLAGS_NONE = 0x0, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND = 0x1, /* Dest port contains an immovable send right */ - IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE = 0x2, - IPC_OBJECT_COPYIN_FLAGS_DEADOK = 0x4, - IPC_OBJECT_COPYIN_FLAGS_ALLOW_REPLY_MAKE_SEND_ONCE = 0x8, /* Port is a reply port. */ - IPC_OBJECT_COPYIN_FLAGS_ALLOW_REPLY_MOVE_SEND_ONCE = 0x10, /* Port is a reply port. */ - IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_RECEIVE = 0x20, - IPC_OBJECT_COPYIN_FLAGS_ALLOW_CONN_IMMOVABLE_RECEIVE = 0x40, /* Port is a libxpc connection port. */ - IPC_OBJECT_COPYIN_FLAGS_DEST_EXTRA_COPY = 0x80, - IPC_OBJECT_COPYIN_FLAGS_DEST_EXTRA_MOVE = 0x100, + IPC_OBJECT_COPYIN_FLAGS_DEADOK = 0x2, + IPC_OBJECT_COPYIN_FLAGS_DEST_EXTRA_COPY = 0x4, + IPC_OBJECT_COPYIN_FLAGS_DEST_EXTRA_MOVE = 0x8, }); -/* - * The ipc_object is used to both tag and reference count these two data - * structures, and (Noto Bene!) pointers to either of these or the - * ipc_object at the head of these are freely cast back and forth; hence - * the ipc_object MUST BE FIRST in the ipc_common_data. 
+__enum_decl(ipc_copyin_op_t, uint16_t, { + IPC_COPYIN_REASON_NONE, + IPC_COPYIN_KMSG_DESTINATION, + IPC_COPYIN_KMSG_REPLY, + IPC_COPYIN_KMSG_VOUCHER, + IPC_COPYIN_KMSG_PORT_DESCRIPTOR, + IPC_COPYIN_KMSG_GUARDED_PORT_DESCRIPTOR, + IPC_COPYIN_KMSG_OOL_PORT_ARRAY_DESCRIPTOR, + IPC_COPYIN_KERNEL_DESTINATION, +}); + +/*! + * @typedef ipc_object_state_t * - * If the RPC implementation enabled user-mode code to use kernel-level - * data structures (as ours used to), this peculiar structuring would - * avoid having anything in user code depend on the kernel configuration - * (with which lock size varies). + * @abstract + * Denotes the state of an IPC object. + * + * @const IO_STATE_INACTIVE + * The object is dead. + * + * Inactive ports have: + * - the receiver union is using the ip_timestamp field, + * holding a timestamp of when ipc_port_mark_inactive() + * was called; + * - ip_receiver_name set to MACH_PORT_NULL. + * + * @const IO_STATE_IN_SPACE + * The object is owned by some IPC space. + * + * In-space ports have: + * - the receiver union is using the ip_receiver field, pointing + * to the (possibly special) space holding the receive right for this port; + * - ip_receiver_name set to a non MACH_PORT_NULL value corresponding to the + * name of the receive right for this port within the corresponding space + * (MACH_PORT_SPECIAL_DEFAULT is used for special spaces). + * + * @const IO_STATE_IN_SPACE_IMMOVABLE + * The object is owned by some IPC space, and can't move out of it. + * + * @see IO_STATE_IN_SPACE for details of the receiver fields. + * + * @const IO_STATE_IN_LIMBO + * The object is a port and is currently being manipulated by the kernel + * and is in between states. + * + * In limbo ports have: + * - the receiver union set to 0; + * - ip_receiver_name set to MACH_PORT_NULL. + * + * @const IO_STATE_IN_LIMBO_PD + * The object is a port and is currently being manipulated by the kernel + * before being enqueued into a port-destroyed notification message. + * + * @see IO_STATE_IN_LIMBO_PD for details of the receiver fields. + * + * @const IO_STATE_IN_TRANSIT + * The object is a port and is enqueued on some port inside a message. + * + * In-transit ports have: + * - the receiver union using the ip_destination field, with a reference + * owning pointer to a valid port; + * - ip_receiver_name set to MACH_PORT_NULL. + * + * @const IO_STATE_IN_TRANSIT_PD + * The object is a port and is enqueued on some port inside a port-destroyed + * notification message. + * + * @see IO_STATE_IN_TRANSIT for details on the receiver fields. + */ +__enum_closed_decl(ipc_object_state_t, uint8_t, { + IO_STATE_INACTIVE, + IO_STATE_IN_SPACE, + IO_STATE_IN_SPACE_IMMOVABLE, + IO_STATE_IN_LIMBO, + IO_STATE_IN_LIMBO_PD, + IO_STATE_IN_TRANSIT, + IO_STATE_IN_TRANSIT_PD, +}); + +/*! + * @brief + * The header of an IPC object (port or pset). + * + * @discussion + * This header must be at the start of any IPC object that can be held + * in a port-space (currently, IPC ports and IPC port sets). + * + * @field io_type + * The type of the IPC object, this value is an immutable property + * of the IPC object and can be read without holding any locks. + * + * @field io_state + * This field denots the state of the object, it is mutable, + * and must be read under the object lock held. + * + * Furthermore, it can be autnenticated by extracting the IPC object label + * from the object (using @c io_label_get()), which is the preferred way + * to look at this field. 
+ * + * @field io_filtered + * Whether this port uses IPC filtering; this is an immutable property + * of the object and can be consulted without holding any lock. + * + * @field io_label_lock + * This is used to track extant IPC object labels for this object, + * and should not be consulted or manipulated directly. + * + * @field io_references + * The refcount for this IPC object (meaningless on an IPC object label). + * + * @field iol_pointer + * This union holds the actual label for an IPC object; it is signed + * and must be read by using the @c io_label_get() accessor. The pointer + * is not stable during the lifecycle of the object. */ struct ipc_object { - ipc_object_bits_t _Atomic io_bits; - ipc_object_refs_t _Atomic io_references; -} __attribute__((aligned(8))); + union { + struct { + ipc_object_type_t io_type; + ipc_object_state_t io_state : 3; + uint8_t io_filtered : 1; + uint8_t __io_unused1 : 4; + /* dPAC modifier boundary */ + /* + * the io_label_lock supports io_label_get/put, + * it could be a single bit, but a full byte + * yields much better codegen, and the bits are unused. + */ + bool io_label_lock; + uint8_t __io_unused2; + }; + ipc_object_bits_t io_bits; + }; + os_ref_atomic_t io_references; + union { + /* these are dPACed when on a port header */ + const void *iol_pointer; + unsigned long iol_value; + struct ipc_service_port_label *iol_service; + struct ipc_conn_port_label *iol_connection; + struct ipc_kobject_label *iol_kobject; + struct mk_timer *iol_mktimer; + }; +}; -/* - * IPC steals the high-order bits from the kotype to use - * for its own purposes. This allows IPC to record facts - * about ports that aren't otherwise obvious from the - * existing port fields. In particular, IPC can optionally - * mark a port for no more senders detection. Any change - * to IO_BITS_PORT_INFO must be coordinated with bitfield - * definitions in ipc_port.h. +/*! + * @brief + * Type used to hold details about a resolved IPC object type and label. * - * Note that the io_bits can be read atomically without - * holding the object lock (for example to read the kobject type). - * As such updates to this field need to use the io_bits_or() - * or io_bits_andnot() functions. + * @discussion + * This is a non-copyable, movable value type which is never stored + * on any data structure. + * + * It is also required that at most one such structure is extant for a given + * port at any point in time, and that the port lock is held for the whole + * time this structure is live. This is enforced at runtime. + * + * The core API to acquire such a type is @c io_label_get(), and when it is + * no longer used, @c ip_label_put() can be used to denote it's no longer + * valid, and the value will be poisoned. @c ip_label_set() can also be used + * if the value is going to be updated, which also consumes the label. + * + * Passing this structure by value to functions that will not call + * @c ip_label_put() or release the port lock is a valid and even encouraged + * practice, as this is a much better calling convention. */ -#define IO_BITS_PORT_INFO 0x0000f000 /* stupid port tricks */ -#define IO_BITS_KOTYPE 0x000003ff /* used by the object */ -#define IO_BITS_KOLABEL 0x00000400 /* The kobject has a label */ -#define IO_BITS_OTYPE 0x7fff0000 /* determines a zone */ -#define IO_BITS_ACTIVE 0x80000000 /* is object alive? */ +typedef struct ipc_object ipc_object_label_t; -#define io_bits(io) atomic_load_explicit(&(io)->io_bits, memory_order_relaxed) +#define IPC_OBJECT_LABEL(otype, ...) 
\ + ((ipc_object_label_t){ \ + .io_type = otype, \ + .io_state = IO_STATE_IN_SPACE, \ + ## __VA_ARGS__, \ + }) -static inline void -io_bits_or(ipc_object_t io, ipc_object_bits_t bits) -{ - /* - * prevent any possibility for the compiler to tear the update, - * the update still requires the io lock to be held. - */ - os_atomic_store(&io->io_bits, io_bits(io) | bits, relaxed); -} +#define IPC_OBJECT_LABEL_INVALID \ + ((ipc_object_label_t){ \ + .io_bits = ~0u, \ + .io_references = ~0u, \ + .iol_value = ~0ul, \ + }) -static inline void -io_bits_andnot(ipc_object_t io, ipc_object_bits_t bits) -{ - /* - * prevent any possibility for the compiler to tear the update, - * the update still requires the io lock to be held. - */ - os_atomic_store(&io->io_bits, io_bits(io) & ~bits, relaxed); -} +#define io_type(io) ((io)->io_type) +#define io_is_pset_type(t) ((t) == IOT_PORT_SET) +#define io_is_any_port_type(t) (!io_is_pset_type(t)) +#define io_is_kobject_type(t) ((t) >= __IKOT_FIRST) +#define io_is_any_port(io) io_is_any_port_type(io_type(io)) +#define io_is_pset(io) io_is_pset_type(io_type(io)) +#define io_is_kobject(io) io_is_kobject_type(io_type(io)) -#define io_active(io) ((io_bits(io) & IO_BITS_ACTIVE) != 0) - -#define io_otype(io) ((io_bits(io) & IO_BITS_OTYPE) >> 16) -#define io_kotype(io) (io_bits(io) & IO_BITS_KOTYPE) -#define io_is_kobject(io) (io_kotype(io) != 0) -#define io_is_kolabeled(io) ((io_bits(io) & IO_BITS_KOLABEL) != 0) -#define io_makebits(otype) (IO_BITS_ACTIVE | ((otype) << 16)) - -/* - * Object types: ports, port sets, kernel-loaded ports - */ -#define IOT_PORT 0 -#define IOT_PORT_SET 1 -#define IOT_NUMBER 2 /* number of types used */ - -extern zone_t __single ipc_object_zones[IOT_NUMBER]; - -#define io_alloc(otype, flags) \ - zalloc_flags(ipc_object_zones[otype], flags) +ZONE_DECLARE_ID(ZONE_ID_IPC_PORT, struct ipc_port); +ZONE_DECLARE_ID(ZONE_ID_IPC_PORT_SET, struct ipc_pset); /* * Here we depend on all ipc_objects being an ipc_wait_queue + * + * this type doesn't exist and is only used to do math */ +struct ipc_object_waitq { + struct ipc_object iowq_object; + struct waitq iowq_waitq; +}; #define io_waitq(io) \ (&__container_of(io, struct ipc_object_waitq, iowq_object)->iowq_waitq) #define io_from_waitq(waitq) \ (&__container_of(waitq, struct ipc_object_waitq, iowq_waitq)->iowq_object) -#define io_unlock(io) waitq_unlock(io_waitq(io)) -#define io_lock_held(io) assert(waitq_held(io_waitq(io))) -#define io_lock_held_kdp(io) waitq_held(io_waitq(io)) -#define io_lock_allow_invalid(io) ipc_object_lock_allow_invalid(io) +#define io_unlock(io) ipc_object_unlock(io) +#define io_unlock_nocheck(io) waitq_unlock(io_waitq(io)) +#define io_lock_held(io) assert(waitq_held(io_waitq(io))) +#define io_lock_held_kdp(io) waitq_held(io_waitq(io)) +#define io_lock_allow_invalid(io) ipc_object_lock_allow_invalid(io) -#define io_reference(io) ipc_object_reference(io) -#define io_release(io) ipc_object_release(io) -#define io_release_safe(io) ipc_object_release_safe(io) -#define io_release_live(io) ipc_object_release_live(io) +#define io_reference(io) ipc_object_reference(io) +#define io_release(io) ipc_object_release(io) +#define io_release_safe(io) ipc_object_release_safe(io) +#define io_release_live(io) ipc_object_release_live(io) + +static inline bool +io_state_active(ipc_object_state_t state) +{ + return state != IO_STATE_INACTIVE; +} + +static inline bool +io_state_in_space(ipc_object_state_t state) +{ + switch (state) { + case IO_STATE_IN_SPACE: + case IO_STATE_IN_SPACE_IMMOVABLE: + return 
true; + default: + return false; + } +} + +static inline bool +io_state_in_limbo(ipc_object_state_t state) +{ + switch (state) { + case IO_STATE_IN_LIMBO: + case IO_STATE_IN_LIMBO_PD: + return true; + default: + return false; + } +} + +static inline bool +io_state_in_transit(ipc_object_state_t state) +{ + switch (state) { + case IO_STATE_IN_TRANSIT: + case IO_STATE_IN_TRANSIT_PD: + return true; + default: + return false; + } +} + +static inline bool +io_state_is_moving(ipc_object_state_t state) +{ + switch (state) { + case IO_STATE_IN_LIMBO: + case IO_STATE_IN_LIMBO_PD: + case IO_STATE_IN_TRANSIT: + case IO_STATE_IN_TRANSIT_PD: + return true; + default: + return false; + } +} + +__result_use_check +__attribute__((always_inline)) +static inline ipc_object_label_t +__io_label_validate(ipc_object_t io, ipc_object_label_t label, bool lock) +{ + if (lock) { + io_lock_held(io); + release_assert(!io->io_label_lock); + io->io_label_lock = true; + } + + label.iol_pointer = ptrauth_auth_data(label.iol_pointer, + ptrauth_key_process_independent_data, + ptrauth_blend_discriminator(io, (uint32_t)(label.io_bits + + ptrauth_string_discriminator("ipc_object.iol_pointer")))); + +#if __has_feature(ptrauth_calls) + /* + * io_label() must guarantee that we always do the PAC evaluation, + * as callers even if they do not use bits or the pointer value, + * expect this validation to take place. + */ + __compiler_materialize_and_prevent_reordering_on(label.iol_pointer); +#endif + + return label; +} + +__result_use_check +__attribute__((always_inline, overloadable)) +static inline ipc_object_label_t +io_label_get(ipc_object_t io, ipc_object_type_t otype) +{ + ipc_object_label_t label; + + label = *io; + label.io_type = otype; + + return __io_label_validate(io, label, true); +} + +__result_use_check +__attribute__((always_inline, overloadable)) +static inline ipc_object_label_t +io_label_get(ipc_object_t io) +{ + return __io_label_validate(io, *io, true); +} + +__attribute__((always_inline, overloadable)) +static inline ipc_object_label_t +io_label_peek_kdp(ipc_object_t io) +{ + assert(!io_lock_held_kdp(io)); + return __io_label_validate(io, *io, false); +} + +__attribute__((always_inline)) +static inline void +io_label_init(ipc_object_t io, ipc_object_label_t label) +{ + atomic_store_explicit(os_cast_to_atomic_pointer(&io->io_bits), + label.io_bits, memory_order_relaxed); + + io->iol_pointer = ptrauth_sign_unauthenticated(label.iol_pointer, + ptrauth_key_process_independent_data, + ptrauth_blend_discriminator(io, (uint32_t)(label.io_bits + + ptrauth_string_discriminator("ipc_object.iol_pointer")))); +} + +__attribute__((always_inline)) +static inline void +io_label_set_and_put(ipc_object_t io, ipc_object_label_t *label) +{ + release_assert(io->io_label_lock); + io_lock_held(io); + + io_label_init(io, *label); + *label = IPC_OBJECT_LABEL_INVALID; +} + +__attribute__((always_inline)) +static inline void +io_label_put(ipc_object_t io, ipc_object_label_t *label) +{ + assert(io->io_type == label->io_type && + io->io_state == label->io_state); + release_assert(io->io_label_lock); + + io->io_label_lock = false; + *label = IPC_OBJECT_LABEL_INVALID; +} /* * Exported interfaces @@ -211,22 +476,25 @@ extern zone_t __single ipc_object_zones[IOT_NUMBER]; extern bool ipc_object_lock_allow_invalid( ipc_object_t object) __result_use_check; +extern void ipc_object_unlock( + ipc_object_t object); + extern void ipc_object_deallocate_register_queue(void); /* Take a reference to an object */ extern void ipc_object_reference( - 
ipc_object_t object); + ipc_object_t object); /* Release a reference to an object */ extern void ipc_object_release( - ipc_object_t object); + ipc_object_t object); extern void ipc_object_release_safe( - ipc_object_t object); + ipc_object_t object); /* Release a reference to an object that isn't the last one */ extern void ipc_object_release_live( - ipc_object_t object); + ipc_object_t object); /* Look up an object in a space */ extern kern_return_t ipc_object_translate( @@ -250,27 +518,21 @@ extern void ipc_object_validate( /* Allocate a dead-name entry */ extern kern_return_t ipc_object_alloc_dead( - ipc_space_t space, - mach_port_name_t *namep); + ipc_space_t space, + mach_port_name_t *namep); /* Allocate an object */ -extern kern_return_t ipc_object_alloc( - ipc_space_t space, - ipc_object_type_t otype, - mach_port_type_t type, - mach_port_urefs_t urefs, - mach_port_name_t *namep, - ipc_object_t *objectp); +extern kern_return_t ipc_object_alloc_entry( + ipc_space_t space, + ipc_object_t object, + mach_port_name_t *namep, + ipc_entry_t *entry); /* Allocate an object, with a specific name */ -extern kern_return_t ipc_object_alloc_name( - ipc_space_t space, - ipc_object_type_t otype, - mach_port_type_t type, - mach_port_urefs_t urefs, - mach_port_name_t name, - ipc_object_t *objectp, - void (^finish_init)(ipc_object_t object)); +extern kern_return_t ipc_object_alloc_entry_with_name( + ipc_space_t space, + mach_port_name_t name, + ipc_entry_t *entry); /* Convert a send type name to a received type name */ extern mach_msg_type_name_t ipc_object_copyin_type( @@ -282,6 +544,7 @@ extern kern_return_t ipc_object_copyin( mach_port_name_t name, mach_msg_type_name_t msgt_name, ipc_object_copyin_flags_t copyin_flags, + ipc_copyin_op_t copyin_reason, mach_msg_guarded_port_descriptor_t *gdesc, ipc_port_t *portp); @@ -329,6 +592,11 @@ extern void ipc_object_copyout_dest( mach_msg_type_name_t msgt_name, mach_port_name_t *namep); +/* Unpin the entry for a send right pointing to "object" */ +extern void ipc_object_unpin( + ipc_space_t space, + ipc_port_t port); + #pragma GCC visibility pop __ASSUME_PTR_ABI_SINGLE_END __END_DECLS diff --git a/osfmk/ipc/ipc_policy.c b/osfmk/ipc/ipc_policy.c index 8e5392ec3..9f40321b1 100644 --- a/osfmk/ipc/ipc_policy.c +++ b/osfmk/ipc/ipc_policy.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -36,6 +38,7 @@ #include #include #include +#include /* is_ux_handler_port() */ #include /* current_map() */ #include /* current_proc() */ @@ -53,7 +56,6 @@ #include -extern int proc_isinitproc(struct proc *p); extern bool proc_is_simulated(struct proc *); extern char *proc_name_address(struct proc *p); extern int exit_with_guard_exception( @@ -61,8 +63,6 @@ extern int exit_with_guard_exception( mach_exception_data_type_t code, mach_exception_data_type_t subcode); - - #pragma mark policy tunables extern const vm_size_t ipc_kmsg_max_vm_space; @@ -73,11 +73,107 @@ static TUNABLE(bool, allow_legacy_mach_msg, "allow_legacy_mach_msg", false); #endif /* DEVELOPMENT || DEBUG */ #endif /* IPC_HAS_LEGACY_MACH_MSG_TRAP */ +/* a boot-arg to enable/disable OOL port array restrictions */ +#if XNU_TARGET_OS_XR +TUNABLE(bool, ool_port_array_enforced, "ool_port_array_enforced", false); +#else +TUNABLE(bool, ool_port_array_enforced, "ool_port_array_enforced", true); +#endif /* XNU_TARGET_OS_XR */ + +/* Note: Consider Developer Mode when changing the default. 
*/ +TUNABLE(ipc_control_port_options_t, ipc_control_port_options, + "ipc_control_port_options", + ICP_OPTIONS_IMMOVABLE_1P_HARD | + ICP_OPTIONS_PINNED_1P_HARD | +#if !XNU_TARGET_OS_OSX + ICP_OPTIONS_IMMOVABLE_3P_HARD | +#endif + ICP_OPTIONS_PINNED_3P_SOFT); + +TUNABLE(bool, service_port_defense_enabled, "-service_port_defense_enabled", true); + +/* The bootarg to disable ALL ipc policy violation telemetry */ +TUNABLE(bool, ipcpv_telemetry_enabled, "-ipcpv_telemetry_enabled", true); + +/* boot-arg for provisional reply port enforcement */ +#if XNU_TARGET_OS_OSX || XNU_TARGET_OS_BRIDGE +TUNABLE(bool, prp_enforcement_enabled, "-prp_enforcement_enabled", false); +#else +TUNABLE(bool, prp_enforcement_enabled, "-prp_enforcement_enabled", true); +#endif /* XNU_TARGET_OS_OSX || XNU_TARGET_OS_BRIDGE */ + +/* + * bootargs for reply port semantics on bootstrap ports + */ +TUNABLE(bool, bootstrap_port_telemetry_enabled, "-bootstrap_port_telemetry_enabled", true); +TUNABLE(bool, bootstrap_port_enforcement_enabled, "-bootstrap_port_enforcement_enabled", true); + +/* Enables reply port/voucher/persona debugging code */ +TUNABLE(bool, enforce_strict_reply, "-enforce_strict_reply", false); #pragma mark policy options +ipc_space_policy_t +ipc_policy_for_task(task_t task) +{ +#if XNU_TARGET_OS_OSX + struct proc *proc = get_bsdtask_info(task); +#endif /* XNU_TARGET_OS_OSX */ + ipc_space_policy_t policy = IPC_SPACE_POLICY_DEFAULT; + uint32_t ro_flags; + + if (task == kernel_task) { + return policy | IPC_SPACE_POLICY_KERNEL; + } + + ro_flags = task_ro_flags_get(task); + if (ro_flags & TFRO_PLATFORM) { + policy |= IPC_SPACE_POLICY_PLATFORM; + policy |= IPC_POLICY_ENHANCED_V2; + } + + if (task_get_platform_restrictions_version(task) >= 2) { + policy |= IPC_POLICY_ENHANCED_V2; + } else if (task_get_platform_restrictions_version(task) == 1) { + policy |= IPC_POLICY_ENHANCED_V1; +#if XNU_TARGET_OS_OSX + } else if (proc && csproc_hardened_runtime(proc)) { + policy |= IPC_POLICY_ENHANCED_V0; +#endif /* XNU_TARGET_OS_OSX */ + } + +#if XNU_TARGET_OS_OSX + if (task_opted_out_mach_hardening(task)) { + policy |= IPC_SPACE_POLICY_OPTED_OUT; + } +#endif /* XNU_TARGET_OS_OSX */ + + /* + * policy modifiers + */ +#if XNU_TARGET_OS_OSX + if (proc && proc_is_simulated(proc)) { + policy |= IPC_SPACE_POLICY_SIMULATED; + } +#endif +#if CONFIG_ROSETTA + if (task_is_translated(task)) { + policy |= IPC_SPACE_POLICY_TRANSLATED; + } +#endif + + return policy; +} + + +inline ipc_space_policy_t +ipc_convert_msg_options_to_space(mach_msg_option64_t opts) +{ + return opts >> MACH64_POLICY_SHIFT; +} + mach_msg_option64_t -ipc_current_user_policy( +ipc_current_msg_options( task_t task, mach_msg_option64_t opts) { @@ -104,34 +200,33 @@ ipc_current_user_policy( /* * Step 2: derive policy flags from the current context */ - if (ro_flags & TFRO_PLATFORM) { - opts |= MACH64_POLICY_PLATFORM; - opts |= MACH64_POLICY_RIGID; - opts |= MACH64_POLICY_HARDENED; - } - if (ro_flags & TFRO_HARDENED) { - opts |= MACH64_POLICY_RIGID; - opts |= MACH64_POLICY_HARDENED; - } -#if CONFIG_ROSETTA - if (task_is_translated(task)) { - opts |= MACH64_POLICY_TRANSLATED; - } -#endif -#if XNU_TARGET_OS_OSX - struct proc *proc = get_bsdtask_info(task); - if (proc_is_simulated(proc)) { - opts |= MACH64_POLICY_SIMULATED; - } - if (csproc_hardened_runtime(proc)) { - opts |= MACH64_POLICY_HARDENED; - } -#endif - if (!(opts & MACH64_POLICY_NEEDED_MASK)) { - /* helps assert that a policy has been set */ - opts |= MACH64_POLICY_DEFAULT; + { + /* + * mach_msg_option64_t can't use 
IPC_SPACE_POLICY_BASE(), + * check using this MACH64_POLICY_SHIFT is legitimate. + */ +#define verify_policy_enum(name) \ + static_assert(IPC_SPACE_POLICY_ ## name == \ + MACH64_POLICY_ ## name >> MACH64_POLICY_SHIFT) + + verify_policy_enum(DEFAULT); + verify_policy_enum(ENHANCED); + verify_policy_enum(PLATFORM); + verify_policy_enum(KERNEL); + verify_policy_enum(SIMULATED); + verify_policy_enum(TRANSLATED); + verify_policy_enum(OPTED_OUT); + verify_policy_enum(ENHANCED_V0); + verify_policy_enum(ENHANCED_V1); + verify_policy_enum(ENHANCED_V2); + verify_policy_enum(ENHANCED_VERSION_MASK); + verify_policy_enum(MASK); + +#undef verify_policy_enum } + opts |= (uint64_t)ipc_space_policy(task->itk_space) << MACH64_POLICY_SHIFT; + return opts; } @@ -169,6 +264,34 @@ ipc_preflight_msg_option64(mach_msg_option64_t opts) return KERN_NOT_SUPPORTED; } +#pragma mark helpers + +bool +ipc_should_apply_policy( + const ipc_space_policy_t current_policy, + const ipc_space_policy_t requested_level) +{ + /* Do not apply security policies on these binaries to avoid bincompat regression */ + if ((current_policy & IPC_SPACE_POLICY_SIMULATED) || + (current_policy & IPC_SPACE_POLICY_OPTED_OUT) || + (current_policy & IPC_SPACE_POLICY_TRANSLATED)) { + return false; + } + + /* Check versioning for applying platform restrictions policy */ + if (requested_level & current_policy & IPC_SPACE_POLICY_ENHANCED) { + /* Platform is always opted into platform restrictions */ + if (current_policy & IPC_SPACE_POLICY_PLATFORM) { + return true; + } + + const ipc_space_policy_t requested_version = requested_level & IPC_SPACE_POLICY_ENHANCED_VERSION_MASK; + const ipc_space_policy_t current_es_version = current_policy & IPC_SPACE_POLICY_ENHANCED_VERSION_MASK; + assert(requested_version != 0); + return requested_version <= current_es_version; + } + return current_policy & requested_level; +} #pragma mark legacy trap policies #if IPC_HAS_LEGACY_MACH_MSG_TRAP @@ -257,7 +380,8 @@ ipc_policy_allow_legacy_send_trap( mach_msg_id_t msgid, mach_msg_option64_t opts) { - if ((opts & MACH64_POLICY_HARDENED) == 0) { + /* equivalent to ENHANCED_V0 */ + if ((opts & MACH64_POLICY_ENHANCED) == 0) { #if __x86_64__ if (current_map()->max_offset <= VM_MAX_ADDRESS) { /* @@ -317,12 +441,17 @@ struct ipc_policy_violations_rb_entry { char signing_id[CA_SIGNINGID_MAX_LEN]; ipc_policy_violation_id_t violation_id; int sw_platform; - int msgh_id; + int aux_data; int sdk; }; struct ipc_policy_violations_rb_entry ipc_policy_violations_rb[IPC_POLICY_VIOLATIONS_RB_SIZE]; static uint8_t ipc_policy_violations_rb_index = 0; +#if DEBUG || DEVELOPMENT +/* sysctl debug.ipcpv_telemetry_count */ +_Atomic unsigned int ipcpv_telemetry_count = 0; +#endif + LCK_GRP_DECLARE(ipc_telemetry_lock_grp, "ipc_telemetry_lock_grp"); LCK_TICKET_DECLARE(ipc_telemetry_lock, &ipc_telemetry_lock_grp); @@ -336,7 +465,8 @@ CA_EVENT(reply_port_semantics_violations, CA_STATIC_STRING(CA_MACH_SERVICE_PORT_NAME_LEN), service_name, CA_STATIC_STRING(CA_TEAMID_MAX_LEN), team_id, CA_STATIC_STRING(CA_SIGNINGID_MAX_LEN), signing_id, - CA_INT, reply_port_semantics_violation); + CA_INT, reply_port_semantics_violation, + CA_INT, msgh_id); /* for aux_data, keeping the legacy name msgh_id to avoid CA shenanigan */ static void send_telemetry( @@ -351,6 +481,7 @@ send_telemetry( strlcpy(event->team_id, entry->team_id, CA_TEAMID_MAX_LEN); strlcpy(event->signing_id, entry->signing_id, CA_SIGNINGID_MAX_LEN); event->reply_port_semantics_violation = entry->violation_id; + event->msgh_id = entry->aux_data; 
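/*
 * [Editorial sketch, not part of the xnu patch] How the policy helpers above
 * are meant to compose at an enforcement site; the feature gate shown is
 * hypothetical:
 *
 *	ipc_space_policy_t pol = ipc_convert_msg_options_to_space(opts);
 *
 *	if (ipc_should_apply_policy(pol, IPC_POLICY_ENHANCED_V2)) {
 *		// enforce the restriction (e.g. raise a guard and fail the send)
 *	} else {
 *		// simulated, translated, opted-out, or pre-V2 tasks keep the
 *		// legacy behavior for binary compatibility
 *	}
 */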
CA_EVENT_SEND(ca_event); } @@ -401,32 +532,33 @@ flush_ipc_policy_violations_telemetry(void) void ipc_stash_policy_violations_telemetry( ipc_policy_violation_id_t violation_id, - mach_service_port_info_t sp_info, + ipc_port_t service_port, int aux_data) { + if (!ipcpv_telemetry_enabled) { + return; + } + struct ipc_policy_violations_rb_entry *entry; char *service_name = (char *) "unknown"; task_t task = current_task_early(); int pid = -1; - bool skip_telemetry = false; - if (task && violation_id == IPCPV_REPLY_PORT_SEMANTICS_OPTOUT) { - task_lock(task); - /* Telemetry rate limited to once per task per host. */ - skip_telemetry = task_has_reply_port_telemetry(task); - if (!skip_telemetry) { - task_set_reply_port_telemetry(task); +#if CONFIG_SERVICE_PORT_INFO + if (IP_VALID(service_port)) { + /* + * dest_port lock must be held to avoid race condition + * when accessing ip_splabel rdar://139066947 + */ + struct mach_service_port_info sp_info; + ipc_object_label_t label = ip_mq_lock_label_get(service_port); + if (io_state_active(label.io_state) && ip_is_any_service_port_type(label.io_type)) { + ipc_service_port_label_get_info(label.iol_service, &sp_info); + service_name = sp_info.mspi_string_name; } - task_unlock(task); - } - - if (skip_telemetry) { - return; - } - - if (sp_info) { - service_name = sp_info->mspi_string_name; + ip_mq_unlock_label_put(service_port, &label); } +#endif /* CONFIG_SERVICE_PORT_INFO */ if (task) { pid = task_pid(task); @@ -463,7 +595,7 @@ ipc_stash_policy_violations_telemetry( if (signing_id) { strlcpy(entry->signing_id, signing_id, CA_SIGNINGID_MAX_LEN); } - entry->msgh_id = aux_data; + entry->aux_data = aux_data; entry->sw_platform = platform; entry->sdk = sdk; } @@ -475,11 +607,85 @@ ipc_stash_policy_violations_telemetry( lck_ticket_unlock(&ipc_telemetry_lock); } +#if DEBUG || DEVELOPMENT void -send_prp_telemetry(int msgh_id) +ipc_inc_telemetry_count(void) { - ipc_policy_violation_id_t violation_type = (csproc_hardened_runtime(current_proc())) ? IPCPV_MOVE_REPLY_PORT_HARDENED_RUNTIME : IPCPV_MOVE_REPLY_PORT_3P; - ipc_stash_policy_violations_telemetry(violation_type, NULL, msgh_id); + unsigned int count = os_atomic_load(&ipcpv_telemetry_count, relaxed); + if (!os_add_overflow(count, 1, &count)) { + os_atomic_store(&ipcpv_telemetry_count, count, relaxed); + } +} +#endif /* DEBUG || DEVELOPMENT */ + +/*! + * @brief + * Checks that this message conforms to reply port policies, which are: + * 1. IOT_REPLY_PORT's must be make-send-once disposition + * 2. 
You must use an IOT_REPLY_PORT (or weak variant) if the dest_port requires it + * + * @param reply_port the message local/reply port + * @param dest_port the message remote/dest port + * + * @returns + * - true if there is a violation in the security policy for this mach msg + * - false otherwise + */ +static mach_msg_return_t +ipc_validate_local_port( + mach_port_t reply_port, + mach_port_t dest_port, + mach_msg_option64_t opts) +{ + assert(IP_VALID(dest_port)); + /* An empty reply port, or an inactive reply port / dest port violates nothing */ + if (!IP_VALID(reply_port) || !ip_active(reply_port) || !ip_active(dest_port)) { + return MACH_MSG_SUCCESS; + } + + if (ip_is_reply_port(reply_port)) { + return MACH_MSG_SUCCESS; + } + + ipc_space_policy_t pol = ipc_convert_msg_options_to_space(opts); + /* skip translated and simulated process */ + if (!ipc_should_apply_policy((pol), IPC_SPACE_POLICY_DEFAULT)) { + return MACH_MSG_SUCCESS; + } + + /* kobject enforcement */ + if (ip_is_kobject(dest_port) && + ipc_should_apply_policy(pol, IPC_POLICY_ENHANCED_V1)) { + mach_port_guard_exception(ip_get_receiver_name(dest_port), 0, kGUARD_EXC_KOBJECT_REPLY_PORT_SEMANTICS); + return MACH_SEND_INVALID_REPLY; + } + + if (!ipc_policy(dest_port)->pol_enforce_reply_semantics || ip_is_provisional_reply_port(reply_port)) { + return MACH_MSG_SUCCESS; + } + + /* bootstrap port defense */ + if (ip_is_bootstrap_port(dest_port) && ipc_should_apply_policy(pol, IPC_POLICY_ENHANCED_V2)) { + if (bootstrap_port_telemetry_enabled && + !ipc_space_has_telemetry_type(current_space(), IS_HAS_BOOTSTRAP_PORT_TELEMETRY)) { + ipc_stash_policy_violations_telemetry(IPCPV_BOOTSTRAP_PORT, dest_port, 0); + } + if (bootstrap_port_enforcement_enabled) { + mach_port_guard_exception(ip_get_receiver_name(dest_port), 1, kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS); + return MACH_SEND_INVALID_REPLY; + } + } + + /* regular enforcement */ + if (!ip_is_bootstrap_port(dest_port)) { + if (ip_type(dest_port) == IOT_SERVICE_PORT) { + ipc_stash_policy_violations_telemetry(IPCPV_REPLY_PORT_SEMANTICS_OPTOUT, dest_port, 0); + } + mach_port_guard_exception(ip_get_receiver_name(dest_port), 0, kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS); + return MACH_SEND_INVALID_REPLY; + } + + return MACH_MSG_SUCCESS; } #pragma mark MACH_SEND_MSG policies @@ -530,48 +736,52 @@ ipc_validate_kmsg_schema_from_user( static mach_msg_return_t ipc_filter_kmsg_header_from_user( mach_msg_header_t *hdr, + mach_port_t dport, mach_msg_option64_t opts) { static const uint32_t MACH_BOOTSTRAP_PORT_MSG_ID_MASK = ((1u << 24) - 1); mach_msg_filter_id fid = 0; - mach_port_t remote_port = hdr->msgh_remote_port; + ipc_object_label_t dlabel; mach_msg_id_t msg_id = hdr->msgh_id; - ipc_service_port_label_t label = NULL; - void *sblabel = NULL; + struct ipc_conn_port_label *sblabel = NULL; - if (!ip_enforce_msg_filtering(remote_port)) { - return MACH_MSG_SUCCESS; - } + dlabel = ip_mq_lock_label_get(dport); - ip_mq_lock(remote_port); - if (!ip_active(remote_port)) { - /* nothing to do */ - } else if (remote_port->ip_service_port) { - label = remote_port->ip_splabel; - sblabel = label->ispl_sblabel; - - /* - * Mask the top byte for messages sent to launchd's bootstrap port. 
- * Filter any messages with domain 0 (as they correspond to MIG - * based messages) - */ - if (ipc_service_port_label_is_bootstrap_port(label)) { - if ((msg_id & ~MACH_BOOTSTRAP_PORT_MSG_ID_MASK) == 0) { - ip_mq_unlock(remote_port); - goto filtered_msg; + if (io_state_active(dlabel.io_state) && dlabel.io_filtered) { + switch (dlabel.io_type) { + case IOT_SERVICE_PORT: + case IOT_WEAK_SERVICE_PORT: + /* + * Mask the top byte for messages sent to launchd's bootstrap port. + * Filter any messages with domain 0 (as they correspond to MIG + * based messages) + */ + if (dlabel.iol_service->ispl_bootstrap_port) { + if ((msg_id & ~MACH_BOOTSTRAP_PORT_MSG_ID_MASK) == 0) { + ip_mq_unlock_label_put(dport, &dlabel); + goto filtered_msg; + } + msg_id = msg_id & MACH_BOOTSTRAP_PORT_MSG_ID_MASK; } - msg_id = msg_id & MACH_BOOTSTRAP_PORT_MSG_ID_MASK; + + sblabel = dlabel.iol_service->ispl_sblabel; + break; + + case IOT_CONNECTION_PORT: + /* Connection ports can also have send-side message filters */ + sblabel = dlabel.iol_connection; + break; + + default: + break; } - } else { - assert(!ip_is_kolabeled(remote_port)); - /* Connection ports can also have send-side message filters */ - sblabel = remote_port->ip_splabel; } if (sblabel) { mach_msg_filter_retain_sblabel_callback(sblabel); } - ip_mq_unlock(remote_port); + + ip_mq_unlock_label_put(dport, &dlabel); if (sblabel && !mach_msg_fetch_filter_policy(sblabel, msg_id, &fid)) { goto filtered_msg; @@ -647,25 +857,24 @@ ipc_validate_kmsg_dest_from_user( } #endif /* XNU_TARGET_OS_OSX */ - if (ip_is_kobject(port)) { - natural_t kotype = ip_kotype(port); - - if (__improbable(kotype == IKOT_TIMER)) { + natural_t otype = ip_type(port); + if (otype == IOT_TIMER_PORT) { #if XNU_TARGET_OS_OSX - if (__improbable(opts & MACH64_POLICY_HARDENED)) { - return MACH_SEND_INVALID_OPTIONS; - } - /* - * For bincompat, let's still allow user messages to timer port, but - * force MACH64_SEND_MQ_CALL flag for memory segregation. - */ - if (__improbable(!(opts & MACH64_SEND_MQ_CALL))) { - return MACH_SEND_INVALID_OPTIONS; - } -#else + if (__improbable(opts & MACH64_POLICY_ENHANCED)) { return MACH_SEND_INVALID_OPTIONS; + } + /* + * For bincompat, let's still allow user messages to timer port, but + * force MACH64_SEND_MQ_CALL flag for memory segregation. 
+ */ + if (__improbable(!(opts & MACH64_SEND_MQ_CALL))) { + return MACH_SEND_INVALID_OPTIONS; + } +#else + return MACH_SEND_INVALID_OPTIONS; #endif - } else if (kotype == IKOT_UEXT_OBJECT) { + } else if (io_is_kobject_type(otype)) { + if (otype == IKOT_UEXT_OBJECT) { if (__improbable(!(opts & MACH64_SEND_DK_CALL))) { return MACH_SEND_INVALID_OPTIONS; } @@ -677,7 +886,7 @@ ipc_validate_kmsg_dest_from_user( /* kobject calls must be a combined send/receive */ if (__improbable((opts & MACH64_RCV_MSG) == 0)) { - if ((opts & MACH64_POLICY_HARDENED) || + if ((opts & MACH64_POLICY_ENHANCED) || IP_VALID(hdr->msgh_local_port) || !ipc_policy_allow_send_only_kobject_calls()) { return MACH_SEND_INVALID_OPTIONS; @@ -708,7 +917,9 @@ ipc_validate_kmsg_header_from_user( mach_msg_option64_t opts) { ipc_port_t dest_port = hdr->msgh_remote_port; - mach_msg_return_t mr = KERN_SUCCESS; + ipc_port_t reply_port = hdr->msgh_local_port; + mach_msg_return_t mr = MACH_MSG_SUCCESS; + ipc_space_policy_t current_policy; if (opts & MACH64_MACH_MSG2) { mr = ipc_validate_kmsg_dest_from_user(hdr, dest_port, opts); @@ -718,12 +929,40 @@ ipc_validate_kmsg_header_from_user( } /* - * Check if dest is a no-grant port; Since this bit is set only on - * port construction and cannot be unset later, we can peek at the - * bit without paying the cost of locking the port. + * For enhanced v2 binaries, enforce two OOL port array restrictions: + * - the receive right has to be of a type that explicitly + * allows receiving that descriptor + * - there could be no more than ONE single array in a kmsg */ - if (send_uctx->send_dsc_port_count && dest_port->ip_no_grant) { - mr = MACH_SEND_NO_GRANT_DEST; + current_policy = ipc_convert_msg_options_to_space(opts); + if (ool_port_array_enforced && + send_uctx->send_dsc_port_arrays_count && + ipc_should_apply_policy(current_policy, IPC_POLICY_ENHANCED_V2)) { + if (!ip_is_port_array_allowed(dest_port)) { + mach_port_guard_exception(current_policy, + MPG_PAYLOAD(MPG_FLAGS_INVALID_OPTIONS_OOL_RIGHT, + ip_type(dest_port)), + kGUARD_EXC_DESCRIPTOR_VIOLATION); + + return MACH_SEND_INVALID_OPTIONS; + } + + if (send_uctx->send_dsc_port_arrays_count > 1) { + mach_port_guard_exception(current_policy, + MPG_PAYLOAD(MPG_FLAGS_INVALID_OPTIONS_OOL_ARRAYS, + send_uctx->send_dsc_port_arrays_count), + kGUARD_EXC_DESCRIPTOR_VIOLATION); + + return MACH_SEND_INVALID_OPTIONS; + } + } + + /* + * Ensure that the reply field follows our security policies, + * including IOT_REPLY_PORT requirements + */ + mr = ipc_validate_local_port(reply_port, dest_port, opts); + if (mr != MACH_MSG_SUCCESS) { goto out; } @@ -731,8 +970,9 @@ ipc_validate_kmsg_header_from_user( * Evaluate message filtering if the sender is filtered. 
*/ if ((opts & MACH64_POLICY_FILTER_MSG) && - mach_msg_filter_at_least(MACH_MSG_FILTER_CALLBACKS_VERSION_1)) { - mr = ipc_filter_kmsg_header_from_user(hdr, opts); + mach_msg_filter_at_least(MACH_MSG_FILTER_CALLBACKS_VERSION_1) && + ip_to_object(dest_port)->io_filtered) { + mr = ipc_filter_kmsg_header_from_user(hdr, dest_port, opts); if (mr != MACH_MSG_SUCCESS) { goto out; } @@ -745,6 +985,136 @@ out: return mr; } +#pragma mark receive immovability + +bool +ipc_move_receive_allowed( + ipc_space_t space, + ipc_port_t port, + mach_port_name_t name) +{ + ipc_space_policy_t policy = ipc_space_policy(space); + /* + * Check for service port before immovability so the task crash + * with reason kGUARD_EXC_SERVICE_PORT_VIOLATION_FATAL + */ + if (service_port_defense_enabled && + ip_type(port) == IOT_SERVICE_PORT && + !task_is_initproc(space->is_task)) { + mach_port_guard_exception(IPCPV_MOVE_SERVICE_PORT, name, + kGUARD_EXC_SERVICE_PORT_VIOLATION_FATAL); + return false; + } + + if (ip_type(port) == IOT_PROVISIONAL_REPLY_PORT && + ipc_should_apply_policy(policy, IPC_POLICY_ENHANCED_V2) && + !ipc_space_has_telemetry_type(space, IS_HAS_MOVE_PRP_TELEMETRY)) { + mach_port_guard_exception(name, 0, kGUARD_EXC_MOVE_PROVISIONAL_REPLY_PORT); + } + + if (ip_is_immovable_receive(port)) { + mach_port_guard_exception(name, 0, kGUARD_EXC_IMMOVABLE); + return false; + } + + return true; +} + +#pragma mark send immovability + + +bool +ipc_should_mark_immovable_send( + task_t task, + ipc_port_t port, + ipc_object_label_t label) +{ + /* + * some entitled processes are allowed to get movable control ports + * see `task_set_ctrl_port_default` - also all control ports are movable + * before/after the space becomes inactive. They will be made movable before + * the `task` is able to run code in userspace in `task_wait_to_return` + */ + if ((!task_is_immovable(task) || + !is_active(task->itk_space)) && + ip_is_tt_control_port_type(label.io_type)) { + return false; + } + + /* tasks get their own thread control port as immovable */ + if (label.io_type == IKOT_THREAD_CONTROL) { + thread_t thread = ipc_kobject_get_raw(port, IKOT_THREAD_CONTROL); + if (thread != THREAD_NULL && task == get_threadtask(thread)) { + return true; + } + } + + /* tasks get their own task control port as immovable */ + if (task->itk_task_ports[TASK_FLAVOR_CONTROL] == port) { + return true; + } + + /* special cases are handled, check the default policy */ + if (!ipc_policy(label)->pol_movable_send) { + return true; + } + + return false; +} + +/* requires: nothing locked, port is valid */ +static bool +ip_is_currently_immovable_send(ipc_port_t port) +{ + ipc_object_label_t label = ipc_port_lock_label_get(port); + if (task_is_immovable(current_task()) && + (ip_is_tt_control_port_type(label.io_type))) { + /* most tasks cannot move their control ports */ + ip_mq_unlock_label_put(port, &label); + return true; + } + + bool is_always_immovable_send = !ipc_policy(label)->pol_movable_send; + ip_mq_unlock_label_put(port, &label); + return is_always_immovable_send; +} + +bool +ipc_can_stash_naked_send(ipc_port_t port) +{ + return !IP_VALID(port) || !ip_is_currently_immovable_send(port); +} + +#pragma mark entry init + +void +ipc_entry_init( + ipc_space_t space, + ipc_object_t object, + mach_port_type_t type, + ipc_entry_t entry, + mach_port_urefs_t urefs, + mach_port_name_t name) +{ + /* object type can be deadname, port, or a portset */ + assert((type & MACH_PORT_TYPE_ALL_RIGHTS) == type); + assert(type != MACH_PORT_TYPE_NONE); + assert(urefs <= 
MACH_PORT_UREFS_MAX); + assert(entry); + + if (object && (type & MACH_PORT_TYPE_SEND_RIGHTS)) { + ipc_port_t port = ip_object_to_port(object); + ipc_object_label_t label = ip_label_get(port); + + if (ipc_should_mark_immovable_send(space->is_task, port, label)) { + entry->ie_bits |= IE_BITS_IMMOVABLE_SEND; + } + io_label_set_and_put(&port->ip_object, &label); + } + entry->ie_object = object; + entry->ie_bits |= type | urefs; + ipc_entry_modified(space, name, entry); +} #pragma mark policy guard violations @@ -772,47 +1142,62 @@ void mach_port_guard_exception_immovable( ipc_space_t space, mach_port_name_t name, - mach_port_t port) + mach_port_t port, + mach_msg_type_name_t disp, + __assert_only ipc_entry_t entry) { if (space == current_space()) { - assert(ip_is_immovable_send(port)); + assert(entry->ie_bits & IE_BITS_IMMOVABLE_SEND); + assert(entry->ie_port == port); boolean_t hard = task_get_control_port_options(current_task()) & TASK_CONTROL_PORT_IMMOVABLE_HARD; + uint64_t payload = MPG_PAYLOAD(MPG_FLAGS_NONE, ip_type(port), disp); - if (ip_is_control(port)) { + if (ip_is_tt_control_port(port)) { assert(task_is_immovable(current_task())); - mach_port_guard_exception(name, MPG_FLAGS_NONE, + mach_port_guard_exception(name, payload, hard ? kGUARD_EXC_IMMOVABLE : kGUARD_EXC_IMMOVABLE_NON_FATAL); } else { /* always fatal exception for non-control port violation */ - mach_port_guard_exception(name, MPG_FLAGS_NONE, - kGUARD_EXC_IMMOVABLE); + mach_port_guard_exception(name, payload, kGUARD_EXC_IMMOVABLE); } } } -/* - * Deliver a soft or hard immovable guard exception. - * - * Conditions: port is marked as immovable and pinned. - */ void mach_port_guard_exception_pinned( ipc_space_t space, mach_port_name_t name, - __assert_only mach_port_t port, uint64_t payload) { - if (space == current_space()) { - assert(ip_is_immovable_send(port)); - assert(ip_is_control(port)); /* only task/thread control ports can be pinned */ + ipc_space_policy_t policy = ipc_space_policy(space); + int guard; - boolean_t hard = task_get_control_port_options(current_task()) & TASK_CONTROL_PORT_PINNED_HARD; + if (space != current_space()) { + guard = kGUARD_EXC_NONE; + } else if (policy & + (IPC_SPACE_POLICY_TRANSLATED | IPC_SPACE_POLICY_SIMULATED)) { + guard = kGUARD_EXC_NONE; + } else if (ipc_should_apply_policy(policy, IPC_POLICY_ENHANCED_V1)) { + if (ipc_control_port_options & ICP_OPTIONS_PINNED_1P_HARD) { + guard = kGUARD_EXC_MOD_REFS; + } else if (ipc_control_port_options & ICP_OPTIONS_PINNED_1P_SOFT) { + guard = kGUARD_EXC_MOD_REFS_NON_FATAL; + } else { + guard = kGUARD_EXC_NONE; + } + } else { + if (ipc_control_port_options & ICP_OPTIONS_PINNED_3P_HARD) { + guard = kGUARD_EXC_MOD_REFS; + } else if (ipc_control_port_options & ICP_OPTIONS_PINNED_3P_SOFT) { + guard = kGUARD_EXC_MOD_REFS_NON_FATAL; + } else { + guard = kGUARD_EXC_NONE; + } + } - assert(task_is_pinned(current_task())); - - mach_port_guard_exception(name, payload, - hard ? 
kGUARD_EXC_MOD_REFS : kGUARD_EXC_MOD_REFS_NON_FATAL); + if (guard != kGUARD_EXC_NONE) { + mach_port_guard_exception(name, payload, guard); } } @@ -874,7 +1259,8 @@ mach_port_guard_ast( while (behavior & TASK_EXC_GUARD_MP_ONCE) { uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_MP_DELIVER; - if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) { + if (os_atomic_cmpxchg(&task->task_exc_guard, + behavior, new_behavior, relaxed)) { break; } behavior = task->task_exc_guard; @@ -921,3 +1307,323 @@ mach_port_guard_ast( } } } + +#pragma mark notification policies + +static bool +ipc_allow_service_port_register_pd( + ipc_port_t service_port, + ipc_port_t notify_port, + uint64_t *payload) +{ + /* boot-arg disables this security policy */ + if (!service_port_defense_enabled || !IP_VALID(notify_port)) { + return true; + } + /* enforce this policy only on service port types */ + if (ip_is_any_service_port(service_port)) { + /* Only launchd should be able to register for port destroyed notification on a service port. */ + if (!task_is_initproc(current_task())) { + *payload = MPG_FLAGS_KERN_FAILURE_TASK; + return false; + } + /* notify_port needs to be immovable */ + if (!ip_is_immovable_receive(notify_port)) { + *payload = MPG_FLAGS_KERN_FAILURE_NOTIFY_TYPE; + return false; + } + /* notify_port should be owned by launchd */ + if (!task_is_initproc(notify_port->ip_receiver->is_task)) { + *payload = MPG_FLAGS_KERN_FAILURE_NOTIFY_RECV; + return false; + } + } + return true; +} + +kern_return_t +ipc_allow_register_pd_notification( + ipc_port_t pd_port, + ipc_port_t notify_port) +{ + uint64_t payload; + + /* + * you cannot register for port destroyed notifications + * on an immovable receive right (which includes kobjects), + * or a (special) reply port or any other port that explicitly disallows them. + */ + release_assert(ip_in_a_space(pd_port)); + if (ip_is_immovable_receive(pd_port) || + !ipc_policy(pd_port)->pol_notif_port_destroy) { + mach_port_guard_exception(ip_type(pd_port), MACH_NOTIFY_PORT_DESTROYED, kGUARD_EXC_INVALID_NOTIFICATION_REQ); + return KERN_INVALID_RIGHT; + } + + /* Stronger pd enforcement for service ports */ + if (!ipc_allow_service_port_register_pd(pd_port, notify_port, &payload)) { + mach_port_guard_exception(0, payload, kGUARD_EXC_KERN_FAILURE); + return KERN_INVALID_RIGHT; + } + + /* Allow only one registration of this notification */ + if (ipc_port_has_prdrequest(pd_port)) { + mach_port_guard_exception(0, MPG_FLAGS_KERN_FAILURE_MULTI_NOTI, kGUARD_EXC_KERN_FAILURE); + return KERN_FAILURE; + } + + return KERN_SUCCESS; +} + + +#pragma mark policy array + +__dead2 +static void +no_kobject_no_senders( + ipc_port_t port, + mach_port_mscount_t mscount __unused) +{ + panic("unexpected call to no_senders for object %p, type %d", + port, ip_type(port)); +} + +__dead2 +static void +no_label_free(ipc_object_label_t label) +{ + panic("unexpected call to label_free for object type %d, label %p", + label.io_type, label.iol_pointer); +} + +/* + * Denotes a policy which safe value is the argument to PENDING(), + * but is currently not default and pending validation/prep work. 
+ */ +#define PENDING(value) value + +__security_const_late +struct ipc_object_policy ipc_policy_array[IOT_UNKNOWN] = { + [IOT_PORT_SET] = { + .pol_name = "port set", + .pol_movability = IPC_MOVE_POLICY_NEVER, + .pol_movable_send = false, + }, + [IOT_PORT] = { + .pol_name = "port", + .pol_movability = IPC_MOVE_POLICY_ALWAYS, + .pol_movable_send = true, + .pol_notif_dead_name = true, + .pol_notif_no_senders = true, + .pol_notif_port_destroy = true, + }, + [IOT_SERVICE_PORT] = { + .pol_name = "service port", + .pol_movability = PENDING(IPC_MOVE_POLICY_ONCE_OR_AFTER_PD), + .pol_movable_send = true, + .pol_label_free = ipc_service_port_label_dealloc, + .pol_enforce_reply_semantics = PENDING(true), /* pending on service port defense cleanup */ + .pol_notif_dead_name = true, + .pol_notif_no_senders = true, + .pol_notif_port_destroy = true, + }, + [IOT_WEAK_SERVICE_PORT] = { + .pol_name = "weak service port", + .pol_movability = IPC_MOVE_POLICY_ALWAYS, + .pol_movable_send = true, + .pol_label_free = ipc_service_port_label_dealloc, + .pol_notif_dead_name = true, + .pol_notif_no_senders = true, + .pol_notif_port_destroy = true, + }, + [IOT_CONNECTION_PORT] = { + .pol_name = "connection port", + .pol_movability = IPC_MOVE_POLICY_ONCE, + .pol_label_free = ipc_connection_port_label_dealloc, + .pol_enforce_reply_semantics = true, + .pol_notif_dead_name = true, + .pol_notif_no_senders = true, + .pol_notif_port_destroy = true, + }, + [IOT_CONNECTION_PORT_WITH_PORT_ARRAY] = { + .pol_name = "conn port with ool port array", + .pol_movability = IPC_MOVE_POLICY_NEVER, + .pol_movable_send = true, + .pol_construct_entitlement = MACH_PORT_CONNECTION_PORT_WITH_PORT_ARRAY, + .pol_notif_dead_name = true, + .pol_notif_no_senders = true, + }, + [IOT_EXCEPTION_PORT] = { + .pol_name = "exception port", + .pol_movability = IPC_MOVE_POLICY_NEVER, + .pol_movable_send = true, + .pol_notif_dead_name = true, + .pol_notif_no_senders = true, + }, + [IOT_TIMER_PORT] = { + .pol_name = "timer port", + .pol_movability = IPC_MOVE_POLICY_NEVER, + .pol_movable_send = true, + .pol_label_free = mk_timer_port_label_dealloc, + .pol_notif_dead_name = true, + .pol_notif_no_senders = true, + }, + [IOT_REPLY_PORT] = { + .pol_name = "reply port", + .pol_movability = IPC_MOVE_POLICY_NEVER, + .pol_notif_dead_name = true, + }, + [IOT_SPECIAL_REPLY_PORT] = { + .pol_name = "special reply port", + /* + * General use of a special reply port as a receive right + * can cause type confusion in the importance code. + */ + .pol_movability = IPC_MOVE_POLICY_NEVER, + .pol_notif_dead_name = true, + }, + [IOT_PROVISIONAL_REPLY_PORT] = { + .pol_name = "provisional reply port", + .pol_movability = IPC_MOVE_POLICY_ALWAYS, + .pol_movable_send = true, + .pol_construct_entitlement = MACH_PORT_PROVISIONAL_REPLY_ENTITLEMENT, + .pol_notif_dead_name = true, + .pol_notif_no_senders = true, + .pol_notif_port_destroy = true, + }, + + [__IKOT_FIRST ... IOT_UNKNOWN - 1] = { + .pol_movability = IPC_MOVE_POLICY_NEVER, + .pol_notif_dead_name = true, + }, +}; + +__startup_func +static void +ipc_policy_update_from_tunables(void) +{ + if (!service_port_defense_enabled) { + ipc_policy_array[IOT_SERVICE_PORT].pol_movability = + IPC_MOVE_POLICY_ALWAYS; + } +} +STARTUP(TUNABLES, STARTUP_RANK_LAST, ipc_policy_update_from_tunables); + +/* + * Ensure new port types that requires a construction entitlement + * are marked as immovable. 
+ */ +__startup_func +static void +ipc_policy_construct_entitlement_hardening(void) +{ + /* No need to check kobjects because they are always immovable */ + for (ipc_object_type_t i = 0; i < __IKOT_FIRST; i++) { + /* + * IOT_PROVISIONAL_REPLY_PORT is an exception as it used to be + * movable. For processes opted into enhanced security V2, + * kGUARD_EXC_MOVE_PROVISIONAL_REPLY_PORT will be thrown when a + * provisional reply port is being moved. + */ + if (i == IOT_PROVISIONAL_REPLY_PORT) { + continue; + } + if (ipc_policy_array[i].pol_construct_entitlement) { + assert(ipc_policy_array[i].pol_movability == IPC_MOVE_POLICY_NEVER); + } + } +} +STARTUP(TUNABLES, STARTUP_RANK_LAST, ipc_policy_construct_entitlement_hardening); + +__startup_func +void +ipc_kobject_register_startup(ipc_kobject_ops_t ops) +{ + struct ipc_object_policy *pol = &ipc_policy_array[ops->iko_op_type]; + + if (pol->pol_name) { + panic("trying to register kobject(%d) twice", ops->iko_op_type); + } + + /* + * Always make sure kobject ports have immovable receive rights. + * + * They use the ip_kobject field of the ipc_port structure, + * which is unioned with ip_imp_task. + * + * Thus, general use of a kobject port as a receive right can + * cause type confusion in the importance code. + */ + ipc_release_assert(pol->pol_movability == IPC_MOVE_POLICY_NEVER); + if (ops->iko_op_no_senders) { + pol->pol_notif_no_senders = true; + } + + pol->pol_name = ops->iko_op_name; + pol->pol_kobject_stable = ops->iko_op_stable; + pol->pol_kobject_permanent = ops->iko_op_permanent; + pol->pol_kobject_no_senders = ops->iko_op_no_senders; + pol->pol_label_free = ops->iko_op_label_free; + pol->pol_movable_send = ops->iko_op_movable_send; +} + +__startup_func +static void +ipc_policy_set_defaults(void) +{ + /* + * Check that implicit init to 0 picks the right "values" + * for all properties. + */ + static_assert(IPC_MOVE_POLICY_NEVER == 0); + + for (uint32_t i = 0; i < IOT_UNKNOWN; i++) { + struct ipc_object_policy *pol = &ipc_policy_array[i]; + + if (!pol->pol_kobject_no_senders) { + pol->pol_kobject_no_senders = no_kobject_no_senders; + } + if (!pol->pol_label_free) { + pol->pol_label_free = no_label_free; + } + } +} +STARTUP(MACH_IPC, STARTUP_RANK_LAST, ipc_policy_set_defaults); + +#pragma mark exception port policy + +bool +ipc_is_valid_exception_port( + task_t task, + ipc_port_t port) +{ + if (task == TASK_NULL && is_ux_handler_port(port)) { + return true; + } + + if (ip_is_exception_port(port)) { + return true; + } + + /* + * rdar://77996387 + * Avoid exposing immovable port send rights (kobjects) to `get_exception_ports`, + * but still allow exception ports to be set. + */ + if (!ipc_can_stash_naked_send(port)) { + return false; + } + + if (ip_is_immovable_receive(port)) { + /* + * rdar://153108740 + * Temporarily allow service ports until telemetry is clean. + */ + if (ip_type(port) == IOT_SERVICE_PORT) { + return true; + } + return false; + } + + return true; +} diff --git a/osfmk/ipc/ipc_policy.h b/osfmk/ipc/ipc_policy.h index 6bf37e6c3..369bddbd0 100644 --- a/osfmk/ipc/ipc_policy.h +++ b/osfmk/ipc/ipc_policy.h @@ -30,9 +30,11 @@ #define _IPC_IPC_POLICY_H_ #include +#include #include #include + __BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN #pragma GCC visibility push(hidden) @@ -88,6 +90,64 @@ __BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN */ #define IPC_KMSG_MAX_OOL_PORT_COUNT 16383 +/*! + * @const IPC_POLICY_ENHANCED_V0 + * This policy represents platform binaries, hardened-runtime and + * everything below it. 
+ */ +#define IPC_POLICY_ENHANCED_V0 \ + (IPC_SPACE_POLICY_ENHANCED | IPC_SPACE_POLICY_ENHANCED_V0) + +/*! + * @const IPC_POLICY_ENHANCED_V1 + * This policy represents ES features exposed to 3P in FY2024 release. + */ +#define IPC_POLICY_ENHANCED_V1 \ + (IPC_SPACE_POLICY_ENHANCED | IPC_SPACE_POLICY_ENHANCED_V1) + +/*! + * @const IPC_POLICY_ENHANCED_V2 + * This policy represents ES features exposed to 3P in FY2025 release. + */ +#define IPC_POLICY_ENHANCED_V2 \ + (IPC_SPACE_POLICY_ENHANCED | IPC_SPACE_POLICY_ENHANCED_V2) + +#pragma mark policy tunables + +__options_decl(ipc_control_port_options_t, uint32_t, { + ICP_OPTIONS_NONE = 0x00, + + /* policy for IPC_SPACE_POLICY_{PLATFORM,HARDENED} */ + ICP_OPTIONS_PINNED_1P_SOFT = 0x01, + ICP_OPTIONS_PINNED_1P_HARD = 0x02, + ICP_OPTIONS_IMMOVABLE_1P_SOFT = 0x04, + ICP_OPTIONS_IMMOVABLE_1P_HARD = 0x08, + + /* policy for other processes */ + ICP_OPTIONS_PINNED_3P_SOFT = 0x10, + ICP_OPTIONS_PINNED_3P_HARD = 0x20, + ICP_OPTIONS_IMMOVABLE_3P_SOFT = 0x40, + ICP_OPTIONS_IMMOVABLE_3P_HARD = 0x80, +}); + +/*! + * @brief + * Policy for task and thread control ports. + */ +extern ipc_control_port_options_t ipc_control_port_options; + +/*! + * @brief + * Whether service port defense in depth is enabled. + */ +extern bool service_port_defense_enabled; + +/*! + * @brief + * Whether out-of-line port array descriptor + * restrictions are enabled. + */ +extern bool ool_port_array_enforced; #pragma mark policy utils @@ -110,6 +170,27 @@ __BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN #pragma mark policy options + +/*! + * @brief + * Convert mach_msg policy options (originally derived from the current_task space) back into the space namespace + * + * @param opts the options to convert + * + * @return the options for the space + */ +extern ipc_space_policy_t ipc_convert_msg_options_to_space( + mach_msg_option64_t opts); + +/*! + * @brief + * Computes the IPC policy for a given task. + * + * @param task the current task + */ +extern ipc_space_policy_t ipc_policy_for_task( + task_t task); + /*! * @brief * Derive the current policy flags for the current process. @@ -121,7 +202,7 @@ __BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN * @param task the current task * @param user_flags flags passed in from userspace */ -extern mach_msg_option64_t ipc_current_user_policy( +extern mach_msg_option64_t ipc_current_msg_options( task_t task, mach_msg_option64_t user_flags); @@ -134,7 +215,7 @@ extern mach_msg_option64_t ipc_current_user_policy( * then a mach port guard exception (@c kGUARD_EXC_INVALID_OPTIONS) is raised. * * @param opts the mach_msg() options, after sanitization - * via @c ipc_current_user_policy(). + * via @c ipc_current_msg_options(). * @returns * - MACH_MSG_SUCCESS success, * - MACH_SEND_INVALID_OPTIONS @@ -144,6 +225,28 @@ extern mach_msg_option64_t ipc_current_user_policy( extern mach_msg_return_t ipc_preflight_msg_option64( mach_msg_option64_t opts); +/*! + * @brief + * Determines whether ipc policies should be applied + * + * @discussion + * This checks whether the current policy level matches the policy level + * of this particular feature, but this helper also allows for various + * ways for a task to be opted out of ipc security policies, such as if + * they have the IPC_SPACE_POLICY_SIMULATED, *_TRANSLATED, or *_OPTED_OUT flags. 
+ * + * @param current_policy the policy level for the task/space that we are enforcing on + * @param requested_level the policy level that is required to be opted into this enforcement + * + * @returns + * - true if the current policy level matches the requested policy + * level for this feature, and the task is not opted out + * - false otherwise + */ +extern bool ipc_should_apply_policy( + const ipc_space_policy_t current_policy, + const ipc_space_policy_t requested_level); + #pragma mark legacy trap policies #if IPC_HAS_LEGACY_MACH_MSG_TRAP @@ -156,12 +259,13 @@ extern mach_msg_return_t ipc_preflight_msg_option64( * If using the legacy mach_msg_trap() is disallowed, this will raise * a mach port guard exception (@c kGUARD_EXC_INVALID_OPTIONS). * + * @discussion * Nothing should be locked. * * @param msgid the message ID of the message being sent * with the legacy interface. * @param opts the mach_msg() options passed to the legacy interface, - * after sanitization via @c ipc_current_user_policy(). + * after sanitization via @c ipc_current_msg_options(). * @returns * - MACH_MSG_SUCCESS success, * - KERN_NOT_SUPPORTED for failure cases. @@ -172,8 +276,151 @@ extern mach_msg_return_t ipc_policy_allow_legacy_send_trap( #endif /* IPC_HAS_LEGACY_MACH_MSG_TRAP */ +#pragma mark policy array + +/*! + * @brief + * Decides the policy around receive right movability. + * + * @const IPC_MOVE_POLICY_NEVER + * Such ports are born in the IO_STATE_IN_SPACE_IMMOVABLE state. + * Moving or arming port-destroyed notification on such rights + * is disallowed. + * + * @const IPC_MOVE_POLICY_ONCE + * Such ports are born in the IO_STATE_IN_SPACE state. + * + * Arming port destroyed notification on such ports is allowed, + * and they will move to IO_STATE_IN_SPACE_IMMOVABLE after their first move. + * + * Their state will remain IO_STATE_IN_SPACE_IMMOVABLE after a port-destroyed + * notification fires. + * + * @const IPC_MOVE_POLICY_ONCE_OR_AFTER_PD + * Such ports are born in the IO_STATE_IN_SPACE state. + * + * This behaves like @c IPC_MOVE_POLICY_ONCE, but resets to IO_STATE_IN_SPACE + * after a port-destroyed notification is delivered. + * + * @const IPC_MOVE_POLICY_ALWAYS + * The port is always movable. + */ +__enum_decl(ipc_move_policy_t, uint32_t, { + IPC_MOVE_POLICY_NEVER, + IPC_MOVE_POLICY_ONCE, + IPC_MOVE_POLICY_ONCE_OR_AFTER_PD, + IPC_MOVE_POLICY_ALWAYS, +}); + +/*! 
+ * @brief + * Type for port policies + */ +typedef const struct ipc_object_policy { + const char *pol_name; + + /** see iko_op_stable */ + unsigned long pol_kobject_stable : 1; + /** see iko_op_permanent */ + unsigned long pol_kobject_permanent : 1; + + /** whether the port is movable */ + ipc_move_policy_t pol_movability : 2; + + + /** `mach_port_request_notification` protections */ + + /** + * allow arming a `MACH_NOTIFY_PORT_DESTROYED` notification + * on this receive right + */ + unsigned long pol_notif_port_destroy : 1; + /** + * allow arming a `MACH_NOTIFY_NO_SENDERS` notification + * on this receive right + */ + unsigned long pol_notif_no_senders : 1; + /** + * allow arming a `MACH_NOTIFY_DEAD_NAME/MACH_NOTIFY_SEND_POSSIBLE` + * notification on this receive right + */ + unsigned long pol_notif_dead_name : 1; + + + /** whether the port requires incoming messages to use an IOT_REPLY_PORT properly */ + unsigned long pol_enforce_reply_semantics : 1; + + /** + * whether send rights created on this port are movable, + * immovable ports still allow "movement" via MAKE_SEND(_ONCE) + */ + unsigned long pol_movable_send : 1; + + /** required entitlement for platform restrictions binaries to create this port */ + const char *pol_construct_entitlement; + + /** see iko_op_no_senders */ + void (*pol_kobject_no_senders)( + ipc_port_t port, + mach_port_mscount_t mscount); + + /** destroys the label for this port */ + void (*pol_label_free)( + ipc_object_label_t label); +} *ipc_object_policy_t; + +/*! + * @brief + * Array of policies per port type. + */ +extern struct ipc_object_policy ipc_policy_array[IOT_UNKNOWN]; + +/*! + * @brief + * Returns the policy for a given type/object/port/... + */ +__attribute__((overloadable, always_inline, const)) +static inline ipc_object_policy_t +ipc_policy(ipc_object_type_t otype) +{ + ipc_release_assert(otype < IOT_UNKNOWN); + return &ipc_policy_array[otype]; +} + +__attribute__((overloadable, always_inline, const)) +static inline ipc_object_policy_t +ipc_policy(ipc_object_label_t label) +{ + return ipc_policy(label.io_type); +} + +__attribute__((overloadable, always_inline, const)) +static inline ipc_object_policy_t +ipc_policy(ipc_object_t object) +{ + return ipc_policy(object->io_type); +} + +__attribute__((overloadable, always_inline, const)) +static inline ipc_object_policy_t +ipc_policy(ipc_port_t port) +{ + return ipc_policy(ip_type(port)); +} + + #pragma mark ipc policy telemetry [temporary] +/* The bootarg to disable ALL ipc policy violation telemetry */ +extern bool ipcpv_telemetry_enabled; + +/* Enables reply port/voucher/persona debugging code */ +extern bool enforce_strict_reply; + +extern bool prp_enforcement_enabled; + +extern bool bootstrap_port_telemetry_enabled; + /*! * @brief * Identifier of the type of ipc policy violation in a CA telemetry event @@ -182,69 +429,90 @@ extern mach_msg_return_t ipc_policy_allow_legacy_send_trap( * be extended to report more violations in the future. 
*/ __enum_closed_decl(ipc_policy_violation_id_t, uint8_t, { - /* Rigid Reply Port and Move Reply Port violators Start */ - IPCPV_REPLY_PORT_SEMANTICS, /* normal reply port semantics violator */ - IPCPV_RIGID_REPLY_PORT_HARDENED_RUNTIME, - IPCPV_MOVE_REPLY_PORT_HARDENED_RUNTIME, - IPCPV_MOVE_REPLY_PORT_3P, - IPCPV_RIGID_REPLY_PORT_3P, + IPCPV_VIOLATION_NONE, /* 0, denote no violations */ + + /* Kobject Reply Port and Move Reply Port violators Start */ + IPCPV_REPLY_PORT_SEMANTICS, /* 1, normal reply port semantics violator */ + /* [2-5] were previously used; should be avoided to avoid telemetry confusion */ + __UNUSED2, /* previously used, should be avoided */ + __UNUSED3, /* previously used, should be avoided */ + __UNUSED4, /* previously used, should be avoided */ + __UNUSED5, /* previously used, should be avoided */ /* services opted out of reply port semantics previously should have fixed their violations */ - IPCPV_REPLY_PORT_SEMANTICS_OPTOUT, - /* Rigid Reply Port and Move Reply Port violators End */ + IPCPV_REPLY_PORT_SEMANTICS_OPTOUT, /* 6 */ + /* Kobject Reply Port and Move Reply Port Violators End */ + + /* Service Port Defense Violators Start */ + IPCPV_MOVE_SERVICE_PORT, /* 7 */ + IPCPV_SERVICE_PORT_PD_NOTIFICATION, /* 8, for future telemetry */ + /* Service Port Defense Violators End */ + + /* + * [9-12] were previously used for OOL port array restrictions; + * these should be avoided to avoid telemetry confusion + */ + __UNUSED6, /* 9 previously used, should be avoided */ + __UNUSED7, /* 10 previously used, should be avoided */ + __UNUSED8, /* 11 previously used, should be avoided */ + __UNUSED9, /* 12 previously used, should be avoided */ + /* OOL ports array violators End */ + + /* Bootstrap port reply port semantics violators Start */ + IPCPV_BOOTSTRAP_PORT, /* 13 */ + /* Bootstrap port reply port semantics violators End */ _IPCPV_VIOLATION_COUNT, }); -/*! - * @brief - * Whether the dest and reply port pair violates rigid reply port semantics - * - * @param dest_port message destination port - * @param reply_port message reply port - * @param reply_port_semantics_violation type of the violation - * - * @returns - * - TRUE if there is a violation, - * - FALSE otherwise - */ -extern bool ip_violates_rigid_reply_port_semantics( - ipc_port_t dest_port, - ipc_port_t reply_port, - ipc_policy_violation_id_t *reply_port_semantics_violation); - -/*! - * @brief - * Whether the dest and reply port pair violates reply port semantics - * - * @param dest_port message destination port - * @param reply_port message reply port - * @param reply_port_semantics_violation type of the violation - * - * @returns - * - TRUE if there is a violation, - * - FALSE otherwise - */ -extern bool ip_violates_reply_port_semantics( - ipc_port_t dest_port, - ipc_port_t reply_port, - ipc_policy_violation_id_t *reply_port_semantics_violation); - /*! * @brief * Record ipc policy violations into a buffer for sending to CA at a later time. * + * @discussion * The ipc telemetry lock is not locked. * * @param violation_id type of ipc policy violation - * @param sp_info service port info of the violator + * @param service_port service port involved in violation, if any * @param aux_data additional data to include in the CA event: * violator msgh_id for reply port defense */ extern void ipc_stash_policy_violations_telemetry( ipc_policy_violation_id_t violation_id, - mach_service_port_info_t sp_info, + ipc_port_t service_port, int aux_data); +#if DEBUG || DEVELOPMENT +/*! 
+ * @brief
+ * Helper function to record the total number of ipcpv violations that occurred.
+ * The telemetry count should be 0 in presub testing, as we shouldn't emit any
+ * telemetry for known issues.
+ */
+extern void ipc_inc_telemetry_count(void);
+#endif /* DEBUG || DEVELOPMENT */
+
+/*!
+ * @brief
+ * Check if the ipc space has emitted a certain type of telemetry.
+ *
+ * @param is ipc space in question
+ * @param type ipc policy violation type
+ */
+__attribute__((always_inline))
+static inline bool
+ipc_space_has_telemetry_type(ipc_space_t is, uint8_t type)
+{
+ if (!ipcpv_telemetry_enabled) {
+ return true;
+ }
+
+#if DEBUG || DEVELOPMENT
+ ipc_inc_telemetry_count();
+#endif
+
+ return (os_atomic_or_orig(&is->is_telemetry, type, relaxed) & type) != 0;
+}
+
 #pragma mark MACH_SEND_MSG policies
 
 /*!
@@ -259,7 +527,7 @@ extern void ipc_stash_policy_violations_telemetry(
 * else has been copied in.
 * @param dsc_count the number of inline descriptors for the user message.
 * @param opts the mach_msg() options, after sanitization
- * via @c ipc_current_user_policy().
+ * via @c ipc_current_msg_options().
 *
 * @returns
 * - MACH_MSG_SUCCESS the message passed validation
@@ -285,7 +553,7 @@ extern mach_msg_return_t ipc_validate_kmsg_header_schema_from_user(
 * the descriptors data is copied in "kernel" format.
 * @param send_uctx the IPC kmsg send context for the current send operation.
 * @param opts the mach_msg() options, after sanitization
- * via @c ipc_current_user_policy().
+ * via @c ipc_current_msg_options().
 *
 * @returns
 * - MACH_MSG_SUCCESS the message passed validation
@@ -299,7 +567,6 @@ extern mach_msg_return_t ipc_validate_kmsg_schema_from_user(
 mach_msg_send_uctx_t *send_uctx,
 mach_msg_option64_t opts);
 
-
 /*!
 * @brief
 * Validation function that runs after the rights in the message header have
@@ -311,7 +578,7 @@ extern mach_msg_return_t ipc_validate_kmsg_schema_from_user(
 * @param hdr the copied in message header.
 * @param send_uctx the IPC kmsg send context for the current send operation.
 * @param opts the mach_msg() options, after sanitization
- * via @c ipc_current_user_policy().
+ * via @c ipc_current_msg_options().
 * @returns
 * - MACH_MSG_SUCCESS the message passed validation
 * - MACH_SEND_INVALID_OPTIONS
@@ -321,8 +588,6 @@ extern mach_msg_return_t ipc_validate_kmsg_schema_from_user(
 * - MACH_SEND_MSG_FILTERED
 * the message failed a filtering check.
 * a kGUARD_EXC_MSG_FILTERED guard might be raised.
- * - MACH_SEND_NO_GRANT_DEST
- * attempting to send descriptors to a no_grant port.
 */
 extern mach_msg_return_t ipc_validate_kmsg_header_from_user(
 mach_msg_header_t *hdr,
@@ -330,6 +595,102 @@ extern mach_msg_return_t ipc_validate_kmsg_header_from_user(
 mach_msg_option64_t opts);
 
 
+#pragma mark port type policies and callouts
+
+/*!
+ * @brief
+ * Frees a label value according to the port type callout.
+ *
+ * @param label The label to destroy.
+ */
+static inline void
+ipc_port_label_free(ipc_object_label_t label)
+{
+ if (label.iol_pointer) {
+ ipc_policy(label)->pol_label_free(label);
+ }
+}
+
+#pragma mark send immovability
+
+/*!
+ * @brief + * Returns whether an entry for this port should be marked as immovable send + * + * @param task The task where the new entry is being created/copied out + * @param port The port that the entry is being created/copied out for + * + * @returns + * - true The send right entry should be marked as immovable + * - false The send right entry should not be marked as immovable + */ +extern bool ipc_should_mark_immovable_send( + task_t task, + ipc_port_t port, + ipc_object_label_t label); + +/*! + * @brief + * Determine whether we need to protect this port from being stashed as a naked + * send right in the kernel. We disallow this if the port is supposed to be immovable send + * as this would allow userspace to bypass the immovable send checks and move the send + * right to another process. + * + * @param port The port that we want to protect + * + * @returns + * - true The port is allowed to be stashed + * - false The port is immovable send and should not be stashed + */ +extern bool ipc_can_stash_naked_send( + ipc_port_t port); + + +#pragma mark entry init + +/*! + * @brief + * Initialize the security fields/flags on a new right entry created through the + * new port creation path. This right could be any port or port set right. + * + * @param space The space this entry is being created in + * @param object The *initialized* port/portset object that is getting a new entry + * @param type The type of this entry (send, send-once, receive, deadname, portset) + * @param entry Pointer to the entry that is being initialized + * @param urefs Number of refs this entry will be initialized to + * @param name The name this entry will occupy in the space + */ +extern void ipc_entry_init( + ipc_space_t space, + ipc_object_t object, + mach_port_type_t type, + ipc_entry_t entry, + mach_port_urefs_t urefs, + mach_port_name_t name); + + +#pragma mark receive immovability + +/*! + * @brief + * Returns whether the receive right of a port is allowed to move out + * of an ipc space. + * + * Condition: Space is write-locked and active. Port is not locked. + * + * @param space The ipc space to copyin from + * @param port The port whose receive right is being moved + * + * @returns + * - true The receive right can move out of the space + * - false The receive right can not move out of the space + */ +extern bool ipc_move_receive_allowed( + ipc_space_t space, + ipc_port_t port, + mach_port_name_t name); + + #pragma mark policy guard violations /*! @@ -356,33 +717,67 @@ extern void mach_port_guard_exception( * @param space The space causing the immovable exception. * The guard isn't delivered if it isn't the current space. * @param name The name of the port in @c space violating immovability. - * @param port The port violating immovability (must be - * ip_immovable_send). + * @param port The port violating immovability (must be pol_movable_send). */ __cold extern void mach_port_guard_exception_immovable( ipc_space_t space, mach_port_name_t name, - mach_port_t port); + mach_port_t port, + mach_msg_type_name_t disp, + ipc_entry_t entry); /*! * @brief * Deliver a soft or hard mod_refs guard exception. * * @param space The space causing the pinned exception. - * The guard isn't delivered if it isn't the current space. - * @param name The name of the port in @c space violating pinned rules. - * @param port The port violating pinned rules (must be - * ip_immovable_send). + * The guard isn't delivered if it isn't the current space, + * or the task disables guards on pinned violations. 
+ * @param name The name of the entry in @c space violating pinned rules.
 * @param payload A valid @c MPG_FLAGS_MOD_REFS_PINNED_* value.
 */
 __cold
 extern void mach_port_guard_exception_pinned(
 ipc_space_t space,
 mach_port_name_t name,
- __unused mach_port_t port,
 uint64_t payload);
 
+#pragma mark exception port policy
+
+/*!
+ * @brief
+ * Check whether the port can be a valid exception port for a given task.
+ *
+ * @param task The task registering an exception port.
+ * @param port The port being registered as an exception port.
+ */
+extern bool ipc_is_valid_exception_port(
+ task_t task,
+ ipc_port_t port);
+
+#pragma mark notification policies
+
+/*!
+ * @brief
+ * Check if requesting a port destroyed notification on pd_port is allowed.
+ *
+ * @discussion
+ * pd_port is locked and active.
+ * This function must raise a guard exception along every error path.
+ *
+ * @param pd_port the port to be reaped after destroy
+ * @param notify_port the notify port that pd_port will be sent to after death
+ *
+ * @returns
+ * - KERN_SUCCESS port destroyed notification is allowed to be requested
+ * on this pd_port with this notify_port
+ * - KERN_FAILURE pd_port already has a pd notification
+ * - KERN_INVALID_RIGHT some violation in the security policy
+ */
+extern kern_return_t ipc_allow_register_pd_notification(
+ ipc_port_t pd_port,
+ ipc_port_t notify_port);
 
 #pragma GCC visibility pop
 __ASSUME_PTR_ABI_SINGLE_END
 __END_DECLS
diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c
index 7285192be..773bf4460 100644
--- a/osfmk/ipc/ipc_port.c
+++ b/osfmk/ipc/ipc_port.c
@@ -87,7 +87,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -110,10 +110,8 @@ extern int csproc_hardened_runtime(struct proc* p);
 static TUNABLE(bool, prioritize_launch, "prioritize_launch", true);
 TUNABLE_WRITEABLE(int, ipc_portbt, "ipc_portbt", false);
 
-extern zone_t ipc_kobject_label_zone;
-
 LCK_SPIN_DECLARE_ATTR(ipc_port_multiple_lock_data, &ipc_lck_grp, &ipc_lck_attr);
-ipc_port_timestamp_t ipc_port_timestamp_data;
+static ipc_port_timestamp_t ipc_port_timestamp_data;
 
 KALLOC_ARRAY_TYPE_DEFINE(ipc_port_request_table, struct ipc_port_request, KT_DEFAULT);
 
@@ -168,11 +166,20 @@ ipc_port_lock(ipc_port_t port)
 waitq_lock(&port->ip_waitq);
 }
 
-void
+ipc_object_label_t
+ipc_port_lock_label_get(ipc_port_t port)
+{
+ ip_validate(port);
+ waitq_lock(&port->ip_waitq);
+ return ip_label_get(port);
+}
+
+ipc_object_label_t
 ipc_port_lock_check_aligned(ipc_port_t port)
 {
 zone_id_require_aligned(ZONE_ID_IPC_PORT, port);
 waitq_lock(&port->ip_waitq);
+ return ip_label_get(port);
 }
 
 bool
@@ -204,7 +211,7 @@ ipc_port_reference(ipc_port_t port)
 ipc_port_timestamp_t
 ipc_port_timestamp(void)
 {
- return OSIncrementAtomic(&ipc_port_timestamp_data);
+ return os_atomic_inc_orig(&ipc_port_timestamp_data, relaxed);
 }
 
 
@@ -645,73 +652,20 @@ ipc_port_request_cancel(
 
 /*
- * Routine: ipc_port_nsrequest
- * Purpose:
- * Make a no-senders request, returning the
- * previously registered send-once right.
- * Just cancels the previous request if notify is IP_NULL.
- * Conditions:
- * The port is locked and active. It is unlocked.
- * Consumes a ref for notify (if non-null), and
- * returns previous with a ref (if non-null).
- */ - -void -ipc_port_nsrequest( - ipc_port_t port, - mach_port_mscount_t sync, - ipc_port_t notify, - ipc_port_t *previousp) -{ - ipc_port_t previous; - mach_port_mscount_t mscount; - require_ip_active(port); - - assert(!ip_in_space(port, ipc_space_kernel)); - assert(port->ip_nsrequest != IP_KOBJECT_NSREQUEST_ARMED); - - previous = port->ip_nsrequest; - mscount = port->ip_mscount; - - if ((port->ip_srights == 0) && (sync <= mscount) && - (notify != IP_NULL)) { - port->ip_nsrequest = IP_NULL; - ip_mq_unlock(port); - ipc_notify_no_senders(notify, mscount, /* kobject */ false); - } else { - port->ip_nsrequest = notify; - ip_mq_unlock(port); - } - - *previousp = previous; -} - - -/* - * Routine: ipc_port_clear_receiver + * Routine: ipc_port_prepare_move * Purpose: * Prepares a receive right for transmission/destruction, - * optionally performs mqueue destruction (with port lock held) * * Conditions: * The port is locked and active. - * Returns: - * If should_destroy is TRUE, then the return value indicates - * whether the caller needs to reap kmsg structures that should - * be destroyed (by calling ipc_kmsg_reap_delayed) - * - * If should_destroy is FALSE, this always returns FALSE */ - -boolean_t -ipc_port_clear_receiver( - ipc_port_t port, - boolean_t should_destroy, - waitq_link_list_t *free_l) +__attribute__((always_inline)) +static void +ipc_port_prepare_move( + ipc_port_t port, + ipc_object_label_t *label, + waitq_link_list_t *free_l) { - ipc_mqueue_t mqueue = &port->ip_messages; - boolean_t reap_messages = FALSE; - /* * Pull ourselves out of any sets to which we belong. * We hold the write space lock or the receive entry has @@ -727,71 +681,146 @@ ipc_port_clear_receiver( * Send anyone waiting on the port's queue directly away. * Also clear the mscount, seqno, guard bits */ - if (ip_in_a_space(port)) { + if (io_state_in_space(label->io_state)) { ipc_mqueue_changed(ip_get_receiver(port), &port->ip_waitq); } else { ipc_mqueue_changed(NULL, &port->ip_waitq); } + port->ip_mscount = 0; - mqueue->imq_seqno = 0; + port->ip_messages.imq_seqno = 0; port->ip_context = port->ip_guarded = port->ip_strict_guard = 0; - - if (should_destroy) { - /* - * Mark the port and mqueue invalid, preventing further send/receive - * operations from succeeding. It's important for this to be - * done under the same lock hold as the ipc_mqueue_changed - * call to avoid additional threads blocking on an mqueue - * that's being destroyed. - * - * The port active bit needs to be guarded under mqueue lock for - * turnstiles - */ - - /* port transitions to INACTIVE state */ - io_bits_andnot(ip_to_object(port), IO_BITS_ACTIVE); - port->ip_receiver_name = MACH_PORT_NULL; - port->ip_timestamp = ipc_port_timestamp(); - - reap_messages = ipc_mqueue_destroy_locked(mqueue, free_l); - } else { - /* - * clear the immovable bit so the port can move back to anyone - * listening for the port destroy notification. 
- */ - port->ip_immovable_receive = 0; - - /* port transtions to IN-LIMBO state */ - port->ip_receiver_name = MACH_PORT_NULL; - port->ip_destination = IP_NULL; - } - - return reap_messages; } - -/* - * Routine: ipc_port_init_validate_flags - * Purpose: - * Validates the flag arguments for ipc_port_init - * so that overlapping flags are not accidentally used together - */ - -static kern_return_t -ipc_port_init_validate_flags(ipc_port_init_flags_t flags) +__attribute__((always_inline)) +ipc_port_t +ipc_port_mark_in_space( + ipc_port_t port, + ipc_object_label_t *label, + ipc_space_t space, + mach_port_name_t name, + ipc_object_state_t force_state) { - uint32_t at_most_one_flags = flags & (IPC_PORT_ENFORCE_REPLY_PORT_SEMANTICS | - IPC_PORT_ENFORCE_RIGID_REPLY_PORT_SEMANTICS | - IPC_PORT_INIT_EXCEPTION_PORT | - IPC_PORT_INIT_PROVISIONAL_REPLY); + ipc_move_policy_t pol = ipc_policy(label)->pol_movability; + ipc_port_t dest; - if (at_most_one_flags & (at_most_one_flags - 1)) { - /* at most one of the listed flags can be set */ - return KERN_INVALID_ARGUMENT; + /* + * Unfortunately, IO_STATE_IN_LIMBO has to be allowed because + * of _kernelrpc_mach_port_insert_right_trap(MACH_MSG_TYPE_MOVE_RECEIVE) + * which will copyin a naked receive right and copy it back out, + * without it ever being in a message. + */ + ipc_release_assert(pol != IPC_MOVE_POLICY_NEVER && + (io_state_in_transit(label->io_state) || + label->io_state == IO_STATE_IN_LIMBO)); + + dest = port->ip_destination; + port->ip_receiver_name = name; + port->ip_receiver = space; + + if (io_state_in_space(force_state)) { + label->io_state = force_state; + } else if (pol == IPC_MOVE_POLICY_ONCE) { + label->io_state = IO_STATE_IN_SPACE_IMMOVABLE; + } else if (pol == IPC_MOVE_POLICY_ONCE_OR_AFTER_PD && + label->io_state != IO_STATE_IN_TRANSIT_PD) { + label->io_state = IO_STATE_IN_SPACE_IMMOVABLE; + } else { + label->io_state = IO_STATE_IN_SPACE; } - return KERN_SUCCESS; + + io_label_set_and_put(&port->ip_object, label); + + return dest; } +__attribute__((always_inline)) +void +ipc_port_mark_in_limbo( + ipc_port_t port, + ipc_object_label_t *label, + waitq_link_list_t *free_l) +{ + ipc_release_assert(io_state_in_space(label->io_state)); + + ipc_port_prepare_move(port, label, free_l); + + port->ip_receiver_name = MACH_PORT_NULL; + port->ip_receiver = IS_NULL; + + label->io_state = IO_STATE_IN_LIMBO; + io_label_set_and_put(&port->ip_object, label); +} + +__attribute__((always_inline)) +static void +ipc_port_mark_in_limbo_pd( + ipc_port_t port, + ipc_object_label_t *label, + waitq_link_list_t *free_l) +{ + ipc_release_assert(ipc_policy(label)->pol_movability != IPC_MOVE_POLICY_NEVER && + (io_state_in_space(label->io_state) || + label->io_state == IO_STATE_IN_LIMBO || + label->io_state == IO_STATE_IN_TRANSIT)); + + ipc_port_prepare_move(port, label, free_l); + + port->ip_receiver_name = MACH_PORT_NULL; + port->ip_receiver = IS_NULL; + + label->io_state = IO_STATE_IN_LIMBO_PD; + io_label_set_and_put(&port->ip_object, label); +} + +void +ipc_port_mark_in_transit(ipc_port_t port, ipc_port_t dest) +{ + ipc_object_label_t label = ip_label_get(port); + + ipc_release_assert(io_state_in_limbo(label.io_state)); + + ip_reference(dest); + port->ip_receiver_name = MACH_PORT_NULL; + port->ip_destination = dest; + + if (label.io_state == IO_STATE_IN_LIMBO) { + label.io_state = IO_STATE_IN_TRANSIT; + } else { + assert(label.io_state == IO_STATE_IN_LIMBO_PD); + label.io_state = IO_STATE_IN_TRANSIT_PD; + } + + io_label_set_and_put(&port->ip_object, &label); +} + 
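/*
 * [Editorial aside — illustrative sketch only, not part of the xnu patch.]
 *
 * The helpers above (ipc_port_mark_in_limbo / _in_limbo_pd / _in_transit /
 * ipc_port_mark_in_space) walk a receive right through a small state machine,
 * and the port's ipc_move_policy_t decides which in-space state the right
 * lands in when it is received. The standalone model below mirrors only that
 * decision (the force_state escape hatch, locking, and label handling are
 * omitted); the MODEL_* names are simplified stand-ins, not kernel definitions.
 */
#include <assert.h>

typedef enum {
	MODEL_IN_SPACE,            /* held in a space, still movable */
	MODEL_IN_SPACE_IMMOVABLE,  /* held in a space, further moves disallowed */
	MODEL_IN_LIMBO,            /* copied in, not yet enqueued */
	MODEL_IN_TRANSIT,          /* enqueued in a message */
	MODEL_IN_TRANSIT_PD,       /* moving via a port-destroyed notification */
} model_state_t;

typedef enum {
	MODEL_MOVE_NEVER,
	MODEL_MOVE_ONCE,
	MODEL_MOVE_ONCE_OR_AFTER_PD,
	MODEL_MOVE_ALWAYS,
} model_move_policy_t;

/* Mirrors the state chosen by ipc_port_mark_in_space() on its default path. */
static model_state_t
model_mark_in_space(model_move_policy_t pol, model_state_t cur)
{
	/* immovable-receive ports never reach this path */
	assert(pol != MODEL_MOVE_NEVER);
	assert(cur == MODEL_IN_LIMBO || cur == MODEL_IN_TRANSIT ||
	    cur == MODEL_IN_TRANSIT_PD);

	if (pol == MODEL_MOVE_ONCE) {
		return MODEL_IN_SPACE_IMMOVABLE;
	}
	if (pol == MODEL_MOVE_ONCE_OR_AFTER_PD && cur != MODEL_IN_TRANSIT_PD) {
		return MODEL_IN_SPACE_IMMOVABLE;
	}
	return MODEL_IN_SPACE;
}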
+__attribute__((always_inline)) +static bool +ipc_port_mark_inactive( + ipc_port_t port, + ipc_object_label_t *label, + waitq_link_list_t *free_l) +{ + ipc_release_assert(io_state_active(label->io_state)); + + ipc_port_prepare_move(port, label, free_l); + + port->ip_receiver_name = MACH_PORT_NULL; + port->ip_receiver = IS_NULL; + port->ip_timestamp = ipc_port_timestamp(); + + /* + * It's important for this to be done under the same lock hold + * as the ipc_mqueue_changed call that ipc_port_prepare_move() + * did to avoid additional threads blocking on an mqueue that's + * being destroyed. + */ + label->io_state = IO_STATE_INACTIVE; + label->iol_pointer = NULL; /* the caller will free it */ + io_label_set_and_put(&port->ip_object, label); + + return ipc_mqueue_destroy_locked(&port->ip_messages, free_l); +} /* * Routine: ipc_port_init @@ -800,59 +829,30 @@ ipc_port_init_validate_flags(ipc_port_init_flags_t flags) * * The memory is expected to be zero initialized (allocated with Z_ZERO). */ - -void +static void ipc_port_init( ipc_port_t port, ipc_space_t space, + ipc_object_label_t label, ipc_port_init_flags_t flags, mach_port_name_t name) { - int policy = SYNC_POLICY_FIFO; - task_t task = TASK_NULL; - /* the port has been 0 initialized when called */ - if (flags & IPC_PORT_INIT_FILTER_MESSAGE) { - io_bits_or(ip_to_object(port), IP_BIT_FILTER_MSG); - } - if (flags & IPC_PORT_INIT_LOCKED) { - policy |= SYNC_POLICY_INIT_LOCKED; - } + assert(label.io_type != IOT_PORT_SET && label.io_type < IOT_UNKNOWN); /* must be done first, many ip_* bits live inside the waitq */ - waitq_init(&port->ip_waitq, WQT_PORT, policy); - if (flags & IPC_PORT_INIT_TG_BLOCK_TRACKING) { - port->ip_tg_block_tracking = true; - } - if (flags & IPC_PORT_INIT_SPECIAL_REPLY) { - port->ip_specialreply = true; - } - if ((flags & IPC_PORT_INIT_REPLY) || (flags & IPC_PORT_INIT_SPECIAL_REPLY)) { - task = current_task_early(); + os_ref_init_raw(&port->ip_object.io_references, NULL); + waitq_init(&port->ip_waitq, WQT_PORT, SYNC_POLICY_INIT_LOCKED); - /* Strict enforcement of reply port semantics are disabled for 3p - rdar://97441265. 
*/ - if (task && task_is_hardened_binary(task)) { - port->ip_immovable_receive = true; - ip_mark_reply_port(port); - } else { - ip_mark_provisional_reply_port(port); - } - } - if (flags & IPC_PORT_ENFORCE_REPLY_PORT_SEMANTICS) { - ip_enforce_reply_port_semantics(port); - } - if (flags & IPC_PORT_ENFORCE_RIGID_REPLY_PORT_SEMANTICS) { - ip_enforce_rigid_reply_port_semantics(port); - } - if (flags & IPC_PORT_INIT_PROVISIONAL_REPLY) { - ip_mark_provisional_reply_port(port); + + /* ensure default policies are enforced */ + + if (ipc_policy(label)->pol_movability == IPC_MOVE_POLICY_NEVER) { + label.io_state = IO_STATE_IN_SPACE_IMMOVABLE; } - if (flags & IPC_PORT_INIT_EXCEPTION_PORT) { - ip_mark_exception_port(port); - port->ip_immovable_receive = true; - } + /* initialize the other fields */ port->ip_kernel_qos_override = THREAD_QOS_UNSPECIFIED; port->ip_kernel_iotier_override = THROTTLE_LEVEL_END; @@ -862,11 +862,15 @@ ipc_port_init( ipc_port_init_debug(port, __builtin_frame_address(0)); #endif /* MACH_ASSERT */ - /* port transitions to IN-SPACE state */ + /* ports are born "in-space" */ port->ip_receiver_name = name; port->ip_receiver = space; - if (flags & IPC_PORT_INIT_MAKE_SEND_RIGHT) { + + assert(io_state_in_space(label.io_state)); + io_label_init(&port->ip_object, label); + + if (flags & IP_INIT_MAKE_SEND_RIGHT) { port->ip_srights = 1; port->ip_mscount = 1; } @@ -879,6 +883,7 @@ ipc_port_init( * Conditions: * Nothing locked. If successful, the port is returned * locked. (The caller doesn't have a reference.) + * On failure, port and label will be freed. * Returns: * KERN_SUCCESS The port is allocated. * KERN_INVALID_TASK The space is dead. @@ -889,39 +894,38 @@ ipc_port_init( kern_return_t ipc_port_alloc( ipc_space_t space, + ipc_object_label_t label, ipc_port_init_flags_t flags, - mach_port_name_t *namep, - ipc_port_t *portp) + mach_port_name_t *namep, + ipc_port_t *portp) { - ipc_port_t port; mach_port_name_t name; kern_return_t kr; mach_port_type_t type = MACH_PORT_TYPE_RECEIVE; mach_port_urefs_t urefs = 0; + ipc_entry_t entry; + ipc_port_t port; + ipc_object_t object; - kr = ipc_port_init_validate_flags(flags); - if (kr != KERN_SUCCESS) { - return kr; - } - - if (flags & IPC_PORT_INIT_MAKE_SEND_RIGHT) { + if (flags & IP_INIT_MAKE_SEND_RIGHT) { type |= MACH_PORT_TYPE_SEND; urefs = 1; } - kr = ipc_object_alloc(space, IOT_PORT, type, urefs, - &name, (ipc_object_t *) &port); + + port = ip_alloc(); + object = ip_to_object(port); + kr = ipc_object_alloc_entry(space, object, &name, &entry); if (kr != KERN_SUCCESS) { + ipc_port_label_free(label); + ip_free(port); return kr; } /* space is locked */ - ipc_port_init(port, space, flags | IPC_PORT_INIT_LOCKED, name); + ipc_port_init(port, space, label, flags, name); /* port is locked */ -#if MACH_ASSERT - ipc_port_init_debug(port, __builtin_frame_address(0)); -#endif /* MACH_ASSERT */ + ipc_entry_init(space, object, type, entry, urefs, name); - /* unlock space after init */ is_write_unlock(space); *namep = name; @@ -947,28 +951,41 @@ ipc_port_alloc( kern_return_t ipc_port_alloc_name( ipc_space_t space, + ipc_object_label_t label, ipc_port_init_flags_t flags, mach_port_name_t name, - ipc_port_t *portp) + ipc_port_t *portp) { + kern_return_t kr; + ipc_entry_t entry; mach_port_type_t type = MACH_PORT_TYPE_RECEIVE; mach_port_urefs_t urefs = 0; + ipc_port_t port; + ipc_object_t object; - kern_return_t kr = ipc_port_init_validate_flags(flags); - if (kr != KERN_SUCCESS) { - return kr; - } - - if (flags & IPC_PORT_INIT_MAKE_SEND_RIGHT) { + if (flags & 
IP_INIT_MAKE_SEND_RIGHT) { type |= MACH_PORT_TYPE_SEND; urefs = 1; } - flags |= IPC_PORT_INIT_LOCKED; - return ipc_object_alloc_name(space, IOT_PORT, type, urefs, - name, (ipc_object_t *)portp, ^(ipc_object_t object){ - ipc_port_init(ip_object_to_port(object), space, flags, name); - }); + port = ip_alloc(); + object = ip_to_object(port); + kr = ipc_object_alloc_entry_with_name(space, name, &entry); + if (kr != KERN_SUCCESS) { + ipc_port_label_free(label); + ip_free(port); + return kr; + } + + /* space is locked */ + ipc_port_init(port, space, label, flags, name); + /* port is locked */ + ipc_entry_init(space, object, type, entry, urefs, name); + + is_write_unlock(space); + + *portp = port; + return kr; } /* @@ -1112,10 +1129,6 @@ ipc_port_dnnotify( void ipc_port_destroy(ipc_port_t port) { - bool special_reply = port->ip_specialreply; - bool service_port = port->ip_service_port; - bool reap_msgs; - ipc_port_t pdrequest = IP_NULL; struct task_watchport_elem *twe = NULL; waitq_link_list_t free_l = { }; @@ -1127,7 +1140,20 @@ ipc_port_destroy(ipc_port_t port) natural_t assertcnt = 0; #endif /* IMPORTANCE_INHERITANCE */ - require_ip_active(port); + ipc_object_label_t label = ip_label_get(port); + ipc_release_assert(io_state_active(label.io_state)); + + /* + * permanent ports cannot be destroyed. + * + * It's safe to check this on entry of port destruction, + * since kobjects cannot register to port-destroyed notifications. + */ + if (ipc_policy(label)->pol_kobject_permanent) { + panic("trying to destroy a permanent port %p with kobject type: %d", + port, ip_type(port)); + } + /* port->ip_receiver_name is garbage */ /* port->ip_receiver/port->ip_destination is garbage */ @@ -1138,7 +1164,7 @@ ipc_port_destroy(ipc_port_t port) twe = ipc_port_clear_watchport_elem_internal(port); assert(!port->ip_has_watchport); - if (!special_reply) { + if (!ip_is_special_reply_port_type(label.io_type)) { /* we assume the ref for pdrequest */ pdrequest = port->ip_pdrequest; port->ip_pdrequest = IP_NULL; @@ -1169,27 +1195,11 @@ ipc_port_destroy(ipc_port_t port) } #endif /* IMPORTANCE_INHERITANCE */ - /* - * If no port-destroyed notification is armed, calling - * ipc_port_clear_receiver() will mark the port inactive - * and will wakeup any threads which may be blocked receiving on it. - */ - reap_msgs = ipc_port_clear_receiver(port, pdrequest == IP_NULL, &free_l); - assert(!ip_in_pset(port)); - assert(port->ip_mscount == 0); - /* * Handle port-destroyed notification */ - if (pdrequest != IP_NULL) { - assert(reap_msgs == false); - - if (service_port) { - assert(port->ip_splabel != NULL); - if (ipc_service_port_label_is_special_pdrequest((ipc_service_port_label_t)port->ip_splabel)) { - ipc_service_port_label_set_flag(port->ip_splabel, ISPL_FLAGS_SEND_PD_NOTIFICATION); - } - } + if (pdrequest != IP_NULL && ip_active(pdrequest)) { + ipc_port_mark_in_limbo_pd(port, &label, &free_l); ipc_port_send_turnstile_recompute_push_locked(port); /* port unlocked */ @@ -1197,40 +1207,54 @@ ipc_port_destroy(ipc_port_t port) /* consumes our refs for port and pdrequest */ ipc_notify_port_destroyed(pdrequest, port); } else { - ipc_service_port_label_t splabel = NULL; ipc_notify_nsenders_t nsrequest; + ipc_object_label_t label_unsafe_copy = label; + bool reap_msgs; + + /* + * Mark the port and mqueue invalid, + * preventing further send/receive operations from succeeding. 
+ */ + reap_msgs = ipc_port_mark_inactive(port, &label, &free_l); nsrequest = ipc_notify_no_senders_prepare(port); - if (!ip_is_kolabeled(port)) { - splabel = port->ip_splabel; - port->ip_splabel = NULL; - port->ip_service_port = false; - } - ipc_port_send_turnstile_recompute_push_locked(port); /* port unlocked */ /* unlink the kmsg from special reply port */ - if (special_reply) { + if (ip_is_special_reply_port_type(label.io_type)) { ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE); } - /* Deallocate the service/connection port label */ - if (splabel) { - ipc_service_port_label_dealloc(splabel, service_port); - splabel = NULL; + /* + * If the port-destroyed notification port didn't look active, + * we destroyed the port right away but still need to consume + * a send-once right to it. + * + * This is racy check, which is ok because it is really an + * optimization. See ipc_notify_should_send(). + */ + if (pdrequest) { + ipc_port_release_sonce(pdrequest); } + /* + * We violate the rules around labels here by making a copy + * because we know that ipc_port_mark_inactive() will nil out + * the iol_pointer value to the port and we must free it. + */ + ipc_port_label_free(label_unsafe_copy); + if (reap_msgs) { ipc_kmsg_reap_delayed(); } if (nsrequest.ns_notify) { /* - * ipc_notify_no_senders_prepare will consume - * the reference for kobjects. + * ipc_notify_no_senders_prepare will never set + * ns_notify for a dead kobject port. */ assert(!nsrequest.ns_is_kobject); ip_mq_lock(nsrequest.ns_notify); @@ -1240,8 +1264,6 @@ ipc_port_destroy(ipc_port_t port) /* generate dead-name notifications */ ipc_port_dnnotify(port); - ipc_kobject_destroy(port); - ip_release(port); /* consume caller's ref */ } @@ -1398,19 +1420,16 @@ ipc_port_check_circularity( ipc_port_multiple_unlock(); /* port (== base) is in limbo */ - require_ip_active(port); - assert(ip_in_limbo(port)); + ipc_release_assert(ip_in_limbo(port)); assert(!took_base_ref); base = dest; while (base != IP_NULL) { ipc_port_t next; - /* dest is in transit or in limbo */ - require_ip_active(base); - assert(!ip_in_a_space(base)); - + ipc_release_assert(ip_is_moving(base)); next = ip_get_destination(base); + ip_mq_unlock(base); base = next; } @@ -1429,22 +1448,15 @@ ipc_port_check_circularity( ipc_port_multiple_unlock(); not_circular: - require_ip_active(port); - assert(ip_in_limbo(port)); - /* Clear the watchport boost */ watchport_elem = ipc_port_clear_watchport_elem_internal(port); /* Check if the port is being enqueued as a part of sync bootstrap checkin */ - if (dest->ip_specialreply && dest->ip_sync_bootstrap_checkin) { + if (ip_is_special_reply_port(dest) && dest->ip_sync_bootstrap_checkin) { port->ip_sync_bootstrap_checkin = 1; } - ip_reference(dest); - - /* port transitions to IN-TRANSIT state */ - assert(port->ip_receiver_name == MACH_PORT_NULL); - port->ip_destination = dest; + ipc_port_mark_in_transit(port, dest); /* Setup linkage for source port if it has sync ipc push */ struct turnstile *send_turnstile = TURNSTILE_NULL; @@ -1476,11 +1488,9 @@ not_circular: break; } - /* port is IN-TRANSIT */ - require_ip_active(dest); - assert(ip_in_transit(dest)); - + ipc_release_assert(ip_in_transit(dest)); next = ip_get_destination(dest); + ip_mq_unlock(dest); dest = next; } @@ -1525,7 +1535,7 @@ static struct task_watchport_elem * ipc_port_watchport_elem(ipc_port_t port) { if (port->ip_has_watchport) { - assert(!port->ip_specialreply); + assert(!ip_is_special_reply_port(port)); return port->ip_twe; } return NULL; @@ 
-1545,7 +1555,7 @@ ipc_port_update_watchport_elem(ipc_port_t port, struct task_watchport_elem *we) struct task_watchport_elem *old_we; ipc_port_t pdrequest; - assert(!port->ip_specialreply); + assert(!ip_is_special_reply_port(port)); /* * Note: ip_pdrequest and ip_twe are unioned. @@ -1583,7 +1593,7 @@ ipc_port_update_watchport_elem(ipc_port_t port, struct task_watchport_elem *we) static inline void ipc_special_reply_stash_pid_locked(ipc_port_t port, int pid) { - assert(port->ip_specialreply); + assert(ip_is_special_reply_port(port)); port->ip_pid = pid; } @@ -1598,7 +1608,7 @@ ipc_special_reply_stash_pid_locked(ipc_port_t port, int pid) int ipc_special_reply_get_pid_locked(ipc_port_t port) { - assert(port->ip_specialreply); + assert(ip_is_special_reply_port(port)); return port->ip_pid; } @@ -1626,7 +1636,7 @@ ipc_port_recv_update_inheritor( struct turnstile *inheritor = TURNSTILE_NULL; struct knote *kn; - if (ip_active(port) && port->ip_specialreply) { + if (ip_active(port) && ip_is_special_reply_port(port)) { ip_mq_lock_held(port); switch (port->ip_sync_link_state) { @@ -1696,7 +1706,7 @@ ipc_port_send_update_inheritor( if (!ip_active(port)) { /* this port is no longer active, it should not push anywhere */ - } else if (port->ip_specialreply) { + } else if (ip_is_special_reply_port(port)) { /* Case 1. */ if (port->ip_sync_bootstrap_checkin && prioritize_launch) { inheritor = port->ip_messages.imq_srp_owner_thread; @@ -1858,7 +1868,7 @@ ipc_port_link_special_reply_port( /* Lock the special reply port and establish the linkage */ ip_mq_lock(special_reply_port); - special_reply = special_reply_port->ip_specialreply; + special_reply = ip_is_special_reply_port(special_reply_port); if (sync_bootstrap_checkin && special_reply) { special_reply_port->ip_sync_bootstrap_checkin = 1; @@ -1967,7 +1977,7 @@ ipc_special_reply_port_bits_reset(ipc_port_t special_reply_port) static inline void ipc_special_reply_port_msg_sent_reset(ipc_port_t special_reply_port) { - if (special_reply_port->ip_specialreply == 1) { + if (ip_is_special_reply_port(special_reply_port)) { special_reply_port->ip_srp_msg_sent = 0; } } @@ -1975,7 +1985,7 @@ ipc_special_reply_port_msg_sent_reset(ipc_port_t special_reply_port) inline void ipc_special_reply_port_msg_sent(ipc_port_t special_reply_port) { - if (special_reply_port->ip_specialreply == 1) { + if (ip_is_special_reply_port(special_reply_port)) { special_reply_port->ip_srp_msg_sent = 1; } } @@ -1983,7 +1993,7 @@ ipc_special_reply_port_msg_sent(ipc_port_t special_reply_port) static inline void ipc_special_reply_port_lost_link(ipc_port_t special_reply_port) { - if (special_reply_port->ip_specialreply == 1 && special_reply_port->ip_srp_msg_sent == 0) { + if (ip_is_special_reply_port(special_reply_port) && special_reply_port->ip_srp_msg_sent == 0) { special_reply_port->ip_srp_lost_link = 1; } } @@ -2040,7 +2050,7 @@ ipc_port_adjust_special_reply_port_locked( ip_mq_lock_held(special_reply_port); // ip_sync_link_state is touched - if (!special_reply_port->ip_specialreply) { + if (!ip_is_special_reply_port(special_reply_port)) { // only mach_msg_receive_results_complete() calls this with any port assert(get_turnstile); goto not_special; @@ -2182,7 +2192,7 @@ ipc_port_adjust_special_reply_port( ipc_port_t port, uint8_t flags) { - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { ip_mq_lock(port); ipc_port_adjust_special_reply_port_locked(port, NULL, flags, FALSE); } @@ -2261,7 +2271,7 @@ ipc_port_adjust_port_locked( turnstile_inheritor_t inheritor = 
TURNSTILE_INHERITOR_NULL; ip_mq_lock_held(port); // ip_sync_link_state is touched - assert(!port->ip_specialreply); + assert(!ip_is_special_reply_port(port)); if (kn) { inheritor = filt_machport_stash_port(kn, port, &sync_link_state); @@ -2318,7 +2328,7 @@ bool ipc_port_has_prdrequest( ipc_port_t port) { - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { return false; } if (port->ip_has_watchport) { @@ -2347,7 +2357,7 @@ ipc_port_add_watchport_elem_locked( ip_mq_lock_held(port); /* Watchport boost only works for non-special active ports mapped in an ipc space */ - if (!ip_active(port) || port->ip_specialreply || !ip_in_a_space(port)) { + if (!ip_active(port) || ip_is_special_reply_port(port) || !ip_in_a_space(port)) { ip_mq_unlock(port); return KERN_FAILURE; } @@ -2412,7 +2422,7 @@ ipc_port_replace_watchport_elem_conditional_locked( { ip_mq_lock_held(port); - if (port->ip_specialreply || + if (ip_is_special_reply_port(port) || ipc_port_watchport_elem(port) != old_watchport_elem) { ip_mq_unlock(port); return KERN_FAILURE; @@ -2503,7 +2513,7 @@ ipc_port_get_watchport_inheritor( * Assumes the port is locked. */ pid_t -ipc_port_get_receiver_task_locked(ipc_port_t port, uintptr_t *task) +ipc_port_get_receiver_task_locked(ipc_port_t port, task_t *task) { task_t receiver = TASK_NULL; pid_t pid = -1; @@ -2521,7 +2531,7 @@ ipc_port_get_receiver_task_locked(ipc_port_t port, uintptr_t *task) out: if (task) { - *task = (uintptr_t)receiver; + *task = receiver; } return pid; } @@ -2535,13 +2545,13 @@ out: * Nothing locked. The routine takes port lock. */ pid_t -ipc_port_get_receiver_task(ipc_port_t port, uintptr_t *task) +ipc_port_get_receiver_task(ipc_port_t port, task_t *task) { pid_t pid = -1; if (!port) { if (task) { - *task = (uintptr_t)TASK_NULL; + *task = TASK_NULL; } return pid; } @@ -2682,7 +2692,6 @@ ipc_port_importance_delta_internal( if (ip_in_transit(port)) { dropped = true; - ip_mq_unlock(port); ipc_port_multiple_lock(); /* massive serialization */ @@ -2859,10 +2868,10 @@ ipc_port_make_send_mqueue( ip_mq_lock(port); if (__improbable(!ip_active(port))) { sright = IP_DEAD; - } else if (ip_kotype(port) == IKOT_NONE) { - ipc_port_make_send_any_locked(port); - } else { + } else if (__improbable(ip_is_kobject(port))) { sright = IP_NULL; + } else { + ipc_port_make_send_any_locked(port); } ip_mq_unlock(port); } @@ -2908,10 +2917,10 @@ ipc_port_copy_send_mqueue( ip_mq_lock(port); if (__improbable(!ip_active(port))) { sright = IP_DEAD; - } else if (ip_kotype(port) == IKOT_NONE) { - ipc_port_copy_send_any_locked(port); - } else { + } else if (__improbable(ip_is_kobject(port))) { sright = IP_NULL; + } else { + ipc_port_copy_send_any_locked(port); } ip_mq_unlock(port); } @@ -2960,7 +2969,8 @@ ipc_port_copyout_send( ipc_port_t sright, /* can be invalid */ ipc_space_t space) { - return ipc_port_copyout_send_internal(sright, space, IPC_OBJECT_COPYOUT_FLAGS_NONE); + return ipc_port_copyout_send_internal(sright, space, + IPC_OBJECT_COPYOUT_FLAGS_NONE); } /* Used by pthread kext to copyout thread port only */ @@ -2969,17 +2979,8 @@ ipc_port_copyout_send_pinned( ipc_port_t sright, /* can be invalid */ ipc_space_t space) { - assert(space->is_task != TASK_NULL); - - if (IP_VALID(sright)) { - assert(ip_kotype(sright) == IKOT_THREAD_CONTROL); - } - - if (task_is_pinned(space->is_task)) { - return ipc_port_copyout_send_internal(sright, space, IPC_OBJECT_COPYOUT_FLAGS_PINNED); - } else { - return ipc_port_copyout_send_internal(sright, space, IPC_OBJECT_COPYOUT_FLAGS_NONE); - } + return 
ipc_port_copyout_send_internal(sright, space, + IPC_OBJECT_COPYOUT_FLAGS_PINNED); } /* @@ -3094,7 +3095,7 @@ ipc_port_release_sonce_and_unlock( ip_sorights_dec(port); - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { ipc_port_adjust_special_reply_port_locked(port, NULL, IPC_PORT_ADJUST_RESET_BOOSTRAP_CHECKIN, FALSE); } else { @@ -3147,8 +3148,8 @@ ipc_port_release_receive( } ip_mq_lock(port); - require_ip_active(port); - assert(!ip_in_a_space(port)); + + ipc_release_assert(ip_is_moving(port)); dest = ip_get_destination(port); ipc_port_destroy(port); /* consumes ref, unlocks */ @@ -3163,7 +3164,7 @@ ipc_port_release_receive( * Routine: ipc_port_alloc_special * Purpose: * Allocate a port in a special space. - * The new port is returned with one ref. + * The new port is returned with one ref and locked. * If unsuccessful, IP_NULL is returned. * Conditions: * Nothing locked. @@ -3172,81 +3173,18 @@ ipc_port_release_receive( ipc_port_t ipc_port_alloc_special( ipc_space_t space, + ipc_object_label_t label, ipc_port_init_flags_t flags) { ipc_port_t port; - kern_return_t kr = ipc_port_init_validate_flags(flags); - if (kr != KERN_SUCCESS) { - return IP_NULL; - } - - port = ip_object_to_port(io_alloc(IOT_PORT, Z_WAITOK | Z_ZERO)); - if (port == IP_NULL) { - return IP_NULL; - } - - os_atomic_init(&port->ip_object.io_bits, io_makebits(IOT_PORT)); - os_atomic_init(&port->ip_object.io_references, 1); - - ipc_port_init(port, space, flags, MACH_PORT_SPECIAL_DEFAULT); + port = ip_alloc(); + ipc_port_init(port, space, label, flags, MACH_PORT_SPECIAL_DEFAULT); return port; } /* - * Routine: ipc_port_dealloc_special_and_unlock - * Purpose: - * Deallocate a port in a special space. - * Consumes one ref for the port. - * Conditions: - * Port is locked. - */ - -void -ipc_port_dealloc_special_and_unlock( - ipc_port_t port, - __assert_only ipc_space_t space) -{ - require_ip_active(port); -// assert(port->ip_receiver_name != MACH_PORT_NULL); - assert(ip_in_space(port, space)); - - /* - * We clear ip_receiver_name and ip_receiver to simplify - * the ipc_space_kernel check in ipc_mqueue_send. - */ - - /* port transtions to IN-LIMBO state */ - port->ip_receiver_name = MACH_PORT_NULL; - port->ip_receiver = IS_NULL; - - /* relevant part of ipc_port_clear_receiver */ - port->ip_mscount = 0; - port->ip_messages.imq_seqno = 0; - - ipc_port_destroy(port); -} - -/* - * Routine: ipc_port_dealloc_special - * Purpose: - * Deallocate a port in a special space. - * Consumes one ref for the port. - * Conditions: - * Nothing locked. - */ - -void -ipc_port_dealloc_special( - ipc_port_t port, - ipc_space_t space) -{ - ip_mq_lock(port); - ipc_port_dealloc_special_and_unlock(port, space); -} - -/* - * Routine: ipc_port_finalize + * Routine: ipc_port_free * Purpose: * Called on last reference deallocate to * free any remaining data associated with the @@ -3255,7 +3193,7 @@ ipc_port_dealloc_special( * Nothing locked. */ void -ipc_port_finalize( +ipc_port_free( ipc_port_t port) { ipc_port_request_table_t requests = port->ip_requests; @@ -3266,9 +3204,7 @@ ipc_port_finalize( assert(ipc_port_rcv_turnstile(port) == TURNSTILE_NULL); } - if (ip_active(port)) { - panic("Trying to free an active port. 
port %p", port); - } + ipc_release_assert(!ip_active(port)); if (requests) { port->ip_requests = NULL; @@ -3281,6 +3217,7 @@ ipc_port_finalize( btref_put(port->ip_made_bt); } #endif + ip_free(port); } /* @@ -3311,7 +3248,7 @@ kdp_mqueue_send_find_owner( turnstile = waitq_to_turnstile(waitq); ipc_port_t port = (ipc_port_t)turnstile->ts_proprietor; /* we are blocking on send */ - zone_id_require(ZONE_ID_IPC_PORT, sizeof(struct ipc_port), port); + ip_validate(port); waitinfo->owner = 0; waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(port); @@ -3326,13 +3263,6 @@ kdp_mqueue_send_find_owner( /* now we are the only one accessing the port */ if (ip_active(port)) { - /* - * In kdp context, port must be left unlocked throughout. - * Therefore can't use union field accessor helpers, manually strip PAC - * and compare raw pointer. - */ - void *raw_ptr = ip_get_receiver_ptr_noauth(port); - if (port->ip_tempowner) { ipc_importance_task_t imp_task = ip_get_imp_task(port); if (imp_task != IIT_NULL && imp_task->iit_task != NULL) { @@ -3342,7 +3272,9 @@ kdp_mqueue_send_find_owner( waitinfo->owner = STACKSHOT_WAITOWNER_INTRANSIT; } } else if (ip_in_a_space(port)) { /* no port lock needed */ - if ((ipc_space_t)raw_ptr == ipc_space_kernel) { /* access union field as ip_receiver */ + ipc_space_t space = port->ip_receiver; + + if (space == ipc_space_kernel) { /* access union field as ip_receiver */ /* * The kernel pid is 0, make this * distinguishable from no-owner and @@ -3350,14 +3282,14 @@ kdp_mqueue_send_find_owner( */ waitinfo->owner = STACKSHOT_WAITOWNER_KERNEL; } else { - waitinfo->owner = pid_from_task(((ipc_space_t)raw_ptr)->is_task); + waitinfo->owner = pid_from_task(space->is_task); } - } else if ((ipc_port_t)raw_ptr != IP_NULL) { /* access union field as ip_destination */ + } else if (ip_in_transit(port)) { /* access union field as ip_destination */ waitinfo->wait_type = kThreadWaitPortSendInTransit; - waitinfo->owner = VM_KERNEL_UNSLIDE_OR_PERM((ipc_port_t)raw_ptr); + waitinfo->owner = VM_KERNEL_UNSLIDE_OR_PERM(port->ip_destination); } - if (port->ip_service_port && port->ip_splabel != NULL) { - *isplp = (struct ipc_service_port_label *)port->ip_splabel; + if (ip_is_any_service_port(port)) { + *isplp = ip_label_peek_kdp(port).iol_service; } } } @@ -3390,7 +3322,7 @@ kdp_mqueue_recv_find_owner( if (waitq_type(waitq) == WQT_PORT_SET) { ipc_pset_t set = ips_from_waitq(waitq); - zone_id_require(ZONE_ID_IPC_PORT_SET, sizeof(struct ipc_pset), set); + ips_validate(set); /* Reset wait type to specify waiting on port set receive */ waitinfo->wait_type = kThreadWaitPortSetReceive; @@ -3402,7 +3334,7 @@ kdp_mqueue_recv_find_owner( } else if (waitq_type(waitq) == WQT_PORT) { ipc_port_t port = ip_from_waitq(waitq); - zone_id_require(ZONE_ID_IPC_PORT, sizeof(struct ipc_port), port); + ip_validate(port); waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(port); if (ip_mq_lock_held_kdp(port)) { @@ -3412,36 +3344,20 @@ kdp_mqueue_recv_find_owner( if (ip_active(port)) { if (ip_in_a_space(port)) { /* no port lock needed */ - waitinfo->owner = ip_get_receiver_name(port); + waitinfo->owner = port->ip_receiver_name; } else { waitinfo->owner = STACKSHOT_WAITOWNER_INTRANSIT; } - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { waitinfo->wait_flags |= STACKSHOT_WAITINFO_FLAGS_SPECIALREPLY; } - if (port->ip_splabel != NULL) { - *isplp = (struct ipc_service_port_label *)port->ip_splabel; + if (ip_is_any_service_port(port)) { + *isplp = ip_label_peek_kdp(port).iol_service; } } } } -void -ipc_port_set_label( - 
ipc_port_t port, - ipc_label_t label) -{ - ipc_kobject_label_t labelp; - - assert(!ip_is_kolabeled(port)); - - labelp = zalloc_flags(ipc_kobject_label_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); - labelp->ikol_label = label; - - port->ip_kolabel = labelp; - io_bits_or(ip_to_object(port), IO_BITS_KOLABEL); -} - kern_return_t ipc_port_reset_thread_attr( ipc_port_t port) @@ -3480,7 +3396,7 @@ ipc_port_update_qos_n_iotier( return KERN_TERMINATED; } - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { ip_mq_unlock(port); return KERN_INVALID_ARGUMENT; } @@ -3498,78 +3414,6 @@ ipc_port_update_qos_n_iotier( return KERN_SUCCESS; } -/* Returns true if a rigid reply port violation should be enforced (by killing the process) */ -static bool -__ip_rigid_reply_port_semantics_violation( - ipc_port_t reply_port, - ipc_policy_violation_id_t *reply_port_semantics_violation) -{ - bool hardened_runtime = csproc_hardened_runtime(current_proc()); - - if (proc_is_simulated(current_proc()) -#if CONFIG_ROSETTA - || task_is_translated(current_task()) -#endif -#if XNU_TARGET_OS_OSX - || task_opted_out_mach_hardening(current_task()) -#endif /* XNU_TARGET_OS_OSX */ - ) { - return FALSE; - } - - if (task_is_hardened_binary(current_task())) { - return TRUE; - } - if (!ip_is_provisional_reply_port(reply_port)) { - /* record telemetry for when third party fails to use a provisional reply port */ - *reply_port_semantics_violation = hardened_runtime ? IPCPV_RIGID_REPLY_PORT_HARDENED_RUNTIME : IPCPV_RIGID_REPLY_PORT_3P; - } - return FALSE; -} - -bool -ip_violates_reply_port_semantics( - ipc_port_t dest_port, - ipc_port_t reply_port, - ipc_policy_violation_id_t *reply_port_semantics_violation) -{ - /* - * dest_port lock must be held to avoid race condition - * when accessing ip_splabel rdar://139066947 - */ - ip_mq_lock_held(dest_port); - - if (ip_require_reply_port_semantics(dest_port) - && !ip_is_reply_port(reply_port) - && !ip_is_provisional_reply_port(reply_port)) { - *reply_port_semantics_violation = IPCPV_REPLY_PORT_SEMANTICS; - return TRUE; - } - - if (dest_port->ip_service_port) { - ipc_service_port_label_t label = dest_port->ip_splabel; - if (!ipc_service_port_label_is_bootstrap_port(label) - && !ip_is_reply_port(reply_port) - && !ip_is_provisional_reply_port(reply_port)) { - *reply_port_semantics_violation = IPCPV_REPLY_PORT_SEMANTICS_OPTOUT; - } - } - - return FALSE; -} - -/* Rigid reply port semantics don't allow for provisional reply ports */ -bool -ip_violates_rigid_reply_port_semantics( - ipc_port_t dest_port, - ipc_port_t reply_port, - ipc_policy_violation_id_t *violates_3p) -{ - return ip_require_rigid_reply_port_semantics(dest_port) - && !ip_is_reply_port(reply_port) - && __ip_rigid_reply_port_semantics_violation(reply_port, violates_3p); -} - #if MACH_ASSERT #include diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index 60796668f..d653b81a4 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -72,15 +72,14 @@ #ifndef _IPC_IPC_PORT_H_ #define _IPC_IPC_PORT_H_ -#ifdef MACH_KERNEL_PRIVATE - -#include - #include #include #include #include +#ifdef MACH_KERNEL_PRIVATE +#include + #include #include #include @@ -88,17 +87,17 @@ #include #include #include -#include - -#include #include +#endif /* MACH_KERNEL_PRIVATE */ -extern int proc_isinitproc(struct proc *p); +__BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN +#if MACH_KERNEL_PRIVATE +#pragma GCC visibility push(hidden) struct task_watchport_elem; -typedef unsigned int ipc_port_timestamp_t; +typedef unsigned long ipc_port_timestamp_t; 
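/*
 * [Editorial aside — illustrative sketch only, not part of the xnu patch.]
 *
 * ipc_port_timestamp_t (now unsigned long) is a global monotonically
 * increasing counter: ipc_port_timestamp() bumps it with os_atomic_inc_orig()
 * and the previous value is stamped into ip_timestamp when a port goes
 * inactive, so a port's death can be ordered against other observations
 * (e.g. mach_port_names). A standalone C11 equivalent of that counter,
 * assuming nothing beyond <stdatomic.h>:
 */
#include <stdatomic.h>

typedef unsigned long model_port_timestamp_t;

static _Atomic model_port_timestamp_t model_timestamp_data;

/* Returns the pre-increment value, like os_atomic_inc_orig(..., relaxed). */
static model_port_timestamp_t
model_port_timestamp(void)
{
	return atomic_fetch_add_explicit(&model_timestamp_data, 1,
	    memory_order_relaxed);
}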
struct ipc_port_request { union { @@ -134,19 +133,17 @@ struct ipc_port { , ip_tempowner:1 /* dont give donations to current receiver */ , ip_guarded:1 /* port guarded (use context value as guard) */ , ip_strict_guard:1 /* Strict guarding; Prevents user manipulation of context values directly */ - , ip_specialreply:1 /* port is a special reply port */ , ip_sync_link_state:3 /* link the port to destination port/ Workloop */ , ip_sync_bootstrap_checkin:1 /* port part of sync bootstrap checkin, push on thread doing the checkin */ - , ip_immovable_receive:1 /* the receive right cannot be moved out of a space, until it is destroyed */ - , ip_immovable_send:1 /* No send(once) rights to this port can be moved out of a space, never unset */ - , ip_no_grant:1 /* Port wont accept complex messages containing (ool) port descriptors */ , ip_tg_block_tracking:1 /* Track blocking relationship between thread groups during sync IPC */ - , ip_pinned:1 /* Can't deallocate the last send right from a space while the bit is set */ - , ip_service_port:1 /* port is a service port */ , ip_has_watchport:1 /* port has an exec watchport */ , ip_kernel_iotier_override:2 /* kernel iotier override */ , ip_kernel_qos_override:3 /* kernel qos override */ - , ip_reply_port_semantics:3 /* reply port defense in depth type */ + /* development bits only */ + , ip_srp_lost_link:1 /* special reply port turnstile link chain broken */ + , ip_srp_msg_sent:1 /* special reply port msg sent */ + , ip_bootstrap:1 /* whether it is a bootstrap port */ + , __ip_unused:6 /* reserve of bits */ ); struct waitq ip_waitq; }; @@ -163,9 +160,12 @@ struct ipc_port { ipc_port_timestamp_t ip_timestamp; }; - /* update ipc_kobject_upgrade_locked() if this union is changed */ union { uintptr_t ip_kobject; /* manually PAC-ed, see ipc_kobject_get_raw() */ + struct ipc_port *XNU_PTRAUTH_SIGNED_PTR("ipc_port.ip_nsrequest") ip_nsrequest; + }; + + union { ipc_importance_task_t ip_imp_task; /* use accessor ip_get_imp_task() */ struct ipc_port *ip_sync_inheritor_port; struct knote *ip_sync_inheritor_knote; @@ -173,9 +173,9 @@ struct ipc_port { }; /* - * ip_specialreply: ip_pid - * ip_has_watchport: ip_twe - * else: ip_pdrequest + * IOT_SPECIAL_REPLY: ip_pid + * ip_has_watchport: ip_twe + * else: ip_pdrequest */ union { int ip_pid; @@ -183,29 +183,15 @@ struct ipc_port { struct ipc_port *XNU_PTRAUTH_SIGNED_PTR("ipc_port.ip_pdrequest") ip_pdrequest; }; -#define IP_KOBJECT_NSREQUEST_ARMED ((struct ipc_port *)1) - struct ipc_port *ip_nsrequest; ipc_port_request_table_t XNU_PTRAUTH_SIGNED_PTR("ipc_port.ip_request") ip_requests; struct turnstile *ip_send_turnstile; mach_vm_address_t ip_context; -#if DEVELOPMENT || DEBUG - natural_t ip_srp_lost_link : 1; /* special reply port turnstile link chain broken */ - natural_t ip_srp_msg_sent : 1; /* special reply port msg sent */ - natural_t ip_impcount : 30; /* number of importance donations in nested queue */ -#else - natural_t ip_impcount; /* number of importance donations in nested queue */ -#endif + natural_t ip_impcount; /* number of importance donations in nested queue */ mach_port_mscount_t ip_mscount; mach_port_rights_t ip_srights; mach_port_rights_t ip_sorights; - union { - ipc_kobject_label_t XNU_PTRAUTH_SIGNED_PTR("ipc_port.kolabel") ip_kolabel; - /* Union of service and connection ports' message filtering metadata */ - void * XNU_PTRAUTH_SIGNED_PTR("ipc_port.ip_splabel") ip_splabel; - }; - #if MACH_ASSERT unsigned long ip_timetrack; /* give an idea of "when" created */ uint32_t ip_made_bt; /* stack trace 
(btref_t) */ @@ -246,6 +232,7 @@ extern void __ipc_right_delta_overflow_panic( if (os_add_overflow(__port->field, delta, &__port->field)) { \ __ipc_right_delta_overflow_panic(__port, &__port->field, delta); \ } \ + __port->field; \ }) #define ip_srights_inc(port) ip_right_delta(port, ip_srights, 1) @@ -297,9 +284,9 @@ extern void __ipc_right_delta_overflow_panic( #define ip_object_to_port(io) __container_of(io, struct ipc_port, ip_object) #define ip_to_object(port) (&(port)->ip_object) -#define ip_active(port) io_active(ip_to_object(port)) #define ip_mq_lock_held(port) io_lock_held(ip_to_object(port)) #define ip_mq_lock(port) ipc_port_lock(port) +#define ip_mq_lock_label_get(port) ipc_port_lock_label_get(port) #define ip_mq_lock_check_aligned(port) ipc_port_lock_check_aligned(port) #define ip_mq_lock_try(port) ipc_port_lock_try(port) #define ip_mq_lock_held_kdp(port) io_lock_held_kdp(ip_to_object(port)) @@ -309,75 +296,63 @@ extern void __ipc_right_delta_overflow_panic( #define ip_release(port) io_release(ip_to_object(port)) #define ip_release_safe(port) io_release_safe(ip_to_object(port)) #define ip_release_live(port) io_release_live(ip_to_object(port)) +#define ip_alloc() zalloc_id(ZONE_ID_IPC_PORT, Z_WAITOK_ZERO_NOFAIL) +#define ip_free(port) zfree_id(ZONE_ID_IPC_PORT, port) #define ip_validate(port) \ zone_id_require(ZONE_ID_IPC_PORT, sizeof(struct ipc_port), port) #define ip_from_waitq(wq) __container_of(wq, struct ipc_port, ip_waitq) #define ip_from_mq(mq) __container_of(mq, struct ipc_port, ip_messages) -#define ip_kotype(port) io_kotype(ip_to_object(port)) +#define ip_type(port) io_type(ip_to_object(port)) #define ip_is_kobject(port) io_is_kobject(ip_to_object(port)) -#define ip_is_control(port) \ - (ip_kotype(port) == IKOT_TASK_CONTROL || ip_kotype(port) == IKOT_THREAD_CONTROL) -#define ip_is_kolabeled(port) io_is_kolabeled(ip_to_object(port)) +#define ip_label_get(port, ...) io_label_get(ip_to_object(port), ## __VA_ARGS__) +#define ip_label_put(port, label) io_label_put(ip_to_object(port), label) +#define ip_label_peek_kdp(port, ...) io_label_peek_kdp(ip_to_object(port), ## __VA_ARGS__) #define ip_full_kernel(port) imq_full_kernel(&(port)->ip_messages) #define ip_full(port) imq_full(&(port)->ip_messages) -/* - * IPC Port flags for reply port defense in depth - * - * PORT_MARK_REPLY_PORT - * Port is marked as a reply port. - * - * PORT_ENFORCE_REPLY_PORT_SEMANTICS - * When talking to this port, the local port of mach msg needs to be a reply port. - * Currrently service ports and libxpc connection ports adopt this. - * - * PORT_MARK_PROVISIONAL_REPLY_PORT - * Port is marked as a provisional reply port with an eventual goal of making it port as PORT_MARK_REPLY_PORT. - * - * PORT_ENFORCE_RIGID_REPLY_PORT_SEMANTICS - * Same as PORT_ENFORCE_REPLY_PORT_SEMANTICS above, but does not allow for provisional reply ports. - * Once provisional reply ports no longer exist, this will be removed as "rigidness/strictness" will be irrelavant. - * - * PORT_MARK_EXCEPTION_PORT - * Port is used as a mach exception port. 
It has an immovable receive right and can be used in the - * hardened exception flow provided by `task_register_hardened_exception_handler` - */ -#define PORT_MARK_REPLY_PORT 0x01 -#define PORT_ENFORCE_REPLY_PORT_SEMANTICS 0x02 -#define PORT_MARK_PROVISIONAL_REPLY_PORT 0x03 -#define PORT_ENFORCE_RIGID_REPLY_PORT_SEMANTICS 0x04 -#define PORT_MARK_EXCEPTION_PORT 0x05 +#define ip_active(port) io_state_active(ip_to_object(port)->io_state) +#define ip_in_a_space(port) io_state_in_space(ip_to_object(port)->io_state) +#define ip_in_limbo(port) io_state_in_limbo(ip_to_object(port)->io_state) +#define ip_in_transit(port) io_state_in_transit(ip_to_object(port)->io_state) +#define ip_is_moving(port) io_state_is_moving(ip_to_object(port)->io_state) +#define ip_is_immovable_receive(port) (ip_to_object(port)->io_state == IO_STATE_IN_SPACE_IMMOVABLE) -/* ip_reply_port_semantics can be read without a lock as it is never unset after port creation. */ -#define ip_is_reply_port(port) (((port)->ip_reply_port_semantics) == PORT_MARK_REPLY_PORT) -#define ip_require_reply_port_semantics(port) (((port)->ip_reply_port_semantics) == PORT_ENFORCE_REPLY_PORT_SEMANTICS) -#define ip_is_provisional_reply_port(port) (((port)->ip_reply_port_semantics) == PORT_MARK_PROVISIONAL_REPLY_PORT) -#define ip_require_rigid_reply_port_semantics(port) (((port)->ip_reply_port_semantics) == PORT_ENFORCE_RIGID_REPLY_PORT_SEMANTICS) -#define ip_is_exception_port(port) (((port)->ip_reply_port_semantics) == PORT_MARK_EXCEPTION_PORT) +#define ip_is_exception_port(port) (ip_type(port) == IOT_EXCEPTION_PORT) +#define ip_is_provisional_reply_port(port) (ip_type(port) == IOT_PROVISIONAL_REPLY_PORT) +#define ip_is_special_reply_port_type(type) ((type) == IOT_SPECIAL_REPLY_PORT) +#define ip_is_special_reply_port(port) (ip_is_special_reply_port_type(ip_type(port))) +#define ip_is_any_service_port(port) ip_is_any_service_port_type(ip_type(port)) +#define ip_is_port_array_allowed(port) (ip_type(port) == IOT_CONNECTION_PORT_WITH_PORT_ARRAY) +#define ip_is_timer(port) (ip_type(port) == IOT_TIMER_PORT) +#define ip_is_bootstrap_port(port) ((port)->ip_bootstrap) -#define ip_mark_reply_port(port) ((port)->ip_reply_port_semantics = PORT_MARK_REPLY_PORT) -#define ip_enforce_reply_port_semantics(port) ((port)->ip_reply_port_semantics = PORT_ENFORCE_REPLY_PORT_SEMANTICS) -#define ip_mark_provisional_reply_port(port) ((port)->ip_reply_port_semantics = PORT_MARK_PROVISIONAL_REPLY_PORT) -#define ip_enforce_rigid_reply_port_semantics(port) ((port)->ip_reply_port_semantics = PORT_ENFORCE_RIGID_REPLY_PORT_SEMANTICS) -#define ip_mark_exception_port(port) ((port)->ip_reply_port_semantics = PORT_MARK_EXCEPTION_PORT) +static inline bool +ip_is_any_service_port_type(ipc_object_type_t type) +{ + return type == IOT_SERVICE_PORT || type == IOT_WEAK_SERVICE_PORT; +} +static inline bool +ip_is_reply_port_type(ipc_object_type_t type) +{ + return type == IOT_REPLY_PORT || type == IOT_SPECIAL_REPLY_PORT; +} +static inline bool +ip_is_reply_port(ipc_port_t port) +{ + ipc_object_type_t type = ip_type(port); + return ip_is_reply_port_type(type); +} -#define ip_is_immovable_send(port) ((port)->ip_immovable_send) -#define ip_is_pinned(port) ((port)->ip_pinned) +#define ip_is_tt_control_port(port) (ip_is_tt_control_port_type(ip_type(port))) -#define ip_is_libxpc_connection_port(port) \ - (!ip_is_kolabeled(port) && (!(port)->ip_service_port) && ((port)->ip_splabel != NULL)) - -/* Bits reserved in IO_BITS_PORT_INFO are defined here */ - -/* - * This flag indicates that the port has 
opted into message filtering based - * on a policy defined in the Sandbox. - */ -#define IP_BIT_FILTER_MSG 0x00001000 -#define ip_enforce_msg_filtering(port) ((io_bits(ip_to_object(port)) & IP_BIT_FILTER_MSG) != 0) +static inline bool +ip_is_tt_control_port_type(ipc_object_type_t type) +{ + return type == IKOT_TASK_CONTROL || type == IKOT_THREAD_CONTROL; +} /* * Use the low bits in the ipr_soright to specify the request type @@ -423,8 +398,6 @@ extern boolean_t ipc_port_destination_chain_lock( * mach_port_names with port death. */ -extern ipc_port_timestamp_t ipc_port_timestamp_data; - /* Retrieve a port timestamp value */ extern ipc_port_timestamp_t ipc_port_timestamp(void); @@ -447,36 +420,11 @@ require_ip_active(ipc_port_t port) } } -/* - * A receive right (port) can be in ONE of the following four states: - * - * 1) INACTIVE: Dead - * 2) IN-SPACE: In a space - * 3) IN-TRANSIT: Enqueued in a message - * 4) IN-LIMBO - * - * If the port is active and ip_receiver_name != MACH_PORT_NULL, we can safely - * deference the union as ip_receiver, which points to the space that holds - * receive right (but doesn't hold a ref for it). - * - * If the port is active and ip_receiver_name == MACH_PORT_NULL, we can safely - * deference the union as ip_destination. The port is either IN-LIMBO (ip_destination == IP_NULL) - * or ip_destination points to the destination port and holds a ref for it. - * - * If the port is not active, we can safely deference the union as ip_timestamp, - * which contains a timestamp taken when the port was destroyed. - * - * If the port is in a space, ip_receiver_name denotes the port name its receive - * right occupies in the receiving space. The only exception, as an optimization trick, - * is task's self port (itk_self), whose ip_receiver_name actually denotes the name - * of mach_task_self() in owning task's space (a send right, with receive right in ipc_space_kernel). - */ - -static inline bool -ip_in_a_space(ipc_port_t port) +static inline void +ip_mq_unlock_label_put(ipc_port_t port, ipc_object_label_t *label) { - /* IN-SPACE */ - return ip_active(port) && port->ip_receiver_name != MACH_PORT_NULL; + ip_label_put(port, label); + io_unlock_nocheck(ip_to_object(port)); } static inline bool @@ -494,22 +442,6 @@ ip_in_space_noauth(ipc_port_t port, void* space) return raw_ptr == space; } -static inline bool -ip_in_transit(ipc_port_t port) -{ - /* IN-TRANSIT */ - ip_mq_lock_held(port); /* port must be locked, otherwise PAC could fail */ - return ip_active(port) && !ip_in_a_space(port) && port->ip_destination != IP_NULL; -} - -static inline bool -ip_in_limbo(ipc_port_t port) -{ - /* IN-LIMBO */ - ip_mq_lock_held(port); /* port must be locked, otherwise PAC could fail */ - return ip_active(port) && !ip_in_a_space(port) && port->ip_destination == IP_NULL; -} - static inline ipc_space_t ip_get_receiver(ipc_port_t port) { @@ -517,13 +449,6 @@ ip_get_receiver(ipc_port_t port) return ip_in_a_space(port) ? port->ip_receiver : NULL; } -static inline void* -ip_get_receiver_ptr_noauth(ipc_port_t port) -{ - void *__single raw_ptr = ptrauth_strip(*(void **)&port->ip_receiver, ptrauth_key_process_independent_data); - return raw_ptr; -} - static inline mach_port_name_t ip_get_receiver_name(ipc_port_t port) { @@ -534,7 +459,7 @@ static inline ipc_port_t ip_get_destination(ipc_port_t port) { ip_mq_lock_held(port); /* port must be locked, otherwise PAC could fail */ - return ip_active(port) && !ip_in_a_space(port) ? port->ip_destination : IP_NULL; + return ip_is_moving(port) ? 
port->ip_destination : IP_NULL; } static inline ipc_port_timestamp_t @@ -547,7 +472,7 @@ ip_get_death_time(ipc_port_t port) static inline ipc_importance_task_t ip_get_imp_task(ipc_port_t port) { - return (!ip_is_kobject(port) && !port->ip_specialreply && port->ip_tempowner) ? port->ip_imp_task : IIT_NULL; + return (!ip_is_kobject(port) && !ip_is_special_reply_port(port) && port->ip_tempowner) ? port->ip_imp_task : IIT_NULL; } extern kern_return_t ipc_port_translate_send( @@ -597,46 +522,87 @@ extern bool ipc_port_request_sparm( mach_msg_option64_t option, mach_msg_priority_t priority); -/* Make a no-senders request */ -extern void ipc_port_nsrequest( - ipc_port_t port, - mach_port_mscount_t sync, - ipc_port_t notify, - ipc_port_t *previousp); -/* Prepare a receive right for transmission/destruction */ -extern boolean_t ipc_port_clear_receiver( +/*! + * @abstract + * Marks a port as in-space. + * + * @discussion + * The port must be in transit. + * @c port must be locked. + * + * @param port the port to mark as in-space. + * @param label the current object label for @c port. + * @param space the space the port is being received into. + * @param name the name the port will have in @c space. + * @param force_state the state to force. Must be one of: + * - IO_STATE_INACTIVE (means default policy), + * - IO_STATE_IN_SPACE, + * - IO_STATE_IN_SPACE_IMMOVABLE. + * @returns the current port destination or IP_NULL. + */ +extern ipc_port_t ipc_port_mark_in_space( ipc_port_t port, - boolean_t should_destroy, + ipc_object_label_t *label, + ipc_space_t space, + mach_port_name_t name, + ipc_object_state_t force_state); + +#define IPC_PORT_SET_IN_SPACE_DEFAULT 0 +#define IPC_PORT_SET_IN_SPACE_PSEUDO_RECEIVE 1 +#define IPC_PORT_SET_IN_SPACE_FORCE_IMMOVABLE 2 + + +/*! + * @abstract + * Marks a port as in-limbo, and prepares it for a move. + * + * @discussion + * The port must be in space. + * @c port must be locked. + * + * @param port the port to mark as in-limbo. + * @param label the current object label for @c port. + * @param free_l a list to accumulate waitq linkages to free + * by calling waitq_link_free_list(WQT_PORT_SET, &free_l) + * on it. + */ +extern void ipc_port_mark_in_limbo( + ipc_port_t port, + ipc_object_label_t *label, waitq_link_list_t *free_l); -__options_decl(ipc_port_init_flags_t, uint32_t, { - IPC_PORT_INIT_NONE = 0x00000000, - IPC_PORT_INIT_MAKE_SEND_RIGHT = 0x00000001, - IPC_PORT_INIT_MESSAGE_QUEUE = 0x00000002, - IPC_PORT_INIT_SPECIAL_REPLY = 0x00000004, - IPC_PORT_INIT_FILTER_MESSAGE = 0x00000008, - IPC_PORT_INIT_TG_BLOCK_TRACKING = 0x00000010, - IPC_PORT_INIT_LOCKED = 0x00000020, - IPC_PORT_INIT_REPLY = 0x00000040, - IPC_PORT_ENFORCE_REPLY_PORT_SEMANTICS = 0x00000080, - IPC_PORT_INIT_PROVISIONAL_REPLY = 0x00000100, - IPC_PORT_ENFORCE_RIGID_REPLY_PORT_SEMANTICS = 0x00000400, - IPC_PORT_INIT_EXCEPTION_PORT = 0x00000800, -}); -/* Initialize a newly-allocated port */ -extern void ipc_port_init( +/*! + * @abstract + * Marks a port as in-transit. + * + * @discussion + * The port must be in limbo. + * @c port must be locked. + * + * A reference on @c dest is taken. + * + * @param port the port to mark as in-transit. + * @param dest the port @c port is enqueued onto.
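+ *
+ * A minimal sketch of how the send-side transitions are intended to
+ * compose, assuming a hypothetical caller that owns @c dest and a
+ * caller-provided waitq_link_list_t free_l (illustrative only, not a
+ * verbatim call site; the receive side uses ipc_port_mark_in_space()):
+ *
+ *     label = ip_mq_lock_label_get(port);
+ *     ipc_port_mark_in_limbo(port, &label, &free_l);  IN-SPACE  -> IN-LIMBO
+ *     ipc_port_mark_in_transit(port, dest);           IN-LIMBO  -> IN-TRANSIT
+ *     ip_mq_unlock(port);
+ *     waitq_link_free_list(WQT_PORT_SET, &free_l);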
+ */ +extern void ipc_port_mark_in_transit( ipc_port_t port, - ipc_space_t space, - ipc_port_init_flags_t flags, - mach_port_name_t name); + ipc_port_t dest); + +__options_decl(ipc_port_init_flags_t, uint32_t, { + IP_INIT_NONE = 0x00000000, + IP_INIT_MAKE_SEND_RIGHT = 0x00000001, +}); extern void ipc_port_lock( ipc_port_t port); -extern void ipc_port_lock_check_aligned( - ipc_port_t port); +extern ipc_object_label_t ipc_port_lock_label_get( + ipc_port_t port) __result_use_check; + +extern ipc_object_label_t ipc_port_lock_check_aligned( + ipc_port_t port) __result_use_check; extern bool ipc_port_lock_try( ipc_port_t port); @@ -644,21 +610,26 @@ extern bool ipc_port_lock_try( /* Allocate a port */ extern kern_return_t ipc_port_alloc( ipc_space_t space, + ipc_object_label_t label, ipc_port_init_flags_t flags, - mach_port_name_t *namep, - ipc_port_t *portp); + mach_port_name_t *namep, + ipc_port_t *portp); /* Allocate a port, with a specific name */ extern kern_return_t ipc_port_alloc_name( ipc_space_t space, + ipc_object_label_t label, ipc_port_init_flags_t flags, mach_port_name_t name, - ipc_port_t *portp); + ipc_port_t *portp); -/* Attach a label to the port */ -extern void ipc_port_set_label( - ipc_port_t port, - ipc_label_t label); +extern ipc_object_label_t ipc_kobject_label_alloc( + ipc_object_type_t otype, + ipc_label_t label_tag, + ipc_port_t alt_port); + +extern void ipc_kobject_label_free( + ipc_object_label_t label); /* Generate dead name notifications */ extern void ipc_port_dnnotify( @@ -670,14 +641,14 @@ extern void ipc_port_spnotify( /* Destroy a port */ extern void ipc_port_destroy( - ipc_port_t port); + ipc_port_t port); /* Check if queueing "port" in a message for "dest" would create a circular * group of ports and messages */ extern boolean_t ipc_port_check_circularity( - ipc_port_t port, - ipc_port_t dest); + ipc_port_t port, + ipc_port_t dest); #if IMPORTANCE_INHERITANCE @@ -867,7 +838,7 @@ extern ipc_port_t ipc_port_make_send_any( * * @returns * - IP_NULL if @c port was not a message queue port - * (IKOT_NONE), or @c port was IP_NULL. + * (!ip_is_kobject()), or @c port was IP_NULL. * - IP_DEAD if @c port was dead. * - @c port if @c port was valid, in which case * a naked send right was made. @@ -929,7 +900,7 @@ extern ipc_port_t ipc_port_copy_send_any( * * @returns * - IP_NULL if @c port was not a message queue port - * (IKOT_NONE), or @c port was IP_NULL. + * (!ip_is_kobject()), or @c port was IP_NULL. * - IP_DEAD if @c port was dead. * - @c port if @c port was valid, in which case * a naked send right was made. @@ -957,20 +928,74 @@ extern void ipc_port_release_send_and_unlock( extern kern_return_t mach_port_deallocate_kernel( ipc_space_t space, mach_port_name_t name, - natural_t kotype); + ipc_object_type_t otype); +/* Make a naked send-once right from a locked and active receive right */ +extern ipc_port_t ipc_port_make_sonce_locked( + ipc_port_t port); + +/* Make a naked send-once right from a receive right */ +extern ipc_port_t ipc_port_make_sonce( + ipc_port_t port); + +/* Release a naked send-once right */ +extern void ipc_port_release_sonce( + ipc_port_t port); + +/* Release a naked send-once right */ +extern void ipc_port_release_sonce_and_unlock( + ipc_port_t port); + +/* Release a naked (in limbo or in transit) receive right */ +extern void ipc_port_release_receive( + ipc_port_t port); + +/* Finalize the destruction of a port and free it */ +extern void ipc_port_free( + ipc_port_t port); + +/* Get receiver task and its pid (if any) for port. 
Assumes port is locked. */ +extern pid_t ipc_port_get_receiver_task_locked( + ipc_port_t port, + task_t *task); + +/* Get receiver task and its pid (if any) for port. */ +extern pid_t ipc_port_get_receiver_task( + ipc_port_t port, + task_t *task); + +/* Allocate a port in a special space */ +extern ipc_port_t ipc_port_alloc_special( + ipc_space_t space, + ipc_object_label_t label, + ipc_port_init_flags_t flags); + +extern void ipc_port_recv_update_inheritor( + ipc_port_t port, + struct turnstile *turnstile, + turnstile_update_flags_t flags); + +extern void ipc_port_send_update_inheritor( + ipc_port_t port, + struct turnstile *turnstile, + turnstile_update_flags_t flags); + +extern int ipc_special_reply_get_pid_locked( + ipc_port_t port); + +#pragma GCC visibility pop #endif /* MACH_KERNEL_PRIVATE */ #if KERNEL_PRIVATE /* Release a (valid) naked send right */ extern void ipc_port_release_send( - ipc_port_t port); + ipc_port_t port); extern void ipc_port_reference( - ipc_port_t port); + ipc_port_t port); extern void ipc_port_release( - ipc_port_t port); + ipc_port_t port); struct thread_attr_for_ipc_propagation { union { @@ -983,78 +1008,15 @@ struct thread_attr_for_ipc_propagation { uint64_t tafip_reserved; }; -extern kern_return_t -ipc_port_propagate_thread_attr( - ipc_port_t port, +extern kern_return_t ipc_port_propagate_thread_attr( + ipc_port_t port, struct thread_attr_for_ipc_propagation attr); -extern kern_return_t -ipc_port_reset_thread_attr(ipc_port_t port); +extern kern_return_t ipc_port_reset_thread_attr( + ipc_port_t port); + #endif /* KERNEL_PRIVATE */ -#ifdef MACH_KERNEL_PRIVATE - -/* Make a naked send-once right from a locked and active receive right */ -extern ipc_port_t ipc_port_make_sonce_locked( - ipc_port_t port); - -/* Make a naked send-once right from a receive right */ -extern ipc_port_t ipc_port_make_sonce( - ipc_port_t port); - -/* Release a naked send-once right */ -extern void ipc_port_release_sonce( - ipc_port_t port); - -/* Release a naked send-once right */ -extern void ipc_port_release_sonce_and_unlock( - ipc_port_t port); - -/* Release a naked (in limbo or in transit) receive right */ -extern void ipc_port_release_receive( - ipc_port_t port); - -/* Finalize the destruction of a port before it gets freed */ -extern void ipc_port_finalize( - ipc_port_t port); - -/* Get receiver task and its pid (if any) for port. Assumes port is locked. */ -extern pid_t ipc_port_get_receiver_task_locked(ipc_port_t port, uintptr_t *task); - -/* Get receiver task and its pid (if any) for port. 
*/ -extern pid_t ipc_port_get_receiver_task(ipc_port_t port, uintptr_t *task); - -/* Allocate a port in a special space */ -extern ipc_port_t ipc_port_alloc_special( - ipc_space_t space, - ipc_port_init_flags_t flags); - -/* Deallocate a port in a special space */ -extern void ipc_port_dealloc_special_and_unlock( - ipc_port_t port, - ipc_space_t space); - -/* Deallocate a port in a special space */ -extern void ipc_port_dealloc_special( - ipc_port_t port, - ipc_space_t space); - -extern void ipc_port_recv_update_inheritor(ipc_port_t port, - struct turnstile *turnstile, - turnstile_update_flags_t flags); - -extern void ipc_port_send_update_inheritor(ipc_port_t port, - struct turnstile *turnstile, - turnstile_update_flags_t flags); - -extern int -ipc_special_reply_get_pid_locked(ipc_port_t port); - -#define ipc_port_alloc_reply() \ - ipc_port_alloc_special(ipc_space_reply, IPC_PORT_INIT_MESSAGE_QUEUE | IPC_PORT_INIT_SPECIAL_REPLY) -#define ipc_port_dealloc_reply(port) \ - ipc_port_dealloc_special((port), ipc_space_reply) - -#endif /* MACH_KERNEL_PRIVATE */ +__ASSUME_PTR_ABI_SINGLE_END __END_DECLS #endif /* _IPC_IPC_PORT_H_ */ diff --git a/osfmk/ipc/ipc_pset.c b/osfmk/ipc/ipc_pset.c index 4f2ec1f05..0836eff76 100644 --- a/osfmk/ipc/ipc_pset.c +++ b/osfmk/ipc/ipc_pset.c @@ -84,11 +84,19 @@ /* processor_set stole ipc_pset_init */ static void -ipc_port_set_init(ipc_pset_t pset, mach_port_name_t name, int policy) +ipc_port_set_init(ipc_pset_t pset, mach_port_name_t name) { - waitq_init(&pset->ips_wqset, WQT_PORT_SET, policy | SYNC_POLICY_FIFO); + waitq_init(&pset->ips_wqset, WQT_PORT_SET, + SYNC_POLICY_INIT_LOCKED | SYNC_POLICY_FIFO); klist_init(&pset->ips_klist); pset->ips_wqset.wqset_index = MACH_PORT_INDEX(name); + + /* init io_bits */ + os_ref_init_raw(&pset->ips_object.io_references, NULL); + io_label_init(&pset->ips_object, (ipc_object_label_t){ + .io_type = IOT_PORT_SET, + .io_state = IO_STATE_IN_SPACE_IMMOVABLE, + }); } void @@ -117,20 +125,26 @@ ipc_pset_alloc( mach_port_name_t *namep, ipc_pset_t *psetp) { - ipc_pset_t pset; mach_port_name_t name; kern_return_t kr; + ipc_entry_t entry; + mach_port_type_t type = MACH_PORT_TYPE_PORT_SET; + mach_port_urefs_t urefs = 0; + ipc_pset_t pset; + ipc_object_t object; - kr = ipc_object_alloc(space, IOT_PORT_SET, - MACH_PORT_TYPE_PORT_SET, 0, - &name, (ipc_object_t *) &pset); + pset = ips_alloc(); + object = ips_to_object(pset); + kr = ipc_object_alloc_entry(space, object, &name, &entry); if (kr != KERN_SUCCESS) { + ips_free(pset); return kr; } /* space is locked */ - ipc_port_set_init(pset, name, SYNC_POLICY_INIT_LOCKED); + ipc_port_set_init(pset, name); /* port set is locked */ + ipc_entry_init(space, object, type, entry, urefs, name); is_write_unlock(space); @@ -156,14 +170,31 @@ kern_return_t ipc_pset_alloc_name( ipc_space_t space, mach_port_name_t name, - ipc_pset_t *psetp) + ipc_pset_t *psetp) { - return ipc_object_alloc_name(space, IOT_PORT_SET, - MACH_PORT_TYPE_PORT_SET, 0, - name, (ipc_object_t *)psetp, ^(ipc_object_t object){ - ipc_port_set_init(ips_object_to_pset(object), name, - SYNC_POLICY_INIT_LOCKED); - }); + kern_return_t kr; + ipc_entry_t entry; + mach_port_type_t type = MACH_PORT_TYPE_PORT_SET; + mach_port_urefs_t urefs = 0; + ipc_pset_t pset; + ipc_object_t object; + + pset = ips_alloc(); + object = ips_to_object(pset); + kr = ipc_object_alloc_entry_with_name(space, name, &entry); + if (kr != KERN_SUCCESS) { + ips_free(pset); + return kr; + } + /* space is locked */ + + ipc_port_set_init(pset, name); + /* port set is locked */ + 
ipc_entry_init(space, object, type, entry, urefs, name); + + is_write_unlock(space); + *psetp = pset; + return KERN_SUCCESS; } @@ -171,7 +202,7 @@ ipc_pset_alloc_name( * Routine: ipc_pset_alloc_special * Purpose: * Allocate a port set in a special space. - * The new port set is returned with one ref. + * The new port set is returned with one ref and locked. * If unsuccessful, IPS_NULL is returned. * Conditions: * Nothing locked. @@ -180,21 +211,13 @@ ipc_pset_t ipc_pset_alloc_special( __assert_only ipc_space_t space) { - ipc_pset_t pset; + ipc_pset_t pset = ips_alloc(); assert(space != IS_NULL); assert(!is_active(space)); - pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO)); - if (pset == IPS_NULL) { - return IPS_NULL; - } - - os_atomic_init(&pset->ips_object.io_bits, io_makebits(IOT_PORT_SET)); - os_atomic_init(&pset->ips_object.io_references, 1); - - ipc_port_set_init(pset, MACH_PORT_SPECIAL_DEFAULT, 0); - + ipc_port_set_init(pset, MACH_PORT_SPECIAL_DEFAULT); + /* port set is locked */ return pset; } @@ -215,10 +238,11 @@ ipc_pset_destroy( ipc_pset_t pset) { waitq_link_list_t free_l = { }; + ipc_object_label_t label = io_label_get(&pset->ips_object, IOT_PORT_SET); - assert(ips_active(pset)); - - io_bits_andnot(ips_to_object(pset), IO_BITS_ACTIVE); + ipc_release_assert(io_state_in_space(label.io_state)); + label.io_state = IO_STATE_INACTIVE; + io_label_set_and_put(&pset->ips_object, &label); /* * Set all waiters on the portset running to @@ -240,7 +264,7 @@ ipc_pset_destroy( } /* - * Routine: ipc_pset_finalize + * Routine: ipc_pset_free * Purpose: * Called on last reference deallocate to * free any remaining data associated with the pset. @@ -248,10 +272,11 @@ ipc_pset_destroy( * Nothing locked. */ void -ipc_pset_finalize( +ipc_pset_free( ipc_pset_t pset) { waitq_deinit(&pset->ips_wqset); + ips_free(pset); } @@ -450,7 +475,7 @@ filt_machport_turnstile_prepare_lazily( } struct turnstile *ts = filt_ipc_kqueue_turnstile(kn); - if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) || + if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && ip_is_special_reply_port(port)) || (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) { struct turnstile *kn_ts = turnstile_alloc(); struct turnstile *ts_store = TURNSTILE_NULL; @@ -469,7 +494,7 @@ filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port) struct turnstile *ts = TURNSTILE_NULL; ip_mq_lock(port); - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { /* * If the reply has been sent to the special reply port already, * then the special reply port may already be reused to do something @@ -599,12 +624,12 @@ filt_wlattach_sync_ipc(struct knote *kn) if (bits & MACH_PORT_TYPE_RECEIVE) { port = ip_object_to_port(object); - if (port->ip_specialreply || ip_is_kobject(port)) { + if (ip_is_special_reply_port(port) || ip_is_kobject(port)) { error = ENOENT; } } else if (bits & MACH_PORT_TYPE_SEND_ONCE) { port = ip_object_to_port(object); - if (!port->ip_specialreply) { + if (!ip_is_special_reply_port(port)) { error = ENOENT; } } else { @@ -625,7 +650,7 @@ filt_wlattach_sync_ipc(struct knote *kn) return ENOENT; } - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { ipc_port_adjust_special_reply_port_locked(port, kn, IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE); } else { @@ -661,7 +686,7 @@ filt_portattach(struct knote *kn, ipc_port_t port) struct turnstile *send_turnstile = TURNSTILE_NULL; int result = 0; - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { /* * 
Registering for kevents on special reply ports * isn't supported for two reasons: @@ -1005,7 +1030,7 @@ filt_machportprocess( option64 = kn->kn_sfflags & (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY); - option64 = ipc_current_user_policy(current_task(), option64); + option64 = ipc_current_msg_options(current_task(), option64); if (option64 & MACH_RCV_MSG) { msg_addr = (mach_vm_address_t) kn->kn_ext[0]; @@ -1109,7 +1134,7 @@ filt_machportprocess( kqueue_process_preadopt_thread_group(self, kq, tg); } #endif - if (otype == IOT_PORT) { + if (io_is_any_port_type(otype)) { ipc_port_t port = ip_object_to_port(object); struct kqueue *kqwl = knote_get_kq(kn); if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) { diff --git a/osfmk/ipc/ipc_pset.h b/osfmk/ipc/ipc_pset.h index 44781f03a..b1a62cf48 100644 --- a/osfmk/ipc/ipc_pset.h +++ b/osfmk/ipc/ipc_pset.h @@ -92,13 +92,15 @@ ips_from_waitq(waitq_t wq) return __container_of(wq.wqs_set, struct ipc_pset, ips_wqset); } -#define ips_active(pset) io_active(ips_to_object(pset)) +#define ips_active(pset) io_state_active(ips_to_object(pset)->io_state) #define ips_mq_lock_held(pset) io_lock_held(ips_to_object(pset)) #define ips_mq_lock(pset) ipc_pset_lock(pset) #define ips_mq_lock_held_kdp(pset) io_lock_held_kdp(ips_to_object(pset)) #define ips_mq_unlock(pset) io_unlock(ips_to_object(pset)) #define ips_reference(pset) io_reference(ips_to_object(pset)) #define ips_release(pset) io_release(ips_to_object(pset)) +#define ips_alloc() zalloc_id(ZONE_ID_IPC_PORT_SET, Z_WAITOK_ZERO_NOFAIL) +#define ips_free(pset) zfree_id(ZONE_ID_IPC_PORT_SET, pset) #define ips_validate(pset) \ zone_id_require(ZONE_ID_IPC_PORT_SET, sizeof(struct ipc_pset), pset) @@ -126,8 +128,8 @@ extern void ipc_pset_destroy( ipc_space_t space, ipc_pset_t pset); -/* Finalize the destruction of a pset before it gets freed */ -extern void ipc_pset_finalize( +/* Finalize the destruction of a pset and free it */ +extern void ipc_pset_free( ipc_pset_t pset); #if MACH_KERNEL_PRIVATE diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index 305f88b8a..153f004ec 100644 --- a/osfmk/ipc/ipc_right.c +++ b/osfmk/ipc/ipc_right.c @@ -77,9 +77,10 @@ #include #include #include +#include #include -#include -#include + +#include #include #include #include @@ -91,15 +92,6 @@ #include #include #include -#include - -extern struct proc *current_proc(void); -extern int csproc_hardened_runtime(struct proc* p); - -extern void * XNU_PTRAUTH_SIGNED_PTR("initproc") initproc; - -TUNABLE(bool, service_port_defense_enabled, "-service_port_defense_enabled", false); -static TUNABLE(bool, reply_port_semantics, "reply_port_semantics", true); /* * Routine: ipc_right_lookup_read @@ -460,7 +452,8 @@ ipc_right_request_alloc( mach_port_name_t name, ipc_port_request_opts_t options, ipc_port_t notify, - ipc_port_t *previousp) + mach_msg_id_t id, + ipc_port_t *previousp) { ipc_port_t previous = IP_NULL; ipc_entry_t entry; @@ -493,9 +486,25 @@ ipc_right_request_alloc( port = entry->ie_port; assert(port != IP_NULL); - if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (!ipc_right_check(space, port, name, entry, IPC_COPYIN_REASON_NONE)) { /* port is locked and active */ + /* + * if this port doesn't allow send_possible / + * deadname notifications, fail We only need to + * protect send_once rights since they do not + * coalesce and allow for repeated notification + * requests/allocations + */ + if 
((entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE) && + !ipc_policy(port)->pol_notif_dead_name) { + ip_mq_unlock(port); + is_write_unlock(space); + *previousp = IP_NULL; + mach_port_guard_exception(ip_type(port), id, kGUARD_EXC_INVALID_NOTIFICATION_REQ); + return KERN_DENIED; + } + /* * No matter what, we need to cancel any * previous request. @@ -634,10 +643,10 @@ ipc_right_inuse( /* * Routine: ipc_right_check * Purpose: - * Check if the port has died. If it has, - * and IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE is not - * passed and it is not a send once right then - * clean up the entry and return TRUE. + * Check if the port has died. If it has, + * and the location is not IPC_COPYIN_KMSG_DESTINATION + * and it is not a send once right then + * clean up the entry and return TRUE. * Conditions: * The space is write-locked; the port is not locked. * If returns FALSE, the port is also locked. @@ -647,13 +656,13 @@ ipc_right_inuse( * had died (returns TRUE). */ -boolean_t +bool ipc_right_check( ipc_space_t space, ipc_port_t port, mach_port_name_t name, ipc_entry_t entry, - ipc_object_copyin_flags_t flags) + ipc_copyin_op_t copyin_reason) { ipc_entry_bits_t bits; @@ -662,10 +671,10 @@ ipc_right_check( ip_mq_lock(port); if (ip_active(port) || - ((flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_DEAD_SEND_ONCE) && + ((copyin_reason == IPC_COPYIN_KMSG_DESTINATION) && entry->ie_request == IE_REQ_NONE && (entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE))) { - return FALSE; + return false; } /* this was either a pure send right or a send-once right */ @@ -693,7 +702,8 @@ ipc_right_check( } /* convert entry to dead name */ - bits = (bits & ~IE_BITS_TYPE_MASK) | MACH_PORT_TYPE_DEAD_NAME; + bits &= ~(IE_BITS_TYPE_MASK | IE_BITS_PINNED_SEND); + bits |= MACH_PORT_TYPE_DEAD_NAME; /* * If there was a notification request outstanding on this @@ -727,7 +737,7 @@ ipc_right_check( ipc_entry_modified(space, name, entry); - return TRUE; + return true; } /* @@ -746,14 +756,10 @@ ipc_right_terminate( mach_port_name_t name, ipc_entry_t entry) { - mach_port_type_t type; - ipc_port_t port = IP_NULL; - ipc_pset_t pset = IPS_NULL; + mach_port_type_t type = IE_BITS_TYPE(entry->ie_bits); assert(!is_active(space)); - type = IE_BITS_TYPE(entry->ie_bits); - /* * Hollow the entry under the port lock, * in order to avoid dangling pointers. @@ -763,46 +769,37 @@ ipc_right_terminate( * of termination (receive rights first, the rest second). 
*/ - if (type & MACH_PORT_TYPE_PORT_SET) { - pset = entry->ie_pset; - ips_mq_lock(pset); - } else if (type != MACH_PORT_TYPE_DEAD_NAME) { - port = entry->ie_port; - ip_mq_lock(port); - } - entry->ie_object = IPC_OBJECT_NULL; - entry->ie_bits &= (IE_BITS_GEN_MASK | IE_BITS_ROLL_MASK); - switch (type) { case MACH_PORT_TYPE_DEAD_NAME: assert(entry->ie_request == IE_REQ_NONE); + assert(entry->ie_object == IPC_OBJECT_NULL); break; - case MACH_PORT_TYPE_PORT_SET: - assert(entry->ie_request == IE_REQ_NONE); - assert(ips_active(pset)); + case MACH_PORT_TYPE_PORT_SET: { + ipc_pset_t pset = entry->ie_pset; + assert(entry->ie_request == IE_REQ_NONE); + ips_mq_lock(pset); ipc_pset_destroy(space, pset); /* consumes ref, unlocks */ break; + } case MACH_PORT_TYPE_SEND: case MACH_PORT_TYPE_RECEIVE: case MACH_PORT_TYPE_SEND_RECEIVE: case MACH_PORT_TYPE_SEND_ONCE: { + ipc_port_t port = entry->ie_port; ipc_port_t request = IP_NULL; ipc_notify_nsenders_t nsrequest = { }; - if (!ip_active(port)) { - ip_mq_unlock(port); - ip_release(port); - break; + ip_mq_lock(port); + + if (ip_active(port)) { + request = ipc_right_request_cancel(port, name, entry); } - request = ipc_right_request_cancel(port, name, entry); - if (type & MACH_PORT_TYPE_SEND) { - ip_srights_dec(port); - if (port->ip_srights == 0) { + if (ip_srights_dec(port) == 0) { nsrequest = ipc_notify_no_senders_prepare(port); } } @@ -813,9 +810,6 @@ ipc_right_terminate( ipc_port_destroy(port); /* clears receiver, consumes our ref, unlocks */ } else if (type & MACH_PORT_TYPE_SEND_ONCE) { - assert(port->ip_sorights > 0); - port->ip_reply_context = 0; - ipc_notify_send_once_and_unlock(port); /* consumes our ref */ } else { /* port could be dead, in-transit, or in a foreign space */ @@ -825,40 +819,20 @@ ipc_right_terminate( ip_release(port); } - /* - * For both no-senders and port-deleted notifications, - * look at whether the destination is still active. - * If it isn't, just swallow the send-once right. - * - * This is a racy check, but this ok because we can only - * fail to notice that the port is now inactive, which - * only causes us to fail at an optimizaiton. - * - * The purpose here is to avoid sending messages - * to receive rights that used to be in this space, - * which we can't fail to observe. - */ - if (nsrequest.ns_notify != IP_NULL) { - if (ip_active(nsrequest.ns_notify)) { - ipc_notify_no_senders_emit(nsrequest); - } else { - ipc_notify_no_senders_consume(nsrequest); - } - } + ipc_notify_no_senders_emit(nsrequest); if (request != IP_NULL) { - if (ip_active(request)) { - ipc_notify_port_deleted(request, name); - } else { - ipc_port_release_sonce(request); - } + ipc_notify_port_deleted(request, name); } break; } default: - panic("ipc_right_terminate: strange type - 0x%x", type); + ipc_unreachable("ipc_right_terminate: strange type"); } + + entry->ie_object = IPC_OBJECT_NULL; + entry->ie_bits &= (IE_BITS_GEN_MASK | IE_BITS_ROLL_MASK); } /* @@ -869,18 +843,16 @@ ipc_right_terminate( * The space is write-locked (returns unlocked). * The space must be active. * Returns: - * KERN_SUCCESS The entry was destroyed. - * KERN_INVALID_CAPABILITY The port is pinned. - * KERN_INVALID_RIGHT Port guard violation. + * KERN_SUCCESS The entry was destroyed. + * KERN_INVALID_CAPABILITY The port is pinned. + * KERN_INVALID_RIGHT Port guard violation. 
*/ kern_return_t ipc_right_destroy( ipc_space_t space, mach_port_name_t name, - ipc_entry_t entry, - boolean_t check_guard, - uint64_t guard) + ipc_entry_t entry) { ipc_entry_bits_t bits; mach_port_type_t type; @@ -922,49 +894,41 @@ ipc_right_destroy( case MACH_PORT_TYPE_SEND_ONCE: { ipc_port_t port = entry->ie_port; ipc_notify_nsenders_t nsrequest = { }; - ipc_port_t request; - - assert(port != IP_NULL); - - if (type == MACH_PORT_TYPE_SEND) { - if (ip_is_pinned(port)) { - assert(ip_active(port)); - is_write_unlock(space); - mach_port_guard_exception_pinned(space, name, port, MPG_FLAGS_MOD_REFS_PINNED_DESTROY); - return KERN_INVALID_CAPABILITY; - } - ipc_hash_delete(space, ip_to_object(port), name, entry); - } + ipc_port_t request = IP_NULL; ip_mq_lock(port); - if (!ip_active(port)) { - assert((type & MACH_PORT_TYPE_RECEIVE) == 0); - entry->ie_request = IE_REQ_NONE; - assert(!ip_is_pinned(port)); - ipc_entry_dealloc(space, ip_to_object(port), name, entry); - ip_mq_unlock(port); - is_write_unlock(space); - ip_release(port); - break; - } - - /* For receive rights, check for guarding */ - if ((type & MACH_PORT_TYPE_RECEIVE) && - (check_guard) && (port->ip_guarded) && - (guard != port->ip_context)) { - /* Guard Violation */ + if ((type & MACH_PORT_TYPE_RECEIVE) && port->ip_guarded && + port->ip_context != 0) { uint64_t portguard = port->ip_context; + ip_mq_unlock(port); is_write_unlock(space); - /* Raise mach port guard exception */ - mach_port_guard_exception(name, portguard, kGUARD_EXC_DESTROY); + mach_port_guard_exception(name, portguard, + kGUARD_EXC_DESTROY); return KERN_INVALID_RIGHT; } + if ((bits & IE_BITS_PINNED_SEND) && ip_active(port)) { + ip_mq_unlock(port); + is_write_unlock(space); + mach_port_guard_exception_pinned(space, name, + MPG_FLAGS_MOD_REFS_PINNED_DESTROY); + return KERN_INVALID_CAPABILITY; + } - request = ipc_right_request_cancel(port, name, entry); - assert(!ip_is_pinned(port)); + /* point of no return */ + + if (ip_active(port)) { + request = ipc_right_request_cancel(port, name, entry); + } else { + assert((type & MACH_PORT_TYPE_RECEIVE) == 0); + entry->ie_request = IE_REQ_NONE; + } + + if (type == MACH_PORT_TYPE_SEND) { + ipc_hash_delete(space, ip_to_object(port), name, entry); + } ipc_entry_dealloc(space, ip_to_object(port), name, entry); is_write_unlock(space); @@ -982,8 +946,6 @@ ipc_right_destroy( ipc_port_destroy(port); /* clears receiver, consumes our ref, unlocks */ } else if (type & MACH_PORT_TYPE_SEND_ONCE) { - assert(port->ip_sorights > 0); - port->ip_reply_context = 0; ipc_notify_send_once_and_unlock(port); /* consumes our ref */ } else { assert(!ip_in_space(port, space)); @@ -1021,7 +983,7 @@ ipc_right_destroy( * Returns: * KERN_SUCCESS A user ref was released. * KERN_INVALID_RIGHT Entry has wrong type. - * KERN_INVALID_CAPABILITY Deallocating a pinned right. + * KERN_INVALID_CAPABILITY Deallocating a pinned right. 
*/ kern_return_t @@ -1060,7 +1022,6 @@ ipc_right_dealloc( case MACH_PORT_TYPE_DEAD_NAME: { dead_name: - assert(IE_BITS_UREFS(bits) > 0); assert(entry->ie_request == IE_REQ_NONE); assert(entry->ie_object == IPC_OBJECT_NULL); @@ -1091,24 +1052,14 @@ dead_name: port = entry->ie_port; assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_COPYIN_REASON_NONE)) { bits = entry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); goto dead_name; /* it will release port */ } /* port is locked and active */ - assert(port->ip_sorights > 0); - - /* - * clear any reply context: - * no one will be sending the response b/c we are destroying - * the single, outstanding send once right. - */ - port->ip_reply_context = 0; - request = ipc_right_request_cancel(port, name, entry); - assert(!ip_is_pinned(port)); ipc_entry_dealloc(space, ip_to_object(port), name, entry); is_write_unlock(space); @@ -1130,7 +1081,7 @@ dead_name: port = entry->ie_port; assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_COPYIN_REASON_NONE)) { bits = entry->ie_bits; assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_DEAD_NAME); goto dead_name; /* it will release port */ @@ -1140,12 +1091,11 @@ dead_name: assert(port->ip_srights > 0); if (IE_BITS_UREFS(bits) == 1) { - if (ip_is_pinned(port)) { + if (bits & IE_BITS_PINNED_SEND) { ip_mq_unlock(port); - is_write_unlock(space); - mach_port_guard_exception_pinned(space, name, port, MPG_FLAGS_MOD_REFS_PINNED_DEALLOC); - return KERN_INVALID_CAPABILITY; + goto destroy_pinned; } + ip_srights_dec(port); if (port->ip_srights == 0) { nsrequest = ipc_notify_no_senders_prepare(port); @@ -1191,6 +1141,11 @@ dead_name: assert(port->ip_srights > 0); if (IE_BITS_UREFS(bits) == 1) { + if (bits & IE_BITS_PINNED_SEND) { + ip_mq_unlock(port); + goto destroy_pinned; + } + ip_srights_dec(port); if (port->ip_srights == 0) { nsrequest = ipc_notify_no_senders_prepare(port); @@ -1215,11 +1170,19 @@ dead_name: default: is_write_unlock(space); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_DEALLOC, bits), + kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; } return KERN_SUCCESS; + +destroy_pinned: + is_write_unlock(space); + mach_port_guard_exception_pinned(space, name, + MPG_FLAGS_MOD_REFS_PINNED_DEALLOC); + return KERN_INVALID_CAPABILITY; } /* @@ -1247,9 +1210,10 @@ ipc_right_delta( { ipc_port_t port = IP_NULL; ipc_port_t port_to_release = IP_NULL; - ipc_entry_bits_t bits; + ipc_entry_bits_t bits = entry->ie_bits; - bits = entry->ie_bits; + /* Debugging information in case a mach port guard exception is raised */ + bool throw_exception = false; /* * The following is used (for case MACH_PORT_RIGHT_DEAD_NAME) in the @@ -1269,7 +1233,7 @@ ipc_right_delta( ipc_pset_t pset; if ((bits & MACH_PORT_TYPE_PORT_SET) == 0) { - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + throw_exception = true; goto invalid_right; } @@ -1301,8 +1265,8 @@ ipc_right_delta( ipc_port_t request = IP_NULL; if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { - if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) { - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + if ((bits & IE_BITS_EX_RECEIVE) == 0) { + throw_exception = true; } goto invalid_right; } @@ -1357,7 +1321,7 @@ ipc_right_delta( * into the hash table. 
*/ bits &= ~MACH_PORT_TYPE_RECEIVE; - bits |= MACH_PORT_TYPE_EX_RECEIVE; + bits |= IE_BITS_EX_RECEIVE; ipc_hash_insert(space, ip_to_object(port), name, entry); ip_reference(port); @@ -1369,8 +1333,8 @@ ipc_right_delta( * or use ipc_right_dncancel, because the * port is destroyed "first". */ - bits &= ~IE_BITS_TYPE_MASK; - bits |= (MACH_PORT_TYPE_DEAD_NAME | MACH_PORT_TYPE_EX_RECEIVE); + bits &= ~(IE_BITS_TYPE_MASK | IE_BITS_PINNED_SEND | IE_BITS_IMMOVABLE_SEND); + bits |= (MACH_PORT_TYPE_DEAD_NAME | IE_BITS_EX_RECEIVE); if (entry->ie_request) { entry->ie_request = IE_REQ_NONE; /* if urefs are pegged due to overflow, leave them pegged */ @@ -1387,7 +1351,6 @@ ipc_right_delta( assert(IE_BITS_UREFS(bits) == 0); request = ipc_right_request_cancel(port, name, entry); - assert(!ip_is_pinned(port)); ipc_entry_dealloc(space, ip_to_object(port), name, entry); } is_write_unlock(space); @@ -1413,9 +1376,10 @@ ipc_right_delta( port = entry->ie_port; assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_COPYIN_REASON_NONE)) { assert(!(entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE)); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + bits = entry->ie_bits; + throw_exception = true; /* port has died and removed from entry, release port */ goto invalid_right; } @@ -1433,15 +1397,7 @@ ipc_right_delta( goto success; } - /* - * clear any reply context: - * no one will be sending the response b/c we are destroying - * the single, outstanding send once right. - */ - port->ip_reply_context = 0; - request = ipc_right_request_cancel(port, name, entry); - assert(!ip_is_pinned(port)); ipc_entry_dealloc(space, ip_to_object(port), name, entry); is_write_unlock(space); @@ -1461,11 +1417,11 @@ ipc_right_delta( port = entry->ie_port; assert(port != IP_NULL); - if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (!ipc_right_check(space, port, name, entry, IPC_COPYIN_REASON_NONE)) { /* port is locked and active */ ip_mq_unlock(port); port = IP_NULL; - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + throw_exception = true; goto invalid_right; } bits = entry->ie_bits; @@ -1473,7 +1429,7 @@ ipc_right_delta( port_to_release = port; port = IP_NULL; } else if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0) { - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + throw_exception = true; goto invalid_right; } @@ -1543,7 +1499,7 @@ ipc_right_delta( && (((bits & MACH_PORT_TYPE_RECEIVE) == 0) || (delta != 1)) #endif ) { - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + throw_exception = true; } goto invalid_right; } @@ -1553,8 +1509,9 @@ ipc_right_delta( port = entry->ie_port; assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, IPC_COPYIN_REASON_NONE)) { assert((entry->ie_bits & MACH_PORT_TYPE_SEND) == 0); + bits = entry->ie_bits; /* port has died and removed from entry, release port */ goto invalid_right; } @@ -1591,13 +1548,10 @@ ipc_right_delta( } if ((urefs + delta) == 0) { - if (ip_is_pinned(port)) { + if (bits & IE_BITS_PINNED_SEND) { ip_mq_unlock(port); - is_write_unlock(space); - mach_port_guard_exception_pinned(space, name, port, MPG_FLAGS_MOD_REFS_PINNED_DEALLOC); - return KERN_INVALID_CAPABILITY; + goto destroy_pinned; } - ip_srights_dec(port); if (port->ip_srights == 0) { nsrequest = ipc_notify_no_senders_prepare(port); @@ -1619,7 +1573,6 
@@ ipc_right_delta( request = ipc_right_request_cancel(port, name, entry); ipc_hash_delete(space, ip_to_object(port), name, entry); - assert(!ip_is_pinned(port)); ipc_entry_dealloc(space, ip_to_object(port), name, entry); port_to_release = port; @@ -1665,6 +1618,11 @@ invalid_right: if (port != IP_NULL) { ip_release(port); } + if (throw_exception) { + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_DELTA, right, bits), + kGUARD_EXC_INVALID_RIGHT); + } return KERN_INVALID_RIGHT; invalid_value: @@ -1672,11 +1630,20 @@ invalid_value: if (port_to_release) { ip_release(port_to_release); } - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_VALUE); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_VALUE_DELTA, right, (uint16_t)delta, + IE_BITS_UREFS(bits)), + kGUARD_EXC_INVALID_VALUE); return KERN_INVALID_VALUE; guard_failure: return KERN_INVALID_RIGHT; + +destroy_pinned: + is_write_unlock(space); + mach_port_guard_exception_pinned(space, name, + MPG_FLAGS_MOD_REFS_PINNED_DEALLOC); + return KERN_INVALID_CAPABILITY; } /* @@ -1716,15 +1683,19 @@ ipc_right_destruct( is_write_unlock(space); /* No exception if we used to have receive and held entry since */ - if ((bits & MACH_PORT_TYPE_EX_RECEIVE) == 0) { - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + if ((bits & IE_BITS_EX_RECEIVE) == 0) { + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_DESTRUCT, bits), + kGUARD_EXC_INVALID_RIGHT); } return KERN_INVALID_RIGHT; } if (srdelta && (bits & MACH_PORT_TYPE_SEND) == 0) { is_write_unlock(space); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_DESTRUCT, bits), + kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; } @@ -1815,7 +1786,7 @@ ipc_right_destruct( * into the hash table. */ bits &= ~MACH_PORT_TYPE_RECEIVE; - bits |= MACH_PORT_TYPE_EX_RECEIVE; + bits |= IE_BITS_EX_RECEIVE; ipc_hash_insert(space, ip_to_object(port), name, entry); ip_reference(port); @@ -1827,8 +1798,8 @@ ipc_right_destruct( * or use ipc_right_dncancel, because the * port is destroyed "first". */ - bits &= ~IE_BITS_TYPE_MASK; - bits |= (MACH_PORT_TYPE_DEAD_NAME | MACH_PORT_TYPE_EX_RECEIVE); + bits &= ~(IE_BITS_TYPE_MASK | IE_BITS_PINNED_SEND); + bits |= (MACH_PORT_TYPE_DEAD_NAME | IE_BITS_EX_RECEIVE); if (entry->ie_request) { entry->ie_request = IE_REQ_NONE; if (IE_BITS_UREFS(bits) < MACH_PORT_UREFS_MAX) { @@ -1843,7 +1814,6 @@ ipc_right_destruct( assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_RECEIVE); assert(IE_BITS_UREFS(bits) == 0); request = ipc_right_request_cancel(port, name, entry); - assert(!ip_is_pinned(port)); ipc_entry_dealloc(space, ip_to_object(port), name, entry); } @@ -1862,7 +1832,10 @@ ipc_right_destruct( invalid_value: is_write_unlock(space); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_VALUE); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_VALUE_DESTRUCT, srdelta, + IE_BITS_UREFS(bits)), + kGUARD_EXC_INVALID_VALUE); return KERN_INVALID_VALUE; } @@ -1911,7 +1884,7 @@ ipc_right_info( * types while we still have it locked. Otherwise, * recapture the (now dead) bits. */ - if (!ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (!ipc_right_check(space, port, name, entry, IPC_COPYIN_REASON_NONE)) { if (request != IE_REQ_NONE) { type |= ipc_port_request_type(port, name, request); } @@ -1944,52 +1917,60 @@ ipc_right_info( * The space is locked (read or write) and active. 
*/ -boolean_t +bool ipc_right_copyin_check_reply( __assert_only ipc_space_t space, mach_port_name_t reply_name, ipc_entry_t reply_entry, - mach_msg_type_name_t reply_type, - ipc_entry_t dest_entry, - uint8_t *reply_port_semantics_violation) + mach_msg_type_name_t reply_type) { - ipc_entry_bits_t bits; - ipc_port_t reply_port; - ipc_port_t dest_port; - bool violate_reply_port_semantics = false; + ipc_entry_bits_t reply_bits = reply_entry->ie_bits; + ipc_port_t reply_port = reply_entry->ie_port; - bits = reply_entry->ie_bits; assert(is_active(space)); + if (ip_is_reply_port(reply_port) && + !MACH_MSG_TYPE_PORT_ANY_SEND_ONCE(reply_type)) { + return false; + } + switch (reply_type) { case MACH_MSG_TYPE_MAKE_SEND: - if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { - return FALSE; + if ((reply_bits & MACH_PORT_TYPE_RECEIVE) == 0) { + return false; } break; case MACH_MSG_TYPE_MAKE_SEND_ONCE: - if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { - return FALSE; + if ((reply_bits & MACH_PORT_TYPE_RECEIVE) == 0) { + return false; } break; case MACH_MSG_TYPE_MOVE_RECEIVE: /* ipc_kmsg_copyin_header already filters it out */ - return FALSE; + return false; + + case MACH_MSG_TYPE_MOVE_SEND: + if ((reply_bits & IE_BITS_PINNED_SEND) && + ip_active(reply_port) && + IE_BITS_UREFS(reply_bits) == 1) { + mach_port_guard_exception_pinned(space, reply_name, + MPG_FLAGS_MOD_REFS_PINNED_COPYIN); + return false; + } + OS_FALLTHROUGH; case MACH_MSG_TYPE_COPY_SEND: - case MACH_MSG_TYPE_MOVE_SEND: case MACH_MSG_TYPE_MOVE_SEND_ONCE: { - if (bits & MACH_PORT_TYPE_DEAD_NAME) { + if (reply_bits & MACH_PORT_TYPE_DEAD_NAME) { break; } - if ((bits & MACH_PORT_TYPE_SEND_RIGHTS) == 0) { - return FALSE; + if ((reply_bits & MACH_PORT_TYPE_SEND_RIGHTS) == 0) { + return false; } - reply_port = reply_entry->ie_port; assert(reply_port != IP_NULL); /* @@ -2002,22 +1983,21 @@ ipc_right_copyin_check_reply( } /* - * Can't copyin a send right that is marked immovable. This bit - * is set only during port creation and never unset. So it can - * be read without a lock. + * Can't copyin a send right that is marked immovable. This bit is on + * the entry and protected by the space lock. */ - if (ip_is_immovable_send(reply_port)) { - mach_port_guard_exception_immovable(space, reply_name, reply_port); + if (reply_entry->ie_bits & IE_BITS_IMMOVABLE_SEND) { + mach_port_guard_exception_immovable(space, reply_name, reply_port, MACH_MSG_TYPE_MOVE_SEND_ONCE, reply_entry); return FALSE; } if (reply_type == MACH_MSG_TYPE_MOVE_SEND_ONCE) { - if ((bits & MACH_PORT_TYPE_SEND_ONCE) == 0) { - return FALSE; + if ((reply_bits & MACH_PORT_TYPE_SEND_ONCE) == 0) { + return false; } } else { - if ((bits & MACH_PORT_TYPE_SEND) == 0) { - return FALSE; + if ((reply_bits & MACH_PORT_TYPE_SEND) == 0) { + return false; } } @@ -2028,38 +2008,7 @@ ipc_right_copyin_check_reply( panic("ipc_right_copyin_check: strange rights"); } - if ((IE_BITS_TYPE(dest_entry->ie_bits) == MACH_PORT_TYPE_PORT_SET) || - (IE_BITS_TYPE(reply_entry->ie_bits) == MACH_PORT_TYPE_PORT_SET)) { - return TRUE; - } - - /* The only disp allowed when a reply port is a local port of mach msg is MAKE_SO. */ - reply_port = reply_entry->ie_port; - assert(reply_port != IP_NULL); - - if (ip_active(reply_port)) { - if (ip_is_reply_port(reply_port) && (reply_type != MACH_MSG_TYPE_MAKE_SEND_ONCE)) { - return FALSE; - } - - /* When sending a msg to remote port that requires reply port semantics enforced the local port of that msg needs to be a reply port. 
*/ - dest_port = dest_entry->ie_port; - if (IP_VALID(dest_port)) { - ip_mq_lock(dest_port); - if (ip_active(dest_port)) { - /* populates reply_port_semantics_violation if we need to send telemetry */ - violate_reply_port_semantics = ip_violates_rigid_reply_port_semantics(dest_port, reply_port, reply_port_semantics_violation) || - ip_violates_reply_port_semantics(dest_port, reply_port, reply_port_semantics_violation); - } - ip_mq_unlock(dest_port); - if (violate_reply_port_semantics && reply_port_semantics) { - mach_port_guard_exception(reply_name, 0, kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS); - return FALSE; - } - } - } - - return TRUE; + return true; } /* @@ -2165,19 +2114,20 @@ ipc_right_copyin( mach_port_name_t name, mach_msg_type_name_t msgt_name, ipc_object_copyin_flags_t flags, + ipc_copyin_op_t copyin_reason, ipc_entry_t entry, ipc_port_t *portp, ipc_copyin_cleanup_t *icc, ipc_copyin_rcleanup_t *icrc) { - ipc_entry_bits_t bits; - ipc_port_t port; + ipc_entry_bits_t bits = entry->ie_bits; + ipc_port_t port = entry->ie_port; + ipc_object_label_t label; kern_return_t kr; + uint32_t moves = (flags & IPC_OBJECT_COPYIN_FLAGS_DEST_EXTRA_MOVE) ? 2 : 1; - boolean_t deadok = !!(flags & IPC_OBJECT_COPYIN_FLAGS_DEADOK); - boolean_t allow_imm_send = !!(flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); - boolean_t allow_reply_make_so = !!(flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_REPLY_MAKE_SEND_ONCE); - boolean_t allow_reply_move_so = !!(flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_REPLY_MOVE_SEND_ONCE); + bool deadok = !!(flags & IPC_OBJECT_COPYIN_FLAGS_DEADOK); + bool allow_imm_send = !!(flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND); if (flags & IPC_OBJECT_COPYIN_FLAGS_DEST_EXTRA_MOVE) { assert((flags & IPC_OBJECT_COPYIN_FLAGS_DEST_EXTRA_COPY) == 0); @@ -2192,24 +2142,25 @@ ipc_right_copyin( icc->icc_release_port = IP_NULL; icc->icc_deleted_port = IP_NULL; - bits = entry->ie_bits; - assert(is_active(space)); + /* Only allow send_once disposition on certain ports */ + if (IP_VALID(port) && ip_is_reply_port(port) && + !MACH_MSG_TYPE_PORT_ANY_SEND_ONCE(msgt_name)) { + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_NONE, ip_type(port), msgt_name), + kGUARD_EXC_IMMOVABLE); + return KERN_INVALID_CAPABILITY; + } + switch (msgt_name) { case MACH_MSG_TYPE_MAKE_SEND: { if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { goto invalid_right; } - port = entry->ie_port; assert(port != IP_NULL); - if (ip_is_reply_port(port)) { - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); - return KERN_INVALID_CAPABILITY; - } - ip_mq_lock(port); assert(ip_get_receiver_name(port) == name); assert(ip_in_space(port, space)); @@ -2222,15 +2173,20 @@ ipc_right_copyin( } case MACH_MSG_TYPE_MAKE_SEND_ONCE: { + bool send_telemetry = false; + if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { goto invalid_right; } - - port = entry->ie_port; assert(port != IP_NULL); - if ((ip_is_reply_port(port)) && !allow_reply_make_so) { - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + if (ip_is_reply_port(port) && + (copyin_reason != IPC_COPYIN_KMSG_REPLY && + copyin_reason != IPC_COPYIN_KMSG_DESTINATION)) { + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_COPYIN, copyin_reason, + msgt_name), + kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_CAPABILITY; } @@ -2239,70 +2195,56 @@ ipc_right_copyin( assert(ip_get_receiver_name(port) == name); assert(ip_in_space(port, space)); - ipc_port_make_sonce_locked(port); - ip_mq_unlock(port); - - *portp = port; - break; - } - - case MACH_MSG_TYPE_MOVE_RECEIVE: 
{ - bool allow_imm_recv = false; - ipc_port_t request = IP_NULL; - - if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { - goto invalid_right; - } - - port = entry->ie_port; - assert(port != IP_NULL); - - ip_mq_lock(port); - require_ip_active(port); - assert(ip_get_receiver_name(port) == name); - assert(ip_in_space(port, space)); - /* - * Disallow moving receive-right kobjects/kolabel, e.g. mk_timer ports - * The ipc_port structure uses the kdata union of kobject and - * imp_task exclusively. Thus, general use of a kobject port as - * a receive right can cause type confusion in the importance - * code. + * Reply ports can extend one single + * send-once right at any given moment. */ - if (ip_is_kobject(port) || ip_is_kolabeled(port)) { - /* - * Distinguish an invalid right, e.g., trying to move - * a send right as a receive right, from this - * situation which is, "This is a valid receive right, - * but it's also a kobject and you can't move it." - */ - ip_mq_unlock(port); - mach_port_guard_exception(name, 0, kGUARD_EXC_IMMOVABLE); +#if DEVELOPMENT || DEBUG + if (ip_is_reply_port(port) && (port->ip_sorights > 0)) { + send_telemetry = true; + } +#endif /* DEVELOPMENT || DEBUG */ + + ipc_port_make_sonce_locked(port); + ip_mq_unlock(port); + + if (__improbable(send_telemetry)) { + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_COPYIN, copyin_reason, + msgt_name), + kGUARD_EXC_REPLY_PORT_SINGLE_SO_RIGHT); + } + + *portp = port; + break; + } + + case MACH_MSG_TYPE_MOVE_RECEIVE: { + ipc_port_t request = IP_NULL; + + if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { + goto invalid_right; + } + assert(port != IP_NULL); + + /* + * ipc_move_receive_allowed raises the appropritate + * Guard exception if needed + */ + if (!ipc_move_receive_allowed(space, port, name)) { return KERN_INVALID_CAPABILITY; } - if (port->ip_service_port && port->ip_splabel && - !ipc_service_port_label_is_bootstrap_port((ipc_service_port_label_t)port->ip_splabel)) { - allow_imm_recv = !!(flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_RECEIVE); - } else if (ip_is_libxpc_connection_port(port)) { - allow_imm_recv = !!(flags & IPC_OBJECT_COPYIN_FLAGS_ALLOW_CONN_IMMOVABLE_RECEIVE); - } - - if ((!allow_imm_recv && port->ip_immovable_receive) || - ip_is_reply_port(port) || /* never move reply port rcv right */ - port->ip_specialreply) { - assert(!ip_in_space(port, ipc_space_kernel)); - ip_mq_unlock(port); - assert(current_task() != kernel_task); - mach_port_guard_exception(name, 0, kGUARD_EXC_IMMOVABLE); - return KERN_INVALID_CAPABILITY; - } + label = ip_mq_lock_label_get(port); + require_ip_active(port); + assert(ip_get_receiver_name(port) == name); + assert(ip_in_space(port, space)); if (icrc->icrc_guarded_desc) { kr = ipc_right_copyin_check_guard_locked(port, name, icrc->icrc_guarded_desc); if (kr != KERN_SUCCESS) { - ip_mq_unlock(port); + ip_mq_unlock_label_put(port, &label); return kr; } /* this flag will be cleared during copyout */ @@ -2317,7 +2259,7 @@ ipc_right_copyin( assert(port->ip_srights > 0); bits &= ~MACH_PORT_TYPE_RECEIVE; - bits |= MACH_PORT_TYPE_EX_RECEIVE; + bits |= IE_BITS_EX_RECEIVE; entry->ie_bits = bits; ipc_hash_insert(space, ip_to_object(port), name, entry); ip_reference(port); @@ -2327,12 +2269,11 @@ ipc_right_copyin( assert(IE_BITS_UREFS(bits) == 0); request = ipc_right_request_cancel(port, name, entry); - assert(!ip_is_pinned(port)); ipc_entry_dealloc(space, ip_to_object(port), name, entry); } - /* ipc_port_clear_receiver unguards the port and clears the ip_immovable_receive bit */ - 
(void)ipc_port_clear_receiver(port, FALSE, &icrc->icrc_free_list); /* don't destroy the port/mqueue */ + /* This will unguard the port and make it movable. */ + ipc_port_mark_in_limbo(port, &label, &icrc->icrc_free_list); #if IMPORTANCE_INHERITANCE /* @@ -2353,6 +2294,7 @@ ipc_right_copyin( } #endif /* IMPORTANCE_INHERITANCE */ + /* We already set the label above */ ip_mq_unlock(port); *portp = port; @@ -2373,10 +2315,7 @@ ipc_right_copyin( assert(IE_BITS_UREFS(bits) > 0); - port = entry->ie_port; - assert(port != IP_NULL); - - if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, copyin_reason)) { bits = entry->ie_bits; icc->icc_release_port = port; goto copy_dead; @@ -2391,15 +2330,9 @@ ipc_right_copyin( goto invalid_right; } - if (ip_is_reply_port(port)) { + if (!allow_imm_send && (entry->ie_bits & IE_BITS_IMMOVABLE_SEND)) { ip_mq_unlock(port); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); - return KERN_INVALID_CAPABILITY; - } - - if (!allow_imm_send && ip_is_immovable_send(port)) { - ip_mq_unlock(port); - mach_port_guard_exception_immovable(space, name, port); + mach_port_guard_exception_immovable(space, name, port, MACH_MSG_TYPE_COPY_SEND, entry); return KERN_INVALID_CAPABILITY; } @@ -2427,11 +2360,9 @@ ipc_right_copyin( } assert(IE_BITS_UREFS(bits) > 0); - - port = entry->ie_port; assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, IPC_OBJECT_COPYIN_FLAGS_NONE)) { + if (ipc_right_check(space, port, name, entry, copyin_reason)) { bits = entry->ie_bits; icc->icc_release_port = port; goto move_dead; @@ -2444,22 +2375,16 @@ ipc_right_copyin( goto invalid_right; } - if (ip_is_pinned(port) && IE_BITS_UREFS(bits) == moves) { + if ((bits & IE_BITS_PINNED_SEND) && IE_BITS_UREFS(bits) == moves) { ip_mq_unlock(port); mach_port_guard_exception_pinned(space, name, - port, MPG_FLAGS_MOD_REFS_PINNED_COPYIN); + MPG_FLAGS_MOD_REFS_PINNED_COPYIN); return KERN_INVALID_CAPABILITY; } - if (ip_is_reply_port(port)) { + if (!allow_imm_send && (entry->ie_bits & IE_BITS_IMMOVABLE_SEND)) { ip_mq_unlock(port); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); - return KERN_INVALID_CAPABILITY; - } - - if (!allow_imm_send && ip_is_immovable_send(port)) { - ip_mq_unlock(port); - mach_port_guard_exception_immovable(space, name, port); + mach_port_guard_exception_immovable(space, name, port, MACH_MSG_TYPE_MOVE_SEND, entry); return KERN_INVALID_CAPABILITY; } @@ -2523,7 +2448,7 @@ ipc_right_copyin( } case MACH_MSG_TYPE_MOVE_SEND_ONCE: { - ipc_port_t request; + ipc_port_t request = IP_NULL; if (bits & MACH_PORT_TYPE_DEAD_NAME) { goto move_dead; @@ -2536,11 +2461,9 @@ ipc_right_copyin( } assert(IE_BITS_UREFS(bits) > 0); - - port = entry->ie_port; assert(port != IP_NULL); - if (ipc_right_check(space, port, name, entry, flags)) { + if (ipc_right_check(space, port, name, entry, copyin_reason)) { bits = entry->ie_bits; icc->icc_release_port = port; goto move_dead; @@ -2560,15 +2483,17 @@ ipc_right_copyin( goto invalid_right; } - if (ip_is_reply_port(port) && !allow_reply_move_so) { + if (ip_is_reply_port(port) && copyin_reason != IPC_COPYIN_KMSG_DESTINATION) { ip_mq_unlock(port); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_COPYIN, copyin_reason, msgt_name), + kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_CAPABILITY; } - if (!allow_imm_send && ip_is_immovable_send(port)) { + if (!allow_imm_send && 
(entry->ie_bits & IE_BITS_IMMOVABLE_SEND)) { ip_mq_unlock(port); - mach_port_guard_exception_immovable(space, name, port); + mach_port_guard_exception_immovable(space, name, port, MACH_MSG_TYPE_MOVE_SEND_ONCE, entry); return KERN_INVALID_CAPABILITY; } @@ -2577,7 +2502,6 @@ ipc_right_copyin( assert(port->ip_sorights > 0); request = ipc_right_request_cancel(port, name, entry); - assert(!ip_is_pinned(port)); ipc_entry_dealloc(space, ip_to_object(port), name, entry); ip_mq_unlock(port); @@ -2630,7 +2554,7 @@ move_dead: } /* - * Routine: ipc_right_copyout + * Routine: ipc_right_copyout_any_send * Purpose: * Copyout a capability to a space. * If successful, consumes a ref for the port. @@ -2646,37 +2570,28 @@ move_dead: * The space is write-locked and active. * The port is locked and active. * The port is unlocked; the space isn't. - * Returns: - * KERN_SUCCESS Copied out capability. */ -kern_return_t -ipc_right_copyout( +void +ipc_right_copyout_any_send( ipc_space_t space, ipc_port_t port, mach_msg_type_name_t msgt_name, ipc_object_copyout_flags_t flags, mach_port_name_t name, - ipc_entry_t entry, - mach_msg_guarded_port_descriptor_t *gdesc) + ipc_entry_t entry) { - ipc_entry_bits_t bits; - mach_port_name_t sp_name = MACH_PORT_NULL; - mach_port_context_t sp_context = 0; - - bits = entry->ie_bits; + ipc_entry_bits_t bits = entry->ie_bits; assert(IP_VALID(port)); assert(ip_active(port)); assert(entry->ie_port == port); + ipc_object_label_t label = ip_label_get(port); - if (flags & IPC_OBJECT_COPYOUT_FLAGS_PINNED) { - assert(!ip_is_pinned(port)); - assert(ip_is_immovable_send(port)); - assert(task_is_immovable(space->is_task)); - assert(task_is_pinned(space->is_task)); - port->ip_pinned = 1; + if (ipc_should_mark_immovable_send(space->is_task, port, label)) { + bits |= IE_BITS_IMMOVABLE_SEND; } + ip_label_put(port, &label); switch (msgt_name) { case MACH_MSG_TYPE_PORT_SEND_ONCE: @@ -2685,7 +2600,7 @@ ipc_right_copyout( assert(IE_BITS_UREFS(bits) == 0); assert(port->ip_sorights > 0); - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { ipc_port_adjust_special_reply_port_locked(port, current_thread()->ith_knote, IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE); /* port unlocked on return */ @@ -2701,24 +2616,12 @@ ipc_right_copyout( assert(port->ip_srights > 0); if (bits & MACH_PORT_TYPE_SEND) { - mach_port_urefs_t urefs = IE_BITS_UREFS(bits); + __assert_only mach_port_urefs_t urefs = IE_BITS_UREFS(bits); assert(port->ip_srights > 1); assert(urefs > 0); assert(urefs <= MACH_PORT_UREFS_MAX); - if (urefs == MACH_PORT_UREFS_MAX) { - /* - * leave urefs pegged to maximum, - * consume send right and ref - */ - - ip_srights_dec(port); - ip_mq_unlock(port); - ip_release_live(port); - return KERN_SUCCESS; - } - /* consume send right and ref */ ip_srights_dec(port); ip_mq_unlock(port); @@ -2742,151 +2645,148 @@ ipc_right_copyout( ipc_hash_insert(space, ip_to_object(port), name, entry); } - entry->ie_bits = (bits | MACH_PORT_TYPE_SEND) + 1; /* increment urefs */ + if (flags & IPC_OBJECT_COPYOUT_FLAGS_PINNED) { + bits |= IE_BITS_PINNED_SEND; + } + if (IE_BITS_UREFS(bits) != MACH_PORT_UREFS_MAX) { + bits = (bits | MACH_PORT_TYPE_SEND) + 1; /* increment urefs */ + } + entry->ie_bits = bits; ipc_entry_modified(space, name, entry); break; - case MACH_MSG_TYPE_PORT_RECEIVE: { - ipc_port_t dest; -#if IMPORTANCE_INHERITANCE - natural_t assertcnt = port->ip_impcount; -#endif /* IMPORTANCE_INHERITANCE */ - - assert(port->ip_mscount == 0); - assert(!ip_in_a_space(port)); - - /* - * Don't copyout kobjects or 
kolabels as receive right - */ - if (ip_is_kobject(port) || ip_is_kolabeled(port)) { - panic("ipc_right_copyout: Copyout kobject/kolabel as receive right"); - } - - dest = ip_get_destination(port); - - /* port transitions to IN-SPACE state */ - port->ip_receiver_name = name; - port->ip_receiver = space; - - struct knote *kn = current_thread()->ith_knote; - - if (gdesc && gdesc->flags & MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE) { - assert(port->ip_immovable_receive == 0); - port->ip_guarded = 1; - port->ip_strict_guard = 0; - /* pseudo receive shouldn't set the receive right as immovable in the sender's space */ - if (kn != ITH_KNOTE_PSEUDO) { - port->ip_immovable_receive = 1; - } - port->ip_context = current_thread()->ith_recv_bufs.recv_msg_addr; - gdesc->u_context = port->ip_context; - gdesc->flags &= ~MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND; - } - - if (ip_is_libxpc_connection_port(port)) { - /* - * There are 3 ways to reach here. - * 1. A libxpc client successfully sent this receive right to a named service - * and we are copying out in that service's ipc space. - * 2. A libxpc client tried doing (1) but failed so we are doing pseudo-receive. - * 3. Kernel sent this receive right to a libxpc client as a part of port destroyed notification. - * - * This flag needs to be set again in all 3 cases as they reset it as part of their flow. - */ - port->ip_immovable_receive = 1; - } - - /* Check if this is a service port */ - if (port->ip_service_port) { - assert(port->ip_splabel != NULL); - /* - * This flag gets reset during all 3 ways described above for libxpc connection port. - * The only difference is launchd acts as an initiator instead of a libxpc client. - */ - if (service_port_defense_enabled) { - port->ip_immovable_receive = 1; - } - - /* Check if this is a port-destroyed notification to ensure - * that initproc doesnt end up with a guarded service port - * sent in a regular message - */ - if (!ipc_service_port_label_is_pd_notification((ipc_service_port_label_t)port->ip_splabel)) { - goto skip_sp_check; - } - - ipc_service_port_label_clear_flag(port->ip_splabel, ISPL_FLAGS_SEND_PD_NOTIFICATION); -#if !(DEVELOPMENT || DEBUG) - if (get_bsdtask_info(current_task()) != initproc) { - goto skip_sp_check; - } -#endif /* !(DEVELOPMENT || DEBUG) */ - ipc_service_port_label_get_attr(port->ip_splabel, &sp_name, &sp_context); - assert(sp_name != MACH_PORT_NULL); - /* Verify the port name and restore the guard value, if any */ - if (name != sp_name) { - panic("Service port name = 0x%x doesnt match the stored launchd port name = 0x%x", name, sp_name); - } - if (sp_context) { - port->ip_guarded = 1; - port->ip_strict_guard = 1; - port->ip_context = sp_context; - } - } -skip_sp_check: - - assert((bits & MACH_PORT_TYPE_RECEIVE) == 0); - if (bits & MACH_PORT_TYPE_SEND) { - assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); - assert(IE_BITS_UREFS(bits) > 0); - assert(port->ip_srights > 0); - } else { - assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE); - assert(IE_BITS_UREFS(bits) == 0); - } - entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE; - ipc_entry_modified(space, name, entry); - - boolean_t sync_bootstrap_checkin = FALSE; - if (kn != ITH_KNOTE_PSEUDO && port->ip_sync_bootstrap_checkin) { - sync_bootstrap_checkin = TRUE; - } - if (!ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE)) { - kn = NULL; - } - ipc_port_adjust_port_locked(port, kn, sync_bootstrap_checkin); - /* port unlocked */ - - if (bits & MACH_PORT_TYPE_SEND) { - ip_release_live(port); - - /* entry is locked holding ref, so can use port */ - 
ipc_hash_delete(space, ip_to_object(port), name, entry); - } - - if (dest != IP_NULL) { -#if IMPORTANCE_INHERITANCE - /* - * Deduct the assertion counts we contributed to - * the old destination port. They've already - * been reflected into the task as a result of - * getting enqueued. - */ - ip_mq_lock(dest); - ipc_port_impcount_delta(dest, 0 - assertcnt, IP_NULL); - ip_mq_unlock(dest); -#endif /* IMPORTANCE_INHERITANCE */ - - /* Drop turnstile ref on dest */ - ipc_port_send_turnstile_complete(dest); - /* space lock is held */ - ip_release_safe(dest); - } - break; - } - default: - ipc_unreachable("ipc_right_copyout: strange rights"); + ipc_unreachable("ipc_right_copyout_any_send: strange rights"); + } +} + +void +ipc_right_copyout_recv_and_unlock_space( + ipc_space_t space, + ipc_port_t port, + ipc_object_label_t *label, + mach_port_name_t name, + ipc_entry_t entry, + mach_msg_guarded_port_descriptor_t *gdesc) +{ + struct knote *kn; + ipc_port_t dest; +#if IMPORTANCE_INHERITANCE + natural_t assertcnt = port->ip_impcount; +#endif /* IMPORTANCE_INHERITANCE */ + ipc_object_state_t in_space = IO_STATE_INACTIVE; /* means default */ + ipc_entry_bits_t bits = entry->ie_bits; + + assert(IP_VALID(port)); + assert(ip_active(port)); + assert(entry->ie_port == port); + assert(port->ip_mscount == 0); + + kn = current_thread()->ith_knote; + + if (gdesc && gdesc->flags & MACH_MSG_GUARD_FLAGS_IMMOVABLE_RECEIVE) { + port->ip_guarded = 1; + port->ip_strict_guard = 0; + port->ip_context = current_thread()->ith_recv_bufs.recv_msg_addr; + gdesc->u_context = port->ip_context; + gdesc->flags &= ~MACH_MSG_GUARD_FLAGS_UNGUARDED_ON_SEND; + in_space = IO_STATE_IN_SPACE_IMMOVABLE; + } + + if (ip_is_any_service_port_type(label->io_type) && + label->io_state == IO_STATE_IN_TRANSIT_PD) { + ipc_service_port_label_t sp_label = label->iol_service; + + /* + * Check if this is a special port-destroyed + * notification to ensure that initproc doesnt end up + * with a guarded service port sent in a regular message + */ + +#if !(DEVELOPMENT || DEBUG) +#if CONFIG_COALITIONS + if (!task_is_in_privileged_coalition(current_task(), COALITION_TYPE_JETSAM)) { + panic("Service port not sent back to launchd"); + } +#else /* CONFIG_COALITIONS */ + if (!task_is_initproc(current_task())) { + panic("Service port not sent back to launchd"); + } +#endif /* CONFIG_COALITIONS */ +#endif /* !(DEVELOPMENT || DEBUG) */ + + /* + * If the service port was guarded, verify the port name + * and restore the guard value. + * + * See mach_port_construct(). + */ + if (sp_label->ispl_launchd_name) { + if (name != sp_label->ispl_launchd_name) { + panic("Service port name = 0x%x doesnt match " + "the stored launchd port name = 0x%x", + name, sp_label->ispl_launchd_name); + } + + port->ip_guarded = 1; + port->ip_strict_guard = 1; + port->ip_context = sp_label->ispl_launchd_context; + } + } + + /* + * pseudo receive shouldn't set the receive right + * as immovable in the sender's space, it clearly moved once. + */ + dest = ipc_port_mark_in_space(port, label, space, name, + (kn == ITH_KNOTE_PSEUDO) ? 
IO_STATE_IN_SPACE : in_space); + + assert((bits & MACH_PORT_TYPE_RECEIVE) == 0); + if (bits & MACH_PORT_TYPE_SEND) { + assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); + assert(IE_BITS_UREFS(bits) > 0); + assert(port->ip_srights > 0); + } else { + assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE); + assert(IE_BITS_UREFS(bits) == 0); + } + entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE; + ipc_entry_modified(space, name, entry); + + boolean_t sync_bootstrap_checkin = FALSE; + if (kn != ITH_KNOTE_PSEUDO && port->ip_sync_bootstrap_checkin) { + sync_bootstrap_checkin = TRUE; + } + if (!ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE)) { + kn = NULL; + } + ipc_port_adjust_port_locked(port, kn, sync_bootstrap_checkin); + /* port unlocked */ + + if (bits & MACH_PORT_TYPE_SEND) { + ip_release_live(port); + + /* entry is locked holding ref, so can use port */ + ipc_hash_delete(space, ip_to_object(port), name, entry); + } + + is_write_unlock(space); + + if (dest != IP_NULL) { +#if IMPORTANCE_INHERITANCE + /* + * Deduct the assertion counts we contributed to + * the old destination port. They've already + * been reflected into the task as a result of + * getting enqueued. + */ + ip_mq_lock(dest); + ipc_port_impcount_delta(dest, 0 - assertcnt, IP_NULL); + ip_mq_unlock(dest); +#endif /* IMPORTANCE_INHERITANCE */ + + /* Drop turnstile ref on dest */ + ipc_port_send_turnstile_complete(dest); + ip_release(dest); } - return KERN_SUCCESS; } diff --git a/osfmk/ipc/ipc_right.h b/osfmk/ipc/ipc_right.h index 5e580f683..3afd3593d 100644 --- a/osfmk/ipc/ipc_right.h +++ b/osfmk/ipc/ipc_right.h @@ -76,8 +76,6 @@ __BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN #define ipc_right_lookup_two_read ipc_right_lookup_two_write -extern bool service_port_defense_enabled; - /* Find an entry in a space, given the name */ extern kern_return_t ipc_right_lookup_read( ipc_space_t space, @@ -112,6 +110,7 @@ extern kern_return_t ipc_right_request_alloc( mach_port_name_t name, ipc_port_request_opts_t options, ipc_port_t notify, + mach_msg_id_t id, ipc_port_t *previousp); /* Check if an entry is being used */ @@ -119,12 +118,12 @@ extern bool ipc_right_inuse( ipc_entry_t entry); /* Check if the port has died */ -extern boolean_t ipc_right_check( +extern bool ipc_right_check( ipc_space_t space, ipc_port_t port, mach_port_name_t name, ipc_entry_t entry, - ipc_object_copyin_flags_t flags); + ipc_copyin_op_t copyin_reason); /* Clean up an entry in a dead space */ extern void ipc_right_terminate( @@ -136,9 +135,7 @@ extern void ipc_right_terminate( extern kern_return_t ipc_right_destroy( ipc_space_t space, mach_port_name_t name, - ipc_entry_t entry, - boolean_t check_guard, - uint64_t guard); + ipc_entry_t entry); /* Release a send/send-once/dead-name user reference */ extern kern_return_t ipc_right_dealloc( @@ -171,13 +168,11 @@ extern kern_return_t ipc_right_info( mach_port_urefs_t *urefsp); /* Check if a subsequent ipc_right_copyin of the reply port will succeed */ -extern boolean_t ipc_right_copyin_check_reply( +extern bool ipc_right_copyin_check_reply( ipc_space_t space, mach_port_name_t reply_name, ipc_entry_t reply_entry, - mach_msg_type_name_t reply_type, - ipc_entry_t dest_entry, - uint8_t *reply_port_semantics_violation); + mach_msg_type_name_t reply_type); typedef struct { ipc_port_t icc_release_port; @@ -210,18 +205,27 @@ extern kern_return_t ipc_right_copyin( mach_port_name_t name, mach_msg_type_name_t msgt_name, ipc_object_copyin_flags_t flags, + ipc_copyin_op_t copyin_reason, ipc_entry_t entry, ipc_port_t *portp, 
ipc_copyin_cleanup_t *icc, ipc_copyin_rcleanup_t *icrc); -/* Copyout a capability to a space */ -extern kern_return_t ipc_right_copyout( +/* Copyout a send or send-once capability to a space */ +extern void ipc_right_copyout_any_send( ipc_space_t space, ipc_port_t port, mach_msg_type_name_t msgt_name, ipc_object_copyout_flags_t flags, mach_port_name_t name, + ipc_entry_t entry); + +/* Copyout a receive capability to a space */ +extern void ipc_right_copyout_recv_and_unlock_space( + ipc_space_t space, + ipc_port_t port, + ipc_object_label_t *label, + mach_port_name_t name, ipc_entry_t entry, mach_msg_guarded_port_descriptor_t *gdesc); diff --git a/osfmk/ipc/ipc_service_port.c b/osfmk/ipc/ipc_service_port.c index d104dcac6..1e606874a 100644 --- a/osfmk/ipc/ipc_service_port.c +++ b/osfmk/ipc/ipc_service_port.c @@ -26,15 +26,10 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include -#include -#include -#include -#include +#include #include -#include -#include #include +#include #include #include @@ -66,7 +61,7 @@ kdp_ipc_fill_splabel(struct ipc_service_port_label *ispl, #if CONFIG_SERVICE_PORT_INFO *namep = ispl->ispl_service_name; spl->portlabel_domain = ispl->ispl_domain; - if (ipc_service_port_label_is_throttled(ispl)) { + if (ispl->ispl_throttled) { spl->portlabel_flags |= STACKSHOT_PORTLABEL_THROTTLED; } #endif @@ -86,9 +81,11 @@ kdp_ipc_fill_splabel(struct ipc_service_port_label *ispl, * KERN_SUCCESS */ kern_return_t -ipc_service_port_label_alloc(mach_service_port_info_t sp_info, void **port_label_ptr) +ipc_service_port_label_alloc( + mach_service_port_info_t sp_info, + ipc_object_label_t *label) { - ipc_service_port_label_t sp_label = IPC_SERVICE_PORT_LABEL_NULL; + ipc_service_port_label_t sp_label = NULL; kern_return_t ret; void *sblabel = NULL; @@ -112,12 +109,27 @@ ipc_service_port_label_alloc(mach_service_port_info_t sp_info, void **port_label #endif /* CONFIG_SERVICE_PORT_INFO */ if (sp_info->mspi_domain_type == XPC_DOMAIN_PORT) { - sp_label->ispl_flags |= ISPL_FLAGS_BOOTSTRAP_PORT; + sp_label->ispl_bootstrap_port = true; + } + + label->iol_service = sp_label; + if (sblabel) { + /* always filter service ports with a label */ + label->io_filtered = true; + } + if (sp_label->ispl_bootstrap_port) { + /* bootstrap ports are completely immovable thank you */ + label->io_state = IO_STATE_IN_SPACE_IMMOVABLE; } - *port_label_ptr = (void *)sp_label; return KERN_SUCCESS; } +void +ipc_connection_port_label_dealloc(ipc_object_label_t label) +{ + mach_msg_filter_dealloc_service_port_sblabel_callback(label.iol_connection); +} + /* * Name: ipc_service_port_dealloc * @@ -131,21 +143,17 @@ ipc_service_port_label_alloc(mach_service_port_info_t sp_info, void **port_label * Should not be called with the port lock held. 
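In the service-port rework above, the untyped ip_splabel pointer gives way to an ipc_object_label_t that carries the object type, movability state, and filtering bit together, so the label allocation path can decide up front that labeled services are filtered and that bootstrap ports stay immovable. A minimal sketch of that shape, with hypothetical toy_* names and none of the kernel's zone allocation or sandbox callbacks:

#include <stdbool.h>
#include <stdlib.h>

/* Hypothetical model of a label that carries type, state and filter bit. */
enum toy_type  { TOY_PORT, TOY_SERVICE_PORT };
enum toy_state { TOY_IN_SPACE, TOY_IN_SPACE_IMMOVABLE };

struct toy_service_label {
	bool bootstrap_port;
	bool throttled;
};

struct toy_object_label {
	enum toy_type             type;
	enum toy_state            state;
	bool                      filtered;
	struct toy_service_label *service;
};

/* Fill the whole label in one place, the way the new allocation path does. */
static int
toy_service_label_alloc(bool is_bootstrap, bool has_sblabel,
    struct toy_object_label *label)
{
	struct toy_service_label *sp = calloc(1, sizeof(*sp));

	if (sp == NULL) {
		return -1;
	}
	sp->bootstrap_port = is_bootstrap;

	label->type     = TOY_SERVICE_PORT;
	label->service  = sp;
	label->filtered = has_sblabel;      /* labeled services are always filtered */
	label->state    = is_bootstrap ?    /* bootstrap ports never move */
	    TOY_IN_SPACE_IMMOVABLE : TOY_IN_SPACE;
	return 0;
}

int
main(void)
{
	struct toy_object_label label = { 0 };
	int rc = toy_service_label_alloc(true, false, &label);

	free(label.service);
	return rc;
}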
*/ void -ipc_service_port_label_dealloc(void *ip_splabel, bool service_port) +ipc_service_port_label_dealloc(ipc_object_label_t label) { - void *sblabel = ip_splabel; + ipc_service_port_label_t sp_label = label.iol_service; + void *sblabel = sp_label->ispl_sblabel; - if (service_port) { - ipc_service_port_label_t sp_label = (ipc_service_port_label_t)ip_splabel; - sblabel = sp_label->ispl_sblabel; #if CONFIG_SERVICE_PORT_INFO - kfree_data(sp_label->ispl_service_name, strlen(sp_label->ispl_service_name) + 1); + kfree_data(sp_label->ispl_service_name, strlen(sp_label->ispl_service_name) + 1); #endif /* CONFIG_SERVICE_PORT_INFO */ - zfree(ipc_service_port_label_zone, sp_label); - } + zfree(ipc_service_port_label_zone, sp_label); if (sblabel) { - assert(mach_msg_filter_dealloc_service_port_sblabel_callback); mach_msg_filter_dealloc_service_port_sblabel_callback(sblabel); } } @@ -166,14 +174,13 @@ ipc_service_port_label_dealloc(void *ip_splabel, bool service_port) * KERN_INVALID_CAPABILITY: service_port_name is not a right to a service port */ kern_return_t -ipc_service_port_derive_sblabel(mach_port_name_t service_port_name, void **sblabel_ptr, bool *filter_msgs) +ipc_service_port_derive_sblabel( + mach_port_name_t service_port_name, + bool force, + ipc_object_label_t *label) { - ipc_service_port_label_t port_label; - void *derived_sblabel = NULL; - void *sblabel = NULL; ipc_port_t port; kern_return_t kr; - boolean_t send_side_filtering = FALSE; #if CONFIG_MACF && XNU_TARGET_OS_OSX struct mach_service_port_info sp_info = {}; #endif @@ -183,36 +190,39 @@ ipc_service_port_derive_sblabel(mach_port_name_t service_port_name, void **sblab } if (mach_msg_filter_at_least(MACH_MSG_FILTER_CALLBACKS_VERSION_1)) { + ipc_object_label_t sp_label; + boolean_t send_side_filtering; + void *sblabel = NULL; + kr = ipc_port_translate_send(current_space(), service_port_name, &port); if (kr != KERN_SUCCESS) { return kr; } /* port is locked and active */ - if (ip_is_kolabeled(port) || !port->ip_service_port) { - ip_mq_unlock(port); + sp_label = ip_label_get(port); + if (!ip_is_any_service_port_type(sp_label.io_type)) { + ip_mq_unlock_label_put(port, &sp_label); return KERN_INVALID_CAPABILITY; } - port_label = (ipc_service_port_label_t)port->ip_splabel; - if (!port_label) { - ip_mq_unlock(port); - return KERN_SUCCESS; - } - #if CONFIG_MACF && XNU_TARGET_OS_OSX - ipc_service_port_label_get_info(port_label, &sp_info); + ipc_service_port_label_get_info(sp_label.iol_service, &sp_info); #endif - sblabel = port_label->ispl_sblabel; + sblabel = sp_label.iol_service->ispl_sblabel; if (sblabel) { mach_msg_filter_retain_sblabel_callback(sblabel); } - ip_mq_unlock(port); + ip_mq_unlock_label_put(port, &sp_label); if (sblabel) { /* This callback will release the reference on sblabel */ - derived_sblabel = mach_msg_filter_derive_sblabel_from_service_port_callback(sblabel, &send_side_filtering); + label->iol_connection = mach_msg_filter_derive_sblabel_from_service_port_callback(sblabel, + &send_side_filtering); + if (label->iol_connection && (send_side_filtering || force)) { + label->io_filtered = true; + } } #if CONFIG_MACF && XNU_TARGET_OS_OSX @@ -222,100 +232,9 @@ ipc_service_port_derive_sblabel(mach_port_name_t service_port_name, void **sblab #endif } - *sblabel_ptr = derived_sblabel; - *filter_msgs = (bool)send_side_filtering; return KERN_SUCCESS; } -/* - * Name: ipc_service_port_get_sblabel - * - * Description: Get the port's sandbox label. 
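The derive path above now fills an ipc_object_label_t directly and marks the derived connection label as filtered only when the Sandbox callback requests send-side filtering or the caller forces it. A hedged standalone model of just that decision; the toy_* names and the pass-through callback are stand-ins for the real filter plumbing:

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-in for the sandbox label object. */
struct toy_conn_label { int unused; };

/* Models the Sandbox derive callback; here it just passes the label through. */
static struct toy_conn_label *
toy_derive_sblabel(struct toy_conn_label *service_sblabel, bool *send_side)
{
	*send_side = true;
	return service_sblabel;
}

static void
toy_derive_connection_label(struct toy_conn_label *service_sblabel, bool force,
    struct toy_conn_label **out, bool *filtered)
{
	bool send_side = false;

	*out = NULL;
	*filtered = false;
	if (service_sblabel != NULL) {
		*out = toy_derive_sblabel(service_sblabel, &send_side);
		/* filter only when the callback or the caller asks for it */
		if (*out != NULL && (send_side || force)) {
			*filtered = true;
		}
	}
}

int
main(void)
{
	struct toy_conn_label service = { 0 };
	struct toy_conn_label *conn;
	bool filtered;

	toy_derive_connection_label(&service, false, &conn, &filtered);
	return filtered ? 0 : 1;
}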
- * - * Args: - * port - * - * Conditions: - * Should be called on an active port with the lock held. - * - * Returns: - * Sandbox label - */ -void * -ipc_service_port_get_sblabel(ipc_port_t port) -{ - void *sblabel = NULL; - void *ip_splabel = NULL; - - if (port == IP_NULL) { - return NULL; - } - - ip_mq_lock_held(port); - assert(ip_active(port)); - - if (ip_is_kolabeled(port) || !port->ip_splabel) { - return NULL; - } - - ip_splabel = port->ip_splabel; - - if (!port->ip_service_port) { - sblabel = ip_splabel; - assert(sblabel != NULL); - } else { - ipc_service_port_label_t sp_label = (ipc_service_port_label_t)ip_splabel; - sblabel = sp_label->ispl_sblabel; - } - - return sblabel; -} - -/* - * Name: ipc_service_port_label_set_attr - * - * Description: Set the remaining port label attributes after port allocation - * - * Args: - * port_splabel - * name : port name in launchd's ipc space - * context : launchd's port guard; will be restored after a port destroyed notification if non-zero - * - * Conditions: - * Should be called only once in mach_port_construct on a newly created port with the lock held - * The context should be set only if the port is guarded. - */ -void -ipc_service_port_label_set_attr(ipc_service_port_label_t port_splabel, mach_port_name_t name, mach_port_context_t context) -{ - assert(port_splabel->ispl_launchd_name == MACH_PORT_NULL); - port_splabel->ispl_launchd_name = name; - port_splabel->ispl_launchd_context = context; - if (context) { - ipc_service_port_label_set_flag(port_splabel, ISPL_FLAGS_SPECIAL_PDREQUEST); - } -} - -/* - * Name: ipc_service_port_label_get_attr - * - * Description: Get the port label attributes - * - * Args: - * port_splabel - * name : port name in launchd's ipc space - * context : launchd's port guard - * - * Conditions: - * Should be called with port lock held. 
- */ -void -ipc_service_port_label_get_attr(ipc_service_port_label_t port_splabel, mach_port_name_t *name, mach_port_context_t *context) -{ - *name = port_splabel->ispl_launchd_name; - *context = port_splabel->ispl_launchd_context; -} - #if CONFIG_SERVICE_PORT_INFO void ipc_service_port_label_get_info(ipc_service_port_label_t port_splabel, mach_service_port_info_t info) diff --git a/osfmk/ipc/ipc_service_port.h b/osfmk/ipc/ipc_service_port.h index f1d1ec4e7..c4cfba4e1 100644 --- a/osfmk/ipc/ipc_service_port.h +++ b/osfmk/ipc/ipc_service_port.h @@ -29,100 +29,54 @@ #ifndef _IPC_IPC_SERVICE_PORT_H_ #define _IPC_IPC_SERVICE_PORT_H_ -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - #include -#include #include -#include -#include #ifdef MACH_KERNEL_PRIVATE -__options_decl(ipc_service_port_label_flags_t, uint16_t, { - ISPL_FLAGS_SPECIAL_PDREQUEST = 1,/* Special port destroyed notification for service ports */ - ISPL_FLAGS_SEND_PD_NOTIFICATION = (1 << 1),/* Port destroyed notification is being sent */ - ISPL_FLAGS_BOOTSTRAP_PORT = (1 << 2), - ISPL_FLAGS_THROTTLED = (1 << 3),/* Service throttled by launchd */ -}); - struct ipc_service_port_label { - void * XNU_PTRAUTH_SIGNED_PTR("ipc_service_port_label.ispl_sblabel") ispl_sblabel; /* points to the Sandbox's message filtering data structure */ - mach_port_context_t ispl_launchd_context; /* context used to guard the port, specific to launchd */ - mach_port_name_t ispl_launchd_name; /* port name in launchd's ipc space */ - ipc_service_port_label_flags_t ispl_flags; + /* points to the Sandbox's message filtering data structure */ + struct ipc_conn_port_label *XNU_PTRAUTH_SIGNED_PTR_AUTH_NULL("ipc_service_port_label.ispl_sblabel") ispl_sblabel; + mach_port_context_t ispl_launchd_context; /* context used to guard the port, specific to launchd */ + mach_port_name_t ispl_launchd_name; /* port name in launchd's ipc space */ + uint8_t ispl_bootstrap_port : 1; /* port is a boostrap port */ + uint8_t ispl_throttled : 1; /* service throttled by launchd */ + uint8_t __ispl_unused : 6; #if CONFIG_SERVICE_PORT_INFO - uint8_t ispl_domain; /* launchd domain */ - char *ispl_service_name; /* string name used to identify the service port */ + uint8_t ispl_domain; /* launchd domain */ + char *ispl_service_name; /* string name used to identify the service port */ #endif /* CONFIG_SERVICE_PORT_INFO */ }; -typedef struct ipc_service_port_label* ipc_service_port_label_t; - -#define IPC_SERVICE_PORT_LABEL_NULL ((ipc_service_port_label_t)NULL) - -/* - * These ispl_flags based macros/functions should be called with the port lock held - */ -#define ipc_service_port_label_is_special_pdrequest(port_splabel) \ - (((port_splabel)->ispl_flags & ISPL_FLAGS_SPECIAL_PDREQUEST) == ISPL_FLAGS_SPECIAL_PDREQUEST) - -#define ipc_service_port_label_is_pd_notification(port_splabel) \ - (((port_splabel)->ispl_flags & ISPL_FLAGS_SEND_PD_NOTIFICATION) == ISPL_FLAGS_SEND_PD_NOTIFICATION) - -#define ipc_service_port_label_is_bootstrap_port(port_splabel) \ - (((port_splabel)->ispl_flags & ISPL_FLAGS_BOOTSTRAP_PORT) == ISPL_FLAGS_BOOTSTRAP_PORT) - -#define ipc_service_port_label_is_throttled(port_splabel) \ - (((port_splabel)->ispl_flags & ISPL_FLAGS_THROTTLED) == ISPL_FLAGS_THROTTLED) - -static inline void -ipc_service_port_label_set_flag(ipc_service_port_label_t port_splabel, ipc_service_port_label_flags_t flag) -{ - assert(port_splabel != IPC_SERVICE_PORT_LABEL_NULL); - port_splabel->ispl_flags |= flag; -} - -static inline void 
-ipc_service_port_label_clear_flag(ipc_service_port_label_t port_splabel, ipc_service_port_label_flags_t flag) -{ - assert(port_splabel != IPC_SERVICE_PORT_LABEL_NULL); - port_splabel->ispl_flags &= ~flag; -} +typedef struct ipc_service_port_label *ipc_service_port_label_t; /* Function declarations */ -kern_return_t -ipc_service_port_label_alloc(mach_service_port_info_t sp_info, void **port_label_ptr); +extern kern_return_t ipc_service_port_label_alloc( + mach_service_port_info_t sp_info, + ipc_object_label_t *label); -void -ipc_service_port_label_dealloc(void * ip_splabel, bool service_port); +extern void ipc_connection_port_label_dealloc( + ipc_object_label_t label); -kern_return_t -ipc_service_port_derive_sblabel(mach_port_name_t service_port_name, void **sblabel_ptr, bool *filter_msgs); +extern void ipc_service_port_label_dealloc( + ipc_object_label_t label); -void * -ipc_service_port_get_sblabel(ipc_port_t port); +extern kern_return_t ipc_service_port_derive_sblabel( + mach_port_name_t service_port_name, + bool force, + ipc_object_label_t *label); -void -ipc_service_port_label_set_attr(ipc_service_port_label_t port_splabel, mach_port_name_t name, mach_port_context_t context); - -void -ipc_service_port_label_get_attr(ipc_service_port_label_t port_splabel, mach_port_name_t *name, mach_port_context_t *context); +extern void ipc_service_port_label_set_attr( + ipc_service_port_label_t port_splabel, + mach_port_name_t name, + mach_port_context_t context); #if CONFIG_SERVICE_PORT_INFO -void -ipc_service_port_label_get_info(ipc_service_port_label_t port_splabel, mach_service_port_info_t info); -#endif /* CONFIG_SERVICE_PORT_INFO */ +extern void ipc_service_port_label_get_info( + ipc_service_port_label_t port_splabel, + mach_service_port_info_t info); + +#endif /* CONFIG_SERVICE_PORT_INFO */ #endif /* MACH_KERNEL_PRIVATE */ #endif /* _IPC_IPC_SERVICE_PORT_H_ */ diff --git a/osfmk/ipc/ipc_space.c b/osfmk/ipc/ipc_space.c index e317f2c7f..47f839d82 100644 --- a/osfmk/ipc/ipc_space.c +++ b/osfmk/ipc/ipc_space.c @@ -178,21 +178,6 @@ ipc_space_lock_sleep( THREAD_UNINT, TIMEOUT_WAIT_FOREVER); } -/* Routine: ipc_space_get_rollpoint - * Purpose: - * Generate a new gencount rollover point from a space's entropy pool - */ -ipc_entry_bits_t -ipc_space_get_rollpoint( - ipc_space_t space) -{ - return random_bool_gen_bits( - &space->bool_gen, - &space->is_entropy[0], - IS_ENTROPY_CNT, - IE_BITS_ROLL_BITS); -} - /* * Routine: ipc_entry_rand_freelist * Purpose: @@ -234,7 +219,9 @@ ipc_space_rand_freelist( */ while (bottom <= top) { ipc_entry_t entry = &table[curr]; + mach_port_index_t next; int which; + #ifdef CONFIG_SEMI_RANDOM_ENTRIES /* * XXX: This is a horrible hack to make sure that randomizing the port @@ -245,13 +232,11 @@ ipc_space_rand_freelist( which = 0; } else #endif - which = random_bool_gen_bits( - &space->bool_gen, - &space->is_entropy[0], - IS_ENTROPY_CNT, - 1); + { + which = random_bool_gen_bits(&space->is_prng, + space->is_entropy, IS_ENTROPY_CNT, 1); + } - mach_port_index_t next; if (which) { next = top; top--; @@ -261,14 +246,14 @@ ipc_space_rand_freelist( } /* - * The entry's gencount will roll over on its first allocation, at which - * point a random rollover will be set for the entry. + * The entry's gencount will roll over on its first allocation, + * at which point a random rollover will be set for the entry. 
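The header changes above retire the ISPL_FLAGS_* option word and its set/clear helpers in favor of one-bit bitfields on the label structure. The small program below, using hypothetical toy_* types, shows the two equivalent representations side by side; the bitfield form needs no mask macros but encodes the same data:

#include <stdbool.h>
#include <stdint.h>
#include <assert.h>

/* Old style: one flags word plus mask macros. */
#define TOY_FLAG_BOOTSTRAP  (1u << 0)
#define TOY_FLAG_THROTTLED  (1u << 1)

struct toy_label_v1 {
	uint16_t flags;
};

/* New style: one named bit per property, no helper macros needed. */
struct toy_label_v2 {
	uint8_t bootstrap : 1;
	uint8_t throttled : 1;
	uint8_t unused    : 6;
};

int
main(void)
{
	struct toy_label_v1 a = { .flags = TOY_FLAG_THROTTLED };
	struct toy_label_v2 b = { .throttled = 1 };

	assert((a.flags & TOY_FLAG_THROTTLED) != 0);
	assert(b.throttled);        /* same question, no mask arithmetic */
	return 0;
}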
*/ - entry->ie_bits = IE_BITS_GEN_MASK; - entry->ie_next = next; + entry->ie_bits = IE_BITS_GEN_INIT; + entry->ie_next = next; curr = next; } - table[curr].ie_bits = IE_BITS_GEN_MASK; + table[curr].ie_bits = IE_BITS_GEN_INIT; } @@ -300,14 +285,13 @@ ipc_space_create( space = ipc_space_alloc(); count = ipc_entry_table_count(table); - random_bool_init(&space->bool_gen); + random_bool_init(&space->is_prng); ipc_space_rand_freelist(space, ipc_entry_table_base(table), 0, count); os_ref_init_count_mask(&space->is_bits, IS_FLAGS_BITS, &is_refgrp, 2, 0); space->is_table_free = count - 1; space->is_label = label; space->is_low_mod = count; - space->is_node_id = HOST_LOCAL_NODE; /* HOST_LOCAL_NODE, except proxy spaces */ smr_init_store(&space->is_table, table); *spacep = space; @@ -375,6 +359,7 @@ ipc_space_add_label( is_write_unlock(space); return KERN_SUCCESS; } + /* * Routine: ipc_space_create_special * Purpose: @@ -382,27 +367,24 @@ ipc_space_add_label( * doesn't hold rights in the normal way. * Instead it is place-holder for holding * disembodied (naked) receive rights. - * See ipc_port_alloc_special/ipc_port_dealloc_special. + * See ipc_port_alloc_special. * Conditions: * Nothing locked. * Returns: * KERN_SUCCESS Created a space. * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. */ - -kern_return_t -ipc_space_create_special( - ipc_space_t *spacep) +ipc_space_t +ipc_space_create_special(void) { ipc_space_t space; space = ipc_space_alloc(); os_ref_init_count_mask(&space->is_bits, IS_FLAGS_BITS, &is_refgrp, 1, 0); - space->is_label = IPC_LABEL_SPECIAL; - space->is_node_id = HOST_LOCAL_NODE; /* HOST_LOCAL_NODE, except proxy spaces */ + ipc_space_set_policy(space, IPC_SPACE_POLICY_KERNEL); + space->is_label = IPC_LABEL_SPECIAL; - *spacep = space; - return KERN_SUCCESS; + return space; } /* @@ -452,6 +434,22 @@ ipc_space_terminate( * was a receive right in this space. */ + for (mach_port_index_t index = 1; + ipc_entry_table_contains(table, index); + index++) { + ipc_entry_t entry = ipc_entry_table_get_nocheck(table, index); + mach_port_type_t type; + + type = IE_BITS_TYPE(entry->ie_bits); + if (type & MACH_PORT_TYPE_RECEIVE) { + mach_port_name_t name; + + name = MACH_PORT_MAKE(index, + IE_BITS_GEN(entry->ie_bits)); + ipc_right_terminate(space, name, entry); + } + } + for (mach_port_index_t index = 1; ipc_entry_table_contains(table, index); index++) { diff --git a/osfmk/ipc/ipc_space.h b/osfmk/ipc/ipc_space.h index 0c03d9f44..98e7d5cba 100644 --- a/osfmk/ipc/ipc_space.h +++ b/osfmk/ipc/ipc_space.h @@ -72,8 +72,6 @@ #ifndef _IPC_IPC_SPACE_H_ #define _IPC_IPC_SPACE_H_ - -#include #include #include #include @@ -81,18 +79,18 @@ #include -#ifdef __APPLE_API_PRIVATE #ifdef MACH_KERNEL_PRIVATE -#include -#include +#include #include -#include -#include -#include #include #include #include +#endif + +__BEGIN_DECLS + +#ifdef MACH_KERNEL_PRIVATE /* * Every task has a space of IPC capabilities. 
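The freelist routine in the hunk above links the table's entries into a free list in randomized order: each step draws one random bit and takes the next entry from either the bottom or the top of the not-yet-linked range. Below is a simplified userspace sketch of that construction; rand() stands in for the kernel's entropy-fed boolean generator, and the gencount initialization is omitted:

#include <stdio.h>
#include <stdlib.h>

#define TABLE_SIZE 8

struct toy_entry {
	unsigned next;      /* index of the next free entry */
};

/*
 * Link entries [1, count) into a free list in random order: each step
 * flips a coin and appends either the lowest or the highest index that
 * has not been linked yet.  Index 0 stays reserved, as in the kernel.
 */
static void
toy_rand_freelist(struct toy_entry *table, unsigned count)
{
	unsigned bottom = 1, top = count - 1, curr = 0;

	while (bottom <= top) {
		unsigned next;

		if (rand() & 1) {   /* kernel uses an entropy-backed bool generator */
			next = top--;
		} else {
			next = bottom++;
		}
		table[curr].next = next;
		curr = next;
	}
	table[curr].next = 0;       /* terminate the list */
}

int
main(void)
{
	struct toy_entry table[TABLE_SIZE] = { 0 };

	toy_rand_freelist(table, TABLE_SIZE);
	for (unsigned i = table[0].next; i != 0; i = table[i].next) {
		printf("%u ", i);
	}
	printf("\n");
	return 0;
}

Each index from 1 to count-1 appears exactly once in the resulting chain, which is the property the kernel relies on when handing out names.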
@@ -120,29 +118,66 @@ typedef natural_t ipc_space_refs_t; #define IS_AT_MAX_LIMIT_NOTIFY 0x10 /* space has hit the max limit */ #define IS_AT_MAX_LIMIT_NOTIFIED 0x20 /* sent max limit notification */ +/* is_telemetry flags */ +__options_decl(is_telemetry_t, uint8_t, { + IS_HAS_BOOTSTRAP_PORT_TELEMETRY = 0x01, /* space has emitted a bootstrap port telemetry */ + IS_HAS_CREATE_PRP_TELEMETRY = 0x02, /* space has emitted a create provisional reply port telemetry */ + IS_HAS_SERVICE_PORT_TELEMETRY = 0x04, /* space has emitted a service port telemetry */ + IS_HAS_MOVE_PRP_TELEMETRY = 0x08, /* space has emitted a move provisional reply port telemetry */ +}); + struct ipc_space { lck_ticket_t is_lock; os_ref_atomic_t is_bits; /* holds refs, active, growing */ ipc_entry_num_t is_table_hashed;/* count of hashed elements */ ipc_entry_num_t is_table_free; /* count of free elements */ + unsigned int is_entropy[IS_ENTROPY_CNT]; /* pool of entropy taken from RNG */ + struct bool_gen is_prng; SMR_POINTER(ipc_entry_table_t XNU_PTRAUTH_SIGNED_PTR("ipc_space.is_table")) is_table; /* an array of entries */ task_t XNU_PTRAUTH_SIGNED_PTR("ipc_space.is_task") is_task; /* associated task */ + unsigned long is_policy; /* manually dPACed, ipc_space_policy_t */ thread_t is_grower; /* thread growing the space */ ipc_label_t is_label; /* [private] mandatory access label */ ipc_entry_num_t is_low_mod; /* lowest modified entry during growth */ ipc_entry_num_t is_high_mod; /* highest modified entry during growth */ - struct bool_gen bool_gen; /* state for boolean RNG */ - unsigned int is_entropy[IS_ENTROPY_CNT]; /* pool of entropy taken from RNG */ - int is_node_id; /* HOST_LOCAL_NODE, or remote node if proxy space */ #if CONFIG_PROC_RESOURCE_LIMITS ipc_entry_num_t is_table_size_soft_limit; /* resource_notify is sent when the table size hits this limit */ ipc_entry_num_t is_table_size_hard_limit; /* same as soft limit except the task is killed soon after data collection */ #endif /* CONFIG_PROC_RESOURCE_LIMITS */ + _Atomic is_telemetry_t is_telemetry; /* rate limit each type of telemetry to once per space */ }; #define IS_NULL ((ipc_space_t) 0) #define IS_INSPECT_NULL ((ipc_space_inspect_t) 0) +static inline uintptr_t +ipc_space_policy_discriminator(ipc_space_t is) +{ + uint16_t base = ptrauth_string_discriminator("ipc_space.is_policy"); + + return ptrauth_blend_discriminator(&is->is_policy, base); +} + +static inline ipc_space_policy_t +ipc_space_policy(ipc_space_t is) +{ + unsigned long policy = is->is_policy; + + return (ipc_space_policy_t)(unsigned long)ptrauth_auth_data( + __unsafe_forge_single(void *, policy), + ptrauth_key_process_independent_data, + ipc_space_policy_discriminator(is)); +} + +static inline void +ipc_space_set_policy(ipc_space_t is, ipc_space_policy_t policy) +{ + is->is_policy = (unsigned long)ptrauth_sign_unauthenticated( + (void *)(unsigned long)policy, + ptrauth_key_process_independent_data, + ipc_space_policy_discriminator(is)); +} + static inline bool is_bits_set(ipc_space_t is, uint32_t bit) { @@ -237,8 +272,6 @@ extern lck_attr_t ipc_lck_attr; #define is_reference(is) ipc_space_reference(is) #define is_release(is) ipc_space_release(is) -#define current_space() (current_task()->itk_space) - extern void ipc_space_lock( ipc_space_t space); @@ -252,8 +285,7 @@ extern void ipc_space_retire_table( ipc_entry_table_t table); /* Create a special IPC space */ -extern kern_return_t ipc_space_create_special( - ipc_space_t *spacep); +extern ipc_space_t ipc_space_create_special(void); /* Create a new IPC 
space */ extern kern_return_t ipc_space_create( @@ -281,9 +313,6 @@ extern void ipc_space_rand_freelist( mach_port_index_t bottom, mach_port_index_t top); -/* Generate a new gencount rollover point from a space's entropy pool */ -extern ipc_entry_bits_t ipc_space_get_rollpoint(ipc_space_t space); - #if CONFIG_PROC_RESOURCE_LIMITS /* Set limits on a space's size */ extern kern_return_t ipc_space_set_table_size_limits( @@ -305,15 +334,6 @@ extern void ipc_space_set_at_max_limit( ipc_space_t space); #endif /* MACH_KERNEL_PRIVATE */ -#endif /* __APPLE_API_PRIVATE */ - -#ifdef __APPLE_API_UNSTABLE -#ifndef MACH_KERNEL_PRIVATE - -extern ipc_space_t current_space(void); - -#endif /* !MACH_KERNEL_PRIVATE */ -#endif /* __APPLE_API_UNSTABLE */ /* Take a reference on a space */ extern void ipc_space_reference( @@ -323,4 +343,6 @@ extern void ipc_space_reference( extern void ipc_space_release( ipc_space_t space); +__END_DECLS + #endif /* _IPC_IPC_SPACE_H_ */ diff --git a/osfmk/ipc/ipc_types.h b/osfmk/ipc/ipc_types.h index 9fe712c53..3bd6a2035 100644 --- a/osfmk/ipc/ipc_types.h +++ b/osfmk/ipc/ipc_types.h @@ -69,9 +69,6 @@ typedef uint64_t ipc_label_t; #define IPC_LABEL_SPECIAL ((ipc_label_t)0x0003) #define IPC_LABEL_SPACE_MASK ((ipc_label_t)0x00ff) -#define IPC_LABEL_SUBST_TASK ((ipc_label_t)0x0100) -#define IPC_LABEL_SUBST_THREAD ((ipc_label_t)0x0200) -#define IPC_LABEL_SUBST_ONCE ((ipc_label_t)0x0300) #define IPC_LABEL_SUBST_TASK_READ ((ipc_label_t)0x0400) #define IPC_LABEL_SUBST_THREAD_READ ((ipc_label_t)0x0500) #define IPC_LABEL_SUBST_MASK ((ipc_label_t)0xff00) @@ -98,11 +95,223 @@ typedef struct ipc_importance_task *__single ipc_importance_task_t; typedef struct ipc_importance_inherit *__single ipc_importance_inherit_t; #define III_NULL ((ipc_importance_inherit_t)NULL) +/*! + * @typedef ipc_space_policy_t + * + * @brief + * Flags used to determine the IPC policy for a given task/space. + * + * @const IPC_SPACE_POLICY_INVALID + * This policy is never used, the zero value is never a valid policy. + * + * @const IPC_SPACE_POLICY_DEFAULT + * Denotes that this task has the default policy. + * This bit is always set in a properly inited policy. + * + * @const IPC_SPACE_POLICY_ENHANCED + * Denotes an IPC space for a task that has opted in some way to receive more + * security. The "enhanced" security space has several versions for bincompat + * reasons, where each increasing version opts you into more security features. + * `ENHANCED_V0` includes those opted into macOS hardened runtime + * `ENHANCED_V1` includes those opted into browser entitlements (FY24) + * `ENHANCED_V2` includes those opted into the FY25 platform restrictions entitlement + * No new features should be placed into the previous versions for bincompat + * reasons, and binaries opted into the newer versions always get the features + * from all previous versions. + * + * @const IPC_SPACE_POLICY_PLATFORM + * Denotes an IPC space for a platform binary. This flag always implies + * @c IPC_SPACE_POLICY_ENHANCED is set, meaning platform binaries always get the + * highest version of platform restrictions. + * + * @const IPC_SPACE_POLICY_KERNEL + * Denotes that this is the IPC space for the kernel. + * + * @const IPC_SPACE_POLICY_SIMULATED + * Denotes IPC spaces for simulator environments (macOS only). + * In general this bit will cause policies to be relaxed because software + * running in these environment was written before policies were made, + * and probably do not comply with them naturally. 
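To make the policy tiers described in this block concrete, here is a hypothetical query built on the same flag names and values declared just below; it illustrates how the version bits and the SIMULATED/TRANSLATED relaxations compose, and is not the kernel's actual decision logic (the toy_* constants merely mirror the declared values):

#include <stdbool.h>
#include <stdint.h>

/* Mirrors a few of the policy bits declared in this block. */
enum {
	TOY_POLICY_DEFAULT     = 0x0001,
	TOY_POLICY_ENHANCED    = 0x0002,
	TOY_POLICY_PLATFORM    = 0x0004,
	TOY_POLICY_SIMULATED   = 0x0020,
	TOY_POLICY_TRANSLATED  = 0x0040,
	TOY_POLICY_ENHANCED_V1 = 0x0200,
	TOY_POLICY_ENHANCED_V2 = 0x0400,
};

/*
 * Hypothetical query: would a space with this policy get an FY24-era
 * hardening feature?  Platform binaries always qualify; simulator and
 * translated processes are assumed to predate the policy and are left out.
 */
static bool
toy_wants_fy24_hardening(uint32_t policy)
{
	if (policy & (TOY_POLICY_SIMULATED | TOY_POLICY_TRANSLATED)) {
		return false;
	}
	if (policy & TOY_POLICY_PLATFORM) {
		return true;
	}
	return (policy & TOY_POLICY_ENHANCED) &&
	    (policy & (TOY_POLICY_ENHANCED_V1 | TOY_POLICY_ENHANCED_V2));
}

int
main(void)
{
	uint32_t policy = TOY_POLICY_DEFAULT | TOY_POLICY_ENHANCED |
	    TOY_POLICY_ENHANCED_V1;

	return toy_wants_fy24_hardening(policy) ? 0 : 1;
}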
+ * + * @const IPC_SPACE_POLICY_TRANSLATED + * Denotes IPC spaces for translated environments (macOS only). + * Similarly to @c IPC_SPACE_POLICY_SIMULATED, processes running in a Rosetta + * environment are likely older software that predate policy changes, + * and these processes tend to be opted out of certain policies as a result. + */ + +__options_closed_decl(ipc_space_policy_t, uint32_t, { + IPC_SPACE_POLICY_INVALID = 0x0000, + + /* Security level */ + IPC_SPACE_POLICY_DEFAULT = 0x0001, /* MACH64_POLICY_DEFAULT */ + IPC_SPACE_POLICY_ENHANCED = 0x0002, + IPC_SPACE_POLICY_PLATFORM = 0x0004, + IPC_SPACE_POLICY_KERNEL = 0x0010, + + /* flags to turn off security */ +#if XNU_TARGET_OS_OSX + IPC_SPACE_POLICY_SIMULATED = 0x0020, +#else + IPC_SPACE_POLICY_SIMULATED = 0x0000, +#endif +#if CONFIG_ROSETTA + IPC_SPACE_POLICY_TRANSLATED = 0x0040, +#else + IPC_SPACE_POLICY_TRANSLATED = 0x0000, +#endif +#if XNU_TARGET_OS_OSX + IPC_SPACE_POLICY_OPTED_OUT = 0x0080, +#else + IPC_SPACE_POLICY_OPTED_OUT = 0x0000, +#endif + + + IPC_SPACE_POLICY_MASK = ( + IPC_SPACE_POLICY_DEFAULT | + IPC_SPACE_POLICY_ENHANCED | + IPC_SPACE_POLICY_PLATFORM | + IPC_SPACE_POLICY_KERNEL | + IPC_SPACE_POLICY_SIMULATED | + IPC_SPACE_POLICY_TRANSLATED | + IPC_SPACE_POLICY_OPTED_OUT), + + +/* platform restrictions Versioning Levels */ + IPC_SPACE_POLICY_ENHANCED_V0 = 0x100, /* DEPRECATED - includes macos hardened runtime */ + IPC_SPACE_POLICY_ENHANCED_V1 = 0x200, /* ES features exposed to 3P in FY2024 release */ + IPC_SPACE_POLICY_ENHANCED_V2 = 0x400, /* ES features exposed to 3P in FY2025 release */ + IPC_SPACE_POLICY_ENHANCED_VERSION_MASK = ( + IPC_SPACE_POLICY_ENHANCED_V0 | + IPC_SPACE_POLICY_ENHANCED_V1 | + IPC_SPACE_POLICY_ENHANCED_V2 + ), +}); + +#define IPC_SPACE_POLICY_BASE(prefix) \ + prefix ## _DEFAULT = IPC_SPACE_POLICY_DEFAULT, \ + prefix ## _ENHANCED = IPC_SPACE_POLICY_ENHANCED, \ + prefix ## _PLATFORM = IPC_SPACE_POLICY_PLATFORM, \ + prefix ## _KERNEL = IPC_SPACE_POLICY_KERNEL, \ + prefix ## _SIMULATED = IPC_SPACE_POLICY_SIMULATED, \ + prefix ## _TRANSLATED = IPC_SPACE_POLICY_TRANSLATED, \ + prefix ## _MASK = IPC_SPACE_POLICY_MASK + #else /* MACH_KERNEL_PRIVATE */ struct ipc_object; #endif /* MACH_KERNEL_PRIVATE */ +#if XNU_KERNEL_PRIVATE + +/*! + * @brief + * Type for IPC objects + * + * @discussion + * This type is non ABI stable, and limited to XNU internally. + * Please keep this type ordered semantically for readability purposes. + * + * When adding types here, update @c mach_port_kobject_type() which maps + * these values to the previously stable legacy IKOT_* values for the sake + * of userspace (and tools like lsmp(1)). + */ +__enum_decl(ipc_object_type_t, uint8_t, { + /* + * Object is a port set (see ). + */ + IOT_PORT_SET, + + /* + * Catchall type for generic ports. + */ + IOT_PORT, + + /* + * Service/Connection ports + */ + IOT_SERVICE_PORT, + IOT_WEAK_SERVICE_PORT, + IOT_CONNECTION_PORT, + IOT_CONNECTION_PORT_WITH_PORT_ARRAY, + + /* + * Notification ports + */ + IOT_EXCEPTION_PORT, + IOT_TIMER_PORT, + + /* + * Reply Ports + */ + IOT_REPLY_PORT, + IOT_SPECIAL_REPLY_PORT, + IOT_PROVISIONAL_REPLY_PORT, + + /* + * IPC Kernel Object types + * + * Matching entries must be added to , + * and case labels to mach_port_kobject_type(). 
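As the comment above notes, the internal ipc_object_type_t values are free to be reordered, so anything exported to userspace must be translated back to stable identifiers; mach_port_kobject_type() later in this patch does exactly that with a switch. A minimal illustration of the pattern, with hypothetical toy_* enums rather than the real IOT_/IKOT_ values:

/* Internal, reorderable enumeration (hypothetical, not the real IOT_* list). */
enum toy_internal_type {
	TOY_IT_PORT,
	TOY_IT_TASK,
	TOY_IT_THREAD,
	TOY_IT_UNKNOWN,
};

/* Stable values exported to userspace; these must never change. */
enum toy_wire_type {
	TOY_WIRE_UNKNOWN = 0,
	TOY_WIRE_THREAD  = 1,
	TOY_WIRE_TASK    = 2,
};

static enum toy_wire_type
toy_wire_type_for(enum toy_internal_type t)
{
	switch (t) {
	case TOY_IT_TASK:   return TOY_WIRE_TASK;
	case TOY_IT_THREAD: return TOY_WIRE_THREAD;
	default:            return TOY_WIRE_UNKNOWN; /* new types map here until wired up */
	}
}

int
main(void)
{
	return toy_wire_type_for(TOY_IT_TASK) == TOY_WIRE_TASK ? 0 : 1;
}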
+ */ + __IKOT_FIRST, + + /* thread ports */ + IKOT_THREAD_CONTROL = __IKOT_FIRST, + IKOT_THREAD_READ, + IKOT_THREAD_INSPECT, + + /* task ports */ + IKOT_TASK_CONTROL, + IKOT_TASK_READ, + IKOT_TASK_INSPECT, + IKOT_TASK_NAME, + + IKOT_TASK_RESUME, + IKOT_TASK_ID_TOKEN, + IKOT_TASK_FATAL, /* CONFIG_PROC_RESOURCE_LIMITS only */ + + /* host services */ + IKOT_HOST, + IKOT_HOST_PRIV, + IKOT_CLOCK, + IKOT_PROCESSOR, + IKOT_PROCESSOR_SET, + IKOT_PROCESSOR_SET_NAME, + + /* common userspace used ports */ + IKOT_EVENTLINK, + IKOT_FILEPORT, + IKOT_SEMAPHORE, + IKOT_VOUCHER, + IKOT_WORK_INTERVAL, + + /* VM ports */ + IKOT_MEMORY_OBJECT, + IKOT_NAMED_ENTRY, + + /* IOKit & exclaves ports */ + IKOT_MAIN_DEVICE, + IKOT_IOKIT_IDENT, + IKOT_IOKIT_CONNECT, + IKOT_IOKIT_OBJECT, + IKOT_UEXT_OBJECT, + IKOT_EXCLAVES_RESOURCE, /* CONFIG_EXCLAVES only */ + + /* misc. */ + IKOT_ARCADE_REG, /* CONFIG_ARCADE only */ + IKOT_AU_SESSIONPORT, /* CONFIG_AUDIT only */ + IKOT_HYPERVISOR, /* HYPERVISOR only */ + IKOT_KCDATA, + IKOT_UND_REPLY, /* CONFIG_USER_NOTIFICATION only */ + IKOT_UX_HANDLER, + + /* catchall, keep last */ + IOT_UNKNOWN, + IOT_ANY = 0xff, +}); + +#endif /* XNU_KERNEL_PRIVATE */ typedef struct ipc_object *ipc_object_t; diff --git a/osfmk/ipc/ipc_voucher.c b/osfmk/ipc/ipc_voucher.c index 7d1da7698..8cb0231f2 100644 --- a/osfmk/ipc/ipc_voucher.c +++ b/osfmk/ipc/ipc_voucher.c @@ -40,8 +40,6 @@ #include #include -#include - #include #include #include @@ -58,6 +56,7 @@ ZONE_DEFINE_ID(ZONE_ID_IPC_VOUCHERS, "ipc vouchers", struct ipc_voucher, static void ipc_voucher_no_senders(ipc_port_t, mach_port_mscount_t); IPC_KOBJECT_DEFINE(IKOT_VOUCHER, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = ipc_voucher_no_senders); @@ -278,7 +277,8 @@ iv_dealloc(ipc_voucher_t iv, bool unhash) */ if (IP_VALID(port)) { assert(port->ip_srights == 0); - ipc_kobject_dealloc_port(port, 0, IKOT_VOUCHER); + ipc_kobject_dealloc_port(port, IPC_KOBJECT_NO_MSCOUNT, + IKOT_VOUCHER); iv->iv_port = MACH_PORT_NULL; } @@ -353,7 +353,7 @@ ipc_voucher_t convert_port_to_voucher( ipc_port_t port) { - if (IP_VALID(port) && ip_kotype(port) == IKOT_VOUCHER) { + if (IP_VALID(port) && ip_type(port) == IKOT_VOUCHER) { /* * No need to lock because we have a reference on the * port, and if it is a true voucher port, that reference @@ -426,7 +426,7 @@ ipc_voucher_no_senders(ipc_port_t port, __unused mach_port_mscount_t mscount) { ipc_voucher_t voucher = ip_get_voucher(port); - assert(IKOT_VOUCHER == ip_kotype(port)); + assert(ip_type(port) == IKOT_VOUCHER); /* consume the reference donated by convert_voucher_to_port */ ipc_voucher_release(voucher); @@ -450,7 +450,7 @@ convert_voucher_to_port(ipc_voucher_t voucher) * if this is the first send right */ if (!ipc_kobject_make_send_lazy_alloc_port(&voucher->iv_port, - voucher, IKOT_VOUCHER, IPC_KOBJECT_ALLOC_NONE)) { + voucher, IKOT_VOUCHER)) { ipc_voucher_release(voucher); } return voucher->iv_port; @@ -1891,7 +1891,7 @@ mach_voucher_debug_info( kern_return_t kr; ipc_port_t port = MACH_PORT_NULL; - if (space == IS_NULL) { + if (space == NULL) { return KERN_INVALID_TASK; } @@ -2316,9 +2316,6 @@ ipc_voucher_prepare_processing_recipe( */ uint64_t voucher_activity_id; -#define generate_activity_id(x) \ - ((uint64_t)OSAddAtomic64((x), (int64_t *)&voucher_activity_id)) - /* * Routine: mach_init_activity_id * Purpose: @@ -2346,7 +2343,8 @@ mach_generate_activity_id( return KERN_INVALID_ARGUMENT; } - activity_id = generate_activity_id(args->count); + activity_id = 
os_atomic_add_orig(&voucher_activity_id, + args->count, relaxed); kr = copyout(&activity_id, args->activity_id, sizeof(activity_id)); return kr; diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index c6086dc76..156a8ff54 100644 --- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -441,6 +442,72 @@ mach_port_dnrequest_info( return KERN_SUCCESS; } +static ipc_info_object_type_t +mach_port_kobject_type(ipc_port_t port) +{ +#define MAKE_CASE(name) \ + case IKOT_ ## name: return IPC_OTYPE_ ## name + + switch (ip_type(port)) { + /* thread ports */ + MAKE_CASE(THREAD_CONTROL); + MAKE_CASE(THREAD_READ); + MAKE_CASE(THREAD_INSPECT); + + /* task ports */ + MAKE_CASE(TASK_CONTROL); + MAKE_CASE(TASK_READ); + MAKE_CASE(TASK_INSPECT); + MAKE_CASE(TASK_NAME); + + MAKE_CASE(TASK_RESUME); + MAKE_CASE(TASK_ID_TOKEN); + MAKE_CASE(TASK_FATAL); + + /* host services, upcalls, security */ + MAKE_CASE(HOST); + MAKE_CASE(HOST_PRIV); + MAKE_CASE(CLOCK); + MAKE_CASE(PROCESSOR); + MAKE_CASE(PROCESSOR_SET); + MAKE_CASE(PROCESSOR_SET_NAME); + + /* common userspace used ports */ + MAKE_CASE(EVENTLINK); + MAKE_CASE(FILEPORT); + MAKE_CASE(SEMAPHORE); + MAKE_CASE(VOUCHER); + MAKE_CASE(WORK_INTERVAL); + + /* VM ports */ + MAKE_CASE(MEMORY_OBJECT); + MAKE_CASE(NAMED_ENTRY); + + /* IOKit & exclaves ports */ + MAKE_CASE(MAIN_DEVICE); + MAKE_CASE(IOKIT_IDENT); + MAKE_CASE(IOKIT_CONNECT); + MAKE_CASE(IOKIT_OBJECT); + MAKE_CASE(UEXT_OBJECT); + MAKE_CASE(EXCLAVES_RESOURCE); + + /* misc. */ + MAKE_CASE(ARCADE_REG); + MAKE_CASE(AU_SESSIONPORT); + MAKE_CASE(HYPERVISOR); + MAKE_CASE(KCDATA); + MAKE_CASE(UND_REPLY); + MAKE_CASE(UX_HANDLER); + + case IOT_TIMER_PORT: + return IPC_OTYPE_TIMER; + + default: + return IPC_OTYPE_UNKNOWN; + } +#undef MAKE_CASE +} + /* * Routine: mach_port_kobject [kernel call] * Purpose: @@ -464,52 +531,62 @@ static kern_return_t mach_port_kobject_description( ipc_space_t space, mach_port_name_t name, - natural_t *typep, + ipc_info_object_type_t *typep, mach_vm_address_t *addrp, kobject_description_t desc) { ipc_entry_bits_t bits; - ipc_object_t object; + ipc_object_t ipc_object; kern_return_t kr; mach_vm_address_t kaddr = 0; - io_object_t obj = NULL; - io_kobject_t kobj = NULL; - ipc_port_t port = IP_NULL; if (space == IS_NULL) { return KERN_INVALID_TASK; } - kr = ipc_right_lookup_read(space, name, &bits, &object); + kr = ipc_right_lookup_read(space, name, &bits, &ipc_object); if (kr != KERN_SUCCESS) { return kr; } /* object is locked and active */ if ((bits & MACH_PORT_TYPE_SEND_RECEIVE) == 0) { - io_unlock(object); + io_unlock(ipc_object); return KERN_INVALID_RIGHT; } - *typep = (unsigned int)io_kotype(object); - if (io_is_kobject(object)) { - port = ip_object_to_port(object); - kaddr = (mach_vm_address_t)ipc_kobject_get_raw(port, io_kotype(object)); + ipc_port_t port = ip_object_to_port(ipc_object); + *typep = mach_port_kobject_type(port); + if (ip_is_kobject(port)) { + kaddr = (mach_vm_address_t)ipc_kobject_get_raw(port, ip_type(port)); } *addrp = 0; if (desc) { *desc = '\0'; - switch (io_kotype(object)) { + switch (ip_type(port)) { case IKOT_IOKIT_OBJECT: case IKOT_IOKIT_CONNECT: case IKOT_IOKIT_IDENT: case IKOT_UEXT_OBJECT: - kobj = (io_kobject_t) kaddr; - if (kobj) { - iokit_kobject_retain(kobj); + { + io_kobject_t io_kobject = (io_kobject_t)kaddr; + if (io_kobject) { + iokit_kobject_retain(io_kobject); + io_unlock(ipc_object); + + // IKOT_IOKIT_OBJECT since iokit_remove_reference() follows + 
io_object_t io_object = iokit_copy_object_for_consumed_kobject(io_kobject); + io_kobject = NULL; + if (io_object) { + iokit_port_object_description(io_object, desc); + iokit_remove_reference(io_object); + io_object = NULL; + } + goto unlocked; } break; + } case IKOT_TASK_ID_TOKEN: { task_id_token_t token; @@ -517,25 +594,23 @@ mach_port_kobject_description( snprintf(desc, KOBJECT_DESCRIPTION_LENGTH, "%d,%llu,%d", token->ident.p_pid, token->ident.p_uniqueid, token->ident.p_idversion); break; } + case IKOT_NAMED_ENTRY: + { + vm_named_entry_t named_entry = (vm_named_entry_t)ipc_kobject_get_stable(port, IKOT_NAMED_ENTRY); + mach_memory_entry_describe(named_entry, desc); + break; + } default: break; } } - io_unlock(object); + io_unlock(ipc_object); +unlocked: #if (DEVELOPMENT || DEBUG) *addrp = VM_KERNEL_ADDRHASH(kaddr); #endif - if (kobj) { - // IKOT_IOKIT_OBJECT since iokit_remove_reference() follows - obj = iokit_copy_object_for_consumed_kobject(kobj, IKOT_IOKIT_OBJECT); - } - if (obj) { - iokit_port_object_description(obj, desc); - iokit_remove_reference(obj); - } - return KERN_SUCCESS; } @@ -543,7 +618,7 @@ kern_return_t mach_port_kobject_description_from_user( mach_port_t port, mach_port_name_t name, - natural_t *typep, + ipc_info_object_type_t *typep, mach_vm_address_t *addrp, kobject_description_t desc) { @@ -565,7 +640,7 @@ kern_return_t mach_port_kobject_from_user( mach_port_t port, mach_port_name_t name, - natural_t *typep, + ipc_info_object_type_t *typep, mach_vm_address_t *addrp) { return mach_port_kobject_description_from_user(port, name, typep, addrp, NULL); diff --git a/osfmk/ipc/mach_kernelrpc.c b/osfmk/ipc/mach_kernelrpc.c index b7c9ebc43..5fbc0516c 100644 --- a/osfmk/ipc/mach_kernelrpc.c +++ b/osfmk/ipc/mach_kernelrpc.c @@ -26,6 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include #include #include @@ -37,8 +38,11 @@ #include #include #include +#include #include #include +#include +#include #include kern_return_t @@ -246,7 +250,7 @@ _kernelrpc_mach_port_insert_right_trap(struct _kernelrpc_mach_port_insert_right_ } rv = ipc_object_copyin(task->itk_space, args->poly, args->polyPoly, - IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, NULL, &port); + IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, IPC_COPYIN_KERNEL_DESTINATION, NULL, &port); if (rv != KERN_SUCCESS) { goto done; } @@ -439,7 +443,7 @@ _kernelrpc_mach_port_request_notification_trap( if (MACH_PORT_VALID(args->notify)) { rv = ipc_object_copyin(task->itk_space, args->notify, args->notifyPoly, - IPC_OBJECT_COPYIN_FLAGS_NONE, NULL, ¬ify); + IPC_OBJECT_COPYIN_FLAGS_NONE, IPC_COPYIN_KERNEL_DESTINATION, NULL, ¬ify); } else { notify = CAST_MACH_NAME_TO_PORT(args->notify); } @@ -771,3 +775,30 @@ copyout_failed: return kr; } + +#if __LP64__ +mach_error_t +mach_vm_reclaim_update_kernel_accounting_trap( + struct mach_vm_reclaim_update_kernel_accounting_trap_args *args) +{ + task_t task = port_name_to_current_task_noref(args->target_task); + if (task == TASK_NULL) { + return MACH_SEND_INVALID_DEST; + } + if (args->bytes_reclaimed_out == USER_ADDR_NULL) { + return KERN_INVALID_ARGUMENT; + } +#if CONFIG_DEFERRED_RECLAIM + mach_error_t err; + uint64_t bytes_reclaimed; + err = vm_deferred_reclamation_update_accounting_internal( + task, &bytes_reclaimed); + if (!err) { + mach_copyout(&bytes_reclaimed, (user_addr_t)args->bytes_reclaimed_out, sizeof(bytes_reclaimed)); + } + return err; +#else /* !CONFIG_DEFERRED_RECLAIM */ + return KERN_NOT_SUPPORTED; +#endif /* CONFIG_DEFERRED_RECLAIM */ +} +#endif /* __LP64__ */ diff 
--git a/osfmk/ipc/mach_msg.c b/osfmk/ipc/mach_msg.c index 3829475db..afaaf38f2 100644 --- a/osfmk/ipc/mach_msg.c +++ b/osfmk/ipc/mach_msg.c @@ -391,13 +391,6 @@ mach_msg_receive_continue(void) ipc_port_thread_group_unblocked(); -#if MACH_FLIPC - if (current_thread()->ith_state == MACH_PEEK_READY) { - thread_syscall_return(MACH_PEEK_READY); - __builtin_unreachable(); - } -#endif /* MACH_FLIPC */ - mr = mach_msg_receive_results(NULL); thread_syscall_return(mr); } @@ -834,7 +827,7 @@ mach_msg_overwrite_trap( mach_msg_option64_t options = args->option; mach_msg_return_t mr = MACH_MSG_SUCCESS; - options = ipc_current_user_policy(current_task(), options); + options = ipc_current_msg_options(current_task(), options); KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_START); @@ -948,7 +941,7 @@ mach_msg2_trap( mach_msg_option64_t option64; mach_msg_return_t mr = MACH_MSG_SUCCESS; - option64 = ipc_current_user_policy(current_task(), + option64 = ipc_current_msg_options(current_task(), args->options) | MACH64_MACH_MSG2; KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_KMSG_INFO) | DBG_FUNC_START); @@ -1121,9 +1114,7 @@ mach_msg_receive_results_complete(ipc_object_t object) ipc_port_t port = IP_NULL; boolean_t get_turnstile = (self->turnstile == TURNSTILE_NULL); - if (io_otype(object) == IOT_PORT) { - port = ip_object_to_port(object); - } else { + if (io_is_pset(object)) { assert(self->turnstile != TURNSTILE_NULL); return; } @@ -1136,14 +1127,12 @@ mach_msg_receive_results_complete(ipc_object_t object) if (!((self->ith_state == MACH_RCV_TOO_LARGE && self->ith_option & MACH_RCV_LARGE) || //msg was too large and the next receive will get it self->ith_state == MACH_RCV_INTERRUPTED || self->ith_state == MACH_RCV_TIMED_OUT || -#if MACH_FLIPC - self->ith_state == MACH_PEEK_READY || -#endif /* MACH_FLIPC */ self->ith_state == MACH_RCV_PORT_CHANGED)) { flags |= IPC_PORT_ADJUST_SR_RECEIVED_MSG; } - if (port->ip_specialreply || get_turnstile) { + port = ip_object_to_port(object); + if (ip_is_special_reply_port(port) || get_turnstile) { ip_mq_lock(port); ipc_port_adjust_special_reply_port_locked(port, NULL, flags, get_turnstile); diff --git a/osfmk/ipc/mach_port.c b/osfmk/ipc/mach_port.c index 2583f8191..60229751e 100644 --- a/osfmk/ipc/mach_port.c +++ b/osfmk/ipc/mach_port.c @@ -88,7 +88,7 @@ #include #include #include -#include +#include #include #include #include @@ -101,16 +101,17 @@ #include #include #include +#include #if IMPORTANCE_INHERITANCE #include #endif -static TUNABLE(bool, provisional_reply_port_enforced, "-provisional_reply_port_enforced", false); +#if CONFIG_CSR +#include +#endif extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *)); -extern int proc_isinitproc(struct proc *p); -extern boolean_t proc_is_simulated(const struct proc *p); static int mach_port_name_cmp(const void *_n1, const void *_n2) { @@ -186,10 +187,6 @@ mach_port_names_helper( } else { bits &= ~(IE_BITS_TYPE_MASK); bits |= MACH_PORT_TYPE_DEAD_NAME; - /* account for additional reference for dead-name notification */ - if (reqtype != 0) { - bits++; - } } ip_mq_unlock(port); } @@ -631,14 +628,15 @@ mach_port_allocate_full( switch (right) { case MACH_PORT_RIGHT_RECEIVE: { - ipc_port_t port; + ipc_object_label_t label = IPC_OBJECT_LABEL(IOT_PORT); + ipc_port_t port; if (qosp->name) { - kr = ipc_port_alloc_name(space, IPC_PORT_INIT_MESSAGE_QUEUE, - *namep, &port); + kr = ipc_port_alloc_name(space, label, + IP_INIT_NONE, *namep, &port); } else { - kr = ipc_port_alloc(space, 
IPC_PORT_INIT_MESSAGE_QUEUE, - namep, &port); + kr = ipc_port_alloc(space, label, + IP_INIT_NONE, namep, &port); } if (kr == KERN_SUCCESS) { ip_mq_unlock(port); @@ -713,8 +711,7 @@ mach_port_destroy( } /* space is write-locked and active */ - kr = ipc_right_destroy(space, name, entry, TRUE, 0); /* unlocks space */ - return kr; + return ipc_right_destroy(space, name, entry); /* unlocks space */ } /* @@ -739,7 +736,7 @@ kern_return_t mach_port_deallocate_kernel( ipc_space_t space, mach_port_name_t name, - ipc_kobject_type_t kotype) + ipc_object_type_t otype) { ipc_entry_t entry; kern_return_t kr; @@ -759,11 +756,14 @@ mach_port_deallocate_kernel( } /* space is write-locked */ - if (kotype != IKOT_UNKNOWN && + if (otype != IOT_ANY && entry->ie_object && - io_kotype(entry->ie_object) != kotype) { + io_type(entry->ie_object) != otype) { is_write_unlock(space); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_RIGHT_DEALLOC_KERNEL, + otype, io_type(entry->ie_object)), + kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; } @@ -776,7 +776,7 @@ mach_port_deallocate( ipc_space_t space, mach_port_name_t name) { - return mach_port_deallocate_kernel(space, name, IKOT_UNKNOWN); + return mach_port_deallocate_kernel(space, name, IOT_ANY); } /* @@ -992,7 +992,9 @@ mach_port_peek( */ if (GET_RCV_ELEMENTS(trailer_type) > MACH_RCV_TRAILER_AUDIT || REQUESTED_TRAILER_SIZE(TRUE, trailer_type) > *trailer_sizep) { - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_VALUE); + mach_port_guard_exception(name, + MPG_PAYLOAD(MPG_FLAGS_INVALID_VALUE_PEEK, trailer_type, *trailer_sizep), + kGUARD_EXC_INVALID_VALUE); return KERN_INVALID_VALUE; } @@ -1000,10 +1002,9 @@ mach_port_peek( kr = ipc_port_translate_receive(space, name, &port); if (kr != KERN_SUCCESS) { - mach_port_guard_exception(name, 0, - ((KERN_INVALID_NAME == kr) ? - kGUARD_EXC_INVALID_NAME : - kGUARD_EXC_INVALID_RIGHT)); + uint64_t payload = (KERN_INVALID_NAME == kr) ? 0 : MPG_FLAGS_INVALID_RIGHT_RECV; + unsigned reason = (KERN_INVALID_NAME == kr) ? kGUARD_EXC_INVALID_NAME : kGUARD_EXC_INVALID_RIGHT; + mach_port_guard_exception(name, payload, reason); return kr; } @@ -1461,48 +1462,6 @@ done: return kr; } -/* - * Routine: mach_service_pd_request_notification_check - * Purpose: - * Check if requesting port destroyed notification on a service port is allowed. - * Conditions: - * Assumes service_port is locked and active. - */ -static bool -mach_service_pd_request_notification_check( - ipc_port_t service_port, - ipc_port_t notify_port - ) -{ -#ifdef MACH_BSD - - uintptr_t task; - - /* Only launchd should be able to register for port destroyed notification on a service port. */ - (void)ipc_port_get_receiver_task_locked(service_port, &task); - if (task && !proc_isinitproc(get_bsdtask_info((task_t)task))) { - return false; - } - - /* Notify port should indicate immovable receive right owned by launchd. 
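The helper being removed above gated port-destroyed notifications on service ports: only launchd may register one, and the notify port must be an immovable receive right that launchd itself holds (the replacement check appears as ipc_allow_register_pd_notification further down). A standalone model of that gate, with hypothetical toy_* stand-ins and a pid comparison in place of the kernel's task lookups:

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-ins for the ports involved. */
struct toy_port {
	int  receiver_pid;          /* pid of the task holding the receive right */
	bool immovable_receive;
};

#define TOY_LAUNCHD_PID 1

/*
 * May `notify` be registered for a port-destroyed notification on a
 * service port?  Both receive rights must belong to launchd, and the
 * notify port's receive right must be immovable.
 */
static bool
toy_allow_service_pd_notification(const struct toy_port *service,
    const struct toy_port *notify)
{
	if (service->receiver_pid != TOY_LAUNCHD_PID) {
		return false;
	}
	if (notify != NULL) {
		if (notify->receiver_pid != TOY_LAUNCHD_PID ||
		    !notify->immovable_receive) {
			return false;
		}
	}
	return true;
}

int
main(void)
{
	struct toy_port service = { .receiver_pid = TOY_LAUNCHD_PID,
	                            .immovable_receive = true };
	struct toy_port notify  = { .receiver_pid = TOY_LAUNCHD_PID,
	                            .immovable_receive = true };

	return toy_allow_service_pd_notification(&service, &notify) ? 0 : 1;
}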
*/ - if (IP_VALID(notify_port)) { - ip_mq_lock(notify_port); - (void)ipc_port_get_receiver_task_locked(notify_port, &task); - if (task && !proc_isinitproc(get_bsdtask_info((task_t)task))) { - ip_mq_unlock(notify_port); - return false; - } - if (!notify_port->ip_immovable_receive) { - ip_mq_unlock(notify_port); - return false; - } - ip_mq_unlock(notify_port); - } -#endif - - return true; -} - /* * Routine: mach_port_request_notification [kernel call] * Purpose: @@ -1570,6 +1529,10 @@ mach_port_request_notification( return KERN_INVALID_CAPABILITY; } + if (!MACH_PORT_VALID(name)) { + return KERN_INVALID_ARGUMENT; + } + switch (id) { case MACH_NOTIFY_PORT_DESTROYED: { ipc_port_t port; @@ -1578,39 +1541,16 @@ mach_port_request_notification( return KERN_INVALID_VALUE; } - if (!MACH_PORT_VALID(name)) { - return KERN_INVALID_RIGHT; - } - kr = ipc_port_translate_receive(space, name, &port); if (kr != KERN_SUCCESS) { return kr; } /* port is locked and active */ - /* - * you cannot register for port death notifications on a kobject, - * kolabel or special reply port. - */ - if (ip_is_kobject(port) || ip_is_kolabeled(port) || - port->ip_specialreply || ip_is_reply_port(port)) { + kr = ipc_allow_register_pd_notification(port, notify); + if (kr != KERN_SUCCESS) { ip_mq_unlock(port); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); - return KERN_INVALID_RIGHT; - } - - if (service_port_defense_enabled && port->ip_service_port && - !mach_service_pd_request_notification_check(port, notify)) { - ip_mq_unlock(port); - mach_port_guard_exception(name, 0, kGUARD_EXC_KERN_FAILURE); - return KERN_FAILURE; - } - - /* Allow only one registeration of this notification */ - if (ipc_port_has_prdrequest(port)) { - ip_mq_unlock(port); - mach_port_guard_exception(name, 0, kGUARD_EXC_KERN_FAILURE); - return KERN_FAILURE; + return kr; } if (port->ip_has_watchport) { @@ -1624,26 +1564,41 @@ mach_port_request_notification( } case MACH_NOTIFY_NO_SENDERS: { + ipc_object_label_t label; + mach_port_mscount_t mscount; ipc_port_t port; - if (!MACH_PORT_VALID(name)) { - return KERN_INVALID_RIGHT; - } - kr = ipc_port_translate_receive(space, name, &port); if (kr != KERN_SUCCESS) { return kr; } /* port is locked and active */ - if (ip_is_reply_port(port)) { - ip_mq_unlock(port); - mach_port_guard_exception(name, 0, kGUARD_EXC_INVALID_RIGHT); + label = ip_label_get(port); + + if (!ipc_policy(label)->pol_notif_no_senders) { + mach_port_guard_exception(label.io_type, id, + kGUARD_EXC_INVALID_NOTIFICATION_REQ); + ip_mq_unlock_label_put(port, &label); return KERN_INVALID_RIGHT; } - ipc_port_nsrequest(port, sync, notify, previousp); - /* port is unlocked */ + *previousp = port->ip_nsrequest; + mscount = port->ip_mscount; + + if (port->ip_srights == 0 && sync <= mscount && IP_VALID(notify)) { + port->ip_nsrequest = IP_NULL; + } else { + port->ip_nsrequest = notify; + notify = IP_NULL; + } + + ip_mq_unlock_label_put(port, &label); + + if (notify) { + ipc_notify_no_senders_mqueue(notify, mscount); + } + break; } @@ -1651,10 +1606,6 @@ mach_port_request_notification( case MACH_NOTIFY_DEAD_NAME: { ipc_port_request_opts_t opts = 0; - if (!MACH_PORT_VALID(name)) { - return KERN_INVALID_ARGUMENT; - } - if (id == MACH_NOTIFY_SEND_POSSIBLE) { opts |= IPR_SOR_SPREQ_MASK; if (sync) { @@ -1662,7 +1613,7 @@ mach_port_request_notification( } } - kr = ipc_right_request_alloc(space, name, opts, notify, previousp); + kr = ipc_right_request_alloc(space, name, opts, notify, id, previousp); if (kr != KERN_SUCCESS) { return kr; } @@ -1766,7 
+1717,7 @@ mach_port_extract_right( kr = ipc_object_copyin(space, name, msgt_name, (space == current_space() && msgt_name == MACH_MSG_TYPE_COPY_SEND) ? IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND : IPC_OBJECT_COPYIN_FLAGS_NONE, - NULL, poly); + IPC_COPYIN_KERNEL_DESTINATION, NULL, poly); if (kr == KERN_SUCCESS) { *polyPoly = ipc_object_copyin_type(msgt_name); @@ -1799,7 +1750,9 @@ mach_port_get_status_helper( statusp->mps_sorights = port->ip_sorights; statusp->mps_srights = port->ip_srights > 0; statusp->mps_pdrequest = ipc_port_has_prdrequest(port); - statusp->mps_nsrequest = port->ip_nsrequest != IP_NULL; + if (!ip_is_kobject(port)) { + statusp->mps_nsrequest = port->ip_nsrequest != IP_NULL; + } statusp->mps_flags = 0; if (port->ip_impdonation) { statusp->mps_flags |= MACH_PORT_STATUS_FLAG_IMP_DONATION; @@ -1815,14 +1768,10 @@ mach_port_get_status_helper( if (port->ip_strict_guard) { statusp->mps_flags |= MACH_PORT_STATUS_FLAG_STRICT_GUARD; } - if (port->ip_immovable_receive) { + if (ip_is_immovable_receive(port)) { statusp->mps_flags |= MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE; } } - if (port->ip_no_grant) { - statusp->mps_flags |= MACH_PORT_STATUS_FLAG_NO_GRANT; - } - return; } kern_return_t @@ -1940,6 +1889,7 @@ mach_port_get_attributes( case MACH_PORT_SERVICE_THROTTLED: { boolean_t *is_throttled = info; + ipc_object_label_t label; if (!MACH_PORT_VALID(name)) { return KERN_INVALID_RIGHT; @@ -1951,16 +1901,16 @@ mach_port_get_attributes( } /* port is locked and active */ - if (!port->ip_service_port) { - ip_mq_unlock(port); - return KERN_INVALID_CAPABILITY; + label = ip_label_get(port); + if (ip_is_any_service_port_type(label.io_type)) { + *is_throttled = label.iol_service->ispl_throttled; + *count = MACH_PORT_SERVICE_THROTTLED_COUNT; + } else { + kr = KERN_INVALID_CAPABILITY; } - assert(port->ip_splabel != NULL); - *is_throttled = ipc_service_port_label_is_throttled((ipc_service_port_label_t)port->ip_splabel); - *count = MACH_PORT_SERVICE_THROTTLED_COUNT; - ip_mq_unlock(port); - break; + ip_mq_unlock_label_put(port, &label); + return kr; } default: @@ -2070,7 +2020,7 @@ mach_port_set_attributes( * associated it with a kobject already (timer, host_notify target), * or is a special reply port. */ - if (ip_is_kobject(port) || port->ip_specialreply) { + if (ip_is_kobject(port) || ip_is_special_reply_port(port)) { ip_mq_unlock(port); return KERN_INVALID_ARGUMENT; } @@ -2129,7 +2079,7 @@ mach_port_set_attributes( * it with a kobject already (timer, host_notify target), * or is a special reply port. 
*/ - if (ip_is_kobject(port) || port->ip_specialreply) { + if (ip_is_kobject(port) || ip_is_special_reply_port(port)) { ip_mq_unlock(port); return KERN_INVALID_ARGUMENT; } @@ -2143,7 +2093,7 @@ mach_port_set_attributes( } case MACH_PORT_SERVICE_THROTTLED: { - boolean_t is_throttled = *info; + ipc_object_label_t label; if (!MACH_PORT_VALID(name)) { return KERN_INVALID_RIGHT; @@ -2155,19 +2105,15 @@ mach_port_set_attributes( } /* port is locked and active */ - if (!port->ip_service_port) { - ip_mq_unlock(port); - return KERN_INVALID_CAPABILITY; + label = ip_label_get(port); + if (ip_is_any_service_port_type(label.io_type)) { + label.iol_service->ispl_throttled = (*info != 0); + } else { + kr = KERN_INVALID_CAPABILITY; } - assert(port->ip_splabel != NULL); - if (is_throttled) { - ipc_service_port_label_set_flag(port->ip_splabel, ISPL_FLAGS_THROTTLED); - } else { - ipc_service_port_label_clear_flag(port->ip_splabel, ISPL_FLAGS_THROTTLED); - } - ip_mq_unlock(port); - break; + ip_mq_unlock_label_put(port, &label); + return kr; } default: @@ -2334,15 +2280,16 @@ mach_port_guard_locked( return KERN_INVALID_ARGUMENT; } - int strict = (flags & MPG_STRICT)? 1 : 0; - int immovable_receive = (flags & MPG_IMMOVABLE_RECEIVE)? 1 : 0; - port->ip_context = guard; port->ip_guarded = 1; - port->ip_strict_guard = strict; - /* ip_immovable_receive bit is sticky and can't be un-guarded */ - if (!port->ip_immovable_receive) { - port->ip_immovable_receive = immovable_receive; + port->ip_strict_guard = (flags & MPG_STRICT) != 0; + + if ((flags & MPG_IMMOVABLE_RECEIVE) && !ip_is_immovable_receive(port)) { + ipc_object_label_t label = ip_label_get(port); + + ipc_release_assert(label.io_state == IO_STATE_IN_SPACE); + label.io_state = IO_STATE_IN_SPACE_IMMOVABLE; + io_label_set_and_put(&port->ip_object, &label); } return KERN_SUCCESS; @@ -2380,12 +2327,70 @@ mach_port_unguard_locked( port->ip_context = 0; port->ip_guarded = port->ip_strict_guard = 0; - /* Don't clear the ip_immovable_receive bit */ return KERN_SUCCESS; } +static kern_return_t +mach_port_construct_check_service_port( + mach_port_options_t *options, + struct mach_service_port_info *sp_info) +{ + user_addr_t service_port_info = 0; + size_t sp_name_length = 0; + + /* + * Allow only launchd to add the service port labels + * Not enforcing on development/debug kernels to + * support testing + */ +#if !(DEVELOPMENT || DEBUG) +#if CONFIG_COALITIONS + if (!task_is_in_privileged_coalition(current_task(), COALITION_TYPE_JETSAM)) { + return KERN_DENIED; + } +#else /* CONFIG_COALITIONS */ + if (task_is_initproc(current_task())) { + return KERN_DENIED; + } +#endif /* CONFIG_COALITIONS */ +#endif /* !(DEVELOPMENT || DEBUG) */ + + if (task_has_64Bit_addr(current_task())) { + service_port_info = CAST_USER_ADDR_T(options->service_port_info64); + } else { + service_port_info = CAST_USER_ADDR_T(options->service_port_info32); + } + + if (!service_port_info) { + return KERN_INVALID_ARGUMENT; + } + + if (copyin(service_port_info, (void *)sp_info, sizeof(*sp_info))) { + return KERN_MEMORY_ERROR; + } + + sp_name_length = strnlen(sp_info->mspi_string_name, MACH_SERVICE_PORT_INFO_STRING_NAME_MAX_BUF_LEN); + if (sp_name_length >= (MACH_SERVICE_PORT_INFO_STRING_NAME_MAX_BUF_LEN)) { + return KERN_INVALID_ARGUMENT; + } + + /* + * Setting the guard on a service port triggers a special port + * destroyed notification that restores the guard when the + * receive right moves back to launchd. + * + * This must be a strict guard. 
+ */ + if ((options->flags & MPO_CONTEXT_AS_GUARD) != 0 && + (options->flags & MPO_STRICT) == 0) { + return KERN_INVALID_ARGUMENT; + } + + return KERN_SUCCESS; +} + /* * Routine: mach_port_construct [kernel call] * Purpose: @@ -2393,14 +2398,15 @@ mach_port_unguard_locked( * Conditions: * None. * Returns: - * KERN_SUCCESS The right is allocated. - * KERN_INVALID_TASK The space is null. - * KERN_INVALID_TASK The space is dead. - * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. - * KERN_INVALID_VALUE Invalid value passed in options - * KERN_INVALID_ARGUMENT Invalid arguments passed in options - * KERN_NO_SPACE No room in space for another right. - * KERN_FAILURE Illegal option values requested. + * KERN_SUCCESS The right is allocated. + * KERN_INVALID_TASK The space is null. + * KERN_INVALID_TASK The space is dead. + * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. + * KERN_INVALID_VALUE Invalid value passed in options + * KERN_INVALID_ARGUMENT Invalid arguments passed in options + * KERN_NO_SPACE No room in space for another right. + * KERN_DENIED Missing an entitlement for the request. + * KERN_FAILURE Illegal option values requested. */ kern_return_t @@ -2410,185 +2416,204 @@ mach_port_construct( uint64_t context, mach_port_name_t *name) { - kern_return_t kr; ipc_port_t port; - ipc_port_init_flags_t init_flags = IPC_PORT_INIT_MESSAGE_QUEUE; - void *port_splabel = NULL; - bool filter_msgs = FALSE; + kern_return_t kr = KERN_SUCCESS; + ipc_port_init_flags_t init_flags = IP_INIT_NONE; + /* new port labels start in IO_STATE_IN_SPACE */ + ipc_object_label_t label = IPC_OBJECT_LABEL(IOT_PORT); + ipc_space_policy_t policy = ipc_space_policy(space); struct mach_service_port_info sp_info = {}; - size_t sp_name_length = 0; - user_addr_t service_port_info = 0; - - uint32_t at_most_one_flags = options->flags & (MPO_SERVICE_PORT | MPO_CONNECTION_PORT | MPO_TG_BLOCK_TRACKING); - if (at_most_one_flags & (at_most_one_flags - 1)) { - /* at most one of the listed flags can be set */ - return KERN_INVALID_ARGUMENT; - } - - at_most_one_flags = options->flags & (MPO_REPLY_PORT | MPO_ENFORCE_REPLY_PORT_SEMANTICS | - MPO_EXCEPTION_PORT | MPO_PROVISIONAL_REPLY_PORT); - if (at_most_one_flags & (at_most_one_flags - 1)) { - /* at most one of the listed flags can be set */ - return KERN_INVALID_ARGUMENT; - } - -#if !XNU_TARGET_OS_OSX && !XNU_TARGET_OS_BRIDGE && !XNU_TARGET_OS_XR - task_t task = current_task(); - if ((options->flags & MPO_PROVISIONAL_REPLY_PORT) && - task_is_hardened_binary(task) && - !proc_is_simulated(current_proc())) { - task_lock(task); - if (!task_has_provisional_reply_port_telemetry(task)) { - /* rdar://136996362 (iOS+ telemetry for restricting 1P usage of provisional reply port) */ - mach_port_guard_exception(0, 0, kGUARD_EXC_PROVISIONAL_REPLY_PORT); - task_set_provisional_reply_port_telemetry(task); - } - task_unlock(task); - } -#endif /* !XNU_TARGET_OS_OSX && !XNU_TARGET_OS_BRIDGE && !XNU_TARGET_OS_XR */ if (space == IS_NULL) { return KERN_INVALID_TASK; } - if (options->flags & MPO_INSERT_SEND_RIGHT) { - init_flags |= IPC_PORT_INIT_MAKE_SEND_RIGHT; + if (options->flags & MPO_UNUSED_BITS) { + return KERN_INVALID_ARGUMENT; } - if (options->flags & MPO_FILTER_MSG) { - init_flags |= IPC_PORT_INIT_FILTER_MESSAGE; + /* exactly one port type must be set */ + mpo_flags_t port_type_flag = options->flags & MPO_PORT_TYPE_MASK; + switch (port_type_flag) { + case MPO_PORT: + case MPO_SERVICE_PORT: + case MPO_CONNECTION_PORT: + case MPO_REPLY_PORT: + case MPO_PROVISIONAL_REPLY_PORT: + case 
MPO_EXCEPTION_PORT: + case MPO_CONNECTION_PORT_WITH_PORT_ARRAY: + break; + + default: + return KERN_INVALID_ARGUMENT; } - if (options->flags & MPO_REPLY_PORT) { - init_flags |= IPC_PORT_INIT_REPLY; - } - - if (options->flags & MPO_ENFORCE_REPLY_PORT_SEMANTICS) { - init_flags |= IPC_PORT_ENFORCE_REPLY_PORT_SEMANTICS; - } - - if (options->flags & MPO_EXCEPTION_PORT) { - init_flags |= IPC_PORT_INIT_EXCEPTION_PORT; - } - - if (options->flags & MPO_PROVISIONAL_REPLY_PORT) { - if (provisional_reply_port_enforced) { - init_flags |= IPC_PORT_INIT_REPLY; + /* + * Step 1. Determine port type + */ + switch (port_type_flag) { + case MPO_PORT: + label.io_type = IOT_PORT; + break; + case MPO_SERVICE_PORT: + kr = mach_port_construct_check_service_port(options, &sp_info); + if (kr != KERN_SUCCESS) { + return kr; + } + if ((options->flags & MPO_ENFORCE_REPLY_PORT_SEMANTICS) && + !(policy & IPC_SPACE_POLICY_SIMULATED)) { + label.io_type = IOT_SERVICE_PORT; } else { - init_flags |= IPC_PORT_INIT_PROVISIONAL_REPLY; + label.io_type = IOT_WEAK_SERVICE_PORT; + } + break; + case MPO_CONNECTION_PORT: + if (!options->service_port_name) { + return KERN_INVALID_ARGUMENT; + } + label.io_type = IOT_CONNECTION_PORT; + break; + case MPO_EXCEPTION_PORT: + label.io_type = IOT_EXCEPTION_PORT; + break; + case MPO_REPLY_PORT: + label.io_type = IOT_REPLY_PORT; + if (!ipc_should_apply_policy(policy, IPC_POLICY_ENHANCED_V1)) { + /* + * non-hardened tasks won't adopt reply port semantics, + * opt them out with provisional reply ports + */ + label.io_type = IOT_PROVISIONAL_REPLY_PORT; + } + break; + case MPO_PROVISIONAL_REPLY_PORT: + label.io_type = IOT_PROVISIONAL_REPLY_PORT; + break; + case MPO_CONNECTION_PORT_WITH_PORT_ARRAY: + label.io_type = IOT_CONNECTION_PORT_WITH_PORT_ARRAY; + break; + } + + + /* + * If the port type policy requires an entitlement, + * enforce it here before proceeding any further. + */ + const char *port_policy_entitlement = ipc_policy(label.io_type)->pol_construct_entitlement; + if (port_policy_entitlement && + ipc_should_apply_policy(policy, IPC_POLICY_ENHANCED_V1) && + !IOCurrentTaskHasEntitlement(port_policy_entitlement)) { + /* + * enforce the policy construct entitlement on all + * port types, besides provisional reply port (yet). + */ + if (!(options->flags & MPO_PROVISIONAL_REPLY_PORT)) { + mach_port_guard_exception(options->flags, 0, + kGUARD_EXC_INVALID_MPO_ENTITLEMENT); + return KERN_DENIED; + } + + /* emit telemetry if needed */ + if (ipcpv_telemetry_enabled && +#if XNU_TARGET_OS_OSX && CONFIG_CSR + (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) && /* SIP enabled */ +#endif /* XNU_TARGET_OS_OSX && CONFIG_CSR */ + !ipc_space_has_telemetry_type(space, IS_HAS_CREATE_PRP_TELEMETRY)) { + mach_port_guard_exception(0, 0, kGUARD_EXC_PROVISIONAL_REPLY_PORT); + } + + /* If we have enforcement */ + if (prp_enforcement_enabled) { + mach_port_guard_exception(options->flags, 0, + kGUARD_EXC_INVALID_MPO_ENTITLEMENT); + return KERN_DENIED; + } + } + + /* + * Step 2. 
Handle and verify flags + */ + if (options->flags & MPO_IMMOVABLE_RECEIVE) { + label.io_state = IO_STATE_IN_SPACE_IMMOVABLE; + } + + if (options->flags & MPO_INSERT_SEND_RIGHT) { + init_flags |= IP_INIT_MAKE_SEND_RIGHT; + } + + if (options->flags & MPO_QLIMIT) { + if (options->mpl.mpl_qlimit > MACH_PORT_QLIMIT_MAX) { + return KERN_INVALID_VALUE; } } if (options->flags & MPO_TG_BLOCK_TRACKING) { - /* Check the task role to allow only TASK_GRAPHICS_SERVER to set this option */ + /* + * Check the task role to allow only TASK_GRAPHICS_SERVER + * to set this option + */ if (proc_get_effective_task_policy(current_task(), TASK_POLICY_ROLE) != TASK_GRAPHICS_SERVER) { return KERN_DENIED; } /* - * Check the work interval port passed in to make sure it is the render server type. - * Since the creation of the render server work interval is privileged, this check - * acts as a guard to make sure only the render server is setting the thread group - * blocking behavior on the port. + * Check the work interval port passed in to make sure it is + * the render server type. + * + * Since the creation of the render server work interval is + * privileged, this check acts as a guard to make sure only + * the render server is setting the thread group blocking + * behavior on the port. */ mach_port_name_t wi_port_name = options->work_interval_port; if (work_interval_port_type_render_server(wi_port_name) == false) { return KERN_INVALID_ARGUMENT; } - init_flags |= IPC_PORT_INIT_TG_BLOCK_TRACKING; } - if (options->flags & MPO_SERVICE_PORT) { -#if !(DEVELOPMENT || DEBUG) -#if CONFIG_COALITIONS - /* - * Allow only launchd to add the service port labels - * Not enforcing on development/debug kernels to - * support testing - */ - if (!task_is_in_privileged_coalition(current_task(), COALITION_TYPE_JETSAM)) { - return KERN_DENIED; - } -#else /* CONFIG_COALITIONS */ - /* - * This flag is not used by launchd on simulators - */ - if (proc_isinitproc(get_bsdtask_info(current_task()))) { - return KERN_DENIED; - } -#endif /* CONFIG_COALITIONS */ -#endif /* !(DEVELOPMENT || DEBUG) */ + /* + * Step 3. Allocate labels and ports. + * + * Code past this point has side effects, + * and early returns for errors is fraught with peril. 
+ */ - if (task_has_64Bit_addr(current_task())) { - service_port_info = CAST_USER_ADDR_T(options->service_port_info64); - } else { - service_port_info = CAST_USER_ADDR_T(options->service_port_info32); - } - - if (!service_port_info) { - return KERN_INVALID_ARGUMENT; - } - - if (copyin(service_port_info, (void *)&sp_info, sizeof(sp_info))) { - return KERN_MEMORY_ERROR; - } - - sp_name_length = strnlen(sp_info.mspi_string_name, MACH_SERVICE_PORT_INFO_STRING_NAME_MAX_BUF_LEN); - if (sp_name_length >= (MACH_SERVICE_PORT_INFO_STRING_NAME_MAX_BUF_LEN)) { - return KERN_INVALID_ARGUMENT; - } - - kr = ipc_service_port_label_alloc(&sp_info, &port_splabel); - if (kr != KERN_SUCCESS) { - return kr; - } - /* Always filter messages on service ports */ - init_flags |= IPC_PORT_INIT_FILTER_MESSAGE; + if (ip_is_any_service_port_type(label.io_type)) { + kr = ipc_service_port_label_alloc(&sp_info, &label); + } else if (label.io_type == IOT_CONNECTION_PORT && + options->service_port_name != MPO_ANONYMOUS_SERVICE) { + kr = ipc_service_port_derive_sblabel(options->service_port_name, + (options->flags & MPO_FILTER_MSG), &label); } - - if (options->flags & MPO_CONNECTION_PORT) { - if (!options->service_port_name) { - return KERN_INVALID_ARGUMENT; - } - - kr = ipc_service_port_derive_sblabel(options->service_port_name, &port_splabel, &filter_msgs); - if (kr != KERN_SUCCESS) { - return kr; - } - if (filter_msgs) { - init_flags |= IPC_PORT_INIT_FILTER_MESSAGE; - } - } - - - if (options->flags & MPO_QLIMIT) { - const mach_msg_type_number_t count = sizeof(options->mpl) / sizeof(int); - static_assert(count >= MACH_PORT_LIMITS_INFO_COUNT); - - if (options->mpl.mpl_qlimit > MACH_PORT_QLIMIT_MAX) { - return KERN_INVALID_VALUE; - } + if (kr != KERN_SUCCESS) { + return kr; } /* Allocate a new port in the IPC space */ - kr = ipc_port_alloc(space, init_flags, name, &port); + kr = ipc_port_alloc(space, label, init_flags, name, &port); if (kr != KERN_SUCCESS) { - if (port_splabel != NULL) { - ipc_service_port_label_dealloc(port_splabel, - (options->flags & MPO_SERVICE_PORT)); - } return kr; } /* Port locked and active */ - /* Mutate the new port based on flags - see above for error checks */ + /* + * Step 4. Apply configuration to our newly minted port. + * + * This is the point of no return, failure isn't allowed + * past this point. 
+ */ + if (options->flags & MPO_QLIMIT) { ipc_mqueue_set_qlimit_locked(&port->ip_messages, options->mpl.mpl_qlimit); } + if (options->flags & MPO_TG_BLOCK_TRACKING) { + port->ip_tg_block_tracking = true; + } + if (options->flags & (MPO_IMPORTANCE_RECEIVER | MPO_DENAP_RECEIVER | MPO_TEMPOWNER)) { - assert(!port->ip_specialreply); + assert(!ip_is_special_reply_port(port)); port->ip_impdonation = 1; if (options->flags & MPO_TEMPOWNER) { @@ -2596,47 +2621,35 @@ mach_port_construct( } } - if (port_splabel != NULL) { - port->ip_service_port = (bool)(options->flags & MPO_SERVICE_PORT); - port->ip_splabel = port_splabel; - - /* Check if this is a service port */ - if (service_port_defense_enabled && port->ip_service_port) { - port->ip_immovable_receive = true; - } - - /* Check if this is a libxpc connection port */ - if (!port->ip_service_port) { - assert(options->flags & MPO_CONNECTION_PORT); - port->ip_immovable_send = true; - port->ip_immovable_receive = true; - } - } - if (options->flags & MPO_CONTEXT_AS_GUARD) { - uint64_t flags = 0; - if (options->flags & MPO_STRICT) { - flags |= MPG_STRICT; - } - if (options->flags & MPO_IMMOVABLE_RECEIVE) { - flags |= MPG_IMMOVABLE_RECEIVE; - } - kr = mach_port_guard_locked(*name, port, context, flags); - /* A newly allocated and locked port should always be guarded successfully */ + /* MPO_IMMOVABLE_RECEIVE was dealt with already */ + + kr = mach_port_guard_locked(*name, port, context, + (options->flags & MPO_STRICT) ? MPG_STRICT : 0); assert(kr == KERN_SUCCESS); - if (options->flags & MPO_SERVICE_PORT) { + + if (ip_is_any_service_port_type(label.io_type)) { /* - * Setting the guard on a service port triggers a special port destroyed notification - * that restores the guard when the receive right moves back to launchd. This - * must be a strict guard. + * Guarded service ports remember their name, + * and are re-guarded when port-destroyed notifications + * are received by launchd. + * See ipc_right_copyout_recv_and_unlock_space() */ - assert((options->flags & MPO_STRICT) == MPO_STRICT); - ipc_service_port_label_set_attr(port_splabel, *name, (mach_port_context_t)context); + label.iol_service->ispl_launchd_name = *name; + label.iol_service->ispl_launchd_context = context; } } else { port->ip_context = context; - if (options->flags & MPO_SERVICE_PORT) { - ipc_service_port_label_set_attr(port_splabel, *name, 0); + } + + /* + * Set ip_bootstrap for bootstrap ports to avoid holding the port lock + * in ipc_validate_local_port(). Lock needed to access port label. + */ + if (ip_is_any_service_port_type(label.io_type)) { + ipc_service_port_label_t sp_label = label.iol_service; + if (sp_label->ispl_bootstrap_port) { + port->ip_bootstrap = 1; } } @@ -2730,10 +2743,9 @@ mach_port_guard( /* Guard can be applied only to receive rights */ kr = ipc_port_translate_receive(space, name, &port); if (kr != KERN_SUCCESS) { - mach_port_guard_exception(name, 0, - ((KERN_INVALID_NAME == kr) ? - kGUARD_EXC_INVALID_NAME : - kGUARD_EXC_INVALID_RIGHT)); + uint64_t payload = (KERN_INVALID_NAME == kr) ? 0 : MPG_FLAGS_INVALID_RIGHT_RECV; + unsigned reason = (KERN_INVALID_NAME == kr) ? kGUARD_EXC_INVALID_NAME : kGUARD_EXC_INVALID_RIGHT; + mach_port_guard_exception(name, payload, reason); return kr; } @@ -2781,10 +2793,9 @@ mach_port_unguard( kr = ipc_port_translate_receive(space, name, &port); if (kr != KERN_SUCCESS) { - mach_port_guard_exception(name, 0, - ((KERN_INVALID_NAME == kr) ? 
- kGUARD_EXC_INVALID_NAME : - kGUARD_EXC_INVALID_RIGHT)); + uint64_t payload = (KERN_INVALID_NAME == kr) ? 0 : MPG_FLAGS_INVALID_RIGHT_RECV; + unsigned reason = (KERN_INVALID_NAME == kr) ? kGUARD_EXC_INVALID_NAME : kGUARD_EXC_INVALID_RIGHT; + mach_port_guard_exception(name, payload, reason); return kr; } @@ -2832,10 +2843,9 @@ mach_port_guard_with_flags( kr = ipc_port_translate_receive(space, name, &port); if (kr != KERN_SUCCESS) { - mach_port_guard_exception(name, 0, - ((KERN_INVALID_NAME == kr) ? - kGUARD_EXC_INVALID_NAME : - kGUARD_EXC_INVALID_RIGHT)); + uint64_t payload = (KERN_INVALID_NAME == kr) ? 0 : MPG_FLAGS_INVALID_RIGHT_RECV; + unsigned reason = (KERN_INVALID_NAME == kr) ? kGUARD_EXC_INVALID_NAME : kGUARD_EXC_INVALID_RIGHT; + mach_port_guard_exception(name, payload, reason); return kr; } @@ -2880,10 +2890,9 @@ mach_port_swap_guard( kr = ipc_port_translate_receive(space, name, &port); if (kr != KERN_SUCCESS) { - mach_port_guard_exception(name, 0, - ((KERN_INVALID_NAME == kr) ? - kGUARD_EXC_INVALID_NAME : - kGUARD_EXC_INVALID_RIGHT)); + uint64_t payload = (KERN_INVALID_NAME == kr) ? 0 : MPG_FLAGS_INVALID_RIGHT_RECV; + unsigned reason = (KERN_INVALID_NAME == kr) ? kGUARD_EXC_INVALID_NAME : kGUARD_EXC_INVALID_RIGHT; + mach_port_guard_exception(name, payload, reason); return kr; } @@ -2923,10 +2932,11 @@ mach_port_is_connection_for_service( mach_port_name_t service_port_name, uint64_t *filter_policy_id) { + ipc_object_label_t label; mach_port_t service_port; mach_port_t connection_port; - void *service_port_sblabel = NULL; - void *conn_port_sblabel = NULL; + struct ipc_conn_port_label *service_port_sblabel = NULL; + struct ipc_conn_port_label *conn_port_sblabel = NULL; kern_return_t ret; @@ -2951,17 +2961,18 @@ mach_port_is_connection_for_service( return ret; } - if (!service_port->ip_service_port) { - ip_mq_unlock(service_port); + label = ip_label_get(service_port); + if (!ip_is_any_service_port_type(label.io_type)) { + ip_mq_unlock_label_put(service_port, &label); return KERN_INVALID_CAPABILITY; } /* Port is locked and active */ - service_port_sblabel = ipc_service_port_get_sblabel(service_port); + service_port_sblabel = label.iol_service->ispl_sblabel; if (service_port_sblabel) { mach_msg_filter_retain_sblabel_callback(service_port_sblabel); } - ip_mq_unlock(service_port); + ip_mq_unlock_label_put(service_port, &label); if (!service_port_sblabel) { /* Nothing to check */ @@ -2974,12 +2985,14 @@ mach_port_is_connection_for_service( mach_msg_filter_dealloc_service_port_sblabel_callback(service_port_sblabel); return ret; } + /* Port is locked and active */ - conn_port_sblabel = ipc_service_port_get_sblabel(connection_port); - if (conn_port_sblabel) { + label = ip_label_get(connection_port); + if (label.io_type == IOT_CONNECTION_PORT && label.iol_connection) { + conn_port_sblabel = label.iol_connection; mach_msg_filter_retain_sblabel_callback(conn_port_sblabel); } - ip_mq_unlock(connection_port); + ip_mq_unlock_label_put(connection_port, &label); /* This callback will release the sblabel references */ ret = mach_msg_filter_get_connection_port_filter_policy_callback(service_port_sblabel, @@ -2995,6 +3008,7 @@ mach_port_get_service_port_info( mach_port_name_t name, mach_service_port_info_t sp_info) { + ipc_object_label_t label; ipc_port_t port; kern_return_t kr; @@ -3016,16 +3030,16 @@ mach_port_get_service_port_info( } /* port is locked and active */ - if (!port->ip_service_port) { - ip_mq_unlock(port); - return KERN_INVALID_CAPABILITY; + label = ip_label_get(port); + if 
(ip_is_any_service_port_type(label.io_type)) { + ipc_service_port_label_get_info(label.iol_service, sp_info); + } else { + kr = KERN_INVALID_CAPABILITY; } - assert(port->ip_splabel != NULL); - ipc_service_port_label_get_info((ipc_service_port_label_t)port->ip_splabel, sp_info); - ip_mq_unlock(port); + ip_mq_unlock_label_put(port, &label); - return KERN_SUCCESS; + return kr; } #else /* CONFIG_SERVICE_PORT_INFO */ diff --git a/osfmk/kdp/kdp_common.c b/osfmk/kdp/kdp_common.c index 7ef8556aa..c252d2726 100644 --- a/osfmk/kdp/kdp_common.c +++ b/osfmk/kdp/kdp_common.c @@ -152,6 +152,7 @@ kdp_find_phys(vm_map_t map, vm_offset_t target_addr, kdp_fault_flags_t fault_fla return 0; } + cur_phys_addr = (vm_offset_t)kdp_vtophys(map->pmap, target_addr); if (!pmap_valid_page((ppnum_t) atop(cur_phys_addr))) { if (!(fault_flags & KDP_FAULT_FLAGS_ENABLE_FAULTING)) { @@ -243,6 +244,7 @@ kdp_generic_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, kdp_fa kdp_memcpy(kvaddr, (const void *)phystokv((pmap_paddr_t)phys_src), cur_size); } else #endif /* defined(__arm64__) */ + bcopy_phys(phys_src, phys_dest, cur_size); } else { break; diff --git a/osfmk/kdp/kdp_core.c b/osfmk/kdp/kdp_core.c index 71e216647..435859e6c 100644 --- a/osfmk/kdp/kdp_core.c +++ b/osfmk/kdp/kdp_core.c @@ -741,7 +741,8 @@ kern_dump_save_segment_data(__unused void *refcon, core_save_segment_data_cb cal } kern_return_t -kdp_reset_output_vars(void *kdp_core_out_state, uint64_t totalbytes, bool encrypt_core, bool *out_should_skip_coredump) +kdp_reset_output_vars(void *kdp_core_out_state, uint64_t totalbytes, bool encrypt_core, bool *out_should_skip_coredump, + const char *corename, kern_coredump_type_t coretype) { struct kdp_core_out_state *outstate = (struct kdp_core_out_state *)kdp_core_out_state; struct kdp_output_stage *current_stage = NULL; @@ -754,7 +755,17 @@ kdp_reset_output_vars(void *kdp_core_out_state, uint64_t totalbytes, bool encryp /* Reset the output stages */ STAILQ_FOREACH(current_stage, &outstate->kcos_out_stage, kos_next) { - current_stage->kos_funcs.kosf_reset(current_stage); + kern_return_t res = current_stage->kos_funcs.kosf_reset(current_stage, corename, coretype); + + /* Skip coredump if requested by an output stage. 
*/ + if (res == KERN_NODE_DOWN) { + *out_should_skip_coredump = true; + return KERN_SUCCESS; + } + + if (res != KERN_SUCCESS) { + return res; + } } *out_should_skip_coredump = false; @@ -1009,6 +1020,9 @@ chain_output_stages(enum kern_dump_type kd_variant, struct kdp_core_out_state *o } #if defined(__arm64__) + +static const char *panic_buf_filename = "panic_region"; + static kern_return_t dump_panic_buffer(struct kdp_core_out_state *outstate, char *panic_buf, size_t panic_len, uint64_t *foffset, uint64_t details_flags) @@ -1018,7 +1032,8 @@ dump_panic_buffer(struct kdp_core_out_state *outstate, char *panic_buf, size_t p kern_coredump_log(NULL, "\nBeginning dump of panic region of size 0x%zx\n", panic_len); - ret = kdp_reset_output_vars(outstate, panic_len, true, &should_skip); + ret = kdp_reset_output_vars(outstate, panic_len, true, &should_skip, + panic_buf_filename, RAW_COREDUMP); if (KERN_SUCCESS != ret) { return ret; } @@ -1042,7 +1057,7 @@ dump_panic_buffer(struct kdp_core_out_state *outstate, char *panic_buf, size_t p return ret; } - ret = kern_dump_record_file(outstate, "panic_region", *foffset, &compressed_panic_region_len, + ret = kern_dump_record_file(outstate, panic_buf_filename, *foffset, &compressed_panic_region_len, details_flags); if (KERN_SUCCESS != ret) { kern_coredump_log(NULL, "Failed to record panic region in corefile header, kern_dump_record_file returned 0x%x\n", ret); @@ -1169,10 +1184,11 @@ do_kern_dump(enum kern_dump_type kd_variant) #if defined(__x86_64__) if (((kd_variant == KERN_DUMP_STACKSHOT_DISK) || (kd_variant == KERN_DUMP_DISK)) && ((panic_stackshot_buf != 0) && (panic_stackshot_len != 0))) { bool should_skip = false; + static const char *stackshot_filename = "panic_stackshot.kcdata"; kern_coredump_log(NULL, "\nBeginning dump of kernel stackshot\n"); - ret = kdp_reset_output_vars(&outstate, panic_stackshot_len, true, &should_skip); + ret = kdp_reset_output_vars(&outstate, panic_stackshot_len, true, &should_skip, stackshot_filename, RAW_COREDUMP); if (ret != KERN_SUCCESS) { kern_coredump_log(NULL, "Failed to reset outstate for stackshot with len 0x%zx, returned 0x%x\n", panic_stackshot_len, ret); @@ -1186,7 +1202,7 @@ do_kern_dump(enum kern_dump_type kd_variant) } else if ((ret = kdp_core_output(&outstate, 0, NULL)) != KERN_SUCCESS) { kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(%p, 0, NULL) returned 0x%x\n", &outstate, ret); dump_succeeded = FALSE; - } else if ((ret = kern_dump_record_file(&outstate, "panic_stackshot.kcdata", foffset, &compressed_stackshot_len, details_flags)) != KERN_SUCCESS) { + } else if ((ret = kern_dump_record_file(&outstate, stackshot_filename, foffset, &compressed_stackshot_len, details_flags)) != KERN_SUCCESS) { kern_coredump_log(NULL, "Failed to record panic stackshot in corefile header, kern_dump_record_file returned 0x%x\n", ret); dump_succeeded = FALSE; } else { @@ -1208,12 +1224,19 @@ do_kern_dump(enum kern_dump_type kd_variant) * Dump co-processors as well, foffset will be overwritten with the * offset of the next location in the file to be written to. 
*/ - if (kern_do_coredump(&outstate, FALSE, foffset, &foffset, details_flags) != 0) { + if (kern_do_coredump(&outstate, KCF_NONE, foffset, &foffset, details_flags) != 0) { dump_succeeded = FALSE; } +#if defined (__arm64__) + } else if (kd_variant == KERN_DUMP_HW_SHMEM_DBG) { + kern_coredump_log(NULL, "Writing all cores through shared memory debugger\n"); + if (kern_do_coredump(&outstate, KCF_ABORT_ON_FAILURE, foffset, &foffset, details_flags) != 0) { + dump_succeeded = FALSE; + } +#endif /* __arm64__ */ } else if (kd_variant != KERN_DUMP_STACKSHOT_DISK) { /* Only the kernel */ - if (kern_do_coredump(&outstate, TRUE, foffset, &foffset, details_flags) != 0) { + if (kern_do_coredump(&outstate, KCF_KERNEL_ONLY, foffset, &foffset, details_flags) != 0) { dump_succeeded = FALSE; } } @@ -1783,7 +1806,7 @@ kdp_core_init(void) kmem_alloc(kernel_map, (vm_offset_t*)&kdp_core_header, kdp_core_header_size, - KMA_NOFAIL | KMA_ZERO | KMA_PERMANENT | KMA_KOBJECT | KMA_DATA, + KMA_NOFAIL | KMA_ZERO | KMA_PERMANENT | KMA_KOBJECT | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); kdp_core_header->signature = MACH_CORE_FILEHEADER_V2_SIGNATURE; diff --git a/osfmk/kdp/kdp_core.h b/osfmk/kdp/kdp_core.h index 93dd4ed33..108da3933 100644 --- a/osfmk/kdp/kdp_core.h +++ b/osfmk/kdp/kdp_core.h @@ -38,6 +38,7 @@ #include #include +#include #include #include @@ -150,7 +151,7 @@ kern_return_t kdp_core_output(void *kdp_core_out_vars, uint64_t length, void * d * Note that the 'encrypt_core' parameter instructs the output vars to encrypt the coredump data (if possible) * The 'out_should_skip_coredump' parameter will be set to true if the calling code should skip this coredump (for reasons). */ -kern_return_t kdp_reset_output_vars(void *kdp_core_out_vars, uint64_t totalbytes, bool encrypt_core, bool *out_should_skip_coredump); +kern_return_t kdp_reset_output_vars(void *kdp_core_out_vars, uint64_t totalbytes, bool encrypt_core, bool *out_should_skip_coredump, const char *corename, kern_coredump_type_t coretype); kern_return_t kern_dump_record_file(void *kdp_core_out_vars, const char *filename, uint64_t file_offset, uint64_t *out_file_length, uint64_t details_flags); diff --git a/osfmk/kdp/kdp_out_stage.h b/osfmk/kdp/kdp_out_stage.h index 4b02c6ee6..47aea7b5a 100644 --- a/osfmk/kdp/kdp_out_stage.h +++ b/osfmk/kdp/kdp_out_stage.h @@ -32,6 +32,7 @@ #include #include #include +#include struct kdp_output_stage; @@ -46,7 +47,7 @@ struct kdp_core_out_state { }; struct kdp_output_stage_funcs { - void (*kosf_reset)(struct kdp_output_stage *stage); + kern_return_t (*kosf_reset)(struct kdp_output_stage *stage, const char *corename, kern_coredump_type_t coretype); kern_return_t (*kosf_outproc)(struct kdp_output_stage *stage, unsigned int request, char *corename, uint64_t length, void *panic_data); void (*kosf_free)(struct kdp_output_stage *stage); diff --git a/osfmk/kdp/ml/arm/kdp_machdep.c b/osfmk/kdp/ml/arm/kdp_machdep.c index 9fdd86336..3ea05c8a7 100644 --- a/osfmk/kdp/ml/arm/kdp_machdep.c +++ b/osfmk/kdp/ml/arm/kdp_machdep.c @@ -240,6 +240,7 @@ kdp_panic(const char * fmt, ...) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" char kdp_fmt[256]; va_list args; diff --git a/osfmk/kdp/ml/arm/kdp_vm.c b/osfmk/kdp/ml/arm/kdp_vm.c index ed6e45164..a403ebef8 100644 --- a/osfmk/kdp/ml/arm/kdp_vm.c +++ b/osfmk/kdp/ml/arm/kdp_vm.c @@ -63,6 +63,7 @@ kdp_vtophys( { pmap_paddr_t pa; + /* Ensure that the provided va resides within the provided pmap range. 
*/ if (!pmap || ((pmap != kernel_pmap) && ((va < pmap->min) || (va >= pmap->max)))) { #ifdef KDP_VTOPHYS_DEBUG diff --git a/osfmk/kdp/ml/x86_64/kdp_machdep.c b/osfmk/kdp/ml/x86_64/kdp_machdep.c index 0f9b05716..8ab1abdd7 100644 --- a/osfmk/kdp/ml/x86_64/kdp_machdep.c +++ b/osfmk/kdp/ml/x86_64/kdp_machdep.c @@ -278,6 +278,7 @@ kdp_panic( { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" char kdp_fmt[256]; va_list args; diff --git a/osfmk/kdp/output_stages/out_aea.c b/osfmk/kdp/output_stages/out_aea.c index 1391a1154..b703d5f35 100644 --- a/osfmk/kdp/output_stages/out_aea.c +++ b/osfmk/kdp/output_stages/out_aea.c @@ -90,8 +90,8 @@ aea_read_callback(void *context, void *buffer, size_t length, off_t offset) return length; } -static void -aea_stage_reset(struct kdp_output_stage *stage) +static kern_return_t +aea_stage_reset(struct kdp_output_stage *stage, __unused const char *corename, __unused kern_coredump_type_t coretype) { int aea_ret = 0; struct aea_stage_data *stage_data = (struct aea_stage_data *) stage->kos_data; @@ -108,6 +108,8 @@ aea_stage_reset(struct kdp_output_stage *stage) stage->kos_bypass = false; stage->kos_bytes_written = 0; + + return KERN_SUCCESS; } static kern_return_t @@ -239,7 +241,7 @@ aea_stage_initialize(struct kdp_output_stage *stage, const void *recipient_publi } stage->kos_data_size = sizeof(struct aea_stage_data) + state_size; ret = kmem_alloc(kernel_map, (vm_offset_t*) &stage->kos_data, stage->kos_data_size, - KMA_DATA, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (KERN_SUCCESS != ret) { printf("Failed to allocate memory (%zu bytes) for the AEA stage. Error 0x%x\n", stage->kos_data_size, ret); return ret; diff --git a/osfmk/kdp/output_stages/out_buffer.c b/osfmk/kdp/output_stages/out_buffer.c index 18522d9df..fb80c56c7 100644 --- a/osfmk/kdp/output_stages/out_buffer.c +++ b/osfmk/kdp/output_stages/out_buffer.c @@ -41,14 +41,16 @@ struct buffer_stage_data { char buffer[]; }; -static void -buffer_stage_reset(struct kdp_output_stage *stage) +static kern_return_t +buffer_stage_reset(struct kdp_output_stage *stage, __unused const char *corename, __unused kern_coredump_type_t coretype) { struct buffer_stage_data *data = (struct buffer_stage_data *) stage->kos_data; data->current_size = 0; stage->kos_bypass = false; stage->kos_bytes_written = 0; + + return KERN_SUCCESS; } static kern_return_t @@ -154,7 +156,7 @@ buffer_stage_initialize(struct kdp_output_stage *stage, size_t buffer_size) stage->kos_data_size = sizeof(struct buffer_stage_data) + buffer_size; ret = kmem_alloc(kernel_map, (vm_offset_t*) &stage->kos_data, stage->kos_data_size, - KMA_DATA, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (KERN_SUCCESS != ret) { printf("buffer_stage_initialize failed to allocate memory. 
Error 0x%x\n", ret); return ret; diff --git a/osfmk/kdp/output_stages/out_disk.c b/osfmk/kdp/output_stages/out_disk.c index 9cdf7e5f6..f82116eda 100644 --- a/osfmk/kdp/output_stages/out_disk.c +++ b/osfmk/kdp/output_stages/out_disk.c @@ -209,11 +209,13 @@ disk_stage_read(struct kdp_output_stage *stage, uint64_t offset, uint64_t length return err; } -static void -disk_stage_reset(struct kdp_output_stage *stage) +static kern_return_t +disk_stage_reset(struct kdp_output_stage *stage, __unused const char *corename, __unused kern_coredump_type_t coretype) { stage->kos_bypass = false; stage->kos_bytes_written = 0; + + return KERN_SUCCESS; } static kern_return_t @@ -316,7 +318,7 @@ disk_stage_initialize(struct kdp_output_stage *stage) stage->kos_data_size = sizeof(struct disk_stage_data); ret = kmem_alloc(kernel_map, (vm_offset_t*) &stage->kos_data, stage->kos_data_size, - KMA_DATA, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (KERN_SUCCESS != ret) { return ret; } diff --git a/osfmk/kdp/output_stages/out_lz4.c b/osfmk/kdp/output_stages/out_lz4.c index cd8a79e1d..7aff370a7 100644 --- a/osfmk/kdp/output_stages/out_lz4.c +++ b/osfmk/kdp/output_stages/out_lz4.c @@ -56,8 +56,8 @@ struct lz4_stage_data { bool reset_failed; }; -static void -lz4_stage_reset(struct kdp_output_stage *stage) +static kern_return_t +lz4_stage_reset(struct kdp_output_stage *stage, __unused const char *corename, __unused kern_coredump_type_t coretype) { struct lz4_stage_data *data; compression_status_t status; @@ -81,6 +81,8 @@ lz4_stage_reset(struct kdp_output_stage *stage) stage->kos_bypass = false; stage->kos_bytes_written = 0; + + return KERN_SUCCESS; } static kern_return_t @@ -110,6 +112,7 @@ lz4_stage_stream(struct lz4_stage_data *data, struct kdp_output_stage *next_stag status = compression_ki_ptr->compression_stream_process(&data->stream, finalize ? COMPRESSION_STREAM_FINALIZE : 0); if (COMPRESSION_STATUS_ERROR == status) { + kern_coredump_log(NULL, "(%s) compression_stream_process failed\n", __func__); return KERN_FAILURE; } @@ -120,6 +123,7 @@ lz4_stage_stream(struct lz4_stage_data *data, struct kdp_output_stage *next_stag ret = next_stage->kos_funcs.kosf_outproc(next_stage, KDP_DATA, corename, produced, data->dst_buf); if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) next stage output failed with error 0x%x\n", __func__, ret); return ret; } *written += produced; @@ -130,7 +134,11 @@ lz4_stage_stream(struct lz4_stage_data *data, struct kdp_output_stage *next_stag } while (data->stream.src_size || (finalize && COMPRESSION_STATUS_END != status)); if (finalize) { - return next_stage->kos_funcs.kosf_outproc(next_stage, KDP_DATA, corename, 0, NULL); + ret = next_stage->kos_funcs.kosf_outproc(next_stage, KDP_DATA, corename, 0, NULL); + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) next stage output failed with error 0x%x\n", __func__, ret); + } + return ret; } return KERN_SUCCESS; @@ -164,33 +172,47 @@ lz4_stage_outproc(struct kdp_output_stage *stage, unsigned int request, } if (stage->kos_bypass || KDP_DATA != request) { - return next_stage->kos_funcs.kosf_outproc(next_stage, request, corename, length, - panic_data); + ret = next_stage->kos_funcs.kosf_outproc(next_stage, request, corename, length, + panic_data); + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) next stage output failed with error 0x%x\n", __func__, ret); + } + return ret; } if (panic_data) { // Write panic data to the stream. 
- return lz4_stage_stream(data, next_stage, corename, panic_data, (size_t)length, - &stage->kos_bytes_written); - } else { - if (length) { - // Pad the stream with zeroes. - pad_length = (size_t)length; - do { - zero_size = MIN(pad_length, ZERO_BUF_SIZE); - ret = lz4_stage_stream(data, next_stage, corename, data->zero_buf, - zero_size, &stage->kos_bytes_written); - if (KERN_SUCCESS != ret) { - return ret; - } - pad_length -= zero_size; - } while (pad_length); - return KERN_SUCCESS; - } else { - // Finalize the stream. - return lz4_stage_stream(data, next_stage, corename, NULL, 0, &stage->kos_bytes_written); + ret = lz4_stage_stream(data, next_stage, corename, panic_data, (size_t)length, + &stage->kos_bytes_written); + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) lz4_stage_stream failed with error 0x%x\n", __func__, ret); } + return ret; } + + if (length) { + // Pad the stream with zeroes. + pad_length = (size_t)length; + do { + zero_size = MIN(pad_length, ZERO_BUF_SIZE); + ret = lz4_stage_stream(data, next_stage, corename, data->zero_buf, + zero_size, &stage->kos_bytes_written); + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) lz4_stage_stream failed with error 0x%x\n", __func__, ret); + return ret; + } + pad_length -= zero_size; + } while (pad_length); + return KERN_SUCCESS; + } + + // Finalize the stream. + ret = lz4_stage_stream(data, next_stage, corename, NULL, 0, &stage->kos_bytes_written); + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) lz4_stage_stream failed with error 0x%x\n", __func__, ret); + } + + return ret; } static void diff --git a/osfmk/kdp/output_stages/out_memory_backing_aware_buffer.c b/osfmk/kdp/output_stages/out_memory_backing_aware_buffer.c index 86ae6d3ba..30756e2dc 100644 --- a/osfmk/kdp/output_stages/out_memory_backing_aware_buffer.c +++ b/osfmk/kdp/output_stages/out_memory_backing_aware_buffer.c @@ -39,6 +39,10 @@ #include #include +__static_testable kern_return_t +memory_backing_aware_buffer_stage_outproc(struct kdp_output_stage *stage, unsigned int request, + char *corename, uint64_t length, void * panic_data); + static bool is_normal_memory(uint64_t phys) { @@ -51,12 +55,13 @@ is_normal_memory(uint64_t phys) return (attr & VM_WIMG_MASK) == VM_WIMG_DEFAULT; } -static void -memory_backing_aware_buffer_stage_reset(__unused struct kdp_output_stage *stage) +static kern_return_t +memory_backing_aware_buffer_stage_reset(__unused struct kdp_output_stage *stage, __unused const char *corename, __unused kern_coredump_type_t coretype) { + return KERN_SUCCESS; } -static kern_return_t +__static_testable kern_return_t memory_backing_aware_buffer_stage_outproc(struct kdp_output_stage *stage, unsigned int request, char *corename, uint64_t length, void * panic_data) { @@ -151,7 +156,7 @@ memory_backing_aware_buffer_stage_initialize(struct kdp_output_stage *stage) stage->kos_data_size = PAGE_SIZE; ret = kmem_alloc(kernel_map, (vm_offset_t*) &stage->kos_data, stage->kos_data_size, - KMA_DATA, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (KERN_SUCCESS != ret) { printf("%s failed to allocate memory. 
Error 0x%x\n", __func__, ret); return ret; diff --git a/osfmk/kdp/output_stages/out_net.c b/osfmk/kdp/output_stages/out_net.c index 043ba4d34..007a7f480 100644 --- a/osfmk/kdp/output_stages/out_net.c +++ b/osfmk/kdp/output_stages/out_net.c @@ -34,11 +34,13 @@ #include #include -static void -net_stage_reset(struct kdp_output_stage *stage) +static kern_return_t +net_stage_reset(struct kdp_output_stage *stage, __unused const char *corename, __unused kern_coredump_type_t coretype) { stage->kos_bypass = false; stage->kos_bytes_written = 0; + + return KERN_SUCCESS; } static kern_return_t diff --git a/osfmk/kdp/output_stages/out_progress_notify.c b/osfmk/kdp/output_stages/out_progress_notify.c index f3eade060..41742c844 100644 --- a/osfmk/kdp/output_stages/out_progress_notify.c +++ b/osfmk/kdp/output_stages/out_progress_notify.c @@ -43,12 +43,14 @@ struct progress_notify_stage_data { uint64_t last_notify_timestamp; }; -static void -progress_notify_stage_reset(struct kdp_output_stage *stage) +static kern_return_t +progress_notify_stage_reset(struct kdp_output_stage *stage, __unused const char *corename, __unused kern_coredump_type_t coretype) { struct progress_notify_stage_data *data = (struct progress_notify_stage_data*) stage->kos_data; data->last_notify_timestamp = 0; + + return KERN_SUCCESS; } static kern_return_t @@ -98,7 +100,7 @@ progress_notify_stage_initialize(struct kdp_output_stage *stage) stage->kos_data_size = sizeof(struct progress_notify_stage_data); ret = kmem_alloc(kernel_map, (vm_offset_t*) &stage->kos_data, stage->kos_data_size, - KMA_DATA, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (KERN_SUCCESS != ret) { printf("progress_notify_stage_initialize failed to allocate memory. Error 0x%x\n", ret); return ret; diff --git a/osfmk/kdp/output_stages/out_shmem.c b/osfmk/kdp/output_stages/out_shmem.c index 13be63a48..6f924a1c7 100644 --- a/osfmk/kdp/output_stages/out_shmem.c +++ b/osfmk/kdp/output_stages/out_shmem.c @@ -51,6 +51,8 @@ #define KDP_CORE_HW_SHMEM_DBG_TOTAL_BUF_SIZE 64 * 1024 #define KDP_HW_SHMEM_DBG_TIMEOUT_DEADLINE_SECS 30 +TUNABLE(uint64_t, shmem_timeout_sec, "shmem_timeout_sec", KDP_HW_SHMEM_DBG_TIMEOUT_DEADLINE_SECS); + /* * Astris can read up to 4064 bytes at a time over * the probe, so we should try to make our buffer @@ -70,8 +72,32 @@ * Currently used for sending compressed coredumps to * astris. 
*/ + +__enum_closed_decl(xhsdci_status_t, uint32_t, { + XHSDCI_STATUS_NONE = 0, /* default status */ + XHSDCI_STATUS_KERNEL_BUSY = 1, /* kernel is busy with other procedure */ + XHSDCI_STATUS_KERNEL_READY = 2, /* kernel ready to begin command */ + XHSDCI_COREDUMP_BEGIN = 3, /* indicates hardware debugger is ready to begin consuming coredump info */ + XHSDCI_COREDUMP_BUF_READY = 4, /* indicates the kernel has populated the buffer */ + XHSDCI_COREDUMP_BUF_EMPTY = 5, /* indicates hardware debugger is done consuming the current data */ + XHSDCI_COREDUMP_STATUS_DONE = 6, /* indicates last compressed data is in buffer */ + XHSDCI_COREDUMP_ERROR = 7, /* indicates an error was encountered */ + XHSDCI_COREDUMP_REMOTE_DONE = 8, /* indicates that hardware debugger is done */ + XHSDCI_COREDUMP_INFO = 9, /* announces new file available for consumption */ + XHSDCI_COREDUMP_ACK = 10, /* remote side ack/nack announced file */ +}); + +typedef union xhscdi_file_flags { + uint64_t value; + struct { + bool xff_ack :1; /* Remote side ACKed file transfer */ + bool xff_gzip :1; /* File is gzipped */ + uint8_t xff_type :4; /* coredump type */ + }; +} xhsdci_file_flags_t; + struct xnu_hw_shmem_dbg_command_info { - volatile uint32_t xhsdci_status; + volatile xhsdci_status_t xhsdci_status; uint32_t xhsdci_seq_no; volatile uint64_t xhsdci_buf_phys_addr; volatile uint32_t xhsdci_buf_data_length; @@ -79,19 +105,12 @@ struct xnu_hw_shmem_dbg_command_info { uint64_t xhsdci_coredump_total_size_uncomp; uint64_t xhsdci_coredump_total_size_sent_uncomp; uint32_t xhsdci_page_size; + /* end of version 1 structure */ + char xhsdci_file_name[64]; /* name of a core that XNU offers */ + xhsdci_file_flags_t xhsdci_file_flags; /* file flags */ } __attribute__((packed)); -#define CUR_XNU_HWSDCI_STRUCT_VERS 1 - -#define XHSDCI_STATUS_NONE 0 /* default status */ -#define XHSDCI_STATUS_KERNEL_BUSY 1 /* kernel is busy with other procedure */ -#define XHSDCI_STATUS_KERNEL_READY 2 /* kernel ready to begin command */ -#define XHSDCI_COREDUMP_BEGIN 3 /* indicates hardware debugger is ready to begin consuming coredump info */ -#define XHSDCI_COREDUMP_BUF_READY 4 /* indicates the kernel has populated the buffer */ -#define XHSDCI_COREDUMP_BUF_EMPTY 5 /* indicates hardware debugger is done consuming the current data */ -#define XHSDCI_COREDUMP_STATUS_DONE 6 /* indicates last compressed data is in buffer */ -#define XHSDCI_COREDUMP_ERROR 7 /* indicates an error was encountered */ -#define XHSDCI_COREDUMP_REMOTE_DONE 8 /* indicates that hardware debugger is done */ +#define CUR_XNU_HWSDCI_STRUCT_VERS 2 struct kdp_hw_shmem_dbg_buf_elm { vm_offset_t khsd_buf; @@ -100,6 +119,7 @@ struct kdp_hw_shmem_dbg_buf_elm { }; struct shmem_stage_data { + bool signal_done; uint32_t seq_no; uint64_t contact_deadline; uint64_t contact_deadline_interval; @@ -115,18 +135,128 @@ static STAILQ_HEAD(, kdp_hw_shmem_dbg_buf_elm) free_hw_shmem_dbg_bufs = static STAILQ_HEAD(, kdp_hw_shmem_dbg_buf_elm) hw_shmem_dbg_bufs_to_flush = STAILQ_HEAD_INITIALIZER(hw_shmem_dbg_bufs_to_flush); + +#pragma mark Shared memory protocol implementation + /* - * Whenever we start a coredump, make sure the buffers + * Waits for remote side to move protocol to expected state. Checks for errors + * and timeouts.
*/ +static kern_return_t +shmem_wait_for_state(struct shmem_stage_data *data, xhsdci_status_t status) +{ + data->contact_deadline = mach_absolute_time() + data->contact_deadline_interval; + + while (hwsd_info->xhsdci_status != status) { + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + + if (hwsd_info->xhsdci_status == XHSDCI_COREDUMP_ERROR) { + kern_coredump_log(NULL, "%s: Detected remote side error (state %d, waiting %d)\n", + __func__, hwsd_info->xhsdci_status, status); + return KERN_FAILURE; + } + + if (mach_absolute_time() > data->contact_deadline) { + kern_coredump_log(NULL, "%s: Timed out waiting for the reply (state %d, waiting %d)\n", + __func__, hwsd_info->xhsdci_status, status); + return KERN_OPERATION_TIMED_OUT; + } + } + + if (hwsd_info->xhsdci_seq_no != (data->seq_no + 1)) { + kern_coredump_log(NULL, "%s: Detected stale/invalid seq num (state %d, waiting %d). Expected: %d, received %d\n", + __func__, hwsd_info->xhsdci_status, status, (data->seq_no + 1), hwsd_info->xhsdci_seq_no); + return KERN_FAILURE; + } + + return KERN_SUCCESS; +} + +/* + * Publish new state, update seq number and flush cache. + */ +static kern_return_t +shmem_set_status(struct shmem_stage_data *data, xhsdci_status_t status) +{ + data->seq_no = hwsd_info->xhsdci_seq_no; + hwsd_info->xhsdci_seq_no = ++(data->seq_no); + hwsd_info->xhsdci_status = status; + FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + + return KERN_SUCCESS; +} + +#pragma mark Output stage implementation + +/* + * Announces the file to be written to the other side and waits for a response. + * + * Return value meaning: + * KERN_SUCCESS - A coredump should proceed + * KERN_NODE_DOWN - Other side is not interested + * KERN_* - Error occurred + */ +static kern_return_t +shmem_stage_announce(struct kdp_output_stage *stage, const char *corename, uint8_t coretype) +{ + struct shmem_stage_data *data = (struct shmem_stage_data *) stage->kos_data; + kern_return_t ret = KERN_SUCCESS; + + /* Don't signal XHSDCI_COREDUMP_DONE unless remote side has seen XHSDCI_COREDUMP_INFO. */ + data->signal_done = false; + + /* + * This is the first state after XHSDCI_COREDUMP_BEGIN is set. + * If that's the case then reset the sequence number to 1. + */ + if (hwsd_info->xhsdci_status == XHSDCI_COREDUMP_BEGIN) { + data->seq_no = 1; + } + + /* Announce new corefile to the remote side. */ + strlcpy(hwsd_info->xhsdci_file_name, corename, sizeof(hwsd_info->xhsdci_file_name)); + hwsd_info->xhsdci_file_flags.xff_gzip = true; + hwsd_info->xhsdci_file_flags.xff_type = (coretype & 0xf); + shmem_set_status(data, XHSDCI_COREDUMP_INFO); + + /* wait for response */ + ret = shmem_wait_for_state(data, XHSDCI_COREDUMP_ACK); + if (ret != KERN_SUCCESS) { + kern_coredump_log(NULL, "%s: no ACK from remote side: %d\n", __func__, ret); + return ret; + } + + /* Remote side has seen XHSDCI_COREDUMP_INFO so it will expect XHSDCI_COREDUMP_DONE. */ + data->signal_done = true; + + /* Return whether transfer has been acked/nacked. */ + return (hwsd_info->xhsdci_file_flags.xff_ack) ? KERN_SUCCESS : KERN_NODE_DOWN; +} + +/* + * Whenever a new file gets transferred, make sure the buffers * are all on the free queue and the state is as expected. * The buffers may have been left in a different state if * a previous coredump attempt failed.
*/ -static void -shmem_stage_reset(struct kdp_output_stage *stage) +static kern_return_t +shmem_stage_reset(struct kdp_output_stage *stage, const char *corename, kern_coredump_type_t coretype) { struct shmem_stage_data *data = (struct shmem_stage_data *) stage->kos_data; struct kdp_hw_shmem_dbg_buf_elm *cur_elm = NULL, *tmp_elm = NULL; + kern_return_t res = KERN_SUCCESS; + /* + * Announce new file and wait for remote side's ACK. + */ + res = shmem_stage_announce(stage, corename, coretype); + if (res != KERN_SUCCESS) { + return res; + } + + /* + * Proceed with the stage output reset. + */ STAILQ_FOREACH(cur_elm, &free_hw_shmem_dbg_bufs, khsd_elms) { cur_elm->khsd_data_length = 0; } @@ -152,19 +282,24 @@ shmem_stage_reset(struct kdp_output_stage *stage) STAILQ_INSERT_HEAD(&free_hw_shmem_dbg_bufs, cur_elm, khsd_elms); } - hwsd_info->xhsdci_status = XHSDCI_COREDUMP_BUF_EMPTY; - data->seq_no = 0; hwsd_info->xhsdci_buf_phys_addr = 0; hwsd_info->xhsdci_buf_data_length = 0; hwsd_info->xhsdci_coredump_total_size_uncomp = 0; hwsd_info->xhsdci_coredump_total_size_sent_uncomp = 0; hwsd_info->xhsdci_page_size = PAGE_SIZE; - FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + + /* + * Do not modify sequence numbers here. This is not a message for a remote + * side. This sets only initial state for the file transfer itself. + */ + hwsd_info->xhsdci_status = XHSDCI_COREDUMP_BUF_EMPTY; data->contact_deadline = mach_absolute_time() + data->contact_deadline_interval; stage->kos_bypass = false; stage->kos_bytes_written = 0; + + return KERN_SUCCESS; } /* @@ -180,12 +315,12 @@ shmem_dbg_process_buffers(struct kdp_output_stage *stage) FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); if (hwsd_info->xhsdci_status == XHSDCI_COREDUMP_ERROR) { - kern_coredump_log(NULL, "Detected remote error, terminating...\n"); + kern_coredump_log(NULL, "%s: Detected remote error, terminating...\n", __func__); return kIOReturnError; } else if (hwsd_info->xhsdci_status == XHSDCI_COREDUMP_BUF_EMPTY) { if (hwsd_info->xhsdci_seq_no != (data->seq_no + 1)) { - kern_coredump_log(NULL, "Detected stale/invalid seq num. Expected: %d, received %d\n", - (data->seq_no + 1), hwsd_info->xhsdci_seq_no); + kern_coredump_log(NULL, "%s: Detected stale/invalid seq num. 
Expected: %d, received %d\n", + __func__, (data->seq_no + 1), hwsd_info->xhsdci_seq_no); hwsd_info->xhsdci_status = XHSDCI_COREDUMP_ERROR; FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); return kIOReturnError; @@ -208,9 +343,7 @@ shmem_dbg_process_buffers(struct kdp_output_stage *stage) hwsd_info->xhsdci_coredump_total_size_uncomp = stage->kos_outstate->kcos_totalbytes; hwsd_info->xhsdci_coredump_total_size_sent_uncomp = stage->kos_outstate->kcos_bytes_written; FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, KDP_CORE_HW_SHMEM_DBG_TOTAL_BUF_SIZE); - hwsd_info->xhsdci_seq_no = ++(data->seq_no); - hwsd_info->xhsdci_status = XHSDCI_COREDUMP_BUF_READY; - FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + shmem_set_status(data, XHSDCI_COREDUMP_BUF_READY); } data->contact_deadline = mach_absolute_time() + data->contact_deadline_interval; @@ -218,7 +351,7 @@ shmem_dbg_process_buffers(struct kdp_output_stage *stage) return KERN_SUCCESS; } else if (mach_absolute_time() > data->contact_deadline) { kern_coredump_log(NULL, "Kernel timed out waiting for hardware debugger to update handshake structure."); - kern_coredump_log(NULL, "No contact in %d seconds\n", KDP_HW_SHMEM_DBG_TIMEOUT_DEADLINE_SECS); + kern_coredump_log(NULL, "No contact in %llu seconds\n", shmem_timeout_sec); hwsd_info->xhsdci_status = XHSDCI_COREDUMP_ERROR; FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); @@ -274,9 +407,23 @@ shmem_stage_outproc(struct kdp_output_stage *stage, unsigned int request, uint32_t bytes_remaining = (uint32_t) length; uint32_t bytes_to_copy; - if (request == KDP_EOF) { + /* + * Flush the buffers and signal that coredump is finished. + */ + if (request == KDP_EOF || request == KDP_SEEK) { assert(data->currently_filling_buf == NULL); + /* + * Do not signal XHSDCI_COREDUMP_STATUS_DONE if no file transfer is in + * progress. + * + * If connection is already in ERROR state then avoid touching status + * field. Remote side is waiting for protocol restart (KERNEL_READY). + */ + if (!data->signal_done || hwsd_info->xhsdci_status == XHSDCI_COREDUMP_ERROR) { + return KERN_SUCCESS; + } + /* * Wait until we've flushed all the buffers * before setting the connection status to done. @@ -284,9 +431,10 @@ shmem_stage_outproc(struct kdp_output_stage *stage, unsigned int request, while (!STAILQ_EMPTY(&hw_shmem_dbg_bufs_to_flush) || data->currently_flushing_buf != NULL) { ret = shmem_dbg_process_buffers(stage); - if (ret) { - return ret; + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) shmem_dbg_process_buffers failed with error 0x%x\n", __func__, ret); } + return ret; } /* @@ -300,12 +448,15 @@ shmem_stage_outproc(struct kdp_output_stage *stage, unsigned int request, return -1; } - data->seq_no = hwsd_info->xhsdci_seq_no; - kern_coredump_log(NULL, "Setting coredump status as done!\n"); - hwsd_info->xhsdci_seq_no = ++(data->seq_no); - hwsd_info->xhsdci_status = XHSDCI_COREDUMP_STATUS_DONE; - FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); + shmem_set_status(data, XHSDCI_COREDUMP_STATUS_DONE); + + /* wait for remote side to signal it is done */ + ret = shmem_wait_for_state(data, XHSDCI_COREDUMP_REMOTE_DONE); + if (ret != KERN_SUCCESS) { + kern_coredump_log(NULL, "%s: remote is not done: %d\n", __func__, ret); + return ret; + } return ret; } @@ -329,6 +480,9 @@ shmem_stage_outproc(struct kdp_output_stage *stage, unsigned int request, * Move the current buffer along if possible. 
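Taken together, the reset and outproc changes give each corefile a fixed lifecycle on this stage. A rough outline of the ordering, as an illustrative comment only (the kdp_core output machinery drives these calls):

/*
 * Per-file flow, as driven by the core-dump output machinery (sketch):
 *
 *   reset(stage, corename, coretype)
 *     -> shmem_stage_announce(): publish XHSDCI_COREDUMP_INFO, wait for
 *        XHSDCI_COREDUMP_ACK (KERN_NODE_DOWN if the remote side declines),
 *        then put all buffers back on the free queue and clear counters.
 *
 *   outproc(stage, KDP_DATA, ...)  -- repeated
 *     -> fill buffers and cycle them through XHSDCI_COREDUMP_BUF_READY /
 *        XHSDCI_COREDUMP_BUF_EMPTY transitions.
 *
 *   outproc(stage, KDP_EOF, ...)
 *     -> drain any pending buffers, publish XHSDCI_COREDUMP_STATUS_DONE
 *        (only if signal_done is set), then wait for
 *        XHSDCI_COREDUMP_REMOTE_DONE.
 */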
*/ ret = shmem_dbg_process_buffers(stage); + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) shmem_dbg_process_buffers failed with error 0x%x\n", __func__, ret); + } return ret; } @@ -338,7 +492,8 @@ shmem_stage_outproc(struct kdp_output_stage *stage, unsigned int request, */ while (data->currently_filling_buf == NULL) { ret = shmem_dbg_get_buffer(stage); - if (ret) { + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) shmem_dbg_get_buffer failed with error 0x%x\n", __func__, ret); return ret; } } @@ -359,7 +514,8 @@ shmem_stage_outproc(struct kdp_output_stage *stage, unsigned int request, * Move it along if possible. */ ret = shmem_dbg_process_buffers(stage); - if (ret) { + if (KERN_SUCCESS != ret) { + kern_coredump_log(NULL, "(%s) shmem_dbg_process_buffers failed with error 0x%x\n", __func__, ret); return ret; } } @@ -453,15 +609,16 @@ shmem_stage_initialize(struct kdp_output_stage *stage) stage->kos_data_size = sizeof(struct shmem_stage_data); ret = kmem_alloc(kernel_map, (vm_offset_t*) &stage->kos_data, stage->kos_data_size, - KMA_DATA, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (KERN_SUCCESS != ret) { return ret; } data = (struct shmem_stage_data*) stage->kos_data; + data->signal_done = false; data->seq_no = 0; data->contact_deadline = 0; - nanoseconds_to_absolutetime(KDP_HW_SHMEM_DBG_TIMEOUT_DEADLINE_SECS * NSEC_PER_SEC, &(data->contact_deadline_interval)); + nanoseconds_to_absolutetime(shmem_timeout_sec * NSEC_PER_SEC, &(data->contact_deadline_interval)); data->currently_filling_buf = NULL; data->currently_flushing_buf = NULL; diff --git a/osfmk/kdp/output_stages/out_zlib.c b/osfmk/kdp/output_stages/out_zlib.c index 19ef13626..36688be52 100644 --- a/osfmk/kdp/output_stages/out_zlib.c +++ b/osfmk/kdp/output_stages/out_zlib.c @@ -169,8 +169,8 @@ zlib_stream_output_chunk(struct kdp_output_stage *stage, unsigned int length, vo return stage->kos_outstate->kcos_error; } -static void -zlib_stage_reset(struct kdp_output_stage *stage) +static kern_return_t +zlib_stage_reset(struct kdp_output_stage *stage, __unused const char *corename, __unused kern_coredump_type_t coretype) { struct zlib_stage_data *data = (struct zlib_stage_data *) stage->kos_data; @@ -184,6 +184,8 @@ zlib_stage_reset(struct kdp_output_stage *stage) data->zs.next_out = NULL; deflateResetWithIO(&(data->zs), zlib_zinput, zlib_zoutput); + + return KERN_SUCCESS; } static kern_return_t @@ -200,6 +202,9 @@ zlib_stage_outproc(struct kdp_output_stage *stage, unsigned int request, case KDP_SEEK: stage->kos_bypass = true; err = next_stage->kos_funcs.kosf_outproc(next_stage, request, corename, length, panic_data); + if (KERN_SUCCESS != err) { + kern_coredump_log(NULL, "(%s) next stage output failed with error 0x%x\n", __func__, err); + } break; case KDP_DATA: if (!stage->kos_bypass) { @@ -211,6 +216,9 @@ zlib_stage_outproc(struct kdp_output_stage *stage, unsigned int request, } err = zlib_stream_output_chunk(stage, chunk, panic_data); + if (KERN_SUCCESS != err) { + kern_coredump_log(NULL, "(%s) zlib_stream_output_chunk failed with error 0x%x\n", __func__, err); + } length -= chunk; @@ -220,6 +228,9 @@ zlib_stage_outproc(struct kdp_output_stage *stage, unsigned int request, } while (length && (KERN_SUCCESS == err)); } else { err = next_stage->kos_funcs.kosf_outproc(next_stage, request, corename, length, panic_data); + if (KERN_SUCCESS != err) { + kern_coredump_log(NULL, "(%s) next stage output failed with error 0x%x\n", __func__, err); + } } break; case KDP_WRQ: @@ -228,6 +239,9 @@ 
zlib_stage_outproc(struct kdp_output_stage *stage, unsigned int request, OS_FALLTHROUGH; case KDP_EOF: err = next_stage->kos_funcs.kosf_outproc(next_stage, request, corename, length, panic_data); + if (KERN_SUCCESS != err) { + kern_coredump_log(NULL, "(%s) next stage output failed with error 0x%x\n", __func__, err); + } break; default: break; @@ -268,7 +282,7 @@ zlib_stage_initialize(struct kdp_output_stage *stage) * chances to have VA in catastrophic cases. */ ret = kmem_alloc(kernel_map, (vm_offset_t*) &stage->kos_data, stage->kos_data_size, - KMA_DATA, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG); if (KERN_SUCCESS != ret) { printf("zlib_stage_initialize failed to allocate memory. Error 0x%x\n", ret); return ret; diff --git a/osfmk/kdp/processor_core.c b/osfmk/kdp/processor_core.c index 01b8311cf..05415f6e9 100644 --- a/osfmk/kdp/processor_core.c +++ b/osfmk/kdp/processor_core.c @@ -53,14 +53,6 @@ typedef struct { } __attribute__((packed)) legacy_bin_spec; #define LEGACY_BIN_SPEC_VERSION 1 -__enum_closed_decl(kern_coredump_type_t, uint8_t, { - XNU_COREDUMP, - USERSPACE_COREDUMP, - COPROCESSOR_COREDUMP, - SECURE_COREDUMP, - NUM_COREDUMP_TYPES, -}); - static uint32_t bin_spec_map[NUM_COREDUMP_TYPES] = { [XNU_COREDUMP] = MAIN_BIN_SPEC_TYPE_KERNEL, [USERSPACE_COREDUMP] = MAIN_BIN_SPEC_TYPE_USER, @@ -104,6 +96,7 @@ typedef struct { uint64_t core_cur_foffset; /* Current offset in this core's overall file */ uint64_t core_header_size; /* Size of this core's header */ uint64_t core_total_bytes; /* Total amount of data to be included in this core (excluding zero fill) */ + const char *core_name; /* Name of corefile being produced */ } processor_core_context; /* @@ -471,7 +464,8 @@ coredump_save_summary(uint64_t core_segment_count, uint64_t core_byte_count, * Reset the zstream and other output context before writing any data out. We do this here * to update the total file length on the outvars before we start writing out. 
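With the stage reset callback now taking the corefile name and type and returning a kern_return_t (as shown for zlib_stage_reset above), a stage with no per-file work of its own can satisfy the new shape with a minimal callback. A sketch, with a hypothetical stage name:

/* Illustrative only: a near-no-op reset under the new signature. */
static kern_return_t
example_stage_reset(struct kdp_output_stage *stage,
    __unused const char *corename, __unused kern_coredump_type_t coretype)
{
	stage->kos_bypass = false;
	stage->kos_bytes_written = 0;

	return KERN_SUCCESS;
}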
*/ - ret = kdp_reset_output_vars(core_context->core_outvars, core_context->core_file_length, true, &should_skip); + ret = kdp_reset_output_vars(core_context->core_outvars, core_context->core_file_length, true, &should_skip, + core_context->core_name, core_context->core_type); if (ret != KERN_SUCCESS) { kern_coredump_log(context, "%s() : failed to reset the out vars : kdp_reset_output_vars(%p, %llu, true, %p) returned error 0x%x\n", __func__, core_context->core_outvars, core_context->core_file_length, &should_skip, ret); @@ -831,7 +825,7 @@ coredump_save_sw_vers(uint64_t address, uuid_t uuid, uint32_t log2_pagesize, voi } static kern_return_t -kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_core, uint64_t core_begin_offset, uint64_t *core_file_length, boolean_t *header_update_failed, kern_coredump_type_t type, uint64_t details_flags) +kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_core, uint64_t core_begin_offset, uint64_t *core_file_length, boolean_t *abort_on_failure, kern_coredump_type_t type, uint64_t details_flags) { #if CONFIG_CPU_COUNTERS uint64_t start_cycles; @@ -840,7 +834,6 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor kern_return_t ret; processor_core_context context = { }; *core_file_length = 0; - *header_update_failed = FALSE; #if CONFIG_CPU_COUNTERS start_cycles = mt_cur_cpu_cycles(); @@ -855,6 +848,7 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor context.core_cpu_type = current_core->kcc_cpu_type; context.core_cpu_subtype = current_core->kcc_cpu_subtype; context.core_type = type; + context.core_name = current_core->kcc_corename; kern_coredump_log(&context, "\nBeginning coredump of %s\n", current_core->kcc_corename); @@ -880,7 +874,7 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor /* Populate the context with metadata about the corefile (cmd info, sizes etc) */ ret = current_core->kcc_cb.kcc_coredump_get_summary(context.core_refcon, coredump_save_summary, &context); - if (ret != KERN_SUCCESS) { + if (ret != KERN_SUCCESS && ret != KERN_NODE_DOWN) { kern_coredump_log(&context, "(%s) : get_summary failed with %d\n", __func__, ret); return ret; } @@ -1034,7 +1028,7 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor /* If we're writing to disk (we have a begin offset), we need to update the header */ ret = kern_dump_record_file(context.core_outvars, current_core->kcc_corename, core_begin_offset, &context.core_file_length_compressed, details_flags); if (ret != KERN_SUCCESS) { - *header_update_failed = TRUE; + *abort_on_failure = TRUE; kern_coredump_log(&context, "\n(kern_coredump_routine) : kern_dump_record_file failed with %d\n", ret); return ret; } @@ -1051,11 +1045,10 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor * Collect coprocessor and userspace coredumps */ static kern_return_t -kern_do_auxiliary_coredump(void * core_outvars, struct kern_coredump_core * list, uint64_t * last_file_offset, uint64_t details_flags) +kern_do_auxiliary_coredump(void * core_outvars, struct kern_coredump_core * list, uint64_t * last_file_offset, uint64_t details_flags, boolean_t *abort_on_failure) { struct kern_coredump_core *current_core = list; uint64_t prev_core_length = 0; - boolean_t header_update_failed = FALSE; kern_coredump_type_t type = current_core == kern_userspace_coredump_core_list ? 
USERSPACE_COREDUMP : COPROCESSOR_COREDUMP; kern_return_t ret = KERN_SUCCESS; kern_return_t cur_ret = KERN_SUCCESS; @@ -1074,11 +1067,10 @@ kern_do_auxiliary_coredump(void * core_outvars, struct kern_coredump_core * list return KERN_FAILURE; } - cur_ret = kern_coredump_routine(core_outvars, current_core, *last_file_offset, &prev_core_length, &header_update_failed, type, details_flags); + cur_ret = kern_coredump_routine(core_outvars, current_core, *last_file_offset, &prev_core_length, abort_on_failure, type, details_flags); if (cur_ret != KERN_SUCCESS) { - // As long as we didn't fail while updating the header for the raw file, we should be able to try - // to capture other corefiles. - if (header_update_failed) { + // Fail early without trying remaing corefiles when requested. + if (*abort_on_failure) { // The header may be in an inconsistent state, so bail now return KERN_FAILURE; } else { @@ -1099,21 +1091,21 @@ kern_do_auxiliary_coredump(void * core_outvars, struct kern_coredump_core * list } kern_return_t -kern_do_coredump(void *core_outvars, boolean_t kernel_only, uint64_t first_file_offset, uint64_t *last_file_offset, uint64_t details_flags) +kern_do_coredump(void *core_outvars, kern_coredump_flags_t flags, uint64_t first_file_offset, uint64_t *last_file_offset, uint64_t details_flags) { uint64_t prev_core_length = 0; kern_return_t cur_ret = KERN_SUCCESS, ret = KERN_SUCCESS; - boolean_t header_update_failed = FALSE; + boolean_t abort_dump = flags & KCF_ABORT_ON_FAILURE; assert(last_file_offset != NULL); *last_file_offset = first_file_offset; - cur_ret = kern_coredump_routine(core_outvars, kernel_helper, *last_file_offset, &prev_core_length, &header_update_failed, XNU_COREDUMP, details_flags); + cur_ret = kern_coredump_routine(core_outvars, kernel_helper, *last_file_offset, &prev_core_length, &abort_dump, XNU_COREDUMP, details_flags); if (cur_ret != KERN_SUCCESS) { // As long as we didn't fail while updating the header for the raw file, we should be able to try // to capture other corefiles. - if (header_update_failed) { + if (abort_dump) { // The header may be in an inconsistent state, so bail now return KERN_FAILURE; } else { @@ -1124,7 +1116,7 @@ kern_do_coredump(void *core_outvars, boolean_t kernel_only, uint64_t first_file_ *last_file_offset = roundup(((*last_file_offset) + prev_core_length), KERN_COREDUMP_BEGIN_FILEBYTES_ALIGN); - if (kernel_only) { + if (flags & KCF_KERNEL_ONLY) { return ret; } @@ -1138,9 +1130,9 @@ kern_do_coredump(void *core_outvars, boolean_t kernel_only, uint64_t first_file_ } /* Dump the secure core to disk. 
*/ - cur_ret = kern_coredump_routine(core_outvars, sk_helper, *last_file_offset, &prev_core_length, &header_update_failed, SECURE_COREDUMP, details_flags); + cur_ret = kern_coredump_routine(core_outvars, sk_helper, *last_file_offset, &prev_core_length, &abort_dump, SECURE_COREDUMP, details_flags); if (cur_ret != KERN_SUCCESS) { - if (header_update_failed) { + if (abort_dump) { return KERN_FAILURE; } else { prev_core_length = 0; @@ -1152,13 +1144,13 @@ kern_do_coredump(void *core_outvars, boolean_t kernel_only, uint64_t first_file_ } // Collect coprocessor coredumps first, in case userspace coredumps fail - ret = kern_do_auxiliary_coredump(core_outvars, kern_coredump_core_list, last_file_offset, details_flags); + ret = kern_do_auxiliary_coredump(core_outvars, kern_coredump_core_list, last_file_offset, details_flags, &abort_dump); if (ret != KERN_SUCCESS) { kern_coredump_log(NULL, "Failed to dump coprocessor cores\n"); return ret; } - ret = kern_do_auxiliary_coredump(core_outvars, kern_userspace_coredump_core_list, last_file_offset, details_flags); + ret = kern_do_auxiliary_coredump(core_outvars, kern_userspace_coredump_core_list, last_file_offset, details_flags, &abort_dump); if (ret != KERN_SUCCESS) { kern_coredump_log(NULL, "Failed to dump userspace process cores\n"); return ret; diff --git a/osfmk/kdp/processor_core.h b/osfmk/kdp/processor_core.h index e33b4e022..572d0d5bd 100644 --- a/osfmk/kdp/processor_core.h +++ b/osfmk/kdp/processor_core.h @@ -239,7 +239,13 @@ kern_return_t kern_register_sk_coredump_helper(kern_coredump_callback_config *kc kern_return_t kern_register_userspace_coredump(task_t task, const char * name); kern_return_t kern_unregister_userspace_coredump(task_t task); -kern_return_t kern_do_coredump(void *core_outvars, boolean_t kernel_only, uint64_t first_file_offset, uint64_t *last_file_offset, uint64_t details_flags); +__options_closed_decl(kern_coredump_flags_t, uint64_t, { + KCF_NONE = 0, + KCF_KERNEL_ONLY = (1 << 0), + KCF_ABORT_ON_FAILURE = (1 << 1) +}); + +kern_return_t kern_do_coredump(void *core_outvars, kern_coredump_flags_t flags, uint64_t first_file_offset, uint64_t *last_file_offset, uint64_t details_flags); #define KERN_COREDUMP_MAXDEBUGLOGSIZE 16384 #define KERN_COREDUMP_BEGIN_FILEBYTES_ALIGN 4096 @@ -247,6 +253,15 @@ kern_return_t kern_do_coredump(void *core_outvars, boolean_t kernel_only, uint64 #if XNU_KERNEL_PRIVATE +__enum_closed_decl(kern_coredump_type_t, uint8_t, { + XNU_COREDUMP, + USERSPACE_COREDUMP, + COPROCESSOR_COREDUMP, + SECURE_COREDUMP, + RAW_COREDUMP, + NUM_COREDUMP_TYPES, +}); + struct kern_userspace_coredump_context { /* Task to dump */ task_t task; diff --git a/osfmk/kdp/sk_core.c b/osfmk/kdp/sk_core.c index ba195edde..465f6d030 100644 --- a/osfmk/kdp/sk_core.c +++ b/osfmk/kdp/sk_core.c @@ -522,6 +522,7 @@ sk_dump_init(void *refcon, void *context) /* validate debug signpost and discover UUID. 
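The boolean_t kernel_only parameter is replaced by a flags word, so callers express both the scope of the dump and the failure policy in one argument. A sketch of the call-site change (outvars, first_offset and details are placeholders for the caller's existing values):

kern_return_t kr;
uint64_t last_offset = 0;

/* Before: kern_do_coredump(outvars, TRUE, first_offset, &last_offset, details); */

/* Kernel core only, keep trying later corefiles on a per-core failure: */
kr = kern_do_coredump(outvars, KCF_KERNEL_ONLY, first_offset, &last_offset, details);

/* Abort the whole dump as soon as any corefile fails: */
kr = kern_do_coredump(outvars, KCF_KERNEL_ONLY | KCF_ABORT_ON_FAILURE,
    first_offset, &last_offset, details);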
*/ struct secure_core_context *scc = (struct secure_core_context *)refcon; + bzero(scc, sizeof(struct secure_core_context)); scc->scc_uuid = sc_find_uuid_cdbg(); if (scc->scc_uuid == NULL) { diff --git a/osfmk/kern/Makefile b/osfmk/kern/Makefile index 5c1744cbd..b5977304c 100644 --- a/osfmk/kern/Makefile +++ b/osfmk/kern/Makefile @@ -46,6 +46,7 @@ PRIVATE_DATAFILES = $(sort \ trustcache.h \ turnstile.h \ socd_client.h \ + kcdata_private.h \ $(EXTRA_PRIVATE_DATAFILES)) PRIVATE_MODULEMAPFILES = $(sort \ @@ -124,7 +125,6 @@ PRIVATE_EXPORT_FILES = \ copyout_shim.h \ kern_apfs_reflock.h \ mach_filter.h \ - mach_node_link.h \ sched_clutch.h \ smr.h \ smr_hash.h \ @@ -145,17 +145,17 @@ XNU_ONLY_EXPORTS = \ recount.h \ sched_hygiene.h \ sync_sema.h \ + upsi.h \ ux_handler.h \ workload_config.h \ kern_stackshot.h \ thread_test_context.h - INSTALL_MI_LIST = ${DATAFILES} INSTALL_MODULEMAP_MI_LIST = ${MODULEMAPFILES} -INSTALL_MI_LCL_LIST = $(sort cs_blobs.h debug.h panic_call.h ext_paniclog.h ${EXTRA_PRIVATE_DATAFILES}) +INSTALL_MI_LCL_LIST = $(sort cs_blobs.h debug.h panic_call.h ext_paniclog.h kcdata_private.h ${EXTRA_PRIVATE_DATAFILES}) INSTALL_MODULEMAP_MI_LCL_LIST = ${PRIVATE_MODULEMAPFILES} @@ -188,8 +188,11 @@ TIGHTBEAM_MODULES := \ Stackshot.tbmodule \ SharedMemoryBase.tbmodule \ ExclavesCHelloServer.tbmodule \ + HelloDriverInterrupts.tbmodule \ StackshotPanicSetup.tbmodule \ - ExclaveAudioArbiter.tbmodule + ExclaveAudioArbiter.tbmodule \ + ExclavesMessageQueueTypes.tbmodule \ + ExclavesMessageQueueProxy.tbmodule endif diff --git a/osfmk/kern/arcade.c b/osfmk/kern/arcade.c index a4a4aefe2..a571671fb 100644 --- a/osfmk/kern/arcade.c +++ b/osfmk/kern/arcade.c @@ -75,6 +75,7 @@ struct arcade_register { typedef struct arcade_register *arcade_register_t; IPC_KOBJECT_DEFINE(IKOT_ARCADE_REG, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_permanent = true); diff --git a/osfmk/kern/assert.h b/osfmk/kern/assert.h index fcbeeb9a6..4224c9d0c 100644 --- a/osfmk/kern/assert.h +++ b/osfmk/kern/assert.h @@ -100,23 +100,33 @@ __enum_decl(mach_assert_type_t, unsigned char, { MACH_ASSERT_3U, }); +#ifndef __BUILDING_XNU_LIBRARY__ +#define MACH_ASSERT_DESC_ALIGN __attribute__((packed, aligned(4))) +#else /* __BUILDING_XNU_LIBRARY__ */ +/* The assert __desc struct is packed to 4 bytes to save stack usage. 
+ * This is not done in user build since there is some difference between the + * user-mode linker and the kernel linker which causes this to produce + * unaligned pointer exception */ +#define MACH_ASSERT_DESC_ALIGN +#endif /* __BUILDING_XNU_LIBRARY__ */ + struct mach_assert_hdr { mach_assert_type_t type; unsigned lineno : 24; const char *filename; -} __attribute__((packed, aligned(4))); +} MACH_ASSERT_DESC_ALIGN; struct mach_assert_default { struct mach_assert_hdr hdr; const char *expr; -} __attribute__((packed, aligned(4))); +} MACH_ASSERT_DESC_ALIGN; struct mach_assert_3x { struct mach_assert_hdr hdr; const char *a; const char *op; const char *b; -} __attribute__((packed, aligned(4))); +} MACH_ASSERT_DESC_ALIGN; #if MACH_ASSERT # if XNU_KERNEL_PRIVATE @@ -151,10 +161,21 @@ STATIC_IF_KEY_DECLARE_TRUE(mach_assert); { MACH_ASSERT_DEFAULT, __LINE__, __FILE_NAME__, }, \ reason, \ }; \ - \ + \ ml_fatal_trap_with_value(MACH_ASSERT_TRAP_CODE, &__desc); \ }) +#define mach_assert_abort3x(how, s_a, s_op, s_b, v_a, v_b) ({ \ + __attribute__((used, section(MACH_ASSERT_SEGSECT))) \ + static const struct mach_assert_3x __desc_ ## how = { \ + { MACH_ASSERT_ ## how, __LINE__, __FILE_NAME__, }, \ + s_a, s_op, s_b, \ + }; \ + \ + ml_fatal_trap_with_value3(MACH_ASSERT_TRAP_CODE, \ + &__desc_ ## how, v_a, v_b); \ +}) + /*! * @abstract * assert() that is never elided or removed even in release builds. @@ -191,58 +212,40 @@ STATIC_IF_KEY_DECLARE_TRUE(mach_assert); * assert(a > b) -> file.c:123 Assertion failed: a > b * assert3u(a, >, b) -> file.c:124 Assertion failed: a > b (1 >= 10) * + * These macros define a local variable with name starting with __desc which + * contain the assert info and then call the brk instruction. The trap + * is then handled and panic_assert_format() is called to parse this struct. 
*/ -#define assert3u(a, op, b) ({ \ +#define assert3u(a, op, b) ({ \ if (mach_assert_enabled_expr((unsigned long long)(a) op \ (unsigned long long)(b))) { \ const unsigned long long a_ = (a); \ const unsigned long long b_ = (b); \ - \ + \ if (__builtin_expect(!(a_ op b_), 0L)) { \ - __attribute__((used, section(MACH_ASSERT_SEGSECT))) \ - static const struct mach_assert_3x __desc3u = { \ - { MACH_ASSERT_3U, __LINE__, __FILE_NAME__, }, \ - #a, #op, #b, \ - }; \ - \ - ml_fatal_trap_with_value3(MACH_ASSERT_TRAP_CODE, \ - &__desc3u, a_, b_); \ + mach_assert_abort3x(3U, #a, #op, #b, a_, b_); \ } \ } \ }) -#define assert3s(a, op, b) ({ \ +#define assert3s(a, op, b) ({ \ if (mach_assert_enabled_expr((long long)(a) op ((long long)b))) { \ const signed long long a_ = (a); \ const signed long long b_ = (b); \ - \ + \ if (__builtin_expect(!(a_ op b_), 0L)) { \ - __attribute__((used, section(MACH_ASSERT_SEGSECT))) \ - static const struct mach_assert_3x __desc3s = { \ - { MACH_ASSERT_3S, __LINE__, __FILE_NAME__, }, \ - #a, #op, #b, \ - }; \ - \ - ml_fatal_trap_with_value3(MACH_ASSERT_TRAP_CODE, \ - &__desc3s, a_, b_); \ + mach_assert_abort3x(3S, #a, #op, #b, a_, b_); \ } \ } \ }) -#define assert3p(a, op, b) ({ \ +#define assert3p(a, op, b) ({ \ if (mach_assert_enabled_expr((const void *)(a) op (const void *)(b))) { \ const void *a_ = (a); \ const void *b_ = (b); \ - \ + \ if (__builtin_expect(!(a_ op b_), 0L)) { \ - __attribute__((used, section(MACH_ASSERT_SEGSECT))) \ - static const struct mach_assert_3x __desc3p = { \ - { MACH_ASSERT_3P, __LINE__, __FILE_NAME__, }, \ - #a, #op, #b, \ - }; \ - \ - ml_fatal_trap_with_value3(MACH_ASSERT_TRAP_CODE, \ - &__desc3p, a_, b_); \ + mach_assert_abort3x(3P, #a, #op, #b, a_, b_); \ } \ } \ }) diff --git a/osfmk/kern/ast.c b/osfmk/kern/ast.c index 0b2206ec0..cb14d5ad7 100644 --- a/osfmk/kern/ast.c +++ b/osfmk/kern/ast.c @@ -80,6 +80,7 @@ #include #endif + static void __attribute__((noinline, noreturn, disable_tail_calls)) thread_preempted(__unused void* parameter, __unused wait_result_t result) { @@ -244,6 +245,7 @@ ast_taken_user(void) thread_apc_ast(thread); } + if (reasons & AST_MACH_EXCEPTION) { thread_ast_clear(thread, AST_MACH_EXCEPTION); mach_exception_ast(thread); @@ -379,6 +381,10 @@ ast_taken_user(void) TH_SFLAG_EXEC_PROMOTED | TH_SFLAG_FLOOR_PROMOTED | TH_SFLAG_DEPRESS)); + +#if CONFIG_EXCLAVES + assert3u(thread->options & TH_OPT_AOE, ==, 0); +#endif /* CONFIG_EXCLAVES */ } /* diff --git a/osfmk/kern/ast.h b/osfmk/kern/ast.h index 2e7d10f69..000646094 100644 --- a/osfmk/kern/ast.h +++ b/osfmk/kern/ast.h @@ -132,6 +132,7 @@ __options_decl(ast_t, uint32_t, { AST_PROC_RESOURCE = 0x400000, /* port space and/or file descriptor table has reached its limits */ AST_DEBUG_ASSERT = 0x800000, /* check debug assertion */ AST_TELEMETRY_MACF = 0x1000000, /* telemetry sample requested by MAC framework */ + AST_SYNTHESIZE_MACH = 0x2000000, }); #define AST_NONE 0x00 @@ -145,7 +146,7 @@ __options_decl(ast_t, uint32_t, { /* Per-thread ASTs follow the thread at context-switch time. 
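For reference, the shared mach_assert_abort3x() helper means the three assert3* macros above now differ only in their comparison types; conceptually, assert3u(len, <=, cap) expands to roughly the following (simplified, with the temporaries written out):

if (__builtin_expect(!(len_ <= cap_), 0L)) {
	__attribute__((used, section(MACH_ASSERT_SEGSECT)))
	static const struct mach_assert_3x __desc_3U = {
		{ MACH_ASSERT_3U, __LINE__, __FILE_NAME__, },
		"len", "<=", "cap",
	};

	ml_fatal_trap_with_value3(MACH_ASSERT_TRAP_CODE, &__desc_3U, len_, cap_);
}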
*/ #define AST_PER_THREAD (AST_APC | AST_BSD | AST_MACF | AST_RESET_PCS | \ - AST_ARCADE | AST_LEDGER | AST_MACH_EXCEPTION | AST_TELEMETRY_ALL | AST_KEVENT | AST_PROC_RESOURCE | AST_DEBUG_ASSERT) + AST_ARCADE | AST_LEDGER | AST_MACH_EXCEPTION | AST_SYNTHESIZE_MACH | AST_TELEMETRY_ALL | AST_KEVENT | AST_PROC_RESOURCE | AST_DEBUG_ASSERT) /* Handle AST_URGENT detected while in the kernel */ extern void ast_taken_kernel(void); diff --git a/osfmk/kern/audit_sessionport.c b/osfmk/kern/audit_sessionport.c index e29aa5307..2b05f950e 100644 --- a/osfmk/kern/audit_sessionport.c +++ b/osfmk/kern/audit_sessionport.c @@ -59,7 +59,7 @@ audit_session_mksend(struct auditinfo_addr *aia_p, ipc_port_t *sessionport) { audit_session_aiaref(aia_p); if (!ipc_kobject_make_send_lazy_alloc_port(sessionport, - aia_p, IKOT_AU_SESSIONPORT, IPC_KOBJECT_ALLOC_NONE)) { + aia_p, IKOT_AU_SESSIONPORT)) { audit_session_aiaunref(aia_p); } @@ -129,11 +129,13 @@ audit_session_portdestroy(ipc_port_t *sessionport) *sessionport = IP_NULL; if (IP_VALID(port)) { - ipc_kobject_dealloc_port(port, 0, IKOT_AU_SESSIONPORT); + ipc_kobject_dealloc_port(port, IPC_KOBJECT_NO_MSCOUNT, + IKOT_AU_SESSIONPORT); } } IPC_KOBJECT_DEFINE(IKOT_AU_SESSIONPORT, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = audit_session_no_senders); diff --git a/osfmk/kern/backtrace.c b/osfmk/kern/backtrace.c index 1a1cd0782..05e612adf 100644 --- a/osfmk/kern/backtrace.c +++ b/osfmk/kern/backtrace.c @@ -44,6 +44,7 @@ #include #endif // defined(HAS_APPLE_PAC) + #if __x86_64__ static void _backtrace_packed_out_of_reach(void) @@ -363,7 +364,8 @@ backtrace_unpack(backtrace_pack_t packing, uintptr_t *dst, unsigned int dst_len, static errno_t _backtrace_copyin(void * __unused ctx, void *dst, user_addr_t src, size_t size) { - return copyin((user_addr_t)src, dst, size); + int error = copyin((user_addr_t)src, dst, size); + return error; } errno_t @@ -384,7 +386,6 @@ backtrace_user(uintptr_t *bt, unsigned int max_frames, const struct backtrace_control *ctl = ctl_in ?: &ctl_default; uintptr_t pc = 0, next_fp = 0; uintptr_t fp = ctl->btc_frame_addr; - bool custom_fp = fp != 0; int64_t addr_offset = ctl ? ctl->btc_addr_offset : 0; vm_map_t map = NULL; vm_map_switch_context_t switch_ctx; @@ -416,15 +417,18 @@ backtrace_user(uintptr_t *bt, unsigned int max_frames, assert(max_frames > 0); if (!custom_copy) { - assert(ml_get_interrupts_enabled() == TRUE); - if (!ml_get_interrupts_enabled()) { + bool interrupts_enabled = ml_get_interrupts_enabled(); + assert(interrupts_enabled); + if (!interrupts_enabled) { error = EDEADLK; + goto out; } if (cur_thread == NULL) { cur_thread = current_thread(); } - if (thread != cur_thread) { + bool const must_switch_maps = thread != cur_thread; + if (must_switch_maps) { map = get_task_map_reference(task); if (map == NULL) { error = ENOMEM; @@ -565,10 +569,10 @@ backtrace_user(uintptr_t *bt, unsigned int max_frames, break; } - // Stacks grow down; backtracing should be moving to higher addresses, - // unless a custom frame pointer is provided, in which case, an async - // stack might be walked, which is allocated on the heap in any order. - if ((next_fp == fp) || (!custom_fp && next_fp < fp)) { + // User space stacks generally grow down, but in some cases can jump to a different stack. + // Skip the check that the frame pointer moves downward here. 
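For context, the user backtrace walks a chain of saved frame records via copyin; with the downward-growth check removed, the walk terminates only on an unreadable frame, a NULL or self-referencing frame pointer, or the frame limit. A conceptual sketch of such a walk (not the xnu implementation; the frame layout and names are simplified and hypothetical):

static unsigned int
example_walk_user_fp_chain(uintptr_t fp, uintptr_t *bt, unsigned int max_frames)
{
	struct user_frame { uintptr_t prev_fp; uintptr_t ret_pc; };
	unsigned int n = 0;

	while (n < max_frames && fp != 0) {
		struct user_frame frame;

		if (copyin((user_addr_t)fp, &frame, sizeof(frame)) != 0) {
			break;              /* unreadable frame */
		}
		bt[n++] = frame.ret_pc;
		if (frame.prev_fp == fp) {
			break;              /* self-referencing frame: avoid looping forever */
		}
		fp = frame.prev_fp;         /* may move up or down (e.g. async stacks) */
	}

	return n;
}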
+ + if (next_fp == fp) { break; } fp = next_fp; diff --git a/osfmk/kern/bits.h b/osfmk/kern/bits.h index eb36b6076..d7aec266d 100644 --- a/osfmk/kern/bits.h +++ b/osfmk/kern/bits.h @@ -56,6 +56,7 @@ typedef unsigned int uint; #define extract(x, shift, width) ((((uint64_t)(x)) >> (shift)) & mask(width)) #define bits(x, hi, lo) extract((x), (lo), (hi) - (lo) + 1) +#define bit_assign(x, b, e) ((x) = (((x) & ~BIT(b))) | ((((uint64_t) (!!(e)))) << (b))) #define bit_set(x, b) ((x) |= BIT(b)) #define bit_clear(x, b) ((x) &= ~BIT(b)) #define bit_test(x, b) ((bool)((x) & BIT(b))) @@ -63,26 +64,13 @@ typedef unsigned int uint; inline static uint64_t bit_ror64(uint64_t bitmap, uint n) { -#if defined(__arm64__) - uint64_t result; - uint64_t _n = (uint64_t)n; - asm volatile ("ror %0, %1, %2" : "=r" (result) : "r" (bitmap), "r" (_n)); - return result; -#else - n = n & 63; - return (bitmap >> n) | (bitmap << (64 - n)); -#endif + return __builtin_rotateright64(bitmap, n); } inline static uint64_t bit_rol64(uint64_t bitmap, uint n) { -#if defined(__arm64__) - return bit_ror64(bitmap, 64U - n); -#else - n = n & 63; - return (bitmap << n) | (bitmap >> (64 - n)); -#endif + return __builtin_rotateleft64(bitmap, n); } /* Non-atomically clear the bit and returns whether the bit value was changed */ @@ -105,17 +93,16 @@ bit_rol64(uint64_t bitmap, uint n) !_bit_is_set; \ }) +/* + * Note on bit indexing: bit indices are offsets from the least significant bit. + * So the bit at index `i` would be found by `1 & (bitmap >> i)`. + */ + /* Returns the most significant '1' bit, or -1 if all zeros */ inline static int bit_first(uint64_t bitmap) { -#if defined(__arm64__) - int64_t result; - asm volatile ("clz %0, %1" : "=r" (result) : "r" (bitmap)); - return 63 - (int)result; -#else - return (bitmap == 0) ? 
-1 : 63 - __builtin_clzll(bitmap); -#endif + return 63 - __builtin_clzg(bitmap, 64); } @@ -144,7 +131,7 @@ bit_next(uint64_t bitmap, int previous_bit) inline static int lsb_first(uint64_t bitmap) { - return __builtin_ffsll((long long)bitmap) - 1; + return __builtin_ctzg(bitmap, -1); } /* Returns the least significant '1' bit that is more significant than previous_bit, diff --git a/osfmk/kern/block_hint.h b/osfmk/kern/block_hint.h index 332c42899..fdd5dda1c 100644 --- a/osfmk/kern/block_hint.h +++ b/osfmk/kern/block_hint.h @@ -29,6 +29,13 @@ #ifndef _KERN_BLOCK_HINT_H_ #define _KERN_BLOCK_HINT_H_ +#include +#ifdef XNU_KERNEL_PRIVATE +#include +#endif + +__BEGIN_DECLS + typedef enum thread_snapshot_wait_flags { kThreadWaitNone = 0x00, kThreadWaitKernelMutex = 0x01, @@ -100,4 +107,6 @@ extern void kdp_esync_find_owner(struct waitq *waitq, event64_t event, thread_wa #endif /* XNU_KERNEL_PRIVATE */ +__END_DECLS + #endif /* !_KERN_BLOCK_HINT_H_ */ diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index faa8aaede..6b4ae7181 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -25,8 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include "mach/arm/vm_param.h" -#include "mach/kern_return.h" + #include #include #include @@ -41,7 +40,7 @@ #include #include #include -#include +#include #include #include #include @@ -438,8 +437,8 @@ swap_task_map(task_t task, thread_t thread, vm_map_t map) task_lock(task); mp_disable_preemption(); - /* verify that the map has been activated if the task is enabled for IPC access */ - assert(!task->ipc_active || (map->owning_task == task)); + /* verify that the map has been activated if the task is enabled for IPC access and is not a corpse */ + assert(!task->ipc_active || task_is_a_corpse(task) || (map->owning_task == task)); old_map = task->map; thread->map = task->map = map; @@ -910,9 +909,12 @@ get_vmsubmap_entries( int total_entries = 0; vm_map_entry_t entry; + vmlp_api_start(GET_VMSUBMAP_ENTRIES); + if (not_in_kdp) { - vm_map_lock(map); + vm_map_lock_read(map); } + vmlp_range_event(map, start, end - start); entry = vm_map_first_entry(map); while ((entry != vm_map_to_entry(map)) && (entry->vme_start < start)) { entry = entry->vme_next; @@ -932,8 +934,9 @@ get_vmsubmap_entries( entry = entry->vme_next; } if (not_in_kdp) { - vm_map_unlock(map); + vm_map_unlock_read(map); } + vmlp_api_end(GET_VMSUBMAP_ENTRIES, total_entries); return total_entries; } @@ -944,12 +947,15 @@ get_vmmap_entries( int total_entries = 0; vm_map_entry_t entry; + vmlp_api_start(GET_VMMAP_ENTRIES); + if (not_in_kdp) { - vm_map_lock(map); + vm_map_lock_read(map); } entry = vm_map_first_entry(map); while (entry != vm_map_to_entry(map)) { + vmlp_range_event_entry(map, entry); if (entry->is_sub_map) { total_entries += get_vmsubmap_entries(VME_SUBMAP(entry), @@ -963,8 +969,9 @@ get_vmmap_entries( entry = entry->vme_next; } if (not_in_kdp) { - vm_map_unlock(map); + vm_map_unlock_read(map); } + vmlp_api_end(GET_VMMAP_ENTRIES, total_entries); return total_entries; } #endif /* CONFIG_COREDUMP */ diff --git a/osfmk/kern/btlog.h b/osfmk/kern/btlog.h index 630d28f8d..32822775d 100644 --- a/osfmk/kern/btlog.h +++ b/osfmk/kern/btlog.h @@ -39,7 +39,7 @@ __BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN -#pragma GCC visibility push(hidden) +__exported_push_hidden /* * The btlog subsystem allows for fast unobtrusive backtraces @@ -315,7 +315,7 @@ extern void btlog_copy_backtraces_for_elements( leak_site_proc proc); #endif /* DEBUG || DEVELOPMENT */ -#pragma GCC visibility pop 
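The rewritten bits.h helpers keep their previous semantics (the zero-input cases now come from the generic builtins' fallback arguments), and the new bit_assign() sets or clears a bit based on a boolean expression. A few expected values, as an illustrative sketch:

/* Illustrative only: expected behavior of the helpers above. */
static void
example_bits_semantics(void)
{
	uint64_t map = 0;

	bit_assign(map, 5, true);       /* map == 0x20 */
	bit_assign(map, 5, 0);          /* map == 0x00 */

	assert(bit_first(0x80) == 7);
	assert(bit_first(0) == -1);     /* __builtin_clzg(0, 64) falls back to 64 */
	assert(lsb_first(0x18) == 3);
	assert(lsb_first(0) == -1);     /* __builtin_ctzg(0, -1) falls back to -1 */

	assert(bit_ror64(0x1, 1) == 0x8000000000000000ULL);
	assert(bit_rol64(0x8000000000000000ULL, 1) == 0x1);
}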
+__exported_pop __ASSUME_PTR_ABI_SINGLE_END __END_DECLS diff --git a/osfmk/kern/coalition.c b/osfmk/kern/coalition.c index f99060e28..c60f8beba 100644 --- a/osfmk/kern/coalition.c +++ b/osfmk/kern/coalition.c @@ -807,18 +807,23 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us int64_t energy_billed_to_me = 0; int64_t energy_billed_to_others = 0; int64_t phys_footprint = 0; + int64_t swapins = 0; struct recount_usage stats_sum = { 0 }; struct recount_usage stats_perf_only = { 0 }; recount_coalition_usage_perf_only(&coal->r.co_recount, &stats_sum, &stats_perf_only); uint64_t cpu_time_eqos[COALITION_NUM_THREAD_QOS_TYPES] = { 0 }; uint64_t cpu_time_rqos[COALITION_NUM_THREAD_QOS_TYPES] = { 0 }; + memcpy(cpu_time_eqos, &coal->r.cpu_time_eqos, sizeof(cpu_time_eqos)); + memcpy(cpu_time_rqos, &coal->r.cpu_time_rqos, sizeof(cpu_time_rqos)); /* * Add to that all the active tasks' ledgers. Tasks cannot deallocate * out from under us, since we hold the coalition lock. */ task_t task; qe_foreach_element(task, &coal->r.tasks, task_coalition[COALITION_TYPE_RESOURCE]) { + int64_t task_phys_footprint = 0; + /* * Rolling up stats for exec copy task or exec'd task will lead to double accounting. * Cannot take task lock after taking coaliton lock @@ -846,6 +851,14 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us fs_metadata_writes += task->task_fs_metadata_writes; #endif /* CONFIG_PHYS_WRITE_ACCT */ + /* The exited process ledger can have a phys_footprint balance, which should be ignored. */ + kr = ledger_get_balance(task->ledger, task_ledgers.phys_footprint, (int64_t *)&task_phys_footprint); + if (kr != KERN_SUCCESS || task_phys_footprint < 0) { + task_phys_footprint = 0; + } + + phys_footprint += task_phys_footprint; + task_update_cpu_time_qos_stats(task, cpu_time_eqos, cpu_time_rqos); recount_task_usage_perf_only(task, &stats_sum, &stats_perf_only); } @@ -875,9 +888,9 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us energy_billed_to_others = 0; } - kr = ledger_get_balance(sum_ledger, task_ledgers.phys_footprint, (int64_t *)&phys_footprint); - if (kr != KERN_SUCCESS || phys_footprint < 0) { - phys_footprint = 0; + kr = ledger_get_balance(sum_ledger, task_ledgers.swapins, (int64_t *)&swapins); + if (kr != KERN_SUCCESS || swapins < 0) { + swapins = 0; } /* collect information from the coalition itself */ @@ -906,6 +919,7 @@ coalition_resource_usage_internal(coalition_t coal, struct coalition_resource_us cru_out->energy_billed_to_me = (uint64_t)energy_billed_to_me; cru_out->energy_billed_to_others = (uint64_t)energy_billed_to_others; cru_out->phys_footprint = phys_footprint; + cru_out->swapins = swapins; kr = ledger_get_entries(sum_ledger, task_ledgers.interrupt_wakeups, &credit, &debit); diff --git a/osfmk/kern/compact_id.h b/osfmk/kern/compact_id.h index 995d1670f..4d499f21b 100644 --- a/osfmk/kern/compact_id.h +++ b/osfmk/kern/compact_id.h @@ -36,7 +36,7 @@ #include __BEGIN_DECLS -#pragma GCC visibility push(hidden) +__exported_push_hidden #define COMPACT_ID_SHIFT_BASE (10) #define COMPACT_ID_COUNT_BASE (1u << COMPACT_ID_SHIFT_BASE) @@ -149,7 +149,7 @@ extern void compact_id_table_unlock( }; \ STARTUP_ARG(LOCKS, STARTUP_RANK_THIRD, compact_id_table_init, &var) -#pragma GCC visibility pop +__exported_pop __END_DECLS #endif /* _KERN_COMPACT_ID_H_ */ diff --git a/osfmk/kern/cpu_data.h b/osfmk/kern/cpu_data.h index 20db14dda..06b50e262 100644 --- a/osfmk/kern/cpu_data.h +++ b/osfmk/kern/cpu_data.h @@ -57,8 
+57,8 @@ extern void _enable_preemption(void); #if SCHED_HYGIENE_DEBUG #define SCHED_HYGIENE_MARKER (1u << 31) -extern void abandon_preemption_disable_measurement(void); #endif +extern void abandon_preemption_disable_measurement(void); __END_DECLS diff --git a/osfmk/kern/cs_blobs.h b/osfmk/kern/cs_blobs.h index 305e33887..9d36c1af4 100644 --- a/osfmk/kern/cs_blobs.h +++ b/osfmk/kern/cs_blobs.h @@ -314,4 +314,16 @@ struct launch_constraint_data { }; typedef struct launch_constraint_data* launch_constraint_data_t; +/* + * Ideally, this definition should live within sys/codesign.h, but adding it there + * causes an issue with compiling certain Swift projects due to some ambigious macro + * definition error on CD_CDHASH_LEN. Thus, we keep it here. + * + * For more information: rdar://145187726. + */ +typedef struct _csops_cdhash { + uint8_t hash[CS_CDHASH_LEN]; + uint8_t type; +} csops_cdhash_t; + #endif /* _KERN_CODESIGN_H */ diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 331291bc4..dfe70a797 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -87,6 +87,7 @@ #include #include #include +#include #if !(MACH_KDP && CONFIG_KDP_INTERACTIVE_DEBUGGING) #include @@ -151,7 +152,6 @@ extern int vsnprintf(char *, size_t, const char *, va_list); extern int IODTGetLoaderInfo( const char *key, void **infoAddr, int *infosize ); extern void IODTFreeLoaderInfo( const char *key, void *infoAddr, int infoSize ); extern unsigned int debug_boot_arg; -extern int serial_init(void); unsigned int halt_in_debugger = 0; unsigned int current_debugger = 0; @@ -325,7 +325,7 @@ boolean_t extended_debug_log_enabled = FALSE; #define KDBG_TRACE_PANIC_FILENAME "/var/log/panic.trace" #endif -static inline void debug_fatal_panic_begin(void); +static inline boolean_t debug_fatal_panic_begin(void); /* Debugger state */ atomic_int debugger_cpu = DEBUGGER_NO_CPU; @@ -375,10 +375,6 @@ int kext_assertions_enable = FALSE; #endif -#if (DEVELOPMENT || DEBUG) -uint64_t xnu_platform_stall_value = PLATFORM_STALL_XNU_DISABLE; -#endif - /* * Maintain the physically-contiguous carveouts for the carveout bootargs. */ @@ -537,7 +533,7 @@ debug_log_init(void) * up. 
*/ kr = kmem_alloc(kernel_map, &panic_stackshot_buf, PANIC_STACKSHOT_BUFSIZE, - KMA_DATA | KMA_ZERO, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_DIAG); assert(kr == KERN_SUCCESS); if (kr == KERN_SUCCESS) { panic_stackshot_buf_len = PANIC_STACKSHOT_BUFSIZE; @@ -586,7 +582,7 @@ phys_carveout_init(void) kmem_alloc_contig(kernel_map, carveouts[i].va, temp_carveout_size, PAGE_MASK, 0, 0, - KMA_NOFAIL | KMA_PERMANENT | KMA_NOPAGEWAIT | KMA_DATA | + KMA_NOFAIL | KMA_PERMANENT | KMA_NOPAGEWAIT | KMA_DATA_SHARED | KMA_NOSOFTLIMIT, VM_KERN_MEMORY_DIAG); @@ -609,7 +605,7 @@ debug_can_coredump_phys_carveout(void) return phys_carveout_core; } -static void +static boolean_t DebuggerLock(void) { int my_cpu = cpu_number(); @@ -617,14 +613,14 @@ DebuggerLock(void) assert(ml_get_interrupts_enabled() == FALSE); if (atomic_load(&debugger_cpu) == my_cpu) { - return; + return true; } - while (!atomic_compare_exchange_strong(&debugger_cpu, &debugger_exp_cpu, my_cpu)) { - debugger_exp_cpu = DEBUGGER_NO_CPU; + if (!atomic_compare_exchange_strong(&debugger_cpu, &debugger_exp_cpu, my_cpu)) { + return false; } - return; + return true; } static void @@ -824,6 +820,7 @@ check_and_handle_nested_panic(uint64_t panic_options_mask, unsigned long panic_c // if we panic *after* the log is finalized then we will only see it in the serial log // paniclog_append_noflush("Nested panic detected - entry count: %d panic_caller: 0x%016lx\n", CPUDEBUGGERCOUNT, panic_caller); + print_curr_backtrace(); paniclog_flush(); // print the *new* panic string to the console, we might not get it by other means... @@ -833,6 +830,7 @@ check_and_handle_nested_panic(uint64_t panic_options_mask, unsigned long panic_c printf("Nested panic string:\n"); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" _doprnt(db_panic_str, db_panic_args, PE_kputc, 0); #pragma clang diagnostic pop printf("\n\n"); @@ -1213,7 +1211,7 @@ panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_opti #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wmissing-noreturn" -void +__mockable void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx, uint64_t panic_options_mask, void *panic_data_ptr, unsigned long panic_caller, const char *panic_initiator) { @@ -1237,19 +1235,16 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign /* track depth of debugger/panic entry */ CPUDEBUGGERCOUNT++; + __unused uint32_t panic_initiator_crc = panic_initiator ? crc32(0, panic_initiator, strnlen(panic_initiator, MAX_PANIC_INITIATOR_SIZE)) : 0; + /* emit a tracepoint as early as possible in case of hang */ SOCD_TRACE_XNU(PANIC, ((CPUDEBUGGERCOUNT <= 2) ? 
SOCD_TRACE_MODE_STICKY_TRACEPOINT : SOCD_TRACE_MODE_NONE), PACK_2X32(VALUE(cpu_number()), VALUE(CPUDEBUGGERCOUNT)), - VALUE(panic_options_mask), + PACK_2X32(VALUE(panic_initiator_crc), VALUE(panic_options_mask & 0xFFFFFFFF)), ADDR(panic_format_str), ADDR(panic_caller)); - /* enable serial on the first panic if the always-on panic print flag is set */ - if ((debug_boot_arg & DB_PRT) && (CPUDEBUGGERCOUNT == 1)) { - serial_init(); - } - /* do max nested panic/debugger check, this will report nesting to the console and spin forever if we exceed a limit */ check_and_handle_nested_panic(panic_options_mask, panic_caller, panic_format_str, panic_args); @@ -1280,7 +1275,15 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign ml_set_interrupts_enabled(FALSE); disable_preemption(); - debug_fatal_panic_begin(); + if (!debug_fatal_panic_begin()) { + /* + * This CPU lost the race to be the first to panic. Re-enable + * interrupts and dead loop here awaiting the debugger xcall from + * the CPU that first panicked. + */ + ml_set_interrupts_enabled(TRUE); + panic_stop(); + } #if defined (__x86_64__) pmSafeMode(x86_lcpu(), PM_SAFE_FL_SAFE); @@ -1317,7 +1320,8 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign __builtin_unreachable(); } -void +/* We rely on this symbol being visible in the debugger for triage automation */ +void __attribute__((noinline, optnone)) panic_spin_forever(void) { for (;;) { @@ -1362,6 +1366,7 @@ panic_debugger_log(const char *string, ...) va_start(panic_debugger_log_args, string); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" _doprnt(string, &panic_debugger_log_args, consdebug_putc, 16); #pragma clang diagnostic pop va_end(panic_debugger_log_args); @@ -1452,6 +1457,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned if (debugger_panic_str) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" _doprnt(debugger_panic_str, debugger_panic_args, consdebug_putc, 0); #pragma clang diagnostic pop } @@ -1625,6 +1631,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned panic_debugger_log("panic(cpu %u caller 0x%lx): ", (unsigned) cpu_number(), debugger_panic_caller); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" _doprnt(debugger_panic_str, debugger_panic_args, consdebug_putc, 0); #pragma clang diagnostic pop panic_debugger_log("\n"); @@ -1663,9 +1670,18 @@ handle_debugger_trap(unsigned int exception, unsigned int code, unsigned int sub kern_return_t ret = KERN_SUCCESS; debugger_op db_prev_op = debugger_current_op; + if (!DebuggerLock()) { + /* + * We lost the race to be the first to panic. + * Return here so that we will enter the panic stop + * infinite loop and take the debugger IPI from the + * first CPU that got the debugger lock. + */ + return; + } + DEBUGGER_TRAP_TIMESTAMP(0); - DebuggerLock(); ret = DebuggerHaltOtherCores(CPUDEBUGGERSYNC, (CPUDEBUGGEROP == DBOP_STACKSHOT)); DEBUGGER_TRAP_TIMESTAMP(1); @@ -1744,7 +1760,15 @@ handle_debugger_trap(unsigned int exception, unsigned int code, unsigned int sub #endif } else { /* note: this is the panic path... */ - debug_fatal_panic_begin(); + if (!debug_fatal_panic_begin()) { + /* + * This CPU lost the race to be the first to panic. 
Re-enable + * interrupts and dead loop here awaiting the debugger xcall from + * the CPU that first panicked. + */ + ml_set_interrupts_enabled(TRUE); + panic_stop(); + } #if defined(__arm64__) && (DEBUG || DEVELOPMENT) if (!PE_arm_debug_and_trace_initialized()) { paniclog_append_noflush("kernel panicked before debug and trace infrastructure initialized!\n" @@ -1814,6 +1838,7 @@ log(__unused int level, char *fmt, ...) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp2, caller); #pragma clang diagnostic pop va_end(listp2); @@ -2351,23 +2376,47 @@ sysctl_debug_free_preoslog(void) #endif // RELEASE } +#if HAS_UPSI_FAILURE_INJECTION +uint64_t xnu_upsi_injection_stage = 0; +uint64_t xnu_upsi_injection_action = 0; -#if (DEVELOPMENT || DEBUG) +__attribute__((optnone)) static void +SPINNING_FOREVER(void) +{ + // Decided to disable optimizations on this function instead of using a + // volatile bool for the deadloop. + // This simplifies the process of using the deadloop as an LLDB attach point + bool loop = true; + + while (loop) { + } + return; +} void -platform_stall_panic_or_spin(uint32_t req) +check_for_failure_injection(failure_injection_stage_t current_stage) { - if (xnu_platform_stall_value & req) { - if (xnu_platform_stall_value & PLATFORM_STALL_XNU_ACTION_PANIC) { - panic("Platform stall: User requested panic"); - } else { - paniclog_append_noflush("\nUser requested platform stall. Stall Code: 0x%x", req); - panic_spin_forever(); - } + // Can't call this function with the default initialization for xnu_upsi_injection_stage + assert(current_stage != 0); + + // Check condition to inject a panic/stall/hang + if (current_stage != xnu_upsi_injection_stage) { + return; + } + + // Do the requested action + switch (xnu_upsi_injection_action) { + case INJECTION_ACTION_PANIC: + panic("Test panic at stage 0x%llx", current_stage); + case INJECTION_ACTION_WATCHDOG_TIMEOUT: + case INJECTION_ACTION_DEADLOOP: + SPINNING_FOREVER(); + break; + default: + break; } } -#endif - +#endif // HAS_UPSI_FAILURE_INJECTION #define AWL_HV_ENTRY_FLAG (0x1) @@ -2438,9 +2487,15 @@ set_awl_scratch_exists_flag_and_subscribe_for_pm(void) STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, set_awl_scratch_exists_flag_and_subscribe_for_pm); /** - * Signal that the system is going down for a panic + * Signal that the system is going down for a panic. Returns true if it is safe to + * proceed with the panic flow, false if we should re-enable interrupts and spin + * to allow another CPU to proceed with its panic flow. + * + * This function is idempotent when called from the same CPU; in the normal + * panic case it is invoked twice, since it needs to be invoked in the case + * where we enter the panic flow outside of panic() from DebuggerWithContext(). 
*/ -static inline void +static inline boolean_t debug_fatal_panic_begin(void) { #if CONFIG_SPTM @@ -2457,5 +2512,15 @@ debug_fatal_panic_begin(void) panic_lockdown_record_debug_data(); #endif /* DEVELOPMENT || DEBUG */ sptm_xnu_panic_begin(); + + pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); + uint16_t sptm_cpu_id = sptm_pcpu->sptm_cpu_id; + uint64_t sptm_panicking_cpu_id; + + if (sptm_get_panicking_cpu_id(&sptm_panicking_cpu_id) == LIBSPTM_SUCCESS && + sptm_panicking_cpu_id != sptm_cpu_id) { + return false; + } #endif /* CONFIG_SPTM */ + return true; } diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index e15838114..117f0713c 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -43,6 +43,7 @@ #ifdef XNU_KERNEL_PRIVATE #include +#include #else #include #endif @@ -143,6 +144,10 @@ struct task_snapshot { * I/O Statistics * XXX: These fields must be together. */ + /* + * In microstackshots, `disk_reads_count` is actually + * the full 64-bits of ss_flags. + */ uint64_t disk_reads_count; uint64_t disk_reads_size; uint64_t disk_writes_count; @@ -265,6 +270,7 @@ __options_decl(stackshot_flags_t, uint64_t, { STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY = 0x100, STACKSHOT_GET_BOOT_PROFILE = 0x200, STACKSHOT_DO_COMPRESS = 0x400, + /* Now on by default/unused */ STACKSHOT_SAVE_IMP_DONATION_PIDS = 0x2000, STACKSHOT_SAVE_IN_KERNEL_BUFFER = 0x4000, STACKSHOT_RETRIEVE_EXISTING_BUFFER = 0x8000, @@ -631,23 +637,6 @@ enum { #define __FILE_NAME__ __FILE__ #endif -/* Macros for XNU platform stalls - * The "location" macros specify points where we can stall or panic - * The "action" macros specify the action to take at these points. - * The default action is to stall. */ -#if (DEVELOPMENT || DEBUG) -#define PLATFORM_STALL_XNU_DISABLE (0) -#define PLATFORM_STALL_XNU_LOCATION_ARM_INIT (0x1ULL << 0) -#define PLATFORM_STALL_XNU_LOCATION_KERNEL_BOOTSTRAP (0x1ULL << 1) -#define PLATFORM_STALL_XNU_LOCATION_BSD_INIT (0x1ULL << 2) -#define PLATFORM_STALL_XNU_ACTION_PANIC (0x1ULL << 7) - -extern uint64_t xnu_platform_stall_value; - -void platform_stall_panic_or_spin(uint32_t req); - -#endif - struct task; struct thread; struct proc; @@ -817,6 +806,7 @@ extern size_t panic_stackshot_len; #endif /* defined (__x86_64__) */ void SavePanicInfo(const char *message, void *panic_data, uint64_t panic_options, const char* panic_initiator); +void print_curr_backtrace(void); void paniclog_flush(void); void panic_display_zalloc(void); /* in zalloc.c */ void panic_display_kernel_aslr(void); diff --git a/osfmk/kern/epoch_sync.c b/osfmk/kern/epoch_sync.c index 3c6a7cf07..31a915886 100644 --- a/osfmk/kern/epoch_sync.c +++ b/osfmk/kern/epoch_sync.c @@ -571,7 +571,7 @@ esync_wake(esync_space_t space, const uint64_t id, const uint64_t epoch, KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EPOCH_SYNC, MACH_EPOCH_SYNC_WAKE_THREAD), unique_id, epoch, tid); kr = waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T(sync), - thread, WAITQ_WAKEUP_DEFAULT); + thread, THREAD_AWAKENED); } turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); diff --git a/osfmk/kern/exc_resource.h b/osfmk/kern/exc_resource.h index 402fd9bb7..9d6d3d81a 100644 --- a/osfmk/kern/exc_resource.h +++ b/osfmk/kern/exc_resource.h @@ -136,7 +136,8 @@ /* RESOURCE_TYPE_MEMORY flavors */ #define FLAVOR_HIGH_WATERMARK 1 /* Indicates that the exception is due to memory limit warning */ -#define FLAVOR_DIAG_MEMLIMIT 2 /* Indicates that the exception is due to a preset diagnostics memory consumption threshold */ +#define FLAVOR_DIAG_MEMLIMIT 2 /* Indicates 
that the exception is due to a preset diagnostics memory consumption threshold */ +#define FLAVOR_CONCLAVE_LIMIT 3 /* Indicates that the exception is due to the hard conclave memory limit */ /* * RESOURCE_TYPE_MEMORY / FLAVOR_HIGH_WATERMARK diff --git a/osfmk/kern/exception.c b/osfmk/kern/exception.c index b724c1c29..49c0c5399 100644 --- a/osfmk/kern/exception.c +++ b/osfmk/kern/exception.c @@ -72,7 +72,6 @@ #include #include #include -#include #include #include @@ -551,7 +550,7 @@ exception_deliver( if (kr == KERN_SUCCESS) { if (exception != EXC_CORPSE_NOTIFY && - ip_kotype(thread_port) == IKOT_THREAD_CONTROL) { + ip_type(thread_port) == IKOT_THREAD_CONTROL) { kr = thread_setstatus_from_user(thread, flavor, (thread_state_t)new_state, new_state_cnt, (thread_state_t)old_state, old_state_cnt, set_flags); @@ -853,6 +852,56 @@ pac_exception_triage( } #endif /* __has_feature(ptrauth_calls) */ +static void +maybe_unrecoverable_exception_triage( + exception_type_t exception, + mach_exception_data_t code) +{ + task_t task = current_task(); + void *proc = get_bsdtask_info(task); + +#ifdef MACH_BSD + if (!proc) { + return; + } + + /* + * Note that the below policy to decide whether this should be unrecoverable is + * likely conceptually specific to the particular exception. + * If you find yourself adding another user_brk_..._descriptor and want to customize the + * policy for whether it should be unrecoverable, consider attaching each policy to + * the corresponding descriptor and somehow carrying it through to here. + */ + /* These exceptions are deliverable (and potentially recoverable) if the process is being debugged. */ + if (is_address_space_debugged(proc)) { + return; + } + + /* + * By policy, this exception is uncatchable by exception/signal handlers. + * Therefore exit immediately. + */ + /* Should only be called on current proc */ + int pid = proc_selfpid(); + char *proc_name = proc_name_address(proc); + os_log_error(OS_LOG_DEFAULT, "%s: process %s[%d] hit an unrecoverable exception\n", __func__, proc_name, pid); + + exception_info_t info = { + /* + * For now, hard-code this to OS_REASON_FOUNDATION as that's the path we expect to be on today. + * In the future this should probably be carried by the user_brk_..._descriptor and piped through. + */ + .os_reason = OS_REASON_FOUNDATION, + .exception_type = exception, + .mx_code = code[0], + .mx_subcode = code[1] + }; + exit_with_mach_exception(proc, info, PX_FLAGS_NONE); + thread_exception_return(); + /* NOT_REACHABLE */ +#endif /* MACH_BSD */ +} + /* * Routine: exception_triage * Purpose: @@ -902,9 +951,16 @@ exception_triage( if (exception & EXC_PTRAUTH_BIT) { exception &= ~EXC_PTRAUTH_BIT; assert(codeCnt == 2); + /* Note this may consume control flow if it decides the exception is unrecoverable. */ pac_exception_triage(exception, code); } #endif /* __has_feature(ptrauth_calls) */ + if (exception & EXC_MAY_BE_UNRECOVERABLE_BIT) { + exception &= ~EXC_MAY_BE_UNRECOVERABLE_BIT; + assert(codeCnt == 2); + /* Note this may consume control flow if it decides the exception is unrecoverable. 
*/ + maybe_unrecoverable_exception_triage(exception, code); + } return exception_triage_thread(exception, code, codeCnt, thread); } diff --git a/osfmk/kern/exclaves.c b/osfmk/kern/exclaves.c index 9050a5790..0a4e16026 100644 --- a/osfmk/kern/exclaves.c +++ b/osfmk/kern/exclaves.c @@ -85,6 +85,8 @@ #include "exclaves_inspection.h" #include "exclaves_memory.h" #include "exclaves_internal.h" +#include "exclaves_aoe.h" +#include "exclaves_sensor.h" LCK_GRP_DECLARE(exclaves_lck_grp, "exclaves"); @@ -254,7 +256,11 @@ _exclaves_ctl_trap(struct exclaves_ctl_trap_args *uap) * If requirements are relaxed during development, tasks with no * conclaves are also allowed. */ - if (task_get_conclave(task) == NULL && + if (operation == EXCLAVES_CTL_OP_SENSOR_MIN_ON_TIME) { + if (!exclaves_has_priv(task, EXCLAVES_PRIV_INDICATOR_MIN_ON_TIME)) { + return KERN_DENIED; + } + } else if (task_get_conclave(task) == NULL && !exclaves_has_priv(task, EXCLAVES_PRIV_KERNEL_DOMAIN) && !exclaves_requirement_is_relaxed(EXCLAVES_R_CONCLAVE_RESOURCES)) { return KERN_DENIED; @@ -495,6 +501,14 @@ _exclaves_ctl_trap(struct exclaves_ctl_trap_args *uap) return KERN_NOT_FOUND; } + /* + * Looking up a forwarding service verifies its existence, but + * doesn't return the id since communication with it is not possible + */ + if (id > EXCLAVES_FORWARDING_RESOURCE_ID_BASE) { + return KERN_NAME_EXISTS; + } + uresource.r_id = id; uresource.r_port = MACH_PORT_NULL; @@ -738,6 +752,85 @@ notification_resource_lookup_out: break; } + case EXCLAVES_CTL_OP_AOE_SETUP: { + uint8_t num_message = 0; + uint8_t num_worker = 0; + + if (task_get_conclave(task) == NULL) { + kr = KERN_FAILURE; + break; + } + + kr = exclaves_aoe_setup(&num_message, &num_worker); + if (kr != KERN_SUCCESS) { + break; + } + + error = copyout(&num_message, ubuffer, sizeof(num_message)); + if (error != 0) { + kr = KERN_INVALID_ADDRESS; + break; + } + + error = copyout(&num_worker, ustatus, sizeof(num_worker)); + if (error != 0) { + kr = KERN_INVALID_ADDRESS; + break; + } + + break; + } + + case EXCLAVES_CTL_OP_AOE_MESSAGE_LOOP: { + if (task_get_conclave(task) == NULL) { + kr = KERN_FAILURE; + break; + } + + kr = exclaves_aoe_message_loop(); + break; + } + + case EXCLAVES_CTL_OP_AOE_WORK_LOOP: { + if (task_get_conclave(task) == NULL) { + kr = KERN_FAILURE; + break; + } + + kr = exclaves_aoe_work_loop(); + break; + } + + case EXCLAVES_CTL_OP_SENSOR_MIN_ON_TIME: { + if (name != MACH_PORT_NULL) { + /* Only accept MACH_PORT_NULL for now */ + return KERN_INVALID_CAPABILITY; + } + + if (ubuffer == USER_ADDR_NULL || usize == 0 || + usize != sizeof(struct exclaves_indicator_deadlines)) { + return KERN_INVALID_ARGUMENT; + } + + struct exclaves_indicator_deadlines udurations; + error = copyin(ubuffer, &udurations, usize); + if (error) { + return KERN_INVALID_ARGUMENT; + } + + kr = exclaves_indicator_min_on_time_deadlines(&udurations); + if (kr != KERN_SUCCESS) { + return kr; + } + + error = copyout(&udurations, ubuffer, usize); + if (error) { + return KERN_INVALID_ADDRESS; + } + + break; + } + default: kr = KERN_INVALID_ARGUMENT; break; @@ -1043,6 +1136,35 @@ exclaves_endpoint_call_internal(__unused ipc_port_t port, /* -------------------------------------------------------------------------- */ #pragma mark secure kernel communication +/** save SME state before entering exclaves */ +static bool +exclaves_save_matrix_state(void) +{ + bool saved = false; +#if HAS_ARM_FEAT_SME + /* Save only the ZA/ZT0 state. SPTM will save/restore TPIDR2. 
*/ + if (arm_sme_version() > 0 && !!(__builtin_arm_rsr64("SVCR") & SVCR_ZA)) { + arm_sme_saved_state_t *sme_state = machine_thread_get_sme_state(current_thread()); + arm_save_sme_za_zt0(&sme_state->context, sme_state->svl_b); + asm volatile ("smstop za"); + saved = true; + } +#endif /* HAS_ARM_FEAT_SME */ + return saved; +} + +static void +exclaves_restore_matrix_state(bool did_save_sme __unused) +{ +#if HAS_ARM_FEAT_SME + if (did_save_sme) { + arm_sme_saved_state_t *sme_state = machine_thread_get_sme_state(current_thread()); + asm volatile ("smstart za"); + arm_load_sme_za_zt0(&sme_state->context, sme_state->svl_b); + } +#endif /* HAS_ARM_FEAT_SME */ +} + /* ringgate entry endpoints */ enum { RINGGATE_EP_ENTER, @@ -1065,7 +1187,7 @@ exclaves_enter(void) sptm_call_regs_t regs = { }; - __assert_only thread_t thread = current_thread(); + thread_t thread = current_thread(); /* * Should never re-enter exclaves. @@ -1086,6 +1208,11 @@ exclaves_enter(void) assert3u(thread->th_exclaves_state & mask, !=, 0); assert3u(thread->th_exclaves_intstate & TH_EXCLAVES_EXECUTION, ==, 0); + /* + * Save any SME matrix state before entering exclaves. + */ + bool did_save_sme = exclaves_save_matrix_state(); + #if MACH_ASSERT /* * Set the ast to check that the thread doesn't return to userspace @@ -1133,6 +1260,11 @@ exclaves_enter(void) KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCLAVES, MACH_EXCLAVES_SWITCH) | DBG_FUNC_END); + /* + * Restore SME matrix state, if it existed. + */ + exclaves_restore_matrix_state(did_save_sme); + switch (result) { case RINGGATE_STATUS_SUCCESS: return KERN_SUCCESS; @@ -2408,7 +2540,6 @@ exclaves_hosted_error(bool success, XrtHosted_Error_t *error) } } - #pragma mark exclaves privilege management /* @@ -2499,6 +2630,23 @@ exclaves_has_priv(task_t task, exclaves_priv_t priv) "com.apple.private.exclaves.boot"); /* END IGNORE CODESTYLE */ + case EXCLAVES_PRIV_INDICATOR_MIN_ON_TIME: + /* + * If the task was entitled and has been through this path + * before, it will have set the TFRO_HAS_SENSOR_MIN_ON_TIME_ACCESS flag. + */ + if ((task_ro_flags_get(task) & TFRO_HAS_SENSOR_MIN_ON_TIME_ACCESS) != 0) { + return true; + } + + if (has_entitlement(task, priv, + "com.apple.private.exclaves.indicator_min_on_time")) { + task_ro_flags_set(task, TFRO_HAS_SENSOR_MIN_ON_TIME_ACCESS); + return true; + } + + return false; + /* The CONCLAVE HOST priv is always checked by vnode. */ case EXCLAVES_PRIV_CONCLAVE_HOST: default: diff --git a/osfmk/kern/exclaves_aoe.c b/osfmk/kern/exclaves_aoe.c new file mode 100644 index 000000000..013c14501 --- /dev/null +++ b/osfmk/kern/exclaves_aoe.c @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if CONFIG_EXCLAVES + +#include +#include +#include + +#include + +#include + +#include "exclaves_aoe.h" +#include "exclaves_boot.h" +#include "exclaves_resource.h" +#include "exclaves_debug.h" + +#include "kern/exclaves.tightbeam.h" + +#define EXCLAVES_AOE_PROXY "com.apple.service.AlwaysOnExclavesProxy" + +static exclavesmessagequeueproxy_exclavesmessagequeueproxy_s aoeproxy_client; + +static kern_return_t +exclaves_aoe_boot(void) +{ + exclaves_id_t aoeproxy_id = exclaves_service_lookup( + EXCLAVES_DOMAIN_KERNEL, EXCLAVES_AOE_PROXY); + + if (aoeproxy_id == EXCLAVES_INVALID_ID) { + /* + * For now just silently return if the AOE proxy can't be found. + * In future this should call: + * exclaves_requirement_assert(EXCLAVES_R_AOE, + * "exclaves always on exclave proxy not found"); + */ + return KERN_SUCCESS; + } + + tb_endpoint_t ep = tb_endpoint_create_with_value( + TB_TRANSPORT_TYPE_XNU, aoeproxy_id, TB_ENDPOINT_OPTIONS_NONE); + + tb_error_t ret = + exclavesmessagequeueproxy_exclavesmessagequeueproxy__init(&aoeproxy_client, ep); + if (ret != TB_ERROR_SUCCESS) { + return KERN_FAILURE; + } + + return KERN_SUCCESS; +} +EXCLAVES_BOOT_TASK(exclaves_aoe_boot, EXCLAVES_BOOT_RANK_ANY); + +kern_return_t +exclaves_aoe_setup(uint8_t *num_message, uint8_t *num_worker) +{ + exclaves_resource_t *conclave = task_get_conclave(current_task()); + assert3p(conclave, !=, NULL); + + /* Return with an error if uninitialised. */ + if (aoeproxy_client.connection == NULL) { + return KERN_NOT_SUPPORTED; + } + + lck_mtx_lock(&conclave->r_mutex); + + if (!queue_empty(&conclave->r_conclave.c_aoe_q)) { + lck_mtx_unlock(&conclave->r_mutex); + return KERN_FAILURE; /* Already initialised. */ + } + + /* + * Iterate over each AOE Service in the conclave and call setup for each + * one. + */ + + __block uint8_t nmessage = 0; + __block uint8_t nworker = 0; + __block bool saw_error = false; + __block tb_error_t ret = TB_ERROR_SUCCESS; + + /* BEGIN IGNORE CODESTYLE */ + exclaves_resource_aoeservice_iterate(conclave->r_name, + ^(exclaves_resource_t *aoe_service) { + + ret = exclavesmessagequeueproxy_exclavesmessagequeueproxy_setup( + &aoeproxy_client, aoe_service->r_id, + ^(exclavesmessagequeueproxy_exclavesmessagequeueproxy_setup__result_s result) { + + exclavesmessagequeuetypes_workercount_s *wc = + exclavesmessagequeueproxy_exclavesmessagequeueproxy_setup__result_get_success(&result); + if (wc != NULL) { + + /* + * Allocate an aoe item for each service to be + * used as a per-service rendezvous for message + * threads and to hold worker counts for worker + * requests. 
+ */ + aoe_item_t *aitem = kalloc_type(aoe_item_t, + Z_WAITOK | Z_ZERO | Z_NOFAIL); + aitem->aoei_serviceid = aoe_service->r_id; + aitem->aoei_message_count = 0; + aitem->aoei_work_count = 0; + aitem->aoei_worker_count = 0; + + queue_enter(&conclave->r_conclave.c_aoe_q, aitem, + aoe_item_t *, aoei_chain); + + nmessage++; + nworker += *wc; + return; + } + + exclavesmessagequeueproxy_proxyerror_s *error = + exclavesmessagequeueproxy_exclavesmessagequeueproxy_setup__result_get_failure(&result); + assert3p(error, !=, NULL); + + exclaves_debug_printf(show_errors, + "AOE setup failed for service: %llu (error: %llu)\n", + aoe_service->r_id, error->tag); + saw_error = true; + }); + + /* Break out early for errors. */ + if (saw_error || ret != TB_ERROR_SUCCESS) { + return (bool)true; + } + + return (bool)(false); + }); + /* END IGNORE CODESTYLE */ + + if (saw_error || ret != TB_ERROR_SUCCESS) { + exclaves_aoe_teardown(); + lck_mtx_unlock(&conclave->r_mutex); + return KERN_FAILURE; + } + + lck_mtx_unlock(&conclave->r_mutex); + + if (nmessage == 0) { + return KERN_FAILURE; + } + + *num_message = nmessage; + *num_worker = nworker; + + return KERN_SUCCESS; +} + +static bool +exclaves_aoe_service_is_idle(const aoe_item_t * const item) +{ + return item->aoei_message_count == 0 && item->aoei_work_count == 0 && item->aoei_worker_count == 0; +} + +static void +exclaves_aoe_service_try_take_assertion(exclaves_resource_t * const conclave, aoe_item_t * const item) +{ + assert3p(conclave, !=, NULL); + LCK_MTX_ASSERT(&conclave->r_mutex, LCK_MTX_ASSERT_OWNED); + + if (item->aoei_assertion_id == 0 && exclaves_aoe_service_is_idle(item)) { + const char *desc = exclaves_conclave_get_domain(conclave); + __assert_only IOReturn ret = IOExclaveLPWCreateAssertion(&item->aoei_assertion_id, desc); + assert3u(ret, ==, kIOReturnSuccess); + } +} + +static void +exclaves_aoe_service_drop_assertion(exclaves_resource_t * const __assert_only conclave, aoe_item_t * const item) +{ + assert3p(conclave, !=, NULL); + LCK_MTX_ASSERT(&conclave->r_mutex, LCK_MTX_ASSERT_OWNED); + + __assert_only IOReturn ret = IOExclaveLPWReleaseAssertion(item->aoei_assertion_id); + assert3u(ret, ==, kIOReturnSuccess); + item->aoei_assertion_id = 0; +} + +static void +exclaves_aoe_service_try_drop_assertion(exclaves_resource_t * const __assert_only conclave, aoe_item_t * const item) +{ + assert3p(conclave, !=, NULL); + LCK_MTX_ASSERT(&conclave->r_mutex, LCK_MTX_ASSERT_OWNED); + + if (item->aoei_assertion_id && exclaves_aoe_service_is_idle(item)) { + exclaves_aoe_service_drop_assertion(conclave, item); + } +} + +void +exclaves_aoe_teardown(void) +{ + exclaves_resource_t *conclave = task_get_conclave(current_task()); + assert3p(conclave, !=, NULL); + + LCK_MTX_ASSERT(&conclave->r_mutex, LCK_MTX_ASSERT_OWNED); + + aoe_item_t *aitem = NULL; + while (!queue_empty(&conclave->r_conclave.c_aoe_q)) { + queue_remove_first(&conclave->r_conclave.c_aoe_q, aitem, + aoe_item_t *, aoei_chain); + + exclaves_aoe_service_drop_assertion(conclave, aitem); + + kfree_type(aoe_item_t, aitem); + } +} + +static wait_result_t +exclaves_aoe_claim_work(exclaves_resource_t *conclave, + exclavesmessagequeuetypes_serviceidentifier_s *id) +{ + while (true) { + lck_mtx_lock(&conclave->r_mutex); + + aoe_item_t *aitem = NULL; + queue_iterate(&conclave->r_conclave.c_aoe_q, aitem, + aoe_item_t *, aoei_chain) { + if (aitem->aoei_work_count != 0) { + aitem->aoei_work_count--; + aitem->aoei_worker_count++; + *id = aitem->aoei_serviceid; + + lck_mtx_unlock(&conclave->r_mutex); + return 
THREAD_AWAKENED; + } + } + + /* Nothing on the work queue, sleep */ + assert_wait(&conclave->r_conclave.c_aoe_q, + THREAD_INTERRUPTIBLE); + + lck_mtx_unlock(&conclave->r_mutex); + + wait_result_t wr = thread_block(THREAD_CONTINUE_NULL); + assert(wr == THREAD_AWAKENED || wr == THREAD_INTERRUPTED); + + if (wr == THREAD_INTERRUPTED) { + return wr; + } + } +} + +static void +exclaves_aoe_finish_work(exclaves_resource_t *conclave, + exclavesmessagequeuetypes_serviceidentifier_s id) +{ + bool work_finished = false; + + lck_mtx_lock(&conclave->r_mutex); + + aoe_item_t *aitem = NULL; + queue_iterate(&conclave->r_conclave.c_aoe_q, aitem, + aoe_item_t *, aoei_chain) { + if (id == aitem->aoei_serviceid) { + aitem->aoei_worker_count--; + + exclaves_aoe_service_try_drop_assertion(conclave, aitem); + + work_finished = true; + } + } + + lck_mtx_unlock(&conclave->r_mutex); + + assert(work_finished); +} + +static void +exclaves_aoe_post_work(exclaves_resource_t *conclave, + exclavesmessagequeuetypes_serviceidentifier_s service_id, uint8_t worker_count) +{ + lck_mtx_lock(&conclave->r_mutex); + + /* Find the associated aoe item. */ + aoe_item_t *aitem = NULL; + queue_iterate(&conclave->r_conclave.c_aoe_q, aitem, aoe_item_t *, + aoei_chain) { + if (aitem->aoei_serviceid == service_id) { + if (worker_count != 0) { + aitem->aoei_work_count += worker_count; + thread_wakeup(&conclave->r_conclave.c_aoe_q); + } else { + // If there are no workers, check if the active assertion can be dropped. + exclaves_aoe_service_try_drop_assertion(conclave, aitem); + } + break; + } + } + + lck_mtx_unlock(&conclave->r_mutex); +} + +/* + * Worker thread run-loop. + */ +kern_return_t +exclaves_aoe_work_loop(void) +{ + uint64_t id = + EXCLAVESMESSAGEQUEUETYPES_SERVICEIDENTIFIER_INVALID; + exclaves_resource_t *conclave = task_get_conclave(current_task()); + assert3p(conclave, !=, NULL); + + /* Return with an error if uninitialised. */ + if (aoeproxy_client.connection == NULL) { + return KERN_NOT_SUPPORTED; + } + + /* + * Mark this thread as being an Exclaves AOE thread. After this point + * cannot return to userspace. + */ + current_thread()->options |= TH_OPT_AOE; + + // Wait to be interrupted or aborted.. + while (exclaves_aoe_claim_work(conclave, &id) != THREAD_INTERRUPTED) { + // Call into AOE proxy to process. + + assert3u(id, !=, EXCLAVESMESSAGEQUEUETYPES_SERVICEIDENTIFIER_INVALID); + + /* BEGIN IGNORE CODESTYLE */ + __assert_only tb_error_t ret = exclavesmessagequeueproxy_exclavesmessagequeueproxy_workerinvoke( + &aoeproxy_client, id); + + assert3u(ret, ==, TB_ERROR_SUCCESS); + + exclaves_aoe_finish_work(conclave, id); + } + + /* + * This thread was aborted, assert that the thread has actually aborted + * and won't try to return to userspace. + */ + assert3u(current_thread()->sched_flags & TH_SFLAG_ABORT, !=, 0); + + return KERN_SUCCESS; +} + +static wait_result_t +exclaves_aoe_claim_message(exclaves_resource_t *conclave, aoe_item_t *item) +{ + while (true) { + lck_mtx_lock(&conclave->r_mutex); + + /* Claim message and return immediately if available. */ + if (item->aoei_message_count > 0) { + item->aoei_message_count--; + lck_mtx_unlock(&conclave->r_mutex); + return THREAD_AWAKENED; + } + + /* Nothing on the message queue, sleep. 
*/ + assert_wait(&item->aoei_message_count, + THREAD_INTERRUPTIBLE); + + lck_mtx_unlock(&conclave->r_mutex); + + wait_result_t wr = thread_block(THREAD_CONTINUE_NULL); + assert(wr == THREAD_AWAKENED || wr == THREAD_INTERRUPTED); + + if (wr == THREAD_INTERRUPTED) { + return wr; + } + } +} + +static void +exclaves_aoe_post_message(exclaves_resource_t *conclave, + __unused exclavesmessagequeuetypes_serviceidentifier_s id) +{ + lck_mtx_lock(&conclave->r_mutex); + + aoe_item_t *aitem = NULL; + queue_iterate(&conclave->r_conclave.c_aoe_q, aitem, aoe_item_t *, + aoei_chain) { + if (aitem->aoei_serviceid == id) { + exclaves_aoe_service_try_take_assertion(conclave, aitem); + + aitem->aoei_message_count++; + thread_wakeup(&aitem->aoei_message_count); + break; + } + } + + lck_mtx_unlock(&conclave->r_mutex); +} + +static aoe_item_t * +exclaves_aoe_associate_serviceid(void) +{ + exclaves_resource_t *conclave = task_get_conclave(current_task()); + assert3p(conclave, !=, NULL); + + lck_mtx_lock(&conclave->r_mutex); + + aoe_item_t *aitem = NULL; + queue_iterate(&conclave->r_conclave.c_aoe_q, aitem, aoe_item_t *, + aoei_chain) { + if (!aitem->aoei_associated) { + aitem->aoei_associated = true; + lck_mtx_unlock(&conclave->r_mutex); + + return aitem; + } + } + + lck_mtx_unlock(&conclave->r_mutex); + + return NULL; +} + + +/* Message thread run-loop. */ +kern_return_t +exclaves_aoe_message_loop(void) +{ + exclaves_resource_t *conclave = task_get_conclave(current_task()); + assert3p(conclave, !=, NULL); + + /* Return with an error if uninitialised. */ + if (aoeproxy_client.connection == NULL) { + return KERN_NOT_SUPPORTED; + } + + /* Claim a message endpoint. */ + aoe_item_t *item = exclaves_aoe_associate_serviceid(); + if (item == NULL) { + return KERN_NOT_FOUND; + } + + /* + * Mark this thread as being an Exclaves AOE thread. After this point + * cannot return to userspace. + */ + current_thread()->options |= TH_OPT_AOE; + + // Wait to be interrupted or aborted.. + while (exclaves_aoe_claim_message(conclave, item) != + THREAD_INTERRUPTED) { + // Call into AOE proxy to handle message. + + /* BEGIN IGNORE CODESTYLE */ + __assert_only tb_error_t ret = exclavesmessagequeueproxy_exclavesmessagequeueproxy_messagedeliver( + &aoeproxy_client, item->aoei_serviceid, + ^(workercount__opt_s wc_opt) { + + exclavesmessagequeuetypes_workercount_s *wc = NULL; + wc = workercount__opt_get(&wc_opt); + + // Post work for the worker threads. + exclaves_aoe_post_work(conclave, item->aoei_serviceid, wc ? *wc : 0); + }); + /* END IGNORE CODESTYLE */ + + assert3u(ret, ==, TB_ERROR_SUCCESS); + } + + /* + * This thread was aborted, assert that the thread has actually aborted + * and won't try to return to userspace. + */ + assert3u(current_thread()->sched_flags & TH_SFLAG_ABORT, !=, 0); + + return KERN_SUCCESS; +} + +tb_error_t +exclaves_aoe_upcall_work_available(const xnuupcallsv2_aoeworkinfo_s *work_info, + tb_error_t (^completion)(void)) +{ + assert3p(work_info, !=, NULL); + + const xnuupcallsv2_aoeworkinfo_conclavework_s *cw = + xnuupcallsv2_aoeworkinfo_conclavework__get(work_info); + + // Only conclave work is supported right now. 
+ assert3p(cw, !=, NULL); + + exclavesmessagequeuetypes_serviceidentifier_s id = cw->field0; + assert3u(id, !=, EXCLAVESMESSAGEQUEUETYPES_SERVICEIDENTIFIER_INVALID); + + exclaves_resource_t *conclave = + exclaves_conclave_lookup_by_aoeserviceid(id); + if (conclave == NULL || + queue_empty(&conclave->r_conclave.c_aoe_q)) { + exclaves_debug_printf(show_errors, + "exclaves: work available but conclave not found or " + "uninitialised: %llu\n", id); + completion(); + return TB_ERROR_USER_FAILURE; + } + + exclaves_aoe_post_message(conclave, id); + + return completion(); +} + +#endif /* CONFIG_EXCLAVES */ diff --git a/osfmk/kern/exclaves_aoe.h b/osfmk/kern/exclaves_aoe.h new file mode 100644 index 000000000..4289c9b3d --- /dev/null +++ b/osfmk/kern/exclaves_aoe.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if CONFIG_EXCLAVES + +#pragma once + +#include + +#include + +#include "kern/exclaves.tightbeam.h" + +__BEGIN_DECLS + +/*! + * @function exclaves_aoe_setup + * + * @abstract + * Called from a thread in an Always-On conclave. Returns the number of message + * and worker threads required. + * + * @param num_message + * Returns the number of message threads that should be spawned. + * + * @param num_worker + * Returns the number of worker threads that should be spawned. + * + * @return + * KERN_SUCCESS or error code on failure. + */ +extern kern_return_t +exclaves_aoe_setup(uint8_t *num_message, uint8_t *num_worker); + +/*! + * @function exclaves_aoe_teardown + * + * @abstract + * Cleans up state (if any) initialised by exclaves_aoe_setup(). + */ +extern void +exclaves_aoe_teardown(void); + +/*! + * @function exclaves_aoe_message_loop + * + * @abstract + * Called from an AOE message thread. Can return with an error if AOE is not + * supported or uninitialised. Once successfully setup will only ever return if + * the thread was aborted. + * Used to handle message delivery. + */ +extern kern_return_t +exclaves_aoe_message_loop(void); + +/*! + * @function exclaves_aoe_work_loop + * + * @abstract + * Called from an AOE worker thread. Can return with an error if AOE is not + * supported or uninitialised. Once successfully setup will only ever return if + * the thread was aborted. 
+ * Worker threads for message processing. + */ +extern kern_return_t +exclaves_aoe_work_loop(void); + +/*! + * @function exclaves_aoe_upcall_work_available + * + * @abstract + * Upcall invoked when AOE proxy has received new work that needs to be processed. + * + * @param work_info + * Information on the type of work available. + * + * @param completion + * Tightbeam completion callback. + * + * @return + * TB_ERROR_SUCCESS or error code on failure. + */ +extern tb_error_t + exclaves_aoe_upcall_work_available(const xnuupcallsv2_aoeworkinfo_s * work_info, + tb_error_t (^completion)(void)); + +__END_DECLS + +#endif /* CONFIG_EXCLAVES */ diff --git a/osfmk/kern/exclaves_boot.c b/osfmk/kern/exclaves_boot.c index 3192598fd..eb682ab3a 100644 --- a/osfmk/kern/exclaves_boot.c +++ b/osfmk/kern/exclaves_boot.c @@ -276,13 +276,13 @@ exclaves_boot_exclavekit(void) } /* - * Treat a failure to boot exclavekit as a transition to the - * EXCLAVES_BS_BOOTED_FAILURE state and return a failure. On RELEASE we - * simply panic as exclavekit is required. + * If framebank initialization fails, treat a failure to boot exclavekit + * as a transition to the EXCLAVES_BS_BOOTED_FAILURE state + * and return a failure. On RELEASE we simply panic as exclavekit is required. */ kern_return_t kr = exclaves_frame_mint_populate(); if (kr != KERN_SUCCESS) { - exclaves_requirement_assert(EXCLAVES_R_EXCLAVEKIT, + exclaves_requirement_assert(EXCLAVES_R_FRAMEBANK, "failed to populate frame mint"); exclaves_boot_status_set(EXCLAVES_BS_BOOTED_FAILURE); return KERN_FAILURE; @@ -306,7 +306,18 @@ exclaves_boot(exclaves_boot_stage_t boot_stage) break; case EXCLAVES_BOOT_STAGE_EXCLAVEKIT: - kr = exclaves_boot_exclavekit(); + if (!exclaves_requirement_is_relaxed(EXCLAVES_R_EXCLAVEKIT)) { + kr = exclaves_boot_exclavekit(); + } else { + /* + * If booting exclavekit was skipped due to a relaxed requirement, + * treat that as a transition to the EXCLAVES_BS_BOOTED_FAILURE + * state and return a failure. On RELEASE we simply panic + * as exclavekit is required.
+ */ + exclaves_requirement_assert(EXCLAVES_R_EXCLAVEKIT, "booting exclavekit skipped"); + exclaves_boot_status_set(EXCLAVES_BS_BOOTED_FAILURE); + } break; default: diff --git a/osfmk/kern/exclaves_driverkit.c b/osfmk/kern/exclaves_driverkit.c index cb69690b2..bf44dafac 100644 --- a/osfmk/kern/exclaves_driverkit.c +++ b/osfmk/kern/exclaves_driverkit.c @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -521,7 +522,7 @@ exclaves_driverkit_upcall_legacy_notification_signal(const uint64_t id, "[notification_upcalls] notification_signal " "id %llx mask %x\n", id, mask); exclaves_resource_t *notification_resource = - exclaves_notification_lookup_by_id(EXCLAVES_DOMAIN_KERNEL, id); + exclaves_notification_lookup_by_id(id); xnuupcalls_xnuupcalls_notification_signal__result_s result = {}; @@ -1061,7 +1062,7 @@ exclaves_driverkit_upcall_notification_signal(const uint64_t id, "[notification_upcalls] notification_signal " "id %llx mask %x\n", id, mask); exclaves_resource_t *notification_resource = - exclaves_notification_lookup_by_id(EXCLAVES_DOMAIN_KERNEL, id); + exclaves_notification_lookup_by_id(id); xnuupcallsv2_notificationupcallsprivate_notificationsignal__result_s result = {}; @@ -1084,6 +1085,93 @@ exclaves_driverkit_upcall_notification_signal(const uint64_t id, return completion(result); } +tb_error_t +exclaves_driverkit_upcall_lpw_createpowerassertion( + tb_error_t (^completion)(xnuupcallsv2_lpwupcallsprivate_createpowerassertion__result_s)) +{ + exclaves_debug_printf(show_iokit_upcalls, "[lpw_upcalls] createPowerAssertion\n"); + + struct IOExclaveLPWUpcallArgs args; + args.type = kIOExclaveLPWUpcallTypeCreateAssertion; + args.data.createassertion.id_out = 0; + + xnuupcallsv2_lpwupcallsprivate_createpowerassertion__result_s result = {}; + IOReturn ret = IOExclaveLPWUpcallHandler(&args); + uint64_t assertionID = args.data.createassertion.id_out; + if (ret == kIOReturnSuccess && assertionID != 0) { + xnuupcallsv2_lpwupcallsprivate_createpowerassertion__result_init_success( + &result, assertionID); + } else if (ret == kIOReturnBusy) { + xnuupcallsv2_lpwerror_s err; + xnuupcallsv2_lpwerror_assertiondenied__init(&err); + xnuupcallsv2_lpwupcallsprivate_createpowerassertion__result_init_failure( + &result, err); + } else { + xnuupcallsv2_lpwerror_s err; + xnuupcallsv2_lpwerror_internalerror__init(&err); + xnuupcallsv2_lpwupcallsprivate_createpowerassertion__result_init_failure( + &result, err); + } + + return completion(result); +} + +tb_error_t +exclaves_driverkit_upcall_lpw_releasepowerassertion(const uint64_t assertionID, + tb_error_t (^completion)(xnuupcallsv2_lpwupcallsprivate_releasepowerassertion__result_s)) +{ + exclaves_debug_printf(show_iokit_upcalls, + "[lpw_upcalls] releasePowerAssertion id %llx\n", assertionID); + + struct IOExclaveLPWUpcallArgs args; + args.type = kIOExclaveLPWUpcallTypeReleaseAssertion; + args.data.releaseassertion.id = assertionID; + + xnuupcallsv2_lpwupcallsprivate_releasepowerassertion__result_s result = {}; + if (IOExclaveLPWUpcallHandler(&args) == kIOReturnSuccess) { + xnuupcallsv2_lpwupcallsprivate_releasepowerassertion__result_init_success( + &result); + } else { + xnuupcallsv2_lpwerror_s err; + xnuupcallsv2_lpwerror_internalerror__init(&err); + xnuupcallsv2_lpwupcallsprivate_releasepowerassertion__result_init_failure( + &result, err); + } + + return completion(result); +} + +tb_error_t +exclaves_driverkit_upcall_lpw_requestrunmode(const uint64_t runmode_mask, + tb_error_t 
(^completion)(xnuupcallsv2_lpwupcallsprivate_requestrunmode__result_s)) +{ + exclaves_debug_printf(show_iokit_upcalls, + "[lpw_upcalls] requestRunMode mask %llx\n", runmode_mask); + + struct IOExclaveLPWUpcallArgs args; + args.type = kIOExclaveLPWUpcallTypeRequestRunMode; + args.data.requestrunmode.runmode_mask = runmode_mask; + + xnuupcallsv2_lpwupcallsprivate_requestrunmode__result_s result = {}; + IOReturn ret = IOExclaveLPWUpcallHandler(&args); + if (ret == kIOReturnSuccess) { + xnuupcallsv2_lpwupcallsprivate_requestrunmode__result_init_success( + &result); + } else if (ret == kIOReturnBusy || ret == kIOReturnUnsupported) { + xnuupcallsv2_lpwerror_s err; + xnuupcallsv2_lpwerror_runmoderequestdenied__init(&err); + xnuupcallsv2_lpwupcallsprivate_requestrunmode__result_init_failure( + &result, err); + } else { + xnuupcallsv2_lpwerror_s err; + xnuupcallsv2_lpwerror_internalerror__init(&err); + xnuupcallsv2_lpwupcallsprivate_requestrunmode__result_init_failure( + &result, err); + } + + return completion(result); +} + /* -------------------------------------------------------------------------- */ #pragma mark Tests @@ -1092,9 +1180,7 @@ exclaves_driverkit_upcall_notification_signal(const uint64_t id, #define EXCLAVES_HELLO_DRIVER_INTERRUPTS_INDEX 0 #define EXCLAVES_HELLO_DRIVER_INTERRUPTS_CHECK_RET(test) if (test) { break; } -#define EXCLAVES_ID_HELLO_INTERRUPTS_EP \ - (exclaves_service_lookup(EXCLAVES_DOMAIN_KERNEL, \ - "com.apple.service.HelloDriverInterrupts")) +#define EXCLAVES_HELLO_INTERRUPTS "com.apple.service.HelloDriverInterrupts" typedef enum hello_driverkit_interrupts_test_type { TEST_IRQ_REGISTER, @@ -1133,63 +1219,45 @@ static const char *hello_driverkit_interrupts_test_string[] = { static int hello_driverkit_interrupts(hello_driverkit_interrupts_test_type_t test_type) { + int err = 0; + __block uint8_t res = 0; + hellodriverinterrupts_hellodriverinterrupts_s client; + exclaves_debug_printf(show_test_output, "****** START: %s ******\n", hello_driverkit_interrupts_test_string[test_type]); - int err = 0; assert(test_type < HELLO_DRIVER_INTERRUPTS_NUM_TESTS); - tb_endpoint_t ep = tb_endpoint_create_with_value( - TB_TRANSPORT_TYPE_XNU, EXCLAVES_ID_HELLO_INTERRUPTS_EP, 0); - - tb_client_connection_t client = - tb_client_connection_create_with_endpoint(ep); - - tb_client_connection_activate(client); - - tb_message_t message = NULL; - tb_transport_message_buffer_t tpt_buf = NULL; - - message = kalloc_type(struct tb_message_s, Z_WAITOK | Z_ZERO | Z_NOFAIL); - tpt_buf = kalloc_type(struct tb_transport_message_buffer_s, - Z_WAITOK | Z_ZERO | Z_NOFAIL); - - // Encode TB buffer with test_type - tb_error_t tb_err = TB_ERROR_SUCCESS; - tb_err = tb_client_connection_message_construct(client, message, - tpt_buf, sizeof(uint8_t), 0); - if (tb_err != TB_ERROR_SUCCESS) { + exclaves_id_t id = exclaves_service_lookup(EXCLAVES_DOMAIN_KERNEL, + EXCLAVES_HELLO_INTERRUPTS); + if (id == EXCLAVES_INVALID_ID) { + exclaves_debug_printf(show_test_output, "%s: Found %s service failed\n", + __func__, EXCLAVES_HELLO_INTERRUPTS); err = 1; goto out; } - exclaves_debug_printf(show_test_output, "%s: Tightbeam constructing message: %u\n", __func__, - (uint8_t) test_type); - tb_message_encode_u8(message, (uint8_t) test_type); - tb_message_complete(message); - exclaves_debug_printf(show_test_output, "%s: Tightbeam message completed\n", __func__); + tb_endpoint_t ep = tb_endpoint_create_with_value( + TB_TRANSPORT_TYPE_XNU, id, TB_ENDPOINT_OPTIONS_NONE); - tb_message_t response = NULL; - - // Perform downcall - tb_err 
= tb_connection_send_query(client, message, &response, - TB_CONNECTION_WAIT_FOR_REPLY); + tb_error_t tb_err = + hellodriverinterrupts_hellodriverinterrupts__init(&client, ep); if (tb_err != TB_ERROR_SUCCESS) { + exclaves_debug_printf(show_test_output, "%s: Failed to initialize hellodriverinterrupts service\n", + __func__); err = 2; goto out; } - exclaves_debug_printf(show_test_output, "%s: Tightbeam message send success, reply: ", __func__); - // Decode downcall reply - uint8_t reply = 0; - tb_message_decode_u8(response, &reply); - exclaves_debug_printf(show_test_output, "%u\n", reply); - - if (reply != 0) { + tb_err = hellodriverinterrupts_hellodriverinterrupts_hellointerrupt(&client, (uint8_t)test_type, ^(uint8_t result) { + res = result; + }); + if (tb_err != TB_ERROR_SUCCESS || res != 0) { + exclaves_debug_printf(show_test_output, "%s: Sending TB message hellointerrupt (type %d), failed with result %d\n", + __func__, (uint8_t)test_type, res); err = 3; goto out; } - tb_client_connection_message_destruct(client, message); out: if (err == 0) { @@ -1200,9 +1268,6 @@ out: hello_driverkit_interrupts_test_string[test_type], err); } - kfree_type(struct tb_message_s, message); - kfree_type(struct tb_transport_message_buffer_s, tpt_buf); - return err; } diff --git a/osfmk/kern/exclaves_driverkit.h b/osfmk/kern/exclaves_driverkit.h index 7050ff607..6c63c71c4 100644 --- a/osfmk/kern/exclaves_driverkit.h +++ b/osfmk/kern/exclaves_driverkit.h @@ -225,6 +225,18 @@ extern tb_error_t exclaves_driverkit_upcall_ane_workend(const uint64_t id, const uint64_t requestID, tb_error_t (^completion)(xnuupcallsv2_aneupcallsprivate_aneworkend__result_s)); +extern tb_error_t + exclaves_driverkit_upcall_lpw_createpowerassertion( + tb_error_t (^completion)(xnuupcallsv2_lpwupcallsprivate_createpowerassertion__result_s)); + +extern tb_error_t + exclaves_driverkit_upcall_lpw_releasepowerassertion(const uint64_t assertionID, + tb_error_t (^completion)(xnuupcallsv2_lpwupcallsprivate_releasepowerassertion__result_s)); + +extern tb_error_t + exclaves_driverkit_upcall_lpw_requestrunmode(const uint64_t runmode_mask, + tb_error_t (^completion)(xnuupcallsv2_lpwupcallsprivate_requestrunmode__result_s)); + extern kern_return_t exclaves_driver_service_lookup(const char *service_name, uint64_t *endpoint); diff --git a/osfmk/kern/exclaves_inspection.c b/osfmk/kern/exclaves_inspection.c index 9441cc56d..71f677472 100644 --- a/osfmk/kern/exclaves_inspection.c +++ b/osfmk/kern/exclaves_inspection.c @@ -530,9 +530,6 @@ exclaves_inspection_check_ast(void) lck_mtx_unlock(&exclaves_collect_mtx); } - -/* this should come from somewhere in EP */ -#define STACKSHOT_PANIC_MAGIC 0xdeadcafebeefbabe typedef struct stackshot_panic_magic { uint64_t magic; uint64_t size; @@ -558,7 +555,7 @@ kdp_read_panic_exclaves_stackshot(struct exclaves_panic_stackshot *eps) /* check for panic magic value in xnu's copy of the region */ stackshot_panic_magic_t *panic_magic = __IGNORE_WCASTALIGN((stackshot_panic_magic_t *)(exclaves_stackshot_buffer + (EXCLAVES_STACKSHOT_BUFFER_SIZE - sizeof(stackshot_panic_magic_t)))); - if (panic_magic->magic != STACKSHOT_PANIC_MAGIC) { + if (panic_magic->magic != STACKSHOT_STACKSHOTCONSTANTS_PANICMAGIC) { return; } diff --git a/osfmk/kern/exclaves_log.c b/osfmk/kern/exclaves_log.c index 93501b716..d0b93a2f3 100644 --- a/osfmk/kern/exclaves_log.c +++ b/osfmk/kern/exclaves_log.c @@ -58,7 +58,7 @@ TUNABLE(bool, oslog_exclaves, "oslog_exclaves", true); #if DEVELOPMENT || DEBUG -#define OS_LOG_MAX_SIZE (2048) +#define OS_LOG_MAX_SIZE 
(4096) #define dbg_counter_inc(c) counter_inc((c)) SCALABLE_COUNTER_DEFINE(oslog_e_log_count); @@ -67,6 +67,7 @@ SCALABLE_COUNTER_DEFINE(oslog_e_metadata_count); SCALABLE_COUNTER_DEFINE(oslog_e_metadata_dropped_count); SCALABLE_COUNTER_DEFINE(oslog_e_signpost_count); SCALABLE_COUNTER_DEFINE(oslog_e_signpost_dropped_count); +SCALABLE_COUNTER_DEFINE(oslog_e_replay_failure_count); SCALABLE_COUNTER_DEFINE(oslog_e_query_count); SCALABLE_COUNTER_DEFINE(oslog_e_query_error_count); SCALABLE_COUNTER_DEFINE(oslog_e_trace_mode_set_count); @@ -96,8 +97,14 @@ os_log_replay_log(const oslogdarwin_logdata_s *ld, uint8_t *ld_data, size_t ld_d { firehose_stream_t stream = (firehose_stream_t)ld->stream; const size_t ld_size = oslogdarwin_logdata_data(ld, ld_data, ld_data_size); - assert3u(ld_size, <=, ld_data_size); - assert3u(ld->pubsize, <=, ld_size); + if (ld_size > ld_data_size || ld->pubsize > ld_size) { +#if DEVELOPMENT || DEBUG + panic("ld_size:%lu was >: %lu or <=: %hu", ld_size, ld_data_size, ld->pubsize); +#else + counter_inc(&oslog_e_replay_failure_count); + return; +#endif // DEVELOPMENT || DEBUG + } firehose_tracepoint_id_u ftid = { .ftid_value = ld->ftid @@ -106,8 +113,7 @@ os_log_replay_log(const oslogdarwin_logdata_s *ld, uint8_t *ld_data, size_t ld_d switch (ftid.ftid._namespace) { case firehose_tracepoint_namespace_metadata: counter_inc(&oslog_e_metadata_count); - assert3u(stream, ==, firehose_stream_metadata); - if (!os_log_encoded_metadata(ftid, ld->stamp, ld_data, ld_size)) { + if (stream != firehose_stream_metadata || !os_log_encoded_metadata(ftid, ld->stamp, ld_data, ld_size)) { counter_inc(&oslog_e_metadata_dropped_count); } break; diff --git a/osfmk/kern/exclaves_memory.c b/osfmk/kern/exclaves_memory.c index 8f409eb13..fc3e344ab 100644 --- a/osfmk/kern/exclaves_memory.c +++ b/osfmk/kern/exclaves_memory.c @@ -461,10 +461,7 @@ exclaves_memory_upcall_alloc(uint32_t npages, xnuupcallsv2_pagekind_s kind, (exclaves_memory_pagekind_t) kind, EXCLAVES_MEMORY_PAGE_FLAGS_NONE); - tb_error_t err = u32__v_assign_copy(&pagelist, pages, npages); - if (err != TB_ERROR_SUCCESS) { - panic("u32__v_assign_copy err %u", err); - } + u32__v_assign_unowned(&pagelist, pages, npages); return completion(pagelist); } @@ -489,10 +486,7 @@ exclaves_memory_upcall_alloc_ext(uint32_t npages, xnuupcallsv2_pageallocflagsv2_ exclaves_memory_alloc(npages, pages, kind, alloc_flags); - tb_error_t err = u32__v_assign_copy(&pagelist, pages, npages); - if (err != TB_ERROR_SUCCESS) { - panic("u32__v_assign_copy err %u", err); - } + u32__v_assign_unowned(&pagelist, pages, npages); return completion(pagelist); } diff --git a/osfmk/kern/exclaves_resource.c b/osfmk/kern/exclaves_resource.c index ebab666ef..5f6362746 100644 --- a/osfmk/kern/exclaves_resource.c +++ b/osfmk/kern/exclaves_resource.c @@ -31,6 +31,8 @@ #include #include +#include + #include #include @@ -56,13 +58,14 @@ #include #include +#include "exclaves_aoe.h" #include "exclaves_conclave.h" #include "exclaves_debug.h" +#include "exclaves_memory.h" #include "exclaves_resource.h" #include "exclaves_sensor.h" #include "exclaves_shared_memory.h" #include "exclaves_xnuproxy.h" -#include "exclaves_memory.h" #include "kern/exclaves.tightbeam.h" @@ -360,6 +363,7 @@ lookup_resource_by_id(exclaves_resource_domain_t *domain, uint64_t id, xnuproxy_resourcetype_s type) { __block exclaves_resource_t *resource = NULL; + table_get(domain->d_table_id, &id, sizeof(id), ^bool (void *data) { exclaves_resource_t *tmp = data; if (tmp->r_type == type) { @@ -400,10 +404,66 @@ 
exclaves_resource_domain_alloc(const char *scope) return domain; } +static void +exclaves_resource_insert_name_table(xnuproxy_resourcetype_s type __unused, const char *name, + exclaves_resource_domain_t *domain, exclaves_resource_t *resource) +{ + table_item_t *name_item = kalloc_type(table_item_t, + Z_WAITOK | Z_ZERO | Z_NOFAIL); + + name_item->i_key = resource->r_name; + name_item->i_key_len = strlen(resource->r_name); + name_item->i_value = resource; + + assertf(lookup_resource_by_name(domain, name, type) == NULL, + "Duplicate entry in exclaves resource table for \"%s\" , \"%s\"", domain->d_name, name); + table_put(domain->d_table_name, name, strlen(name), name_item); +} + +static void +exclaves_resource_insert_id_table(xnuproxy_resourcetype_s type, uint64_t id, + exclaves_resource_t *resource) +{ + switch (type) { + case XNUPROXY_RESOURCETYPE_NOTIFICATION: { + /* Stick the newly created resource into the ID table. */ + table_item_t *id_item = kalloc_type(table_item_t, + Z_WAITOK | Z_ZERO | Z_NOFAIL); + id_item->i_key = &resource->r_id; + id_item->i_key_len = sizeof(resource->r_id); + id_item->i_value = resource; + + /* + * Globally unique notification ids are added to the kernel domain for + * lookup while signalling + */ + exclaves_resource_domain_t *kernel_domain = lookup_domain(EXCLAVES_DOMAIN_KERNEL); + table_put(kernel_domain->d_table_id, &id, sizeof(id), id_item); + + break; + } + + default: + break; + } +} + static exclaves_resource_t * exclaves_resource_alloc(xnuproxy_resourcetype_s type, const char *name, uint64_t id, exclaves_resource_domain_t *domain, bool connected) { + if (type == XNUPROXY_RESOURCETYPE_NOTIFICATION) { + exclaves_resource_t *resource = exclaves_notification_lookup_by_id(id); + if (resource != NULL) { + /* + * Name entry should refer to the resource associated with the + * already present id + */ + exclaves_resource_insert_name_table(type, name, domain, resource); + return NULL; + } + } + exclaves_resource_t *resource = kalloc_type(exclaves_resource_t, Z_WAITOK | Z_ZERO | Z_NOFAIL); @@ -417,47 +477,24 @@ exclaves_resource_alloc(xnuproxy_resourcetype_s type, const char *name, uint64_t * Each resource has an associated kobject of type * IKOT_EXCLAVES_RESOURCE. */ - ipc_port_t port = ipc_kobject_alloc_port((ipc_kobject_t)resource, - IKOT_EXCLAVES_RESOURCE, IPC_KOBJECT_ALLOC_NSREQUEST); + ipc_port_t port = ipc_kobject_alloc_port(resource, + IKOT_EXCLAVES_RESOURCE, IPC_KOBJECT_ALLOC_NONE); resource->r_port = port; lck_mtx_init(&resource->r_mutex, &resource_lck_grp, NULL); (void) strlcpy(resource->r_name, name, sizeof(resource->r_name)); - - /* Stick the newly created resource into the name table. */ - table_item_t *name_item = kalloc_type(table_item_t, - Z_WAITOK | Z_ZERO | Z_NOFAIL); - - name_item->i_key = resource->r_name; - name_item->i_key_len = strlen(resource->r_name); - name_item->i_value = resource; - - assert(lookup_resource_by_name(domain, name, type) == NULL); - table_put(domain->d_table_name, name, strlen(name), name_item); + /* + * Add the resource to the name table, used for lookup + */ + exclaves_resource_insert_name_table(type, name, domain, resource); /* * Some types also need to lookup by id in addition to looking up by * name. */ - switch (type) { - case XNUPROXY_RESOURCETYPE_NOTIFICATION: { - /* Stick the newly created resource into the ID table. 
*/ - table_item_t *id_item = kalloc_type(table_item_t, - Z_WAITOK | Z_ZERO | Z_NOFAIL); - id_item->i_key = &resource->r_id; - id_item->i_key_len = sizeof(resource->r_id); - id_item->i_value = resource; - - assert(lookup_resource_by_id(domain, id, type) == NULL); - table_put(domain->d_table_id, &id, sizeof(id), id_item); - break; - } - - default: - break; - } + exclaves_resource_insert_id_table(type, id, resource); return resource; } @@ -469,6 +506,7 @@ static void exclaves_resource_no_senders(ipc_port_t port, mach_port_mscount_t mscount); IPC_KOBJECT_DEFINE(IKOT_EXCLAVES_RESOURCE, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = exclaves_resource_no_senders); @@ -499,8 +537,11 @@ populate_conclave_services(void) if (cm != NULL) { conclave_resource_t *c = &cm->r_conclave; - bitmap_set(c->c_service_bitmap, - (uint32_t)resource->r_id); + if (exclaves_is_forwarding_resource(resource)) { + return (bool)false; + } + assert3u(resource->r_id, <, CONCLAVE_SERVICE_MAX); + bitmap_set(c->c_service_bitmap, (uint32_t)resource->r_id); return (bool)false; } @@ -524,6 +565,77 @@ populate_conclave_services(void) /* END IGNORE CODESTYLE */ } +/* + * The aoe_service_table is a hash table which contains a map of aoe service to + * conclave. + */ +static table_t aoe_service_table = { + .t_buckets = (queue_chain_t *)(queue_chain_t[TABLE_LEN]){}, + .t_buckets_count = TABLE_LEN, +}; + +exclaves_resource_t * +exclaves_conclave_lookup_by_aoeserviceid(uint64_t id) +{ + __block exclaves_resource_t *resource = NULL; + table_get(&aoe_service_table, &id, sizeof(id), ^bool (void *data) { + resource = data; + return true; + }); + + /* Ignore entries not marked connected. */ + if (resource == NULL || !resource->r_connected) { + return NULL; + } + + return resource; +} + +static void +populate_aoeservice_to_conclave(void) +{ + table_init(&aoe_service_table); + + /* BEGIN IGNORE CODESTYLE */ + iterate_domains(^(exclaves_resource_domain_t *domain) { + + exclaves_resource_t *cm = exclaves_resource_lookup_by_name( + EXCLAVES_DOMAIN_KERNEL, domain->d_name, + XNUPROXY_RESOURCETYPE_CONCLAVEMANAGER); + if (cm == NULL) { + return (bool)false; + } + + iterate_resources(domain, ^(exclaves_resource_t *resource) { + if (resource->r_type != XNUPROXY_RESOURCETYPE_ALWAYSONEXCLAVESSERVICE) { + return (bool)false; + } + + /* Found an ALWAYSONEXCLAVESSERVICE, add an entry to the map. */ + + /* Assert that there's no existing entry. */ + assert3p(exclaves_conclave_lookup_by_aoeserviceid(resource->r_id), + ==, NULL); + + /* Stick the newly created resource into the table. */ + table_item_t *item = kalloc_type(table_item_t, + Z_WAITOK | Z_ZERO | Z_NOFAIL); + + item->i_key = &resource->r_id; + item->i_key_len = sizeof(resource->r_id); + item->i_value = cm; + + table_put(&aoe_service_table, &resource->r_id, + sizeof(resource->r_id), item); + + return (bool)false; + }); + + return (bool)false; + }); + /* END IGNORE CODESTYLE */ +} + /* * Discover all the static exclaves resources populating the resource tables as * we go. @@ -551,9 +663,14 @@ exclaves_resource_init(void) exclaves_resource_t *resource = exclaves_resource_alloc(type, name, id, domain, connected); + if (!resource) { + assert3u(type, ==, XNUPROXY_RESOURCETYPE_NOTIFICATION); + return; + } + /* * Type specific initialization. 
- */ + */ switch (type) { case XNUPROXY_RESOURCETYPE_CONCLAVEMANAGER: exclaves_conclave_init(resource); @@ -563,13 +680,11 @@ exclaves_resource_init(void) exclaves_notification_init(resource); break; - case XNUPROXY_RESOURCETYPE_SERVICE: - assert3u(resource->r_id, <, CONCLAVE_SERVICE_MAX); - break; - default: break; } + + }); /* END IGNORE CODESTYLE */ @@ -580,6 +695,9 @@ exclaves_resource_init(void) /* Populate the conclave service ID bitmaps. */ populate_conclave_services(); + /* Build a map of AOE service -> conclave. */ + populate_aoeservice_to_conclave(); + return KERN_SUCCESS; } @@ -597,11 +715,25 @@ exclaves_resource_lookup_by_name(const char *domain_name, const char *name, exclaves_resource_t *r = lookup_resource_by_name(domain, name, type); - /* Ignore entries not marked connected. */ - if (r == NULL || !r->r_connected) { + if (r == NULL) { return NULL; } + /* + * Ignore exclave resources that are not connected + */ + if (!r->r_connected) { + switch (r->r_type) { + case XNUPROXY_RESOURCETYPE_CONCLAVEMANAGER: + case XNUPROXY_RESOURCETYPE_SERVICE: + if (exclaves_is_forwarding_resource(r)) { + break; + } + default: + return NULL; + } + } + return r; } @@ -748,32 +880,16 @@ exclaves_resource_create_port_name(exclaves_resource_t *resource, ipc_space_t sp { assert3u(os_atomic_load(&resource->r_usecnt, relaxed), >, 0); - ipc_port_t port = resource->r_port; - - ip_mq_lock(port); - - /* Create an armed send right. */ - kern_return_t ret = ipc_kobject_make_send_nsrequest_locked(port, - resource, IKOT_EXCLAVES_RESOURCE); - if (ret != KERN_SUCCESS && - ret != KERN_ALREADY_WAITING) { - ip_mq_unlock(port); - exclaves_resource_release(resource); - return ret; - } - /* - * If there was already a send right, then the port already has an - * associated use count so drop this one. + * make a send right and donate our reference for + * exclaves_resource_no_senders if this is the first send right */ - if (port->ip_srights > 1) { - assert3u(os_atomic_load(&resource->r_usecnt, relaxed), >, 1); + if (!ipc_kobject_make_send_lazy_alloc_port(&resource->r_port, + resource, IKOT_EXCLAVES_RESOURCE)) { exclaves_resource_release(resource); } - ip_mq_unlock(port); - - *name = ipc_port_copyout_send(port, space); + *name = ipc_port_copyout_send(resource->r_port, space); if (!MACH_PORT_VALID(*name)) { /* * ipc_port_copyout_send() releases the send right on failure @@ -818,6 +934,8 @@ exclaves_conclave_init(exclaves_resource_t *resource) conclave->c_active_stopcall = false; conclave->c_downcall_thread = THREAD_NULL; conclave->c_task = TASK_NULL; + + queue_init(&conclave->c_aoe_q); } kern_return_t @@ -888,6 +1006,9 @@ exclaves_conclave_detach(exclaves_resource_t *resource, task_t task) assert3p(task->conclave, !=, NULL); assert3p(resource, ==, task->conclave); + /* Cleanup any residual AOE state. */ + exclaves_aoe_teardown(); + task->conclave = NULL; conclave->c_task = TASK_NULL; @@ -959,8 +1080,13 @@ exclaves_conclave_launch(exclaves_resource_t *resource) * This should only ever happen if the EXCLAVEKIT requirement was * relaxed. 
*/ - exclaves_requirement_assert(EXCLAVES_R_EXCLAVEKIT, - "failed to boot to exclavekit"); + if (exclaves_requirement_is_relaxed(EXCLAVES_R_EXCLAVEKIT) || + exclaves_requirement_is_relaxed(EXCLAVES_R_FRAMEBANK)) { + exclaves_debug_printf(show_errors, + "exclaves: requirement was relaxed, ignoring error: failed to boot to exclavekit\n"); + } else { + panic("exclaves: requirement failed: failed to boot to exclavekit\n"); + } return KERN_NOT_SUPPORTED; } @@ -978,6 +1104,26 @@ exclaves_conclave_launch(exclaves_resource_t *resource) return kr; } +bool +exclaves_is_forwarding_resource(exclaves_resource_t *resource) +{ + if (resource->r_type == XNUPROXY_RESOURCETYPE_CONCLAVEMANAGER || + resource->r_type == XNUPROXY_RESOURCETYPE_SERVICE) { + if (resource->r_id > EXCLAVES_FORWARDING_RESOURCE_ID_BASE) { + return true; + } + } + return false; +} + +void +exclaves_conclave_prepare_teardown(task_t task __unused) +{ + /* We explicitly do not handle HAS_ARM_FEAT_SME here because it's always + * handled on exclaves_enter + */ +} + static kern_return_t exclaves_update_state_machine_locked(exclaves_resource_t *resource) { @@ -1558,9 +1704,9 @@ exclaves_notification_signal(exclaves_resource_t *exclaves_resource, long event_ } exclaves_resource_t * -exclaves_notification_lookup_by_id(const char *domain, uint64_t id) +exclaves_notification_lookup_by_id(uint64_t id) { - return exclaves_resource_lookup_by_id(domain, id, + return exclaves_resource_lookup_by_id(EXCLAVES_DOMAIN_KERNEL, id, XNUPROXY_RESOURCETYPE_NOTIFICATION); } @@ -1948,4 +2094,26 @@ exclaves_resource_audio_memory_copyout(exclaves_resource_t *resource, return KERN_SUCCESS; } +#pragma mark AOE Service + +void +exclaves_resource_aoeservice_iterate(const char *domain_name, + bool (^cb)(exclaves_resource_t *)) +{ + assert3u(strlen(domain_name), >, 0); + + exclaves_resource_domain_t *domain = lookup_domain(domain_name); + if (domain == NULL) { + return; + } + + iterate_resources(domain, ^(exclaves_resource_t *resource) { + if (resource->r_type != XNUPROXY_RESOURCETYPE_ALWAYSONEXCLAVESSERVICE) { + return (bool)false; + } + + return cb(resource); + }); +} + #endif /* CONFIG_EXCLAVES */ diff --git a/osfmk/kern/exclaves_resource.h b/osfmk/kern/exclaves_resource.h index a05504878..6abcdb5cb 100644 --- a/osfmk/kern/exclaves_resource.h +++ b/osfmk/kern/exclaves_resource.h @@ -51,6 +51,8 @@ __BEGIN_DECLS #define EXCLAVES_DOMAIN_KERNEL "com.apple.kernel" #define EXCLAVES_DOMAIN_DARWIN "com.apple.darwin" +#define EXCLAVES_FORWARDING_RESOURCE_ID_BASE (1ULL << 48) + /* * Data associated with a conclave. */ @@ -97,8 +99,22 @@ typedef enum __attribute__((flag_enum)) { CONCLAVE_R_STOP_REQUESTED = 0x4, } conclave_request_t; -/* The maximum number of services available in any conclave. */ -#define CONCLAVE_SERVICE_MAX 192 +/* + * Data associated with Always-On Exclaves endpoints stashed in the conclave + * resource. + */ +typedef struct { + uint64_t aoei_serviceid; + uint8_t aoei_message_count; + uint8_t aoei_work_count; + uint8_t aoei_worker_count; + bool aoei_associated; + queue_chain_t aoei_chain; + uint64_t aoei_assertion_id; +} aoe_item_t; + +/* The highest service identifier in any conclave. */ +#define CONCLAVE_SERVICE_MAX 256 typedef struct { conclave_state_t c_state; @@ -110,6 +126,11 @@ typedef struct { task_t XNU_PTRAUTH_SIGNED_PTR("conclave.task") c_task; thread_t XNU_PTRAUTH_SIGNED_PTR("conclave.thread") c_downcall_thread; bitmap_t c_service_bitmap[BITMAP_LEN(CONCLAVE_SERVICE_MAX)]; + + /* + * Always-On Exclaves specific. 
+ */ + queue_head_t c_aoe_q; } conclave_resource_t; typedef struct { @@ -503,6 +524,53 @@ exclaves_conclave_get_domain(exclaves_resource_t *resource); extern bool exclaves_conclave_has_service(exclaves_resource_t *resource, uint64_t id); +/*! + * @function exclaves_conclave_lookup_by_aoeserviceid + * + * @abstract + * Find a conclave by Always-On Exclaves service ID. + * + * @param id + * The AOE service ID. + * + * @return + * Pointer to the resource + */ +exclaves_resource_t * +exclaves_conclave_lookup_by_aoeserviceid(uint64_t id); + +/*! + * @function exclaves_is_forwarding_resource + * + * @abstract + * Check if the resource is a forwarding conclave, i.e. the conclave + * manager isn't hosting an actual exclave resource. + * + * @param resource + * Conclave Manager resource + * + * @return + * true if resource is a forwarding conclave, false otherwise + * + */ +extern bool +exclaves_is_forwarding_resource(exclaves_resource_t *resource); + +/*! + * @function exclaves_conclave_prepare_teardown + * + * @abstract + * Before we can start tearing down the conclave, + * we may want to clear up some machine context. + * + * @param task is the pointer to the owner of the conclave resource. + * + */ +extern void +exclaves_conclave_prepare_teardown( + task_t task); + + /* -------------------------------------------------------------------------- */ #pragma mark Sensors @@ -652,14 +720,11 @@ exclaves_notification_signal(exclaves_resource_t *resource, long event_mask); * @param id * The resource ID. * - * @param domain - * The domain to search. - * * @return * Pointer to the resource */ exclaves_resource_t * -exclaves_notification_lookup_by_id(const char *domain, uint64_t id); +exclaves_notification_lookup_by_id(uint64_t id); /* -------------------------------------------------------------------------- */ @@ -873,6 +938,30 @@ exclaves_resource_audio_memory_copyout(exclaves_resource_t *resource, user_addr_t ubuffer, mach_vm_size_t usize1, mach_vm_size_t uoffset1, mach_vm_size_t usize2, mach_vm_size_t uoffset2, user_addr_t ustatus); + +/* -------------------------------------------------------------------------- */ +#pragma mark Always-On Exclaves Services + +/*! + * @function exclaves_resource_aoeservice_iterate + * + * @abstract + * Iterate through all AOE Services for the given domain. + * + * @param domain + * The domain to search. + * + * @param cb + * The callback to call on each found AOE Service. Return true to break out + * early.
+ */ +/* BEGIN IGNORE CODESTYLE */ +extern void +exclaves_resource_aoeservice_iterate(const char *domain, + bool (^cb)(exclaves_resource_t *)); +/* END IGNORE CODESTYLE */ + + extern exclaves_resource_t * exclaves_resource_lookup_by_name(const char *domain_name, const char *name, xnuproxy_resourcetype_s type); diff --git a/osfmk/kern/exclaves_sensor.c b/osfmk/kern/exclaves_sensor.c index e57fa5df4..65b8b3145 100644 --- a/osfmk/kern/exclaves_sensor.c +++ b/osfmk/kern/exclaves_sensor.c @@ -70,6 +70,12 @@ sensor_type_to_eic_sensortype(exclaves_sensor_type_t type) return EXCLAVEINDICATORCONTROLLER_SENSORTYPE_SENSOR_CAM_ALT_FACEID; case EXCLAVES_SENSOR_CAM_ALT_FACEID_DELAYED: return EXCLAVEINDICATORCONTROLLER_SENSORTYPE_SENSOR_CAM_ALT_FACEID_DELAYED; + case EXCLAVES_SENSOR_TEST: + return EXCLAVEINDICATORCONTROLLER_SENSORTYPE_SENSOR_TEST; + case EXCLAVES_SENSOR_TEST_MIL: + return EXCLAVEINDICATORCONTROLLER_SENSORTYPE_SENSOR_TEST_MIL; + case EXCLAVES_SENSOR_TEST_CIL: + return EXCLAVEINDICATORCONTROLLER_SENSORTYPE_SENSOR_TEST_CIL; default: panic("unknown sensor type"); } @@ -117,28 +123,28 @@ exclaves_eic_init(void) } static kern_return_t -exclaves_eic_display_healthcheck_rate(uint64_t ns) +exclaves_eic_tick_rate(uint64_t rate_hz) { - exclaveindicatorcontroller_requestedrefreshrate_s rate; + exclaveindicatorcontroller_indicatorrefreshrate_s rate; - /* Convert time to frequency and round up to nearest supported value. */ - switch (NSEC_PER_SEC / ns) { + /* Round up to nearest supported value. */ + switch (rate_hz) { case 0 ... 30: exclaves_display_healthcheck_rate_hz = 30; - rate = EXCLAVEINDICATORCONTROLLER_REQUESTEDREFRESHRATE_HZ_30; + rate.tag = EXCLAVEINDICATORCONTROLLER_INDICATORREFRESHRATE__HZ_30; break; case 31 ... 60: exclaves_display_healthcheck_rate_hz = 60; - rate = EXCLAVEINDICATORCONTROLLER_REQUESTEDREFRESHRATE_HZ_60; + rate.tag = EXCLAVEINDICATORCONTROLLER_INDICATORREFRESHRATE__HZ_60; break; default: exclaves_display_healthcheck_rate_hz = 120; - rate = EXCLAVEINDICATORCONTROLLER_REQUESTEDREFRESHRATE_HZ_120; + rate.tag = EXCLAVEINDICATORCONTROLLER_INDICATORREFRESHRATE__HZ_120; break; } - tb_error_t ret = exclaveindicatorcontroller_sensorrequest_requestdisplayhealthcheckrate( - &eic_client, rate, ^(__unused exclaveindicatorcontroller_requestresponse_s result) {}); + tb_error_t ret = exclaveindicatorcontroller_sensorrequest_setindicatorrefreshrate( + &eic_client, &rate, ^(__unused exclaveindicatorcontroller_requesterror_s result) {}); return ret == TB_ERROR_SUCCESS ? KERN_SUCCESS : KERN_FAILURE; } @@ -192,6 +198,19 @@ exclaves_eic_sensor_copy(uint32_t buffer, uint64_t size1, uint64_t offset1, return ret == TB_ERROR_SUCCESS ? 
KERN_SUCCESS : KERN_FAILURE; } +static bool +exclaves_sensor_tick(void) +{ + __block bool again = true; + __unused tb_error_t ret = exclaveindicatorcontroller_sensorrequest_tick( + &eic_client, ^(bool result) { + again = result; + }); + assert3u(ret, ==, TB_ERROR_SUCCESS); + + return again; +} + /* -------------------------------------------------------------------------- */ #pragma mark sensor @@ -235,6 +254,9 @@ valid_sensor(exclaves_sensor_type_t sensor_type) case EXCLAVES_SENSOR_MIC: case EXCLAVES_SENSOR_CAM_ALT_FACEID: case EXCLAVES_SENSOR_CAM_ALT_FACEID_DELAYED: + case EXCLAVES_SENSOR_TEST: + case EXCLAVES_SENSOR_TEST_MIL: + case EXCLAVES_SENSOR_TEST_CIL: return true; default: return false; @@ -248,15 +270,6 @@ sensor_type_to_sensor(exclaves_sensor_type_t sensor_type) return &sensors[sensor_type - 1]; } -static inline exclaves_sensor_type_t -sensor_to_sensor_type(exclaves_sensor_t *sensor) -{ - assert3p(sensor, <=, &sensors[EXCLAVES_SENSOR_MAX]); - assert3p(sensor, >=, &sensors[0]); - - return (exclaves_sensor_type_t)((sensor - &sensors[0]) + 1); -} - /* Calculate the next healthcheck time. */ static void healthcheck_deadline(uint64_t *deadline, uint64_t *leeway) @@ -267,69 +280,6 @@ healthcheck_deadline(uint64_t *deadline, uint64_t *leeway) nanoseconds_to_absolutetime(interval / 2, leeway); } -/* - * Do a healthcheck status call. The status call may be skipped if certain conditions are met. - * Returns false is status call was skipped. - */ -static bool -do_healthcheck(exclaves_sensor_t *sensor) -{ - LCK_MTX_ASSERT(&sensor->s_mutex, LCK_MTX_ASSERT_OWNED); - - /* - * If the sensor has not started, and the min on-time has been processed, - * skip health check. - */ - if (sensor->s_startcount == 0 && sensor->s_stop_abs == 0) { - return false; - } - - exclaves_sensor_status_t status; - (void) exclaves_sensor_status(sensor_to_sensor_type(sensor), 0, &status); - - return true; -} - -/* - * For stopped sensors, see if the minimum on-time has been reached. If so, do a - * status call. If the minimum on-time has not been reached, return a deadline - * for when it will be. - */ -static void -do_min_on_time(exclaves_sensor_t *sensor, uint64_t *deadline, - uint64_t *leeway) -{ - LCK_MTX_ASSERT(&sensor->s_mutex, LCK_MTX_ASSERT_OWNED); - - /* - * The sensor hasn't stopped yet or has already had its min on-time - * processed. - */ - if (sensor->s_startcount != 0 || sensor->s_stop_abs == 0) { - *deadline = UINT64_MAX; - return; - } - - uint64_t min_time = 0; - nanoseconds_to_absolutetime(EXCLAVES_EIC_MIN_SENSOR_TIME, &min_time); - nanoseconds_to_absolutetime(50 * NSEC_PER_MSEC, leeway); - - *deadline = sensor->s_stop_abs + min_time; - - if (*deadline <= mach_absolute_time()) { - /* The minimum on-time has been hit. Call status. */ - exclaves_sensor_status_t status; - (void) exclaves_sensor_status(sensor_to_sensor_type(sensor), 0, - &status); - - sensor->s_stop_abs = 0; - *deadline = UINT64_MAX; - return; - } - - /* The minimum on-time is in the future. Need to reschedule. */ -} - /* * Called from the threadcall to call into exclaves with a status command for * every started sensor. 
Re-arms itself so it runs at a frequency set by the @@ -340,9 +290,7 @@ do_min_on_time(exclaves_sensor_t *sensor, uint64_t *deadline, static void exclaves_sensor_healthcheck(__unused void *param0, __unused void *param1) { - uint64_t leeway, deadline = UINT64_MAX; uint64_t hc_leeway, hc_deadline; - uint64_t mot_leeway, mot_deadline; /* * Calculate the next deadline up-front so the overhead of calling into @@ -350,33 +298,9 @@ exclaves_sensor_healthcheck(__unused void *param0, __unused void *param1) */ healthcheck_deadline(&hc_deadline, &hc_leeway); - for (int i = 0; i < EXCLAVES_SENSOR_MAX; i++) { - exclaves_sensor_t *sensor = &sensors[i]; - - if (!sensor->s_initialised) { - continue; - } - - lck_mtx_lock(&sensor->s_mutex); - - if (do_healthcheck(sensor) && - hc_deadline < deadline) { - deadline = hc_deadline; - leeway = hc_leeway; - } - - do_min_on_time(sensor, &mot_deadline, &mot_leeway); - if (mot_deadline < deadline) { - deadline = mot_deadline; - leeway = mot_leeway; - } - - lck_mtx_unlock(&sensor->s_mutex); - } - - if (deadline != UINT64_MAX) { + if (exclaves_sensor_tick()) { thread_call_enter_delayed_with_leeway(sensor_healthcheck_tcall, - NULL, deadline, leeway, THREAD_CALL_DELAY_LEEWAY); + NULL, hc_deadline, hc_leeway, THREAD_CALL_DELAY_LEEWAY); } } @@ -498,8 +422,6 @@ exclaves_sensor_stop(exclaves_sensor_type_t sensor_type, uint64_t flags, lck_mtx_unlock(&sensor->s_mutex); - (void)thread_call_enter(sensor_healthcheck_tcall); - return kr; } @@ -520,7 +442,7 @@ exclaves_sensor_status(exclaves_sensor_type_t sensor_type, uint64_t flags, } kern_return_t -exclaves_display_healthcheck_rate(uint64_t ns) +exclaves_sensor_tick_rate(uint64_t rate_hz) { /* * Make sure that the initialisation has taken place before calling into @@ -531,7 +453,14 @@ exclaves_display_healthcheck_rate(uint64_t ns) return KERN_FAILURE; } - return exclaves_eic_display_healthcheck_rate(ns); + return exclaves_eic_tick_rate(rate_hz); +} + +kern_return_t +exclaves_display_healthcheck_rate(uint64_t __unused ns) +{ + /* Deprecated, no longer does anything */ + return KERN_SUCCESS; } kern_return_t @@ -552,6 +481,35 @@ exclaves_sensor_copy(uint32_t buffer, uint64_t size1, uint64_t offset1, status); } +kern_return_t +exclaves_indicator_min_on_time_deadlines(struct exclaves_indicator_deadlines *deadlines) +{ + assert(deadlines); + + //For now, only one version is supported. Return an error if libsyscall sends us any other versions + if (deadlines->version != 1) { + return KERN_INVALID_ARGUMENT; + } + + // Make sure that the initialisation has taken place before calling into + // the EIC. Any sensor is sufficient. + exclaves_sensor_t *sensor = sensor_type_to_sensor(EXCLAVES_SENSOR_CAM); + if (!sensor->s_initialised) { + return KERN_FAILURE; + } + + tb_error_t ret = exclaveindicatorcontroller_sensorrequest_getmotstate( + &eic_client, ^(exclaveindicatorcontroller_motstate_s result) { + deadlines->camera_indicator = result.deadlinecil; + deadlines->mic_indicator = result.deadlinemil; + deadlines->faceid_indicator = result.deadlinefid; + }); + + return ret == TB_ERROR_SUCCESS ? 
KERN_SUCCESS : KERN_FAILURE; +} + + + #else /* CONFIG_EXCLAVES */ kern_return_t @@ -560,4 +518,10 @@ exclaves_display_healthcheck_rate(__unused uint64_t ns) return KERN_NOT_SUPPORTED; } +kern_return_t +exclaves_sensor_tick_rate(uint64_t __unused rate_hz) +{ + return KERN_NOT_SUPPORTED; +} + #endif /* CONFIG_EXCLAVES */ diff --git a/osfmk/kern/exclaves_sensor.h b/osfmk/kern/exclaves_sensor.h index 85b046d9c..db23d37be 100644 --- a/osfmk/kern/exclaves_sensor.h +++ b/osfmk/kern/exclaves_sensor.h @@ -69,6 +69,14 @@ exclaves_sensor_copy(uint32_t buffer, uint64_t size1, uint64_t offset1, uint64_t size2, uint64_t offset2, exclaves_sensor_status_t *sensor_status); +/*! + * Returns the minimum on time deadlines for various sensors + * @param deadlines out parameter filled with indicator deadlines + */ +kern_return_t +exclaves_indicator_min_on_time_deadlines(struct exclaves_indicator_deadlines *deadlines); + + __END_DECLS #endif /* CONFIG_EXCLAVES */ diff --git a/osfmk/kern/exclaves_storage.c b/osfmk/kern/exclaves_storage.c index 5f8e27cb9..8d933a09d 100644 --- a/osfmk/kern/exclaves_storage.c +++ b/osfmk/kern/exclaves_storage.c @@ -591,7 +591,7 @@ exclaves_storage_upcall_rootex(const uint32_t fstag, xnuupcallsv2_storageupcallsprivate_rootex__result_s result = {}; if ((error = verify_string_length((const char *)&exclaveid[0], 32))) { - xnuupcallsv2_storageupcallsprivate_rootex__result_init_failure(&result, error); + xnuupcallsv2_storageupcallsprivate_rootex__result_init_failure(&result, consolidate_storage_error(error)); return completion(result); } error = vfs_exclave_fs_root_ex(fstag, (const char *)&exclaveid[0], &rootid); @@ -599,7 +599,7 @@ exclaves_storage_upcall_rootex(const uint32_t fstag, exclaves_debug_printf(show_errors, "[storage_upcalls_server] vfs_exclave_fs_rootex failed with %d\n", error); - xnuupcallsv2_storageupcallsprivate_rootex__result_init_failure(&result, error); + xnuupcallsv2_storageupcallsprivate_rootex__result_init_failure(&result, consolidate_storage_error(error)); } else { exclaves_debug_printf(show_storage_upcalls, "[storage_upcalls_server] vfs_exclave_fs_rootex return " diff --git a/osfmk/kern/exclaves_tests.c b/osfmk/kern/exclaves_tests.c index 8a3b853b6..3c7ca3ec3 100644 --- a/osfmk/kern/exclaves_tests.c +++ b/osfmk/kern/exclaves_tests.c @@ -134,9 +134,10 @@ exclaves_sensor_kpi_test(int64_t in, int64_t *out) bool success = true; exclaves_debug_printf(show_test_output, "%s: STARTING\n", __func__); - // TODO: EIC-based camera tests are disabled until rdar://132025387 exclaves_sensor_type_t sensors[] = { - EXCLAVES_SENSOR_MIC, + EXCLAVES_SENSOR_TEST, + EXCLAVES_SENSOR_TEST_MIL, + EXCLAVES_SENSOR_TEST_CIL, }; unsigned num_sensors = sizeof(sensors) / sizeof(sensors[0]); exclaves_sensor_status_t sensor_status = EXCLAVES_SENSOR_STATUS_DENIED; diff --git a/osfmk/kern/exclaves_upcalls.c b/osfmk/kern/exclaves_upcalls.c index 861862875..549283c5e 100644 --- a/osfmk/kern/exclaves_upcalls.c +++ b/osfmk/kern/exclaves_upcalls.c @@ -58,6 +58,7 @@ #include "exclaves_storage.h" #include "exclaves_test_stackshot.h" #include "exclaves_xnuproxy.h" +#include "exclaves_aoe.h" #include @@ -68,6 +69,9 @@ #define EXCLAVES_ID_TIGHTBEAM_UPCALL \ ((exclaves_id_t)XNUPROXY_UPCALL_TIGHTBEAM) +#define EXCLAVES_ID_TIGHTBEAM_PMM_UPCALL_V2 \ + ((exclaves_id_t)XNUPROXY_PMM_UPCALL_TIGHTBEAM_V2) + #define EXCLAVES_ID_TIGHTBEAM_UPCALL_V2 \ ((exclaves_id_t)XNUPROXY_UPCALL_TIGHTBEAM_V2) @@ -322,15 +326,9 @@ static const xnuupcalls_xnuupcalls__server_s exclaves_tightbeam_upcalls = { /* END IGNORE CODESTYLE */ 
}; -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-parameter" -static const xnuupcallsv2_xnuupcalls__server_s exclaves_tightbeam_upcalls_v2 = { +static const xnuupcallsv2_memoryupcallsprivate__server_s exclaves_tightbeam_memory_upcalls_v2 = { /* BEGIN IGNORE CODESTYLE */ /* Uncrustify doesn't deal well with Blocks. */ - .helloupcall = ^(const uint64_t arg, tb_error_t (^completion)(uint64_t)) { - return exclaves_helloupcall(arg, completion); - }, - .alloc = ^(const uint32_t npages, xnuupcallsv2_pagekind_s kind, tb_error_t (^completion)(xnuupcallsv2_pagelist_s)) { return exclaves_memory_upcall_alloc(npages, kind, completion); @@ -348,6 +346,19 @@ static const xnuupcallsv2_xnuupcalls__server_s exclaves_tightbeam_upcalls_v2 = { .free_ext = ^(const xnuupcallsv2_pagelist_s pages, xnuupcallsv2_pagefreeflagsv2_s flags, tb_error_t (^completion)(void)) { return exclaves_memory_upcall_free_ext(pages, flags, completion); }, + /* END IGNORE CODESTYLE */ +}; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-parameter" +static const xnuupcallsv2_xnuupcalls__server_s exclaves_tightbeam_upcalls_v2 = { + /* BEGIN IGNORE CODESTYLE */ + /* Uncrustify doesn't deal well with Blocks. */ + .helloupcall = ^(const uint64_t arg, tb_error_t (^completion)(uint64_t)) { + return exclaves_helloupcall(arg, completion); + }, + + /* Unset memoryupcalls handlers */ .root = ^(const uint8_t exclaveid[_Nonnull 32], tb_error_t (^completion)(xnuupcallsv2_storageupcallsprivate_root__result_s)) { @@ -548,6 +559,22 @@ static const xnuupcallsv2_xnuupcalls__server_s exclaves_tightbeam_upcalls_v2 = { tb_error_t (^completion)(xnuupcallsv2_conclaveupcallsprivate_crashinfo__result_s)) { return exclaves_conclave_upcall_crash_info(shared_buf, length, completion); }, + .createpowerassertion = ^( + tb_error_t (^completion)(xnuupcallsv2_lpwupcallsprivate_createpowerassertion__result_s)) { + return exclaves_driverkit_upcall_lpw_createpowerassertion(completion); + }, + .releasepowerassertion = ^(const xnuupcallsv2_assertionid_s id, + tb_error_t (^completion)(xnuupcallsv2_lpwupcallsprivate_releasepowerassertion__result_s)) { + return exclaves_driverkit_upcall_lpw_releasepowerassertion(id, completion); + }, + .requestrunmode = ^(const xnuupcallsv2_lpwrunmode_s mode, + tb_error_t (^completion)(xnuupcallsv2_lpwupcallsprivate_requestrunmode__result_s)) { + return exclaves_driverkit_upcall_lpw_requestrunmode(mode, completion); + }, + .workavailable = ^(const xnuupcallsv2_aoeworkinfo_s * workInfo, + tb_error_t (^completion)(void)) { + return exclaves_aoe_upcall_work_available(workInfo, completion); + }, /* END IGNORE CODESTYLE */ }; #pragma clang diagnostic pop @@ -590,6 +617,10 @@ exclaves_upcall_init(void) TB_TRANSPORT_TYPE_XNU, EXCLAVES_ID_TIGHTBEAM_UPCALL, TB_ENDPOINT_OPTIONS_NONE); + tb_endpoint_t tb_pmm_upcall_v2_ep = tb_endpoint_create_with_value( + TB_TRANSPORT_TYPE_XNU, EXCLAVES_ID_TIGHTBEAM_PMM_UPCALL_V2, + TB_ENDPOINT_OPTIONS_NONE); + tb_endpoint_t tb_upcall_v2_ep = tb_endpoint_create_with_value( TB_TRANSPORT_TYPE_XNU, EXCLAVES_ID_TIGHTBEAM_UPCALL_V2, TB_ENDPOINT_OPTIONS_NONE); @@ -598,11 +629,17 @@ exclaves_upcall_init(void) #pragma clang diagnostic ignored "-Wcast-qual" /* FIXME: rdar://103647654 */ tb_error_t error = xnuupcalls_xnuupcalls__server_start(tb_upcall_ep, (xnuupcalls_xnuupcalls__server_s *)&exclaves_tightbeam_upcalls); - tb_error_t error2 = xnuupcallsv2_xnuupcalls__server_start(tb_upcall_v2_ep, + tb_error_t error2 = xnuupcallsv2_memoryupcallsprivate__server_start( + 
tb_pmm_upcall_v2_ep, + (xnuupcallsv2_memoryupcallsprivate__server_s*)&exclaves_tightbeam_memory_upcalls_v2); + tb_error_t error3 = xnuupcallsv2_xnuupcalls__server_start(tb_upcall_v2_ep, (xnuupcallsv2_xnuupcalls__server_s *)&exclaves_tightbeam_upcalls_v2); #pragma clang diagnostic pop - return (error == TB_ERROR_SUCCESS && error2 == TB_ERROR_SUCCESS) ? KERN_SUCCESS : KERN_FAILURE; + return (error == TB_ERROR_SUCCESS + && error2 == TB_ERROR_SUCCESS + && error3 == TB_ERROR_SUCCESS) + ? KERN_SUCCESS : KERN_FAILURE; } /* Unslid pointers defining the range of code which triggers upcall handlers */ diff --git a/osfmk/kern/hibernate.c b/osfmk/kern/hibernate.c index baf6d1bfa..29633eae1 100644 --- a/osfmk/kern/hibernate.c +++ b/osfmk/kern/hibernate.c @@ -133,8 +133,6 @@ hibernate_teardown(hibernate_page_list_t * page_list, hibernate_page_list_t * page_list_wired, hibernate_page_list_t * page_list_pal) { - hibernate_free_gobble_pages(); - if (page_list) { kfree_data(page_list, page_list->list_size); } diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index 0e28a8197..653d1d60a 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -92,7 +93,6 @@ #include #include #include -#include // mach_node_port_changed() #include #include @@ -131,6 +131,9 @@ SCALABLE_COUNTER_DEFINE(vm_statistics_swapins); /* # of pages swa SCALABLE_COUNTER_DEFINE(vm_statistics_swapouts); /* # of pages swapped out (via compression segments) */ SCALABLE_COUNTER_DEFINE(vm_statistics_total_uncompressed_pages_in_compressor); /* # of pages (uncompressed) held within the compressor. */ SCALABLE_COUNTER_DEFINE(vm_page_grab_count); +SCALABLE_COUNTER_DEFINE(vm_page_grab_count_kern); +SCALABLE_COUNTER_DEFINE(vm_page_grab_count_iopl); +SCALABLE_COUNTER_DEFINE(vm_page_grab_count_upl); host_data_t realhost; @@ -383,6 +386,16 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num } user_arch_info->cpu_type = preferred_cpu_type; user_arch_info->cpu_subtype = preferred_cpu_subtype; +#elif APPLEVIRTUALPLATFORM + extern uint32_t force_arm64_32; + if (force_arm64_32) { + user_arch_info->cpu_type = CPU_TYPE_ARM64_32; + user_arch_info->cpu_subtype = CPU_SUBTYPE_ARM64_32_V8; + } else { + int master_id = master_processor->cpu_id; + user_arch_info->cpu_type = slot_type(master_id); + user_arch_info->cpu_subtype = slot_subtype(master_id); + } #else int master_id = master_processor->cpu_id; user_arch_info->cpu_type = slot_type(master_id); @@ -580,10 +593,12 @@ static LCK_MTX_DECLARE(host_statistics_lck, &host_statistics_lck_grp); #define HOST_EXPIRED_TASK_INFO_REV0 8 #define HOST_EXPIRED_TASK_INFO_REV1 9 #define HOST_VM_COMPRESSOR_Q_LEN_REV0 10 -#define NUM_HOST_INFO_DATA_TYPES 11 +#define HOST_VM_INFO64_REV2 11 +#define NUM_HOST_INFO_DATA_TYPES 12 static vm_statistics64_data_t host_vm_info64_rev0 = {}; static vm_statistics64_data_t host_vm_info64_rev1 = {}; +static vm_statistics64_data_t host_vm_info64_rev2 = {}; static vm_extmod_statistics_data_t host_extmod_info64 = {}; static host_load_info_data_t host_load_info = {}; static vm_statistics_data_t host_vm_info_rev0 = {}; @@ -614,6 +629,7 @@ static struct host_stats_cache g_host_stats_cache[NUM_HOST_INFO_DATA_TYPES] = { [HOST_EXPIRED_TASK_INFO_REV0] = { .last_access = 0, .current_requests = 0, .max_requests = 0, .data = (uintptr_t)&host_expired_task_info, .count = TASK_POWER_INFO_COUNT }, [HOST_EXPIRED_TASK_INFO_REV1] = { .last_access = 0, .current_requests = 0, .max_requests = 0, .data = 
(uintptr_t)&host_expired_task_info2, .count = TASK_POWER_INFO_V2_COUNT}, [HOST_VM_COMPRESSOR_Q_LEN_REV0] = { .last_access = 0, .current_requests = 0, .max_requests = 0, .data = (uintptr_t)&host_vm_compressor_q_lens, .count = VM_COMPRESSOR_Q_LENS_COUNT}, + [HOST_VM_INFO64_REV2] = { .last_access = 0, .current_requests = 0, .max_requests = 0, .data = (uintptr_t)&host_vm_info64_rev2, .count = HOST_VM_INFO64_REV2_COUNT }, }; @@ -663,6 +679,9 @@ get_host_info_data_index(bool is_stat64, host_flavor_t flavor, mach_msg_type_num *ret = KERN_FAILURE; return -1; } + if (*count >= HOST_VM_INFO64_REV2_COUNT) { + return HOST_VM_INFO64_REV2; + } if (*count >= HOST_VM_INFO64_REV1_COUNT) { return HOST_VM_INFO64_REV1; } @@ -854,6 +873,10 @@ vm_stats(void *info, unsigned int *count) stat->total_uncompressed_pages_in_compressor = c_segment_pages_compressed; *count = HOST_VM_INFO64_REV1_COUNT; } + if (original_count >= HOST_VM_INFO64_REV2_COUNT) { + stat->swapped_count = os_atomic_load(&vm_page_swapped_count, relaxed); + *count = HOST_VM_INFO64_REV2_COUNT; + } return KERN_SUCCESS; } @@ -1233,8 +1256,6 @@ is_valid_host_special_port(int id) ((id <= HOST_LAST_SPECIAL_KERNEL_PORT) || (id > HOST_MAX_SPECIAL_KERNEL_PORT)); } -extern void * XNU_PTRAUTH_SIGNED_PTR("initproc") initproc; - /* * Kernel interface for setting a special port. */ @@ -1247,23 +1268,15 @@ kernel_set_special_port(host_priv_t host_priv, int id, ipc_port_t port) panic("attempted to set invalid special port %d", id); } -#if !MACH_FLIPC if (id == HOST_NODE_PORT) { return KERN_NOT_SUPPORTED; } -#endif host_lock(host_priv); old_port = host_priv->special[id]; host_priv->special[id] = port; host_unlock(host_priv); -#if MACH_FLIPC - if (id == HOST_NODE_PORT) { - mach_node_port_changed(); - } -#endif - if (IP_VALID(old_port)) { ipc_port_release_send(old_port); } @@ -1312,8 +1325,8 @@ host_set_special_port_from_user(host_priv_t host_priv, int id, ipc_port_t port) * rdar://70585367 * disallow immovable send so other process can't retrieve it through host_get_special_port() */ - if (IP_VALID(port) && port->ip_immovable_send) { - return KERN_INVALID_RIGHT; + if (!ipc_can_stash_naked_send(port)) { + return KERN_DENIED; } return host_set_special_port(host_priv, id, port); @@ -1326,7 +1339,7 @@ host_set_special_port(host_priv_t host_priv, int id, ipc_port_t port) return KERN_INVALID_ARGUMENT; } - if (current_task() != kernel_task && get_bsdtask_info(current_task()) != initproc) { + if (current_task() != kernel_task && !task_is_initproc(current_task())) { bool allowed = (id == HOST_TELEMETRY_PORT && IOTaskHasEntitlement(current_task(), "com.apple.private.xpc.launchd.event-monitor")); #if CONFIG_CSR diff --git a/osfmk/kern/host_notify.c b/osfmk/kern/host_notify.c index de5043b0e..93c7c1d81 100644 --- a/osfmk/kern/host_notify.c +++ b/osfmk/kern/host_notify.c @@ -29,6 +29,8 @@ #include #include +#include + #include #include #include @@ -97,11 +99,14 @@ again: /* * Preserve original ABI of host-notify ports being immovable * as a side effect of being a kobject. - * - * Unlike the original ABI, multiple registrations - * for the same port are now allowed. 
*/ - port->ip_immovable_receive = true; + if (!ip_is_immovable_receive(port)) { + ipc_object_label_t label = ip_label_get(port); + + ipc_release_assert(label.io_state == IO_STATE_IN_SPACE); + label.io_state = IO_STATE_IN_SPACE_IMMOVABLE; + io_label_set_and_put(&port->ip_object, &label); + } enqueue_tail(&host_notify_queue[notify_type], &entry->entries); } diff --git a/osfmk/kern/host_statistics.h b/osfmk/kern/host_statistics.h index d6e12f31f..12af4a781 100644 --- a/osfmk/kern/host_statistics.h +++ b/osfmk/kern/host_statistics.h @@ -56,5 +56,8 @@ SCALABLE_COUNTER_DECLARE(vm_statistics_swapouts); /* # of pages sw SCALABLE_COUNTER_DECLARE(vm_statistics_total_uncompressed_pages_in_compressor); /* # of pages (uncompressed) held within the compressor. */ SCALABLE_COUNTER_DECLARE(vm_page_grab_count); +SCALABLE_COUNTER_DECLARE(vm_page_grab_count_kern); +SCALABLE_COUNTER_DECLARE(vm_page_grab_count_iopl); +SCALABLE_COUNTER_DECLARE(vm_page_grab_count_upl); #endif /* _KERN_HOST_STATISTICS_H_ */ diff --git a/osfmk/kern/hv_io_notifier.c b/osfmk/kern/hv_io_notifier.c index 20aba15a9..60a5cca89 100644 --- a/osfmk/kern/hv_io_notifier.c +++ b/osfmk/kern/hv_io_notifier.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -112,7 +113,7 @@ hv_io_notifier_grp_add(hv_ion_grp_t *grp, const hv_ion_t *notifier) ion->port_name = notifier->port_name; kern_return_t ret = ipc_typed_port_copyin_send(current_task()->itk_space, - ion->port_name, IKOT_UNKNOWN, &ion->port); + ion->port_name, IOT_ANY, &ion->port); if (!IP_VALID(ion->port)) { ret = KERN_FAILURE; @@ -127,7 +128,7 @@ hv_io_notifier_grp_add(hv_ion_grp_t *grp, const hv_ion_t *notifier) if (hv_io_notifier_grp_lookup(grp, ion) != NULL) { lck_rw_done(&grp->lock); - ipc_typed_port_release_send(ion->port, IKOT_UNKNOWN); + ipc_typed_port_release_send(ion->port, IOT_ANY); kfree_type(hv_ion_entry_t, ion); return KERN_FAILURE; } @@ -167,7 +168,7 @@ hv_io_notifier_grp_remove(hv_ion_grp_t *grp, const hv_ion_t *notifier) lck_rw_done(&grp->lock); - ipc_typed_port_release_send(entry->port, IKOT_UNKNOWN); + ipc_typed_port_release_send(entry->port, IOT_ANY); kfree_type(hv_ion_entry_t, entry); return KERN_SUCCESS; @@ -262,7 +263,7 @@ hv_io_notifier_grp_free(hv_ion_grp_t **grp_p) LIST_REMOVE(ion, list); - ipc_typed_port_release_send(ion->port, IKOT_UNKNOWN); + ipc_typed_port_release_send(ion->port, IOT_ANY); kfree_type(hv_ion_entry_t, ion); } diff --git a/osfmk/kern/ipc_clock.c b/osfmk/kern/ipc_clock.c index 8ed3b0c9b..c0773a35d 100644 --- a/osfmk/kern/ipc_clock.c +++ b/osfmk/kern/ipc_clock.c @@ -43,10 +43,9 @@ #include #include #include -#include -#include IPC_KOBJECT_DEFINE(IKOT_CLOCK, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_permanent = true); diff --git a/osfmk/kern/ipc_host.c b/osfmk/kern/ipc_host.c index bd83033b7..bf7c546d1 100644 --- a/osfmk/kern/ipc_host.c +++ b/osfmk/kern/ipc_host.c @@ -74,8 +74,6 @@ #include #include #include -#include -#include #if CONFIG_CSR #include @@ -92,19 +90,24 @@ extern lck_grp_t host_notify_lock_grp; IPC_KOBJECT_DEFINE(IKOT_HOST, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_permanent = true); IPC_KOBJECT_DEFINE(IKOT_HOST_PRIV, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_permanent = true); IPC_KOBJECT_DEFINE(IKOT_PROCESSOR, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_permanent = true); -IPC_KOBJECT_DEFINE(IKOT_PSET, +IPC_KOBJECT_DEFINE(IKOT_PROCESSOR_SET, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_permanent = true); 
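/*
 * Illustrative sketch, not part of the imported patch: how a hypothetical
 * kobject type could adopt the reworked ops table shown in the hunks above,
 * pairing the new .iko_op_movable_send flag with a no-senders callback that
 * uses ipc_kobject_is_mscount_current() (introduced later in this patch).
 * IKOT_EXAMPLE, example_no_senders() and example_destroy() are made-up names;
 * example_destroy() stands in for whatever teardown (typically via
 * ipc_kobject_dealloc_port()) the real type would perform.
 */
static void example_destroy(ipc_port_t port);

static void
example_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
{
	/* Tear the object down only if no new send right raced with us. */
	if (ipc_kobject_is_mscount_current(port, mscount)) {
		example_destroy(port);
	}
}

IPC_KOBJECT_DEFINE(IKOT_EXAMPLE,
    .iko_op_movable_send = true,
    .iko_op_stable       = true,
    .iko_op_no_senders   = example_no_senders);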
-IPC_KOBJECT_DEFINE(IKOT_PSET_NAME, +IPC_KOBJECT_DEFINE(IKOT_PROCESSOR_SET_NAME, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_permanent = true); @@ -204,9 +207,9 @@ ipc_pset_init( processor_set_t pset) { pset->pset_self = ipc_kobject_alloc_port(pset, - IKOT_PSET, IPC_KOBJECT_ALLOC_NONE); + IKOT_PROCESSOR_SET, IPC_KOBJECT_ALLOC_NONE); pset->pset_name_self = ipc_kobject_alloc_port(pset, - IKOT_PSET_NAME, IPC_KOBJECT_ALLOC_NONE); + IKOT_PROCESSOR_SET_NAME, IPC_KOBJECT_ALLOC_NONE); } /* @@ -245,7 +248,7 @@ convert_port_to_host( ipc_kobject_type_t type; if (IP_VALID(port)) { - type = ip_kotype(port); + type = ip_type(port); if (type == IKOT_HOST || type == IKOT_HOST_PRIV) { host = (host_t)ipc_kobject_get_stable(port, type); if (host && host != &realhost) { @@ -326,7 +329,7 @@ convert_port_to_pset( processor_set_t pset = PROCESSOR_SET_NULL; if (IP_VALID(port)) { - pset = ipc_kobject_get_stable(port, IKOT_PSET); + pset = ipc_kobject_get_stable(port, IKOT_PROCESSOR_SET); } return pset; @@ -350,8 +353,8 @@ convert_port_to_pset_name( ipc_kobject_type_t type; if (IP_VALID(port)) { - type = ip_kotype(port); - if (type == IKOT_PSET || type == IKOT_PSET_NAME) { + type = ip_type(port); + if (type == IKOT_PROCESSOR_SET || type == IKOT_PROCESSOR_SET_NAME) { pset = ipc_kobject_get_stable(port, type); } } @@ -370,7 +373,7 @@ ipc_port_t host_port_copy_send(ipc_port_t port) { if (IP_VALID(port)) { - ipc_kobject_type_t kotype = ip_kotype(port); + ipc_kobject_type_t kotype = ip_type(port); if (kotype == IKOT_HOST) { port = ipc_kobject_copy_send(port, @@ -379,8 +382,8 @@ host_port_copy_send(ipc_port_t port) port = ipc_kobject_copy_send(port, host_priv_self(), IKOT_HOST_PRIV); #if CONFIG_CSR - } else if (kotype == IKOT_NONE && - (csr_check(CSR_ALLOW_KERNEL_DEBUGGER) == 0)) { + } else if (!io_is_kobject_type(kotype) && + csr_check(CSR_ALLOW_KERNEL_DEBUGGER) == 0) { port = ipc_port_copy_send_mqueue(port); #endif } else { @@ -448,7 +451,7 @@ ipc_port_t convert_pset_to_port( processor_set_t pset) { - return ipc_kobject_make_send(pset->pset_self, pset, IKOT_PSET); + return ipc_kobject_make_send(pset->pset_self, pset, IKOT_PROCESSOR_SET); } /* @@ -465,7 +468,7 @@ ipc_port_t convert_pset_name_to_port( processor_set_name_t pset) { - return ipc_kobject_make_send(pset->pset_name_self, pset, IKOT_PSET_NAME); + return ipc_kobject_make_send(pset->pset_name_self, pset, IKOT_PROCESSOR_SET_NAME); } /* @@ -505,7 +508,8 @@ host_set_exception_ports( return KERN_INVALID_ARGUMENT; } - kern_return_t kr = set_exception_ports_validation(NULL, exception_mask, new_port, new_behavior, new_flavor, false); + kern_return_t kr = set_exception_ports_validation(NULL, exception_mask, + new_port, new_behavior, new_flavor, false); if (kr != KERN_SUCCESS) { return kr; } @@ -679,7 +683,8 @@ host_swap_exception_ports( return KERN_INVALID_ARGUMENT; } - kern_return_t kr = set_exception_ports_validation(NULL, exception_mask, new_port, new_behavior, new_flavor, false); + kern_return_t kr = set_exception_ports_validation(NULL, exception_mask, + new_port, new_behavior, new_flavor, false); if (kr != KERN_SUCCESS) { return kr; } diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index f91e128d4..1933c2ef5 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -154,11 +154,7 @@ typedef struct { unsigned int kreply_desc_cnt; /* Number of descs in kernel reply msg */ } mig_hash_t; -static void ipc_kobject_subst_once_no_senders(ipc_port_t, mach_msg_type_number_t); - IPC_KOBJECT_DEFINE(IKOT_MEMORY_OBJECT); /* 
vestigial, no real instance */ -IPC_KOBJECT_DEFINE(IKOT_PORT_SUBST_ONCE, - .iko_op_no_senders = ipc_kobject_subst_once_no_senders); #define MAX_MIG_ENTRIES 1031 #define MIG_HASH(x) (x) @@ -201,28 +197,6 @@ static struct mig_kern_subsystem *mig_e[] = { (const struct mig_kern_subsystem *)&mach_eventlink_subsystem, }; -static struct ipc_kobject_ops __security_const_late - ipc_kobject_ops_array[IKOT_MAX_TYPE]; - -__startup_func -void -ipc_kobject_register_startup(ipc_kobject_ops_t ops) -{ - if (ipc_kobject_ops_array[ops->iko_op_type].iko_op_type) { - panic("trying to register kobject(%d) twice", ops->iko_op_type); - } - ipc_kobject_ops_array[ops->iko_op_type] = *ops; -} - -static ipc_kobject_ops_t -ipc_kobject_ops_get(ipc_kobject_type_t ikot) -{ - if (ikot < IKOT_NONE || ikot >= IKOT_MAX_TYPE) { - panic("invalid kobject type %d", ikot); - } - return &ipc_kobject_ops_array[ikot]; -} - __startup_func static void mig_init(void) @@ -682,7 +656,7 @@ ipc_kobject_server( ipc_kmsg_trace_send(request, option); - if (ip_kotype(port) == IKOT_UEXT_OBJECT) { + if (ip_type(port) == IKOT_UEXT_OBJECT) { kr = uext_server(port, request, &reply); } else { kr = ipc_kobject_server_internal(port, request, &reply); @@ -772,7 +746,7 @@ ipc_kobject_server( */ #if DEVELOPMENT || DEBUG printf("%s: refusing to send reply to kobject %d port (id:%d)\n", - __func__, ip_kotype(reply_port), request_msgh_id); + __func__, ip_type(reply_port), request_msgh_id); #endif /* DEVELOPMENT || DEBUG */ ipc_kmsg_destroy(reply, IPC_KMSG_DESTROY_NOT_SIGNED); reply = IKM_NULL; @@ -782,17 +756,15 @@ ipc_kobject_server( return reply; } -static __header_always_inline void +static inline void ipc_kobject_set_raw( ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t type) + ipc_kobject_type_t type, + ipc_kobject_t kobject) { uintptr_t *store = &port->ip_kobject; #if __has_feature(ptrauth_calls) - type |= port->ip_immovable_receive << 14; - type |= port->ip_immovable_send << 15; type ^= OS_PTRAUTH_DISCRIMINATOR("ipc_port.ip_kobject"); kobject = ptrauth_sign_unauthenticated(kobject, ptrauth_key_process_independent_data, @@ -804,17 +776,6 @@ ipc_kobject_set_raw( *store = (uintptr_t)kobject; } -static inline void -ipc_kobject_set_internal( - ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t type) -{ - assert(type != IKOT_NONE); - io_bits_or(ip_to_object(port), type); - ipc_kobject_set_raw(port, kobject, type); -} - /* * Routine: ipc_kobject_get_raw * Purpose: @@ -829,15 +790,13 @@ ipc_kobject_set_internal( */ __header_always_inline ipc_kobject_t ipc_kobject_get_raw( - ipc_port_t port, - ipc_kobject_type_t type) + ipc_port_t port, + ipc_kobject_type_t type) { uintptr_t *store = &port->ip_kobject; ipc_kobject_t kobject = (ipc_kobject_t)*store; #if __has_feature(ptrauth_calls) - type |= port->ip_immovable_receive << 14; - type |= port->ip_immovable_send << 15; type ^= OS_PTRAUTH_DISCRIMINATOR("ipc_port.ip_kobject"); kobject = ptrauth_auth_data(kobject, ptrauth_key_process_independent_data, @@ -856,9 +815,9 @@ ipc_kobject_require_panic( ipc_kobject_t kobject, ipc_kobject_type_t kotype) { - if (ip_kotype(port) != kotype) { + if (ip_type(port) != kotype) { panic("port %p: invalid kobject type, got %d wanted %d", - port, ip_kotype(port), kotype); + port, ip_type(port), kotype); } panic("port %p: invalid kobject, got %p wanted %p", port, ipc_kobject_get_raw(port, kotype), kobject); @@ -872,7 +831,7 @@ ipc_kobject_require( { ipc_kobject_t cur; - if (__improbable(ip_kotype(port) != kotype)) { + if (ip_type(port) != kotype) { 
ipc_kobject_require_panic(port, kobject, kotype); } cur = ipc_kobject_get_raw(port, kotype); @@ -898,13 +857,11 @@ ipc_kobject_require( * The port is a kobject of the proper type. */ ipc_kobject_t -ipc_kobject_get_locked( - ipc_port_t port, - ipc_kobject_type_t type) +ipc_kobject_get_locked(ipc_port_t port, ipc_kobject_type_t type) { ipc_kobject_t kobject = IKO_NULL; - if (ip_active(port) && type == ip_kotype(port)) { + if (ip_active(port) && ip_type(port) == type) { kobject = ipc_kobject_get_raw(port, type); } @@ -924,46 +881,32 @@ ipc_kobject_get_locked( * The port is a kobject of the proper type. */ ipc_kobject_t -ipc_kobject_get_stable( - ipc_port_t port, - ipc_kobject_type_t type) +ipc_kobject_get_stable(ipc_port_t port, ipc_kobject_type_t type) { - assert(ipc_kobject_ops_get(type)->iko_op_stable); + assert(ipc_policy(type)->pol_kobject_stable); return ipc_kobject_get_locked(port, type); } -/* - * Routine: ipc_kobject_init_port - * Purpose: - * Initialize a kobject port with the given types and options. - * - * This function never fails. - */ -static inline void -ipc_kobject_init_port( - ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t type, - ipc_kobject_alloc_options_t options) +ipc_object_label_t +ipc_kobject_label_alloc( + ipc_object_type_t otype, + ipc_label_t label_tag, + ipc_port_t alt_port) { - if (options & IPC_KOBJECT_ALLOC_MAKE_SEND) { - ipc_port_make_send_any_locked(port); - } - if (options & IPC_KOBJECT_ALLOC_NSREQUEST) { - port->ip_nsrequest = IP_KOBJECT_NSREQUEST_ARMED; - ip_reference(port); - } - if (options & IPC_KOBJECT_ALLOC_NO_GRANT) { - port->ip_no_grant = 1; - } - if (options & IPC_KOBJECT_ALLOC_IMMOVABLE_SEND) { - port->ip_immovable_send = 1; - } - if (options & IPC_KOBJECT_ALLOC_PINNED) { - port->ip_pinned = 1; - } + ipc_kobject_label_t kolabel; - ipc_kobject_set_internal(port, kobject, type); + kolabel = zalloc_flags(ipc_kobject_label_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); + kolabel->ikol_label = label_tag; + kolabel->ikol_alt_port = alt_port; + + return IPC_OBJECT_LABEL(otype, .iol_kobject = kolabel); +} + +void +ipc_kobject_label_free(ipc_object_label_t label) +{ + assert(label.iol_kobject->ikol_alt_port == IP_NULL); + zfree(ipc_kobject_label_zone, label.iol_kobject); } /* @@ -979,143 +922,48 @@ ipc_kobject_init_port( ipc_port_t ipc_kobject_alloc_port( ipc_kobject_t kobject, - ipc_kobject_type_t type, - ipc_kobject_alloc_options_t options) + ipc_object_label_t label, + ipc_kobject_alloc_options_t options) { ipc_port_t port; - port = ipc_port_alloc_special(ipc_space_kernel, IPC_PORT_ENFORCE_RIGID_REPLY_PORT_SEMANTICS); - if (port == IP_NULL) { - panic("ipc_kobject_alloc_port(): failed to allocate port"); + port = ipc_port_alloc_special(ipc_space_kernel, label, IP_INIT_NONE); + + if (options & IPC_KOBJECT_ALLOC_MAKE_SEND) { + ipc_port_make_send_any_locked(port); } - ipc_kobject_init_port(port, kobject, type, options); - return port; -} + ipc_kobject_set_raw(port, label.io_type, kobject); -/* - * Routine: ipc_kobject_alloc_labeled_port - * Purpose: - * Allocate a kobject port and associated mandatory access label - * in the kernel space of the specified type. - * - * This function never fails. 
- * - * Conditions: - * No locks held (memory is allocated) - */ - -ipc_port_t -ipc_kobject_alloc_labeled_port( - ipc_kobject_t kobject, - ipc_kobject_type_t type, - ipc_label_t label, - ipc_kobject_alloc_options_t options) -{ - ipc_port_t port; - - port = ipc_kobject_alloc_port(kobject, type, options); - - ipc_port_set_label(port, label); + ip_mq_unlock(port); return port; } -static void -ipc_kobject_subst_once_no_senders( - ipc_port_t port, - mach_port_mscount_t mscount) -{ - ipc_port_t ko_port; - - ko_port = ipc_kobject_dealloc_port(port, mscount, IKOT_PORT_SUBST_ONCE); - - if (ko_port) { - /* - * Clean up the right if the wrapper wasn't hollowed out - * by ipc_kobject_alloc_subst_once(). - */ - ipc_port_release_send(ko_port); - } -} - -/* - * Routine: ipc_kobject_alloc_subst_once - * Purpose: - * Make a port that will be substituted by the kolabel - * rules once, preventing the next substitution (of its target) - * to happen if any. - * - * Returns: - * A port with a send right, that will substitute to its "kobject". - * - * Conditions: - * No locks held (memory is allocated). - * - * `target` holds a send-right donated to this function, - * consumed in ipc_kobject_subst_once_no_senders(). - */ -ipc_port_t -ipc_kobject_alloc_subst_once( - ipc_port_t target) -{ - if (!IP_VALID(target)) { - return target; - } - return ipc_kobject_alloc_labeled_port(target, - IKOT_PORT_SUBST_ONCE, IPC_LABEL_SUBST_ONCE, - IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); -} - -/* - * Routine: ipc_kobject_make_send_lazy_alloc_port - * Purpose: - * Make a send once for a kobject port. - * - * A location owning this port is passed in port_store. - * If no port exists, a port is made lazily. - * - * A send right is made for the port, and if this is the first one - * (possibly not for the first time), then the no-more-senders - * notification is rearmed. - * - * When a notification is armed, the kobject must donate - * one of its references to the port. It is expected - * the no-more-senders notification will consume this reference. - * - * Returns: - * TRUE if a notification was armed - * FALSE else - * - * Conditions: - * Nothing is locked, memory can be allocated. - * The caller must be able to donate a kobject reference to the port. - */ bool ipc_kobject_make_send_lazy_alloc_port( - ipc_port_t *port_store, + ipc_port_t *port_store, ipc_kobject_t kobject, - ipc_kobject_type_t type, - ipc_kobject_alloc_options_t alloc_opts) + ipc_kobject_type_t type) { ipc_port_t port, previous; - kern_return_t kr; + bool was_armed = false; + + assert(ipc_policy(type)->pol_kobject_no_senders && + ipc_policy(type)->pol_kobject_stable); - alloc_opts |= IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST; port = os_atomic_load(port_store, dependency); - if (!IP_VALID(port)) { - port = ipc_kobject_alloc_port(kobject, type, alloc_opts); + port = ipc_kobject_alloc_port(kobject, type, + IPC_KOBJECT_ALLOC_MAKE_SEND); if (os_atomic_cmpxchgv(port_store, IP_NULL, port, &previous, release)) { - return TRUE; + return true; } /* - * undo IPC_KOBJECT_ALLOC_MAKE_SEND, - * ipc_kobject_dealloc_port will handle - * IPC_KOBJECT_ALLOC_NSREQUEST. 
+ * undo IPC_KOBJECT_ALLOC_MAKE_SEND */ port->ip_mscount = 0; port->ip_srights = 0; @@ -1125,144 +973,30 @@ ipc_kobject_make_send_lazy_alloc_port( port = previous; } - kr = ipc_kobject_make_send_nsrequest(port, kobject, type); - assert(kr == KERN_SUCCESS || kr == KERN_ALREADY_WAITING); + ip_mq_lock(port); + ipc_port_make_send_any_locked(port); + was_armed = (port->ip_srights == 1); + ip_mq_unlock(port); - return kr == KERN_SUCCESS; + return was_armed; } -/* - * Routine: ipc_kobject_make_send_lazy_alloc_labeled_port - * Purpose: - * Make a send once for a kobject port. - * - * A location owning this port is passed in port_store. - * If no port exists, a port is made lazily. - * - * A send right is made for the port, and if this is the first one - * (possibly not for the first time), then the no-more-senders - * notification is rearmed. - * - * When a notification is armed, the kobject must donate - * one of its references to the port. It is expected - * the no-more-senders notification will consume this reference. - * - * Returns: - * TRUE if a notification was armed - * FALSE else - * - * Conditions: - * Nothing is locked, memory can be allocated. - * The caller must be able to donate a kobject reference to the port. - */ -boolean_t -ipc_kobject_make_send_lazy_alloc_labeled_port( - ipc_port_t *port_store, - ipc_kobject_t kobject, - ipc_kobject_type_t type, - ipc_label_t label) +bool +ipc_kobject_is_mscount_current_locked(ipc_port_t port, mach_port_mscount_t mscount) { - ipc_port_t port, previous; - kern_return_t kr; - - port = os_atomic_load(port_store, dependency); - - if (!IP_VALID(port)) { - port = ipc_kobject_alloc_labeled_port(kobject, type, label, - IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); - if (os_atomic_cmpxchgv(port_store, IP_NULL, port, &previous, release)) { - return TRUE; - } - - /* - * undo IPC_KOBJECT_ALLOC_MAKE_SEND, - * ipc_kobject_dealloc_port will handle - * IPC_KOBJECT_ALLOC_NSREQUEST. - */ - port->ip_mscount = 0; - port->ip_srights = 0; - ip_release_live(port); - ipc_kobject_dealloc_port(port, 0, type); - - port = previous; - assert(ip_is_kolabeled(port)); - } - - kr = ipc_kobject_make_send_nsrequest(port, kobject, type); - assert(kr == KERN_SUCCESS || kr == KERN_ALREADY_WAITING); - - return kr == KERN_SUCCESS; + return ip_active(port) && port->ip_srights == 0 && port->ip_mscount == mscount; } -/* - * Routine: ipc_kobject_nsrequest_locked - * Purpose: - * Arm the no-senders notification for the given kobject - * if it doesn't have one armed yet. - * - * Conditions: - * Port is locked and active. - * - * Returns: - * KERN_SUCCESS: the notification was armed - * KERN_ALREADY_WAITING: the notification was already armed - * KERN_FAILURE: the notification would fire immediately - */ -static inline kern_return_t -ipc_kobject_nsrequest_locked( - ipc_port_t port, - mach_port_mscount_t sync) +bool +ipc_kobject_is_mscount_current(ipc_port_t port, mach_port_mscount_t mscount) { - if (port->ip_nsrequest == IP_KOBJECT_NSREQUEST_ARMED) { - return KERN_ALREADY_WAITING; - } + bool is_last; - if (port->ip_srights == 0 && sync <= port->ip_mscount) { - return KERN_FAILURE; - } + ip_mq_lock(port); + is_last = ipc_kobject_is_mscount_current_locked(port, mscount); + ip_mq_unlock(port); - port->ip_nsrequest = IP_KOBJECT_NSREQUEST_ARMED; - ip_reference(port); - return KERN_SUCCESS; -} - - -/* - * Routine: ipc_kobject_nsrequest - * Purpose: - * Arm the no-senders notification for the given kobject - * if it doesn't have one armed yet. 
- * - * Returns: - * KERN_SUCCESS: the notification was armed - * KERN_ALREADY_WAITING: the notification was already armed - * KERN_FAILURE: the notification would fire immediately - * KERN_INVALID_RIGHT: the port is dead - */ -kern_return_t -ipc_kobject_nsrequest( - ipc_port_t port, - mach_port_mscount_t sync, - mach_port_mscount_t *mscount) -{ - kern_return_t kr = KERN_INVALID_RIGHT; - - if (IP_VALID(port)) { - ip_mq_lock(port); - - if (mscount) { - *mscount = port->ip_mscount; - } - if (ip_active(port)) { - kr = ipc_kobject_nsrequest_locked(port, sync); - } - - ip_mq_unlock(port); - } else if (mscount) { - *mscount = 0; - } - - return kr; + return is_last; } kern_return_t @@ -1275,15 +1009,15 @@ ipc_typed_port_copyin_send( kern_return_t kr; kr = ipc_object_copyin(space, name, MACH_MSG_TYPE_COPY_SEND, - IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, NULL, portp); + IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND, IPC_COPYIN_KERNEL_DESTINATION, NULL, portp); if (kr != KERN_SUCCESS) { *portp = IP_NULL; return kr; } - if (kotype != IKOT_UNKNOWN && + if (kotype != IOT_ANY && IP_VALID(*portp) && - ip_kotype(*portp) != kotype) { + ip_type(*portp) != kotype) { ipc_port_release_send(*portp); *portp = IP_NULL; return KERN_INVALID_CAPABILITY; @@ -1341,64 +1075,23 @@ ipc_typed_port_release_send( ipc_port_t port, ipc_kobject_type_t kotype) { - if (kotype != IKOT_UNKNOWN && - IP_VALID(port) && - ip_kotype(port) != kotype) { + if (kotype != IOT_ANY && IP_VALID(port) && ip_type(port) != kotype) { ipc_kobject_require_panic(port, IKO_NULL, kotype); } ipc_port_release_send(port); } -kern_return_t -ipc_kobject_make_send_nsrequest( - ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t kotype) -{ - kern_return_t kr = KERN_INVALID_RIGHT; - - if (IP_VALID(port)) { - ip_mq_lock(port); - if (ip_active(port)) { - ipc_kobject_require(port, kobject, kotype); - ipc_port_make_send_any_locked(port); - kr = ipc_kobject_nsrequest_locked(port, 0); - assert(kr != KERN_FAILURE); - } - ip_mq_unlock(port); - } - - return kr; -} - -kern_return_t -ipc_kobject_make_send_nsrequest_locked( - ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t kotype) -{ - kern_return_t kr = KERN_INVALID_RIGHT; - - if (ip_active(port)) { - ipc_kobject_require(port, kobject, kotype); - ipc_port_make_send_any_locked(port); - kr = ipc_kobject_nsrequest_locked(port, 0); - assert(kr != KERN_FAILURE); - } - - return kr; -} - static inline ipc_kobject_t ipc_kobject_disable_internal( ipc_port_t port, + ipc_kobject_label_t kolabel, ipc_kobject_type_t type) { ipc_kobject_t kobject = ipc_kobject_get_raw(port, type); - ipc_kobject_set_raw(port, IKO_NULL, type); - if (ip_is_kolabeled(port)) { - port->ip_kolabel->ikol_alt_port = IP_NULL; + ipc_kobject_set_raw(port, type, IKO_NULL); + if (kolabel) { + kolabel->ikol_alt_port = IP_NULL; } return kobject; @@ -1429,19 +1122,19 @@ ipc_kobject_disable_internal( */ __abortlike static void -__ipc_kobject_dealloc_bad_type_panic(ipc_port_t port, ipc_kobject_type_t type) +__ipc_kobject_bad_type_panic(ipc_port_t port, ipc_kobject_type_t type) { - panic("port %p of type %d, expecting %d", port, ip_kotype(port), type); + panic("port %p of type %d, expecting %d", port, ip_type(port), type); } __abortlike static void __ipc_kobject_dealloc_bad_mscount_panic( ipc_port_t port, - mach_port_mscount_t mscount, + uint64_t mscount, ipc_kobject_type_t type) { - panic("unexpected make-send count: %p[%d], %d, %d", + panic("unexpected make-send count: %p[%d], %d, %lld", port, type, port->ip_mscount, mscount); } @@ 
-1458,30 +1151,31 @@ __ipc_kobject_dealloc_bad_srights_panic( ipc_kobject_t ipc_kobject_dealloc_port_and_unlock( ipc_port_t port, - mach_port_mscount_t mscount, + uint64_t mscount, ipc_kobject_type_t type) { ipc_kobject_t kobject = IKO_NULL; - ipc_kobject_ops_t ops = ipc_kobject_ops_get(type); + ipc_object_policy_t pol = ipc_policy(type); + ipc_object_label_t label = ip_label_get(port, type); - require_ip_active(port); + ipc_release_assert(io_state_active(label.io_state)); - if (ip_kotype(port) != type) { - __ipc_kobject_dealloc_bad_type_panic(port, type); + if (label.io_type != type) { + __ipc_kobject_bad_type_panic(port, type); } - if (mscount && port->ip_mscount != mscount) { + if (mscount != IPC_KOBJECT_NO_MSCOUNT && port->ip_mscount != mscount) { __ipc_kobject_dealloc_bad_mscount_panic(port, mscount, type); } - if ((mscount || ops->iko_op_stable) && port->ip_srights != 0) { + if (port->ip_srights && + (mscount != IPC_KOBJECT_NO_MSCOUNT || pol->pol_kobject_stable)) { __ipc_kobject_dealloc_bad_srights_panic(port, type); } - if (!ops->iko_op_destroy) { - kobject = ipc_kobject_disable_internal(port, type); - } + kobject = ipc_kobject_disable_internal(port, label.iol_kobject, type); - ipc_port_dealloc_special_and_unlock(port, ipc_space_kernel); + ip_label_put(port, &label); + ipc_port_destroy(port); return kobject; } @@ -1513,7 +1207,7 @@ ipc_kobject_dealloc_port_and_unlock( ipc_kobject_t ipc_kobject_dealloc_port( ipc_port_t port, - mach_port_mscount_t mscount, + uint64_t mscount, ipc_kobject_type_t type) { ip_mq_lock(port); @@ -1536,17 +1230,16 @@ ipc_kobject_enable( ipc_kobject_t kobject, ipc_kobject_type_t type) { - assert(!ipc_kobject_ops_get(type)->iko_op_stable); + assert(!ipc_policy(type)->pol_kobject_stable); ip_mq_lock(port); require_ip_active(port); - if (type != ip_kotype(port)) { - panic("%s: unexpected kotype of port %p: want %d, got %d", - __func__, port, type, ip_kotype(port)); + if (ip_type(port) != type) { + __ipc_kobject_bad_type_panic(port, type); } - ipc_kobject_set_raw(port, kobject, type); + ipc_kobject_set_raw(port, type, kobject); ip_mq_unlock(port); } @@ -1556,24 +1249,28 @@ ipc_kobject_enable( * Purpose: * Clear the kobject pointer for a port. * Conditions: - * The port is locked. + * port is locked. * Returns the current kobject pointer. */ ipc_kobject_t -ipc_kobject_disable_locked( - ipc_port_t port, - ipc_kobject_type_t type) +ipc_kobject_disable_locked(ipc_port_t port, ipc_kobject_type_t type) { - if (ip_active(port)) { - assert(!ipc_kobject_ops_get(type)->iko_op_stable); + ipc_object_label_t label; + ipc_kobject_t kobject; + + label = ip_label_get(port); + if (io_state_active(label.io_state)) { + assert(!ipc_policy(type)->pol_kobject_stable); } - if (ip_kotype(port) != type) { - panic("port %p of type %d, expecting %d", - port, ip_kotype(port), type); + if (label.io_type != type) { + __ipc_kobject_bad_type_panic(port, type); } - return ipc_kobject_disable_internal(port, type); + kobject = ipc_kobject_disable_internal(port, label.iol_kobject, type); + ip_label_put(port, &label); + + return kobject; } /* @@ -1585,9 +1282,7 @@ ipc_kobject_disable_locked( * Returns the current kobject pointer. 
*/ ipc_kobject_t -ipc_kobject_disable( - ipc_port_t port, - ipc_kobject_type_t type) +ipc_kobject_disable(ipc_port_t port, ipc_kobject_type_t type) { ipc_kobject_t kobject; @@ -1599,53 +1294,7 @@ ipc_kobject_disable( } /* - * Routine: ipc_kobject_upgrade_mktimer_locked - * Purpose: - * Upgrades a port to mktimer kobject status - * - * This pattern is rather bad as it leads to various - * confusions that need to be special cased with kobject-ness - * of ports. No new port with dual kobject/message-queue - * semantics should be made ever. - * - * Conditions: - * Port is locked - */ -void -ipc_kobject_upgrade_mktimer_locked( - ipc_port_t port, - ipc_kobject_t kobject) -{ - ipc_kobject_set_internal(port, kobject, IKOT_TIMER); -} - -/* - * Routine: ipc_kobject_notify_no_senders - * Purpose: - * Handles a no-senders notification - * sent to a kobject. - * - * A port reference is consumed. - * - * Conditions: - * Nothing locked. - */ -void -ipc_kobject_notify_no_senders( - ipc_port_t port, - mach_port_mscount_t mscount) -{ - ipc_kobject_ops_t ops = ipc_kobject_ops_get(ip_kotype(port)); - - assert(ops->iko_op_no_senders); - ops->iko_op_no_senders(port, mscount); - - /* consume the ref ipc_notify_no_senders_prepare left */ - ip_release(port); -} - -/* - * Routine: ipc_kobject_notify_no_senders + * Routine: ipc_kobject_notify_send_once_and_unlock * Purpose: * Handles a send once notifications * sent to a kobject. @@ -1667,7 +1316,7 @@ ipc_kobject_notify_send_once_and_unlock( * This a simplified version of ipc_port_release_sonce() * since kobjects can't be special reply ports. */ - assert(!port->ip_specialreply); + assert(!ip_is_special_reply_port(port)); ip_sorights_dec(port); ip_mq_unlock(port); @@ -1676,7 +1325,7 @@ ipc_kobject_notify_send_once_and_unlock( * because there's very few consumers, * the code here isn't generic as it's really not worth it. */ - switch (ip_kotype(port)) { + switch (ip_type(port)) { case IKOT_TASK_RESUME: task_suspension_send_once(port); break; @@ -1687,78 +1336,6 @@ ipc_kobject_notify_send_once_and_unlock( ip_release(port); } - -/* - * Routine: ipc_kobject_destroy - * Purpose: - * Release any kernel object resources associated - * with the port, which is being destroyed. - * - * This path to free object resources should only be - * needed when resources are associated with a user's port. - * In the normal case, when the kernel is the receiver, - * the code calling ipc_kobject_dealloc_port() should clean - * up the object resources. - * - * Cleans up any kobject label that might be present. - * Conditions: - * The port is not locked, but it is dead. - */ -void -ipc_kobject_destroy( - ipc_port_t port) -{ - ipc_kobject_ops_t ops = ipc_kobject_ops_get(ip_kotype(port)); - - if (ops->iko_op_permanent) { - panic("trying to destroy a permanent port %p with kobject type: %d", port, ip_kotype(port)); - } - if (ops->iko_op_destroy) { - ops->iko_op_destroy(port); - } - - if (ip_is_kolabeled(port)) { - ipc_kobject_label_t labelp = port->ip_kolabel; - - assert(labelp != NULL); - assert(labelp->ikol_alt_port == IP_NULL); - assert(ip_is_kobject(port)); - port->ip_kolabel = NULL; - io_bits_andnot(ip_to_object(port), IO_BITS_KOLABEL); - zfree(ipc_kobject_label_zone, labelp); - } -} - -/* - * Routine: ipc_kobject_label_substitute_task - * Purpose: - * Substitute a task control port for its immovable - * equivalent when the receiver is that task. - * Conditions: - * Space is write locked and active. - * Port is locked and active. 
- * Returns: - * - IP_NULL port if no substitution is to be done - * - a valid port if a substitution needs to happen - */ -static ipc_port_t -ipc_kobject_label_substitute_task( - ipc_space_t space, - ipc_kobject_label_t kolabel, - ipc_port_t port) -{ - ipc_port_t subst = IP_NULL; - task_t task = ipc_kobject_get_raw(port, IKOT_TASK_CONTROL); - - if (task != TASK_NULL && task == space->is_task) { - if ((subst = kolabel->ikol_alt_port)) { - return subst; - } - } - - return IP_NULL; -} - /* * Routine: ipc_kobject_label_substitute_task_read * Purpose: @@ -1789,36 +1366,6 @@ ipc_kobject_label_substitute_task_read( return IP_NULL; } -/* - * Routine: ipc_kobject_label_substitute_thread - * Purpose: - * Substitute a thread control port for its immovable - * equivalent when it belongs to the receiver task. - * Conditions: - * Space is write locked and active. - * Port is locked and active. - * Returns: - * - IP_NULL port if no substitution is to be done - * - a valid port if a substitution needs to happen - */ -static ipc_port_t -ipc_kobject_label_substitute_thread( - ipc_space_t space, - ipc_kobject_label_t kolabel, - ipc_port_t port) -{ - ipc_port_t subst = IP_NULL; - thread_t thread = ipc_kobject_get_raw(port, IKOT_THREAD_CONTROL); - - if (thread != THREAD_NULL && space->is_task == get_threadtask(thread)) { - if ((subst = kolabel->ikol_alt_port) != IP_NULL) { - return subst; - } - } - - return IP_NULL; -} - /* * Routine: ipc_kobject_label_substitute_thread_read * Purpose: @@ -1850,7 +1397,7 @@ ipc_kobject_label_substitute_thread_read( } /* - * Routine: ipc_kobject_label_check + * Routine: ipc_kobject_label_check_or_substitute * Purpose: * Check to see if the space is allowed to possess * a right for the given port. In order to qualify, @@ -1870,80 +1417,44 @@ ipc_kobject_label_substitute_thread_read( * As of now, substituted ports only happen for send rights. */ bool -ipc_kobject_label_check( - ipc_space_t space, - ipc_port_t port, - mach_msg_type_name_t msgt_name, - ipc_object_copyout_flags_t *flags, - ipc_port_t *subst_portp) +ipc_kobject_label_check_or_substitute( + ipc_space_t space, + ipc_port_t port, + ipc_object_label_t *label, + mach_msg_type_name_t msgt_name, + ipc_port_t *subst_portp) { - ipc_kobject_label_t kolabel; - ipc_label_t label; + ipc_kobject_label_t kolabel = label->iol_kobject; + ipc_label_t label_tag = kolabel->ikol_label; assert(is_active(space)); assert(ip_active(port)); *subst_portp = IP_NULL; - /* Unlabled ports/kobjects are always allowed */ - if (!ip_is_kolabeled(port)) { - return true; - } - /* Never OK to copyout the receive right for a labeled kobject */ if (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE) { - panic("ipc_kobject_label_check: attempted receive right " - "copyout for labeled kobject"); + panic("attempted receive right copyout for labeled kobject"); } - kolabel = port->ip_kolabel; - label = kolabel->ikol_label; - - if ((*flags & IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK) == 0 && - (label & IPC_LABEL_SUBST_MASK)) { + if ((label_tag & IPC_LABEL_SUBST_MASK)) { ipc_port_t subst = IP_NULL; if (msgt_name != MACH_MSG_TYPE_PORT_SEND) { return false; } - if ((label & IPC_LABEL_SUBST_MASK) == IPC_LABEL_SUBST_ONCE) { - /* - * The next check will _not_ substitute. - * hollow out our one-time wrapper, - * and steal its send right. 
- */ - *flags |= IPC_OBJECT_COPYOUT_FLAGS_NO_LABEL_CHECK; - subst = ipc_kobject_disable_locked(port, - IKOT_PORT_SUBST_ONCE); - is_write_unlock(space); - ipc_port_release_send_and_unlock(port); - if (subst == IP_NULL) { - panic("subst-once port %p was consumed twice", port); - } - *subst_portp = subst; - return true; - } - - switch (label & IPC_LABEL_SUBST_MASK) { - case IPC_LABEL_SUBST_TASK: - subst = ipc_kobject_label_substitute_task(space, - kolabel, port); - break; + switch (label_tag & IPC_LABEL_SUBST_MASK) { case IPC_LABEL_SUBST_TASK_READ: subst = ipc_kobject_label_substitute_task_read(space, kolabel, port); break; - case IPC_LABEL_SUBST_THREAD: - subst = ipc_kobject_label_substitute_thread(space, - kolabel, port); - break; case IPC_LABEL_SUBST_THREAD_READ: subst = ipc_kobject_label_substitute_thread_read(space, kolabel, port); break; default: - panic("unexpected label: %llx", label); + panic("unexpected label tag: %llx", label_tag); } if (subst != IP_NULL) { @@ -1963,6 +1474,7 @@ ipc_kobject_label_check( * the no-senders notification. */ + ip_label_put(port, label); ipc_port_release_send_and_unlock(port); /* no check: dPAC integrity */ port = ipc_port_make_send_any(subst); @@ -1972,6 +1484,6 @@ ipc_kobject_label_check( } } - return (label & space->is_label & IPC_LABEL_SPACE_MASK) == - (label & IPC_LABEL_SPACE_MASK); + return (label_tag & space->is_label & IPC_LABEL_SPACE_MASK) == + (label_tag & IPC_LABEL_SPACE_MASK); } diff --git a/osfmk/kern/ipc_kobject.h b/osfmk/kern/ipc_kobject.h index 1f81b31af..4897e0393 100644 --- a/osfmk/kern/ipc_kobject.h +++ b/osfmk/kern/ipc_kobject.h @@ -79,85 +79,12 @@ #endif /* MACH_KERNEL_PRIVATE */ #include #include +#include __BEGIN_DECLS #pragma GCC visibility push(hidden) -__enum_decl(ipc_kotype_t, natural_t, { - IKOT_NONE = 0, - IKOT_THREAD_CONTROL = 1, - IKOT_TASK_CONTROL = 2, - IKOT_HOST = 3, - IKOT_HOST_PRIV = 4, - IKOT_PROCESSOR = 5, - IKOT_PSET = 6, - IKOT_PSET_NAME = 7, - IKOT_TIMER = 8, - IKOT_PORT_SUBST_ONCE = 9, - // IKOT_MIG = 10, - IKOT_MEMORY_OBJECT = 11, - // IKOT_XMM_PAGER = 12, - // IKOT_XMM_KERNEL = 13, - // IKOT_XMM_REPLY = 14, - IKOT_UND_REPLY = 15, - // IKOT_HOST_NOTIFY = 16, - // IKOT_HOST_SECURITY = 17, - // IKOT_LEDGER = 18, - IKOT_MAIN_DEVICE = 19, - IKOT_TASK_NAME = 20, - // IKOT_SUBSYSTEM = 21, - // IKOT_IO_DONE_QUEUE = 22, - IKOT_SEMAPHORE = 23, - // IKOT_LOCK_SET = 24, - IKOT_CLOCK = 25, - // IKOT_CLOCK_CTRL = 26, - IKOT_IOKIT_IDENT = 27, - IKOT_NAMED_ENTRY = 28, - IKOT_IOKIT_CONNECT = 29, - IKOT_IOKIT_OBJECT = 30, - // IKOT_UPL = 31, - // IKOT_MEM_OBJ_CONTROL = 32, -#if CONFIG_AUDIT - IKOT_AU_SESSIONPORT = 33, -#endif - IKOT_FILEPORT = 34, - // IKOT_LABELH = 35, - IKOT_TASK_RESUME = 36, - IKOT_VOUCHER = 37, - // IKOT_VOUCHER_ATTR_CONTROL = 38, - IKOT_WORK_INTERVAL = 39, - IKOT_UX_HANDLER = 40, - IKOT_UEXT_OBJECT = 41, - IKOT_ARCADE_REG = 42, - IKOT_EVENTLINK = 43, - IKOT_TASK_INSPECT = 44, - IKOT_TASK_READ = 45, - IKOT_THREAD_INSPECT = 46, - IKOT_THREAD_READ = 47, - // IKOT_SUID_CRED = 48, -#if HYPERVISOR - IKOT_HYPERVISOR = 49, -#endif - IKOT_TASK_ID_TOKEN = 50, -#if CONFIG_PROC_RESOURCE_LIMITS - IKOT_TASK_FATAL = 51, -#endif - IKOT_KCDATA = 52, -#if CONFIG_EXCLAVES - IKOT_EXCLAVES_RESOURCE = 53, -#endif - /* magic catch-all; should be the last entry */ - IKOT_UNKNOWN, -}); - -#define IKOT_MAX_TYPE (IKOT_UNKNOWN+1) /* # of IKOT_ types */ - -#ifdef __cplusplus -/* preserve legacy ABI for c++ */ -typedef natural_t ipc_kobject_type_t; -#else -typedef ipc_kotype_t ipc_kobject_type_t; -#endif +typedef ipc_object_type_t 
ipc_kobject_type_t; /* set the bitstring index for kobject */ extern kern_return_t ipc_kobject_set_kobjidx( @@ -173,7 +100,7 @@ extern kern_return_t ipc_kobject_set_kobjidx( * Describes the operations for a given kobject. * * @field iko_ko_type - * An @c IKOT_* value. + * An @c IOT_* value. * * @field iko_op_stable * The kobject/port association is stable: @@ -189,29 +116,36 @@ extern kern_return_t ipc_kobject_set_kobjidx( * @field iko_op_no_senders * A callback to run when a NO_SENDERS notification fires. * + * This callback is called each time a kobject port reaches 0 send rights + * (from a non 0 value). There is no need to actively arm no-senders. + * * Kobjects that destroy their port on no senders only are guaranteed * to be called with an active port only. * * However kobject ports that can be destroyed concurrently need * to be prepared for no senders to fail to acquire the kobject port. * - * @field iko_op_destroy - * A callback to run as part of destroying the kobject port. - * * When this callback is set, @c ipc_kobject_dealloc_port() * will not implicitly call @c ipc_kobject_disable(). * * The callback runs after the port has been marked inactive, * hence @c ipc_kobject_get_raw() needs to be used to get to the port. + * + * @field iko_op_label_free + * How to free the label on this kobject port (if it supports one). + * + * @field iko_op_movable_send + * Whether send rights created to this kobject are movable */ typedef const struct ipc_kobject_ops { ipc_kobject_type_t iko_op_type; unsigned long - iko_op_stable : 1, - iko_op_permanent : 1; + iko_op_stable : 1, + iko_op_permanent : 1, + iko_op_movable_send : 1; const char *iko_op_name; void (*iko_op_no_senders)(ipc_port_t port, mach_port_mscount_t mscount); - void (*iko_op_destroy)(ipc_port_t port); + void (*iko_op_label_free)(ipc_object_label_t label); } *ipc_kobject_ops_t; #define IPC_KOBJECT_DEFINE(type, ...) 
\ @@ -229,57 +163,102 @@ struct ipc_kobject_label { ipc_port_t XNU_PTRAUTH_SIGNED_PTR("ipc_kobject_label.ikol_alt_port") ikol_alt_port; }; +extern ipc_object_label_t ipc_kobject_label_alloc( + ipc_object_type_t otype, + ipc_label_t label_tag, + ipc_port_t alt_port); + +extern void ipc_kobject_label_free( + ipc_object_label_t label); + __options_decl(ipc_kobject_alloc_options_t, uint32_t, { /* Just make the naked port */ IPC_KOBJECT_ALLOC_NONE = 0x00000000, /* Make a send right */ IPC_KOBJECT_ALLOC_MAKE_SEND = 0x00000001, - /* Register for no-more-senders */ - IPC_KOBJECT_ALLOC_NSREQUEST = 0x00000002, - /* Make it no grant port */ - IPC_KOBJECT_ALLOC_NO_GRANT = 0x00000004, - /* Mark the port as immovable send right */ - IPC_KOBJECT_ALLOC_IMMOVABLE_SEND = 0x00000008, - /* Add a label structure to the port */ - IPC_KOBJECT_ALLOC_LABEL = 0x00000010, - /* Mark the port as pinned (non dealloc-able) in an ipc space */ - IPC_KOBJECT_ALLOC_PINNED = 0x00000020, }); /* Allocates a kobject port, never fails */ extern ipc_port_t ipc_kobject_alloc_port( ipc_kobject_t kobject, - ipc_kobject_type_t type, + ipc_object_label_t label, ipc_kobject_alloc_options_t options); -/* Allocates a kobject port, never fails */ -extern ipc_port_t ipc_kobject_alloc_labeled_port( +__attribute__((always_inline, overloadable)) +static inline ipc_port_t +ipc_kobject_alloc_port( ipc_kobject_t kobject, - ipc_kobject_type_t type, - ipc_label_t label, - ipc_kobject_alloc_options_t options); + ipc_object_type_t otype, + ipc_kobject_alloc_options_t options) +{ + return ipc_kobject_alloc_port(kobject, IPC_OBJECT_LABEL(otype), options); +} -extern ipc_port_t ipc_kobject_alloc_subst_once( - ipc_port_t target); - -/* Makes a send right, lazily allocating a kobject port, arming for no-senders, never fails */ +/*! + * @function ipc_kobject_make_send_lazy_alloc_port() + * + * @brief + * Make a send once for a kobject port, lazily allocating the port. + * + * @discussion + * A location owning this port is passed in port_store. + * If no port exists, a port is made lazily. + * + * A send right is made for the port, and if this is the first one + * (possibly not for the first time), then the no-more-senders + * notification is rearmed. + * + * When a notification is armed, the kobject must donate + * one of its references to the port. It is expected + * the no-more-senders notification will consume this reference. + * + * In order to use this function, the kobject type requested must: + * - be use stable objects (iko_op_stable is true), + * - have a no-senders callback (iko_op_no_senders is set). + * + * @returns + * - true, if this was the first send right made for this port, + * and an object reference must be donated to the port; + * - false otherwise. + */ extern bool ipc_kobject_make_send_lazy_alloc_port( ipc_port_t *port_store, ipc_kobject_t kobject, - ipc_kobject_type_t type, - ipc_kobject_alloc_options_t alloc_opts); + ipc_kobject_type_t type); -/* Makes a send right, lazily allocating a kobject port, arming for no-senders, never fails */ -extern boolean_t ipc_kobject_make_send_lazy_alloc_labeled_port( - ipc_port_t *port_store, - ipc_kobject_t kobject, - ipc_kobject_type_t type, - ipc_label_t label) __result_use_check; - -extern kern_return_t ipc_kobject_nsrequest( +/*! + * @function ipc_kobject_is_mscount_current() + * + * @brief + * Returns whether the current make-send count is the current one. 
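/*
 * Illustrative sketch, not part of the imported patch: typical use of the
 * slimmed-down three-argument ipc_kobject_make_send_lazy_alloc_port()
 * documented above. struct example, example_reference() and IKOT_EXAMPLE are
 * made-up names; the type is assumed to satisfy the stated contract (stable
 * object, no-senders callback set).
 */
struct example {
	ipc_port_t      ex_port;        /* lazily allocated kobject port */
	/* ... */
};

static void example_reference(struct example *ex);

static ipc_port_t
example_convert_to_port(struct example *ex)
{
	if (ipc_kobject_make_send_lazy_alloc_port(&ex->ex_port,
	    (ipc_kobject_t)ex, IKOT_EXAMPLE)) {
		/*
		 * First send right: donate a reference to the port so the
		 * eventual no-senders notification can consume it.
		 */
		example_reference(ex);
	}
	return ex->ex_port;
}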
+ * + * @discussion + * This is meant to be called from the context of a no-senders notification + * callout to determine whether the object/port has since been rematerialized. + * + * Most kobjects are uniquely owned by their port, and the object is otherwise + * not reachable from any place in the system (see semaphores, eventlink, etc), + * and die when the port has no more senders. + * + * However some kobjects might still be reachable from other means, + * and can make new send rights in a way that isn't synchronized with Mach IPC. + * (See IKOT_TASK_RESUME for an example of that). + * + * This function allows for such kobject types to verify under the + * synchronization it uses whether this no-senders callout is the last one, + * or if there has been new send rights made concurrently. + * + * @param port The target port. + * @param mscount The make-send count for which the no-senders + * notification was issued. + */ +extern bool ipc_kobject_is_mscount_current( ipc_port_t port, - mach_port_mscount_t sync, - mach_port_mscount_t *mscount) __result_use_check; + mach_port_mscount_t mscount); + +extern bool ipc_kobject_is_mscount_current_locked( + ipc_port_t port, + mach_port_mscount_t mscount); /*! * @function ipc_kobject_copy_send() @@ -332,47 +311,16 @@ extern ipc_port_t ipc_kobject_make_send( ipc_kobject_t kobject, ipc_kobject_type_t kotype) __result_use_check; -/*! - * @function ipc_kobject_make_send_nsrequest() - * - * @brief - * Makes a naked send right for the specified kobject port, - * and arms no-more-senders if it wasn't already. - * - * @decription - * @see ipc_port_make_send_any_locked() for a general warning about - * making send rights. - * - * This function will validate that the specified port is pointing - * to the expected kobject pointer and type (by calling ipc_kobject_require()). - * - * @param port The target port. - * @param kobject The kobject pointer this port should be associated to. - * @param kotype The kobject type this port should have. - * - * @returns - * - KERN_SUCCESS: the notification was armed - * - KERN_ALREADY_WAITING: the notification was already armed - * - KERN_INVALID_RIGHT: the port is dead - */ -extern kern_return_t ipc_kobject_make_send_nsrequest( - ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t kotype) __result_use_check; - -extern kern_return_t ipc_kobject_make_send_nsrequest_locked( - ipc_port_t port, - ipc_kobject_t kobject, - ipc_kobject_type_t kotype) __result_use_check; +#define IPC_KOBJECT_NO_MSCOUNT (~0ull) extern ipc_kobject_t ipc_kobject_dealloc_port_and_unlock( ipc_port_t port, - mach_port_mscount_t mscount, + uint64_t mscount, ipc_kobject_type_t type); extern ipc_kobject_t ipc_kobject_dealloc_port( ipc_port_t port, - mach_port_mscount_t mscount, + uint64_t mscount, ipc_kobject_type_t type); extern void ipc_kobject_enable( @@ -424,32 +372,53 @@ extern ipc_kobject_t ipc_kobject_disable( ipc_port_t port, ipc_kobject_type_t type); -extern void ipc_kobject_upgrade_mktimer_locked( - ipc_port_t port, - ipc_kobject_t kobject); - /* Check if a kobject can be copied out to a given space */ -extern bool ipc_kobject_label_check( +extern bool ipc_kobject_label_check_or_substitute( ipc_space_t space, ipc_port_t port, + ipc_object_label_t *label, mach_msg_type_name_t msgt_name, - ipc_object_copyout_flags_t *flags, ipc_port_t *subst_portp) __result_use_check; +/*! + * @brief + * Evaluate a port for substitution and kobject label rules. + * + * @discussion + * This function has a really cumbersome calling convention. 
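/*
 * Illustrative sketch (editor's addition, not part of the diff): how a
 * hypothetical kobject type could adopt the interfaces declared above.
 * IKOT_EXAMPLE, struct example, its ex_port field and
 * example_reference() are placeholder names for this sketch only;
 * everything else comes from the declarations in this hunk.
 */
static void
example_no_senders(ipc_port_t port, mach_port_mscount_t mscount)
{
	/*
	 * The object can be rematerialized from elsewhere, so only tear it
	 * down if no new send right was made since this notification fired.
	 */
	if (!ipc_kobject_is_mscount_current(port, mscount)) {
		return;
	}
	/* ... tear down / drop the donated reference under the object's own lock ... */
}

IPC_KOBJECT_DEFINE(IKOT_EXAMPLE,
    .iko_op_stable     = true,
    .iko_op_no_senders = example_no_senders);

/*
 * Handing out send rights with the lazily allocated port pattern:
 * the object donates one reference the first time a send right is made.
 */
static ipc_port_t
example_make_send(struct example *ex)
{
	if (ipc_kobject_make_send_lazy_alloc_port(&ex->ex_port, ex, IKOT_EXAMPLE)) {
		/* first send right: donate one object reference to the port */
		example_reference(ex);
	}
	return ex->ex_port;
}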
+ * + * If it returns false, then it means that some policy was violated, + * in that case, @c port has been unlocked, and @c label put. + * + * If it returns true, and subst_portp is not IP_NULL, then @c port + * has been unlocked, and @c label put, and the caller is expected + * to redrive evaluation with that substitution port. + * + * If it returns true, and subst_port is IP_NULL, then @c port + * is still locked, and @c label still valid, and the caller is expected + * to proceed further. + * + * @param space The current space + * @param port The port to evaluate (must be locked and active) + * @param label (In/out) the label for @c port. + * @param msgt_name The disposition for @c port in the message. + * @param subst_portp (out) an optional substitution port, + * to replace @c port with. + */ __result_use_check static inline bool -ip_label_check( +ip_label_check_or_substitute( ipc_space_t space, ipc_port_t port, + ipc_object_label_t *label, mach_msg_type_name_t msgt_name, - ipc_object_copyout_flags_t *flags, ipc_port_t *subst_portp) { - if (!ip_is_kolabeled(port)) { + if (!io_is_kobject_type(label->io_type) || !label->iol_kobject) { *subst_portp = IP_NULL; return true; } - return ipc_kobject_label_check(space, port, msgt_name, flags, subst_portp); + return ipc_kobject_label_check_or_substitute(space, port, label, msgt_name, subst_portp); } /* implementation details */ @@ -458,25 +427,14 @@ __startup_func extern void ipc_kobject_register_startup( ipc_kobject_ops_t ops); -/* initialization of kobject subsystem */ -extern void ipc_kobject_init(void); - /* Dispatch a kernel server function */ extern ipc_kmsg_t ipc_kobject_server( ipc_port_t receiver, ipc_kmsg_t request, mach_msg_option64_t option); -/* Release any kernel object resources associated with a port */ -extern void ipc_kobject_destroy( - ipc_port_t port); - #define null_conversion(port) (port) -extern void ipc_kobject_notify_no_senders( - ipc_port_t port, - mach_port_mscount_t mscount); - extern void ipc_kobject_notify_send_once_and_unlock( ipc_port_t port); @@ -496,7 +454,7 @@ extern kern_return_t uext_server( * * @decription * This function will validate that the specified port is pointing - * to the expected kobject type, unless @c kotype is IKOT_UNKNOWN, + * to the expected kobject type, unless @c kotype is IOT_ANY, * in which case any right is accepted. * * @param space The space to copyin in from. @@ -526,7 +484,7 @@ extern kern_return_t ipc_typed_port_copyin_send( * * @description * This is an alias for ipc_port_release_send() that the BSD side can use. - * If @c kotype is IKOT_UNKNOWN, any right is accepted. + * If @c kotype is IOT_ANY, any right is accepted. 
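/*
 * Illustrative sketch (editor's addition, not part of the diff) of the
 * calling convention described above for ip_label_check_or_substitute().
 * copyout_one_port() is a placeholder caller; re-locking the substitution
 * port and re-loading its label are elided because those steps depend on
 * the caller's context.
 */
static bool
copyout_one_port(ipc_space_t space, ipc_port_t port,
    ipc_object_label_t *label, mach_msg_type_name_t msgt_name)
{
	ipc_port_t subst;

	for (;;) {
		/* precondition: port locked and active, *label valid */
		if (!ip_label_check_or_substitute(space, port, label,
		    msgt_name, &subst)) {
			/* policy violation: port already unlocked, label put */
			return false;
		}
		if (subst == IP_NULL) {
			/* port is still locked, label still valid: proceed */
			return true;
		}
		/* port unlocked and label put: redrive with the substitution */
		port = subst;
		/* ... lock `port` and reload *label for it here ... */
	}
}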
*/ extern void ipc_typed_port_release_send( ipc_port_t port, diff --git a/osfmk/kern/ipc_misc.c b/osfmk/kern/ipc_misc.c index c3f1f781f..3242db383 100644 --- a/osfmk/kern/ipc_misc.c +++ b/osfmk/kern/ipc_misc.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -52,8 +53,8 @@ extern void fileport_releasefg(struct fileglob *); ipc_port_t fileport_alloc(struct fileglob *fg) { - return ipc_kobject_alloc_port((ipc_kobject_t)fg, IKOT_FILEPORT, - IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); + return ipc_kobject_alloc_port(fg, IKOT_FILEPORT, + IPC_KOBJECT_ALLOC_MAKE_SEND); } @@ -101,6 +102,7 @@ fileport_no_senders(ipc_port_t port, mach_port_mscount_t mscount) } IPC_KOBJECT_DEFINE(IKOT_FILEPORT, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = fileport_no_senders); diff --git a/osfmk/kern/ipc_tt.c b/osfmk/kern/ipc_tt.c index f72a90ec2..c510efeee 100644 --- a/osfmk/kern/ipc_tt.c +++ b/osfmk/kern/ipc_tt.c @@ -98,6 +98,8 @@ #include #include +#include + #include #include #include @@ -175,7 +177,6 @@ ipc_task_init( ipc_space_t space; ipc_port_t kport; ipc_port_t nport; - ipc_port_t pport; kern_return_t kr; struct label *temp_label; int i; @@ -190,7 +191,6 @@ ipc_task_init( kport = ipc_kobject_alloc_port(IKO_NULL, IKOT_TASK_CONTROL, IPC_KOBJECT_ALLOC_NONE); - pport = kport; nport = ipc_kobject_alloc_port(IKO_NULL, IKOT_TASK_NAME, IPC_KOBJECT_ALLOC_NONE); @@ -207,7 +207,6 @@ ipc_task_init( task->itk_resource_notify = NULL; #endif /* CONFIG_PROC_RESOURCE_LIMITS */ - task->itk_self = pport; task->itk_resume = IP_NULL; /* Lazily allocated on-demand */ #if CONFIG_CSR if (task_is_a_corpse_fork(task)) { @@ -294,52 +293,67 @@ ipc_task_init( } /* - * Routine: ipc_task_set_immovable_pinned + * Routine: ipc_task_copyout_control_port * Purpose: - * Make a task's control port immovable and/or pinned - * according to its control port options. If control port - * is immovable, allocate an immovable control port for the - * task and optionally pin it. - * Conditions: - * Task's control port is movable and not pinned. 
+ * Copyout the task control port as pinned + * and stash the send right name in the port */ void -ipc_task_set_immovable_pinned( +ipc_task_copyout_control_port( task_t task) { ipc_port_t kport = task->itk_task_ports[TASK_FLAVOR_CONTROL]; - ipc_port_t new_pport; + mach_port_name_t name; + ipc_port_t pport; - /* pport is the same as kport at ipc_task_init() time */ - assert(task->itk_self == task->itk_task_ports[TASK_FLAVOR_CONTROL]); #if CONFIG_CSR - assert(task->itk_self == task->itk_settable_self); + assert(kport == task->itk_settable_self); #endif /* CONFIG_CSR */ assert(!task_is_a_corpse(task)); - /* only tasks opt in immovable control port can have pinned control port */ - if (task_is_immovable(task)) { - ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; - - if (task_is_pinned(task)) { - options |= IPC_KOBJECT_ALLOC_PINNED; - } - - new_pport = ipc_kobject_alloc_port(IKO_NULL, IKOT_TASK_CONTROL, options); - - assert(kport != IP_NULL); - ipc_port_set_label(kport, IPC_LABEL_SUBST_TASK); - kport->ip_kolabel->ikol_alt_port = new_pport; - - itk_lock(task); - task->itk_self = new_pport; - itk_unlock(task); - - /* enable the pinned port */ - ipc_kobject_enable(new_pport, task, IKOT_TASK_CONTROL); + pport = ipc_port_make_send_any(kport); + /* + * mach_task_self() is pinned, memorize the name we gave it + * in ip_receiver_name (it's an abuse as this port really + * isn't a message queue, but the field is up for grabs + * and otherwise `MACH_PORT_SPECIAL_DEFAULT` for special ports). + * + * port_name_to_task* use this to fastpath IPCs. + * + * ipc_task_disable() will revert this when the task dies. + */ + name = ipc_port_copyout_send_pinned(pport, task->itk_space); + if (MACH_PORT_VALID(name)) { + pport->ip_receiver_name = name; } } +/* + * Routine: ipc_thread_set_immovable_pinned + * Purpose: + * Copyout the thread control port as pinned and immovable + * and stash the send right name in the port + * Notes: + * Consumes a thread ref; produces a naked send right + * which may be invalid.
+ */ +void +ipc_thread_set_immovable_pinned( + thread_t thread) +{ + ipc_port_t kport = convert_thread_to_port_immovable(thread); + + task_t task = get_threadtask(thread); + mach_port_name_t name; + +#if CONFIG_CSR + assert(kport == thread->t_tro->tro_settable_self_port); +#endif /* CONFIG_CSR */ + assert(!task_is_a_corpse(task)); + + name = ipc_port_copyout_send_pinned(kport, task->itk_space); +} + /* * Routine: ipc_task_enable * Purpose: @@ -355,7 +369,8 @@ ipc_task_enable( ipc_port_t nport; ipc_port_t iport; ipc_port_t rdport; - ipc_port_t pport; + + ipc_space_set_policy(task->itk_space, ipc_policy_for_task(task)); itk_lock(task); if (!task->active) { @@ -375,7 +390,7 @@ ipc_task_enable( return; } - assert(task->map->owning_task == task); /* verify vm_map_setup called */ + assert(task_is_a_corpse(task) || task->map->owning_task == task); /* verify vm_map_setup called */ assert(!task->ipc_active || task_is_a_corpse(task)); task->ipc_active = true; @@ -395,11 +410,6 @@ ipc_task_enable( if (rdport != IP_NULL) { ipc_kobject_enable(rdport, task, IKOT_TASK_READ); } - pport = task->itk_self; - if (pport != kport && pport != IP_NULL) { - assert(task_is_immovable(task)); - ipc_kobject_enable(pport, task, IKOT_TASK_CONTROL); - } itk_unlock(task); } @@ -421,7 +431,6 @@ ipc_task_disable( ipc_port_t iport; ipc_port_t rdport; ipc_port_t rport; - ipc_port_t pport; itk_lock(task); @@ -437,7 +446,6 @@ ipc_task_disable( kport = task->itk_task_ports[TASK_FLAVOR_CONTROL]; if (kport != IP_NULL) { - /* clears ikol_alt_port */ ipc_kobject_disable(kport, IKOT_TASK_CONTROL); } nport = task->itk_task_ports[TASK_FLAVOR_NAME]; @@ -453,16 +461,6 @@ ipc_task_disable( /* clears ikol_alt_port */ ipc_kobject_disable(rdport, IKOT_TASK_READ); } - pport = task->itk_self; - if (pport != IP_NULL) { - /* see port_name_is_pinned_itk_self() */ - pport->ip_receiver_name = MACH_PORT_SPECIAL_DEFAULT; - if (pport != kport) { - assert(task_is_immovable(task)); - assert(pport->ip_immovable_send); - ipc_kobject_disable(pport, IKOT_TASK_CONTROL); - } - } rport = task->itk_resume; if (rport != IP_NULL) { @@ -501,7 +499,6 @@ ipc_task_terminate( ipc_port_t iport; ipc_port_t rdport; ipc_port_t rport; - ipc_port_t pport; #if CONFIG_CSR ipc_port_t sself; #endif /* CONFIG_CSR */ @@ -521,7 +518,6 @@ ipc_task_terminate( #if CONFIG_CSR sself = task->itk_settable_self; #endif /* CONFIG_CSR */ - pport = IP_NULL; if (kport == IP_NULL) { /* the task is already terminated (can this happen?) 
*/ @@ -545,9 +541,6 @@ ipc_task_terminate( task->itk_dyld_notify = NULL; } - pport = task->itk_self; - task->itk_self = IP_NULL; - rport = task->itk_resume; task->itk_resume = IP_NULL; @@ -612,21 +605,22 @@ ipc_task_terminate( /* clears read port ikol_alt_port, must be done first */ if (rdport != IP_NULL) { - ipc_kobject_dealloc_port(rdport, 0, IKOT_TASK_READ); + ipc_kobject_dealloc_port(rdport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_READ); } - ipc_kobject_dealloc_port(kport, 0, IKOT_TASK_CONTROL); - /* ikol_alt_port cleared */ + ipc_kobject_dealloc_port(kport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_CONTROL); /* destroy other kernel ports */ - ipc_kobject_dealloc_port(nport, 0, IKOT_TASK_NAME); + ipc_kobject_dealloc_port(nport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_NAME); if (iport != IP_NULL) { - ipc_kobject_dealloc_port(iport, 0, IKOT_TASK_INSPECT); - } - if (pport != IP_NULL && pport != kport) { - ipc_kobject_dealloc_port(pport, 0, IKOT_TASK_CONTROL); + ipc_kobject_dealloc_port(iport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_INSPECT); } if (rport != IP_NULL) { - ipc_kobject_dealloc_port(rport, 0, IKOT_TASK_RESUME); + ipc_kobject_dealloc_port(rport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_RESUME); } itk_lock_destroy(task); @@ -648,7 +642,7 @@ void ipc_task_reset( task_t task) { - ipc_port_t old_kport, old_pport, new_kport, new_pport; + ipc_port_t old_kport, new_kport; #if CONFIG_CSR ipc_port_t old_sself; #endif /* CONFIG_CSR */ @@ -668,31 +662,23 @@ ipc_task_reset( /* * ipc_task_reset() only happens during sugid or corpsify. * - * (1) sugid happens early in exec_mach_imgact(), at which point the old task - * port has not been enabled, and is left movable/not pinned. - * (2) corpse cannot execute more code so the notion of the immovable/pinned - * task port is bogus, and should appear as if it doesn't have one. + * (1) sugid happens early in exec_mach_imgact(), + * at which point the old task port has not been enabled, + * and is left movable. + * (2) corpse cannot execute more code so the notion of the immovable + * task port is bogus, and should appear as if it doesn't have one. * - * So simply leave pport the same as kport. */ - new_pport = new_kport; - itk_lock(task); old_kport = task->itk_task_ports[TASK_FLAVOR_CONTROL]; old_rdport = task->itk_task_ports[TASK_FLAVOR_READ]; old_iport = task->itk_task_ports[TASK_FLAVOR_INSPECT]; - old_pport = task->itk_self; - - if (old_pport == IP_NULL) { + if (old_kport == IP_NULL) { /* the task is already terminated (can this happen?) 
*/ itk_unlock(task); - ipc_kobject_dealloc_port(new_kport, 0, IKOT_TASK_CONTROL); - if (new_pport != new_kport) { - assert(task_is_immovable(task)); - ipc_kobject_dealloc_port(new_pport, 0, IKOT_TASK_CONTROL); - } + ipc_kobject_dealloc_port(new_kport, IPC_KOBJECT_NO_MSCOUNT, IKOT_TASK_CONTROL); #if CONFIG_MACF mac_exc_free_label(unset_label); #endif @@ -700,7 +686,6 @@ ipc_task_reset( } task->itk_task_ports[TASK_FLAVOR_CONTROL] = new_kport; - task->itk_self = new_pport; #if CONFIG_CSR old_sself = task->itk_settable_self; @@ -713,18 +698,12 @@ ipc_task_reset( } #endif /* CONFIG_CSR */ - /* clears ikol_alt_port */ ipc_kobject_disable(old_kport, IKOT_TASK_CONTROL); /* Reset the read and inspect flavors of task port */ task->itk_task_ports[TASK_FLAVOR_READ] = IP_NULL; task->itk_task_ports[TASK_FLAVOR_INSPECT] = IP_NULL; - if (old_pport != old_kport) { - assert(task_is_immovable(task)); - ipc_kobject_disable(old_pport, IKOT_TASK_CONTROL); - } - if (IP_VALID(task->hardened_exception_action.ea.port) && !task->hardened_exception_action.ea.privileged) { old_hardened_exception = task->hardened_exception_action.ea.port; @@ -790,17 +769,15 @@ ipc_task_reset( /* destroy all task port flavors */ if (old_rdport != IP_NULL) { /* read port ikol_alt_port may point to kport, dealloc first */ - ipc_kobject_dealloc_port(old_rdport, 0, IKOT_TASK_READ); + ipc_kobject_dealloc_port(old_rdport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_READ); } - ipc_kobject_dealloc_port(old_kport, 0, IKOT_TASK_CONTROL); - /* ikol_alt_port cleared */ + ipc_kobject_dealloc_port(old_kport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_CONTROL); if (old_iport != IP_NULL) { - ipc_kobject_dealloc_port(old_iport, 0, IKOT_TASK_INSPECT); - } - if (old_pport != old_kport) { - assert(task_is_immovable(task)); - ipc_kobject_dealloc_port(old_pport, 0, IKOT_TASK_CONTROL); + ipc_kobject_dealloc_port(old_iport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_INSPECT); } } @@ -814,42 +791,20 @@ ipc_task_reset( void ipc_thread_init( - task_t task, + __unused task_t task, thread_t thread, - thread_ro_t tro, - ipc_thread_init_options_t options) + thread_ro_t tro) { - ipc_port_t kport; - ipc_port_t pport; - ipc_kobject_alloc_options_t alloc_options = IPC_KOBJECT_ALLOC_NONE; + ipc_port_t kport; - if (task_is_immovable(task) && !(options & IPC_THREAD_INIT_MAINTHREAD)) { - /* - * pthreads and raw threads both have immovable port upon creation. - * pthreads are subsequently pinned via ipc_port_copyout_send_pinned() whereas - * raw threads are left unpinned. - */ - alloc_options |= IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; + /* + * pthreads are subsequently pinned via + * ipc_port_copyout_send_pinned() whereas raw threads are left + * unpinned. + */ + kport = ipc_kobject_alloc_port(thread, IKOT_THREAD_CONTROL, + IPC_KOBJECT_ALLOC_NONE); - pport = ipc_kobject_alloc_port((ipc_kobject_t)thread, - IKOT_THREAD_CONTROL, alloc_options); - - kport = ipc_kobject_alloc_labeled_port((ipc_kobject_t)thread, - IKOT_THREAD_CONTROL, IPC_LABEL_SUBST_THREAD, IPC_KOBJECT_ALLOC_NONE); - kport->ip_kolabel->ikol_alt_port = pport; - } else { - /* - * Main thread is created movable but may be set immovable and pinned in - * main_thread_set_immovable_pinned(). It needs to be handled separately - * because task_control_port_options is not available at main thread creation time. 
- */ - kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, - IKOT_THREAD_CONTROL, IPC_KOBJECT_ALLOC_NONE); - - pport = kport; - } - - tro->tro_self_port = pport; /* we just made the port, no need to triple check */ #if CONFIG_CSR tro->tro_settable_self_port = ipc_port_make_send_any(kport); @@ -868,48 +823,6 @@ ipc_thread_init( thread->ith_kernel_reply_port = IP_NULL; } -void -ipc_main_thread_set_immovable_pinned(thread_t thread) -{ - thread_ro_t tro = get_thread_ro(thread); - ipc_port_t kport = tro->tro_ports[THREAD_FLAVOR_CONTROL]; - task_t task = tro->tro_task; - ipc_port_t new_pport; - - assert(thread_get_tag(thread) & THREAD_TAG_MAINTHREAD); - - /* pport is the same as kport at ipc_thread_init() time */ - assert(tro->tro_self_port == tro->tro_ports[THREAD_FLAVOR_CONTROL]); -#if CONFIG_CSR - assert(tro->tro_self_port == tro->tro_settable_self_port); -#endif /* CONFIG_CSR */ - - /* - * Main thread port is immovable/pinned depending on whether owner task has - * immovable/pinned task control port. task_control_port_options is now set. - */ - if (task_is_immovable(task)) { - ipc_kobject_alloc_options_t options = IPC_KOBJECT_ALLOC_IMMOVABLE_SEND; - - if (task_is_pinned(task)) { - options |= IPC_KOBJECT_ALLOC_PINNED; - } - - new_pport = ipc_kobject_alloc_port(IKO_NULL, IKOT_THREAD_CONTROL, options); - - assert(kport != IP_NULL); - ipc_port_set_label(kport, IPC_LABEL_SUBST_THREAD); - kport->ip_kolabel->ikol_alt_port = new_pport; - - thread_mtx_lock(thread); - zalloc_ro_update_field(ZONE_ID_THREAD_RO, tro, tro_self_port, &new_pport); - thread_mtx_unlock(thread); - - /* enable the pinned port */ - ipc_kobject_enable(new_pport, thread, IKOT_THREAD_CONTROL); - } -} - struct thread_init_exc_actions { struct exception_action array[EXC_TYPES_COUNT]; }; @@ -956,27 +869,22 @@ ipc_thread_ro_update_ports( thread_ro_t tro, const struct thread_ro *tro_tpl) { - vm_size_t offs = offsetof(struct thread_ro, tro_self_port); - vm_size_t size = sizeof(struct ipc_port *) + + vm_size_t offs = offsetof(struct thread_ro, tro_ports); + vm_size_t size = sizeof(tro_tpl->tro_ports) #if CONFIG_CSR - sizeof(struct ipc_port *) + + + sizeof(struct ipc_port *); +#else + ; #endif /* CONFIG_CSR */ - sizeof(tro_tpl->tro_ports); #if CONFIG_CSR static_assert(offsetof(struct thread_ro, tro_settable_self_port) == - offsetof(struct thread_ro, tro_self_port) + - sizeof(struct ipc_port_t *)); + offsetof(struct thread_ro, tro_ports) + + sizeof(tro_tpl->tro_ports)); #endif /* CONFIG_CSR */ - static_assert(offsetof(struct thread_ro, tro_ports) == - offsetof(struct thread_ro, tro_self_port) + -#if CONFIG_CSR - sizeof(struct ipc_port_t *) + -#endif /* CONFIG_CSR */ - sizeof(struct ipc_port_t *)); zalloc_ro_mut(ZONE_ID_THREAD_RO, tro, - offs, &tro_tpl->tro_self_port, size); + offs, &tro_tpl->tro_ports, size); } /* @@ -994,7 +902,6 @@ ipc_thread_disable( ipc_port_t kport = tro->tro_ports[THREAD_FLAVOR_CONTROL]; ipc_port_t iport = tro->tro_ports[THREAD_FLAVOR_INSPECT]; ipc_port_t rdport = tro->tro_ports[THREAD_FLAVOR_READ]; - ipc_port_t pport = tro->tro_self_port; /* * This innocuous looking line is load bearing. 
@@ -1007,7 +914,6 @@ ipc_thread_disable( thread->ipc_active = false; if (kport != IP_NULL) { - /* clears ikol_alt_port */ ipc_kobject_disable(kport, IKOT_THREAD_CONTROL); } @@ -1020,12 +926,6 @@ ipc_thread_disable( ipc_kobject_disable(rdport, IKOT_THREAD_READ); } - if (pport != kport && pport != IP_NULL) { - assert(task_is_immovable(tro->tro_task)); - assert(pport->ip_immovable_send); - ipc_kobject_disable(pport, IKOT_THREAD_CONTROL); - } - /* unbind the thread special reply port */ if (IP_VALID(thread->ith_special_reply_port)) { ipc_port_unbind_special_reply_port(thread, IRPT_USER); @@ -1048,7 +948,6 @@ ipc_thread_terminate( ipc_port_t kport = IP_NULL; ipc_port_t iport = IP_NULL; ipc_port_t rdport = IP_NULL; - ipc_port_t pport = IP_NULL; #if CONFIG_CSR ipc_port_t sport = IP_NULL; #endif /* CONFIG_CSR */ @@ -1066,7 +965,6 @@ ipc_thread_terminate( kport = tro->tro_ports[THREAD_FLAVOR_CONTROL]; iport = tro->tro_ports[THREAD_FLAVOR_INSPECT]; rdport = tro->tro_ports[THREAD_FLAVOR_READ]; - pport = tro->tro_self_port; #if CONFIG_CSR sport = tro->tro_settable_self_port; #endif /* CONFIG_CSR */ @@ -1099,20 +997,17 @@ ipc_thread_terminate( /* clears read port ikol_alt_port, must be done first */ if (rdport != IP_NULL) { - ipc_kobject_dealloc_port(rdport, 0, IKOT_THREAD_READ); + ipc_kobject_dealloc_port(rdport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_THREAD_READ); } - /* control port can also have ikol_alt_port */ - if (kport != IP_NULL) { - ipc_kobject_dealloc_port(kport, 0, IKOT_THREAD_CONTROL); - } - /* ikol_alt_port cleared */ - if (iport != IP_NULL) { - ipc_kobject_dealloc_port(iport, 0, IKOT_THREAD_INSPECT); + if (kport != IP_NULL) { + ipc_kobject_dealloc_port(kport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_THREAD_CONTROL); } - if (pport != kport && pport != IP_NULL) { - assert(task_is_immovable(tro->tro_task)); - ipc_kobject_dealloc_port(pport, 0, IKOT_THREAD_CONTROL); + if (iport != IP_NULL) { + ipc_kobject_dealloc_port(iport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_THREAD_INSPECT); } if (thread->ith_kernel_reply_port != IP_NULL) { thread_dealloc_kernel_special_reply_port(thread); @@ -1137,7 +1032,7 @@ ipc_thread_reset( thread_t thread) { thread_ro_t tro = get_thread_ro(thread); - ipc_port_t old_kport, new_kport, old_pport, new_pport; + ipc_port_t old_kport, new_kport; #if CONFIG_CSR ipc_port_t old_sself; #endif /* CONFIG_CSR */ @@ -1145,28 +1040,22 @@ ipc_thread_reset( ipc_port_t old_iport; ipc_port_t old_exc_actions[EXC_TYPES_COUNT]; boolean_t has_old_exc_actions = FALSE; - boolean_t thread_is_immovable; int i; #if CONFIG_MACF struct label *new_label = mac_exc_create_label(NULL); #endif - thread_is_immovable = ip_is_immovable_send(tro->tro_self_port); - new_kport = ipc_kobject_alloc_port((ipc_kobject_t)thread, IKOT_THREAD_CONTROL, IPC_KOBJECT_ALLOC_NONE); /* * ipc_thread_reset() only happens during sugid or corpsify. * - * (1) sugid happens early in exec_mach_imgact(), at which point the old thread - * port is still movable/not pinned. - * (2) corpse cannot execute more code so the notion of the immovable/pinned - * thread port is bogus, and should appear as if it doesn't have one. - * - * So simply leave pport the same as kport. + * (1) sugid happens early in exec_mach_imgact(), at which point + * the old thread port is still movable. + * (2) corpse cannot execute more code so the notion of the immovable + * thread port is bogus, and should appear as if it doesn't have one. 
*/ - new_pport = new_kport; thread_mtx_lock(thread); @@ -1177,16 +1066,12 @@ ipc_thread_reset( #if CONFIG_CSR old_sself = tro->tro_settable_self_port; #endif /* CONFIG_CSR */ - old_pport = tro->tro_self_port; if (old_kport == IP_NULL && thread->inspection == FALSE) { /* thread is already terminated (can this happen?) */ thread_mtx_unlock(thread); - ipc_kobject_dealloc_port(new_kport, 0, IKOT_THREAD_CONTROL); - if (thread_is_immovable) { - ipc_kobject_dealloc_port(new_pport, 0, - IKOT_THREAD_CONTROL); - } + ipc_kobject_dealloc_port(new_kport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_THREAD_CONTROL); #if CONFIG_MACF mac_exc_free_label(new_label); #endif @@ -1196,18 +1081,16 @@ ipc_thread_reset( thread->ipc_active = true; struct thread_ro tpl = { - .tro_self_port = new_pport, + .tro_ports[THREAD_FLAVOR_CONTROL] = new_kport, /* we just made the port, no need to triple check */ #if CONFIG_CSR .tro_settable_self_port = ipc_port_make_send_any(new_kport), #endif /* CONFIG_CSR */ - .tro_ports[THREAD_FLAVOR_CONTROL] = new_kport, }; ipc_thread_ro_update_ports(tro, &tpl); if (old_kport != IP_NULL) { - /* clears ikol_alt_port */ (void)ipc_kobject_disable(old_kport, IKOT_THREAD_CONTROL); } if (old_rdport != IP_NULL) { @@ -1217,9 +1100,6 @@ ipc_thread_reset( if (old_iport != IP_NULL) { (void)ipc_kobject_disable(old_iport, IKOT_THREAD_INSPECT); } - if (thread_is_immovable && old_pport != IP_NULL) { - (void)ipc_kobject_disable(old_pport, IKOT_THREAD_CONTROL); - } /* * Only ports that were set by root-owned processes @@ -1261,19 +1141,18 @@ ipc_thread_reset( /* destroy the kernel ports */ if (old_rdport != IP_NULL) { - ipc_kobject_dealloc_port(old_rdport, 0, IKOT_THREAD_READ); + ipc_kobject_dealloc_port(old_rdport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_THREAD_READ); + /* ikol_alt_port cleared */ } if (old_kport != IP_NULL) { - ipc_kobject_dealloc_port(old_kport, 0, IKOT_THREAD_CONTROL); + ipc_kobject_dealloc_port(old_kport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_THREAD_CONTROL); } - /* ikol_alt_port cleared */ if (old_iport != IP_NULL) { - ipc_kobject_dealloc_port(old_iport, 0, IKOT_THREAD_INSPECT); - } - if (old_pport != old_kport && old_pport != IP_NULL) { - assert(thread_is_immovable); - ipc_kobject_dealloc_port(old_pport, 0, IKOT_THREAD_CONTROL); + ipc_kobject_dealloc_port(old_iport, IPC_KOBJECT_NO_MSCOUNT, + IKOT_THREAD_INSPECT); } /* unbind the thread special reply port */ @@ -1299,33 +1178,20 @@ retrieve_task_self_fast( task_t task) { ipc_port_t port = IP_NULL; + ipc_port_t kport = task->itk_task_ports[TASK_FLAVOR_CONTROL]; assert(task == current_task()); itk_lock(task); - assert(task->itk_self != IP_NULL); + assert(kport != IP_NULL); #if CONFIG_CSR - if (task->itk_settable_self != task->itk_task_ports[TASK_FLAVOR_CONTROL]) { + if (task->itk_settable_self != kport) { port = ipc_port_copy_send_mqueue(task->itk_settable_self); } else #endif { - /* no interposing, return the IMMOVABLE port */ - port = ipc_kobject_make_send(task->itk_self, task, - IKOT_TASK_CONTROL); -#if (DEBUG || DEVELOPMENT) - if (task_is_immovable(task)) { - assert(ip_is_immovable_send(port)); - if (task_is_pinned(task)) { - /* pinned port is also immovable */ - assert(ip_is_pinned(port)); - } - } else { - assert(!ip_is_immovable_send(port)); - assert(!ip_is_pinned(port)); - } -#endif + port = ipc_kobject_make_send(kport, task, IKOT_TASK_CONTROL); } itk_unlock(task); @@ -1376,32 +1242,14 @@ retrieve_thread_self_fast( thread_mtx_lock(thread); - assert(tro->tro_self_port != IP_NULL); - #if CONFIG_CSR if (tro->tro_settable_self_port != 
tro->tro_ports[THREAD_FLAVOR_CONTROL]) { port = ipc_port_copy_send_mqueue(tro->tro_settable_self_port); } else #endif { - /* no interposing, return IMMOVABLE_PORT */ - port = ipc_kobject_make_send(tro->tro_self_port, thread, - IKOT_THREAD_CONTROL); -#if (DEBUG || DEVELOPMENT) - if (task_is_immovable(tro->tro_task)) { - assert(ip_is_immovable_send(port)); - uint16_t tag = thread_get_tag(thread); - /* terminated threads are unpinned */ - if (thread->active && (tag & (THREAD_TAG_PTHREAD | THREAD_TAG_MAINTHREAD))) { - assert(ip_is_pinned(port)); - } else { - assert(!ip_is_pinned(port)); - } - } else { - assert(!ip_is_immovable_send(port)); - assert(!ip_is_pinned(port)); - } -#endif + port = ipc_kobject_make_send(tro->tro_ports[THREAD_FLAVOR_CONTROL], + thread, IKOT_THREAD_CONTROL); } thread_mtx_unlock(thread); @@ -1412,7 +1260,7 @@ retrieve_thread_self_fast( /* * Routine: task_self_trap [mach trap] * Purpose: - * Give the caller send rights for his own task port. + * Give the caller send rights for their own task port. * Conditions: * Nothing locked. * Returns: @@ -1426,35 +1274,9 @@ task_self_trap( { task_t task = current_task(); ipc_port_t sright; - mach_port_name_t name; sright = retrieve_task_self_fast(task); - name = ipc_port_copyout_send(sright, task->itk_space); - - /* - * When the right is pinned, memorize the name we gave it - * in ip_receiver_name (it's an abuse as this port really - * isn't a message queue, but the field is up for grabs - * and otherwise `MACH_PORT_SPECIAL_DEFAULT` for special ports). - * - * port_name_to_task* use this to fastpath IPCs to mach_task_self() - * when it is pinned. - * - * ipc_task_disable() will revert this when the task dies. - */ - if (sright == task->itk_self && sright->ip_pinned && - MACH_PORT_VALID(name)) { - itk_lock(task); - if (task->ipc_active) { - if (ip_get_receiver_name(sright) == MACH_PORT_SPECIAL_DEFAULT) { - sright->ip_receiver_name = name; - } else if (ip_get_receiver_name(sright) != name) { - panic("mach_task_self() name changed"); - } - } - itk_unlock(task); - } - return name; + return ipc_port_copyout_send(sright, task->itk_space); } /* @@ -1501,8 +1323,8 @@ mach_reply_port( mach_port_name_t name; kern_return_t kr; - kr = ipc_port_alloc(current_task()->itk_space, IPC_PORT_INIT_MESSAGE_QUEUE, - &name, &port); + kr = ipc_port_alloc(current_space(), IPC_OBJECT_LABEL(IOT_PORT), + IP_INIT_NONE, &name, &port); if (kr == KERN_SUCCESS) { ip_mq_unlock(port); } else { @@ -1531,15 +1353,14 @@ thread_get_special_reply_port( mach_port_name_t name; kern_return_t kr; thread_t thread = current_thread(); - ipc_port_init_flags_t flags = IPC_PORT_INIT_MESSAGE_QUEUE | - IPC_PORT_INIT_MAKE_SEND_RIGHT | IPC_PORT_INIT_SPECIAL_REPLY; /* unbind the thread special reply port */ if (IP_VALID(thread->ith_special_reply_port)) { ipc_port_unbind_special_reply_port(thread, IRPT_USER); } - kr = ipc_port_alloc(current_task()->itk_space, flags, &name, &port); + kr = ipc_port_alloc(current_space(), IPC_OBJECT_LABEL(IOT_SPECIAL_REPLY_PORT), + IP_INIT_MAKE_SEND_RIGHT, &name, &port); if (kr == KERN_SUCCESS) { ipc_port_bind_special_reply_port_locked(port, IRPT_USER); ip_mq_unlock(port); @@ -1572,13 +1393,14 @@ thread_get_kernel_special_reply_port(void) ipc_port_unbind_special_reply_port(thread, IRPT_KERNEL); } - port = ipc_port_alloc_reply(); /*returns a reference on the port */ - if (port != IPC_PORT_NULL) { - ip_mq_lock(port); - ipc_port_bind_special_reply_port_locked(port, IRPT_KERNEL); - ip_mq_unlock(port); - ip_release(port); /* release the reference returned by 
ipc_port_alloc_reply */ - } + port = ipc_port_alloc_special(ipc_space_reply, + IPC_OBJECT_LABEL(IOT_SPECIAL_REPLY_PORT), IP_INIT_NONE); + ipc_port_bind_special_reply_port_locked(port, IRPT_KERNEL); + ip_mq_unlock(port); + + /* release the reference returned by ipc_port_alloc_special */ + ip_release(port); + return KERN_SUCCESS; } @@ -1607,7 +1429,7 @@ ipc_port_bind_special_reply_port_locked( } assert(*reply_portp == NULL); - assert(port->ip_specialreply); + assert(ip_is_special_reply_port(port)); assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY); ip_reference(port); @@ -1654,9 +1476,9 @@ ipc_port_unbind_special_reply_port( if (reply_type == IRPT_USER) { ip_release(special_reply_port); } else { - ipc_port_dealloc_reply(special_reply_port); + ip_mq_lock(special_reply_port); + ipc_port_destroy(special_reply_port); } - return; } /* @@ -1765,27 +1587,6 @@ thread_get_special_port( which, portp, THREAD_FLAVOR_CONTROL); } -static ipc_port_t -thread_get_non_substituted_self(thread_t thread, thread_ro_t tro) -{ - ipc_port_t port = IP_NULL; - - thread_mtx_lock(thread); - port = tro->tro_ports[THREAD_FLAVOR_CONTROL]; -#if CONFIG_CSR - if (tro->tro_settable_self_port != port) { - port = ipc_port_make_send_mqueue(tro->tro_settable_self_port); - } else -#endif /* CONFIG_CSR */ - { - port = ipc_kobject_make_send(port, thread, IKOT_THREAD_CONTROL); - } - thread_mtx_unlock(thread); - - /* takes ownership of the send right */ - return ipc_kobject_alloc_subst_once(port); -} - kern_return_t thread_get_special_port_from_user( mach_port_t port, @@ -1804,7 +1605,7 @@ thread_get_special_port_from_user( } tro = get_thread_ro(thread); - kotype = ip_kotype(port); + kotype = ip_type(port); if (which == THREAD_KERNEL_PORT && tro->tro_task == current_task()) { #if CONFIG_MACF @@ -1817,10 +1618,12 @@ thread_get_special_port_from_user( goto out; } #endif - if (kotype == IKOT_THREAD_CONTROL) { - *portp = thread_get_non_substituted_self(thread, tro); - goto out; - } + /* + * if `mac_task_check_get_movable_control_port` returned 0, + * then we must also have a movable task. + * see `task_set_exc_guard_default` + */ + assert(!task_is_immovable(current_task())); } switch (kotype) { @@ -1911,7 +1714,7 @@ thread_set_special_port( * rdar://70585367 * disallow immovable send so other process can't retrieve it through thread_get_special_port() */ - if (IP_VALID(port) && port->ip_immovable_send) { + if (!ipc_can_stash_naked_send(port)) { return KERN_INVALID_RIGHT; } @@ -1954,6 +1757,24 @@ thread_set_special_port( } } +static inline mach_task_flavor_t +task_special_type_to_flavor(task_special_port_t which) +{ + switch (which) { + case TASK_KERNEL_PORT: + return TASK_FLAVOR_CONTROL; + case TASK_NAME_PORT: + return TASK_FLAVOR_NAME; + case TASK_INSPECT_PORT: + return TASK_FLAVOR_INSPECT; + case TASK_READ_PORT: + return TASK_FLAVOR_READ; + default: + break; + } + panic("invalid special port: %d", which); +} + /* * Routine: task_get_special_port [kernel call] * Purpose: @@ -2009,8 +1830,7 @@ task_get_special_port_internal( case TASK_READ_PORT: case TASK_INSPECT_PORT: itk_unlock(task); - mach_task_flavor_t current_flavor = (which == TASK_READ_PORT) ? 
- TASK_FLAVOR_READ : TASK_FLAVOR_INSPECT; + mach_task_flavor_t current_flavor = task_special_type_to_flavor(which); /* convert_task_to_port_with_flavor consumes a task reference */ task_reference(task); port = convert_task_to_port_with_flavor(task, current_flavor, TASK_GRP_KERNEL); @@ -2068,27 +1888,6 @@ task_get_special_port( return task_get_special_port_internal(task, which, portp, TASK_FLAVOR_CONTROL); } -static ipc_port_t -task_get_non_substituted_self(task_t task) -{ - ipc_port_t port = IP_NULL; - - itk_lock(task); - port = task->itk_task_ports[TASK_FLAVOR_CONTROL]; -#if CONFIG_CSR - if (task->itk_settable_self != port) { - port = ipc_port_make_send_mqueue(task->itk_settable_self); - } else -#endif /* CONFIG_CSR */ - { - port = ipc_kobject_make_send(port, task, IKOT_TASK_CONTROL); - } - itk_unlock(task); - - /* takes ownership of the send right */ - return ipc_kobject_alloc_subst_once(port); -} - /* MIG call only. Kernel/Kext uses task_get_special_port() */ kern_return_t task_get_special_port_from_user( @@ -2106,7 +1905,7 @@ task_get_special_port_from_user( return KERN_INVALID_ARGUMENT; } - kotype = ip_kotype(port); + kotype = ip_type(port); #if CONFIG_MACF if (mac_task_check_get_task_special_port(current_task(), task, which)) { @@ -2126,10 +1925,12 @@ task_get_special_port_from_user( goto out; } #endif - if (kotype == IKOT_TASK_CONTROL) { - *portp = task_get_non_substituted_self(task); - goto out; - } + /* + * if `mac_task_check_get_movable_control_port` returned 0, + * then we must also have a movable task. + * see `task_set_exc_guard_default` + */ + assert(!task_is_immovable(current_task())); } switch (kotype) { @@ -2242,10 +2043,11 @@ task_set_special_port( * rdar://70585367 * disallow immovable send so other process can't retrieve it through task_get_special_port() */ - if (IP_VALID(port) && port->ip_immovable_send) { + if (!ipc_can_stash_naked_send(port)) { return KERN_INVALID_RIGHT; } + switch (which) { case TASK_KERNEL_PORT: case TASK_HOST_PORT: @@ -2403,7 +2205,7 @@ _kernelrpc_mach_ports_register3( * rdar://70585367 * disallow immovable send so other process can't retrieve it through mach_ports_lookup() */ - if (IP_VALID(ports[i]) && ports[i]->ip_immovable_send) { + if (!ipc_can_stash_naked_send(ports[i])) { return KERN_INVALID_RIGHT; } } @@ -2644,7 +2446,7 @@ convert_port_to_task_with_flavor_locked_noref( mach_task_flavor_t flavor, port_intrans_options_t options) { - ipc_kobject_type_t type = ip_kotype(port); + ipc_kobject_type_t type = ip_type(port); task_t task; ip_mq_lock_held(port); @@ -2734,7 +2536,7 @@ convert_port_to_task_with_flavor( task_t self = current_task(); if (IP_VALID(port)) { - if (port == self->itk_self) { + if (port == self->itk_task_ports[TASK_FLAVOR_CONTROL]) { task_reference_grp(self, grp); return self; } @@ -3151,7 +2953,7 @@ convert_port_to_thread_with_flavor_locked( { thread_t thread = THREAD_NULL; task_t task; - ipc_kobject_type_t type = ip_kotype(port); + ipc_kobject_type_t type = ip_type(port); ip_mq_lock_held(port); require_ip_active(port); @@ -3284,6 +3086,26 @@ thread_flavor_to_kotype(mach_thread_flavor_t flavor) } } + +ipc_port_t +convert_thread_to_port_immovable( + thread_t thread) +{ + thread_ro_t tro = get_thread_ro(thread); + ipc_port_t port = IP_NULL; + + thread_mtx_lock(thread); + + if (thread->ipc_active) { + port = ipc_kobject_make_send(tro->tro_ports[THREAD_FLAVOR_CONTROL], + thread, IKOT_THREAD_CONTROL); + } + + thread_mtx_unlock(thread); + thread_deallocate(thread); + return port; +} + /* * Routine: 
convert_thread_to_port_with_flavor * Purpose: @@ -3323,8 +3145,22 @@ convert_thread_to_port_with_flavor( if (flavor == THREAD_FLAVOR_CONTROL) { port = ipc_kobject_make_send(port, thread, IKOT_THREAD_CONTROL); } else if (IP_VALID(port)) { - (void)ipc_kobject_make_send_nsrequest(port, thread, kotype); + (void)ipc_kobject_make_send(port, thread, kotype); } else { + ipc_object_label_t label = IPC_OBJECT_LABEL(kotype); + + /* + * If Developer Mode is off, substitute read port for control + * port if copying out to owning task's space, for the sake of + * in-process exception handler. + * + * Also see: exception_deliver(). + */ + if (!developer_mode_state() && flavor == THREAD_FLAVOR_READ) { + label = ipc_kobject_label_alloc(kotype, + IPC_LABEL_SUBST_THREAD_READ, tro->tro_ports[THREAD_FLAVOR_CONTROL]); + } + /* * Claim a send right on the thread read/inspect port, and request a no-senders * notification on that port (if none outstanding). A thread reference is not @@ -3334,20 +3170,9 @@ convert_thread_to_port_with_flavor( * send-once notification firing, and this is done under the thread mutex * rather than with atomics. */ - port = ipc_kobject_alloc_port(thread, kotype, - IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST | - IPC_KOBJECT_ALLOC_IMMOVABLE_SEND); - /* - * If Developer Mode is off, substitute read port for control - * port if copying out to owning task's space, for the sake of - * in-process exception handler. - * - * Also see: exception_deliver(). - */ - if (!developer_mode_state() && flavor == THREAD_FLAVOR_READ) { - ipc_port_set_label(port, IPC_LABEL_SUBST_THREAD_READ); - port->ip_kolabel->ikol_alt_port = tro->tro_self_port; - } + port = ipc_kobject_alloc_port(thread, label, + IPC_KOBJECT_ALLOC_MAKE_SEND); + zalloc_ro_update_field(ZONE_ID_THREAD_RO, tro, tro_ports[flavor], &port); } @@ -3387,7 +3212,6 @@ convert_thread_array_to_ports( mach_thread_flavor_t flavor) { thread_t *thread_list = (thread_t *)array; - task_t task_self = current_task(); for (size_t i = 0; i < count; i++) { thread_t thread = thread_list[i]; @@ -3395,11 +3219,7 @@ convert_thread_array_to_ports( switch (flavor) { case THREAD_FLAVOR_CONTROL: - if (get_threadtask(thread) == task_self) { - port = convert_thread_to_port_pinned(thread); - } else { - port = convert_thread_to_port(thread); - } + port = convert_thread_to_port(thread); break; case THREAD_FLAVOR_READ: port = convert_thread_read_to_port(thread); @@ -3447,15 +3267,15 @@ port_name_to_thread( } /* - * Routine: port_name_is_pinned_itk_self + * Routine: port_name_is_pinned_self * Purpose: * Returns whether this port name is for the pinned * mach_task_self (if it exists). * - * task_self_trap() when the task port is pinned, - * will memorize the name the port has in the space - * in ip_receiver_name, which we can use to fast-track - * this answer without taking any lock. + * task_self_trap() will memorize the name the port has + * in the space in ip_receiver_name when it gets pinned, + * which we can use to fast-track this answer without + * taking any lock. * * ipc_task_disable() will set `ip_receiver_name` back to * MACH_PORT_SPECIAL_DEFAULT. @@ -3465,13 +3285,13 @@ port_name_to_thread( * Nothing locked. 
*/ static bool -port_name_is_pinned_itk_self( +port_name_is_pinned_self( task_t self, mach_port_name_t name) { - ipc_port_t kport = self->itk_self; + ipc_port_t kport = self->itk_task_ports[TASK_FLAVOR_CONTROL]; return MACH_PORT_VALID(name) && name != MACH_PORT_SPECIAL_DEFAULT && - kport->ip_pinned && ip_get_receiver_name(kport) == name; + ip_get_receiver_name(kport) == name; } /* @@ -3489,7 +3309,7 @@ port_name_is_pinned_itk_self( * has to handle anyway. * * ipc_space_disable() does try to narrow this race, - * by causing port_name_is_pinned_itk_self() to fail. + * by causing port_name_is_pinned_self() to fail. * * Returns: * current_task() if the port name was for current_task() @@ -3510,14 +3330,14 @@ port_name_to_current_task_internal_noref( task_t task = TASK_NULL; task_t self = current_task(); - if (port_name_is_pinned_itk_self(self, name)) { + if (port_name_is_pinned_self(self, name)) { return self; } if (MACH_PORT_VALID(name)) { kr = ipc_port_translate_send(self->itk_space, name, &kport); if (kr == KERN_SUCCESS) { - ipc_kobject_type_t type = ip_kotype(kport); + ipc_kobject_type_t type = ip_type(kport); if (task_port_kotype_valid_for_flavor(type, flavor)) { task = ipc_kobject_get_locked(kport, type); } @@ -3567,7 +3387,7 @@ port_name_to_task_grp( task_t task = TASK_NULL; task_t self = current_task(); - if (!kportp && port_name_is_pinned_itk_self(self, name)) { + if (!kportp && port_name_is_pinned_self(self, name)) { task_reference_grp(self, grp); return self; } @@ -3788,13 +3608,10 @@ convert_task_to_port_with_flavor( case TASK_FLAVOR_INSPECT: port = task->itk_task_ports[flavor]; if (IP_VALID(port)) { - (void)ipc_kobject_make_send_nsrequest(port, - task, kotype); + (void)ipc_kobject_make_send(port, task, kotype); } else { - port = ipc_kobject_alloc_port(task, kotype, - IPC_KOBJECT_ALLOC_MAKE_SEND | - IPC_KOBJECT_ALLOC_NSREQUEST | - IPC_KOBJECT_ALLOC_IMMOVABLE_SEND); + ipc_object_label_t label = IPC_OBJECT_LABEL(kotype); + /* * If Developer Mode is off, substitute read port for control port if * copying out to owning task's space, for the sake of in-process @@ -3803,10 +3620,12 @@ convert_task_to_port_with_flavor( * Also see: exception_deliver(). 
*/ if (!developer_mode_state() && flavor == TASK_FLAVOR_READ) { - ipc_port_set_label(port, IPC_LABEL_SUBST_TASK_READ); - port->ip_kolabel->ikol_alt_port = task->itk_self; + label = ipc_kobject_label_alloc(kotype, + IPC_LABEL_SUBST_TASK_READ, task->itk_task_ports[TASK_FLAVOR_CONTROL]); } + port = ipc_kobject_alloc_port(task, label, + IPC_KOBJECT_ALLOC_MAKE_SEND); task->itk_task_ports[flavor] = port; } break; @@ -3823,13 +3642,11 @@ convert_corpse_to_port_and_nsrequest( task_t corpse) { ipc_port_t port = IP_NULL; - __assert_only kern_return_t kr; assert(task_is_a_corpse(corpse)); itk_lock(corpse); port = corpse->itk_task_ports[TASK_FLAVOR_CONTROL]; - kr = ipc_kobject_make_send_nsrequest(port, corpse, IKOT_TASK_CONTROL); - assert(kr == KERN_SUCCESS || kr == KERN_ALREADY_WAITING); + port = ipc_kobject_make_send(port, corpse, IKOT_TASK_CONTROL); itk_unlock(corpse); task_deallocate(corpse); @@ -3876,31 +3693,6 @@ convert_task_read_to_port_external(task_t task) return convert_task_to_port_with_flavor(task, TASK_FLAVOR_READ, TASK_GRP_EXTERNAL); } -ipc_port_t -convert_task_to_port_pinned( - task_t task) -{ - ipc_port_t port = IP_NULL; - - assert(task == current_task()); - - itk_lock(task); - - if (task->ipc_active) { - port = ipc_kobject_make_send(task->itk_self, task, - IKOT_TASK_CONTROL); - } - - if (port && task_is_immovable(task)) { - assert(ip_is_pinned(port)); - assert(ip_is_immovable_send(port)); - } - - itk_unlock(task); - task_deallocate(task); - return port; -} - void convert_task_array_to_ports( task_array_t array, @@ -3908,7 +3700,6 @@ convert_task_array_to_ports( mach_task_flavor_t flavor) { task_t *task_list = (task_t *)array; - task_t task_self = current_task(); for (size_t i = 0; i < count; i++) { task_t task = task_list[i]; @@ -3916,12 +3707,8 @@ convert_task_array_to_ports( switch (flavor) { case TASK_FLAVOR_CONTROL: - if (task == task_self) { - /* if current_task(), return pinned port */ - port = convert_task_to_port_pinned(task); - } else { - port = convert_task_to_port(task); - } + /* copyout determines immovability, see `should_mark_immovable_send` */ + port = convert_task_to_port(task); break; case TASK_FLAVOR_READ: port = convert_task_read_to_port(task); @@ -3996,28 +3783,6 @@ convert_task_suspension_token_to_port_mig( return convert_task_suspension_token_to_port_grp(task, TASK_GRP_MIG); } -ipc_port_t -convert_thread_to_port_pinned( - thread_t thread) -{ - thread_ro_t tro = get_thread_ro(thread); - ipc_port_t port = IP_NULL; - - thread_mtx_lock(thread); - - if (thread->ipc_active) { - port = ipc_kobject_make_send(tro->tro_self_port, - thread, IKOT_THREAD_CONTROL); - } - - if (port && task_is_immovable(tro->tro_task)) { - assert(ip_is_immovable_send(port)); - } - - thread_mtx_unlock(thread); - thread_deallocate(thread); - return port; -} /* * Routine: space_deallocate * Purpose: @@ -4093,11 +3858,11 @@ send_set_exception_telemetry(const task_t excepting_task, const exception_mask_t /* Returns whether the violation should be ignored */ static boolean_t -set_exception_behavior_violation(const task_t excepting_task, const exception_mask_t mask) +set_exception_behavior_violation(const task_t excepting_task, const exception_mask_t mask, int new_behavior) { if (thid_should_crash) { /* create lightweight corpse */ - mach_port_guard_exception(0, 0, kGUARD_EXC_EXCEPTION_BEHAVIOR_ENFORCE); + mach_port_guard_exception(new_behavior, mask, kGUARD_EXC_EXCEPTION_BEHAVIOR_ENFORCE); } /* always report the proc name to CA */ @@ -4107,29 +3872,17 @@ set_exception_behavior_violation(const task_t 
excepting_task, const exception_ma return !thid_should_crash; } -/* - * Protect platform binary task/thread ports. - * excepting_task is NULL if we are setting a host exception port. - */ -static boolean_t -exception_exposes_protected_ports(const ipc_port_t new_port, const task_t excepting_task) +static bool +exception_exposes_protected_ports(const ipc_port_t new_port) { - if (!IP_VALID(new_port) || is_ux_handler_port(new_port)) { - /* - * sending exceptions to invalid port does not pose risk - * ux_handler port is an immovable, read-only kobject port; doesn't need protection. - */ - return FALSE; - } else if (excepting_task) { - /* setting task/thread exception port - protect hardened binaries */ - return task_is_hardened_binary(excepting_task); - } - - /* setting host port exposes all processes - always protect. */ - return TRUE; + /* + * sending exceptions to invalid port does not pose risk + * ux_handler port is an immovable, read-only kobject port; doesn't need protection. + */ + return IP_VALID(new_port) && !is_ux_handler_port(new_port); } -static boolean_t +static bool exception_ports_frozen(task_t excepting_task) { return excepting_task && @@ -4144,24 +3897,29 @@ SIP_is_enabled() } #endif /* XNU_TARGET_OS_OSX && CONFIG_CSR*/ -static boolean_t +static bool exception_is_identity_protected(const ipc_port_t new_port, int new_behavior, const task_t excepting_task, const exception_mask_t mask) { - if (exception_exposes_protected_ports(new_port, excepting_task) + ipc_space_policy_t policy = {}; + + /* excepting_task is NULL if we are setting a host exception port. */ + if (excepting_task) { + policy = ipc_policy_for_task(excepting_task); + } + + if (exception_exposes_protected_ports(new_port) + && (!excepting_task || ipc_should_apply_policy(policy, IPC_POLICY_ENHANCED_V1)) && !behavior_is_identity_protected(new_behavior) -#if XNU_TARGET_OS_OSX - && !task_opted_out_mach_hardening(excepting_task) /* Some tasks are opted out more generally */ #if CONFIG_CSR && SIP_is_enabled() /* cannot enforce if SIP is disabled */ #endif /* CONFIG_CSR */ -#endif /* XNU_TARGET_OS_OSX */ #if CONFIG_ROSETTA && !task_is_translated(current_task()) #endif /* CONFIG_ROSETTA */ && !proc_is_simulated(current_proc()) ) { - return set_exception_behavior_violation(excepting_task, mask); + return set_exception_behavior_violation(excepting_task, mask, new_behavior); } return true; @@ -4194,14 +3952,15 @@ set_exception_behavior_allowed(const ipc_port_t new_port, int new_behavior, * Only allow hardened set_exception_port calls on hardened tasks * that opt in via entitlement */ + ipc_space_policy_t pol = ipc_policy_for_task(excepting_task); bool only_one_exception_port = IOTaskHasEntitlement(excepting_task, IPC_ONLY_ONE_EXCEPTION_PORT) - && task_is_hardened_binary(excepting_task); + && ipc_should_apply_policy(pol, IPC_SPACE_POLICY_ENHANCED_V1); if (!hardened_exception && only_one_exception_port) { kprintf("Disallowing set_exception_ports from [%s] on [%s] due " "to only_one_exception_port policy\n", cur_task_name, excepting_task_name); - return set_exception_behavior_violation(excepting_task, mask); + return set_exception_behavior_violation(excepting_task, mask, new_behavior); } } @@ -4210,16 +3969,18 @@ set_exception_behavior_allowed(const ipc_port_t new_port, int new_behavior, } /* - * Routine: set_exception_ports_validation + * Routine: set_exception_ports_validation * Purpose: - * Common argument validation shared between all exception port setting/swapping routines + * Common argument validation shared between all 
exception port + * setting/swapping routines * Conditions: * Nothing locked. - * Returns: - * KERN_SUCCESS Setting the exception port is allowed with these arguments - * KERN_INVALID_ARGUMENT Invalid arguments - * KERN_INVALID_RIGHT Incorrect port configuration - * KERN_DENIED Denied by security policy + * Returns: + * KERN_SUCCESS Setting the exception port is allowed + * with these arguments + * KERN_INVALID_ARGUMENT Invalid arguments + * KERN_INVALID_RIGHT Incorrect port configuration + * KERN_DENIED Denied by security policy */ kern_return_t set_exception_ports_validation( @@ -4228,8 +3989,7 @@ set_exception_ports_validation( ipc_port_t new_port, exception_behavior_t new_behavior, thread_state_flavor_t new_flavor, - bool hardened_exception - ) + bool hardened_exception) { if (exception_mask & ~EXC_MASK_VALID) { return KERN_INVALID_ARGUMENT; @@ -4249,14 +4009,7 @@ set_exception_ports_validation( } } - /* - * rdar://77996387 - * Avoid exposing immovable ports send rights (kobjects) to `get_exception_ports`, - * but exception ports to still be set. - */ - if (IP_VALID(new_port) && - ((!ip_is_exception_port(new_port) && new_port->ip_immovable_receive) || - new_port->ip_immovable_send)) { + if (IP_VALID(new_port) && !ipc_is_valid_exception_port(task, new_port)) { return KERN_INVALID_RIGHT; } @@ -4834,7 +4587,7 @@ thread_get_exception_ports_internal( if (!IP_VALID(exc_port)) { ports_info[j] = (ipc_info_port_t){ .iip_port_object = 0, .iip_receiver_object = 0 }; } else { - uintptr_t receiver; + task_t receiver = TASK_NULL; (void)ipc_port_get_receiver_task(exc_port, &receiver); ports_info[j].iip_port_object = (natural_t)VM_KERNEL_ADDRHASH(exc_port); ports_info[j].iip_receiver_object = receiver ? (natural_t)VM_KERNEL_ADDRHASH(receiver) : 0; @@ -4990,7 +4743,7 @@ task_get_exception_ports_internal( if (!IP_VALID(exc_port)) { ports_info[j] = (ipc_info_port_t){ .iip_port_object = 0, .iip_receiver_object = 0 }; } else { - uintptr_t receiver; + task_t receiver = TASK_NULL; (void)ipc_port_get_receiver_task(exc_port, &receiver); ports_info[j].iip_port_object = (natural_t)VM_KERNEL_ADDRHASH(exc_port); ports_info[j].iip_receiver_object = receiver ? (natural_t)VM_KERNEL_ADDRHASH(receiver) : 0; @@ -5088,36 +4841,32 @@ void ipc_thread_port_unpin( ipc_port_t port) { - if (port == IP_NULL) { - return; - } - ip_mq_lock(port); - port->ip_pinned = 0; - ip_mq_unlock(port); + ipc_object_unpin(current_space(), port); } /* * Routine: task_register_hardened_exception_handler * Purpose: - * Register a port as a hardened exception handler. + * Register a port as a hardened exception handler. * See task.defs for additional info * Conditions: * Nothing locked. 
* Limit of one hardened exception handler per task * Returns: - * KERN_INVALID_ARGUMENT invalid thread - * KERN_DENIED breaking the security policy - * KERN_NAME_EXISTS Already set a hardened exception handler on this task - * KERN_SUCCESS + * KERN_INVALID_ARGUMENT invalid thread + * KERN_DENIED violating the security policy + * KERN_NAME_EXISTS Already set a hardened exception handler + * on this task + * KERN_SUCCESS */ kern_return_t task_register_hardened_exception_handler( - task_t task, - uint32_t signed_pc_key, - exception_mask_t exceptions_allowed, - exception_behavior_t behaviors_allowed, - thread_state_flavor_t flavors_allowed, - mach_port_t new_port) + task_t task, + uint32_t signed_pc_key, + exception_mask_t exceptions_allowed, + exception_behavior_t behaviors_allowed, + thread_state_flavor_t flavors_allowed, + mach_port_t new_port) { ipc_port_t old_port; @@ -5169,24 +4918,25 @@ task_register_hardened_exception_handler( /* * Routine: thread_adopt_exception_handler * Purpose: - * Adopt the hardened exception handler from the current task, for this thread. - * Allows you to set exception ports on a thread after exception ports + * Adopt the hardened exception handler from the current task, + * for this thread. + * + * Allows to set exception ports on a thread after exception ports * have been frozen for the task. * Conditions: * Nothing locked * Returns: - * KERN_INVALID_ARGUMENT invalid thread - * KERN_DENIED breaking the security policy + * KERN_INVALID_ARGUMENT invalid thread + * KERN_DENIED violating the security policy * KERN_SUCCESS */ kern_return_t thread_adopt_exception_handler( - thread_t thread, - mach_port_t exc_port, - exception_mask_t exc_mask, - exception_behavior_t behavior_mask, - thread_state_flavor_t flavor_mask - ) + thread_t thread, + mach_port_t exc_port, + exception_mask_t exc_mask, + exception_behavior_t behavior_mask, + thread_state_flavor_t flavor_mask) { if (thread == THREAD_NULL) { return KERN_INVALID_ARGUMENT; @@ -5220,9 +4970,13 @@ thread_adopt_exception_handler( return KERN_DENIED; } - assert(!IP_VALID(exc_port) || exc_port->ip_immovable_receive); assert(!IP_VALID(exc_port) || ip_is_exception_port(exc_port)); - /* We can safely assume this will be valid because we called set_exception_ports_validation on it when it was originally set on the task */ - return thread_set_exception_ports_internal(thread, exc_mask, exc_port, behavior_mask, flavor_mask, true); + /* + * We can safely assume this will be valid because we called + * set_exception_ports_validation on it when it was originally + * set on the task + */ + return thread_set_exception_ports_internal(thread, exc_mask, exc_port, + behavior_mask, flavor_mask, true); } diff --git a/osfmk/kern/ipc_tt.h b/osfmk/kern/ipc_tt.h index 3a107cd7d..c1c36df76 100644 --- a/osfmk/kern/ipc_tt.h +++ b/osfmk/kern/ipc_tt.h @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include #include #include @@ -93,18 +93,14 @@ extern void ipc_task_reset( extern void ipc_task_terminate( task_t task); -/* Setup task control port according to it's control port options */ -extern void ipc_task_set_immovable_pinned( +/* Setup task control port according to its control port options */ +extern void ipc_task_copyout_control_port( task_t task); -/* Setup thread control port according to it's owning task's port options */ -extern void ipc_main_thread_set_immovable_pinned( - thread_t thread); - -__options_decl(ipc_thread_init_options_t, uint32_t, { - IPC_THREAD_INIT_NONE = 0x00, - IPC_THREAD_INIT_MAINTHREAD = 0x01, -}); +/* 
Setup thread control port according to its control port options */ +extern void +ipc_thread_set_immovable_pinned( + thread_t thread); __options_decl(port_intrans_options_t, uint32_t, { PORT_INTRANS_OPTIONS_NONE = 0x0000, @@ -119,8 +115,7 @@ __options_decl(port_intrans_options_t, uint32_t, { extern void ipc_thread_init( task_t task, thread_t thread, - thread_ro_t tro, - ipc_thread_init_options_t options); + thread_ro_t tro); /* Disable IPC access to a thread */ extern void ipc_thread_disable( @@ -183,10 +178,6 @@ extern task_read_t convert_port_to_task_read_mig( extern task_t convert_port_to_task( ipc_port_t port); -/* Convert from a port to a pinned task */ -extern task_t convert_port_to_task_pinned( - ipc_port_t port); - /* Convert from a port to a task */ extern task_t convert_port_to_task_mig( ipc_port_t port); @@ -273,15 +264,15 @@ extern thread_read_t convert_port_to_thread_read( ipc_port_t port); extern thread_t port_name_to_thread( - mach_port_name_t port_name, - port_intrans_options_t options); + mach_port_name_t port_name, + port_intrans_options_t options); /* Deallocate a space ref produced by convert_port_to_space */ extern void space_deallocate( ipc_space_t space); extern void space_read_deallocate( - ipc_space_read_t space); + ipc_space_read_t space); extern void space_inspect_deallocate( ipc_space_inspect_t space); @@ -290,50 +281,40 @@ extern kern_return_t thread_get_kernel_special_reply_port(void); extern void thread_dealloc_kernel_special_reply_port(thread_t thread); -extern kern_return_t -set_exception_ports_validation( +extern kern_return_t set_exception_ports_validation( task_t task, exception_mask_t exception_mask, ipc_port_t new_port, exception_behavior_t new_behavior, thread_state_flavor_t new_flavor, - bool hardened_exception - ); + bool hardened_exception); -extern kern_return_t -thread_set_exception_ports_internal( - thread_t thread, +extern kern_return_t thread_set_exception_ports_internal( + thread_t thread, exception_mask_t exception_mask, ipc_port_t new_port, exception_behavior_t new_behavior, thread_state_flavor_t new_flavor, - boolean_t hardened); + boolean_t hardened); #if MACH_KERNEL_PRIVATE extern void ipc_thread_port_unpin( - ipc_port_t port); + ipc_port_t port); -extern ipc_port_t -convert_task_suspension_token_to_port_external( - task_suspension_token_t task); +extern ipc_port_t convert_task_suspension_token_to_port_external( + task_suspension_token_t task); -extern ipc_port_t -convert_task_suspension_token_to_port_mig( - task_suspension_token_t task); +extern ipc_port_t convert_task_suspension_token_to_port_mig( + task_suspension_token_t task); -extern task_suspension_token_t -convert_port_to_task_suspension_token_external( - ipc_port_t port); +extern task_suspension_token_t convert_port_to_task_suspension_token_external( + ipc_port_t port); -extern task_suspension_token_t -convert_port_to_task_suspension_token_mig( - ipc_port_t port); - -extern task_suspension_token_t -convert_port_to_task_suspension_token_kernel( - ipc_port_t port); +extern task_suspension_token_t convert_port_to_task_suspension_token_mig( + ipc_port_t port); +extern task_suspension_token_t convert_port_to_task_suspension_token_kernel( + ipc_port_t port); #endif - #endif /* _KERN_IPC_TT_H_ */ diff --git a/osfmk/kern/kalloc.c b/osfmk/kern/kalloc.c index 6e17b3448..5c0d35a76 100644 --- a/osfmk/kern/kalloc.c +++ b/osfmk/kern/kalloc.c @@ -387,7 +387,6 @@ kalloc_zone_init( zone_id_t *kheap_zstart, zone_create_flags_t zc_flags) { - zc_flags |= ZC_PGZ_USE_GUARDS; if (kheap_id == 
KHEAP_ID_DATA_BUFFERS) { zc_flags |= ZC_DATA; } @@ -472,6 +471,80 @@ TUNABLE(uint16_t, kt_fixed_zones, "kt_fixed_zones", TUNABLE(uint16_t, kt_var_ptr_heaps, "kt_var_ptr_heaps", 2); static TUNABLE(bool, kt_shared_fixed, "-kt-shared", true); + +/** + * @const kexts_enroll_data_shared + * + * @brief + * We have two heaps for data allocations: + * - KHEAP_DATA_BUFFERS, which is for allocations that never shared. + * - KHEAP_DATA_SHARED, which is for allocations that need to be shared. + * + * This is a control that indicates which heap we expose to kexts via the + * exported allocations functions. + */ +STATIC_IF_KEY_DEFINE_TRUE(kexts_enroll_data_shared); + +/** + * @const restricted_data_mode + * + * @brief + * This is a control that sets the mode of mapping policies + * enforcement on data allocations: + * - none: the state before the change (no telemetry, no enforcement). + * - telemetry: do not enforce, do emit telemetry + * - enforce: type the KHEAP_DATA_BUFFERS pages as restricted mappings. + * + * Combined with kexts_enroll_data_shared, we can create the modes we need + * for none/telemetry/enforcement on core kernel/kexts. + * + * restricted_data_mode_t is an enum used to specify the mode being used. + */ + +__options_decl(restricted_data_mode_t, uint8_t, { + RESTRICTED_DATA_MODE_NONE = 0x0000, + RESTRICTED_DATA_MODE_TELEMETRY = 0x0001, + RESTRICTED_DATA_MODE_ENFORCE = 0x0002 +}); + +TUNABLE(restricted_data_mode_t, + restricted_data_mode, + "restricted_data_mode", +#if __x86_64__ + RESTRICTED_DATA_MODE_NONE +#else + RESTRICTED_DATA_MODE_TELEMETRY +#endif /* __x86_64__ */ + ); + +inline bool +kalloc_is_restricted_data_mode_telemetry(void) +{ + return restricted_data_mode == RESTRICTED_DATA_MODE_TELEMETRY; +} + +inline bool +kalloc_is_restricted_data_mode_enforced(void) +{ + return restricted_data_mode == RESTRICTED_DATA_MODE_ENFORCE; +} + +inline bool +kmem_needs_data_share_range(void) +{ + /* + * The dedicated range is required only for + * telemetry reporting, when we need to distinguish + * between the two kind of data via kmem ranges. + * + * Even though this is strictly like checking telemetry + * mode, it's better to have well-defined abstraction layer + * for that adopted in all the call-sites, to be flexible + * w.r.t future changes / unrolling. + */ + return kalloc_is_restricted_data_mode_telemetry(); +} + /* * Section start/end for fixed kalloc_type views */ @@ -720,7 +793,8 @@ kalloc_type_get_flags_var(vm_offset_t addr, uuid_string_t kext_uuid) } /* - * Check if signature of type is made up of only data and padding + * Check if signature of type is made up of only data and padding, + * which is meant to never be shared. 
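+ * For example (illustrative only), a type such as
+ *     struct pkt_bytes { uint32_t len; uint8_t data[60]; };
+ * has a data-only signature, whereas any type carrying a pointer
+ * field does not and stays in the typed kalloc_type zones.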
*/ static bool kalloc_type_is_data(kalloc_type_flags_t kt_flags) @@ -801,6 +875,25 @@ kalloc_type_handle_data_view_var(vm_offset_t addr) kalloc_type_assign_zone_var(&ktv, &ktv + 1, KT_VAR_DATA_HEAP); } +__startup_func +static void +kalloc_type_handle_data_shared_view_fixed(vm_offset_t addr) +{ + kalloc_type_view_t cur_data_view = (kalloc_type_view_t) addr; + zone_t z = kalloc_zone_for_size(KHEAP_DATA_SHARED->kh_zstart, + cur_data_view->kt_size); + kalloc_type_assign_zone_fixed(&cur_data_view, &cur_data_view + 1, z, NULL, + NULL); +} + +__startup_func +static void +kalloc_type_handle_data_shared_view_var(vm_offset_t addr) +{ + kalloc_type_var_view_t ktv = (kalloc_type_var_view_t) addr; + kalloc_type_assign_zone_var(&ktv, &ktv + 1, KT_VAR_DATA_SHARED_HEAP); +} + __startup_func static uint32_t kalloc_type_handle_parray_var(void) @@ -948,10 +1041,10 @@ kalloc_type_view_copy( } /* - * If signature indicates that the entire allocation is data move it to - * KHEAP_DATA_BUFFERS. Note that KT_VAR_DATA_HEAP is a fake "data" heap, - * variable kalloc_type handles the actual redirection in the entry points - * kalloc/kfree_type_var_impl. + * Check if the signature indicates that the entire allocation is data. + * + * Note that KT_VAR_DATA_HEAP is fake "data" heap, variable kalloc_type handles + * the actual redirection in the entry points kalloc/kfree_type_var_impl. */ if (kalloc_type_is_data(kt_flags)) { kalloc_type_func(type, handle_data_view, cur); @@ -1030,6 +1123,7 @@ kalloc_type_view_parse(const kalloc_type_variant_t type) kalloc_type_var(type, sec_start), kalloc_type_var(type, sec_end), &cur_count, false, NULL); +#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* no kexts in unit-test */ /* * Parse __kalloc_type section for kexts * @@ -1122,6 +1216,7 @@ kalloc_type_view_parse(const kalloc_type_variant_t type) cur += ((kext_text_sz + (KEXT_ALIGN_BYTES - 1)) & (~KEXT_ALIGN_MASK)); } +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ } else { /* * When kc_format is KCFormatDynamic or KCFormatUnknown, we don't handle @@ -1636,6 +1731,13 @@ kalloc_type_init_sig_eq( } } +#ifndef __BUILDING_XNU_LIB_UNITTEST__ +#define KT_ZONES_FOR_SIZE_SIZE 32 +#else /* __BUILDING_XNU_LIB_UNITTEST__ */ +/* different init sequence in unit-test requires a bigger buffer in the kalloc zones initialization */ +#define KT_ZONES_FOR_SIZE_SIZE 35 +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ + __startup_func static uint16_t kalloc_type_distribute_zone_for_type( @@ -1645,7 +1747,7 @@ kalloc_type_distribute_zone_for_type( uint16_t zones_total_type, uint16_t total_types, uint16_t *kt_skip_list, - zone_t kt_zones_for_size[32], + zone_t kt_zones_for_size[KT_ZONES_FOR_SIZE_SIZE], uint16_t type_zones_start, zone_t sig_zone, zone_t early_zone) @@ -1742,8 +1844,8 @@ kalloc_type_create_zones_fixed( } zone_carry = 0; - assert(n_zones_sig + n_zones_type + 1 <= 32); - zone_t kt_zones_for_size[32] = {}; + assert(n_zones_sig + n_zones_type + 1 <= KT_ZONES_FOR_SIZE_SIZE); + zone_t kt_zones_for_size[KT_ZONES_FOR_SIZE_SIZE] = {}; kalloc_type_create_zone_for_size(kt_zones_for_size, n_zones_sig + n_zones_type, z_size); @@ -2229,7 +2331,12 @@ kalloc_type_get_heap(kalloc_type_flags_t kt_flags) * Redirect data-only views */ if (kalloc_type_is_data(kt_flags)) { - return KHEAP_DATA_BUFFERS; + /* + * There are kexts that allocate arrays of data types (uint8_t etc.) + * and use krealloc_data / kfree_data to free it; therefore, + * until adoption will land, we need to use shared heap for now. 
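+ *
+ * GET_KEXT_KHEAP_DATA() resolves to KHEAP_DATA_SHARED while the
+ * kexts_enroll_data_shared static key is enabled, and to
+ * KHEAP_DATA_BUFFERS otherwise.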
+ */ + return GET_KEXT_KHEAP_DATA(); } if (kt_flags & KT_PROCESSED) { @@ -2285,7 +2392,7 @@ kalloc_large( kma_flags |= KMA_KOBJECT; #endif } else { - assert(kheap == KHEAP_DATA_BUFFERS); + assert(kheap == KHEAP_DATA_BUFFERS || kheap == KHEAP_DATA_SHARED); } if (flags & Z_NOPAGEWAIT) { kma_flags |= KMA_NOPAGEWAIT; @@ -2300,6 +2407,9 @@ kalloc_large( } else if (flags & (Z_KALLOC_ARRAY | Z_SPRAYQTN)) { kma_flags |= KMA_SPRAYQTN; } + if (flags & Z_NOSOFTLIMIT) { + kma_flags |= KMA_NOSOFTLIMIT; + } tag = zalloc_flags_get_tag(flags); @@ -2410,7 +2520,7 @@ kalloc_use_early_heap( #undef kalloc_ext -struct kalloc_result +__mockable struct kalloc_result kalloc_ext( void *kheap_or_kt_view, vm_size_t size, @@ -2473,7 +2583,7 @@ void * kalloc_data_external(vm_size_t size, zalloc_flags_t flags) { flags = Z_VM_TAG_BT(flags & Z_KPI_MASK, VM_KERN_MEMORY_KALLOC_DATA); - return kheap_alloc(KHEAP_DATA_BUFFERS, size, flags); + return kheap_alloc(GET_KEXT_KHEAP_DATA(), size, flags); } void * @@ -2554,7 +2664,11 @@ kalloc_data_require(void *addr, vm_size_t size) return; } } else if (kmem_range_id_contains(KMEM_RANGE_ID_DATA, - (vm_address_t)pgz_decode(addr, size), size)) { + (vm_address_t)addr, size)) { + return; + } else if (kmem_needs_data_share_range() && + kmem_range_id_contains(KMEM_RANGE_ID_DATA_SHARED, + (vm_address_t)addr, size)) { return; } @@ -2584,14 +2698,49 @@ kalloc_non_data_require(void *addr, vm_size_t size) break; } } else if (!kmem_range_id_contains(KMEM_RANGE_ID_DATA, - (vm_address_t)pgz_decode(addr, size), size)) { + (vm_address_t)addr, size)) { + return; + } else if (kmem_needs_data_share_range() && + !kmem_range_id_contains(KMEM_RANGE_ID_DATA_SHARED, + (vm_address_t)addr, size)) { return; } kalloc_non_data_require_panic(addr, size); } -void * +bool +kalloc_is_data_buffers(void *addr, vm_size_t size) +{ + zone_id_t zid = zone_id_for_element(addr, size); + + /* + * If we do not use dedicated data share range, + * there is no way to fully distinguish between + * shared and buffers heaps. + * + * When kmem_needs_data_share_range() == true, the + * KMEM_RANGE_ID_DATA range is strictly for DATA_BUFFERS, + * and KMEM_RANGE_ID_DATA_SHARED is strictly for DATA_SHARED. 
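+ *
+ * Callers should therefore only consult this helper while the
+ * dedicated range exists; the assert below enforces that.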
+ */ + assert(kmem_needs_data_share_range()); + + if (zid != ZONE_ID_INVALID) { + zone_t z = &zone_array[zid]; + zone_security_flags_t zsflags = zone_security_array[zid]; + if (zone_is_data_buffers_kheap(zsflags.z_kheap_id) && + size <= zone_elem_inner_size(z)) { + return true; + } + } else if (kmem_range_id_contains(KMEM_RANGE_ID_DATA, + (vm_address_t)addr, size)) { + return true; + } + + return false; +} + +__mockable void * kalloc_type_impl_external(kalloc_type_view_t kt_view, zalloc_flags_t flags) { /* @@ -2757,7 +2906,7 @@ kfree_zone( } zsflags = zone_security_config(z); - if (kheap == KHEAP_DATA_BUFFERS) { + if (kheap == KHEAP_DATA_BUFFERS || kheap == KHEAP_DATA_SHARED) { if (kheap->kh_heap_id != zsflags.z_kheap_id) { kfree_heap_confusion_panic(kheap, data, size, z); } @@ -2775,7 +2924,7 @@ kfree_zone( zfree_ext(z, zstats ?: z->z_stats, data, ZFREE_PACK_SIZE(zsize, size)); } -void +__mockable void kfree_ext(void *kheap_or_kt_view, void *data, vm_size_t size) { vm_size_t bucket_size; @@ -2851,7 +3000,7 @@ void kfree_addr_ext(kheap, addr); } -void * +__mockable void * kalloc_type_impl_internal(kalloc_type_view_t kt_view, zalloc_flags_t flags) { zone_stats_t zs = kt_view->kt_zv.zv_stats; @@ -2907,7 +3056,7 @@ kfree_data_external(void *ptr, vm_size_t size); void kfree_data_external(void *ptr, vm_size_t size) { - return kheap_free(KHEAP_DATA_BUFFERS, ptr, size); + return kheap_free(GET_KEXT_KHEAP_DATA(), ptr, size); } void @@ -2915,7 +3064,7 @@ kfree_data_addr_external(void *ptr); void kfree_data_addr_external(void *ptr) { - return kheap_free_addr(KHEAP_DATA_BUFFERS, ptr); + return kheap_free_addr(GET_KEXT_KHEAP_DATA(), ptr); } void @@ -2991,7 +3140,7 @@ krealloc_large( kmr_flags |= KMR_KOBJECT; #endif } else { - assert(kheap == KHEAP_DATA_BUFFERS); + assert(kheap == KHEAP_DATA_BUFFERS || kheap == KHEAP_DATA_SHARED); } if (flags & Z_NOPAGEWAIT) { kmr_flags |= KMR_NOPAGEWAIT; @@ -3262,7 +3411,7 @@ krealloc_data_external( zalloc_flags_t flags) { flags = Z_VM_TAG_BT(flags & Z_KPI_MASK, VM_KERN_MEMORY_KALLOC_DATA); - return krealloc_ext(KHEAP_DATA_BUFFERS, ptr, old_size, new_size, flags, NULL).addr; + return krealloc_ext(GET_KEXT_KHEAP_DATA(), ptr, old_size, new_size, flags, NULL).addr; } void * @@ -3279,7 +3428,7 @@ krealloc_shared_data_external( zalloc_flags_t flags) { flags = Z_VM_TAG_BT(flags & Z_KPI_MASK, VM_KERN_MEMORY_KALLOC_SHARED); - return krealloc_ext(KHEAP_DATA_SHARED, ptr, old_size, new_size, flags, NULL).addr; + return krealloc_ext(GET_KEXT_KHEAP_DATA(), ptr, old_size, new_size, flags, NULL).addr; } __startup_func diff --git a/osfmk/kern/kalloc.h b/osfmk/kern/kalloc.h index 3c69f197a..065b79d0f 100644 --- a/osfmk/kern/kalloc.h +++ b/osfmk/kern/kalloc.h @@ -203,6 +203,20 @@ KALLOC_HEAP_DECLARE(KHEAP_KT_VAR); STARTUP_ARG(ZALLOC, STARTUP_RANK_MIDDLE, kheap_startup_init, var) +STATIC_IF_KEY_DECLARE_TRUE(kexts_enroll_data_shared); + +#define GET_KEXT_KHEAP_DATA() \ + static_if(kexts_enroll_data_shared) ? KHEAP_DATA_SHARED : KHEAP_DATA_BUFFERS + +/* + * Helper functions to query the status of security policies + * regarding data allocations. 
This has consequences for things + * like dedicated kmem range for shared data allocations + */ +extern bool kalloc_is_restricted_data_mode_telemetry(void); +extern bool kalloc_is_restricted_data_mode_enforced(void); +extern bool kmem_needs_data_share_range(void); + /* * Allocations of type SO_NAME are known to not have pointers for * most platforms -- for macOS this is not guaranteed @@ -239,8 +253,9 @@ KALLOC_HEAP_DECLARE(KHEAP_KT_VAR); * zone that your @c kalloc_type_view points to. * * @const KT_DATA_ONLY - * Represents that the type is "data-only". Adopters should not - * set this flag manually, it is meant for the compiler to set + * Represents that the type is "data-only" and is never shared + * with another security domain. Adopters should not set + * this flag manually, it is meant for the compiler to set * automatically when KALLOC_TYPE_CHECK(DATA) passes. * * @const KT_VM @@ -524,22 +539,22 @@ typedef struct kalloc_type_var_view *kalloc_type_var_view_t; * do not contain any pointers */ #define kalloc_data_tag(size, flags, itag) \ - kheap_alloc_tag(KHEAP_DATA_BUFFERS, size, flags, itag) + kheap_alloc_tag(GET_KEXT_KHEAP_DATA(), size, flags, itag) #define kalloc_data(size, flags) \ - kheap_alloc(KHEAP_DATA_BUFFERS, size, flags) + kheap_alloc(GET_KEXT_KHEAP_DATA(), size, flags) #define krealloc_data_tag(elem, old_size, new_size, flags, itag) \ - __kheap_realloc(KHEAP_DATA_BUFFERS, elem, old_size, new_size, \ + __kheap_realloc(GET_KEXT_KHEAP_DATA(), elem, old_size, new_size, \ __zone_flags_mix_tag(flags, itag), NULL) #define krealloc_data(elem, old_size, new_size, flags) \ krealloc_data_tag(elem, old_size, new_size, flags, \ VM_ALLOC_SITE_TAG()) #define kfree_data(elem, size) \ - kheap_free(KHEAP_DATA_BUFFERS, elem, size); + kheap_free(GET_KEXT_KHEAP_DATA(), elem, size); #define kfree_data_addr(elem) \ - kheap_free_addr(KHEAP_DATA_BUFFERS, elem); + kheap_free_addr(GET_KEXT_KHEAP_DATA(), elem); extern void kheap_free_bounded( kalloc_heap_t heap, @@ -555,6 +570,10 @@ extern void kalloc_non_data_require( void *data __unsafe_indexable, vm_size_t size); +extern bool kalloc_is_data_buffers( + void *addr, + vm_size_t size); + #else /* XNU_KERNEL_PRIVATE */ extern void *__sized_by(size) kalloc( @@ -1496,7 +1515,12 @@ kt_size(vm_size_t s1, vm_size_t s2, vm_size_t c2) #define kt_view_var \ KALLOC_CONCAT(kalloc_type_view_, __LINE__) +#ifndef __BUILDING_XNU_LIBRARY__ #define KALLOC_TYPE_SEGMENT "__DATA_CONST" +#else /* __BUILDING_XNU_LIBRARY__ */ +/* Special segments are not used when building for user-mode */ +#define KALLOC_TYPE_SEGMENT "__DATA" +#endif /* __BUILDING_XNU_LIBRARY__ */ /* * When kalloc_type_impl is called from xnu, it calls zalloc_flags @@ -1560,6 +1584,10 @@ kt_size(vm_size_t s1, vm_size_t s2, vm_size_t c2) /* * Kalloc type flags are adjusted to indicate if the type is "data-only" or * will use the VM or is a pointer array. + * + * There is no need to create another sig for data shared. We expect shareable + * allocations to be marked explicitly by the callers, by using the dedicated + * allocation API for shared data. */ #define KALLOC_TYPE_ADJUST_FLAGS(flags, ...) 
\ KALLOC_TYPE_CAST_FLAGS((flags | KT_CHANGED | KT_CHANGED2 | \ @@ -1738,6 +1766,12 @@ kalloc_type_get_size(uint32_t kt_size) return kt_size & KALLOC_TYPE_SIZE_MASK; } +static inline uint32_t +kalloc_type_size(kalloc_type_view_t ktv) +{ + return kalloc_type_get_size(ktv->kt_size); +} + extern bool IOMallocType_from_vm( kalloc_type_view_t ktv); diff --git a/osfmk/kern/kcdata.h b/osfmk/kern/kcdata.h index 88ea57984..38d4f701c 100644 --- a/osfmk/kern/kcdata.h +++ b/osfmk/kern/kcdata.h @@ -243,6 +243,8 @@ #include #include + + #define KCDATA_DESC_MAXLEN 32 /* including NULL byte at end */ #define KCDATA_FLAGS_STRUCT_PADDING_MASK 0xf @@ -488,7 +490,7 @@ struct kcdata_type_definition { #define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ #define STACKSHOT_KCCONTAINER_TASK 0x903u #define STACKSHOT_KCCONTAINER_THREAD 0x904u -#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ +#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2, task_snapshot_v3 */ #define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ #define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ #define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* dyld_shared_cache_loadinfo */ @@ -565,6 +567,10 @@ struct kcdata_type_definition { #define STACKSHOT_KCTYPE_EXCLAVE_TEXTLAYOUT_SEGMENTS 0x954u /* struct exclave_textlayout_segment_v2 */ #define STACKSHOT_KCTYPE_KERN_EXCLAVES_CRASH_THREADINFO 0x955u /* struct thread_crash_exclaves_info */ #define STACKSHOT_KCTYPE_LATENCY_INFO_CPU 0x956u /* struct stackshot_latency_cpu */ +#define STACKSHOT_KCTYPE_TASK_EXEC_META 0x957u /* struct task_exec_meta */ +#define STACKSHOT_KCTYPE_TASK_MEMORYSTATUS 0x958u /* struct task_memorystatus_snapshot */ +#define STACKSHOT_KCTYPE_LATENCY_INFO_BUFFER 0x95au /* struct stackshot_latency_buffer */ + struct stack_snapshot_frame32 { uint32_t lr; @@ -703,12 +709,33 @@ enum task_snapshot_flags { kTaskDyldCompactInfoTriedFault = 0x1000000000, kTaskWqExceededCooperativeThreadLimit = 0x2000000000, kTaskWqExceededActiveConstrainedThreadLimit = 0x4000000000, + kTaskRunawayMitigation = 0x8000000000, + kTaskIsActive = 0x10000000000, + kTaskIsManaged = 0x20000000000, + kTaskHasAssertion = 0x40000000000, }; // Note: Add any new flags to kcdata.py (ts_ss_flags) enum task_transition_type { kTaskIsTerminated = 0x1,// Past LPEXIT }; +/* See kcdata_private.h for more flag definitions */ +enum task_exec_flags : uint64_t { + kTaskExecTranslated = 0x01, /* Task is running under translation (eg, Rosetta) */ + kTaskExecHardenedHeap = 0x02, /* Task has the hardened heap security feature */ + kTaskExecReserved00 = 0x04, + kTaskExecReserved01 = 0x08, + kTaskExecReserved02 = 0x10, + kTaskExecReserved03 = 0x20 +}; + +/* metadata about a task that is fixed at spawn/exec time */ +struct task_exec_meta { + uint64_t tem_flags; /* task_exec_flags */ +} __attribute__((packed)); + + + enum thread_snapshot_flags { /* k{User,Kernel}64_p (values 0x1 and 0x2) are defined in generic_snapshot_flags */ kHasDispatchSerial = 0x4, @@ -962,6 +989,27 @@ struct task_snapshot_v2 { char ts_p_comm[32]; } __attribute__ ((packed)); +struct task_snapshot_v3 { + uint64_t ts_unique_pid; + uint64_t ts_ss_flags; + uint64_t ts_user_time_in_terminated_threads; + uint64_t ts_system_time_in_terminated_threads; + uint64_t ts_p_start_sec; + uint64_t ts_task_size; + uint64_t ts_max_resident_size; + uint32_t ts_suspend_count; + uint32_t ts_faults; + uint32_t ts_pageins; + uint32_t ts_cow_faults; + uint32_t ts_was_throttled; + 
uint32_t ts_did_throttle; + uint32_t ts_latency_qos; + int32_t ts_pid; + char ts_p_comm[32]; + uint32_t ts_uid; + uint32_t ts_gid; +} __attribute__ ((packed)); + struct transitioning_task_snapshot { uint64_t tts_unique_pid; uint64_t tts_ss_flags; @@ -986,6 +1034,13 @@ struct task_delta_snapshot_v2 { uint32_t tds_latency_qos; } __attribute__ ((packed)); +struct task_memorystatus_snapshot { + int32_t tms_current_memlimit; + int32_t tms_effectivepriority; + int32_t tms_requestedpriority; + int32_t tms_assertionpriority; +} __attribute__ ((packed)); + #define KCDATA_INVALID_CS_TRUST_LEVEL 0xffffffff struct stackshot_task_codesigning_info { uint64_t csflags; @@ -1137,6 +1192,14 @@ struct stackshot_latency_cpu { uint64_t intercluster_buf_used; } __attribute__((packed)); +/* only collected if STACKSHOT_COLLECTS_LATENCY_INFO is set to !0 */ +struct stackshot_latency_buffer { + int32_t cluster_type; + uint64_t size; + uint64_t used; + uint64_t overhead; +} __attribute__ ((packed)); + /* only collected if STACKSHOT_COLLECTS_LATENCY_INFO is set to !0 */ struct stackshot_latency_task { uint64_t task_uniqueid; @@ -1300,6 +1363,10 @@ struct crashinfo_mb { uint64_t data[64]; } __attribute__((packed)); +struct crashinfo_task_security_config { + uint32_t task_security_config; /* struct task_security_config */ +} __attribute__((packed)); + #define MAX_CRASHINFO_SIGNING_ID_LEN 64 #define MAX_CRASHINFO_TEAM_ID_LEN 32 @@ -1379,6 +1446,10 @@ struct crashinfo_mb { #define TASK_CRASHINFO_JIT_ADDRESS_RANGE 0x840 /* struct crashinfo_jit_address_range */ #define TASK_CRASHINFO_MB 0x841 /* struct crashinfo_mb */ #define TASK_CRASHINFO_CS_AUXILIARY_INFO 0x842 /* uint64_t */ +#define TASK_CRASHINFO_RLIM_CORE 0x843 /* rlim_t */ +#define TASK_CRASHINFO_CORE_ALLOWED 0x844 /* uint8_t */ +#define TASK_CRASHINFO_TASK_SECURITY_CONFIG 0x845 /* struct task_security_config */ + #define TASK_CRASHINFO_END KCDATA_TYPE_BUFFER_END diff --git a/osfmk/kern/kcdata_private.h b/osfmk/kern/kcdata_private.h new file mode 100644 index 000000000..d48a8ddee --- /dev/null +++ b/osfmk/kern/kcdata_private.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#pragma once + diff --git a/osfmk/kern/kern_cdata.c b/osfmk/kern/kern_cdata.c index 3eed7e9fb..f6a8fe236 100644 --- a/osfmk/kern/kern_cdata.c +++ b/osfmk/kern/kern_cdata.c @@ -78,6 +78,7 @@ struct _uint32_with_description_data { int _Atomic lw_corpse_obj_cnt = 0; IPC_KOBJECT_DEFINE(IKOT_KCDATA, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = kcdata_object_no_senders); @@ -189,7 +190,8 @@ kcdata_object_destroy(kcdata_object_t obj) /* Release the port */ if (IP_VALID(port)) { - ipc_kobject_dealloc_port(port, 0, IKOT_KCDATA); + ipc_kobject_dealloc_port(port, IPC_KOBJECT_NO_MSCOUNT, + IKOT_KCDATA); } /* Release the ref for rate-limited kcdata object type(s) */ @@ -247,7 +249,7 @@ convert_kcdata_object_to_port(kcdata_object_t obj) zone_require(KCDATA_OBJECT->kt_zv.zv_zone, obj); if (!ipc_kobject_make_send_lazy_alloc_port(&obj->ko_port, - obj, IKOT_KCDATA, IPC_KOBJECT_ALLOC_NONE)) { + obj, IKOT_KCDATA)) { kcdata_object_release(obj); } /* object ref consumed */ diff --git a/osfmk/kern/kern_stackshot.c b/osfmk/kern/kern_stackshot.c index dc836597a..8f9b7d61e 100644 --- a/osfmk/kern/kern_stackshot.c +++ b/osfmk/kern/kern_stackshot.c @@ -57,6 +57,7 @@ #include /* bcopy */ #include +#include #include #include #include @@ -73,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -82,6 +84,8 @@ #include #include + + #ifdef CONFIG_EXCLAVES #include #endif /* CONFIG_EXCLAVES */ @@ -478,8 +482,11 @@ extern void proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t extern void proc_archinfo_kdp(void* p, cpu_type_t* cputype, cpu_subtype_t* cpusubtype); extern uint64_t proc_getcsflags_kdp(void * p); extern boolean_t proc_binary_uuid_kdp(task_t task, uuid_t uuid); +extern uint32_t proc_getuid(proc_t); +extern uint32_t proc_getgid(proc_t); +extern void proc_memstat_data_kdp(void *p, int32_t *current_memlimit, int32_t *prio_effective, int32_t *prio_requested, int32_t *prio_assertion); extern int memorystatus_get_pressure_status_kdp(void); -extern void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); +extern void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit, boolean_t *is_active, boolean_t *is_managed, boolean_t *has_assertion); extern void panic_stackshot_release_lock(void); extern int count_busy_buffers(void); /* must track with declaration in bsd/sys/buf_internal.h */ @@ -496,6 +503,7 @@ static size_t stackshot_plh_est_size(void); #if CONFIG_EXCLAVES static kern_return_t collect_exclave_threads(uint64_t); static kern_return_t stackshot_setup_exclave_waitlist(void); +static void stackshot_cleanup_exclave_waitlist(void); #endif /* @@ -797,6 +805,7 @@ stackshot_buffer_alloc( stackshot_panic_guard(); assert(!stackshot_ctx.sc_is_singlethreaded); + assert(buffer->ssb_ptr != NULL); os_atomic_rmw_loop(&buffer->ssb_used, o_used, new_used, relaxed, { new_used = o_used + size; @@ -840,7 +849,7 @@ stackshot_best_buffer_alloc(size_t size, kern_return_t *error) /* Try other buffers now. 
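 * A NULL ssb_ptr means that cluster had no participating CPUs and
 * its buffer was never set up, so it is skipped here.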
*/ if (err != KERN_SUCCESS) { for (size_t buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) { - if (buf_idx == my_cluster) { + if ((buf_idx == my_cluster) || (stackshot_ctx.sc_buffers[buf_idx].ssb_ptr == NULL)) { continue; } @@ -1286,8 +1295,11 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint64_t flags, ui } #if CONFIG_EXCLAVES - if (error == KERN_SUCCESS && stackshot_exclave_inspect_ctids) { - error = collect_exclave_threads(flags); + if (stackshot_exclave_inspect_ctids) { + if (error == KERN_SUCCESS) { + error = collect_exclave_threads(flags); + } + stackshot_cleanup_exclave_waitlist(); } #endif /* CONFIG_EXCLAVES */ @@ -1441,7 +1453,7 @@ get_stackshot_est_tasksize(uint64_t trace_flags) size_t total_size; size_t threads_per_task = (((threads_count + terminated_threads_count) - 1) / (tasks_count + terminated_tasks_count)) + 1; size_t est_thread_size = sizeof(struct thread_snapshot_v4) + 42 * sizeof(uintptr_t); - size_t est_task_size = sizeof(struct task_snapshot_v2) + + size_t est_task_size = sizeof(struct task_snapshot_v3) + TASK_UUID_AVG_SIZE + TASK_SHARED_CACHE_AVG_SIZE + sizeof_if_traceflag(struct io_stats_snapshot, STACKSHOT_INSTRS_CYCLES) + @@ -1574,8 +1586,9 @@ stackshot_push_duration_and_latency(kcdata_descriptor_t kcdata) mach_vm_address_t out_addr; bool use_fault_path = ((stackshot_flags & (STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING)) != 0); #if STACKSHOT_COLLECTS_LATENCY_INFO - size_t buffer_used = 0; - size_t buffer_overhead = 0; + size_t buffer_used = 0; + size_t buffer_overhead = 0; + struct stackshot_latency_buffer buffer_latency; #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */ if (use_fault_path) { @@ -1619,12 +1632,35 @@ stackshot_push_duration_and_latency(kcdata_descriptor_t kcdata) } kcd_exit_on_error(kcdata_compression_window_close(kcdata)); + kcdata_compression_window_open(kcdata); + kcd_exit_on_error(kcdata_get_memory_addr_for_array( + kcdata, STACKSHOT_KCTYPE_LATENCY_INFO_BUFFER, sizeof(struct stackshot_latency_buffer), stackshot_ctx.sc_num_buffers, &out_addr)); + /* Add up buffer info */ - for (size_t buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) { + for (size_t buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++, out_addr += sizeof(buffer_latency)) { struct stackshot_buffer *buf = &stackshot_ctx.sc_buffers[buf_idx]; - buffer_used += os_atomic_load(&buf->ssb_used, relaxed); - buffer_overhead += os_atomic_load(&buf->ssb_overhead, relaxed); + if (buf->ssb_ptr == NULL) { + kcdata_bzero(kcdata, out_addr, sizeof(struct stackshot_latency_buffer)); + continue; + } + +#if defined(__arm64__) + ml_topology_cluster_t *cluster = &ml_get_topology_info()->clusters[buf_idx]; + buffer_latency.cluster_type = cluster->cluster_type; +#else /* __arm64__ */ + buffer_latency.cluster_type = CLUSTER_TYPE_SMP; +#endif /* !__arm64__ */ + buffer_latency.size = buf->ssb_size; + buffer_latency.used = os_atomic_load(&buf->ssb_used, relaxed); + buffer_latency.overhead = os_atomic_load(&buf->ssb_overhead, relaxed); + kcd_exit_on_error(kcdata_memcpy( + kcdata, out_addr, &buffer_latency, sizeof(buffer_latency))); + + buffer_used += buffer_latency.used; + buffer_overhead += buffer_latency.overhead; } + kcd_exit_on_error(kcdata_compression_window_close(kcdata)); + stackshot_ctx.sc_latency.buffer_size = stackshot_ctx.sc_args.buffer_size; stackshot_ctx.sc_latency.buffer_overhead = buffer_overhead; stackshot_ctx.sc_latency.buffer_used = buffer_used; @@ -1673,7 +1709,7 @@ stackshot_alloc_final_kcdata(void) } if ((error = 
kmem_alloc(kernel_map, &final_kcdata_buffer, stackshot_args.buffer_size, - KMA_ZERO | KMA_DATA, VM_KERN_MEMORY_DIAG)) != KERN_SUCCESS) { + KMA_ZERO | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG)) != KERN_SUCCESS) { os_log_error(OS_LOG_DEFAULT, "stackshot: final allocation failed: %d, allocating %u bytes of %u max, try %llu\n", (int)error, stackshot_args.buffer_size, max_tracebuf_size, stackshot_tries); return KERN_RESOURCE_SHORTAGE; } @@ -1924,6 +1960,11 @@ stackshot_remap_buffer(void *stackshotbuf, uint32_t bytes_traced, uint64_t out_b #if CONFIG_EXCLAVES +/* + * Allocates an array for exclaves inspection from the stackshot buffer. This + * state must be cleaned up by calling `stackshot_cleanup_exclave_waitlist` + * after the stackshot is finished. + */ static kern_return_t stackshot_setup_exclave_waitlist(void) { @@ -1950,6 +1991,14 @@ error: return error; } +static void +stackshot_cleanup_exclave_waitlist(void) +{ + stackshot_exclave_inspect_ctids = NULL; + stackshot_exclave_inspect_ctid_capacity = 0; + stackshot_exclave_inspect_ctid_count = 0; +} + static kern_return_t collect_exclave_threads(uint64_t ss_flags) { @@ -1997,11 +2046,6 @@ collect_exclave_threads(uint64_t ss_flags) goto out; } out: - /* clear Exclave buffer now that it's been used */ - stackshot_exclave_inspect_ctids = NULL; - stackshot_exclave_inspect_ctid_capacity = 0; - stackshot_exclave_inspect_ctid_count = 0; - lck_mtx_unlock(&exclaves_collect_mtx); return kr; } @@ -2482,7 +2526,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi for (; snapshot_args.buffer_size <= max_tracebuf_size; snapshot_args.buffer_size = MIN(snapshot_args.buffer_size << 1, max_tracebuf_size)) { stackshot_tries++; if ((error = kmem_alloc(kernel_map, (vm_offset_t *)&snapshot_args.buffer, snapshot_args.buffer_size, - KMA_ZERO | KMA_DATA, VM_KERN_MEMORY_DIAG)) != KERN_SUCCESS) { + KMA_ZERO | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG)) != KERN_SUCCESS) { os_log_error(OS_LOG_DEFAULT, "stackshot: initial allocation failed: %d, allocating %u bytes of %u max, try %llu\n", (int)error, snapshot_args.buffer_size, max_tracebuf_size, stackshot_tries); error = KERN_RESOURCE_SHORTAGE; goto error_exit; @@ -2558,11 +2602,14 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi } #if CONFIG_EXCLAVES - if (error == KERN_SUCCESS && stackshot_exclave_inspect_ctids) { - if (stackshot_exclave_inspect_ctid_count > 0) { - STACKSHOT_TESTPOINT(TP_START_COLLECTION); + if (stackshot_exclave_inspect_ctids) { + if (error == KERN_SUCCESS) { + if (stackshot_exclave_inspect_ctid_count > 0) { + STACKSHOT_TESTPOINT(TP_START_COLLECTION); + } + error = collect_exclave_threads(snapshot_args.flags); } - error = collect_exclave_threads(snapshot_args.flags); + stackshot_cleanup_exclave_waitlist(); } #endif /* CONFIG_EXCLAVES */ @@ -2577,7 +2624,12 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi goto error_exit; } if (error == KERN_INSUFFICIENT_BUFFER_SIZE && snapshot_args.buffer_size == max_tracebuf_size) { - os_log_error(OS_LOG_DEFAULT, "stackshot: final buffer size was insufficient at maximum size\n"); + os_log_error(OS_LOG_DEFAULT, "stackshot: final buffer size was insufficient at maximum size: " + "try %llu, estimate %u, flags %llu, pid %d, " + "tasks: %d, terminated_tasks %d, threads: %d, terminated_threads: %d\n", + stackshot_tries, snapshot_args.buffer_size, snapshot_args.flags, snapshot_args.pid, + tasks_count, terminated_tasks_count, + threads_count, terminated_threads_count); error = 
KERN_RESOURCE_SHORTAGE; goto error_exit; } @@ -2600,7 +2652,12 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi time_end - time_start, stackshot_estimate, snapshot_args.buffer_size); stackshot_duration_prior_abs += (time_end - time_start); if (snapshot_args.buffer_size == max_tracebuf_size) { - os_log_error(OS_LOG_DEFAULT, "stackshot: initial buffer size was insufficient at maximum size\n"); + os_log_error(OS_LOG_DEFAULT, "stackshot: initial buffer size was insufficient at maximum size: " + "try %llu, estimate %u, flags %llu, pid %d, " + "tasks: %d, terminated_tasks %d, threads: %d, terminated_threads: %d\n", + stackshot_tries, snapshot_args.buffer_size, snapshot_args.flags, snapshot_args.pid, + tasks_count, terminated_tasks_count, + threads_count, terminated_threads_count); error = KERN_RESOURCE_SHORTAGE; goto error_exit; } @@ -2724,17 +2781,12 @@ kdp_snapshot_preflight_internal(struct kdp_snapshot_args args) #else /* __AMP__ */ stackshot_ctx.sc_num_buffers = 1; #endif /* !__AMP__ */ - size_t bufsz = args.buffer_size / stackshot_ctx.sc_num_buffers; - for (int buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) { - stackshot_ctx.sc_buffers[buf_idx] = (struct stackshot_buffer) { - .ssb_ptr = (void*) ((mach_vm_address_t) args.buffer + (bufsz * buf_idx)), - .ssb_size = bufsz, - .ssb_used = 0, - .ssb_freelist = NULL, - .ssb_freelist_lock = 0, - .ssb_overhead = 0 - }; - } + + /* + * Set all buffer sizes to zero. We'll use ssb_size to track how many CPUs in + * that cluster are participating in the stackshot. + */ + bzero(stackshot_ctx.sc_buffers, sizeof(stackshot_ctx.sc_buffers)); /* Setup per-cpu state */ percpu_foreach_base(base) { @@ -2978,7 +3030,8 @@ stackshot_plh_setup(void) static int16_t stackshot_plh_hash(struct ipc_service_port_label *ispl) { - uintptr_t ptr = (uintptr_t)ispl; + uintptr_t ptr = VM_KERNEL_STRIP_PTR((uintptr_t)ispl); + static_assert(STACKSHOT_PLH_SHIFT < 16, "plh_hash must fit in 15 bits"); #define PLH_HASH_STEP(ptr, x) \ ((((x) * STACKSHOT_PLH_SHIFT) < (sizeof(ispl) * CHAR_BIT)) ? 
((ptr) >> ((x) * STACKSHOT_PLH_SHIFT)) : 0) @@ -3189,8 +3242,12 @@ error_exit: } #endif /* DEVELOPMENT || DEBUG */ -static uint64_t -kcdata_get_task_ss_flags(task_t task) +/* + * This function can be called from stackshot / kdp context or + * from telemetry / current task context + */ +uint64_t +kcdata_get_task_ss_flags(task_t task, bool from_stackshot) { uint64_t ss_flags = 0; boolean_t task_64bit_addr = task_has_64Bit_addr(task); @@ -3211,6 +3268,9 @@ kcdata_get_task_ss_flags(task_t task) if (task->effective_policy.tep_darwinbg == 1) { ss_flags |= kTaskDarwinBG; } + if (task->requested_policy.trp_ext_darwinbg == 1) { + ss_flags |= kTaskExtDarwinBG; + } if (task->requested_policy.trp_role == TASK_FOREGROUND_APPLICATION) { ss_flags |= kTaskIsForeground; } @@ -3223,7 +3283,8 @@ kcdata_get_task_ss_flags(task_t task) #if CONFIG_MEMORYSTATUS boolean_t dirty = FALSE, dirty_tracked = FALSE, allow_idle_exit = FALSE; - memorystatus_proc_flags_unsafe(bsd_info, &dirty, &dirty_tracked, &allow_idle_exit); + boolean_t is_active = FALSE, is_managed = FALSE, has_assertion = FALSE; + memorystatus_proc_flags_unsafe(bsd_info, &dirty, &dirty_tracked, &allow_idle_exit, &is_active, &is_managed, &has_assertion); if (dirty) { ss_flags |= kTaskIsDirty; } @@ -3233,13 +3294,24 @@ kcdata_get_task_ss_flags(task_t task) if (allow_idle_exit) { ss_flags |= kTaskAllowIdleExit; } + if (is_active) { + ss_flags |= kTaskIsActive; + } + if (is_managed) { + ss_flags |= kTaskIsManaged; + } + if (has_assertion) { + ss_flags |= kTaskHasAssertion; + } #endif if (task->effective_policy.tep_tal_engaged) { ss_flags |= kTaskTALEngaged; } - ss_flags |= workqueue_get_task_ss_flags_from_pwq_state_kdp(bsd_info); + if (from_stackshot) { + ss_flags |= workqueue_get_task_ss_flags_from_pwq_state_kdp(bsd_info); + } #if IMPORTANCE_INHERITANCE if (task->task_imp_base) { @@ -3251,6 +3323,15 @@ kcdata_get_task_ss_flags(task_t task) } } #endif + + if (task->effective_policy.tep_runaway_mitigation) { + ss_flags |= kTaskRunawayMitigation; + } + + if (task->t_flags & TF_TELEMETRY) { + ss_flags |= kTaskRsrcFlagged; + } + return ss_flags; } @@ -3261,7 +3342,6 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_ uint64_t shared_cache_slide = 0; uint64_t shared_cache_first_mapping = 0; - uint32_t kdp_fault_results = 0; uint32_t shared_cache_id = 0; struct dyld_shared_cache_loadinfo shared_cache_data = {0}; @@ -3287,8 +3367,14 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_ goto error_exit; } - /* We haven't copied in the shared region UUID yet as part of setup */ + /* + * We haven't copied in the shared region UUID yet as part of setup + * This seems to happen infrequently with DriverKit processes on certain + * configurations, even once the process has already been set up. 
+ * rdar://139753101 + */ if (!shared_cache_first_mapping || !task->shared_region->sr_uuid_copied) { + *task_snap_ss_flags |= kTaskSharedRegionInfoUnavailable; goto error_exit; } @@ -3335,18 +3421,6 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_ kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(shared_cache_data), &shared_cache_data)); error_exit: - if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) { - *task_snap_ss_flags |= kTaskUUIDInfoMissing; - } - - if (kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) { - *task_snap_ss_flags |= kTaskUUIDInfoTriedFault; - } - - if (kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) { - *task_snap_ss_flags |= kTaskUUIDInfoFaultedIn; - } - return error; } @@ -3600,6 +3674,59 @@ error_exit: return error; } +uint64_t kdp_task_exec_meta_flags(task_t task); + +uint64_t +kdp_task_exec_meta_flags(task_t task) +{ + uint64_t flags = 0; + +#if CONFIG_ROSETTA + if (task_is_translated(task)) { + flags |= kTaskExecTranslated; + } +#endif /* CONFIG_ROSETTA */ + + if (task_has_hardened_heap(task)) { + flags |= kTaskExecHardenedHeap; + } + + + return flags; +} + +/* Compute the set of flags that kdp_task_exec_meta_flags can return based on the kernel config */ +static uint64_t +stackshot_available_task_exec_flags(void) +{ + uint64_t flags_mask = 0; + +#if CONFIG_ROSETTA + flags_mask |= kTaskExecTranslated; +#endif /* CONFIG_ROSETTA */ + + flags_mask |= kTaskExecHardenedHeap; + + + return flags_mask; +} + +static kern_return_t +kcdata_record_task_exec_meta(kcdata_descriptor_t kcd, task_t task) +{ + struct task_exec_meta tem = {}; + kern_return_t error = KERN_SUCCESS; + + tem.tem_flags = kdp_task_exec_meta_flags(task); + + if (tem.tem_flags != 0) { + kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_TASK_EXEC_META, sizeof(struct task_exec_meta), &tem)); + } + +error_exit: + return error; +} + static kern_return_t kcdata_record_task_iostats(kcdata_descriptor_t kcd, task_t task) { @@ -3780,14 +3907,14 @@ kcdata_record_transitioning_task_snapshot(kcdata_descriptor_t kcd, task_t task, task_pid = 0 - task_pid; } - /* the task_snapshot_v2 struct is large - avoid overflowing the stack */ + /* the transitioning_task_snapshot struct is large - avoid overflowing the stack */ kcdata_compression_window_open(kcd); kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TRANSITIONING_TASK_SNAPSHOT, sizeof(struct transitioning_task_snapshot), &out_addr)); cur_tsnap = (struct transitioning_task_snapshot *)out_addr; bzero(cur_tsnap, sizeof(*cur_tsnap)); cur_tsnap->tts_unique_pid = task_uniqueid; - cur_tsnap->tts_ss_flags = kcdata_get_task_ss_flags(task); + cur_tsnap->tts_ss_flags = kcdata_get_task_ss_flags(task, true); cur_tsnap->tts_ss_flags |= task_snap_ss_flags; cur_tsnap->tts_transition_type = transition_type; cur_tsnap->tts_pid = task_pid; @@ -3825,7 +3952,11 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace kern_return_t error = KERN_SUCCESS; mach_vm_address_t out_addr = 0; - struct task_snapshot_v2 * cur_tsnap = NULL; + struct task_snapshot_v3 * cur_tsnap = NULL; +#if CONFIG_MEMORYSTATUS + mach_vm_address_t memorystatus_addr = 0; + struct task_memorystatus_snapshot *memorystatus_snapshot = NULL; +#endif /* CONFIG_MEMORYSTATUS */ #if STACKSHOT_COLLECTS_LATENCY_INFO latency_info->cur_tsnap_latency = mach_absolute_time(); #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */ @@ -3843,14 +3974,14 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t 
trace task_pid = 0 - task_pid; } - /* the task_snapshot_v2 struct is large - avoid overflowing the stack */ + /* the task_snapshot_v3 struct is large - avoid overflowing the stack */ kcdata_compression_window_open(kcd); - kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_SNAPSHOT, sizeof(struct task_snapshot_v2), &out_addr)); - cur_tsnap = (struct task_snapshot_v2 *)out_addr; + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_SNAPSHOT, sizeof(struct task_snapshot_v3), &out_addr)); + cur_tsnap = (struct task_snapshot_v3 *)out_addr; bzero(cur_tsnap, sizeof(*cur_tsnap)); cur_tsnap->ts_unique_pid = task_uniqueid; - cur_tsnap->ts_ss_flags = kcdata_get_task_ss_flags(task); + cur_tsnap->ts_ss_flags = kcdata_get_task_ss_flags(task, true); cur_tsnap->ts_ss_flags |= task_snap_ss_flags; struct recount_usage term_usage = { 0 }; @@ -3877,8 +4008,12 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace /* Add the BSD process identifiers */ if (task_pid != -1 && bsd_info != NULL) { proc_name_kdp(bsd_info, cur_tsnap->ts_p_comm, sizeof(cur_tsnap->ts_p_comm)); + cur_tsnap->ts_uid = proc_getuid(bsd_info); + cur_tsnap->ts_gid = proc_getgid(bsd_info); } else { cur_tsnap->ts_p_comm[0] = '\0'; + cur_tsnap->ts_uid = UINT32_MAX; + cur_tsnap->ts_gid = UINT32_MAX; #if IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) if (task->task_imp_base != NULL) { kdp_strlcpy(cur_tsnap->ts_p_comm, &task->task_imp_base->iit_procname[0], @@ -3887,6 +4022,20 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace #endif /* IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) */ } +#if CONFIG_MEMORYSTATUS + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_MEMORYSTATUS, sizeof(struct task_memorystatus_snapshot), &memorystatus_addr)); + memorystatus_snapshot = (struct task_memorystatus_snapshot *)memorystatus_addr; + bzero(memorystatus_snapshot, sizeof(*memorystatus_snapshot)); + + + int32_t current_memlimit = 0, effectiveprio = 0, requestedprio = 0, assertionprio = 0; + proc_memstat_data_kdp(bsd_info, ¤t_memlimit, &effectiveprio, &requestedprio, &assertionprio); + memorystatus_snapshot->tms_current_memlimit = current_memlimit; + memorystatus_snapshot->tms_effectivepriority = effectiveprio; + memorystatus_snapshot->tms_requestedpriority = requestedprio; + memorystatus_snapshot->tms_assertionpriority = assertionprio; +#endif /* CONFIG_MEMORYSTATUS */ + kcd_exit_on_error(kcdata_compression_window_close(kcd)); #if CONFIG_COALITIONS @@ -3992,7 +4141,7 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t cur_tsnap = (struct task_delta_snapshot_v2 *)out_addr; cur_tsnap->tds_unique_pid = task_uniqueid; - cur_tsnap->tds_ss_flags = kcdata_get_task_ss_flags(task); + cur_tsnap->tds_ss_flags = kcdata_get_task_ss_flags(task, true); cur_tsnap->tds_ss_flags |= task_snap_ss_flags; struct recount_usage usage = { 0 }; @@ -4128,6 +4277,7 @@ _stackshot_backtrace_copy(void *vctx, void *dst, user_addr_t src, size_t size) */ kasan_notify_address_nopoison(src_kva, size); #endif + memcpy(dst, (const void *)src_kva, size); return 0; @@ -4605,7 +4755,6 @@ static kern_return_t kdp_stackshot_record_task(task_t task) { boolean_t active_kthreads_only_p = ((stackshot_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0); - boolean_t save_donating_pids_p = ((stackshot_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0); boolean_t collect_delta_stackshot = ((stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); boolean_t 
save_owner_info = ((stackshot_flags & STACKSHOT_THREAD_WAITINFO) != 0); boolean_t include_drivers = ((stackshot_flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) != 0); @@ -4646,6 +4795,8 @@ kdp_stackshot_record_task(task_t task) boolean_t task_in_transition = task_in_teardown; // here we can add other types of transition. uint32_t container_type = (task_in_transition) ? STACKSHOT_KCCONTAINER_TRANSITIONING_TASK : STACKSHOT_KCCONTAINER_TASK; uint32_t transition_type = (task_in_teardown) ? kTaskIsTerminated : 0; + /* Task just exec'd and this is the old task */ + bool task_is_exec_transit = task_did_exec_internal(task) || task_is_exec_copy_internal(task); if (task_in_transition) { collect_delta_stackshot = FALSE; @@ -4676,7 +4827,9 @@ kdp_stackshot_record_task(task_t task) #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */ /* Trace everything, unless a process was specified. Add in driver tasks if requested. */ - if ((stackshot_args.pid == -1) || (stackshot_args.pid == task_pid) || (include_drivers && task_is_driver(task))) { + if ((stackshot_args.pid == -1) || + ((stackshot_args.pid == task_pid) && !task_is_exec_transit) || + (include_drivers && task_is_driver(task))) { #if STACKSHOT_COLLECTS_LATENCY_INFO stackshot_cpu_latency.tasks_processed++; #endif @@ -4738,6 +4891,7 @@ kdp_stackshot_record_task(task_t task) kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, &task_snap_ss_flags)); kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, stackshot_flags, have_pmap, &task_snap_ss_flags)); + kcd_exit_on_error(kcdata_record_task_exec_meta(stackshot_kcdata_p, task)); #if STACKSHOT_COLLECTS_LATENCY_INFO if (!task_in_transition) { kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, stackshot_flags, have_pmap, task_snap_ss_flags, &latency_info)); @@ -4919,18 +5073,16 @@ kdp_stackshot_record_task(task_t task) } #if IMPORTANCE_INHERITANCE - if (save_donating_pids_p) { - /* Ensure the buffer is big enough, since we're using the stack buffer for this. */ - static_assert(TASK_IMP_WALK_LIMIT * sizeof(int32_t) <= MAX_FRAMES * sizeof(uintptr_t)); - saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS, - (char*) stackshot_cpu_ctx.scc_stack_buffer, TASK_IMP_WALK_LIMIT); - if (saved_count > 0) { - /* Variable size array - better not have it on the stack. */ - kcdata_compression_window_open(stackshot_kcdata_p); - kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS, - sizeof(int32_t), saved_count, stackshot_cpu_ctx.scc_stack_buffer)); - kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p)); - } + /* Ensure the buffer is big enough, since we're using the stack buffer for this. */ + static_assert(TASK_IMP_WALK_LIMIT * sizeof(int32_t) <= MAX_FRAMES * sizeof(uintptr_t)); + saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS, + (char*) stackshot_cpu_ctx.scc_stack_buffer, TASK_IMP_WALK_LIMIT); + if (saved_count > 0) { + /* Variable size array - better not have it on the stack. 
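+ * Streaming it through a kcdata compression window pushes it
+ * straight into the stackshot buffer instead.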
*/ + kcdata_compression_window_open(stackshot_kcdata_p); + kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS, + sizeof(int32_t), saved_count, stackshot_cpu_ctx.scc_stack_buffer)); + kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p)); } #endif @@ -5131,6 +5283,8 @@ kdp_stackshot_kcdata_format(void) kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_initial_estimate, "stackshot_size_estimate")); kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_initial_estimate_adj, "stackshot_size_estimate_adj")); } + kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, stackshot_available_task_exec_flags(), "stackshot_te_flags_mask")); + #if STACKSHOT_COLLECTS_LATENCY_INFO stackshot_ctx.sc_latency.setup_latency_mt = mach_absolute_time(); @@ -5176,6 +5330,7 @@ kdp_stackshot_kcdata_format(void) kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_GLOBAL_MEM_STATS, sizeof(mais), &mais)); } + #if CONFIG_THREAD_GROUPS struct thread_group_snapshot_v3 *thread_groups = NULL; int num_thread_groups = 0; @@ -5528,6 +5683,7 @@ kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap) } } + static vm_offset_t stackshot_find_phys(vm_map_t map, vm_offset_t target_addr, kdp_fault_flags_t fault_flags, uint32_t *kdp_fault_result_flags) { @@ -5608,10 +5764,46 @@ do_stackshot(void *context) uint64_t abs_time = mach_absolute_time(), abs_time_end = 0; kdp_snapshot++; - _stackshot_validation_reset(); - error = stackshot_plh_setup(); /* set up port label hash */ - if (!stackshot_ctx.sc_is_singlethreaded) { +#if defined(__arm64__) + /* + * Set up buffers. We used the ssb_size entry in each buffer entry + * to indicate how many CPUs in that cluster are participating in the + * stackshot, so that we can divvy up buffer space accordingly. + */ + size_t buf_per_cpu = stackshot_args.buffer_size / os_atomic_load(&stackshot_ctx.sc_cpus_working, relaxed); + buf_per_cpu -= buf_per_cpu % sizeof(uint64_t); /* align to uint64_t */ + mach_vm_address_t cur_addr = (mach_vm_address_t) stackshot_args.buffer; + for (int buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) { + size_t bufsz = buf_per_cpu * stackshot_ctx.sc_buffers[buf_idx].ssb_size; + if (bufsz == 0) { + continue; + } + stackshot_ctx.sc_buffers[buf_idx] = (struct stackshot_buffer) { + .ssb_ptr = (void*) cur_addr, + .ssb_size = bufsz, + .ssb_used = 0, + .ssb_freelist = NULL, + .ssb_freelist_lock = 0, + .ssb_overhead = 0 + }; + cur_addr += bufsz; + } + assert(cur_addr <= ((mach_vm_address_t) stackshot_args.buffer + stackshot_args.buffer_size)); +#else /* __arm64__ */ + /* + * On Intel, we always just have one buffer + */ + stackshot_ctx.sc_buffers[0] = (struct stackshot_buffer) { + .ssb_ptr = stackshot_args.buffer, + .ssb_size = stackshot_args.buffer_size, + .ssb_used = 0, + .ssb_freelist = NULL, + .ssb_freelist_lock = 0, + .ssb_overhead = 0 + }; +#endif /* !__arm64__ */ + /* Set up queues. These numbers shouldn't change, but slightly fudge queue size just in case. 
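 * The queues are sized from the task counts gathered earlier; worker
 * CPUs on different core types later drain them in opposite orders
 * (see stackshot_cpu_do_work).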
*/ queue_size = FUDGED_SIZE(tasks_count + terminated_tasks_count, 10); for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) { @@ -5628,6 +5820,9 @@ do_stackshot(void *context) } } + _stackshot_validation_reset(); + error = stackshot_plh_setup(); /* set up port label hash */ + if (error != KERN_SUCCESS) { stackshot_set_error(error); return error; @@ -5717,9 +5912,7 @@ do_stackshot(void *context) } if (stackshot_ctx.sc_retval != KERN_SUCCESS && stackshot_exclave_inspect_ctids) { /* Clear inspection CTID list: no need to wait for these threads */ - stackshot_exclave_inspect_ctid_count = 0; - stackshot_exclave_inspect_ctid_capacity = 0; - stackshot_exclave_inspect_ctids = NULL; + stackshot_cleanup_exclave_waitlist(); } #endif @@ -5791,7 +5984,14 @@ stackshot_cpu_preflight(void) stackshot_cpu_ctx.scc_can_work = is_calling_cpu || (is_recommended && !stackshot_ctx.sc_is_singlethreaded); if (stackshot_cpu_ctx.scc_can_work) { - os_atomic_inc(&stackshot_ctx.sc_cpus_working, relaxed); + /* + * Increase size of our cluster's buffer to indicate how many CPUs in this + * cluster are participating + */ +#if defined(__arm64__) + os_atomic_inc(&stackshot_ctx.sc_buffers[cpu_cluster_id()].ssb_size, relaxed); +#endif /* __arm64__ */ + os_atomic_inc(&stackshot_ctx.sc_cpus_working, seq_cst); } } @@ -5845,18 +6045,18 @@ stackshot_cpu_do_work(void) bool high_perf = true; #if defined(__AMP__) - if (current_cpu_datap()->cpu_cluster_type == CLUSTER_TYPE_E) { + if (current_cpu_datap()->cpu_cluster_type != CLUSTER_TYPE_P) { high_perf = false; } #endif /* __AMP__ */ if (high_perf) { - /* Non-E cores: Work from most difficult to least difficult */ + /* High Perf: Work from most difficult to least difficult */ for (size_t i = STACKSHOT_NUM_WORKQUEUES; i > 0; i--) { kcd_exit_on_error(stackshot_cpu_work_on_queue(&stackshot_ctx.sc_workqueues[i - 1])); } } else { - /* E: Work from least difficult to most difficult */ + /* Low Perf: Work from least difficult to most difficult */ for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) { kcd_exit_on_error(stackshot_cpu_work_on_queue(&stackshot_ctx.sc_workqueues[i])); } diff --git a/osfmk/kern/kern_stackshot.h b/osfmk/kern/kern_stackshot.h index 3135874f8..da16adc7d 100644 --- a/osfmk/kern/kern_stackshot.h +++ b/osfmk/kern/kern_stackshot.h @@ -53,6 +53,8 @@ extern boolean_t panic_stackshot_active(void); extern kern_return_t do_panic_stackshot(void *context); extern void * stackshot_alloc_with_size(size_t size, kern_return_t *err); +extern uint64_t kcdata_get_task_ss_flags(task_t task, bool from_stackshot); + /* Allocates an array of elements of a type from the stackshot buffer. Works in regular & panic stackshots. */ #define stackshot_alloc_arr(type, count, err) stackshot_alloc_with_size(sizeof(type) * (count), err) diff --git a/osfmk/kern/kern_types.h b/osfmk/kern/kern_types.h index 52fa2670b..b20919282 100644 --- a/osfmk/kern/kern_types.h +++ b/osfmk/kern/kern_types.h @@ -336,6 +336,20 @@ typedef enum { REASON_PMGR_SYSTEM, } processor_reason_t; +/* + * Internal validation policy for resolving a proc ref from a proc_ident + */ +enum proc_ident_validation_policy { + // Use all identifier metadata to validate the lookup + IDENT_VALIDATION_PROC_EXACT = 0b0000, + // The process may begin to exit, or has exited before the lookup, + // meaning proc_find() may fail. 
+ IDENT_VALIDATION_PROC_MAY_EXIT = 0b0001, + // Use only p_uniqueid for validation, since p_idversion is allowed + // to increment across exec + IDENT_VALIDATION_PROC_MAY_EXEC = 0b0010, +}; +typedef uint8_t proc_ident_validation_policy_t; /* * struct sched_clutch_edge diff --git a/osfmk/kern/ledger.c b/osfmk/kern/ledger.c index 503444255..56056fae0 100644 --- a/osfmk/kern/ledger.c +++ b/osfmk/kern/ledger.c @@ -65,6 +65,7 @@ #define LF_TRACK_CREDIT_ONLY 0x10000 /* only update "credit" */ #define LF_DIAG_WARNED 0x20000 /* callback was called for balance diag */ #define LF_DIAG_DISABLED 0x40000 /* diagnostics threshold are disabled at the moment */ +#define LF_IS_COUNTER 0x80000 /* entry uses a scalable counter */ /* @@ -91,6 +92,18 @@ _Static_assert(sizeof(int) * 8 >= ENTRY_ID_SIZE_SHIFT * 2, "Ledger indices don't /* These features can fit in a small ledger entry. All others require a full size ledger entry */ #define LEDGER_ENTRY_SMALL_FLAGS (LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE | LEDGER_ENTRY_ALLOW_INACTIVE) +/* + * struct ledger_entry_info is available to user space and used in ledger() syscall. + * Changing its size would cause memory corruption. See rdar://132747700 + */ +static_assert(sizeof(struct ledger_entry_info) == (6 * sizeof(int64_t))); +static_assert(sizeof(struct ledger_entry_info_v2) == (11 * sizeof(int64_t))); + +/* + * Make sure ledger_entry_small and ledger_entry_counter are the same size. + */ +static_assert(sizeof(struct ledger_entry_small) == sizeof(struct ledger_entry_counter)); + /* Turn on to debug invalid ledger accesses */ #if MACH_ASSERT #define PANIC_ON_INVALID_LEDGER_ACCESS 1 @@ -214,6 +227,14 @@ struct ledger_template { struct entry_template *lt_entries; /* Lookup table to go from entry_offset to index in the lt_entries table. 
*/ uint16_t *lt_entries_lut; +#if ATOMIC_COUNTER_USE_PERCPU + /* Number of counters in this template */ + uint16_t lt_counters; + /* Offset of the first counter entry, used to free the counters */ + uint16_t lt_counter_offset; + zone_t lt_counter_zone; + char lt_counter_zone_name[32]; +#endif }; static inline uint16_t @@ -328,7 +349,7 @@ ledger_template_create(const char *name) template->lt_table_size, Z_WAITOK | Z_ZERO); if (template->lt_entries == NULL) { kfree_type(struct ledger_template, template); - template = NULL; + return NULL; } template->lt_entries_lut = kalloc_type(uint16_t, ledger_template_entries_lut_size(template->lt_table_size), Z_WAITOK | Z_ZERO); @@ -341,6 +362,27 @@ ledger_template_create(const char *name) return template; } +static void +ledger_template_create_counter_zone(ledger_template_t template) +{ +#if ATOMIC_COUNTER_USE_PERCPU + if (template->lt_counters) { + snprintf( + template->lt_counter_zone_name, + sizeof(template->lt_counter_zone_name), + "%s.c", + template->lt_name); + + template->lt_counter_zone = zone_create( + template->lt_counter_zone_name, + sizeof(uint64_t) * template->lt_counters, + ZC_PERCPU | ZC_ALIGNMENT_REQUIRED | ZC_KASAN_NOREDZONE | ZC_DESTRUCTIBLE); + } +#else /* ATOMIC_COUNTER_USE_PERCPU */ + (void) template; +#endif /* !ATOMIC_COUNTER_USE_PERCPU */ +} + ledger_template_t ledger_template_copy(ledger_template_t template, const char *name) { @@ -388,6 +430,11 @@ ledger_template_copy(ledger_template_t template, const char *name) new_template->lt_cnt = template->lt_cnt; new_template->lt_next_offset = template->lt_next_offset; new_template->lt_entries_lut = new_entries_lut; +#if ATOMIC_COUNTER_USE_PERCPU + new_template->lt_counters = template->lt_counters; + new_template->lt_counter_offset = template->lt_counter_offset; + ledger_template_create_counter_zone(new_template); +#endif out: template_unlock(template); @@ -409,6 +456,11 @@ ledger_template_dereference(ledger_template_t template) if (template->lt_zone) { zdestroy(template->lt_zone); } +#if ATOMIC_COUNTER_USE_PERCPU + if (template->lt_counter_zone) { + zdestroy(template->lt_counter_zone); + } +#endif kfree_type(struct ledger_template, template); } } @@ -506,11 +558,23 @@ ledger_entry_add_with_flags(ledger_template_t template, const char *key, strlcpy(et->et_units, units, LEDGER_NAME_MAX); et->et_flags = LF_ENTRY_ACTIVE; /* - * Currently we only have two types of variable sized entries - * CREDIT_ONLY and full-fledged leger_entry. - * In the future, we can add more gradations based on the flags. + * Currently we have three types of ledger entries: + * - full-fledged ledger entries + * - smaller CREDIT_ONLY entries + * - smaller counter entries */ - if ((flags & ~(LEDGER_ENTRY_SMALL_FLAGS)) == 0) { + if ((flags & LEDGER_ENTRY_USE_COUNTER) != 0) { + /* We cannot use any other flags with scalable counter. 
*/ + assert((flags & (~LEDGER_ENTRY_USE_COUNTER)) == 0); + size = sizeof(struct ledger_entry_counter); + et->et_flags |= LF_IS_COUNTER; +#if ATOMIC_COUNTER_USE_PERCPU + if (template->lt_counters == 0) { + template->lt_counter_offset = (template->lt_next_offset / sizeof(struct ledger_entry_small)); + } + template->lt_counters++; +#endif + } else if ((flags & ~(LEDGER_ENTRY_SMALL_FLAGS)) == 0) { size = sizeof(struct ledger_entry_small); et->et_flags |= LF_TRACK_CREDIT_ONLY; } else { @@ -605,7 +669,8 @@ ledger_template_complete(ledger_template_t template) ledger_size = sizeof(struct ledger) + template->lt_next_offset; assert(ledger_size > sizeof(struct ledger)); template->lt_zone = zone_create(template->lt_name, ledger_size, - ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE); + ZC_DESTRUCTIBLE); + ledger_template_create_counter_zone(template); template->lt_initialized = true; } @@ -626,6 +691,7 @@ ledger_template_complete_secure_alloc(ledger_template_t template) * ledger is large enough. */ pmap_ledger_verify_size(ledger_size); + ledger_template_create_counter_zone(template); template->lt_initialized = true; } @@ -643,6 +709,10 @@ ledger_instantiate(ledger_template_t template, int entry_type) uint16_t entries_size; uint16_t num_entries; uint16_t i; +#if ATOMIC_COUNTER_USE_PERCPU + int counters_inited = 0; + counter_t counters; +#endif template_lock(template); template->lt_refs++; @@ -668,6 +738,12 @@ ledger_instantiate(ledger_template_t template, int entry_type) return LEDGER_NULL; } +#if ATOMIC_COUNTER_USE_PERCPU + if (template->lt_counter_zone) { + counters = zalloc_percpu(template->lt_counter_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL); + } +#endif + ledger->l_template = template; ledger->l_id = ledger_cnt++; os_ref_init(&ledger->l_refs, &ledger_refgrp); @@ -706,6 +782,21 @@ ledger_instantiate(ledger_template_t template, int entry_type) le->le_diag_threshold_scaled = LEDGER_DIAG_MEM_THRESHOLD_INFINITY; le->_le.le_refill.le_refill_period = 0; le->_le.le_refill.le_last_refill = 0; + } else if (et->et_flags & LF_IS_COUNTER) { + struct ledger_entry_counter *lec = (struct ledger_entry_counter *) les; + lec->lec_flags = et->et_flags; +#if ATOMIC_COUNTER_USE_PERCPU + assert(template->lt_counter_zone != NULL); + assert(counters_inited < template->lt_counters); + lec->lec_counter = &counters[counters_inited]; + counters_inited++; +#else /* ATOMIC_COUNTER_USE_PERCPU */ + /* + * When we're using regular (non-percpu) atomic counters, + * this is just a wide store. + */ + counter_alloc(&lec->lec_counter); +#endif /* !ATOMIC_COUNTER_USE_PERCPU */ } else { les->les_flags = et->et_flags; les->les_credit = 0; @@ -741,6 +832,26 @@ ledger_reference(ledger_t ledger) os_ref_retain(&ledger->l_refs); } +#if ATOMIC_COUNTER_USE_PERCPU +static void +ledger_free_counters(ledger_t ledger) +{ + struct ledger_entry_counter *lec; + ledger_template_t template = ledger->l_template; + + if (!template->lt_counter_zone) { + /* Nothing to do */ + assert(!template->lt_counters); + return; + } + + /* We hold the index of the first counter entry which has the pointer to the allocation */ + lec = (struct ledger_entry_counter *) &ledger->l_entries[template->lt_counter_offset]; + assert(lec->lec_flags & LF_IS_COUNTER); + zfree_percpu(template->lt_counter_zone, lec->lec_counter); +} +#endif /* ATOMIC_COUNTER_USE_PERCPU */ + /* * Remove a reference on a ledger. If this is the last reference, * deallocate the unused ledger. 
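/*
 * A minimal usage sketch for the LEDGER_ENTRY_USE_COUNTER entries introduced
 * above (illustrative only, wrapped in #if 0). The template/key/group names
 * are hypothetical; ledger_credit() and LEDGER_CREATE_ACTIVE_ENTRIES are
 * assumed to keep their usual ledger.h signatures and values, while the other
 * calls follow the definitions visible in this file. Counter entries fold all
 * activity into one scalable counter, so limits, refills and
 * ledger_zero_balance() are rejected for them and the reported debit is
 * always zero.
 */
#if 0	/* sketch, not built */
static void
example_counter_entry(void)
{
	ledger_template_t t = ledger_template_create("example");
	/* Counter entries cannot be combined with any other entry flag. */
	int entry = ledger_entry_add_with_flags(t, "bytes", "net", "bytes",
	    LEDGER_ENTRY_USE_COUNTER);
	ledger_template_complete(t);

	ledger_t l = ledger_instantiate(t, LEDGER_CREATE_ACTIVE_ENTRIES);
	ledger_credit(l, entry, 4096);	/* routed to counter_add() */

	ledger_amount_t credit, debit;
	ledger_get_entries(l, entry, &credit, &debit);	/* debit reads as 0 */
	ledger_dereference(l);		/* last ref frees the per-CPU counters */
}
#endif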
@@ -754,6 +865,11 @@ ledger_dereference(ledger_t ledger) if (os_ref_release(&ledger->l_refs) == 0) { ledger_template_t template = ledger->l_template; + +#if ATOMIC_COUNTER_USE_PERCPU + ledger_free_counters(ledger); +#endif /* ATOMIC_COUNTER_USE_PERCPU */ + if (template->lt_zone) { zfree(template->lt_zone, ledger); } else { @@ -873,11 +989,7 @@ entry_get_callback(ledger_t ledger, int entry) static inline void ledger_limit_entry_wakeup(struct ledger_entry *le) { - uint32_t flags; - if (!limit_exceeded(le)) { - flags = flag_clear(&le->le_flags, LF_CALLED_BACK); - while (le->le_flags & LF_WAKE_NEEDED) { flag_clear(&le->le_flags, LF_WAKE_NEEDED); thread_wakeup((event_t)le); @@ -900,7 +1012,7 @@ ledger_refill(uint64_t now, ledger_t ledger, int entry) } if (ENTRY_ID_SIZE(entry) != sizeof(struct ledger_entry)) { - /* Small entries can't do refills */ + /* Small & counter entries can't do refills */ return; } @@ -988,6 +1100,7 @@ ledger_refill(uint64_t now, ledger_t ledger, int entry) lprintf(("Refill %lld %lld->%lld\n", periods, balance, balance - due)); if (!limit_exceeded(le)) { + flag_clear(&le->le_flags, LF_CALLED_BACK); ledger_limit_entry_wakeup(le); } } @@ -1006,6 +1119,9 @@ ledger_entry_check_new_balance(thread_t thread, ledger_t ledger, offset = ENTRY_ID_OFFSET(entry); les = &ledger->l_entries[offset]; if (size == sizeof(struct ledger_entry_small)) { + if (les->les_flags & LF_IS_COUNTER) { + return; /* Nothing to do with a counter */ + } if ((les->les_flags & LF_PANIC_ON_NEGATIVE) && les->les_credit < 0) { panic("ledger_entry_check_new_balance(%p,%d): negative ledger %p credit:%lld debit:0 balance:%lld", ledger, entry, les, @@ -1055,6 +1171,8 @@ ledger_entry_check_new_balance(thread_t thread, ledger_t ledger, act_set_astledger_async(thread); } } else { + flag_clear(&le->le_flags, LF_CALLED_BACK); + /* * The balance on the account is below the limit. * @@ -1147,8 +1265,14 @@ ledger_credit_thread(thread_t thread, ledger_t ledger, int entry, ledger_amount_ if (entry_size == sizeof(struct ledger_entry_small)) { struct ledger_entry_small *les = &ledger->l_entries[ENTRY_ID_OFFSET(entry)]; - old = OSAddAtomic64(amount, &les->les_credit); - new = old + amount; + if (les->les_flags & LF_IS_COUNTER) { + struct ledger_entry_counter *lec = (struct ledger_entry_counter *) les; + counter_add(&lec->lec_counter, amount); + return KERN_SUCCESS; + } else { + old = OSAddAtomic64(amount, &les->les_credit); + new = old + amount; + } } else if (entry_size == sizeof(struct ledger_entry)) { le = ledger_entry_identifier_to_entry(ledger, entry); @@ -1236,7 +1360,14 @@ ledger_rollup_entry(ledger_t to_ledger, ledger_t from_ledger, int entry) OSAddAtomic64(from->le_credit, &to->le_credit); OSAddAtomic64(from->le_debit, &to->le_debit); } else if (entry_size == sizeof(struct ledger_entry_small)) { - OSAddAtomic64(from_les->les_credit, &to_les->les_credit); + if (from_les->les_flags & LF_IS_COUNTER) { + struct ledger_entry_counter *from_lec = (struct ledger_entry_counter *) from_les; + struct ledger_entry_counter *to_lec = (struct ledger_entry_counter *) to_les; + uint64_t from_val = counter_load(&from_lec->lec_counter); + counter_add(&to_lec->lec_counter, from_val); + } else { + OSAddAtomic64(from_les->les_credit, &to_les->les_credit); + } } else { panic("Unknown ledger entry size! 
ledger=%p, entry=0x%x, entry_size=%d\n", from_ledger, entry, entry_size); } @@ -1266,6 +1397,9 @@ ledger_zero_balance(ledger_t ledger, int entry) les = &ledger->l_entries[entry_offset]; if (entry_size == sizeof(struct ledger_entry_small)) { + if (les->les_flags & LF_IS_COUNTER) { + return KERN_INVALID_ARGUMENT; + } while (true) { credit = les->les_credit; if (OSCompareAndSwap64(credit, 0, &les->les_credit)) { @@ -1505,6 +1639,10 @@ ledger_panic_on_negative(ledger_template_t template, int entry) return KERN_INVALID_VALUE; } + if (template->lt_entries[idx].et_flags & LF_IS_COUNTER) { + return KERN_INVALID_ARGUMENT; + } + template->lt_entries[idx].et_flags |= LF_PANIC_ON_NEGATIVE; template_unlock(template); @@ -1819,8 +1957,14 @@ ledger_debit_thread(thread_t thread, ledger_t ledger, int entry, ledger_amount_t if (entry_size == sizeof(struct ledger_entry_small)) { struct ledger_entry_small *les = &ledger->l_entries[ENTRY_ID_OFFSET(entry)]; - old = OSAddAtomic64(-amount, &les->les_credit); - new = old - amount; + if (les->les_flags & LF_IS_COUNTER) { + struct ledger_entry_counter *lec = (struct ledger_entry_counter *) les; + counter_add(&lec->lec_counter, -amount); + return KERN_SUCCESS; + } else { + old = OSAddAtomic64(-amount, &les->les_credit); + new = old - amount; + } } else if (entry_size == sizeof(struct ledger_entry)) { le = ledger_entry_identifier_to_entry(ledger, entry); @@ -2199,7 +2343,12 @@ ledger_get_entries(ledger_t ledger, int entry, ledger_amount_t *credit, *credit = le->le_credit; *debit = le->le_debit; } else if (entry_size == sizeof(struct ledger_entry_small)) { - *credit = les->les_credit; + if (les->les_flags & LF_IS_COUNTER) { + struct ledger_entry_counter *lec = (struct ledger_entry_counter *) les; + *credit = counter_load(&lec->lec_counter); + } else { + *credit = les->les_credit; + } *debit = 0; } else { panic("Unknown ledger entry size! 
ledger=%p, entry=0x%x, entry_size=%d\n", ledger, entry, entry_size); @@ -2326,7 +2475,7 @@ ledger_template_info(void **buf, int *len) } static kern_return_t -ledger_fill_entry_info(ledger_t ledger, +_ledger_fill_entry_info(ledger_t ledger, int entry, struct ledger_entry_info *lei, uint64_t now) @@ -2345,8 +2494,13 @@ ledger_fill_entry_info(ledger_t ledger, les = &ledger->l_entries[entry_offset]; memset(lei, 0, sizeof(*lei)); if (entry_size == sizeof(struct ledger_entry_small)) { + if (les->les_flags & LF_IS_COUNTER) { + struct ledger_entry_counter *lec = (struct ledger_entry_counter *) les; + lei->lei_credit = counter_load(&lec->lec_counter); + } else { + lei->lei_credit = les->les_credit; + } lei->lei_limit = LEDGER_LIMIT_INFINITY; - lei->lei_credit = les->les_credit; lei->lei_debit = 0; lei->lei_refill_period = 0; lei->lei_last_refill = abstime_to_nsecs(now); @@ -2367,12 +2521,40 @@ ledger_fill_entry_info(ledger_t ledger, return KERN_SUCCESS; } -int -ledger_get_task_entry_info_multiple(task_t task, void **buf, int *len) +static kern_return_t +ledger_fill_entry_info(ledger_t ledger, + int entry, + void *lei_generic, + uint64_t now, + bool v2) { - struct ledger_entry_info *lei_buf = NULL, *lei_curr = NULL; + ledger_amount_t max; + kern_return_t kr; + struct ledger_entry_info *lei = (struct ledger_entry_info *)lei_generic; + struct ledger_entry_info_v2 *lei_v2 = (struct ledger_entry_info_v2 *)lei_generic; + + kr = _ledger_fill_entry_info(ledger, entry, lei, now); + if (kr != KERN_SUCCESS) { + return kr; + } + + if (v2) { + lei_v2->lei_lifetime_max = -1; + if (KERN_SUCCESS == ledger_get_lifetime_max(ledger, entry, &max)) { + lei_v2->lei_lifetime_max = max; + } + } + + return KERN_SUCCESS; +} + + +int +ledger_get_task_entry_info_multiple(task_t task, void **buf, int *len, bool v2) +{ + void *lei_buf = NULL, *lei_curr = NULL; uint64_t now = mach_absolute_time(); - vm_size_t size = 0; + vm_size_t buf_size = 0, entry_size = 0; int i; ledger_t l; ledger_template_t template; @@ -2387,8 +2569,9 @@ ledger_get_task_entry_info_multiple(task_t task, void **buf, int *len) if (*len > template->lt_cnt) { *len = template->lt_cnt; } - size = (*len) * sizeof(struct ledger_entry_info); - lei_buf = kalloc_data(size, Z_WAITOK); + entry_size = (v2) ? 
sizeof(struct ledger_entry_info_v2) : sizeof(struct ledger_entry_info); + buf_size = (*len) * entry_size; + lei_buf = kalloc_data(buf_size, Z_WAITOK); if (lei_buf == NULL) { return ENOMEM; } @@ -2397,12 +2580,12 @@ ledger_get_task_entry_info_multiple(task_t task, void **buf, int *len) for (i = 0; i < *len; i++) { et = &template->lt_entries[i]; int index = ledger_entry_id_from_template_entry(et); - if (ledger_fill_entry_info(l, index, lei_curr, now) != KERN_SUCCESS) { - kfree_data(lei_buf, size); + if (ledger_fill_entry_info(l, index, lei_curr, now, v2) != KERN_SUCCESS) { + kfree_data(lei_buf, buf_size); lei_buf = NULL; return EINVAL; } - lei_curr++; + lei_curr = (void *)((mach_vm_address_t)lei_curr + entry_size); } *buf = lei_buf; @@ -2419,7 +2602,7 @@ ledger_get_entry_info(ledger_t ledger, assert(ledger != NULL); assert(lei != NULL); - ledger_fill_entry_info(ledger, entry, lei, now); + _ledger_fill_entry_info(ledger, entry, lei, now); } int diff --git a/osfmk/kern/ledger.h b/osfmk/kern/ledger.h index b01d5f64e..9e8e35fa9 100644 --- a/osfmk/kern/ledger.h +++ b/osfmk/kern/ledger.h @@ -36,14 +36,16 @@ #ifdef MACH_KERNEL_PRIVATE #include +#include #endif /* MACH_KERNEL_PRIVATE */ #define LEDGER_INFO 0 #define LEDGER_ENTRY_INFO 1 #define LEDGER_TEMPLATE_INFO 2 #define LEDGER_LIMIT 3 +#define LEDGER_ENTRY_INFO_V2 4 /* LEDGER_MAX_CMD always tracks the index of the last ledger command. */ -#define LEDGER_MAX_CMD LEDGER_LIMIT +#define LEDGER_MAX_CMD LEDGER_ENTRY_INFO_V2 #define LEDGER_NAME_MAX 32 @@ -102,6 +104,15 @@ struct ledger_entry_small { volatile ledger_amount_t les_credit __attribute__((aligned(8))); } __attribute__((aligned(8))); +/* + * Some ledger entries would benefit from the use of a scalable counter + * and don't care about limits - those entries use this struct. 
+ */ +struct ledger_entry_counter { + volatile uint32_t lec_flags; + counter_t lec_counter __attribute__((aligned(8))); +} __attribute__((aligned(8))); + struct ledger { uint64_t l_id; os_refcnt_t l_refs; @@ -120,6 +131,17 @@ struct ledger_entry_info { uint64_t lei_last_refill; /* Time since last refill */ }; +struct ledger_entry_info_v2 { + int64_t lei_balance; + int64_t lei_credit; + int64_t lei_debit; + uint64_t lei_limit; + uint64_t lei_refill_period; /* In nanoseconds */ + uint64_t lei_last_refill; /* Time since last refill */ + int64_t lei_lifetime_max; /* for phys_footprint/neural_nofootprint_lifetime_max */ + uint64_t lei_reserved[4]; +}; + struct ledger_limit_args { char lla_name[LEDGER_NAME_MAX]; uint64_t lla_limit; @@ -177,6 +199,7 @@ __options_decl(ledger_entry_flags, uint64_t, { LEDGER_ENTRY_ALLOW_LIMIT = 0x10, LEDGER_ENTRY_ALLOW_ACTION = 0x20, LEDGER_ENTRY_ALLOW_INACTIVE = 0x40, + LEDGER_ENTRY_USE_COUNTER = 0x80, }); /* @@ -259,7 +282,7 @@ extern int ledger_limit(task_t task, struct ledger_limit_args *args); extern int ledger_info(task_t task, struct ledger_info *info); extern int -ledger_get_task_entry_info_multiple(task_t task, void **buf, int *len); +ledger_get_task_entry_info_multiple(task_t task, void **buf, int *len, bool v2); extern void ledger_get_entry_info(ledger_t ledger, int entry, diff --git a/osfmk/kern/lock_group.h b/osfmk/kern/lock_group.h index 835da7464..0bcb40c7c 100644 --- a/osfmk/kern/lock_group.h +++ b/osfmk/kern/lock_group.h @@ -88,7 +88,7 @@ extern void lck_grp_free( lck_grp_t *grp); #if XNU_KERNEL_PRIVATE -#pragma GCC visibility push(hidden) +__exported_push_hidden /* * Arguments wrapped in LCK_GRP_ARG() will be elided @@ -254,7 +254,7 @@ lck_opts_get(void) return LcksOpts; } -#pragma GCC visibility pop +__exported_pop #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/lock_mtx.c b/osfmk/kern/lock_mtx.c index 0bdba2edc..bd4e5448c 100644 --- a/osfmk/kern/lock_mtx.c +++ b/osfmk/kern/lock_mtx.c @@ -280,7 +280,7 @@ lck_mtx_free(lck_mtx_t *lck, lck_grp_t *grp) zfree(KT_LCK_MTX, lck); } -void +__mockable void lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr) { if (attr == LCK_ATTR_NULL) { @@ -298,7 +298,7 @@ lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr) lck_grp_reference(grp, &grp->lck_grp_mtxcnt); } -void +__mockable void lck_mtx_destroy(lck_mtx_t *lck, lck_grp_t *grp) { if (lck->lck_mtx_tsid && lck->lck_mtx_type == LCK_TYPE_MUTEX) { @@ -695,6 +695,8 @@ lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state) KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0); + deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed) * processor_avail_count; + /* * Take a spot in the adaptive spin queue, * and then spin until we're at the head of it. 
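/*
 * Note: the deadline computed above, roughly MutexSpin timebase ticks scaled
 * by processor_avail_count, also bounds this wait. The hunk that follows adds
 * (ml_get_timebase() > deadline) to the exit condition, so a waiter gives up
 * spinning for the head of the adaptive-spin queue once the deadline passes
 * instead of spinning unboundedly.
 */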
@@ -726,7 +728,7 @@ lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state) os_atomic_store(&node->lmm_as_next, mcs, release); while (!hw_spin_wait_until(&mcs->lmm_as_prev, prev, - prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT))) { + prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT) || (ml_get_timebase() > deadline))) { hw_spin_should_keep_spinning(lock, pol, to, &ss); } @@ -1047,7 +1049,7 @@ lck_mtx_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode) } } -void +__mockable void lck_mtx_lock(lck_mtx_t *lock) { lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE); @@ -1267,7 +1269,7 @@ lck_mtx_unlock_slow(lck_mtx_t *lock, thread_t thread, uint32_t data) lck_mtx_unlock_contended(lock, thread, data); } -void +__mockable void lck_mtx_unlock(lck_mtx_t *lock) { thread_t thread = current_thread(); diff --git a/osfmk/kern/lock_rw.c b/osfmk/kern/lock_rw.c index 82a03b2ba..887c487d5 100644 --- a/osfmk/kern/lock_rw.c +++ b/osfmk/kern/lock_rw.c @@ -1241,6 +1241,7 @@ lck_rw_lock_exclusive_internal( * * @param lock rw_lock to lock. */ +__mockable void lck_rw_lock_exclusive( lck_rw_t *lock) @@ -1517,6 +1518,7 @@ lck_rw_lock_shared_internal( * * @param lock rw_lock to lock. */ +__mockable void lck_rw_lock_shared( lck_rw_t *lock) @@ -1720,6 +1722,7 @@ lck_rw_lock_shared_to_exclusive_success( * If the function was not able to upgrade the lock, the lock will be dropped * by the function. */ +__mockable boolean_t lck_rw_lock_shared_to_exclusive( lck_rw_t *lock) @@ -1841,6 +1844,7 @@ lck_rw_lock_exclusive_to_shared_gen( * * @param lock rw_lock already held in exclusive mode to downgrade. */ +__mockable void lck_rw_lock_exclusive_to_shared( lck_rw_t *lock) @@ -1908,6 +1912,7 @@ _lck_rw_lock_type_panic( * @param lck rw_lock to lock. * @param lck_rw_type LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE */ +__mockable void lck_rw_lock( lck_rw_t *lck, @@ -2012,6 +2017,7 @@ lck_rw_try_lock_shared_internal( * * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held. */ +__mockable boolean_t lck_rw_try_lock_shared( lck_rw_t *lock) @@ -2090,6 +2096,7 @@ lck_rw_try_lock_exclusive_internal( * * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held. */ +__mockable boolean_t lck_rw_try_lock_exclusive( lck_rw_t *lock) @@ -2132,6 +2139,7 @@ _lck_rw_try_lock_type_panic( * * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held. */ +__mockable boolean_t lck_rw_try_lock( lck_rw_t *lck, @@ -2220,6 +2228,7 @@ lck_rw_done_gen( * * @param lock rw_lock to unlock. */ +__mockable lck_rw_type_t lck_rw_done( lck_rw_t *lock) @@ -2304,6 +2313,7 @@ check_waiters: * * @param lck rw_lock held in shared mode to unlock. */ +__mockable void lck_rw_unlock_shared( lck_rw_t *lck) @@ -2332,6 +2342,7 @@ lck_rw_unlock_shared( * * @param lck rw_lock held in exclusive mode to unlock. */ +__mockable void lck_rw_unlock_exclusive( lck_rw_t *lck) @@ -2362,6 +2373,7 @@ lck_rw_unlock_exclusive( * @param lck rw_lock to unlock. * @param lck_rw_type LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE */ +__mockable void lck_rw_unlock( lck_rw_t *lck, @@ -2393,6 +2405,7 @@ lck_rw_unlock( * @param lck rw_lock to check. 
* @param type assert type */ +__mockable void lck_rw_assert( lck_rw_t *lck, @@ -2513,6 +2526,7 @@ kdp_rwlck_find_owner( * * @returns TRUE if the lock would yield, FALSE otherwise */ +__mockable bool lck_rw_lock_would_yield_shared( lck_rw_t *lck) @@ -2577,6 +2591,7 @@ lck_rw_lock_yield_shared( * * @returns TRUE if the lock would yield, FALSE otherwise */ +__mockable bool lck_rw_lock_would_yield_exclusive( lck_rw_t *lck, diff --git a/osfmk/kern/lock_ticket.c b/osfmk/kern/lock_ticket.c index 1ebca697d..da719b007 100644 --- a/osfmk/kern/lock_ticket.c +++ b/osfmk/kern/lock_ticket.c @@ -72,11 +72,7 @@ extern uint64_t TLockTimeOut; * to execute before checking for timeouts and * issuing a "wait" hypercall. */ -#if __x86_64__ #define DEFAULT_TICKET_LOOPS (LOCK_SNOOP_SPINS) -#else -#define DEFAULT_TICKET_LOOPS (LOCK_SNOOP_SPINS / 8) -#endif uint32_t ticket_lock_spins = DEFAULT_TICKET_LOOPS; #define TICKET_LOCK_SNOOP_LOOPS ticket_lock_spins diff --git a/osfmk/kern/mach_node.c b/osfmk/kern/mach_node.c deleted file mode 100644 index f84adc001..000000000 --- a/osfmk/kern/mach_node.c +++ /dev/null @@ -1,903 +0,0 @@ -/* - * Copyright (c) 2015-2020 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* File: kern/mach_node.h - * Author: Dean Reece - * Date: 2016 - * - * Implementation of mach node support. - * This is the basis for flipc, which provides inter-node communication. - */ - - -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include // mach_msg_send_from_kernel_proper() - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include // OSAddAtomic64(), OSCompareAndSwap() -#include // OSHostByteOrder() - -#pragma pack(4) - -#define MNL_NAME_TABLE_SIZE (256) // Hash is evenly distributed, so ^2 is ok -#define MNL_NAME_HASH(name) (name % MNL_NAME_TABLE_SIZE) - -/*** Visible outside mach_node layer ***/ -mach_node_id_t localnode_id = -1; // This node's FLIPC id. 
-#if MACH_FLIPC -mach_node_t localnode; // This node's mach_node_t struct - - -/*** Private to mach_node layer ***/ -static int mach_nodes_to_publish; -static mach_node_t mach_node_table[MACH_NODES_MAX]; -static LCK_SPIN_DECLARE_ATTR(mach_node_table_lock_data, - &ipc_lck_grp, &ipc_lck_attr); -#define MACH_NODE_TABLE_LOCK() lck_spin_lock(&mach_node_table_lock_data) -#define MACH_NODE_TABLE_UNLOCK() lck_spin_unlock(&mach_node_table_lock_data) - -static volatile SInt64 mnl_name_next; -static queue_head_t mnl_name_table[MNL_NAME_TABLE_SIZE]; -static LCK_SPIN_DECLARE_ATTR(mnl_name_table_lock_data, - &ipc_lck_grp, &ipc_lck_attr); -#define MNL_NAME_TABLE_LOCK() lck_spin_lock(&mnl_name_table_lock_data) -#define MNL_NAME_TABLE_UNLOCK() lck_spin_unlock(&mnl_name_table_lock_data) - -static void mach_node_init(void); -static void mnl_name_table_init(void); -static void mach_node_table_init(void); -static void mach_node_publish(mach_node_t node); - -static mach_node_t mach_node_alloc_init(mach_node_id_t node_id); -static kern_return_t mach_node_register(mach_node_t node); - - -/* mach_node_init() is run lazily when a node link driver registers - * or the node special port is set. - * The variable localnode_id is used to determine if init has already run. - */ -void -mach_node_init(void) -{ - mach_node_id_t node_id = 0; // TODO: Read from device tree? - if (OSCompareAndSwap((UInt32)(HOST_LOCAL_NODE), - (UInt32)node_id, - &localnode_id)) { - printf("mach_node_init(): localnode_id=%d of %d\n", - localnode_id, MACH_NODES_MAX); - mach_node_table_init(); - mnl_name_table_init(); - } // TODO: else block until init is finished (init completion race) -} - -void -mach_node_table_init(void) -{ - MACH_NODE_TABLE_LOCK(); - - /* Start with an enpty node table. */ - bzero(mach_node_table, sizeof(mach_node_t) * MACH_NODES_MAX); - mach_nodes_to_publish = 0; - - /* Allocate localnode's struct */ - localnode = mach_node_for_id_locked(localnode_id, 1, 1); - assert(MACH_NODE_VALID(localnode)); - - MACH_NODE_TABLE_UNLOCK(); - - /* Set up localnode's struct */ - bzero(localnode, sizeof(*localnode)); - localnode->info.datamodel = LOCAL_DATA_MODEL; - localnode->info.byteorder = OSHostByteOrder(); - localnode->info.proto_vers_min = MNL_PROTOCOL_V1; - localnode->info.proto_vers_max = MNL_PROTOCOL_V1; - localnode->proto_vers = MNL_PROTOCOL_V1; - localnode->published = 0; - localnode->active = 1; - - MACH_NODE_UNLOCK(localnode); -} - -/* Sends a publication message to the local node's bootstrap server. - * This function is smart and will only send a notification if one as really - * needed - it can be called speculatively on any node at any time. - * - * Note: MUST be called with the node table lock held. - */ - -void -mach_node_publish(mach_node_t node) -{ - kern_return_t kr; - - if (!MACH_NODE_VALID(node) || (!node->active) || (node->published)) { - return; // node is invalid or not suitable for publication - } - ipc_port_t bs_port = localnode->bootstrap_port; - if (!IP_VALID(bs_port)) { - return; // No bootstrap server to notify! 
- } - /* Node is suitable and server is present, so make registration message */ - struct mach_node_server_register_msg msg; - - msg.node_header.header.msgh_remote_port = bs_port; - msg.node_header.header.msgh_size = sizeof(msg); - msg.node_header.header.msgh_local_port = MACH_PORT_NULL; - msg.node_header.header.msgh_voucher_port = MACH_PORT_NULL; - msg.node_header.header.msgh_id = MACH_NODE_SERVER_MSG_ID; - msg.node_header.node_id = node->info.node_id; - msg.node_header.options = 0; - msg.datamodel = node->info.datamodel; - msg.byteorder = node->info.byteorder; - - if (node == localnode) { - msg.node_header.identifier = MACH_NODE_SM_REG_LOCAL; - msg.node_header.header.msgh_bits = - MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, 0, 0, 0); - } else { - msg.node_header.identifier = MACH_NODE_SM_REG_REMOTE; - msg.node_header.header.msgh_local_port = node->bootstrap_port; - msg.node_header.header.msgh_bits = MACH_MSGH_BITS_SET - (MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND, 0, 0); - } - - kr = mach_msg_send_from_kernel_proper(&msg.node_header.header, - sizeof(msg)); - if (kr == KERN_SUCCESS) { - node->published = 1; - mach_nodes_to_publish--; - } - printf("mach_node_publish(%d)=%d\n", node->info.node_id, kr); -} - -/* Called whenever the node special port changes */ -void -mach_node_port_changed(void) -{ - ipc_port_t bs_port; - - mach_node_init(); // Lazy init of mach_node layer - - /* Cleanup previous bootstrap port if necessary */ - MACH_NODE_LOCK(localnode); - flipc_node_retire(localnode); - bs_port = localnode->bootstrap_port; - if (IP_VALID(bs_port)) { - localnode->bootstrap_port = IP_NULL; - // TODO: destroy send right to outgoing bs_port - } - - kernel_get_special_port(host_priv_self(), HOST_NODE_PORT, &bs_port); - assert(IP_VALID(bs_port)); - localnode->bootstrap_port = bs_port; - flipc_node_prepare(localnode); - MACH_NODE_UNLOCK(localnode); - - /* Cleanup the publication state of all nodes in the table */ - MACH_NODE_TABLE_LOCK(); - // TODO: Signup for bootstrap port death notifications - localnode->active = 1; - - mach_nodes_to_publish = 0; - - int n; - for (n = 0; n < MACH_NODES_MAX; n++) { - mach_node_t np = mach_node_table[n]; - // Publish all active nodes (except the local node) - if (!MACH_NODE_VALID(np)) { - continue; - } - np->published = 0; - if (np->active == 1) { - mach_nodes_to_publish++; - } - } - - mach_node_publish(localnode); // Always publish local node first - - for (n = 0; n < MACH_NODES_MAX; n++) { - mach_node_publish(mach_node_table[n]); - } - - MACH_NODE_TABLE_UNLOCK(); - - // TODO: notify all active nodes we are bootstrapped -} - -/* Allocate/init a mach_node struct and fill in the node_id field. - * This does NOT insert the node struct into the node table. - */ -mach_node_t -mach_node_alloc_init(mach_node_id_t node_id) -{ - mach_node_t node = MACH_NODE_ALLOC(); - if (MACH_NODE_VALID(node)) { - bzero(node, sizeof(struct mach_node)); - MACH_NODE_LOCK_INIT(node); - node->info.node_id = node_id; - } - return node; -} - - -/* This function takes a mach_node struct with a completed info field and - * registers it with the mach_node and flipc (if flipc is enabled) layers. 
- */ -kern_return_t -mach_node_register(mach_node_t node) -{ - assert(MACH_NODE_VALID(node)); - mach_node_id_t nid = node->info.node_id; - assert(MACH_NODE_ID_VALID(nid)); - - kern_return_t kr; - ipc_space_t proxy_space = IS_NULL; - ipc_pset_t pp_set = IPS_NULL; // pset for proxy ports - ipc_port_t bs_port = MACH_PORT_NULL; - ipc_port_t ack_port = MACH_PORT_NULL; - - printf("mach_node_register(%d)\n", nid); - - /* TODO: Support non-native byte order and data models */ - if ((node->info.byteorder != OSHostByteOrder()) || - (node->info.datamodel != LOCAL_DATA_MODEL)) { - printf("mach_node_register: unsupported byte order (%d) or width (%d)", - node->info.byteorder, node->info.datamodel); - return KERN_INVALID_ARGUMENT; - } - - /* Create the space that holds all local rights assigned to */ - kr = ipc_space_create_special(&proxy_space); - if (kr != KERN_SUCCESS) { - goto out; - } - proxy_space->is_node_id = nid; - - /* Create the bootstrap proxy port for this remote node */ - bs_port = ipc_port_alloc_special(proxy_space, IPC_PORT_INIT_MESSAGE_QUEUE); - if (bs_port == MACH_PORT_NULL) { - kr = KERN_RESOURCE_SHORTAGE; - goto out; - } - - /* Create the control (ack) port for this remote node */ - ack_port = ipc_port_alloc_special(proxy_space, IPC_PORT_INIT_MESSAGE_QUEUE); - if (ack_port == MACH_PORT_NULL) { - kr = KERN_RESOURCE_SHORTAGE; - goto out; - } - - /* Create the set that holds all proxy ports for this remote node */ - pp_set = ipc_pset_alloc_special(proxy_space); - if (pp_set == IPS_NULL) { - kr = KERN_RESOURCE_SHORTAGE; - goto out; - } - - waitq_set_lazy_init_link(&pp_set->ips_wqset); - /* Add the bootstrap port to the proxy port set */ - waitq_link_t link = waitq_link_alloc(WQT_PORT_SET); - ip_mq_lock(bs_port); - ips_mq_lock(pp_set); // Revisit the lock when enabling flipc - ipc_mqueue_add_locked(bs_port, pp_set, &link); - ips_mq_unlock(pp_set); - ip_mq_unlock(bs_port); - - /* Add the control port to the proxy port set */ - if (link.wqlh == NULL) { - link = waitq_link_alloc(WQT_PORT_SET); - } - ip_mq_lock(ack_port); - ips_mq_lock(pp_set); // Revisit the lock when enabling flipc - ipc_mqueue_add_locked(ack_port, pp_set, &link); - ips_mq_unlock(pp_set); - ips_mq_unlock(ack_port); - - if (link.wqlh) { - waitq_link_free(WQT_PORT_SET, link); - } - - // Setup mach_node struct - node->published = 0; - node->active = 1; - node->proxy_space = proxy_space; - node->proxy_port_set = pp_set; - node->bootstrap_port = bs_port; - node->proto_vers = node->info.proto_vers_max; - node->control_port = ack_port; - - // Place new mach_node struct into node table - MACH_NODE_TABLE_LOCK(); - - mach_node_t old_node = mach_node_table[nid]; - if (!MACH_NODE_VALID(old_node) || (old_node->dead)) { - node->antecedent = old_node; - flipc_node_prepare(node); - mach_node_table[nid] = node; - mach_nodes_to_publish++; - mach_node_publish(node); - kr = KERN_SUCCESS; - } else { - printf("mach_node_register: id %d already active!", nid); - kr = KERN_FAILURE; - } - MACH_NODE_TABLE_UNLOCK(); - -out: - if (kr != KERN_SUCCESS) { // Dispose of whatever we allocated - if (pp_set) { - ips_mq_lock(pp_set); - ipc_pset_destroy(proxy_space, pp_set); - } - - if (bs_port) { - ipc_port_dealloc_special(bs_port, proxy_space); - } - - if (ack_port) { - ipc_port_dealloc_special(ack_port, proxy_space); - } - - if (proxy_space) { - ipc_space_terminate(proxy_space); - } - } - - return kr; -} - - -/* Gets or allocates a locked mach_node struct for the specified . 
- * The current node is locked and returned if it is not dead, or if it is dead - * and is false. A new node struct is allocated, locked and - * returned if the node is dead and is true, or if the node - * is absent and is true. MACH_NODE_NULL is returned if - * the node is absent and is false. MACH_NODE_NULL is also - * returned if a new node structure was not able to be allocated. - * - * Note: This function must be called with the node table lock held! - */ -mach_node_t -mach_node_for_id_locked(mach_node_id_t node_id, - boolean_t alloc_if_dead, - boolean_t alloc_if_absent) -{ - if ((node_id < 0) || (node_id >= MACH_NODES_MAX)) { - return MACH_NODE_NULL; - } - - mach_node_t node = mach_node_table[node_id]; - - if ((!MACH_NODE_VALID(node) && alloc_if_absent) || - (MACH_NODE_VALID(node) && node->dead && alloc_if_dead)) { - node = mach_node_alloc_init(node_id); - if (MACH_NODE_VALID(node)) { - node->antecedent = mach_node_table[node_id]; - mach_node_table[node_id] = node; - } - } - - if (MACH_NODE_VALID(node)) { - MACH_NODE_LOCK(node); - } - - return node; -} - - - -/*** Mach Node Link Name and Hash Table Implementation ***/ - -/* Allocate a new unique name and return it. - * Dispose of this with mnl_name_free(). - * Returns MNL_NAME_NULL on failure. - */ -mnl_name_t -mnl_name_alloc(void) -{ - return (mnl_name_t)OSAddAtomic64(MACH_NODES_MAX, &mnl_name_next); -} - - -/* Deallocate a unique name that was allocated via mnl_name_alloc(). - */ -void -mnl_name_free(mnl_name_t name __unused) -{ - ; // Nothing to do for now since we don't recycle mnl names. -} - - -/* Called once from mach_node_init(), this sets up the hash table structures. - */ -void -mnl_name_table_init(void) -{ - MNL_NAME_TABLE_LOCK(); - - // Set the first name to this node's bootstrap name - mnl_name_next = localnode_id + MACH_NODES_MAX; - - for (int i = 0; i < MNL_NAME_TABLE_SIZE; i++) { - queue_head_init(mnl_name_table[i]); - } - - MNL_NAME_TABLE_UNLOCK(); -} - - -/* Initialize the data structures in the mnl_obj structure at the head of the - * provided object. This should be called on an object before it is passed to - * any other mnl_obj* routine. - */ -void -mnl_obj_init(mnl_obj_t obj) -{ - queue_chain_init(obj->links); - obj->name = MNL_NAME_NULL; -} - - -/* Search the local node's hash table for the object associated with a - * mnl_name_t and return it. Returns MNL_NAME_NULL on failure. - */ -mnl_obj_t -mnl_obj_lookup(mnl_name_t name) -{ - mnl_obj_t obj = MNL_OBJ_NULL; - - if (name != MNL_NAME_NULL) { - qe_foreach_element(obj, &mnl_name_table[MNL_NAME_HASH(name)], links) { - if (obj->name == name) { - break; - } - } - } - return obj; -} - - -/* Search the local node's hash table for the object associated with a - * mnl_name_t and remove it. The pointer to the removed object is returned so - * that the caller can appropriately dispose of the object. - * Returns MNL_NAME_NULL on failure. - */ -mnl_obj_t -mnl_obj_remove(mnl_name_t name) -{ - mnl_obj_t obj = MNL_OBJ_NULL; - - if (name != MNL_NAME_NULL) { - qe_foreach_element_safe(obj, &mnl_name_table[MNL_NAME_HASH(name)], links) { - if (obj->name == name) { - remqueue(&obj->links); - } - } - } - return obj; -} - - -/* Insert an object into the local node's hash table. If the name of the - * provided object is MNL_NAME_NULL then a new mnl_name is allocated and - * assigned to the object. 
- * Returns KERN_SUCCESS if obj was added to hash table - * Returns KERN_INVALID_ARGUMENT if obj is invalid - * Returns KERN_NAME_EXISTS if obj's name already exists in hash table - */ -kern_return_t -mnl_obj_insert(mnl_obj_t obj) -{ - if (!MNL_OBJ_VALID(obj)) { - return KERN_INVALID_ARGUMENT; - } - - MNL_NAME_TABLE_LOCK(); - - if (!MNL_NAME_VALID(obj->name)) { - // obj is unnammed, so lets allocate a fresh one - obj->name = mnl_name_alloc(); - } - - enqueue(&mnl_name_table[MNL_NAME_HASH(obj->name)], &obj->links); - MNL_NAME_TABLE_UNLOCK(); - - if (obj->name >= (MACH_NODES_MAX << 1)) { - panic("Unexpected MNL_NAME %lld in obj %p", obj->name, obj); - } - - return KERN_SUCCESS; -} - - -/*** Mach Node Link Driver Interface Implementation ***/ - -/* Allocate a mnl_msg struct plus additional payload. Link drivers are not - * required to use this to allocate messages; any wired and mapped kernel - * memory is acceptable. - * - * Arguments: - * payload Number of additional bytes to allocate for message payload - * flags Currently unused; 0 should be passed - * - * Return values: - * MNL_MSG_NULL: Allocation failed - * *: Pointer to new mnl_msg struct of requested size - */ -mnl_msg_t -mnl_msg_alloc(int payload, - uint32_t flags __unused) -{ - mnl_msg_t msg = kalloc(MNL_MSG_SIZE + payload); - - if (MNL_MSG_VALID(msg)) { - bzero(msg, MNL_MSG_SIZE); // Only zero the header - msg->size = payload; - } - - return msg; -} - - -/* Free a mnl_msg struct allocated by mnl_msg_alloc(). - * - * Arguments: - * msg Pointer to the message buffer to be freed - * flags Currently unused; 0 should be passed - */ -void -mnl_msg_free(mnl_msg_t msg, - uint32_t flags __unused) -{ - if (MNL_MSG_VALID(msg)) { - kfree(msg, MNL_MSG_SIZE + msg->size); - } -} - - -/* The link driver calls this to setup a new (or restarted) node, and to get - * an mnl_node_info struct for use as a parameter to other mnl functions. - * If MNL_NODE_NULL is returned, the operation failed. Otherwise, a pointer - * to a new mnl_node struct is returned. The caller should set all fields - * in the structure, then call mnl_register() to complete node registration. - * - * Arguments: - * nid The id of the node to be instantiated - * flags Currently unused; 0 should be passed - * - * Return values: - * MNL_NODE_NULL: Operation failed - * *: Pointer to a new mnl_node struct - */ -mnl_node_info_t -mnl_instantiate(mach_node_id_t nid, - uint32_t flags __unused) -{ - mach_node_init(); // Lazy init of mach_node layer - - if ((nid == localnode_id) || !MACH_NODE_ID_VALID(nid)) { - return MNL_NODE_NULL; - } - - return (mnl_node_info_t)mach_node_alloc_init(nid); -} - -/* The link driver calls mnl_register() to complete the node registration - * process. KERN_SUCCESS is returned if registration succeeded, otherwise - * an error is returned. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * flags Currently unused; 0 should be passed - * - * Return values: - * KERN_SUCCESS: Registration succeeded - * KERN_INVALID_ARGUMENT: Field(s) in contained unacceptable values - * KERN_*: Values returned from underlying functions - */ -kern_return_t -mnl_register(mnl_node_info_t node, - uint32_t flags __unused) -{ - if (MNL_NODE_VALID(node) && (node->node_id != localnode_id)) { - return mach_node_register((mach_node_t)node); - } - - return KERN_INVALID_ARGUMENT; -} - - -/* The link driver calls this to report that the link has been raised in one - * or both directions. 
If the link is two uni-directional channels, each link - * driver will independently call this function, each only raising the link - * they are responsible for. The mach_node layer will not communicate with - * the remote node until both rx and tx links are up. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * link Indicates which link(s) are up (see MNL_LINK_* defines) - * flags Currently unused; 0 should be passed - * - * Return values: - * KERN_SUCCESS: Link state changed successfully. - * KERN_INVALID_ARGUMENT: An argument value was not allowed. - * KERN_*: Values returned from underlying functions. - */ -kern_return_t -mnl_set_link_state(mnl_node_info_t node, - int link, - uint32_t flags __unused) -{ - kern_return_t kr; - mach_node_t mnode = (mach_node_t)node; - - if (!MACH_NODE_VALID(mnode) || !(link & MNL_LINK_UP) || (link & mnode->link)) { - return KERN_INVALID_ARGUMENT; // bad node, or bad link argument - } - MACH_NODE_LOCK(mnode); - - if (mnode->dead) { - kr = KERN_NODE_DOWN; - } else { - mnode->link |= link; - kr = KERN_SUCCESS; - } - - MACH_NODE_UNLOCK(mnode); - - return kr; -} - -/* The link driver calls this to indicate a node has terminated and is no - * longer available for messaging. This may be due to a crash or an orderly - * shutdown, but either way the remote node no longer retains any state about - * the remaining nodes. References held on behalf of the terminated node - * will be cleaned up. After this is called, both the rx and tx links are - * marked as down. If the remote node restarts, the link driver can bring - * up the link using mnl_instantiate() again. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * flags Currently unused; 0 should be passed - * - * Return values: - * KERN_SUCCESS: Node was terminated. - * KERN_INVALID_ARGUMENT: Node id was invalid or non-existant. - * KERN_*: Values returned from underlying functions. - */ -kern_return_t -mnl_terminate(mnl_node_info_t node, - uint32_t flags __unused) -{ - kern_return_t kr = KERN_SUCCESS; - mach_node_t mnode = (mach_node_t)node; - - if (!MACH_NODE_VALID(mnode)) { - return KERN_INVALID_ARGUMENT; // bad node - } - MACH_NODE_LOCK(mnode); - if (mnode->dead) { - kr = KERN_NODE_DOWN; // node is already terminated - goto unlock; - } - - mnode->link = MNL_LINK_DOWN; - mnode->active = 0; - mnode->suspended = 0; - mnode->dead = 1; - - flipc_node_retire(mnode); - - // Wake any threads sleeping on the proxy port set - if (mnode->proxy_port_set != IPS_NULL) { - ips_mq_lock(mnode->proxy_port_set); - ipc_pset_destroy(mnode->proxy_space, mnode->proxy_port_set); - mnode->proxy_port_set = IPS_NULL; - } - - // TODO: Inform node name server (if registered) of termination - -unlock: - MACH_NODE_UNLOCK(mnode); - return kr; -} - - -/* The link driver calls this to deliver an incoming message. Note that the - * link driver must dispose of the memory pointed to by after the - * function call returns. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * msg Pointer to the message buffer - * flags Currently unused; 0 should be passed - */ -void -mnl_msg_from_node(mnl_node_info_t node __unused, - mnl_msg_t msg, - uint32_t flags __unused) -{ - assert(MNL_MSG_VALID(msg)); - assert(MACH_NODE_ID_VALID(msg->node_id)); - assert(MNL_NODE_VALID(node)); - - /* If node message forwarding is supported, the from_node_id arg may not - * match fmsg->info.node_id. 
The former is the node from which we received - * the message; the latter is the node that generated the message originally. - * We always use fmsg->info.node_id, which is where the ack needs to go. - */ - - switch (msg->sub) { - case MACH_NODE_SUB_FLIPC: - flipc_msg_from_node((mach_node_t)node, msg, flags); - break; - - default: -#if DEBUG - PE_enter_debugger("mnl_msg_from_node(): Invalid subsystem"); -#endif - break; - } -} - - -/* The link driver calls this to fetch the next message to transmit. - * This function will block until a message is available, or will return - * FLIPC_MSG_NULL if the link is to be terminated. After the caller has - * completed the transmission and no longer needs the msg buffer, it should - * call mnl_msg_complete(). - * - * Arguments: - * node Pointer to the node's mnl_node structure - * flags Currently unused; 0 should be passed - */ -mnl_msg_t -mnl_msg_to_node(mnl_node_info_t node __unused, - uint32_t flags __unused) -{ - assert(MNL_NODE_VALID(node)); - -#if DEBUG - thread_set_thread_name(current_thread(), "MNL_Link"); -#endif - - return flipc_msg_to_remote_node((mach_node_t)node, 0); -} - - -/* The link driver calls this to indicate that the specified msg buffer has - * been sent over the link and can be deallocated. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * msg Pointer to the message buffer - * flags Currently unused; 0 should be passed - */ -void -mnl_msg_complete(mnl_node_info_t node __unused, - mnl_msg_t msg, - uint32_t flags) -{ - switch (msg->sub) { - case MACH_NODE_SUB_NODE: - mnl_msg_free(msg, flags); - break; - - case MACH_NODE_SUB_FLIPC: - flipc_msg_free(msg, flags); - break; - - default: -#if DEBUG - PE_enter_debugger("mnl_msg_complete(): Invalid subsystem"); -#endif - break; - } -} - -#else // MACH_FLIPC not configured, so provide KPI stubs - -mnl_msg_t -mnl_msg_alloc(int payload __unused, uint32_t flags __unused) -{ - return MNL_MSG_NULL; -} - -void -mnl_msg_free(mnl_msg_t msg __unused, uint32_t flags __unused) -{ - return; -} - -mnl_node_info_t -mnl_instantiate(mach_node_id_t nid __unused, uint32_t flags __unused) -{ - return MNL_NODE_NULL; -} - -kern_return_t -mnl_register(mnl_node_info_t node __unused, uint32_t flags __unused) -{ - return KERN_FAILURE; -} - -kern_return_t -mnl_set_link_state(mnl_node_info_t node __unused, - int link __unused, - uint32_t flags __unused) -{ - return KERN_FAILURE; -} - -kern_return_t -mnl_terminate(mnl_node_info_t node __unused, uint32_t flags __unused) -{ - return KERN_FAILURE; -} - -void -mnl_msg_from_node(mnl_node_info_t node __unused, - mnl_msg_t msg __unused, - uint32_t flags __unused) -{ - return; -} - -mnl_msg_t -mnl_msg_to_node(mnl_node_info_t node __unused, uint32_t flags __unused) -{ - return MNL_MSG_NULL; -} - -void -mnl_msg_complete(mnl_node_info_t node __unused, - mnl_msg_t msg __unused, - uint32_t flags __unused) -{ - return; -} - -#endif // MACH_FLIPC diff --git a/osfmk/kern/mach_node.h b/osfmk/kern/mach_node.h deleted file mode 100644 index 1dd7adb05..000000000 --- a/osfmk/kern/mach_node.h +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright (c) 2015-2016 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * File: kern/mach_node.h - * Author: Dean Reece - * Date: 2016 - * - * Definitions for mach internode communication (used by flipc). - * This header is intended for use inside the kernel only. - */ - -#ifndef _KERN_MACH_NODE_H_ -#define _KERN_MACH_NODE_H_ - -#if defined(MACH_KERNEL_PRIVATE) || defined(__APPLE_API_PRIVATE) - -/*** Mach Node Name Server Section - * Definitions shared by the mach_node layer in the kernel and the - * node's bootstrap server (noded). - */ - -/* This structure describes messages sent from the mach_node layer to the - * node bootstrap server. - */ -#pragma pack(4) -typedef struct mach_node_server_msg { - mach_msg_header_t header; - uint32_t identifier; // See FLIPC_SM_* defines - uint32_t options; // Currently unused - uint32_t node_id; // Node number -} *mach_node_server_msg_t; -#pragma pack() - -/* This structure describes node registration messages sent from the mach_node - * layer to the node bootstrap server. - */ -typedef struct mach_node_server_register_msg { - struct mach_node_server_msg node_header; - uint8_t datamodel; // 1==ILP32, 2==LP64; matches dtrace - uint8_t byteorder; // Uses defines from libkern/OSByteOrder.h -} *mach_node_server_register_msg_t; -#pragma pack() - -#define MACH_NODE_SERVER_MSG_ID (0x45444f4eUL) // msgh_id "NODE" for Node msgs -#define MACH_NODE_SM_REG_LOCAL (0UL) // Register the local node -#define MACH_NODE_SM_REG_REMOTE (1UL) // Register a remote node - -#if defined(__LP64__) -#define LOCAL_DATA_MODEL (2) // Native data model is LP64 -#else -#define LOCAL_DATA_MODEL (1) // Native data model is ILP32 -#endif - -#endif - - -#if MACH_FLIPC && defined(MACH_KERNEL_PRIVATE) - -#include -#include - -#include - -__BEGIN_DECLS - -#define MACH_NODES_MAX (2) // Must be a power-of-2 -#define MACH_NODE_ID_VALID(nid) (((nid) >= 0) && ((nid) < MACH_NODES_MAX)) - -typedef struct flipc_node *flipc_node_t; // Defined in ipc/flipc.h - - -/*** Mach Node Section - * - * An instance of mach_node is allocated for each node known to mach. - * In-kernel interfaces use a pointer to this structure to refer to a node. - * External interfaces and protocols refer to node by id (mach_node_id_t). 
- */ -typedef struct mach_node *mach_node_t; - -struct mach_node { - /* Static node details, provided by the link driver at registration */ - struct mnl_node_info info; - - lck_spin_t node_lock_data; - - /* Flags and status word */ - uint32_t link:2; // See MNL_LINK* defines - uint32_t published:1;// True if node server has send-right - uint32_t active:1; // True if node is up and ready - uint32_t suspended:1;// True if node is active but sleeping - uint32_t dead:1; // True if node is dead - uint32_t _reserved:26;// Fill out the 32b flags field - - /* port/space/set */ - ipc_space_t proxy_space;// Kernel special space for proxy rights - ipc_pset_t proxy_port_set;// All proxy ports are in this set - ipc_port_t bootstrap_port;// Port for which "noded" holds rcv right - ipc_port_t control_port;// For control & ack/nak messages - - /* Misc */ - int proto_vers; // Protocol version in use for this node - mach_node_t antecedent; // Pointer to prior encarnation of this node id -}; - -extern mach_node_t localnode; // This node's mach_node_t struct - -#define MACH_NODE_NULL ((mach_node_t) 0UL) -#define MACH_NODE_SIZE ((vm_offset_t)sizeof(struct mach_node)) -#define MACH_NODE_VALID(node) ((node) != MACH_NODE_NULL) -#define MACH_NODE_ALLOC() ((mach_node_t)kalloc(MACH_NODE_SIZE)) -#define MACH_NODE_FREE(node) kfree(node, MACH_NODE_SIZE) - -#define MACH_NODE_LOCK_INIT(np) lck_spin_init(&(np)->node_lock_data, \ - &ipc_lck_grp, &ipc_lck_attr) -#define MACH_NODE_LOCK_DESTROY(np) lck_spin_destroy(&(np)->node_lock_data, \ - &ipc_lck_grp) -#define MACH_NODE_LOCK(np) lck_spin_lock(&(np)->node_lock_data) -#define MACH_NODE_UNLOCK(np) lck_spin_unlock(&(np)->node_lock_data) - -/* Gets or allocates a locked mach_node struct for the specified . - * The current node is locked and returned if it is not dead, or if it is dead - * and is false. A new node struct is allocated, locked and - * returned if the node is dead and is true, or if the node - * is absent and is true. MACH_NODE_NULL is returned if - * the node is absent and is false. MACH_NODE_NULL is also - * returned if a new node structure was not able to be allocated. - */ -mach_node_t -mach_node_for_id_locked(mach_node_id_t node_id, - boolean_t alloc_if_dead, - boolean_t alloc_if_absent); - - -/*** Mach Node Link Name Section - * - * A node link name (mnl_name_t) is an oqaque value guaranteed unique across - * kernel instances on all nodes. This guarantee requires that node ids not - * be recycled. - * - * Names 0..(MACH_NODES_MAX-1) represent null (invalid) names - * Names MACH_NODES_MAX..(MACH_NODES_MAX*2-1) represent bootstrap names - * Names >=(MACH_NODES_MAX*2) represent normal names. - */ - -/* Allocate a new unique name and return it. - * Dispose of this with mnl_name_free(). - * Returns MNL_NAME_NULL on failure. - */ -extern mnl_name_t mnl_name_alloc(void); - -/* Deallocate a unique name that was allocated via mnl_name_alloc(). - */ -extern void mnl_name_free(mnl_name_t name); - -/* This macro is used to convert a node id to a bootstrap port name. - */ -#define MNL_NAME_BOOTSTRAP(nid) ((mnl_name_t) MACH_NODES_MAX | (nid)) -#define MNL_NAME_NULL ((mnl_name_t) 0UL) -#define MNL_NAME_VALID(obj) ((obj) >= MACH_NODES_MAX) - - -/* The mnl hash table may optionally be used by clients to associate mnl_names - * with objects. Objects to be stored in the hash table must start with an - * instance of struct mnk_obj. It is up to clients of the hash table to - * allocate and free the actual objects being stored. 
- */ -typedef struct mnl_obj { - queue_chain_t links;// List of mnk_name_obj (See kern/queue.h "Method 1") - mnl_name_t name;// Unique mnl_name -} *mnl_obj_t; - -#define MNL_OBJ_NULL ((mnl_obj_t) 0UL) -#define MNL_OBJ_VALID(obj) ((obj) != MNL_OBJ_NULL) - - -/* Initialize the data structures in the mnl_obj structure at the head of the - * provided object. This should be called on an object before it is passed to - * any other mnl_obj* routine. - */ -void mnl_obj_init(mnl_obj_t obj); - -/* Search the local node's hash table for the object associated with a - * mnl_name_t and return it. Returns MNL_NAME_NULL on failure. - */ -mnl_obj_t mnl_obj_lookup(mnl_name_t name); - -/* Search the local node's hash table for the object associated with a - * mnl_name_t and remove it. The pointer to the removed object is returned so - * that the caller can appropriately dispose of the object. - * Returns MNL_NAME_NULL on failure. - */ -mnl_obj_t mnl_obj_remove(mnl_name_t name); - -/* Insert an object into the locak node's hash table. If the name of the - * provided object is MNL_NAME_NULL then a new mnl_name is allocated and - * assigned to the object. Returns KERN_SUCCESS, or KERN_NAME_EXISTS if - * an object associated with that name is already in the hash table. - */ -kern_return_t mnl_obj_insert(mnl_obj_t obj); - - -/*** Mach Node Link Message Section *** - * - * Struct mnl_msg is only the header for a mnl_msg buffer; - * the actual buffer is normally larger. The rest of the buffer - * holds the body of the message to be transmitted over the link. - * - * Note: A mnl_msg received over a link will be in the byte-order of the - * node that send it. fname and size must be corrected to the hosts' native - * byte order by the link driver before it is sent up to the flipc layer. - * However, the link driver should not attempt to adjust the data model or - * byte order of the payload that follows the mnl_msg header - that will - * be done by the flipc layer. - */ - - -/* Values for mnl_msg.sub - */ -#define MACH_NODE_SUB_INVALID (0) // Never sent -#define MACH_NODE_SUB_NODE (1) // MNL msg is for node management -#define MACH_NODE_SUB_FLIPC (2) // MNL msg is for FLIPC subsystem -#define MACH_NODE_SUB_VMSYS (3) // MNL msg is for VM subsystem - - -/* Called whenever the node special port changes - */ -void mach_node_port_changed(void); - - -__END_DECLS - -#endif // MACH_FLIPC && MACH_KERNEL_PRIVATE -#endif // _KERN_MACH_NODE_H_ diff --git a/osfmk/kern/mach_node_link.h b/osfmk/kern/mach_node_link.h deleted file mode 100644 index a848987db..000000000 --- a/osfmk/kern/mach_node_link.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2015-2016 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * File: kern/mach_node_link.h - * Author: Dean Reece - * Date: 2016 - * - * This header provides definitions required by Mach Node Link (MNL) drivers. - * MNL drivers pass messages between nodes within a host. - * - * The constructs available at the node link level are very basic: - * Node IDs (mach_node_id_t) uniquely identify nodes within a host. - * MNL Info (mnl_node_info) describe the static characteristics of a node. - * MNL Names (mnl_name_t) uniquely identify abjects across all nodes. - * MNL Messages (mnl_msg) are passed between nodes (kernels) within a host. - */ - -#ifndef _KERN_MACH_NODE_LINK_H_ -#define _KERN_MACH_NODE_LINK_H_ - -#if KERNEL_PRIVATE - -#include - -__BEGIN_DECLS - - -/*** Node Info Section ***/ - -typedef int mach_node_id_t; // Used to uniquely identify a node -extern mach_node_id_t localnode_id; // This node's unique id. - -/* An mnl_node struct describes static characteristcs of a node. The link - * driver requests this structure from the mach_node layer and fills out - * the fields. All fields must be filled in (non-zero) before both rx and tx - * links are brought up. - */ -typedef struct mnl_node_info { - mach_node_id_t node_id; // The node ID of this node - uint8_t datamodel; // 1==ILP32, 2==LP64 (matches dtrace) - uint8_t byteorder; // See libkern/OSByteOrder.h - uint32_t proto_vers_min;// Oldest MNL protocol vers node can accept - uint32_t proto_vers_max;// Newest MNL protocol vers node can accept -} __attribute__ ((aligned(8))) * mnl_node_info_t; - -#define MNL_NODE_NULL ((mnl_node_info_t) 0UL) -#define MNL_NODE_VALID(n) ((n) != MNL_NODE_NULL) -#define MNL_PROTOCOL_V1 (1UL) // Current Node Link Protocol Version - -/*** Mach Node Link Name Section - * - * A node link name (mnl_name_t) is an oqaque value guaranteed unique across - * kernel instances on all nodes. - */ -typedef uint64_t mnl_name_t; - -/*** Mach Node Link Message Section ***/ - -/* This structure is the header for an MNL Message buffer; the actual buffer - * is normally larger, and holds this header followed by the body of the - * message to be transmitted over the link. - * - * Note: The and fields are in host-native byte order when - * passed to mnl_msg_from_node() and from mnl_msg_to_node(). - * The byte order of these fields as sent over the link is left to the link - * specification. The link drivers on both sides must translate these fields - * between the link's byte order and host-native byte order. - * - * The body of the message, however, is treated as a byte-stream and passed - * to/from the mach_node layer without any introspection or byte reordering. 
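Concretely, the packed header defined just below is 24 bytes and the payload begins immediately after it, which is what the MNL_MSG_SIZE and MNL_MSG_PAYLOAD macros encode. Here is a standalone userspace model of that layout, with malloc/calloc standing in for mnl_msg_alloc(); it is a sketch for illustration, not kernel code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint64_t mnl_name_t;

/* Header copied from mach_node_link.h below. */
typedef struct mnl_msg {
        uint8_t    sub;      /* subsystem code */
        uint8_t    cmd;      /* command code */
        uint8_t    qos;
        uint8_t    flags;
        uint32_t   node_id;  /* originating node */
        mnl_name_t object;   /* object ref, meaning depends on sub/cmd */
        uint32_t   options;
        uint32_t   size;     /* bytes of payload following the header */
} __attribute__((__packed__)) *mnl_msg_t;

#define MNL_MSG_SIZE         ((size_t)sizeof(struct mnl_msg))
#define MNL_MSG_PAYLOAD(msg) ((uint8_t *)(msg) + MNL_MSG_SIZE)

/* Userspace stand-in for mnl_msg_alloc(): header plus payload bytes. */
static mnl_msg_t
msg_alloc(uint32_t payload)
{
        mnl_msg_t msg = calloc(1, MNL_MSG_SIZE + payload);
        if (msg != NULL) {
                msg->size = payload;
        }
        return msg;
}

int
main(void)
{
        /* 4x u8 + u32 + u64 + 2x u32 == 24 bytes when packed. */
        assert(MNL_MSG_SIZE == 24);

        mnl_msg_t msg = msg_alloc(16);
        assert(msg != NULL);
        memcpy(MNL_MSG_PAYLOAD(msg), "hello, flipc", 13);

        printf("header %zu bytes, payload %u bytes at offset %zu\n",
            MNL_MSG_SIZE, msg->size,
            (size_t)(MNL_MSG_PAYLOAD(msg) - (uint8_t *)msg));
        free(msg);
        return 0;
}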
- */ -typedef struct mnl_msg { - uint8_t sub; // 8b subsystem code - uint8_t cmd; // 8b command code - uint8_t qos; // 8b TODO: Doesn't do anything yet - uint8_t flags; // 8b Command-specific flag byte - uint32_t node_id;// 32b id of node that originated message - mnl_name_t object; // 64b object ref (use is determined by sub & cmd) - uint32_t options;// 32b Currently unused - uint32_t size; // 32b Number of bytes that follow mnl_msg header -} __attribute__((__packed__)) * mnl_msg_t; - - -/* Allocate a mnl_msg struct plus additional payload. Link drivers are not - * required to use this to allocate messages; any wired and mapped kernel - * memory is acceptable. - * - * Arguments: - * payload Number of additional bytes to allocate for message payload - * flags Currently unused; 0 should be passed - * - * Return values: - * MNL_MSG_NULL: Allocation failed - * *: Pointer to new mnl_msg struct of requested size - */ -mnl_msg_t mnl_msg_alloc(int payload, uint32_t flags); - - -/* Free a mnl_msg struct allocated by mnl_msg_alloc(). - * - * Arguments: - * msg Pointer to the message buffer to be freed - * flags Currently unused; 0 should be passed - */ -void mnl_msg_free(mnl_msg_t msg, uint32_t flags); - -#define MNL_MSG_NULL ((mnl_msg_t) 0UL) -#define MNL_MSG_VALID(msg) ((msg) != MNL_MSG_NULL) -#define MNL_MSG_SIZE ((vm_offset_t)sizeof(struct mnl_msg)) -#define MNL_MSG_PAYLOAD(msg) ((vm_offset_t)(msg) + MNL_MSG_SIZE) - - -/*** Mach Node Link Driver Interface Section ***/ - -/* The link driver calls this to setup a new (or restarted) node, and to get - * an mnl_node_info struct for use as a parameter to other mnl functions. - * If MNL_NODE_NULL is returned, the operation failed. Otherwise, a pointer - * to a new mnl_node struct is returned. The caller should set all fields - * in the structure, then call mnl_register() to complete node registration. - * - * Arguments: - * nid The id of the node to be instantiated - * flags Currently unused; 0 should be passed - * - * Return values: - * MNL_NODE_NULL: Operation failed - * *: Pointer to a new mnl_node struct - */ -mnl_node_info_t mnl_instantiate(mach_node_id_t nid, - uint32_t flags); - - -/* The link driver calls mnl_register() to complete the node registration - * process. KERN_SUCCESS is returned if registration succeeded, otherwise - * an error is returned. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * flags Currently unused; 0 should be passed - * - * Return values: - * KERN_SUCCESS: Registration succeeded - * KERN_INVALID_ARGUMENT: Field(s) in contained unacceptable values - * KERN_*: Values returned from underlying functions - */ -kern_return_t mnl_register(mnl_node_info_t node, - uint32_t flags); - - -/* The link driver calls this to report that the link has been raised in one - * or both directions. If the link is two uni-directional channels, each link - * driver will independently call this function, each only raising the link - * they are responsible for. The mach_node layer will not communicate with - * the remote node until both rx and tx links are up. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * link Indicates which link(s) are up (see MNL_LINK_* defines) - * flags Currently unused; 0 should be passed - * - * Return values: - * KERN_SUCCESS: Link state changed successfully. - * KERN_INVALID_ARGUMENT: An argument value was not allowed. - * KERN_*: Values returned from underlying functions. 
- */ -kern_return_t mnl_set_link_state(mnl_node_info_t node, - int link, - uint32_t flags); - -#define MNL_LINK_DOWN (0UL) -#define MNL_LINK_RX (1UL) -#define MNL_LINK_TX (2UL) -#define MNL_LINK_UP (MNL_LINK_RX|MNL_LINK_TX) - - -/* The link driver calls this to indicate a node has terminated and is no - * longer available for messaging. This may be due to a crash or an orderly - * shutdown, but either way the remote node no longer retains any state about - * the remaining nodes. References held on behalf of the terminated node - * will be cleaned up. After this is called, both the rx and tx links are - * marked as down. If the remote node restarts, the link driver can bring - * up the link using mnl_instantiate() again. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * flags Currently unused; 0 should be passed - * - * Return values: - * KERN_SUCCESS: Node was terminated. - * KERN_INVALID_ARGUMENT: Node id was invalid or non-existant. - * KERN_*: Values returned from underlying functions. - */ -kern_return_t mnl_terminate(mnl_node_info_t node, - uint32_t flags); - - -/* The link driver calls this to deliver an incoming message. Note that the - * link driver must dispose of the memory pointed to by after the - * function call returns. - * - * Arguments: - * node Pointer to the node's mnl_node structure - * msg Pointer to the message buffer - * flags Currently unused; 0 should be passed - */ -void mnl_msg_from_node(mnl_node_info_t node, - mnl_msg_t msg, - uint32_t flags); - - -/* The link driver calls this to fetch the next message to transmit. - * This function will block until a message is available, or will return - * FLIPC_MSG_NULL if the link is to be terminated. After the caller has - * completed the transmission and no longer needs the msg buffer, it should - * call mnl_msg_complete(). - * - * Arguments: - * node Pointer to the node's mnl_node structure - * flags Currently unused; 0 should be passed - */ -mnl_msg_t mnl_msg_to_node(mnl_node_info_t node, - uint32_t flags); - - -/* The link driver calls this to indicate that the specified msg buffer has - * been sent over the link and can be deallocated. 
- * - * Arguments: - * node Pointer to the node's mnl_node structure - * msg Pointer to the message buffer - * flags Currently unused; 0 should be passed - */ -void mnl_msg_complete(mnl_node_info_t node, - mnl_msg_t msg, - uint32_t flags); - -__END_DECLS - -#endif /* KERNEL_PRIVATE */ -#endif /* _KERN_MACH_NODE_LINK_H_ */ diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index 7b90c88c3..6192ac61c 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -88,7 +88,9 @@ #include #include #include +#include #include +#include #include #if ML_IO_TIMEOUTS_ENABLED @@ -122,14 +124,6 @@ extern void wait_while_mp_kdp_trap(bool check_SIGPdebug); #include #endif -#if ML_IO_TIMEOUTS_ENABLED -#if defined(__x86_64__) -#define ml_io_timestamp mach_absolute_time -#else -#define ml_io_timestamp ml_get_timebase -#endif /* __x86_64__ */ -#endif /* ML_IO_TIMEOUTS_ENABLED */ - /* * Exported variables: */ @@ -138,6 +132,7 @@ TUNABLE(long, wdt, "wdt", 0); struct machine_info machine_info; + /* Forwards */ static void processor_offline(void * parameter, __unused wait_result_t result); @@ -620,6 +615,10 @@ uint32_t phy_read_panic = 0; uint32_t phy_write_panic = 0; #endif +#if ML_IO_TIMEOUTS_ENABLED +mmio_track_t PERCPU_DATA(mmio_tracker); +#endif + #if !defined(__x86_64__) #if DEVELOPMENT || DEBUG @@ -972,6 +971,37 @@ ml_io_reset_timeouts_phys(vm_offset_t ioaddr_base, unsigned int size) #endif /* ML_IO_TIMEOUTS_ENABLED */ } +#if ML_IO_TIMEOUTS_ENABLED +boolean_t +ml_io_check_for_mmio_overrides(__unused uint64_t mt) +{ +#if __arm64__ + /* Issue a barrier before accessing the remote mmio trackers */ + __builtin_arm_dmb(DMB_ISH); +#endif + boolean_t istate = ml_set_interrupts_enabled_with_debug(false, false); + percpu_foreach(mmiot, mmio_tracker) { + uint64_t read_timeout; + uint64_t write_timeout; + + override_io_timeouts(mmiot->mmio_vaddr, mmiot->mmio_paddr, &read_timeout, &write_timeout); + + if (read_timeout > 0 || write_timeout > 0) { + if (mt < (mmiot->mmio_start_mt + MAX(read_timeout, write_timeout))) { + ml_set_interrupts_enabled_with_debug(istate, false); + return true; + } + } + } + ml_set_interrupts_enabled_with_debug(istate, false); + return false; +} +#endif /* ML_IO_TIMEOUTS_ENABLED */ + +#if DEVELOPMENT || DEBUG +static int ml_io_read_test_mode; +#endif + unsigned long long ml_io_read(uintptr_t vaddr, int size) { @@ -979,6 +1009,19 @@ ml_io_read(uintptr_t vaddr, int size) unsigned char s1; unsigned short s2; +#if DEVELOPMENT || DEBUG + /* For testing */ + extern void IODelay(int); + if (__improbable(ml_io_read_test_mode)) { + if (vaddr == 1) { + IODelay(100); + return 0; + } else if (vaddr == 2) { + return 0; + } + } +#endif /* DEVELOPMENT || DEBUG */ + #ifdef ML_IO_VERIFY_UNCACHEABLE uintptr_t paddr = pmap_verify_noncacheable(vaddr); #elif defined(ML_IO_TIMEOUTS_ENABLED) @@ -986,8 +1029,8 @@ ml_io_read(uintptr_t vaddr, int size) #endif #ifdef ML_IO_TIMEOUTS_ENABLED - uint64_t sabs, eabs; - boolean_t istate, timeread = FALSE; + kern_timeout_t timeout; + boolean_t istate, use_timeout = FALSE; uint64_t report_read_delay; #if __x86_64__ report_read_delay = report_phy_read_delay; @@ -998,13 +1041,22 @@ ml_io_read(uintptr_t vaddr, int size) if (__improbable(report_read_delay != 0)) { istate = ml_set_interrupts_enabled_with_debug(false, false); - sabs = ml_io_timestamp(); - timeread = TRUE; + + kern_timeout_start(&timeout, TF_NONSPEC_TIMEBASE | TF_SAMPLE_PMC); + use_timeout = true; + + if (paddr == 0) { + paddr = kvtophys(vaddr); + } + mmio_track_t *mmiot = PERCPU_GET(mmio_tracker); + 
mmiot->mmio_start_mt = kern_timeout_start_time(&timeout); + mmiot->mmio_paddr = paddr; + mmiot->mmio_vaddr = vaddr; } #ifdef ML_IO_SIMULATE_STRETCHED_ENABLED - if (__improbable(timeread && simulate_stretched_io)) { - sabs -= simulate_stretched_io; + if (__improbable(use_timeout && simulate_stretched_io)) { + kern_timeout_stretch(&timeout, simulate_stretched_io); } #endif /* ML_IO_SIMULATE_STRETCHED_ENABLED */ #endif /* ML_IO_TIMEOUTS_ENABLED */ @@ -1043,22 +1095,18 @@ ml_io_read(uintptr_t vaddr, int size) #endif #ifdef ML_IO_TIMEOUTS_ENABLED - if (__improbable(timeread == TRUE)) { - eabs = ml_io_timestamp(); - + if (__improbable(use_timeout == TRUE)) { + kern_timeout_end(&timeout, TF_NONSPEC_TIMEBASE); + uint64_t duration = kern_timeout_gross_duration(&timeout); /* Prevent the processor from calling iotrace during its * initialization procedure. */ if (current_processor()->state == PROCESSOR_RUNNING) { - iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, sabs, eabs - sabs); + iotrace(IOTRACE_IO_READ, vaddr, paddr, size, result, kern_timeout_start_time(&timeout), duration); } - if (__improbable((eabs - sabs) > report_read_delay)) { - if (paddr == 0) { - paddr = kvtophys(vaddr); - } - - DTRACE_PHYSLAT5(physioread, uint64_t, (eabs - sabs), + if (__improbable(duration > report_read_delay)) { + DTRACE_PHYSLAT5(physioread, uint64_t, duration, uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, result); uint64_t override = 0; @@ -1079,23 +1127,23 @@ ml_io_read(uintptr_t vaddr, int size) } } - if (__improbable((eabs - sabs) > report_read_delay)) { + if (__improbable(duration > report_read_delay)) { if (phy_read_panic && (machine_timeout_suspended() == FALSE)) { + char str[128]; #if defined(__x86_64__) panic_notify(); #endif /* defined(__x86_64__) */ - uint64_t nsec = 0; - absolutetime_to_nanoseconds(eabs - sabs, &nsec); - panic("Read from IO vaddr 0x%lx paddr 0x%lx took %llu ns, " - "result: 0x%llx (start: %llu, end: %llu), ceiling: %llu", - vaddr, paddr, nsec, result, sabs, eabs, + snprintf(str, sizeof(str), + "Read from IO vaddr 0x%lx paddr 0x%lx (result: 0x%llx) timed out:", + vaddr, paddr, result); + kern_timeout_try_panic(KERN_TIMEOUT_MMIO, paddr, &timeout, str, report_read_delay); } } - if (__improbable(trace_phy_read_delay > 0 && (eabs - sabs) > trace_phy_read_delay)) { + if (__improbable(trace_phy_read_delay > 0 && duration > trace_phy_read_delay)) { KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_READ), - (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result); + duration, VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, result); } (void)ml_set_interrupts_enabled_with_debug(istate, false); @@ -1128,6 +1176,20 @@ ml_io_read64(uintptr_t vaddr) return ml_io_read(vaddr, 8); } + +uint64_t +ml_io_read_cpu_reg(uintptr_t vaddr, int sz, __unused int logical_cpu) +{ + uint64_t val; + + + val = ml_io_read(vaddr, sz); + + + return val; +} + + /* ml_io_write* */ void @@ -1140,8 +1202,8 @@ ml_io_write(uintptr_t vaddr, uint64_t val, int size) #endif #ifdef ML_IO_TIMEOUTS_ENABLED - uint64_t sabs, eabs; - boolean_t istate, timewrite = FALSE; + kern_timeout_t timeout; + boolean_t istate, use_timeout = FALSE; uint64_t report_write_delay; #if __x86_64__ report_write_delay = report_phy_write_delay; @@ -1151,13 +1213,22 @@ ml_io_write(uintptr_t vaddr, uint64_t val, int size) #endif /* !defined(__x86_64__) */ if (__improbable(report_write_delay != 0)) { istate = ml_set_interrupts_enabled_with_debug(false, false); - sabs = ml_io_timestamp(); - timewrite = TRUE; + + kern_timeout_start(&timeout, 
TF_NONSPEC_TIMEBASE | TF_SAMPLE_PMC); + use_timeout = TRUE; + + if (paddr == 0) { + paddr = kvtophys(vaddr); + } + mmio_track_t *mmiot = PERCPU_GET(mmio_tracker); + mmiot->mmio_start_mt = kern_timeout_start_time(&timeout); + mmiot->mmio_paddr = paddr; + mmiot->mmio_vaddr = vaddr; } #ifdef ML_IO_SIMULATE_STRETCHED_ENABLED - if (__improbable(timewrite && simulate_stretched_io)) { - sabs -= simulate_stretched_io; + if (__improbable(use_timeout && simulate_stretched_io)) { + kern_timeout_stretch(&timeout, simulate_stretched_io); } #endif /* DEVELOPMENT || DEBUG */ #endif /* ML_IO_TIMEOUTS_ENABLED */ @@ -1194,22 +1265,18 @@ ml_io_write(uintptr_t vaddr, uint64_t val, int size) #endif #ifdef ML_IO_TIMEOUTS_ENABLED - if (__improbable(timewrite == TRUE)) { - eabs = ml_io_timestamp(); + if (__improbable(use_timeout == TRUE)) { + kern_timeout_end(&timeout, TF_NONSPEC_TIMEBASE); + uint64_t duration = kern_timeout_gross_duration(&timeout); /* Prevent the processor from calling iotrace during its * initialization procedure. */ if (current_processor()->state == PROCESSOR_RUNNING) { - iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, sabs, eabs - sabs); + iotrace(IOTRACE_IO_WRITE, vaddr, paddr, size, val, kern_timeout_start_time(&timeout), duration); } - - if (__improbable((eabs - sabs) > report_write_delay)) { - if (paddr == 0) { - paddr = kvtophys(vaddr); - } - - DTRACE_PHYSLAT5(physiowrite, uint64_t, (eabs - sabs), + if (__improbable(duration > report_write_delay)) { + DTRACE_PHYSLAT5(physiowrite, uint64_t, duration, uint64_t, vaddr, uint32_t, size, uint64_t, paddr, uint64_t, val); uint64_t override = 0; @@ -1230,24 +1297,23 @@ ml_io_write(uintptr_t vaddr, uint64_t val, int size) } } - if (__improbable((eabs - sabs) > report_write_delay)) { + if (__improbable(duration > report_write_delay)) { if (phy_write_panic && (machine_timeout_suspended() == FALSE)) { + char str[128]; #if defined(__x86_64__) panic_notify(); #endif /* defined(__x86_64__) */ - - uint64_t nsec = 0; - absolutetime_to_nanoseconds(eabs - sabs, &nsec); - panic("Write to IO vaddr %p paddr %p val 0x%llx took %llu ns," - " (start: %llu, end: %llu), ceiling: %llu", - (void *)vaddr, (void *)paddr, val, nsec, sabs, eabs, + snprintf(str, sizeof(str), + "Write to IO vaddr 0x%lx paddr 0x%lx (value: 0x%llx) timed out:", + vaddr, paddr, val); + kern_timeout_try_panic(KERN_TIMEOUT_MMIO, paddr, &timeout, str, report_write_delay); } } - if (__improbable(trace_phy_write_delay > 0 && (eabs - sabs) > trace_phy_write_delay)) { + if (__improbable(trace_phy_write_delay > 0 && duration > trace_phy_write_delay)) { KDBG(MACHDBG_CODE(DBG_MACH_IO, DBC_MACH_IO_MMIO_WRITE), - (eabs - sabs), VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val); + duration, VM_KERNEL_UNSLIDE_OR_PERM(vaddr), paddr, val); } (void)ml_set_interrupts_enabled_with_debug(istate, false); @@ -1289,6 +1355,94 @@ static struct cpu_callback_chain_elem *cpu_callback_chain; static LCK_GRP_DECLARE(cpu_callback_chain_lock_grp, "cpu_callback_chain"); static LCK_SPIN_DECLARE(cpu_callback_chain_lock, &cpu_callback_chain_lock_grp); +struct cpu_event_log_entry { + uint64_t abstime; + enum cpu_event event; + unsigned int cpu_or_cluster; +}; + +#if DEVELOPMENT || DEBUG + +#define CPU_EVENT_RING_SIZE 128 +static struct cpu_event_log_entry cpu_event_ring[CPU_EVENT_RING_SIZE]; +static _Atomic int cpu_event_widx; +static _Atomic uint64_t cpd_cycles; + +void +cpu_event_debug_log(enum cpu_event event, unsigned int cpu_or_cluster) +{ + int oldidx, newidx; + + os_atomic_rmw_loop(&cpu_event_widx, oldidx, newidx, 
relaxed, { + newidx = (oldidx + 1) % CPU_EVENT_RING_SIZE; + }); + cpu_event_ring[newidx].abstime = ml_get_timebase(); + cpu_event_ring[newidx].event = event; + cpu_event_ring[newidx].cpu_or_cluster = cpu_or_cluster; + + if (event == CLUSTER_EXIT_REQUESTED) { + os_atomic_inc(&cpd_cycles, relaxed); + } +} + +static const char * +cpu_event_log_string(enum cpu_event e) +{ + const char *event_strings[] = { + "CPU_BOOT_REQUESTED", + "CPU_BOOTED", + "CPU_ACTIVE", + "CLUSTER_ACTIVE", + "CPU_EXIT_REQUESTED", + "CPU_DOWN", + "CLUSTER_EXIT_REQUESTED", + "CPU_EXITED", + "PLATFORM_QUIESCE", + "PLATFORM_ACTIVE", + "PLATFORM_HALT_RESTART", + "PLATFORM_PANIC", + "PLATFORM_PANIC_SYNC", + "PLATFORM_PRE_SLEEP", + "PLATFORM_POST_RESUME", + }; + + assert((unsigned)e < sizeof(event_strings) / sizeof(event_strings[0])); + return event_strings[e]; +} + +void +dump_cpu_event_log(int (*printf_func)(const char * fmt, ...)) +{ + printf_func("CPU event history @ %016llx: (CPD cycles: %lld)\n", + ml_get_timebase(), os_atomic_load(&cpd_cycles, relaxed)); + + int idx = os_atomic_load(&cpu_event_widx, relaxed); + for (int c = 0; c < CPU_EVENT_RING_SIZE; c++) { + idx = (idx + 1) % CPU_EVENT_RING_SIZE; + + struct cpu_event_log_entry *e = &cpu_event_ring[idx]; + if (e->abstime != 0) { + printf_func(" %016llx: %s %d\n", e->abstime, + cpu_event_log_string(e->event), e->cpu_or_cluster); + } + } +} + +#else /* DEVELOPMENT || DEBUG */ + +void +cpu_event_debug_log(__unused enum cpu_event event, __unused unsigned int cpu_or_cluster) +{ + /* no logging on production builds */ +} + +void +dump_cpu_event_log(__unused int (*printf_func)(const char * fmt, ...)) +{ +} + +#endif /* DEVELOPMENT || DEBUG */ + void cpu_event_register_callback(cpu_callback_t fn, void *param) { @@ -1319,6 +1473,8 @@ ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster) { struct cpu_callback_chain_elem *cursor; + cpu_event_debug_log(event, cpu_or_cluster); + cursor = os_atomic_load(&cpu_callback_chain, dependency); for (; cursor != NULL; cursor = cursor->next) { cursor->fn(cursor->param, event, cpu_or_cluster); @@ -1329,9 +1485,9 @@ ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster) // definition) void -machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix) +machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *suffix, bool always_enabled) { - if (wdt == -1 || (spec->skip_predicate != NULL && spec->skip_predicate(spec))) { + if (!always_enabled && (wdt == -1 || (spec->skip_predicate != NULL && spec->skip_predicate(spec)))) { // This timeout should be disabled. 
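The cpu_event_debug_log()/dump_cpu_event_log() ring added above reserves a slot by advancing the write index in an os_atomic_rmw_loop and then fills the slot non-atomically, which is acceptable for a best-effort debug log. The standalone C11 sketch below models that reserve-then-fill pattern with a compare-and-swap loop; the entry format and sizes are simplified for illustration and this is not the xnu implementation.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 128

struct event_entry {
        uint64_t     timestamp;
        int          event;
        unsigned int cpu;
};

static struct event_entry ring[RING_SIZE];
static atomic_int widx;          /* last slot written */

/* Reserve the next slot with a CAS loop, then fill it in. Concurrent writers
 * get distinct slots; a slot may briefly be observed half-filled, which a
 * debug dump tolerates. */
static void
event_log(uint64_t timestamp, int event, unsigned int cpu)
{
        int oldidx = atomic_load_explicit(&widx, memory_order_relaxed);
        int newidx;

        do {
                newidx = (oldidx + 1) % RING_SIZE;
        } while (!atomic_compare_exchange_weak_explicit(&widx, &oldidx, newidx,
            memory_order_relaxed, memory_order_relaxed));

        ring[newidx].timestamp = timestamp;
        ring[newidx].event = event;
        ring[newidx].cpu = cpu;
}

/* Dump oldest-to-newest by walking forward from the slot after the write index. */
static void
event_dump(void)
{
        int idx = atomic_load_explicit(&widx, memory_order_relaxed);

        for (int c = 0; c < RING_SIZE; c++) {
                idx = (idx + 1) % RING_SIZE;
                if (ring[idx].timestamp != 0) {
                        printf("%llu: event %d cpu %u\n",
                            (unsigned long long)ring[idx].timestamp,
                            ring[idx].event, ring[idx].cpu);
                }
        }
}

int
main(void)
{
        for (uint64_t t = 1; t <= 5; t++) {
                event_log(t, (int)(t % 3), (unsigned int)(t % 2));
        }
        event_dump();
        return 0;
}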
os_atomic_store_wide((uint64_t*)spec->ptr, 0, relaxed); return; @@ -1509,7 +1665,7 @@ machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char c } if (os_mul_overflow(timeout, scale, &timeout)) { - timeout = UINT64_MAX; // clamp + timeout = UINT64_MAX; // clamp } os_atomic_store_wide((uint64_t*)spec->ptr, timeout, relaxed); @@ -1518,7 +1674,13 @@ machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char c void machine_timeout_init(const struct machine_timeout_spec *spec) { - machine_timeout_init_with_suffix(spec, ""); + machine_timeout_init_with_suffix(spec, "", false); +} + +void +machine_timeout_init_always_enabled(const struct machine_timeout_spec *spec) +{ + machine_timeout_init_with_suffix(spec, "", true); } #if DEVELOPMENT || DEBUG @@ -1530,8 +1692,8 @@ machine_timeout_bsd_init(void) { char const * const __unused mt_suffix = "-b"; #if SCHED_HYGIENE_DEBUG - machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix); - machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix); + machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(interrupt_masked_timeout), mt_suffix, false); + machine_timeout_init_with_suffix(MACHINE_TIMEOUT_SPEC_REF(sched_preemption_disable_threshold_mt), mt_suffix, false); /* * The io timeouts can inherit from interrupt_masked_timeout. @@ -1694,3 +1856,24 @@ ml_io_timeout_test(void) return KERN_SUCCESS; } #endif /* CONFIG_XNUPOST */ + +#if DEVELOPMENT || DEBUG +static int +ml_io_read_cpu_reg_test(__unused int64_t in, int64_t *out) +{ + printf("Testing ml_io_read_cpu_reg()...\n"); + + ml_io_read_test_mode = 1; + boolean_t istate = ml_set_interrupts_enabled_with_debug(false, false); + (void) ml_io_read_cpu_reg((uintptr_t)1, 8, 1); + (void) ml_io_read_cpu_reg((uintptr_t)2, 8, 1); + ml_set_interrupts_enabled_with_debug(istate, false); + (void) ml_io_read_cpu_reg((uintptr_t)1, 8, 1); + (void) ml_io_read_cpu_reg((uintptr_t)2, 8, 1); + ml_io_read_test_mode = 0; + + *out = 0; + return 0; +} +SYSCTL_TEST_REGISTER(ml_io_read_cpu_reg, ml_io_read_cpu_reg_test); +#endif /* DEVELOPMENT || DEBUG */ diff --git a/osfmk/kern/machine.h b/osfmk/kern/machine.h index 9df60f002..3e4cfb80f 100644 --- a/osfmk/kern/machine.h +++ b/osfmk/kern/machine.h @@ -161,4 +161,6 @@ extern void machine_thread_group_blocked(struct thread_group *tg_blocked, struct extern void machine_thread_group_unblocked(struct thread_group *tg_unblocked, struct thread_group *tg_unblocking, uint32_t flags, thread_t unblocked_thread); #endif +extern void machine_perfcontrol_running_timer_expire(uint64_t now, uint32_t flags, int cpu_id, uint64_t *timeout_ticks); + #endif /* _KERN_MACHINE_H_ */ diff --git a/osfmk/kern/misc_protos.h b/osfmk/kern/misc_protos.h index 1c87f0642..7bb772048 100644 --- a/osfmk/kern/misc_protos.h +++ b/osfmk/kern/misc_protos.h @@ -162,6 +162,7 @@ extern int copyoutmsg( user_addr_t user_addr, mach_msg_size_t nbytes); + #if (DEBUG || DEVELOPMENT) extern int verify_write(const void *source, void *dst, size_t size); #endif diff --git a/osfmk/kern/mk_sp.c b/osfmk/kern/mk_sp.c index 37181373d..3b097ad0a 100644 --- a/osfmk/kern/mk_sp.c +++ b/osfmk/kern/mk_sp.c @@ -34,8 +34,6 @@ #include #include -#include -#include #include #include #include diff --git a/osfmk/kern/mk_timer.c b/osfmk/kern/mk_timer.c index 67382b168..0572cec5f 100644 --- a/osfmk/kern/mk_timer.c +++ b/osfmk/kern/mk_timer.c @@ -42,10 +42,10 @@ #include #include +#include #include #include -#include struct 
mk_timer { decl_simple_lock_data(, lock); @@ -60,12 +60,8 @@ struct mk_timer { static ZONE_DEFINE_TYPE(mk_timer_zone, "mk_timer", struct mk_timer, ZC_ZFREE_CLEARMEM); -static void mk_timer_port_destroy(ipc_port_t); static void mk_timer_expire(void *p0, void *p1); -IPC_KOBJECT_DEFINE(IKOT_TIMER, - .iko_op_destroy = mk_timer_port_destroy); - mach_port_name_t mk_timer_create_trap( __unused struct mk_timer_create_trap_args *args) @@ -73,10 +69,11 @@ mk_timer_create_trap( struct mk_timer* timer; ipc_space_t myspace = current_space(); mach_port_name_t name = MACH_PORT_NULL; - ipc_port_init_flags_t init_flags; ipc_port_t port; kern_return_t result; ipc_kmsg_t kmsg; + ipc_object_label_t label = IPC_OBJECT_LABEL(IOT_TIMER_PORT); + /* Allocate and initialize local state of a timer object */ timer = zalloc_flags(mk_timer_zone, Z_ZERO | Z_WAITOK | Z_NOFAIL); @@ -88,24 +85,20 @@ mk_timer_create_trap( IPC_KMSG_ALLOC_KERNEL | IPC_KMSG_ALLOC_ZERO | IPC_KMSG_ALLOC_ALL_INLINE | IPC_KMSG_ALLOC_NOFAIL | IPC_KMSG_ALLOC_USE_KEEP_ALIVE); - init_flags = IPC_PORT_INIT_MESSAGE_QUEUE; - result = ipc_port_alloc(myspace, init_flags, &name, &port); - if (result != KERN_SUCCESS) { - zfree(mk_timer_zone, timer); - ipc_kmsg_keep_alive_abandon(kmsg); - return MACH_PORT_NULL; - } - /* port locked, receive right at user-space */ - port->ip_immovable_receive = true; - ipc_kobject_upgrade_mktimer_locked(port, (ipc_kobject_t)timer); - - /* make a (naked) send right for the timer to keep */ - timer->port = ipc_port_make_send_any_locked(port); + label.iol_mktimer = timer; /* Associate the pre-allocated kmsg with the port */ timer->prealloc = kmsg; + result = ipc_port_alloc(myspace, label, IP_INIT_NONE, &name, &port); + if (result != KERN_SUCCESS) { + return MACH_PORT_NULL; + } + + /* make a (naked) send right for the timer to keep */ + timer->port = ipc_port_make_send_any_locked(port); + ip_mq_unlock(port); return name; @@ -123,13 +116,11 @@ mk_timer_unlock_and_destroy(struct mk_timer *timer, ipc_port_t port) ipc_port_release_send(port); } -static void -mk_timer_port_destroy( - ipc_port_t port) +void +mk_timer_port_label_dealloc( + ipc_object_label_t label) { - struct mk_timer *timer = NULL; - - timer = ipc_kobject_disable(port, IKOT_TIMER); + struct mk_timer *timer = label.iol_mktimer; simple_lock(&timer->lock, LCK_GRP_NULL); @@ -140,7 +131,7 @@ mk_timer_port_destroy( timer->is_dead = true; if (timer->active == 0) { - mk_timer_unlock_and_destroy(timer, port); + mk_timer_unlock_and_destroy(timer, timer->port); } else { simple_unlock(&timer->lock); } @@ -225,7 +216,7 @@ mk_timer_destroy_trap( return KERN_INVALID_RIGHT; } - if (ip_kotype(entry->ie_port) != IKOT_TIMER) { + if (!ip_is_timer(entry->ie_port)) { is_write_unlock(myspace); return KERN_INVALID_ARGUMENT; } @@ -234,7 +225,7 @@ mk_timer_destroy_trap( * This should have been a mach_mod_refs(RR, -1) but unfortunately, * the fact this is a mach_port_destroy() is ABI now. 
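The mk_timer rework above drops the IKOT_TIMER kobject and instead carries the timer pointer in the port's object label (IOT_TIMER_PORT with iol_mktimer), so lookups become a type check followed by a label read, and teardown receives the label directly in mk_timer_port_label_dealloc(). The following is a purely hypothetical userspace model of that tagged-label idea; struct port, port_is_timer() and timer_label_dealloc() are inventions for illustration and not xnu APIs.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

enum port_object_type { OT_NONE, OT_TIMER };

struct my_timer { int armed; };

/* Model of an object label: a type tag plus a payload pointer. */
struct object_label {
        enum port_object_type type;
        union {
                struct my_timer *timer;
        } u;
};

struct port {
        struct object_label label;
};

static int
port_is_timer(const struct port *p)
{
        return p->label.type == OT_TIMER;
}

/* Called when the port goes away: the label itself carries the timer,
 * so no separate kobject lookup/disable step is needed. */
static void
timer_label_dealloc(struct object_label label)
{
        assert(label.type == OT_TIMER);
        free(label.u.timer);
}

int
main(void)
{
        struct my_timer *timer = calloc(1, sizeof(*timer));
        assert(timer != NULL);

        struct port p = {
                .label = { .type = OT_TIMER, .u.timer = timer },
        };

        if (port_is_timer(&p)) {
                p.label.u.timer->armed = 1;
                printf("timer armed via label: %d\n", p.label.u.timer->armed);
        }
        timer_label_dealloc(p.label);
        return 0;
}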
*/ - return ipc_right_destroy(myspace, name, entry, TRUE, 0); /* unlocks space */ + return ipc_right_destroy(myspace, name, entry); /* unlocks space */ } /* @@ -255,6 +246,7 @@ static kern_return_t mk_timer_arm_trap_internal(mach_port_name_t name, uint64_t expire_time, uint64_t mk_leeway, uint64_t mk_timer_flags) { struct mk_timer* timer; + ipc_object_label_t label; ipc_space_t myspace = current_space(); ipc_port_t port; kern_return_t result; @@ -264,7 +256,14 @@ mk_timer_arm_trap_internal(mach_port_name_t name, uint64_t expire_time, uint64_t return result; } - timer = ipc_kobject_get_locked(port, IKOT_TIMER); + if (!ip_is_timer(port)) { + ip_mq_unlock(port); + return KERN_INVALID_ARGUMENT; + } + + label = ip_label_get(port, IOT_TIMER_PORT); + timer = label.iol_mktimer; + ip_label_put(port, &label); if (timer) { @@ -337,18 +336,27 @@ mk_timer_cancel_trap( { mach_port_name_t name = args->name; mach_vm_address_t result_time_addr = args->result_time; - uint64_t armed_time = 0; - struct mk_timer* timer; - ipc_space_t myspace = current_space(); - ipc_port_t port; + uint64_t armed_time = 0; + struct mk_timer* timer; + ipc_space_t myspace = current_space(); + ipc_port_t port; kern_return_t result; + ipc_object_label_t label; result = ipc_port_translate_receive(myspace, name, &port); if (result != KERN_SUCCESS) { return result; } - timer = ipc_kobject_get_locked(port, IKOT_TIMER); + if (!ip_is_timer(port)) { + ip_mq_unlock(port); + return KERN_INVALID_ARGUMENT; + } + + label = ip_label_get(port, IOT_TIMER_PORT); + timer = label.iol_mktimer; + ip_label_put(port, &label); + if (timer != NULL) { simple_lock(&timer->lock, LCK_GRP_NULL); assert(timer->port == port); diff --git a/osfmk/kern/policy_internal.h b/osfmk/kern/policy_internal.h index b675ebe46..39099af70 100644 --- a/osfmk/kern/policy_internal.h +++ b/osfmk/kern/policy_internal.h @@ -73,7 +73,7 @@ extern kern_return_t task_importance(task_t task, integer_t importance); #define TASK_POLICY_THREAD 0x8 #define TASK_POLICY_COALITION 0x10 -/* flavors (also DBG_IMPORTANCE subclasses 0x20 - 0x40) */ +/* flavors (also DBG_IMPORTANCE subclasses 0x20 - 0x50) */ /* internal or external, thread or task */ #define TASK_POLICY_DARWIN_BG IMP_TASK_POLICY_DARWIN_BG @@ -97,6 +97,7 @@ extern kern_return_t task_importance(task_t task, integer_t importance); #define TASK_POLICY_WATCHERS_BG IMP_TASK_POLICY_WATCHERS_BG #define TASK_POLICY_SFI_MANAGED IMP_TASK_POLICY_SFI_MANAGED #define TASK_POLICY_ALL_SOCKETS_BG IMP_TASK_POLICY_ALL_SOCKETS_BG +#define TASK_POLICY_RUNAWAY_MITIGATION IMP_TASK_POLICY_RUNAWAY_MITIGATION #define TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS IMP_TASK_POLICY_BASE_LATENCY_AND_THROUGHPUT_QOS /* latency as value1, throughput as value2 */ #define TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS IMP_TASK_POLICY_OVERRIDE_LATENCY_AND_THROUGHPUT_QOS /* latency as value1, throughput as value2 */ @@ -114,7 +115,7 @@ extern kern_return_t task_importance(task_t task, integer_t importance); #define TASK_POLICY_IOTIER_KEVENT_OVERRIDE IMP_TASK_POLICY_IOTIER_KEVENT_OVERRIDE #define TASK_POLICY_WI_DRIVEN IMP_TASK_POLICY_WI_DRIVEN -#define TASK_POLICY_MAX 0x41 +#define TASK_POLICY_MAX 0x42 /* The main entrance to task policy is this function */ extern void proc_set_task_policy(task_t task, int category, int flavor, int value); @@ -168,7 +169,6 @@ _Static_assert(IOSCHED_METADATA_EXPEDITED_TIER < IOSCHED_METADATA_TIER, #endif /* CONFIG_IOSCHED */ extern int proc_get_darwinbgstate(task_t task, uint32_t *flagsp); -extern int task_get_apptype(task_t); #ifdef 
MACH_BSD extern void proc_apply_task_networkbg(int pid, thread_t thread); @@ -218,9 +218,6 @@ extern int task_importance_hold_legacy_external_assertion(task_t target_task, ui extern int task_importance_drop_legacy_external_assertion(task_t target_task, uint32_t count); #endif /* IMPORTANCE_INHERITANCE */ -/* Functions used by process_policy.c */ -extern boolean_t proc_task_is_tal(task_t task); - /* Arguments to proc_set_task_ruse_cpu */ #define TASK_POLICY_RESOURCE_ATTRIBUTE_NONE 0x00 #define TASK_POLICY_RESOURCE_ATTRIBUTE_THROTTLE 0x01 @@ -275,7 +272,7 @@ extern void thread_drop_kevent_override(thread_t thread); /* for ipc_pset.c */ extern thread_qos_t thread_get_requested_qos(thread_t thread, int *relpri); -extern boolean_t task_is_app(task_t task); +extern bool task_is_app(task_t task); extern const struct thread_requested_policy default_thread_requested_policy; /* diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index c5c6da565..75d3e3df8 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -880,6 +880,7 @@ vprintf_internal(const char *fmt, va_list ap_in, void *caller) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, ap_in, caller); #pragma clang diagnostic pop diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index 709972734..6c0ff8fa9 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -623,7 +623,7 @@ boolean_t can_update_priority( thread_t thread) { - if (sched_tick == thread->sched_stamp) { + if (os_atomic_load(&sched_tick, relaxed) == thread->sched_stamp) { return FALSE; } else { return TRUE; @@ -643,7 +643,7 @@ update_priority( { uint32_t ticks, delta; - ticks = sched_tick - thread->sched_stamp; + ticks = os_atomic_load(&sched_tick, relaxed) - thread->sched_stamp; assert(ticks != 0); thread->sched_stamp += ticks; diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 3133708ec..bcbac53ca 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -65,15 +65,18 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -98,20 +101,43 @@ #include #include -/* The boot pset and pset node */ -struct processor_set pset0; +#if __AMP__ + +/* + * For AMP platforms, all psets of the same type are part of + * the same pset_node. This allows for easier CPU selection logic. + * + * The nodes in pset_nodes are indexed in pset boot order and + * initialization is protected by pset_node_lock. 
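The per-cluster-type registry introduced just below (pset_nodes_by_cluster_type, with pset_node_set_for_pset_cluster_type() storing with release ordering and pset_node_for_pset_cluster_type() loading with acquire ordering) is a publish-once table: creators take pset_node_lock, re-check the registry in case a racing cluster of the same type already created the node, and only then publish a fully initialized entry. A standalone C11 sketch of that discipline follows, with a pthread mutex standing in for the pset node lock; the names and the two-entry table size are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_CLUSTER_TYPES 2      /* e.g. E and P on an AMP system */

struct node { int cluster_type; };

static _Atomic(struct node *) nodes_by_type[MAX_CLUSTER_TYPES];
static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;

/* Lock-free lookup: acquire pairs with the release store in node_create(). */
static struct node *
node_for_type(int type)
{
        return atomic_load_explicit(&nodes_by_type[type], memory_order_acquire);
}

/* Create-or-return, tolerating a race between two boots of the same type. */
static struct node *
node_create(int type)
{
        pthread_mutex_lock(&node_lock);
        struct node *n = node_for_type(type);
        if (n == NULL) {                        /* nobody beat us to it */
                n = calloc(1, sizeof(*n));
                n->cluster_type = type;
                atomic_store_explicit(&nodes_by_type[type], n,
                    memory_order_release);      /* publish initialized node */
        }
        pthread_mutex_unlock(&node_lock);
        return n;
}

int
main(void)
{
        struct node *a = node_create(0);
        struct node *b = node_create(0);        /* second caller gets the same node */

        printf("same node: %d, type %d\n", a == b, a->cluster_type);
        return 0;
}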
+ */ +struct pset_node pset_nodes[MAX_AMP_CLUSTER_TYPES]; +static int next_pset_node_index = 1; +static _Atomic pset_node_t pset_nodes_by_cluster_type[MAX_AMP_CLUSTER_TYPES]; + +/* pset_node_lock must be held */ +static void +pset_node_set_for_pset_cluster_type(pset_node_t node, pset_cluster_type_t pset_cluster_type) +{ + assert3p(os_atomic_load(&pset_nodes_by_cluster_type[pset_cluster_type - 1], relaxed), ==, PSET_NODE_NULL); + os_atomic_store(&pset_nodes_by_cluster_type[pset_cluster_type - 1], node, release); +} + +pset_node_t +pset_node_for_pset_cluster_type(pset_cluster_type_t pset_cluster_type) +{ + assert3u(pset_cluster_type, !=, PSET_SMP); + return os_atomic_load(&pset_nodes_by_cluster_type[pset_cluster_type - 1], acquire); +} + +#else /* !__AMP__ */ + +/* The boot node */ struct pset_node pset_node0; -#if __AMP__ -/* Additional AMP node */ -static struct pset_node pset_node1; -/* - * For AMP platforms, all clusters of the same type are part of - * the same pset_node. This allows for easier CPU selection logic. - */ -pset_node_t ecore_node; -pset_node_t pcore_node; -#endif /* __AMP__ */ +#endif /* !__AMP__ */ + +/* The boot pset */ +struct processor_set pset0; LCK_SPIN_DECLARE(pset_node_lock, LCK_GRP_NULL); @@ -165,6 +191,7 @@ static timer_call_func_t running_timer_funcs[] = { [RUNNING_TIMER_QUANTUM] = thread_quantum_expire, [RUNNING_TIMER_PREEMPT] = thread_preempt_expire, [RUNNING_TIMER_KPERF] = kperf_timer_expire, + [RUNNING_TIMER_PERFCONTROL] = perfcontrol_timer_expire, }; static_assert(sizeof(running_timer_funcs) / sizeof(running_timer_funcs[0]) == RUNNING_TIMER_MAX, "missing running timer function"); @@ -240,44 +267,26 @@ processor_bootstrap(void) simple_lock_init(&sched_available_cores_lock, 0); simple_lock_init(&processor_start_state_lock, 0); - /* Initialize boot pset node */ - pset_node0.psets = &pset0; - pset_node0.pset_cluster_type = PSET_SMP; - + /* Initialize boot pset and node */ #if __AMP__ - const ml_topology_info_t *topology_info = ml_get_topology_info(); - /* - * Continue initializing boot pset and node. 
* Since this is an AMP system, fill up cluster type and ID information; this should do the * same kind of initialization done via ml_processor_register() */ + const ml_topology_info_t *topology_info = ml_get_topology_info(); ml_topology_cluster_t *boot_cluster = topology_info->boot_cluster; + pset_cluster_type_t boot_cluster_type = cluster_type_to_pset_cluster_type(boot_cluster->cluster_type); pset0.pset_id = boot_cluster->cluster_id; pset0.pset_cluster_id = boot_cluster->cluster_id; - pset_cluster_type_t boot_type = cluster_type_to_pset_cluster_type(boot_cluster->cluster_type); - pset0.pset_cluster_type = boot_type; - pset_node0.pset_cluster_type = boot_type; - - /* Initialize pset node pointers according to their type */ - switch (boot_type) { - case PSET_AMP_P: - pcore_node = &pset_node0; - ecore_node = &pset_node1; - break; - case PSET_AMP_E: - ecore_node = &pset_node0; - pcore_node = &pset_node1; - break; - default: - panic("Unexpected boot pset cluster type %d", boot_type); - } - ecore_node->pset_cluster_type = PSET_AMP_E; - pcore_node->pset_cluster_type = PSET_AMP_P; - - /* Link pset_node1 to pset_node0 */ - pset_node0.node_list = &pset_node1; -#endif /* __AMP__ */ + pset_node0.pset_cluster_type = boot_cluster_type; + pset0.pset_cluster_type = boot_cluster_type; + pset_node_set_for_pset_cluster_type(&pset_node0, boot_cluster_type); +#else /* !__AMP__ */ + pset0.pset_id = 0; + pset0.pset_cluster_id = 0; + pset_node0.pset_cluster_type = PSET_SMP; + pset0.pset_cluster_type = PSET_SMP; +#endif /* !__AMP__ */ pset_init(&pset0, &pset_node0); queue_init(&tasks); @@ -343,6 +352,10 @@ processor_init( } recount_processor_init(processor); +#if CONFIG_SCHED_EDGE + os_atomic_init(&processor->stir_the_pot_inbox_cpu, -1); +#endif /* CONFIG_SCHED_EDGE */ + s = splsched(); simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); @@ -436,6 +449,12 @@ processor_pset( return processor->processor_set; } +cpumap_t +pset_available_cpumap(processor_set_t pset) +{ + return pset->cpu_available_map & pset->recommended_bitmask; +} + #if CONFIG_SCHED_EDGE /* Returns the scheduling type for the pset */ @@ -519,6 +538,7 @@ processor_state_update_idle(processor_t processor) os_atomic_store(&processor->processor_set->cpu_running_buckets[processor->cpu_id], TH_BUCKET_SCHED_MAX, relaxed); bit_clear(processor->processor_set->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_RR], processor->cpu_id); bit_clear(processor->processor_set->cpu_running_cluster_shared_rsrc_thread[CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST], processor->cpu_id); + sched_edge_stir_the_pot_clear_registry_entry(); #endif /* CONFIG_SCHED_EDGE */ sched_update_pset_load_average(processor->processor_set, 0); } @@ -535,6 +555,7 @@ processor_state_update_from_thread(processor_t processor, thread_t thread, boole /* Since idle and bound threads are not tracked by the edge scheduler, ignore when those threads go on-core */ sched_bucket_t bucket = ((thread->state & TH_IDLE) || (thread->bound_processor != PROCESSOR_NULL)) ? TH_BUCKET_SCHED_MAX : thread->th_sched_bucket; os_atomic_store(&processor->processor_set->cpu_running_buckets[processor->cpu_id], bucket, relaxed); + sched_edge_stir_the_pot_update_registry_state(thread); #endif /* CONFIG_SCHED_EDGE */ #if CONFIG_THREAD_GROUPS @@ -559,13 +580,49 @@ pset_node_root(void) return &pset_node0; } +#if __AMP__ + +/* + * Only need to dynamically initialize pset nodes when + * there are multiple cluster types. 
+ */ +static pset_node_t +pset_node_create(cluster_type_t cluster_type) +{ + lck_spin_lock(&pset_node_lock); + assert3u(cluster_type, !=, CLUSTER_TYPE_SMP); + + pset_node_t node; + pset_cluster_type_t pset_cluster_type = cluster_type_to_pset_cluster_type(cluster_type); + /* + * Check if we raced with another booting pset of the same type, + * and this node has already been created. + */ + if ((node = pset_node_for_pset_cluster_type(pset_cluster_type)) != PSET_NODE_NULL) { + lck_spin_unlock(&pset_node_lock); + return node; + } + + assert3u(next_pset_node_index, <, MAX_AMP_CLUSTER_TYPES); + node = &pset_nodes[next_pset_node_index++]; + node->psets = PROCESSOR_SET_NULL; + node->pset_cluster_type = pset_cluster_type; + /* Insert into node linked list */ + pset_nodes[next_pset_node_index - 2].node_list = node; + pset_node_set_for_pset_cluster_type(node, pset_cluster_type); + + lck_spin_unlock(&pset_node_lock); + return node; +} + +#endif /* __AMP__*/ + LCK_GRP_DECLARE(pset_create_grp, "pset_create"); LCK_MTX_DECLARE(pset_create_lock, &pset_create_grp); processor_set_t pset_create( - pset_node_t node, - pset_cluster_type_t pset_type, + cluster_type_t cluster_type, uint32_t pset_cluster_id, int pset_id) { @@ -574,25 +631,29 @@ pset_create( return processor_pset(master_processor); } - processor_set_t *prev, pset = zalloc_permanent_type(struct processor_set); - - if (pset != PROCESSOR_SET_NULL) { - pset->pset_cluster_type = pset_type; - pset->pset_cluster_id = pset_cluster_id; - pset->pset_id = pset_id; - pset_init(pset, node); - - lck_spin_lock(&pset_node_lock); - - prev = &node->psets; - while (*prev != PROCESSOR_SET_NULL) { - prev = &(*prev)->pset_list; - } - - *prev = pset; - - lck_spin_unlock(&pset_node_lock); + pset_node_t node; + pset_cluster_type_t pset_cluster_type; +#if __AMP__ + pset_cluster_type = cluster_type_to_pset_cluster_type(cluster_type); + node = pset_node_for_pset_cluster_type(pset_cluster_type); + if (node == PSET_NODE_NULL) { + /* First pset of this cluster type */ + node = pset_node_create(cluster_type); } +#else /* !__AMP__ */ + pset_cluster_type = PSET_SMP; + node = &pset_node0; + (void)cluster_type; +#endif /* !__AMP__ */ + + processor_set_t pset = zalloc_permanent_type(struct processor_set); + if (pset == PROCESSOR_SET_NULL) { + panic("Failed to allocate struct processor_set"); + } + pset->pset_cluster_type = pset_cluster_type; + pset->pset_cluster_id = pset_cluster_id; + pset->pset_id = pset_id; + pset_init(pset, node); return pset; } @@ -686,15 +747,13 @@ pset_init( pset->perfcontrol_cpu_migration_bitmask = 0; pset->cpu_preferred_last_chosen = -1; - pset->stealable_rt_threads_earliest_deadline = UINT64_MAX; - if (pset != &pset0) { /* * Scheduler runqueue initialization for non-boot psets. * This initialization for pset0 happens in sched_init(). 
*/ SCHED(pset_init)(pset); - SCHED(rt_init)(pset); + SCHED(rt_init_pset)(pset); } /* @@ -705,8 +764,17 @@ pset_init( os_atomic_store(&pset_array[pset->pset_id], pset, release); lck_spin_lock(&pset_node_lock); + + /* Initialize pset node state regarding this pset */ bit_set(node->pset_map, pset->pset_id); pset->node = node; + + processor_set_t *prev = &node->psets; + while (*prev != PROCESSOR_SET_NULL) { + prev = &(*prev)->pset_list; + } + *prev = pset; + lck_spin_unlock(&pset_node_lock); } @@ -1832,6 +1900,23 @@ recommended_pset_type(thread_t thread) #if __arm64__ +cluster_type_t +pset_cluster_type_to_cluster_type(pset_cluster_type_t pset_cluster_type) +{ + switch (pset_cluster_type) { +#if __AMP__ + case PSET_AMP_E: + return CLUSTER_TYPE_E; + case PSET_AMP_P: + return CLUSTER_TYPE_P; +#endif /* __AMP__ */ + case PSET_SMP: + return CLUSTER_TYPE_SMP; + default: + panic("Unexpected pset cluster type %d", pset_cluster_type); + } +} + pset_cluster_type_t cluster_type_to_pset_cluster_type(cluster_type_t cluster_type) { @@ -1849,23 +1934,6 @@ cluster_type_to_pset_cluster_type(cluster_type_t cluster_type) } } -pset_node_t -cluster_type_to_pset_node(cluster_type_t cluster_type) -{ - switch (cluster_type) { -#if __AMP__ - case CLUSTER_TYPE_E: - return ecore_node; - case CLUSTER_TYPE_P: - return pcore_node; -#endif /* __AMP__ */ - case CLUSTER_TYPE_SMP: - return &pset_node0; - default: - panic("Unexpected cluster type %d", cluster_type); - } -} - #endif /* __arm64__ */ #if CONFIG_THREAD_GROUPS && __AMP__ && !CONFIG_SCHED_EDGE diff --git a/osfmk/kern/processor.h b/osfmk/kern/processor.h index 290b79716..b8f9566ad 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -70,6 +70,8 @@ #include #if defined(MACH_KERNEL_PRIVATE) || SCHED_TEST_HARNESS +#include +#include #include #include #endif /* defined(MACH_KERNEL_PRIVATE) || SCHED_TEST_HARNESS */ @@ -159,10 +161,13 @@ typedef enum { PSET_AMP_E = 1, PSET_AMP_P = 2, #endif /* __AMP__ */ + MAX_PSET_TYPES, } pset_cluster_type_t; #if __AMP__ +#define MAX_AMP_CLUSTER_TYPES (MAX_PSET_TYPES - 1) + typedef enum { SCHED_PERFCTL_POLICY_DEFAULT, /* static policy: set at boot */ SCHED_PERFCTL_POLICY_FOLLOW_GROUP, /* dynamic policy: perfctl_class follows thread group across amp clusters */ @@ -178,8 +183,8 @@ typedef bitmap_t cpumap_t; #if __arm64__ +extern cluster_type_t pset_cluster_type_to_cluster_type(pset_cluster_type_t pset_cluster_type); extern pset_cluster_type_t cluster_type_to_pset_cluster_type(cluster_type_t cluster_type); -extern pset_node_t cluster_type_to_pset_node(cluster_type_t cluster_type); /* * pset_execution_time_t @@ -236,7 +241,18 @@ struct processor_set { struct run_queue pset_runq; /* runq for this processor set, used by the amp and dualq scheduler policies */ struct rt_queue rt_runq; /* realtime runq for this processor set */ - uint64_t stealable_rt_threads_earliest_deadline; /* if this pset has stealable RT threads, the earliest deadline; else UINT64_MAX */ + /* + * stealable_rt_threads_earliest_deadline stores the earliest deadline of + * the rt_runq if this pset has stealable RT threads, and RT_DEADLINE_NONE + * otherwise. + * + * It can only be read outside of the pset lock in sched_rt_steal_thread as + * a hint for which pset to lock. It must be re-checked under the lock + * before relying on its value to dequeue a thread. + * + * Updates are made under the pset lock by pset_update_rt_stealable_state. 
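The comment just below describes a hint-then-recheck pattern: the earliest-deadline word may be read without the pset lock only to decide which pset is worth locking, and must be re-validated under the lock before anything is dequeued. Here is a standalone sketch of that discipline, with a pthread mutex standing in for the pset lock and RT_DEADLINE_NONE modeled as UINT64_MAX as in the sched headers; it is illustrative, not scheduler code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RT_DEADLINE_NONE UINT64_MAX

struct pset {
        pthread_mutex_t  lock;
        _Atomic uint64_t stealable_earliest_deadline;   /* unlocked hint */
        int              stealable_count;               /* protected by lock */
};

static struct pset p = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .stealable_earliest_deadline = RT_DEADLINE_NONE,
};

/* Writer side: updated only with the lock held, as the comment requires. */
static void
set_stealable(struct pset *ps, int count, uint64_t earliest)
{
        pthread_mutex_lock(&ps->lock);
        ps->stealable_count = count;
        atomic_store_explicit(&ps->stealable_earliest_deadline,
            count > 0 ? earliest : RT_DEADLINE_NONE, memory_order_relaxed);
        pthread_mutex_unlock(&ps->lock);
}

/* Reader side: consult the hint without the lock, then re-check under it. */
static bool
try_steal(struct pset *ps)
{
        if (atomic_load_explicit(&ps->stealable_earliest_deadline,
            memory_order_relaxed) == RT_DEADLINE_NONE) {
                return false;                   /* hint says nothing to steal */
        }

        pthread_mutex_lock(&ps->lock);
        bool stole = false;
        if (ps->stealable_count > 0) {          /* hint may be stale; re-validate */
                ps->stealable_count--;
                if (ps->stealable_count == 0) {
                        atomic_store_explicit(&ps->stealable_earliest_deadline,
                            RT_DEADLINE_NONE, memory_order_relaxed);
                }
                stole = true;
        }
        pthread_mutex_unlock(&ps->lock);
        return stole;
}

int
main(void)
{
        set_stealable(&p, 1, 100);
        printf("steal #1: %d, steal #2: %d\n", try_steal(&p), try_steal(&p));
        return 0;
}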
+ */ + _Atomic uint64_t stealable_rt_threads_earliest_deadline; #if CONFIG_SCHED_CLUTCH struct sched_clutch_root pset_clutch_root; /* clutch hierarchy root */ #endif /* CONFIG_SCHED_CLUTCH */ @@ -288,10 +304,30 @@ struct processor_set { bitmap_t native_psets[BITMAP_LEN(MAX_PSETS)]; bitmap_t local_psets[BITMAP_LEN(MAX_PSETS)]; bitmap_t remote_psets[BITMAP_LEN(MAX_PSETS)]; - sched_clutch_edge sched_edges[MAX_PSETS]; pset_execution_time_t pset_execution_time[TH_BUCKET_SCHED_MAX]; uint64_t pset_cluster_shared_rsrc_load[CLUSTER_SHARED_RSRC_TYPE_COUNT]; + _Atomic sched_clutch_edge sched_edges[MAX_PSETS][TH_BUCKET_SCHED_MAX]; + sched_pset_search_order_t spill_search_order[TH_BUCKET_SCHED_MAX]; + /* + * Recommended width of threads (one per core) or shared resource threads + * (one per cluster), if this is the preferred pset. + */ + uint8_t max_parallel_cores[TH_BUCKET_SCHED_MAX]; + uint8_t max_parallel_clusters[TH_BUCKET_SCHED_MAX]; #endif /* CONFIG_SCHED_EDGE */ + +#if __AMP__ + /* Writes to sched_rt_* fields are guarded by sched_available_cores_lock to + * prevent concurrent updates. Reads are not guaranteed to be consistent + * except atomicity of specific fields, as noted below */ + + /* sched_rt_edges controls realtime thread scheduling policies like migration and steal. */ + sched_clutch_edge sched_rt_edges[MAX_PSETS]; + sched_pset_search_order_t sched_rt_spill_search_order; /* should be stored/accessed atomically */ +#if CONFIG_SCHED_EDGE + sched_pset_search_order_t sched_rt_steal_search_order; /* should be stored/accessed atomically */ +#endif /* CONFIG_SCHED_EDGE */ +#endif /* __AMP__ */ cpumap_t perfcontrol_cpu_preferred_bitmask; cpumap_t perfcontrol_cpu_migration_bitmask; int cpu_preferred_last_chosen; @@ -308,7 +344,6 @@ typedef bitmap_t pset_map_t; struct pset_node { processor_set_t psets; /* list of associated psets */ - pset_node_t nodes; /* list of associated subnodes */ pset_node_t node_list; /* chain of associated nodes */ pset_cluster_type_t pset_cluster_type; /* Same as the type of all psets in this node */ @@ -326,9 +361,18 @@ struct pset_node { extern struct pset_node pset_node0; #if __AMP__ -extern pset_node_t ecore_node; -extern pset_node_t pcore_node; -#endif /* __AMP__ */ + +/* Boot pset node */ +#define pset_node0 (pset_nodes[0]) +extern struct pset_node pset_nodes[MAX_AMP_CLUSTER_TYPES]; +extern pset_node_t pset_node_for_pset_cluster_type(pset_cluster_type_t pset_cluster_type); + +#else /* !__AMP__ */ + +/* Boot pset node and head of the pset node linked list */ +extern struct pset_node pset_node0; + +#endif /* !__AMP__ */ extern queue_head_t tasks, threads, corpse_tasks; extern int tasks_count, terminated_tasks_count, threads_count, terminated_threads_count; @@ -482,6 +526,10 @@ struct processor { bool processor_inshutdown; /* is the processor between processor_shutdown and processor_startup */ processor_offline_state_t processor_offline_state; + +#if CONFIG_SCHED_EDGE + _Atomic int stir_the_pot_inbox_cpu; /* ID of P-core available to be preempted for stir-the-pot */ +#endif /* CONFIG_SCHED_EDGE */ }; extern bool sched_all_cpus_offline(void); @@ -506,6 +554,21 @@ extern uint32_t processor_avail_count_user; extern uint32_t primary_processor_avail_count_user; #endif /* CONFIG_SCHED_SMT */ +#define cpumap_foreach(cpu_id, cpumap) \ + for (int cpu_id = lsb_first(cpumap); \ + (cpu_id) >= 0; \ + cpu_id = lsb_next((cpumap), cpu_id)) + +#define foreach_node(node) \ + for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) + +#define foreach_pset_id(pset_id, 
node) \ + for (int pset_id = lsb_first((node)->pset_map); \ + pset_id >= 0; \ + pset_id = lsb_next((node)->pset_map, pset_id)) + +cpumap_t pset_available_cpumap(processor_set_t pset); + /* * All of the operations on a processor that change the processor count * published to userspace and kernel. @@ -541,6 +604,17 @@ extern lck_grp_t pset_lck_grp; #define pset_assert_locked(p) LCK_SPIN_ASSERT(&(p)->sched_lock, LCK_ASSERT_OWNED) #endif /*!SCHED_PSET_TLOCK*/ +inline static processor_set_t +change_locked_pset(processor_set_t current_pset, processor_set_t new_pset) +{ + if (current_pset != new_pset) { + pset_unlock(current_pset); + pset_lock(new_pset); + } + + return new_pset; +} + extern lck_spin_t pset_node_lock; #endif /* !SCHED_TEST_HARNESS */ @@ -619,8 +693,7 @@ extern processor_set_t processor_pset( extern pset_node_t pset_node_root(void); extern processor_set_t pset_create( - pset_node_t node, - pset_cluster_type_t pset_type, + cluster_type_t cluster_type, uint32_t pset_cluster_id, int pset_id); @@ -851,6 +924,9 @@ extern void sched_perfcontrol_update_recommended_cores_reason(uint64_t recommend /* Request a change to the powered cores mask that CLPC wants. Does not block waiting for completion. */ extern void sched_perfcontrol_update_powered_cores(uint64_t powered_cores, processor_reason_t reason, uint32_t flags); +/* Reevaluate the thread placement decision on cpu_id and force a preemption if necessary. */ +extern bool sched_perfcontrol_check_oncore_thread_preemption(uint64_t flags, int cpu_id); + #endif /* KERNEL_PRIVATE */ #ifdef XNU_KERNEL_PRIVATE diff --git a/osfmk/kern/queue.h b/osfmk/kern/queue.h index 59ff8f4b5..37119b70c 100644 --- a/osfmk/kern/queue.h +++ b/osfmk/kern/queue.h @@ -539,7 +539,7 @@ re_queue_tail(queue_t que, queue_entry_t elt) &((elt)->field) != (head); \ elt = _nelt, _nelt = qe_element((elt)->field.next, typeof(*(elt)), field)) \ -#ifdef XNU_KERNEL_PRIVATE +#if (defined(XNU_KERNEL_PRIVATE) || SCHED_TEST_HARNESS) /* Dequeue an element from head, or return NULL if the queue is empty */ #define qe_dequeue_head(head, type, field) ({ \ @@ -595,7 +595,7 @@ re_queue_tail(queue_t que, queue_entry_t elt) _tmp_element; \ }) -#endif /* XNU_KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE || SCHED_TEST_HARNESS */ /* * Macro: QUEUE_HEAD_INITIALIZER() diff --git a/osfmk/kern/sched.h b/osfmk/kern/sched.h index 2c3108a6a..489815c1d 100644 --- a/osfmk/kern/sched.h +++ b/osfmk/kern/sched.h @@ -206,6 +206,11 @@ typedef enum { #endif /* defined(__arm64__) && CONFIG_CLUTCH && !CONFIG_SCHED_EDGE_OPT_OUT */ +#if CONFIG_SCHED_EDGE + + +#endif /* CONFIG_SCHED_EDGE */ + /* * Since the clutch scheduler organizes threads based on the thread group * and the scheduling bucket, its important to not mix threads from multiple @@ -291,11 +296,6 @@ typedef struct rt_queue *rt_queue_t; #define RT_DEADLINE_NONE UINT64_MAX #define RT_DEADLINE_QUANTUM_EXPIRED (UINT64_MAX - 1) -extern int rt_runq_count(processor_set_t); -extern uint64_t rt_runq_earliest_deadline(processor_set_t); - - - /* * Scheduler routines. 
*/ @@ -310,6 +310,11 @@ extern void thread_preempt_expire( timer_call_param_t processor, timer_call_param_t thread); +/* Invoke the performance controller supplied callback on the processor */ +extern void perfcontrol_timer_expire( + timer_call_param_t processor, + timer_call_param_t thread); + /* Context switch check for current processor */ extern ast_t csw_check( thread_t thread, @@ -347,7 +352,7 @@ extern int default_preemption_rate; #define SCHED_TICK_SHIFT 3 #define SCHED_TICK_MAX_DELTA (8) -extern unsigned sched_tick; +extern _Atomic uint32_t sched_tick; extern uint32_t sched_tick_interval; #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -419,6 +424,11 @@ struct shift_data { /* * Save the current thread time and compute a delta since the last call for the * scheduler tick. + * + * Places that consume this delta should also accumulate it to + * thread->sched_usage, thread->cpu_delta, and any policy-specific + * tracking like in sched_clutch_cpu_usage_update(), to maintain + * accurate CPU usage accounting for the scheduler. */ #define sched_tick_delta(thread, delta) \ MACRO_BEGIN \ diff --git a/osfmk/kern/sched_amp.c b/osfmk/kern/sched_amp.c index d497fb3fe..d1598fb89 100644 --- a/osfmk/kern/sched_amp.c +++ b/osfmk/kern/sched_amp.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,10 @@ #if __AMP__ && !CONFIG_SCHED_EDGE +#if CONFIG_SCHED_SMT +#error "The AMP scheduler does not support CONFIG_SCHED_SMT." +#endif /* CONFIG_SCHED_SMT */ + static thread_t sched_amp_steal_thread(processor_set_t pset); @@ -97,7 +102,7 @@ static sched_mode_t sched_amp_initial_thread_sched_mode(task_t parent_task); static processor_t -sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread); +sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options); static bool sched_amp_thread_avoid_processor(processor_t processor, thread_t thread, __unused ast_t reason); @@ -148,13 +153,13 @@ const struct sched_dispatch_table sched_amp_dispatch = { .avoid_processor_enabled = TRUE, .thread_avoid_processor = sched_amp_thread_avoid_processor, .processor_balance = sched_amp_balance, - - .rt_runq = sched_rtlocal_runq, - .rt_init = sched_rtlocal_init, - .rt_queue_shutdown = sched_rtlocal_queue_shutdown, - .rt_runq_scan = sched_rtlocal_runq_scan, - .rt_runq_count_sum = sched_rtlocal_runq_count_sum, - .rt_steal_thread = sched_rtlocal_steal_thread, + .rt_choose_processor = sched_rt_choose_processor, + .rt_steal_thread = NULL, + .rt_init_pset = sched_rt_init_pset, + .rt_init_completed = sched_rt_init_completed, + .rt_queue_shutdown = sched_rt_queue_shutdown, + .rt_runq_scan = sched_rt_runq_scan, + .rt_runq_count_sum = sched_rt_runq_count_sum, .qos_max_parallelism = sched_amp_qos_max_parallelism, .check_spill = sched_amp_check_spill, @@ -565,7 +570,7 @@ sched_amp_thread_update_scan(sched_update_scan_context_t scan_context) } thread = processor->idle_thread; - if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) { + if (thread != THREAD_NULL && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) { if (thread_update_add_thread(thread) == FALSE) { restart_needed = TRUE; break; @@ -651,7 +656,7 @@ sched_amp_thread_avoid_processor(processor_t processor, thread_t thread, __unuse } static processor_t -sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread) +sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, __unused 
sched_options_t *options) { /* Bound threads don't call this function */ assert(thread->bound_processor == PROCESSOR_NULL); @@ -683,7 +688,7 @@ sched_amp_choose_processor(processor_set_t pset, processor_t processor, thread_t #if CONFIG_SCHED_SMT return choose_processor_smt(nset, processor, thread); #else /* CONFIG_SCHED_SMT */ - return choose_processor(nset, processor, thread); + return choose_processor(nset, processor, thread, options); #endif /* CONFIG_SCHED_SMT */ } diff --git a/osfmk/kern/sched_amp_common.c b/osfmk/kern/sched_amp_common.c index 701f999b7..44ebfc874 100644 --- a/osfmk/kern/sched_amp_common.c +++ b/osfmk/kern/sched_amp_common.c @@ -490,7 +490,8 @@ sched_amp_qos_max_parallelism(int qos, uint64_t options) pset_node_t sched_amp_choose_node(thread_t thread) { - pset_node_t node = (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_node : ecore_node; + pset_cluster_type_t pset_cluster_type = (recommended_pset_type(thread) == PSET_AMP_P) ? PSET_AMP_P : PSET_AMP_E; + pset_node_t node = pset_node_for_pset_cluster_type(pset_cluster_type); return ((node != NULL) && (node->pset_map != 0)) ? node : &pset_node0; } #endif /* !CONFIG_SCHED_EDGE */ diff --git a/osfmk/kern/sched_clutch.c b/osfmk/kern/sched_clutch.c index 3ea317171..28026017a 100644 --- a/osfmk/kern/sched_clutch.c +++ b/osfmk/kern/sched_clutch.c @@ -28,28 +28,31 @@ #if !SCHED_TEST_HARNESS -#include -#include -#include -#include -#include -#include #include +#include #include #include #include +#include #include #include #include -#include + +#include +#include + #include -#include +#include +#include +#include + #include #endif /* !SCHED_TEST_HARNESS */ #include #include +#include #if CONFIG_SCHED_EDGE #include @@ -57,6 +60,10 @@ #if CONFIG_SCHED_CLUTCH +#if CONFIG_SCHED_SMT +#error "The clutch scheduler does not support CONFIG_SCHED_SMT." 
+#endif /* CONFIG_SCHED_SMT */ + #define SCHED_CLUTCH_DBG_THREAD_SELECT_PACKED_VERSION 1 typedef union { struct __attribute__((packed)) { @@ -132,7 +139,7 @@ static uint8_t sched_clutch_bucket_pri_calculate(sched_clutch_bucket_t, uint64_t /* Clutch bucket group level properties management */ static void sched_clutch_bucket_group_cpu_usage_update(sched_clutch_bucket_group_t, uint64_t); static void sched_clutch_bucket_group_cpu_adjust(sched_clutch_bucket_group_t, uint8_t); -static void sched_clutch_bucket_group_timeshare_update(sched_clutch_bucket_group_t, sched_clutch_bucket_t, uint64_t); +static void sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_t); static uint8_t sched_clutch_bucket_group_pending_ageout(sched_clutch_bucket_group_t, uint64_t); static uint32_t sched_clutch_bucket_group_run_count_inc(sched_clutch_bucket_group_t); static uint32_t sched_clutch_bucket_group_run_count_dec(sched_clutch_bucket_group_t); @@ -164,7 +171,6 @@ static inline bool sched_clutch_pri_greater_than_tiebreak(int, int, bool); #if CONFIG_SCHED_EDGE /* System based routines */ static uint32_t sched_edge_thread_bound_cluster_id(thread_t); -static int sched_edge_iterate_clusters_ordered(processor_set_t, uint64_t, int); /* Global indicating the maximum number of clusters on the current platform */ static int sched_edge_max_clusters = 0; @@ -277,6 +283,20 @@ sched_clutch_us_to_abstime(uint32_t *us_vals, uint64_t *abstime_vals) #if DEVELOPMENT || DEBUG +kern_return_t +sched_clutch_thread_group_cpu_time_for_thread(thread_t thread, int sched_bucket, uint64_t *cpu_stats) +{ + if (sched_bucket < 0 || sched_bucket >= TH_BUCKET_MAX) { + return KERN_INVALID_ARGUMENT; + } + sched_clutch_bucket_group_t clutch_bucket_group = &sched_clutch_for_thread(thread)->sc_clutch_groups[sched_bucket]; + sched_clutch_bucket_cpu_data_t scb_cpu_data; + scb_cpu_data.scbcd_cpu_data_packed = os_atomic_load_wide(&clutch_bucket_group->scbg_cpu_data.scbcd_cpu_data_packed, relaxed); + cpu_stats[0] = scb_cpu_data.cpu_data.scbcd_cpu_used; + cpu_stats[1] = scb_cpu_data.cpu_data.scbcd_cpu_blocked; + return KERN_SUCCESS; +} + /* * sched_clutch_hierarchy_locked_assert() * @@ -1016,6 +1036,12 @@ evaluate_root_buckets: goto evaluate_root_buckets; } +static inline bool +sched_clutch_bucket_is_above_timeshare(sched_bucket_t bucket) +{ + return bucket == TH_BUCKET_FIXPRI; +} + /* * sched_clutch_root_bucket_deadline_calculate() * @@ -1027,7 +1053,7 @@ sched_clutch_root_bucket_deadline_calculate( uint64_t timestamp) { /* For fixpri AboveUI bucket always return it as the earliest deadline */ - if (root_bucket->scrb_bucket < TH_BUCKET_SHARE_FG) { + if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) { return 0; } @@ -1049,7 +1075,7 @@ sched_clutch_root_bucket_deadline_update( uint64_t timestamp, bool bucket_is_enqueued) { - if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) { + if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) { /* The algorithm never uses the deadlines for scheduling TH_BUCKET_FIXPRI bucket */ return; } @@ -1084,7 +1110,7 @@ sched_clutch_root_bucket_runnable( bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? 
root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap; bitmap_set(runnable_bitmap, root_bucket->scrb_bucket); - if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) { + if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) { /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */ return; } @@ -1121,7 +1147,7 @@ sched_clutch_root_bucket_empty( bitmap_t *runnable_bitmap = (root_bucket->scrb_bound) ? root_clutch->scr_bound_runnable_bitmap : root_clutch->scr_unbound_runnable_bitmap; bitmap_clear(runnable_bitmap, root_bucket->scrb_bucket); - if (root_bucket->scrb_bucket == TH_BUCKET_FIXPRI) { + if (sched_clutch_bucket_is_above_timeshare(root_bucket->scrb_bucket)) { /* Since the TH_BUCKET_FIXPRI bucket is not scheduled based on deadline, nothing more needed here */ return; } @@ -1389,7 +1415,6 @@ sched_clutch_bucket_group_init( os_atomic_store(&clutch_bucket_group->scbg_cpu_data.cpu_data.scbcd_cpu_blocked, (clutch_cpu_data_t)sched_clutch_bucket_group_adjust_threshold, relaxed); clutch_bucket_group->scbg_blocked_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_BLOCKED_TS_INVALID; clutch_bucket_group->scbg_pending_data.scct_timestamp = SCHED_CLUTCH_BUCKET_GROUP_PENDING_INVALID; - clutch_bucket_group->scbg_amp_rebalance_last_chosen = UINT32_MAX; } static void @@ -1572,7 +1597,7 @@ sched_clutch_bucket_hierarchy_insert( sched_clutch_bucket_options_t options) { sched_clutch_hierarchy_locked_assert(root_clutch); - if (bucket > TH_BUCKET_FIXPRI) { + if (sched_clutch_bucket_is_above_timeshare(bucket) == false) { /* Enqueue the timeshare clutch buckets into the global runnable clutch_bucket list; used for sched tick operations */ enqueue_tail(&root_clutch->scr_clutch_buckets, &clutch_bucket->scb_listlink); } @@ -1610,7 +1635,7 @@ sched_clutch_bucket_hierarchy_remove( __unused sched_clutch_bucket_options_t options) { sched_clutch_hierarchy_locked_assert(root_clutch); - if (bucket > TH_BUCKET_FIXPRI) { + if (sched_clutch_bucket_is_above_timeshare(bucket) == false) { /* Remove the timeshare clutch bucket from the globally runnable clutch_bucket list */ remqueue(&clutch_bucket->scb_listlink); } @@ -1773,8 +1798,9 @@ sched_clutch_bucket_runnable( clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); sched_clutch_bucket_hierarchy_insert(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options); - /* Update the timesharing properties of this clutch_bucket; also done every sched_tick */ - sched_clutch_bucket_group_timeshare_update(clutch_bucket->scb_group, clutch_bucket, timestamp); + /* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */ + sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group); + int16_t root_old_pri = root_clutch->scr_priority; sched_clutch_root_pri_update(root_clutch); return root_clutch->scr_priority > root_old_pri; @@ -1844,8 +1870,13 @@ sched_clutch_bucket_empty( sched_clutch_bucket_options_t options) { sched_clutch_hierarchy_locked_assert(root_clutch); + assert3u(clutch_bucket->scb_thr_count, ==, 0); sched_clutch_bucket_hierarchy_remove(root_clutch, clutch_bucket, clutch_bucket->scb_bucket, timestamp, options); - clutch_bucket->scb_priority = sched_clutch_bucket_pri_calculate(clutch_bucket, timestamp); + + /* Update the timesharing properties of this clutch_bucket_group; also done every sched_tick */ + sched_clutch_bucket_group_pri_shift_update(clutch_bucket->scb_group); + + 
clutch_bucket->scb_priority = 0; sched_clutch_root_pri_update(root_clutch); } @@ -1878,7 +1909,7 @@ sched_clutch_bucket_group_cpu_usage_update( sched_clutch_bucket_group_t clutch_bucket_group, uint64_t delta) { - if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) { + if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) { /* Since Above UI bucket has maximum interactivity score always, nothing to do here */ return; } @@ -2017,45 +2048,27 @@ sched_clutch_run_bucket_decr( } /* - * sched_clutch_bucket_group_timeshare_update() + * sched_clutch_bucket_group_pri_shift_update() * - * Routine to update the load and priority shift for the clutch_bucket_group - * every sched_tick. For multi-cluster platforms, each QoS level will have multiple - * clutch buckets with runnable threads in them. So it is important to maintain - * the timesharing information at the clutch_bucket_group level instead of - * individual clutch buckets (because the algorithm is trying to timeshare all - * threads at the same QoS irrespective of which hierarchy they are enqueued in). - * - * The routine is called from the sched tick handling code to make sure this value - * is updated at least once every sched tick. For clutch bucket groups which have - * not been runnable for very long, the clutch_bucket_group maintains a "last - * updated schedtick" parameter. As threads become runnable in the clutch bucket group, - * if this value is outdated, the load and shifts are updated. - * - * Possible optimization: - * - The current algorithm samples the load every sched tick (125ms). - * This is prone to spikes in runnable counts; if that turns out to be - * a problem, a simple solution would be to do the EWMA trick to sample - * load at every load_tick (30ms) and use the averaged value for the pri - * shift calculation. + * Routine to update the priority shift for a clutch bucket group, + * necessary for timesharing correctly with priority decay within a + * thread group + QoS. */ static void -sched_clutch_bucket_group_timeshare_update( - sched_clutch_bucket_group_t clutch_bucket_group, - sched_clutch_bucket_t clutch_bucket, - uint64_t ctime) +sched_clutch_bucket_group_pri_shift_update( + sched_clutch_bucket_group_t clutch_bucket_group) { - if (clutch_bucket_group->scbg_bucket < TH_BUCKET_SHARE_FG) { + if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) { /* No timesharing needed for fixed priority Above UI threads */ return; } /* * Update the timeshare parameters for the clutch bucket group - * if they havent been updated in this tick. + * if they haven't been updated in this tick. */ uint32_t sched_ts = os_atomic_load(&clutch_bucket_group->scbg_timeshare_tick, relaxed); - uint32_t current_sched_ts = sched_tick; + uint32_t current_sched_ts = os_atomic_load(&sched_tick, relaxed); if (sched_ts < current_sched_ts) { os_atomic_store(&clutch_bucket_group->scbg_timeshare_tick, current_sched_ts, relaxed); /* NCPU wide workloads should not experience decay */ @@ -2067,7 +2080,42 @@ sched_clutch_bucket_group_timeshare_update( pri_shift = (pri_shift > SCHED_PRI_SHIFT_MAX) ? INT8_MAX : pri_shift; os_atomic_store(&clutch_bucket_group->scbg_pri_shift, pri_shift, relaxed); } +} +/* + * sched_clutch_bucket_group_timeshare_update() + * + * Routine to update the priority shift and priority for the clutch_bucket_group + * every sched_tick. For multi-cluster platforms, each QoS level will have multiple + * clutch buckets with runnable threads in them. 
So it is important to maintain + * the timesharing information at the clutch_bucket_group level instead of + * individual clutch buckets (because the algorithm is trying to timeshare all + * threads at the same QoS irrespective of which hierarchy they are enqueued in). + * + * The routine is called from the sched tick handling code to make sure this value + * is updated at least once every sched tick. For clutch bucket groups which have + * not been runnable for very long, the clutch_bucket_group maintains a "last + * updated schedtick" parameter. As threads become runnable in the clutch bucket group, + * if this value is outdated, we update the priority shift. + * + * Possible optimization: + * - The current algorithm samples the load at most once every sched tick (125ms). + * This is prone to spikes in runnable counts; if that turns out to be + * a problem, a simple solution would be to do the EWMA trick to sample + * load at every load_tick (30ms) and use the averaged value for the pri + * shift calculation. + */ +static void +sched_clutch_bucket_group_timeshare_update( + sched_clutch_bucket_group_t clutch_bucket_group, + sched_clutch_bucket_t clutch_bucket, + uint64_t ctime) +{ + if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) { + /* No timesharing needed for fixed priority Above UI threads */ + return; + } + sched_clutch_bucket_group_pri_shift_update(clutch_bucket_group); /* * Update the clutch bucket priority; this allows clutch buckets that have been pending * for a long time to get an updated interactivity score. @@ -2075,6 +2123,29 @@ sched_clutch_bucket_group_timeshare_update( sched_clutch_bucket_update(clutch_bucket, clutch_bucket->scb_root, ctime, SCHED_CLUTCH_BUCKET_OPTIONS_NONE); } +/* + * Calculate the CPU used by this thread and attribute it to the + * thread's current scheduling bucket and clutch bucket group, or + * a previous clutch bucket group if specified. + * Also update the general scheduler CPU usage, matching + * what we do for lightweight_update_priority(). + */ +static inline void +sched_clutch_thread_tick_delta(thread_t thread, sched_clutch_bucket_group_t _Nullable clutch_bucket_group) +{ + uint32_t cpu_delta; + sched_tick_delta(thread, cpu_delta); + if (thread->pri_shift < INT8_MAX) { + thread->sched_usage += cpu_delta; + } + thread->cpu_delta += cpu_delta; + if (clutch_bucket_group != NULL) { + sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta); + } else { + sched_clutch_cpu_usage_update(thread, cpu_delta); + } +} + /* * sched_clutch_thread_clutch_update() * @@ -2091,26 +2162,17 @@ sched_clutch_thread_clutch_update( sched_clutch_t old_clutch, sched_clutch_t new_clutch) { - uint32_t cpu_delta; - if (old_clutch) { assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN); sched_clutch_run_bucket_decr(old_clutch, thread->th_sched_bucket); - /* - * Calculate the CPU used by this thread in the old bucket and - * add it to the old clutch bucket. This uses the same CPU usage - * logic as update_priority etc. 
- */ - sched_tick_delta(thread, cpu_delta); - if (thread->pri_shift < INT8_MAX) { - thread->sched_usage += cpu_delta; - } - thread->cpu_delta += cpu_delta; + + /* Attribute CPU usage with the old clutch */ + sched_clutch_bucket_group_t old_clutch_bucket_group = NULL; if (!SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) { - sched_clutch_bucket_group_t clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]); - sched_clutch_bucket_group_cpu_usage_update(clutch_bucket_group, cpu_delta); + old_clutch_bucket_group = &(old_clutch->sc_clutch_groups[thread->th_sched_bucket]); } + sched_clutch_thread_tick_delta(thread, old_clutch_bucket_group); } if (new_clutch) { @@ -2155,6 +2217,12 @@ sched_edge_bound_thread_insert( run_queue_enqueue(&root_bucket->scrb_bound_thread_runq, thread, options); thread->th_bound_cluster_enqueued = true; + /* + * Trigger an update to the thread's clutch bucket group's priority shift parameters, + * needed for global timeshare within a clutch bucket group. + */ + sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread)); + /* Increment the urgency counter for the root if necessary */ sched_clutch_root_urgency_inc(root_clutch, thread); @@ -2182,6 +2250,12 @@ sched_edge_bound_thread_remove( sched_clutch_root_bucket_empty(root_bucket, root_clutch, mach_absolute_time()); } sched_clutch_root_pri_update(root_clutch); + + /* + * Trigger an update to the thread's clutch bucket group's priority shift parameters, + * needed for global timeshare within a clutch bucket group. + */ + sched_clutch_bucket_group_pri_shift_update(sched_clutch_bucket_group_for_thread(thread)); } /* @@ -2520,7 +2594,7 @@ sched_clutch_bucket_group_interactivity_score_calculate( sched_clutch_bucket_group_t clutch_bucket_group, uint64_t timestamp) { - if (clutch_bucket_group->scbg_bucket == TH_BUCKET_FIXPRI) { + if (sched_clutch_bucket_is_above_timeshare(clutch_bucket_group->scbg_bucket)) { /* * Since the root bucket selection algorithm for Above UI looks at clutch bucket * priorities, make sure all AboveUI buckets are marked interactive. 
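A minimal sketch (not part of the patch) of the CPU-accounting pattern the new sched_tick_delta() comment asks consumers to follow, mirroring sched_clutch_thread_tick_delta() above; the helper name charge_thread_cpu() is hypothetical and only for illustration.

/*
 * Sketch only: how a scheduler policy is expected to consume the delta
 * produced by sched_tick_delta(), per the comment added to osfmk/kern/sched.h.
 * The helper name is hypothetical; sched_clutch_thread_tick_delta() above is
 * the real in-tree example.
 */
static inline void
charge_thread_cpu(thread_t thread, sched_clutch_bucket_group_t group)
{
	uint32_t cpu_delta;

	/* Snapshot the CPU time consumed since the last call */
	sched_tick_delta(thread, cpu_delta);

	/* Timeshare threads accumulate usage that drives priority decay */
	if (thread->pri_shift < INT8_MAX) {
		thread->sched_usage += cpu_delta;
	}
	thread->cpu_delta += cpu_delta;

	/* Policy-specific accounting, here the clutch bucket group */
	if (group != NULL) {
		sched_clutch_bucket_group_cpu_usage_update(group, cpu_delta);
	}
}

The pri_shift < INT8_MAX check mirrors the pri_shift update above, where INT8_MAX marks a clutch bucket group whose load is NCPU-wide and therefore exempt from decay.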
@@ -3112,6 +3186,7 @@ const struct sched_dispatch_table sched_clutch_dispatch = { .processor_init = sched_clutch_processor_init, .pset_init = sched_clutch_pset_init, .choose_thread = sched_clutch_choose_thread, + .steal_thread_enabled = sched_steal_thread_enabled, .steal_thread = sched_clutch_steal_thread, .processor_enqueue = sched_clutch_processor_enqueue, .processor_queue_remove = sched_clutch_processor_queue_remove, @@ -3129,16 +3204,18 @@ const struct sched_dispatch_table sched_clutch_dispatch = { .update_thread_bucket = sched_clutch_update_thread_bucket, .cpu_init_completed = NULL, .thread_eligible_for_pset = NULL, + + .rt_choose_processor = sched_rt_choose_processor, + .rt_steal_thread = NULL, + .rt_init_pset = sched_rt_init_pset, + .rt_init_completed = sched_rt_init_completed, + .rt_runq_count_sum = sched_rt_runq_count_sum, + #if !SCHED_TEST_HARNESS .maintenance_continuation = sched_timeshare_maintenance_continue, - .steal_thread_enabled = sched_steal_thread_enabled, .compute_timeshare_priority = sched_compute_timeshare_priority, .choose_node = sched_choose_node, -#if CONFIG_SCHED_SMT - .choose_processor = choose_processor_smt, -#else /* CONFIG_SCHED_SMT */ .choose_processor = choose_processor, -#endif .processor_queue_shutdown = sched_clutch_processor_queue_shutdown, .can_update_priority = can_update_priority, .update_priority = update_priority, @@ -3147,12 +3224,6 @@ const struct sched_dispatch_table sched_clutch_dispatch = { .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum, .thread_update_scan = sched_clutch_thread_update_scan, .processor_balance = sched_SMT_balance, - .rt_runq = sched_rtlocal_runq, - .rt_init = sched_rtlocal_init, - .rt_queue_shutdown = sched_rtlocal_queue_shutdown, - .rt_runq_scan = sched_rtlocal_runq_scan, - .rt_runq_count_sum = sched_rtlocal_runq_count_sum, - .rt_steal_thread = sched_rtlocal_steal_thread, .qos_max_parallelism = sched_qos_max_parallelism, .check_spill = sched_check_spill, .ipi_policy = sched_ipi_policy, @@ -3161,6 +3232,9 @@ const struct sched_dispatch_table sched_clutch_dispatch = { .run_count_decr = sched_clutch_run_decr, .pset_made_schedulable = sched_pset_made_schedulable, .thread_group_recommendation_change = sched_clutch_thread_group_recommendation_change, + + .rt_queue_shutdown = sched_rt_queue_shutdown, + .rt_runq_scan = sched_rt_runq_scan, #endif /* !SCHED_TEST_HARNESS */ }; @@ -3518,7 +3592,7 @@ sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context) } thread = processor->idle_thread; - if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) { + if (thread != THREAD_NULL && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) { if (thread_update_add_thread(thread) == FALSE) { restart_needed = TRUE; break; @@ -3577,6 +3651,24 @@ sched_clutch_thread_update_scan(sched_update_scan_context_t scan_context) } while (restart_needed); } +/* + * For threads that have changed sched_pri without changing the + * base_pri for any reason other than decay, use the sched_pri + * as the bucketizing priority instead of base_pri. All such + * changes are typically due to kernel locking primitives boosts + * or demotions. 
+ */ +static boolean_t +sched_thread_sched_pri_promoted(thread_t thread) +{ + return (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) || + (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) || + (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || + (thread->kern_promotion_schedpri != 0); +} + +#endif /* !SCHED_TEST_HARNESS */ + /* * For the clutch scheduler, the run counts are maintained in the clutch * buckets (i.e thread group scheduling structure). @@ -3599,24 +3691,6 @@ sched_clutch_run_decr(thread_t thread) return new_count; } -/* - * For threads that have changed sched_pri without changing the - * base_pri for any reason other than decay, use the sched_pri - * as the bucketizing priority instead of base_pri. All such - * changes are typically due to kernel locking primitives boosts - * or demotions. - */ -static boolean_t -sched_thread_sched_pri_promoted(thread_t thread) -{ - return (thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) || - (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) || - (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || - (thread->kern_promotion_schedpri != 0); -} - -#endif /* !SCHED_TEST_HARNESS */ - /* * Routine to update the scheduling bucket for the thread. * @@ -3647,8 +3721,16 @@ sched_clutch_update_thread_bucket(thread_t thread) return; } + /* Bypass accounting CPU usage for a newly created thread */ + if (old_bucket != TH_BUCKET_RUN) { + /* Attribute CPU usage with the old scheduling bucket */ + sched_clutch_thread_tick_delta(thread, NULL); + } + + /* Transition to the new sched_bucket */ thread->th_sched_bucket = new_bucket; thread->pri_shift = sched_clutch_thread_pri_shift(thread, new_bucket); + /* * Since this is called after the thread has been removed from the runq, * only the run counts need to be updated. The re-insert into the runq @@ -3692,7 +3774,10 @@ static void sched_edge_processor_queue_shutdown(processor_t processor); static processor_t -sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread); +sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout); + +static void +sched_edge_quantum_expire(thread_t thread); static bool sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason); @@ -3727,6 +3812,12 @@ sched_edge_qos_max_parallelism(int qos, uint64_t options); static uint32_t sched_edge_cluster_load_metric(processor_set_t pset, sched_bucket_t sched_bucket); +static uint32_t +sched_edge_run_count_incr(thread_t thread); + +static bool +sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset); + const struct sched_dispatch_table sched_edge_dispatch = { .sched_name = "edge", .init = sched_edge_init, @@ -3758,6 +3849,13 @@ const struct sched_dispatch_table sched_edge_dispatch = { .update_thread_bucket = sched_clutch_update_thread_bucket, .cpu_init_completed = sched_edge_cpu_init_completed, .thread_eligible_for_pset = sched_edge_thread_eligible_for_pset, + + .rt_choose_processor = sched_rt_choose_processor, + .rt_steal_thread = sched_rt_steal_thread, + .rt_init_pset = sched_rt_init_pset, + .rt_init_completed = sched_rt_init_completed, + .rt_runq_count_sum = sched_rt_runq_count_sum, + #if !SCHED_TEST_HARNESS .maintenance_continuation = sched_timeshare_maintenance_continue, .compute_timeshare_priority = sched_compute_timeshare_priority, @@ -3766,19 +3864,16 @@ const struct sched_dispatch_table sched_edge_dispatch = { .can_update_priority = can_update_priority, .update_priority = update_priority, 
.lightweight_update_priority = lightweight_update_priority, - .quantum_expire = sched_default_quantum_expire, + .quantum_expire = sched_edge_quantum_expire, .processor_runq_stats_count_sum = sched_clutch_runq_stats_count_sum, .thread_update_scan = sched_clutch_thread_update_scan, - .rt_runq = sched_rtlocal_runq, - .rt_init = sched_rtlocal_init, - .rt_queue_shutdown = sched_rtlocal_queue_shutdown, - .rt_runq_scan = sched_rtlocal_runq_scan, - .rt_runq_count_sum = sched_rtlocal_runq_count_sum, - .rt_steal_thread = sched_rtlocal_steal_thread, - .run_count_incr = sched_clutch_run_incr, + .run_count_incr = sched_edge_run_count_incr, .run_count_decr = sched_clutch_run_decr, .pset_made_schedulable = sched_edge_pset_made_schedulable, .thread_group_recommendation_change = NULL, + + .rt_queue_shutdown = sched_rt_queue_shutdown, + .rt_runq_scan = sched_rt_runq_scan, #endif /* !SCHED_TEST_HARNESS */ }; @@ -3803,7 +3898,9 @@ sched_edge_thread_bound_cluster_id(thread_t thread) static boolean_t sched_edge_foreign_runnable_thread_available(processor_set_t pset); static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset); static processor_set_t sched_edge_steal_candidate(processor_set_t pset); -static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks); +static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out, sched_options_t *options_inout); + +static_assert(sizeof(sched_clutch_edge) == sizeof(uint64_t), "sched_clutch_edge fits in 64 bits"); /* * sched_edge_config_set() @@ -3812,10 +3909,9 @@ static processor_set_t sched_edge_migrate_candidate(processor_set_t preferred_ps * policies in the scheduler. */ static void -sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_edge edge_config) +sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket, sched_clutch_edge edge_config) { - sched_clutch_edge *edge = &pset_array[src_cluster]->sched_edges[dst_cluster]; - edge->sce_edge_packed = edge_config.sce_edge_packed; + os_atomic_store(&pset_array[src_cluster]->sched_edges[dst_cluster][bucket], edge_config, relaxed); } /* @@ -3825,53 +3921,124 @@ sched_edge_config_set(uint32_t src_cluster, uint32_t dst_cluster, sched_clutch_e * if it needs to update edges. */ static sched_clutch_edge -sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster) +sched_edge_config_get(uint32_t src_cluster, uint32_t dst_cluster, sched_bucket_t bucket) { - return pset_array[src_cluster]->sched_edges[dst_cluster]; + return os_atomic_load(&pset_array[src_cluster]->sched_edges[dst_cluster][bucket], relaxed); } +/* + * sched_edge_config_pset_push() + * + * After using sched_edge_config_set() to update edge tunables outgoing from a particular source + * pset, this function should be called in order to propagate the updates to derived metadata for + * the pset, such as search orders for outgoing spill and steal. 
+ */ +static void +sched_edge_config_pset_push(uint32_t src_pset_id) +{ + processor_set_t src_pset = pset_array[src_pset_id]; + uint8_t search_order_len = sched_edge_max_clusters - 1; + sched_pset_search_order_sort_data_t search_order_datas[MAX_PSETS - 1]; + for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + uint8_t dst_pset_id = 0; + for (int i = 0; i < search_order_len; i++, dst_pset_id++) { + if (dst_pset_id == src_pset->pset_id) { + dst_pset_id++; + } + search_order_datas[i].spsosd_src_pset = src_pset; + search_order_datas[i].spsosd_dst_pset_id = dst_pset_id; + sched_clutch_edge edge = sched_edge_config_get(src_pset->pset_id, dst_pset_id, bucket); + search_order_datas[i].spsosd_migration_weight = edge.sce_migration_allowed ? + edge.sce_migration_weight : UINT32_MAX; + } + sched_pset_search_order_compute(&src_pset->spill_search_order[bucket], + search_order_datas, search_order_len, sched_edge_search_order_weight_then_locality_cmp); + } +} + +static int +sched_edge_search_order_weight_then_locality(const void *a, const void *b) +{ + const sched_pset_search_order_sort_data_t *data_a = (const sched_pset_search_order_sort_data_t *)a; + const sched_pset_search_order_sort_data_t *data_b = (const sched_pset_search_order_sort_data_t *)b; + assert3p(data_a->spsosd_src_pset, ==, data_b->spsosd_src_pset); + assert3u(data_a->spsosd_dst_pset_id, !=, data_b->spsosd_dst_pset_id); + /* + * Sort based on lowest edge migration weight, followed by die-local psets + * first, followed by lowest pset id. + */ + if (data_a->spsosd_migration_weight != data_b->spsosd_migration_weight) { + return (data_a->spsosd_migration_weight < data_b->spsosd_migration_weight) ? -1 : 1; + } + + bool is_local_a = bitmap_test(data_a->spsosd_src_pset->local_psets, data_a->spsosd_dst_pset_id); + bool is_local_b = bitmap_test(data_b->spsosd_src_pset->local_psets, data_b->spsosd_dst_pset_id); + if (is_local_a != is_local_b) { + return is_local_a ? -1 : 1; + } + + if (data_a->spsosd_dst_pset_id != data_b->spsosd_dst_pset_id) { + return (data_a->spsosd_dst_pset_id < data_b->spsosd_dst_pset_id) ? -1 : 1; + } + return 0; +} + +cmpfunc_t sched_edge_search_order_weight_then_locality_cmp = &sched_edge_search_order_weight_then_locality; + /* * sched_edge_matrix_set() * - * Routine to update various edges in the cluster edge matrix. The edge_changes_bitmap - * indicates which edges need to be updated. Both the edge_matrix & edge_changes_bitmap - * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array. + * Routine to update various edges in the edge migration graph. The edge_changed array + * indicates which edges need to be updated. Both the edge_matrix and edge_changed arrays + * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a + * single-dimensional array. 
*/ void -sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, __unused uint64_t flags, uint64_t matrix_order) +sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changed, __unused uint64_t flags, + __assert_only uint64_t num_psets) { + assert3u(num_psets, ==, sched_edge_max_clusters); uint32_t edge_index = 0; - for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) { - for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) { - if (edge_changes_bitmap[edge_index]) { - sched_edge_config_set(src_cluster, dst_cluster, edge_matrix[edge_index]); + for (uint32_t src_cluster = 0; src_cluster < sched_edge_max_clusters; src_cluster++) { + for (uint32_t dst_cluster = 0; dst_cluster < sched_edge_max_clusters; dst_cluster++) { + for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + if (edge_changed[edge_index]) { + sched_edge_config_set(src_cluster, dst_cluster, bucket, edge_matrix[edge_index]); + } + edge_index++; } - edge_index++; } + sched_edge_config_pset_push(src_cluster); } } /* * sched_edge_matrix_get() * - * Routine to retrieve various edges in the cluster edge matrix. The edge_request_bitmap - * indicates which edges need to be retrieved. Both the edge_matrix & edge_request_bitmap - * are MAX_PSETS * MAX_PSETS matrices flattened into a single dimensional array. + * Routine to retrieve various edges in the edge migration graph. The edge_requested array + * indicates which edges need to be retrieved. Both the edge_matrix and edge_requested arrays + * are matrices with dimension num_psets * num_psets * TH_BUCKET_SCHED_MAX, flattened into a + * single-dimensional array. */ void -sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, __unused uint64_t flags, uint64_t matrix_order) +sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_requested, __unused uint64_t flags, + __assert_only uint64_t num_psets) { + assert3u(num_psets, ==, sched_edge_max_clusters); uint32_t edge_index = 0; - for (uint32_t src_cluster = 0; src_cluster < matrix_order; src_cluster++) { - for (uint32_t dst_cluster = 0; dst_cluster < matrix_order; dst_cluster++) { - if (edge_request_bitmap[edge_index]) { - edge_matrix[edge_index] = sched_edge_config_get(src_cluster, dst_cluster); + for (uint32_t src_pset = 0; src_pset < sched_edge_max_clusters; src_pset++) { + for (uint32_t dst_pset = 0; dst_pset < sched_edge_max_clusters; dst_pset++) { + for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + if (edge_requested[edge_index]) { + edge_matrix[edge_index] = sched_edge_config_get(src_pset, dst_pset, bucket); + } + edge_index++; } - edge_index++; } } } + /* * sched_edge_init() * @@ -3892,14 +4059,21 @@ static void sched_edge_pset_init(processor_set_t pset) { uint32_t pset_cluster_id = pset->pset_cluster_id; - pset->pset_type = (pset->pset_cluster_type == PSET_AMP_P) ? 
CLUSTER_TYPE_P : CLUSTER_TYPE_E; + pset->pset_type = pset_cluster_type_to_cluster_type(pset->pset_cluster_type); + /* Each pset must declare an AMP type */ + assert(pset->pset_type != CLUSTER_TYPE_SMP); /* Set the edge weight and properties for the pset itself */ bitmap_clear(pset->foreign_psets, pset_cluster_id); bitmap_clear(pset->native_psets, pset_cluster_id); bitmap_clear(pset->local_psets, pset_cluster_id); bitmap_clear(pset->remote_psets, pset_cluster_id); - pset->sched_edges[pset_cluster_id].sce_edge_packed = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}.sce_edge_packed; + bzero(&pset->sched_edges, sizeof(pset->sched_edges)); + bzero(&pset->max_parallel_cores, sizeof(pset->max_parallel_cores)); + bzero(&pset->max_parallel_clusters, sizeof(pset->max_parallel_cores)); + for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + sched_pset_search_order_init(pset, &pset->spill_search_order[bucket]); + } sched_clutch_root_init(&pset->pset_clutch_root, pset); bitmap_set(sched_edge_available_pset_bitmask, pset_cluster_id); } @@ -4092,9 +4266,13 @@ sched_edge_steal_candidate(processor_set_t pset) if (candidate_pset == NULL) { continue; } - sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[dst_cluster_id]; - if (incoming_edge->sce_steal_allowed && (bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX) != -1)) { - return candidate_pset; + int highest_bucket = bitmap_lsb_first(candidate_pset->pset_clutch_root.scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX); + if (highest_bucket != -1) { + /* Assumes that higher root buckets have the less restrictive sce_steal_allowed edges */ + sched_clutch_edge edge = sched_edge_config_get(cluster_id, dst_cluster_id, highest_bucket); + if (edge.sce_steal_allowed) { + return candidate_pset; + } } } return NULL; @@ -4200,10 +4378,10 @@ static boolean_t sched_edge_foreign_running_thread_available(processor_set_t pset) { bitmap_t *foreign_pset_bitmap = pset->foreign_psets; - int cluster = -1; - while ((cluster = sched_edge_iterate_clusters_ordered(pset, foreign_pset_bitmap[0], cluster)) != -1) { + sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT; + while (sched_iterate_psets_ordered(pset, &pset->spill_search_order[0], foreign_pset_bitmap[0], &istate)) { /* Skip the pset if its not schedulable */ - processor_set_t target_pset = pset_array[cluster]; + processor_set_t target_pset = pset_array[istate.spis_pset_id]; if (pset_is_recommended(target_pset) == false) { continue; } @@ -4240,13 +4418,17 @@ sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_p return false; } - if (idle_pset->pset_type == candidate_pset->pset_type) { - /* Always allow stealing from homogeneous clusters */ - *bucket_for_steal = (sched_bucket_t)highest_runnable_bucket; - return true; - } - for (int unbound_qos = highest_runnable_bucket; unbound_qos >= 0; unbound_qos = bitmap_lsb_next(candidate_clutch_root->scr_unbound_runnable_bitmap, TH_BUCKET_SCHED_MAX, unbound_qos)) { + /* Confirm we are allowed to steal across the edge at this QoS */ + sched_clutch_edge edge = sched_edge_config_get(candidate_pset->pset_cluster_id, idle_pset->pset_cluster_id, unbound_qos); + if (edge.sce_steal_allowed == false) { + continue; + } + if (edge.sce_migration_weight == 0) { + /* Allow free stealing across a zero edge weight, even with idle cores in the candidate pset */ + *bucket_for_steal = (sched_bucket_t)unbound_qos; + return 
true; + } uint32_t candidate_runq_depth = os_atomic_load(&candidate_pset->pset_runnable_depth[unbound_qos], relaxed); if (candidate_runq_depth > pset_available_cpu_count(candidate_pset)) { /* Candidate cluster has excess load at this QoS (and at least one unbound thread we can steal!) */ @@ -4254,7 +4436,7 @@ sched_edge_steal_possible(processor_set_t idle_pset, processor_set_t candidate_p return true; } } - /* None of the unbound root buckets are overloaded */ + /* None of the unbound root buckets are available for steal */ return false; } @@ -4270,14 +4452,21 @@ sched_edge_steal_thread(processor_set_t pset, uint64_t candidate_pset_bitmap) * greater than the edge weight. Maybe it should have a more advanced version * which looks for the maximum delta etc. */ - int cluster_id = -1; - while ((cluster_id = sched_edge_iterate_clusters_ordered(pset, candidate_pset_bitmap, cluster_id)) != -1) { - processor_set_t steal_from_pset = pset_array[cluster_id]; + sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT; + while (sched_iterate_psets_ordered(pset, &pset->spill_search_order[0], candidate_pset_bitmap, &istate)) { + processor_set_t steal_from_pset = pset_array[istate.spis_pset_id]; if (steal_from_pset == NULL) { continue; } - sched_clutch_edge *incoming_edge = &pset_array[cluster_id]->sched_edges[pset->pset_cluster_id]; - if (incoming_edge->sce_steal_allowed == false) { + bool steal_allowed = false; + for (sched_bucket_t bucket = TH_BUCKET_FIXPRI; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + sched_clutch_edge edge = sched_edge_config_get(istate.spis_pset_id, pset->pset_cluster_id, bucket); + if (edge.sce_steal_allowed) { + steal_allowed = true; + break; + } + } + if (steal_allowed == false) { continue; } pset_lock(steal_from_pset); @@ -4390,6 +4579,427 @@ sched_edge_shared_rsrc_migrate_possible(thread_t thread, processor_set_t preferr return false; } +/* + * Stir-the-pot Registry: + * + * Global state tracking which cores currently have threads that + * are ready to be stirred onto cores of the opposite type. + * + * The registry state updates are implemented with atomic transaction + * operations rather than a global lock, in order to avoid the cost + * of serializing some of the most frequent registry state update + * callsites that depend on consistent speed--namely the + * preemption check and context-switch paths. The most expensive + * state update, in sched_edge_stir_the_pot_try_trigger_swap(), only + * happens at quantum expiration, which should allow cheaper + * operations at other callsites to win the race. 
+ */ +typedef unsigned __int128 sched_edge_stp_registry_t; +_Atomic sched_edge_stp_registry_t sched_edge_stir_the_pot_global_registry = 0LL; +#define SESTP_BITS_PER_CORE (2) +#define SESTP_BIT_POS(cpu_id) ((sched_edge_stp_registry_t)(cpu_id * SESTP_BITS_PER_CORE)) +#define SESTP_MASK(cpu_id) ((sched_edge_stp_registry_t)mask(SESTP_BITS_PER_CORE) << SESTP_BIT_POS(cpu_id)) +static_assert((SESTP_BITS_PER_CORE * MAX_CPUS) <= (sizeof(sched_edge_stp_registry_t) * 8), + "Global registry must fit per-core bits for each core"); + +#define SESTP_EXTRACT_STATE(registry, cpu_id) ((registry >> SESTP_BIT_POS(cpu_id)) & mask(SESTP_BITS_PER_CORE)) +#define SESTP_SET_STATE(registry, cpu_id, state) ((registry & ~SESTP_MASK(cpu_id)) | ((sched_edge_stp_registry_t)state << SESTP_BIT_POS(cpu_id))) +__enum_decl(sched_edge_stp_state_t, uint8_t, { + SCHED_EDGE_STP_NOT_WANT = 0, + SCHED_EDGE_STP_REQUESTED = 1, + SCHED_EDGE_STP_PENDING = 2, + SCHED_EDGE_STP_MAX = SCHED_EDGE_STP_PENDING +}); +static_assert(SCHED_EDGE_STP_MAX <= mask(SESTP_BITS_PER_CORE), + "Per-core stir-the-pot request state must fit in per-core bits"); + +#if OS_ATOMIC_USE_LLSC +#error "Expecting CAS implementation of os_atomic_rmw_loop()" +#endif /* OS_ATOMIC_USE_LLSC */ + +static cpumap_t sched_edge_p_core_map = 0ULL; +static cpumap_t sched_edge_non_p_core_map = 0ULL; + +/* + * In order to reduce the chance of picking the same CPUs over + * and over unfairly for stir-the-pot swaps, use an offset value + * for the lsb selection, which rotates by one index each time + * the choice is evaluated. + */ +static _Atomic uint64_t sched_edge_stp_selection_p_core_offset = 0; +static _Atomic uint64_t sched_edge_stp_selection_non_p_core_offset = 0; + +/* + * sched_edge_stir_the_pot_try_trigger_swap() + * + * Search for an eligible swap candidate on the opposite core + * type, and if one is found, initiate a swap for stir-the-pot. + * From a P-core, initiating means sending an inbox message and IPI + * to the swapping lower performance core. For initiating swap from + * a lower performance core, only an inbox message needs to be sent + * to itself, naming the P-core for swap. + * If no eligible candidate is found, mark the current processor + * as requesting stir-the-pot swap--that is unless a swap has already + * been initiated for this core, in which case we should sit tight. + * Thread lock must be held. + */ +static inline int +sched_edge_stir_the_pot_try_trigger_swap(thread_t thread) +{ + processor_t self_processor = current_processor(); + int self_cpu = self_processor->cpu_id; + /* + * Prepare the core mask of candidate cores (of the opposite type), + * and compute an offset where the candidate search should begin, + * to avoid unfairly swapping with the same cores repeatedly. 
+ */ + cpumap_t swap_candidates_map; + uint64_t offset; + if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) { + swap_candidates_map = sched_edge_non_p_core_map; + offset = os_atomic_inc_orig(&sched_edge_stp_selection_non_p_core_offset, relaxed); + } else { + swap_candidates_map = sched_edge_p_core_map; + offset = os_atomic_inc_orig(&sched_edge_stp_selection_p_core_offset, relaxed); + } + int num_candidates = bit_count(swap_candidates_map); + if (num_candidates == 0) { + /* Too early in boot, no cores of opposite type */ + return -1; + } + int cpu_of_type_offset_ind = offset % num_candidates; + int search_start_ind = lsb_first(swap_candidates_map); + for (int i = 0; i < cpu_of_type_offset_ind; i++) { + search_start_ind = lsb_next(swap_candidates_map, search_start_ind); + assert3s(search_start_ind, !=, -1); + } + assert3s(search_start_ind, !=, -1); + swap_candidates_map = bit_ror64(swap_candidates_map, search_start_ind); + /* + * Search the registry for candidate cores of the opposite type which + * have requested swap. + */ + int swap_cpu; + sched_edge_stp_registry_t old_registry, new_registry, intermediate_registry; + sched_edge_stp_state_t self_state; + /* BEGIN IGNORE CODESTYLE */ + os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry, + old_registry, new_registry, relaxed, { + swap_cpu = -1; + self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu); + if (self_state == SCHED_EDGE_STP_PENDING) { + /* + * Another core already initiated a swap with us, so we should + * wait for that one to finish rather than initiate or request + * a new one. + */ + os_atomic_rmw_loop_give_up(break); + } + /* Scan candidates */ + for (int rotid = lsb_first(swap_candidates_map); rotid != -1; rotid = lsb_next(swap_candidates_map, rotid)) { + int candidate_cpu = (rotid + search_start_ind) % 64; // un-rotate the bit + sched_edge_stp_state_t candidate_state = SESTP_EXTRACT_STATE(old_registry, candidate_cpu); + if (candidate_state == SCHED_EDGE_STP_REQUESTED) { + sched_bucket_t candidate_qos = os_atomic_load( + &processor_array[candidate_cpu]->processor_set->cpu_running_buckets[candidate_cpu], relaxed); + if (candidate_qos == thread->th_sched_bucket) { + /* Found a requesting candidate of matching QoS */ + swap_cpu = candidate_cpu; + break; + } + } + } + if (swap_cpu == -1) { + /* No candidates requesting swap, so mark this core as requesting */ + intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED); + } else { + /* + * Mark candidate core as selected/pending for swap, and mark + * current CPU as not needing a swap anymore, since we will now + * start one. 
+ */ + intermediate_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_PENDING); + intermediate_registry = SESTP_SET_STATE(intermediate_registry, swap_cpu, SCHED_EDGE_STP_PENDING); + } + new_registry = intermediate_registry; + }); + /* END IGNORE CODESTYLE */ + /* Leave debug tracepoints for tracking any updates to registry state */ + if (self_state != SCHED_EDGE_STP_PENDING) { + if (swap_cpu == -1) { + if (self_state != SCHED_EDGE_STP_REQUESTED) { + /* Now requesting */ + KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | + DBG_FUNC_START, 0, self_cpu, cpu_of_type_offset_ind, 0); + } + } else { + if (self_state == SCHED_EDGE_STP_REQUESTED) { + /* Now pending */ + KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | + DBG_FUNC_END, 1, self_cpu, cpu_of_type_offset_ind, 0); + } + int swap_state = SESTP_EXTRACT_STATE(old_registry, swap_cpu); + if (swap_state == SCHED_EDGE_STP_REQUESTED) { + /* Swap core now pending */ + KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | + DBG_FUNC_END, 1, swap_cpu, cpu_of_type_offset_ind, 0); + } + } + } + if (swap_cpu != -1) { + /* Initiate a stir-the-pot swap */ + assert3s(swap_cpu, <, ml_get_topology_info()->num_cpus); + assert3s(swap_cpu, !=, self_processor->cpu_id); + processor_t swap_processor = processor_array[swap_cpu]; + if (swap_processor == PROCESSOR_NULL) { + /* Unlikely early boot initialization race */ + return -1; + } + assert3u(sched_edge_stir_the_pot_core_type_is_desired(swap_processor->processor_set), !=, + sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)); + if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) { + /* + * Send a message and IPI notification to the lower-performance + * core we found which wants to swap, so it will know to send its + * thread back here. + */ + os_atomic_store(&swap_processor->stir_the_pot_inbox_cpu, self_cpu, relaxed); + processor_set_t swap_pset = swap_processor->processor_set; + pset_lock(swap_pset); + sched_ipi_type_t ipi_type = sched_ipi_action(swap_processor, NULL, + SCHED_IPI_EVENT_REBALANCE); + pset_unlock(swap_pset); + sched_ipi_perform(swap_processor, ipi_type); + } else { + /* + * Send message to self to send this thread to the swap P-core. P-core + * will clear its own pending state upon committing to the incoming swap + * thread after that happens. + */ + os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, swap_cpu, relaxed); + } + } + KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE, + (swap_cpu != -1) ? 1 : 0, swap_cpu, old_registry, cpu_of_type_offset_ind); + return swap_cpu; +} + +/* + * sched_edge_stir_the_pot_clear_registry_entry() + * + * Mark the current CPU as NOT containing a thread which is eligible + * to be swapped for stir-the-pot. + * Preemption must be disabled.
+ */ +void +sched_edge_stir_the_pot_clear_registry_entry(void) +{ + int self_cpu = current_processor()->cpu_id; + sched_edge_stp_state_t self_state; + sched_edge_stp_registry_t old_registry, new_registry; + os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry, + old_registry, new_registry, relaxed, { + self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu); + if (self_state == SCHED_EDGE_STP_NOT_WANT) { + /* State already cleared, nothing to be done */ + os_atomic_rmw_loop_give_up(break); + } + new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_NOT_WANT); + }); + if (self_state == SCHED_EDGE_STP_REQUESTED) { + /* Request was cleared */ + KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_END, + 2, self_cpu, 0, 0); + } +} + +/* + * sched_edge_stir_the_pot_set_registry_entry() + * + * Mark the current CPU as containing a thread which is eligible + * to be swapped to a core of the opposite type for stir-the-pot. + * Preemption must be disabled. + */ +static inline void +sched_edge_stir_the_pot_set_registry_entry(void) +{ + int self_cpu = current_processor()->cpu_id; + sched_edge_stp_state_t self_state; + sched_edge_stp_registry_t old_registry, new_registry; + bool newly_requested = os_atomic_rmw_loop(&sched_edge_stir_the_pot_global_registry, + old_registry, new_registry, relaxed, { + self_state = SESTP_EXTRACT_STATE(old_registry, self_cpu); + if (self_state == SCHED_EDGE_STP_REQUESTED) { + /* Core already registered, nothing to be done */ + os_atomic_rmw_loop_give_up(break); + } + new_registry = SESTP_SET_STATE(old_registry, self_cpu, SCHED_EDGE_STP_REQUESTED); + }); + if (newly_requested) { + KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_START, + 3, self_cpu, self_state, 0); + } +} + +/* Stir-the-pot is designed for sharing time on the P-cores */ +static inline bool +sched_edge_stir_the_pot_core_type_is_desired(processor_set_t pset) +{ + return pset->pset_type == CLUSTER_TYPE_P; +} + +/* + * sched_edge_stir_the_pot_thread_eligible() + * + * Determine whether a thread is eligible to engage in a + * stir-the-pot swap. It must be P-recommended, unbound, and not + * round-robin shared resource. Additionally, it must have already + * expired quantum on its current core type. + */ +static inline bool +sched_edge_stir_the_pot_thread_eligible(thread_t thread) +{ + processor_set_t preferred_pset; + if ((thread == THREAD_NULL) || + ((preferred_pset = pset_array[sched_edge_thread_preferred_cluster(thread)]) == PROCESSOR_SET_NULL)) { + /* Still initializing at boot */ + return false; + } + cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread); + bool right_kind_of_thread = + sched_edge_stir_the_pot_core_type_is_desired(preferred_pset) && + (thread->sched_mode != TH_MODE_REALTIME) && + ((thread->state & TH_IDLE) == 0) && + SCHED_CLUTCH_THREAD_ELIGIBLE(thread) && + (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false) && + (shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NONE || + shared_rsrc_type == CLUSTER_SHARED_RSRC_TYPE_NATIVE_FIRST); + bool ready_for_swap = sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set) ? 
thread->th_expired_quantum_on_higher_core : + thread->th_expired_quantum_on_lower_core; + return right_kind_of_thread && ready_for_swap; +} + +/* + * sched_edge_stir_the_pot_check_inbox_for_thread() + * + * Check whether this thread on a non-P-core has been chosen by a P-core to + * swap places for stir-the-pot, optionally consuming the inbox message. + * Preemption must be disabled. + */ +static inline int +sched_edge_stir_the_pot_check_inbox_for_thread(thread_t thread, bool consume_message) +{ + processor_t self_processor = current_processor(); + int dst_cpu = -1; + if (sched_edge_stir_the_pot_thread_eligible(thread)) { + /* Thread can accept the inbox message */ + dst_cpu = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed); + } else { + /* Ensure registry state is cleared for ineligible thread, if it hasn't been already */ + sched_edge_stir_the_pot_clear_registry_entry(); + /* + * Note, we don't clear a possible inbox message, in case an eligible + * thread comes back on-core quickly to receive it. + */ + } + if (consume_message) { + /* + * Unconditionally clear inbox, since either we are triggering a + * swap now or ultimately discarding the message because conditions + * have changed (thread not eligible). + */ + os_atomic_store(&self_processor->stir_the_pot_inbox_cpu, -1, relaxed); + /* + * We may have delayed requesting stir-the-pot swap for the current thread + * due to a pending inbox message for the previous thread. Now that such + * a message has been received, finish updating the registry state. + */ + if (sched_edge_stir_the_pot_thread_eligible(self_processor->active_thread)) { + sched_edge_stir_the_pot_set_registry_entry(); + } + } + return dst_cpu; +} + +/* + * sched_edge_stir_the_pot_update_registry_state() + * + * Update stir-the-pot state for the current processor based on its + * (possibly new) current thread. This sets or clears the registry state + * which indicates whether the processor is running a thread that wants + * and is eligible to be swapped with a thread on the opposite core type. + * Preemption must be disabled. + */ +void +sched_edge_stir_the_pot_update_registry_state(thread_t thread) +{ + processor_t self_processor = current_processor(); + /* + * Clear corresponding th_expired_quantum_on_ field now that thread + * is getting a chance to run on the opposite type. + */ + if (sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set)) { + thread->th_expired_quantum_on_lower_core = false; + } else { + thread->th_expired_quantum_on_higher_core = false; + } + if (sched_edge_stir_the_pot_thread_eligible(thread)) { + int inbox_message = os_atomic_load(&self_processor->stir_the_pot_inbox_cpu, relaxed); + if (inbox_message == -1) { + /* Set the registry bit */ + sched_edge_stir_the_pot_set_registry_entry(); + } else { + assert(sched_edge_stir_the_pot_core_type_is_desired(self_processor->processor_set) == false); + /* + * There's an inbox message which still needs to be used at the next + * migration decision, so avoid starting a new request or clearing the + * interim pending status until then. + */ + } + } else { + /* Thread is ineligible for swap, so clear the registry bit */ + sched_edge_stir_the_pot_clear_registry_entry(); + } +} + +/* + * sched_edge_quantum_expire() + * + * Update stir-the-pot eligibility and drive stir-the-pot swaps. + * Thread lock must be held.
+ */ +static void +sched_edge_quantum_expire(thread_t thread) +{ + if (sched_edge_stir_the_pot_core_type_is_desired(current_processor()->processor_set)) { + thread->th_expired_quantum_on_higher_core = true; + } else { + thread->th_expired_quantum_on_lower_core = true; + } + if (sched_edge_stir_the_pot_thread_eligible(thread)) { + sched_edge_stir_the_pot_try_trigger_swap(thread); + } +} + +/* + * sched_edge_run_count_incr() + * + * Update runnable thread counts in the same way as + * sched_clutch_run_incr(), and reset per-thread, quantum- + * expired tracking used by stir-the-pot, as the thread + * is unblocking. + */ +static uint32_t +sched_edge_run_count_incr(thread_t thread) +{ + uint32_t new_count = sched_clutch_run_incr(thread); + /* Thread is unblocking and so resets its quantum tracking */ + thread->th_expired_quantum_on_lower_core = false; + thread->th_expired_quantum_on_higher_core = false; + return new_count; +} + /* Return true if this thread should not continue running on this processor */ static bool sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t reason) @@ -4421,6 +5031,35 @@ sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t return true; } + sched_clutch_edge edge = (thread->sched_pri >= BASEPRI_RTQUEUES) + ? sched_rt_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id) + : sched_edge_config_get(preferred_pset->pset_cluster_id, processor->processor_set->pset_cluster_id, thread->th_sched_bucket); + if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread) == false && + preferred_pset->pset_id != processor->processor_set->pset_id && + edge.sce_migration_allowed == false && + edge.sce_steal_allowed == false) { + /* + * Thread isn't allowed to be here, according to the edge migration graph. + * Perhaps the thread's priority or boundness or its thread group's preferred + * pset or the edge migration graph changed. + * + * We should only preempt after confirming the thread actually has a + * recommended, allowed alternative pset to run on. + */ + for (uint32_t pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) { + if (pset_id == processor->processor_set->pset_id) { + continue; + } + edge = (thread->sched_pri >= BASEPRI_RTQUEUES) + ? sched_rt_config_get(preferred_pset->pset_id, pset_id) + : sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket); + if (pset_is_recommended(pset_array[pset_id]) && ((pset_id == preferred_pset->pset_id) || edge.sce_migration_allowed)) { + /* Thread can be run elsewhere. */ + return true; + } + } + } + /* Evaluate shared resource policies */ if (thread_shared_rsrc_policy_get(thread, CLUSTER_SHARED_RSRC_TYPE_RR)) { return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set); @@ -4433,22 +5072,50 @@ sched_edge_thread_avoid_processor(processor_t processor, thread_t thread, ast_t return sched_edge_shared_rsrc_migrate_possible(thread, preferred_pset, processor->processor_set); } + if (thread->sched_pri >= BASEPRI_RTQUEUES) { + return false; + } + /* ~~ No realtime or shared resource threads beyond this point ~~ */ + /* - * For long running parallel workloads, it is important to rebalance threads across - * E/P clusters so that they make equal forward progress. This is achieved through - * threads expiring their quantum on the non-preferred cluster type and explicitly - * rebalancing to the preferred cluster runqueue. 
+ * Stir-the-Pot: + * A non-P-core should preempt if a P-core has been found to swap the current, + * quantum-expired thread to for stir-the-pot. This is in order for threads in a + * multi-threaded workload to share time on the P-cores so they make roughly equal + * forward progress. */ - if ((processor->processor_set->pset_type != preferred_pset->pset_type) && - pset_type_is_recommended(preferred_pset)) { + if (sched_edge_stir_the_pot_check_inbox_for_thread(thread, false) != -1) { return true; } - /* If the preferred pset for the thread is now idle, try and migrate thread to that cluster */ + + /* + * Compaction: + * If the preferred pset for the thread is now idle, try and migrate the thread to that cluster. + */ if ((processor->processor_set != preferred_pset) && (sched_edge_cluster_load_metric(preferred_pset, thread->th_sched_bucket) == 0)) { return true; } + /* + * Running Rebalance: + * We are willing to preempt the thread in order to migrate it onto an idle core + * of the preferred type. + */ + if ((processor->processor_set->pset_type != preferred_pset->pset_type) && + pset_type_is_recommended(preferred_pset)) { + /* Scan for idle pset */ + for (uint32_t pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) { + processor_set_t candidate_pset = pset_array[pset_id]; + edge = sched_edge_config_get(preferred_pset->pset_id, pset_id, thread->th_sched_bucket); + if ((candidate_pset->pset_type == preferred_pset->pset_type) && + edge.sce_migration_allowed && + (sched_edge_cluster_load_metric(candidate_pset, thread->th_sched_bucket) == 0)) { + return true; + } + } + } + return false; } @@ -4521,12 +5188,13 @@ sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset, return false; } - sched_clutch_edge *edge = preferred_pset->sched_edges; - if (edge[cluster_id].sce_migration_allowed == false) { + sched_clutch_edge edge = sched_edge_config_get(preferred_cluster_id, cluster_id, thread->th_sched_bucket); + if (edge.sce_migration_allowed == false) { return false; } uint32_t dst_load = shared_rsrc_thread ? (uint32_t)sched_pset_cluster_shared_rsrc_load(dst_pset, shared_rsrc_type) : sched_edge_cluster_load_metric(dst_pset, thread->th_sched_bucket); - if (dst_load == 0) { + if (dst_load == 0 + ) { /* The candidate cluster is idle; select it immediately for execution */ *selected_pset = dst_pset; *max_edge_delta = preferred_cluster_load; @@ -4538,7 +5206,7 @@ sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset, return false; } edge_delta = preferred_cluster_load - dst_load; - if (!shared_rsrc_thread && (edge_delta < edge[cluster_id].sce_migration_weight)) { + if (!shared_rsrc_thread && (edge_delta < edge.sce_migration_weight)) { /* * For non shared resource threads, use the edge migration weight to decide if * this cluster is over-committed at the QoS level of this thread. 
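For reference, a standalone sketch of the 2-bit-per-core packing used by the stir-the-pot registry above; the type and macro names are simplified stand-ins for the SESTP_* macros in the patch, and the real registry is only ever read-modified-written inside os_atomic_rmw_loop().

/*
 * Sketch only (userspace-compilable with GCC/Clang, which provide __int128):
 * pack one 2-bit stir-the-pot state per CPU into a single 128-bit word.
 */
#include <assert.h>

typedef unsigned __int128 stp_registry_t;

enum { STP_NOT_WANT = 0, STP_REQUESTED = 1, STP_PENDING = 2 };

#define STP_BITS_PER_CORE 2
#define STP_BIT_POS(cpu)  ((unsigned)((cpu) * STP_BITS_PER_CORE))
#define STP_MASK(cpu)     (((stp_registry_t)0x3) << STP_BIT_POS(cpu))
/* Read the 2-bit state for one CPU */
#define STP_EXTRACT(reg, cpu)     (((reg) >> STP_BIT_POS(cpu)) & 0x3)
/* Return a copy of the registry with one CPU's state replaced */
#define STP_SET(reg, cpu, state)  (((reg) & ~STP_MASK(cpu)) | ((stp_registry_t)(state) << STP_BIT_POS(cpu)))

int
main(void)
{
	stp_registry_t registry = 0;

	/* CPU 5 publishes a swap request; CPU 9 is selected and marked pending */
	registry = STP_SET(registry, 5, STP_REQUESTED);
	registry = STP_SET(registry, 9, STP_PENDING);

	assert(STP_EXTRACT(registry, 5) == STP_REQUESTED);
	assert(STP_EXTRACT(registry, 9) == STP_PENDING);
	assert(STP_EXTRACT(registry, 0) == STP_NOT_WANT);

	/* Clearing CPU 5 leaves CPU 9 untouched */
	registry = STP_SET(registry, 5, STP_NOT_WANT);
	assert(STP_EXTRACT(registry, 5) == STP_NOT_WANT);
	assert(STP_EXTRACT(registry, 9) == STP_PENDING);
	return 0;
}

Two bits per core are enough for the three states (NOT_WANT, REQUESTED, PENDING), and the whole registry fits in one 128-bit word, so candidate scanning and state transitions can be carried out in a single compare-and-swap loop, matching the static_asserts and lock-free design described in the registry comment above.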
@@ -4551,8 +5219,8 @@ sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset, } if (edge_delta == *max_edge_delta) { /* If the edge delta is the same as the max delta, make sure a homogeneous cluster is picked */ - boolean_t selected_homogeneous = (pset_type_for_id((*selected_pset)->pset_cluster_id) == preferred_cluster_type); - boolean_t candidate_homogeneous = (pset_type_for_id(dst_pset->pset_cluster_id) == preferred_cluster_type); + boolean_t selected_homogeneous = ((*selected_pset)->pset_type == preferred_cluster_type); + boolean_t candidate_homogeneous = (dst_pset->pset_type == preferred_cluster_type); if (selected_homogeneous || !candidate_homogeneous) { return false; } @@ -4563,48 +5231,6 @@ sched_edge_migration_check(uint32_t cluster_id, processor_set_t preferred_pset, return false; } -/* - * sched_edge_iterate_clusters_ordered() - * - * Routine to iterate clusters in die local order. For multi-die machines, - * the routine ensures that the candidate clusters on the same die as the - * passed in pset are returned before the remote die clusters. This should - * be used in all places where cluster selection in die order matters. - */ - -static int -sched_edge_iterate_clusters_ordered(processor_set_t starting_pset, uint64_t candidate_map, int previous_cluster) -{ - int cluster_id = -1; - - uint64_t local_candidate_map = starting_pset->local_psets[0] & candidate_map; - uint64_t remote_candidate_map = starting_pset->remote_psets[0] & candidate_map; - - if (previous_cluster == -1) { - /* previous_cluster == -1 indicates the initial condition */ - cluster_id = bit_first(local_candidate_map); - if (cluster_id != -1) { - return cluster_id; - } - return bit_first(remote_candidate_map); - } else { - /* - * After the initial condition, the routine attempts to return a - * cluster in the previous_cluster's locality. If none is available, - * it looks at remote clusters. 
- */ - if (bit_test(local_candidate_map, previous_cluster)) { - cluster_id = bit_next(local_candidate_map, previous_cluster); - if (cluster_id != -1) { - return cluster_id; - } else { - return bit_first(remote_candidate_map); - } - } - return bit_next(remote_candidate_map, previous_cluster); - } -} - /* * sched_edge_migrate_edges_evaluate() * @@ -4625,9 +5251,9 @@ sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t prefe bitmap_t *foreign_pset_bitmap = preferred_pset->foreign_psets; bitmap_t *native_pset_bitmap = preferred_pset->native_psets; /* Always start the search with the native clusters */ - int cluster_id = -1; - while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, native_pset_bitmap[0], cluster_id)) != -1) { - search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta); + sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT; + while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], native_pset_bitmap[0], &istate)) { + search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta); if (search_complete) { break; } @@ -4666,9 +5292,9 @@ sched_edge_migrate_edges_evaluate(processor_set_t preferred_pset, uint32_t prefe } /* Now look at the non-native clusters */ - cluster_id = -1; - while ((cluster_id = sched_edge_iterate_clusters_ordered(preferred_pset, foreign_pset_bitmap[0], cluster_id)) != -1) { - search_complete = sched_edge_migration_check(cluster_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta); + istate = SCHED_PSET_ITERATE_STATE_INIT; + while (sched_iterate_psets_ordered(preferred_pset, &preferred_pset->spill_search_order[thread->th_sched_bucket], foreign_pset_bitmap[0], &istate)) { + search_complete = sched_edge_migration_check(istate.spis_pset_id, preferred_pset, preferred_cluster_load, thread, &selected_pset, &max_edge_delta); if (search_complete) { break; } @@ -4724,43 +5350,6 @@ sched_edge_switch_pset_lock(processor_set_t selected_pset, processor_set_t locke } } -/* - * sched_edge_amp_rebalance_pset() - * - * Routine to decide where a thread which is eligible for AMP rebalance (i.e. - * has executed on non-preferred cluster type for a while) should be enqueued. - * The algorithm maintains a history of AMP rebalance decisions on the clutch - * bucket group of the workload and round-robins between clusters to ensure - * that all threads get a chance on the performance cores and make equal - * progress. 
- */ -static processor_set_t -sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread) -{ - sched_clutch_t clutch = sched_clutch_for_thread(thread); - sched_clutch_bucket_group_t clutch_bucket_group = &clutch->sc_clutch_groups[thread->th_sched_bucket]; - - uint32_t last_chosen_cluster, new_chosen_cluster; - - /* Only AMP rebalance within clusters native to the preferred cluster */ - uint64_t eligible_pset_bitmask = preferred_pset->native_psets[0]; - /* Preferred cluster is also eligible for rebalancing */ - bit_set(eligible_pset_bitmask, preferred_pset->pset_cluster_id); - /* Atomically update the AMP rebalance cluster for the clutch bucket group */ - os_atomic_rmw_loop(&clutch_bucket_group->scbg_amp_rebalance_last_chosen, last_chosen_cluster, new_chosen_cluster, relaxed, { - if (last_chosen_cluster == UINT32_MAX) { - new_chosen_cluster = preferred_pset->pset_cluster_id; - } else { - new_chosen_cluster = lsb_next(eligible_pset_bitmask, last_chosen_cluster); - if (new_chosen_cluster == -1) { - /* Rotate to the start of the eligible bitmask */ - new_chosen_cluster = lsb_first(eligible_pset_bitmask); - } - } - }); - return pset_array[new_chosen_cluster]; -} - /* * sched_edge_migrate_candidate() * @@ -4779,11 +5368,14 @@ sched_edge_amp_rebalance_pset(processor_set_t preferred_pset, thread_t thread) * resultant pset lock is held. */ static processor_set_t -sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread, processor_set_t locked_pset, bool switch_pset_locks) +sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t thread, + processor_set_t locked_pset, bool switch_pset_locks, processor_t *processor_hint_out, + sched_options_t *options_inout) { processor_set_t selected_pset = preferred_pset; cluster_shared_rsrc_type_t shared_rsrc_type = sched_edge_thread_shared_rsrc_type(thread); bool shared_rsrc_thread = (shared_rsrc_type != CLUSTER_SHARED_RSRC_TYPE_NONE); + bool stirring_the_pot = false; if (SCHED_CLUTCH_THREAD_CLUSTER_BOUND(thread)) { /* @@ -4829,23 +5421,23 @@ sched_edge_migrate_candidate(processor_set_t _Nullable preferred_pset, thread_t } /* - * If a thread is being rebalanced for achieving equal progress of parallel workloads, - * it needs to end up on the preferred runqueue. This mechanism should only be used for - * threads which have been previously migrated to the non-preferred cluster type. + * If this thread has expired quantum on a non-preferred core and is waiting on + * "stir-the-pot" to get a turn running on a P-core, check our processor inbox for + * stir-the-pot to see if an eligible P-core has already been found for swap. + * If so, try to migrate to the corresponding pset and also carry over the + * processor hint to preempt that specific P-core. * * The AMP rebalancing mechanism is available for regular threads or shared resource * threads with the EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST policy. 
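sched_edge_stir_the_pot_check_inbox_for_thread() is not part of this hunk, so the sketch below only illustrates the shape of the contract its callers above depend on. The inbox layout, the stir_pot_inbox array, STIR_POT_INBOX_SLOTS, stir_pot_check_inbox_sketch(), and the peek/claim meaning of the boolean argument are all assumptions made for illustration; only thread_t, THREAD_NULL, and cpu_number() are real kernel symbols.

#include <stdbool.h>

#define STIR_POT_INBOX_SLOTS 64   /* illustrative sizing, one slot per CPU */

/*
 * Hypothetical sketch: a P-core that wants to swap publishes its cpu id
 * against the waiting thread; the waiting side either peeks at the offer
 * (consume == false, as in sched_edge_thread_avoid_processor()) or claims
 * it (consume == true, as in sched_edge_migrate_candidate()).
 */
struct stir_pot_inbox_entry {
	thread_t spie_thread;    /* thread waiting for a turn on a P-core */
	int      spie_swap_cpu;  /* offering P-core's cpu id, or -1 */
};

static struct stir_pot_inbox_entry stir_pot_inbox[STIR_POT_INBOX_SLOTS];

static int
stir_pot_check_inbox_sketch(thread_t thread, bool consume)
{
	struct stir_pot_inbox_entry *entry = &stir_pot_inbox[cpu_number()];
	if (entry->spie_thread != thread || entry->spie_swap_cpu == -1) {
		return -1;      /* no swap offer pending for this thread */
	}
	int swap_cpu = entry->spie_swap_cpu;
	if (consume) {
		/* The caller is committing to the migration; clear the offer. */
		entry->spie_swap_cpu = -1;
		entry->spie_thread = THREAD_NULL;
	}
	return swap_cpu;
}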
*/ - bool amp_rebalance_eligible = (!shared_rsrc_thread) || (shared_rsrc_thread && (edge_shared_rsrc_policy[shared_rsrc_type] == EDGE_SHARED_RSRC_SCHED_POLICY_NATIVE_FIRST)); - if (amp_rebalance_eligible) { - boolean_t amp_rebalance = (thread->reason & (AST_REBALANCE | AST_QUANTUM)) == (AST_REBALANCE | AST_QUANTUM); - if (amp_rebalance) { - boolean_t non_preferred_pset = (thread->last_processor->processor_set->pset_type != preferred_pset->pset_type); - if (non_preferred_pset) { - selected_pset = sched_edge_amp_rebalance_pset(preferred_pset, thread); - goto migrate_candidate_available_check; - } - } + int stir_the_pot_swap_cpu = sched_edge_stir_the_pot_check_inbox_for_thread(thread, true); + if (stir_the_pot_swap_cpu != -1) { + *processor_hint_out = processor_array[stir_the_pot_swap_cpu]; + selected_pset = processor_array[stir_the_pot_swap_cpu]->processor_set; + stirring_the_pot = true; + KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_STIR_THE_POT) | DBG_FUNC_NONE, + 2, stir_the_pot_swap_cpu, 0, 0); + goto migrate_candidate_available_check; } /* Look at edge weights to decide the most ideal migration candidate for this thread */ @@ -4860,9 +5452,14 @@ migrate_candidate_available_check: locked_pset = sched_edge_switch_pset_lock(selected_pset, locked_pset, switch_pset_locks); if (pset_is_recommended(selected_pset) == true) { + /* Committing to the pset */ + if (stirring_the_pot) { + *options_inout |= SCHED_STIR_POT; + } KDBG(MACHDBG_CODE(DBG_MACH_SCHED_CLUTCH, MACH_SCHED_EDGE_CLUSTER_OVERLOAD) | DBG_FUNC_NONE, thread_tid(thread), preferred_pset->pset_cluster_id, selected_pset->pset_cluster_id, preferred_cluster_load); return selected_pset; } + stirring_the_pot = false; /* Looks like selected_pset is not available for scheduling; remove it from candidate_cluster_bitmap */ bitmap_clear(&candidate_cluster_bitmap, selected_pset->pset_cluster_id); if (__improbable(bitmap_first(&candidate_cluster_bitmap, sched_edge_max_clusters) == -1)) { @@ -4875,7 +5472,7 @@ migrate_candidate_available_check: } static processor_t -sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread) +sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_t thread, sched_options_t *options_inout) { /* Bound threads don't call this function */ assert(thread->bound_processor == PROCESSOR_NULL); @@ -4897,12 +5494,10 @@ sched_edge_choose_processor(processor_set_t pset, processor_t processor, thread_ * It might be useful to build a recency metric for the thread for multiple clusters and * factor that into the migration decisions. */ - chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true); + chosen_pset = sched_edge_migrate_candidate(preferred_pset, thread, pset, true, &processor, options_inout); if (chosen_pset) { - chosen_processor = choose_processor(chosen_pset, processor, thread); + chosen_processor = choose_processor(chosen_pset, processor, thread, options_inout); } - /* For RT threads, choose_processor() can return a different cluster than the one passed into it */ - assert(chosen_processor ? 
chosen_processor->processor_set->pset_type == chosen_pset->pset_type : true); return chosen_processor; } @@ -5128,6 +5723,8 @@ sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_buc if (old_preferred_cluster != new_preferred_cluster) { bitmap_set(clutch_bucket_modify_bitmap, bucket); } + KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREFERRED_PSET) | DBG_FUNC_NONE, + thread_group_get_id(tg), bucket, new_preferred_cluster, options); } if (bitmap_lsb_first(clutch_bucket_modify_bitmap, TH_BUCKET_SCHED_MAX) == -1) { /* No changes in any clutch buckets; nothing to do here */ @@ -5198,6 +5795,7 @@ sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t #endif /* !SCHED_TEST_HARNESS */ + /* * sched_edge_cpu_init_completed() * @@ -5207,33 +5805,68 @@ sched_edge_pset_made_schedulable(__unused processor_t processor, processor_set_t static void sched_edge_cpu_init_completed(void) { + /* Now that all cores have registered, compute bitmaps for different core types */ + for (int pset_id = 0; pset_id < sched_edge_max_clusters; pset_id++) { + processor_set_t pset = pset_array[pset_id]; + if (sched_edge_stir_the_pot_core_type_is_desired(pset)) { + os_atomic_or(&sched_edge_p_core_map, pset->cpu_bitmask, relaxed); + } else { + os_atomic_or(&sched_edge_non_p_core_map, pset->cpu_bitmask, relaxed); + } + } + /* Build policy table for setting edge weight tunables based on cluster types */ + sched_clutch_edge edge_config_defaults[MAX_CPU_TYPES][MAX_CPU_TYPES]; + sched_clutch_edge free_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1}; + sched_clutch_edge no_spill = (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}; + sched_clutch_edge weighted_spill = (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1}; + /* P -> P */ + edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_P] = free_spill; + /* E -> E */ + edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_E] = free_spill; + /* P -> E */ + edge_config_defaults[CLUSTER_TYPE_P][CLUSTER_TYPE_E] = weighted_spill; + /* E -> P */ + edge_config_defaults[CLUSTER_TYPE_E][CLUSTER_TYPE_P] = no_spill; + spl_t s = splsched(); for (int src_cluster_id = 0; src_cluster_id < sched_edge_max_clusters; src_cluster_id++) { processor_set_t src_pset = pset_array[src_cluster_id]; pset_lock(src_pset); + /* Each pset recommendation is at least allowed to access its own cluster */ + for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + src_pset->max_parallel_cores[bucket] = src_pset->cpu_set_count; + src_pset->max_parallel_clusters[bucket] = 1; + } + /* For each cluster, set all its outgoing edge parameters */ for (int dst_cluster_id = 0; dst_cluster_id < sched_edge_max_clusters; dst_cluster_id++) { + processor_set_t dst_pset = pset_array[dst_cluster_id]; if (dst_cluster_id == src_cluster_id) { continue; } - processor_set_t dst_pset = pset_array[dst_cluster_id]; - if (src_pset->pset_type == dst_pset->pset_type) { - /* P->P/E->E edge config */ + + bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type); + if (clusters_homogenous) { bitmap_clear(src_pset->foreign_psets, dst_cluster_id); bitmap_set(src_pset->native_psets, dst_cluster_id); - sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 1, .sce_steal_allowed = 1}); - } else if ((src_pset->pset_type == CLUSTER_TYPE_P) && 
(dst_pset->pset_type == CLUSTER_TYPE_E)) { - /* P->E edge config */ - bitmap_set(src_pset->foreign_psets, dst_cluster_id); - bitmap_clear(src_pset->native_psets, dst_cluster_id); - sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 64, .sce_migration_allowed = 1, .sce_steal_allowed = 1}); + /* Default realtime policy: spill allowed among homogeneous psets. */ + sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) { + .sce_migration_allowed = true, + .sce_steal_allowed = true, + .sce_migration_weight = 0, + }); } else { - /* E->P edge config */ bitmap_set(src_pset->foreign_psets, dst_cluster_id); bitmap_clear(src_pset->native_psets, dst_cluster_id); - sched_edge_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge){.sce_migration_weight = 0, .sce_migration_allowed = 0, .sce_steal_allowed = 0}); + /* Default realtime policy: disallow spill among heterogeneous psets. */ + sched_rt_config_set(src_cluster_id, dst_cluster_id, (sched_clutch_edge) { + .sce_migration_allowed = false, + .sce_steal_allowed = false, + .sce_migration_weight = 0, + }); } + bool clusters_local = (ml_get_die_id(src_cluster_id) == ml_get_die_id(dst_cluster_id)); if (clusters_local) { bitmap_set(src_pset->local_psets, dst_cluster_id); @@ -5242,7 +5875,18 @@ sched_edge_cpu_init_completed(void) bitmap_set(src_pset->remote_psets, dst_cluster_id); bitmap_clear(src_pset->local_psets, dst_cluster_id); } + + for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + /* Set tunables for an edge based on the cluster types at either ends of it */ + sched_clutch_edge edge_config = edge_config_defaults[src_pset->pset_type][dst_pset->pset_type]; + sched_edge_config_set(src_cluster_id, dst_cluster_id, bucket, edge_config); + if (edge_config.sce_migration_allowed) { + src_pset->max_parallel_cores[bucket] += dst_pset->cpu_set_count; + src_pset->max_parallel_clusters[bucket] += 1; + } + } } + sched_edge_config_pset_push(src_cluster_id); pset_unlock(src_pset); } @@ -5256,8 +5900,13 @@ sched_edge_thread_eligible_for_pset(thread_t thread, processor_set_t pset) if (preferred_cluster_id == pset->pset_cluster_id) { return true; } else { - processor_set_t preferred_pset = pset_array[preferred_cluster_id]; - return preferred_pset->sched_edges[pset->pset_cluster_id].sce_migration_allowed; + sched_clutch_edge edge; + if (thread->sched_pri >= BASEPRI_RTQUEUES) { + edge = sched_rt_config_get(preferred_cluster_id, pset->pset_id); + } else { + edge = sched_edge_config_get(preferred_cluster_id, pset->pset_cluster_id, thread->th_sched_bucket); + } + return edge.sce_migration_allowed; } } @@ -5320,24 +5969,24 @@ sched_edge_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sche return sched_ipi_policy(dst, thread, dst_idle, event); } + /* * sched_edge_qos_max_parallelism() */ uint32_t sched_edge_qos_max_parallelism(int qos, uint64_t options) { - uint32_t ecpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_E, false, false); - uint32_t pcpu_count = ml_get_cpu_number_type(CLUSTER_TYPE_P, false, false); - uint32_t ecluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_E); - uint32_t pcluster_count = ml_get_cluster_number_type(CLUSTER_TYPE_P); - + cluster_type_t low_core_type = CLUSTER_TYPE_E; + cluster_type_t high_core_type = CLUSTER_TYPE_P; if (options & QOS_PARALLELISM_REALTIME) { /* For realtime threads on AMP, we would want them * to limit the width to just the P-cores since we * do not spill/rebalance for RT threads. 
*/ - return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcpu_count; + uint32_t high_cpu_count = ml_get_cpu_number_type(high_core_type, false, false); + uint32_t high_cluster_count = ml_get_cluster_number_type(high_core_type); + return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? high_cluster_count : high_cpu_count; } /* @@ -5351,10 +6000,14 @@ sched_edge_qos_max_parallelism(int qos, uint64_t options) */ switch (qos) { case THREAD_QOS_BACKGROUND: - case THREAD_QOS_MAINTENANCE: - return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecpu_count; - default: - return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecpu_count + pcpu_count); + case THREAD_QOS_MAINTENANCE:; + uint32_t low_cpu_count = ml_get_cpu_number_type(low_core_type, false, false); + uint32_t low_cluster_count = ml_get_cluster_number_type(low_core_type); + return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? low_cluster_count : low_cpu_count; + default:; + uint32_t total_cpus = ml_get_cpu_count(); + uint32_t total_clusters = ml_get_cluster_count(); + return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? total_clusters : total_cpus; } } diff --git a/osfmk/kern/sched_clutch.h b/osfmk/kern/sched_clutch.h index bfe43c78f..68bc84d5e 100644 --- a/osfmk/kern/sched_clutch.h +++ b/osfmk/kern/sched_clutch.h @@ -282,8 +282,6 @@ struct sched_clutch_bucket_group { uint32_t _Atomic scbg_pri_shift; /* (A) preferred cluster ID for clutch bucket */ uint32_t _Atomic scbg_preferred_cluster; - /* (A) cluster ID for AMP rebalancing */ - uint32_t scbg_amp_rebalance_last_chosen; /* (I) clutch to which this clutch bucket_group belongs */ struct sched_clutch *scbg_clutch; /* (A) holds blocked timestamp and runnable/running count */ @@ -348,18 +346,40 @@ uint32_t sched_clutch_root_count(sched_clutch_root_t); extern sched_clutch_t sched_clutch_for_thread(thread_t); extern sched_clutch_t sched_clutch_for_thread_group(struct thread_group *); +#if DEVELOPMENT || DEBUG + +extern kern_return_t sched_clutch_thread_group_cpu_time_for_thread(thread_t thread, int sched_bucket, uint64_t *cpu_stats); + +#endif /* DEVELOPMENT || DEBUG */ + #if CONFIG_SCHED_EDGE /* * Getter and Setter for Edge configuration. Used by CLPC to affect thread migration behavior. */ -void sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, uint64_t flags, uint64_t matrix_order); -void sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, uint64_t flags, uint64_t matrix_order); +void sched_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, uint64_t flags, uint64_t num_psets); +void sched_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, uint64_t flags, uint64_t num_psets); void sched_edge_tg_preferred_cluster_change(struct thread_group *tg, uint32_t *tg_bucket_preferred_cluster, sched_perfcontrol_preferred_cluster_options_t options); uint16_t sched_edge_cluster_cumulative_count(sched_clutch_root_t root_clutch, sched_bucket_t bucket); uint16_t sched_edge_shared_rsrc_runnable_load(sched_clutch_root_t root_clutch, cluster_shared_rsrc_type_t load_type); +/* + * sched_edge_search_order_weight_then_locality_cmp() + * + * Search order that prioritizes outgoing edges with a lower + * migration weight, then breaks ties with die-locality followed + * by least pset id. 
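The declaration above documents the ordering but not the comparator body, which lives outside this hunk. Below is a sketch of a qsort comparator implementing the documented order over the sort data declared in sched_common.h; evaluating "die-locality" via ml_get_die_id() on the cluster ids is an assumption, and weight_then_locality_cmp_sketch() is an illustrative name.

#include <stdbool.h>

/*
 * Sketch only: lower migration weight sorts first, then same-die
 * destinations, then the lowest destination pset id. The shipped
 * comparator is defined elsewhere in the scheduler sources.
 */
static int
weight_then_locality_cmp_sketch(const void *a, const void *b)
{
	const sched_pset_search_order_sort_data_t *da = a;
	const sched_pset_search_order_sort_data_t *db = b;

	if (da->spsosd_migration_weight != db->spsosd_migration_weight) {
		return (da->spsosd_migration_weight < db->spsosd_migration_weight) ? -1 : 1;
	}

	/* Tie-break: prefer destinations on the same die as the source pset. */
	bool a_local = (ml_get_die_id(da->spsosd_src_pset->pset_cluster_id) ==
	    ml_get_die_id(da->spsosd_dst_pset_id));
	bool b_local = (ml_get_die_id(db->spsosd_src_pset->pset_cluster_id) ==
	    ml_get_die_id(db->spsosd_dst_pset_id));
	if (a_local != b_local) {
		return a_local ? -1 : 1;
	}

	/* Final tie-break: lowest destination pset id. */
	return (int)da->spsosd_dst_pset_id - (int)db->spsosd_dst_pset_id;
}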
+ */ +extern int (*sched_edge_search_order_weight_then_locality_cmp)(const void *a, const void *b); + +/* + * Used to keep stir-the-pot state up-to-date for the current + * processor, as new threads come on-core. + */ +extern void sched_edge_stir_the_pot_update_registry_state(thread_t thread); +extern void sched_edge_stir_the_pot_clear_registry_entry(void); + #endif /* CONFIG_SCHED_EDGE */ #endif /* CONFIG_SCHED_CLUTCH */ diff --git a/osfmk/kern/sched_common.c b/osfmk/kern/sched_common.c new file mode 100644 index 000000000..dae2d47c5 --- /dev/null +++ b/osfmk/kern/sched_common.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +#include + +#if __AMP__ + +void +sched_pset_search_order_compute(sched_pset_search_order_t *search_order_out, + sched_pset_search_order_sort_data_t *datas, size_t num_datas, + sched_pset_search_order_sort_cmpfunc_t cmp) +{ + qsort(datas, num_datas, sizeof(sched_pset_search_order_sort_data_t), cmp); + sched_pset_search_order_t search_order; + for (int i = 0; i < num_datas; i++) { + search_order.spso_search_order[i] = datas[i].spsosd_dst_pset_id; + } + int num_psets = ml_get_cluster_count(); + for (int i = (int)num_datas; i < num_psets - 1; i++) { + /* + * If fewer sort datas were passed in than the number of psets minus + * 1 (AKA the maximum length of a pset search order), then mark the + * remaining slots at the end with an invalid pset id. 
+ */ + search_order.spso_search_order[i] = PSET_ID_INVALID; + } + os_atomic_store_wide(&search_order_out->spso_packed, search_order.spso_packed, relaxed); +} + +void +sched_pset_search_order_init(processor_set_t src_pset, sched_pset_search_order_t *search_order_out) +{ + pset_id_t other_pset_id = 0; + sched_pset_search_order_t spill_order; + int num_psets = ml_get_cluster_count(); + for (int i = 0; i < MAX_PSETS - 1; i++, other_pset_id++) { + if (i < num_psets - 1) { + if (other_pset_id == src_pset->pset_id) { + /* Exclude the source pset */ + other_pset_id++; + } + assert3u(other_pset_id, <, num_psets); + spill_order.spso_search_order[i] = other_pset_id; + } else { + /* Mark unneeded slots with an invalid id, as they should not be accessed */ + spill_order.spso_search_order[i] = PSET_ID_INVALID; + } + } + os_atomic_store_wide(&search_order_out->spso_packed, spill_order.spso_packed, relaxed); +} + +bool +sched_iterate_psets_ordered(processor_set_t starting_pset, sched_pset_search_order_t *search_order, + uint64_t candidate_map, sched_pset_iterate_state_t *istate) +{ + int num_psets = ml_get_cluster_count(); + while (istate->spis_search_index < num_psets - 1) { + int pset_id; + if (istate->spis_search_index == -1) { + /* Initial condition */ + pset_id = starting_pset->pset_id; + istate->spis_cached_search_order = + (sched_pset_search_order_t)os_atomic_load_wide(&search_order->spso_packed, relaxed); + } else { + pset_id = istate->spis_cached_search_order.spso_search_order[istate->spis_search_index]; + if (pset_id == PSET_ID_INVALID) { + /* The given search order does not include all psets */ + break; + } + assert3u(pset_id, !=, starting_pset->pset_id); + } + istate->spis_search_index++; + if (bit_test(candidate_map, pset_id)) { + istate->spis_pset_id = pset_id; + return true; + } + } + istate->spis_pset_id = -1; + return false; +} + +#endif /* __AMP__ */ diff --git a/osfmk/kern/sched_common.h b/osfmk/kern/sched_common.h new file mode 100644 index 000000000..99fe5223b --- /dev/null +++ b/osfmk/kern/sched_common.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_SCHED_COMMON_H_ +#define _KERN_SCHED_COMMON_H_ + +#include +#include +#include + +typedef uint8_t pset_id_t; +static_assert(MAX_PSETS < UINT8_MAX, "Can store pset ids within 8 bits"); +#define PSET_ID_INVALID UINT8_MAX + +#if __AMP__ + +/* + * sched_pset_search_order_t + * + * Used for storing a computed search order of pset ids, relative to a + * scanning pset not included in the list. + * + * Storing/accessing the search order atomically avoids issues caused + * by editing the search order while processors are in the middle of + * traversing it, for example causing them to miss a pset or visit a + * particular pset more than once. Instead, the search order should be + * read atomically before traversing, so that new edits are ignored by + * that processor until its traversal is complete. + */ +typedef union { + pset_id_t spso_search_order[MAX_PSETS - 1]; + unsigned __int128 spso_packed; +} sched_pset_search_order_t; + +static_assert(sizeof(sched_pset_search_order_t) <= sizeof(unsigned __int128), + "(MAX_PSETS - 1) * 8 bits fits in 128 bits, allowing sched_pset_search_order_t fields " + "to be accessed atomically"); + +typedef struct processor_set *processor_set_t; + +/* + * sched_pset_search_order_sort_data_t + * + * Pset data used when generating search orders, expected to be + * populated for each pset before calling sched_pset_search_order_compute() + */ +typedef struct { + processor_set_t spsosd_src_pset; + uint64_t spsosd_migration_weight; + pset_id_t spsosd_dst_pset_id; +} sched_pset_search_order_sort_data_t; + +/* + * sched_pset_search_order_sort_cmpfunc_t + * + * Expected to compare two sched_pset_search_order_sort_data_t pointers, + * for the purpose of generating a pset search order. + */ +typedef cmpfunc_t sched_pset_search_order_sort_cmpfunc_t; + +/* + * sched_pset_search_order_compute() + * + * Generates a pset search order by sorting the per-pset search order datas + * using the given comparator. + */ +void +sched_pset_search_order_compute(sched_pset_search_order_t *search_order_out, + sched_pset_search_order_sort_data_t *datas, size_t num_datas, + sched_pset_search_order_sort_cmpfunc_t cmp); + +/* + * sched_pset_search_order_init() + * + * Generates a search order of all psets sorted by increasing pset id, still + * excluding the source pset. + */ +void +sched_pset_search_order_init(processor_set_t src_pset, sched_pset_search_order_t *search_order_out); + +/* + * sched_pset_iterate_state_t + * + * Used for tracking state across calls to sched_iterate_psets_ordered() + * for the same search order traversal, and for returning the current pset_id. + */ +typedef struct { + int spis_search_index; + sched_pset_search_order_t spis_cached_search_order; + int spis_pset_id; // out +} sched_pset_iterate_state_t; + +#define SCHED_PSET_ITERATE_STATE_INIT ((sched_pset_iterate_state_t) { .spis_search_index = -1 }) + +/* + * sched_iterate_psets_ordered() + * + * Routine to iterate through candidate psets based on a given search_order + * and starting from starting_pset. + * Returns true if iteration continues and another candidate pset was found, + * which will be stored at istate->spis_pset_id. Returns false and + * istate->spis_pset_id of -1 once iteration is complete. Iterate state should + * start out initialized to SCHED_PSET_ITERATE_STATE_INIT. 
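A minimal sketch of the calling pattern this iterator contract implies, matching how sched_edge_migrate_edges_evaluate() uses it earlier in this patch; visit_cluster() is a placeholder callback, and the pset's spill_search_order field is assumed from that caller rather than defined here.

/*
 * Sketch: the search order is loaded once inside the first iterator call,
 * then candidate psets are visited in that order until the visitor is done.
 */
static void
scan_candidate_psets_sketch(processor_set_t preferred_pset, sched_bucket_t bucket,
    uint64_t candidate_map, bool (*visit_cluster)(int pset_id))
{
	sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT;
	while (sched_iterate_psets_ordered(preferred_pset,
	    &preferred_pset->spill_search_order[bucket], candidate_map, &istate)) {
		if (visit_cluster(istate.spis_pset_id)) {
			break;  /* visitor found a terminal answer */
		}
	}
}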
+ */ +bool +sched_iterate_psets_ordered(processor_set_t starting_pset, sched_pset_search_order_t *search_order, + uint64_t candidate_map, sched_pset_iterate_state_t *istate); + +#endif /* __AMP__ */ + +#endif /* _KERN_SCHED_COMMON_H_ */ diff --git a/osfmk/kern/sched_dualq.c b/osfmk/kern/sched_dualq.c index d5cb84c7d..bc2aa8e17 100644 --- a/osfmk/kern/sched_dualq.c +++ b/osfmk/kern/sched_dualq.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -137,12 +138,17 @@ const struct sched_dispatch_table sched_dualq_dispatch = { .thread_avoid_processor = sched_dualq_thread_avoid_processor, .processor_balance = sched_SMT_balance, - .rt_runq = sched_rtlocal_runq, - .rt_init = sched_rtlocal_init, - .rt_queue_shutdown = sched_rtlocal_queue_shutdown, - .rt_runq_scan = sched_rtlocal_runq_scan, - .rt_runq_count_sum = sched_rtlocal_runq_count_sum, - .rt_steal_thread = sched_rtlocal_steal_thread, +#if CONFIG_SCHED_SMT + .rt_choose_processor = sched_rtlocal_choose_processor_smt, +#else /* !CONFIG_SCHED_SMT */ + .rt_choose_processor = sched_rt_choose_processor, +#endif /* !CONFIG_SCHED_SMT */ + .rt_steal_thread = NULL, + .rt_init_pset = sched_rt_init_pset, + .rt_init_completed = sched_rt_init_completed, + .rt_queue_shutdown = sched_rt_queue_shutdown, + .rt_runq_scan = sched_rt_runq_scan, + .rt_runq_count_sum = sched_rt_runq_count_sum, .qos_max_parallelism = sched_qos_max_parallelism, .check_spill = sched_check_spill, @@ -519,7 +525,7 @@ sched_dualq_thread_update_scan(sched_update_scan_context_t scan_context) } thread = processor->idle_thread; - if (thread != THREAD_NULL && thread->sched_stamp != sched_tick) { + if (thread != THREAD_NULL && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) { if (thread_update_add_thread(thread) == FALSE) { restart_needed = TRUE; break; diff --git a/osfmk/kern/sched_hygiene.h b/osfmk/kern/sched_hygiene.h index cff715efb..ddd518ee7 100644 --- a/osfmk/kern/sched_hygiene.h +++ b/osfmk/kern/sched_hygiene.h @@ -76,6 +76,9 @@ extern machine_timeout_t stackshot_interrupt_masked_timeout; extern bool sched_hygiene_nonspec_tb; #define ml_get_sched_hygiene_timebase() (sched_hygiene_nonspec_tb ? ml_get_timebase() : ml_get_speculative_timebase()) +#define ml_use_sched_hygiene_nonspec_timebase() (sched_hygiene_nonspec_tb) +#define ML_TIMEOUT_TIMEBASE_FLAGS (ml_use_sched_hygiene_nonspec_timebase() ? TF_NONSPEC_TIMEBASE : 0) +#define ML_TIMEOUT_PMC_FLAGS (static_if(sched_debug_pmc) ? TF_SAMPLE_PMC : 0) extern bool kprintf_spam_mt_pred(struct machine_timeout_spec const *spec); diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index 57d8af9e0..dff06f752 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,7 @@ #include #include #include +#include #include #include #include @@ -136,120 +138,9 @@ struct sched_statistics PERCPU_DATA(sched_stats); bool sched_stats_active; -static uint64_t -deadline_add(uint64_t d, uint64_t e) -{ - uint64_t sum; - return os_add_overflow(d, e, &sum) ? 
UINT64_MAX : sum; -} - -int -rt_runq_count(processor_set_t pset) -{ - return os_atomic_load(&SCHED(rt_runq)(pset)->count, relaxed); -} - -uint64_t -rt_runq_earliest_deadline(processor_set_t pset) -{ - return os_atomic_load_wide(&SCHED(rt_runq)(pset)->earliest_deadline, relaxed); -} - -static int -rt_runq_priority(processor_set_t pset) -{ - pset_assert_locked(pset); - rt_queue_t rt_run_queue = SCHED(rt_runq)(pset); - - bitmap_t *map = rt_run_queue->bitmap; - int i = bitmap_first(map, NRTQS); - assert(i < NRTQS); - - if (i >= 0) { - return i + BASEPRI_RTQUEUES; - } - - return i; -} - -static thread_t rt_runq_first(rt_queue_t rt_runq); - -#if DEBUG -static void -check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread) -{ - bitmap_t *map = rt_run_queue->bitmap; - - uint64_t earliest_deadline = RT_DEADLINE_NONE; - uint32_t constraint = RT_CONSTRAINT_NONE; - int ed_index = NOPRI; - int count = 0; - bool found_thread = false; - - for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) { - int i = pri - BASEPRI_RTQUEUES; - rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; - queue_t queue = &rt_runq->pri_queue; - queue_entry_t iter; - int n = 0; - uint64_t previous_deadline = 0; - qe_foreach(iter, queue) { - thread_t iter_thread = qe_element(iter, struct thread, runq_links); - assert_thread_magic(iter_thread); - if (iter_thread == thread) { - found_thread = true; - } - assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES)); - assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE); - assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE); - assert(previous_deadline <= iter_thread->realtime.deadline); - n++; - if (iter == queue_first(queue)) { - assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline); - assert(rt_runq->pri_constraint == iter_thread->realtime.constraint); - } - previous_deadline = iter_thread->realtime.deadline; - } - assert(n == rt_runq->pri_count); - if (n == 0) { - assert(bitmap_test(map, i) == false); - assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE); - assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE); - } else { - assert(bitmap_test(map, i) == true); - } - if (rt_runq->pri_earliest_deadline < earliest_deadline) { - earliest_deadline = rt_runq->pri_earliest_deadline; - constraint = rt_runq->pri_constraint; - ed_index = i; - } - count += n; - } - assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline); - assert(os_atomic_load(&rt_run_queue->count, relaxed) == count); - assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint); - assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index); - if (thread) { - assert(found_thread); - } -} -#define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th) -#else -#define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0) -#endif - -uint32_t rt_constraint_threshold; - -static bool -rt_runq_is_low_latency(processor_set_t pset) -{ - return os_atomic_load(&SCHED(rt_runq)(pset)->constraint, relaxed) <= rt_constraint_threshold; -} - TUNABLE(bool, cpulimit_affects_quantum, "cpulimit_affects_quantum", true); -/* TODO: enable this, to 50us (less than the deferred IPI latency, to beat a spill) */ -TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 0); /* microseconds */ +TUNABLE(uint32_t, nonurgent_preemption_timer_us, "nonurgent_preemption_timer", 50); /* microseconds */ static uint64_t nonurgent_preemption_timer_abs = 0; #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */ @@ -303,16 +194,10 @@ 
uint32_t thread_depress_time; uint32_t default_timeshare_computation; uint32_t default_timeshare_constraint; -uint32_t max_rt_quantum; -uint32_t min_rt_quantum; - -uint32_t rt_deadline_epsilon; - -uint32_t rt_constraint_threshold; #if defined(CONFIG_SCHED_TIMESHARE_CORE) -unsigned sched_tick; +_Atomic uint32_t sched_tick; uint32_t sched_tick_interval; /* Timeshare load calculation interval (15ms) */ @@ -345,7 +230,7 @@ thread_t sched_maintenance_thread; LCK_GRP_DECLARE(cluster_powerdown_grp, "cluster_powerdown"); LCK_MTX_DECLARE(cluster_powerdown_lock, &cluster_powerdown_grp); -/* interrupts disabled lock to guard core online, recommendation, pcs state */ +/* interrupts disabled lock to guard core online, recommendation, pcs state, scheduling policy bits */ decl_simple_lock_data(, sched_available_cores_lock); /* @@ -491,10 +376,7 @@ csw_check_locked( static void processor_setrun( processor_t processor, thread_t thread, - integer_t options); - -static void -sched_realtime_timebase_init(void); + sched_options_t options); static void sched_timer_deadline_tracking_init(void); @@ -519,19 +401,6 @@ int8_t sched_load_shifts[NRQS]; bitmap_t sched_preempt_pri[BITMAP_LEN(NRQS_MAX)]; #endif /* CONFIG_SCHED_TIMESHARE_CORE */ -#define cpumap_foreach(cpu_id, cpumap) \ - for (int cpu_id = lsb_first(cpumap); \ - (cpu_id) >= 0; \ - cpu_id = lsb_next((cpumap), cpu_id)) - -#define foreach_node(node) \ - for (pset_node_t node = &pset_node0; node != NULL; node = node->node_list) - -#define foreach_pset_id(pset_id, node) \ - for (int pset_id = lsb_first((node)->pset_map); \ - pset_id >= 0; \ - pset_id = lsb_next((node)->pset_map, pset_id)) - /* * Statically allocate a buffer to hold the longest possible * scheduler description string, as currently implemented. @@ -589,10 +458,9 @@ sched_init(void) #endif /* __arm64__ */ SCHED(init)(); - SCHED(rt_init)(&pset0); sched_timer_deadline_tracking_init(); - SCHED(pset_init)(&pset0); + SCHED(rt_init_pset)(&pset0); SCHED(processor_init)(master_processor); if (PE_parse_boot_argn("direct_handoff", &direct_handoff, sizeof(direct_handoff))) { @@ -643,7 +511,7 @@ sched_timeshare_init(void) load_shift_init(); preempt_pri_init(); - sched_tick = 0; + os_atomic_store(&sched_tick, 0, relaxed); } void @@ -767,69 +635,6 @@ sched_timeshare_timebase_init(void) #endif /* CONFIG_SCHED_TIMESHARE_CORE */ -void -pset_rt_init(processor_set_t pset) -{ - for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) { - int i = pri - BASEPRI_RTQUEUES; - rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i]; - queue_init(&rqi->pri_queue); - rqi->pri_count = 0; - rqi->pri_earliest_deadline = RT_DEADLINE_NONE; - rqi->pri_constraint = RT_CONSTRAINT_NONE; - } - os_atomic_init(&pset->rt_runq.count, 0); - os_atomic_init(&pset->rt_runq.earliest_deadline, RT_DEADLINE_NONE); - os_atomic_init(&pset->rt_runq.constraint, RT_CONSTRAINT_NONE); - os_atomic_init(&pset->rt_runq.ed_index, NOPRI); - memset(&pset->rt_runq.runq_stats, 0, sizeof pset->rt_runq.runq_stats); -} - -/* epsilon for comparing RT deadlines */ -int rt_deadline_epsilon_us = 100; - -int -sched_get_rt_deadline_epsilon(void) -{ - return rt_deadline_epsilon_us; -} - -void -sched_set_rt_deadline_epsilon(int new_epsilon_us) -{ - rt_deadline_epsilon_us = new_epsilon_us; - - uint64_t abstime; - clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime); - assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0)); - rt_deadline_epsilon = (uint32_t)abstime; -} - -static void 
-sched_realtime_timebase_init(void) -{ - uint64_t abstime; - - /* smallest rt computation (50 us) */ - clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime); - assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); - min_rt_quantum = (uint32_t)abstime; - - /* maximum rt computation (50 ms) */ - clock_interval_to_absolutetime_interval( - 50, 1000 * NSEC_PER_USEC, &abstime); - assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); - max_rt_quantum = (uint32_t)abstime; - - /* constraint threshold for sending backup IPIs (4 ms) */ - clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime); - assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); - rt_constraint_threshold = (uint32_t)abstime; - - /* epsilon for comparing deadlines */ - sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us); -} - void sched_check_spill(processor_set_t pset, thread_t thread) { @@ -1867,7 +1672,7 @@ clear_wait_internal( * thread thread to awaken * result Wakeup result the thread should see */ -kern_return_t +__mockable kern_return_t clear_wait( thread_t thread, wait_result_t result) @@ -1911,8 +1716,11 @@ thread_wakeup_nthreads_prim( } struct waitq *wq = global_eventq(event); + uint32_t count; - return waitq_wakeup64_nthreads(wq, CAST_EVENT64_T(event), result, WAITQ_WAKEUP_DEFAULT, nthreads); + count = waitq_wakeup64_nthreads(wq, CAST_EVENT64_T(event), result, + WAITQ_WAKEUP_DEFAULT, nthreads); + return count ? KERN_SUCCESS : KERN_NOT_WAITING; } /* @@ -1922,7 +1730,7 @@ thread_wakeup_nthreads_prim( * and thread_wakeup_one. * */ -kern_return_t +__mockable kern_return_t thread_wakeup_prim( event_t event, boolean_t one_thread, @@ -1956,45 +1764,6 @@ thread_wakeup_thread( return waitq_wakeup64_thread(wq, CAST_EVENT64_T(event), thread, THREAD_AWAKENED); } -/* - * Wakeup a thread waiting on an event and promote it to a priority. - * - * Requires woken thread to un-promote itself when done. - */ -kern_return_t -thread_wakeup_one_with_pri( - event_t event, - int priority) -{ - if (__improbable(event == NO_EVENT)) { - panic("%s() called with NO_EVENT", __func__); - } - - struct waitq *wq = global_eventq(event); - - return waitq_wakeup64_one(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority); -} - -/* - * Wakeup a thread waiting on an event, - * promote it to a priority, - * and return a reference to the woken thread. - * - * Requires woken thread to un-promote itself when done. 
- */ -thread_t -thread_wakeup_identify(event_t event, - int priority) -{ - if (__improbable(event == NO_EVENT)) { - panic("%s() called with NO_EVENT", __func__); - } - - struct waitq *wq = global_eventq(event); - - return waitq_wakeup64_identify(wq, CAST_EVENT64_T(event), THREAD_AWAKENED, priority); -} - /* * thread_bind: * @@ -2272,11 +2041,8 @@ sched_vm_group_maintenance(void) #define SCHED_AVOID_CPU0 0 #endif -int sched_allow_rt_smt = 1; int sched_avoid_cpu0 = SCHED_AVOID_CPU0; -int sched_allow_rt_steal = 1; int sched_backup_cpu_timeout_count = 5; /* The maximum number of 10us delays to wait before using a backup cpu */ - int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS; int @@ -2297,19 +2063,6 @@ sched_set_rt_n_backup_processors(int n) sched_rt_n_backup_processors = n; } -int sched_rt_runq_strict_priority = false; - -inline static processor_set_t -change_locked_pset(processor_set_t current_pset, processor_set_t new_pset) -{ - if (current_pset != new_pset) { - pset_unlock(current_pset); - pset_lock(new_pset); - } - - return new_pset; -} - /* * Invoked prior to idle entry to determine if, on SMT capable processors, an SMT * rebalancing opportunity exists when a core is (instantaneously) idle, but @@ -2387,13 +2140,6 @@ sched_SMT_balance(__unused processor_t cprocessor, __unused processor_set_t cpse } #endif /* CONFIG_SCHED_SMT */ - -static cpumap_t -pset_available_cpumap(processor_set_t pset) -{ - return pset->cpu_available_map & pset->recommended_bitmask; -} - int pset_available_cpu_count(processor_set_t pset) { @@ -2443,60 +2189,6 @@ pset_has_stealable_threads(processor_set_t pset) return (pset->pset_runq.count > 0) && ((pset->pset_runq.count + rt_runq_count(pset)) > bit_count(avail_map)); } -static cpumap_t -pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset) -{ - cpumap_t avail_map = pset_available_cpumap(pset); -#if CONFIG_SCHED_SMT - if (!sched_allow_rt_smt) { - /* - * Secondary CPUs are not allowed to run RT threads, so - * only primary CPUs should be included - */ - avail_map &= pset->primary_map; - } -#endif /* CONFIG_SCHED_SMT */ - - return avail_map & ~pset->realtime_map; -} - -static bool -pset_needs_a_followup_IPI(processor_set_t pset) -{ - int nbackup_cpus = 0; - - if (rt_runq_is_low_latency(pset)) { - nbackup_cpus = sched_rt_n_backup_processors; - } - - int rt_rq_count = rt_runq_count(pset); - - return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0); -} - -bool -pset_has_stealable_rt_threads(processor_set_t pset) -{ - pset_node_t node = pset->node; - if (bit_count(node->pset_map) == 1) { - return false; - } - - cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset); - - return rt_runq_count(pset) > bit_count(avail_map); -} - -static void -pset_update_rt_stealable_state(processor_set_t pset) -{ - if (pset_has_stealable_rt_threads(pset)) { - pset->stealable_rt_threads_earliest_deadline = rt_runq_earliest_deadline(pset); - } else { - pset->stealable_rt_threads_earliest_deadline = RT_DEADLINE_NONE; - } -} - static void clear_pending_AST_bits(processor_set_t pset, processor_t processor, __kdebug_only const int trace_point_number) { @@ -2583,17 +2275,12 @@ pset_commit_processor_to_new_thread(processor_set_t pset, processor_t processor, } #if CONFIG_SCHED_SMT -static processor_t choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills); static bool 
all_available_primaries_are_running_realtime_threads(processor_set_t pset, bool include_backups); static bool these_processors_are_running_realtime_threads(processor_set_t pset, uint64_t these_map, bool include_backups); -#else /* CONFIG_SCHED_SMT */ -static processor_t choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills); -#endif /* CONFIG_SCHED_SMT */ -static processor_t choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, - processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus); -static processor_t choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries); +#else /* !CONFIG_SCHED_SMT */ +processor_t pset_choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills); +#endif /* !CONFIG_SCHED_SMT */ static bool sched_ok_to_run_realtime_thread(processor_set_t pset, processor_t processor, bool as_backup); -static bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor); static bool other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint64_t earliest_deadline) @@ -2605,7 +2292,7 @@ other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint6 for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) { processor_set_t nset = pset_array[pset_id]; - if (deadline_add(nset->stealable_rt_threads_earliest_deadline, rt_deadline_epsilon) < earliest_deadline) { + if (rt_deadline_add(os_atomic_load(&nset->stealable_rt_threads_earliest_deadline, relaxed), rt_deadline_epsilon) < earliest_deadline) { return true; } } @@ -2613,57 +2300,6 @@ other_psets_have_earlier_rt_threads_pending(processor_set_t stealing_pset, uint6 return false; } -/* - * starting_pset must be locked, but returns true if it is unlocked before return - */ -static bool -choose_next_rt_processor_for_IPI(processor_set_t starting_pset, processor_t chosen_processor, bool spill_ipi, - processor_t *result_processor, sched_ipi_type_t *result_ipi_type) -{ - bool starting_pset_is_unlocked = false; - uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset); - int max_pri = rt_runq_priority(starting_pset); - __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq)); - processor_set_t pset = starting_pset; - processor_t next_rt_processor = PROCESSOR_NULL; - if (spill_ipi) { - processor_set_t nset = next_pset(pset); - assert(nset != starting_pset); - pset = change_locked_pset(pset, nset); - starting_pset_is_unlocked = true; - } - do { - const bool consider_secondaries = true; - next_rt_processor = choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, consider_secondaries); - if (next_rt_processor == PROCESSOR_NULL) { - if (!spill_ipi) { - break; - } - processor_set_t nset = next_pset(pset); - if (nset == starting_pset) { - break; - } - pset = change_locked_pset(pset, nset); - starting_pset_is_unlocked = true; - } - } while (next_rt_processor == PROCESSOR_NULL); - if (next_rt_processor) { - if (pset != starting_pset) { - if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) { - KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START, - next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, 
starting_pset->cpu_set_low, (uintptr_t)spill_tid); - } - } - *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT); - *result_processor = next_rt_processor; - } - if (pset != starting_pset) { - pset_unlock(pset); - } - - return starting_pset_is_unlocked; -} - /* * backup processor - used by choose_processor to send a backup IPI to in case the preferred processor can't immediately respond * followup processor - used in thread_select when there are still threads on the run queue and available processors @@ -2893,12 +2529,12 @@ restart: * See if the current lower priority thread can continue to run without causing * the higher priority thread on the runq queue to miss its deadline. */ - thread_t hi_thread = rt_runq_first(SCHED(rt_runq)(pset)); + thread_t hi_thread = rt_runq_first(&pset->rt_runq); if (thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon >= hi_thread->realtime.constraint) { /* The next RT thread is better, so pick it off the runqueue. */ goto pick_new_rt_thread; } - } else if ((rt_runq_count(pset) > 0) && (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) { + } else if ((rt_runq_count(pset) > 0) && (rt_deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < thread->realtime.deadline)) { /* The next RT thread is better, so pick it off the runqueue. */ goto pick_new_rt_thread; } @@ -2918,13 +2554,16 @@ restart: next_rt_ipi_type = SCHED_IPI_NONE; bool pset_unlocked = false; - __kdebug_only next_processor_type_t nptype = none; - if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) { + next_processor_type_t nptype = none; +#if CONFIG_SCHED_EDGE + if (rt_pset_has_stealable_threads(pset)) { nptype = spill; - pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type); - } else if (pset_needs_a_followup_IPI(pset)) { + pset_unlocked = rt_choose_next_processor_for_spill_IPI(pset, processor, &next_rt_processor, &next_rt_ipi_type); + } +#endif /* CONFIG_SCHED_EDGE */ + if (nptype == none && rt_pset_needs_a_followup_IPI(pset)) { nptype = followup; - pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type); + rt_choose_next_processor_for_followup_IPI(pset, processor, &next_rt_processor, &next_rt_ipi_type); } if (!pset_unlocked) { pset_unlock(pset); @@ -2979,7 +2618,9 @@ restart: /* OK, so we're not going to run the current thread. Look at the RT queue. */ if (ok_to_run_realtime_thread) { pick_new_rt_thread: - new_thread = sched_rt_choose_thread(pset); + /* sched_rt_choose_thread may drop and re-take the processor's pset lock. 
*/ + new_thread = sched_rt_choose_thread(processor); + pset_assert_locked(pset); if (new_thread != THREAD_NULL) { processor->deadline = new_thread->realtime.deadline; pset_commit_processor_to_new_thread(pset, processor, new_thread); @@ -3000,17 +2641,18 @@ pick_new_rt_thread: send_followup_ipi_before_idle: /* This might not have been cleared if we didn't call sched_rt_choose_thread() */ - if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { - KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 5); - } - __kdebug_only next_processor_type_t nptype = none; + rt_clear_pending_spill(processor, 5); + next_processor_type_t nptype = none; bool pset_unlocked = false; - if (sched_allow_rt_steal && pset_has_stealable_rt_threads(pset)) { +#if CONFIG_SCHED_EDGE + if (rt_pset_has_stealable_threads(pset)) { nptype = spill; - pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, true, &next_rt_processor, &next_rt_ipi_type); - } else if (pset_needs_a_followup_IPI(pset)) { + pset_unlocked = rt_choose_next_processor_for_spill_IPI(pset, processor, &next_rt_processor, &next_rt_ipi_type); + } +#endif /* CONFIG_SCHED_EDGE */ + if (nptype == none && rt_pset_needs_a_followup_IPI(pset)) { nptype = followup; - pset_unlocked = choose_next_rt_processor_for_IPI(pset, processor, false, &next_rt_processor, &next_rt_ipi_type); + rt_choose_next_processor_for_followup_IPI(pset, processor, &next_rt_processor, &next_rt_ipi_type); } assert(new_thread || !ast_processor); @@ -3891,7 +3533,7 @@ thread_dispatch( * it may just be stolen back by the idle core we just forced it off. * But only do this at the end of a quantum to prevent cascading effects. */ - options |= SCHED_PREEMPT; + options |= SCHED_STIR_POT; } } @@ -4106,7 +3748,7 @@ thread_dispatch( * thread resumes, it will execute the continuation function * on a new kernel stack. 
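Since thread_block_reason() documents that a thread resuming via a continuation runs on a fresh kernel stack, a short usage sketch of the continuation pattern may help; my_event, my_continuation(), and wait_with_continuation() are illustrative names, while assert_wait(), thread_block_parameter(), and the wait/result constants are existing kernel primitives.

/*
 * Sketch of a continuation-style wait: no stack state survives the block,
 * so everything the continuation needs must travel through 'parameter'.
 */
static void
my_continuation(void *parameter, wait_result_t wresult)
{
	if (wresult != THREAD_AWAKENED) {
		/* Interrupted or timed out; handle and fall through. */
	}
	/* ... do the post-wakeup work using only 'parameter' ... */

	/* A continuation must not return: block again (or terminate) here. */
	assert_wait((event_t)parameter, THREAD_UNINT);
	(void) thread_block_parameter(my_continuation, parameter);
	/* NOTREACHED */
}

static void
wait_with_continuation(event_t my_event)
{
	assert_wait(my_event, THREAD_UNINT);
	(void) thread_block_parameter(my_continuation, (void *)my_event);
	/* NOTREACHED when the continuation is taken */
}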
*/ -wait_result_t +__mockable wait_result_t thread_block_reason( thread_continue_t continuation, void *parameter, @@ -4488,403 +4130,6 @@ run_queue_peek( } } -static bool -rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor) -{ - int pri = thread->sched_pri; - assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI)); - int i = pri - BASEPRI_RTQUEUES; - rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; - bitmap_t *map = rt_run_queue->bitmap; - - bitmap_set(map, i); - - queue_t queue = &rt_runq->pri_queue; - uint64_t deadline = thread->realtime.deadline; - bool preempt = false; - bool earliest = false; - - if (queue_empty(queue)) { - enqueue_tail(queue, &thread->runq_links); - preempt = true; - earliest = true; - rt_runq->pri_earliest_deadline = deadline; - rt_runq->pri_constraint = thread->realtime.constraint; - } else { - /* Insert into rt_runq in thread deadline order */ - queue_entry_t iter; - qe_foreach(iter, queue) { - thread_t iter_thread = qe_element(iter, struct thread, runq_links); - assert_thread_magic(iter_thread); - - if (deadline < iter_thread->realtime.deadline) { - if (iter == queue_first(queue)) { - preempt = true; - earliest = true; - rt_runq->pri_earliest_deadline = deadline; - rt_runq->pri_constraint = thread->realtime.constraint; - } - insque(&thread->runq_links, queue_prev(iter)); - break; - } else if (iter == queue_last(queue)) { - enqueue_tail(queue, &thread->runq_links); - break; - } - } - } - if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) { - os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed); - os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed); - os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed); - } - - SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); - rt_runq->pri_count++; - os_atomic_inc(&rt_run_queue->count, relaxed); - - thread_set_runq_locked(thread, processor); - - CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread); - - return preempt; -} - -static thread_t -rt_runq_dequeue(rt_queue_t rt_run_queue) -{ - bitmap_t *map = rt_run_queue->bitmap; - int i = bitmap_first(map, NRTQS); - assert((i >= 0) && (i < NRTQS)); - - rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; - - if (!sched_rt_runq_strict_priority) { - int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed); - if (ed_index != i) { - assert((ed_index >= 0) && (ed_index < NRTQS)); - rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index]; - - thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links); - thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); - - if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) { - /* choose the earliest deadline thread */ - rt_runq = ed_runq; - i = ed_index; - } - } - } - - assert(rt_runq->pri_count > 0); - uint64_t earliest_deadline = RT_DEADLINE_NONE; - uint32_t constraint = RT_CONSTRAINT_NONE; - int ed_index = NOPRI; - thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links); - SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); - if (--rt_runq->pri_count > 0) { - thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); - assert(next_rt != THREAD_NULL); - earliest_deadline = next_rt->realtime.deadline; - constraint = 
next_rt->realtime.constraint; - ed_index = i; - } else { - bitmap_clear(map, i); - } - rt_runq->pri_earliest_deadline = earliest_deadline; - rt_runq->pri_constraint = constraint; - - for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) { - rt_runq = &rt_run_queue->rt_queue_pri[i]; - if (rt_runq->pri_earliest_deadline < earliest_deadline) { - earliest_deadline = rt_runq->pri_earliest_deadline; - constraint = rt_runq->pri_constraint; - ed_index = i; - } - } - os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed); - os_atomic_store(&rt_run_queue->constraint, constraint, relaxed); - os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed); - os_atomic_dec(&rt_run_queue->count, relaxed); - - thread_clear_runq(new_thread); - - CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL); - - return new_thread; -} - -static thread_t -rt_runq_first(rt_queue_t rt_run_queue) -{ - bitmap_t *map = rt_run_queue->bitmap; - int i = bitmap_first(map, NRTQS); - if (i < 0) { - return THREAD_NULL; - } - rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; - thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); - - return next_rt; -} - -static void -rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread) -{ - CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread); - - int pri = thread->sched_pri; - assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI)); - int i = pri - BASEPRI_RTQUEUES; - rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; - bitmap_t *map = rt_run_queue->bitmap; - - assert(rt_runq->pri_count > 0); - uint64_t earliest_deadline = RT_DEADLINE_NONE; - uint32_t constraint = RT_CONSTRAINT_NONE; - int ed_index = NOPRI; - remqueue(&thread->runq_links); - SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); - if (--rt_runq->pri_count > 0) { - thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); - earliest_deadline = next_rt->realtime.deadline; - constraint = next_rt->realtime.constraint; - ed_index = i; - } else { - bitmap_clear(map, i); - } - rt_runq->pri_earliest_deadline = earliest_deadline; - rt_runq->pri_constraint = constraint; - - for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) { - rt_runq = &rt_run_queue->rt_queue_pri[i]; - if (rt_runq->pri_earliest_deadline < earliest_deadline) { - earliest_deadline = rt_runq->pri_earliest_deadline; - constraint = rt_runq->pri_constraint; - ed_index = i; - } - } - os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed); - os_atomic_store(&rt_run_queue->constraint, constraint, relaxed); - os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed); - os_atomic_dec(&rt_run_queue->count, relaxed); - - thread_clear_runq_locked(thread); - - CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL); -} - -rt_queue_t -sched_rtlocal_runq(processor_set_t pset) -{ - return &pset->rt_runq; -} - -void -sched_rtlocal_init(processor_set_t pset) -{ - pset_rt_init(pset); -} - -void -sched_rtlocal_queue_shutdown(processor_t processor) -{ - processor_set_t pset = processor->processor_set; - thread_t thread; - queue_head_t tqueue; - - pset_lock(pset); - - /* We only need to migrate threads if this is the last active or last recommended processor in the pset */ - if (bit_count(pset_available_cpumap(pset)) > 0) { - pset_unlock(pset); - return; - } - - queue_init(&tqueue); - - while (rt_runq_count(pset) > 0) { - thread = rt_runq_dequeue(&pset->rt_runq); - enqueue_tail(&tqueue, 
&thread->runq_links); - } - sched_update_pset_load_average(pset, 0); - pset_update_rt_stealable_state(pset); - pset_unlock(pset); - - qe_foreach_element_safe(thread, &tqueue, runq_links) { - remqueue(&thread->runq_links); - - thread_lock(thread); - - thread_setrun(thread, SCHED_TAILQ); - - thread_unlock(thread); - } -} - -/* Assumes RT lock is not held, and acquires splsched/rt_lock itself */ -void -sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context) -{ - thread_t thread; - - pset_node_t node = &pset_node0; - processor_set_t pset = node->psets; - - spl_t s = splsched(); - do { - while (pset != NULL) { - pset_lock(pset); - - bitmap_t *map = pset->rt_runq.bitmap; - for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) { - rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i]; - - qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) { - if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) { - scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time; - } - } - } - - pset_unlock(pset); - - pset = pset->pset_list; - } - } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); - splx(s); -} - -int64_t -sched_rtlocal_runq_count_sum(void) -{ - pset_node_t node = &pset_node0; - processor_set_t pset = node->psets; - int64_t count = 0; - - do { - while (pset != NULL) { - count += pset->rt_runq.runq_stats.count_sum; - - pset = pset->pset_list; - } - } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); - - return count; -} - -/* - * Called with stealing_pset locked and - * returns with stealing_pset locked - * but the lock will have been dropped - * if a thread is returned. - */ -thread_t -sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline) -{ - if (!sched_allow_rt_steal) { - return THREAD_NULL; - } - pset_map_t pset_map = stealing_pset->node->pset_map; - - bit_clear(pset_map, stealing_pset->pset_id); - - processor_set_t pset = stealing_pset; - - processor_set_t target_pset; - uint64_t target_deadline; - -retry: - target_pset = NULL; - target_deadline = earliest_deadline - rt_deadline_epsilon; - - for (int pset_id = lsb_first(pset_map); pset_id >= 0; pset_id = lsb_next(pset_map, pset_id)) { - processor_set_t nset = pset_array[pset_id]; - - /* - * During startup, while pset_array[] and node->pset_map are still being initialized, - * the update to pset_map may become visible to this cpu before the update to pset_array[]. - * It would be good to avoid inserting a memory barrier here that is only needed during startup, - * so just check nset is not NULL instead. 
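Both the steal path removed here and its replacement in sched_rt.c decide whether a remote pset is worth raiding by comparing that pset's earliest stealable deadline against the local earliest deadline minus rt_deadline_epsilon. A minimal user-space sketch of that comparison, with the underflow guard the new code gets from os_sub_overflow(); the helper name and parameter names are illustrative, not from the patch:

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative only: true when a remote pset's earliest stealable deadline
 * beats the local earliest deadline by more than the epsilon, i.e.
 * remote_deadline < local_deadline - epsilon, with the subtraction clamped
 * at zero so a very early local deadline cannot wrap around.
 */
bool
steal_candidate_is_better(uint64_t local_deadline, uint64_t remote_deadline,
    uint64_t epsilon)
{
	uint64_t target = (local_deadline > epsilon) ? (local_deadline - epsilon) : 0;
	return remote_deadline < target;
}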
- */ - if (nset && (nset->stealable_rt_threads_earliest_deadline < target_deadline)) { - target_deadline = nset->stealable_rt_threads_earliest_deadline; - target_pset = nset; - } - } - - if (target_pset != NULL) { - pset = change_locked_pset(pset, target_pset); - if (pset->stealable_rt_threads_earliest_deadline <= target_deadline) { - thread_t new_thread = rt_runq_dequeue(&pset->rt_runq); - pset_update_rt_stealable_state(pset); - KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0); - - pset = change_locked_pset(pset, stealing_pset); - return new_thread; - } - pset = change_locked_pset(pset, stealing_pset); - earliest_deadline = rt_runq_earliest_deadline(pset); - goto retry; - } - - pset = change_locked_pset(pset, stealing_pset); - return THREAD_NULL; -} - -/* - * pset is locked - */ -thread_t -sched_rt_choose_thread(processor_set_t pset) -{ - processor_t processor = current_processor(); - - if (SCHED(steal_thread_enabled)(pset)) { - do { - bool spill_pending = bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id); - if (spill_pending) { - KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 2); - } - thread_t new_thread = SCHED(rt_steal_thread)(pset, rt_runq_earliest_deadline(pset)); - if (new_thread != THREAD_NULL) { - if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { - KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 3); - } - return new_thread; - } - } while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)); - } - - if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { - KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, 4); - } - - if (rt_runq_count(pset) > 0) { - thread_t new_thread = rt_runq_dequeue(SCHED(rt_runq)(pset)); - assert(new_thread != THREAD_NULL); - pset_update_rt_stealable_state(pset); - return new_thread; - } - - return THREAD_NULL; -} - -/* - * realtime_queue_insert: - * - * Enqueue a thread for realtime execution. 
- */ -static bool -realtime_queue_insert(processor_t processor, processor_set_t pset, thread_t thread) -{ - pset_assert_locked(pset); - - bool preempt = rt_runq_enqueue(SCHED(rt_runq)(pset), thread, processor); - pset_update_rt_stealable_state(pset); - - return preempt; -} - /* * realtime_setrun: * @@ -4925,7 +4170,7 @@ realtime_setrun( /* */ assert(thread->bound_processor == PROCESSOR_NULL); - realtime_queue_insert(chosen_processor, pset, thread); + rt_runq_insert(chosen_processor, pset, thread); processor_t processor = chosen_processor; @@ -4940,7 +4185,7 @@ realtime_setrun( if (thread->sched_pri > processor->current_pri) { preempt = (AST_PREEMPT | AST_URGENT); } else if (thread->sched_pri == processor->current_pri) { - if (deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) { + if (rt_deadline_add(thread->realtime.deadline, rt_deadline_epsilon) < processor->deadline) { preempt = (AST_PREEMPT | AST_URGENT); } } @@ -4996,7 +4241,7 @@ realtime_setrun( } ipi_type[i] = SCHED_IPI_NONE; ipi_processor[i] = PROCESSOR_NULL; - pset_is_locked = !choose_next_rt_processor_for_IPI(pset, chosen_processor, false, &ipi_processor[i], &ipi_type[i]); + rt_choose_next_processor_for_followup_IPI(pset, chosen_processor, &ipi_processor[i], &ipi_type[i]); if (ipi_processor[i] == PROCESSOR_NULL) { break; } @@ -5057,6 +4302,7 @@ sched_ipi_deferred_policy(processor_set_t pset, processor_t dst, return SCHED_IPI_NONE; } +/* Requires the destination pset lock to be held */ sched_ipi_type_t sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event) { @@ -5183,7 +4429,7 @@ static void processor_setrun( processor_t processor, thread_t thread, - integer_t options) + sched_options_t options) { processor_set_t pset = processor->processor_set; pset_assert_locked(pset); @@ -5214,7 +4460,8 @@ processor_setrun( preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; } - if ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE)) { + if ((options & SCHED_STIR_POT) || + ((options & (SCHED_PREEMPT | SCHED_REBALANCE)) == (SCHED_PREEMPT | SCHED_REBALANCE))) { /* * Having gone to the trouble of forcing this thread off a less preferred core, * we should force the preferable core to reschedule immediately to give this @@ -5338,7 +4585,8 @@ processor_t choose_processor_smt( processor_set_t starting_pset, processor_t processor, - thread_t thread) + thread_t thread, + __unused sched_options_t *options) { processor_set_t pset = starting_pset; processor_set_t nset; @@ -5444,6 +4692,14 @@ choose_processor_smt( * core has full use of its resources). 
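From this point the generic choose_processor paths hand realtime threads to a new rt_choose_processor callout on the scheduler dispatch table (declared in the sched_prim.h hunk further down). A toy sketch of that shape of indirection, assuming nothing beyond standard C; every name except the rt_choose_processor concept is invented for illustration:

#include <stdio.h>

/* Toy stand-ins for kernel types; illustrative only. */
typedef int toy_processor_t;
typedef struct { int sched_pri; } toy_thread_t;

#define TOY_BASEPRI_RTQUEUES 97   /* realtime priority threshold, used here only as a cutoff */

struct toy_dispatch_table {
	/* Generic (non-realtime) processor choice. */
	toy_processor_t (*choose_processor)(toy_thread_t *thread);
	/* Realtime-specific choice, analogous to the new rt_choose_processor callout. */
	toy_processor_t (*rt_choose_processor)(toy_thread_t *thread);
};

static toy_processor_t toy_choose_generic(toy_thread_t *thread) { (void)thread; return 0; }
static toy_processor_t toy_choose_rt(toy_thread_t *thread) { (void)thread; return 1; }

static const struct toy_dispatch_table toy_sched = {
	.choose_processor = toy_choose_generic,
	.rt_choose_processor = toy_choose_rt,
};

/* Mirrors the new early-out: realtime threads take the rt callout. */
static toy_processor_t
toy_choose(toy_thread_t *thread)
{
	if (thread->sched_pri >= TOY_BASEPRI_RTQUEUES) {
		return toy_sched.rt_choose_processor(thread);
	}
	return toy_sched.choose_processor(thread);
}

int
main(void)
{
	toy_thread_t rt = { .sched_pri = 97 }, ts = { .sched_pri = 31 };
	printf("rt -> cpu %d, timeshare -> cpu %d\n", toy_choose(&rt), toy_choose(&ts));
	return 0;
}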
*/ + assert(pset == starting_pset); + if (thread->sched_pri >= BASEPRI_RTQUEUES) { + return SCHED(rt_choose_processor)(pset, processor, thread); + } + + /* No realtime threads from this point on */ + assert(thread->sched_pri < BASEPRI_RTQUEUES); + integer_t lowest_priority = MAXPRI + 1; integer_t lowest_secondary_priority = MAXPRI + 1; integer_t lowest_unpaired_primary_priority = MAXPRI + 1; @@ -5467,93 +4723,6 @@ choose_processor_smt( lc_processor = processor; } - if (thread->sched_pri >= BASEPRI_RTQUEUES) { - pset_node_t node = pset->node; - bool include_ast_urgent_pending_cpus = false; - cpumap_t ast_urgent_pending; -try_again: - ast_urgent_pending = 0; - int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus; - for (; consider_secondaries < 2; consider_secondaries++) { - pset = change_locked_pset(pset, starting_pset); - do { - cpumap_t available_map = pset_available_cpumap(pset); - if (available_map == 0) { - goto no_available_cpus; - } - - processor = choose_processor_for_realtime_thread_smt(pset, PROCESSOR_NULL, consider_secondaries, false); - if (processor) { - return processor; - } - - if (consider_secondaries) { - processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus); - if (processor) { - /* - * Instead of looping through all the psets to find the global - * furthest deadline processor, preempt the first candidate found. - * The preempted thread will then find any other available far deadline - * processors to preempt. - */ - return processor; - } - - ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask; - - if (rt_runq_count(pset) < lowest_count) { - int cpuid = bit_first(available_map); - assert(cpuid >= 0); - lc_processor = processor_array[cpuid]; - lowest_count = rt_runq_count(pset); - } - } - -no_available_cpus: - nset = next_pset(pset); - - if (nset != starting_pset) { - pset = change_locked_pset(pset, nset); - } - } while (nset != starting_pset); - } - - /* Short cut for single pset nodes */ - if (bit_count(node->pset_map) == 1) { - if (lc_processor) { - pset_assert_locked(lc_processor->processor_set); - return lc_processor; - } - } else { - if (ast_urgent_pending && !include_ast_urgent_pending_cpus) { - /* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */ - include_ast_urgent_pending_cpus = true; - goto try_again; - } - } - - processor = lc_processor; - - if (processor) { - pset = change_locked_pset(pset, processor->processor_set); - /* Check that chosen processor is still usable */ - cpumap_t available_map = pset_available_cpumap(pset); - if (bit_test(available_map, processor->cpu_id)) { - return processor; - } - - /* processor is no longer usable */ - processor = PROCESSOR_NULL; - } - - pset_assert_locked(pset); - pset_unlock(pset); - return PROCESSOR_NULL; - } - - /* No realtime threads from this point on */ - assert(thread->sched_pri < BASEPRI_RTQUEUES); - do { /* * Choose an idle processor, in pset traversal order @@ -5813,7 +4982,7 @@ no_available_cpus: pset_unlock(pset); return PROCESSOR_NULL; } -#else /* CONFIG_SCHED_SMT */ +#else /* !CONFIG_SCHED_SMT */ /* * choose_processor: * @@ -5830,12 +4999,13 @@ processor_t choose_processor( processor_set_t starting_pset, processor_t processor, - thread_t thread) + thread_t thread, + __unused sched_options_t *options) { processor_set_t pset = starting_pset; processor_set_t 
nset; - assert(thread->sched_pri <= MAXPRI); + assert3u(thread->sched_pri, <=, MAXPRI); /* * At this point, we may have a processor hint, and we may have @@ -5908,6 +5078,14 @@ choose_processor( * to replace it. */ + assert(pset == starting_pset); + if (thread->sched_pri >= BASEPRI_RTQUEUES) { + return SCHED(rt_choose_processor)(pset, processor, thread); + } + + /* No realtime threads from this point on */ + assert(thread->sched_pri < BASEPRI_RTQUEUES); + integer_t lowest_priority = MAXPRI + 1; integer_t lowest_count = INT_MAX; processor_t lp_processor = PROCESSOR_NULL; @@ -5925,88 +5103,6 @@ choose_processor( lc_processor = processor; } - if (thread->sched_pri >= BASEPRI_RTQUEUES) { - pset_node_t node = pset->node; - bool include_ast_urgent_pending_cpus = false; - cpumap_t ast_urgent_pending; -try_again: - ast_urgent_pending = 0; - pset = change_locked_pset(pset, starting_pset); - do { - cpumap_t available_map = pset_available_cpumap(pset); - if (available_map == 0) { - goto no_available_cpus; - } - - processor = choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false); - if (processor) { - return processor; - } - - processor = choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus); - if (processor) { - /* - * Instead of looping through all the psets to find the global - * furthest deadline processor, preempt the first candidate found. - * The preempted thread will then find any other available far deadline - * processors to preempt. - */ - return processor; - } - - ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask; - - if (rt_runq_count(pset) < lowest_count) { - int cpuid = bit_first(available_map); - assert(cpuid >= 0); - lc_processor = processor_array[cpuid]; - lowest_count = rt_runq_count(pset); - } - -no_available_cpus: - nset = next_pset(pset); - - if (nset != starting_pset) { - pset = change_locked_pset(pset, nset); - } - } while (nset != starting_pset); - - - /* Short cut for single pset nodes */ - if (bit_count(node->pset_map) == 1) { - if (lc_processor) { - pset_assert_locked(lc_processor->processor_set); - return lc_processor; - } - } else { - if (ast_urgent_pending && !include_ast_urgent_pending_cpus) { - /* See the comment in choose_furthest_deadline_processor_for_realtime_thread() */ - include_ast_urgent_pending_cpus = true; - goto try_again; - } - } - - processor = lc_processor; - - if (processor) { - pset = change_locked_pset(pset, processor->processor_set); - /* Check that chosen processor is still usable */ - cpumap_t available_map = pset_available_cpumap(pset); - if (bit_test(available_map, processor->cpu_id)) { - return processor; - } - - /* processor is no longer usable */ - processor = PROCESSOR_NULL; - } - - pset_assert_locked(pset); - pset_unlock(pset); - return PROCESSOR_NULL; - } - - /* No realtime threads from this point on */ - assert(thread->sched_pri < BASEPRI_RTQUEUES); do { /* @@ -6155,7 +5251,7 @@ no_available_cpus: pset_unlock(pset); return PROCESSOR_NULL; } -#endif /* CHOOSE_PROCESSOR_SMT*/ +#endif /* !CONFIG_SCHED_SMT */ @@ -6359,7 +5455,7 @@ thread_setrun( pset_lock(starting_pset); - processor = SCHED(choose_processor)(starting_pset, processor_hint, thread); + processor = SCHED(choose_processor)(starting_pset, processor_hint, thread, &options); if (processor != PROCESSOR_NULL) { pset = processor->processor_set; pset_assert_locked(pset); @@ -6567,7 +5663,7 @@ update_pending_nonurgent_preemption(processor_t processor, ast_t reason) 
uint64_t deadline = now + nonurgent_preemption_timer_abs; running_timer_enter(processor, RUNNING_TIMER_PREEMPT, NULL, - now, deadline); + deadline, now); return reason; } @@ -6601,7 +5697,7 @@ csw_check_locked( if (rt_runq_count(pset) > 0) { if ((rt_runq_priority(pset) > processor->current_pri) || !processor->first_timeslice) { return check_reason | AST_PREEMPT | AST_URGENT; - } else if (deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) { + } else if (rt_deadline_add(rt_runq_earliest_deadline(pset), rt_deadline_epsilon) < processor->deadline) { return check_reason | AST_PREEMPT | AST_URGENT; } else { return check_reason | AST_PREEMPT; @@ -6770,6 +5866,24 @@ thread_preempt_expire( KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PREEMPT_TIMER_ACTIVE), preempt); } +void +perfcontrol_timer_expire( + timer_call_param_t p0, + __unused timer_call_param_t p1 + ) +{ + processor_t processor = p0; + uint64_t now = mach_absolute_time(); + /* Default behavior is to cancel the timer */ + uint64_t timeout_ticks = EndOfAllTime; + machine_perfcontrol_running_timer_expire(now, 0, processor->cpu_id, &timeout_ticks); + if (timeout_ticks == EndOfAllTime) { + running_timer_clear(processor, RUNNING_TIMER_PERFCONTROL); + } else { + uint64_t deadline = now + timeout_ticks; + running_timer_setup(processor, RUNNING_TIMER_PERFCONTROL, NULL, deadline, now); + } +} /* * set_sched_pri: @@ -6821,8 +5935,11 @@ set_sched_pri( #if CONFIG_SCHED_CLUTCH /* * Since for the clutch scheduler, the thread's bucket determines its runq - * in the hierarchy it is important to update the bucket when the thread + * in the hierarchy, it is important to update the bucket when the thread * lock is held and the thread has been removed from the runq hierarchy. + * + * If the thread's bucket has changed, this will consume sched_tick_delta() + * in order to account CPU time with the correct scheduling bucket. */ SCHED(update_thread_bucket)(thread); @@ -7054,7 +6171,7 @@ thread_run_queue_remove( * Thread is on the RT run queue and we have a lock on * that run queue. */ - rt_runq_remove(SCHED(rt_runq)(pset), thread); + rt_runq_remove(&pset->rt_runq, thread); pset_update_rt_stealable_state(pset); removed = TRUE; @@ -7465,10 +6582,9 @@ static _Atomic uint64_t sched_perfcontrol_callback_deadline; #if defined(CONFIG_SCHED_TIMESHARE_CORE) -static volatile uint64_t sched_maintenance_deadline; +static _Atomic uint64_t sched_maintenance_deadline; +/* Exclusively read/written by sched_timeshare_maintenance_continue */ static uint64_t sched_tick_last_abstime; -static uint64_t sched_tick_delta; -uint64_t sched_tick_max_delta; /* @@ -7480,7 +6596,7 @@ uint64_t sched_tick_max_delta; void sched_timeshare_maintenance_continue(void) { - uint64_t sched_tick_ctime, late_time; + uint64_t sched_tick_ctime, late_time, sched_tick_delta; struct sched_update_scan_context scan_context = { .earliest_bg_make_runnable_time = UINT64_MAX, @@ -7512,7 +6628,6 @@ sched_timeshare_maintenance_continue(void) sched_tick_delta = MIN(sched_tick_delta, SCHED_TICK_MAX_DELTA); sched_tick_last_abstime = sched_tick_ctime; - sched_tick_max_delta = MAX(sched_tick_delta, sched_tick_max_delta); } scan_context.sched_tick_last_abstime = sched_tick_last_abstime; @@ -7524,7 +6639,7 @@ sched_timeshare_maintenance_continue(void) * all processors are idle occur, which rarely occurs in practice. 
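These hunks turn sched_tick and sched_maintenance_deadline into relaxed atomics: a single maintenance thread advances the tick, and scanners only need an untorn snapshot of it. A rough user-space analogue using C11 stdatomic, with illustrative names rather than the kernel's os_atomic wrappers:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Single writer advances the tick; many readers take relaxed snapshots. */
static _Atomic uint32_t demo_tick;

static void
maintenance_pass(uint32_t delta)
{
	/* Relaxed ordering is enough: readers only need some recent, untorn value. */
	atomic_fetch_add_explicit(&demo_tick, delta, memory_order_relaxed);
}

static uint32_t
scanner_snapshot(void)
{
	return atomic_load_explicit(&demo_tick, memory_order_relaxed);
}

int
main(void)
{
	maintenance_pass(1);
	maintenance_pass(3);
	printf("tick = %u\n", scanner_snapshot());
	return 0;
}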
*/ - sched_tick += sched_tick_delta; + os_atomic_add(&sched_tick, (uint32_t)sched_tick_delta, relaxed); update_vm_info(); @@ -7540,6 +6655,7 @@ sched_timeshare_maintenance_continue(void) */ SCHED(thread_update_scan)(&scan_context); + /* rt_runq_scan also records pset bitmasks. */ SCHED(rt_runq_scan)(&scan_context); uint64_t ctime = mach_absolute_time(); @@ -7598,7 +6714,7 @@ static uint64_t sched_maintenance_wakeups; void sched_timeshare_consider_maintenance(uint64_t ctime, bool safe_point) { - uint64_t deadline = sched_maintenance_deadline; + uint64_t deadline = os_atomic_load(&sched_maintenance_deadline, relaxed); if (__improbable(ctime >= deadline)) { if (__improbable(current_thread() == sched_maintenance_thread)) { @@ -7728,7 +6844,7 @@ thread_update_process_threads(void) const bool should_report_failsafe = thread_should_report_failsafe(thread); const sched_mode_t saved_mode = thread->saved_mode; // if reporting - if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != sched_tick) { + if (!(thread->state & (TH_WAIT)) && thread->sched_stamp != os_atomic_load(&sched_tick, relaxed)) { SCHED(update_priority)(thread); } thread_unlock(thread); @@ -7772,7 +6888,7 @@ runq_scan_thread( { assert_thread_magic(thread); - if (thread->sched_stamp != sched_tick && + if (thread->sched_stamp != os_atomic_load(&sched_tick, relaxed) && thread->sched_mode == TH_MODE_TIMESHARE) { if (thread_update_add_thread(thread) == FALSE) { return TRUE; @@ -8300,6 +7416,8 @@ sched_cpu_init_completed(void) SCHED(cpu_init_completed)(); } + SCHED(rt_init_completed)(); + /* Wait for any cpu that is still starting, and enforce that they eventually complete. */ check_all_cpus_are_done_starting(PROCESSOR_FIRST_BOOT); @@ -8475,6 +7593,41 @@ sched_perfcontrol_update_powered_cores( } } +/* + * The performance controller invokes this method to reevaluate a thread + * placement on the processor cpu_id when the per-core timer expires to force + * a preemption if necessary. + */ +bool +sched_perfcontrol_check_oncore_thread_preemption( + __unused uint64_t flags, + int cpu_id __assert_only) +{ + bool ret = false; + assert(ml_get_interrupts_enabled() == false); + + processor_t processor = current_processor(); + thread_t thread = current_thread(); + assert(processor->cpu_id == cpu_id); + + thread_lock(thread); + ast_t preempt = csw_check(thread, processor, AST_NONE); + if (preempt != AST_NONE) { + /* + * TODO: Returning true here is best effort and isn't guaranteed to preempt the thread since thread_select can + * choose to leave the thread on the same processor. Consider using the flags passed in here to callback into + * CLPC before the next scheduling decision point (or sampler tick) if this decision needs to be reevaluated or + * to otherwise adjust this behavior. + */ + ret = true; + ast_on(preempt); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_ONCORE_PREEMPT), thread_tid(thread), processor->cpu_id, 0, 0, 0); + } + thread_unlock(thread); + + return ret; +} + /* * This doesn't just suspend cluster powerdown. 
* It also powers up all the cores and leaves them up, @@ -9878,7 +9031,7 @@ sched_perfcontrol_sfi_set_window(uint64_t window_usecs) } /* - * Set background and maintenance SFI class offtimes + * Set background / maintenance / mitigation SFI class offtimes */ kern_return_t sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs) @@ -9888,9 +9041,11 @@ sched_perfcontrol_sfi_set_bg_offtime(uint64_t offtime_usecs) if (offtime_usecs == 0ULL) { ret = sfi_class_offtime_cancel(SFI_CLASS_MAINTENANCE); ret |= sfi_class_offtime_cancel(SFI_CLASS_DARWIN_BG); + ret |= sfi_class_offtime_cancel(SFI_CLASS_RUNAWAY_MITIGATION); } else { ret = sfi_set_class_offtime(SFI_CLASS_MAINTENANCE, offtime_usecs); ret |= sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, offtime_usecs); + ret |= sfi_set_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, offtime_usecs); } #endif // CONFIG_THREAD_GROUPS return ret; @@ -10118,7 +9273,7 @@ sched_update_pset_avg_execution_time(__unused processor_set_t pset, __unused uin #endif /* CONFIG_SCHED_EDGE */ /* pset is locked */ -static bool +bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor) { int cpuid = processor->cpu_id; @@ -10133,281 +9288,6 @@ processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, proc return bit_test(fasttrack_map, cpuid); } -#if CONFIG_SCHED_SMT -/* pset is locked */ -static processor_t -choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills) -{ -#if defined(__x86_64__) - bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0); -#else - const bool avoid_cpu0 = false; -#endif - cpumap_t cpu_map; - -try_again: - cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map; - if (skip_processor) { - bit_clear(cpu_map, skip_processor->cpu_id); - } - if (skip_spills) { - cpu_map &= ~pset->rt_pending_spill_cpu_mask; - } - - if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { - bit_clear(cpu_map, 0); - } - - cpumap_t primary_map = cpu_map & pset->primary_map; - if (avoid_cpu0) { - primary_map = bit_ror64(primary_map, 1); - } - - int rotid = lsb_first(primary_map); - if (rotid >= 0) { - int cpuid = avoid_cpu0 ? ((rotid + 1) & 63) : rotid; - - processor_t processor = processor_array[cpuid]; - - return processor; - } - - if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) { - goto out; - } - - if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { - /* Also avoid cpu1 */ - bit_clear(cpu_map, 1); - } - - /* Consider secondary processors whose primary is actually running a realtime thread */ - cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1); - if (avoid_cpu0) { - /* Also avoid cpu1 */ - secondary_map = bit_ror64(secondary_map, 2); - } - rotid = lsb_first(secondary_map); - if (rotid >= 0) { - int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid; - - processor_t processor = processor_array[cpuid]; - - return processor; - } - - /* Consider secondary processors */ - secondary_map = cpu_map & ~pset->primary_map; - if (avoid_cpu0) { - /* Also avoid cpu1 */ - secondary_map = bit_ror64(secondary_map, 2); - } - rotid = lsb_first(secondary_map); - if (rotid >= 0) { - int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid; - - processor_t processor = processor_array[cpuid]; - - return processor; - } - - /* - * I was hoping the compiler would optimize - * this away when avoid_cpu0 is const bool false - * but it still complains about the assignmnent - * in that case. 
- */ - if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { -#if defined(__x86_64__) - avoid_cpu0 = false; -#else - assert(0); -#endif - goto try_again; - } - -out: - if (skip_processor) { - return PROCESSOR_NULL; - } - - /* - * If we didn't find an obvious processor to choose, but there are still more CPUs - * not already running realtime threads than realtime threads in the realtime run queue, - * this thread belongs in this pset, so choose some other processor in this pset - * to ensure the thread is enqueued here. - */ - cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map; - if (bit_count(non_realtime_map) > rt_runq_count(pset)) { - cpu_map = non_realtime_map; - assert(cpu_map != 0); - int cpuid = bit_first(cpu_map); - assert(cpuid >= 0); - return processor_array[cpuid]; - } - - if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) { - goto skip_secondaries; - } - - non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map; - if (bit_count(non_realtime_map) > rt_runq_count(pset)) { - cpu_map = non_realtime_map; - assert(cpu_map != 0); - int cpuid = bit_first(cpu_map); - assert(cpuid >= 0); - return processor_array[cpuid]; - } - -skip_secondaries: - return PROCESSOR_NULL; -} -#else /* CONFIG_SCHED_SMT*/ -/* pset is locked */ -static processor_t -choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills) -{ - cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map; - if (skip_processor) { - bit_clear(cpu_map, skip_processor->cpu_id); - } - if (skip_spills) { - cpu_map &= ~pset->rt_pending_spill_cpu_mask; - } - - int rotid = lsb_first(cpu_map); - if (rotid >= 0) { - return processor_array[rotid]; - } - - /* - * If we didn't find an obvious processor to choose, but there are still more CPUs - * not already running realtime threads than realtime threads in the realtime run queue, - * this thread belongs in this pset, so choose some other processor in this pset - * to ensure the thread is enqueued here. - */ - cpumap_t non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map; - if (bit_count(non_realtime_map) > rt_runq_count(pset)) { - cpu_map = non_realtime_map; - assert(cpu_map != 0); - int cpuid = bit_first(cpu_map); - assert(cpuid >= 0); - return processor_array[cpuid]; - } - - return PROCESSOR_NULL; -} -#endif /* CONFIG_SCHED_SMT */ - -/* - * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority. - * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline. - * - * pset is locked. 
- */ -static processor_t -choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus) -{ - uint64_t furthest_deadline = deadline_add(minimum_deadline, rt_deadline_epsilon); - processor_t fd_processor = PROCESSOR_NULL; - int lowest_priority = max_pri; - - cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask; - if (skip_processor) { - bit_clear(cpu_map, skip_processor->cpu_id); - } - if (skip_spills) { - cpu_map &= ~pset->rt_pending_spill_cpu_mask; - } - - for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) { - processor_t processor = processor_array[cpuid]; - - if (processor->current_pri > lowest_priority) { - continue; - } - - if (processor->current_pri < lowest_priority) { - lowest_priority = processor->current_pri; - furthest_deadline = processor->deadline; - fd_processor = processor; - continue; - } - - if (processor->deadline > furthest_deadline) { - furthest_deadline = processor->deadline; - fd_processor = processor; - } - } - - if (fd_processor) { - return fd_processor; - } - - /* - * There is a race condition possible when there are multiple processor sets. - * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candiate CPU, - * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds - * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear - * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find - * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads, - * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling. - * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue. - * But even if we chose the better run queue, we still wouldn't send an IPI in this case.) - * - * The migitation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads - * on the run queue of that pset. 
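The mitigation described in the comment above is implemented as a two-pass search, both in the removed include_ast_urgent_pending_cpus retry and in sched_rt_choose_processor's found_ast_urgent_pending loop later in this patch. A stripped-down sketch of that control flow over toy CPU records; the types and the preemptibility predicate are invented for illustration:

#include <stdbool.h>
#include <stddef.h>

/* Toy stand-in: a CPU has an urgent AST pending or not, and is otherwise
 * preemptible for the incoming realtime thread or not. */
struct toy_cpu {
	bool ast_urgent_pending;
	bool preemptible;
};

/*
 * First pass skips CPUs with a pending urgent AST (they are already being
 * preempted); if that pass finds nothing and at least one such CPU exists,
 * a second pass reconsiders them, closing the race described above.
 */
int
toy_pick_cpu(const struct toy_cpu *cpus, size_t ncpus)
{
	bool saw_urgent_pending = false;

	for (int include_urgent_pending = 0; include_urgent_pending < 2; include_urgent_pending++) {
		if (include_urgent_pending && !saw_urgent_pending) {
			break; /* nothing to reconsider */
		}
		for (size_t i = 0; i < ncpus; i++) {
			if (cpus[i].ast_urgent_pending) {
				saw_urgent_pending = true;
				if (!include_urgent_pending) {
					continue;
				}
			}
			if (cpus[i].preemptible) {
				return (int)i;
			}
		}
	}
	return -1; /* no candidate; the caller would enqueue the thread instead */
}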
- */ - if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) { - cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask; - assert(skip_processor == PROCESSOR_NULL); - assert(skip_spills == false); - - for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) { - processor_t processor = processor_array[cpuid]; - - if (processor->current_pri > lowest_priority) { - continue; - } - - if (processor->current_pri < lowest_priority) { - lowest_priority = processor->current_pri; - furthest_deadline = processor->deadline; - fd_processor = processor; - continue; - } - - if (processor->deadline > furthest_deadline) { - furthest_deadline = processor->deadline; - fd_processor = processor; - } - } - } - - return fd_processor; -} - -/* pset is locked */ -static processor_t -choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries) -{ - (void) consider_secondaries; - bool skip_spills = true; - bool include_ast_urgent_pending_cpus = false; - -#if CONFIG_SCHED_SMT - processor_t next_processor = choose_processor_for_realtime_thread_smt(pset, skip_processor, consider_secondaries, skip_spills); -#else /* CONFIG_SCHED_SMT */ - processor_t next_processor = choose_processor_for_realtime_thread(pset, skip_processor, skip_spills); -#endif /* CONFIG_SCHED_SMT */ - if (next_processor != PROCESSOR_NULL) { - return next_processor; - } - - next_processor = choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus); - return next_processor; -} - #if CONFIG_SCHED_SMT /* pset is locked */ static bool @@ -10588,43 +9468,61 @@ sysctl_task_get_no_smt(void) #endif /* DEBUG || DEVELOPMENT */ #endif /* CONFIG_SCHED_SMT */ -__private_extern__ void +#if __AMP__ +static kern_return_t +pset_cluster_type_from_name_char(char cluster_type_name, pset_cluster_type_t *pset_cluster_type) +{ + switch (cluster_type_name) { + case 'E': + case 'e': + *pset_cluster_type = PSET_AMP_E; + return KERN_SUCCESS; + case 'P': + case 'p': + *pset_cluster_type = PSET_AMP_P; + return KERN_SUCCESS; + default: + return KERN_INVALID_ARGUMENT; + } +} +#endif /* __AMP__ */ + +__private_extern__ kern_return_t thread_soft_bind_cluster_type(thread_t thread, char cluster_type) { #if __AMP__ + kern_return_t kr; spl_t s = splsched(); thread_lock(thread); thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE; - pset_node_t bind_node = PSET_NODE_NULL; - switch (cluster_type) { - case 'e': - case 'E': - if (ecore_node->psets != PROCESSOR_SET_NULL) { - bind_node = ecore_node; + pset_cluster_type_t pset_cluster_type; + kr = pset_cluster_type_from_name_char(cluster_type, &pset_cluster_type); + if (kr == KERN_SUCCESS) { + pset_node_t bind_node = pset_node_for_pset_cluster_type(pset_cluster_type); + if (bind_node != PSET_NODE_NULL) { + thread->th_bound_cluster_id = bind_node->psets->pset_id; + } else { + /* + * The specified cluster type isn't present on the system, + * either because we're too early in boot or because the + * underlying platform lacks that cluster type. This error + * code assumes the latter. 
+ */ + kr = KERN_INVALID_ARGUMENT; } - break; - case 'p': - case 'P': - if (pcore_node->psets != PROCESSOR_SET_NULL) { - bind_node = pcore_node; - } - break; - default: - break; - } - if (bind_node != PSET_NODE_NULL) { - thread->th_bound_cluster_id = bind_node->psets->pset_id; } thread_unlock(thread); splx(s); - if (thread == current_thread()) { + if ((kr == KERN_SUCCESS) && (thread == current_thread())) { /* Trigger a context-switch to get on the newly bound cluster */ thread_block(THREAD_CONTINUE_NULL); } + return kr; #else /* __AMP__ */ (void)thread; (void)cluster_type; + return KERN_SUCCESS; #endif /* __AMP__ */ } @@ -10659,7 +9557,7 @@ thread_soft_bind_cluster_id(thread_t thread, uint32_t cluster_id, thread_bind_op return KERN_FAILURE; } if (options & THREAD_BIND_ELIGIBLE_ONLY) { - if (SCHED(thread_eligible_for_pset(thread, pset)) == false) { + if (SCHED(thread_eligible_for_pset)(thread, pset) == false) { /* Thread is not recommended for the cluster type */ return KERN_INVALID_POLICY; } @@ -10731,8 +9629,9 @@ unbind: } #if __AMP__ + static char -pset_cluster_type_name_char(pset_cluster_type_t pset_type) +pset_cluster_type_to_name_char(pset_cluster_type_t pset_type) { switch (pset_type) { case PSET_AMP_E: @@ -10743,6 +9642,7 @@ pset_cluster_type_name_char(pset_cluster_type_t pset_type) panic("Unexpected AMP pset cluster type %d", pset_type); } } + #endif /* __AMP__ */ extern char sysctl_get_task_cluster_type(void); @@ -10756,7 +9656,7 @@ sysctl_get_task_cluster_type(void) if (!pset_hint) { return '0'; } - return pset_cluster_type_name_char(pset_hint->pset_cluster_type); + return pset_cluster_type_to_name_char(pset_hint->pset_cluster_type); #else /* !__AMP__ */ return '0'; #endif /* __AMP__ */ @@ -10773,7 +9673,7 @@ sysctl_get_bound_cluster_type(void) return '0'; } pset_cluster_type_t pset_type = pset_array[self->th_bound_cluster_id]->pset_cluster_type; - return pset_cluster_type_name_char(pset_type); + return pset_cluster_type_to_name_char(pset_type); } static processor_set_t @@ -10801,40 +9701,55 @@ find_pset_of_type(pset_cluster_type_t t) } #endif /* __AMP__ */ -extern void sysctl_task_set_cluster_type(char cluster_type); -void +extern kern_return_t sysctl_task_set_cluster_type(char cluster_type); +kern_return_t sysctl_task_set_cluster_type(char cluster_type) { - task_t task = current_task(); - processor_set_t pset_hint = PROCESSOR_SET_NULL; - #if __AMP__ - switch (cluster_type) { - case 'e': - case 'E': - pset_hint = find_pset_of_type(PSET_AMP_E); - break; - case 'p': - case 'P': - pset_hint = find_pset_of_type(PSET_AMP_P); - break; - default: - break; - } + kern_return_t kr; + task_t task = current_task(); + pset_cluster_type_t pset_cluster_type; + kr = pset_cluster_type_from_name_char(cluster_type, &pset_cluster_type); + if (kr == KERN_SUCCESS) { + processor_set_t pset_hint = find_pset_of_type(pset_cluster_type); + if (pset_hint) { + task_lock(task); + task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE; + task->pset_hint = pset_hint; + task_unlock(task); - if (pset_hint) { - task_lock(task); - task->t_flags |= TF_USE_PSET_HINT_CLUSTER_TYPE; - task->pset_hint = pset_hint; - task_unlock(task); - - thread_block(THREAD_CONTINUE_NULL); + thread_block(THREAD_CONTINUE_NULL); + return KERN_SUCCESS; + } } + return KERN_INVALID_ARGUMENT; #else (void)cluster_type; - (void)task; - (void)pset_hint; + return KERN_SUCCESS; #endif } +extern kern_return_t sysctl_clutch_thread_group_cpu_time_for_thread(thread_t thread, + int sched_bucket, uint64_t *cpu_stats); + +#if CONFIG_SCHED_CLUTCH + 
+kern_return_t +sysctl_clutch_thread_group_cpu_time_for_thread(thread_t thread, + int sched_bucket, uint64_t *cpu_stats) +{ + return sched_clutch_thread_group_cpu_time_for_thread(thread, sched_bucket, cpu_stats); +} + +#else /* !CONFIG_SCHED_CLUTCH */ + +kern_return_t +sysctl_clutch_thread_group_cpu_time_for_thread(__unused thread_t thread, + __unused int sched_bucket, __unused uint64_t *cpu_stats) +{ + return KERN_NOT_SUPPORTED; +} + +#endif /* !CONFIG_SCHED_CLUTCH */ + #endif /* DEVELOPMENT || DEBUG */ diff --git a/osfmk/kern/sched_prim.h b/osfmk/kern/sched_prim.h index 8deff3d86..34175c1db 100644 --- a/osfmk/kern/sched_prim.h +++ b/osfmk/kern/sched_prim.h @@ -101,19 +101,7 @@ extern void sched_startup(void); extern void sched_timebase_init(void); -extern void pset_rt_init(processor_set_t pset); - -extern void sched_rtlocal_init(processor_set_t pset); - -extern rt_queue_t sched_rtlocal_runq(processor_set_t pset); - -extern void sched_rtlocal_queue_shutdown(processor_t processor); - -extern int64_t sched_rtlocal_runq_count_sum(void); - -extern thread_t sched_rtlocal_steal_thread(processor_set_t stealing_pset, uint64_t earliest_deadline); - -extern thread_t sched_rt_choose_thread(processor_set_t pset); +extern bool processor_is_fast_track_candidate_for_realtime_thread(processor_set_t pset, processor_t processor); extern void sched_check_spill(processor_set_t pset, thread_t thread); @@ -279,6 +267,7 @@ __options_decl(sched_options_t, uint32_t, { SCHED_HEADQ = 0x2, SCHED_PREEMPT = 0x4, SCHED_REBALANCE = 0x8, + SCHED_STIR_POT = 0x10, }); /* Reschedule thread for execution */ @@ -304,9 +293,6 @@ extern void thread_unbind_after_queue_shutdown( extern bool pset_has_stealable_threads( processor_set_t pset); -extern bool pset_has_stealable_rt_threads( - processor_set_t pset); - extern processor_set_t choose_starting_pset( pset_node_t node, thread_t thread, @@ -328,14 +314,16 @@ extern pset_node_t sched_choose_node( extern processor_t choose_processor_smt( processor_set_t pset, processor_t processor, - thread_t thread); -#else /* CONFIG_SCHED_SMT */ + thread_t thread, + sched_options_t *options); +#else /* !CONFIG_SCHED_SMT */ /* Choose the best processor to run a thread */ extern processor_t choose_processor( processor_set_t pset, processor_t processor, - thread_t thread); -#endif /* CONFIG_SCHED_SMT */ + thread_t thread, + sched_options_t *options); +#endif /* !CONFIG_SCHED_SMT */ extern bool sched_SMT_balance( processor_t processor, @@ -373,7 +361,6 @@ struct sched_update_scan_context { }; typedef struct sched_update_scan_context *sched_update_scan_context_t; -extern void sched_rtlocal_runq_scan(sched_update_scan_context_t scan_context); extern void sched_pset_made_schedulable( processor_t processor, @@ -581,7 +568,7 @@ __BEGIN_DECLS #ifdef XNU_KERNEL_PRIVATE -extern void thread_soft_bind_cluster_type(thread_t, char cluster_type); +extern kern_return_t thread_soft_bind_cluster_type(thread_t, char cluster_type); __options_decl(thread_bind_option_t, uint64_t, { /* Unbind a previously cluster bound thread */ @@ -647,10 +634,6 @@ extern void thread_handoff_parameter(thread_t thread, extern struct waitq *assert_wait_queue(event_t event); -extern kern_return_t thread_wakeup_one_with_pri(event_t event, int priority); - -extern thread_t thread_wakeup_identify(event_t event, int priority); - /* * sched_cond_t: * @@ -830,17 +813,17 @@ extern kern_return_t thread_wakeup_nthreads_prim( uint32_t nthreads, wait_result_t result); -#define thread_wakeup(x) \ - thread_wakeup_prim((x), FALSE, THREAD_AWAKENED) 
-#define thread_wakeup_with_result(x, z) \ - thread_wakeup_prim((x), FALSE, (z)) -#define thread_wakeup_one(x) \ - thread_wakeup_prim((x), TRUE, THREAD_AWAKENED) +#define thread_wakeup(x) \ + thread_wakeup_prim((x), FALSE, THREAD_AWAKENED) +#define thread_wakeup_with_result(x, z) \ + thread_wakeup_prim((x), FALSE, (z)) +#define thread_wakeup_one(x) \ + thread_wakeup_prim((x), TRUE, THREAD_AWAKENED) #define thread_wakeup_nthreads(x, nthreads) \ - thread_wakeup_nthreads_prim((x), (nthreads), THREAD_AWAKENED) + thread_wakeup_nthreads_prim((x), (nthreads), THREAD_AWAKENED) #define thread_wakeup_nthreads_with_result(x, nthreads, z) \ - thread_wakeup_nthreads_prim((x), (nthreads), (z)) + thread_wakeup_nthreads_prim((x), (nthreads), (z)) /* Wakeup the specified thread if it is waiting on this event */ extern kern_return_t thread_wakeup_thread(event_t event, thread_t thread); @@ -927,7 +910,9 @@ struct sched_dispatch_table { processor_t (*choose_processor)( processor_set_t pset, processor_t processor, - thread_t thread); + thread_t thread, + sched_options_t *options); + /* * Enqueue a timeshare or fixed priority thread onto the per-processor * runqueue @@ -1029,12 +1014,14 @@ struct sched_dispatch_table { * Called with pset lock held, returns with pset lock unlocked. */ bool (*processor_balance)(processor_t processor, processor_set_t pset); - rt_queue_t (*rt_runq)(processor_set_t pset); - void (*rt_init)(processor_set_t pset); + + processor_t (*rt_choose_processor)(processor_set_t starting_pset, processor_t starting_processor, thread_t thread); + thread_t (*rt_steal_thread)(processor_set_t stealing_pset); + void (*rt_init_pset)(processor_set_t pset); + void (*rt_init_completed)(void); void (*rt_queue_shutdown)(processor_t processor); void (*rt_runq_scan)(sched_update_scan_context_t scan_context); int64_t (*rt_runq_count_sum)(void); - thread_t (*rt_steal_thread)(processor_set_t pset, uint64_t earliest_deadline); uint32_t (*qos_max_parallelism)(int qos, uint64_t options); void (*check_spill)(processor_set_t pset, thread_t thread); diff --git a/osfmk/kern/sched_rt.c b/osfmk/kern/sched_rt.c new file mode 100644 index 000000000..47758d6ab --- /dev/null +++ b/osfmk/kern/sched_rt.c @@ -0,0 +1,1495 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#ifdef KDBG_MACOS_RELEASE +#define KTRC KDBG_MACOS_RELEASE +#else +#define KTRC KDBG_RELEASE +#endif + +#pragma mark - Constants and Tunables + +#if (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS) +#include + +/* + * Tunables controlling how xnu initializes the realtime matrix. CLPC can + * override their effects with sched_perfcontrol interfaces. + */ + +TUNABLE(unsigned int, sched_rt_spill_policy, "sched_rt_spill_policy", 1); + +TUNABLE(unsigned, sched_rt_steal_policy, "sched_rt_steal_policy", 2); +#endif /* (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS) */ + +uint32_t rt_deadline_epsilon; +uint32_t rt_constraint_threshold; +/* epsilon for comparing RT deadlines */ +int rt_deadline_epsilon_us = 100; +uint32_t max_rt_quantum; +uint32_t min_rt_quantum; +int sched_allow_rt_smt = 1; +int sched_rt_runq_strict_priority = false; + +int +sched_get_rt_deadline_epsilon(void) +{ + return rt_deadline_epsilon_us; +} + +void +sched_set_rt_deadline_epsilon(int new_epsilon_us) +{ + rt_deadline_epsilon_us = new_epsilon_us; + + uint64_t abstime; + clock_interval_to_absolutetime_interval(rt_deadline_epsilon_us, NSEC_PER_USEC, &abstime); + assert((abstime >> 32) == 0 && ((rt_deadline_epsilon_us == 0) || (uint32_t)abstime != 0)); + rt_deadline_epsilon = (uint32_t)abstime; +} + +#pragma mark - Initialization + +static int sched_rt_max_clusters = 0; + +void +sched_realtime_timebase_init(void) +{ + uint64_t abstime; + + /* smallest rt computation (50 us) */ + clock_interval_to_absolutetime_interval(50, NSEC_PER_USEC, &abstime); + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + min_rt_quantum = (uint32_t)abstime; + + /* maximum rt computation (50 ms) */ + clock_interval_to_absolutetime_interval( + 50, 1000 * NSEC_PER_USEC, &abstime); + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + max_rt_quantum = (uint32_t)abstime; + + /* constraint threshold for sending backup IPIs (4 ms) */ + clock_interval_to_absolutetime_interval(4, NSEC_PER_MSEC, &abstime); + assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); + rt_constraint_threshold = (uint32_t)abstime; + + /* epsilon for comparing deadlines */ + sched_set_rt_deadline_epsilon(rt_deadline_epsilon_us); +} + +#if CONFIG_SCHED_EDGE +/* forward-declare config utility */ +static void +sched_rt_config_pset_push(processor_set_t pset); +#endif /* CONFIG_SCHED_EDGE */ + +static void +rt_init_completed(void) +{ + /* This should be unified with sched_edge_max_clusters and moved to a common location. */ + sched_rt_max_clusters = ml_get_cluster_count(); + + /* Realtime spill/steal are only supported on platforms with the edge scheduler. */ +#if CONFIG_SCHED_EDGE + /* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. 
*/ + spl_t s = splsched(); + simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); + for (int src_cluster_id = 0; src_cluster_id < sched_rt_max_clusters; src_cluster_id++) { + processor_set_t src_pset = pset_array[src_cluster_id]; + assert3p(src_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */ + + /* For each cluster, set all its outgoing edge parameters */ + for (int dst_cluster_id = 0; dst_cluster_id < sched_rt_max_clusters; dst_cluster_id++) { + if (dst_cluster_id == src_cluster_id) { + continue; + } + processor_set_t dst_pset = pset_array[dst_cluster_id]; + assert3p(dst_pset, !=, PROCESSOR_SET_NULL); /* all psets should be initialized */ + + bool clusters_homogenous = (src_pset->pset_type == dst_pset->pset_type); + if (clusters_homogenous) { + /* Default realtime policy: spill allowed among homogeneous psets. */ + sched_rt_config_set((pset_id_t) src_cluster_id, (pset_id_t) dst_cluster_id, (sched_clutch_edge) { + .sce_migration_allowed = true, + .sce_steal_allowed = true, + .sce_migration_weight = 0, + }); + } else { + /* Default realtime policy: disallow spill among heterogeneous psets. */ + sched_rt_config_set((pset_id_t) src_cluster_id, (pset_id_t) dst_cluster_id, (sched_clutch_edge) { + .sce_migration_allowed = false, + .sce_steal_allowed = false, + .sce_migration_weight = 0, + }); + } + } + } + + + for (pset_id_t pset_id = 0; pset_id < sched_rt_max_clusters; pset_id++) { + sched_rt_config_pset_push(pset_array[pset_id]); + } + + simple_unlock(&sched_available_cores_lock); + splx(s); +#endif /* CONFIG_SCHED_EDGE */ +} + +static void +pset_rt_init(processor_set_t pset) +{ + for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) { + int i = pri - BASEPRI_RTQUEUES; + rt_queue_pri_t *rqi = &pset->rt_runq.rt_queue_pri[i]; + queue_init(&rqi->pri_queue); + rqi->pri_count = 0; + rqi->pri_earliest_deadline = RT_DEADLINE_NONE; + rqi->pri_constraint = RT_CONSTRAINT_NONE; + } + os_atomic_init(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE); + + rt_queue_t rt_runq = &pset->rt_runq; + os_atomic_init(&rt_runq->count, 0); + os_atomic_init(&rt_runq->earliest_deadline, RT_DEADLINE_NONE); + os_atomic_init(&rt_runq->constraint, RT_CONSTRAINT_NONE); + os_atomic_init(&rt_runq->ed_index, NOPRI); + bzero(&rt_runq->bitmap, sizeof(rt_runq->bitmap)); + bzero(&rt_runq->runq_stats, sizeof(rt_runq->runq_stats)); + +#if __AMP__ + /* + * Initialize spill/steal search orders as invalid to prevent spill/steal + * before the matrix is configured. + */ + bzero(pset->sched_rt_edges, sizeof(pset->sched_rt_edges)); + for (pset_id_t i = 0; i < MAX_PSETS - 1; i++) { + pset->sched_rt_spill_search_order.spso_search_order[i] = PSET_ID_INVALID; +#if CONFIG_SCHED_EDGE + pset->sched_rt_steal_search_order.spso_search_order[i] = PSET_ID_INVALID; +#endif /* CONFIG_SCHED_EDGE */ + } +#endif /* __AMP__ */ +} + +#pragma mark - Realtime Scheduler/CLPC interface + +#if CONFIG_SCHED_EDGE +void +sched_rt_config_set( + uint8_t src_pset, + uint8_t dst_pset, + sched_clutch_edge edge_config) +{ + assert(src_pset != dst_pset || !edge_config.sce_migration_allowed); /* No self-edges. 
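sched_rt_matrix_get() and sched_rt_matrix_set() in the functions that follow address the edge configuration as a row-major src x dst matrix, indexed as src_pset * num_psets + dst_pset, with the diagonal (self-edges) unused. A small standalone illustration of that indexing, using a toy edge type in place of sched_clutch_edge:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_NPSETS 3

/* Toy analogue of a migration edge: may threads spill/steal along it? */
struct toy_edge {
	bool migration_allowed;
	bool steal_allowed;
};

/* Row-major flattening, matching src_pset * num_psets + dst_pset. */
static inline size_t
toy_edge_index(uint8_t src, uint8_t dst, uint64_t num_psets)
{
	return (size_t)(src * num_psets + dst);
}

int
main(void)
{
	struct toy_edge matrix[TOY_NPSETS * TOY_NPSETS] = {0};

	/* Allow spill/steal between pset 0 and pset 1, leave pset 2 isolated. */
	matrix[toy_edge_index(0, 1, TOY_NPSETS)] = (struct toy_edge){ true, true };
	matrix[toy_edge_index(1, 0, TOY_NPSETS)] = (struct toy_edge){ true, true };

	for (uint8_t src = 0; src < TOY_NPSETS; src++) {
		for (uint8_t dst = 0; dst < TOY_NPSETS; dst++) {
			printf("%d", matrix[toy_edge_index(src, dst, TOY_NPSETS)].migration_allowed);
		}
		printf("\n");
	}
	return 0;
}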
*/ + os_atomic_store(&pset_array[src_pset]->sched_rt_edges[dst_pset], edge_config, relaxed); +} + +sched_clutch_edge +sched_rt_config_get( + uint8_t src_pset, + uint8_t dst_pset) +{ + return os_atomic_load(&pset_array[src_pset]->sched_rt_edges[dst_pset], relaxed); +} + +void +sched_rt_matrix_get( + sched_clutch_edge *edge_matrix, + bool *edge_requests, + uint64_t num_psets) +{ + uint64_t edge_index = 0; + for (uint8_t src_pset = 0; src_pset < num_psets; src_pset++) { + for (uint8_t dst_pset = 0; dst_pset < num_psets; dst_pset++) { + if (edge_requests[edge_index]) { + edge_matrix[edge_index] = sched_rt_config_get(src_pset, dst_pset); + } + edge_index++; + } + } +} + +/* + * sched_rt_config_pset_push() + * + * After using sched_rt_config_set() to update edge tunables outgoing from a particular source + * pset, this function should be called in order to propagate the updates to derived metadata for + * the pset, such as search orders for outgoing spill and steal. + */ +static void +sched_rt_config_pset_push(processor_set_t pset) +{ + assert3u(pset->pset_id, <, UINT8_MAX); + + sched_pset_search_order_sort_data_t spill_datas[MAX_PSETS - 1], steal_datas[MAX_PSETS - 1]; + uint num_spill_datas = 0, num_steal_datas = 0; + for (pset_id_t other_pset_id = 0; other_pset_id < sched_rt_max_clusters; other_pset_id++) { + if (pset->pset_id == other_pset_id) { + continue; /* No self-edges. */ + } + /* Spill */ + sched_clutch_edge out_edge = sched_rt_config_get((pset_id_t)pset->pset_cluster_id, other_pset_id); + if (out_edge.sce_migration_allowed) { + spill_datas[num_spill_datas++] = (sched_pset_search_order_sort_data_t) { + .spsosd_src_pset = pset, + .spsosd_migration_weight = out_edge.sce_migration_weight, + .spsosd_dst_pset_id = other_pset_id + }; + } + /* Steal */ + sched_clutch_edge in_edge = sched_rt_config_get(other_pset_id, (pset_id_t)pset->pset_cluster_id); + if (in_edge.sce_steal_allowed) { + steal_datas[num_steal_datas++] = (sched_pset_search_order_sort_data_t) { + .spsosd_src_pset = pset, + .spsosd_migration_weight = in_edge.sce_migration_weight, + .spsosd_dst_pset_id = other_pset_id, + }; + } + } + sched_pset_search_order_compute(&pset->sched_rt_spill_search_order, spill_datas, num_spill_datas, sched_edge_search_order_weight_then_locality_cmp); + sched_pset_search_order_compute(&pset->sched_rt_steal_search_order, steal_datas, num_steal_datas, sched_edge_search_order_weight_then_locality_cmp); +} + +void +sched_rt_matrix_set( + sched_clutch_edge *rt_matrix, + bool *edge_changes, + uint64_t num_psets) +{ + /* Hold sched_available_cores_lock to prevent multiple concurrent matrix updates. */ + spl_t s = splsched(); + simple_lock(&sched_available_cores_lock, LCK_GRP_NULL); + + for (uint8_t src_pset_id = 0; src_pset_id < num_psets; src_pset_id++) { + for (uint8_t dst_pset_id = 0; dst_pset_id < num_psets; dst_pset_id++) { + const uint64_t rt_matrix_index = src_pset_id * num_psets + dst_pset_id; + if (edge_changes[rt_matrix_index]) { + sched_rt_config_set(src_pset_id, dst_pset_id, rt_matrix[rt_matrix_index]); + } + } + } + + for (pset_id_t pset_id = 0; pset_id < num_psets; pset_id++) { + sched_rt_config_pset_push(pset_array[pset_id]); + } + + simple_unlock(&sched_available_cores_lock); + splx(s); +} +#endif /* CONFIG_SCHED_EDGE */ + +#pragma mark - Scheduler Callouts + +#if CONFIG_SCHED_SMT +/* + * SMT-aware callout for rt_choose_processor. 
+ */ +processor_t +sched_rtlocal_choose_processor_smt( + processor_set_t starting_pset, + processor_t processor, + thread_t thread) +{ + processor_set_t nset = PROCESSOR_SET_NULL; + processor_set_t pset = starting_pset; + pset_node_t node = pset->node; + + processor_t lc_processor = processor; + integer_t lowest_count = INT_MAX; + if (lc_processor != PROCESSOR_NULL) { + lowest_count = SCHED(processor_runq_count)(processor); + } + + bool include_ast_urgent_pending_cpus = false; + cpumap_t ast_urgent_pending; +try_again: + ast_urgent_pending = 0; + int consider_secondaries = (!pset->is_SMT) || (bit_count(node->pset_map) == 1) || (node->pset_non_rt_primary_map == 0) || include_ast_urgent_pending_cpus; + for (; consider_secondaries < 2; consider_secondaries++) { + pset = change_locked_pset(pset, starting_pset); + do { + cpumap_t available_map = pset_available_cpumap(pset); + if (available_map == 0) { + goto no_available_cpus; + } + + processor = pset_choose_processor_for_realtime_thread_smt(pset, PROCESSOR_NULL, consider_secondaries, false); + if (processor) { + return processor; + } + + if (consider_secondaries) { + processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus); + if (processor) { + /* + * Instead of looping through all the psets to find the global + * furthest deadline processor, preempt the first candidate found. + * The preempted thread will then find any other available far deadline + * processors to preempt. + */ + return processor; + } + + ast_urgent_pending |= pset->pending_AST_URGENT_cpu_mask; + + if (rt_runq_count(pset) < lowest_count) { + int cpuid = bit_first(available_map); + assert(cpuid >= 0); + lc_processor = processor_array[cpuid]; + lowest_count = rt_runq_count(pset); + } + } + +no_available_cpus: + nset = next_pset(pset); + + if (nset != starting_pset) { + pset = change_locked_pset(pset, nset); + } + } while (nset != starting_pset); + } + + /* Short cut for single pset nodes */ + if (bit_count(node->pset_map) == 1) { + if (lc_processor) { + pset_assert_locked(lc_processor->processor_set); + return lc_processor; + } + } else { + if (ast_urgent_pending && !include_ast_urgent_pending_cpus) { + /* See the comment in pset_choose_furthest_deadline_processor_for_realtime_thread() */ + include_ast_urgent_pending_cpus = true; + goto try_again; + } + } + + processor = lc_processor; + + if (processor) { + pset = change_locked_pset(pset, processor->processor_set); + /* Check that chosen processor is still usable */ + cpumap_t available_map = pset_available_cpumap(pset); + if (bit_test(available_map, processor->cpu_id)) { + return processor; + } + + /* processor is no longer usable */ + processor = PROCESSOR_NULL; + } + + pset_assert_locked(pset); + pset_unlock(pset); + return PROCESSOR_NULL; +} +#else /* !CONFIG_SCHED_SMT */ +/* + * Called with thread and starting_pset locked. The returned processor's pset is + * locked on return. + */ +processor_t +sched_rt_choose_processor( + const processor_set_t starting_pset, + processor_t processor, + thread_t thread) +{ + assert3u(thread->sched_pri, >=, BASEPRI_RTQUEUES); + assert3u(thread->sched_pri, <=, MAXPRI); + + /* + * In choose_starting_pset, we found a good candidate pset for this thread. + * Now, we pick the best processor to preempt, if there is one. It is also + * possible that conditions have changed and the thread should spill to + * another pset. 
+ */ + + processor_set_t pset = starting_pset; /* Lock is held on this pset. */ + pset_assert_locked(pset); + +#if __AMP__ + /* + * If there are processors with outstanding urgent preemptions, we consider + * them in a second pass. While we are changing pset locks here, it is + * possible a processor may resolve its outstanding urgent preemption and + * become eligible to run this thread. See comment in + * pset_choose_furthest_deadline_processor_for_realtime_thread(). + */ + bool found_ast_urgent_pending = false; /* Tracks whether any (eligible) processors have pending urgent ASTs. */ + for (int include_ast_urgent_pending_cpus = 0; include_ast_urgent_pending_cpus < 2; include_ast_urgent_pending_cpus++) { + if (include_ast_urgent_pending_cpus && !found_ast_urgent_pending) { + break; /* Skip the second pass. */ + } + + sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT; + while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, ~0, &istate)) { + /* Switch to the next pset. We need to check for null psets because + * we do not use acquire/release semantics for the spill order. */ + processor_set_t nset = pset_array[istate.spis_pset_id]; + if (__improbable(nset == PROCESSOR_SET_NULL)) { + continue; + } + pset = change_locked_pset(pset, nset); + + processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false); + if (processor != PROCESSOR_NULL) { + /* We found a candidate processor on this pset to wake or preempt. */ + pset_assert_locked(processor->processor_set); + return processor; + } + + /* TODO : Policy question of EDF vs targeting idle cores on another pset. */ + processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, include_ast_urgent_pending_cpus); + if (processor) { + /* + * Instead of looping through all the psets to find the global + * furthest deadline processor, preempt the first candidate found. + * The preempted thread will then find any other available far deadline + * processors to preempt. + */ + pset_assert_locked(processor->processor_set); + return processor; + } + + found_ast_urgent_pending = found_ast_urgent_pending || (pset->pending_AST_URGENT_cpu_mask != 0); + } + } + + /* + * There was no obvious (idle or non-realtime) processor to run the thread. + * Instead, do EDF scheduling again on starting_pset, putting the thread on + * the run queue if there is no processor to preempt. + */ + + pset = change_locked_pset(pset, starting_pset); +#endif /* __AMP__ */ + + /* Check (again, for AMP systems) that there is no lower-priority or idle processor. */ + processor = pset_choose_processor_for_realtime_thread(pset, PROCESSOR_NULL, false); + if (processor != PROCESSOR_NULL) { + /* We found a candidate processor on this pset to wake or preempt. */ + pset_assert_locked(processor->processor_set); + return processor; + } + + processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, thread->sched_pri, thread->realtime.deadline, PROCESSOR_NULL, false, true); + if (processor == PROCESSOR_NULL) { + /* Choose an arbitrary available and recommended processor from the pset. + * It won't get preempted anyways, since this thread has a later + * deadline. */ + int processor_id = lsb_first(pset_available_cpumap(pset)); + + /* starting_pset had available, recommended processors coming into + * rt_choose_processor(), but that might have changed after dropping the + * pset lock. 
If there are no such processors, bail out here and let + * sched_edge_migrate_candidate() find a better starting pset. */ + if (processor_id < 0) { + pset_unlock(pset); + return PROCESSOR_NULL; + } + + processor = processor_array[processor_id]; + } + + pset_assert_locked(processor->processor_set); + return processor; +} +#endif /* !CONFIG_SCHED_SMT */ + +#if CONFIG_SCHED_EDGE +/* + * Called with stealing_pset locked and returns with stealing_pset locked but + * the lock will have been dropped if a thread is returned. The lock may have + * been temporarily dropped, even if no thread is returned. + */ +thread_t +sched_rt_steal_thread(processor_set_t stealing_pset) +{ + uint64_t earliest_deadline = rt_runq_earliest_deadline(stealing_pset); + processor_set_t pset = stealing_pset; + + /* Continue searching until there are no steal candidates found in a single iteration. */ + while (true) { + processor_set_t target_pset = NULL; + uint64_t target_deadline; + if (__improbable(os_sub_overflow(earliest_deadline, rt_deadline_epsilon, &target_deadline))) { + target_deadline = 0; + } + + sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT; + while (sched_iterate_psets_ordered(stealing_pset, &stealing_pset->sched_rt_steal_search_order, ~BIT(stealing_pset->pset_id), &istate)) { + assert3s(istate.spis_pset_id, !=, stealing_pset->pset_id); /* stealing_pset's runqueue is drained by sched_rt_choose_processor */ + const processor_set_t nset = pset_array[istate.spis_pset_id]; + /* Check for null because we do not use acquire/release semantics for steal order. */ + if (__improbable(nset == PROCESSOR_SET_NULL)) { + continue; + } + uint64_t nset_deadline = os_atomic_load(&nset->stealable_rt_threads_earliest_deadline, relaxed); + if (nset_deadline < target_deadline) { + target_pset = nset; + target_deadline = nset_deadline; + } + } + + if (target_pset != PROCESSOR_SET_NULL) { + assert3u(target_deadline, !=, RT_DEADLINE_NONE); + + /* target_pset is a candidate for steal. Check again under its pset lock. */ + + pset = change_locked_pset(pset, target_pset); + if (os_atomic_load(&pset->stealable_rt_threads_earliest_deadline, relaxed) <= target_deadline) { + /* Steal the next thread from target_pset's runqueue. */ + thread_t new_thread = rt_runq_dequeue(&pset->rt_runq); + pset_update_rt_stealable_state(pset); + KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(new_thread), pset->pset_id, pset->cpu_set_low, 0); + + pset = change_locked_pset(pset, stealing_pset); + return new_thread; + } else { + /* Failed to steal (another pset stole first). Try again. */ + KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_STEAL) | DBG_FUNC_NONE, (uintptr_t)thread_tid(THREAD_NULL), pset->pset_id, pset->cpu_set_low, 1); + pset = change_locked_pset(pset, stealing_pset); + /* Update earliest_deadline in case it changed while the stealing_pset lock was not held. */ + earliest_deadline = rt_runq_earliest_deadline(pset); + continue; + } + } else { + /* No steal candidates, stop searching. */ + break; + } + } + /* No stealable threads, return with stealing_pset locked. 
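Earlier in sched_rt_steal_thread(), the comparison deadline is earliest_deadline minus rt_deadline_epsilon, clamped to zero on underflow via os_sub_overflow(). A standalone equivalent using the compiler builtin that the kernel macro is understood to wrap; the second helper mirrors the saturating behavior of rt_deadline_add() later in this file:

#include <stdint.h>

/* Subtract e from d, clamping at 0 instead of wrapping on underflow. */
static uint64_t
deadline_sub_clamped(uint64_t d, uint64_t e)
{
    uint64_t diff;
    if (__builtin_sub_overflow(d, e, &diff)) {
        return 0;
    }
    return diff;
}

/* Companion clamp in the spirit of rt_deadline_add(): saturate instead of wrapping on overflow. */
static uint64_t
deadline_add_clamped(uint64_t d, uint64_t e)
{
    uint64_t sum;
    return __builtin_add_overflow(d, e, &sum) ? UINT64_MAX : sum;
}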
*/ + pset = change_locked_pset(pset, stealing_pset); + return THREAD_NULL; +} +#endif /* CONFIG_SCHED_EDGE */ + +/* + * processor's pset is locked, may drop and retake the lock + */ +thread_t +sched_rt_choose_thread(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + pset_assert_locked(pset); + + if (SCHED(rt_steal_thread) != NULL) { + do { + rt_clear_pending_spill(processor, 2); + thread_t new_thread = SCHED(rt_steal_thread)(pset); + /* pset lock may have been dropped and retaken, is currently locked */ + pset_assert_locked(pset); + if (new_thread != THREAD_NULL) { + /* Spill might have been set if the pset lock was dropped in steal. */ + rt_clear_pending_spill(processor, 3); + return new_thread; + } + } while (bit_test(pset->rt_pending_spill_cpu_mask, processor->cpu_id)); + } + rt_clear_pending_spill(processor, 5); + + if (rt_runq_count(pset) > 0) { + thread_t new_thread = rt_runq_dequeue(&pset->rt_runq); + assert(new_thread != THREAD_NULL); + pset_update_rt_stealable_state(pset); + return new_thread; + } + + return THREAD_NULL; +} + +void +sched_rt_init_pset(processor_set_t pset) +{ + pset_rt_init(pset); +} + +void +sched_rt_init_completed(void) +{ + rt_init_completed(); +} + +void +sched_rt_queue_shutdown(processor_t processor) +{ + processor_set_t pset = processor->processor_set; + thread_t thread; + queue_head_t tqueue; + + pset_lock(pset); + + /* We only need to migrate threads if this is the last active or last recommended processor in the pset */ + if (bit_count(pset_available_cpumap(pset)) > 0) { + pset_unlock(pset); + return; + } + + queue_init(&tqueue); + + while (rt_runq_count(pset) > 0) { + thread = rt_runq_dequeue(&pset->rt_runq); + enqueue_tail(&tqueue, &thread->runq_links); + } + sched_update_pset_load_average(pset, 0); + pset_update_rt_stealable_state(pset); + pset_unlock(pset); + + qe_foreach_element_safe(thread, &tqueue, runq_links) { + remqueue(&thread->runq_links); + + thread_lock(thread); + + thread_setrun(thread, SCHED_TAILQ); + + thread_unlock(thread); + } +} + +/* + * Assumes RT lock is not held, and acquires splsched/rt_lock itself. + * Also records tracepoints for pset bitmasks under the pset lock. 
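sched_rt_queue_shutdown() above drains the realtime runqueue into a local queue while the pset lock is held and only calls thread_setrun() on each thread after dropping that lock. A minimal userspace sketch of the same collect-under-lock, process-after-unlock shape (struct item, struct shared_q, and requeue_elsewhere() are stand-ins, not kernel primitives):

#include <pthread.h>
#include <stddef.h>

struct item {                       /* stand-in for a runqueue element */
    struct item *next;
};

struct shared_q {
    pthread_mutex_t lock;
    struct item    *head;
};

static void
requeue_elsewhere(struct item *it)
{
    (void)it;                       /* stand-in for thread_setrun() */
}

static void
drain_and_requeue(struct shared_q *q)
{
    /* Detach everything while holding the lock... */
    pthread_mutex_lock(&q->lock);
    struct item *local = q->head;
    q->head = NULL;
    pthread_mutex_unlock(&q->lock);

    /* ...then do the per-item work without holding it. */
    while (local != NULL) {
        struct item *next = local->next;
        requeue_elsewhere(local);
        local = next;
    }
}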
+ */ +void +sched_rt_runq_scan(sched_update_scan_context_t scan_context) +{ + thread_t thread; + + pset_node_t node = &pset_node0; + processor_set_t pset = node->psets; + + spl_t s = splsched(); + do { + while (pset != NULL) { + pset_lock(pset); + + bitmap_t *map = pset->rt_runq.bitmap; + for (int i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) { + rt_queue_pri_t *rt_runq = &pset->rt_runq.rt_queue_pri[i]; + + qe_foreach_element_safe(thread, &rt_runq->pri_queue, runq_links) { + if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) { + scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time; + } + } + } + + KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_PSET_BITMASKS), + pset->pset_id, + pset->recommended_bitmask, + pset->perfcontrol_cpu_migration_bitmask, + pset->perfcontrol_cpu_preferred_bitmask); + + pset_unlock(pset); + + pset = pset->pset_list; + } + } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); + splx(s); +} + +int64_t +sched_rt_runq_count_sum(void) +{ + pset_node_t node = &pset_node0; + processor_set_t pset = node->psets; + int64_t count = 0; + + do { + while (pset != NULL) { + count += pset->rt_runq.runq_stats.count_sum; + + pset = pset->pset_list; + } + } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL)); + + return count; +} + +#pragma mark - Utilities + +uint64_t +rt_deadline_add(uint64_t d, uint64_t e) +{ + uint64_t sum; + return os_add_overflow(d, e, &sum) ? RT_DEADLINE_NONE : sum; +} + +cpumap_t +pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset) +{ + cpumap_t avail_map = pset_available_cpumap(pset); +#if CONFIG_SCHED_SMT + if (!sched_allow_rt_smt) { + /* + * Secondary CPUs are not allowed to run RT threads, so + * only primary CPUs should be included + */ + avail_map &= pset->primary_map; + } +#endif /* CONFIG_SCHED_SMT */ + + return avail_map & ~pset->realtime_map; +} + +/* pset is locked */ +static processor_t +pset_choose_next_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool consider_secondaries) +{ + (void) consider_secondaries; + bool skip_spills = true; + bool include_ast_urgent_pending_cpus = false; + +#if CONFIG_SCHED_SMT + processor_t next_processor = pset_choose_processor_for_realtime_thread_smt(pset, skip_processor, consider_secondaries, skip_spills); +#else /* CONFIG_SCHED_SMT */ + processor_t next_processor = pset_choose_processor_for_realtime_thread(pset, skip_processor, skip_spills); +#endif /* CONFIG_SCHED_SMT */ + if (next_processor != PROCESSOR_NULL) { + return next_processor; + } + + next_processor = pset_choose_furthest_deadline_processor_for_realtime_thread(pset, max_pri, minimum_deadline, skip_processor, skip_spills, include_ast_urgent_pending_cpus); + return next_processor; +} + +#if CONFIG_SCHED_EDGE +/* + * Realtime Steal Utilities + * + * Realtime steal is only supported on platforms with the edge scheduler. + */ + +/* Update realtime stealable state. 
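pset_available_but_not_running_rt_threads_cpumap() above and rt_pset_has_stealable_threads() just below combine to report stealable work whenever the realtime runqueue is deeper than the set of available CPUs not already running realtime threads. The same arithmetic in a standalone form with a 64-bit cpumap and popcount (the SMT primary-only filter is omitted for brevity):

#include <stdbool.h>
#include <stdint.h>

/*
 * Stealable work exists when the realtime runqueue holds more threads than
 * there are available CPUs not already running realtime threads.
 */
static bool
has_stealable_rt_threads(uint64_t available_map, uint64_t realtime_map, int rt_runq_depth)
{
    uint64_t idle_for_rt = available_map & ~realtime_map;
    return rt_runq_depth > __builtin_popcountll(idle_for_rt);
}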
*/ +void +pset_update_rt_stealable_state(processor_set_t pset) +{ + pset_assert_locked(pset); + if (rt_pset_has_stealable_threads(pset)) { + os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, rt_runq_earliest_deadline(pset), relaxed); + } else { + os_atomic_store(&pset->stealable_rt_threads_earliest_deadline, RT_DEADLINE_NONE, relaxed); + } +} + +bool +rt_pset_has_stealable_threads(processor_set_t pset) +{ + cpumap_t avail_map = pset_available_but_not_running_rt_threads_cpumap(pset); + + return rt_runq_count(pset) > bit_count(avail_map); +} + +/* + * Returns the next processor to IPI for a migrating realtime thread. Realtime + * spill is only supported with the edge scheduler. + * + * Expects starting_pset to be locked. Returns false if starting_pset was never + * unlocked; otherwise, returns true with no lock held. + */ +bool +rt_choose_next_processor_for_spill_IPI( + processor_set_t starting_pset, + processor_t chosen_processor, + processor_t *result_processor, + sched_ipi_type_t *result_ipi_type + ) +{ + assert3p(starting_pset, !=, PROCESSOR_SET_NULL); + assert3p(chosen_processor, !=, PROCESSOR_NULL); + + uint64_t earliest_deadline = rt_runq_earliest_deadline(starting_pset); + int max_pri = rt_runq_priority(starting_pset); + __kdebug_only uint64_t spill_tid = thread_tid(rt_runq_first(&starting_pset->rt_runq)); + processor_set_t pset = starting_pset; /* lock is held on this pset */ + processor_t next_rt_processor = PROCESSOR_NULL; + /* Optimization so caller can avoid unnecessary lock-takes if there are no psets eligible for spill: */ + bool starting_pset_was_unlocked = false; + + cpumap_t candidate_map = ~BIT(starting_pset->pset_id); /* exclude stealing_pset */ + sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT; + while (sched_iterate_psets_ordered(starting_pset, &starting_pset->sched_rt_spill_search_order, candidate_map, &istate)) { + assert3u(starting_pset->pset_id, !=, istate.spis_pset_id); + /* Check for null pset because we do not use acquire/release semantics for spill order. */ + processor_set_t nset = pset_array[istate.spis_pset_id]; + if (__improbable(nset == PROCESSOR_SET_NULL)) { + continue; + } + + /* Make sure the pset is allowed to steal threads from stealing_pset's runqueue. 
*/ + sched_clutch_edge edge = sched_rt_config_get((pset_id_t) starting_pset->pset_id, (pset_id_t) istate.spis_pset_id); + if (istate.spis_pset_id != starting_pset->pset_id && edge.sce_steal_allowed == false) { + continue; + } + pset = change_locked_pset(pset, nset); + starting_pset_was_unlocked = true; + + next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true); + if (next_rt_processor != PROCESSOR_NULL) { + break; + } + } + + if (next_rt_processor != PROCESSOR_NULL) { + if (pset != starting_pset) { + if (bit_set_if_clear(pset->rt_pending_spill_cpu_mask, next_rt_processor->cpu_id)) { + KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_START, + next_rt_processor->cpu_id, pset->rt_pending_spill_cpu_mask, starting_pset->cpu_set_low, spill_tid); + } + } + *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT); + *result_processor = next_rt_processor; + } + if (starting_pset_was_unlocked) { + pset_unlock(pset); + return true; + } else { + return false; + } +} +#endif /* CONFIG_SCHED_EDGE */ + +bool +rt_pset_needs_a_followup_IPI(processor_set_t pset) +{ + int nbackup_cpus = 0; + + if (rt_runq_is_low_latency(pset)) { + nbackup_cpus = sched_rt_n_backup_processors; + } + + int rt_rq_count = rt_runq_count(pset); + + return (rt_rq_count > 0) && ((rt_rq_count + nbackup_cpus - bit_count(pset->pending_AST_URGENT_cpu_mask)) > 0); +} + +/* + * Returns the next processor to IPI as a followup for low-latency realtime + * threads on the runqueue. + * + * pset should be locked, and stays locked the whole time. + */ +void +rt_choose_next_processor_for_followup_IPI( + processor_set_t pset, + processor_t chosen_processor, + processor_t *result_processor, + sched_ipi_type_t *result_ipi_type) +{ + uint64_t earliest_deadline = rt_runq_earliest_deadline(pset); + int max_pri = rt_runq_priority(pset); + processor_t next_rt_processor = pset_choose_next_processor_for_realtime_thread(pset, max_pri, earliest_deadline, chosen_processor, true); + if (next_rt_processor != PROCESSOR_NULL) { + *result_ipi_type = sched_ipi_action(next_rt_processor, NULL, SCHED_IPI_EVENT_RT_PREEMPT); + *result_processor = next_rt_processor; + } +} + +#if CONFIG_SCHED_SMT +extern int sched_avoid_cpu0; +extern int sched_allow_rt_smt; + +/* pset is locked */ +processor_t +pset_choose_processor_for_realtime_thread_smt(processor_set_t pset, processor_t skip_processor, bool consider_secondaries, bool skip_spills) +{ +#if defined(__x86_64__) + bool avoid_cpu0 = sched_avoid_cpu0 && bit_test(pset->cpu_bitmask, 0); +#else + const bool avoid_cpu0 = false; +#endif + cpumap_t cpu_map; + +try_again: + cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map; + if (skip_processor) { + bit_clear(cpu_map, skip_processor->cpu_id); + } + if (skip_spills) { + cpu_map &= ~pset->rt_pending_spill_cpu_mask; + } + + if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { + bit_clear(cpu_map, 0); + } + + cpumap_t primary_map = cpu_map & pset->primary_map; + if (avoid_cpu0) { + primary_map = bit_ror64(primary_map, 1); + } + + int rotid = lsb_first(primary_map); + if (rotid >= 0) { + int cpuid = avoid_cpu0 ? 
((rotid + 1) & 63) : rotid; + + processor_t processor = processor_array[cpuid]; + + return processor; + } + + if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) { + goto out; + } + + if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { + /* Also avoid cpu1 */ + bit_clear(cpu_map, 1); + } + + /* Consider secondary processors whose primary is actually running a realtime thread */ + cpumap_t secondary_map = cpu_map & ~pset->primary_map & (pset->realtime_map << 1); + if (avoid_cpu0) { + /* Also avoid cpu1 */ + secondary_map = bit_ror64(secondary_map, 2); + } + rotid = lsb_first(secondary_map); + if (rotid >= 0) { + int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid; + + processor_t processor = processor_array[cpuid]; + + return processor; + } + + /* Consider secondary processors */ + secondary_map = cpu_map & ~pset->primary_map; + if (avoid_cpu0) { + /* Also avoid cpu1 */ + secondary_map = bit_ror64(secondary_map, 2); + } + rotid = lsb_first(secondary_map); + if (rotid >= 0) { + int cpuid = avoid_cpu0 ? ((rotid + 2) & 63) : rotid; + + processor_t processor = processor_array[cpuid]; + + return processor; + } + + /* + * I was hoping the compiler would optimize + * this away when avoid_cpu0 is const bool false + * but it still complains about the assignmnent + * in that case. + */ + if (avoid_cpu0 && (sched_avoid_cpu0 == 2)) { +#if defined(__x86_64__) + avoid_cpu0 = false; +#else + assert(0); +#endif + goto try_again; + } + +out: + if (skip_processor) { + return PROCESSOR_NULL; + } + + /* + * If we didn't find an obvious processor to choose, but there are still more CPUs + * not already running realtime threads than realtime threads in the realtime run queue, + * this thread belongs in this pset, so choose some other processor in this pset + * to ensure the thread is enqueued here. + */ + cpumap_t non_realtime_map = pset_available_cpumap(pset) & pset->primary_map & ~pset->realtime_map; + if (bit_count(non_realtime_map) > rt_runq_count(pset)) { + cpu_map = non_realtime_map; + assert(cpu_map != 0); + int cpuid = bit_first(cpu_map); + assert(cpuid >= 0); + return processor_array[cpuid]; + } + + if (!pset->is_SMT || !sched_allow_rt_smt || !consider_secondaries) { + goto skip_secondaries; + } + + non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map; + if (bit_count(non_realtime_map) > rt_runq_count(pset)) { + cpu_map = non_realtime_map; + assert(cpu_map != 0); + int cpuid = bit_first(cpu_map); + assert(cpuid >= 0); + return processor_array[cpuid]; + } + +skip_secondaries: + return PROCESSOR_NULL; +} +#else /* !CONFIG_SCHED_SMT*/ +/* pset is locked */ +processor_t +pset_choose_processor_for_realtime_thread(processor_set_t pset, processor_t skip_processor, bool skip_spills) +{ + cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask & ~pset->realtime_map; + if (skip_processor) { + bit_clear(cpu_map, skip_processor->cpu_id); + } + if (skip_spills) { + cpu_map &= ~pset->rt_pending_spill_cpu_mask; + } + + int rotid = lsb_first(cpu_map); + if (rotid >= 0) { + return processor_array[rotid]; + } + + /* + * If we didn't find an obvious processor to choose, but there are still more CPUs + * not already running realtime threads than realtime threads in the realtime run queue, + * this thread belongs in this pset, so choose some other processor in this pset + * to ensure the thread is enqueued here. 
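The SMT variant above deprioritizes cpu0 (and cpu1 for secondaries) by rotating the candidate mask right before taking the lowest set bit, then mapping the result back with ((rotid + 1) & 63). A standalone demonstration of why that works, modelling bit_ror64() and lsb_first() with a plain rotate and a count-trailing-zeros builtin:

#include <stdint.h>

/* Rotate a 64-bit map right by n bits. */
static uint64_t
ror64(uint64_t x, unsigned n)
{
    n &= 63;
    return n ? ((x >> n) | (x << (64 - n))) : x;
}

/*
 * Pick the lowest set bit while treating bit 0 as the least preferred choice:
 * after a rotate-right by 1 the original bit 0 becomes bit 63, so it is only
 * chosen when nothing else is set; (rotid + 1) & 63 maps the index back.
 */
static int
pick_avoiding_cpu0(uint64_t cpu_map)
{
    if (cpu_map == 0) {
        return -1;
    }
    int rotid = __builtin_ctzll(ror64(cpu_map, 1));
    return (rotid + 1) & 63;
}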
+ */ + cpumap_t non_realtime_map = pset_available_cpumap(pset) & ~pset->realtime_map; + if (bit_count(non_realtime_map) > rt_runq_count(pset)) { + cpu_map = non_realtime_map; + assert(cpu_map != 0); + int cpuid = bit_first(cpu_map); + assert(cpuid >= 0); + return processor_array[cpuid]; + } + + return PROCESSOR_NULL; +} +#endif /* !CONFIG_SCHED_SMT */ + +/* + * Choose the processor with (1) the lowest priority less than max_pri and (2) the furthest deadline for that priority. + * If all available processors are at max_pri, choose the furthest deadline that is greater than minimum_deadline. + * + * pset is locked. + */ +processor_t +pset_choose_furthest_deadline_processor_for_realtime_thread(processor_set_t pset, int max_pri, uint64_t minimum_deadline, processor_t skip_processor, bool skip_spills, bool include_ast_urgent_pending_cpus) +{ + uint64_t furthest_deadline = rt_deadline_add(minimum_deadline, rt_deadline_epsilon); + processor_t fd_processor = PROCESSOR_NULL; + int lowest_priority = max_pri; + + cpumap_t cpu_map = pset_available_cpumap(pset) & ~pset->pending_AST_URGENT_cpu_mask; + if (skip_processor) { + bit_clear(cpu_map, skip_processor->cpu_id); + } + if (skip_spills) { + cpu_map &= ~pset->rt_pending_spill_cpu_mask; + } + + for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) { + processor_t processor = processor_array[cpuid]; + + if (processor->current_pri > lowest_priority) { + continue; + } + + if (processor->current_pri < lowest_priority) { + lowest_priority = processor->current_pri; + furthest_deadline = processor->deadline; + fd_processor = processor; + continue; + } + + if (processor->deadline > furthest_deadline) { + furthest_deadline = processor->deadline; + fd_processor = processor; + } + } + + if (fd_processor) { + return fd_processor; + } + + /* + * There is a race condition possible when there are multiple processor sets. + * choose_processor() takes pset lock A, sees the pending_AST_URGENT_cpu_mask set for a processor in that set and finds no suitable candiate CPU, + * so it drops pset lock A and tries to take pset lock B. Meanwhile the pending_AST_URGENT_cpu_mask CPU is looking for a thread to run and holds + * pset lock B. It doesn't find any threads (because the candidate thread isn't yet on any run queue), so drops lock B, takes lock A again to clear + * the pending_AST_URGENT_cpu_mask bit, and keeps running the current (far deadline) thread. choose_processor() now has lock B and can only find + * the lowest count processor in set B so enqueues it on set B's run queue but doesn't IPI anyone. (The lowest count includes all threads, + * near and far deadlines, so will prefer a low count of earlier deadlines to a high count of far deadlines, which is suboptimal for EDF scheduling. + * To make a better choice we would need to know how many threads with earlier deadlines than the candidate thread exist on each pset's run queue. + * But even if we chose the better run queue, we still wouldn't send an IPI in this case.) + * + * The migitation is to also look for suitable CPUs that have their pending_AST_URGENT_cpu_mask bit set where there are no earlier deadline threads + * on the run queue of that pset. 
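pset_choose_furthest_deadline_processor_for_realtime_thread() keeps the candidate with the lowest current priority at or below max_pri and, among candidates at that priority, the furthest deadline, accepting a max_pri candidate only if its deadline beats the minimum-deadline-plus-epsilon floor. The same two-key scan over a plain array (struct cpu_info is a stand-in for the processor fields consulted here):

#include <stddef.h>
#include <stdint.h>

struct cpu_info {                   /* stand-in for the processor fields consulted */
    int      current_pri;
    uint64_t deadline;
};

/*
 * Index of the CPU with the lowest priority at or below max_pri, preferring
 * the furthest deadline among CPUs at that priority. A CPU still at max_pri
 * only qualifies if its deadline is later than the floor. Returns -1 if none.
 */
static int
furthest_deadline_cpu(const struct cpu_info *cpus, size_t ncpus,
    int max_pri, uint64_t deadline_floor)
{
    int lowest_pri = max_pri;
    uint64_t furthest = deadline_floor;
    int chosen = -1;

    for (size_t i = 0; i < ncpus; i++) {
        if (cpus[i].current_pri > lowest_pri) {
            continue;
        }
        if (cpus[i].current_pri < lowest_pri || cpus[i].deadline > furthest) {
            lowest_pri = cpus[i].current_pri;
            furthest   = cpus[i].deadline;
            chosen     = (int)i;
        }
    }
    return chosen;
}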
+ */ + if (include_ast_urgent_pending_cpus && (rt_runq_earliest_deadline(pset) > furthest_deadline)) { + cpu_map = pset_available_cpumap(pset) & pset->pending_AST_URGENT_cpu_mask; + assert(skip_processor == PROCESSOR_NULL); + assert(skip_spills == false); + + for (int cpuid = bit_first(cpu_map); cpuid >= 0; cpuid = bit_next(cpu_map, cpuid)) { + processor_t processor = processor_array[cpuid]; + + if (processor->current_pri > lowest_priority) { + continue; + } + + if (processor->current_pri < lowest_priority) { + lowest_priority = processor->current_pri; + furthest_deadline = processor->deadline; + fd_processor = processor; + continue; + } + + if (processor->deadline > furthest_deadline) { + furthest_deadline = processor->deadline; + fd_processor = processor; + } + } + } + + return fd_processor; +} + +bool +rt_clear_pending_spill(processor_t processor, int reason) +{ + processor_set_t pset = processor->processor_set; + if (bit_clear_if_set(pset->rt_pending_spill_cpu_mask, processor->cpu_id)) { + KTRC(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RT_SIGNAL_SPILL) | DBG_FUNC_END, processor->cpu_id, pset->rt_pending_spill_cpu_mask, 0, reason); + return true; + } else { + return false; + } +} + +#pragma mark - Realtime Runqueues + +#if DEBUG || SCHED_TEST_HARNESS +void +check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread) +{ + bitmap_t *map = rt_run_queue->bitmap; + + uint64_t earliest_deadline = RT_DEADLINE_NONE; + uint32_t constraint = RT_CONSTRAINT_NONE; + int ed_index = NOPRI; + int count = 0; + bool found_thread = false; + + for (int pri = BASEPRI_RTQUEUES; pri <= MAXPRI; pri++) { + int i = pri - BASEPRI_RTQUEUES; + rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; + queue_t queue = &rt_runq->pri_queue; + queue_entry_t iter; + int n = 0; + uint64_t previous_deadline = 0; + qe_foreach(iter, queue) { + thread_t iter_thread = qe_element(iter, struct thread, runq_links); + assert_thread_magic(iter_thread); + if (iter_thread == thread) { + found_thread = true; + } + assert(iter_thread->sched_pri == (i + BASEPRI_RTQUEUES)); + assert(iter_thread->realtime.deadline < RT_DEADLINE_NONE); + assert(iter_thread->realtime.constraint < RT_CONSTRAINT_NONE); + assert(previous_deadline <= iter_thread->realtime.deadline); + n++; + if (iter == queue_first(queue)) { + assert(rt_runq->pri_earliest_deadline == iter_thread->realtime.deadline); + assert(rt_runq->pri_constraint == iter_thread->realtime.constraint); + } + previous_deadline = iter_thread->realtime.deadline; + } + assert(n == rt_runq->pri_count); + if (n == 0) { + assert(bitmap_test(map, i) == false); + assert(rt_runq->pri_earliest_deadline == RT_DEADLINE_NONE); + assert(rt_runq->pri_constraint == RT_CONSTRAINT_NONE); + } else { + assert(bitmap_test(map, i) == true); + } + if (rt_runq->pri_earliest_deadline < earliest_deadline) { + earliest_deadline = rt_runq->pri_earliest_deadline; + constraint = rt_runq->pri_constraint; + ed_index = i; + } + count += n; + } + assert(os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed) == earliest_deadline); + assert(os_atomic_load(&rt_run_queue->count, relaxed) == count); + assert(os_atomic_load(&rt_run_queue->constraint, relaxed) == constraint); + assert(os_atomic_load(&rt_run_queue->ed_index, relaxed) == ed_index); + if (thread) { + assert(found_thread); + } +} +#endif /* DEBUG || SCHED_TEST_HARNESS */ + +static bool +rt_runq_enqueue(rt_queue_t rt_run_queue, thread_t thread, processor_t processor) +{ + int pri = thread->sched_pri; + assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI)); + int i 
= pri - BASEPRI_RTQUEUES; + rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; + bitmap_t *map = rt_run_queue->bitmap; + + bitmap_set(map, i); + + queue_t queue = &rt_runq->pri_queue; + uint64_t deadline = thread->realtime.deadline; + bool preempt = false; + bool earliest = false; + + if (queue_empty(queue)) { + enqueue_tail(queue, &thread->runq_links); + preempt = true; + earliest = true; + rt_runq->pri_earliest_deadline = deadline; + rt_runq->pri_constraint = thread->realtime.constraint; + } else { + /* Insert into rt_runq in thread deadline order */ + queue_entry_t iter; + qe_foreach(iter, queue) { + thread_t iter_thread = qe_element(iter, struct thread, runq_links); + assert_thread_magic(iter_thread); + + if (deadline < iter_thread->realtime.deadline) { + if (iter == queue_first(queue)) { + preempt = true; + earliest = true; + rt_runq->pri_earliest_deadline = deadline; + rt_runq->pri_constraint = thread->realtime.constraint; + } + insque(&thread->runq_links, queue_prev(iter)); + break; + } else if (iter == queue_last(queue)) { + enqueue_tail(queue, &thread->runq_links); + break; + } + } + } + if (earliest && (deadline < os_atomic_load_wide(&rt_run_queue->earliest_deadline, relaxed))) { + os_atomic_store_wide(&rt_run_queue->earliest_deadline, deadline, relaxed); + os_atomic_store(&rt_run_queue->constraint, thread->realtime.constraint, relaxed); + os_atomic_store(&rt_run_queue->ed_index, pri - BASEPRI_RTQUEUES, relaxed); + } + + SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); + rt_runq->pri_count++; + os_atomic_inc(&rt_run_queue->count, relaxed); + + thread_set_runq_locked(thread, processor); + + CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, thread); + + return preempt; +} + +uint64_t +rt_runq_earliest_deadline(processor_set_t pset) +{ + return os_atomic_load_wide(&pset->rt_runq.earliest_deadline, relaxed); +} + +/* + * rt_runq_insert: + * + * Enqueue a thread for realtime execution. 
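rt_runq_enqueue() above inserts the thread into its priority band in ascending deadline order, FIFO among equal deadlines, and reports a preemption check when the new thread lands at the head. A compact standalone version of that ordered insert on a singly linked list (struct rt_node is a stand-in, not the kernel queue type):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct rt_node {                    /* stand-in for a queued realtime thread */
    uint64_t        deadline;
    struct rt_node *next;
};

/*
 * Insert node in ascending deadline order, FIFO among equal deadlines.
 * Returns true when it became the new head, i.e. the case in which the
 * enqueue path above asks for a preemption check.
 */
static bool
rt_list_insert(struct rt_node **head, struct rt_node *node)
{
    struct rt_node **prev = head;
    while (*prev != NULL && (*prev)->deadline <= node->deadline) {
        prev = &(*prev)->next;
    }
    node->next = *prev;
    *prev = node;
    return prev == head;
}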
+ */ +bool +rt_runq_insert(processor_t processor, processor_set_t pset, thread_t thread) +{ + pset_assert_locked(pset); + + bool preempt = rt_runq_enqueue(&pset->rt_runq, thread, processor); + pset_update_rt_stealable_state(pset); + + return preempt; +} + +int +rt_runq_count(processor_set_t pset) +{ + return os_atomic_load(&pset->rt_runq.count, relaxed); +} + +int +rt_runq_priority(processor_set_t pset) +{ + pset_assert_locked(pset); + rt_queue_t rt_run_queue = &pset->rt_runq; + + bitmap_t *map = rt_run_queue->bitmap; + int i = bitmap_first(map, NRTQS); + assert(i < NRTQS); + + if (i >= 0) { + return i + BASEPRI_RTQUEUES; + } + + return i; +} + +bool +rt_runq_is_low_latency(processor_set_t pset) +{ + return os_atomic_load(&pset->rt_runq.constraint, relaxed) <= rt_constraint_threshold; +} + +thread_t +rt_runq_dequeue(rt_queue_t rt_run_queue) +{ + bitmap_t *map = rt_run_queue->bitmap; + int i = bitmap_first(map, NRTQS); + assert((i >= 0) && (i < NRTQS)); + + rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; + + if (!sched_rt_runq_strict_priority) { + int ed_index = os_atomic_load(&rt_run_queue->ed_index, relaxed); + if (ed_index != i) { + assert((ed_index >= 0) && (ed_index < NRTQS)); + rt_queue_pri_t *ed_runq = &rt_run_queue->rt_queue_pri[ed_index]; + + thread_t ed_thread = qe_queue_first(&ed_runq->pri_queue, struct thread, runq_links); + thread_t hi_thread = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); + + if (ed_thread->realtime.computation + hi_thread->realtime.computation + rt_deadline_epsilon < hi_thread->realtime.constraint) { + /* choose the earliest deadline thread */ + rt_runq = ed_runq; + i = ed_index; + } + } + } + + assert(rt_runq->pri_count > 0); + uint64_t earliest_deadline = RT_DEADLINE_NONE; + uint32_t constraint = RT_CONSTRAINT_NONE; + int ed_index = NOPRI; + thread_t new_thread = qe_dequeue_head(&rt_runq->pri_queue, struct thread, runq_links); + SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); + if (--rt_runq->pri_count > 0) { + thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); + assert(next_rt != THREAD_NULL); + earliest_deadline = next_rt->realtime.deadline; + constraint = next_rt->realtime.constraint; + ed_index = i; + } else { + bitmap_clear(map, i); + } + rt_runq->pri_earliest_deadline = earliest_deadline; + rt_runq->pri_constraint = constraint; + + for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) { + rt_runq = &rt_run_queue->rt_queue_pri[i]; + if (rt_runq->pri_earliest_deadline < earliest_deadline) { + earliest_deadline = rt_runq->pri_earliest_deadline; + constraint = rt_runq->pri_constraint; + ed_index = i; + } + } + os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed); + os_atomic_store(&rt_run_queue->constraint, constraint, relaxed); + os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed); + os_atomic_dec(&rt_run_queue->count, relaxed); + + thread_clear_runq(new_thread); + + CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL); + + return new_thread; +} + +thread_t +rt_runq_first(rt_queue_t rt_run_queue) +{ + bitmap_t *map = rt_run_queue->bitmap; + int i = bitmap_first(map, NRTQS); + if (i < 0) { + return THREAD_NULL; + } + rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; + thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); + + return next_rt; +} + +void +rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread) +{ + CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, 
thread); + + int pri = thread->sched_pri; + assert((pri >= BASEPRI_RTQUEUES) && (pri <= MAXPRI)); + int i = pri - BASEPRI_RTQUEUES; + rt_queue_pri_t *rt_runq = &rt_run_queue->rt_queue_pri[i]; + bitmap_t *map = rt_run_queue->bitmap; + + assert(rt_runq->pri_count > 0); + uint64_t earliest_deadline = RT_DEADLINE_NONE; + uint32_t constraint = RT_CONSTRAINT_NONE; + int ed_index = NOPRI; + remqueue(&thread->runq_links); + SCHED_STATS_RUNQ_CHANGE(&rt_run_queue->runq_stats, os_atomic_load(&rt_run_queue->count, relaxed)); + if (--rt_runq->pri_count > 0) { + thread_t next_rt = qe_queue_first(&rt_runq->pri_queue, struct thread, runq_links); + earliest_deadline = next_rt->realtime.deadline; + constraint = next_rt->realtime.constraint; + ed_index = i; + } else { + bitmap_clear(map, i); + } + rt_runq->pri_earliest_deadline = earliest_deadline; + rt_runq->pri_constraint = constraint; + + for (i = bitmap_first(map, NRTQS); i >= 0; i = bitmap_next(map, i)) { + rt_runq = &rt_run_queue->rt_queue_pri[i]; + if (rt_runq->pri_earliest_deadline < earliest_deadline) { + earliest_deadline = rt_runq->pri_earliest_deadline; + constraint = rt_runq->pri_constraint; + ed_index = i; + } + } + os_atomic_store_wide(&rt_run_queue->earliest_deadline, earliest_deadline, relaxed); + os_atomic_store(&rt_run_queue->constraint, constraint, relaxed); + os_atomic_store(&rt_run_queue->ed_index, ed_index, relaxed); + os_atomic_dec(&rt_run_queue->count, relaxed); + + thread_clear_runq_locked(thread); + + CHECK_RT_RUNQ_CONSISTENCY(rt_run_queue, THREAD_NULL); +} diff --git a/osfmk/kern/sched_rt.h b/osfmk/kern/sched_rt.h new file mode 100644 index 000000000..ddd9a1f9f --- /dev/null +++ b/osfmk/kern/sched_rt.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_SCHED_RT_H_ +#define _KERN_SCHED_RT_H_ + +#include +#include +#include +#include + +__BEGIN_DECLS + +#pragma mark - Constants and Tunables + +#if (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS) +extern unsigned sched_rt_spill_policy; +extern unsigned sched_rt_steal_policy; +#endif /* (DEVELOPMENT || DEBUG || SCHED_TEST_HARNESS) */ + +extern uint32_t rt_deadline_epsilon; +extern uint32_t rt_constraint_threshold; +extern int sched_rt_runq_strict_priority; +extern int sched_allow_rt_smt; + +#pragma mark - Initialization + +void sched_realtime_timebase_init(void); + +/* Initialize realtime runqueues for the given pset. */ +void sched_rt_init_pset(processor_set_t pset); + +/* Called once all psets are initialized. */ +void sched_rt_init_completed(void); + +#if CONFIG_SCHED_EDGE +#pragma mark - Realtime Scheduler-CLPC Interface + +/* + * The realtime scheduler uses edges between psets to define policies + * regarding migration and steal operations, similar to the edge scheduler. + * The weights define an explicit search order for the scheduler to identify + * alternative psets when a realtime thread's preferred pset is overloaded. + * + * The matrix can be directly manipulated with + * sched_rt_config_set()/sched_rt_config_get(), but the preferred interface for + * updates is to call sched_rt_matrix_set(), which will update cached values + * computed from the matrix. + */ + +void sched_rt_config_set(pset_id_t src_pset, pset_id_t dst_pset, sched_clutch_edge edge_config); +sched_clutch_edge sched_rt_config_get(pset_id_t src_pset, pset_id_t dst_pset); + +/* + * sched_rt_matrix_get()/sched_rt_matrix_set() + * + * Selectively retrieve (or update, respectively) multiple edges in the realtime + * matrix. The realtime spill order is recomputed for every pset with a changed + * outgoing edge. + * + * The matrix provided should be `num_psets * num_psets`, where `num_psets` + * is equal to `sched_edge_max_clusters`. Like the Edge matrix, it is indexed + * first by source pset (major), then by destination pset (minor). + */ + +void sched_rt_matrix_get(sched_clutch_edge *rt_matrix, bool *edge_requests, uint64_t num_psets); +void sched_rt_matrix_set(sched_clutch_edge *rt_matrix, bool *edge_changes, uint64_t num_psets); + +#endif /* CONFIG_SCHED_EDGE */ + +#pragma mark - Scheduler Callouts + +#if CONFIG_SCHED_SMT +/* SMT-aware callout for rt_choose_processor. */ +processor_t sched_rtlocal_choose_processor_smt(processor_set_t starting_pset, processor_t processor, thread_t thread); +#else /* !CONFIG_SCHED_SMT */ +processor_t sched_rt_choose_processor(processor_set_t starting_pset, processor_t processor, thread_t thread); +#endif /* !CONFIG_SCHED_SMT */ + +#if CONFIG_SCHED_EDGE +thread_t sched_rt_steal_thread(processor_set_t stealing_pset); +#endif /* CONFIG_SCHED_EDGE */ +thread_t sched_rt_choose_thread(processor_t processor); + +void sched_rt_queue_shutdown(processor_t processor); + +void sched_rt_runq_scan(sched_update_scan_context_t scan_context); + +int64_t sched_rt_runq_count_sum(void); + +#pragma mark - Utilities + +/* + * We are in the process of migrating realtime scheduler code into sched_rt.c + * to make it unit-testable in isolation. + * + * For the time being, these methods are made accessible to code that include + * sched_rt.h. They will be made static members of sched_rt.c as soon as + * practicable. 
+ */ +uint64_t rt_deadline_add(uint64_t d, uint64_t e); + +cpumap_t pset_available_but_not_running_rt_threads_cpumap(processor_set_t pset); + +processor_t +pset_choose_furthest_deadline_processor_for_realtime_thread( + processor_set_t pset, + int max_pri, + uint64_t minimum_deadline, + processor_t skip_processor, + bool skip_spills, + bool include_ast_urgent_pending_cpus); + +#if CONFIG_SCHED_SMT +processor_t pset_choose_processor_for_realtime_thread_smt( + processor_set_t pset, + processor_t skip_processor, + bool consider_secondaries, + bool skip_spills); +#else /* !CONFIG_SCHED_SMT */ +processor_t +pset_choose_processor_for_realtime_thread( + processor_set_t pset, + processor_t skip_processor, + bool skip_spills); +#endif /* !CONFIG_SCHED_SMT */ + +#if CONFIG_SCHED_EDGE +bool rt_pset_has_stealable_threads(processor_set_t pset); +void pset_update_rt_stealable_state(processor_set_t pset); +/* Realtime spill is only supported on platforms with the edge scheduler. */ +bool rt_choose_next_processor_for_spill_IPI(processor_set_t starting_pset, processor_t chosen_processor, processor_t *result_processor, sched_ipi_type_t *result_ipi_type); +#else /* !CONFIG_SCHED_EDGE */ +#define pset_update_rt_stealable_state(x) do {(void) x;} while (0) +#endif /* !CONFIG_SCHED_EDGE */ + +bool rt_pset_needs_a_followup_IPI(processor_set_t pset); +void rt_choose_next_processor_for_followup_IPI(processor_set_t pset, processor_t chosen_processor, processor_t *result_processor, sched_ipi_type_t *result_ipi_type); + +bool rt_clear_pending_spill(processor_t processor, int reason); + +#pragma mark - Realtime Runqueues + +#if DEBUG || SCHED_TEST_HARNESS +void check_rt_runq_consistency(rt_queue_t rt_run_queue, thread_t thread); +#define CHECK_RT_RUNQ_CONSISTENCY(q, th) check_rt_runq_consistency(q, th) +#else /* !(DEBUG || SCHED_TEST_HARNESS) */ +#define CHECK_RT_RUNQ_CONSISTENCY(q, th) do {} while (0) +#endif /* !(DEBUG || SCHED_TEST_HARNESS) */ + +int rt_runq_count(processor_set_t); +thread_t rt_runq_dequeue(rt_queue_t rt_run_queue); +uint64_t rt_runq_earliest_deadline(processor_set_t); +thread_t rt_runq_first(rt_queue_t rt_run_queue); +bool rt_runq_insert(processor_t processor, processor_set_t pset, thread_t thread); +bool rt_runq_is_low_latency(processor_set_t pset); +int rt_runq_priority(processor_set_t pset); +void rt_runq_remove(rt_queue_t rt_run_queue, thread_t thread); + +__END_DECLS + +#endif /* _KERN_SCHED_RT_H_ */ diff --git a/osfmk/kern/sfi.c b/osfmk/kern/sfi.c index 235c77046..fb3820308 100644 --- a/osfmk/kern/sfi.c +++ b/osfmk/kern/sfi.c @@ -171,6 +171,7 @@ SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE); SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE); SFI_CLASS_REGISTER(KERNEL, OPTED_OUT); SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT); +SFI_CLASS_REGISTER(RUNAWAY_MITIGATION, RUNAWAY_MITIGATION); struct sfi_class_state { uint64_t off_time_usecs; @@ -757,16 +758,17 @@ sfi_thread_classify(thread_t thread) int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE); int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS); int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED); + int runaway_bg = proc_get_effective_task_policy(task, TASK_POLICY_RUNAWAY_MITIGATION); int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG); if (thread_qos == THREAD_QOS_MAINTENANCE) { - return SFI_CLASS_MAINTENANCE; + return runaway_bg ? 
SFI_CLASS_RUNAWAY_MITIGATION : SFI_CLASS_MAINTENANCE; } if (thread_bg || thread_qos == THREAD_QOS_BACKGROUND) { - return SFI_CLASS_DARWIN_BG; + return runaway_bg ? SFI_CLASS_RUNAWAY_MITIGATION : SFI_CLASS_DARWIN_BG; } if (latency_qos != 0) { diff --git a/osfmk/kern/smr.c b/osfmk/kern/smr.c index a8e3532d2..6bf4d38cb 100644 --- a/osfmk/kern/smr.c +++ b/osfmk/kern/smr.c @@ -504,7 +504,7 @@ __smrw_wakeup_and_unlock(struct smr_worker *smrw) assert(!ml_get_interrupts_enabled()); thread = waitq_wakeup64_identify_locked(&smrw->waitq, - __smrw_drain_event(smrw), THREAD_AWAKENED, WAITQ_UNLOCK); + __smrw_drain_event(smrw), WAITQ_UNLOCK); if (thread != THREAD_NULL) { assert(thread == smrw->thread); diff --git a/osfmk/kern/smr_hash.h b/osfmk/kern/smr_hash.h index 19928900c..aabac621e 100644 --- a/osfmk/kern/smr_hash.h +++ b/osfmk/kern/smr_hash.h @@ -501,7 +501,13 @@ smr_hash_array_decode(const struct smr_hash *smrh) uintptr_t ptr = os_atomic_load(&smrh->smrh_array, relaxed); array.smrh_order = (uint8_t)(ptr >> SMRH_ARRAY_ORDER_SHIFT); +#ifndef __BUILDING_XNU_LIBRARY__ + /* when running in kernel space, top bits are supposed to be 0xff*/ ptr |= SMRH_ARRAY_ORDER_MASK; +#else + /* in user-mode top bits need to be 00 */ + ptr &= ~SMRH_ARRAY_ORDER_MASK; +#endif array.smrh_array = (struct smrq_slist_head *)ptr; return array; diff --git a/osfmk/kern/socd_client.c b/osfmk/kern/socd_client.c index 764a32c45..bc95823df 100644 --- a/osfmk/kern/socd_client.c +++ b/osfmk/kern/socd_client.c @@ -48,6 +48,7 @@ typedef struct { static SECURITY_READ_ONLY_LATE(socd_client_cfg_t) socd_client_cfg = {0}; static SECURITY_READ_ONLY_LATE(bool) socd_client_trace_available = false; static SECURITY_READ_WRITE(bool) socd_client_trace_has_sticky_events = false; +static bool PERCPU_DATA(is_in_buffer_write); // = false /* Run-time state */ static struct { @@ -120,6 +121,7 @@ socd_client_trace( vm_offset_t offset; bool has_sticky; uint32_t tries = 0; + bool *is_buf_wr; available = os_atomic_load(&socd_client_trace_available, dependency); @@ -127,6 +129,15 @@ socd_client_trace( return; } + /* is_in_buffer_write is an indicator that the code is in SOCD buffer write routine */ + is_buf_wr = PERCPU_GET(is_in_buffer_write); + if (*is_buf_wr) { + /* If we are here this means previously code already entered SOCD buffer write routine but never exited meaning it caused a panic. + * To avoid recursive panic returning here */ + return; + } + *is_buf_wr = true; + len = os_atomic_load_with_dependency_on(&socd_client_cfg.trace_buff_len, available); offset = os_atomic_load_with_dependency_on(&socd_client_cfg.trace_buff_offset, available); has_sticky = os_atomic_load_with_dependency_on(&socd_client_trace_has_sticky_events, available); @@ -164,6 +175,8 @@ socd_client_trace( break; } + *is_buf_wr = false; + /* Duplicate tracepoint to kdebug */ if (!debug_is_current_cpu_in_panic_state()) { KDBG(debugid, arg1, arg2, arg3, arg4); diff --git a/osfmk/kern/socd_client.h b/osfmk/kern/socd_client.h index 9c2184c8a..7e4135009 100644 --- a/osfmk/kern/socd_client.h +++ b/osfmk/kern/socd_client.h @@ -74,10 +74,17 @@ __BEGIN_DECLS #define SOCD_TRACE_GEN_CLASS_ENUM(entry) SOCD_TRACE_CLASS_##entry, #define SOCD_TRACE_GEN_CODE_ENUM(entry) SOCD_TRACE_CODE_##entry, -/* List of socd trace classes */ +/* * + * List of socd trace classes + * + * XNU: Tracepoints from xnu itself + * WDT: Tracepoints from whatever kext is in charge of the system watchdog + * SOC: Tracepoints relating to SoC-wide events or errors (e.g. 
an SoC thermal sensor reading) + */ #define SOCD_TRACE_FOR_EACH_CLASS(iter) \ iter(XNU) \ - iter(WDT) + iter(WDT) \ + iter(SOC) /* List of xnu trace codes */ #define SOCD_TRACE_FOR_EACH_XNU_CODE(iter) \ diff --git a/osfmk/kern/stack.c b/osfmk/kern/stack.c index 44929c35c..4d8c9ec30 100644 --- a/osfmk/kern/stack.c +++ b/osfmk/kern/stack.c @@ -441,13 +441,13 @@ processor_set_stack_usage( #endif /* DEVELOPMENT || DEBUG */ } -vm_offset_t +__mockable vm_offset_t min_valid_stack_address(void) { return (vm_offset_t)vm_map_min(kernel_map); } -vm_offset_t +__mockable vm_offset_t max_valid_stack_address(void) { return (vm_offset_t)vm_map_max(kernel_map); diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index 7616f2069..5745aa9f7 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -89,6 +89,7 @@ #if CONFIG_SCHED_SFI #include #endif +#include #include #include #include @@ -190,10 +191,6 @@ extern void dtrace_early_init(void); extern void sdt_early_init(void); #endif -#ifdef CONFIG_BTI_TELEMETRY -#include -#endif /* CONFIG_BTI_TELEMETRY */ - // libkern/OSKextLib.cpp extern void OSKextRemoveKextBootstrap(void); @@ -280,7 +277,8 @@ kernel_startup_bootstrap(void) qsort(startup_entries, n, sizeof(struct startup_entry), startup_entry_cmp); -#if !CONFIG_SPTM +#if !CONFIG_SPTM && !defined(__BUILDING_XNU_LIBRARY__) + /* static_if relies on TEXT editing and not supported in user-mode build*/ static_if_init(PE_boot_args()); #endif @@ -347,11 +345,13 @@ kernel_startup_tunable_dt_source_init(const struct startup_tunable_dt_source_spe /* boot-arg overrides. */ - if (PE_parse_boot_argn(spec->boot_arg_name, spec->var_addr, spec->var_len)) { - if (spec->var_is_bool) { - *(bool *)spec->var_addr = *(uint8_t *)spec->var_addr; + if (spec->boot_arg_name != NULL) { + if (PE_parse_boot_argn(spec->boot_arg_name, spec->var_addr, spec->var_len)) { + if (spec->var_is_bool) { + *(bool *)spec->var_addr = *(uint8_t *)spec->var_addr; + } + *spec->source_addr = STARTUP_SOURCE_BOOTPARAM; } - *spec->source_addr = STARTUP_SOURCE_BOOTPARAM; } } @@ -395,9 +395,11 @@ kernel_startup_tunable_dt_init(const struct startup_tunable_dt_spec *spec) /* boot-arg overrides. 
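In the socd_client.c hunk above, socd_client_trace() sets a per-CPU is_in_buffer_write flag before touching the SOCD buffer and clears it afterwards, so a panic raised mid-write cannot recurse back into the writer. A minimal userspace sketch of the same guard, with a thread-local flag standing in for the per-CPU variable (names hypothetical):

#include <stdbool.h>

static _Thread_local bool in_buffer_write;      /* stand-in for the per-CPU flag */

static void
write_trace_record(void)
{
    /* placeholder for the actual buffer write */
}

static void
trace_guarded(void)
{
    if (in_buffer_write) {
        /* Already inside the writer: bail out rather than recurse on a panic path. */
        return;
    }
    in_buffer_write = true;
    write_trace_record();
    in_buffer_write = false;
}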
*/ - if (PE_parse_boot_argn(spec->boot_arg_name, spec->var_addr, spec->var_len)) { - if (spec->var_is_bool) { - *(bool *)spec->var_addr = *(uint8_t *)spec->var_addr; + if (spec->boot_arg_name != NULL) { + if (PE_parse_boot_argn(spec->boot_arg_name, spec->var_addr, spec->var_len)) { + if (spec->var_is_bool) { + *(bool *)spec->var_addr = *(uint8_t *)spec->var_addr; + } } } } @@ -477,6 +479,25 @@ kernel_startup_initialize_upto(startup_subsystem_id_t upto) startup_phase = upto; } +#ifdef __BUILDING_XNU_LIB_UNITTEST__ +/* unit-test initialization needs to pick specific phases */ +void +kernel_startup_initialize_only(startup_subsystem_id_t sysid) +{ + assert(startup_phase < sysid); + struct startup_entry *cur = startup_entry_cur; + while (cur < startup_entries_end && cur->subsystem <= sysid) { + if (cur->subsystem == sysid) { + startup_phase = cur->subsystem - 1; + kernel_startup_log(cur->subsystem); + cur->func(cur->arg); + } + startup_entry_cur = ++cur; + } + startup_phase = sysid; +} +#endif + void kernel_bootstrap(void) { @@ -488,6 +509,10 @@ kernel_bootstrap(void) printf("%s\n", version); /* log kernel version */ +#if HAS_UPSI_FAILURE_INJECTION + check_for_failure_injection(XNU_STAGE_BOOTSTRAP_START); +#endif + scale_setup(); kernel_bootstrap_log("vm_mem_bootstrap"); @@ -514,11 +539,6 @@ kernel_bootstrap(void) ubsan_minimal_init(); #endif -#ifdef CONFIG_BTI_TELEMETRY - kernel_bootstrap_log("BTI exception telemetry runtime init"); - bti_telemetry_init(); -#endif /* CONFIG_BTI_TELEMETRY */ - #if KASAN kernel_bootstrap_log("kasan_late_init"); kasan_late_init(); @@ -669,8 +689,8 @@ kernel_bootstrap_thread(void) { processor_t processor = current_processor(); -#if (DEVELOPMENT || DEBUG) - platform_stall_panic_or_spin(PLATFORM_STALL_XNU_LOCATION_KERNEL_BOOTSTRAP); +#if HAS_UPSI_FAILURE_INJECTION + check_for_failure_injection(XNU_STAGE_SCHEDULER_START); #endif kernel_bootstrap_thread_log("idle_thread_create"); @@ -1064,14 +1084,14 @@ scale_setup(void) pe_serverperfmode = PE_get_default("kern.serverperfmode", &pe_serverperfmode, sizeof(pe_serverperfmode)); if (pe_serverperfmode) { - serverperfmode = pe_serverperfmode; + serverperfmode = (pe_serverperfmode != 0); } #if defined(__LP64__) typeof(task_max) task_max_base = task_max; /* Raise limits for servers with >= 16G */ - if ((serverperfmode != 0) && ((uint64_t)max_mem_actual >= (uint64_t)(16 * 1024 * 1024 * 1024ULL))) { + if (serverperfmode && ((uint64_t)max_mem_actual >= (uint64_t)(16 * 1024 * 1024 * 1024ULL))) { scale = (int)((uint64_t)sane_size / (uint64_t)(8 * 1024 * 1024 * 1024ULL)); /* limit to 128 G */ if (scale > 16) { diff --git a/osfmk/kern/startup.h b/osfmk/kern/startup.h index d95ebc691..ea771f9fc 100644 --- a/osfmk/kern/startup.h +++ b/osfmk/kern/startup.h @@ -42,7 +42,7 @@ __BEGIN_DECLS -#pragma GCC visibility push(hidden) +__exported_push_hidden /*! * @enum startup_subsystem_id_t @@ -53,7 +53,7 @@ __BEGIN_DECLS * * @discussion * Documentation of each subsystem initialization sequence exists in - * @file doc/startup.md. + * @file doc/lifecycle/startup.md. */ __enum_decl(startup_subsystem_id_t, uint32_t, { STARTUP_SUB_NONE = 0, /**< reserved for the startup subsystem */ @@ -186,10 +186,14 @@ __enum_decl(startup_rank_t, uint32_t, { * @description * Code marked with this attribute will be unmapped after kernel lockdown. 
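kernel_startup_initialize_only(), added in the startup.c hunk above, lets a unit-test build run the registered startup callbacks for exactly one subsystem rather than everything up to it. A hedged usage sketch; STARTUP_SUB_TUNABLES is assumed here purely for illustration, and the declarations are taken to come from startup.h:

#ifdef __BUILDING_XNU_LIB_UNITTEST__
/* Hypothetical test fixture: run only the TUNABLES startup callbacks. */
static void
test_setup_tunables(void)
{
    kernel_startup_initialize_only(STARTUP_SUB_TUNABLES);
}
#endif /* __BUILDING_XNU_LIB_UNITTEST__ */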
*/ +#ifndef __BUILDING_XNU_LIBRARY__ #define __startup_func \ __PLACE_IN_SECTION(STARTUP_CODE_SEGSECT) \ __attribute__((cold, visibility("hidden"))) - +#else +/* tester needs some startup function to be visible from outside the XNU library */ +#define __startup_func __unused +#endif /*! * @macro __startup_data * @@ -199,8 +203,12 @@ __enum_decl(startup_rank_t, uint32_t, { * @description * Data marked with this attribute will be unmapped after kernel lockdown. */ +#ifndef __BUILDING_XNU_LIBRARY__ #define __startup_data \ __PLACE_IN_SECTION(STARTUP_DATA_SEGSECT) +#else +#define __startup_data +#endif /*! * @macro __startup_const @@ -325,11 +333,13 @@ __enum_decl(startup_rank_t, uint32_t, { * //. /chosen is by convention the area where * synthesized values not coming from the serialized device tree are * being added, so this provides a way for e.g. the boot-loader to - * set/override tunables. + * set/override tunables. Don't override with a boot-arg if + * TUNABLE_DT_NO_BOOTARG is set. */ __options_decl(tunable_dt_flags_t, uint32_t, { TUNABLE_DT_NONE = 0x00000000, TUNABLE_DT_CHECK_CHOSEN = 0x00000001, + TUNABLE_DT_NO_BOOTARG = 0x00000002, }); /*! @@ -535,11 +545,14 @@ struct machine_timeout_spec { }; extern void -machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *phase_suffix); +machine_timeout_init_with_suffix(const struct machine_timeout_spec *spec, char const *phase_suffix, bool always_enabled); extern void machine_timeout_init(const struct machine_timeout_spec *spec); +extern void +machine_timeout_init_always_enabled(const struct machine_timeout_spec *spec); + #if DEVELOPMENT || DEBUG // Late timeout (re-)initialization, at the end of bsd_init() extern void @@ -547,7 +560,7 @@ machine_timeout_bsd_init(void); #endif /* DEVELOPMENT || DEBUG */ /*! 
- * @macro MACHINE_TIMEOUT and MACHINE_TIMEOUT_DEV_WRITEABLE + * @macro MACHINE_TIMEOUT, MACHINE_TIMEOUT_ALWAYS_ENABLED, and MACHINE_TIMEOUT_DEV_WRITEABLE * * @abstract * Defines a Machine Timeout that can be overridden and @@ -583,7 +596,7 @@ machine_timeout_bsd_init(void); * bool skip_predicate (struct machine_timeout_spec const *) */ -#define _MACHINE_TIMEOUT(var, timeout_name, timeout_default, var_unit, skip_pred) \ +#define _MACHINE_TIMEOUT(var, timeout_name, timeout_default, var_unit, skip_pred, init_fn) \ struct machine_timeout_spec \ __machine_timeout_spec_ ## var = { \ .ptr = &var, \ @@ -593,16 +606,23 @@ machine_timeout_bsd_init(void); .skip_predicate = skip_pred, \ }; \ __STARTUP_ARG(var, __LINE__, TIMEOUTS, STARTUP_RANK_FIRST, \ - machine_timeout_init, &__machine_timeout_spec_ ## var) + init_fn, &__machine_timeout_spec_ ## var) #define MACHINE_TIMEOUT(var, name, default, unit, skip_predicate) \ SECURITY_READ_ONLY_LATE(machine_timeout_t) var = 0; \ - _MACHINE_TIMEOUT(var, name, default, unit, skip_predicate) + _MACHINE_TIMEOUT(var, name, default, unit, skip_predicate, machine_timeout_init) + +/* + * Variant of MACHINE_TIMEOUT that does not get zeroed if wdt == -1 boot arg is set + */ +#define MACHINE_TIMEOUT_ALWAYS_ENABLED(var, name, default, unit) \ + SECURITY_READ_ONLY_LATE(machine_timeout_t) var = 0; \ + _MACHINE_TIMEOUT(var, name, default, unit, NULL, machine_timeout_init_always_enabled) #if DEVELOPMENT || DEBUG #define MACHINE_TIMEOUT_DEV_WRITEABLE(var, name, default, unit, skip_predicate) \ machine_timeout_t var = 0; \ - _MACHINE_TIMEOUT(var, name, default, unit, skip_predicate) + _MACHINE_TIMEOUT(var, name, default, unit, skip_predicate, machine_timeout_init) #else #define MACHINE_TIMEOUT_DEV_WRITEABLE(var, name, default, unit, skip_predicate) \ MACHINE_TIMEOUT(var, name, default, unit, skip_predicate) @@ -781,7 +801,7 @@ __BEGIN_DECLS .dt_base = __startup_TUNABLES_dt_base_ ## var, \ .dt_name = __startup_TUNABLES_dt_name_ ## var, \ .dt_chosen_override = (bool)((flags) & TUNABLE_DT_CHECK_CHOSEN), \ - .boot_arg_name = __startup_TUNABLES_name_ ## var, \ + .boot_arg_name = (flags & TUNABLE_DT_NO_BOOTARG) ? NULL : __startup_TUNABLES_name_ ## var, \ .var_addr = (void *)&var, \ .var_len = sizeof(type_t), \ .var_is_bool = __startup_type_is_bool(type_t), \ @@ -798,7 +818,7 @@ __BEGIN_DECLS .dt_base = __startup_TUNABLES_dt_base_ ## var, \ .dt_name = __startup_TUNABLES_dt_name_ ## var, \ .dt_chosen_override = (bool)((flags) & TUNABLE_DT_CHECK_CHOSEN), \ - .boot_arg_name = __startup_TUNABLES_name_ ## var, \ + .boot_arg_name = (flags & TUNABLE_DT_NO_BOOTARG) ? 
NULL : __startup_TUNABLES_name_ ## var, \ .var_addr = (void *)&var, \ .var_len = sizeof(type_t), \ .var_is_bool = __startup_type_is_bool(type_t), \ @@ -888,6 +908,10 @@ extern void kernel_startup_tunable_dt_init(const struct startup_tunable_dt_spec extern void kernel_startup_tunable_dt_source_init(const struct startup_tunable_dt_source_spec *); extern void kernel_bootstrap(void); +#ifdef __BUILDING_XNU_LIB_UNITTEST__ +void kernel_startup_initialize_only(startup_subsystem_id_t sysid); +#endif + /* Initialize machine dependent stuff */ extern void machine_init(void); @@ -931,13 +955,13 @@ kernel_is_macos_or_server(void) static inline bool kernel_is_macos_or_server(void) { - return !!serverperfmode; + return serverperfmode != 0; } #endif /* XNU_TARGET_OS_OSX */ #endif /* MACH_BSD */ -#pragma GCC visibility pop +__exported_pop __END_DECLS diff --git a/osfmk/kern/static_if_common.c b/osfmk/kern/static_if_common.c index 003f676bc..3a30d87f2 100644 --- a/osfmk/kern/static_if_common.c +++ b/osfmk/kern/static_if_common.c @@ -45,6 +45,9 @@ __SECTION_START_SYM(STATIC_IF_SEGMENT, STATIC_IFINIT_SECTION); extern static_if_initializer __static_if_initializer_entries_end[] __SECTION_END_SYM(STATIC_IF_SEGMENT, STATIC_IFINIT_SECTION); +/* libhwtrace knows about this contract */ +SECURITY_READ_ONLY_LATE(static_if_key_t) static_if_modified_keys; +SECURITY_READ_ONLY_LATE(uint32_t) static_if_abi = STATIC_IF_ABI_CURRENT; #endif /* STATIC_IF_TEST */ #pragma mark boot-arg parsing @@ -284,10 +287,20 @@ __static_if_key_delta(static_if_key_t key, int delta) bool was_enabled = (key->sik_enable_count >= 0); + /* + * Remember modified keys. + */ + if (!key->sik_modified) { + key->sik_modified_next = static_if_modified_keys; + static_if_modified_keys = key; + key->sik_modified = true; + } + key->sik_enable_count += delta; + if (was_enabled != (key->sik_enable_count >= 0)) { static_if_entry_t sie = key->sik_entries_head; - bool init_enabled = key->sik_init_value >= 0; + bool init_enabled = key->sik_init_value; while (sie) { ml_static_if_entry_patch(sie, diff --git a/osfmk/kern/sync_sema.c b/osfmk/kern/sync_sema.c index 585f0d00c..0d8300d6a 100644 --- a/osfmk/kern/sync_sema.c +++ b/osfmk/kern/sync_sema.c @@ -50,8 +50,6 @@ #include #include #include -#include -#include #include #include #include @@ -221,7 +219,8 @@ semaphore_free( port = semaphore->port; if (IP_VALID(port)) { assert(!port->ip_srights); - ipc_kobject_dealloc_port(port, 0, IKOT_SEMAPHORE); + ipc_kobject_dealloc_port(port, IPC_KOBJECT_NO_MSCOUNT, + IKOT_SEMAPHORE); } /* @@ -1207,6 +1206,8 @@ convert_port_to_semaphore(ipc_port_t port) if (IP_VALID(port)) { semaphore = ipc_kobject_get_stable(port, IKOT_SEMAPHORE); if (semaphore != SEMAPHORE_NULL) { + zone_id_require(ZONE_ID_SEMAPHORE, + sizeof(struct semaphore), semaphore); semaphore_reference(semaphore); } } @@ -1233,12 +1234,14 @@ convert_semaphore_to_port(semaphore_t semaphore) return IP_NULL; } + zone_id_require(ZONE_ID_SEMAPHORE, sizeof(struct semaphore), semaphore); + /* * make a send right and donate our reference for * semaphore_no_senders if this is the first send right */ if (!ipc_kobject_make_send_lazy_alloc_port(&semaphore->port, - semaphore, IKOT_SEMAPHORE, IPC_KOBJECT_ALLOC_NONE)) { + semaphore, IKOT_SEMAPHORE)) { semaphore_dereference(semaphore); } return semaphore->port; @@ -1270,5 +1273,6 @@ semaphore_no_senders(ipc_port_t port, __unused mach_port_mscount_t mscount) } IPC_KOBJECT_DEFINE(IKOT_SEMAPHORE, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = 
semaphore_no_senders); diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index 39c79dbc6..3add7368a 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -56,8 +56,6 @@ #include #include -#include -#include #include #include #include @@ -76,6 +74,10 @@ #include #include +#if DEVELOPMENT || DEBUG +SCALABLE_COUNTER_DECLARE(mach_eventlink_handoff_success_count); +#endif /* DEVELOPMENT || DEBUG */ + static void thread_depress_abstime(uint64_t interval); static void thread_depress_ms(mach_msg_timeout_t interval); @@ -280,6 +282,10 @@ thread_switch( pulled_thread ? TRUE : FALSE, 0, 0); if (pulled_thread != THREAD_NULL) { +#if DEVELOPMENT || DEBUG + counter_inc_preemption_disabled(&mach_eventlink_handoff_success_count); +#endif /* DEVELOPMENT || DEBUG */ + /* We can't be dropping the last ref here */ thread_deallocate_safe(thread); @@ -370,6 +376,10 @@ thread_handoff_internal(thread_t thread, thread_continue_t continuation, } if (pulled_thread != THREAD_NULL) { +#if DEVELOPMENT || DEBUG + counter_inc_preemption_disabled(&mach_eventlink_handoff_success_count); +#endif /* DEVELOPMENT || DEBUG */ + int result = thread_run(self, continuation, parameter, pulled_thread); splx(s); diff --git a/osfmk/kern/syscall_sw.c b/osfmk/kern/syscall_sw.c index 865241e63..43a1d5225 100644 --- a/osfmk/kern/syscall_sw.c +++ b/osfmk/kern/syscall_sw.c @@ -170,7 +170,7 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 60 */ MACH_TRAP(swtch, 0, 0, NULL), /* 61 */ MACH_TRAP(thread_switch, 3, 3, munge_www), /* 62 */ MACH_TRAP(clock_sleep_trap, 5, 5, munge_wwwww), -/* 63 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 63 */ MACH_TRAP(mach_vm_reclaim_update_kernel_accounting_trap, 2, 2, munge_wl), /* 64 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 65 */ MACH_TRAP(kern_invalid, 0, 0, NULL), /* 66 */ MACH_TRAP(kern_invalid, 0, 0, NULL), @@ -306,7 +306,7 @@ const char * const mach_syscall_name_table[MACH_TRAP_TABLE_COUNT] = { /* 60 */ "swtch", /* 61 */ "thread_switch", /* 62 */ "clock_sleep_trap", -/* 63 */ "kern_invalid", +/* 63 */ "mach_vm_reclaim_update_kernel_accounting_trap", /* traps 64 - 95 reserved (debo) */ /* 64 */ "kern_invalid", /* 65 */ "kern_invalid", diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index a654e6b9c..205ac1358 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c @@ -104,7 +104,7 @@ #include #include #include -#include +#include #include #include @@ -166,7 +166,7 @@ #include /* for coredump */ #include #include -#include /* for address_space_debugged */ +#include /* for is_address_space_debugged */ #include /* @@ -214,8 +214,6 @@ extern size_t proc_struct_size; extern size_t proc_and_task_size; size_t task_struct_size; -extern uint32_t ipc_control_port_options; - extern int large_corpse_count; extern boolean_t proc_send_synchronous_EXC_RESOURCE(void *p); @@ -227,6 +225,7 @@ static void task_suspension_no_senders(ipc_port_t, mach_msg_type_number_t); static inline void task_zone_init(void); static void task_store_owned_vmobject_info(task_t to_task, task_t from_task); +static void task_set_control_port_options(task_t task, task_control_port_options_t opts); #if CONFIG_EXCLAVES static bool task_should_panic_on_exit_due_to_conclave_taint(task_t task); @@ -236,14 +235,19 @@ kern_return_t task_crash_info_conclave_upcall(task_t task, const struct conclave_sharedbuffer_t *shared_buf, uint32_t length); #endif /* CONFIG_EXCLAVES */ -IPC_KOBJECT_DEFINE(IKOT_TASK_NAME); +IPC_KOBJECT_DEFINE(IKOT_TASK_NAME, + .iko_op_movable_send = true); 
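The trap 63 change above touches both mach_trap_table and mach_syscall_name_table because the two arrays are parallel, indexed by the same trap number, so a slot repurposed in one must be updated in the other. The stand-alone sketch below illustrates that parallel-table convention only; the names (trap_entry, describe_trap, the demo handlers) are illustrative and are not the kernel's actual definitions.

#include <stdio.h>

/* Parallel tables indexed by trap number, in the spirit of
 * mach_trap_table and mach_syscall_name_table (simplified sketch). */
struct trap_entry {
    int arg_count;
    int (*handler)(void);   /* NULL stands in for a kern_invalid slot */
};

static int demo_thread_switch(void) { return 0; }
static int demo_reclaim_accounting(void) { return 0; }

static const struct trap_entry trap_table[] = {
    [61] = { 3, demo_thread_switch },
    [63] = { 2, demo_reclaim_accounting },
};

static const char *const trap_names[] = {
    [61] = "thread_switch",
    [63] = "mach_vm_reclaim_update_kernel_accounting_trap",
};

static void
describe_trap(unsigned n)
{
    if (n >= sizeof(trap_table) / sizeof(trap_table[0]) ||
        trap_table[n].handler == NULL) {
        printf("/* %u */ kern_invalid\n", n);
        return;
    }
    printf("/* %u */ %s, %d args\n", n, trap_names[n], trap_table[n].arg_count);
}

int
main(void)
{
    describe_trap(61);
    describe_trap(62);  /* unset slot reads as kern_invalid */
    describe_trap(63);
    return 0;
}

Editing only one of the two tables would leave lookups like describe_trap() reporting a stale name for the trap, which is why the hunk updates both entries together.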
IPC_KOBJECT_DEFINE(IKOT_TASK_CONTROL, - .iko_op_no_senders = task_port_no_senders); + .iko_op_no_senders = task_port_no_senders, + .iko_op_movable_send = true, /* see ipc_should_mark_immovable_send */ + .iko_op_label_free = ipc_kobject_label_free); IPC_KOBJECT_DEFINE(IKOT_TASK_READ, - .iko_op_no_senders = task_port_with_flavor_no_senders); + .iko_op_no_senders = task_port_with_flavor_no_senders, + .iko_op_label_free = ipc_kobject_label_free); IPC_KOBJECT_DEFINE(IKOT_TASK_INSPECT, .iko_op_no_senders = task_port_with_flavor_no_senders); IPC_KOBJECT_DEFINE(IKOT_TASK_RESUME, + .iko_op_movable_send = true, .iko_op_no_senders = task_suspension_no_senders); #if CONFIG_PROC_RESOURCE_LIMITS @@ -251,6 +255,7 @@ static void task_fatal_port_no_senders(ipc_port_t, mach_msg_type_number_t); static mach_port_t task_allocate_fatal_port(void); IPC_KOBJECT_DEFINE(IKOT_TASK_FATAL, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = task_fatal_port_no_senders); @@ -391,6 +396,7 @@ __options_decl(send_exec_resource_options_t, uint8_t, { EXEC_RESOURCE_FATAL = 0x01, EXEC_RESOURCE_DIAGNOSTIC = 0x02, EXEC_RESOURCE_WARNING = 0x04, + EXEC_RESOURCE_CONCLAVE = 0x08 // A side memory limit independent of the main footprint. }); /** @@ -404,6 +410,7 @@ void init_task_ledgers(void); void task_footprint_exceeded(int warning, __unused const void *param0, __unused const void *param1); void task_wakeups_rate_exceeded(int warning, __unused const void *param0, __unused const void *param1); void task_io_rate_exceeded(int warning, const void *param0, __unused const void *param1); +void task_conclave_mem_limit_exceeded(int warning, __unused const void *param0, __unused const void *param1); void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS(void); void __attribute__((noinline)) PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, send_exec_resource_options_t exception_options); void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MUCH_IO(int flavor); @@ -502,12 +509,14 @@ extern int kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize); extern void workq_proc_suspended(struct proc *p); extern void workq_proc_resumed(struct proc *p); extern struct proc *kernproc; +extern void * XNU_PTRAUTH_SIGNED_PTR("initproc") initproc; #if CONFIG_MEMORYSTATUS extern void proc_memstat_skip(struct proc* p, boolean_t set); extern void memorystatus_on_ledger_footprint_exceeded(int warning, bool memlimit_is_active, bool memlimit_is_fatal); extern void memorystatus_log_exception(const int max_footprint_mb, bool memlimit_is_active, bool memlimit_is_fatal); extern void memorystatus_log_diag_threshold_exception(const int diag_threshold_value); +extern void memorystatus_on_conclave_limit_exceeded(const int max_footprint_mb); extern boolean_t memorystatus_allowed_vm_map_fork(task_t task, bool *is_large); extern uint64_t memorystatus_available_memory_internal(struct proc *p); @@ -692,6 +701,12 @@ task_set_platform_binary( } else { task_ro_flags_clear(task, TFRO_PLATFORM); } + assert(task->map); + if (task->map) { + vm_map_lock(task->map); + vm_map_set_platform_binary(task->map, (bool)is_platform); + vm_map_unlock(task->map); + } } #if XNU_TARGET_OS_OSX @@ -712,45 +727,12 @@ task_opted_out_mach_hardening(task_t task) } #endif /* XNU_TARGET_OS_OSX */ -/* - * Use the `task_is_hardened_binary` macro below - * when applying new security policies. 
- * - * Kernel security policies now generally apply to - * "hardened binaries" - which are platform binaries, and - * third party binaries who adopt hardened runtime on ios. - */ boolean_t task_get_platform_binary(task_t task) { return (task_ro_flags_get(task) & TFRO_PLATFORM) != 0; } -static boolean_t -task_get_hardened_runtime(task_t task) -{ - return (task_ro_flags_get(task) & TFRO_HARDENED) != 0; -} - -boolean_t -task_is_hardened_binary(task_t task) -{ - return task_get_platform_binary(task) || - task_get_hardened_runtime(task); -} - -void -task_set_hardened_runtime( - task_t task, - bool is_hardened) -{ - if (is_hardened) { - task_ro_flags_set(task, TFRO_HARDENED); - } else { - task_ro_flags_clear(task, TFRO_HARDENED); - } -} - boolean_t task_is_a_corpse(task_t task) { @@ -770,9 +752,9 @@ task_set_corpse(task_t task) } void -task_set_immovable_pinned(task_t task) +task_copyout_control_port(task_t task) { - ipc_task_set_immovable_pinned(task); + ipc_task_copyout_control_port(task); } /* @@ -930,10 +912,57 @@ task_clear_return_wait(task_t task, uint32_t flags) } } +/* + * Set default behavior for a task's control ports + */ +static void +task_set_ctrl_port_default( + task_t task, + thread_t thread) +{ + ipc_space_policy_t pol = ipc_policy_for_task(task); + bool movable_allowed = mac_task_check_get_movable_control_port() == 0; + bool is_simulated = pol & IPC_SPACE_POLICY_SIMULATED; + bool is_translated = false; + + task_control_port_options_t opts = TASK_CONTROL_PORT_OPTIONS_NONE; + if (movable_allowed || is_simulated || is_translated) { + /* Disable control port hardening for entitled||simulated binaries */ + opts = TASK_CONTROL_PORT_OPTIONS_NONE; + } else if (ipc_should_apply_policy(pol, IPC_POLICY_ENHANCED_V1)) { + /* set control port options for 1p code, inherited from parent task by default */ + if (ipc_control_port_options & ICP_OPTIONS_IMMOVABLE_1P_HARD) { + opts |= TASK_CONTROL_PORT_IMMOVABLE_HARD; + } else if (ipc_control_port_options & ICP_OPTIONS_IMMOVABLE_1P_SOFT) { + opts |= TASK_CONTROL_PORT_IMMOVABLE_SOFT; + } + } else { + /* set control port options for 3p code, inherited from parent task by default */ + if (ipc_control_port_options & ICP_OPTIONS_IMMOVABLE_3P_HARD) { + opts |= TASK_CONTROL_PORT_IMMOVABLE_HARD; + } else if (ipc_control_port_options & ICP_OPTIONS_IMMOVABLE_3P_SOFT) { + opts |= TASK_CONTROL_PORT_IMMOVABLE_SOFT; + } + } + + /* see `copyout_should_mark_immovable_send`, which consumes these flags */ + task_set_control_port_options(task, opts); + + /* + * now that we have marked the task as immovable, copyout the task/thread ports + * again so that they get marked as immovable on copyout + */ + ipc_task_copyout_control_port(task); + /* consumed by ipc_thread_set_immovable_pinned */ + thread_reference(thread); + ipc_thread_set_immovable_pinned(thread); +} + void __attribute__((noreturn)) task_wait_to_return(void) { task_t task = current_task(); + thread_t thread = current_thread(); uint8_t returnwaitflags; is_write_lock(task->itk_space); @@ -987,6 +1016,11 @@ task_wait_to_return(void) } #endif + /* + * Set task/thread control port movability now that we can call AMFI + */ + task_set_ctrl_port_default(task, thread); + thread_bootstrap_return(); } @@ -1002,6 +1036,12 @@ task_post_signature_processing_hook(task_t task) ml_task_post_signature_processing_hook(task); } +bool +task_is_initproc(task_t task) +{ + return get_bsdtask_info(task) == initproc; +} + boolean_t task_is_exec_copy(task_t task) { @@ -1151,7 +1191,9 @@ task_init(void) } + 
vm_map_setup(get_task_map(kernel_task), kernel_task); + ipc_task_enable(kernel_task); #if defined(HAS_APPLE_PAC) @@ -1264,10 +1306,10 @@ init_task_ledgers(void) task_ledgers.purgeable_nonvolatile = ledger_entry_add_with_flags(t, "purgeable_nonvolatile", "physmem", "bytes", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); task_ledgers.purgeable_volatile_compressed = ledger_entry_add_with_flags(t, "purgeable_volatile_compress", "physmem", "bytes", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); task_ledgers.purgeable_nonvolatile_compressed = ledger_entry_add_with_flags(t, "purgeable_nonvolatile_compress", "physmem", "bytes", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); - task_ledgers.pages_grabbed = ledger_entry_add_with_flags(t, "pages_grabbed", "physmem", "count", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); - task_ledgers.pages_grabbed_kern = ledger_entry_add_with_flags(t, "pages_grabbed_kern", "physmem", "count", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); - task_ledgers.pages_grabbed_iopl = ledger_entry_add_with_flags(t, "pages_grabbed_iopl", "physmem", "count", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); - task_ledgers.pages_grabbed_upl = ledger_entry_add_with_flags(t, "pages_grabbed_upl", "physmem", "count", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); + task_ledgers.pages_grabbed = ledger_entry_add_with_flags(t, "pages_grabbed", "physmem", "count", LEDGER_ENTRY_USE_COUNTER); + task_ledgers.pages_grabbed_kern = ledger_entry_add_with_flags(t, "pages_grabbed_kern", "physmem", "count", LEDGER_ENTRY_USE_COUNTER); + task_ledgers.pages_grabbed_iopl = ledger_entry_add_with_flags(t, "pages_grabbed_iopl", "physmem", "count", LEDGER_ENTRY_USE_COUNTER); + task_ledgers.pages_grabbed_upl = ledger_entry_add_with_flags(t, "pages_grabbed_upl", "physmem", "count", LEDGER_ENTRY_USE_COUNTER); task_ledgers.tagged_nofootprint = ledger_entry_add_with_flags(t, "tagged_nofootprint", "physmem", "bytes", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); task_ledgers.tagged_footprint = ledger_entry_add_with_flags(t, "tagged_footprint", "physmem", "bytes", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); task_ledgers.tagged_nofootprint_compressed = ledger_entry_add_with_flags(t, "tagged_nofootprint_compressed", "physmem", "bytes", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); @@ -1290,6 +1332,10 @@ init_task_ledgers(void) task_ledgers.neural_footprint_compressed = ledger_entry_add_with_flags(t, "neural_footprint_compressed", "physmem", "bytes", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); task_ledgers.neural_nofootprint_total = ledger_entry_add(t, "neural_nofootprint_total", "physmem", "bytes"); +#if CONFIG_DEFERRED_RECLAIM + task_ledgers.est_reclaimable = ledger_entry_add_with_flags(t, "est_reclaimable", "virtmem", "bytes", LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE); +#endif /* CONFIG_DEFERRED_RECLAIM */ + #if CONFIG_FREEZE task_ledgers.frozen_to_swap = ledger_entry_add(t, "frozen_to_swap", "physmem", "bytes"); #endif /* CONFIG_FREEZE */ @@ -1462,6 +1508,7 @@ init_task_ledgers(void) #if CONFIG_MEMORYSTATUS ledger_set_callback(t, task_ledgers.phys_footprint, task_footprint_exceeded, NULL, NULL); + ledger_set_callback(t, task_ledgers.conclave_mem, task_conclave_mem_limit_exceeded, NULL, NULL); #endif /* CONFIG_MEMORYSTATUS */ ledger_set_callback(t, task_ledgers.interrupt_wakeups, @@ -1609,11 +1656,14 @@ task_create_internal( task_ro_data.t_flags_ro |= (parent_t_flags_ro & (TFRO_PAC_ENFORCE_USER_STATE | TFRO_PAC_EXC_FATAL)); #endif /* __has_feature(ptrauth_calls) */ - /* Inherit the hardened binary flags from parent if in fork */ - task_ro_data.t_flags_ro |= parent_t_flags_ro & (TFRO_HARDENED | 
TFRO_PLATFORM | TFRO_JIT_EXC_FATAL); + /* Inherit the platform restrictions flags from parent if in fork */ + task_ro_data.t_flags_ro |= parent_t_flags_ro & (TFRO_PLATFORM | TFRO_JIT_EXC_FATAL); #if XNU_TARGET_OS_OSX task_ro_data.t_flags_ro |= parent_t_flags_ro & TFRO_MACH_HARDENING_OPT_OUT; #endif /* XNU_TARGET_OS_OSX */ + + /* task_security_config options are always inherited on fork */ + new_task->security_config = parent_task->security_config; } #ifdef MACH_BSD @@ -1692,7 +1742,6 @@ task_create_internal( #endif /* CONFIG_TASKWATCH */ new_task->mem_notify_reserved = 0; - new_task->memlimit_attrs_reserved = 0; new_task->requested_policy = default_task_requested_policy; new_task->effective_policy = default_task_effective_policy; @@ -1703,9 +1752,6 @@ task_create_internal( task_ro_data.task_tokens.sec_token = *task_get_sec_token(parent_task); task_ro_data.task_tokens.audit_token = *task_get_audit_token(parent_task); - /* only inherit the option bits, no effect until task_set_immovable_pinned() */ - task_ro_data.task_control_port_options = task_get_control_port_options(parent_task); - task_ro_data.t_flags_ro |= parent_t_flags_ro & TFRO_FILTER_MSG; #if CONFIG_MACF if (!(t_flags & TF_CORPSE_FORK)) { @@ -1716,9 +1762,12 @@ task_create_internal( } else { task_ro_data.task_tokens.sec_token = KERNEL_SECURITY_TOKEN; task_ro_data.task_tokens.audit_token = KERNEL_AUDIT_TOKEN; - - task_ro_data.task_control_port_options = TASK_CONTROL_PORT_OPTIONS_NONE; } + /* + * intentionally initialized to zero, it will be set before returning + * to userspace in task_set_ctrl_port_default + */ + task_ro_data.task_control_port_options = TASK_CONTROL_PORT_OPTIONS_NONE; /* must set before task_importance_init_from_parent: */ if (proc_ro != NULL) { @@ -1839,10 +1888,6 @@ task_create_internal( counter_alloc(&(new_task->cow_faults)); counter_alloc(&(new_task->messages_sent)); counter_alloc(&(new_task->messages_received)); - counter_alloc(&(new_task->pages_grabbed)); - counter_alloc(&(new_task->pages_grabbed_kern)); - counter_alloc(&(new_task->pages_grabbed_iopl)); - counter_alloc(&(new_task->pages_grabbed_upl)); /* Copy resource acc. info from Parent for Corpe Forked task. */ if (parent_task != NULL && (t_flags & TF_CORPSE_FORK)) { @@ -1862,10 +1907,7 @@ task_create_internal( new_task->purged_memory_warn = 0; new_task->purged_memory_critical = 0; new_task->low_mem_privileged_listener = 0; - new_task->memlimit_is_active = 0; - new_task->memlimit_is_fatal = 0; - new_task->memlimit_active_exc_resource = 0; - new_task->memlimit_inactive_exc_resource = 0; + os_atomic_store(&new_task->memlimit_flags, 0, relaxed); new_task->task_timer_wakeups_bin_1 = 0; new_task->task_timer_wakeups_bin_2 = 0; new_task->task_gpu_ns = 0; @@ -2230,10 +2272,6 @@ task_deallocate_internal( counter_free(&task->cow_faults); counter_free(&task->messages_sent); counter_free(&task->messages_received); - counter_free(&task->pages_grabbed); - counter_free(&task->pages_grabbed_kern); - counter_free(&task->pages_grabbed_iopl); - counter_free(&task->pages_grabbed_upl); #if CONFIG_COALITIONS task_release_coalitions(task); @@ -2604,6 +2642,19 @@ task_mark_corpse(task_t task) return KERN_TERMINATED; } + /* + * ipc_task_reset() moved to last thread_terminate_self(): rdar://75737960. + * disable old ports here instead. + * + * The vm_map and ipc_space must exist until this function returns, + * convert_port_to_{map,space}_with_flavor relies on this behavior. 
+ * + * Note this must be done before we mark the port as a corpse, + * so that task_port_no_senders() can determine if the no-senders + * is for a real corpse or not. + */ + ipc_task_disable(task); + task_set_corpse_pending_report(task); task_set_corpse(task); task->crashed_thread_id = thread_tid(self_thread); @@ -2615,15 +2666,6 @@ task_mark_corpse(task_t task) task_unlock(task); - /* - * ipc_task_reset() moved to last thread_terminate_self(): rdar://75737960. - * disable old ports here instead. - * - * The vm_map and ipc_space must exist until this function returns, - * convert_port_to_{map,space}_with_flavor relies on this behavior. - */ - ipc_task_disable(task); - /* let iokit know 1 */ iokit_task_terminate(task, 1); @@ -2687,23 +2729,57 @@ task_clear_corpse(task_t task) * task_port_no_senders * * Called whenever the Mach port system detects no-senders on - * the task port of a corpse. - * Each notification that comes in should terminate the task (corpse). + * a control task port. + * + * Only task ports for corpses need to take action on it, + * and each notification that comes in should terminate + * the task (corpse). */ static void task_port_no_senders(ipc_port_t port, __unused mach_port_mscount_t mscount) { - task_t task = ipc_kobject_get_locked(port, IKOT_TASK_CONTROL); + bool is_corpse = false; + task_t task; - assert(task != TASK_NULL); - assert(task_is_a_corpse(task)); + ip_mq_lock(port); + task = ipc_kobject_get_locked(port, IKOT_TASK_CONTROL); + if (task == TASK_NULL || !task_is_a_corpse(task)) { + task = TASK_NULL; + } else { + task_reference_mig(task); + } + ip_mq_unlock(port); - /* Remove the task from global corpse task list */ - task_remove_from_corpse_task_list(task); + /* + * Task might be a corpse, we must inspect this under + * the itk_lock to resolve the race with task_mark_corpse(): + * + * If the task associated with the port is NULL under the itk_lock(), + * then the port was a former IKOT_TASK_CONTROL port and we should + * leave it alone. + * + * TODO: we should really make corpses use their own IKOT_TASK_CORPSE + * port type instead of these hacks. + */ + if (task) { + itk_lock(task); + ip_mq_lock(port); + assert(task_is_a_corpse(task)); + is_corpse = (ipc_kobject_get_locked(port, IKOT_TASK_CONTROL) != + TASK_NULL); + ip_mq_unlock(port); + itk_unlock(task); + task_deallocate_mig(task); + } - task_clear_corpse(task); - vm_map_unset_corpse_source(task->map); - task_terminate_internal(task); + if (is_corpse) { + /* Remove the task from global corpse task list */ + task_remove_from_corpse_task_list(task); + + task_clear_corpse(task); + vm_map_unset_corpse_source(task->map); + task_terminate_internal(task); + } } /* @@ -2714,20 +2790,19 @@ task_port_no_senders(ipc_port_t port, __unused mach_port_mscount_t mscount) * should be deallocated here when there are no senders remaining. 
*/ static void -task_port_with_flavor_no_senders( - ipc_port_t port, - mach_port_mscount_t mscount __unused) +task_port_with_flavor_no_senders(ipc_port_t port, mach_port_mscount_t mscount) { task_t task; mach_task_flavor_t flavor; ipc_kobject_type_t kotype; ip_mq_lock(port); - if (port->ip_srights > 0) { + if (!ipc_kobject_is_mscount_current_locked(port, mscount)) { ip_mq_unlock(port); return; } - kotype = ip_kotype(port); + + kotype = ip_type(port); assert((IKOT_TASK_READ == kotype) || (IKOT_TASK_INSPECT == kotype)); task = ipc_kobject_get_locked(port, kotype); if (task != TASK_NULL) { @@ -2762,20 +2837,18 @@ task_port_with_flavor_no_senders( * that vends out send rights to this port could resurrect it between * this notification being generated and actually being handled here. */ - if (!ip_active(port) || - task->itk_task_ports[flavor] != port || - port->ip_srights > 0) { + if (task->itk_task_ports[flavor] != port || + !ipc_kobject_is_mscount_current_locked(port, mscount)) { ip_mq_unlock(port); itk_unlock(task); task_deallocate(task); return; } - assert(task->itk_task_ports[flavor] == port); task->itk_task_ports[flavor] = IP_NULL; itk_unlock(task); - ipc_kobject_dealloc_port_and_unlock(port, 0, kotype); + ipc_kobject_dealloc_port_and_unlock(port, mscount, kotype); task_deallocate(task); } @@ -3100,6 +3173,9 @@ task_terminate_internal( ipc_task_disable(task); #if CONFIG_EXCLAVES + /* before conclave can be suspended */ + exclaves_conclave_prepare_teardown(task); + //rdar://139307390, first suspension might not have done conclave suspend. first_suspension = true; if (first_suspension) { @@ -3283,6 +3359,9 @@ tasks_system_suspend(boolean_t suspend) if (task == kernel_task) { continue; } + if (task_is_driver(task)) { + continue; + } suspend ? task_suspend_internal(task) : task_resume_internal(task); } lck_mtx_unlock(&tasks_threads_lock); @@ -3369,6 +3448,10 @@ task_start_halt_locked(task_t task, boolean_t should_mark_corpse) first_suspension = true; if (first_suspension || should_mark_corpse) { task_unlock(task); + + /* before we can teardown the conclave */ + exclaves_conclave_prepare_teardown(task); + if (first_suspension) { task_suspend_conclave(task); } @@ -3979,7 +4062,7 @@ task_threads_from_user( return KERN_INVALID_ARGUMENT; } - kotype = ip_kotype(port); + kotype = ip_type(port); switch (kotype) { case IKOT_TASK_CONTROL: @@ -4160,9 +4243,9 @@ kern_return_t task_suspend( task_t task) { - kern_return_t kr; - mach_port_t port; - mach_port_name_t name; + kern_return_t kr; + mach_port_t port; + mach_port_name_t name; if (task == TASK_NULL || task == kernel_task) { return KERN_INVALID_ARGUMENT; @@ -4187,10 +4270,10 @@ task_suspend( port = task->itk_resume; if (port == IP_NULL) { port = ipc_kobject_alloc_port(task, IKOT_TASK_RESUME, - IPC_KOBJECT_ALLOC_NSREQUEST | IPC_KOBJECT_ALLOC_MAKE_SEND); + IPC_KOBJECT_ALLOC_MAKE_SEND); task->itk_resume = port; } else { - (void)ipc_kobject_make_send_nsrequest(port, task, IKOT_TASK_RESUME); + (void)ipc_kobject_make_send(port, task, IKOT_TASK_RESUME); } itk_unlock(task); @@ -4256,7 +4339,7 @@ task_resume( if (kr == KERN_SUCCESS) { ipc_right_dealloc(space, resume_port_name, resume_port_entry); } else { - ipc_right_destroy(space, resume_port_name, resume_port_entry, FALSE, 0); + ipc_right_destroy(space, resume_port_name, resume_port_entry); } /* space unlocked */ } else { @@ -4390,7 +4473,6 @@ static void task_suspension_no_senders(ipc_port_t port, mach_port_mscount_t mscount) { task_t task = convert_port_to_task_suspension_token(port); - kern_return_t kr; 
if (task == TASK_NULL) { return; @@ -4403,15 +4485,14 @@ task_suspension_no_senders(ipc_port_t port, mach_port_mscount_t mscount) task_lock(task); - kr = ipc_kobject_nsrequest(port, mscount, NULL); - if (kr == KERN_FAILURE) { + if (ipc_kobject_is_mscount_current(port, mscount)) { /* release all the [remaining] outstanding legacy holds */ release_task_hold(task, TASK_HOLD_LEGACY_ALL); } task_unlock(task); - task_suspension_token_deallocate(task); /* drop token reference */ + task_suspension_token_deallocate(task); /* drop token reference */ } /* @@ -5803,6 +5884,9 @@ task_info( struct proc *p; uint32_t platform, sdk; + + vmlp_api_start(TASK_INFO); /* this is the only case that is relevant to the lock */ + p = current_proc(); platform = proc_platform(p); sdk = proc_sdk(p); @@ -5924,6 +6008,7 @@ task_info( if (*task_info_count < TASK_VM_INFO_REV0_COUNT) { error = KERN_INVALID_ARGUMENT; + vmlp_api_end(TASK_INFO, error); break; } @@ -5945,6 +6030,8 @@ task_info( vm_map_lock_read(map); } + vmlp_range_event_all(map); + vm_info->virtual_size = (typeof(vm_info->virtual_size))vm_map_adjusted_size(map); vm_info->region_count = map->hdr.nentries; vm_info->page_size = vm_map_page_size(map); @@ -6041,6 +6128,7 @@ task_info( if ((task != current_task()) && (!task->active)) { error = KERN_INVALID_ARGUMENT; + vmlp_api_end(TASK_INFO, error); break; } @@ -6153,6 +6241,7 @@ task_info( *task_info_count = TASK_VM_INFO_REV7_COUNT; } + vmlp_api_end(TASK_INFO, error); break; } @@ -6312,6 +6401,38 @@ task_info( break; #endif /* CONFIG_TASK_SUSPEND_STATS && (DEVELOPMENT || DEBUG) */ } + case TASK_SECURITY_CONFIG_INFO: + { + task_security_config_info_t security_config; + + if (*task_info_count < TASK_SECURITY_CONFIG_INFO_COUNT) { + error = KERN_INVALID_ARGUMENT; + break; + } + + security_config = (task_security_config_info_t)task_info_out; + security_config->config = (uint32_t)task->security_config.value; + + *task_info_count = TASK_SECURITY_CONFIG_INFO_COUNT; + break; + } + case TASK_IPC_SPACE_POLICY_INFO: + { + task_ipc_space_policy_info_t ipc_space_config; + + if (*task_info_count < TASK_IPC_SPACE_POLICY_INFO_COUNT) { + error = KERN_INVALID_ARGUMENT; + break; + } + + ipc_space_config = (task_ipc_space_policy_info_t)task_info_out; + struct ipc_space *space = task->itk_space; + if (space) { + ipc_space_config->space_policy = (uint32_t)space->is_policy; + *task_info_count = TASK_SECURITY_CONFIG_INFO_COUNT; + } + break; + } default: error = KERN_INVALID_ARGUMENT; } @@ -6447,6 +6568,10 @@ task_dyld_process_info_notify_register( return KERN_INVALID_TASK; } + if (!ipc_can_stash_naked_send(sright)) { + return KERN_INVALID_RIGHT; + } + if (!IP_VALID(sright)) { return KERN_INVALID_RIGHT; } @@ -7079,51 +7204,43 @@ task_violated_guard( #if CONFIG_MEMORYSTATUS -boolean_t +bool task_get_memlimit_is_active(task_t task) { assert(task != NULL); - if (task->memlimit_is_active == 1) { - return TRUE; - } else { - return FALSE; - } + return os_atomic_load(&task->memlimit_flags, relaxed) & TASK_MEMLIMIT_IS_ACTIVE; } void -task_set_memlimit_is_active(task_t task, boolean_t memlimit_is_active) +task_set_memlimit_is_active(task_t task, bool memlimit_is_active) { assert(task != NULL); if (memlimit_is_active) { - task->memlimit_is_active = 1; + os_atomic_or(&task->memlimit_flags, TASK_MEMLIMIT_IS_ACTIVE, relaxed); } else { - task->memlimit_is_active = 0; + os_atomic_andnot(&task->memlimit_flags, TASK_MEMLIMIT_IS_ACTIVE, relaxed); } } -boolean_t +bool task_get_memlimit_is_fatal(task_t task) { assert(task != NULL); - if 
(task->memlimit_is_fatal == 1) { - return TRUE; - } else { - return FALSE; - } + return os_atomic_load(&task->memlimit_flags, relaxed) & TASK_MEMLIMIT_IS_FATAL; } void -task_set_memlimit_is_fatal(task_t task, boolean_t memlimit_is_fatal) +task_set_memlimit_is_fatal(task_t task, bool memlimit_is_fatal) { assert(task != NULL); if (memlimit_is_fatal) { - task->memlimit_is_fatal = 1; + os_atomic_or(&task->memlimit_flags, TASK_MEMLIMIT_IS_FATAL, relaxed); } else { - task->memlimit_is_fatal = 0; + os_atomic_andnot(&task->memlimit_flags, TASK_MEMLIMIT_IS_FATAL, relaxed); } } @@ -7141,41 +7258,48 @@ task_set_dirty_start(task_t task, uint64_t start) task_unlock(task); } -boolean_t -task_has_triggered_exc_resource(task_t task, boolean_t memlimit_is_active) +bool +task_set_exc_resource_bit(task_t task, bool memlimit_is_active) { - boolean_t triggered = FALSE; - - assert(task == current_task()); - /* - * Returns true, if task has already triggered an exc_resource exception. + * Sets the specified EXC_RESOURCE bit if not set already, and returns + * true if the bit was changed (i.e. it was 0 before). */ - if (memlimit_is_active) { - triggered = (task->memlimit_active_exc_resource ? TRUE : FALSE); - } else { - triggered = (task->memlimit_inactive_exc_resource ? TRUE : FALSE); - } + task_memlimit_flags_t memlimit_orig; + task_memlimit_flags_t bit = + memlimit_is_active ? + TASK_MEMLIMIT_ACTIVE_EXC_RESOURCE : + TASK_MEMLIMIT_INACTIVE_EXC_RESOURCE; - return triggered; + memlimit_orig = os_atomic_or_orig(&task->memlimit_flags, bit, acquire); + + return !(memlimit_orig & bit); } void -task_mark_has_triggered_exc_resource(task_t task, boolean_t memlimit_is_active) +task_reset_triggered_exc_resource(task_t task, bool memlimit_is_active) { - assert(task == current_task()); + task_memlimit_flags_t bit = + memlimit_is_active ? + TASK_MEMLIMIT_ACTIVE_EXC_RESOURCE : + TASK_MEMLIMIT_INACTIVE_EXC_RESOURCE; - /* - * We allow one exc_resource per process per active/inactive limit. - * The limit's fatal attribute does not come into play. - */ + os_atomic_andnot(&task->memlimit_flags, bit, relaxed); +} - if (memlimit_is_active) { - task->memlimit_active_exc_resource = 1; - } else { - task->memlimit_inactive_exc_resource = 1; - } +bool +task_get_jetsam_realtime_audio(task_t task) +{ + return task->task_jetsam_realtime_audio; +} + +void +task_set_jetsam_realtime_audio(task_t task, bool realtime_audio) +{ + task_lock(task); + task->task_jetsam_realtime_audio = realtime_audio; + task_unlock(task); } #define HWM_USERCORE_MINSPACE 250 // free space (in MB) required *after* core file creation @@ -7186,9 +7310,11 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, task_t task = current_task(); int pid = 0; const char *procname = "unknown"; + const char *reason = "high watermark"; mach_exception_data_type_t code[EXCEPTION_CODE_MAX]; boolean_t send_sync_exc_resource = FALSE; void *cur_bsd_info = get_bsdtask_info(current_task()); + int flavor = FLAVOR_HIGH_WATERMARK; #ifdef MACH_BSD pid = proc_selfpid(); @@ -7239,8 +7365,23 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, "suppressed by a boot-arg.\n", procname, pid, max_footprint_mb); return; } + + /* + * For the reason string, diagnostic limit is prioritized over fatal limit, + * but for the EXC_RESOURCE flavor it's the other way round. 
+ */ + if (exception_options & EXEC_RESOURCE_DIAGNOSTIC) { + reason = "diagnostics limit"; + if (!(exception_options & EXEC_RESOURCE_FATAL)) { + flavor = FLAVOR_DIAG_MEMLIMIT; + } + } else if (exception_options & EXEC_RESOURCE_CONCLAVE) { + reason = "conclave limit"; + flavor = FLAVOR_CONCLAVE_LIMIT; + } + printf("process %s [%d] crossed memory %s (%d MB); EXC_RESOURCE " - "\n", procname, pid, (!(exception_options & EXEC_RESOURCE_DIAGNOSTIC) ? "high watermark" : "diagnostics limit"), max_footprint_mb); + "\n", procname, pid, reason, max_footprint_mb); /* * A task that has triggered an EXC_RESOURCE, should not be @@ -7256,16 +7397,7 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, code[0] = code[1] = 0; EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_MEMORY); - /* - * Regardless if there was a diag memlimit violation, fatal exceptions shall be notified always - * as high level watermaks. In another words, if there was a diag limit and a watermark, and the - * violation if for limit watermark, a watermark shall be reported. - */ - if (!(exception_options & EXEC_RESOURCE_FATAL)) { - EXC_RESOURCE_ENCODE_FLAVOR(code[0], !(exception_options & EXEC_RESOURCE_DIAGNOSTIC) ? FLAVOR_HIGH_WATERMARK : FLAVOR_DIAG_MEMLIMIT); - } else { - EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_HIGH_WATERMARK ); - } + EXC_RESOURCE_ENCODE_FLAVOR(code[0], flavor); EXC_RESOURCE_HWM_ENCODE_LIMIT(code[0], max_footprint_mb); /* * Do not generate a corpse fork if the violation is a fatal one @@ -7287,7 +7419,7 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, task_resume_internal(task); } } else { - if (disable_exc_resource_during_audio && audio_active) { + if (disable_exc_resource_during_audio && audio_active && task->task_jetsam_realtime_audio) { printf("process %s[%d] crossed memory high watermark (%d MB); EXC_RESOURCE " "suppressed due to audio playback.\n", procname, pid, max_footprint_mb); } else { @@ -7309,12 +7441,8 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, void task_footprint_exceeded(int warning, __unused const void *param0, __unused const void *param1) { - ledger_amount_t max_footprint = 0; - ledger_amount_t max_footprint_mb = 0; -#if DEBUG || DEVELOPMENT - ledger_amount_t diag_threshold_limit_mb = 0; - ledger_amount_t diag_threshold_limit = 0; -#endif + ledger_amount_t enforced_limit_mb = 0; + ledger_amount_t enforced_limit = 0; #if CONFIG_DEFERRED_RECLAIM ledger_amount_t current_footprint; #endif /* CONFIG_DEFERRED_RECLAIM */ @@ -7347,38 +7475,40 @@ task_footprint_exceeded(int warning, __unused const void *param0, __unused const task = current_task(); - ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &max_footprint); #if DEBUG || DEVELOPMENT - ledger_get_diag_mem_threshold(task->ledger, task_ledgers.phys_footprint, &diag_threshold_limit); -#endif + if (is_diag_mem_threshold == IS_DIAGNOSTICS) { + ledger_get_diag_mem_threshold(task->ledger, task_ledgers.phys_footprint, &enforced_limit); + } else { + ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &enforced_limit); + } +#else /* DEBUG || DEVELOPMENT */ + ledger_get_limit(task->ledger, task_ledgers.phys_footprint, &enforced_limit); +#endif /* !(DEBUG || DEVELOPMENT) */ #if CONFIG_DEFERRED_RECLAIM - if (vm_deferred_reclamation_task_has_ring(task)) { + if (!is_warning && vm_deferred_reclamation_task_has_ring(task)) { /* * Task is enrolled in deferred reclamation. * Do a reclaim to ensure it's really over its limit. 
*/ vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE); ledger_get_balance(task->ledger, task_ledgers.phys_footprint, ¤t_footprint); - if (current_footprint < max_footprint) { + if (current_footprint < enforced_limit) { return; } } #endif /* CONFIG_DEFERRED_RECLAIM */ - max_footprint_mb = max_footprint >> 20; -#if DEBUG || DEVELOPMENT - diag_threshold_limit_mb = diag_threshold_limit >> 20; -#endif + enforced_limit_mb = enforced_limit >> 20; memlimit_is_active = task_get_memlimit_is_active(task); memlimit_is_fatal = task_get_memlimit_is_fatal(task) == FALSE ? IS_NOT_FATAL : IS_FATAL; #if DEBUG || DEVELOPMENT if (is_diag_mem_threshold == IS_NOT_DIAGNOSTICS) { - task_process_crossed_limit_no_diag(task, max_footprint_mb, memlimit_is_fatal, memlimit_is_active, is_warning); + task_process_crossed_limit_no_diag(task, enforced_limit_mb, memlimit_is_fatal, memlimit_is_active, is_warning); } else { - task_process_crossed_limit_diag(diag_threshold_limit_mb); + task_process_crossed_limit_diag(enforced_limit_mb); } -#else - task_process_crossed_limit_no_diag(task, max_footprint_mb, memlimit_is_fatal, memlimit_is_active, is_warning); -#endif +#else /* DEBUG || DEVELOPMENT */ + task_process_crossed_limit_no_diag(task, enforced_limit_mb, memlimit_is_fatal, memlimit_is_active, is_warning); +#endif /* !(DEBUG || DEVELOPMENT) */ } /* @@ -7396,16 +7526,39 @@ task_process_crossed_limit_no_diag(task_t task, ledger_amount_t ledger_limit_siz * To enforce this, we monitor state based on the memlimit's active/inactive attribute * and we disable it by marking that memlimit as exception triggered. */ - if (is_warning == IS_NOT_WARNING && !task_has_triggered_exc_resource(task, memlimit_is_active)) { + if (is_warning == IS_NOT_WARNING && task_set_exc_resource_bit(task, memlimit_is_active)) { PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND((int)ledger_limit_size, exception_options); // If it was not a diag threshold (if was a memory limit), then we do not want more signalling, // however, if was a diag limit, the user may reload a different limit and signal again the violation memorystatus_log_exception((int)ledger_limit_size, memlimit_is_active, memlimit_is_fatal); - task_mark_has_triggered_exc_resource(task, memlimit_is_active); } memorystatus_on_ledger_footprint_exceeded(is_warning == IS_NOT_WARNING ? FALSE : TRUE, memlimit_is_active, memlimit_is_fatal); } +/* + * Callback invoked when a task exceeds its conclave memory limit. + */ +void +task_conclave_mem_limit_exceeded(__unused int warning, __unused const void *param0, __unused const void *param1) +{ + ledger_amount_t max_footprint = 0; + ledger_amount_t max_footprint_mb = 0; + + task_t task = current_task(); + + ledger_get_limit(task->ledger, task_ledgers.conclave_mem, &max_footprint); + max_footprint_mb = max_footprint >> 20; + + /* + * The conclave memory limit is always fatal. + * For the moment, we assume conclave memory isn't tied to process memory + * and so this doesn't participate in the once-per-process rule above. 
+ */ + PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND((int)max_footprint_mb, EXEC_RESOURCE_FATAL | EXEC_RESOURCE_CONCLAVE); + + memorystatus_on_conclave_limit_exceeded((int)max_footprint_mb); +} + #if DEBUG || DEVELOPMENT /** * Actions to take when a process has crossed the diagnostics limit @@ -7831,6 +7984,48 @@ task_set_thread_limit(task_t task, uint16_t thread_limit) } } +kern_return_t +task_get_conclave_mem_limit(task_t task, uint64_t *conclave_limit) +{ + kern_return_t ret; + ledger_amount_t max; + + ret = ledger_get_limit(task->ledger, task_ledgers.conclave_mem, &max); + if (ret != KERN_SUCCESS) { + return ret; + } + + *conclave_limit = max; + + return KERN_SUCCESS; +} + +kern_return_t +task_set_conclave_mem_limit(task_t task, uint64_t conclave_limit) +{ + kern_return_t error; + + if ((error = proc_check_footprint_priv())) { + (void) error; + /* Following task_set_phys_footprint_limit, always returns KERN_NO_ACCESS. */ + return KERN_NO_ACCESS; + } + + task_lock(task); + + ledger_set_limit(task->ledger, task_ledgers.conclave_mem, + (ledger_amount_t)conclave_limit << 20, 0); + + if (task == current_task()) { + ledger_check_new_balance(current_thread(), task->ledger, + task_ledgers.conclave_mem); + } + + task_unlock(task); + + return KERN_SUCCESS; +} + #if CONFIG_PROC_RESOURCE_LIMITS kern_return_t task_set_port_space_limits(task_t task, uint32_t soft_limit, uint32_t hard_limit) @@ -8193,7 +8388,7 @@ SENDING_NOTIFICATION__THIS_PROCESS_IS_CAUSING_TOO_MANY_WAKEUPS(void) "suppressed by a boot-arg\n", procname, pid); return; } - if (disable_exc_resource_during_audio && audio_active) { + if (disable_exc_resource_during_audio && audio_active && task->task_jetsam_realtime_audio) { os_log(OS_LOG_DEFAULT, "process %s[%d] caught causing excessive wakeups. EXC_RESOURCE " "suppressed due to audio playback\n", procname, pid); return; @@ -8455,8 +8650,8 @@ task_allocate_fatal_port(void) if (kr) { return MACH_PORT_NULL; } - task_fatal_port = ipc_kobject_alloc_port((ipc_kobject_t)token, IKOT_TASK_FATAL, - IPC_KOBJECT_ALLOC_NSREQUEST | IPC_KOBJECT_ALLOC_MAKE_SEND); + task_fatal_port = ipc_kobject_alloc_port(token, IKOT_TASK_FATAL, + IPC_KOBJECT_ALLOC_MAKE_SEND); task_id_token_set_port(token, task_fatal_port); @@ -8707,24 +8902,27 @@ task_swap_mach_voucher( } void -task_set_gpu_denied(task_t task, boolean_t denied) +task_set_gpu_role(task_t task, darwin_gpu_role_t gpu_role) { task_lock(task); - if (denied) { - task->t_flags |= TF_GPU_DENIED; - } else { - task->t_flags &= ~TF_GPU_DENIED; - } + os_atomic_store(&task->t_gpu_role, gpu_role, relaxed); + + KDBG(IMPORTANCE_CODE(IMP_SET_GPU_ROLE, 0), gpu_role); task_unlock(task); } +darwin_gpu_role_t +task_get_gpu_role(task_t task) +{ + return os_atomic_load(&task->t_gpu_role, relaxed); +} + boolean_t task_is_gpu_denied(task_t task) { - /* We don't need the lock to read this flag */ - return (task->t_flags & TF_GPU_DENIED) ? TRUE : FALSE; + return (os_atomic_load(&task->t_gpu_role, relaxed) == PRIO_DARWIN_GPU_DENY) ? TRUE : FALSE; } /* @@ -9237,26 +9435,22 @@ task_get_darkwake_mode(task_t task) } /* - * Set default behavior for task's control port and EXC_GUARD variants that have - * settable behavior. + * Set task default behavior for EXC_GUARD variants that have settable behavior. * * Platform binaries typically have one behavior, third parties another - * but there are special exception we may need to account for. 
*/ void -task_set_exc_guard_ctrl_port_default( +task_set_exc_guard_default( task_t task, - thread_t main_thread, const char *name, - unsigned int namelen, + unsigned long namelen, boolean_t is_simulated, uint32_t platform, uint32_t sdk) { - task_control_port_options_t opts = TASK_CONTROL_PORT_OPTIONS_NONE; - - if (task_is_hardened_binary(task)) { - /* set exc guard default behavior for hardened binaries */ + if (task_get_platform_restrictions_version(task) >= 1) { + /* set exc guard default behavior for platform restrictions binaries */ task->task_exc_guard = (task_exc_guard_default & TASK_EXC_GUARD_ALL); if (1 == task_pid(task)) { @@ -9277,14 +9471,9 @@ task_set_exc_guard_ctrl_port_default( } } } - - /* set control port options for 1p code, inherited from parent task by default */ - opts = ipc_control_port_options & ICP_OPTIONS_1P_MASK; } else { /* set exc guard default behavior for third-party code */ task->task_exc_guard = ((task_exc_guard_default >> TASK_EXC_GUARD_THIRD_PARTY_DEFAULT_SHIFT) & TASK_EXC_GUARD_ALL); - /* set control port options for 3p code, inherited from parent task by default */ - opts = (ipc_control_port_options & ICP_OPTIONS_3P_MASK) >> ICP_OPTIONS_3P_SHIFT; } if (is_simulated) { @@ -9294,15 +9483,7 @@ task_set_exc_guard_ctrl_port_default( (platform == PLATFORM_WATCHOSSIMULATOR && sdk < 0x80000)) { task->task_exc_guard = TASK_EXC_GUARD_NONE; } - /* Disable protection for control ports for simulated binaries */ - opts = TASK_CONTROL_PORT_OPTIONS_NONE; } - - - task_set_control_port_options(task, opts); - - task_set_immovable_pinned(task); - main_thread_set_immovable_pinned(main_thread); } kern_return_t @@ -9521,46 +9702,19 @@ task_ledger_settle_dirty_time_locked(task_t t) } #endif /* CONFIG_MEMORYSTATUS */ -static void -task_ledger_settle_counter(ledger_t ledger, int entry, counter_t *counter) -{ - ledger_amount_t ledger_val; - kern_return_t kr; - uint64_t counter_val; - - kr = ledger_get_balance(ledger, entry, &ledger_val); - if (kr != KERN_SUCCESS) { - return; - } - - counter_val = counter_load(counter); - if (counter_val <= ledger_val) { - return; /* These counters should only move forward, but just in case. */ - } - - ledger_credit(ledger, entry, counter_val - ledger_val); -} - void task_ledger_settle(task_t t) { - ledger_t ledger; - - task_lock(t); - - /* Settle pages grabbed */ - ledger = get_task_ledger(t); - task_ledger_settle_counter(ledger, task_ledgers.pages_grabbed, &t->pages_grabbed); - task_ledger_settle_counter(ledger, task_ledgers.pages_grabbed_kern, &t->pages_grabbed_kern); - task_ledger_settle_counter(ledger, task_ledgers.pages_grabbed_iopl, &t->pages_grabbed_iopl); - task_ledger_settle_counter(ledger, task_ledgers.pages_grabbed_upl, &t->pages_grabbed_upl); - #if CONFIG_MEMORYSTATUS + task_lock(t); /* Settle memorystatus dirty time */ task_ledger_settle_dirty_time_locked(t); -#endif - task_unlock(t); +#endif /* CONFIG_MEMORYSTATUS */ + +#if CONFIG_DEFERRED_RECLAIM + vm_deferred_reclamation_settle_ledger(t); +#endif /* CONFIG_DEFERRED_RECLAIM */ } void @@ -9814,6 +9968,66 @@ task_is_translated(task_t task) } #endif +/* Task runtime security mitigations configuration. */ +#define TASK_SECURITY_CONFIG_HELPER_DEFINE(suffix, checked) \ + bool task_has_##suffix(task_t task) \ + { \ + assert(task); \ + return (task->security_config. suffix); \ + } \ + \ + void task_set_##suffix(task_t task) \ + { \ + assert(task);\ + task->security_config. suffix = true; \ + } \ + \ + void task_clear_##suffix(task_t task) \ + { \ + assert(task);\ + task->security_config. 
suffix = false; \ + } + +uint32_t +task_get_security_config(task_t task) +{ + assert(task); + return (uint32_t)(task->security_config.value); +} + +TASK_SECURITY_CONFIG_HELPER_DEFINE(hardened_heap, true) +TASK_SECURITY_CONFIG_HELPER_DEFINE(tpro, true) + +uint8_t +task_get_platform_restrictions_version(task_t task) +{ + assert(task); + return task->security_config.platform_restrictions_version; +} + +void +task_set_platform_restrictions_version(task_t task, uint64_t version) +{ + assert(task); + /* platform_restrictions_version is a 3-bit field */ + if (version < 8) { + task->security_config.platform_restrictions_version = (uint8_t)version; + } +} + +uint8_t +task_get_hardened_process_version(task_t task) +{ + assert(task); + return task->security_config.hardened_process_version; +} +void +task_set_hardened_process_version(task_t task, uint64_t version) +{ + assert(task); + task->security_config.hardened_process_version = (uint8_t)version; +} + #if __has_feature(ptrauth_calls) @@ -9850,9 +10064,10 @@ task_set_pac_exception_fatal_flag( * We must not apply this security policy on tasks which have opted out of mach hardening to * avoid regressions in third party plugins and third party apps when using AMFI boot-args */ - bool platform_binary = task_get_platform_binary(task); + ipc_space_policy_t pol = ipc_policy_for_task(task); + bool platform_binary = pol & IPC_SPACE_POLICY_PLATFORM; #if XNU_TARGET_OS_OSX - platform_binary &= !task_opted_out_mach_hardening(task); + platform_binary &= !(pol & IPC_SPACE_POLICY_OPTED_OUT); #endif /* XNU_TARGET_OS_OSX */ /* @@ -9864,7 +10079,8 @@ task_set_pac_exception_fatal_flag( return; } - if (IOTaskHasEntitlement(task, PAC_EXCEPTION_ENTITLEMENT) || task_get_hardened_runtime(task)) { + if (IOTaskHasEntitlement(task, PAC_EXCEPTION_ENTITLEMENT) || + (task_get_platform_restrictions_version(task) >= 1)) { pac_hardened_task = true; set_flags |= TFRO_PAC_ENFORCE_USER_STATE; } @@ -10132,7 +10348,11 @@ task_get_control_port_options(task_t task) return task_get_ro(task)->task_control_port_options; } -void +/* + * intentionally static, as calling this after the task has been started + * will have no affect, control ports cannot go from immovable back to movable + */ +static void task_set_control_port_options(task_t task, task_control_port_options_t opts) { zalloc_ro_update_field(ZONE_ID_PROC_RO, task_get_ro(task), @@ -10232,7 +10452,7 @@ task_launch_conclave(mach_port_name_t port __unused) kern_return_t kr = KERN_FAILURE; assert3u(port, ==, MACH_PORT_NULL); exclaves_resource_t *conclave = task_get_conclave(current_task()); - if (conclave == NULL) { + if (conclave == NULL || exclaves_is_forwarding_resource(conclave)) { return kr; } @@ -10300,7 +10520,7 @@ task_stop_conclave(task_t task, bool gather_crash_bt) { thread_t thread = current_thread(); - if (task->conclave == NULL) { + if (task->conclave == NULL || exclaves_is_forwarding_resource(task->conclave)) { return; } @@ -10324,7 +10544,7 @@ task_suspend_conclave(task_t task) { thread_t thread = current_thread(); - if (task->conclave == NULL) { + if (task->conclave == NULL || exclaves_is_forwarding_resource(task->conclave)) { return; } @@ -10344,7 +10564,7 @@ task_resume_conclave(task_t task) { thread_t thread = current_thread(); - if (task->conclave == NULL) { + if (task->conclave == NULL || exclaves_is_forwarding_resource(task->conclave)) { return; } @@ -10363,7 +10583,7 @@ kern_return_t task_stop_conclave_upcall(void) { task_t task = current_task(); - if (task->conclave == NULL) { + if (task->conclave == NULL || 
exclaves_is_forwarding_resource(task->conclave)) { return KERN_INVALID_TASK; } @@ -10392,7 +10612,7 @@ task_suspend_conclave_upcall(uint64_t *scid_list, size_t scid_list_count) thread_t thread; int scid_count = 0; kern_return_t kr; - if (task->conclave == NULL) { + if (task->conclave == NULL || exclaves_is_forwarding_resource(task->conclave)) { return KERN_INVALID_TASK; } @@ -10417,7 +10637,7 @@ kern_return_t task_crash_info_conclave_upcall(task_t task, const struct conclave_sharedbuffer_t *shared_buf, uint32_t length) { - if (task->conclave == NULL) { + if (task->conclave == NULL || exclaves_is_forwarding_resource(task->conclave)) { return KERN_INVALID_TASK; } @@ -10517,7 +10737,7 @@ task_add_conclave_crash_info(task_t task, void *crash_info_ptr) void *crash_info; uint32_t crash_info_length = 0; - if (task->conclave == NULL) { + if (task->conclave == NULL || exclaves_is_forwarding_resource(task->conclave)) { return; } @@ -10572,36 +10792,4 @@ task_best_name(task_t task) return proc_best_name(task_get_proc_raw(task)); } -/* - * Set AST_MACH_EXCEPTION on all threads owned by this task. - * Called with the task locked. - */ -void -task_set_ast_mach_exception(task_t task) -{ - spl_t s = splsched(); - - /* Set an AST on each of the task's threads, sending IPIs if needed */ - thread_t thread; - queue_iterate(&task->threads, thread, thread_t, task_threads) { - if (thread == current_thread()) { - thread_ast_set(thread, AST_MACH_EXCEPTION); - ast_propagate(thread); - } else { - processor_t processor; - - thread_lock(thread); - thread_ast_set(thread, AST_MACH_EXCEPTION); - processor = thread->last_processor; - if (processor != PROCESSOR_NULL && - processor->state == PROCESSOR_RUNNING && - processor->active_thread == thread) { - cause_ast_check(processor); - } - thread_unlock(thread); - } - }; - - splx(s); -} diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index e04334ee5..fe717a0c8 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -105,6 +105,8 @@ #include #include #include +#include + #if CONFIG_EXCLAVES #include #endif /* CONFIG_EXCLAVES */ @@ -169,7 +171,8 @@ struct task_pend_token { tpt_update_turnstile :1, tpt_update_tg_app_flag :1, tpt_update_game_mode :1, - tpt_update_carplay_mode :1; + tpt_update_carplay_mode :1, + tpt_update_appnap :1; }; uint32_t tpt_value; }; @@ -178,15 +181,41 @@ struct task_pend_token { typedef struct task_pend_token task_pend_token_s; typedef struct task_pend_token *task_pend_token_t; +struct task_security_config { + union { + struct { + uint8_t hardened_heap: 1, + tpro :1, + reserved: 1, + platform_restrictions_version :3; + uint8_t hardened_process_version; + }; + uint16_t value; + }; +}; + +typedef struct task_security_config task_security_config_s; struct task_watchports; #include +struct ucred; + #ifdef MACH_BSD struct proc; struct proc_ro; #endif +__options_closed_decl(task_memlimit_flags_t, uint32_t, { + /* if set, use active attributes, otherwise use inactive attributes */ + TASK_MEMLIMIT_IS_ACTIVE = 0x01, + /* if set, exceeding current memlimit will prove fatal to the task */ + TASK_MEMLIMIT_IS_FATAL = 0x02, + /* if set, suppress exc_resource exception when task exceeds active memory limit */ + TASK_MEMLIMIT_ACTIVE_EXC_RESOURCE = 0x04, + /* if set, suppress exc_resource exception when task exceeds inactive memory limit */ + TASK_MEMLIMIT_INACTIVE_EXC_RESOURCE = 0x08 +}); struct task { /* Synchronization/destruction information */ @@ -241,9 +270,7 @@ struct task { integer_t importance; /* priority offset (BSD 'nice' value) */ #define 
task_is_immovable(task) \ - !!(task_get_control_port_options(task) & TASK_CONTROL_PORT_IMMOVABLE) -#define task_is_pinned(task) \ - !!(task_get_control_port_options(task) & TASK_CONTROL_PORT_PINNED) + !!(task_get_control_port_options(task) & TASK_CONTROL_PORT_IMMOVABLE_MASK) /* Statistics */ uint64_t total_runnable_time; @@ -260,7 +287,6 @@ struct task { #if CONFIG_CSR struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_settable_self") itk_settable_self; /* a send right */ #endif /* CONFIG_CSR */ - struct ipc_port * XNU_PTRAUTH_SIGNED_PTR("task.itk_self") itk_self; /* immovable/pinned task port, does not hold right */ struct exception_action exc_actions[EXC_TYPES_COUNT]; /* special exception port used by task_register_hardened_exception_handler */ struct hardened_exception_action hardened_exception_action; @@ -293,10 +319,6 @@ struct task { counter_t cow_faults; /* copy on write fault counter */ counter_t messages_sent; /* messages sent counter */ counter_t messages_received; /* messages received counter */ - counter_t pages_grabbed; /* pages grabbed */ - counter_t pages_grabbed_kern; /* pages grabbed (kernel) */ - counter_t pages_grabbed_iopl; /* pages grabbed (iopl) */ - counter_t pages_grabbed_upl; /* pages grabbed (upl) */ uint32_t decompressions; /* decompression counter (from threads that already terminated) */ uint32_t syscalls_mach; /* mach system call counter */ uint32_t syscalls_unix; /* unix system call counter */ @@ -336,8 +358,6 @@ struct task { #define TF_USE_PSET_HINT_CLUSTER_TYPE 0x00200000 /* bind task to task->pset_hint->pset_cluster_type */ #define TF_DYLD_ALL_IMAGE_FINAL 0x00400000 /* all_image_info_addr can no longer be changed */ #define TF_HASPROC 0x00800000 /* task points to a proc */ -#define TF_HAS_REPLY_PORT_TELEMETRY 0x10000000 /* Rate limit telemetry for reply port security semantics violations rdar://100244531 */ -#define TF_HAS_PROVISIONAL_REPLY_PORT_TELEMETRY 0x20000000 /* Rate limit telemetry for creating provisional reply port rdar://136996362 */ #define TF_GAME_MODE 0x40000000 /* Set the game mode bit for CLPC */ #define TF_CARPLAY_MODE 0x80000000 /* Set the carplay mode bit for CLPC */ @@ -350,11 +370,11 @@ struct task { * RO-protected flags: */ #define TFRO_CORPSE 0x00000020 /* task is a corpse */ -#define TFRO_HARDENED 0x00000100 /* task is a hardened runtime binary */ #if XNU_TARGET_OS_OSX -#define TFRO_MACH_HARDENING_OPT_OUT 0x00000200 /* task might load third party plugins on macOS and should be opted out of mach hardening */ +#define TFRO_MACH_HARDENING_OPT_OUT 0x00000040 /* task might load third party plugins on macOS and should be opted out of mach hardening */ #endif /* XNU_TARGET_OS_OSX */ -#define TFRO_PLATFORM 0x00000400 /* task is a platform binary */ +#define TFRO_PLATFORM 0x00000080 /* task is a platform binary */ + #define TFRO_FILTER_MSG 0x00004000 /* task calls into message filter callback before sending a message */ #define TFRO_PAC_EXC_FATAL 0x00010000 /* task is marked a corpse if a PAC exception occurs */ #define TFRO_JIT_EXC_FATAL 0x00020000 /* kill the task on access violations from privileged JIT code */ @@ -363,6 +383,9 @@ struct task { #define TFRO_HAS_KD_ACCESS 0x02000000 /* Access to the kernel exclave resource domain */ #endif /* CONFIG_EXCLAVES */ #define TFRO_FREEZE_EXCEPTION_PORTS 0x04000000 /* Setting new exception ports on the task/thread is disallowed */ +#if CONFIG_EXCLAVES +#define TFRO_HAS_SENSOR_MIN_ON_TIME_ACCESS 0x08000000 /* Access to sensor minimum on time call */ +#endif /* CONFIG_EXCLAVES */ /* * Task is 
running within a 64-bit address space. @@ -414,18 +437,6 @@ struct task { #define task_clear_has_proc(task) \ ((task)->t_flags &= ~TF_HASPROC) -#define task_has_reply_port_telemetry(task) \ - (((task)->t_flags & TF_HAS_REPLY_PORT_TELEMETRY) != 0) - -#define task_set_reply_port_telemetry(task) \ - ((task)->t_flags |= TF_HAS_REPLY_PORT_TELEMETRY) - -#define task_has_provisional_reply_port_telemetry(task) \ - (((task)->t_flags & TF_HAS_PROVISIONAL_REPLY_PORT_TELEMETRY) != 0) - -#define task_set_provisional_reply_port_telemetry(task) \ - ((task)->t_flags |= TF_HAS_PROVISIONAL_REPLY_PORT_TELEMETRY) - uint32_t t_procflags; /* general-purpose task flags protected by proc_lock (PL) */ #define TPF_NONE 0 #define TPF_DID_EXEC 0x00000001 /* task has been execed to a new task */ @@ -445,6 +456,8 @@ struct task { uint32_t t_kpc; /* kpc flags */ #endif /* CONFIG_CPU_COUNTERS */ + _Atomic darwin_gpu_role_t t_gpu_role; + bool pidsuspended; /* pid_suspend called; no threads can execute */ bool frozen; /* frozen; private resident pages committed to swap */ bool changing_freeze_state; /* in the process of freezing or thawing */ @@ -511,11 +524,7 @@ struct task { low_mem_privileged_listener :1, /* if set, task would like to know about pressure changes before other tasks on the system */ mem_notify_reserved :27; /* reserved for future use */ - uint32_t memlimit_is_active :1, /* if set, use active attributes, otherwise use inactive attributes */ - memlimit_is_fatal :1, /* if set, exceeding current memlimit will prove fatal to the task */ - memlimit_active_exc_resource :1, /* if set, suppress exc_resource exception when task exceeds active memory limit */ - memlimit_inactive_exc_resource :1, /* if set, suppress exc_resource exception when task exceeds inactive memory limit */ - memlimit_attrs_reserved :28; /* reserved for future use */ + task_memlimit_flags_t _Atomic memlimit_flags; io_stat_info_t task_io_stats; @@ -557,6 +566,8 @@ struct task { unsigned int task_region_info_flags:1; unsigned int task_has_crossed_thread_limit:1; unsigned int task_rr_in_flight:1; /* a t_rr_synchronzie() is in flight */ + unsigned int task_jetsam_realtime_audio:1; + /* * A task's coalition set is "adopted" in task_create_internal * and unset in task_deallocate_internal, so each array member @@ -616,13 +627,14 @@ struct task { /* Auxiliary code-signing information */ uint64_t task_cs_auxiliary_info; + /* Runtime security mitigations */ + task_security_config_s security_config; }; ZONE_DECLARE_ID(ZONE_ID_PROC_TASK, void *); extern zone_t proc_task_zone; extern task_control_port_options_t task_get_control_port_options(task_t task); -extern void task_set_control_port_options(task_t task, task_control_port_options_t opts); /* * EXC_GUARD default delivery behavior for optional Mach port and VM guards. 
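The task_memlimit_flags_t declaration above collapses four one-bit bitfields into a single atomic word, which is what lets task_set_exc_resource_bit() in task.c gate the once-per-limit EXC_RESOURCE delivery with a single fetch-or. Below is a minimal user-space sketch of that pattern, using C11 atomics in place of the kernel's os_atomic_* wrappers; the struct and function names are hypothetical stand-ins, not the kernel's.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors task_memlimit_flags_t: one word, four independent bits. */
enum {
    MEMLIMIT_IS_ACTIVE             = 0x01,
    MEMLIMIT_IS_FATAL              = 0x02,
    MEMLIMIT_ACTIVE_EXC_RESOURCE   = 0x04,
    MEMLIMIT_INACTIVE_EXC_RESOURCE = 0x08,
};

struct fake_task {
    _Atomic uint32_t memlimit_flags;
};

/* Analogue of task_set_exc_resource_bit(): set the bit once and report
 * whether this caller was the one that flipped it from 0 to 1. */
static bool
set_exc_resource_bit(struct fake_task *t, bool memlimit_is_active)
{
    uint32_t bit = memlimit_is_active ? MEMLIMIT_ACTIVE_EXC_RESOURCE
                                      : MEMLIMIT_INACTIVE_EXC_RESOURCE;
    uint32_t orig = atomic_fetch_or(&t->memlimit_flags, bit);
    return !(orig & bit);   /* true only for the first setter */
}

int
main(void)
{
    struct fake_task t = { .memlimit_flags = MEMLIMIT_IS_ACTIVE };

    printf("first crossing raises EXC_RESOURCE: %d\n",
        set_exc_resource_bit(&t, true));   /* prints 1 */
    printf("second crossing is suppressed:     %d\n",
        set_exc_resource_bit(&t, true));   /* prints 0 */
    return 0;
}

Because the test-and-set is one atomic read-modify-write, concurrent threads crossing the same limit cannot both win the race, which is the property the separate memlimit_active_exc_resource / memlimit_inactive_exc_resource bitfields could only provide under the task lock.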
@@ -669,6 +681,13 @@ extern void init_task_ledgers(void); extern task_t current_task(void) __pure2; +__pure2 +static inline ipc_space_t +current_space(void) +{ + return current_task()->itk_space; +} + extern bool task_is_driver(task_t task); extern uint32_t task_ro_flags_get(task_t task); extern void task_ro_flags_set(task_t task, uint32_t flags); @@ -994,15 +1013,6 @@ extern bool task_opted_out_mach_hardening( task_t task); #endif /* XNU_TARGET_OS_OSX */ -extern void -task_set_hardened_runtime( - task_t task, - bool is_hardened); - -extern boolean_t -task_is_hardened_binary( - task_t task); - extern boolean_t task_is_a_corpse( task_t task); @@ -1012,16 +1022,16 @@ extern boolean_t task_is_ipc_active( extern void task_set_corpse( task_t task); -extern void task_set_exc_guard_ctrl_port_default( +extern void task_set_exc_guard_default( task_t task, - thread_t main_thread, const char *name, - unsigned int namelen, + unsigned long namelen, boolean_t is_simulated, uint32_t platform, uint32_t sdk); -extern void task_set_immovable_pinned(task_t task); +extern void task_copyout_control_port( + task_t task); extern bool task_set_ca_client_wi( task_t task, @@ -1119,6 +1129,8 @@ extern kern_return_t task_get_diag_footprint_limit_internal(task_t, uint64_t *, extern kern_return_t task_set_diag_footprint_limit(task_t task, uint64_t new_limit_mb, uint64_t *old_limit_mb); #endif /* CONFIG_MEMORYSTATUS */ #endif /* DEBUG || DEVELOPMENT */ +extern kern_return_t task_get_conclave_mem_limit(task_t, uint64_t *conclave_limit); +extern kern_return_t task_set_conclave_mem_limit(task_t, uint64_t conclave_limit); extern security_token_t *task_get_sec_token(task_t task); extern void task_set_sec_token(task_t task, security_token_t *token); @@ -1133,12 +1145,14 @@ extern void task_set_mach_kobj_filter_mask(task_t task, uint8_t *mask); extern mach_vm_address_t task_get_all_image_info_addr(task_t task); /* Jetsam memlimit attributes */ -extern boolean_t task_get_memlimit_is_active(task_t task); -extern boolean_t task_get_memlimit_is_fatal(task_t task); -extern void task_set_memlimit_is_active(task_t task, boolean_t memlimit_is_active); -extern void task_set_memlimit_is_fatal(task_t task, boolean_t memlimit_is_fatal); -extern boolean_t task_has_triggered_exc_resource(task_t task, boolean_t memlimit_is_active); -extern void task_mark_has_triggered_exc_resource(task_t task, boolean_t memlimit_is_active); +extern bool task_get_memlimit_is_active(task_t task); +extern bool task_get_memlimit_is_fatal(task_t task); +extern void task_set_memlimit_is_active(task_t task, bool memlimit_is_active); +extern void task_set_memlimit_is_fatal(task_t task, bool memlimit_is_fatal); +extern bool task_set_exc_resource_bit(task_t task, bool memlimit_is_active); +extern void task_reset_triggered_exc_resource(task_t task, bool memlimit_is_active); +extern bool task_get_jetsam_realtime_audio(task_t task); +extern void task_set_jetsam_realtime_audio(task_t task, bool realtime_audio); extern uint64_t task_get_dirty_start(task_t task); extern void task_set_dirty_start(task_t task, uint64_t start); @@ -1235,6 +1249,9 @@ struct _task_ledger_indices { int pages_grabbed_kern; int pages_grabbed_iopl; int pages_grabbed_upl; +#if CONFIG_DEFERRED_RECLAIM + int est_reclaimable; +#endif /* CONFIG_DEFERRED_RECLAIM */ #if CONFIG_FREEZE int frozen_to_swap; #endif /* CONFIG_FREEZE */ @@ -1244,6 +1261,28 @@ struct _task_ledger_indices { int swapins; }; +/* + * Each runtime security mitigation that we support for userland processes + * is tracked in the task 
security configuration and managed by the following + * helpers. + */ +#define TASK_SECURITY_CONFIG_HELPER_DECLARE(suffix) \ + extern bool task_has_##suffix(task_t); \ + extern void task_set_##suffix(task_t); \ + extern void task_clear_##suffix(task_t); \ + extern void task_no_set_##suffix(task_t task) \ + +extern uint32_t task_get_security_config(task_t); + +TASK_SECURITY_CONFIG_HELPER_DECLARE(hardened_heap); +TASK_SECURITY_CONFIG_HELPER_DECLARE(tpro); + +uint8_t task_get_platform_restrictions_version(task_t task); +void task_set_platform_restrictions_version(task_t task, uint64_t version); +uint8_t task_get_hardened_process_version(task_t task); +void task_set_hardened_process_version(task_t task, uint64_t version); + + /* * Many of the task ledger entries use a reduced feature set * (specifically they just use LEDGER_ENTRY_ALLOW_PANIC_ON_NEGATIVE) @@ -1259,7 +1298,11 @@ struct _task_ledger_indices { * flags, you need to increment this count. * Otherwise, PPL systems will panic at boot. */ +#if CONFIG_DEFERRED_RECLAIM +#define TASK_LEDGER_NUM_SMALL_INDICES 34 +#else /* CONFIG_DEFERRED_RECLAIM */ #define TASK_LEDGER_NUM_SMALL_INDICES 33 +#endif /* !CONFIG_DEFERRED_RECLAIM */ extern struct _task_ledger_indices task_ledgers; /* requires task to be unlocked, returns a referenced thread */ @@ -1272,6 +1315,7 @@ extern void task_rollup_accounting_info(task_t new_task, task_t parent_task); extern kern_return_t task_io_monitor_ctl(task_t task, uint32_t *flags); extern void task_set_did_exec_flag(task_t task); extern void task_clear_exec_copy_flag(task_t task); +extern bool task_is_initproc(task_t task); extern boolean_t task_is_exec_copy(task_t); extern boolean_t task_did_exec(task_t task); extern boolean_t task_is_active(task_t task); @@ -1304,8 +1348,10 @@ void task_set_shared_region_id(task_t task, char *id); extern boolean_t task_has_assertions(task_t task); /* End task_policy */ -extern void task_set_gpu_denied(task_t task, boolean_t denied); +extern void task_set_gpu_role(task_t task, darwin_gpu_role_t gpu_role); extern boolean_t task_is_gpu_denied(task_t task); +/* Returns PRIO_DARWIN_GPU values defined in sys/resource_private.h */ +extern darwin_gpu_role_t task_get_gpu_role(task_t task); extern void task_set_game_mode(task_t task, bool enabled); /* returns true if update must be pushed to coalition (Automatically handled by task_set_game_mode) */ @@ -1409,7 +1455,6 @@ extern boolean_t get_task_frozen(task_t); extern ipc_port_t convert_task_to_port(task_t); extern ipc_port_t convert_task_to_port_kernel(task_t); extern ipc_port_t convert_task_to_port_external(task_t); -extern ipc_port_t convert_task_to_port_pinned(task_t); extern void convert_task_array_to_ports(task_array_t, size_t, mach_task_flavor_t); extern ipc_port_t convert_task_read_to_port(task_t); @@ -1555,14 +1600,11 @@ extern bool task_is_translated(task_t task); #endif - #ifdef MACH_KERNEL_PRIVATE void task_procname(task_t task, char *buf, int size); const char *task_best_name(task_t task); -void task_set_ast_mach_exception(task_t task); - #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/task_ident.c b/osfmk/kern/task_ident.c index daef04141..230eeaea6 100644 --- a/osfmk/kern/task_ident.c +++ b/osfmk/kern/task_ident.c @@ -44,7 +44,7 @@ extern void* proc_find_ident(struct proc_ident const *i); extern int proc_rele(void* p); extern task_t proc_task(void* p); -extern struct proc_ident proc_ident(void* p); +extern struct proc_ident proc_ident_with_policy(void* p, uint8_t policy); extern kern_return_t 
task_conversion_eval(task_t caller, task_t victim, int flavor); /* Exported to kexts */ @@ -86,15 +86,14 @@ tidt_release(task_id_token_t token) * Ports of type IKOT_TASK_FATAL use task_ident objects to avoid holding a task reference * and are created to send resource limit notifications */ - int kotype = ip_kotype(port); - if (kotype == IKOT_TASK_ID_TOKEN || kotype == IKOT_TASK_FATAL) { - ipc_kobject_dealloc_port(port, 0, kotype); - } else { - panic("%s: unexpected kotype of port %p: got %d", - __func__, port, kotype); - } + ipc_kobject_type_t kotype = ip_type(port); + release_assert(kotype == IKOT_TASK_ID_TOKEN || + kotype == IKOT_TASK_FATAL); + ipc_kobject_dealloc_port(port, IPC_KOBJECT_NO_MSCOUNT, + kotype); #else /* CONFIG_PROC_RESOURCE_LIMITS */ - ipc_kobject_dealloc_port(port, 0, IKOT_TASK_ID_TOKEN); + ipc_kobject_dealloc_port(port, IPC_KOBJECT_NO_MSCOUNT, + IKOT_TASK_ID_TOKEN); #endif /* CONFIG_PROC_RESOURCE_LIMITS */ } @@ -120,6 +119,7 @@ task_id_token_no_senders(ipc_port_t port, __unused mach_port_mscount_t mscount) } IPC_KOBJECT_DEFINE(IKOT_TASK_ID_TOKEN, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = task_id_token_no_senders); @@ -144,7 +144,7 @@ task_create_identity_token( token->task_uniqueid = task->task_uniqueid; } else if (task->active && bsd_info != NULL) { /* must check if the task is active to avoid a UAF - rdar://91431693 */ - token->ident = proc_ident(bsd_info); + token->ident = proc_ident_with_policy(bsd_info, IDENT_VALIDATION_PROC_EXACT); } else { task_unlock(task); zfree(task_id_token_zone, token); @@ -235,7 +235,8 @@ task_identity_token_get_task_port( /* holding a ref on (corpse) task */ if (flavor == TASK_FLAVOR_CONTROL && task == current_task()) { - *portp = convert_task_to_port_pinned(task); /* consumes task ref */ + /* copyout determines immovability, see `should_mark_immovable_send` */ + *portp = convert_task_to_port(task); /* consumes task ref */ return KERN_SUCCESS; } @@ -317,17 +318,20 @@ ipc_port_t convert_task_id_token_to_port( task_id_token_t token) { - __assert_only bool kr; - if (token == TASK_ID_TOKEN_NULL) { return IP_NULL; } zone_require(task_id_token_zone, token); - kr = ipc_kobject_make_send_lazy_alloc_port(&token->port, - token, IKOT_TASK_ID_TOKEN, IPC_KOBJECT_ALLOC_NONE); - assert(kr == TRUE); /* no-senders notification is armed, consumes token ref */ + /* + * make a send right and donate our reference for + * task_id_token_no_senders if this is the first send right + */ + if (!ipc_kobject_make_send_lazy_alloc_port(&token->port, + token, IKOT_TASK_ID_TOKEN)) { + tidt_release(token); + } return token->port; } @@ -340,7 +344,8 @@ task_id_token_set_port( task_id_token_t token, ipc_port_t port) { - assert(token && port && (ip_kotype(port) == IKOT_TASK_FATAL)); + assert(token && port && ip_type(port) == IKOT_TASK_FATAL); token->port = port; } + #endif /* CONFIG_PROC_RESOURCE_LIMITS */ diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c index 1393800a6..2a7145682 100644 --- a/osfmk/kern/task_policy.c +++ b/osfmk/kern/task_policy.c @@ -201,7 +201,7 @@ typedef struct thread_watchlist { #endif /* CONFIG_TASKWATCH */ -extern int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap); +extern int memorystatus_update_priority_for_appnap(proc_t p); /* Importance Inheritance related helper functions */ @@ -830,7 +830,8 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ thread_qos_t role_clamp = THREAD_QOS_UNSPECIFIED; - if (requested.trp_apptype == 
TASK_APPTYPE_APP_DEFAULT) { + if (requested.trp_apptype == TASK_APPTYPE_APP_DEFAULT || + requested.trp_apptype == TASK_APPTYPE_APP_NONUI) { switch (next.tep_role) { case TASK_FOREGROUND_APPLICATION: /* Foreground apps get urgent scheduler priority */ @@ -853,6 +854,11 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ next.tep_qos_ceiling = THREAD_QOS_LEGACY; break; + case TASK_USER_INIT_APPLICATION: + /* i.e. 'off-screen', but doing user-initiated work */ + next.tep_qos_ceiling = THREAD_QOS_USER_INITIATED; + break; + case TASK_CONTROL_APPLICATION: case TASK_GRAPHICS_SERVER: next.tep_qos_ui_is_urgent = 1; @@ -896,13 +902,15 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ bool wants_darwinbg = false; bool wants_all_sockets_bg = false; /* Do I want my existing sockets to be bg */ bool wants_watchersbg = false; /* Do I want my pidbound threads to be bg */ - bool adaptive_bg_only = false; /* This task is BG only because it's adaptive unboosted */ + bool bg_clamps_turnstiles = false; /* This task does not want turnstile-boost-above-task */ - /* Adaptive daemons are DARWIN_BG unless boosted, and don't get network throttled. */ + /* + * Adaptive daemons are DARWIN_BG unless boosted, and don't get network throttled. + * Their threads can be turnstile-boosted out of BG. + */ if (requested.trp_apptype == TASK_APPTYPE_DAEMON_ADAPTIVE && requested.trp_boosted == 0) { wants_darwinbg = true; - adaptive_bg_only = true; } /* @@ -914,16 +922,17 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ if (requested.trp_int_darwinbg || requested.trp_ext_darwinbg || next.tep_role == TASK_DARWINBG_APPLICATION) { wants_watchersbg = wants_all_sockets_bg = wants_darwinbg = true; - adaptive_bg_only = false; + bg_clamps_turnstiles = true; } if (next.tep_coalition_bg) { wants_watchersbg = wants_all_sockets_bg = wants_darwinbg = true; - adaptive_bg_only = false; + bg_clamps_turnstiles = true; } /* Application launching in special Transparent App Lifecycle throttle mode */ - if (requested.trp_apptype == TASK_APPTYPE_APP_DEFAULT && + if ((requested.trp_apptype == TASK_APPTYPE_APP_DEFAULT || + requested.trp_apptype == TASK_APPTYPE_APP_NONUI) && requested.trp_role == TASK_THROTTLE_APPLICATION) { next.tep_tal_engaged = 1; } @@ -931,13 +940,22 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ /* Background daemons are always DARWIN_BG, no exceptions, and don't get network throttled. */ if (requested.trp_apptype == TASK_APPTYPE_DAEMON_BACKGROUND) { wants_darwinbg = true; - adaptive_bg_only = false; + bg_clamps_turnstiles = true; } if (next.tep_qos_clamp == THREAD_QOS_BACKGROUND || next.tep_qos_clamp == THREAD_QOS_MAINTENANCE) { wants_darwinbg = true; - adaptive_bg_only = false; + bg_clamps_turnstiles = true; + } + + /* + * Runaway-mitigated processes are darwinbg unless their threads + * are turnstile-boosted. 
+ */ + if (requested.trp_runaway_mitigation) { + wants_darwinbg = true; + next.tep_runaway_mitigation = 1; } /* Calculate side effects of DARWIN_BG */ @@ -957,7 +975,9 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ next.tep_watchers_bg = 1; } - next.tep_adaptive_bg = adaptive_bg_only; + if (wants_darwinbg && bg_clamps_turnstiles == false) { + next.tep_promote_above_task = 1; + } /* Calculate low CPU priority */ @@ -1010,16 +1030,10 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ } /* Calculate suppression-active flag */ - boolean_t appnap_transition = false; - if (requested.trp_sup_active && requested.trp_boosted == 0) { next.tep_sup_active = 1; } - if (task->effective_policy.tep_sup_active != next.tep_sup_active) { - appnap_transition = true; - } - /* Calculate timer QOS */ int latency_qos = requested.trp_base_latency_qos; @@ -1078,13 +1092,14 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ /* Calculate 'live donor' status for live importance */ switch (requested.trp_apptype) { - case TASK_APPTYPE_APP_TAL: + case TASK_APPTYPE_APP_NONUI: case TASK_APPTYPE_APP_DEFAULT: if (requested.trp_ext_darwinbg == 1 || next.tep_coalition_bg || (next.tep_sup_active == 1 && (task_policy_suppression_flags & TASK_POLICY_SUPPRESSION_NONDONOR)) || - next.tep_role == TASK_DARWINBG_APPLICATION) { + next.tep_role == TASK_DARWINBG_APPLICATION || + next.tep_runaway_mitigation) { next.tep_live_donor = 0; } else { next.tep_live_donor = 1; @@ -1120,6 +1135,8 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ next.tep_tal_engaged = 0; next.tep_role = TASK_UNSPECIFIED; next.tep_suppressed_cpu = 0; + next.tep_runaway_mitigation = 0; + next.tep_promote_above_task = 0; } /* @@ -1165,6 +1182,17 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ pend_token->tpt_update_live_donor = 1; } + if (prev.tep_sup_active != next.tep_sup_active) { + pend_token->tpt_update_appnap = 1; + } + + /* runaway mitigation mode generates its own dedicated tracepoint */ + if (prev.tep_runaway_mitigation != next.tep_runaway_mitigation) { + KDBG_RELEASE(IMPORTANCE_CODE(IMP_RUNAWAY_MITIGATION, 0) | + (next.tep_runaway_mitigation ? DBG_FUNC_START : DBG_FUNC_END), + task_pid(task), next.tep_terminated); + } + /* * Step 5: * Update other subsystems as necessary if something has changed @@ -1188,7 +1216,7 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ prev.tep_lowpri_cpu != next.tep_lowpri_cpu || prev.tep_new_sockets_bg != next.tep_new_sockets_bg || prev.tep_terminated != next.tep_terminated || - prev.tep_adaptive_bg != next.tep_adaptive_bg) { + prev.tep_promote_above_task != next.tep_promote_above_task) { update_threads = true; } @@ -1198,6 +1226,7 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ */ if (prev.tep_latency_qos != next.tep_latency_qos || prev.tep_role != next.tep_role || + prev.tep_runaway_mitigation != next.tep_runaway_mitigation || prev.tep_sfi_managed != next.tep_sfi_managed) { update_sfi = true; } @@ -1297,20 +1326,6 @@ task_policy_update_internal_locked(task_t task, bool in_create, task_pend_token_ } } - /* - * Use the app-nap transitions to influence the - * transition of the process within the jetsam band - * [and optionally its live-donor status] - * On macOS only. 
- */ - if (appnap_transition) { - if (task->effective_policy.tep_sup_active == 1) { - memorystatus_update_priority_for_appnap(((proc_t) get_bsdtask_info(task)), TRUE); - } else { - memorystatus_update_priority_for_appnap(((proc_t) get_bsdtask_info(task)), FALSE); - } - } - if (update_termination) { /* * This update is done after the terminated bit is set, @@ -1480,6 +1495,16 @@ task_policy_update_complete_unlocked(task_t task, task_pend_token_t pend_token) task_coalition_thread_group_carplay_mode_update(task); } #endif /* CONFIG_THREAD_GROUPS */ + + /* + * Use the app-nap transitions to influence the + * transition of the process within the jetsam band + * [and optionally its live-donor status] + * On macOS only. + */ + if (pend_token->tpt_update_appnap) { + memorystatus_update_priority_for_appnap((proc_t) get_bsdtask_info(task)); + } } /* @@ -1663,6 +1688,11 @@ proc_set_task_policy_locked(task_t task, requested.trp_over_through_qos = value2; break; + case TASK_POLICY_RUNAWAY_MITIGATION: + assert(category == TASK_POLICY_ATTRIBUTE); + requested.trp_runaway_mitigation = value; + break; + default: panic("unknown task policy: %d %d %d %d", category, flavor, value, value2); break; @@ -1728,6 +1758,10 @@ proc_get_task_policy(task_t task, assert(category == TASK_POLICY_ATTRIBUTE); value = requested.trp_sfi_managed; break; + case TASK_POLICY_RUNAWAY_MITIGATION: + assert(category == TASK_POLICY_ATTRIBUTE); + value = requested.trp_runaway_mitigation; + break; default: panic("unknown policy_flavor %d", flavor); break; @@ -1860,6 +1894,12 @@ proc_get_effective_task_policy(task_t task, */ value = task->effective_policy.tep_terminated; break; + case TASK_POLICY_RUNAWAY_MITIGATION: + /* + * This shows whether or not a process has been tagged for runaway mitigation. 
+ */ + value = task->effective_policy.tep_runaway_mitigation; + break; default: panic("unknown policy_flavor %d", flavor); break; @@ -1959,6 +1999,9 @@ proc_darwin_role_to_task_role(int darwin_role, task_role_t* task_role) case PRIO_DARWIN_ROLE_DARWIN_BG: role = TASK_DARWINBG_APPLICATION; break; + case PRIO_DARWIN_ROLE_USER_INIT: + role = TASK_USER_INIT_APPLICATION; + break; default: return EINVAL; } @@ -1984,6 +2027,8 @@ proc_task_role_to_darwin_role(task_role_t task_role) return PRIO_DARWIN_ROLE_TAL_LAUNCH; case TASK_DARWINBG_APPLICATION: return PRIO_DARWIN_ROLE_DARWIN_BG; + case TASK_USER_INIT_APPLICATION: + return PRIO_DARWIN_ROLE_USER_INIT; case TASK_UNSPECIFIED: default: return PRIO_DARWIN_ROLE_DEFAULT; @@ -2027,6 +2072,7 @@ proc_set_task_spawnpolicy(task_t task, thread_t thread, int apptype, int qos_cla switch (apptype) { case TASK_APPTYPE_APP_DEFAULT: + case TASK_APPTYPE_APP_NONUI: /* Apps become donors via the 'live-donor' flag instead of the static donor flag */ task_importance_mark_donor(task, FALSE); task_importance_mark_live_donor(task, TRUE); @@ -2159,8 +2205,6 @@ proc_inherit_task_role(task_t new_task, proc_set_task_policy(new_task, TASK_POLICY_ATTRIBUTE, TASK_POLICY_ROLE, role); } -extern void * XNU_PTRAUTH_SIGNED_PTR("initproc") initproc; - /* * Compute the default main thread qos for a task */ @@ -2172,7 +2216,6 @@ task_compute_main_thread_qos(task_t task) thread_qos_t qos_clamp = task->requested_policy.trp_qos_clamp; switch (task->requested_policy.trp_apptype) { - case TASK_APPTYPE_APP_TAL: case TASK_APPTYPE_APP_DEFAULT: primordial_qos = THREAD_QOS_USER_INTERACTIVE; break; @@ -2181,6 +2224,7 @@ task_compute_main_thread_qos(task_t task) case TASK_APPTYPE_DAEMON_STANDARD: case TASK_APPTYPE_DAEMON_ADAPTIVE: case TASK_APPTYPE_DRIVER: + case TASK_APPTYPE_APP_NONUI: primordial_qos = THREAD_QOS_LEGACY; break; @@ -2189,7 +2233,7 @@ task_compute_main_thread_qos(task_t task) break; } - if (get_bsdtask_info(task) == initproc) { + if (task_is_initproc(task)) { /* PID 1 gets a special case */ primordial_qos = MAX(primordial_qos, THREAD_QOS_USER_INITIATED); } @@ -2205,20 +2249,6 @@ task_compute_main_thread_qos(task_t task) return primordial_qos; } - -/* for process_policy to check before attempting to set */ -boolean_t -proc_task_is_tal(task_t task) -{ - return (task->requested_policy.trp_apptype == TASK_APPTYPE_APP_TAL) ? 
TRUE : FALSE; -} - -int -task_get_apptype(task_t task) -{ - return task->requested_policy.trp_apptype; -} - boolean_t task_is_daemon(task_t task) { @@ -2242,12 +2272,12 @@ task_is_driver(task_t task) return task->requested_policy.trp_apptype == TASK_APPTYPE_DRIVER; } -boolean_t +bool task_is_app(task_t task) { switch (task->requested_policy.trp_apptype) { case TASK_APPTYPE_APP_DEFAULT: - case TASK_APPTYPE_APP_TAL: + case TASK_APPTYPE_APP_NONUI: return TRUE; default: return FALSE; @@ -2284,8 +2314,7 @@ proc_get_darwinbgstate(task_t task, uint32_t * flagsp) } #endif /* !defined(XNU_TARGET_OS_OSX) */ - if (task->requested_policy.trp_apptype == TASK_APPTYPE_APP_DEFAULT || - task->requested_policy.trp_apptype == TASK_APPTYPE_APP_TAL) { + if (task_is_app(task)) { *flagsp |= PROC_FLAG_APPLICATION; } @@ -2376,6 +2405,7 @@ trequested_1(task_t task) static uintptr_t teffective_0(task_t task) { + static_assert(sizeof(struct task_effective_policy) == sizeof(uint64_t), "size invariant violated"); uintptr_t* raw = (uintptr_t*)&task->effective_policy; return raw[0]; diff --git a/osfmk/kern/telemetry.c b/osfmk/kern/telemetry.c index 7c3bd533c..302aa1c4b 100644 --- a/osfmk/kern/telemetry.c +++ b/osfmk/kern/telemetry.c @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -68,6 +69,7 @@ #include #include +#include #include #include @@ -1040,7 +1042,6 @@ _write_task_snapshot( { struct task *task = get_threadtask(target->thread); struct proc *p = get_bsdtask_info(task); - bool user64_va = task_has_64Bit_addr(task); tsnap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC; tsnap->pid = proc_pid(p); @@ -1078,31 +1079,14 @@ _write_task_snapshot( tsnap->p_start_usec = ((uint64_t)proximate_pid << 32) | (uint32_t)origin_pid; #endif /* CONFIG_COALITIONS */ - if (task->t_flags & TF_TELEMETRY) { - tsnap->ss_flags |= kTaskRsrcFlagged; - } - - if (proc_get_effective_task_policy(task, TASK_POLICY_DARWIN_BG)) { - tsnap->ss_flags |= kTaskDarwinBG; - } - - if (proc_get_effective_task_policy(task, TASK_POLICY_ROLE) == TASK_FOREGROUND_APPLICATION) { - tsnap->ss_flags |= kTaskIsForeground; - } - if (user64_va) { - tsnap->ss_flags |= kUser64_p; - } - - uint32_t bgstate = 0; - proc_get_darwinbgstate(task, &bgstate); - - if (bgstate & PROC_FLAG_ADAPTIVE_IMPORTANT) { - tsnap->ss_flags |= kTaskIsBoosted; - } - if (bgstate & PROC_FLAG_SUPPRESSED) { - tsnap->ss_flags |= kTaskIsSuppressed; - } + uint64_t ss_flags = kcdata_get_task_ss_flags(task, false); + /* + * sadly the original ss_flags field is not big enough, replicate the + * full flags in the unused disk_reads_count field + */ + tsnap->ss_flags = (uint32_t)ss_flags; + tsnap->disk_reads_count = ss_flags; tsnap->latency_qos = task_grab_latency_qos(task); diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index c0eb7d962..f689e1e78 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -185,9 +185,12 @@ ZONE_DEFINE_ID(ZONE_ID_THREAD_RO, "threads_ro", struct thread_ro, ZC_READONLY); static void thread_port_with_flavor_no_senders(ipc_port_t, mach_port_mscount_t); -IPC_KOBJECT_DEFINE(IKOT_THREAD_CONTROL); +IPC_KOBJECT_DEFINE(IKOT_THREAD_CONTROL, + .iko_op_movable_send = true, /* see ipc_should_mark_immovable_send */ + .iko_op_label_free = ipc_kobject_label_free); IPC_KOBJECT_DEFINE(IKOT_THREAD_READ, - .iko_op_no_senders = thread_port_with_flavor_no_senders); + .iko_op_no_senders = thread_port_with_flavor_no_senders, + .iko_op_label_free = ipc_kobject_label_free); IPC_KOBJECT_DEFINE(IKOT_THREAD_INSPECT, .iko_op_no_senders = thread_port_with_flavor_no_senders); @@ 
-244,7 +247,7 @@ static_assert(CTID_MAX_THREAD_NUMBER <= COMPACT_ID_MAX); __startup_data static struct thread init_thread; static SECURITY_READ_ONLY_LATE(uint32_t) ctid_nonce; -COMPACT_ID_TABLE_DEFINE(static, ctid_table); +COMPACT_ID_TABLE_DEFINE(__static_testable, ctid_table); __startup_func static void @@ -263,9 +266,9 @@ STARTUP(ZALLOC, STARTUP_RANK_FOURTH, thread_zone_startup); static void thread_deallocate_enqueue(thread_t thread); static void thread_deallocate_complete(thread_t thread); -static void ctid_table_remove(thread_t thread); -static void ctid_table_add(thread_t thread); -static void ctid_table_init(void); +__static_testable void ctid_table_remove(thread_t thread); +__static_testable void ctid_table_add(thread_t thread); +__static_testable void ctid_table_init(void); #ifdef MACH_BSD extern void proc_exit(void *); @@ -327,7 +330,8 @@ void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(t os_refgrp_decl(static, thread_refgrp, "thread", NULL); -static inline void +__static_testable __inline_testable void init_thread_from_template(thread_t thread); +__static_testable __inline_testable void init_thread_from_template(thread_t thread) { /* @@ -1110,11 +1114,7 @@ thread_terminate_queue_invoke(mpsc_queue_chain_t e, /* * Clear the port low two bits to tell pthread that thread is gone. */ -#ifndef NO_PORT_GEN - kport &= ~MACH_PORT_MAKE(0, IE_BITS_GEN_MASK + IE_BITS_GEN_ONE); -#else - kport |= MACH_PORT_MAKE(0, ~(IE_BITS_GEN_MASK + IE_BITS_GEN_ONE)); -#endif + kport &= ~ipc_entry_name_mask(MACH_PORT_NULL); (void)copyoutmap_atomic32(task->map, kport, uthread_joiner_address(uth)); uthread_joiner_wake(task, uth); @@ -1286,14 +1286,9 @@ __options_decl(thread_create_internal_options_t, uint32_t, { TH_OPTION_NOSUSP = 0x02, TH_OPTION_WORKQ = 0x04, TH_OPTION_MAINTHREAD = 0x08, + TH_OPTION_AIO_WORKQ = 0x10, }); -void -main_thread_set_immovable_pinned(thread_t thread) -{ - ipc_main_thread_set_immovable_pinned(thread); -} - /* * Create a new thread. * Doesn't start the thread running. 
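/*
 * A minimal standalone sketch (not part of the patch) of the option-flag
 * dispatch used with the TH_OPTION_AIO_WORKQ flag added above: the
 * thread_create_waiting_internal() hunk further down masks the mutually
 * exclusive creation options out of the flags word and handles them with a
 * single switch, so an unexpected combination panics instead of silently
 * falling through an if/else chain. All names below are illustrative
 * stand-ins, not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

enum {
	OPT_WORKQ      = 0x04,
	OPT_MAINTHREAD = 0x08,
	OPT_AIO_WORKQ  = 0x10,
};

static void
dispatch_create_option(uint32_t options)
{
	switch (options & (OPT_WORKQ | OPT_AIO_WORKQ | OPT_MAINTHREAD)) {
	case OPT_WORKQ:
		puts("park on the pthread workqueue event");
		break;
	case OPT_AIO_WORKQ:
		puts("park on the AIO workqueue event");
		break;
	case OPT_MAINTHREAD:
		puts("wait uninterruptibly as the main thread");
		break;
	default:
		/* e.g. two exclusive options were set at once */
		fprintf(stderr, "invalid thread options 0x%x\n", options);
		abort();
	}
}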
@@ -1310,7 +1305,6 @@ thread_create_internal( thread_t *out_thread) { thread_t new_thread; - ipc_thread_init_options_t init_options = IPC_THREAD_INIT_NONE; struct thread_ro tro_tpl = { }; bool first_thread = false; kern_return_t kr = KERN_FAILURE; @@ -1355,10 +1349,6 @@ thread_create_internal( init_thread_from_template(new_thread); } - if (options & TH_OPTION_MAINTHREAD) { - init_options |= IPC_THREAD_INIT_MAINTHREAD; - } - os_ref_init_count_raw(&new_thread->ref_count, &thread_refgrp, 2); machine_thread_create(new_thread, parent_task, first_thread); @@ -1366,7 +1356,7 @@ thread_create_internal( #ifdef MACH_BSD uthread_init(parent_task, get_bsdthread_info(new_thread), - &tro_tpl, (options & TH_OPTION_WORKQ) != 0); + &tro_tpl, (options & (TH_OPTION_WORKQ | TH_OPTION_AIO_WORKQ)) != 0); if (!task_is_a_corpse(parent_task)) { /* * uthread_init will set tro_cred (with a +1) @@ -1381,7 +1371,7 @@ thread_create_internal( lck_mtx_init(&new_thread->mutex, &thread_lck_grp, LCK_ATTR_NULL); - ipc_thread_init(parent_task, new_thread, &tro_tpl, init_options); + ipc_thread_init(parent_task, new_thread, &tro_tpl); thread_ro_create(parent_task, new_thread, &tro_tpl); @@ -1535,7 +1525,7 @@ thread_create_internal( sched_set_thread_base_priority(new_thread, new_priority); #if defined(CONFIG_SCHED_TIMESHARE_CORE) - new_thread->sched_stamp = sched_tick; + new_thread->sched_stamp = os_atomic_load(&sched_tick, relaxed); #if CONFIG_SCHED_CLUTCH new_thread->pri_shift = sched_clutch_thread_pri_shift(new_thread, new_thread->th_sched_bucket); #else /* CONFIG_SCHED_CLUTCH */ @@ -1733,12 +1723,23 @@ thread_create_waiting_internal( thread_mtx_lock(thread); thread_set_pending_block_hint(thread, block_hint); - if (options & TH_OPTION_WORKQ) { + + switch (options & (TH_OPTION_WORKQ | TH_OPTION_AIO_WORKQ | TH_OPTION_MAINTHREAD)) { + case TH_OPTION_WORKQ: thread->static_param = true; event = workq_thread_init_and_wq_lock(task, thread); - } else if (options & TH_OPTION_MAINTHREAD) { + break; + case TH_OPTION_AIO_WORKQ: + thread->static_param = true; + event = aio_workq_thread_init_and_wq_lock(task, thread); + break; + case TH_OPTION_MAINTHREAD: wait_interrupt = THREAD_UNINT; + break; + default: + panic("Invalid thread options 0x%x", options); } + thread_start_in_assert_wait(thread, assert_wait_queue(event), CAST_EVENT64_T(event), wait_interrupt); @@ -1895,6 +1896,24 @@ thread_create_workq_waiting( options, new_thread); } +kern_return_t +thread_create_aio_workq_waiting( + task_t task, + thread_continue_t continuation, + thread_t *new_thread) +{ + /* + * Create thread, but don't pin control port just yet, in case someone calls + * task_threads() and deallocates pinned port before kernel copyout happens, + * which will result in pinned port guard exception. Instead, pin and copyout + * atomically during workq_setup_and_run(). + */ + int options = TH_OPTION_AIO_WORKQ | TH_OPTION_NOSUSP; + + return thread_create_waiting_internal(task, continuation, NULL, + kThreadWaitParkedWorkQueue, options, new_thread); +} + /* * kernel_thread_create: * @@ -2572,7 +2591,6 @@ mach_exception_ast(thread_t t) }; exit_with_mach_exception(bsd_info, info, flags); } - } static void @@ -2695,7 +2713,7 @@ SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void) return; } - if (disable_exc_resource_during_audio && audio_active) { + if (disable_exc_resource_during_audio && audio_active && task->task_jetsam_realtime_audio) { printf("process %s[%d] thread %llu caught burning CPU! 
" "EXC_RESOURCE & termination suppressed due to audio playback\n", procname, pid, tid); @@ -2767,7 +2785,7 @@ SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t task, int thread_count) return; } - if (disable_exc_resource_during_audio && audio_active) { + if (disable_exc_resource_during_audio && audio_active && task->task_jetsam_realtime_audio) { printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE " "suppressed due to audio playback.\n", procname, pid, thread_count); return; @@ -3098,6 +3116,12 @@ uthread_tid( return 0; } +uint64_t +thread_c_switch(thread_t thread) +{ + return thread != THREAD_NULL ? thread->c_switch : 0; +} + uint16_t thread_set_tag(thread_t th, uint16_t tag) { @@ -4000,9 +4024,7 @@ thread_kern_get_kernel_maxpri(void) * should be deallocated here when there are no senders remaining. */ static void -thread_port_with_flavor_no_senders( - ipc_port_t port, - mach_port_mscount_t mscount __unused) +thread_port_with_flavor_no_senders(ipc_port_t port, mach_port_mscount_t mscount) { thread_ro_t tro; thread_t thread; @@ -4010,11 +4032,12 @@ thread_port_with_flavor_no_senders( ipc_kobject_type_t kotype; ip_mq_lock(port); - if (port->ip_srights > 0) { + if (!ipc_kobject_is_mscount_current_locked(port, mscount)) { ip_mq_unlock(port); return; } - kotype = ip_kotype(port); + + kotype = ip_type(port); assert((IKOT_THREAD_READ == kotype) || (IKOT_THREAD_INSPECT == kotype)); thread = ipc_kobject_get_locked(port, kotype); if (thread != THREAD_NULL) { @@ -4050,20 +4073,18 @@ thread_port_with_flavor_no_senders( * this notification being generated and actually being handled here. */ tro = get_thread_ro(thread); - if (!ip_active(port) || - tro->tro_ports[flavor] != port || - port->ip_srights > 0) { + if (tro->tro_ports[flavor] != port || + !ipc_kobject_is_mscount_current_locked(port, mscount)) { ip_mq_unlock(port); thread_mtx_unlock(thread); thread_deallocate(thread); return; } - assert(tro->tro_ports[flavor] == port); zalloc_ro_clear_field(ZONE_ID_THREAD_RO, tro, tro_ports[flavor]); thread_mtx_unlock(thread); - ipc_kobject_dealloc_port_and_unlock(port, 0, kotype); + ipc_kobject_dealloc_port_and_unlock(port, mscount, kotype); thread_deallocate(thread); } @@ -4100,7 +4121,7 @@ thread_self_region_page_shift_set( } __startup_func -static void +__static_testable void ctid_table_init(void) { /* @@ -4152,7 +4173,7 @@ ctid_table_remove(thread_t thread) thread_t ctid_get_thread_unsafe(ctid_t ctid) { - if (ctid && compact_id_slab_valid(&ctid_table, ctid_unmangle(ctid))) { + if (ctid && ctid <= CTID_MAX_THREAD_NUMBER && compact_id_slab_valid(&ctid_table, ctid_unmangle(ctid))) { return *compact_id_resolve(&ctid_table, ctid_unmangle(ctid)); } return THREAD_NULL; @@ -4404,8 +4425,9 @@ dtrace_thread_bootstrap(void) if (thread->t_dtrace_flags & TH_DTRACE_EXECSUCCESS) { thread->t_dtrace_flags &= ~TH_DTRACE_EXECSUCCESS; DTRACE_PROC(exec__success); + extern uint64_t kdp_task_exec_meta_flags(task_t task); KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXEC), - task_pid(task)); + task_pid(task), kdp_task_exec_meta_flags(task)); } DTRACE_PROC(start); } diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index 14981217d..7e55c4aa4 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -108,6 +108,7 @@ #include #include #include +#include #include #include @@ -165,6 +166,7 @@ __options_closed_decl(thread_tag_t, uint16_t, { THREAD_TAG_PTHREAD = 0x10, THREAD_TAG_WORKQUEUE = 0x20, THREAD_TAG_USER_JOIN = 0x40, + THREAD_TAG_AIO_WORKQUEUE = 0x80, }); typedef struct thread_ro *thread_ro_t; 
@@ -204,11 +206,10 @@ struct thread_ro { #endif struct task *tro_task; - struct ipc_port *tro_self_port; + struct ipc_port *tro_ports[THREAD_SELF_PORT_COUNT]; /* no right */ #if CONFIG_CSR struct ipc_port *tro_settable_self_port; /* send right */ #endif /* CONFIG_CSR */ - struct ipc_port *tro_ports[THREAD_SELF_PORT_COUNT]; /* no right */ struct exception_action *tro_exc_actions; }; @@ -341,7 +342,6 @@ __options_decl(thread_exclaves_inspection_flags_t, uint16_t, { #endif /* CONFIG_EXCLAVES */ - typedef union thread_rr_state { uint32_t trr_value; struct { @@ -423,6 +423,15 @@ struct thread { bool th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_COUNT]; bool th_shared_rsrc_heavy_user[CLUSTER_SHARED_RSRC_TYPE_COUNT]; bool th_shared_rsrc_heavy_perf_control[CLUSTER_SHARED_RSRC_TYPE_COUNT]; + /* + * Caution! These bits should only be written/read by the current, + * just previous, or thread_setrun()-ing processor, before this + * thread can be picked up to run by its next processor. + */ + uint8_t + th_expired_quantum_on_lower_core:1, + th_expired_quantum_on_higher_core:1, + :0; #endif /* CONFIG_SCHED_EDGE */ #if CONFIG_SCHED_CLUTCH @@ -458,6 +467,9 @@ struct thread { #define TH_OPT_IPC_TG_BLOCKED 0x2000 /* Thread blocked in sync IPC and has made the thread group blocked callout */ #define TH_OPT_FORCED_LEDGER 0x4000 /* Thread has a forced CPU limit */ #define TH_IN_MACH_EXCEPTION 0x8000 /* Thread is currently handling a mach exception */ +#if CONFIG_EXCLAVES +#define TH_OPT_AOE 0x10000 /* Thread is an AOE exclave thread */ +#endif /* CONFIG_EXCLAVES */ bool wake_active; /* wake event on stop */ bool at_safe_point; /* thread_abort_safely allowed */ @@ -717,12 +729,7 @@ struct thread { mach_msg_size_t msize; /* actual size for the msg */ mach_msg_size_t asize; /* actual size for aux data */ mach_port_name_t receiver_name; /* the receive port name */ - union { - struct ipc_kmsg *XNU_PTRAUTH_SIGNED_PTR("thread.ith_kmsg") kmsg; /* received message */ -#if MACH_FLIPC - struct ipc_mqueue *XNU_PTRAUTH_SIGNED_PTR("thread.ith_peekq") peekq; /* mqueue to peek at */ -#endif /* MACH_FLIPC */ - }; + struct ipc_kmsg *XNU_PTRAUTH_SIGNED_PTR("thread.ith_kmsg") kmsg; /* received message */ } receive; struct { struct semaphore *waitsemaphore; /* semaphore ref */ @@ -1070,9 +1077,6 @@ struct thread { #define ith_asize saved.receive.asize #define ith_receiver_name saved.receive.receiver_name #define ith_kmsg saved.receive.kmsg -#if MACH_FLIPC -#define ith_peekq saved.receive.peekq -#endif /* MACH_FLIPC */ #define sth_waitsemaphore saved.sema.waitsemaphore #define sth_signalsemaphore saved.sema.signalsemaphore @@ -1421,63 +1425,69 @@ thread_get_tag_internal(thread_t thread) } #endif /* MACH_KERNEL_PRIVATE */ -uint64_t thread_last_run_time(thread_t thread); +extern uint64_t thread_last_run_time( + thread_t thread); extern kern_return_t thread_state_initialize( - thread_t thread); + thread_t thread); extern kern_return_t thread_setstatus( - thread_t thread, - int flavor, - thread_state_t tstate, + thread_t thread, + int flavor, + thread_state_t tstate, mach_msg_type_number_t count); extern kern_return_t thread_setstatus_from_user( - thread_t thread, - int flavor, - thread_state_t tstate, + thread_t thread, + int flavor, + thread_state_t tstate, mach_msg_type_number_t count, - thread_state_t old_tstate, + thread_state_t old_tstate, mach_msg_type_number_t old_count, thread_set_status_flags_t flags); extern kern_return_t thread_getstatus( - thread_t thread, - int flavor, - thread_state_t tstate, - mach_msg_type_number_t 
*count); - -extern void main_thread_set_immovable_pinned(thread_t thread); + thread_t thread, + int flavor, + thread_state_t tstate, + mach_msg_type_number_t *count); extern kern_return_t thread_getstatus_to_user( - thread_t thread, - int flavor, - thread_state_t tstate, - mach_msg_type_number_t *count, + thread_t thread, + int flavor, + thread_state_t tstate, + mach_msg_type_number_t *count, thread_set_status_flags_t flags); extern kern_return_t thread_create_with_continuation( - task_t task, - thread_t *new_thread, - thread_continue_t continuation); + task_t task, + thread_t *new_thread, + thread_continue_t continuation); -extern kern_return_t main_thread_create_waiting(task_t task, - thread_continue_t continuation, - event_t event, - thread_t *new_thread); +extern kern_return_t main_thread_create_waiting( + task_t task, + thread_continue_t continuation, + event_t event, + thread_t *new_thread); extern kern_return_t thread_create_workq_waiting( task_t task, thread_continue_t thread_return, - thread_t *new_thread, + thread_t *new_thread, bool is_permanently_bound); +extern kern_return_t thread_create_aio_workq_waiting( + task_t task, + thread_continue_t thread_return, + thread_t *new_thread); + extern void thread_yield_internal( mach_msg_timeout_t interval); extern void thread_yield_to_preemption(void); -extern void thread_depress_timer_setup(thread_t self); +extern void thread_depress_timer_setup( + thread_t self); /* * Thread-private CPU limits: apply a private CPU limit to this thread only. Available actions are: @@ -1609,6 +1619,7 @@ extern int thread_task_has_ldt(thread_t); #endif extern void set_thread_pagein_error(thread_t, int); extern event_t workq_thread_init_and_wq_lock(task_t, thread_t); // bsd/pthread/ +extern event_t aio_workq_thread_init_and_wq_lock(task_t, thread_t); // bsd/aio/ struct proc; struct uthread; @@ -1850,7 +1861,7 @@ void thread_clear_eager_preempt(thread_t thread); void thread_set_honor_qlimit(thread_t thread); void thread_clear_honor_qlimit(thread_t thread); extern ipc_port_t convert_thread_to_port(thread_t); -extern ipc_port_t convert_thread_to_port_pinned(thread_t); +extern ipc_port_t convert_thread_to_port_immovable(thread_t); extern ipc_port_t convert_thread_inspect_to_port(thread_inspect_t); extern ipc_port_t convert_thread_read_to_port(thread_read_t); extern void convert_thread_array_to_ports(thread_act_array_t, size_t, mach_thread_flavor_t); @@ -1863,7 +1874,7 @@ extern void thread_iokit_tls_set(uint32_t index, void * data); extern int thread_self_region_page_shift(void); extern void thread_self_region_page_shift_set(int pgshift); extern kern_return_t thread_create_immovable(task_t task, thread_t *new_thread); -extern kern_return_t thread_terminate_pinned(thread_t thread); +extern kern_return_t thread_terminate_immovable(thread_t thread); struct thread_attr_for_ipc_propagation; extern kern_return_t thread_get_ipc_propagate_attr(thread_t thread, struct thread_attr_for_ipc_propagation *attr); @@ -1951,6 +1962,8 @@ extern kern_return_t thread_get_special_port( int which, ipc_port_t *portp); +extern uint64_t thread_c_switch(thread_t thread); + #endif /* XNU_KERNEL_PRIVATE */ /*! 
@function thread_has_thread_name diff --git a/osfmk/kern/thread_act.c b/osfmk/kern/thread_act.c index 800b0e089..a706309e3 100644 --- a/osfmk/kern/thread_act.c +++ b/osfmk/kern/thread_act.c @@ -81,11 +81,11 @@ #include #include +#include #include #include -#include static void act_abort(thread_t thread); @@ -93,76 +93,87 @@ static void thread_suspended(void *arg, wait_result_t result); static void thread_set_apc_ast(thread_t thread); static void thread_set_apc_ast_locked(thread_t thread); -extern void proc_name(int pid, char * buf, int size); -extern boolean_t IOCurrentTaskHasEntitlement(const char *); - -CA_EVENT(thread_set_state, - CA_STATIC_STRING(CA_PROCNAME_LEN), current_proc); - -static void -send_thread_set_state_telemetry(void) -{ - ca_event_t ca_event = CA_EVENT_ALLOCATE(thread_set_state); - CA_EVENT_TYPE(thread_set_state) * event = ca_event->data; - - proc_name(task_pid(current_task()), (char *) &event->current_proc, CA_PROCNAME_LEN); - - CA_EVENT_SEND(ca_event); -} +extern boolean_t IOTaskHasEntitlement(task_t task, const char *entitlement); /* bootarg to create lightweight corpse for thread set state lockdown */ TUNABLE(bool, tss_should_crash, "tss_should_crash", true); -static inline boolean_t -thread_set_state_allowed(thread_t thread, int flavor) +#define task_has_tss_entitlement(task) IOTaskHasEntitlement((task), \ + "com.apple.private.thread-set-state") + +static inline bool +thread_set_state_allowed( + thread_t thread, + int flavor, + thread_set_status_flags_t flags) { - task_t target_task = get_threadtask(thread); + task_t curr_task = TASK_NULL; + task_t target_task = TASK_NULL; + ipc_space_policy_t target_pol; + ipc_space_policy_t exception_tss_policy_level; #if DEVELOPMENT || DEBUG /* disable the feature if the boot-arg is disabled. 
*/ if (!tss_should_crash) { - return TRUE; + return true; } #endif /* DEVELOPMENT || DEBUG */ - /* hardened binaries must have entitlement - all others ok */ - if (task_is_hardened_binary(target_task) - && !(thread->options & TH_IN_MACH_EXCEPTION) /* Allowed for now - rdar://103085786 */ - && FLAVOR_MODIFIES_CORE_CPU_REGISTERS(flavor) /* only care about locking down PC/LR */ -#if XNU_TARGET_OS_OSX - && !task_opted_out_mach_hardening(target_task) -#endif /* XNU_TARGET_OS_OSX */ -#if CONFIG_ROSETTA - && !task_is_translated(target_task) /* Ignore translated tasks */ -#endif /* CONFIG_ROSETTA */ - && !IOCurrentTaskHasEntitlement("com.apple.private.thread-set-state") - ) { - /* fatal crash */ - mach_port_guard_exception(MACH_PORT_NULL, 0, kGUARD_EXC_THREAD_SET_STATE); - send_thread_set_state_telemetry(); - return FALSE; + /* No security check needed if neither of these two flags were set */ + if ((flags & TSSF_CHECK_ENTITLEMENT) == 0 && + (thread->options & TH_IN_MACH_EXCEPTION) == 0) { + return true; + } + + curr_task = current_task(); + target_task = get_threadtask(thread); + target_pol = ipc_space_policy(get_task_ipcspace(target_task)); + /* Allow if the task is translated, simulated, or has IPC hardening turned off */ + if (!ipc_should_apply_policy(target_pol, IPC_SPACE_POLICY_DEFAULT)) { + return true; + } + + /* + * Setting the thread state from a userspace mach exception handler is + * allowed iff it comes from the same process, or if the process is + * being debugged (in dev mode) + */ +#if !(XNU_TARGET_OS_OSX || XNU_TARGET_OS_BRIDGE) + exception_tss_policy_level = IPC_POLICY_ENHANCED_V1; +#else + exception_tss_policy_level = IPC_POLICY_ENHANCED_V2; +#endif /* !(XNU_TARGET_OS_OSX || XNU_TARGET_OS_BRIDGE) */ + if ((thread->options & TH_IN_MACH_EXCEPTION) && + target_task != curr_task && + ipc_should_apply_policy(target_pol, exception_tss_policy_level) && + (!is_address_space_debugged(get_bsdtask_info(target_task))) && + !task_has_tss_entitlement(curr_task)) { + mach_port_guard_exception(flavor, 0, kGUARD_EXC_THREAD_SET_STATE); + return false; + } + + /* enhanced security binaries must have entitlement - all others ok */ + if ((flags & TSSF_CHECK_ENTITLEMENT) && + !(thread->options & TH_IN_MACH_EXCEPTION) && /* Allowed for now - rdar://103085786 */ + ipc_should_apply_policy(target_pol, IPC_POLICY_ENHANCED_V1) && + FLAVOR_MODIFIES_CORE_CPU_REGISTERS(flavor) && /* only care about locking down PC/LR */ + !task_has_tss_entitlement(curr_task)) { + mach_port_guard_exception(flavor, 0, kGUARD_EXC_THREAD_SET_STATE); + return false; } #if __has_feature(ptrauth_calls) /* Do not allow Fatal PAC exception binaries to set Debug state */ - if (task_is_pac_exception_fatal(target_task) - && machine_thread_state_is_debug_flavor(flavor) -#if XNU_TARGET_OS_OSX - && !task_opted_out_mach_hardening(target_task) -#endif /* XNU_TARGET_OS_OSX */ -#if CONFIG_ROSETTA - && !task_is_translated(target_task) /* Ignore translated tasks */ -#endif /* CONFIG_ROSETTA */ - && !IOCurrentTaskHasEntitlement("com.apple.private.thread-set-state") - ) { - /* fatal crash */ - mach_port_guard_exception(MACH_PORT_NULL, 0, kGUARD_EXC_THREAD_SET_STATE); - send_thread_set_state_telemetry(); - return FALSE; + if ((flags & TSSF_CHECK_ENTITLEMENT) && + task_is_pac_exception_fatal(target_task) && + machine_thread_state_is_debug_flavor(flavor) && + !task_has_tss_entitlement(curr_task)) { + mach_port_guard_exception(flavor, 0, kGUARD_EXC_THREAD_SET_STATE); + return false; } #endif /* __has_feature(ptrauth_calls) */ - return TRUE; + return 
true; } /* @@ -265,7 +276,7 @@ thread_terminate_internal( } /* unconditionally unpin the thread in internal termination */ - ipc_thread_port_unpin(get_thread_ro(thread)->tro_self_port); + ipc_thread_port_unpin(get_thread_ro(thread)->tro_ports[THREAD_FLAVOR_CONTROL]); thread_mtx_unlock(thread); @@ -337,34 +348,19 @@ thread_terminate_from_user( } /* - * Terminate a thread with pinned control port. + * Terminate a thread with immovable control port. * * Can only be used on threads managed by pthread. Exported in pthread_kern. */ kern_return_t -thread_terminate_pinned( +thread_terminate_immovable( thread_t thread) { - task_t task; - - if (thread == THREAD_NULL) { - return KERN_INVALID_ARGUMENT; - } - - task = get_threadtask(thread); - - - assert(task != kernel_task); + assert(thread == current_thread()); + assert(get_threadtask(thread) != kernel_task); assert(thread_get_tag(thread) & (THREAD_TAG_PTHREAD | THREAD_TAG_MAINTHREAD)); - thread_mtx_lock(thread); - if (task_is_pinned(task) && thread->active) { - assert(get_thread_ro(thread)->tro_self_port->ip_pinned == 1); - } - thread_mtx_unlock(thread); - - kern_return_t result = thread_terminate_internal(thread); - return result; + return thread_terminate_internal(thread); } /* @@ -723,8 +719,11 @@ thread_set_state_internal( return KERN_INVALID_ARGUMENT; } - if ((flags & TSSF_CHECK_ENTITLEMENT) && - !thread_set_state_allowed(thread, flavor)) { + /* + * process will be crashed with kGUARD_EXC_THREAD_SET_STATE + * if thread_set_state_allowed() return false. + */ + if (!thread_set_state_allowed(thread, flavor, flags)) { return KERN_NO_ACCESS; } diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index 589f6b05a..c65772c7b 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -1982,6 +1982,18 @@ thread_call_start_deallocate_timer(thread_call_group_t group) assert(already_enqueued == false); } +static inline uint64_t +thread_call_get_time(thread_call_flavor_t flavor) +{ + if (flavor == TCF_CONTINUOUS) { + return mach_continuous_time(); + } else if (flavor == TCF_ABSOLUTE) { + return mach_absolute_time(); + } else { + panic("invalid timer flavor: %d", flavor); + } +} + /* non-static so dtrace can find it rdar://problem/31156135&31379348 */ void thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1) @@ -1994,17 +2006,11 @@ thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1) thread_call_t call; uint64_t now; + extern uint64_t timer_scan_limit_abs; thread_call_lock_spin(group); - if (flavor == TCF_CONTINUOUS) { - now = mach_continuous_time(); - } else if (flavor == TCF_ABSOLUTE) { - now = mach_absolute_time(); - } else { - panic("invalid timer flavor: %d", flavor); - } - + now = thread_call_get_time(flavor); while ((call = priority_queue_min(&group->delayed_pqueues[flavor], struct thread_call, tc_pqlink)) != NULL) { assert(thread_call_get_group(call) == group); @@ -2020,6 +2026,13 @@ thread_call_delayed_timer(timer_call_param_t p0, timer_call_param_t p1) break; } + /* + * Don't do too much work in one timer interrupt. + */ + if (thread_call_get_time(flavor) > (now + timer_scan_limit_abs)) { + break; + } + /* * If we hit a rate-limited timer, don't eagerly wake it up. * Wait until it reaches the end of the leeway window. 
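/*
 * A minimal standalone sketch (not part of the patch) of the bounded-scan
 * pattern added to thread_call_delayed_timer() above: capture `now` once,
 * then re-read the clock inside the drain loop and stop early once a fixed
 * scan budget has been spent, leaving the remaining entries for the next
 * timer fire. now_fn(), scan_budget, and the queue type are illustrative
 * stand-ins for mach_absolute_time()/mach_continuous_time(),
 * timer_scan_limit_abs, and the delayed priority queue.
 */
#include <stdint.h>

struct pending {
	uint64_t        deadline;
	struct pending *next;
};

static void
drain_with_budget(struct pending **head, uint64_t (*now_fn)(void),
    uint64_t scan_budget, void (*fire)(struct pending *))
{
	uint64_t now = now_fn();

	while (*head != NULL) {
		struct pending *entry = *head;

		if (entry->deadline > now) {
			break;          /* nothing else is due yet */
		}
		if (now_fn() > now + scan_budget) {
			break;          /* budget spent; handle the rest on the next fire */
		}
		*head = entry->next;
		fire(entry);
	}
}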
diff --git a/osfmk/kern/thread_group.c b/osfmk/kern/thread_group.c index 5df48f1f6..f13d9ff66 100644 --- a/osfmk/kern/thread_group.c +++ b/osfmk/kern/thread_group.c @@ -39,6 +39,7 @@ #include #include #include +#include #if CONFIG_THREAD_GROUPS @@ -82,6 +83,7 @@ static struct thread_group *tg_system; static struct thread_group *tg_background; static struct thread_group *tg_vm; static struct thread_group *tg_io_storage; +static struct thread_group *tg_cellular; static struct thread_group *tg_perf_controller; int tg_set_by_bankvoucher; @@ -140,6 +142,8 @@ thread_group_init(void) thread_group_set_name(tg_io_storage, "io storage"); tg_perf_controller = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT); thread_group_set_name(tg_perf_controller, "perf_controller"); + tg_cellular = thread_group_create_and_retain(THREAD_GROUP_FLAGS_DEFAULT); + thread_group_set_name(tg_cellular, "Cellular"); /* * The thread group deallocation queue must be a thread call based queue @@ -517,6 +521,10 @@ thread_group_find_by_id_and_retain(uint64_t id) result = tg_perf_controller; thread_group_retain(tg_perf_controller); break; + case THREAD_GROUP_CELLULAR: + result = tg_cellular; + thread_group_retain(tg_cellular); + break; default: lck_mtx_lock(&tg_lock); qe_foreach_element(tg, &tg_queue, tg_queue_chain) { @@ -1232,6 +1240,15 @@ thread_group_join_io_storage(void) thread_set_thread_group(current_thread(), tg); } +void +thread_group_join_cellular(void) +{ + struct thread_group *tg = thread_group_find_by_id_and_retain(THREAD_GROUP_CELLULAR); + assert(tg != NULL); + assert(current_thread()->thread_group != tg); + thread_set_thread_group(current_thread(), tg); +} + void thread_group_join_perf_controller(void) { @@ -1274,16 +1291,172 @@ sched_perfcontrol_thread_group_recommend(__unused void *machine_data, __unused c /* Use sched_perfcontrol_thread_group_preferred_clusters_set() instead */ } -void -sched_perfcontrol_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_request_bitmap, uint64_t flags, uint64_t matrix_order) +static perfcontrol_class_t +sched_bucket_to_perfcontrol_class(sched_bucket_t bucket) { - sched_edge_matrix_get(edge_matrix, edge_request_bitmap, flags, matrix_order); + switch (bucket) { + case TH_BUCKET_FIXPRI: + return PERFCONTROL_CLASS_ABOVEUI; + case TH_BUCKET_SHARE_FG: + return PERFCONTROL_CLASS_UI; + case TH_BUCKET_SHARE_IN: + return PERFCONTROL_CLASS_USER_INITIATED; + case TH_BUCKET_SHARE_DF: + return PERFCONTROL_CLASS_NONUI; + case TH_BUCKET_SHARE_UT: + return PERFCONTROL_CLASS_UTILITY; + case TH_BUCKET_SHARE_BG: + return PERFCONTROL_CLASS_BACKGROUND; + default: + panic("Unexpected sched bucket %d", bucket); + } +} + +#define MAX_EDGE_MATRIX_SIZE (MAX_PSETS * MAX_PSETS * TH_BUCKET_SCHED_MAX) + +/* + * Iterate through indices of the edge matrix (dimension: num_psets X num_psets X TH_BUCKET_SCHED_MAX), + * and along the way, compute the corresponding index in CLPC's version of the matrix, which has + * dimension: num_psets X num_psets X PERFCONTROL_CLASS_MAX + */ +#define sched_perfcontrol_sched_edge_matrix_iterate(num_psets, edge_ind, sched_ind, ...) 
\ + for (int src = 0; src < num_psets; src++) { \ + for (int dst = 0; dst < num_psets; dst++) { \ + for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { \ + perfcontrol_class_t pc = sched_bucket_to_perfcontrol_class(bucket); \ + int edge_ind = (src * (int)num_psets * PERFCONTROL_CLASS_MAX) + (dst * PERFCONTROL_CLASS_MAX) + pc; \ + int sched_ind = (src * (int)num_psets * TH_BUCKET_SCHED_MAX) + (dst * TH_BUCKET_SCHED_MAX) + bucket; \ + __VA_ARGS__; \ + } \ + } \ + } + +/* Compute the index of a realtime edge within the perfcontrol matrix. */ +static uint64_t +rt_config_edge_index(uint64_t src_pset_id, uint64_t dst_pset_id, uint64_t num_psets) +{ + return (src_pset_id * num_psets * PERFCONTROL_CLASS_MAX) + + (dst_pset_id * PERFCONTROL_CLASS_MAX) + + PERFCONTROL_CLASS_REALTIME; } void -sched_perfcontrol_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changes_bitmap, uint64_t flags, uint64_t matrix_order) +sched_perfcontrol_edge_matrix_by_qos_get(sched_clutch_edge *edge_matrix, bool *edge_requested, uint64_t flags, + uint64_t num_psets, __assert_only uint64_t num_classes) { - sched_edge_matrix_set(edge_matrix, edge_changes_bitmap, flags, matrix_order); + assert3u(num_psets, <=, MAX_PSETS); + assert3u(num_classes, ==, PERFCONTROL_CLASS_MAX); + bool sched_edge_requested[MAX_EDGE_MATRIX_SIZE] = {0}; + sched_perfcontrol_sched_edge_matrix_iterate(num_psets, edge_matrix_ind, sched_matrix_ind, { + if (edge_requested[edge_matrix_ind]) { + sched_edge_requested[sched_matrix_ind] = true; + } + }); + + sched_clutch_edge sched_matrix[MAX_EDGE_MATRIX_SIZE] = {0}; + sched_edge_matrix_get(sched_matrix, sched_edge_requested, flags, num_psets); + + sched_perfcontrol_sched_edge_matrix_iterate(num_psets, edge_matrix_ind, sched_matrix_ind, { + if (sched_edge_requested[sched_matrix_ind]) { + edge_matrix[edge_matrix_ind] = sched_matrix[sched_matrix_ind]; + } + }); + + bool sched_rt_requested[MAX_PSETS * MAX_PSETS] = {}; + for (uint src = 0; src < num_psets; src++) { + for (uint dst = 0; dst < num_psets; dst++) { + const uint64_t edge_matrix_index = rt_config_edge_index(src, dst, num_psets); + if (sched_rt_requested[edge_matrix_index]) { + sched_rt_requested[src * num_psets + dst] = true; + } + } + } + + sched_clutch_edge sched_rt_matrix[MAX_PSETS * MAX_PSETS] = {}; + sched_rt_matrix_get(sched_rt_matrix, sched_rt_requested, num_psets); + + uint64_t rt_matrix_index = 0; + for (uint src = 0; src < num_psets; src++) { + for (uint dst = 0; dst < num_psets; dst++) { + const uint64_t edge_matrix_index = rt_config_edge_index(src, dst, num_psets); + if (edge_requested[edge_matrix_index]) { + edge_matrix[edge_matrix_index] = sched_rt_matrix[rt_matrix_index]; + } + rt_matrix_index++; + } + } +} + +void +sched_perfcontrol_edge_matrix_by_qos_set(sched_clutch_edge *edge_matrix, bool *edge_changed, uint64_t flags, + uint64_t num_psets, __assert_only uint64_t num_classes) +{ + assert3u(num_psets, <=, MAX_PSETS); + assert3u(num_classes, ==, PERFCONTROL_CLASS_MAX); + sched_clutch_edge sched_matrix[MAX_EDGE_MATRIX_SIZE] = {0}; + bool sched_edge_changed[MAX_EDGE_MATRIX_SIZE] = {0}; + sched_perfcontrol_sched_edge_matrix_iterate(num_psets, edge_matrix_ind, sched_matrix_ind, { + if (edge_changed[edge_matrix_ind]) { + sched_matrix[sched_matrix_ind] = edge_matrix[edge_matrix_ind]; + sched_edge_changed[sched_matrix_ind] = true; + } + }); + + sched_edge_matrix_set(sched_matrix, sched_edge_changed, flags, num_psets); + + sched_clutch_edge sched_rt_matrix[MAX_PSETS * MAX_PSETS] = {}; + bool 
sched_rt_changed[MAX_PSETS * MAX_PSETS] = {}; + for (uint src = 0; src < num_psets; src++) { + for (uint dst = 0; dst < num_psets; dst++) { + const uint64_t edge_matrix_ind = rt_config_edge_index(src, dst, num_psets); + const uint64_t sched_matrix_ind = src * num_psets + dst; + if (edge_changed[edge_matrix_ind]) { + sched_rt_matrix[sched_matrix_ind] = edge_matrix[edge_matrix_ind]; + sched_rt_changed[sched_matrix_ind] = true; + } + } + } + sched_rt_matrix_set(sched_rt_matrix, sched_rt_changed, num_psets); +} + +void +sched_perfcontrol_edge_matrix_get(sched_clutch_edge *edge_matrix, bool *edge_requested, uint64_t flags, + uint64_t matrix_order) +{ + assert3u(matrix_order, <=, MAX_PSETS); + bool edge_requested_per_qos[MAX_EDGE_MATRIX_SIZE] = {0}; + for (uint32_t i = 0; i < matrix_order * matrix_order; i++) { + uint32_t expanded_index = (i * TH_BUCKET_SCHED_MAX) + TH_BUCKET_FIXPRI; + edge_requested_per_qos[expanded_index] = edge_requested[i]; + } + + sched_clutch_edge expanded_matrix[MAX_EDGE_MATRIX_SIZE] = {0}; + sched_edge_matrix_get(expanded_matrix, edge_requested_per_qos, flags, matrix_order); + + for (uint32_t i = 0; i < matrix_order * matrix_order; i++) { + if (edge_requested[i]) { + uint32_t expanded_index = (i * TH_BUCKET_SCHED_MAX) + TH_BUCKET_FIXPRI; + edge_matrix[i] = expanded_matrix[expanded_index]; + } + } +} + +void +sched_perfcontrol_edge_matrix_set(sched_clutch_edge *edge_matrix, bool *edge_changed, uint64_t flags, + uint64_t matrix_order) +{ + assert3u(matrix_order, <=, MAX_PSETS); + bool edge_changed_per_qos[MAX_EDGE_MATRIX_SIZE] = {0}; + sched_clutch_edge expanded_matrix[MAX_EDGE_MATRIX_SIZE] = {0}; + for (uint32_t i = 0; i < matrix_order * matrix_order; i++) { + for (uint32_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + uint32_t expanded_index = (i * TH_BUCKET_SCHED_MAX) + bucket; + edge_changed_per_qos[expanded_index] = edge_changed[i]; + expanded_matrix[expanded_index] = edge_matrix[i]; + } + } + + sched_edge_matrix_set(expanded_matrix, edge_changed_per_qos, flags, matrix_order); } void @@ -1291,14 +1464,11 @@ sched_perfcontrol_thread_group_preferred_clusters_set(void *machine_data, uint32 uint32_t overrides[PERFCONTROL_CLASS_MAX], sched_perfcontrol_preferred_cluster_options_t options) { struct thread_group *tg = (struct thread_group *)((uintptr_t)machine_data - offsetof(struct thread_group, tg_machine_data)); - uint32_t tg_bucket_preferred_cluster[TH_BUCKET_SCHED_MAX] = { - [TH_BUCKET_FIXPRI] = (overrides[PERFCONTROL_CLASS_ABOVEUI] != SCHED_PERFCONTROL_PREFERRED_CLUSTER_OVERRIDE_NONE) ? overrides[PERFCONTROL_CLASS_ABOVEUI] : tg_preferred_cluster, - [TH_BUCKET_SHARE_FG] = (overrides[PERFCONTROL_CLASS_UI] != SCHED_PERFCONTROL_PREFERRED_CLUSTER_OVERRIDE_NONE) ? overrides[PERFCONTROL_CLASS_UI] : tg_preferred_cluster, - [TH_BUCKET_SHARE_IN] = (overrides[PERFCONTROL_CLASS_USER_INITIATED] != SCHED_PERFCONTROL_PREFERRED_CLUSTER_OVERRIDE_NONE) ? overrides[PERFCONTROL_CLASS_USER_INITIATED] : tg_preferred_cluster, - [TH_BUCKET_SHARE_DF] = (overrides[PERFCONTROL_CLASS_NONUI] != SCHED_PERFCONTROL_PREFERRED_CLUSTER_OVERRIDE_NONE) ? overrides[PERFCONTROL_CLASS_NONUI] : tg_preferred_cluster, - [TH_BUCKET_SHARE_UT] = (overrides[PERFCONTROL_CLASS_UTILITY] != SCHED_PERFCONTROL_PREFERRED_CLUSTER_OVERRIDE_NONE) ? overrides[PERFCONTROL_CLASS_UTILITY] : tg_preferred_cluster, - [TH_BUCKET_SHARE_BG] = (overrides[PERFCONTROL_CLASS_BACKGROUND] != SCHED_PERFCONTROL_PREFERRED_CLUSTER_OVERRIDE_NONE) ? 
overrides[PERFCONTROL_CLASS_BACKGROUND] : tg_preferred_cluster, - }; + uint32_t tg_bucket_preferred_cluster[TH_BUCKET_SCHED_MAX]; + for (sched_bucket_t bucket = 0; bucket < TH_BUCKET_SCHED_MAX; bucket++) { + perfcontrol_class_t pc = sched_bucket_to_perfcontrol_class(bucket); + tg_bucket_preferred_cluster[bucket] = (overrides[pc] != SCHED_PERFCONTROL_PREFERRED_CLUSTER_OVERRIDE_NONE) ? overrides[pc] : tg_preferred_cluster; + } sched_edge_tg_preferred_cluster_change(tg, tg_bucket_preferred_cluster, options); } @@ -1330,6 +1500,18 @@ sched_perfcontrol_thread_group_recommend(__unused void *machine_data, __unused c SCHED(thread_group_recommendation_change)(tg, new_recommendation); } +void +sched_perfcontrol_edge_matrix_by_qos_get(__unused sched_clutch_edge *edge_matrix, __unused bool *edge_requested, __unused uint64_t flags, + __unused uint64_t num_psets, __unused uint64_t num_classes) +{ +} + +void +sched_perfcontrol_edge_matrix_by_qos_set(__unused sched_clutch_edge *edge_matrix, __unused bool *edge_changed, __unused uint64_t flags, + __unused uint64_t num_psets, __unused uint64_t num_classes) +{ +} + void sched_perfcontrol_edge_matrix_get(__unused sched_clutch_edge *edge_matrix, __unused bool *edge_request_bitmap, __unused uint64_t flags, __unused uint64_t matrix_order) { diff --git a/osfmk/kern/thread_group.h b/osfmk/kern/thread_group.h index 7403d88e7..803ba82d9 100644 --- a/osfmk/kern/thread_group.h +++ b/osfmk/kern/thread_group.h @@ -64,6 +64,7 @@ struct thread_group; #define THREAD_GROUP_VM 3 // kernel VM threads #define THREAD_GROUP_IO_STORAGE 4 // kernel io storage threads #define THREAD_GROUP_PERF_CONTROLLER 5 // kernel CLPC threads +#define THREAD_GROUP_CELLULAR 6 // kernel Cellular threads #define THREAD_GROUP_INVALID UINT64_MAX diff --git a/osfmk/kern/thread_policy.c b/osfmk/kern/thread_policy.c index e81797708..5bcbd01eb 100644 --- a/osfmk/kern/thread_policy.c +++ b/osfmk/kern/thread_policy.c @@ -26,6 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include #include #include @@ -1653,11 +1654,11 @@ thread_policy_update_internal_spinlocked(thread_t thread, bool recompute_priorit next_qos = MAX(requested.thrp_qos_workq_override, next_qos); } - if (task_effective.tep_darwinbg && task_effective.tep_adaptive_bg && + if (task_effective.tep_darwinbg && task_effective.tep_promote_above_task && requested.thrp_qos_promote > THREAD_QOS_BACKGROUND) { /* - * This thread is turnstile-boosted higher than the adaptive clamp - * by a synchronous waiter. Allow that to override the adaptive + * This thread is turnstile-boosted higher than the background clamp + * by a synchronous waiter, and this clamp allows that to override the * clamp temporarily for this thread only. */ next.thep_promote_above_task = true; @@ -1867,6 +1868,13 @@ thread_policy_update_internal_spinlocked(thread_t thread, bool recompute_priorit integer_t old_base_pri = thread->base_pri; + /* promote-above-task generates its own dedicated tracepoint */ + if (prev.thep_promote_above_task != next.thep_promote_above_task) { + KDBG_RELEASE(IMPORTANCE_CODE(IMP_THREAD_PROMOTE_ABOVE_TASK, 0) | + (next.thep_promote_above_task ? 
DBG_FUNC_START : DBG_FUNC_END), + thread_tid(thread), next.thep_terminated); + } + /* * Step 5: * Update other subsystems as necessary if something has changed diff --git a/osfmk/kern/ticket_lock.h b/osfmk/kern/ticket_lock.h index 4a0fe7b75..64a0dec37 100644 --- a/osfmk/kern/ticket_lock.h +++ b/osfmk/kern/ticket_lock.h @@ -58,9 +58,6 @@ __BEGIN_DECLS * * This lower level interface supports an @c *_allow_invalid() * to implement advanced memory reclamation schemes using sequestering. - * Do note that when @c CONFIG_PROB_GZALLOC is engaged, and the target lock - * comes from a zone, PGZ must be handled manually. - * See ipc_object_lock_allow_invalid() for an example of that. * * @c hw_lck_ticket_invalidate() must be used on locks * that will be used this way: in addition to make subsequent calls to diff --git a/osfmk/kern/timeout.c b/osfmk/kern/timeout.c new file mode 100644 index 000000000..ac3c3e54b --- /dev/null +++ b/osfmk/kern/timeout.c @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +kern_timeout_t panic_timeout; /* for debugging */ +boolean_t kern_timeout_panic_initiated = false; + +#if defined(__x86_64__) +#define ml_get_speculative_timebase ml_get_timebase +#endif + +#if DEVELOPMENT || DEBUG +TUNABLE_DT_WRITEABLE(int, timeouts_are_fatal, "machine-timeouts", "timeouts-are-fatal", + "timeouts_are_fatal", 1, TUNABLE_DT_CHECK_CHOSEN); +#endif + +void +kern_timeout_restart(kern_timeout_t *to, timeout_flags_t flags) +{ +#if CONFIG_CPU_COUNTERS + if (__improbable(!(flags & TF_SAMPLE_PMC))) { + to->start_cycles = 0; + to->start_instrs = 0; + } else { + mt_cur_cpu_cycles_instrs_speculative(&to->start_cycles, &to->start_instrs); + } +#endif /* CONFIG_CPU_COUNTERS */ + + if (flags & TF_SAMPLE_INTERRUPT_TIME) { + to->int_mt = recount_current_processor_interrupt_duration_mach(); + } else { + to->int_mt = 0; + } + + to->start_mt = (flags & TF_NONSPEC_TIMEBASE)? 
ml_get_timebase() : ml_get_speculative_timebase(); +} + +void +kern_timeout_start(kern_timeout_t *to, timeout_flags_t flags) +{ + if (flags & TF_BACKTRACE) { + (void) backtrace(&to->bt[0], TO_BT_FRAMES, NULL, NULL); + } + kern_timeout_restart(to, flags); +} + +void +kern_timeout_end(kern_timeout_t *to, timeout_flags_t flags) +{ + to->end_mt = (flags & TF_NONSPEC_TIMEBASE)? ml_get_timebase() : ml_get_speculative_timebase(); + if (flags & TF_SAMPLE_INTERRUPT_TIME) { + to->int_mt = recount_current_processor_interrupt_duration_mach() - to->int_mt; + } +} + +/* + * Zero out the timeout state so that we won't have a timeout triggered later in the processing + * of this timeout. + */ +void +kern_timeout_override(kern_timeout_t *to) +{ + to->start_mt = 0; +#if CONFIG_CPU_COUNTERS + to->start_cycles = 0; + to->start_instrs = 0; +#endif /* CONFIG_CPU_COUNTERS */ +} + +#if CONFIG_CPU_COUNTERS +void +kern_timeout_cycles_instrs(kern_timeout_t *to, uint64_t *cycles, uint64_t *instrs) +{ + uint64_t now_cycles, now_instrs; + + if (__improbable(to->start_cycles == 0)) { + *cycles = 0; + *instrs = 0; + } else { + mt_cur_cpu_cycles_instrs_speculative(&now_cycles, &now_instrs); + *cycles = now_cycles - to->start_cycles; + *instrs = now_instrs - to->start_instrs; + } +} + +void +kern_timeout_cpi(kern_timeout_t *to, uint64_t *cpi_whole, uint64_t *cpi_fractional) +{ + uint64_t cycles, instrs; + + kern_timeout_cycles_instrs(to, &cycles, &instrs); + *cpi_whole = cycles / instrs; + *cpi_fractional = ((cycles * 100) / instrs) % 100; +} +#else /* !CONFIG_CPU_COUNTERS */ +void +kern_timeout_cycles_instrs(kern_timeout_t __unused *to, uint64_t *cycles, uint64_t *instrs) +{ + *cycles = 0; + *instrs = 0; +} + +void +kern_timeout_cpi(kern_timeout_t __unused *to, uint64_t *cpi_whole, uint64_t *cpi_fractional) +{ + *cpi_whole = 0; + *cpi_fractional = 0; +} +#endif /* CONFIG_CPU_COUNTERS */ + +__enum_closed_decl(timeout_mode_t, uint32_t, { + TIMEOUT_TELEMETRY, + TIMEOUT_PANIC +}); + +/* + * This interface is a "try panic" because we won't invoke a nested panic + * if a timeout has already happened that initiated the original panic. + */ +void +kern_timeout_try_panic(kern_timeout_type_t __unused type, uint64_t __unused payload, kern_timeout_t *to, const char *prefix, uint64_t threshold) +{ + char cpi[80]; + char duration[80]; + const uint64_t gross_duration = kern_timeout_gross_duration(to); + const uint64_t net_duration = kern_timeout_net_duration(to); + uint64_t gross_ns, net_ns, threshold_ns; + uint64_t gross_us, net_us, threshold_us; + uint64_t gross_ms, net_ms, threshold_ms; + uint64_t start_mt, end_mt; + uint64_t __unused average_freq = 0; + uint64_t __unused cpi_whole = 0; +#ifdef __arm64__ + const char __unused core_type = ml_get_current_core_type(); +#else + const char __unused core_type = '-'; +#endif /* __arm64__ */ + + /* + * We can recursively try to panic due to a timeout in the panic flow, + * so if that happens, just bail out here. + */ + if (kern_timeout_panic_initiated) { + return; + } + + absolutetime_to_nanoseconds(gross_duration, &gross_ns); + absolutetime_to_nanoseconds(net_duration, &net_ns); + absolutetime_to_nanoseconds(threshold, &threshold_ns); + kern_timeout_mach_times(to, &start_mt, &end_mt); + + cpi[0] = 0; + +#if CONFIG_CPU_COUNTERS + uint64_t cycles; + uint64_t instrs; + + /* + * We're getting these values a bit late, but getting them + * is a bit expensive, so we take the slight hit in + * accuracy for the reported values (which aren't very + * stable anyway). 
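An illustrative worked example (numbers chosen arbitrarily, not from this code) of the CPI whole/fractional split computed by kern_timeout_cpi() and by the panic path below: with cycles = 250 and instrs = 100,

	cpi_whole      = 250 / 100                 = 2
	cpi_fractional = ((250 * 100) / 100) % 100 = 50

so the report formats that sample as a CPI of 2.50.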
+ */ + kern_timeout_cycles_instrs(to, &cycles, &instrs); + if (cycles > 0 && instrs > 0) { + cpi_whole = cycles / instrs; + average_freq = cycles / (gross_ns / 1000); + } +#endif /* CONFIG_CPU_COUNTERS */ + +#if DEVELOPMENT || DEBUG + timeout_mode_t mode = timeouts_are_fatal ? TIMEOUT_PANIC : TIMEOUT_TELEMETRY; + if (mode == TIMEOUT_PANIC) { + +#if CONFIG_CPU_COUNTERS && !defined(HAS_FEAT_XS) + /* + * POLICY: if CPI > 100 and we are on a SoC that does not support + * FEAT_XS, it's likely the stall was caused by a long TLBI. This + * isn't an actionable radar condition for preemption or interrupt + * disabled timeouts, so do nothing. + */ + if ((type == KERN_TIMEOUT_PREEMPTION || type == KERN_TIMEOUT_INTERRUPT) && + cpi_whole > 100) { + return; + } +#endif /* CONFIG_CPU_COUNTERS && !HAS_FEAT_XS */ + +#if ML_IO_TIMEOUTS_ENABLED + /* + * POLICY: check the MMIO override window to see if we are still + * within it. If we are, abandon the attempt to panic, since + * the timeout is almost certainly due to a known issue causing + * a stall that got entangled with this core. We don't emit + * telemetry in this case because the MMIO overrides have their + * own telemetry mechanism. + */ + if (ml_io_check_for_mmio_overrides(start_mt)) { + return; + } +#endif /* ML_IO_TIMEOUTS_ENABLED */ + } + + if (mode == TIMEOUT_TELEMETRY) { + trap_telemetry_type_t trap_type; + switch (type) { + case KERN_TIMEOUT_PREEMPTION: + trap_type = TRAP_TELEMETRY_TYPE_PREEMPTION_TIMEOUT; + break; + case KERN_TIMEOUT_INTERRUPT: + trap_type = TRAP_TELEMETRY_TYPE_INTERRUPT_TIMEOUT; + break; + case KERN_TIMEOUT_MMIO: + trap_type = TRAP_TELEMETRY_TYPE_MMIO_TIMEOUT; + break; + case KERN_TIMEOUT_LOCK: + trap_type = TRAP_TELEMETRY_TYPE_LOCK_TIMEOUT; + break; + default: + panic("unknown timeout type\n"); + } + trap_telemetry_report_latency_violation( + trap_type, + (trap_telemetry_latency_s) { + .violation_cpi = cpi_whole, + .violation_freq = average_freq, + .violation_cpu_type = core_type, + .violation_duration = net_ns, + .violation_threshold = threshold_ns, + .violation_payload = payload + }); + return; + } +#endif /* DEVELOPMENT || DEBUG */ + + kern_timeout_panic_initiated = true; + panic_timeout = *to; + + gross_us = gross_ns / 1000; + net_us = net_ns / 1000; + threshold_us = threshold_ns / 1000; + gross_ms = gross_us / 1000; + net_ms = net_us / 1000; + threshold_ms = threshold_us / 1000; + +#if CONFIG_CPU_COUNTERS + if (cycles > 0 && instrs > 0) { + uint64_t cpi_fractional; + + cpi_fractional = ((cycles * 100) / instrs) % 100; + + snprintf(cpi, sizeof(cpi), ", freq %llu MHz, type = %c, CPI = %llu.%llu [%llu, %llu]", + average_freq, core_type, cpi_whole, cpi_fractional, cycles, instrs); + } +#endif /* CONFIG_CPU_COUNTERS */ + + if (gross_ns > net_ns) { + if (threshold_ms > 0) { + snprintf(duration, sizeof(duration), "gross %llu.%llu ms, net %llu.%llu ms >= %llu.%llu ms", + gross_ms, gross_us % 1000, net_ms, net_us % 1000, threshold_ms, threshold_us % 1000); + } else { + snprintf(duration, sizeof(duration), "gross %llu us, net %llu us >= %llu us", + gross_us, net_us, threshold_us); + } + } else { + if (threshold_ms > 0) { + snprintf(duration, sizeof(duration), "%llu.%llu ms >= %llu.%llu ms", + gross_ms, gross_us % 1000, threshold_ms, threshold_us % 1000); + } else { + snprintf(duration, sizeof(duration), "%llu us >= %llu us", + gross_us, threshold_us); + } + } + + panic_plain("%s %s (start: %llu, end: %llu)%s", prefix, duration, start_mt, end_mt, cpi); +} + diff --git a/osfmk/kern/timeout.h b/osfmk/kern/timeout.h new file mode 
100644 index 000000000..8ce34c7ba --- /dev/null +++ b/osfmk/kern/timeout.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_TIMEOUT_H_ +#define _KERN_TIMEOUT_H_ + +#include +#include + +__options_closed_decl(timeout_flags_t, uint32_t, { + TF_NONSPEC_TIMEBASE = 0x01, + TF_BACKTRACE = 0x02, +#if XNU_KERNEL_PRIVATE + TF_SAMPLE_INTERRUPT_TIME = 0x04, + TF_SAMPLE_PMC = 0x08, +#endif /* XNU_KERNEL_PRIVATE */ +}); + +__enum_decl(kern_timeout_type_t, uint32_t, { + KERN_TIMEOUT_PREEMPTION = 1, + KERN_TIMEOUT_INTERRUPT = 2, + KERN_TIMEOUT_MMIO = 3, + KERN_TIMEOUT_LOCK = 4, +}); + +extern void kern_timeout_start(kern_timeout_t *to, timeout_flags_t flags); +extern void kern_timeout_restart(kern_timeout_t *to, timeout_flags_t flags); +extern void kern_timeout_end(kern_timeout_t *to, timeout_flags_t flags); +extern void kern_timeout_override(kern_timeout_t *to); +extern void kern_timeout_try_panic(kern_timeout_type_t type, uint64_t payload, kern_timeout_t *to, + const char *prefix, uint64_t threshold); + +#if XNU_KERNEL_PRIVATE +extern void kern_timeout_cycles_instrs(kern_timeout_t *to, uint64_t *cycles, uint64_t *instrs); +extern void kern_timeout_cpi(kern_timeout_t *to, uint64_t *cpi_whole, uint64_t *cpi_fractional); +#endif /* XNU_KERNEL_PRIVATE */ + +static inline void +kern_timeout_stretch(kern_timeout_t *to, uint64_t mt_ticks) +{ + to->start_mt -= mt_ticks; +} + +static inline uint64_t +kern_timeout_start_time(kern_timeout_t *to) +{ + return to->start_mt; +} + +/* + * Return the mach time elapsed beteween calls to kern_timeout_start() and kern_timeout_end(). + */ +static inline uint64_t +kern_timeout_gross_duration(kern_timeout_t *to) +{ + if (__improbable(to->start_mt == 0 || to->end_mt < to->start_mt)) { + return 0; + } + return to->end_mt - to->start_mt; +} + +#if XNU_KERNEL_PRIVATE +/* + * Return the mach time elapsed beteween calls to kern_timeout_start() and kern_timeout_end() + * subtracting the mach time that elapsed handling interrupts. 
+ */ +static inline uint64_t +kern_timeout_net_duration(kern_timeout_t *to) +{ + uint64_t gross_duration = kern_timeout_gross_duration(to); + uint64_t int_duration = to->int_mt; + + if (__improbable(to->start_mt == 0 || gross_duration < int_duration)) { + return 0; + } + return gross_duration - int_duration; +} +#endif /* XNU_KERNEL_PRIVATE */ + +static inline void +kern_timeout_mach_times(kern_timeout_t *to, uint64_t *start_mt, uint64_t *end_mt) +{ + *start_mt = to->start_mt; + *end_mt = to->end_mt; +} + +#endif /* _KERN_TIMEOUT_H_ */ diff --git a/osfmk/kern/timeout_decl.h b/osfmk/kern/timeout_decl.h new file mode 100644 index 000000000..770235897 --- /dev/null +++ b/osfmk/kern/timeout_decl.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_TIMEOUT_DECL_H_ +#define _KERN_TIMEOUT_DECL_H_ + +#include + +/* + * The interrupt disabled timeouts mechanism requires that we include this + * header in arm/thread.h, which is why this is here and not in the timeout.h + * header. + */ + +#define TO_BT_FRAMES 3 + +typedef struct kern_timeout { + uint64_t start_mt; + uint64_t end_mt; + uint64_t int_mt; + uint64_t start_cycles; + uint64_t int_cycles; + uint64_t start_instrs; + uint64_t int_instrs; + uintptr_t bt[TO_BT_FRAMES]; +} kern_timeout_t; + +#endif /* _KERN_TIMEOUT_DECL_H_ */ diff --git a/osfmk/kern/timer_call.c b/osfmk/kern/timer_call.c index c7b27f4ad..a6e60deae 100644 --- a/osfmk/kern/timer_call.c +++ b/osfmk/kern/timer_call.c @@ -104,11 +104,12 @@ LCK_GRP_DECLARE(timer_queue_lck_grp, "timer_queue"); /* * In a similar way to the longterm queue's scan limit, the following bounds the - * amount of time spent processing regular timers. + * amount of time spent processing regular timers. This limit is also obeyed by + * thread_call_delayed_timer(). 
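A minimal sketch of how these new timeout primitives compose for a hypothetical caller; the function name, the threshold handling, and the <kern/timeout.h> include path are illustrative assumptions, not part of this patch:

#include <kern/timeout.h>   /* assumed include path for the header above */

/* Hypothetical caller: time a critical section and escalate only if the
 * net duration (interrupt time excluded) crosses a caller-chosen threshold. */
static void
example_timed_section(uint64_t threshold_abs)
{
	kern_timeout_t to;

	kern_timeout_start(&to, TF_BACKTRACE | TF_SAMPLE_INTERRUPT_TIME);
	/* ... the work being measured ... */
	kern_timeout_end(&to, TF_SAMPLE_INTERRUPT_TIME);

	if (kern_timeout_net_duration(&to) >= threshold_abs) {
		kern_timeout_try_panic(KERN_TIMEOUT_LOCK, /* payload */ 0, &to,
		    "example section held too long:", threshold_abs);
	}
}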
*/ TUNABLE_WRITEABLE(uint64_t, timer_scan_limit_us, "timer_scan_limit_us", 400); TUNABLE_WRITEABLE(uint64_t, timer_scan_interval_us, "timer_scan_interval_us", 40); -static uint64_t timer_scan_limit_abs = 0; +uint64_t timer_scan_limit_abs = 0; static uint64_t timer_scan_interval_abs = 0; /* diff --git a/osfmk/kern/timer_call.h b/osfmk/kern/timer_call.h index bf6cf1ac0..7433f39c4 100644 --- a/osfmk/kern/timer_call.h +++ b/osfmk/kern/timer_call.h @@ -214,6 +214,7 @@ enum running_timer { #if KPERF RUNNING_TIMER_KPERF, #endif /* KPERF */ + RUNNING_TIMER_PERFCONTROL, RUNNING_TIMER_MAX, }; diff --git a/osfmk/kern/trap_telemetry.c b/osfmk/kern/trap_telemetry.c index 1725d7776..858bc7544 100644 --- a/osfmk/kern/trap_telemetry.c +++ b/osfmk/kern/trap_telemetry.c @@ -40,6 +40,9 @@ #include #include #include +#if __arm64__ +#include +#endif #define TAG "[trap_telemetry] " @@ -67,6 +70,9 @@ /** Number of last events per-CPU to remember and reject. */ #define DEBOUNCE_RECORD_COUNT (2) +/** Length of the kernel_platform string (eg t8132). */ +#define KERNEL_PLATFORM_STR_LEN 12 + /** * When true, trap telemetry will not report events to CoreAnalytics. * @@ -103,6 +109,7 @@ typedef struct match_record { typedef struct rsb_entry { match_record_s record; + trap_telemetry_extra_data_u extra_data; trap_telemetry_options_s options; size_t bt_frames_count; uintptr_t bt_frames[TRAP_TELEMETRY_BT_FRAMES]; @@ -137,11 +144,24 @@ CA_EVENT(kernel_breakpoint_event, CA_EVENT(trap_telemetry_internal, CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace, + CA_STATIC_STRING(KERNEL_PLATFORM_STR_LEN), kernel_platform, CA_INT, trap_code, CA_INT, trap_offset, CA_INT, trap_type, CA_STATIC_STRING(CA_UUID_LEN), trap_uuid); +CA_EVENT(latency_violations, + CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace, + CA_STATIC_STRING(KERNEL_PLATFORM_STR_LEN), kernel_platform, + CA_STATIC_STRING(CA_UUID_LEN), uuid, + CA_INT, violation_code, + CA_INT, violation_cpi, + CA_STATIC_STRING(2), violation_cpu_type, + CA_INT, violation_duration, + CA_INT, violation_freq, + CA_INT, violation_payload, + CA_INT, violation_threshold); + /* ~* Splay tree *~ */ static int match_record_compare(match_record_s *r1, @@ -730,6 +750,16 @@ rsb_entry_submit(rsb_entry_s *rsb_e) rsb_e->bt_frames, rsb_e->bt_frames_count); +#if __arm64__ + /* + * We want the value of ARM64_SOC_NAME define as a string, so we need + * to do a two level indirection of macros to get to it. + */ +#define tostr(s) __STRINGIFY(s) + strlcpy(event->kernel_platform, tostr(ARM64_SOC_NAME), KERNEL_PLATFORM_STR_LEN); +#undef tostr +#endif + /* * Internal events report the UUID of the binary containing the * fault PC and offset of the fault PC into the executable region of @@ -754,9 +784,42 @@ rsb_entry_submit(rsb_entry_s *rsb_e) event->trap_type = (uint32_t)rsb_e->record.trap_type; event->trap_code = rsb_e->record.trap_code; + break; } + case TRAP_TELEMETRY_CA_EVENT_LATENCY: { + ca_event = CA_EVENT_ALLOCATE(latency_violations); + CA_EVENT_TYPE(latency_violations) * event = ca_event->data; + + backtrace_to_offset_bt_string( + /* buf */ event->backtrace, + /* buf_len */ TRAP_TELEMETRY_BT_STR_LEN, + rsb_e->bt_frames, + rsb_e->bt_frames_count); + +#if __arm64__ + /* + * We want the value of ARM64_SOC_NAME define as a string, so we need + * to do a two level indirection of macros to get to it. 
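The two-level indirection mentioned above is standard C preprocessor behavior; a minimal, self-contained illustration with made-up names (SOC_NAME, STR_RAW, STR_VALUE are not from this code):

#define SOC_NAME t8132
#define STR_RAW(x)   #x          /* stringizes the argument text itself          */
#define STR_VALUE(x) STR_RAW(x)  /* expands the argument first, then stringizes  */

/* STR_RAW(SOC_NAME)   -> "SOC_NAME"   (not what we want)
 * STR_VALUE(SOC_NAME) -> "t8132"      (the macro's value, as kernel_platform needs) */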
+ */ +#define tostr(s) __STRINGIFY(s) + strlcpy(event->kernel_platform, tostr(ARM64_SOC_NAME), KERNEL_PLATFORM_STR_LEN); +#undef tostr +#endif + strlcpy(event->uuid, kernel_uuid_string, CA_UUID_LEN); + (void)scnprintf(event->violation_cpu_type, 2, "%c", + rsb_e->extra_data.latency_data.violation_cpu_type); + + event->violation_code = (uint32_t)rsb_e->record.trap_type; + event->violation_cpi = rsb_e->extra_data.latency_data.violation_cpi; + event->violation_freq = rsb_e->extra_data.latency_data.violation_freq; + event->violation_duration = rsb_e->extra_data.latency_data.violation_duration; + event->violation_threshold = rsb_e->extra_data.latency_data.violation_threshold; + event->violation_payload = rsb_e->extra_data.latency_data.violation_payload; + + break; + } default: { panic("Unexpected telemetry CA event: %u\n", options.telemetry_ca_event); @@ -1059,11 +1122,12 @@ trap_telemetry_report_exception( return rsb_enqueue_if_needed(&submission_e); } -__attribute__((noinline)) -bool -trap_telemetry_report_simulated_trap( +__attribute__((always_inline)) +static bool +trap_telemetry_report_simulated_trap_impl( trap_telemetry_type_t trap_type, uint64_t trap_code, + trap_telemetry_extra_data_u *extra_data, trap_telemetry_options_s options) { if (should_ignore_trap(trap_type, trap_code, options)) { @@ -1101,6 +1165,7 @@ trap_telemetry_report_simulated_trap( trap_type, trap_code, options, + extra_data, /* fault_pc */ frames[0], /* frames */ frames + 1, /* frames_valid_count */ frames_valid_count - 1); @@ -1110,17 +1175,43 @@ trap_telemetry_report_simulated_trap( trap_type, trap_code, options, + extra_data, /* fault_pc */ (uintptr_t)__builtin_return_address(0), /* frames */ NULL, /* frames_valid_count */ 0); } } +__attribute__((noinline)) +bool +trap_telemetry_report_simulated_trap( + trap_telemetry_type_t trap_type, + uint64_t trap_code, + trap_telemetry_options_s options) +{ + return trap_telemetry_report_simulated_trap_impl(trap_type, trap_code, NULL, options); +} + +__attribute__((noinline)) +bool +trap_telemetry_report_latency_violation( + trap_telemetry_type_t trap_type, + trap_telemetry_latency_s latency_data) +{ + return trap_telemetry_report_simulated_trap_impl(trap_type, 0, + (trap_telemetry_extra_data_u *)&latency_data, + (trap_telemetry_options_s) { + .telemetry_ca_event = TRAP_TELEMETRY_CA_EVENT_LATENCY, + .report_once_per_site = false + }); +} + bool trap_telemetry_report_simulated_trap_with_backtrace( trap_telemetry_type_t trap_type, uint64_t trap_code, trap_telemetry_options_s options, + trap_telemetry_extra_data_u *extra_data, uintptr_t fault_pc, uintptr_t *frames, size_t frames_valid_count) @@ -1136,6 +1227,9 @@ trap_telemetry_report_simulated_trap_with_backtrace( rsb_entry_s submission_e = { 0 }; submission_e.record.trap_type = trap_type; submission_e.record.trap_code = trap_code; + if (extra_data != NULL) { + submission_e.extra_data = *extra_data; + } submission_e.options = options; // only copy up to TRAP_TELEMETRY_BT_FRAMES frames diff --git a/osfmk/kern/trap_telemetry.h b/osfmk/kern/trap_telemetry.h index f638411d3..0698cda47 100644 --- a/osfmk/kern/trap_telemetry.h +++ b/osfmk/kern/trap_telemetry.h @@ -45,6 +45,8 @@ __enum_decl(trap_telemetry_ca_event_t, uint8_t, { TRAP_TELEMETRY_CA_EVENT_INTERNAL = 2, + TRAP_TELEMETRY_CA_EVENT_LATENCY = 3, + /** Used for validation, keep this value last. 
*/ TRAP_TELEMETRY_CA_EVENT_COUNT, }); @@ -72,6 +74,19 @@ typedef struct { report_once_per_site:1; } trap_telemetry_options_s; +typedef struct { + uint64_t violation_cpi; + uint64_t violation_freq; + uint64_t violation_duration; + uint64_t violation_threshold; + uint64_t violation_payload; + char violation_cpu_type; +} trap_telemetry_latency_s; + +typedef union trap_telemetry_extra_data { + trap_telemetry_latency_s latency_data; +} trap_telemetry_extra_data_u; + __enum_decl(trap_telemetry_type_t, uint32_t, { /* These show up in telemetry, do not renumber */ TRAP_TELEMETRY_TYPE_KERNEL_BRK_KASAN = 0, /* KASan violation traps */ @@ -85,6 +100,13 @@ __enum_decl(trap_telemetry_type_t, uint32_t, { TRAP_TELEMETRY_TYPE_KERNEL_SOFT_ERROR = 6, TRAP_TELEMETRY_TYPE_SPTM_SOFT_ERROR = 7, + /* Latency guards violations when telemetry mode is enabled */ + TRAP_TELEMETRY_TYPE_PREEMPTION_TIMEOUT = 8, + TRAP_TELEMETRY_TYPE_INTERRUPT_TIMEOUT = 9, + TRAP_TELEMETRY_TYPE_MMIO_TIMEOUT = 10, + TRAP_TELEMETRY_TYPE_MMIO_OVERRIDE = 11, + TRAP_TELEMETRY_TYPE_LOCK_TIMEOUT = 12, + TRAP_TELEMETRY_TYPE_KERNEL_BRK_TEST = ~0u, /* Development only */ }); @@ -113,6 +135,17 @@ trap_telemetry_report_simulated_trap( uint64_t trap_code, trap_telemetry_options_s options); +/** + * Report a latency violation of the given type and parameters. + * Fault PC and backtrace will begin at the call site of this function. + * + * Returns true if the event was submitted (or duped) and false on error. + */ +extern bool +trap_telemetry_report_latency_violation( + trap_telemetry_type_t trap_type, + trap_telemetry_latency_s params); + /** * Perform a simulated trap of a given type and code, with given fault PC and * backtrace. @@ -126,6 +159,7 @@ trap_telemetry_report_simulated_trap_with_backtrace( trap_telemetry_type_t trap_type, uint64_t trap_code, trap_telemetry_options_s options, + trap_telemetry_extra_data_u *extra_data, uintptr_t fault_pc, uintptr_t *frames, size_t frames_valid_count); @@ -162,6 +196,8 @@ enum kernel_brk_trap_comment { XNU_HARD_TRAP_END = 0xBFFF, /* PTRAUTH (sleh.c) : [0xC470 ~ 0xC473] */ + PTRAUTH_TRAP_START = 0xC470, + PTRAUTH_TRAP_END = 0xC473, /* TELEMETRY : [0xFF00 ~ 0xFFFE] */ XNU_SOFT_TRAP_START = 0xFF00, @@ -214,20 +250,20 @@ typedef struct kernel_brk_descriptor { const char *(*handle_breakpoint)(void *states, uint16_t comment); } *kernel_brk_descriptor_t; -extern struct kernel_brk_descriptor brk_descriptors[] -__SECTION_START_SYM("__DATA_CONST", "__brk_desc"); +extern struct kernel_brk_descriptor kernel_brk_descriptors[] +__SECTION_START_SYM("__DATA_CONST", "__kern_brk_desc"); -extern struct kernel_brk_descriptor brk_descriptors_end[] -__SECTION_END_SYM("__DATA_CONST", "__brk_desc"); +extern struct kernel_brk_descriptor kernel_brk_descriptors_end[] +__SECTION_END_SYM("__DATA_CONST", "__kern_brk_desc"); #define KERNEL_BRK_DESCRIPTOR_DEFINE(name, ...) 
\ -__PLACE_IN_SECTION("__DATA_CONST,__brk_desc") \ +__PLACE_IN_SECTION("__DATA_CONST,__kern_brk_desc") \ static const struct kernel_brk_descriptor name = { __VA_ARGS__ }; const static inline struct kernel_brk_descriptor * -find_brk_descriptor_by_comment(uint16_t comment) +find_kernel_brk_descriptor_by_comment(uint16_t comment) { - for (kernel_brk_descriptor_t des = brk_descriptors; des < brk_descriptors_end; des++) { + for (kernel_brk_descriptor_t des = kernel_brk_descriptors; des < kernel_brk_descriptors_end; des++) { if (comment >= des->base && comment <= des->max) { return des; } @@ -241,6 +277,7 @@ find_brk_descriptor_by_comment(uint16_t comment) __enum_decl(trap_telemetry_kernel_soft_error_code_t, uint64_t, { /* Do not renumber entries -- IDs are used in telemetry */ TRAP_TELEMETRY_KERNEL_SOFT_ERROR_VM_KERNEL_MAX_ALLOC_SIZE = 0, + TRAP_TELEMETRY_KERNEL_SOFT_ERROR_RES0 = 1, }); /** diff --git a/osfmk/kern/turnstile.c b/osfmk/kern/turnstile.c index a99941c88..e1175ab49 100644 --- a/osfmk/kern/turnstile.c +++ b/osfmk/kern/turnstile.c @@ -43,6 +43,9 @@ #include #include #include +#include +#include +#include #include #include @@ -1520,6 +1523,15 @@ turnstile_update_inheritor( turnstile_update_inheritor_locked(turnstile); +#if SCHED_HYGIENE_DEBUG + /* + * Disable the timeout here until the latency of priority queue updates + * is fixed (rdar://144402635) + */ + ml_spin_debug_reset(current_thread()); + ml_irq_debug_abandon(); +#endif + waitq_unlock(&turnstile->ts_waitq); splx(spl); @@ -3297,16 +3309,18 @@ kdp_turnstile_traverse_inheritor_chain(struct turnstile *ts, uint64_t *flags, ui if (turnstile_is_send_turnstile(ts)) { ipc_port_t port = (ipc_port_t)ts->ts_proprietor; - if (port && ip_active(port) && port->ip_service_port && port->ip_splabel != NULL) { + if (port && ip_active(port) && ip_is_any_service_port(port)) { *flags = STACKSHOT_TURNSTILE_FLAGS_WITHPORT(*flags, STACKSHOT_TURNSTILE_STATUS_SENDPORT); - *isplp = (struct ipc_service_port_label *)port->ip_splabel; + *isplp = ptrauth_strip(port->ip_object.iol_service, + ptrauth_key_process_independent_data); } } if (turnstile_is_receive_turnstile(ts)) { ipc_port_t port = (ipc_port_t)ts->ts_proprietor; - if (port && ip_active(port) && port->ip_service_port && port->ip_splabel != NULL) { + if (port && ip_active(port) && ip_is_any_service_port(port)) { *flags = STACKSHOT_TURNSTILE_FLAGS_WITHPORT(*flags, STACKSHOT_TURNSTILE_STATUS_RECEIVEPORT); - *isplp = (struct ipc_service_port_label *)port->ip_splabel; + *isplp = ptrauth_strip(port->ip_object.iol_service, + ptrauth_key_process_independent_data); } } /* @@ -3372,7 +3386,7 @@ kdp_turnstile_traverse_inheritor_chain(struct turnstile *ts, uint64_t *flags, ui *flags |= STACKSHOT_TURNSTILE_STATUS_HELD_IPLOCK; return 0; } - if (port->ip_specialreply) { + if (ip_is_special_reply_port(port)) { /* try getting the pid stored in the port */ uint64_t pid_candidate = ipc_special_reply_get_pid_locked(port); diff --git a/osfmk/kern/upsi.h b/osfmk/kern/upsi.h new file mode 100644 index 000000000..ac64657fb --- /dev/null +++ b/osfmk/kern/upsi.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#if !XNU_KERNEL_PRIVATE +#error upsi.h is XNU private +#endif + +#if (DEVELOPMENT || DEBUG) +#define HAS_UPSI_FAILURE_INJECTION 1 +#endif + +#if HAS_UPSI_FAILURE_INJECTION +/* Enumeration of the injectable failure locations/stages - Must be kept in sync with iBoot's "power_boot_stages.h" + * The "stage" macros specify points where failure injection is possible + */ +__enum_decl(failure_injection_stage_t, uint64_t, { + XNU_STAGE_ARM_INIT = 0x31, + XNU_STAGE_BOOTSTRAP_START = 0x32, + XNU_STAGE_SCHEDULER_START = 0x33, + XNU_STAGE_BSD_INIT_START = 0x34, + XNU_STAGE_BSD_INIT_END = 0x35, +}); + +/* Enumeration of the injectable failure actions + * + * ACTION_WATCHDOG_TIMEOUT and ACTION_DEADLOOP look functionally equivalent. + * However they are different in the way iBoot configures the system watchdog. + * + * ACTION_WATCHDOG_TIMEOUT -> Deadloops with the system watchdog enabled + * ACTION_DEADLOOP -> Deadloops with the system watchdog disabled + * The watchdog behavior is configured by iBoot. 
Convey'd to XNU through the wdt=-1 boot-arg + */ +__enum_decl(failure_injection_action_t, uint64_t, { + INJECTION_ACTION_PANIC = 0x01, + INJECTION_ACTION_WATCHDOG_TIMEOUT = 0x02, + INJECTION_ACTION_DEADLOOP = 0x03, +}); + +extern uint64_t xnu_upsi_injection_stage; +extern uint64_t xnu_upsi_injection_action; + +void check_for_failure_injection(failure_injection_stage_t req); +#endif // HAS_UPSI_FAILURE_INJECTION diff --git a/osfmk/kern/ux_handler.c b/osfmk/kern/ux_handler.c index 731f2d96d..ddf4b1b79 100644 --- a/osfmk/kern/ux_handler.c +++ b/osfmk/kern/ux_handler.c @@ -62,6 +62,7 @@ static SECURITY_READ_ONLY_LATE(const void *) ux_handler_kobject = NULL; SECURITY_READ_ONLY_LATE(ipc_port_t) ux_handler_port = IP_NULL; IPC_KOBJECT_DEFINE(IKOT_UX_HANDLER, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_permanent = true); diff --git a/osfmk/kern/waitq.c b/osfmk/kern/waitq.c index 8ae94f670..3e1c6bf06 100644 --- a/osfmk/kern/waitq.c +++ b/osfmk/kern/waitq.c @@ -200,7 +200,7 @@ static_assert(__alignof(struct waitq) == WQ_OPAQUE_ALIGN, "waitq structure align static KALLOC_TYPE_DEFINE(waitq_sellink_zone, struct waitq_sellink, KT_PRIV_ACCT); static KALLOC_TYPE_DEFINE(waitq_link_zone, struct waitq_link, KT_PRIV_ACCT); ZONE_DEFINE_ID(ZONE_ID_SELECT_SET, "select_set", struct select_set, - ZC_SEQUESTER | ZC_NOPGZ | ZC_ZFREE_CLEARMEM); + ZC_SEQUESTER | ZC_ZFREE_CLEARMEM); static LCK_GRP_DECLARE(waitq_lck_grp, "waitq"); @@ -377,17 +377,15 @@ static SECURITY_READ_ONLY_LATE(uint32_t) g_num_waitqs = 1; #define _CAST_TO_EVENT_MASK(event) \ ((waitq_flags_t)(uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul)) -static inline uint32_t -waitq_hash(char *key, size_t length) -{ - return os_hash_jenkins(key, length) & (g_num_waitqs - 1); -} - /* return a global waitq pointer corresponding to the given event */ struct waitq * -_global_eventq(char *event, size_t event_length) +_global_eventq(event64_t event) { - return &global_waitqs[waitq_hash(event, event_length)]; + /* + * this doesn't use os_hash_kernel_pointer() because + * some clients use "numbers" here. 
+ */ + return &global_waitqs[os_hash_uint64(event) & (g_num_waitqs - 1)]; } bool @@ -621,8 +619,10 @@ waitq_wait_possible(thread_t thread) ((thread->state & TH_WAKING) == 0); } +__static_testable void waitq_bootstrap(void); + __startup_func -static void +__static_testable void waitq_bootstrap(void) { const uint32_t qsz = sizeof(struct waitq); @@ -712,19 +712,19 @@ static const struct hw_spin_policy waitq_spin_policy = { .hwsp_op_timeout = waitq_timeout_handler, }; -void +__mockable void waitq_invalidate(waitq_t waitq) { hw_lck_ticket_invalidate(&waitq.wq_q->waitq_interlock); } -bool +__mockable bool waitq_held(waitq_t wq) { return hw_lck_ticket_held(&wq.wq_q->waitq_interlock); } -void +__mockable void waitq_lock(waitq_t wq) { (void)hw_lck_ticket_lock_to(&wq.wq_q->waitq_interlock, @@ -734,7 +734,7 @@ waitq_lock(waitq_t wq) #endif } -bool +__mockable bool waitq_lock_try(waitq_t wq) { bool rc = hw_lck_ticket_lock_try(&wq.wq_q->waitq_interlock, &waitq_lck_grp); @@ -753,7 +753,7 @@ waitq_lock_reserve(waitq_t wq, uint32_t *ticket) return hw_lck_ticket_reserve(&wq.wq_q->waitq_interlock, ticket, &waitq_lck_grp); } -void +__mockable void waitq_lock_wait(waitq_t wq, uint32_t ticket) { (void)hw_lck_ticket_wait(&wq.wq_q->waitq_interlock, ticket, @@ -779,7 +779,7 @@ waitq_lock_allow_invalid(waitq_t wq) return rc == HW_LOCK_ACQUIRED; } -void +__mockable void waitq_unlock(waitq_t wq) { assert(waitq_held(wq)); @@ -1202,23 +1202,6 @@ do_waitq_select_n_locked_sets(waitq_t waitq, struct waitq_select_args *args) } if (wq_type == WQT_SELECT) { - /* - * If PGZ picked this select set, - * translate it to the real address - * - * If it is still a select set - * (the slot could have been reused), - * then keep using it for the rest of the logic. - * - * Even in the extremely unlikely case where - * the slot was reused for another select_set, - * the `wql_sellink_valid` check below will - * take care of debouncing it. But we must - * forget the original pointer we read - * so that we unlock the proper object. - */ - wqset.wqs_sel = pgz_decode_allow_invalid(wqset.wqs_sel, - ZONE_ID_SELECT_SET); if (!wqset.wqs_sel) { continue; } @@ -1512,7 +1495,7 @@ waitq_should_enable_interrupts(waitq_wakeup_flags_t flags) return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED | WAITQ_ENABLE_INTERRUPTS)) == (WAITQ_UNLOCK | WAITQ_ENABLE_INTERRUPTS); } -kern_return_t +__mockable uint32_t waitq_wakeup64_nthreads_locked( waitq_t waitq, event64_t wake_event, @@ -1523,7 +1506,7 @@ waitq_wakeup64_nthreads_locked( struct waitq_select_args args = { .event = wake_event, .result = result, - .flags = (nthreads == 1) ? flags: (flags & ~WAITQ_HANDOFF), + .flags = (nthreads == 1) ? flags : (flags & ~WAITQ_HANDOFF), .max_threads = nthreads, }; @@ -1549,11 +1532,7 @@ waitq_wakeup64_nthreads_locked( waitq_select_queue_flush(waitq, &args); } - if (args.nthreads > 0) { - return KERN_SUCCESS; - } - - return KERN_NOT_WAITING; + return args.nthreads; } kern_return_t @@ -1563,7 +1542,11 @@ waitq_wakeup64_all_locked( wait_result_t result, waitq_wakeup_flags_t flags) { - return waitq_wakeup64_nthreads_locked(waitq, wake_event, result, flags, UINT32_MAX); + uint32_t count; + + count = waitq_wakeup64_nthreads_locked(waitq, wake_event, result, + flags, UINT32_MAX); + return count ? 
KERN_SUCCESS : KERN_NOT_WAITING; } kern_return_t @@ -1573,19 +1556,22 @@ waitq_wakeup64_one_locked( wait_result_t result, waitq_wakeup_flags_t flags) { - return waitq_wakeup64_nthreads_locked(waitq, wake_event, result, flags, 1); + uint32_t count; + + count = waitq_wakeup64_nthreads_locked(waitq, wake_event, result, + flags, 1); + return count ? KERN_SUCCESS : KERN_NOT_WAITING; } -thread_t +__mockable thread_t waitq_wakeup64_identify_locked( waitq_t waitq, event64_t wake_event, - wait_result_t result, waitq_wakeup_flags_t flags) { struct waitq_select_args args = { .event = wake_event, - .result = result, + .result = THREAD_AWAKENED, /* this won't be used */ .flags = flags, .max_threads = 1, .is_identified = true, @@ -1617,7 +1603,7 @@ waitq_wakeup64_identify_locked( return THREAD_NULL; } -void +__mockable void waitq_resume_identified_thread( waitq_t waitq, thread_t thread, @@ -2331,7 +2317,7 @@ waitq_assert_wait64_leeway(struct waitq *waitq, return ret; } -kern_return_t +uint32_t waitq_wakeup64_nthreads( waitq_t waitq, event64_t wake_event, @@ -2361,7 +2347,11 @@ waitq_wakeup64_all( wait_result_t result, waitq_wakeup_flags_t flags) { - return waitq_wakeup64_nthreads(waitq, wake_event, result, flags, UINT32_MAX); + uint32_t count; + + count = waitq_wakeup64_nthreads(waitq, wake_event, result, + flags, UINT32_MAX); + return count ? KERN_SUCCESS : KERN_NOT_WAITING; } kern_return_t @@ -2371,7 +2361,10 @@ waitq_wakeup64_one( wait_result_t result, waitq_wakeup_flags_t flags) { - return waitq_wakeup64_nthreads(waitq, wake_event, result, flags, 1); + uint32_t count; + + count = waitq_wakeup64_nthreads(waitq, wake_event, result, flags, 1); + return count ? KERN_SUCCESS : KERN_NOT_WAITING; } kern_return_t @@ -2413,7 +2406,7 @@ waitq_wakeup64_identify( waitq_lock(waitq); thread_t thread = waitq_wakeup64_identify_locked(waitq, wake_event, - result, flags | waitq_flags_splx(spl) | WAITQ_UNLOCK); + flags | waitq_flags_splx(spl) | WAITQ_UNLOCK); /* waitq is unlocked, thread is not go-ed yet */ /* preemption disabled if thread non-null */ /* splx is handled */ @@ -2433,6 +2426,7 @@ waitq_wakeup64_identify( #pragma mark tests #if DEBUG || DEVELOPMENT +#include #include #include @@ -2503,6 +2497,8 @@ wqt_wqset_create(void) struct waitq_set *wqset; wqset = &ipc_pset_alloc_special(ipc_space_kernel)->ips_wqset; + waitq_unlock(wqset); + printf("[WQ]: created waitq set %p\n", wqset); return wqset; } diff --git a/osfmk/kern/waitq.h b/osfmk/kern/waitq.h index 9278d7bbb..5379f3ddb 100644 --- a/osfmk/kern/waitq.h +++ b/osfmk/kern/waitq.h @@ -55,7 +55,7 @@ __BEGIN_DECLS __ASSUME_PTR_ABI_SINGLE_BEGIN -#pragma GCC visibility push(hidden) +__exported_push_hidden /*! * @enum waitq_wakeup_flags_t @@ -395,8 +395,8 @@ waitq_valid(waitq_t waitq) /* * global waitqs */ -extern struct waitq *_global_eventq(char *event, size_t event_length); -#define global_eventq(event) _global_eventq((char *)&(event), sizeof(event)) +extern struct waitq *_global_eventq(event64_t event) __pure2; +#define global_eventq(event) _global_eventq(CAST_EVENT64_T(event)) static inline waitq_wakeup_flags_t waitq_flags_splx(spl_t spl_level) @@ -585,8 +585,10 @@ extern kern_return_t waitq_wakeup64_one( * to TURNSTILE_INHERITOR_NULL if it is a turnstile wait queue. 
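A sketch of how a caller outside this patch might keep the old kern_return_t convention on top of the count-returning wakeup API shown above; the wrapper name is hypothetical and WAITQ_WAKEUP_DEFAULT is assumed to be the no-op flag value:

static kern_return_t
example_wakeup_all_waiters(struct waitq *wq, event64_t ev)
{
	uint32_t woken;

	/* the unlocked entry point now reports how many threads it woke */
	woken = waitq_wakeup64_nthreads(wq, ev, THREAD_AWAKENED,
	    WAITQ_WAKEUP_DEFAULT, UINT32_MAX);

	return woken > 0 ? KERN_SUCCESS : KERN_NOT_WAITING;
}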
* * @c waitq must be unlocked + * + * @returns how many threads have been woken up */ -extern kern_return_t waitq_wakeup64_nthreads( +extern uint32_t waitq_wakeup64_nthreads( waitq_t waitq, event64_t wake_event, wait_result_t result, @@ -739,8 +741,10 @@ extern kern_return_t waitq_wakeup64_all_locked( * @c waitq must be locked. * * May temporarily disable and re-enable interrupts. + * + * @returns how many threads have been woken up */ -extern kern_return_t waitq_wakeup64_nthreads_locked( +extern uint32_t waitq_wakeup64_nthreads_locked( waitq_t waitq, event64_t wake_event, wait_result_t result, @@ -778,7 +782,6 @@ extern kern_return_t waitq_wakeup64_one_locked( extern thread_t waitq_wakeup64_identify_locked( waitq_t waitq, event64_t wake_event, - wait_result_t result, waitq_wakeup_flags_t flags); /** @@ -1115,7 +1118,7 @@ extern void select_waitq_wakeup_and_deinit( #endif /* XNU_KERNEL_PRIVATE */ -#pragma GCC visibility pop +__exported_pop __ASSUME_PTR_ABI_SINGLE_END __END_DECLS diff --git a/osfmk/kern/work_interval.c b/osfmk/kern/work_interval.c index 5a144b3ee..3829f5fb5 100644 --- a/osfmk/kern/work_interval.c +++ b/osfmk/kern/work_interval.c @@ -47,8 +47,6 @@ #include #include -#include - /* * With the introduction of auto-join work intervals, it is possible * to change the work interval (and related thread group) of a thread in a @@ -75,6 +73,7 @@ static kern_return_t thread_set_work_interval(thread_t, struct work_interval *, static void work_interval_port_no_senders(ipc_port_t, mach_port_mscount_t); IPC_KOBJECT_DEFINE(IKOT_WORK_INTERVAL, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = work_interval_port_no_senders); @@ -1222,9 +1221,8 @@ kern_work_interval_create(thread_t thread, mach_port_name_t name = MACH_PORT_NULL; /* work_interval has a +1 ref, moves to the port */ - work_interval->wi_port = ipc_kobject_alloc_port( - (ipc_kobject_t)work_interval, IKOT_WORK_INTERVAL, - IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); + work_interval->wi_port = ipc_kobject_alloc_port(work_interval, + IKOT_WORK_INTERVAL, IPC_KOBJECT_ALLOC_MAKE_SEND); name = ipc_port_copyout_send(work_interval->wi_port, current_space()); @@ -1412,6 +1410,10 @@ kern_work_interval_set_workload_id(mach_port_name_t port_name, wlida_flags = WORK_INTERVAL_WORKLOAD_ID_RT_ALLOWED; } + if (workload_id_args->wlida_flags & WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED) { + wlida_flags |= WORK_INTERVAL_WORKLOAD_ID_COMPLEXITY_ALLOWED; + } + /* * If the workload config wasn't even loaded then fallback to * older behaviour where the new thread group gets the default diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index 0529df8e5..0d79c2dd5 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -89,11 +89,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include @@ -113,6 +115,7 @@ #include /* ml_cpu_get_info */ #include +#include #include #include @@ -124,6 +127,7 @@ #include #include #include +#include #include #include @@ -134,10 +138,6 @@ #define z_debug_assert(expr) (void)(expr) #endif -#if CONFIG_PROB_GZALLOC && CONFIG_SPTM -#error This is not a supported configuration -#endif - /* Returns pid of the task with the largest number of VM map entries. 
*/ extern pid_t find_largest_process_vm_map_entries(void); @@ -175,10 +175,7 @@ __enum_closed_decl(zm_len_t, uint16_t, { ZM_CHUNK_LEN_MAX = 0x8, /* PGZ magical values */ - ZM_PGZ_FREE = 0x0, - ZM_PGZ_ALLOCATED = 0xa, /* [a]llocated */ ZM_PGZ_GUARD = 0xb, /* oo[b] */ - ZM_PGZ_DOUBLE_FREE = 0xd, /* [d]ouble_free */ /* secondary page markers */ ZM_SECONDARY_PAGE = 0xe, @@ -220,13 +217,6 @@ struct zone_page_metadata { * For those, zm_page_index holds the index of that page * in the run, and zm_subchunk_len the remaining length * within the chunk. - * - * Metadata used for PGZ pages can have 3 values: - * - ZM_PGZ_FREE: slot is free - * - ZM_PGZ_ALLOCATED: slot holds an allocated element - * at offset (zm_pgz_orig_addr & PAGE_MASK) - * - ZM_PGZ_DOUBLE_FREE: slot detected a double free - * (will panic). */ zm_len_t zm_chunk_len : 4; }; @@ -252,8 +242,6 @@ struct zone_page_metadata { zone_pva_t zm_page_next; zone_pva_t zm_page_prev; }; - vm_offset_t zm_pgz_orig_addr; - struct zone_page_metadata *zm_pgz_slot_next; }; }; static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing"); @@ -391,8 +379,6 @@ __security_const_late struct { struct mach_vm_range zi_meta_range; /* debugging only */ struct mach_vm_range zi_bits_range; /* bits buddy allocator */ struct mach_vm_range zi_xtra_range; /* vm tracking metadata */ - struct mach_vm_range zi_pgz_range; - struct zone_page_metadata *zi_pgz_meta; /* * The metadata lives within the zi_meta_range address range. @@ -616,8 +602,13 @@ static int zalloc_simulate_vm_pressure; * before auto-trim kicks in for empty buckets. * * zc_free_batch_size - * The size of batches of frees/reclaim that can be done keeping - * the zone lock held (and preemption disabled). + * The size of batches of frees/reclaim that can be done before we + * check if we have kept the zone lock held (and preemption disabled) + * for too long. + * + * zc_free_batch_timeout + * The number of mach ticks that may elapse before we will drop and + * reaquire the zone lock. */ Z_TUNABLE(uint16_t, zc_mag_size, 8); static Z_TUNABLE(uint32_t, zc_enable_level, 10); @@ -626,7 +617,8 @@ static Z_TUNABLE(uint32_t, zc_shrink_level, Z_WMA_UNIT / 2); static Z_TUNABLE(uint32_t, zc_pcpu_max, 128 << 10); static Z_TUNABLE(uint32_t, zc_autotrim_size, 16 << 10); static Z_TUNABLE(uint32_t, zc_autotrim_buckets, 8); -static Z_TUNABLE(uint32_t, zc_free_batch_size, 128); +static Z_TUNABLE(uint32_t, zc_free_batch_size, 64); +static Z_TUNABLE(uint64_t, zc_free_batch_timeout, 9600); // 400us static SECURITY_READ_ONLY_LATE(size_t) zone_pages_wired_max; static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT]; @@ -655,83 +647,6 @@ static struct zone_bool_gen { uint32_t zbg_entropy[ZONE_ENTROPY_CNT]; } zone_bool_gen[MAX_CPUS]; -#if CONFIG_PROB_GZALLOC -/* - * Probabilistic gzalloc - * ===================== - * - * - * Probabilistic guard zalloc samples allocations and will protect them by - * double-mapping the page holding them and returning the secondary virtual - * address to its callers. - * - * Its data structures are lazily allocated if the `pgz` or `pgz1` boot-args - * are set. - * - * - * Unlike GZalloc, PGZ uses a fixed amount of memory, and is compatible with - * most zalloc/kalloc features: - * - zone_require is functional - * - zone caching or zone tagging is compatible - * - non-blocking allocation work (they will always return NULL with gzalloc). 
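An illustrative shape of a reclaim loop honoring the two batching tunables described above; the element-free step, the lock helpers, the function-style tunable accessors, and the loop structure are assumptions for illustration rather than the actual zalloc implementation:

static void
example_zone_reclaim(zone_t z, vm_offset_t *elems, uint32_t count)
{
	uint64_t deadline = ml_get_timebase() + zc_free_batch_timeout();

	zone_lock(z);                            /* assumed lock/unlock helpers */
	for (uint32_t i = 0; i < count; i++) {
		/* ... return elems[i] to the zone's freelist ... */
		if (i != 0 && (i % zc_free_batch_size()) == 0 &&
		    ml_get_timebase() > deadline) {
			zone_unlock(z);          /* give pending preemption a chance */
			zone_lock(z);
			deadline = ml_get_timebase() + zc_free_batch_timeout();
		}
	}
	zone_unlock(z);
}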
- * - * PGZ limitations: - * - VA sequestering isn't respected, as the slots (which are in limited - * quantity) will be reused for any type, however the PGZ quarantine - * somewhat mitigates the impact. - * - zones with elements larger than a page cannot be protected. - * - * - * Tunables: - * -------- - * - * pgz=1: - * Turn on probabilistic guard malloc for all zones - * - * (default on for DEVELOPMENT, off for RELEASE, or if pgz1... are specified) - * - * pgz_sample_rate=0 to 2^31 - * average sample rate between two guarded allocations. - * 0 means every allocation. - * - * The default is a random number between 1000 and 10,000 - * - * pgz_slots - * how many allocations to protect. - * - * Each costs: - * - a PTE in the pmap (when allocated) - * - 2 zone page meta's (every other page is a "guard" one, 32B total) - * - 64 bytes per backtraces. - * On LP64 this is <16K per 100 slots. - * - * The default is ~200 slots per G of physical ram (32k / G) - * - * TODO: - * - try harder to allocate elements at the "end" to catch OOB more reliably. - * - * pgz_quarantine - * how many slots should be free at any given time. - * - * PGZ will round robin through free slots to be reused, but free slots are - * important to detect use-after-free by acting as a quarantine. - * - * By default, PGZ will keep 33% of the slots around at all time. - * - * pgz1=, pgz2=, ..., pgzn=... - * Specific zones for which to enable probabilistic guard malloc. - * There must be no numbering gap (names after the gap will be ignored). - */ -#if DEBUG || DEVELOPMENT -static TUNABLE(bool, pgz_all, "pgz", true); -#else -static TUNABLE(bool, pgz_all, "pgz", false); -#endif -static TUNABLE(uint32_t, pgz_sample_rate, "pgz_sample_rate", 0); -static TUNABLE(uint32_t, pgz_slots, "pgz_slots", UINT32_MAX); -static TUNABLE(uint32_t, pgz_quarantine, "pgz_quarantine", 0); -#endif /* CONFIG_PROB_GZALLOC */ - static zone_t zone_find_largest(uint64_t *zone_size); #endif /* !ZALLOC_TEST */ @@ -880,8 +795,8 @@ zone_spans_ro_va(vm_offset_t addr_start, vm_offset_t addr_end) #define from_range(r, addr, size) \ __builtin_choose_expr(__builtin_constant_p(size) ? (size) == 1 : 0, \ - mach_vm_range_contains(r, (mach_vm_offset_t)(addr)), \ - mach_vm_range_contains(r, (mach_vm_offset_t)(addr), size)) + mach_vm_range_contains(r, vm_memtag_canonicalize_kernel((mach_vm_offset_t)(addr))), \ + mach_vm_range_contains(r, vm_memtag_canonicalize_kernel((mach_vm_offset_t)(addr)), size)) #define from_ro_map(addr, size) \ from_range(&zone_info.zi_ro_range, addr, size) @@ -1305,12 +1220,6 @@ zone_element_bounds_check(vm_address_t addr, vm_size_t len) return; } -#if CONFIG_PROB_GZALLOC - if (__improbable(pgz_owned(addr))) { - meta = zone_meta_from_addr(addr); - addr = trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK); - } -#endif /* CONFIG_PROB_GZALLOC */ meta = zone_meta_from_addr(addr); zone = zone_by_id(meta->zm_index); @@ -1465,7 +1374,7 @@ zone_id_require_panic(zone_id_t zid, void *addr) * unmapped from the zone, or the pointer contains an uninitialized value that * does not belong to any zone. 
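A small sketch of the zone_require()/zone_id_require() contract described above, with a hypothetical object type and zone id (my_obj and my_zid are not from this code):

struct my_obj {
	int field;
};

static void
example_validate_obj(zone_id_t my_zid, struct my_obj *obj)
{
	/* panics if obj is not a live element of the expected zone */
	zone_id_require(my_zid, sizeof(struct my_obj), obj);
}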
*/ -void +__mockable void zone_require(zone_t zone, void *addr) { vm_size_t esize = zone_elem_inner_size(zone); @@ -1477,7 +1386,7 @@ zone_require(zone_t zone, void *addr) zone_require_panic(zone, addr); } -void +__mockable void zone_id_require(zone_id_t zid, vm_size_t esize, void *addr) { if (from_zone_map(addr, esize) && zid == zone_index_from_ptr(addr)) { @@ -1532,11 +1441,14 @@ zone_kmem_suballoc( vm_map_t *new_map) { struct mach_vm_range r; - +#ifndef __BUILDING_XNU_LIB_UNITTEST__ + /* Don't create the zalloc submap, unit-test mock all zalloc functionality */ *new_map = kmem_suballoc(kernel_map, &addr, size, VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST, flags, KMS_PERMANENT | KMS_NOFAIL | KMS_NOSOFTLIMIT, tag).kmr_submap; - +#else +#pragma unused(flags, tag, new_map) +#endif r.min_address = addr; r.max_address = addr + size; return r; @@ -2797,7 +2709,7 @@ zone_magazine_free_list(struct zone_depot *zd) zd->zd_empty = 0; } -void +__mockable void zone_enable_caching(zone_t zone) { size_t size_per_mag = zone_elem_inner_size(zone) * zc_mag_size(); @@ -2871,7 +2783,7 @@ zpercpu_count(void) return zpercpu_early_count; } -#if ZSECURITY_CONFIG(SAD_FENG_SHUI) || CONFIG_PROB_GZALLOC +#if ZSECURITY_CONFIG(SAD_FENG_SHUI) /* * Returns a random number of a given bit-width. * @@ -2929,8 +2841,8 @@ zalloc_random_uniform32(uint32_t bound_min, uint32_t bound_max) return bound_min + (uint32_t)(zalloc_random_mask64(64) % delta); } -#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) || CONFIG_PROB_GZALLOC */ -#if ZALLOC_ENABLE_LOGGING || CONFIG_PROB_GZALLOC +#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ +#if ZALLOC_ENABLE_LOGGING /* * Track all kalloc zones of specified size for zlog name * kalloc.type. or kalloc.type.var. or kalloc. @@ -3825,7 +3737,8 @@ zone_kma_flags(zone_t z, zone_security_flags_t zsflags, zalloc_flags_t flags) if (zsflags.z_kheap_id == KHEAP_ID_DATA_BUFFERS) { kmaflags |= KMA_DATA; - } else if (zsflags.z_kheap_id == Z_SUBMAP_IDX_DATA) { + } else if ((zsflags.z_kheap_id == KHEAP_ID_DATA_SHARED) || + (zsflags.z_submap_idx == Z_SUBMAP_IDX_DATA)) { /* * assume zones which are manually in the data heap, * like mbufs, are going to be shared somehow. 
@@ -3879,18 +3792,15 @@ zone_remove_wired_pages(zone_t z, uint32_t pages) } #if ZSECURITY_CONFIG(ZONE_TAGGING) -static inline caddr_t + +static inline void zone_tag_element(zone_t zone, caddr_t addr, vm_size_t elem_size) { - addr = vm_memtag_generate_and_store_tag(addr, elem_size); - if (zone->z_percpu) { zpercpu_foreach_cpu(index) { vm_memtag_store_tag(addr + ptoa(index), elem_size); } } - - return addr; } static inline caddr_t @@ -3900,7 +3810,10 @@ zone_tag_free_element(zone_t zone, caddr_t addr, vm_size_t elem_size) return addr; } - return zone_tag_element(zone, addr, elem_size); + addr = vm_memtag_generate_and_store_tag(addr, elem_size); + zone_tag_element(zone, addr, elem_size); + + return addr; } static inline void @@ -3915,10 +3828,12 @@ zcram_memtag_init(zone_t zone, vm_offset_t base, uint32_t start, uint32_t end) vm_size_t elem_size = zone_elem_outer_size(zone); vm_size_t oob_offs = zone_elem_outer_offs(zone); + for (uint32_t i = start; i < end; i++) { caddr_t elem_addr = (caddr_t)(base + oob_offs + i * elem_size); - (void)zone_tag_element(zone, elem_addr, elem_size); + elem_addr = vm_memtag_generate_and_store_tag(elem_addr, elem_size); + zone_tag_element(zone, elem_addr, elem_size); } } #else /* ZSECURITY_CONFIG(ZONE_TAGGING) */ @@ -4155,13 +4070,22 @@ zone_submap_alloc_sequestered_va(zone_security_flags_t zsflags, uint32_t pages, vm_map_entry_t first, last; vm_map_offset_t addr; + vmlp_api_start(ZONE_SUBMAP_ALLOC_SEQUESTERED_VA); + vm_map_lock(map); first = vm_map_first_entry(map); last = vm_map_last_entry(map); + if (zsflags.z_submap_from_end) { + vmlp_range_event(map, last->vme_start - size, size); + } else { + vmlp_range_event(map, first->vme_end, size); + } + if (first->vme_end + size > last->vme_start) { vm_map_unlock(map); + vmlp_api_end(ZONE_SUBMAP_ALLOC_SEQUESTERED_VA, KERN_NO_SPACE); return KERN_NO_SPACE; } @@ -4178,6 +4102,7 @@ zone_submap_alloc_sequestered_va(zone_security_flags_t zsflags, uint32_t pages, vm_map_unlock(map); *addrp = addr; + vmlp_api_end(ZONE_SUBMAP_ALLOC_SEQUESTERED_VA, KERN_SUCCESS); return KERN_SUCCESS; } @@ -4395,7 +4320,7 @@ zone_allocate_va_locked(zone_t z, zalloc_flags_t flags) #endif /* !ZSECURITY_CONFIG(SAD_FENG_SHUI) */ /* - * Zones that are suceptible to OOB (kalloc, ZC_PGZ_USE_GUARDS), + * For zones that are suceptible to OOB, * guards might be added after each chunk. 
* * Those guard pages are marked with the ZM_PGZ_GUARD @@ -4557,8 +4482,11 @@ static inline void ZONE_TRACE_VM_KERN_REQUEST_END(uint32_t pages) { task_t task = current_task_early(); - if (pages && task) { - counter_add(&task->pages_grabbed_kern, pages); + if (pages) { + if (task) { + ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, pages); + } + counter_add(&vm_page_grab_count_kern, pages); } VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END, pages, 0, 0, 0); @@ -4877,8 +4805,10 @@ zone_expand_locked(zone_t z, zalloc_flags_t flags) ZONE_TRACE_VM_KERN_REQUEST_START(ptoa(z->z_chunk_pages - cur_pages)); while (pages < z->z_chunk_pages - cur_pages) { - uint_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE; - vm_page_t m = vm_page_grab_options(grab_options); + vm_grab_options_t grab_options = VM_PAGE_GRAB_NOPAGEWAIT; + vm_page_t m; + + m = vm_page_grab_options(grab_options); if (m) { pages++; @@ -5148,616 +5078,6 @@ kill_process_in_largest_zone(void) return largest_zone; } -#endif /* !ZALLOC_TEST */ -#pragma mark probabilistic gzalloc -#if !ZALLOC_TEST -#if CONFIG_PROB_GZALLOC - -extern uint32_t random(void); -struct pgz_backtrace { - uint32_t pgz_depth; - int32_t pgz_bt[MAX_ZTRACE_DEPTH]; -}; - -static int32_t PERCPU_DATA(pgz_sample_counter); -static SECURITY_READ_ONLY_LATE(struct pgz_backtrace *) pgz_backtraces; -static uint32_t pgz_uses; /* number of zones using PGZ */ -static int32_t pgz_slot_avail; -#if OS_ATOMIC_HAS_LLSC -struct zone_page_metadata *pgz_slot_head; -#else -static struct pgz_slot_head { - uint32_t psh_count; - uint32_t psh_slot; -} pgz_slot_head; -#endif -struct zone_page_metadata *pgz_slot_tail; -static SECURITY_READ_ONLY_LATE(vm_map_t) pgz_submap; - -static struct zone_page_metadata * -pgz_meta_raw(uint32_t index) -{ - return VM_FAR_ADD_PTR_UNBOUNDED(zone_info.zi_pgz_meta, index); -} - -static struct zone_page_metadata * -pgz_meta(uint32_t index) -{ - return pgz_meta_raw(2 * index + 1); -} - -static struct pgz_backtrace * -pgz_bt(uint32_t slot, bool free) -{ - /* - * While we could use a bounds checked variant, slot is generally - * trustworthy and so it isn't necessary. - */ - return VM_FAR_ADD_PTR_UNBOUNDED(pgz_backtraces, 2 * slot + free); -} - -static void -pgz_backtrace(struct pgz_backtrace *bt, void *fp) -{ - struct backtrace_control ctl = { - .btc_frame_addr = (uintptr_t)fp, - }; - - bt->pgz_depth = (uint32_t)backtrace_packed(BTP_KERN_OFFSET_32, - (uint8_t *)bt->pgz_bt, sizeof(bt->pgz_bt), &ctl, NULL) / 4; -} - -static uint32_t -pgz_slot(vm_offset_t addr) -{ - return (uint32_t)((addr - zone_info.zi_pgz_range.min_address) >> (PAGE_SHIFT + 1)); -} - -static vm_offset_t -pgz_addr(uint32_t slot) -{ - return zone_info.zi_pgz_range.min_address + ptoa(2 * slot + 1); -} - -static bool -pgz_sample(vm_offset_t addr, vm_size_t esize) -{ - int32_t *counterp, cnt; - - if (zone_addr_size_crosses_page(addr, esize)) { - return false; - } - - /* - * Note: accessing pgz_sample_counter is racy but this is - * kind of acceptable given that this is not - * a security load bearing feature. - */ - - counterp = PERCPU_GET(pgz_sample_counter); - cnt = *counterp; - if (__probable(cnt > 0)) { - *counterp = cnt - 1; - return false; - } - - if (pgz_slot_avail <= 0) { - return false; - } - - /* - * zalloc_random_uniform() might block, so when preemption is disabled, - * set the counter to `-1` which will cause the next allocation - * that can block to generate a new random value. - * - * No allocation on this CPU will sample until then. 
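The pgz_sample() routine being deleted above is a per-CPU countdown sampler: decrement a counter on every allocation, and only when it reaches zero take a sample and re-arm the counter with a uniform draw in [0, 2*rate), which gives an average sampling period of `rate`. A stand-alone sketch of that technique follows, with thread-local state standing in for per-CPU data and arc4random_uniform() standing in for zalloc_random_uniform32().

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static _Thread_local int32_t sample_counter;   /* per-CPU counter in the kernel */

static int
should_sample(uint32_t rate)
{
	if (sample_counter > 0) {
		sample_counter--;
		return 0;
	}
	/* Re-arm with mean `rate`; this event itself is sampled. */
	sample_counter = (int32_t)arc4random_uniform(2 * rate);
	return 1;
}

int
main(void)
{
	uint32_t hits = 0, events = 1000000, rate = 1000;

	for (uint32_t i = 0; i < events; i++) {
		hits += (uint32_t)should_sample(rate);
	}
	printf("sampled %u of %u events (expected ~%u)\n",
	    hits, events, events / rate);
	return 0;
}
```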
- */ - if (get_preemption_level()) { - *counterp = -1; - } else { - *counterp = zalloc_random_uniform32(0, 2 * pgz_sample_rate); - } - - return cnt == 0; -} - -static inline bool -pgz_slot_alloc(uint32_t *slot) -{ - struct zone_page_metadata *m; - uint32_t tries = 100; - - disable_preemption(); - -#if OS_ATOMIC_USE_LLSC - int32_t ov, nv; - os_atomic_rmw_loop(&pgz_slot_avail, ov, nv, relaxed, { - if (__improbable(ov <= 0)) { - os_atomic_rmw_loop_give_up({ - enable_preemption(); - return false; - }); - } - nv = ov - 1; - }); -#else - if (__improbable(os_atomic_dec_orig(&pgz_slot_avail, relaxed) <= 0)) { - os_atomic_inc(&pgz_slot_avail, relaxed); - enable_preemption(); - return false; - } -#endif - -again: - if (__improbable(tries-- == 0)) { - /* - * Too much contention, - * extremely unlikely but do not stay stuck. - */ - os_atomic_inc(&pgz_slot_avail, relaxed); - enable_preemption(); - return false; - } - -#if OS_ATOMIC_HAS_LLSC - uint32_t castries = 20; - do { - if (__improbable(castries-- == 0)) { - /* - * rdar://115922110 On many many cores devices, - * this can fail for a very long time. - */ - goto again; - } - - m = os_atomic_load_exclusive(&pgz_slot_head, dependency); - if (__improbable(m->zm_pgz_slot_next == NULL)) { - /* - * Either we are waiting for an enqueuer (unlikely) - * or we are competing with another core and - * are looking at a popped element. - */ - os_atomic_clear_exclusive(); - goto again; - } - } while (!os_atomic_store_exclusive(&pgz_slot_head, - m->zm_pgz_slot_next, relaxed)); -#else - struct zone_page_metadata *base = zone_info.zi_pgz_meta; - struct pgz_slot_head ov, nv; - os_atomic_rmw_loop(&pgz_slot_head, ov, nv, dependency, { - m = pgz_meta_raw(ov.psh_slot * 2); - if (__improbable(m->zm_pgz_slot_next == NULL)) { - /* - * Either we are waiting for an enqueuer (unlikely) - * or we are competing with another core and - * are looking at a popped element. - */ - os_atomic_rmw_loop_give_up(goto again); - } - nv.psh_count = ov.psh_count + 1; - nv.psh_slot = (uint32_t)((m->zm_pgz_slot_next - base) / 2); - }); -#endif - - enable_preemption(); - - m->zm_pgz_slot_next = NULL; - *slot = (uint32_t)((m - zone_info.zi_pgz_meta) / 2); - return true; -} - -static inline bool -pgz_slot_free(uint32_t slot) -{ - struct zone_page_metadata *m = pgz_meta_raw(2 * slot); - struct zone_page_metadata *t; - - disable_preemption(); - t = os_atomic_xchg(&pgz_slot_tail, m, relaxed); - os_atomic_store(&t->zm_pgz_slot_next, m, release); - os_atomic_inc(&pgz_slot_avail, relaxed); - enable_preemption(); - - return true; -} - -/*! - * @function pgz_protect() - * - * @brief - * Try to protect an allocation with PGZ. - * - * @param zone The zone the allocation was made against. - * @param addr An allocated element address to protect. - * @param fp The caller frame pointer (for the backtrace). - * @returns The new address for the element, or @c addr. - */ -__attribute__((noinline)) -static vm_offset_t -pgz_protect(zone_t zone, vm_offset_t addr, void *fp) -{ - kern_return_t kr; - uint32_t slot; - uint_t flags = 0; - - if (!pgz_slot_alloc(&slot)) { - return addr; - } - - /* - * Try to double-map the page (may fail if Z_NOWAIT). - * we will always find a PA because pgz_init() pre-expanded the pmap. - */ - pmap_paddr_t pa = kvtophys(trunc_page(addr)); - vm_offset_t new_addr = pgz_addr(slot); - kr = pmap_enter_options_addr(kernel_pmap, new_addr, pa, - VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, flags, TRUE, - get_preemption_level() ? 
(PMAP_OPTIONS_NOWAIT | PMAP_OPTIONS_NOPREEMPT) : 0, - NULL, PMAP_MAPPING_TYPE_INFER); - - if (__improbable(kr != KERN_SUCCESS)) { - pgz_slot_free(slot); - return addr; - } - - struct zone_page_metadata tmp = { - .zm_chunk_len = ZM_PGZ_ALLOCATED, - .zm_index = zone_index(zone), - }; - struct zone_page_metadata *meta = pgz_meta(slot); - - os_atomic_store(&meta->zm_bits, tmp.zm_bits, relaxed); - os_atomic_store(&meta->zm_pgz_orig_addr, addr, relaxed); - pgz_backtrace(pgz_bt(slot, false), fp); - - return new_addr + (addr & PAGE_MASK); -} - -/*! - * @function pgz_unprotect() - * - * @brief - * Release a PGZ slot and returns the original address of a freed element. - * - * @param addr A PGZ protected element address. - * @param fp The caller frame pointer (for the backtrace). - * @returns The non protected address for the element - * that was passed to @c pgz_protect(). - */ -__attribute__((noinline)) -static vm_offset_t -pgz_unprotect(vm_offset_t addr, void *fp) -{ - struct zone_page_metadata *meta; - struct zone_page_metadata tmp; - uint32_t slot; - - slot = pgz_slot(addr); - meta = zone_meta_from_addr(addr); - tmp = *meta; - if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) { - goto double_free; - } - - pmap_remove_options(kernel_pmap, vm_memtag_canonicalize_kernel(trunc_page(addr)), - vm_memtag_canonicalize_kernel(trunc_page(addr) + PAGE_SIZE), - PMAP_OPTIONS_REMOVE | PMAP_OPTIONS_NOPREEMPT); - - pgz_backtrace(pgz_bt(slot, true), fp); - - tmp.zm_chunk_len = ZM_PGZ_FREE; - tmp.zm_bits = os_atomic_xchg(&meta->zm_bits, tmp.zm_bits, relaxed); - if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) { - goto double_free; - } - - pgz_slot_free(slot); - return tmp.zm_pgz_orig_addr; - -double_free: - panic_fault_address = addr; - meta->zm_chunk_len = ZM_PGZ_DOUBLE_FREE; - panic("probabilistic gzalloc double free: %p", (void *)addr); -} - -bool -pgz_owned(mach_vm_address_t addr) -{ - return mach_vm_range_contains(&zone_info.zi_pgz_range, vm_memtag_canonicalize_kernel(addr)); -} - - -__attribute__((always_inline)) -vm_offset_t -__pgz_decode(mach_vm_address_t addr, mach_vm_size_t size) -{ - struct zone_page_metadata *meta; - - if (__probable(!pgz_owned(addr))) { - return (vm_offset_t)addr; - } - - if (zone_addr_size_crosses_page(addr, size)) { - panic("invalid size for PGZ protected address %p:%p", - (void *)addr, (void *)(addr + size)); - } - - meta = zone_meta_from_addr((vm_offset_t)addr); - if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) { - panic_fault_address = (vm_offset_t)addr; - panic("probabilistic gzalloc use-after-free: %p", (void *)addr); - } - - return trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK); -} - -__attribute__((always_inline)) -vm_offset_t -__pgz_decode_allow_invalid(vm_offset_t addr, zone_id_t zid) -{ - struct zone_page_metadata *meta; - struct zone_page_metadata tmp; - - if (__probable(!pgz_owned(addr))) { - return addr; - } - - meta = zone_meta_from_addr(addr); - tmp.zm_bits = os_atomic_load(&meta->zm_bits, relaxed); - - addr = trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK); - - if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) { - return 0; - } - - if (zid != ZONE_ID_ANY && tmp.zm_index != zid) { - return 0; - } - - return addr; -} - -static void -pgz_zone_init(zone_t z) -{ - char zn[MAX_ZONE_NAME]; - char zv[MAX_ZONE_NAME]; - char key[30]; - - if (zone_elem_inner_size(z) > PAGE_SIZE) { - return; - } - - if (pgz_all) { - os_atomic_inc(&pgz_uses, relaxed); - z->z_pgz_tracked = true; - return; - } - - snprintf(zn, sizeof(zn), "%s%s", zone_heap_name(z), zone_name(z)); - - for (int i = 1;; i++) 
{ - snprintf(key, sizeof(key), "pgz%d", i); - if (!PE_parse_boot_argn(key, zv, sizeof(zv))) { - break; - } - if (track_this_zone(zn, zv) || track_kalloc_zones(z, zv)) { - os_atomic_inc(&pgz_uses, relaxed); - z->z_pgz_tracked = true; - break; - } - } -} - -__startup_func -static vm_size_t -pgz_get_size(void) -{ - if (pgz_slots == UINT32_MAX) { - /* - * Scale with RAM size: ~200 slots a G - */ - pgz_slots = (uint32_t)(sane_size >> 22); - } - - /* - * Make sure that the slot allocation scheme works. - * see pgz_slot_alloc() / pgz_slot_free(); - */ - if (pgz_slots < zpercpu_count() * 4) { - pgz_slots = zpercpu_count() * 4; - } - if (pgz_slots >= UINT16_MAX) { - pgz_slots = UINT16_MAX - 1; - } - - /* - * Quarantine is 33% of slots by default, no more than 90%. - */ - if (pgz_quarantine == 0) { - pgz_quarantine = pgz_slots / 3; - } - if (pgz_quarantine > pgz_slots * 9 / 10) { - pgz_quarantine = pgz_slots * 9 / 10; - } - pgz_slot_avail = pgz_slots - pgz_quarantine; - - return ptoa(2 * pgz_slots + 1); -} - -__startup_func -static void -pgz_init(void) -{ - if (!pgz_uses) { - return; - } - - if (pgz_sample_rate == 0) { - /* - * If no rate was provided, pick a random one that scales - * with the number of protected zones. - * - * Use a binomal distribution to avoid having too many - * really fast sample rates. - */ - uint32_t factor = MIN(pgz_uses, 10); - uint32_t max_rate = 1000 * factor; - uint32_t min_rate = 100 * factor; - - pgz_sample_rate = (zalloc_random_uniform32(min_rate, max_rate) + - zalloc_random_uniform32(min_rate, max_rate)) / 2; - } - - struct mach_vm_range *r = &zone_info.zi_pgz_range; - zone_info.zi_pgz_meta = zone_meta_from_addr(r->min_address); - zone_meta_populate(r->min_address, mach_vm_range_size(r)); - - for (uint32_t i = 0; i < 2 * pgz_slots + 1; i += 2) { - pgz_meta_raw(i)->zm_chunk_len = ZM_PGZ_GUARD; - } - - for (uint32_t i = 1; i < pgz_slots; i++) { - pgz_meta_raw(2 * i - 1)->zm_pgz_slot_next = pgz_meta_raw(2 * i + 1); - } -#if OS_ATOMIC_HAS_LLSC - pgz_slot_head = pgz_meta_raw(1); -#endif - pgz_slot_tail = pgz_meta_raw(2 * pgz_slots - 1); - - kernel_memory_allocate(kernel_map, (vm_offset_t *)&pgz_backtraces, - /* size */ sizeof(struct pgz_backtrace) * 2 * pgz_slots, - /* mask */ ZALIGN_PTR, - KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO | KMA_NOSOFTLIMIT, - VM_KERN_MEMORY_KALLOC); - - /* - * expand the pmap so that pmap_enter_options_addr() - * in pgz_protect() never need to call pmap_expand(). - */ - for (uint32_t slot = 0; slot < pgz_slots; slot++) { - (void)pmap_enter_options_addr(kernel_pmap, pgz_addr(slot), 0, - VM_PROT_NONE, VM_PROT_NONE, 0, FALSE, - PMAP_OPTIONS_NOENTER, NULL, PMAP_MAPPING_TYPE_INFER); - } - - /* do this last as this will enable pgz */ - percpu_foreach(counter, pgz_sample_counter) { - *counter = zalloc_random_uniform32(0, 2 * pgz_sample_rate); - } -} -STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, pgz_init); - -static void -panic_display_pgz_bt(bool has_syms, uint32_t slot, bool free) -{ - struct pgz_backtrace *bt = pgz_bt(slot, free); - const char *what = free ? 
"Free" : "Allocation"; - uintptr_t buf[MAX_ZTRACE_DEPTH]; - - if (!ml_validate_nofault((vm_offset_t)bt, sizeof(*bt))) { - paniclog_append_noflush(" Can't decode %s Backtrace\n", what); - return; - } - - backtrace_unpack(BTP_KERN_OFFSET_32, buf, MAX_ZTRACE_DEPTH, - (uint8_t *)bt->pgz_bt, 4 * bt->pgz_depth); - - paniclog_append_noflush(" %s Backtrace:\n", what); - for (uint32_t i = 0; i < bt->pgz_depth && i < MAX_ZTRACE_DEPTH; i++) { - if (has_syms) { - paniclog_append_noflush(" %p ", (void *)buf[i]); - panic_print_symbol_name(buf[i]); - paniclog_append_noflush("\n"); - } else { - paniclog_append_noflush(" %p\n", (void *)buf[i]); - } - } - kmod_panic_dump((vm_offset_t *)buf, bt->pgz_depth); -} - -static void -panic_display_pgz_uaf_info(bool has_syms, vm_offset_t addr) -{ - struct zone_page_metadata *meta; - vm_offset_t elem, esize; - const char *type; - const char *prob; - uint32_t slot; - zone_t z; - - slot = pgz_slot(addr); - meta = pgz_meta(slot); - elem = pgz_addr(slot) + (meta->zm_pgz_orig_addr & PAGE_MASK); - - paniclog_append_noflush("Probabilistic GZAlloc Report:\n"); - - if (ml_validate_nofault((vm_offset_t)meta, sizeof(*meta)) && - meta->zm_index && - meta->zm_index < os_atomic_load(&num_zones, relaxed)) { - z = &zone_array[meta->zm_index]; - } else { - paniclog_append_noflush(" Zone : \n"); - paniclog_append_noflush(" Address : %p\n", (void *)addr); - paniclog_append_noflush("\n"); - return; - } - - esize = zone_elem_inner_size(z); - paniclog_append_noflush(" Zone : %s%s\n", - zone_heap_name(z), zone_name(z)); - paniclog_append_noflush(" Address : %p\n", (void *)addr); - paniclog_append_noflush(" Element : [%p, %p) of size %d\n", - (void *)elem, (void *)(elem + esize), (uint32_t)esize); - - if (addr < elem) { - type = "out-of-bounds(underflow) + use-after-free"; - prob = "low"; - } else if (meta->zm_chunk_len == ZM_PGZ_DOUBLE_FREE) { - type = "double-free"; - prob = "high"; - } else if (addr < elem + esize) { - type = "use-after-free"; - prob = "high"; - } else if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) { - type = "out-of-bounds + use-after-free"; - prob = "low"; - } else { - type = "out-of-bounds"; - prob = "high"; - } - paniclog_append_noflush(" Kind : %s (%s confidence)\n", - type, prob); - if (addr < elem) { - paniclog_append_noflush(" Access : %d byte(s) before\n", - (uint32_t)(elem - addr) + 1); - } else if (addr < elem + esize) { - paniclog_append_noflush(" Access : %d byte(s) inside\n", - (uint32_t)(addr - elem) + 1); - } else { - paniclog_append_noflush(" Access : %d byte(s) past\n", - (uint32_t)(addr - (elem + esize)) + 1); - } - - panic_display_pgz_bt(has_syms, slot, false); - if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) { - panic_display_pgz_bt(has_syms, slot, true); - } - - paniclog_append_noflush("\n"); -} - -vm_offset_t pgz_protect_for_testing_only(zone_t zone, vm_offset_t addr, void *fp); -vm_offset_t -pgz_protect_for_testing_only(zone_t zone, vm_offset_t addr, void *fp) -{ - return pgz_protect(zone, addr, fp); -} - - -#endif /* CONFIG_PROB_GZALLOC */ #endif /* !ZALLOC_TEST */ #pragma mark zfree #if !ZALLOC_TEST @@ -6008,11 +5328,6 @@ __zcache_mark_invalid(zone_t zone, vm_offset_t elem, uint64_t combined_size) vm_offset_t offs; #pragma unused(combined_size) -#if CONFIG_PROB_GZALLOC - if (__improbable(pgz_owned(elem))) { - elem = pgz_unprotect(elem, __builtin_frame_address(0)); - } -#endif /* CONFIG_PROB_GZALLOC */ meta = zone_meta_from_addr(elem); if (!from_zone_map(elem, 1) || !zone_has_index(zone, meta->zm_index)) { @@ -6068,7 +5383,7 @@ 
zcache_mark_invalid(zone_t zone, void *elem) #if ZALLOC_ENABLE_LOGGING __attribute__((noinline)) #endif /* ZALLOC_ENABLE_LOGGING */ -void +__mockable void zfree_ext(zone_t zone, zone_stats_t zstats, void *addr, uint64_t combined_size) { vm_offset_t esize = ZFREE_ELEM_SIZE(combined_size); @@ -6133,7 +5448,7 @@ zcache_free_stack_to_cpu( o = ops->zc_op_mark_invalid(zid, o); } else { if (zero) { - bzero(o, esize); + vm_memtag_bzero_unchecked(o, esize); } o = (void *)__zcache_mark_invalid(zone_by_id(zid), (vm_offset_t)o, ZFREE_PACK_SIZE(esize, esize)); @@ -6211,7 +5526,7 @@ zcache_free_n_ext(zone_id_t zid, zstack_t stack, zone_cache_ops_t ops, bool zero vm_offset_t addr = (vm_offset_t)zstack_pop(&stack); if (zero) { - bzero((void *)addr, esize); + vm_memtag_bzero_unchecked((void *)addr, esize); } addr = __zcache_mark_invalid(zone, addr, ZFREE_PACK_SIZE(esize, esize)); @@ -6268,7 +5583,7 @@ void assert(zone > &zone_array[ZONE_ID__LAST_RO]); assert(!zone->z_percpu && !zone->z_permanent && !zone->z_smr); - vm_memtag_bzero_fast_checked(addr, esize); + vm_memtag_bzero_unchecked(addr, esize); zfree_ext(zone, zstats, addr, ZFREE_PACK_SIZE(esize, esize)); } @@ -6284,7 +5599,7 @@ zfree_percpu(union zone_or_view zov, void *addr) assert(zone > &zone_array[ZONE_ID__LAST_RO]); assert(zone->z_percpu); zpercpu_foreach_cpu(i) { - vm_memtag_bzero_fast_checked((char *)addr + ptoa(i), esize); + vm_memtag_bzero_unchecked((char *)addr + ptoa(i), esize); } zfree_ext(zone, zstats, addr, ZFREE_PACK_SIZE(esize, esize)); } @@ -6510,7 +5825,7 @@ static inline vm_offset_t __zcache_mark_valid(zone_t zone, vm_offset_t addr, zalloc_flags_t flags) { #pragma unused(zone, flags) -#if KASAN_CLASSIC || CONFIG_PROB_GZALLOC || VM_TAG_SIZECLASSES +#if KASAN_CLASSIC || VM_TAG_SIZECLASSES vm_offset_t esize = zone_elem_inner_size(zone); #endif @@ -6539,12 +5854,6 @@ __zcache_mark_valid(zone_t zone, vm_offset_t addr, zalloc_flags_t flags) } #endif /* VM_TAG_SIZECLASSES */ -#if CONFIG_PROB_GZALLOC - if (zone->z_pgz_tracked && pgz_sample(addr, esize)) { - addr = pgz_protect(zone, addr, __builtin_frame_address(0)); - } -#endif - #if KASAN_CLASSIC /* * KASAN_CLASSIC integration of kalloc heaps are handled by kalloc_ext() @@ -6844,7 +6153,7 @@ zalloc_cached_get_pcpu_cache( * @brief * The core implementation of @c zalloc(), @c zalloc_flags(), @c zalloc_percpu(). */ -struct kalloc_result +__mockable struct kalloc_result zalloc_ext(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags) { /* @@ -7133,7 +6442,7 @@ void * struct kalloc_result kr; kr = zalloc_ext(zone, zstats, flags); -#if ZSECURITY_CONFIG(READ_ONLY) +#if ZSECURITY_CONFIG(READ_ONLY) && !__BUILDING_XNU_LIBRARY__ /* zalloc mocks don't create ro memory */ assert(zone_security_array[zid].z_submap_idx == Z_SUBMAP_IDX_READ_ONLY); if (kr.addr) { zone_require_ro(zid, kr.size, kr.addr); @@ -7160,14 +6469,14 @@ from_current_stack(vm_offset_t addr, vm_size_t size) * Check if an address is from const memory i.e TEXT or DATA CONST segements * or the SECURITY_READ_ONLY_LATE section. 
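The zalloc_ro_mut_validation_panic() context above derives the top of the current kernel stack by rounding the frame address up to the stack-size boundary with the `(x + a - 1) & -a` idiom, which is valid whenever `a` is a power of two. A quick user-space check of the idiom, with made-up values for the frame address and stack size:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round x up to the next multiple of a power-of-two alignment. */
static uint64_t
round_up_pow2(uint64_t x, uint64_t align)
{
	return (x + align - 1) & -align;
}

int
main(void)
{
	uint64_t stack_size = 16 * 1024;               /* assumed kernel_stack_size */
	uint64_t frame      = 0xffffff8012345678ULL;   /* made-up frame address */
	uint64_t stack_end  = round_up_pow2(frame, stack_size);

	assert(stack_end % stack_size == 0 && stack_end >= frame);
	printf("frame %#llx -> stack top %#llx\n",
	    (unsigned long long)frame, (unsigned long long)stack_end);
	return 0;
}
```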
*/ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) __attribute__((always_inline)) static bool from_const_memory(const vm_offset_t addr, vm_size_t size) { return rorgn_contains(addr, size, true); } -#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ __attribute__((always_inline)) static bool from_const_memory(const vm_offset_t addr, vm_size_t size) @@ -7175,7 +6484,7 @@ from_const_memory(const vm_offset_t addr, vm_size_t size) #pragma unused(addr, size) return true; } -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ __abortlike static void @@ -7184,7 +6493,7 @@ zalloc_ro_mut_validation_panic(zone_id_t zid, void *elem, { vm_offset_t stack_start = (vm_offset_t)__builtin_frame_address(0); vm_offset_t stack_end = (stack_start + kernel_stack_size - 1) & -kernel_stack_size; -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) extern vm_offset_t rorgn_begin; extern vm_offset_t rorgn_end; #else @@ -7225,7 +6534,7 @@ zalloc_ro_mut_validate_src(zone_id_t zid, void *elem, #endif /* ZSECURITY_CONFIG(READ_ONLY) */ -__attribute__((noinline)) +__mockable __attribute__((noinline)) void zalloc_ro_mut(zone_id_t zid, void *elem, vm_offset_t offset, const void *new_data, vm_size_t new_data_size) @@ -7293,7 +6602,7 @@ zalloc_ro_clear(zone_id_t zid, void *elem, vm_offset_t offset, vm_size_t size) * against an attacker with arbitrary kernel write. */ -#if ZSECURITY_CONFIG(READ_ONLY) +#if ZSECURITY_CONFIG(READ_ONLY) && !defined(__BUILDING_XNU_LIBRARY__) __abortlike static void @@ -7333,7 +6642,8 @@ __attribute__((always_inline)) void zone_require_ro(zone_id_t zid, vm_size_t elem_size __unused, void *addr) { -#if ZSECURITY_CONFIG(READ_ONLY) +#if ZSECURITY_CONFIG(READ_ONLY) && !defined(__BUILDING_XNU_LIBRARY__) \ + /* can't do this in user-mode because there's no zones submap */ struct zone_size_params p = zone_ro_size_params[zid]; vm_offset_t elem = (vm_offset_t)addr; @@ -7424,7 +6734,7 @@ _zalloc_permanent_large(size_t size, vm_offset_t mask, vm_tag_t tag) return (void *)addr; } -void * +__mockable void * zalloc_permanent_tag(vm_size_t size, vm_offset_t mask, vm_tag_t tag) { if (size <= PAGE_SIZE) { @@ -7434,7 +6744,7 @@ zalloc_permanent_tag(vm_size_t size, vm_offset_t mask, vm_tag_t tag) return _zalloc_permanent_large(size, mask, tag); } -void * +__mockable void * zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask) { zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT]; @@ -7702,6 +7012,8 @@ static void zone_reclaim_recirc_trim(zone_t z, struct zone_depot *zd) { for (;;) { + uint64_t maxtime = mach_continuous_speculative_time() + + zc_free_batch_timeout(); uint32_t budget = zc_free_batch_size(); uint32_t count; bool done = true; @@ -7742,14 +7054,21 @@ zone_reclaim_recirc_trim(zone_t z, struct zone_depot *zd) return; } + if (mach_continuous_speculative_time() < maxtime) { + continue; + } + /* - * If the number of magazines to reclaim is too large, - * we might be keeping preemption disabled for too long. - * - * Drop and retake the lock to allow for preemption to occur. 
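The zone_reclaim_recirc_trim() and zone_reclaim() changes in this area introduce an explicit time budget: a deadline is computed from mach_continuous_speculative_time() plus zc_free_batch_timeout(), and once it passes the zone lock is dropped and retaken so a pending preemption can run, then the budget is re-armed. A generic user-space rendering of that pattern is sketched below; the pthread mutex, the batch function, and the 1 ms budget are all assumptions for illustration.

```c
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

static uint64_t
now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

static int work_items = 100000;

static bool
one_batch(void)
{
	/* Stand-in for freeing one batch of magazines; false once drained. */
	work_items -= 64;
	return work_items > 0;
}

static void
drain_with_deadline(pthread_mutex_t *lock)
{
	const uint64_t budget_ns = 1000000;        /* ~1 ms per lock hold */

	pthread_mutex_lock(lock);
	uint64_t deadline = now_ns() + budget_ns;

	while (one_batch()) {
		if (now_ns() < deadline) {
			continue;
		}
		pthread_mutex_unlock(lock);        /* let waiters / preemption in */
		sched_yield();
		pthread_mutex_lock(lock);
		deadline = now_ns() + budget_ns;   /* re-arm the budget */
	}
	pthread_mutex_unlock(lock);
}

int
main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	drain_with_deadline(&lock);
	return 0;
}
```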
+ * We have held preemption disabled for too long. Drop and + * retake the lock to allow a pending preemption to occur. */ +#if SCHED_HYGIENE_DEBUG + abandon_preemption_disable_measurement(); +#endif zone_unlock(z); zone_lock(z); + maxtime = mach_continuous_speculative_time() + + zc_free_batch_timeout(); } } @@ -7829,6 +7148,8 @@ zone_reclaim(zone_t z, zone_reclaim_mode_t mode) zone_reclaim_pcpu(z, mode, &zd); if (z->z_chunk_elems) { + uint64_t maxtime = mach_continuous_speculative_time() + + zc_free_batch_timeout(); zone_cache_t cache = zpercpu_get_cpu(z->z_pcpu_cache, 0); smr_t smr = zone_cache_smr(cache); @@ -7844,12 +7165,18 @@ zone_reclaim(zone_t z, zone_reclaim_mode_t mode) zone_depot_insert_head_empty(&zd, mag); freed += zc_mag_size(); - if (freed >= zc_free_batch_size()) { + if (freed >= zc_free_batch_size() || + mach_continuous_speculative_time() >= maxtime) { +#if SCHED_HYGIENE_DEBUG + abandon_preemption_disable_measurement(); +#endif zone_unlock(z); zone_magazine_free_list(&zd); thread_yield_to_preemption(); zone_lock(z); freed = 0; + maxtime = mach_continuous_speculative_time() + + zc_free_batch_timeout(); } } } else { @@ -8333,13 +7660,6 @@ panic_display_zone_info(void) paniclog_append_noflush(" Zone map: %p - %p\n", (void *)zone_info.zi_map_range.min_address, (void *)zone_info.zi_map_range.max_address); -#if CONFIG_PROB_GZALLOC - if (pgz_submap) { - paniclog_append_noflush(" . PGZ : %p - %p\n", - (void *)pgz_submap->min_offset, - (void *)pgz_submap->max_offset); - } -#endif /* CONFIG_PROB_GZALLOC */ for (int i = 0; i < Z_SUBMAP_IDX_COUNT; i++) { vm_map_t map = zone_submaps[i]; @@ -8452,11 +7772,6 @@ panic_display_zalloc(void) panic_display_zone_info(); if (panic_fault_address) { -#if CONFIG_PROB_GZALLOC - if (pgz_owned(panic_fault_address)) { - panic_display_pgz_uaf_info(keepsyms, panic_fault_address); - } else -#endif /* CONFIG_PROB_GZALLOC */ if (zone_maps_owned(panic_fault_address, 1)) { panic_display_zone_fault(panic_fault_address); } @@ -8606,7 +7921,7 @@ mach_memory_info_security_check(bool redact_info) return KERN_NO_ACCESS; } - if (PE_srd_fused) { + if (research_mode_state() == true) { return KERN_SUCCESS; } @@ -8645,6 +7960,52 @@ mach_memory_info_security_check(bool redact_info) return KERN_SUCCESS; } +#if DEVELOPMENT || DEBUG + +kern_return_t +zone_reset_peak(const char *zonename) +{ + unsigned int max_zones; + + if (zonename == NULL) { + return KERN_INVALID_ARGUMENT; + } + + max_zones = os_atomic_load(&num_zones, relaxed); + for (unsigned int i = 0; i < max_zones; i++) { + zone_t z = &zone_array[i]; + + if (zone_name(z) && + track_this_zone(zone_name(z), zonename)) { + /* Found the matching zone */ + os_log_info(OS_LOG_DEFAULT, + "zalloc: resetting peak size for zone %s\n", zone_name(z)); + zone_lock(z); + z->z_wired_hwm = z->z_wired_cur; + zone_unlock(z); + return KERN_SUCCESS; + } + } + return KERN_NOT_FOUND; +} + +kern_return_t +zone_reset_all_peaks(void) +{ + unsigned int max_zones; + os_log_info(OS_LOG_DEFAULT, "zalloc: resetting all zone size peaks\n"); + max_zones = os_atomic_load(&num_zones, relaxed); + for (unsigned int i = 0; i < max_zones; i++) { + zone_t z = &zone_array[i]; + zone_lock(z); + z->z_wired_hwm = z->z_wired_cur; + zone_unlock(z); + } + return KERN_SUCCESS; +} + +#endif /* DEVELOPMENT || DEBUG */ + kern_return_t mach_zone_info( mach_port_t host_port, @@ -8865,7 +8226,7 @@ mach_memory_info_internal( names_size = round_page(max_zones * sizeof *names); kr = kmem_alloc(ipc_kernel_map, &names_addr, names_size, - KMA_PAGEABLE | KMA_DATA, 
VM_KERN_MEMORY_IPC); + KMA_PAGEABLE | KMA_DATA_SHARED, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { return kr; } @@ -8873,7 +8234,7 @@ mach_memory_info_internal( info_size = round_page(max_zones * sizeof *info); kr = kmem_alloc(ipc_kernel_map, &info_addr, info_size, - KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); + KMA_PAGEABLE | KMA_DATA_SHARED, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { kmem_free(ipc_kernel_map, names_addr, names_size); @@ -8884,7 +8245,7 @@ mach_memory_info_internal( if (redact_info) { coalesce_size = round_page(max_zones * sizeof *coalesce); kr = kmem_alloc(ipc_kernel_map, &coalesce_addr, coalesce_size, - KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); + KMA_PAGEABLE | KMA_DATA_SHARED, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { kmem_free(ipc_kernel_map, names_addr, names_size); @@ -8900,7 +8261,7 @@ mach_memory_info_internal( memory_info_size = num_info * sizeof(*memory_info); memory_info_vmsize = round_page(memory_info_size); kr = kmem_alloc(ipc_kernel_map, &memory_info_addr, memory_info_vmsize, - KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); + KMA_PAGEABLE | KMA_DATA_SHARED, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { return kr; } @@ -9057,7 +8418,7 @@ mach_zone_get_zlog_zones( names_size = round_page(max_zones * sizeof *names); kr = kmem_alloc(ipc_kernel_map, &names_addr, names_size, - KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); + KMA_PAGEABLE | KMA_DATA_SHARED, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { return kr; } @@ -9604,7 +8965,6 @@ zone_create_ext( if (flags & ZC_PERCPU) { zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_READONLY); - zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_PGZ_USE_GUARDS); z->z_percpu = true; } if (flags & ZC_NOGC) { @@ -9626,19 +8986,6 @@ zone_create_ext( /* * Handle Internal flags */ -#if ZSECURITY_CONFIG(SAD_FENG_SHUI) - if (flags & ZC_PGZ_USE_GUARDS) { - /* - * Try to turn on guard pages only for zones - * with a chance of OOB. - */ - if (startup_phase < STARTUP_SUB_LOCKDOWN) { - zsflags->z_pgz_use_guards = true; - } - z->z_pgz_use_guards = true; - } -#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */ - #if ZSECURITY_CONFIG(ZONE_TAGGING) if (flags & (ZC_NO_TBI_TAG)) { zsflags->z_tag = false; @@ -9693,11 +9040,6 @@ zone_create_ext( /* * Configure debugging features */ -#if CONFIG_PROB_GZALLOC - if ((flags & (ZC_READONLY | ZC_PERCPU | ZC_OBJ_CACHE | ZC_NOPGZ)) == 0) { - pgz_zone_init(z); - } -#endif if (zc_magazine_zone) { /* proxy for "has zone_init run" */ #if ZALLOC_ENABLE_LOGGING /* @@ -9717,7 +9059,11 @@ zone_create_ext( assert(startup_phase < STARTUP_SUB_LOCKDOWN); z->z_uses_tags = true; - if (zone_is_data_kheap(zsflags->z_kheap_id)) { + if (zsflags->z_kheap_id == KHEAP_ID_DATA_BUFFERS) { + /* + * Note that we don't use zone_is_data_kheap() here because we don't + * want to insert the kheap size classes more than once. 
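The DEVELOPMENT/DEBUG helpers zone_reset_peak() and zone_reset_all_peaks() added a little earlier in this diff clamp a zone's wired high-water mark back down to its current usage, so a test can measure the peak caused by its own workload alone. A small user-space model of that reset-then-measure flow; the stats structure and the workload are made up for the example.

```c
#include <stdint.h>
#include <stdio.h>

/* Toy model of a per-zone high-water mark (z_wired_cur / z_wired_hwm). */
struct toy_zone_stats {
	uint32_t cur_pages;
	uint32_t peak_pages;
};

static void
toy_account(struct toy_zone_stats *z, int32_t delta_pages)
{
	z->cur_pages = (uint32_t)((int32_t)z->cur_pages + delta_pages);
	if (z->cur_pages > z->peak_pages) {
		z->peak_pages = z->cur_pages;
	}
}

static void
toy_reset_peak(struct toy_zone_stats *z)
{
	/* What zone_reset_peak() does under the zone lock: hwm = cur. */
	z->peak_pages = z->cur_pages;
}

int
main(void)
{
	struct toy_zone_stats z = { .cur_pages = 10, .peak_pages = 50 };

	toy_reset_peak(&z);            /* forget the boot-time peak */
	toy_account(&z, +25);          /* run the workload of interest */
	toy_account(&z, -20);
	printf("workload peak: %u pages\n", z.peak_pages);   /* prints 35 */
	return 0;
}
```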
+ */ zone_tags_sizeclasses[sizeclass_idx] = (uint16_t)size; z->z_tags_sizeclass = sizeclass_idx++; } else { @@ -9784,7 +9130,7 @@ zone_get_sig_eq(zone_t zone) return zone_security_array[zone_index(zone)].z_sig_eq; } -void +__mockable void zone_enable_smr(zone_t zone, struct smr *smr, zone_smr_free_cb_t free_cb) { /* moving to SMR must be done before the zone has ever been used */ @@ -10004,7 +9350,9 @@ zone_bootstrap(void) #endif /* DEBUG || DEVELOPMENT */ /* Validate struct zone_packed_virtual_address expectations */ +#ifndef __BUILDING_XNU_LIBRARY__ /* user-mode addresses are low*/ static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1"); +#endif /* __BUILDING_XNU_LIBRARY__ */ if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) { panic("zone_pva_t can't pack a kernel page address in 31 bits"); } @@ -10032,14 +9380,6 @@ zone_bootstrap(void) random_bool_init(&zone_bool_gen[cpu].zbg_bg); } -#if CONFIG_PROB_GZALLOC - /* - * Set pgz_sample_counter on the boot CPU so that we do not sample - * any allocation until PGZ has been properly setup (in pgz_init()). - */ - *PERCPU_GET_MASTER(pgz_sample_counter) = INT32_MAX; -#endif /* CONFIG_PROB_GZALLOC */ - #if ZSECURITY_CONFIG(SAD_FENG_SHUI) /* * Randomly assign zones to one of the 4 general submaps, @@ -10078,17 +9418,6 @@ zone_tunables_fixup(void) { int wdt = 0; -#if CONFIG_PROB_GZALLOC && (DEVELOPMENT || DEBUG) - if (!PE_parse_boot_argn("pgz", NULL, 0) && - PE_parse_boot_argn("pgz1", NULL, 0)) { - /* - * if pgz1= was used, but pgz= was not, - * then the more specific pgz1 takes precedence. - */ - pgz_all = false; - } -#endif - if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) { zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT; } @@ -10271,6 +9600,8 @@ zone_metadata_init(void) vm_map_t vm_map = zone_submaps[Z_SUBMAP_IDX_VM]; vm_map_entry_t first; + vmlp_api_start(ZONE_METADATA_INIT); + struct mach_vm_range meta_r, bits_r, xtra_r, early_r; vm_size_t early_sz; vm_offset_t reloc_base; @@ -10422,6 +9753,7 @@ zone_metadata_init(void) } } + vmlp_api_end(ZONE_METADATA_INIT, 0); } @@ -10514,9 +9846,6 @@ zone_set_map_sizes(void) zone_submap_left_guard_size(Z_SUBMAP_IDX_VM) + zone_submap_right_guard_size(Z_SUBMAP_IDX_VM); -#if CONFIG_PROB_GZALLOC - vm_submap_size += pgz_get_size(); -#endif /* CONFIG_PROB_GZALLOC */ if (os_sub_overflow(zone_restricted_va_max(), vm_submap_size, &zone_map_range.min_address)) { zone_map_range.min_address = 0; @@ -10580,25 +9909,7 @@ zone_init(void) submap_min = zone_map_range.min_address; -#if CONFIG_PROB_GZALLOC - vm_size_t pgz_size = pgz_get_size(); - - vm_map_will_allocate_early_map(&pgz_submap); - zone_info.zi_pgz_range = zone_kmem_suballoc(submap_min, pgz_size, - VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, - VM_KERN_MEMORY_ZONE, &pgz_submap); - - submap_min += pgz_size; - remaining_size -= pgz_size; -#if DEBUG || DEVELOPMENT - printf("zone_init: pgzalloc %p:%p (%u%c) [%d slots]\n", - (void *)zone_info.zi_pgz_range.min_address, - (void *)zone_info.zi_pgz_range.max_address, - mach_vm_size_pretty(pgz_size), mach_vm_size_unit(pgz_size), - pgz_slots); -#endif /* DEBUG || DEVELOPMENT */ -#endif /* CONFIG_PROB_GZALLOC */ - +#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* zone submap is not maintained in unit-test */ /* * Allocate the submaps */ @@ -10622,6 +9933,9 @@ zone_init(void) } zone_metadata_init(); +#else +#pragma unused(denom, remaining_size) +#endif #if VM_TAG_SIZECLASSES if (zone_tagging_on) { @@ -10645,7 +9959,7 @@ zone_init(void) zc_magazine_zone = zone_create("zcc_magazine_zone", 
sizeof(struct zone_magazine) + zc_mag_size() * sizeof(vm_offset_t), - ZC_VM | ZC_NOCACHING | ZC_ZFREE_CLEARMEM | ZC_PGZ_USE_GUARDS); + ZC_VM | ZC_NOCACHING | ZC_ZFREE_CLEARMEM); zone_raise_reserve(zc_magazine_zone, (uint16_t)(2 * zpercpu_count())); /* @@ -11469,7 +10783,7 @@ zone_gc_stress_test_run(__unused int64_t in, int64_t *out) } for (uint64_t i = 0; i < in; i++) { - size_t count = zc_mag_size() * zc_free_batch_size() * 10; + size_t count = zc_mag_size() * zc_free_batch_size() * 20; if (count < ZONE_GC_OBJ_PER_PAGE * 20) { count = ZONE_GC_OBJ_PER_PAGE * 20; diff --git a/osfmk/kern/zalloc.h b/osfmk/kern/zalloc.h index ec5049399..fcb4dfaec 100644 --- a/osfmk/kern/zalloc.h +++ b/osfmk/kern/zalloc.h @@ -172,17 +172,15 @@ __options_decl(zone_create_flags_t, uint64_t, { /** This zone is a built object cache */ ZC_OBJ_CACHE = 0x0080000000000000, - /** Use guard pages in PGZ mode */ - ZC_PGZ_USE_GUARDS = 0x0100000000000000, + // was ZC_PGZ_USE_GUARDS 0x0100000000000000, /** Zone doesn't support TBI tagging */ - ZC_NO_TBI_TAG = 0x0200000000000000, + ZC_NO_TBI_TAG = 0x0200000000000000, /** This zone will back a kalloc type */ ZC_KALLOC_TYPE = 0x0400000000000000, - /** Disable PGZ for this zone */ - ZC_NOPGZ = 0x0800000000000000, + // was ZC_NOPGZ = 0x0800000000000000, /** This zone contains pure data */ ZC_DATA = 0x1000000000000000, @@ -469,6 +467,7 @@ __options_decl(zalloc_flags_t, uint32_t, { Z_REALLOCF = 0x0008, #if XNU_KERNEL_PRIVATE + Z_NOSOFTLIMIT = 0x0020, Z_SET_NOTEARLY = 0x0040, Z_SPRAYQTN = 0x0080, Z_KALLOC_ARRAY = 0x0100, @@ -1118,7 +1117,7 @@ extern zone_t zinit( #include -#pragma GCC visibility push(hidden) +__exported_push_hidden #pragma mark XNU only: zalloc (extended) @@ -2015,7 +2014,20 @@ __enum_decl(zone_kheap_id_t, uint8_t, { static inline bool zone_is_data_kheap(zone_kheap_id_t kheap_id) { - return kheap_id == KHEAP_ID_DATA_BUFFERS || kheap_id == KHEAP_ID_DATA_SHARED; + return kheap_id == KHEAP_ID_DATA_BUFFERS || + kheap_id == KHEAP_ID_DATA_SHARED; +} + +static inline bool +zone_is_data_buffers_kheap(zone_kheap_id_t kheap_id) +{ + return kheap_id == KHEAP_ID_DATA_BUFFERS; +} + +static inline bool +zone_is_data_shared_kheap(zone_kheap_id_t kheap_id) +{ + return kheap_id == KHEAP_ID_DATA_SHARED; } /*! @@ -2463,71 +2475,6 @@ extern void zcache_drain( extern zone_cache_ops_t zcache_ops[ZONE_ID__FIRST_DYNAMIC]; -#pragma mark XNU only: PGZ support - -/*! - * @function pgz_owned() - * - * @brief - * Returns whether an address is PGZ owned. - * - * @param addr The address to translate. - * @returns Whether it is PGZ owned - */ -#if CONFIG_PROB_GZALLOC -extern bool pgz_owned(mach_vm_address_t addr) __pure2; -#else -#define pgz_owned(addr) false -#endif - -/*! - * @function pgz_decode() - * - * @brief - * Translates a PGZ protected virtual address to its unprotected - * backing store. - * - * @discussion - * This is exposed so that the VM can lookup the vm_page_t for PGZ protected - * elements since the PGZ protected virtual addresses are maintained by PGZ - * at the pmap level without the VM involvment. - * - * "allow_invalid" schemes relying on sequestering also need this - * to perform the locking attempts on the unprotected address. - * - * @param addr The address to translate. - * @param size The object size. - * @returns The unprotected address or @c addr. - */ -#if CONFIG_PROB_GZALLOC -#define pgz_decode(addr, size) \ - ((typeof(addr))__pgz_decode((mach_vm_address_t)(addr), size)) -#else -#define pgz_decode(addr, size) (addr) -#endif - -/*! 
- * @function pgz_decode_allow_invalid() - * - * @brief - * Translates a PGZ protected virtual address to its unprotected - * backing store, but doesn't assert it is still allocated/valid. - * - * @discussion - * "allow_invalid" schemes relying on sequestering also need this - * to perform the locking attempts on the unprotected address. - * - * @param addr The address to translate. - * @param want_zid The expected zone ID for the element. - * @returns The unprotected address or @c addr. - */ -#if CONFIG_PROB_GZALLOC -#define pgz_decode_allow_invalid(addr, want_zid) \ - ((typeof(addr))__pgz_decode_allow_invalid((vm_offset_t)(addr), want_zid)) -#else -#define pgz_decode_allow_invalid(addr, zid) (addr) -#endif - #pragma mark XNU only: misc & implementation details struct zone_create_startup_spec { @@ -2619,17 +2566,6 @@ __zone_flags_mix_tag(zalloc_flags_t flags, vm_tag_t tag) extern unsigned zpercpu_count(void) __pure2; -#if CONFIG_PROB_GZALLOC - -extern vm_offset_t __pgz_decode( - mach_vm_address_t addr, - mach_vm_size_t size); - -extern vm_offset_t __pgz_decode_allow_invalid( - vm_offset_t offs, - zone_id_t zid); - -#endif #if DEBUG || DEVELOPMENT /* zone_max_zone is here (but not zalloc_internal.h) for the BSD kernel */ extern unsigned int zone_max_zones(void); @@ -2651,6 +2587,16 @@ extern uint32_t zone_map_jetsam_limit; extern kern_return_t zone_map_jetsam_set_limit(uint32_t value); +/* max length of a zone name we can take from boot-args/sysctl */ +#define MAX_ZONE_NAME 32 + +#if DEVELOPMENT || DEBUG + +extern kern_return_t zone_reset_peak(const char *zonename); +extern kern_return_t zone_reset_all_peaks(void); + +#endif /* DEVELOPMENT || DEBUG */ + extern zone_t percpu_u64_zone; /*! @@ -2681,7 +2627,7 @@ mach_memory_info_sample( extern void zone_gc_trim(void); extern void zone_gc_drain(void); -#pragma GCC visibility pop +__exported_pop #endif /* XNU_KERNEL_PRIVATE */ /* diff --git a/osfmk/kern/zalloc_internal.h b/osfmk/kern/zalloc_internal.h index fade1fe10..55af880a2 100644 --- a/osfmk/kern/zalloc_internal.h +++ b/osfmk/kern/zalloc_internal.h @@ -101,7 +101,7 @@ __BEGIN_DECLS -#pragma GCC visibility push(hidden) +__exported_push_hidden /* * A zone is a collection of fixed size blocks for which there @@ -271,13 +271,11 @@ struct zone { no_callout :1, z_destructible :1, /* zone can be zdestroy()ed */ - _reserved :6, + _reserved :8, /* * Debugging features */ - z_pgz_tracked :1, /* this zone is tracked by pgzalloc */ - z_pgz_use_guards :1, /* this zone uses guards with PGZ */ z_kasan_fakestacks :1, z_kasan_quarantine :1, /* whether to use the kasan quarantine */ z_tags_sizeclass :6, /* idx into zone_tags_sizeclasses to associate @@ -421,11 +419,10 @@ typedef struct zone_security_flags { z_kheap_id :3, /* zone_kheap_id_t when part of a kalloc heap */ z_kalloc_type :1, /* zones that does types based seggregation */ z_lifo :1, /* depot and recirculation layer are LIFO */ - z_pgz_use_guards :1, /* this zone uses guards with PGZ */ z_submap_from_end :1, /* allocate from the left or the right ? 
*/ z_noencrypt :1, /* do not encrypt pages when hibernating */ z_tag :1, /* zone supports TBI tagging */ - z_unused :15; + z_unused :16; /* * Signature equivalance zone */ @@ -521,6 +518,11 @@ __enum_decl(kt_var_heap_id_t, uint32_t, { * have been redirected to KHEAP_DATA_BUFFERS */ KT_VAR_DATA_HEAP, + /* + * Fake "data" heap used to link views of data-only allocation that + * have been redirected to KHEAP_DATA_SHARED + */ + KT_VAR_DATA_SHARED_HEAP, /* * Heaps for pointer arrays */ @@ -1074,8 +1076,6 @@ zone_unlock(zone_t zone) #endif /* KASAN_FAKESTACK */ } -#define MAX_ZONE_NAME 32 /* max length of a zone name we can take from the boot-args */ - int track_this_zone(const char *zonename, const char *logname); extern bool panic_include_kalloc_types; extern zone_t kalloc_type_src_zone; @@ -1085,7 +1085,7 @@ extern zone_t kalloc_type_dst_zone; extern vm_size_t zone_element_info(void *addr, vm_tag_t * ptag); #endif /* DEBUG || DEVELOPMENT */ -#pragma GCC visibility pop +__exported_pop __END_DECLS diff --git a/osfmk/kperf/kptimer.c b/osfmk/kperf/kptimer.c index 4273fd27a..c24672bb9 100644 --- a/osfmk/kperf/kptimer.c +++ b/osfmk/kperf/kptimer.c @@ -319,10 +319,10 @@ kptimer_expire(processor_t processor, int cpuid, uint64_t now) case KPERF_SAMPLING_ON: break; case KPERF_SAMPLING_SHUTDOWN: + // Treat off the same as shutdown: this CPU just missed the shutdown request. + case KPERF_SAMPLING_OFF: kptimer_stop_cpu(processor); return; - case KPERF_SAMPLING_OFF: - panic("kperf: timer fired at %llu, but sampling is disabled", now); default: panic("kperf: unknown sampling state 0x%x", status); } diff --git a/osfmk/kperf/task_samplers.c b/osfmk/kperf/task_samplers.c index 08a1c998e..82182ccd8 100644 --- a/osfmk/kperf/task_samplers.c +++ b/osfmk/kperf/task_samplers.c @@ -34,7 +34,8 @@ #include extern void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, - boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); + boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit, boolean_t *is_active, + boolean_t *is_managed, boolean_t *has_assertion); void kperf_task_snapshot_sample(task_t task, struct kperf_task_snapshot *tksn) @@ -55,7 +56,8 @@ kperf_task_snapshot_sample(task_t task, struct kperf_task_snapshot *tksn) } #if CONFIG_MEMORYSTATUS boolean_t dirty = FALSE, dirty_tracked = FALSE, allow_idle_exit = FALSE; - memorystatus_proc_flags_unsafe(get_bsdtask_info(task), &dirty, &dirty_tracked, &allow_idle_exit); + boolean_t is_active = FALSE, is_managed = FALSE, has_assertion = FALSE; + memorystatus_proc_flags_unsafe(get_bsdtask_info(task), &dirty, &dirty_tracked, &allow_idle_exit, &is_active, &is_managed, &has_assertion); if (dirty) { tksn->kptksn_flags |= KPERF_TASK_FLAG_DIRTY; } @@ -65,6 +67,15 @@ kperf_task_snapshot_sample(task_t task, struct kperf_task_snapshot *tksn) if (allow_idle_exit) { tksn->kptksn_flags |= KPERF_TASK_ALLOW_IDLE_EXIT; } + if (is_active) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_ACTIVE; + } + if (is_managed) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_MANAGED; + } + if (has_assertion) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_HAS_ASSERTION; + } #endif tksn->kptksn_suspend_count = task->suspend_count; diff --git a/osfmk/kperf/task_samplers.h b/osfmk/kperf/task_samplers.h index d47b15bca..66d75936f 100644 --- a/osfmk/kperf/task_samplers.h +++ b/osfmk/kperf/task_samplers.h @@ -49,6 +49,9 @@ struct kperf_task_snapshot { #define KPERF_TASK_FLAG_WQ_EXCEEDED_CONSTRAINED (1U << 6) #define KPERF_TASK_FLAG_DIRTY_TRACKED (1U << 7) #define KPERF_TASK_ALLOW_IDLE_EXIT (1U << 8) +#define 
KPERF_TASK_FLAG_ACTIVE (1U << 9) +#define KPERF_TASK_FLAG_MANAGED (1U << 10) +#define KPERF_TASK_FLAG_HAS_ASSERTION (1U << 11) void kperf_task_snapshot_sample(task_t task, struct kperf_task_snapshot *tksn); void kperf_task_snapshot_log(struct kperf_task_snapshot *tksn); diff --git a/osfmk/mach/Makefile b/osfmk/mach/Makefile index 8889ac9da..5c6b8eae1 100644 --- a/osfmk/mach/Makefile +++ b/osfmk/mach/Makefile @@ -246,10 +246,13 @@ PRIVATE_DATAFILES = $(sort \ ${MIG_PRIVATE_DEFS}) INSTALL_MI_LCL_LIST = $(sort \ + arcade_register.defs \ coalition.h \ + mach_eventlink.defs \ mach_time_private.h \ mk_timer.h \ resource_monitors.h \ + restartable.defs \ task_policy_private.h \ thread_policy_private.h \ vm_reclaim.h \ diff --git a/osfmk/mach/arm/_structs.h b/osfmk/mach/arm/_structs.h index 71fcbbc30..2a34f2e66 100644 --- a/osfmk/mach/arm/_structs.h +++ b/osfmk/mach/arm/_structs.h @@ -657,25 +657,25 @@ _STRUCT_ARM_SME_STATE _STRUCT_ARM_SVE_Z_STATE { char __z[16][256]; -} __attribute__((aligned(_Alignof(unsigned int)))); +} __attribute__((aligned(4))); #define _STRUCT_ARM_SVE_P_STATE struct __darwin_arm_sve_p_state _STRUCT_ARM_SVE_P_STATE { char __p[16][256 / 8]; -} __attribute__((aligned(_Alignof(unsigned int)))); +} __attribute__((aligned(4))); #define _STRUCT_ARM_SME_ZA_STATE struct __darwin_arm_sme_za_state _STRUCT_ARM_SME_ZA_STATE { char __za[4096]; -} __attribute__((aligned(_Alignof(unsigned int)))); +} __attribute__((aligned(4))); #define _STRUCT_ARM_SME2_STATE struct __darwin_arm_sme2_state _STRUCT_ARM_SME2_STATE { char __zt0[64]; -} __attribute__((aligned(_Alignof(unsigned int)))); +} __attribute__((aligned(4))); #else /* !__DARWIN_UNIX03 */ #define _STRUCT_ARM_SME_STATE struct arm_sme_state _STRUCT_ARM_SME_STATE @@ -689,25 +689,25 @@ _STRUCT_ARM_SME_STATE _STRUCT_ARM_SVE_Z_STATE { char z[16][256]; -} __attribute__((aligned(_Alignof(unsigned int)))); +} __attribute__((aligned(4))); #define _STRUCT_ARM_SVE_P_STATE struct arm_sve_p_state _STRUCT_ARM_SVE_P_STATE { char p[16][256 / 8]; -} __attribute__((aligned(_Alignof(unsigned int)))); +} __attribute__((aligned(4))); #define _STRUCT_ARM_SME_ZA_STATE struct arm_sme_za_state _STRUCT_ARM_SME_ZA_STATE { char za[4096]; -} __attribute__((aligned(_Alignof(unsigned int)))); +} __attribute__((aligned(4))); #define _STRUCT_ARM_SME2_STATE struct arm_sme2_state _STRUCT_ARM_SME2_STATE { char zt0[64]; -} __attribute__((aligned(_Alignof(unsigned int)))); +} __attribute__((aligned(4))); #endif /* __DARWIN_UNIX03 */ /* diff --git a/osfmk/mach/arm/exception.h b/osfmk/mach/arm/exception.h index a6b6349e6..0b2b776ed 100644 --- a/osfmk/mach/arm/exception.h +++ b/osfmk/mach/arm/exception.h @@ -38,8 +38,14 @@ #define EXCEPTION_CODE_MAX 2 /* code and subcode */ #if XNU_KERNEL_PRIVATE + #if __has_feature(ptrauth_calls) -#define EXC_PTRAUTH_BIT 0x200 /* bit set if exception could have been caused by ptrauth failure */ +/* + * Note that while this bit can show up in the reported exception code, + * it also does double-duty by being set just while the exception is temporarily shuffled around + * within xnu, and it can be cleared before we reached exception_triage_thread(). 
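The _structs.h hunks above replace `__attribute__((aligned(_Alignof(unsigned int))))` with a literal `aligned(4)` on the exported SME/SVE state structures. On the platforms these headers target the two are identical; spelling the value out simply avoids relying on the C11 `_Alignof` operator in a public header (that rationale is an inference, it is not stated in the patch). A quick sanity check of the equivalence:

```c
#include <assert.h>
#include <stdio.h>

struct toy_sve_z_state {
	char z[16][256];
} __attribute__((aligned(4)));

static_assert(_Alignof(unsigned int) == 4,
    "unsigned int is 4-byte aligned on the targets these headers support");
static_assert(_Alignof(struct toy_sve_z_state) == 4,
    "an explicit aligned(4) gives the same layout");

int
main(void)
{
	printf("alignof = %zu\n", _Alignof(struct toy_sve_z_state));
	return 0;
}
```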
+ */ +#define EXC_PTRAUTH_BIT 0x200 /* Set if the exception was caused by a ptrauth failure */ #endif /* __has_feature(ptrauth_calls) */ #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/mach/arm/thread_status.h b/osfmk/mach/arm/thread_status.h index 1cd15f8e2..ed953235c 100644 --- a/osfmk/mach/arm/thread_status.h +++ b/osfmk/mach/arm/thread_status.h @@ -345,11 +345,6 @@ typedef _STRUCT_ARM_LEGACY_DEBUG_STATE arm_debug_state_t; #define HAS_ADD_SAVED_STATE_PC 1 #endif -#if CONFIG_BTI_TELEMETRY -/* BTI Telemetry needs CPSR to recover from BTI exceptions */ -#define HAS_SET_SAVED_STATE_CPSR 1 -#endif /* CONFIG_HAS_BTI_TELEMETRY */ - static inline boolean_t is_thread_state32(const arm_unified_thread_state_t *its) diff --git a/osfmk/mach/arm/vm_param.h b/osfmk/mach/arm/vm_param.h index ed2da8856..0d0df1151 100644 --- a/osfmk/mach/arm/vm_param.h +++ b/osfmk/mach/arm/vm_param.h @@ -200,6 +200,8 @@ extern int PAGE_SHIFT_CONST; * constrain the address space further. */ + +#ifndef __BUILDING_XNU_LIBRARY__ #if XNU_KERNEL_PRIVATE #if defined(ARM_LARGE_MEMORY) /* @@ -248,13 +250,13 @@ extern int PAGE_SHIFT_CONST; * | | | | PMAP_HEAP_RANGE_START | >= H9 * +-----------------------+--------+--------+------------------------+ */ -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #define VM_KERNEL_POINTER_SIGNIFICANT_BITS 38 #define VM_MIN_KERNEL_ADDRESS ((vm_address_t) (0ULL - GiB(144))) -#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ #define VM_KERNEL_POINTER_SIGNIFICANT_BITS 37 #define VM_MIN_KERNEL_ADDRESS ((vm_address_t) 0xffffffe000000000ULL) -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ #define VM_MAX_KERNEL_ADDRESS ((vm_address_t) 0xfffffffbffffffffULL) #endif // ARM_LARGE_MEMORY @@ -265,6 +267,11 @@ extern int PAGE_SHIFT_CONST; #define VM_MIN_KERNEL_ADDRESS ((vm_address_t) (0ULL - TiB(2))) #define VM_MAX_KERNEL_ADDRESS ((vm_address_t) 0xfffffffbffffffffULL) #endif // XNU_KERNEL_PRIVATE +#else /* __BUILDING_XNU_LIBRARY__ */ +#define VM_MIN_KERNEL_ADDRESS ((vm_address_t)(0x100000000ULL)) +#define VM_MAX_KERNEL_ADDRESS ((vm_address_t)(0ULL + GiB(2))) +#define VM_KERNEL_POINTER_SIGNIFICANT_BITS 31 +#endif /*__BUILDING_XNU_LIBRARY__ */ #else #error architecture not supported #endif @@ -279,33 +286,59 @@ extern int PAGE_SHIFT_CONST; #define VM_USER_STRIP_TBI(_v) (_v) #endif /* __arm64__ */ -#if CONFIG_KERNEL_TAGGING -#include -/* - * 'strip' in PAC sense, therefore replacing the stripped bits sign extending - * the sign bit. In kernel space the sign bit is 1, so 0xFF is a valid mask - * here. - */ -#define VM_KERNEL_STRIP_TAG(_v) (vm_memtag_canonicalize_kernel((vm_offset_t)_v)) -#else /* CONFIG_KERNEL_TAGGING */ -#define VM_KERNEL_STRIP_TAG(_v) (_v) -#endif /* CONFIG_KERNEL_TAGGING */ + +#if __arm64__ + +#if XNU_KERNEL_PRIVATE +#define VM_KERNEL_STRIP_MASK (-1ULL << (64 - T1SZ_BOOT)) +#define VM_USER_STRIP_MASK (-1ULL >> (T0SZ_BOOT)) +#define _VM_KERNEL_STRIP_PTR(_va) ({((_va) & 1ULL << 55) ? 
((_va) | VM_KERNEL_STRIP_MASK) : ((_va) & VM_USER_STRIP_MASK);}) +#else /* XNU_KERNEL_PRIVATE */ #if __has_feature(ptrauth_calls) #include -#define VM_KERNEL_STRIP_PAC(_v) (ptrauth_strip((void *)(uintptr_t)(_v), ptrauth_key_asia)) +#define VM_KERNEL_STRIP_PAC(_v) ((uintptr_t)(ptrauth_strip((void *)(uintptr_t)(_v), ptrauth_key_asia))) #else /* !ptrauth_calls */ -#define VM_KERNEL_STRIP_PAC(_v) (_v) +#define VM_KERNEL_STRIP_PAC(_v) (_v) #endif /* ptrauth_calls */ +/* For KEXT, just blow away TBI bits, even if only used for KASAN. */ +#define _VM_KERNEL_STRIP_PTR(_v) (VM_KERNEL_STRIP_PAC(_v) | (0xFF00000000000000ULL)) +#endif /* XNU_KERNEL_PRIVATE */ + +#else /* __arm64__ */ +#define _VM_KERNEL_STRIP_PTR(_v) (_v) +#endif /* __arm64__ */ + +#define VM_KERNEL_STRIP_PTR(_va) (_VM_KERNEL_STRIP_PTR((uintptr_t)(_va))) + +/* Vestige from the past, kept for retro-compatibility. */ +#define VM_KERNEL_STRIP_UPTR(_va) (VM_KERNEL_STRIP_PTR(_va)) -#define VM_KERNEL_STRIP_PTR(_va) ((VM_KERNEL_STRIP_TAG(VM_KERNEL_STRIP_PAC((_va))))) -#define VM_KERNEL_STRIP_UPTR(_va) ((vm_address_t)VM_KERNEL_STRIP_PTR((uintptr_t)(_va))) #define VM_KERNEL_ADDRESS(_va) \ - ((VM_KERNEL_STRIP_UPTR(_va) >= VM_MIN_KERNEL_ADDRESS) && \ - (VM_KERNEL_STRIP_UPTR(_va) <= VM_MAX_KERNEL_ADDRESS)) + ((VM_KERNEL_STRIP_PTR(_va) >= VM_MIN_KERNEL_ADDRESS) && \ + (VM_KERNEL_STRIP_PTR(_va) <= VM_MAX_KERNEL_ADDRESS)) #define VM_USER_STRIP_PTR(_v) (VM_USER_STRIP_TBI(_v)) +#if DEBUG || DEVELOPMENT || !defined(HAS_APPLE_PAC) + +#define ML_ADDRPERM(addr, slide) ((addr) + (slide)) + +#else /* DEBUG || DEVELOPMENT || !defined(HAS_APPLE_PAC) */ + +/** + * While these function's implementations are machine specific, due to the need + * to prevent header file circular dependencies, they need to be externed here + * for usage in the addrperm macro + */ +__BEGIN_DECLS +vm_offset_t ml_addrperm_pacga(vm_offset_t addr); +__END_DECLS + +#define ML_ADDRPERM(addr, slide) ml_addrperm_pacga(addr) + +#endif /* DEBUG || DEVELOPMENT || !defined(HAS_APPLE_PAC) */ + #ifdef MACH_KERNEL_PRIVATE /* * Physical memory is mapped linearly at an offset virtual memory. diff --git a/osfmk/mach/coalition.h b/osfmk/mach/coalition.h index 39a03c7e3..8e807e074 100644 --- a/osfmk/mach/coalition.h +++ b/osfmk/mach/coalition.h @@ -149,6 +149,7 @@ struct coalition_resource_usage { uint64_t gpu_energy_nj; /* nanojoules that I did */ uint64_t gpu_energy_nj_billed_to_me; /* nanojoules that others did on my behalf */ uint64_t gpu_energy_nj_billed_to_others; /* nanojoules that I did on others' behalf */ + uint64_t swapins; }; #ifdef PRIVATE diff --git a/osfmk/mach/exception_types.h b/osfmk/mach/exception_types.h index 4a12acc52..0856429f0 100644 --- a/osfmk/mach/exception_types.h +++ b/osfmk/mach/exception_types.h @@ -177,6 +177,13 @@ #ifdef KERNEL_PRIVATE #define EXC_MASK_VALID (EXC_MASK_ALL | EXC_MASK_CRASH | EXC_MASK_CORPSE_NOTIFY) + +/* + * Additional mask for use with EXC_BREAKPOINT. + * Note this is used just while the exception is shuffled around within xnu. + * It's wiped off before we reach exception_triage_thread(). + */ +#define EXC_MAY_BE_UNRECOVERABLE_BIT 0x400 /* Set if this exception may be uncatchable by userspace */ #endif /* KERNEL_PRIVATE */ #define FIRST_EXCEPTION 1 /* ZERO is illegal */ diff --git a/osfmk/mach/exclaves.h b/osfmk/mach/exclaves.h index ae3533412..c7ae1be95 100644 --- a/osfmk/mach/exclaves.h +++ b/osfmk/mach/exclaves.h @@ -134,11 +134,11 @@ OS_CLOSED_OPTIONS(exclaves_requirement, uint64_t, EXCLAVES_R_CONCLAVE = 0x20, /* - * ExclaveKit initialization. 
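The new _VM_KERNEL_STRIP_PTR() above canonicalizes a possibly tagged or PAC-signed pointer by testing bit 55 (which selects the TTBR1 kernel half versus the TTBR0 user half on arm64) and then forcing the non-address bits to all-ones or all-zeros with masks derived from T1SZ_BOOT and T0SZ_BOOT. A user-space rendering of the same computation, using assumed 39-bit address spaces and made-up tagged pointers:

```c
#include <stdint.h>
#include <stdio.h>

#define TOY_T0SZ 25                                 /* 39-bit user VA (assumed) */
#define TOY_T1SZ 25                                 /* 39-bit kernel VA (assumed) */
#define TOY_KERNEL_STRIP_MASK (~0ULL << (64 - TOY_T1SZ))
#define TOY_USER_STRIP_MASK   (~0ULL >> TOY_T0SZ)

static uint64_t
toy_strip_ptr(uint64_t va)
{
	/* Bit 55 picks the kernel (TTBR1) or user (TTBR0) canonical form. */
	return (va & (1ULL << 55)) ? (va | TOY_KERNEL_STRIP_MASK)
	                           : (va & TOY_USER_STRIP_MASK);
}

int
main(void)
{
	uint64_t tagged_kernel = 0x34ffffff00112000ULL;  /* tag 0x34 in the top byte */
	uint64_t tagged_user   = 0x5600000012340000ULL;  /* tag 0x56, bit 55 clear */

	printf("kernel: %#llx -> %#llx\n",
	    (unsigned long long)tagged_kernel,
	    (unsigned long long)toy_strip_ptr(tagged_kernel));
	printf("user:   %#llx -> %#llx\n",
	    (unsigned long long)tagged_user,
	    (unsigned long long)toy_strip_ptr(tagged_user));
	return 0;
}
```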
- * If relaxed and exclavekit initialization fails, continue on without + * Framebank initialization. + * If relaxed and framebank initialization fails, set exclavekit boot to failed and continue on without * panicking. All conclave related functionality will fail. */ - EXCLAVES_R_EXCLAVEKIT = 0x40, + EXCLAVES_R_FRAMEBANK = 0x40, /* * Conclave resource support. @@ -167,6 +167,18 @@ OS_CLOSED_OPTIONS(exclaves_requirement, uint64_t, */ EXCLAVES_R_TEST_STRESS = 0x400, + /* + * Support for Always On Exclaves. + */ + EXCLAVES_R_AOE = 0x800, + + /* + * ExclaveKit initialization. + * If relaxed, skip exclavekit initialization and continue on without + * panicking. All conclave related functionality will fail. + */ + EXCLAVES_R_EXCLAVEKIT = 0x1000, + ); #if !defined(KERNEL) @@ -660,6 +672,29 @@ kern_return_t exclaves_sensor_status(mach_port_t sensor_port, uint64_t flags, exclaves_sensor_status_t *sensor_status); +/*! + * @function exclaves_indicator_min_on_time + * + * @abstract + * Get time remaining until minimum on time is satisfied for all sensor types. + * The return value for each indicator type is a future clock tick on the Global time base + * if the minimum on time is not satisfied, and 0 otherwise. + * + * @param port Reserved, must be MACH_PORT_NULL for now. + * @param flags Reserved, must be 0 for now. + * @param camera_indicator Out parameter filled with remaining camera indicator time to meet minimum on time + * @param mic_indicator Out parameter filled with remaining microphone indicator time to meet minimum on time + * @param faceid Out parameter filled with remaining Face ID indicator time to meet minimum on time + * + * @result + * KERN_SUCCESS or mach system call error code. + */ + +SPI_AVAILABLE(macos(15.5), ios(18.5), tvos(18.5), watchos(11.5), visionos(2.5)) +kern_return_t +exclaves_indicator_min_on_time(mach_port_t port, uint64_t flags, + uint64_t *camera_indicator, uint64_t *mic_indicator, uint64_t *faceid); + /*! * @function exclaves_launch_conclave * @@ -728,6 +763,60 @@ SPI_AVAILABLE(macos(14.4), ios(17.4), tvos(17.4), watchos(10.4)) kern_return_t exclaves_notification_create(mach_port_t port, const char *name, uint64_t *notification_id); +/*! + * @function exclaves_aoe_setup + * + * @abstract + * Discover the number of threads this always-on conclave supports. + * + * @param port + * Reserved, must be MACH_PORT_NULL for now. + * + * @param num_message + * Returns the number of message threads + * + * @param num_worker + * Returns the number of worker threads + * + * @result + * KERN_SUCCESS or mach system call error code. + */ +SPI_AVAILABLE(macos(16.0), ios(19.0), tvos(19.0), watchos(12.0), xros(3.0)) +kern_return_t +exclaves_aoe_setup(mach_port_t port, uint8_t *num_message, uint8_t *num_worker); + +/*! + * @function exclaves_aoe_work_loop + * + * @abstract + * Enter the always-on exclaves worker run loop. This function never returns. + * + * @param port + * Reserved, must be MACH_PORT_NULL for now. + * + * @result + * KERN_SUCCESS or mach system call error code. + */ +SPI_AVAILABLE(macos(16.0), ios(19.0), tvos(19.0), watchos(12.0), xros(3.0)) +kern_return_t +exclaves_aoe_work_loop(mach_port_t port); + +/*! + * @function exclaves_aoe_message_loop + * + * @abstract + * Enter the always-on exclaves message loop. This function never returns. + * + * @param port + * Reserved, must be MACH_PORT_NULL for now. + * + * @result + * KERN_SUCCESS or mach system call error code. 
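The always-on-exclaves SPI declared above reports how many message and worker threads the conclave expects, after which each loop call never returns. The hypothetical harness below shows how a client might wire that up; the use of MACH_PORT_NULL, the dedicated pthreads, and the availability of a library exporting this SPI are all assumptions, not something the header prescribes.

```c
#include <mach/mach.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Prototypes as declared in osfmk/mach/exclaves.h by this patch. */
extern kern_return_t exclaves_aoe_setup(mach_port_t port,
    uint8_t *num_message, uint8_t *num_worker);
extern kern_return_t exclaves_aoe_work_loop(mach_port_t port);
extern kern_return_t exclaves_aoe_message_loop(mach_port_t port);

static void *
work_thread(void *arg)
{
	(void)arg;
	(void)exclaves_aoe_work_loop(MACH_PORT_NULL);     /* never returns */
	return NULL;
}

static void *
message_thread(void *arg)
{
	(void)arg;
	(void)exclaves_aoe_message_loop(MACH_PORT_NULL);  /* never returns */
	return NULL;
}

int
main(void)
{
	uint8_t n_msg = 0, n_work = 0;

	if (exclaves_aoe_setup(MACH_PORT_NULL, &n_msg, &n_work) != KERN_SUCCESS) {
		fprintf(stderr, "exclaves_aoe_setup failed\n");
		return 1;
	}
	for (uint8_t i = 0; i < n_work; i++) {
		pthread_t t;
		pthread_create(&t, NULL, work_thread, NULL);
	}
	for (uint8_t i = 0; i < n_msg; i++) {
		pthread_t t;
		pthread_create(&t, NULL, message_thread, NULL);
	}
	pthread_exit(NULL);     /* keep the process alive for the loops */
}
```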
+ */ +SPI_AVAILABLE(macos(16.0), ios(19.0), tvos(19.0), watchos(12.0), xros(3.0)) +kern_return_t +exclaves_aoe_message_loop(mach_port_t port); + #else /* defined(KERNEL) */ /*! @@ -875,8 +964,11 @@ OS_ENUM(exclaves_sensor_type, uint32_t, EXCLAVES_SENSOR_MIC = 2, EXCLAVES_SENSOR_CAM_ALT_FACEID = 3, EXCLAVES_SENSOR_CAM_ALT_FACEID_DELAYED = 4, + EXCLAVES_SENSOR_TEST = 5, + EXCLAVES_SENSOR_TEST_MIL = 6, + EXCLAVES_SENSOR_TEST_CIL = 7, /* update max if more sensors added */ - EXCLAVES_SENSOR_MAX = 4, + EXCLAVES_SENSOR_MAX = 7, ); /*! @@ -926,6 +1018,7 @@ exclaves_sensor_start(exclaves_sensor_type_t sensor_type, uint64_t flags, kern_return_t exclaves_sensor_stop(exclaves_sensor_type_t sensor_type, uint64_t flags, exclaves_sensor_status_t *sensor_status); + /*! * @function exclaves_sensor_status * @@ -948,21 +1041,36 @@ exclaves_sensor_status(exclaves_sensor_type_t sensor_type, uint64_t flags, exclaves_sensor_status_t *sensor_status); /*! - * @function exclaves_display_healthcheck_rate + * @function exclaves_sensor_tick_rate * * @abstract - * Update the rate of the display healthcheck based on the specified - * display update rate + * Set the fire rate of the timer that ticks the EIC periodically. + * This should only be called by the brightness stack to adjust the rate at which + * LED indicators can get new brightness values. * - * @param ns - * The rate in nanoseconds. - * Note: This value may be be rounded to the nearest rate supported and not used - * as-is. + * @param rate_hz + * Timer rate in Hz. * * @result * KERN_SUCCESS or mach system call error code. */ kern_return_t +exclaves_sensor_tick_rate(uint64_t rate_hz); + +/*! + * @function exclaves_display_healthcheck_rate + * + * @abstract + * Deprecated, no longer does anything. + * + * @param ns + * Unused. + * + * @result + * KERN_SUCCESS. + */ +/* __kpi_deprecated("Inoperative noop, can remove") */ +kern_return_t exclaves_display_healthcheck_rate(uint64_t ns); #endif /* defined(KERNEL) */ @@ -1017,6 +1125,10 @@ OS_ENUM(exclaves_ctl_op, uint8_t, EXCLAVES_CTL_OP_SENSOR_STOP = 12, EXCLAVES_CTL_OP_SENSOR_STATUS = 13, EXCLAVES_CTL_OP_NOTIFICATION_RESOURCE_LOOKUP = 14, + EXCLAVES_CTL_OP_AOE_SETUP = 15, + EXCLAVES_CTL_OP_AOE_MESSAGE_LOOP = 16, + EXCLAVES_CTL_OP_AOE_WORK_LOOP = 17, + EXCLAVES_CTL_OP_SENSOR_MIN_ON_TIME = 18, EXCLAVES_CTL_OP_LAST, ); #define EXCLAVES_CTL_FLAGS_MASK (0xfffffful) @@ -1041,6 +1153,20 @@ typedef struct exclaves_resource_user { mach_port_name_t r_port; } exclaves_resouce_user_t; +/*! + * @struct exclaves_indicator_deadline + * + * @brief + * This struct will contain the amount of time remaining before + * minimum on time is met for various sensors + */ +typedef struct exclaves_indicator_deadlines { + uint64_t version; + uint64_t camera_indicator; + uint64_t mic_indicator; + uint64_t faceid_indicator; +} exclaves_indicator_deadlines_t; + #if !defined(KERNEL) SPI_AVAILABLE(macos(14.4), ios(17.4), tvos(17.4), watchos(10.4)) @@ -1126,6 +1252,7 @@ __options_closed_decl(exclaves_priv_t, unsigned int, { EXCLAVES_PRIV_CONCLAVE_SPAWN = 0x2, /* Can spawn conclaves. */ EXCLAVES_PRIV_KERNEL_DOMAIN = 0x4, /* Access to kernel resources. */ EXCLAVES_PRIV_BOOT = 0x8, /* Can boot exclaves. 
*/ + EXCLAVES_PRIV_INDICATOR_MIN_ON_TIME = 0x10 /* Can access sensor minimum on time*/ }); /* diff --git a/osfmk/mach/host_info.h b/osfmk/mach/host_info.h index efb4782d6..b8beb7e7a 100644 --- a/osfmk/mach/host_info.h +++ b/osfmk/mach/host_info.h @@ -216,7 +216,8 @@ struct vm_compressor_q_lens { typedef struct vm_compressor_q_lens vm_compressor_q_lens_data_t; #define HOST_VM_COMPRESSOR_Q_LENS 7 -#define VM_COMPRESSOR_Q_LENS_COUNT sizeof(struct vm_compressor_q_lens)/sizeof(integer_t) +#define VM_COMPRESSOR_Q_LENS_COUNT ((mach_msg_type_number_t) \ + (sizeof(struct vm_compressor_q_lens)/sizeof(integer_t))) #endif #ifdef XNU_KERNEL_PRIVATE @@ -245,11 +246,12 @@ typedef struct vm_purgeable_info *host_purgable_info_t; /* size of the latest version of the structure */ #define HOST_VM_INFO64_LATEST_COUNT HOST_VM_INFO64_COUNT -#define HOST_VM_INFO64_REV1_COUNT HOST_VM_INFO64_LATEST_COUNT +#define HOST_VM_INFO64_REV2_COUNT HOST_VM_INFO64_COUNT +#define HOST_VM_INFO64_REV1_COUNT ((mach_msg_type_number_t) \ + (offsetof(vm_statistics64_data_t, swapped_count) / sizeof(integer_t))) /* previous versions: adjust the size according to what was added each time */ -#define HOST_VM_INFO64_REV0_COUNT /* added compression and swapper info (14 ints) */ \ - ((mach_msg_type_number_t) \ - (HOST_VM_INFO64_REV1_COUNT - 14)) +#define HOST_VM_INFO64_REV0_COUNT ((mach_msg_type_number_t) \ + (offsetof(vm_statistics64_data_t, decompressions) / sizeof(integer_t))) /* in */ /* vm_extmod_statistics */ diff --git a/osfmk/mach/i386/vm_param.h b/osfmk/mach/i386/vm_param.h index fc6807f37..4936fe4ce 100644 --- a/osfmk/mach/i386/vm_param.h +++ b/osfmk/mach/i386/vm_param.h @@ -320,6 +320,12 @@ #endif /* MACH_KERNEL_PRIVATE */ +#ifdef XNU_KERNEL_PRIVATE + +#define ML_ADDRPERM(addr, slide) ((addr) + (slide)) + +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* KERNEL_PRIVATE */ #endif /* defined (__i386__) || defined (__x86_64__) */ diff --git a/osfmk/mach/mach_port.defs b/osfmk/mach/mach_port.defs index 208a54bea..cb6beafb8 100644 --- a/osfmk/mach/mach_port.defs +++ b/osfmk/mach/mach_port.defs @@ -555,7 +555,7 @@ mach_port_kobject( task : ipc_space_read_t; #endif name : mach_port_name_t; - out object_type : natural_t; + out object_type : ipc_info_object_type_t; out object_addr : mach_vm_address_t); @@ -688,7 +688,7 @@ mach_port_kobject_description( task : ipc_space_read_t; #endif name : mach_port_name_t; - out object_type : natural_t; + out object_type : ipc_info_object_type_t; out object_addr : mach_vm_address_t; out description : kobject_description_t); diff --git a/osfmk/mach/mach_traps.h b/osfmk/mach/mach_traps.h index c336a9dcd..69280efb9 100644 --- a/osfmk/mach/mach_traps.h +++ b/osfmk/mach/mach_traps.h @@ -68,6 +68,7 @@ #include +#include #include #include #include @@ -368,6 +369,10 @@ extern kern_return_t debug_control_port_for_pid( int pid, mach_port_name_t *t); +extern mach_error_t mach_vm_reclaim_update_kernel_accounting_trap( + mach_port_name_t target_tport, + uint64_t *bytes_reclaimed); + #else /* KERNEL */ #ifdef XNU_KERNEL_PRIVATE @@ -917,6 +922,15 @@ struct iokit_user_client_trap_args { kern_return_t iokit_user_client_trap( struct iokit_user_client_trap_args *args); +#if __LP64__ +struct mach_vm_reclaim_update_kernel_accounting_trap_args { + PAD_ARG_(mach_port_name_t, target_task); + PAD_ARG_(user_addr_t, bytes_reclaimed_out); +}; +extern mach_error_t mach_vm_reclaim_update_kernel_accounting_trap( + struct mach_vm_reclaim_update_kernel_accounting_trap_args *args); +#endif /* __LP64__ */ + #undef PAD_ #undef PADL_ #undef 
PADR_ diff --git a/osfmk/mach/mach_types.defs b/osfmk/mach/mach_types.defs index 40ca90ec7..fe00742b2 100644 --- a/osfmk/mach/mach_types.defs +++ b/osfmk/mach/mach_types.defs @@ -722,6 +722,8 @@ type dyld_kernel_image_info_t = struct[40] of MACH_MSG_TYPE_BYTE; type dyld_kernel_image_info_array_t = ^array[] of dyld_kernel_image_info_t; type dyld_kernel_process_info_t = struct[64] of MACH_MSG_TYPE_BYTE; +type mach_vm_offset_list_t = array[*:512] of mach_vm_offset_t; + #if KERNEL_SERVER #ifdef MACH_KERNEL_PRIVATE simport ; /* for voucher conversions */ diff --git a/osfmk/mach/mach_vm.defs b/osfmk/mach/mach_vm.defs index d3458e5e2..a230c37ab 100644 --- a/osfmk/mach/mach_vm.defs +++ b/osfmk/mach/mach_vm.defs @@ -551,18 +551,17 @@ routine PREFIX(KERNEL_SERVER_SUFFIX(vm_remap_new)) ( routine mach_vm_deferred_reclamation_buffer_allocate( target_task : task_t; - out address : mach_vm_address_t; - len : uint32_t; - max_len : uint32_t); + out address : mach_vm_address_t; + out sampling_period : uint64_t; + len : uint32_t; + max_len : uint32_t); routine mach_vm_deferred_reclamation_buffer_flush( target_task : task_t; - num_entries_to_reclaim : uint32_t); - -routine mach_vm_deferred_reclamation_buffer_update_reclaimable_bytes( - target_task : task_t; - reclaimable_bytes : mach_vm_size_t); + num_entries_to_reclaim : uint32_t; + out bytes_reclaimed : mach_vm_size_t); +skip; /* was: mach_vm_deferred_reclamation_buffer_update_reclaimable_bytes() */ #else skip; skip; @@ -580,10 +579,30 @@ skip; #if !defined(LIBSYSCALL_INTERFACE) && !defined(_MACH_VM_PUBLISH_AS_LOCAL_) routine mach_vm_deferred_reclamation_buffer_resize( - target_task : task_t; - size : uint32_t); + target_task : task_t; + new_len : uint32_t; + out bytes_reclaimed : mach_vm_size_t); #else skip; #endif +#if !defined(_MACH_VM_PUBLISH_AS_LOCAL_) +routine PREFIX(mach_vm_update_pointers_with_remote_tags) ( + target : vm_map_t; + in_pointer_list : mach_vm_offset_list_t; + out out_pointer_list : mach_vm_offset_list_t, CountInOut); +#else /* !defined(_MACH_VM_PUBLISH_AS_LOCAL_) */ +skip; +#endif /* !defined(_MACH_VM_PUBLISH_AS_LOCAL_) */ + +#if !defined(LIBSYSCALL_INTERFACE) && !defined(_MACH_VM_PUBLISH_AS_LOCAL_) +routine mach_vm_deferred_reclamation_buffer_query( + target : task_read_t; + out addr : mach_vm_address_t; + out size : mach_vm_size_t); +#else +skip; +#endif + + /* vim: set ft=c : */ diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index b71a04c6c..0169116b2 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -376,6 +376,9 @@ __END_DECLS #define CPU_SUBTYPE_ARM_V7M ((cpu_subtype_t) 15) /* Not meant to be run under xnu */ #define CPU_SUBTYPE_ARM_V7EM ((cpu_subtype_t) 16) /* Not meant to be run under xnu */ #define CPU_SUBTYPE_ARM_V8M ((cpu_subtype_t) 17) /* Not meant to be run under xnu */ +#define CPU_SUBTYPE_ARM_V8M_MAIN CPU_SUBTYPE_ARM_V8M /* Not meant to be run under xnu */ +#define CPU_SUBTYPE_ARM_V8M_BASE ((cpu_subtype_t) 18) /* Not meant to be run under xnu */ +#define CPU_SUBTYPE_ARM_V8_1M_MAIN ((cpu_subtype_t) 19) /* Not meant to be run under xnu */ /* * ARM64 subtypes @@ -388,7 +391,14 @@ __END_DECLS #define CPU_SUBTYPE_ARM64_PTR_AUTH_MASK 0x0f000000 #define CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(x) (((x) & CPU_SUBTYPE_ARM64_PTR_AUTH_MASK) >> 24) #ifdef PRIVATE -#define CPU_SUBTYPE_ARM64_PTR_AUTH_CURRENT_VERSION 0 +#define CPU_SUBTYPE_ARM64_PTR_AUTHV0_VERSION 0 +#define CPU_SUBTYPE_ARM64_PTR_AUTHV1_VERSION 1 +#define CPU_SUBTYPE_ARM64_PTR_AUTH_CURRENT_VERSION CPU_SUBTYPE_ARM64_PTR_AUTHV0_VERSION +#if 
XNU_TARGET_OS_OSX +#define CPU_SUBTYPE_ARM64_PTR_AUTH_MAX_PREFERRED_VERSION CPU_SUBTYPE_ARM64_PTR_AUTHV1_VERSION +#else /* XNU_TARGET_OS_OSX */ +#define CPU_SUBTYPE_ARM64_PTR_AUTH_MAX_PREFERRED_VERSION CPU_SUBTYPE_ARM64_PTR_AUTHV0_VERSION +#endif /* XNU_TARGET_OS_OSX */ #endif /* PRIVATE */ /* diff --git a/osfmk/mach/memory_entry.defs b/osfmk/mach/memory_entry.defs index fbdaebdbd..c1c100c76 100644 --- a/osfmk/mach/memory_entry.defs +++ b/osfmk/mach/memory_entry.defs @@ -58,3 +58,21 @@ routine mach_memory_entry_ownership( #endif ledger_tag : int; ledger_flags : int); + +/// Query the residency of the physical memory backing a given memory entry. +/// +/// This operation is only supported on "named" memory entries created with +/// `MAP_MEM_NAMED_CREATE`. It is unsupported on "mappings" created with +/// `MAP_MEM_COPY` or `MAM_MEM_SHARED` +/// +/// - Parameters +/// - mem_entry: The memory entry to query +/// - resident_cnt: If non-null, the number of resident pages written out +/// - dirty_cnt: If non-null, the number of resident, modified pages written out +/// - swapped_cnt: If non-null, the number of evicted pages written out +routine mach_memory_entry_get_page_counts( + mem_entry : mem_entry_name_port_t; + out resident_cnt : uint64_t; + out dirty_cnt : uint64_t; + out swapped_cnt : uint64_t); + diff --git a/osfmk/mach/memory_object_types.h b/osfmk/mach/memory_object_types.h index fc8174637..319deb18e 100644 --- a/osfmk/mach/memory_object_types.h +++ b/osfmk/mach/memory_object_types.h @@ -497,7 +497,7 @@ struct upl_page_info { needed:1, /* page should be left in cache on abort */ mark:1, /* a mark flag for the creator to use as they wish */ - reserved: 12, + reserved: 12, :0; /* force to long boundary */ #else opaque; /* use upl_page_xxx() accessor funcs */ @@ -563,8 +563,11 @@ typedef uint64_t upl_control_flags_t; #define UPL_NOZEROFILLIO 0x40000000ULL /* allow non zerofill pages present */ #define UPL_REQUEST_FORCE_COHERENCY 0x80000000ULL + + +#define UPL_CARRY_VA_TAG 0x10000000000ULL /* UPL flags known by this kernel */ -#define UPL_VALID_FLAGS 0xFFFFFFFFFFULL +#define UPL_VALID_FLAGS 0x1FFFFFFFFFFULL /* upl abort error flags */ @@ -812,6 +815,7 @@ extern boolean_t upl_has_wired_pages(upl_t upl); void upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v); boolean_t upl_page_get_mark(upl_page_info_t *upl, int index); +boolean_t upl_page_is_needed(upl_page_info_t *upl, int index); #endif // KERNEL_PRIVATE diff --git a/osfmk/mach/message.h b/osfmk/mach/message.h index 3f3203812..21a4b631a 100644 --- a/osfmk/mach/message.h +++ b/osfmk/mach/message.h @@ -1092,44 +1092,52 @@ __options_decl(mach_msg_option64_t, uint64_t, { MACH64_SEND_DK_CALL = 0x0000001000000000ull, #ifdef XNU_KERNEL_PRIVATE + MACH64_POLICY_KERNEL_EXTENSION = 0x0000002000000000ull, + MACH64_POLICY_FILTER_NON_FATAL = 0x0000004000000000ull, + MACH64_POLICY_FILTER_MSG = 0x0000008000000000ull, /* * Policy for the mach_msg2_trap() call + * `MACH64_POLICY_MASK` holds an ipc_space_policy_t bitfield, shifted. 
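 *
 * A hedged sketch of extracting that bitfield (`opts` is an assumed
 * mach_msg_option64_t; the MACH64_* macros are the ones defined in this file):
 *
 *     uint64_t space_policy_bits =
 *         (opts & MACH64_POLICY_MASK) >> MACH64_POLICY_SHIFT;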
*/ - MACH64_POLICY_KERNEL_EXTENSION = 0x0002000000000000ull, - MACH64_POLICY_FILTER_NON_FATAL = 0x0004000000000000ull, - MACH64_POLICY_FILTER_MSG = 0x0008000000000000ull, - MACH64_POLICY_DEFAULT = 0x0010000000000000ull, + MACH64_POLICY_DEFAULT = 0x0000010000000000ull, /* IPC_SPACE_POLICY_DEFAULT */ + MACH64_POLICY_ENHANCED = 0x0000020000000000ull, /* IPC_SPACE_POLICY_ENHANCED */ + MACH64_POLICY_PLATFORM = 0x0000040000000000ull, /* IPC_SPACE_POLICY_PLATFORM */ + MACH64_POLICY_KERNEL = 0x0000100000000000ull, /* IPC_SPACE_POLICY_KERNEL */ + #if XNU_TARGET_OS_OSX - MACH64_POLICY_SIMULATED = 0x0020000000000000ull, + MACH64_POLICY_SIMULATED = 0x0000200000000000ull, /* IPC_SPACE_POLICY_SIMULATED */ #else - MACH64_POLICY_SIMULATED = 0x0000000000000000ull, + MACH64_POLICY_SIMULATED = 0x0000000000000000ull, /* IPC_SPACE_POLICY_SIMULATED */ #endif #if CONFIG_ROSETTA - MACH64_POLICY_TRANSLATED = 0x0040000000000000ull, + MACH64_POLICY_TRANSLATED = 0x0000400000000000ull, /* IPC_SPACE_POLICY_TRANSLATED */ #else - MACH64_POLICY_TRANSLATED = 0x0000000000000000ull, + MACH64_POLICY_TRANSLATED = 0x0000000000000000ull, /* IPC_SPACE_POLICY_TRANSLATED */ +#endif +#if XNU_TARGET_OS_OSX + MACH64_POLICY_OPTED_OUT = 0x0000800000000000ull, /* IPC_SPACE_POLICY_OPTED_OUT */ +#else + MACH64_POLICY_OPTED_OUT = 0x0000000000000000ull, /* IPC_SPACE_POLICY_OPTED_OUT */ #endif - MACH64_POLICY_HARDENED = 0x0080000000000000ull, - MACH64_POLICY_RIGID = 0x0100000000000000ull, - MACH64_POLICY_PLATFORM = 0x0200000000000000ull, - MACH64_POLICY_KERNEL = MACH64_SEND_KERNEL, - /* one of these bits must be set to have a valid policy */ - MACH64_POLICY_NEEDED_MASK = ( + MACH64_POLICY_ENHANCED_V0 = 0x0001000000000000ull, /* DEPRECATED - includes macos hardened runtime */ + MACH64_POLICY_ENHANCED_V1 = 0x0002000000000000ull, /* ES features exposed to 3P in FY2024 release */ + MACH64_POLICY_ENHANCED_V2 = 0x0004000000000000ull, /* ES features exposed to 3P in FY2025 release */ + + MACH64_POLICY_ENHANCED_VERSION_MASK = ( + MACH64_POLICY_ENHANCED_V0 | /* IPC_SPACE_POLICY_ENHANCED_V0 */ + MACH64_POLICY_ENHANCED_V1 | /* IPC_SPACE_POLICY_ENHANCED_V1 */ + MACH64_POLICY_ENHANCED_V2 /* IPC_SPACE_POLICY_ENHANCED_V2 */ + ), + + MACH64_POLICY_MASK = ( + MACH64_POLICY_DEFAULT | + MACH64_POLICY_ENHANCED | + MACH64_POLICY_PLATFORM | + MACH64_POLICY_KERNEL | MACH64_POLICY_SIMULATED | MACH64_POLICY_TRANSLATED | - MACH64_POLICY_DEFAULT | - MACH64_POLICY_HARDENED | - MACH64_POLICY_RIGID | - MACH64_POLICY_PLATFORM | - MACH64_POLICY_KERNEL), - - /* extra policy modifiers */ - MACH64_POLICY_MASK = ( - MACH64_POLICY_KERNEL_EXTENSION | - MACH64_POLICY_FILTER_NON_FATAL | - MACH64_POLICY_FILTER_MSG | - MACH64_POLICY_NEEDED_MASK), + MACH64_POLICY_OPTED_OUT), /* * If kmsg has auxiliary data, append it immediate after the message @@ -1141,21 +1149,18 @@ __options_decl(mach_msg_option64_t, uint64_t, { MACH64_RCV_LINEAR_VECTOR = 0x1000000000000000ull, /* Receive into highest addr of buffer */ MACH64_RCV_STACK = 0x2000000000000000ull, -#if MACH_FLIPC - /* - * This internal-only flag is intended for use by a single thread per-port/set! - * If more than one thread attempts to MACH64_PEEK_MSG on a port or set, one of - * the threads may miss messages (in fact, it may never wake up). - */ - MACH64_PEEK_MSG = 0x4000000000000000ull, -#endif /* MACH_FLIPC */ + + /* unused = 0x4000000000000000ull, */ + /* * This is a mach_msg2() send/receive operation. 
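 *
 * Illustrative only (a sketch, not part of this patch; `opts` is an assumed
 * mach_msg_option64_t): code handling these options can distinguish the
 * mach_msg2() path with
 *
 *     if (opts & MACH64_MACH_MSG2) {
 *         // options arrived via the mach_msg2() interface
 *     }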
*/ MACH64_MACH_MSG2 = 0x8000000000000000ull -#endif +#endif /* XNU_KERNEL_PRIVATE */ }); +#define MACH64_POLICY_SHIFT __builtin_ctzll(MACH64_POLICY_MASK) + /* old spelling */ #define MACH64_SEND_USER_CALL MACH64_SEND_MQ_CALL #endif /* PRIVATE */ @@ -1217,21 +1222,9 @@ __options_decl(mach_msg_option64_t, uint64_t, { /* The options implemented by the library interface to mach_msg et. al. */ #define MACH_MSG_OPTION_LIB (MACH_SEND_INTERRUPT | MACH_RCV_INTERRUPT) -#define MACH_SEND_WITH_STRICT_REPLY(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG)) == \ - (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG)) - -#define MACH_SEND_REPLY_IS_IMMOVABLE(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | \ - MACH_SEND_MSG | MACH_RCV_MSG | \ - MACH_RCV_GUARDED_DESC)) == \ - (MACH_MSG_STRICT_REPLY | MACH_SEND_MSG | MACH_RCV_GUARDED_DESC)) - #define MACH_RCV_WITH_STRICT_REPLY(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG)) == \ (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG)) -#define MACH_RCV_WITH_IMMOVABLE_REPLY(_opts) (((_opts) & (MACH_MSG_STRICT_REPLY | \ - MACH_RCV_MSG | MACH_RCV_GUARDED_DESC)) == \ - (MACH_MSG_STRICT_REPLY | MACH_RCV_MSG | MACH_RCV_GUARDED_DESC)) - #endif /* MACH_KERNEL_PRIVATE */ #ifdef XNU_KERNEL_PRIVATE @@ -1369,7 +1362,7 @@ typedef kern_return_t mach_msg_return_t; #define MACH_SEND_INVALID_RT_OOL_SIZE 0x10000015 /* compatibility: no longer a returned error */ #define MACH_SEND_NO_GRANT_DEST 0x10000016 -/* The destination port doesn't accept ports in body */ +/* compatibility: no longer a returned error */ #define MACH_SEND_MSG_FILTERED 0x10000017 /* Message send was rejected by message filter */ #define MACH_SEND_AUX_TOO_SMALL 0x10000018 @@ -1414,15 +1407,6 @@ typedef kern_return_t mach_msg_return_t; #define MACH_RCV_INVALID_ARGUMENTS 0x10004013 /* invalid receive arguments, receive has not started */ -#ifdef XNU_KERNEL_PRIVATE -#if MACH_FLIPC -#define MACH_PEEK_IN_PROGRESS 0x10008001 -/* Waiting for a peek. (Internal use only.) */ -#define MACH_PEEK_READY 0x10008002 -/* Waiting for a peek. (Internal use only.) */ -#endif /* MACH_FLIPC */ -#endif - __BEGIN_DECLS @@ -1587,6 +1571,7 @@ typedef struct { mach_msg_size_t send_dsc_usize; mach_msg_size_t send_dsc_port_count; vm_size_t send_dsc_vm_size; + mach_msg_size_t send_dsc_port_arrays_count; } mach_msg_send_uctx_t; diff --git a/osfmk/mach/mk_timer.h b/osfmk/mach/mk_timer.h index 8b718f122..e675b6b93 100644 --- a/osfmk/mach/mk_timer.h +++ b/osfmk/mach/mk_timer.h @@ -39,6 +39,9 @@ #include #include +#ifdef MACH_KERNEL_PRIVATE +#include +#endif /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS @@ -55,6 +58,11 @@ extern kern_return_t mk_timer_cancel( mach_port_name_t name, uint64_t *result_time); +#ifdef MACH_KERNEL_PRIVATE +extern void mk_timer_port_label_dealloc( + ipc_object_label_t label); +#endif /* MACH_KERNEL_PRIVATE */ + /* mk_timer_flags */ #define MK_TIMER_NORMAL (0) #define MK_TIMER_CRITICAL (1) diff --git a/osfmk/mach/port.h b/osfmk/mach/port.h index 498cc6023..4fde76b2f 100644 --- a/osfmk/mach/port.h +++ b/osfmk/mach/port.h @@ -232,28 +232,12 @@ typedef mach_port_t *mach_port_array_t; * and reused too quickly [to catch right/reference counting bugs]. * The dividing line between the constituent parts is exposed so * that efficient "mach_port_name_t to data structure pointer" - * conversion implementation can be made. But it is possible - * for user-level code to assign their own names to Mach ports. - * These are not required to participate in this algorithm. So - * care should be taken before "assuming" this model. 
- * + * conversion implementation can be made. */ -#ifndef NO_PORT_GEN - #define MACH_PORT_INDEX(name) ((name) >> 8) #define MACH_PORT_GEN(name) (((name) & 0xff) << 24) -#define MACH_PORT_MAKE(index, gen) \ - (((index) << 8) | (gen) >> 24) - -#else /* NO_PORT_GEN */ - -#define MACH_PORT_INDEX(name) (name) -#define MACH_PORT_GEN(name) (0) -#define MACH_PORT_MAKE(index, gen) (index) - -#endif /* NO_PORT_GEN */ - +#define MACH_PORT_MAKE(index, gen) (((index) << 8) | ((gen) >> 24)) /* * These are the different rights a task may have for a port. @@ -300,8 +284,6 @@ __options_closed_decl(mach_port_type_t, uint32_t, { MACH_PORT_TYPE_SEND_ONCE = MACH_PORT_TYPE(MACH_PORT_RIGHT_SEND_ONCE), MACH_PORT_TYPE_PORT_SET = MACH_PORT_TYPE(MACH_PORT_RIGHT_PORT_SET), MACH_PORT_TYPE_DEAD_NAME = MACH_PORT_TYPE(MACH_PORT_RIGHT_DEAD_NAME), - /* Holder used to have a receive right - remembered to filter exceptions */ - MACH_PORT_TYPE_EX_RECEIVE = MACH_PORT_TYPE(MACH_PORT_RIGHT_LABELH), /* Dummy type bits that mach_port_type/mach_port_names can return. */ MACH_PORT_TYPE_DNREQUEST = 0x80000000, @@ -397,7 +379,7 @@ typedef struct mach_port_limits { #define MACH_PORT_STATUS_FLAG_REVIVE 0x10 #define MACH_PORT_STATUS_FLAG_TASKPTR 0x20 #define MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE 0x40 -#define MACH_PORT_STATUS_FLAG_NO_GRANT 0x80 +#define MACH_PORT_STATUS_FLAG_NO_GRANT 0x80 /* Obsolete */ typedef struct mach_port_info_ext { mach_port_status_t mpie_status; @@ -460,36 +442,99 @@ typedef struct mach_service_port_info { typedef struct mach_service_port_info * mach_service_port_info_t; +/* + * Platform binaries are not allowed to send OOL port array to any port. + * + * MACH_MSG_OOL_PORTS_DESCRIPTOR are allowed to be sent ONLY to receive + * rights that are explicitly allow to receive that descriptor. + * + * Such ports have a dedicated port type, and are created using the + * MPO_CONNECTION_PORT_WITH_PORT_ARRAY flag. + * + * Creation of such ports requires the binary to have the following entitlement. + */ +#define MACH_PORT_CONNECTION_PORT_WITH_PORT_ARRAY "com.apple.developer.allow-connection-port-with-port-array" + +/* Allows 1p process to create provisional reply port (to be rename to weak reply port) */ +#define MACH_PORT_PROVISIONAL_REPLY_ENTITLEMENT "com.apple.private.allow-weak-reply-port" + /* * Flags for mach_port_options (used for * invocation of mach_port_construct). * Indicates attributes to be set for the newly * allocated port. */ -#define MPO_CONTEXT_AS_GUARD 0x01 /* Add guard to the port */ -#define MPO_QLIMIT 0x02 /* Set qlimit for the port msg queue */ -#define MPO_TEMPOWNER 0x04 /* Set the tempowner bit of the port */ -#define MPO_IMPORTANCE_RECEIVER 0x08 /* Mark the port as importance receiver */ -#define MPO_INSERT_SEND_RIGHT 0x10 /* Insert a send right for the port */ -#define MPO_STRICT 0x20 /* Apply strict guarding for port */ -#define MPO_DENAP_RECEIVER 0x40 /* Mark the port as App de-nap receiver */ -#define MPO_IMMOVABLE_RECEIVE 0x80 /* Mark the port as immovable; protected by the guard context */ -#define MPO_FILTER_MSG 0x100 /* Allow message filtering */ -#define MPO_TG_BLOCK_TRACKING 0x200 /* Track blocking relationship for thread group during sync IPC */ -#define MPO_SERVICE_PORT 0x400 /* Create a service port with the given name; should be used only by launchd */ -#define MPO_CONNECTION_PORT 0x800 /* Derive new peer connection port from a given service port */ -#define MPO_REPLY_PORT 0x1000 /* Designate port as a reply port. 
*/ -#define MPO_ENFORCE_REPLY_PORT_SEMANTICS 0x2000 /* When talking to this port, local port of mach msg needs to follow reply port semantics.*/ -#define MPO_PROVISIONAL_REPLY_PORT 0x4000 /* Designate port as a provisional reply port. */ -#define MPO_EXCEPTION_PORT 0x8000 /* Used for hardened exceptions - immovable */ +/* MPO options flags */ +#define MPO_CONTEXT_AS_GUARD 0x01 /* Add guard to the port */ +#define MPO_QLIMIT 0x02 /* Set qlimit for the port msg queue */ +#define MPO_TEMPOWNER 0x04 /* Set the tempowner bit of the port */ +#define MPO_IMPORTANCE_RECEIVER 0x08 /* Mark the port as importance receiver */ +#define MPO_INSERT_SEND_RIGHT 0x10 /* Insert a send right for the port */ +#define MPO_STRICT 0x20 /* Apply strict guarding for port */ +#define MPO_DENAP_RECEIVER 0x40 /* Mark the port as App de-nap receiver */ +#define MPO_IMMOVABLE_RECEIVE 0x80 /* Mark the port as immovable; protected by the guard context */ +#define MPO_FILTER_MSG 0x100 /* Allow message filtering */ +#define MPO_TG_BLOCK_TRACKING 0x200 /* Track blocking relationship for thread group during sync IPC */ +#define MPO_ENFORCE_REPLY_PORT_SEMANTICS 0x2000 /* When talking to this port, local port of mach msg needs to follow reply port semantics.*/ +/* This service port has requested security hardening */ +#define MPO_STRICT_SERVICE_PORT (MPO_SERVICE_PORT | MPO_ENFORCE_REPLY_PORT_SEMANTICS) + +#define MPO_OPTIONS_MASK \ + (MPO_CONTEXT_AS_GUARD | \ + MPO_QLIMIT | \ + MPO_TEMPOWNER | \ + MPO_IMPORTANCE_RECEIVER | \ + MPO_INSERT_SEND_RIGHT | \ + MPO_STRICT | \ + MPO_DENAP_RECEIVER | \ + MPO_IMMOVABLE_RECEIVE | \ + MPO_FILTER_MSG | \ + MPO_TG_BLOCK_TRACKING | \ + MPO_ENFORCE_REPLY_PORT_SEMANTICS) + +/* MPO port type flags */ +#define MPO_MAKE_PORT_TYPE(a, b) (((a & 0x7) << 14) | ((b & 0x7) << 10)) +#define MPO_PORT_TYPE_MASK MPO_MAKE_PORT_TYPE(0x7, 0x7) /* 0x1dc00 */ +#if KERNEL_PRIVATE +__enum_closed_decl(mpo_flags_t, uint32_t, { +#else /* KERNEL_PRIVATE */ +/* These need to be defined for libxpc and other clients who `#ifdef` */ + #define MPO_PORT MPO_PORT + #define MPO_SERVICE_PORT MPO_SERVICE_PORT + #define MPO_CONNECTION_PORT MPO_CONNECTION_PORT + #define MPO_REPLY_PORT MPO_REPLY_PORT + #define MPO_PROVISIONAL_REPLY_PORT MPO_PROVISIONAL_REPLY_PORT + #define MPO_EXCEPTION_PORT MPO_EXCEPTION_PORT + #define MPO_CONNECTION_PORT_WITH_PORT_ARRAY MPO_CONNECTION_PORT_WITH_PORT_ARRAY +__options_decl(mpo_flags_t, uint32_t, { +#endif /* KERNEL_PRIVATE */ + /* Your classic IOT_PORT, an uninteresting message queue */ + MPO_PORT = MPO_MAKE_PORT_TYPE(0, 0), /* 0x0 */ + /* Create a service port with the given name; should be used only by launchd */ + MPO_SERVICE_PORT = MPO_MAKE_PORT_TYPE(0, 1), /* 0x400 */ + /* Derive new peer connection port from a given service port */ + MPO_CONNECTION_PORT = MPO_MAKE_PORT_TYPE(0, 2), /* 0x800 */ + /* Designate port as a reply port */ + MPO_REPLY_PORT = MPO_MAKE_PORT_TYPE(0, 4), /* 0x1000 */ + /* Designate port as a provisional (fake) reply port */ + MPO_PROVISIONAL_REPLY_PORT = MPO_MAKE_PORT_TYPE(1, 0), /* 0x4000 */ + /* Used for hardened exceptions - immovable */ + MPO_EXCEPTION_PORT = MPO_MAKE_PORT_TYPE(2, 0), /* 0x8000 */ + /* Can receive OOL port array descriptors */ + MPO_CONNECTION_PORT_WITH_PORT_ARRAY = MPO_MAKE_PORT_TYPE(4, 0), /* 0x10000 */ +}); +#define MPO_UNUSED_BITS ~(MPO_OPTIONS_MASK | MPO_PORT_TYPE_MASK) + +/* Denotes an anonymous service */ +#define MPO_ANONYMOUS_SERVICE (MACH_PORT_DEAD - 1) /* * Structure to define optional attributes for a newly * constructed port. 
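 *
 * A hedged usage sketch (illustrative only, not part of this patch), assuming
 * <mach/mach.h> and a caller requesting a guarded port plus a send right
 * (MPO_PORT, the default port type, is implied):
 *
 *     mach_port_options_t opts = {
 *         .flags = MPO_CONTEXT_AS_GUARD | MPO_INSERT_SEND_RIGHT,
 *     };
 *     mach_port_name_t name = MACH_PORT_NULL;
 *     kern_return_t kr = mach_port_construct(mach_task_self(), &opts,
 *         (mach_port_context_t)0x1234, &name);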
*/ typedef struct mach_port_options { - uint32_t flags; /* Flags defining attributes for port */ + uint32_t flags; mach_port_limits_t mpl; /* Message queue limit for port */ union { uint64_t reserved[2]; /* Reserved */ @@ -526,6 +571,7 @@ typedef mach_port_options_t *mach_port_options_ptr_t; * but are truly an enum, please add new values in the "holes". */ enum mach_port_guard_exception_codes { + kGUARD_EXC_NONE = 0, /* never sent */ kGUARD_EXC_DESTROY = 1, kGUARD_EXC_MOD_REFS = 2, kGUARD_EXC_INVALID_OPTIONS = 3, @@ -534,9 +580,14 @@ enum mach_port_guard_exception_codes { kGUARD_EXC_EXCEPTION_BEHAVIOR_ENFORCE = 6, kGUARD_EXC_SERVICE_PORT_VIOLATION_FATAL = 7, /* unused, for future sp defense enablement */ kGUARD_EXC_UNGUARDED = 8, + kGUARD_EXC_KOBJECT_REPLY_PORT_SEMANTICS = 9, + kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS = 10, kGUARD_EXC_INCORRECT_GUARD = 16, kGUARD_EXC_IMMOVABLE = 32, kGUARD_EXC_STRICT_REPLY = 64, + kGUARD_EXC_INVALID_NOTIFICATION_REQ = 65, + kGUARD_EXC_INVALID_MPO_ENTITLEMENT = 66, + kGUARD_EXC_DESCRIPTOR_VIOLATION = 67, kGUARD_EXC_MSG_FILTERED = 128, /* start of [optionally] non-fatal guards */ kGUARD_EXC_INVALID_RIGHT = 256, @@ -555,40 +606,125 @@ enum mach_port_guard_exception_codes { kGUARD_EXC_RCV_GUARDED_DESC = 0x00100000, /* for development only */ kGUARD_EXC_SERVICE_PORT_VIOLATION_NON_FATAL = 0x00100001, /* unused, for future sp defense enablement */ kGUARD_EXC_PROVISIONAL_REPLY_PORT = 0x00100002, + kGUARD_EXC_OOL_PORT_ARRAY_CREATION = 0x00100003, /* unused */ + kGUARD_EXC_MOVE_PROVISIONAL_REPLY_PORT = 0x00100004, + kGUARD_EXC_REPLY_PORT_SINGLE_SO_RIGHT = 0x00100005, kGUARD_EXC_MOD_REFS_NON_FATAL = 1u << 21, kGUARD_EXC_IMMOVABLE_NON_FATAL = 1u << 22, - kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS = 1u << 23, }; #define MAX_FATAL_kGUARD_EXC_CODE kGUARD_EXC_MSG_FILTERED #define MAX_OPTIONAL_kGUARD_EXC_CODE kGUARD_EXC_RCV_INVALID_NAME +#ifdef XNU_KERNEL_PRIVATE +/* + * Mach port guard payload construction helpers + * + * The order of the argument is the same as their position in + * the payload, with flag being the MSB and the last argument + * in the least siginificant end. + */ +#define MPG_24BIT_MASK ((0x1ULL << 24) - 1) + +/* + * +-------------+----------------+----------------------+ + * |[63:56] flag | [55:32] unused | [31:0] a | + * +-------------+----------------+----------------------+ + */ +__header_always_inline __attribute__((overloadable)) +uint64_t +MPG_PAYLOAD(uint8_t flag, uint32_t a) +{ + return ((uint64_t)flag << 56) | a; +} + +/* + * +-------------+----------------+----------------------+ + * |[63:56] flag | [55:32] a | [31:0] b | + * +-------------+----------------+----------------------+ + */ +__header_always_inline __attribute__((overloadable)) +uint64_t +MPG_PAYLOAD(uint8_t flag, uint32_t a, uint32_t b) +{ + return ((uint64_t)flag << 56) | + ((uint64_t)(a & MPG_24BIT_MASK) << 32) | b; +} + +/* + * +-------------+----------------+-----------+----------+ + * |[63:56] flag | [55:32] a | [31:16] b | [15:0] c | + * +-------------+----------------+-----------+----------+ + */ +__header_always_inline __attribute__((overloadable)) +uint64_t +MPG_PAYLOAD(uint8_t flag, uint32_t a, uint16_t b, uint16_t c) +{ + return ((uint64_t)flag << 56) | + ((uint64_t)(a & MPG_24BIT_MASK) << 32) | + ((uint64_t)b << 16) | + c; +} +#endif /* XNU_KERNEL_PRIVATE */ + /* * Mach port guard flags. */ -#define MPG_FLAGS_NONE (0x00ull) +#define MPG_FLAGS_NONE 0x00 /* * These flags are used as bits in the subcode of kGUARD_EXC_STRICT_REPLY exceptions. 
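 *
 * Given the MPG_PAYLOAD() layout above, the flag byte of a guard-exception
 * subcode can be recovered as follows (a sketch, not part of this patch;
 * `subcode` is an assumed uint64_t taken from the exception):
 *
 *     uint8_t flag  = (uint8_t)(subcode >> 56);
 *     uint32_t low  = (uint32_t)subcode;   // low 32 payload bits per the diagrams above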
*/ -#define MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_DISP (0x01ull << 56) -#define MPG_FLAGS_STRICT_REPLY_INVALID_REPLY_PORT (0x02ull << 56) -#define MPG_FLAGS_STRICT_REPLY_INVALID_VOUCHER (0x04ull << 56) -#define MPG_FLAGS_STRICT_REPLY_NO_BANK_ATTR (0x08ull << 56) -#define MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA (0x10ull << 56) -#define MPG_FLAGS_STRICT_REPLY_MASK (0xffull << 56) +#define MPG_FLAGS_STRICT_REPLY_INVALID_VOUCHER 0x04 +#define MPG_FLAGS_STRICT_REPLY_MISMATCHED_PERSONA 0x10 /* * These flags are used as bits in the subcode of kGUARD_EXC_MOD_REFS exceptions. */ -#define MPG_FLAGS_MOD_REFS_PINNED_DEALLOC (0x01ull << 56) -#define MPG_FLAGS_MOD_REFS_PINNED_DESTROY (0x02ull << 56) -#define MPG_FLAGS_MOD_REFS_PINNED_COPYIN (0x04ull << 56) +#define MPG_FLAGS_MOD_REFS_PINNED_DEALLOC 0x01 +#define MPG_FLAGS_MOD_REFS_PINNED_DESTROY 0x02 +#define MPG_FLAGS_MOD_REFS_PINNED_COPYIN 0x03 /* - * These flags are used as bits in the subcode of kGUARD_EXC_IMMOVABLE exceptions. + * These flags are used as bits in the subcode of kGUARD_EXC_INVALID_RIGHT exceptions. */ -#define MPG_FLAGS_IMMOVABLE_PINNED (0x01ull << 56) +#define MPG_FLAGS_INVALID_RIGHT_RECV 0x01 /* does not have receive right */ +#define MPG_FLAGS_INVALID_RIGHT_DELTA 0x02 /* ipc_right_delta() */ +#define MPG_FLAGS_INVALID_RIGHT_DESTRUCT 0x03 /* ipc_right_destruct() */ +#define MPG_FLAGS_INVALID_RIGHT_COPYIN 0x04 /* ipc_right_copyin() */ +#define MPG_FLAGS_INVALID_RIGHT_DEALLOC 0x05 /* ipc_right_dealloc() */ +#define MPG_FLAGS_INVALID_RIGHT_DEALLOC_KERNEL 0x06 /* mach_port_deallocate_kernel() */ +#define MPG_FLAGS_INVALID_RIGHT_TRANSLATE_PORT 0x07 /* port in ipc_object_translate_port_pset() */ +#define MPG_FLAGS_INVALID_RIGHT_TRANSLATE_PSET 0x08 /* pset in ipc_object_translate_port_pset() */ + +/* + * These flags are used as bits in the subcode of kGUARD_EXC_INVALID_VALUE exceptions. + */ +#define MPG_FLAGS_INVALID_VALUE_PEEK 0x01 /* mach_port_peek() */ +#define MPG_FLAGS_INVALID_VALUE_DELTA 0x02 /* ipc_right_delta() */ +#define MPG_FLAGS_INVALID_VALUE_DESTRUCT 0x03 /* ipc_right_destruct() */ + +/* + * These flags are used as bits in the subcode of kGUARD_EXC_KERN_FAILURE exceptions. + */ +#define MPG_FLAGS_KERN_FAILURE_TASK 0x01 /* task other than launchd arm pd on service ports */ +#define MPG_FLAGS_KERN_FAILURE_NOTIFY_TYPE 0x02 /* not using IOT_NOTIFICATION_PORT for pd notification */ +#define MPG_FLAGS_KERN_FAILURE_NOTIFY_RECV 0x03 /* notification port not owned by launchd */ +#define MPG_FLAGS_KERN_FAILURE_MULTI_NOTI 0x04 /* register multiple pd notification */ + +/* + * These flags are used as bits in the subcode of kGUARD_EXC_SEND_INVALID_RIGHT exceptions. + */ +#define MPG_FLAGS_SEND_INVALID_RIGHT_PORT 0x01 /* ipc_kmsg_copyin_port_descriptor() */ +#define MPG_FLAGS_SEND_INVALID_RIGHT_OOL_PORT 0x02 /* ipc_kmsg_copyin_ool_ports_descriptor() */ +#define MPG_FLAGS_SEND_INVALID_RIGHT_GUARDED 0x03 /* ipc_kmsg_copyin_guarded_port_descriptor */ + +/* + * These flags are used as bits in the subcode of kGUARD_EXC_INVALID_OPTIONS exceptions. + */ +#define MPG_FLAGS_INVALID_OPTIONS_OOL_DISP 0x01 /* ipc_kmsg_copyin_ool_ports_descriptor() */ +#define MPG_FLAGS_INVALID_OPTIONS_OOL_ARRAYS 0x02 /* ipc_validate_kmsg_header_from_user() */ +#define MPG_FLAGS_INVALID_OPTIONS_OOL_RIGHT 0x03 /* ipc_validate_kmsg_header_from_user() */ /* * Flags for mach_port_guard_with_flags. 
These flags extend diff --git a/osfmk/mach/sfi_class.h b/osfmk/mach/sfi_class.h index 3841f37dd..46a4c8e69 100644 --- a/osfmk/mach/sfi_class.h +++ b/osfmk/mach/sfi_class.h @@ -52,7 +52,7 @@ typedef uint32_t sfi_class_id_t; * Total number of classes supported including SFI_CLASS_UNSPECIFIED. * If new class is defined increase this number. */ -#define MAX_SFI_CLASS_ID 0x00000011 +#define MAX_SFI_CLASS_ID 0x00000012 /* * Threads may initially start out unspecified @@ -126,6 +126,12 @@ typedef uint32_t sfi_class_id_t; #define SFI_CLASS_USER_INTERACTIVE_NONFOCAL 0x0000000F #define SFI_CLASS_MAINTENANCE 0x00000010 +/* + * Threads that have been tagged for runaway mitigation and are + * not turnstile boosted by something not-mitigated. + */ +#define SFI_CLASS_RUNAWAY_MITIGATION 0x00000011 + /* * Windows that are specified smaller than MIN_SFI_WINDOW_USEC * will be automatically rounded up. diff --git a/osfmk/mach/syscall_sw.h b/osfmk/mach/syscall_sw.h index bf79f2a36..f6e3b6bea 100644 --- a/osfmk/mach/syscall_sw.h +++ b/osfmk/mach/syscall_sw.h @@ -147,6 +147,9 @@ kernel_trap(swtch,-60,0) kernel_trap(syscall_thread_switch,-61,3) kernel_trap(clock_sleep_trap,-62,5) +#if defined(__LP64__) +kernel_trap(mach_vm_reclaim_update_kernel_accounting_trap,-63,2) +#endif /* __LP64__ */ /* voucher traps */ kernel_trap(host_create_mach_voucher_trap,-70,4) diff --git a/osfmk/mach/task_info.h b/osfmk/mach/task_info.h index 670a13001..995d6c5ad 100644 --- a/osfmk/mach/task_info.h +++ b/osfmk/mach/task_info.h @@ -576,6 +576,25 @@ typedef struct task_suspend_source_s task_suspend_source_data_t; #endif /* PRIVATE */ +#define TASK_SECURITY_CONFIG_INFO 32 /* Runtime security mitigations configuration for the task */ +struct task_security_config_info { + uint32_t config; /* Configuration bitmask */ +}; + +typedef struct task_security_config_info * task_security_config_info_t; +#define TASK_SECURITY_CONFIG_INFO_COUNT ((mach_msg_type_number_t) \ + (sizeof(struct task_security_config_info) / sizeof(natural_t))) + + +#define TASK_IPC_SPACE_POLICY_INFO 33 /* Runtime security mitigations configuration for the task */ +struct task_ipc_space_policy_info { + uint32_t space_policy; /* Configuration bitmask */ +}; + +typedef struct task_ipc_space_policy_info * task_ipc_space_policy_info_t; +#define TASK_IPC_SPACE_POLICY_INFO_COUNT ((mach_msg_type_number_t) \ + (sizeof(struct task_ipc_space_policy_info) / sizeof(natural_t))) + /* * Type to control EXC_GUARD delivery options for a task * via task_get/set_exc_guard_behavior interface(s). 
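/*
 * A hedged sketch of querying the TASK_SECURITY_CONFIG_INFO flavor added in
 * the task_info.h hunk above (illustrative only; this patch does not specify
 * which callers may query it):
 *
 *     struct task_security_config_info sec = {0};
 *     mach_msg_type_number_t count = TASK_SECURITY_CONFIG_INFO_COUNT;
 *     kern_return_t kr = task_info(mach_task_self(), TASK_SECURITY_CONFIG_INFO,
 *         (task_info_t)&sec, &count);
 */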
@@ -622,14 +641,13 @@ typedef uint32_t task_corpse_forking_behavior_t; __options_decl(task_control_port_options_t, uint32_t, { TASK_CONTROL_PORT_OPTIONS_NONE = 0x00000000, - TASK_CONTROL_PORT_PINNED_SOFT = 0x00000001, - TASK_CONTROL_PORT_PINNED_HARD = 0x00000002, - TASK_CONTROL_PORT_IMMOVABLE_SOFT = 0x00000004, - TASK_CONTROL_PORT_IMMOVABLE_HARD = 0x00000008, -}); + TASK_CONTROL_PORT_IMMOVABLE_SOFT = 0x00000001, + TASK_CONTROL_PORT_IMMOVABLE_HARD = 0x00000002, -#define TASK_CONTROL_PORT_IMMOVABLE (TASK_CONTROL_PORT_IMMOVABLE_SOFT | TASK_CONTROL_PORT_IMMOVABLE_HARD) -#define TASK_CONTROL_PORT_PINNED (TASK_CONTROL_PORT_PINNED_SOFT | TASK_CONTROL_PORT_PINNED_HARD) + TASK_CONTROL_PORT_IMMOVABLE_MASK = ( + TASK_CONTROL_PORT_IMMOVABLE_SOFT | + TASK_CONTROL_PORT_IMMOVABLE_HARD), +}); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/mach/task_policy.h b/osfmk/mach/task_policy.h index 36fd8eb57..c3d1702bf 100644 --- a/osfmk/mach/task_policy.h +++ b/osfmk/mach/task_policy.h @@ -121,6 +121,7 @@ typedef enum task_role { TASK_NONUI_APPLICATION = 6, TASK_DEFAULT_APPLICATION = 7, TASK_DARWINBG_APPLICATION = 8, + TASK_USER_INIT_APPLICATION = 9, } task_role_t; struct task_category_policy { diff --git a/osfmk/mach/task_policy_private.h b/osfmk/mach/task_policy_private.h index 14b16cd04..7ef8b7bbc 100644 --- a/osfmk/mach/task_policy_private.h +++ b/osfmk/mach/task_policy_private.h @@ -40,7 +40,7 @@ * When they do, we will update TASK_POLICY_INTERNAL_STRUCT_VERSION. */ -#define TASK_POLICY_INTERNAL_STRUCT_VERSION 4 +#define TASK_POLICY_INTERNAL_STRUCT_VERSION 5 #define trp_tal_enabled trp_reserved /* trp_tal_enabled is unused, reuse its slot to grow trp_role */ @@ -73,7 +73,9 @@ struct task_requested_policy { trp_sup_cpu :1, /* Wants suppressed CPU priority (MAXPRI_SUPPRESSED) */ trp_sup_bg_sockets :1, /* Wants background sockets */ - trp_reserved :17; + trp_runaway_mitigation :1, /* Is runaway-mitigated */ + + trp_reserved :16; }; struct task_effective_policy { @@ -98,10 +100,11 @@ struct task_effective_policy { tep_live_donor :1, /* task is a live importance boost donor */ tep_qos_clamp :3, /* task qos clamp (applies to qos-disabled threads too) */ tep_qos_ceiling :3, /* task qos ceiling (applies to only qos-participating threads) */ - tep_adaptive_bg :1, /* task is bg solely due to the adaptive daemon clamp */ + tep_promote_above_task :1, /* task should allow turnstiles to boost above BG clamp */ tep_coalition_bg :1, /* task is bg due to coalition suppresssion */ + tep_runaway_mitigation :1, /* task is bg due to runaway-mitigation */ - tep_reserved :29; + tep_reserved :28; }; /* @@ -161,7 +164,8 @@ typedef struct task_policy_state *task_policy_state_t; #define TASK_APPTYPE_DAEMON_ADAPTIVE 3 #define TASK_APPTYPE_DAEMON_BACKGROUND 4 #define TASK_APPTYPE_APP_DEFAULT 5 -#define TASK_APPTYPE_APP_TAL 6 /* unused */ +#define TASK_APPTYPE_APP_NONUI 6 +#define TASK_APPTYPE_APP_TAL TASK_APPTYPE_APP_NONUI /* unused */ #define TASK_APPTYPE_DRIVER 7 /* task policy state flags */ diff --git a/osfmk/mach/thread_info.h b/osfmk/mach/thread_info.h index a20d3dd90..4d717462a 100644 --- a/osfmk/mach/thread_info.h +++ b/osfmk/mach/thread_info.h @@ -74,7 +74,6 @@ #include #include #include - /* * Generic information structure to allow for expansion. 
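 *
 * The expansion pattern is the usual count-in/count-out flavor call; a hedged
 * sketch (illustrative only, not part of this patch):
 *
 *     thread_basic_info_data_t info;
 *     mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
 *     kern_return_t kr = thread_info(mach_thread_self(), THREAD_BASIC_INFO,
 *         (thread_info_t)&info, &count);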
*/ @@ -215,6 +214,7 @@ typedef struct io_stat_info *io_stat_info_t; __BEGIN_DECLS void thread_group_join_io_storage(void); void thread_group_join_perf_controller(void); +void thread_group_join_cellular(void); __END_DECLS #endif diff --git a/osfmk/mach/vm_param.h b/osfmk/mach/vm_param.h index 7e47f63ac..fbe223043 100644 --- a/osfmk/mach/vm_param.h +++ b/osfmk/mach/vm_param.h @@ -261,14 +261,6 @@ extern uint64_t max_mem; /* 64-bit size of memory - limit */ #define MALLOC_MEDIUM_CHUNK_SIZE (8ULL * 1024 * 1024) /* 8 MB */ -/** - * The size of the largest allocation which can be used in the kernel without - * special accessors/attributes. When using accessors/attributes, this limit can - * be overridden when making allocations/mappings through various APIs by - * setting the "no soft limit" option. - */ -#define VM_KERNEL_SIMPLE_MAX_SIZE (1ULL << 30) /* 1GB */ - #ifdef KERNEL_PRIVATE extern uint64_t sane_size; /* Memory size to use for defaults calculations */ #endif /* KERNEL_PRIVATE */ @@ -408,9 +400,13 @@ vm_is_addr_slid(vm_offset_t addr) #define VM_KERNEL_ADDRHASH(_v) vm_kernel_addrhash((vm_offset_t)(_v)) +/* + * ML_ADDRPERM is defined as a macro that dispatches to the correct machine version. + * For systems that support the generic ml_addrperm version, the actual slide address is unused. + */ #define VM_KERNEL_UNSLIDE_OR_PERM(_v) ({ \ VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : \ - VM_KERNEL_ADDRESS(_v) ? ((vm_offset_t)VM_KERNEL_STRIP_PTR(_v) + vm_kernel_addrperm) : \ + VM_KERNEL_ADDRESS(_v) ? (ML_ADDRPERM((uintptr_t)VM_KERNEL_STRIP_UPTR(_v), vm_kernel_addrperm)) : \ (vm_offset_t)VM_KERNEL_STRIP_PTR(_v); \ }) diff --git a/osfmk/mach/vm_reclaim_private.h b/osfmk/mach/vm_reclaim_private.h index b0054df0b..befaf9410 100644 --- a/osfmk/mach/vm_reclaim_private.h +++ b/osfmk/mach/vm_reclaim_private.h @@ -31,12 +31,6 @@ #include #include -/* - * This header exists for the internal implementation in libsyscall/xnu - * and for observability with debugging tools. It should _NOT_ be used by - * clients. - */ - #define VM_RECLAIM_MAX_BUFFER_SIZE (128ull << 20) #define VM_RECLAIM_MAX_CAPACITY ((VM_RECLAIM_MAX_BUFFER_SIZE - \ offsetof(struct mach_vm_reclaim_ring_s, entries)) / \ @@ -44,12 +38,6 @@ __BEGIN_DECLS -typedef struct mach_vm_reclaim_indices_s { - _Atomic mach_vm_reclaim_id_t head; - _Atomic mach_vm_reclaim_id_t tail; - _Atomic mach_vm_reclaim_id_t busy; -} *mach_vm_reclaim_indices_t; - typedef struct mach_vm_reclaim_entry_s { mach_vm_address_t address; uint32_t size; @@ -57,24 +45,161 @@ typedef struct mach_vm_reclaim_entry_s { uint8_t _unused[3]; } *mach_vm_reclaim_entry_t; +/* This struct is no longer used () */ +typedef struct mach_vm_reclaim_indices_s { + _Atomic mach_vm_reclaim_id_t head; + _Atomic mach_vm_reclaim_id_t tail; + _Atomic mach_vm_reclaim_id_t busy; +} *mach_vm_reclaim_indices_t; + /* - * Contains the data used for synchronization with the kernel. This structure - * should be page-aligned. 
+ * Contains the data used for synchronization with the kernel */ struct mach_vm_reclaim_ring_s { + /* no longer used () */ mach_vm_size_t va_in_buffer; + /* no longer used () */ mach_vm_size_t last_accounting_given_to_kernel; + /* The current length of the ringbuffer */ mach_vm_reclaim_count_t len; + /* The maximum length of the ringbuffer */ mach_vm_reclaim_count_t max_len; + /* no longer used () */ struct mach_vm_reclaim_indices_s indices; + /* The minimum period of time between kernel accounting updates */ + uint64_t sampling_period_abs; + /* timestamp (MAS) of the last kernel accounting update */ + uint64_t last_sample_abs; + /* + * An estimate for the number of reclaimable bytes currently in the ring. This + * is updating atomically after entering a new reclaimable region, after + * successfully cancelling a region, and after reclaiming regions. + */ + _Atomic uint64_t reclaimable_bytes; + /* + * The minimum amount of reclaimable memory in this buffer for the current + * sampling interval. + */ + _Atomic uint64_t reclaimable_bytes_min; + /* Marks IDs which have been reclaimed */ + _Atomic mach_vm_reclaim_id_t head; + /* Marks IDs which are in the process of being reclaimed */ + _Atomic mach_vm_reclaim_id_t busy; + /* The ID of the most recent entry */ + _Atomic mach_vm_reclaim_id_t tail; + /* Pad to a multiple of the entry size */ + uint64_t _unused; /* * The ringbuffer entries themselves populate the remainder of this * buffer's vm allocation. - * NB: the fields preceding `entries` must be aligned to a multiple of + * NB: the fields preceding `entries` should be aligned to a multiple of * the entry size. */ struct mach_vm_reclaim_entry_s entries[] __counted_by(len); }; +/* + * The above definitions exist for the internal implementation in libsyscall / + * xnu and for observability with debugging tools. They should _NOT_ be used by + * clients. + */ + +#if !KERNEL + +/* + * The below interfaces are intended for observing a task's reclaim ring(s) and + * querying which regions are reclaimable. General usage would look something + * like the following: + * + * - Use `mach_vm_reclaim_get_rings_for_task` to get a list of reclaim rings + * for a task. + * - Use `mach_vm_reclaim_ring_copy` for each ring to map a copy of the + * reclaim ring into your address space. + * - Use `mach_vm_reclaim_copied_ring_query` to query a list of reclaimable + * regions in the ring. + * - Use `mach_vm_reclaim_copied_ring_free` to free the copied reclaim ring. + */ + +/// A descriptor for a reclaimable region +typedef struct mach_vm_reclaim_region_s { + mach_vm_address_t vmrr_addr; + mach_vm_size_t vmrr_size; + mach_vm_reclaim_action_t vmrr_behavior; + uint8_t _vmrr_unused[3]; +} *mach_vm_reclaim_region_t; + +/// A reference to a task's reclaim ring +typedef struct mach_vm_reclaim_ring_ref_s { + mach_vm_address_t addr; + mach_vm_size_t size; +} *mach_vm_reclaim_ring_ref_t; + +/// A reclaim ring copied from another task +typedef void *mach_vm_reclaim_ring_copy_t; + +/// Get references to another task's reclaim rings. +/// +/// - Parameters: +/// - task: The target task +/// - refs_out: A buffer to store the references in. If NULL, only the number +/// of rings will be queried. +/// - count_inout: A pointer to the count of the buffer, which will be +/// overwritten with the number of rings in the target task. +/// +/// - Returns: `VM_RECLAIM_SUCCESS` upon success. 
+__SPI_AVAILABLE(macos(16.0), ios(19.0), tvos(19.0), visionos(3.0)) +mach_vm_reclaim_error_t mach_vm_reclaim_get_rings_for_task( + task_read_t task, + mach_vm_reclaim_ring_ref_t refs_out, + mach_vm_reclaim_count_t *count_inout); + +/// Copy another task's reclaim ring into this task's VA. +/// +/// - Parameters: +/// - task: The task to copy the ring from +/// - ref: The reference to the ring to copy +/// (obtained via mach_vm_reclaim_get_rings_for_task). +/// - ring_out: The pointer to the copied ring to be written out upon success +/// +/// - Returns: `VM_RECLAIM_SUCCESS` upon success. +__SPI_AVAILABLE(macos(16.0), ios(19.0), tvos(19.0), visionos(3.0)) +mach_vm_reclaim_error_t mach_vm_reclaim_ring_copy( + task_read_t task, + mach_vm_reclaim_ring_ref_t ref, + mach_vm_reclaim_ring_copy_t *ring_out); + +/// Free a reclaim ring copied from another task. +/// +/// - Parameters: +/// - ring: The copied ring to free. +/// +/// - Returns: `VM_RECLAIM_SUCCESS` upon success. +__SPI_AVAILABLE(macos(16.0), ios(19.0), tvos(19.0), visionos(3.0)) +mach_vm_reclaim_error_t mach_vm_reclaim_copied_ring_free( + mach_vm_reclaim_ring_copy_t *ring); + +/// Query the reclaimable regions in a copied reclaim ring. +/// +/// - Parameters: +/// - ring: The ring to query +/// - regions_out: A buffer of `mach_vm_reclaim_region_s` to copy the query +/// results into. If NULL, only the size of the ring will be queried. +/// - count_inout: A pointer to the size, in regions, of the buffer. Will +/// be overwritten with the count of regions in the ring upon success. +/// +/// - Returns: `VM_RECLAIM_SUCCESS` on success. +/// `KERN_NO_SPACE` if there is insufficient space in the buffer to store +/// the queried ring's regions. +/// `VM_RECLAIM_INVALID_CAPACITY` if the ringbuffer structure was malformed +/// and had a buffer too small for its reported size. The entries that were +/// able to be queried and the count will still be written out. +__SPI_AVAILABLE(macos(16.0), ios(19.0), tvos(19.0), visionos(3.0)) +mach_vm_reclaim_error_t mach_vm_reclaim_copied_ring_query( + mach_vm_reclaim_ring_copy_t *ring, + mach_vm_reclaim_region_t regions_out, + mach_vm_reclaim_count_t *count_inout); + +#endif /* !KERNEL */ + __END_DECLS #endif /* __LP64__ */ diff --git a/osfmk/mach/vm_region.h b/osfmk/mach/vm_region.h index e101e7330..429e97c52 100644 --- a/osfmk/mach/vm_region.h +++ b/osfmk/mach/vm_region.h @@ -261,6 +261,7 @@ typedef struct vm_region_submap_info vm_region_submap_info_data_t; (sizeof(vm_region_submap_info_data_t) / sizeof(natural_t))) struct vm_region_submap_info_64 { + /* v0 fields */ vm_prot_t protection; /* present access protection */ vm_prot_t max_protection; /* max avail through vm_prot */ vm_inherit_t inheritance;/* behavior of map/obj on fork */ @@ -278,18 +279,30 @@ struct vm_region_submap_info_64 { vm_behavior_t behavior; /* access behavior hint */ vm32_object_id_t object_id; /* obj/map name, not a handle */ unsigned short user_wired_count; + unsigned short flags; + /* v1 fields */ unsigned int pages_reusable; + /* v2 fields */ vm_object_id_t object_id_full; }; typedef struct vm_region_submap_info_64 *vm_region_submap_info_64_t; typedef struct vm_region_submap_info_64 vm_region_submap_info_data_64_t; +/* + * Note that this size is hard-coded at the MIG boundary in mach_types.defs + * so if we ever increase this you'll need to also bump the definition of + * vm_region_recurse_info_t. 
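 *
 * A hedged usage sketch of how these counts are typically consumed
 * (illustrative only, not part of this patch):
 *
 *     vm_region_submap_info_data_64_t info;
 *     mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
 *     mach_vm_address_t addr = 0;
 *     mach_vm_size_t size = 0;
 *     natural_t depth = 0;
 *     kern_return_t kr = mach_vm_region_recurse(mach_task_self(), &addr,
 *         &size, &depth, (vm_region_recurse_info_t)&info, &count);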
+ */ #define VM_REGION_SUBMAP_INFO_V2_SIZE \ (sizeof (vm_region_submap_info_data_64_t)) + +/* v1 size is v2 size minus v2's new fields */ #define VM_REGION_SUBMAP_INFO_V1_SIZE \ (VM_REGION_SUBMAP_INFO_V2_SIZE - \ sizeof (vm_object_id_t) /* object_id_full */ ) + +/* v0 size is v1 size minus v1's new fields */ #define VM_REGION_SUBMAP_INFO_V0_SIZE \ (VM_REGION_SUBMAP_INFO_V1_SIZE - \ sizeof (unsigned int) /* pages_reusable */ ) @@ -307,6 +320,10 @@ typedef struct vm_region_submap_info_64 vm_region_submap_info_data_64_t /* set this to the latest version */ #define VM_REGION_SUBMAP_INFO_COUNT_64 VM_REGION_SUBMAP_INFO_V2_COUNT_64 +#define VM_REGION_FLAG_JIT_ENABLED 0x1 +#define VM_REGION_FLAG_TPRO_ENABLED 0x2 + + struct vm_region_submap_short_info_64 { vm_prot_t protection; /* present access protection */ vm_prot_t max_protection; /* max avail through vm_prot */ @@ -321,6 +338,7 @@ struct vm_region_submap_short_info_64 { vm_behavior_t behavior; /* access behavior hint */ vm32_object_id_t object_id; /* obj/map name, not a handle */ unsigned short user_wired_count; + unsigned short flags; }; typedef struct vm_region_submap_short_info_64 *vm_region_submap_short_info_64_t; diff --git a/osfmk/mach/vm_statistics.h b/osfmk/mach/vm_statistics.h index 4a5d91663..006575f5e 100644 --- a/osfmk/mach/vm_statistics.h +++ b/osfmk/mach/vm_statistics.h @@ -66,6 +66,9 @@ #ifndef _MACH_VM_STATISTICS_H_ #define _MACH_VM_STATISTICS_H_ + +#include +#include #include #include @@ -74,6 +77,8 @@ __BEGIN_DECLS +#pragma mark VM Statistics + /* * vm_statistics * @@ -141,7 +146,7 @@ struct vm_statistics64 { natural_t wire_count; /* # of pages wired down */ uint64_t zero_fill_count; /* # of zero fill pages */ uint64_t reactivations; /* # of pages reactivated */ - uint64_t pageins; /* # of pageins */ + uint64_t pageins; /* # of pageins (lifetime) */ uint64_t pageouts; /* # of pageouts */ uint64_t faults; /* # of faults */ uint64_t cow_faults; /* # of copy-on-writes */ @@ -158,15 +163,17 @@ struct vm_statistics64 { natural_t speculative_count; /* # of pages speculative */ /* added for rev1 */ - uint64_t decompressions; /* # of pages decompressed */ - uint64_t compressions; /* # of pages compressed */ - uint64_t swapins; /* # of pages swapped in (via compression segments) */ - uint64_t swapouts; /* # of pages swapped out (via compression segments) */ + uint64_t decompressions; /* # of pages decompressed (lifetime) */ + uint64_t compressions; /* # of pages compressed (lifetime) */ + uint64_t swapins; /* # of pages swapped in via compressor segments (lifetime) */ + uint64_t swapouts; /* # of pages swapped out via compressor segments (lifetime) */ natural_t compressor_page_count; /* # of pages used by the compressed pager to hold all the compressed data */ natural_t throttled_count; /* # of pages throttled */ natural_t external_page_count; /* # of pages that are file-backed (non-swap) */ natural_t internal_page_count; /* # of pages that are anonymous */ uint64_t total_uncompressed_pages_in_compressor; /* # of pages (uncompressed) held within the compressor. 
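 *
 * A hedged sketch of reading these counters (illustrative only, not part of
 * this patch); callers built against older revisions pass a smaller count,
 * e.g. HOST_VM_INFO64_REV1_COUNT, and simply never see the rev2 fields:
 *
 *     vm_statistics64_data_t vmstat;
 *     mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
 *     kern_return_t kr = host_statistics64(mach_host_self(), HOST_VM_INFO64,
 *         (host_info64_t)&vmstat, &count);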
*/ + /* added for rev2 */ + uint64_t swapped_count; /* # of compressor-stored pages currently stored in swap */ } __attribute__((aligned(8))); typedef struct vm_statistics64 *vm_statistics64_t; @@ -232,6 +239,8 @@ typedef struct vm_purgeable_info *vm_purgeable_info_t; #define VM_PAGE_QUERY_PAGE_CS_NX 0x400 #define VM_PAGE_QUERY_PAGE_REUSABLE 0x800 +#pragma mark User Flags + /* * VM allocation flags: * @@ -364,33 +373,28 @@ typedef struct vm_purgeable_info *vm_purgeable_info_t; __enum_decl(virtual_memory_guard_exception_code_t, uint32_t, { kGUARD_EXC_DEALLOC_GAP = 1, kGUARD_EXC_RECLAIM_COPYIO_FAILURE = 2, - kGUARD_EXC_SEC_LOOKUP_DENIED = 3, kGUARD_EXC_RECLAIM_INDEX_FAILURE = 4, - kGUARD_EXC_SEC_RANGE_DENIED = 6, - kGUARD_EXC_SEC_ACCESS_FAULT = 7, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE = 8, - kGUARD_EXC_SEC_COPY_DENIED = 16, - kGUARD_EXC_SEC_SHARING_DENIED = 32, - kGUARD_EXC_SEC_ASYNC_ACCESS_FAULT = 64, + kGUARD_EXC_RECLAIM_ACCOUNTING_FAILURE = 9, + kGUARD_EXC_SEC_IOPL_ON_EXEC_PAGE = 10, + kGUARD_EXC_SEC_EXEC_ON_IOPL_PAGE = 11, + kGUARD_EXC_SEC_UPL_WRITE_ON_EXEC_REGION = 12, + /* + * rdar://151450801 (Remove spurious kGUARD_EXC_SEC_ACCESS_FAULT and kGUARD_EXC_SEC_ASYNC_ACCESS_FAULT once CrashReporter is aligned) + */ + kGUARD_EXC_SEC_ACCESS_FAULT = 98, + kGUARD_EXC_SEC_ASYNC_ACCESS_FAULT = 99, + /* VM policy decisions */ + kGUARD_EXC_SEC_COPY_DENIED = 100, + kGUARD_EXC_SEC_SHARING_DENIED = 101, + }); + #ifdef XNU_KERNEL_PRIVATE -static inline bool -vm_guard_is_sec_access(uint32_t flavor) -{ - return flavor == kGUARD_EXC_SEC_ACCESS_FAULT || - flavor == kGUARD_EXC_SEC_ASYNC_ACCESS_FAULT; -} -static inline bool -vm_guard_is_sec_policy(uint32_t flavor) -{ - return flavor == kGUARD_EXC_SEC_LOOKUP_DENIED || - flavor == kGUARD_EXC_SEC_RANGE_DENIED || - flavor == kGUARD_EXC_SEC_COPY_DENIED || - flavor == kGUARD_EXC_SEC_SHARING_DENIED; -} +#pragma mark Map Ranges /*! * @enum vm_map_range_id_t @@ -420,12 +424,13 @@ vm_guard_is_sec_policy(uint32_t flavor) * - OOBs on the allocation is carefully considered and sufficiently * addressed. * - * @const KMEM_RANGE_ID_IOKIT - * Range containing memory mappings belonging to IOKit. - * * @const KMEM_RANGE_ID_DATA * Range containing allocations that are bags of bytes and contain no * pointers. + * + * @const KMEM_RANGE_ID_DATA_SHARED + * Range containing allocations that are bags of bytes and contain no + * pointers and meant to be shared with external domains. */ __enum_decl(vm_map_range_id_t, uint8_t, { KMEM_RANGE_ID_NONE, @@ -433,12 +438,12 @@ __enum_decl(vm_map_range_id_t, uint8_t, { KMEM_RANGE_ID_PTR_1, KMEM_RANGE_ID_PTR_2, KMEM_RANGE_ID_SPRAYQTN, - KMEM_RANGE_ID_IOKIT, KMEM_RANGE_ID_DATA, + KMEM_RANGE_ID_DATA_SHARED, KMEM_RANGE_ID_FIRST = KMEM_RANGE_ID_PTR_0, KMEM_RANGE_ID_NUM_PTR = KMEM_RANGE_ID_PTR_2, - KMEM_RANGE_ID_MAX = KMEM_RANGE_ID_DATA, + KMEM_RANGE_ID_MAX = KMEM_RANGE_ID_DATA_SHARED, /* these UMEM_* correspond to the MACH_VM_RANGE_* tags and are ABI */ UMEM_RANGE_ID_DEFAULT = 0, /* same as MACH_VM_RANGE_DEFAULT */ @@ -459,6 +464,8 @@ typedef vm_map_range_id_t kmem_range_id_t; ? KMEM_RANGE_ID_MAX : UMEM_RANGE_ID_MAX) #define KMEM_RANGE_BITS kmem_log2down(2 * KMEM_RANGE_MAX - 1) +#pragma mark Kernel Flags + typedef union { struct { unsigned long long @@ -570,9 +577,13 @@ typedef union { #define VM_MAP_KERNEL_FLAGS_ANYWHERE_PERMANENT(...) \ VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmf_permanent = true, __VA_ARGS__) -#define VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(...) \ +#define VM_MAP_KERNEL_FLAGS_DATA_BUFFERS_ANYWHERE(...) 
\ VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_range_id = KMEM_RANGE_ID_DATA, __VA_ARGS__) +#define VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(...) \ + VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_range_id = kmem_needs_data_share_range() ? \ + KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA, __VA_ARGS__) + typedef struct { unsigned int vmnekf_ledger_tag:3, @@ -587,6 +598,8 @@ typedef struct { #endif /* XNU_KERNEL_PRIVATE */ +#pragma mark Ledger Tags + /* current accounting postmark */ #define __VM_LEDGER_ACCOUNTING_POSTMARK 2019032600 @@ -614,6 +627,14 @@ typedef struct { #define VM_LEDGER_FLAGS_USER (VM_LEDGER_FLAG_NO_FOOTPRINT | VM_LEDGER_FLAG_NO_FOOTPRINT_FOR_DEBUG) #define VM_LEDGER_FLAGS_ALL (VM_LEDGER_FLAGS_USER | VM_LEDGER_FLAG_FROM_KERNEL) +#pragma mark User Memory Tags + +/* + * These tags may be used to identify memory regions created with + * `mach_vm_map()` or `mach_vm_allocate()` via the top 8 bits of the `flags` + * parameter. Users should pass `VM_MAKE_TAG(tag) | flags` (see section + * "User Flags"). + */ #define VM_MEMORY_MALLOC 1 #define VM_MEMORY_MALLOC_SMALL 2 #define VM_MEMORY_MALLOC_LARGE 3 @@ -642,6 +663,8 @@ typedef struct { /* Was a nested pmap (VM_MEMORY_SHARED_PMAP) which has now been unnested */ #define VM_MEMORY_UNSHARED_PMAP 35 +/* for libchannel memory, mostly used on visionOS for communication with realtime threads */ +#define VM_MEMORY_LIBCHANNEL 36 // Placeholders for now -- as we analyze the libraries and find how they // use memory, we can make these labels more specific. @@ -653,6 +676,7 @@ typedef struct { #define VM_MEMORY_JAVA 44 #define VM_MEMORY_COREDATA 45 #define VM_MEMORY_COREDATA_OBJECTIDS 46 + #define VM_MEMORY_ATS 50 #define VM_MEMORY_LAYERKIT 51 #define VM_MEMORY_CGIMAGE 52 @@ -748,6 +772,8 @@ typedef struct { /* DHMM data */ #define VM_MEMORY_DHMM 84 +/* memory needed for DFR related actions */ +#define VM_MEMORY_DFR 85 /* memory allocated by SceneKit.framework */ #define VM_MEMORY_SCENEKIT 86 @@ -804,6 +830,9 @@ typedef struct { /* memory allocated by CoreMedia */ #define VM_MEMORY_CM_HLS 106 +/* memory allocated for CompositorServices */ +#define VM_MEMORY_COMPOSITOR_SERVICES 107 + /* Reserve 230-239 for Rosetta */ #define VM_MEMORY_ROSETTA 230 #define VM_MEMORY_ROSETTA_THREAD_CONTEXT 231 @@ -815,7 +844,21 @@ typedef struct { #define VM_MEMORY_ROSETTA_10 239 /* Reserve 240-255 for application */ -#define VM_MEMORY_APPLICATION_SPECIFIC_1 240 +#define VM_MEMORY_APPLICATION_SPECIFIC_1 240 +#define VM_MEMORY_APPLICATION_SPECIFIC_2 241 +#define VM_MEMORY_APPLICATION_SPECIFIC_3 242 +#define VM_MEMORY_APPLICATION_SPECIFIC_4 243 +#define VM_MEMORY_APPLICATION_SPECIFIC_5 244 +#define VM_MEMORY_APPLICATION_SPECIFIC_6 245 +#define VM_MEMORY_APPLICATION_SPECIFIC_7 246 +#define VM_MEMORY_APPLICATION_SPECIFIC_8 247 +#define VM_MEMORY_APPLICATION_SPECIFIC_9 248 +#define VM_MEMORY_APPLICATION_SPECIFIC_10 249 +#define VM_MEMORY_APPLICATION_SPECIFIC_11 250 +#define VM_MEMORY_APPLICATION_SPECIFIC_12 251 +#define VM_MEMORY_APPLICATION_SPECIFIC_13 252 +#define VM_MEMORY_APPLICATION_SPECIFIC_14 253 +#define VM_MEMORY_APPLICATION_SPECIFIC_15 254 #define VM_MEMORY_APPLICATION_SPECIFIC_16 255 #define VM_MEMORY_COUNT 256 @@ -824,16 +867,33 @@ typedef struct { #define VM_MAKE_TAG(tag) ((tag) << 24) #endif /* XNU_KERNEL_PRIVATE */ +#if PRIVATE && !KERNEL +/// +/// Return a human-readable description for a given VM user tag. 
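///
/// A hedged usage sketch pairing this with `VM_MAKE_TAG()` (illustrative
/// only, not part of this patch):
///
///     mach_vm_address_t addr = 0;
///     kern_return_t kr = mach_vm_allocate(mach_task_self(), &addr, 0x4000,
///         VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_MEMORY_APPLICATION_SPECIFIC_1));
///     const char *desc = mach_vm_tag_describe(VM_MEMORY_APPLICATION_SPECIFIC_1);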
+/// +/// - Parameters: +/// - tag: A VM tag between `[0,VM_MEMORY_COUNT)` +/// +/// - Returns: A string literal description of the tag +/// +__SPI_AVAILABLE(macos(16.0), ios(19.0), watchos(12.0), tvos(19.0), visionos(3.0), bridgeos(10.0)) +OS_EXPORT +const char *mach_vm_tag_describe(unsigned int tag); +#endif /* PRIVATE && !KERNEL */ #if KERNEL_PRIVATE -/* kernel map tags */ -/* please add new definition strings to zprint */ +#pragma mark Kernel Tags + +#if XNU_KERNEL_PRIVATE /* - * When making a new VM_KERN_MEMORY_*, update tests vm_parameter_validation_[user|kern] - * and their expected results; they deliberately call VM functions with invalid - * kernel tag values and you may be turning one of those invalid tags valid. + * When making a new VM_KERN_MEMORY_*, update: + * - tests vm_parameter_validation_[user|kern] + * and their expected results; they deliberately call VM functions with invalid + * kernel tag values and you may be turning one of those invalid tags valid. + * - vm_kern_memory_names, which is used to map tags to their string name */ +#endif /* XNU_KERNEL_PRIVATE */ #define VM_KERN_MEMORY_NONE 0 @@ -874,13 +934,14 @@ typedef struct { #define VM_KERN_MEMORY_EXCLAVES_SHARED 36 #define VM_KERN_MEMORY_KALLOC_SHARED 37 /* add new tags here and adjust first-dynamic value */ -#define VM_KERN_MEMORY_FIRST_DYNAMIC 38 +#define VM_KERN_MEMORY_CPUTRACE 38 +#define VM_KERN_MEMORY_FIRST_DYNAMIC 39 /* out of tags: */ #define VM_KERN_MEMORY_ANY 255 #define VM_KERN_MEMORY_COUNT 256 -/* end kernel map tags */ +#pragma mark Kernel Wired Counts // mach_memory_info.flags #define VM_KERN_SITE_TYPE 0x000000FF @@ -895,6 +956,13 @@ typedef struct { #define VM_KERN_SITE_ZONE_VIEW 0x00001000 #define VM_KERN_SITE_KALLOC 0x00002000 /* zone field is size class */ +/* Kernel Memory Counters */ +#if XNU_KERNEL_PRIVATE +/* + * When making a new VM_KERN_COUNT_*, also update vm_kern_count_names + */ +#endif /* XNU_KERNEL_PRIVATE */ + #define VM_KERN_COUNT_MANAGED 0 #define VM_KERN_COUNT_RESERVED 1 #define VM_KERN_COUNT_WIRED 2 @@ -922,7 +990,6 @@ typedef struct { /* The number of VM_KERN_COUNT_ stats. New VM_KERN_COUNT_ entries should be less than this. */ #define VM_KERN_COUNTER_COUNT 15 - #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/mach/vm_types.h b/osfmk/mach/vm_types.h index 2afec31a9..a2b37b60f 100644 --- a/osfmk/mach/vm_types.h +++ b/osfmk/mach/vm_types.h @@ -79,6 +79,8 @@ __options_decl(vm_map_create_options_t, uint32_t, { VM_MAP_CREATE_CORPSE_FOOTPRINT = 0x00000002, VM_MAP_CREATE_DISABLE_HOLELIST = 0x00000004, VM_MAP_CREATE_NEVER_FAULTS = 0x00000008, + /* Denote that we're creating this map as part of a fork() */ + VM_MAP_CREATE_VIA_FORK = 0x00000010, }); /* @@ -96,11 +98,22 @@ typedef struct upl *upl_t; typedef struct vm_map_copy *vm_map_copy_t; typedef struct vm_named_entry *vm_named_entry_t; typedef struct vm_page *vm_page_t; +/* + * A generation ID for vm_maps, which increments monotonically. + * These IDs are not globally unique among VM maps, however. Instead, + * IDs represent 'independent' VM map lineages: maps interrelated via + * fork() identify with the same ID. 
+ */ +typedef const void *vm_map_serial_t; #define PMAP_NULL ((pmap_t) NULL) #define VM_OBJECT_NULL ((vm_object_t) NULL) #define VM_MAP_COPY_NULL ((vm_map_copy_t) NULL) +#define VM_MAP_SERIAL_NONE ((vm_map_serial_t)-1) +/* Denotes 'special'/one-off kernel-managed objects that don't belong to a parent map */ +#define VM_MAP_SERIAL_SPECIAL ((vm_map_serial_t)-2) + #else /* KERNEL_PRIVATE */ typedef mach_port_t vm_map_t, vm_map_read_t, vm_map_inspect_t; @@ -109,6 +122,8 @@ typedef mach_port_t vm_named_entry_t; #endif /* KERNEL_PRIVATE */ +typedef mach_vm_offset_t *mach_vm_offset_list_t; + #ifdef KERNEL #define VM_MAP_NULL ((vm_map_t) NULL) #define VM_MAP_INSPECT_NULL ((vm_map_inspect_t) NULL) diff --git a/osfmk/mach_debug/ipc_info.h b/osfmk/mach_debug/ipc_info.h index 01e9965c1..69219b5cd 100644 --- a/osfmk/mach_debug/ipc_info.h +++ b/osfmk/mach_debug/ipc_info.h @@ -70,6 +70,81 @@ #include #include +/*! + * @brief + * Type for mach_port_kobject_description() only. + * + * @discussion + * This type preserved the `IOT_*` values that @c ipc_kobject_type_t used + * to carry, whose ABI of this type was known to debugging tools of userspace, + * by copying XNU's source. + * + * This provides a guaranteed stable interface now (however no guarantee + * is made that values are still in use). + * + * Values should never be removed to that list, merely abandonned with + * a comment. + */ +__enum_decl(ipc_info_object_type_t, natural_t, { + IPC_OTYPE_NONE = 0, + IPC_OTYPE_THREAD_CONTROL = 1, + IPC_OTYPE_TASK_CONTROL = 2, + IPC_OTYPE_HOST = 3, + IPC_OTYPE_HOST_PRIV = 4, + IPC_OTYPE_PROCESSOR = 5, + IPC_OTYPE_PROCESSOR_SET = 6, + IPC_OTYPE_PROCESSOR_SET_NAME = 7, + IPC_OTYPE_TIMER = 8, + IPC_OTYPE_PORT_SUBST_ONCE = 9, /* obsolete: no instances */ + IPC_OTYPE_MIG = 10, /* obsolete: no instances */ + IPC_OTYPE_MEMORY_OBJECT = 11, /* no port instances */ + IPC_OTYPE_XMM_PAGER = 12, /* obsolete: no instances */ + IPC_OTYPE_XMM_KERNEL = 13, /* obsolete: no instances */ + IPC_OTYPE_XMM_REPLY = 14, /* obsolete: no instances */ + IPC_OTYPE_UND_REPLY = 15, + IPC_OTYPE_HOST_NOTIFY = 16, /* obsolete: no instances */ + IPC_OTYPE_HOST_SECURITY = 17, /* obsolete: no instances */ + IPC_OTYPE_LEDGER = 18, /* obsolete: no instances */ + IPC_OTYPE_MAIN_DEVICE = 19, + IPC_OTYPE_TASK_NAME = 20, + IPC_OTYPE_SUBSYSTEM = 21, /* obsolete: no instances */ + IPC_OTYPE_IO_DONE_QUEUE = 22, /* obsolete: no instances */ + IPC_OTYPE_SEMAPHORE = 23, + IPC_OTYPE_LOCK_SET = 24, /* obsolete: no instances */ + IPC_OTYPE_CLOCK = 25, + IPC_OTYPE_CLOCK_CTRL = 26, /* obsolete: no instances */ + IPC_OTYPE_IOKIT_IDENT = 27, + IPC_OTYPE_NAMED_ENTRY = 28, + IPC_OTYPE_IOKIT_CONNECT = 29, + IPC_OTYPE_IOKIT_OBJECT = 30, + IPC_OTYPE_UPL = 31, /* obsolete: no instances */ + IPC_OTYPE_MEM_OBJ_CONTROL = 32, /* obsolete: no instances */ + IPC_OTYPE_AU_SESSIONPORT = 33, + IPC_OTYPE_FILEPORT = 34, + IPC_OTYPE_LABELH = 35, /* obsolete: no instances */ + IPC_OTYPE_TASK_RESUME = 36, + IPC_OTYPE_VOUCHER = 37, + IPC_OTYPE_VOUCHER_ATTR_CONTROL = 38, /* obsolete: no instances */ + IPC_OTYPE_WORK_INTERVAL = 39, + IPC_OTYPE_UX_HANDLER = 40, + IPC_OTYPE_UEXT_OBJECT = 41, + IPC_OTYPE_ARCADE_REG = 42, + IPC_OTYPE_EVENTLINK = 43, + IPC_OTYPE_TASK_INSPECT = 44, + IPC_OTYPE_TASK_READ = 45, + IPC_OTYPE_THREAD_INSPECT = 46, + IPC_OTYPE_THREAD_READ = 47, + IPC_OTYPE_SUID_CRED = 48, /* obsolete: no instances */ + IPC_OTYPE_HYPERVISOR = 49, + IPC_OTYPE_TASK_ID_TOKEN = 50, + IPC_OTYPE_TASK_FATAL = 51, + IPC_OTYPE_KCDATA = 52, + IPC_OTYPE_EXCLAVES_RESOURCE = 53, + + /* catchall */ + 
IPC_OTYPE_UNKNOWN = ~0u, +}); + /* * Remember to update the mig type definitions * in mach_debug_types.defs when adding/removing fields. diff --git a/osfmk/mach_debug/mach_debug_types.defs b/osfmk/mach_debug/mach_debug_types.defs index d5cfc89c3..fcc3c0362 100644 --- a/osfmk/mach_debug/mach_debug_types.defs +++ b/osfmk/mach_debug/mach_debug_types.defs @@ -94,6 +94,8 @@ type ipc_info_name_array_t = array[] of ipc_info_name_t; type ipc_info_tree_name_t = struct[9] of natural_t; type ipc_info_tree_name_array_t = array[] of ipc_info_tree_name_t; +type ipc_info_object_type_t = natural_t; + type vm_info_region_t = struct[10] of natural_t; type vm_info_region_64_t = struct[11] of natural_t; type mach_vm_info_region_t = struct[14] of natural_t; diff --git a/osfmk/machine/machine_routines.h b/osfmk/machine/machine_routines.h index 2f2103a26..9013e5974 100644 --- a/osfmk/machine/machine_routines.h +++ b/osfmk/machine/machine_routines.h @@ -150,6 +150,13 @@ enum cpu_event { CPU_DOWN, CLUSTER_EXIT_REQUESTED, CPU_EXITED, + PLATFORM_QUIESCE, + PLATFORM_ACTIVE, + PLATFORM_HALT_RESTART, + PLATFORM_PANIC, + PLATFORM_PANIC_SYNC, + PLATFORM_PRE_SLEEP, + PLATFORM_POST_RESUME, }; typedef bool (*cpu_callback_t)(void *param, enum cpu_event event, unsigned int cpu_or_cluster); @@ -179,6 +186,10 @@ void cpu_event_unregister_callback(cpu_callback_t fn); void ml_broadcast_cpu_event(enum cpu_event event, unsigned int cpu_or_cluster); #endif +void cpu_event_debug_log(enum cpu_event event, unsigned int cpu_or_cluster); + +void dump_cpu_event_log(int (*printf_func)(const char * fmt, ...)); + /*! * @function ml_io_read() * @brief Perform an MMIO read access @@ -192,6 +203,9 @@ unsigned int ml_io_read16(uintptr_t iovaddr); unsigned int ml_io_read32(uintptr_t iovaddr); unsigned long long ml_io_read64(uintptr_t iovaddr); +uint64_t ml_io_read_cpu_reg(uintptr_t io_vaddr, int io_sz, int logical_cpu); + + /*! * @function ml_io_write() * @brief Perform an MMIO write access @@ -323,6 +337,7 @@ int ml_io_reset_timeouts_phys(vm_offset_t iopaddr_base, unsigned int size); #if XNU_KERNEL_PRIVATE #if ML_IO_TIMEOUTS_ENABLED + #if !defined(__x86_64__) /* x86 does not have the MACHINE_TIMEOUTs types, and the variables are * declared elsewhere. */ @@ -334,6 +349,16 @@ extern machine_timeout_t trace_phy_write_delay_to; #endif /* !defined(__x86_64__) */ extern void override_io_timeouts(uintptr_t vaddr, uint64_t paddr, uint64_t *read_timeout, uint64_t *write_timeout); + +typedef struct { + uint64_t mmio_start_mt; + uint64_t mmio_paddr; + uintptr_t mmio_vaddr; +} mmio_track_t; +PERCPU_DECL(mmio_track_t, mmio_tracker); + +extern boolean_t ml_io_check_for_mmio_overrides(uint64_t mt); + #endif /* ML_IO_TIMEOUTS_ENABLED */ void ml_get_cluster_type_name(cluster_type_t cluster_type, char *name, @@ -380,6 +405,18 @@ void ml_map_cpus_to_clusters(uint8_t *table); void ml_task_post_signature_processing_hook(task_t task); #endif /* MACH_KERNEL_PRIVATE */ +#if XNU_KERNEL_PRIVATE +/** + * Returns whether kernel text should be writable. + * + * @note This is always true on x86_64. + * + * @note On ARM, this can be set through LocalPolicy, or internally through the + * -unsafe_kernel_text boot arg. 
+ */ +bool ml_unsafe_kernel_text(void); +#endif /* XNU_KERNEL_PRIVATE */ + __END_DECLS #endif /* _MACHINE_MACHINE_ROUTINES_H */ diff --git a/osfmk/machine/static_if.h b/osfmk/machine/static_if.h index e2e9ec55c..fc3aca724 100644 --- a/osfmk/machine/static_if.h +++ b/osfmk/machine/static_if.h @@ -38,10 +38,12 @@ typedef const struct static_if_entry *static_if_entry_t; typedef struct static_if_key { - short sik_enable_count; - short sik_init_value; - unsigned sik_entries_count; + int16_t sik_enable_count; + bool sik_init_value; + bool sik_modified; + uint32_t sik_entries_count; static_if_entry_t sik_entries_head; + struct static_if_key *sik_modified_next; } *static_if_key_t; #if defined (__x86_64__) @@ -74,8 +76,9 @@ __BEGIN_DECLS extern struct static_if_key_true name##_jump_key #define STATIC_IF_KEY_DEFINE_TRUE(name) \ + __security_const_late \ __used struct static_if_key_true name##_jump_key = { \ - .key.sik_init_value = 0, \ + .key.sik_init_value = true, \ .key.sik_enable_count = 0, \ } @@ -83,8 +86,9 @@ __BEGIN_DECLS extern struct static_if_key_false name##_jump_key #define STATIC_IF_KEY_DEFINE_FALSE(name) \ + __security_const_late \ __used struct static_if_key_false name##_jump_key = { \ - .key.sik_init_value = -1, \ + .key.sik_init_value = false, \ .key.sik_enable_count = -1, \ } @@ -268,6 +272,13 @@ extern void __static_if_key_delta( static_if_key_t key, int delta); +extern static_if_key_t static_if_modified_keys; + +#define STATIC_IF_ABI_V1 1 +#define STATIC_IF_ABI_CURRENT STATIC_IF_ABI_V1 + +extern uint32_t static_if_abi; + #if MACH_KERNEL_PRIVATE __attribute__((always_inline)) diff --git a/osfmk/machine/trap.h b/osfmk/machine/trap.h index a9c1b29cb..454452600 100644 --- a/osfmk/machine/trap.h +++ b/osfmk/machine/trap.h @@ -55,6 +55,8 @@ __asm__ __volatile__ ("" : "+r"(_a), "+r"(_b), "+r"(_c)); \ }) +#ifndef __BUILDING_XNU_LIB_UNITTEST__ + #define ml_fatal_trap_with_value(code, a) ({ \ ml_trap_pin_value_1(a); \ ml_fatal_trap(code); \ @@ -70,6 +72,29 @@ ml_fatal_trap(code); \ }) +#else /* __BUILDING_XNU_LIB_UNITTEST__ */ +/* assert trap call into unit-test harness instead of calling brk */ +#ifdef __cplusplus +extern "C" +#else +extern +#endif +__attribute__((noreturn)) void ut_assert_trap(int code, long a, long b, long c); + +#define ml_fatal_trap_with_value(code, a) ({ \ + ut_assert_trap(code, (long)a, 0, 0); \ +}) + +#define ml_fatal_trap_with_value2(code, a) ({ \ + ut_assert_trap(code, (long)a, (long)b, 0); \ +}) + +#define ml_fatal_trap_with_value3(code, a, b, c) ({ \ + ut_assert_trap(code, (long)a, (long)b, (long)c); \ +}) + +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ + /* * Used for when `e` failed a linked list safe unlinking check. * On optimized builds, `e`'s value will be in: diff --git a/osfmk/man/vm_copy.html b/osfmk/man/vm_copy.html index ea4c926f1..ea1353e63 100644 --- a/osfmk/man/vm_copy.html +++ b/osfmk/man/vm_copy.html @@ -21,8 +21,7 @@ The port for the task whose memory is to be copied.
source_address
[in scalar]
-The starting address for the source region. The address must
-be on a page boundary.
+The starting address for the source region.

count
@@ -33,8 +32,7 @@ bytes must convert to an integral number of virtual pages.
dest_address
[in scalar]
-The starting address for the destination region. The address
-must be on a page boundary.
+The starting address for the destination region.

DESCRIPTION

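The man page edit above drops the page-boundary requirement on the source and destination addresses of vm_copy(). A minimal userspace sketch (not part of this patch) of the call shape the new wording allows; whether a given kernel build accepts sub-page offsets still depends on the VM implementation, and the count must still convert to an integral number of virtual pages.

#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t src = 0, dst = 0;
	mach_vm_size_t size = 2 * vm_page_size;
	kern_return_t kr;

	if (mach_vm_allocate(mach_task_self(), &src, size, VM_FLAGS_ANYWHERE) != KERN_SUCCESS ||
	    mach_vm_allocate(mach_task_self(), &dst, size, VM_FLAGS_ANYWHERE) != KERN_SUCCESS) {
		return 1;
	}

	/* Addresses offset into a page; count is still one whole page. */
	kr = vm_copy(mach_task_self(),
	    (vm_address_t)(src + 16), (vm_size_t)vm_page_size,
	    (vm_address_t)(dst + 16));
	printf("vm_copy returned %d\n", kr);
	return kr == KERN_SUCCESS ? 0 : 1;
}
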
diff --git a/osfmk/prng/prng_random.c b/osfmk/prng/prng_random.c index f51b4b0e9..9691e7a0e 100644 --- a/osfmk/prng/prng_random.c +++ b/osfmk/prng/prng_random.c @@ -280,7 +280,7 @@ early_random_init(void) cc_clear(sizeof(earlyseed), earlyseed); } -static void read_erandom(void * buf, size_t nbytes); +__static_testable void read_erandom(void * buf, size_t nbytes); /* * Return a uniformly distributed 64-bit random number. @@ -364,7 +364,7 @@ read_erandom_generate(void * buf, size_t nbytes) } } -static void +__static_testable __mockable void read_erandom(void * buf, size_t nbytes) { // We defer to the kernel PRNG after it has been installed and @@ -417,7 +417,7 @@ random_cpu_init(int cpu) } /* export good random numbers to the rest of the kernel */ -void +__mockable void read_random(void * buffer, u_int numbytes) { prng_funcs.refresh(prng_ctx); @@ -462,13 +462,21 @@ read_random_generate(uint8_t *buffer, size_t numbytes) int write_random(void * buffer, u_int numbytes) { - uint8_t seed[SHA256_DIGEST_LENGTH]; - SHA256_CTX ctx; + /* + * The reseed function requires at least 16 bytes of input entropy, + * hence we always pass the entire seed below, even if it isn't "full". + */ + uint8_t seed[SHA512_DIGEST_LENGTH] = {0}; - /* hash the input to minimize the time we need to hold the lock */ - SHA256_Init(&ctx); - SHA256_Update(&ctx, buffer, numbytes); - SHA256_Final(seed, &ctx); + if (numbytes > SHA512_DIGEST_LENGTH) { + /* hash the input to minimize the time we need to hold the lock */ + SHA512_CTX ctx; + SHA512_Init(&ctx); + SHA512_Update(&ctx, buffer, numbytes); + SHA512_Final(seed, &ctx); + } else { + memcpy(seed, buffer, numbytes); + } prng_funcs.reseed(prng_ctx, sizeof(seed), seed); cc_clear(sizeof(seed), seed); diff --git a/osfmk/tests/kernel_tests.c b/osfmk/tests/kernel_tests.c index cb6bf50e1..ea4d058f6 100644 --- a/osfmk/tests/kernel_tests.c +++ b/osfmk/tests/kernel_tests.c @@ -95,6 +95,7 @@ extern kern_return_t ml_io_timeout_test(void); #endif #ifdef __arm64__ +extern kern_return_t arm64_backtrace_test(void); extern kern_return_t arm64_munger_test(void); #if __ARM_PAN_AVAILABLE__ extern kern_return_t arm64_pan_test(void); @@ -112,6 +113,7 @@ extern kern_return_t specres_test(void); kern_return_t arm64_bti_test(void); #endif /* BTI_ENFORCED */ extern kern_return_t arm64_speculation_guard_test(void); +extern kern_return_t arm64_aie_test(void); #endif /* __arm64__ */ extern kern_return_t test_thread_call(void); @@ -129,6 +131,7 @@ struct xnupost_test kernel_post_tests[] = { XNUPOST_TEST_CONFIG_BASIC(test_os_log), XNUPOST_TEST_CONFIG_BASIC(test_os_log_parallel), #ifdef __arm64__ + XNUPOST_TEST_CONFIG_BASIC(arm64_backtrace_test), XNUPOST_TEST_CONFIG_BASIC(arm64_munger_test), #if __ARM_PAN_AVAILABLE__ XNUPOST_TEST_CONFIG_BASIC(arm64_pan_test), @@ -3352,3 +3355,53 @@ static_if_tests(void) } } STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, static_if_tests); + +#if __BUILDING_XNU_LIB_UNITTEST__ +/* these functions are used for testing the unittest mocking framework and interposing */ + +__mockable size_t +kernel_func1(__unused int a, __unused char b) +{ + return 1000; +} +__mockable size_t +kernel_func2(__unused int a, __unused char b) +{ + return 2000; +} +__mockable size_t +kernel_func3(__unused int a, __unused char b) +{ + return 3000; +} +__mockable size_t +kernel_func4(__unused int a, __unused char b) +{ + return 4000; +} +__mockable size_t +kernel_func5(__unused int a, __unused char b) +{ + return 5000; +} +int kernel_func6_was_called = 0; +__mockable void +kernel_func6(__unused int a, __unused 
char b) +{ + printf("in void func6"); + kernel_func6_was_called = a; +} +__mockable size_t +kernel_func7(__unused int a, __unused char b) +{ + return 7000; +} +int kernel_func8_was_called = 0; +__mockable void +kernel_func8(__unused int a, __unused char b) +{ + printf("in void func8"); + kernel_func8_was_called = a; +} + +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ diff --git a/osfmk/tests/pmap_tests.c b/osfmk/tests/pmap_tests.c index 764e6fda2..67df75ff4 100644 --- a/osfmk/tests/pmap_tests.c +++ b/osfmk/tests/pmap_tests.c @@ -34,11 +34,13 @@ #include #if CONFIG_SPTM #include +#include #else /* CONFIG_SPTM */ #include #endif /* CONFIG_SPTM */ #endif /* defined(__arm64__) */ #include +#include extern void read_random(void* buffer, u_int numBytes); @@ -58,6 +60,7 @@ uint64_t test_pmap_page_protect_overhead(unsigned int num_loops, unsigned int nu #if CONFIG_SPTM kern_return_t test_pmap_huge_pv_list(unsigned int num_loops, unsigned int num_mappings); kern_return_t test_pmap_reentrance(unsigned int num_loops); +kern_return_t test_surt(unsigned int num_surts); #endif #define PMAP_TEST_VA (0xDEADULL << PAGE_SHIFT) @@ -251,6 +254,10 @@ test_pmap_exec_remove(unsigned int num_loops __unused) static const vm_map_address_t nesting_start = SHARED_REGION_BASE; static const vm_map_address_t nesting_size = 16 * ARM_16K_TT_L2_SIZE; +static const vm_map_address_t final_unnest_size = 2 * ARM_16K_TT_L2_SIZE; +static const vm_map_address_t initial_unnest_size = nesting_size - final_unnest_size; +static const vm_map_address_t trimmed_start = nesting_start + ARM_16K_TT_L2_SIZE; +static const vm_map_address_t trimmed_size = nesting_size - (3 * ARM_16K_TT_L2_SIZE); static void pmap_nest_thread(void *arg, wait_result_t __unused wres) @@ -270,10 +277,11 @@ pmap_nest_thread(void *arg, wait_result_t __unused wres) * in the main thread. */ if (main_pmap != NULL) { + pmap_set_shared_region(main_pmap, args->pmap, nesting_start, nesting_size); kr = pmap_nest(main_pmap, args->pmap, nesting_start, nesting_size); assert(kr == KERN_SUCCESS); - kr = pmap_unnest(main_pmap, nesting_start, nesting_size - ARM_16K_TT_L2_SIZE); + kr = pmap_unnest(main_pmap, nesting_start, initial_unnest_size); assert(kr == KERN_SUCCESS); } @@ -289,7 +297,7 @@ pmap_nest_thread(void *arg, wait_result_t __unused wres) /* Unnest all remaining mappings so that we can safely destroy our pmap. 
*/ if (main_pmap != NULL) { - kr = pmap_unnest(main_pmap, nesting_start + nesting_size - ARM_16K_TT_L2_SIZE, ARM_16K_TT_L2_SIZE); + kr = pmap_unnest(main_pmap, nesting_start + initial_unnest_size, final_unnest_size); assert(kr == KERN_SUCCESS); pmap_destroy(main_pmap); } @@ -324,7 +332,7 @@ test_pmap_nesting(unsigned int num_loops) const ppnum_t pp1 = VM_PAGE_GET_PHYS_PAGE(m1); const ppnum_t pp2 = VM_PAGE_GET_PHYS_PAGE(m2); for (unsigned int i = 0; (i < num_loops) && (kr == KERN_SUCCESS); i++) { - pmap_t nested_pmap = pmap_create_wrapper(0); + pmap_t nested_pmap = pmap_create_wrapper(PMAP_CREATE_NESTED); pmap_t main_pmap = pmap_create_wrapper(0); if ((nested_pmap == NULL) || (main_pmap == NULL)) { pmap_destroy(main_pmap); @@ -333,7 +341,10 @@ test_pmap_nesting(unsigned int num_loops) break; } pmap_set_nested(nested_pmap); - for (vm_map_address_t va = nesting_start; va < (nesting_start + nesting_size); va += PAGE_SIZE) { +#if CODE_SIGNING_MONITOR + csm_setup_nested_address_space(nested_pmap, nesting_start, nesting_size); +#endif /* CODE_SIGNING_MONITOR */ + for (vm_map_address_t va = trimmed_start; va < (trimmed_start + trimmed_size); va += PAGE_SIZE) { uint8_t rand; read_random(&rand, sizeof(rand)); uint8_t rand_mod = rand % 3; @@ -344,6 +355,7 @@ test_pmap_nesting(unsigned int num_loops) VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE, PMAP_MAPPING_TYPE_INFER); assert(kr == KERN_SUCCESS); } + pmap_set_shared_region(main_pmap, nested_pmap, nesting_start, nesting_size); kr = pmap_nest(main_pmap, nested_pmap, nesting_start, nesting_size); assert(kr == KERN_SUCCESS); @@ -361,7 +373,30 @@ test_pmap_nesting(unsigned int num_loops) } } - /* Now kick off various worker threads to concurrently nest and unnest. */ + pmap_trim(main_pmap, nested_pmap, trimmed_start, trimmed_size); + + /** + * Validate that the trimmed-off regions at the beginning and end no longer have L3 tables + * in the main or nested pmaps. + */ + if (pmap_pte(main_pmap, nesting_start) != NULL) { + panic("%s: L3 table still present in main pmap for trimmed VA 0x%llx", __func__, + (unsigned long long)nesting_start); + } + if (pmap_pte(main_pmap, trimmed_start + trimmed_size) != NULL) { + panic("%s: L3 table still present in main pmap for trimmed VA 0x%llx", __func__, + (unsigned long long)(trimmed_start + trimmed_size)); + } + if (pmap_pte(nested_pmap, nesting_start) != NULL) { + panic("%s: L3 table still present in nested pmap for trimmed VA 0x%llx", __func__, + (unsigned long long)nesting_start); + } + if (pmap_pte(nested_pmap, trimmed_start + trimmed_size) != NULL) { + panic("%s: L3 table still present in nested pmap for trimmed VA 0x%llx", __func__, + (unsigned long long)(trimmed_start + trimmed_size)); + } + + /* Now kick off various worker threads to concurrently nest, trim, and unnest. */ const processor_t nest_proc = current_processor(); thread_bind(nest_proc); thread_block(THREAD_CONTINUE_NULL); @@ -394,10 +429,18 @@ test_pmap_nesting(unsigned int num_loops) } /* Unnest the bulk of the nested region and validate that it produced the expected PTE contents. */ - kr = pmap_unnest(main_pmap, nesting_start, nesting_size - ARM_16K_TT_L2_SIZE); + kr = pmap_unnest(main_pmap, nesting_start, initial_unnest_size); assert(kr == KERN_SUCCESS); - for (vm_map_address_t va = nesting_start; va < (nesting_start + nesting_size - ARM_16K_TT_L2_SIZE); va += PAGE_SIZE) { + /** + * Explicitly install a new mapping in the nested pmap after unnesting; this should be created non-global, + * which we'll verify below. 
+ */ + kr = pmap_enter(nested_pmap, trimmed_start, pp1, VM_PROT_READ, + VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE, PMAP_MAPPING_TYPE_INFER); + assert(kr == KERN_SUCCESS); + + for (vm_map_address_t va = trimmed_start; va < (nesting_start + initial_unnest_size); va += PAGE_SIZE) { pt_entry_t *nested_pte = pmap_pte(nested_pmap, va); pt_entry_t *main_pte = pmap_pte(main_pmap, va); @@ -412,7 +455,7 @@ test_pmap_nesting(unsigned int num_loops) } /* Validate that the prior unnest did not unnest too much. */ - for (vm_map_address_t va = nesting_start + nesting_size - ARM_16K_TT_L2_SIZE; va < (nesting_start + nesting_size); va += PAGE_SIZE) { + for (vm_map_address_t va = nesting_start + initial_unnest_size; va < (trimmed_start + trimmed_size); va += PAGE_SIZE) { pt_entry_t *nested_pte = pmap_pte(nested_pmap, va); pt_entry_t *main_pte = pmap_pte(main_pmap, va); if (nested_pte != main_pte) { @@ -426,13 +469,13 @@ test_pmap_nesting(unsigned int num_loops) } /* Now unnest the remainder. */ - kr = pmap_unnest(main_pmap, nesting_start + nesting_size - ARM_16K_TT_L2_SIZE, ARM_16K_TT_L2_SIZE); + kr = pmap_unnest(main_pmap, nesting_start + initial_unnest_size, final_unnest_size); assert(kr == KERN_SUCCESS); thread_bind(PROCESSOR_NULL); thread_block(THREAD_CONTINUE_NULL); - for (vm_map_address_t va = nesting_start + nesting_size - ARM_16K_TT_L2_SIZE; va < (nesting_start + nesting_size); va += PAGE_SIZE) { + for (vm_map_address_t va = nesting_start + initial_unnest_size; va < (trimmed_start + trimmed_size); va += PAGE_SIZE) { pt_entry_t *nested_pte = pmap_pte(nested_pmap, va); pt_entry_t *main_pte = pmap_pte(main_pmap, va); @@ -456,6 +499,7 @@ test_pmap_nesting(unsigned int num_loops) kr = thread_krs[j]; } } + pmap_remove(nested_pmap, nesting_start, nesting_start + nesting_size); pmap_destroy(main_pmap); pmap_destroy(nested_pmap); @@ -488,8 +532,6 @@ test_pmap_iommu_disconnect(void) kern_return_t test_pmap_extended(void) { -#if !CONFIG_SPTM /* SPTM TODO: remove this condition once the SPTM supports 4K and stage-2 mappings */ -#endif /* !CONFIG_SPTM */ return KERN_SUCCESS; } @@ -837,4 +879,191 @@ test_pmap_reentrance(unsigned int num_loops __unused) } +#if __ARM64_PMAP_SUBPAGE_L1__ +/* Data shared between the main testing thread and the workers. */ +typedef struct { + /* A pointer to an atomic counter of the active worker threads. */ + unsigned int *surt_test_active_surge_thread; + + /* The SURT physical address this worker is responsible for. */ + pmap_paddr_t surt_pa; +} surt_emulation_thread_data; + +/** + * SURT allocation emulation + * + * This function emulates the behavior of a thread trying to allocate a SURT. + * It tries to find a free SURT in the SURT page list first, and if it does + * not manage to find one, it allocates a new SURT page, takes the first SURT, + * and feeds the page to the SURT page list. + * + * @param arg Pointer to the shared structure between the main thread and the + * worker. + * @param wres Wait result - unused. + */ +static void +surt_allocation_emulation_thread(void *arg, wait_result_t __unused wres) +{ + pmap_paddr_t surt_pa; + + surt_emulation_thread_data *thread_data = (surt_emulation_thread_data *)arg; + + surt_pa = surt_try_alloc(); + + if (surt_pa) { + goto saet_done; + } + + const kern_return_t ret = pmap_page_alloc(&surt_pa, PMAP_PAGE_NOZEROFILL); + + if (ret != KERN_SUCCESS) { + goto saet_done; + } + + /** + * This has to be retyped to XNU_SUBPAGE_USER_ROOT_TABLES in case + * a SURT request from real process creation shows up. 
It does not + * need to, and cannot, call SPTM's SURT alloc function, however, + * because some extreme stress test parameters can exhaust available + * ASIDs. The normal operation of the system should be unaffected + * as long as the xnu bitmap tracking used SURTs is a superset of + * the SPTM tracking structures. + */ + sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; + sptm_retype(surt_pa, XNU_DEFAULT, XNU_SUBPAGE_USER_ROOT_TABLES, retype_params); + + /* Feed the SURT page to the SURT list. */ + surt_feed_page_with_first_table_allocated(surt_pa); + +saet_done: + /* Update the shared structure. */ + thread_data->surt_pa = surt_pa; + if (os_atomic_dec(thread_data->surt_test_active_surge_thread, relaxed) == 0) { + thread_wakeup(thread_data->surt_test_active_surge_thread); + } +} + +/** + * SURT free emulation + * + * This function pairs with the allocation emulation function to complete the + * emulation of the lifecycle of a SURT table. It records and reports the time + * it takes to free the SURT, and when applicable, the time it takes to free + * the SURT page. + * + * @param arg Pointer to the shared structure between the main thread and the + * worker. + * @param wres Wait result - unused. + */ +static void +surt_free_emulation_thread(void *arg, wait_result_t __unused wres) +{ + surt_emulation_thread_data *thread_data = (surt_emulation_thread_data *)arg; + + if (thread_data->surt_pa == 0) { + goto sfet_free; + } + + const bool retype = surt_free(thread_data->surt_pa); + + if (retype) { + os_atomic_thread_fence(acquire); + sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL}; + sptm_retype(thread_data->surt_pa & ~PAGE_MASK, XNU_SUBPAGE_USER_ROOT_TABLES, + XNU_DEFAULT, retype_params); + pmap_page_free(thread_data->surt_pa & ~PAGE_MASK); + } + +sfet_free: + if (os_atomic_dec(thread_data->surt_test_active_surge_thread, relaxed) == 0) { + thread_wakeup(thread_data->surt_test_active_surge_thread); + } +} + +/** + * SURT stress test + * + * This function tries to stress the SURT system by launching certain numbers + * of threads allocating a SURT then free them. + * + * @param num_surts The number of SURTs to allocate and free. Note that this + * many of worker threads will be allocated so take care when + * passing in a large number: memory zones can be exhausted. + * + * @return Whether the test finishes successfully. 
+ */ +kern_return_t +test_surt(unsigned int num_surts) +{ + surt_emulation_thread_data *thread_data_array = kalloc_type(surt_emulation_thread_data, + num_surts, Z_WAITOK | Z_ZERO); + if (!thread_data_array) { + return KERN_FAILURE; + } + + thread_t *thread_array = kalloc_type(thread_t, num_surts, Z_WAITOK | Z_ZERO); + if (!thread_array) { + return KERN_FAILURE; + } + + unsigned int active_threads = 0; + + for (unsigned int i = 0; i < num_surts; i++) { + os_atomic_inc(&active_threads, relaxed); + thread_data_array[i].surt_test_active_surge_thread = &active_threads; + + kernel_thread_start_priority(surt_allocation_emulation_thread, + &thread_data_array[i], + thread_kern_get_pri(current_thread()) - 1, + &thread_array[i]); + } + + assert_wait(&active_threads, THREAD_UNINT); + + if (os_atomic_load(&active_threads, relaxed) == 0) { + clear_wait(current_thread(), THREAD_AWAKENED); + } else { + thread_block(THREAD_CONTINUE_NULL); + } + + if (os_atomic_load(&active_threads, relaxed) != 0) { + panic("%s: unexpected wakeup of main test thread while workers are active.", + __func__); + } + + for (unsigned int i = 0; i < num_surts; i++) { + thread_deallocate(thread_array[i]); + } + + for (unsigned int i = 0; i < num_surts; i++) { + os_atomic_inc(&active_threads, relaxed); + kernel_thread_start_priority(surt_free_emulation_thread, + &thread_data_array[i], + thread_kern_get_pri(current_thread()) - 1, + &thread_array[i]); + } + + assert_wait(&active_threads, THREAD_UNINT); + + if (os_atomic_load(&active_threads, relaxed) == 0) { + clear_wait(current_thread(), THREAD_AWAKENED); + } else { + thread_block(THREAD_CONTINUE_NULL); + } + + if (os_atomic_load(&active_threads, relaxed) != 0) { + panic("%s: unexpected wakeup of main test thread while workers are active.", + __func__); + } + + for (unsigned int i = 0; i < num_surts; i++) { + thread_deallocate(thread_array[i]); + } + + kfree_type(surt_emulation_thread_data, num_surts, thread_data_array); + kfree_type(thread_t, num_surts, thread_array); + + return KERN_SUCCESS; +} +#endif /* __ARM64_PMAP_SUBPAGE_L1__ */ #endif /* CONFIG_SPTM */ diff --git a/osfmk/tests/ptrauth_data_tests.c b/osfmk/tests/ptrauth_data_tests.c index 993b4bbb9..237669530 100644 --- a/osfmk/tests/ptrauth_data_tests.c +++ b/osfmk/tests/ptrauth_data_tests.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -114,9 +115,6 @@ ptrauth_data_tests(void) /* _vm_map */ ALLOC_VALIDATE_DATA_PTR(struct _vm_map, pmap_t, pmap, "_vm_map.pmap"); - /* ipc_port */ - ALLOC_VALIDATE_DATA_PTR(struct ipc_port, ipc_kobject_label_t, ip_kolabel, "ipc_port.kolabel"); - /* ipc_kobject_label */ ALLOC_VALIDATE_DATA_PTR(struct ipc_kobject_label, ipc_kobject_t, ikol_alt_port, "ipc_kobject_label.ikol_alt_port"); diff --git a/osfmk/tests/vm_parameter_validation.h b/osfmk/tests/vm_parameter_validation.h index c365f68d5..9bfb36d39 100644 --- a/osfmk/tests/vm_parameter_validation.h +++ b/osfmk/tests/vm_parameter_validation.h @@ -231,6 +231,74 @@ is_fake_error(int err) err == PANIC || err == GUARD || err == OUT_PARAM_BAD; } +// Parameters passed between userspace and kernel +// for sysctl test vm_parameter_validation_kern +typedef struct { + // Set this to sizeof(vm_parameter_validation_kern_args_t) + uint64_t sizeof_args; + + // Buffer for kernel test output. Allocated by userspace. + uint64_t output_buffer_address; + uint64_t output_buffer_size; + + // File descriptor for kernel tests that map files. Allocated by userspace. 
+ uint64_t file_descriptor; + + // Set if the kernel test output should be a golden file. + // Read from GENERATE_GOLDEN_IMAGE. + uint64_t generate_golden; +} vm_parameter_validation_kern_args_t; + +// Result values from sysctl test vm_parameter_validation_kern +#define KERN_TEST_SUCCESS 0 +#define KERN_TEST_BAD_ARGS 1 // sizeof(args) didn't match args->sizeof_args +#define KERN_TEST_FAILED 2 // failed without running any tests; error text in output buffer + +#if KERNEL + +// "Global" data for test vm_parameter_validation_kern +// stored in the kernel thread test context. +typedef struct { + thread_test_context_t ttc; + + // Buffer for kernel test output. Allocated by userspace. + user_addr_t output_buffer_start; + user_addr_t output_buffer_cur; + user_addr_t output_buffer_end; + + // File descriptor for kernel tests that map files. Allocated by userspace. + int file_descriptor; + + // Set if the kernel test output should be a golden file. + bool generate_golden; + + // Cached lists of offsets. Populated by CACHE_OFFSETS(). + struct offset_list_t *addr_trial_offsets; + struct offset_list_t *size_trial_offsets; + struct offset_list_t *start_size_trial_offsets; + struct offset_list_t *ssoo_absolute_offsets; + struct offset_list_t *ssoo_absolute_and_relative_offsets; +} vm_parameter_validation_kern_thread_context_t; + +DECLARE_TEST_IDENTITY(test_identity_vm_parameter_validation_kern); + +// Get the test's global storage from thread-local data. +// Panics if not running on a development kernel. +// Panics if not running on the vm_parameter_validation_kern test's thread. +static vm_parameter_validation_kern_thread_context_t * +get_globals(void) +{ + thread_test_context_t *ttc = thread_get_test_context(); + if (ttc == NULL || + ttc->ttc_identity != test_identity_vm_parameter_validation_kern) { + panic("no thread context or wrong thread context in test vm_parameter_validation_kern"); + } + + return __container_of(ttc, vm_parameter_validation_kern_thread_context_t, ttc); +} + +#endif /* KERNEL */ + // Return the count of a (non-decayed!) array. #define countof(array) (sizeof(array) / sizeof((array)[0])) @@ -400,28 +468,6 @@ adjust_page_size() return test_page_size; } -#if KERNEL -// Knobs controlled from userspace (and passed in MSB of the file_descriptor) -extern bool kernel_generate_golden; -#else -// Knobs controlled by environment variables -extern bool dump; -extern bool generate_golden; -extern bool dump_golden; -extern int out_param_bad_count; -extern bool should_test_results; -static void -read_env() -{ - dump = (getenv("DUMP_RESULTS") != NULL); - dump_golden = (getenv("DUMP_GOLDEN_IMAGE") != NULL); - // Shouldn't do both - generate_golden = (getenv("GENERATE_GOLDEN_IMAGE") != NULL) && !dump_golden; - // Only test when no other golden image flag is set - should_test_results = (getenv("SKIP_TESTS") == NULL) && !dump_golden && !generate_golden; -} -#endif - ///////////////////////////////////////////////////// // String functions that work in both kernel and userspace. 
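For reference, a hedged userspace sketch of how a test harness might fill the vm_parameter_validation_kern_args_t structure introduced above and hand its address to the kernel side. The "debug.test." sysctl path and the int64 in/out plumbing are assumptions based on SYSCTL_TEST_REGISTER(); the real test shares this header rather than redeclaring the struct.

#include <sys/sysctl.h>
#include <stdint.h>

/* Local mirror of vm_parameter_validation_kern_args_t, kept only so the
 * sketch is self-contained. */
typedef struct {
	uint64_t sizeof_args;
	uint64_t output_buffer_address;
	uint64_t output_buffer_size;
	uint64_t file_descriptor;
	uint64_t generate_golden;
} kern_test_args_t;

static int
run_vm_kern_test(void *output_buf, size_t output_size, int fd, int generate_golden)
{
	kern_test_args_t args = {
		.sizeof_args = sizeof(args),    /* kernel rejects a mismatch with KERN_TEST_BAD_ARGS */
		.output_buffer_address = (uint64_t)(uintptr_t)output_buf,
		.output_buffer_size = output_size,
		.file_descriptor = (uint64_t)fd,
		.generate_golden = (uint64_t)generate_golden,
	};
	int64_t in_value = (int64_t)(uintptr_t)&args;   /* kernel copyin()s the struct from here */
	int64_t out_value = -1;
	size_t out_len = sizeof(out_value);

	/* sysctl name assumed from SYSCTL_TEST_REGISTER(vm_parameter_validation_kern_v2, ...) */
	if (sysctlbyname("debug.test.vm_parameter_validation_kern_v2",
	    &out_value, &out_len, &in_value, sizeof(in_value)) != 0) {
		return -1;
	}
	return (int)out_value;  /* KERN_TEST_SUCCESS, KERN_TEST_BAD_ARGS, or KERN_TEST_FAILED */
}
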
@@ -587,14 +633,10 @@ typedef struct { unsigned capacity; unsigned count; unsigned tested_count; + bool kernel_buffer_full; /* incomplete, parsed from a truncated buffer */ result_t list[]; } results_t; -extern results_t *golden_list[]; -extern results_t *kern_list[]; -static uint32_t num_tests = 0; // num of tests in golden list -static uint32_t num_kern_tests = 0; // num of tests in kernel results list - static __attribute__((overloadable)) results_t * alloc_results(const char *testname, char *testconfig, @@ -617,6 +659,7 @@ alloc_results(const char *testname, char *testconfig, results->capacity = capacity; results->count = 0; results->tested_count = 0; + results->kernel_buffer_full = false; return results; } @@ -661,9 +704,13 @@ static void __unused dealloc_results(results_t *results) { for (unsigned int i = 0; i < results->count; i++) { - kfree_str(results->list[i].name); + if (results->list[i].name) { + kfree_str(results->list[i].name); + } + } + if (results->testconfig) { + kfree_str(results->testconfig); } - kfree_str(results->testconfig); #if KERNEL kfree_type(results_t, result_t, results->capacity, results); #else @@ -693,6 +740,8 @@ append_result(results_t *results, int ret, const char *name) #define TRIALSFORMULA_DELIMITER "TRIALSFORMULA " #define TRIALSARGUMENTS_DELIMITER "TRIALSARGUMENTS" #define KERN_TESTRESULT_DELIMITER " RESULT " +#define KERN_FAILURE_DELIMITER "FAIL: " +#define KERN_RESULT_DELIMITER "\n" // print results, unformatted // This output is read by populate_kernel_results() @@ -727,6 +776,7 @@ dump_golden_results(results_t *results) goldenprintf(TESTRESULT_DELIMITER "%d: %d\n", i, results->list[i].ret); #if !KERNEL if (results->list[i].ret == OUT_PARAM_BAD) { + extern int out_param_bad_count; out_param_bad_count += 1; T_FAIL("Out parameter violation in test %s - %s\n", results->testname, results->list[i].name); } @@ -736,163 +786,6 @@ dump_golden_results(results_t *results) return results; } -#if !KERNEL -// Comparator function for sorting result_t list by name -static int -compare_names(const void *a, const void *b) -{ - assert(((const result_t *)a)->name); - assert(((const result_t *)b)->name); - return strcmp(((const result_t *)a)->name, ((const result_t *)b)->name); -} - -static unsigned -binary_search(result_t *list, unsigned count, const result_t *trial) -{ - assert(count > 0); - const char *name = trial->name; - unsigned left = 0, right = count - 1; - while (left <= right) { - unsigned mid = left + (right - left) / 2; - int cmp = strcmp(list[mid].name, name); - if (cmp == 0) { - return mid; - } else if (cmp < 0) { - left = mid + 1; - } else { - right = mid - 1; - } - } - return UINT_MAX; // Not found -} - -static inline bool -trial_name_equals(const result_t *a, const result_t *b) -{ - // NB: strlen match need to handle cases where a shorter 'bname' would match a longer 'aname'. - if (strlen(a->name) == strlen(b->name) && compare_names(a, b) == 0) { - return true; - } - return false; -} - -static const result_t * -get_golden_result(results_t *golden_results, const result_t *trial, unsigned trial_idx) -{ - if (golden_results->trialsformula == eUNKNOWN_TRIALS) { - // golden results don't contain trials names - T_LOG("%s: update test's alloc_results to have a valid trialsformula_t\n", golden_results->testname); - return NULL; - } - - if (trial_idx < golden_results->count && - golden_results->list[trial_idx].name && - trial_name_equals(&golden_results->list[trial_idx], trial)) { - // "fast search" path taken when golden file is in sync to test. 
- return &golden_results->list[trial_idx]; - } - - // "slow search" path taken when tests idxs are not aligned. Sort the array - // by name and do binary search. - qsort(golden_results->list, golden_results->count, sizeof(result_t), compare_names); - unsigned g_idx = binary_search(golden_results->list, golden_results->count, trial); - if (g_idx < golden_results->count) { - return &golden_results->list[g_idx]; - } - - return NULL; -} - -static void -test_results(results_t *golden_results, results_t *results) -{ - bool passed = TRUE; - unsigned result_count = results->count; - unsigned acceptable_count = 0; - const unsigned acceptable_max = 16; // log up to this many ACCEPTABLE results - const result_t *golden_result = NULL; - if (golden_results->count != results->count) { - T_LOG("%s: number of iterations mismatch (%u vs %u)", - results->testname, golden_results->count, results->count); - } - for (unsigned i = 0; i < result_count; i++) { - golden_result = get_golden_result(golden_results, &results->list[i], i); - if (golden_result) { - if (results->list[i].ret == ACCEPTABLE) { - // trial has declared itself to be correct - // no matter what the golden result is - acceptable_count++; - if (acceptable_count <= acceptable_max) { - T_LOG("%s RESULT ACCEPTABLE (expected %d), %s\n", - results->testname, - golden_result->ret, results->list[i].name); - } - } else if (results->list[i].ret != golden_result->ret) { - T_FAIL("%s RESULT %d (expected %d), %s\n", - results->testname, results->list[i].ret, - golden_result->ret, results->list[i].name); - passed = FALSE; - } - } else { - // new trial not present in golden results - T_FAIL("%s NEW RESULT %d, %s - (regenerate golden files to fix this)\n", - results->testname, results->list[i].ret, results->list[i].name); - passed = FALSE; - } - } - - if (acceptable_count > acceptable_max) { - T_LOG("%s %u more RESULT ACCEPTABLE trials not logged\n", - results->testname, acceptable_count - acceptable_max); - } - if (passed) { - T_PASS("%s passed\n", results->testname); - } -} -#endif - -#if !KERNEL -static results_t * -test_name_to_golden_results(const char* testname); -#endif - -static results_t * -process_results(results_t *results) -{ -#if KERNEL - if (kernel_generate_golden) { - return dump_golden_results(results); - } else { - return __dump_results(results); - } -#else - results_t *golden_results = NULL; - - if (dump && !generate_golden) { - __dump_results(results); - } - - if (generate_golden) { - dump_golden_results(results); - } - - if (should_test_results) { - golden_results = test_name_to_golden_results(results->testname); - - if (golden_results) { - test_results(golden_results, results); - } else { - T_FAIL("New test %s found, update golden list to allow return code testing", results->testname); - // Dump results if not done previously - if (!dump) { - __dump_results(results); - } - } - } - - return results; -#endif -} static inline mach_vm_address_t truncate_vm_map_addr_with_flags(MAP_T map, mach_vm_address_t addr, int flags) @@ -939,7 +832,7 @@ typedef struct { addr_t offset; } absolute_or_relative_offset_t; -typedef struct { +typedef struct offset_list_t { unsigned count; unsigned capacity; absolute_or_relative_offset_t list[]; @@ -968,6 +861,31 @@ append_offset(offset_list_t *offsets, bool is_absolute, addr_t offset) offsets->count++; } +#if KERNEL + +/* kernel globals are shared across processes, store cached offsets in thread-local storage */ +#define CACHE_OFFSETS(name, ctor) \ + offset_list_t *name = get_globals()->name; \ + do { \ + if (name 
== NULL) { \ + name = ctor(); \ + get_globals()->name = name; \ + } \ + } while (0) + +#else /* not KERNEL */ + +/* userspace test is single-threaded, store cached offsets in a static variable */ +#define CACHE_OFFSETS(name, ctor) \ + static offset_list_t *name; \ + do { \ + if (name == NULL) { \ + name = ctor(); \ + } \ + } while (0) + +#endif /* not KERNEL */ + ///////////////////////////////////////////////////// // Generation of trials and their parameter values @@ -2120,10 +2038,9 @@ slide_trial(addr_trial_t trial, mach_vm_address_t slide) static const offset_list_t * get_addr_trial_offsets(void) { - static offset_list_t *offsets; addr_t test_page_size = adjust_page_size(); - if (!offsets) { - offsets = allocate_offsets(20); + CACHE_OFFSETS(addr_trial_offsets, ^{ + offset_list_t *offsets = allocate_offsets(20); append_offset(offsets, true, 0); append_offset(offsets, true, 1); append_offset(offsets, true, 2); @@ -2145,8 +2062,9 @@ get_addr_trial_offsets(void) append_offset(offsets, false, 2); append_offset(offsets, false, test_page_size - 2); append_offset(offsets, false, test_page_size - 1); - } - return offsets; + return offsets; + }); + return addr_trial_offsets; } TRIALS_IMPL(addr) @@ -2212,10 +2130,9 @@ typedef struct { static const offset_list_t * get_size_trial_offsets(void) { - static offset_list_t *offsets; addr_t test_page_size = adjust_page_size(); - if (!offsets) { - offsets = allocate_offsets(15); + CACHE_OFFSETS(size_trial_offsets, ^{ + offset_list_t *offsets = allocate_offsets(15); append_offset(offsets, true, 0); append_offset(offsets, true, 1); append_offset(offsets, true, 2); @@ -2231,8 +2148,9 @@ get_size_trial_offsets(void) append_offset(offsets, true, -(mach_vm_address_t)test_page_size + 2); append_offset(offsets, true, -(mach_vm_address_t)2); append_offset(offsets, true, -(mach_vm_address_t)1); - } - return offsets; + return offsets; + }); + return size_trial_offsets; } TRIALS_IMPL(size) @@ -2303,19 +2221,19 @@ get_start_size_trial_start_offsets(void) static const offset_list_t * get_start_size_trial_size_offsets(void) { - static offset_list_t *offsets; - if (!offsets) { + CACHE_OFFSETS(start_size_trial_offsets, ^{ // use each size offset twice: once absolute and once relative const offset_list_t *old_offsets = get_size_trial_offsets(); - offsets = allocate_offsets(2 * old_offsets->count); + offset_list_t *offsets = allocate_offsets(2 * old_offsets->count); for (unsigned i = 0; i < old_offsets->count; i++) { - append_offset(offsets, true, old_offsets->list[i].offset); + append_offset(offsets, true, old_offsets->list[i].offset); } for (unsigned i = 0; i < old_offsets->count; i++) { - append_offset(offsets, false, old_offsets->list[i].offset); + append_offset(offsets, false, old_offsets->list[i].offset); } - } - return offsets; + return offsets; + }); + return start_size_trial_offsets; } TRIALS_IMPL(start_size) @@ -2448,10 +2366,9 @@ slide_trial(start_size_offset_object_trial_t trial, mach_vm_address_t slide) static offset_list_t * get_ssoo_absolute_offsets() { - static offset_list_t *offsets; addr_t test_page_size = adjust_page_size(); - if (!offsets) { - offsets = allocate_offsets(20); + CACHE_OFFSETS(ssoo_absolute_offsets, ^{ + offset_list_t *offsets = allocate_offsets(20); append_offset(offsets, true, 0); append_offset(offsets, true, 1); append_offset(offsets, true, 2); @@ -2467,21 +2384,21 @@ get_ssoo_absolute_offsets() append_offset(offsets, true, -(mach_vm_address_t)test_page_size + 2); append_offset(offsets, true, -(mach_vm_address_t)2); 
append_offset(offsets, true, -(mach_vm_address_t)1); - } - return offsets; + return offsets; + }); + return ssoo_absolute_offsets; } static offset_list_t * get_ssoo_absolute_and_relative_offsets() { - static offset_list_t *offsets; addr_t test_page_size = adjust_page_size(); - if (!offsets) { + CACHE_OFFSETS(ssoo_absolute_and_relative_offsets, ^{ const offset_list_t *old_offsets = get_ssoo_absolute_offsets(); - offsets = allocate_offsets(old_offsets->count + 5); + offset_list_t *offsets = allocate_offsets(old_offsets->count + 5); // absolute offsets for (unsigned i = 0; i < old_offsets->count; i++) { - append_offset(offsets, true, old_offsets->list[i].offset); + append_offset(offsets, true, old_offsets->list[i].offset); } // relative offsets append_offset(offsets, false, 0); @@ -2489,8 +2406,9 @@ get_ssoo_absolute_and_relative_offsets() append_offset(offsets, false, 2); append_offset(offsets, false, test_page_size - 2); append_offset(offsets, false, test_page_size - 1); - } - return offsets; + return offsets; + }); + return ssoo_absolute_and_relative_offsets; } start_size_offset_object_trials_t * diff --git a/osfmk/tests/vm_parameter_validation_kern.c b/osfmk/tests/vm_parameter_validation_kern.c index 3a762e06c..c3c840ae4 100644 --- a/osfmk/tests/vm_parameter_validation_kern.c +++ b/osfmk/tests/vm_parameter_validation_kern.c @@ -10,19 +10,7 @@ #pragma clang diagnostic ignored "-Wgcc-compat" -// Kernel sysctl test prints its output into a userspace buffer. -// fixme these global variables prevent test concurrency - -static user_addr_t SYSCTL_OUTPUT_BUF; -static user_addr_t SYSCTL_OUTPUT_END; - -// This is a read/write fd passed from userspace. -// It's passed to make it easier for kernel tests to interact with a file. -static int file_descriptor; - -// Output to create a golden test result in kern test, controlled by -// MSB in file_descriptor and set by GENERATE_GOLDEN_IMAGE from userspace. -bool kernel_generate_golden = FALSE; +DEFINE_TEST_IDENTITY(test_identity_vm_parameter_validation_kern); // vprintf() to a userspace buffer // output is incremented to point at the new nul terminator @@ -35,17 +23,47 @@ user_vprintf(user_addr_t *output, user_addr_t output_end, const char *format, va printed = vsnprintf(linebuf, sizeof(linebuf), format, args); assert(printed < sizeof(linebuf) - 1); - assert(*output + printed + 1 < output_end); - copyout(linebuf, *output, printed + 1); - *output += printed; + if (*output + printed + 1 < output_end) { + copyout(linebuf, *output, printed + 1); + *output += printed; + + /* *output + 1 == output_end occurs only after the error case below */ + assert(*output + 1 < output_end); + } else if (*output + 1 < output_end) { + /* + * Not enough space in the output buffer for this text. + * Print as much as we can, then rewind and terminate + * the buffer with an error message. + * The tests will continue to run after this, but they + * won't be able to output anything more. + */ + static const char err_msg[] = + KERN_RESULT_DELIMITER KERN_FAILURE_DELIMITER + "kernel output buffer full, output truncated\n"; + size_t err_len = strlen(err_msg); + size_t printable = output_end - *output - 1; + assert(printable <= printed); + copyout(linebuf, *output, printable + 1); + copyout(err_msg, output_end - err_len - 1, err_len + 1); + *output = output_end - 1; + } else { + /* + * Not enough space in the output buffer, + * and we already inserted the error message. + * Do nothing. + */ + assert(*output + 1 == output_end); + } } void testprintf(const char *format, ...) 
{ + vm_parameter_validation_kern_thread_context_t *globals = get_globals(); + va_list args; va_start(args, format); - user_vprintf(&SYSCTL_OUTPUT_BUF, SYSCTL_OUTPUT_END, format, args); + user_vprintf(&globals->output_buffer_cur, globals->output_buffer_end, format, args); va_end(args); } @@ -510,6 +528,27 @@ static kern_return_t call_mach_vm_wire_level_monitor(int64_t requested_pages) { kern_return_t kr = mach_vm_wire_level_monitor(requested_pages); + /* + * KERN_RESOURCE_SHORTAGE and KERN_SUCCESS are + * equivalent acceptable results for this test. + */ + if (kr == KERN_RESOURCE_SHORTAGE) { +#if !defined(XNU_TARGET_OS_BRIDGE) + kr = KERN_SUCCESS; +#else /* defined(XNU_TARGET_OS_BRIDGE) */ + /* + * ...but the bridgeOS golden file recorded + * KERN_RESOURCE_SHORTAGE for some values so + * match that to avoid a golden file update. + * This code can be removed during any golden file update. + */ + if (requested_pages == 1 || requested_pages == 2) { + kr = KERN_SUCCESS; + } else { + kr = KERN_RESOURCE_SHORTAGE; + } +#endif /* defined(XNU_TARGET_OS_BRIDGE) */ + } return kr; } @@ -565,10 +604,10 @@ will_copyio_panic_in_copy_validate(void *kernel_addr, vm_size_t size) return true; } - bool in_kva = (VM_KERNEL_STRIP_UPTR(kernel_addr) >= VM_MIN_KERNEL_ADDRESS) && - (VM_KERNEL_STRIP_UPTR(kernel_addr_last) <= VM_MAX_KERNEL_ADDRESS); - bool in_physmap = (VM_KERNEL_STRIP_UPTR(kernel_addr) >= physmap_base) && - (VM_KERNEL_STRIP_UPTR(kernel_addr_last) <= physmap_end); + bool in_kva = (VM_KERNEL_STRIP_PTR(kernel_addr) >= VM_MIN_KERNEL_ADDRESS) && + (VM_KERNEL_STRIP_PTR(kernel_addr_last) <= VM_MAX_KERNEL_ADDRESS); + bool in_physmap = (VM_KERNEL_STRIP_PTR(kernel_addr) >= physmap_base) && + (VM_KERNEL_STRIP_PTR(kernel_addr_last) <= physmap_end); if (!(in_kva || in_physmap)) { return true; @@ -625,14 +664,9 @@ call_vm_map_write_user(MAP_T map, void * ptr, vm_map_address_t dst_addr, vm_size static kern_return_t call_vm_map_copy_overwrite_interruptible(MAP_T dst_map, vm_map_copy_t copy, mach_vm_address_t dst_addr, mach_vm_size_t copy_size) { - kern_return_t kr = vm_map_copy_overwrite(dst_map, dst_addr, copy, copy_size, TRUE); + kern_return_t kr = vm_map_copy_overwrite(dst_map, dst_addr, copy, copy_size, + TRUE); - const mach_vm_size_t va_mask = ((1ULL << 48) - 1); - if ((dst_addr & ~va_mask) == 0ULL && ((dst_addr + copy_size) & ~va_mask) == ~va_mask) { - if (kr == KERN_INVALID_ADDRESS) { - return ACCEPTABLE; - } - } return kr; } @@ -770,30 +804,6 @@ call_vm_map_purgable_control__purgeable_state(MAP_T map, vm_address_t addr, vm_p return kr; } -#if XNU_PLATFORM_MacOSX -static void -check_vm_region_object_create_outparam_changes(kern_return_t * kr, ipc_port_t handle) -{ - if (handle == NULL) { - *kr = OUT_PARAM_BAD; - } -} - -static kern_return_t -call_vm_region_object_create(MAP_T map, vm_size_t size) -{ - ipc_port_t handle = NULL; - kern_return_t kr = vm_region_object_create(map, size, &handle); - check_vm_region_object_create_outparam_changes(&kr, handle); - - if (kr == KERN_SUCCESS) { - mach_memory_entry_port_release(handle); - } - - return kr; -} -#endif /* #if XNU_PLATFORM_MacOSX */ - static kern_return_t call_vm_map_page_info(MAP_T map, mach_vm_address_t addr) { @@ -995,6 +1005,7 @@ test_kext_unix_with_allocated_vnode_addr(kern_return_t (*func)(MAP_T dst_map, ma for (unsigned i = 0; i < trials->count; i++) { mach_vm_address_t addr = (mach_vm_address_t)trials->list[i].addr; + int file_descriptor = get_globals()->file_descriptor; struct file_control_return control_info = get_control_from_fd(file_descriptor); 
vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true); kern_return_t kr = vm_map_enter_mem_object_control(map, &addr, TEST_ALLOC_SIZE, 0, vmk_flags, (memory_object_control_t) control_info.control, 0, false, VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); @@ -1571,6 +1582,7 @@ vm_map_enter_mem_object_control_wrapped( vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; vm_map_kernel_flags_set_vmflags(&vmk_flags, flags); + int file_descriptor = get_globals()->file_descriptor; struct file_control_return control_info = get_control_from_fd(file_descriptor); kern_return_t kr = vm_map_enter_mem_object_control(target_map, &vmmaddr, size, mask, vmk_flags, (memory_object_control_t) control_info.control, offset, copy, cur_protection, max_protection, inheritance); check_vm_map_enter_mem_object_control_outparam_changes(&kr, vmmaddr, *address, flags, target_map); @@ -1723,43 +1735,61 @@ IMPL(vm_map_enter_mem_object_control_wrapped) #undef IMPL +static void +cleanup_context(vm_parameter_validation_kern_thread_context_t *ctx) +{ + thread_cleanup_test_context(&ctx->ttc); +} + +static results_t * +process_results(results_t *results) +{ + if (get_globals()->generate_golden) { + return dump_golden_results(results); + } else { + return __dump_results(results); + } +} + static int vm_parameter_validation_kern_test(int64_t in_value, int64_t *out_value) { - // in_value has the userspace address of the fixed-size output buffer and a file descriptor. - // The address is KB16 aligned, so the bottom bits are used for the fd. - // fd bit 15 also indicates if we want to generate golden results. - // in_value is KB16 aligned - uint64_t fd_mask = KB16 - 1; - file_descriptor = (int)(((uint64_t) in_value) & fd_mask); - uint64_t buffer_address = in_value - file_descriptor; - SYSCTL_OUTPUT_BUF = buffer_address; - SYSCTL_OUTPUT_END = SYSCTL_OUTPUT_BUF + SYSCTL_OUTPUT_BUFFER_SIZE; - - // check if running to generate golden result list via boot-arg - kernel_generate_golden = (file_descriptor & (KB16 >> 1)) > 0; - if (kernel_generate_golden) { - file_descriptor &= ~(KB16 >> 1); + // Copyin the arguments from userspace. + // Fail if the structure sizes don't match. + vm_parameter_validation_kern_args_t args; + if (copyin(in_value, &args, sizeof(args)) != 0 || + args.sizeof_args != sizeof(args)) { + *out_value = KERN_TEST_BAD_ARGS; + return 0; } - // Test options: - // - avoid panics for untagged wired memory (set to true during some tests) - // - clamp vm addresses before passing to pmap to avoid pmap panics - thread_test_context_t ctx CLEANUP_THREAD_TEST_CONTEXT = { - .test_option_vm_prevent_wire_tag_panic = false, - .test_option_vm_map_clamp_pmap_remove = true, + // Use the thread test context to store our "global" variables. 
+ vm_parameter_validation_kern_thread_context_t ctx + __attribute__((cleanup(cleanup_context))) = { + .ttc = { + .ttc_identity = test_identity_vm_parameter_validation_kern, + // - avoid panics for untagged wired memory (set to true during some tests) + // - clamp vm addresses before passing to pmap to avoid pmap panics + .test_option_vm_prevent_wire_tag_panic = false, + .test_option_vm_map_clamp_pmap_remove = true, + }, + .output_buffer_start = args.output_buffer_address, + .output_buffer_cur = args.output_buffer_address, + .output_buffer_end = args.output_buffer_address + args.output_buffer_size, + .file_descriptor = (int)args.file_descriptor, + .generate_golden = args.generate_golden, }; - thread_set_test_context(&ctx); + thread_set_test_context(&ctx.ttc); #if !CONFIG_SPTM && (__ARM_42BIT_PA_SPACE__ || ARM_LARGE_MEMORY) - if (kernel_generate_golden) { + if (get_globals()->generate_golden) { // Some devices skip some trials to avoid timeouts. // Golden files cannot be generated on these devices. testprintf("Can't generate golden files on this device " "(PPL && (__ARM_42BIT_PA_SPACE__ || ARM_LARGE_MEMORY)). " "Try again on a different device.\n"); - *out_value = 0; // failure - goto done; + *out_value = KERN_TEST_FAILED; + return 0; } #else #pragma clang diagnostic ignored "-Wunused-label" @@ -2108,11 +2138,6 @@ vm_parameter_validation_kern_test(int64_t in_value, int64_t *out_value) RUN(call_mach_vm_region, "mach_vm_region"); RUN(call_vm_region, "vm_region"); #undef RUN -#if XNU_PLATFORM_MacOSX -#define RUN(fn, name) dealloc_results(process_results(test_mach_with_size(fn, name " (size)"))) - RUN(call_vm_region_object_create, "vm_region_object_create"); -#undef RUN -#endif /* * -- page info functions -- @@ -2132,11 +2157,10 @@ vm_parameter_validation_kern_test(int64_t in_value, int64_t *out_value) dealloc_results(process_results(test_kext_unix_with_allocated_vnode_addr(call_task_find_region_details, "task_find_region_details (addr)"))); - *out_value = 1; // success -done: - SYSCTL_OUTPUT_BUF = 0; - SYSCTL_OUTPUT_END = 0; + *out_value = KERN_TEST_SUCCESS; return 0; } -SYSCTL_TEST_REGISTER(vm_parameter_validation_kern, vm_parameter_validation_kern_test); +// The "_v2" suffix is here because sysctl "vm_parameter_validation_kern" was an +// older version of this test that used incompatibly different sysctl parameters. +SYSCTL_TEST_REGISTER(vm_parameter_validation_kern_v2, vm_parameter_validation_kern_test); diff --git a/osfmk/vm/Makefile b/osfmk/vm/Makefile index f8852a010..9c58933e1 100644 --- a/osfmk/vm/Makefile +++ b/osfmk/vm/Makefile @@ -55,11 +55,13 @@ XNU_ONLY_EXPORTS = \ XNU_ONLY_EXPORTS += \ vm_compressor_backing_store_internal.h \ vm_dyld_pager_internal.h \ + vm_log.h \ vm_map_internal.h \ vm_protos_internal.h \ vm_sanitize_internal.h \ vm_sanitize_telemetry.h \ - vm_shared_region_internal.h + vm_shared_region_internal.h \ + vm_lock_perf.h # Internal files exported with specific subsystems # who needs access to some VM internals. 
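The user_vprintf() rework above writes as much output as fits and then rewinds to stamp a "buffer full" failure marker, so userspace can detect truncation (see the kernel_buffer_full flag in results_t). A self-contained sketch of the same pattern, independent of the kernel copyout path and assuming the buffer capacity exceeds the sentinel length:

#include <string.h>

#define ERR_MSG "\nFAIL: output buffer full, output truncated\n"

static void
append_line(char *buf, size_t cap, size_t *used, const char *line)
{
	size_t len = strlen(line);

	if (*used + len + 1 < cap) {
		/* Whole line plus NUL fits: append and advance. */
		memcpy(buf + *used, line, len + 1);
		*used += len;
	} else if (*used + 1 < cap) {
		/* First overflow: copy what fits, then overwrite the tail
		 * with the sentinel so readers can see the truncation. */
		size_t room = cap - *used - 1;
		size_t err_len = strlen(ERR_MSG);

		memcpy(buf + *used, line, room);
		memcpy(buf + cap - err_len - 1, ERR_MSG, err_len + 1);
		*used = cap - 1;
	}
	/* Otherwise the sentinel is already in place; drop further output. */
}
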
diff --git a/osfmk/vm/analytics.c b/osfmk/vm/analytics.c index afdba6621..5e355b9bf 100644 --- a/osfmk/vm/analytics.c +++ b/osfmk/vm/analytics.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include #if CONFIG_EXCLAVES @@ -83,7 +83,7 @@ add_trial_uuids(char *treatment_id, char *experiment_id) } static void -report_vm_swapusage() +report_vm_swapusage(void) { uint64_t max_alloced, max_used; ca_event_t event = CA_EVENT_ALLOCATE(vm_swapusage); @@ -98,7 +98,7 @@ report_vm_swapusage() } static void -report_mlock_failures() +report_mlock_failures(void) { ca_event_t event = CA_EVENT_ALLOCATE(mlock_failures); CA_EVENT_TYPE(mlock_failures) * e = event->data; @@ -148,11 +148,11 @@ typedef struct { * Report the age of segments in the compressor. */ static void -report_compressor_age() +report_compressor_age(void) { /* If the compressor is not configured, do nothing and return early. */ if (vm_compressor_mode == VM_PAGER_NOT_CONFIGURED) { - os_log(OS_LOG_DEFAULT, "%s: vm_compressor_mode == VM_PAGER_NOT_CONFIGURED, returning early", __func__); + vm_log("%s: vm_compressor_mode == VM_PAGER_NOT_CONFIGURED, returning early", __func__); return; } @@ -212,7 +212,7 @@ CA_EVENT(accounting_health, CA_INT, percentage); * Report health of resident vm page accounting. */ static void -report_accounting_health() +report_accounting_health(void) { /** * @note If a new accounting bucket is added, it must also be added in @@ -239,7 +239,7 @@ report_accounting_health() } static void -schedule_analytics_thread_call() +schedule_analytics_thread_call(void) { static const uint64_t analytics_period_ns = ANALYTICS_PERIOD_HOURS * 60 * 60 * NSEC_PER_SEC; uint64_t analytics_period_absolutetime; @@ -269,7 +269,7 @@ vm_analytics_tick(void *arg0, void *arg1) } static void -vm_analytics_init() +vm_analytics_init(void) { vm_analytics_thread_call = thread_call_allocate_with_options(vm_analytics_tick, NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE); schedule_analytics_thread_call(); diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index 545d8f74c..0a9f38a22 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -46,9 +46,6 @@ #include #include -#include -#include - #include #include #include @@ -254,6 +251,20 @@ memory_object_control_uiomove( } if (mark_dirty) { +#if CONFIG_SPTM + if (__improbable(PMAP_PAGE_IS_USER_EXECUTABLE(dst_page))) { + /* + * This is analogous to the PMAP_OPTIONS_RETYPE disconnect we perform + * in vm_object_upl_request() when setting up a UPL to overwrite the + * destination pages, which is the UPL-based analogue of this path. + * See the comment there for the gory details, but it essentially boils + * down to the same situation of being asked to overwrite page contents + * that were already marked executable from some prior use of the vnode + * associated with this VM object. + */ + pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(dst_page), PMAP_OPTIONS_RETYPE, NULL); + } +#endif /* CONFIG_SPTM */ if (dst_page->vmp_dirty == FALSE) { dirty_count++; } @@ -919,7 +930,7 @@ vnode_object_create( * The vm_map call takes both named entry ports and raw memory * objects in the same parameter. We need to make sure that * vm_map does not see this object as a named entry port. So, - * we reserve the first word in the object for a fake ip_kotype + * we reserve the first word in the object for a fake object type * setting - that will tell vm_map to use it as a memory object. 
*/ vnode_object->vn_pgr_hdr.mo_ikot = IKOT_MEMORY_OBJECT; @@ -979,10 +990,13 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal * boolean_t do_region_footprint; int effective_page_shift, effective_page_size; + vmlp_api_start(FILL_PROCREGIONINFO); + task_lock(task); map = task->map; if (map == VM_MAP_NULL) { task_unlock(task); + vmlp_api_end(FILL_PROCREGIONINFO, 0); return 0; } @@ -998,7 +1012,7 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal * start = address; - if (!vm_map_lookup_entry_allow_pgz(map, start, &tmp_entry)) { + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { if (do_region_footprint && address == tmp_entry->vme_end) { @@ -1020,6 +1034,7 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal * /* nothing to report */ vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(FILL_PROCREGIONINFO, 0); return 0; } @@ -1052,10 +1067,12 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal * vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(FILL_PROCREGIONINFO, 1); return 1; } vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(FILL_PROCREGIONINFO, 0); return 0; } } else { @@ -1063,6 +1080,7 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal * } start = entry->vme_start; + vmlp_range_event_entry(map, entry); pinfo->pri_offset = VME_OFFSET(entry); pinfo->pri_protection = entry->protection; @@ -1119,12 +1137,14 @@ fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal * if (fill_vnodeinfoforaddr(entry, vnodeaddr, vid, NULL) == 0) { vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(FILL_PROCREGIONINFO, 1); return 1; } } vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(FILL_PROCREGIONINFO, 1); return 1; } @@ -1136,10 +1156,13 @@ fill_procregioninfo_onlymappedvnodes(task_t task, uint64_t arg, struct proc_regi vm_map_entry_t tmp_entry; vm_map_entry_t entry; + vmlp_api_start(FILL_PROCREGIONINFO_ONLYMAPPEDVNODES); + task_lock(task); map = task->map; if (map == VM_MAP_NULL) { task_unlock(task); + vmlp_api_end(FILL_PROCREGIONINFO_ONLYMAPPEDVNODES, 0); return 0; } vm_map_reference(map); @@ -1147,10 +1170,11 @@ fill_procregioninfo_onlymappedvnodes(task_t task, uint64_t arg, struct proc_regi vm_map_lock_read(map); - if (!vm_map_lookup_entry_allow_pgz(map, address, &tmp_entry)) { + if (!vm_map_lookup_entry(map, address, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(FILL_PROCREGIONINFO_ONLYMAPPEDVNODES, 0); return 0; } } else { @@ -1158,6 +1182,7 @@ fill_procregioninfo_onlymappedvnodes(task_t task, uint64_t arg, struct proc_regi } while (entry != vm_map_to_entry(map)) { + vmlp_range_event_entry(map, entry); *vnodeaddr = 0; *vid = 0; @@ -1193,6 +1218,7 @@ fill_procregioninfo_onlymappedvnodes(task_t task, uint64_t arg, struct proc_regi vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(FILL_PROCREGIONINFO_ONLYMAPPEDVNODES, 1); return 1; } } @@ -1203,6 +1229,7 @@ fill_procregioninfo_onlymappedvnodes(task_t task, uint64_t arg, struct proc_regi vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(FILL_PROCREGIONINFO_ONLYMAPPEDVNODES, 0); return 0; } @@ -1222,6 +1249,8 @@ task_find_region_details( vm_map_entry_t entry; int rc; + vmlp_api_start(TASK_FIND_REGION_DETAILS); + rc = 0; *vp_p = 0; 
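/*
 * Illustrative sketch (not part of this patch): the lock-perf instrumentation
 * pattern this hunk threads through fill_procregioninfo() and friends.  Every
 * entry point brackets its work with vmlp_api_start()/vmlp_api_end() --
 * including each early-return path -- and reports each map entry it visits
 * with vmlp_range_event_entry().  The MY_HYPOTHETICAL_API tag and the helper
 * itself are invented for illustration.
 */
static int
my_hypothetical_region_walker(vm_map_t map, vm_map_offset_t addr)
{
	vm_map_entry_t entry;
	int rc = 0;

	vmlp_api_start(MY_HYPOTHETICAL_API);
	vm_map_lock_read(map);
	if (vm_map_lookup_entry(map, addr, &entry)) {
		vmlp_range_event_entry(map, entry);   /* record the range we touched */
		rc = 1;
	}
	vm_map_unlock_read(map);
	vmlp_api_end(MY_HYPOTHETICAL_API, rc);        /* always paired with the start */
	return rc;
}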
*vid_p = 0; @@ -1229,6 +1258,7 @@ task_find_region_details( *start_p = 0; *len_p = 0; if (options & ~FIND_REGION_DETAILS_OPTIONS_ALL) { + vmlp_api_end(TASK_FIND_REGION_DETAILS, 0); return 0; } @@ -1236,13 +1266,14 @@ task_find_region_details( map = task->map; if (map == VM_MAP_NULL) { task_unlock(task); + vmlp_api_end(TASK_FIND_REGION_DETAILS, 0); return 0; } vm_map_reference(map); task_unlock(task); vm_map_lock_read(map); - if (!vm_map_lookup_entry_allow_pgz(map, offset, &entry)) { + if (!vm_map_lookup_entry(map, offset, &entry)) { if (options & FIND_REGION_DETAILS_AT_OFFSET) { /* no mapping at this offset */ goto ret; @@ -1258,6 +1289,8 @@ task_find_region_details( for (; entry != vm_map_to_entry(map); entry = entry->vme_next) { + vmlp_range_event_entry(map, entry); + if (entry->is_sub_map) { /* fallthru to check next entry */ } else if (fill_vnodeinfoforaddr(entry, vp_p, vid_p, is_map_shared_p)) { @@ -1288,6 +1321,7 @@ task_find_region_details( ret: vm_map_unlock_read(map); vm_map_deallocate(map); + vmlp_api_end(TASK_FIND_REGION_DETAILS, rc); return rc; } diff --git a/osfmk/vm/device_vm.c b/osfmk/vm/device_vm.c index 6453be19c..226bcd347 100644 --- a/osfmk/vm/device_vm.c +++ b/osfmk/vm/device_vm.c @@ -39,8 +39,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/osfmk/vm/pmap.h b/osfmk/vm/pmap.h index 4b74b0c9c..f69a10fe2 100644 --- a/osfmk/vm/pmap.h +++ b/osfmk/vm/pmap.h @@ -155,6 +155,11 @@ extern void pmap_disable_user_jop( #include #include + +#if CONFIG_SPTM +#include +#endif + /* * Routines used for initialization. * There is traditionally also a pmap_bootstrap, @@ -218,6 +223,17 @@ __enum_decl(pmap_mapping_type_t, uint8_t, { PMAP_MAPPING_TYPE_ROZONE = XNU_ROZONE, PMAP_MAPPING_TYPE_RESTRICTED = XNU_KERNEL_RESTRICTED }); + +#define PMAP_PAGE_IS_USER_EXECUTABLE(m) \ +({ \ + const sptm_paddr_t __paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(m)); \ + const sptm_frame_type_t __frame_type = sptm_get_frame_type(__paddr); \ + sptm_type_is_user_executable(__frame_type); \ +}) + +extern bool pmap_will_retype(pmap_t pmap, vm_map_address_t vaddr, ppnum_t pn, + vm_prot_t prot, unsigned int options, pmap_mapping_type_t mapping_type); + #else __enum_decl(pmap_mapping_type_t, uint8_t, { PMAP_MAPPING_TYPE_INFER = 0, @@ -440,6 +456,9 @@ extern ppnum_t unified_page_list_iterator_page( const unified_page_list_iterator_t *iter, bool *is_fictitious); +extern vm_page_t unified_page_list_iterator_vm_page( + const unified_page_list_iterator_t *iter); + extern void pmap_batch_set_cache_attributes( const unified_page_list_t *, unsigned int); @@ -447,6 +466,16 @@ extern void pmap_sync_page_data_phys(ppnum_t pa); extern void pmap_sync_page_attributes_phys(ppnum_t pa); +/** + * pmap entry point for performing platform-specific integrity checks and cleanup when + * the VM is about to free a page. This function will typically at least validate + * that the page has no outstanding mappings or other references, and depending + * upon the platform may also take additional steps to reset page state. + * + * @param pn The page that is about to be freed by the VM. + */ +extern void pmap_recycle_page(ppnum_t pn); + /* * debug/assertions. pmap_verify_free returns true iff * the given physical page is mapped into no pmap. 
@@ -636,6 +665,11 @@ extern void(pmap_pageable)( extern uint64_t pmap_shared_region_size_min(pmap_t map); +extern void + pmap_set_shared_region(pmap_t, + pmap_t, + addr64_t, + uint64_t); extern kern_return_t pmap_nest(pmap_t, pmap_t, addr64_t, @@ -646,19 +680,9 @@ extern kern_return_t pmap_unnest(pmap_t, #define PMAP_UNNEST_CLEAN 1 -#if __arm64__ -#if CONFIG_SPTM -#define PMAP_FORK_NEST 1 -#endif /* CONFIG_SPTM */ - -#if PMAP_FORK_NEST extern kern_return_t pmap_fork_nest( pmap_t old_pmap, - pmap_t new_pmap, - vm_map_offset_t *nesting_start, - vm_map_offset_t *nesting_end); -#endif /* PMAP_FORK_NEST */ -#endif /* __arm64__ */ + pmap_t new_pmap); extern kern_return_t pmap_unnest_options(pmap_t, addr64_t, @@ -695,6 +719,8 @@ extern const pmap_t kernel_pmap; /* The kernel's map */ #define PMAP_CREATE_TEST 0x4 /* pmap will be used for testing purposes only */ #define PMAP_CREATE_KNOWN_FLAGS (PMAP_CREATE_64BIT | PMAP_CREATE_EPT | PMAP_CREATE_TEST) +#define PMAP_CREATE_NESTED 0 /* this flag is a nop on x86 */ + #else #define PMAP_CREATE_STAGE2 0 @@ -717,9 +743,11 @@ extern const pmap_t kernel_pmap; /* The kernel's map */ #define PMAP_CREATE_TEST 0x40 /* pmap will be used for testing purposes only */ +#define PMAP_CREATE_NESTED 0x80 /* pmap will not try to allocate a subpage root table to save space */ + /* Define PMAP_CREATE_KNOWN_FLAGS in terms of optional flags */ #define PMAP_CREATE_KNOWN_FLAGS (PMAP_CREATE_64BIT | PMAP_CREATE_STAGE2 | PMAP_CREATE_DISABLE_JOP | \ - PMAP_CREATE_FORCE_4K_PAGES | PMAP_CREATE_X86_64 | PMAP_CREATE_ROSETTA | PMAP_CREATE_TEST) + PMAP_CREATE_FORCE_4K_PAGES | PMAP_CREATE_X86_64 | PMAP_CREATE_ROSETTA | PMAP_CREATE_TEST | PMAP_CREATE_NESTED) #endif /* __x86_64__ */ @@ -754,6 +782,11 @@ extern const pmap_t kernel_pmap; /* The kernel's map */ /* Indicates that pmap_enter() or pmap_remove() is being called with preemption already disabled. */ #define PMAP_OPTIONS_NOPREEMPT 0x80000 +#if CONFIG_SPTM +/* Requests pmap_disconnect() to reset the page frame type (only meaningful for SPTM systems) */ +#define PMAP_OPTIONS_RETYPE 0x100000 +#endif /* CONFIG_SPTM */ + #define PMAP_OPTIONS_MAP_TPRO 0x40000 #define PMAP_OPTIONS_RESERVED_MASK 0xFF000000 /* encoding space reserved for internal pmap use */ @@ -804,7 +837,6 @@ extern bool pmap_get_jit_entitled(pmap_t pmap); /* Inform the pmap layer that the XO register is repurposed for this map */ extern void pmap_set_tpro(pmap_t pmap); - /* Ask the pmap layer if there is a TPRO entry in this map. 
*/ extern bool pmap_get_tpro(pmap_t pmap); diff --git a/osfmk/vm/pmap_cs.h b/osfmk/vm/pmap_cs.h index 74ae04920..66dda8f25 100644 --- a/osfmk/vm/pmap_cs.h +++ b/osfmk/vm/pmap_cs.h @@ -110,7 +110,7 @@ typedef struct _pmap_img4_payload { } pmap_img4_payload_t; /* State for whether developer mode has been set or not */ -extern bool ppl_developer_mode_set; +extern uint8_t ppl_developer_mode_set; /* State of developer mode on the system */ extern bool ppl_developer_mode_storage; diff --git a/osfmk/vm/vm32_user.c b/osfmk/vm/vm32_user.c index c92a616d8..1a676cc1b 100644 --- a/osfmk/vm/vm32_user.c +++ b/osfmk/vm/vm32_user.c @@ -651,7 +651,11 @@ kern_return_t vm32_vm_map_exec_lockdown( vm_map_t map) { + vmlp_api_start(VM32__MAP_EXEC_LOCKDOWN); + vmlp_range_event_none(map); + if (map == VM_MAP_NULL) { + vmlp_api_end(VM32__MAP_EXEC_LOCKDOWN, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -659,6 +663,7 @@ vm32_vm_map_exec_lockdown( map->map_disallow_new_exec = TRUE; vm_map_unlock(map); + vmlp_api_end(VM32__MAP_EXEC_LOCKDOWN, KERN_SUCCESS); return KERN_SUCCESS; } diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index 7bc1e8b81..da5b127ab 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -50,9 +50,6 @@ #include -#include -#include - #include #include #include @@ -1036,7 +1033,7 @@ apple_protect_pager_create( * The vm_map call takes both named entry ports and raw memory * objects in the same parameter. We need to make sure that * vm_map does not see this object as a named entry port. So, - * we reserve the first word in the object for a fake ip_kotype + * we reserve the first word in the object for a fake object type * setting - that will tell vm_map to use it as a memory object. */ pager->ap_pgr_hdr.mo_ikot = IKOT_MEMORY_OBJECT; diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index 36457115f..be836d2c5 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include @@ -455,6 +455,12 @@ vm_compressor_needs_to_major_compact(void) return false; } +uint32_t +vm_compressor_get_swapped_segment_count(void) +{ + return c_swappedout_count + c_swappedout_sparse_count; +} + uint32_t vm_compressor_incore_fragmentation_wasted_pages(void) { @@ -488,14 +494,12 @@ vm_compressor_needs_to_minor_compact(void) return is_fragmented; } - uint64_t vm_available_memory(void) { return ((uint64_t)AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE_64; } - uint32_t vm_compressor_pool_size(void) { @@ -999,7 +1003,7 @@ vm_compressor_init(void) } #else if (error != 0) { - os_log_with_startup_serial(OS_LOG_DEFAULT, "vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error); + vm_log_error("vm_compressor_init: Unable to get swap volume capacity. error=%d\n", error); } #endif /* DEVELOPMENT || DEBUG */ if (vm_swap_volume_capacity < swap_vol_min_capacity) { @@ -1133,7 +1137,7 @@ vm_compressor_init(void) } compressor_segment_zone = zone_create("compressor_segment", - c_segment_size, ZC_PGZ_USE_GUARDS | ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM); + c_segment_size, ZC_NOENCRYPT | ZC_ZFREE_CLEARMEM); c_segments_busy = FALSE; @@ -1164,7 +1168,7 @@ vm_compressor_init(void) #endif kmem_alloc(kernel_map, (vm_offset_t *)&buf, bufsize, - KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT, + KMA_DATA_SHARED | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR); /* @@ -1260,7 +1264,7 @@ vm_compressor_kdp_init(void) /* Allocate the per-cpu decompression pages. 
*/ err = kmem_alloc(kernel_map, (vm_offset_t *)&buf, bufsize, - KMA_DATA | KMA_NOFAIL | KMA_KOBJECT, + KMA_DATA_SHARED | KMA_NOFAIL | KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR); if (err != KERN_SUCCESS) { @@ -2444,7 +2448,13 @@ c_seg_major_compact( c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot); + /* + * We don't want pages to get stolen by the contiguous memory allocator + * when copying data from one segment to another. + */ + PAGE_REPLACEMENT_DISALLOWED(TRUE); memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], combined_size); + PAGE_REPLACEMENT_DISALLOWED(FALSE); c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_slots++; c_seg_major_compact_stats[c_seg_major_compact_stats_now].moved_bytes += combined_size; @@ -3406,7 +3416,14 @@ vm_compressor_process_special_swapped_in_segments(void) lck_mtx_unlock_always(c_list_lock); } -#define C_SEGMENT_SWAPPEDIN_AGE_LIMIT 10 +#define ENABLE_DYNAMIC_SWAPPED_AGE_LIMIT 1 + +/* minimum time that segments can be in swappedin q as a grace period after they were swapped-in + * before they are added to age-q */ +#define C_SEGMENT_SWAPPEDIN_AGE_LIMIT_LOW 1 /* seconds */ +#define C_SEGMENT_SWAPPEDIN_AGE_LIMIT_NORMAL 10 /* seconds */ +#define C_AGE_Q_COUNT_LOW_THRESHOLD 50 + /* * Processing regular csegs means aging them. */ @@ -3417,12 +3434,32 @@ vm_compressor_process_regular_swapped_in_segments(boolean_t flush_all) clock_sec_t now; clock_nsec_t nsec; + unsigned long limit = C_SEGMENT_SWAPPEDIN_AGE_LIMIT_NORMAL; + +#ifdef ENABLE_DYNAMIC_SWAPPED_AGE_LIMIT + /* In normal operation, segments are kept in the swapped-in-q for a grace period of 10 seconds so that whoever + * needed to decompress something from a segment that was just swapped-in would have a chance to decompress + * more out of it. + * If the system is in high memory pressure state, this may cause the age-q to be completely empty so that + * there are no candidate segments for swap-out. In this state we use a lower limit of 1 second. + * condition 1: the age-q absolute size is too low + * condition 2: there are more segments in swapped-in-q than in age-q + * each of these represent a bad situation which we want to try to alleviate by moving more segments from + * swappped-in-q to age-q so that we have a better selection of who to swap-out + */ + if (c_age_count < C_AGE_Q_COUNT_LOW_THRESHOLD || c_age_count < c_regular_swappedin_count) { + limit = C_SEGMENT_SWAPPEDIN_AGE_LIMIT_LOW; + } +#endif + clock_get_system_nanotime(&now, &nsec); while (!queue_empty(&c_regular_swappedin_list_head)) { c_seg = (c_segment_t)queue_first(&c_regular_swappedin_list_head); - if (flush_all == FALSE && (now - c_seg->c_swappedin_ts) < C_SEGMENT_SWAPPEDIN_AGE_LIMIT) { + if (flush_all == FALSE && (now - c_seg->c_swappedin_ts) < limit) { + /* swappedin q is sorted by the order of time of addition os if we reached a seg that's too + * young, we know that all the rest after it are also too young */ break; } @@ -3839,13 +3876,11 @@ vm_compressor_compact_and_swap(boolean_t flush_all) bytes_freed = 0; yield_after_considered_per_pass = MAX(min_csegs_per_major_compaction, DELAYED_COMPACTIONS_PER_PASS); -#if 0 /** * SW: Need to figure out how to properly rate limit this log because it is currently way too * noisy. 
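/*
 * Illustrative sketch (not part of this patch): the grace-period selection
 * policy described in the comment above, pulled out as a standalone helper.
 * The constant names match this hunk; treating the queue counts as plain
 * unsigned inputs is a simplification for illustration.
 */
#define C_SEGMENT_SWAPPEDIN_AGE_LIMIT_LOW     1   /* seconds */
#define C_SEGMENT_SWAPPEDIN_AGE_LIMIT_NORMAL  10  /* seconds */
#define C_AGE_Q_COUNT_LOW_THRESHOLD           50

static unsigned long
swappedin_age_limit(unsigned int age_q_count, unsigned int swappedin_count)
{
	/* Shrink the 10s grace period to 1s when the age queue is starved:
	 * either it is small in absolute terms, or the swapped-in queue has
	 * grown larger than it.  Both mean we lack swap-out candidates. */
	if (age_q_count < C_AGE_Q_COUNT_LOW_THRESHOLD ||
	    age_q_count < swappedin_count) {
		return C_SEGMENT_SWAPPEDIN_AGE_LIMIT_LOW;
	}
	return C_SEGMENT_SWAPPEDIN_AGE_LIMIT_NORMAL;
}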
rdar://99379414 (Figure out how to rate limit the fragmentation level logging) */ - os_log(OS_LOG_DEFAULT, "memorystatus: before compaction fragmentation level %u\n", vm_compressor_fragmentation_level()); -#endif + vm_log_debug("before compaction fragmentation level %u\n", vm_compressor_fragmentation_level()); while (!queue_empty(&c_age_list_head) && !compaction_swapper_abort && !compressor_store_stop_compaction) { if (hibernate_flushing == TRUE) { @@ -4583,7 +4618,7 @@ c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction, boolean_t age_ c_seg_swapin_requeue(c_seg, FALSE, TRUE, age_on_swapin_q); } else { #if ENCRYPTED_SWAP - vm_swap_decrypt(c_seg); + vm_swap_decrypt(c_seg, true); #endif /* ENCRYPTED_SWAP */ #if CHECKSUM_THE_SWAP @@ -4624,6 +4659,9 @@ c_seg_swapin(c_segment_t c_seg, boolean_t force_minor_compaction, boolean_t age_ } #endif /* CONFIG_FREEZE */ + __assert_only unsigned int prev_swapped_count = os_atomic_sub_orig( + &vm_page_swapped_count, c_seg->c_slots_used, relaxed); + assert3u(prev_swapped_count, >=, c_seg->c_slots_used); os_atomic_add(&compressor_bytes_used, c_seg->c_bytes_used, relaxed); if (force_minor_compaction == TRUE) { @@ -4784,7 +4822,7 @@ retry: /* may need to retry if the currently filling c_seg will not have enough } /* - * returns with c_seg lock held + * c_seg_allocate() returns with c_seg lock held * and PAGE_REPLACEMENT_DISALLOWED(TRUE)... * c_nextslot has been allocated and * c_store.c_buffer populated @@ -4810,52 +4848,63 @@ retry: /* may need to retry if the currently filling c_seg will not have enough cs->c_hash_data = vmc_hash(src, PAGE_SIZE); #endif boolean_t incomp_copy = FALSE; /* codec indicates it already did copy an incompressible page */ - int max_csize_adj = (max_csize - 4); /* how much size we have left in this c_seg to fill. */ + /* The SW codec case needs 4 bytes for its header and these are not accounted for in the bytes_budget argument. + * Also, the the SV-not-in-hash case needs 4 bytes. 
*/ + int max_csize_adj = (max_csize - 4); + if (__improbable(max_csize_adj < 0)) { + max_csize_adj = 0; + } - if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) { + if (max_csize > 0 && max_csize_adj > 0) { + if (vm_compressor_algorithm() != VM_COMPRESSOR_DEFAULT_CODEC) { #if defined(__arm64__) - uint16_t ccodec = CINVALID; - uint32_t inline_popcount; - if (max_csize >= C_SEG_OFFSET_ALIGNMENT_BOUNDARY) { - vm_memtag_disable_checking(); - c_size = metacompressor((const uint8_t *) src, - (uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset], - max_csize_adj, &ccodec, - scratch_buf, &incomp_copy, &inline_popcount); - vm_memtag_enable_checking(); - assert(inline_popcount == C_SLOT_NO_POPCOUNT); + uint16_t ccodec = CINVALID; + uint32_t inline_popcount; + if (max_csize >= C_SEG_OFFSET_ALIGNMENT_BOUNDARY) { + vm_memtag_disable_checking(); + c_size = metacompressor((const uint8_t *) src, + (uint8_t *) &c_seg->c_store.c_buffer[cs->c_offset], + max_csize_adj, &ccodec, + scratch_buf, &incomp_copy, &inline_popcount); + vm_memtag_enable_checking(); + assert(inline_popcount == C_SLOT_NO_POPCOUNT); #if C_SEG_OFFSET_ALIGNMENT_BOUNDARY > 4 - if (c_size > max_csize_adj) { + /* The case of HW codec doesn't detect overflow on its own, instead it spills the the next page + * and we need to detect this happened */ + if (c_size > max_csize_adj) { + c_size = -1; + } +#endif + } else { c_size = -1; } + assert(ccodec == CCWK || ccodec == CCLZ4); + cs->c_codec = ccodec; #endif } else { - c_size = -1; - } - assert(ccodec == CCWK || ccodec == CCLZ4); - cs->c_codec = ccodec; -#endif - } else { #if defined(__arm64__) - vm_memtag_disable_checking(); - cs->c_codec = CCWK; - __unreachable_ok_push - if (PAGE_SIZE == 4096) { - c_size = WKdm_compress_4k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset], - (WK_word *)(uintptr_t)scratch_buf, max_csize_adj); - } else { - c_size = WKdm_compress_16k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset], - (WK_word *)(uintptr_t)scratch_buf, max_csize_adj); - } - __unreachable_ok_pop - vm_memtag_enable_checking(); + vm_memtag_disable_checking(); + cs->c_codec = CCWK; + __unreachable_ok_push + if (PAGE_SIZE == 4096) { + c_size = WKdm_compress_4k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset], + (WK_word *)(uintptr_t)scratch_buf, max_csize_adj); + } else { + c_size = WKdm_compress_16k((WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset], + (WK_word *)(uintptr_t)scratch_buf, max_csize_adj); + } + __unreachable_ok_pop + vm_memtag_enable_checking(); #else - vm_memtag_disable_checking(); - c_size = WKdm_compress_new((const WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset], - (WK_word *)(uintptr_t)scratch_buf, max_csize_adj); - vm_memtag_enable_checking(); + vm_memtag_disable_checking(); + c_size = WKdm_compress_new((const WK_word *)(uintptr_t)src, (WK_word *)(uintptr_t)&c_seg->c_store.c_buffer[cs->c_offset], + (WK_word *)(uintptr_t)scratch_buf, max_csize_adj); + vm_memtag_enable_checking(); #endif + } + } else { /* max_csize == 0 or max_csize_adj == 0 */ + c_size = -1; } /* c_size is the size written by the codec, or 0 if it's uniform 32 bit value or (-1 if there was not enough space * or it was incompressible) */ @@ -4873,7 +4922,7 @@ retry: /* may need to retry if the currently filling c_seg will not have enough * right now this assumes that if the space we had is > PAGE_SIZE, then the codec 
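/*
 * Illustrative sketch (not part of this patch): the budget bookkeeping this
 * hunk adds around the codecs, as two standalone helpers (the helper names
 * are invented).  Four bytes of each slot's budget are reserved for the codec
 * header, the adjusted budget is clamped at zero, and a hardware codec that
 * silently spilled past the budget is reported as a failure (-1), the same
 * way an incompressible page is.
 */
static int
clamp_codec_budget(int max_csize)
{
	int max_csize_adj = max_csize - 4;   /* reserve the codec header bytes */
	return (max_csize_adj < 0) ? 0 : max_csize_adj;
}

static int
check_codec_result(int c_size, int max_csize_adj)
{
	/* The HW codec does not bound its own output; detect the spill here. */
	return (c_size > max_csize_adj) ? -1 : c_size;
}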
failed due to incompressible input */ PAGE_REPLACEMENT_DISALLOWED(FALSE); - goto retry; /* previous c_seg didn't have enought space, we finalized it and can try again with a fresh c_seg */ + goto retry; /* previous c_seg didn't have enough space, we finalized it and can try again with a fresh c_seg */ } c_size = PAGE_SIZE; /* tag:WK-INCOMPRESSIBLE */ @@ -5352,11 +5401,14 @@ bypass_busy_check: #endif /* TRACK_C_SEGMENT_UTILIZATION */ } /* dst */ else { -#if CONFIG_FREEZE /* * We are freeing an uncompressed page from this c_seg and so balance the ledgers. */ if (C_SEG_IS_ONDISK(c_seg)) { + __assert_only unsigned int prev_swapped_count = + os_atomic_dec_orig(&vm_page_swapped_count, relaxed); + assert3u(prev_swapped_count, >, 0); +#if CONFIG_FREEZE /* * The compression sweep feature will push out anonymous pages to disk * without going through the freezer path and so those c_segs, while @@ -5381,8 +5433,8 @@ bypass_busy_check: retval = DECOMPRESS_FAILED_BAD_Q_FREEZE; goto done; /* this is intended to avoid the decrement of c_segment_pages_compressed_incore below */ - } #endif /* CONFIG_FREEZE */ + } } if (flags & C_KEEP) { @@ -5925,8 +5977,6 @@ Relookup_src: lck_mtx_unlock_always(&c_seg_src->c_lock); - PAGE_REPLACEMENT_DISALLOWED(FALSE); - /* find the c_slot */ c_indx = src_slot->s_cindx; @@ -5942,8 +5992,6 @@ Relookup_src: * This segment is full. We need a new one. */ - PAGE_REPLACEMENT_DISALLOWED(TRUE); - lck_mtx_lock_spin_always(&c_seg_src->c_lock); C_SEG_WAKEUP_DONE(c_seg_src); lck_mtx_unlock_always(&c_seg_src->c_lock); @@ -5973,6 +6021,7 @@ Relookup_src: c_dst = C_SEG_SLOT_FROM_INDEX(c_seg_dst, c_seg_dst->c_nextslot); memcpy(&c_seg_dst->c_store.c_buffer[c_seg_dst->c_nextoffset], &c_seg_src->c_store.c_buffer[c_src->c_offset], combined_size); + PAGE_REPLACEMENT_DISALLOWED(FALSE); /* * Is platform alignment actually necessary since wkdm aligns its output? */ @@ -6216,6 +6265,11 @@ vm_compressor_serialize_segment_debug_info(int segno, char *buf, size_t *size, v #else csi->csi_decompressions_since_swapin = 0; #endif /* TRACK_C_SEGMENT_UTILIZATION */ + /* This entire data collection races with the compressor threads which can change any + * of this data members, and specifically can drop the data buffer to swap + * We don't take the segment lock since that would slow the iteration over the segments down + * and hurt the "snapshot-ness" of the data. The race risk is acceptable since this is + * used only for a tester in development. 
*/ for (int si = 0; si < nslots; ++si) { if (offset + sizeof(struct c_slot_info) > insize) { @@ -6235,7 +6289,6 @@ vm_compressor_serialize_segment_debug_info(int segno, char *buf, size_t *size, v #endif /* DEVELOPMENT || DEBUG */ - #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES struct vnode; @@ -6431,7 +6484,7 @@ again: } uncompress_offset = vm_uncompressed_extract_swap_offset(swapinfo); if ((retval = vnode_getwithref(uncompressed_vp)) != 0) { - os_log_error_with_startup_serial(OS_LOG_DEFAULT, "vm_uncompressed_put: vnode_getwithref on swapfile failed with %d\n", retval); + vm_log_error("vm_uncompressed_put: vnode_getwithref on swapfile failed with %d\n", retval); } else { int i = 0; retry: @@ -6488,7 +6541,7 @@ vm_uncompressed_get(ppnum_t pn, int *slot, __unused vm_compressor_options_t flag } if ((retval = vnode_getwithref(uncompressed_vp)) != 0) { - os_log_error_with_startup_serial(OS_LOG_DEFAULT, "vm_uncompressed_put: vnode_getwithref on swapfile failed with %d\n", retval); + vm_log_error("vm_uncompressed_put: vnode_getwithref on swapfile failed with %d\n", retval); } else { int i = 0; retry: diff --git a/osfmk/vm/vm_compressor_backing_store.c b/osfmk/vm/vm_compressor_backing_store.c index 465440bc0..373ddf7d7 100644 --- a/osfmk/vm/vm_compressor_backing_store.c +++ b/osfmk/vm/vm_compressor_backing_store.c @@ -117,8 +117,8 @@ boolean_t swp_trim_supported = FALSE; extern uint64_t dont_trim_until_ts; uint64_t vm_swapfile_last_failed_to_create_ts = 0; uint64_t vm_swapfile_last_successful_create_ts = 0; -int vm_swapfile_can_be_created = FALSE; -boolean_t delayed_trim_handling_in_progress = FALSE; +static bool vm_swapfile_can_be_created = false; +static bool delayed_trim_handling_in_progress = false; boolean_t hibernate_in_progress_with_pinned_swap = FALSE; @@ -364,6 +364,13 @@ vm_swap_encrypt(c_segment_t c_seg) swap_crypt_initialize(); } + /* + * Data stored in the compressor should never need to be faulted in. + * Make sure pages storing data that we're encrypting cannot + * be stolen out from under us in the off chance that the mapping + * gets disconnected while we're actively encrypting. + */ + PAGE_REPLACEMENT_DISALLOWED(TRUE); #if DEVELOPMENT || DEBUG C_SEG_MAKE_WRITEABLE(c_seg); #endif @@ -382,10 +389,11 @@ vm_swap_encrypt(c_segment_t c_seg) #if DEVELOPMENT || DEBUG C_SEG_WRITE_PROTECT(c_seg); #endif + PAGE_REPLACEMENT_DISALLOWED(FALSE); } void -vm_swap_decrypt(c_segment_t c_seg) +vm_swap_decrypt(c_segment_t c_seg, bool disallow_page_replacement) { uint8_t *ptr; uint8_t *iv; @@ -395,6 +403,15 @@ vm_swap_decrypt(c_segment_t c_seg) assert(swap_crypt_initialized); + /* + * See comment in vm_swap_encrypt(). + * The master lock may already be held, though, which is why we don't do + * PAGE_REPLACEMENT_DISALLOWED(TRUE) and do a try_lock instead. 
+ */ + if (disallow_page_replacement) { + PAGE_REPLACEMENT_DISALLOWED(TRUE); + } + #if DEVELOPMENT || DEBUG C_SEG_MAKE_WRITEABLE(c_seg); #endif @@ -413,6 +430,9 @@ vm_swap_decrypt(c_segment_t c_seg) #if DEVELOPMENT || DEBUG C_SEG_WRITE_PROTECT(c_seg); #endif + if (disallow_page_replacement) { + PAGE_REPLACEMENT_DISALLOWED(FALSE); + } } #endif /* ENCRYPTED_SWAP */ @@ -1410,7 +1430,7 @@ vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_retu } #if ENCRYPTED_SWAP else { - vm_swap_decrypt(c_seg); + vm_swap_decrypt(c_seg, false); } #endif /* ENCRYPTED_SWAP */ lck_mtx_lock_spin_always(c_list_lock); @@ -1434,21 +1454,25 @@ vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_retu c_seg->c_store.c_swap_handle = f_offset; counter_add(&vm_statistics_swapouts, size >> PAGE_SHIFT); + __assert_only unsigned int new_swapped_count = os_atomic_add( + &vm_page_swapped_count, c_seg->c_slots_used, relaxed); + /* Detect overflow */ + assert3u(new_swapped_count, >=, c_seg->c_slots_used); c_seg->c_swappedin = false; if (c_seg->c_bytes_used) { - OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used); + os_atomic_sub(&compressor_bytes_used, c_seg->c_bytes_used, relaxed); } #if CONFIG_FREEZE /* * Successful swapout. Decrement the in-core compressed pages count. */ - OSAddAtomic(-(c_seg->c_slots_used), &c_segment_pages_compressed_incore); + os_atomic_sub(&c_segment_pages_compressed_incore, c_seg->c_slots_used, relaxed); assertf(c_segment_pages_compressed_incore >= 0, "-ve incore count %p 0x%x", c_seg, c_segment_pages_compressed_incore); if (c_seg->c_has_donated_pages) { - OSAddAtomic(-(c_seg->c_slots_used), &c_segment_pages_compressed_incore_late_swapout); + os_atomic_sub(&c_segment_pages_compressed_incore_late_swapout, (c_seg->c_slots_used), relaxed); } #endif /* CONFIG_FREEZE */ } else { @@ -1554,7 +1578,7 @@ vm_swap_create_file() } return FALSE; } - vm_swapfile_can_be_created = TRUE; + vm_swapfile_can_be_created = true; size = MAX_SWAP_FILE_SIZE; @@ -1976,7 +2000,7 @@ done: static void vm_swap_wait_on_trim_handling_in_progress() { - while (delayed_trim_handling_in_progress == TRUE) { + while (delayed_trim_handling_in_progress) { assert_wait((event_t) &delayed_trim_handling_in_progress, THREAD_UNINT); lck_mtx_unlock(&vm_swap_data_lock); @@ -2006,7 +2030,7 @@ vm_swap_handle_delayed_trims(boolean_t force_now) lck_mtx_lock(&vm_swap_data_lock); - delayed_trim_handling_in_progress = TRUE; + delayed_trim_handling_in_progress = true; lck_mtx_unlock(&vm_swap_data_lock); @@ -2028,7 +2052,7 @@ vm_swap_handle_delayed_trims(boolean_t force_now) } lck_mtx_lock(&vm_swap_data_lock); - delayed_trim_handling_in_progress = FALSE; + delayed_trim_handling_in_progress = false; thread_wakeup((event_t) &delayed_trim_handling_in_progress); if (VM_SWAP_SHOULD_RECLAIM() && !vm_swapfile_gc_thread_running) { @@ -2122,7 +2146,7 @@ vm_swap_reclaim(void) c_segment_t c_seg = NULL; kmem_alloc(compressor_map, (vm_offset_t *)&addr, c_seg_bufsize, - KMA_NOFAIL | KMA_KOBJECT | KMA_DATA, VM_KERN_MEMORY_COMPRESSOR); + KMA_NOFAIL | KMA_KOBJECT | KMA_DATA_SHARED, VM_KERN_MEMORY_COMPRESSOR); lck_mtx_lock(&vm_swap_data_lock); @@ -2293,7 +2317,7 @@ ReTry_for_cseg: c_seg->c_store.c_buffer = (int32_t *)c_buffer; #if ENCRYPTED_SWAP - vm_swap_decrypt(c_seg); + vm_swap_decrypt(c_seg, true); #endif /* ENCRYPTED_SWAP */ c_seg_swapin_requeue(c_seg, TRUE, TRUE, FALSE); /* @@ -2407,16 +2431,27 @@ vm_swap_get_max_configured_space(void) bool vm_swap_low_on_space(void) { - if (vm_num_swap_files == 0 && 
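/*
 * Illustrative sketch (not part of this patch): the conditional
 * page-replacement bracket that vm_swap_decrypt() gains above.  Callers that
 * may already hold the compressor master lock pass false and own the bracket
 * themselves; everything else gets the full bracket around the crypto pass.
 * The worker function below is a hypothetical placeholder.
 */
static void
crypt_segment_buffer(c_segment_t c_seg, bool disallow_page_replacement)
{
	if (disallow_page_replacement) {
		/* keep the buffer's pages from being stolen while we touch them */
		PAGE_REPLACEMENT_DISALLOWED(TRUE);
	}

	do_hypothetical_crypt_work(c_seg);    /* placeholder for the AES pass */

	if (disallow_page_replacement) {
		PAGE_REPLACEMENT_DISALLOWED(FALSE);
	}
}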
vm_swapfile_can_be_created == FALSE) { + if (vm_num_swap_files == 0 && + (!vm_swapfile_can_be_created || !SWAPPER_NEEDS_TO_UNTHROTTLE())) { + /* We haven't started creating swap files yet */ return false; } - if (((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < ((unsigned int)vm_swapfile_hiwater_segs) / 8)) { - if (vm_num_swap_files == 0 && !SWAPPER_NEEDS_TO_UNTHROTTLE()) { - return false; + if (vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used < + (unsigned int)vm_swapfile_hiwater_segs / 8) { + /* + * We're running low on swapfile segments + */ + if (vm_swapfile_last_failed_to_create_ts >= vm_swapfile_last_successful_create_ts) { + /* + * We've recently failed to create a new swapfile, likely due to disk + * space exhaustion + */ + return true; } - if (vm_swapfile_last_failed_to_create_ts >= vm_swapfile_last_successful_create_ts) { + if (vm_num_swap_files == vm_num_swap_files_config) { + /* We've reached the swapfile limit */ return true; } } @@ -2426,13 +2461,21 @@ vm_swap_low_on_space(void) bool vm_swap_out_of_space(void) { - if ((vm_num_swap_files == vm_num_swap_files_config) && - ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < VM_SWAPOUT_LIMIT_MAX)) { + if (vm_num_swap_files == 0 && + (!vm_swapfile_can_be_created || !SWAPPER_NEEDS_TO_UNTHROTTLE())) { + /* We haven't started creating swap files yet */ + return false; + } + + if (vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used < + VM_SWAPOUT_LIMIT_MAX) { /* - * Last swapfile and we have only space for the - * last few swapouts. + * We have run out of swapfile segments */ - return true; + if (vm_num_swap_files == vm_num_swap_files_config) { + /* And we can't create any more swapfiles */ + return true; + } } return false; diff --git a/osfmk/vm/vm_compressor_internal.h b/osfmk/vm/vm_compressor_internal.h index f35548e11..7ebeb422b 100644 --- a/osfmk/vm/vm_compressor_internal.h +++ b/osfmk/vm/vm_compressor_internal.h @@ -56,7 +56,7 @@ extern lck_rw_t c_master_lock; #define PAGE_REPLACEMENT_ALLOWED(enable) (enable == TRUE ? 
lck_rw_lock_exclusive(&c_master_lock) : lck_rw_done(&c_master_lock)) #if ENCRYPTED_SWAP -extern void vm_swap_decrypt(c_segment_t); +extern void vm_swap_decrypt(c_segment_t, bool); #endif /* ENCRYPTED_SWAP */ extern void vm_swap_free(uint64_t); @@ -104,7 +104,6 @@ extern void c_compressed_record_init(void); extern void c_compressed_record_write(char *, int); #endif - #endif /* XNU_KERNEL_PRIVATE */ __END_DECLS #endif /* _VM_VM_COMPRESSOR_INTERNAL_H_ */ diff --git a/osfmk/vm/vm_compressor_pager.c b/osfmk/vm/vm_compressor_pager.c index 586fc60fa..c3ab6a876 100644 --- a/osfmk/vm/vm_compressor_pager.c +++ b/osfmk/vm/vm_compressor_pager.c @@ -70,6 +70,7 @@ #include #include +#include #include #include #include @@ -702,7 +703,7 @@ vm_compressor_slots_init(void) compressor_slots_zones[idx] = zone_create( compressor_slots_zones_names[idx], compressor_slots_zones_sizes[idx], - ZC_PGZ_USE_GUARDS | ZC_VM); + ZC_VM); } } STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_compressor_slots_init); diff --git a/osfmk/vm/vm_compressor_pager_xnu.h b/osfmk/vm/vm_compressor_pager_xnu.h index 8a04ab31d..d0268b5cd 100644 --- a/osfmk/vm/vm_compressor_pager_xnu.h +++ b/osfmk/vm/vm_compressor_pager_xnu.h @@ -33,6 +33,7 @@ #include #include +#include #include __options_decl(vm_compressor_options_t, uint32_t, { diff --git a/osfmk/vm/vm_compressor_xnu.h b/osfmk/vm/vm_compressor_xnu.h index cb4cd71f2..6ea58e8ef 100644 --- a/osfmk/vm/vm_compressor_xnu.h +++ b/osfmk/vm/vm_compressor_xnu.h @@ -493,7 +493,31 @@ extern uint64_t compressor_perf_test_pages_processed; #endif /* MACH_KERNEL_PRIVATE */ +/* + * @func vm_swap_low_on_space + * + * @brief Return true if the system is running low on swap space + * + * @discussion + * Returns true if the number of free swapfile segments is low and we aren't + * likely to be able to create another swapfile (e.g. because the swapfile + * creation thread has failed to create a new swapfile). + */ extern bool vm_swap_low_on_space(void); + +/* + * @func vm_swap_out_of_space + * + * @brief Return true if the system has totally exhausted it's swap space + * + * @discussion + * Returns true iff all free swapfile segments have been exhausted and we aren't + * able to create another swapfile (because we've reached the configured limit). + * Unlike @c vm_swap_low_on_space(), @c vm_swap_out_of_space() will not return + * true if the swapfile creation thread has failed in the recent past -- even + * if we've run out of swapfile segments. This is because conditions may change + * and allow for future creation of new swapfiles. 
+ */ extern bool vm_swap_out_of_space(void); #define HIBERNATE_FLUSHING_SECS_TO_COMPLETE 120 @@ -520,6 +544,8 @@ bool vm_compressor_is_thrashing(void); bool vm_compressor_swapout_is_ripe(void); uint32_t vm_compressor_pages_compressed(void); void vm_compressor_process_special_swapped_in_segments(void); +uint32_t vm_compressor_get_swapped_segment_count(void); + #if DEVELOPMENT || DEBUG __enum_closed_decl(vm_c_serialize_add_data_t, uint32_t, { diff --git a/osfmk/vm/vm_debug.c b/osfmk/vm/vm_debug.c index 0e0c971f5..43bc63e6a 100644 --- a/osfmk/vm/vm_debug.c +++ b/osfmk/vm/vm_debug.c @@ -95,6 +95,8 @@ #include #include +#include +#include /* * Routine: mach_vm_region_info [kernel call] @@ -118,7 +120,10 @@ vm32_mach_vm_region_info( __DEBUG_ONLY vm_info_object_array_t *objectsp, __DEBUG_ONLY mach_msg_type_number_t *objectsCntp) { + vmlp_api_start(VM32_REGION_INFO); + #if !MACH_VM_DEBUG + vmlp_api_end(VM32_REGION_INFO, KERN_FAILURE); return KERN_FAILURE; #else /* This unwrap is safe as this function is DEBUG only. */ @@ -132,6 +137,7 @@ vm32_mach_vm_region_info( kern_return_t kr; if (map == VM_MAP_NULL) { + vmlp_api_end(VM32_REGION_INFO, KERN_INVALID_TASK); return KERN_INVALID_TASK; } @@ -149,8 +155,7 @@ vm32_mach_vm_region_info( for (cmap = map;; cmap = nmap) { /* cmap is read-locked */ - if (!vm_map_lookup_entry_allow_pgz(cmap, - (vm_map_address_t)address, &entry)) { + if (!vm_map_lookup_entry(cmap, address, &entry)) { entry = entry->vme_next; if (entry == vm_map_to_entry(cmap)) { vm_map_unlock_read(cmap); @@ -158,6 +163,7 @@ vm32_mach_vm_region_info( kmem_free(ipc_kernel_map, addr, size); } + vmlp_api_end(VM32_REGION_INFO, KERN_NO_SPACE); return KERN_NO_SPACE; } } @@ -175,6 +181,7 @@ vm32_mach_vm_region_info( } /* cmap is read-locked; we have a real entry */ + vmlp_range_event_entry(cmap, entry); object = VME_OBJECT(entry); region.vir_start = (natural_t) entry->vme_start; @@ -279,6 +286,7 @@ vm32_mach_vm_region_info( kr = kmem_alloc(ipc_kernel_map, &addr, size, KMA_DATA, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { + vmlp_api_end(VM32_REGION_INFO, KERN_RESOURCE_SHORTAGE); return KERN_RESOURCE_SHORTAGE; } } @@ -316,6 +324,7 @@ vm32_mach_vm_region_info( *regionp = region; *objectsp = (vm_info_object_array_t) copy; *objectsCntp = used; + vmlp_api_end(VM32_REGION_INFO, KERN_SUCCESS); return KERN_SUCCESS; #endif /* MACH_VM_DEBUG */ } @@ -332,7 +341,10 @@ vm32_mach_vm_region_info_64( __DEBUG_ONLY vm_info_object_array_t *objectsp, __DEBUG_ONLY mach_msg_type_number_t *objectsCntp) { + vmlp_api_start(VM32_REGION_INFO_64); + #if !MACH_VM_DEBUG + vmlp_api_end(VM32_REGION_INFO_64, KERN_FAILURE); return KERN_FAILURE; #else /* This unwrap is safe as this function is DEBUG only. 
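/*
 * Illustrative sketch (not part of this patch): how a hypothetical pressure
 * policy might distinguish the two predicates documented above.  "Low" is a
 * soft signal (recent swapfile-creation failure, or nearly out of segments)
 * suitable for throttling; "out of space" is the hard condition (segments
 * exhausted and at the configured swapfile limit).  The enum and function
 * names are invented for illustration.
 */
typedef enum {
	SWAP_POLICY_NORMAL,
	SWAP_POLICY_THROTTLE,
	SWAP_POLICY_EMERGENCY,
} swap_policy_t;

static swap_policy_t
hypothetical_swap_policy(void)
{
	if (vm_swap_out_of_space()) {
		return SWAP_POLICY_EMERGENCY;   /* no more swap can be created */
	}
	if (vm_swap_low_on_space()) {
		return SWAP_POLICY_THROTTLE;    /* may recover if a new swapfile appears */
	}
	return SWAP_POLICY_NORMAL;
}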
*/ @@ -346,6 +358,7 @@ vm32_mach_vm_region_info_64( kern_return_t kr; if (map == VM_MAP_NULL) { + vmlp_api_end(VM32_REGION_INFO_64, KERN_INVALID_TASK); return KERN_INVALID_TASK; } @@ -363,7 +376,7 @@ vm32_mach_vm_region_info_64( for (cmap = map;; cmap = nmap) { /* cmap is read-locked */ - if (!vm_map_lookup_entry_allow_pgz(cmap, address, &entry)) { + if (!vm_map_lookup_entry(cmap, address, &entry)) { entry = entry->vme_next; if (entry == vm_map_to_entry(cmap)) { vm_map_unlock_read(cmap); @@ -371,6 +384,7 @@ vm32_mach_vm_region_info_64( kmem_free(ipc_kernel_map, addr, size); } + vmlp_api_end(VM32_REGION_INFO_64, KERN_NO_SPACE); return KERN_NO_SPACE; } } @@ -388,6 +402,7 @@ vm32_mach_vm_region_info_64( } /* cmap is read-locked; we have a real entry */ + vmlp_range_event_entry(cmap, entry); object = VME_OBJECT(entry); region.vir_start = (natural_t) entry->vme_start; @@ -492,6 +507,7 @@ vm32_mach_vm_region_info_64( kr = kmem_alloc(ipc_kernel_map, &addr, size, KMA_DATA, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { + vmlp_api_end(VM32_REGION_INFO_64, KERN_RESOURCE_SHORTAGE); return KERN_RESOURCE_SHORTAGE; } } @@ -529,6 +545,7 @@ vm32_mach_vm_region_info_64( *regionp = region; *objectsp = (vm_info_object_array_t) copy; *objectsCntp = used; + vmlp_api_end(VM32_REGION_INFO_64, KERN_SUCCESS); return KERN_SUCCESS; #endif /* MACH_VM_DEBUG */ } @@ -682,7 +699,7 @@ host_virtual_physical_table_info( size = vm_map_round_page(actual * sizeof *info, VM_MAP_PAGE_MASK(ipc_kernel_map)); kr = kmem_alloc(ipc_kernel_map, &addr, size, - KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC); + KMA_PAGEABLE | KMA_DATA_SHARED, VM_KERN_MEMORY_IPC); if (kr != KERN_SUCCESS) { return KERN_RESOURCE_SHORTAGE; } diff --git a/osfmk/vm/vm_dyld_pager.c b/osfmk/vm/vm_dyld_pager.c index 856fcbf73..e471ef9ad 100644 --- a/osfmk/vm/vm_dyld_pager.c +++ b/osfmk/vm/vm_dyld_pager.c @@ -45,9 +45,6 @@ #include #include -#include -#include - #include #include #include @@ -67,6 +64,15 @@ #include #endif /* defined(HAS_APPLE_PAC) */ + +/* For speculation macros */ +#if __arm64__ +#include +#endif /* #if __arm64__ */ + +extern int proc_selfpid(void); +extern char *proc_name_address(struct proc *p); + extern int panic_on_dyld_issue; /* @@ -134,6 +140,84 @@ const struct memory_object_pager_ops dyld_pager_ops = { .memory_object_pager_name = "dyld" }; +/* funciton that calculates delta pointer that remains within the same page by using nospec ISA */ +static inline bool +_delta_ptr_within_page_nospec(uint64_t ** __nonnull ptr, uint64_t deltaByteCount, bool *crossing_page, uintptr_t userVA) +{ + uintptr_t old_page = (uintptr_t)*ptr >> PAGE_SHIFT; + uintptr_t new_page = ((uintptr_t)*ptr + deltaByteCount) >> PAGE_SHIFT; + uint64_t nospec_delta = deltaByteCount; + uintptr_t page_offset = (uintptr_t)*ptr & PAGE_MASK; +#if __arm64__ + bool nospec_delta_valid = false; + SPECULATION_GUARD_ZEROING_XXX( + /* out */ nospec_delta, /* out_valid */ nospec_delta_valid, + /* value */ nospec_delta, + /* cmp1 */ old_page, /* cmp2 */ new_page, + /* cc */ "EQ"); +#elif __i386__ || __x86_64__ + if (old_page == new_page) { + nospec_delta = deltaByteCount; + } else { + nospec_delta = 0; + } + // MAYBE: lfence here +#endif /* __arm64__ */ + *ptr = (uint64_t*)((uintptr_t)*ptr + nospec_delta); + *crossing_page = nospec_delta != deltaByteCount; + if (*crossing_page) { + ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_DYLD_PAGER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_DYLD_PAGER_DELTA_TOO_LARGE), (uintptr_t)userVA); + printf("%s(): fixup chain delta 
crossing to the next page [{%p} + {%lld}]\n", __func__, (void*)(userVA + page_offset), deltaByteCount); + if (panic_on_dyld_issue) { + panic("%s(): delta offset > page size %lld", __func__, deltaByteCount); + } + } + + if (nospec_delta != 0) { + return true; + } else { + return false; + } +} + +static inline bool +_delta_ptr_within_page32_nospec(uint32_t ** __nonnull ptr, uint32_t deltaByteCount, bool *crossing_page, uintptr_t userVA) +{ + uintptr_t old_page = (uintptr_t)*ptr >> PAGE_SHIFT; + uintptr_t new_page = ((uintptr_t)*ptr + deltaByteCount) >> PAGE_SHIFT; + uintptr_t page_offset = (uintptr_t)*ptr & PAGE_MASK; + uint64_t nospec_delta = deltaByteCount; +#if __arm64__ + bool nospec_delta_valid = false; + SPECULATION_GUARD_ZEROING_XXX( + /* out */ nospec_delta, /* out_valid */ nospec_delta_valid, + /* value */ nospec_delta, + /* cmp1 */ old_page, /* cmp2 */ new_page, + /* cc */ "EQ"); +#elif __i386__ || __x86_64__ + if (old_page == new_page) { + nospec_delta = deltaByteCount; + } else { + nospec_delta = 0; + } + // MAYBE: lfence here +#endif /* __arm64__ */ + *ptr = (uint32_t*)((uintptr_t)*ptr + nospec_delta); + *crossing_page = nospec_delta != deltaByteCount; + if (*crossing_page) { + ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_DYLD_PAGER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_DYLD_PAGER_DELTA_TOO_LARGE), (uintptr_t)userVA); + printf("%s(): fixup chain delta crossing to the next page [{%p} + {%d}]\n", __func__, (void*)(userVA + page_offset), deltaByteCount); + if (panic_on_dyld_issue) { + panic("%s(): delta offset > page size %d", __func__, deltaByteCount); + } + } + if (nospec_delta != 0) { + return true; + } else { + return false; + } +} + /* * The "dyld_pager" structure. We create one of these for each use of * map_with_linking_np() that dyld uses. @@ -277,7 +361,6 @@ static kern_return_t fixupPage64( uint64_t userVA, vm_offset_t contents, - vm_offset_t end_contents, void *link_info, struct dyld_chained_starts_in_segment *segInfo, uint32_t pageIndex, @@ -286,7 +369,9 @@ fixupPage64( struct mwl_info_hdr *hdr = (struct mwl_info_hdr *)link_info; uint64_t *bindsArray = (uint64_t *)((uintptr_t)hdr + hdr->mwli_binds_offset); uint16_t firstStartOffset = segInfo->page_start[pageIndex]; - + vm_offset_t end_contents = contents + PAGE_SIZE; + // For DYLD_CHAINED_PTR_64 (arm64 and x86_64) and DYLD_CHAINED_PTR_32 (arm64_32) the stride is always 4 + uint64_t step_multiplier = 4; // 4-byte stride /* * Done if no fixups on the page */ @@ -300,6 +385,7 @@ fixupPage64( uint64_t *chain = (uint64_t *)(contents + firstStartOffset); uint64_t targetAdjust = (offsetBased ? 
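/*
 * Illustrative sketch (not part of this patch): the check that the
 * SPECULATION_GUARD-based helpers above enforce architecturally, written as
 * plain C.  A fixup-chain step is followed only when it stays within the same
 * page; a step that would cross a page boundary is rejected instead of
 * followed.  PAGE_SHIFT is defined here only to keep the sketch standalone.
 */
#include <stdbool.h>
#include <stdint.h>

#ifndef PAGE_SHIFT
#define PAGE_SHIFT 14   /* 16KB pages, for illustration */
#endif

static bool
step_stays_in_page(uintptr_t ptr, uint64_t delta_bytes, uintptr_t *out_ptr)
{
	uintptr_t old_page = ptr >> PAGE_SHIFT;
	uintptr_t new_page = (uintptr_t)(ptr + delta_bytes) >> PAGE_SHIFT;

	if (old_page != new_page) {
		return false;               /* corrupt chain: do not follow it */
	}
	*out_ptr = ptr + delta_bytes;       /* safe to advance within the page */
	return true;
}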
hdr->mwli_image_address : hdr->mwli_slide); uint64_t delta = 0; + bool valid_chain = false; do { if ((uintptr_t)chain < contents || (uintptr_t)chain + sizeof(*chain) > end_contents) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_DYLD_PAGER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_DYLD_PAGER_CHAIN_OUT_OF_RANGE), (uintptr_t)userVA); @@ -314,7 +400,9 @@ fixupPage64( } uint64_t value = *chain; bool isBind = (value & 0x8000000000000000ULL); + /* delta that can be used speculatively */ delta = (value >> 51) & 0xFFF; + delta *= step_multiplier; if (isBind) { uint32_t bindOrdinal = value & 0x00FFFFFF; if (bindOrdinal >= hdr->mwli_binds_count) { @@ -335,16 +423,13 @@ fixupPage64( uint64_t high8 = (value >> 36) & 0xFF; *chain = target + targetAdjust + (high8 << 56); } - if (delta * 4 >= PAGE_SIZE) { - ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_DYLD_PAGER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_DYLD_PAGER_DELTA_TOO_LARGE), (uintptr_t)userVA); - printf("%s(): delta offset > page size %lld\n", __func__, delta * 4); - if (panic_on_dyld_issue) { - panic("%s(): delta offset > page size %lld", __func__, delta * 4); - } + /* shifts chain to a delta, chain cannot be used to access outside of page speculatively after this point */ + bool crossing_page = false; + valid_chain = _delta_ptr_within_page_nospec(&chain, delta, &crossing_page, (uintptr_t)userVA); + if (crossing_page) { return KERN_FAILURE; } - chain = (uint64_t *)((uintptr_t)chain + (delta * 4)); // 4-byte stride - } while (delta != 0); + } while (valid_chain); return KERN_SUCCESS; } @@ -353,18 +438,19 @@ fixupPage64( * Apply fixups within a page used by a 32 bit process. */ static kern_return_t -fixupChain32( +fixupPageChain32( uint64_t userVA, uint32_t *chain, vm_offset_t contents, - vm_offset_t end_contents, void *link_info, struct dyld_chained_starts_in_segment *segInfo, uint32_t *bindsArray) { struct mwl_info_hdr *hdr = (struct mwl_info_hdr *)link_info; uint32_t delta = 0; - + bool chain_valid = false; + vm_offset_t end_contents = contents + PAGE_SIZE; + uint32_t step_multiplier = 4; // always 4-bytes stride do { if ((uintptr_t)chain < contents || (uintptr_t)chain + sizeof(*chain) > end_contents) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_DYLD_PAGER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_DYLD_PAGER_CHAIN_OUT_OF_RANGE), (uintptr_t)userVA); @@ -377,7 +463,9 @@ fixupChain32( return KERN_FAILURE; } uint32_t value = *chain; + /* delta that can be used speculatively */ delta = (value >> 26) & 0x1F; + delta *= step_multiplier; if (value & 0x80000000) { // is bind uint32_t bindOrdinal = value & 0x000FFFFF; @@ -404,8 +492,13 @@ fixupChain32( *chain = target + (uint32_t)hdr->mwli_slide; } } - chain += delta; - } while (delta != 0); + bool crossing_page = false; + chain_valid = _delta_ptr_within_page32_nospec(&chain, delta, &crossing_page, (uintptr_t)userVA); + + if (crossing_page) { + return KERN_FAILURE; + } + } while (chain_valid); return KERN_SUCCESS; } @@ -417,7 +510,6 @@ static kern_return_t fixupPage32( uint64_t userVA, vm_offset_t contents, - vm_offset_t end_contents, void *link_info, uint32_t link_info_size, struct dyld_chained_starts_in_segment *segInfo, @@ -426,7 +518,6 @@ fixupPage32( struct mwl_info_hdr *hdr = (struct mwl_info_hdr *)link_info; uint32_t *bindsArray = (uint32_t *)((uintptr_t)hdr + hdr->mwli_binds_offset); uint16_t startOffset = segInfo->page_start[pageIndex]; - /* * done if no fixups */ @@ -453,12 +544,12 @@ fixupPage32( 
chainEnd = (segInfo->page_start[overflowIndex] & DYLD_CHAINED_PTR_START_LAST); startOffset = (segInfo->page_start[overflowIndex] & ~DYLD_CHAINED_PTR_START_LAST); uint32_t *chain = (uint32_t *)(contents + startOffset); - fixupChain32(userVA, chain, contents, end_contents, link_info, segInfo, bindsArray); + fixupPageChain32(userVA, chain, contents, link_info, segInfo, bindsArray); ++overflowIndex; } } else { uint32_t *chain = (uint32_t *)(contents + startOffset); - fixupChain32(userVA, chain, contents, end_contents, link_info, segInfo, bindsArray); + fixupPageChain32(userVA, chain, contents, link_info, segInfo, bindsArray); } return KERN_SUCCESS; } @@ -516,7 +607,6 @@ static kern_return_t fixupPageAuth64( uint64_t userVA, vm_offset_t contents, - vm_offset_t end_contents, dyld_pager_t pager, struct dyld_chained_starts_in_segment *segInfo, uint32_t pageIndex, @@ -526,6 +616,9 @@ fixupPageAuth64( uint32_t link_info_size = pager->dyld_link_info_size; struct mwl_info_hdr *hdr = (struct mwl_info_hdr *)link_info; uint64_t *bindsArray = (uint64_t*)((uintptr_t)link_info + hdr->mwli_binds_offset); + vm_offset_t end_contents = contents + PAGE_SIZE; + bool valid_chain = false; + uint64_t step_multiplier = 8; // always 8-bytes stride for arm64e pages /* * range check against link_info, note +1 to include data we'll dereference @@ -565,7 +658,9 @@ fixupPageAuth64( return KERN_FAILURE; } uint64_t value = *chain; + /* delta that can be used speculatively */ delta = (value >> 51) & 0x7FF; + delta *= step_multiplier; bool isAuth = (value & 0x8000000000000000ULL); bool isBind = (value & 0x4000000000000000ULL); if (isAuth) { @@ -620,8 +715,13 @@ fixupPageAuth64( *chain = target + targetAdjust + high8; } } - chain += delta; - } while (delta != 0); + bool crossing_page = false;; + valid_chain = _delta_ptr_within_page_nospec(&chain, delta, &crossing_page, (uintptr_t)userVA); + + if (crossing_page) { + return KERN_FAILURE; + } + } while (valid_chain); return KERN_SUCCESS; } @@ -632,7 +732,6 @@ static kern_return_t fixupCachePageAuth64( uint64_t userVA, vm_offset_t contents, - vm_offset_t end_contents, dyld_pager_t pager, struct dyld_chained_starts_in_segment *segInfo, uint32_t pageIndex) @@ -640,6 +739,9 @@ fixupCachePageAuth64( void *link_info = pager->dyld_link_info; uint32_t link_info_size = pager->dyld_link_info_size; struct mwl_info_hdr *hdr = (struct mwl_info_hdr *)link_info; + vm_offset_t end_contents = contents + PAGE_SIZE; + bool valid_chain = false; + uint64_t step_multiplier = 8; // always 8-bytes stride for arm64e /* * range check against link_info, note +1 to include data we'll dereference @@ -678,7 +780,9 @@ fixupCachePageAuth64( return KERN_FAILURE; } uint64_t value = *chain; + /* delta that can be used speculatively */ delta = (value >> 52) & 0x7FF; + delta *= step_multiplier; bool isAuth = (value & 0x8000000000000000ULL); if (isAuth) { bool addrDiv = ((value & (1ULL << 50)) != 0); @@ -698,8 +802,12 @@ fixupCachePageAuth64( uint64_t high8 = (value << 22) & 0xFF00000000000000ULL; *chain = target + high8; } - chain += delta; - } while (delta != 0); + bool crossing_page = false; + valid_chain = _delta_ptr_within_page_nospec(&chain, delta, &crossing_page, (uintptr_t)userVA); + if (crossing_page) { + return KERN_FAILURE; + } + } while (valid_chain); return KERN_SUCCESS; } #endif /* defined(HAS_APPLE_PAC) */ @@ -711,7 +819,6 @@ fixupCachePageAuth64( static kern_return_t fixup_page( vm_offset_t contents, - vm_offset_t end_contents, uint64_t userVA, dyld_pager_t pager) { @@ -814,24 +921,24 @@ fixup_page( 
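/*
 * Illustrative sketch (not part of this patch): the worked arithmetic behind
 * the delta fields and stride multipliers used above.  The bit widths and
 * strides are taken from the code in this hunk; the arithmetic bounds the
 * largest encodable step, and the pager additionally rejects any step that
 * crosses a page, treating it as a corrupt fixup chain.
 */
#include <assert.h>
#include <stdint.h>

static void
chained_fixup_delta_bounds(void)
{
	/* DYLD_CHAINED_PTR_64 / _64_OFFSET: 12-bit delta, 4-byte stride */
	uint64_t max_step_ptr64  = 0xFFFull * 4;    /* 16380 bytes */
	/* DYLD_CHAINED_PTR_32: 5-bit delta, 4-byte stride */
	uint64_t max_step_ptr32  = 0x1Full * 4;     /* 124 bytes */
	/* DYLD_CHAINED_PTR_ARM64E*: 11-bit delta, 8-byte stride */
	uint64_t max_step_arm64e = 0x7FFull * 8;    /* 16376 bytes */

	/* None of the encodings can step past a 16KB page from its base. */
	assert(max_step_ptr64  < 16384);
	assert(max_step_ptr32  < 16384);
	assert(max_step_arm64e < 16384);
}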
switch (hdr->mwli_pointer_format) { #if defined(HAS_APPLE_PAC) case DYLD_CHAINED_PTR_ARM64E: - fixupPageAuth64(userVA, contents, end_contents, pager, segInfo, pageIndex, false); + fixupPageAuth64(userVA, contents, pager, segInfo, pageIndex, false); break; case DYLD_CHAINED_PTR_ARM64E_USERLAND: case DYLD_CHAINED_PTR_ARM64E_USERLAND24: - fixupPageAuth64(userVA, contents, end_contents, pager, segInfo, pageIndex, true); + fixupPageAuth64(userVA, contents, pager, segInfo, pageIndex, true); break; case DYLD_CHAINED_PTR_ARM64E_SHARED_CACHE: - fixupCachePageAuth64(userVA, contents, end_contents, pager, segInfo, pageIndex); + fixupCachePageAuth64(userVA, contents, pager, segInfo, pageIndex); break; #endif /* defined(HAS_APPLE_PAC) */ case DYLD_CHAINED_PTR_64: - fixupPage64(userVA, contents, end_contents, link_info, segInfo, pageIndex, false); + fixupPage64(userVA, contents, link_info, segInfo, pageIndex, false); break; case DYLD_CHAINED_PTR_64_OFFSET: - fixupPage64(userVA, contents, end_contents, link_info, segInfo, pageIndex, true); + fixupPage64(userVA, contents, link_info, segInfo, pageIndex, true); break; case DYLD_CHAINED_PTR_32: - fixupPage32(userVA, contents, end_contents, link_info, link_info_size, segInfo, pageIndex); + fixupPage32(userVA, contents, link_info, link_info_size, segInfo, pageIndex); break; default: ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_DYLD_PAGER, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_DYLD_PAGER_BAD_POINTER_FMT), (uintptr_t)userVA); @@ -1075,7 +1182,7 @@ retry_src_fault: panic("%s(): Range not found for offset 0x%llx", __func__, (long long)cur_offset); } retval = KERN_FAILURE; - } else if (fixup_page(dst_vaddr, dst_vaddr + PAGE_SIZE, userVA, pager) != KERN_SUCCESS) { + } else if (fixup_page(dst_vaddr, userVA, pager) != KERN_SUCCESS) { /* KDBG / printf was done under fixup_page() */ retval = KERN_FAILURE; } @@ -1094,7 +1201,7 @@ retry_src_fault: /* * Cleanup the result of vm_fault_page() of the source page. */ - vm_page_wakeup_done(src_top_object, src_page); + vm_page_wakeup_done(src_page_object, src_page); src_page = VM_PAGE_NULL; vm_object_paging_end(src_page_object); vm_object_unlock(src_page_object); @@ -1398,7 +1505,7 @@ dyld_pager_create( * The vm_map call takes both named entry ports and raw memory * objects in the same parameter. We need to make sure that * vm_map does not see this object as a named entry port. So, - * we reserve the first word in the object for a fake ip_kotype + * we reserve the first word in the object for a fake object type * setting - that will tell vm_map to use it as a memory object. 
*/ pager->dyld_header.mo_ikot = IKOT_MEMORY_OBJECT; @@ -1508,22 +1615,107 @@ vm_map_with_linking( memory_object_control_t file_control) { vm_map_t map = task->map; - vm_object_t object = VM_OBJECT_NULL; + vm_object_t file_object = VM_OBJECT_NULL; memory_object_t pager = MEMORY_OBJECT_NULL; uint32_t r; vm_map_address_t map_addr; kern_return_t kr = KERN_SUCCESS; + vm_map_entry_t map_entry; + vm_object_t backing_object = VM_OBJECT_NULL; + vm_object_t shadow_object; + int num_extra_shadows; - object = memory_object_control_to_vm_object(file_control); - if (object == VM_OBJECT_NULL || object->internal) { - printf("%s no object for file_control\n", __func__); - object = VM_OBJECT_NULL; - kr = KERN_INVALID_ADDRESS; + if (region_cnt == 0) { + kr = KERN_INVALID_ARGUMENT; + goto done; + } + file_object = memory_object_control_to_vm_object(file_control); + if (file_object == VM_OBJECT_NULL || file_object->internal) { + printf("%d[%s] %s: invalid object for provided file\n", + proc_selfpid(), proc_name_address(current_proc()), __func__); + file_object = VM_OBJECT_NULL; + kr = KERN_INVALID_ARGUMENT; goto done; } - /* create a pager */ - pager = dyld_pager_setup(task, object, regions, region_cnt, *link_info, link_info_size); + /* + * Check that the mapping is backed by the same file. + */ + map_addr = regions[0].mwlr_address; + vm_map_lock_read(map); + if (!vm_map_lookup_entry(map, + map_addr, + &map_entry) || + map_entry->is_sub_map || + VME_OBJECT(map_entry) == VM_OBJECT_NULL) { + vm_map_unlock_read(map); + kr = KERN_INVALID_ADDRESS; + goto done; + } + /* go down the shadow chain looking for the file object and its copy object */ + num_extra_shadows = 0; + shadow_object = VME_OBJECT(map_entry); + vm_object_lock(shadow_object); + while (shadow_object->shadow != VM_OBJECT_NULL) { + vm_object_t next_object = shadow_object->shadow; + if (shadow_object->shadow == file_object && + shadow_object->vo_shadow_offset == 0) { + /* + * Found our file object as shadow_object's shadow. + * shadow_object should be its copy object (we'll check below + * when we have its lock). + * shadow_object will be the backing object for our dyld pager, + * so let's take a reference to keep it alive until we create + * our dyld pager. + */ + backing_object = shadow_object; + vm_object_reference_locked(backing_object); + } + if (backing_object == VM_OBJECT_NULL) { + num_extra_shadows++; + } + vm_object_lock(next_object); + vm_object_unlock(shadow_object); + shadow_object = next_object; + } + if (shadow_object != file_object) { + /* the shadow chain does not end at the file provided by the caller */ + printf("%d[%s] %s: mapping at 0x%llx is not backed by the expected file", + proc_selfpid(), proc_name_address(current_proc()), __func__, + (uint64_t)map_addr); + // ktriage_record(...); + vm_object_unlock(shadow_object); + shadow_object = VM_OBJECT_NULL; + vm_map_unlock_read(map); + kr = KERN_INVALID_ARGUMENT; + goto done; + } + vm_object_unlock(shadow_object); + shadow_object = VM_OBJECT_NULL; + vm_map_unlock_read(map); + if (backing_object == VM_OBJECT_NULL || + backing_object != file_object->vo_copy) { + printf("%d[%s] %s: mapping at 0x%llx not a proper copy-on-write mapping\n", + proc_selfpid(), proc_name_address(current_proc()), __func__, + (uint64_t)map_addr); + kr = KERN_INVALID_ARGUMENT; + goto done; + } + if (num_extra_shadows) { + /* + * We found some extra shadow objects in the shadow chain for this mapping. 
+ * We're about to replace that mapping with a "dyld" pager backed by the + * latest snapshot (copy) of the provided file, so any pages that had + * previously been copied and modified in these extra shadow objects + * will no longer be visible in this mapping. + */ + printf("%d[%s] %s: (warn) skipped %d shadow object(s) at 0x%llx\n", + proc_selfpid(), proc_name_address(current_proc()), __func__, + num_extra_shadows, (uint64_t)map_addr); + } + + /* create a pager, backed by the latest snapshot (copy object) of the file */ + pager = dyld_pager_setup(task, backing_object, regions, region_cnt, *link_info, link_info_size); if (pager == MEMORY_OBJECT_NULL) { kr = KERN_RESOURCE_SHORTAGE; goto done; @@ -1566,11 +1758,19 @@ vm_map_with_linking( kr = KERN_SUCCESS; done: - + if (backing_object != VM_OBJECT_NULL) { + /* + * Release our extra reference on the backing object. + * The pager (if created) took an extra reference on it. + */ + vm_object_deallocate(backing_object); + backing_object = VM_OBJECT_NULL; + } if (pager != MEMORY_OBJECT_NULL) { /* * Release the pager reference obtained by dyld_pager_setup(). - * The mapping, if it succeeded, is now holding a reference on the memory object. + * The mappings, if succesful, are each holding a reference on the + * pager's VM object, which keeps the pager (aka memory object) alive. */ memory_object_deallocate(pager); pager = MEMORY_OBJECT_NULL; diff --git a/osfmk/vm/vm_dyld_pager_internal.h b/osfmk/vm/vm_dyld_pager_internal.h index 28374a45f..c790ac22a 100644 --- a/osfmk/vm/vm_dyld_pager_internal.h +++ b/osfmk/vm/vm_dyld_pager_internal.h @@ -41,6 +41,7 @@ extern uint32_t dyld_pager_count; extern uint32_t dyld_pager_count_max; +extern const struct memory_object_pager_ops dyld_pager_ops; /* * VM call to implement map_with_linking_np() system call. diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 5faa7e372..ead9cd65c 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -85,9 +85,11 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -398,6 +400,22 @@ vm_fault_cleanup( vm_object_t object, vm_page_t top_page) { + thread_pri_floor_t token = { + .thread = THREAD_NULL + }; + if (top_page != VM_PAGE_NULL && + top_page->vmp_busy) { + /* + * We busied the top page. Apply a priority floor before dropping the + * current object (and therefore the rw-lock boost) to avoid + * inversions due to another thread sleeping on the top-level page. 
+ * + * TODO: Register a page-worker token when busying the top-level page instead + * (rdar://154313767) + */ + token = thread_priority_floor_start(); + } + vm_object_paging_end(object); vm_object_unlock(object); @@ -409,12 +427,16 @@ vm_fault_cleanup( vm_object_paging_end(object); vm_object_unlock(object); } + if (token.thread != THREAD_NULL) { + thread_priority_floor_end(&token); + } } #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) -boolean_t vm_page_deactivate_behind = TRUE; +TUNABLE(bool, vm_page_deactivate_behind, "vm_deactivate_behind", true); +TUNABLE(uint32_t, vm_page_deactivate_behind_min_resident_ratio, "vm_deactivate_behind_min_resident_ratio", 3); /* * default sizes given VM_BEHAVIOR_DEFAULT reference behavior */ @@ -550,12 +572,13 @@ vm_fault_is_sequential( } #if DEVELOPMENT || DEBUG -uint64_t vm_page_deactivate_behind_count = 0; +SCALABLE_COUNTER_DEFINE(vm_page_deactivate_behind_count); #endif /* DEVELOPMENT || DEBUG */ /* - * vm_page_deactivate_behind + * @func vm_fault_deactivate_behind * + * @description * Determine if sequential access is in progress * in accordance with the behavior specified. If * so, compute a potential page to deactivate and @@ -563,30 +586,32 @@ uint64_t vm_page_deactivate_behind_count = 0; * * object must be locked. * - * return TRUE if we actually deactivate a page + * @returns the number of deactivated pages */ static -boolean_t +uint32_t vm_fault_deactivate_behind( vm_object_t object, vm_object_offset_t offset, vm_behavior_t behavior) { - int n; - int pages_in_run = 0; - int max_pages_in_run = 0; - int sequential_run; - int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; + uint32_t pages_in_run = 0; + uint32_t max_pages_in_run = 0; + int32_t sequential_run; + vm_behavior_t sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; vm_object_offset_t run_offset = 0; vm_object_offset_t pg_offset = 0; vm_page_t m; vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER]; - pages_in_run = 0; #if TRACEFAULTPAGE dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */ #endif - if (is_kernel_object(object) || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) { + if (is_kernel_object(object) || + !vm_page_deactivate_behind || + (vm_object_trunc_page(offset) != offset) || + (object->resident_page_count < + vm_page_active_count / vm_page_deactivate_behind_min_resident_ratio)) { /* * Do not deactivate pages from the kernel object: they * are not intended to become pageable. @@ -596,9 +621,19 @@ vm_fault_deactivate_behind( * handle the deactivation on the aligned offset and, thus, * the full PAGE_SIZE page once. This helps us avoid the redundant * deactivates and the extra faults. + * + * Objects need only participate in backwards + * deactivation if they are exceedingly large (i.e. their + * resident pages are liable to comprise a substantially large + * portion of the active queue and push out the rest of the + * system's working set). 
*/ - return FALSE; + return 0; } + + KDBG_FILTERED(VMDBG_CODE(DBG_VM_FAULT_DEACTIVATE_BEHIND) | DBG_FUNC_START, + VM_KERNEL_ADDRHIDE(object), offset, behavior); + if ((sequential_run = object->sequential)) { if (sequential_run < 0) { sequential_behavior = VM_BEHAVIOR_RSEQNTL; @@ -653,7 +688,7 @@ vm_fault_deactivate_behind( } break;} } - for (n = 0; n < max_pages_in_run; n++) { + for (unsigned n = 0; n < max_pages_in_run; n++) { m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && @@ -675,16 +710,17 @@ vm_fault_deactivate_behind( pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); } } + if (pages_in_run) { vm_page_lockspin_queues(); - for (n = 0; n < pages_in_run; n++) { + for (unsigned n = 0; n < pages_in_run; n++) { m = page_run[n]; vm_page_deactivate_internal(m, FALSE); #if DEVELOPMENT || DEBUG - vm_page_deactivate_behind_count++; + counter_inc(&vm_page_deactivate_behind_count); #endif /* DEVELOPMENT || DEBUG */ #if TRACEFAULTPAGE @@ -692,10 +728,12 @@ vm_fault_deactivate_behind( #endif } vm_page_unlock_queues(); - - return TRUE; } - return FALSE; + + KDBG_FILTERED(VMDBG_CODE(DBG_VM_FAULT_DEACTIVATE_BEHIND) | DBG_FUNC_END, + pages_in_run); + + return pages_in_run; } @@ -1091,7 +1129,7 @@ vm_fault_page( int external_state = VM_EXTERNAL_STATE_UNKNOWN; memory_object_t pager; vm_fault_return_t retval; - int grab_options; + vm_grab_options_t grab_options; bool clear_absent_on_error = false; /* @@ -1162,12 +1200,7 @@ vm_fault_page( dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ #endif - grab_options = 0; -#if CONFIG_SECLUDED_MEMORY - if (object->can_grab_secluded) { - grab_options |= VM_PAGE_GRAB_SECLUDED; - } -#endif /* CONFIG_SECLUDED_MEMORY */ + grab_options = vm_page_grab_options_for_object(object); if (!object->alive) { /* @@ -1870,7 +1903,7 @@ vm_fault_page( * so we can release the object lock. */ - if (object->object_is_shared_cache) { + if (object->object_is_shared_cache || pager->mo_pager_ops == &dyld_pager_ops) { token = thread_priority_floor_start(); /* * A non-native shared cache object might @@ -1878,6 +1911,9 @@ vm_fault_page( * fault and so we can't assume that this * check will be valid after we drop the * object lock below. + * + * FIXME: This should utilize @c page_worker_register_worker() + * (rdar://153586539) */ drop_floor = true; } @@ -1963,7 +1999,7 @@ vm_fault_page( #endif vm_object_lock(object); - if (drop_floor && object->object_is_shared_cache) { + if (drop_floor) { thread_priority_floor_end(&token); drop_floor = false; } @@ -2401,7 +2437,7 @@ dont_look_for_page: * * Allocate a page for the copy */ - copy_m = vm_page_alloc(copy_object, copy_offset); + copy_m = vm_page_grab_options(grab_options); if (copy_m == VM_PAGE_NULL) { vm_fault_page_release_page(m, &clear_absent_on_error); @@ -2416,9 +2452,11 @@ dont_look_for_page: return VM_FAULT_MEMORY_SHORTAGE; } + /* * Must copy page into copy-object. 
*/ + vm_page_insert(copy_m, copy_object, copy_offset); vm_page_copy(m, copy_m); /* @@ -3301,7 +3339,7 @@ MACRO_END vm_page_check_pageable_safe(m); vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq); m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q; - m->vmp_local_id = lid; + m->vmp_local_id = (uint16_t)lid; lq->vpl_count++; if (object->internal) { @@ -3461,6 +3499,42 @@ vm_fault_enter_set_mapped( return page_needs_sync; } + +static inline kern_return_t +vm_fault_pmap_validate_page( + pmap_t pmap __unused, + vm_page_t m __unused, + vm_map_offset_t vaddr __unused, + vm_prot_t prot __unused, + vm_object_fault_info_t fault_info __unused, + bool *page_sleep_needed) +{ + assert(page_sleep_needed != NULL); + *page_sleep_needed = false; +#if CONFIG_SPTM + /* + * Reject the executable or debug mapping if the page is already wired for I/O. The SPTM's security + * model doesn't allow us to reliably use executable pages for I/O due to both CS integrity + * protections and the possibility that the pages may be dynamically retyped while wired for I/O. + * This check is required to happen under the VM object lock in order to synchronize with the + * complementary check on the I/O wiring path in vm_page_do_delayed_work(). + */ + if (__improbable((m->vmp_cleaning || m->vmp_iopl_wired) && + pmap_will_retype(pmap, vaddr, VM_PAGE_GET_PHYS_PAGE(m), prot, fault_info->pmap_options | + ((fault_info->fi_xnu_user_debug && !VM_PAGE_OBJECT(m)->code_signed) ? PMAP_OPTIONS_XNU_USER_DEBUG : 0), + PMAP_MAPPING_TYPE_INFER))) { + if (__improbable(m->vmp_iopl_wired)) { + vm_map_guard_exception(vaddr, kGUARD_EXC_SEC_EXEC_ON_IOPL_PAGE); + ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, + KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_EXEC_ON_IOPL_PAGE), (uintptr_t)vaddr); + return KERN_PROTECTION_FAILURE; + } + *page_sleep_needed = m->vmp_cleaning; + } +#endif /* CONFIG_SPTM */ + return KERN_SUCCESS; +} + /* * wrappers for pmap_enter_options() */ @@ -3483,6 +3557,8 @@ pmap_enter_object_options_check( extra_options |= PMAP_OPTIONS_INTERNAL; } pmap_paddr_t physical_address = (pmap_paddr_t)ptoa(pn) + fault_phys_offset; + + return pmap_enter_options_addr(pmap, virtual_address, physical_address, @@ -3513,6 +3589,19 @@ pmap_enter_options_check( if (page->vmp_reusable || obj->all_reusable) { options |= PMAP_OPTIONS_REUSABLE; } + assert(page->vmp_pmapped); + if (fault_type & VM_PROT_WRITE) { + if (pmap == kernel_pmap) { + /* + * The kernel sometimes needs to map a page to provide its + * initial contents but that does not mean that the page is + * actually dirty/modified, so let's not assert that it's been + * "wpmapped". 
+ */ + } else { + assert(page->vmp_wpmapped); + } + } return pmap_enter_object_options_check( pmap, virtual_address, @@ -3804,7 +3893,8 @@ vm_fault_enter_prepare( vm_prot_t fault_type, vm_object_fault_info_t fault_info, int *type_of_fault, - bool *page_needs_data_sync) + bool *page_needs_data_sync, + bool *page_needs_sleep) { kern_return_t kr; bool is_tainted = false; @@ -3894,6 +3984,10 @@ vm_fault_enter_prepare( } } + kr = vm_fault_pmap_validate_page(pmap, m, vaddr, *prot, fault_info, page_needs_sleep); + if (__improbable((kr != KERN_SUCCESS) || *page_needs_sleep)) { + return kr; + } kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr, *prot, caller_prot, fault_page_size, fault_phys_offset, fault_info, &is_tainted); @@ -3982,7 +4076,8 @@ vm_fault_enter( vm_object_fault_info_t fault_info, boolean_t *need_retry, int *type_of_fault, - uint8_t *object_lock_type) + uint8_t *object_lock_type, + bool *page_needs_sleep) { kern_return_t kr; vm_object_t object; @@ -3999,12 +4094,12 @@ vm_fault_enter( assertf(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL, "m=%p", m); kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot, fault_page_size, fault_phys_offset, fault_type, - fault_info, type_of_fault, &page_needs_data_sync); + fault_info, type_of_fault, &page_needs_data_sync, page_needs_sleep); object = VM_PAGE_OBJECT(m); vm_fault_enqueue_page(object, m, wired, fault_info->fi_change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr); - if (kr == KERN_SUCCESS) { + if (__probable((kr == KERN_SUCCESS) && !(*page_needs_sleep))) { if (page_needs_data_sync) { pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m)); } @@ -4124,6 +4219,8 @@ current_proc_is_privileged(void) } uint64_t vm_copied_on_read = 0; +uint64_t vm_copied_on_read_kernel_map = 0; +uint64_t vm_copied_on_read_platform_map = 0; /* * Cleanup after a vm_fault_enter. @@ -4327,7 +4424,7 @@ vm_fault_internal( vm_object_offset_t written_on_offset = 0; int throttle_delay; int compressed_count_delta; - uint8_t grab_options; + vm_grab_options_t grab_options; bool need_copy; bool need_copy_on_read; vm_map_offset_t trace_vaddr; @@ -4350,25 +4447,24 @@ vm_fault_internal( */ bool object_is_contended = false; + vmlp_api_start(VM_FAULT_INTERNAL); + real_vaddr = vaddr; trace_real_vaddr = vaddr; /* - * Some (kernel) submaps are marked with "should never fault". - * - * We do this for two reasons: - * - PGZ which is inside the zone map range can't go down the normal - * lookup path (vm_map_lookup_entry() would panic). - * - * - we want for guard pages to not have to use fictitious pages at all - * to prevent from ZFOD pages to be made. + * Some (kernel) submaps are marked with "should never fault", so that + * guard pages in such submaps do not need to use fictitious + * placeholders at all, while not causing ZFOD pages to be made + * (which is the default behavior otherwise). * * We also want capture the fault address easily so that the zone * allocator might present an enhanced panic log. 
*/ - if (map->never_faults || (pgz_owned(vaddr) && map->pmap == kernel_pmap)) { + if (map->never_faults) { assert(map->pmap == kernel_pmap); + vmlp_api_end(VM_FAULT_INTERNAL, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -4410,6 +4506,7 @@ vm_fault_internal( KERN_FAILURE); ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NONZERO_PREEMPTION_LEVEL), 0 /* arg */); + vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE); return KERN_FAILURE; } @@ -4426,10 +4523,12 @@ vm_fault_internal( KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULTS_DISABLED), 0 /* arg */); + vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE); return KERN_FAILURE; } bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME); + bool page_sleep_needed = false; uint64_t fstart = 0; if (rtfault) { @@ -4521,12 +4620,12 @@ RetryFault: goto done; } - pmap = real_map->pmap; fault_info->io_sync = FALSE; fault_info->mark_zf_absent = FALSE; fault_info->batch_pmap_op = FALSE; + if (resilient_media_retry) { /* * We're retrying this fault after having detected a media @@ -4683,12 +4782,7 @@ RetryFault: cur_object = object; cur_offset = offset; - grab_options = 0; -#if CONFIG_SECLUDED_MEMORY - if (object->can_grab_secluded) { - grab_options |= VM_PAGE_GRAB_SECLUDED; - } -#endif /* CONFIG_SECLUDED_MEMORY */ + grab_options = vm_page_grab_options_for_object(object); while (TRUE) { if (!cur_object->pager_created && @@ -4710,7 +4804,22 @@ RetryFault: if (m != VM_PAGE_NULL) { m_object = cur_object; - if (m->vmp_busy) { + if (__improbable(page_sleep_needed)) { + /* + * If a prior iteration of the loop requested vm_page_sleep(), re-validate the page + * to see if it's still needed. + */ + kr = vm_fault_pmap_validate_page(pmap, m, vaddr, prot, fault_info, &page_sleep_needed); + if (__improbable(kr != KERN_SUCCESS)) { + vm_map_unlock_read(map); + if (real_map != map) { + vm_map_unlock(real_map); + } + goto done; + } + } + if (m->vmp_busy || page_sleep_needed) { + page_sleep_needed = false; wait_result_t result; /* @@ -4935,9 +5044,11 @@ upgrade_lock_and_retry: cur_object != object && !cur_object->internal && !cur_object->pager_trusted && - vm_protect_privileged_from_untrusted && !cur_object->code_signed && - current_proc_is_privileged()) { + vm_protect_privileged_from_untrusted && + (current_proc_is_privileged() || + vm_kernel_map_is_kernel(map) || + vm_map_is_platform_binary(map))) { /* * We're faulting on a page in "object" and * went down the shadow chain to "cur_object" @@ -4967,6 +5078,16 @@ upgrade_lock_and_retry: */ // printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset); vm_copied_on_read++; + if (!current_proc_is_privileged()) { + /* not a privileged proc but still copy-on-read... */ + if (vm_kernel_map_is_kernel(map)) { + /* ... because target map is a kernel map */ + vm_copied_on_read_kernel_map++; + } else { + /* ... 
because target map is "platform" */ + vm_copied_on_read_platform_map++; + } + } need_copy = TRUE; vm_object_unlock(object); @@ -5092,7 +5213,7 @@ FastPmapEnter: vm_page_lock_queues(); if (!m->vmp_realtime) { m->vmp_realtime = true; - vm_page_realtime_count++; + VM_COUNTER_INC(&vm_page_realtime_count); } vm_page_unlock_queues(); } @@ -5111,7 +5232,8 @@ FastPmapEnter: fault_info, need_retry_ptr, &type_of_fault, - &object_lock_type); + &object_lock_type, + &page_sleep_needed); } else { kr = vm_fault_enter(m, pmap, @@ -5125,7 +5247,8 @@ FastPmapEnter: fault_info, need_retry_ptr, &type_of_fault, - &object_lock_type); + &object_lock_type, + &page_sleep_needed); } vm_fault_complete( @@ -5168,6 +5291,9 @@ FastPmapEnter: need_retry = FALSE; goto RetryFault; } + if (page_sleep_needed) { + goto RetryFault; + } goto done; } /* @@ -5226,8 +5352,6 @@ FastPmapEnter: * we don't drop either object lock until * the page has been copied and inserted */ - - cur_m = m; m = vm_page_grab_options(grab_options); m_object = NULL; @@ -5484,7 +5608,8 @@ FastPmapEnter: cur_object); if (kr != KERN_SUCCESS) { - vm_page_release(m, FALSE); + vm_page_release(m, + VMP_RELEASE_NONE); m = VM_PAGE_NULL; } /* @@ -5682,7 +5807,8 @@ FastPmapEnter: break; } #endif /* MACH_ASSERT */ - m = vm_page_alloc(object, vm_object_trunc_page(offset)); + + m = vm_page_grab_options(grab_options); m_object = NULL; if (m == VM_PAGE_NULL) { @@ -5693,6 +5819,7 @@ FastPmapEnter: break; } m_object = object; + vm_page_insert(m, m_object, vm_object_trunc_page(offset)); if ((prot & VM_PROT_WRITE) && !(fault_type & VM_PROT_WRITE) && @@ -5799,7 +5926,10 @@ FastPmapEnter: enter_fault_type, fault_info, &type_of_fault, - &page_needs_data_sync); + &page_needs_data_sync, + &page_sleep_needed); + + assert(!page_sleep_needed); if (kr != KERN_SUCCESS) { goto zero_fill_cleanup; } @@ -5915,7 +6045,7 @@ zero_fill_cleanup: vm_page_lock_queues(); if (!m->vmp_realtime) { m->vmp_realtime = true; - vm_page_realtime_count++; + VM_COUNTER_INC(&vm_page_realtime_count); } vm_page_unlock_queues(); } @@ -6471,7 +6601,8 @@ handle_copy_delay: fault_info, NULL, &type_of_fault, - &object_lock_type); + &object_lock_type, + &page_sleep_needed); } else { kr = vm_fault_enter(m, pmap, @@ -6485,7 +6616,8 @@ handle_copy_delay: fault_info, NULL, &type_of_fault, - &object_lock_type); + &object_lock_type, + &page_sleep_needed); } assert(VM_PAGE_OBJECT(m) == m_object); @@ -6505,7 +6637,7 @@ handle_copy_delay: DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag); } - if (kr != KERN_SUCCESS) { + if ((kr != KERN_SUCCESS) || page_sleep_needed) { /* abort this page fault */ vm_map_unlock_read(map); if (real_map != map) { @@ -6514,7 +6646,11 @@ handle_copy_delay: vm_page_wakeup_done(m_object, m); vm_fault_cleanup(m_object, top_page); vm_object_deallocate(object); - goto done; + if (kr != KERN_SUCCESS) { + goto done; + } else { + goto RetryFault; + } } if (physpage_p != NULL) { /* for vm_map_wire_and_extract() */ @@ -6658,7 +6794,7 @@ cleanup: vm_page_lock_queues(); if (!m->vmp_realtime) { m->vmp_realtime = true; - vm_page_realtime_count++; + VM_COUNTER_INC(&vm_page_realtime_count); } vm_page_unlock_queues(); } @@ -6746,6 +6882,7 @@ done: DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr); } + vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE); return kr; } @@ -7157,6 +7294,18 @@ vm_fault_wire_fast( * */ + if (entry->needs_copy) { + 
panic("attempting to wire needs_copy memory"); + } + + /* + * Since we don't have the machinary to resolve CoW obligations on the fast + * path, if we might have to push pages to a copy, just give up. + */ + if (object->vo_copy != VM_OBJECT_NULL) { + GIVE_UP; + } + /* * Look for page in top-level object. If it's not there or * there's something going on, give up. @@ -7191,14 +7340,6 @@ vm_fault_wire_fast( m->vmp_busy = TRUE; assert(!m->vmp_absent); - /* - * Give up if the page is being written and there's a copy object - */ - if ((object->vo_copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { - RELEASE_PAGE(m); - GIVE_UP; - } - fault_info.user_tag = VME_ALIAS(entry); fault_info.pmap_options = 0; if (entry->iokit_acct || @@ -7226,6 +7367,7 @@ vm_fault_wire_fast( */ type_of_fault = DBG_CACHE_HIT_FAULT; assert3p(VM_PAGE_OBJECT(m), ==, object); + bool page_sleep_needed = false; kr = vm_fault_enter(m, pmap, pmap_addr, @@ -7238,8 +7380,9 @@ vm_fault_wire_fast( &fault_info, NULL, &type_of_fault, - &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/ - if (kr != KERN_SUCCESS) { + &object_lock_type, /* Exclusive lock mode. Will remain unchanged.*/ + &page_sleep_needed); + if ((kr != KERN_SUCCESS) || page_sleep_needed) { RELEASE_PAGE(m); GIVE_UP; } @@ -7374,6 +7517,9 @@ vm_fault_copy( struct vm_object_fault_info fault_info_src = {}; struct vm_object_fault_info fault_info_dst = {}; + vmlp_api_start(VM_FAULT_COPY); + vmlp_range_event(dst_map, dst_offset, *copy_size); + /* * In order not to confuse the clustered pageins, align * the different offsets on a page boundary. @@ -7382,6 +7528,7 @@ vm_fault_copy( #define RETURN(x) \ MACRO_BEGIN \ *copy_size -= amount_left; \ + vmlp_api_end(VM_FAULT_COPY, x); \ MACRO_RETURN(x); \ MACRO_END @@ -7452,8 +7599,10 @@ RetryDestinationFault:; OS_FALLTHROUGH; case VM_FAULT_MEMORY_ERROR: if (error) { + vmlp_api_end(VM_FAULT_COPY, error); return error; } else { + vmlp_api_end(VM_FAULT_COPY, KERN_MEMORY_ERROR); return KERN_MEMORY_ERROR; } default: @@ -7549,8 +7698,10 @@ RetrySourceFault:; case VM_FAULT_MEMORY_ERROR: vm_fault_copy_dst_cleanup(dst_page); if (error) { + vmlp_api_end(VM_FAULT_COPY, error); return error; } else { + vmlp_api_end(VM_FAULT_COPY, KERN_MEMORY_ERROR); return KERN_MEMORY_ERROR; } default: @@ -7791,6 +7942,8 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, bool multi_cpu) ppnum_t decomp_ppnum; addr64_t decomp_paddr; + vmlp_api_start(KDP_LIGHTWEIGHT_FAULT); + if (multi_cpu) { compressor_flags |= C_KDP_MULTICPU; } @@ -7811,23 +7964,30 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, bool multi_cpu) assert((cur_target_addr & effective_page_mask) == 0); if ((cur_target_addr & effective_page_mask) != 0) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } + vmlp_range_event_entry(map, entry); + if (entry->is_sub_map) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } @@ -7835,11 +7995,13 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, bool multi_cpu) while (TRUE) { if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } if (object->pager_created && (object->paging_in_progress || 
object->activity_in_progress)) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } @@ -7847,30 +8009,36 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, bool multi_cpu) if (m != VM_PAGE_NULL) { if (!object_supports_coredump(object)) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_cleaning || m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } assert(!vm_page_is_private(m)); if (vm_page_is_private(m)) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } assert(!vm_page_is_fictitious(m)); if (vm_page_is_fictitious(m)) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, 0); return ptoa(VM_PAGE_GET_PHYS_PAGE(m)); } @@ -7893,14 +8061,17 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, bool multi_cpu) decomp_ppnum, &my_fault_type, compressor_flags, &compressed_count_delta); if (kr == KERN_SUCCESS) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, 0); return decomp_paddr; } else { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } } } if (object->shadow == VM_OBJECT_NULL) { + vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1); return 0; } @@ -8546,6 +8717,8 @@ vmtc_revalidate_lookup( vm_prot_t prot; vm_object_t shadow; + vmlp_api_start(VMTC_REVALIDATE_LOOKUP); + /* * Find the object/offset for the given location/map. * Note this returns with the object locked. @@ -8622,6 +8795,7 @@ done: if (kr != KERN_SUCCESS && object != NULL) { vm_object_unlock(object); } + vmlp_api_end(VMTC_REVALIDATE_LOOKUP, kr); return kr; } diff --git a/osfmk/vm/vm_fault_internal.h b/osfmk/vm/vm_fault_internal.h index 3ebc77e85..fa70c3176 100644 --- a/osfmk/vm/vm_fault_internal.h +++ b/osfmk/vm/vm_fault_internal.h @@ -105,7 +105,8 @@ extern kern_return_t vm_fault_enter( vm_object_fault_info_t fault_info, boolean_t *need_retry, int *type_of_fault, - uint8_t *object_lock_type); + uint8_t *object_lock_type, + bool *page_sleep_needed); extern kern_return_t vm_pre_fault_with_info( vm_map_t map, diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c index 486896f17..58e89d181 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -124,7 +124,6 @@ vm_mem_bootstrap(void) vm_retire_boot_pages(); - vm_mem_bootstrap_log("vm_map_init"); vm_map_init(); diff --git a/osfmk/vm/vm_iokit.h b/osfmk/vm/vm_iokit.h index 8a27e03ff..2d4e0fa90 100644 --- a/osfmk/vm/vm_iokit.h +++ b/osfmk/vm/vm_iokit.h @@ -47,6 +47,10 @@ extern kern_return_t memory_object_iopl_request( vm_tag_t tag); extern uint32_t vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen); +#if DEBUG || DEVELOPMENT +extern uint64_t vm_task_evict_shared_cache(task_t task); +extern uint64_t vm_task_pageins(task_t task); +#endif /* DEBUG || DEVELOPMENT */ extern void iopl_valid_data( @@ -132,8 +136,9 @@ extern kern_return_t memory_entry_purgeable_control_internal( extern kern_return_t mach_memory_entry_get_page_counts( ipc_port_t entry_port, - unsigned int *resident_page_count, - unsigned int *dirty_page_count); + uint64_t *resident_page_count, + uint64_t *dirty_page_count, + uint64_t *swapped_page_count); extern kern_return_t mach_memory_entry_phys_page_offset( ipc_port_t entry_port, @@ -160,6 +165,12 @@ extern kern_return_t vm_map_enter_mem_object_prefault( 
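/*
 * Illustrative sketch, not part of the diff: the bracketing discipline the
 * vmlp_api_start()/vmlp_api_end() instrumentation above follows -- every start
 * must be matched by exactly one end on every exit path, which is why
 * vm_fault_copy() folds the end call into its RETURN() macro and
 * kdp_lightweight_fault() adds an end before each early "return 0".  The
 * tracing helpers and event name below are stand-ins, not the kernel's.
 */
#include <stdio.h>

static void sketch_api_start(const char *api)            { printf("start %s\n", api); }
static void sketch_api_end(const char *api, long result) { printf("end   %s -> %ld\n", api, result); }

/* Fold the end call into the return macro so early exits cannot miss it. */
#define SKETCH_RETURN(api, x)          \
    do {                               \
        long __ret = (x);              \
        sketch_api_end((api), __ret);  \
        return __ret;                  \
    } while (0)

static long
sketch_traced_lookup(long addr)
{
    sketch_api_start("sketch_traced_lookup");
    if (addr < 0) {
        SKETCH_RETURN("sketch_traced_lookup", -1);  /* early exit, bracket still closed */
    }
    /* ... main body ... */
    SKETCH_RETURN("sketch_traced_lookup", 0);
}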
upl_page_list_ptr_t page_list, unsigned int page_list_count); +extern void vm_report_disallowed_sharing_data_buffers(void); + +extern bool vm_map_should_allow_entering_alias( + vm_map_t originating_map, + vm_map_t destination_map, + ipc_port_t ne_port); __END_DECLS diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index 083642986..6289f175d 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -117,8 +117,8 @@ static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges", btlog_t kmem_outlier_log; #endif /* DEBUG || DEVELOPMENT */ -__startup_data static vm_map_size_t iokit_range_size; __startup_data static vm_map_size_t data_range_size; +__startup_data static vm_map_size_t shared_data_range_size; __startup_data static vm_map_size_t ptr_range_size; __startup_data static vm_map_size_t sprayqtn_range_size; @@ -210,8 +210,11 @@ __kmem_object(kmem_flags_t flags) static inline pmap_mapping_type_t __kmem_mapping_type(kmem_flags_t flags) { - if (flags & (KMEM_DATA | KMEM_COMPRESSOR | KMEM_DATA_SHARED)) { + if (flags & (KMEM_COMPRESSOR | KMEM_DATA_SHARED)) { return PMAP_MAPPING_TYPE_DEFAULT; + } else if (flags & KMEM_DATA) { + return kalloc_is_restricted_data_mode_enforced() ? + PMAP_MAPPING_TYPE_RESTRICTED : PMAP_MAPPING_TYPE_DEFAULT; } else { return PMAP_MAPPING_TYPE_RESTRICTED; } @@ -289,13 +292,6 @@ __header_always_inline bool mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr) { mach_vm_offset_t rmin, rmax; - -#if CONFIG_KERNEL_TAGGING - if (VM_KERNEL_ADDRESS(addr)) { - addr = vm_memtag_canonicalize_kernel(addr); - } -#endif /* CONFIG_KERNEL_TAGGING */ - /* * The `&` is not a typo: we really expect the check to pass, * so encourage the compiler to eagerly load and test without branches @@ -312,14 +308,8 @@ mach_vm_range_contains( mach_vm_offset_t size) { mach_vm_offset_t rmin, rmax; - -#if CONFIG_KERNEL_TAGGING - if (VM_KERNEL_ADDRESS(addr)) { - addr = vm_memtag_canonicalize_kernel(addr); - } -#endif /* CONFIG_KERNEL_TAGGING */ - mach_vm_offset_t end; + if (__improbable(os_add_overflow(addr, size, &end))) { return false; } @@ -365,10 +355,6 @@ mach_vm_range_intersects( { struct mach_vm_range r2; -#if CONFIG_KERNEL_TAGGING - addr = VM_KERNEL_STRIP_UPTR(addr); -#endif /* CONFIG_KERNEL_TAGGING */ - r2.min_address = addr; if (os_add_overflow(addr, size, &r2.max_address)) { __mach_vm_range_overflow(addr, size); @@ -383,7 +369,7 @@ kmem_range_id_contains( vm_map_offset_t addr, vm_map_size_t size) { - return mach_vm_range_contains(&kmem_ranges[range_id], addr, size); + return mach_vm_range_contains(&kmem_ranges[range_id], vm_memtag_canonicalize_kernel(addr), size); } __abortlike @@ -623,6 +609,8 @@ kmem_size_guard( vm_map_entry_t entry; vm_size_t size; + vmlp_api_start(KMEM_SIZE_GUARD); + vm_map_lock_read(map); #if KASAN_CLASSIC @@ -634,6 +622,8 @@ kmem_size_guard( __kmem_entry_not_found_panic(map, addr); } + vmlp_range_event_entry(map, entry); + if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) { __kmem_entry_validate_panic(map, entry, addr, 0, flags, guard); } @@ -642,6 +632,7 @@ kmem_size_guard( vm_map_unlock_read(map); + vmlp_api_end(KMEM_SIZE_GUARD, 0); return size; } @@ -660,7 +651,7 @@ kmem_hash_backtrace( return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0])); } -static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK, +static_assert(KMEM_RANGE_ID_DATA_SHARED - 1 <= KMEM_RANGE_MASK, "Insufficient bits to represent ptr ranges"); kmem_range_id_t @@ -730,7 +721,16 @@ kmem_apply_security_policy( #endif if (kma_flags & (KMA_DATA | 
KMA_DATA_SHARED)) { - range_id = KMEM_RANGE_ID_DATA; + /* + * Choose the specific which data range. + */ + if (kma_flags & KMA_DATA) { + range_id = KMEM_RANGE_ID_DATA; + } else { + range_id = kmem_needs_data_share_range() ? + KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA; + } + /* * As an optimization in KMA_DATA to avoid fragmentation, * allocate static carveouts at the end of the DATA range. @@ -778,8 +778,23 @@ kmem_alloc_guard_internal( bool skip_guards; kmem_return_t kmr = { }; + vmlp_api_start(KMEM_ALLOC_GUARD_INTERNAL); + assert(kernel_map && map->pmap == kernel_pmap); + /* DATA and DATA_SHARED are mutually exclusive */ + assert((flags & (KMA_DATA | KMA_DATA_SHARED)) != (KMA_DATA | KMA_DATA_SHARED)); + +#if defined(__arm64__) + /* + * Pageable allocations should be marked as shared. + * + * Only assert this on arm64 architectures, since we do not + * adopt the shared heap on older ones. + */ + assert((flags & (KMA_PAGEABLE | KMA_DATA)) != (KMA_PAGEABLE | KMA_DATA)); +#endif /* defined(__arm64__) */ + #if DEBUG || DEVELOPMENT VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0); @@ -808,23 +823,6 @@ kmem_alloc_guard_internal( goto out_error; } -#if 136275805 - /* - * XXX: Redundantly check the mapping size here so that failure stack traces - * are more useful. This has no functional value but is helpful because - * telemetry traps can currently only capture the last five calls and - * so we want to trap as shallow as possible in a select few cases - * where we anticipate issues. - * - * When telemetry collection is complete, this will be removed. - */ - if (__improbable(!vm_map_is_map_size_valid( - kernel_map, size, flags & KMA_NOSOFTLIMIT))) { - kmr.kmr_return = KERN_RESOURCE_SHORTAGE; - goto out_error; - } -#endif /* 136275805 */ - /* * Guard pages: * @@ -913,7 +911,7 @@ kmem_alloc_guard_internal( object = compressor_object; vm_object_reference(object); } else { - object = vm_object_allocate(map_size); + object = vm_object_allocate(map_size, map->serial_id); vm_object_lock(object); vm_object_set_size(object, map_size, size); /* stabilize the object to prevent shadowing */ @@ -938,6 +936,8 @@ kmem_alloc_guard_internal( goto out_error; } + vmlp_range_event_entry(map, entry); + map_addr = entry->vme_start; VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context); VME_ALIAS_SET(entry, guard.kmg_tag); @@ -1028,6 +1028,7 @@ kmem_alloc_guard_internal( #endif /* KASAN_TBI */ } #endif /* CONFIG_KERNEL_TAGGING */ + vmlp_api_end(KMEM_ALLOC_GUARD_INTERNAL, kmr.kmr_return); return kmr; out_error: @@ -1051,10 +1052,11 @@ out_error: 0, 0, 0, 0); #endif /* DEBUG || DEVELOPMENT */ + vmlp_api_end(KMEM_ALLOC_GUARD_INTERNAL, kmr.kmr_return); return kmr; } -kmem_return_t +__mockable kmem_return_t kmem_alloc_guard( vm_map_t map, vm_size_t size, @@ -1103,7 +1105,7 @@ kmem_suballoc( vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag); if (parent == kernel_map) { - assert(vmk_flags.vmf_overwrite || (flags & KMS_DATA)); + assert(vmk_flags.vmf_overwrite || (flags & (KMS_DATA | KMS_DATA_SHARED))); } if (vmk_flags.vmf_fixed) { @@ -1124,7 +1126,7 @@ kmem_suballoc( vm_map_reference(map); vmk_flags.vmkf_submap = true; - if ((flags & KMS_DATA) == 0) { + if ((flags & (KMS_DATA | KMS_DATA_SHARED)) == 0) { /* FIXME: IOKit submaps get fragmented and can't be atomic */ vmk_flags.vmkf_submap_atomic = true; } @@ -1135,8 +1137,13 @@ kmem_suballoc( if (flags & KMS_PERMANENT) { vmk_flags.vmf_permanent = true; } - if (flags & KMS_DATA) { - vmk_flags.vmkf_range_id = 
KMEM_RANGE_ID_DATA; + if (flags & (KMS_DATA | KMS_DATA_SHARED)) { + if (flags & KMS_DATA) { + vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA; + } else { + vmk_flags.vmkf_range_id = kmem_needs_data_share_range() ? + KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA; + } } if (flags & KMS_NOSOFTLIMIT) { vmk_flags.vmkf_no_soft_limit = true; @@ -1245,7 +1252,7 @@ kmem_alloc_pageable_external( vm_size_t size) { if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) { - return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt()); + return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA_SHARED, vm_tag_bt()); } /* Maintain ABI compatibility: invalid sizes used to be allowed */ return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT; @@ -1831,6 +1838,8 @@ kmem_realloc_guard( .vmkf_last_free = (bool)(flags & KMR_LAST_FREE), }; + vmlp_api_start(KMEM_REALLOC_GUARD); + assert(KMEM_REALLOC_FLAGS_VALID(flags)); if (!guard.kmg_atomic) { @@ -1846,12 +1855,15 @@ kmem_realloc_guard( } if (req_oldaddr == 0ul) { - return kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard); + kmem_return_t ret = kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard); + vmlp_api_end(KMEM_REALLOC_GUARD, ret.kmr_return); + return ret; } if (req_newsize == 0ul) { kmem_free_guard(map, req_oldaddr, req_oldsize, (kmf_flags_t)flags, guard); + vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return); return kmr; } @@ -1890,6 +1902,7 @@ kmem_realloc_guard( */ if (oldsize == newsize) { kmr.kmr_address = req_oldaddr; + vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return); return kmr; } #endif /* !KASAN */ @@ -1927,6 +1940,7 @@ kmem_realloc_guard( DBG_VM_KERN_REQUEST, DBG_FUNC_END, 0, 0, 0, 0); #endif /* DEBUG || DEVELOPMENT */ + vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return); return kmr; } @@ -1946,6 +1960,9 @@ again: if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) { __kmem_entry_not_found_panic(map, req_oldaddr); } + + vmlp_range_event_entry(map, oldentry); + if ((flags & KMR_KOBJECT) && oldentry->in_transition) { oldentry->needs_wakeup = true; vm_map_entry_wait(map, THREAD_UNINT); @@ -1989,6 +2006,7 @@ again: kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize); } #endif /* KASAN_TBI */ + vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return); return kmr; } #endif /* KASAN */ @@ -1996,8 +2014,10 @@ again: guard.kmg_tag = VME_ALIAS(oldentry); if (newsize < oldsize) { - return kmem_realloc_shrink_guard(map, req_oldaddr, - req_oldsize, req_newsize, flags, guard, oldentry); + kmem_return_t ret = kmem_realloc_shrink_guard(map, req_oldaddr, + req_oldsize, req_newsize, flags, guard, oldentry); + vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return); + return ret; } @@ -2019,6 +2039,8 @@ again: newaddr = newentry->vme_start; newoffs = oldsize; + vmlp_range_event_entry(map, newentry); + VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context); VME_ALIAS_SET(newentry, guard.kmg_tag); if (flags & KMR_KOBJECT) { @@ -2301,6 +2323,7 @@ again: } #endif /* CONFIG_KERNEL_TAGGING */ + vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return); return kmr; } @@ -2394,7 +2417,7 @@ __kmem_free_invalid_object_size_panic( #endif /* KASAN */ -vm_size_t +__mockable vm_size_t kmem_free_guard( vm_map_t map, vm_offset_t req_addr, @@ -2410,6 +2433,8 @@ kmem_free_guard( vm_map_entry_t entry; #endif /* KASAN */ + vmlp_api_start(KMEM_FREE_GUARD); + assert(map->pmap == kernel_pmap); #if KASAN_CLASSIC @@ -2476,6 +2501,8 @@ kmem_free_guard( * there is no extra step that is required for memory tagging to "clear" * it -- the page will be 
later laundered when reused. */ + vmlp_range_event(map, addr, size); + vmlp_api_end(KMEM_FREE_GUARD, 0); return vm_map_remove_and_unlock(map, addr, addr + size, vmr_flags, guard).kmr_size - delta; } @@ -2541,7 +2568,9 @@ kmem_free_external( #define KMEM_LAST_IDX (kmem_log2down(KMEM_MAX_SIZE)) #define KMEM_NUM_SIZECLASS (KMEM_LAST_IDX - KMEM_START_IDX + 1) #define KMEM_FRONTS (KMEM_RANGE_ID_NUM_PTR * 2) +#define KMEM_NUM_SLOTS 8 #define KMEM_NUM_GUARDS 2 +#define KMEM_NUM_QUARANTINE 2 struct kmem_page_meta { union { @@ -2607,6 +2636,19 @@ static SECURITY_READ_ONLY_LATE(vm_map_t) kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1]; static vm_map_size_t kmem_meta_size; +static uint32_t +kmem_guard_count(struct kmem_sizeclass *kmem) +{ + return kmem->ks_num_elem * KMEM_NUM_GUARDS / KMEM_NUM_SLOTS; +} + +static uint32_t +kmem_guard_and_quarantine_count(struct kmem_sizeclass *kmem) +{ + return kmem->ks_num_elem * (KMEM_NUM_GUARDS + KMEM_NUM_QUARANTINE) / + KMEM_NUM_SLOTS; +} + static uint32_t kmem_get_front( kmem_range_id_t range_id, @@ -2796,8 +2838,8 @@ kmem_sizeclass_init(void) kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST; ks->ks_size = kmem_get_size_from_idx(i); - ks->ks_num_chunk = roundup(8 * ks->ks_size, KMEM_CHUNK_SIZE_MIN) / - KMEM_CHUNK_SIZE_MIN; + ks->ks_num_chunk = roundup(KMEM_NUM_SLOTS * ks->ks_size, + KMEM_CHUNK_SIZE_MIN) / KMEM_CHUNK_SIZE_MIN; ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size; assert(ks->ks_num_elem <= (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8)); @@ -2947,8 +2989,12 @@ kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to) { vm_offset_t page_addr = trunc_page(from); + vmlp_api_start(KMEM_POPULATE_META_LOCKED); + vm_map_unlock(kernel_map); + vmlp_range_event(kernel_map, from, to - from); + for (; page_addr < to; page_addr += PAGE_SIZE) { for (;;) { kern_return_t ret = KERN_SUCCESS; @@ -2978,6 +3024,7 @@ kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to) } vm_map_lock(kernel_map); + vmlp_api_end(KMEM_POPULATE_META_LOCKED, 0); } __abortlike @@ -3440,7 +3487,7 @@ again: * Requeue to full if necessary */ assert(meta->km_page_marker == KMEM_META_PRIMARY); - if (__builtin_popcount(meta->km_bitmap) == KMEM_NUM_GUARDS) { + if (__builtin_popcount(meta->km_bitmap) == kmem_guard_count(sizeclass)) { kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]); } } else if ((meta = kmem_get_free_chunk_from_list(sizeclass, size_idx, @@ -3450,7 +3497,7 @@ again: * Queue to partial */ assert(meta->km_page_marker == KMEM_META_PRIMARY); - assert(__builtin_popcount(meta->km_bitmap) > KMEM_NUM_GUARDS); + assert(__builtin_popcount(meta->km_bitmap) > kmem_guard_count(sizeclass)); LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link); } else { meta = kmem_get_new_chunk(range_id, from_right, size_idx); @@ -3568,7 +3615,7 @@ kmem_free_slot( bool from_right = kmem_meta_is_from_right(range_id, meta); kmem_free_chunk(range_id, meta, from_right); - } else if (num_elem == KMEM_NUM_GUARDS + 1) { + } else if (num_elem == kmem_guard_and_quarantine_count(sizeclass)) { /* * If we freed to full chunk move it to partial */ @@ -3970,8 +4017,6 @@ kmem_add_extra_claims(void) ptr_range_size = round_page(ptr_range_size); sprayqtn_range_size = round_page(sprayqtn_range_size); - iokit_range_size = 0; - /* Less any necessary allocation padding... 
*/ ptr_range_size = kmem_allocation_to_claim_size(ptr_range_size); sprayqtn_range_size = kmem_allocation_to_claim_size(sprayqtn_range_size); @@ -4004,23 +4049,33 @@ kmem_add_extra_claims(void) data_range_size = largest_free_size - sprayqtn_allocation_size - ptr_total_allocation_size; -#if defined(ARM_LARGE_MEMORY) /* - * Reserve space for our dedicated IOKit carveout. - * Currently, we carve off a quarter of the data region. + * If we need the data shared range, divide the size + * for the data ranges between BUFFERS and SHARED. + * + * If not, all data allocations go into KMEM_RANGE_ID_DATA. */ - iokit_range_size = round_page(data_range_size / 4); - data_range_size -= kmem_claim_to_allocation_size( - iokit_range_size, /* known_last */ false); -#endif /* defined(ARM_LARGE_MEMORY) */ + if (kmem_needs_data_share_range()) { + /* + * Round down the size, because our kmem ranges logic round + * these sizes to page size, and we need to make sure we never + * exceed the remaining allocable space we divided. + */ + shared_data_range_size = data_range_size = + trunc_page(data_range_size / 2); + } else { + shared_data_range_size = 0; + } /* Less any necessary allocation padding... */ data_range_size = kmem_allocation_to_claim_size(data_range_size); + shared_data_range_size = shared_data_range_size ? + kmem_allocation_to_claim_size(shared_data_range_size) : 0; /* Check: our allocations should all still fit in the free space */ assert(sprayqtn_allocation_size + ptr_total_allocation_size + - kmem_claim_to_allocation_size(iokit_range_size, /* known_last */ false) + - kmem_claim_to_allocation_size(data_range_size, /* known_last */ false) <= + kmem_claim_to_allocation_size(data_range_size, /* known_last */ false) + + kmem_claim_to_allocation_size(shared_data_range_size, /* known_last */ false) <= largest_free_size); struct kmem_range_startup_spec kmem_spec_sprayqtn = { @@ -4031,28 +4086,23 @@ kmem_add_extra_claims(void) }; kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn; - /* - * If !defined(ARM_LARGE_MEMORY), KMEM_RANGE_ID_IOKIT is coalesced into the data range. - * This is to minimize wasted translation tables in constrained environments. - * The coalescing happens during kmem_scramble_ranges. - */ -#if defined(ARM_LARGE_MEMORY) - struct kmem_range_startup_spec kmem_spec_iokit = { - .kc_name = "kmem_iokit_range", - .kc_range = &kmem_ranges[KMEM_RANGE_ID_IOKIT], - .kc_size = iokit_range_size, - .kc_flags = KC_NO_ENTRY, - }; - kmem_claims[kmem_claim_count++] = kmem_spec_iokit; -#endif /* defined(ARM_LARGE_MEMORY) */ - - struct kmem_range_startup_spec kmem_spec_data = { - .kc_name = "kmem_data_range", + struct kmem_range_startup_spec kmem_spec_data_buffers = { + .kc_name = "kmem_data_buffers_range", .kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA], .kc_size = data_range_size, .kc_flags = KC_NO_ENTRY, }; - kmem_claims[kmem_claim_count++] = kmem_spec_data; + kmem_claims[kmem_claim_count++] = kmem_spec_data_buffers; + + if (kmem_needs_data_share_range()) { + struct kmem_range_startup_spec kmem_spec_data_shared = { + .kc_name = "kmem_data_shared_range", + .kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA_SHARED], + .kc_size = shared_data_range_size, + .kc_flags = KC_NO_ENTRY, + }; + kmem_claims[kmem_claim_count++] = kmem_spec_data_shared; + } } __startup_func @@ -4193,14 +4243,6 @@ kmem_scramble_ranges(void) vm_map_unlock(kernel_map); } - /* - * If we're not on a large memory system KMEM_RANGE_ID_IOKIT acts as a synonym for KMEM_RANGE_ID_DATA. - * On large memory systems KMEM_RANGE_ID_IOKIT is a dedicated carveout. 
- */ -#if !defined(ARM_LARGE_MEMORY) - kmem_ranges[KMEM_RANGE_ID_IOKIT] = kmem_ranges[KMEM_RANGE_ID_DATA]; -#endif /* !defined(ARM_LARGE_MEMORY) */ - /* * Now that we are done assigning all the ranges, reset * kmem_ranges[KMEM_RANGE_ID_NONE] @@ -4245,18 +4287,20 @@ kmem_range_init(void) kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address = kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address; - range_adjustment = iokit_range_size >> 3; - kmem_large_ranges[KMEM_RANGE_ID_IOKIT].min_address = - kmem_ranges[KMEM_RANGE_ID_IOKIT].min_address + range_adjustment; - kmem_large_ranges[KMEM_RANGE_ID_IOKIT].max_address = - kmem_ranges[KMEM_RANGE_ID_IOKIT].max_address; - range_adjustment = data_range_size >> 3; kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address = kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment; kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address = kmem_ranges[KMEM_RANGE_ID_DATA].max_address; + if (kmem_needs_data_share_range()) { + range_adjustment = shared_data_range_size >> 3; + kmem_large_ranges[KMEM_RANGE_ID_DATA_SHARED].min_address = + kmem_ranges[KMEM_RANGE_ID_DATA_SHARED].min_address + range_adjustment; + kmem_large_ranges[KMEM_RANGE_ID_DATA_SHARED].max_address = + kmem_ranges[KMEM_RANGE_ID_DATA_SHARED].max_address; + } + pmap_init(); kmem_metadata_init(); kmem_sizeclass_init(); @@ -4272,7 +4316,9 @@ kmem_range_init(void) } #endif } +#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* kernel map is not maintained in unit-test */ STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init); +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ #if DEBUG || DEVELOPMENT __startup_func @@ -4290,6 +4336,7 @@ STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init); kmem_gobj_stats kmem_get_gobj_stats(void) { + vmlp_api_start(KMEM_GET_GOBJ_STATS); kmem_gobj_stats stats = {}; vm_map_lock(kernel_map); @@ -4325,6 +4372,9 @@ kmem_get_gobj_stats(void) if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) { entry = entry->vme_next; } + + vmlp_range_event_entry(kernel_map, entry); + while (entry != vm_map_to_entry(kernel_map) && entry->vme_start < range.max_address) { used += (entry->vme_end - entry->vme_start); @@ -4340,6 +4390,7 @@ kmem_get_gobj_stats(void) } vm_map_unlock(kernel_map); + vmlp_api_end(KMEM_GET_GOBJ_STATS, 0); return stats; } @@ -4440,11 +4491,6 @@ kmem_init( #pragma mark map copyio -static inline void -current_thread_set_sec_override(bool val) -{ -#pragma unused(val) -} /* * Note: semantic types aren't used as `copyio` already validates. 
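/*
 * Illustrative sketch, not part of the diff: the arithmetic used above when a
 * shared data range is configured -- the remaining VA is split in half, page
 * truncated so the two claims cannot exceed the space being divided, and each
 * range's "large allocation" sub-range starts 1/8th of the way in, matching
 * the `size >> 3` adjustment in kmem_range_init().  The page size and the
 * sketch_* names are assumptions of this sketch.
 */
#include <stdint.h>

#define SKETCH_PAGE_SIZE      16384ULL
#define sketch_trunc_page(x)  ((x) & ~(SKETCH_PAGE_SIZE - 1ULL))

struct sketch_range { uint64_t min_address, max_address; };

static void
sketch_split_data_claims(uint64_t avail, int need_shared,
    uint64_t *data_size, uint64_t *shared_size)
{
    if (need_shared) {
        *data_size = *shared_size = sketch_trunc_page(avail / 2);
    } else {
        *data_size   = avail;
        *shared_size = 0;
    }
}

static struct sketch_range
sketch_large_subrange(struct sketch_range full, uint64_t claimed_size)
{
    struct sketch_range large = {
        .min_address = full.min_address + (claimed_size >> 3),
        .max_address = full.max_address,
    };
    return large;
}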
@@ -4469,12 +4515,10 @@ copyinmap( } } else { vm_map_reference(map); - current_thread_set_sec_override(true); - switch_ctx = vm_map_switch_to(map); + switch_ctx = vm_map_switch_with_sec_override(map, TRUE); if (copyin(fromaddr, todata, length) != 0) { kr = KERN_INVALID_ADDRESS; } - current_thread_set_sec_override(false); vm_map_switch_back(switch_ctx); vm_map_deallocate(map); } @@ -4505,8 +4549,7 @@ copyoutmap( } } else { vm_map_reference(map); - current_thread_set_sec_override(true); - switch_ctx = vm_map_switch_to(map); + switch_ctx = vm_map_switch_with_sec_override(map, TRUE); if (copyout(fromdata, toaddr, length) != 0) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, @@ -4515,7 +4558,6 @@ copyoutmap( KERN_INVALID_ADDRESS /* arg */); kr = KERN_INVALID_ADDRESS; } - current_thread_set_sec_override(false); vm_map_switch_back(switch_ctx); vm_map_deallocate(map); } @@ -4540,12 +4582,10 @@ copyoutmap_atomic32( } } else { vm_map_reference(map); - current_thread_set_sec_override(true); - switch_ctx = vm_map_switch_to(map); + switch_ctx = vm_map_switch_with_sec_override(map, TRUE); if (copyout_atomic32(value, toaddr) != 0) { kr = KERN_INVALID_ADDRESS; } - current_thread_set_sec_override(false); vm_map_switch_back(switch_ctx); vm_map_deallocate(map); } @@ -4570,12 +4610,10 @@ copyoutmap_atomic64( } } else { vm_map_reference(map); - current_thread_set_sec_override(true); - switch_ctx = vm_map_switch_to(map); + switch_ctx = vm_map_switch_with_sec_override(map, TRUE); if (copyout_atomic64(value, toaddr) != 0) { kr = KERN_INVALID_ADDRESS; } - current_thread_set_sec_override(false); vm_map_switch_back(switch_ctx); vm_map_deallocate(map); } @@ -4609,7 +4647,7 @@ vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt) return VM_KERNEL_UNSLIDE(addr); } - addr = VM_KERNEL_STRIP_UPTR(addr); + addr = VM_KERNEL_STRIP_PTR(addr); vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)]; SHA256_CTX sha_ctx; @@ -4643,10 +4681,12 @@ vm_kernel_addrperm_external( vm_offset_t addr, vm_offset_t *perm_addr) { + addr = VM_KERNEL_STRIP_UPTR(addr); + if (VM_KERNEL_IS_SLID(addr)) { *perm_addr = VM_KERNEL_UNSLIDE(addr); } else if (VM_KERNEL_ADDRESS(addr)) { - *perm_addr = addr + vm_kernel_addrperm_ext; + *perm_addr = ML_ADDRPERM(addr, vm_kernel_addrperm_ext); } else { *perm_addr = addr; } @@ -4755,7 +4795,10 @@ kmem_test_verify_type_policy(vm_offset_t addr, kmem_flags_t flags) pmap_mapping_type_t expected_type = PMAP_MAPPING_TYPE_RESTRICTED; /* Explicitly state the expected policy */ - if (flags & (KMEM_DATA | KMEM_COMPRESSOR | KMEM_DATA_SHARED)) { + if (flags & (KMEM_COMPRESSOR | KMEM_DATA_SHARED)) { + expected_type = PMAP_MAPPING_TYPE_DEFAULT; + } else if ((flags & KMEM_DATA) && + !kalloc_is_restricted_data_mode_enforced()) { expected_type = PMAP_MAPPING_TYPE_DEFAULT; } @@ -4839,7 +4882,7 @@ static void kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind) { kmem_guard_t guard = { - .kmg_atomic = !(kind & KMR_DATA), + .kmg_atomic = !(kind & (KMR_DATA | KMR_DATA_SHARED)), .kmg_tag = VM_KERN_MEMORY_DIAG, .kmg_context = 0xefface, }; @@ -4948,7 +4991,7 @@ kmem_basic_test(__unused int64_t in, int64_t *out) map = kmem_suballoc(kernel_map, &addr, 64U << 20, VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE, - KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap; + KMS_NOFAIL | KMS_DATA_SHARED, VM_KERN_MEMORY_DIAG).kmr_submap; printf("%s: kmem_alloc ...\n", __func__); kmem_alloc_basic_test(map); @@ -4992,8 +5035,14 @@ kmem_basic_test(__unused int64_t in, int64_t *out) 
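/*
 * Illustrative sketch, not part of the diff: the guard pattern the copyinmap/
 * copyoutmap changes above move to -- instead of toggling a thread security
 * override around a separate map switch, the override is folded into the
 * switch context, so a single switch-back restores both.  The types and
 * helpers below are stand-ins for vm_map_switch_with_sec_override() and
 * vm_map_switch_back(), whose exact internals are not shown in the diff.
 */
#include <stdbool.h>

struct sketch_thread_state {
    const void *active_map;
    bool        sec_override;
};

struct sketch_switch_ctx {
    const void *prev_map;
    bool        prev_override;
};

static struct sketch_switch_ctx
sketch_switch_with_override(struct sketch_thread_state *ts, const void *map, bool override)
{
    struct sketch_switch_ctx ctx = {
        .prev_map      = ts->active_map,
        .prev_override = ts->sec_override,
    };
    ts->active_map   = map;
    ts->sec_override = override;
    return ctx;
}

static void
sketch_switch_back(struct sketch_thread_state *ts, struct sketch_switch_ctx ctx)
{
    /* one call restores both the map and the override, so they cannot get out of sync */
    ts->active_map   = ctx.prev_map;
    ts->sec_override = ctx.prev_override;
}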
kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD); printf("%s: PASS\n", __func__); - printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__); - kmem_realloc_basic_test(map, KMR_DATA); + /* + * Using KMR_DATA without KMR_FREEOLD violates the + * single-mappability of RESTRICTED pages. + */ + + /* test KMR_SHARED_DATA for the new shared kheap */ + printf("%s: kmem_realloc (KMR_DATA_SHARED | KMR_FREEOLD) ...\n", __func__); + kmem_realloc_basic_test(map, KMR_DATA_SHARED | KMR_FREEOLD); printf("%s: PASS\n", __func__); /* test KMR_SHARED_DATA for the new shared kheap */ diff --git a/osfmk/vm/vm_kern_xnu.h b/osfmk/vm/vm_kern_xnu.h index 3a7410671..f34188ee2 100644 --- a/osfmk/vm/vm_kern_xnu.h +++ b/osfmk/vm/vm_kern_xnu.h @@ -33,7 +33,7 @@ #include __BEGIN_DECLS -#pragma GCC visibility push(hidden) +__exported_push_hidden #ifdef XNU_KERNEL_PRIVATE @@ -337,7 +337,7 @@ __options_decl(kmem_claims_flags_t, uint32_t, { * Security config that creates the additional splits in non data part of * kernel_map */ -#if KASAN || (__arm64__ && !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR)) +#if KASAN || (__arm64__ && !defined(KERNEL_INTEGRITY_KTRR) && !defined(KERNEL_INTEGRITY_CTRR) && !defined(KERNEL_INTEGRITY_PV_CTRR)) # define ZSECURITY_CONFIG_KERNEL_PTR_SPLIT OFF #else # define ZSECURITY_CONFIG_KERNEL_PTR_SPLIT ON @@ -626,6 +626,7 @@ __options_decl(kms_flags_t, uint32_t, { /* How to look for addresses */ KMS_LAST_FREE = KMEM_LAST_FREE, KMS_DATA = KMEM_DATA, + KMS_DATA_SHARED = KMEM_DATA_SHARED, /* Entry properties */ KMS_PERMANENT = KMEM_PERMANENT, @@ -694,7 +695,10 @@ __options_decl(kmr_flags_t, uint32_t, { #define KMEM_REALLOC_FLAGS_VALID(flags) \ (((flags) & (KMR_KOBJECT | KMEM_GUARD_LAST | KMEM_KASAN_GUARD | KMR_DATA)) == KMR_DATA \ || ((flags) & (KMR_KOBJECT | KMEM_GUARD_LAST | KMEM_KASAN_GUARD | KMR_DATA_SHARED)) == KMR_DATA_SHARED \ - || ((flags) & KMR_FREEOLD)) + || ((flags) & KMR_FREEOLD) \ + && (((flags) & (KMR_DATA | KMR_DATA_SHARED)) != (KMR_DATA | KMR_DATA_SHARED)) \ + && (((flags) & (KMA_PAGEABLE | KMA_DATA)) != (KMA_PAGEABLE | KMA_DATA))) + /*! * @function kmem_realloc_guard() @@ -1755,6 +1759,14 @@ extern kern_return_t vm_kern_allocation_info( */ extern void vm_init_before_launchd(void); +#if DEVELOPMENT || DEBUG + +extern kern_return_t vm_tag_reset_peak(vm_tag_t tag); + +extern void vm_tag_reset_all_peaks(void); + +#endif /* DEVELOPMENT || DEBUG */ + #if VM_TAG_SIZECLASSES /*! @@ -1774,7 +1786,7 @@ extern kern_return_t device_pager_populate_object( memory_object_t device, memory_object_offset_t offset, ppnum_t page_num, vm_size_t size); #endif /* XNU_KERNEL_PRIVATE */ -#pragma GCC visibility pop +__exported_pop __END_DECLS #endif /* _VM_VM_KERN_XNU_H_ */ diff --git a/osfmk/vm/vm_lock_perf.h b/osfmk/vm/vm_lock_perf.h new file mode 100644 index 000000000..1a17211e0 --- /dev/null +++ b/osfmk/vm/vm_lock_perf.h @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include + +/* This should only be enabled at desk */ +#define ENABLE_VM_LOCK_PERF 0 + +/* + * The VM Lock Perf (VMLP) module uses ktrace to gather insights into the + * performance profile of the VM subsystem, particularly as it pertains to + * locking behavior. + * We use the ktrace events, further subdividing the code field as below. + * The "type" field indicates which type of VMLP event is being reported. + * Currently supported types are API, Lock, and Range (see below). + * The subcode is type-dependent. + * DBG_MACH VMLP type subcode function + * ╭──────┬───────┬────┬────────┬─╮ + * │ 8 │ 8 │ 5 | 9 │2│ + * ╰──────┴───────┴────┴────────┴─╯ + */ + +#pragma mark VM Lock Performance Event IDs + +typedef enum __enum_closed { + VM_LOCK_PERF_API_EVENT = 1, /* Operations on map lock */ + VM_LOCK_PERF_LOCK_EVENT, /* Function start/end */ + VM_LOCK_PERF_RANGE_EVENT, /* Reporting a range */ +} vmlp_event_type_t; + +#define VMLP_CODE_TYPE_OFFSET (9) +#define VMLP_CODE_TYPE_MASK (0x1f) +#define VMLP_CODE_SUBCODE_OFFSET (0) +#define VMLP_CODE_SUBCODE_MASK (0x1ff) +#define VMLP_CODE(type, subcode) ((((type) & VMLP_CODE_TYPE_MASK) << VMLP_CODE_TYPE_OFFSET) | (((subcode) & VMLP_CODE_SUBCODE_MASK) << VMLP_CODE_SUBCODE_OFFSET)) +#define VMLP_EVENTID(type, subcode, function) (MACHDBG_CODE(DBG_MACH_VM_LOCK_PERF, VMLP_CODE((type), (subcode))) | (function)) + +#pragma mark Subcodes for API events + +#define VMLPAN(name) VMLP_EVENT_API_ ## name /* VM Perf API Name */ + +typedef enum __enum_closed { + VMLPAN(FILL_PROCREGIONINFO) = 1, + VMLPAN(FILL_PROCREGIONINFO_ONLYMAPPEDVNODES), + VMLPAN(FIND_MAPPING_TO_SLIDE), + VMLPAN(GET_VMMAP_ENTRIES), + VMLPAN(GET_VMSUBMAP_ENTRIES), + VMLPAN(KDP_LIGHTWEIGHT_FAULT), + VMLPAN(KMEM_ALLOC_GUARD_INTERNAL), + VMLPAN(KMEM_FREE_GUARD), + VMLPAN(KMEM_GET_GOBJ_STATS), + VMLPAN(KMEM_POPULATE_META_LOCKED), + VMLPAN(KMEM_REALLOC_GUARD), + VMLPAN(KMEM_SIZE_GUARD), + VMLPAN(MACH_MAKE_MEMORY_ENTRY_SHARE), + VMLPAN(MACH_VM_RANGE_CREATE_V1), + VMLPAN(MOVE_PAGES_TO_QUEUE), + VMLPAN(TASK_FIND_REGION_DETAILS), + VMLPAN(TASK_INFO), + VMLPAN(VM32_REGION_INFO), + VMLPAN(VM32_REGION_INFO_64), + VMLPAN(VM32__MAP_EXEC_LOCKDOWN), + VMLPAN(VMTC_REVALIDATE_LOOKUP), + VMLPAN(VM_FAULT_COPY), + VMLPAN(VM_FAULT_INTERNAL), + VMLPAN(VM_KERN_ALLOCATION_INFO), + VMLPAN(VM_MAP_APPLE_PROTECTED), + VMLPAN(VM_MAP_BEHAVIOR_SET), + VMLPAN(VM_MAP_CAN_REUSE), + VMLPAN(VM_MAP_CHECK_PROTECTION), + VMLPAN(VM_MAP_COPYIN_INTERNAL), + VMLPAN(VM_MAP_COPYOUT_INTERNAL), + VMLPAN(VM_MAP_COPY_OVERWRITE), + 
VMLPAN(VM_MAP_COPY_OVERWRITE_ALIGNED), + VMLPAN(VM_MAP_COPY_OVERWRITE_NESTED), + VMLPAN(VM_MAP_COPY_OVERWRITE_UNALIGNED), + VMLPAN(VM_MAP_CREATE_UPL), + VMLPAN(VM_MAP_CS_DEBUGGED_SET), + VMLPAN(VM_MAP_CS_ENFORCEMENT_SET), + VMLPAN(VM_MAP_DELETE), + VMLPAN(VM_MAP_DELETE_SUBMAP_RECURSE), + VMLPAN(VM_MAP_DESTROY), + VMLPAN(VM_MAP_DISCONNECT_PAGE_MAPPINGS), + VMLPAN(VM_MAP_ENTER), + VMLPAN(VM_MAP_ENTER_MEM_OBJECT), + VMLPAN(VM_MAP_ENTRY_HAS_DEVICE_PAGER), + VMLPAN(VM_MAP_EXEC_LOCKDOWN), + VMLPAN(VM_MAP_FIND_SPACE), + VMLPAN(VM_MAP_FORK), + VMLPAN(VM_MAP_FORK_COPY), + VMLPAN(VM_MAP_FREEZE), + VMLPAN(VM_MAP_GET_PHYS_PAGE), + VMLPAN(VM_MAP_INHERIT), + VMLPAN(VM_MAP_INJECT_ERROR), + VMLPAN(VM_MAP_IS_CORPSE_SOURCE), + VMLPAN(VM_MAP_LOOKUP_AND_LOCK_OBJECT), + VMLPAN(VM_MAP_MACHINE_ATTRIBUTE), + VMLPAN(VM_MAP_MARK_ALIEN), + VMLPAN(VM_MAP_MSYNC), + VMLPAN(VM_MAP_NON_ALIGNED_TEST), /* now unused; can be removed on next breaking change */ + VMLPAN(VM_MAP_OVERWRITE_SUBMAP_RECURSE), + VMLPAN(VM_MAP_PAGEOUT), + VMLPAN(VM_MAP_PAGE_RANGE_INFO_INTERNAL), + VMLPAN(VM_MAP_PARTIAL_REAP), + VMLPAN(VM_MAP_PROTECT), + VMLPAN(VM_MAP_PURGABLE_CONTROL), + VMLPAN(VM_MAP_RAISE_MAX_OFFSET), + VMLPAN(VM_MAP_RAISE_MIN_OFFSET), + VMLPAN(VM_MAP_RANGE_CONFIGURE), + VMLPAN(VM_MAP_REGION), + VMLPAN(VM_MAP_REGION_RECURSE_64), + VMLPAN(VM_MAP_REMAP), + VMLPAN(VM_MAP_REMAP_EXTRACT), + VMLPAN(VM_MAP_REMOVE_AND_UNLOCK), + VMLPAN(VM_MAP_REMOVE_GUARD), + VMLPAN(VM_MAP_REUSABLE_PAGES), + VMLPAN(VM_MAP_REUSE_PAGES), + VMLPAN(VM_MAP_SET_CACHE_ATTR), + VMLPAN(VM_MAP_SET_CORPSE_SOURCE), + VMLPAN(VM_MAP_SET_DATA_LIMIT), + VMLPAN(VM_MAP_SET_MAX_ADDR), + VMLPAN(VM_MAP_SET_SIZE_LIMIT), + VMLPAN(VM_MAP_SET_TPRO_ENFORCEMENT), + VMLPAN(VM_MAP_SET_TPRO_RANGE), + VMLPAN(VM_MAP_SET_USER_WIRE_LIMIT), + VMLPAN(VM_MAP_SHADOW_MAX), + VMLPAN(VM_MAP_SIGN), + VMLPAN(VM_MAP_SIMPLIFY), + VMLPAN(VM_MAP_SINGLE_JIT), + VMLPAN(VM_MAP_SIZES), + VMLPAN(VM_MAP_SUBMAP_PMAP_CLEAN), + VMLPAN(VM_MAP_SWITCH_PROTECT), + VMLPAN(VM_MAP_TERMINATE), + VMLPAN(VM_MAP_UNSET_CORPSE_SOURCE), + VMLPAN(VM_MAP_UNWIRE_NESTED), + VMLPAN(VM_MAP_WILLNEED), + VMLPAN(VM_MAP_WIRE_NESTED), + VMLPAN(VM_MAP_ZERO), + VMLPAN(VM_PAGE_DIAGNOSE), + VMLPAN(VM_SHARED_REGION_MAP_FILE), + VMLPAN(VM_TOGGLE_ENTRY_REUSE), + VMLPAN(ZONE_METADATA_INIT), + VMLPAN(ZONE_SUBMAP_ALLOC_SEQUESTERED_VA), +} vmlp_api_event_t; + +#pragma mark Subcodes for Lock events + +typedef enum __enum_closed { + VMLP_EVENT_LOCK_TRY_EXCL = 1, + VMLP_EVENT_LOCK_FAIL_EXCL, + VMLP_EVENT_LOCK_REQ_EXCL, + VMLP_EVENT_LOCK_GOT_EXCL, + VMLP_EVENT_LOCK_UNLOCK_EXCL, + VMLP_EVENT_LOCK_DOWNGRADE, + VMLP_EVENT_LOCK_TRY_SH, + VMLP_EVENT_LOCK_FAIL_SH, + VMLP_EVENT_LOCK_REQ_SH, + VMLP_EVENT_LOCK_GOT_SH, + VMLP_EVENT_LOCK_UNLOCK_SH, + VMLP_EVENT_LOCK_TRY_UPGRADE, + VMLP_EVENT_LOCK_GOT_UPGRADE, + VMLP_EVENT_LOCK_FAIL_UPGRADE, + VMLP_EVENT_LOCK_SLEEP_BEGIN, + VMLP_EVENT_LOCK_SLEEP_END, + VMLP_EVENT_LOCK_YIELD_BEGIN, + VMLP_EVENT_LOCK_YIELD_END, +} vmlp_lock_event_t; + +#pragma mark Subcodes for Range events + +typedef enum __enum_closed { + VMLP_EVENT_RANGE = 1, +} vmlp_range_event_t; + +/* + * vmlp_* function calls do nothing under normal circumstances... + * If we ever change this behavior we need to reconsider whether DBG_MACH is + * the right class to be a subclass of given that it is enabled entirely in + * default traces. 
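To make the 5-bit type / 9-bit subcode split concrete: a trace post-processor has to pull these fields back out of the raw kdebug debugid. A minimal, self-contained sketch (plain user-space C; the offsets and masks are mirrored from the definitions at the top of this header, the debugid value is a made-up placeholder, and the numeric value of the DBG_MACH_VM_LOCK_PERF subclass is defined elsewhere in this patch):

	#include <stdint.h>
	#include <stdio.h>

	/* Mirrored from vm_lock_perf.h above. */
	#define VMLP_CODE_TYPE_OFFSET    (9)
	#define VMLP_CODE_TYPE_MASK      (0x1f)
	#define VMLP_CODE_SUBCODE_OFFSET (0)
	#define VMLP_CODE_SUBCODE_MASK   (0x1ff)

	int
	main(void)
	{
		/* Placeholder debugid as it might appear in a ktrace dump. */
		uint32_t debugid = 0x01ab1234;

		uint32_t klass    = (debugid >> 24) & 0xff;   /* kdebug class, expect DBG_MACH */
		uint32_t subclass = (debugid >> 16) & 0xff;   /* expect DBG_MACH_VM_LOCK_PERF */
		uint32_t code     = (debugid >> 2) & 0x3fff;  /* the 14-bit VMLP_CODE payload */
		uint32_t func     = debugid & 0x3;            /* DBG_FUNC_START/END/NONE */

		uint32_t type    = (code >> VMLP_CODE_TYPE_OFFSET) & VMLP_CODE_TYPE_MASK;
		uint32_t subcode = (code >> VMLP_CODE_SUBCODE_OFFSET) & VMLP_CODE_SUBCODE_MASK;

		printf("class=%#x subclass=%#x type=%u subcode=%u func=%u\n",
		    klass, subclass, type, subcode, func);
		return 0;
	}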
+ */ +#if !ENABLE_VM_LOCK_PERF + +#define vmlp_lock_event_unlocked(event, map) +#define vmlp_lock_event_locked(event, map) +#define vmlp_api_start(func) +#define vmlp_api_end(func, kr) +#define vmlp_range_event(map, addr, size) +#define vmlp_range_event_entry(map, entry) +#define vmlp_range_event_none(map) +#define vmlp_range_event_all(map) + +#else /* ...but when the module is enabled they emit tracepoints */ + +#pragma mark Debug infra + +/* + * Use stack counters to debug extra or missing end annotations. + * Should only be turned on while debugging annotations. + */ +#define VMLP_DEBUG_COUNTERS 0 + +#if VMLP_DEBUG_COUNTERS +static inline void +__vmlp_debug_counter_check(int *__vmlp_debug_counter) +{ + if (1 != *__vmlp_debug_counter) { + panic("vmlp_api_end was run %d times in this function (expected 1).", *__vmlp_debug_counter); + } +} +#define VMLP_DEBUG_COUNTER_DECLARE int __vmlp_debug_counter __attribute__((cleanup(__vmlp_debug_counter_check))) = 0 +#define VMLP_DEBUG_COUNTER_UPDATE __vmlp_debug_counter++ +#else +#define VMLP_DEBUG_COUNTER_DECLARE +#define VMLP_DEBUG_COUNTER_UPDATE +#endif + +#pragma mark API events + +static inline void +__vmlp_api_start(vmlp_api_event_t api) +{ + (void)api; + KDBG(VMLP_EVENTID(VM_LOCK_PERF_API_EVENT, api, DBG_FUNC_START)); +} +#define vmlp_api_start(func) VMLP_DEBUG_COUNTER_DECLARE; \ + __vmlp_api_start(VMLPAN(func)); + +static inline void +__vmlp_api_end(vmlp_api_event_t api, uint64_t kr) +{ + (void)api, (void)kr; + KDBG(VMLP_EVENTID(VM_LOCK_PERF_API_EVENT, api, DBG_FUNC_END), kr); +} +/* + * Note that post-processing will treat any non-zero kr as failure, so annotate + * accordingly when APIs do not return a kern_return_t. + */ +#define vmlp_api_end(func, kr) do { \ + VMLP_DEBUG_COUNTER_UPDATE; \ + __vmlp_api_end(VMLPAN(func), kr); \ +} while (0) + +#pragma mark Lock events + +static inline void +__vmlp_lock_event(vmlp_lock_event_t event, vm_map_t map, unsigned int timestamp) +{ + (void)event, (void)map, (void)timestamp; + KDBG(VMLP_EVENTID(VM_LOCK_PERF_LOCK_EVENT, event, DBG_FUNC_NONE), map, timestamp); +} +static inline void +vmlp_lock_event_unlocked(vmlp_lock_event_t event, vm_map_t map) +{ + /* + * If we don't hold a lock on the map it's not safe to access the + * timestamp. Pass 0 as placeholder. + */ + __vmlp_lock_event(event, map, 0); +} +/* + * Map timestamps get incremented at unlock time. Care should be taken to + * position this annotation before the timestamp increase. + */ +static inline void +vmlp_lock_event_locked(vmlp_lock_event_t event, vm_map_t map) +{ + /* + * Postprocessing can use the map timestamp to reorder events that are + * causally related but end up having the same ktrace-timestamp and + * showing up in reverse order because they occured on different CPUs. 
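The VMLP_DEBUG_COUNTER machinery above leans on the compiler's cleanup attribute so that the "exactly one end annotation" check runs on every return path of an annotated function. A minimal sketch of that pattern in isolation (standalone C using the GCC/clang __attribute__((cleanup)) extension, not kernel code; all names here are illustrative):

	#include <stdio.h>
	#include <stdlib.h>

	static void
	check_exactly_once(int *counter)
	{
		/* Runs automatically when the counter goes out of scope, on every return path. */
		if (*counter != 1) {
			fprintf(stderr, "end annotation ran %d times (expected 1)\n", *counter);
			abort();
		}
	}

	static int
	annotated_function(int fail_early)
	{
		int end_count __attribute__((cleanup(check_exactly_once))) = 0;

		if (fail_early) {
			end_count++;    /* the "api_end" annotation on the early-return path */
			return -1;
		}
		end_count++;            /* and on the normal path */
		return 0;
	}

	int
	main(void)
	{
		return annotated_function(0);
	}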
+ */ + __vmlp_lock_event(event, map, map->timestamp); +} + +#pragma mark Range events + +static inline void +vmlp_range_event(vm_map_t map, mach_vm_address_t addr, mach_vm_size_t size) +{ + (void)map, (void)addr, (void)size; + KDBG(VMLP_EVENTID(VM_LOCK_PERF_RANGE_EVENT, VMLP_EVENT_RANGE, DBG_FUNC_NONE), map, map->timestamp, addr, size); +} + +static inline void +vmlp_range_event_entry(vm_map_t map, vm_map_entry_t entry) +{ + vmlp_range_event(map, entry->vme_start, entry->vme_end - entry->vme_start); +} + +static inline void +vmlp_range_event_none(vm_map_t map) +{ + vmlp_range_event(map, 0, 0); +} + +static inline void +vmlp_range_event_all(vm_map_t map) +{ + vmlp_range_event(map, 0, 0xffffffffffffffff); +} + +#endif /* !ENABLE_VM_LOCK_PERF */ diff --git a/osfmk/vm/vm_log.h b/osfmk/vm/vm_log.h new file mode 100644 index 000000000..7d26c7a4c --- /dev/null +++ b/osfmk/vm/vm_log.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include + +#pragma once + +extern os_log_t vm_log_handle; +extern bool vm_log_debug_enabled; +extern bool vm_log_to_serial; + +#define _vm_log_with_type(type, format, ...) MACRO_BEGIN \ + if (os_unlikely(vm_log_to_serial)) { \ + printf("vm: " format, ##__VA_ARGS__); \ + } else { \ + os_log_with_startup_serial_and_type(vm_log_handle, type, "vm: " format, ##__VA_ARGS__); \ + } \ +MACRO_END +#define vm_log(format, ...) _vm_log_with_type(OS_LOG_TYPE_DEFAULT, format, ##__VA_ARGS__) +#define vm_log_info(format, ...) _vm_log_with_type(OS_LOG_TYPE_INFO, format, ##__VA_ARGS__) +#define vm_log_debug(format, ...) \ +MACRO_BEGIN \ + if (os_unlikely(vm_log_debug_enabled)) { \ + _vm_log_with_type(OS_LOG_TYPE_DEBUG, format, ##__VA_ARGS__); \ + } \ +MACRO_END +#define vm_log_error(format, ...) _vm_log_with_type(OS_LOG_TYPE_ERROR, format, ##__VA_ARGS__) +#define vm_log_fault(format, ...) 
_vm_log_with_type(OS_LOG_TYPE_FAULT, format, ##__VA_ARGS__) diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 17c51aaff..fb4fae013 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -91,6 +91,7 @@ #include #include +#include #include #include #include @@ -113,6 +114,7 @@ #include #include +#include #include #include #include @@ -123,6 +125,7 @@ #if DEVELOPMENT || DEBUG #include #endif /* DEVELOPMENT || DEBUG */ +#include #include #include @@ -137,6 +140,7 @@ #include #include +#include #include @@ -203,10 +207,13 @@ static TUNABLE(bool, vm_map_executable_immutable, #if DEVELOPMENT || DEBUG static TUNABLE(int, vm_map_kernel_alloc_limit_mode, - "vm_map_kernel_alloc_limit_mode", VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP); + "vm_map_kernel_alloc_limit_mode", VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT); #else -#define vm_map_kernel_alloc_limit_mode VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS -#endif /* DEVELOPMENT || DEBUG */ +#define vm_map_kernel_alloc_limit_mode VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT +#endif + +SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_map_entry_packing_params = + VM_PACKING_PARAMS(VM_ENTRY_PACKED_PTR); os_refgrp_decl(static, map_refgrp, "vm_map", NULL); @@ -254,8 +261,7 @@ static vm_map_entry_t vm_map_entry_insert( boolean_t needs_copy, vm_prot_t cur_protection, vm_prot_t max_protection, - vm_inherit_t inheritance, - boolean_t clear_map_aligned); + vm_inherit_t inheritance); static void vm_map_simplify_range( vm_map_t map, @@ -320,10 +326,17 @@ static kern_return_t vm_map_copy_overwrite_aligned( vm_map_offset_t start, pmap_t pmap); +__options_closed_decl(vm_map_copyin_strategy, uint8_t, { + VM_MAP_COPYIN_STRATEGY_INVALID_ARGUMENT, + VM_MAP_COPYIN_STRATEGY_KERNEL_BUFFER, + VM_MAP_COPYIN_STRATEGY_VIRTUAL_COPY, +}); + static kern_return_t vm_map_copyin_kernel_buffer( vm_map_t src_map, vm_map_address_t src_addr, vm_map_size_t len, + vm_map_copyin_strategy strategy, boolean_t src_destroy, vm_map_copy_t *copy_result); /* OUT */ @@ -464,7 +477,7 @@ kern_return_t vm_map_corpse_footprint_query_page_info( vm_map_t map, vm_map_offset_t va, int *disposition_p); -void vm_map_footprint_query_page_info( +static void vm_map_footprint_query_page_info_exclusive( vm_map_t map, vm_map_entry_t map_entry, vm_map_offset_t curr_s_offset, @@ -476,6 +489,18 @@ static void vm_map_range_map_init(void); pid_t find_largest_process_vm_map_entries(void); +uint8_t vm_map_entry_info_flags( + vm_map_entry_t entry); + + +#if DEBUG || DEVELOPMENT +#define panic_on_release_builds(format, ...) \ + ({}) +#else /* not DEBUG || DEVELOPMENT */ +#define panic_on_release_builds(format, ...) 
\ + panic(format __VA_OPT__(,) __VA_ARGS__) +#endif /* not DEBUG || DEVELOPMENT */ + __attribute__((always_inline)) int vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags) @@ -729,10 +754,14 @@ __attribute__((always_inline)) int vm_map_lock_read_to_write(vm_map_t map) { + vmlp_lock_event_locked(VMLP_EVENT_LOCK_TRY_UPGRADE, map); + assert(!vm_map_is_sealed(map)); if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) { DTRACE_VM(vm_map_lock_upgrade); + vmlp_lock_event_locked(VMLP_EVENT_LOCK_GOT_UPGRADE, map); return 0; } + vmlp_lock_event_unlocked(VMLP_EVENT_LOCK_FAIL_UPGRADE, map); return 1; } @@ -740,10 +769,13 @@ __attribute__((always_inline)) boolean_t vm_map_try_lock(vm_map_t map) { + vmlp_lock_event_unlocked(VMLP_EVENT_LOCK_TRY_EXCL, map); if (lck_rw_try_lock_exclusive(&(map)->lock)) { DTRACE_VM(vm_map_lock_w); + vmlp_lock_event_locked(VMLP_EVENT_LOCK_GOT_EXCL, map); return TRUE; } + vmlp_lock_event_unlocked(VMLP_EVENT_LOCK_FAIL_EXCL, map); return FALSE; } @@ -751,10 +783,13 @@ __attribute__((always_inline)) boolean_t vm_map_try_lock_read(vm_map_t map) { + vmlp_lock_event_unlocked(VMLP_EVENT_LOCK_TRY_SH, map); if (lck_rw_try_lock_shared(&(map)->lock)) { DTRACE_VM(vm_map_lock_r); + vmlp_lock_event_locked(VMLP_EVENT_LOCK_GOT_SH, map); return TRUE; } + vmlp_lock_event_unlocked(VMLP_EVENT_LOCK_FAIL_SH, map); return FALSE; } @@ -996,6 +1031,8 @@ vm_map_set_cache_attr( vm_object_t object; kern_return_t kr = KERN_SUCCESS; + vmlp_api_start(VM_MAP_SET_CACHE_ATTR); + vm_map_lock_read(map); if (!vm_map_lookup_entry(map, va, &map_entry) || @@ -1006,6 +1043,9 @@ vm_map_set_cache_attr( kr = KERN_INVALID_ARGUMENT; goto done; } + + vmlp_range_event_entry(map, map_entry); + object = VME_OBJECT(map_entry); if (object == VM_OBJECT_NULL) { @@ -1023,6 +1063,7 @@ vm_map_set_cache_attr( done: vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_SET_CACHE_ATTR, kr); return kr; } @@ -1056,10 +1097,13 @@ vm_map_apple_protected( vm_object_offset_t crypto_start, crypto_end; boolean_t cache_pager; + vmlp_api_start(VM_MAP_APPLE_PROTECTED); + map_locked = FALSE; unprotected_mem_obj = MEMORY_OBJECT_NULL; if (__improbable(vm_map_range_overflows(map, start, end - start))) { + vmlp_api_end(VM_MAP_APPLE_PROTECTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } start_aligned = vm_map_trunc_page(start, PAGE_MASK_64); @@ -1124,6 +1168,7 @@ vm_map_apple_protected( vm_map_clip_end(map, map_entry, end_aligned); tmp_entry = *map_entry; + vmlp_range_event_entry(map, &tmp_entry); map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */ vm_map_unlock(map); map_locked = FALSE; @@ -1261,6 +1306,7 @@ done: if (map_locked) { vm_map_unlock(map); } + vmlp_api_end(VM_MAP_APPLE_PROTECTED, kr); return kr; } #endif /* CONFIG_CODE_DECRYPTION */ @@ -1429,7 +1475,7 @@ vm_map_steal_memory(void) * are the ones registered with vm_map_will_allocate_early_map(), * which are: * - the kernel map - * - the various submaps used by zones (pgz, meta, ...) + * - the various submaps used by zones (meta, ...) 
* * We also need enough entries and holes to support them * until zone_metadata_init() is called, which is when @@ -1494,7 +1540,7 @@ vm_map_disable_hole_optimization(vm_map_t map) next_hole_entry = hole_entry->vme_next; hole_entry->vme_next = NULL; - hole_entry->vme_prev = NULL; + VME_PREV_SET(hole_entry, NULL); zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry); if (next_hole_entry == head_entry) { @@ -1507,7 +1553,7 @@ vm_map_disable_hole_optimization(vm_map_t map) map->holes_list = NULL; map->holelistenabled = FALSE; - map->first_free = vm_map_first_entry(map); + map->first_free = vm_map_to_entry(map); SAVE_HINT_HOLE_WRITE(map, NULL); } } @@ -1615,7 +1661,10 @@ vm_map_relocate_early_elem( } if (zone_id == ZONE_ID_VM_MAP) { - relocate(vm_map_t, hdr.links.prev); + struct vm_map_header * hdr_ptr = &((vm_map_t)new_addr)->hdr; + if (VMH_PREV(hdr_ptr)) { + VMH_PREV_SET(hdr_ptr, (vm_map_entry_t)(((vm_offset_t) VMH_PREV(hdr_ptr)) + delta)); + } relocate(vm_map_t, hdr.links.next); ((vm_map_t)new_addr)->pmap = kernel_pmap; #ifdef VM_MAP_STORE_USE_RB @@ -1627,7 +1676,10 @@ vm_map_relocate_early_elem( return; } - relocate(struct vm_map_links *, prev); + struct vm_map_links * links_ptr = ((struct vm_map_links *)new_addr); + if (VML_PREV(links_ptr)) { + VML_PREV_SET(links_ptr, (vm_map_entry_t) (((vm_offset_t) VML_PREV(links_ptr)) + delta)); + } relocate(struct vm_map_links *, next); if (zone_id == ZONE_ID_VM_MAP_ENTRY) { @@ -1649,6 +1701,32 @@ vm_map_relocate_early_elem( #undef relocate } +/* + * Generate a serial ID to identify a newly allocated vm_map + */ +static uintptr_t vm_map_serial_current = 0; +vm_map_serial_t vm_map_serial_generate(void); +void vm_map_assign_serial(vm_map_t, vm_map_serial_t); + +vm_map_serial_t +vm_map_serial_generate(void) +{ + vm_map_serial_t serial = (void *)os_atomic_inc(&vm_map_serial_current, relaxed); + return serial; +} + +void +vm_map_assign_serial(vm_map_t map, vm_map_serial_t serial) +{ + map->serial_id = serial; +#if CONFIG_SPTM + /* Copy through our ID to the pmap (only available on SPTM systems) */ + if (map->pmap) { + map->pmap->associated_vm_map_serial_id = map->serial_id; + } +#endif /* CONFIG_SPTM */ +} + vm_map_t vm_map_create_options( pmap_t pmap, @@ -1682,7 +1760,21 @@ vm_map_create_options( result->data_limit = RLIM_INFINITY; /* default unlimited */ result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */ os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1); + result->pmap = pmap; + + /* + * Immediately give ourselves an ID + * Unless this map is being created as part of a fork, in which case + * the caller will reassign the ID of the parent (so don't waste an + * increment here). + * Reusing parent IDs on fork enacts our policy that fork() pairs share + * a domain and can freely alias tagged MTE mappings between themselves. 
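A rough sketch of the intended serial-ID flow described above, assuming the vm_map_create_options() signature and the VM_MAP_CREATE_DEFAULT option from the surrounding sources; the real fork-time call site is not shown in this hunk, and the helper names below are illustrative only:

	/* Fresh address space: vm_map_create_options() assigns a brand new ID. */
	static vm_map_t
	example_map_for_new_task(pmap_t pmap, vm_map_offset_t min, vm_map_offset_t max)
	{
		return vm_map_create_options(pmap, min, max, VM_MAP_CREATE_DEFAULT);
	}

	/*
	 * Forked child: create with VM_MAP_CREATE_VIA_FORK so no ID is generated,
	 * then copy the parent's serial so the pair shares one tag domain.
	 */
	static vm_map_t
	example_map_for_forked_child(vm_map_t parent, pmap_t child_pmap,
	    vm_map_offset_t min, vm_map_offset_t max)
	{
		vm_map_t child = vm_map_create_options(child_pmap, min, max,
		    VM_MAP_CREATE_VIA_FORK);
		vm_map_assign_serial(child, parent->serial_id);
		return child;
	}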
+ */ + if ((options & VM_MAP_CREATE_VIA_FORK) == 0) { + vm_map_assign_serial(result, vm_map_serial_generate()); + } + result->min_offset = min; result->max_offset = max; result->first_free = vm_map_to_entry(result); @@ -1707,7 +1799,8 @@ vm_map_create_options( */ hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS); result->holes_list = result->hole_hint = hole_entry; - hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry); + VML_PREV_SET(hole_entry, CAST_TO_VM_MAP_ENTRY(hole_entry)); + hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry); result->holelistenabled = true; } @@ -1963,7 +2056,15 @@ vm_map_destroy( VM_MAP_ZAP_DECLARE(zap); - vm_map_lock(map); + vmlp_api_start(VM_MAP_DESTROY); + vmlp_range_event_all(map); + + if (vm_map_is_sealed(map)) { + vm_map_lock_unseal(map); + assert(!vm_map_is_sealed(map)); + } else { + vm_map_lock(map); + } map->terminated = true; /* clean up regular map entries */ @@ -1994,6 +2095,7 @@ vm_map_destroy( #endif zfree_id(ZONE_ID_VM_MAP, map); + vmlp_api_end(VM_MAP_DESTROY, 0); } /* @@ -2048,18 +2150,12 @@ vm_map_lookup_entry( { bool result = false; -#if CONFIG_KERNEL_TAGGING +#if KASAN_TBI if (VM_KERNEL_ADDRESS(address)) { address = vm_memtag_canonicalize_kernel(address); } -#endif /* CONFIG_KERNEL_TAGGING */ +#endif /* KASAN_TBI */ -#if CONFIG_PROB_GZALLOC - if (map->pmap == kernel_pmap) { - assertf(!pgz_owned(address), - "it is the responsibility of callers to unguard PGZ addresses"); - } -#endif /* CONFIG_PROB_GZALLOC */ result = vm_map_store_lookup_entry( map, address, entry ); return result; @@ -2079,23 +2175,6 @@ vm_map_lookup_entry_or_next( return false; } -#if CONFIG_PROB_GZALLOC -boolean_t -vm_map_lookup_entry_allow_pgz( - vm_map_t map, - vm_map_offset_t address, - vm_map_entry_t *entry) /* OUT */ -{ -#if CONFIG_KERNEL_TAGGING - if (VM_KERNEL_ADDRESS(address)) { - address = vm_memtag_canonicalize_kernel(address); - } -#endif /* CONFIG_KERNEL_TAGGING */ - - return vm_map_store_lookup_entry( map, address, entry ); -} -#endif /* CONFIG_PROB_GZALLOC */ - /* * Routine: vm_map_range_invalid_panic * Purpose: @@ -2474,7 +2553,7 @@ vm_map_locate_space_fixed( } if (vmk_flags.vmf_overwrite) { - vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE; + vmr_flags_t remove_flags = VM_MAP_REMOVE_TO_OVERWRITE; kern_return_t remove_kr; /* @@ -2553,7 +2632,10 @@ vm_map_find_space( vm_map_entry_t new_entry, entry; kern_return_t kr; + vmlp_api_start(VM_MAP_FIND_SPACE); + if (size == 0) { + vmlp_api_end(VM_MAP_FIND_SPACE, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -2562,9 +2644,6 @@ vm_map_find_space( new_entry->protection = VM_PROT_DEFAULT; new_entry->max_protection = VM_PROT_ALL; - if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) { - new_entry->map_aligned = true; - } if (vmk_flags.vmf_permanent) { new_entry->vme_permanent = true; } @@ -2576,6 +2655,7 @@ vm_map_find_space( if (kr != KERN_SUCCESS) { vm_map_unlock(map); vm_map_entry_dispose(new_entry); + vmlp_api_end(VM_MAP_FIND_SPACE, kr); return kr; } new_entry->vme_start = hint_address; @@ -2606,6 +2686,7 @@ vm_map_find_space( vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE); map->size += size; + vmlp_range_event_entry(map, new_entry); /* * Update the lookup hint @@ -2613,6 +2694,7 @@ vm_map_find_space( SAVE_HINT_MAP_WRITE(map, new_entry); *o_entry = new_entry; + vmlp_api_end(VM_MAP_FIND_SPACE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -2686,6 +2768,7 @@ vm_map_pmap_enter( map, (unsigned long long)addr, object, (unsigned long 
long)offset); } type_of_fault = DBG_CACHE_HIT_FAULT; + bool page_sleep_needed = false; kr = vm_fault_enter(m, map->pmap, addr, PAGE_SIZE, 0, @@ -2695,9 +2778,11 @@ vm_map_pmap_enter( &fault_info, NULL, /* need_retry */ &type_of_fault, - &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/ + &object_lock_type, /* Exclusive lock mode. Will remain unchanged.*/ + &page_sleep_needed); vm_object_unlock(object); + assert(!page_sleep_needed); offset += PAGE_SIZE_64; addr += PAGE_SIZE; @@ -2746,12 +2831,6 @@ vm_map_random_address_for_size( effective_range.min_address + (random_addr % addr_space_size), VM_MAP_PAGE_MASK(map)); -#if CONFIG_PROB_GZALLOC - if (map->pmap == kernel_pmap && pgz_owned(random_addr)) { - continue; - } -#endif /* CONFIG_PROB_GZALLOC */ - if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) { if (prev_entry == vm_map_to_entry(map)) { next_entry = vm_map_first_entry(map); @@ -2850,12 +2929,13 @@ vm_map_enter( const vm_tag_t alias = vmk_flags.vm_tag; vm_tag_t user_alias; kern_return_t kr; - bool clear_map_aligned = FALSE; vm_map_size_t chunk_size = 0; vm_object_t caller_object; VM_MAP_ZAP_DECLARE(zap_old_list); VM_MAP_ZAP_DECLARE(zap_new_list); + vmlp_api_start(VM_MAP_ENTER); + caller_object = object; assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2); @@ -2875,6 +2955,7 @@ vm_map_enter( if (superpage_size) { if (object != VM_OBJECT_NULL) { /* caller can't provide their own VM object */ + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } switch (superpage_size) { @@ -2894,10 +2975,12 @@ vm_map_enter( break; #endif default: + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } mask = SUPERPAGE_SIZE - 1; if (size & (SUPERPAGE_SIZE - 1)) { + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */ @@ -2938,6 +3021,7 @@ vm_map_enter( (vm_protect_wx_fail ? "failing" : "turning off execute")); cur_protection &= ~VM_PROT_EXECUTE; if (vm_protect_wx_fail) { + vmlp_api_end(VM_MAP_ENTER, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } } @@ -2984,6 +3068,7 @@ vm_map_enter( : "?"), __FUNCTION__, cur_protection); + vmlp_api_end(VM_MAP_ENTER, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; #endif } @@ -2994,6 +3079,7 @@ vm_map_enter( */ if (map->map_disallow_new_exec == TRUE) { if (cur_protection & VM_PROT_EXECUTE) { + vmlp_api_end(VM_MAP_ENTER, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } } @@ -3002,6 +3088,7 @@ vm_map_enter( assert(!is_submap); int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC)); if ((cur_protection | max_protection) & reject_prot) { + vmlp_api_end(VM_MAP_ENTER, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } } @@ -3020,6 +3107,7 @@ vm_map_enter( * contents of the mapped object (e.g. the file), * so we can't provide any media resiliency here. 
*/ + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } } @@ -3034,6 +3122,7 @@ vm_map_enter( if (!(max_protection & VM_PROT_READ) || !(max_protection & VM_PROT_WRITE) || !(cur_protection & VM_PROT_READ)) { + vmlp_api_end(VM_MAP_ENTER, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } @@ -3049,15 +3138,18 @@ vm_map_enter( vm_map_t submap; if (purgable) { /* submaps can not be purgeable */ + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } if (object == VM_OBJECT_NULL) { /* submaps can not be created lazily */ + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } submap = (vm_map_t) object; if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) { /* page size mismatch */ + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } } @@ -3070,6 +3162,7 @@ vm_map_enter( * (!VM_FLAGS_OVERWRITE). */ if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) { + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } } @@ -3077,6 +3170,7 @@ vm_map_enter( if (size == 0 || (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) { *address = 0; + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -3092,38 +3186,8 @@ vm_map_enter( #define RETURN(value) { result = value; goto BailOut; } - assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address); - assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size); - if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) { - assertf(page_aligned(*address), "0x%llx", (uint64_t)*address); - assertf(page_aligned(size), "0x%llx", (uint64_t)size); - } - - if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK && - !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) { - /* - * In most cases, the caller rounds the size up to the - * map's page size. - * If we get a size that is explicitly not map-aligned here, - * we'll have to respect the caller's wish and mark the - * mapping as "not map-aligned" to avoid tripping the - * map alignment checks later. - */ - clear_map_aligned = TRUE; - } - if (!anywhere && - VM_MAP_PAGE_MASK(map) >= PAGE_MASK && - !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) { - /* - * We've been asked to map at a fixed address and that - * address is not aligned to the map's specific alignment. - * The caller should know what it's doing (i.e. most likely - * mapping some fragmented copy map, transferring memory from - * a VM map with a different alignment), so clear map_aligned - * for this new VM map entry and proceed. - */ - clear_map_aligned = TRUE; - } + assertf(VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map)), "0x%llx", (uint64_t)*address); + assertf(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)), "0x%llx", (uint64_t)size); /* * Only zero-fill objects are allowed to be purgable. 
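The pattern above repeats across vm_map_enter() and the other annotated entry points in this file: vmlp_api_start() opens the interval, every early return reports its result through vmlp_api_end(), and a range event records the span being operated on. A compressed sketch of the convention (illustrative function, not part of xnu; the vmlp_* macros compile to nothing unless ENABLE_VM_LOCK_PERF is set):

	static kern_return_t
	example_annotated_api(vm_map_t map, vm_map_offset_t start, vm_map_size_t size)
	{
		kern_return_t kr;

		vmlp_api_start(VM_MAP_ENTER);            /* DBG_FUNC_START */

		if (size == 0) {
			/* every early return is annotated with its result */
			vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT);
			return KERN_INVALID_ARGUMENT;
		}

		vm_map_lock(map);
		vmlp_range_event(map, start, size);      /* record the range being touched */
		kr = KERN_SUCCESS;                       /* ... real work would happen here ... */
		vm_map_unlock(map);

		vmlp_api_end(VM_MAP_ENTER, kr);          /* DBG_FUNC_END, with the result */
		return kr;
	}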
@@ -3138,6 +3202,7 @@ vm_map_enter( || size > ANON_MAX_SIZE #endif )) { + vmlp_api_end(VM_MAP_ENTER, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -3149,6 +3214,7 @@ vm_map_enter( vm_map_lock(map); map_locked = TRUE; + if (anywhere) { result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags, address, &entry); @@ -3159,6 +3225,7 @@ vm_map_enter( vmk_flags, &entry, &zap_old_list); } + vmlp_range_event(map, start, size); end = start + size; assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); @@ -3275,7 +3342,7 @@ vm_map_enter( if (object == VM_OBJECT_NULL) { assert(!superpage_size); - object = vm_object_allocate(size); + object = vm_object_allocate(size, map->serial_id); vm_object_lock(object); object->copy_strategy = MEMORY_OBJECT_COPY_NONE; VM_OBJECT_SET_TRUE_SHARE(object, FALSE); @@ -3337,11 +3404,6 @@ vm_map_enter( !((entry->protection & VM_PROT_EXECUTE) && entry->vme_permanent) && (!entry->superpage_size && !superpage_size) && - /* - * No coalescing if not map-aligned, to avoid propagating - * that condition any further than needed: - */ - (!entry->map_aligned || !clear_map_aligned) && (!entry->zero_wired_pages) && (!entry->used_for_jit && !entry_for_jit) && #if __arm64e__ @@ -3440,8 +3502,7 @@ vm_map_enter( needs_copy, cur_protection, max_protection, (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ? - VM_INHERIT_NONE : inheritance), - clear_map_aligned); + VM_INHERIT_NONE : inheritance)); assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias)); @@ -3512,12 +3573,25 @@ vm_map_enter( ledger_t ledger = map->pmap->ledger; /* we need a sub pmap to nest... */ submap->pmap = pmap_create_options(ledger, 0, - submap_is_64bit ? PMAP_CREATE_64BIT : 0); + submap_is_64bit ? PMAP_CREATE_64BIT | PMAP_CREATE_NESTED : PMAP_CREATE_NESTED); if (submap->pmap == NULL) { /* let's proceed without nesting... */ } #if defined(__arm64__) else { + /* + * When a nested pmap is created within vm_shared_region_create, we + * need to call csm_setup_nested_address_space, but the same doesn't + * need to happen here. + * + * We only enter the parent if-block if use_pmap is set to true, which + * is based on vmkf_nested_pmap. This flag is only set by two functions, + * vm_shared_region_enter, and vm_commpage_enter. The former performs a + * shared region lookup, which uses vm_shared_region_create. This path + * already creates a pmap, so submap->pmap != NULL. The latter doesn't + * go through the VM layer on arm64 systems anymore. As a result, there + * is no case on arm64 where a nested pmap is actually in this path. 
+ */ pmap_set_nested(submap->pmap); } #endif @@ -3568,7 +3642,7 @@ vm_map_enter( } /* create one vm_object per superpage */ - sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start)); + sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start), map->serial_id); vm_object_lock(sp_object); sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE); @@ -3768,7 +3842,6 @@ BailOut: vm_map_lock(map); map_locked = TRUE; } - remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN; remove_flags |= VM_MAP_REMOVE_NO_YIELD; if (permanent) { remove_flags |= VM_MAP_REMOVE_IMMUTABLE; @@ -3869,6 +3942,7 @@ BailOut: } } + vmlp_api_end(VM_MAP_ENTER, result); return result; #undef RETURN @@ -3879,6 +3953,9 @@ BailOut: */ int64_t vm_prefault_nb_pages = 0; int64_t vm_prefault_nb_bailout = 0; +int64_t vm_prefault_nb_no_page = 0; +int64_t vm_prefault_nb_wrong_page = 0; + static kern_return_t vm_map_enter_adjust_offset( @@ -4013,6 +4090,8 @@ vm_map_enter_mem_object( boolean_t kernel_prefault, try_prefault = (page_list_count != 0); vm_map_offset_t offset_in_mapping = 0; + vmlp_api_start(VM_MAP_ENTER_MEM_OBJECT); + if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) { /* XXX TODO4K prefaulting depends on page size... */ try_prefault = FALSE; @@ -4023,6 +4102,7 @@ vm_map_enter_mem_object( */ if ((target_map == VM_MAP_NULL) || (try_prefault && (copy || !page_list))) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -4052,6 +4132,7 @@ vm_map_enter_mem_object( &max_protection, &inheritance); if (__improbable(result != KERN_SUCCESS)) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, vm_sanitize_get_kr(result)); return vm_sanitize_get_kr(result); } @@ -4075,7 +4156,7 @@ vm_map_enter_mem_object( if (!IP_VALID(port)) { object = VM_OBJECT_NULL; copy = FALSE; - } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) { + } else if (ip_type(port) == IKOT_NAMED_ENTRY) { vm_named_entry_t named_entry; vm_object_size_t initial_size; @@ -4086,6 +4167,7 @@ vm_map_enter_mem_object( result = vm_map_enter_adjust_offset(&obj_offs, &obj_end, named_entry->data_offset); if (__improbable(result)) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, result); return result; } } @@ -4099,10 +4181,12 @@ vm_map_enter_mem_object( } if ((named_entry->protection & max_protection) != max_protection) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_RIGHT); return KERN_INVALID_RIGHT; } if ((named_entry->protection & cur_protection) != cur_protection) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_RIGHT); return KERN_INVALID_RIGHT; } @@ -4112,6 +4196,7 @@ vm_map_enter_mem_object( */ initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u); if (named_entry->size < obj_offs + initial_size) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -4136,6 +4221,7 @@ vm_map_enter_mem_object( result = vm_map_enter_adjust_offset(&obj_offs, &obj_end, named_entry->offset); if (__improbable(result)) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, result); return result; } } @@ -4183,6 +4269,7 @@ vm_map_enter_mem_object( inheritance); if (result != KERN_SUCCESS) { vm_map_deallocate(submap); + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, result); return result; } /* @@ -4231,6 +4318,7 @@ vm_map_enter_mem_object( if (!vm_map_kernel_flags_check_vmflags(vmk_flags, allowed_flags)) { named_entry_unlock(named_entry); + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -4243,6 +4331,7 
@@ vm_map_enter_mem_object( "unsupported type 0x%x\n", copy_map->type); named_entry_unlock(named_entry); + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -4277,6 +4366,7 @@ vm_map_enter_mem_object( &trimmed_start); if (kr != KERN_SUCCESS) { named_entry_unlock(named_entry); + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, kr); return kr; } target_size = target_copy_map->size; @@ -4317,6 +4407,7 @@ vm_map_enter_mem_object( target_copy_map = VM_MAP_COPY_NULL; } named_entry_unlock(named_entry); + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, kr); return kr; } @@ -4638,12 +4729,12 @@ vm_map_enter_mem_object( } else { panic("invalid VM named entry %p", named_entry); } - } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) { + } else if (ip_type(port) == IKOT_MEMORY_OBJECT) { /* * JMM - This is temporary until we unify named entries * and raw memory objects. * - * Detected fake ip_kotype for a memory object. In + * Detected fake object type for a memory object. In * this case, the port isn't really a port at all, but * instead is just a raw memory object. */ @@ -4654,6 +4745,7 @@ vm_map_enter_mem_object( object = memory_object_to_vm_object((memory_object_t)port); if (object == VM_OBJECT_NULL) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_OBJECT); return KERN_INVALID_OBJECT; } vm_object_reference(object); @@ -4663,6 +4755,7 @@ vm_map_enter_mem_object( if (is_kernel_object(object)) { printf("Warning: Attempt to map kernel object" " by a non-private kernel entity\n"); + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_OBJECT); return KERN_INVALID_OBJECT; } if (!object->pager_ready) { @@ -4678,6 +4771,7 @@ vm_map_enter_mem_object( } } } else { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, KERN_INVALID_OBJECT); return KERN_INVALID_OBJECT; } @@ -4777,6 +4871,7 @@ vm_map_enter_mem_object( vm_object_deallocate(object); if (result != KERN_SUCCESS) { + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, result); return result; } @@ -4791,6 +4886,11 @@ vm_map_enter_mem_object( kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map)); vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault); + if (try_prefault) { + /* take an extra reference to keep object alive during "prefault" */ + vm_object_reference(object); + } + result = vm_map_enter(target_map, &map_addr, map_size, (vm_map_offset_t)mask, @@ -4808,12 +4908,15 @@ vm_map_enter_mem_object( */ if (result == KERN_SUCCESS && try_prefault) { mach_vm_address_t va = map_addr; + vm_object_offset_t page_offset; kern_return_t kr = KERN_SUCCESS; unsigned int i = 0; int pmap_options; pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT; + vm_object_lock(object); + page_offset = obj_offs; for (i = 0; i < page_list_count; ++i) { if (!UPL_VALID_PAGE(page_list, i)) { if (kernel_prefault) { @@ -4822,6 +4925,47 @@ vm_map_enter_mem_object( break; } } else { + if (object->phys_contiguous) { + /* no VM page to look up... */ + } else { + vm_page_t p; + + p = vm_page_lookup(object, page_offset); + assertf(p, "offset 0x%llx: no VM page", page_offset); + if (p == VM_PAGE_NULL) { + /* + * No VM page , so nothing to prefault. + * Note that this should not happen if + * we really had the page in the UPL, + * so let's give up on prefaulting... + */ + vm_prefault_nb_no_page++; + break; + } else if (VM_PAGE_GET_PHYS_PAGE(p) != + UPL_PHYS_PAGE(page_list, i)) { + /* + * Different physical page; that should + * also not happen, so let's give up... 
+ */ + assertf(VM_PAGE_GET_PHYS_PAGE(p) == UPL_PHYS_PAGE(page_list, i), + "offset 0x%llx: page %p phys 0x%x != 0x%x\n", + page_offset, p, + VM_PAGE_GET_PHYS_PAGE(p), + UPL_PHYS_PAGE(page_list, i)); + vm_prefault_nb_wrong_page++; + break; + } else { + /* + * Register that this VM page was pmapped, + * so that we know to clean up its pmap + * mappings if we end up reclaiming it + * before this mapping goes away... + */ + if (!p->vmp_pmapped) { + p->vmp_pmapped = true; + } + } + } /* * If this function call failed, we should stop * trying to optimize, other calls are likely @@ -4847,12 +4991,19 @@ vm_map_enter_mem_object( /* Next virtual address */ va += PAGE_SIZE; + page_offset += PAGE_SIZE; } + vm_object_unlock(object); if (vmk_flags.vmkf_keep_map_locked) { vm_map_unlock(target_map); } } + if (try_prefault) { + /* release our extra "prefault" reference */ + vm_object_deallocate(object); + } + out: if (result == KERN_SUCCESS) { #if KASAN @@ -4861,7 +5012,9 @@ out: } #endif *address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping); + vmlp_range_event(target_map, map_addr, map_size); } + vmlp_api_end(VM_MAP_ENTER_MEM_OBJECT, result); return result; } @@ -5275,6 +5428,20 @@ __vm_map_clip_atomic_entry_panic( (uint64_t)where); } +__abortlike +static void +__vm_map_clip_sealed_panic( + vm_map_t map, + vm_map_entry_t entry, + vm_map_offset_t where) +{ + panic("vm_map_clip(%p): Attempting to clip in a sealed VM map " + "%p [0x%llx:0x%llx] at 0x%llx", map, entry, + (uint64_t)entry->vme_start, + (uint64_t)entry->vme_end, + (uint64_t)where); +} + /* * vm_map_clip_start: [ internal use only ] * @@ -5288,6 +5455,10 @@ vm_map_clip_start( vm_map_entry_t entry, vm_map_offset_t startaddr) { + if (__improbable(vm_map_is_sealed(map))) { + __vm_map_clip_sealed_panic(map, entry, startaddr); + } + #ifndef NO_NESTED_PMAP if (entry->is_sub_map && entry->use_pmap && @@ -5361,11 +5532,7 @@ _vm_map_clip_start( * this entry has the specified starting * address. 
*/ - - if (entry->map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(start, - VM_MAP_HDR_PAGE_MASK(map_header))); - } + assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_HDR_PAGE_MASK(map_header))); new_entry = _vm_map_entry_create(map_header); vm_map_entry_copy_full(new_entry, entry); @@ -5379,13 +5546,7 @@ _vm_map_clip_start( assert(start < entry->vme_end); entry->vme_start = start; -#if VM_BTLOG_TAGS - if (new_entry->vme_kernel_object) { - btref_retain(new_entry->vme_tag_btref); - } -#endif /* VM_BTLOG_TAGS */ - - _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry); + _vm_map_store_entry_link(map_header, VME_PREV(entry), new_entry); if (entry->is_sub_map) { vm_map_reference(VME_SUBMAP(new_entry)); @@ -5408,6 +5569,10 @@ vm_map_clip_end( vm_map_entry_t entry, vm_map_offset_t endaddr) { + if (__improbable(vm_map_is_sealed(map))) { + __vm_map_clip_sealed_panic(map, entry, endaddr); + } + if (endaddr > entry->vme_end) { /* * Within the scope of this clipping, limit "endaddr" to @@ -5485,10 +5650,7 @@ _vm_map_clip_end( * AFTER the specified entry */ - if (entry->map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(end, - VM_MAP_HDR_PAGE_MASK(map_header))); - } + assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_HDR_PAGE_MASK(map_header))); new_entry = _vm_map_entry_create(map_header); vm_map_entry_copy_full(new_entry, entry); @@ -5502,12 +5664,6 @@ _vm_map_clip_end( VME_OFFSET(new_entry) + (end - entry->vme_start)); assert(new_entry->vme_start < new_entry->vme_end); -#if VM_BTLOG_TAGS - if (new_entry->vme_kernel_object) { - btref_retain(new_entry->vme_tag_btref); - } -#endif /* VM_BTLOG_TAGS */ - _vm_map_store_entry_link(map_header, entry, new_entry); if (entry->is_sub_map) { @@ -5613,6 +5769,8 @@ vm_map_protect_sanitize( { kern_return_t kr; vm_map_size_t size; + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT, map, VM_PROT_COPY, new_prot); @@ -5621,7 +5779,7 @@ vm_map_protect_sanitize( } kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT, - map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size); + map, flags, start, end, &size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } @@ -5655,6 +5813,8 @@ vm_map_protect( vm_map_offset_t start, original_start; vm_map_offset_t end; + vmlp_api_start(VM_MAP_PROTECT); + kr = vm_map_protect_sanitize(map, start_u, end_u, @@ -5663,7 +5823,9 @@ vm_map_protect( &end, &new_prot); if (__improbable(kr != KERN_SUCCESS)) { - return vm_sanitize_get_kr(kr); + kr = vm_sanitize_get_kr(kr); + vmlp_api_end(VM_MAP_PROTECT, kr); + return kr; } original_start = start; @@ -5674,6 +5836,7 @@ vm_map_protect( /* LP64todo - see below */ if (start >= map->max_offset) { + vmlp_api_end(VM_MAP_PROTECT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -5703,6 +5866,7 @@ vm_map_protect( (uint64_t)0, #endif /* DEVELOPMENT || DEBUG */ new_prot); + vmlp_api_end(VM_MAP_PROTECT, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } @@ -5735,12 +5899,15 @@ vm_map_protect( vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */ VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { + vmlp_api_end(VM_MAP_PROTECT, kr); return kr; } new_prot &= ~VM_PROT_COPY; } vm_map_lock(map); + vmlp_range_event(map, start, end - start); + restart_after_unlock: /* LP64todo - remove this check when vm_map_commpage64() @@ -5749,6 +5916,7 @@ restart_after_unlock: */ if (start >= map->max_offset) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ 
-5759,6 +5927,7 @@ restart_after_unlock: */ if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -5786,6 +5955,7 @@ restart_after_unlock: */ if (current->vme_start != prev) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -5803,12 +5973,14 @@ restart_after_unlock: #endif if ((new_prot & new_max) != new_prot) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } if (current->used_for_jit && pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } @@ -5816,6 +5988,7 @@ restart_after_unlock: /* Disallow protecting hw assisted TPRO mappings */ if (current->used_for_tpro) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } #endif /* __arm64e__ */ @@ -5856,6 +6029,7 @@ restart_after_unlock: new_prot &= ~VM_PROT_ALLEXEC; if (VM_MAP_POLICY_WX_FAIL(map)) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } } @@ -5870,6 +6044,7 @@ restart_after_unlock: if ((new_prot & VM_PROT_ALLEXEC) || ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } } @@ -5878,30 +6053,9 @@ restart_after_unlock: current = current->vme_next; } -#if __arm64__ - if (end > prev && - end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) { - vm_map_entry_t prev_entry; - - prev_entry = current->vme_prev; - if (prev_entry != vm_map_to_entry(map) && - !prev_entry->map_aligned && - (vm_map_round_page(prev_entry->vme_end, - VM_MAP_PAGE_MASK(map)) - == end)) { - /* - * The last entry in our range is not "map-aligned" - * but it would have reached all the way to "end" - * if it had been map-aligned, so this is not really - * a hole in the range and we can proceed. 
- */ - prev = end; - } - } -#endif /* __arm64__ */ - if (end > prev) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -6129,6 +6283,7 @@ restart_after_unlock: } vm_map_unlock(map); + vmlp_api_end(VM_MAP_PROTECT, KERN_SUCCESS); return KERN_SUCCESS; } @@ -6152,8 +6307,11 @@ vm_map_inherit_sanitize( return kr; } + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + + kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT, - map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size); + map, flags, start, end, &size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } @@ -6183,6 +6341,8 @@ vm_map_inherit( vm_map_offset_t end; vm_inherit_t new_inheritance; + vmlp_api_start(VM_MAP_INHERIT); + kr = vm_map_inherit_sanitize(map, start_u, end_u, @@ -6191,12 +6351,15 @@ vm_map_inherit( &end, &new_inheritance); if (__improbable(kr != KERN_SUCCESS)) { - return vm_sanitize_get_kr(kr); + kr = vm_sanitize_get_kr(kr); + vmlp_api_end(VM_MAP_INHERIT, kr); + return kr; } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); + vmlp_range_event(map, start, end - start); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; @@ -6211,6 +6374,7 @@ vm_map_inherit( if (entry->is_sub_map) { if (new_inheritance == VM_INHERIT_COPY) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_INHERIT, KERN_INVALID_ADDRESS); return KERN_INVALID_ARGUMENT; } } @@ -6237,6 +6401,7 @@ vm_map_inherit( } vm_map_unlock(map); + vmlp_api_end(VM_MAP_INHERIT, KERN_SUCCESS); return KERN_SUCCESS; } @@ -6400,6 +6565,9 @@ vm_map_wire_nested( boolean_t wire_and_extract; vm_prot_t extra_prots; + vmlp_api_start(VM_MAP_WIRE_NESTED); + vmlp_range_event(map, start, end - start); + extra_prots = VM_PROT_COPY; extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE; #if XNU_TARGET_OS_OSX @@ -6436,6 +6604,7 @@ vm_map_wire_nested( assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); if (start == end) { /* We wired what the caller asked for, zero pages */ + vmlp_api_end(VM_MAP_WIRE_NESTED, KERN_SUCCESS); return KERN_SUCCESS; } @@ -6627,6 +6796,7 @@ vm_map_wire_nested( assert(map_pmap == NULL); vm_map_unwire_nested(map, start, s, user_wire, PMAP_NULL, 0); + vmlp_api_end(VM_MAP_WIRE_NESTED, rc); return rc; } vm_object_unlock(object); @@ -6894,7 +7064,7 @@ vm_map_wire_nested( rc = KERN_INVALID_ARGUMENT; goto done; } - VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0); + VME_OBJECT_SET(entry, vm_object_allocate(size, map->serial_id), false, 0); VME_OFFSET_SET(entry, (vm_object_offset_t)0); assert(entry->use_pmap); } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { @@ -7126,6 +7296,7 @@ done: } } + vmlp_api_end(VM_MAP_WIRE_NESTED, rc); return rc; } @@ -7144,9 +7315,11 @@ vm_map_wire_sanitize( { kern_return_t kr; + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + + kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map, - VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, - size); + flags, start, end, size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } @@ -7262,16 +7435,22 @@ vm_map_unwire_nested( boolean_t main_map = FALSE; unsigned int last_timestamp; + vmlp_api_start(VM_MAP_UNWIRE_NESTED); + VM_MAP_RANGE_CHECK(map, start, end); assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); if (start == end) { /* We unwired what the caller asked for: zero pages */ + vmlp_api_end(VM_MAP_UNWIRE_NESTED, KERN_SUCCESS); return 
KERN_SUCCESS; } vm_map_lock(map); + + vmlp_range_event(map, start, end - start); + if (map_pmap == NULL) { main_map = TRUE; } @@ -7289,12 +7468,14 @@ vm_map_unwire_nested( } /* Start address is not in map. */ vm_map_unlock(map); + vmlp_api_end(VM_MAP_UNWIRE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } if (entry->superpage_size) { /* superpages are always wired */ vm_map_unlock(map); + vmlp_api_end(VM_MAP_UNWIRE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -7594,6 +7775,7 @@ vm_map_unwire_nested( if (need_wakeup) { vm_map_entry_wakeup(map); } + vmlp_api_end(VM_MAP_UNWIRE_NESTED, KERN_SUCCESS); return KERN_SUCCESS; } @@ -7619,9 +7801,11 @@ vm_map_unwire_sanitize( vm_map_offset_t *end, vm_map_size_t *size) { + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + + return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map, - VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, - size); + flags, start, end, size); } kern_return_t @@ -7672,16 +7856,8 @@ vm_map_entry_zap( s = entry->vme_start; e = entry->vme_end; - assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK)); - assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK)); - if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) { - assert(page_aligned(s)); - assert(page_aligned(e)); - } - if (entry->map_aligned == TRUE) { - assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))); - assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map))); - } + assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))); + assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map))); assert(entry->wired_count == 0); assert(entry->user_wired_count == 0); assert(!entry->vme_permanent); @@ -7705,6 +7881,8 @@ vm_map_submap_pmap_clean( vm_map_size_t remove_size; vm_map_entry_t entry; + vmlp_api_start(VM_MAP_SUBMAP_PMAP_CLEAN); + submap_end = offset + (end - start); submap_start = offset; @@ -7728,6 +7906,7 @@ vm_map_submap_pmap_clean( VME_SUBMAP(entry), VME_OFFSET(entry)); } else { + vmlp_range_event_entry(sub_map, entry); if (map->mapped_in_other_pmaps && os_ref_get_count_raw(&map->map_refcnt) != 0 && VME_OBJECT(entry) != NULL) { @@ -7766,6 +7945,7 @@ vm_map_submap_pmap_clean( VME_SUBMAP(entry), VME_OFFSET(entry)); } else { + vmlp_range_event_entry(sub_map, entry); if (map->mapped_in_other_pmaps && os_ref_get_count_raw(&map->map_refcnt) != 0 && VME_OBJECT(entry) != NULL) { @@ -7789,9 +7969,12 @@ vm_map_submap_pmap_clean( entry = entry->vme_next; } vm_map_unlock_read(sub_map); + vmlp_api_end(VM_MAP_SUBMAP_PMAP_CLEAN, 0); return; } + + /* * virt_memory_guard_ast: * @@ -7888,7 +8071,6 @@ vm_map_guard_exception_internal( return false; } - *code = 0; EXC_GUARD_ENCODE_TYPE(*code, guard_type); EXC_GUARD_ENCODE_FLAVOR(*code, reason); @@ -7918,7 +8100,6 @@ vm_map_guard_exception( if (vm_map_guard_exception_internal(address, reason, &code, &subcode)) { task_t task = current_task(); bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL; - thread_guard_violation(current_thread(), code, subcode, fatal); } } @@ -7932,6 +8113,8 @@ vm_map_delete_submap_recurse( { vm_map_entry_t submap_entry; + vmlp_api_start(VM_MAP_DELETE_SUBMAP_RECURSE); + /* * Verify that the submap does not contain any "permanent" entries * within the specified range. We permit TPRO ranges to be overwritten @@ -7941,7 +8124,7 @@ vm_map_delete_submap_recurse( * We do not care about gaps. 
*/ - vm_map_lock(submap); + vm_map_lock_read(submap); if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) { submap_entry = submap_entry->vme_next; @@ -7951,6 +8134,7 @@ vm_map_delete_submap_recurse( submap_entry != vm_map_to_entry(submap) && submap_entry->vme_start < submap_end; submap_entry = submap_entry->vme_next) { + vmlp_range_event_entry(submap, submap_entry); if (submap_entry->vme_permanent #ifdef __arm64e__ /* allow TPRO submap entries to be overwritten */ @@ -7958,12 +8142,14 @@ vm_map_delete_submap_recurse( #endif ) { /* "permanent" entry -> fail */ - vm_map_unlock(submap); + vm_map_unlock_read(submap); + vmlp_api_end(VM_MAP_DELETE_SUBMAP_RECURSE, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } } /* no "permanent" entries in the range -> success */ - vm_map_unlock(submap); + vm_map_unlock_read(submap); + vmlp_api_end(VM_MAP_DELETE_SUBMAP_RECURSE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -8108,6 +8294,9 @@ vm_map_delete( uint32_t size_idx, slot_idx; struct mach_vm_range slot; + vmlp_api_start(VM_MAP_DELETE); + vmlp_range_event(map, start, end - start); + if (vm_map_pmap(map) == kernel_pmap) { state |= VMDS_KERNEL_PMAP; range_id = kmem_addr_get_range(start, end - start); @@ -8137,8 +8326,7 @@ vm_map_delete( interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ? THREAD_ABORTSAFE : THREAD_UNINT; - if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 && - (start & VM_MAP_PAGE_MASK(map))) { + if (start & VM_MAP_PAGE_MASK(map)) { __vm_map_delete_misaligned_panic(map, start, end); } @@ -8212,7 +8400,7 @@ vm_map_delete( if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) { start = SUPERPAGE_ROUND_DOWN(start); } else { - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + SAVE_HINT_MAP_WRITE(map, VME_PREV(entry)); break; } } @@ -8255,7 +8443,7 @@ vm_map_delete( state &= ~VMDS_NEEDS_LOOKUP; if (vm_map_lookup_entry_or_next(map, s, &entry)) { - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + SAVE_HINT_MAP_WRITE(map, VME_PREV(entry)); } if (state & VMDS_KERNEL_KMEMPTR) { @@ -8422,29 +8610,11 @@ vm_map_delete( */ if (entry->vme_start < s) { - if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && - entry->map_aligned && - !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) { - /* - * The entry will no longer be map-aligned - * after clipping and the caller said it's OK. - */ - entry->map_aligned = FALSE; - } vm_map_clip_start(map, entry, s); - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + SAVE_HINT_MAP_WRITE(map, VME_PREV(entry)); } if (end < entry->vme_end) { - if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && - entry->map_aligned && - !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) { - /* - * The entry will no longer be map-aligned - * after clipping and the caller said it's OK. - */ - entry->map_aligned = FALSE; - } vm_map_clip_end(map, entry, end); } @@ -8504,6 +8674,7 @@ in_transition: * since we cannot tell if we were the only one. */ ret.kmr_return = KERN_ABORTED; + vmlp_api_end(VM_MAP_DELETE, KERN_ABORTED); return ret; } @@ -8579,6 +8750,7 @@ in_transition: * only one. */ ret.kmr_return = KERN_ABORTED; + vmlp_api_end(VM_MAP_DELETE, KERN_ABORTED); return ret; } @@ -8864,21 +9036,6 @@ in_transition: /* move "s" forward */ s = entry->vme_end; next = entry->vme_next; - if (!entry->map_aligned) { - vm_map_offset_t rounded_s; - - /* - * Skip artificial gap due to mis-aligned entry - * on devices with a page size smaller than the - * map's page size (i.e. 16k task on a 4k device). 
- */ - rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map)); - if (next == vm_map_to_entry(map)) { - s = rounded_s; - } else if (s < rounded_s) { - s = MIN(rounded_s, next->vme_start); - } - } ret.kmr_size += s - entry->vme_start; if (entry->vme_permanent) { @@ -8921,6 +9078,7 @@ in_transition: next = VM_MAP_ENTRY_NULL; if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) { + vmlp_lock_event_locked(VMLP_EVENT_LOCK_YIELD_BEGIN, map); unsigned int last_timestamp = map->timestamp++; if (lck_rw_lock_yield_exclusive(&map->lock, @@ -8932,6 +9090,7 @@ in_transition: /* we didn't yield, undo our change */ map->timestamp--; } + vmlp_lock_event_locked(VMLP_EVENT_LOCK_YIELD_END, map); } } @@ -8964,6 +9123,7 @@ out: } } + vmlp_api_end(VM_MAP_DELETE, ret.kmr_return); return ret; } @@ -8978,11 +9138,15 @@ vm_map_remove_and_unlock( kmem_return_t ret; VM_MAP_ZAP_DECLARE(zap); + vmlp_api_start(VM_MAP_REMOVE_AND_UNLOCK); + vmlp_range_event(map, start, end - start); + ret = vm_map_delete(map, start, end, flags, guard, &zap); vm_map_unlock(map); vm_map_zap_dispose(&zap); + vmlp_api_end(VM_MAP_REMOVE_AND_UNLOCK, ret.kmr_return); return ret; } @@ -9000,8 +9164,13 @@ vm_map_remove_guard( vmr_flags_t flags, kmem_guard_t guard) { + kmem_return_t ret; + vmlp_api_start(VM_MAP_REMOVE_GUARD); vm_map_lock(map); - return vm_map_remove_and_unlock(map, start, end, flags, guard); + vmlp_range_event(map, start, end - start); + ret = vm_map_remove_and_unlock(map, start, end, flags, guard); + vmlp_api_end(VM_MAP_REMOVE_GUARD, ret.kmr_return); + return ret; } @@ -9041,12 +9210,15 @@ kern_return_t vm_map_terminate( vm_map_t map) { + vmlp_api_start(VM_MAP_TERMINATE); + vmlp_range_event_all(map); vm_map_lock(map); map->terminated = TRUE; map->owning_task = NULL; vm_map_disable_hole_optimization(map); (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset, VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE); + vmlp_api_end(VM_MAP_TERMINATE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -9178,8 +9350,7 @@ vm_map_copy_copy( * The links in the entry chain must be * changed to point to the new copy object. 
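/*
 * The copy-header hunks around here stop assigning entry->vme_prev and
 * copy->cpy_hdr.links.prev directly and route every back-link update through
 * VME_PREV()/VME_PREV_SET() and VMH_PREV()/VMH_PREV_SET().  A minimal sketch
 * of that accessor shape on a toy circular entry list; whether the real
 * macros add pointer hardening or merely wrap the field is not visible in
 * this patch, so treat the trivial definitions below as an assumption.
 */
struct toy_entry {
	struct toy_entry *next;
	struct toy_entry *prev;          /* callers never touch this directly */
};

#define TOY_PREV(e)         ((e)->prev)
#define TOY_PREV_SET(e, p)  ((void)((e)->prev = (p)))

/* Splice `n` in after `pos`; assumes a circular list, as vm_map's lists are. */
static void
toy_insert_after(struct toy_entry *pos, struct toy_entry *n)
{
	n->next = pos->next;
	TOY_PREV_SET(n, pos);
	TOY_PREV_SET(pos->next, n);
	pos->next = n;
}
/*
 * Funnelling every prev-pointer write through one macro means the
 * representation (or any integrity check on it) can change later without
 * revisiting each call site — presumably what this sweep sets up.
 */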
*/ - vm_map_copy_first_entry(copy)->vme_prev - = vm_map_copy_to_entry(new_copy); + VME_PREV_SET(vm_map_copy_first_entry(copy), vm_map_copy_to_entry(new_copy)); vm_map_copy_last_entry(copy)->vme_next = vm_map_copy_to_entry(new_copy); } @@ -9271,7 +9442,7 @@ vm_map_overwrite_submap_recurse( kern_return_t result; boolean_t encountered_sub_map = FALSE; - + vmlp_api_start(VM_MAP_OVERWRITE_SUBMAP_RECURSE); /* * Verify that the destination is all writeable @@ -9287,6 +9458,7 @@ vm_map_overwrite_submap_recurse( start_pass_1: if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -9343,15 +9515,18 @@ start_pass_1: sub_map = VM_MAP_NULL; if (result != KERN_SUCCESS) { + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, result); return result; } if (dst_end <= entry->vme_end) { + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_SUCCESS); return KERN_SUCCESS; } vm_map_lock(dst_map); if (!vm_map_lookup_entry(dst_map, local_end, &tmp_entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } entry = tmp_entry; @@ -9361,14 +9536,18 @@ start_pass_1: if (!(entry->protection & VM_PROT_WRITE)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } if (!vm_map_entry_is_overwritable(dst_map, entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } + vmlp_range_event_entry(dst_map, entry); + /* * If the entry is in transition, we must wait * for it to exit that state. Anything could happen @@ -9389,6 +9568,7 @@ start_pass_1: */ if (dst_end <= entry->vme_end) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_SUCCESS); return KERN_SUCCESS; } /* @@ -9397,6 +9577,7 @@ start_pass_1: if ((next == vm_map_to_entry(dst_map)) || (next->vme_start != entry->vme_end)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -9409,6 +9590,7 @@ start_pass_1: (VME_OBJECT(entry)->true_share))) { if (encountered_sub_map) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_INVALID_ADDRESS); return KERN_FAILURE; } } @@ -9417,6 +9599,7 @@ start_pass_1: entry = next; }/* for */ vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_OVERWRITE_SUBMAP_RECURSE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -9495,6 +9678,9 @@ vm_map_copy_overwrite_nested( vm_map_size_t total_size; uint16_t copy_page_shift; + vmlp_api_start(VM_MAP_COPY_OVERWRITE_NESTED); + + /* * Check for special kernel buffer allocated * by new_ipc_kmsg_copyin. 
@@ -9503,7 +9689,9 @@ vm_map_copy_overwrite_nested( if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { kr = vm_map_copyout_kernel_buffer( dst_map, &dst_addr, - copy, copy->size, TRUE, discard_on_success); + copy, copy->size, TRUE, + discard_on_success); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, kr); return kr; } @@ -9518,6 +9706,7 @@ vm_map_copy_overwrite_nested( if (discard_on_success) { vm_map_copy_discard(copy); } + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_SUCCESS); return KERN_SUCCESS; } @@ -9552,12 +9741,14 @@ vm_map_copy_overwrite_nested( */ if (dst_addr >= dst_map->max_offset) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } start_pass_1: if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } vm_map_clip_start(dst_map, @@ -9612,6 +9803,7 @@ start_pass_1: vm_map_deallocate(sub_map); sub_map = VM_MAP_NULL; if (kr != KERN_SUCCESS) { + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, kr); return kr; } vm_map_lock(dst_map); @@ -9623,6 +9815,7 @@ start_pass_1: if (!vm_map_lookup_entry(dst_map, local_end, &entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } next = entry->vme_next; @@ -9631,14 +9824,18 @@ start_pass_1: if (!(entry->protection & VM_PROT_WRITE)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } if (!vm_map_entry_is_overwritable(dst_map, entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } + vmlp_range_event_entry(dst_map, entry); + /* * If the entry is in transition, we must wait * for it to exit that state. 
Anything could happen @@ -9666,6 +9863,7 @@ start_pass_1: if ((next == vm_map_to_entry(dst_map)) || (next->vme_start != entry->vme_end)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -9691,6 +9889,7 @@ start_overwrite: if (interruptible && contains_permanent_objects) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_FAILURE); return KERN_FAILURE; /* XXX */ } @@ -9711,6 +9910,7 @@ start_overwrite: /* unlocked */ if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } else { @@ -9751,6 +9951,7 @@ start_overwrite: if (!vm_map_lookup_entry(dst_map, base_addr, &tmp_entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } copy_size = 0; @@ -9854,9 +10055,8 @@ start_overwrite: next_copy = copy_entry->vme_next; copy_entry->vme_next = vm_map_copy_to_entry(copy); - previous_prev = - copy->cpy_hdr.links.prev; - copy->cpy_hdr.links.prev = copy_entry; + previous_prev = VMH_PREV(©->cpy_hdr); + VMH_PREV_SET(©->cpy_hdr, copy_entry); copy->size = copy_size; remaining_entries = copy->cpy_hdr.nentries; @@ -9904,15 +10104,16 @@ start_overwrite: if (next_copy != NULL) { copy->cpy_hdr.nentries += remaining_entries; - copy->cpy_hdr.links.prev->vme_next = + VMH_PREV(©->cpy_hdr)->vme_next = next_copy; - copy->cpy_hdr.links.prev - = previous_prev; + VMH_PREV_SET(©->cpy_hdr, previous_prev); copy->size = total_size; } + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, kr); return kr; } if (dst_end <= local_end) { + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_SUCCESS); return KERN_SUCCESS; } /* otherwise copy no longer exists, it was */ @@ -9927,10 +10128,9 @@ start_overwrite: if (next_copy != NULL) { copy->cpy_hdr.nentries = remaining_entries; copy->cpy_hdr.links.next = next_copy; - copy->cpy_hdr.links.prev = previous_prev; + VMH_PREV_SET(©->cpy_hdr, previous_prev); copy->size = total_size; - next_copy->vme_prev = - vm_map_copy_to_entry(copy); + VME_PREV_SET(next_copy, vm_map_copy_to_entry(copy)); next_copy = NULL; } base_addr = local_end; @@ -9938,6 +10138,7 @@ start_overwrite: if (!vm_map_lookup_entry(dst_map, local_end, &tmp_entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } entry = tmp_entry; @@ -9953,6 +10154,7 @@ start_overwrite: if ((next == vm_map_to_entry(dst_map)) || (next->vme_start != entry->vme_end)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -9987,9 +10189,8 @@ start_overwrite: next_copy = copy_entry->vme_next; copy_entry->vme_next = vm_map_copy_to_entry(copy); - previous_prev = - copy->cpy_hdr.links.prev; - copy->cpy_hdr.links.prev = copy_entry; + previous_prev = VMH_PREV(©->cpy_hdr); + VMH_PREV_SET(©->cpy_hdr, copy_entry); copy->size = copy_size; remaining_entries = copy->cpy_hdr.nentries; @@ -10020,12 +10221,12 @@ start_overwrite: if (next_copy != NULL) { copy->cpy_hdr.nentries += remaining_entries; - copy->cpy_hdr.links.prev->vme_next = + VMH_PREV(©->cpy_hdr)->vme_next = next_copy; - copy->cpy_hdr.links.prev = - previous_prev; + VMH_PREV_SET(©->cpy_hdr, previous_prev); copy->size += copy_size; } + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, kr); return kr; } vm_map_unlock(dst_map); @@ -10051,12 +10252,12 @@ start_overwrite: if 
(next_copy != NULL) { copy->cpy_hdr.nentries += remaining_entries; - copy->cpy_hdr.links.prev->vme_next = + VMH_PREV(©->cpy_hdr)->vme_next = next_copy; - copy->cpy_hdr.links.prev = - previous_prev; + VMH_PREV_SET(©->cpy_hdr, previous_prev); copy->size += copy_size; } + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, kr); return kr; } } @@ -10070,8 +10271,8 @@ start_overwrite: if (next_copy != NULL) { copy->cpy_hdr.nentries = remaining_entries; copy->cpy_hdr.links.next = next_copy; - copy->cpy_hdr.links.prev = previous_prev; - next_copy->vme_prev = vm_map_copy_to_entry(copy); + VMH_PREV_SET(©->cpy_hdr, previous_prev); + VME_PREV_SET(next_copy, vm_map_copy_to_entry(copy)); copy->size = total_size; } vm_map_lock(dst_map); @@ -10079,6 +10280,7 @@ start_overwrite: if (!vm_map_lookup_entry(dst_map, base_addr, &tmp_entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } if (tmp_entry->in_transition) { @@ -10103,6 +10305,7 @@ start_overwrite: vm_map_copy_discard(copy); } + vmlp_api_end(VM_MAP_COPY_OVERWRITE_NESTED, KERN_SUCCESS); return KERN_SUCCESS; }/* vm_map_copy_overwrite */ @@ -10145,6 +10348,8 @@ vm_map_copy_overwrite( vm_map_offset_t effective_page_mask, effective_page_size; uint16_t copy_page_shift; + vmlp_api_start(VM_MAP_COPY_OVERWRITE); + head_size = 0; tail_size = 0; head_copy = NULL; @@ -10156,6 +10361,7 @@ vm_map_copy_overwrite( * Check for null copy object. */ if (copy == VM_MAP_COPY_NULL) { + vmlp_api_end(VM_MAP_COPY_OVERWRITE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -10171,9 +10377,13 @@ vm_map_copy_overwrite( &dst_end, ©_size); if (__improbable(kr != KERN_SUCCESS)) { - return vm_sanitize_get_kr(kr); + kern_return_t ret = vm_sanitize_get_kr(kr); + vmlp_api_end(VM_MAP_COPY_OVERWRITE, ret); + return ret; } + vmlp_range_event(dst_map, dst_addr, copy_size); + /* * Assert that the vm_map_copy is coming from the right * zone and hasn't been forged @@ -10196,6 +10406,7 @@ blunt_copy: if (kr) { ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */); } + vmlp_api_end(VM_MAP_COPY_OVERWRITE, kr); return kr; } @@ -10441,6 +10652,7 @@ done: tail_copy = NULL; } } + vmlp_api_end(VM_MAP_COPY_OVERWRITE, kr); return kr; } @@ -10494,6 +10706,7 @@ vm_map_copy_overwrite_unaligned( amount_left; kern_return_t kr = KERN_SUCCESS; + vmlp_api_start(VM_MAP_COPY_OVERWRITE_UNALIGNED); copy_entry = vm_map_copy_first_entry(copy); @@ -10508,6 +10721,7 @@ vm_map_copy_overwrite_unaligned( while (amount_left > 0) { if (entry == vm_map_to_entry(dst_map)) { vm_map_unlock_read(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -10519,6 +10733,7 @@ vm_map_copy_overwrite_unaligned( */ if (!(entry->protection & VM_PROT_WRITE)) { vm_map_unlock_read(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } if (entry->is_sub_map) { @@ -10528,6 +10743,7 @@ vm_map_copy_overwrite_unaligned( } if (!vm_map_entry_is_overwritable(dst_map, entry)) { vm_map_unlock_read(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } /* @@ -10545,6 +10761,8 @@ vm_map_copy_overwrite_unaligned( goto RetryLookup; } + vmlp_range_event_entry(dst_map, entry); + dst_offset = start - entry->vme_start; dst_size = entry->vme_end - start; @@ -10597,7 +10815,8 @@ 
vm_map_copy_overwrite_unaligned( goto RetryLookup; } dst_object = vm_object_allocate((vm_map_size_t) - entry->vme_end - entry->vme_start); + entry->vme_end - entry->vme_start, + dst_map->serial_id); VME_OBJECT_SET(entry, dst_object, false, 0); VME_OFFSET_SET(entry, 0); assert(entry->use_pmap); @@ -10636,6 +10855,7 @@ vm_map_copy_overwrite_unaligned( * If a hard error occurred, return it now */ if (kr != KERN_SUCCESS) { + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, kr); return kr; } @@ -10658,6 +10878,7 @@ vm_map_copy_overwrite_unaligned( /* * not finished copying but run out of source */ + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -10667,6 +10888,7 @@ vm_map_copy_overwrite_unaligned( } if (amount_left == 0) { + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, KERN_SUCCESS); return KERN_SUCCESS; } @@ -10685,6 +10907,7 @@ vm_map_copy_overwrite_unaligned( */ if (start != entry->vme_start) { vm_map_unlock_read(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } @@ -10697,11 +10920,13 @@ vm_map_copy_overwrite_unaligned( RetryLookup: if (!vm_map_lookup_entry(dst_map, start, &entry)) { vm_map_unlock_read(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } }/* while */ + vmlp_api_end(VM_MAP_COPY_OVERWRITE_UNALIGNED, KERN_SUCCESS); return KERN_SUCCESS; }/* vm_map_copy_overwrite_unaligned */ @@ -10746,6 +10971,8 @@ vm_map_copy_overwrite_aligned( vm_map_size_t size; vm_map_entry_t entry; + vmlp_api_start(VM_MAP_COPY_OVERWRITE_ALIGNED); + while ((copy_entry = vm_map_copy_first_entry(copy)) != vm_map_copy_to_entry(copy)) { copy_size = (copy_entry->vme_end - copy_entry->vme_start); @@ -10758,6 +10985,7 @@ vm_map_copy_overwrite_aligned( } if (entry == vm_map_to_entry(dst_map)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_ALIGNED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } size = (entry->vme_end - entry->vme_start); @@ -10771,6 +10999,7 @@ vm_map_copy_overwrite_aligned( if ((entry->vme_start != start) || ((entry->is_sub_map) && !entry->needs_copy)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_ALIGNED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } assert(entry != vm_map_to_entry(dst_map)); @@ -10781,6 +11010,7 @@ vm_map_copy_overwrite_aligned( if (!(entry->protection & VM_PROT_WRITE)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_ALIGNED, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } @@ -10792,6 +11022,7 @@ vm_map_copy_overwrite_aligned( if (!vm_map_entry_is_overwritable(dst_map, entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_ALIGNED, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } @@ -10815,12 +11046,6 @@ vm_map_copy_overwrite_aligned( */ if (copy_size < size) { - if (entry->map_aligned && - !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size, - VM_MAP_PAGE_MASK(dst_map))) { - /* no longer map-aligned */ - entry->map_aligned = FALSE; - } vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size); size = copy_size; } @@ -10839,6 +11064,8 @@ vm_map_copy_overwrite_aligned( assert((tmp_entry->vme_end - tmp_entry->vme_start) == size); assert((copy_entry->vme_end - copy_entry->vme_start) == size); + vmlp_range_event_entry(dst_map, tmp_entry); + /* * If the destination contains temporary unshared memory, * we can perform the copy by throwing it away and @@ -11100,7 +11327,9 @@ slow_copy: * allocate a new VM 
object for this map entry. */ dst_object = vm_object_allocate( - entry->vme_end - entry->vme_start); + entry->vme_end - entry->vme_start, + dst_map->serial_id + ); dst_offset = 0; VME_OBJECT_SET(entry, dst_object, false, 0); VME_OFFSET_SET(entry, dst_offset); @@ -11140,6 +11369,7 @@ slow_copy: */ if (r != KERN_SUCCESS) { + vmlp_api_end(VM_MAP_COPY_OVERWRITE_ALIGNED, r); return r; } @@ -11167,14 +11397,6 @@ slow_copy: if (version.main_timestamp == dst_map->timestamp && copy_size != 0) { /* We can safely use saved tmp_entry value */ - - if (tmp_entry->map_aligned && - !VM_MAP_PAGE_ALIGNED( - start, - VM_MAP_PAGE_MASK(dst_map))) { - /* no longer map-aligned */ - tmp_entry->map_aligned = FALSE; - } vm_map_clip_end(dst_map, tmp_entry, start); tmp_entry = tmp_entry->vme_next; } else { @@ -11183,20 +11405,15 @@ slow_copy: RetryLookup: if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) { vm_map_unlock(dst_map); + vmlp_api_end(VM_MAP_COPY_OVERWRITE_ALIGNED, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } - if (tmp_entry->map_aligned && - !VM_MAP_PAGE_ALIGNED( - start, - VM_MAP_PAGE_MASK(dst_map))) { - /* no longer map-aligned */ - tmp_entry->map_aligned = FALSE; - } vm_map_clip_start(dst_map, tmp_entry, start); } } }/* while */ + vmlp_api_end(VM_MAP_COPY_OVERWRITE_ALIGNED, KERN_SUCCESS); return KERN_SUCCESS; }/* vm_map_copy_overwrite_aligned */ @@ -11215,6 +11432,7 @@ vm_map_copyin_kernel_buffer( vm_map_t src_map, vm_map_offset_t src_addr, vm_map_size_t len, + vm_map_copyin_strategy strategy, boolean_t src_destroy, vm_map_copy_t *copy_result) { @@ -11223,6 +11441,7 @@ vm_map_copyin_kernel_buffer( void *kdata; if (len > msg_ool_size_small) { +#pragma unused(strategy) return KERN_INVALID_ARGUMENT; } @@ -11291,7 +11510,11 @@ vm_map_copyout_kernel_buffer( /* * check for corrupted vm_map_copy structure */ - if (copy_size > msg_ool_size_small || copy->offset) { + bool is_corrupt_vm_map_copy_detected = copy->offset != 0; + if (copy_size > msg_ool_size_small) { + is_corrupt_vm_map_copy_detected = true; + } + if (is_corrupt_vm_map_copy_detected) { panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld", (long long)copy->size, (long long)copy->offset); } @@ -11303,7 +11526,8 @@ vm_map_copyout_kernel_buffer( vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE(); if (map == kernel_map) { - vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA; + vmk_flags.vmkf_range_id = kmem_needs_data_share_range() ? + KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA; } *addr = 0; @@ -11414,6 +11638,7 @@ vm_map_copy_insert( vm_map_copy_entry_unlink(copy, entry); vm_map_store_entry_link(map, after_where, entry, VM_MAP_KERNEL_FLAGS_NONE); + vmlp_range_event_entry(map, entry); after_where = entry; } zfree_id(ZONE_ID_VM_MAP_COPY, copy); @@ -11461,6 +11686,7 @@ vm_map_copy_remap( /* insert the new entry in the map */ vm_map_store_entry_link(map, where, new_entry, VM_MAP_KERNEL_FLAGS_NONE); + vmlp_range_event_entry(map, new_entry); /* continue inserting the "copy entries" after the new entry */ where = new_entry; } @@ -11535,12 +11761,15 @@ vm_map_copyout_internal( kern_return_t kr; vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE(); + vmlp_api_start(VM_MAP_COPYOUT_INTERNAL); + /* * Check for null copy object. 
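/*
 * Both vm_object_allocate() calls above now pass dst_map->serial_id as a
 * second argument, so a freshly created object carries an identifier taken
 * from the map it was allocated for.  What the VM layer does with that serial
 * is not shown in these hunks; the sketch below only illustrates the shape of
 * threading a creator tag through an allocator, with toy names throughout.
 */
#include <stdint.h>
#include <stdlib.h>

struct toy_object {
	uint64_t size;
	uint64_t creator_serial;   /* which map asked for this object */
};

static struct toy_object *
toy_object_allocate(uint64_t size, uint64_t creator_serial)
{
	struct toy_object *o = calloc(1, sizeof(*o));

	if (o != NULL) {
		o->size = size;
		o->creator_serial = creator_serial;
	}
	return o;
}

/* Call sites mirror the patch: the extent first, then the owning map's tag,
 * e.g. toy_object_allocate(vme_end - vme_start, map_serial). */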
*/ if (copy == VM_MAP_COPY_NULL) { *dst_addr = 0; + vmlp_api_end(VM_MAP_COPYOUT_INTERNAL, KERN_SUCCESS); return KERN_SUCCESS; } @@ -11557,6 +11786,7 @@ vm_map_copyout_internal( KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */); + vmlp_api_end(VM_MAP_COPYOUT_INTERNAL, KERN_FAILURE); return KERN_FAILURE; } copy_size = copy->size; @@ -11576,6 +11806,7 @@ vm_map_copyout_internal( KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */); } + vmlp_api_end(VM_MAP_COPYOUT_INTERNAL, kr); return kr; } @@ -11600,6 +11831,7 @@ vm_map_copyout_internal( if (kr != KERN_SUCCESS) { DEBUG4K_COPY("adjust failed 0x%x\n", kr); ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */); + vmlp_api_end(VM_MAP_COPYOUT_INTERNAL, kr); return kr; } DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start); @@ -11627,6 +11859,7 @@ vm_map_copyout_internal( if (kr != KERN_SUCCESS) { vm_map_unlock(dst_map); ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */); + vmlp_api_end(VM_MAP_COPYOUT_INTERNAL, kr); return kr; } @@ -11700,25 +11933,13 @@ vm_map_copyout_internal( for (entry = vm_map_copy_first_entry(copy); entry != vm_map_copy_to_entry(copy); entry = entry->vme_next) { - if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) { - /* - * We're injecting this copy entry into a map that - * has the standard page alignment, so clear - * "map_aligned" (which might have been inherited - * from the original map entry). - */ - entry->map_aligned = FALSE; - } - entry->vme_start += adjustment; entry->vme_end += adjustment; - if (entry->map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, - VM_MAP_PAGE_MASK(dst_map))); - assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, - VM_MAP_PAGE_MASK(dst_map))); - } + assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, + VM_MAP_PAGE_MASK(dst_map))); + assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, + VM_MAP_PAGE_MASK(dst_map))); entry->inheritance = VM_INHERIT_DEFAULT; entry->protection = VM_PROT_DEFAULT; @@ -11808,6 +12029,7 @@ vm_map_copyout_internal( fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG; } + bool page_sleep_needed = false; vm_fault_enter(m, dst_map->pmap, va, @@ -11819,9 +12041,11 @@ vm_map_copyout_internal( &fault_info, NULL, /* need_retry */ &type_of_fault, - &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/ + &object_lock_type, /*Exclusive mode lock. 
Will remain unchanged.*/ + &page_sleep_needed); vm_object_unlock(object); + assert(!page_sleep_needed); offset += PAGE_SIZE_64; va += PAGE_SIZE; @@ -11880,6 +12104,7 @@ after_adjustments: * XXX If wiring_required, call vm_map_pageable */ + vmlp_api_end(VM_MAP_COPYOUT_INTERNAL, KERN_SUCCESS); return KERN_SUCCESS; } @@ -12041,9 +12266,50 @@ vm_map_copyin_sanitize( *src_start = vm_map_trunc_page(*src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)); *src_end = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map)); + + return KERN_SUCCESS; } + +static vm_map_copyin_strategy +_vm_map_copyin_select_strategy( + vm_map_t src_map, + vm_map_size_t len, + vm_map_offset_t src_start, + vm_map_offset_t src_end, + boolean_t use_maxprot, + boolean_t preserve_purgeable, + int flags + ) +{ + /* + * If the copy is sufficiently small, use a kernel buffer instead + * of making a virtual copy. The theory being that the cost of + * setting up VM (and taking C-O-W faults) dominates the copy costs + * for small regions. + */ + if ((len <= msg_ool_size_small) && + !use_maxprot && + !preserve_purgeable && + !(flags & VM_MAP_COPYIN_ENTRY_LIST) && + /* + * Since the "msg_ool_size_small" threshold was increased and + * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the + * address space limits, we revert to doing a virtual copy if the + * copied range goes beyond those limits. Otherwise, mach_vm_read() + * of the commpage would now fail when it used to work. + */ + (src_start >= vm_map_min(src_map) && + src_start < vm_map_max(src_map) && + src_end >= vm_map_min(src_map) && + src_end < vm_map_max(src_map))) { + return VM_MAP_COPYIN_STRATEGY_KERNEL_BUFFER; + } + + return VM_MAP_COPYIN_STRATEGY_VIRTUAL_COPY; +} + kern_return_t vm_map_copyin_internal( vm_map_t src_map, @@ -12081,7 +12347,10 @@ vm_map_copyin_internal( vm_map_entry_t saved_src_entry; kern_return_t kr; + vmlp_api_start(VM_MAP_COPYIN_INTERNAL); + if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) { + vmlp_api_end(VM_MAP_COPYIN_INTERNAL, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -12090,6 +12359,7 @@ vm_map_copyin_internal( */ if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) { *copy_result = VM_MAP_COPY_NULL; + vmlp_api_end(VM_MAP_COPYIN_INTERNAL, KERN_SUCCESS); return KERN_SUCCESS; } @@ -12105,40 +12375,38 @@ vm_map_copyin_internal( &len, &src_addr_unaligned); if (__improbable(kr != KERN_SUCCESS)) { - return vm_sanitize_get_kr(kr); + kr = vm_sanitize_get_kr(kr); + vmlp_api_end(VM_MAP_COPYIN_INTERNAL, kr); + return kr; } - + vmlp_range_event(src_map, src_start, len); src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE; use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE; preserve_purgeable = (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE; - /* - * If the copy is sufficiently small, use a kernel buffer instead - * of making a virtual copy. The theory being that the cost of - * setting up VM (and taking C-O-W faults) dominates the copy costs - * for small regions. - */ - if ((len <= msg_ool_size_small) && - !use_maxprot && - !preserve_purgeable && - !(flags & VM_MAP_COPYIN_ENTRY_LIST) && - /* - * Since the "msg_ool_size_small" threshold was increased and - * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the - * address space limits, we revert to doing a virtual copy if the - * copied range goes beyond those limits. Otherwise, mach_vm_read() - * of the commpage would now fail when it used to work. 
- */ - (src_start >= vm_map_min(src_map) && - src_start < vm_map_max(src_map) && - src_end >= vm_map_min(src_map) && - src_end < vm_map_max(src_map))) { - return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len, - src_destroy, copy_result); + vm_map_copyin_strategy strategy = _vm_map_copyin_select_strategy( + src_map, + len, + src_start, + src_end, + use_maxprot, + preserve_purgeable, + flags); + if (strategy == VM_MAP_COPYIN_STRATEGY_INVALID_ARGUMENT) { + return KERN_INVALID_ADDRESS; + } else if ( + strategy == VM_MAP_COPYIN_STRATEGY_KERNEL_BUFFER) { + kr = vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len, strategy, + src_destroy, copy_result); + vmlp_api_end(VM_MAP_COPYIN_INTERNAL, kr); + return kr; } + /* Ensure we don't forget about a newly defined type */ + assert(strategy == VM_MAP_COPYIN_STRATEGY_VIRTUAL_COPY); + /* * Allocate a header element for the list. * @@ -12156,7 +12424,11 @@ vm_map_copyin_internal( #define RETURN(x) \ MACRO_BEGIN \ - vm_map_unlock(src_map); \ + if (vm_map_is_sealed(src_map)) { \ + vm_map_unlock_read(src_map); \ + } else { \ + vm_map_unlock(src_map); \ + } \ if(src_map != base_map) \ vm_map_deallocate(src_map); \ if (new_entry != VM_MAP_ENTRY_NULL) \ @@ -12172,6 +12444,7 @@ vm_map_copyin_internal( kfree_type(submap_map_t, _ptr); \ } \ } \ + vmlp_api_end(VM_MAP_COPYIN_INTERNAL, x); \ MACRO_RETURN(x); \ MACRO_END @@ -12258,7 +12531,11 @@ vm_map_copyin_internal( src_start += VME_OFFSET(tmp_entry); src_end = src_start + submap_len; src_map = VME_SUBMAP(tmp_entry); - vm_map_lock(src_map); + if (vm_map_is_sealed(src_map)) { + vm_map_lock_read(src_map); + } else { + vm_map_lock(src_map); + } /* keep an outstanding reference for all maps in */ /* the parents tree except the base map */ vm_map_reference(src_map); @@ -12268,7 +12545,10 @@ vm_map_copyin_internal( RETURN(KERN_INVALID_ADDRESS); } map_share = TRUE; - if (!tmp_entry->is_sub_map) { + if (vm_map_is_sealed(src_map)) { + assert(!tmp_entry->is_sub_map); + /* no clipping (to "src_start") in sealed map */ + } else if (!tmp_entry->is_sub_map) { vm_map_clip_start(src_map, tmp_entry, src_start); } src_entry = tmp_entry; @@ -12294,12 +12574,22 @@ vm_map_copyin_internal( */ if (new_entry == VM_MAP_ENTRY_NULL) { version.main_timestamp = src_map->timestamp; - vm_map_unlock(src_map); + if (vm_map_is_sealed(src_map)) { + version.main_timestamp--; /* no increment expected */ + vm_map_unlock_read(src_map); + } else { + vm_map_unlock(src_map); + } new_entry = vm_map_copy_entry_create(copy); - vm_map_lock(src_map); + if (vm_map_is_sealed(src_map)) { + vm_map_lock_read(src_map); + } else { + vm_map_lock(src_map); + } if ((version.main_timestamp + 1) != src_map->timestamp) { + assert(!vm_map_is_sealed(src_map)); if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) { RETURN(KERN_INVALID_ADDRESS); @@ -12323,18 +12613,41 @@ vm_map_copyin_internal( src_object = VME_OBJECT(src_entry); + src_offset = VME_OFFSET(src_entry); + src_object = VME_OBJECT(src_entry); + was_wired = (src_entry->wired_count != 0); + /* * Clip against the endpoints of the entire region. 
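/*
 * vm_map_copyin_internal() above now delegates the "kernel buffer or real
 * virtual copy?" decision to _vm_map_copyin_select_strategy(), which keeps
 * the old inline policy: small requests (<= msg_ool_size_small) with none of
 * the max-prot / preserve-purgeable / entry-list flags, and a source range
 * entirely inside the map's min/max, go through a kernel buffer; everything
 * else gets a virtual copy.  A compact stand-alone sketch of the same
 * decision — the names and the 4 KB threshold are stand-ins, not the kernel's.
 */
#include <stdbool.h>
#include <stdint.h>

enum toy_copy_strategy {
	TOY_COPY_KERNEL_BUFFER,    /* bounce through a temporary buffer */
	TOY_COPY_VIRTUAL,          /* share / copy-on-write the source pages */
};

#define TOY_SMALL_COPY_LIMIT 4096u   /* stand-in for msg_ool_size_small */

static enum toy_copy_strategy
toy_select_copy_strategy(uint64_t len, uint64_t start, uint64_t end,
    uint64_t map_min, uint64_t map_max, bool wants_entry_list)
{
	bool in_bounds = start >= map_min && start < map_max &&
	    end >= map_min && end < map_max;

	if (len <= TOY_SMALL_COPY_LIMIT && !wants_entry_list && in_bounds) {
		return TOY_COPY_KERNEL_BUFFER;
	}
	return TOY_COPY_VIRTUAL;
}
/*
 * Keeping the policy in one function lets the caller handle the enum
 * exhaustively — the patch adds an assert on the VIRTUAL_COPY arm for exactly
 * that "don't forget a newly defined strategy" reason.
 */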
*/ - - vm_map_clip_end(src_map, src_entry, src_end); - - src_size = src_entry->vme_end - src_start; - src_offset = VME_OFFSET(src_entry); - was_wired = (src_entry->wired_count != 0); + if (vm_map_is_sealed(src_map)) { + /* no clipping in a sealed map: adjust manually */ + src_size = src_entry->vme_end - src_entry->vme_start; + if (src_start > src_entry->vme_start) { + assert(src_size > src_start - src_entry->vme_start); + src_size -= src_start - src_entry->vme_start; + src_offset += src_start - src_entry->vme_start; + } + if (src_end < src_entry->vme_end) { + assert(src_size > src_entry->vme_end - src_end); + src_size -= src_entry->vme_end - src_end; + } + } else { + vm_map_clip_end(src_map, src_entry, src_end); + src_size = src_entry->vme_end - src_start; + } vm_map_entry_copy(src_map, new_entry, src_entry); + + if (vm_map_is_sealed(src_map)) { + /* we did not clip src_entry: "clip" new_entry */ + new_entry->vme_start = src_start; + new_entry->vme_end = src_start + src_size; + VME_OFFSET_SET(new_entry, src_offset); + } + if (new_entry->is_sub_map) { + assert(!vm_map_is_sealed(src_map)); /* clr address space specifics */ new_entry->use_pmap = FALSE; } else { @@ -12390,6 +12703,8 @@ RestartCopy: if (src_needs_copy && !tmp_entry->needs_copy) { vm_prot_t prot; + assert(!vm_map_is_sealed(src_map)); + prot = src_entry->protection & ~VM_PROT_WRITE; if (override_nx(src_map, VME_ALIAS(src_entry)) @@ -12437,7 +12752,12 @@ RestartCopy: */ version.main_timestamp = src_map->timestamp; - vm_map_unlock(src_map); /* Increments timestamp once! */ + if (vm_map_is_sealed(src_map)) { + version.main_timestamp--; /* no expecting an increment */ + vm_map_unlock_read(src_map); + } else { + vm_map_unlock(src_map); /* Increments timestamp once! */ + } saved_src_entry = src_entry; tmp_entry = VM_MAP_ENTRY_NULL; src_entry = VM_MAP_ENTRY_NULL; @@ -12470,6 +12790,7 @@ CopySlowly: (entry_was_shared || map_share)) { vm_object_t new_object; + assert(!vm_map_is_sealed(src_map)); vm_object_lock_shared(src_object); new_object = vm_object_copy_delayed( src_object, @@ -12567,7 +12888,11 @@ CopySlowly: if (result != KERN_SUCCESS && result != KERN_MEMORY_RESTART_COPY) { - vm_map_lock(src_map); + if (vm_map_is_sealed(src_map)) { + vm_map_lock_read(src_map); + } else { + vm_map_lock(src_map); + } RETURN(result); } @@ -12576,7 +12901,11 @@ CopySlowly: * changed while the copy was being made. */ - vm_map_lock(src_map); + if (vm_map_is_sealed(src_map)) { + vm_map_lock_read(src_map); + } else { + vm_map_lock(src_map); + } if ((version.main_timestamp + 1) == src_map->timestamp) { /* src_map hasn't changed: src_entry is still valid */ @@ -12584,6 +12913,8 @@ CopySlowly: goto VerificationSuccessful; } + assert(!vm_map_is_sealed(src_map)); + /* * Simple version comparison failed. 
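/*
 * When the source map is sealed, the copyin path above holds the map lock for
 * read only and never clips src_entry; it computes the overlap of
 * [src_start, src_end) with the entry by hand and applies the result to the
 * private new_entry instead.  The adjustment, pulled out into a stand-alone
 * sketch (toy structs; "sealed" is taken here to mean only "read-only, never
 * clipped" — any further semantics are an assumption):
 */
#include <assert.h>
#include <stdint.h>

struct toy_slice {
	uint64_t start, end;   /* entry bounds within the source map */
	uint64_t offset;       /* offset into the backing object */
};

/* Return the part of `ent` covered by [want_start, want_end) without
 * modifying `ent`, the way the sealed-map path "clips" only new_entry. */
static struct toy_slice
toy_slice_without_clipping(const struct toy_slice *ent,
    uint64_t want_start, uint64_t want_end)
{
	struct toy_slice out = *ent;

	if (want_start > ent->start) {
		out.offset += want_start - ent->start;
		out.start = want_start;
	}
	if (want_end < ent->end) {
		out.end = want_end;
	}
	assert(out.start < out.end);
	return out;
}
/*
 * Because nothing in the sealed map was modified, the later "fix up the
 * damage" vm_map_simplify_range() pass is skipped and the lock is dropped
 * with vm_map_unlock_read(), as the CopySuccessful hunks further down show.
 */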
* @@ -12705,12 +13036,17 @@ CopySuccessful: ; assert(ptr != NULL); parent_maps = parent_maps->next; - /* fix up the damage we did in that submap */ - vm_map_simplify_range(src_map, - src_base, - src_end); + if (vm_map_is_sealed(src_map)) { + /* no clipping -> no damage */ + vm_map_unlock_read(src_map); + } else { + /* fix up the damage we did in that submap */ + vm_map_simplify_range(src_map, + src_base, + src_end); + vm_map_unlock(src_map); + } - vm_map_unlock(src_map); vm_map_deallocate(src_map); vm_map_lock(ptr->parent_map); src_map = ptr->parent_map; @@ -12727,7 +13063,7 @@ CopySuccessful: ; if (parent_maps == NULL) { map_share = FALSE; } - src_entry = tmp_entry->vme_prev; + src_entry = VME_PREV(tmp_entry); } if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) && @@ -12771,6 +13107,7 @@ CopySuccessful: ; if (src_destroy) { vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS; + assert(!vm_map_is_sealed(src_map)); if (src_map == kernel_map) { remove_flags |= VM_MAP_REMOVE_KUNWIRE; } @@ -12779,6 +13116,9 @@ CopySuccessful: ; src_end, remove_flags, KMEM_GUARD_NONE); + } else if (vm_map_is_sealed(src_map)) { + /* no clipping -> no damage */ + vm_map_unlock_read(src_map); } else { /* fix up the damage we did in the base map */ vm_map_simplify_range( @@ -12906,12 +13246,6 @@ CopySuccessful: ; copy_addr, MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK))); - /* - * The copy_entries will be injected directly into the - * destination map and might not be "map aligned" there... - */ - tmp_entry->map_aligned = FALSE; - tmp_entry->vme_end = copy_addr + (tmp_entry->vme_end - tmp_entry->vme_start); tmp_entry->vme_start = copy_addr; @@ -12943,6 +13277,7 @@ CopySuccessful: ; } *copy_result = copy; + vmlp_api_end(VM_MAP_COPYIN_INTERNAL, KERN_SUCCESS); return KERN_SUCCESS; #undef RETURN @@ -13056,7 +13391,6 @@ vm_map_fork_share( if (old_entry->is_sub_map) { assert(old_entry->wired_count == 0); #ifndef NO_NESTED_PMAP -#if !PMAP_FORK_NEST if (old_entry->use_pmap) { kern_return_t result; @@ -13068,11 +13402,10 @@ vm_map_fork_share( panic("vm_map_fork_share: pmap_nest failed!"); } } -#endif /* !PMAP_FORK_NEST */ #endif /* NO_NESTED_PMAP */ } else if (object == VM_OBJECT_NULL) { object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end - - old_entry->vme_start)); + old_entry->vme_start), old_map->serial_id); VME_OFFSET_SET(old_entry, 0); VME_OBJECT_SET(old_entry, object, false, 0); old_entry->use_pmap = TRUE; @@ -13325,6 +13658,9 @@ vm_map_fork_copy( vm_map_copy_t copy; vm_map_entry_t last = vm_map_last_entry(new_map); + vmlp_api_start(VM_MAP_FORK_COPY); + vmlp_range_event_entry(old_map, old_entry); /* new_map is covered by call to vm_map_copy_insert */ + vm_map_unlock(old_map); /* * Use maxprot version of copyin because we @@ -13356,6 +13692,7 @@ vm_map_fork_copy( * PROTECTION_FAILURE are handled above. 
*/ + vmlp_api_end(VM_MAP_FORK_COPY, FALSE); return FALSE; } @@ -13392,65 +13729,10 @@ vm_map_fork_copy( } *old_entry_p = last; + vmlp_api_end(VM_MAP_FORK_COPY, TRUE); return TRUE; } -#if PMAP_FORK_NEST -#define PMAP_FORK_NEST_DEBUG 0 -static inline void -vm_map_fork_unnest( - pmap_t new_pmap, - vm_map_offset_t pre_nested_start, - vm_map_offset_t pre_nested_end, - vm_map_offset_t start, - vm_map_offset_t end) -{ - kern_return_t kr; - vm_map_offset_t nesting_mask, start_unnest, end_unnest; - - assertf(pre_nested_start <= pre_nested_end, - "pre_nested start 0x%llx end 0x%llx", - (uint64_t)pre_nested_start, (uint64_t)pre_nested_end); - assertf(start <= end, - "start 0x%llx end 0x%llx", - (uint64_t) start, (uint64_t)end); - - if (pre_nested_start == pre_nested_end) { - /* nothing was pre-nested: done */ - return; - } - if (end <= pre_nested_start) { - /* fully before pre-nested range: done */ - return; - } - if (start >= pre_nested_end) { - /* fully after pre-nested range: done */ - return; - } - /* ignore parts of range outside of pre_nested range */ - if (start < pre_nested_start) { - start = pre_nested_start; - } - if (end > pre_nested_end) { - end = pre_nested_end; - } - nesting_mask = pmap_shared_region_size_min(new_pmap) - 1; - start_unnest = start & ~nesting_mask; - end_unnest = (end + nesting_mask) & ~nesting_mask; - kr = pmap_unnest(new_pmap, - (addr64_t)start_unnest, - (uint64_t)(end_unnest - start_unnest)); -#if PMAP_FORK_NEST_DEBUG - printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr); -#endif /* PMAP_FORK_NEST_DEBUG */ - assertf(kr == KERN_SUCCESS, - "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x", - (uint64_t)start, (uint64_t)end, new_pmap, - (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest), - kr); -} -#endif /* PMAP_FORK_NEST */ - void vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map) { @@ -13488,11 +13770,14 @@ vm_map_fork( int map_create_options; kern_return_t footprint_collect_kr; + vmlp_api_start(VM_MAP_FORK); + if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE | VM_MAP_FORK_PRESERVE_PURGEABLE | VM_MAP_FORK_CORPSE_FOOTPRINT | VM_MAP_FORK_SHARE_IF_OWNED)) { /* unsupported option */ + vmlp_api_end(VM_MAP_FORK, -1); return VM_MAP_NULL; } @@ -13521,13 +13806,15 @@ vm_map_fork( #endif /* PMAP_CREATE_FORCE_4K_PAGES */ new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags); if (new_pmap == NULL) { + vmlp_api_end(VM_MAP_FORK, -1); return VM_MAP_NULL; } vm_map_reference(old_map); vm_map_lock(old_map); - map_create_options = 0; + /* Note that we're creating a map out of fork() */ + map_create_options = VM_MAP_CREATE_VIA_FORK; if (old_map->hdr.entries_pageable) { map_create_options |= VM_MAP_CREATE_PAGEABLE; } @@ -13540,6 +13827,9 @@ vm_map_fork( old_map->max_offset, map_create_options); + /* Inherit our parent's ID. */ + vm_map_assign_serial(new_map, old_map->serial_id); + /* inherit cs_enforcement */ vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement); @@ -13561,78 +13851,30 @@ vm_map_fork( csm_fork_prepare(old_map->pmap, new_pmap); #endif -#if PMAP_FORK_NEST /* * Pre-nest the shared region's pmap. 
*/ - vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0; - pmap_fork_nest(old_map->pmap, new_pmap, - &pre_nested_start, &pre_nested_end); -#if PMAP_FORK_NEST_DEBUG - printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end); -#endif /* PMAP_FORK_NEST_DEBUG */ -#endif /* PMAP_FORK_NEST */ + pmap_fork_nest(old_map->pmap, new_pmap); for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) { + vmlp_range_event_entry(old_map, old_entry); /* * Abort any corpse collection if the system is shutting down. */ if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) && get_system_inshutdown()) { -#if PMAP_FORK_NEST - new_entry = vm_map_last_entry(new_map); - if (new_entry == vm_map_to_entry(new_map)) { - /* unnest all that was pre-nested */ - vm_map_fork_unnest(new_pmap, - pre_nested_start, pre_nested_end, - vm_map_min(new_map), vm_map_max(new_map)); - } else if (new_entry->vme_end < vm_map_max(new_map)) { - /* unnest hole at the end, if pre-nested */ - vm_map_fork_unnest(new_pmap, - pre_nested_start, pre_nested_end, - new_entry->vme_end, vm_map_max(new_map)); - } -#endif /* PMAP_FORK_NEST */ vm_map_corpse_footprint_collect_done(new_map); vm_map_unlock(new_map); vm_map_unlock(old_map); vm_map_deallocate(new_map); vm_map_deallocate(old_map); printf("Aborting corpse map due to system shutdown\n"); + vmlp_api_end(VM_MAP_FORK, -1); return VM_MAP_NULL; } entry_size = old_entry->vme_end - old_entry->vme_start; -#if PMAP_FORK_NEST - /* - * Undo any unnecessary pre-nesting. - */ - vm_map_offset_t prev_end; - if (old_entry == vm_map_first_entry(old_map)) { - prev_end = vm_map_min(old_map); - } else { - prev_end = old_entry->vme_prev->vme_end; - } - if (prev_end < old_entry->vme_start) { - /* unnest hole before this entry, if pre-nested */ - vm_map_fork_unnest(new_pmap, - pre_nested_start, pre_nested_end, - prev_end, old_entry->vme_start); - } - if (old_entry->is_sub_map && old_entry->use_pmap) { - /* keep this entry nested in the child */ -#if PMAP_FORK_NEST_DEBUG - printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end); -#endif /* PMAP_FORK_NEST_DEBUG */ - } else { - /* undo nesting for this entry, if pre-nested */ - vm_map_fork_unnest(new_pmap, - pre_nested_start, pre_nested_end, - old_entry->vme_start, old_entry->vme_end); - } -#endif /* PMAP_FORK_NEST */ - old_entry_inheritance = old_entry->inheritance; /* @@ -13815,6 +14057,7 @@ vm_map_fork( new_entry, VM_MAP_KERNEL_FLAGS_NONE); new_size += entry_size; + vmlp_range_event_entry(new_map, new_entry); break; slow_vm_map_fork_copy: @@ -13834,21 +14077,6 @@ slow_vm_map_fork_copy: old_entry = old_entry->vme_next; } -#if PMAP_FORK_NEST - new_entry = vm_map_last_entry(new_map); - if (new_entry == vm_map_to_entry(new_map)) { - /* unnest all that was pre-nested */ - vm_map_fork_unnest(new_pmap, - pre_nested_start, pre_nested_end, - vm_map_min(new_map), vm_map_max(new_map)); - } else if (new_entry->vme_end < vm_map_max(new_map)) { - /* unnest hole at the end, if pre-nested */ - vm_map_fork_unnest(new_pmap, - pre_nested_start, pre_nested_end, - new_entry->vme_end, vm_map_max(new_map)); - } -#endif /* PMAP_FORK_NEST */ - #if defined(__arm64__) pmap_insert_commpage(new_map->pmap); #endif /* __arm64__ */ @@ -13876,6 +14104,7 @@ slow_vm_map_fork_copy: vm_map_unlock(old_map); vm_map_deallocate(old_map); + 
vmlp_api_end(VM_MAP_FORK, 0); return new_map; } @@ -14048,6 +14277,8 @@ vm_map_lookup_and_lock_object( vm_prot_t original_fault_type; vm_map_size_t fault_page_mask; + vmlp_api_start(VM_MAP_LOOKUP_AND_LOCK_OBJECT); + /* * VM_PROT_MASK means that the caller wants us to use "fault_type" * as a mask against the mapping's actual protections, not as an @@ -14092,6 +14323,7 @@ RetryLookup: && (*real_map != cow_sub_map_parent)) { vm_map_unlock(*real_map); } + vmlp_api_end(VM_MAP_LOOKUP_AND_LOCK_OBJECT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -14108,14 +14340,13 @@ RetryLookup: */ submap_needed_copy = FALSE; -submap_recurse: if (entry->is_sub_map) { vm_map_offset_t local_vaddr; vm_map_offset_t end_delta; vm_map_offset_t start_delta; vm_map_offset_t top_entry_saved_start; vm_object_offset_t top_entry_saved_offset; - vm_map_entry_t submap_entry, saved_submap_entry; + vm_map_entry_t submap_entry; vm_object_offset_t submap_entry_offset; vm_object_size_t submap_entry_size; vm_prot_t subentry_protection; @@ -14127,7 +14358,6 @@ submap_recurse: boolean_t subentry_used_for_tpro; #endif /* __arm64e__ */ boolean_t mapped_needs_copy = FALSE; - vm_map_version_t version; assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map), "map %p (%d) entry %p submap %p (%d)\n", @@ -14211,6 +14441,7 @@ RetrySubMap: vm_map_unlock(*real_map); } *real_map = map; + vmlp_api_end(VM_MAP_LOOKUP_AND_LOCK_OBJECT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -14234,17 +14465,11 @@ RetrySubMap: (old_end - old_start)) - submap_entry->vme_end; + assertf(!submap_entry->is_sub_map, "Unexpected recursive submap entry %p", submap_entry); old_start += start_delta; old_end -= end_delta; - if (submap_entry->is_sub_map) { - entry = submap_entry; - vaddr = local_vaddr; - goto submap_recurse; - } - - if (((fault_type & VM_PROT_WRITE) || - force_copy) + if (((fault_type & VM_PROT_WRITE) || force_copy) && cow_sub_map_parent) { vm_object_t sub_object, copy_object; vm_object_offset_t copy_offset; @@ -14255,47 +14480,36 @@ RetrySubMap: boolean_t object_copied_needs_copy = FALSE; kern_return_t kr = KERN_SUCCESS; - if (vm_map_lock_read_to_write(map)) { - vm_map_lock_read(map); - old_start -= start_delta; - old_end += end_delta; - goto RetrySubMap; + if (__improbable(!vm_map_is_sealed(map))) { + panic("%s: CoW fault on not-yet-sealed submap %p", __func__, map); } - - sub_object = VME_OBJECT(submap_entry); - if (sub_object == VM_OBJECT_NULL) { - sub_object = - vm_object_allocate( - (vm_map_size_t) - (submap_entry->vme_end - - submap_entry->vme_start)); - VME_OBJECT_SET(submap_entry, sub_object, false, 0); - VME_OFFSET_SET(submap_entry, 0); - assert(!submap_entry->is_sub_map); - assert(submap_entry->use_pmap); - } - local_start = local_vaddr - + assert(sub_object != VM_OBJECT_NULL); + local_start = local_vaddr - (cow_parent_vaddr - old_start); local_end = local_vaddr + (old_end - cow_parent_vaddr); - vm_map_clip_start(map, submap_entry, local_start); - vm_map_clip_end(map, submap_entry, local_end); - if (submap_entry->is_sub_map) { - /* unnesting was done when clipping */ - assert(!submap_entry->use_pmap); - } - /* This is the COW case, lets connect */ /* an entry in our space to the underlying */ /* object in the submap, bypassing the */ /* submap. 
*/ submap_entry_offset = VME_OFFSET(submap_entry); submap_entry_size = submap_entry->vme_end - submap_entry->vme_start; - - if ((submap_entry->wired_count != 0 || - sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) && - (submap_entry->protection & VM_PROT_EXECUTE) && + /* adjust to out local range */ + if (submap_entry->vme_start < local_start) { + vm_map_offset_t clip_start; + clip_start = local_start - submap_entry->vme_start; + submap_entry_offset += clip_start; + submap_entry_size -= clip_start; + } + if (local_end < submap_entry->vme_end) { + vm_map_offset_t clip_end; + clip_end = submap_entry->vme_end - local_end; + submap_entry_size -= clip_end; + } + assert(!submap_entry->wired_count); + assert(sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC); + if ((submap_entry->protection & VM_PROT_EXECUTE) && no_force_copy_if_executable) { // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy); if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { @@ -14307,7 +14521,8 @@ RetrySubMap: } *real_map = map; ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */); - vm_map_lock_write_to_read(map); + // sealed map, so we already only hold the lock for read... + // vm_map_lock_write_to_read(map); kr = KERN_PROTECTION_FAILURE; DTRACE_VM4(submap_no_copy_executable, vm_map_t, map, @@ -14317,193 +14532,60 @@ RetrySubMap: return kr; } - if (submap_entry->wired_count != 0) { - vm_object_reference(sub_object); - - assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)), - "submap_entry %p offset 0x%llx\n", - submap_entry, VME_OFFSET(submap_entry)); - - DTRACE_VM6(submap_copy_slowly, - vm_map_t, cow_sub_map_parent, - vm_map_offset_t, vaddr, - vm_map_t, map, - vm_object_size_t, submap_entry_size, - int, submap_entry->wired_count, - int, sub_object->copy_strategy); - - saved_submap_entry = submap_entry; - version.main_timestamp = map->timestamp; - vm_map_unlock(map); /* Increments timestamp by 1 */ - submap_entry = VM_MAP_ENTRY_NULL; - - vm_object_lock(sub_object); - kr = vm_object_copy_slowly(sub_object, - submap_entry_offset, - submap_entry_size, - FALSE, /* interruptible */ - ©_object); - object_copied = TRUE; - object_copied_offset = 0; - /* 4k: account for extra offset in physical page */ - object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset); - object_copied_needs_copy = FALSE; - vm_object_deallocate(sub_object); - - vm_map_lock(map); - - if (kr != KERN_SUCCESS && - kr != KERN_MEMORY_RESTART_COPY) { - if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { - vm_map_unlock(cow_sub_map_parent); - } - if ((*real_map != map) - && (*real_map != cow_sub_map_parent)) { - vm_map_unlock(*real_map); - } - *real_map = map; - vm_object_deallocate(copy_object); - copy_object = VM_OBJECT_NULL; - ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */); - vm_map_lock_write_to_read(map); - DTRACE_VM4(submap_copy_error_slowly, - vm_object_t, sub_object, - vm_object_offset_t, submap_entry_offset, - vm_object_size_t, submap_entry_size, - int, kr); - vm_map_lookup_and_lock_object_copy_slowly_error++; - return kr; - } - - if ((kr == KERN_SUCCESS) && - (version.main_timestamp + 1) == 
map->timestamp) { - submap_entry = saved_submap_entry; - } else { - saved_submap_entry = NULL; - old_start -= start_delta; - old_end += end_delta; - vm_object_deallocate(copy_object); - copy_object = VM_OBJECT_NULL; - vm_map_lock_write_to_read(map); - vm_map_lookup_and_lock_object_copy_slowly_restart++; - goto RetrySubMap; - } - vm_map_lookup_and_lock_object_copy_slowly_count++; - vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size; - if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) { - vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size; - } - } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { - submap_entry_offset = VME_OFFSET(submap_entry); + assert(sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC); + copy_object = VM_OBJECT_NULL; + object_copied_offset = submap_entry_offset; + object_copied_needs_copy = FALSE; + DTRACE_VM6(submap_copy_strategically, + vm_map_t, cow_sub_map_parent, + vm_map_offset_t, vaddr, + vm_map_t, map, + vm_object_size_t, submap_entry_size, + int, submap_entry->wired_count, + int, sub_object->copy_strategy); + kr = vm_object_copy_strategically( + sub_object, + submap_entry_offset, + submap_entry_size, + false, /* forking */ + ©_object, + &object_copied_offset, + &object_copied_needs_copy); + if (kr == KERN_MEMORY_RESTART_COPY) { + old_start -= start_delta; + old_end += end_delta; + vm_object_deallocate(copy_object); copy_object = VM_OBJECT_NULL; - object_copied_offset = submap_entry_offset; - object_copied_needs_copy = FALSE; - DTRACE_VM6(submap_copy_strategically, - vm_map_t, cow_sub_map_parent, - vm_map_offset_t, vaddr, - vm_map_t, map, + vm_map_lookup_and_lock_object_copy_strategically_restart++; + goto RetrySubMap; + } + if (kr != KERN_SUCCESS) { + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); + } + *real_map = map; + vm_object_deallocate(copy_object); + copy_object = VM_OBJECT_NULL; + ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */); + DTRACE_VM4(submap_copy_error_strategically, + vm_object_t, sub_object, + vm_object_offset_t, submap_entry_offset, vm_object_size_t, submap_entry_size, - int, submap_entry->wired_count, - int, sub_object->copy_strategy); - kr = vm_object_copy_strategically( - sub_object, - submap_entry_offset, - submap_entry->vme_end - submap_entry->vme_start, - false, /* forking */ - ©_object, - &object_copied_offset, - &object_copied_needs_copy); - if (kr == KERN_MEMORY_RESTART_COPY) { - old_start -= start_delta; - old_end += end_delta; - vm_object_deallocate(copy_object); - copy_object = VM_OBJECT_NULL; - vm_map_lock_write_to_read(map); - vm_map_lookup_and_lock_object_copy_strategically_restart++; - goto RetrySubMap; - } - if (kr != KERN_SUCCESS) { - if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { - vm_map_unlock(cow_sub_map_parent); - } - if ((*real_map != map) - && (*real_map != cow_sub_map_parent)) { - vm_map_unlock(*real_map); - } - *real_map = map; - vm_object_deallocate(copy_object); - copy_object = VM_OBJECT_NULL; - ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */); - vm_map_lock_write_to_read(map); - DTRACE_VM4(submap_copy_error_strategically, - vm_object_t, sub_object, - 
vm_object_offset_t, submap_entry_offset, - vm_object_size_t, submap_entry_size, - int, kr); - vm_map_lookup_and_lock_object_copy_strategically_error++; - return kr; - } - assert(copy_object != VM_OBJECT_NULL); - assert(copy_object != sub_object); - object_copied = TRUE; - vm_map_lookup_and_lock_object_copy_strategically_count++; - vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size; - if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) { - vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size; - } - } else { - /* set up shadow object */ - object_copied = FALSE; - copy_object = sub_object; - vm_object_lock(sub_object); - vm_object_reference_locked(sub_object); - VM_OBJECT_SET_SHADOWED(sub_object, TRUE); - vm_object_unlock(sub_object); - - assert(submap_entry->wired_count == 0); - submap_entry->needs_copy = TRUE; - - prot = submap_entry->protection; - if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) { - panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", - __FUNCTION__, - map, map->pmap, submap_entry, - (uint64_t)submap_entry->vme_start, - (uint64_t)submap_entry->vme_end, - prot); - } - prot = prot & ~VM_PROT_WRITE; - if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) { - panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x", - __FUNCTION__, - map, map->pmap, submap_entry, - (uint64_t)submap_entry->vme_start, - (uint64_t)submap_entry->vme_end, - prot); - } - - if (override_nx(old_map, - VME_ALIAS(submap_entry)) - && prot) { - prot |= VM_PROT_EXECUTE; - } - - vm_object_pmap_protect( - sub_object, - VME_OFFSET(submap_entry), - submap_entry->vme_end - - submap_entry->vme_start, - (submap_entry->is_shared - || map->mapped_in_other_pmaps) ? 
- PMAP_NULL : map->pmap, - VM_MAP_PAGE_SIZE(map), - submap_entry->vme_start, - prot); - vm_map_lookup_and_lock_object_copy_shadow_count++; - vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size; - if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) { - vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size; - } + int, kr); + vm_map_lookup_and_lock_object_copy_strategically_error++; + return kr; + } + assert(copy_object != VM_OBJECT_NULL); + assert(copy_object != sub_object); + object_copied = TRUE; + vm_map_lookup_and_lock_object_copy_strategically_count++; + vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size; + if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) { + vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size; } /* @@ -14528,7 +14610,7 @@ RetrySubMap: #if __arm64e__ subentry_used_for_tpro = submap_entry->used_for_tpro; #endif // __arm64e__ - vm_map_unlock(map); + vm_map_unlock_read(map); submap_entry = NULL; /* not valid after map unlock */ local_start = old_start; @@ -14687,6 +14769,15 @@ RetrySubMap: } vm_map_lock_write_to_read(map); + } else if (vm_map_is_sealed(map)) { + assert(!submap_entry->is_sub_map); + if ((cow_sub_map_parent) + && (cow_sub_map_parent != *real_map) + && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + entry = submap_entry; + vaddr = local_vaddr; } else { if ((cow_sub_map_parent) && (cow_sub_map_parent != *real_map) @@ -14754,6 +14845,7 @@ protection_failure: * * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0); */ + vmlp_api_end(VM_MAP_LOOKUP_AND_LOCK_OBJECT, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } @@ -14841,7 +14933,9 @@ protection_failure: VME_OBJECT_SET(entry, vm_object_allocate( (vm_map_size_t)(entry->vme_end - - entry->vme_start)), false, 0); + entry->vme_start), + map->serial_id + ), false, 0); VME_OFFSET_SET(entry, 0); assert(entry->use_pmap); vm_map_lock_write_to_read(map); @@ -14926,6 +15020,8 @@ protection_failure: out_version->main_timestamp = map->timestamp; + vmlp_range_event(map, entry->vme_start, entry->vme_end - entry->vme_start); + vmlp_api_end(VM_MAP_LOOKUP_AND_LOCK_OBJECT, KERN_SUCCESS); return KERN_SUCCESS; } @@ -14952,6 +15048,19 @@ vm_map_verify( } +/* Helper function to interrogate a VM entry's state for vm_map_region_recurse */ +uint8_t +vm_map_entry_info_flags(vm_map_entry_t entry) +{ + uint8_t flags = 0; + if (entry->used_for_jit) { + flags |= VM_REGION_FLAG_JIT_ENABLED; + } + + + return flags; +} + /* * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY * Goes away after regular vm_region_recurse function migrates to @@ -14976,6 +15085,8 @@ vm_map_region_recurse_64( vm_map_offset_t user_address; unsigned int user_max_depth; + vmlp_api_start(VM_MAP_REGION_RECURSE_64); + /* * "curr_entry" is the VM map entry preceding or including the * address we're looking for. 
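/*
 * vm_map_region_recurse_64() below now fills the region info "flags" field
 * through the new vm_map_entry_info_flags() helper; the only bit visible in
 * these hunks is VM_REGION_FLAG_JIT_ENABLED for entries with used_for_jit
 * set.  A toy version of the same predicate-bits-into-a-byte pattern — the
 * second flag below is hypothetical padding for the example, not something
 * this patch defines.
 */
#include <stdbool.h>
#include <stdint.h>

#define TOY_REGION_FLAG_JIT_ENABLED 0x01u  /* mirrors VM_REGION_FLAG_JIT_ENABLED */
#define TOY_REGION_FLAG_WIRED       0x02u  /* hypothetical, for illustration only */

struct toy_region_entry {
	bool     used_for_jit;
	unsigned wired_count;
};

static uint8_t
toy_entry_info_flags(const struct toy_region_entry *e)
{
	uint8_t flags = 0;

	if (e->used_for_jit) {
		flags |= TOY_REGION_FLAG_JIT_ENABLED;
	}
	if (e->wired_count != 0) {
		flags |= TOY_REGION_FLAG_WIRED;
	}
	return flags;
}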
@@ -15026,11 +15137,13 @@ vm_map_region_recurse_64( if (map == VM_MAP_NULL) { /* no address space to work on */ + vmlp_api_end(VM_MAP_REGION_RECURSE_64, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } user_address = vm_sanitize_addr(map, *address_u); + effective_page_shift = vm_self_region_page_shift(map); effective_page_size = (1 << effective_page_shift); @@ -15039,6 +15152,7 @@ vm_map_region_recurse_64( * "info" structure is not big enough and * would overflow */ + vmlp_api_end(VM_MAP_REGION_RECURSE_64, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -15254,7 +15368,7 @@ recurse_again: next_entry == NULL && /* & there are no more regions */ /* & we haven't already provided our fake region: */ user_address <= vm_map_last_entry(map)->vme_end) { - ledger_amount_t ledger_resident, ledger_compressed; + ledger_amount_t ledger_resident = 0, ledger_compressed = 0; /* * Add a fake memory region to account for @@ -15263,11 +15377,17 @@ recurse_again: * i.e. the resident/compressed pages of non-volatile * objects owned by that task. */ - task_ledgers_footprint(map->pmap->ledger, - &ledger_resident, - &ledger_compressed); + if (__improbable(map->pmap == NULL)) { + /* Some VM tests reach this. (TODO make this more strict, rdar://148290198) */ + panic_on_release_builds("null pmap"); + } else { + task_ledgers_footprint(map->pmap->ledger, + &ledger_resident, + &ledger_compressed); + } if (ledger_resident + ledger_compressed == 0) { /* no purgeable memory usage to report */ + vmlp_api_end(VM_MAP_REGION_RECURSE_64, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } /* fake region to show nonvolatile footprint */ @@ -15314,11 +15434,13 @@ recurse_again: *nesting_depth = 0; *address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end); *size_u = vm_sanitize_wrap_size(ledger_resident + ledger_compressed); + vmlp_api_end(VM_MAP_REGION_RECURSE_64, KERN_SUCCESS); return KERN_SUCCESS; } if (next_entry == NULL) { /* ... and no VM region follows it either */ + vmlp_api_end(VM_MAP_REGION_RECURSE_64, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } /* ... 
gather info about the next VM region */ @@ -15359,6 +15481,8 @@ recurse_again: goto recurse_again; } + vmlp_range_event_entry(curr_map, curr_entry); + *nesting_depth = curr_depth; *address_u = vm_sanitize_wrap_addr( user_address + curr_skip - curr_max_below); @@ -15377,6 +15501,7 @@ recurse_again: submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry)); } else { submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry)); + submap_info->flags = vm_map_entry_info_flags(curr_entry); } } else { short_info->user_tag = VME_ALIAS(curr_entry); @@ -15391,6 +15516,7 @@ recurse_again: short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry)); } else { short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry)); + short_info->flags = vm_map_entry_info_flags(curr_entry); } } @@ -15467,6 +15593,7 @@ recurse_again: vm_map_unlock_read(curr_map); } + vmlp_api_end(VM_MAP_REGION_RECURSE_64, KERN_SUCCESS); return KERN_SUCCESS; } @@ -15500,8 +15627,11 @@ vm_map_region( return KERN_INVALID_ARGUMENT; } + vmlp_api_start(VM_MAP_REGION); + start = vm_sanitize_addr(map, *address_u); + switch (flavor) { case VM_REGION_BASIC_INFO: /* legacy for old 32-bit objects info */ @@ -15509,6 +15639,7 @@ vm_map_region( vm_region_basic_info_t basic; if (*count < VM_REGION_BASIC_INFO_COUNT) { + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -15520,6 +15651,7 @@ vm_map_region( if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } else { @@ -15527,6 +15659,7 @@ vm_map_region( } start = entry->vme_start; + vmlp_range_event(map, start, entry->vme_end - start); basic->offset = (uint32_t)VME_OFFSET(entry); basic->protection = entry->protection; @@ -15549,6 +15682,7 @@ vm_map_region( } vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_REGION, KERN_SUCCESS); return KERN_SUCCESS; } @@ -15557,6 +15691,7 @@ vm_map_region( vm_region_basic_info_64_t basic; if (*count < VM_REGION_BASIC_INFO_COUNT_64) { + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -15568,6 +15703,7 @@ vm_map_region( if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } else { @@ -15575,6 +15711,7 @@ vm_map_region( } start = entry->vme_start; + vmlp_range_event(map, start, entry->vme_end - start); basic->offset = VME_OFFSET(entry); basic->protection = entry->protection; @@ -15597,10 +15734,12 @@ vm_map_region( } vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_REGION, KERN_SUCCESS); return KERN_SUCCESS; } case VM_REGION_EXTENDED_INFO: if (*count < VM_REGION_EXTENDED_INFO_COUNT) { + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } OS_FALLTHROUGH; @@ -15611,6 +15750,7 @@ vm_map_region( int effective_page_size, effective_page_shift; if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) { + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -15624,12 +15764,14 @@ vm_map_region( if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } else { entry = tmp_entry; } start = entry->vme_start; + vmlp_range_event(map, start, 
entry->vme_end - start); extended->protection = entry->protection; extended->user_tag = VME_ALIAS(entry); @@ -15658,6 +15800,7 @@ vm_map_region( *size_u = vm_sanitize_wrap_size(entry->vme_end - start); vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_REGION, KERN_SUCCESS); return KERN_SUCCESS; } case VM_REGION_TOP_INFO: @@ -15665,6 +15808,7 @@ vm_map_region( vm_region_top_info_t top; if (*count < VM_REGION_TOP_INFO_COUNT) { + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -15676,12 +15820,14 @@ vm_map_region( if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } else { entry = tmp_entry; } start = entry->vme_start; + vmlp_range_event(map, start, entry->vme_end - start); top->private_pages_resident = 0; top->shared_pages_resident = 0; @@ -15696,9 +15842,11 @@ vm_map_region( *size_u = vm_sanitize_wrap_size(entry->vme_end - start); vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_REGION, KERN_SUCCESS); return KERN_SUCCESS; } default: + vmlp_api_end(VM_MAP_REGION, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } } @@ -15821,7 +15969,7 @@ vm_map_region_walk( do_region_footprint = task_self_region_footprint(); if ((entry->is_sub_map) || - (VME_OBJECT(entry) == 0) || + (VME_OBJECT(entry) == VM_OBJECT_NULL) || (VME_OBJECT(entry)->phys_contiguous && !entry->superpage_size)) { extended->share_mode = SM_EMPTY; @@ -15877,7 +16025,7 @@ vm_map_region_walk( /* * Query the pmap. */ - vm_map_footprint_query_page_info( + vm_map_footprint_query_page_info_exclusive( map, entry, va, @@ -16176,7 +16324,7 @@ vm_map_simplify_entry( { vm_map_entry_t prev_entry; - prev_entry = this_entry->vme_prev; + prev_entry = VME_PREV(this_entry); if ((this_entry != vm_map_to_entry(map)) && (prev_entry != vm_map_to_entry(map)) && @@ -16199,7 +16347,6 @@ vm_map_simplify_entry( (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) && (prev_entry->no_cache == this_entry->no_cache) && (prev_entry->vme_permanent == this_entry->vme_permanent) && - (prev_entry->map_aligned == this_entry->map_aligned) && (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) && (prev_entry->used_for_jit == this_entry->used_for_jit) && #if __arm64e__ @@ -16233,10 +16380,8 @@ vm_map_simplify_entry( } vm_map_store_entry_unlink(map, prev_entry, true); assert(prev_entry->vme_start < this_entry->vme_end); - if (prev_entry->map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start, - VM_MAP_PAGE_MASK(map))); - } + assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start, + VM_MAP_PAGE_MASK(map))); this_entry->vme_start = prev_entry->vme_start; VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry)); @@ -16260,13 +16405,16 @@ vm_map_simplify( vm_map_offset_t start) { vm_map_entry_t this_entry; - + vmlp_api_start(VM_MAP_SIMPLIFY); vm_map_lock(map); if (vm_map_lookup_entry(map, start, &this_entry)) { + vmlp_range_event_entry(map, this_entry); vm_map_simplify_entry(map, this_entry); + vmlp_range_event_entry(map, this_entry->vme_next); vm_map_simplify_entry(map, this_entry->vme_next); } vm_map_unlock(map); + vmlp_api_end(VM_MAP_SIMPLIFY, 0); } static void @@ -16281,6 +16429,10 @@ vm_map_simplify_range( * The map should be locked (for "write") by the caller. 
*/ + if (vm_map_is_sealed(map)) { + return; + } + if (start >= end) { /* invalid address range */ return; @@ -16320,10 +16472,12 @@ vm_map_machine_attribute_sanitize( mach_vm_offset_t *end, vm_map_size_t *size) { + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + + return vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map, - VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, - size); + flags, start, end, size); } @@ -16354,6 +16508,8 @@ vm_map_machine_attribute( kern_return_t ret; vm_map_entry_t entry; + vmlp_api_start(VM_MAP_MACHINE_ATTRIBUTE); + ret = vm_map_machine_attribute_sanitize(map, start_u, end_u, @@ -16361,14 +16517,18 @@ vm_map_machine_attribute( &end, &sync_size); if (__improbable(ret != KERN_SUCCESS)) { - return vm_sanitize_get_kr(ret); + ret = vm_sanitize_get_kr(ret); + vmlp_api_end(VM_MAP_MACHINE_ATTRIBUTE, ret); + return ret; } if (start < vm_map_min(map) || end > vm_map_max(map)) { + vmlp_api_end(VM_MAP_MACHINE_ATTRIBUTE, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } vm_map_lock(map); + vmlp_range_event(map, start, end - start); if (attribute != MATTR_CACHE) { /* If we don't have to find physical addresses, we */ @@ -16376,6 +16536,7 @@ vm_map_machine_attribute( ret = pmap_attribute(map->pmap, start, end - start, attribute, value); vm_map_unlock(map); + vmlp_api_end(VM_MAP_MACHINE_ATTRIBUTE, ret); return ret; } @@ -16460,12 +16621,14 @@ vm_map_machine_attribute( start += sub_size; } else { vm_map_unlock(map); + vmlp_api_end(VM_MAP_MACHINE_ATTRIBUTE, KERN_FAILURE); return KERN_FAILURE; } } vm_map_unlock(map); + vmlp_api_end(VM_MAP_MACHINE_ATTRIBUTE, ret); return ret; } @@ -16484,15 +16647,21 @@ vm_map_behavior_set( vm_map_offset_t end, vm_behavior_t new_behavior) { + kern_return_t kr; vm_map_entry_t entry; vm_map_entry_t temp_entry; + vmlp_api_start(VM_MAP_BEHAVIOR_SET); + vmlp_range_event(map, start, end - start); + if (start > end || start < vm_map_min(map) || end > vm_map_max(map)) { + vmlp_api_end(VM_MAP_BEHAVIOR_SET, KERN_NO_SPACE); return KERN_NO_SPACE; } if (__improbable(vm_map_range_overflows(map, start, end - start))) { + vmlp_api_end(VM_MAP_BEHAVIOR_SET, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -16522,6 +16691,7 @@ vm_map_behavior_set( vm_map_clip_start(map, entry, start); } else { vm_map_unlock(map); + vmlp_api_end(VM_MAP_BEHAVIOR_SET, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -16563,6 +16733,7 @@ vm_map_behavior_set( } vm_map_unlock(map); + kr = KERN_SUCCESS; break; /* @@ -16572,36 +16743,46 @@ vm_map_behavior_set( */ case VM_BEHAVIOR_WILLNEED: - return vm_map_willneed(map, start, end); + kr = vm_map_willneed(map, start, end); + break; case VM_BEHAVIOR_DONTNEED: - return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS); + kr = vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS); + break; case VM_BEHAVIOR_FREE: - return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS); + kr = vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS); + break; case VM_BEHAVIOR_REUSABLE: - return vm_map_reusable_pages(map, start, end); + kr = vm_map_reusable_pages(map, start, end); + break; case VM_BEHAVIOR_REUSE: - return vm_map_reuse_pages(map, start, end); + kr = vm_map_reuse_pages(map, start, end); + break; case VM_BEHAVIOR_CAN_REUSE: - return vm_map_can_reuse(map, start, end); + kr = vm_map_can_reuse(map, start, end); + break; #if MACH_ASSERT case VM_BEHAVIOR_PAGEOUT: - return 
vm_map_pageout(map, start, end); + kr = vm_map_pageout(map, start, end); + break; #endif /* MACH_ASSERT */ case VM_BEHAVIOR_ZERO: - return vm_map_zero(map, start, end); + kr = vm_map_zero(map, start, end); + break; default: - return KERN_INVALID_ARGUMENT; + kr = KERN_INVALID_ARGUMENT; + break; } - return KERN_SUCCESS; + vmlp_api_end(VM_MAP_BEHAVIOR_SET, kr); + return kr; } @@ -16624,6 +16805,8 @@ vm_map_willneed( vm_object_size_t len; vm_size_t region_size; + vmlp_api_start(VM_MAP_WILLNEED); + KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START, start, end); struct vm_object_fault_info fault_info = { @@ -16642,6 +16825,8 @@ vm_map_willneed( vm_map_lock_read(map); + vmlp_range_event(map, start, end); + /* * The madvise semantics require that the address range be fully * allocated with no holes. Otherwise, we're required to return @@ -16737,6 +16922,7 @@ vm_map_willneed( done: KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END, start, kr); + vmlp_api_end(VM_MAP_WILLNEED, kr); return kr; } @@ -16744,10 +16930,9 @@ static boolean_t vm_map_entry_is_reusable( vm_map_entry_t entry) { - /* Only user map entries */ - vm_object_t object; + /* Only user map entries */ if (entry->is_sub_map) { return FALSE; } @@ -16830,6 +17015,8 @@ vm_map_reuse_pages( vm_object_t object; vm_object_offset_t start_offset, end_offset; + vmlp_api_start(VM_MAP_REUSE_PAGES); + /* * The MADV_REUSE operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. @@ -16841,6 +17028,7 @@ vm_map_reuse_pages( * need to figure out what reusable means for a * portion of a native page. */ + vmlp_api_end(VM_MAP_REUSE_PAGES, KERN_SUCCESS); return KERN_SUCCESS; } @@ -16856,9 +17044,12 @@ vm_map_reuse_pages( if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.reuse_pages_failure++; + vmlp_api_end(VM_MAP_REUSE_PAGES, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } + vmlp_range_event(map, start, end - start); + /* * Examine each vm_map_entry_t in the range. */ @@ -16870,6 +17061,7 @@ vm_map_reuse_pages( if (!vm_map_entry_is_reusable(entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.reuse_pages_failure++; + vmlp_api_end(VM_MAP_REUSE_PAGES, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -16910,6 +17102,7 @@ vm_map_reuse_pages( vm_map_unlock_read(map); vm_page_stats_reusable.reuse_pages_success++; + vmlp_api_end(VM_MAP_REUSE_PAGES, KERN_SUCCESS); return KERN_SUCCESS; } @@ -16925,12 +17118,15 @@ vm_map_reusable_pages( vm_object_offset_t start_offset, end_offset; vm_map_offset_t pmap_offset; + vmlp_api_start(VM_MAP_REUSABLE_PAGES); + if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { /* * XXX TODO4K * need to figure out what reusable means for a portion * of a native page. */ + vmlp_api_end(VM_MAP_REUSABLE_PAGES, KERN_SUCCESS); return KERN_SUCCESS; } @@ -16951,9 +17147,12 @@ vm_map_reusable_pages( if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.reusable_pages_failure++; + vmlp_api_end(VM_MAP_REUSABLE_PAGES, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } + vmlp_range_event(map, start, end - start); + /* * Examine each vm_map_entry_t in the range. 
*/ @@ -16968,6 +17167,7 @@ vm_map_reusable_pages( if (!vm_map_entry_is_reusable(entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.reusable_pages_failure++; + vmlp_api_end(VM_MAP_REUSABLE_PAGES, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -16980,6 +17180,7 @@ vm_map_reusable_pages( vm_map_unlock_read(map); vm_page_stats_reusable.reusable_nonwritable++; vm_page_stats_reusable.reusable_pages_failure++; + vmlp_api_end(VM_MAP_REUSABLE_PAGES, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } @@ -17070,6 +17271,7 @@ vm_map_reusable_pages( vm_map_unlock_read(map); vm_page_stats_reusable.reusable_pages_success++; + vmlp_api_end(VM_MAP_REUSABLE_PAGES, KERN_SUCCESS); return KERN_SUCCESS; } @@ -17082,6 +17284,9 @@ vm_map_can_reuse( { vm_map_entry_t entry; + vmlp_api_start(VM_MAP_CAN_REUSE); + vmlp_range_event(map, start, end - start); + /* * The MADV_REUSABLE operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. @@ -17099,6 +17304,7 @@ vm_map_can_reuse( if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.can_reuse_failure++; + vmlp_api_end(VM_MAP_CAN_REUSE, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -17113,12 +17319,14 @@ vm_map_can_reuse( if (!vm_map_entry_is_reusable(entry)) { vm_map_unlock_read(map); vm_page_stats_reusable.can_reuse_failure++; + vmlp_api_end(VM_MAP_CAN_REUSE, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } } vm_map_unlock_read(map); vm_page_stats_reusable.can_reuse_success++; + vmlp_api_end(VM_MAP_CAN_REUSE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -17132,12 +17340,15 @@ vm_map_pageout( { vm_map_entry_t entry; + vmlp_api_start(VM_MAP_PAGEOUT); + /* * The MADV_PAGEOUT operation doesn't require any changes to the * vm_map_entry_t's, so the read lock is sufficient. */ vm_map_lock_read(map); + vmlp_range_event(map, start, end - start); /* * The madvise semantics require that the address range be fully @@ -17147,6 +17358,7 @@ vm_map_pageout( if (!vm_map_range_check(map, start, end, &entry)) { vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_PAGEOUT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -17179,6 +17391,7 @@ vm_map_pageout( &submap_entry)) { vm_map_unlock_read(submap); vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_PAGEOUT, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -17210,6 +17423,7 @@ vm_map_pageout( } vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_PAGEOUT, KERN_SUCCESS); return KERN_SUCCESS; } #endif /* MACH_ASSERT */ @@ -17280,12 +17494,15 @@ vm_map_zero( vm_map_offset_t cur = start; kern_return_t ret; + vmlp_api_start(VM_MAP_ZERO); + /* * This operation isn't supported where the map page size is less than * the hardware page size. Caller will need to handle error and * explicitly zero memory if needed. */ if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { + vmlp_api_end(VM_MAP_ZERO, KERN_NO_ACCESS); return KERN_NO_ACCESS; } @@ -17296,6 +17513,8 @@ vm_map_zero( vm_map_lock_read(map); assert(map->pmap != kernel_pmap); /* protect alias access */ + vmlp_range_event(map, start, end - start); + /* * The madvise semantics require that the address range be fully * allocated with no holes. Otherwise, we're required to return @@ -17303,6 +17522,7 @@ vm_map_zero( */ if (!vm_map_range_check(map, cur, end, &entry)) { vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_ZERO, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -17310,23 +17530,26 @@ vm_map_zero( * Examine each vm_map_entry_t in the range. 
*/ while (entry != vm_map_to_entry(map) && entry->vme_start < end) { + vm_map_offset_t start_offset; vm_map_offset_t cur_offset; vm_map_offset_t end_offset; unsigned int last_timestamp = map->timestamp; - vm_object_t object = VME_OBJECT(entry); + vm_object_t object; ret = vm_map_zero_entry_preflight(entry); if (ret != KERN_SUCCESS) { vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_ZERO, ret); return ret; } + object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { entry = entry->vme_next; continue; } - vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset); + vm_map_get_bounds_in_object(entry, cur, end, &start_offset, &end_offset); vm_object_lock(object); /* * Take a reference on the object as vm_object_zero will drop the object @@ -17335,31 +17558,43 @@ vm_map_zero( vm_object_reference_locked(object); vm_map_unlock_read(map); - ret = vm_object_zero(object, cur_offset, end_offset); + cur_offset = start_offset; + ret = vm_object_zero(object, &cur_offset, end_offset); vm_object_unlock(object); vm_object_deallocate(object); if (ret != KERN_SUCCESS) { + vmlp_api_end(VM_MAP_ZERO, ret); return ret; } /* * Update cur as vm_object_zero has succeeded. + * It can bail out early if it had to unlock a COPY_SYMMETRIC object, so + * the processed range is [start_offset:cur_offset). + * If the mapping became "needs_copy" while the object (and the map) was + * unlocked, we'll detect that the map has changed while unlocked + * and we'll redo the lookup and preflight for the remaining range. */ - cur += (end_offset - cur_offset); + cur += (cur_offset - start_offset); if (cur == end) { + vmlp_api_end(VM_MAP_ZERO, KERN_SUCCESS); return KERN_SUCCESS; } /* * If the map timestamp has changed, restart by relooking up cur in the - * map + * map. + * We also need to re-lookup the entry if we did not fully process + * this entry (i.e. cur_offset did not reach end_offset). 
*/ vm_map_lock_read(map); - if (last_timestamp != map->timestamp) { + if (last_timestamp != map->timestamp || + cur_offset < end_offset) { /* * Relookup cur in the map */ if (!vm_map_range_check(map, cur, end, &entry)) { vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_ZERO, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } continue; @@ -17371,6 +17606,7 @@ vm_map_zero( } vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_ZERO, KERN_SUCCESS); return KERN_SUCCESS; } @@ -17392,11 +17628,9 @@ vm_map_entry_insert( boolean_t needs_copy, vm_prot_t cur_protection, vm_prot_t max_protection, - vm_inherit_t inheritance, - boolean_t clear_map_aligned) + vm_inherit_t inheritance) { vm_map_entry_t new_entry; - boolean_t map_aligned = FALSE; assert(insp_entry != (vm_map_entry_t)0); vm_map_lock_assert_exclusive(map); @@ -17404,21 +17638,8 @@ vm_map_entry_insert( __assert_only vm_object_offset_t end_offset = 0; assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset); - if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) { - map_aligned = TRUE; - } - if (clear_map_aligned && - (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) || - !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) { - map_aligned = FALSE; - } - if (map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); - assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); - } else { - assert(page_aligned(start)); - assert(page_aligned(end)); - } + assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); + assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); assert(start < end); new_entry = vm_map_entry_create(map); @@ -17435,7 +17656,6 @@ vm_map_entry_insert( VME_OFFSET_SET(new_entry, offset); VME_ALIAS_SET(new_entry, vmk_flags.vm_tag); - new_entry->map_aligned = map_aligned; new_entry->needs_copy = needs_copy; new_entry->inheritance = inheritance; new_entry->protection = cur_protection; @@ -17519,6 +17739,8 @@ vm_map_remap_extract( vm_object_t new_copy_object; /* vm_object_copy_* result */ boolean_t saved_used_for_jit; /* Saved used_for_jit. */ + vmlp_api_start(VM_MAP_REMAP_EXTRACT); + pageable = vmk_flags.vmkf_copy_pageable; same_map = vmk_flags.vmkf_copy_same_map; @@ -17606,7 +17828,12 @@ vm_map_remap_extract( * The specified source virtual space might correspond to * multiple map entries, need to loop on them. 
*/ - vm_map_lock(map); + if (vm_map_is_sealed(map)) { + vm_map_lock_read(map); + } else { + vm_map_lock(map); + } + vmlp_range_event(map, addr, size); if (map->pmap == kernel_pmap) { map_copy->is_kernel_range = true; @@ -17760,7 +17987,7 @@ vm_map_remap_extract( /* nothing to share */ } else { assert(copy_offset == 0); - copy_object = vm_object_allocate(copy_size); + copy_object = vm_object_allocate(copy_size, submap->serial_id); VME_OFFSET_SET(copy_entry, 0); VME_OBJECT_SET(copy_entry, copy_object, false, 0); assert(copy_entry->use_pmap); @@ -17800,10 +18027,12 @@ vm_map_remap_extract( } } + vmlp_api_end(VM_MAP_REMAP_EXTRACT, result); return result; } if (src_entry->is_sub_map) { + assert(!vm_map_is_sealed(map)); /* protections for submap mapping are irrelevant here */ } else if (((src_entry->protection & required_cur_prot) != required_cur_prot) || @@ -17839,6 +18068,7 @@ vm_map_remap_extract( vm_prot_t submap_curprot, submap_maxprot; boolean_t submap_needs_copy; + assert(!vm_map_is_sealed(map)); /* * No check for "required protection" on "src_entry" * because the protections that matter are the ones @@ -17971,7 +18201,7 @@ vm_map_remap_extract( src_start += copy_entry_size; assert(src_start <= src_end); _vm_map_store_entry_link(map_header, - map_header->links.prev, + VMH_PREV(map_header), copy_entry); } /* done with submap_copy */ @@ -18041,6 +18271,7 @@ vm_map_remap_extract( } if (object == VM_OBJECT_NULL) { + assert(!vm_map_is_sealed(map)); assert(!src_entry->needs_copy); if (src_entry->max_protection == VM_PROT_NONE) { assert(src_entry->protection == VM_PROT_NONE); @@ -18057,7 +18288,7 @@ vm_map_remap_extract( offset = 0; /* no object => no offset */ goto copy_src_entry; } - object = vm_object_allocate(entry_size); + object = vm_object_allocate(entry_size, map->serial_id); VME_OFFSET_SET(src_entry, 0); VME_OBJECT_SET(src_entry, object, false, 0); assert(src_entry->use_pmap); @@ -18086,6 +18317,7 @@ vm_map_remap_extract( object->vo_size > entry_size)) { bool is_writable; + assert(!vm_map_is_sealed(map)); VME_OBJECT_SHADOW(src_entry, entry_size, vm_map_always_shadow(map)); assert(src_entry->use_pmap); @@ -18180,6 +18412,7 @@ vm_map_remap_extract( * So we always switch from COPY_SYMMETRIC to * COPY_DELAY. */ + assert(!vm_map_is_sealed(map)); object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; VM_OBJECT_SET_TRUE_SHARE(object, TRUE); @@ -18197,6 +18430,7 @@ copy_src_entry: vm_map_entry_copy(map, new_entry, src_entry); if (new_entry->is_sub_map) { /* clr address space specifics */ + assert(!vm_map_is_sealed(map)); new_entry->use_pmap = FALSE; } else if (copy) { /* @@ -18212,8 +18446,6 @@ copy_src_entry: /* "iokit_acct" was cleared in vm_map_entry_copy() */ assert(!new_entry->iokit_acct); - new_entry->map_aligned = FALSE; - new_entry->vme_start = map_address; new_entry->vme_end = map_address + tmp_size; assert(new_entry->vme_start < new_entry->vme_end); @@ -18289,6 +18521,7 @@ RestartCopy: if (!src_entry->is_sub_map && VME_OBJECT(src_entry) == VM_OBJECT_NULL) { /* no accessible memory; nothing to share */ + assert(!vm_map_is_sealed(map)); assert(src_entry->protection == VM_PROT_NONE); assert(src_entry->max_protection == VM_PROT_NONE); src_entry->is_shared = FALSE; @@ -18309,6 +18542,7 @@ RestartCopy: } } else if (src_entry->is_sub_map) { /* make this a COW sub_map if not already */ + assert(!vm_map_is_sealed(map)); assert(new_entry->wired_count == 0); new_entry->needs_copy = TRUE; object = VM_OBJECT_NULL; @@ -18389,7 +18623,11 @@ RestartCopy: * verification, and unlock the map. 
*/ version.main_timestamp = map->timestamp; - vm_map_unlock(map); /* Increments timestamp once! */ + if (vm_map_is_sealed(map)) { + vm_map_unlock_read(map); + } else { + vm_map_unlock(map); /* Increments timestamp once! */ + } /* * Perform the copy. @@ -18443,7 +18681,11 @@ RestartCopy: if (result != KERN_SUCCESS && result != KERN_MEMORY_RESTART_COPY) { vm_map_entry_dispose(new_entry); - vm_map_lock(map); + if (vm_map_is_sealed(map)) { + vm_map_lock_read(map); + } else { + vm_map_lock(map); + } break; } @@ -18452,7 +18694,12 @@ RestartCopy: * changed while the copy was being made. */ - vm_map_lock(map); + if (vm_map_is_sealed(map)) { + vm_map_lock_read(map); + version.main_timestamp--; /* we don't expect an increment */ + } else { + vm_map_lock(map); + } if (version.main_timestamp + 1 != map->timestamp) { /* * Simple version comparison failed. @@ -18479,7 +18726,7 @@ RestartCopy: } _vm_map_store_entry_link(map_header, - map_header->links.prev, new_entry); + VMH_PREV(map_header), new_entry); /* protections for submap mapping are irrelevant here */ if (vm_remap_legacy && !src_entry->is_sub_map) { @@ -18506,7 +18753,11 @@ RestartCopy: } } /* end while */ - vm_map_unlock(map); + if (vm_map_is_sealed(map)) { + vm_map_unlock_read(map); + } else { + vm_map_unlock(map); + } if (result != KERN_SUCCESS) { /* * Free all allocated elements. @@ -18524,6 +18775,7 @@ RestartCopy: vm_map_entry_dispose(src_entry); } } + vmlp_api_end(VM_MAP_REMAP_EXTRACT, result); return result; } @@ -18546,18 +18798,24 @@ void vm_map_mark_alien( vm_map_t map) { + vmlp_api_start(VM_MAP_MARK_ALIEN); vm_map_lock(map); + vmlp_range_event_none(map); map->is_alien = true; vm_map_unlock(map); + vmlp_api_end(VM_MAP_MARK_ALIEN, 0); } void vm_map_single_jit( vm_map_t map) { + vmlp_api_start(VM_MAP_SINGLE_JIT); vm_map_lock(map); + vmlp_range_event_none(map); map->single_jit = true; vm_map_unlock(map); + vmlp_api_end(VM_MAP_SINGLE_JIT, 0); } #endif /* XNU_TARGET_OS_OSX */ @@ -18606,7 +18864,7 @@ vm_map_copy_to_physcopy( /* allocate new VM object */ size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK); - new_object = vm_object_allocate(size); + new_object = vm_object_allocate(size, VM_MAP_SERIAL_NONE); assert(new_object); /* allocate new VM map entry */ @@ -19342,7 +19600,10 @@ vm_map_remap( vm_map_size_t initial_size; VM_MAP_ZAP_DECLARE(zap_list); + vmlp_api_start(VM_MAP_REMAP); + if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) { + vmlp_api_end(VM_MAP_REMAP, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } src_page_mask = VM_MAP_PAGE_MASK(src_map); @@ -19378,9 +19639,14 @@ vm_map_remap( &max_protection, &inheritance); if (__improbable(result != KERN_SUCCESS)) { - return vm_sanitize_get_kr(result); + result = vm_sanitize_get_kr(result); + vmlp_api_end(VM_MAP_REMAP, result); + return result; } + vmlp_range_event(target_map, target_addr, memory_size); + vmlp_range_event(src_map, memory_address, memory_size); + if (vmk_flags.vmf_return_data_addr) { /* * This is safe to unwrap now that the quantities @@ -19408,6 +19674,7 @@ vm_map_remap( if (vmk_flags.vmf_resilient_media) { /* must be copy-on-write to be "media resilient" */ if (!copy) { + vmlp_api_end(VM_MAP_REMAP, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } } @@ -19425,6 +19692,7 @@ vm_map_remap( inheritance, vmk_flags); if (result != KERN_SUCCESS) { + vmlp_api_end(VM_MAP_REMAP, result); return result; } assert(copy_map != VM_MAP_COPY_NULL); @@ -19463,6 +19731,7 @@ vm_map_remap( if (result != KERN_SUCCESS) { DEBUG4K_COPY("failed to adjust 0x%x\n", result); 
vm_map_copy_discard(copy_map); + vmlp_api_end(VM_MAP_REMAP, result); return result; } if (trimmed_start == 0) { @@ -19490,6 +19759,7 @@ vm_map_remap( if (target_size == 0) { vm_map_copy_discard(copy_map); + vmlp_api_end(VM_MAP_REMAP, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -19589,7 +19859,6 @@ vm_map_remap( } entry->vme_start += target_addr; entry->vme_end += target_addr; - assert(!entry->map_aligned); if (vmk_flags.vmf_resilient_media && !entry->is_sub_map && (VME_OBJECT(entry) == VM_OBJECT_NULL || @@ -19653,19 +19922,22 @@ vm_map_remap( vm_map_copy_discard(copy_map); copy_map = VM_MAP_COPY_NULL; + vmlp_api_end(VM_MAP_REMAP, result); return result; } /* - * vm_map_switch_to: + * vm_map_switch_with_sec_override: * * Set the address map for the current thread to the specified map. + * Optionally sets the `sec_override` property on the current thread for + * the duration of the switch. * Returns a struct containing info about the previous map, which should be * restored with `vm_map_switch_back` */ vm_map_switch_context_t -vm_map_switch_to(vm_map_t map) +vm_map_switch_with_sec_override(vm_map_t map, boolean_t sec_override) { thread_t thread = current_thread(); vm_map_t oldmap = thread->map; @@ -19684,7 +19956,7 @@ vm_map_switch_to(vm_map_t map) } vm_map_unlock(map); - return (vm_map_switch_context_t) { oldmap, task }; + return (vm_map_switch_context_t) { oldmap, task, sec_override }; } void @@ -19911,6 +20183,8 @@ vm_map_check_protection( vm_prot_t protection; kern_return_t kr; + vmlp_api_start(VM_MAP_CHECK_PROTECTION); + kr = vm_map_check_protection_sanitize(map, start_u, end_u, @@ -19922,20 +20196,25 @@ vm_map_check_protection( if (__improbable(kr != KERN_SUCCESS)) { kr = vm_sanitize_get_kr(kr); if (kr == KERN_SUCCESS) { + vmlp_api_end(VM_MAP_CHECK_PROTECTION, 0); return true; } + vmlp_api_end(VM_MAP_CHECK_PROTECTION, -1); return false; } vm_map_lock(map); + vmlp_range_event(map, start, end - start); if (start < vm_map_min(map) || end > vm_map_max(map)) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_CHECK_PROTECTION, -1); return false; } if (!vm_map_lookup_entry(map, start, &tmp_entry)) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_CHECK_PROTECTION, -1); return false; } @@ -19944,6 +20223,7 @@ vm_map_check_protection( while (start < end) { if (entry == vm_map_to_entry(map)) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_CHECK_PROTECTION, -1); return false; } @@ -19953,6 +20233,7 @@ vm_map_check_protection( if (start < entry->vme_start) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_CHECK_PROTECTION, -1); return false; } @@ -19962,6 +20243,7 @@ vm_map_check_protection( if ((entry->protection & protection) != protection) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_CHECK_PROTECTION, -1); return false; } @@ -19971,6 +20253,7 @@ vm_map_check_protection( entry = entry->vme_next; } vm_map_unlock(map); + vmlp_api_end(VM_MAP_CHECK_PROTECTION, 0); return true; } @@ -19987,11 +20270,14 @@ vm_map_purgable_control( kern_return_t kr; boolean_t was_nonvolatile; + vmlp_api_start(VM_MAP_PURGABLE_CONTROL); + /* * Vet all the input parameters and current type and state of the * underlaying object. Return with an error if anything is amiss. 
*/ if (map == VM_MAP_NULL) { + vmlp_api_end(VM_MAP_PURGABLE_CONTROL, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -19999,11 +20285,13 @@ vm_map_purgable_control( control != VM_PURGABLE_GET_STATE && control != VM_PURGABLE_PURGE_ALL && control != VM_PURGABLE_SET_STATE_FROM_KERNEL) { + vmlp_api_end(VM_MAP_PURGABLE_CONTROL, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } if (control == VM_PURGABLE_PURGE_ALL) { vm_purgeable_object_purge_all(); + vmlp_api_end(VM_MAP_PURGABLE_CONTROL, KERN_SUCCESS); return KERN_SUCCESS; } @@ -20011,6 +20299,7 @@ vm_map_purgable_control( control == VM_PURGABLE_SET_STATE_FROM_KERNEL) && (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) || ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) { + vmlp_api_end(VM_MAP_PURGABLE_CONTROL, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -20023,15 +20312,19 @@ vm_map_purgable_control( * Must pass a valid non-submap address. */ vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_PURGABLE_CONTROL, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } + vmlp_range_event_entry(map, entry); + if ((entry->protection & VM_PROT_WRITE) == 0 && control != VM_PURGABLE_GET_STATE) { /* * Can't apply purgable controls to something you can't write. */ vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_PURGABLE_CONTROL, KERN_PROTECTION_FAILURE); return KERN_PROTECTION_FAILURE; } @@ -20042,6 +20335,7 @@ vm_map_purgable_control( * Object must already be present and be purgeable. */ vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_PURGABLE_CONTROL, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -20079,11 +20373,16 @@ vm_map_purgable_control( vm_object_unlock(object); + vmlp_api_end(VM_MAP_PURGABLE_CONTROL, kr); return kr; } -void -vm_map_footprint_query_page_info( +/* + * Query the disposition of a page at the given (entry, offset). Caller must + * hold the map lock exclusively. + */ +static void +vm_map_footprint_query_page_info_exclusive( vm_map_t map, vm_map_entry_t map_entry, vm_map_offset_t curr_s_offset, @@ -20094,7 +20393,11 @@ vm_map_footprint_query_page_info( int disposition; int effective_page_size; - vm_map_lock_assert_held(map); + /* + * XXX: Do *not* assert that the map lock is held. This routine is called + * frequently in a loop and rw-lock assertions have significant overhead + * (rdar://126486334). + */ assert(!map->has_corpse_footprint); assert(curr_s_offset >= map_entry->vme_start); assert(curr_s_offset < map_entry->vme_end); @@ -20346,10 +20649,12 @@ vm_map_page_range_info_internal( vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0; vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0; boolean_t do_region_footprint; - ledger_amount_t ledger_resident, ledger_compressed; + ledger_amount_t ledger_resident = 0, ledger_compressed = 0; int effective_page_size; vm_map_offset_t effective_page_mask; + vmlp_api_start(VM_MAP_PAGE_RANGE_INFO_INTERNAL); + switch (flavor) { case VM_PAGE_INFO_BASIC: if (*count != VM_PAGE_INFO_BASIC_COUNT) { @@ -20359,17 +20664,20 @@ vm_map_page_range_info_internal( * one to maintain backwards binary compatibility... 
*/ if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) { + vmlp_api_end(VM_MAP_PAGE_RANGE_INFO_INTERNAL, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } } break; default: + vmlp_api_end(VM_MAP_PAGE_RANGE_INFO_INTERNAL, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } if (effective_page_shift == -1) { effective_page_shift = vm_self_region_page_shift_safely(map); if (effective_page_shift == -1) { + vmlp_api_end(VM_MAP_PAGE_RANGE_INFO_INTERNAL, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } } @@ -20385,7 +20693,9 @@ vm_map_page_range_info_internal( &end, &offset_in_page); if (retval != KERN_SUCCESS) { - return vm_sanitize_get_kr(retval); + retval = vm_sanitize_get_kr(retval); + vmlp_api_end(VM_MAP_PAGE_RANGE_INFO_INTERNAL, retval); + return retval; } assert((end - start) <= MAX_PAGE_RANGE_QUERY); @@ -20397,9 +20707,13 @@ vm_map_page_range_info_internal( info_idx = 0; /* Tracks the next index within the info structure to be filled.*/ vm_map_lock_read(map); - - - task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed); + vmlp_range_event(map, start, end - start); + if (__improbable(map->pmap == NULL)) { + /* Some VM tests reach this. (TODO make this more strict, rdar://148290198) */ + panic_on_release_builds("null pmap"); + } else { + task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed); + } for (curr_s_offset = start; curr_s_offset < end;) { /* @@ -20603,7 +20917,7 @@ vm_map_page_range_info_internal( * Query the live pmap for footprint info * about this page. */ - vm_map_footprint_query_page_info( + vm_map_footprint_query_page_info_exclusive( map, map_entry, curr_s_offset, @@ -20790,6 +21104,7 @@ vm_map_page_range_info_internal( } vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_PAGE_RANGE_INFO_INTERNAL, retval); return retval; } @@ -20803,11 +21118,12 @@ vm_map_msync_sanitize( vm_map_size_t *size) { vm_object_offset_t end; + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + return vm_sanitize_addr_size(address_u, size_u, VM_SANITIZE_CALLER_VM_MAP_MSYNC, - map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, - address, &end, size); + map, flags, address, &end, size); } /* @@ -20864,12 +21180,16 @@ vm_map_msync( vm_map_offset_t pmap_offset; kern_return_t kr; + vmlp_api_start(VM_MAP_MSYNC); + if ((sync_flags & VM_SYNC_ASYNCHRONOUS) && (sync_flags & VM_SYNC_SYNCHRONOUS)) { + vmlp_api_end(VM_MAP_MSYNC, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } if (map == VM_MAP_NULL) { + vmlp_api_end(VM_MAP_MSYNC, KERN_INVALID_TASK); return KERN_INVALID_TASK; } @@ -20882,7 +21202,9 @@ vm_map_msync( DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags); } if (__improbable(kr != KERN_SUCCESS)) { - return vm_sanitize_get_kr(kr); + kr = vm_sanitize_get_kr(kr); + vmlp_api_end(VM_MAP_MSYNC, kr); + return kr; } amount_left = size; @@ -20966,6 +21288,17 @@ vm_map_msync( vm_map_offset_t local_offset; local_map = VME_SUBMAP(entry); + if (vm_map_is_sealed(local_map)) { + /* + * We could do most of the work with a READER lock on the + * map but we might do some pmap operations which might + * need to be serialized by the WRITER lock. + * Since we can't take a WRITER lock on a sealed map, + * let's just skip this range. 
+ */ + vm_map_unlock(map); + continue; + } local_offset = VME_OFFSET(entry); vm_map_reference(local_map); vm_map_unlock(map); @@ -20980,6 +21313,9 @@ vm_map_msync( local_map = VM_MAP_NULL; continue; } + + vmlp_range_event_entry(map, entry); + object = VME_OBJECT(entry); /* @@ -21110,9 +21446,11 @@ vm_map_msync( /* for proper msync() behaviour */ if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) { + vmlp_api_end(VM_MAP_MSYNC, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } + vmlp_api_end(VM_MAP_MSYNC, KERN_SUCCESS); return KERN_SUCCESS; }/* vm_msync */ @@ -21207,7 +21545,7 @@ convert_port_entry_to_map( return VM_MAP_NULL; } - if (ip_kotype(port) != IKOT_NAMED_ENTRY) { + if (ip_type(port) != IKOT_NAMED_ENTRY) { return convert_port_to_map(port); } @@ -21377,6 +21715,23 @@ vm_map_set_jit_entitled(vm_map_t map) #endif } +void +vm_map_set_platform_binary( + vm_map_t map, + bool is_platform_binary) +{ + /* map should be locked by caller, unless still private */ + map->cs_platform_binary = is_platform_binary; +} + +bool +vm_map_is_platform_binary( + vm_map_t map) +{ + /* map should be locked by caller, unless still private */ + return map->cs_platform_binary; +} + /* * Get status of this maps TPRO flag */ @@ -21405,6 +21760,7 @@ vm_map_set_tpro(vm_map_t map) } + /* * Does this map have TPRO enforcement enabled */ @@ -21420,11 +21776,14 @@ vm_map_tpro_enforcement(vm_map_t map) void vm_map_set_tpro_enforcement(vm_map_t map) { + vmlp_api_start(VM_MAP_SET_TPRO_ENFORCEMENT); + vmlp_range_event_none(map); if (vm_map_tpro(map)) { vm_map_lock(map); map->tpro_enforcement = TRUE; vm_map_unlock(map); } + vmlp_api_end(VM_MAP_SET_TPRO_ENFORCEMENT, 0); } /* @@ -21441,6 +21800,9 @@ vm_map_set_tpro_range( __unused vm_map_address_t start, __unused vm_map_address_t end) { + vmlp_api_start(VM_MAP_SET_TPRO_RANGE); + + vmlp_api_end(VM_MAP_SET_TPRO_RANGE, 0); return TRUE; } @@ -21453,6 +21815,8 @@ vm_map_set_max_addr( vm_map_offset_t new_max_offset, __unused bool extra_jumbo) { + vmlp_api_start(VM_MAP_SET_MAX_ADDR); + #if defined(__arm64__) vm_map_offset_t max_supported_offset; vm_map_offset_t old_max_offset; @@ -21460,6 +21824,8 @@ vm_map_set_max_addr( vm_map_lock(map); + vmlp_range_event_none(map); + old_max_offset = map->max_offset; #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT if (extra_jumbo) { @@ -21473,6 +21839,7 @@ vm_map_set_max_addr( /* The address space cannot be shrunk using this routine. */ if (old_max_offset >= new_max_offset) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_SET_MAX_ADDR, 0); return; } @@ -21521,6 +21888,7 @@ vm_map_set_max_addr( (void)map; (void)new_max_offset; #endif + vmlp_api_end(VM_MAP_SET_MAX_ADDR, 0); } vm_map_offset_t @@ -21610,7 +21978,12 @@ vm_map_raise_max_offset( { kern_return_t ret; + vmlp_api_start(VM_MAP_RAISE_MAX_OFFSET); + vm_map_lock(map); + + vmlp_range_event_none(map); + ret = KERN_INVALID_ADDRESS; if (new_max_offset >= map->max_offset) { @@ -21628,6 +22001,7 @@ vm_map_raise_max_offset( } vm_map_unlock(map); + vmlp_api_end(VM_MAP_RAISE_MAX_OFFSET, ret); return ret; } @@ -21643,6 +22017,8 @@ vm_map_raise_min_offset( { vm_map_entry_t first_entry; + vmlp_api_start(VM_MAP_RAISE_MIN_OFFSET); + new_min_offset = vm_map_round_page(new_min_offset, VM_MAP_PAGE_MASK(map)); @@ -21655,15 +22031,22 @@ vm_map_raise_min_offset( * possibly good reasons, inaccessible. 
*/ vm_map_unlock(map); + vmlp_api_end(VM_MAP_RAISE_MIN_OFFSET, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } if (new_min_offset >= map->max_offset) { /* can't go beyond the end of the address space */ vm_map_unlock(map); + vmlp_api_end(VM_MAP_RAISE_MIN_OFFSET, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } first_entry = vm_map_first_entry(map); + + if (!first_entry->is_sub_map) { + vmlp_range_event_entry(map, first_entry); + } + if (first_entry != vm_map_to_entry(map) && first_entry->vme_start < new_min_offset) { /* @@ -21671,6 +22054,7 @@ vm_map_raise_min_offset( * minimun offset. It's too late to change it now... */ vm_map_unlock(map); + vmlp_api_end(VM_MAP_RAISE_MIN_OFFSET, KERN_NO_SPACE); return KERN_NO_SPACE; } @@ -21684,6 +22068,7 @@ vm_map_raise_min_offset( vm_map_unlock(map); + vmlp_api_end(VM_MAP_RAISE_MIN_OFFSET, KERN_SUCCESS); return KERN_SUCCESS; } @@ -21700,7 +22085,12 @@ vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit) { kern_return_t kr; + vmlp_api_start(VM_MAP_SET_SIZE_LIMIT); + vm_map_lock(map); + + vmlp_range_event_none(map); + if (new_size_limit < map->size) { /* new limit should not be lower than its current size */ DTRACE_VM2(vm_map_set_size_limit_fail, @@ -21722,6 +22112,7 @@ vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit) kr = KERN_SUCCESS; } vm_map_unlock(map); + vmlp_api_end(VM_MAP_SET_SIZE_LIMIT, kr); return kr; } @@ -21731,7 +22122,12 @@ vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit) { kern_return_t kr; + vmlp_api_start(VM_MAP_SET_DATA_LIMIT); + vm_map_lock(map); + + vmlp_range_event_none(map); + if (new_data_limit < map->size) { /* new limit should not be lower than its current size */ DTRACE_VM2(vm_map_set_data_limit_fail, @@ -21753,6 +22149,7 @@ vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit) kr = KERN_SUCCESS; } vm_map_unlock(map); + vmlp_api_end(VM_MAP_SET_DATA_LIMIT, kr); return kr; } @@ -21760,9 +22157,12 @@ void vm_map_set_user_wire_limit(vm_map_t map, vm_size_t limit) { + vmlp_api_start(VM_MAP_SET_USER_WIRE_LIMIT); vm_map_lock(map); + vmlp_range_event_none(map); map->user_wire_limit = limit; vm_map_unlock(map); + vmlp_api_end(VM_MAP_SET_USER_WIRE_LIMIT, 0); } @@ -21770,9 +22170,12 @@ void vm_map_switch_protect(vm_map_t map, boolean_t val) { + vmlp_api_start(VM_MAP_SWITCH_PROTECT); + vmlp_range_event_none(map); vm_map_lock(map); map->switch_protect = val; vm_map_unlock(map); + vmlp_api_end(VM_MAP_SWITCH_PROTECT, 0); } extern int cs_process_enforcement_enable; @@ -21819,9 +22222,12 @@ vm_map_cs_debugged_set( vm_map_t map, boolean_t val) { + vmlp_api_start(VM_MAP_CS_DEBUGGED_SET); vm_map_lock(map); + vmlp_range_event_none(map); map->cs_debugged = val; vm_map_unlock(map); + vmlp_api_end(VM_MAP_CS_DEBUGGED_SET, 0); } void @@ -21829,10 +22235,13 @@ vm_map_cs_enforcement_set( vm_map_t map, boolean_t val) { + vmlp_api_start(VM_MAP_CS_ENFORCEMENT_SET); vm_map_lock(map); + vmlp_range_event_none(map); map->cs_enforcement = val; pmap_set_vm_map_cs_enforced(map->pmap, val); vm_map_unlock(map); + vmlp_api_end(VM_MAP_CS_ENFORCEMENT_SET, 0); } /* @@ -21871,25 +22280,32 @@ vm_map_sign(vm_map_t map, vm_page_t m; vm_object_t object; + vmlp_api_start(VM_MAP_SIGN); + /* * Vet all the input parameters and current type and state of the * underlaying object. Return with an error if anything is amiss. 
*/ if (map == VM_MAP_NULL) { + vmlp_api_end(VM_MAP_SIGN, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } if (__improbable(vm_map_range_overflows(map, start, end - start))) { + vmlp_api_end(VM_MAP_SIGN, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } vm_map_lock_read(map); + vmlp_range_event(map, start, end - start); + if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) { /* * Must pass a valid non-submap address. */ vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_SIGN, KERN_INVALID_ADDRESS); return KERN_INVALID_ADDRESS; } @@ -21899,6 +22315,7 @@ vm_map_sign(vm_map_t map, * this situation currently. */ vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_SIGN, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -21908,6 +22325,7 @@ vm_map_sign(vm_map_t map, * Object must already be present or we can't sign. */ vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_SIGN, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -21927,6 +22345,7 @@ vm_map_sign(vm_map_t map, /* shoud we try to fault a page here? we can probably * demand it exists and is locked for this request */ vm_object_unlock(object); + vmlp_api_end(VM_MAP_SIGN, KERN_FAILURE); return KERN_FAILURE; } /* deal with special page status */ @@ -21934,6 +22353,7 @@ vm_map_sign(vm_map_t map, (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || vm_page_is_private(m) || m->vmp_absent))) { vm_object_unlock(object); + vmlp_api_end(VM_MAP_SIGN, KERN_FAILURE); return KERN_FAILURE; } @@ -21961,6 +22381,7 @@ vm_map_sign(vm_map_t map, } vm_object_unlock(object); + vmlp_api_end(VM_MAP_SIGN, KERN_SUCCESS); return KERN_SUCCESS; } #endif @@ -21973,6 +22394,8 @@ vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int kern_return_t kr = KERN_SUCCESS; VM_MAP_ZAP_DECLARE(zap_list); + vmlp_api_start(VM_MAP_PARTIAL_REAP); + vm_map_lock(map); for (entry = vm_map_first_entry(map); @@ -21980,6 +22403,10 @@ vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int entry = next_entry) { next_entry = entry->vme_next; + if (!entry->is_sub_map) { + vmlp_range_event_entry(map, entry); + } + if (!entry->is_sub_map && VME_OBJECT(entry) && (VME_OBJECT(entry)->internal == TRUE) && @@ -21997,6 +22424,7 @@ vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int vm_map_zap_dispose(&zap_list); + vmlp_api_end(VM_MAP_PARTIAL_REAP, kr); return kr; } @@ -22011,6 +22439,8 @@ vm_map_disconnect_page_mappings( vm_map_entry_t entry; ledger_amount_t byte_count = 0; + vmlp_api_start(VM_MAP_DISCONNECT_PAGE_MAPPINGS); + if (do_unnest == TRUE) { #ifndef NO_NESTED_PMAP vm_map_lock(map); @@ -22038,6 +22468,7 @@ vm_map_disconnect_page_mappings( for (entry = vm_map_first_entry(map); entry != vm_map_to_entry(map); entry = entry->vme_next) { + vmlp_range_event_entry(map, entry); if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) || (VME_OBJECT(entry)->phys_contiguous))) { continue; @@ -22050,6 +22481,7 @@ vm_map_disconnect_page_mappings( } vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_DISCONNECT_PAGE_MAPPINGS, (int) (byte_count / VM_MAP_PAGE_SIZE(map))); return (int) (byte_count / VM_MAP_PAGE_SIZE(map)); } @@ -22064,6 +22496,8 @@ vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr) vm_map_t real_map; int result = KERN_FAILURE; + vmlp_api_start(VM_MAP_INJECT_ERROR); + vaddr = vm_map_trunc_page(vaddr, PAGE_MASK); vm_map_lock(map); @@ -22088,6 +22522,7 @@ vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr) } vm_map_unlock(map); + vmlp_api_end(VM_MAP_INJECT_ERROR, result); return 
result; } @@ -22231,6 +22666,8 @@ vm_map_freeze( int cur_shared_obj_ref_cnt = 0; unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0; + vmlp_api_start(VM_MAP_FREEZE); + *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0; /* @@ -22288,6 +22725,8 @@ again: continue; } + vmlp_range_event_entry(map, entry2); + src_object = VME_OBJECT(entry2); if (!src_object || src_object->phys_contiguous || @@ -22409,12 +22848,13 @@ again: *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL)); if (evaluation_phase) { - unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64; - - if (dirty_shared_count > shared_pages_threshold) { - *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY; - kr = KERN_FAILURE; - goto done; + if (memorystatus_freeze_shared_mb_per_process_max) { + unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64; + if (dirty_shared_count > shared_pages_threshold) { + *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY; + kr = KERN_FAILURE; + goto done; + } } if (dirty_shared_count && @@ -22448,6 +22888,7 @@ done: if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) { vm_object_compressed_freezer_done(); } + vmlp_api_end(VM_MAP_FREEZE, kr); return kr; } @@ -22743,8 +23184,11 @@ vm_map_sizes(vm_map_t map, vm_map_size_t free, total_free, largest_free; boolean_t end; + vmlp_api_start(VM_MAP_SIZES); + if (!map) { *psize = *pfree = *plargest_free = 0; + vmlp_api_end(VM_MAP_SIZES, 0); return; } total_free = largest_free = 0; @@ -22758,6 +23202,10 @@ vm_map_sizes(vm_map_t map, for (entry = vm_map_first_entry(map);; entry = entry->vme_next) { end = (entry == vm_map_to_entry(map)); + if (!entry->is_sub_map) { + vmlp_range_event_entry(map, entry); + } + if (end) { free = entry->vme_end - prev; } else { @@ -22781,6 +23229,7 @@ vm_map_sizes(vm_map_t map, if (plargest_free) { *plargest_free = largest_free; } + vmlp_api_end(VM_MAP_SIZES, 0); } #if VM_SCAN_FOR_SHADOW_CHAIN @@ -22792,7 +23241,10 @@ vm_map_shadow_max( vm_map_entry_t entry; vm_object_t object, next_object; + vmlp_api_start(VM_MAP_SHADOW_MAX); + if (map == NULL) { + vmlp_api_end(VM_MAP_SHADOW_MAX, 0); return 0; } @@ -22806,6 +23258,9 @@ vm_map_shadow_max( if (entry->is_sub_map) { continue; } + + vmlp_range_event_entry(map, entry); + object = VME_OBJECT(entry); if (object == NULL) { continue; @@ -22826,6 +23281,7 @@ vm_map_shadow_max( vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_SHADOW_MAX, shadows_max); return shadows_max; } #endif /* VM_SCAN_FOR_SHADOW_CHAIN */ @@ -23089,32 +23545,41 @@ inline bool vm_map_is_corpse_source(vm_map_t map) { bool status = false; + vmlp_api_start(VM_MAP_IS_CORPSE_SOURCE); + vmlp_range_event_none(map); if (map) { vm_map_lock_read(map); status = map->corpse_source; vm_map_unlock_read(map); } + vmlp_api_end(VM_MAP_IS_CORPSE_SOURCE, status); return status; } inline void vm_map_set_corpse_source(vm_map_t map) { + vmlp_api_start(VM_MAP_SET_CORPSE_SOURCE); + vmlp_range_event_none(map); if (map) { vm_map_lock(map); map->corpse_source = true; vm_map_unlock(map); } + vmlp_api_end(VM_MAP_SET_CORPSE_SOURCE, 0); } inline void vm_map_unset_corpse_source(vm_map_t map) { + vmlp_api_start(VM_MAP_UNSET_CORPSE_SOURCE); + vmlp_range_event_none(map); if (map) { vm_map_lock(map); map->corpse_source = false; vm_map_unlock(map); } + vmlp_api_end(VM_MAP_UNSET_CORPSE_SOURCE, 0); } /* * FORKED CORPSE FOOTPRINT @@ 
-23209,9 +23674,9 @@ struct vm_map_corpse_footprint_header { }; typedef uint8_t cf_disp_t; struct vm_map_corpse_footprint_region { - vm_map_offset_t cfr_vaddr; /* region start virtual address */ - uint32_t cfr_num_pages; /* number of pages in this "region" */ - cf_disp_t cfr_disposition[0]; /* disposition of each page */ + vm_map_offset_t cfr_vaddr; /* region start virtual address */ + uint32_t cfr_num_pages; /* number of pages in this "region" */ + cf_disp_t cfr_disposition[] __counted_by(cfr_num_pages); /* disposition of each page */ } __attribute__((packed)); static cf_disp_t @@ -23316,6 +23781,13 @@ vm_map_corpse_footprint_new_region( return new_footprint_region; } +TUNABLE(vm_size_t, vm_map_corpse_footprint_max_buffer_size, "vm_cf_max_buf_size", +#if XNU_TARGET_OS_OSX + MiB(8)); +#else /* !XNU_TARGET_OS_OSX */ + KiB(512)); +#endif /* XNU_TARGET_OS_OSX */ + /* * vm_map_corpse_footprint_collect: * collect footprint information for "old_entry" in "old_map" and @@ -23367,20 +23839,15 @@ vm_map_corpse_footprint_collect( ((old_map->size / effective_page_size) * sizeof(cf_disp_t))); /* disposition for each page */ -// printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size); + vm_log_debug("corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t)buf_size); buf_size = round_page(buf_size); - /* limit buffer to 1 page to validate overflow detection */ -// buf_size = PAGE_SIZE; - /* limit size to a somewhat sane amount */ -#if XNU_TARGET_OS_OSX -#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */ -#else /* XNU_TARGET_OS_OSX */ -#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */ -#endif /* XNU_TARGET_OS_OSX */ - if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) { - buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE; + if (buf_size > vm_map_corpse_footprint_max_buffer_size) { + vm_log("WARNING truncating corpse footprint buffer (%lu KiB) to maximum " + "size (%lu KiB) -- footprint data may be lost\n", + buf_size >> 10, vm_map_corpse_footprint_max_buffer_size >> 10); + buf_size = vm_map_corpse_footprint_max_buffer_size; } kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map); kmr = kmem_alloc_guard(kernel_map, buf_size + PAGE_SIZE, 0, @@ -23417,6 +23884,15 @@ vm_map_corpse_footprint_collect( footprint_edge = ((uintptr_t)footprint_header + footprint_header->cf_size); + if ((!old_entry->is_sub_map && VME_OBJECT(old_entry) == VM_OBJECT_NULL) || + (old_entry->is_sub_map && !old_entry->use_pmap)) { + /* + * This entry has no vm-object or is a nested pmap and therefore no page + * dispositions to record. 
+ */ + return KERN_SUCCESS; + } + if ((footprint_region->cfr_vaddr + (((vm_map_offset_t)footprint_region->cfr_num_pages) * effective_page_size)) @@ -23442,7 +23918,12 @@ vm_map_corpse_footprint_collect( sizeof(int)) - ((footprint_region->cfr_num_pages - footprint_header->cf_last_zeroes) * sizeof(cf_disp_t))); -// printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta); + DTRACE_VM5(corpse_footprint_collect, + uint32_t, footprint_header->cf_last_region, + vm_map_offset_t, footprint_region->cfr_vaddr, + uint32_t, footprint_region->cfr_num_pages, + vm_map_offset_t, old_entry->vme_start, + uint64_t, num_pages_delta); if (region_offset_delta_size < num_pages_delta_size || os_add3_overflow(footprint_region->cfr_num_pages, (uint32_t) num_pages_delta, @@ -23460,7 +23941,10 @@ vm_map_corpse_footprint_collect( * if we added "zero" page dispositions for the gap, * no choice but to start a new region. */ -// printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__); + DTRACE_VM3(corpse_footprint_collect_new_region, + vm_map_offset_t, old_entry->vme_start, + vm_map_offset_t, footprint_region->cfr_vaddr, + uint64_t, num_pages_delta); new_footprint_region = vm_map_corpse_footprint_new_region(footprint_header); /* check that we're not going over the edge */ @@ -23476,7 +23960,10 @@ vm_map_corpse_footprint_collect( * Store "zero" page dispositions for the missing * pages. */ -// printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__); + DTRACE_VM3(corpse_footprint_collect_zero_gap, + vm_map_offset_t, old_entry->vme_start, + vm_map_offset_t, footprint_region->cfr_vaddr, + uint64_t, num_pages_delta); for (; num_pages_delta > 0; num_pages_delta--) { next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region + @@ -23500,13 +23987,15 @@ vm_map_corpse_footprint_collect( int disposition; cf_disp_t cf_disp; - vm_map_footprint_query_page_info(old_map, + vm_map_footprint_query_page_info_exclusive(old_map, old_entry, va, &disposition); cf_disp = vm_page_disposition_to_cf_disp(disposition); -// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp); + DTRACE_VM2(corpse_footprint_collect_page_info, + vm_map_offset_t, va, + cf_disp_t, cf_disp); if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) { /* 
and skip this "zero" disp */ footprint_region->cfr_vaddr = va + effective_page_size; } - return KERN_SUCCESS; over_the_edge: -// printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va); + vm_log_error("corpse footprint buffer was exhausted at 0x%llx -- footprint " + "data may be lost\n", va); vm_map_corpse_footprint_full++; return KERN_RESOURCE_SHORTAGE; } @@ -23901,6 +24390,8 @@ vm_map_get_phys_page( vm_map_entry_t entry; ppnum_t phys_page = 0; + vmlp_api_start(VM_MAP_GET_PHYS_PAGE); + map_offset = vm_map_trunc_page(addr, PAGE_MASK); vm_map_lock(map); @@ -23915,8 +24406,12 @@ vm_map_get_phys_page( vm_map_unlock(old_map); continue; } + + vmlp_range_event_entry(map, entry); + if (VME_OBJECT(entry) == VM_OBJECT_NULL) { vm_map_unlock(map); + vmlp_api_end(VM_MAP_GET_PHYS_PAGE, 0); return (ppnum_t) 0; } if (VME_OBJECT(entry)->phys_contiguous) { @@ -23967,6 +24462,7 @@ vm_map_get_phys_page( } vm_map_unlock(map); + vmlp_api_end(VM_MAP_GET_PHYS_PAGE, phys_page); return phys_page; } @@ -24111,10 +24607,13 @@ vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va) vm_map_offset_t default_end; kern_return_t kr; + vmlp_api_start(VM_MAP_RANGE_CONFIGURE); + if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) { /* * No point doing vm ranges in a 32bit address space. */ + vmlp_api_end(VM_MAP_RANGE_CONFIGURE, KERN_NOT_SUPPORTED); return KERN_NOT_SUPPORTED; } @@ -24229,6 +24728,7 @@ vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va) * safely without interfering with the existing map. */ if (default_end > vm_compute_max_offset(true)) { + vmlp_api_end(VM_MAP_RANGE_CONFIGURE, KERN_NO_SPACE); return KERN_NO_SPACE; } @@ -24240,6 +24740,7 @@ vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va) * no longer test the behavior changing the value * of ARM64_MAX_OFFSET_DEVICE_* would have. 
*/ + vmlp_api_end(VM_MAP_RANGE_CONFIGURE, KERN_NOT_SUPPORTED); return KERN_NOT_SUPPORTED; } @@ -24275,6 +24776,11 @@ vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va) */ vm_map_lock_read(map); vm_map_lookup_entry_or_next(map, data_range.max_address, &entry); + + if (entry != vm_map_to_entry(map) && !entry->is_sub_map) { + vmlp_range_event_entry(map, entry); + } + if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) { size = vm_map_max(map) - data_range.max_address; } else { @@ -24309,6 +24815,7 @@ vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va) map->uses_user_ranges = true; vm_map_unlock(map); + vmlp_api_end(VM_MAP_RANGE_CONFIGURE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -24486,11 +24993,14 @@ vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr) vm_object_t object; boolean_t result; + vmlp_api_start(VM_MAP_ENTRY_HAS_DEVICE_PAGER); + if (map == NULL) { + vmlp_api_end(VM_MAP_ENTRY_HAS_DEVICE_PAGER, FALSE); return FALSE; } - vm_map_lock(map); + vm_map_lock_read(map); while (TRUE) { if (!vm_map_lookup_entry(map, vaddr, &entry)) { result = FALSE; @@ -24500,11 +25010,14 @@ vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr) // Check the submap vm_map_t submap = VME_SUBMAP(entry); assert(submap != NULL); - vm_map_lock(submap); - vm_map_unlock(map); + vm_map_lock_read(submap); + vm_map_unlock_read(map); map = submap; continue; } + + vmlp_range_event_entry(map, entry); + object = VME_OBJECT(entry); if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) { result = TRUE; @@ -24514,10 +25027,383 @@ vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr) break; } - vm_map_unlock(map); + vm_map_unlock_read(map); + vmlp_api_end(VM_MAP_ENTRY_HAS_DEVICE_PAGER, result); return result; } +bool +vm_map_is_sealed( + vm_map_t map) +{ + return map->vmmap_sealed == VM_MAP_SEALED; +} + +void +vm_map_seal( + vm_map_t map, + bool nested_pmap) +{ + vm_map_entry_t entry; + + vm_map_lock(map); + + assert3u(map->vmmap_sealed, ==, VM_MAP_WILL_BE_SEALED); + if (nested_pmap && map->pmap != PMAP_NULL) { + map->mapped_in_other_pmaps = true; + } + + for (entry = vm_map_first_entry(map); + entry != vm_map_to_entry(map); + entry = entry->vme_next) { + vm_object_t object; + + assert(!entry->is_sub_map); + if (VME_OBJECT(entry) == VM_OBJECT_NULL) { + object = vm_object_allocate(entry->vme_end - entry->vme_start, map->serial_id); + VME_OBJECT_SET(entry, object, true, 0); + VME_OFFSET_SET(entry, 0); + entry->needs_copy = false; + } + object = VME_OBJECT(entry); + if (entry->needs_copy) { + assert(object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC); + VME_OBJECT_SHADOW(entry, entry->vme_end - entry->vme_start, TRUE); + entry->needs_copy = FALSE; + object = VME_OBJECT(entry); + } + vm_object_lock(object); + if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { + object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; + object->true_share = true; + } + vm_object_unlock(object); + assert(VME_OBJECT(entry) != VM_OBJECT_NULL); + assert(VME_OBJECT(entry)->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC); + } + + map->vmmap_sealed = VM_MAP_SEALED; + /* we can't hold a write lock on a sealed map so downgrade */ + vm_map_lock_write_to_read(map); + vm_map_unlock_read(map); +} + + +#if DEVELOPMENT || DEBUG + +/* + * Support functions for userspace tests of constant submaps. 
+ * + * Two functions below can be called by userspace via sysctl: + * vm_map_testing_make_sealed_submap() + * vm_map_testing_remap_submap() + * + * To write a test that uses a submap: + * 1. Create allocations in your map that will become the new submap's contents. + * 2. Call vm_map_testing_make_sealed_submap() to replace those + * allocations with a submap containing those allocations. + * 3. Call vm_map_testing_remap_submap() to create additional + * mappings of the submap (or portions thereof) in the parent map. + */ + +/* + * Look for a submap mapped in parent_map at submap_base_address. + * Panic if it is not there. + */ +static void +vm_map_testing_require_submap_at_address( + vm_map_t parent_map, + mach_vm_address_t address, + vm_map_t * const out_submap, + vm_map_entry_t * const out_parent_entry) +{ + vm_map_entry_t entry; + + vm_map_lock_assert_held(parent_map); + + if (!vm_map_lookup_entry(parent_map, address, &entry)) { + panic("no map entry contains address"); + } + assertf(entry->vme_start == address, + "submap test: submap_base_address is not the start of a map entry"); + assertf(entry->is_sub_map, + "submap test: entry at submap_base_address is not a submap"); + + if (out_submap) { + *out_submap = VME_SUBMAP(entry); + } + if (out_parent_entry) { + *out_parent_entry = entry; + } +} + +/* + * Map a submap into current_map(). + * current_map's submap entry will be at [start, end). + * The start of the mapping will be at submap_offset in the submap. + * + * parent_map must be locked on entry and is unlocked on exit. + * + * For testing purposes only. + */ +static void +vm_map_testing_remap_submap_and_unlock( + vm_map_t parent_map, + vm_map_t submap, + mach_vm_address_t start, + mach_vm_address_t end, + mach_vm_address_t submap_offset) +{ + kern_return_t kr; + mach_vm_address_t submap_end_offset; + mach_vm_address_t new_start; + vm_prot_t cur_prot, max_prot; + vm_map_kernel_flags_t vmk_flags; + bool overflowed; + + assertf(!vm_kernel_map_is_kernel(parent_map), + "submap test: for userspace maps only"); + assertf(!vm_map_is_sealed(parent_map), + "submap test: parent map may not be sealed"); + + assertf(parent_map != submap, + "submap test: parent map and submap must be distinct"); + assertf(!vm_kernel_map_is_kernel(submap), + "submap test: submap must not be the kernel map"); + assertf(vm_map_is_sealed(submap), + "submap test: submap must be sealed"); + assertf(VM_MAP_PAGE_MASK(parent_map) == VM_MAP_PAGE_MASK(submap), + "submap test: parent map and submap must have the same page size"); + + assertf((start & VM_MAP_PAGE_MASK(parent_map)) == 0, + "submap test: start address must be page-aligned"); + assertf((end & VM_MAP_PAGE_MASK(parent_map)) == 0, + "submap test: end address must be page-aligned"); + assertf((submap_offset & VM_MAP_PAGE_MASK(parent_map)) == 0, + "submap test: offset in submap must be page-aligned"); + assertf(start < end, + "submap test: start must precede end"); + + /* submap_end_offset = submap_offset + (end - start) */ + overflowed = os_add_overflow(submap_offset, end - start, &submap_end_offset); + assertf(!overflowed, "arithmetic overflow in submap_offset + (end - start)"); + + /* + * The range to be mapped must exist in the submap. + * We assume here that a sealed map has no holes. 
+ */ + assertf(vm_map_first_entry(submap) != vm_map_to_entry(submap), + "submap test: submap must not be empty"); + assertf(vm_map_first_entry(submap)->vme_start <= submap_offset, + "submap test: submap range to remap is unmapped in the submap"); + assertf(vm_map_last_entry(submap)->vme_end >= submap_end_offset, + "submap test: submap range to remap is unmapped in the submap"); + + vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_submap = true); + + /* + * Compute protections for the submap's map entry in the parent map. + * Copied from vm_shared_region_insert_submap(): we want to do as + * many of the same things as real shared region submaps as we can. + */ + cur_prot = VM_PROT_READ; + if (VM_MAP_POLICY_WRITABLE_SHARED_REGION(parent_map)) { + max_prot = VM_PROT_ALL; + } else { + max_prot = VM_PROT_READ; + vmk_flags.vmf_permanent = true; + } + + vm_map_reference(submap); + vm_map_unlock(parent_map); + + /* Map the submap. */ + new_start = start; + kr = vm_map_enter(parent_map, &new_start, end - start, 0, + vmk_flags, (vm_object_t)(uintptr_t)submap, submap_offset, true /* copy */, + cur_prot, max_prot, VM_INHERIT_SHARE /* same as vm_shared_region.c */); + assertf(kr == KERN_SUCCESS, + "submap test: vm_map_enter of submap entry into parent map failed"); + assertf(new_start == start, + "submap test: submap entry was inserted at the wrong address"); +} + +/* + * Map part of a submap as a new submap entry in parent_map. + * The submap must already be mapped in its entirety at submap_base_address. + * The remapping destination is [start..end) in parent_map. + * The remapping source is [offset, offset + (end-start)) in the submap. + * + * For testing purposes only. + */ +void +vm_map_testing_remap_submap( + vm_map_t parent_map, + mach_vm_address_t submap_base_address, + mach_vm_address_t start, + mach_vm_address_t end, + mach_vm_address_t offset) +{ + vm_map_t submap; + + vm_map_lock(parent_map); + + /* Find the submap. It is mapped in parent_map at submap_base_address. */ + vm_map_testing_require_submap_at_address( + parent_map, submap_base_address, &submap, NULL); + + /* Map the submap as requested. */ + vm_map_testing_remap_submap_and_unlock(parent_map, submap, start, end, offset); +} + +/* + * Create a new constant/sealed submap. Map it into parent_map at [start, end). + * The new submap's contents are the map entries initially in the range [start, end). + * The new submap does not use a nested pmap. + * On entry: no existing map entry may cross start or end + * + * For testing purposes only. + */ +void +vm_map_testing_make_sealed_submap( + vm_map_t parent_map, + mach_vm_address_t start, + mach_vm_address_t end) +{ + vm_map_t submap; + vm_map_entry_t entry, next_entry; + + assertf(!vm_kernel_map_is_kernel(parent_map), + "submap test: for userspace maps only"); + assertf(!vm_map_is_sealed(parent_map), + "submap test: parent map may not be sealed"); + + assertf((start & VM_MAP_PAGE_MASK(parent_map)) == 0, + "submap test: start address must be page-aligned"); + assertf((end & VM_MAP_PAGE_MASK(parent_map)) == 0, + "submap test: end address must be page-aligned"); + assertf(start < end, + "submap test: start must precede end"); + + vm_map_lock(parent_map); + + /* + * Create the map that will become the submap. + * + * Submap's address range starts at 0 to match the real shared region. + * + * PPL/SPTM allows only one pmap_nested submap per map. + * The real shared region gets that, so we can't test pmap nesting. + * Instead this submap gets a NULL pmap. 
+ */ + submap = vm_map_create_options(NULL /* pmap */, 0, end - start, VM_MAP_CREATE_PAGEABLE); + assert(submap); + submap->is_nested_map = true; + submap->vmmap_sealed = VM_MAP_WILL_BE_SEALED; + vm_map_set_page_shift(submap, VM_MAP_PAGE_SHIFT(parent_map)); + + /* + * Move map entries from the parent map into the submap. + * Also verify that no entry crosses start or end. + * + * This is not a copy+delete operation because for testing purposes + * we want to preserve as much of the map entry state as possible. + * This operation temporarily overrides things like wire count and + * permanent, but for testing purposes that is acceptable. + */ + next_entry = NULL; + for (vm_map_lookup_entry_or_next(parent_map, start, &entry); + entry != vm_map_to_entry(parent_map) && entry->vme_start < end; + entry = next_entry) { + vm_map_offset_t entry_size; + + /* Fetch next_entry now because we're about to unlink this entry. */ + next_entry = entry->vme_next; + + /* + * Executable or TPRO memory in the submap is unimplemented. + * Real shared region submaps get vm_map_set_tpro() + * and vm_map_cs_enforcement_set(true). + */ + assertf((entry->protection & VM_PROT_ALLEXEC) == 0, + "submap test: executable memory is unimplemented"); + assertf((entry->max_protection & VM_PROT_ALLEXEC) == 0, + "submap test: executable memory is unimplemented"); +#if __arm64e__ + assertf(!entry->used_for_tpro, + "submap test: TPRO memory is unimplemented"); +#endif + + /* + * Entry ends after our start address, guaranteed by + * vm_map_lookup_entry_or_next() and our loop condition. + * If the entry starts before our start address then + * it crosses our start address which is not allowed. + */ + assertf(entry->vme_start >= start, + "submap test: entry crosses new submap bounds"); /* verify */ + assertf(entry->vme_end > start, + "submap test: entry crosses new submap bounds"); /* known */ + + /* + * Entry starts before our end address, guaranteed by our + * loop condition. If the entry ends after our end address + * then it crosses our end address which is not allowed. + */ + assertf(entry->vme_start < end, + "submap test: entry crosses new submap bounds"); /* known */ + assertf(entry->vme_end <= end, + "submap test: entry crosses new submap bounds"); /* verify */ + + /* Entry's address range is good. Move it. */ + + assertf(!entry->is_sub_map, + "submap test: can't move submap entry into a submap"); + assertf(!entry->in_transition, + "submap test: can't move in_transition entry into a submap"); + + /* Unlink map entry from parent map. */ + entry_size = entry->vme_end - entry->vme_start; + vm_map_store_entry_unlink(parent_map, entry, false /* check_permanent */); + parent_map->size -= entry_size; + + /* + * Address `start` in the parent map is address 0 in the submap. + * Subtract `start` from the entry's bounds to move it to its + * place in the submap. + */ + entry->vme_start -= start; + entry->vme_end -= start; + + /* Link map entry into the submap. */ + vm_map_store_entry_link(submap, vm_map_last_entry(submap), entry, VM_MAP_KERNEL_FLAGS_NONE); + submap->size += entry_size; + } + + /* Submap is now populated. Seal it. */ + vm_map_seal(submap, false /* nested_pmap */); + + /* + * Verify that the parent map now contains nothing in [start, end). + * Find the first map entry containing our start address or after it. + * Either there must be no such entry, + * or that entry must start after our end address. 
+ */ + vm_map_lookup_entry_or_next(parent_map, start, &entry); + assertf(entry == vm_map_to_entry(parent_map) || entry->vme_start >= end, + "submap test: intended submap range is not empty"); + + /* + * Map the entire submap into the parent map at the same range we depopulated. + * Also unlocks parent_map. + */ + vm_map_testing_remap_submap_and_unlock(parent_map, submap, + start, end, /* parent map range */ + 0 /* start offset in the submap we created */); + vm_map_deallocate(submap); /* now referenced only by the map entry */ +} + +#endif /* DEVELOPMENT || DEBUG */ #if MACH_ASSERT @@ -24678,6 +25564,18 @@ vm_map_pmap_set_process( #endif /* MACH_ASSERT */ +__attribute__((always_inline)) +vm_size_t +vm_map_kernel_max_simple_mappable_size(void) +{ +#ifdef __arm64__ + return (2ULL << 30) - PAGE_SIZE; +#else + /* No particular size limit */ + return -1; +#endif +} + /** * Check if a given given map operation size is valid for the given map, taking * in to account whether or not the map operation has overridden the soft limit. @@ -24702,7 +25600,7 @@ vm_map_is_map_size_valid( return true; #else if (__probable(target_map->pmap != kernel_pmap || - size < VM_KERNEL_SIMPLE_MAX_SIZE || no_soft_limit)) { + size <= vm_map_kernel_max_simple_mappable_size() || no_soft_limit)) { // Allocation size matches policy return true; } @@ -24719,8 +25617,14 @@ vm_map_is_map_size_valid( case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT: return false; case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC: - panic("1,000,000K ought to be enough for anybody " + panic("2,000,000K ought to be enough for anybody " "(requested %lu bytes)", size); } #endif /* __x86_64__ */ } + +vm_map_serial_t +vm_map_maybe_serial_id(vm_map_t maybe_vm_map) +{ + return maybe_vm_map != NULL ? maybe_vm_map->serial_id : VM_MAP_SERIAL_NONE; +} diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index 3847547a2..61a739947 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -562,6 +562,23 @@ extern kern_return_t vm_map_copyout( extern void vm_map_copy_discard( vm_map_copy_t copy); + +/** + * @function vm_map_kernel_max_simple_mappable_size() + * + * @brief + * Get the size of the largest contiguous mapping which can be used in the + * kernel without special accessors/attributes. When using accessors/attributes + * in XNU proper, this limit can be overridden when making allocations/mappings + * through various APIs by setting the "no soft limit" option. Such + * functionalities, however, are not available outside of XNU. + * + * Note that this function does not guarantee that the returned size will + * actually be mappable. Rather, this function specifies that simple kernel + * mappings larger than this will fail. 
+ */ +extern vm_size_t vm_map_kernel_max_simple_mappable_size(void); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/vm/vm_map_internal.h b/osfmk/vm/vm_map_internal.h index 44e024ff4..7989be1f6 100644 --- a/osfmk/vm/vm_map_internal.h +++ b/osfmk/vm/vm_map_internal.h @@ -107,11 +107,13 @@ extern kern_return_t vm_map_unwire_impl( * This file contains interfaces that are private to the VM */ -#define KiB(x) (1024 * (x)) -#define MeB(x) (1024 * 1024 * (x)) +#define KiB(kb) ((kb) << 10ull) +#define BtoKiB(b) ((b) >> 10) +#define MiB(mb) ((mb) << 20ull) +#define BtoMiB(b) ((b) >> 20) #if __LP64__ -#define KMEM_SMALLMAP_THRESHOLD (MeB(1)) +#define KMEM_SMALLMAP_THRESHOLD (MiB(1)) #else #define KMEM_SMALLMAP_THRESHOLD (KiB(256)) #endif @@ -191,6 +193,10 @@ extern void vm_map_clip_end( extern boolean_t vm_map_entry_should_cow_for_true_share( vm_map_entry_t entry); +extern void vm_map_seal( + vm_map_t map, + bool nested_pmap); + /*! * @typedef vmr_flags_t * @@ -233,13 +239,12 @@ __options_decl(vmr_flags_t, uint32_t, { VM_MAP_REMOVE_KUNWIRE = 0x001, VM_MAP_REMOVE_INTERRUPTIBLE = 0x002, VM_MAP_REMOVE_NOKUNWIRE_LAST = 0x004, - VM_MAP_REMOVE_NO_MAP_ALIGN = 0x008, - VM_MAP_REMOVE_IMMUTABLE = 0x010, - VM_MAP_REMOVE_GAPS_FAIL = 0x020, - VM_MAP_REMOVE_NO_YIELD = 0x040, - VM_MAP_REMOVE_GUESS_SIZE = 0x080, - VM_MAP_REMOVE_IMMUTABLE_CODE = 0x100, - VM_MAP_REMOVE_TO_OVERWRITE = 0x200, + VM_MAP_REMOVE_IMMUTABLE = 0x008, + VM_MAP_REMOVE_GAPS_FAIL = 0x010, + VM_MAP_REMOVE_NO_YIELD = 0x020, + VM_MAP_REMOVE_GUESS_SIZE = 0x040, + VM_MAP_REMOVE_IMMUTABLE_CODE = 0x080, + VM_MAP_REMOVE_TO_OVERWRITE = 0x100, }); /* Deallocate a region */ @@ -486,17 +491,6 @@ extern boolean_t vm_map_lookup_entry_or_next( vm_map_address_t address, vm_map_entry_t *entry); /* OUT */ -/* like vm_map_lookup_entry without the PGZ bear trap */ -#if CONFIG_PROB_GZALLOC -extern boolean_t vm_map_lookup_entry_allow_pgz( - vm_map_t map, - vm_map_address_t address, - vm_map_entry_t *entry); /* OUT */ -#else /* !CONFIG_PROB_GZALLOC */ -#define vm_map_lookup_entry_allow_pgz vm_map_lookup_entry -#endif /* !CONFIG_PROB_GZALLOC */ - - extern void vm_map_copy_remap( vm_map_t map, vm_map_entry_t where, diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c index 0b4479b94..55d948043 100644 --- a/osfmk/vm/vm_map_store.c +++ b/osfmk/vm/vm_map_store.c @@ -242,9 +242,9 @@ vm_map_store_entry_unlink( if (entry == map->hint) { map->hint = vm_map_to_entry(map); } - if (map->holelistenabled == FALSE) { + if ((map->holelistenabled == FALSE) && (map->disable_vmentry_reuse == FALSE)) { if (VMEU_entry->vme_start <= VMEU_map->first_free->vme_start) { - VMEU_first_free = VMEU_entry->vme_prev; + VMEU_first_free = VME_PREV(VMEU_entry); } else { VMEU_first_free = VMEU_map->first_free; } @@ -336,14 +336,14 @@ vm_map_store_find_space_backwards( return VM_MAP_ENTRY_NULL; } - entry = entry->vme_prev; + entry = VME_PREV(entry); while (end <= entry->vme_start) { if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { return VM_MAP_ENTRY_NULL; } - entry = entry->vme_prev; + entry = VME_PREV(entry); } if (entry->vme_end < end) { @@ -395,10 +395,10 @@ vm_map_store_find_space_backwards( return VM_MAP_ENTRY_NULL; } - entry = entry->vme_prev; + entry = VME_PREV(entry); end = entry->vme_end; } else { - entry = entry->vme_prev; + entry = VME_PREV(entry); if (entry == vm_map_to_entry(map)) { /* diff --git a/osfmk/vm/vm_map_store_internal.h b/osfmk/vm/vm_map_store_internal.h index 86f104617..7873b3770 100644 --- a/osfmk/vm/vm_map_store_internal.h +++ 
b/osfmk/vm/vm_map_store_internal.h @@ -51,6 +51,14 @@ struct vm_map_store { RB_HEAD(rb_head, vm_map_store); #endif +#define VM_ENTRY_PACKED_PTR_BITS 48 +#define VM_ENTRY_PACKED_PTR_SHIFT 0 +#define VM_ENTRY_PACKED_PTR_BASE ((uintptr_t)0) + +#define VM_PREV_PACK(prev) (uintptr_t) (VM_PACK_POINTER((uintptr_t)(prev), VM_ENTRY_PACKED_PTR)) +#define VM_PREV_UNPACK(p) ((vm_map_entry_t) VM_UNPACK_POINTER((vm_offset_t)p, VM_ENTRY_PACKED_PTR)) +static_assert(VM_KERNEL_POINTER_SIGNIFICANT_BITS <= VM_ENTRY_PACKED_PTR_BITS); + /* * Type: vm_map_entry_t [internal use only] * @@ -69,7 +77,8 @@ RB_HEAD(rb_head, vm_map_store); * and needs to be kept in sync. */ struct vm_map_links { - struct vm_map_entry *prev; /* previous entry */ + uintptr_t prev : VM_ENTRY_PACKED_PTR_BITS; + uint8_t vme_zero_wire_count_waiters :1; struct vm_map_entry *next; /* next entry */ vm_map_offset_t start; /* start address */ vm_map_offset_t end; /* end address */ diff --git a/osfmk/vm/vm_map_store_ll.c b/osfmk/vm/vm_map_store_ll.c index 937c000fe..003757230 100644 --- a/osfmk/vm/vm_map_store_ll.c +++ b/osfmk/vm/vm_map_store_ll.c @@ -58,7 +58,8 @@ first_free_is_valid_ll(vm_map_t map) void vm_map_store_init_ll(struct vm_map_header *hdr) { - hdr->links.next = hdr->links.prev = CAST_TO_VM_MAP_ENTRY(hdr); + hdr->links.next = CAST_TO_VM_MAP_ENTRY(hdr); + VMH_PREV_SET(hdr, CAST_TO_VM_MAP_ENTRY(hdr)); } void @@ -67,24 +68,23 @@ vm_map_store_entry_link_ll( vm_map_entry_t after_where, vm_map_entry_t entry) { - if (entry->map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, - VM_MAP_HDR_PAGE_MASK(hdr))); - assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, - VM_MAP_HDR_PAGE_MASK(hdr))); - } + assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, + VM_MAP_HDR_PAGE_MASK(hdr))); + assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, + VM_MAP_HDR_PAGE_MASK(hdr))); hdr->nentries++; - entry->vme_prev = after_where; + VME_PREV_SET(entry, after_where); entry->vme_next = after_where->vme_next; - entry->vme_prev->vme_next = entry->vme_next->vme_prev = entry; + VME_PREV(entry)->vme_next = entry; + VME_PREV_SET(entry->vme_next, entry); } void vm_map_store_entry_unlink_ll(struct vm_map_header *hdr, vm_map_entry_t entry) { hdr->nentries--; - entry->vme_next->vme_prev = entry->vme_prev; - entry->vme_prev->vme_next = entry->vme_next; + VME_PREV_SET(entry->vme_next, VME_PREV(entry)); + VME_PREV(entry)->vme_next = entry->vme_next; } void @@ -94,9 +94,8 @@ vm_map_store_copy_reset_ll( __unused int nentries) { copy->cpy_hdr.nentries = 0; - vm_map_copy_first_entry(copy) = - vm_map_copy_last_entry(copy) = - vm_map_copy_to_entry(copy); + vm_map_copy_first_entry(copy) = vm_map_copy_to_entry(copy); + VMH_PREV_SET(&copy->cpy_hdr, vm_map_copy_to_entry(copy)); } /* diff --git a/osfmk/vm/vm_map_store_rb.c b/osfmk/vm/vm_map_store_rb.c index 6dd3c3d32..bb4cb752b 100644 --- a/osfmk/vm/vm_map_store_rb.c +++ b/osfmk/vm/vm_map_store_rb.c @@ -143,13 +143,13 @@ vm_map_combine_hole(__unused vm_map_t map, vm_map_entry_t hole_entry) middle_hole_entry = hole_entry->vme_next; last_hole_entry = middle_hole_entry->vme_next; - assert(last_hole_entry->vme_prev == middle_hole_entry); + assert(VME_PREV(last_hole_entry) == middle_hole_entry); assert(middle_hole_entry->vme_end != last_hole_entry->vme_start); - last_hole_entry->vme_prev = hole_entry; + VME_PREV_SET(last_hole_entry, hole_entry); hole_entry->vme_next = last_hole_entry; - middle_hole_entry->vme_prev = NULL; + VME_PREV_SET(middle_hole_entry, NULL); middle_hole_entry->vme_next = NULL; zfree_id(ZONE_ID_VM_MAP_HOLES, middle_hole_entry); @@ -172,23
+172,23 @@ vm_map_delete_hole(vm_map_t map, vm_map_entry_t hole_entry) vm_map_entry_t l_next, l_prev; l_next = (vm_map_entry_t) map->holes_list->next; - l_prev = (vm_map_entry_t) map->holes_list->prev; + l_prev = (vm_map_entry_t) VML_PREV(map->holes_list); map->holes_list = (struct vm_map_links*) l_next; - l_next->vme_prev = l_prev; + VME_PREV_SET(l_next, l_prev); l_prev->vme_next = l_next; SAVE_HINT_HOLE_WRITE(map, (struct vm_map_links*) l_next); } } else { - SAVE_HINT_HOLE_WRITE(map, (struct vm_map_links*) hole_entry->vme_prev); + SAVE_HINT_HOLE_WRITE(map, (struct vm_map_links*) VME_PREV(hole_entry)); - hole_entry->vme_prev->vme_next = hole_entry->vme_next; - hole_entry->vme_next->vme_prev = hole_entry->vme_prev; + VME_PREV(hole_entry)->vme_next = hole_entry->vme_next; + VME_PREV_SET(hole_entry->vme_next, VME_PREV(hole_entry)); } hole_entry->vme_next = NULL; - hole_entry->vme_prev = NULL; + VME_PREV_SET(hole_entry, NULL); zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry); } @@ -263,7 +263,7 @@ check_map_sanity(vm_map_t map, vm_map_entry_t old_hole_entry) static void copy_hole_info(vm_map_entry_t hole_entry, vm_map_entry_t old_hole_entry) { - old_hole_entry->vme_prev = hole_entry->vme_prev; + VME_PREV_SET(old_hole_entry, VME_PREV(hole_entry)); old_hole_entry->vme_next = hole_entry->vme_next; old_hole_entry->vme_start = hole_entry->vme_start; old_hole_entry->vme_end = hole_entry->vme_end; @@ -302,7 +302,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) * Hit. */ - hole_entry = hole_entry->vme_prev; + hole_entry = VME_PREV(hole_entry); } } else if (hole_entry->vme_start > old_entry->vme_end) { /* @@ -390,7 +390,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) if (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)) { assert(hole_entry->vme_start != old_entry->vme_start); - hole_entry = hole_entry->vme_prev; + hole_entry = VME_PREV(hole_entry); } break; } @@ -398,7 +398,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) hole_entry = next_hole_entry; if (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { - hole_entry = hole_entry->vme_prev; + hole_entry = VME_PREV(hole_entry); break; } } @@ -418,25 +418,27 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) if (map->holes_list == NULL || (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list) && hole_entry->vme_start > old_entry->vme_start)) { if (map->holes_list == NULL) { map->holes_list = new_hole_entry; - new_hole_entry->prev = new_hole_entry->next = CAST_TO_VM_MAP_ENTRY(map->holes_list); + VML_PREV_SET(new_hole_entry, CAST_TO_VM_MAP_ENTRY(map->holes_list)); + new_hole_entry->next = CAST_TO_VM_MAP_ENTRY(map->holes_list); } else { l_next = CAST_TO_VM_MAP_ENTRY(map->holes_list); - l_prev = map->holes_list->prev; + l_prev = VML_PREV(map->holes_list); map->holes_list = new_hole_entry; new_hole_entry->next = l_next; - new_hole_entry->prev = l_prev; + VML_PREV_SET(new_hole_entry, l_prev); - l_prev->vme_next = l_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry); + l_prev->vme_next = CAST_TO_VM_MAP_ENTRY(new_hole_entry); + VME_PREV_SET(l_next, CAST_TO_VM_MAP_ENTRY(new_hole_entry)); } } else { l_next = hole_entry->vme_next; - l_prev = VME_PREV(hole_entry->vme_next); - new_hole_entry->prev = hole_entry; + VML_PREV_SET(new_hole_entry, hole_entry); + new_hole_entry->next = l_next; hole_entry->vme_next = CAST_TO_VM_MAP_ENTRY(new_hole_entry); - l_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry); +
VME_PREV_SET(l_next, CAST_TO_VM_MAP_ENTRY(new_hole_entry)); } new_hole_entry->start = old_entry->vme_start; @@ -540,9 +542,9 @@ update_holes_on_entry_creation(vm_map_t map, vm_map_entry_t new_entry) copy_hole_info(hole_entry, &old_hole_entry); #endif /* DEBUG */ - new_hole_entry->prev = hole_entry; + VML_PREV_SET(new_hole_entry, hole_entry); new_hole_entry->next = hole_entry->vme_next; - hole_entry->vme_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry); + VME_PREV_SET(hole_entry->vme_next, CAST_TO_VM_MAP_ENTRY(new_hole_entry)); hole_entry->vme_next = CAST_TO_VM_MAP_ENTRY(new_hole_entry); new_hole_entry->start = new_entry->vme_end; @@ -617,9 +619,9 @@ update_holes_on_entry_creation(vm_map_t map, vm_map_entry_t new_entry) } panic("Illegal action: h1: %p, s:0x%llx, e:0x%llx...h2:%p, s:0x%llx, e:0x%llx...h3:0x%p, s:0x%llx, e:0x%llx", - hole_entry->vme_prev, - (unsigned long long)hole_entry->vme_prev->vme_start, - (unsigned long long)hole_entry->vme_prev->vme_end, + VME_PREV(hole_entry), + (unsigned long long)VME_PREV(hole_entry)->vme_start, + (unsigned long long)VME_PREV(hole_entry)->vme_end, hole_entry, (unsigned long long)hole_entry->vme_start, (unsigned long long)hole_entry->vme_end, diff --git a/osfmk/vm/vm_map_xnu.h b/osfmk/vm/vm_map_xnu.h index a964ff312..5e9d1f384 100644 --- a/osfmk/vm/vm_map_xnu.h +++ b/osfmk/vm/vm_map_xnu.h @@ -31,6 +31,7 @@ #ifdef XNU_KERNEL_PRIVATE +#include #include #include @@ -71,7 +72,6 @@ extern kern_return_t vm_map_exec( typedef struct vm_map_entry *vm_map_entry_t; #define VM_MAP_ENTRY_NULL ((vm_map_entry_t) NULL) - #define named_entry_lock_init(object) lck_mtx_init(&(object)->Lock, &vm_object_lck_grp, &vm_object_lck_attr) #define named_entry_lock_destroy(object) lck_mtx_destroy(&(object)->Lock, &vm_object_lck_grp) #define named_entry_lock(object) lck_mtx_lock(&(object)->Lock) @@ -167,7 +167,6 @@ struct vm_named_entry { struct vm_map_entry { struct vm_map_links links; /* links to other entries */ -#define vme_prev links.prev #define vme_next links.next #define vme_start links.start #define vme_end links.end @@ -244,7 +243,6 @@ struct vm_map_entry { /* boolean_t */ no_cache:1, /* should new pages be cached? */ /* boolean_t */ vme_permanent:1, /* mapping can not be removed */ /* boolean_t */ superpage_size:1, /* use superpages of a certain size */ - /* boolean_t */ map_aligned:1, /* align to map's page size */ /* * zero out the wired pages of this entry * if is being deleted without unwiring them @@ -260,7 +258,8 @@ struct vm_map_entry { /* boolean_t */ vme_xnu_user_debug:1, /* boolean_t */ vme_no_copy_on_read:1, /* boolean_t */ translated_allow_execute:1, /* execute in translated processes */ - /* boolean_t */ vme_kernel_object:1; /* vme_object is a kernel_object */ + /* boolean_t */ vme_kernel_object:1, /* vme_object is a kernel_object */ + /* boolean_t */ __unused:1; unsigned short wired_count; /* can be paged if = 0 */ unsigned short user_wired_count; /* for vm_wire */ @@ -457,8 +456,14 @@ struct _vm_map { /* boolean_t */ uses_user_ranges:1, /* has the map been configured to use user VM ranges */ /* boolean_t */ tpro_enforcement:1, /* enforce TPRO propagation */ /* boolean_t */ corpse_source:1, /* map is being used to create a corpse for diagnostics.*/ + /* boolean_t */ cs_platform_binary:1, /* map belongs to a platform binary */ + +#define VM_MAP_NOT_SEALED 0 /* map is not sealed and may be freely modified. */ +#define VM_MAP_WILL_BE_SEALED 1 /* map will be sealed and is subject to limited modification. 
*/ +#define VM_MAP_SEALED 2 /* map is sealed and should not be modified. */ + /* unsigned int */ vmmap_sealed:2, /* sealed state of map, see definitions above. */ /* reserved */ res0:1, - /* reserved */pad:9; + /* reserved */pad:6; unsigned int timestamp; /* Version number */ /* * Weak reference to the task that owns this map. This will be NULL if the @@ -467,12 +472,46 @@ struct _vm_map { * if owning_task is not NULL, since vm_map_terminate requires the map lock. */ task_t owning_task; + + /* + * A generation ID for maps that increments monotonically. + * This is a pointer type just so we get dPAC out-of-the-box, but + * conceptually it's just an ID. + * Note that this is not a unique object ID. In particular, fork() + * will produce a child map with the same ID as its parent. + */ + vm_map_serial_t serial_id; }; +#define VME_PREV(entry) VM_PREV_UNPACK((entry)->links.prev) +#define VMH_PREV(hdr) (VM_PREV_UNPACK((hdr)->links.prev)) +#define VML_PREV(links) (VM_PREV_UNPACK((links)->prev)) + +static inline +void +VME_PREV_SET(vm_map_entry_t entry, vm_map_entry_t prev) +{ + entry->links.prev = VM_PREV_PACK(prev); +} + +static inline +void +VMH_PREV_SET(struct vm_map_header * hdr, vm_map_entry_t prev) +{ + hdr->links.prev = VM_PREV_PACK(prev); +} + +static inline +void +VML_PREV_SET(struct vm_map_links * links, vm_map_entry_t prev) +{ + links->prev = VM_PREV_PACK(prev); +} + #define CAST_TO_VM_MAP_ENTRY(x) ((struct vm_map_entry *)(uintptr_t)(x)) #define vm_map_to_entry(map) CAST_TO_VM_MAP_ENTRY(&(map)->hdr.links) #define vm_map_first_entry(map) ((map)->hdr.links.next) -#define vm_map_last_entry(map) ((map)->hdr.links.prev) +#define vm_map_last_entry(map) (VME_PREV(vm_map_to_entry(map))) /* * Type: vm_map_version_t [exported; contents invisible] @@ -534,7 +573,9 @@ struct vm_map_copy { vm_map_size_t size; union { struct vm_map_header hdr; /* ENTRY_LIST */ - void *XNU_PTRAUTH_SIGNED_PTR("vm_map_copy.kdata") kdata; /* KERNEL_BUFFER */ + struct { + void *XNU_PTRAUTH_SIGNED_PTR("vm_map_copy.kdata") kdata; /* KERNEL_BUFFER */ + } buffer_data; } c_u; }; @@ -550,7 +591,7 @@ ZONE_DECLARE_ID(ZONE_ID_VM_MAP, struct _vm_map); #define cpy_hdr c_u.hdr -#define cpy_kdata c_u.kdata +#define cpy_kdata c_u.buffer_data.kdata #define VM_MAP_COPY_PAGE_SHIFT(copy) ((copy)->cpy_hdr.page_shift) #define VM_MAP_COPY_PAGE_SIZE(copy) (1 << VM_MAP_COPY_PAGE_SHIFT((copy))) @@ -564,7 +605,7 @@ ZONE_DECLARE_ID(ZONE_ID_VM_MAP, struct _vm_map); #define vm_map_copy_first_entry(copy) \ ((copy)->cpy_hdr.links.next) #define vm_map_copy_last_entry(copy) \ - ((copy)->cpy_hdr.links.prev) + (VM_PREV_UNPACK((copy)->cpy_hdr.links.prev)) /* @@ -575,6 +616,8 @@ ZONE_DECLARE_ID(ZONE_ID_VM_MAP, struct _vm_map); * (See vm_map.c::vm_remap()) */ +#include + #define vm_map_lock_init(map) \ ((map)->timestamp = 0 , \ lck_rw_init(&(map)->lock, &vm_map_lck_grp, &vm_map_lck_rw_attr)) @@ -582,12 +625,25 @@ ZONE_DECLARE_ID(ZONE_ID_VM_MAP, struct _vm_map); #define vm_map_lock(map) \ MACRO_BEGIN \ DTRACE_VM(vm_map_lock_w); \ + vmlp_lock_event_unlocked(VMLP_EVENT_LOCK_REQ_EXCL, map); \ + assert(!vm_map_is_sealed(map)); \ lck_rw_lock_exclusive(&(map)->lock); \ + vmlp_lock_event_locked(VMLP_EVENT_LOCK_GOT_EXCL, map); \ + MACRO_END + +#define vm_map_lock_unseal(map) \ + MACRO_BEGIN \ + DTRACE_VM(vm_map_lock_w); \ + assert(vm_map_is_sealed(map)); \ + lck_rw_lock_exclusive(&(map)->lock); \ + (map)->vmmap_sealed = VM_MAP_NOT_SEALED; \ MACRO_END #define vm_map_unlock(map) \ MACRO_BEGIN \ DTRACE_VM(vm_map_unlock_w); \ + 
vmlp_lock_event_locked(VMLP_EVENT_LOCK_UNLOCK_EXCL, map); \ + assert(!vm_map_is_sealed(map)); \ (map)->timestamp++; \ lck_rw_done(&(map)->lock); \ MACRO_END @@ -595,18 +651,22 @@ ZONE_DECLARE_ID(ZONE_ID_VM_MAP, struct _vm_map); #define vm_map_lock_read(map) \ MACRO_BEGIN \ DTRACE_VM(vm_map_lock_r); \ + vmlp_lock_event_unlocked(VMLP_EVENT_LOCK_REQ_SH, map); \ lck_rw_lock_shared(&(map)->lock); \ + vmlp_lock_event_locked(VMLP_EVENT_LOCK_GOT_SH, map); \ MACRO_END #define vm_map_unlock_read(map) \ MACRO_BEGIN \ DTRACE_VM(vm_map_unlock_r); \ + vmlp_lock_event_locked(VMLP_EVENT_LOCK_UNLOCK_SH, map); \ lck_rw_done(&(map)->lock); \ MACRO_END #define vm_map_lock_write_to_read(map) \ MACRO_BEGIN \ DTRACE_VM(vm_map_lock_downgrade); \ + vmlp_lock_event_locked(VMLP_EVENT_LOCK_DOWNGRADE, map); \ (map)->timestamp++; \ lck_rw_lock_exclusive_to_shared(&(map)->lock); \ MACRO_END @@ -652,11 +712,17 @@ extern void vm_map_reference( /* * Wait and wakeup macros for in_transition map entries. */ -#define vm_map_entry_wait(map, interruptible) \ - ((map)->timestamp++ , \ - lck_rw_sleep(&(map)->lock, LCK_SLEEP_EXCLUSIVE|LCK_SLEEP_PROMOTED_PRI, \ - (event_t)&(map)->hdr, interruptible)) - +static inline wait_result_t +_vm_map_entry_wait_helper(vm_map_t map, wait_interrupt_t interruptible) +{ + vmlp_lock_event_locked(VMLP_EVENT_LOCK_SLEEP_BEGIN, map); + map->timestamp++; + wait_result_t res = lck_rw_sleep(&map->lock, LCK_SLEEP_EXCLUSIVE | LCK_SLEEP_PROMOTED_PRI, + (event_t)&map->hdr, interruptible); + vmlp_lock_event_locked(VMLP_EVENT_LOCK_SLEEP_END, map); + return res; +} +#define vm_map_entry_wait(map, interruptible) _vm_map_entry_wait_helper((map), (interruptible)) #define vm_map_entry_wakeup(map) \ thread_wakeup((event_t)(&(map)->hdr)) @@ -722,6 +788,9 @@ extern size_t ml_get_vm_reserved_regions( */ extern void ml_fp_save_area_prealloc(void); +extern bool vm_map_is_sealed( + vm_map_t map); + #endif /* MACH_KERNEL_PRIVATE */ /* @@ -744,8 +813,14 @@ extern vm_map_size_t vm_map_adjusted_size(vm_map_t map); typedef struct { vm_map_t map; task_t task; + boolean_t sec_overridden; } vm_map_switch_context_t; -extern vm_map_switch_context_t vm_map_switch_to(vm_map_t map); +extern vm_map_switch_context_t vm_map_switch_with_sec_override(vm_map_t, boolean_t sec_override); +static inline vm_map_switch_context_t +vm_map_switch_to(vm_map_t map) +{ + return vm_map_switch_with_sec_override(map, FALSE); +} extern void vm_map_switch_back(vm_map_switch_context_t ctx); extern boolean_t vm_map_cs_enforcement( @@ -902,6 +977,12 @@ extern boolean_t vm_map_has_hard_pagezero( extern void vm_commit_pagezero_status(vm_map_t tmap); +extern void vm_map_set_platform_binary( + vm_map_t map, + bool is_platform_binary); +extern bool vm_map_is_platform_binary( + vm_map_t map); + extern boolean_t vm_map_tpro( vm_map_t map); @@ -909,6 +990,7 @@ extern void vm_map_set_tpro( vm_map_t map); + extern void vm_map_set_tpro_enforcement( vm_map_t map); @@ -1061,7 +1143,6 @@ extern pmap_t vm_map_get_pmap(vm_map_t map); extern void vm_map_guard_exception(vm_map_offset_t gap_start, unsigned reason); - extern bool vm_map_is_corpse_source(vm_map_t map); extern void vm_map_set_corpse_source(vm_map_t map); extern void vm_map_unset_corpse_source(vm_map_t map); @@ -1122,6 +1203,18 @@ extern kern_return_t vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_ kern_return_t (^entry_handler)(void* entry)); extern kern_return_t vm_map_dump_entry_and_compressor_pager(void* entry, char *buf, size_t *count); +extern void vm_map_testing_make_sealed_submap( + vm_map_t 
parent_map, + mach_vm_address_t start, + mach_vm_address_t end); + +extern void vm_map_testing_remap_submap( + vm_map_t parent_map, + mach_vm_address_t submap_base_address, + mach_vm_address_t start, + mach_vm_address_t end, + mach_vm_address_t offset); + #endif /* DEVELOPMENT || DEBUG */ boolean_t kdp_vm_map_is_acquired_exclusive(vm_map_t map); @@ -1135,6 +1228,9 @@ int vm_map_shadow_max(vm_map_t map); bool vm_map_is_map_size_valid(vm_map_t target_map, vm_size_t size, bool no_soft_limit); +/* Returns the map's ID or VM_MAP_SERIAL_NONE if the input map is NULL */ +vm_map_serial_t vm_map_maybe_serial_id(vm_map_t maybe_vm_map); + __END_DECLS #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_memory_entry.c b/osfmk/vm/vm_memory_entry.c index 0b9aedfc6..dcc878884 100644 --- a/osfmk/vm/vm_memory_entry.c +++ b/osfmk/vm/vm_memory_entry.c @@ -43,6 +43,7 @@ static void mach_memory_entry_no_senders(ipc_port_t, mach_port_mscount_t); IPC_KOBJECT_DEFINE(IKOT_NAMED_ENTRY, + .iko_op_movable_send = true, .iko_op_stable = true, .iko_op_no_senders = mach_memory_entry_no_senders); @@ -240,28 +241,6 @@ mach_make_memory_entry_mem_only( return KERN_SUCCESS; } -#if CONFIG_PROB_GZALLOC -static inline vm_map_offset_ut -vm_memory_entry_pgz_decode_offset( - vm_map_t target_map, - vm_map_offset_ut offset_u, - memory_object_size_ut *size_u __unused) -{ - if (target_map == NULL || target_map->pmap == kernel_pmap) { - vm_map_offset_t pgz_offset; - - /* - * It's ok to unsafe unwrap because PGZ does not ship to - * customers. - */ - pgz_offset = pgz_decode(VM_SANITIZE_UNSAFE_UNWRAP(offset_u), - VM_SANITIZE_UNSAFE_UNWRAP(*size_u)); - return vm_sanitize_wrap_addr(pgz_offset); - } - return offset_u; -} -#endif /* CONFIG_PROB_GZALLOC */ - static __attribute__((always_inline, warn_unused_result)) kern_return_t mach_make_memory_entry_generic_sanitize( @@ -326,14 +305,6 @@ mach_make_memory_entry_named_create( size_u, offset_u, permission, user_entry, object_handle); } -#if CONFIG_PROB_GZALLOC - /* - * If offset is PGZ protected we need PGZ to fix it up to the right - * value prior to validation and use. - */ - offset_u = vm_memory_entry_pgz_decode_offset(target_map, offset_u, size_u); -#endif /* CONFIG_PROB_GZALLOC */ - /* * Sanitize addr and size. Permimssions have been sanitized prior to * dispatch @@ -370,7 +341,7 @@ mach_make_memory_entry_named_create( } #endif /* __LP64__ */ - object = vm_object_allocate(map_size); + object = vm_object_allocate(map_size, vm_map_maybe_serial_id(target_map)); assert(object != VM_OBJECT_NULL); vm_object_lock(object); @@ -530,14 +501,6 @@ mach_make_memory_entry_copy( size_u, offset_u, permission, user_entry, object_handle); } -#if CONFIG_PROB_GZALLOC - /* - * If offset is PGZ protected we need PGZ to fix it up to the right - * value prior to validation and use. - */ - offset_u = vm_memory_entry_pgz_decode_offset(target_map, offset_u, size_u); -#endif /* CONFIG_PROB_GZALLOC */ - /* * Sanitize addr and size. Permimssions have been sanitized prior to * dispatch @@ -643,18 +606,14 @@ mach_make_memory_entry_share( vm_map_size_t map_size; vm_map_offset_t map_start, map_end, offset; - if (VM_SANITIZE_UNSAFE_IS_ZERO(*size_u)) { - return mach_make_memory_entry_cleanup(KERN_INVALID_ARGUMENT, target_map, - size_u, offset_u, permission, user_entry, object_handle); - } + vmlp_api_start(MACH_MAKE_MEMORY_ENTRY_SHARE); -#if CONFIG_PROB_GZALLOC - /* - * If offset is PGZ protected we need PGZ to fix it up to the right - * value prior to validation and use. 
- */ - offset_u = vm_memory_entry_pgz_decode_offset(target_map, offset_u, size_u); -#endif /* CONFIG_PROB_GZALLOC */ + if (VM_SANITIZE_UNSAFE_IS_ZERO(*size_u)) { + kr = mach_make_memory_entry_cleanup(KERN_INVALID_ARGUMENT, target_map, + size_u, offset_u, permission, user_entry, object_handle); + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, kr); + return kr; + } /* * Sanitize addr and size. Permimssions have been sanitized prior to @@ -668,8 +627,10 @@ mach_make_memory_entry_share( &map_size, &offset); if (__improbable(kr != KERN_SUCCESS)) { - return mach_make_memory_entry_cleanup(kr, target_map, - size_u, offset_u, permission, user_entry, object_handle); + kr = mach_make_memory_entry_cleanup(kr, target_map, + size_u, offset_u, permission, user_entry, object_handle); + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, kr); + return kr; } assert(map_size != 0); @@ -678,12 +639,16 @@ mach_make_memory_entry_share( &mask_protections, &use_data_addr, &use_4K_compat); if (target_map == VM_MAP_NULL) { - return mach_make_memory_entry_cleanup(KERN_INVALID_TASK, target_map, - size_u, offset_u, permission, user_entry, object_handle); + kr = mach_make_memory_entry_cleanup(KERN_INVALID_TASK, target_map, + size_u, offset_u, permission, user_entry, object_handle); + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, kr); + return kr; } vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; - vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA; + vmk_flags.vmkf_range_id = kmem_needs_data_share_range() ? + KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA; + parent_copy_entry = VM_MAP_ENTRY_NULL; if (!(permission & MAP_MEM_VM_SHARE)) { vm_map_t tmp_map, real_map; @@ -782,8 +747,10 @@ mach_make_memory_entry_share( VM_INHERIT_SHARE, vmk_flags); if (kr != KERN_SUCCESS) { - return mach_make_memory_entry_cleanup(kr, target_map, - size_u, offset_u, permission, user_entry, object_handle); + kr = mach_make_memory_entry_cleanup(kr, target_map, + size_u, offset_u, permission, user_entry, object_handle); + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, kr); + return kr; } assert(copy != VM_MAP_COPY_NULL); @@ -796,9 +763,11 @@ mach_make_memory_entry_share( if (protections == VM_PROT_NONE) { /* no access at all: fail */ vm_map_copy_discard(copy); - return mach_make_memory_entry_cleanup(KERN_PROTECTION_FAILURE, - target_map, size_u, offset_u, permission, user_entry, - object_handle); + kr = mach_make_memory_entry_cleanup(KERN_PROTECTION_FAILURE, + target_map, size_u, offset_u, permission, user_entry, + object_handle); + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, kr); + return kr; } } else { /* @@ -810,9 +779,11 @@ mach_make_memory_entry_share( /* XXX FBDP TODO: no longer needed? 
*/ if ((cur_prot & protections) != protections) { vm_map_copy_discard(copy); - return mach_make_memory_entry_cleanup(KERN_PROTECTION_FAILURE, - target_map, size_u, offset_u, permission, user_entry, - object_handle); + kr = mach_make_memory_entry_cleanup(KERN_PROTECTION_FAILURE, + target_map, size_u, offset_u, permission, user_entry, + object_handle); + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, kr); + return kr; } } @@ -844,11 +815,25 @@ mach_make_memory_entry_share( DEBUG4K_MEMENTRY("map %p offset 0x%llx size 0x%llx prot 0x%x -> " "entry %p kr 0x%x\n", target_map, offset, VM_SANITIZE_UNSAFE_UNWRAP(*size_u), permission, user_entry, KERN_SUCCESS); + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, KERN_SUCCESS); return KERN_SUCCESS; } /* no match: we need to create a new entry */ object = VME_OBJECT(copy_entry); + + if (object == VM_OBJECT_NULL) { + /* object can be null when protection == max_protection == VM_PROT_NONE + * return a failure because the code that follows and other APIs that consume + * a named-entry expect to have non-null object */ + vm_map_copy_discard(copy); + kr = mach_make_memory_entry_cleanup(KERN_PROTECTION_FAILURE, + target_map, size_u, offset_u, permission, user_entry, + object_handle); + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, kr); + return kr; + } + vm_object_lock(object); wimg_mode = object->wimg_bits; if (!(object->nophyscache)) { @@ -888,7 +873,8 @@ mach_make_memory_entry_share( if (VM_OBJECT_OWNER(VME_OBJECT(copy_entry)) == TASK_NULL) { object = VME_OBJECT(copy_entry); if (object && !object->internal) { - /* external objects can be "owned" */ + /* external objects can be "owned", + * is_fully_owned remains TRUE as far as this entry is concerned */ continue; } /* this memory is not "owned" */ @@ -897,17 +883,18 @@ mach_make_memory_entry_share( } } } else { + assert3p(object, !=, VM_OBJECT_NULL); /* Sanity, this was set above */ user_entry->is_object = TRUE; + assert3p(object, ==, vm_named_entry_to_vm_object(user_entry)); /* Sanity, this was set above */ user_entry->internal = object->internal; user_entry->offset = VME_OFFSET(vm_map_copy_first_entry(copy)); user_entry->access = GET_MAP_MEM(permission); /* is all memory in this named entry "owned"? */ user_entry->is_fully_owned = FALSE; - object = vm_named_entry_to_vm_object(user_entry); if (VM_OBJECT_OWNER(object) != TASK_NULL) { /* object is owned */ user_entry->is_fully_owned = TRUE; - } else if (object && !object->internal) { + } else if (!object->internal) { /* external objects can become "owned" */ user_entry->is_fully_owned = TRUE; } @@ -918,6 +905,8 @@ mach_make_memory_entry_share( DEBUG4K_MEMENTRY("map %p offset 0x%llx size 0x%llx prot 0x%x -> entry " "%p kr 0x%x\n", target_map, offset, VM_SANITIZE_UNSAFE_UNWRAP(*size_u), permission, user_entry, KERN_SUCCESS); + + vmlp_api_end(MACH_MAKE_MEMORY_ENTRY_SHARE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -980,7 +969,7 @@ mach_make_memory_entry_from_parent_entry_sanitize( * Additional checks to make sure explicitly computed aligned start and end * still make sense. 
*/ - if (__improbable(*map_end < *map_start) || (*map_end > parent_entry->size)) { + if (__improbable(*map_end <= *map_start) || (*map_end > parent_entry->size)) { return KERN_INVALID_ARGUMENT; } @@ -1246,9 +1235,8 @@ mach_memory_entry_allocate(ipc_port_t *user_handle_p) Z_WAITOK | Z_ZERO | Z_NOFAIL); named_entry_lock_init(user_entry); - *user_handle_p = ipc_kobject_alloc_port((ipc_kobject_t)user_entry, - IKOT_NAMED_ENTRY, - IPC_KOBJECT_ALLOC_MAKE_SEND | IPC_KOBJECT_ALLOC_NSREQUEST); + *user_handle_p = ipc_kobject_alloc_port(user_entry, IKOT_NAMED_ENTRY, + IPC_KOBJECT_ALLOC_MAKE_SEND); #if VM_NAMED_ENTRY_DEBUG /* backtrace at allocation time, for debugging only */ @@ -1322,7 +1310,7 @@ mach_memory_object_memory_entry_64( } if (pager == MEMORY_OBJECT_NULL && internal) { - object = vm_object_allocate(size); + object = vm_object_allocate(size, VM_MAP_SERIAL_NONE); if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; } @@ -1594,23 +1582,19 @@ mach_memory_entry_ownership( } #endif /* DEVELOPMENT || DEBUG */ if (!transfer_ok) { -#define TRANSFER_ENTITLEMENT_MAX_LENGTH 1024 /* XXX ? */ - const char *our_id, *their_id; + char *our_id, *their_id; our_id = IOTaskGetEntitlement(current_task(), "com.apple.developer.memory.transfer-send"); their_id = IOTaskGetEntitlement(owner, "com.apple.developer.memory.transfer-accept"); if (our_id && their_id && - !strncmp(our_id, their_id, TRANSFER_ENTITLEMENT_MAX_LENGTH)) { + !strcmp(our_id, their_id)) { /* These are guaranteed to be null-terminated */ /* allow transfer between tasks that have matching entitlements */ - if (strnlen(our_id, TRANSFER_ENTITLEMENT_MAX_LENGTH) < TRANSFER_ENTITLEMENT_MAX_LENGTH && - strnlen(their_id, TRANSFER_ENTITLEMENT_MAX_LENGTH) < TRANSFER_ENTITLEMENT_MAX_LENGTH) { - transfer_ok = true; - } else { - /* complain about entitlement(s) being too long... 
*/ - assertf((strlen(our_id) <= TRANSFER_ENTITLEMENT_MAX_LENGTH && - strlen(their_id) <= TRANSFER_ENTITLEMENT_MAX_LENGTH), - "our_id:%lu their_id:%lu", - strlen(our_id), strlen(their_id)); - } + transfer_ok = true; + } + if (our_id) { + kfree_data_addr(our_id); + } + if (their_id) { + kfree_data_addr(their_id); } } if (!transfer_ok) { @@ -1735,7 +1719,7 @@ mach_memory_entry_ownership_from_user( } if (IP_VALID(owner_port)) { - if (ip_kotype(owner_port) == IKOT_TASK_ID_TOKEN) { + if (ip_type(owner_port) == IKOT_TASK_ID_TOKEN) { task_id_token_t token = convert_port_to_task_id_token(owner_port); (void)task_identity_token_get_task_grp(token, &owner, TASK_GRP_MIG); task_id_token_release(token); @@ -1770,8 +1754,9 @@ mach_memory_entry_ownership_from_user( kern_return_t mach_memory_entry_get_page_counts( ipc_port_t entry_port, - unsigned int *resident_page_count, - unsigned int *dirty_page_count) + uint64_t *resident_page_count, + uint64_t *dirty_page_count, + uint64_t *swapped_page_count) { kern_return_t kr; vm_named_entry_t mem_entry; @@ -1808,7 +1793,7 @@ mach_memory_entry_get_page_counts( named_entry_unlock(mem_entry); - kr = vm_object_get_page_counts(object, offset, size, resident_page_count, dirty_page_count); + kr = vm_object_get_page_counts(object, offset, size, resident_page_count, dirty_page_count, swapped_page_count); vm_object_unlock(object); @@ -1999,7 +1984,7 @@ void mach_memory_entry_port_release( ipc_port_t port) { - assert(ip_kotype(port) == IKOT_NAMED_ENTRY); + assert(ip_type(port) == IKOT_NAMED_ENTRY); ipc_port_release_send(port); } @@ -2012,6 +1997,35 @@ mach_memory_entry_from_port(ipc_port_t port) return NULL; } +void +mach_memory_entry_describe( + vm_named_entry_t named_entry, + kobject_description_t desc) +{ + vm_object_t vm_object; + if (named_entry->is_object) { + vm_object = vm_named_entry_to_vm_object(named_entry); + vm_object_size_t size = vm_object->internal ? + vm_object->vo_un1.vou_size : 0; + snprintf(desc, KOBJECT_DESCRIPTION_LENGTH, + "VM-OBJECT(0x%x, %lluKiB)", + VM_OBJECT_ID(vm_object), + BtoKiB(size)); + } else if (named_entry->is_copy) { + vm_map_copy_t copy_map = named_entry->backing.copy; + snprintf(desc, KOBJECT_DESCRIPTION_LENGTH, + "VM-MAP-COPY(0x%lx, %lluKiB)", + VM_KERNEL_ADDRHASH(copy_map), + BtoKiB(copy_map->size)); + } else if (named_entry->is_sub_map) { + vm_map_t submap = named_entry->backing.map; + snprintf(desc, KOBJECT_DESCRIPTION_LENGTH, + "VM-SUB-MAP(0x%lx, %lluKiB)", + VM_KERNEL_ADDRHASH(submap), + BtoKiB(submap->size)); + } +} + /* * mach_memory_entry_no_senders: * @@ -2180,7 +2194,9 @@ memory_entry_check_for_adjustment( vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL; assert(port); - assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port)); + assertf(ip_type(port) == IKOT_NAMED_ENTRY, + "Port Type expected: %d...received:%d\n", + IKOT_NAMED_ENTRY, ip_type(port)); vm_named_entry_t named_entry; @@ -2211,21 +2227,34 @@ memory_entry_check_for_adjustment( return kr; } +vm_named_entry_t +vm_convert_port_to_named_entry( + ipc_port_t port) +{ + /* Invalid / wrong port type? */ + if (!IP_VALID(port) || ip_type(port) != IKOT_NAMED_ENTRY) { + return NULL; + } + + vm_named_entry_t named_entry = mach_memory_entry_from_port(port); + + /* This is a no-op, it's here for reader clarity */ + if (!named_entry) { + return NULL; + } + + return named_entry; +} + vm_object_t vm_convert_port_to_copy_object( ipc_port_t port) { - /* Invalid / wrong port type? 
*/ - if (!IP_VALID(port) || ip_kotype(port) != IKOT_NAMED_ENTRY) { - return NULL; - } - + vm_named_entry_t named_entry = vm_convert_port_to_named_entry(port); /* We expect the named entry to point to an object. */ - vm_named_entry_t named_entry = mach_memory_entry_from_port(port); if (!named_entry || !named_entry->is_object) { return NULL; } - /* Pull out the copy map object... */ return vm_named_entry_to_vm_object(named_entry); } diff --git a/osfmk/vm/vm_memory_entry.h b/osfmk/vm/vm_memory_entry.h index c0c9d7d25..1e8badfe4 100644 --- a/osfmk/vm/vm_memory_entry.h +++ b/osfmk/vm/vm_memory_entry.h @@ -49,6 +49,18 @@ extern kern_return_t mach_memory_entry_range_op( int *range); #endif /* XNU_PLATFORM_MacOSX */ +/* + * Routine: vm_convert_port_to_named_entry + * Purpose: + * Convert from a port specifying a named entry + * backed by a copy map to the named entry itself. + * Returns NULL if the port does not refer to a named entry. + * Conditions: + * Nothing locked. + */ +extern vm_named_entry_t vm_convert_port_to_named_entry( + ipc_port_t port); + /* * Routine: vm_convert_port_to_copy_object * Purpose: diff --git a/osfmk/vm/vm_memory_entry_xnu.h b/osfmk/vm/vm_memory_entry_xnu.h index e8844d35d..ae4b908ab 100644 --- a/osfmk/vm/vm_memory_entry_xnu.h +++ b/osfmk/vm/vm_memory_entry_xnu.h @@ -30,6 +30,8 @@ #define _VM_VM_MEMORY_ENTRY_XNU_H_ #ifdef XNU_KERNEL_PRIVATE +#include +#include #include __BEGIN_DECLS @@ -38,6 +40,8 @@ extern void mach_memory_entry_port_release(ipc_port_t port); extern vm_named_entry_t mach_memory_entry_from_port(ipc_port_t port); extern struct vm_named_entry *mach_memory_entry_allocate(ipc_port_t *user_handle_p); +extern void mach_memory_entry_describe(vm_named_entry_t named_entry, kobject_description_t desc); + __END_DECLS #endif /* XNU_KERNEL_PRIVATE */ #endif /* _VM_VM_MEMORY_ENTRY_XNU_H_ */ diff --git a/osfmk/vm/vm_memtag.c b/osfmk/vm/vm_memtag.c index c3bcbb2a6..f24281d33 100644 --- a/osfmk/vm/vm_memtag.c +++ b/osfmk/vm/vm_memtag.c @@ -36,6 +36,18 @@ vm_map_address_t vm_memtag_canonicalize(vm_map_t map, vm_map_address_t addr) { + assert(map); + + /* With no pmap assigned we cannot make a decision. Leave the address as is */ + if (map->pmap == NULL) { + return addr; + } + + /* NULL is a frequent enough special case. */ + if (addr == (vm_map_address_t)NULL) { + return addr; + } + return (map->pmap == kernel_pmap) ? (vm_map_address_t)vm_memtag_canonicalize_kernel(addr) : (vm_map_address_t)vm_memtag_canonicalize_user(addr); diff --git a/osfmk/vm/vm_memtag.h b/osfmk/vm/vm_memtag.h index 9474e3a26..ad7d37bf8 100644 --- a/osfmk/vm/vm_memtag.h +++ b/osfmk/vm/vm_memtag.h @@ -169,13 +169,17 @@ vm_memtag_extract_tag(vm_map_address_t tagged_ptr) } /* - * when passed a tagged pointer, strip away the tag bits and return the - * canonical address. Since it's used in a number of frequently called checks + * when passed a tagged pointer, strip away only the tag bits with their canonical + * value. Since these are used in a number of frequently called checks * (e.g. when packing VM pointers), the following definition hardcodes the * tag value to achieve optimal codegen and no external calls. 
*/ -#define vm_memtag_canonicalize_kernel(addr) vm_memtag_insert_tag(addr, 0xF) -#define vm_memtag_canonicalize_user(addr) vm_memtag_insert_tag(addr, 0x0) +#ifndef __BUILDING_XNU_LIBRARY__ +#define vm_memtag_canonicalize_kernel(addr) vm_memtag_insert_tag(addr, 0xF) +#else /* __BUILDING_XNU_LIBRARY__ */ +#define vm_memtag_canonicalize_kernel(addr) vm_memtag_insert_tag(addr, 0x0) +#endif/* __BUILDING_XNU_LIBRARY__ */ +#define vm_memtag_canonicalize_user(addr) vm_memtag_insert_tag(addr, 0x0) extern vm_map_address_t vm_memtag_canonicalize(vm_map_t map, vm_map_address_t addr); diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index b6c188f66..e2dd65e1f 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #include @@ -324,6 +325,9 @@ static const struct vm_object vm_object_template = { .pages_created = 0, .pages_used = 0, .scan_collisions = 0, +#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 + .vo_chead_hint = 0, +#endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */ #if CONFIG_PHANTOM_CACHE .phantom_object_id = 0, #endif @@ -388,6 +392,7 @@ static const struct vm_object vm_object_template = { .vo_purgeable_volatilizer = NULL, .purgeable_volatilizer_bt = {0}, #endif /* DEBUG */ + .vmo_provenance = VM_MAP_SERIAL_NONE, }; LCK_GRP_DECLARE(vm_object_lck_grp, "vm_object"); @@ -482,9 +487,12 @@ vm_object_set_size( __private_extern__ void _vm_object_allocate( vm_object_size_t size, - vm_object_t object) + vm_object_t object, + vm_map_serial_t provenance) { *object = vm_object_template; + object->vmo_provenance = provenance; + vm_page_queue_init(&object->memq); #if UPL_DEBUG || CONFIG_IOSCHED queue_init(&object->uplq); @@ -505,12 +513,12 @@ _vm_object_allocate( __private_extern__ vm_object_t vm_object_allocate( - vm_object_size_t size) + vm_object_size_t size, vm_map_serial_t provenance) { vm_object_t object; object = zalloc_flags(vm_object_zone, Z_WAITOK | Z_NOFAIL); - _vm_object_allocate(size, object); + _vm_object_allocate(size, object, provenance); return object; } @@ -548,8 +556,8 @@ vm_object_bootstrap(void) * Note that in the following size specifications, we need to add 1 because * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size. */ - _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, kernel_object_default); - _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, compressor_object); + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, kernel_object_default, VM_MAP_SERIAL_SPECIAL); + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, compressor_object, VM_MAP_SERIAL_SPECIAL); kernel_object_default->copy_strategy = MEMORY_OBJECT_COPY_NONE; compressor_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; kernel_object_default->no_tag_update = TRUE; @@ -557,14 +565,14 @@ vm_object_bootstrap(void) /* * The object to hold retired VM pages. */ - _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, retired_pages_object); + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, retired_pages_object, VM_MAP_SERIAL_SPECIAL); retired_pages_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; /** * The object to hold pages owned by exclaves. */ - _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, exclaves_object); + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, exclaves_object, VM_MAP_SERIAL_SPECIAL); exclaves_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; } @@ -929,7 +937,64 @@ take_page: return NULL; } +#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 +/* This is the actual number of filling cheads that's going to be used. 
+ * must be 1 <= vm_cheads <= COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT */ +TUNABLE_WRITEABLE(uint32_t, vm_cheads, "vm_cheads", 8); +/* This determines what criteria is used for selecting the chead, + * either the PID of the grabber task or it's coalition */ +TUNABLE_WRITEABLE(vm_chead_select_t, vm_chead_select, "vm_chead_select", CSEL_BY_PID); +/* This determines if the grabber-id is set on every page-fault insert or just the first insert */ +TUNABLE_WRITEABLE(boolean_t, vm_chead_rehint, "vm_chead_rehint", false); + +/* + * This function is called from vm_page_insert_internal(). When it's called from the context + * of a vm_fault where a task has just requested a new page/paged-in a existing page, + * this function records some bits of information about the task. These bits are then + * going to be used when the page is sent to the compressor to select the compressor-head + * that will be used. + * The goal of this is to make pages that come from the same task/coalition be compressed to the + * same compressor segment, This helps the locality of swap-in and decompression. + * This optimization relies on a heuristic assumptions that the vm_object is only ever mapped + * in a single task/coalition. vm_objects that violate this would not benefit from this optimization. + * See also vm_pageout_select_filling_chead() + */ +void +vm_object_set_chead_hint( + vm_object_t object) +{ + if (!object->internal) { + /* not relevant for pages that are not going to get to the compressor */ + return; + } + + if (object->vo_chead_hint != 0 && !vm_chead_rehint) { + /* there's already a value there and we don't want to set it again */ + return; + } + task_t cur_task = current_task_early(); + if (cur_task == TASK_NULL || cur_task == kernel_task || vm_cheads <= 1) { + /* avoid doing extra work for the kernel map case */ + object->vo_chead_hint = 0; + return; + } + int value = 0; + if (vm_chead_select == CSEL_BY_PID) { + value = task_pid(cur_task); + } else if (vm_chead_select == CSEL_BY_COALITION) { + /* The choice of coalition type is not very significant here since both + * types seem to have a similar task division. */ + coalition_t coalition = task_get_coalition(cur_task, COALITION_TYPE_JETSAM); + if (coalition != COALITION_NULL) { + value = coalition_id(coalition); + } + } + uint32_t mod_by = MIN(vm_cheads, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT); + object->vo_chead_hint = (uint8_t)value % mod_by; +} + +#endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */ #define EVICT_PREPARE_LIMIT 64 #define EVICT_AGE 10 @@ -1030,6 +1095,10 @@ vm_object_cache_evict( return 0; } clock_get_system_nanotime(&sec, &nsec); + if (max_objects_to_examine == INT_MAX) { + /* evict all pages from all cached objects now */ + sec = (clock_sec_t)-1; + } /* * the object on the head of the queue has not @@ -1132,6 +1201,33 @@ vm_object_cache_evict( ep_skipped++; continue; } + if (!object->internal && + object->pager_created && + object->pager == NULL) { + /* + * This object has lost its pager, most likely + * due to a force-unmount or ungraft. The pager + * will never come back, so there's no point in + * keeping these pages, even if modified. + * The object could still be mapped, so we need + * to clear any PTE that might still be pointing + * at this physical page before we can reclaim + * it. 
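A minimal sketch of the hint computation performed by vm_object_set_chead_hint() above; the helper below is illustrative only and is not part of this patch:

/*
 * Illustrative sketch: how a grabber id (task PID or jetsam-coalition id)
 * is folded into a chead hint. Mirrors the arithmetic in
 * vm_object_set_chead_hint() above, including the truncation to 8 bits
 * before the modulo, so two ids that differ only in their upper bits can
 * land on the same filling chead.
 */
static uint8_t
example_chead_hint(int grabber_id, uint32_t cheads_configured)
{
    uint32_t mod_by = MIN(cheads_configured, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT);

    return (uint8_t)grabber_id % mod_by;
}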
+ */ + if (p->vmp_pmapped) { + int refmod; + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); + if (refmod & VM_MEM_MODIFIED) { + assert(p->vmp_wpmapped); + p->vmp_dirty = TRUE; + } + } +// printf("FBDP %s:%d object %p reason %d page %p offset 0x%llx pmapped %d wpmapped %d xpmapped %d dirty %d precious %d\n", __FUNCTION__, __LINE__, object, object->no_pager_reason, p, p->vmp_offset, p->vmp_pmapped, p->vmp_wpmapped, p->vmp_xpmapped, p->vmp_dirty, p->vmp_precious); + /* clear any reason to skip this page below */ + p->vmp_dirty = FALSE; + p->vmp_precious = FALSE; + p->vmp_wpmapped = FALSE; + } if (p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) { vm_page_queue_remove(&object->memq, p, vmp_listq); vm_page_queue_enter(&object->memq, p, vmp_listq); @@ -1220,9 +1316,23 @@ vm_object_cache_evict( vm_object_cache_pages_skipped += ep_skipped; KDBG_DEBUG(0x13001ec | DBG_FUNC_END, ep_freed); +// printf("FBDP %s(0x%x,0x%x) freed %d moved %d skipped %u\n", __func__, num_to_evict, max_objects_to_examine, ep_freed, ep_moved, ep_skipped); return ep_freed; } +int vm_object_cache_evict_all(void); +int +vm_object_cache_evict_all(void) +{ + int freed; + + vm_page_lock_queues(); + freed = vm_object_cache_evict(INT_MAX, INT_MAX); + vm_page_unlock_queues(); + printf("%s: freed %d\n", __func__, freed); + return freed; +} + /* * Routine: vm_object_terminate * Purpose: @@ -1568,7 +1678,7 @@ vm_object_reap_freelist(vm_page_t local_free_q, bool do_disconnect, bool set_cac vm_page_t page; if (local_free_q) { if (do_disconnect) { - vm_page_list_foreach(page, local_free_q) { + _vm_page_list_foreach(page, local_free_q) { if (page->vmp_pmapped) { pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page)); } @@ -2797,27 +2907,35 @@ vm_object_zero_page(vm_page_t m) kern_return_t vm_object_zero( vm_object_t object, - vm_object_offset_t cur_offset, + vm_object_offset_t *cur_offset_p, vm_object_offset_t end_offset) { kern_return_t ret; vm_object_lock_assert_exclusive(object); - ret = vm_object_zero_preflight(object, cur_offset, end_offset); + ret = vm_object_zero_preflight(object, *cur_offset_p, end_offset); if (ret != KERN_SUCCESS) { return ret; } - while (cur_offset < end_offset) { - vm_page_t m = vm_page_lookup(object, cur_offset); + while (*cur_offset_p < end_offset) { + vm_page_t m = vm_page_lookup(object, *cur_offset_p); if (m != VM_PAGE_NULL && m->vmp_busy) { vm_page_sleep(object, m, THREAD_UNINT, LCK_SLEEP_DEFAULT); /* Object lock was dropped -- reverify validity */ - ret = vm_object_zero_preflight(object, cur_offset, end_offset); + ret = vm_object_zero_preflight(object, *cur_offset_p, end_offset); if (ret != KERN_SUCCESS) { return ret; } + if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { + /* + * Our mapping could have been made "needs_copy" while + * the map and object were unlocked. + * We need to do the mapping preflight again... + */ + return KERN_SUCCESS; + } continue; } @@ -2827,12 +2945,12 @@ vm_object_zero( * we dropped the object lock during the lookup retry the lookup for the * cur_offset. 
*/ - if (page_is_paged_out(object, cur_offset)) { - vm_object_compressor_pager_state_clr(object, cur_offset); + if (page_is_paged_out(object, *cur_offset_p)) { + vm_object_compressor_pager_state_clr(object, *cur_offset_p); } else { vm_object_zero_page(m); } - cur_offset += PAGE_SIZE_64; + *cur_offset_p += PAGE_SIZE_64; /* * TODO: May need a vm_object_lock_yield_shared in this loop if it takes * too long, as holding the object lock for too long can stall pageout @@ -3150,7 +3268,7 @@ uint32_t vm_page_busy_absent_skipped = 0; * an error, this parameter will contain the value * VM_OBJECT_NULL. */ -__private_extern__ kern_return_t +__exported_hidden kern_return_t vm_object_copy_slowly( vm_object_t src_object, vm_object_offset_t src_offset, @@ -3187,7 +3305,8 @@ vm_object_copy_slowly( size = vm_object_round_page(src_offset + size) - vm_object_trunc_page(src_offset); src_offset = vm_object_trunc_page(src_offset); - new_object = vm_object_allocate(size); + + new_object = vm_object_allocate(size, src_object->vmo_provenance); new_offset = 0; if (src_object->copy_strategy == MEMORY_OBJECT_COPY_NONE && src_object->vo_inherit_copy_none) { @@ -3211,21 +3330,21 @@ vm_object_copy_slowly( ) { vm_page_t new_page; vm_fault_return_t result; + vm_grab_options_t options; - vm_object_lock(new_object); - - while ((new_page = vm_page_alloc(new_object, new_offset)) - == VM_PAGE_NULL) { - vm_object_unlock(new_object); + options = vm_page_grab_options_for_object(new_object); + while ((new_page = vm_page_grab_options(options)) == VM_PAGE_NULL) { if (!vm_page_wait(interruptible)) { vm_object_deallocate(new_object); vm_object_deallocate(src_object); *_result_object = VM_OBJECT_NULL; return MACH_SEND_INTERRUPTED; } - vm_object_lock(new_object); } + + vm_object_lock(new_object); + vm_page_insert(new_page, new_object, new_offset); vm_object_unlock(new_object); do { @@ -3575,6 +3694,8 @@ vm_object_copy_delayed( copy_size = vm_object_round_page(copy_size); Retry: + // For iOS, we want to always skip this block. For other OS types, we use the sysctl to control the flow. + #if !XNU_TARGET_OS_IOS if (!vm_object_copy_delayed_paging_wait_disable) { /* * Wait for paging in progress. 
@@ -3591,6 +3712,7 @@ Retry: vm_object_paging_wait(src_object, THREAD_UNINT); } } +#endif /* * See whether we can reuse the result of a previous @@ -3744,7 +3866,8 @@ Retry: if (new_copy == VM_OBJECT_NULL) { vm_object_unlock(old_copy); vm_object_unlock(src_object); - new_copy = vm_object_allocate(copy_size); + /* Carry over the provenance from the object that's backing us */ + new_copy = vm_object_allocate(copy_size, src_object->vmo_provenance); vm_object_lock(src_object); vm_object_lock(new_copy); @@ -3767,7 +3890,8 @@ Retry: (old_copy->vo_shadow_offset == (vm_object_offset_t) 0)); } else if (new_copy == VM_OBJECT_NULL) { vm_object_unlock(src_object); - new_copy = vm_object_allocate(copy_size); + /* Carry over the provenance from the object that's backing us */ + new_copy = vm_object_allocate(copy_size, src_object->vmo_provenance); vm_object_lock(src_object); vm_object_lock(new_copy); @@ -4070,7 +4194,7 @@ vm_object_shadow( * Allocate a new object with the given length */ - if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL) { + if ((result = vm_object_allocate(length, source->vmo_provenance)) == VM_OBJECT_NULL) { panic("vm_object_shadow: no object for shadowing"); } @@ -4212,7 +4336,8 @@ vm_object_memory_object_associate( assert(!object->pager_ready); assert(object->pager_trusted); } else { - object = vm_object_allocate(size); + /* No provenance yet */ + object = vm_object_allocate(size, VM_MAP_SERIAL_NONE); assert(object != VM_OBJECT_NULL); vm_object_lock(object); VM_OBJECT_SET_INTERNAL(object, FALSE); @@ -6163,14 +6288,14 @@ vm_object_get_page_counts( vm_object_t object, vm_object_offset_t offset, vm_object_size_t size, - unsigned int *resident_page_count, - unsigned int *dirty_page_count) + uint64_t *resident_page_count, + uint64_t *dirty_page_count, + uint64_t *swapped_page_count) { - kern_return_t kr = KERN_SUCCESS; - boolean_t count_dirty_pages = FALSE; vm_page_t p = VM_PAGE_NULL; unsigned int local_resident_count = 0; unsigned int local_dirty_count = 0; + unsigned int local_swapped_count = 0; vm_object_offset_t cur_offset = 0; vm_object_offset_t end_offset = 0; @@ -6178,57 +6303,60 @@ vm_object_get_page_counts( return KERN_INVALID_ARGUMENT; } - cur_offset = offset; - end_offset = offset + size; vm_object_lock_assert_exclusive(object); - if (dirty_page_count != NULL) { - count_dirty_pages = TRUE; - } - - if (resident_page_count != NULL && count_dirty_pages == FALSE) { + if (resident_page_count != NULL && + dirty_page_count == NULL && + offset == 0 && + object->vo_size == size) { /* * Fast path when: * - we only want the resident page count, and, * - the entire object is exactly covered by the request. */ - if (offset == 0 && (object->vo_size == size)) { - *resident_page_count = object->resident_page_count; - goto out; + local_resident_count = object->resident_page_count; + if (object->internal && object->pager != NULL) { + local_swapped_count = vm_compressor_pager_get_count(object->pager); } + goto out; } - if (object->resident_page_count <= (size >> PAGE_SHIFT)) { + if (object->resident_page_count <= (size >> PAGE_SHIFT) && + swapped_page_count == NULL) { + /* + * Faster path when we don't care about non-resident pages and the object has + * fewer resident pages than the requested range. 
+ */ vm_page_queue_iterate(&object->memq, p, vmp_listq) { if (p->vmp_offset >= cur_offset && p->vmp_offset < end_offset) { local_resident_count++; - - if (count_dirty_pages) { - if (p->vmp_dirty || (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { - local_dirty_count++; - } + if (p->vmp_dirty || + (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { + local_dirty_count++; } } } - } else { - for (cur_offset = offset; cur_offset < end_offset; cur_offset += PAGE_SIZE_64) { - p = vm_page_lookup(object, cur_offset); + goto out; + } - if (p != VM_PAGE_NULL) { - local_resident_count++; + for (cur_offset = offset; cur_offset < end_offset; cur_offset += PAGE_SIZE_64) { + p = vm_page_lookup(object, cur_offset); - if (count_dirty_pages) { - if (p->vmp_dirty || (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { - local_dirty_count++; - } - } + if (p != VM_PAGE_NULL) { + local_resident_count++; + if (p->vmp_dirty || + (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { + local_dirty_count++; } + } else if (page_is_paged_out(object, cur_offset)) { + local_swapped_count++; } } +out: if (resident_page_count != NULL) { *resident_page_count = local_resident_count; } @@ -6237,8 +6365,11 @@ vm_object_get_page_counts( *dirty_page_count = local_dirty_count; } -out: - return kr; + if (swapped_page_count != NULL) { + *swapped_page_count = local_swapped_count; + } + + return KERN_SUCCESS; } @@ -6315,7 +6446,7 @@ vm_object_transpose( * Allocate a temporary VM object to hold object1's contents * while we copy object2 to object1. */ - tmp_object = vm_object_allocate(transpose_size); + tmp_object = vm_object_allocate(transpose_size, object1->vmo_provenance); vm_object_lock(tmp_object); VM_OBJECT_SET_CAN_PERSIST(tmp_object, FALSE); @@ -6558,6 +6689,7 @@ MACRO_END assert((object1->purgable == VM_PURGABLE_DENY) || (object1->objq.prev == NULL)); assert((object2->purgable == VM_PURGABLE_DENY) || (object2->objq.next == NULL)); assert((object2->purgable == VM_PURGABLE_DENY) || (object2->objq.prev == NULL)); + __TRANSPOSE_FIELD(vmo_provenance); #undef __TRANSPOSE_FIELD @@ -6573,7 +6705,15 @@ done: * Re-initialize the temporary object to avoid * deallocating a real pager. */ - _vm_object_allocate(transpose_size, tmp_object); + _vm_object_allocate( + transpose_size, + tmp_object, + /* + * Since we're reallocating purely to deallocate, + * don't bother trying to set a sensible provenance. + */ + VM_MAP_SERIAL_NONE + ); vm_object_deallocate(tmp_object); tmp_object = VM_OBJECT_NULL; } @@ -7691,6 +7831,10 @@ vm_object_compressed_freezer_pageout( #endif /* CONFIG_FREEZE */ +uint64_t vm_object_pageout_not_on_queue = 0; +uint64_t vm_object_pageout_not_pageable = 0; +uint64_t vm_object_pageout_pageable = 0; +uint64_t vm_object_pageout_active_local = 0; void vm_object_pageout( vm_object_t object) @@ -7737,7 +7881,10 @@ ReScan: p = next; next = (vm_page_t)vm_page_queue_next(&next->vmp_listq); + vm_page_lockspin_queues(); + assert(p->vmp_q_state != VM_PAGE_ON_FREE_Q); + assert(p->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); if ((p->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) || p->vmp_cleaning || @@ -7750,16 +7897,34 @@ ReScan: /* * Page is already being cleaned or can't be cleaned. 
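A caller-side sketch for the widened page-count interface; the wrapper name below is hypothetical:

/*
 * Illustrative caller sketch: vm_object_get_page_counts() now reports
 * 64-bit counts and can also report how many of the object's pages are
 * swapped out. Each out-parameter may be NULL when the caller does not
 * need it; passing NULL for the dirty count with a whole-object range
 * takes the fast path above.
 */
static kern_return_t
example_query_counts(vm_object_t object, uint64_t *resident, uint64_t *swapped)
{
    kern_return_t kr;

    vm_object_lock(object);     /* callee asserts the exclusive lock */
    kr = vm_object_get_page_counts(object, 0, object->vo_size,
        resident, NULL, swapped);
    vm_object_unlock(object);
    return kr;
}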
*/ + vm_page_unlock_queues(); continue; } + if (p->vmp_q_state == VM_PAGE_NOT_ON_Q) { +// printf("FBDP %s:%d page %p object %p offset 0x%llx state %d not on queue\n", __FUNCTION__, __LINE__, p, VM_PAGE_OBJECT(p), p->vmp_offset, p->vmp_q_state); + vm_object_pageout_not_on_queue++; + vm_page_unlock_queues(); + continue; + } + if (!VM_PAGE_PAGEABLE(p)) { + if (p->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) { + vm_object_pageout_active_local++; + } else { + vm_object_pageout_not_pageable++; + vm_page_unlock_queues(); + continue; + } + } else { + vm_object_pageout_pageable++; + } + if (vm_compressor_low_on_space()) { + vm_page_unlock_queues(); break; } /* Throw to the pageout queue */ - vm_page_lockspin_queues(); - if (VM_PAGE_Q_THROTTLED(iq)) { iq->pgo_draining = TRUE; diff --git a/osfmk/vm/vm_object_internal.h b/osfmk/vm/vm_object_internal.h index 3da1bc023..e42d2bedf 100644 --- a/osfmk/vm/vm_object_internal.h +++ b/osfmk/vm/vm_object_internal.h @@ -67,6 +67,12 @@ extern uint16_t vm_object_pagein_throttle; &vm_object_lck_attr))) #define vm_object_lock_destroy(object) lck_rw_destroy(&(object)->Lock, &vm_object_lck_grp) +/* + * This is used whenever we try to acquire the VM object lock + * without mutex_pause. The mutex_pause is intended to let + * pageout_scan try getting the object lock if it's trying to + * reclaim pages from that object (see vm_pageout_scan_wants_object). + */ #define vm_object_lock_try_scan(object) _vm_object_lock_try(object) /* @@ -273,10 +279,11 @@ __private_extern__ void vm_object_bootstrap(void); __private_extern__ void vm_object_reaper_init(void); -__private_extern__ vm_object_t vm_object_allocate(vm_object_size_t size); +__private_extern__ vm_object_t vm_object_allocate(vm_object_size_t size, + vm_map_serial_t provenance); __private_extern__ void _vm_object_allocate(vm_object_size_t size, - vm_object_t object); + vm_object_t object, vm_map_serial_t provenance); __private_extern__ void vm_object_set_size( vm_object_t object, @@ -360,7 +367,7 @@ __private_extern__ void vm_object_reuse_pages( __private_extern__ kern_return_t vm_object_zero( vm_object_t object, - vm_object_offset_t cur_offset, + vm_object_offset_t *cur_offset_p, vm_object_offset_t end_offset); __private_extern__ uint64_t vm_object_purge( @@ -376,8 +383,9 @@ __private_extern__ kern_return_t vm_object_get_page_counts( vm_object_t object, vm_object_offset_t offset, vm_object_size_t size, - unsigned int *resident_page_count, - unsigned int *dirty_page_count); + uint64_t *resident_page_count, + uint64_t *dirty_page_count, + uint64_t *swapped_page_count); __private_extern__ boolean_t vm_object_coalesce( vm_object_t prev_object, @@ -535,6 +543,9 @@ extern kern_return_t vm_object_range_op( int ops, uint32_t *range); +__private_extern__ void vm_object_set_chead_hint( + vm_object_t object); + __private_extern__ void vm_object_reap_pages( vm_object_t object, diff --git a/osfmk/vm/vm_object_xnu.h b/osfmk/vm/vm_object_xnu.h index fe2aaa805..25d152b9c 100644 --- a/osfmk/vm/vm_object_xnu.h +++ b/osfmk/vm/vm_object_xnu.h @@ -123,7 +123,7 @@ struct vm_object_fault_info { /* boolean_t */ fi_used_for_tpro:1, /* boolean_t */ fi_change_wiring:1, /* boolean_t */ fi_no_sleep:1, - __vm_object_fault_info_unused_bits:19; + __vm_object_fault_info_unused_bits:19; int pmap_options; }; @@ -140,7 +140,7 @@ struct vm_object { * the packed pointers are required to be on a 64 byte boundary * which means 2 things for the vm_object... 
(1) the memq * struct has to be the first element of the structure so that - * we can control it's alignment... (2) the vm_object must be + * we can control its alignment... (2) the vm_object must be * aligned on a 64 byte boundary... for static vm_object's * this is accomplished via the 'aligned' attribute... for * vm_object's in the zone pool, this is accomplished by @@ -320,7 +320,7 @@ struct vm_object { vm_object_offset_t last_alloc; /* last allocation offset */ vm_offset_t cow_hint; /* last page present in */ /* shadow but not in object */ - int sequential; /* sequential access size */ + int32_t sequential; /* sequential access size */ uint32_t pages_created; uint32_t pages_used; @@ -363,7 +363,13 @@ struct vm_object { #endif /* VM_OBJECT_ACCESS_TRACKING */ uint8_t scan_collisions; - uint8_t __object4_unused_bits[1]; +#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 + /* This value is used for selecting a chead in the compressor for internal objects. + * see rdar://140849693 for a possible way to implement the chead_hint functionality + * in a way that doesn't require these bits */ + uint8_t vo_chead_hint:COMPRESSOR_PAGEOUT_CHEADS_BITS; +#endif /*COMPRESSOR_PAGEOUT_CHEADS_COUNT */ + uint8_t __object4_unused_bits:8 - COMPRESSOR_PAGEOUT_CHEADS_BITS; vm_tag_t wire_tag; #if CONFIG_PHANTOM_CACHE @@ -397,6 +403,14 @@ struct vm_object { task_t vo_purgeable_volatilizer; /* who made it volatile? */ void *purgeable_volatilizer_bt[16]; #endif /* DEBUG */ + + /* + * If this object is backed by anonymous memory, this represents the ID of + * the vm_map that the memory originated from (i.e. this points backwards in + * shadow chains). Note that an originator is present even if the object + * hasn't been faulted into the backing pmap yet. + */ + vm_map_serial_t vmo_provenance; }; #define VM_OBJECT_PURGEABLE_FAULT_ERROR(object) \ @@ -565,6 +579,12 @@ extern void vm_io_reprioritize_init(void); extern void page_worker_init(void); +__enum_closed_decl(vm_chead_select_t, uint32_t, { + CSEL_MIN = 1, + CSEL_BY_PID = 1, + CSEL_BY_COALITION = 2, + CSEL_MAX = 2 +}); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_options.h b/osfmk/vm/vm_options.h index 1ac117b7c..7ede5d23b 100644 --- a/osfmk/vm/vm_options.h +++ b/osfmk/vm/vm_options.h @@ -46,6 +46,20 @@ #define FBDP_DEBUG_OBJECT_NO_PAGER (DEVELOPMENT || DEBUG) +#if XNU_TARGET_OS_OSX && defined(__arm64__) +/* + * These control whether the compressor thread is filling more than one segment at time. It's enabled only in macOS + * since the goal is to better handle multiple processes that do page-outs at the same time. Processes in + * embedded platforms are less likely to run more than one app at a time so this optimization is less likely + * to be helpful. 
+ */ +#define COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT 16 +#define COMPRESSOR_PAGEOUT_CHEADS_BITS 4 +#else /* XNU_TARGET_OS_OSX && defined(__arm64__) */ +#define COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT 1 +#define COMPRESSOR_PAGEOUT_CHEADS_BITS 0 +#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */ + #define PAGE_SLEEP_WITH_INHERITOR (1) #endif /* __VM_VM_OPTIONS_H__ */ diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h index 67c88e181..2e732d5e4 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -82,9 +82,11 @@ #if __x86_64__ #define XNU_VM_HAS_DELAYED_PAGES 1 +#define XNU_VM_HAS_LOPAGE 1 #define XNU_VM_HAS_LINEAR_PAGES_ARRAY 0 #else #define XNU_VM_HAS_DELAYED_PAGES 0 +#define XNU_VM_HAS_LOPAGE 0 #define XNU_VM_HAS_LINEAR_PAGES_ARRAY 1 #endif @@ -127,38 +129,32 @@ __enum_closed_decl(vm_relocate_reason_t, unsigned int, { VM_RELOCATE_REASON_COUNT, }); -/* - * vm_remove_reason_t: - * A type to describe why a page is being removed from a global free queue. +/*! + * @typedef vm_memory_class_t * - * VM_REMOVE_REASON_USE: - * The page is going to be used by the system (likely through the vm_page_grab - * path). Do any state updates to the page that are relevant. - * - * VM_REMOVE_REASON_REBALANCE: - * The page is going to be put onto a different free queue. Don't do any state - * updates to the page; the client will do such updates. Structured this way - * because rebalance operations are likely to be done in bulk, so this allows - * clients to perform any operations in bulk. - */ -__enum_closed_decl(vm_remove_reason_t, unsigned int, { - VM_REMOVE_REASON_USE, - VM_REMOVE_REASON_REBALANCE, - - VM_REMOVE_REASON_COUNT, -}); - -/* - * vm_memory_class_t: + * @abstract * A type to describe what kind of memory a page represents. * - * VM_MEMORY_CLASS_REGULAR: + * @const VM_MEMORY_CLASS_REGULAR * Normal memory, which should participate in the normal page lifecycle. + * + * @const VM_MEMORY_CLASS_LOPAGE + * this exists to support hardware controllers + * incapable of generating DMAs with more than 32 bits + * of address on platforms with physical memory > 4G... + * + * @const VM_MEMORY_CLASS_SECLUDED + * Denotes memory must be put on the secluded queue, + * this is not returned by @c vm_page_get_memory_class(). */ -__enum_closed_decl(vm_memory_class_t, unsigned int, { +__enum_closed_decl(vm_memory_class_t, uint8_t, { VM_MEMORY_CLASS_REGULAR, - - VM_MEMORY_CLASS_COUNT, +#if XNU_VM_HAS_LOPAGE + VM_MEMORY_CLASS_LOPAGE, +#endif /* XNU_VM_HAS_LOPAGE */ +#if CONFIG_SECLUDED_MEMORY + VM_MEMORY_CLASS_SECLUDED, +#endif }); /* pages of compressed data */ @@ -192,97 +188,106 @@ __enum_closed_decl(vm_memory_class_t, unsigned int, { #define VM_PAGE_NULL ((vm_page_t) 0) +__enum_closed_decl(vm_page_q_state_t, uint8_t, { + VM_PAGE_NOT_ON_Q = 0, /* page is not present on any queue, nor is it wired... mainly a transient state */ + VM_PAGE_IS_WIRED = 1, /* page is currently wired */ + VM_PAGE_USED_BY_COMPRESSOR = 2, /* page is in use by the compressor to hold compressed data */ + VM_PAGE_ON_FREE_Q = 3, /* page is on the main free queue */ + VM_PAGE_ON_FREE_LOCAL_Q = 4, /* page is on one of the per-CPU free queues */ +#if XNU_VM_HAS_LOPAGE + VM_PAGE_ON_FREE_LOPAGE_Q = 5, /* page is on the lopage pool free list */ +#endif /* XNU_VM_HAS_LOPAGE */ +#if CONFIG_SECLUDED_MEMORY + VM_PAGE_ON_SECLUDED_Q = 5, /* page is on secluded queue */ +#endif /* CONFIG_SECLUDED_MEMORY */ + VM_PAGE_ON_THROTTLED_Q = 6, /* page is on the throttled queue... 
we stash anonymous pages here when not paging */ + VM_PAGE_ON_PAGEOUT_Q = 7, /* page is on one of the pageout queues (internal/external) awaiting processing */ + VM_PAGE_ON_SPECULATIVE_Q = 8, /* page is on one of the speculative queues */ + VM_PAGE_ON_ACTIVE_LOCAL_Q = 9, /* page has recently been created and is being held in one of the per-CPU local queues */ + VM_PAGE_ON_ACTIVE_Q = 10, /* page is in global active queue */ + VM_PAGE_ON_INACTIVE_INTERNAL_Q = 11, /* page is on the inactive internal queue a.k.a. anonymous queue */ + VM_PAGE_ON_INACTIVE_EXTERNAL_Q = 12, /* page in on the inactive external queue a.k.a. file backed queue */ + VM_PAGE_ON_INACTIVE_CLEANED_Q = 13, /* page has been cleaned to a backing file and is ready to be stolen */ +}); +#define VM_PAGE_Q_STATE_LAST_VALID_VALUE 13 /* we currently use 4 bits for the state... don't let this go beyond 15 */ -#define VM_PAGE_INACTIVE(m) (vm_page_inactive_states[m->vmp_q_state]) -#define VM_PAGE_PAGEABLE(m) (vm_page_pageable_states[m->vmp_q_state]) -#define VM_PAGE_NON_SPECULATIVE_PAGEABLE(m) (vm_page_non_speculative_pageable_states[m->vmp_q_state]) -#define VM_PAGE_ACTIVE_OR_INACTIVE(m) (vm_page_active_or_inactive_states[m->vmp_q_state]) +__enum_closed_decl(vm_page_specialq_t, uint8_t, { + VM_PAGE_SPECIAL_Q_EMPTY = 0, + VM_PAGE_SPECIAL_Q_BG = 1, + VM_PAGE_SPECIAL_Q_DONATE = 2, + VM_PAGE_SPECIAL_Q_FG = 3, +}); +#define VM_PAGE_INACTIVE(m) bit_test(vm_page_inactive_states, (m)->vmp_q_state) +#define VM_PAGE_ACTIVE_OR_INACTIVE(m) bit_test(vm_page_active_or_inactive_states, (m)->vmp_q_state) +#define VM_PAGE_NON_SPECULATIVE_PAGEABLE(m) bit_test(vm_page_non_speculative_pageable_states, (m)->vmp_q_state) +#define VM_PAGE_PAGEABLE(m) bit_test(vm_page_pageable_states, (m)->vmp_q_state) -#define VM_PAGE_NOT_ON_Q 0 /* page is not present on any queue, nor is it wired... mainly a transient state */ -#define VM_PAGE_IS_WIRED 1 /* page is currently wired */ -#define VM_PAGE_USED_BY_COMPRESSOR 2 /* page is in use by the compressor to hold compressed data */ -#define VM_PAGE_ON_FREE_Q 3 /* page is on the main free queue */ -#define VM_PAGE_ON_FREE_LOCAL_Q 4 /* page is on one of the per-CPU free queues */ -#define VM_PAGE_ON_FREE_LOPAGE_Q 5 /* page is on the lopage pool free list */ -#define VM_PAGE_ON_THROTTLED_Q 6 /* page is on the throttled queue... we stash anonymous pages here when not paging */ -#define VM_PAGE_ON_PAGEOUT_Q 7 /* page is on one of the pageout queues (internal/external) awaiting processing */ -#define VM_PAGE_ON_SPECULATIVE_Q 8 /* page is on one of the speculative queues */ -#define VM_PAGE_ON_ACTIVE_LOCAL_Q 9 /* page has recently been created and is being held in one of the per-CPU local queues */ -#define VM_PAGE_ON_ACTIVE_Q 10 /* page is in global active queue */ -#define VM_PAGE_ON_INACTIVE_INTERNAL_Q 11 /* page is on the inactive internal queue a.k.a. anonymous queue */ -#define VM_PAGE_ON_INACTIVE_EXTERNAL_Q 12 /* page in on the inactive external queue a.k.a. file backed queue */ -#define VM_PAGE_ON_INACTIVE_CLEANED_Q 13 /* page has been cleaned to a backing file and is ready to be stolen */ -#define VM_PAGE_ON_SECLUDED_Q 14 /* page is on secluded queue */ -#define VM_PAGE_Q_STATE_LAST_VALID_VALUE 14 /* we currently use 4 bits for the state... 
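A sketch of the bitmask scheme that replaces the per-state bool arrays; the mask value below is an example, not the one xnu actually builds:

/*
 * Illustrative sketch: one bit per vm_page_q_state_t value, so a
 * queue-state classification is a single shift-and-mask against a
 * 16-bit constant instead of a load from a bool array.
 */
#define EXAMPLE_QSTATE_BIT(s)   ((uint16_t)1 << (s))

static const uint16_t example_inactive_states =
    EXAMPLE_QSTATE_BIT(VM_PAGE_ON_INACTIVE_INTERNAL_Q) |
    EXAMPLE_QSTATE_BIT(VM_PAGE_ON_INACTIVE_EXTERNAL_Q) |
    EXAMPLE_QSTATE_BIT(VM_PAGE_ON_INACTIVE_CLEANED_Q);

/*
 * bit_test(example_inactive_states, m->vmp_q_state) then amounts to
 * ((example_inactive_states >> m->vmp_q_state) & 1), i.e. the whole
 * lookup table fits in an immediate.
 */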
don't let this go beyond 15 */ - -#define VM_PAGE_Q_STATE_ARRAY_SIZE (VM_PAGE_Q_STATE_LAST_VALID_VALUE+1) - -extern const bool vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; -extern const bool vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; -extern const bool vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; -extern const bool vm_page_active_or_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; +extern const uint16_t vm_page_inactive_states; +extern const uint16_t vm_page_active_or_inactive_states; +extern const uint16_t vm_page_non_speculative_pageable_states; +extern const uint16_t vm_page_pageable_states; /* * The structure itself. See the block comment above for what (O) and (P) mean. */ -#define vmp_pageq vmp_q_un.vmp_q_pageq -#define vmp_snext vmp_q_un.vmp_q_snext - struct vm_page { union { - vm_page_queue_chain_t vmp_q_pageq; /* queue info for FIFO queue or free list (P) */ - struct vm_page *vmp_q_snext; - } vmp_q_un; + vm_page_queue_chain_t vmp_pageq; /* queue info for FIFO queue or free list (P) */ + struct vm_page *vmp_snext; + }; + vm_page_queue_chain_t vmp_specialq; /* anonymous pages in the special queues (P) */ - vm_page_queue_chain_t vmp_listq; /* all pages in same object (O) */ + vm_page_queue_chain_t vmp_listq; /* all pages in same object (O) */ + vm_page_packed_t vmp_next_m; /* VP bucket link (O) */ - vm_page_queue_chain_t vmp_specialq; /* anonymous pages in the special queues (P) */ - vm_object_offset_t vmp_offset; /* offset into that object (O,P) */ + vm_page_object_t vmp_object; /* which object am I in (O&P) */ + vm_object_offset_t vmp_offset; /* offset into that object (O,P) */ - vm_page_object_t vmp_object; /* which object am I in (O&P) */ + + /* + * Either the current page wire count, + * or the local queue id (if local queues are enabled). + * + * See the comments at 'vm_page_queues_remove' + * as to why this is safe to do. + */ + union { + uint16_t vmp_wire_count; + uint16_t vmp_local_id; + }; /* * The following word of flags used to be protected by the "page queues" lock. * That's no longer true and what lock, if any, is needed may depend on the * value of vmp_q_state. * - * We use 'vmp_wire_count' to store the local queue id if local queues are enabled. - * See the comments at 'vm_page_queues_remove' as to why this is safe to do. + * This bitfield is kept in its own struct to prevent coalescing + * with the next one (which C allows the compiler to do) as they + * are under different locking domains */ -#define VM_PAGE_SPECIAL_Q_EMPTY (0) -#define VM_PAGE_SPECIAL_Q_BG (1) -#define VM_PAGE_SPECIAL_Q_DONATE (2) -#define VM_PAGE_SPECIAL_Q_FG (3) -#define vmp_local_id vmp_wire_count - unsigned int vmp_wire_count:16, /* how many wired down maps use me? (O&P) */ - vmp_q_state:4, /* which q is the page on (P) */ - vmp_on_specialq:2, - vmp_canonical:1, /* this page is a canonical kernel page (immutable) */ - vmp_gobbled:1, /* page used internally (P) */ - vmp_laundry:1, /* page is being cleaned now (P)*/ - vmp_no_cache:1, /* page is not to be cached and should */ - /* be reused ahead of other pages (P) */ - vmp_reference:1, /* page has been used (P) */ - vmp_lopage:1, - vmp_realtime:1, /* page used by realtime thread */ -#if !CONFIG_TRACK_UNMODIFIED_ANON_PAGES - vmp_unused_page_bits:3; -#else /* ! CONFIG_TRACK_UNMODIFIED_ANON_PAGES */ - vmp_unmodified_ro:1, /* Tracks if an anonymous page is modified after a decompression (O&P).*/ - vmp_unused_page_bits:2; -#endif /* ! 
CONFIG_TRACK_UNMODIFIED_ANON_PAGES */ - - /* - * MUST keep the 2 32 bit words used as bit fields - * separated since the compiler has a nasty habit - * of using 64 bit loads and stores on them as - * if they were a single 64 bit field... since - * they are protected by 2 different locks, this - * is a real problem - */ - vm_page_packed_t vmp_next_m; /* VP bucket link (O) */ + struct { + vm_page_q_state_t vmp_q_state:4; /* which q is the page on (P) */ + vm_page_specialq_t vmp_on_specialq:2; + uint8_t vmp_lopage:1; + uint8_t vmp_canonical:1; /* this page is a canonical kernel page (immutable) */ + }; + struct { + uint8_t vmp_gobbled:1; /* page used internally (P) */ + uint8_t vmp_laundry:1; /* page is being cleaned now (P)*/ + uint8_t vmp_no_cache:1; /* page is not to be cached and should */ + /* be reused ahead of other pages (P) */ + uint8_t vmp_reference:1; /* page has been used (P) */ + uint8_t vmp_realtime:1; /* page used by realtime thread (P) */ + uint8_t vmp_iopl_wired:1; /* page has been wired for I/O UPL (O&P) */ +#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES + uint8_t vmp_unmodified_ro:1;/* Tracks if an anonymous page is modified after a decompression (O&P).*/ +#else + uint8_t __vmp_reserved1:1; +#endif + uint8_t __vmp_reserved2:1; + }; /* * The following word of flags is protected by the "VM object" lock. @@ -327,7 +332,7 @@ struct vm_page { * Setting this value to or away from vm_page_fictitious_addr * must be done with (P) held */ - ppnum_t vmp_phys_page; + ppnum_t vmp_phys_page; #endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */ }; @@ -380,6 +385,7 @@ vm_page_get(uint32_t i) return VM_FAR_ADD_PTR_UNBOUNDED(vm_pages_array_internal(), i); } + __pure2 static inline bool vm_page_in_array(const struct vm_page *m) @@ -405,6 +411,15 @@ struct vm_page_with_ppnum { * [pmap_first_pnum, vm_pages_first_pnum + vm_pages_count) */ extern vm_page_t vm_page_find_canonical(ppnum_t pnum) __pure2; + +extern vm_page_t vm_pages_radix_next(uint32_t *cursor, ppnum_t *pnum); + +#define vm_pages_radix_for_each(mem) \ + for (uint32_t __index = 0; ((mem) = vm_pages_radix_next(&__index, NULL)); ) + +#define vm_pages_radix_for_each_pnum(pnum) \ + for (uint32_t __index = 0; vm_pages_radix_next(&__index, &pnum); ) + #else #define vm_page_with_ppnum vm_page #endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */ @@ -444,10 +459,13 @@ VM_PAGE_SET_PHYS_PAGE(struct vm_page *m, ppnum_t pnum) #if defined(__x86_64__) extern unsigned int vm_clump_mask, vm_clump_shift; -#define VM_PAGE_GET_CLUMP(m) ((VM_PAGE_GET_PHYS_PAGE(m)) >> vm_clump_shift) -#define VM_PAGE_GET_COLOR(m) ((VM_PAGE_GET_CLUMP(m)) & vm_color_mask) +#define VM_PAGE_GET_CLUMP_PNUM(pn) ((pn) >> vm_clump_shift) +#define VM_PAGE_GET_CLUMP(m) VM_PAGE_GET_CLUMP_PNUM(VM_PAGE_GET_PHYS_PAGE(m)) +#define VM_PAGE_GET_COLOR_PNUM(pn) (VM_PAGE_GET_CLUMP_PNUM(pn) & vm_color_mask) +#define VM_PAGE_GET_COLOR(m) VM_PAGE_GET_COLOR_PNUM(VM_PAGE_GET_PHYS_PAGE(m)) #else -#define VM_PAGE_GET_COLOR(m) (VM_PAGE_GET_PHYS_PAGE(m) & vm_color_mask) +#define VM_PAGE_GET_COLOR_PNUM(pn) ((pn) & vm_color_mask) +#define VM_PAGE_GET_COLOR(m) VM_PAGE_GET_COLOR_PNUM(VM_PAGE_GET_PHYS_PAGE(m)) #endif /* @@ -475,14 +493,18 @@ extern unsigned int vm_clump_mask, vm_clump_shift; #define VM_PAGE_PACKED_ALIGNED __attribute__((aligned(VM_PAGE_PACKED_PTR_ALIGNMENT))) #define VM_PAGE_PACKED_PTR_BITS 31 #define VM_PAGE_PACKED_PTR_SHIFT 6 +#ifndef __BUILDING_XNU_LIB_UNITTEST__ #define VM_PAGE_PACKED_PTR_BASE ((uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS) - +#else +extern uintptr_t mock_page_ptr_base; +#define 
VM_PAGE_PACKED_PTR_BASE (mock_page_ptr_base) +#endif #define VM_PAGE_PACKED_FROM_ARRAY 0x80000000 static inline vm_page_packed_t vm_page_pack_ptr(uintptr_t p) { - if (vm_page_in_array((vm_page_t)p)) { + if (vm_page_in_array(__unsafe_forge_single(vm_page_t, p))) { ptrdiff_t diff = (vm_page_t)p - vm_pages_array_internal(); assert((vm_page_t)p == vm_page_get((uint32_t)diff)); return (vm_page_packed_t)(diff | VM_PAGE_PACKED_FROM_ARRAY); @@ -525,6 +547,19 @@ MACRO_END #define VM_PAGE_CONVERT_TO_QUEUE_ENTRY(p) VM_PAGE_PACK_PTR(p) + +/*! + * @abstract + * The type for free queue heads that live in the kernel __DATA segment. + * + * @discussion + * This type must be used so that the queue is properly aligned + * for the VM Page packing to be able to represent pointers to this queue. + */ +typedef struct vm_page_queue_free_head { + vm_page_queue_head_t qhead; +} VM_PAGE_PACKED_ALIGNED *vm_page_queue_free_head_t; + /* * Macro: vm_page_queue_init * Function: @@ -914,30 +949,206 @@ typedef struct vm_locks_array { #define NEXT_PAGE(m) ((m)->vmp_snext) #define NEXT_PAGE_PTR(m) (&(m)->vmp_snext) -static inline vm_page_t -vm_page_list_pop(vm_page_t *list) +/*! + * @abstract + * Represents a singly linked list of pages with a count. + * + * @discussion + * This type is used as a way to exchange transient collections of VM pages + * by various subsystems. + * + * This type is designed to be less than sizeof(_Complex) which means + * it that can be passed by value efficiently (either as a function argument + * or its result). + * + * + * @field vmpl_head + * The head of the list, or VM_PAGE_NULL. + * + * @field vmpl_count + * How many pages are on that list. + * + * @field vmpl_has_realtime + * At least one page on the list has vmp_realtime set. + */ +typedef struct { + vm_page_t vmpl_head; + uint32_t vmpl_count; + bool vmpl_has_realtime; +} vm_page_list_t; + + +/*! + * @abstract + * Low level function that pushes a page on a naked singly linked list of VM + * pages. + * + * @param head The list head. + * @param mem The page to push on the list. + */ +static inline void +_vm_page_list_push(vm_page_t *head, vm_page_t mem) { - vm_page_t mem = *list; + NEXT_PAGE(mem) = *head; + *head = mem; +} + +/*! + * @abstract + * Pushes a page onto a VM page list, adjusting its properties. + * + * @param list The VM page list to push onto + * @param mem The page to push on the list. + */ +static inline void +vm_page_list_push(vm_page_list_t *list, vm_page_t mem) +{ + _vm_page_list_push(&list->vmpl_head, mem); + list->vmpl_count++; + if (mem->vmp_realtime) { + list->vmpl_has_realtime = true; + } +} + +/*! + * @abstract + * Conveniency function that creates a VM page list from a single page. + * + * @param mem The VM page to put on the list. + */ +static inline vm_page_list_t +vm_page_list_for_page(vm_page_t mem) +{ + assert(NEXT_PAGE(mem) == VM_PAGE_NULL); + return (vm_page_list_t){ + .vmpl_head = mem, + .vmpl_count = 1, + .vmpl_has_realtime = mem->vmp_realtime, + }; +} + +/*! + * @abstract + * Low level function that pops a page from a naked singly linked list of VM + * pages. + * + * @param head The list head. + * + * @returns The first page that was on the list + * or VM_PAGE_NULL if it was empty. + */ +static inline vm_page_t +_vm_page_list_pop(vm_page_t *head) +{ + vm_page_t mem = *head; if (mem) { - *list = NEXT_PAGE(mem); + *head = NEXT_PAGE(mem); VM_PAGE_ZERO_PAGEQ_ENTRY(mem); } + return mem; } -static inline void -vm_page_list_push(vm_page_t *list, vm_page_t mem) +/*! 
+ * @abstract + * Pops a page from a VM page list, adjusting its properties. + * + * @param list The VM page list to pop from. + * + * @returns The first page that was on the list + * or VM_PAGE_NULL if it was empty. + */ +static inline vm_page_t +vm_page_list_pop(vm_page_list_t *list) { - mem->vmp_snext = *list; - *list = mem; + if (list->vmpl_head) { + list->vmpl_count--; + return _vm_page_list_pop(&list->vmpl_head); + } + *list = (vm_page_list_t){ }; + return VM_PAGE_NULL; } -#define vm_page_list_foreach(m, list) \ - for ((m) = (list); (m); (m) = (m)->vmp_snext) -#define vm_page_list_foreach_consume(it, list) \ - while (((it) = vm_page_list_pop((list)))) +/*! + * @abstract + * Reverses a list of VM pages in place. + * + * @param list The VM page list to reverse. + */ +static inline void +vm_page_list_reverse(vm_page_list_t *list) +{ + vm_page_t cur, next; + + cur = list->vmpl_head; + list->vmpl_head = NULL; + + while (cur) { + next = NEXT_PAGE(cur); + _vm_page_list_push(&list->vmpl_head, cur); + cur = next; + } +} + + +/*! + * @abstract + * Low level iterator over all pages on a naked singly linked list + * of VM pages. + * + * @discussion + * Mutating the list during enumeration is undefined. + * + * @param mem The variable to use for iteration. + * @param head The list head. + */ +#define _vm_page_list_foreach(mem, list) \ + for ((mem) = (list); (mem); (mem) = NEXT_PAGE(mem)) + + +/*! + * @abstract + * Iterator over a VM page list. + * + * @discussion + * Mutating the list during enumeration is undefined. + * + * @param mem The variable to use for iteration. + * @param head The list head. + */ +#define vm_page_list_foreach(mem, list) \ + _vm_page_list_foreach(mem, (list).vmpl_head) + + +/*! + * @abstract + * Low level iterator over all pages on a naked singly linked list + * of VM pages, that also consumes the list as it iterates. + * + * @discussion + * Each element is removed from the list as it is being iterated. + * + * @param mem The variable to use for iteration. + * @param head The list head. + */ +#define _vm_page_list_foreach_consume(mem, list) \ + while (((mem) = _vm_page_list_pop((list)))) + +/*! + * @abstract + * Iterator over a VM page list, that consumes the list. + * + * @discussion + * Each element is removed from the list as it is being iterated. + * + * @param mem The variable to use for iteration. + * @param head The list head. + */ +#define vm_page_list_foreach_consume(mem, list) \ + while (((mem) = vm_page_list_pop((list)))) + /* * XXX The unusual bit should not be necessary. Most of the bit @@ -968,10 +1179,25 @@ vm_page_list_push(vm_page_t *list, vm_page_t mem) #define MAX_COLORS 128 #define DEFAULT_COLORS 32 +/* + * Page free queue type. Abstracts the notion of a free queue of pages, that + * contains free pages of a particular memory class, and maintains a count of + * the number of pages in the free queue. + * + * Pages in the queue will be marked VM_PAGE_ON_FREE_Q when they are added to + * the free queue, and VM_PAGE_NOT_ON_Q when they are removed. + * + * These free queues will color pages, consistent with MachVMs color mask. 
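A usage sketch for the vm_page_list_t helpers above; the function below is illustrative and not part of the patch:

/*
 * Illustrative sketch: drain one vm_page_list_t into another.
 * vm_page_list_push() maintains vmpl_count and the realtime flag;
 * vm_page_list_pop() decrements the count, and the consume iterator
 * pops each page as it goes, so 'src' is empty afterwards.
 * Note that pushing in pop order reverses the list; calling
 * vm_page_list_reverse(dst) afterwards would restore the original order.
 */
static void
example_drain(vm_page_list_t *src, vm_page_list_t *dst)
{
    vm_page_t m;

    vm_page_list_foreach_consume(m, src) {
        vm_page_list_push(dst, m);
    }
    assert(src->vmpl_count == 0);
}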
+ */ +typedef struct vm_page_free_queue { + struct vm_page_queue_free_head vmpfq_queues[MAX_COLORS]; + uint32_t vmpfq_count; +} *vm_page_free_queue_t; + extern unsigned int vm_colors; /* must be in range 1..MAX_COLORS */ extern unsigned int vm_color_mask; /* must be (vm_colors-1) */ extern unsigned int vm_cache_geometry_colors; /* optimal #colors based on cache geometry */ - +extern unsigned int vm_free_magazine_refill_limit; /* * Wired memory is a very limited resource and we can't let users exhaust it @@ -1038,8 +1264,6 @@ unsigned int vm_page_local_q_hard_limit; extern vm_locks_array_t vm_page_locks; -extern -vm_page_queue_head_t vm_lopage_queue_free; /* low memory free queue */ extern vm_page_queue_head_t vm_page_queue_active; /* active memory queue */ extern @@ -1205,6 +1429,10 @@ unsigned int vm_page_purgeable_wired_count;/* How many purgeable pages are wi extern uint64_t vm_page_purged_count; /* How many pages got purged so far ? */ +extern +_Atomic unsigned int vm_page_swapped_count; +/* How many pages are swapped to disk? */ + extern unsigned int vm_page_free_wanted; /* how many threads are waiting for memory */ @@ -1235,20 +1463,54 @@ extern boolean_t vm_himemory_mode; #define vm_himemory_mode TRUE #endif -extern boolean_t vm_lopage_needed; +#if XNU_VM_HAS_LOPAGE +extern bool vm_lopage_needed; +extern bool vm_lopage_refill; extern uint32_t vm_lopage_free_count; extern uint32_t vm_lopage_free_limit; extern uint32_t vm_lopage_lowater; -extern boolean_t vm_lopage_refill; +#else +#define vm_lopage_needed 0 +#define vm_lopage_free_count 0 +#endif extern uint64_t max_valid_dma_address; extern ppnum_t max_valid_low_ppnum; +/*! + * @abstract + * Options that alter the behavior of vm_page_grab_options(). + * + * @const VM_PAGE_GRAB_OPTIONS_NONE + * The default value when no other specific options are required. + * + * @const VM_PAGE_GRAB_Q_LOCK_HELD + * Denotes the caller is holding the vm page queues lock held. + * + * @const VM_PAGE_GRAB_NOPAGEWAIT + * Denotes that the caller never wants @c vm_page_grab_options() to call + * @c VM_PAGE_WAIT(), even if the thread is privileged. + * + * @const VM_PAGE_GRAB_SECLUDED + * The caller is eligible to the secluded pool. + */ +__enum_decl(vm_grab_options_t, uint32_t, { + VM_PAGE_GRAB_OPTIONS_NONE = 0x00000000, + VM_PAGE_GRAB_Q_LOCK_HELD = 0x00000001, + VM_PAGE_GRAB_NOPAGEWAIT = 0x00000002, + + /* architecture/platform-specific flags */ +#if CONFIG_SECLUDED_MEMORY + VM_PAGE_GRAB_SECLUDED = 0x00010000, +#endif /* CONFIG_SECLUDED_MEMORY */ +}); + /* * Prototypes for functions exported by this module. 
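A minimal, illustrative use of the grab options above; the wrapper name is made up:

/*
 * Illustrative only: grab a page without ever blocking, even from a
 * VM-privileged thread. With VM_PAGE_GRAB_NOPAGEWAIT the call may
 * return VM_PAGE_NULL, so the caller must handle that case.
 */
static vm_page_t
example_grab_nowait(void)
{
    vm_page_t m = vm_page_grab_options(VM_PAGE_GRAB_NOPAGEWAIT);

    if (m == VM_PAGE_NULL) {
        /* nothing free right now; caller decides whether to retry */
    }
    return m;
}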
*/ extern void vm_page_init_local_q(unsigned int num_cpus); +extern vm_page_t vm_page_create(ppnum_t phys_page, bool canonical, zalloc_flags_t flags); extern void vm_page_create_canonical(ppnum_t pnum); extern void vm_page_create_retired(ppnum_t pn); @@ -1259,10 +1521,6 @@ extern void vm_free_delayed_pages(void); extern void vm_pages_array_finalize(void); -extern vm_page_t vm_page_alloc( - vm_object_t object, - vm_object_offset_t offset); - extern void vm_page_reactivate_all_throttled(void); extern void vm_pressure_response(void); diff --git a/osfmk/vm/vm_page_internal.h b/osfmk/vm/vm_page_internal.h index 1db4453ed..1d459350e 100644 --- a/osfmk/vm/vm_page_internal.h +++ b/osfmk/vm/vm_page_internal.h @@ -35,11 +35,113 @@ __BEGIN_DECLS #ifdef XNU_KERNEL_PRIVATE -struct vm_page_queue_free_head { - vm_page_queue_head_t qhead; -} VM_PAGE_PACKED_ALIGNED; +PERCPU_DECL(unsigned int, start_color); -extern struct vm_page_queue_free_head vm_page_queue_free[MAX_COLORS]; +extern struct vm_page_free_queue vm_page_queue_free; + +/*! + * @abstract + * Applies a signed delta to a VM counter that is not meant to ever overflow. + * + * @discussion + * This is not meant for counters counting "events", but for counters that + * maintain how many objects there is in a given state (free pages, ...). + * + * @param counter A pointer to a counter of any integer type. + * @param value The signed delta to apply. + * @returns The new value of the counter. + */ +#define VM_COUNTER_DELTA(counter, value) ({ \ + __auto_type __counter = (counter); \ + release_assert(!os_add_overflow(*__counter, value, __counter)); \ + *__counter; \ +}) +#define VM_COUNTER_ATOMIC_DELTA(counter, value) ({ \ + __auto_type __value = (value); \ + __auto_type __orig = os_atomic_add_orig(counter, __value, relaxed); \ + release_assert(!os_add_overflow(__orig, __value, &__orig)); \ + __orig + __value; \ +}) + + +/*! + * @abstract + * Applies an unsigned increment to a VM counter that is not meant to ever + * overflow. + * + * @discussion + * This is not meant for counters counting "events", but for counters that + * maintain how many objects there is in a given state (free pages, ...). + * + * @param counter A pointer to a counter of any integer type. + * @param value The unsigned value to add. + * @returns The new value of the counter. + */ +#define VM_COUNTER_ADD(counter, value) ({ \ + __auto_type __counter = (counter); \ + release_assert(!os_add_overflow(*__counter, value, __counter)); \ + *__counter; \ +}) +#define VM_COUNTER_ATOMIC_ADD(counter, value) ({ \ + __auto_type __value = (value); \ + __auto_type __orig = os_atomic_add_orig(counter, __value, relaxed); \ + release_assert(!os_add_overflow(__orig, __value, &__orig)); \ + __orig + __value; \ +}) + +/*! + * @abstract + * Applies an unsigned decrement to a VM counter that is not meant to ever + * overflow. + * + * @discussion + * This is not meant for counters counting "events", but for counters that + * maintain how many objects there is in a given state (free pages, ...). + * + * @param counter A pointer to a counter of any integer type. + * @param value The unsigned value to substract. + * @returns The new value of the counter. 
+ */ +#define VM_COUNTER_SUB(counter, value) ({ \ + __auto_type __counter = (counter); \ + release_assert(!os_sub_overflow(*__counter, value, __counter)); \ + *__counter; \ +}) +#define VM_COUNTER_ATOMIC_SUB(counter, value) ({ \ + __auto_type __value = (value); \ + __auto_type __orig = os_atomic_sub_orig(counter, __value, relaxed); \ + release_assert(!os_sub_overflow(__orig, __value, &__orig)); \ + __orig - __value; \ +}) + + +/*! + * @abstract + * Convenience wrapper to increment a VM counter. + * + * @discussion + * This is not meant for counters counting "events", but for counters that + * maintain how many objects there is in a given state (free pages, ...). + * + * @param counter A pointer to a counter of any integer type. + * @returns The new value of the counter. + */ +#define VM_COUNTER_INC(counter) VM_COUNTER_ADD(counter, 1) +#define VM_COUNTER_ATOMIC_INC(counter) VM_COUNTER_ATOMIC_ADD(counter, 1) + +/*! + * @abstract + * Convenience wrapper to decrement a VM counter. + * + * @discussion + * This is not meant for counters counting "events", but for counters that + * maintain how many objects there is in a given state (free pages, ...). + * + * @param counter A pointer to a counter of any integer type. + * @returns The new value of the counter. + */ +#define VM_COUNTER_DEC(counter) VM_COUNTER_SUB(counter, 1) +#define VM_COUNTER_ATOMIC_DEC(counter) VM_COUNTER_ATOMIC_SUB(counter, 1) static inline int VMP_CS_FOR_OFFSET( @@ -304,7 +406,143 @@ vm_page_queue_enter_clump( #endif /* __LP64__ */ -extern void vm_page_assign_special_state(vm_page_t mem, int mode); +/*! + * @abstract + * The number of pages to try to free/process at once while under + * the free page queue lock. + * + * @discussion + * The value is chosen to be a trade off between: + * - creating a lot of contention on the free page queue lock + * taking and dropping it all the time, + * - avoiding to hold the free page queue lock for too long periods of time. + */ +#define VMP_FREE_BATCH_SIZE 64 + +/*! + * @function vm_page_free_queue_init() + * + * @abstract + * Initialize a free queue. + * + * @param free_queue The free queue to initialize. + */ +extern void vm_page_free_queue_init( + vm_page_free_queue_t free_queue); + +/*! + * @function vm_page_free_queue_enter() + * + * @abstract + * Add a page to a free queue. + * + * @discussion + * Internally, the free queue is not synchronized, so any locking must be done + * outside of this function. + * + * The page queue state will be set to the appropriate free queue state for the + * memory class (typically VM_PAGE_ON_FREE_Q). + * + * Note that the callers are responsible for making sure that this operation is + * a valid transition. This is a helper to abstract handling of the several + * free page queues on the system which sits above vm_page_queue_enter() and + * maintains counters as well, but is otherwise oblivious to the page state + * machine. + * + * Most clients should use a wrapper around this function (typically + * vm_page_release() or vm_page_free_list()) and not call it directly. + * + * @param mem_class The memory class to free pages to. + * @param page The page to free. + * @param pnum the physical address of @c page + */ +extern void vm_page_free_queue_enter( + vm_memory_class_t mem_class, + vm_page_t page, + ppnum_t pnum); + +/*! + * @function vm_page_free_queue_remove() + * + * @abstract + * Removes an arbitrary free page from the given free queue. + * + * @discussion + * The given page must be in the given free queue, or state may be corrupted. 
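A usage sketch for the VM_COUNTER_* wrappers defined above; illustrative only:

/*
 * Illustrative usage: each wrapper release_assert()s that the
 * adjustment cannot wrap, so a state counter can never silently
 * overflow or underflow.
 */
static void
example_counter_usage(void)
{
    unsigned int example_count = 0;

    VM_COUNTER_ADD(&example_count, 8);  /* -> 8 */
    VM_COUNTER_DEC(&example_count);     /* -> 7 */
    VM_COUNTER_SUB(&example_count, 7);  /* -> 0 */
    /* VM_COUNTER_SUB(&example_count, 1) would underflow and trip the assert */
}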
+ * + * Internally, the free queue is not synchronized, so any locking must be done + * outside of this function. + * + * Note that the callers are responsible for making sure that the requested + * queue state corresponds to a valid transition. This is a helper to abstract + * handling of the several free page queues on the system which sits above + * vm_page_queue_remove() and maintains counters as well, but is otherwise + * oblivious to the page state machine. + * + * Most clients should use a wrapper around this function (typically + * vm_page_free_queue_steal()) and not call it directly. + * + * @param class The memory class corresponding to the free queue + * @c page is enqueued on. + * @param mem The page to remove. + * @param pnum The physical address of @c page + * @param q_state The desired queue state for the page. + */ +__attribute__((always_inline)) +extern void vm_page_free_queue_remove( + vm_memory_class_t class, + vm_page_t mem, + ppnum_t pnum, + vm_page_q_state_t q_state); + +/*! + * @function vm_page_free_queue_grab() + * + * @abstract + * Gets pages from the free queue. + * + * @discussion + * Clients cannot get more pages than the free queue has; attempting to do so + * will cause a panic. + * + * Internally, the free queue is not synchronized, so any locking must be done + * outside of this function. + * + * Note that the callers are responsible for making sure that the requested + * queue state corresponds to a valid transition. This is a helper to abstract + * handling of the several free page queues on the system which sits above + * vm_page_queue_remove() and maintains counters as well, but is otherwise + * oblivious to the page state machine. + * + * Most clients should use a wrapper (typically vm_page_grab_options()) + * around this function and not call it directly. + * + * @param options The grab options. + * @param mem_class The memory class to allocate from. + * @param num_pages The number of pages to grab. + * @param q_state The vmp_q_state to set on the page. + * + * @returns + * A list of pages; the list will be num_pages long. + */ +extern vm_page_list_t vm_page_free_queue_grab( + vm_grab_options_t options, + vm_memory_class_t mem_class, + unsigned int num_pages, + vm_page_q_state_t q_state); + +/*! + * @abstract + * Perform a wakeup for a free page queue wait event. + * + * @param event the free page queue event to wake up + * @param n the number of threads to try to wake up + * (UINT32_MAX means all). + */ +extern void vm_page_free_wakeup(event_t event, uint32_t n); + + +extern void vm_page_assign_special_state(vm_page_t mem, vm_page_specialq_t mode); extern void vm_page_update_special_state(vm_page_t mem); extern void vm_page_add_to_specialq(vm_page_t mem, boolean_t first); extern void vm_page_remove_from_specialq(vm_page_t mem); @@ -443,25 +681,63 @@ extern void vm_page_make_private(vm_page_t m, ppnum_t base_page); */ extern void vm_page_reset_private(vm_page_t m); + extern bool vm_pool_low(void); -extern vm_page_t vm_page_grab(void); -extern vm_page_t vm_page_grab_options(int flags); +/*! + * @abstract + * Grabs a page. + * + * @discussion + * Allocate a page by looking at: + * - per-cpu queues, + * - global free queues, + * - magical queues (delayed, secluded, ...) + * + * This function always succeeds for VM privileged threads, + * unless VM_PAGE_GRAB_NOPAGEWAIT is passed. + * + * This function might return VM_PAGE_NULL if there are no pages left. 
+ */ +extern vm_page_t vm_page_grab_options(vm_grab_options_t options); -#define VM_PAGE_GRAB_OPTIONS_NONE 0x00000000 -#if CONFIG_SECLUDED_MEMORY -#define VM_PAGE_GRAB_SECLUDED 0x00000001 -#endif /* CONFIG_SECLUDED_MEMORY */ -#define VM_PAGE_GRAB_Q_LOCK_HELD 0x00000002 +static inline vm_page_t +vm_page_grab(void) +{ + return vm_page_grab_options(VM_PAGE_GRAB_OPTIONS_NONE); +} -extern vm_page_t vm_page_grablo(void); +/*! + * @abstract + * Returns the proper grab options for the specified object. + */ +extern vm_grab_options_t vm_page_grab_options_for_object(vm_object_t object); -extern void vm_page_release( - vm_page_t page, - boolean_t page_queues_locked); +#if XNU_VM_HAS_LOPAGE +extern vm_page_t vm_page_grablo(vm_grab_options_t options); +#else +static inline vm_page_t +vm_page_grablo(vm_grab_options_t options) +{ + return vm_page_grab_options(options); +} +#endif + + +__options_closed_decl(vmp_release_options_t, uint32_t, { + VMP_RELEASE_NONE = 0x00, + VMP_RELEASE_Q_LOCKED = 0x01, + VMP_RELEASE_SKIP_FREE_CHECK = 0x02, + VMP_RELEASE_HIBERNATE = 0x04, + VMP_RELEASE_STARTUP = 0x08, +}); + +extern void vm_page_release( + vm_page_t page, + vmp_release_options_t options); extern boolean_t vm_page_wait( - int interruptible ); + int interruptible); extern void vm_page_init( vm_page_t page, @@ -474,50 +750,6 @@ extern void vm_page_free_unlocked( vm_page_t page, boolean_t remove_from_hash); -/* - * vm_page_get_memory_class: - * Given a page, returns the memory class of that page. - */ -extern vm_memory_class_t vm_page_get_memory_class( - vm_page_t page); - -/* - * vm_page_steal_free_page: - * Given a VM_PAGE_ON_FREE_Q page, steals it from its free queue. - */ -extern void vm_page_steal_free_page( - vm_page_t page, - vm_remove_reason_t remove_reason); - -/*! - * @typedef vmp_free_list_result_t - * - * @discussion - * This data structure is used by vm_page_put_list_on_free_queue to track - * how many pages were freed to which free lists, so that it can then drive - * which waiters we are going to wake up. - * - * uint8_t counters are enough because we never free more than 64 pages at - * a time, and this allows for the data structure to be passed by register. - */ -typedef struct { - uint8_t vmpr_regular; - uint8_t vmpr_lopage; -#if CONFIG_SECLUDED_MEMORY - uint8_t vmpr_secluded; -#endif /* CONFIG_SECLUDED_MEMORY */ -} vmp_free_list_result_t; - -/* - * vm_page_put_list_on_free_queue: - * Given a list of pages, put each page on whichever global free queue is - * appropriate. - * - * Must be called with the VM free page lock held. - */ -extern vmp_free_list_result_t vm_page_put_list_on_free_queue( - vm_page_t list, - bool page_queues_locked); extern void vm_page_balance_inactive( int max_to_move); @@ -850,6 +1082,8 @@ extern kern_return_t pmap_enter_check( #define DW_VM_PAGE_QUEUES_REMOVE 0x2000 #define DW_enqueue_cleaned 0x4000 #define DW_vm_phantom_cache_update 0x8000 +#define DW_vm_page_iopl_wire 0x20000 +#define DW_vm_page_iopl_wire_write 0x40000 struct vm_page_delayed_work { vm_page_t dw_m; @@ -863,7 +1097,7 @@ struct vm_page_delayed_work_ctx { thread_t delayed_owner; }; -void vm_page_do_delayed_work(vm_object_t object, vm_tag_t tag, struct vm_page_delayed_work *dwp, int dw_count); +kern_return_t vm_page_do_delayed_work(vm_object_t object, vm_tag_t tag, struct vm_page_delayed_work *dwp, int dw_count); #define DELAYED_WORK_LIMIT(max) ((vm_max_delayed_work_limit >= max ? 
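With bare int flags and boolean_t arguments replaced by vm_grab_options_t and the closed vmp_release_options_t set, call sites now state intent instead of passing magic values. An illustrative fragment using only names declared above; it is a sketch of a caller, not a complete compilable function.

/* old style: vm_page_grab_options(0); vm_page_release(page, TRUE); */

vm_grab_options_t options = vm_page_grab_options_for_object(object);
vm_page_t m = vm_page_grab_options(options);
if (m == VM_PAGE_NULL) {
    /* no page available, or the caller asked not to wait */
    return;
}

/* ... use the page ... */

/* Release with explicit options; pass VMP_RELEASE_Q_LOCKED only when the
 * page queues lock is held, where the old interface took a bare TRUE. */
vm_page_release(m, VMP_RELEASE_NONE);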
max : vm_max_delayed_work_limit)) diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 979439a3c..cc456d0c3 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -90,16 +90,16 @@ #include #include +#include #include -#include - #include #include #include #include #include +#include #include #include #include @@ -115,7 +115,9 @@ #include #include +#include #include +#include #if CONFIG_PHANTOM_CACHE #include @@ -126,6 +128,10 @@ #include #endif +os_log_t vm_log_handle = OS_LOG_DEFAULT; +TUNABLE(bool, vm_log_to_serial, "vm_log_to_serial", false); +TUNABLE(bool, vm_log_debug_enabled, "vm_log_debug", false); + extern int cs_debug; #if CONFIG_MBUF_MCACHE @@ -367,7 +373,7 @@ uint32_t vm_pageout_memorystatus_fb_factor_dr = 2; TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT); int vm_pgo_pbound = 0; -extern void thread_soft_bind_cluster_type(thread_t, char); +extern kern_return_t thread_soft_bind_cluster_type(thread_t, char); #endif /* __AMP__ */ @@ -896,6 +902,10 @@ struct vm_pageout_stat { unsigned long vm_page_pageable_external_count; unsigned long vm_page_xpmapped_external_count; + unsigned long vm_page_swapped_count; + uint64_t swapouts; + uint64_t swapins; + unsigned int pages_grabbed; unsigned int pages_freed; @@ -938,7 +948,6 @@ struct vm_pageout_stat { unsigned int forcereclaimed_realtime; unsigned int protected_sharedcache; unsigned int protected_realtime; - } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE]; unsigned int vm_pageout_stat_now = 0; @@ -1674,6 +1683,8 @@ vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock, static struct vm_pageout_vminfo last; +static uint64_t last_swapouts; +static uint64_t last_swapins; uint64_t last_vm_page_pages_grabbed = 0; @@ -1696,6 +1707,7 @@ update_vm_info(void) vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count; vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count; vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT; + vm_pageout_stats[vm_pageout_stat_now].vm_page_swapped_count = os_atomic_load(&vm_page_swapped_count, relaxed); vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed; vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count; @@ -1735,6 +1747,13 @@ update_vm_info(void) vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed); last.vm_page_pages_freed = tmp; + tmp64 = counter_load(&vm_statistics_swapouts); + vm_pageout_stats[vm_pageout_stat_now].swapouts = tmp64 - last_swapouts; + last_swapouts = tmp64; + + tmp64 = counter_load(&vm_statistics_swapins); + vm_pageout_stats[vm_pageout_stat_now].swapins = tmp64 - last_swapins; + last_swapins = tmp64; if (vm_pageout_stats[vm_pageout_stat_now].considered) { tmp = vm_pageout_vminfo.vm_pageout_pages_evicted; @@ -1830,68 +1849,76 @@ update_vm_info(void) last.vm_pageout_protected_realtime = tmp; } - KDBG((VMDBG_CODE(DBG_VM_INFO1)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGCNT1) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count, vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count, vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count, vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count); - KDBG((VMDBG_CODE(DBG_VM_INFO2)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGCNT2) | DBG_FUNC_NONE, 
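The swapouts/swapins additions to update_vm_info() follow the same pattern as the existing pages_freed accounting: snapshot a monotonically increasing counter, report the per-interval delta, and keep the snapshot for next time. A self-contained user-space analogue of that pattern; the names are illustrative, not xnu APIs.

#include <stdint.h>

static uint64_t last_snapshot;

/* Given the current value of a monotonic counter, return how much it grew
 * since the previous call and remember the new snapshot. */
static uint64_t
interval_delta(uint64_t current)
{
    uint64_t delta = current - last_snapshot;
    last_snapshot = current;
    return delta;
}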
vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count, vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count, vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count); - KDBG((VMDBG_CODE(DBG_VM_INFO3)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGCNT3) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed, vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count, vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count, vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count); + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGCNT4) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].vm_page_swapped_count); + + if (vm_pageout_stats[vm_pageout_stat_now].considered || vm_pageout_stats[vm_pageout_stat_now].pages_compressed || vm_pageout_stats[vm_pageout_stat_now].failed_compressions) { - KDBG((VMDBG_CODE(DBG_VM_INFO4)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT1) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].considered, vm_pageout_stats[vm_pageout_stat_now].freed_speculative, vm_pageout_stats[vm_pageout_stat_now].freed_external, vm_pageout_stats[vm_pageout_stat_now].inactive_referenced); - KDBG((VMDBG_CODE(DBG_VM_INFO5)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT2) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].throttled_external_q, vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external, vm_pageout_stats[vm_pageout_stat_now].freed_cleaned, vm_pageout_stats[vm_pageout_stat_now].inactive_nolock); - KDBG((VMDBG_CODE(DBG_VM_INFO6)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT3) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q, vm_pageout_stats[vm_pageout_stat_now].pages_compressed, vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor, vm_pageout_stats[vm_pageout_stat_now].skipped_external); - KDBG((VMDBG_CODE(DBG_VM_INFO7)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT4) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded, vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim, vm_pageout_stats[vm_pageout_stat_now].failed_compressions, vm_pageout_stats[vm_pageout_stat_now].freed_internal); - KDBG((VMDBG_CODE(DBG_VM_INFO8)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT5) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal, vm_pageout_stats[vm_pageout_stat_now].considered_bq_external, vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations, vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal); - KDBG((VMDBG_CODE(DBG_VM_INFO10)) | DBG_FUNC_NONE, + KDBG_RELEASE(MEMINFO_CODE(DBG_MEMINFO_PGOUT6) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache, vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime, vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache, vm_pageout_stats[vm_pageout_stat_now].protected_realtime); } - KDBG((VMDBG_CODE(DBG_VM_INFO9)) | DBG_FUNC_NONE, + KDBG(MEMINFO_CODE(DBG_MEMINFO_DEMAND1) | DBG_FUNC_NONE, vm_pageout_stats[vm_pageout_stat_now].pages_grabbed, vm_pageout_stats[vm_pageout_stat_now].pages_freed, vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found, vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added); + KDBG(MEMINFO_CODE(DBG_MEMINFO_DEMAND2) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].swapouts, + vm_pageout_stats[vm_pageout_stat_now].swapins); + record_memory_pressure(); } @@ -1962,7 
+1989,6 @@ vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed) #endif /* CONFIG_SECLUDED_MEMORY */ } - /* * This function is called only from vm_pageout_scan and * it initializes the loop targets for vm_pageout_scan(). @@ -3013,7 +3039,7 @@ vm_pageout_scan(void) struct vm_speculative_age_q *sq; struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } }; boolean_t inactive_throttled = FALSE; - vm_object_t object = NULL; + vm_object_t object = NULL; /* object that we're currently working on from previous iterations */ uint32_t inactive_reclaim_run; boolean_t grab_anonymous = FALSE; boolean_t force_anonymous = FALSE; @@ -3022,7 +3048,7 @@ vm_pageout_scan(void) int page_prev_q_state = 0; boolean_t page_from_bg_q = FALSE; uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; - vm_object_t m_object = VM_OBJECT_NULL; + vm_object_t m_object = VM_OBJECT_NULL; /* object of the current page (m) */ int retval = 0; boolean_t lock_yield_check = FALSE; @@ -3179,7 +3205,6 @@ return_from_scan: continue; } - /* * If our 'aged' queue is empty and we have some speculative pages * in the other queues, let's go through and see if we need to age @@ -3416,6 +3441,21 @@ return_from_scan: } else { VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1); } + if (m->vmp_pmapped) { + int refmod; + + /* + * If this page was file-backed and wired while its pager + * was lost (during a forced unmount, for example), there + * could still be some pmap mappings that need to be + * cleaned up before we can free the page. + */ + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); + if ((refmod & VM_MEM_MODIFIED) && + !m->vmp_dirty) { + SET_PAGE_DIRTY(m, FALSE); + } + } reclaim_page: if (vm_pageout_deadlock_target) { VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1); @@ -4125,9 +4165,15 @@ vm_pageout_select_filling_chead(struct pgo_iothread_state *cq, vm_page_t m) if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) { m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY; return donate_queue_head; - } else { - return &cq->current_regular_swapout_chead; } + + uint32_t sel_i = 0; +#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 + vm_object_t object = VM_PAGE_OBJECT(m); + sel_i = object->vo_chead_hint; +#endif + assert(sel_i < COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT); + return &cq->current_regular_swapout_cheads[sel_i]; } #define MAX_FREE_BATCH 32 @@ -4619,7 +4665,7 @@ vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_resu * Use the soft bound option for vm_compressor to allow it to run on * P-cores if E-cluster is unavailable. 
*/ - thread_soft_bind_cluster_type(self, 'E'); + (void) thread_soft_bind_cluster_type(self, 'E'); } #endif /* __AMP__ */ @@ -4668,7 +4714,7 @@ uint64_t vm_pressure_last_level_transition_abs = 0; int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS; void -vm_pressure_response(void) +vm_pressure_response() { vm_pressure_level_t old_level = kVMPressureNormal; int new_level = -1; @@ -4990,7 +5036,8 @@ vm_pageout_garbage_collect(void *step, wait_result_t wr __unused) consider_machine_collect(); #if CONFIG_DEFERRED_RECLAIM - vm_deferred_reclamation_gc(RECLAIM_GC_TRIM, RECLAIM_OPTIONS_NONE); + mach_vm_size_t bytes_reclaimed; + vm_deferred_reclamation_gc(RECLAIM_GC_TRIM, &bytes_reclaimed, RECLAIM_OPTIONS_NONE); #endif /* CONFIG_DEFERRED_RECLAIM */ #if CONFIG_MBUF_MCACHE mbuf_drain(FALSE); @@ -5186,7 +5233,7 @@ vm_pageout(void) * Use the soft bound option for vm pageout to allow it to run on * E-cores if P-cluster is unavailable. */ - thread_soft_bind_cluster_type(self, 'P'); + (void) thread_soft_bind_cluster_type(self, 'P'); } #endif /* __AMP__ */ @@ -5197,6 +5244,8 @@ vm_pageout(void) thread_set_thread_name(current_thread(), "VM_pageout_scan"); + vm_log_handle = os_log_create("com.apple.xnu", "virtual-memory"); + /* * Initialize some paging parameters. */ @@ -5296,7 +5345,9 @@ vm_pageout(void) ethr->q = &vm_pageout_queue_external; /* in external_state these cheads are never used, they are used only in internal_state for te compressor */ ethr->current_early_swapout_chead = NULL; - ethr->current_regular_swapout_chead = NULL; + for (int reg_i = 0; reg_i < COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT; ++reg_i) { + ethr->current_regular_swapout_cheads[reg_i] = NULL; + } ethr->current_late_swapout_chead = NULL; ethr->scratch_buf = NULL; #if DEVELOPMENT || DEBUG @@ -5449,7 +5500,7 @@ vm_pageout_internal_start(void) kmem_alloc(kernel_map, &buf, bufsize * vm_pageout_state.vm_compressor_thread_count, - KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT, + KMA_DATA_SHARED | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR); for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) { @@ -5457,7 +5508,9 @@ vm_pageout_internal_start(void) iq->id = i; iq->q = &vm_pageout_queue_internal; iq->current_early_swapout_chead = NULL; - iq->current_regular_swapout_chead = NULL; + for (int reg_i = 0; reg_i < COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT; ++reg_i) { + iq->current_regular_swapout_cheads[reg_i] = NULL; + } iq->current_late_swapout_chead = NULL; iq->scratch_buf = (char *)(buf + i * bufsize); #if DEVELOPMENT || DEBUG @@ -5645,6 +5698,15 @@ upl_destroy(upl_t upl) } #endif + if (upl->flags & UPL_HAS_FS_VERIFY_INFO) { + assert(upl->u_fs_un.verify_info && upl->u_fs_un.verify_info->verify_data_len > 0 && + upl->u_fs_un.verify_info->verify_data_len <= upl_adjusted_size(upl, PAGE_MASK)); + + kfree_data(upl->u_fs_un.verify_info->verify_data_ptr, + upl->u_fs_un.verify_info->verify_data_len); + kfree_type(struct upl_fs_verify_info, upl->u_fs_un.verify_info); + } + #if UPL_DEBUG for (int i = 0; i < upl->upl_commit_index; i++) { btref_put(upl->upl_commit_records[i].c_btref); @@ -5703,13 +5765,18 @@ upl_unmark_decmp(upl_t upl) #define VM_PAGE_Q_BACKING_UP(q) \ ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10)) -boolean_t must_throttle_writes(void); - -boolean_t +static boolean_t must_throttle_writes() { if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) && vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) { + /* + * The external pageout queue is 
saturated, and there is an abundance of + * filecache on the system that VM_pageout still needs to get to. Likely the + * pageout thread is contending at the filesystem or storage layers with a + * high volume of other I/Os. Attempt to give the pageout thread a chance to + * catch up by applying a blanket throttle to all outgoing I/Os. + */ return TRUE; } @@ -5763,6 +5830,82 @@ vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp) zfree(dw_ctx_zone, ldw_ctx); } +uint64_t vm_object_upl_throttle_cnt; + +TUNABLE(uint32_t, vm_object_throttle_delay_us, + "vm_object_upl_throttle_delay_us", 1000); /* 1ms */ + +/* + * @func vm_object_upl_throttle + * + * @brief + * Throttle the current UPL request to give the external pageout thread + * a chance to catch up to system I/O demand. + * + * @discussion + * We may end up in a situation where the file-cache is large, and we need to + * evict some of it. However, the external pageout thread either can't keep up + * with demand or is contending with other I/Os for the storage device (see + * @c must_throttle_writes()). In these situations, we apply a throttle to + * outgoing writes to give the pageout thread a chance to catch up. + */ +OS_NOINLINE OS_NOT_TAIL_CALLED +static void +vm_object_upl_throttle(vm_object_t object, upl_size_t size) +{ + int delay_us = vm_object_throttle_delay_us; +#if XNU_TARGET_OS_OSX + if (memory_object_is_vnode_pager(object->pager)) { + boolean_t isSSD = FALSE; + __assert_only kern_return_t kr; + kr = vnode_pager_get_isSSD(object->pager, &isSSD); + assert3u(kr, ==, KERN_SUCCESS); + if (!isSSD) { + delay_us = 5000; /* 5 ms */ + } + } +#endif /* !XNU_TARGET_OS_OSX */ + + KDBG(VMDBG_CODE(DBG_VM_UPL_THROTTLE) | DBG_FUNC_START, VM_OBJECT_ID(object), + size, delay_us); + + if (delay_us == 0) { + goto done; + } + + vm_object_unlock(object); + + uint32_t size_pages = size >> PAGE_SHIFT; + os_atomic_inc(&vm_object_upl_throttle_cnt, relaxed); + + os_atomic_add(&vm_upl_wait_for_pages, size_pages, relaxed); + + /* + * Unconditionally block for a fixed delay interval. + * + * FIXME: This mechanism should likely be revisited. (rdar://157163748) + * + * Should there be a back-pressure mechanisms that un-throttles the I/O if the + * situation resolves? + * + * Is 1ms long enough? The original mechanism scaled the delay with the I/O + * size, but that overly penalized large I/Os (which are actually preferrable + * if device contention is the problem). + * + * Can we isolate only I/Os which are to the same device that the external + * pageout thread is stuck on? e.g. There is no reason to penalize I/Os to an + * external drive if the pageout thread is gummed up on the internal drive. 
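vm_object_upl_throttle() drops the object lock, advertises how many pages are stalled, sleeps for a fixed tunable interval (vm_object_upl_throttle_delay_us, raised to 5 ms for rotational media on macOS, 0 to disable), then retakes the lock. A user-space analogue of the fixed-delay throttle with relaxed-atomic bookkeeping; usleep() and the variable names stand in for the kernel's delay() and globals.

#include <stdatomic.h>
#include <stdint.h>
#include <unistd.h>

static _Atomic uint32_t pages_waiting;    /* analogue of vm_upl_wait_for_pages */
static _Atomic uint64_t throttle_count;   /* analogue of vm_object_upl_throttle_cnt */

static void
throttle_io(uint32_t size_pages, uint32_t delay_us)
{
    if (delay_us == 0) {
        return;   /* tunable set to zero disables the throttle entirely */
    }
    atomic_fetch_add_explicit(&throttle_count, 1, memory_order_relaxed);
    atomic_fetch_add_explicit(&pages_waiting, size_pages, memory_order_relaxed);

    usleep(delay_us);   /* fixed delay, deliberately independent of I/O size */

    atomic_fetch_sub_explicit(&pages_waiting, size_pages, memory_order_relaxed);
}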
+ */ + delay(delay_us); + + os_atomic_sub(&vm_upl_wait_for_pages, size_pages, relaxed); + + vm_object_lock(object); +done: + KDBG(VMDBG_CODE(DBG_VM_UPL_THROTTLE) | DBG_FUNC_END); +} + + /* * Routine: vm_object_upl_request * Purpose: @@ -5836,7 +5979,7 @@ vm_object_upl_request( int dw_count; int dw_limit; int io_tracking_flag = 0; - int grab_options; + vm_grab_options_t grab_options; int page_grab_count = 0; ppnum_t phys_page; pmap_flush_context pmap_flush_context_storage; @@ -5918,7 +6061,7 @@ vm_object_upl_request( if (cntrl_flags & UPL_SET_LITE) { upl->map_object = object; } else { - upl->map_object = vm_object_allocate(size); + upl->map_object = vm_object_allocate(size, object->vmo_provenance); vm_object_lock(upl->map_object); /* * No neeed to lock the new object: nobody else knows @@ -5946,7 +6089,7 @@ vm_object_upl_request( vm_object_lock(object); vm_object_activity_begin(object); - grab_options = 0; + grab_options = VM_PAGE_GRAB_OPTIONS_NONE; #if CONFIG_SECLUDED_MEMORY if (object->can_grab_secluded) { grab_options |= VM_PAGE_GRAB_SECLUDED; @@ -5965,6 +6108,10 @@ vm_object_upl_request( queue_enter(&object->uplq, upl, upl_t, uplq); } #endif + + /* remember which copy object we synchronized with */ + last_copy_object = object->vo_copy; + last_copy_version = object->vo_copy_version; if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) { /* * Honor copy-on-write obligations @@ -5987,11 +6134,6 @@ vm_object_upl_request( VM_PAGEOUT_DEBUG(upl_cow, 1); VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT)); } - /* - * remember which copy object we synchronized with - */ - last_copy_object = object->vo_copy; - last_copy_version = object->vo_copy_version; entry = 0; xfer_size = size; @@ -6004,25 +6146,7 @@ vm_object_upl_request( } if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) { - boolean_t isSSD = FALSE; - -#if !XNU_TARGET_OS_OSX - isSSD = TRUE; -#else /* !XNU_TARGET_OS_OSX */ - vnode_pager_get_isSSD(object->pager, &isSSD); -#endif /* !XNU_TARGET_OS_OSX */ - vm_object_unlock(object); - - OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); - - if (isSSD == TRUE) { - delay(1000 * size_in_pages); - } else { - delay(5000 * size_in_pages); - } - OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages); - - vm_object_lock(object); + vm_object_upl_throttle(object, size); } while (xfer_size) { @@ -6221,7 +6345,7 @@ check_busy: } } } else { - if ((cntrl_flags & UPL_WILL_MODIFY) && + while ((cntrl_flags & UPL_WILL_MODIFY) && (object->vo_copy != last_copy_object || object->vo_copy_version != last_copy_version)) { /* @@ -6245,6 +6369,10 @@ check_busy: * atomicity. We just don't want new mappings * to see both the *before* and *after* pages. 
*/ + + /* first remember the copy object we re-synced with */ + last_copy_object = object->vo_copy; + last_copy_version = object->vo_copy_version; if (object->vo_copy != VM_OBJECT_NULL) { vm_object_update( object, @@ -6259,11 +6387,6 @@ check_busy: VM_PAGEOUT_DEBUG(upl_cow_again, 1); VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT)); } - /* - * remember the copy object we synced with - */ - last_copy_object = object->vo_copy; - last_copy_version = object->vo_copy_version; } dst_page = vm_page_lookup(object, dst_offset); @@ -6327,7 +6450,7 @@ check_busy: if (dst_page != VM_PAGE_NULL) { vm_page_release(dst_page, - FALSE); + VMP_RELEASE_NONE); } dst_page = vm_object_page_grab(object); @@ -6400,10 +6523,29 @@ check_busy: dst_page->vmp_overwriting = TRUE; if (dst_page->vmp_pmapped) { +#if CONFIG_SPTM + if (__improbable(PMAP_PAGE_IS_USER_EXECUTABLE(dst_page))) { + /* + * Various buffer cache operations may need to reload the page contents + * even though the page may have an executable frame type from prior use of + * the vnode associated with the VM object. For those cases, we need to + * disconnect all mappings and reset the frame type, regardless of whether + * UPL_FILE_IO was passed here, as the SPTM will not allow writable CPU + * or IOMMU mappings of exec-typed pages. + * NOTE: It's theoretically possible that the retype here could race with + * setup/teardown of IOMMU mappings by another thread that went through + * the vm_object_iopl_request() path. I'm not sure that would ever be + * expected to happen for an exec page in practice though. If it does + * happen, we may need to change vm_page_do_delayed_work() to forbid all + * IOPLs against executable pages rather than only writable ones. + */ + refmod_state = pmap_disconnect_options(phys_page, PMAP_OPTIONS_RETYPE, NULL); + } else +#endif /* CONFIG_SPTM */ if (!(cntrl_flags & UPL_FILE_IO)) { /* * eliminate all mappings from the - * original object and its prodigy + * original object and its progeny */ refmod_state = pmap_disconnect(phys_page); } else { @@ -6608,8 +6750,9 @@ try_next_page: VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, DBG_VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); if (task != NULL) { - counter_add(&task->pages_grabbed_upl, page_grab_count); + ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count); } + counter_add(&vm_page_grab_count_upl, page_grab_count); if (dwp_start && dwp_finish_ctx) { vm_page_delayed_work_finish_ctx(dwp_start); @@ -6623,6 +6766,93 @@ int cs_executable_create_upl = 0; extern int proc_selfpid(void); extern char *proc_name_address(void *p); +/** + * Helper for determining whether a writable (!UPL_COPYOUT_FROM) UPL is allowed for a given VA region. + * This is determined not only by the allowed permissions in the relevant vm_map_entry, but also by + * the code integrity enforcement model present on the system. + * + * @param map VM map against which the UPL is being populated. + * @param entry The source vm_map_entry in [map] against which the UPL is being populated. + * @param offset Base offset of UPL request in [map], for debugging purposes. + * + * @return True if the writable UPL is allowed for [entry], false otherwise. + */ +static bool +vme_allows_upl_write( + vm_map_t map __unused, + vm_map_entry_t entry, + vm_map_address_t offset __unused) +{ + if (!(entry->protection & VM_PROT_WRITE)) { + return false; + } +#if CONFIG_SPTM + /* + * For SPTM configurations, reject any attempt to create a writable UPL against any executable + * region. 
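A subtle pair of changes in vm_object_upl_request(): the last_copy_object/last_copy_version snapshot is now taken before each copy-on-write push (not after), and the per-page re-check becomes a while loop, so a copy object that appears or changes version while the object lock was dropped gets synchronized again instead of being raced past. Consolidated from the hunks above and below, the shape is roughly:

/* snapshot first, so a racing change to vo_copy is detected */
last_copy_object = object->vo_copy;
last_copy_version = object->vo_copy_version;
if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
    /* honor copy-on-write obligations: vm_object_update(...) */
}

/* ... later, in the per-page loop ... */
while ((cntrl_flags & UPL_WILL_MODIFY) &&
    (object->vo_copy != last_copy_object ||
    object->vo_copy_version != last_copy_version)) {
    /* re-snapshot, then re-sync with the (new) copy object */
    last_copy_object = object->vo_copy;
    last_copy_version = object->vo_copy_version;
    /* vm_object_update(...) if vo_copy != VM_OBJECT_NULL */
}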
Even in cases such as JIT/USER_DEBUG in which the vm_map_entry may allow write + * access, the SPTM/TXM codesigning model still forbids writable DMA mappings of these pages. + */ + if ((entry->protection & VM_PROT_EXECUTE) || entry->vme_xnu_user_debug) { + vm_map_guard_exception(offset, kGUARD_EXC_SEC_UPL_WRITE_ON_EXEC_REGION); + ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, + KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_UPL_WRITE_ON_EXEC_REGION), (uintptr_t)offset); + return false; + } +#endif /* CONFIG_SPTM */ + return true; +} + +/** + * Helper for determining whether a read-only (UPL_COPYOUT_FROM) UPL is allowed for a given VA region, + * possibly with the additional requirement of creating a kernel copy of the source buffer. + * This is determined by the code integrity enforcement model present on the system. + * + * @param map VM map against which the UPL is being populated. + * @param entry The source vm_map_entry in [map] against which the UPL is being populated. + * @param offset Base offset of UPL request in [map], for debugging purposes. + * @param copy_required Output parameter indicating whether the UPL should be created against a kernel + * copy of the source data. + * + * @return True if the read-only UPL is allowed for [entry], false otherwise. + */ +static bool +vme_allows_upl_read( + vm_map_t map __unused, + vm_map_entry_t entry __unused, + vm_map_address_t offset __unused, + bool *copy_required) +{ + assert(copy_required != NULL); + *copy_required = false; +#if CONFIG_SPTM + /* + * For SPTM configs, always create a copy when attempting a read-only I/O operation against an + * executable or debug (which may become executable) mapping. The SPTM's stricter security + * enforcements against DMA mappings of executable pages may otherwise trigger an SPTM violation + * panic. We expect the added cost of this copy to be manageable as DMA mappings of executable + * regions are rare in practice. + */ + if ((map->pmap != kernel_pmap) && + ((entry->protection & VM_PROT_EXECUTE) || entry->vme_xnu_user_debug)) { + *copy_required = true; + } +#endif /* CONFIG_SPTM */ +#if !XNU_TARGET_OS_OSX + /* + * For all non-Mac targets, create a copy when attempting a read-only I/O operation against a + * read-only executable region. These regions are likely to be codesigned and are typically + * mapped CoW; our wire operation will be treated as a proactive CoW fault which will copy the + * backing pages and thus cause them to no longer be codesigned. 
+ */ + if (map->pmap != kernel_pmap && + (entry->protection & VM_PROT_EXECUTE) && + !(entry->protection & VM_PROT_WRITE)) { + *copy_required = true; + } +#endif /* !XNU_TARGET_OS_OSX */ + return true; +} + kern_return_t vm_map_create_upl( vm_map_t map, @@ -6646,16 +6876,10 @@ vm_map_create_upl( vm_map_size_t original_size, adjusted_size; vm_map_offset_t local_entry_start; vm_object_offset_t local_entry_offset; - vm_object_offset_t offset_in_mapped_page; boolean_t release_map = FALSE; - + vmlp_api_start(VM_MAP_CREATE_UPL); start_with_map: - - original_offset = offset; - original_size = *upl_size; - adjusted_size = original_size; - caller_flags = *flags; if (caller_flags & ~UPL_VALID_FLAGS) { @@ -6666,14 +6890,20 @@ start_with_map: ret = KERN_INVALID_VALUE; goto done; } - force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC); - sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM); if (upl == NULL) { ret = KERN_INVALID_ARGUMENT; goto done; } + + original_offset = offset; + original_size = *upl_size; + adjusted_size = original_size; + + force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC); + sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM); + REDISCOVER_ENTRY: vm_map_lock_read(map); @@ -6683,6 +6913,10 @@ REDISCOVER_ENTRY: goto done; } + if (!entry->is_sub_map) { + vmlp_range_event_entry(map, entry); + } + local_entry_start = entry->vme_start; local_entry_offset = VME_OFFSET(entry); @@ -6715,19 +6949,7 @@ REDISCOVER_ENTRY: goto done; } - offset_in_mapped_page = 0; - if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) { - offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map)); - *upl_size = (upl_size_t) - (vm_map_round_page(original_offset + adjusted_size, - VM_MAP_PAGE_MASK(map)) - - offset); - - offset_in_mapped_page = original_offset - offset; - assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map)); - - DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page); - } + bool copy_required = false; if (!entry->is_sub_map) { if (VME_OBJECT(entry) == VM_OBJECT_NULL || @@ -6754,7 +6976,7 @@ REDISCOVER_ENTRY: VME_OBJECT_SET(entry, vm_object_allocate((vm_size_t) - vm_object_round_page((entry->vme_end - entry->vme_start))), + vm_object_round_page((entry->vme_end - entry->vme_start)), map->serial_id), false, 0); VME_OFFSET_SET(entry, 0); assert(entry->use_pmap); @@ -6762,28 +6984,22 @@ REDISCOVER_ENTRY: vm_map_lock_write_to_read(map); } - if (!(caller_flags & UPL_COPYOUT_FROM) && - !(entry->protection & VM_PROT_WRITE)) { + if (((caller_flags & UPL_COPYOUT_FROM) && !vme_allows_upl_read(map, entry, offset, ©_required)) || + (!(caller_flags & UPL_COPYOUT_FROM) && !vme_allows_upl_write(map, entry, offset))) { vm_map_unlock_read(map); ret = KERN_PROTECTION_FAILURE; goto done; } } -#if !XNU_TARGET_OS_OSX - if (map->pmap != kernel_pmap && - (caller_flags & UPL_COPYOUT_FROM) && - (entry->protection & VM_PROT_EXECUTE) && - !(entry->protection & VM_PROT_WRITE)) { + if (__improbable(copy_required)) { vm_offset_t kaddr; vm_size_t ksize; /* - * We're about to create a read-only UPL backed by - * memory from an executable mapping. - * Wiring the pages would result in the pages being copied - * (due to the "MAP_PRIVATE" mapping) and no longer - * code-signed, so no longer eligible for execution. 
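The two helpers above replace the inline protection checks in vm_map_create_upl(); the caller-side shape, consolidated from the surrounding hunks, is to reject the UPL outright when policy forbids it and to fall back to a kernel-buffer copy when the read-only path sets copy_required:

bool copy_required = false;

if (((caller_flags & UPL_COPYOUT_FROM) &&
        !vme_allows_upl_read(map, entry, offset, &copy_required)) ||
    (!(caller_flags & UPL_COPYOUT_FROM) &&
        !vme_allows_upl_write(map, entry, offset))) {
    vm_map_unlock_read(map);
    ret = KERN_PROTECTION_FAILURE;
    goto done;
}

/* copy_required == true: build the UPL against a pageable kernel copy of the
 * user data (kmem_alloc + copyinmap + vm_map_create_upl(kernel_map, ...)) so
 * codesigned/executable pages are never wired for I/O directly. */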
+ * Depending on the device configuration, wiring certain pages + * for I/O may violate the security policy for codesigning-related + * reasons. * Instead, let's copy the data into a kernel buffer and * create the UPL from this kernel buffer. * The kernel buffer is then freed, leaving the UPL holding @@ -6797,7 +7013,7 @@ REDISCOVER_ENTRY: ksize = round_page(*upl_size); kaddr = 0; ret = kmem_alloc(kernel_map, &kaddr, ksize, - KMA_PAGEABLE | KMA_DATA, tag); + KMA_PAGEABLE | KMA_DATA_SHARED, tag); if (ret == KERN_SUCCESS) { /* copyin the user data */ ret = copyinmap(map, offset, (void *)kaddr, *upl_size); @@ -6810,17 +7026,8 @@ REDISCOVER_ENTRY: ksize - *upl_size); } /* create the UPL from the kernel buffer */ - vm_object_offset_t offset_in_object; - vm_object_offset_t offset_in_object_page; - - offset_in_object = offset - local_entry_start + local_entry_offset; - offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object); - assert(offset_in_object_page < PAGE_SIZE); - assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE); - *upl_size -= offset_in_object_page + offset_in_mapped_page; ret = vm_map_create_upl(kernel_map, - (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page), - upl_size, upl, page_list, count, flags, tag); + (vm_map_address_t)kaddr, upl_size, upl, page_list, count, flags, tag); } if (kaddr != 0) { /* free the kernel buffer */ @@ -6837,7 +7044,6 @@ REDISCOVER_ENTRY: #endif /* DEVELOPMENT || DEBUG */ goto done; } -#endif /* !XNU_TARGET_OS_OSX */ if (!entry->is_sub_map) { local_object = VME_OBJECT(entry); @@ -6991,9 +7197,7 @@ REDISCOVER_ENTRY: vm_map_reference(submap); vm_map_unlock_read(map); - DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap); - offset += offset_in_mapped_page; - *upl_size -= offset_in_mapped_page; + DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, submap); if (release_map) { vm_map_deallocate(map); @@ -7125,10 +7329,6 @@ REDISCOVER_ENTRY: vm_map_unlock_read(map); - offset += offset_in_mapped_page; - assert(*upl_size > offset_in_mapped_page); - *upl_size -= offset_in_mapped_page; - ret = vm_object_iopl_request(local_object, ((vm_object_offset_t) ((offset - local_start) + local_offset)), @@ -7140,12 +7340,12 @@ REDISCOVER_ENTRY: tag); vm_object_deallocate(local_object); - done: if (release_map) { vm_map_deallocate(map); } + vmlp_api_end(VM_MAP_CREATE_UPL, ret); return ret; } @@ -7172,7 +7372,6 @@ vm_map_enter_upl_range( int isVectorUPL = 0, curr_upl = 0; upl_t vector_upl = NULL; mach_vm_offset_t vector_upl_dst_addr = 0; - vm_map_t vector_upl_submap = NULL; upl_offset_t subupl_offset = 0; upl_size_t subupl_size = 0; @@ -7212,12 +7411,17 @@ vm_map_enter_upl_range( panic("TODO4K: vector UPL not implemented"); } - vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr, - vector_upl->u_size, VM_MAP_CREATE_DEFAULT, - VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA, - VM_KERN_MEMORY_NONE).kmr_submap; - map = vector_upl_submap; - vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr); + kern_return_t kr2; + vm_offset_t alloc_addr = 0; + kr2 = vm_allocate(map, &alloc_addr, vector_upl->u_size, VM_FLAGS_ANYWHERE); + if 
(kr2 != KERN_SUCCESS) { + os_log(OS_LOG_DEFAULT, "%s: vm_allocate(0x%x) -> %d", + __func__, vector_upl->u_size, kr2); + upl_unlock(vector_upl); + return kr2; + } + vector_upl_dst_addr = alloc_addr; + vector_upl_set_addr(vector_upl, vector_upl_dst_addr); curr_upl = 0; } else { upl_lock(upl); @@ -7257,7 +7461,10 @@ process_upl_to_enter: size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); object = upl->map_object; - upl->map_object = vm_object_allocate(vm_object_round_page(size)); + upl->map_object = vm_object_allocate( + vm_object_round_page(size), + /* Provenance is copied from the object we're shadowing */ + object->vmo_provenance); vm_object_lock(upl->map_object); @@ -7348,7 +7555,7 @@ process_upl_to_enter: * NEED A UPL_MAP ALIAS */ kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0, - VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK), + VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK), upl->map_object, offset, FALSE, prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT); @@ -7359,7 +7566,9 @@ process_upl_to_enter: } } else { kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0, - VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK), + VM_MAP_KERNEL_FLAGS_FIXED( + .vm_tag = VM_KERN_MEMORY_OSFMK, + .vmf_overwrite = true), upl->map_object, offset, FALSE, prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT); if (kr) { @@ -7413,7 +7622,6 @@ process_upl_to_enter: addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map))); if (addr_adjustment) { - assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK); DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment)); *dst_addr += addr_adjustment; } @@ -7491,13 +7699,18 @@ vm_map_remove_upl_range( process_upl_to_remove: if (isVectorUPL) { if (curr_upl == vector_upl_max_upls(vector_upl)) { - vm_map_t v_upl_submap; - vm_offset_t v_upl_submap_dst_addr; - vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr); + vm_offset_t v_upl_dst_addr; + kern_return_t kr; + vector_upl_get_addr(vector_upl, &v_upl_dst_addr); - kmem_free_guard(map, v_upl_submap_dst_addr, - vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP); - vm_map_deallocate(v_upl_submap); + kr = vm_deallocate(map, v_upl_dst_addr, vector_upl->u_size); + if (kr != KERN_SUCCESS) { + os_log(OS_LOG_DEFAULT, "%s: vm_deallocate(0x%llx, 0x%x) -> %d", + __func__, (uint64_t)v_upl_dst_addr, + vector_upl->u_size, kr); + } + v_upl_dst_addr = 0; + vector_upl_set_addr(vector_upl, v_upl_dst_addr); upl_unlock(vector_upl); return KERN_SUCCESS; } @@ -7522,8 +7735,8 @@ process_upl_to_remove: if (isVectorUPL) { /* * If it's a Vectored UPL, we'll be removing the entire - * submap anyways, so no need to remove individual UPL - * element mappings from within the submap + * address range anyway, so no need to remove individual UPL + * element mappings from within the range */ goto process_upl_to_remove; } @@ -7628,6 +7841,7 @@ iopl_valid_data( assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); assert(m->vmp_wire_count == 0); m->vmp_wire_count++; + m->vmp_iopl_wired = true; assert(m->vmp_wire_count); if (m->vmp_wire_count == 1) { m->vmp_q_state = VM_PAGE_IS_WIRED; @@ -7704,6 +7918,9 @@ vm_object_iopl_wire_full( while (page_count--) { if (dst_page->vmp_busy || +#if CONFIG_SPTM + PMAP_PAGE_IS_USER_EXECUTABLE(dst_page) || +#endif vm_page_is_fictitious(dst_page) || dst_page->vmp_absent || VMP_ERROR_GET(dst_page) || @@ -7720,6 +7937,7 
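Vector UPL mappings no longer live in a dedicated kmem suballocation: vm_map_enter_upl_range() simply reserves an address range in the target map with vm_allocate() and records it on the vector UPL, and vm_map_remove_upl_range() deallocates that range and clears the recorded address. The paired lifecycle, pulled together from the hunks above as a sketch:

/* enter: reserve VA for the whole vector UPL and remember where it went */
vm_offset_t addr = 0;
kr = vm_allocate(map, &addr, vector_upl->u_size, VM_FLAGS_ANYWHERE);
if (kr != KERN_SUCCESS) {
    return kr;          /* allocation failure is now surfaced to the caller */
}
vector_upl_set_addr(vector_upl, addr);

/* remove: release the range and reset the recorded address to 0 */
vector_upl_get_addr(vector_upl, &addr);
(void)vm_deallocate(map, addr, vector_upl->u_size);
vector_upl_set_addr(vector_upl, 0);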
@@ vm_object_iopl_wire_full( dst_page->vmp_reference = TRUE; vm_page_wire(dst_page, tag, FALSE); + dst_page->vmp_iopl_wired = true; if (!(cntrl_flags & UPL_COPYOUT_FROM)) { SET_PAGE_DIRTY(dst_page, FALSE); @@ -7776,16 +7994,16 @@ vm_object_iopl_wire_empty( int page_count, int *page_grab_count) { - vm_page_t dst_page; - boolean_t no_zero_fill = FALSE; - int interruptible; - int pages_wired = 0; - int pages_inserted = 0; - int entry = 0; - uint64_t delayed_ledger_update = 0; - kern_return_t ret = KERN_SUCCESS; - int grab_options; - ppnum_t phys_page; + vm_page_t dst_page; + boolean_t no_zero_fill = FALSE; + int interruptible; + int pages_wired = 0; + int pages_inserted = 0; + int entry = 0; + uint64_t delayed_ledger_update = 0; + kern_return_t ret = KERN_SUCCESS; + vm_grab_options_t grab_options; + ppnum_t phys_page; vm_object_lock_assert_exclusive(object); assert(object->purgable != VM_PURGABLE_VOLATILE); @@ -7804,7 +8022,7 @@ vm_object_iopl_wire_empty( no_zero_fill = TRUE; } - grab_options = 0; + grab_options = VM_PAGE_GRAB_OPTIONS_NONE; #if CONFIG_SECLUDED_MEMORY if (object->can_grab_secluded) { grab_options |= VM_PAGE_GRAB_SECLUDED; @@ -7844,6 +8062,7 @@ vm_object_iopl_wire_empty( assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q); assert(dst_page->vmp_wire_count == 0); dst_page->vmp_wire_count++; + dst_page->vmp_iopl_wired = true; dst_page->vmp_q_state = VM_PAGE_IS_WIRED; assert(dst_page->vmp_wire_count); pages_wired++; @@ -8003,6 +8222,7 @@ vm_object_iopl_request( task_t task = current_task(); dwp_start = dwp = NULL; + *upl_ptr = NULL; vm_object_offset_t original_offset = offset; upl_size_t original_size = size; @@ -8022,7 +8242,7 @@ vm_object_iopl_request( */ return KERN_INVALID_VALUE; } - if (vm_lopage_needed == FALSE) { + if (!vm_lopage_needed) { cntrl_flags &= ~UPL_NEED_32BIT_ADDR; } @@ -8176,8 +8396,9 @@ vm_object_iopl_request( VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0); if (task != NULL) { - counter_add(&task->pages_grabbed_iopl, page_grab_count); + ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count); } + counter_add(&vm_page_grab_count_iopl, page_grab_count); return KERN_SUCCESS; } if (!is_kernel_object(object) && object != compressor_object) { @@ -8492,9 +8713,8 @@ memory_error: vm_pageout_steal_laundry(dst_page, FALSE); } - if ( - ((cntrl_flags & UPL_NEED_32BIT_ADDR) && - phys_page >= (max_valid_dma_address >> PAGE_SHIFT))) { + if ((cntrl_flags & UPL_NEED_32BIT_ADDR) && + phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) { vm_page_t new_page; int refmod; @@ -8511,9 +8731,7 @@ memory_error: goto return_err; } - { - new_page = vm_page_grablo(); - } + new_page = vm_page_grablo(VM_PAGE_GRAB_OPTIONS_NONE); if (new_page == VM_PAGE_NULL) { ret = KERN_RESOURCE_SHORTAGE; @@ -8562,7 +8780,21 @@ memory_error: phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); } if (!dst_page->vmp_busy) { - dwp->dw_mask |= DW_vm_page_wire; + /* + * Specify that we're wiring the page for I/O, which also means + * that the delayed work handler may return KERN_PROTECTION_FAILURE + * on certain configs if a page's mapping state doesn't allow I/O + * wiring. For the specifc case in which we're creating an IOPL + * against an executable mapping, the buffer copy performed by + * vm_map_create_upl() should prevent failure here, but we still + * want to gracefully fail here if someone attempts to I/O-wire + * an executable page through a named entry or non-executable + * alias mapping. 
+ */ + dwp->dw_mask |= (DW_vm_page_wire | DW_vm_page_iopl_wire); + if (!(cntrl_flags & UPL_COPYOUT_FROM)) { + dwp->dw_mask |= DW_vm_page_iopl_wire_write; + } } if (cntrl_flags & UPL_BLOCK_ACCESS) { @@ -8653,19 +8885,25 @@ skip_page: VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count); if (dw_count >= dw_limit) { - vm_page_do_delayed_work(object, tag, dwp_start, dw_count); + ret = vm_page_do_delayed_work(object, tag, dwp_start, dw_count); dwp = dwp_start; dw_count = 0; + if (ret != KERN_SUCCESS) { + goto return_err; + } } } } assert(entry == size_in_pages); if (dw_count) { - vm_page_do_delayed_work(object, tag, dwp_start, dw_count); + ret = vm_page_do_delayed_work(object, tag, dwp_start, dw_count); dwp = dwp_start; dw_count = 0; + if (ret != KERN_SUCCESS) { + goto return_err; + } } finish: if (user_page_list && set_cache_attr_needed == TRUE) { @@ -8692,14 +8930,17 @@ finish: PMAP_NULL, PAGE_SIZE, 0, VM_PROT_NONE); + vm_object_lock(object); assert(!object->blocked_access); object->blocked_access = TRUE; + vm_object_unlock(object); } VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0); if (task != NULL) { - counter_add(&task->pages_grabbed_iopl, page_grab_count); + ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count); } + counter_add(&vm_page_grab_count_iopl, page_grab_count); if (dwp_start && dwp_finish_ctx) { vm_page_delayed_work_finish_ctx(dwp_start); @@ -8791,11 +9032,13 @@ return_err: } vm_object_unlock(object); upl_destroy(upl); + *upl_ptr = NULL; VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, DBG_VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0); if (task != NULL) { - counter_add(&task->pages_grabbed_iopl, page_grab_count); + ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count); } + counter_add(&vm_page_grab_count_iopl, page_grab_count); if (dwp_start && dwp_finish_ctx) { vm_page_delayed_work_finish_ctx(dwp_start); @@ -8950,7 +9193,7 @@ vm_paging_map_init(void) { kmem_alloc(kernel_map, &vm_paging_base_address, ptoa(VM_PAGING_NUM_PAGES), - KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE, + KMA_DATA_SHARED | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE, VM_KERN_MEMORY_NONE); } STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init); @@ -9116,7 +9359,7 @@ vm_paging_map_object( address, map_size, 0, - VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(), + VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(), object, object_offset, FALSE, @@ -9267,7 +9510,6 @@ vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked) upl_t vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls) { - int i = 0; upl_t upl; assert(max_upls > 0); @@ -9278,22 +9520,14 @@ vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls) if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) { max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT; } - vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL); + vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL | Z_ZERO); upl = upl_create(0, UPL_VECTOR, 0); upl->vector_upl = vector_upl; upl->u_offset = upl_offset; - vector_upl->size = 0; vector_upl->offset = upl_offset; - vector_upl->invalid_upls = 0; - vector_upl->num_upls = 0; - vector_upl->pagelist = NULL; vector_upl->max_upls = max_upls; - for (i = 0; i < max_upls; i++) { - vector_upl->upls[i].iostate.size = 0; - vector_upl->upls[i].iostate.offset = 0; - } return upl; } 
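Because vm_page_do_delayed_work() can now refuse to I/O-wire a page (returning an error on SPTM configurations for executable pages, per the new DW_vm_page_iopl_wire/DW_vm_page_iopl_wire_write flags), every flush of the delayed-work batch has to check its return value. The call shape used above, shown on its own for clarity:

if (dw_count >= dw_limit) {
    ret = vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
    dwp = dwp_start;
    dw_count = 0;
    if (ret != KERN_SUCCESS) {
        goto return_err;   /* unwinds the partially built IOPL */
    }
}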
@@ -9477,36 +9711,40 @@ vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_ } void -vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr) +vector_upl_get_addr(upl_t upl, vm_offset_t *dst_addr) { - *v_upl_submap = NULL; - if (vector_upl_is_valid(upl)) { vector_upl_t vector_upl = upl->vector_upl; if (vector_upl) { - *v_upl_submap = vector_upl->submap; - *submap_dst_addr = vector_upl->submap_dst_addr; + assert(vector_upl->dst_addr != 0); + *dst_addr = vector_upl->dst_addr; } else { - panic("vector_upl_get_submap was passed a non-vectored UPL"); + panic("%s was passed a non-vectored UPL", __func__); } } else { - panic("vector_upl_get_submap was passed a null UPL"); + panic("%s was passed a null UPL", __func__); } } void -vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr) +vector_upl_set_addr(upl_t upl, vm_offset_t dst_addr) { if (vector_upl_is_valid(upl)) { vector_upl_t vector_upl = upl->vector_upl; if (vector_upl) { - vector_upl->submap = submap; - vector_upl->submap_dst_addr = submap_dst_addr; + if (dst_addr) { + /* setting a new value: do not overwrite an old one */ + assert(vector_upl->dst_addr == 0); + } else { + /* resetting: make sure there was an old value */ + assert(vector_upl->dst_addr != 0); + } + vector_upl->dst_addr = dst_addr; } else { - panic("vector_upl_get_submap was passed a non-vectored UPL"); + panic("%s was passed a non-vectored UPL", __func__); } } else { - panic("vector_upl_get_submap was passed a NULL UPL"); + panic("%s was passed a NULL UPL", __func__); } } @@ -9740,6 +9978,12 @@ upl_page_get_mark(upl_page_info_t *upl, int index) return upl[index].mark; } +boolean_t +upl_page_is_needed(upl_page_info_t *upl, int index) +{ + return upl[index].needed; +} + void vm_countdirtypages(void) { @@ -9946,13 +10190,68 @@ upl_get_data_offset( upl_t upl_associated_upl(upl_t upl) { - return upl->associated_upl; + if (!(upl->flags & UPL_HAS_FS_VERIFY_INFO)) { + return upl->u_fs_un.associated_upl; + } + return NULL; } void upl_set_associated_upl(upl_t upl, upl_t associated_upl) { - upl->associated_upl = associated_upl; + assert(!(upl->flags & UPL_HAS_FS_VERIFY_INFO)); + upl->u_fs_un.associated_upl = associated_upl; +} + +bool +upl_has_fs_verify_info(upl_t upl) +{ + return upl->flags & UPL_HAS_FS_VERIFY_INFO; +} + +void +upl_set_fs_verify_info(upl_t upl, uint32_t size) +{ + struct upl_fs_verify_info *fs_verify_infop; + + if (upl->flags & UPL_HAS_FS_VERIFY_INFO || !size) { + return; + } + + fs_verify_infop = kalloc_type(struct upl_fs_verify_info, Z_WAITOK); + fs_verify_infop->verify_data_ptr = kalloc_data(size, Z_WAITOK); + fs_verify_infop->verify_data_len = size; + + upl_lock(upl); + if (upl->flags & UPL_HAS_FS_VERIFY_INFO) { + upl_unlock(upl); + + assert(upl->u_fs_un.verify_info && + upl->u_fs_un.verify_info->verify_data_len > 0 && + upl->u_fs_un.verify_info->verify_data_len <= upl_adjusted_size(upl, PAGE_MASK)); + + kfree_data(fs_verify_infop->verify_data_ptr, size); + kfree_type(struct upl_fs_verify_info, fs_verify_infop); + } else { + upl->flags |= UPL_HAS_FS_VERIFY_INFO; + upl->u_fs_un.verify_info = fs_verify_infop; + + upl_unlock(upl); + } +} + +uint8_t * +upl_fs_verify_buf(upl_t upl, uint32_t *size) +{ + assert(size); + + if (!(upl->flags & UPL_HAS_FS_VERIFY_INFO)) { + *size = 0; + return NULL; + } + + *size = upl->u_fs_un.verify_info->verify_data_len; + return upl->u_fs_un.verify_info->verify_data_ptr; } struct vnode * @@ -10082,16 +10381,19 @@ move_pages_to_queue( vm_object_t curr_object 
= VM_OBJECT_NULL; *pages_moved = 0; + vmlp_api_start(MOVE_PAGES_TO_QUEUE); if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) { /* * We don't currently support benchmarking maps with a different page size * than the kernel. */ + vmlp_api_end(MOVE_PAGES_TO_QUEUE, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } if (os_add_overflow(start_addr, buffer_size, &end_addr)) { + vmlp_api_end(MOVE_PAGES_TO_QUEUE, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -10106,6 +10408,9 @@ move_pages_to_queue( err = KERN_INVALID_ARGUMENT; break; } + + vmlp_range_event_entry(map, curr_entry); + curr_object = VME_OBJECT(curr_entry); if (curr_object) { vm_object_lock(curr_object); @@ -10161,6 +10466,7 @@ move_pages_to_queue( vm_object_unlock(curr_object); } vm_map_unlock_read(map); + vmlp_api_end(MOVE_PAGES_TO_QUEUE, err); return err; } diff --git a/osfmk/vm/vm_pageout_xnu.h b/osfmk/vm/vm_pageout_xnu.h index 1394eff19..591e76763 100644 --- a/osfmk/vm/vm_pageout_xnu.h +++ b/osfmk/vm/vm_pageout_xnu.h @@ -58,6 +58,9 @@ extern upl_t upl_associated_upl(upl_t upl); extern void upl_set_associated_upl(upl_t upl, upl_t associated_upl); extern void upl_set_map_exclusive(upl_t upl); extern void upl_clear_map_exclusive(upl_t upl); +extern void upl_set_fs_verify_info(upl_t upl, uint32_t size_per_page); +extern bool upl_has_fs_verify_info(upl_t upl); +extern uint8_t * upl_fs_verify_buf(upl_t upl, uint32_t *size); #include @@ -83,8 +86,8 @@ extern kern_return_t vm_map_create_upl( vm_tag_t tag); extern void vm_page_free_list( - vm_page_t mem, - boolean_t prepare_object); + vm_page_t mem, + bool prepare_object); extern kern_return_t vm_page_alloc_list( vm_size_t page_count, @@ -166,8 +169,7 @@ struct _vector_upl { uint32_t num_upls; uint32_t invalid_upls; uint32_t max_upls; - vm_map_t submap; - vm_offset_t submap_dst_addr; + vm_offset_t dst_addr; vm_object_offset_t offset; upl_page_info_array_t pagelist; struct { @@ -198,6 +200,10 @@ struct upl_io_completion { int io_error; }; +struct upl_fs_verify_info { + uint8_t *verify_data_ptr; /* verification data (hashes) for the data pages in the UPL */ + uint32_t verify_data_len; /* the digest size per page (can vary depending on the type of hash) */ +}; struct upl { decl_lck_mtx_data(, Lock); /* Synchronization */ @@ -216,7 +222,10 @@ struct upl { vm_offset_t kaddr; /* secondary mapping in kernel */ vm_object_t map_object; vector_upl_t vector_upl; - upl_t associated_upl; + union { + upl_t associated_upl; + struct upl_fs_verify_info *verify_info; /* verification data for the data pages in the UPL */ + } u_fs_un; struct upl_io_completion *upl_iodone; ppnum_t highest_page; #if CONFIG_IOSCHED @@ -265,7 +274,8 @@ struct upl { #define UPL_DECMP_REQ 0x80000 #define UPL_DECMP_REAL_IO 0x100000 #define UPL_MAP_EXCLUSIVE_WAIT 0x200000 -#define UPL_HAS_WIRED 0x400000 +#define UPL_HAS_FS_VERIFY_INFO 0x400000 +#define UPL_HAS_WIRED 0x800000 /* flags for upl_create flags parameter */ #define UPL_CREATE_EXTERNAL 0 @@ -275,8 +285,8 @@ struct upl { #define UPL_CREATE_EXPEDITE_SUP 0x8 extern void vector_upl_deallocate(upl_t); -extern void vector_upl_set_submap(upl_t, vm_map_t, vm_offset_t); -extern void vector_upl_get_submap(upl_t, vm_map_t*, vm_offset_t*); +extern void vector_upl_set_addr(upl_t, vm_offset_t); +extern void vector_upl_get_addr(upl_t, vm_offset_t*); extern void vector_upl_get_iostate(upl_t, upl_t, upl_offset_t*, upl_size_t*); extern void vector_upl_get_iostate_byindex(upl_t, uint32_t, upl_offset_t*, upl_size_t*); extern upl_t vector_upl_subupl_byindex(upl_t, uint32_t); @@ -368,7 
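upl_set_fs_verify_info()/upl_fs_verify_buf() give a filesystem a place to stash per-page verification data (hashes) directly on the UPL; the storage shares space with the associated-UPL pointer via the u_fs_un union, is flagged by UPL_HAS_FS_VERIFY_INFO, and is freed in upl_destroy(). A hedged usage sketch; the digest size and the way the caller derives the total length are illustrative assumptions, not dictated by this header.

uint32_t digest_len = 32;   /* hypothetical: one 32-byte hash per page */
uint32_t npages = (uint32_t)(upl_adjusted_size(upl, PAGE_MASK) >> PAGE_SHIFT);

/* Attach verification storage; a second call is a no-op because of the
 * UPL_HAS_FS_VERIFY_INFO flag check, so racing setters are tolerated. */
upl_set_fs_verify_info(upl, digest_len * npages);

uint32_t buf_len = 0;
uint8_t *buf = upl_fs_verify_buf(upl, &buf_len);
if (buf != NULL) {
    /* buf_len bytes of hash storage, owned by the UPL until upl_destroy() */
}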
+378,6 @@ struct vm_pageout_vminfo { unsigned long vm_pageout_forcereclaimed_sharedcache; unsigned long vm_pageout_protected_realtime; unsigned long vm_pageout_forcereclaimed_realtime; - }; extern struct vm_pageout_vminfo vm_pageout_vminfo; @@ -464,7 +473,7 @@ struct pgo_iothread_state { struct vm_pageout_queue *q; // cheads unused by external thread void *current_early_swapout_chead; - void *current_regular_swapout_chead; + void *current_regular_swapout_cheads[COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT]; void *current_late_swapout_chead; char *scratch_buf; int id; diff --git a/osfmk/vm/vm_phantom_cache.c b/osfmk/vm/vm_phantom_cache.c index ffe11a908..51550fa5a 100644 --- a/osfmk/vm/vm_phantom_cache.c +++ b/osfmk/vm/vm_phantom_cache.c @@ -187,6 +187,14 @@ vm_phantom_cache_add_ghost(vm_page_t m) if (vm_phantom_cache_num_entries == 0) { return; } + if (object->pager == MEMORY_OBJECT_NULL) { + /* + * This object must have lost its memory object due to a force-unmount + * or ungraft, for example; this page won't come back, so no need to + * track it. + */ + return; + } pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK]; diff --git a/osfmk/vm/vm_pmap.c b/osfmk/vm/vm_pmap.c index 59609ac8e..c7457720f 100644 --- a/osfmk/vm/vm_pmap.c +++ b/osfmk/vm/vm_pmap.c @@ -155,3 +155,40 @@ unified_page_list_iterator_page( return phys_page; } + +#if XNU_VM_HAS_LINEAR_PAGES_ARRAY + +/** + * Attempts to resolve the canonical VM page for the current position of a page list iter + * + * @note The behavior of this function is undefined if the iterator is already at or + * beyond the end of the page list. + * + * @param iter The iterator from which to extract the current page. + * + * @return The canonical vm_page_t for the current iterator position or + * VM_PAGE_NULL (if the page isn't managed and is part of an UPL array). + */ +__attribute__((always_inline)) +vm_page_t +unified_page_list_iterator_vm_page( + const unified_page_list_iterator_t *iter) +{ + vm_page_t page = VM_PAGE_NULL; + ppnum_t phys_page; + + switch (iter->list->type) { + case UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY: + phys_page = iter->list->upl.upl_info[iter->upl_index].phys_addr; + page = vm_page_find_canonical(phys_page); + break; + case UNIFIED_PAGE_LIST_TYPE_VM_PAGE_LIST: + case UNIFIED_PAGE_LIST_TYPE_VM_PAGE_OBJ_Q: + case UNIFIED_PAGE_LIST_TYPE_VM_PAGE_FIFO_Q: + page = iter->pageq_pos; + break; + } + return page; +} + +#endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */ diff --git a/osfmk/vm/vm_protos.h b/osfmk/vm/vm_protos.h index 1e6d32d41..6616b9d5c 100644 --- a/osfmk/vm/vm_protos.h +++ b/osfmk/vm/vm_protos.h @@ -33,6 +33,7 @@ #include #include +#include #include #ifdef __cplusplus @@ -64,7 +65,7 @@ extern mach_port_name_t ipc_port_copyout_send_pinned( extern kern_return_t mach_port_deallocate_kernel( ipc_space_t space, mach_port_name_t name, - natural_t kotype); + ipc_object_type_t otype); #endif /* _IPC_IPC_PORT_H_ */ #ifndef _KERN_IPC_TT_H_ @@ -126,9 +127,7 @@ extern boolean_t coredumpok(vm_map_t map, mach_vm_offset_t va); * VM routines that used to be published to * user space, and are now restricted to the kernel. * - * They should eventually go away entirely - - * to be replaced with standard vm_map() and - * vm_deallocate() calls. + * No longer supported and always returns an error. 
*/ extern kern_return_t vm_region_object_create ( diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index 050de9c97..0dafc628f 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -202,10 +202,17 @@ find_available_token: .kmg_context = os_hash_kernel_pointer(&tokens), }; + /* + * We can't use KMR_DATA here, since we enforce single-mappability + * on RESTRICTED mappings, and that means we have to use FREEOLD. + * + * However, the realloc path cannot also free the tokens allocation, + * since we cannot free it without taking the lock (vm_page_lock_queues). + */ if (alloc_size <= TOKEN_COUNT_MAX * sizeof(struct token)) { kmr = kmem_realloc_guard(kernel_map, (vm_offset_t)tokens, token_q_cur_size, alloc_size, - KMR_ZERO | KMR_DATA, guard); + KMR_ZERO | KMR_DATA_SHARED, guard); } vm_page_lock_queues(); diff --git a/osfmk/vm/vm_reclaim.c b/osfmk/vm/vm_reclaim.c index ce24b90b4..a0120d70b 100644 --- a/osfmk/vm/vm_reclaim.c +++ b/osfmk/vm/vm_reclaim.c @@ -76,11 +76,15 @@ TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16); #else /* RELEASE */ const uint32_t kReclaimChunkSize = 16; #endif /* DEVELOPMENT || DEBUG */ +TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_sampling_period_ns, "vm_reclaim_sampling_period_ns", +#if CONFIG_WORKING_SET_ESTIMATION + 10ULL * NSEC_PER_SEC); +#else + 0ULL); +#endif #if CONFIG_WORKING_SET_ESTIMATION TUNABLE_DT_DEV_WRITEABLE(bool, vm_reclaim_enabled, "/defaults", "kern.vm_reclaim_enabled", "vm_reclaim_enabled", VM_RECLAIM_ENABLED_DEFAULT, TUNABLE_DT_NONE); -/* TODO: Consider varying the sampling rate based on rusage, ringbuffer-velocity, memory pressure */ -TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_sampling_period_ns, "vm_reclaim_sampling_period_ns", 1ULL * NSEC_PER_SEC); TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_normal, "vm_reclaim_autotrim_pct_normal", 10); TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_pressure, "vm_reclaim_autotrim_pct_pressure", 5); TUNABLE_DEV_WRITEABLE(uint32_t, vm_reclaim_autotrim_pct_critical, "vm_reclaim_autotrim_pct_critical", 1); @@ -128,14 +132,19 @@ static kern_return_t reclaim_copyin_tail(vm_deferred_reclamation_metadata_t meta static kern_return_t reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy); static kern_return_t reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result); #if CONFIG_WORKING_SET_ESTIMATION -static bool vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata, size_t *trim_threshold_out); +static mach_error_t vmdr_sample_working_set( + vm_deferred_reclamation_metadata_t metadata, + mach_vm_size_t *trim_threshold_out, + vm_deferred_reclamation_options_t options); #endif static void vmdr_metadata_release(vm_deferred_reclamation_metadata_t metadata); static void vmdr_list_append_locked(vm_deferred_reclamation_metadata_t metadata); static void vmdr_list_remove_locked(vm_deferred_reclamation_metadata_t metadata); static void vmdr_metadata_own(vm_deferred_reclamation_metadata_t metadata); static void vmdr_metadata_disown(vm_deferred_reclamation_metadata_t metadata); -static void vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options); +static void vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, + mach_vm_size_t *total_bytes_reclaimed_out, + vm_deferred_reclamation_options_t options); static kern_return_t reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, uint64_t bytes_to_reclaim, uint64_t *bytes_reclaimed_out, 
mach_vm_reclaim_count_t chunk_size, mach_vm_reclaim_count_t *num_reclaimed_out); @@ -148,6 +157,7 @@ struct vm_deferred_reclamation_metadata_s { TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; /* Protects all struct fields (except denoted otherwise) */ decl_lck_mtx_data(, vdrm_lock); + /* Gate to be acquired when performing copyio on the user ring */ decl_lck_mtx_gate_data(, vdrm_gate); /* * The task owns this structure but we maintain a backpointer here @@ -164,9 +174,9 @@ struct vm_deferred_reclamation_metadata_s { */ os_refcnt_t vdrm_refcnt; /* The virtual address of the ringbuffer in the user map (immutable) */ - user_addr_t vdrm_buffer_addr; + user_addr_t vdrm_ring_addr; /* The size of the VM allocation containing the ringbuffer (immutable) */ - mach_vm_size_t vdrm_buffer_size; + mach_vm_size_t vdrm_ring_size; /* The length of the ringbuffer. This may be changed on buffer re-size */ mach_vm_reclaim_count_t vdrm_buffer_len; /* Which GC epoch this buffer was last considered in */ @@ -176,44 +186,24 @@ struct vm_deferred_reclamation_metadata_s { * on this buffer to complete. */ uint32_t vdrm_waiters; -#if CONFIG_WORKING_SET_ESTIMATION /* timestamp (MAS) of the last working set sample for this ringbuffer */ uint64_t vdrm_last_sample_abs; /* - * Exponential moving average of the minimum reclaimable buffer size (in VMDR_WMA_UNIT's) + * The number of bytes reclaimed by kernel GC since the last user + * accounting update. Protected by @c vdrm_gate. + */ + size_t vdrm_kernel_bytes_reclaimed; + /* + * The last amount of reclaimable bytes reported to the kernel. + */ + uint64_t vdrm_reclaimable_bytes_last; +#if CONFIG_WORKING_SET_ESTIMATION + /* + * Exponential moving average of the minimum reclaimable buffer size + * (in VMDR_WMA_UNIT's). Protected by @c vdrm_gate. */ uint64_t vdrm_reclaimable_bytes_wma; - /* - * The minimum amount of reclaimable memory in this buffer for the current - * sampling interval. - */ - size_t vdrm_reclaimable_bytes_min; #endif /* CONFIG_WORKING_SET_ESTIMATION */ - /* - * These two values represent running sums of uncancelled bytes - * entered into the ring by userspace and bytes reclaimed out of the - * buffer by the kernel. - * - * The uncancelled byte-count may fluctuate as the client enters and - * cancels new reclamation requests. Reclamation requests which have - * been completed by the kernel will not deduct from the uncancelled - * count but will be added to the reclaimed byte count. - * - * - `vdrm_cumulative_reclaimed_bytes` is monotonically increasing. - * - `vdrm_cumulative_uncancelled_bytes` may fluctuate but - * should trend upward. - * - `vdrm_cumulative_uncancelled_bytes` must be kept >= - * `vdrm_cumulative_reclaimed_bytes` - * - * Both values are in terms of virtual memory, - * so they give an upper bound on the amount of physical memory that - * can be reclaimed. To get an estimate of the current amount of VA in - * the buffer do vdrm_cumulative_uncancelled_bytes - - * vdrm_cumulative_reclaimed_bytes. - */ - size_t vdrm_cumulative_uncancelled_bytes; - size_t vdrm_cumulative_reclaimed_bytes; - /* * Tracks whether or not this reclamation metadata has been added * to the global list yet. 
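/*
 * Editor's sketch (illustrative): how the two new accounting fields are meant to
 * interact.  Userspace publishes a running reclaimable-bytes figure in the ring;
 * any bytes the kernel reclaimed behind the client's back accumulate in
 * vdrm_kernel_bytes_reclaimed and are subtracted before the figure is trusted, then
 * handed back so the client can repair its counter.  The helper name is made up;
 * the real logic lives in vmdr_sample_working_set() below.
 */
static size_t
sketch_effective_reclaimable(size_t user_reported, size_t kernel_reclaimed)
{
	return (user_reported >= kernel_reclaimed) ?
	       user_reported - kernel_reclaimed : 0;
}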
Normally, this happens when it is allocated, @@ -271,8 +261,8 @@ vmdr_metadata_alloc( metadata->vdrm_task = task; metadata->vdrm_map = map; - metadata->vdrm_buffer_addr = buffer; - metadata->vdrm_buffer_size = size; + metadata->vdrm_ring_addr = buffer; + metadata->vdrm_ring_size = size; metadata->vdrm_buffer_len = len; if (os_atomic_inc(&vm_reclaim_buffer_count, relaxed) == UINT32_MAX) { @@ -311,6 +301,7 @@ mach_error_t vm_deferred_reclamation_buffer_allocate_internal( task_t task, mach_vm_address_ut *address_u, + uint64_t *sampling_period, mach_vm_reclaim_count_t len, mach_vm_reclaim_count_t max_len) { @@ -324,10 +315,8 @@ vm_deferred_reclamation_buffer_allocate_internal( if (task == TASK_NULL) { return KERN_INVALID_TASK; } - if (address_u == NULL) { - return KERN_INVALID_ADDRESS; - } - if (len == 0 || max_len == 0 || max_len < len) { + if (address_u == NULL || sampling_period == NULL || + len == 0 || max_len == 0 || max_len < len) { return KERN_INVALID_ARGUMENT; } map = task->map; @@ -465,6 +454,7 @@ fail_task: out: *address_u = vm_sanitize_wrap_addr(0ull); + *sampling_period = vm_reclaim_sampling_period_abs; vmdr_metadata_release(metadata); KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END, kr, NULL); @@ -733,33 +723,26 @@ out: static user_addr_t get_entries_ptr(vm_deferred_reclamation_metadata_t metadata) { - return metadata->vdrm_buffer_addr + + return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, entries); } static user_addr_t -get_indices_ptr(user_addr_t buffer_addr) +get_head_ptr(vm_deferred_reclamation_metadata_t metadata) { - return buffer_addr + - offsetof(struct mach_vm_reclaim_ring_s, indices); + return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, head); } static user_addr_t -get_head_ptr(user_addr_t indices) +get_tail_ptr(vm_deferred_reclamation_metadata_t metadata) { - return indices + offsetof(struct mach_vm_reclaim_indices_s, head); + return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, tail); } static user_addr_t -get_tail_ptr(user_addr_t indices) +get_busy_ptr(vm_deferred_reclamation_metadata_t metadata) { - return indices + offsetof(struct mach_vm_reclaim_indices_s, tail); -} - -static user_addr_t -get_busy_ptr(user_addr_t indices) -{ - return indices + offsetof(struct mach_vm_reclaim_indices_s, busy); + return metadata->vdrm_ring_addr + offsetof(struct mach_vm_reclaim_ring_s, busy); } static kern_return_t @@ -785,8 +768,7 @@ reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head) { int result; kern_return_t kr; - user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr); - user_addr_t head_ptr = get_head_ptr(indices); + user_addr_t head_ptr = get_head_ptr(metadata); result = copyin_atomic64(head_ptr, head); kr = reclaim_handle_copyio_error(metadata, result); @@ -802,8 +784,7 @@ reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail) { int result; kern_return_t kr; - user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr); - user_addr_t tail_ptr = get_tail_ptr(indices); + user_addr_t tail_ptr = get_tail_ptr(metadata); result = copyin_atomic64(tail_ptr, tail); kr = reclaim_handle_copyio_error(metadata, result); @@ -819,8 +800,7 @@ reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy) { int result; kern_return_t kr; - user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr); - user_addr_t busy_ptr = get_busy_ptr(indices); + user_addr_t busy_ptr = get_busy_ptr(metadata); result = copyin_atomic64(busy_ptr, 
busy); kr = reclaim_handle_copyio_error(metadata, result); @@ -831,19 +811,64 @@ reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy) return kr; } +static kern_return_t +reclaim_copyin_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *reclaimable_bytes_out) +{ + int result; + kern_return_t kr = KERN_SUCCESS; + uint64_t reclaimable_bytes; + user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr + + offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes); + + result = copyin_atomic64(ptr, &reclaimable_bytes); + if (result) { + kr = reclaim_handle_copyio_error(metadata, result); + if (result != EFAULT || !vm_fault_get_disabled()) { + vmdr_log_error("Unable to copyin reclaimable byte count err=%d\n", result); + } + } else { + *reclaimable_bytes_out = (size_t)reclaimable_bytes; + } + return kr; +} + +#if CONFIG_WORKING_SET_ESTIMATION +static kern_return_t +reclaim_copyin_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t *min_reclaimable_bytes_out) +{ + int result; + kern_return_t kr = KERN_SUCCESS; + uint64_t min_reclaimable_bytes; + user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr + + offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min); + + result = copyin_atomic64(ptr, &min_reclaimable_bytes); + if (result) { + kr = reclaim_handle_copyio_error(metadata, result); + if (result != EFAULT || !vm_fault_get_disabled()) { + vmdr_log_error("Unable to copyin reclaimable byte count err=%d\n", result); + } + } else { + *min_reclaimable_bytes_out = (size_t)min_reclaimable_bytes; + } + return kr; +} +#endif /* CONFIG_WORKING_SET_ESTIMATION */ + static bool reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value) { int result; - kern_return_t kr; - user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr); - user_addr_t busy_ptr = get_busy_ptr(indices); + kern_return_t kr = KERN_SUCCESS; + user_addr_t busy_ptr = get_busy_ptr(metadata); result = copyout_atomic64(value, busy_ptr); - kr = reclaim_handle_copyio_error(metadata, result); - if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) { - vmdr_log_error( - "Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result); + if (result) { + kr = reclaim_handle_copyio_error(metadata, result); + if (result != EFAULT || !vm_fault_get_disabled()) { + vmdr_log_error( + "Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result); + } } return kr; } @@ -852,19 +877,40 @@ static bool reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value) { int result; - kern_return_t kr; - user_addr_t indices = get_indices_ptr(metadata->vdrm_buffer_addr); - user_addr_t head_ptr = get_head_ptr(indices); + kern_return_t kr = KERN_SUCCESS; + user_addr_t head_ptr = get_head_ptr(metadata); result = copyout_atomic64(value, head_ptr); - kr = reclaim_handle_copyio_error(metadata, result); - if (kr != KERN_SUCCESS && kr != KERN_MEMORY_ERROR) { - vmdr_log_error( - "Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result); + if (result) { + kr = reclaim_handle_copyio_error(metadata, result); + if (result != EFAULT || !vm_fault_get_disabled()) { + vmdr_log_error( + "Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result); + } } return kr; } +#if CONFIG_WORKING_SET_ESTIMATION +static kern_return_t +reclaim_copyout_min_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata, size_t min_reclaimable_bytes) +{ + int result; + kern_return_t kr = KERN_SUCCESS; + 
user_addr_t ptr = (uintptr_t)metadata->vdrm_ring_addr + + offsetof(struct mach_vm_reclaim_ring_s, reclaimable_bytes_min); + + result = copyout_atomic64(min_reclaimable_bytes, ptr); + if (result) { + kr = reclaim_handle_copyio_error(metadata, result); + if (result != EFAULT || !vm_fault_get_disabled()) { + vmdr_log_error("Unable to copyin reclaimable byte count err=%d\n", result); + } + } + return kr; +} +#endif /* CONFIG_WORKING_SET_ESTIMATION */ + #pragma mark Reclamation /* @@ -901,7 +947,6 @@ reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, mach_vm_reclaim_count_t num_reclaimed = 0, num_copied = 0; uint64_t bytes_reclaimed = 0; uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0; - user_addr_t indices; vm_map_t map = metadata->vdrm_map; vm_map_switch_context_t switch_ctx; struct mach_vm_reclaim_entry_s copied_entries[kReclaimChunkSize]; @@ -917,7 +962,6 @@ reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, memset(copied_entries, 0, sizeof(copied_entries)); - indices = get_indices_ptr(metadata->vdrm_buffer_addr); switch_ctx = vm_map_switch_to(map); kr = reclaim_copyin_busy(metadata, &busy); @@ -942,8 +986,8 @@ reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, vmdr_log_error( "Userspace modified head or busy pointer! head: %llu " "(0x%llx) | busy: %llu (0x%llx) | tail = %llu (0x%llx)\n", - head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, - get_tail_ptr(indices)); + head, get_head_ptr(metadata), busy, get_busy_ptr(metadata), tail, + get_tail_ptr(metadata)); reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy); kr = KERN_FAILURE; @@ -959,8 +1003,8 @@ reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, "Tail < head! Userspace is likely attempting a " "cancellation; aborting reclamation | head: %llu " "(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n", - head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, - get_busy_ptr(indices)); + head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy, + get_busy_ptr(metadata)); kr = KERN_ABORTED; goto done; } @@ -1001,8 +1045,8 @@ reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, "Tail < head! 
Userspace is likely attempting a " "cancellation; aborting reclamation | head: %llu " "(0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n", - head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, - get_busy_ptr(indices)); + head, get_head_ptr(metadata), tail, get_tail_ptr(metadata), busy, + get_busy_ptr(metadata)); /* Reset busy back to head */ reclaim_copyout_busy(metadata, head); kr = KERN_ABORTED; @@ -1173,7 +1217,7 @@ done: */ static kern_return_t vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata, - size_t bytes_to_reclaim, size_t *num_bytes_reclaimed_out, + mach_vm_size_t bytes_to_reclaim, mach_vm_size_t *num_bytes_reclaimed_out, vm_deferred_reclamation_options_t options) { kern_return_t kr = KERN_SUCCESS; @@ -1182,9 +1226,9 @@ vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata, vm_fault_disable(); } - size_t total_bytes_reclaimed = 0; + mach_vm_size_t total_bytes_reclaimed = 0; while (total_bytes_reclaimed < bytes_to_reclaim) { - uint64_t cur_bytes_reclaimed; + mach_vm_size_t cur_bytes_reclaimed; mach_vm_reclaim_count_t entries_reclaimed; kr = reclaim_chunk(metadata, bytes_to_reclaim - total_bytes_reclaimed, &cur_bytes_reclaimed, kReclaimChunkSize, &entries_reclaimed); @@ -1197,7 +1241,7 @@ vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata, if (options & RECLAIM_NO_FAULT) { vm_fault_enable(); } - vmdr_log_debug("reclaimed %lu B / %lu B from %d\n", total_bytes_reclaimed, bytes_to_reclaim, metadata->vdrm_pid); + vmdr_log_debug("reclaimed %llu B / %llu B from %d\n", total_bytes_reclaimed, bytes_to_reclaim, metadata->vdrm_pid); if (num_bytes_reclaimed_out) { *num_bytes_reclaimed_out = total_bytes_reclaimed; } @@ -1205,24 +1249,31 @@ vmdr_reclaim_from_buffer(vm_deferred_reclamation_metadata_t metadata, } /* - * Get the reclamation metadata buffer for the given map. + * Get and retain the reclamation metadata buffer for the given task. 
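/*
 * Editor's sketch (illustrative): the RECLAIM_NO_FAULT option brackets user-ring
 * copyio with vm_fault_disable()/vm_fault_enable(), so touching a non-resident ring
 * page returns EFAULT instead of blocking in a fault; the copyio helpers above
 * check vm_fault_get_disabled() to keep that expected case out of the error log.
 * The wrapper below is a made-up example of the bracket; real callers also switch
 * to the target map first, as vmdr_reclaim_from_buffer() and reclaim_chunk() do.
 */
static kern_return_t
sketch_no_fault_copyin_head(vm_deferred_reclamation_metadata_t metadata,
    uint64_t *head_out, vm_deferred_reclamation_options_t options)
{
	kern_return_t kr;

	if (options & RECLAIM_NO_FAULT) {
		vm_fault_disable();
	}
	kr = reclaim_copyin_head(metadata, head_out);
	if (options & RECLAIM_NO_FAULT) {
		vm_fault_enable();
	}
	return kr;
}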
*/ static vm_deferred_reclamation_metadata_t -get_task_reclaim_metadata(task_t task) +vmdr_acquire_task_metadata(task_t task) { + vm_deferred_reclamation_metadata_t meta = NULL; assert(task != NULL); - vm_deferred_reclamation_metadata_t metadata = NULL; task_lock(task); - metadata = task->deferred_reclamation_metadata; + if (!task_is_halting(task) && task_is_active(task)) { + meta = task->deferred_reclamation_metadata; + } + if (meta != NULL) { + vmdr_metadata_retain(meta); + } task_unlock(task); - return metadata; + return meta; } + #pragma mark Buffer Resize/Synchronization kern_return_t vm_deferred_reclamation_buffer_flush_internal(task_t task, - mach_vm_reclaim_count_t num_entries_to_reclaim) + mach_vm_reclaim_count_t num_entries_to_reclaim, + mach_vm_size_t *bytes_reclaimed_out) { kern_return_t kr; vm_deferred_reclamation_metadata_t metadata = NULL; @@ -1233,7 +1284,7 @@ vm_deferred_reclamation_buffer_flush_internal(task_t task, return KERN_INVALID_TASK; } - metadata = get_task_reclaim_metadata(task); + metadata = vmdr_acquire_task_metadata(task); if (metadata == NULL) { return KERN_INVALID_ARGUMENT; } @@ -1265,12 +1316,16 @@ vm_deferred_reclamation_buffer_flush_internal(task_t task, break; } } + /* + * Tell the client how many bytes the kernel has reclaimed + * since the last time it updated its accounting + */ + bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed; + metadata->vdrm_kernel_bytes_reclaimed = 0; - vmdr_metadata_lock(metadata); - metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed; - vmdr_metadata_disown_locked(metadata); - vmdr_metadata_unlock(metadata); + vmdr_metadata_disown(metadata); + *bytes_reclaimed_out = bytes_reclaimed; KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_FLUSH) | DBG_FUNC_END, kr, total_reclaimed, bytes_reclaimed); DTRACE_VM2(reclaim_flush, mach_vm_reclaim_count_t, num_entries_to_reclaim, @@ -1281,7 +1336,8 @@ vm_deferred_reclamation_buffer_flush_internal(task_t task, kern_return_t vm_deferred_reclamation_buffer_resize_internal( task_t task, - mach_vm_reclaim_count_t len) + mach_vm_reclaim_count_t len, + mach_vm_size_t *bytes_reclaimed_out) { kern_return_t kr; mach_vm_reclaim_count_t num_entries_reclaimed = 0; @@ -1293,7 +1349,7 @@ vm_deferred_reclamation_buffer_resize_internal( if (len == 0) { return KERN_INVALID_ARGUMENT; } - vm_deferred_reclamation_metadata_t metadata = get_task_reclaim_metadata(task); + vm_deferred_reclamation_metadata_t metadata = vmdr_acquire_task_metadata(task); if (metadata == NULL) { return KERN_INVALID_TASK; } @@ -1302,9 +1358,11 @@ vm_deferred_reclamation_buffer_resize_internal( vm_map_t map = task->map; mach_vm_size_t new_size = vmdr_round_len_to_size(map, len); if (new_size == 0) { + vmdr_metadata_release(metadata); return KERN_INVALID_ARGUMENT; } - if (new_size > metadata->vdrm_buffer_size) { + if (new_size > metadata->vdrm_ring_size) { + vmdr_metadata_release(metadata); return KERN_NO_SPACE; } @@ -1331,21 +1389,34 @@ vm_deferred_reclamation_buffer_resize_internal( * TODO: Consider encoding the ringbuffer-capacity in the * mach_vm_reclaim_id_t, so reuses can still find objects after a resize. 
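/*
 * Editor's note (illustrative): vmdr_acquire_task_metadata() above returns a +1
 * reference, or NULL for a halting/inactive task, so callers all follow the same
 * acquire/use/release shape.  This caller is a made-up example.
 */
static kern_return_t
sketch_with_task_metadata(task_t task)
{
	vm_deferred_reclamation_metadata_t meta;

	meta = vmdr_acquire_task_metadata(task);
	if (meta == NULL) {
		return KERN_INVALID_TASK;
	}
	/* ... operate on meta ... */
	vmdr_metadata_release(meta);
	return KERN_SUCCESS;
}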
*/ + mach_vm_size_t total_bytes_reclaimed = 0; do { - kr = reclaim_chunk(metadata, UINT64_MAX, NULL, kReclaimChunkSize, + mach_vm_size_t cur_bytes_reclaimed; + kr = reclaim_chunk(metadata, UINT64_MAX, &cur_bytes_reclaimed, kReclaimChunkSize, &num_entries_reclaimed); + total_bytes_reclaimed += cur_bytes_reclaimed; if (kr != KERN_SUCCESS) { goto fail; } } while (num_entries_reclaimed > 0); + vmdr_log_debug("[%d] successfully resized buffer | reclaimed: %llu B " + "kernel_reclaimed: %zu B\n", metadata->vdrm_pid, + total_bytes_reclaimed, metadata->vdrm_kernel_bytes_reclaimed); + + total_bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed; + metadata->vdrm_kernel_bytes_reclaimed = 0; + /* Publish new user addresses in kernel metadata */ vmdr_metadata_lock(metadata); metadata->vdrm_buffer_len = len; vmdr_metadata_disown_locked(metadata); vmdr_metadata_unlock(metadata); + vmdr_metadata_release(metadata); - KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, KERN_SUCCESS, num_entries_reclaimed); + *bytes_reclaimed_out = total_bytes_reclaimed; + + KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, KERN_SUCCESS, num_entries_reclaimed, total_bytes_reclaimed); DTRACE_VM2(reclaim_ring_resize, mach_vm_reclaim_count_t, old_len, mach_vm_reclaim_count_t, len); @@ -1353,6 +1424,8 @@ vm_deferred_reclamation_buffer_resize_internal( fail: vmdr_metadata_disown(metadata); + vmdr_metadata_release(metadata); + *bytes_reclaimed_out = total_bytes_reclaimed; KDBG(VM_RECLAIM_CODE(VM_RECLAIM_RESIZE) | DBG_FUNC_END, kr, num_entries_reclaimed); return kr; } @@ -1362,8 +1435,8 @@ fail: #if CONFIG_WORKING_SET_ESTIMATION extern vm_pressure_level_t memorystatus_vm_pressure_level; -static uint64_t -vmdr_metadata_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata) +static kern_return_t +vmdr_calculate_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata, size_t *trim_threshold_out) { kern_return_t kr; uint32_t autotrim_pct; @@ -1391,25 +1464,29 @@ vmdr_metadata_autotrim_threshold(vm_deferred_reclamation_metadata_t metadata) * Estimate the task's maximum working set size */ ledger_amount_t phys_footprint_max = 0; - kr = ledger_get_lifetime_max(metadata->vdrm_task->ledger, + + vmdr_metadata_lock(metadata); + task_t task = metadata->vdrm_task; + if (task == TASK_NULL) { + vmdr_metadata_unlock(metadata); + return KERN_INVALID_TASK; + } + task_reference(task); + vmdr_metadata_unlock(metadata); + + kr = ledger_get_lifetime_max(task->ledger, task_ledgers.phys_footprint, &phys_footprint_max); assert3u(kr, ==, KERN_SUCCESS); - return phys_footprint_max * autotrim_pct / 100; + task_deallocate(task); + + *trim_threshold_out = phys_footprint_max * autotrim_pct / 100; + return KERN_SUCCESS; } #define VMDR_WMA_UNIT (1 << 8) #define VMDR_WMA_MIX(base, e) ((vm_reclaim_wma_weight_base * (base) + (e) * VMDR_WMA_UNIT * vm_reclaim_wma_weight_cur) / vm_reclaim_wma_denom) - -static size_t -vmdr_metadata_reset_min_bytes(vm_deferred_reclamation_metadata_t metadata) -{ - LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED); - metadata->vdrm_reclaimable_bytes_min = - metadata->vdrm_cumulative_uncancelled_bytes - - metadata->vdrm_cumulative_reclaimed_bytes; - return metadata->vdrm_reclaimable_bytes_min; -} +#endif /* CONFIG_WORKING_SET_ESTIMATION */ /* * @func vmdr_ws_sample @@ -1423,44 +1500,86 @@ vmdr_metadata_reset_min_bytes(vm_deferred_reclamation_metadata_t metadata) * If the buffer should be trimmed, the amount to trim (in bytes) will be * written out * - * @returns true iff the buffer should be trimmed + 
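/*
 * Editor's sketch (illustrative): VMDR_WMA_MIX() keeps the moving average in fixed
 * point, scaled by VMDR_WMA_UNIT (1 << 8), so byte counts can be averaged with
 * integer math; dividing by VMDR_WMA_UNIT recovers bytes.  The weights below are
 * made-up example values standing in for the vm_reclaim_wma_* tunables.
 */
static uint64_t
sketch_wma_step(uint64_t wma_fixed, uint64_t sample_bytes)
{
	const uint64_t weight_base = 3, weight_cur = 1, denom = 4; /* example only */
	const uint64_t unit = 1 << 8;                              /* VMDR_WMA_UNIT */

	return (weight_base * wma_fixed + sample_bytes * unit * weight_cur) / denom;
}
/*
 * Starting from 0 and feeding 1 MiB samples, the average in bytes (wma / 256)
 * climbs toward 1 MiB: 256 KiB, 448 KiB, 592 KiB, ...
 */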
* @returns KERN_MEMORY_ERROR if copyio failed due to RECLAIM_NO_FAULT * * @discussion - * The caller must hold the buffer locked. + * The caller must own the buffer */ -static bool +static mach_error_t vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata, - size_t *trim_threshold_out) + mach_vm_size_t *trim_threshold_out, vm_deferred_reclamation_options_t options) { - LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED); + mach_error_t err = ERR_SUCCESS; + size_t min_reclaimable_bytes = 0, cur_reclaimable_bytes = 0; + uint64_t wma = 0; + + vmdr_metadata_assert_owned(metadata); + + *trim_threshold_out = 0; + + vm_map_switch_context_t map_ctx = vm_map_switch_to(metadata->vdrm_map); + + if (options & RECLAIM_NO_FAULT) { + vm_fault_disable(); + } +#if CONFIG_WORKING_SET_ESTIMATION + err = reclaim_copyin_min_reclaimable_bytes(metadata, &min_reclaimable_bytes); + if (err != ERR_SUCCESS) { + goto done; + } uint64_t now = mach_absolute_time(); if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) { /* A sampling period has not elapsed */ - return false; + goto done; } - - size_t estimated_reclaimable_bytes; - uint64_t samples_elapsed = (now - metadata->vdrm_last_sample_abs) / - vm_reclaim_sampling_period_abs; - KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_START, metadata->vdrm_pid, now, metadata->vdrm_last_sample_abs, - metadata->vdrm_reclaimable_bytes_min); + min_reclaimable_bytes); + err = reclaim_copyin_reclaimable_bytes(metadata, &cur_reclaimable_bytes); + if (err != ERR_SUCCESS) { + goto done; + } + + /* Reset the minimum to start a new sampling interval */ + err = reclaim_copyout_min_reclaimable_bytes(metadata, cur_reclaimable_bytes); + if (err != ERR_SUCCESS) { + goto done; + } + + /* + * The user accounting will overcount if the kernel has reclaimed + * without telling the client about it. + */ + if (cur_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) { + cur_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed; + } else { + vmdr_log_error("[%d] more bytes have been reclaimed (%zu) than " + "are supposedly in buffer (%zu)\n", metadata->vdrm_pid, + metadata->vdrm_kernel_bytes_reclaimed, cur_reclaimable_bytes); + /* This will cause an underflow in user accounting */ + reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_ACCOUNTING_FAILURE, cur_reclaimable_bytes); + err = KERN_ABORTED; + goto done; + } + if (min_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) { + min_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed; + } else { + min_reclaimable_bytes = 0; + } + + uint64_t samples_elapsed = (now - metadata->vdrm_last_sample_abs) / + vm_reclaim_sampling_period_abs; if (samples_elapsed > vm_reclaim_abandonment_threshold) { /* * Many sampling periods have elapsed since the ring was * last sampled. Don't bother computing the WMA and assume * the buffer's current contents are unneeded. 
*/ - estimated_reclaimable_bytes = - metadata->vdrm_cumulative_uncancelled_bytes - - metadata->vdrm_cumulative_reclaimed_bytes; - metadata->vdrm_reclaimable_bytes_min = estimated_reclaimable_bytes; - metadata->vdrm_reclaimable_bytes_wma = estimated_reclaimable_bytes; + wma = VMDR_WMA_MIX(0, cur_reclaimable_bytes); } else { /* * Compute an exponential moving average of the minimum amount of reclaimable @@ -1470,53 +1589,79 @@ vmdr_sample_working_set(vm_deferred_reclamation_metadata_t metadata, * update accounting) */ for (unsigned int i = 0; i < samples_elapsed; i++) { - metadata->vdrm_reclaimable_bytes_wma = VMDR_WMA_MIX( + wma = VMDR_WMA_MIX( metadata->vdrm_reclaimable_bytes_wma, - metadata->vdrm_reclaimable_bytes_min); + min_reclaimable_bytes); } - - /* Reset the minimum to start a new sampling interval */ - estimated_reclaimable_bytes = vmdr_metadata_reset_min_bytes(metadata); } - metadata->vdrm_last_sample_abs = now; - - size_t trim_threshold_bytes = MIN(metadata->vdrm_reclaimable_bytes_min, + metadata->vdrm_reclaimable_bytes_wma = wma; + size_t unneeded_bytes = MIN(min_reclaimable_bytes, metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT); - size_t autotrim_threshold = vmdr_metadata_autotrim_threshold(metadata); - bool trim_needed = trim_threshold_bytes >= vm_map_page_size(metadata->vdrm_map) && - trim_threshold_bytes >= autotrim_threshold; + size_t autotrim_threshold; + err = vmdr_calculate_autotrim_threshold(metadata, &autotrim_threshold); + if (err != ERR_SUCCESS) { + goto done; + } - *trim_threshold_out = vm_map_round_page(trim_threshold_bytes, - vm_map_page_mask(metadata->vdrm_map)); + if (unneeded_bytes >= vm_map_page_size(metadata->vdrm_map) && + unneeded_bytes >= autotrim_threshold) { + *trim_threshold_out = vm_map_round_page(unneeded_bytes, + vm_map_page_mask(metadata->vdrm_map)); + } +#else /* !CONFIG_WORKING_SET_ESTIMATION */ + (void)min_reclaimable_bytes; + (void)wma; + err = reclaim_copyin_reclaimable_bytes(metadata, &cur_reclaimable_bytes); + if (err != ERR_SUCCESS) { + goto done; + } + if (cur_reclaimable_bytes >= metadata->vdrm_kernel_bytes_reclaimed) { + cur_reclaimable_bytes -= metadata->vdrm_kernel_bytes_reclaimed; + } else { + vmdr_log_error("[%d] more bytes have been reclaimed (%zu) than " + "are supposedly in buffer (%zu)\n", metadata->vdrm_pid, + metadata->vdrm_kernel_bytes_reclaimed, cur_reclaimable_bytes); + } + if (cur_reclaimable_bytes > vm_reclaim_max_threshold) { + *trim_threshold_out = vm_reclaim_max_threshold - cur_reclaimable_bytes; + } +#endif /* CONFIG_WORKING_SET_ESTIMATION */ + metadata->vdrm_last_sample_abs = mach_absolute_time(); + metadata->vdrm_reclaimable_bytes_last = cur_reclaimable_bytes; + +done: + vm_map_switch_back(map_ctx); + if (options & RECLAIM_NO_FAULT) { + vm_fault_enable(); + } KDBG(VM_RECLAIM_CODE(VM_RECLAIM_SAMPLE) | DBG_FUNC_END, - *trim_threshold_out, - trim_needed, - estimated_reclaimable_bytes); + wma, + min_reclaimable_bytes, + cur_reclaimable_bytes, + *trim_threshold_out); DTRACE_VM5(reclaim_sample, pid_t, metadata->vdrm_pid, - uint64_t, metadata->vdrm_reclaimable_bytes_wma, - size_t, metadata->vdrm_reclaimable_bytes_min, - size_t, estimated_reclaimable_bytes, + uint64_t, wma, + size_t, min_reclaimable_bytes, + size_t, cur_reclaimable_bytes, size_t, *trim_threshold_out); - vmdr_log_debug("sampled buffer with min %lu est %lu trim %lu wma %llu\n", - metadata->vdrm_reclaimable_bytes_min, - estimated_reclaimable_bytes, - trim_threshold_bytes, - metadata->vdrm_reclaimable_bytes_wma / VMDR_WMA_UNIT); - - return trim_needed; + 
vmdr_log_debug("sampled buffer with min %lu est %lu trim %llu wma %llu\n", + min_reclaimable_bytes, + cur_reclaimable_bytes, + *trim_threshold_out, + wma); + return err; } -#endif /* CONFIG_WORKING_SET_ESTIMATION */ /* * Caller must have buffer owned and unlocked */ static kern_return_t -vmdr_trim(vm_deferred_reclamation_metadata_t metadata, size_t bytes_to_reclaim, - size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options) +vmdr_trim(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t bytes_to_reclaim, + mach_vm_size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options) { kern_return_t kr; KDBG(VM_RECLAIM_CODE(VM_RECLAIM_TRIM) | DBG_FUNC_START, @@ -1537,7 +1682,7 @@ vmdr_trim(vm_deferred_reclamation_metadata_t metadata, size_t bytes_to_reclaim, * Caller must have buffer owned and unlocked */ static kern_return_t -vmdr_drain(vm_deferred_reclamation_metadata_t metadata, size_t *bytes_reclaimed, +vmdr_drain(vm_deferred_reclamation_metadata_t metadata, mach_vm_size_t *bytes_reclaimed, vm_deferred_reclamation_options_t options) { kern_return_t kr; @@ -1554,98 +1699,74 @@ vmdr_drain(vm_deferred_reclamation_metadata_t metadata, size_t *bytes_reclaimed, return kr; } -kern_return_t -vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, uint64_t bytes_placed_in_buffer) +mach_error_t +vm_deferred_reclamation_update_accounting_internal(task_t task, uint64_t *bytes_reclaimed_out) { vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata; - size_t estimated_reclaimable_bytes, bytes_to_reclaim, bytes_reclaimed = 0; - kern_return_t kr = KERN_SUCCESS; + mach_vm_size_t bytes_to_reclaim, bytes_reclaimed = 0; + mach_error_t err = ERR_SUCCESS; + if (metadata == NULL) { - return KERN_INVALID_ARGUMENT; + return KERN_NOT_FOUND; } - KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START, - metadata->vdrm_pid, bytes_placed_in_buffer); - - vmdr_metadata_lock(metadata); - if (!metadata->vdrm_pid) { /* If this is a forked child, we may not yet have a pid */ metadata->vdrm_pid = task_pid(task); } - /* - * The client is allowed to make this call in parallel from multiple threads. - * It's possible that, while we were waiting for the lock, another - * thread updated accounting with a larger/newer uncancelled_bytes - * value that resulted in a reclaim. We can't provide strict ordering - * with the current implementation, but we can at least detect very - * erroneous stale values that would result in the uncancelled-byte - * count being less than the reclaimed-byte-count (which cannot be - * accurate). - * - * TODO: Consider making this a try_copyin of the userspace value - * under the mutex to ensure ordering/consistency (rdar://137607771) - */ - if (bytes_placed_in_buffer < metadata->vdrm_cumulative_reclaimed_bytes) { + KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START, + metadata->vdrm_pid); + + vmdr_metadata_lock(metadata); + uint64_t now = mach_absolute_time(); + if (now - metadata->vdrm_last_sample_abs < vm_reclaim_sampling_period_abs) { + /* + * This is a fast path to avoid waiting on the gate if another + * thread beat us to sampling. 
+ */ + vmdr_metadata_unlock(metadata); goto done; } + vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE); + vmdr_metadata_unlock(metadata); - metadata->vdrm_cumulative_uncancelled_bytes = bytes_placed_in_buffer; - estimated_reclaimable_bytes = bytes_placed_in_buffer - metadata->vdrm_cumulative_reclaimed_bytes; -#if CONFIG_WORKING_SET_ESTIMATION - bool should_reclaim = vmdr_sample_working_set(metadata, &bytes_to_reclaim); - if (should_reclaim) { - vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE); - lck_mtx_unlock(&metadata->vdrm_lock); - vmdr_log_debug("trimming pid %d\n", metadata->vdrm_pid); + err = vmdr_sample_working_set(metadata, &bytes_to_reclaim, RECLAIM_OPTIONS_NONE); + if (err != ERR_SUCCESS) { + vmdr_metadata_disown(metadata); + goto done; + } + if (bytes_to_reclaim) { + vmdr_log_debug("[%d] trimming %llu B\n", metadata->vdrm_pid, bytes_to_reclaim); - kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE); + err = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE); - vmdr_metadata_lock(metadata); - metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed; - /* Reset the current minimum now that the buffer has been trimmed down */ - vmdr_metadata_reset_min_bytes(metadata); - vmdr_metadata_disown_locked(metadata); - if (kr == KERN_ABORTED) { + if (err == KERN_ABORTED) { /* * We were unable to complete the trim due to a lost * race with userspace. This need not be fatal b/c the * accounting was successfully updated. */ - kr = KERN_SUCCESS; - } - } else { - /* Update the minimum for the current sampling period */ - metadata->vdrm_reclaimable_bytes_min = MIN(metadata->vdrm_reclaimable_bytes_min, estimated_reclaimable_bytes); - } -#else /* !CONFIG_WORKING_SET_ESTIMATION */ - if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) { - bytes_to_reclaim = vm_reclaim_max_threshold - estimated_reclaimable_bytes; - vmdr_metadata_own_locked(metadata, RECLAIM_OPTIONS_NONE); - vmdr_metadata_unlock(metadata); - kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, RECLAIM_OPTIONS_NONE); - vmdr_metadata_lock(metadata); - metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed; - vmdr_metadata_disown_locked(metadata); - if (kr == KERN_ABORTED) { - /* - * We were unable to complete the trim due to a lost - * race with userspace. This need not be fatal b/c the - * accounting was successfully updated. 
- */ - kr = KERN_SUCCESS; + err = KERN_SUCCESS; } } -#endif /* CONFIG_WORKING_SET_ESTIMATION */ + + /* + * Tell the client how many bytes the kernel has reclaimed + * since the last time it updated its accounting + */ + bytes_reclaimed += metadata->vdrm_kernel_bytes_reclaimed; + metadata->vdrm_kernel_bytes_reclaimed = 0; + + vmdr_metadata_disown(metadata); done: KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END, - metadata->vdrm_cumulative_uncancelled_bytes, - metadata->vdrm_cumulative_reclaimed_bytes, + metadata->vdrm_last_sample_abs, + bytes_to_reclaim, bytes_reclaimed); - vmdr_metadata_unlock(metadata); - return kr; + *bytes_reclaimed_out = (uint64_t)bytes_reclaimed; + return err; } kern_return_t @@ -1653,7 +1774,7 @@ vm_deferred_reclamation_task_drain(task_t task, vm_deferred_reclamation_options_t options) { kern_return_t kr; - size_t bytes_reclaimed; + mach_vm_size_t bytes_reclaimed; task_lock(task); if (!task_is_active(task) || task_is_halting(task)) { @@ -1671,12 +1792,9 @@ vm_deferred_reclamation_task_drain(task_t task, vmdr_metadata_own(metadata); kr = vmdr_drain(metadata, &bytes_reclaimed, options); + metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed; - vmdr_metadata_lock(metadata); - metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed; - vmdr_metadata_disown_locked(metadata); - vmdr_metadata_unlock(metadata); - + vmdr_metadata_disown(metadata); vmdr_metadata_release(metadata); return kr; } @@ -1696,17 +1814,16 @@ vm_deferred_reclamation_task_fork(task_t task, vm_deferred_reclamation_metadata_ { vm_deferred_reclamation_metadata_t metadata = NULL; vmdr_metadata_assert_owned(parent); + vmdr_log_debug("forking [%d]\n", parent->vdrm_pid); assert(task->deferred_reclamation_metadata == NULL); - metadata = vmdr_metadata_alloc(task, parent->vdrm_buffer_addr, - parent->vdrm_buffer_size, parent->vdrm_buffer_len); + metadata = vmdr_metadata_alloc(task, parent->vdrm_ring_addr, + parent->vdrm_ring_size, parent->vdrm_buffer_len); - metadata->vdrm_cumulative_reclaimed_bytes = parent->vdrm_cumulative_reclaimed_bytes; - metadata->vdrm_cumulative_uncancelled_bytes = parent->vdrm_cumulative_uncancelled_bytes; -#if CONFIG_WORKING_SET_ESTIMATION - metadata->vdrm_reclaimable_bytes_min = parent->vdrm_reclaimable_bytes_min; - metadata->vdrm_reclaimable_bytes_wma = parent->vdrm_reclaimable_bytes_wma; metadata->vdrm_last_sample_abs = parent->vdrm_last_sample_abs; + metadata->vdrm_kernel_bytes_reclaimed = parent->vdrm_kernel_bytes_reclaimed; +#if CONFIG_WORKING_SET_ESTIMATION + metadata->vdrm_reclaimable_bytes_wma = parent->vdrm_reclaimable_bytes_wma; #endif /* CONFIG_WORKING_SET_ESTIMATION */ return metadata; @@ -1743,28 +1860,41 @@ vm_deferred_reclamation_ring_disown(vm_deferred_reclamation_metadata_t metadata) } void -vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options) +vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action, + mach_vm_size_t *total_bytes_reclaimed, + vm_deferred_reclamation_options_t options) { - vmdr_garbage_collect(action, options); + vmdr_garbage_collect(action, total_bytes_reclaimed, options); +} + +void +vm_deferred_reclamation_settle_ledger(task_t task) +{ + vm_deferred_reclamation_metadata_t meta = vmdr_acquire_task_metadata(task); + if (meta == NULL) { + return; + } + vmdr_metadata_lock(meta); + ledger_zero_balance(task->ledger, task_ledgers.est_reclaimable); + ledger_credit( + task->ledger, + task_ledgers.est_reclaimable, + meta->vdrm_reclaimable_bytes_last); + 
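/*
 * Editor's note (illustrative): vm_deferred_reclamation_settle_ledger() treats
 * task_ledgers.est_reclaimable as an absolute estimate rather than a running
 * delta -- it zeroes the balance and re-credits the last sampled figure.  The same
 * settle pattern, pulled out into a made-up helper:
 */
static void
sketch_settle_estimate(ledger_t ledger, int entry, ledger_amount_t estimate)
{
	ledger_zero_balance(ledger, entry);
	ledger_credit(ledger, entry, estimate);
}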
vmdr_metadata_unlock(meta); + vmdr_metadata_release(meta); } #pragma mark Global Reclamation GC static void -vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, vm_deferred_reclamation_options_t options) +vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, + mach_vm_size_t *total_bytes_reclaimed_out, + vm_deferred_reclamation_options_t options) { kern_return_t kr; - size_t bytes_reclaimed, bytes_to_reclaim; - bool should_reclaim; + mach_vm_size_t total_bytes_reclaimed = 0; gate_wait_result_t wr; -#if !CONFIG_WORKING_SET_ESTIMATION - if (action == RECLAIM_GC_TRIM) { - /* GC_TRIM is a no-op without working set estimation */ - return; - } -#endif /* !CONFIG_WORKING_SET_ESTIMATION */ - lck_mtx_lock(&reclaim_buffers_lock); kr = lck_mtx_gate_try_close(&reclaim_buffers_lock, &vm_reclaim_gc_gate); if (kr != KERN_SUCCESS) { @@ -1808,47 +1938,57 @@ vmdr_garbage_collect(vm_deferred_reclamation_gc_action_t action, vm_deferred_rec bool buffer_is_suspended = task_is_app_suspended(task); task = TASK_NULL; + mach_vm_size_t bytes_reclaimed = 0; + mach_vm_size_t bytes_to_reclaim = 0; + switch (action) { case RECLAIM_GC_DRAIN: if (!vmdr_metadata_own_locked(metadata, options)) { goto next; } vmdr_metadata_unlock(metadata); - vmdr_drain(metadata, &bytes_reclaimed, options); + + vmdr_log_debug("draining [%d]\n", metadata->vdrm_pid); + kr = vmdr_drain(metadata, &bytes_reclaimed, options); + metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed; + vmdr_metadata_lock(metadata); vmdr_metadata_disown_locked(metadata); break; case RECLAIM_GC_SCAVENGE: if (buffer_is_suspended) { - vmdr_metadata_own_locked(metadata, options); + if (!vmdr_metadata_own_locked(metadata, options)) { + goto next; + } vmdr_metadata_unlock(metadata); + /* This buffer is no longer in use, fully reclaim it. 
*/ - vmdr_log_debug("found suspended buffer (%d), draining\n", metadata->vdrm_pid); + vmdr_log_debug("found suspended buffer [%d], draining\n", metadata->vdrm_pid); kr = vmdr_drain(metadata, &bytes_reclaimed, options); + metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed; + vmdr_metadata_lock(metadata); vmdr_metadata_disown_locked(metadata); } break; case RECLAIM_GC_TRIM: -#if CONFIG_WORKING_SET_ESTIMATION - should_reclaim = vmdr_sample_working_set(metadata, &bytes_to_reclaim); - if (should_reclaim) { - vmdr_log_debug("GC found stale buffer (%d), trimming\n", metadata->vdrm_pid); - vmdr_metadata_own_locked(metadata, options); - vmdr_metadata_unlock(metadata); - kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, options); - vmdr_metadata_lock(metadata); - vmdr_metadata_disown_locked(metadata); + if (!vmdr_metadata_own_locked(metadata, options)) { + goto next; } -#else /* !CONFIG_WORKING_SET_ESTIMATION */ - (void)bytes_to_reclaim; - (void)should_reclaim; -#endif /* CONFIG_WORKING_SET_ESTIMATION */ + vmdr_metadata_unlock(metadata); + kr = vmdr_sample_working_set(metadata, &bytes_to_reclaim, options); + if (kr == KERN_SUCCESS && bytes_to_reclaim) { + vmdr_log_debug("GC found stale buffer (%d), trimming\n", metadata->vdrm_pid); + kr = vmdr_trim(metadata, bytes_to_reclaim, &bytes_reclaimed, options); + metadata->vdrm_kernel_bytes_reclaimed += bytes_reclaimed; + } + vmdr_metadata_lock(metadata); + vmdr_metadata_disown_locked(metadata); break; } if (bytes_reclaimed) { vm_reclaim_gc_reclaim_count++; - metadata->vdrm_cumulative_reclaimed_bytes += bytes_reclaimed; + total_bytes_reclaimed += bytes_reclaimed; } if (metadata->vdrm_waiters && action != RECLAIM_GC_TRIM) { thread_wakeup((event_t)&metadata->vdrm_waiters); @@ -1860,6 +2000,7 @@ next: } lck_mtx_gate_handoff(&reclaim_buffers_lock, &vm_reclaim_gc_gate, GATE_HANDOFF_OPEN_IF_NO_WAITERS); lck_mtx_unlock(&reclaim_buffers_lock); + *total_bytes_reclaimed_out = total_bytes_reclaimed; } OS_NORETURN @@ -1869,8 +2010,13 @@ vm_reclaim_scavenger_thread_continue(__unused void *param, __unused wait_result_ sched_cond_ack(&vm_reclaim_scavenger_cond); while (true) { - vmdr_garbage_collect(RECLAIM_GC_SCAVENGE, RECLAIM_OPTIONS_NONE); - sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, vm_reclaim_scavenger_thread_continue); + mach_vm_size_t total_bytes_reclaimed; + vmdr_garbage_collect(RECLAIM_GC_SCAVENGE, &total_bytes_reclaimed, + RECLAIM_OPTIONS_NONE); + vmdr_log_info("scavenger reclaimed %llu KiB of virtual memory\n", + total_bytes_reclaimed >> 10); + sched_cond_wait(&vm_reclaim_scavenger_cond, THREAD_UNINT, + vm_reclaim_scavenger_thread_continue); } } @@ -1891,10 +2037,8 @@ static void vm_deferred_reclamation_init(void) { vm_reclaim_log_handle = os_log_create("com.apple.xnu", "vm_reclaim"); -#if CONFIG_WORKING_SET_ESTIMATION nanoseconds_to_absolutetime((uint64_t)vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs); -#endif /* CONFIG_WORKING_SET_ESTIMATION */ sched_cond_init(&vm_reclaim_scavenger_cond); lck_mtx_gate_init(&reclaim_buffers_lock, &vm_reclaim_gc_gate); @@ -1915,20 +2059,12 @@ bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task) { bool reclaimed; - vm_deferred_reclamation_metadata_t metadata = NULL; + vm_deferred_reclamation_metadata_t metadata; - task_lock(task); - if (!task_is_halting(task) && task_is_active(task)) { - metadata = task->deferred_reclamation_metadata; - } - if (metadata != NULL) { - vmdr_metadata_retain(metadata); - } - task_unlock(task); + metadata = 
vmdr_acquire_task_metadata(task); if (metadata == NULL) { return false; } - vmdr_metadata_lock(metadata); metadata->vdrm_waiters++; @@ -1946,3 +2082,37 @@ vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task) } #endif /* DEVELOPMENT || DEBUG */ + +#pragma mark Introspectibility + +kern_return_t +vm_deferred_reclamation_buffer_query_internal( + task_t task, + mach_vm_address_ut *addr_out_u, + mach_vm_size_ut *size_out_u) +{ + vm_deferred_reclamation_metadata_t meta; + + if (task == NULL) { + return KERN_INVALID_TASK; + } + + if ((addr_out_u == NULL) || (size_out_u == NULL)) { + return KERN_INVALID_ARGUMENT; + } + + meta = vmdr_acquire_task_metadata(task); + + if (meta == NULL) { + *addr_out_u = vm_sanitize_wrap_addr(0); + *size_out_u = vm_sanitize_wrap_size(0); + } else { + vmdr_metadata_lock(meta); + *addr_out_u = vm_sanitize_wrap_addr(meta->vdrm_ring_addr); + *size_out_u = vm_sanitize_wrap_size(meta->vdrm_ring_size); + vmdr_metadata_unlock(meta); + vmdr_metadata_release(meta); + } + + return KERN_SUCCESS; +} diff --git a/osfmk/vm/vm_reclaim_internal.h b/osfmk/vm/vm_reclaim_internal.h index 719a5458b..301d1d212 100644 --- a/osfmk/vm/vm_reclaim_internal.h +++ b/osfmk/vm/vm_reclaim_internal.h @@ -39,25 +39,32 @@ #if MACH_KERNEL_PRIVATE mach_error_t vm_deferred_reclamation_buffer_allocate_internal( - task_t task, + task_t task, mach_vm_address_ut *address, + uint64_t *sampling_period, mach_vm_reclaim_count_t len, mach_vm_reclaim_count_t max_len); kern_return_t vm_deferred_reclamation_buffer_flush_internal( task_t task, - mach_vm_reclaim_count_t max_entries_to_reclaim); + mach_vm_reclaim_count_t max_entries_to_reclaim, + mach_vm_size_t *bytes_reclaimed_out); -kern_return_t vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal( - task_t task, uint64_t reclaimable_bytes); +mach_error_t vm_deferred_reclamation_update_accounting_internal( + task_t task, uint64_t *bytes_reclaimed_out); /* * Resize the reclaim buffer for a given task */ kern_return_t vm_deferred_reclamation_buffer_resize_internal( - task_t task, - mach_vm_reclaim_count_t len); + task_t task, + mach_vm_reclaim_count_t len, + mach_vm_size_t *bytes_reclaimed_out); +kern_return_t vm_deferred_reclamation_buffer_query_internal( + task_t task, + mach_vm_address_ut *addr_out_ut, + mach_vm_size_ut *size_out_ut); void vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata); void vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata); diff --git a/osfmk/vm/vm_reclaim_xnu.h b/osfmk/vm/vm_reclaim_xnu.h index 0186b1384..07757b7cf 100644 --- a/osfmk/vm/vm_reclaim_xnu.h +++ b/osfmk/vm/vm_reclaim_xnu.h @@ -127,8 +127,14 @@ void vm_deferred_reclamation_task_suspend(task_t task); * Perform Garbage Collection on all reclaim rings */ void vm_deferred_reclamation_gc(vm_deferred_reclamation_gc_action_t action, + mach_vm_size_t *total_bytes_reclaimed_out, vm_deferred_reclamation_options_t options); +/* + * Settle ledger entry for reclaimable memory + */ +void vm_deferred_reclamation_settle_ledger(task_t task); + #endif /* CONFIG_DEFERRED_RECLAIM */ #endif /* XNU_KERNEL_PRIVATE */ #endif /* __VM_RECLAIM_XNU__ */ diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index 15f171fba..6bf2c3959 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -61,7 +61,7 @@ * * Resident memory management module. 
*/ - +#include #include #include @@ -92,7 +92,7 @@ #include #include #include -#include +#include #include #include @@ -123,6 +123,11 @@ #include #endif /* HIBERNATION */ +#if CONFIG_SECLUDED_MEMORY +static_assert(!XNU_VM_HAS_LOPAGE, + "VM_PAGE_ON_SECLUDED_Q and VM_PAGE_ON_FREE_LOPAGE_Q alias"); +#endif + #include #if defined(HAS_APPLE_PAC) @@ -151,61 +156,39 @@ static TUNABLE(uint32_t, fillval, "fill", 0); #if MACH_ASSERT TUNABLE(bool, vm_check_refs_on_alloc, "vm_check_refs_on_alloc", false); -#define ASSERT_PMAP_FREE(mem) pmap_assert_free(VM_PAGE_GET_PHYS_PAGE(mem)) - -#else /* MACH_ASSERT */ - -#define ASSERT_PMAP_FREE(mem) /* nothing */ #endif /* MACH_ASSERT */ - extern boolean_t vm_pageout_running; extern thread_t vm_pageout_scan_thread; extern bool vps_dynamic_priority_enabled; -const bool vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE] = { - [VM_PAGE_ON_INACTIVE_INTERNAL_Q] = true, - [VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = true, - [VM_PAGE_ON_INACTIVE_CLEANED_Q] = true, -}; +const uint16_t vm_page_inactive_states = + BIT(VM_PAGE_ON_INACTIVE_INTERNAL_Q) | + BIT(VM_PAGE_ON_INACTIVE_EXTERNAL_Q) | + BIT(VM_PAGE_ON_INACTIVE_CLEANED_Q); -const bool vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE] = { - [VM_PAGE_ON_INACTIVE_INTERNAL_Q] = true, - [VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = true, - [VM_PAGE_ON_INACTIVE_CLEANED_Q] = true, - [VM_PAGE_ON_ACTIVE_Q] = true, - [VM_PAGE_ON_SPECULATIVE_Q] = true, - [VM_PAGE_ON_THROTTLED_Q] = true, +const uint16_t vm_page_active_or_inactive_states = + vm_page_inactive_states | #if CONFIG_SECLUDED_MEMORY - [VM_PAGE_ON_SECLUDED_Q] = true, + BIT(VM_PAGE_ON_SECLUDED_Q) | #endif /* CONFIG_SECLUDED_MEMORY */ -}; + BIT(VM_PAGE_ON_ACTIVE_Q); -const bool vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE] = { - [VM_PAGE_ON_INACTIVE_INTERNAL_Q] = true, - [VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = true, - [VM_PAGE_ON_INACTIVE_CLEANED_Q] = true, - [VM_PAGE_ON_ACTIVE_Q] = true, - [VM_PAGE_ON_THROTTLED_Q] = true, -#if CONFIG_SECLUDED_MEMORY - [VM_PAGE_ON_SECLUDED_Q] = true, -#endif /* CONFIG_SECLUDED_MEMORY */ -}; +const uint16_t vm_page_non_speculative_pageable_states = + vm_page_active_or_inactive_states | + BIT(VM_PAGE_ON_THROTTLED_Q); -const bool vm_page_active_or_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE] = { - [VM_PAGE_ON_INACTIVE_INTERNAL_Q] = true, - [VM_PAGE_ON_INACTIVE_EXTERNAL_Q] = true, - [VM_PAGE_ON_INACTIVE_CLEANED_Q] = true, - [VM_PAGE_ON_ACTIVE_Q] = true, -#if CONFIG_SECLUDED_MEMORY - [VM_PAGE_ON_SECLUDED_Q] = true, -#endif /* CONFIG_SECLUDED_MEMORY */ -}; +const uint16_t vm_page_pageable_states = + vm_page_non_speculative_pageable_states | + BIT(VM_PAGE_ON_SPECULATIVE_Q); #if CONFIG_SECLUDED_MEMORY struct vm_page_secluded_data vm_page_secluded; #endif /* CONFIG_SECLUDED_MEMORY */ +#if HIBERNATION +static bool hibernate_rebuild_needed = false; +#endif /* HIBERNATION */ #if DEVELOPMENT || DEBUG extern struct memory_object_pager_ops shared_region_pager_ops; @@ -215,16 +198,30 @@ unsigned int shared_region_pagers_resident_peak = 0; -int PERCPU_DATA(start_color); +unsigned int PERCPU_DATA(start_color); vm_page_t PERCPU_DATA(free_pages); +SCALABLE_COUNTER_DEFINE(vm_cpu_free_count); boolean_t hibernate_cleaning_in_progress = FALSE; atomic_counter_t vm_guard_count; -uint32_t vm_lopage_free_count = 0; -uint32_t vm_lopage_free_limit = 0; -uint32_t vm_lopage_lowater = 0; -boolean_t vm_lopage_refill = FALSE; -boolean_t vm_lopage_needed = FALSE; + +#if XNU_VM_HAS_LOPAGE +/* + * this interface exists to support hardware controllers + * incapable of generating 
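/*
 * Editor's sketch (illustrative): with the per-queue-state boolean tables above
 * collapsed into 16-bit masks, membership tests become a bit test against
 * BIT(vmp_q_state) instead of an array lookup.  The helper name is made up.
 */
static inline bool
sketch_page_is_inactive(vm_page_t mem)
{
	return (vm_page_inactive_states & BIT(mem->vmp_q_state)) != 0;
}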
DMAs with more than 32 bits + * of address on platforms with physical memory > 4G... + */ +vm_page_queue_head_t vm_lopage_queue_free VM_PAGE_PACKED_ALIGNED; +uint32_t vm_lopage_free_count = 0; +uint32_t vm_lopage_free_limit = 0; +uint32_t vm_lopage_lowater = 0; +bool vm_lopage_refill = false; +bool vm_lopage_needed = false; +unsigned int vm_lopages_allocated_q = 0; +unsigned int vm_lopages_allocated_cpm_success = 0; +unsigned int vm_lopages_allocated_cpm_failed = 0; +#endif /* XNU_VM_HAS_LOPAGE */ + int speculative_age_index = 0; int speculative_steal_index = 0; @@ -241,8 +238,10 @@ static void vm_tag_init(void); /* for debugging purposes */ SECURITY_READ_ONLY_EARLY(uint32_t) vm_packed_from_vm_pages_array_mask = VM_PAGE_PACKED_FROM_ARRAY; +#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* This is not a compile-time constant when building unit-test */ SECURITY_READ_ONLY_EARLY(vm_packing_params_t) vm_page_packing_params = VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR); +#endif /* __BUILDING_XNU_LIB_UNITTEST__ */ /* * Associated with page of user-allocatable memory is a @@ -294,7 +293,7 @@ SECURITY_READ_ONLY_LATE(unsigned int) vm_page_bucket_lock_count = 0; /* H /* for debugging */ SECURITY_READ_ONLY_LATE(bool) vm_tag_active_update = VM_TAG_ACTIVE_UPDATE; -SECURITY_READ_ONLY_LATE(lck_spin_t *) vm_page_bucket_locks; +SECURITY_READ_ONLY_LATE(lck_ticket_t *) vm_page_bucket_locks; vm_allocation_site_t vm_allocation_sites_static[VM_KERN_MEMORY_FIRST_DYNAMIC + 1]; vm_allocation_site_t * vm_allocation_sites[VM_MAX_TAG_VALUE]; @@ -380,6 +379,24 @@ SECURITY_READ_ONLY_LATE(uint32_t) vm_pages_count; #if XNU_VM_HAS_LINEAR_PAGES_ARRAY SECURITY_READ_ONLY_LATE(ppnum_t) vm_pages_first_pnum; #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */ +#if CONFIG_SPTM +/* + * When used, these 128bit (MAX_COLORS bits) masks represent a "cluster" + * of contiguous free physical pages. + * + * For each cluster, there is an enqueue "index", which is -1 when there is no + * free page in the cluster, or the index in [0, 128) of the page that is + * enqueued on the vm_page_free_queue to represent the entire cluster. + * + * Grouping pages this way has the double nice effect to reduce doubly linked + * list (the worst data structure known to man when considering cache misses) + * manipulations, and also to mechanically make the VM serve more "contiguous" + * pages naturally. + */ +static_assert(XNU_VM_HAS_LINEAR_PAGES_ARRAY); +SECURITY_READ_ONLY_LATE(__uint128_t *) _vm_pages_free_masks; +SECURITY_READ_ONLY_LATE(int8_t *) _vm_pages_free_enqueue_idx; +#endif /* CONFIG_SPTM */ /* @@ -387,12 +404,12 @@ SECURITY_READ_ONLY_LATE(ppnum_t) vm_pages_first_pnum; * are allocated from a set of free lists, * one per color. 
*/ -unsigned int vm_colors; -unsigned int vm_color_mask; /* mask is == (vm_colors-1) */ +SECURITY_READ_ONLY_LATE(unsigned int) vm_colors; +SECURITY_READ_ONLY_LATE(unsigned int) vm_color_mask; /* mask is == (vm_colors-1) */ unsigned int vm_cache_geometry_colors = 0; /* set by hw dependent code during startup */ unsigned int vm_free_magazine_refill_limit = 0; -struct vm_page_queue_free_head vm_page_queue_free[MAX_COLORS]; +struct vm_page_free_queue vm_page_queue_free; unsigned int vm_page_free_wanted; unsigned int vm_page_free_wanted_privileged; @@ -530,6 +547,8 @@ unsigned int vm_page_speculative_created = 0; unsigned int vm_page_speculative_used = 0; #endif +_Atomic unsigned int vm_page_swapped_count = 0; + vm_page_queue_head_t vm_page_queue_cleaned VM_PAGE_PACKED_ALIGNED; unsigned int vm_page_cleaned_count = 0; @@ -594,52 +613,24 @@ vm_set_page_size(void) } } -/* - * See the header for function documentation. - */ -vm_memory_class_t -vm_page_get_memory_class(vm_page_t page __unused) -{ - assert(!vm_page_is_fictitious(page)); +/* + * @abstract + * Given a page, returns the memory class of that page. + */ +static vm_memory_class_t +vm_page_get_memory_class(vm_page_t mem __unused, ppnum_t pnum __unused) +{ + assert(!vm_page_is_fictitious(mem)); + +#if XNU_VM_HAS_LOPAGE + if (mem->vmp_lopage) { + return VM_MEMORY_CLASS_LOPAGE; + } +#endif /* XNU_VM_HAS_LOPAGE */ return VM_MEMORY_CLASS_REGULAR; } -/* - * vm_page_validate_no_references: - * - * Make sure the physical page has no refcounts. - * - */ -static inline void -vm_page_validate_no_references( - vm_page_t mem) -{ - bool is_freed; - - if (vm_page_is_fictitious(mem)) { - return; - } - - pmap_paddr_t paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(mem)); - -#if CONFIG_SPTM - is_freed = pmap_is_page_free(paddr); -#else - is_freed = pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)); -#endif /* CONFIG_SPTM */ - - if (!is_freed) { - /* - * There is a redundancy here, but we are going to panic anyways, - * and ASSERT_PMAP_FREE traces useful information. So, we keep this - * behavior. - */ - ASSERT_PMAP_FREE(mem); - panic("%s: page 0x%llx is referenced", __func__, paddr); - } -} - /* * vm_page_is_restricted: * @@ -709,322 +700,356 @@ vm_page_setup_clump( void ) #endif /* __x86_64__ */ -/* - * vm_page_queue_free_remove: - * Removes a specific page from the global free queues. Based on the remove - * reason, this may update the page state... but it does not update the queue - * state. - */ -static void -vm_page_queue_free_remove(vm_page_t page, vm_remove_reason_t remove_reason) -{ - unsigned int color = VM_PAGE_GET_COLOR(page); - - vm_page_queue_remove(&vm_page_queue_free[color].qhead, page, vmp_pageq); - vm_page_free_count--; - - switch (remove_reason) { - case VM_REMOVE_REASON_USE: - { - break; - } - case VM_REMOVE_REASON_REBALANCE: - { - break; - } - default: - { - panic("Unrecognized remove reason %u", remove_reason); - __builtin_unreachable(); - } - } -} - -/* - * vm_page_queue_free_remove_first: - * Given a number of pages, removes that many pages from the head of the global - * free queues and returns a page list of these pages, with the queue state set - * to a state specified by the caller. - * - * Must be called with the free page lock held, preemption disabled, and with - * enough pages in the global free queues to satisfy the request. 
- */ -static vm_page_t -vm_page_queue_free_remove_first(unsigned int num_pages, unsigned int q_state) -{ - vm_page_t mem = VM_PAGE_NULL; - vm_page_t list = VM_PAGE_NULL; - vm_page_t old_list = VM_PAGE_NULL; - vm_page_t new_list = VM_PAGE_NULL; - int *colorp; - unsigned int color; - unsigned int clump_end __unused = 0; - unsigned int sub_count __unused = 0; - - LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED); - assert(get_preemption_level() != 0); - assert(q_state <= VM_PAGE_Q_STATE_LAST_VALID_VALUE); - assert(vm_page_free_count >= num_pages); - - colorp = PERCPU_GET(start_color); - color = *colorp; - vm_page_free_count -= num_pages; - - /* Get the pages. */ - while (num_pages--) { - while (vm_page_queue_empty(&vm_page_queue_free[color].qhead)) { - /* This color queue is empty; skip to the next one. */ - color = (color + 1) & vm_color_mask; - } - -#if defined(__x86_64__) - /* - * x86_64 uses a bespoke free queue scheme, where the free path - * tries to cluster clumps of contiguous pages together on - * the free queue to optimize for the platform's memory - * controller. - */ - vm_page_queue_remove_first_with_clump(&vm_page_queue_free[color].qhead, - mem, clump_end); - - if (clump_end) { - /* Only change colors at the end of a clump. */ - color = (color + 1) & vm_color_mask; - } - -#if DEVELOPMENT || DEBUG - sub_count++; - - if (clump_end) { - vm_clump_update_stats(sub_count); - sub_count = 0; - } -#endif /* !DEVELOPMENT && !DEBUG */ - -#else /* !defined(__x86_64__) */ - /* Other targets default to rotating colors after each pop. */ - vm_page_queue_remove_first(&vm_page_queue_free[color].qhead, mem, vmp_pageq); - color = (color + 1) & vm_color_mask; -#endif /* !defined(__x86_64__) */ - - vm_page_list_push(&list, mem); - - /* Assert that we got a sane page from the free queue. */ - assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q); - assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0); - assert(mem->vmp_tabled == FALSE); - assert(mem->vmp_object == 0); - assert(!mem->vmp_laundry); - assert(mem->vmp_busy); - assert(!mem->vmp_pmapped); - assert(!mem->vmp_wpmapped); - assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); - assert(!mem->vmp_realtime); - - /* Set the page to the client's desired queue state. */ - mem->vmp_q_state = q_state; - - } - -#if defined(__x86_64__) && (DEVELOPMENT || DEBUG) - vm_clump_update_stats(sub_count); -#endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */ - - /* Record the next page color the CPU should try to get. */ - *colorp = color; - - /* - * Some existing driver/IOKit code deals badly with getting physically - * contiguous memory... which this alloc code is rather likely to - * provide by accident immediately after boot. - * - * To avoid hitting issues related to this, we'll invert the order of - * the list we return. This code should be removed once we've tracked - * down the various driver issues. - */ - old_list = list; - - vm_page_list_foreach_consume(mem, &old_list) { - vm_page_list_push(&new_list, mem); - } - - list = new_list; - - return list; -} - -/* - * vm_page_queue_free_enter: - * Given a page, puts that pages onto the global free page queues. - * - * Must be called with the VM page free lock held. 
- */ -static void -vm_page_queue_free_enter(vm_page_t mem) -{ - int color = VM_PAGE_GET_COLOR(mem);; - - - assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); - assert(mem->vmp_busy); - assert(!mem->vmp_realtime); - - mem->vmp_lopage = FALSE; - mem->vmp_q_state = VM_PAGE_ON_FREE_Q; - -#if defined(__x86_64__) - vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem); -#else - vm_page_queue_enter(&vm_page_queue_free[color].qhead, mem, vmp_pageq); -#endif - - - vm_page_free_count++; -} - -/* - * See the header for documentation. - */ void -vm_page_steal_free_page(vm_page_t page, vm_remove_reason_t remove_reason) +vm_page_free_queue_init(vm_page_free_queue_t free_queue) { - vm_memory_class_t memory_class = vm_page_get_memory_class(page); - - assert(page->vmp_q_state == VM_PAGE_ON_FREE_Q); - - switch (memory_class) { - case VM_MEMORY_CLASS_REGULAR: - { - vm_page_queue_free_remove(page, remove_reason); - break; + for (unsigned int color = 0; color < MAX_COLORS; color++) { + vm_page_queue_init(&free_queue->vmpfq_queues[color].qhead); } - default: - { - panic("Unrecognized memory class %u\n", memory_class); - break; - } - } - -#if MACH_ASSERT - if (vm_check_refs_on_alloc) { - /* - * Stolen free pages should be unreferenced, just like grabbed free - * pages. - */ - vm_page_validate_no_references(page); - } -#endif /* MACH_ASSERT */ } -/* - * See the header for documentation. +/*! + * @function vm_page_free_queue_for_class() + * + * @abstract + * Returns the appropriate free queue for the given class and page color. */ -vmp_free_list_result_t -vm_page_put_list_on_free_queue(vm_page_t list, bool page_queues_locked) +__pure2 +static vm_page_queue_t +vm_page_free_queue_for_class(vm_memory_class_t mem_class, unsigned int color) { - vmp_free_list_result_t result = { }; - vm_page_t mem; - - LCK_MTX_ASSERT(&vm_page_queue_lock, - page_queues_locked ? LCK_MTX_ASSERT_OWNED : LCK_MTX_ASSERT_NOTOWNED); -#if !HIBERNATION - if (startup_phase >= STARTUP_SUB_KMEM) { - /* vm_page_release_startup() doesn't hold the lock */ - LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED); + switch (mem_class) { + case VM_MEMORY_CLASS_REGULAR: + return &vm_page_queue_free.vmpfq_queues[color].qhead; +#if XNU_VM_HAS_LOPAGE + case VM_MEMORY_CLASS_LOPAGE: + return &vm_lopage_queue_free; +#endif /* XNU_VM_HAS_LOPAGE */ +#if CONFIG_SECLUDED_MEMORY + case VM_MEMORY_CLASS_SECLUDED: + return &vm_page_queue_secluded; +#endif } -#endif /* !HIBERNATION */ +} - vm_page_list_foreach_consume(mem, &list) { - vm_memory_class_t memory_class = vm_page_get_memory_class(mem); +__pure2 +static bool +vm_page_free_queue_has_colors(vm_memory_class_t mem_class) +{ + switch (mem_class) { + case VM_MEMORY_CLASS_REGULAR: + return true; +#if XNU_VM_HAS_LOPAGE + case VM_MEMORY_CLASS_LOPAGE: + return false; +#endif /* XNU_VM_HAS_LOPAGE */ +#if CONFIG_SECLUDED_MEMORY + case VM_MEMORY_CLASS_SECLUDED: + return false; +#endif + } +} - /* Clear any specialQ hints before releasing page to the free pool*/ - mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY; - - if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) && - vm_lopage_free_count < vm_lopage_free_limit && - VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) { - /* - * this exists to support hardware controllers - * incapable of generating DMAs with more than 32 bits - * of address on platforms with physical memory > 4G... 
- */ - vm_page_queue_enter_first(&vm_lopage_queue_free, mem, - vmp_pageq); - vm_lopage_free_count++; - - if (vm_lopage_free_count >= vm_lopage_free_limit) { - vm_lopage_refill = FALSE; - } - - mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q; - mem->vmp_lopage = TRUE; - result.vmpr_lopage++; - continue; - } - #if CONFIG_SECLUDED_MEMORY - if (memory_class == VM_MEMORY_CLASS_REGULAR && - vm_page_free_count > vm_page_free_reserved && - vm_page_secluded_count < vm_page_secluded_target && - num_tasks_can_use_secluded_mem == 0) { - /* - * XXX FBDP TODO: also avoid refilling secluded queue - * when some IOKit objects are already grabbing from it... - */ - if (!page_queues_locked && !vm_page_trylock_queues()) { - /* take locks in right order */ - vm_free_page_unlock(); - vm_page_lock_queues(); - vm_free_page_lock_spin(); - } - mem->vmp_lopage = FALSE; - vm_page_queue_enter_first(&vm_page_queue_secluded, mem, - vmp_pageq); - mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q; - vm_page_secluded_count++; - vm_page_secluded_count_free++; - VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); - - if (!page_queues_locked) { - vm_page_unlock_queues(); - } - result.vmpr_secluded++; - continue; - } -#else - (void)page_queues_locked; -#endif /* CONFIG_SECLUDED_MEMORY */ - - switch (memory_class) { - case VM_MEMORY_CLASS_REGULAR: - vm_page_queue_free_enter(mem); - result.vmpr_regular++; - break; - default: - panic("unrecognized memory class %u", memory_class); - } +static bool +vm_page_secluded_pool_eligible(vm_memory_class_t class) +{ + switch (class) { +#if XNU_VM_HAS_LOPAGE + case VM_MEMORY_CLASS_LOPAGE: + return false; +#endif /* XNU_VM_HAS_LOPAGE */ + default: + return true; } - - return result; } static bool -vm_page_free_has_any_waiters(void) +vm_page_secluded_pool_depleted(void) +{ + if (vm_page_free_count <= vm_page_free_reserved) { + return false; + } + if (num_tasks_can_use_secluded_mem) { + return false; + } + return vm_page_secluded_count < vm_page_secluded_target; +} + +#endif /* CONFIG_SECLUDED_MEMORY */ +#if HIBERNATION + +__attribute__((overloadable)) +static void +vm_page_free_queue_foreach(vm_page_queue_t queue, void (^block)(vm_page_t)) +{ + vm_page_t page; + + vm_page_queue_iterate(queue, page, vmp_pageq) { + block(page); + } +} + +__attribute__((overloadable)) +static void +vm_page_free_queue_foreach(vm_page_free_queue_t queue, void (^block)(vm_page_t)) +{ + for (unsigned int color = 0; color < vm_colors; color++) { + vm_page_free_queue_foreach(&queue->vmpfq_queues[color].qhead, block); + } +} + +#endif /* HIBERNATION */ +#if CONFIG_SPTM + +static inline uint32_t +vm_pages_free_mask_len(void) +{ + extern pmap_paddr_t real_avail_end; + + uint64_t pnums = atop(real_avail_end) - pmap_first_pnum; + static_assert(8 * sizeof(__uint128_t) == MAX_COLORS); + return (uint32_t)((pnums + MAX_COLORS - 1) / MAX_COLORS); +} + +static inline int8_t +vm_pages_free_mask_bit(ppnum_t pnum) +{ + return (int8_t)(pnum & (MAX_COLORS - 1)); +} + +static inline uint32_t +vm_pages_free_mask_index(ppnum_t pnum) +{ + return (pnum - pmap_first_pnum) / MAX_COLORS; +} + +__pure2 +static inline __uint128_t * +vm_pages_free_masks(void) +{ + return _vm_pages_free_masks; +} + +__pure2 +static inline bitmap_t * +vm_pages_free_masks_as_bitmap(uint32_t index) +{ + /* + * this conversion is gross but helps with codegen for bit-wise + * accesses where the __uint128_t type is really yielding poor code. + * + * This conversion is only legal on little endian architectures. 
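/*
 * Illustrative sketch (not from the xnu sources): the circular next-bit
 * search used by vm_pages_free_mask_next_bit() below can be exercised on
 * its own.  Only __uint128_t and __builtin_ctzll() are assumed, as in the
 * kernel code; everything else here is hypothetical user-space scaffolding.
 */
#include <stdio.h>

/* Lowest set bit at position >= start, wrapping to bit 0 if none; -1 if empty. */
static int
next_bit_circular(__uint128_t value, int start)
{
        __uint128_t mask = ((__uint128_t)1 << start) - 1;

        if (value == 0) {
                return -1;
        }
        if (value & ~mask) {
                value &= ~mask;         /* prefer bits at or above 'start' */
        }
        if ((unsigned long long)value) {
                return __builtin_ctzll((unsigned long long)value);
        }
        return 64 + __builtin_ctzll((unsigned long long)(value >> 64));
}

int
main(void)
{
        __uint128_t m = ((__uint128_t)1 << 100) | ((__uint128_t)1 << 3);

        printf("%d\n", next_bit_circular(m, 4));    /* 100: next set bit at or after 4 */
        printf("%d\n", next_bit_circular(m, 101));  /* 3: wraps around */
        printf("%d\n", next_bit_circular(0, 0));    /* -1: empty mask */
        return 0;
}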
+ */ +#ifndef __LITTLE_ENDIAN__ +#error unsupported configuration +#endif + return (bitmap_t *)(_vm_pages_free_masks + index); +} + +__pure2 +static inline int8_t * +vm_pages_free_enqueue_idx(uint32_t index) +{ + return &_vm_pages_free_enqueue_idx[index]; +} + +/*! + * @brief + * Return the position of the next bit in "circular" order for a given cluster + * of pages, starting at and including @c bit. + */ +static inline int8_t +vm_pages_free_mask_next_bit(uint32_t index, int8_t bit) +{ + __uint128_t value = vm_pages_free_masks()[index]; + __uint128_t mask = ((__uint128_t)1 << bit) - 1; + + if (value == 0) { + return -1; + } + + if (value & ~mask) { + value &= ~mask; + } + if ((uint64_t)value) { + return (int8_t)__builtin_ctzll((uint64_t)value); + } + return 64 + (int8_t)__builtin_ctzll((uint64_t)(value >> 64)); +} + +static inline bool +vm_pages_free_mask_test(uint32_t index, int8_t bit) +{ + return bitmap_test(vm_pages_free_masks_as_bitmap(index), bit); +} + +static inline void +vm_pages_free_mask_set(uint32_t index, int8_t bit) +{ + assert(!vm_pages_free_mask_test(index, bit)); + bitmap_set(vm_pages_free_masks_as_bitmap(index), bit); +} + +static inline void +vm_pages_free_mask_clear(uint32_t index, int8_t bit) +{ + assert(vm_pages_free_mask_test(index, bit)); + bitmap_clear(vm_pages_free_masks_as_bitmap(index), bit); +} + +#endif /* CONFIG_SPTM */ + +__attribute__((always_inline)) +void +vm_page_free_queue_enter(vm_memory_class_t class, vm_page_t mem, ppnum_t pnum) +{ + bool enter_first; + unsigned int color; + vm_page_queue_t queue; + + if (startup_phase >= STARTUP_SUB_KMEM) { + LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED); + } + + assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 && + mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 && + mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 && + mem->vmp_next_m == 0 && + mem->vmp_object == 0 && + mem->vmp_wire_count == 0 && + mem->vmp_busy && + !mem->vmp_tabled && + !mem->vmp_laundry && + !mem->vmp_pmapped && + !mem->vmp_wpmapped && + !mem->vmp_realtime); + + switch (class) { +#if XNU_VM_HAS_LOPAGE + case VM_MEMORY_CLASS_LOPAGE: + mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q; + mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY; + mem->vmp_lopage = true; + mem->vmp_canonical = true; + enter_first = true; + break; +#endif /* XNU_VM_HAS_LOPAGE */ +#if CONFIG_SECLUDED_MEMORY + case VM_MEMORY_CLASS_SECLUDED: + if (startup_phase >= STARTUP_SUB_KMEM) { + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + } + mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q; + mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY; + mem->vmp_lopage = false; + mem->vmp_canonical = true; + enter_first = true; + break; +#endif + default: + mem->vmp_q_state = VM_PAGE_ON_FREE_Q; + mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY; + mem->vmp_lopage = false; + mem->vmp_canonical = true; + enter_first = false; + break; + } + + + color = VM_PAGE_GET_COLOR_PNUM(pnum); + queue = vm_page_free_queue_for_class(class, color); +#if CONFIG_SPTM + if (class == VM_MEMORY_CLASS_REGULAR && vm_pages_free_masks()) { + uint32_t index = vm_pages_free_mask_index(pnum); + int8_t bit = vm_pages_free_mask_bit(pnum); + + if (vm_pages_free_masks()[index] == 0) { + vm_page_queue_enter(queue, mem, vmp_pageq); + *vm_pages_free_enqueue_idx(index) = bit; + } + vm_pages_free_mask_set(index, bit); + } else +#endif /* CONFIG_SPTM */ + if (enter_first) { + vm_page_queue_enter_first(queue, mem, vmp_pageq); + } else 
{ +#if defined(__x86_64__) + vm_page_queue_enter_clump(queue, mem); +#else + vm_page_queue_enter(queue, mem, vmp_pageq); +#endif + } + + switch (class) { + case VM_MEMORY_CLASS_REGULAR: + VM_COUNTER_INC(&vm_page_queue_free.vmpfq_count); + VM_COUNTER_INC(&vm_page_free_count); + break; +#if XNU_VM_HAS_LOPAGE + case VM_MEMORY_CLASS_LOPAGE: + VM_COUNTER_INC(&vm_lopage_free_count); + if (vm_lopage_free_count >= vm_lopage_free_limit) { + vm_lopage_refill = false; + } + break; +#endif /* XNU_VM_HAS_LOPAGE */ +#if CONFIG_SECLUDED_MEMORY + case VM_MEMORY_CLASS_SECLUDED: + vm_page_secluded_count++; + vm_page_secluded_count_free++; + VM_PAGE_SECLUDED_COUNT_OVER_TARGET_UPDATE(); + break; +#endif /* CONFIG_SECLUDED_MEMORY */ + default: + __builtin_unreachable(); + } +} + +/*! + * @typedef vmp_free_list_result_t + * + * @discussion + * This data structure is used by vm_page_free_queue_add_list to track + * how many pages were freed to which free lists, so that it can then drive + * which waiters we are going to wake up. + * + * uint8_t counters are enough because we never free more than 64 pages at + * a time, and this allows for the data structure to be passed by register. + */ +typedef struct { + uint8_t vmpr_regular; + uint8_t vmpr_lopage; +#if CONFIG_SECLUDED_MEMORY + uint8_t vmpr_secluded; +#endif /* CONFIG_SECLUDED_MEMORY */ +} vmp_free_list_result_t; + +/*! + * @abstract + * Returns whether there are any threads blocked in VM_PAGE_WAIT(). + * + * @discussion + * The page free queue lock must be held. + */ +static bool +vm_page_free_queue_has_any_waiters(void) { uint32_t result = 0; + result |= vm_page_free_wanted; result |= vm_page_free_wanted_privileged; #if CONFIG_SECLUDED_MEMORY result |= vm_page_free_wanted_secluded; #endif /* CONFIG_SECLUDED_MEMORY */ - result |= vm_page_free_wanted; return result != 0; } -static void +void vm_page_free_wakeup(event_t event, uint32_t n) { if (vps_dynamic_priority_enabled) { @@ -1041,29 +1066,42 @@ vm_page_free_wakeup(event_t event, uint32_t n) } } +/*! + * @abstract + * Helper to wakeup threads in VM_PAGE_WAIT() given + * a vm_page_free_queue_enter_list() result. + * + * @discussion + * The page free queue lock must be held, and is unlocked on return. + * + * @param vmpr The result of a vm_page_free_queue_enter_list() call. + */ __attribute__((noinline)) static void -vm_page_free_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr) +vm_page_free_queue_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr) { unsigned int need_wakeup = 0; unsigned int need_priv_wakeup = 0; #if CONFIG_SECLUDED_MEMORY unsigned int need_wakeup_secluded = 0; #endif /* CONFIG_SECLUDED_MEMORY */ + unsigned int unpriv_limit; -#define DONATE_TO_WAITERS(count, waiters_count) ({ \ - uint32_t __n = MIN(waiters_count, vmpr.count); \ - waiters_count -= __n; \ - vmpr.count -= __n; \ - __n; \ +#define DONATE_TO_WAITERS(wake, count, waiters_count, limit) ({ \ + uint32_t __n = MIN(MIN(waiters_count, vmpr.count), limit); \ + waiters_count -= __n; \ + vmpr.count -= __n; \ + wake += __n; \ + __n; \ }) /* * Step 1: privileged waiters get to be satisfied first */ if (vm_page_free_wanted_privileged) { - need_priv_wakeup += DONATE_TO_WAITERS(vmpr_regular, - vm_page_free_wanted_privileged); + DONATE_TO_WAITERS(need_priv_wakeup, + vmpr_regular, vm_page_free_wanted_privileged, + UINT32_MAX); } @@ -1074,11 +1112,9 @@ vm_page_free_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr) * for free pages above the reserve threshold. 
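/*
 * Illustrative sketch (not from the xnu sources): DONATE_TO_WAITERS() above
 * hands freed pages to a waiter class, bounded by the number of pages freed,
 * the number of waiters, and a per-class limit.  The helper below is a
 * hypothetical stand-alone rendering of one donation step, followed by the
 * privileged-then-unprivileged ordering used by the wakeup path.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
donate_to_waiters(uint32_t *wake, uint8_t *freed, uint32_t *waiters, uint32_t limit)
{
        uint32_t n = *waiters;

        if (n > *freed) {
                n = *freed;
        }
        if (n > limit) {
                n = limit;
        }
        *waiters -= n;
        *freed -= (uint8_t)n;
        *wake += n;
        return n;
}

int
main(void)
{
        uint8_t  freed_regular = 10;            /* pages just freed to the regular queue */
        uint32_t wanted_priv = 3, wanted = 6;   /* privileged / unprivileged waiters */
        uint32_t wake_priv = 0, wake = 0;

        /* Privileged waiters are served first, with no limit. */
        donate_to_waiters(&wake_priv, &freed_regular, &wanted_priv, UINT32_MAX);

        /* Unprivileged waiters only get pages above the reserve, e.g. 4 here. */
        donate_to_waiters(&wake, &freed_regular, &wanted, 4);

        printf("priv woken %u, unpriv woken %u, pages left %u\n",
            (unsigned)wake_priv, (unsigned)wake, (unsigned)freed_regular); /* 3, 4, 3 */
        return 0;
}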
*/ if (vm_page_free_count <= vm_page_free_reserved) { - vmpr.vmpr_regular = 0; - } else if (vm_page_free_count - vmpr.vmpr_regular < - vm_page_free_reserved) { - vmpr.vmpr_regular = (uint8_t)(vm_page_free_count - - vm_page_free_reserved); + unpriv_limit = 0; + } else { + unpriv_limit = vm_page_free_count - vm_page_free_reserved; } /* @@ -1087,10 +1123,13 @@ vm_page_free_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr) */ #if CONFIG_SECLUDED_MEMORY if (vm_page_free_wanted_secluded) { - need_wakeup_secluded += DONATE_TO_WAITERS(vmpr_secluded, - vm_page_free_wanted_secluded); - need_wakeup_secluded += DONATE_TO_WAITERS(vmpr_regular, - vm_page_free_wanted_secluded); + DONATE_TO_WAITERS(need_wakeup_secluded, + vmpr_secluded, vm_page_free_wanted_secluded, + UINT32_MAX); + unpriv_limit -= DONATE_TO_WAITERS(need_wakeup_secluded, + vmpr_regular, vm_page_free_wanted_secluded, + unpriv_limit); + if (vm_page_free_wanted_secluded == 0) { need_wakeup_secluded = UINT32_MAX; } @@ -1101,8 +1140,9 @@ vm_page_free_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr) * Step 4: satisfy regular demand last. */ if (vm_page_free_wanted) { - need_wakeup += DONATE_TO_WAITERS(vmpr_regular, - vm_page_free_wanted); + unpriv_limit -= DONATE_TO_WAITERS(need_wakeup, + vmpr_regular, vm_page_free_wanted, + unpriv_limit); if (vm_page_free_wanted == 0) { need_wakeup = UINT32_MAX; } @@ -1118,10 +1158,9 @@ vm_page_free_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr) * lock all bets are off. * * To avoid this priority inversion that could really hurt the VM, - * disable preemption until we've woken up all privileged threads. + * disable preemption until we've woken up everyone. */ disable_preemption(); - vm_free_page_unlock(); /* @@ -1135,21 +1174,357 @@ vm_page_free_handle_wakeups_and_unlock(vmp_free_list_result_t vmpr) vm_page_free_wakeup(&vm_page_free_wanted_privileged, UINT32_MAX); } - enable_preemption(); - + if (need_wakeup) { + vm_page_free_wakeup(&vm_page_free_count, need_wakeup); + } #if CONFIG_SECLUDED_MEMORY if (need_wakeup_secluded) { vm_page_free_wakeup(&vm_page_free_wanted_secluded, need_wakeup_secluded); } #endif /* CONFIG_SECLUDED_MEMORY */ - if (need_wakeup) { - vm_page_free_wakeup(&vm_page_free_count, need_wakeup); - } + + enable_preemption(); #undef DONATE_TO_WAITERS } +/* + * @abstract + * Given a list of pages, put each page on whichever global free queue is + * appropriate. + * + * @discussion + * Must be called with the VM free page lock unlocked. + * + * The list must contain less than 255 elements. + */ +static void +vm_page_free_queue_enter_list(vm_page_list_t list, vmp_release_options_t opts) +{ + bool page_queues_unlock = false; + bool page_queues_locked = false; + bool do_secluded = false; + vmp_free_list_result_t result = { }; + vm_page_t mem; + + LCK_MTX_ASSERT(&vm_page_queue_lock, + (opts & VMP_RELEASE_Q_LOCKED) + ? LCK_MTX_ASSERT_OWNED + : LCK_MTX_ASSERT_NOTOWNED); + + /* + * Hibernation and startup do not really need the lock because + * these are single threaded paths, so from the PoV of that function, + * it's as if VMP_RELEASE_Q_LOCKED was passed. 
+ */ + page_queues_locked = (opts & (VMP_RELEASE_STARTUP | + VMP_RELEASE_HIBERNATE | + VMP_RELEASE_Q_LOCKED)); + +#if CONFIG_SECLUDED_MEMORY + do_secluded = vm_page_secluded_pool_depleted(); +#endif /* CONFIG_SECLUDED_MEMORY */ + + if (!page_queues_locked && (list.vmpl_has_realtime || do_secluded)) { + vm_page_lock_queues(); + page_queues_locked = true; + page_queues_unlock = true; + } + + if (opts & VMP_RELEASE_STARTUP) { + LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED); + } else { + vm_free_page_lock_spin(); + } + + vm_page_list_foreach_consume(mem, &list) { + ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem); + vm_memory_class_t class = vm_page_get_memory_class(mem, pnum); + + if (mem->vmp_realtime) { + mem->vmp_realtime = false; + VM_COUNTER_DEC(&vm_page_realtime_count); + } + +#if XNU_VM_HAS_LOPAGE + if ((class == VM_MEMORY_CLASS_REGULAR || + class == VM_MEMORY_CLASS_LOPAGE) && + vm_lopage_refill && + vm_lopage_free_count < vm_lopage_free_limit && + pnum < max_valid_low_ppnum) { + class = VM_MEMORY_CLASS_LOPAGE; + } else { + class = VM_MEMORY_CLASS_REGULAR; + } +#endif /* XNU_VM_HAS_LOPAGE */ + +#if CONFIG_SECLUDED_MEMORY + /* + * XXX FBDP TODO: also avoid refilling secluded queue + * when some IOKit objects are already grabbing from it... + */ + if (page_queues_locked && + vm_page_secluded_pool_eligible(class) && + vm_page_secluded_pool_depleted()) { + class = VM_MEMORY_CLASS_SECLUDED; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + + vm_page_free_queue_enter(class, mem, pnum); + + switch (class) { + case VM_MEMORY_CLASS_REGULAR: + result.vmpr_regular++; + break; +#if XNU_VM_HAS_LOPAGE + case VM_MEMORY_CLASS_LOPAGE: + result.vmpr_lopage++; + break; +#endif /* XNU_VM_HAS_LOPAGE */ +#if CONFIG_SECLUDED_MEMORY + case VM_MEMORY_CLASS_SECLUDED: + result.vmpr_secluded++; + continue; +#endif /* CONFIG_SECLUDED_MEMORY */ + } + } + + if (page_queues_unlock) { + vm_page_unlock_queues(); + } + + vm_pageout_vminfo.vm_page_pages_freed += list.vmpl_count; + VM_DEBUG_CONSTANT_EVENT(vm_page_release, DBG_VM_PAGE_RELEASE, + DBG_FUNC_NONE, list.vmpl_count, 0, 0, 0); + + if (opts & VMP_RELEASE_STARTUP) { + /* + * On purpose skip the VM_CHECK_MEMORYSTATUS, + * pmap_startup() will do it, + * and the caller holds the free queue lock the whole time. + */ + return; + } + + if (vm_page_free_queue_has_any_waiters()) { + vm_page_free_queue_handle_wakeups_and_unlock(result); + } else { + vm_free_page_unlock(); + } + + if ((opts & VMP_RELEASE_HIBERNATE) == 0) { + /* + * Skip VM_CHECK_MEMORYSTATUS here as + * hibernate_rebuild_vm_structs() will run it after the last flush. 
+ */ + VM_CHECK_MEMORYSTATUS; + } +} + +__attribute__((always_inline)) +void +vm_page_free_queue_remove( + vm_memory_class_t class, + vm_page_t mem, + ppnum_t pnum, + vm_page_q_state_t q_state) +{ + unsigned int color; + vm_page_queue_t queue; + + if (startup_phase >= STARTUP_SUB_KMEM) { + LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED); + } + + mem->vmp_q_state = q_state; + + + color = VM_PAGE_GET_COLOR_PNUM(pnum); + queue = vm_page_free_queue_for_class(class, color); +#if CONFIG_SPTM + if (class == VM_MEMORY_CLASS_REGULAR && vm_pages_free_masks()) { + uint32_t index = vm_pages_free_mask_index(pnum); + int8_t bit = vm_pages_free_mask_bit(pnum); + + vm_pages_free_mask_clear(index, bit); + if (*vm_pages_free_enqueue_idx(index) == bit) { + vm_page_queue_remove(queue, mem, vmp_pageq); + bit = vm_pages_free_mask_next_bit(index, bit); + *vm_pages_free_enqueue_idx(index) = bit; + + if (bit != -1) { + assert(vm_pages_free_mask_test(index, bit)); + pnum = (pnum & -MAX_COLORS) + bit; + mem = vm_page_find_canonical(pnum); + color = VM_PAGE_GET_COLOR_PNUM(pnum); + queue = vm_page_free_queue_for_class(class, color); + vm_page_queue_enter(queue, mem, vmp_pageq); + } + } + } else +#endif /* CONFIG_SPTM */ + { + vm_page_queue_remove(queue, mem, vmp_pageq); + } + + switch (class) { + case VM_MEMORY_CLASS_REGULAR: + VM_COUNTER_DEC(&vm_page_queue_free.vmpfq_count); + VM_COUNTER_DEC(&vm_page_free_count); + break; +#if XNU_VM_HAS_LOPAGE + case VM_MEMORY_CLASS_LOPAGE: + VM_COUNTER_DEC(&vm_lopage_free_count); + vm_lopages_allocated_q += 1; + if (vm_lopage_free_count < vm_lopage_lowater) { + vm_lopage_refill = true; + } + break; +#endif /* XNU_VM_HAS_LOPAGE */ + default: + __builtin_unreachable(); + } +} + +vm_page_list_t +vm_page_free_queue_grab( + vm_grab_options_t options __unused, + vm_memory_class_t class, + unsigned int num_pages, + vm_page_q_state_t q_state) +{ + unsigned int *colorp; + unsigned int color; +#if defined(__x86_64__) + unsigned int clump_end = 1; + unsigned int sub_count = 0; +#endif /* __x86_64__ */ + vm_page_list_t list = { }; + + if (startup_phase >= STARTUP_SUB_KMEM) { + LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_OWNED); + } + assert(get_preemption_level() != 0); + assert(q_state <= VM_PAGE_Q_STATE_LAST_VALID_VALUE); + + + colorp = PERCPU_GET(start_color); + color = *colorp; + + /* Get the pages. */ + while (list.vmpl_count < num_pages) { + uint32_t color_offset = 1; + vm_page_queue_t queue; + vm_page_t mem; + + queue = vm_page_free_queue_for_class(class, color); + if (!vm_page_free_queue_has_colors(class)) { + assert(!vm_page_queue_empty(queue)); + color_offset = 0; + } + while (vm_page_queue_empty(queue)) { + color = (color + 1) & vm_color_mask; + queue = vm_page_free_queue_for_class(class, color); + } + +#if defined(__x86_64__) + if (class == VM_MEMORY_CLASS_REGULAR) { + /* + * x86_64 uses a bespoke free queue scheme, where the free path + * tries to cluster clumps of contiguous pages together on + * the free queue to optimize for the platform's memory + * controller. + */ + vm_page_queue_remove_first_with_clump(queue, mem, clump_end); + sub_count++; + if (clump_end) { +#if DEVELOPMENT || DEBUG + vm_clump_update_stats(sub_count); +#endif /* !DEVELOPMENT && !DEBUG */ + sub_count = 0; + } else { + /* Only change colors at the end of a clump. */ + color_offset = 0; + } + } else +#endif /* !defined(__x86_64__) */ + { + /* Other targets default to rotating colors after each pop. 
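/*
 * Illustrative sketch (not from the xnu sources): allocation walks the free
 * queues by "color" and advances with a power-of-two mask, as in the
 * `color = (color + color_offset) & vm_color_mask` step of the grab loop.
 * The page-number-to-color mapping below is an assumption for illustration;
 * the real VM_PAGE_GET_COLOR_PNUM() may differ.
 */
#include <stdio.h>

#define SKETCH_COLORS     8u                    /* must be a power of two */
#define SKETCH_COLOR_MASK (SKETCH_COLORS - 1u)

int
main(void)
{
        unsigned int color = 5;

        /* Rotating after each pop cycles 5, 6, 7, 0, 1, 2, ... */
        for (int i = 0; i < 6; i++) {
                printf("pop from color %u\n", color);
                color = (color + 1) & SKETCH_COLOR_MASK;
        }
        printf("pnum 0x1234 -> color %u\n", 0x1234u & SKETCH_COLOR_MASK);
        return 0;
}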
*/ + vm_page_queue_remove_first(queue, mem, vmp_pageq); + } + +#if CONFIG_SPTM + if (vm_pages_free_masks()) { + ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem); + ppnum_t first_pnum = pnum & -MAX_COLORS; + uint32_t index = vm_pages_free_mask_index(pnum); + int8_t bit = vm_pages_free_mask_bit(pnum); + + for (;;) { + vm_pages_free_mask_clear(index, bit); + mem->vmp_q_state = q_state; + vm_page_list_push(&list, mem); + + bit = (bit + 1) & (MAX_COLORS - 1); + + if (!vm_pages_free_mask_test(index, bit) || + num_pages <= list.vmpl_count) { + break; + } + mem = vm_page_find_canonical(first_pnum + bit); + } + + color = bit & vm_color_mask; + + bit = vm_pages_free_mask_next_bit(index, bit); + *vm_pages_free_enqueue_idx(index) = bit; + + if (bit != -1) { + assert(vm_pages_free_mask_test(index, bit)); + mem = vm_page_find_canonical(first_pnum + bit); + queue = vm_page_free_queue_for_class(class, + bit & vm_color_mask); + vm_page_queue_enter_first(queue, mem, vmp_pageq); + } + } else +#endif /* CONFIG_SPTM */ + { + /* Set the page to the client's desired queue state. */ + mem->vmp_q_state = q_state; + vm_page_list_push(&list, mem); + + color = (color + color_offset) & vm_color_mask; + } + } + + switch (class) { + case VM_MEMORY_CLASS_REGULAR: + VM_COUNTER_SUB(&vm_page_queue_free.vmpfq_count, list.vmpl_count); + VM_COUNTER_SUB(&vm_page_free_count, list.vmpl_count); + break; +#if XNU_VM_HAS_LOPAGE + case VM_MEMORY_CLASS_LOPAGE: + VM_COUNTER_SUB(&vm_lopage_free_count, list.vmpl_count); + vm_lopages_allocated_q += list.vmpl_count; + if (vm_lopage_free_count < vm_lopage_lowater) { + vm_lopage_refill = true; + } + break; +#endif /* XNU_VM_HAS_LOPAGE */ + default: + __builtin_unreachable(); + } + + /* Record the next page color the CPU should try to get. */ + *colorp = color; +#if defined(__x86_64__) && (DEVELOPMENT || DEBUG) + vm_clump_update_stats(sub_count); +#endif /* defined(__x86_64__) && (DEVELOPMENT || DEBUG) */ + + return list; +} + #define COLOR_GROUPS_TO_STEAL 4 @@ -1216,8 +1591,9 @@ static ppnum_t delay_above_pnum = PPNUM_MAX; /* * Get and initialize the next delayed page. */ +__attribute__((noinline)) static vm_page_t -vm_get_delayed_page(int grab_options) +vm_get_delayed_page(vm_grab_options_t grab_options) { vm_page_t p; ppnum_t pnum; @@ -1293,7 +1669,7 @@ vm_free_delayed_pages(void) while ((p = vm_get_delayed_page(VM_PAGE_GRAB_OPTIONS_NONE)) != NULL) { if (vm_himemory_mode) { - vm_page_release(p, FALSE); + vm_page_release(p, VMP_RELEASE_NONE); } else { p->vmp_snext = list; list = p; @@ -1309,7 +1685,7 @@ vm_free_delayed_pages(void) p = list; list = p->vmp_snext; p->vmp_snext = NULL; - vm_page_release(p, FALSE); + vm_page_release(p, VMP_RELEASE_NONE); } #if DEVELOPMENT || DEBUG kprintf("vm_free_delayed_pages: initialized %d free pages\n", cnt); @@ -1378,7 +1754,7 @@ vm_free_delayed_pages_contig( return; } pnum = VM_PAGE_GET_PHYS_PAGE(p); - vm_page_release(p, FALSE); + vm_page_release(p, VMP_RELEASE_NONE); if (pnum >= max_pnum) { return; } @@ -1403,7 +1779,7 @@ vm_free_delayed_pages_contig( if (p == NULL) { return; } - vm_page_release(p, FALSE); + vm_page_release(p, VMP_RELEASE_NONE); } } @@ -1450,7 +1826,7 @@ vm_page_init_local_q(unsigned int num_cpus) * This should be called right before launchd is loaded. 
*/ void -vm_init_before_launchd() +vm_init_before_launchd(void) { vm_page_lockspin_queues(); vm_page_wire_count_on_boot = vm_page_wire_count; @@ -1507,11 +1883,10 @@ vm_page_bootstrap( purgeable_nonvolatile_count = 0; queue_init(&purgeable_nonvolatile_queue); - for (i = 0; i < MAX_COLORS; i++) { - vm_page_queue_init(&vm_page_queue_free[i].qhead); - } - + vm_page_free_queue_init(&vm_page_queue_free); +#if XNU_VM_HAS_LOPAGE vm_page_queue_init(&vm_lopage_queue_free); +#endif /* XNU_VM_HAS_LOPAGE */ vm_page_queue_init(&vm_page_queue_active); vm_page_queue_init(&vm_page_queue_inactive); #if CONFIG_SECLUDED_MEMORY @@ -1678,9 +2053,9 @@ vm_page_bootstrap( sizeof(vm_page_bucket_t), 0); kernel_debug_string_early("vm_page_bucket_locks"); - vm_page_bucket_locks = (lck_spin_t *) + vm_page_bucket_locks = (lck_ticket_t *) pmap_steal_memory(vm_page_bucket_lock_count * - sizeof(lck_spin_t), 0); + sizeof(lck_ticket_t), 0); for (i = 0; i < vm_page_bucket_count; i++) { vm_page_bucket_t *bucket = &vm_page_buckets[i]; @@ -1693,7 +2068,7 @@ vm_page_bootstrap( } for (i = 0; i < vm_page_bucket_lock_count; i++) { - lck_spin_init(&vm_page_bucket_locks[i], &vm_page_lck_grp_bucket, &vm_page_lck_attr); + lck_ticket_init(&vm_page_bucket_locks[i], &vm_page_lck_grp_bucket); } vm_tag_init(); @@ -1757,7 +2132,7 @@ vm_page_bootstrap( * On x86 it will allocate large pages if size is sufficiently large. We don't need to do this * on ARM yet, due to the combination of a large base page size and smaller RAM devices. */ -static void * +__static_testable void * pmap_steal_memory_internal( vm_size_t size, vm_size_t alignment, @@ -1767,10 +2142,14 @@ pmap_steal_memory_internal( { kern_return_t kr; vm_offset_t addr; + vm_offset_t end = 0; vm_offset_t map_addr; ppnum_t phys_page; unsigned int pmap_flags; + if (size > UINT64_MAX - sizeof(void *)) { + panic("pmap_steal_memory(): size: 0x%lx", size); + } /* * Size needs to be aligned to word size. */ @@ -1826,7 +2205,10 @@ pmap_steal_memory_internal( * Allocate and map physical pages to back the new virtual space. 
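/*
 * Illustrative sketch (not from the xnu sources): the hunk that follows
 * guards the end-address computation with os_add_overflow() before walking
 * map_addr up to it.  A minimal user-space equivalent of that guard, using
 * the compiler builtin such helpers are typically built on:
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uintptr_t
checked_end(uintptr_t addr, size_t size)
{
        uintptr_t end;

        if (__builtin_add_overflow(addr, size, &end)) {
                fprintf(stderr, "overflow: addr 0x%lx size 0x%zx\n",
                    (unsigned long)addr, size);
                abort();
        }
        return end;
}

int
main(void)
{
        printf("end = 0x%lx\n", (unsigned long)checked_end(0x1000, 0x2000));
        /* checked_end(UINTPTR_MAX, 1) would abort instead of silently wrapping. */
        return 0;
}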
*/ map_addr = round_page(addr); - while (map_addr < addr + size) { + if (os_add_overflow(addr, size, &end)) { + panic("pmap_steal_memory() overflow, addr: %lx, size: 0x%lx", addr, size); + } + while (map_addr < end) { #if defined(__x86_64__) /* * Back with a large page if properly aligned on x86 @@ -1893,7 +2275,7 @@ pmap_steal_memory_internal( return (void *) addr; } -void * +__mockable void * pmap_steal_memory( vm_size_t size, vm_size_t alignment) @@ -1929,7 +2311,7 @@ extern void patch_low_glo_vm_page_info(void *, void *, uint32_t); #endif void vm_page_release_startup(vm_page_t mem); -void +__mockable void pmap_startup( vm_offset_t *startp, vm_offset_t *endp) @@ -1977,6 +2359,17 @@ pmap_startup( #if XNU_VM_HAS_LINEAR_PAGES_ARRAY mem_sz = ptoa(pmap_free_pages_span()); +#if CONFIG_SPTM + { + uint32_t count = vm_pages_free_mask_len(); + + _vm_pages_free_masks = pmap_steal_memory(count * + sizeof(__uint128_t), sizeof(__uint128_t)); + _vm_pages_free_enqueue_idx = pmap_steal_memory(count, sizeof(uint8_t)); + bzero(_vm_pages_free_masks, count * sizeof(__uint128_t)); + memset(_vm_pages_free_enqueue_idx, 0xff, count); + } +#endif /* CONFIG_SPTM */ #else mem_sz = ptoa(pmap_free_pages()); #endif @@ -2046,6 +2439,8 @@ pmap_startup( #endif /* defined(__x86_64__) */ + vm_free_page_lock(); + for (uint32_t i = 0; i < npages && pmap_next_page(&phys_page); i++) { #if XNU_VM_HAS_DELAYED_PAGES if (phys_page < max_valid_low_ppnum) { @@ -2095,6 +2490,8 @@ pmap_startup( } } + vm_free_page_unlock(); + absolutetime_to_nanoseconds(mach_absolute_time(), &now_ns); printf("pmap_startup() init/release time: %lld microsec\n", (now_ns - start_ns) / NSEC_PER_USEC); @@ -2317,10 +2714,8 @@ vm_pages_radix_load_root(uint32_t *level) return (vm_page_radix_node_t)root; } -#if XNU_HANDLE_ECC || DEBUG || DEVELOPMENT - -static vm_page_t -vm_pages_radix_next(uint32_t *cursor) +vm_page_t +vm_pages_radix_next(uint32_t *cursor, ppnum_t *pnum) { const uint32_t max_index = vm_pages_first_pnum - pmap_first_pnum; vm_page_radix_node_t node; @@ -2346,16 +2741,19 @@ vm_pages_radix_next(uint32_t *cursor) level -= 1; } else { *cursor = index + 1; + if (pnum) { + *pnum = pmap_first_pnum + index; + } return (vm_page_t)VM_PAGE_UNPACK_PTR(ptr); } } + if (pnum) { + *pnum = 0; + } return VM_PAGE_NULL; } -#define vm_pages_radix_for_each(it) \ - for (uint32_t __index = 0; ((it) = vm_pages_radix_next(&__index)); ) - #if DEBUG || DEVELOPMENT static int @@ -2381,7 +2779,6 @@ vm_page_radix_verify_test(int64_t in __unused, int64_t *out) SYSCTL_TEST_REGISTER(vm_page_radix_verify, vm_page_radix_verify_test); #endif /* DEBUG || DEVELOPMENT */ -#endif /* XNU_HANDLE_ECC || DEBUG || DEVELOPMENT */ __attribute__((noinline)) static void @@ -2480,7 +2877,7 @@ vm_page_find_canonical(ppnum_t pnum) * @brief * Common helper for all vm_page_create* functions. 
*/ -static vm_page_t +vm_page_t vm_page_create(ppnum_t phys_page, bool canonical, zalloc_flags_t flags) { vm_page_t m; @@ -2498,6 +2895,9 @@ vm_page_create(ppnum_t phys_page, bool canonical, zalloc_flags_t flags) #if XNU_VM_HAS_LINEAR_PAGES_ARRAY vm_pages_radix_insert(phys_page, m); #endif /* XNU_VM_HAS_LINEAR_PAGES_ARRAY */ + vm_free_page_lock(); + vm_page_pages++; + vm_free_page_unlock(); } return m; } @@ -2514,14 +2914,10 @@ vm_page_create(ppnum_t phys_page, bool canonical, zalloc_flags_t flags) void vm_page_create_canonical(ppnum_t phys_page) { - vm_page_t m; + vm_page_t m; m = vm_page_create(phys_page, true, Z_WAITOK); - - vm_free_page_lock(); - vm_page_pages++; - vm_free_page_unlock(); - vm_page_release(m, FALSE); + vm_page_release(m, VMP_RELEASE_NONE); } @@ -2577,7 +2973,7 @@ vm_page_insert_internal( uint64_t *delayed_ledger_update) { vm_page_bucket_t *bucket; - lck_spin_t *bucket_lock; + lck_ticket_t *bucket_lock; int hash_id; task_t owner; int ledger_idx_volatile; @@ -2645,7 +3041,7 @@ vm_page_insert_internal( bucket = &vm_page_buckets[hash_id]; bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK]; - lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket); + lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket); mem->vmp_next_m = bucket->page_list; bucket->page_list = VM_PAGE_PACK_PTR(mem); @@ -2657,7 +3053,7 @@ vm_page_insert_internal( } #endif /* MACH_PAGE_HASH_STATS */ mem->vmp_hashed = TRUE; - lck_spin_unlock(bucket_lock); + lck_ticket_unlock(bucket_lock); } { @@ -2692,6 +3088,10 @@ vm_page_insert_internal( } assert(object->resident_page_count >= object->wired_page_count); +#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 + vm_object_set_chead_hint(object); +#endif + #if DEVELOPMENT || DEBUG if (object->object_is_shared_cache && object->pager != NULL && @@ -2804,6 +3204,7 @@ vm_page_insert_internal( } } + #if VM_OBJECT_TRACKING_OP_MODIFIED if (vm_object_tracking_btlog && object->internal && @@ -2834,8 +3235,8 @@ vm_page_replace( { vm_page_bucket_t *bucket; vm_page_t found_m = VM_PAGE_NULL; - lck_spin_t *bucket_lock; - int hash_id; + lck_ticket_t *bucket_lock; + int hash_id; #if 0 /* @@ -2871,7 +3272,7 @@ vm_page_replace( bucket = &vm_page_buckets[hash_id]; bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK]; - lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket); + lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket); if (bucket->page_list) { vm_page_packed_t *mp = &bucket->page_list; @@ -2905,7 +3306,7 @@ vm_page_replace( bucket->page_list = VM_PAGE_PACK_PTR(mem); mem->vmp_hashed = TRUE; - lck_spin_unlock(bucket_lock); + lck_ticket_unlock(bucket_lock); if (found_m) { /* @@ -2934,7 +3335,7 @@ vm_page_remove( { vm_page_bucket_t *bucket; vm_page_t this; - lck_spin_t *bucket_lock; + lck_ticket_t *bucket_lock; int hash_id; task_t owner; vm_object_t m_object; @@ -2971,7 +3372,7 @@ vm_page_remove( bucket = &vm_page_buckets[hash_id]; bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK]; - lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket); + lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket); if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) { /* optimize for common case */ @@ -2992,7 +3393,7 @@ vm_page_remove( #endif /* MACH_PAGE_HASH_STATS */ mem->vmp_hashed = FALSE; this->vmp_next_m = VM_PAGE_PACK_PTR(NULL); - lck_spin_unlock(bucket_lock); + lck_ticket_unlock(bucket_lock); } /* * Now remove from the object's list of backed pages. 
@@ -3192,7 +3593,7 @@ vm_page_lookup( vm_page_t mem; vm_page_bucket_t *bucket; vm_page_queue_entry_t qe; - lck_spin_t *bucket_lock = NULL; + lck_ticket_t *bucket_lock = NULL; int hash_id; #if DEBUG_VM_PAGE_LOOKUP uint64_t start, elapsed; @@ -3200,11 +3601,11 @@ vm_page_lookup( OSAddAtomic64(1, &vm_page_lookup_stats.vpl_total); #endif -#if CONFIG_KERNEL_TAGGING +#if KASAN_TBI if (is_kernel_object(object)) { offset = vm_memtag_canonicalize_kernel(offset); } -#endif /* CONFIG_KERNEL_TAGGING */ +#endif /* KASAN_TBI */ vm_object_lock_assert_held(object); assertf(page_aligned(offset), "offset 0x%llx\n", offset); @@ -3308,7 +3709,7 @@ vm_page_lookup( bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK]; - lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket); + lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket); for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; @@ -3324,7 +3725,7 @@ vm_page_lookup( break; } } - lck_spin_unlock(bucket_lock); + lck_ticket_unlock(bucket_lock); } #if DEBUG_VM_PAGE_LOOKUP @@ -3446,10 +3847,11 @@ vm_page_init(vm_page_t mem, ppnum_t phys_page) assert(VM_PAGE_NOT_ON_Q == 0); assert(sizeof(*mem) % sizeof(uintptr_t) == 0); *mem = (struct vm_page) { - .vmp_q_state = VM_PAGE_NOT_ON_Q, - .vmp_canonical = vm_page_in_array(mem), - .vmp_offset = (vm_object_offset_t)-1, - .vmp_busy = true, + .vmp_offset = (vm_object_offset_t)-1, + .vmp_q_state = VM_PAGE_NOT_ON_Q, + .vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY, + .vmp_canonical = vm_page_in_array(mem), + .vmp_busy = true, }; VM_PAGE_INIT_PHYS_PAGE(mem, phys_page); @@ -3646,9 +4048,7 @@ vm_page_update_special_state(vm_page_t mem) return; } - int mode = mem->vmp_on_specialq; - - switch (mode) { + switch (mem->vmp_on_specialq) { case VM_PAGE_SPECIAL_Q_BG: { task_t my_task = current_task_early(); @@ -3709,7 +4109,7 @@ vm_page_update_special_state(vm_page_t mem) void -vm_page_assign_special_state(vm_page_t mem, int mode) +vm_page_assign_special_state(vm_page_t mem, vm_page_specialq_t mode) { if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { return; @@ -3753,17 +4153,13 @@ vm_page_assign_special_state(vm_page_t mem, int mode) void -vm_page_remove_from_specialq( - vm_page_t mem) +vm_page_remove_from_specialq(vm_page_t mem) { vm_object_t m_object; - unsigned short mode; LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - mode = mem->vmp_on_specialq; - - switch (mode) { + switch (mem->vmp_on_specialq) { case VM_PAGE_SPECIAL_Q_BG: { if (mem->vmp_specialq.next && mem->vmp_specialq.prev) { @@ -3813,9 +4209,7 @@ vm_page_remove_from_specialq( void -vm_page_add_to_specialq( - vm_page_t mem, - boolean_t first) +vm_page_add_to_specialq(vm_page_t mem, boolean_t first) { vm_object_t m_object; @@ -3825,9 +4219,7 @@ vm_page_add_to_specialq( return; } - int mode = mem->vmp_on_specialq; - - switch (mode) { + switch (mem->vmp_on_specialq) { case VM_PAGE_SPECIAL_Q_BG: { if (vm_page_background_mode == VM_PAGE_BG_DISABLED) { @@ -3879,6 +4271,71 @@ vm_page_add_to_specialq( } } +/*! + * @brief + * Prepares a page that has been successfully grabbed for the caller. + * + * @discussion + * This function will update accounting, emit tracements, ... + */ +static vm_page_t +vm_page_grab_finalize(vm_grab_options_t grab_options __unused, vm_page_t mem) +{ + task_t task; + +#if MACH_ASSERT + /* + * For all free pages, no matter their provenance... + * ensure they are not referenced anywhere, + * and their state is clean. 
+ */ + if (vm_check_refs_on_alloc) { + pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem)); + } + assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 && + mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 && + mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 && + mem->vmp_next_m == 0 && + mem->vmp_object == 0 && + mem->vmp_wire_count == 0 && + mem->vmp_busy && + !mem->vmp_tabled && + !mem->vmp_laundry && + !mem->vmp_pmapped && + !mem->vmp_wpmapped && + !mem->vmp_realtime); +#endif /* MACH_ASSERT */ + + mem->vmp_q_state = VM_PAGE_NOT_ON_Q; + VM_PAGE_ZERO_PAGEQ_ENTRY(mem); + + { + VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB, + DBG_FUNC_NONE, grab_options, 0, 0, 0); + } + + counter_inc(&vm_page_grab_count); + + task = current_task_early(); + if (task != TASK_NULL) { + ledger_credit(task->ledger, task_ledgers.pages_grabbed, 1); + } + if (task != TASK_NULL && task != kernel_task) { + /* + * tag:DONATE this is where the donate state of the page + * is decided according to what task grabs it + */ + if (task->donates_own_pages) { + vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE); + } else { + vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG); + } + } + + return mem; +} + #if __x86_64__ /* * This can be switched to FALSE to help debug drivers @@ -3887,51 +4344,36 @@ vm_page_add_to_specialq( boolean_t vm_himemory_mode = TRUE; #endif /* __x86_64__ */ -/* - * this interface exists to support hardware controllers - * incapable of generating DMAs with more than 32 bits - * of address on platforms with physical memory > 4G... - */ -unsigned int vm_lopages_allocated_q = 0; -unsigned int vm_lopages_allocated_cpm_success = 0; -unsigned int vm_lopages_allocated_cpm_failed = 0; -vm_page_queue_head_t vm_lopage_queue_free VM_PAGE_PACKED_ALIGNED; +#if XNU_VM_HAS_LOPAGE vm_page_t -vm_page_grablo(void) +vm_page_grablo(vm_grab_options_t grab_options) { - vm_page_t mem; + vm_page_t mem = VM_PAGE_NULL; - if (vm_lopage_needed == FALSE) { - int grab_options = VM_PAGE_GRAB_OPTIONS_NONE; + if (!vm_lopage_needed) { return vm_page_grab_options(grab_options); } vm_free_page_lock_spin(); + if (vm_lopage_free_count) { +#if LCK_MTX_USE_ARCH + /* + * Intel locks do not really always disable preemption + * for lck_mtx_lock_spin(), and vm_page_free_queue_grab() + * really want that. 
+ */ + disable_preemption(); +#endif + mem = vm_page_free_queue_grab(grab_options, + VM_MEMORY_CLASS_LOPAGE, 1, VM_PAGE_NOT_ON_Q).vmpl_head; +#if LCK_MTX_USE_ARCH + enable_preemption(); +#endif + } + vm_free_page_unlock(); - if (!vm_page_queue_empty(&vm_lopage_queue_free)) { - vm_page_queue_remove_first(&vm_lopage_queue_free, mem, vmp_pageq); - assert(vm_lopage_free_count); - assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q); - mem->vmp_q_state = VM_PAGE_NOT_ON_Q; - - vm_lopage_free_count--; - vm_lopages_allocated_q++; - - if (vm_lopage_free_count < vm_lopage_lowater) { - vm_lopage_refill = TRUE; - } - - vm_free_page_unlock(); - - if (current_task()->donates_own_pages) { - vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE); - } else { - vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG); - } - } else { - vm_free_page_unlock(); - + if (mem == VM_PAGE_NULL) { if (cpm_allocate(PAGE_SIZE, &mem, atop(PPNUM_MAX), 0, FALSE, KMA_LOMEM) != KERN_SUCCESS) { vm_free_page_lock_spin(); vm_lopages_allocated_cpm_failed++; @@ -3952,349 +4394,40 @@ vm_page_grablo(void) vm_lopages_allocated_cpm_success++; vm_page_unlock_queues(); } - assert(mem->vmp_busy); - assert(!mem->vmp_pmapped); - assert(!mem->vmp_wpmapped); - assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); - VM_PAGE_ZERO_PAGEQ_ENTRY(mem); - - counter_inc(&vm_page_grab_count); - VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB, DBG_FUNC_NONE, 0, 1, 0, 0); - - return mem; + return vm_page_grab_finalize(grab_options, mem); } -/* - * vm_page_grab: +#endif /* XNU_VM_HAS_LOPAGE */ +#if CONFIG_SECLUDED_MEMORY + +/*! + * @brief + * Attempt to allocate a page from the secluded queue * - * first try to grab a page from the per-cpu free list... - * this must be done while pre-emption is disabled... if - * a page is available, we're done... - * if no page is available, grab the vm_page_queue_free_lock - * and see if current number of free pages would allow us - * to grab at least 1... if not, return VM_PAGE_NULL as before... - * if there are pages available, disable preemption and - * recheck the state of the per-cpu free list... we could - * have been preempted and moved to a different cpu, or - * some other thread could have re-filled it... if still - * empty, figure out how many pages we can steal from the - * global free queue and move to the per-cpu queue... - * return 1 of these pages when done... only wakeup the - * pageout_scan thread if we moved pages from the global - * list... no need for the wakeup if we've satisfied the - * request from the per-cpu queue. - */ - -#if CONFIG_SECLUDED_MEMORY -vm_page_t vm_page_grab_secluded(void); -#endif /* CONFIG_SECLUDED_MEMORY */ - -static inline void -vm_page_grab_diags(void); - -vm_page_t -vm_page_grab(void) -{ - return vm_page_grab_options(VM_PAGE_GRAB_OPTIONS_NONE); -} - -#if HIBERNATION -boolean_t hibernate_rebuild_needed = FALSE; -#endif /* HIBERNATION */ - -static void -vm_page_finalize_grabed_page(vm_page_t mem) -{ - task_t cur_task = current_task_early(); - if (cur_task && cur_task != kernel_task) { - /* tag:DONATE this is where the donate state of the page is decided according to what task grabs it */ - if (cur_task->donates_own_pages) { - vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_DONATE); - } else { - vm_page_assign_special_state(mem, VM_PAGE_SPECIAL_Q_BG); - } - } -} - -/* - * vm_page_grab_options_internal: - * Core logic for a page grab request. 
This is separate from the actual - * vm_page_grab_options interface so that this function can have multiple paths - * to return a page, but the interface will unilaterally run any checks or - * followup we want on a grabbed page. + * @discussion + * This function will check that the caller is eligible + * for the secluded pool, and if not, return VM_PAGE_NULL. */ +__attribute__((noinline)) static vm_page_t -vm_page_grab_options_internal( - int grab_options) -{ - vm_page_t mem; - -restart: - disable_preemption(); - - if ((mem = *PERCPU_GET(free_pages))) { - assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q); - -#if HIBERNATION - if (hibernate_rebuild_needed) { - panic("%s:%d should not modify cpu->free_pages while hibernating", __FUNCTION__, __LINE__); - } -#endif /* HIBERNATION */ - - vm_page_grab_diags(); - - vm_offset_t pcpu_base = current_percpu_base(); - counter_inc_preemption_disabled(&vm_page_grab_count); - *PERCPU_GET_WITH_BASE(pcpu_base, free_pages) = mem->vmp_snext; - VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0); - - VM_PAGE_ZERO_PAGEQ_ENTRY(mem); - mem->vmp_q_state = VM_PAGE_NOT_ON_Q; - enable_preemption(); - - assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0); - assert(mem->vmp_tabled == FALSE); - assert(mem->vmp_object == 0); - assert(!mem->vmp_laundry); - assert(mem->vmp_busy); - assert(!mem->vmp_pmapped); - assert(!mem->vmp_wpmapped); - assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); - assert(!mem->vmp_realtime); - -#if MACH_ASSERT - if (vm_check_refs_on_alloc) { - vm_page_validate_no_references(mem); - } -#endif - vm_page_finalize_grabed_page(mem); - return mem; - } - enable_preemption(); - - /* - * Optionally produce warnings if the wire or gobble - * counts exceed some threshold. - */ -#if VM_PAGE_WIRE_COUNT_WARNING - if (vm_page_wire_count >= VM_PAGE_WIRE_COUNT_WARNING) { - printf("mk: vm_page_grab(): high wired page count of %d\n", - vm_page_wire_count); - } -#endif -#if VM_PAGE_GOBBLE_COUNT_WARNING - if (vm_page_gobble_count >= VM_PAGE_GOBBLE_COUNT_WARNING) { - printf("mk: vm_page_grab(): high gobbled page count of %d\n", - vm_page_gobble_count); - } -#endif - -#if XNU_VM_HAS_DELAYED_PAGES - /* - * If free count is low and we have delayed pages from early boot, - * get one of those instead. - */ - if (__improbable(vm_delayed_count > 0 && - vm_page_free_count <= vm_page_free_target && - (mem = vm_get_delayed_page(grab_options)) != NULL)) { - assert(!mem->vmp_realtime); - // TODO: missing vm_page_finalize_grabed_page()? - return mem; - } -#endif /* XNU_VM_HAS_DELAYED_PAGES */ - - vm_free_page_lock_spin(); - - /* - * Only let privileged threads (involved in pageout) - * dip into the reserved pool. - */ - if ((vm_page_free_count < vm_page_free_reserved) && - !(current_thread()->options & TH_OPT_VMPRIV)) { - /* no page for us in the free queue... */ - vm_free_page_unlock(); - mem = VM_PAGE_NULL; - -#if CONFIG_SECLUDED_MEMORY - /* ... but can we try and grab from the secluded queue? 
*/ - if (vm_page_secluded_count > 0 && - ((grab_options & VM_PAGE_GRAB_SECLUDED) || - task_can_use_secluded_mem(current_task(), TRUE))) { - mem = vm_page_grab_secluded(); - if (grab_options & VM_PAGE_GRAB_SECLUDED) { - vm_page_secluded.grab_for_iokit++; - if (mem) { - vm_page_secluded.grab_for_iokit_success++; - } - } - if (mem) { - VM_CHECK_MEMORYSTATUS; - - vm_page_grab_diags(); - counter_inc(&vm_page_grab_count); - VM_DEBUG_EVENT(vm_page_grab, DBG_VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0); - - assert(!mem->vmp_realtime); - // TODO: missing vm_page_finalize_grabed_page()? - return mem; - } - } -#endif /* CONFIG_SECLUDED_MEMORY */ - (void) grab_options; - } else { - unsigned int pages_to_steal; - - - /* - * Replenishing our per-CPU cache of free pages might take - * too long to keep holding the "free_page" lock as a spinlock, - * so convert to the full mutex to prevent other threads trying - * to acquire the "free_page" lock from timing out spinning on - * the mutex interlock. - */ - vm_free_page_lock_convert(); - - while (vm_page_free_count == 0) { - vm_free_page_unlock(); - /* - * must be a privileged thread to be - * in this state since a non-privileged - * thread would have bailed if we were - * under the vm_page_free_reserved mark - */ - VM_PAGE_WAIT(); - vm_free_page_lock(); - } - - /* - * Need to repopulate the per-CPU free list from the global free list. - * Note we don't do any processing of pending retirement pages here. - * That'll happen in the code above when the page comes off the per-CPU list. - */ - disable_preemption(); - - /* - * If we got preempted the cache might now have pages. - */ - if ((mem = *PERCPU_GET(free_pages))) { - vm_free_page_unlock(); - enable_preemption(); - goto restart; - } - - if (vm_page_free_count <= vm_page_free_reserved) { - pages_to_steal = 1; - } else { - if (vm_free_magazine_refill_limit <= (vm_page_free_count - vm_page_free_reserved)) { - pages_to_steal = vm_free_magazine_refill_limit; - } else { - pages_to_steal = (vm_page_free_count - vm_page_free_reserved); - } - } - - /* Grab pages from the global free queues. */ - mem = vm_page_queue_free_remove_first(pages_to_steal, VM_PAGE_ON_FREE_LOCAL_Q); - -#if HIBERNATION - if (hibernate_rebuild_needed) { - panic("%s:%d should not modify cpu->free_pages while hibernating", __FUNCTION__, __LINE__); - } -#endif /* HIBERNATION */ - - /* Make the grabbed list the per-CPU free list. */ - vm_offset_t pcpu_base = current_percpu_base(); - *PERCPU_GET_WITH_BASE(pcpu_base, free_pages) = mem; - - /* - * We decremented vm_page_free_count above - * so we must wake up vm_pageout_scan() if - * we brought it down below vm_page_free_min. - */ - bool wakeup_pageout_scan = false; - if (vm_page_free_count < vm_page_free_min && - !vm_pageout_running) { - wakeup_pageout_scan = true; - } - vm_free_page_unlock(); - - enable_preemption(); - - if (wakeup_pageout_scan) { - thread_wakeup((event_t) &vm_page_free_wanted); - } - VM_CHECK_MEMORYSTATUS; - - goto restart; - } - - /* - * Decide if we should poke the pageout daemon. - * We do this if the free count is less than the low - * water mark. VM Pageout Scan will keep running till - * the free_count > free_target (& hence above free_min). - * This wakeup is to catch the possibility of the counts - * dropping between VM Pageout Scan parking and this check. - * - * We don't have the counts locked ... if they change a little, - * it doesn't really matter. 
- */ - if (vm_page_free_count < vm_page_free_min) { - vm_free_page_lock(); - if (vm_pageout_running == FALSE) { - vm_free_page_unlock(); - thread_wakeup((event_t) &vm_page_free_wanted); - } else { - vm_free_page_unlock(); - } - } - - VM_CHECK_MEMORYSTATUS; - - if (mem) { - assert(!mem->vmp_realtime); -// dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 4); /* (TEST/DEBUG) */ - - vm_page_finalize_grabed_page(mem); - } - return mem; -} - -vm_page_t -vm_page_grab_options( - int grab_options) -{ - vm_page_t mem = vm_page_grab_options_internal(grab_options); - - /* - * For all free pages, no matter their provenance... ensure they are - * not referenced anywhere. - */ - if (mem != VM_PAGE_NULL) { - -#if MACH_ASSERT - if (vm_check_refs_on_alloc) { - vm_page_validate_no_references(mem); - } -#endif /* MACH_ASSERT */ - } - - return mem; -} - -#if CONFIG_SECLUDED_MEMORY -vm_page_t -vm_page_grab_secluded(void) +vm_page_grab_secluded(vm_grab_options_t grab_options) { vm_page_t mem; vm_object_t object; int refmod_state; if (vm_page_secluded_count == 0) { - /* no secluded pages to grab... */ return VM_PAGE_NULL; } + if (grab_options & VM_PAGE_GRAB_SECLUDED) { + vm_page_secluded.grab_for_iokit++; + } else if (!task_can_use_secluded_mem(current_task(), TRUE)) { + return VM_PAGE_NULL; + } + + /* secluded queue is protected by the VM page queue lock */ vm_page_lock_queues(); @@ -4338,18 +4471,7 @@ vm_page_grab_secluded(void) /* free for grab! */ vm_page_unlock_queues(); vm_page_secluded.grab_success_free++; - - assert(mem->vmp_busy); - assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); - assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL); - assert(mem->vmp_pageq.next == 0); - assert(mem->vmp_pageq.prev == 0); - assert(mem->vmp_listq.next == 0); - assert(mem->vmp_listq.prev == 0); - assert(mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY); - assert(mem->vmp_specialq.next == 0); - assert(mem->vmp_specialq.prev == 0); - return mem; + goto out_success; } assert(!object->internal); @@ -4400,7 +4522,6 @@ reactivate_secluded_page: vm_page_unlock_queues(); - /* finish what vm_page_free() would have done... */ vm_page_free_prepare_object(mem, TRUE); vm_object_unlock(object); @@ -4409,17 +4530,11 @@ reactivate_secluded_page: pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); vm_page_secluded.grab_success_other++; - assert(mem->vmp_busy); - assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); - assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL); - assert(mem->vmp_pageq.next == 0); - assert(mem->vmp_pageq.prev == 0); - assert(mem->vmp_listq.next == 0); - assert(mem->vmp_listq.prev == 0); - assert(mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY); - assert(mem->vmp_specialq.next == 0); - assert(mem->vmp_specialq.prev == 0); +out_success: + if (grab_options & VM_PAGE_GRAB_SECLUDED) { + vm_page_secluded.grab_for_iokit_success++; + } return mem; } @@ -4482,91 +4597,250 @@ vm_page_secluded_drain(void) return num_reclaimed; } + #endif /* CONFIG_SECLUDED_MEMORY */ -static inline void -vm_page_grab_diags() +/*! + * @brief + * Attempts to allocate a page from the specified per-cpu page queue. 
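/*
 * Illustrative sketch (not from the xnu sources): the per-CPU fast path is a
 * plain singly-linked-list pop from the current CPU's magazine, done with
 * preemption disabled so the list and its counter stay consistent.  The
 * names below are hypothetical; only the pop-and-count pattern mirrors the
 * kernel code.
 */
#include <stddef.h>
#include <stdio.h>

struct sketch_page {
        struct sketch_page *snext;      /* models vmp_snext */
        unsigned int        pnum;
};

/* Pop the head of a per-CPU free list; NULL when the magazine is empty. */
static struct sketch_page *
magazine_pop(struct sketch_page **head, unsigned long *count)
{
        struct sketch_page *mem = *head;

        if (mem != NULL) {
                *head = mem->snext;
                mem->snext = NULL;
                (*count)--;             /* the kernel uses a per-CPU scalable counter */
        }
        return mem;
}

int
main(void)
{
        struct sketch_page pages[3] = {
                { &pages[1], 10 }, { &pages[2], 11 }, { NULL, 12 },
        };
        struct sketch_page *head = &pages[0];
        unsigned long count = 3;
        struct sketch_page *mem;

        while ((mem = magazine_pop(&head, &count)) != NULL) {
                printf("grabbed pnum %u, %lu left\n", mem->pnum, count);
        }
        return 0;
}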
+ */ +static vm_page_t +vm_page_grab_from_cpu(vm_page_t *cpu_list, scalable_counter_t *counter) { - task_t task = current_task_early(); - if (task == NULL) { - return; - } + vm_page_t mem = _vm_page_list_pop(cpu_list); - counter_inc(&task->pages_grabbed); + if (mem != VM_PAGE_NULL) { +#if HIBERNATION + if (hibernate_rebuild_needed) { + panic("should not modify cpu->free_pages while hibernating"); + } +#endif /* HIBERNATION */ + counter_dec_preemption_disabled(counter); + } + return mem; } -/* - * vm_page_release: + +/*! + * @brief + * Attempts to allocate pages from free queues, and to populate the per-cpu + * queue as a side effect. * - * Return a page to the free list. + * @discussion + * This function will take the properties of the allocating thread into account + * to decide how many pages it can allocate. + * + * If the free queues are depleted, then it will return VM_PAGE_NULL. */ - -void -vm_page_release(vm_page_t mem, boolean_t page_queues_locked) +__attribute__((noinline)) +static vm_page_t +vm_page_grab_slow(vm_grab_options_t grab_options) { - vmp_free_list_result_t vmpr; - - if (page_queues_locked) { - LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - } else { - LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); - } - - assert(vm_page_is_canonical(mem)); - - vm_page_validate_no_references(mem); - - if (__improbable(mem->vmp_realtime)) { - if (!page_queues_locked) { - vm_page_lock_queues(); - } - if (mem->vmp_realtime) { - mem->vmp_realtime = false; - vm_page_realtime_count--; - } - if (!page_queues_locked) { - vm_page_unlock_queues(); - } - } - - pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); + unsigned int target = vm_free_magazine_refill_limit; + vm_memory_class_t class = VM_MEMORY_CLASS_REGULAR; + vm_page_t mem = VM_PAGE_NULL; + vm_page_list_t list = { }; + vm_page_t *cpu_list = NULL; + scalable_counter_t *counter = NULL; vm_free_page_lock_spin(); - - assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); - assert(mem->vmp_busy); - assert(!mem->vmp_laundry); - assert(mem->vmp_object == 0); - assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0); - assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0); - assert(mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0); - - vmpr = vm_page_put_list_on_free_queue(mem, page_queues_locked); - vm_pageout_vminfo.vm_page_pages_freed += 1; - VM_DEBUG_CONSTANT_EVENT(vm_page_release, DBG_VM_PAGE_RELEASE, - DBG_FUNC_NONE, 1, 0, 0, 0); - - if (vm_page_free_has_any_waiters()) { - vm_page_free_handle_wakeups_and_unlock(vmpr); - } else { +#if LCK_MTX_USE_ARCH + /* Intel does't disable preemption with vm_free_page_lock_spin() */ + disable_preemption(); +#endif /* LCK_MTX_USE_ARCH */ + cpu_list = PERCPU_GET(free_pages); + counter = &vm_cpu_free_count; + { + mem = vm_page_grab_from_cpu(cpu_list, counter); + } + if (mem != VM_PAGE_NULL) { +#if LCK_MTX_USE_ARCH + enable_preemption(); +#endif /* LCK_MTX_USE_ARCH */ vm_free_page_unlock(); + return mem; } + if (vm_page_free_count <= vm_page_free_reserved) { + if ((current_thread()->options & TH_OPT_VMPRIV) == 0) { + target = 0; + } else if (vm_page_free_count == 0) { + target = 0; + } else { + target = 1; + } + } else { + target = MIN(target, vm_page_free_count - vm_page_free_reserved); + } + +#if HIBERNATION + if (target > 0 && hibernate_rebuild_needed) { + panic("should not modify CPU free_pages while hibernating"); + } +#endif /* HIBERNATION */ + + /* + * Convert the lock hold into a mutex, to signal to waiters that the + * lock may be held for longer. 
+ */ +#if !LCK_MTX_USE_ARCH + disable_preemption(); +#endif /* !LCK_MTX_USE_ARCH */ + vm_free_page_lock_convert(); + + if (target != 0) { + list = vm_page_free_queue_grab(grab_options, class, target, + VM_PAGE_ON_FREE_LOCAL_Q); + } + +#if VM_PAGE_WIRE_COUNT_WARNING + if (vm_page_wire_count >= VM_PAGE_WIRE_COUNT_WARNING) { + printf("mk: vm_page_grab(): high wired page count of %d\n", + vm_page_wire_count); + } +#endif +#if VM_PAGE_GOBBLE_COUNT_WARNING + if (vm_page_gobble_count >= VM_PAGE_GOBBLE_COUNT_WARNING) { + printf("mk: vm_page_grab(): high gobbled page count of %d\n", + vm_page_gobble_count); + } +#endif + + if (vm_page_free_count < vm_page_free_min && !vm_pageout_running) { + thread_wakeup(&vm_page_free_wanted); + } + + vm_free_page_unlock(); + VM_CHECK_MEMORYSTATUS; + + if (list.vmpl_head) { + /* Steal a page off the list for the caller. */ + mem = vm_page_list_pop(&list); + + /* Add the remaining pages to the CPU's free list. */ + assert(*cpu_list == VM_PAGE_NULL); + *cpu_list = list.vmpl_head; + counter_add_preemption_disabled(counter, list.vmpl_count); + } + + enable_preemption(); + + return mem; } -/* - * This version of vm_page_release() is used only at startup - * when we are single-threaded and pages are being released - * for the first time. Hence, no locking or unnecessary checks are made. - * Note: VM_CHECK_MEMORYSTATUS invoked by the caller. - */ -void -vm_page_release_startup(vm_page_t mem) +vm_page_t +vm_page_grab_options(vm_grab_options_t options) { - vm_page_put_list_on_free_queue(mem, false); + vm_page_t mem; + +restart: + + /* + * Step 1: look at the CPU magazines. + */ + + disable_preemption(); + mem = vm_page_grab_from_cpu(PERCPU_GET(free_pages), &vm_cpu_free_count); + enable_preemption(); + + if (mem != VM_PAGE_NULL) { + return vm_page_grab_finalize(options, mem); + } + +#if XNU_VM_HAS_DELAYED_PAGES + /* + * If free count is low and we have delayed pages from early boot, + * get one of those instead. + */ + if (__improbable(vm_delayed_count > 0 && + vm_page_free_count <= vm_page_free_target)) { + mem = vm_get_delayed_page(options); + if (mem != VM_PAGE_NULL) { + return vm_page_grab_finalize(options, mem); + } + } +#endif /* XNU_VM_HAS_DELAYED_PAGES */ + + + /* + * Step 2: Try to promote pages from the free queues, + * or the secluded queue if appropriate. + */ + + mem = vm_page_grab_slow(options); + if (mem != VM_PAGE_NULL) { + return vm_page_grab_finalize(options, mem); + } + +#if CONFIG_SECLUDED_MEMORY + mem = vm_page_grab_secluded(options); + if (mem != VM_PAGE_NULL) { + return vm_page_grab_finalize(options, mem); + } +#endif /* CONFIG_SECLUDED_MEMORY */ + + + /* + * Step 3: Privileged threads block and retry, others fail. + */ + + if ((options & VM_PAGE_GRAB_NOPAGEWAIT) == 0 && + (current_thread()->options & TH_OPT_VMPRIV) != 0) { + VM_PAGE_WAIT(); + goto restart; + } + + return VM_PAGE_NULL; } +vm_grab_options_t +vm_page_grab_options_for_object(vm_object_t object __unused) +{ + vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE; + +#if CONFIG_SECLUDED_MEMORY + if (object->can_grab_secluded) { + options |= VM_PAGE_GRAB_SECLUDED; + } +#endif /* CONFIG_SECLUDED_MEMORY */ + + return options; +} + +/*! + * @function vm_page_free_queue_steal() + * + * @abstract + * Steal a given page from the free queues. + * + * @discussion + * The given page must be in the given free queue, or state may be corrupted. + * + * Internally, the free queue is not synchronized, so any locking must be done + * outside of this function. 
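
Taken together, vm_page_grab_options() is a three-tier allocator: per-CPU magazine first, then the global slow path, then (for callers allowed to wait) block and retry. Below is a compilable user-space model of that control flow; the tier functions are stubs standing in for the real paths, not kernel APIs.

#include <stdbool.h>
#include <stddef.h>

struct page_stub { struct page_stub *next; };

/* Stubs standing in for the real tiers. */
static struct page_stub *tier_cpu_magazine(void) { return NULL; }
static struct page_stub *tier_global_slow(void)  { return NULL; }
static void              tier_wait_for_pages(void) { /* VM_PAGE_WAIT() analogue */ }

static struct page_stub *
page_grab(bool can_wait)
{
    for (;;) {
        struct page_stub *p = tier_cpu_magazine();

        if (p == NULL) {
            p = tier_global_slow();
        }
        if (p != NULL || !can_wait) {
            return p;
        }
        tier_wait_for_pages();  /* block, then restart from the cheap tier */
    }
}

Restarting from the top after a wait matters: pages freed while the caller slept most likely landed in a magazine or on the free queues, so the cheap tiers are worth retrying first.
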
+ * + * This function, like vm_page_grab(), takes care of waking up + * page out scan as needed. + */ +static void +vm_page_free_queue_steal(vm_grab_options_t options, vm_page_t mem) +{ + ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem); + vm_memory_class_t class = vm_page_get_memory_class(mem, pnum); + + assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q); + assert(!mem->vmp_lopage && mem->vmp_busy); + + vm_page_free_queue_remove(class, mem, pnum, VM_PAGE_NOT_ON_Q); + vm_page_grab_finalize(options, mem); + + if (vm_page_free_count < vm_page_free_min && !vm_pageout_running) { + thread_wakeup(&vm_page_free_wanted); + } +} + + /* * vm_page_wait: * @@ -4579,8 +4853,7 @@ vm_page_release_startup(vm_page_t mem) */ boolean_t -vm_page_wait( - int interruptible ) +vm_page_wait(int interruptible) { /* * We can't use vm_page_free_reserved to make this @@ -4589,32 +4862,28 @@ vm_page_wait( * succeeds, the second fails. After the first page is freed, * a call to vm_page_wait must really block. */ - kern_return_t wait_result; - int need_wakeup = 0; - thread_t cur_thread = current_thread(); - int is_privileged = cur_thread->options & TH_OPT_VMPRIV; - event_t wait_event = NULL; - event_t wake_event = (event_t)&vm_page_free_wanted; + kern_return_t wait_result = THREAD_NOT_WAITING; + thread_t cur_thread = current_thread(); + bool is_privileged = cur_thread->options & TH_OPT_VMPRIV; + bool need_wakeup = false; + event_t wait_event = NULL; vm_free_page_lock_spin(); - { - if (is_privileged && vm_page_free_count) { - vm_free_page_unlock(); - return TRUE; - } - - if (vm_page_free_count >= vm_page_free_target) { - vm_free_page_unlock(); - return TRUE; - } - } - if (is_privileged) { - if (vm_page_free_wanted_privileged++ == 0) { - need_wakeup = 1; + if (vm_page_free_count) { + vm_free_page_unlock(); + goto out; } + + if (vm_page_free_wanted_privileged++ == 0) { + need_wakeup = true; + } + wait_event = (event_t)&vm_page_free_wanted_privileged; + } else if (vm_page_free_count >= vm_page_free_target) { + vm_free_page_unlock(); + goto out; #if CONFIG_SECLUDED_MEMORY } else if (secluded_for_apps && task_can_use_secluded_mem(current_task(), FALSE)) { @@ -4624,21 +4893,27 @@ vm_page_wait( /* XXX FBDP: hopefully not for too long... */ if (vm_page_secluded_count > 0) { vm_free_page_unlock(); - return TRUE; + goto out; } #endif if (vm_page_free_wanted_secluded++ == 0) { - need_wakeup = 1; + need_wakeup = true; } + wait_event = (event_t)&vm_page_free_wanted_secluded; #endif /* CONFIG_SECLUDED_MEMORY */ } else { if (vm_page_free_wanted++ == 0) { - need_wakeup = 1; + need_wakeup = true; } + wait_event = (event_t)&vm_page_free_count; } + if (vm_pageout_running) { + need_wakeup = false; + } + /* * We don't do a vm_pageout_scan wakeup if we already have * some waiters because vm_pageout_scan checks for waiters @@ -4654,9 +4929,8 @@ vm_page_wait( * context switch. Could be a perf. issue. 
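
The waiter bookkeeping in vm_page_wait() follows a common pattern: only the first waiter of a class pokes the pageout worker, and nobody pokes it if it is already running. A small sketch of that decision, with invented names:

#include <stdbool.h>

struct waiter_state {
    unsigned int waiters_privileged;
    unsigned int waiters_normal;
    bool         reclaimer_running;
};

/*
 * Register the caller as a waiter.  Returns true when the caller should
 * wake the reclaim worker: it is the first waiter of its class and the
 * worker is not already running (the worker re-checks waiter counts itself).
 */
static bool
register_waiter(struct waiter_state *ws, bool privileged)
{
    bool first;

    if (privileged) {
        first = (ws->waiters_privileged++ == 0);
    } else {
        first = (ws->waiters_normal++ == 0);
    }
    return first && !ws->reclaimer_running;
}

Counting waiters per class also lets the worker wake the right event (privileged, secluded, or normal) once pages become available.
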
*/ - if (need_wakeup) { - thread_wakeup(wake_event); + thread_wakeup((event_t)&vm_page_free_wanted); } /* @@ -4685,74 +4959,34 @@ vm_page_wait( vm_free_page_unlock(); if (need_wakeup) { - thread_wakeup(wake_event); + thread_wakeup((event_t)&vm_page_free_wanted); } - if (wait_result == THREAD_WAITING) { - { - VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, - DBG_VM_PAGE_WAIT_BLOCK, - DBG_FUNC_START, - vm_page_free_wanted_privileged, - vm_page_free_wanted, + if (wait_result != THREAD_WAITING) { + goto out; + } + + + VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, + DBG_VM_PAGE_WAIT_BLOCK, + DBG_FUNC_START, + vm_page_free_wanted_privileged, + vm_page_free_wanted, #if CONFIG_SECLUDED_MEMORY - vm_page_free_wanted_secluded, + vm_page_free_wanted_secluded, #else /* CONFIG_SECLUDED_MEMORY */ - 0, + 0, #endif /* CONFIG_SECLUDED_MEMORY */ - 0 - ); - } - - wait_result = thread_block(THREAD_CONTINUE_NULL); - - { - VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, - DBG_VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0); - } - } + 0); + wait_result = thread_block(THREAD_CONTINUE_NULL); + VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, + DBG_VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0); } - +out: return (wait_result == THREAD_AWAKENED) || (wait_result == THREAD_NOT_WAITING); } -/* - * vm_page_alloc: - * - * Allocate and return a memory cell associated - * with this VM object/offset pair. - * - * Object must be locked. - */ - -vm_page_t -vm_page_alloc( - vm_object_t object, - vm_object_offset_t offset) -{ - vm_page_t mem; - int grab_options; - - vm_object_lock_assert_exclusive(object); - grab_options = 0; -#if CONFIG_SECLUDED_MEMORY - if (object->can_grab_secluded) { - grab_options |= VM_PAGE_GRAB_SECLUDED; - } -#endif /* CONFIG_SECLUDED_MEMORY */ - - - mem = vm_page_grab_options(grab_options); - if (mem == VM_PAGE_NULL) { - return VM_PAGE_NULL; - } - - vm_page_insert(mem, object, offset); - - return mem; -} - /* * vm_page_free_prepare: * @@ -4765,12 +4999,14 @@ static void vm_page_free_prepare( vm_page_t mem) { + vm_page_free_prepare_queues(mem); + vm_page_free_prepare_object(mem, TRUE); #if CONFIG_SPTM /** - * SPTM TODO: The pmap should retype frames automatically as mappings to them are - * created and destroyed. In order to catch potential cases where this - * does not happen, add an appropriate assert here. This code should be - * executed on every frame that is about to be released to the VM. + * The pmap should retype frames as necessary when pmap_recycle_page() + * is called. In order to catch potential cases where this does not + * happen, add an appropriate assert here. This code should be + * executed on every frame that is about to be released to the VM. 
*/ const sptm_paddr_t paddr = ((uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)) << PAGE_SHIFT; __unused const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr); @@ -4778,8 +5014,6 @@ vm_page_free_prepare( assert(frame_type == XNU_DEFAULT); #endif /* CONFIG_SPTM */ - vm_page_free_prepare_queues(mem); - vm_page_free_prepare_object(mem, TRUE); } @@ -4812,9 +5046,9 @@ vm_page_free_prepare_queues( vm_page_queues_remove(mem, TRUE); - if (__improbable(mem->vmp_realtime)) { + if (mem->vmp_realtime) { mem->vmp_realtime = false; - vm_page_realtime_count--; + VM_COUNTER_DEC(&vm_page_realtime_count); } if (VM_PAGE_WIRED(mem)) { @@ -4887,6 +5121,7 @@ vm_page_free_prepare_queues( mem->vmp_q_state = VM_PAGE_NOT_ON_Q; + mem->vmp_iopl_wired = false; mem->vmp_wire_count = 0; assert(!mem->vmp_gobbled); } else if (mem->vmp_gobbled) { @@ -4901,27 +5136,28 @@ vm_page_free_prepare_queues( * like vm_page_init, but we have to preserve fields related to phys page */ inline static void -vm_page_reset(vm_page_t mem) +vm_page_reset_canonical(vm_page_t mem) { *mem = (struct vm_page){ - .vmp_q_state = VM_PAGE_NOT_ON_Q, - .vmp_canonical = mem->vmp_canonical, - .vmp_lopage = mem->vmp_lopage, - .vmp_offset = (vm_object_offset_t)-1, - .vmp_busy = true, + .vmp_offset = (vm_object_offset_t)-1, + .vmp_q_state = VM_PAGE_NOT_ON_Q, + .vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY, +#if XNU_VM_HAS_LOPAGE + .vmp_lopage = mem->vmp_lopage, +#endif /* XNU_VM_HAS_LOPAGE */ + .vmp_canonical = true, + .vmp_busy = true, + .vmp_realtime = mem->vmp_realtime, #if !XNU_VM_HAS_LINEAR_PAGES_ARRAY - .vmp_phys_page = mem->vmp_phys_page, + .vmp_phys_page = mem->vmp_phys_page, #endif /* !XNU_VM_HAS_LINEAR_PAGES_ARRAY */ }; /* ECC information is out of `struct vm_page` and preserved */ } void -vm_page_free_prepare_object( - vm_page_t mem, - boolean_t remove_from_hash) +vm_page_free_prepare_object(vm_page_t mem, boolean_t remove_from_hash) { - assert(!mem->vmp_realtime); if (mem->vmp_tabled) { vm_page_remove(mem, remove_from_hash); /* clears tabled, object, offset */ } @@ -4931,20 +5167,60 @@ vm_page_free_prepare_object( vm_page_reset_private(mem); } if (vm_page_is_canonical(mem)) { - assert(mem->vmp_pageq.next == 0); - assert(mem->vmp_pageq.prev == 0); - assert(mem->vmp_listq.next == 0); - assert(mem->vmp_listq.prev == 0); - assert(mem->vmp_specialq.next == 0); - assert(mem->vmp_specialq.prev == 0); - assert(mem->vmp_next_m == 0); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0 && + mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0 && + mem->vmp_specialq.next == 0 && mem->vmp_specialq.prev == 0 && + mem->vmp_next_m == 0); - vm_page_validate_no_references(mem); + pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem)); - vm_page_reset(mem); + vm_page_reset_canonical(mem); } } +/* + * vm_page_release: + * + * Return a page to the free list. + * + * Keep in sync with vm_page_free_list(). 
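
vm_page_reset_canonical() resets a page by assigning a compound literal, carrying over only the fields that describe the physical frame itself. The same technique in a stand-alone example; the struct and field names here are invented and are not struct vm_page.

#include <stdbool.h>
#include <stdint.h>

struct frame {
    uint64_t phys_addr;   /* identity: must survive a reset        */
    bool     low_memory;  /* placement attribute: must survive too */
    bool     busy;
    int      queue_state;
    void    *owner;
};

static void
frame_reset(struct frame *f)
{
    *f = (struct frame){
        .phys_addr  = f->phys_addr,
        .low_memory = f->low_memory,
        .busy       = true,          /* pages come back busy, like vmp_busy */
    };
    /* .queue_state and .owner are implicitly zero-initialized */
}

The right-hand side is evaluated before the store, so reading the old field values inside the literal is well defined, and every field not named is zeroed.
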
+ */ + +void +vm_page_release(vm_page_t mem, vmp_release_options_t options) +{ + if (options & VMP_RELEASE_Q_LOCKED) { + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + } else { + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); + } + + assert(vm_page_is_canonical(mem)); + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); + + if ((options & VMP_RELEASE_SKIP_FREE_CHECK) == 0) { + pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem)); + } + + pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); + + + vm_page_free_queue_enter_list(vm_page_list_for_page(mem), options); +} + +/* + * This version of vm_page_release() is used only at startup + * when we are single-threaded and pages are being released + * for the first time. Hence, no locking or unnecessary checks are made. + * Note: VM_CHECK_MEMORYSTATUS invoked by the caller. + */ +void +vm_page_release_startup(vm_page_t mem) +{ + vm_page_free_queue_enter_list(vm_page_list_for_page(mem), + VMP_RELEASE_STARTUP); +} + /* * vm_page_free: * @@ -4954,13 +5230,14 @@ vm_page_free_prepare_object( * Object and page queues must be locked prior to entry. */ void -vm_page_free( - vm_page_t mem) +vm_page_free(vm_page_t mem) { vm_page_free_prepare(mem); if (vm_page_is_canonical(mem)) { - vm_page_release(mem, TRUE); /* page queues are locked */ + /* page queues are locked */ + vm_page_release(mem, VMP_RELEASE_Q_LOCKED | + VMP_RELEASE_SKIP_FREE_CHECK); } else { vm_page_release_fictitious(mem); } @@ -4968,9 +5245,7 @@ vm_page_free( void -vm_page_free_unlocked( - vm_page_t mem, - boolean_t remove_from_hash) +vm_page_free_unlocked(vm_page_t mem, boolean_t remove_from_hash) { vm_page_lockspin_queues(); vm_page_free_prepare_queues(mem); @@ -4979,7 +5254,8 @@ vm_page_free_unlocked( vm_page_free_prepare_object(mem, remove_from_hash); if (vm_page_is_canonical(mem)) { - vm_page_release(mem, FALSE); /* page queues are not locked */ + /* page queues are not locked */ + vm_page_release(mem, VMP_RELEASE_SKIP_FREE_CHECK); } else { vm_page_release_fictitious(mem); } @@ -4994,98 +5270,56 @@ vm_page_free_unlocked( * * The VM page queues lock (vm_page_queue_lock) should NOT be held. * The VM page free queues lock (vm_page_queue_free_lock) should NOT be held. + * + * Keep in sync with vm_page_release(). 
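
vm_page_release() now takes an options bit-set instead of a bare boolean. A minimal sketch of the pattern, using made-up flag names rather than the real vmp_release_options_t values:

typedef unsigned int release_options_t;

#define RELEASE_NONE             0x0u
#define RELEASE_QUEUES_LOCKED    0x1u   /* caller already holds the queue lock   */
#define RELEASE_SKIP_FREE_CHECK  0x2u   /* page was just scrubbed; skip re-check */

static void
page_release(void *page, release_options_t options)
{
    (void)page;
    if ((options & RELEASE_QUEUES_LOCKED) == 0) {
        /* take the queue lock here */
    }
    if ((options & RELEASE_SKIP_FREE_CHECK) == 0) {
        /* verify the page has no leftover mappings */
    }
    /* ... enqueue on the free list ... */
}

Call sites then read as page_release(p, RELEASE_QUEUES_LOCKED) rather than page_release(p, TRUE), and new behaviors can be added without changing the signature.
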
*/ void -vm_page_free_list(vm_page_t freeq, boolean_t prepare_object) +vm_page_free_list(vm_page_t freeq, bool prepare_object) { - vm_page_t mem; - vm_page_t nxt; - vm_page_t local_freeq; - int pg_count; - LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); LCK_MTX_ASSERT(&vm_page_queue_free_lock, LCK_MTX_ASSERT_NOTOWNED); while (freeq) { - pg_count = 0; - local_freeq = VM_PAGE_NULL; - mem = freeq; + vm_page_list_t list = { }; + + while (list.vmpl_count < VMP_FREE_BATCH_SIZE && freeq) { + vm_page_t mem = _vm_page_list_pop(&freeq); - /* - * break up the processing into smaller chunks so - * that we can 'pipeline' the pages onto the - * free list w/o introducing too much - * contention on the global free queue lock - */ - while (mem && pg_count < 64) { assert((mem->vmp_q_state == VM_PAGE_NOT_ON_Q) || (mem->vmp_q_state == VM_PAGE_IS_WIRED)); - assert(mem->vmp_specialq.next == 0 && - mem->vmp_specialq.prev == 0); - /* - * && - * mem->vmp_on_specialq == VM_PAGE_SPECIAL_Q_EMPTY); - */ - nxt = mem->vmp_snext; - mem->vmp_snext = NULL; - assert(mem->vmp_pageq.prev == 0); - if (vm_page_is_canonical(mem)) { - vm_page_validate_no_references(mem); - } - - if (__improbable(mem->vmp_realtime)) { - vm_page_lock_queues(); - if (mem->vmp_realtime) { - mem->vmp_realtime = false; - vm_page_realtime_count--; - } - vm_page_unlock_queues(); - } - - if (prepare_object == TRUE) { + if (prepare_object) { vm_page_free_prepare_object(mem, TRUE); } if (vm_page_is_fictitious(mem)) { vm_page_release_fictitious(mem); - } else { - /* - * IMPORTANT: we can't set the page "free" here - * because that would make the page eligible for - * a physically-contiguous allocation (see - * vm_page_find_contiguous()) right away (we don't - * hold the vm_page_queue_free lock). That would - * cause trouble because the page is not actually - * in the free queue yet... - */ - pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); - - mem->vmp_snext = local_freeq; - local_freeq = mem; - pg_count++; + continue; } - mem = nxt; + + if (!prepare_object) { + /* vm_page_free_prepare_object() checked it */ + pmap_recycle_page(VM_PAGE_GET_PHYS_PAGE(mem)); + } + + pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); + + + /* + * IMPORTANT: we can't set the page "free" here + * because that would make the page eligible for + * a physically-contiguous allocation (see + * vm_page_find_contiguous()) right away (we don't + * hold the vm_page_queue_free lock). That would + * cause trouble because the page is not actually + * in the free queue yet... 
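
The rewritten vm_page_free_list() processes the chain in fixed-size batches (the removed code used runs of 64 pages) so the global free-queue lock is taken once per batch rather than held across the whole run. The pattern in isolation, with stubbed-out locking and invented types:

#include <stddef.h>

struct item { struct item *next; };

enum { BATCH_SIZE = 64 };

/* Stubs for the globally locked enqueue. */
static void global_lock(void) { }
static void global_unlock(void) { }
static void global_enqueue(struct item *head, unsigned int count)
{ (void)head; (void)count; }

static void
free_chain(struct item *chain)
{
    while (chain != NULL) {
        struct item *batch = NULL;
        unsigned int count = 0;

        while (chain != NULL && count < BATCH_SIZE) {
            struct item *it = chain;

            chain = it->next;
            it->next = batch;      /* push onto the local batch */
            batch = it;
            count++;
        }
        global_lock();
        global_enqueue(batch, count);
        global_unlock();
    }
}

Batching keeps lock hold times bounded while still amortizing each lock acquisition over many pages.
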
+ */ + + vm_page_list_push(&list, mem); } - freeq = mem; - if ((mem = local_freeq)) { - vmp_free_list_result_t vmpr; - - vm_free_page_lock_spin(); - - vmpr = vm_page_put_list_on_free_queue(mem, false); - vm_pageout_vminfo.vm_page_pages_freed += pg_count; - VM_DEBUG_CONSTANT_EVENT(vm_page_release, DBG_VM_PAGE_RELEASE, - DBG_FUNC_NONE, pg_count, 0, 0, 0); - - if (vm_page_free_has_any_waiters()) { - vm_page_free_handle_wakeups_and_unlock(vmpr); - } else { - vm_free_page_unlock(); - } - - VM_CHECK_MEMORYSTATUS; + if (list.vmpl_count) { + vm_page_free_queue_enter_list(list, VMP_RELEASE_NONE); } } } @@ -5284,6 +5518,7 @@ vm_page_unwire( boolean_t do_footprint; mem->vmp_q_state = VM_PAGE_NOT_ON_Q; + mem->vmp_iopl_wired = false; VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object); VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem); @@ -6321,13 +6556,15 @@ vm_page_verify_free_list( if (other_color == color) { continue; } - vm_page_verify_free_list(&vm_page_queue_free[other_color].qhead, + vm_page_verify_free_list(&vm_page_queue_free.vmpfq_queues[other_color].qhead, other_color, look_for_page, FALSE); } +#if XNU_VM_HAS_LOPAGE if (color == (unsigned int) -1) { vm_page_verify_free_list(&vm_lopage_queue_free, (unsigned int) -1, look_for_page, FALSE); } +#endif /* XNU_VM_HAS_LOPAGE */ panic("vm_page_verify_free_list(color=%u)", color); } if (!expect_page && found_page) { @@ -6350,6 +6587,7 @@ vm_page_verify_free_lists( void ) } npages = 0; + nlopages = 0; vm_free_page_lock(); @@ -6367,12 +6605,14 @@ vm_page_verify_free_lists( void ) } for (color = 0; color < vm_colors; color++) { - npages += vm_page_verify_free_list(&vm_page_queue_free[color].qhead, + npages += vm_page_verify_free_list(&vm_page_queue_free.vmpfq_queues[color].qhead, color, VM_PAGE_NULL, FALSE); } +#if XNU_VM_HAS_LOPAGE nlopages = vm_page_verify_free_list(&vm_lopage_queue_free, (unsigned int) -1, VM_PAGE_NULL, FALSE); +#endif /* XNU_VM_HAS_LOPAGE */ if (npages != vm_page_free_count || nlopages != vm_lopage_free_count) { panic("vm_page_verify_free_lists: " "npages %u free_count %d nlopages %u lo_free_count %u", @@ -6414,7 +6654,9 @@ vm_page_is_relocatable(vm_page_t m, vm_relocate_reason_t reloc_reason) return FALSE; } else if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || (m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) || +#if XNU_VM_HAS_LOPAGE (m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) || +#endif /* XNU_VM_HAS_LOPAGE */ (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) { /* * Page needs to be on one of our queues (other then the pageout or special @@ -6470,6 +6712,12 @@ vm_page_is_relocatable(vm_page_t m, vm_relocate_reason_t reloc_reason) * vm_object_iopl_wire_full(). * * The VM page queues lock must also be held. + * + * @returns + * - KERN_SUCCESS if the relocation was successful. + * - KERN_INVALID_OBJECT if @c m1's object is VM_OBJECT_NULL. + * - KERN_FAILURE if the reolcation failed due to @c m1's state. + * - KERN_RESOURCE_SHORTAGE if no page could be allocated to relocate @c m1. */ kern_return_t vm_page_relocate( @@ -6495,7 +6743,7 @@ vm_page_relocate( } if (object == VM_OBJECT_NULL) { - return KERN_FAILURE; + return KERN_INVALID_OBJECT; } vm_object_lock_assert_held(object); @@ -6536,9 +6784,9 @@ vm_page_relocate( } if ((m1->vmp_pmapped && !reusable) || m1->vmp_dirty || m1->vmp_precious) { + vm_grab_options_t grab_options = VM_PAGE_GRAB_Q_LOCK_HELD; vm_object_offset_t offset; int copy_page_options = 0; - int grab_options = VM_PAGE_GRAB_Q_LOCK_HELD; /* page is not reusable, we need to allocate a new page * and move its contents there. 
@@ -6685,8 +6933,6 @@ vm_page_relocate( /* unset the busy flag (pages on the free queue are busy) and notify if wanted */ vm_page_wakeup_done(object, m2); - - return KERN_SUCCESS; } else { assert(m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); @@ -6699,11 +6945,42 @@ vm_page_relocate( */ vm_page_free_prepare(m1); - /* we're done here */ - return KERN_SUCCESS; + if (new_page) { + vm_page_t m2; + vm_object_offset_t offset; + vm_grab_options_t grab_options = VM_PAGE_GRAB_Q_LOCK_HELD; + + /* The caller still wanted a page, so let's give them a new one. */ + offset = m1->vmp_offset; + m2 = vm_page_grab_options(grab_options); + + if (m2 == VM_PAGE_NULL) { + return KERN_RESOURCE_SHORTAGE; + } + + /* + * make sure we clear the ref/mod state + * from the pmap layer... else we risk + * inheriting state from the last time + * this page was used... + */ + pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2), + VM_MEM_MODIFIED | VM_MEM_REFERENCED); + + offset = m1->vmp_offset; + + /* + * now put the substitute page on the object + */ + vm_page_insert_internal(m2, object, offset, VM_KERN_MEMORY_NONE, TRUE, + TRUE, FALSE, FALSE, NULL); + + *new_page = m2; + } } - return KERN_FAILURE; + /* we're done here */ + return KERN_SUCCESS; } /* @@ -6780,7 +7057,7 @@ vm_page_find_contiguous( boolean_t wire, int flags) { - vm_page_t m = NULL; + vm_page_list_t list = { }; ppnum_t prevcontaddr = 0; ppnum_t start_pnum = 0; unsigned int npages = 0, considered = 0, scanned = 0; @@ -6848,6 +7125,8 @@ full_scan_again: for (page_idx = last_idx, start_idx = last_idx; npages < contig_pages && page_idx < vm_pages_count; page_idx++) { + vm_page_t m = NULL; + retry: if (wrapped && npages == 0 && @@ -6969,8 +7248,6 @@ did_consider: considered++; } /* main for-loop end */ - m = VM_PAGE_NULL; - if (npages != contig_pages) { if (!wrapped) { /* @@ -6995,7 +7272,7 @@ did_consider: unsigned int cur_idx; unsigned int tmp_start_idx; vm_object_t locked_object = VM_OBJECT_NULL; - boolean_t abort_run = FALSE; + bool abort_run = false; assert(page_idx - start_idx == contig_pages); @@ -7011,34 +7288,15 @@ did_consider: * free pages in this run and return them to the free list */ while (start_idx < page_idx) { + vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE; + m1 = vm_page_get(start_idx++); #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL assert(m1->vmp_q_state == VM_PAGE_ON_FREE_Q); #endif - if (m1->vmp_q_state == VM_PAGE_ON_FREE_Q) { -#if MACH_ASSERT - unsigned int color = VM_PAGE_GET_COLOR(m1); - vm_memory_class_t memory_class = vm_page_get_memory_class(m1); - - if (memory_class == VM_MEMORY_CLASS_REGULAR) { - vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, m1, TRUE); - } -#endif - vm_page_steal_free_page(m1, VM_REMOVE_REASON_USE); -#if MACH_ASSERT - if (memory_class == VM_MEMORY_CLASS_REGULAR) { - vm_page_verify_free_list(&vm_page_queue_free[color].qhead, color, VM_PAGE_NULL, FALSE); - } -#endif - /* - * Clear the "free" bit so that this page - * does not get considered for another - * concurrent physically-contiguous allocation. 
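
At its core, vm_page_find_contiguous() is a scan for a run of consecutive physical page numbers whose pages can all be taken. A simplified, stand-alone version of that scan over a sorted array of frame descriptors; the types are invented, and the real code additionally handles wrapping, relocation, and locking.

#include <stdbool.h>
#include <stddef.h>

typedef unsigned long ppnum_t;

struct frame {
    ppnum_t pnum;   /* physical page number             */
    bool    free;   /* on a free queue, eligible to take */
};

/*
 * Return the index of the first frame of a run of `want` (> 0) frames that
 * are free and physically contiguous, or -1 if no such run exists.
 * frames[] is assumed to be sorted by pnum, the way vm_pages[] is laid out.
 */
static long
find_contig_run(const struct frame *frames, size_t nframes, size_t want)
{
    size_t run_start = 0, run_len = 0;

    for (size_t i = 0; i < nframes; i++) {
        bool extends = (run_len != 0) &&
            frames[i].pnum == frames[i - 1].pnum + 1;

        if (!frames[i].free) {
            run_len = 0;                  /* run broken by an unusable frame */
        } else if (run_len == 0 || !extends) {
            run_start = i;                /* start a new candidate run */
            run_len = 1;
        } else {
            run_len++;
        }
        if (run_len == want) {
            return (long)run_start;
        }
    }
    return -1;
}

The real scanner also tolerates pages that are merely relocatable rather than free, which is why it may steal and replace in-use pages along the run.
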
- */ - m1->vmp_q_state = VM_PAGE_NOT_ON_Q; - assert(m1->vmp_busy); + vm_page_free_queue_steal(options, m1); } } if (flags & KMA_LOMEM) { @@ -7080,7 +7338,7 @@ did_consider: /* * try to relocate/steal the page */ - if (abort_run == TRUE) { + if (abort_run) { continue; } @@ -7098,7 +7356,7 @@ did_consider: } else { /* object must be locked to relocate its pages */ tmp_start_idx = cur_idx; - abort_run = TRUE; + abort_run = true; continue; } } @@ -7110,7 +7368,7 @@ did_consider: locked_object = VM_OBJECT_NULL; } tmp_start_idx = cur_idx; - abort_run = TRUE; + abort_run = true; continue; } @@ -7127,8 +7385,7 @@ did_consider: vm_page_assign_special_state(m1, VM_PAGE_SPECIAL_Q_BG); } VM_PAGE_ZERO_PAGEQ_ENTRY(m1); - m1->vmp_snext = m; - m = m1; + vm_page_list_push(&list, m1); } if (locked_object) { @@ -7136,7 +7393,7 @@ did_consider: locked_object = VM_OBJECT_NULL; } - if (abort_run == TRUE) { + if (abort_run) { /* * want the index of the last * page in this run that was @@ -7147,11 +7404,11 @@ did_consider: page_idx = tmp_start_idx + 2; if (page_idx >= vm_pages_count) { if (wrapped) { - if (m != VM_PAGE_NULL) { + if (list.vmpl_count) { vm_page_unlock_queues(); - vm_page_free_list(m, FALSE); + vm_page_free_list(list.vmpl_head, FALSE); vm_page_lock_queues(); - m = VM_PAGE_NULL; + list = (vm_page_list_t){ }; } dumped_run++; goto done_scanning; @@ -7159,7 +7416,7 @@ did_consider: page_idx = last_idx = 0; wrapped = TRUE; } - abort_run = FALSE; + abort_run = false; /* * We didn't find a contiguous range but we didn't @@ -7176,11 +7433,11 @@ did_consider: last_idx = page_idx; - if (m != VM_PAGE_NULL) { + if (list.vmpl_count) { vm_page_unlock_queues(); - vm_page_free_list(m, FALSE); + vm_page_free_list(list.vmpl_head, FALSE); vm_page_lock_queues(); - m = VM_PAGE_NULL; + list = (vm_page_list_t){ }; } dumped_run++; @@ -7193,7 +7450,7 @@ did_consider: goto retry; } - for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) { + vm_page_list_foreach(m1, list) { assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q); assert(m1->vmp_wire_count == 0); @@ -7214,7 +7471,7 @@ did_consider: */ vm_page_wire_count += npages; - assert(vm_page_verify_contiguous(m, npages)); + assert(vm_page_verify_contiguous(list.vmpl_head, npages)); } done_scanning: PAGE_REPLACEMENT_ALLOWED(FALSE); @@ -7245,7 +7502,7 @@ done_scanning: #if MACH_ASSERT vm_page_verify_free_lists(); #endif - if (m == NULL && zone_gc_called < 2) { + if (list.vmpl_count == 0 && zone_gc_called < 2) { printf("%s(num=%d,low=%d): found %d pages at 0x%llx...scanned %d pages... yielded %d times... dumped run %d times... stole %d pages... stole %d compressed pages... wired count is %d\n", __func__, contig_pages, max_pnum, npages, (vm_object_offset_t)start_pnum << PAGE_SHIFT, scanned, yielded, dumped_run, stolen_pages, compressed_pages, vm_page_wire_count); @@ -7262,7 +7519,7 @@ done_scanning: goto full_scan_again; } - return m; + return list.vmpl_head; } /* @@ -7353,15 +7610,22 @@ unsigned int vm_max_delayed_work_limit = DEFAULT_DELAYED_WORK_LIMIT; * the operation names are modeled after the names of the routines that * need to be called in order to make the changes very obvious in the * original loop + * + * On certain configurations, this function may return failure if any of + * the pages in the run has a mapping state that doesn't allow the specified + * operation. In that case, it will still fully process the run of pages + * in order to avoid requiring the caller to partially undo the work done + * here. 
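
The new contract of vm_page_do_delayed_work(), namely processing the entire run even when one page fails and reporting the first failure at the end, is a general batch-processing pattern. A compact sketch:

typedef int (*batch_op_t)(void *elem);

/*
 * Apply `op` to every element even after one fails, so the caller never
 * has to figure out which prefix of the batch was processed.  The first
 * error is remembered and returned once the whole run has been handled.
 */
static int
process_batch(void *elems[], unsigned int count, batch_op_t op)
{
    int first_error = 0;

    for (unsigned int i = 0; i < count; i++) {
        int err = op(elems[i]);

        if (err != 0 && first_error == 0) {
            first_error = err;   /* keep going; report this one at the end */
        }
    }
    return first_error;
}

Finishing the run means the caller never has to work out which prefix of the batch was applied before undoing it.
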
*/ -void +kern_return_t vm_page_do_delayed_work( vm_object_t object, vm_tag_t tag, struct vm_page_delayed_work *dwp, int dw_count) { + kern_return_t kr = KERN_SUCCESS; int j; vm_page_t m; vm_page_t local_free_q = VM_PAGE_NULL; @@ -7419,6 +7683,31 @@ vm_page_do_delayed_work( #endif if (dwp->dw_mask & DW_vm_page_wire) { vm_page_wire(m, tag, FALSE); + if (dwp->dw_mask & DW_vm_page_iopl_wire) { +#if CONFIG_SPTM + /* + * The SPTM's security model prevents us from allowing writable I/O + * mappings of executable pages. We need to check that here, + * in the same place that we set vmp_iopl_wired, because this + * function may have transiently dropped the VM object lock + * before reaching this point, which means that frontloading + * this check in the caller may not work in all cases. + */ + if ((dwp->dw_mask & DW_vm_page_iopl_wire_write) && PMAP_PAGE_IS_USER_EXECUTABLE(m)) { + if (kr == KERN_SUCCESS) { + kr = KERN_PROTECTION_FAILURE; + vm_map_guard_exception(VM_PAGE_GET_PHYS_PAGE(m), kGUARD_EXC_SEC_IOPL_ON_EXEC_PAGE); + ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, + KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_IOPL_ON_EXEC_PAGE), + (uintptr_t)(VM_PAGE_GET_PHYS_PAGE(m))); + } + } else { + m->vmp_iopl_wired = true; + } +#else + m->vmp_iopl_wired = true; +#endif /* CONFIG_SPTM */ + } } else if (dwp->dw_mask & DW_vm_page_unwire) { boolean_t queueit; @@ -7508,6 +7797,8 @@ vm_page_do_delayed_work( } VM_CHECK_MEMORYSTATUS; + + return kr; } __abortlike @@ -7522,10 +7813,7 @@ __vm_page_alloc_list_failed_panic( } kern_return_t -vm_page_alloc_list( - vm_size_t page_count, - kma_flags_t flags, - vm_page_t *list) +vm_page_alloc_list(vm_size_t page_count, kma_flags_t flags, vm_page_t *list) { vm_page_t page_list = VM_PAGE_NULL; vm_page_t mem; @@ -7535,10 +7823,14 @@ vm_page_alloc_list( for (vm_size_t i = 0; i < page_count; i++) { for (;;) { + vm_grab_options_t options = VM_PAGE_GRAB_OPTIONS_NONE; + + if (flags & KMA_NOPAGEWAIT) { + options |= VM_PAGE_GRAB_NOPAGEWAIT; + } if (flags & KMA_LOMEM) { - mem = vm_page_grablo(); + mem = vm_page_grablo(options); } else { - uint_t options = VM_PAGE_GRAB_OPTIONS_NONE; mem = vm_page_grab_options(options); } @@ -7550,7 +7842,7 @@ vm_page_alloc_list( kr = KERN_RESOURCE_SHORTAGE; goto out; } - if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) { + if ((flags & KMA_LOMEM) && vm_lopage_needed) { kr = KERN_RESOURCE_SHORTAGE; goto out; } @@ -7584,8 +7876,9 @@ vm_page_alloc_list( out: task = current_task_early(); if (task != NULL) { - counter_add(&task->pages_grabbed_kern, page_grab_count); + ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, page_grab_count); } + counter_add(&vm_page_grab_count_kern, page_grab_count); if (kr == KERN_SUCCESS) { *list = page_list; @@ -7627,21 +7920,7 @@ vm_page_get_phys_page(vm_page_t page) #if HIBERNATION -static vm_page_t hibernate_gobble_queue; - -static int hibernate_drain_pageout_queue(struct vm_pageout_queue *); -static int hibernate_flush_dirty_pages(int); -static int hibernate_flush_queue(vm_page_queue_head_t *, int); - -void hibernate_flush_wait(void); -void hibernate_mark_in_progress(void); -void hibernate_clear_in_progress(void); - -void hibernate_free_range(int, int); -void hibernate_hash_insert_page(vm_page_t); -uint32_t hibernate_mark_as_unneeded(addr64_t, addr64_t, hibernate_page_list_t *, hibernate_page_list_t *); -uint32_t hibernate_teardown_vm_structs(hibernate_page_list_t *, hibernate_page_list_t *); -ppnum_t hibernate_lookup_paddr(unsigned int); +static uint32_t 
hibernate_teardown_vm_structs(hibernate_page_list_t *, hibernate_page_list_t *); struct hibernate_statistics { int hibernate_considered; @@ -8038,14 +8317,14 @@ hibernate_flush_dirty_pages(int pass) void -hibernate_reset_stats() +hibernate_reset_stats(void) { bzero(&hibernate_stats, sizeof(struct hibernate_statistics)); } int -hibernate_flush_memory() +hibernate_flush_memory(void) { int retval; @@ -8128,26 +8407,6 @@ hibernate_page_list_zero(hibernate_page_list_t *list) } } -void -hibernate_free_gobble_pages(void) -{ - vm_page_t m, next; - uint32_t count = 0; - - m = (vm_page_t) hibernate_gobble_queue; - while (m) { - next = m->vmp_snext; - vm_page_free(m); - count++; - m = next; - } - hibernate_gobble_queue = VM_PAGE_NULL; - - if (count) { - HIBLOG("Freed %d pages\n", count); - } -} - static boolean_t hibernate_consider_discard(vm_page_t m, boolean_t preflight) { @@ -8387,10 +8646,10 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, uint64_t start, end, nsec; vm_page_t m; vm_page_t next; - uint32_t pages = page_list->page_count; + __block uint32_t pages = page_list->page_count; + __block uint32_t count_wire = pages; uint32_t count_anonymous = 0, count_throttled = 0, count_compressor = 0; uint32_t count_inactive = 0, count_active = 0, count_speculative = 0, count_cleaned = 0; - uint32_t count_wire = pages; uint32_t count_discard_active = 0; uint32_t count_discard_inactive = 0; uint32_t count_retired = 0; @@ -8398,7 +8657,6 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, uint32_t count_discard_purgeable = 0; uint32_t count_discard_speculative = 0; uint32_t count_discard_vm_struct_pages = 0; - uint32_t i; uint32_t bank; hibernate_bitmap_t * bitmap; hibernate_bitmap_t * bitmap_wired; @@ -8458,61 +8716,57 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernation_vmqueues_inspection = TRUE; - m = (vm_page_t) hibernate_gobble_queue; - while (m) { + __auto_type hib_free_boilerplate = ^(vm_page_t page) { + assert((page->vmp_q_state == VM_PAGE_ON_FREE_Q) || +#if XNU_VM_HAS_LOPAGE + (page->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) || +#endif /* XNU_VM_HAS_LOPAGE */ + (page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q)); + pages--; count_wire--; + if (!preflight) { - hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); + hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(page)); + hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(page)); + + hibernate_stats.cd_total_free++; + + if (page->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) { + hibernate_stats.cd_local_free++; + } } - m = m->vmp_snext; - } + }; if (!preflight) { percpu_foreach(free_pages_head, free_pages) { - for (m = *free_pages_head; m; m = m->vmp_snext) { + _vm_page_list_foreach(m, *free_pages_head) { assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q); - - pages--; - count_wire--; - hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - - hibernate_stats.cd_local_free++; - hibernate_stats.cd_total_free++; + hib_free_boilerplate(m); } } } +#if CONFIG_SPTM + if (vm_pages_free_masks()) { + uint32_t bits = vm_pages_free_mask_len() * MAX_COLORS; + bitmap_t *map = vm_pages_free_masks_as_bitmap(0); - for (i = 0; i < vm_colors; i++) { - vm_page_queue_iterate(&vm_page_queue_free[i].qhead, m, vmp_pageq) { - assert(m->vmp_q_state == VM_PAGE_ON_FREE_Q); + for (int bit = bitmap_first(map, bits); + bit >= 0; bit = 
bitmap_next(map, bit)) { + ppnum_t pnum = pmap_first_pnum + bit; + vm_page_t mem = vm_page_find_canonical(pnum); - pages--; - count_wire--; - if (!preflight) { - hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - - hibernate_stats.cd_total_free++; - } - } - } - - vm_page_queue_iterate(&vm_lopage_queue_free, m, vmp_pageq) { - assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q); - - pages--; - count_wire--; - if (!preflight) { - hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - - hibernate_stats.cd_total_free++; + hib_free_boilerplate(mem); } + } else +#endif /* CONFIG_SPTM */ + { + vm_page_free_queue_foreach(&vm_page_queue_free, hib_free_boilerplate); } +#if XNU_VM_HAS_LOPAGE + vm_page_free_queue_foreach(&vm_lopage_queue_free, hib_free_boilerplate); +#endif /* XNU_VM_HAS_LOPAGE */ m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled); while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m)) { @@ -8719,7 +8973,7 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, } /* XXX FBDP TODO: secluded queue */ - for (i = 0; i <= vm_page_max_speculative_age_q; i++) { + for (uint32_t i = 0; i <= vm_page_max_speculative_age_q; i++) { m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q); while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) { assertf(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q, @@ -8780,7 +9034,7 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, bitmap = &page_list->bank_bitmap[0]; bitmap_wired = &page_list_wired->bank_bitmap[0]; for (bank = 0; bank < page_list->bank_count; bank++) { - for (i = 0; i < bitmap->bitmapwords; i++) { + for (uint32_t i = 0; i < bitmap->bitmapwords; i++) { bitmap->bitmap[i] = bitmap->bitmap[i] | ~bitmap_wired->bitmap[i]; } bitmap = (hibernate_bitmap_t *)&bitmap->bitmap[bitmap->bitmapwords]; @@ -8793,27 +9047,40 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, if (!preflight) { hibernate_stats.cd_count_wire = count_wire; - hibernate_stats.cd_discarded = count_discard_active + count_discard_inactive + count_discard_purgeable + - count_discard_speculative + count_discard_cleaned + count_discard_vm_struct_pages; + hibernate_stats.cd_discarded = count_discard_active + + count_discard_inactive + count_discard_purgeable + + count_discard_speculative + count_discard_cleaned + + count_discard_vm_struct_pages; } clock_get_uptime(&end); absolutetime_to_nanoseconds(end - start, &nsec); HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL); - HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, zf %d, throt %d, compr %d, xpmapped %d\n %s discard act %d inact %d purgeable %d spec %d cleaned %d retired %d\n", - pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative, count_anonymous, count_throttled, count_compressor, hibernate_stats.cd_found_xpmapped, + HIBLOG("pages %d, wire %d, act %d, inact %d, cleaned %d spec %d, " + "zf %d, throt %d, compr %d, xpmapped %d\n" + " %s discard act %d inact %d purgeable %d " + "spec %d cleaned %d retired %d\n", + pages, count_wire, count_active, count_inactive, count_cleaned, count_speculative, + count_anonymous, count_throttled, count_compressor, hibernate_stats.cd_found_xpmapped, discard_all ? 
"did" : "could", - count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative, count_discard_cleaned, count_retired); + count_discard_active, count_discard_inactive, count_discard_purgeable, + count_discard_speculative, count_discard_cleaned, count_retired); if (hibernate_stats.cd_skipped_xpmapped) { - HIBLOG("WARNING: hibernate_page_list_setall skipped %d xpmapped pages\n", hibernate_stats.cd_skipped_xpmapped); + HIBLOG("WARNING: hibernate_page_list_setall skipped %d xpmapped pages\n", + hibernate_stats.cd_skipped_xpmapped); } - *pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable - count_discard_speculative - count_discard_cleaned - count_retired; + *pagesOut = pages - count_discard_active - count_discard_inactive - + count_discard_purgeable - count_discard_speculative - + count_discard_cleaned - count_retired; if (preflight && will_discard) { - *pagesOut -= count_compressor + count_throttled + count_anonymous + count_inactive + count_cleaned + count_speculative + count_active; + *pagesOut -= count_compressor + count_throttled + + count_anonymous + count_inactive + count_cleaned + + count_speculative + count_active; + /* * We try to keep max HIBERNATE_XPMAPPED_LIMIT pages around in the hibernation image * even if these are clean and so we need to size the hibernation image accordingly. @@ -9022,7 +9289,7 @@ hibernate_create_paddr_map(void) } } -ppnum_t +static ppnum_t hibernate_lookup_paddr(unsigned int indx) { struct ppnum_mapping *ppnm = NULL; @@ -9048,7 +9315,7 @@ done: } -uint32_t +static uint32_t hibernate_mark_as_unneeded(addr64_t saddr, addr64_t eaddr, hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired) { addr64_t saddr_aligned; @@ -9074,7 +9341,7 @@ hibernate_mark_as_unneeded(addr64_t saddr, addr64_t eaddr, hibernate_page_list_t } -void +static void hibernate_hash_insert_page(vm_page_t mem) { vm_page_bucket_t *bucket; @@ -9098,31 +9365,40 @@ hibernate_hash_insert_page(vm_page_t mem) } -void -hibernate_free_range(int sindx, int eindx) +static void +hibernate_free_range_flush(vm_page_list_t *list) { - vm_page_t mem; + vm_page_free_queue_enter_list(*list, VMP_RELEASE_HIBERNATE); + *list = (vm_page_list_t){ }; +} - while (sindx < eindx) { - mem = vm_page_get(sindx); +static void +hibernate_free_range(vm_page_list_t *list, int sindx, int eindx) +{ + for (; sindx < eindx; sindx++) { + vm_page_t mem = vm_page_get(sindx); + ppnum_t pnum = hibernate_lookup_paddr(sindx); - vm_page_init(mem, hibernate_lookup_paddr(sindx)); + vm_page_init(mem, pnum); + vm_page_list_push(list, mem); - vm_page_put_list_on_free_queue(mem, false); - - sindx++; + /* Max batch size of these lists is 255 due to vmp_free_list_result_t */ + if (list->vmpl_count >= UINT8_MAX) { + hibernate_free_range_flush(list); + } } } void hibernate_rebuild_vm_structs(void) { - int i, cindx, sindx, eindx; + int cindx, sindx, eindx; + vm_page_list_t list = { }; vm_page_t mem, tmem, mem_next; AbsoluteTime startTime, endTime; uint64_t nsec; - if (hibernate_rebuild_needed == FALSE) { + if (!hibernate_rebuild_needed) { return; } @@ -9141,7 +9417,7 @@ hibernate_rebuild_vm_structs(void) * transient. This is needed to ensure that buddy page search is corrrect. 
* Without this random data in these vm_pages[] can trip the buddy search */ - for (i = hibernate_teardown_last_valid_compact_indx + 1; i < eindx; ++i) { + for (int i = hibernate_teardown_last_valid_compact_indx + 1; i < eindx; ++i) { vm_page_get(i)->vmp_q_state = VM_PAGE_NOT_ON_Q; } @@ -9174,13 +9450,14 @@ hibernate_rebuild_vm_structs(void) * vm_page_t we moved needs to be initialized as * a range of free vm_page_t's */ - hibernate_free_range(sindx + 1, eindx); + hibernate_free_range(&list, sindx + 1, eindx); eindx = sindx; } - if (sindx) { - hibernate_free_range(0, sindx); - } + hibernate_free_range(&list, 0, sindx); + hibernate_free_range_flush(&list); + + VM_CHECK_MEMORYSTATUS; assert(vm_page_free_count == hibernate_teardown_vm_page_free_count); @@ -9203,18 +9480,15 @@ hibernate_rebuild_vm_structs(void) HIBLOG("hibernate_rebuild completed - took %qd msecs\n", nsec / 1000000ULL); - hibernate_rebuild_needed = FALSE; + hibernate_rebuild_needed = false; KDBG(IOKDBG_CODE(DBG_HIBERNATE, 13) | DBG_FUNC_END); } -uint32_t +static uint32_t hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_list_t *page_list_wired) { - unsigned int i; unsigned int compact_target_indx; - vm_page_t mem, mem_next; - vm_page_bucket_t *bucket; unsigned int mark_as_unneeded_pages = 0; unsigned int unneeded_vm_page_bucket_pages = 0; unsigned int unneeded_vm_pages_pages = 0; @@ -9227,14 +9501,18 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l return 0; } - hibernate_rebuild_needed = TRUE; + hibernate_rebuild_needed = true; - HIBLOG("hibernate_teardown: wired_pages %d, free_pages %d, active_pages %d, inactive_pages %d, speculative_pages %d, cleaned_pages %d, compressor_pages %d\n", - vm_page_wire_count, vm_page_free_count, vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, + HIBLOG("hibernate_teardown: wired_pages %d, free_pages %d, " + "active_pages %d, inactive_pages %d, speculative_pages %d, " + "cleaned_pages %d, compressor_pages %d\n", + vm_page_wire_count, vm_page_free_count, + vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_cleaned_count, compressor_object->resident_page_count); - for (i = 0; i < vm_page_bucket_count; i++) { - bucket = &vm_page_buckets[i]; + for (uint32_t i = 0; i < vm_page_bucket_count; i++) { + vm_page_bucket_t *bucket = &vm_page_buckets[i]; + vm_page_t mem, mem_next; for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) { assert(mem->vmp_hashed); @@ -9247,24 +9525,26 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l } } } - unneeded_vm_page_bucket_pages = hibernate_mark_as_unneeded((addr64_t)&vm_page_buckets[0], (addr64_t)&vm_page_buckets[vm_page_bucket_count], page_list, page_list_wired); + unneeded_vm_page_bucket_pages = hibernate_mark_as_unneeded((addr64_t)&vm_page_buckets[0], + (addr64_t)&vm_page_buckets[vm_page_bucket_count], page_list, page_list_wired); mark_as_unneeded_pages += unneeded_vm_page_bucket_pages; hibernate_teardown_vm_page_free_count = vm_page_free_count; compact_target_indx = 0; - for (i = 0; i < vm_pages_count; i++) { - mem = vm_page_get(i); + vm_free_page_lock(); + + for (uint32_t i = 0; i < vm_pages_count; i++) { + vm_page_t mem = vm_page_get(i); + ppnum_t pnum = VM_PAGE_GET_PHYS_PAGE(mem); + vm_memory_class_t class = vm_page_get_memory_class(mem, pnum); if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) { - assert(mem->vmp_busy); - assert(!mem->vmp_lopage); - - 
vm_page_steal_free_page(mem, VM_REMOVE_REASON_USE); + vm_page_free_queue_remove(class, mem, pnum, + VM_PAGE_ON_FREE_Q); hibernate_teardown_found_free_pages++; - if (vm_page_get(compact_target_indx)->vmp_q_state != VM_PAGE_ON_FREE_Q) { compact_target_indx = i; } @@ -9293,22 +9573,27 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l } } - unneeded_vm_pages_pages = hibernate_mark_as_unneeded((addr64_t)&vm_pages[hibernate_teardown_last_valid_compact_indx + 1], - (addr64_t)vm_page_get(vm_pages_count - 1), page_list, page_list_wired); + vm_free_page_unlock(); + + unneeded_vm_pages_pages = hibernate_mark_as_unneeded( + (addr64_t)vm_page_get(hibernate_teardown_last_valid_compact_indx + 1), + (addr64_t)vm_page_get(vm_pages_count - 1), + page_list, page_list_wired); mark_as_unneeded_pages += unneeded_vm_pages_pages; pal_hib_teardown_pmap_structs(&start_of_unneeded, &end_of_unneeded); if (start_of_unneeded) { - unneeded_pmap_pages = hibernate_mark_as_unneeded(start_of_unneeded, end_of_unneeded, page_list, page_list_wired); + unneeded_pmap_pages = hibernate_mark_as_unneeded(start_of_unneeded, + end_of_unneeded, page_list, page_list_wired); mark_as_unneeded_pages += unneeded_pmap_pages; } - HIBLOG("hibernate_teardown: mark_as_unneeded_pages %d, %d, %d\n", unneeded_vm_page_bucket_pages, unneeded_vm_pages_pages, unneeded_pmap_pages); + HIBLOG("hibernate_teardown: mark_as_unneeded_pages %d, %d, %d\n", + unneeded_vm_page_bucket_pages, unneeded_vm_pages_pages, unneeded_pmap_pages); return mark_as_unneeded_pages; } - #endif /* HIBERNATION */ /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -9336,7 +9621,7 @@ vm_page_info( unsigned int count) { unsigned int i; - lck_spin_t *bucket_lock; + lck_ticket_t *bucket_lock; if (vm_page_bucket_count < count) { count = vm_page_bucket_count; @@ -9348,7 +9633,7 @@ vm_page_info( vm_page_t m; bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK]; - lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket); + lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket); for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); m != VM_PAGE_NULL; @@ -9356,7 +9641,7 @@ vm_page_info( bucket_count++; } - lck_spin_unlock(bucket_lock); + lck_ticket_unlock(bucket_lock); /* don't touch pageable memory while holding locks */ info[i].hib_count = bucket_count; @@ -9374,7 +9659,7 @@ vm_page_buckets_check(void) vm_page_t p; unsigned int p_hash; vm_page_bucket_t *bucket; - lck_spin_t *bucket_lock; + lck_ticket_t *bucket_lock; if (!vm_page_buckets_check_ready) { return; @@ -9414,7 +9699,7 @@ vm_page_buckets_check(void) } bucket_lock = &vm_page_bucket_locks[i / BUCKETS_PER_LOCK]; - lck_spin_lock_grp(bucket_lock, &vm_page_lck_grp_bucket); + lck_ticket_lock(bucket_lock, &vm_page_lck_grp_bucket); p = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); while (p != VM_PAGE_NULL) { @@ -9437,7 +9722,7 @@ vm_page_buckets_check(void) } p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m)); } - lck_spin_unlock(bucket_lock); + lck_ticket_unlock(bucket_lock); } // printf("BUCKET_CHECK: checked buckets\n"); @@ -9974,34 +10259,12 @@ vm_tag_alloc(vm_allocation_site_t * site) return site->tag; } -#if VM_BTLOG_TAGS -#define VM_KERN_MEMORY_STR_MAX_LEN (32) -TUNABLE_STR(vmtaglog, VM_KERN_MEMORY_STR_MAX_LEN, "vmtaglog", ""); -#define VM_TAG_BTLOG_SIZE (16u << 10) - -btlog_t vmtaglog_btlog; -vm_tag_t vmtaglog_tag; - -static void -vm_tag_log(vm_object_t object, int64_t delta, void *fp) -{ - if (is_kernel_object(object)) { - /* kernel object 
backtraces are tracked in vm entries */ - return; - } - if (delta > 0) { - btref_t ref = btref_get(fp, BTREF_GET_NOWAIT); - btlog_record(vmtaglog_btlog, object, 0, ref); - } else if (object->wired_page_count == 0) { - btlog_erase(vmtaglog_btlog, object); - } -} - #ifndef ARRAY_SIZE #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) #endif /* ARRAY_SIZE */ -#define VM_KERN_MEMORY_ELEM(name) [VM_KERN_MEMORY_##name] = #name -const char *vm_kern_memory_strs[] = { +#define VM_KERN_MEMORY_ELEM(name) [VM_KERN_MEMORY_##name] = "VM_KERN_MEMORY_" #name +const char *vm_kern_memory_names[] = { + VM_KERN_MEMORY_ELEM(NONE), VM_KERN_MEMORY_ELEM(OSFMK), VM_KERN_MEMORY_ELEM(BSD), VM_KERN_MEMORY_ELEM(IOKIT), @@ -10035,13 +10298,66 @@ const char *vm_kern_memory_strs[] = { VM_KERN_MEMORY_ELEM(KALLOC_TYPE), VM_KERN_MEMORY_ELEM(TRIAGE), VM_KERN_MEMORY_ELEM(RECOUNT), + "VM_KERN_MEMORY_34", + VM_KERN_MEMORY_ELEM(EXCLAVES), + VM_KERN_MEMORY_ELEM(EXCLAVES_SHARED), + VM_KERN_MEMORY_ELEM(KALLOC_SHARED), + VM_KERN_MEMORY_ELEM(CPUTRACE), }; +_Static_assert(ARRAY_SIZE(vm_kern_memory_names) == VM_KERN_MEMORY_FIRST_DYNAMIC, + "vm_kern_memory_names must map all counter tags"); + +#define VM_KERN_COUNT_ELEM(name) [VM_KERN_COUNT_##name] = "VM_KERN_COUNT_" #name +const char *vm_kern_count_names[] = { + VM_KERN_COUNT_ELEM(MANAGED), + VM_KERN_COUNT_ELEM(RESERVED), + VM_KERN_COUNT_ELEM(WIRED), + VM_KERN_COUNT_ELEM(WIRED_MANAGED), + VM_KERN_COUNT_ELEM(STOLEN), + VM_KERN_COUNT_ELEM(LOPAGE), + VM_KERN_COUNT_ELEM(MAP_KERNEL), + VM_KERN_COUNT_ELEM(MAP_ZONE), + VM_KERN_COUNT_ELEM(MAP_KALLOC_LARGE), + VM_KERN_COUNT_ELEM(WIRED_BOOT), + VM_KERN_COUNT_ELEM(BOOT_STOLEN), + VM_KERN_COUNT_ELEM(WIRED_STATIC_KERNELCACHE), + VM_KERN_COUNT_ELEM(MAP_KALLOC_LARGE_DATA), + VM_KERN_COUNT_ELEM(MAP_KERNEL_DATA), + VM_KERN_COUNT_ELEM(EXCLAVES_CARVEOUT), +}; + +#if VM_BTLOG_TAGS +#define VM_KERN_MEMORY_STR_MAX_LEN (32) +TUNABLE_STR(vmtaglog, VM_KERN_MEMORY_STR_MAX_LEN, "vmtaglog", ""); +#define VM_TAG_BTLOG_SIZE (16u << 10) + +btlog_t vmtaglog_btlog; +vm_tag_t vmtaglog_tag; + +static void +vm_tag_log(vm_object_t object, int64_t delta, void *fp) +{ + if (is_kernel_object(object)) { + /* kernel object backtraces are tracked in vm entries */ + return; + } + if (delta > 0) { + btref_t ref = btref_get(fp, BTREF_GET_NOWAIT); + btlog_record(vmtaglog_btlog, object, 0, ref); + } else if (object->wired_page_count == 0) { + btlog_erase(vmtaglog_btlog, object); + } +} + +_Static_assert(ARRAY_SIZE(vm_kern_count_names) == VM_KERN_COUNTER_COUNT, + "vm_kern_count_names must map all counter tags"); + static vm_tag_t vm_tag_str_to_idx(char tagstr[VM_KERN_MEMORY_STR_MAX_LEN]) { - for (vm_tag_t i = VM_KERN_MEMORY_OSFMK; i < ARRAY_SIZE(vm_kern_memory_strs); i++) { - if (!strncmp(vm_kern_memory_strs[i], tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) { + for (vm_tag_t i = VM_KERN_MEMORY_OSFMK; i < ARRAY_SIZE(vm_kern_memory_names); i++) { + if (!strncmp(vm_kern_memory_names[i], tagstr, VM_KERN_MEMORY_STR_MAX_LEN)) { return i; } } @@ -10102,9 +10418,8 @@ kern_allocation_update_size(kern_allocation_name_t allocation, int64_t delta, __ } #if DEBUG || DEVELOPMENT - if (value > allocation->peak) { - os_atomic_max(&allocation->peak, value, relaxed); - } + /* release to publish the new total */ + os_atomic_max(&allocation->peak, value, release); #endif /* DEBUG || DEVELOPMENT */ if (value == (uint64_t)delta && !allocation->tag) { @@ -10118,6 +10433,37 @@ kern_allocation_update_size(kern_allocation_name_t allocation, int64_t delta, __ #endif /* VM_BTLOG_TAGS */ } +#if DEBUG || DEVELOPMENT + +void 
+vm_tag_reset_all_peaks(void) +{ + vm_log("resetting peak size for all kernel tags\n"); + for (vm_tag_t tag = 0; tag <= vm_allocation_tag_highest; tag++) { + vm_tag_reset_peak(tag); + } +} + +kern_return_t +vm_tag_reset_peak(vm_tag_t tag) +{ + if (tag > vm_allocation_tag_highest) { + return KERN_INVALID_ARGUMENT; + } + + vm_allocation_site_t *site = vm_allocation_sites[tag]; + vm_log_info("resetting peak size for kernel tag %s\n", + KA_NAME(site)); + + uint64_t new_peak = os_atomic_load(&site->total, relaxed); + /* acquire updates to the total */ + os_atomic_min(&site->peak, new_peak, acquire); + + return KERN_SUCCESS; +} + +#endif /* DEBUG || DEVELOPMENT */ + #if VM_TAG_SIZECLASSES void @@ -10374,6 +10720,8 @@ process_account(mach_memory_info_t * info, unsigned int num_info, info[idx].flags &= ~VM_KERN_SITE_WIRED; info[idx].collectable_bytes = zones_collectable_bytes; } + info[idx].flags |= VM_KERN_SITE_NAMED; + strlcpy(info[idx].name, vm_kern_memory_names[idx], MACH_MEMORY_INFO_NAME_MAX_LEN); } else if ((namelen = (VM_TAG_NAME_LEN_MAX & (site->flags >> VM_TAG_NAME_LEN_SHIFT)))) { info[idx].site = 0; info[idx].flags |= VM_KERN_SITE_NAMED; @@ -10437,7 +10785,7 @@ process_account(mach_memory_info_t * info, unsigned int num_info, for (sub = 0; sub < site->subtotalscount; sub++) { alloctag = site->subtotals[sub].tag; assert(alloctag < num_info); - if (info[alloctag].name[0]) { + if (info[alloctag].name[0] && alloctag >= VM_KERN_MEMORY_FIRST_DYNAMIC) { continue; } take = site->subtotals[sub].total; @@ -10616,19 +10964,21 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone mach_memory_info_t * counts; uint32_t i; + vmlp_api_start(VM_PAGE_DIAGNOSE); + bzero(info, num_info * sizeof(mach_memory_info_t)); if (!vm_page_wire_count_initial) { + vmlp_api_end(VM_PAGE_DIAGNOSE, KERN_ABORTED); return KERN_ABORTED; } -#if !XNU_TARGET_OS_OSX wired_size = ptoa_64(vm_page_wire_count); wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count); -#else /* !XNU_TARGET_OS_OSX */ - wired_size = ptoa_64(vm_page_wire_count + vm_lopage_free_count + vm_page_throttled_count); - wired_reserved_size = ptoa_64(vm_page_wire_count_initial - vm_page_stolen_count + vm_page_throttled_count); -#endif /* !XNU_TARGET_OS_OSX */ +#if XNU_TARGET_OS_OSX + wired_size += ptoa_64(vm_lopage_free_count + vm_page_throttled_count); + wired_reserved_size += ptoa_64(vm_page_throttled_count); +#endif /* XNU_TARGET_OS_OSX */ wired_managed_size = ptoa_64(vm_page_wire_count - vm_page_wire_count_initial); wired_size += booter_size; @@ -10637,12 +10987,14 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone num_info -= VM_KERN_COUNTER_COUNT; counts = &info[num_info]; -#define SET_COUNT(xcount, xsize, xflags) \ +#define SET_COUNT(xcount, xsize, xflags) MACRO_BEGIN \ counts[xcount].tag = VM_MAX_TAG_VALUE + xcount; \ counts[xcount].site = (xcount); \ counts[xcount].size = (xsize); \ counts[xcount].mapped = (xsize); \ - counts[xcount].flags = VM_KERN_SITE_COUNTER | xflags; + counts[xcount].flags = VM_KERN_SITE_COUNTER | VM_KERN_SITE_NAMED | xflags; \ + strlcpy(counts[xcount].name, vm_kern_count_names[xcount], MACH_MEMORY_INFO_NAME_MAX_LEN); \ + MACRO_END; SET_COUNT(VM_KERN_COUNT_MANAGED, ptoa_64(vm_page_pages), 0); SET_COUNT(VM_KERN_COUNT_WIRED, wired_size, 0); @@ -10657,13 +11009,15 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone SET_COUNT(VM_KERN_COUNT_EXCLAVES_CARVEOUT, SPTMArgs->sk_carveout_size, 0); #endif -#define 
SET_MAP(xcount, xsize, xfree, xlargest) \ +#define SET_MAP(xcount, xsize, xfree, xlargest) MACRO_BEGIN \ counts[xcount].site = (xcount); \ counts[xcount].size = (xsize); \ counts[xcount].mapped = (xsize); \ counts[xcount].free = (xfree); \ counts[xcount].largest = (xlargest); \ - counts[xcount].flags = VM_KERN_SITE_COUNTER; + counts[xcount].flags = VM_KERN_SITE_COUNTER | VM_KERN_SITE_NAMED; \ + strlcpy(counts[xcount].name, vm_kern_count_names[xcount], MACH_MEMORY_INFO_NAME_MAX_LEN); \ + MACRO_END; vm_map_size_t map_size, map_free, map_largest; @@ -10679,9 +11033,13 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone i = 0; if (!redact_info) { - if (zone_is_data_kheap(KHEAP_DATA_BUFFERS->kh_heap_id)) { + if (zone_is_data_buffers_kheap(KHEAP_DATA_BUFFERS->kh_heap_id)) { i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_BUFFERS); } + if (zone_is_data_shared_kheap(KHEAP_DATA_SHARED->kh_heap_id)) { + i += vm_page_diagnose_heap(counts + i, KHEAP_DATA_SHARED); + } + if (KHEAP_KT_VAR->kh_heap_id == KHEAP_ID_KT_VAR) { i += vm_page_diagnose_kt_heaps(counts + i); } @@ -10758,6 +11116,9 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone entry = NULL; break; } + + vmlp_range_event_entry(map, entry); + if (is_kernel_object(VME_OBJECT(entry))) { count = 0; vm_object_lock(VME_OBJECT(entry)); @@ -10791,6 +11152,7 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone process_account(info, num_info, zones_collectable_bytes, iterate, redact_info); + vmlp_api_end(VM_PAGE_DIAGNOSE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -10804,9 +11166,12 @@ vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_siz vm_map_t map; vm_map_entry_t entry; + vmlp_api_start(VM_KERN_ALLOCATION_INFO); + zsize = zone_element_info((void *) addr, tag); if (zsize) { *zone_size = *size = zsize; + vmlp_api_end(VM_KERN_ALLOCATION_INFO, KERN_SUCCESS); return KERN_SUCCESS; } @@ -10814,7 +11179,7 @@ vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_siz ret = KERN_INVALID_ADDRESS; for (map = kernel_map; map;) { vm_map_lock(map); - if (!vm_map_lookup_entry_allow_pgz(map, addr, &entry)) { + if (!vm_map_lookup_entry(map, addr, &entry)) { break; } if (entry->is_sub_map) { @@ -10827,6 +11192,9 @@ vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_siz if (entry->vme_start != addr) { break; } + + vmlp_range_event_entry(map, entry); + *tag = (vm_tag_t)VME_ALIAS(entry); *size = (entry->vme_end - addr); ret = KERN_SUCCESS; @@ -10837,9 +11205,84 @@ vm_kern_allocation_info(uintptr_t addr, vm_size_t * size, vm_tag_t * tag, vm_siz } vm_map_unlock(kernel_map); + vmlp_api_end(VM_KERN_ALLOCATION_INFO, ret); return ret; } +// some DEBUG/DEVELOPMENT code to get a process to page out its shared cache TEXT pages, +// only used for DK driver in LPW testing +uint64_t +vm_task_evict_shared_cache(task_t task) +{ + enum { kMaxKernelDepth = 3 }; + vm_map_t maps[kMaxKernelDepth]; + vm_map_entry_t entries[kMaxKernelDepth]; + vm_map_t map; + vm_object_t textObject, shadow; + vm_map_entry_t entry; + vm_object_offset_t textOffset, textSize; + int stackIdx; + uint64_t count; + + count = counter_load(&task->pageins); + map = get_task_map(task); + textObject = NULL; + stackIdx = 0; + while (map) { + vm_map_lock_read(map); + for (entry = map->hdr.links.next; map; entry = entry->vme_next) { + if (entry->is_sub_map) { + assert(stackIdx < kMaxKernelDepth); + maps[stackIdx] = map; + entries[stackIdx] = entry; + 
stackIdx++; + map = VME_SUBMAP(entry); + entry = NULL; + break; + } + if (stackIdx && (VM_PROT_EXECUTE | VM_PROT_READ) == entry->protection) { + textObject = VME_OBJECT(entry); + vm_object_lock(textObject); + while ((shadow = textObject->shadow)) { + vm_object_lock(shadow); + vm_object_unlock(textObject); + textObject = shadow; + } + vm_object_reference_locked(textObject); + vm_object_unlock(textObject); + textOffset = VME_OFFSET(entry); + textSize = entry->vme_end - entry->vme_start; + entry = vm_map_last_entry(map); + } + while (map && (entry == vm_map_last_entry(map))) { + vm_map_unlock_read(map); + if (!stackIdx) { + map = NULL; + } else { + --stackIdx; + map = maps[stackIdx]; + entry = entries[stackIdx]; + if (textObject) { + entry = vm_map_last_entry(map); + } + } + } + } + } + + if (textObject) { + vm_object_sync(textObject, textOffset, textSize, true, false, false); + vm_object_deallocate(textObject); + } + return count; +} + +uint64_t +vm_task_pageins(task_t task) +{ + return counter_load(&task->pageins); +} + #endif /* DEBUG || DEVELOPMENT */ uint32_t diff --git a/osfmk/vm/vm_sanitize.c b/osfmk/vm/vm_sanitize.c index 93a78a3ad..1bbb76d50 100644 --- a/osfmk/vm/vm_sanitize.c +++ b/osfmk/vm/vm_sanitize.c @@ -57,6 +57,7 @@ #include #include + #define VM_SANITIZE_PROT_ALLOWED (VM_PROT_ALL | VM_PROT_ALLEXEC) // TODO: enable telemetry and ktriage separately? @@ -474,11 +475,13 @@ vm_sanitize_addr_size( assert(!(flags & VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES)); } -#if CONFIG_KERNEL_TAGGING +#if KASAN_TBI if (flags & VM_SANITIZE_FLAGS_CANONICALIZE) { *addr = vm_memtag_canonicalize_kernel(*addr); } -#endif /* CONFIG_KERNEL_TAGGING */ +#endif /* KASAN_TBI */ + + addr_aligned = vm_map_trunc_page_mask(*addr, pgmask); /* diff --git a/osfmk/vm/vm_sanitize_internal.h b/osfmk/vm/vm_sanitize_internal.h index f5a0ecf76..02a28d145 100644 --- a/osfmk/vm/vm_sanitize_internal.h +++ b/osfmk/vm/vm_sanitize_internal.h @@ -191,7 +191,7 @@ __enum_closed_decl(vm_sanitize_caller_id_t, uint32_t, { * Reject non user allowed mem map flags for memory entry. * * @const VM_SANITIZE_FLAGS_CANONICALIZE - * Canonicalize address for CONFIG_KERNEL_TAGGING + * Canonicalize address for KASAN_TBI * * @const VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE * Checks that the size is aligned to map page size. diff --git a/osfmk/vm/vm_sanitize_telemetry.c b/osfmk/vm/vm_sanitize_telemetry.c index 3cc11c74b..db2f01da5 100644 --- a/osfmk/vm/vm_sanitize_telemetry.c +++ b/osfmk/vm/vm_sanitize_telemetry.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include @@ -219,7 +219,7 @@ vm_sanitize_send_telemetry_core_analytics( struct proc *proc = current_proc(); ca_event_t ca_event = CA_EVENT_ALLOCATE_FLAGS(vm_sanitize_updated_return_code, Z_NOWAIT | Z_ZERO); if (NULL == ca_event) { - os_log_error(OS_LOG_DEFAULT, "Failed to allocate event for VM API telemetry."); + vm_log_error("Failed to allocate event for VM API telemetry."); return; } CA_EVENT_TYPE(vm_sanitize_updated_return_code) * event_data = ca_event->data; diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index 437ded9f7..3b6935902 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -53,15 +53,49 @@ * it without having to enter it in their own pmap. * * When a process is being exec'ed, vm_map_exec() calls vm_shared_region_enter() - * to map the appropriate shared region in the process's address space. + * to associate the appropriate shared region with the process's address space. 
 * We look up the appropriate shared region for the process's environment.
 * If we can't find one, we create a new (empty) one and add it to the list.
 * Otherwise, we just take an extra reference on the shared region we found.
+ * At this point, the shared region is not actually mapped into the process's
+ * address space; instead, a permanent VM_PROT_NONE placeholder covering the
+ * same VA range as the shared region is inserted.
 *
 * The "dyld" runtime, mapped into the process's address space at exec() time,
 * will then use the shared_region_check_np() and shared_region_map_and_slide_2_np()
 * system calls to validate and/or populate the shared region with the
- * appropriate dyld_shared_cache file.
+ * appropriate dyld_shared_cache file. If the initial call to shared_region_check_np()
+ * indicates that the shared region has not been configured, dyld will then call
+ * shared_region_map_and_slide_2_np() to configure the shared region. It's possible
+ * that multiple tasks may simultaneously issue this call sequence for the same shared
+ * region, but the synchronization done by vm_shared_region_acquire() will ensure that
+ * only one task will ultimately configure the shared region. All other tasks will
+ * wait for that task to finish its configuration step, at which point (assuming
+ * successful configuration) they will observe the configured shared region and
+ * re-issue the shared_region_check_np() system call to obtain the final shared
+ * region info.
+ *
+ * For the task that ends up configuring the shared region, the mapping and
+ * sliding of the shared region is performed against a temporary configuration-only
+ * vm_map, which is activated for the calling thread using
+ * vm_map_switch_to(). Once mapping and sliding complete successfully, the shared
+ * region will be "sealed" by stabilizing all of its vm_map entries using COPY_DELAY
+ * objects, which eliminates the need for later modification of shared region map
+ * entries and thus simplifies the shared region's runtime locking requirements.
+ * After this sealing step, the original task vm_map will be restored. Since this
+ * entire configuration sequence happens within the context of a single system call,
+ * use of the temporary vm_map effectively guarantees that the shared region will
+ * not be visible in the task's address space (either to other threads in the task
+ * or to other tasks attempting to query the address space, e.g. for debugging purposes)
+ * until it has been fully configured and sealed.
+ *
+ * The shared region is only inserted into a task's address space when the
+ * shared_region_check_np() system call detects that the shared region has been fully
+ * configured. Only at this point will the placeholder entry inserted at exec()
+ * time be replaced with the real shared region submap entry. This step is required
+ * of all tasks; even the task that previously configured the shared region must
+ * issue a final shared_region_check_np() system call to obtain the real shared
+ * region mapping.
 *
 * The shared region is inherited on fork() and the child simply takes an
 * extra reference on its parent's shared region.
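The hand-off described above, where a single caller configures the shared region while every other caller waits on sr_mapping_in_progress and then re-checks, can be modelled outside the kernel with an ordinary mutex and condition variable. The sketch below is illustrative only and is not part of the patch; sr_acquire(), sr_release(), sr_busy and sr_configured are invented names standing in for vm_shared_region_acquire(), vm_shared_region_release(), sr_mapping_in_progress and the sr_first_mapping != -1 check.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t sr_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sr_wakeup = PTHREAD_COND_INITIALIZER;
static bool sr_busy;        /* models sr_mapping_in_progress != NULL */
static bool sr_configured;  /* models sr_first_mapping != -1 */

static void
sr_acquire(void)
{
	pthread_mutex_lock(&sr_lock);
	while (sr_busy) {                   /* wait for our turn... */
		pthread_cond_wait(&sr_wakeup, &sr_lock);
	}
	sr_busy = true;
	pthread_mutex_unlock(&sr_lock);
}

static void
sr_release(void)
{
	pthread_mutex_lock(&sr_lock);
	sr_busy = false;
	pthread_cond_broadcast(&sr_wakeup); /* wake any waiting "tasks" */
	pthread_mutex_unlock(&sr_lock);
}

static void *
task_thread(void *arg)
{
	int id = (int)(intptr_t)arg;

	sr_acquire();
	if (!sr_configured) {
		/* we won the race: "map", "slide" and "seal" the region */
		sr_configured = true;
		printf("task %d configured the shared region\n", id);
	} else {
		/* someone else configured it; just observe the result */
		printf("task %d found an already-configured shared region\n", id);
	}
	sr_release();
	return NULL;
}

int
main(void)
{
	pthread_t threads[4];

	for (intptr_t i = 0; i < 4; i++) {
		pthread_create(&threads[i], NULL, task_thread, (void *)i);
	}
	for (int i = 0; i < 4; i++) {
		pthread_join(threads[i], NULL);
	}
	return 0;
}

With four competing threads, exactly one ever takes the configuration path; the rest block in sr_acquire() and observe the already-configured state, which is the behavior the comment above relies on.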
@@ -119,6 +153,7 @@ #include #include #include +#include #if defined(__arm64__) #include @@ -228,6 +263,8 @@ static kern_return_t vm_shared_region_slide_mapping( uint32_t slide, memory_object_control_t, vm_prot_t prot); /* forward */ +static kern_return_t vm_shared_region_insert_placeholder(vm_map_t map, vm_shared_region_t shared_region); +static kern_return_t vm_shared_region_insert_submap(vm_map_t map, vm_shared_region_t shared_region, bool overwrite); static int __commpage_setup = 0; #if XNU_TARGET_OS_OSX @@ -256,7 +293,7 @@ vm_shared_region_get( task_lock(task); vm_shared_region_lock(); shared_region = task->shared_region; - if (shared_region) { + if (shared_region != NULL) { assert(shared_region->sr_ref_count > 0); vm_shared_region_reference_locked(shared_region); } @@ -271,6 +308,44 @@ vm_shared_region_get( return shared_region; } +static void +vm_shared_region_acquire(vm_shared_region_t shared_region) +{ + vm_shared_region_lock(); + assert(shared_region->sr_ref_count > 0); + while (shared_region->sr_mapping_in_progress != NULL) { + /* wait for our turn... */ + vm_shared_region_sleep(&shared_region->sr_mapping_in_progress, + THREAD_UNINT); + } + assert(shared_region->sr_mapping_in_progress == NULL); + assert(shared_region->sr_ref_count > 0); + + /* let others know to wait while we're working in this shared region */ + shared_region->sr_mapping_in_progress = current_thread(); + vm_shared_region_unlock(); +} + +static void +vm_shared_region_release(vm_shared_region_t shared_region) +{ + vm_shared_region_lock(); + assert(shared_region->sr_mapping_in_progress == current_thread()); + shared_region->sr_mapping_in_progress = THREAD_NULL; + vm_shared_region_wakeup((event_t) &shared_region->sr_mapping_in_progress); + vm_shared_region_unlock(); +} + +static void +vm_shared_region_seal( + struct vm_shared_region *sr) +{ + vm_map_t sr_map; + + sr_map = vm_shared_region_vm_map(sr); + vm_map_seal(sr_map, true /* nested_pmap */); +} + vm_map_t vm_shared_region_vm_map( vm_shared_region_t shared_region) @@ -678,7 +753,8 @@ vm_shared_region_create( vm_named_entry_t mem_entry; ipc_port_t mem_entry_port; vm_shared_region_t shared_region; - vm_map_t sub_map; + vm_map_t sub_map, config_map; + pmap_t nested_pmap, config_pmap; mach_vm_offset_t base_address, pmap_nesting_start; mach_vm_size_t size, pmap_nesting_size; @@ -693,6 +769,9 @@ vm_shared_region_create( mem_entry = NULL; mem_entry_port = IPC_PORT_NULL; sub_map = VM_MAP_NULL; + config_map = VM_MAP_NULL; + nested_pmap = PMAP_NULL; + config_pmap = PMAP_NULL; /* create a new shared region structure... */ shared_region = kalloc_type(struct vm_shared_region, @@ -768,7 +847,6 @@ vm_shared_region_create( #if defined(__arm64__) { - struct pmap *pmap_nested; int pmap_flags = 0; pmap_flags |= is_64bit ? 
PMAP_CREATE_64BIT : 0; @@ -781,45 +859,74 @@ vm_shared_region_create( } #endif /* __ARM_MIXED_PAGE_SIZE__ */ - pmap_nested = pmap_create_options(NULL, 0, pmap_flags); - if (pmap_nested != PMAP_NULL) { - pmap_set_nested(pmap_nested); - sub_map = vm_map_create_options(pmap_nested, 0, + nested_pmap = pmap_create_options(NULL, 0, pmap_flags | PMAP_CREATE_NESTED); + config_pmap = pmap_create_options(NULL, 0, pmap_flags); + if ((nested_pmap != PMAP_NULL) && (config_pmap != PMAP_NULL)) { + pmap_set_nested(nested_pmap); +#if CODE_SIGNING_MONITOR + csm_setup_nested_address_space(nested_pmap, base_address, size); +#endif /* CODE_SIGNING_MONITOR */ + pmap_set_shared_region(config_pmap, nested_pmap, base_address, size); + sub_map = vm_map_create_options(nested_pmap, 0, (vm_map_offset_t)size, VM_MAP_CREATE_PAGEABLE); + config_map = vm_map_create_options(config_pmap, base_address, + base_address + size, VM_MAP_CREATE_PAGEABLE); if (is_64bit || page_shift_user32 == SIXTEENK_PAGE_SHIFT) { /* enforce 16KB alignment of VM map entries */ vm_map_set_page_shift(sub_map, SIXTEENK_PAGE_SHIFT); + vm_map_set_page_shift(config_map, SIXTEENK_PAGE_SHIFT); } #if __ARM_MIXED_PAGE_SIZE__ if (cputype == CPU_TYPE_ARM64 && target_page_shift == FOURK_PAGE_SHIFT) { /* arm64/4k address space */ vm_map_set_page_shift(sub_map, FOURK_PAGE_SHIFT); + vm_map_set_page_shift(config_map, FOURK_PAGE_SHIFT); } #endif /* __ARM_MIXED_PAGE_SIZE__ */ - } else { - sub_map = VM_MAP_NULL; } } #else /* defined(__arm64__) */ { /* create a VM sub map and its pmap */ - pmap_t pmap = pmap_create_options(NULL, 0, is_64bit); - if (pmap != NULL) { - sub_map = vm_map_create_options(pmap, 0, + nested_pmap = pmap_create_options(NULL, 0, is_64bit); + config_pmap = pmap_create_options(NULL, 0, is_64bit); + if ((nested_pmap != NULL) && (config_pmap != NULL)) { + pmap_set_shared_region(config_pmap, nested_pmap, base_address, size); + sub_map = vm_map_create_options(nested_pmap, 0, (vm_map_offset_t)size, VM_MAP_CREATE_PAGEABLE); - } else { - sub_map = VM_MAP_NULL; + config_map = vm_map_create_options(config_pmap, base_address, + base_address + size, VM_MAP_CREATE_PAGEABLE); } } #endif /* defined(__arm64__) */ - if (sub_map == VM_MAP_NULL) { + + if (sub_map != VM_MAP_NULL) { + nested_pmap = PMAP_NULL; + } + if (config_map != VM_MAP_NULL) { + config_pmap = PMAP_NULL; + } + if (nested_pmap != PMAP_NULL) { + pmap_destroy(nested_pmap); + } + if (config_pmap != PMAP_NULL) { + pmap_destroy(config_pmap); + } + + if ((sub_map == VM_MAP_NULL) || (config_map == VM_MAP_NULL)) { + if (sub_map != VM_MAP_NULL) { + vm_map_deallocate(sub_map); + } + if (config_map != VM_MAP_NULL) { + vm_map_deallocate(config_map); + } ipc_port_release_send(mem_entry_port); kfree_type(struct vm_shared_region, shared_region); shared_region = NULL; - SHARED_REGION_TRACE_ERROR(("shared_region: create: couldn't allocate map\n")); + SHARED_REGION_TRACE_ERROR(("shared_region: create: couldn't allocate maps\n")); goto done; } @@ -827,9 +934,13 @@ vm_shared_region_create( vm_map_cs_enforcement_set(sub_map, true); assert(vm_map_cs_enforcement(sub_map)); assert(pmap_get_vm_map_cs_enforced(vm_map_pmap(sub_map))); + vm_map_cs_enforcement_set(config_map, true); + assert(vm_map_cs_enforcement(config_map)); + assert(pmap_get_vm_map_cs_enforced(vm_map_pmap(config_map))); assert(!sub_map->disable_vmentry_reuse); sub_map->is_nested_map = TRUE; + sub_map->vmmap_sealed = VM_MAP_WILL_BE_SEALED; /* make the memory entry point to the VM sub map */ mem_entry->is_sub_map = TRUE; @@ -841,6 +952,7 @@ 
vm_shared_region_create( shared_region->sr_mem_entry = mem_entry_port; /* fill in the shared region's environment and settings */ + shared_region->sr_config_map = config_map; shared_region->sr_base_address = base_address; shared_region->sr_size = size; shared_region->sr_pmap_nesting_start = pmap_nesting_start; @@ -879,6 +991,14 @@ vm_shared_region_create( shared_region->sr_next_auth_section = 0; shared_region->sr_auth_section = NULL; #endif /* __has_feature(ptrauth_calls) */ + kern_return_t kr = vm_shared_region_insert_submap(config_map, shared_region, false); + if (kr != KERN_SUCCESS) { + SHARED_REGION_TRACE_ERROR( + ("shared_region: create(%p): insert_submap returned 0x%x\n", shared_region, kr)); + shared_region->sr_ref_count = 0; + vm_shared_region_destroy(shared_region); + shared_region = NULL; + } done: if (shared_region) { @@ -934,6 +1054,12 @@ vm_shared_region_destroy( assert(mem_entry->is_sub_map); assert(!mem_entry->internal); assert(!mem_entry->is_copy); + + if (shared_region->sr_config_map != VM_MAP_NULL) { + vm_map_deallocate(shared_region->sr_config_map); + shared_region->sr_config_map = VM_MAP_NULL; + } + map = mem_entry->backing.map; /* @@ -953,7 +1079,7 @@ vm_shared_region_destroy( /* * Release our (one and only) handle on the memory entry. * This will generate a no-senders notification, which will be processed - * by ipc_kobject_notify_no_senders(), which will release the one and only + * by ipc_notify_no_senders_kobject(), which will release the one and only * reference on the memory entry and cause it to be destroyed, along * with the VM sub map and its pmap. */ @@ -1003,8 +1129,7 @@ vm_shared_region_destroy( kern_return_t vm_shared_region_start_address( vm_shared_region_t shared_region, - mach_vm_offset_t *start_address, - task_t task) + mach_vm_offset_t *start_address) { kern_return_t kr; mach_vm_offset_t sr_base_address; @@ -1021,12 +1146,12 @@ vm_shared_region_start_address( * in this shared region right when we're looking at it. * We want a consistent view of the map... */ - while (shared_region->sr_mapping_in_progress) { + while (shared_region->sr_mapping_in_progress != NULL) { /* wait for our turn... */ vm_shared_region_sleep(&shared_region->sr_mapping_in_progress, THREAD_UNINT); } - assert(!shared_region->sr_mapping_in_progress); + assert(shared_region->sr_mapping_in_progress == NULL); assert(shared_region->sr_ref_count > 0); sr_base_address = shared_region->sr_base_address; @@ -1041,23 +1166,8 @@ vm_shared_region_start_address( } - uint32_t slide = shared_region->sr_slide; - vm_shared_region_unlock(); - /* - * Cache shared region info in the task for telemetry gathering, if we're - * passed in the task. No task lock here as we're still in intial task set up. 
- */ - if (kr == KERN_SUCCESS && task != NULL && task->task_shared_region_slide == -1) { - uint_t sc_header_uuid_offset = offsetof(struct _dyld_cache_header, uuid); - if (copyin((user_addr_t)(*start_address + sc_header_uuid_offset), - (char *)&task->task_shared_region_uuid, - sizeof(task->task_shared_region_uuid)) == 0) { - task->task_shared_region_slide = slide; - } - } - SHARED_REGION_TRACE_DEBUG( ("shared_region: start_address(%p) <- 0x%llx\n", (void *)VM_KERNEL_ADDRPERM(shared_region), @@ -1066,6 +1176,52 @@ vm_shared_region_start_address( return kr; } +kern_return_t +vm_shared_region_update_task(task_t task, vm_shared_region_t shared_region, mach_vm_offset_t start_address) +{ + kern_return_t kr = KERN_SUCCESS; + uuid_t shared_region_uuid; + _Static_assert(sizeof(shared_region_uuid) == sizeof(task->task_shared_region_uuid), + "sizeof task_shared_region_uuid != sizeof uuid_t"); + task_lock(task); + if (task->task_shared_region_slide == -1) { + assert(vm_map_is_sealed(vm_shared_region_vm_map(shared_region))); + kr = vm_shared_region_insert_submap(task->map, shared_region, true); + if (kr == KERN_SUCCESS) { + task->task_shared_region_slide = shared_region->sr_slide; + /* + * Drop the task lock to avoid potential deadlock if copyin() faults. + * With the lock dropped, another thread in the task could theoretically + * call this function, observe task_shared_region_slide != -1, and + * return before the UUID has been copied to the task, but in practice + * dyld should only issue the shared_region_check_np() syscall that ends + * up invoking this function exactly once, and while the task is still + * single-threaded at that. + */ + task_unlock(task); + /* + * Now that shared region is accessible in the task's address space, + * copyin the UUID for debugging/telemetry purposes. + * copyin had better succeed here. We've already inserted the submap, + * which can't be undone or re-done later. If the shared region header + * isn't accessible at this point, we have big problems. + */ + const uint_t sc_header_uuid_offset = offsetof(struct _dyld_cache_header, uuid); + if (copyin((user_addr_t)(start_address + sc_header_uuid_offset), + (char *)&shared_region_uuid, sizeof(shared_region_uuid)) != 0) { + SHARED_REGION_TRACE_ERROR( + ("shared_region: update_task(%p) copyin failed\n", + (void *)VM_KERNEL_ADDRPERM(shared_region))); + } + task_lock(task); + memcpy(&task->task_shared_region_uuid, shared_region_uuid, sizeof(shared_region_uuid)); + } + } + + task_unlock(task); + return kr; +} + /* * Look up a pre-existing mapping in shared region, for replacement. * Takes an extra object reference if found. 
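vm_shared_region_update_task() above also illustrates a common locking idiom: the task lock cannot be held across a copyin() that may fault, so the lock is dropped for the copy and re-taken only to publish the result. A minimal userspace model of that pattern follows; fake_task, slow_copy() and update_task() are invented names, with the pthread mutex standing in for the task lock and slow_copy() for the faulting copyin().

#include <pthread.h>
#include <stdbool.h>
#include <string.h>

struct fake_task {
	pthread_mutex_t lock;
	bool            configured;   /* models task_shared_region_slide != -1 */
	unsigned char   uuid[16];     /* models task_shared_region_uuid */
};

/* stand-in for copyin(): may block or fault, so it must not run under the lock */
static int
slow_copy(unsigned char dst[16])
{
	memset(dst, 0xAB, 16);
	return 0;
}

static void
update_task(struct fake_task *task)
{
	unsigned char uuid[16];

	pthread_mutex_lock(&task->lock);
	if (!task->configured) {
		task->configured = true;            /* publish the "slide" first */
		pthread_mutex_unlock(&task->lock);  /* drop the lock across the copy */
		int err = slow_copy(uuid);
		pthread_mutex_lock(&task->lock);
		if (err == 0) {
			memcpy(task->uuid, uuid, sizeof(task->uuid));
		}
	}
	pthread_mutex_unlock(&task->lock);
}

int
main(void)
{
	struct fake_task task = { .lock = PTHREAD_MUTEX_INITIALIZER, .configured = false };
	update_task(&task);
	return 0;
}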
@@ -1075,18 +1231,25 @@ find_mapping_to_slide(vm_map_t map, vm_map_address_t addr, vm_map_entry_t entry) { vm_map_entry_t found; + vmlp_api_start(FIND_MAPPING_TO_SLIDE); + /* find the shared region's map entry to slide */ vm_map_lock_read(map); - if (!vm_map_lookup_entry_allow_pgz(map, addr, &found)) { + if (!vm_map_lookup_entry(map, addr, &found)) { /* no mapping there */ vm_map_unlock(map); + vmlp_api_end(FIND_MAPPING_TO_SLIDE, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } *entry = *found; + + vmlp_range_event_entry(map, entry); + /* extra ref to keep object alive while map is unlocked */ vm_object_reference(VME_OBJECT(found)); vm_map_unlock_read(map); + vmlp_api_end(FIND_MAPPING_TO_SLIDE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -1181,26 +1344,21 @@ vm_shared_region_auth_remap(vm_shared_region_t sr) boolean_t use_ptr_auth = task_sign_pointers(task); /* - * Don't do this more than once and avoid any race conditions in finishing it. + * Taking the full shared region lock here shouldn't be necessary for + * functional correctness here, so we could potentially gain some scalability + * by only taking the task lock here which would avoid the possibility of + * serializing multiple tasks at the auth_remap step. But shared_region_pager_match() + * is slightly racy and can produce duplicate pagers without shared-region-wide + * synchronization, which is a potential memory footprint issue. */ - vm_shared_region_lock(); - while (sr->sr_mapping_in_progress) { - /* wait for our turn... */ - vm_shared_region_sleep(&sr->sr_mapping_in_progress, THREAD_UNINT); - } - assert(!sr->sr_mapping_in_progress); - assert(sr->sr_ref_count > 0); + vm_shared_region_acquire(sr); /* Just return if already done. */ if (task->shared_region_auth_remapped) { - vm_shared_region_unlock(); + vm_shared_region_release(sr); return KERN_SUCCESS; } - /* let others know to wait while we're working in this shared region */ - sr->sr_mapping_in_progress = current_thread(); - vm_shared_region_unlock(); - /* * Remap any sections with pointer authentications into the private map. */ @@ -1260,7 +1418,7 @@ vm_shared_region_auth_remap(vm_shared_region_t sr) /* Preserve the TPRO flag if task has TPRO enabled */ vmk_flags.vmf_tpro = (vm_map_tpro(task->map) && tmp_entry->used_for_tpro && - task_is_hardened_binary(task)); + task_has_tpro(task)); map_addr = si->si_slid_address; kr = mach_vm_map_kernel(task->map, @@ -1308,12 +1466,8 @@ done: /* * Mark the region as having it's auth sections remapped. */ - vm_shared_region_lock(); task->shared_region_auth_remapped = TRUE; - assert(sr->sr_mapping_in_progress == current_thread()); - sr->sr_mapping_in_progress = THREAD_NULL; - vm_shared_region_wakeup((event_t)&sr->sr_mapping_in_progress); - vm_shared_region_unlock(); + vm_shared_region_release(sr); return kr; } #endif /* __has_feature(ptrauth_calls) */ @@ -1328,7 +1482,6 @@ vm_shared_region_undo_mappings( { unsigned int j = 0; vm_shared_region_t shared_region = NULL; - boolean_t reset_shared_region_state = FALSE; struct _sr_file_mappings *srfmp; unsigned int mappings_count; struct shared_file_mapping_slide_np *mappings; @@ -1345,23 +1498,6 @@ vm_shared_region_undo_mappings( ipc_port_t sr_handle; vm_named_entry_t sr_mem_entry; - vm_shared_region_lock(); - assert(shared_region->sr_ref_count > 0); - - while (shared_region->sr_mapping_in_progress) { - /* wait for our turn... 
*/ - vm_shared_region_sleep(&shared_region->sr_mapping_in_progress, - THREAD_UNINT); - } - assert(!shared_region->sr_mapping_in_progress); - assert(shared_region->sr_ref_count > 0); - /* let others know we're working in this shared region */ - shared_region->sr_mapping_in_progress = current_thread(); - - vm_shared_region_unlock(); - - reset_shared_region_state = TRUE; - /* no need to lock because this data is never modified... */ sr_handle = shared_region->sr_mem_entry; sr_mem_entry = mach_memory_entry_from_port(sr_handle); @@ -1417,17 +1553,6 @@ vm_shared_region_undo_mappings( } } - if (reset_shared_region_state) { - vm_shared_region_lock(); - assert(shared_region->sr_ref_count > 0); - assert(shared_region->sr_mapping_in_progress == current_thread()); - /* we're done working on that shared region */ - shared_region->sr_mapping_in_progress = THREAD_NULL; - vm_shared_region_wakeup((event_t) &shared_region->sr_mapping_in_progress); - vm_shared_region_unlock(); - reset_shared_region_state = FALSE; - } - vm_shared_region_deallocate(shared_region); } @@ -1470,38 +1595,7 @@ vm_shared_region_map_file_setup( struct shared_file_mapping_slide_np *mappings; struct _sr_file_mappings *srfmp; - vm_shared_region_lock(); - assert(shared_region->sr_ref_count > 0); - - /* - * Make sure we handle only one mapping at a time in a given - * shared region, to avoid race conditions. This should not - * happen frequently... - */ - while (shared_region->sr_mapping_in_progress) { - /* wait for our turn... */ - vm_shared_region_sleep(&shared_region->sr_mapping_in_progress, - THREAD_UNINT); - } - assert(!shared_region->sr_mapping_in_progress); - assert(shared_region->sr_ref_count > 0); - - - /* let others know we're working in this shared region */ - shared_region->sr_mapping_in_progress = current_thread(); - - /* - * Did someone race in and map this shared region already? - */ - if (shared_region->sr_first_mapping != -1) { - vm_shared_region_unlock(); -#if DEVELOPMENT || DEBUG - printf("shared_region: caught race in map and slide\n"); -#endif /* DEVELOPMENT || DEBUG */ - return KERN_FAILURE; - } - - vm_shared_region_unlock(); + assert(shared_region->sr_mapping_in_progress == current_thread()); /* no need to lock because this data is never modified... */ sr_handle = shared_region->sr_mem_entry; @@ -1521,6 +1615,7 @@ vm_shared_region_map_file_setup( for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[sr_file_mappings_count]; srfmp++) { + i = 0; /* reset i early because it's used in the error recovery path */ mappings_count = srfmp->mappings_count; mappings = srfmp->mappings; file_control = srfmp->file_control; @@ -1569,7 +1664,7 @@ vm_shared_region_map_file_setup( * The size needs to be suitable to map into kernel. */ obj_size = vm_object_round_page(mappings->sms_size); - object = vm_object_allocate(obj_size); + object = vm_object_allocate(obj_size, kernel_map->serial_id); if (object == VM_OBJECT_NULL) { printf("%s(): for fd==-1 vm_object_allocate() failed\n", __func__); kr = KERN_RESOURCE_SHORTAGE; @@ -1582,7 +1677,8 @@ vm_shared_region_map_file_setup( vm_map_offset_t kaddr = 0; vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE(); vmk_flags.vmkf_no_copy_on_read = 1; - vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA; + vmk_flags.vmkf_range_id = kmem_needs_data_share_range() ? 
+ KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA; kr = vm_map_enter(kernel_map, &kaddr, @@ -1621,7 +1717,7 @@ vm_shared_region_map_file_setup( copyin((user_addr_t)mappings->sms_file_offset, (void *)kaddr, mappings->sms_size); vm_map_remove(kernel_map, kaddr, kaddr + obj_size); if (copyin_err) { - printf("%s(): for fd==-1 copyin() failed, errno=%d\n", __func__, copyin_err); + printf("%s(): for fd==-1 copyin(%p) failed, errno=%d\n", __func__, (void*)mappings->sms_file_offset, copyin_err); switch (copyin_err) { case EPERM: case EACCES: @@ -1801,7 +1897,7 @@ vm_shared_region_map_file_setup( * We have to create the VM object now, so that it can be mapped "copy-on-write". */ obj_size = vm_map_round_page(mappings[i].sms_size, VM_MAP_PAGE_MASK(sr_map)); - object = vm_object_allocate(obj_size); + object = vm_object_allocate(obj_size, sr_map->serial_id); if (object == VM_OBJECT_NULL) { kr = KERN_RESOURCE_SHORTAGE; } else { @@ -1984,6 +2080,10 @@ vm_shared_region_map_file( memory_object_control_t *slid_file_controls = NULL; /* [0..vmsr_num_slides] */ struct shared_file_mapping_slide_np **mappings_to_slide = NULL; /* [0..vmsr_num_slides] */ struct _sr_file_mappings *srfmp; + vm_map_switch_context_t switch_ctx; + bool map_switched = false; + + vmlp_api_start(VM_SHARED_REGION_MAP_FILE); /* * Figure out how many of the mappings have slides. @@ -2008,15 +2108,31 @@ vm_shared_region_map_file( kalloc_type(struct shared_file_mapping_slide_np *, vmsr_num_slides, Z_WAITOK | Z_ZERO); } + vm_shared_region_acquire(shared_region); + + /* + * Did someone race in and map this shared region already, or did an earlier mapping fail? + */ + if (shared_region->sr_first_mapping != -1) { +#if DEVELOPMENT || DEBUG + printf("shared_region: caught race in map and slide\n"); +#endif /* DEVELOPMENT || DEBUG */ + kr = KERN_FAILURE; + goto done; + } + kr = vm_shared_region_map_file_setup(shared_region, sr_file_mappings_count, sr_file_mappings, &mappings_to_slide_cnt, mappings_to_slide, slid_mappings, slid_file_controls, &sfm_min_address, &sfm_max_address, &sr_map, &lowest_unnestable_addr, vmsr_num_slides); if (kr != KERN_SUCCESS) { - vm_shared_region_lock(); goto done; } assert(vmsr_num_slides == mappings_to_slide_cnt); + assert(shared_region->sr_config_map != NULL); + switch_ctx = vm_map_switch_to(shared_region->sr_config_map); + map_switched = true; + /* * The call above installed direct mappings to the shared cache file. * Now we go back and overwrite the mappings that need relocation @@ -2065,7 +2181,6 @@ vm_shared_region_map_file( &sr_file_mappings[0], &sr_file_mappings[sr_file_mappings_count - 1], sr_file_mappings_count); - vm_shared_region_lock(); goto done; } } @@ -2076,6 +2191,7 @@ vm_shared_region_map_file( lowest_unnestable_addr &= ~(pmap_shared_region_size_min(sr_map->pmap) - 1); if (lowest_unnestable_addr != sr_map->lowest_unnestable_start) { vm_map_lock(sr_map); + vmlp_range_event_none(sr_map); sr_map->lowest_unnestable_start = lowest_unnestable_addr; vm_map_unlock(sr_map); } @@ -2085,33 +2201,33 @@ vm_shared_region_map_file( assert(shared_region->sr_mapping_in_progress == current_thread()); vm_shared_region_map_file_final(shared_region, sr_map, sfm_min_address, sfm_max_address); - -done: - /* - * We're done working on that shared region. - * Wake up any waiting threads. 
- */ - assert(shared_region->sr_mapping_in_progress == current_thread()); - shared_region->sr_mapping_in_progress = THREAD_NULL; - vm_shared_region_wakeup((event_t) &shared_region->sr_mapping_in_progress); vm_shared_region_unlock(); -#if __has_feature(ptrauth_calls) - if (kr == KERN_SUCCESS) { - /* - * Since authenticated mappings were just added to the shared region, - * go back and remap them into private mappings for this task. - */ - kr = vm_shared_region_auth_remap(shared_region); - } -#endif /* __has_feature(ptrauth_calls) */ +done: - /* Cache shared region info needed for telemetry in the task */ - task_t task; - if (kr == KERN_SUCCESS && (task = current_task())->task_shared_region_slide == -1) { - mach_vm_offset_t start_address; - (void)vm_shared_region_start_address(shared_region, &start_address, task); +#ifndef NO_NESTED_PMAP + /* + * If we succeeded, we know the bounds of the shared region. + * Trim our pmaps to only cover this range (if applicable to + * this platform). + */ + if (kr == KERN_SUCCESS) { + pmap_trim(shared_region->sr_config_map->pmap, sr_map->pmap, sfm_min_address, sfm_max_address - sfm_min_address); } +#endif + if (map_switched) { + vm_map_switch_back(switch_ctx); + } + + if (kr == KERN_SUCCESS) { + vm_map_deallocate(shared_region->sr_config_map); + shared_region->sr_config_map = VM_MAP_NULL; + } + + if (kr == KERN_SUCCESS) { + vm_shared_region_seal(shared_region); + } + vm_shared_region_release(shared_region); SHARED_REGION_TRACE_DEBUG( ("shared_region: map(%p) <- 0x%x \n", @@ -2122,6 +2238,7 @@ done: kfree_type(struct shared_file_mapping_slide_np *, vmsr_num_slides, mappings_to_slide); } + vmlp_api_end(VM_SHARED_REGION_MAP_FILE, kr); return kr; } @@ -2218,75 +2335,19 @@ vm_shared_region_map_file_final( } primary_system_shared_region = shared_region; } - -#ifndef NO_NESTED_PMAP - /* - * If we succeeded, we know the bounds of the shared region. - * Trim our pmaps to only cover this range (if applicable to - * this platform). - */ - if (VM_MAP_PAGE_SHIFT(current_map()) == VM_MAP_PAGE_SHIFT(sr_map)) { - pmap_trim(current_map()->pmap, sr_map->pmap, sfm_min_address, sfm_max_address - sfm_min_address); - } -#endif } /* - * Retrieve a task's shared region and grab an extra reference to - * make sure it doesn't disappear while the caller is using it. - * The caller is responsible for consuming that extra reference if - * necessary. - * - * This also tries to trim the pmap for the shared region. - */ -vm_shared_region_t -vm_shared_region_trim_and_get(task_t task) -{ - vm_shared_region_t shared_region; - ipc_port_t sr_handle; - vm_named_entry_t sr_mem_entry; - vm_map_t sr_map; - - /* Get the shared region and the map. */ - shared_region = vm_shared_region_get(task); - if (shared_region == NULL) { - return NULL; - } - - sr_handle = shared_region->sr_mem_entry; - sr_mem_entry = mach_memory_entry_from_port(sr_handle); - sr_map = sr_mem_entry->backing.map; - -#ifndef NO_NESTED_PMAP - /* Trim the pmap if possible. */ - if (VM_MAP_PAGE_SHIFT(task->map) == VM_MAP_PAGE_SHIFT(sr_map)) { - pmap_trim(task->map->pmap, sr_map->pmap, 0, 0); - } -#endif - - return shared_region; -} - -/* - * Enter the appropriate shared region into "map" for "task". - * This involves looking up the shared region (and possibly creating a new - * one) for the desired environment, then mapping the VM sub map into the - * task's VM "map", with the appropriate level of pmap-nesting. + * Insert the real shared region submap entry into a task's VM map over the placeholder + * installed by vm_map_exec(). 
Note that this function can only be called once per vm_map, + * and cannot be undone. This is because it results in the shared region's pmap being nested + * into [map]'s pmap; on some platforms the security model requires this nesting relationship + * to be permanent, so the nested pmap cannot be "de-nested" from the top-level pmap or + * "re-nested" again into the same top-level pmap. */ kern_return_t -vm_shared_region_enter( - struct _vm_map *map, - struct task *task, - boolean_t is_64bit, - void *fsroot, - cpu_type_t cpu, - cpu_subtype_t cpu_subtype, - boolean_t reslide, - boolean_t is_driverkit, - uint32_t rsr_version) +vm_shared_region_insert_submap(vm_map_t map, vm_shared_region_t shared_region, bool overwrite) { - kern_return_t kr; - vm_shared_region_t shared_region; vm_map_offset_t sr_address, sr_offset, target_address; vm_map_size_t sr_size, mapping_size; vm_map_offset_t sr_pmap_nesting_start; @@ -2295,31 +2356,7 @@ vm_shared_region_enter( vm_prot_t cur_prot, max_prot; vm_map_kernel_flags_t vmk_flags; - SHARED_REGION_TRACE_DEBUG( - ("shared_region: -> " - "enter(map=%p,task=%p,root=%p,cpu=<%d,%d>,64bit=%d,driverkit=%d)\n", - (void *)VM_KERNEL_ADDRPERM(map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, cpu_subtype, is_64bit, is_driverkit)); - - /* lookup (create if needed) the shared region for this environment */ - shared_region = vm_shared_region_lookup(fsroot, cpu, cpu_subtype, is_64bit, VM_MAP_PAGE_SHIFT(map), reslide, is_driverkit, rsr_version); - if (shared_region == NULL) { - /* this should not happen ! */ - SHARED_REGION_TRACE_ERROR( - ("shared_region: -> " - "enter(map=%p,task=%p,root=%p,cpu=<%d,%d>,64bit=%d,reslide=%d,driverkit=%d): " - "lookup failed !\n", - (void *)VM_KERNEL_ADDRPERM(map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, cpu_subtype, is_64bit, reslide, is_driverkit)); - //panic("shared_region_enter: lookup failed"); - return KERN_FAILURE; - } - - kr = KERN_SUCCESS; + kern_return_t kr = KERN_SUCCESS; /* no need to lock since this data is never modified */ sr_address = (vm_map_offset_t)shared_region->sr_base_address; sr_size = (vm_map_size_t)shared_region->sr_size; @@ -2327,7 +2364,15 @@ vm_shared_region_enter( sr_pmap_nesting_start = (vm_map_offset_t)shared_region->sr_pmap_nesting_start; sr_pmap_nesting_size = (vm_map_size_t)shared_region->sr_pmap_nesting_size; vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED(); + if (overwrite) { + vmk_flags.vmf_overwrite = true; + vmk_flags.vmkf_overwrite_immutable = true; + } + /* + * vm_map_lookup_and_lock_object() expects the parent map entry + * for a shared region submap to have protections r-- by default. 
+ */ cur_prot = VM_PROT_READ; if (VM_MAP_POLICY_WRITABLE_SHARED_REGION(map)) { /* @@ -2366,24 +2411,20 @@ vm_shared_region_enter( VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( - ("shared_region: enter(%p,%p,%p,%d,%d,%d,%d,%d): " + ("shared_region: insert_submap(%p,%p): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, cpu_subtype, is_64bit, reslide, is_driverkit, + (void *)VM_KERNEL_ADDRPERM(shared_region), (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); - goto done; + return kr; } SHARED_REGION_TRACE_DEBUG( - ("shared_region: enter(%p,%p,%p,%d,%d,%d,%d,%d): " + ("shared_region: insert_submap(%p,%p): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, cpu_subtype, is_64bit, reslide, is_driverkit, + (void *)VM_KERNEL_ADDRPERM(shared_region), (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); sr_offset += mapping_size; @@ -2417,24 +2458,20 @@ vm_shared_region_enter( VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( - ("shared_region: enter(%p,%p,%p,%d,%d,%d,%d,%d): " + ("shared_region: insert_submap(%p,%p): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, cpu_subtype, is_64bit, reslide, is_driverkit, + (void *)VM_KERNEL_ADDRPERM(shared_region), (long long)target_address, (long long)sr_pmap_nesting_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); - goto done; + return kr; } SHARED_REGION_TRACE_DEBUG( - ("shared_region: enter(%p,%p,%p,%d,%d,%d,%d,%d): " + ("shared_region: insert_submap(%p,%p): " "nested vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, cpu_subtype, is_64bit, reslide, is_driverkit, + (void *)VM_KERNEL_ADDRPERM(shared_region), (long long)target_address, (long long)sr_pmap_nesting_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); @@ -2460,24 +2497,20 @@ vm_shared_region_enter( VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( - ("shared_region: enter(%p,%p,%p,%d,%d,%d,%d,%d): " + ("shared_region: insert_submap(%p,%p): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, cpu_subtype, is_64bit, reslide, is_driverkit, + (void *)VM_KERNEL_ADDRPERM(shared_region), (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); - goto done; + return kr; } SHARED_REGION_TRACE_DEBUG( - ("shared_region: enter(%p,%p,%p,%d,%d,%d,%d,%d): " + ("shared_region: insert_submap(%p,%p): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, cpu_subtype, is_64bit, reslide, is_driverkit, + (void *)VM_KERNEL_ADDRPERM(shared_region), (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); sr_offset += mapping_size; @@ -2485,7 +2518,89 @@ vm_shared_region_enter( } assert(sr_size == 0); -done: + return kr; +} + +/* + * Inserts a VM_PROT_NONE placeholder covering the shared region into [map]. 
+ * This is intended to be called when a new task is exec'ed and initially associated + * with a shared region. Once the userspace dyld initialization sequence successfully + * queries the shared region start address via the shared_region_check_np syscall, + * this placeholder will be replaced with the real shared region submap entry. + */ +static kern_return_t +vm_shared_region_insert_placeholder(vm_map_t map, vm_shared_region_t shared_region) +{ + vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(); + + vm_map_offset_t address = shared_region->sr_base_address; + + pmap_set_shared_region(map->pmap, vm_shared_region_vm_map(shared_region)->pmap, + address, shared_region->sr_size); + + return vm_map_enter( + map, + &address, + shared_region->sr_size, + (vm_map_offset_t)0, + vmk_flags, + VM_OBJECT_NULL, + (vm_object_offset_t)0, + FALSE, + VM_PROT_NONE, + VM_PROT_NONE, + VM_INHERIT_COPY); +} + +/* + * Enter the appropriate shared region into "map" for "task". + * This involves looking up the shared region (and possibly creating a new + * one) for the desired environment, then entering a permanent placeholder + * entry for the shared region. If the task actually chooses to map a + * shared region, this placeholder will later be overwritten by a submap + * entry for the real shared region in vm_shared_region_insert_submap(). + */ +kern_return_t +vm_shared_region_enter( + struct _vm_map *map, + struct task *task, + boolean_t is_64bit, + void *fsroot, + cpu_type_t cpu, + cpu_subtype_t cpu_subtype, + boolean_t reslide, + boolean_t is_driverkit, + uint32_t rsr_version) +{ + kern_return_t kr; + vm_shared_region_t shared_region; + + SHARED_REGION_TRACE_DEBUG( + ("shared_region: -> " + "enter(map=%p,task=%p,root=%p,cpu=<%d,%d>,64bit=%d,driverkit=%d)\n", + (void *)VM_KERNEL_ADDRPERM(map), + (void *)VM_KERNEL_ADDRPERM(task), + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, cpu_subtype, is_64bit, is_driverkit)); + + /* lookup (create if needed) the shared region for this environment */ + shared_region = vm_shared_region_lookup(fsroot, cpu, cpu_subtype, is_64bit, VM_MAP_PAGE_SHIFT(map), reslide, is_driverkit, rsr_version); + if (shared_region == NULL) { + /* this should not happen ! 
*/ + SHARED_REGION_TRACE_ERROR( + ("shared_region: -> " + "enter(map=%p,task=%p,root=%p,cpu=<%d,%d>,64bit=%d,reslide=%d,driverkit=%d): " + "lookup failed !\n", + (void *)VM_KERNEL_ADDRPERM(map), + (void *)VM_KERNEL_ADDRPERM(task), + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, cpu_subtype, is_64bit, reslide, is_driverkit)); + //panic("shared_region_enter: lookup failed"); + return KERN_FAILURE; + } + + kr = vm_shared_region_insert_placeholder(map, shared_region); + if (kr == KERN_SUCCESS) { /* let the task use that shared region */ vm_shared_region_set(task, shared_region); @@ -2632,7 +2747,7 @@ vm_shared_region_slide_mapping( } error = copyin(slide_info_addr, slide_info_entry, (size_t)slide_info_size); if (error) { - printf("copyin of slide_info failed\n"); + printf("copyin of slide_info (%p) failed\n", (void*)slide_info_addr); kr = KERN_INVALID_ADDRESS; goto done; } @@ -3284,6 +3399,10 @@ vm_shared_region_slide_page_v3( bool isBind = (value & (1ULL << 62)) != 0; if (isBind) { +#if CONFIG_SPTM + pmap_batch_sign_user_ptr(NULL, NULL, 0, 0, 0); + assert(preemption_enabled()); +#endif /* CONFIG_SPTM */ return KERN_FAILURE; } @@ -3310,11 +3429,18 @@ vm_shared_region_slide_page_v3( } if (jop_key != 0 && si->si_ptrauth && !arm_user_jop_disabled()) { +#if CONFIG_SPTM + pmap_batch_sign_user_ptr(rebaseLocation, (void *)value, key, discriminator, jop_key); +#else /* CONFIG_SPTM */ /* * these pointers are used in user mode. disable the kernel key diversification * so we can sign them for use in user mode. */ value = (uintptr_t)pmap_sign_user_ptr((void *)value, key, discriminator, jop_key); + memcpy(rebaseLocation, &value, sizeof(value)); +#endif /* CONFIG_SPTM */ + } else { + memcpy(rebaseLocation, &value, sizeof(value)); } #endif /* __has_feature(ptrauth_calls) */ } else { @@ -3326,11 +3452,16 @@ vm_shared_region_slide_page_v3( uint64_t bottom43Bits = value & 0x000007FFFFFFFFFFULL; uint64_t targetValue = (top8Bits << 13) | bottom43Bits; value = targetValue + slide_amount; + memcpy(rebaseLocation, &value, sizeof(value)); } - - memcpy(rebaseLocation, &value, sizeof(value)); } while (delta != 0); +#if CONFIG_SPTM + /* Sign the leftovers if there's any. */ + pmap_batch_sign_user_ptr(NULL, NULL, 0, 0, 0); + assert(preemption_enabled()); +#endif /* CONFIG_SPTM */ + return KERN_SUCCESS; } @@ -3517,21 +3648,33 @@ vm_shared_region_slide_page_v5( } if (jop_key != 0 && si->si_ptrauth && !arm_user_jop_disabled()) { +#if CONFIG_SPTM + pmap_batch_sign_user_ptr(rebaseLocation, (void *)value, key, discriminator, jop_key); +#else /* CONFIG_SPTM */ /* * these pointers are used in user mode. disable the kernel key diversification * so we can sign them for use in user mode. */ value = (uintptr_t)pmap_sign_user_ptr((void *)value, key, discriminator, jop_key); + memcpy(rebaseLocation, &value, sizeof(value)); +#endif /* CONFIG_SPTM */ + } else { + memcpy(rebaseLocation, &value, sizeof(value)); } #endif /* __has_feature(ptrauth_calls) */ } else { // the value already has the correct low bits, so just add in the high8 if it exists value += high8; + memcpy(rebaseLocation, &value, sizeof(value)); } - - memcpy(rebaseLocation, &value, sizeof(value)); } while (delta != 0); +#if CONFIG_SPTM + /* Sign the leftovers if there's any. 
*/ + pmap_batch_sign_user_ptr(NULL, NULL, 0, 0, 0); + assert(preemption_enabled()); +#endif /* CONFIG_SPTM */ + return KERN_SUCCESS; } @@ -3603,6 +3746,7 @@ _vm_commpage_init( panic("_vm_commpage_init: could not allocate pmap"); } new_map = vm_map_create_options(new_pmap, 0, size, VM_MAP_CREATE_DEFAULT); + new_map->vmmap_sealed = VM_MAP_WILL_BE_SEALED; mem_entry = mach_memory_entry_allocate(handlep); mem_entry->backing.map = new_map; diff --git a/osfmk/vm/vm_shared_region_internal.h b/osfmk/vm/vm_shared_region_internal.h index 8f6dcf654..d141cb115 100644 --- a/osfmk/vm/vm_shared_region_internal.h +++ b/osfmk/vm/vm_shared_region_internal.h @@ -71,8 +71,7 @@ extern vm_shared_region_t vm_shared_region_lookup( uint32_t rsr_version); extern kern_return_t vm_shared_region_start_address( struct vm_shared_region *shared_region, - mach_vm_offset_t *start_address, - task_t task); + mach_vm_offset_t *start_address); extern void vm_shared_region_undo_mappings( vm_map_t sr_map, mach_vm_offset_t sr_base_address, diff --git a/osfmk/vm/vm_shared_region_pager.c b/osfmk/vm/vm_shared_region_pager.c index 09e7cc701..9856b5dc8 100644 --- a/osfmk/vm/vm_shared_region_pager.c +++ b/osfmk/vm/vm_shared_region_pager.c @@ -46,9 +46,6 @@ #include #include -#include -#include - #include #include #include @@ -1173,7 +1170,7 @@ shared_region_pager_create( * The vm_map call takes both named entry ports and raw memory * objects in the same parameter. We need to make sure that * vm_map does not see this object as a named entry port. So, - * we reserve the first word in the object for a fake ip_kotype + * we reserve the first word in the object for a fake object type * setting - that will tell vm_map to use it as a memory object. */ pager->srp_header.mo_ikot = IKOT_MEMORY_OBJECT; diff --git a/osfmk/vm/vm_shared_region_xnu.h b/osfmk/vm/vm_shared_region_xnu.h index f2c9efd0c..7755e995a 100644 --- a/osfmk/vm/vm_shared_region_xnu.h +++ b/osfmk/vm/vm_shared_region_xnu.h @@ -188,6 +188,7 @@ struct vm_shared_region { cpu_type_t sr_cpu_type; cpu_subtype_t sr_cpu_subtype; ipc_port_t sr_mem_entry; + vm_map_t sr_config_map; mach_vm_offset_t sr_first_mapping; mach_vm_offset_t sr_base_address; mach_vm_size_t sr_size; @@ -230,8 +231,6 @@ extern uint64_t shared_region_find_key(char *shared_region_id); extern vm_shared_region_t vm_shared_region_get( struct task *task); -extern vm_shared_region_t vm_shared_region_trim_and_get( - struct task *task); extern void vm_shared_region_deallocate( struct vm_shared_region *shared_region); extern void vm_shared_region_set( @@ -241,6 +240,10 @@ extern kern_return_t vm_shared_region_sliding_valid(uint32_t slide); extern void vm_commpage_init(void); extern void vm_commpage_text_init(void); extern void vm_shared_region_reslide_stale(boolean_t driverkit); +extern kern_return_t vm_shared_region_update_task( + struct task *task, + struct vm_shared_region *shared_region, + mach_vm_offset_t start_address); #endif /* XNU_KERNEL_PRIVATE */ __END_DECLS diff --git a/osfmk/vm/vm_swapfile_pager.c b/osfmk/vm/vm_swapfile_pager.c index 800fe402e..71a4ce446 100644 --- a/osfmk/vm/vm_swapfile_pager.c +++ b/osfmk/vm/vm_swapfile_pager.c @@ -335,7 +335,7 @@ swapfile_pager_data_request( * destination physical page when it's its turn to be processed. 
*/ kr = kmem_alloc(kernel_map, &kernel_mapping, PAGE_SIZE, - KMA_DATA | KMA_KOBJECT | KMA_PAGEABLE, VM_KERN_MEMORY_NONE); + KMA_DATA_SHARED | KMA_KOBJECT | KMA_PAGEABLE, VM_KERN_MEMORY_NONE); if (kr != KERN_SUCCESS) { retval = kr; goto done; @@ -688,7 +688,7 @@ swapfile_pager_create( * The vm_map call takes both named entry ports and raw memory * objects in the same parameter. We need to make sure that * vm_map does not see this object as a named entry port. So, - * we reserve the second word in the object for a fake ip_kotype + * we reserve the second word in the object for a fake object type * setting - that will tell vm_map to use it as a memory object. */ pager->swp_pgr_hdr.mo_ikot = IKOT_MEMORY_OBJECT; diff --git a/osfmk/vm/vm_tests.c b/osfmk/vm/vm_tests.c index eec86e072..1e6a76b8a 100644 --- a/osfmk/vm/vm_tests.c +++ b/osfmk/vm/vm_tests.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -46,7 +47,7 @@ #include #include #include -#include +#include #include #include #include @@ -55,10 +56,12 @@ #include #include #include +#include #include #include +#include #include /* for the sysctl tests */ #include /* for testing-related functions and macros */ @@ -102,14 +105,14 @@ vm_test_collapse_compressor(void) /* create backing object */ backing_size = 15 * PAGE_SIZE; - backing_object = vm_object_allocate(backing_size); + backing_object = vm_object_allocate(backing_size, kernel_map->serial_id); assert(backing_object != VM_OBJECT_NULL); printf("VM_TEST_COLLAPSE_COMPRESSOR: created backing object %p\n", backing_object); /* map backing object */ backing_offset = 0; kr = vm_map_enter(kernel_map, &backing_offset, backing_size, 0, - VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(), + VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(), backing_object, 0, FALSE, VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); assert(kr == KERN_SUCCESS); @@ -152,14 +155,14 @@ vm_test_collapse_compressor(void) /* create top object */ top_size = 9 * PAGE_SIZE; - top_object = vm_object_allocate(top_size); + top_object = vm_object_allocate(top_size, backing_object->vmo_provenance); assert(top_object != VM_OBJECT_NULL); printf("VM_TEST_COLLAPSE_COMPRESSOR: created top object %p\n", top_object); /* map top object */ top_offset = 0; kr = vm_map_enter(kernel_map, &top_offset, top_size, 0, - VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(), + VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(), top_object, 0, FALSE, VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); assert(kr == KERN_SUCCESS); @@ -368,9 +371,12 @@ vm_test_page_wire_overflow_panic(void) printf("VM_TEST_PAGE_WIRE_OVERFLOW_PANIC: starting...\n"); - object = vm_object_allocate(PAGE_SIZE); + object = vm_object_allocate(PAGE_SIZE, VM_MAP_SERIAL_NONE); + while ((page = vm_page_grab()) == VM_PAGE_NULL) { + VM_PAGE_WAIT(); + } vm_object_lock(object); - page = vm_page_alloc(object, 0x0); + vm_page_insert(page, object, 0); vm_page_lock_queues(); do { vm_page_wire(page, 1, FALSE); @@ -425,7 +431,7 @@ vm_test_device_pager_transpose(void) kern_return_t kr; size = 3 * PAGE_SIZE; - anon_object = vm_object_allocate(size); + anon_object = vm_object_allocate(size, kernel_map->serial_id); assert(anon_object != VM_OBJECT_NULL); device_pager = device_pager_setup(NULL, 0, size, 0); assert(device_pager != NULL); @@ -449,7 +455,7 @@ vm_test_device_pager_transpose(void) vm_sanitize_wrap_addr_ref(&device_mapping), size, 0, - VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(), + VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(), (void *)device_pager, 0, FALSE, @@ -872,17 +878,17 @@ 
vm_test_map_copy_adjust_to_target(void) vm_map_set_page_shift(map16k, 14); /* create 4 VM objects */ - obj1 = vm_object_allocate(0x100000); - obj2 = vm_object_allocate(0x100000); - obj3 = vm_object_allocate(0x100000); - obj4 = vm_object_allocate(0x100000); + obj1 = vm_object_allocate(0x100000, map4k->serial_id); + obj2 = vm_object_allocate(0x100000, map4k->serial_id); + obj3 = vm_object_allocate(0x100000, map4k->serial_id); + obj4 = vm_object_allocate(0x100000, map4k->serial_id); /* map objects in 4k map */ vm_object_reference(obj1); addr4k = 0x1000; size4k = 0x3000; kr = vm_map_enter(map4k, &addr4k, size4k, 0, - VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(), obj1, 0, + VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(), obj1, 0, FALSE, VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); assert(kr == KERN_SUCCESS); @@ -893,7 +899,7 @@ vm_test_map_copy_adjust_to_target(void) addr16k = 0x4000; size16k = 0x8000; kr = vm_map_enter(map16k, &addr16k, size16k, 0, - VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(), obj1, 0, + VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(), obj1, 0, FALSE, VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); assert(kr == KERN_SUCCESS); @@ -1066,7 +1072,7 @@ vm_test_per_mapping_internal_accounting(void) kr = ledger_get_balance(ledger, task_ledgers.internal, &balance); assertf(kr == KERN_SUCCESS, "kr=0x%x", kr); assertf(balance == 0, "balance=0x%llx", balance); - device_object = vm_object_allocate(PAGE_SIZE); + device_object = vm_object_allocate(PAGE_SIZE, kernel_map->serial_id); assert(device_object); vm_object_lock(device_object); VM_OBJECT_SET_PRIVATE(device_object, TRUE); @@ -1091,7 +1097,7 @@ vm_test_per_mapping_internal_accounting(void) &device_addr, PAGE_SIZE, 0, - VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(), + VM_MAP_KERNEL_FLAGS_DATA_SHARED_ANYWHERE(), device_object, 0, FALSE, /* copy */ @@ -1218,7 +1224,7 @@ vm_test_collapse_overflow(void) /* create an object for which (int)(size>>PAGE_SHIFT) = 0 */ size = 0x400000000000ULL; assert((int)(size >> PAGE_SHIFT) == 0); - backing_object = vm_object_allocate(size + PAGE_SIZE); + backing_object = vm_object_allocate(size + PAGE_SIZE, VM_MAP_SERIAL_NONE); assert(backing_object); vm_object_reference(backing_object); /* insert a page */ @@ -1234,7 +1240,7 @@ vm_test_collapse_overflow(void) vm_page_insert(m, backing_object, 0); vm_object_unlock(backing_object); /* make it back another object */ - object = vm_object_allocate(size); + object = vm_object_allocate(size, VM_MAP_SERIAL_NONE); assert(object); vm_object_reference(object); object->shadow = backing_object; @@ -1389,51 +1395,6 @@ vm_tests(void) return kr; } -/* - * Checks that vm_map_delete() can deal with map unaligned entries. 
- * rdar://88969652 - */ -static int -vm_map_non_aligned_test(__unused int64_t in, int64_t *out) -{ - vm_map_t map = current_map(); - mach_vm_size_t size = 2 * VM_MAP_PAGE_SIZE(map); - mach_vm_address_t addr; - vm_map_entry_t entry; - kern_return_t kr; - - if (VM_MAP_PAGE_SHIFT(map) > PAGE_SHIFT) { - kr = mach_vm_allocate(map, &addr, size, VM_FLAGS_ANYWHERE); - if (kr != KERN_SUCCESS) { - return ENOMEM; - } - - vm_map_lock(map); - if (!vm_map_lookup_entry(map, addr, &entry)) { - panic("couldn't find the entry we just made: " - "map:%p addr:0x%0llx", map, addr); - } - - /* - * Now break the entry into: - * 2 * 4k - * 2 * 4k - * 1 * 16k - */ - vm_map_clip_end(map, entry, addr + VM_MAP_PAGE_SIZE(map)); - entry->map_aligned = FALSE; - vm_map_clip_end(map, entry, addr + PAGE_SIZE * 2); - vm_map_unlock(map); - - kr = mach_vm_deallocate(map, addr, size); - assert(kr == KERN_SUCCESS); - } - - *out = 1; - return 0; -} -SYSCTL_TEST_REGISTER(vm_map_non_aligned, vm_map_non_aligned_test); - static inline vm_map_t create_map(mach_vm_address_t map_start, mach_vm_address_t map_end) { @@ -1444,6 +1405,13 @@ create_map(mach_vm_address_t map_start, mach_vm_address_t map_end) vm_map_t map = vm_map_create_options(pmap, map_start, map_end, VM_MAP_CREATE_PAGEABLE);//vm_compute_max_offset assert(map); +#if CONFIG_SPTM + /* Ensure the map serial looks fine */ + if (map->serial_id != pmap->associated_vm_map_serial_id) { + panic("Expected a map and its pmap to have exactly the same serial"); + } +#endif /* CONFIG_SPTM */ + return map; } @@ -1827,60 +1795,6 @@ vm_map_null_tests(__unused int64_t in, int64_t *out) } SYSCTL_TEST_REGISTER(vm_map_null, vm_map_null_tests); -#if CONFIG_PROB_GZALLOC -extern vm_offset_t pgz_protect_for_testing_only(zone_t zone, vm_offset_t addr, void *fp); - -static int -vm_memory_entry_pgz_test(__unused int64_t in, int64_t *out) -{ - kern_return_t kr; - ipc_port_t mem_entry_ptr; - mach_vm_address_t allocation_addr = 0; - vm_size_t size = PAGE_SIZE; - - allocation_addr = (mach_vm_address_t) kalloc_data(size, Z_WAITOK); - if (!allocation_addr) { - *out = -1; - return 0; - } - - /* - * Make sure we get a pgz protected address - * If we aren't already protected, try to protect it - */ - if (!pgz_owned(allocation_addr)) { - zone_id_t zid = zone_id_for_element((void *) allocation_addr, size); - zone_t zone = &zone_array[zid]; - allocation_addr = pgz_protect_for_testing_only(zone, allocation_addr, __builtin_frame_address(0)); - } - /* - * If we still aren't protected, tell userspace to skip the test - */ - if (!pgz_owned(allocation_addr)) { - *out = 2; - return 0; - } - - kr = mach_make_memory_entry(kernel_map, &size, (mach_vm_offset_t) allocation_addr, VM_PROT_READ | VM_PROT_WRITE | MAP_MEM_VM_COPY, &mem_entry_ptr, IPC_PORT_NULL); - assert(kr == KERN_SUCCESS); - - ipc_port_release(mem_entry_ptr); - kfree_data(allocation_addr, size); - - *out = 1; - return 0; -} -#else /* CONFIG_PROB_GZALLOC */ -static int -vm_memory_entry_pgz_test(__unused int64_t in, int64_t *out) -{ - *out = 1; - return 0; -} -#endif /* CONFIG_PROB_GZALLOC */ - -SYSCTL_TEST_REGISTER(vm_memory_entry_pgz, vm_memory_entry_pgz_test); - static int vm_map_copyio_test(__unused int64_t in, int64_t *out) @@ -2248,10 +2162,801 @@ vm_get_wimg_mode(int64_t in, int64_t *out) return ENOTSUP; } + *out = 0; vm_object_t obj = VME_OBJECT(entry); - *out = obj->wimg_bits; + if (obj != VM_OBJECT_NULL) { + *out = obj->wimg_bits; + } vm_map_unlock_read(map); return 0; } SYSCTL_TEST_REGISTER(vm_get_wimg_mode, vm_get_wimg_mode); + +/* + * Make sure 
copies from 4k->16k maps doesn't lead to address space holes + */ +static int +vm_map_4k_16k_test(int64_t in, int64_t *out) +{ +#if PMAP_CREATE_FORCE_4K_PAGES + const mach_vm_size_t alloc_size = (36 * 1024); + assert((alloc_size % FOURK_PAGE_SHIFT) == 0); + assert((alloc_size % SIXTEENK_PAGE_SHIFT) != 0); + assert(alloc_size > msg_ool_size_small); // avoid kernel buffer copy optimization + + /* initialize maps */ + pmap_t pmap_4k, pmap_16k; + vm_map_t map_4k, map_16k; + pmap_4k = pmap_create_options(NULL, 0, PMAP_CREATE_64BIT | PMAP_CREATE_FORCE_4K_PAGES); + assert(pmap_4k); + map_4k = vm_map_create_options(pmap_4k, MACH_VM_MIN_ADDRESS, MACH_VM_MAX_ADDRESS, VM_MAP_CREATE_PAGEABLE); + assert(map_4k != VM_MAP_NULL); + vm_map_set_page_shift(map_4k, FOURK_PAGE_SHIFT); + + pmap_16k = pmap_create_options(NULL, 0, PMAP_CREATE_64BIT); + assert(pmap_16k); + map_16k = vm_map_create_options(pmap_16k, MACH_VM_MIN_ADDRESS, MACH_VM_MAX_ADDRESS, VM_MAP_CREATE_PAGEABLE); + assert(map_16k != VM_MAP_NULL); + assert(VM_MAP_PAGE_SHIFT(map_16k) == SIXTEENK_PAGE_SHIFT); + + /* create mappings in 4k map */ + /* allocate space */ + vm_address_t address_4k; + kern_return_t kr = vm_allocate_external(map_4k, &address_4k, alloc_size, VM_FLAGS_ANYWHERE); + assert3u(kr, ==, KERN_SUCCESS); /* reserve space for 4k entries in 4k map */ + + /* overwrite with a bunch of 4k entries */ + for (mach_vm_address_t addr = address_4k; addr < (address_4k + alloc_size); addr += FOURK_PAGE_SIZE) { + /* allocate 128MB objects, so that they don't get coalesced, preventing entry simplification */ + vm_object_t object = vm_object_allocate(ANON_CHUNK_SIZE, map_4k->serial_id); + kr = vm_map_enter(map_4k, &addr, FOURK_PAGE_SIZE, /* mask */ 0, + VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = TRUE), object, /* offset */ 0, + /* copy */ false, VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); + assert3u(kr, ==, KERN_SUCCESS); /* overwrite the 4k chunk at addr with its own entry */ + } + + /* set up vm_map_copy_t */ + vm_map_copy_t copy; + kr = vm_map_copyin(map_4k, address_4k, alloc_size, true, ©); + assert3u(kr, ==, KERN_SUCCESS); /* copyin from 4k map succeeds */ + + /* write out the vm_map_copy_t to the 16k map */ + vm_address_t address_16k; + if (in == 0) { + /* vm_map_copyout */ + vm_map_address_t tmp_address; + kr = vm_map_copyout(map_16k, &tmp_address, copy); + assert3u(kr, ==, KERN_SUCCESS); /* copyout into 16k map suceeds */ + address_16k = (vm_address_t)tmp_address; + } else if (in == 1) { + /* vm_map_copy_overwrite */ + /* reserve space */ + kr = vm_allocate_external(map_16k, &address_16k, alloc_size, VM_FLAGS_ANYWHERE); + assert3u(kr, ==, KERN_SUCCESS); /* reserve space in 16k map succeeds */ + + /* do the overwrite */ + kr = vm_map_copy_overwrite(map_16k, address_16k, copy, alloc_size, + true); + assert3u(kr, ==, KERN_SUCCESS); /* copy_overwrite into 16k map succeds */ + } else { + panic("invalid vm_map_4k_16k_test variant: %lld", in); + } + + /* validate that everything is combined into one large 16k-aligned entry */ + mach_vm_size_t expected_size = VM_MAP_ROUND_PAGE(alloc_size, SIXTEENK_PAGE_MASK); + vm_map_lock_read(map_16k); + vm_map_entry_t entry; + bool address_in_map = vm_map_lookup_entry(map_16k, address_16k, &entry); + assert(address_in_map); /* address_16k found in map_16k */ + assert3u((entry->vme_end - entry->vme_start), ==, expected_size); /* 4k entries combined into a single 16k entry */ + vm_map_unlock_read(map_16k); +#else /* !PMAP_CREATE_FORCE_4K_PAGES */ + (void)in; +#endif /* !PMAP_CREATE_FORCE_4K_PAGES */ + 
*out = 1; + return 0; +} +SYSCTL_TEST_REGISTER(vm_map_4k_16k, vm_map_4k_16k_test); + +static int +vm_vector_upl_test(int64_t in, int64_t *out) +{ + extern upl_t vector_upl_create(vm_offset_t, uint32_t); + extern boolean_t vector_upl_set_subupl(upl_t, upl_t, uint32_t); + + upl_t vector_upl = NULL; + vm_address_t kva = 0; + + *out = 0; + + struct { + uint64_t iov; + uint16_t iovcnt; + } args; + + struct { + uint64_t base; + uint32_t len; + } *iov; + + size_t iovsize = 0; + iov = NULL; + + int error = copyin((user_addr_t)in, &args, sizeof(args)); + if ((error != 0) || (args.iovcnt == 0)) { + goto vector_upl_test_done; + } + + iovsize = sizeof(*iov) * args.iovcnt; + + iov = kalloc_data(iovsize, Z_WAITOK_ZERO); + if (iov == NULL) { + error = ENOMEM; + goto vector_upl_test_done; + } + + error = copyin((user_addr_t)args.iov, iov, iovsize); + if (error != 0) { + goto vector_upl_test_done; + } + + vector_upl = vector_upl_create(iov->base & PAGE_MASK, args.iovcnt); + upl_size_t vector_upl_size = 0; + + /* Create each sub-UPL and append it to the top-level vector UPL. */ + for (uint16_t i = 0; i < args.iovcnt; i++) { + upl_t subupl; + upl_size_t upl_size = iov[i].len; + unsigned int upl_count = 0; + upl_control_flags_t upl_flags = UPL_SET_IO_WIRE | UPL_SET_LITE | UPL_WILL_MODIFY | UPL_SET_INTERNAL; + kern_return_t kr = vm_map_create_upl(current_map(), + (vm_map_offset_t)iov[i].base, + &upl_size, + &subupl, + NULL, + &upl_count, + &upl_flags, + VM_KERN_MEMORY_DIAG); + if (kr != KERN_SUCCESS) { + printf("vm_map_create_upl[%d](%p, 0x%lx) returned 0x%x\n", + (int)i, (void*)iov[i].base, (unsigned long)iov[i].len, kr); + error = EIO; + goto vector_upl_test_done; + } + /* This effectively transfers our reference to subupl over to vector_upl. */ + vector_upl_set_subupl(vector_upl, subupl, upl_size); + vector_upl_set_iostate(vector_upl, subupl, vector_upl_size, upl_size); + vector_upl_size += upl_size; + } + + /* Map the vector UPL as a single KVA region and modify the page contents by adding 1 to each char. */ + kern_return_t kr = vm_upl_map(kernel_map, vector_upl, &kva); + if (kr != KERN_SUCCESS) { + error = ENOMEM; + goto vector_upl_test_done; + } + + char *buf = (char*)kva; + for (upl_size_t i = 0; i < vector_upl_size; i++) { + buf[i] = buf[i] + 1; + } + *out = (int64_t)vector_upl_size; + +vector_upl_test_done: + + if (kva != 0) { + vm_upl_unmap(kernel_map, vector_upl); + } + + if (vector_upl != NULL) { + /* Committing the vector UPL will release and deallocate each of its sub-UPLs. 
*/ + upl_commit(vector_upl, NULL, 0); + upl_deallocate(vector_upl); + } + + if (iov != NULL) { + kfree_data(iov, iovsize); + } + + return error; +} +SYSCTL_TEST_REGISTER(vm_vector_upl, vm_vector_upl_test); + +/* + * Test that wiring copy delay memory pushes pages to its copy object + */ +static int +vm_map_wire_copy_delay_memory_test(__unused int64_t in, int64_t *out) +{ + kern_return_t kr; + vm_map_t map; + mach_vm_address_t address_a, address_b, address_c; + vm_prot_t cur_prot, max_prot; + vm_map_entry_t entry; + vm_object_t object; + vm_page_t m; + bool result; + + T_BEGIN("vm_map_wire_copy_delay_memory_test"); + map = create_map(0x100000000ULL, 0x200000000ULL); + + address_a = 0; + kr = mach_vm_allocate( + map, + &address_a, + /* size */ PAGE_SIZE, + VM_FLAGS_ANYWHERE); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "mach_vm_allocate A"); + + address_b = 0; + kr = mach_vm_remap( + map, + &address_b, + /* size */ PAGE_SIZE, + /* mask */ 0, + VM_FLAGS_ANYWHERE, + map, + address_a, + /* copy */ FALSE, + &cur_prot, + &max_prot, + VM_INHERIT_NONE); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "mach_vm_remap A->B"); + + address_c = 0; + kr = mach_vm_remap( + map, + &address_c, + /* size */ PAGE_SIZE, + /* mask */ 0, + VM_FLAGS_ANYWHERE, + map, + address_b, + /* copy */ TRUE, + &cur_prot, + &max_prot, + VM_INHERIT_NONE); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "mach_vm_remap B->C"); + + kr = mach_vm_protect( + map, + address_c, + /* size */ PAGE_SIZE, + /* set_max */ FALSE, + VM_PROT_READ); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "mach_vm_protect C"); + + kr = vm_map_wire_kernel( + map, + /* begin */ address_b, + /* end */ address_b + PAGE_SIZE, + VM_PROT_NONE, + VM_KERN_MEMORY_OSFMK, + false); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "vm_map_wire_kernel B"); + + vm_map_lock(map); + result = vm_map_lookup_entry(map, address_c, &entry); + T_ASSERT_EQ_INT(result, true, "vm_map_lookup_entry"); + + object = VME_OBJECT(entry); + T_ASSERT_NOTNULL(object, "C's object should not be null"); + vm_object_lock(object); + + m = vm_page_lookup(object, /* offset */ 0); + T_ASSERT_NOTNULL(m, "C should have a page pushed to it"); + + /* cleanup */ + vm_object_unlock(object); + vm_map_unlock(map); + cleanup_map(&map); + + T_END; + *out = 1; + return 0; +} +SYSCTL_TEST_REGISTER(vm_map_wire_copy_delay_memory, vm_map_wire_copy_delay_memory_test); + + +/* + * Compare the contents of an original userspace buffer with that kernel mapping of a UPL created + * against that userspace buffer. Also validate that the physical pages in the UPL's page list + * match the physical pages backing the kernel mapping at the pmap layer. Furthermore, if UPL creation + * was expected to copy the original buffer, validate that the backing pages for the userspace buffer + * don't match the kernel/UPL pages, otherwise validate that they do match. 
+ */ +static int +upl_buf_compare(user_addr_t src, upl_t upl, const void *upl_buf, upl_size_t size, bool copy_expected) +{ + int error = 0; + void *temp = kalloc_data(PAGE_SIZE, Z_WAITOK); + + upl_size_t i = 0; + while (i < size) { + size_t bytes = MIN(size - i, PAGE_SIZE); + error = copyin(src + i, temp, bytes); + if (!error && (memcmp(temp, (const void*)((uintptr_t)upl_buf + i), bytes) != 0)) { + printf("%s: memcmp(%p, %p, %zu) failed, src[0] = 0x%llx, buf[0] = 0x%llx\n", + __func__, (void*)(src + i), (const void*)((uintptr_t)upl_buf + i), bytes, *((unsigned long long*)temp), *((unsigned long long*)((uintptr_t)upl_buf + i))); + error = EINVAL; + } + if (!error) { + ppnum_t user_pa = pmap_find_phys(current_map()->pmap, (addr64_t)src + i); + ppnum_t upl_pa = pmap_find_phys(kernel_pmap, (addr64_t)upl_buf + i); + if ((upl_pa == 0) || /* UPL is wired, PA should always be valid */ + (!copy_expected && (upl_pa != user_pa)) || + (copy_expected && (upl_pa == user_pa)) || + (upl_pa != (upl->page_list[i >> PAGE_SHIFT].phys_addr))) { + printf("%s: PA verification[%u] failed: copy=%u, upl_pa = 0x%lx, user_pa = 0x%lx, page list PA = 0x%lx\n", + __func__, (unsigned)i, (unsigned)copy_expected, (unsigned long)upl_pa, (unsigned long)user_pa, + (unsigned long)upl->page_list[i].phys_addr); + error = EFAULT; + } + } + if (error) { + break; + } + i += bytes; + } + + kfree_data(temp, PAGE_SIZE); + + return error; +} + +static int +vm_upl_test(int64_t in, int64_t *out __unused) +{ + upl_t upl = NULL; + vm_address_t kva = 0; + + struct { + uint64_t ptr; /* Base address of buffer in userspace */ + uint32_t size; /* Size of userspace buffer (in bytes) */ + char test_pattern; /* Starting char of test pattern we should write (if applicable) */ + bool copy_expected; /* Is UPL creation expected to create a copy of the original buffer? */ + bool should_fail; /* Is UPL creation expected to fail due to permissions checking? */ + bool upl_rw; /* Should the UPL be created RW (!UPL_COPYOUT_FROM) instead of RO? 
*/ + } args; + int error = copyin((user_addr_t)in, &args, sizeof(args)); + if ((error != 0) || (args.size == 0)) { + goto upl_test_done; + } + + upl_size_t upl_size = args.size; + unsigned int upl_count = 0; + upl_control_flags_t upl_flags = UPL_SET_IO_WIRE | UPL_SET_LITE | UPL_SET_INTERNAL; + if (!args.upl_rw) { + upl_flags |= UPL_COPYOUT_FROM; + } else { + upl_flags |= UPL_WILL_MODIFY; + } + kern_return_t kr = vm_map_create_upl(current_map(), + (vm_map_offset_t)args.ptr, + &upl_size, + &upl, + NULL, + &upl_count, + &upl_flags, + VM_KERN_MEMORY_DIAG); + if (args.should_fail && (kr == KERN_PROTECTION_FAILURE)) { + goto upl_test_done; + } else if (args.should_fail && (kr == KERN_SUCCESS)) { + printf("%s: vm_map_create_upl(%p, 0x%lx) did not fail as expected\n", + __func__, (void*)args.ptr, (unsigned long)args.size); + error = EIO; + goto upl_test_done; + } else if (kr != KERN_SUCCESS) { + printf("%s: vm_map_create_upl(%p, 0x%lx) returned 0x%x\n", + __func__, (void*)args.ptr, (unsigned long)args.size, kr); + error = kr; + goto upl_test_done; + } + + kr = vm_upl_map(kernel_map, upl, &kva); + if (kr != KERN_SUCCESS) { + error = kr; + printf("%s: vm_upl_map() returned 0x%x\n", __func__, kr); + goto upl_test_done; + } + + /* Ensure the mapped UPL contents match the original user buffer contents */ + error = upl_buf_compare((user_addr_t)args.ptr, upl, (void*)kva, upl_size, args.copy_expected); + + if (error) { + printf("%s: upl_buf_compare(%p, %p, %zu) failed\n", + __func__, (void*)args.ptr, (void*)kva, (size_t)upl_size); + } + + if (!error && args.upl_rw) { + /* + * If the UPL is writable, update the contents so that userspace can + * validate that it sees the updates. + */ + for (unsigned int i = 0; i < (upl_size / sizeof(unsigned int)); i++) { + ((unsigned int*)kva)[i] = (unsigned int)args.test_pattern + i; + } + } + +upl_test_done: + + if (kva != 0) { + vm_upl_unmap(kernel_map, upl); + } + + if (upl != NULL) { + upl_commit(upl, NULL, 0); + upl_deallocate(upl); + } + + return error; +} +SYSCTL_TEST_REGISTER(vm_upl, vm_upl_test); + +static int +vm_upl_submap_test(int64_t in, int64_t *out __unused) +{ + vm_map_address_t start = 0x180000000ULL; + vm_map_address_t end = start + 0x180000000ULL; + + upl_t upl = NULL; + vm_address_t kva = 0; + int error = 0; + + /* + * Create temporary pmap and VM map for nesting our submap. + * We can't directly nest our submap into the current user map, because it will + * have already nested the shared region, and our security model doesn't allow + * multiple nested pmaps. + */ + pmap_t temp_pmap = pmap_create_options(NULL, 0, PMAP_CREATE_64BIT); + + vm_map_t temp_map = VM_MAP_NULL; + if (temp_pmap != PMAP_NULL) { + temp_map = vm_map_create_options(temp_pmap, 0, 0xfffffffffffff, 0); + } + + /* Now create the pmap and VM map that will back the submap entry in 'temp_map'. 
*/ + pmap_t nested_pmap = pmap_create_options(NULL, 0, PMAP_CREATE_64BIT | PMAP_CREATE_NESTED); + + vm_map_t nested_map = VM_MAP_NULL; + if (nested_pmap != PMAP_NULL) { +#if defined(__arm64__) + pmap_set_nested(nested_pmap); +#endif /* defined(__arm64__) */ +#if CODE_SIGNING_MONITOR + csm_setup_nested_address_space(nested_pmap, start, end - start); +#endif + nested_map = vm_map_create_options(nested_pmap, 0, end - start, 0); + } + + if (temp_map == VM_MAP_NULL || nested_map == VM_MAP_NULL) { + error = ENOMEM; + printf("%s: failed to create VM maps\n", __func__); + goto upl_submap_test_done; + } + + nested_map->is_nested_map = TRUE; + nested_map->vmmap_sealed = VM_MAP_WILL_BE_SEALED; + + struct { + uint64_t ptr; /* Base address of original buffer in userspace */ + uint64_t upl_base; /* Base address in 'temp_map' against which UPL should be created */ + uint32_t size; /* Size of userspace buffer in bytes */ + uint32_t upl_size; /* Size of UPL to create in bytes */ + bool upl_rw; /* Should the UPL be created RW (!UPL_COPYOUT_FROM) instead of RO? */ + } args; + error = copyin((user_addr_t)in, &args, sizeof(args)); + if ((error != 0) || (args.size == 0) || (args.upl_size == 0)) { + goto upl_submap_test_done; + } + + /* + * Remap the original userspace buffer into the nested map, with CoW protection. + * This will not actually instantiate new mappings in 'nested_pmap', but will instead create + * new shadow object of the original object for the userspace buffer in the nested map. + * Mappings would only be created in 'nested_pmap' upon a later non-CoW fault of the nested region, + * which we aren't doing here. That's fine, as we're not testing pmap functionality here; we + * only care that UPL creation produces the expected results at the VM map/entry level. + */ + mach_vm_offset_t submap_start = 0; + + vm_prot_ut remap_cur_prot = vm_sanitize_wrap_prot(VM_PROT_READ); + vm_prot_ut remap_max_prot = vm_sanitize_wrap_prot(VM_PROT_READ); + + kern_return_t kr = mach_vm_remap_new_kernel(nested_map, (mach_vm_offset_ut*)&submap_start, args.size, 0, + VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK), current_map(), args.ptr, TRUE, + &remap_cur_prot, &remap_max_prot, VM_INHERIT_NONE); + if (kr != KERN_SUCCESS) { + printf("%s: failed to remap source buffer to nested map: 0x%x\n", __func__, kr); + error = kr; + goto upl_submap_test_done; + } + + vm_map_seal(nested_map, true); + pmap_set_shared_region(temp_pmap, nested_pmap, start, end - start); + + /* Do the actual nesting. */ + vm_map_reference(nested_map); + kr = vm_map_enter(temp_map, &start, end - start, 0, + VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_submap = TRUE, .vmkf_nested_pmap = TRUE), (vm_object_t)(uintptr_t) nested_map, 0, + true, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, VM_INHERIT_DEFAULT); + + if (kr != KERN_SUCCESS) { + error = kr; + printf("%s: failed to enter nested map in test map: 0x%x\n", __func__, kr); + vm_map_deallocate(nested_map); + goto upl_submap_test_done; + } + + /* Validate that the nesting operation produced the expected submap entry in 'temp_map'. 
*/ + vm_map_entry_t submap_entry; + if (!vm_map_lookup_entry(temp_map, args.upl_base, &submap_entry) || !submap_entry->is_sub_map) { + error = ENOENT; + printf("%s: did not find submap entry at beginning up UPL region\n", __func__); + goto upl_submap_test_done; + } + + upl_size_t upl_size = args.upl_size; + unsigned int upl_count = 0; + upl_control_flags_t upl_flags = UPL_SET_IO_WIRE | UPL_SET_LITE | UPL_SET_INTERNAL; + if (!args.upl_rw) { + upl_flags |= UPL_COPYOUT_FROM; + } + kr = vm_map_create_upl(temp_map, + (vm_map_offset_t)args.upl_base, + &upl_size, + &upl, + NULL, + &upl_count, + &upl_flags, + VM_KERN_MEMORY_DIAG); + + if (kr != KERN_SUCCESS) { + error = kr; + printf("%s: failed to create UPL for submap: 0x%x\n", __func__, kr); + goto upl_submap_test_done; + } + + /* Validate that UPL creation unnested a portion of the submap entry. */ + if (!vm_map_lookup_entry(temp_map, args.upl_base, &submap_entry) || submap_entry->is_sub_map) { + error = ENOENT; + printf("%s: did not find non-submap entry at beginning up UPL region\n", __func__); + goto upl_submap_test_done; + } + + kr = vm_upl_map(kernel_map, upl, &kva); + if (kr != KERN_SUCCESS) { + error = kr; + goto upl_submap_test_done; + } + + /* + * Compare the original userspace buffer to the ultimate kernel mapping of the UPL. + * The unnesting and CoW faulting performed as part of UPL creation should have copied the original buffer + * pages, so we expect the two buffers to be backed by different pages. + */ + error = upl_buf_compare((user_addr_t)args.ptr + (args.upl_base - start), upl, (void*)kva, upl_size, true); + + if (!error) { + /* + * Now validate that the nested region in 'temp_map' matches the original buffer. + * The unnesting and CoW faulting performed as part of UPL creation should have acted directly + * upon 'temp_map', so the backing pages should be the same here. + */ + vm_map_switch_context_t switch_ctx = vm_map_switch_to(temp_map); + error = upl_buf_compare((user_addr_t)args.upl_base, upl, (void*)kva, upl_size, false); + vm_map_switch_back(switch_ctx); + } + +upl_submap_test_done: + + if (kva != 0) { + vm_upl_unmap(kernel_map, upl); + } + + if (upl != NULL) { + upl_commit(upl, NULL, 0); + upl_deallocate(upl); + } + + if (temp_map != VM_MAP_NULL) { + vm_map_deallocate(temp_map); + temp_pmap = PMAP_NULL; + } + if (nested_map != VM_MAP_NULL) { + vm_map_deallocate(nested_map); + nested_pmap = PMAP_NULL; + } + + if (temp_pmap != PMAP_NULL) { + pmap_destroy(temp_pmap); + } + if (nested_pmap != PMAP_NULL) { + pmap_destroy(nested_pmap); + } + + return error; +} +SYSCTL_TEST_REGISTER(vm_upl_submap, vm_upl_submap_test); + +#if CONFIG_SPTM + +static void +page_clean_timeout(thread_call_param_t param0, __unused thread_call_param_t param1) +{ + vm_page_t m = (vm_page_t)param0; + vm_object_t object = VM_PAGE_OBJECT(m); + vm_object_lock(object); + m->vmp_cleaning = false; + vm_page_wakeup(object, m); + vm_object_unlock(object); +} + +/** + * This sysctl is meant to exercise very specific functionality that can't be exercised through + * the normal vm_map_create_upl() path. It operates directly against the vm_object backing + * the specified address range, and does not take any locks against the VM map to guarantee + * stability of the specified address range. It is therefore meant to be used against + * VM regions directly allocated by the userspace caller and guaranteed to not be altered by + * other threads. The regular vm_upl/vm_upl_submap sysctls should be preferred over this + * if at all possible. 
+ */ +static int +vm_upl_object_test(int64_t in, int64_t *out __unused) +{ + upl_t upl = NULL; + + struct { + uint64_t ptr; /* Base address of buffer in userspace */ + uint32_t size; /* Size of userspace buffer (in bytes) */ + bool upl_rw; + bool should_fail; /* Is UPL creation expected to fail due to permissions checking? */ + bool exec_fault; + } args; + int error = copyin((user_addr_t)in, &args, sizeof(args)); + if ((error != 0) || (args.size == 0)) { + goto upl_object_test_done; + } + + upl_size_t upl_size = args.size; + unsigned int upl_count = 0; + upl_control_flags_t upl_flags = UPL_SET_IO_WIRE | UPL_SET_LITE | UPL_SET_INTERNAL; + if (!args.upl_rw) { + upl_flags |= UPL_COPYOUT_FROM; + } else { + upl_flags |= UPL_WILL_MODIFY; + } + + vm_map_entry_t entry; + vm_object_t object; + vm_page_t m __unused; + + if (!vm_map_lookup_entry(current_map(), args.ptr, &entry) || entry->is_sub_map) { + error = ENOENT; + printf("%s: did not find entry at beginning up UPL region\n", __func__); + goto upl_object_test_done; + } + + object = VME_OBJECT(entry); + if (object == VM_OBJECT_NULL) { + error = ENOENT; + printf("%s: No VM object associated with entry at beginning of UPL region\n", __func__); + goto upl_object_test_done; + } + + vm_object_reference(object); + + kern_return_t kr = vm_object_iopl_request(object, + (vm_object_offset_t)(args.ptr - entry->vme_start + VME_OFFSET(entry)), + upl_size, + &upl, + NULL, + &upl_count, + upl_flags, + VM_KERN_MEMORY_DIAG); + + if (args.exec_fault) { + /* + * The page may have already been retyped to its "final" executable type by a prior fault, + * so simulate a page recycle operation in order to ensure that our simulated exec fault below + * will attempt to retype it. + */ + vm_object_lock(object); + m = vm_page_lookup(object, (VME_OFFSET(entry) + ((vm_map_address_t)args.ptr - entry->vme_start))); + assert(m != VM_PAGE_NULL); + assert(m->vmp_iopl_wired); + ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(m); + pmap_disconnect(pn); + pmap_lock_phys_page(pn); + pmap_recycle_page(pn); + pmap_unlock_phys_page(pn); + assertf(pmap_will_retype(current_map()->pmap, (vm_map_address_t)args.ptr, VM_PAGE_GET_PHYS_PAGE(m), VM_PROT_EXECUTE | VM_PROT_READ, 0, PMAP_MAPPING_TYPE_INFER), + "pmap will not retype for vm_page_t %p", m); + vm_object_unlock(object); + } + + if (args.should_fail && (kr == KERN_PROTECTION_FAILURE)) { + goto upl_object_test_done; + } else if (args.should_fail && (kr == KERN_SUCCESS)) { + printf("%s: vm_object_iopl_request(%p, 0x%lx) did not fail as expected\n", + __func__, (void*)args.ptr, (unsigned long)args.size); + error = EIO; + goto upl_object_test_done; + } else if (kr != KERN_SUCCESS) { + printf("%s: vm_object_iopl_request(%p, 0x%lx) returned 0x%x\n", + __func__, (void*)args.ptr, (unsigned long)args.size, kr); + error = kr; + goto upl_object_test_done; + } + + if (args.exec_fault) { + kr = vm_fault(current_map(), + (vm_map_address_t)args.ptr, + VM_PROT_EXECUTE | VM_PROT_READ, + FALSE, + VM_KERN_MEMORY_NONE, + THREAD_UNINT, + NULL, + 0); + /* Exec page retype attempt with in-flight IOPL should be forbidden. */ + if (kr != KERN_PROTECTION_FAILURE) { + printf("%s: vm_fault(%p) did not fail as expected\n", __func__, (void*)args.ptr); + error = ((kr == KERN_SUCCESS) ? 
EIO : kr); + goto upl_object_test_done; + } + assertf(pmap_will_retype(current_map()->pmap, (vm_map_address_t)args.ptr, VM_PAGE_GET_PHYS_PAGE(m), VM_PROT_EXECUTE | VM_PROT_READ, 0, PMAP_MAPPING_TYPE_INFER), + "pmap will not retype for vm_page_t %p", m); + } + +upl_object_test_done: + + if (upl != NULL) { + upl_commit(upl, NULL, 0); + upl_deallocate(upl); + } + + if ((error == 0) && args.exec_fault) { + /* + * Exec page retype attempt without in-flight IOPL should ultimately succeed, but should + * block if the page is being cleaned. Simulate that scenario with a thread call to "finish" + * the clean operation and wake up the waiting fault handler after 1s. + */ + vm_object_lock(object); + assert(!m->vmp_iopl_wired); + m->vmp_cleaning = true; + vm_object_unlock(object); + thread_call_t page_clean_timer_call = thread_call_allocate(page_clean_timeout, m); + uint64_t deadline; + clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline); + thread_call_enter_delayed(page_clean_timer_call, deadline); + kr = vm_fault(current_map(), + (vm_map_address_t)args.ptr, + VM_PROT_EXECUTE | VM_PROT_READ, + FALSE, + VM_KERN_MEMORY_NONE, + THREAD_UNINT, + NULL, + 0); + /* + * Thread call should no longer be active, as its expiry should have been the thing that + * unblocked the fault above. + */ + assert(!thread_call_isactive(page_clean_timer_call)); + thread_call_free(page_clean_timer_call); + if (kr != KERN_SUCCESS) { + printf("%s: vm_fault(%p) did not succeed as expected\n", __func__, (void*)args.ptr); + error = kr; + } + } + + if (object != VM_OBJECT_NULL) { + vm_object_deallocate(object); + } + + return error; +} +SYSCTL_TEST_REGISTER(vm_upl_object, vm_upl_object_test); + +#endif /* CONFIG_SPTM */ diff --git a/osfmk/vm/vm_upl.c b/osfmk/vm/vm_upl.c index 4cd7f0b7e..04df3a608 100644 --- a/osfmk/vm/vm_upl.c +++ b/osfmk/vm/vm_upl.c @@ -666,6 +666,8 @@ process_upl_to_commit: kr = KERN_FAILURE; goto done; } + assertf(upl->flags & UPL_INTERNAL, "%s: sub-upl %p of vector upl %p has no internal page list", + __func__, upl, vector_upl); page_list = upl->page_list; subupl_size -= size; subupl_offset += size; @@ -904,6 +906,7 @@ process_upl_to_commit: if (m->vmp_wire_count == 0) { m->vmp_q_state = VM_PAGE_NOT_ON_Q; + m->vmp_iopl_wired = false; unwired_count++; } diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index d6b8aaa3d..ed4d2e26d 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -133,6 +133,8 @@ #include #include +#include /* for is_address_space_debugged */ + /* * mach_vm_allocate allocates "zero fill" memory in the specfied * map. 
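For reference, the vm_upl test above consumes a pointer to an argument block that userspace fills in. A minimal userland driver might look like the sketch below; the struct layout mirrors the kernel-side definition in the test, while the "debug.test.vm_upl" node name and the int64 in/out convention are assumptions based on how SYSCTL_TEST_REGISTER() tests are normally invoked.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/sysctl.h>

struct vm_upl_test_args {          /* must mirror the kernel-side args layout above */
	uint64_t ptr;
	uint32_t size;
	char     test_pattern;
	bool     copy_expected;
	bool     should_fail;
	bool     upl_rw;
};

int
main(void)
{
	size_t buf_size = 4 * 16384;
	char *buf = malloc(buf_size);
	if (buf == NULL) {
		return 1;
	}
	memset(buf, 'A', buf_size);

	struct vm_upl_test_args args = {
		.ptr = (uint64_t)(uintptr_t)buf,
		.size = (uint32_t)buf_size,
		.test_pattern = 'B',
		.copy_expected = false,
		.should_fail = false,
		.upl_rw = true,
	};

	/* The test takes the user address of the args struct as its 64-bit input. */
	int64_t in = (int64_t)(uintptr_t)&args;
	int64_t out = 0;
	size_t out_len = sizeof(out);

	/* "debug.test.vm_upl" is assumed from the SYSCTL_TEST_REGISTER(vm_upl, ...) name. */
	if (sysctlbyname("debug.test.vm_upl", &out, &out_len, &in, sizeof(in)) != 0) {
		perror("sysctlbyname");
		free(buf);
		return 1;
	}

	/* When the UPL shares the original pages, the kernel's test pattern shows through here. */
	printf("vm_upl test returned %lld, buf[0] is now '%c'\n", (long long)out, buf[0]);
	free(buf);
	return 0;
}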
@@ -186,10 +188,12 @@ mach_vm_deallocate_sanitize( mach_vm_offset_t *end, mach_vm_size_t *size) { + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + + return vm_sanitize_addr_size(start_u, size_u, - VM_SANITIZE_CALLER_VM_DEALLOCATE, map, - VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, - end, size); + VM_SANITIZE_CALLER_VM_DEALLOCATE, map, flags, + start, end, size); } /* @@ -596,7 +600,10 @@ mach_vm_read_overwrite( } error = vm_map_copy_overwrite(current_thread()->map, - data, copy, size, FALSE); + data, + copy, + size, + FALSE); if (KERN_SUCCESS == error) { *data_size = size; return error; @@ -629,6 +636,34 @@ vm_read_overwrite( return mach_vm_read_overwrite(map, address, size, data, data_size); } +/* + * mach_vm_update_pointers_with_remote_tags - + */ + +kern_return_t +mach_vm_update_pointers_with_remote_tags( + __unused vm_map_t map, + __unused mach_vm_offset_list_t in_pointer_list, + __unused mach_msg_type_number_t in_pointer_listCnt, + __unused mach_vm_offset_list_t out_pointer_list, + __unused mach_msg_type_number_t *out_pointer_listCnt) +{ + if (!in_pointer_list + || !out_pointer_list + || in_pointer_listCnt >= 512 + /* The length of the output pointer list must match the input pointer list */ + || !out_pointer_listCnt + || *out_pointer_listCnt != in_pointer_listCnt + ) { + return KERN_INVALID_ARGUMENT; + } + + if (!map || !map->pmap) { + return KERN_INVALID_ARGUMENT; + } + + return KERN_FAILURE; +} /* * mach_vm_write - @@ -652,8 +687,11 @@ mach_vm_write( */ vm_map_copy_t data = (vm_map_copy_t) VM_SANITIZE_UNSAFE_UNWRAP(data_u); - return vm_map_copy_overwrite(map, address, - data, size, FALSE /* interruptible XXX */); + return vm_map_copy_overwrite(map, + address, + data, + size, + FALSE /* interruptible XXX */); } /* @@ -703,8 +741,11 @@ mach_vm_copy( assert(VM_SANITIZE_UNSAFE_IS_EQUAL(size, copy->size)); } - kr = vm_map_copy_overwrite(map, dest_address, - copy, size, FALSE /* interruptible XXX */); + kr = vm_map_copy_overwrite(map, + dest_address, + copy, + size, + FALSE); if (KERN_SUCCESS != kr) { vm_map_copy_discard(copy); @@ -1176,6 +1217,8 @@ vm_toggle_entry_reuse(int toggle, int *old_value) { vm_map_t map = current_map(); + vmlp_api_start(VM_TOGGLE_ENTRY_REUSE); + assert(!map->is_nested_map); if (toggle == VM_TOGGLE_GETVALUE && old_value != NULL) { *old_value = map->disable_vmentry_reuse; @@ -1197,9 +1240,11 @@ vm_toggle_entry_reuse(int toggle, int *old_value) map->disable_vmentry_reuse = FALSE; vm_map_unlock(map); } else { + vmlp_api_end(VM_TOGGLE_ENTRY_REUSE, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } + vmlp_api_end(VM_TOGGLE_ENTRY_REUSE, KERN_SUCCESS); return KERN_SUCCESS; } @@ -1242,10 +1287,11 @@ mach_vm_behavior_set_sanitize( break; } + vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS; + + kr = vm_sanitize_addr_size(start_u, size_u, VM_SANITIZE_CALLER_VM_BEHAVIOR_SET, - align_mask, map, - VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, - start, end, size); + align_mask, map, flags, start, end, size); if (__improbable(kr != KERN_SUCCESS)) { return kr; } @@ -1823,7 +1869,10 @@ kern_return_t vm_map_exec_lockdown( vm_map_t map) { + vmlp_api_start(VM_MAP_EXEC_LOCKDOWN); + if (map == VM_MAP_NULL) { + vmlp_api_end(VM_MAP_EXEC_LOCKDOWN, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -1831,42 +1880,18 @@ vm_map_exec_lockdown( map->map_disallow_new_exec = TRUE; vm_map_unlock(map); + vmlp_api_end(VM_MAP_EXEC_LOCKDOWN, KERN_SUCCESS); return KERN_SUCCESS; } #if XNU_PLATFORM_MacOSX -/* - * Now a kernel-private interface (for BootCache 
- * use only). Need a cleaner way to create an - * empty vm_map() and return a handle to it. - */ - kern_return_t vm_region_object_create( - vm_map_t target_map, - vm_size_t size, - ipc_port_t *object_handle) + __unused vm_map_t target_map, + __unused vm_size_t size, + __unused ipc_port_t *object_handle) { - vm_named_entry_t user_entry; - vm_map_t new_map; - - user_entry = mach_memory_entry_allocate(object_handle); - - /* Create a named object based on a submap of specified size */ - - new_map = vm_map_create_options(PMAP_NULL, VM_MAP_MIN_ADDRESS, - vm_map_round_page(size, VM_MAP_PAGE_MASK(target_map)), - VM_MAP_CREATE_PAGEABLE); - vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(target_map)); - - user_entry->backing.map = new_map; - user_entry->internal = TRUE; - user_entry->is_sub_map = TRUE; - user_entry->offset = 0; - user_entry->protection = VM_PROT_ALL; - user_entry->size = size; - - return KERN_SUCCESS; + return KERN_NOT_SUPPORTED; } #endif /* XNU_PLATFORM_MacOSX */ @@ -1876,6 +1901,7 @@ kern_return_t mach_vm_deferred_reclamation_buffer_allocate( task_t task, mach_vm_address_ut *address, + uint64_t *sampling_period, uint32_t initial_capacity, uint32_t max_capacity) { @@ -1888,7 +1914,7 @@ mach_vm_deferred_reclamation_buffer_allocate( if (proc_is_simulated(p)) { return KERN_NOT_SUPPORTED; } - return vm_deferred_reclamation_buffer_allocate_internal(task, address, initial_capacity, max_capacity); + return vm_deferred_reclamation_buffer_allocate_internal(task, address, sampling_period, initial_capacity, max_capacity); #else (void) task; (void) address; @@ -1900,14 +1926,22 @@ mach_vm_deferred_reclamation_buffer_allocate( kern_return_t mach_vm_deferred_reclamation_buffer_flush( task_t task, - uint32_t num_entries_to_reclaim) + uint32_t num_entries_to_reclaim, + mach_vm_size_ut *bytes_reclaimed_out) { #if CONFIG_DEFERRED_RECLAIM + kern_return_t kr; + mach_vm_size_t bytes_reclaimed = 0; if (task != current_task()) { /* Remote buffer operations are not supported */ return KERN_INVALID_TASK; } - return vm_deferred_reclamation_buffer_flush_internal(task, num_entries_to_reclaim); + if (bytes_reclaimed_out == NULL) { + return KERN_INVALID_ARGUMENT; + } + kr = vm_deferred_reclamation_buffer_flush_internal(task, num_entries_to_reclaim, &bytes_reclaimed); + *bytes_reclaimed_out = vm_sanitize_wrap_size(bytes_reclaimed); + return kr; #else (void) task; (void) num_entries_to_reclaim; @@ -1915,40 +1949,26 @@ mach_vm_deferred_reclamation_buffer_flush( #endif /* CONFIG_DEFERRED_RECLAIM */ } -kern_return_t -mach_vm_deferred_reclamation_buffer_update_reclaimable_bytes( - task_t task, - mach_vm_size_ut reclaimable_bytes_u) -{ -#if CONFIG_DEFERRED_RECLAIM - /* - * This unwrapping is safe as reclaimable_bytes is not to be - * interpreted as the size of range of addresses. 
- */ - mach_vm_size_t reclaimable_bytes = - VM_SANITIZE_UNSAFE_UNWRAP(reclaimable_bytes_u); - if (task != current_task()) { - /* Remote buffer operations are not supported */ - return KERN_INVALID_TASK; - } - return vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task, reclaimable_bytes); -#else - (void) task; - (void) reclaimable_bytes; - return KERN_NOT_SUPPORTED; -#endif /* CONFIG_DEFERRED_RECLAIM */ -} - kern_return_t mach_vm_deferred_reclamation_buffer_resize(task_t task, - uint32_t capacity) + uint32_t new_len, + mach_vm_size_ut *bytes_reclaimed_out) { #if CONFIG_DEFERRED_RECLAIM + mach_error_t err; + mach_vm_size_t bytes_reclaimed = 0; + if (task != current_task()) { /* Remote buffer operations are not supported */ return KERN_INVALID_TASK; } - return vm_deferred_reclamation_buffer_resize_internal(task, capacity); + if (bytes_reclaimed_out == NULL) { + return KERN_INVALID_ARGUMENT; + } + + err = vm_deferred_reclamation_buffer_resize_internal(task, new_len, &bytes_reclaimed); + *bytes_reclaimed_out = vm_sanitize_wrap_size(bytes_reclaimed); + return err; #else (void) task; (void) size; @@ -1956,6 +1976,21 @@ mach_vm_deferred_reclamation_buffer_resize(task_t task, #endif /* CONFIG_DEFERRED_RECLAIM */ } +kern_return_t +mach_vm_deferred_reclamation_buffer_query(task_t task, + mach_vm_address_ut *addr_out_ut, + mach_vm_size_ut *size_out_ut) +{ +#if CONFIG_DEFERRED_RECLAIM + return vm_deferred_reclamation_buffer_query_internal(task, addr_out_ut, size_out_ut); +#else /* CONFIG_DEFERRED_RECLAIM */ + (void) task; + (void) addr_out_ut; + (void) size_out_ut; + return KERN_NOT_SUPPORTED; +#endif /* !CONFIG_DEFERRED_RECLAIM */ +} + #if CONFIG_MAP_RANGES extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *)); @@ -2053,6 +2088,8 @@ mach_vm_range_create_v1( kern_return_t kr = KERN_SUCCESS; uint16_t count; + vmlp_api_start(MACH_VM_RANGE_CREATE_V1); + struct mach_vm_range void1 = { .min_address = map->default_range.max_address, .max_address = map->data_range.min_address, @@ -2068,7 +2105,9 @@ mach_vm_range_create_v1( kr = mach_vm_range_create_v1_sanitize(map, recipe_u, new_count, &recipe); if (__improbable(kr != KERN_SUCCESS)) { - return vm_sanitize_get_kr(kr); + kr = vm_sanitize_get_kr(kr); + vmlp_api_end(MACH_VM_RANGE_CREATE_V1, kr); + return kr; } qsort(recipe, new_count, sizeof(mach_vm_range_recipe_v1_t), @@ -2083,6 +2122,7 @@ mach_vm_range_create_v1( mach_vm_size_t s; if (recipe[i].flags) { + vmlp_api_end(MACH_VM_RANGE_CREATE_V1, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } @@ -2091,17 +2131,20 @@ mach_vm_range_create_v1( case MACH_VM_RANGE_FIXED: break; default: + vmlp_api_end(MACH_VM_RANGE_CREATE_V1, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } s = mach_vm_range_size(r); if (!mach_vm_range_contains(&void1, r->min_address, s) && !mach_vm_range_contains(&void2, r->min_address, s)) { + vmlp_api_end(MACH_VM_RANGE_CREATE_V1, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } if (i > 0 && recipe[i - 1].range.max_address > recipe[i].range.min_address) { + vmlp_api_end(MACH_VM_RANGE_CREATE_V1, KERN_INVALID_ARGUMENT); return KERN_INVALID_ARGUMENT; } } @@ -2175,6 +2218,8 @@ out_unlock: }; __assert_only kern_return_t kr2; + vmlp_range_event(map, recipe[i].range.min_address, recipe[i].range.max_address - recipe[i].range.min_address); + kr2 = vm_map_enter(map, &recipe[i].range.min_address, mach_vm_range_size(&recipe[i].range), 0, vmk_flags, VM_OBJECT_NULL, 0, FALSE, @@ -2183,6 +2228,7 @@ out_unlock: assert(kr2 == KERN_SUCCESS); } } 
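	/*
	 * Shape of a recipe array this path accepts (illustrative only, with
	 * hypothetical addresses assumed to fall inside the gap between the map's
	 * default and data ranges): entries must carry no flags, use an accepted
	 * range_tag, and be sorted in ascending order without overlap.
	 *
	 *     mach_vm_range_recipe_v1_t recipes[] = {
	 *         { .range_tag = MACH_VM_RANGE_FIXED,
	 *           .range = { .min_address = 0x280000000ULL,
	 *                      .max_address = 0x288000000ULL } },
	 *         { .range_tag = MACH_VM_RANGE_FIXED,
	 *           .range = { .min_address = 0x288000000ULL,
	 *                      .max_address = 0x290000000ULL } },
	 *     };
	 */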
+ vmlp_api_end(MACH_VM_RANGE_CREATE_V1, kr); return kr; } diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index cc65325f4..33b1f2031 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -809,13 +809,13 @@ pmap_init(void) ppnum_t ppn; kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store; - _vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store); + _vm_object_allocate((vm_object_size_t)NPML4PGS * PAGE_SIZE, &kpml4obj_object_store, VM_MAP_SERIAL_SPECIAL); kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store; - _vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store); + _vm_object_allocate((vm_object_size_t)NPDPTPGS * PAGE_SIZE, &kpdptobj_object_store, VM_MAP_SERIAL_SPECIAL); kernel_pmap->pm_obj = &kptobj_object_store; - _vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store); + _vm_object_allocate((vm_object_size_t)NPDEPGS * PAGE_SIZE, &kptobj_object_store, VM_MAP_SERIAL_SPECIAL); /* * Allocate memory for the pv_head_table and its lock bits, @@ -1503,6 +1503,24 @@ done: } #endif /* MACH_ASSERT */ +inline void +pmap_recycle_page(ppnum_t pn) +{ + const bool is_freed = pmap_verify_free(pn); + + if (__improbable(!is_freed)) { + /* + * There is a redundancy here, but we are going to panic anyways, + * and ASSERT_PMAP_FREE traces useful information. So, we keep this + * behavior. + */ +#if MACH_ASSERT + pmap_assert_free(pn); +#endif /* MACH_ASSERT */ + panic("%s: page 0x%llx is referenced", __func__, (unsigned long long)ptoa(pn)); + } +} + boolean_t pmap_is_empty( pmap_t pmap, @@ -1675,17 +1693,17 @@ pmap_create_options( /* allocate the vm_objs to hold the pdpt, pde and pte pages */ - p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE); + p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS) *PAGE_SIZE, VM_MAP_SERIAL_SPECIAL); if (NULL == p->pm_obj_pml4) { panic("pmap_create pdpt obj"); } - p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE); + p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS) *PAGE_SIZE, VM_MAP_SERIAL_SPECIAL); if (NULL == p->pm_obj_pdpt) { panic("pmap_create pdpt obj"); } - p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE); + p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS) *PAGE_SIZE, VM_MAP_SERIAL_SPECIAL); if (NULL == p->pm_obj) { panic("pmap_create pte obj"); } diff --git a/pexpert/arm/hwtrace/hwtrace.c b/pexpert/arm/hwtrace/hwtrace.c index 0afc913eb..1bbbf8c94 100644 --- a/pexpert/arm/hwtrace/hwtrace.c +++ b/pexpert/arm/hwtrace/hwtrace.c @@ -81,6 +81,12 @@ boolean_t panic_trace_disabled_for_rdar107003520 = FALSE; static boolean_t debug_and_trace_initialized = false; +#if DEVELOPMENT || DEBUG +static boolean_t _panic_trace_always_enabled = false; + +static boolean_t _panic_trace_stress_racks = false; +#endif /* DEVELOPMENT || DEBUG */ + /************ * Boot-args * ************/ @@ -339,26 +345,33 @@ static TUNABLE_DT(uint32_t, panic_trace_partial_percent, "/arm-io/cpu-debug-interface", "panic-trace-partial-percent", "panic_trace_partial_percent", 50, TUNABLE_DT_NONE); +/* + * Detect if we're running on stress-racks. 
+ */ +static boolean_t +_is_stress_racks(void) +{ + DTEntry ent = NULL; + const void *propP = NULL; + unsigned int size = 0; + if (SecureDTLookupEntry(NULL, "/chosen", &ent) == kSuccess && + SecureDTGetProperty(ent, "stress-rack", &propP, &size) == kSuccess) { + return true; + } + return false; +} + /* * Stress racks opt out of panic_trace, unless overridden by the panic_trace boot-arg. */ static void panic_trace_apply_stress_rack_policy(void) { - DTEntry ent = NULL; - DTEntry entryP = NULL; - const void *propP = NULL; - unsigned int size = 0; - - if (SecureDTLookupEntry(NULL, "/chosen", &ent) == kSuccess && - SecureDTGetProperty(ent, "stress-rack", &propP, &size) == kSuccess) { - (void)entryP; - if (PE_parse_boot_argn("panic_trace", NULL, 0)) { - // Prefer user specified boot-arg even when running on stress racks. - // Make an exception for devices with broken single-stepping. - } else { - panic_trace = 0; - } + if (PE_parse_boot_argn("panic_trace", NULL, 0)) { + // Prefer user specified boot-arg even when running on stress racks. + // Make an exception for devices with broken single-stepping. + } else { + panic_trace = 0; } } @@ -509,9 +522,18 @@ pe_arm_debug_init_early(void *boot_cpu_data) return; } - /* Update the panic_trace start policy depending on the execution environment. */ #if DEVELOPMENT || DEBUG - if (panic_trace != 0) { + + /* Determine if we're enabled at 100% rate, + * report it globally. */ + _panic_trace_always_enabled = (panic_trace & panic_trace_enabled) && !(panic_trace & panic_trace_partial_policy); + + /* Determine if we're running on stress-racks, + * report it globally. */ + _panic_trace_stress_racks = _is_stress_racks(); + + /* Update the panic_trace start policy depending on the execution environment. */ + if ((panic_trace != 0) && (_panic_trace_stress_racks)) { panic_trace_apply_stress_rack_policy(); } @@ -566,6 +588,7 @@ pe_arm_debug_init_late(void) { } + /********************* * Panic-trace sysctl * *********************/ diff --git a/pexpert/arm/pe_init.c b/pexpert/arm/pe_init.c index 666b87b01..8f2d359e1 100644 --- a/pexpert/arm/pe_init.c +++ b/pexpert/arm/pe_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017, 2024 Apple Inc. All rights reserved. * * arm platform expert initialization. 
*/ @@ -34,8 +34,8 @@ static void pe_prepare_images(void); /* private globals */ SECURITY_READ_ONLY_LATE(PE_state_t) PE_state; -TUNABLE_DT(uint32_t, PE_srd_fused, "/chosen", "research-enabled", - "srd_fusing", 0, TUNABLE_DT_NONE); +TUNABLE_DT(uint32_t, PE_esdm_fuses, "/chosen", "esdm-fuses", "", 0, TUNABLE_DT_NONE); +TUNABLE_DT(uint32_t, PE_vmm_present, "/defaults", "vmm-present", "", 0, TUNABLE_DT_NONE); #define FW_VERS_LEN 128 @@ -373,7 +373,7 @@ PE_init_iokit(void) KDBG_RELEASE(IOKDBG_CODE(DBG_BOOTER, 0), start_time_value, debug_wait_start_value, load_kernel_start_value, populate_registry_time_value); #if CONFIG_SPTM - KDBG_RELEASE(IOKDBG_CODE(DBG_BOOTER, 1), SPTMArgs->timestamp_sk_bootstrap, SPTMArgs->timestamp_xnu_bootstrap); + KDBG_RELEASE(IOKDBG_CODE(DBG_BOOTER, 1), SPTMArgs->timestamp_sk_bootstrap, SPTMArgs->timestamp_xnu_bootstrap, SPTMArgs->timestamp_txm_bootstrap); #endif } @@ -871,8 +871,16 @@ PE_init_socd_client(void) return 0; } - socd_trace_ram_base = ml_io_map(reg_prop[0], (vm_size_t)reg_prop[1]); + if (size < 2 * sizeof(uintptr_t)) { + return 0; + } + socd_trace_ram_size = (vm_size_t)reg_prop[1]; + if (socd_trace_ram_size == 0) { + return 0; + } + + socd_trace_ram_base = ml_io_map(reg_prop[0], socd_trace_ram_size); return socd_trace_ram_size; } diff --git a/pexpert/arm/pe_kprintf.c b/pexpert/arm/pe_kprintf.c index 6825f310c..e58804656 100644 --- a/pexpert/arm/pe_kprintf.c +++ b/pexpert/arm/pe_kprintf.c @@ -143,6 +143,7 @@ kprintf(const char *fmt, ...) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp2, caller); va_end(listp2); } else { diff --git a/pexpert/arm/pe_serial.c b/pexpert/arm/pe_serial.c index d0db82794..032d9e00b 100644 --- a/pexpert/arm/pe_serial.c +++ b/pexpert/arm/pe_serial.c @@ -11,7 +11,7 @@ * list of interfaces pointed to by gPESF. When outputting or receiving * characters, each interface is queried in turn. * - * Please view doc/arm_serial.md for an in-depth description of these drivers. + * Please view doc/arm/arm_serial.md for an in-depth description of these drivers. */ #include #include diff --git a/pexpert/conf/Makefile.template b/pexpert/conf/Makefile.template index 2d658913f..3711a92da 100644 --- a/pexpert/conf/Makefile.template +++ b/pexpert/conf/Makefile.template @@ -89,6 +89,10 @@ $(COMPONENT).filelist: $(OBJS) $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist +ifeq ($(XNU_LibAllFiles),1) +LIBOBJS := $(OBJS) +endif + $(COMPONENT).libfilelist: $(LIBOBJS) @$(LOG_LDFILELIST) "lib$(COMPONENT)" $(_v)for obj in ${LIBOBJS}; do \ diff --git a/pexpert/gen/bootargs.c b/pexpert/gen/bootargs.c index 531c66053..1ef8caa7e 100644 --- a/pexpert/gen/bootargs.c +++ b/pexpert/gen/bootargs.c @@ -71,6 +71,7 @@ PE_parse_boot_argn_internal( boolean_t arg_boolean; boolean_t arg_found; + /* Please don't change this behavior */ if (*args == '\0') { return FALSE; } @@ -547,7 +548,7 @@ PE_get_default( } /* function: get_range_bounds - * Parse a range string like "1_3,5_20" and return 1,3 as lower and upper. + * Parse a range string like "1_3,10,15_20" and return 1,3 as lower and upper. * Note: '_' is separator for bounds integer delimiter and * ',' is considered as separator for range pair. 
* returns TRUE when both range values are found @@ -564,7 +565,7 @@ get_range_bounds(char *c, int64_t *lower, int64_t *upper) } while (*c != '\0') { - if (*c == '_') { + if (*c == '_' || *c == ',') { break; } c++; diff --git a/pexpert/gen/device_tree.c b/pexpert/gen/device_tree.c index 4eff3236b..8483f8cda 100644 --- a/pexpert/gen/device_tree.c +++ b/pexpert/gen/device_tree.c @@ -41,9 +41,9 @@ #include #include -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) extern addr64_t kvtophys(vm_offset_t va); -#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ #include @@ -249,7 +249,7 @@ SecureDTIsLockedDown(void) { #if CONFIG_SPTM return true; -#elif defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#elif defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* * We cannot check if the DT is in the CTRR region early on, * because knowledge of the CTRR region is set up later. But the diff --git a/pexpert/gen/pe_gen.c b/pexpert/gen/pe_gen.c index 53331609b..6740cb08a 100644 --- a/pexpert/gen/pe_gen.c +++ b/pexpert/gen/pe_gen.c @@ -141,7 +141,7 @@ PE_init_printf(boolean_t vm_initialized) } } -uint32_t +__mockable uint32_t PE_get_random_seed(unsigned char *dst_random_seed, uint32_t request_size) { uint32_t size = 0; diff --git a/pexpert/i386/pe_init.c b/pexpert/i386/pe_init.c index e77bfd5c5..f2b2d2be1 100644 --- a/pexpert/i386/pe_init.c +++ b/pexpert/i386/pe_init.c @@ -53,7 +53,8 @@ extern void pe_identify_machine(void * args); extern int kdb_printf(const char *format, ...) __printflike(1, 2); /* private globals */ PE_state_t PE_state; -SECURITY_READ_ONLY_LATE(uint32_t) PE_srd_fused = 0; +SECURITY_READ_ONLY_LATE(uint32_t) PE_esdm_fuses = 0; +SECURITY_READ_ONLY_LATE(uint32_t) PE_vmm_present = 0; /* Clock Frequency Info */ clock_frequency_info_t gPEClockFrequencyInfo; diff --git a/pexpert/i386/pe_kprintf.c b/pexpert/i386/pe_kprintf.c index b40f50f44..199df04ea 100644 --- a/pexpert/i386/pe_kprintf.c +++ b/pexpert/i386/pe_kprintf.c @@ -150,6 +150,7 @@ kprintf(const char *fmt, ...) if (ml_get_interrupts_enabled()) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp2, caller); #pragma clang diagnostic pop } @@ -190,6 +191,7 @@ kprintf(const char *fmt, ...) if (ml_get_interrupts_enabled()) { #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp2, caller); #pragma clang diagnostic pop } @@ -199,6 +201,7 @@ kprintf(const char *fmt, ...) va_start(listp, fmt); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" os_log_with_args(OS_LOG_DEFAULT, OS_LOG_TYPE_DEFAULT, fmt, listp, caller); #pragma clang diagnostic pop va_end(listp); diff --git a/pexpert/pexpert/arm64/H16.h b/pexpert/pexpert/arm64/H16.h index d22dad5f2..7fd7c12aa 100644 --- a/pexpert/pexpert/arm64/H16.h +++ b/pexpert/pexpert/arm64/H16.h @@ -105,6 +105,11 @@ #define __ARM_RANGE_TLBI__ 1 +#if !CONFIG_SPTM +/* VHE is disabled at runtime on SPTM-based systems. 
*/ +#include +#endif /* !CONFIG_SPTM */ + #include #endif /* !_PEXPERT_ARM64_H16_H */ diff --git a/pexpert/pexpert/arm64/VMAPPLE.h b/pexpert/pexpert/arm64/VMAPPLE.h index 5164f2bb8..08b35780a 100644 --- a/pexpert/pexpert/arm64/VMAPPLE.h +++ b/pexpert/pexpert/arm64/VMAPPLE.h @@ -31,6 +31,7 @@ #define NO_MONITOR 1 #define NO_ECORE 1 +#define HAS_PARAVIRTUALIZED_CTRR 1 #define VMAPPLE 1 #define APPLEVIRTUALPLATFORM 1 @@ -50,8 +51,10 @@ #define ARM_PARAMETERIZED_PMAP 1 #define __ARM_MIXED_PAGE_SIZE__ 1 + #include #undef __ARM64_PMAP_SUBPAGE_L1__ +#undef __ARM64_PMAP_KERN_SUBPAGE_L1__ #ifndef ASSEMBLER #define PL011_UART diff --git a/pexpert/pexpert/arm64/apple_arm64_common.h b/pexpert/pexpert/arm64/apple_arm64_common.h index 56caa9556..1f86042dc 100644 --- a/pexpert/pexpert/arm64/apple_arm64_common.h +++ b/pexpert/pexpert/arm64/apple_arm64_common.h @@ -39,10 +39,23 @@ #define __ARM_ENABLE_SWAP__ 1 #define __ARM_V8_CRYPTO_EXTENSIONS__ 1 -#ifndef ARM_LARGE_MEMORY -#define __ARM64_PMAP_SUBPAGE_L1__ 1 +/* + * If we're using a parameterized PMAP + SPTM, we can enable kernel-only large + * memory. Otherwise, large memory is either enabled for both user and kernel or + * neither. + */ +#if ARM_PARAMETERIZED_PMAP && CONFIG_SPTM +#define HAS_ARM_INDEPENDENT_TNSZ 1 #endif +#if !ARM_LARGE_MEMORY +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#define __ARM64_PMAP_KERN_SUBPAGE_L1__ 1 +#elif ARM_LARGE_MEMORY_KERNONLY && HAS_ARM_INDEPENDENT_TNSZ +/* Kernel-only large memory */ +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#endif /* ARM_LARGE_MEMORY */ + #define APPLE_ARM64_ARCH_FAMILY 1 #define ARM_ARCH_TIMER @@ -52,6 +65,8 @@ #elif defined(HAS_CTRR) #define KERNEL_INTEGRITY_CTRR 1 #define KERNEL_CTRR_VERSION 2 +#elif defined(HAS_PARAVIRTUALIZED_CTRR) +#define KERNEL_INTEGRITY_PV_CTRR 1 #elif defined(HAS_KTRR) #define KERNEL_INTEGRITY_KTRR 1 #elif defined(MONITOR) diff --git a/pexpert/pexpert/arm64/apple_arm64_regs.h b/pexpert/pexpert/arm64/apple_arm64_regs.h index a844f2ddf..ed7d0dccc 100644 --- a/pexpert/pexpert/arm64/apple_arm64_regs.h +++ b/pexpert/pexpert/arm64/apple_arm64_regs.h @@ -125,6 +125,8 @@ /* * EACC/PACC cpmX_IMPL register offset */ +#define LLC_ERR_STS_OFFSET (0x8ULL) +#define LLC_ERR_ADR_OFFSET (0x10ULL) #define LLC_ERR_INF_OFFSET (0x18ULL) #define LLC_ERR_INF_NREC (1ULL << 36) #endif /* defined(APPLEH16) */ diff --git a/pexpert/pexpert/arm64/board_config.h b/pexpert/pexpert/arm64/board_config.h index 4a0406334..159356da7 100644 --- a/pexpert/pexpert/arm64/board_config.h +++ b/pexpert/pexpert/arm64/board_config.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2023 Apple Inc. All rights reserved. + * Copyright (c) 2007-2025 Apple Inc. All rights reserved. * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. 
*/ #ifndef _PEXPERT_ARM_BOARD_CONFIG_H @@ -140,7 +140,7 @@ #include #define MAX_L2_CLINE 7 -#define MAX_CPUS 8 /* Actually has 6 CPUs, see doc/xnu_build_consolidation.md for more info */ +#define MAX_CPUS 8 /* Actually has 6 CPUs, see doc/building/xnu_build_consolidation.md for more info */ #define MAX_CPU_CLUSTERS 2 #ifndef CONFIG_SPTM @@ -174,6 +174,7 @@ #define USE_APPLEARMSMP 1 #define XNU_PLATFORM_ERROR_HANDLER 1 /* This platform uses the platform error handler inside XNU rather than a kext */ #define XNU_HANDLE_MCC 1 /* This platform may support MCC error recovery */ + #endif /* ARM64_BOARD_CONFIG_T8122_T8130 */ #ifdef ARM64_BOARD_CONFIG_T8132 @@ -204,6 +205,8 @@ + + #ifdef ARM64_BOARD_CONFIG_T6030 #include @@ -263,6 +266,7 @@ #define __ARM_42BIT_PA_SPACE__ 1 #define USE_APPLEARMSMP 1 +#define NO_CPU_OVRD 1 /* CPU_OVRD register accesses are banned */ #define XNU_SUPPORT_BOOTCPU_SHUTDOWN 1 #define RHODES_CLUSTER_POWERDOWN_WORKAROUND 1 /* Workaround for rdar://89107373 (Rhodes cluster power down: cannot manually power down and up a core multiple times without powering down the cluster) */ @@ -272,6 +276,8 @@ + + #ifdef ARM64_BOARD_CONFIG_VMAPPLE #include @@ -283,10 +289,6 @@ #define USE_APPLEARMSMP 1 -#if XNU_TARGET_OS_WATCH -#define PREFER_ARM64_32_BINARIES 1 -#endif - #define NO_XNU_PLATFORM_ERROR_HANDLER 1 @@ -339,4 +341,8 @@ +#if DEBUG || DEVELOPMENT +#define HAS_SPTM_SYSCTL 1 +#endif /* DEBUG || DEVELOPMENT */ + #endif /* ! _PEXPERT_ARM_BOARD_CONFIG_H */ diff --git a/pexpert/pexpert/arm64/boot.h b/pexpert/pexpert/arm64/boot.h index eab75014f..1bad20f3b 100644 --- a/pexpert/pexpert/arm64/boot.h +++ b/pexpert/pexpert/arm64/boot.h @@ -15,6 +15,7 @@ #include #include +#if !TARGET_OS_SIMULATOR /* * Maximum size of an environment variable value. This particular value is * chosen to accommodate the maximum encoded size of the system token as @@ -22,7 +23,15 @@ * * This value matches iBoot's IBOOT_MAX_ENV_VAR_DATA_SIZE. */ -#define BOOT_LINE_LENGTH 1024 +#include +#define BOOT_LINE_LENGTH (IBOOT_MAX_ENV_VAR_DATA_SIZE) +#else /* TARGET_OS_SIMULATOR */ +/** + * Since iBoot does not import IBOOT_MAX_ENV_VAR_DATA_SIZE for simulators, + * hardcode it here. + */ +#define BOOT_LINE_LENGTH 1024 +#endif /* !TARGET_OS_SIMULATOR */ /* * Video information.. 
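One practical consequence of the BOOT_LINE_LENGTH change above: on device builds the macro now tracks iBoot's IBOOT_MAX_ENV_VAR_DATA_SIZE rather than a fixed 1024, so consumers should size boot-args buffers with the macro instead of a hard-coded length. A minimal sketch, assuming a hypothetical string-valued boot-arg named "example-arg" and a static buffer so a larger iBoot limit does not land on the kernel stack:

#include <pexpert/pexpert.h>        /* PE_parse_boot_argn() */
#include <pexpert/arm64/boot.h>     /* BOOT_LINE_LENGTH, as modified above */

static void
example_read_boot_arg(void)
{
	/* Sized with BOOT_LINE_LENGTH so a maximal iBoot environment value still fits. */
	static char value[BOOT_LINE_LENGTH];

	/* "example-arg" is a hypothetical boot-arg name used only for illustration. */
	if (PE_parse_boot_argn("example-arg", value, sizeof(value))) {
		printf("example-arg=%s\n", value);
	}
}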
diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index 8a8ae4b0b..ff40c157c 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -494,8 +494,10 @@ void PE_reset_kc_header(kc_kind_t type); extern void PE_set_kc_header_and_base(kc_kind_t type, kernel_mach_header_t *header, void *base, uintptr_t slide); /* The highest non-LINKEDIT virtual address */ extern vm_offset_t kc_highest_nonlinkedit_vmaddr; -/* whether this is an srd enabled device */ -extern uint32_t PE_srd_fused; +/* state of extended security domain (used for security research device) */ +extern uint32_t PE_esdm_fuses; +/* state of whether this platform is virtual or not */ +extern uint32_t PE_vmm_present; #endif /* returns a pointer to the mach-o header for a give KC type, returns NULL if nothing's been set */ extern void *PE_get_kc_header(kc_kind_t type); diff --git a/san/conf/Makefile.template b/san/conf/Makefile.template index 349746a0d..a4e1d9f22 100644 --- a/san/conf/Makefile.template +++ b/san/conf/Makefile.template @@ -70,17 +70,21 @@ $(COMPONENT).filelist: $(OBJS) .KASANFLAGS $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist +ifeq ($(XNU_LibAllFiles),1) +LIBOBJS := $(OBJS) +endif + $(COMPONENT).libfilelist: $(LIBOBJS) @$(LOG_LDFILELIST) "lib$(COMPONENT)" $(_v)for obj in ${LIBOBJS}; do \ $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).libfilelist -$(TARGET)/$(CURRENT_KERNEL_CONFIG)/kasan_blacklist_dynamic.h: $(SRCROOT)/$(COMPONENT)/memory/kasan-blacklist-dynamic +$(TARGET)/$(CURRENT_KERNEL_CONFIG)/kasan_denylist_dynamic.h: $(SRCROOT)/$(COMPONENT)/memory/kasan-denylist-dynamic @$(LOG_GENERATE) "$(notdir $@)" - @$(SRCROOT)/$(COMPONENT)/tools/generate_dynamic_blacklist.py "$<" > "$@" + @$(SRCROOT)/$(COMPONENT)/tools/generate_dynamic_denylist.py "$<" > "$@" -$(SRCROOT)/$(COMPONENT)/memory/kasan_dynamic_blacklist.c: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/kasan_blacklist_dynamic.h +$(SRCROOT)/$(COMPONENT)/memory/kasan_dynamic_denylist.c: $(TARGET)/$(CURRENT_KERNEL_CONFIG)/kasan_denylist_dynamic.h ifeq ($(RC_ProjectName),xnu_libraries) do_all: $(COMPONENT).libfilelist diff --git a/san/conf/files b/san/conf/files index f6bb36513..22992f36c 100644 --- a/san/conf/files +++ b/san/conf/files @@ -7,7 +7,7 @@ OPTIONS/CONFIG_STKSZ optional config_stksz san/memory/kasan.c optional config_kasan san/memory/kasan-test.c optional config_kasan san/memory/kasan-memintrinsics.c optional config_kasan -san/memory/kasan_dynamic_blacklist.c optional config_kasan +san/memory/kasan_dynamic_denylist.c optional config_kasan san/memory/kasan-report.c optional config_kasan san/memory/kasan-helper.c optional config_kasan diff --git a/san/coverage/Makefile b/san/coverage/Makefile index f77f8212e..cdc0e2c72 100644 --- a/san/coverage/Makefile +++ b/san/coverage/Makefile @@ -35,14 +35,14 @@ EXPORT_MI_LIST = ${PRIVATE_XNUFILES} ${KERNELFILES} ${PRIVATE_KERNELFILES} INSTALL_MI_DIR = san EXPORT_MI_DIR = san -# Generate blacklist +# Generate denylist .DELETE_ON_ERROR: -$(OBJROOT)/san/kcov-blacklist-%: $(SOURCE)/kcov-blacklist $(SOURCE)/kcov-blacklist-% +$(OBJROOT)/san/kcov-denylist-%: $(SOURCE)/kcov-denylist $(SOURCE)/kcov-denylist-% @$(LOG_GENERATE) "$(notdir $@)" $(_v)sed -e 's,^src:\./,src:'"$(SRCROOT)/," $^ > $@ - $(_v)$(SRCROOT)/san/tools/validate_blacklist.sh "$@" + $(_v)$(SRCROOT)/san/tools/validate_denylist.sh "$@" -do_build_setup:: $(OBJROOT)/san/kcov-blacklist-x86_64 $(OBJROOT)/san/kcov-blacklist-arm64 +do_build_setup:: $(OBJROOT)/san/kcov-denylist-x86_64 
$(OBJROOT)/san/kcov-denylist-arm64 # # Kcov System.kext plugin diff --git a/san/coverage/kcov-blacklist-arm64 b/san/coverage/kcov-blacklist-arm64 deleted file mode 100644 index c6b507558..000000000 --- a/san/coverage/kcov-blacklist-arm64 +++ /dev/null @@ -1,18 +0,0 @@ -# ARM64 specific blacklist - -# Exclude KASan runtime -src:./osfmk/arm/machine_routines_common.c - -# These use a local variable to work out which stack we're on, but can end up with -# a fakestack allocation. -fun:ml_at_interrupt_context -fun:ml_stack_remaining -fun:ml_stack_base -fun:ml_stack_size -fun:kernel_preempt_check - -# Closure of pmap_in_ppl -fun:pmap_interrupts_disable -fun:pmap_get_cpu_data -fun:ml_get_ppl_cpu_data -fun:pmap_interrupts_restore diff --git a/san/coverage/kcov-blacklist b/san/coverage/kcov-denylist similarity index 69% rename from san/coverage/kcov-blacklist rename to san/coverage/kcov-denylist index ae000f4ad..87df3f4a6 100644 --- a/san/coverage/kcov-blacklist +++ b/san/coverage/kcov-denylist @@ -1,3 +1,6 @@ +#!special-case-list-v1 +# rdar://139815990 + # Blanket ignore non-sanitized functions fun:ksancov_* fun:kcov_* @@ -30,3 +33,13 @@ fun:get_preemption_level fun:vm_memtag_add_ptr_tag fun:ml_static_unslide fun:vm_is_addr_slid + +# Exclude static_if dependencies +src:./osfmk/*/static_if.c +src:./osfmk/kern/static_if_common.c + +# STATIC_IF_INIT functions +fun:static_if_tests_setup +fun:kern_feature_override_apply +# Called in kern_feature_override_apply +fun:lck_rw_assert_init \ No newline at end of file diff --git a/san/coverage/kcov-denylist-arm64 b/san/coverage/kcov-denylist-arm64 new file mode 100644 index 000000000..8c31b7898 --- /dev/null +++ b/san/coverage/kcov-denylist-arm64 @@ -0,0 +1,48 @@ +# ARM64 specific denylist + +# Exclude KASan runtime +src:./osfmk/arm/machine_routines_common.c + +# These use a local variable to work out which stack we're on, but can end up with +# a fakestack allocation. +fun:ml_at_interrupt_context +fun:ml_stack_remaining +fun:ml_stack_base +fun:ml_stack_size +fun:kernel_preempt_check + +# Closure of pmap_in_ppl +fun:pmap_interrupts_disable +fun:pmap_get_cpu_data +fun:ml_get_ppl_cpu_data +fun:pmap_interrupts_restore + +# These are required to fixup the kernelcache, which needs to happen before KASAN +# initialization on SPTM systems. 
+fun:arm_slide_rebase_and_sign_image +fun:kernel_collection_adjust_fileset_entry_addrs +fun:kernel_collection_adjust_mh_addrs +fun:PE_set_kc_header +fun:phystokv + +# SPTM init +src:./osfmk/arm64/sptm/arm_init_sptm.c +src:./osfmk/arm64/static_if.c + +# __SECURITY_STACK_DISALLOWED_PUSH +fun:sleh_panic_lockdown_should_initiate_el1_sp0_sync + +# PAN Disabling functions (rdar://145659776&154299852) +fun:copyout +fun:copyin +fun:copyin_atomic32 +fun:_unprivileged_copyin_atomic32_wait_if_equals +fun:copyin_atomic32_wait_if_equals +fun:copyin_atomic64 +fun:copyout_atomic32 +fun:copyout_atomic64 +fun:copyinstr +fun:pmap_test_access +fun:user_access_enable +fun:pmap_get_tpro +fun:user_access_disable diff --git a/san/coverage/kcov-blacklist-x86_64 b/san/coverage/kcov-denylist-x86_64 similarity index 98% rename from san/coverage/kcov-blacklist-x86_64 rename to san/coverage/kcov-denylist-x86_64 index 0069a9038..9fb8ffd94 100644 --- a/san/coverage/kcov-blacklist-x86_64 +++ b/san/coverage/kcov-denylist-x86_64 @@ -1,4 +1,4 @@ -# x86_64 specific blacklist +# x86_64 specific denylist # Early boot AUTOGEN src:./bsd/kern/kdebug.c diff --git a/san/coverage/kcov.c b/san/coverage/kcov.c index e850ce43a..5199b03a0 100644 --- a/san/coverage/kcov.c +++ b/san/coverage/kcov.c @@ -60,7 +60,7 @@ SYSCTL_NODE(_kern, OID_AUTO, kcov, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "kcov"); * * A compiler will add hooks almost in any basic block in the kernel. However it is * not safe to call hook from some of the contexts. To make this safe it would require - * precise blacklist of all unsafe sources. Which results in high maintenance costs. + * precise denylist of all unsafe sources. Which results in high maintenance costs. * * To avoid this we bootsrap the coverage sanitizer in phases: * @@ -77,7 +77,7 @@ SYSCTL_NODE(_kern, OID_AUTO, kcov, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "kcov"); * configured the boostrap originator enables its converage sanitizer by writing * secondary's per-cpu data. * - * To make this step safe, it is required to maintain blacklist that contains CPU + * To make this step safe, it is required to maintain denylist that contains CPU * bootstrap code to avoid firing hook from unsupported context. * * ... From this point all CPUs can execute the hook correctly. @@ -94,7 +94,6 @@ static void kcov_init(void) { /* Master CPU is fully setup at this point so just enable coverage tracking. */ - printf("KCOV: Enabling coverage tracking on cpu %d\n", cpu_number()); ksancov_init(); current_kcov_data()->kcd_enabled = 1; } @@ -166,6 +165,64 @@ kcov_init_thread(kcov_thread_data_t *data) kcov_stksz_init_thread(&data->ktd_stksz); } +/* Shared prologue between trace functions */ +static kcov_thread_data_t * +trace_prologue(void) +{ + /* Check the global flag for the case no recording is enabled. */ + if (__probable(os_atomic_load(&kcov_enabled, relaxed) == 0)) { + return NULL; + } + + /* + * rdar://145659776 + * If PAN is disabled we cannot safely re-enable preemption after disabling it. + * The proper way to do this in a generic way is to check here for PAN and bail ot + * if (__improbable(__builtin_arm_rsr("pan") == 0)) + * + * The issue with this solution is the performance cost of reading the MSR for each + * trace point, so PAN disabled functions are included in the baclklist instead + * (see kcov-blacklist-arm64). + */ + + /* Per-cpu area access. Must happen with disabled interrupts/preemtion. 
*/ + disable_preemption(); + + if (!current_kcov_data()->kcd_enabled) { + enable_preemption(); + return NULL; + } + + /* No support for PPL. */ + if (pmap_in_ppl()) { + enable_preemption(); + return NULL; + } + /* Interrupt context not supported. */ + if (ml_at_interrupt_context()) { + enable_preemption(); + return NULL; + } + + thread_t th = current_thread(); + if (__improbable(th == THREAD_NULL)) { + enable_preemption(); + return NULL; + } + + /* This thread does not want to be traced. */ + kcov_thread_data_t *data = kcov_get_thread_data(th); + if (__improbable(data->ktd_disabled) != 0) { + enable_preemption(); + return NULL; + } + + /* Enable preemption as we are no longer accessing per-cpu data. */ + enable_preemption(); + + return data; +} + /* * This is the core of the coverage recording. * @@ -174,7 +231,7 @@ kcov_init_thread(kcov_thread_data_t *data) * contexts (for example per-cpu data access). * * Do not call anything unnecessary before ksancov_disable() as that will cause - * recursion. Update blacklist after any such change. + * recursion. Update denylist after any such change. * * Every complex code here may have impact on the overall performance. This function * is called for every edge in the kernel and that means multiple times through a @@ -185,46 +242,11 @@ trace_pc_guard(uint32_t __unused *guardp, void __unused *caller, uintptr_t __unu { kcov_ksancov_trace_guard(guardp, caller); - /* Check the global flag for the case no recording is enabled. */ - if (__probable(os_atomic_load(&kcov_enabled, relaxed) == 0)) { + kcov_thread_data_t *data = trace_prologue(); + if (data == NULL) { return; } - /* Per-cpu area access. Must happen with disabled interrupts/preemtion. */ - disable_preemption(); - - if (!current_kcov_data()->kcd_enabled) { - enable_preemption(); - return; - } - - /* No support for PPL. */ - if (pmap_in_ppl()) { - enable_preemption(); - return; - } - /* Interrupt context not supported. */ - if (ml_at_interrupt_context()) { - enable_preemption(); - return; - } - - thread_t th = current_thread(); - if (__improbable(th == THREAD_NULL)) { - enable_preemption(); - return; - } - - /* This thread does not want to record stack usage. */ - kcov_thread_data_t *data = kcov_get_thread_data(th); - if (__improbable(data->ktd_disabled) != 0) { - enable_preemption(); - return; - } - - /* Enable preemption as we are no longer accessing per-cpu data. */ - enable_preemption(); - /* It is now safe to call back to kernel from this thread without recursing in the hook itself. */ kcov_disable_thread(data); @@ -277,3 +299,114 @@ __sanitizer_cov_pcs_init(uintptr_t __unused *start, uintptr_t __unused *stop) { kcov_ksancov_pcs_init(start, stop); } + +static void +trace_cmp(uint32_t __unused type, uint64_t __unused arg1, uint64_t __unused arg2, void __unused *caller) +{ + kcov_thread_data_t *data = trace_prologue(); + if (data == NULL) { + return; + } + + /* It is now safe to call back to kernel from this thread without recursing in the hook itself. 
*/ + kcov_disable_thread(data); + + kcov_ksancov_trace_cmp(data, type, arg1, arg2, caller); + + kcov_enable_thread(data); +} + +void +__sanitizer_cov_trace_cmp1(uint8_t arg1, uint8_t arg2) +{ + trace_cmp(KCOV_CMP_SIZE1, arg1, arg2, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_cmp2(uint16_t arg1, uint16_t arg2) +{ + trace_cmp(KCOV_CMP_SIZE2, arg1, arg2, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_cmp4(uint32_t arg1, uint32_t arg2) +{ + trace_cmp(KCOV_CMP_SIZE4, arg1, arg2, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_cmp8(uint64_t arg1, uint64_t arg2) +{ + trace_cmp(KCOV_CMP_SIZE8, arg1, arg2, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_const_cmp1(uint8_t arg1, uint8_t arg2) +{ + trace_cmp(KCOV_CMP_SIZE1 | KCOV_CMP_CONST, arg1, arg2, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_const_cmp2(uint16_t arg1, uint16_t arg2) +{ + trace_cmp(KCOV_CMP_SIZE2 | KCOV_CMP_CONST, arg1, arg2, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_const_cmp4(uint32_t arg1, uint32_t arg2) +{ + trace_cmp(KCOV_CMP_SIZE4 | KCOV_CMP_CONST, arg1, arg2, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_const_cmp8(uint64_t arg1, uint64_t arg2) +{ + trace_cmp(KCOV_CMP_SIZE8 | KCOV_CMP_CONST, arg1, arg2, __builtin_return_address(0)); +} + +void +__sanitizer_cov_trace_switch(uint64_t val, uint64_t *cases) +{ + void *ret = __builtin_return_address(0); + + uint32_t type; + switch (cases[1]) { + case 8: + type = KCOV_CMP_SIZE1 | KCOV_CMP_CONST; + break; + case 16: + type = KCOV_CMP_SIZE2 | KCOV_CMP_CONST; + break; + case 32: + type = KCOV_CMP_SIZE4 | KCOV_CMP_CONST; + break; + case 64: + type = KCOV_CMP_SIZE8 | KCOV_CMP_CONST; + break; + default: + return; + } + + uint64_t i; + uint64_t count = cases[0]; + + for (i = 0; i < count; i++) { + trace_cmp(type, cases[i + 2], val, ret); + } +} + +void +kcov_trace_cmp_func(void *caller_pc, uint32_t type, const void *s1, size_t s1len, const void *s2, size_t s2len, bool always_log) +{ + kcov_thread_data_t *data = trace_prologue(); + if (data == NULL) { + return; + } + + /* It is now safe to call back to kernel from this thread without recursing in the hook itself. */ + kcov_disable_thread(data); + + kcov_ksancov_trace_cmp_func(data, type, s1, s1len, s2, s2len, caller_pc, always_log); + + kcov_enable_thread(data); +} diff --git a/san/coverage/kcov.h b/san/coverage/kcov.h index b737b3107..feb588c48 100644 --- a/san/coverage/kcov.h +++ b/san/coverage/kcov.h @@ -31,12 +31,30 @@ #if KERNEL_PRIVATE -#if !CONFIG_KCOV && __has_feature(coverage_sanitizer) +#if __has_feature(coverage_sanitizer) && !(CONFIG_KCOV || defined(__BUILDING_XNU_LIBRARY__)) # error "Coverage sanitizer enabled in compiler, but kernel is not configured for KCOV" #endif #if CONFIG_KCOV +#include + +/* Comparison type values */ +enum { + KCOV_CMP_CONST = 1, + KCOV_CMP_SIZE1 = 0, + KCOV_CMP_SIZE2 = 2, + KCOV_CMP_SIZE4 = 4, + KCOV_CMP_SIZE8 = 6, + + KCOV_CMP_FUNC_MEMCMP = 32, + KCOV_CMP_FUNC_STRCMP = 34, + KCOV_CMP_FUNC_STRNCMP = 36, + KCOV_CMP_FUNC_STRBUFCMP = 38, +}; + +#define KCOV_CMP_IS_FUNC(type) (type >= KCOV_CMP_FUNC_MEMCMP) + /* Forward declaration for types used in interfaces below. 
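/*
 * Worked example for __sanitizer_cov_trace_switch() above (illustrative only;
 * the layout follows the handler in kcov.c: cases[0] holds the number of case
 * constants, cases[1] the operand width in bits, and cases[2..] the constants
 * themselves). For a hypothetical
 *
 *     uint32_t x = ...;
 *     switch (x) { case 3: ...; case 7: ...; case 42: ...; }
 *
 * the compiler conceptually emits
 *
 *     static const uint64_t cases[] = { 3, 32, 3, 7, 42 };
 *     __sanitizer_cov_trace_switch(x, (uint64_t *)cases);
 *
 * and the handler records three (KCOV_CMP_SIZE4 | KCOV_CMP_CONST) entries, one
 * per case constant, each paired with the live value of x.
 */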
*/ typedef struct kcov_cpu_data kcov_cpu_data_t; typedef struct kcov_thread_data kcov_thread_data_t; @@ -56,11 +74,14 @@ void kcov_start_cpu(int cpuid); void kcov_panic_disable(void); /* per-thread */ -struct kcov_thread_data *kcov_get_thread_data(thread_t); +struct kcov_thread_data *kcov_get_thread_data(thread_t thread); void kcov_enable(void); void kcov_disable(void); +/* Comparison function tracing */ +void kcov_trace_cmp_func(void *caller_pc, uint32_t type, const void *s1, size_t s1len, const void *s2, size_t s2len, bool always_log); + /* * SanitizerCoverage ABI */ @@ -69,6 +90,15 @@ void __sanitizer_cov_trace_pc_guard(uint32_t *guard); void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop); void __sanitizer_cov_trace_pc_indirect(void *callee); void __sanitizer_cov_trace_pc(void); +void __sanitizer_cov_trace_cmp1(uint8_t arg1, uint8_t arg2); +void __sanitizer_cov_trace_cmp2(uint16_t arg1, uint16_t arg2); +void __sanitizer_cov_trace_cmp4(uint32_t arg1, uint32_t arg2); +void __sanitizer_cov_trace_cmp8(uint64_t arg1, uint64_t arg2); +void __sanitizer_cov_trace_const_cmp1(uint8_t arg1, uint8_t arg2); +void __sanitizer_cov_trace_const_cmp2(uint16_t arg1, uint16_t arg2); +void __sanitizer_cov_trace_const_cmp4(uint32_t arg1, uint32_t arg2); +void __sanitizer_cov_trace_const_cmp8(uint64_t arg1, uint64_t arg2); +void __sanitizer_cov_trace_switch(uint64_t val, uint64_t *cases); __END_DECLS diff --git a/san/coverage/kcov_data.h b/san/coverage/kcov_data.h index 91bdb38ad..4cc7890d2 100644 --- a/san/coverage/kcov_data.h +++ b/san/coverage/kcov_data.h @@ -48,7 +48,7 @@ struct kcov_cpu_data { struct kcov_thread_data { uint32_t ktd_disabled; /* disable sanitizer for a thread */ #if CONFIG_KSANCOV - ksancov_dev_t ktd_device; /* ksancov per-thread data */ + ksancov_dev_t ktd_device; /* ksancov per-thread data */ #endif #if CONFIG_STKSZ kcov_stksz_thread_t ktd_stksz; /* stack size per-thread data */ diff --git a/san/coverage/kcov_ksancov.c b/san/coverage/kcov_ksancov.c index 6028c1268..f73f60d04 100644 --- a/san/coverage/kcov_ksancov.c +++ b/san/coverage/kcov_ksancov.c @@ -78,7 +78,7 @@ typedef struct uthread * uthread_t; #include #define USE_PC_TABLE 0 -#define KSANCOV_MAX_DEV 64 +#define KSANCOV_MAX_DEV 128 #define KSANCOV_MAX_PCS (1024U * 64) /* default to 256k buffer => 64k pcs */ extern boolean_t ml_at_interrupt_context(void); @@ -119,6 +119,8 @@ struct ksancov_od_module_handle { uint32_t *start; /* guards boundaries */ uint32_t *stop; uint64_t *gate; /* pointer to __DATA,__sancov_gate*/ + uint64_t text_start; /* .text section start, stripped and unslided address */ + uint64_t text_end; /* .text section end, stripped and unslided address */ }; static struct ksancov_od_module_entry *ksancov_od_module_entries = NULL; @@ -281,23 +283,23 @@ kcov_ksancov_trace_guard(uint32_t *guardp, void *caller) return; } - /* - * Since this code was originally introduced, VM_KERNEL_UNSLIDE - * evolved significantly, and it now expands to a series of - * function calls that check whether the address is slid, mask - * off tags and ultimately unslide the pointer. - * - * Therefore we need to make sure that we do not instrument any function - * in the closure of VM_KERNEL_UNSLIDE: this would cause a loop where the - * instrumentation callbacks end up calling into instrumented code. 
- * - */ - uintptr_t pc = (uintptr_t)(VM_KERNEL_UNSLIDE(caller) - 1); - uint32_t gd = *guardp; if (__improbable(gd && !(gd & GUARD_SEEN))) { size_t idx = gd & GUARD_IDX_MASK; if (idx < ksancov_edgemap->ke_nedges) { + /* + * Since this code was originally introduced, VM_KERNEL_UNSLIDE + * evolved significantly, and it now expands to a series of + * function calls that check whether the address is slid, mask + * off tags and ultimately unslide the pointer. + * + * Therefore we need to make sure that we do not instrument any function + * in the closure of VM_KERNEL_UNSLIDE: this would cause a loop where the + * instrumentation callbacks end up calling into instrumented code. + * + */ + uintptr_t pc = (uintptr_t)(VM_KERNEL_UNSLIDE(caller) - 1); + ksancov_edgemap->ke_addrs[idx] = pc; *guardp |= GUARD_SEEN; } @@ -308,7 +310,7 @@ void kcov_ksancov_trace_pc(kcov_thread_data_t *data, uint32_t *guardp, void *caller, uintptr_t sp) { #pragma unused(sp) - uintptr_t pc = (uintptr_t)(VM_KERNEL_UNSLIDE(caller) - 1); + uintptr_t pc; ksancov_dev_t dev = data->ktd_device; /* Check that we have coverage recording enabled for a thread. */ @@ -326,10 +328,12 @@ kcov_ksancov_trace_pc(kcov_thread_data_t *data, uint32_t *guardp, void *caller, */ switch (dev->mode) { case KS_MODE_TRACE: + pc = (uintptr_t)(VM_KERNEL_UNSLIDE(caller) - 1); trace_pc_guard_pcs(dev, pc); break; #if CONFIG_STKSZ case KS_MODE_STKSIZE: + pc = (uintptr_t)(VM_KERNEL_UNSLIDE(caller) - 1); trace_pc_guard_pcs_stk(dev, pc, data->ktd_stksz.kst_stksz); break; #endif @@ -366,6 +370,126 @@ kcov_ksancov_trace_pc_guard_init(uint32_t *start, uint32_t *stop) } } +void +kcov_ksancov_trace_cmp(kcov_thread_data_t *data, uint32_t type, uint64_t arg1, uint64_t arg2, void *caller) +{ + ksancov_dev_t dev = data->ktd_device; + + /* Check that we have coverage recording enabled for a thread. */ + if (__probable(dev == NULL)) { + return; + } + + /* Check that we have cmps tracing enabled. */ + if (os_atomic_load(&dev->cmps_hdr, relaxed) == NULL) { + return; + } + if (os_atomic_load(&dev->cmps_hdr->kh_enabled, relaxed) == 0) { + return; + } + + /* + * Treat all unsupported tracing modes as no-op. It is not destructive for the kernel itself just + * coverage sanitiser will not record anything in such case. + */ + if (dev->cmps_mode != KS_CMPS_MODE_TRACE && dev->cmps_mode != KS_CMPS_MODE_TRACE_FUNC) { + return; + } + + if (__improbable(dev->cmps_sz < sizeof(ksancov_trace_t))) { + return; + } + size_t max_entries = (dev->cmps_sz - sizeof(ksancov_trace_t)) / sizeof(ksancov_cmps_trace_ent_t); + + if (os_atomic_load(&dev->cmps_trace->kt_head, relaxed) >= max_entries) { + return; /* overflow */ + } + + uint32_t idx = os_atomic_inc_orig(&dev->cmps_trace->kt_head, relaxed); + if (__improbable(idx >= max_entries)) { + return; + } + + uint64_t pc = (uint64_t)(VM_KERNEL_UNSLIDE(caller) - 1); + + ksancov_cmps_trace_ent_t *entries = (ksancov_cmps_trace_ent_t *)dev->cmps_trace->kt_entries; + entries[idx].pc = pc; + entries[idx].type = type; + entries[idx].args[0] = arg1; + entries[idx].args[1] = arg2; +} + +void +kcov_ksancov_trace_cmp_func(kcov_thread_data_t *data, uint32_t type, const void *arg1, size_t len1, const void *arg2, size_t len2, void *caller, bool always_log) +{ + if (len1 + len2 > KSANCOV_CMPS_TRACE_FUNC_MAX_BYTES) { + return; + } + + ksancov_dev_t dev = data->ktd_device; + + /* Check that we have coverage recording enabled for a thread. */ + if (__probable(dev == NULL)) { + return; + } + + /* Check that we have cmps tracing enabled. 
*/ + if (os_atomic_load(&dev->cmps_hdr, relaxed) == NULL) { + return; + } + if (os_atomic_load(&dev->cmps_hdr->kh_enabled, relaxed) == 0) { + return; + } + + /* + * Treat all unsupported tracing modes as no-op. It is not destructive for the kernel itself just + * coverage sanitiser will not record anything in such case. + */ + if (dev->cmps_mode != KS_CMPS_MODE_TRACE_FUNC) { + return; + } + + if (__improbable(dev->cmps_sz < sizeof(ksancov_trace_t))) { + return; + } + + size_t max_entries = (dev->cmps_sz - sizeof(ksancov_trace_t)) / sizeof(ksancov_cmps_trace_ent_t); + if (os_atomic_load(&dev->cmps_trace->kt_head, relaxed) >= max_entries) { + return; /* overflow */ + } + + uintptr_t addr = (uintptr_t)VM_KERNEL_UNSLIDE(caller); + if (!addr) { + return; + } + + if (!always_log && !kcov_ksancov_must_instrument((uintptr_t)caller)) { + return; + } + + uint32_t space = (uint32_t)ksancov_cmps_trace_func_space(len1, len2); + + uint32_t idx = os_atomic_add_orig(&dev->cmps_trace->kt_head, space / sizeof(ksancov_cmps_trace_ent_t), relaxed); + if (__improbable(idx >= max_entries)) { + return; + } + + uint64_t pc = (uint64_t)(addr - 1); + + ksancov_cmps_trace_ent_t *entries = (ksancov_cmps_trace_ent_t *)dev->cmps_trace->kt_entries; + + entries[idx].pc = pc; + entries[idx].type = type; + entries[idx].len1_func = (uint16_t)len1; + entries[idx].len2_func = (uint16_t)len2; + + uint8_t* func_args = (uint8_t*)entries[idx].args; + + __builtin_memcpy(func_args, arg1, len1); + __builtin_memcpy(&func_args[len1], arg2, len2); +} + + void kcov_ksancov_pcs_init(uintptr_t *start, uintptr_t *stop) { @@ -467,6 +591,8 @@ kcov_ksancov_bookmark_on_demand_module(uint32_t *start, uint32_t *stop) handle->start = start; handle->stop = stop; handle->gate = gate_section; + handle->text_start = (uint64_t)VM_KERNEL_UNSLIDE(summary.text_exec_address); + handle->text_end = (uint64_t)VM_KERNEL_UNSLIDE(summary.text_exec_address + summary.text_exec_size); strlcpy(entry->bundle, summary.name, sizeof(entry->bundle)); entry->idx = (uint32_t)idx; @@ -477,6 +603,43 @@ kcov_ksancov_bookmark_on_demand_module(uint32_t *start, uint32_t *stop) lck_mtx_unlock(&ksancov_od_lck); } +bool +kcov_ksancov_must_instrument(uintptr_t addr) +{ + /* + * If the kernel itself was not compiled with sanitizer coverage skip + * addresses from the kernel itself and focus on KEXTs only. + */ +#if __has_feature(coverage_sanitizer) + if (kernel_text_contains(addr)) { + return true; + } +#endif + + uintptr_t unslided_addr = (uintptr_t)VM_KERNEL_UNSLIDE(addr); + if (!unslided_addr) { + return false; + } + + /* + * Check that the address is in a KEXT and that the on demand gate is enabled + * NOTE: We don't use any lock here as we are reading: + * 1) atomically ksancov_od_modules_count, that can only increase + * 2) ksancov_od_module_handles[...] that are constant after being added to the + * array, with only the gate field changing + * 3) atomically the gate value + */ + unsigned int modules_count = os_atomic_load(&ksancov_od_modules_count, relaxed); + for (unsigned int idx = 0; idx < modules_count; idx++) { + struct ksancov_od_module_handle *handle = &ksancov_od_module_handles[idx]; + if (unslided_addr >= handle->text_start && unslided_addr < handle->text_end && handle->gate) { + return os_atomic_load(handle->gate, relaxed) != 0; + } + } + + return false; +} + /* * Coverage sanitizer pseudo-device code. 
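/*
 * Illustrative consumer-side sketch (assumed usage, not part of this patch):
 * walking the comparison entries written by kcov_ksancov_trace_cmp() and
 * kcov_ksancov_trace_cmp_func() above, assuming the mapped buffer and the
 * entry helpers from kcov_ksancov_data.h are visible to the caller. Fixed-size
 * comparisons occupy one slot; function comparisons span
 * ksancov_cmps_trace_func_space() bytes, so the cursor advances by a variable
 * number of slots.
 */
static void
ksancov_cmps_dump(ksancov_trace_t *trace, size_t max_entries)
{
	ksancov_cmps_trace_ent_t *ents = (ksancov_cmps_trace_ent_t *)trace->kt_entries;
	size_t head = trace->kt_head;
	if (head > max_entries) {
		head = max_entries;     /* the head index may overshoot on overflow */
	}

	for (size_t i = 0; i < head;) {
		ksancov_cmps_trace_ent_t *e = &ents[i];
		if (KCOV_CMP_IS_FUNC(e->type)) {
			/* variable-length entry: two byte buffers follow the fixed header */
			printf("pc 0x%llx func cmp, lengths %u/%u\n", e->pc, e->len1_func, e->len2_func);
			i += ksancov_cmps_trace_func_space(e->len1_func, e->len2_func) /
			    sizeof(ksancov_cmps_trace_ent_t);
		} else {
			printf("pc 0x%llx type %u args 0x%llx 0x%llx\n",
			    e->pc, e->type, e->args[0], e->args[1]);
			i++;
		}
	}
}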
*/ @@ -498,11 +661,14 @@ create_dev(dev_t dev) static void free_dev(ksancov_dev_t d) { - if (d->mode == KS_MODE_TRACE && d->trace) { + if ((d->mode == KS_MODE_TRACE || d->mode == KS_MODE_STKSIZE) && d->trace) { kmem_free(kernel_map, (uintptr_t)d->trace, d->sz); } else if (d->mode == KS_MODE_COUNTERS && d->counters) { kmem_free(kernel_map, (uintptr_t)d->counters, d->sz); } + if ((d->cmps_mode == KS_CMPS_MODE_TRACE || d->cmps_mode == KS_CMPS_MODE_TRACE_FUNC) && d->cmps_trace) { + kmem_free(kernel_map, (uintptr_t)d->cmps_trace, d->cmps_sz); + } lck_mtx_destroy(&d->lock, &ksancov_lck_grp); kfree_type(struct ksancov_dev, d); } @@ -648,7 +814,7 @@ ksancov_open(dev_t dev, int flags, int devtype, proc_t p) size_t sz = sizeof(struct ksancov_edgemap) + nedges * sizeof(uintptr_t); kern_return_t kr = kmem_alloc(kernel_map, &buf, sz, - KMA_DATA | KMA_ZERO | KMA_PERMANENT, VM_KERN_MEMORY_DIAG); + KMA_DATA_SHARED | KMA_ZERO | KMA_PERMANENT, VM_KERN_MEMORY_DIAG); if (kr) { printf("ksancov: failed to allocate edge addr map\n"); lck_rw_unlock_exclusive(&ksancov_devs_lck); @@ -691,7 +857,7 @@ ksancov_trace_alloc(ksancov_dev_t d, ksancov_mode_t mode, size_t maxpcs) } /* allocate the shared memory buffer */ - kern_return_t kr = kmem_alloc(kernel_map, &buf, sz, KMA_DATA | KMA_ZERO, + kern_return_t kr = kmem_alloc(kernel_map, &buf, sz, KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_DIAG); if (kr != KERN_SUCCESS) { return ENOMEM; @@ -723,7 +889,7 @@ ksancov_counters_alloc(ksancov_dev_t d) size_t sz = sizeof(struct ksancov_counters) + ksancov_edgemap->ke_nedges * sizeof(uint8_t); /* allocate the shared memory buffer */ - kern_return_t kr = kmem_alloc(kernel_map, &buf, sz, KMA_DATA | KMA_ZERO, + kern_return_t kr = kmem_alloc(kernel_map, &buf, sz, KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_DIAG); if (kr != KERN_SUCCESS) { return ENOMEM; @@ -813,6 +979,78 @@ ksancov_detach(ksancov_dev_t d) d->thread = THREAD_NULL; } +static int +ksancov_cmps_trace_alloc(ksancov_dev_t d, ksancov_cmps_mode_t mode, size_t maxcmps) +{ + if (d->cmps_mode != KS_CMPS_MODE_NONE) { + return EBUSY; /* cmps trace already created */ + } + assert(d->cmps_trace == NULL); + + uintptr_t buf; + size_t sz; + + if (mode == KS_CMPS_MODE_TRACE || mode == KS_CMPS_MODE_TRACE_FUNC) { + if (os_mul_and_add_overflow(maxcmps, sizeof(ksancov_cmps_trace_ent_t), + sizeof(struct ksancov_trace), &sz)) { + return EINVAL; + } + } else { + return EINVAL; + } + + /* allocate the shared memory buffer */ + kern_return_t kr = kmem_alloc(kernel_map, &buf, sz, KMA_DATA_SHARED | KMA_ZERO, + VM_KERN_MEMORY_DIAG); + if (kr != KERN_SUCCESS) { + return ENOMEM; + } + + struct ksancov_trace *cmps_trace = (struct ksancov_trace *)buf; + cmps_trace->kt_hdr.kh_magic = KSANCOV_CMPS_TRACE_MAGIC; + os_atomic_init(&cmps_trace->kt_head, 0); + os_atomic_init(&cmps_trace->kt_hdr.kh_enabled, 0); + cmps_trace->kt_maxent = (uint32_t)maxcmps; + + d->cmps_trace = cmps_trace; + d->cmps_sz = sz; + d->cmps_mode = mode; + + return 0; +} + +/* + * map the sancov comparisons buffer into the current process + */ +static int +ksancov_cmps_map(ksancov_dev_t d, uintptr_t *bufp, size_t *sizep) +{ + uintptr_t addr; + size_t size = d->cmps_sz; + + switch (d->cmps_mode) { + case KS_CMPS_MODE_TRACE: + case KS_CMPS_MODE_TRACE_FUNC: + if (!d->cmps_trace) { + return EINVAL; + } + addr = (uintptr_t)d->cmps_trace; + break; + default: + return EINVAL; /* not configured */ + } + + void *buf = ksancov_do_map(addr, size, VM_PROT_READ | VM_PROT_WRITE); + if (buf == NULL) { + return ENOMEM; + } + + *bufp = (uintptr_t)buf; + 
*sizep = size; + + return 0; +} + static int ksancov_close(dev_t dev, int flags, int devtype, proc_t p) { @@ -841,6 +1079,9 @@ ksancov_close(dev_t dev, int flags, int devtype, proc_t p) if (d->mode != KS_MODE_NONE && d->hdr != NULL) { os_atomic_store(&d->hdr->kh_enabled, 0, relaxed); /* stop tracing */ } + if (d->cmps_mode != KS_CMPS_MODE_NONE && d->cmps_hdr != NULL) { + os_atomic_store(&d->cmps_hdr->kh_enabled, 0, relaxed); /* stop tracing cmps */ + } ksancov_detach(d); free_dev(d); @@ -896,10 +1137,28 @@ ksancov_handle_on_demand_cmd(struct ksancov_on_demand_msg *kmsg) struct ksancov_od_module_entry *entry = NULL; struct ksancov_od_module_handle *handle = NULL; ksancov_on_demand_operation_t op = kmsg->operation; - int ret = 0; lck_mtx_lock(&ksancov_od_lck); + if (op == KS_OD_GET_BUNDLE) { + uint64_t pc = kmsg->pc; + for (unsigned int idx = 0; idx < ksancov_od_modules_count; idx++) { + entry = &ksancov_od_module_entries[idx]; + handle = &ksancov_od_module_handles[idx]; + + if (pc >= handle->text_start && pc < handle->text_end) { + strncpy(kmsg->bundle, entry->bundle, sizeof(kmsg->bundle)); + lck_mtx_unlock(&ksancov_od_lck); + return 0; + } + } + + lck_mtx_unlock(&ksancov_od_lck); + return EINVAL; + } + + int ret = 0; + /* find the entry/handle to the module */ for (unsigned int idx = 0; idx < ksancov_od_modules_count; idx++) { entry = &ksancov_od_module_entries[idx]; @@ -945,8 +1204,8 @@ ksancov_handle_on_demand_cmd(struct ksancov_on_demand_msg *kmsg) /* Get which range of the guards table covers the given module */ ksancov_od_log("ksancov: Range for '%s': %u, %u\n", kmsg->bundle, *handle->start, *(handle->stop - 1)); - kmsg->range.start = *handle->start; - kmsg->range.stop = *(handle->stop - 1); + kmsg->range.start = *handle->start & GUARD_IDX_MASK; + kmsg->range.stop = *(handle->stop - 1) & GUARD_IDX_MASK; break; default: ret = EINVAL; @@ -1016,6 +1275,19 @@ ksancov_ioctl(dev_t dev, unsigned long cmd, caddr_t _data, int fflag, proc_t p) case KSANCOV_IOC_TESTPANIC: ksancov_testpanic(*(uint64_t *)data); break; + case KSANCOV_IOC_CMPS_TRACE: + case KSANCOV_IOC_CMPS_TRACE_FUNC: + lck_mtx_lock(&d->lock); + ksancov_cmps_mode_t cmp_mode = (cmd == KSANCOV_IOC_CMPS_TRACE) ? KS_CMPS_MODE_TRACE : KS_CMPS_MODE_TRACE_FUNC; + ret = ksancov_cmps_trace_alloc(d, cmp_mode, *(size_t *)data); + lck_mtx_unlock(&d->lock); + break; + case KSANCOV_IOC_CMPS_MAP: + mcmd = (struct ksancov_buf_desc *)data; + lck_mtx_lock(&d->lock); + ret = ksancov_cmps_map(d, &mcmd->ptr, &mcmd->sz); + lck_mtx_unlock(&d->lock); + break; default: ret = EINVAL; break; diff --git a/san/coverage/kcov_ksancov.h b/san/coverage/kcov_ksancov.h index 401ba0c02..4e32df159 100644 --- a/san/coverage/kcov_ksancov.h +++ b/san/coverage/kcov_ksancov.h @@ -40,23 +40,30 @@ #define KSANCOV_PATH "/dev/" KSANCOV_DEVNODE /* Set mode */ -#define KSANCOV_IOC_TRACE _IOW('K', 1, size_t) /* number of pcs */ -#define KSANCOV_IOC_COUNTERS _IO('K', 2) -#define KSANCOV_IOC_STKSIZE _IOW('K', 3, size_t) /* number of pcs */ +#define KSANCOV_IOC_TRACE _IOW('K', 1, size_t) /* number of pcs */ +#define KSANCOV_IOC_COUNTERS _IO('K', 2) +#define KSANCOV_IOC_STKSIZE _IOW('K', 3, size_t) /* number of pcs */ /* Establish a shared mapping of the coverage buffer. */ -#define KSANCOV_IOC_MAP _IOWR('K', 8, struct ksancov_buf_desc) +#define KSANCOV_IOC_MAP _IOWR('K', 8, struct ksancov_buf_desc) /* Establish a shared mapping of the edge address buffer. 
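/*
 * Illustrative userspace sketch (assumed usage, not part of this patch):
 * setting up the comparison-trace mode handled by ksancov_ioctl() above.
 * KSANCOV_IOC_CMPS_TRACE sizes the buffer in comparison entries and
 * KSANCOV_IOC_CMPS_MAP maps it into the caller; recording then starts once the
 * header's kh_enabled flag is raised, mirroring the existing PC-trace flow.
 */
static ksancov_trace_t *
ksancov_cmps_setup(int fd, size_t max_cmps)
{
	if (ioctl(fd, KSANCOV_IOC_CMPS_TRACE, &max_cmps) != 0) {
		return NULL;    /* e.g. EBUSY if a comparison mode was already chosen */
	}

	struct ksancov_buf_desc mc = { 0 };
	if (ioctl(fd, KSANCOV_IOC_CMPS_MAP, &mc) != 0) {
		return NULL;
	}

	ksancov_trace_t *trace = (ksancov_trace_t *)mc.ptr;
	assert(trace->kt_hdr.kh_magic == KSANCOV_CMPS_TRACE_MAGIC);
	return trace;
}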
*/ -#define KSANCOV_IOC_MAP_EDGEMAP _IOWR('K', 9, struct ksancov_buf_desc) +#define KSANCOV_IOC_MAP_EDGEMAP _IOWR('K', 9, struct ksancov_buf_desc) /* Log the current thread */ -#define KSANCOV_IOC_START _IOW('K', 10, uintptr_t) -#define KSANCOV_IOC_NEDGES _IOR('K', 50, size_t) -#define KSANCOV_IOC_TESTPANIC _IOW('K', 20, uint64_t) +#define KSANCOV_IOC_START _IOW('K', 10, uintptr_t) +#define KSANCOV_IOC_NEDGES _IOR('K', 50, size_t) +#define KSANCOV_IOC_TESTPANIC _IOW('K', 20, uint64_t) /* Operations related to on-demand instrumentation */ -#define KSANCOV_IOC_ON_DEMAND _IOWR('K', 60, struct ksancov_on_demand_msg) +#define KSANCOV_IOC_ON_DEMAND _IOWR('K', 60, struct ksancov_on_demand_msg) + +/* Set comparison log mode */ +#define KSANCOV_IOC_CMPS_TRACE _IOW('K', 70, size_t) /* number of cmps */ +#define KSANCOV_IOC_CMPS_TRACE_FUNC _IOW('K', 71, size_t) /* number of cmps */ + +/* Establish a shared mapping of the comparisons buffer. */ +#define KSANCOV_IOC_CMPS_MAP _IOWR('K', 90, struct ksancov_buf_desc) /* * ioctl @@ -74,6 +81,7 @@ typedef enum { KS_OD_GET_GATE = 1, KS_OD_SET_GATE = 2, KS_OD_GET_RANGE = 3, + KS_OD_GET_BUNDLE = 4, } ksancov_on_demand_operation_t; struct ksancov_on_demand_msg { @@ -85,6 +93,7 @@ struct ksancov_on_demand_msg { uint32_t start; uint32_t stop; } range; + uint64_t pc; }; }; @@ -92,13 +101,13 @@ struct ksancov_on_demand_msg { * shared kernel-user mapping */ -#define KSANCOV_MAX_EDGES (1 << 24) -#define KSANCOV_MAX_HITS UINT8_MAX -#define KSANCOV_TRACE_MAGIC (uint32_t)0x5AD17F5BU -#define KSANCOV_COUNTERS_MAGIC (uint32_t)0x5AD27F6BU -#define KSANCOV_EDGEMAP_MAGIC (uint32_t)0x5AD37F7BU -#define KSANCOV_STKSIZE_MAGIC (uint32_t)0x5AD47F8BU - +#define KSANCOV_MAX_EDGES (1 << 24) +#define KSANCOV_MAX_HITS UINT8_MAX +#define KSANCOV_TRACE_MAGIC (uint32_t)0x5AD17F5BU +#define KSANCOV_COUNTERS_MAGIC (uint32_t)0x5AD27F6BU +#define KSANCOV_EDGEMAP_MAGIC (uint32_t)0x5AD37F7BU +#define KSANCOV_STKSIZE_MAGIC (uint32_t)0x5AD47F8BU +#define KSANCOV_CMPS_TRACE_MAGIC (uint32_t)0x5AD47F9BU __BEGIN_DECLS @@ -110,6 +119,9 @@ void kcov_ksancov_trace_guard(uint32_t *, void *); void kcov_ksancov_trace_pc(kcov_thread_data_t *, uint32_t *, void*, uintptr_t); void kcov_ksancov_trace_pc_guard_init(uint32_t *, uint32_t *); void kcov_ksancov_pcs_init(uintptr_t *, uintptr_t *); +void kcov_ksancov_trace_cmp(kcov_thread_data_t *, uint32_t, uint64_t, uint64_t, void*); +void kcov_ksancov_trace_cmp_func(kcov_thread_data_t *, uint32_t, const void*, size_t, const void*, size_t, void*, bool); +bool kcov_ksancov_must_instrument(uintptr_t); __END_DECLS @@ -121,6 +133,8 @@ __END_DECLS #define kcov_ksancov_trace_pc(dev, guardp, caller, sp) #define kcov_ksancov_trace_pc_guard_init(start, stop) #define kcov_ksancov_pcs_init(start, stop) +#define kcov_ksancov_trace_cmp(data, type, arg1, arg2, caller) +#define kcov_ksancov_trace_cmp_func(data, type, arg1, arg2, size, caller, always_log) #endif /* CONFIG_KSANCOV */ diff --git a/san/coverage/kcov_ksancov_data.h b/san/coverage/kcov_ksancov_data.h index 75ffc0eb4..cde357ae2 100644 --- a/san/coverage/kcov_ksancov_data.h +++ b/san/coverage/kcov_ksancov_data.h @@ -91,6 +91,58 @@ typedef struct ksancov_edgemap { uintptr_t ke_addrs[]; /* address of each edge relative to 'offset' */ } ksancov_edgemap_t; +/* + * Supported comparison logging modes. 
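/*
 * Illustrative userspace sketch (assumed usage, not part of this patch): how a
 * client could use the new KS_OD_GET_BUNDLE operation to map a recorded
 * (unslid) PC back to the kext that owns it. Only the ioctl number, the
 * message layout and the operation code come from this change; the helper
 * itself and its error handling are hypothetical.
 */
static int
ksancov_pc_to_bundle(int fd, uint64_t pc, char *buf, size_t buflen)
{
	struct ksancov_on_demand_msg msg = {
		.operation = KS_OD_GET_BUNDLE,
		.pc = pc,
	};

	if (ioctl(fd, KSANCOV_IOC_ON_DEMAND, &msg) != 0) {
		return errno;   /* EINVAL when no on-demand module covers pc */
	}

	strlcpy(buf, msg.bundle, buflen);
	return 0;
}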
+ */ +typedef enum { + KS_CMPS_MODE_NONE, + KS_CMPS_MODE_TRACE, + KS_CMPS_MODE_TRACE_FUNC, + KS_CMPS_MODE_MAX +} ksancov_cmps_mode_t; + +#define KSANCOV_CMPS_TRACE_FUNC_MAX_BYTES 512 + +/* CMPS TRACE mode tracks comparison values */ +typedef struct __attribute__((__packed__)) ksancov_cmps_trace_entry { + uint64_t pc; + uint32_t type; + uint16_t len1_func; + uint16_t len2_func; + union { + uint64_t args[2]; /* cmp instruction arguments */ + uint8_t args_func[0]; /* cmp function arguments (variadic) */ + }; +} ksancov_cmps_trace_ent_t; + +/* Calculate the total space that a ksancov_cmps_trace_ent_t tracing a function takes */ +static inline size_t +ksancov_cmps_trace_func_space(size_t len1_func, size_t len2_func) +{ + static_assert(sizeof(ksancov_cmps_trace_ent_t) == sizeof(uint64_t) * 3 + sizeof(uint32_t) + sizeof(uint16_t) * 2, "ksancov_cmps_trace_ent_t invalid size"); + + size_t size = sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint16_t) * 2; // header + size += len1_func + len2_func; + size_t rem = size % sizeof(ksancov_cmps_trace_ent_t); + if (rem == 0) { + return size; + } + return size + sizeof(ksancov_cmps_trace_ent_t) - rem; +} + +static inline uint8_t * +ksancov_cmps_trace_func_arg1(ksancov_cmps_trace_ent_t *entry) +{ + return entry->args_func; +} + +static inline uint8_t * +ksancov_cmps_trace_func_arg2(ksancov_cmps_trace_ent_t *entry) +{ + uint8_t* func_args = entry->args_func; + return &func_args[entry->len1_func]; +} + /* * Represents state of a ksancov device when userspace asks for coverage data recording. */ @@ -107,6 +159,14 @@ struct ksancov_dev { size_t maxpcs; + ksancov_cmps_mode_t cmps_mode; + + union { + ksancov_header_t *cmps_hdr; + ksancov_trace_t *cmps_trace; + }; + size_t cmps_sz; /* size of allocated cmps trace buffer */ + thread_t thread; dev_t dev; lck_mtx_t lock; diff --git a/san/memory/Makefile b/san/memory/Makefile index 6fed0a42c..1111613ff 100644 --- a/san/memory/Makefile +++ b/san/memory/Makefile @@ -35,12 +35,12 @@ INSTALL_MI_DIR = san EXPORT_MI_DIR = san .DELETE_ON_ERROR: -$(OBJROOT)/san/kasan-blacklist-%: $(SOURCE)/kasan-blacklist $(SOURCE)/ubsan-blacklist $(SOURCE)/kasan-blacklist-% +$(OBJROOT)/san/kasan-denylist-%: $(SOURCE)/kasan-denylist $(SOURCE)/ubsan-denylist $(SOURCE)/kasan-denylist-% @$(LOG_GENERATE) "$(notdir $@)" $(_v)sed -e 's,^src:\./,src:'"$(SRCROOT)/," $^ > $@ - $(_v)$(SRCROOT)/san/tools/validate_blacklist.sh "$@" + $(_v)$(SRCROOT)/san/tools/validate_denylist.sh "$@" -do_build_setup:: $(OBJROOT)/san/kasan-blacklist-x86_64 $(OBJROOT)/san/kasan-blacklist-arm64 +do_build_setup:: $(OBJROOT)/san/kasan-denylist-x86_64 $(OBJROOT)/san/kasan-denylist-arm64 # # Kasan System.kext plugin diff --git a/san/memory/kasan-arm64.c b/san/memory/kasan-arm64.c index d6eba12f6..2a3e94324 100644 --- a/san/memory/kasan-arm64.c +++ b/san/memory/kasan-arm64.c @@ -195,19 +195,19 @@ kasan_arm64_align_to_page(vm_offset_t *addrp, vm_offset_t *sizep) static uint64_t * kasan_arm64_lookup_l1(uint64_t *base, vm_offset_t address) { - return base + ((address & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); + return base + L1_TABLE_T1_INDEX(address, TCR_EL1_BOOT); } static uint64_t * kasan_arm64_lookup_l2(uint64_t *base, vm_offset_t address) { - return base + ((address & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); + return base + L2_TABLE_INDEX(address); } static uint64_t * kasan_arm64_lookup_l3(uint64_t *base, vm_offset_t address) { - return base + ((address & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT); + return base + L3_TABLE_INDEX(address); } /* @@ -384,7 +384,7 @@ 
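/*
 * Worked example for ksancov_cmps_trace_func_space() above (illustrative
 * only). sizeof(ksancov_cmps_trace_ent_t) is 32 bytes and the fixed header
 * (pc + type + the two lengths) is 16 bytes, so:
 *
 *     space(5, 7)    = round_up(16 + 5 + 7, 32)   = 32  -> 1 slot
 *     space(24, 24)  = round_up(16 + 24 + 24, 32) = 64  -> 2 slots
 *     space(100, 60) = round_up(16 + 160, 32)     = 192 -> 6 slots
 *
 * kcov_ksancov_trace_cmp_func() advances kt_head by space / 32 slots, which is
 * why a single function-comparison record can consume several consecutive
 * entries.
 */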
kasan_arch_init(void) /* Map the physical aperture */ kasan_map_shadow(physmap_vbase, physmap_vtop - physmap_vbase, true); -#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) /* Pre-allocate all the L3 page table pages to avoid triggering KTRR */ kasan_map_shadow_internal(VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS + 1, KASAN_ARM64_PREALLOCATE_TRANSLATION); diff --git a/san/memory/kasan-classic.h b/san/memory/kasan-classic.h index 1aa34ba15..275d86ae3 100644 --- a/san/memory/kasan-classic.h +++ b/san/memory/kasan-classic.h @@ -46,7 +46,7 @@ typedef uintptr_t uptr; #define KASAN_DEBUG 0 -#define KASAN_DYNAMIC_BLACKLIST 1 +#define KASAN_DYNAMIC_DENYLIST 1 #define KASAN_FAKESTACK 1 /* * KASAN features and config diff --git a/san/memory/kasan-blacklist b/san/memory/kasan-denylist similarity index 93% rename from san/memory/kasan-blacklist rename to san/memory/kasan-denylist index a58341448..1fd44abf3 100644 --- a/san/memory/kasan-blacklist +++ b/san/memory/kasan-denylist @@ -1,11 +1,9 @@ -#!special-case-list-v1 -# rdar://139815990 # This file declares the list of source files that should be exempt from # AddressSanitizer instrumentation. Usually, this is because a file is used by # the AddressSanitizer runtime itself, or because the code executes before # the runtime has been initialized. -[.*] +[*] # Blanket ignore non-sanitized functions fun:__nosan_* @@ -14,7 +12,7 @@ fun:__nosan_* src:./osfmk/kdp/* src:./osfmk/kern/debug.c -[kernel-hwaddress|address] +[{kernel-hwaddress,address}] # Exclude linker sets type:struct linker_set_entry @@ -37,7 +35,7 @@ src:./san/memory/kasan.c src:./san/memory/kasan-fakestack.c src:./san/memory/kasan-x86_64.c src:./san/memory/kasan-memintrinsics.c -src:./san/memory/kasan_dynamic_blacklist.c +src:./san/memory/kasan_dynamic_denylist.c src:./san/memory/kasan-classic.c src:./san/memory/kasan-tbi.c src:./san/memory/kasan-helper.c diff --git a/san/memory/kasan-blacklist-arm64 b/san/memory/kasan-denylist-arm64 similarity index 90% rename from san/memory/kasan-blacklist-arm64 rename to san/memory/kasan-denylist-arm64 index e8addf24d..99cfc768f 100644 --- a/san/memory/kasan-blacklist-arm64 +++ b/san/memory/kasan-denylist-arm64 @@ -1,8 +1,6 @@ -#!special-case-list-v1 -# rdar://139815990 -# ARM64 specific blacklist +# ARM64 specific denylist -[kernel-hwaddress|address] +[{kernel-hwaddress,address}] # Exclude KASan runtime src:./san/memory/kasan-arm64.c diff --git a/san/memory/kasan-blacklist-dynamic b/san/memory/kasan-denylist-dynamic similarity index 80% rename from san/memory/kasan-blacklist-dynamic rename to san/memory/kasan-denylist-dynamic index 768aa0325..14a5f18d7 100644 --- a/san/memory/kasan-blacklist-dynamic +++ b/san/memory/kasan-denylist-dynamic @@ -1,5 +1,3 @@ -#!special-case-list-v1 -# rdar://139815990 # entry = :: # # = [ kfree zfree fsfree memr memw strr strw read write rw free mem str poison ] @@ -14,5 +12,5 @@ __kernel__:_ZN6OSKext8copyInfoEP7OSArray:memr # For unit-testing KASan -__kernel__:test_blacklist:test -__kernel__:test_blacklist_str:memr +__kernel__:test_denylist:test +__kernel__:test_denylist_str:memr diff --git a/san/memory/kasan-blacklist-x86_64 b/san/memory/kasan-denylist-x86_64 similarity index 96% rename from san/memory/kasan-blacklist-x86_64 rename to san/memory/kasan-denylist-x86_64 index 52bf7a0cf..40c9ca93f 100644 --- a/san/memory/kasan-blacklist-x86_64 +++ 
b/san/memory/kasan-denylist-x86_64 @@ -1,6 +1,4 @@ -#!special-case-list-v1 -# rdar://139815990 -# x86_64 specific blacklist +# x86_64 specific denylist [address] diff --git a/san/memory/kasan-memintrinsics.c b/san/memory/kasan-memintrinsics.c index 5658b054d..b138d4b01 100644 --- a/san/memory/kasan-memintrinsics.c +++ b/san/memory/kasan-memintrinsics.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "kasan_internal.h" #include "memintrinsics.h" @@ -79,6 +80,9 @@ __asan_bcmp(const void *a, const void *b, size_t len) { kasan_check_range(a, len, TYPE_MEMR); kasan_check_range(b, len, TYPE_MEMR); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_MEMCMP, a, len, b, len, false); +#endif return __nosan_bcmp(a, b, len); } @@ -87,6 +91,9 @@ __asan_memcmp(const void *a, const void *b, size_t n) { kasan_check_range(a, n, TYPE_MEMR); kasan_check_range(b, n, TYPE_MEMR); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_MEMCMP, a, n, b, n, false); +#endif return __nosan_memcmp(a, b, n); } @@ -121,8 +128,12 @@ __asan_strncat(char *dst, const char *src, size_t sz) size_t __asan_strnlen(const char *src, size_t sz) { + size_t n = __nosan_strnlen(src, sz); + if (n < sz) { + sz = n + 1; // Include NUL + } kasan_check_range(src, sz, TYPE_STRR); - return __nosan_strnlen(src, sz); + return n; } size_t @@ -132,3 +143,117 @@ __asan_strlen(const char *src) kasan_check_range(src, sz + 1, TYPE_STRR); return sz; } + +int +__asan_strcmp(const char *__null_terminated s1, const char *__null_terminated s2) +{ + size_t l1 = __asan_strlen(s1); + size_t l2 = __asan_strlen(s2); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_STRCMP, s1, l1, s2, l2, false); +#else + (void)l1; + (void)l2; +#endif + return __nosan_strcmp(s1, s2); +} + +__ptrcheck_unavailable_r("strlcmp or strbufcmp") +int +__asan_strncmp(const char *__unsafe_indexable s1, const char *__unsafe_indexable s2, size_t n) +{ + size_t l1 = __asan_strnlen(s1, n); + size_t l2 = __asan_strnlen(s2, n); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_STRNCMP, s1, l1, s2, l2, false); +#else + (void)l1; + (void)l2; +#endif + return __nosan_strncmp(s1, s2, n); +} + +int +__asan_strlcmp(const char *__counted_by(n)s1, const char *s2, size_t n) +{ + size_t l1 = __asan_strnlen(s1, n); + size_t l2 = __asan_strlen(s2); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_STRNCMP, s1, l1, s2, l2, false); +#else + (void)l1; + (void)l2; +#endif + return __nosan_strlcmp(s1, s2, n); +} + +int +__asan_strbufcmp(const char *__counted_by(s1len)s1, size_t s1len, const char *__counted_by(s2len)s2, size_t s2len) +{ + size_t l1 = __asan_strnlen(s1, s1len); + size_t l2 = __asan_strnlen(s2, s2len); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_STRBUFCMP, s1, l1, s2, l2, false); +#else + (void)l1; + (void)l2; +#endif + return __nosan_strbufcmp(s1, s1len, s2, s2len); +} + +int +__asan_strcasecmp(const char *__null_terminated s1, const char *__null_terminated s2) +{ + size_t l1 = __asan_strlen(s1); + size_t l2 = __asan_strlen(s2); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_STRCMP, s1, l1, s2, l2, false); +#else + (void)l1; + (void)l2; +#endif + return __nosan_strcasecmp(s1, s2); +} + +__ptrcheck_unavailable_r("strlcasecmp or strbufcasecmp") +int +__asan_strncasecmp(const char *__unsafe_indexable s1, const char *__unsafe_indexable s2, size_t n) +{ + 
size_t l1 = __asan_strnlen(s1, n); + size_t l2 = __asan_strnlen(s2, n); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_STRNCMP, s1, l1, s2, l2, false); +#else + (void)l1; + (void)l2; +#endif + return __nosan_strncasecmp(s1, s2, n); +} + +int +__asan_strlcasecmp(const char *__counted_by(n)s1, const char *s2, size_t n) +{ + size_t l1 = __asan_strnlen(s1, n); + size_t l2 = __asan_strlen(s2); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_STRNCMP, s1, l1, s2, l2, false); +#else + (void)l1; + (void)l2; +#endif + return __nosan_strlcasecmp(s1, s2, n); +} + +int +__asan_strbufcasecmp(const char *__counted_by(s1len)s1, size_t s1len, const char *__counted_by(s2len)s2, size_t s2len) +{ + size_t l1 = __asan_strnlen(s1, s1len); + size_t l2 = __asan_strnlen(s2, s2len); +#if CONFIG_KCOV + kcov_trace_cmp_func(__builtin_return_address(0), KCOV_CMP_FUNC_STRBUFCMP, s1, l1, s2, l2, false); +#else + (void)l1; + (void)l2; +#endif + return __nosan_strbufcasecmp(s1, s1len, s2, s2len); +} diff --git a/san/memory/kasan-report.c b/san/memory/kasan-report.c index 49182ff43..6f547a588 100644 --- a/san/memory/kasan-report.c +++ b/san/memory/kasan-report.c @@ -166,7 +166,7 @@ kasan_log_report(uptr p, uptr width, access_t access, violation_t reason) } /* - * Report a violation that may be disabled and/or blacklisted. This can only be + * Report a violation that may be disabled and/or denylisted. This can only be * called for dynamic checks (i.e. where the fault is recoverable). Use * kasan_crash_report() for static (unrecoverable) violations. * diff --git a/san/memory/kasan-tbi-arm64.h b/san/memory/kasan-tbi-arm64.h index 9d062756b..e39a362cf 100644 --- a/san/memory/kasan-tbi-arm64.h +++ b/san/memory/kasan-tbi-arm64.h @@ -48,10 +48,10 @@ #if defined(ARM_LARGE_MEMORY) #define KASAN_SHADOW_MIN (VM_MAX_KERNEL_ADDRESS+1) #define KASAN_SHADOW_MAX 0xffffffffffffffffULL -#elif defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) +#elif defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) #define KASAN_SHADOW_MIN 0xfffffffdc0000000ULL #define KASAN_SHADOW_MAX 0xffffffffc0000000ULL -#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */ +#else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */ #define KASAN_SHADOW_MIN 0xfffffffe00000000ULL #define KASAN_SHADOW_MAX 0xffffffffc0000000ULL #endif diff --git a/san/memory/kasan-tbi.h b/san/memory/kasan-tbi.h index 54adf403a..894fdf904 100644 --- a/san/memory/kasan-tbi.h +++ b/san/memory/kasan-tbi.h @@ -42,7 +42,7 @@ /* old-style configs. 
*/ #define KASAN_DEBUG 0 -#define KASAN_DYNAMIC_BLACKLIST 1 +#define KASAN_DYNAMIC_DENYLIST 1 #define KASAN_FAKESTACK 0 /* Granularity is 16 bytes */ diff --git a/san/memory/kasan-test.c b/san/memory/kasan-test.c index 22bd398e0..fa4191c21 100644 --- a/san/memory/kasan-test.c +++ b/san/memory/kasan-test.c @@ -459,22 +459,22 @@ test_strncat(struct kasan_test *t) /* we ignore the top *two* frames in backtrace - so add an extra one */ static int OS_NOINLINE -test_blacklist_helper(void) +test_denylist_helper(void) { - return kasan_is_blacklisted(TYPE_TEST); + return kasan_is_denylisted(TYPE_TEST); } static int OS_NOINLINE -test_blacklist(struct kasan_test *t) +test_denylist(struct kasan_test *t) { TEST_START(t); - int res = (int)!test_blacklist_helper(); + int res = (int)!test_denylist_helper(); TEST_DONE(t, res); return 0; } static int OS_NOINLINE -test_blacklist_str(struct kasan_test *t) +test_denylist_str(struct kasan_test *t) { TEST_START(t); char a1[8]; @@ -570,8 +570,8 @@ static struct kasan_test xnu_tests[] = { DECLARE_TEST(test_strlcat, "strlcat"), DECLARE_TEST(test_strncpy, "strncpy"), DECLARE_TEST(test_strncat, "strncat"), - DECLARE_TEST(test_blacklist, "blacklist"), - DECLARE_TEST(test_blacklist_str, "blacklist_str"), + DECLARE_TEST(test_denylist, "denylist"), + DECLARE_TEST(test_denylist_str, "denylist_str"), DECLARE_TEST(test_fakestack, "fakestack"), }; static int num_xnutests = sizeof(xnu_tests) / sizeof(xnu_tests[0]); diff --git a/san/memory/kasan.c b/san/memory/kasan.c index 525863af4..10aec024e 100644 --- a/san/memory/kasan.c +++ b/san/memory/kasan.c @@ -118,7 +118,7 @@ kasan_lock_held(thread_t thread) bool kasan_check_enabled(access_t access) { - return kasan_enabled && (kasan_enabled_checks & access) && !kasan_is_blacklisted(access); + return kasan_enabled && (kasan_enabled_checks & access) && !kasan_is_denylisted(access); } void @@ -187,8 +187,8 @@ kasan_load_kext(vm_offset_t base, vm_size_t __unused size, const void *bundleid) unsigned long sectsz; void *sect; -#if KASAN_DYNAMIC_BLACKLIST - kasan_dybl_load_kext(base, bundleid); +#if KASAN_DYNAMIC_DENYLIST + kasan_dyn_denylist_load_kext(base, bundleid); #endif /* find the kasan globals segment/section */ @@ -212,8 +212,8 @@ kasan_unload_kext(vm_offset_t base, vm_size_t size) kexts_loaded--; } -#if KASAN_DYNAMIC_BLACKLIST - kasan_dybl_unload_kext(base); +#if KASAN_DYNAMIC_DENYLIST + kasan_dyn_denylist_unload_kext(base); #endif } @@ -263,8 +263,8 @@ kasan_init_xnu_globals(void) void NOINLINE kasan_late_init(void) { -#if KASAN_DYNAMIC_BLACKLIST - kasan_init_dybl(); +#if KASAN_DYNAMIC_DENYLIST + kasan_init_dyn_denylist(); #endif kasan_init_xnu_globals(); kasan_impl_late_init(); @@ -442,4 +442,4 @@ SYSCTL_COMPAT_UINT(_kern_kasan, OID_AUTO, kalloc, CTLFLAG_RD, NULL, 1, ""); SYSCTL_COMPAT_UINT(_kern_kasan, OID_AUTO, zalloc, CTLFLAG_RD, NULL, 0, ""); SYSCTL_COMPAT_UINT(_kern_kasan, OID_AUTO, kalloc, CTLFLAG_RD, NULL, 0, ""); #endif -SYSCTL_COMPAT_UINT(_kern_kasan, OID_AUTO, dynamicbl, CTLFLAG_RD, NULL, KASAN_DYNAMIC_BLACKLIST, ""); +SYSCTL_COMPAT_UINT(_kern_kasan, OID_AUTO, dynamicbl, CTLFLAG_RD, NULL, KASAN_DYNAMIC_DENYLIST, ""); diff --git a/san/memory/kasan_dynamic_blacklist.c b/san/memory/kasan_dynamic_denylist.c similarity index 72% rename from san/memory/kasan_dynamic_blacklist.c rename to san/memory/kasan_dynamic_denylist.c index c6d3656b8..0e5bfd885 100644 --- a/san/memory/kasan_dynamic_blacklist.c +++ b/san/memory/kasan_dynamic_denylist.c @@ -17,14 +17,14 @@ #include "kasan.h" #include "kasan_internal.h" -#if 
KASAN_DYNAMIC_BLACKLIST +#if KASAN_DYNAMIC_DENYLIST #define MAX_FRAMES 8 #define HASH_NBUCKETS 128U #define HASH_MASK (HASH_NBUCKETS-1) #define HASH_CACHE_NENTRIES 128 -struct blacklist_entry { +struct denylist_entry { const char *kext_name; const char *func_name; access_t type_mask; @@ -33,47 +33,47 @@ struct blacklist_entry { uint64_t count; }; -#include "kasan_blacklist_dynamic.h" -/* defines 'blacklist' and 'blacklist_entries' */ +#include "kasan_denylist_dynamic.h" +/* defines 'denylist' and 'denylist_entries' */ -decl_simple_lock_data(static, _dybl_lock); -static access_t blacklisted_types; /* bitmap of access types with blacklist entries */ +decl_simple_lock_data(static, _dyn_denylist_lock); +static access_t denylisted_types; /* bitmap of access types with denylist entries */ static void -dybl_lock(boolean_t *b) +dyn_denylist_lock(boolean_t *b) { *b = ml_set_interrupts_enabled(false); - simple_lock(&_dybl_lock, LCK_GRP_NULL); + simple_lock(&_dyn_denylist_lock, LCK_GRP_NULL); } static void -dybl_unlock(boolean_t b) +dyn_denylist_unlock(boolean_t b) { - simple_unlock(&_dybl_lock); + simple_unlock(&_dyn_denylist_lock); ml_set_interrupts_enabled(b); } /* - * blacklist call site hash table + * denylist call site hash table */ -struct blacklist_hash_entry { - SLIST_ENTRY(blacklist_hash_entry) chain; // next element in chain - struct blacklist_entry *ble; // blacklist entry that this caller is an instance of +struct denylist_hash_entry { + SLIST_ENTRY(denylist_hash_entry) chain; // next element in chain + struct denylist_entry *dle; // denylist entry that this caller is an instance of uintptr_t addr; // callsite address uint64_t count; // hit count }; struct hash_chain_head { - SLIST_HEAD(, blacklist_hash_entry); + SLIST_HEAD(, denylist_hash_entry); }; unsigned cache_next_entry = 0; -struct blacklist_hash_entry blhe_cache[HASH_CACHE_NENTRIES]; +struct denylist_hash_entry dlhe_cache[HASH_CACHE_NENTRIES]; struct hash_chain_head hash_buckets[HASH_NBUCKETS]; -static struct blacklist_hash_entry * +static struct denylist_hash_entry * alloc_hash_entry(void) { unsigned idx = cache_next_entry++; @@ -81,7 +81,7 @@ alloc_hash_entry(void) cache_next_entry = HASH_CACHE_NENTRIES; // avoid overflow return NULL; } - return &blhe_cache[idx]; + return &dlhe_cache[idx]; } static unsigned @@ -91,38 +91,38 @@ hash_addr(uintptr_t addr) return (unsigned)addr & HASH_MASK; } -static struct blacklist_hash_entry * -blacklist_hash_lookup(uintptr_t addr) +static struct denylist_hash_entry * +denylist_hash_lookup(uintptr_t addr) { unsigned idx = hash_addr(addr); - struct blacklist_hash_entry *blhe; + struct denylist_hash_entry *dlhe; - SLIST_FOREACH(blhe, &hash_buckets[idx], chain) { - if (blhe->addr == addr) { - return blhe; + SLIST_FOREACH(dlhe, &hash_buckets[idx], chain) { + if (dlhe->addr == addr) { + return dlhe; } } return NULL; } -static struct blacklist_hash_entry * -blacklist_hash_add(uintptr_t addr, struct blacklist_entry *ble) +static struct denylist_hash_entry * +denylist_hash_add(uintptr_t addr, struct denylist_entry *dle) { unsigned idx = hash_addr(addr); - struct blacklist_hash_entry *blhe = alloc_hash_entry(); - if (!blhe) { + struct denylist_hash_entry *dlhe = alloc_hash_entry(); + if (!dlhe) { return NULL; } - blhe->ble = ble; - blhe->addr = addr; - blhe->count = 1; + dlhe->dle = dle; + dlhe->addr = addr; + dlhe->count = 1; - SLIST_INSERT_HEAD(&hash_buckets[idx], blhe, chain); + SLIST_INSERT_HEAD(&hash_buckets[idx], dlhe, chain); - return blhe; + return dlhe; } static void @@ -130,7 +130,7 @@ 
hash_drop(void) { if (cache_next_entry > 0) { bzero(&hash_buckets, sizeof(hash_buckets)); - bzero(&blhe_cache, sizeof(struct blacklist_hash_entry) * cache_next_entry); + bzero(&dlhe_cache, sizeof(struct denylist_hash_entry) * cache_next_entry); cache_next_entry = 0; } } @@ -146,7 +146,7 @@ struct range_tree_entry { struct { uint64_t size : 63; - uint64_t accessed : 1; // blacklist entry exists in this range + uint64_t accessed : 1; // denylist entry exists in this range }; /* kext name */ @@ -187,7 +187,7 @@ RB_GENERATE(range_tree, range_tree_entry, tree, range_tree_cmp); /* for each executable section, insert a range tree entry */ void -kasan_dybl_load_kext(uintptr_t addr, const char *kextname) +kasan_dyn_denylist_load_kext(uintptr_t addr, const char *kextname) { int i; @@ -217,9 +217,9 @@ kasan_dybl_load_kext(uintptr_t addr, const char *kextname) e->mh = mh; boolean_t flag; - dybl_lock(&flag); + dyn_denylist_lock(&flag); RB_INSERT(range_tree, &range_tree_root, e); - dybl_unlock(flag); + dyn_denylist_unlock(flag); } } @@ -228,7 +228,7 @@ kasan_dybl_load_kext(uintptr_t addr, const char *kextname) } void -kasan_dybl_unload_kext(uintptr_t addr) +kasan_dyn_denylist_unload_kext(uintptr_t addr) { int i; @@ -251,16 +251,16 @@ kasan_dybl_unload_kext(uintptr_t addr) struct range_tree_entry key = { .base = seg->vmaddr, .size = 0 }; struct range_tree_entry *e; boolean_t flag; - dybl_lock(&flag); + dyn_denylist_lock(&flag); e = RB_FIND(range_tree, &range_tree_root, &key); if (e) { RB_REMOVE(range_tree, &range_tree_root, e); if (e->accessed) { - /* there was a blacklist entry in this range */ + /* there was a denylist entry in this range */ hash_drop(); } } - dybl_unlock(flag); + dyn_denylist_unlock(flag); kfree_type(struct range_tree_entry, e); } @@ -347,7 +347,7 @@ addr_to_func(uintptr_t addr, const kernel_mach_header_t *mh) } bool OS_NOINLINE -kasan_is_blacklisted(access_t type) +kasan_is_denylisted(access_t type) { uint32_t nframes = 0; uintptr_t frames[MAX_FRAMES]; @@ -355,8 +355,8 @@ kasan_is_blacklisted(access_t type) assert(__builtin_popcount(type) == 1); - if ((type & blacklisted_types) == 0) { - /* early exit for types with no blacklist entries */ + if ((type & denylisted_types) == 0) { + /* early exit for types with no denylist entries */ return false; } @@ -372,25 +372,25 @@ kasan_is_blacklisted(access_t type) bt += 1; } - struct blacklist_hash_entry *blhe = NULL; + struct denylist_hash_entry *dlhe = NULL; - dybl_lock(&flag); + dyn_denylist_lock(&flag); /* First check if any frame hits in the hash */ for (uint32_t i = 0; i < nframes; i++) { - blhe = blacklist_hash_lookup(bt[i]); - if (blhe) { - if ((blhe->ble->type_mask & type) != type) { + dlhe = denylist_hash_lookup(bt[i]); + if (dlhe) { + if ((dlhe->dle->type_mask & type) != type) { /* wrong type */ continue; } /* hit */ - blhe->count++; - blhe->ble->count++; - // printf("KASan: blacklist cache hit (%s:%s [0x%lx] 0x%x)\n", - // ble->kext_name ?: "" , ble->func_name ?: "", VM_KERNEL_UNSLIDE(bt[i]), mask); - dybl_unlock(flag); + dlhe->count++; + dlhe->dle->count++; + // printf("KASan: denylist cache hit (%s:%s [0x%lx] 0x%x)\n", + // dle->kext_name ?: "" , dle->func_name ?: "", VM_KERNEL_UNSLIDE(bt[i]), mask); + dyn_denylist_unlock(flag); return true; } } @@ -421,35 +421,35 @@ kasan_is_blacklisted(access_t type) // printf("%s: a = 0x%016lx,0x%016lx f = %s, k = %s\n", __func__, bt[i], VM_KERNEL_UNSLIDE(bt[i]), funcname, kextname); - /* check if kextname or funcname are in the blacklist */ - for (size_t j = 0; j < blacklist_entries; j++) { - 
struct blacklist_entry *ble = &blacklist[j]; + /* check if kextname or funcname are in the denylist */ + for (size_t j = 0; j < denylist_entries; j++) { + struct denylist_entry *dle = &denylist[j]; uint64_t count; - if ((ble->type_mask & type) != type) { + if ((dle->type_mask & type) != type) { /* wrong type */ continue; } - if (ble->kext_name && kextname && strncmp(kextname, ble->kext_name, KMOD_MAX_NAME) != 0) { + if (dle->kext_name && kextname && strncmp(kextname, dle->kext_name, KMOD_MAX_NAME) != 0) { /* wrong kext name */ continue; } - if (ble->func_name && funcname && strncmp(funcname, ble->func_name, 128) != 0) { + if (dle->func_name && funcname && strncmp(funcname, dle->func_name, 128) != 0) { /* wrong func name */ continue; } /* found a matching function or kext */ - blhe = blacklist_hash_add(bt[i], ble); - count = ble->count++; + dlhe = denylist_hash_add(bt[i], dle); + count = dle->count++; e->accessed = 1; - dybl_unlock(flag); + dyn_denylist_unlock(flag); if (count == 0) { - printf("KASan: ignoring blacklisted violation (%s:%s [0x%lx] %d 0x%x)\n", + printf("KASan: ignoring denylisted violation (%s:%s [0x%lx] %d 0x%x)\n", kextname, funcname, VM_KERNEL_UNSLIDE(bt[i]), i, type); } @@ -457,18 +457,18 @@ kasan_is_blacklisted(access_t type) } } - dybl_unlock(flag); + dyn_denylist_unlock(flag); return false; } static void -add_blacklist_entry(const char *kext, const char *func, access_t type) +add_denylist_entry(const char *kext, const char *func, access_t type) { assert(kext || func); - struct blacklist_entry *ble = &blacklist[blacklist_entries++]; + struct denylist_entry *dle = &denylist[denylist_entries++]; - if (blacklist_entries > blacklist_max_entries) { - panic("KASan: dynamic blacklist entries exhausted"); + if (denylist_entries > denylist_max_entries) { + panic("KASan: dynamic denylist entries exhausted"); } if (kext) { @@ -476,7 +476,7 @@ add_blacklist_entry(const char *kext, const char *func, access_t type) if (sz > 1) { char *s = zalloc_permanent(sz, ZALIGN_NONE); __nosan_strlcpy(s, kext, sz); - ble->kext_name = s; + dle->kext_name = s; } } @@ -485,11 +485,11 @@ add_blacklist_entry(const char *kext, const char *func, access_t type) if (sz > 1) { char *s = zalloc_permanent(sz, ZALIGN_NONE); __nosan_strlcpy(s, func, sz); - ble->func_name = s; + dle->func_name = s; } } - ble->type_mask = type; + dle->type_mask = type; } #define TS(x) { .type = TYPE_##x, .str = #x } @@ -540,17 +540,17 @@ map_type(const char *str) } } - printf("KASan: unknown blacklist type `%s', assuming `normal'\n", str); + printf("KASan: unknown denylist type `%s', assuming `normal'\n", str); return TYPE_NORMAL; } void -kasan_init_dybl(void) +kasan_init_dyn_denylist(void) { - simple_lock_init(&_dybl_lock, 0); + simple_lock_init(&_dyn_denylist_lock, 0); /* - * dynamic blacklist entries via boot-arg. Syntax is: + * dynamic denylist entries via boot-arg. Syntax is: * kasan.bl=kext1:func1:type1,kext2:func2:type2,... 
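 * For example (an illustrative combination of the entries already present in
 * kasan-denylist-dynamic):
 *
 *     kasan.bl=__kernel__:_ZN6OSKext8copyInfoEP7OSArray:memr,__kernel__:test_denylist:test
 *
 * which suppresses memory-read reports from OSKext::copyInfo and satisfies the
 * KASan denylist self-test, each entry being matched by kext name, function
 * name and access type.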
*/ char buf[256] = {}; @@ -568,24 +568,24 @@ kasan_init_dybl(void) *typestr++ = 0; type = map_type(typestr); } - add_blacklist_entry(kext, func, type); + add_denylist_entry(kext, func, type); } } - /* collect bitmask of blacklisted types */ - for (size_t j = 0; j < blacklist_entries; j++) { - struct blacklist_entry *ble = &blacklist[j]; - blacklisted_types |= ble->type_mask; + /* collect bitmask of denylisted types */ + for (size_t j = 0; j < denylist_entries; j++) { + struct denylist_entry *dle = &denylist[j]; + denylisted_types |= dle->type_mask; } /* add the fake kernel kext */ - kasan_dybl_load_kext((uintptr_t)&_mh_execute_header, "__kernel__"); + kasan_dyn_denylist_load_kext((uintptr_t)&_mh_execute_header, "__kernel__"); } -#else /* KASAN_DYNAMIC_BLACKLIST */ +#else /* KASAN_DYNAMIC_DENYLIST */ bool -kasan_is_blacklisted(access_t __unused type) +kasan_is_denylisted(access_t __unused type) { return false; } diff --git a/san/memory/kasan_internal.h b/san/memory/kasan_internal.h index 871eb6ec7..96c460811 100644 --- a/san/memory/kasan_internal.h +++ b/san/memory/kasan_internal.h @@ -72,7 +72,7 @@ typedef uintptr_t uptr; #define KASAN_STRIP_ADDR(_x) (_x) #elif KASAN_TBI #define KASAN_MODEL_STR "kasan-tbi" -#define KASAN_STRIP_ADDR(_x) (VM_KERNEL_STRIP_UPTR(_x)) +#define KASAN_STRIP_ADDR(_x) (VM_KERNEL_STRIP_PTR(_x)) #else #error "No kasan model specified" #endif /* KASAN_CLASSIC || KASAN_TBI */ @@ -165,11 +165,11 @@ bool kasan_check_enabled(access_t); bool kasan_impl_check_enabled(access_t); void kasan_check_range(const void *, size_t, access_t); -/* dynamic blacklist */ -void kasan_init_dybl(void); -bool kasan_is_blacklisted(access_t); -void kasan_dybl_load_kext(uintptr_t, const char *); -void kasan_dybl_unload_kext(uintptr_t); +/* dynamic denylist */ +void kasan_init_dyn_denylist(void); +bool kasan_is_denylisted(access_t); +void kasan_dyn_denylist_load_kext(uintptr_t, const char *); +void kasan_dyn_denylist_unload_kext(uintptr_t); /* arch-specific interface */ void kasan_arch_init(void); diff --git a/san/memory/memintrinsics.h b/san/memory/memintrinsics.h index d77272a80..e86f79f74 100644 --- a/san/memory/memintrinsics.h +++ b/san/memory/memintrinsics.h @@ -92,6 +92,46 @@ __nosan_strlen(const char *__null_terminated src) { return strlen(src); } +static inline int +__nosan_strcmp(const char *__null_terminated s1, const char *__null_terminated s2) +{ + return strcmp(s1, s2); +} +static inline int +__nosan_strncmp(const char *__unsafe_indexable s1, const char *__unsafe_indexable s2, size_t n) +{ + return strbufcmp(__unsafe_forge_bidi_indexable(const char *, s1, n), n, __unsafe_forge_bidi_indexable(const char *, s2, n), n); +} +static inline int +__nosan_strlcmp(const char *__counted_by(n)s1, const char *s2, size_t n) +{ + return strlcmp(s1, s2, n); +} +static inline int +__nosan_strbufcmp(const char *__counted_by(s1len)s1, size_t s1len, const char *__counted_by(s2len)s2, size_t s2len) +{ + return strbufcmp(s1, s1len, s2, s2len); +} +static inline int +__nosan_strcasecmp(const char *__null_terminated s1, const char *__null_terminated s2) +{ + return strcasecmp(s1, s2); +} +static inline int +__nosan_strncasecmp(const char *__unsafe_indexable s1, const char *__unsafe_indexable s2, size_t n) +{ + return strbufcasecmp(__unsafe_forge_bidi_indexable(const char *, s1, n), n, __unsafe_forge_bidi_indexable(const char *, s2, n), n); +} +static inline int +__nosan_strlcasecmp(const char *__counted_by(n)s1, const char *s2, size_t n) +{ + return strlcasecmp(s1, s2, n); +} +static inline int 
+__nosan_strbufcasecmp(const char *__counted_by(s1len)s1, size_t s1len, const char *__counted_by(s2len)s2, size_t s2len) +{ + return strbufcasecmp(s1, s1len, s2, s2len); +} #if !__has_ptrcheck && !__has_include(<__xnu_libcxx_sentinel.h>) static inline char * __nosan_strncpy(char *dst, const char *src, size_t sz) @@ -125,30 +165,80 @@ void *__sized_by(sz) __asan_memset(void * __sized_by(sz), int c, size_t sz); void *__sized_by(sz) __asan_memmove(void *dst __sized_by(sz), const void *src __sized_by(sz), size_t sz); void __asan_bcopy(const void *src __sized_by(sz), void *dst __sized_by(sz), size_t sz); void __asan_bzero(void *dst __sized_by(sz), size_t sz); -int __asan_bcmp(const void *a __sized_by(sz), const void *b __sized_by(sz), size_t sz); -int __asan_memcmp(const void *a __sized_by(sz), const void *b __sized_by(sz), size_t sz); +int __asan_bcmp(const void *a __sized_by(sz), const void *b __sized_by(sz), size_t sz) __stateful_pure; +int __asan_memcmp(const void *a __sized_by(sz), const void *b __sized_by(sz), size_t sz) __stateful_pure; size_t __asan_strlcpy(char *__sized_by(sz) dst, const char *__null_terminated src, size_t sz); char *__asan_strncpy(char *dst, const char *src, size_t sz); char *__asan_strncat(char *dst, const char *src, size_t sz); size_t __asan_strlcat(char *__sized_by(sz) dst, const char *__null_terminated src, size_t sz); -size_t __asan_strnlen(const char *__null_terminated src, size_t sz); -size_t __asan_strlen(const char *__null_terminated src); +size_t __asan_strnlen(const char *__counted_by(n)s, size_t n) __stateful_pure; +size_t __asan_strlen(const char *__null_terminated src) __stateful_pure; +int __asan_strcmp(const char *__null_terminated s1, const char *__null_terminated s2) __stateful_pure; +__ptrcheck_unavailable_r("strlcmp or strbufcmp") +int __asan_strncmp(const char *__null_terminated s1, const char *__null_terminated s2, size_t n) __stateful_pure; +int __asan_strlcmp(const char *__counted_by(n)s1, const char *s2, size_t n) __stateful_pure; +int __asan_strbufcmp(const char *__counted_by(s1len)s1, size_t s1len, const char *__counted_by(s2len)s2, size_t s2len) __stateful_pure; +int __asan_strcasecmp(const char *__null_terminated s1, const char *__null_terminated s2) __stateful_pure; +__ptrcheck_unavailable_r("strlcasecmp or strbufcasecmp") +int __asan_strncasecmp(const char *__unsafe_indexable s1, const char *__unsafe_indexable s2, size_t n) __stateful_pure; +int __asan_strlcasecmp(const char *__counted_by(n)s1, const char *s2, size_t n) __stateful_pure; +int __asan_strbufcasecmp(const char *__counted_by(s1len)s1, size_t s1len, const char *__counted_by(s2len)s2, size_t s2len) __stateful_pure; -#define memcpy __asan_memcpy -#define memmove __asan_memmove -#define memset __asan_memset -#define bcopy __asan_bcopy -#define bzero __asan_bzero -#define bcmp __asan_bcmp -#define memcmp __asan_memcmp +#define memcpy __asan_memcpy +#define memmove __asan_memmove +#define memset __asan_memset +#define bcopy __asan_bcopy +#define bzero __asan_bzero +#define bcmp __asan_bcmp +#define memcmp __asan_memcmp -#define strlcpy __asan_strlcpy -#define strncpy __asan_strncpy -#define strlcat __asan_strlcat -#define strncat __asan_strncat -// #define strnlen __asan_strnlen -// #define strlen __asan_strlen +#define strlcpy __asan_strlcpy +#define strncpy __asan_strncpy +#define strlcat __asan_strlcat +#define strncat __asan_strncat +#define strnlen __asan_strnlen +#define strlen __asan_strlen +#define strcmp __asan_strcmp +#define strncmp __asan_strncmp +#define strlcmp 
__asan_strlcmp +#define strcasecmp __asan_strcasecmp +#define strncasecmp __asan_strncasecmp +#define strlcasecmp __asan_strlcasecmp + +// Previously defined as macros in string.h +#undef strbuflen_1 +#undef strbuflen_2 +#undef strbuflen +#undef strbufcmp_2 +#undef strbufcmp_4 +#undef strbufcmp +#undef strbufcasecmp_2 +#undef strbufcasecmp_4 +#undef strbufcasecmp + +#define strbuflen_1(BUF) ({ \ + __xnu_assert_is_array(BUF, "argument is not an array"); \ + __asan_strnlen((BUF), sizeof(BUF)); \ +}) +#define strbuflen_2(BUF, LEN) __asan_strnlen(BUF, LEN) +#define strbuflen(...) __xnu_argc_overload(strbuflen, __VA_ARGS__) + +#define strbufcmp_2(A, B) ({ \ + __xnu_assert_is_array(A, "first argument is not an array"); \ + __xnu_assert_is_array(B, "second argument is not an array"); \ + (__asan_strbufcmp)((A), sizeof(A), (B), sizeof(B)); \ +}) +#define strbufcmp_4 (__asan_strbufcmp) +#define strbufcmp(...) __xnu_argc_overload(strbufcmp, __VA_ARGS__) + +#define strbufcasecmp_2(A, B) ({ \ + __xnu_assert_is_array(A, "first argument is not an array"); \ + __xnu_assert_is_array(B, "second argument is not an array"); \ + (__asan_strbufcasecmp)((A), sizeof(A), (B), sizeof(B)); \ +}) +#define strbufcasecmp_4 (__asan_strbufcasecmp) +#define strbufcasecmp(...) __xnu_argc_overload(strbufcasecmp, __VA_ARGS__) #endif diff --git a/san/memory/ubsan-blacklist b/san/memory/ubsan-denylist similarity index 100% rename from san/memory/ubsan-blacklist rename to san/memory/ubsan-denylist diff --git a/san/tools/generate_dynamic_blacklist.py b/san/tools/generate_dynamic_denylist.py similarity index 87% rename from san/tools/generate_dynamic_blacklist.py rename to san/tools/generate_dynamic_denylist.py index 2b155c9a7..a642499ae 100755 --- a/san/tools/generate_dynamic_blacklist.py +++ b/san/tools/generate_dynamic_denylist.py @@ -9,7 +9,7 @@ def type_map(x): def print_preamble(): - print(r'struct blacklist_entry blacklist[] = {') + print(r'struct denylist_entry denylist[] = {') def print_entry(kext, func, type): @@ -36,8 +36,8 @@ def print_postamble(nentries, extra_entries): print("};\n") - print('static size_t blacklist_entries = {};'.format(nentries)) - print('static const size_t blacklist_max_entries = {};'.format( + print('static size_t denylist_entries = {};'.format(nentries)) + print('static const size_t denylist_max_entries = {};'.format( nentries + extra_entries)) diff --git a/san/tools/ksancov.c b/san/tools/ksancov.c index cdfdb5a41..cbff0bd29 100644 --- a/san/tools/ksancov.c +++ b/san/tools/ksancov.c @@ -49,6 +49,7 @@ usage(void) " -t | --trace use trace (PC log) mode [default]\n" " -s | --stksize use trace (PC log) with stack size mode\n" " -c | --counters use edge counter mode\n" + " -p | --cmptrace use trace (CMP log) mode\n" " -n | --entries override max entries in trace log\n" " -x | --exec instrument execution of binary at \n" " -b | --bundle bundle for on-demand tracing\n"); @@ -66,6 +67,11 @@ typedef struct ksancov_state { ksancov_trace_t *ks_trace; ksancov_counters_t *ks_counters; }; + ksancov_cmps_mode_t ks_cmps_mode; + union { + ksancov_header_t *ks_cmps_header; + ksancov_trace_t *ks_cmps_trace; + }; } ksancov_state_t; /* @@ -94,12 +100,35 @@ ksancov_set_mode(int fd, ksancov_mode_t mode, int max_entries) return ret; } +/* + * Configures ksancov device for selected comparison mode. 
+ */ +static int +ksancov_cmps_set_mode(int fd, ksancov_cmps_mode_t mode, int max_entries) +{ + int ret = 0; + + switch (mode) { + case KS_CMPS_MODE_TRACE: + ret = ksancov_cmps_mode_trace(fd, max_entries, false); + break; + case KS_CMPS_MODE_TRACE_FUNC: + ret = ksancov_cmps_mode_trace(fd, max_entries, true); + break; + default: + perror("ksancov unsupported cmps mode\n"); + return ENOTSUP; + } + + return ret; +} + /* * Initialize coverage state from provided options. Shared mappings with kernel are established * here. */ static int -ksancov_init_state(int fd, ksancov_mode_t mode, int max_entries, ksancov_state_t *state) +ksancov_init_state(int fd, ksancov_mode_t mode, ksancov_cmps_mode_t cmps_mode, int max_entries, ksancov_state_t *state) { uintptr_t addr; size_t sz; @@ -139,6 +168,33 @@ ksancov_init_state(int fd, ksancov_mode_t mode, int max_entries, ksancov_state_t fprintf(stderr, "maxpcs = %lu\n", ksancov_trace_max_ent(state->ks_trace)); } + if (cmps_mode == KS_CMPS_MODE_NONE) { + state->ks_cmps_mode = cmps_mode; + state->ks_cmps_header = NULL; + return ret; + } + + /* Setup selected comparison tracing mode. */ + ret = ksancov_cmps_set_mode(fd, cmps_mode, max_entries); + if (ret) { + perror("ksancov cmps set mode\n"); + return ret; + } + + /* Map buffer for selected mode into process address space. */ + ret = ksancov_cmps_map(fd, &addr, &sz); + if (ret) { + perror("ksancov cmps map"); + return ret; + } + fprintf(stderr, "cmps mapped to 0x%lx + %lu\n", addr, sz); + + /* Finalize state members. */ + state->ks_cmps_mode = cmps_mode; + state->ks_cmps_header = (void *)addr; + + fprintf(stderr, "maxcmps = %lu\n", ksancov_trace_max_ent(state->ks_cmps_trace)); + return ret; } @@ -167,6 +223,31 @@ ksancov_print_state(ksancov_state_t *state) } } + if (state->ks_cmps_mode == KS_CMPS_MODE_TRACE || state->ks_cmps_mode == KS_CMPS_MODE_TRACE_FUNC) { + static const char *type_map[KCOV_CMP_SIZE8 + 1] = { + "8 bits", NULL, "16 bits", NULL, "32 bits", + NULL, "64 bits" + }; + + size_t head = ksancov_trace_head(state->ks_cmps_trace); + fprintf(stderr, "cmps head = %lu\n", head); + + for (uint32_t i = 0; i < head;) { + ksancov_cmps_trace_ent_t *entry = ksancov_cmps_trace_entry(state->ks_cmps_trace, i); + if (KCOV_CMP_IS_FUNC(entry->type)) { + size_t space = ksancov_cmps_trace_func_space(entry->len1_func, entry->len2_func); + i += space / sizeof(ksancov_cmps_trace_ent_t); + fprintf(stderr, "0x%llx [func %u %u] '%s' '%s'\n", entry->pc, entry->len1_func, entry->len2_func, + ksancov_cmps_trace_func_arg1(entry), + ksancov_cmps_trace_func_arg2(entry)); + } else { + uint64_t type = entry->type & KCOV_CMP_SIZE_MASK; + fprintf(stderr, "0x%llx [%s] 0x%llx 0x%llx\n", entry->pc, type_map[type], entry->args[0], entry->args[1]); + ++i; + } + } + } + return 0; } @@ -189,7 +270,8 @@ int main(int argc, char *argv[]) { ksancov_mode_t ksan_mode = KS_MODE_NONE; - ksancov_state_t ksan_state; + ksancov_cmps_mode_t ksan_cmps_mode = KS_CMPS_MODE_NONE; + ksancov_state_t ksan_state = {0}; int ret; size_t max_entries = 64UL * 1024; @@ -203,6 +285,7 @@ main(int argc, char *argv[]) { "trace", no_argument, NULL, 't' }, { "counters", no_argument, NULL, 'c' }, { "stksize", no_argument, NULL, 's' }, + { "cmptrace", no_argument, NULL, 'p' }, { "bundle", required_argument, NULL, 'b' }, @@ -210,7 +293,7 @@ main(int argc, char *argv[]) }; int ch; - while ((ch = getopt_long(argc, argv, "tsn:x:cb:", opts, NULL)) != -1) { + while ((ch = getopt_long(argc, argv, "tsn:x:cpb:", opts, NULL)) != -1) { switch (ch) { case 'n': max_entries = 
strtoul(optarg, NULL, 0); @@ -227,6 +310,9 @@ main(int argc, char *argv[]) case 's': ksan_mode = KS_MODE_STKSIZE; break; + case 'p': + ksan_cmps_mode = KS_CMPS_MODE_TRACE_FUNC; + break; case 'b': od_bundle = optarg; break; @@ -243,7 +329,7 @@ main(int argc, char *argv[]) fprintf(stderr, "opened ksancov on fd %i\n", fd); /* Initialize ksancov state. */ - ret = ksancov_init_state(fd, ksan_mode, max_entries, &ksan_state); + ret = ksancov_init_state(fd, ksan_mode, ksan_cmps_mode, max_entries, &ksan_state); if (ret) { perror("ksancov init\n"); return ret; @@ -264,6 +350,10 @@ main(int argc, char *argv[]) ksancov_on_demand_set_enabled(fd, od_bundle, true); ksancov_reset(ksan_state.ks_header); ksancov_start(ksan_state.ks_header); + if (ksan_state.ks_cmps_header) { + ksancov_reset(ksan_state.ks_cmps_header); + ksancov_start(ksan_state.ks_cmps_header); + } ret = execl(path, path, 0); perror("execl"); ksancov_on_demand_set_enabled(fd, od_bundle, false); @@ -273,6 +363,9 @@ main(int argc, char *argv[]) /* parent */ waitpid(pid, NULL, 0); ksancov_stop(ksan_state.ks_header); + if (ksan_state.ks_cmps_header) { + ksancov_stop(ksan_state.ks_cmps_header); + } ksancov_on_demand_set_enabled(fd, od_bundle, false); } } else { @@ -285,8 +378,15 @@ main(int argc, char *argv[]) ksancov_on_demand_set_enabled(fd, od_bundle, true); ksancov_reset(ksan_state.ks_header); ksancov_start(ksan_state.ks_header); + if (ksan_state.ks_cmps_header) { + ksancov_reset(ksan_state.ks_cmps_header); + ksancov_start(ksan_state.ks_cmps_header); + } int ppid = getppid(); ksancov_stop(ksan_state.ks_header); + if (ksan_state.ks_cmps_header) { + ksancov_stop(ksan_state.ks_cmps_header); + } ksancov_on_demand_set_enabled(fd, od_bundle, false); fprintf(stderr, "ppid = %i\n", ppid); } diff --git a/san/tools/ksancov.h b/san/tools/ksancov.h index 50fbe67dc..fb816e3d3 100644 --- a/san/tools/ksancov.h +++ b/san/tools/ksancov.h @@ -41,33 +41,42 @@ #define KSANCOV_PATH "/dev/" KSANCOV_DEVNODE /* Set mode */ -#define KSANCOV_IOC_TRACE _IOW('K', 1, size_t) /* number of pcs */ -#define KSANCOV_IOC_COUNTERS _IO('K', 2) -#define KSANCOV_IOC_STKSIZE _IOW('K', 3, size_t) /* number of pcs */ +#define KSANCOV_IOC_TRACE _IOW('K', 1, size_t) /* number of pcs */ +#define KSANCOV_IOC_COUNTERS _IO('K', 2) +#define KSANCOV_IOC_STKSIZE _IOW('K', 3, size_t) /* number of pcs */ /* Establish a shared mapping of the coverage buffer. */ -#define KSANCOV_IOC_MAP _IOWR('K', 8, struct ksancov_buf_desc) +#define KSANCOV_IOC_MAP _IOWR('K', 8, struct ksancov_buf_desc) /* Establish a shared mapping of the edge address buffer. */ -#define KSANCOV_IOC_MAP_EDGEMAP _IOWR('K', 9, struct ksancov_buf_desc) +#define KSANCOV_IOC_MAP_EDGEMAP _IOWR('K', 9, struct ksancov_buf_desc) /* Log the current thread */ -#define KSANCOV_IOC_START _IOW('K', 10, uintptr_t) -#define KSANCOV_IOC_NEDGES _IOR('K', 50, size_t) +#define KSANCOV_IOC_START _IOW('K', 10, uintptr_t) +#define KSANCOV_IOC_NEDGES _IOR('K', 50, size_t) +#define KSANCOV_IOC_TESTPANIC _IOW('K', 20, uint64_t) -/* kext-related operations */ -#define KSANCOV_IOC_ON_DEMAND _IOWR('K', 60, struct ksancov_on_demand_msg) +/* Operations related to on-demand instrumentation */ +#define KSANCOV_IOC_ON_DEMAND _IOWR('K', 60, struct ksancov_on_demand_msg) + +/* Set comparison log mode */ +#define KSANCOV_IOC_CMPS_TRACE _IOW('K', 70, size_t) /* number of cmps */ +#define KSANCOV_IOC_CMPS_TRACE_FUNC _IOW('K', 71, size_t) /* number of cmps */ + +/* Establish a shared mapping of the comparisons buffer. 
*/ +#define KSANCOV_IOC_CMPS_MAP _IOWR('K', 90, struct ksancov_buf_desc) /* * shared kernel-user mapping */ -#define KSANCOV_MAX_EDGES (1 << 24) -#define KSANCOV_MAX_HITS UINT8_MAX -#define KSANCOV_TRACE_MAGIC (uint32_t)0x5AD17F5BU -#define KSANCOV_COUNTERS_MAGIC (uint32_t)0x5AD27F6BU -#define KSANCOV_EDGEMAP_MAGIC (uint32_t)0x5AD37F7BU -#define KSANCOV_STKSIZE_MAGIC (uint32_t)0x5AD47F8BU +#define KSANCOV_MAX_EDGES (1 << 24) +#define KSANCOV_MAX_HITS UINT8_MAX +#define KSANCOV_TRACE_MAGIC (uint32_t)0x5AD17F5BU +#define KSANCOV_COUNTERS_MAGIC (uint32_t)0x5AD27F6BU +#define KSANCOV_EDGEMAP_MAGIC (uint32_t)0x5AD37F7BU +#define KSANCOV_STKSIZE_MAGIC (uint32_t)0x5AD47F8BU +#define KSANCOV_CMPS_TRACE_MAGIC (uint32_t)0x5AD47F9BU /* * ioctl @@ -138,6 +147,73 @@ typedef struct ksancov_edgemap { uintptr_t ke_addrs[]; /* address of each edge relative to 'offset' */ } ksancov_edgemap_t; +/* + * Supported comparison logging modes. + */ +typedef enum { + KS_CMPS_MODE_NONE, + KS_CMPS_MODE_TRACE, + KS_CMPS_MODE_TRACE_FUNC, + KS_CMPS_MODE_MAX +} ksancov_cmps_mode_t; + +/* Comparison type values */ +enum { + KCOV_CMP_CONST = 1, + KCOV_CMP_SIZE1 = 0, + KCOV_CMP_SIZE2 = 2, + KCOV_CMP_SIZE4 = 4, + KCOV_CMP_SIZE8 = 6, + + KCOV_CMP_FUNC_MEMCMP = 32, + KCOV_CMP_FUNC_STRCMP = 34, + KCOV_CMP_FUNC_STRNCMP = 36, + KCOV_CMP_FUNC_STRBUFCMP = 38, +}; + +#define KCOV_CMP_SIZE_MASK 0xfffffff6 +#define KCOV_CMP_IS_FUNC(type) (type >= KCOV_CMP_FUNC_MEMCMP) + +/* CMPS TRACE mode tracks comparison values */ +typedef struct __attribute__((__packed__)) ksancov_cmps_trace_entry { + uint64_t pc; + uint32_t type; + uint16_t len1_func; + uint16_t len2_func; + union { + uint64_t args[2]; /* cmp instruction arguments */ + uint8_t args_func[0]; /* cmp function arguments (variadic) */ + }; +} ksancov_cmps_trace_ent_t; + +/* Calculate the total space that a ksancov_cmps_trace_ent_t tracing a function takes */ +static inline size_t +ksancov_cmps_trace_func_space(size_t len1_func, size_t len2_func) +{ + static_assert(sizeof(ksancov_cmps_trace_ent_t) == sizeof(uint64_t) * 3 + sizeof(uint32_t) + sizeof(uint16_t) * 2, "ksancov_cmps_trace_ent_t invalid size"); + + size_t size = sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint16_t) * 2; // header + size += len1_func + len2_func; + size_t rem = size % sizeof(ksancov_cmps_trace_ent_t); + if (rem == 0) { + return size; + } + return size + sizeof(ksancov_cmps_trace_ent_t) - rem; +} + +static inline uint8_t * +ksancov_cmps_trace_func_arg1(ksancov_cmps_trace_ent_t *entry) +{ + return entry->args_func; +} + +static inline uint8_t * +ksancov_cmps_trace_func_arg2(ksancov_cmps_trace_ent_t *entry) +{ + uint8_t* func_args = entry->args_func; + return &func_args[entry->len1_func]; +} + /* * On-demand related functionalities */ @@ -182,6 +258,8 @@ ksancov_map(int fd, uintptr_t *buf, size_t *sz) int ret; struct ksancov_buf_desc mc = {0}; + assert(buf != NULL); + ret = ioctl(fd, KSANCOV_IOC_MAP, &mc); if (ret == -1) { return errno; @@ -206,6 +284,8 @@ ksancov_map_edgemap(int fd, uintptr_t *buf, size_t *sz) int ret; struct ksancov_buf_desc mc = {0}; + assert(buf != NULL); + ret = ioctl(fd, KSANCOV_IOC_MAP_EDGEMAP, &mc); if (ret == -1) { return errno; @@ -298,7 +378,7 @@ static inline int ksancov_reset(void *buf) { ksancov_header_t *hdr = (ksancov_header_t *)buf; - if (hdr->kh_magic == KSANCOV_TRACE_MAGIC || hdr->kh_magic == KSANCOV_STKSIZE_MAGIC) { + if (hdr->kh_magic == KSANCOV_TRACE_MAGIC || hdr->kh_magic == KSANCOV_STKSIZE_MAGIC || hdr->kh_magic == KSANCOV_CMPS_TRACE_MAGIC) { ksancov_trace_t *trace = 
(ksancov_trace_t *)buf; atomic_store_explicit(&trace->kt_head, 0, memory_order_relaxed); } else if (hdr->kh_magic == KSANCOV_COUNTERS_MAGIC) { @@ -375,6 +455,58 @@ ksancov_stksize_size(ksancov_trace_t *trace, size_t i) return entries[i].stksize; } +static inline int +ksancov_cmps_mode_trace(int fd, size_t entries, bool trace_functions) +{ + int ret; + if (trace_functions) { + ret = ioctl(fd, KSANCOV_IOC_CMPS_TRACE_FUNC, &entries); + } else { + ret = ioctl(fd, KSANCOV_IOC_CMPS_TRACE, &entries); + } + if (ret == -1) { + return errno; + } + return 0; +} + +static inline int +ksancov_cmps_map(int fd, uintptr_t *buf, size_t *sz) +{ + int ret; + struct ksancov_buf_desc mc = {0}; + + assert(buf != NULL); + + ret = ioctl(fd, KSANCOV_IOC_CMPS_MAP, &mc); + if (ret == -1) { + return errno; + } + + *buf = mc.ptr; + if (sz) { + *sz = mc.sz; + } + + ksancov_header_t *hdr = (ksancov_header_t *)mc.ptr; + assert(hdr->kh_magic == KSANCOV_CMPS_TRACE_MAGIC); + + return 0; +} + +static inline ksancov_cmps_trace_ent_t* +ksancov_cmps_trace_entry(ksancov_trace_t *trace, size_t i) +{ + assert(trace); + assert(trace->kt_hdr.kh_magic == KSANCOV_CMPS_TRACE_MAGIC); + if (i >= trace->kt_head) { + return NULL; + } + + ksancov_cmps_trace_ent_t *entries = (ksancov_cmps_trace_ent_t *)trace->kt_entries; + return &entries[i]; +} + /* * On-demand control API */ diff --git a/san/tools/validate_blacklist.sh b/san/tools/validate_blacklist.sh deleted file mode 100755 index 8045241ed..000000000 --- a/san/tools/validate_blacklist.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -# Ensure all blacklisted files exist. Paths with wildcards are ignored. -# Run against a blacklist with fully-qualified paths. - -IFS=$'\n' - -blacklist_files=`sed -n -e ' - # ignore paths with wildcards - /\*/ d - - # strip leading 'src:' - /^src/ { - s/^src:// - p - } -' $1` - -ret=0 - -for f in $blacklist_files ; do - if ! [[ -e $f ]] ; then - echo "KASan: blacklisted file $f not found" >&2 - ret=1 - fi -done - -exit $ret diff --git a/san/tools/validate_denylist.sh b/san/tools/validate_denylist.sh new file mode 100755 index 000000000..366168051 --- /dev/null +++ b/san/tools/validate_denylist.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Ensure all denylisted files exist. Paths with wildcards are ignored. +# Run against a denylist with fully-qualified paths. + +IFS=$'\n' + +denylist_files=`sed -n -e ' + # ignore paths with wildcards + /\*/ d + + # strip leading 'src:' + /^src/ { + s/^src:// + p + } +' $1` + +ret=0 + +for f in $denylist_files ; do + if ! 
[[ -e $f ]] ; then + echo "KASan: denylisted file $f not found" >&2 + ret=1 + fi +done + +exit $ret diff --git a/security/conf/Makefile.template b/security/conf/Makefile.template index 67b610631..34f0df59e 100644 --- a/security/conf/Makefile.template +++ b/security/conf/Makefile.template @@ -92,6 +92,10 @@ $(COMPONENT).filelist: $(OBJS) $(ECHO) $(TARGET)/$(CURRENT_KERNEL_CONFIG)/$${obj}; \ done > $(COMPONENT).filelist +ifeq ($(XNU_LibAllFiles),1) +LIBOBJS := $(OBJS) +endif + $(COMPONENT).libfilelist: $(LIBOBJS) @$(LOG_LDFILELIST) "lib$(COMPONENT)" $(_v)for obj in ${LIBOBJS}; do \ diff --git a/security/mac_audit.c b/security/mac_audit.c index b67d0b9f6..3579d1738 100644 --- a/security/mac_audit.c +++ b/security/mac_audit.c @@ -76,7 +76,7 @@ #if CONFIG_AUDIT ZONE_DEFINE(mac_audit_data_zone, "mac_audit_data_zone", - MAC_AUDIT_DATA_LIMIT, ZC_PGZ_USE_GUARDS); + MAC_AUDIT_DATA_LIMIT, ZC_NONE); int mac_system_check_audit(struct ucred *cred, void *record, int length) diff --git a/security/mac_framework.h b/security/mac_framework.h index c71df639e..07c89fb9f 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -241,6 +241,7 @@ int mac_mount_check_snapshot_mount(vfs_context_t ctx, struct vnode *rvp, int mac_mount_check_snapshot_revert(vfs_context_t ctx, struct mount *mp, const char *name) __result_use_check; int mac_mount_check_remount(vfs_context_t ctx, struct mount *mp, int flags) __result_use_check; +int mac_mount_check_remount_with_flags(vfs_context_t ctx, struct mount *mp, int *flagsp) __result_use_check; int mac_mount_check_setattr(vfs_context_t ctx, struct mount *mp, struct vfs_attr *vfa) __result_use_check; int mac_mount_check_stat(vfs_context_t ctx, struct mount *mp) __result_use_check; @@ -317,6 +318,7 @@ int mac_proc_check_get_task(kauth_cred_t cred, proc_ident_t pident, mach_tas int mac_proc_check_expose_task(kauth_cred_t cred, proc_ident_t pident, mach_task_flavor_t flavor) __result_use_check; int mac_proc_check_get_movable_control_port(void) __result_use_check; int mac_proc_check_inherit_ipc_ports(struct proc *p, struct vnode *cur_vp, off_t cur_offset, struct vnode *img_vp, off_t img_offset, struct vnode *scriptvp) __result_use_check; +int mac_proc_check_iopolicysys(struct proc *p, kauth_cred_t cred, int cmd, int type, int scope, int policy) __result_use_check; int mac_proc_check_getaudit(proc_t proc) __result_use_check; int mac_proc_check_getauid(proc_t proc) __result_use_check; int mac_proc_check_dyld_process_info_notify_register(void) __result_use_check; @@ -593,6 +595,11 @@ int mac_skywalk_flow_check_listen(proc_t p, void *flow, const struct sockadd void mac_vnode_notify_reclaim(vnode_t vp); void mac_vnode_notify_unlink(vfs_context_t ctx, struct vnode *dvp, struct vnode *vp, struct componentname *cnp); +int mac_vnode_check_dataprotect_set(vfs_context_t ctx, struct vnode *vp, uint32_t *dataprotect_class) __result_use_check; +int mac_graft_check_graft(vfs_context_t ctx, struct vnode *graft_dir_vp) __result_use_check; +int mac_graft_check_ungraft(vfs_context_t ctx, struct vnode *graft_dir_vp) __result_use_check; +void mac_graft_notify_graft(vfs_context_t ctx, struct vnode *graft_dir_vp); +void mac_graft_notify_ungraft(vfs_context_t ctx, struct vnode *graft_dir_vp); void psem_label_associate(struct fileproc *fp, struct vnode *vp, struct vfs_context *ctx); void pshm_label_associate(struct fileproc *fp, struct vnode *vp, struct vfs_context *ctx); diff --git a/security/mac_label.c b/security/mac_label.c index 79b1540f4..2d1693d6b 100644 --- a/security/mac_label.c +++ 
b/security/mac_label.c @@ -128,7 +128,7 @@ mac_labelzone_free_owned(struct label **labelp, extra_deinit(label); } - *labelp = NULL; + os_atomic_store(labelp, NULL, release); mac_labelzone_free(label); } } @@ -144,7 +144,7 @@ mac_label_verify_panic(struct label **labelp) struct label * mac_label_verify(struct label **labelp) { - struct label *label = *labelp; + struct label *label = os_atomic_load(labelp, acquire); if (label != NULL) { zone_require_ro(ZONE_ID_MAC_LABEL, sizeof(struct label), label); diff --git a/security/mac_mach.c b/security/mac_mach.c index af645d1f0..f8f7d5c14 100644 --- a/security/mac_mach.c +++ b/security/mac_mach.c @@ -87,7 +87,7 @@ mac_task_check_expose_task(struct task *task, mach_task_flavor_t flavor) if (p == NULL) { return ESRCH; } - struct proc_ident pident = proc_ident(p); + struct proc_ident pident = proc_ident_with_policy(p, IDENT_VALIDATION_PROC_EXACT); struct ucred *cred = kauth_cred_get(); proc_rele(p); @@ -113,7 +113,7 @@ mac_task_check_task_id_token_get_task(struct task *task, mach_task_flavor_t flav if (target_proc == NULL) { return ESRCH; } - pident = proc_ident(target_proc); + pident = proc_ident_with_policy(target_proc, IDENT_VALIDATION_PROC_EXACT); pidentp = &pident; proc_rele(target_proc); } @@ -179,7 +179,7 @@ mac_task_check_get_task_special_port(struct task *task, struct task *target, int if (target_proc == NULL) { return ESRCH; } - pident = proc_ident(target_proc); + pident = proc_ident_with_policy(target_proc, IDENT_VALIDATION_PROC_EXACT); pidentp = &pident; proc_rele(target_proc); } @@ -209,7 +209,7 @@ mac_task_check_set_task_special_port(struct task *task, struct task *target, int return ESRCH; } - struct proc_ident pident = proc_ident(targetp); + struct proc_ident pident = proc_ident_with_policy(targetp, IDENT_VALIDATION_PROC_EXACT); proc_rele(targetp); MAC_CHECK(proc_check_set_task_special_port, @@ -238,7 +238,7 @@ mac_task_check_set_task_exception_ports(struct task *task, struct task *target, return ESRCH; } - struct proc_ident pident = proc_ident(targetp); + struct proc_ident pident = proc_ident_with_policy(targetp, IDENT_VALIDATION_PROC_EXACT); proc_rele(targetp); for (exception = FIRST_EXCEPTION; exception < EXC_TYPES_COUNT; exception++) { @@ -274,7 +274,7 @@ mac_task_check_set_thread_exception_ports(struct task *task, struct task *target return ESRCH; } - struct proc_ident pident = proc_ident(targetp); + struct proc_ident pident = proc_ident_with_policy(targetp, IDENT_VALIDATION_PROC_EXACT); proc_rele(targetp); for (exception = FIRST_EXCEPTION; exception < EXC_TYPES_COUNT; exception++) { diff --git a/security/mac_mach_internal.h b/security/mac_mach_internal.h index 17995cc21..460c5bdca 100644 --- a/security/mac_mach_internal.h +++ b/security/mac_mach_internal.h @@ -62,15 +62,14 @@ #include /* mac_do_machexc() flags */ -#define MAC_DOEXCF_TRACED 0x01 /* Only do mach exeception if - being ptrace()'ed */ +#define MAC_DOEXCF_TRACED 0x01 /* Only do mach exeception if being ptrace()'ed */ struct exception_action; struct proc; struct uthread; struct task; -int mac_do_machexc(int64_t code, int64_t subcode, uint32_t flags __unused); -int mac_schedule_userret(void); +int mac_do_machexc(int64_t code, int64_t subcode, uint32_t flags __unused); +int mac_schedule_userret(void); /* telemetry */ int mac_schedule_telemetry(void); diff --git a/security/mac_policy.h b/security/mac_policy.h index 434ef509a..7fc88c1b3 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -126,7 +126,15 @@ struct sockaddr; * Should be removed once all dependent 
parties adopt * proc_ident_t. */ -#define MAC_PROC_IDENT_SUPPORT +#define MAC_PROC_IDENT_SUPPORT 1 + +/* + * rdar://146696727 + * + * Support for opaque lookup policy on proc_ident + * when using proc_find_ident + */ +#define MAC_PROC_IDENT_POLICY_SUPPORT 1 #ifndef _KAUTH_CRED_T #define _KAUTH_CRED_T @@ -1411,6 +1419,27 @@ typedef int mpo_mount_check_remount_t( struct label *mlabel, uint64_t flags ); +/** + * @brief Access control check for remounting a filesystem with modifiable flags + * @param cred Subject credential + * @param mp The mount point + * @param mlabel Label currently associated with the mount point + * @param flagsp A pointer to requested update flags. This can be modified by the function + * to reflect changes in the operation flags. + * + * This function is a variant of mpo_mount_check_remount_t, allowing + * the caller to specify and potentially overwrite the flags via a + * pointer to an integer. + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. + */ +typedef int mpo_mount_check_remount_with_flags_t( + kauth_cred_t cred, + struct mount *mp, + struct label *mlabel, + int *flagsp + ); /** * @brief Access control check for the settting of file system attributes * @param cred Subject credential @@ -3745,6 +3774,30 @@ typedef int mpo_proc_check_inherit_ipc_ports_t( struct vnode *scriptvp ); +/** + * @brief Access control check for iopolicysys + * @param p current process instance + * @param cred Subject credential + * @param type Type of I/O policy (e.g. IOPOL_TYPE_DISK) + * @param cmd Command into I/O policy (e.g. IOPOL_CMD_GET or IOPOL_CMD_SET) + * @param scope Scope of the I/O policy (e.g IOPOL_SCOPE_PROCESS) + * @param policy Priority of the I/O policy (e.g. IOPOL_IMPORTANT) + * + * Determine whether the subject identified by the credential can perform + * the I/O policy type within the scope. + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. + */ +typedef int mpo_proc_check_iopolicysys_t( + struct proc *p, + kauth_cred_t cred, + int cmd, + int type, + int scope, + int policy + ); + /** * @brief Privilege check for a process to run invalid * @param p Object process @@ -4911,6 +4964,23 @@ typedef int mpo_vnode_check_swap_t( struct vnode *v2, struct label *vl2 ); +/** + * @brief Access control and clamping for changing dataprotection class of a vnode. + * @param cred User credential for process changing dataprotection class + * @param vp the vnode that is being changed + * @param dataprotect_class a pointer to the desired new dataprotection class + * + * The hook may override the requested data protection class by altering the + * value referenced by dataprotect_class. + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. + */ +typedef int mpo_vnode_check_dataprotect_set_t( + kauth_cred_t cred, + struct vnode *vp, + uint32_t *dataprotect_class + ); /** * @brief Access control check for vnode trigger resolution * @param cred Subject credential @@ -5826,6 +5896,62 @@ typedef void mpo_vnode_notify_unlink_t( struct componentname *cnp ); +/** + * @brief Access control check for grafting a Cryptex + * @param cred Subject credential + * @param graft_dir_vp Vnode that is to be the graft point + * + * Determine whether the subject identified by the credential can perform + * the graft operation on the target vnode. 
+ * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. + */ +typedef int mpo_graft_check_graft_t( + kauth_cred_t cred, + struct vnode *graft_dir_vp + ); + +/** + * @brief Access control check for ungrafting a Cryptex + * @param cred Subject credential + * @param graft_dir_vp Vnode of graft point to be ungrafted + * + * Determine whether the subject identified by the credential can perform + * the ungraft operation on the target vnode. + * + * @return Return 0 if access is granted, otherwise an appropriate value for + * errno should be returned. + */ +typedef int mpo_graft_check_ungraft_t( + kauth_cred_t cred, + struct vnode *graft_dir_vp + ); + +/** + * @brief Notify on successful Cryptex graft + * @param cred Subject credential + * @param graft_dir_vp Vnode of graft point + * + * Notify on successful Cryptex graft. + */ +typedef void mpo_graft_notify_graft_t( + kauth_cred_t cred, + struct vnode *graft_dir_vp + ); + +/** + * @brief Notify on successful Cryptex ungraft + * @param cred Subject credential + * @param graft_dir_vp Vnode of graft point + * + * Notify on successful Cryptex ungraft. + */ +typedef void mpo_graft_notify_ungraft_t( + kauth_cred_t cred, + struct vnode *graft_dir_vp + ); + /* * Placeholder for future events that may need mac hooks. */ @@ -5837,15 +5963,15 @@ typedef void mpo_reserved_hook_t(void); * Please note that this should be kept in sync with the check assumptions * policy in bsd/kern/policy_check.c (policy_ops struct). */ -#define MAC_POLICY_OPS_VERSION 87 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 91 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; - mpo_reserved_hook_t *mpo_reserved01; - mpo_reserved_hook_t *mpo_reserved02; - mpo_reserved_hook_t *mpo_reserved03; - mpo_reserved_hook_t *mpo_reserved04; + mpo_graft_check_graft_t *mpo_graft_check_graft; + mpo_graft_check_ungraft_t *mpo_graft_check_ungraft; + mpo_graft_notify_graft_t *mpo_graft_notify_graft; + mpo_graft_notify_ungraft_t *mpo_graft_notify_ungraft; mpo_cred_check_label_update_execve_t *mpo_cred_check_label_update_execve; mpo_cred_check_label_update_t *mpo_cred_check_label_update; @@ -5929,8 +6055,8 @@ struct mac_policy_ops { mpo_vnode_notify_swap_t *mpo_vnode_notify_swap; mpo_vnode_notify_unlink_t *mpo_vnode_notify_unlink; mpo_vnode_check_swap_t *mpo_vnode_check_swap; - mpo_reserved_hook_t *mpo_reserved33; - mpo_reserved_hook_t *mpo_reserved34; + mpo_vnode_check_dataprotect_set_t *mpo_vnode_check_dataprotect_set; + mpo_mount_check_remount_with_flags_t *mpo_mount_check_remount_with_flags; mpo_mount_notify_mount_t *mpo_mount_notify_mount; mpo_vnode_check_copyfile_t *mpo_vnode_check_copyfile; @@ -6026,7 +6152,7 @@ struct mac_policy_ops { mpo_proc_check_sched_t *mpo_proc_check_sched; mpo_proc_check_setaudit_t *mpo_proc_check_setaudit; mpo_proc_check_setauid_t *mpo_proc_check_setauid; - mpo_reserved_hook_t *mpo_reserved64; + mpo_proc_check_iopolicysys_t *mpo_proc_check_iopolicysys; mpo_proc_check_signal_t *mpo_proc_check_signal; mpo_proc_check_wait_t *mpo_proc_check_wait; mpo_proc_check_dump_core_t *mpo_proc_check_dump_core; diff --git a/security/mac_process.c b/security/mac_process.c index c57e29e68..51bcf33e8 100644 --- a/security/mac_process.c +++ b/security/mac_process.c @@ -466,6 +466,23 @@ mac_proc_check_inherit_ipc_ports( return error; } +int +mac_proc_check_iopolicysys(struct 
proc *p, kauth_cred_t cred, int cmd, int type, int scope, int policy) +{ + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_system_enforce) { + return 0; + } +#endif + + MAC_CHECK(proc_check_iopolicysys, p, cred, cmd, type, scope, policy); + + return error; +} + /* * The type of maxprot in proc_check_map_anon must be equivalent to vm_prot_t * (defined in ). mac_policy.h does not include any header diff --git a/security/mac_skywalk.c b/security/mac_skywalk.c index 28b370f1a..20f9b5c1d 100644 --- a/security/mac_skywalk.c +++ b/security/mac_skywalk.c @@ -33,21 +33,25 @@ #include int -mac_skywalk_flow_check_connect(__unused proc_t proc, void *flow, const struct sockaddr *addr, int type, int protocol) +mac_skywalk_flow_check_connect(proc_t proc, void *flow, const struct sockaddr *addr, int type, int protocol) { int error; + kauth_cred_t cred; - assert(proc == current_proc()); - MAC_CHECK(skywalk_flow_check_connect, kauth_cred_get(), flow, addr, type, protocol); + cred = kauth_cred_proc_ref(proc); + MAC_CHECK(skywalk_flow_check_connect, cred, flow, addr, type, protocol); + kauth_cred_unref(&cred); return error; } int -mac_skywalk_flow_check_listen(__unused proc_t proc, void *flow, const struct sockaddr *addr, int type, int protocol) +mac_skywalk_flow_check_listen(proc_t proc, void *flow, const struct sockaddr *addr, int type, int protocol) { int error; + kauth_cred_t cred; - assert(proc == current_proc()); - MAC_CHECK(skywalk_flow_check_listen, kauth_cred_get(), flow, addr, type, protocol); + cred = kauth_cred_proc_ref(proc); + MAC_CHECK(skywalk_flow_check_listen, cred, flow, addr, type, protocol); + kauth_cred_unref(&cred); return error; } diff --git a/security/mac_vfs.c b/security/mac_vfs.c index f356cc70e..efcc1ef94 100644 --- a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -100,7 +100,7 @@ * KDBG_EVENTID(DBG_FSYSTEM, DBG_VFS, dcode) global event id, see bsd/sys/kdebug.h. * Note that dcode is multiplied by 4 and ORed as part of the construction. See bsd/kern/trace_codes * for list of system-wide {global event id, name} pairs. Currently DBG_VFS event ids are in range - * [0x3130000, 0x3130198]. + * [0x3130000, 0x313019C]. */ //#define VFS_TRACE_POLICY_OPS @@ -187,7 +187,7 @@ mac_vnode_label_init(vnode_t vp) struct label *label; label = mac_vnode_label_alloc(vp); - vp->v_label = label; + os_atomic_store(&vp->v_label, label, release); } struct label * @@ -2158,7 +2158,7 @@ mac_vnode_label_update(vfs_context_t ctx, struct vnode *vp, struct label *newlab * somebody else might have already got here first. 
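 * (The os_atomic_store() with release ordering used below pairs with the
 * acquire load in mac_label_verify(), so a reader that observes the newly
 * published v_label pointer also observes the fully initialized label.)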
*/ if (mac_vnode_label(vp) == NULL) { - vp->v_label = tmpl; + os_atomic_store(&vp->v_label, tmpl, release); tmpl = NULL; } @@ -2949,6 +2949,29 @@ mac_vnode_check_rename_swap(vfs_context_t ctx, struct vnode *fdvp, return error; } +int +mac_vnode_check_dataprotect_set(vfs_context_t ctx, struct vnode *vp, uint32_t *dataprotect_class) +{ + kauth_cred_t cred; + int error = 0; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) { + return 0; + } +#endif + cred = vfs_context_ucred(ctx); + if (!mac_cred_check_enforce(cred)) { + return 0; + } + VFS_KERNEL_DEBUG_START1(101, vp); + MAC_PERFORM(vnode_check_dataprotect_set, cred, vp, dataprotect_class); + VFS_KERNEL_DEBUG_END1(101, vp); + + return error; +} + void mac_mount_notify_mount(vfs_context_t ctx, struct mount *mp) { @@ -2956,3 +2979,128 @@ mac_mount_notify_mount(vfs_context_t ctx, struct mount *mp) MAC_PERFORM(mount_notify_mount, vfs_context_ucred(ctx), mp, mac_mount_label(mp)); VFS_KERNEL_DEBUG_END1(102, mp); } + +int +mac_mount_check_remount_with_flags(vfs_context_t ctx, struct mount *mp, int *flagsp) +{ + kauth_cred_t cred; + int error; + int visflags; + + if (!flagsp) { + return EINVAL; + } + visflags = (*flagsp & (MNT_CMDFLAGS | MNT_VISFLAGMASK)); + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) { + return 0; + } +#endif + cred = vfs_context_ucred(ctx); + if (!mac_cred_check_enforce(cred)) { + return 0; + } + VFS_KERNEL_DEBUG_START1(103, mp); + MAC_CHECK(mount_check_remount_with_flags, cred, mp, mac_mount_label(mp), &visflags); + VFS_KERNEL_DEBUG_END1(103, mp); + + if (error) { + return error; + } + + /* Sanity check */ + if (visflags != (visflags & (MNT_CMDFLAGS | MNT_VISFLAGMASK))) { + return EINVAL; + } + *flagsp = visflags; + + return 0; +} + +int +mac_graft_check_graft(vfs_context_t ctx, struct vnode *graft_dir_vp) +{ + kauth_cred_t cred; + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) { + return 0; + } +#endif + cred = vfs_context_ucred(ctx); + if (!mac_cred_check_enforce(cred)) { + return 0; + } + VFS_KERNEL_DEBUG_START1(104, vp); + MAC_CHECK(graft_check_graft, cred, graft_dir_vp); + VFS_KERNEL_DEBUG_END1(104, vp); + + return error; +} + +int +mac_graft_check_ungraft(vfs_context_t ctx, struct vnode *graft_dir_vp) +{ + kauth_cred_t cred; + int error; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) { + return 0; + } +#endif + cred = vfs_context_ucred(ctx); + if (!mac_cred_check_enforce(cred)) { + return 0; + } + VFS_KERNEL_DEBUG_START1(105, vp); + MAC_CHECK(graft_check_ungraft, cred, graft_dir_vp); + VFS_KERNEL_DEBUG_END1(105, vp); + + return error; +} + +void +mac_graft_notify_graft(vfs_context_t ctx, struct vnode *graft_dir_vp) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) { + return; + } +#endif + cred = vfs_context_ucred(ctx); + if (!mac_cred_check_enforce(cred)) { + return; + } + VFS_KERNEL_DEBUG_START1(106, vp); + MAC_PERFORM(graft_notify_graft, cred, graft_dir_vp); + VFS_KERNEL_DEBUG_END1(106, vp); +} + +void +mac_graft_notify_ungraft(vfs_context_t ctx, struct vnode *graft_dir_vp) +{ + kauth_cred_t cred; + +#if SECURITY_MAC_CHECK_ENFORCE + /* 21167099 - only check if we allow write */ + if (!mac_vnode_enforce) { + return; + } +#endif + cred = vfs_context_ucred(ctx); + if 
(!mac_cred_check_enforce(cred)) { + return; + } + VFS_KERNEL_DEBUG_START1(107, vp); + MAC_PERFORM(graft_notify_ungraft, cred, graft_dir_vp); + VFS_KERNEL_DEBUG_END1(107, vp); +} diff --git a/tests/Makefile b/tests/Makefile index 461baaf45..60c856eb4 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -43,11 +43,15 @@ include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common OTHER_CFLAGS = -Wall OTHER_CFLAGS += -Wno-deprecated-declarations +OTHER_CFLAGS += -Werror=excess-initializers OTHER_CFLAGS += -Wno-c2x-extensions +OTHER_CFLAGS += -Werror=excess-initializers OTHER_CFLAGS += --std=gnu17 -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders OTHER_CFLAGS += -UT_NAMESPACE_PREFIX -DT_NAMESPACE_PREFIX=xnu -DT_LEAKS_DISABLE=1 OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks +ipc/%: OTHER_CFLAGS += -Wno-format-pedantic $(WERROR) + CODESIGN:=$(shell xcrun -sdk "$(TARGETSDK)" -find codesign) CODESIGN_HARDENED_RUNTIME:=$(CODESIGN) -o runtime @@ -98,6 +102,9 @@ include sched/Makefile # Miscellaneous Tests +mach_eventlink: OTHER_LDFLAGS += $(SCHED_UTILS_FLAGS) +mach_eventlink: $(SCHED_UTILS) + signal_exit_reason: CODE_SIGN_ENTITLEMENTS = signal_exit_reason.entitlements atm_diagnostic_flag: OTHER_CFLAGS += drop_priv.c @@ -155,10 +162,45 @@ corpse_backtrace: OTHER_CFLAGS += $(OBJROOT)/excserver_backtrace.c corpse_backtrace: OTHER_CFLAGS += -I $(OBJROOT) corpse_backtrace: CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement +# Tests that use exc_guard_helper.h should add all of the build options used by exc_guard_helper_test: +# - additional C files exc_guard_helper.c and exc_helpers.c and test_utils.c +# - additional library darwintest_utils +# - excserver MIG-generated file +# - entitlement to allow thread_set_exception_ports() +exc_guard_helper_test: exc_guard_helper.c exc_helpers.c test_utils.c excserver +exc_guard_helper_test: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +exc_guard_helper_test: OTHER_LDFLAGS += -ldarwintest_utils +exc_guard_helper_test: CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement + +EXCLUDED_SOURCES += exc_guard_helper.c + +exc_guard_helper_test_unexpected: exc_guard_helper.c exc_helpers.c test_utils.c excserver +exc_guard_helper_test_unexpected: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +exc_guard_helper_test_unexpected: OTHER_LDFLAGS += -ldarwintest_utils +exc_guard_helper_test_unexpected: CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement + +# Tests that use try_read_write.h should add all of the build options used by try_read_write_test: +# - additional C files try_read_write.c and exc_helpers.c +# - excserver MIG-generated file +# - entitlement to allow thread_set_exception_ports() +try_read_write_test: try_read_write.c exc_helpers.c excserver +try_read_write_test: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +try_read_write_test: CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement + +EXCLUDED_SOURCES += try_read_write.c + +try_read_write_test_unexpected: try_read_write.c exc_helpers.c excserver +try_read_write_test_unexpected: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +try_read_write_test_unexpected: CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement + vm/corpse_owned_vmobjects: excserver_protect_state vm/corpse_owned_vmobjects: OTHER_CFLAGS += $(OBJROOT)/excserver_protect_state.c -I $(OBJROOT) vm/corpse_owned_vmobjects: CODE_SIGN_ENTITLEMENTS += vm/corpse_owned_vmobjects.entitlements 
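/*
 * Illustrative sketch only, not part of this change: how a MAC policy kext
 * might register the hooks introduced above (mpo_graft_check_graft and
 * mpo_proc_check_iopolicysys). The policy name, the decision logic and the
 * start routine are hypothetical; only the hook signatures and the
 * mac_policy_register() interface come from mac_policy.h.
 */
#include <sys/errno.h>
#include <sys/kauth.h>
#include <mach/kmod.h>
#include <security/mac_policy.h>

static int
example_graft_check_graft(kauth_cred_t cred, struct vnode *graft_dir_vp)
{
	/* Hypothetical rule: only the superuser may graft at this vnode. */
	(void)graft_dir_vp;
	return kauth_cred_getuid(cred) == 0 ? 0 : EPERM;
}

static int
example_proc_check_iopolicysys(struct proc *p, kauth_cred_t cred,
    int cmd, int type, int scope, int policy)
{
	/* Hypothetical rule: observe every iopolicysys() call, deny none. */
	(void)p; (void)cred; (void)cmd; (void)type; (void)scope; (void)policy;
	return 0;
}

static struct mac_policy_ops example_ops = {
	.mpo_graft_check_graft      = example_graft_check_graft,
	.mpo_proc_check_iopolicysys = example_proc_check_iopolicysys,
};

static mac_policy_handle_t example_handle;

static struct mac_policy_conf example_conf = {
	.mpc_name           = "example_policy",
	.mpc_fullname       = "Example policy exercising the new hooks",
	.mpc_ops            = &example_ops,
	.mpc_loadtime_flags = MPC_LOADTIME_FLAG_UNLOADOK,
};

/* Typically invoked from the kext's kmod start routine. */
static kern_return_t
example_policy_start(kmod_info_t *ki, void *xd)
{
	(void)ki;
	return mac_policy_register(&example_conf, &example_handle, xd) == 0 ?
	       KERN_SUCCESS : KERN_FAILURE;
}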
+vm_test_mach_map: try_read_write.c exc_helpers.c excserver +vm_test_mach_map: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +vm_test_mach_map: CODE_SIGN_ENTITLEMENTS = ./vm_test_mach_map.plist + OTHER_TEST_TARGETS += vm/vm_parameter_validation # vm_parameter_validation builds each arch as a separate build command @@ -303,6 +345,34 @@ ipc/sys_perf_notify_test: excserver_protect_state ipc/sys_perf_notify_test: OTHER_CFLAGS += $(OBJROOT)/excserver_protect_state.c -I $(OBJROOT) ipc/sys_perf_notify_test: CODE_SIGN_ENTITLEMENTS=entitlements/sys_perf_notify_test.entitlements +ipc/port_type_policy: CODE_SIGN_ENTITLEMENTS = entitlements/port_type_policy.entitlements +ipc/port_type_policy: OTHER_CFLAGS += -I $(OBJROOT) +ipc/port_type_policy: ipc/port_type_policy.c + $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(OTHER_CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders \ + $(filter-out $(CUSTOM_TARGETS), $^) -o $(SYMROOT)/$@ + $(CODESIGN) --force --sign - --entitlements $(CODE_SIGN_ENTITLEMENTS) --timestamp=none $(SYMROOT)/$@ + +EXCLUDED_SOURCES += posix_sem_namespace_helper.c +CUSTOM_TARGETS += posix_sem_namespace_helper_team0 posix_sem_namespace_helper_team1 +posix_sem_namespace_helper_team0: posix_sem_namespace_helper.c + $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders \ + $^ -o $(SYMROOT)/$@ + $(CODESIGN) --force --sign - --timestamp=none --team-identifier "PSXSEMT0" $(SYMROOT)/$@ +posix_sem_namespace_helper_team1: posix_sem_namespace_helper.c + $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders \ + $^ -o $(SYMROOT)/$@ + $(CODESIGN) --force --sign - --timestamp=none --team-identifier "PSXSEMT1" $(SYMROOT)/$@ + +install-posix_sem_namespace_helper_team0: posix_sem_namespace_helper_team0 + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/posix_sem_namespace_helper_team0 $(INSTALLDIR)/ +install-posix_sem_namespace_helper_team1: posix_sem_namespace_helper_team1 + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/posix_sem_namespace_helper_team1 $(INSTALLDIR)/ + +posix_sem: posix_sem_namespace_helper_team0 posix_sem_namespace_helper_team1 +posix_sem: OTHER_LDFLAGS += -ldarwintest_utils + posix_spawnattr_set_crash_behavior_np: posix_spawnattr_set_crash_behavior_np_child posix_spawnattr_set_crash_behavior_np: CODE_SIGN_ENTITLEMENTS = posix_spawnattr_set_crash_behavior_np_entitlements.plist @@ -371,6 +441,8 @@ install-kqworkloop_limits_client: kqworkloop_limits_client ipc/task_name_for_pid: CODE_SIGN_ENTITLEMENTS = ipc/task_name_for_pid_entitlement.plist +ipc/mach_msg_transport: CODE_SIGN_ENTITLEMENTS = entitlements/create_conn_port_with_port_array.entitlements + fd_table_limits: rnserver fd_table_limits: OTHER_CFLAGS += $(OBJROOT)/rnServer.c -I $(OBJROOT) fd_table_limits: OTHER_LDFLAGS += -ldarwintest_utils -lpthread @@ -406,7 +478,7 @@ imm_pinned_control_port_crasher: imm_pinned_control_port_crasher.c cs_helpers.c $^ -o $(SYMROOT)/$@ $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@ -imm_pinned_control_port_crasher_3P_hardened: CODE_SIGN_ENTITLEMENTS = entitlements/hardened-binary-entitlements-1.plist +imm_pinned_control_port_crasher_3P_hardened: CODE_SIGN_ENTITLEMENTS = entitlements/enhanced-security-binary-entitlements-1.plist imm_pinned_control_port_crasher_3P_hardened: 
imm_pinned_control_port_crasher.c cs_helpers.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders \ $^ -o $(SYMROOT)/$@ @@ -429,12 +501,14 @@ reply_port_defense: OTHER_LDFLAGS += -ldarwintest_utils -lpthread reply_port_defense: CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement reply_port_defense: reply_port_defense_client reply_port_defense_client_3P_hardened +reply_port_defense_client: CODE_SIGN_ENTITLEMENTS=exception_tests.entitlements +reply_port_defense_client: OTHER_LDFLAGS += -ldarwintest_utils -lpthread reply_port_defense_client: reply_port_defense_client.c cs_helpers.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders \ $^ -o $(SYMROOT)/$@ - $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@ + $(CODESIGN) --force --sign - --entitlements $(CODE_SIGN_ENTITLEMENTS) --timestamp=none $(SYMROOT)/$@ -reply_port_defense_client_3P_hardened: CODE_SIGN_ENTITLEMENTS=entitlements/hardened-binary-entitlements-1.plist +reply_port_defense_client_3P_hardened: CODE_SIGN_ENTITLEMENTS=entitlements/enhanced-security-binary-entitlements-1.plist reply_port_defense_client_3P_hardened: reply_port_defense_client.c cs_helpers.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders \ $^ -o $(SYMROOT)/$@ @@ -460,7 +534,7 @@ OTHER_TEST_TARGETS += net_no_bounds_safety host_statistics_rate_limiting: cs_helpers.c host_statistics_rate_limiting.c -EXCLUDED_SOURCES += drop_priv.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c bpflib.c in_cksum.c test_utils.c inet_transfer.c net_test_lib.c cs_helpers.c +EXCLUDED_SOURCES += drop_priv.c xnu_quick_test_helpers.c memorystatus_assertion_helpers.c bpflib.c in_cksum.c test_utils.c inet_transfer.c net_test_lib.c cs_helpers.c vsock_helpers.c ifneq ($(IOS_TEST_COMPAT),YES) EXCLUDED_SOURCES += jumbo_va_spaces_28530648.c perf_compressor.c vm/ios13extended_footprint.c vm/entitlement_internal_bands.c @@ -792,6 +866,11 @@ INCLUDED_TEST_SOURCE_DIRS += ipc rm workq include vm/Makefile +vm/vm_stress: OTHER_CXXFLAGS += -std=c++20 +vm/vm_stress: OTHER_LDFLAGS += -ldarwintest_utils +vm/vm_stress: INVALID_ARCHS = x86_64h + + big_map_test: CODE_SIGN_ENTITLEMENTS = ./big_map_test_entitlement.plist task_info_28439149: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist @@ -802,6 +881,7 @@ ipc/ipc_thread_ports_race: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.p ipc/ipc_thread_ports_race: OTHER_LDFLAGS += -ldarwintest_utils rm/pthread_setugid: OTHER_LDFLAGS += -ldarwintest_utils +rm/coalition_info_resource_usage: OTHER_LDFLAGS += -ldarwintest_utils proc_info: recount/recount_test_utils.c proc_info: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist @@ -813,12 +893,12 @@ proc_info_44873309: CODE_SIGN_ENTITLEMENTS = ./proc_info_44873309.entitlements disk_mount_conditioner: disk_mount_conditioner* disk_mount_conditioner: CODE_SIGN_ENTITLEMENTS=./disk_mount_conditioner-entitlements.plist -disk_mount_conditioner: OTHER_LDFLAGS += -ldarwintest_utils +disk_mount_conditioner: OTHER_LDFLAGS += -ldarwintest_utils -framework Foundation disk_mount_conditioner: OTHER_TEST_TARGETS += disk_mount_conditioner_unentitled disk_mount_conditioner_unentitled: OTHER_CFLAGS += -DTEST_UNENTITLED -disk_mount_conditioner_unentitled: OTHER_LDFLAGS += 
-ldarwintest_utils -disk_mount_conditioner_unentitled: disk_mount_conditioner.c +disk_mount_conditioner_unentitled: OTHER_LDFLAGS += -ldarwintest_utils -framework Foundation +disk_mount_conditioner_unentitled: disk_mount_conditioner.m $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ work_interval_test: CODE_SIGN_ENTITLEMENTS = work_interval_test.entitlements @@ -830,8 +910,11 @@ work_interval_data_test: OTHER_LDFLAGS += -lpthread game_mode: CODE_SIGN_ENTITLEMENTS = game_mode.entitlements carplay_mode: CODE_SIGN_ENTITLEMENTS = carplay_mode.entitlements +runaway_mitigation: CODE_SIGN_ENTITLEMENTS = runaway_mitigation.entitlements coalition_policy: CODE_SIGN_ENTITLEMENTS = coalition_policy.entitlements +coalition_policy: OTHER_LDFLAGS += $(SCHED_UTILS_FLAGS) +coalition_policy: $(SCHED_UTILS) settimeofday_29193041: OTHER_CFLAGS += drop_priv.c @@ -858,6 +941,12 @@ endif unp_sock_release: CODE_SIGN_ENTITLEMENTS = unp_sock_release-entitlements.plist +accept_race: in_cksum.c net_test_lib.c +accept_race: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + +flow_div_doubleconnect_55917185: in_cksum.c net_test_lib.c +flow_div_doubleconnect_55917185: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + bpf_write: bpflib.c in_cksum.c net_test_lib.c bpf_write: OTHER_LDFLAGS += -ldarwintest_utils @@ -872,8 +961,21 @@ udp_bind_connect: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist tcp_bind_connect: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist tcp_send_implied_connect: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + +tcp_cache_test: CODE_SIGN_ENTITLEMENTS = tcp_cache_entitlements.plist + +socket_bind_35243417: in_cksum.c net_test_lib.c socket_bind_35243417: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + +socket_bind_35685803: in_cksum.c net_test_lib.c socket_bind_35685803: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + +tcp_input_outputopts_uaf_56155583: in_cksum.c net_test_lib.c +tcp_input_outputopts_uaf_56155583: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + +unp_connect_thread_uaf: in_cksum.c net_test_lib.c +unp_connect_thread_uaf: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + icmp_fragmetned_payload: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist tcp_invalid_address: OTHER_CFLAGS += -Wno-int-conversion tcp_invalid_address: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist @@ -892,6 +994,9 @@ net_bond: inet_transfer.c bpflib.c in_cksum.c net_test_lib.c net_bond: OTHER_LDFLAGS += -ldarwintest_utils net_bond: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist +mcast_ssm: in_cksum.c net_test_lib.c +mcast_ssm: OTHER_LDFLAGS += -ldarwintest_utils + if_generation_id: net_test_lib.c in_cksum.c if_generation_id: OTHER_LDFLAGS += -ldarwintest_utils @@ -906,16 +1011,39 @@ bpf_timestamp: bpflib.c bpf_timestamp: OTHER_LDFLAGS += -ldarwintest_utils bpf_timestamp: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist +ipv6_bind_race: in_cksum.c net_test_lib.c ipv6_bind_race: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist +mcast_group_race_82820812: in_cksum.c net_test_lib.c +mcast_group_race_82820812: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + so_bindtodevice: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist +recv_link_addr_type: net_test_lib.c in_cksum.c +recv_link_addr_type: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + +ip_pktinfo: net_test_lib.c in_cksum.c +ip_pktinfo: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + v4mappedv6_dontfrag: net_test_lib.c in_cksum.c 
v4mappedv6_dontfrag: OTHER_LDFLAGS += -ldarwintest_utils v4mappedv6_dontfrag: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist +net_siocdifaddr: net_test_lib.c in_cksum.c +net_siocdifaddr: OTHER_LDFLAGS += -ldarwintest_utils + +vsock: vsock_helpers.c +vsock_entitled: vsock_helpers.c +vsock_entitled: CODE_SIGN_ENTITLEMENTS = vsock_entitlements.plist + +sendmsg_test: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + network_elevated_logging: CODE_SIGN_ENTITLEMENTS = entitlements/network_elevated_logging.entitlements +ipc/platform_restrictions_entitlements: CODE_SIGN_ENTITLEMENTS = entitlements/platform-restrictions.plist + +ipc/tpro_entitlements: CODE_SIGN_ENTITLEMENTS = entitlements/tpro.plist + CUSTOM_TARGETS += posix_spawn_archpref_helper posix_spawn_archpref_helper: posix_spawn_archpref_helper.c @@ -1057,13 +1185,13 @@ arm_matrix: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) -Wno-language-e arm_matrix: CODE_SIGN_ENTITLEMENTS = arm_matrix.entitlements -x18_entitled: OTHER_CFLAGS += -Wno-language-extension-token +x18_entitled: OTHER_CFLAGS += -ffixed-x18 -Wno-language-extension-token x18_entitled: CODE_SIGN_ENTITLEMENTS = x18-entitlements.plist -x18_unentitled: OTHER_CFLAGS += -Wno-language-extension-token +x18_unentitled: OTHER_CFLAGS += -ffixed-x18 -Wno-language-extension-token ifneq ($(PLATFORM),MacOSX) EXCLUDED_SOURCES += x18_legacy.c else -x18_legacy: OTHER_CFLAGS += -Wno-language-extension-token -Wl,-platform_version -Wl,macos -Wl,10.12 -Wl,10.12 +x18_legacy: OTHER_CFLAGS += -ffixed-x18 -Wno-language-extension-token -Wl,-platform_version -Wl,macos -Wl,10.12 -Wl,10.12 endif ifeq ($(findstring arm64,$(ARCH_CONFIGS)),) @@ -1093,9 +1221,47 @@ endif endif -pmap_bench: INVALID_ARCHS = $(filter-out arm%,$(ARCH_CONFIGS)) +bingrade: bingrade_helper_arm32 bingrade_helper_arm64 bingrade_helper_arm_fat +bingrade: INVALID_ARCHS = x86_64 x86_64h +# libdarwintest.a doesn't have an arm64 slice on watchOS +ifeq ($(PLATFORM),WatchOS) +bingrade: INVALID_ARCHS += arm64 +endif +EXCLUDED_SOURCES += bingrade_helper.c +CUSTOM_TARGETS += bingrade_helper_arm32 +bingrade_helper_arm32: CFLAGS = -arch arm64_32 -static -nostdlib +bingrade_helper_arm32: bingrade_helper.c + $(CC) $(CFLAGS) $< -o $(SYMROOT)/$@ + $(CODESIGN) -f -s- $(SYMROOT)/$@ + +install-bingrade_helper_arm32: bingrade_helper_arm32 + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/$< $(INSTALLDIR)/ + +CUSTOM_TARGETS += bingrade_helper_arm64 +bingrade_helper_arm64: CFLAGS = -arch arm64 -static -nostdlib +bingrade_helper_arm64: bingrade_helper.c + $(CC) $(CFLAGS) $< -o $(SYMROOT)/$@ + $(CODESIGN) -f -s- $(SYMROOT)/$@ + +install-bingrade_helper_arm64: bingrade_helper_arm64 + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/$< $(INSTALLDIR)/ + +CUSTOM_TARGETS += bingrade_helper_arm_fat +bingrade_helper_arm_fat: CFLAGS = -arch arm64 -arch arm64_32 -arch arm64e -static -nostdlib +bingrade_helper_arm_fat: bingrade_helper.c + $(CC) $(CFLAGS) $< -o $(SYMROOT)/$@ + $(CODESIGN) -f -s- $(SYMROOT)/$@ + +install-bingrade_helper_arm_fat: bingrade_helper_arm_fat + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/$< $(INSTALLDIR)/ + +pmap_bench: INVALID_ARCHS = $(filter-out arm%,$(ARCH_CONFIGS)) pmap_stress: INVALID_ARCHS = $(filter-out arm%,$(ARCH_CONFIGS)) +pmap_fault_on_commpage: INVALID_ARCHS = $(filter-out arm%,$(ARCH_CONFIGS)) hw_breakpoint_step_arm64: CODE_SIGN_ENTITLEMENTS = hw_breakpoint_step_arm64_entitlements.plist hw_breakpoint_step_arm64: INVALID_ARCHS = $(filter-out arm64%,$(ARCH_CONFIGS)) @@ -1118,7 +1284,7 @@ ptrauth_failure: OTHER_CFLAGS += 
$(OBJROOT)/excserver.c -I $(OBJROOT) -Wno-langu ptrauth_failure: CODESIGN = $(CODESIGN_HARDENED_RUNTIME) ptrauth_failure: CODE_SIGN_ENTITLEMENTS = entitlements/set_state_and_exception_port.entitlement -decompression_failure: excserver exc_helpers.c +decompression_failure: excserver exc_helpers.c try_read_write.c decompression_failure: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) decompression_failure: CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement @@ -1134,6 +1300,14 @@ pac_exception_entitlement: OTHER_CFLAGS += $(OBJROOT)/excserver.c -Wno-language- pac_exception_entitlement: CODE_SIGN_ENTITLEMENTS = pac_exception_entitlement.plist endif +unrecoverable_trap_test: excserver exc_helpers.c +unrecoverable_trap_test: OTHER_CFLAGS += $(OBJROOT)/excserver.c -Wno-language-extension-token +unrecoverable_trap_test: CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement +# libdarwintest.a doesn't have an arm64 slice on watchOS +ifeq ($(PLATFORM),WatchOS) +unrecoverable_trap_test: INVALID_ARCHS += arm64 +endif + ifeq ($(findstring arm64,$(ARCH_CONFIGS)),) EXCLUDED_SOURCES += arm_cpu_capabilities.c else @@ -1242,8 +1416,6 @@ endif EXCLUDED_SOURCES += benchmark/helpers.c -perf_vmfault: OTHER_CFLAGS += benchmark/helpers.c - vm/fault_throughput: OTHER_CFLAGS += benchmark/helpers.c .PHONY: install-vm/fault_throughput @@ -1404,13 +1576,13 @@ test_static_binary_csflags: OTHER_LDFLAGS += -ldarwintest_utils libmalloc_apple_array: CODE_SIGN_ENTITLEMENTS = entitlements/libmalloc_apple_array.entitlements libmalloc_apple_array: OTHER_CFLAGS += -DENTITLED=1 drop_priv.c test_utils.c libmalloc_apple_array: OTHER_LDFLAGS += -ldarwintest_utils -libmalloc_apple_array: libmalloc_apple_array.c print_apple_array print_apple_array_HR1 print_apple_array_HR2 print_apple_array_hardened_proc print_apple_array_hardened_heap_disable print_apple_array_hardened_heap +libmalloc_apple_array: libmalloc_apple_array.c print_apple_array print_apple_array_HR1 print_apple_array_HR2 print_apple_array_hardened_proc print_apple_array_hardened_proc_security print_apple_array_hardened_heap print_apple_array_hardened_heap_security $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@ env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none --entitlements $(CODE_SIGN_ENTITLEMENTS) $(SYMROOT)/$@; libmalloc_apple_array_unentitled: OTHER_CFLAGS += drop_priv.c test_utils.c libmalloc_apple_array_unentitled: OTHER_LDFLAGS += -ldarwintest_utils -libmalloc_apple_array_unentitled: libmalloc_apple_array.c print_apple_array print_apple_array_HR1 print_apple_array_HR2 print_apple_array_hardened_proc print_apple_array_hardened_heap_disable print_apple_array_hardened_heap +libmalloc_apple_array_unentitled: libmalloc_apple_array.c print_apple_array print_apple_array_HR1 print_apple_array_HR2 print_apple_array_hardened_proc print_apple_array_hardened_heap $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $(OTHER_CFLAGS) $< -o $(SYMROOT)/$@ env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; @@ -1420,13 +1592,13 @@ print_apple_array: print_apple_array.c env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; CUSTOM_TARGETS += print_apple_array_HR1 -print_apple_array_HR1: CODE_SIGN_ENTITLEMENTS=entitlements/hardened-binary-entitlements-1.plist +print_apple_array_HR1: 
CODE_SIGN_ENTITLEMENTS=entitlements/enhanced-security-binary-entitlements-1.plist print_apple_array_HR1: print_apple_array.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --entitlements $(SRCROOT)/$(CODE_SIGN_ENTITLEMENTS) --timestamp=none $(SYMROOT)/$@; CUSTOM_TARGETS += print_apple_array_HR2 -print_apple_array_HR2: CODE_SIGN_ENTITLEMENTS=entitlements/hardened-binary-entitlements-2.plist +print_apple_array_HR2: CODE_SIGN_ENTITLEMENTS=entitlements/enhanced-security-binary-entitlements-2.plist print_apple_array_HR2: print_apple_array.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --entitlements $(SRCROOT)/$(CODE_SIGN_ENTITLEMENTS) --timestamp=none $(SYMROOT)/$@; @@ -1437,18 +1609,24 @@ print_apple_array_hardened_proc: print_apple_array.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --entitlements $(SRCROOT)/$(CODE_SIGN_ENTITLEMENTS) --timestamp=none $(SYMROOT)/$@; -CUSTOM_TARGETS += print_apple_array_hardened_heap_disable -print_apple_array_hardened_heap_disable: CODE_SIGN_ENTITLEMENTS=entitlements/hardened-proc-hardened-heap-override.entitlements -print_apple_array_hardened_heap_disable: print_apple_array.c +CUSTOM_TARGETS += print_apple_array_hardened_proc_security +print_apple_array_hardened_proc_security: CODE_SIGN_ENTITLEMENTS=entitlements/hardened-proc-security.entitlements +print_apple_array_hardened_proc_security: print_apple_array.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --entitlements $(SRCROOT)/$(CODE_SIGN_ENTITLEMENTS) --timestamp=none $(SYMROOT)/$@; + CUSTOM_TARGETS += print_apple_array_hardened_heap -print_apple_array_hardened_heap: CODE_SIGN_ENTITLEMENTS=entitlements/hardened-heap-standalone.entitlements +print_apple_array_hardened_heap: CODE_SIGN_ENTITLEMENTS=entitlements/hardened-heap.entitlements print_apple_array_hardened_heap: print_apple_array.c $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --entitlements $(SRCROOT)/$(CODE_SIGN_ENTITLEMENTS) --timestamp=none $(SYMROOT)/$@; +CUSTOM_TARGETS += print_apple_array_hardened_heap_security +print_apple_array_hardened_heap_security: CODE_SIGN_ENTITLEMENTS=entitlements/hardened-heap-security.entitlements +print_apple_array_hardened_heap_security: print_apple_array.c + $(CC) $(DT_CFLAGS) -I $(OBJROOT) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --entitlements $(SRCROOT)/$(CODE_SIGN_ENTITLEMENTS) --timestamp=none $(SYMROOT)/$@; install-print_apple_array: print_apple_array mkdir -p $(INSTALLDIR)/tools @@ -1466,15 +1644,19 @@ install-print_apple_array_hardened_proc: print_apple_array_hardened_proc mkdir -p $(INSTALLDIR)/tools cp $(SYMROOT)/print_apple_array_hardened_proc $(INSTALLDIR)/tools/ -install-print_apple_array_hardened_heap_disable: print_apple_array_hardened_heap_disable +install-print_apple_array_hardened_proc_security: print_apple_array_hardened_proc_security mkdir -p 
$(INSTALLDIR)/tools - cp $(SYMROOT)/print_apple_array_hardened_heap_disable $(INSTALLDIR)/tools/ + cp $(SYMROOT)/print_apple_array_hardened_proc_security $(INSTALLDIR)/tools/ install-print_apple_array_hardened_heap: print_apple_array_hardened_heap mkdir -p $(INSTALLDIR)/tools cp $(SYMROOT)/print_apple_array_hardened_heap $(INSTALLDIR)/tools/ +install-print_apple_array_hardened_heap_security: print_apple_array_hardened_heap_security + mkdir -p $(INSTALLDIR)/tools + cp $(SYMROOT)/print_apple_array_hardened_heap_security $(INSTALLDIR)/tools/ + persona: CODE_SIGN_ENTITLEMENTS = persona.entitlements persona_adoption: CODE_SIGN_ENTITLEMENTS = persona_adoption.entitlements @@ -1508,11 +1690,36 @@ iopolicy: CODE_SIGN_ENTITLEMENTS = iopolicy.entitlements INCLUDED_TEST_SOURCE_DIRS += vfs vfs/freeable_vnodes: OTHER_LDFLAGS += -ldarwintest_utils +vfs/sandbox_fstat: OTHER_LDFLAGS += -lsandbox +vfs/named_fork_path: OTHER_LDFLAGS += -lsandbox +vfs/sandbox_type_error: OTHER_LDFLAGS += -lsandbox +vfs/sandbox_appledouble_write: OTHER_LDFLAGS += -lsandbox -vm/vm_reclaim: OTHER_CFLAGS += -Wno-language-extension-token -Wno-c++98-compat memorystatus_assertion_helpers.c -vm/vm_reclaim: OTHER_LDFLAGS += -ldarwintest_utils +# tests/vm/configurator_*.c use the VM configurator -vm/vm_reclaim: INVALID_ARCHS = armv7k arm64_32 +VM_CONFIGURATOR_SRCS = $(wildcard vm/configurator_*.c) +VM_CONFIGURATOR_TARGETS = $(VM_CONFIGURATOR_SRCS:%.c=%) + +$(VM_CONFIGURATOR_TARGETS): vm/configurator/vm_configurator.c +$(VM_CONFIGURATOR_TARGETS): vm/configurator/vm_configurator.h vm/configurator/vm_configurator_tests.h +$(VM_CONFIGURATOR_TARGETS): OTHER_CFLAGS += -Wall -Wextra -g + +# exception handling helpers +$(VM_CONFIGURATOR_TARGETS): exc_guard_helper.c try_read_write.c exc_helpers.c test_utils.c excserver +$(VM_CONFIGURATOR_TARGETS): OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +$(VM_CONFIGURATOR_TARGETS): OTHER_LDFLAGS += -ldarwintest_utils +$(VM_CONFIGURATOR_TARGETS): CODE_SIGN_ENTITLEMENTS = entitlements/set_exception_port.entitlement + +# Convenience target to build all tests that use vm_configurator.c +.PHONY: all-vm-configurator-tests +all-vm-configurator-tests: $(VM_CONFIGURATOR_TARGETS) + +# Convenience target to list all test executables that use vm_configurator.c +.PHONY: list-all-vm-configurator-executables +list-all-vm-configurator-executables: + @ echo $(addprefix $(SYMROOT)/,$(VM_CONFIGURATOR_TARGETS)) + +vm/corpse_footprint: OTHER_LDFLAGS += -ldarwintest_utils EXCLUDED_SOURCES += constrained_ctypes_tests.c CUSTOM_TARGETS += constrained_ctypes_tests_enabled constrained_ctypes_tests_enacted constrained_ctypes_tests_disabled @@ -1551,9 +1758,7 @@ install-constrained_ctypes_tests_disabled: constrained_ctypes_tests_disabled task_suspend_stats: OTHER_LDFLAGS += -ldarwintest_utils task_suspend_stats: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist -# os_atomic use throws this warning -setitimer: OTHER_CFLAGS += -Wno-c++98-compat -framework perfdata -setitimer: OTHER_LDFLAGS += -framework perfdata +exec_set_proc_name: OTHER_LDFLAGS += -ldarwintest_utils include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets @@ -1564,8 +1769,37 @@ vfs/longpaths: vfs/longpaths.c vfs/longpaths: OTHER_LDFLAGS += -ldarwintest_utils vfs/longpaths: CODE_SIGN_ENTITLEMENTS = vfs/longpaths.entitlements +vfs/devfd_access: vfs/devfd_access.c +vfs/devfd_access: CODE_SIGN_ENTITLEMENTS = vfs/devfd_access.entitlements + +vfs/openbyid_stress: vfs/openbyid_stress.c +vfs/openbyid_stress: CODE_SIGN_ENTITLEMENTS = 
vfs/openbyid_stress.entitlements + +OTHER_TEST_TARGETS += trial_experiment_factors_entitled +trial_experiment_factors_entitled: OTHER_CFLAGS += -DENTITLED=1 +trial_experiment_factors_entitled: CODE_SIGN_ENTITLEMENTS=kern-trial.entitlements +trial_experiment_factors_entitled: trial_experiment_factors.c + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none --entitlements $(CODE_SIGN_ENTITLEMENTS) $(SYMROOT)/$@; + memorystatus_experiment_factors: CODESIGN_ENTITLEMENTS=./memorystatus/memorystatus_experiment_factors.entitlements memorystatus_experiment_factors: memorystatus/memorystatus_experiment_factors.c $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ $(ENV) CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none --entitlements $(CODESIGN_ENTITLEMENTS) $(SYMROOT)/$@; +ifeq ($(PLATFORM),MacOSX) +CUSTOM_TARGETS += nox86exec_helper + +nox86exec_helper: INVALID_ARCHS = arm64e arm64 +nox86exec_helper: nox86exec_helper.c + $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $^ -o $(SYMROOT)/$@ + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; + +install-posix_spawn_alt_rosetta_helper: nox86exec_helper + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/nox86exec_helper $(INSTALLDIR)/ +endif + +vm/memorystatus_rearm: excserver +vm/memorystatus_rearm: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +vm/memorystatus_rearm: CODE_SIGN_ENTITLEMENTS = vm/memorystatus_rearm.entitlements diff --git a/tests/accept_race.c b/tests/accept_race.c index ab5bdf44d..1907df820 100644 --- a/tests/accept_race.c +++ b/tests/accept_race.c @@ -9,6 +9,8 @@ #include #include +#include "net_test_lib.h" + #define BASE_PORT 2020 static int port = BASE_PORT; @@ -84,4 +86,6 @@ T_DECL(accept_race, sleep(1); } client(); + + force_zone_gc(); } diff --git a/tests/aio.c b/tests/aio.c new file mode 100644 index 000000000..63f46ff34 --- /dev/null +++ b/tests/aio.c @@ -0,0 +1,564 @@ +/* compile: xcrun -sdk macosx.internal clang -arch arm64e -arch x86_64 -ldarwintest -o test_aio aio.c */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_utils.h" + + +#ifndef SIGEV_KEVENT +#define SIGEV_KEVENT 4 +#endif + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.file_descriptors.aio"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("file descriptors"), + T_META_CHECK_LEAKS(false), + T_META_TAG_VM_PREFERRED); + + +#define AIO_TESTFILE "aio_testfile" +#define AIO_BUFFER_SIZE (1024 * 1024) +#define AIO_BUFFER_PATTERN 0x20190912 +#define AIO_LIST_MAX 4 + +static char *g_testfiles[AIO_LIST_MAX]; +static int g_fds[AIO_LIST_MAX]; +static struct aiocb g_aiocbs[AIO_LIST_MAX]; +static char *g_buffers[AIO_LIST_MAX]; + +/* + * This unit-test tests AIO (Asynchronous I/O) facility. 
+ */ + + +static void +exit_cleanup(void) +{ + for (int i = 0; i < AIO_LIST_MAX; i++) { + if (g_fds[i] > 0) { + close(g_fds[i]); + } + if (g_testfiles[i]) { + (void)remove(g_testfiles[i]); + } + if (g_buffers[i]) { + free(g_buffers[i]); + } + } +} + +static void +do_init(int num_files, bool enable_nocache) +{ + const char *tmpdir = dt_tmpdir(); + int i, err; + + T_SETUPBEGIN; + + atexit(exit_cleanup); + + T_QUIET; + T_ASSERT_LE(num_files, AIO_LIST_MAX, "too many files"); + + for (i = 0; i < AIO_LIST_MAX; i++) { + g_fds[i] = -1; + g_testfiles[i] = NULL; + g_buffers[i] = NULL; + } + + for (i = 0; i < num_files; i++) { + T_WITH_ERRNO; + g_testfiles[i] = malloc(MAXPATHLEN); + T_QUIET; + T_ASSERT_NE(g_testfiles[i], NULL, "Allocate path buffer %d size %d", + i, MAXPATHLEN); + + snprintf(g_testfiles[i], MAXPATHLEN, "%s/%s.%d", + tmpdir, AIO_TESTFILE, i); + + T_WITH_ERRNO; + g_fds[i] = open(g_testfiles[i], O_CREAT | O_RDWR, 0666); + T_ASSERT_NE(g_fds[i], -1, "Create test file: %s", g_testfiles[i]); + + T_WITH_ERRNO; + g_buffers[i] = malloc(AIO_BUFFER_SIZE); + T_QUIET; + T_ASSERT_NE(g_buffers[i], NULL, "Allocate data buffer %d size %d", + i, AIO_BUFFER_SIZE); + memset(g_buffers[i], AIO_BUFFER_PATTERN, AIO_BUFFER_SIZE); + + if (enable_nocache) { + T_WITH_ERRNO; + err = fcntl(g_fds[i], F_NOCACHE, 1); + T_ASSERT_NE(err, -1, "Set F_NOCACHE: %s", g_testfiles[i]); + } + } + + T_SETUPEND; +} + +static struct aiocb * +init_aiocb(int idx, off_t offset, int lio_opcode) +{ + struct aiocb *aiocbp; + + aiocbp = &g_aiocbs[idx]; + memset(aiocbp, 0, sizeof(struct aiocb)); + aiocbp->aio_fildes = g_fds[idx]; + aiocbp->aio_offset = offset; + aiocbp->aio_buf = g_buffers[idx]; + aiocbp->aio_nbytes = AIO_BUFFER_SIZE; + aiocbp->aio_lio_opcode = lio_opcode; + + return aiocbp; +} + +static int +poll_aio_error(struct aiocb *aiocbp) +{ + int err; + + while (1) { + err = aio_error(aiocbp); + if (err != EINPROGRESS) { + break; + } + usleep(10000); + } + + return err; +} + +static int +wait_for_kevent(int kq, struct kevent64_s *kevent) +{ + struct timespec timeout = {.tv_sec = 10, .tv_nsec = 0}; + + return kevent64(kq, NULL, 0, kevent, 1, 0, &timeout); +} + +static int +verify_buffer_data(struct aiocb *aiocbp, uint32_t pattern) +{ + char *buf_to_verify; + int err = 0; + + buf_to_verify = malloc(aiocbp->aio_nbytes); + if (!buf_to_verify) { + err = ENOMEM; + goto out; + } + memset(buf_to_verify, pattern, aiocbp->aio_nbytes); + + err = memcmp((const void *)aiocbp->aio_buf, (const void *)buf_to_verify, + aiocbp->aio_nbytes); + free(buf_to_verify); + +out: + return err; +} + +/* + * Test aio_write() and aio_read(). + * Poll with aio_error() for AIO completion and call aio_return() to retrieve + * return status of AIO operation. + */ +T_DECL(write_read, "Test aio_write() and aio_read(). Poll for AIO completion") +{ + struct aiocb *aiocbp; + ssize_t retval; + int err; + + do_init(1, true); + + /* Setup aiocb for aio_write().
*/ + aiocbp = init_aiocb(0, 0, 0); + + T_WITH_ERRNO; + err = aio_write(aiocbp); + T_ASSERT_NE(err, -1, "aio_write() for fd %d offset 0x%llx length 0x%zx", + aiocbp->aio_fildes, aiocbp->aio_offset, aiocbp->aio_nbytes); + + T_WITH_ERRNO; + err = poll_aio_error(aiocbp); + T_ASSERT_NE(err, -1, "aio_error() for aiocbp %p", aiocbp); + + T_WITH_ERRNO; + retval = aio_return(aiocbp); + T_ASSERT_EQ((int)retval, AIO_BUFFER_SIZE, + "aio_return() for aiocbp %p bytes_written 0x%zx", aiocbp, retval); + + memset((void *)aiocbp->aio_buf, 0, AIO_BUFFER_SIZE); + + T_WITH_ERRNO; + err = aio_read(aiocbp); + T_ASSERT_NE(err, -1, "aio_read() for fd %d offset 0x%llx length 0x%zx", + aiocbp->aio_fildes, aiocbp->aio_offset, aiocbp->aio_nbytes); + + T_WITH_ERRNO; + err = poll_aio_error(aiocbp); + T_ASSERT_NE(err, -1, "aio_error() for aiocbp %p", aiocbp); + + T_WITH_ERRNO; + retval = aio_return(aiocbp); + T_ASSERT_EQ((int)retval, AIO_BUFFER_SIZE, + "aio_return() for aiocbp %p bytes_read 0x%zx", aiocbp, retval); + + err = verify_buffer_data(aiocbp, AIO_BUFFER_PATTERN); + T_ASSERT_EQ(err, 0, "verify data returned from aio_read()"); +} + +/* + * Test aio_write() and aio_fsync(). + * Poll with aio_error() for AIO completion and call aio_return() to retrieve + * return status of AIO operation. + */ +T_DECL(write_fsync, "Test aio_write() and aio_fsync(). Poll for AIO completion.") +{ + struct aiocb *aiocbp; + ssize_t retval; + int err; + + do_init(1, false); + + /* Setup aiocb for aio_write(). */ + aiocbp = init_aiocb(0, (1024 * 1024), 0); + + T_WITH_ERRNO; + err = aio_write(aiocbp); + T_ASSERT_NE(err, -1, "aio_write() for fd %d offset 0x%llx length 0x%zx", + aiocbp->aio_fildes, aiocbp->aio_offset, aiocbp->aio_nbytes); + + T_WITH_ERRNO; + err = poll_aio_error(aiocbp); + T_ASSERT_NE(err, -1, "aio_error() for aiocbp %p", aiocbp); + + T_WITH_ERRNO; + retval = aio_return(aiocbp); + T_ASSERT_EQ((int)retval, AIO_BUFFER_SIZE, + "aio_return() for aiocbp %p bytes_written 0x%zx", aiocbp, retval); + + T_WITH_ERRNO; + err = aio_fsync(O_SYNC, aiocbp); + T_ASSERT_NE(err, -1, "aio_fsync() for aiocbp %p", aiocbp); + + T_WITH_ERRNO; + err = poll_aio_error(aiocbp); + T_ASSERT_NE(err, -1, "aio_error() for aiocbp %p", aiocbp); + + T_WITH_ERRNO; + err = aio_return(aiocbp); + T_ASSERT_EQ(err, 0, "aio_return() for aiocbp %p", aiocbp); +} + +/* + * Test aio_write() and aio_suspend(). + * Suspend with aio_suspend() until AIO completion and call aio_return() to + * retrieve return status of AIO operation. + */ +T_DECL(write_suspend, "Test aio_write() and aio_suspend(). Suspend until AIO completion.") +{ + struct aiocb *aiocbp, *aiocb_list[AIO_LIST_MAX]; + struct timespec timeout; + ssize_t retval; + int err; + + do_init(1, false); + + /* Setup aiocb for aio_write(). */ + aiocbp = init_aiocb(0, (128 * 1024), 0); + aiocb_list[0] = aiocbp; + + T_WITH_ERRNO; + err = aio_write(aiocbp); + T_ASSERT_NE(err, -1, "aio_write() for fd %d offset 0x%llx length 0x%zx", + aiocbp->aio_fildes, aiocbp->aio_offset, aiocbp->aio_nbytes); + + T_WITH_ERRNO; + timeout.tv_sec = 1; + timeout.tv_nsec = 0; + err = aio_suspend((const struct aiocb *const *)aiocb_list, 1, &timeout); + T_ASSERT_NE(err, -1, "aio_suspend() with 1 sec timeout"); + + T_WITH_ERRNO; + retval = aio_return(aiocbp); + T_ASSERT_EQ((int)retval, AIO_BUFFER_SIZE, + "aio_return() for aiocbp %p bytes_written 0x%zx", aiocbp, retval); +} + +/* + * Test lio_listio() with LIO_WAIT. + * Initiate a list of AIO operations and wait for their completions. 
+ */ +T_DECL(lio_listio_wait, "Test lio_listio() with LIO_WAIT.") +{ + struct aiocb *aiocbp, *aiocb_list[AIO_LIST_MAX]; + ssize_t retval; + int i, err; + + do_init(AIO_LIST_MAX, true); + + /* Setup aiocbs for lio_listio(). */ + for (i = 0; i < AIO_LIST_MAX; i++) { + aiocbp = init_aiocb(i, (i * 1024 * 1024), LIO_WRITE); + aiocb_list[i] = aiocbp; + } + + T_WITH_ERRNO; + err = lio_listio(LIO_WAIT, aiocb_list, AIO_LIST_MAX, NULL); + T_ASSERT_NE(err, -1, "lio_listio(LIO_WAIT) for %d AIO operations", + AIO_LIST_MAX); + + for (i = 0; i < AIO_LIST_MAX; i++) { + aiocbp = aiocb_list[i]; + + T_WITH_ERRNO; + retval = aio_return(aiocbp); + T_ASSERT_EQ((int)retval, AIO_BUFFER_SIZE, + "aio_return() for aiocbp(%d) %p bytes_written 0x%zx", + i, aiocbp, retval); + } +} + +/* + * Test lio_listio() with LIO_NOWAIT. + * Initiate a list of AIO operations and poll for their completions. + */ +T_DECL(lio_listio_nowait, "Test lio_listio() with LIO_NOWAIT.") +{ + struct aiocb *aiocbp, *aiocb_list[AIO_LIST_MAX]; + ssize_t retval; + int i, err; + + do_init(AIO_LIST_MAX, true); + + /* Setup aiocbs for lio_listio(). */ + for (i = 0; i < AIO_LIST_MAX; i++) { + aiocbp = init_aiocb(i, (i * 1024 * 1024), LIO_WRITE); + aiocb_list[i] = aiocbp; + } + + T_WITH_ERRNO; + err = lio_listio(LIO_NOWAIT, aiocb_list, AIO_LIST_MAX, NULL); + T_ASSERT_NE(err, -1, "lio_listio(LIO_NOWAIT) for %d AIO operations", + AIO_LIST_MAX); + + for (i = 0; i < AIO_LIST_MAX; i++) { + aiocbp = aiocb_list[i]; + + T_WITH_ERRNO; + err = poll_aio_error(aiocbp); + T_ASSERT_NE(err, -1, "aio_error() for aiocbp %p", aiocbp); + + T_WITH_ERRNO; + retval = aio_return(aiocbp); + T_ASSERT_EQ((int)retval, AIO_BUFFER_SIZE, + "aio_return() for aiocbp(%d) %p bytes_written 0x%zx", + i, aiocbp, retval); + } +} + +/* + * Test lio_listio() and aio_cancel(). + * Initiate a list of AIO operations and attempt to cancel them with + * aio_cancel(). + */ +T_DECL(lio_listio_cancel, "Test lio_listio() and aio_cancel().") +{ + struct aiocb *aiocbp, *aiocb_list[AIO_LIST_MAX]; + char *buffer; + ssize_t retval; + int i, err; + + do_init(AIO_LIST_MAX, true); + + /* Setup aiocbs for lio_listio(). */ + for (i = 0; i < AIO_LIST_MAX; i++) { + aiocbp = init_aiocb(i, (i * 1024 * 1024), LIO_WRITE); + aiocb_list[i] = aiocbp; + } + + T_WITH_ERRNO; + err = lio_listio(LIO_NOWAIT, aiocb_list, AIO_LIST_MAX, NULL); + T_ASSERT_NE(err, -1, "lio_listio() for %d AIO operations", AIO_LIST_MAX); + + for (i = 0; i < AIO_LIST_MAX; i++) { + aiocbp = aiocb_list[i]; + + T_WITH_ERRNO; + err = aio_cancel(g_fds[i], aiocbp); + T_ASSERT_TRUE(((err & (AIO_ALLDONE | AIO_CANCELED | AIO_NOTCANCELED)) != 0), + "aio_cancel() for aiocbp(%d) %p err %d", i, aiocbp, err); + + if (err == AIO_NOTCANCELED || err == AIO_ALLDONE) { + if (err == AIO_NOTCANCELED) { + T_WITH_ERRNO; + err = poll_aio_error(aiocbp); + T_ASSERT_NE(err, -1, "aio_error() for aiocbp %p", aiocbp); + } + T_WITH_ERRNO; + retval = aio_return(aiocbp); + T_ASSERT_EQ((int)retval, AIO_BUFFER_SIZE, + "aio_return() for aiocbp(%d) %p bytes_written 0x%zx", + i, aiocbp, retval); + } else if (err == AIO_CANCELED) { + T_WITH_ERRNO; + retval = aio_return(aiocbp); + T_ASSERT_EQ((int)retval, -1, + "aio_return() for aiocbp(%d) %p", i, aiocbp); + } + } +} + +/* + * Test aio_write() and aio_read(). + * Use kevent for AIO completion and return status. + */ +T_DECL(write_read_kevent, "Test aio_write() and aio_read(). 
Use kevent for AIO completion and return status.") +{ + struct aiocb *aiocbp; + struct kevent64_s kevent; + void *udata1, *udata2; + ssize_t retval; + int err, kq; + + do_init(1, true); + + kq = kqueue(); + T_ASSERT_NE(kq, -1, "Create kqueue"); + + /* Setup aiocb for aio_write(). */ + aiocbp = init_aiocb(0, 0, 0); + aiocbp->aio_sigevent.sigev_notify = SIGEV_KEVENT; + aiocbp->aio_sigevent.sigev_signo = kq; + aiocbp->aio_sigevent.sigev_value.sival_ptr = (void *)&udata1; + + T_WITH_ERRNO; + err = aio_write(aiocbp); + T_ASSERT_NE(err, -1, "aio_write() for fd %d offset 0x%llx length 0x%zx", + aiocbp->aio_fildes, aiocbp->aio_offset, aiocbp->aio_nbytes); + + memset(&kevent, 0, sizeof(kevent)); + err = wait_for_kevent(kq, &kevent); + T_ASSERT_NE(err, -1, "Listen for AIO completion event on kqueue %d", kq); + + if (err > 0) { + T_ASSERT_EQ(err, 1, "num event returned %d", err); + T_ASSERT_EQ((struct aiocb *)kevent.ident, aiocbp, "kevent.ident %p", + (struct aiocb *)kevent.ident); + T_ASSERT_EQ(kevent.filter, EVFILT_AIO, "kevent.filter %d", + kevent.filter); + T_ASSERT_EQ((void **)kevent.udata, &udata1, "kevent.udata %p", + (char *)kevent.udata); + T_ASSERT_EQ((int)kevent.ext[0], 0, "kevent.ext[0] (err %d)", + (int)kevent.ext[0]); + T_ASSERT_EQ((int)kevent.ext[1], AIO_BUFFER_SIZE, + "kevent.ext[1] (bytes_written 0x%x)", (int)kevent.ext[1]); + } else { + T_FAIL("Timed out listening for AIO completion event on kqueue %d", kq); + } + + aiocbp->aio_sigevent.sigev_value.sival_ptr = (void *)&udata2; + + T_WITH_ERRNO; + err = aio_read(aiocbp); + T_ASSERT_NE(err, -1, "aio_read() for fd %d offset 0x%llx length 0x%zx", + aiocbp->aio_fildes, aiocbp->aio_offset, aiocbp->aio_nbytes); + + memset(&kevent, 0, sizeof(kevent)); + err = wait_for_kevent(kq, &kevent); + T_ASSERT_NE(err, -1, "Listen for AIO completion event on kqueue %d", kq); + + if (err > 0) { + T_ASSERT_EQ(err, 1, "num event returned %d", err); + T_ASSERT_EQ((struct aiocb *)kevent.ident, aiocbp, "kevent.ident %p", + (struct aiocb *)kevent.ident); + T_ASSERT_EQ(kevent.filter, EVFILT_AIO, "kevent.filter %d", + kevent.filter); + T_ASSERT_EQ((void **)kevent.udata, &udata2, "kevent.udata %p", + (char *)kevent.udata); + T_ASSERT_EQ((int)kevent.ext[0], 0, "kevent.ext[0] (err %d)", + (int)kevent.ext[0]); + T_ASSERT_EQ((int)kevent.ext[1], AIO_BUFFER_SIZE, + "kevent.ext[1] (bytes_read 0x%x)", (int)kevent.ext[1]); + } else { + T_FAIL("Timed out listening for AIO completion event on kqueue %d", kq); + } +} + +/* + * Test lio_listio() with LIO_NOWAIT. + * Initiate a list of AIO operations and use kevent for their completion + * notification and status. + */ +T_DECL(lio_listio_kevent, "Test lio_listio() with kevent.") +{ + struct aiocb *aiocbp, *aiocb_list[AIO_LIST_MAX]; + struct kevent64_s kevent; + ssize_t retval; + int i, err, kq; + + do_init(AIO_LIST_MAX, true); + + kq = kqueue(); + T_ASSERT_NE(kq, -1, "Create kqueue"); + + /* Setup aiocbs for lio_listio().
*/ + for (i = 0; i < AIO_LIST_MAX; i++) { + aiocbp = init_aiocb(i, (i * 1024 * 1024), LIO_WRITE); + aiocbp->aio_sigevent.sigev_notify = SIGEV_KEVENT; + aiocbp->aio_sigevent.sigev_signo = kq; + aiocbp->aio_sigevent.sigev_value.sival_ptr = (void *)g_testfiles[i]; + aiocb_list[i] = aiocbp; + } + + T_WITH_ERRNO; + err = lio_listio(LIO_NOWAIT, aiocb_list, AIO_LIST_MAX, NULL); + T_ASSERT_NE(err, -1, "lio_listio(LIO_NOWAIT) for %d AIO operations", + AIO_LIST_MAX); + + for (i = 0; i < AIO_LIST_MAX; i++) { + aiocbp = aiocb_list[i]; + + memset(&kevent, 0, sizeof(kevent)); + err = wait_for_kevent(kq, &kevent); + T_ASSERT_NE(err, -1, "Listen for AIO completion event on kqueue %d", kq); + if (err > 0) { + int idx; + + aiocbp = NULL; + T_ASSERT_EQ(err, 1, "num event returned %d", err); + + for (idx = 0; idx < AIO_LIST_MAX; idx++) { + if (aiocb_list[idx] == (struct aiocb *)kevent.ident) { + aiocbp = (struct aiocb *)kevent.ident; + break; + } + } + + T_ASSERT_EQ((struct aiocb *)kevent.ident, aiocbp, "kevent.ident %p", + (struct aiocb *)kevent.ident); + T_ASSERT_EQ(kevent.filter, EVFILT_AIO, "kevent.filter %d", + kevent.filter); + T_ASSERT_EQ((void *)kevent.udata, (void *)g_testfiles[idx], + "kevent.udata %p", (char *)kevent.udata); + T_ASSERT_EQ((int)kevent.ext[0], 0, "kevent.ext[0] (err %d)", + (int)kevent.ext[0]); + T_ASSERT_EQ((int)kevent.ext[1], AIO_BUFFER_SIZE, + "kevent.ext[1] (bytes_written 0x%x)", (int)kevent.ext[1]); + } else { + T_FAIL("Timed out listening for AIO completion event on kqueue %d", kq); + } + } +} diff --git a/tests/arm_cpu_capabilities.c b/tests/arm_cpu_capabilities.c index 7664fc94e..70b62eed5 100644 --- a/tests/arm_cpu_capabilities.c +++ b/tests/arm_cpu_capabilities.c @@ -47,7 +47,8 @@ static volatile bool cap_usable; static size_t bad_instruction_handler(mach_port_t task __unused, mach_port_t thread __unused, - exception_type_t type __unused, mach_exception_data_t codes __unused) + exception_type_t type __unused, mach_exception_data_t codes __unused, + uint64_t exception_pc __unused) { cap_usable = false; return 4; @@ -517,6 +518,7 @@ try_sme_i16i64(void) } + static void try_fpexcp(void) { diff --git a/tests/arm_matrix.c b/tests/arm_matrix.c index f7878af7d..82e5fdceb 100644 --- a/tests/arm_matrix.c +++ b/tests/arm_matrix.c @@ -90,7 +90,8 @@ bad_instruction_exception_handler( __unused mach_port_t task, __unused mach_port_t thread, exception_type_t type, - mach_exception_data_t codes) + mach_exception_data_t codes, + __unused uint64_t exception_pc) { T_QUIET; T_ASSERT_EQ(type, EXC_BAD_INSTRUCTION, "Caught an EXC_BAD_INSTRUCTION exception"); T_QUIET; T_ASSERT_EQ(codes[0], (uint64_t)EXC_ARM_UNDEFINED, "The subcode is EXC_ARM_UNDEFINED"); diff --git a/tests/bingrade.c b/tests/bingrade.c new file mode 100644 index 000000000..27c75a385 --- /dev/null +++ b/tests/bingrade.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2025 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement.
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include + +#include "test_utils.h" + +#define ILP32_POINTER_BYTES (4) + +T_GLOBAL_META( + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("arm"), + T_META_OWNER("jwilkey")); + +T_DECL(bingrade_vm_force_arm64_32, "Test forced arm64_32 binary grading policy for VM", + T_META_RUN_CONCURRENTLY(true), + T_META_REQUIRES_SYSCTL_EQ("kern.hv_vmm_present", 1), + T_META_BOOTARGS_SET("force-arm64-32=1"), + T_META_ENABLED(TARGET_OS_WATCH), + T_META_TAG_VM_PREFERRED) +{ + pid_t pid; + int status; + + /* 32-bit process should succeed. */ + { + char * const argv[] = {"bingrade_helper_arm32", NULL}; + const int rc = posix_spawn(&pid, argv[0], NULL, NULL, argv, NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(rc, "32-bit process should spawn."); + pid = waitpid(pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, NULL); + T_QUIET; T_ASSERT_TRUE(WIFEXITED(status), NULL); + T_ASSERT_EQ(WEXITSTATUS(status), ILP32_POINTER_BYTES, "32-bit process should succeed."); + } + + /* 64-bit process should be rejected. */ + { + char * const argv[] = {"bingrade_helper_arm64", NULL}; + const int rc = posix_spawn(NULL, argv[0], NULL, NULL, argv, NULL); + T_ASSERT_POSIX_ERROR(rc, EBADARCH, "64-bit process should be rejected."); + } + + /* Fat binary should select 32-bit process. */ + { + char * const argv[] = {"bingrade_helper_arm_fat", NULL}; + const int rc = posix_spawn(&pid, argv[0], NULL, NULL, argv, NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(rc, "Fat binary should spawn."); + pid = waitpid(pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, NULL); + T_QUIET; T_ASSERT_TRUE(WIFEXITED(status), NULL); + T_ASSERT_EQ(WEXITSTATUS(status), ILP32_POINTER_BYTES, + "Fat binary should select 32-bit process."); + } +} + +T_DECL(gestalt_vm_force_arm64_32, "Test forced arm64_32 mode host_info CPU architecture", + T_META_RUN_CONCURRENTLY(true), + T_META_REQUIRES_SYSCTL_EQ("kern.hv_vmm_present", 1), + T_META_BOOTARGS_SET("force-arm64-32=1"), + T_META_ENABLED(TARGET_OS_WATCH), + T_META_TAG_VM_PREFERRED) +{ + mach_msg_type_number_t count = HOST_PREFERRED_USER_ARCH_COUNT; + host_preferred_user_arch_data_t hi; + kern_return_t kr; + + kr = host_info(mach_host_self(), HOST_PREFERRED_USER_ARCH, (host_info_t)&hi, &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_info"); + T_QUIET; T_ASSERT_EQ(count, HOST_PREFERRED_USER_ARCH_COUNT, NULL); + + T_ASSERT_EQ(hi.cpu_type, CPU_TYPE_ARM64_32, NULL); + T_ASSERT_EQ(hi.cpu_subtype, CPU_SUBTYPE_ARM64_32_V8, NULL); +} diff --git a/tests/bingrade_helper.c b/tests/bingrade_helper.c new file mode 100644 index 000000000..5ae07da54 --- /dev/null +++ b/tests/bingrade_helper.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2025 Apple Computer, Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include + +#define _STR(X) #X +#define STR(X) _STR(X) + +void start() asm ("start"); + +/* + * A library-free routine to return the number of bytes in a pointer. This + * allows us to test the kernel loader policy without having an external + * dependency on dyld and/or the presence of a slice in a library dependency + * (which it often does not, even if the kernel supports the policy). + * + * No C code. The stack is not guaranteed to be aligned yet. + */ +__attribute__((naked, noreturn)) +void +start() +{ + /* exit(__WORDSIZE/8) */ + asm volatile ( + "mov x0, " STR(__WORDSIZE/8) "\n" + "mov x16, #1\n" + "svc #(" STR(SWI_SYSCALL) ")\n"); +} diff --git a/tests/coalition_policy.c b/tests/coalition_policy.c index 0ce1450c9..7d024c340 100644 --- a/tests/coalition_policy.c +++ b/tests/coalition_policy.c @@ -41,6 +41,8 @@ #include #include +#include + T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"), T_META_RADAR_COMPONENT_NAME("xnu"), T_META_RADAR_COMPONENT_VERSION("scheduler"), @@ -142,6 +144,13 @@ restore_coalition_state(void) "coalition_policy_set(%lld, COALITION_POLICY_SUPPRESS, COALITION_POLICY_SUPPRESS_NONE)", jet_id); } +static void +quiesce(int argc, char *const *argv) +{ + if (!wait_for_quiescence_default(argc, argv)) { + T_LOG("WARN: System did not quiesce. 
BG threads may experience starvation, causing this test to fail."); + } +} T_DECL(coalition_suppress_read_entitled, "COALITION_POLICY_SUPPRESS should be readable with entitlement") { @@ -184,6 +193,7 @@ T_DECL(coalition_suppress_read_rsrc_coalition, "COALITION_POLICY_SUPPRESS should T_DECL(coalition_suppress_set, "COALITION_POLICY_SUPPRESS should be settable with entitlement") { T_ATEND(restore_coalition_state); + quiesce(argc, argv); uint64_t jet_id = get_jet_id(); @@ -199,6 +209,7 @@ T_DECL(coalition_suppress_set, "COALITION_POLICY_SUPPRESS should be settable wit T_DECL(coalition_suppress_set_check_task, "current task should become BG when coalition changes", T_META_ASROOT(true)) { T_ATEND(restore_coalition_state); + quiesce(argc, argv); uint64_t jet_id = get_jet_id(); @@ -222,6 +233,7 @@ T_DECL(coalition_suppress_set_check_task, "current task should become BG when co T_DECL(coalition_suppress_child_bg, "child spawned into bg coalition should be bg", T_META_ASROOT(true)) { T_ATEND(restore_coalition_state); + quiesce(argc, argv); uint64_t jet_id = get_jet_id(); @@ -270,6 +282,7 @@ T_DECL(coalition_suppress_child_bg, "child spawned into bg coalition should be b T_DECL(coalition_suppress_child_change_bg, "child changing coalition to bg should affect parent", T_META_ASROOT(true)) { T_ATEND(restore_coalition_state); + quiesce(argc, argv); uint64_t jet_id = get_jet_id(); diff --git a/tests/coalition_policy_unentitled.c b/tests/coalition_policy_unentitled.c index 701bbb47e..e03097173 100644 --- a/tests/coalition_policy_unentitled.c +++ b/tests/coalition_policy_unentitled.c @@ -66,7 +66,7 @@ get_jet_id(void) return jet_id; } -T_DECL(coalition_suppress_read_entitled, "COALITION_POLICY_SUPPRESS should not be readable without entitlement") +T_DECL(coalition_suppress_read_unentitled, "COALITION_POLICY_SUPPRESS should not be readable without entitlement") { uint64_t jet_id = get_jet_id(); @@ -75,7 +75,7 @@ T_DECL(coalition_suppress_read_entitled, "COALITION_POLICY_SUPPRESS should not b T_LOG("suppress: %d", suppress); } -T_DECL(coalition_suppress_set, "COALITION_POLICY_SUPPRESS should not be settable without entitlement") +T_DECL(coalition_suppress_set_unentitled, "COALITION_POLICY_SUPPRESS should not be settable without entitlement") { uint64_t jet_id = get_jet_id(); diff --git a/tests/codesigntests.c b/tests/codesigntests.c index e25f35ce2..79b6f94e0 100644 --- a/tests/codesigntests.c +++ b/tests/codesigntests.c @@ -2,9 +2,11 @@ #include #include #include +#include #include #include #include +#include #define MAXBUFLEN 1024 * 1024 @@ -94,10 +96,11 @@ T_DECL(TESTNAME, "CS OP, code sign operations test") uint32_t status; int rcent; pid_t pid; + csops_cdhash_t cdhash_info = {0}; + uint8_t cdhash[CS_CDHASH_LEN] = {0}; pid = getpid(); - rcent = get_blob(pid, CS_OPS_ENTITLEMENTS_BLOB); T_ASSERT_EQ_INT(rcent, 0, "Getting CS OPS entitlements blob"); @@ -127,8 +130,18 @@ T_DECL(TESTNAME, "CS OP, code sign operations test") rcent = csops(pid, CS_OPS_STATUS, &status, sizeof(status)); T_ASSERT_EQ_INT(rcent, 0, "Getting CS OPs status of process"); + rcent = csops(pid, CS_OPS_CDHASH, cdhash, sizeof(cdhash)); + T_ASSERT_EQ_INT(rcent, 0, "Getting CS_OPS_CDHASH"); + + rcent = csops(pid, CS_OPS_CDHASH_WITH_INFO, &cdhash_info, sizeof(cdhash_info)); + T_ASSERT_EQ_INT(rcent, 0, "Getting CS_OPS_CDHASH_WITH_INFO"); + + /* Verify the returned CDHashes match and are the correct type */ + T_ASSERT_EQ_INT(memcmp(cdhash_info.hash, cdhash, sizeof(cdhash)), 0, "Comparing CDHashes"); + T_ASSERT_EQ_INT(cdhash_info.type, 
CS_HASHTYPE_SHA256, "Checking returned CDHash type [SHA256]"); + /* - * Only run the folling tests if not HARD since otherwise + * Only run the following tests if not HARD since otherwise * we'll just die when marking ourself invalid. */ diff --git a/tests/context_helpers.h b/tests/context_helpers.h new file mode 100644 index 000000000..4cf9ade60 --- /dev/null +++ b/tests/context_helpers.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include + +/** Retrieve current CSW count for the calling PID. 
*/ +static int32_t +get_csw_count(void) +{ + struct proc_taskinfo taskinfo; + int rv; + + rv = proc_pidinfo(getpid(), PROC_PIDTASKINFO, 0, &taskinfo, sizeof(taskinfo)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "PROC_PIDTASKINFO"); + + return taskinfo.pti_csw; +} diff --git a/tests/cpu_counters/cpc_security_tests.c b/tests/cpu_counters/cpc_security_tests.c index 3a19b279e..e1ca00724 100644 --- a/tests/cpu_counters/cpc_security_tests.c +++ b/tests/cpu_counters/cpc_security_tests.c @@ -208,13 +208,17 @@ check_secure_cpmu(void) } T_DECL(secure_cpmu_event_restrictions, "secured CPMU should be restricted to known events", - _T_META_CPC_SECURE_ON_DEV, T_META_TAG_VM_NOT_ELIGIBLE) + _T_META_CPC_SECURE_ON_DEV, + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_ENABLED(false) /* rdar://153473281 */) { check_secure_cpmu(); } T_DECL(release_cpmu_event_restrictions, "release CPMU should be restricted to known events", - XNU_T_META_REQUIRES_RELEASE_KERNEL, T_META_TAG_VM_NOT_ELIGIBLE) + XNU_T_META_REQUIRES_RELEASE_KERNEL, + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_ENABLED(false) /* rdar://153473334 */) { check_secure_cpmu(); } @@ -366,7 +370,8 @@ T_DECL(insecure_cpmu_unrestricted, "insecure CPMU should be unrestricted", } T_DECL(secure_kpc_counting_system, "kpc should not allow counting the kernel when secure", - _T_META_CPC_SECURE_ON_DEV) + _T_META_CPC_SECURE_ON_DEV, + T_META_ENABLED(false) /* rdar://131466526 */) { kpep_db_t db = NULL; int ret = kpep_db_createx(NULL, KPEP_DB_FLAG_PUBLIC_ONLY, &db); diff --git a/tests/cpu_counters/kpc_tests.c b/tests/cpu_counters/kpc_tests.c index e4312dd5d..02846e09c 100644 --- a/tests/cpu_counters/kpc_tests.c +++ b/tests/cpu_counters/kpc_tests.c @@ -301,7 +301,9 @@ check_tally(unsigned int ncpus, unsigned int nctrs, struct tally *tallies) T_DECL(kpc_cpu_direct_configurable, "test that configurable counters return monotonically increasing values", XNU_T_META_SOC_SPECIFIC, - T_META_BOOTARGS_SET("enable_skstb=1"), T_META_TAG_VM_NOT_ELIGIBLE) + T_META_BOOTARGS_SET("enable_skstb=1"), + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_ENABLED(false) /* rdar://134505531 */) { skip_if_unsupported(); @@ -418,7 +420,9 @@ struct cpu { T_DECL(kpc_pmi_configurable, "test that PMIs don't interfere with sampling counters in kperf", XNU_T_META_SOC_SPECIFIC, - T_META_BOOTARGS_SET("enable_skstb=1"), T_META_TAG_VM_NOT_ELIGIBLE) + T_META_BOOTARGS_SET("enable_skstb=1"), + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_ENABLED(false) /* rdar://134505531 */) { skip_if_unsupported(); diff --git a/tests/cpucount.c b/tests/cpucount.c index 21a2370c8..cef0d5a80 100644 --- a/tests/cpucount.c +++ b/tests/cpucount.c @@ -12,6 +12,7 @@ * xcrun -sdk macosx.internal clang -o cpucount cpucount.c -ldarwintest -framework IOKit -framework CoreFoundation -arch arm64e -Weverything */ +#include "context_helpers.h" #include #include "test_utils.h" @@ -35,13 +36,11 @@ T_GLOBAL_META( T_META_RUN_CONCURRENTLY(false), - T_META_BOOTARGS_SET("enable_skstb=1"), T_META_CHECK_LEAKS(false), T_META_ASROOT(true), T_META_ALL_VALID_ARCHS(true), T_META_RADAR_COMPONENT_NAME("xnu"), T_META_RADAR_COMPONENT_VERSION("scheduler"), - T_META_OWNER("jarrad"), T_META_TAG_VM_NOT_PREFERRED ); @@ -93,18 +92,6 @@ abs_to_nanos(uint64_t abs) return abs * timebase_info.numer / timebase_info.denom; } -static int32_t -get_csw_count(void) -{ - struct proc_taskinfo taskinfo; - int rv; - - rv = proc_pidinfo(getpid(), PROC_PIDTASKINFO, 0, &taskinfo, sizeof(taskinfo)); - T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "PROC_PIDTASKINFO"); - - return taskinfo.pti_csw; -} - // noinline hopefully keeps 
the optimizer from hoisting it out of the loop // until rdar://68253516 is fixed. __attribute__((noinline)) @@ -166,6 +153,7 @@ cpucount_setup(void) T_DECL(count_cpus, "Tests we can schedule bound threads on all hw.ncpus cores and that _os_cpu_number matches", + T_META_BOOTARGS_SET("enable_skstb=1"), XNU_T_META_SOC_SPECIFIC) { int rv; @@ -261,8 +249,11 @@ T_DECL(count_cpus, T_DECL(count_clusters, "Tests we can schedule bound threads on all cpu clusters and that _os_cpu_cluster_number matches", XNU_T_META_SOC_SPECIFIC, + /* Disable CLPC dynamic cluster power-down to ensure threads can run on their bound cluster. */ + T_META_BOOTARGS_SET("enable_skstb=1 cpu-dynamic-cluster-power-down=0"), #if __x86_64__ - T_META_ENABLED(false /* rdar://133956403 */) + /* We shouldn't need to count clusters for Rosetta processes. */ + T_META_ENABLED(false) #else T_META_ENABLED(true) #endif @@ -381,6 +372,7 @@ T_DECL(count_clusters, T_DECL(check_cpu_topology, "Verify _os_cpu_cluster_number(), _os_cpu_number() against IORegistry", XNU_T_META_SOC_SPECIFIC, + T_META_BOOTARGS_SET("enable_skstb=1"), T_META_ENABLED(TARGET_CPU_ARM || TARGET_CPU_ARM64)) { int rv; diff --git a/tests/decompression_failure.c b/tests/decompression_failure.c index b9867ee09..e4a6bc9b9 100644 --- a/tests/decompression_failure.c +++ b/tests/decompression_failure.c @@ -8,8 +8,7 @@ #include #include #include -#include "excserver.h" -#include "exc_helpers.h" +#include "try_read_write.h" extern int pid_hibernate(int pid); @@ -113,23 +112,6 @@ create_corrupted_regions(void) } } -static bool -try_write(volatile uint32_t *word __unused) -{ -#ifdef __arm64__ - uint64_t val = 1; - __asm__ volatile ( - "str %w0, %1\n" - "mov %0, 0\n" - : "+r"(val) : "m"(*word)); - // The exception handler skips over the instruction that zeroes val when a - // decompression failure is detected. - return val == 0; -#else - return false; -#endif -} - static bool read_blocks(void) { @@ -137,7 +119,10 @@ read_blocks(void) for (size_t buffer_offset = 0; buffer_offset < block_length; buffer_offset += vm_pagesize) { // Access pages until the fault is detected. - if (!try_write((volatile uint32_t *)(blocks[i] + buffer_offset))) { + kern_return_t exception_kr; + if (!try_write_byte(blocks[i] + buffer_offset, 1, &exception_kr)) { + T_ASSERT_EQ(exception_kr, KERN_MEMORY_FAILURE, + "exception code should be KERN_MEMORY_FAILURE"); T_LOG("test_thread breaking"); return true; } @@ -146,22 +131,6 @@ read_blocks(void) return false; } -static size_t -kern_memory_failure_handler( - __unused mach_port_t task, - __unused mach_port_t thread, - exception_type_t exception, - mach_exception_data_t code) -{ - T_EXPECT_EQ(exception, EXC_BAD_ACCESS, - "Verified bad address exception"); - T_EXPECT_EQ((int)code[0], KERN_MEMORY_FAILURE, "caught KERN_MEMORY_FAILURE"); - T_PASS("received KERN_MEMORY_FAILURE from test thread"); - // Skip the next instruction as well so that the faulting code can detect - // the exception. 
- return 8; -} - T_DECL(decompression_failure, "Confirm that exception is raised on decompression failure", // Disable software checks in development builds, as these would result in @@ -192,12 +161,9 @@ T_DECL(decompression_failure, T_ASSERT_EQ_ULONG(size, sizeof(value), NULL); page_size = (vm_address_t)value; - mach_port_t exc_port = create_exception_port(EXC_MASK_BAD_ACCESS); create_corrupted_regions(); T_SETUPEND; - run_exception_handler(exc_port, kern_memory_failure_handler); - if (!read_blocks()) { T_SKIP("no faults"); } diff --git a/tests/dev_zero.c b/tests/dev_zero.c index b02e92233..3d564b572 100644 --- a/tests/dev_zero.c +++ b/tests/dev_zero.c @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include T_DECL(dev_zero, @@ -27,3 +29,23 @@ T_DECL(dev_zero, close(dev); } + +T_DECL(dev_zero_permissions, + "ensure /dev/zero's permissions can't be updated", + T_META_ASROOT(true)) +{ + struct stat sb = {0}; + const char *dev = "/dev/zero"; + int ret = 0; + + ret = stat(dev, &sb); + T_ASSERT_POSIX_SUCCESS(ret, "stat /dev/zero"); + T_ASSERT_TRUE(sb.st_mode & S_IWOTH, "/dev/zero world writable"); + + ret = chmod(dev, 0664); + T_ASSERT_POSIX_FAILURE(ret, EPERM, "chmod /dev/zero should fail w/ EPERM"); + + ret = stat(dev, &sb); + T_ASSERT_POSIX_SUCCESS(ret, "stat /dev/zero"); + T_ASSERT_TRUE(sb.st_mode & S_IWOTH, "/dev/zero still world writable"); +} diff --git a/tests/disk_mount_conditioner.c b/tests/disk_mount_conditioner.m similarity index 83% rename from tests/disk_mount_conditioner.c rename to tests/disk_mount_conditioner.m index f8a1d0f85..d2a694336 100644 --- a/tests/disk_mount_conditioner.c +++ b/tests/disk_mount_conditioner.m @@ -9,6 +9,7 @@ #include #include #include +#import static char *mktempdir(void); static char *mktempmount(void); @@ -273,6 +274,7 @@ T_DECL(fsctl_set_nonroot, T_DECL(fsctl_delays, "Validate I/O delays when DMC is enabled", + T_META_ENABLED(!TARGET_OS_BRIDGE), // diskutil is unavailable on bridgeOS T_META_RUN_CONCURRENTLY(false), T_META_TAG_VM_PREFERRED) { char *path; @@ -293,7 +295,7 @@ T_DECL(fsctl_delays, // measure delay before setting parameters (should be none) elapsed_nsec = time_for_read(fd, buf); - T_ASSERT_LT_ULLONG(elapsed_nsec, expected_nsec, "DMC disabled read(%ld) from %s is reasonably fast", READSIZE, path); + T_ASSERT_LT_ULLONG(elapsed_nsec, expected_nsec, "DMC disabled read(%ld) is reasonably fast", READSIZE); // measure delay after setting parameters info.enabled = 1; @@ -306,8 +308,8 @@ T_DECL(fsctl_delays, T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET) delay"); elapsed_nsec = time_for_read(fd, buf); - T_ASSERT_GT_ULLONG(elapsed_nsec, expected_nsec, "DMC enabled read(%ld) from %s is at least the expected delay", READSIZE, path); - T_ASSERT_LT_ULLONG(elapsed_nsec, 2 * expected_nsec, "DMC enabled read(%ld) from %s is no more than twice the expected delay", READSIZE, path); + T_ASSERT_GT_ULLONG(elapsed_nsec, expected_nsec, "DMC enabled read(%ld) is at least the expected delay", READSIZE); + T_ASSERT_LT_ULLONG(elapsed_nsec, 2 * expected_nsec, "DMC enabled read(%ld) is no more than twice the expected delay", READSIZE); // measure delay after resetting parameters (should be none) info.enabled = 0; @@ -317,7 +319,7 @@ T_DECL(fsctl_delays, usleep(USEC_PER_SEC / 2); // might still be other I/O inflight elapsed_nsec = time_for_read(fd, buf); - T_ASSERT_LT_ULLONG(elapsed_nsec, expected_nsec, "After disabling DMC read(%ld) from %s is reasonably fast", READSIZE, path); + T_ASSERT_LT_ULLONG(elapsed_nsec, expected_nsec, "After 
disabling DMC read(%ld) is reasonably fast", READSIZE); } #else /* TEST_UNENTITLED */ @@ -444,42 +446,72 @@ system_legal(const char *command) /* * Return the path to a temporary mount - * that contains a usable HFS+ filesystem + * that contains a usable APFS filesystem * mounted via a ram disk */ static char * mkramdisk(void) { char cmd[1024]; - char *mount_path = mktempdir(); char *dev_disk_file = malloc(256); atexit_b(^{ free(dev_disk_file); }); strcpy(dev_disk_file, "/tmp/dmc.ramdisk.XXXXXXXX"); - T_WITH_ERRNO; - T_ASSERT_NOTNULL(mktemp(dev_disk_file), "Create temporary file to store dev disk for ramdisk"); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(mktemp(dev_disk_file), "Create temporary file to store dev disk for ramdisk"); atexit_b(^{ remove(dev_disk_file); }); // create the RAM disk device - snprintf(cmd, sizeof(cmd), "hdik -nomount ram://10000 > %s", dev_disk_file); + // dev_disk_file will store the /dev/diskX path + snprintf(cmd, sizeof(cmd), "diskimagetool attach --nomount ram://16m > %s", dev_disk_file); T_ASSERT_EQ_INT(0, system_legal(cmd), "Create ramdisk"); atexit_b(^{ char eject_cmd[1024]; - unmount(mount_path, MNT_FORCE); - snprintf(eject_cmd, sizeof(eject_cmd), "hdik -e `cat %s`", dev_disk_file); + snprintf(eject_cmd, sizeof(eject_cmd), "diskutil eject force `cat %s`", dev_disk_file); system_legal(eject_cmd); remove(dev_disk_file); }); - // initialize as an HFS volume - snprintf(cmd, sizeof(cmd), "newfs_hfs `cat %s`", dev_disk_file); - T_ASSERT_EQ_INT(0, system_legal(cmd), "Initialize ramdisk as HFS"); + // initialize and mount as an APFS volume + snprintf(cmd, sizeof(cmd), "diskutil eraseVolume APFS dmc.ramdisk `cat %s`", dev_disk_file); + T_ASSERT_EQ_INT(0, system_legal(cmd), "Initialize ramdisk as APFS"); - // mount it - snprintf(cmd, sizeof(cmd), "mount -t hfs `cat %s` %s", dev_disk_file, mount_path); + // on iOS the previous eraseVolume doesn't automatically mount + // on macOS this mount will be redundant, but successful + snprintf(cmd, sizeof(cmd), "diskutil mountDisk `cat %s`", dev_disk_file); T_ASSERT_EQ_INT(0, system_legal(cmd), "Mount ramdisk"); + // on iOS the previous mountDisk doesn't support -mountPoint, so we have to find where it was mounted + char *mount_info_path = malloc(256); + strcpy(mount_info_path, "/tmp/dmc.mount_info.XXXXXXXX"); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(mktemp(mount_info_path), "Create temporary file to store mount info for ramdisk"); + atexit_b(^{ remove(mount_info_path); }); + + snprintf(cmd, sizeof(cmd), "diskimagetool list -plist `cat %s` > %s", dev_disk_file, mount_info_path); + T_QUIET; T_ASSERT_EQ_INT(0, system_legal(cmd), "Fetch ramdisk mount info"); + + NSURL *mountInfoURL = [NSURL fileURLWithPath:@(mount_info_path) isDirectory:NO]; + free(mount_info_path); + + NSError *error; + NSDictionary *mountInfo = [NSDictionary dictionaryWithContentsOfURL:mountInfoURL error:&error]; + if (!mountInfo) { + T_LOG("Error: %s", error.localizedDescription.UTF8String); + } + T_QUIET; T_ASSERT_NOTNULL(mountInfo, "Read mount info plist"); + + NSString *mountPoint = nil; + for (NSDictionary *entity in (NSArray *)mountInfo[@"System Entities"]) { + mountPoint = entity[@"Mount Point"]; + if (mountPoint) { + break; + } + } + T_QUIET; T_ASSERT_NOTNULL(mountPoint, "Find mount point in mount info plist"); + + char *mount_path = malloc(PATH_MAX); + atexit_b(^{ free(mount_path); }); + strlcpy(mount_path, mountPoint.UTF8String, PATH_MAX); return mount_path; } @@ -498,10 +530,10 @@ time_for_read(int fd, const char *expected) ret = read(fd, buf, 
READSIZE); stop = dt_nanoseconds(); - T_ASSERT_GE_LONG(ret, 0L, "read from temporary file"); - T_ASSERT_EQ_LONG(ret, READSIZE, "read %ld bytes from temporary file", READSIZE); + T_QUIET; T_ASSERT_GE_LONG(ret, 0L, "read from temporary file"); + T_QUIET; T_ASSERT_EQ_LONG(ret, READSIZE, "read %ld bytes from temporary file", READSIZE); err = memcmp(buf, expected, sizeof(buf)); - T_ASSERT_EQ_INT(0, err, "read expected contents from temporary file"); + T_QUIET; T_ASSERT_EQ_INT(0, err, "read expected contents from temporary file"); return stop - start; } @@ -513,12 +545,15 @@ perf_setup(char **path, int *fd) char *temp_path; char *mount_path = mkramdisk(); + T_LOG("Using ramdisk mounted at %s", mount_path); + temp_path = *path = malloc(PATH_MAX); snprintf(temp_path, PATH_MAX, "%s/dmc.XXXXXXXX", mount_path); atexit_b(^{ free(temp_path); }); T_ASSERT_NOTNULL(mktemp(temp_path), "Create temporary file"); atexit_b(^{ remove(temp_path); }); + T_LOG("Using temporary file at %s", temp_path); temp_fd = *fd = open(temp_path, O_RDWR | O_CREAT); T_WITH_ERRNO; diff --git a/tests/ecc_test.c b/tests/ecc_test.c index 9d5459f99..4528ca7ec 100644 --- a/tests/ecc_test.c +++ b/tests/ecc_test.c @@ -207,23 +207,19 @@ run_test(bool use_dcs) T_DECL(ecc_uncorrected_test, "test detection and handling of non-fatal ECC uncorrected errors", T_META_IGNORECRASHES(".*ecc_test_helper.*"), T_META_ASROOT(true), - T_META_ENABLED(false /* TARGET_CPU_ARM64 && TARGET_OS_OSX */) /* rdar://133461215 */, + T_META_ENABLED(false /* TARGET_CPU_ARM64 && TARGET_OS_OSX */), /* rdar://133461215 */ T_META_REQUIRES_SYSCTL_EQ("vm.retired_pages_end_test", 0), T_META_TAG_VM_NOT_ELIGIBLE) { run_test(false); } -/* DCS injection was fixed but then broke again.. - * Waiting on rdar://115998013 (WRDIS_DRAM_RAS_ERR needs to be disabled for dev fused units) - */ -#if 0 T_DECL(dcs_uncorrected_test, "test detection and handling from non-fatal ECC uncorrected errors (injected via DCS)", T_META_IGNORECRASHES(".*ecc_test_helper.*"), T_META_ASROOT(true), - T_META_ENABLED(TARGET_CPU_ARM64 && TARGET_OS_OSX), - T_META_REQUIRES_SYSCTL_EQ("vm.retired_pages_end_test", 0), T_META_TAG_VM_NOT_ELIGIBLE) + T_META_ENABLED(false /* TARGET_CPU_ARM64 && TARGET_OS_OSX */), /* rdar://133461215 */ + T_META_REQUIRES_SYSCTL_EQ("vm.retired_pages_end_test", 0), + T_META_TAG_VM_NOT_ELIGIBLE) { run_test(true); } -#endif diff --git a/tests/entitlements/create_conn_port_with_port_array.entitlements b/tests/entitlements/create_conn_port_with_port_array.entitlements new file mode 100644 index 000000000..2d6a16885 --- /dev/null +++ b/tests/entitlements/create_conn_port_with_port_array.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.developer.allow-connection-port-with-port-array + + + diff --git a/tests/entitlements/hardened-binary-entitlements-1.plist b/tests/entitlements/enhanced-security-binary-entitlements-1.plist similarity index 100% rename from tests/entitlements/hardened-binary-entitlements-1.plist rename to tests/entitlements/enhanced-security-binary-entitlements-1.plist diff --git a/tests/entitlements/hardened-binary-entitlements-2.plist b/tests/entitlements/enhanced-security-binary-entitlements-2.plist similarity index 100% rename from tests/entitlements/hardened-binary-entitlements-2.plist rename to tests/entitlements/enhanced-security-binary-entitlements-2.plist diff --git a/tests/entitlements/hardened-heap-security.entitlements b/tests/entitlements/hardened-heap-security.entitlements new file mode 100644 index 000000000..9fbc2a5ae --- /dev/null +++ 
b/tests/entitlements/hardened-heap-security.entitlements @@ -0,0 +1,10 @@ + + + + + com.apple.security.hardened-process + + com.apple.security.hardened-process.hardened-heap + + + diff --git a/tests/entitlements/hardened-heap-standalone.entitlements b/tests/entitlements/hardened-heap.entitlements similarity index 81% rename from tests/entitlements/hardened-heap-standalone.entitlements rename to tests/entitlements/hardened-heap.entitlements index eb0ba8069..57bf31a41 100644 --- a/tests/entitlements/hardened-heap-standalone.entitlements +++ b/tests/entitlements/hardened-heap.entitlements @@ -2,6 +2,8 @@ + com.apple.developer.hardened-process + com.apple.developer.hardened-process.hardened-heap diff --git a/tests/entitlements/hardened-proc-invalid.entitlements b/tests/entitlements/hardened-proc-invalid.entitlements new file mode 100644 index 000000000..8149bcb77 --- /dev/null +++ b/tests/entitlements/hardened-proc-invalid.entitlements @@ -0,0 +1,10 @@ + + + + + com.apple.security.hardened-process + + com.apple.developer.hardened-process + + + diff --git a/tests/entitlements/hardened-proc-security.entitlements b/tests/entitlements/hardened-proc-security.entitlements new file mode 100644 index 000000000..c900d2871 --- /dev/null +++ b/tests/entitlements/hardened-proc-security.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.security.hardened-process + + + diff --git a/tests/entitlements/platform-restrictions.plist b/tests/entitlements/platform-restrictions.plist new file mode 100644 index 000000000..bf8fe1719 --- /dev/null +++ b/tests/entitlements/platform-restrictions.plist @@ -0,0 +1,10 @@ + + + + + com.apple.developer.hardened-process + + com.apple.security.hardened-process.platform-restrictions + 2 + + \ No newline at end of file diff --git a/tests/entitlements/port_type_policy.entitlements b/tests/entitlements/port_type_policy.entitlements new file mode 100644 index 000000000..0917e1a6d --- /dev/null +++ b/tests/entitlements/port_type_policy.entitlements @@ -0,0 +1,12 @@ + + + + + com.apple.developer.allow-connection-port-with-port-array + + com.apple.private.allow-weak-reply-port + + com.apple.security.hardened-process.platform-restrictions + 2 + + diff --git a/tests/entitlements/tpro.plist b/tests/entitlements/tpro.plist new file mode 100644 index 000000000..876e5b9a9 --- /dev/null +++ b/tests/entitlements/tpro.plist @@ -0,0 +1,10 @@ + + + + + com.apple.developer.hardened-process + + com.apple.security.hardened-process.dyld-ro + + + diff --git a/tests/exc_guard_helper.c b/tests/exc_guard_helper.c new file mode 100644 index 000000000..0247c5525 --- /dev/null +++ b/tests/exc_guard_helper.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "exc_helpers.h" +#include "exc_guard_helper.h" +#include "test_utils.h" + +/* Convenience macro for compile-time array size */ +#define countof(array) \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic error \"-Wsizeof-pointer-div\"") \ + (sizeof(array)/sizeof((array)[0])) \ + _Pragma("clang diagnostic pop") + +/* + * Global data shared between the code running the block and the exception handler. + * Ideally this would be thread-local data in the thread running the block, + * but the exception handler runs on a different thread and can't see it. + */ +static pthread_mutex_t exc_guard_helper_mutex = PTHREAD_MUTEX_INITIALIZER; +static mach_port_t exc_guard_helper_exc_port = MACH_PORT_NULL; + +static pthread_mutex_t exc_guard_helper_request_mutex = PTHREAD_MUTEX_INITIALIZER; +static exc_guard_helper_info_t exc_guard_helper_reply; +static struct { + mach_port_t thread; + unsigned int guard_type; +} exc_guard_helper_request; + +static const char * +name_for_guard_type(unsigned guard_type) +{ + static const char *names[] = { + [GUARD_TYPE_NONE] = "GUARD_TYPE_NONE", + [GUARD_TYPE_MACH_PORT] = "GUARD_TYPE_MACH_PORT", + [GUARD_TYPE_FD] = "GUARD_TYPE_FD", + [GUARD_TYPE_USER] = "GUARD_TYPE_USER", + [GUARD_TYPE_VN] = "GUARD_TYPE_VN", + [GUARD_TYPE_VIRT_MEMORY] = "GUARD_TYPE_VIRT_MEMORY", + [GUARD_TYPE_REJECTED_SC] = "GUARD_TYPE_REJECTED_SC", + }; + const char *result = NULL; + if (guard_type < countof(names)) { + result = names[guard_type]; + } + if (result == NULL) { + result = "unknown"; + } + return result; +} + +static size_t +exc_guard_helper_exception_handler( + __unused mach_port_t task, + mach_port_t thread, + exception_type_t exception, + mach_exception_data_t codes, + __unused uint64_t exception_pc) +{ + T_QUIET; T_ASSERT_EQ(exception, EXC_GUARD, "exception type"); + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_lock(&exc_guard_helper_request_mutex), "lock"); + + if (thread != exc_guard_helper_request.thread) { + /* reject, nobody is waiting for exceptions */ + if (verbose_exc_helper) { + T_LOG("exc_guard_helper caught an exception but nobody is waiting for it"); + } + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_unlock(&exc_guard_helper_request_mutex), "unlock"); + return 0; + } + + unsigned int exc_guard_type = EXC_GUARD_DECODE_GUARD_TYPE(codes[0]); + uint32_t exc_guard_flavor = EXC_GUARD_DECODE_GUARD_FLAVOR(codes[0]); + uint32_t exc_guard_target = EXC_GUARD_DECODE_GUARD_TARGET(codes[0]); + uint64_t exc_guard_payload = codes[1]; + + if (exc_guard_helper_request.guard_type == exc_guard_type) { + /* okay, exception matches caller's requested guard type */ + } else { + /* reject, exception's guard type is not of the requested type */ + if (verbose_exc_helper) { + T_LOG("exc_guard_helper exception is not of the " + "desired guard type (expected %u, got %u)", + exc_guard_helper_request.guard_type, exc_guard_type); + } + T_QUIET; 
T_ASSERT_POSIX_ZERO(pthread_mutex_unlock(&exc_guard_helper_request_mutex), "unlock"); + return 0; + } + + if (++exc_guard_helper_reply.catch_count == 1) { + /* save the details of the first caught exception */ + exc_guard_helper_reply.guard_type = exc_guard_type; + exc_guard_helper_reply.guard_flavor = exc_guard_flavor; + exc_guard_helper_reply.guard_target = exc_guard_target; + exc_guard_helper_reply.guard_payload = exc_guard_payload; + } + + if (verbose_exc_helper) { + T_LOG("exc_guard_helper caught EXC_GUARD type %u (%s), flavor %u, " + "target %u, payload 0x%llx (catch #%u in the block)", + exc_guard_type, name_for_guard_type(exc_guard_type), + exc_guard_flavor, exc_guard_target, exc_guard_payload, + exc_guard_helper_reply.catch_count); + } + + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_unlock(&exc_guard_helper_request_mutex), "unlock"); + return 0; +} + +/* + * Set up our exception handlers if they are not already configured. + * exc_guard_helper_mutex must be held by the caller. + */ +static void +initialize_exception_handlers(void) +{ + if (exc_guard_helper_exc_port == MACH_PORT_NULL) { + exc_guard_helper_exc_port = create_exception_port(EXC_MASK_GUARD); + T_QUIET; T_ASSERT_NE(exc_guard_helper_exc_port, MACH_PORT_NULL, "exception port"); + repeat_exception_handler(exc_guard_helper_exc_port, exc_guard_helper_exception_handler); + if (verbose_exc_helper) { + T_LOG("exc_guard_helper exception handlers installed"); + } + } +} + +void +exc_guard_helper_init(void) +{ + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_lock(&exc_guard_helper_mutex), "lock"); + initialize_exception_handlers(); + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_unlock(&exc_guard_helper_mutex), "unlock"); +} + + +/* + * Return EXC_GUARD behavior flags that enable guard_type (non-fatal) + * and leave all other behaviors in old_behavior unchanged. + */ +static task_exc_guard_behavior_t +configure_exc_guard_of_type( + unsigned int guard_type, + task_exc_guard_behavior_t old_behavior) +{ + /* + * Behavior flags for all known EXC_GUARD types. + * These flags are defined in mach/task_info.h. + * Some guard types cannot be configured and do not have these flags. + */ + static const struct { + task_exc_guard_behavior_t set; + task_exc_guard_behavior_t clear; + } behavior_flags[] = { + [GUARD_TYPE_VIRT_MEMORY] = { + .clear = TASK_EXC_GUARD_VM_ALL, + .set = TASK_EXC_GUARD_VM_DELIVER, + }, + [GUARD_TYPE_MACH_PORT] = { + .clear = TASK_EXC_GUARD_MP_ALL, + .set = TASK_EXC_GUARD_MP_DELIVER, + }, + }; + + /* Reject guard types not present in behavior_flags[]. */ + if (guard_type >= countof(behavior_flags)) { + goto unimplemented_guard_type; + } + if (behavior_flags[guard_type].set == 0 && + behavior_flags[guard_type].clear == 0) { + goto unimplemented_guard_type; + } + + /* Set and clear behavior flags for the requested guard type(s). */ + task_exc_guard_behavior_t new_behavior = old_behavior; + new_behavior &= ~behavior_flags[guard_type].clear; + new_behavior |= behavior_flags[guard_type].set; + return new_behavior; + +unimplemented_guard_type: + /* + * No behavior_flags[] entry for this EXC_GUARD guard type. + * If task_set_exc_guard_behavior() can configure your new + * guard type then add it to behavior_flags[] above. 
+ */ + T_FAIL("guard type %u (%s) is unimplemented in exc_guard_helper", + guard_type, name_for_guard_type(guard_type)); + T_END; +} + +task_exc_guard_behavior_t +enable_exc_guard_of_type(unsigned int guard_type) +{ + kern_return_t kr; + task_exc_guard_behavior_t old_behavior, new_behavior; + + kr = task_get_exc_guard_behavior(mach_task_self(), &old_behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "exc_guard_helper calling task_get_exc_guard_behavior"); + + new_behavior = configure_exc_guard_of_type(guard_type, old_behavior); + + kr = task_set_exc_guard_behavior(mach_task_self(), new_behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, + "exc_guard_helper calling task_set_exc_guard_behavior to enable guard type %u %s", + guard_type, name_for_guard_type(guard_type)); + + return old_behavior; +} + +bool +block_raised_exc_guard_of_type( + unsigned int guard_type, + exc_guard_helper_info_t * const out_exc_info, + exc_guard_helper_block_t block) +{ + if (process_is_translated() && guard_type == GUARD_TYPE_VIRT_MEMORY) { + T_FAIL("block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY) " + "does not work on translation/Rosetta (rdar://142438840)"); + } + + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_lock(&exc_guard_helper_mutex), "lock"); + initialize_exception_handlers(); + + /* lock the request and reply structs against the exception handler */ + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_lock(&exc_guard_helper_request_mutex), "lock"); + + /* prepare the global request and reply struct contents */ + memset(&exc_guard_helper_request, 0, sizeof(exc_guard_helper_request)); + memset(&exc_guard_helper_reply, 0, sizeof(exc_guard_helper_reply)); + exc_guard_helper_request.thread = mach_thread_self(); + exc_guard_helper_request.guard_type = guard_type; + + /* unlock the request and reply structs so the exception handler can use them */ + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_unlock(&exc_guard_helper_request_mutex), "unlock"); + + /* run the caller's block */ + if (verbose_exc_helper) { + T_LOG("exc_guard_helper calling a block"); + } + block(); + if (verbose_exc_helper) { + T_LOG("exc_guard_helper finished a block, %u exception%s caught", + exc_guard_helper_reply.catch_count, + exc_guard_helper_reply.catch_count == 1 ? 
"" : "s"); + } + + /* lock the request and reply structs again */ + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_unlock(&exc_guard_helper_request_mutex), "lock"); + + /* read the reply from the exception handler */ + bool result = exc_guard_helper_reply.catch_count > 0; + memcpy(out_exc_info, &exc_guard_helper_reply, sizeof(exc_guard_helper_reply)); + + /* clear the request and reply before unlocking everything */ + memset(&exc_guard_helper_request, 0, sizeof(exc_guard_helper_request)); + memset(&exc_guard_helper_reply, 0, sizeof(exc_guard_helper_reply)); + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_unlock(&exc_guard_helper_request_mutex), "unlock"); + + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_mutex_unlock(&exc_guard_helper_mutex), "unlock"); + + return result; +} + +bool +block_raised_exc_guard_of_type_ignoring_translated( + unsigned int guard_type, + exc_guard_helper_info_t * const out_exc_info, + exc_guard_helper_block_t block) +{ + if (process_is_translated() && guard_type == GUARD_TYPE_VIRT_MEMORY) { + /* Rosetta can't recover from guard exceptions of GUARD_TYPE_VIRT_MEMORY */ + T_LOG("note: exc_guard_helper calling a block with no exception " + "handler due to translation/Rosetta (rdar://142438840)"); + block(); + memset(out_exc_info, 0, sizeof(*out_exc_info)); + return false; + } + + return block_raised_exc_guard_of_type(guard_type, out_exc_info, block); +} diff --git a/tests/exc_guard_helper.h b/tests/exc_guard_helper.h new file mode 100644 index 000000000..8813c85f6 --- /dev/null +++ b/tests/exc_guard_helper.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * exc_guard_helper.h + * + * Helper functions for userspace tests to test for EXC_GUARD exceptions. + * + * To use these functions in your test you must set additional build options. + * See target `exc_guard_helper_test` in tests/Makefile for an example. + */ + +#pragma once + +#include +#include +#include + +/* + * Set verbose_exc_helper = true to log exception information with T_LOG(). + * The default is true. + */ +extern bool verbose_exc_helper; + +typedef struct { + /* The number of EXC_GUARD exceptions caught during the block. 
*/ + unsigned catch_count; + + /* + * The remaining fields are only set for the first EXC_GUARD caught. + * See kern/exc_guard.h for definitions of these fields. + */ + unsigned guard_type; /* e.g. GUARD_TYPE_VIRT_MEMORY */ + uint32_t guard_flavor; + uint32_t guard_target; + uint64_t guard_payload; +} exc_guard_helper_info_t; + +/* + * Initialize exc_guard_helper's exception handling. + * + * Calling this is optional. The other functions will perform + * initialization if necessary. You may need to call this + * function if that automatic initialization allocates + * memory in address ranges that your test requires to + * be unallocated. + */ +extern void +exc_guard_helper_init(void); + +/* + * Sets EXC_GUARD exceptions of the given type (e.g. GUARD_TYPE_VIRT_MEMORY) + * to be enabled and non-fatal in this process. + * Returns the previous guard exception behavior. Pass this value + * to task_set_exc_guard_behavior() to restore the previous behavior. + * + * Fails with T_FAIL if the behavior could not be set; for example: + * - guard exceptions cannot be configured in some processes + * - some guard exception types cannot be set to non-fatal + */ +extern task_exc_guard_behavior_t +enable_exc_guard_of_type(unsigned int guard_type); + +/* + * Runs block() and returns true if it raised a non-fatal EXC_GUARD exception + * of the requested type (e.g. GUARD_TYPE_VIRT_MEMORY). + * + * While block() runs, any EXC_GUARD exceptions of the requested + * type are caught and recorded, then execution resumes. + * Information about any caught exception(s) is returned in *out_exc_info. + * If more than one EXC_GUARD exception of the requested type is raised then + * details about all but the first are discarded, other than `catch_count` + * the number of exceptions caught. + * + * Guard exceptions of this type must be enabled and non-fatal. + * enable_exc_guard_of_type() can set this for your process. + * + * Note that block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY) + * does not work on Rosetta. This function will T_FAIL if you try. + * See block_raised_exc_guard_of_type_ignoring_translated() below + * if you are willing to forgo the guard exception handler in + * translated execution environments like Rosetta. + * + * Example: + * enable_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY); + * [...] + * exc_guard_helper_info_t exc_info; + * if (block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + * mach_vm_deallocate(mach_task_self(), addr, size); + * })) { + * // EXC_GUARD raised during mach_vm_deallocate, details in exc_info + * } else { + * // mach_vm_deallocate did not raise EXC_GUARD + * } + */ +typedef void (^exc_guard_helper_block_t)(void); +extern bool +block_raised_exc_guard_of_type( + unsigned int guard_type, + exc_guard_helper_info_t * const out_exc_info, + exc_guard_helper_block_t block); + +/* + * Like block_raised_exc_guard_of_type(), but quietly + * runs the block with no guard exception handler if + * the guard type is GUARD_TYPE_VIRT_MEMORY and we're + * in a translated execution environment like Rosetta. + */ +extern bool +block_raised_exc_guard_of_type_ignoring_translated( + unsigned int guard_type, + exc_guard_helper_info_t * const out_exc_info, + exc_guard_helper_block_t block); diff --git a/tests/exc_guard_helper_test.c b/tests/exc_guard_helper_test.c new file mode 100644 index 000000000..608240295 --- /dev/null +++ b/tests/exc_guard_helper_test.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * exc_guard_helper_test.c + * + * Test the testing helper functions in exc_guard_helper.h. + */ + +#include "exc_guard_helper.h" + +#include +#include +#include +#include +#include +#include +#include "test_utils.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vm"), + T_META_RUN_CONCURRENTLY(true), + T_META_ALL_VALID_ARCHS(true) + ); + +/* Convenience macro for compile-time array size */ +#define countof(array) \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic error \"-Wsizeof-pointer-div\"") \ + (sizeof(array)/sizeof((array)[0])) \ + _Pragma("clang diagnostic pop") + +/* + * Return true if [query_start, query_start + query_size) is unallocated memory. + */ +static bool +is_hole(mach_vm_address_t query_start, mach_vm_size_t query_size) +{ + mach_vm_address_t entry_start = query_start; + mach_vm_size_t entry_size; + vm_region_submap_info_data_64_t info; + uint32_t depth = 0; + mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64; + kern_return_t kr = mach_vm_region_recurse(mach_task_self(), + &entry_start, &entry_size, &depth, + (vm_region_recurse_info_t)&info, &count); + + if (kr == KERN_INVALID_ADDRESS) { + /* + * query_start is unmapped, and so is everything after it, + * therefore the query range is a hole + */ + return true; + } + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_region"); + + /* this code does not handle submaps */ + T_QUIET; T_ASSERT_EQ(depth, 0, "submaps unimplemented"); + + /* + * entry_start is mapped memory, and either + * (1) entry_start's mapping contains query_start, OR + * (2) query_start is unmapped and entry_start is the next mapped memory + */ + + if (entry_start >= query_start + query_size) { + /* + * entry_start's mapping does not contain query_start, + * and entry_start's mapping begins after the query range, + * therefore the query range is a hole + */ + return true; + } else { + return false; + } +} + +/* Call enable_exc_guard_of_type(), and test its behavior. 
*/ +static void +enable_exc_guard_of_type_and_verify(unsigned int guard_type) +{ + struct { + const char *name; + task_exc_guard_behavior_t all_mask; + task_exc_guard_behavior_t deliver_mask; + task_exc_guard_behavior_t fatal_mask; + } guards[] = { + [GUARD_TYPE_VIRT_MEMORY] = { + .name = "VM", + .all_mask = TASK_EXC_GUARD_VM_ALL, + .deliver_mask = TASK_EXC_GUARD_VM_DELIVER, + .fatal_mask = TASK_EXC_GUARD_VM_FATAL + }, + [GUARD_TYPE_MACH_PORT] = { + .name = "Mach port", + .all_mask = TASK_EXC_GUARD_MP_ALL, + .deliver_mask = TASK_EXC_GUARD_MP_DELIVER, + .fatal_mask = TASK_EXC_GUARD_MP_FATAL + } + }; + + kern_return_t kr; + task_exc_guard_behavior_t disabling_behavior, old_behavior, new_behavior; + + T_QUIET; T_ASSERT_TRUE(guard_type < countof(guards) && guards[guard_type].name != NULL, + "guard type in enable_exc_guard_of_type_and_verify"); + + /* disable guard exceptions of this type, then verify that enable_exc_guard_of_type enables them */ + + kr = task_get_exc_guard_behavior(mach_task_self(), &disabling_behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "get old behavior"); + disabling_behavior &= ~guards[guard_type].all_mask; + kr = task_set_exc_guard_behavior(mach_task_self(), disabling_behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "set empty behavior"); + + old_behavior = enable_exc_guard_of_type(guard_type); + T_QUIET; T_ASSERT_EQ(old_behavior, disabling_behavior, "enable_exc_guard_of_type return value"); + T_QUIET; T_ASSERT_FALSE(old_behavior & guards[guard_type].deliver_mask, + "%s guard exceptions must not be enabled", guards[guard_type].name); + + kr = task_get_exc_guard_behavior(mach_task_self(), &new_behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "get new behavior"); + T_ASSERT_TRUE(new_behavior & guards[guard_type].deliver_mask, + "enable_exc_guard_of_type enabled %s guard exceptions", guards[guard_type].name); + T_ASSERT_FALSE(new_behavior & guards[guard_type].fatal_mask, + "enable_exc_guard_of_type set %s guard exceptions to non-fatal", guards[guard_type].name); +} + + +T_DECL(exc_guard_helper_test_vm, + "test the test helper function block_raised_exc_guard_of_type with VM guard exceptions") +{ + if (process_is_translated()) { + T_SKIP("VM guard exceptions not supported on Rosetta (rdar://142438840)"); + } + + kern_return_t kr; + exc_guard_helper_info_t exc_info; + + exc_guard_helper_init(); + enable_exc_guard_of_type_and_verify(GUARD_TYPE_VIRT_MEMORY); + + /* + * Test guard exceptions by deallocating unallocated VM space. + * Problem: Rosetta asynchronously allocates memory in the process + * to store translated instructions. These allocations can land + * inside our unallocated space, disrupting our test and crashing + * after we call vm_deallocate() on space that we thought was empty. 
+ * Solution: + * - use VM_FLAGS_RANDOM_ADDR in the hope of moving our allocation + * away from VM's ordinary next allocation space + * - try to verify that the unallocated space is empty before + * calling vm_deallocate, and retry several times if it is not empty + */ + +#define LAST_RETRY 10 + for (int retry_count = 0; retry_count <= LAST_RETRY; retry_count++) { + /* allocate three pages */ + mach_vm_address_t allocated = 0; + kr = mach_vm_allocate(mach_task_self(), &allocated, PAGE_SIZE * 3, + VM_FLAGS_ANYWHERE | VM_FLAGS_RANDOM_ADDR); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "allocate space"); + + /* deallocate the page in the middle; no EXC_GUARD from successful deallocation */ + if (block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + kern_return_t kr; + kr = mach_vm_deallocate(mach_task_self(), allocated + PAGE_SIZE, PAGE_SIZE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "create hole"); + })) { + T_FAIL("unexpected guard exception"); + } else { + T_ASSERT_EQ(exc_info.catch_count, 0, "block_raised_exc_guard_of_type(VM) with no exceptions"); + } + + /* try to deallocate the hole, twice, and detect the guard exceptions */ + __block bool retry = false; + bool caught_exception = block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + kern_return_t kr; + + /* deallocate page-hole-page; EXC_GUARD expected from deallocating a hole */ + if (!is_hole(allocated + PAGE_SIZE, PAGE_SIZE)) { + retry = true; /* somebody allocated inside our unallocated space; retry */ + return; + } + kr = mach_vm_deallocate(mach_task_self(), allocated, PAGE_SIZE * 3); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate a hole"); + + /* deallocate again, now all holes; EXC_GUARD expected from deallocating a hole */ + if (!is_hole(allocated, PAGE_SIZE * 3)) { + retry = true; /* somebody allocated inside our unallocated space; retry */ + return; + } + kr = mach_vm_deallocate(mach_task_self(), allocated, PAGE_SIZE * 3); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate a hole again"); + + if (!is_hole(allocated, PAGE_SIZE * 3)) { + retry = true; /* somebody allocated inside our unallocated space; retry */ + return; + } + }); + + if (retry) { + if (retry_count < LAST_RETRY) { + T_LOG("unallocated space was found to be allocated, retrying"); + } else { + T_FAIL("intended unallocated space was repeatedly found to be allocated, giving up"); + } + } else if (caught_exception) { + /* caught an exception as expected: verify what we caught */ + T_ASSERT_EQ(exc_info.catch_count, 2, "block_raised_exc_guard_of_type(VM) with 2 exceptions"); + T_ASSERT_EQ(exc_info.guard_type, GUARD_TYPE_VIRT_MEMORY, "caught exception's type"); + T_ASSERT_EQ(exc_info.guard_flavor, kGUARD_EXC_DEALLOC_GAP, "caught exception's flavor"); + T_ASSERT_EQ(exc_info.guard_payload, allocated + PAGE_SIZE, "caught exception's payload"); + break; /* done retrying */ + } else { + /* where's the beef? */ + T_FAIL("no VM guard exception caught"); + break; /* done retrying */ + } + } +} + + +T_DECL(exc_guard_helper_test_mach_port, + "test the test helper function block_raised_exc_guard_of_type with Mach port guard exceptions") +{ + kern_return_t kr; + exc_guard_helper_info_t exc_info; + mach_port_t port; + + exc_guard_helper_init(); + enable_exc_guard_of_type_and_verify(GUARD_TYPE_MACH_PORT); + + /* + * Test guard exceptions by overflowing the send right count for a port. 
+ */ + + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "new port"); + kr = mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "make send"); + + /* add and remove one send right, should succeed */ + if (block_raised_exc_guard_of_type(GUARD_TYPE_MACH_PORT, &exc_info, ^{ + kern_return_t kr; + kr = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND, +1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "add one send right"); + kr = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND, -1); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "remove one send right"); + })) { + T_FAIL("unexpected guard exception"); + } else { + T_ASSERT_EQ(exc_info.catch_count, 0, "block_raised_exc_guard_of_type(MACH_PORT) with no exceptions"); + } + + /* try to overflow the port's send right count, twice, and catch the exceptions */ + bool caught_exception = block_raised_exc_guard_of_type(GUARD_TYPE_MACH_PORT, &exc_info, ^{ + kern_return_t kr; + unsigned expected_error; + if (process_is_translated()) { + expected_error = 0x1000013; /* KERN_UREFS_OVERFLOW plus another bit? */ + } else { + expected_error = KERN_INVALID_VALUE; + } + kr = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND, INT32_MAX); + T_QUIET; T_ASSERT_MACH_ERROR(kr, expected_error, "add too many send rights"); + kr = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND, INT32_MAX); + T_QUIET; T_ASSERT_MACH_ERROR(kr, expected_error, "add too many send rights, again"); + }); + if (caught_exception) { + /* caught an exception as expected: verify what we caught */ + T_ASSERT_EQ(exc_info.catch_count, 2, "block_raised_exc_guard_of_type(MACH_PORT) with 2 exceptions"); + T_ASSERT_EQ(exc_info.guard_type, GUARD_TYPE_MACH_PORT, "caught exception's type"); + T_ASSERT_EQ(exc_info.guard_flavor, kGUARD_EXC_INVALID_VALUE, "caught exception's flavor"); + T_ASSERT_EQ(exc_info.guard_target, port, "caught exception's target"); + } else { + /* where's the beef? */ + T_FAIL("no Mach port guard exception caught"); + } +} diff --git a/tests/exc_guard_helper_test_unexpected.c b/tests/exc_guard_helper_test_unexpected.c new file mode 100644 index 000000000..3c93030e7 --- /dev/null +++ b/tests/exc_guard_helper_test_unexpected.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * exc_guard_helper_test_unexpected.c + * + * Test the testing helper functions in exc_guard_helper.h. + * The exception handler used by block_raise_exc_guard_of_type() + * should allow other exceptions to continue to a crash. + */ + +#include "test_utils.h" +#include "exc_guard_helper.h" + +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vm"), + T_META_RUN_CONCURRENTLY(true), + T_META_ALL_VALID_ARCHS(true), + + T_META_IGNORECRASHES(".*exc_guard_helper_test_unexpected.*") + ); + +T_DECL(exc_guard_helper_test_unexpected_exc_guard, + "provoke one guard exception type while exc_guard_helper is expecting another") +{ + if (process_is_translated()) { + T_SKIP("VM guard exceptions not supported on Rosetta (rdar://142438840)"); + } + + pid_t child_pid; + + if ((child_pid = fork())) { + /* parent */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(child_pid, "fork"); + + int status; + pid_t waited_pid; + + waited_pid = waitpid(child_pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(waited_pid, "waitpid"); + T_QUIET; T_ASSERT_EQ(waited_pid, child_pid, "waitpid"); + + T_ASSERT_TRUE(WIFSIGNALED(status), "child should have crashed"); + T_ASSERT_EQ(WTERMSIG(status), SIGKILL, "child should have crashed with SIGKILL"); + } else { + /* child */ + kern_return_t kr; + task_exc_guard_behavior_t behavior; + exc_guard_helper_info_t exc_info; + mach_port_t port; + + exc_guard_helper_init(); + + /* + * set GUARD_TYPE_MACH_PORT to be enabled and fatal. + * This child process is expected to crash. 
+ */ + kr = task_get_exc_guard_behavior(mach_task_self(), &behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "get old behavior"); + behavior &= ~TASK_EXC_GUARD_MP_ALL; + behavior |= TASK_EXC_GUARD_MP_DELIVER | TASK_EXC_GUARD_MP_FATAL; + kr = task_set_exc_guard_behavior(mach_task_self(), behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "set fatal mach port behavior"); + + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "new port"); + kr = mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "make send"); + + /* provoke GUARD_TYPE_MACH_PORT while listening for GUARD_TYPE_VIRT_MEMORY */ + + if (block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + kern_return_t kr; + T_LOG("CHILD EXPECTED TO CRASH after this guard exception"); + kr = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND, INT32_MAX); + T_QUIET; T_ASSERT_MACH_ERROR(kr, KERN_INVALID_VALUE, "add too many send rights"); + })) { + T_FAIL("Mach port guard exception unexpectedly caught by VM guard exception handler"); + } + + T_FAIL("expected Mach port guard exception to kill the process"); + } +} diff --git a/tests/exc_helpers.c b/tests/exc_helpers.c index 8588e38d9..f10e277a5 100644 --- a/tests/exc_helpers.c +++ b/tests/exc_helpers.c @@ -37,14 +37,23 @@ #define EXCEPTION_THREAD_STATE ARM_THREAD_STATE64 #define EXCEPTION_THREAD_STATE_COUNT ARM_THREAD_STATE64_COUNT #elif __x86_64__ -#define EXCEPTION_THREAD_STATE x86_THREAD_STATE -#define EXCEPTION_THREAD_STATE_COUNT x86_THREAD_STATE_COUNT +#define EXCEPTION_THREAD_STATE x86_THREAD_STATE64 +#define EXCEPTION_THREAD_STATE_COUNT x86_THREAD_STATE64_COUNT #else #error Unsupported architecture #endif #define EXCEPTION_IDENTITY_PROTECTED 4 +bool verbose_exc_helper = true; + +#define LOG_VERBOSE(format, ...) \ + do { \ + if (verbose_exc_helper) { \ + T_LOG(format, ##__VA_ARGS__); \ + } \ + } while (0) + /** * mach_exc_server() is a MIG-generated function that verifies the message * that was received is indeed a mach exception and then calls @@ -104,10 +113,63 @@ catch_mach_exception_raise_state_identity( thread_state_t out_state, mach_msg_type_number_t *out_state_count); -static exc_handler_callback_t exc_handler_callback; -static exc_handler_protected_callback_t exc_handler_protected_callback; -static exc_handler_state_protected_callback_t exc_handler_state_protected_callback; -static exc_handler_backtrace_callback_t exc_handler_backtrace_callback; +/* Thread-local storage for exception server threads. */ + +struct exc_handler_callbacks { + exc_handler_callback_t state_callback; + exc_handler_protected_callback_t protected_callback; + exc_handler_state_protected_callback_t state_protected_callback; + exc_handler_backtrace_callback_t backtrace_callback; +}; + +static __thread struct exc_handler_callbacks tls_callbacks; + +/* + * Return the (ptrauth-stripped) PC from the + * thread state passed to an exception handler. + */ +static uint64_t +get_exception_pc(thread_state_t in_state) +{ +#if __arm64__ + arm_thread_state64_t *state = (arm_thread_state64_t*)(void *)in_state; + return arm_thread_state64_get_pc(*state); +#elif __x86_64__ + x86_thread_state64_t *state = (x86_thread_state64_t*)(void *)in_state; + return state->__rip; +#else + T_FAIL("unknown architecture"); + __builtin_unreachable(); +#endif +} + +/* + * Increment the PC in thread state `out_state` by `advance_pc` bytes. 
+ */ +static void +advance_exception_pc( + size_t advance_pc, + thread_state_t out_state) +{ + /* disallow the sentinel value used by the exception handlers */ + assert(advance_pc != EXC_HELPER_HALT); + +#if __arm64__ + arm_thread_state64_t *state = (arm_thread_state64_t*)(void *)out_state; + + void *pc = (void*)(arm_thread_state64_get_pc(*state) + advance_pc); + /* Have to sign the new PC value when pointer authentication is enabled. */ + pc = ptrauth_sign_unauthenticated(pc, ptrauth_key_function_pointer, 0); + arm_thread_state64_set_pc_fptr(*state, pc); +#elif __x86_64__ + x86_thread_state64_t *state = (x86_thread_state64_t*)(void *)out_state; + state->__rip += advance_pc; +#else + (void)advance_pc; + T_FAIL("unknown architecture"); + __builtin_unreachable(); +#endif +} /** * This has to be defined for linking purposes, but it's unused. @@ -140,7 +202,8 @@ catch_mach_exception_raise_state_identity_protected( thread_state_t out_state, mach_msg_type_number_t *out_state_count) { - T_LOG("Caught a mach exception!\n"); + LOG_VERBOSE("Caught a mach exception!\n"); + /* There should only be two code values. */ T_QUIET; T_ASSERT_EQ(code_count, 2, "Two code values were provided with the mach exception"); @@ -149,7 +212,8 @@ catch_mach_exception_raise_state_identity_protected( * when setting the exception port. */ mach_exception_data_t codes_64 = (mach_exception_data_t)(void *)codes; - T_LOG("Mach exception codes[0]: %#llx, codes[1]: %#llx\n", codes_64[0], codes_64[1]); + LOG_VERBOSE("Mach exception type %d, codes[0]: %#llx, codes[1]: %#llx\n", + type, codes_64[0], codes_64[1]); /* Verify that we're receiving the expected thread state flavor. */ T_QUIET; T_ASSERT_EQ(*flavor, EXCEPTION_THREAD_STATE, "The thread state flavor is EXCEPTION_THREAD_STATE"); @@ -158,8 +222,20 @@ catch_mach_exception_raise_state_identity_protected( *out_state_count = in_state_count; /* size of state object in 32-bit words */ memcpy((void*)out_state, (void*)in_state, in_state_count * 4); - exc_handler_state_protected_callback(task_id_token, thread_id, type, codes_64, in_state, - in_state_count, out_state, out_state_count); + size_t advance_pc = tls_callbacks.state_protected_callback( + task_id_token, thread_id, type, codes_64, in_state, + in_state_count, out_state, out_state_count); + + if (advance_pc == EXC_HELPER_HALT) { + /* Exception handler callback says we can't continue. */ + LOG_VERBOSE("Halting after exception"); + return KERN_FAILURE; + } + + if (advance_pc != 0) { + T_FAIL("unimplemented PC change from EXCEPTION_STATE_IDENTITY_PROTECTED callback"); + return KERN_FAILURE; + } /* Return KERN_SUCCESS to tell the kernel to keep running the victim thread. */ return KERN_SUCCESS; @@ -175,7 +251,7 @@ catch_mach_exception_raise_identity_protected( mach_exception_data_t codes, mach_msg_type_number_t codeCnt) { - T_LOG("Caught a mach exception!\n"); + LOG_VERBOSE("Caught a mach exception!\n"); /* There should only be two code values. */ T_QUIET; T_ASSERT_EQ(codeCnt, 2, "Two code values were provided with the mach exception"); @@ -185,11 +261,22 @@ catch_mach_exception_raise_identity_protected( * when setting the exception port. 
*/ mach_exception_data_t codes_64 = (mach_exception_data_t)(void *)codes; - T_LOG("Mach exception codes[0]: %#llx, codes[1]: %#llx\n", codes_64[0], codes_64[1]); + LOG_VERBOSE("Mach exception type %d, codes[0]: %#llx, codes[1]: %#llx\n", + exception, codes_64[0], codes_64[1]); - exc_handler_protected_callback(task_id_token, thread_id, exception, codes_64); + size_t advance_pc = tls_callbacks.protected_callback( + task_id_token, thread_id, exception, codes_64); - T_LOG("Assuming the thread state modification was done in the callback, skipping it"); + if (advance_pc == EXC_HELPER_HALT) { + /* Exception handler callback says we can't continue. */ + LOG_VERBOSE("Halting after exception"); + return KERN_FAILURE; + } + + if (advance_pc != 0) { + T_FAIL("unimplemented PC change from EXCEPTION_IDENTITY_PROTECTED callback"); + return KERN_FAILURE; + } /* Return KERN_SUCCESS to tell the kernel to keep running the victim thread. */ return KERN_SUCCESS; @@ -234,7 +321,7 @@ catch_mach_exception_raise_state_identity( thread_state_t out_state, mach_msg_type_number_t *out_state_count) { - T_LOG("Caught a mach exception!\n"); + LOG_VERBOSE("Caught a mach exception!\n"); /* There should only be two code values. */ T_QUIET; T_ASSERT_EQ(code_count, 2, "Two code values were provided with the mach exception"); @@ -244,33 +331,37 @@ catch_mach_exception_raise_state_identity( * when setting the exception port. */ mach_exception_data_t codes_64 = (mach_exception_data_t)(void *)codes; - T_LOG("Mach exception codes[0]: %#llx, codes[1]: %#llx\n", codes_64[0], codes_64[1]); + LOG_VERBOSE("Mach exception type %d, codes[0]: %#llx, codes[1]: %#llx\n", + type, codes_64[0], codes_64[1]); /* Verify that we're receiving the expected thread state flavor. */ T_QUIET; T_ASSERT_EQ(*flavor, EXCEPTION_THREAD_STATE, "The thread state flavor is EXCEPTION_THREAD_STATE"); T_QUIET; T_ASSERT_EQ(in_state_count, EXCEPTION_THREAD_STATE_COUNT, "The thread state count is EXCEPTION_THREAD_STATE_COUNT"); - size_t advance_pc = exc_handler_callback(task, thread, type, codes_64); + uint64_t exception_pc = get_exception_pc(in_state); + + size_t advance_pc = tls_callbacks.state_callback( + task, thread, type, codes_64, exception_pc); + + if (advance_pc == EXC_HELPER_HALT) { + /* Exception handler callback says we can't continue. */ + LOG_VERBOSE("Halting after exception"); + return KERN_FAILURE; + } /** - * Increment the PC by the requested amount so the thread doesn't cause - * another exception when it resumes. + * Copy in_state to out_state, then increment the PC by the requested + * amount so the thread doesn't cause another exception when it resumes. */ *out_state_count = in_state_count; /* size of state object in 32-bit words */ memcpy((void*)out_state, (void*)in_state, in_state_count * 4); - -#if __arm64__ - arm_thread_state64_t *state = (arm_thread_state64_t*)(void *)out_state; - - void *pc = (void*)(arm_thread_state64_get_pc(*state) + advance_pc); - /* Have to sign the new PC value when pointer authentication is enabled. 
*/ - pc = ptrauth_sign_unauthenticated(pc, ptrauth_key_function_pointer, 0); - arm_thread_state64_set_pc_fptr(*state, pc); -#else - (void)advance_pc; - T_FAIL("catch_mach_exception_raise_state() not fully implemented on this architecture"); - __builtin_unreachable(); -#endif + assert(0 == memcmp(in_state, out_state, in_state_count * 4)); + if (advance_pc != 0) { + advance_exception_pc(advance_pc, out_state); + LOG_VERBOSE("Continuing after exception at a new PC"); + } else { + LOG_VERBOSE("Continuing after exception"); + } /* Return KERN_SUCCESS to tell the kernel to keep running the victim thread. */ return KERN_SUCCESS; @@ -284,7 +375,7 @@ catch_mach_exception_raise_backtrace( mach_exception_data_t codes, __unused mach_msg_type_number_t codeCnt) { - return exc_handler_backtrace_callback(kcdata_object, exception, codes); + return tls_callbacks.backtrace_callback(kcdata_object, exception, codes); } mach_port_t @@ -293,13 +384,17 @@ create_exception_port(exception_mask_t exception_mask) return create_exception_port_behavior64(exception_mask, EXCEPTION_STATE_IDENTITY); } -mach_port_t -create_exception_port_behavior64(exception_mask_t exception_mask, exception_behavior_t behavior) +void +set_thread_exception_port(mach_port_t exc_port, exception_mask_t exception_mask) +{ + set_thread_exception_port_behavior64(exc_port, exception_mask, EXCEPTION_STATE_IDENTITY); +} + +void +set_thread_exception_port_behavior64(exception_port_t exc_port, exception_mask_t exception_mask, exception_behavior_t behavior) { - mach_port_t exc_port = MACH_PORT_NULL; - mach_port_t task = mach_task_self(); mach_port_t thread = mach_thread_self(); - kern_return_t kr = KERN_SUCCESS; + kern_return_t kr; if (((unsigned int)behavior & ~MACH_EXCEPTION_MASK) != EXCEPTION_STATE_IDENTITY && ((unsigned int)behavior & ~MACH_EXCEPTION_MASK) != EXCEPTION_IDENTITY_PROTECTED) { @@ -308,6 +403,23 @@ create_exception_port_behavior64(exception_mask_t exception_mask, exception_beha behavior |= MACH_EXCEPTION_CODES; + /* Tell the kernel what port to send exceptions to. */ + kr = thread_set_exception_ports( + thread, + exception_mask, + exc_port, + (exception_behavior_t)((unsigned int)behavior), + EXCEPTION_THREAD_STATE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler"); +} + +mach_port_t +create_exception_port_behavior64(exception_mask_t exception_mask, exception_behavior_t behavior) +{ + mach_port_t exc_port = MACH_PORT_NULL; + mach_port_t task = mach_task_self(); + kern_return_t kr = KERN_SUCCESS; + /* Create the mach port the exception messages will be sent to. */ kr = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &exc_port); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port"); @@ -319,21 +431,15 @@ create_exception_port_behavior64(exception_mask_t exception_mask, exception_beha kr = mach_port_insert_right(task, exc_port, exc_port, MACH_MSG_TYPE_MAKE_SEND); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port"); - /* Tell the kernel what port to send exceptions to. 
*/ - kr = thread_set_exception_ports( - thread, - exception_mask, - exc_port, - (exception_behavior_t)((unsigned int)behavior), - EXCEPTION_THREAD_STATE); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Set the exception port to my custom handler"); - + set_thread_exception_port_behavior64(exc_port, exception_mask, behavior); return exc_port; } struct thread_params { mach_port_t exc_port; bool run_once; + + struct exc_handler_callbacks callbacks; }; /** @@ -347,7 +453,15 @@ exc_server_thread(void *arg) struct thread_params *params = arg; mach_port_t exc_port = params->exc_port; bool run_once = params->run_once; + + /* + * Save callbacks to thread-local storage so the + * catch_mach_exception_raise_* functions can get them. + */ + tls_callbacks = params->callbacks; + free(params); + params = NULL; /** * mach_msg_server_once is a helper function provided by libsyscall that @@ -371,34 +485,35 @@ exc_server_thread(void *arg) static void _run_exception_handler(mach_port_t exc_port, void *preferred_callback, void *callback, bool run_once, exception_behavior_t behavior) { + /* Set parameters for the exception server's thread. */ + struct thread_params *params = calloc(1, sizeof(*params)); + params->exc_port = exc_port; + params->run_once = run_once; + if (behavior & MACH_EXCEPTION_BACKTRACE_PREFERRED) { T_QUIET; T_ASSERT_NE(NULL, preferred_callback, "Require a preferred callback"); - exc_handler_backtrace_callback = (exc_handler_backtrace_callback_t)preferred_callback; + params->callbacks.backtrace_callback = (exc_handler_backtrace_callback_t)preferred_callback; } behavior &= ~MACH_EXCEPTION_MASK; switch (behavior) { case EXCEPTION_STATE_IDENTITY: - exc_handler_callback = (exc_handler_callback_t)callback; + params->callbacks.state_callback = (exc_handler_callback_t)callback; break; case EXCEPTION_STATE_IDENTITY_PROTECTED: - exc_handler_state_protected_callback = (exc_handler_state_protected_callback_t)callback; + params->callbacks.state_protected_callback = (exc_handler_state_protected_callback_t)callback; break; case EXCEPTION_IDENTITY_PROTECTED: - exc_handler_protected_callback = (exc_handler_protected_callback_t)callback; + params->callbacks.protected_callback = (exc_handler_protected_callback_t)callback; break; default: T_FAIL("Unsupported behavior"); break; } - pthread_t exc_thread; - /* Spawn the exception server's thread. */ - struct thread_params *params = malloc(sizeof(*params)); - params->exc_port = exc_port; - params->run_once = run_once; + pthread_t exc_thread; int err = pthread_create(&exc_thread, (pthread_attr_t*)0, exc_server_thread, params); T_QUIET; T_ASSERT_POSIX_ZERO(err, "Spawned exception server thread"); diff --git a/tests/exc_helpers.h b/tests/exc_helpers.h index f2d9a4457..52b3fc42b 100644 --- a/tests/exc_helpers.h +++ b/tests/exc_helpers.h @@ -32,8 +32,15 @@ #include #include #include +#include #include +/** + * Set verbose_exc_helper = true to log exception information with T_LOG(). + * The default is true. + */ +extern bool verbose_exc_helper; + /** * Callback invoked by run_exception_handler() when a Mach exception is * received. 
@@ -42,13 +49,15 @@ * @param thread the task causing the exception * @param type exception type received from the kernel * @param codes exception codes received from the kernel + * @param pc the (ptrauth-stripped) program counter of the exception * * @return how much the exception handler should advance the program * counter, in bytes (in order to move past the code causing the - * exception) + * exception); OR the special value EXC_HELPER_HALT to + * let the process crash instead of continuing. */ typedef size_t (*exc_handler_callback_t)(mach_port_t task, mach_port_t thread, - exception_type_t type, mach_exception_data_t codes); + exception_type_t type, mach_exception_data_t codes, uint64_t pc); typedef size_t (*exc_handler_protected_callback_t)(task_id_token_t token, uint64_t thread_d, exception_type_t type, mach_exception_data_t codes); @@ -60,8 +69,11 @@ typedef size_t (*exc_handler_state_protected_callback_t)(task_id_token_t token, typedef kern_return_t (*exc_handler_backtrace_callback_t)(kcdata_object_t kcdata_object, exception_type_t type, mach_exception_data_t codes); +#define EXC_HELPER_HALT ((size_t)INTPTR_MAX) + /** - * Allocates a Mach port and configures it to receive exception messages. + * Allocates a Mach port and configures it to receive exception messages, + * and installs it as the exception handler for the current thread. * * @param exception_mask exception types that this Mach port should receive * @@ -73,6 +85,16 @@ create_exception_port(exception_mask_t exception_mask); mach_port_t create_exception_port_behavior64(exception_mask_t exception_mask, exception_behavior_t behavior); +/** + * Installs an exception port created with create_exception_port() + * as the exception handler for the current thread. + */ +void +set_thread_exception_port(mach_port_t exc_port, exception_mask_t exception_mask); + +void +set_thread_exception_port_behavior64(mach_port_t exc_port, exception_mask_t exception_mask, exception_behavior_t behavior); + /** * Handles one exception received on the provided Mach port, by running the * provided callback. 
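For reference, the reworked exc_helpers interface documented in the hunk above is typically driven from a darwintest roughly as sketched below. This is only an illustrative sketch assembled from the declarations in this patch; the handler name, the EXC_MASK_BAD_ACCESS mask, and the 4-byte instruction skip are placeholder choices for illustration, not part of the patch itself.

static size_t
example_bad_access_handler(
	__unused mach_port_t task,
	__unused mach_port_t thread,
	exception_type_t type,
	mach_exception_data_t codes,
	uint64_t pc)
{
	/* The new fifth argument is the (ptrauth-stripped) faulting PC. */
	T_LOG("caught exception type %d at pc %#llx, codes %#llx/%#llx",
	    type, pc, codes[0], codes[1]);
	if (type != EXC_BAD_ACCESS) {
		/* EXC_HELPER_HALT tells the helper to return KERN_FAILURE,
		 * letting the process crash instead of resuming. */
		return EXC_HELPER_HALT;
	}
	return 4; /* advance past the faulting 4-byte (arm64) instruction */
}

/* Inside the T_DECL body: create_exception_port() now also installs the
 * port on the calling thread; set_thread_exception_port() is only needed
 * to re-install it on a different thread. */
mach_port_t exc_port = create_exception_port(EXC_MASK_BAD_ACCESS);
set_thread_exception_port(exc_port, EXC_MASK_BAD_ACCESS);
verbose_exc_helper = false;     /* silence the helper's T_LOG() output */
run_exception_handler(exc_port, example_bad_access_handler);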
diff --git a/tests/exception_tests.c b/tests/exception_tests.c index 1d3c8e09f..41c68d76e 100644 --- a/tests/exception_tests.c +++ b/tests/exception_tests.c @@ -12,19 +12,6 @@ T_GLOBAL_META( T_META_RADAR_COMPONENT_VERSION("IPC"), T_META_RUN_CONCURRENTLY(true)); -static size_t -exc_immovable_handler( - mach_port_t task, - mach_port_t thread, - __unused exception_type_t type, - __unused mach_exception_data_t codes) -{ - T_EXPECT_EQ(task, mach_task_self(), "Received immovable task port"); - T_EXPECT_EQ(thread, pthread_mach_thread_np(pthread_main_thread_np()), - "Received immovable thread port"); - T_END; -} - static size_t exc_handler_identity_protected( task_id_token_t token, @@ -45,37 +32,6 @@ exc_handler_identity_protected( T_END; } -T_DECL(exc_immovable, "Test that exceptions receive immovable ports", - T_META_TAG_VM_PREFERRED) -{ - mach_port_t exc_port = create_exception_port(EXC_MASK_BAD_ACCESS); - uint32_t opts = 0; - size_t size = sizeof(&opts); - mach_port_t mp; - kern_return_t kr; - - T_LOG("Check if task_exc_guard exception has been enabled\n"); - int ret = sysctlbyname("kern.ipc_control_port_options", &opts, &size, NULL, 0); - T_EXPECT_POSIX_SUCCESS(ret, "sysctlbyname(kern.ipc_control_port_options)"); - - if ((opts & 0x30) == 0) { - T_SKIP("immovable rights aren't enabled"); - } - - kr = task_get_special_port(mach_task_self(), TASK_KERNEL_PORT, &mp); - T_EXPECT_MACH_SUCCESS(kr, "task_get_special_port"); - T_EXPECT_NE(mp, mach_task_self(), "should receive movable port"); - - /* - * do not deallocate the port we received on purpose to check - * that the exception will not coalesce with the movable port - * we have in our space now - */ - - run_exception_handler(exc_port, exc_immovable_handler); - *(void *volatile*)0 = 0; -} - T_DECL(exc_raise_identity_protected, "Test identity-protected exception delivery behavior", T_META_TAG_VM_NOT_PREFERRED) { diff --git a/tests/exec_set_proc_name.c b/tests/exec_set_proc_name.c new file mode 100644 index 000000000..c409ffedf --- /dev/null +++ b/tests/exec_set_proc_name.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include + + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.spawn"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("spawn"), + T_META_RUN_CONCURRENTLY(TRUE)); + +static void +check_myself(char *name) +{ + struct proc_bsdinfo pinfo = {0}; + int ret = proc_pidinfo(getpid(), PROC_PIDTBSDINFO, 0, &pinfo, sizeof(pinfo)); + T_ASSERT_POSIX_SUCCESS(ret, "proc_pidinfo"); + + T_LOG("my process name is '%s' (comm is '%s')", pinfo.pbi_name, pinfo.pbi_comm); + + char *found = strstr(pinfo.pbi_name, "exec_set_proc_name"); + T_ASSERT_NOTNULL(found, "proc name of %s", name); +} + +T_HELPER_DECL(spawned_helper, "spawned helper") +{ + check_myself("child"); +} + +T_DECL(set_proc_name, "check process name is correct", T_META_TAG_VM_PREFERRED) +{ + int pid, ret, status; + + check_myself("parent"); + + char binpath[MAXPATHLEN]; + uint32_t size = sizeof(binpath); + ret = _NSGetExecutablePath(binpath, &size); + T_QUIET; T_ASSERT_EQ(ret, 0, "get binary path"); + + ret = dt_launch_tool(&pid, (char *[]) { binpath, "-n", "spawned_helper", NULL }, false, NULL, NULL); + T_ASSERT_POSIX_ZERO(ret, "posix_spawn"); + + ret = waitpid(pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "waitpid"); + T_ASSERT_TRUE(WIFEXITED(status), "child exited"); + T_ASSERT_EQ(WEXITSTATUS(status), 0, "child exit code"); +} diff --git a/tests/extract_right_soft_fail.c b/tests/extract_right_soft_fail.c deleted file mode 100644 index 
193e4c2b5..000000000 --- a/tests/extract_right_soft_fail.c +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#import - -#define IKOT_TASK_CONTROL 2 - -T_GLOBAL_META( - T_META_NAMESPACE("xnu.ipc"), - T_META_RADAR_COMPONENT_NAME("xnu"), - T_META_RADAR_COMPONENT_VERSION("IPC"), - T_META_RUN_CONCURRENTLY(TRUE), - T_META_TAG_VM_PREFERRED); - -static void -test_extract_immovable_task_port(pid_t pid) -{ - kern_return_t kr; - mach_port_t tport = MACH_PORT_NULL; - ipc_info_space_t space_info; - ipc_info_name_array_t table; - mach_msg_type_number_t tableCount; - ipc_info_tree_name_array_t tree; /* unused */ - mach_msg_type_number_t treeCount; /* unused */ - - mach_port_t extracted; - mach_msg_type_name_t right; - - - kr = task_for_pid(mach_task_self(), pid, &tport); - T_EXPECT_MACH_SUCCESS(kr, "task_for_pid(), tport: 0x%x", tport); - - T_LOG("Target pid: %d", pid); - - if (pid == getpid()) { - /* self extraction should succeed */ - kr = mach_port_extract_right(mach_task_self(), mach_task_self(), MACH_MSG_TYPE_COPY_SEND, &extracted, &right); - T_EXPECT_MACH_SUCCESS(kr, "mach_port_extract_right() on immovable port in current space should succeed"); - } else { - unsigned int kotype = 0, kobject = 0; - mach_port_name_t tport_name = MACH_PORT_NULL; - int tport_idx = 0; - kr = mach_port_space_info(tport, &space_info, &table, &tableCount, &tree, &treeCount); - T_EXPECT_MACH_SUCCESS(kr, "mach_port_space_info()"); - - for (int i = 0; i < tableCount; i++) { - T_LOG("Searching for task port..name: 0x%x", table[i].iin_name); - kr = mach_port_kernel_object(tport, table[i].iin_name, &kotype, &kobject); - if (KERN_SUCCESS == kr && kotype == IKOT_TASK_CONTROL) { - tport_name = table[i].iin_name; - tport_idx = i; - break; - } else if (kr) { - T_LOG("mach_port_kernel_object() failed on name 0x%x, kr: 0x%x", table[i].iin_name, kr); - } - } - - if (!tport_name) { - T_FAIL("Did not find task port in child's space"); - } - T_LOG("Remote tport name: 0x%x", tport_name); - kr = mach_port_extract_right(tport, tport_name, MACH_MSG_TYPE_COPY_SEND, &extracted, &right); - T_EXPECT_EQ(kr, KERN_INVALID_CAPABILITY, "mach_port_extract_right() on immovable port in child's space should fail (no crash): 0x%x", kr); - - T_LOG("Still alive after extract right.."); - - kr = mach_port_mod_refs(tport, tport_name, MACH_PORT_RIGHT_SEND, -table[tport_idx].iin_urefs); - T_EXPECT_EQ(kr, KERN_INVALID_CAPABILITY, "mach_port_mod_refs() on pinned port in child's space should fail (no crash): 0x%x", kr); - - T_LOG("Still alive after deallocate.."); - } -} - -T_DECL(extract_right_soft_fail, "Immovable/pinned violation on foreign task's space should not crash caller", - T_META_CHECK_LEAKS(false)) -{ - uint32_t opts = 0; - size_t size = sizeof(&opts); - pid_t child_pid; - kern_return_t ret; - int status, fd[2], fd2[2]; - - T_LOG("Check if immovable control port has been enabled\n"); - ret = sysctlbyname("kern.ipc_control_port_options", &opts, &size, NULL, 0); - - if (!ret && (opts & 0x08) == 0) { - T_SKIP("1p immovable control port hard enforcement isn't enabled"); - } - - /* extracting mach_task_self() should succeed */ - test_extract_immovable_task_port(getpid()); - - ret = pipe(fd); - T_EXPECT_NE(ret, -1, "pipe creation"); - - ret = pipe(fd2); - T_EXPECT_NE(ret, -1, "pipe creation2"); - - child_pid = fork(); - - if (child_pid < 0) { - T_FAIL("fork failed()"); - } - - if (child_pid == 0) { - char data[6]; - close(fd[0]); - close(fd2[1]); - write(fd[1], "wakeup", 6); /* Sync point 
1 */ - close(fd[1]); - - read(fd2[0], data, 6); /* Sync point 2 */ - close(fd2[0]); - } else { - char data[6]; - close(fd[1]); - close(fd2[0]); - read(fd[0], data, 6); /* Sync point 1 */ - close(fd[0]); - - /* extracting child's immovable task port should fail without crash */ - test_extract_immovable_task_port(child_pid); - - write(fd2[1], "wakeup", 6); /* Sync point 2 */ - close(fd2[1]); - - kill(child_pid, SIGKILL); - wait(&status); - } -} diff --git a/tests/flow_div_doubleconnect_55917185.c b/tests/flow_div_doubleconnect_55917185.c index bde0d0025..eb06ca63b 100644 --- a/tests/flow_div_doubleconnect_55917185.c +++ b/tests/flow_div_doubleconnect_55917185.c @@ -8,6 +8,8 @@ #include +#include "net_test_lib.h" + /* we should win the race in this window: */ #define NTRIES 200000 @@ -58,6 +60,9 @@ T_DECL(flow_div_doubleconnect_55917185, "Bad error path in double-connect for fl s = -1; } + T_ASSERT_POSIX_ZERO(pthread_join(t, NULL), NULL); T_PASS("flow_divert_kctl_connect race didn't trigger panic"); + + force_zone_gc(); } diff --git a/tests/fp_exception.c b/tests/fp_exception.c index 0c9982c27..801ea6766 100644 --- a/tests/fp_exception.c +++ b/tests/fp_exception.c @@ -49,6 +49,8 @@ T_GLOBAL_META( T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_NOT_ELIGIBLE); +#ifdef __arm64__ + /* The bit to set in FPCR to enable the divide-by-zero floating point exception. */ #define FPCR_DIV_EXC 0x200 #define FPCR_INIT (0x0) @@ -56,13 +58,13 @@ T_GLOBAL_META( /* Whether we caught the EXC_ARITHMETIC mach exception or not. */ static volatile bool mach_exc_caught = false; -#ifdef __arm64__ static size_t exc_arithmetic_handler( __unused mach_port_t task, __unused mach_port_t thread, exception_type_t type, - mach_exception_data_t codes_64) + mach_exception_data_t codes_64, + __unused uint64_t exception_pc) { /* Floating point divide by zero should cause an EXC_ARITHMETIC exception. 
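The deleted extract_right_soft_fail test above sequences the parent and child with two pipes so the child stays alive and idle while the parent probes its task port. A minimal standalone sketch of that two-pipe handshake, using nothing beyond POSIX:

#include <signal.h>
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
	int fd[2], fd2[2];
	char buf[6];

	if (pipe(fd) == -1 || pipe(fd2) == -1) {
		return 1;
	}

	pid_t child = fork();
	if (child < 0) {
		return 1;
	}

	if (child == 0) {
		/* Child: announce readiness, then park until the parent is done. */
		close(fd[0]);
		close(fd2[1]);
		write(fd[1], "wakeup", 6);   /* sync point 1 */
		close(fd[1]);
		read(fd2[0], buf, 6);        /* sync point 2 */
		close(fd2[0]);
		_exit(0);
	}

	/* Parent: wait for the child, inspect it, then release and reap it. */
	close(fd[1]);
	close(fd2[0]);
	read(fd[0], buf, 6);             /* sync point 1 */
	close(fd[0]);
	/* ... probe the child's task port here ... */
	write(fd2[1], "wakeup", 6);      /* sync point 2 */
	close(fd2[1]);

	kill(child, SIGKILL);
	waitpid(child, NULL, 0);
	return 0;
}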
*/ T_ASSERT_EQ(type, EXC_ARITHMETIC, "Caught an EXC_ARITHMETIC exception"); @@ -73,11 +75,12 @@ exc_arithmetic_handler( mach_exc_caught = true; return 4; } -#endif #define KERNEL_BOOTARGS_MAX_SIZE 1024 static char kernel_bootargs[KERNEL_BOOTARGS_MAX_SIZE]; +#endif /* __arm64__ */ + T_DECL(armv8_fp_exception, "Test that ARMv8 floating point exceptions generate Mach exceptions, verify default FPCR value.") { diff --git a/tests/host_statistics_rate_limiting.c b/tests/host_statistics_rate_limiting.c index de3cf0391..07f350f73 100644 --- a/tests/host_statistics_rate_limiting.c +++ b/tests/host_statistics_rate_limiting.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "cs_helpers.h" T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); @@ -22,6 +23,7 @@ T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); struct all_host_info { vm_statistics64_data_t host_vm_info64_rev0; vm_statistics64_data_t host_vm_info64_rev1; + vm_statistics64_data_t host_vm_info64_rev2; vm_extmod_statistics_data_t host_extmod_info64; host_load_info_data_t host_load_info; vm_statistics_data_t host_vm_info_rev0; @@ -32,6 +34,22 @@ struct all_host_info { task_power_info_v2_data_t host_expired_task_info2; }; +static bool +on_rosetta(void) +{ +#if defined(__x86_64__) + int out_value = 0; + size_t io_size = sizeof(out_value); + if (sysctlbyname("sysctl.proc_translated", &out_value, &io_size, NULL, 0) == 0) { + assert(io_size >= sizeof(out_value)); + return out_value; + } + return false; +#else /* defined(__x86_64__) */ + return false; +#endif /* !defined(__x86_64__) */ +} + static void check_host_info(struct all_host_info* data, unsigned long iter, char lett) { @@ -82,29 +100,61 @@ get_host_info(struct all_host_info* data, host_t self, int iter) for (i = 0; i < iter; i++) { count = HOST_VM_INFO64_REV0_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_VM_INFO64, (host_info64_t)&data[i].host_vm_info64_rev0, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_VM_INFO64_REV0_COUNT, NULL); + count = HOST_VM_INFO64_REV1_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_VM_INFO64, (host_info64_t)&data[i].host_vm_info64_rev1, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_VM_INFO64_REV1_COUNT, NULL); + + count = HOST_VM_INFO64_REV2_COUNT; + T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_VM_INFO64, (host_info64_t)&data[i].host_vm_info64_rev2, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_VM_INFO64_REV2_COUNT, NULL); + count = HOST_EXTMOD_INFO64_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_EXTMOD_INFO64, (host_info64_t)&data[i].host_extmod_info64, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_EXTMOD_INFO64_COUNT, NULL); + count = HOST_LOAD_INFO_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_LOAD_INFO, (host_info_t)&data[i].host_load_info, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_LOAD_INFO_COUNT, NULL); + count = HOST_VM_INFO_REV0_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev0, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_VM_INFO_REV0_COUNT, NULL); + count = HOST_VM_INFO_REV1_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev1, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_VM_INFO_REV1_COUNT, NULL); + count = HOST_VM_INFO_REV2_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev2, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_VM_INFO_REV2_COUNT, 
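Each count assertion added in this hunk checks that host_statistics64()/host_statistics() rewrites the count argument to the number of 32-bit words it actually filled in, so a caller can tell which structure revision it got back. A minimal sketch of that call pattern, assuming only the standard Mach host interfaces:

#include <mach/mach.h>
#include <mach/mach_host.h>
#include <stdio.h>

int
main(void)
{
	vm_statistics64_data_t vmstat;
	mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;   /* latest revision */
	kern_return_t kr;

	kr = host_statistics64(mach_host_self(), HOST_VM_INFO64,
	    (host_info64_t)&vmstat, &count);
	if (kr == KERN_SUCCESS) {
		/* count now reports how many words the kernel filled in. */
		printf("filled %u words, free pages: %u\n", count, vmstat.free_count);
	}
	return 0;
}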
NULL); + count = HOST_CPU_LOAD_INFO_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_CPU_LOAD_INFO, (host_info_t)&data[i].host_cpu_load_info, &count), NULL); + T_QUIET; T_ASSERT_EQ(count, HOST_CPU_LOAD_INFO_COUNT, NULL); + count = TASK_POWER_INFO_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_EXPIRED_TASK_INFO, (host_info_t)&data[i].host_expired_task_info, &count), NULL); + if (!on_rosetta()) { + /* rdar://61083333 */ + T_QUIET; T_ASSERT_EQ(count, TASK_POWER_INFO_COUNT, NULL); + } + count = TASK_POWER_INFO_V2_COUNT; T_QUIET; T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_EXPIRED_TASK_INFO, (host_info_t)&data[i].host_expired_task_info2, &count), NULL); + if (!on_rosetta()) { + /* rdar://61083333 */ + T_QUIET; T_ASSERT_EQ(count, TASK_POWER_INFO_V2_COUNT, NULL); + } } } T_DECL(test_host_statistics, "testing rate limit for host_statistics", - T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true), T_META_TAG_VM_NOT_PREFERRED) + T_META_CHECK_LEAKS(false), + T_META_ALL_VALID_ARCHS(true), + T_META_TAG_VM_NOT_PREFERRED, + T_META_ENABLED(false) /* rdar://134505671 */) { unsigned long long start, end, window; int retry = 0; @@ -129,11 +179,12 @@ T_DECL(test_host_statistics, "testing rate limit for host_statistics", T_QUIET; T_ASSERT_EQ(sizeof(data[0].host_expired_task_info2), TASK_POWER_INFO_V2_COUNT * sizeof(int), "TASK_POWER_INFO_V2_COUNT"); /* check that the latest revision is the COUNT */ - T_QUIET; T_ASSERT_EQ(HOST_VM_INFO64_REV1_COUNT, HOST_VM_INFO64_COUNT, "HOST_VM_INFO64_REV1_COUNT"); + T_QUIET; T_ASSERT_EQ(HOST_VM_INFO64_REV2_COUNT, HOST_VM_INFO64_COUNT, "HOST_VM_INFO64_REV2_COUNT"); T_QUIET; T_ASSERT_EQ(HOST_VM_INFO_REV2_COUNT, HOST_VM_INFO_COUNT, "HOST_VM_INFO_REV2_COUNT"); /* check that the previous revision are smaller than the latest */ T_QUIET; T_ASSERT_LE(HOST_VM_INFO64_REV0_COUNT, HOST_VM_INFO64_REV1_COUNT, "HOST_VM_INFO64_REV0"); + T_QUIET; T_ASSERT_LE(HOST_VM_INFO64_REV1_COUNT, HOST_VM_INFO64_REV2_COUNT, "HOST_VM_INFO64_REV1"); T_QUIET; T_ASSERT_LE(HOST_VM_INFO_REV0_COUNT, HOST_VM_INFO_REV2_COUNT, "HOST_VM_INFO_REV0_COUNT"); T_QUIET; T_ASSERT_LE(HOST_VM_INFO_REV1_COUNT, HOST_VM_INFO_REV2_COUNT, "HOST_VM_INFO_REV1_COUNT"); T_QUIET; T_ASSERT_LE(TASK_POWER_INFO_COUNT, TASK_POWER_INFO_V2_COUNT, "TASK_POWER_INFO_COUNT"); diff --git a/tests/imm_pinned_control_port.c b/tests/imm_pinned_control_port.c index c5764c1db..90a64f6f1 100644 --- a/tests/imm_pinned_control_port.c +++ b/tests/imm_pinned_control_port.c @@ -21,8 +21,6 @@ extern char **environ; static uint64_t exception_code = 0; static exception_type_t exception_taken = 0; -#define IKOT_TASK_CONTROL 2 - #ifndef kGUARD_EXC_INVALID_OPTIONS #define kGUARD_EXC_INVALID_OPTIONS 3 #endif @@ -436,7 +434,7 @@ test_imm_pinned_control_port(const char *test_prog_name) } } -T_DECL(imm_pinned_control_port_hardened, "Test pinned & immovable task and thread control ports for hardened runtime binary", +T_DECL(imm_pinned_control_port_hardened, "Test pinned & immovable task and thread control ports for platform restrictions binary", T_META_IGNORECRASHES(".*pinned_rights_child.*"), T_META_CHECK_LEAKS(false)) { diff --git a/tests/imm_pinned_control_port_crasher.c b/tests/imm_pinned_control_port_crasher.c index 72970a04a..5635c664f 100644 --- a/tests/imm_pinned_control_port_crasher.c +++ b/tests/imm_pinned_control_port_crasher.c @@ -94,11 +94,13 @@ pinned_test_pthread_dealloc(void) static void pinned_test_task_self_dealloc(void) { - printf("[Crasher]: Deallocate mach_task_self twice\n"); + printf("[Crasher]: Deallocate 
mach_task_self thrice\n"); mach_port_t task_self = mach_task_self(); kern_return_t kr = mach_port_deallocate(task_self, task_self); assert(kr == 0); kr = mach_port_deallocate(task_self, task_self); + assert(kr == 0); + kr = mach_port_deallocate(task_self, task_self); printf("[Crasher pinned_test_task_self_dealloc] mach_port_deallocate returned %s \n.", mach_error_string(kr)); } @@ -107,7 +109,7 @@ static void pinned_test_task_self_mod_ref(void) { printf("[Crasher]: Mod refs mach_task_self() to 0\n"); - kern_return_t kr = mach_port_mod_refs(mach_task_self(), mach_task_self(), MACH_PORT_RIGHT_SEND, -2); + kern_return_t kr = mach_port_mod_refs(mach_task_self(), mach_task_self(), MACH_PORT_RIGHT_SEND, -3); printf("[Crasher pinned_test_task_self_mod_ref] mach_port_mod_refs returned %s \n.", mach_error_string(kr)); } @@ -355,18 +357,10 @@ cfi_test_msg_to_timer_port(void) } msg; kern_return_t kr; - natural_t kotype; - mach_vm_address_t addr; -#define IKOT_TIMER 8 timer = mk_timer_create(); assert(timer != MACH_PORT_NULL); - /* Make sure it's a kobject port */ - kr = mach_port_kobject(mach_task_self(), timer, &kotype, &addr); - assert(kr == KERN_SUCCESS); - assert(kotype == IKOT_TIMER); - msg.header.msgh_local_port = MACH_PORT_NULL; msg.header.msgh_remote_port = timer; msg.header.msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MAKE_SEND, 0, 0, 0); diff --git a/tests/inet6_addr_mode.c b/tests/inet6_addr_mode.c index 7581d1648..58e8413de 100644 --- a/tests/inet6_addr_mode.c +++ b/tests/inet6_addr_mode.c @@ -45,137 +45,6 @@ T_GLOBAL_META( static char ifname1[IF_NAMESIZE]; -/** -** stolen from bootp/bootplib/util.c -** -**/ - -#define ROUNDUP(a) \ - ((a) > 0 ? (1 + (((a) - 1) | (sizeof(u_int32_t) - 1))) : sizeof(u_int32_t)) - -static int -rt_xaddrs(char * cp, const char * cplim, struct rt_addrinfo * rtinfo) -{ - int i; - struct sockaddr * sa; - - bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info)); - for (i = 0; (i < RTAX_MAX) && (cp < cplim); i++) { - if ((rtinfo->rti_addrs & (1 << i)) == 0) { - continue; - } - sa = (struct sockaddr *)cp; - if ((cp + sa->sa_len) > cplim) { - return EINVAL; - } - rtinfo->rti_info[i] = sa; - cp += ROUNDUP(sa->sa_len); - } - return 0; -} - -/** -** stolen from bootp/IPConfiguration.bproj/iputil.c -** -** inet6_addrlist_* -**/ - -#define s6_addr16 __u6_addr.__u6_addr16 - -static char * -copy_if_info(unsigned int if_index, int af, int *ret_len_p) -{ - char * buf = NULL; - size_t buf_len = 0; - int mib[6]; - - mib[0] = CTL_NET; - mib[1] = PF_ROUTE; - mib[2] = 0; - mib[3] = af; - mib[4] = NET_RT_IFLIST; - mib[5] = (int)if_index; - - *ret_len_p = 0; - if (sysctl(mib, 6, NULL, &buf_len, NULL, 0) < 0) { - fprintf(stderr, "sysctl() size failed: %s", strerror(errno)); - goto failed; - } - buf_len *= 2; /* just in case something changes */ - buf = malloc(buf_len); - if (sysctl(mib, 6, buf, &buf_len, NULL, 0) < 0) { - free(buf); - buf = NULL; - fprintf(stderr, "sysctl() failed: %s", strerror(errno)); - goto failed; - } - *ret_len_p = (int)buf_len; - -failed: - return buf; -} - -static bool -inet6_get_linklocal_address(unsigned int if_index, struct in6_addr *ret_addr) -{ - char * buf = NULL; - char * buf_end; - int buf_len; - bool found = FALSE; - char *scan; - struct rt_msghdr *rtm; - - bzero(ret_addr, sizeof(*ret_addr)); - buf = copy_if_info(if_index, AF_INET6, &buf_len); - if (buf == NULL) { - goto done; - } - buf_end = buf + buf_len; - for (scan = buf; scan < buf_end; scan += rtm->rtm_msglen) { - struct ifa_msghdr * ifam; - struct rt_addrinfo info; - - /* ALIGN: buf aligned (from 
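The copy_if_info() helper being removed here follows the classic routing-sysctl idiom: call sysctl() once with a NULL buffer to learn the size, allocate with some headroom, then call again to fetch the NET_RT_IFLIST records; presumably an equivalent now lives in the shared net test library. A standalone sketch of that size-then-fetch pattern (fetch_iflist is an illustrative name, not part of the test library):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <stdio.h>
#include <stdlib.h>

/* Size the routing dump first, then fetch it. */
static char *
fetch_iflist(unsigned int if_index, int af, size_t *len_out)
{
	int mib[6] = { CTL_NET, PF_ROUTE, 0, af, NET_RT_IFLIST, (int)if_index };
	size_t len = 0;
	char *buf;

	if (sysctl(mib, 6, NULL, &len, NULL, 0) < 0) {
		return NULL;
	}
	len *= 2;               /* headroom in case the table grows in between */
	buf = malloc(len);
	if (buf != NULL && sysctl(mib, 6, buf, &len, NULL, 0) < 0) {
		free(buf);
		return NULL;
	}
	*len_out = len;
	return buf;
}

int
main(void)
{
	size_t len = 0;
	char *buf = fetch_iflist(if_nametoindex("lo0"), AF_INET6, &len);

	if (buf != NULL) {
		/* A real consumer walks rt_msghdr records here, advancing by
		 * rtm_msglen and rounding each sockaddr with ROUNDUP(). */
		printf("NET_RT_IFLIST returned %zu bytes\n", len);
		free(buf);
	}
	return 0;
}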
calling copy_if_info), scan aligned, - * cast ok. */ - rtm = (struct rt_msghdr *)(void *)scan; - if (rtm->rtm_version != RTM_VERSION) { - continue; - } - if (rtm->rtm_type == RTM_NEWADDR) { - errno_t error; - struct sockaddr_in6 *sin6_p; - - ifam = (struct ifa_msghdr *)rtm; - info.rti_addrs = ifam->ifam_addrs; - error = rt_xaddrs((char *)(ifam + 1), - ((char *)ifam) + ifam->ifam_msglen, - &info); - if (error) { - fprintf(stderr, "couldn't extract rt_addrinfo %s (%d)\n", - strerror(error), error); - goto done; - } - /* ALIGN: info.rti_info aligned (sockaddr), cast ok. */ - sin6_p = (struct sockaddr_in6 *)(void *)info.rti_info[RTAX_IFA]; - if (sin6_p == NULL - || sin6_p->sin6_len < sizeof(struct sockaddr_in6)) { - continue; - } - if (IN6_IS_ADDR_LINKLOCAL(&sin6_p->sin6_addr)) { - *ret_addr = sin6_p->sin6_addr; - ret_addr->s6_addr16[1] = 0; /* mask scope id */ - found = TRUE; - break; - } - } - } - -done: - if (buf != NULL) { - free(buf); - } - return found; -} static void cleanup(void) diff --git a/tests/ioc_str.h b/tests/ioc_str.h index c794ae2f1..9cf55794b 100644 --- a/tests/ioc_str.h +++ b/tests/ioc_str.h @@ -187,7 +187,6 @@ X(SIOCGIFFUNCTIONALTYPE) \ X(SIOCSIFNETSIGNATURE) \ X(SIOCGIFNETSIGNATURE) \ - X(SIOCGECNMODE) \ X(SIOCSECNMODE) \ X(SIOCSIFORDER) \ X(SIOCGIFORDER) \ @@ -244,6 +243,10 @@ X(SIOCGIFDISABLEINPUT) \ X(SIOCSIFDISABLEINPUT) \ X(SIOCGIFCONGESTEDLINK) \ - X(SIOCSIFCONGESTEDLINK) + X(SIOCSIFCONGESTEDLINK) \ + X(SIOCGIFL4S) \ + X(SIOCSIFL4S) \ + X(SIOCGINBANDWAKEPKT) \ + X(SIOCSINBANDWAKEPKT) #endif /* ioc_str_h */ diff --git a/tests/iokit/ioserviceusernotification_race.c b/tests/iokit/ioserviceusernotification_race.c index 4066d1a24..3e6661b7c 100644 --- a/tests/iokit/ioserviceusernotification_race.c +++ b/tests/iokit/ioserviceusernotification_race.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -22,93 +23,6 @@ T_GLOBAL_META( T_META_RADAR_COMPONENT_VERSION("IOKit"), T_META_OWNER("souvik_b")); - -static bool -ioclasscount(const char * className, size_t * result) -{ - bool ret = false; - kern_return_t status; - io_registry_entry_t root = IO_OBJECT_NULL; //must release - CFMutableDictionaryRef rootProps = NULL; //must release - CFStringRef classStr = CFStringCreateWithCString(kCFAllocatorDefault, className, kCFStringEncodingUTF8); //must release - - CFDictionaryRef diagnostics = NULL; //do not release - CFDictionaryRef classes = NULL; //do not release - CFNumberRef num = NULL; //do not release - int32_t num32; - - root = IORegistryGetRootEntry(kIOMainPortDefault); - status = IORegistryEntryCreateCFProperties(root, - &rootProps, kCFAllocatorDefault, kNilOptions); - if (KERN_SUCCESS != status) { - T_LOG("Error: Can't read registry root properties."); - goto finish; - } - if (CFDictionaryGetTypeID() != CFGetTypeID(rootProps)) { - T_LOG("Error: Registry root properties not a dictionary."); - goto finish; - } - - diagnostics = (CFDictionaryRef)CFDictionaryGetValue(rootProps, - CFSTR(kIOKitDiagnosticsKey)); - if (!diagnostics) { - T_LOG("Error: Allocation information missing."); - goto finish; - } - if (CFDictionaryGetTypeID() != CFGetTypeID(diagnostics)) { - T_LOG("Error: Allocation information not a dictionary."); - goto finish; - } - - classes = (CFDictionaryRef)CFDictionaryGetValue(diagnostics, CFSTR("Classes")); - if (!classes) { - T_LOG("Error: Class information missing."); - goto finish; - } - if (CFDictionaryGetTypeID() != CFGetTypeID(classes)) { - T_LOG("Error: Class information not a dictionary."); - goto finish; - } - - num = 
(CFNumberRef)CFDictionaryGetValue(classes, classStr); - if (!num) { - T_LOG("Error: Could not find class %s in dictionary.", className); - goto finish; - } - - if (CFNumberGetTypeID() != CFGetTypeID(num)) { - T_LOG("Error: Instance information not a number."); - goto finish; - } - - if (!CFNumberGetValue(num, kCFNumberSInt32Type, &num32)) { - T_LOG("Error: Failed to get number."); - goto finish; - } - - if (num32 < 0) { - T_LOG("Instance count is negative."); - goto finish; - } - - *result = (size_t)num32; - - ret = true; - -finish: - if (root != IO_OBJECT_NULL) { - IOObjectRelease(root); - } - if (rootProps != NULL) { - CFRelease(rootProps); - } - if (classStr != NULL) { - CFRelease(classStr); - } - - return ret; -} - static size_t absoluteDifference(size_t first, size_t second) { @@ -182,15 +96,27 @@ T_HELPER_DECL(ioserviceusernotification_race_helper, "ioserviceusernotification_ #define NUM_NOTIFICATION_ITERS 500 // how many times we should run the helper -#define NUM_HELPER_INVOCATIONS 50 +#define NUM_HELPER_INVOCATIONS 20 // when calling the external method, call in groups of N #define EXTERNAL_METHOD_GROUP_SIZE 5 -// How much ioclasscount variation to tolerate before we think we have a leak -#define IOCLASSCOUNT_LEAK_TOLERANCE 20 +// various sleep points in the test +#define WAIT_TIME1_MS 300 +#define WAIT_TIME2_MS 300 +#define WAIT_TIME3_MS 100 +#define WAIT_TIME4_MS 300 -T_DECL(ioserviceusernotification_race, "Test IOServiceUserNotification race", T_META_TAG_VM_PREFERRED) +// the test involves multiple sleep points. adding together they consume at least +// ((WAIT_TIME1_MS + WAIT_TIME2_MS) * NUM_HELPER_INVOCATIONS + WAIT_TIME3_MS * EXTERNAL_METHOD_GROUP_SIZE + WAIT_TIME4_MS) ms +// this (plus some leeway) should not exceed 30s +static_assert(((WAIT_TIME1_MS + WAIT_TIME2_MS) * NUM_HELPER_INVOCATIONS + WAIT_TIME3_MS * EXTERNAL_METHOD_GROUP_SIZE + WAIT_TIME4_MS) < 28 * MSEC_PER_SEC); + +// test is only run on macOS since slower platforms can cause timeout +T_DECL(ioserviceusernotification_race, + "Test IOServiceUserNotification race", + T_META_ENABLED(TARGET_OS_OSX), + T_META_TAG_VM_PREFERRED) { io_service_t service = IO_OBJECT_NULL; io_connect_t connect = IO_OBJECT_NULL; @@ -198,17 +124,9 @@ T_DECL(ioserviceusernotification_race, "Test IOServiceUserNotification race", T_ char test_path[MAXPATHLEN] = {0}; char * helper_args[] = { test_path, "-n", "ioserviceusernotification_race_helper", NULL }; io_iterator_t notificationIters[NUM_NOTIFICATION_ITERS]; - size_t initialIOServiceUserNotificationCount; - size_t initialIOServiceMessageUserNotificationCount; - size_t initialIOUserNotificationCount; - size_t finalIOServiceUserNotificationCount; - size_t finalIOServiceMessageUserNotificationCount; - size_t finalIOUserNotificationCount; + pid_t childPids[NUM_HELPER_INVOCATIONS] = {}; + size_t leaks = 1, outCount = 1; - // Initial class counts - T_ASSERT_TRUE(ioclasscount("IOServiceUserNotification", &initialIOServiceUserNotificationCount), "ioclasscount IOServiceUserNotification"); - T_ASSERT_TRUE(ioclasscount("IOServiceMessageUserNotification", &initialIOServiceMessageUserNotificationCount), "ioclasscount IOServiceMessageUserNotification"); - T_ASSERT_TRUE(ioclasscount("IOUserNotification", &initialIOUserNotificationCount), "ioclasscount IOUserNotification"); T_QUIET; T_ASSERT_POSIX_SUCCESS(proc_pidpath(getpid(), test_path, MAXPATHLEN), "get pid path"); T_QUIET; T_ASSERT_POSIX_SUCCESS(IOTestServiceFindService("TestIOServiceUserNotification", &service), @@ -216,7 +134,6 @@ 
T_DECL(ioserviceusernotification_race, "Test IOServiceUserNotification race", T_ T_QUIET; T_ASSERT_NE(service, MACH_PORT_NULL, "got service"); for (size_t i = 0; i < NUM_HELPER_INVOCATIONS; i++) { - pid_t child; if (connect == IO_OBJECT_NULL) { T_ASSERT_MACH_SUCCESS(IOServiceOpen(service, mach_task_self(), 1, &connect), "open service"); } @@ -224,24 +141,24 @@ T_DECL(ioserviceusernotification_race, "Test IOServiceUserNotification race", T_ T_QUIET; T_ASSERT_MACH_SUCCESS(IOConnectCallMethod(connect, 0, NULL, 0, NULL, 0, NULL, 0, NULL, NULL), "call external method"); - sleep(1); - dt_launch_tool(&child, helper_args, false, NULL, NULL); - T_LOG("launch helper -> pid %d", child); - sleep(1); + usleep(WAIT_TIME1_MS); + dt_launch_tool(&childPids[i], helper_args, false, NULL, NULL); + T_LOG("launch helper -> pid %d", childPids[i]); + usleep(WAIT_TIME2_MS); while (true) { for (size_t k = 0; k < EXTERNAL_METHOD_GROUP_SIZE; k++) { T_QUIET; T_ASSERT_MACH_SUCCESS(IOConnectCallMethod(connect, 0, NULL, 0, NULL, 0, NULL, 0, NULL, NULL), "call external method"); - usleep(100); + usleep(WAIT_TIME3_MS); } if ((random() % 1000) == 0) { break; } } - T_LOG("kill helper %d", child); - kill(child, SIGKILL); + T_LOG("kill helper %d", childPids[i]); + kill(childPids[i], SIGKILL); if ((random() % 3) == 0) { IOServiceClose(connect); @@ -249,6 +166,15 @@ T_DECL(ioserviceusernotification_race, "Test IOServiceUserNotification race", T_ } } + if (connect != IO_OBJECT_NULL) { + IOServiceClose(connect); + connect = IO_OBJECT_NULL; + } + + for (size_t i = 0; i < NUM_HELPER_INVOCATIONS; i++) { + waitpid(childPids[i], NULL, 0); + } + // Register for notifications for (size_t i = 0; i < sizeof(notificationIters) / sizeof(notificationIters[0]); i++) { T_QUIET; T_ASSERT_MACH_SUCCESS( @@ -256,7 +182,7 @@ T_DECL(ioserviceusernotification_race, "Test IOServiceUserNotification race", T_ "add notification"); } - sleep(1); + usleep(WAIT_TIME4_MS); // Release the notifications for (size_t i = 0; i < sizeof(notificationIters) / sizeof(notificationIters[0]); i++) { @@ -266,14 +192,17 @@ T_DECL(ioserviceusernotification_race, "Test IOServiceUserNotification race", T_ notificationIters[i] = MACH_PORT_NULL; } - // Check for leaks - T_ASSERT_TRUE(ioclasscount("IOServiceUserNotification", &finalIOServiceUserNotificationCount), "ioclasscount IOServiceUserNotification"); - T_ASSERT_TRUE(ioclasscount("IOServiceMessageUserNotification", &finalIOServiceMessageUserNotificationCount), "ioclasscount IOServiceMessageUserNotification"); - T_ASSERT_TRUE(ioclasscount("IOUserNotification", &finalIOUserNotificationCount), "ioclasscount IOUserNotification"); - T_ASSERT_LT(absoluteDifference(initialIOServiceUserNotificationCount, finalIOServiceUserNotificationCount), (size_t)IOCLASSCOUNT_LEAK_TOLERANCE, "did not leak IOServiceUserNotification"); - T_ASSERT_LT(absoluteDifference(initialIOServiceMessageUserNotificationCount, finalIOServiceMessageUserNotificationCount), (size_t)IOCLASSCOUNT_LEAK_TOLERANCE, "did not leak IOServiceMessageUserNotification"); - T_ASSERT_LT(absoluteDifference(initialIOUserNotificationCount, finalIOUserNotificationCount), (size_t)IOCLASSCOUNT_LEAK_TOLERANCE, "did not leak IOUserNotification"); + T_ASSERT_MACH_SUCCESS(IOServiceOpen(service, mach_task_self(), 1, &connect), "open service"); + T_ASSERT_MACH_SUCCESS(IOConnectCallMethod(connect, 1, + NULL, 0, NULL, 0, &leaks, &outCount, NULL, NULL), "call external method"); + + T_LOG("IOServiceUserNotification leak count: %llu", leaks); + + // Check for leaks + T_ASSERT_EQ(leaks, 0, 
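Instead of diffing ioclasscount snapshots, the rewritten check above asks the test driver itself how many notification objects it believes are still alive, via an external method that returns a single scalar. A hedged sketch of that call shape; selector 1 and the meaning of the returned value are properties of the TestIOServiceUserNotification driver and are assumed from the diff, and read_leak_count is an illustrative name. Link with -framework IOKit:

#include <IOKit/IOKitLib.h>

kern_return_t
read_leak_count(io_connect_t connect, uint64_t *leaks)
{
	uint32_t out_count = 1;                 /* expect one scalar back */

	return IOConnectCallMethod(connect, 1,  /* selector */
	           NULL, 0,                     /* no scalar inputs */
	           NULL, 0,                     /* no struct input */
	           leaks, &out_count,           /* scalar output: leak count */
	           NULL, NULL);                 /* no struct output */
}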
"leaked IOServiceUserNotification"); + + IOServiceClose(connect); IOObjectRelease(service); IONotificationPortDestroy(notifyPort); } diff --git a/tests/iokit/testiodataqueues.c b/tests/iokit/testiodataqueues.c new file mode 100644 index 000000000..1e251c170 --- /dev/null +++ b/tests/iokit/testiodataqueues.c @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__arm64__) +#include +#endif /* defined(__arm64__) */ + +// PT: These two files must be included before IOCircularDataQueueImplementation.h +#include +#include +#include +#if 0 +#include "device_user.h" +#include <../iokit/IOKit/IOCircularDataQueueImplementation.h> +#else +#include +#endif + +#include "service_helpers.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.iokit"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("IOKit")); + +T_DECL(iodataqueues, "Test IODataQueues", T_META_TAG_VM_PREFERRED) +{ + io_service_t service = IO_OBJECT_NULL; + io_connect_t connect = IO_OBJECT_NULL; + + T_QUIET; T_ASSERT_POSIX_SUCCESS(IOTestServiceFindService("TestIODataQueues", &service), + "Find service"); + T_QUIET; T_ASSERT_NE(service, MACH_PORT_NULL, "got service"); + + T_ASSERT_MACH_SUCCESS(IOServiceOpen(service, mach_task_self(), 1, &connect), "open service"); + + kern_return_t ret; + IOCircularDataQueue * queue; + + for (int cycle = 0; cycle < 2; cycle++) { + ret = IOCircularDataQueueCreateWithConnection(kIOCircularDataQueueCreateConsumer, connect, 53, &queue); +#if defined(__arm64__) && defined(__LP64__) + if (0 == (kHasFeatLSE2 & _get_cpu_capabilities())) { + assert(kIOReturnUnsupported == ret); + break; + } else { + assert(kIOReturnSuccess == ret); + } + + char buf[16]; + size_t length = sizeof(buf); + ret = IOCircularDataQueueCopyLatest(queue, &buf[0], &length); + assert(kIOReturnSuccess == ret); + printf("[%ld]%s\n", length, &buf[0]); + + if (0) { + // requires write access so disabled + ret = IOCircularDataQueueEnqueue(queue, "goodbye", sizeof("goodbye")); + assert(kIOReturnSuccess == ret); + } + ret = IOCircularDataQueueCopyLatest(queue, &buf[0], &length); + assert(kIOReturnSuccess == ret); + printf("[%ld]%s\n", length, &buf[0]); + + ret = IOCircularDataQueueDestroy(&queue); + assert(kIOReturnSuccess == ret); +#else /* defined(__arm64__) && defined(__LP64__) */ + assert(kIOReturnUnsupported == ret); + break; +#endif /* !(defined(__arm64__) && defined(__LP64__)) */ + } + + IOObjectRelease(service); +} diff --git a/tests/iokit/testiodataqueues.entitlements b/tests/iokit/testiodataqueues.entitlements new file mode 100644 index 000000000..95087bb66 --- /dev/null +++ b/tests/iokit/testiodataqueues.entitlements @@ -0,0 +1,12 @@ + + + + + com.apple.iokit.TestIODataQueues + + com.apple.iokit.test-service-management + + com.apple.security.iokit-user-client-class + TestIODataQueuesUserClient + + diff --git a/tests/iopolicy.c b/tests/iopolicy.c index d60d7a59c..070cc5786 100644 --- a/tests/iopolicy.c +++ b/tests/iopolicy.c @@ -94,6 +94,12 @@ T_DECL(iopol_type_vfs_disallow_rw_for_o_evtonly, T_ASSERT_EQ(err, IOPOL_VFS_DISALLOW_RW_FOR_O_EVTONLY_ON, "getiopolicy_np(IOPOL_TYPE_VFS_DISALLOW_RW_FOR_O_EVTONLY, IOPOL_SCOPE_PROCESS)"); + T_WITH_ERRNO; + err = getiopolicy_np(IOPOL_TYPE_VFS_HFS_CASE_SENSITIVITY, + IOPOL_SCOPE_THREAD); + T_ASSERT_TRUE((err == -1) && (errno == EINVAL), + "getiopolicy_np(IOPOL_TYPE_VFS_HFS_CASE_SENSITIVITY, IOPOL_SCOPE_THREAD)"); + T_WITH_ERRNO; 
err = setiopolicy_np(IOPOL_TYPE_VFS_DISALLOW_RW_FOR_O_EVTONLY, IOPOL_SCOPE_PROCESS, IOPOL_VFS_DISALLOW_RW_FOR_O_EVTONLY_OFF); diff --git a/tests/ip_pktinfo.c b/tests/ip_pktinfo.c new file mode 100644 index 000000000..4fd4c61ce --- /dev/null +++ b/tests/ip_pktinfo.c @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#define __APPLE_USE_RFC_3542 1 + +#include + +#include +#include +#include +#include + +#include "net_test_lib.h" + +#define MAX_IPv4_STR_LEN 16 +#define MAX_IPv6_STR_LEN 64 + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.net"), + T_META_ASROOT(true), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("networking"), + T_META_CHECK_LEAKS(false)); + +static char *ifname1; +static char *ifname2; + +#define IPV4_MULTICAST_ADDR_STR "239.1.2.3" +#define IPV6_MULTICAST_ADDR_STR "FF12:0:0:0:0:0:0:FC" + +#define TEN_NET 0x0a000000 +#define TEN_1_NET (TEN_NET | 0x010000) +#define TEN_1_BROADCAST (TEN_1_NET | 0xff) + +static network_interface_pair_list_t S_feth_pairs; + + +static char *data = "hello\n"; + +static bool success = false; + +static void +get_ipv4_address(u_int unit, u_int addr_index, struct in_addr *ip) +{ + /* up to 255 units, 255 addresses */ + ip->s_addr = htonl(TEN_1_NET | (unit << 8) | addr_index); + return; +} + +static void +network_interface_assign_address(network_interface_t netif, + unsigned int unit, unsigned int address_index) +{ + get_ipv4_address(unit, address_index, &netif->ip); + ifnet_add_ip_address(netif->if_name, netif->ip, + inet_class_c_subnet_mask); + route_add_inet_scoped_subnet(netif->if_name, netif->if_index, + netif->ip, inet_class_c_subnet_mask); + ifnet_start_ipv6(netif->if_name); + T_ASSERT_EQ(inet6_get_linklocal_address(netif->if_index, &netif->ip6), 1, NULL); +} + +static void +initialize_feth_pairs(u_int n, bool need_address) +{ + network_interface_pair_t scan; + + S_feth_pairs = network_interface_pair_list_alloc(n); + scan = S_feth_pairs->list; + for (unsigned int i = 0; i < n; i++, scan++) { + network_interface_create(&scan->one, FETH_NAME); + network_interface_create(&scan->two, FETH_NAME); + if (need_address) { + network_interface_assign_address(&scan->one, i, 1); + network_interface_assign_address(&scan->two, i, 2); + } + 
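get_ipv4_address() above packs the feth pair number and the address index into the 10.1.0.0/16 test range: the unit becomes the third octet and the address index the fourth, so unit 2 with index 1 yields 10.1.2.1. A tiny standalone check of that arithmetic:

#include <arpa/inet.h>
#include <stdio.h>

#define TEN_NET        0x0a000000               /* 10.0.0.0 */
#define TEN_1_NET      (TEN_NET | 0x010000)     /* 10.1.0.0 */

int
main(void)
{
	struct in_addr ip;
	unsigned int unit = 2, addr_index = 1;

	ip.s_addr = htonl(TEN_1_NET | (unit << 8) | addr_index);
	printf("%s\n", inet_ntoa(ip));   /* prints 10.1.2.1 */
	return 0;
}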
fake_set_peer(scan->one.if_name, scan->two.if_name); + } + + ifname1 = S_feth_pairs->list->one.if_name; + ifname2 = S_feth_pairs->list->two.if_name; +} + +static void +cleanup(void) +{ + network_interface_pair_list_destroy(S_feth_pairs); + /* allow for the detach to be final before the next test */ + usleep(100000); +} + +static void +init(void) +{ + T_ATEND(cleanup); + + success = false; + initialize_feth_pairs(1, true); +} + +static int +setup_receiver(char *bind_to_ifname, bool bind_to_port, in_addr_t bind_to_addr, in_port_t *bound_port) +{ + int receiver_fd; + socklen_t solen; + struct sockaddr_in sin = {}; + char ifname[IFNAMSIZ]; + char laddr_str[MAX_IPv4_STR_LEN]; + struct timeval tv = { .tv_sec = 1, .tv_usec = 0 }; + int optval; + + /* + * Setup receiver bound to ifname1 + */ + T_ASSERT_POSIX_SUCCESS(receiver_fd = socket(AF_INET, SOCK_DGRAM, 0), NULL); + + T_ASSERT_POSIX_SUCCESS(setsockopt(receiver_fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(struct timeval)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(receiver_fd, SOL_SOCKET, SO_DEBUG, &optval, sizeof(int)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(receiver_fd, IPPROTO_IP, IP_RECVPKTINFO, &optval, sizeof(int)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(receiver_fd, IPPROTO_UDP, UDP_NOCKSUM, &optval, sizeof(int)), NULL); + + if (bind_to_ifname != NULL) { + solen = strlen(bind_to_ifname); + T_ASSERT_POSIX_SUCCESS(setsockopt(receiver_fd, SOL_SOCKET, SO_BINDTODEVICE, bind_to_ifname, solen), NULL); + } + + if (bind_to_port || bind_to_addr != INADDR_ANY) { + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr.s_addr = bind_to_addr; + T_ASSERT_POSIX_SUCCESS(bind(receiver_fd, (struct sockaddr *)&sin, sizeof(struct sockaddr_in)), NULL); + } + solen = sizeof(struct sockaddr_in); + T_ASSERT_POSIX_SUCCESS(getsockname(receiver_fd, (struct sockaddr *)&sin, &solen), NULL); + inet_ntop(AF_INET, &sin.sin_addr, laddr_str, sizeof(laddr_str)); + + solen = sizeof(ifname); + T_ASSERT_POSIX_SUCCESS(getsockopt(receiver_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, &solen), NULL); + + T_LOG("receiver bound to %s:%u over '%s'", laddr_str, ntohs(sin.sin_port), ifname); + + *bound_port = sin.sin_port; + return receiver_fd; +} + +int +setup_sender(char *bind_to_ifname, in_addr_t connect_to_addr, in_port_t connect_to_port) +{ + int sender_fd; + struct sockaddr_in connect_to_sin = {}; + struct sockaddr_in sin = {}; + socklen_t solen; + char laddr_str[MAX_IPv4_STR_LEN]; + char faddr_str[MAX_IPv4_STR_LEN]; + char ifname[IFNAMSIZ]; + struct timeval tv = { .tv_sec = 1, .tv_usec = 0 }; + int optval; + + T_ASSERT_POSIX_SUCCESS(sender_fd = socket(AF_INET, SOCK_DGRAM, 0), NULL); + + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(struct timeval)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, SOL_SOCKET, SO_DEBUG, &optval, sizeof(int)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, IPPROTO_IP, IP_RECVPKTINFO, &optval, sizeof(int)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, IPPROTO_UDP, UDP_NOCKSUM, &optval, sizeof(int)), NULL); + + if (bind_to_ifname != NULL) { + solen = strlen(bind_to_ifname); + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, SOL_SOCKET, SO_BINDTODEVICE, bind_to_ifname, solen), NULL); + } + + connect_to_sin.sin_family = AF_INET; + connect_to_sin.sin_len = sizeof(struct sockaddr_in); + connect_to_sin.sin_port = connect_to_port; + connect_to_sin.sin_addr.s_addr = 
connect_to_addr; + + T_ASSERT_POSIX_SUCCESS(connect(sender_fd, (struct sockaddr *)&connect_to_sin, sizeof(struct sockaddr_in)), NULL); + + solen = sizeof(struct sockaddr_in); + T_ASSERT_POSIX_SUCCESS(getsockname(sender_fd, (struct sockaddr *)&sin, &solen), NULL); + inet_ntop(AF_INET, &sin.sin_addr, laddr_str, sizeof(laddr_str)); + inet_ntop(AF_INET, &connect_to_sin.sin_addr, faddr_str, sizeof(faddr_str)); + + solen = sizeof(ifname); + T_ASSERT_POSIX_SUCCESS(getsockopt(sender_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, &solen), NULL); + + T_LOG("sender_fd connected from %s:%u to %s:%u over '%s'", + laddr_str, ntohs(sin.sin_port), faddr_str, ntohs(connect_to_sin.sin_port), + ifname); + + return sender_fd; +} + + +static void +echo(int receiver_fd, bool by_ip_addr) +{ + struct msghdr recvmsghdr = {}; + char control_space[CMSG_SPACE(128)] = {}; + char packet_space[1500] = {}; + struct cmsghdr *cmsg; + ssize_t retval; + struct iovec recv_iov = {}; + struct sockaddr_in peer_addr; + struct in_pktinfo recv_in_pktinfo = {}; + struct in_pktinfo send_in_pktinfo = {}; + char ifname[IFNAMSIZ] = {}; + struct msghdr reply_msg = {}; + struct iovec reply_iov = {}; + char reply_control_space[CMSG_SPACE(128)] = {}; + char spec_dst_str[MAX_IPv4_STR_LEN]; + char addr_str[MAX_IPv4_STR_LEN]; + char peer_addr_str[MAX_IPv4_STR_LEN]; + + T_LOG("%s(by_ip_addr: %s)", __func__, by_ip_addr ? "true" : "false"); + + recv_iov.iov_len = sizeof(packet_space); + recv_iov.iov_base = &packet_space; + + recvmsghdr.msg_name = &peer_addr; + recvmsghdr.msg_namelen = sizeof(struct sockaddr_in); + recvmsghdr.msg_iov = &recv_iov; + recvmsghdr.msg_iovlen = 1; + recvmsghdr.msg_control = &control_space; + recvmsghdr.msg_controllen = sizeof(control_space); + recvmsghdr.msg_flags = 0; + + T_ASSERT_POSIX_SUCCESS(retval = recvmsg(receiver_fd, &recvmsghdr, 0), NULL); + + for (cmsg = CMSG_FIRSTHDR(&recvmsghdr); cmsg != NULL; cmsg = CMSG_NXTHDR(&recvmsghdr, cmsg)) { + if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVPKTINFO) { + T_ASSERT_EQ(CMSG_LEN(sizeof(struct in_pktinfo)), (size_t)cmsg->cmsg_len, + "CMSG_LEN(struct in_pktinfo), (size_t)cmsg->cmsg_len"); + memcpy(&recv_in_pktinfo, CMSG_DATA(cmsg), sizeof(struct in_pktinfo)); + } + } + + ifname[0] = 0; + if_indextoname(recv_in_pktinfo.ipi_ifindex, ifname); + inet_ntop(AF_INET, &recv_in_pktinfo.ipi_spec_dst, spec_dst_str, sizeof(spec_dst_str)); + inet_ntop(AF_INET, &recv_in_pktinfo.ipi_addr, addr_str, sizeof(addr_str)); + inet_ntop(AF_INET, &peer_addr.sin_addr, peer_addr_str, sizeof(peer_addr_str)); + + T_LOG("received %ld bytes from %s:%u with IP_RECVPKTINFO ipi_ifindex: %u (%s) ipi_spec_dst: %s ipi_addr: %s", + retval, peer_addr_str, ntohs(peer_addr.sin_port), + recv_in_pktinfo.ipi_ifindex, ifname, spec_dst_str, addr_str); + + reply_iov.iov_base = packet_space; + reply_iov.iov_len = retval; + + reply_msg.msg_name = &peer_addr; + reply_msg.msg_namelen = sizeof(struct sockaddr_in); + reply_msg.msg_iov = &reply_iov; + reply_msg.msg_iovlen = 1; + reply_msg.msg_control = reply_control_space; + reply_msg.msg_controllen = CMSG_SPACE(sizeof(struct in_pktinfo)); + + send_in_pktinfo.ipi_addr.s_addr = 0; + if (by_ip_addr) { + send_in_pktinfo.ipi_ifindex = 0; + send_in_pktinfo.ipi_spec_dst.s_addr = recv_in_pktinfo.ipi_addr.s_addr; + } else { + send_in_pktinfo.ipi_ifindex = recv_in_pktinfo.ipi_ifindex; + send_in_pktinfo.ipi_spec_dst.s_addr = 0; + } + cmsg = CMSG_FIRSTHDR(&reply_msg); + cmsg->cmsg_level = IPPROTO_IP; + cmsg->cmsg_type = IP_PKTINFO; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct 
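echo() above reads the IP_PKTINFO control message off each datagram to learn the arrival interface and destination address, then mirrors those values back on the reply. A minimal receive-side sketch of the IP_RECVPKTINFO idiom, independent of the feth setup used by the test (port 12345 is arbitrary):

#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int on = 1;
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_len = sizeof(struct sockaddr_in),
		.sin_port = htons(12345),
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};

	/* Ask the kernel to attach packet info to every received datagram. */
	setsockopt(fd, IPPROTO_IP, IP_RECVPKTINFO, &on, sizeof(on));
	bind(fd, (struct sockaddr *)&sin, sizeof(sin));

	char payload[1500];
	char control[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct sockaddr_in peer;
	struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
	struct msghdr mh = {
		.msg_name = &peer, .msg_namelen = sizeof(peer),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};

	ssize_t n = recvmsg(fd, &mh, 0);
	if (n < 0) {
		return 1;
	}
	for (struct cmsghdr *cm = CMSG_FIRSTHDR(&mh); cm != NULL; cm = CMSG_NXTHDR(&mh, cm)) {
		if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_RECVPKTINFO) {
			struct in_pktinfo pi;
			char ifname[IFNAMSIZ] = "?", dst[INET_ADDRSTRLEN];

			memcpy(&pi, CMSG_DATA(cm), sizeof(pi));
			if_indextoname(pi.ipi_ifindex, ifname);
			inet_ntop(AF_INET, &pi.ipi_addr, dst, sizeof(dst));
			printf("%zd bytes arrived on %s for %s\n", n, ifname, dst);
		}
	}
	return 0;
}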
in_pktinfo)); + memcpy(CMSG_DATA(cmsg), &send_in_pktinfo, sizeof(struct in_pktinfo)); + + ifname[0] = 0; + if_indextoname(send_in_pktinfo.ipi_ifindex, ifname); + inet_ntop(AF_INET, &send_in_pktinfo.ipi_spec_dst, spec_dst_str, sizeof(spec_dst_str)); + inet_ntop(AF_INET, &send_in_pktinfo.ipi_addr, addr_str, sizeof(addr_str)); + + T_LOG("sending %ld bytes to %s:%u with IP_PKTINFO ipi_ifindex: %u (%s) ipi_spec_dst: %s ipi_addr: %s", + retval, peer_addr_str, ntohs(peer_addr.sin_port), + send_in_pktinfo.ipi_ifindex, ifname, spec_dst_str, addr_str); + + T_ASSERT_POSIX_SUCCESS(retval = sendmsg(receiver_fd, &reply_msg, 0), NULL); +} + +static void +echo_and_check(int receiver_fd, bool by_ip_addr) +{ + socklen_t solen; + struct sockaddr_in before_sin = {}; + char before_ifname[IFNAMSIZ]; + u_int before_ifindex; + struct sockaddr_in after_sin = {}; + char after_ifname[IFNAMSIZ]; + u_int after_ifindex; + char before_addr_str[MAX_IPv4_STR_LEN]; + char after_addr_str[MAX_IPv4_STR_LEN]; + + T_LOG("%s(by_ip_addr: %s)", __func__, by_ip_addr ? "true" : "false"); + + solen = sizeof(struct sockaddr_in); + T_ASSERT_POSIX_SUCCESS(getsockname(receiver_fd, (struct sockaddr *)&before_sin, &solen), NULL); + inet_ntop(AF_INET, &before_sin.sin_addr, before_addr_str, sizeof(before_addr_str)); + + solen = sizeof(before_ifname); + T_ASSERT_POSIX_SUCCESS(getsockopt(receiver_fd, SOL_SOCKET, SO_BINDTODEVICE, before_ifname, &solen), NULL); + before_ifindex = if_nametoindex(before_ifname); + + echo(receiver_fd, by_ip_addr); + + solen = sizeof(struct sockaddr_in); + T_ASSERT_POSIX_SUCCESS(getsockname(receiver_fd, (struct sockaddr *)&after_sin, &solen), NULL); + inet_ntop(AF_INET, &after_sin.sin_addr, after_addr_str, sizeof(after_addr_str)); + + solen = sizeof(after_ifname); + T_ASSERT_POSIX_SUCCESS(getsockopt(receiver_fd, SOL_SOCKET, SO_BINDTODEVICE, after_ifname, &solen), NULL); + after_ifindex = if_nametoindex(after_ifname); + + + T_LOG("before bound to %s:%u over '%s'/%u", before_addr_str, ntohs(before_sin.sin_port), before_ifname, before_ifindex); + T_LOG("after bound to %s:%u over '%s'/%u", after_addr_str, ntohs(after_sin.sin_port), after_ifname, after_ifindex); + + T_ASSERT_EQ_USHORT(before_sin.sin_port, after_sin.sin_port, "same port"); + T_ASSERT_EQ_UINT(before_sin.sin_addr.s_addr, after_sin.sin_addr.s_addr, "same IP address"); + T_ASSERT_EQ_UINT(before_ifindex, after_ifindex, "same interface index"); +} + +static void +do_test_ip_pktinfo(bool bind_to_device, bool bind_to_port, in_addr_t bind_to_addr) +{ + int receiver_fd; + in_port_t receiver_port = 0; + int sender_fd; + ssize_t retval; + + init(); + + receiver_fd = setup_receiver(bind_to_device ? ifname1 : NULL, + bind_to_port, + bind_to_addr ? 
S_feth_pairs->list->one.ip.s_addr : INADDR_ANY, + &receiver_port); + sender_fd = setup_sender(ifname2, S_feth_pairs->list->one.ip.s_addr, receiver_port); + + T_ASSERT_POSIX_SUCCESS(retval = send(sender_fd, data, strlen(data) + 1, 0), NULL); + echo_and_check(receiver_fd, true); + + T_ASSERT_POSIX_SUCCESS(retval = send(sender_fd, data, strlen(data) + 1, 0), NULL); + echo_and_check(receiver_fd, false); + + close(sender_fd); + close(receiver_fd); + + success = true; +} + + +T_DECL(ip_pktinfo_010, "IP_PTKINFO bind_to_device=false bind_to_port=true bind_to_addr=false") +{ + do_test_ip_pktinfo(false, true, false); +} + +T_DECL(ip_pktinfo_011, "IP_PTKINFO bind_to_device=false bind_to_port=true bind_to_addr=true") +{ + do_test_ip_pktinfo(false, true, true); +} + +T_DECL(ip_pktinfo_110, "IP_PTKINFO bind_to_device=true bind_to_port=true bind_to_addr=false") +{ + do_test_ip_pktinfo(true, true, false); +} + +T_DECL(ip_pktinfo_111, "IP_PTKINFO bind_to_device=true bind_to_port=true bind_to_addr=true") +{ + do_test_ip_pktinfo(true, true, true); +} diff --git a/tests/ipc/hardened_exceptions.c b/tests/ipc/hardened_exceptions.c index e6b058874..83ece48fa 100644 --- a/tests/ipc/hardened_exceptions.c +++ b/tests/ipc/hardened_exceptions.c @@ -44,7 +44,6 @@ struct mach_exception_options { thread_state_flavor_t flavors_allowed; }; -#if __arm64__ static void bad_access_func(void) { @@ -53,11 +52,11 @@ bad_access_func(void) T_QUIET; T_LOG("Recoverd!"); return; } -#endif /* __arm64__ */ static int num_exceptions = 0; static uint32_t signing_key = (uint32_t)(0xa8000000 & 0xff000000); + static size_t exc_handler_state_identity_protected( task_id_token_t token, @@ -123,8 +122,7 @@ create_hardened_exception_port(const struct mach_exception_options meo, { #if !__arm64__ T_SKIP("Hardened exceptions not supported on !arm64"); - return MACH_PORT_NULL; -#else /* !__arm64__ */ +#endif /* !__arm64__ */ kern_return_t kr; mach_port_t exc_port; mach_port_options_t opts = { @@ -142,18 +140,17 @@ create_hardened_exception_port(const struct mach_exception_options meo, T_ASSERT_NE_UINT(exc_port, 0, "new exception port not null"); return exc_port; -#endif /* !__arm64__ */ } T_DECL(hardened_exceptions_default, "Test creating and using hardened exception ports") { #if !__arm64__ T_SKIP("Hardened exceptions not supported on !arm64"); -#else /* !__arm64__ */ +#endif /* !__arm64__ */ struct mach_exception_options meo; meo.exceptions_allowed = EXC_MASK_BAD_ACCESS; meo.behaviors_allowed = EXCEPTION_STATE_IDENTITY_PROTECTED | MACH_EXCEPTION_CODES; - meo.flavors_allowed = ARM_THREAD_STATE64; + meo.flavors_allowed = EXCEPTION_THREAD_STATE; mach_port_t exc_port = create_hardened_exception_port(meo, signing_key); @@ -165,7 +162,6 @@ T_DECL(hardened_exceptions_default, bad_access_func(); printf("Successfully recovered from the exception!\n"); -#endif /* !__arm64__ */ } extern char *__progname; @@ -175,11 +171,11 @@ T_DECL(entitled_process_exceptions_disallowed, T_META_IGNORECRASHES("*hardened_exceptions_entitled")) { #if !__arm64__ T_SKIP("Hardened exceptions not supported on !arm64"); -#else /* !__arm64__ */ +#endif /* !__arm64__ */ bool entitled = strstr(__progname, "entitled") != NULL; bool debugger = strstr(__progname, "debugger") != NULL; - /* thread_set_exception_ports as a hardened binary should fail */ + /* thread_set_exception_ports as a platform restrictions binary should fail */ kern_return_t kr = thread_set_exception_ports( mach_thread_self(), EXC_MASK_ALL, @@ -196,5 +192,4 @@ T_DECL(entitled_process_exceptions_disallowed, } else { 
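The entitled_process_exceptions_disallowed test above registers a per-thread exception handler with thread_set_exception_ports() and expects the call to be refused when platform restrictions apply. For reference, a minimal sketch of what that registration looks like on an unrestricted process, deliberately using the plain EXCEPTION_DEFAULT behavior rather than the hardened identity-protected flavor the test exercises:

#include <mach/mach.h>
#include <stdio.h>

int
main(void)
{
	mach_port_t exc_port = MACH_PORT_NULL;
	kern_return_t kr;

	/* A receive right plus a send right to hand to the kernel. */
	kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exc_port);
	if (kr != KERN_SUCCESS) {
		return 1;
	}
	kr = mach_port_insert_right(mach_task_self(), exc_port, exc_port,
	    MACH_MSG_TYPE_MAKE_SEND);
	if (kr != KERN_SUCCESS) {
		return 1;
	}

	/* On a platform-restricted binary this registration is expected to be
	 * refused; otherwise it normally succeeds. */
	kr = thread_set_exception_ports(mach_thread_self(), EXC_MASK_BAD_ACCESS,
	    exc_port, EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES, THREAD_STATE_NONE);
	printf("thread_set_exception_ports: 0x%x\n", kr);
	return 0;
}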
T_FAIL("invalid configuration"); } -#endif /* !__arm64__ */ } diff --git a/tests/ipc/ipc_read_inspect.c b/tests/ipc/ipc_read_inspect.c index 9350f53f5..963261d3e 100644 --- a/tests/ipc/ipc_read_inspect.c +++ b/tests/ipc/ipc_read_inspect.c @@ -14,16 +14,6 @@ #include #include -#define IKOT_THREAD_CONTROL 1 -#define IKOT_THREAD_READ 47 -#define IKOT_THREAD_INSPECT 46 - -#define IKOT_TASK_CONTROL 2 -#define IKOT_TASK_READ 45 -#define IKOT_TASK_INSPECT 44 -#define IKOT_TASK_NAME 20 - - /* * This test verifies various security properties for task and thread * read/inspect interfaces. Specifically, it checks and makes sure: @@ -159,7 +149,7 @@ test_task_threads( thread_array_t threadList; mach_msg_type_number_t threadCount = 0; - unsigned int kotype; + ipc_info_object_type_t kotype; unsigned int kaddr; T_LOG("Testing task_threads() with task flavor %d", flavor); @@ -191,13 +181,13 @@ test_task_threads( } switch (flavor) { case TASK_FLAVOR_CONTROL: - T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_CONTROL, "Task control port should yield thread control port"); + T_QUIET; T_EXPECT_EQ(kotype, IPC_OTYPE_THREAD_CONTROL, "Task control port should yield thread control port"); break; case TASK_FLAVOR_READ: - T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_READ, "Task read port should yield thread read port"); + T_QUIET; T_EXPECT_EQ(kotype, IPC_OTYPE_THREAD_READ, "Task read port should yield thread read port"); break; case TASK_FLAVOR_INSPECT: - T_QUIET; T_EXPECT_EQ(kotype, IKOT_THREAD_INSPECT, "Task inspect port should yield thread inspect port"); + T_QUIET; T_EXPECT_EQ(kotype, IPC_OTYPE_THREAD_INSPECT, "Task inspect port should yield thread inspect port"); break; default: T_FAIL("task_threads() returned thread ports with task name port??"); @@ -223,7 +213,7 @@ test_processor_set_tasks( mach_msg_type_number_t pcnt = 0, tcnt = 0; mach_port_t host = mach_host_self(); - unsigned int kotype; + ipc_info_object_type_t kotype; unsigned int kaddr; T_LOG("Testing processor_set_tasks() with task flavor %d", flavor); @@ -255,16 +245,16 @@ test_processor_set_tasks( } switch (flavor) { case TASK_FLAVOR_CONTROL: - T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_CONTROL, "TASK_FLAVOR_CONTROL should yield control ports"); + T_QUIET; T_EXPECT_EQ(kotype, IPC_OTYPE_TASK_CONTROL, "TASK_FLAVOR_CONTROL should yield control ports"); break; case TASK_FLAVOR_READ: - T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_READ, "TASK_FLAVOR_READ should yield read ports"); + T_QUIET; T_EXPECT_EQ(kotype, IPC_OTYPE_TASK_READ, "TASK_FLAVOR_READ should yield read ports"); break; case TASK_FLAVOR_INSPECT: - T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_INSPECT, "TASK_FLAVOR_INSPECT should yield inspect ports"); + T_QUIET; T_EXPECT_EQ(kotype, IPC_OTYPE_TASK_INSPECT, "TASK_FLAVOR_INSPECT should yield inspect ports"); break; case TASK_FLAVOR_NAME: - T_QUIET; T_EXPECT_EQ(kotype, IKOT_TASK_NAME, "TASK_FLAVOR_NAME should yield name ports"); + T_QUIET; T_EXPECT_EQ(kotype, IPC_OTYPE_TASK_NAME, "TASK_FLAVOR_NAME should yield name ports"); break; default: T_FAIL("strange flavor"); @@ -476,7 +466,6 @@ test_thread_port_mig_intrans( mach_msg_type_number_t count = THREAD_QOS_POLICY_COUNT; boolean_t get_default = FALSE; - processor_set_name_t name = MACH_PORT_NULL; kr = thread_policy_get(tport, THREAD_QOS_POLICY, (thread_policy_t)&info, &count, &get_default); RESULT_CHECK(kr, flavor, THREAD_FLAVOR_INSPECT, "thread_policy_get"); diff --git a/tests/ipc/ipc_thread_ports_race.c b/tests/ipc/ipc_thread_ports_race.c index 0087df0bc..bd5f2611d 100644 --- a/tests/ipc/ipc_thread_ports_race.c +++ 
b/tests/ipc/ipc_thread_ports_race.c @@ -37,10 +37,14 @@ thread_creation_bomb_one(void *_ctx __unused, size_t _i __unused) for (int i = 0; i < BATCH; i++) { int rc = pthread_create(&th[i], NULL, thread_do_nothing, NULL); + if (rc == EAGAIN) { + th[i] = NULL; + continue; + } T_QUIET; T_ASSERT_EQ(rc, 0, "pthread_create[%d]", i); } - for (int i = 0; i < BATCH; i++) { + for (int i = 0; i < BATCH && th[i]; i++) { int rc = pthread_join(th[i], NULL); T_QUIET; T_ASSERT_EQ(rc, 0, "pthread_join[%d]", i); } diff --git a/tests/ipc/ipcpv_telemetry_test.c b/tests/ipc/ipcpv_telemetry_test.c new file mode 100644 index 000000000..1b44d2f89 --- /dev/null +++ b/tests/ipc/ipcpv_telemetry_test.c @@ -0,0 +1,31 @@ +#include +#include +#include +#include +#include +#include +#include "test_utils.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("IPC"), + T_META_TIMEOUT(10), + T_META_RUN_CONCURRENTLY(TRUE)); + +T_DECL(ipcpv_telemetry_test, + "Make sure there is no telemetry for known issues during presub testing", + T_META_CHECK_LEAKS(false), + XNU_T_META_REQUIRES_DEVELOPMENT_KERNEL, + T_META_TAG_VM_PREFERRED) +{ + unsigned int telemetry_count = 0; + size_t telemetry_size = sizeof(telemetry_count); + int kr = sysctlbyname("debug.ipcpv_telemetry_count", &telemetry_count, &telemetry_size, NULL, 0); + if (kr != 0) { + T_SKIP("sysctl debug.ipcpv_telemetry_count"); + exit(0); + } + T_ASSERT_EQ(kr, 0, "sysctl debug.ipcpv_telemetry_count"); + T_ASSERT_EQ(telemetry_count, 0, "found ipc policy violation during presub"); +} diff --git a/tests/ipc/mach_exc_port_substitute.c b/tests/ipc/mach_exc_port_substitute.c index 0d63c35cc..4ece19d59 100644 --- a/tests/ipc/mach_exc_port_substitute.c +++ b/tests/ipc/mach_exc_port_substitute.c @@ -105,12 +105,13 @@ catch_mach_exception_raise( } else { T_ASSERT_EQ(task, child_read, "out-of-process delivers read port"); - uint64_t type, addr; - kern_return_t kr = mach_port_kobject(mach_task_self(), task, &type, &addr); + uint64_t addr; + ipc_info_object_type_t otype; + kern_return_t kr = mach_port_kobject(mach_task_self(), task, + &otype, &addr); T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_kobject"); - -#define IKOT_TASK_READ 45 /* ipc_kobject.h */ - T_ASSERT_EQ(type, IKOT_TASK_READ, "task type must be IKOT_TASK_READ"); + T_ASSERT_EQ(otype, IPC_OTYPE_TASK_READ, + "task type must be IPC_OTYPE_TASK_READ"); } T_END; @@ -218,7 +219,7 @@ T_DECL(mach_exc_port_substitute_oop, "test out of process exception with read po } else { char *buf[3]; close(fds[1]); - int ret = read(fds[0], buf, sizeof(buf)); + read(fds[0], buf, sizeof(buf)); T_LOG("Child woke up from read, about to trip on bkpt"); __builtin_debugtrap(); /* Generate EXC_BREAKPOINT for all platforms */ diff --git a/tests/ipc/mach_msg2.c b/tests/ipc/mach_msg2.c index 930c7dbdb..f8ab96bbc 100644 --- a/tests/ipc/mach_msg2.c +++ b/tests/ipc/mach_msg2.c @@ -30,7 +30,7 @@ T_GLOBAL_META( #define MACH_MSG 1 #define MACH_MSG2 2 -#define MACH_MSG2_TEST_COUNT 16 +#define MACH_MSG2_TEST_COUNT 17 struct msg_rcv_args { mach_port_t rcv_port; @@ -159,7 +159,6 @@ do_msg_rcv(void * _Nullable arg) mach_port_t msg_rcv_port = ((struct msg_rcv_args *)arg)->rcv_port; mach_msg_vector_t data_vec[2]; kern_return_t kr; - mach_msg_header_t emptry_header = {}; msg_rcv_buffer_t message_buffer; inline_message_t *msg; @@ -192,7 +191,7 @@ do_msg_rcv(void * _Nullable arg) 0, 0); } else { - kr = mach_msg(msg, + kr = mach_msg((void *)msg, MACH_RCV_MSG, 0, rcv_configs[i].rcv_size, msg_rcv_port, 0, 0); } @@ 
-205,7 +204,7 @@ do_msg_rcv(void * _Nullable arg) if (kr == KERN_SUCCESS) { /* verify message proper carries correct data and port */ - T_QUIET; T_EXPECT_EQ(msg->data, MESSAGE_DATA_BYTES, "message should carry correct value"); + T_QUIET; T_EXPECT_EQ(msg->data, (uint64_t)MESSAGE_DATA_BYTES, "message should carry correct value"); T_QUIET; T_EXPECT_EQ(msg->header.msgh_remote_port, send_port, "port name should match"); T_QUIET; T_EXPECT_EQ(msg->header.msgh_local_port, msg_rcv_port, "port name should match"); T_QUIET; T_EXPECT_EQ(msg->header.msgh_id, 4141, "ID should match"); @@ -460,7 +459,7 @@ T_DECL(mach_msg2_combined_send_rcv, "Test mach_msg2() combined send/rcv") T_EXPECT_EQ(kr, MACH_MSG_SUCCESS, "1+1 combined send/rcv succeeded"); /* Verify content */ - T_EXPECT_EQ(((aux_buffer_t *)data_vec[1].msgv_data)->header.msgdh_size, + T_EXPECT_EQ((unsigned long)((aux_buffer_t *)data_vec[1].msgv_data)->header.msgdh_size, sizeof(aux_buffer_t), "Kernel should reset header to correct size"); ret = strcmp(buf_string, ((aux_buffer_t *)data_vec[1].msgv_data)->string); T_EXPECT_EQ(ret, 0, "aux data string should match after receive"); @@ -512,7 +511,7 @@ workloop_cb(uint64_t *workloop_id __unused, void **eventslist, int *events __unu T_LOG("workloop is set running.."); T_EXPECT_NE(msg_size, 0, "msg size should not be zero"); - T_EXPECT_EQ(aux_size, sizeof(aux_buffer_t), "aux size should match"); + T_EXPECT_EQ((unsigned long)aux_size, sizeof(aux_buffer_t), "aux size should match"); aux_buffer_t *aux = (aux_buffer_t *)((uintptr_t)msg + msg_size); T_EXPECT_EQ(aux->header.msgdh_size, aux_size, "aux size should match header"); diff --git a/tests/ipc/mach_msg_transport.c b/tests/ipc/mach_msg_transport.c index 490889b30..87a6bfb14 100644 --- a/tests/ipc/mach_msg_transport.c +++ b/tests/ipc/mach_msg_transport.c @@ -43,7 +43,6 @@ t_port_construct_full( } #define t_port_construct() t_port_construct_full(MPO_INSERT_SEND_RIGHT, 1) - static void t_port_destruct_full( mach_port_name_t *name, @@ -228,7 +227,7 @@ T_DECL(mach_msg_trailer, "check trailer generation") kr = t_receive(rcv_name, &buf.hdr, sizeof(buf), topts); T_ASSERT_MACH_SUCCESS(kr, "receiving message with trailer %d", i); - T_EXPECT_EQ(buf.hdr.msgh_size, sizeof(buf.hdr), "msgh_size"); + T_EXPECT_EQ((unsigned long)buf.hdr.msgh_size, sizeof(buf.hdr), "msgh_size"); T_EXPECT_EQ(buf.trailer.msgh_trailer_type, MACH_MSG_TRAILER_FORMAT_0, "msgh_trailer_type"); T_EXPECT_EQ(buf.trailer.msgh_trailer_size, tsize, "msgh_trailer_size"); if (tsize > offsetof(mach_msg_max_trailer_t, msgh_sender)) { @@ -520,9 +519,19 @@ static void t_mach_msg_descriptor_port_array(bool pseudo_receive) { mach_port_name_t rcv_name, port1, port2; + uint32_t mpo_flags; kern_return_t kr; - rcv_name = t_port_construct(); + /* + * Receive rights can receive OOL ports array only if + * the port is of a dedicated type that allows it. + * + * This type is created by using the MPO_CONNECTION_PORT_WITH_PORT_ARRAY + * flag, which also requires the task to have the relevant + * entitlement. 
+ */ + mpo_flags = MPO_INSERT_SEND_RIGHT | MPO_CONNECTION_PORT_WITH_PORT_ARRAY; + rcv_name = t_port_construct_full(mpo_flags, 1); port1 = t_port_construct(); port2 = t_port_construct(); @@ -530,73 +539,48 @@ t_mach_msg_descriptor_port_array(bool pseudo_receive) t_fill_port(rcv_name, 1); } - for (size_t i = 0; i < port_dispositions[i]; i++) { - mach_msg_type_name_t disp = port_dispositions[i]; - mach_port_name_t name1 = port1; - mach_port_name_t name2 = port2; - struct msg_complex_port_array msg; - mach_port_name_t *array; + /* + * We only allow MACH_MSG_TYPE_COPY_SEND disposition + * for OOL ports array descriptors. + */ + mach_msg_type_name_t disp = MACH_MSG_TYPE_COPY_SEND; + mach_port_name_t name1 = port1; + mach_port_name_t name2 = port2; + struct msg_complex_port_array msg; + mach_port_name_t *array; - if (disp == MACH_MSG_TYPE_MOVE_SEND_ONCE) { - name1 = t_make_sonce(port1); - name2 = t_make_sonce(port2); - } + t_fill_complex_port_array_msg(&msg, disp, name1, name2); - t_fill_complex_port_array_msg(&msg, disp, name1, name2); + kr = t_send(rcv_name, &msg.base, &msg.trailer, MACH64_SEND_TIMEOUT); + if (pseudo_receive) { + T_ASSERT_MACH_ERROR(kr, MACH_SEND_TIMED_OUT, + "pseudo-rcv(disposition:%d)", disp); + } else { + T_ASSERT_MACH_SUCCESS(kr, "send(disposition:%d)", disp); - kr = t_send(rcv_name, &msg.base, &msg.trailer, MACH64_SEND_TIMEOUT); - if (pseudo_receive) { - T_ASSERT_MACH_ERROR(kr, MACH_SEND_TIMED_OUT, - "pseudo-rcv(disposition:%d)", disp); - } else { - T_ASSERT_MACH_SUCCESS(kr, "send(disposition:%d)", disp); - - kr = t_receive(rcv_name, &msg.base.header, sizeof(msg), - MACH64_MSG_OPTION_NONE); - T_ASSERT_MACH_SUCCESS(kr, "recv(disposition:%d)", disp); - } - - switch (disp) { - case MACH_MSG_TYPE_MOVE_RECEIVE: - disp = MACH_MSG_TYPE_PORT_RECEIVE; - break; - case MACH_MSG_TYPE_MOVE_SEND: - case MACH_MSG_TYPE_COPY_SEND: - case MACH_MSG_TYPE_MAKE_SEND: - disp = MACH_MSG_TYPE_PORT_SEND; - break; - case MACH_MSG_TYPE_MOVE_SEND_ONCE: - case MACH_MSG_TYPE_MAKE_SEND_ONCE: - disp = MACH_MSG_TYPE_PORT_SEND_ONCE; - break; - } - - array = msg.dsc.address; - - T_ASSERT_EQ(msg.base.header.msgh_bits & MACH_MSGH_BITS_COMPLEX, - MACH_MSGH_BITS_COMPLEX, "verify complex"); - T_ASSERT_EQ(msg.base.body.msgh_descriptor_count, 1u, "verify dsc count"); - T_ASSERT_EQ((mach_msg_descriptor_type_t)msg.dsc.type, MACH_MSG_OOL_PORTS_DESCRIPTOR, "verify type"); - T_ASSERT_EQ((mach_msg_type_name_t)msg.dsc.disposition, disp, "verify disposition"); - T_ASSERT_EQ(msg.dsc.count, 2u, "verify count"); - T_ASSERT_EQ((bool)msg.dsc.deallocate, true, "verify deallocate"); - - if (disp == MACH_MSG_TYPE_PORT_RECEIVE || - disp == MACH_PORT_TYPE_SEND) { - T_ASSERT_EQ(array[0], name1, "verify name"); - T_ASSERT_EQ(array[1], name2, "verify name"); - } - - if (disp == MACH_MSG_TYPE_PORT_SEND_ONCE) { - t_deallocate_sonce(array[0]); - t_deallocate_sonce(array[1]); - } - - t_vm_deallocate(array, sizeof(array[0]) * msg.dsc.count); + kr = t_receive(rcv_name, &msg.base.header, sizeof(msg), + MACH64_MSG_OPTION_NONE); + T_ASSERT_MACH_SUCCESS(kr, "recv(disposition:%d)", disp); } - t_port_destruct_full(&port1, 3, 0); /* did a COPY_SEND and a MAKE_SEND */ - t_port_destruct_full(&port2, 3, 0); /* did a COPY_SEND and a MAKE_SEND */ + disp = MACH_MSG_TYPE_PORT_SEND; + array = msg.dsc.address; + + T_ASSERT_EQ(msg.base.header.msgh_bits & MACH_MSGH_BITS_COMPLEX, + MACH_MSGH_BITS_COMPLEX, "verify complex"); + T_ASSERT_EQ(msg.base.body.msgh_descriptor_count, 1u, "verify dsc count"); + T_ASSERT_EQ((mach_msg_descriptor_type_t)msg.dsc.type, 
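The reworked test now sends the two ports as an out-of-line ports array with MACH_MSG_TYPE_COPY_SEND, the only disposition it accepts, to a receive right created with MPO_CONNECTION_PORT_WITH_PORT_ARRAY. A hedged sketch of how such a descriptor is assembled on the send side; the receiver-side requirements (the MPO flag and its entitlement) are taken from the diff above, and send_port_array is an illustrative name:

#include <mach/mach.h>
#include <mach/message.h>

struct ool_ports_msg {
	mach_msg_header_t               header;
	mach_msg_body_t                 body;
	mach_msg_ool_ports_descriptor_t ports;
};

kern_return_t
send_port_array(mach_port_t dest, mach_port_t p1, mach_port_t p2)
{
	mach_port_t names[2] = { p1, p2 };
	struct ool_ports_msg msg = {
		.header = {
			.msgh_bits        = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0) |
			    MACH_MSGH_BITS_COMPLEX,
			.msgh_size        = (mach_msg_size_t)sizeof(msg),
			.msgh_remote_port = dest,
			.msgh_id          = 0x1234,
		},
		.body = { .msgh_descriptor_count = 1 },
		.ports = {
			.address     = names,                   /* copied in at send time */
			.count       = 2,
			.deallocate  = FALSE,
			.copy        = MACH_MSG_PHYSICAL_COPY,
			.disposition = MACH_MSG_TYPE_COPY_SEND, /* the accepted disposition */
			.type        = MACH_MSG_OOL_PORTS_DESCRIPTOR,
		},
	};

	return mach_msg(&msg.header, MACH_SEND_MSG, (mach_msg_size_t)sizeof(msg), 0,
	           MACH_PORT_NULL, MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
}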
MACH_MSG_OOL_PORTS_DESCRIPTOR, "verify type"); + T_ASSERT_EQ((mach_msg_type_name_t)msg.dsc.disposition, disp, "verify disposition"); + T_ASSERT_EQ(msg.dsc.count, 2u, "verify count"); + T_ASSERT_EQ((bool)msg.dsc.deallocate, true, "verify deallocate"); + + T_ASSERT_EQ(array[0], name1, "verify name"); + T_ASSERT_EQ(array[1], name2, "verify name"); + + t_vm_deallocate(array, sizeof(array[0]) * msg.dsc.count); + + t_port_destruct_full(&port1, 2, 0); /* did a COPY_SEND */ + t_port_destruct_full(&port2, 2, 0); /* did a COPY_SEND */ t_port_destruct(&rcv_name); } diff --git a/tests/ipc/mach_port_construct_errors.c b/tests/ipc/mach_port_construct_errors.c new file mode 100644 index 000000000..92d9ac15f --- /dev/null +++ b/tests/ipc/mach_port_construct_errors.c @@ -0,0 +1,146 @@ +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RUN_CONCURRENTLY(TRUE), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("IPC"), + T_META_TAG_VM_PREFERRED); + +#define countof(x) (sizeof(x) / sizeof(x[0])) + +static void +expect_sigkill( + void (^fn)(void), + const char *description) +{ + pid_t pid = fork(); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork"); + + if (pid == 0) { + fn(); + T_ASSERT_FAIL("%s: did not receive SIGKILL", description); + } else { + int status = 0; + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid"); + T_EXPECT_EQ(WTERMSIG(status), SIGKILL, + "%s exited with %d, expect SIGKILL", description, WTERMSIG(status)); + } +} + +T_DECL(mach_port_construct_at_most_one, + "mach_port_construct at most one flag policy") +{ + /* verify our at most one flag rule is enforced */ + const uint32_t at_most_one_flags[] = { + MPO_REPLY_PORT, + MPO_CONNECTION_PORT, + MPO_SERVICE_PORT, + MPO_PROVISIONAL_REPLY_PORT, + MPO_EXCEPTION_PORT, + MPO_CONNECTION_PORT_WITH_PORT_ARRAY + }; + + + for (uint32_t i = 0; i < countof(at_most_one_flags) - 1; ++i) { + for (uint32_t j = i + 1; j < countof(at_most_one_flags); ++j) { + mach_port_t port; + + mach_port_options_t opts = { + .flags = at_most_one_flags[i] | at_most_one_flags[j] + }; + + kern_return_t kr = mach_port_construct(mach_task_self(), &opts, 0x0, &port); + T_ASSERT_MACH_ERROR(kr, + KERN_INVALID_ARGUMENT, "mach_port_construct failed for at most one flags"); + } + } +} + +T_DECL(mach_port_construct_invalid_arguments_and_values, + "mach_port_construct invalid arguments and values") +{ + kern_return_t kr; + mach_port_t port; + + mach_port_options_t conn_opts = { + .flags = MPO_CONNECTION_PORT, + .service_port_name = 0x0 + }; + + kr = mach_port_construct(mach_task_self(), &conn_opts, 0x0, &port); + T_ASSERT_MACH_ERROR(kr, + KERN_INVALID_ARGUMENT, + "MPO_CONNECTION_PORT failed on service_port_name"); + + conn_opts.service_port_name = MPO_ANONYMOUS_SERVICE; + + kr = mach_port_construct(mach_task_self(), &conn_opts, 0x0, &port); + T_ASSERT_MACH_SUCCESS(kr, "MPO_CONNECTION_PORT succeeds with anonymous service name"); + kr = mach_port_destruct(mach_task_self(), port, 0, 0); + T_ASSERT_MACH_SUCCESS(kr, "destroy anonymous service name"); + + mach_port_options_t qlimit_opts = { + .flags = MPO_QLIMIT, + .mpl.mpl_qlimit = MACH_PORT_QLIMIT_MAX + 1 + }; + + kr = mach_port_construct(mach_task_self(), &qlimit_opts, 0x0, &port); + T_ASSERT_MACH_ERROR(kr, + KERN_INVALID_VALUE, + "MPO_QLIMIT failed on invalid value"); + + /* Enumerate on all unknown MPO flags */ + mach_port_options_t unknown_flags_opts; + for (uint32_t i = 0; i < sizeof(unknown_flags_opts.flags) * CHAR_BIT; ++i) { 
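+		/*
+		 * Probe every bit position of the flags word; only the bits that
+		 * fall inside MPO_UNUSED_BITS are exercised, the rest yield 0 and
+		 * are skipped below.
+		 */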
+ unknown_flags_opts.flags = MPO_UNUSED_BITS & (1 << i); + + if (unknown_flags_opts.flags != 0) { + kr = mach_port_construct(mach_task_self(), &unknown_flags_opts, 0x0, &port); + T_ASSERT_MACH_ERROR(kr, + KERN_INVALID_ARGUMENT, + "Unknown MPO flags 0x%x failed with KERN_INVALID_ARGUMENT", + unknown_flags_opts.flags); + } + } +} + +T_DECL(mach_port_construct_fatal_failure, + "mach_port_construct kern defined fatal failures", + T_META_IGNORECRASHES(".*mach_port_construct_errors.*"), + T_META_ENABLED(!TARGET_OS_OSX && !TARGET_OS_BRIDGE)) +{ + expect_sigkill(^{ + mach_port_t port; + mach_port_options_t opts = { + .flags = MPO_CONNECTION_PORT_WITH_PORT_ARRAY + }; + (void)mach_port_construct(mach_task_self(), &opts, 0x0, &port); + }, "passing MPO_CONNECTION_PORT_WITH_PORT_ARRAY without entitlement"); +} + +T_DECL(mach_port_construct_kern_denied, + "mach_port_construct kern defined failures", + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(!TARGET_OS_OSX && !TARGET_OS_BRIDGE)) +{ + kern_return_t kr; + mach_port_t port; + mach_port_options_t opts; + + /* + * should fail because only TASK_GRAPHICS_SERVER is allowed to + * use MPO_TG_BLOCK_TRACKING. + */ + opts.flags = MPO_TG_BLOCK_TRACKING; + + kr = mach_port_construct(mach_task_self(), &opts, 0x0, &port); + T_ASSERT_MACH_ERROR(kr, + KERN_DENIED, "MPO_TG_BLOCK_TRACKING failed with KERN_DENIED"); +} diff --git a/tests/ipc/mach_port_description.c b/tests/ipc/mach_port_description.c new file mode 100644 index 000000000..4b896b2ae --- /dev/null +++ b/tests/ipc/mach_port_description.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.mach.port_description"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("ipc")); + +// kern/ipc_kobject.h +#define IKOT_NAMED_ENTRY 28 + +T_DECL(vm_named_entry, + "test mach_port_kobject_description() on a named memory entry") +{ + kern_return_t kr; + mach_vm_size_t size = vm_page_size; + mach_port_t named_entry = MACH_PORT_NULL; + natural_t object_type; + mach_vm_address_t object_addr; + kobject_description_t object_description; + boolean_t dev_kern; + size_t dev_kern_size = sizeof(dev_kern); + int ret; + + ret = sysctlbyname("kern.development", &dev_kern, &dev_kern_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl(kern.development)"); + + // Create a memory entry + kr = mach_make_memory_entry_64(mach_task_self(), &size, 0ull, + MAP_MEM_NAMED_CREATE | VM_PROT_DEFAULT, &named_entry, MACH_PORT_NULL); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_make_memory_entry_64()"); + + // Describe it + kr = mach_port_kobject_description(mach_task_self(), named_entry, + &object_type, &object_addr, object_description); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_kobject_description()"); + + T_LOG("Object Type: %d", object_type); + T_EXPECT_EQ(object_type, IKOT_NAMED_ENTRY, "object has type IKOT_NAMED_ENTRY"); + + T_LOG("Object Address: %llu", object_addr); + if (dev_kern) { + T_EXPECT_NE(object_addr, 0ull, "object address is populated on development kernel"); + } else { + T_EXPECT_EQ(object_addr, 0ull, "object address is zero on release kernel"); + } + + T_LOG("Object Description: %s", object_description); + T_EXPECT_NE_STR(object_description, "", "object description is populated"); + + mach_port_deallocate(mach_task_self(), named_entry); +} diff --git a/tests/ipc/platform_restrictions_entitlements.c b/tests/ipc/platform_restrictions_entitlements.c new file mode 100644 index 000000000..53a565dcc --- /dev/null +++ b/tests/ipc/platform_restrictions_entitlements.c @@ -0,0 +1,59 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.spawn"), + T_META_RUN_CONCURRENTLY(TRUE), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("spawn"), + T_META_TAG_VM_PREFERRED); + + +struct task_security_config { + uint8_t hardened_heap: 1, + tpro :1, + reserved: 1, + platform_restrictions_version :3; + uint8_t hardened_process_version; +}; + +T_DECL(test_platform_restrictions_entitlements, + "entitlement should enable the platform restrictions configuration", + T_META_CHECK_LEAKS(false), + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_BOOTARGS_SET("amfi=0x7")) +{ + struct task_security_config_info config; + struct task_ipc_space_policy_info space_info; + mach_msg_type_number_t count; + kern_return_t kr; + + count = TASK_SECURITY_CONFIG_INFO_COUNT; + kr = task_info(mach_task_self(), TASK_SECURITY_CONFIG_INFO, (task_info_t)&config, &count); + T_ASSERT_MACH_SUCCESS(kr, "task_info(TASK_SECURITY_CONFIG_INFO)"); + + struct task_security_config *conf = (struct task_security_config*)&config; + uint8_t vers = conf->platform_restrictions_version; + T_EXPECT_EQ_UINT(vers, 2, "Platform restrictions1 bit should not be set"); + + T_EXPECT_FALSE(conf->reserved, "reserved bit should not be set"); + T_EXPECT_FALSE(conf->tpro, "TPRO bit should not be set"); + T_EXPECT_FALSE(conf->hardened_heap, "hardened heap bit should not be 
set"); + + count = TASK_IPC_SPACE_POLICY_INFO_COUNT; + kr = task_info(mach_task_self(), TASK_IPC_SPACE_POLICY_INFO, (task_info_t)&space_info, &count); + T_ASSERT_MACH_SUCCESS(kr, "task_info(TASK_SECURITY_CONFIG_INFO)"); + T_ASSERT_EQ_UINT(count, 1, "ipc space should return 1 value"); + + T_EXPECT_TRUE(space_info.space_policy & 0x400, "enhanced V2 bit should be set"); + + T_EXPECT_FALSE(space_info.space_policy & 0x100, "enhanced V0 bit should not be set"); + T_EXPECT_FALSE(space_info.space_policy & 0x200, "enhanced V1 bit should not be set"); +} diff --git a/tests/ipc/port_api.c b/tests/ipc/port_api.c index 8a0a9a830..fb4388929 100644 --- a/tests/ipc/port_api.c +++ b/tests/ipc/port_api.c @@ -29,3 +29,39 @@ T_DECL(mach_port_insert_right_123724977, "regression test for 123724977") MACH_MSG_TYPE_MAKE_SEND); T_ASSERT_MACH_ERROR(kr, KERN_INVALID_RIGHT, "insert right fails"); } + +T_DECL(mach_port_name_rules, "make sure port names work correctly") +{ + mach_port_type_t ty; + kern_return_t kr; + mach_port_t mp, mp2; + + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &mp); + T_ASSERT_MACH_SUCCESS(kr, "creating port"); + T_ASSERT_EQ(mp & 0x3u, 0x3, "low bits are 0x3"); + + kr = mach_port_type(mach_task_self(), mp, &ty); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_type"); + T_ASSERT_TRUE(ty & MACH_PORT_TYPE_RECEIVE, "mp is a receive right"); + + kr = mach_port_type(mach_task_self(), mp & ~0x3u, &ty); + T_ASSERT_MACH_ERROR(kr, KERN_INVALID_NAME, + "lookup is sensitive to the low bits"); + + kr = mach_port_destruct(mach_task_self(), mp, 0, 0); + T_ASSERT_MACH_SUCCESS(kr, "destroying port"); + + kr = mach_port_type(mach_task_self(), mp, &ty); + T_ASSERT_MACH_ERROR(kr, KERN_INVALID_NAME, "port is destroyed"); + + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &mp2); + T_ASSERT_MACH_SUCCESS(kr, "creating port"); + T_ASSERT_EQ(mp2 & 0x3, 0x3, "low bits are 0x3"); + T_ASSERT_NE(mp, mp2, "port name will change"); + T_ASSERT_EQ(mp & ~0xffu, mp2 & ~0xffu, + "the index was reused with a generation delta of %d", + (mp2 - mp) >> 2); + + kr = mach_port_destruct(mach_task_self(), mp2, 0, 0); + T_ASSERT_MACH_SUCCESS(kr, "destroying port"); +} diff --git a/tests/ipc/port_peek.c b/tests/ipc/port_peek.c index bc60945a8..0969c6a04 100644 --- a/tests/ipc/port_peek.c +++ b/tests/ipc/port_peek.c @@ -55,7 +55,7 @@ T_DECL(mach_port_peek, "Test mach port peeking") mach_msg_type_number_t incoming_size = 0; mach_msg_id_t incoming_id; - kr = mach_port_peek(mach_task_self(), port, tlrtype, &seqno, &incoming_size, &incoming_id, &audit_trailer, &size); + kr = mach_port_peek(mach_task_self(), port, tlrtype, &seqno, &incoming_size, &incoming_id, (void *)&audit_trailer, &size); T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_peek"); T_ASSERT_EQ(incoming_id, outgoing_id, "Peek must return correct msgh_id"); diff --git a/tests/ipc/port_turnstile_stash.c b/tests/ipc/port_turnstile_stash.c index 124ad8bad..f894ccf29 100644 --- a/tests/ipc/port_turnstile_stash.c +++ b/tests/ipc/port_turnstile_stash.c @@ -103,7 +103,7 @@ T_DECL(port_stash_turnstile, "stashing knote turnstile on port should take a +1" int nevents = kevent_qos(kq, &event, 1, out_events, 1, NULL, NULL, 0); T_ASSERT_EQ(nevents, 1, "kevent_qos succeeded"); - T_ASSERT_EQ(remote->sequence, 0x6666666666666666, NULL); + T_ASSERT_EQ(remote->sequence, (uint64_t)0x6666666666666666, NULL); int ret = 0; struct kevent_qos_s del_event = { diff --git a/tests/ipc/port_type_policy.c b/tests/ipc/port_type_policy.c new file mode 100644 index 000000000..57996b493 --- 
/dev/null +++ b/tests/ipc/port_type_policy.c @@ -0,0 +1,887 @@ +/* + * Copyright (c) 2025 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ipc"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("IPC"), + T_META_TIMEOUT(10), + T_META_IGNORECRASHES(".*port_type_policy.*"), + T_META_RUN_CONCURRENTLY(TRUE)); + +/* in xpc/launch_private.h */ +#define XPC_DOMAIN_SYSTEM 1 + +#define countof(arr) (sizeof(arr) / sizeof((arr)[0])) + + +static void +expect_sigkill( + void (^fn)(void), + const char *format_description, ...) 
+{ + char description[0x100]; + + va_list args; + va_start(args, format_description); + vsnprintf(description, sizeof(description), format_description, args); + va_end(args); + + pid_t pid = fork(); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork"); + + if (pid == 0) { + fn(); + T_ASSERT_FAIL("%s: did not receive SIGKILL", description); + } else { + int status = 0; + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid"); + T_EXPECT_EQ(WTERMSIG(status), SIGKILL, + "%s exited with %d, expect SIGKILL", description, WTERMSIG(status)); + } +} + +struct msg_complex_port { + mach_msg_base_t base; + mach_msg_port_descriptor_t dsc; + mach_msg_max_trailer_t trailer; +}; + +#define OOL_PORT_COUNTS 2 + +struct msg_complex_port_array { + mach_msg_base_t base; + mach_msg_ool_ports_descriptor_t dsc; + mach_msg_max_trailer_t trailer; + mach_port_name_t array[OOL_PORT_COUNTS]; +}; + +struct msg_complex_port_two_arrays { + mach_msg_header_t header; + mach_msg_base_t base; + mach_msg_ool_ports_descriptor_t dsc1; + mach_msg_ool_ports_descriptor_t dsc2; + mach_msg_max_trailer_t trailer; + mach_port_name_t array[OOL_PORT_COUNTS]; +}; + +static kern_return_t +send_msg( + mach_port_t dest_port, + mach_msg_header_t *msg, + mach_msg_size_t size) +{ + mach_msg_option64_t opts; + + opts = MACH64_SEND_MSG | MACH64_SEND_MQ_CALL | MACH64_SEND_TIMEOUT; + + msg->msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MAKE_SEND, 0, 0, + MACH_MSGH_BITS_COMPLEX); + msg->msgh_size = size; + msg->msgh_remote_port = dest_port; + msg->msgh_local_port = MACH_PORT_NULL; + msg->msgh_voucher_port = MACH_PORT_NULL; + msg->msgh_id = 42; + return mach_msg2(msg, opts, *msg, size, 0, 0, 0, 0); +} + +static kern_return_t +send_port_descriptor( + mach_port_t dest_port, + mach_port_t dsc_port, + int disp) +{ + struct msg_complex_port complex_msg; + mach_msg_header_t *msg; + mach_msg_size_t size; + + complex_msg = (struct msg_complex_port){ + .base.body.msgh_descriptor_count = 1, + .dsc = { + .type = MACH_MSG_PORT_DESCRIPTOR, + .disposition = disp, + .name = dsc_port, + }, + }; + + msg = &complex_msg.base.header; + size = (mach_msg_size_t)((char *)&complex_msg.trailer - (char *)&complex_msg.base); + return send_msg(dest_port, msg, size); +} + +static mach_port_t +recv_port_descriptor(mach_port_t dst_port) +{ + struct msg_complex_port msg; + + kern_return_t kr = mach_msg2(&msg, MACH64_RCV_MSG, MACH_MSG_HEADER_EMPTY, + 0, sizeof(msg), dst_port, 0, 0); + T_ASSERT_MACH_SUCCESS(kr, "mach_msg2 receive port descriptor"); + + /* extract and return the received port name */ + return msg.dsc.name; +} + +static mach_port_t +get_send_receive_right(void) +{ + kern_return_t kr; + mach_port_t port; + + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_allocate"); + + kr = mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_insert_right"); + + return port; +} + +static kern_return_t +send_ool_port_array( + mach_port_t dest_port, + mach_msg_type_name_t disp) +{ + struct msg_complex_port_array complex_msg; + mach_msg_header_t *msg; + mach_msg_size_t size; + + complex_msg = (struct msg_complex_port_array){ + .base.body.msgh_descriptor_count = 1, + .dsc = { + .type = MACH_MSG_OOL_PORTS_DESCRIPTOR, + .disposition = disp, + .address = &complex_msg.array, + .count = OOL_PORT_COUNTS, + .deallocate = false, + }, + }; + + for (size_t i = 0; i < OOL_PORT_COUNTS; ++i) { + complex_msg.array[i] = get_send_receive_right(); + 
} + + msg = &complex_msg.base.header; + size = (mach_msg_size_t)((char *)&complex_msg.trailer - (char *)&complex_msg.base); + return send_msg(dest_port, msg, size); +} + +static kern_return_t +send_ool_port_multiple_arrays( + mach_port_t dest_port, + mach_msg_type_name_t disp) +{ + struct msg_complex_port_two_arrays complex_msg; + mach_msg_header_t *msg; + mach_msg_size_t size; + + complex_msg = (struct msg_complex_port_two_arrays){ + .base.body.msgh_descriptor_count = 2, + .dsc1 = { + .type = MACH_MSG_OOL_PORTS_DESCRIPTOR, + .disposition = disp, + .address = &complex_msg.array, + .count = OOL_PORT_COUNTS, + .deallocate = false, + }, + .dsc2 = { + .type = MACH_MSG_OOL_PORTS_DESCRIPTOR, + .disposition = disp, + .address = &complex_msg.array, + .count = OOL_PORT_COUNTS, + .deallocate = false, + }, + }; + + for (size_t i = 0; i < OOL_PORT_COUNTS; ++i) { + complex_msg.array[i] = get_send_receive_right(); + } + + msg = &complex_msg.base.header; + size = (mach_msg_size_t)((char *)&complex_msg.trailer - (char *)&complex_msg.base); + return send_msg(dest_port, msg, size); +} + +/* + * Helper constructor functions to create different types of ports. + */ +static mach_port_t +create_conn_with_port_array_port(void) +{ + kern_return_t kr; + mach_port_t port; + + mach_port_options_t opts = {.flags = MPO_CONNECTION_PORT_WITH_PORT_ARRAY, }; + + kr = mach_port_construct(mach_task_self(), &opts, 0x0, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + + return port; +} + +static mach_port_t +create_exception_port(void) +{ + kern_return_t kr; + mach_port_t port; + + mach_port_options_t opts = {.flags = MPO_EXCEPTION_PORT, }; + + kr = mach_port_construct(mach_task_self(), &opts, 0x0, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + + return port; +} + +static mach_port_t +create_connection_port(void) +{ + kern_return_t kr; + mach_port_t conn_port; + + mach_port_options_t opts = { + .flags = MPO_CONNECTION_PORT | MPO_INSERT_SEND_RIGHT, + .service_port_name = MPO_ANONYMOUS_SERVICE, + }; + + kr = mach_port_construct(mach_task_self(), &opts, 0x0, &conn_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + + return conn_port; +} + +static mach_port_t +create_reply_port(void) +{ + kern_return_t kr; + mach_port_t port; + + mach_port_options_t opts = { + .flags = MPO_REPLY_PORT, + }; + + kr = mach_port_construct(mach_task_self(), &opts, 0x0, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + + return port; +} + +static mach_port_t +create_provisional_reply_port(void) +{ + kern_return_t kr; + mach_port_t port; + + mach_port_options_t opts = { + .flags = MPO_PROVISIONAL_REPLY_PORT, + }; + + kr = mach_port_construct(mach_task_self(), &opts, 0x0, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + + return port; +} + +static mach_port_t +create_service_port(void) +{ + kern_return_t kr; + mach_port_t port; + + struct mach_service_port_info sp_info = { + .mspi_string_name = "com.apple.testservice", + .mspi_domain_type = XPC_DOMAIN_SYSTEM, + }; + + mach_port_options_t opts = { + .flags = MPO_STRICT_SERVICE_PORT, + .service_port_info = &sp_info, + }; + + kr = mach_port_construct(mach_task_self(), &opts, 0x0, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + + return port; +} + +static void +destruct_generic_port(mach_port_t port) +{ + kern_return_t kr; + mach_port_type_t type = 0; + + kr = mach_port_type(mach_task_self(), port, &type); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_type"); + + kr = 
mach_port_destruct(mach_task_self(), + port, + (type & MACH_PORT_TYPE_SEND) ? -1 : 0, + 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_destruct"); +} +/* + * Helper functions and types to help making test output nice and readable. + */ +static const char* +get_disp_name(mach_msg_type_name_t disp) +{ + switch (disp) { + case MACH_MSG_TYPE_MOVE_SEND: + return "MOVE_SEND"; + case MACH_MSG_TYPE_MAKE_SEND: + return "MAKE_SEND"; + case MACH_MSG_TYPE_MOVE_SEND_ONCE: + return "MOVE_SEND_ONCE"; + case MACH_MSG_TYPE_MAKE_SEND_ONCE: + return "MAKE_SEND_ONCE"; + case MACH_MSG_TYPE_COPY_SEND: + return "COPY_SEND"; + case MACH_MSG_TYPE_MOVE_RECEIVE: + return "MOVE_RECEIVE"; + default: + T_ASSERT_FAIL("Invalid disp"); + } +} + +static const char* +get_notification_name(mach_msg_id_t notification_id) +{ + switch (notification_id) { + case MACH_NOTIFY_PORT_DESTROYED: + return "PORT_DESTROY"; + break; + case MACH_NOTIFY_NO_SENDERS: + return "NO_MORE_SENDERS"; + break; + case MACH_NOTIFY_SEND_POSSIBLE: + return "SEND_POSSIBLE"; + break; + default: + T_ASSERT_FAIL("Invalid notification id"); + } +} + +typedef struct { + mach_port_t (*port_ctor)(void); + char *port_type_name; + bool is_reply_port; +} port_type_desc; + +const port_type_desc IOT_PORT_DESC = { + .port_ctor = get_send_receive_right, + .port_type_name = "IOT_PORT", + .is_reply_port = false, +}; +const port_type_desc REPLY_PORT_DESC = { + .port_ctor = create_reply_port, + .port_type_name = "IOT_REPLY_PORT", + .is_reply_port = true, +}; +const port_type_desc CONNECTION_PORT_DESC = { + .port_ctor = create_connection_port, + .port_type_name = "IOT_CONNECTION_PORT", + .is_reply_port = false, +}; +const port_type_desc EXCEPTION_PORT_DESC = { + .port_ctor = create_exception_port, + .port_type_name = "IOT_EXCEPTION_PORT", + .is_reply_port = false, +}; +const port_type_desc PROVISIONAL_REPLY_PORT_DESC = { + .port_ctor = create_provisional_reply_port, + .port_type_name = "IOT_PROVISIONAL_REPLY_PORT", + .is_reply_port = false, +}; +const port_type_desc CONNECTION_PORT_WITH_PORT_ARRAY_DESC = { + .port_ctor = create_conn_with_port_array_port, + .port_type_name = "IOT_CONNECTION_PORT_WITH_PORT_ARRAY", + .is_reply_port = false, +}; +const port_type_desc TIMER_PORT_DESC = { + .port_ctor = mk_timer_create, + .port_type_name = "IOT_TIMER_PORT", + .is_reply_port = false, +}; +const port_type_desc SPECIAL_REPLY_PORT_DESC = { + .port_ctor = thread_get_special_reply_port, + .port_type_name = "IOT_SPECIAL_REPLY_PORT", + .is_reply_port = true, +}; +const port_type_desc SERVICE_PORT_DESC = { + .port_ctor = create_service_port, + .port_type_name = "IOT_SERVICE_PORT", + .is_reply_port = false, +}; + +const port_type_desc PORT_TYPE_DESC_ARRAY[] = { + IOT_PORT_DESC, + REPLY_PORT_DESC, + CONNECTION_PORT_DESC, + EXCEPTION_PORT_DESC, + PROVISIONAL_REPLY_PORT_DESC, + CONNECTION_PORT_WITH_PORT_ARRAY_DESC, + TIMER_PORT_DESC, + SPECIAL_REPLY_PORT_DESC, + SERVICE_PORT_DESC +}; + +/* + * Helper functions to test MachIPC functionalities. 
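+ *
+ * They all follow the same shape (a rough sketch only, reusing the
+ * expect_sigkill() and send_port_descriptor() helpers defined above):
+ *
+ *     expect_sigkill(^{
+ *         mach_port_t dst  = get_send_receive_right();
+ *         mach_port_t port = port_desc->port_ctor();
+ *         (void)send_port_descriptor(dst, port, MACH_MSG_TYPE_MOVE_RECEIVE);
+ *     }, "%s violation", port_desc->port_type_name);
+ *
+ * i.e. each helper provokes one policy violation in a forked child and
+ * expects the kernel to make it fatal (SIGKILL via a guard exception).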
+ */ +static void +test_disallowed_register_mach_notification( + const port_type_desc *port_desc, + mach_msg_id_t notify_id) +{ + expect_sigkill(^{ + mach_port_t port, notify_port, previous; + + /* construct a receive right to send the port as descriptor to */ + notify_port = get_send_receive_right(); + + port = port_desc->port_ctor(); + (void)mach_port_request_notification(mach_task_self(), + port, + notify_id, + 0, + notify_port, + MACH_MSG_TYPE_MAKE_SEND_ONCE, + &previous); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "%s failed with mach notification %s", port_desc->port_type_name, get_notification_name(notify_id)); +} + +/* + * In this helper function we cover two properties: + * - we make sure these ports are immovable-receive by trying to + * send them in a message with MACH_MSG_PORT_DESCRIPTOR descriptor; + * - we attempt to register them for a PD notification. + * + * This seems redundent since it is not possible to register immovable-receive + * ports to PD notification by construction. However, we want our tests + * to cover everything, and this link between immovable-receive and + * PD notifications, no matter how trivial, should be question as well. + * + * Note: this intentionally does NOT use get status trap + * and test for MACH_PORT_STATUS_FLAG_GUARD_IMMOVABLE_RECEIVE, + * because the purpose of these tests is to ensure the overall security + * properties are respected (immovability, Guard, fatal exception, etc.). + */ +static void +test_receive_immovability(const port_type_desc *port_desc) +{ + expect_sigkill(^{ + mach_port_t dst_port, port; + + /* construct a receive right to send the port as descriptor to */ + dst_port = get_send_receive_right(); + + /* + * construct the port to test immovability, and send it as port + * descriptor with RECEIVE right. + */ + port = port_desc->port_ctor(); + (void)send_port_descriptor(dst_port, port, MACH_MSG_TYPE_MOVE_RECEIVE); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "%s failed immovable-receive", port_desc->port_type_name); + + test_disallowed_register_mach_notification(port_desc, + MACH_NOTIFY_PORT_DESTROYED); +} + +/* + * We have port types which their receive right is allowed to be move + * ONCE, and then they become immovable-receive for the rest of their + * lifetime. + * + * This helper function tests that property. + */ +static void +test_receive_immovability_move_once(const port_type_desc *port_desc) +{ + expect_sigkill(^{ + kern_return_t kr; + mach_port_t dst_port, port; + + /* construct a receive right to send the port as descriptor to */ + dst_port = get_send_receive_right(); + + /* construct the port for our test, and send it as port descriptor */ + port = port_desc->port_ctor(); + kr = send_port_descriptor(dst_port, port, MACH_MSG_TYPE_MOVE_RECEIVE); + T_ASSERT_MACH_SUCCESS(kr, "send_port_descriptor"); + + /* we moved the receive right out of our IPC space */ + port = MACH_PORT_NULL; + + /* + * receive the port we sent to ourselves. + * + * From now on, this port is expected to be immovable-receive + * for the rest of its lifetime. 
+ */ + port = recv_port_descriptor(dst_port); + + /* + * this should raise a fatal Guard exception + * on immovability violation + */ + (void)send_port_descriptor(dst_port, port, MACH_MSG_TYPE_MOVE_RECEIVE); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "%s is allowed to be move ONCE", port_desc->port_type_name); +} + +static void +test_send_immovability_move_so(const port_type_desc *port_desc) +{ + expect_sigkill(^{ + mach_port_t dst_port, port, so_right; + mach_msg_type_name_t disp; + kern_return_t kr; + + dst_port = get_send_receive_right(); + port = port_desc->port_ctor(); + + /* create a send-once right for the port */ + kr = mach_port_extract_right(mach_task_self(), port, + MACH_MSG_TYPE_MAKE_SEND_ONCE, &so_right, &disp); + + T_ASSERT_MACH_SUCCESS(kr, "mach_port_extract_right with %s", port_desc->port_type_name); + + (void)send_port_descriptor(dst_port, so_right, MACH_MSG_TYPE_MOVE_SEND_ONCE); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "%s immovable-send failed with MOVE_SEND_ONCE", port_desc->port_type_name); +} + +static void +test_send_immovability(const port_type_desc *port_desc) +{ + expect_sigkill(^{ + mach_msg_type_name_t disp; + mach_port_name_t name; + + mach_port_t port = port_desc->port_ctor(); + (void)mach_port_extract_right(mach_task_self(), port, + MACH_MSG_TYPE_MOVE_SEND, &name, &disp); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "%s immovable-send failed extract_right MOVE_SEND", port_desc->port_type_name); + + expect_sigkill(^{ + mach_port_t dst_port, port; + + /* construct a receive right to send the port as descriptor to */ + dst_port = get_send_receive_right(); + + port = port_desc->port_ctor(); + (void)send_port_descriptor(dst_port, port, MACH_MSG_TYPE_MOVE_SEND); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "%s immovable-send failed with MOVE_SEND", port_desc->port_type_name); + + expect_sigkill(^{ + mach_port_t dst_port, port; + + /* construct a receive right to send the port as descriptor to */ + dst_port = get_send_receive_right(); + + port = port_desc->port_ctor(); + (void)send_port_descriptor(dst_port, port, MACH_MSG_TYPE_COPY_SEND); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "%s immovable-send failed with COPY_SEND", port_desc->port_type_name); + + /* + * Do not attempt to extract SEND_ONCE for reply port types. Such behavior + * should be covered by the reply_port_defense test. + */ + if (!port_desc->is_reply_port) { + test_send_immovability_move_so(port_desc); + } +} + +static void +test_ool_port_array( + const port_type_desc *port_desc, + mach_msg_type_name_t disp) +{ + expect_sigkill(^{ + mach_port_t dst_port; + + /* construct a receive right to send the port as descriptor to */ + dst_port = port_desc->port_ctor(); + + (void)send_ool_port_array(dst_port, disp); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "sending OOL port array to %s with %s", port_desc->port_type_name, get_disp_name(disp)); +} + +/* + * Because of mach hardening opt out, group + * reply port tests together and skip them. 
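+ * (On macOS and bridgeOS the hardening opt-out applies, so the whole
+ * T_DECL below is skipped there via T_SKIP.)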
+ */ +T_DECL(reply_port_policies, + "Reply port policies tests") { +#if TARGET_OS_OSX || TARGET_OS_BRIDGE + T_SKIP("Test disabled on macOS due to mach hardening opt out"); +#endif /* TARGET_OS_OSX || TARGET_OS_BRIDGE */ + + test_receive_immovability(&REPLY_PORT_DESC); + + test_send_immovability(&REPLY_PORT_DESC); + + test_disallowed_register_mach_notification(&REPLY_PORT_DESC, + MACH_NOTIFY_NO_SENDERS); +} + +T_DECL(immovable_receive_port_types, + "Port types we expect to be immovable-receive") { + test_receive_immovability(&CONNECTION_PORT_WITH_PORT_ARRAY_DESC); + + test_receive_immovability(&EXCEPTION_PORT_DESC); + + test_receive_immovability(&TIMER_PORT_DESC); + + test_receive_immovability(&SPECIAL_REPLY_PORT_DESC); + + /* + * kGUARD_EXC_KERN_FAILURE is not fatal on Bridge OS because + * we don't set TASK_EXC_GUARD_MP_FATAL by default/ + */ +#if !TARGET_OS_BRIDGE + test_receive_immovability(&SERVICE_PORT_DESC); +#endif /* !TARGET_OS_BRIDGE */ +} + +T_DECL(immovable_receive_move_once_port_types, + "Port types we expect to be immovable-receive") { + test_receive_immovability_move_once(&CONNECTION_PORT_DESC); +} + +T_DECL(immovable_send_port_types, + "Port types we expect to be immovable-send") +{ + test_send_immovability(&CONNECTION_PORT_DESC); + + test_send_immovability(&SPECIAL_REPLY_PORT_DESC); +} + +T_DECL(ool_port_array_policies, + "OOL port array policies") +{ +#if TARGET_OS_VISION + T_SKIP("OOL port array enforcement is disabled"); +#else + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } + + /* + * The only port type allowed to receive the MACH_MSG_OOL_PORTS_DESCRIPTOR + * descriptor is IOT_CONNECTION_PORT_WITH_PORT_ARRAY. + * + * Attempt sending MACH_MSG_OOL_PORTS_DESCRIPTOR to any other port type + * result in a fatal Guard exception. + */ + test_ool_port_array(&IOT_PORT_DESC, + MACH_MSG_TYPE_COPY_SEND); + + test_ool_port_array(&REPLY_PORT_DESC, + MACH_MSG_TYPE_COPY_SEND); + + test_ool_port_array(&SPECIAL_REPLY_PORT_DESC, + MACH_MSG_TYPE_COPY_SEND); + + test_ool_port_array(&CONNECTION_PORT_DESC, + MACH_MSG_TYPE_COPY_SEND); + + test_ool_port_array(&EXCEPTION_PORT_DESC, + MACH_MSG_TYPE_COPY_SEND); + + test_ool_port_array(&PROVISIONAL_REPLY_PORT_DESC, + MACH_MSG_TYPE_COPY_SEND); + + test_ool_port_array(&TIMER_PORT_DESC, + MACH_MSG_TYPE_COPY_SEND); + + /* + * Now try to send to IOT_CONNECTION_PORT_WITH_PORT_ARRAY ports, + * but use disallowed dispositions. + * + * The only allowed disposition is COPY_SEND. + */ + test_ool_port_array(&CONNECTION_PORT_WITH_PORT_ARRAY_DESC, + MACH_MSG_TYPE_MOVE_SEND); + + test_ool_port_array(&CONNECTION_PORT_WITH_PORT_ARRAY_DESC, + MACH_MSG_TYPE_MAKE_SEND); + + test_ool_port_array(&CONNECTION_PORT_WITH_PORT_ARRAY_DESC, + MACH_MSG_TYPE_MOVE_SEND_ONCE); + + test_ool_port_array(&CONNECTION_PORT_WITH_PORT_ARRAY_DESC, + MACH_MSG_TYPE_MAKE_SEND_ONCE); + + test_ool_port_array(&CONNECTION_PORT_WITH_PORT_ARRAY_DESC, + MACH_MSG_TYPE_MOVE_RECEIVE); + + /* + * Finally, try sending OOL port array to IOT_CONNECTION_PORT_WITH_PORT_ARRAY, + * with (the only) allowed disposition, but send two arrays in one kmsg. 
+ */ + expect_sigkill(^{ + mach_port_t dst_port; + + /* construct a receive right to send the port as descriptor to */ + dst_port = create_conn_with_port_array_port(); + + (void)send_ool_port_multiple_arrays(dst_port, MACH_MSG_TYPE_COPY_SEND); + + /* Unreachable; ports will be destructed when IPC space is destroyed */ + }, "sending two OOL port arrays"); +#endif /* TARGET_OS_VISION */ +} + +T_DECL(disallowed_no_more_senders_port_destroy_port_types, + "Port types we disallow no-more-senders notifications for") +{ + test_disallowed_register_mach_notification(&SPECIAL_REPLY_PORT_DESC, + MACH_NOTIFY_NO_SENDERS); +} + +T_DECL(provisional_reply_port, + "Provisional reply ports have no restrictions") +{ + mach_port_t prp, remote_port, recv_port; + kern_return_t kr; + + prp = create_provisional_reply_port(); + remote_port = get_send_receive_right(); + + kr = mach_port_insert_right(mach_task_self(), prp, prp, + MACH_MSG_TYPE_MAKE_SEND); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_insert_right"); + + /* send a send right to the provisional reply port*/ + kr = send_port_descriptor(remote_port, prp, MACH_MSG_TYPE_MOVE_SEND); + T_ASSERT_MACH_SUCCESS(kr, "send_port_descriptor"); + + /* receive that port descriptor, which has to have the same name */ + recv_port = recv_port_descriptor(remote_port); + T_QUIET; T_ASSERT_EQ(prp, recv_port, "recv_port_descriptor send"); + + /* drop only the send right of the provisional reply port */ + kr = mach_port_mod_refs(mach_task_self(), prp, MACH_PORT_RIGHT_SEND, -1); + + /* send a receive right to the provisional reply port */ + kr = send_port_descriptor(remote_port, prp, MACH_MSG_TYPE_MOVE_RECEIVE); + T_ASSERT_MACH_SUCCESS(kr, "send_port_descriptor"); + + recv_port = recv_port_descriptor(remote_port); + T_ASSERT_NE(recv_port, MACH_PORT_NULL, "recv_port_descriptor receive"); + + /* cleanup, destruct the ports we used */ + kr = mach_port_destruct(mach_task_self(), recv_port, 0, 0); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_destruct recv_port"); + + kr = mach_port_destruct(mach_task_self(), remote_port, 0, 0); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_destruct remote_port"); +} + +T_DECL(mktimer_traps, + "Test mktimer traps") +{ + kern_return_t kr; + mach_port_t port; + uint64_t result_time; + + /* + * Enumerate all port types, makes sure mk_timer_arm + * fails on every single one besides IOT_TIMER_PORT + */ + for (uint32_t i = 0; i < countof(PORT_TYPE_DESC_ARRAY); ++i) { + if (PORT_TYPE_DESC_ARRAY[i].port_ctor == mk_timer_create) { + continue; + } + + /* Create a non-timer port type */ + port = PORT_TYPE_DESC_ARRAY[i].port_ctor(); + T_QUIET; T_ASSERT_NE(port, MACH_PORT_NULL, + "constructing a port type %s", + PORT_TYPE_DESC_ARRAY[i].port_type_name); + + kr = mk_timer_arm(port, 1); + T_ASSERT_MACH_ERROR(kr, + KERN_INVALID_ARGUMENT, + "mk_timer_arm failed on non timer port type (%s)", + PORT_TYPE_DESC_ARRAY[i].port_type_name); + + kr = mk_timer_cancel(port, &result_time); + T_ASSERT_MACH_ERROR(kr, + KERN_INVALID_ARGUMENT, + "mk_timer_cancel failed on non timer port type (%s)", + PORT_TYPE_DESC_ARRAY[i].port_type_name); + + kr = mk_timer_destroy(port); + T_ASSERT_MACH_ERROR(kr, + KERN_INVALID_ARGUMENT, + "mk_timer_destroy failed on non timer port type (%s)", + PORT_TYPE_DESC_ARRAY[i].port_type_name); + + /* Destroy the port we created */ + destruct_generic_port(port); + } + + /* Verify mk_timer_arm succeed with actual timer */ + port = TIMER_PORT_DESC.port_ctor(); + T_QUIET; T_ASSERT_NE(port, MACH_PORT_NULL, + "constructing a timer (%s)", + TIMER_PORT_DESC.port_type_name); + + kr = 
mk_timer_arm(port, 1); + T_ASSERT_MACH_SUCCESS(kr, "mk_timer_arm on actual timer"); + + kr = mk_timer_cancel(port, &result_time); + T_ASSERT_MACH_SUCCESS(kr, "mk_timer_cancel on actual timer"); + + kr = mk_timer_destroy(port); + T_ASSERT_MACH_SUCCESS(kr, "mk_timer_destroy"); +} diff --git a/tests/ipc/sys_perf_notify_test.c b/tests/ipc/sys_perf_notify_test.c index b5e58bb3c..1d223a2e8 100644 --- a/tests/ipc/sys_perf_notify_test.c +++ b/tests/ipc/sys_perf_notify_test.c @@ -75,7 +75,7 @@ catch_mach_exception_raise_state_identity_protected( thread_state_t new_state, mach_msg_type_number_t * new_state_count) { -#pragma unused(exception_port, thread_id, tatask_id_tokensk, exception, codes, codeCnt, flavor, old_state, old_state_count, new_state, new_state_count) +#pragma unused(exception_port, thread_id, task_id_token, exception, codes, codeCnt, flavor, old_state, old_state_count, new_state, new_state_count) T_FAIL("Unsupported catch_mach_exception_raise_state_identity"); return KERN_NOT_SUPPORTED; } @@ -94,7 +94,7 @@ catch_mach_exception_raise_identity_protected( T_QUIET; T_ASSERT_EQ(exception_port, exc_port, "correct exception port"); T_QUIET; T_ASSERT_EQ(exception, EXC_RPC_ALERT, "exception type is EXC_RPC_ALERT"); T_QUIET; T_ASSERT_EQ(codeCnt, 2, "codeCnt is 2"); - T_QUIET; T_ASSERT_EQ(codes[0], 0xFF000001, "codes[0] is 0xFF000001"); + T_QUIET; T_ASSERT_EQ(codes[0], (mach_exception_data_type_t)0xFF000001, "codes[0] is 0xFF000001"); return KERN_SUCCESS; } diff --git a/tests/ipc/tpro_entitlements.c b/tests/ipc/tpro_entitlements.c new file mode 100644 index 000000000..a97817295 --- /dev/null +++ b/tests/ipc/tpro_entitlements.c @@ -0,0 +1,60 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.spawn"), + T_META_RUN_CONCURRENTLY(TRUE), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("spawn"), + T_META_TAG_VM_PREFERRED); + +struct task_security_config { + uint8_t hardened_heap: 1, + tpro :1, + reserved: 1, + platform_restrictions_version :3; + uint8_t hardened_process_version; +}; + +T_DECL(test_platform_restrictions_entitlements, + "entitlement should enable the platform restrictions configuration", + T_META_CHECK_LEAKS(false), + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_BOOTARGS_SET("amfi=0x7"), + T_META_ENABLED(false) /* rdar://153306234 */) +{ + struct task_security_config_info config; + struct task_ipc_space_policy_info space_info; + mach_msg_type_number_t count; + kern_return_t kr; + + count = TASK_SECURITY_CONFIG_INFO_COUNT; + kr = task_info(mach_task_self(), TASK_SECURITY_CONFIG_INFO, (task_info_t)&config, &count); + T_ASSERT_MACH_SUCCESS(kr, "task_info(TASK_SECURITY_CONFIG_INFO)"); + T_ASSERT_EQ(count, 1, "security config should return 1 value"); + + struct task_security_config *conf = (struct task_security_config*)&config; + + T_EXPECT_TRUE(conf->tpro, "TPRO bit should be set"); + + T_EXPECT_FALSE(conf->reserved, "reserved bit should not be set"); + T_EXPECT_FALSE(conf->hardened_heap, "hardened_heap bit should not be set"); + uint8_t vers = conf->platform_restrictions_version; + T_EXPECT_EQ_UINT(vers, 0, "Platform restrictions version should be 0"); + + count = TASK_IPC_SPACE_POLICY_INFO_COUNT; + kr = task_info(mach_task_self(), TASK_IPC_SPACE_POLICY_INFO, (task_info_t)&space_info, &count); + T_ASSERT_MACH_SUCCESS(kr, "task_info(TASK_SECURITY_CONFIG_INFO)"); + T_ASSERT_EQ_UINT(count, 1, "ipc space should return 1 value"); + + T_EXPECT_FALSE(space_info.space_policy & 0x100, "enhanced V0 bit should 
not be set"); + T_EXPECT_FALSE(space_info.space_policy & 0x200, "enhanced V1 bit should not be set"); + T_EXPECT_FALSE(space_info.space_policy & 0x400, "enhanced V2 bit should not be set"); +} diff --git a/tests/ipv6_bind_race.c b/tests/ipv6_bind_race.c index 1deaaf312..85c16a77e 100644 --- a/tests/ipv6_bind_race.c +++ b/tests/ipv6_bind_race.c @@ -39,6 +39,8 @@ #include #include +#include "net_test_lib.h" + /* * The test is disabled on platforms that could be limited in term of CPU * or memory because this stress test that cycles rapidly through a lot of socket @@ -454,6 +456,8 @@ do_bind_race(bool do_test_tcp, void *(*leader)(void *), void *(*racer)(void *)) pthread_join(runner1, 0); pthread_join(runner2, 0); + + force_zone_gc(); } T_DECL(ipv6_tcp_bind6_bind4_race, "race bind calls with TCP sockets") diff --git a/tests/kern-trial.entitlements b/tests/kern-trial.entitlements new file mode 100644 index 000000000..a74975f6b --- /dev/null +++ b/tests/kern-trial.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.kernel.read-write-trial-experiment-factors + + + diff --git a/tests/ktrace/kdebug_tests.c b/tests/ktrace/kdebug_tests.c index cd14f2d20..cb6f65f90 100644 --- a/tests/ktrace/kdebug_tests.c +++ b/tests/ktrace/kdebug_tests.c @@ -1285,7 +1285,8 @@ T_DECL(round_trips, * heart-beat. */ T_DECL(event_coverage, "ensure events appear up to the end of tracing", - T_META_TAG_VM_PREFERRED) + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(false) /* rdar://134505849 */) { start_controlling_ktrace(); @@ -1597,8 +1598,12 @@ static const char *expected_subsystems[] = { #define EXPECTED_SUBSYSTEMS_LEN \ (sizeof(expected_subsystems) / sizeof(expected_subsystems[0])) -T_DECL(early_boot_tracing, "ensure early boot strings are present", - T_META_BOOTARGS_SET("trace=1000000"), XNU_T_META_SOC_SPECIFIC, T_META_TAG_VM_NOT_ELIGIBLE) +T_DECL(early_boot_tracing, + "ensure early boot strings are present", + T_META_BOOTARGS_SET("trace=100000"), + XNU_T_META_SOC_SPECIFIC, + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_ENABLED(false) /* rdar://149654502 */) { T_ATEND(reset_ktrace); @@ -1726,7 +1731,8 @@ sighandler(int sig) T_DECL(instrs_and_cycles_on_proc_exit, "instructions and cycles should be traced on thread exit", T_META_REQUIRES_SYSCTL_EQ("kern.monotonic.supported", 1), - T_META_TAG_VM_NOT_ELIGIBLE) + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_ENABLED(false) /* rdar://134505849 */) { T_SETUPBEGIN; start_controlling_ktrace(); @@ -1896,3 +1902,54 @@ T_DECL(instrs_and_cycles_on_thread_exit, dispatch_main(); } + +T_DECL(direct_file_writing, "ensure direct file writes work correctly", + T_META_TAG_VM_PREFERRED) +{ + start_controlling_ktrace(); + + T_SETUPBEGIN; + char trace_file_path[MAXPATHLEN] = "direct_file.bin"; + int error = dt_resultfile(trace_file_path, sizeof(trace_file_path)); + T_QUIET; T_ASSERT_POSIX_ZERO(error, "dt_resultfile"); + T_LOG("directly kdebug to file at %s", trace_file_path); + + int fd = open(trace_file_path, O_CREAT | O_TRUNC | O_RDWR); + T_QUIET; T_ASSERT_POSIX_SUCCESS(fd, "open and create trace file"); + + int mib[4] = { CTL_KERN, KERN_KDEBUG }; + mib[2] = KERN_KDSETBUF; mib[3] = WRAPPING_EVENTS_COUNT; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDSETBUF"); + + mib[2] = KERN_KDSETUP; mib[3] = 0; + size_t needed = 0; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, NULL, &needed, NULL, 0), + "KERN_KDSETUP"); + + mib[2] = KERN_KDENABLE; mib[3] = 1; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDENABLE"); + T_SETUPEND; + + sleep(1); + + mib[2] = KERN_KDWRITEMAP; mib[3] = fd; 
+ T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDWRITEMAP"); + + mib[2] = KERN_KDWRITETR; mib[3] = fd; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDWRITETR"); + + close(fd); + + ktrace_file_t trace_file = ktrace_file_open(trace_file_path, NULL); + T_WITH_ERRNO; T_ASSERT_NOTNULL(trace_file, "can open file as a trace file"); + + uint64_t earliest_timestamp = 0; + error = ktrace_file_earliest_timestamp(trace_file, &earliest_timestamp); + T_QUIET; + T_ASSERT_POSIX_ZERO(error, "read earliest event timestamp from file"); + T_QUIET; + T_EXPECT_NE(earliest_timestamp, 0ULL, "earliest event timestamp is valid"); + + ktrace_file_close(trace_file); + T_PASS("trace file appears usable"); +} diff --git a/tests/ledger_entry_info_v2.c b/tests/ledger_entry_info_v2.c new file mode 100644 index 000000000..3dd009340 --- /dev/null +++ b/tests/ledger_entry_info_v2.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include + +#include "../osfmk/kern/ledger.h" +extern int ledger(int cmd, caddr_t arg1, caddr_t arg2, caddr_t arg3); + +T_DECL(ledger_entry_v2, + "test the LEDGER_ENTRY_INFO_V2 command of ledger() syscal", + T_META_LTEPHASE(LTE_POSTINIT), + T_META_OWNER("skwok2"), + T_META_TAG_VM_PREFERRED) +{ + struct ledger_info li; + int64_t ledger_count; + struct ledger_entry_info_v2 *lei_v2 = NULL; + bool retrieved_lifetime_max = false; + size_t malloc_size = 0; + + T_QUIET; T_ASSERT_EQ(ledger(LEDGER_INFO, + (caddr_t)(uintptr_t)getpid(), + (caddr_t)&li, + NULL), + 0, + "ledger(LEDGER_INFO)"); + + ledger_count = li.li_entries; + T_QUIET; T_ASSERT_GT(ledger_count, 0, "no ledger entry available"); + + malloc_size = (size_t)ledger_count * sizeof(struct ledger_entry_info_v2); + lei_v2 = (struct ledger_entry_info_v2 *)malloc(malloc_size); + T_QUIET; T_ASSERT_NE(lei_v2, NULL, "malloc(ledger_entry_info_v2) of size %u", malloc_size); + + + T_ASSERT_GE(ledger(LEDGER_ENTRY_INFO_V2, + (caddr_t)(uintptr_t)getpid(), + (caddr_t)lei_v2, + (caddr_t)&ledger_count), + 0, + "ledger(LEDGER_ENTRY_INFO_V2)"); + + for (int i = 0; i < ledger_count; i++) { + if (lei_v2[i].lei_lifetime_max != -1) { + retrieved_lifetime_max = true; + break; + } + } + + free(lei_v2); + + if (retrieved_lifetime_max) { + T_PASS("successfully retrieved at least one entry which support lifetime max"); + } else { + T_FAIL("couldn't read any lifetime max value"); + } +} diff --git a/tests/libmalloc_apple_array.c b/tests/libmalloc_apple_array.c index d04948086..beb10a114 100644 --- a/tests/libmalloc_apple_array.c +++ b/tests/libmalloc_apple_array.c @@ -179,7 +179,7 @@ get_apple_array(size_t *num_array_entries, const char * filename) #define HARDENED_RUNTIME_KEY "HardenedRuntime=" -#define HARDENED_HEAP_KEY "hardened_heap=" +#define SECURITY_CONFIG_KEY "security_config=" /* @@ -208,15 +208,15 @@ get_apple_array_key(char **apple_array, size_t num_array_entries, uint64_t *fact /* libmalloc relies on these values not changing. 
If they change, * you need to update the values in that project as well */ -__options_decl(HR_flags_t, uint32_t, { +__options_decl(hardened_browser_flags_t, uint32_t, { BrowserHostEntitlementMask = 0x01, BrowserGPUEntitlementMask = 0x02, BrowserNetworkEntitlementMask = 0x04, BrowserWebContentEntitlementMask = 0x08, }); -T_DECL(libmalloc_hardened_binary_present, - "hardened binary flags show up in apple array", +T_DECL(libmalloc_hardened_browser_present, + "platform restrictions binary flags show up in apple array", T_META_ASROOT(false)) { uint64_t apple_array_val = 0; @@ -241,8 +241,11 @@ T_DECL(libmalloc_hardened_binary_present, free(apple_array); } -T_DECL(libmalloc_hardened_heap_entitlements, - "hardened heap enablement via hardened process and hardened heap entitlements", +#define SECURITY_CONFIG_HARDENED_HEAP_ENTRY (0x01) +#define SECURITY_CONFIG_TPRO_ENTRY (0x02) + +T_DECL(libmalloc_security_config_hardened_heap_entitlements, + "parse security_config values to verify security configs hardened_heap enablement/disablement", T_META_ASROOT(false)) { uint64_t apple_array_val = 0; @@ -250,33 +253,33 @@ T_DECL(libmalloc_hardened_heap_entitlements, char **apple_array; bool found = false; - uint32_t mask_val = 1; apple_array = get_apple_array(&num_array_entries, "tools/print_apple_array_hardened_proc"); - found = get_apple_array_key(apple_array, num_array_entries, &apple_array_val, HARDENED_HEAP_KEY); - T_ASSERT_FALSE(found, "Didn't find " HARDENED_HEAP_KEY " in apple array"); - free(apple_array); + found = get_apple_array_key(apple_array, num_array_entries, &apple_array_val, SECURITY_CONFIG_KEY); + T_ASSERT_TRUE(found, "Found " SECURITY_CONFIG_KEY " in apple array"); - apple_array = get_apple_array(&num_array_entries, "tools/print_apple_array_hardened_heap_disable"); - found = get_apple_array_key(apple_array, num_array_entries, &apple_array_val, HARDENED_HEAP_KEY); - T_ASSERT_FALSE(found, "Didn't find " HARDENED_HEAP_KEY " in apple array"); + /* Let's start parsing the security config, to see what's enabled. 
*/ + T_EXPECT_FALSE(apple_array_val & SECURITY_CONFIG_HARDENED_HEAP_ENTRY, "Hardened-heap is disabled"); free(apple_array); apple_array = get_apple_array(&num_array_entries, "tools/print_apple_array_hardened_heap"); - found = get_apple_array_key(apple_array, num_array_entries, &apple_array_val, HARDENED_HEAP_KEY); - T_ASSERT_TRUE(found, "Found " HARDENED_HEAP_KEY " in apple array"); - T_ASSERT_EQ(apple_array_val, mask_val, "Bitmask value matches"); + found = get_apple_array_key(apple_array, num_array_entries, &apple_array_val, SECURITY_CONFIG_KEY); + T_ASSERT_TRUE(found, "Found " SECURITY_CONFIG_KEY " in apple array"); + + T_EXPECT_TRUE(apple_array_val & SECURITY_CONFIG_HARDENED_HEAP_ENTRY, "Hardened-heap is enabled"); free(apple_array); - apple_array = get_apple_array(&num_array_entries, "tools/print_apple_array_hardened_proc_all_subfeatures"); - found = get_apple_array_key(apple_array, num_array_entries, &apple_array_val, HARDENED_HEAP_KEY); - T_ASSERT_TRUE(found, "Found " HARDENED_HEAP_KEY " in apple array"); - T_ASSERT_EQ(apple_array_val, mask_val, "Bitmask value matches"); + /* Verify that the same config is mirrored with the com.apple.security namespace */ + apple_array = get_apple_array(&num_array_entries, "tools/print_apple_array_hardened_heap_security"); + found = get_apple_array_key(apple_array, num_array_entries, &apple_array_val, SECURITY_CONFIG_KEY); + T_ASSERT_TRUE(found, "Found " SECURITY_CONFIG_KEY " in apple array"); + + T_EXPECT_TRUE(apple_array_val & SECURITY_CONFIG_HARDENED_HEAP_ENTRY, "Hardened-heap is enabled"); free(apple_array); } -T_DECL(libmalloc_hardened_binary_absent, - "hardened binary flags do not show up in apple array for normal third party processes", +T_DECL(libmalloc_hardened_browser_absent, + "platform restrictions binary flags do not show up in apple array for normal third party processes", T_META_ASROOT(false)) { uint64_t new_val, apple_array_val = 0; diff --git a/tests/mach_eventlink.c b/tests/mach_eventlink.c index bac4879d1..44abf42c5 100644 --- a/tests/mach_eventlink.c +++ b/tests/mach_eventlink.c @@ -2,35 +2,21 @@ * mach eventlink: Tests mach eventlink kernel synchronization primitive. */ -#include -#include - -#include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include #include +#include +#include -T_GLOBAL_META(T_META_NAMESPACE("xnu.mach_eventlink"), - T_META_RUN_CONCURRENTLY(true)); +#include +#include "sched/sched_test_utils.h" + +T_GLOBAL_META(T_META_NAMESPACE("xnu.mach_eventlink")); static int g_loop_iterations = 100000; +static semaphore_t g_sem_done = SEMAPHORE_NULL; static kern_return_t test_eventlink_create(mach_port_t *port_pair) @@ -271,6 +257,11 @@ test_eventlink_wait_then_signal_loop(void *arg) kr = mach_eventlink_signal(eventlink_port, 0); T_ASSERT_MACH_SUCCESS(kr, "mach_eventlink_signal"); + if (g_sem_done != SEMAPHORE_NULL) { + kr = semaphore_wait(g_sem_done); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait(g_sem_done)"); + } + return NULL; } @@ -279,7 +270,8 @@ test_eventlink_wait_then_signal_loop(void *arg) * * Calls eventlink creates which returns a pair of eventlink port objects. 
*/ -T_DECL(test_eventlink_create, "eventlink create test", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_create, "eventlink create test", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -299,7 +291,8 @@ T_DECL(test_eventlink_create, "eventlink create test", T_META_ASROOT(YES), T_MET * Calls eventlink creates which returns a pair of eventlink port objects. * Calls eventlink destroy on eventlink port pair. */ -T_DECL(test_eventlink_destroy, "eventlink destroy test", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_destroy, "eventlink destroy test", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -321,7 +314,8 @@ T_DECL(test_eventlink_destroy, "eventlink destroy test", T_META_ASROOT(YES), T_M * Create eventlink object pair and associate threads to each side and then * disassociate threads and check for error conditions. */ -T_DECL(test_eventlink_associate, "eventlink associate test", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_associate, "eventlink associate test", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -397,7 +391,8 @@ T_DECL(test_eventlink_associate, "eventlink associate test", T_META_ASROOT(YES), * * Create an eventlink object, associate threads and test eventlink wait with timeout. */ -T_DECL(test_eventlink_wait_timeout, "eventlink wait timeout test", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_timeout, "eventlink wait timeout test", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -424,7 +419,8 @@ T_DECL(test_eventlink_wait_timeout, "eventlink wait timeout test", T_META_ASROOT * * Create an eventlink object, associate threads and test eventlink wait with no wait flag. */ -T_DECL(test_eventlink_wait_no_wait, "eventlink wait no wait test", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_no_wait, "eventlink wait no wait test", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -448,7 +444,8 @@ T_DECL(test_eventlink_wait_no_wait, "eventlink wait no wait test", T_META_ASROOT * * Create an eventlink object, associate threads and destroy the port. */ -T_DECL(test_eventlink_wait_and_destroy, "eventlink wait and destroy", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_and_destroy, "eventlink wait and destroy", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -482,7 +479,8 @@ T_DECL(test_eventlink_wait_and_destroy, "eventlink wait and destroy", T_META_ASR * * Create an eventlink object, associate threads, wait and destroy the remote eventlink port. */ -T_DECL(test_eventlink_wait_and_destroy_remote, "eventlink wait and remote destroy", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_and_destroy_remote, "eventlink wait and remote destroy", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -515,7 +513,8 @@ T_DECL(test_eventlink_wait_and_destroy_remote, "eventlink wait and remote destro * * Create an eventlink object, associate threads, wait and deallocate the eventlink port. 
*/ -T_DECL(test_eventlink_wait_and_deallocate, "eventlink wait and deallocate", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_and_deallocate, "eventlink wait and deallocate", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -543,7 +542,8 @@ T_DECL(test_eventlink_wait_and_deallocate, "eventlink wait and deallocate", T_ME * * Create an eventlink object, associate threads, wait and disassociate thread from the eventlink port. */ -T_DECL(test_eventlink_wait_and_disassociate, "eventlink wait and disassociate", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_and_disassociate, "eventlink wait and disassociate", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -573,7 +573,8 @@ T_DECL(test_eventlink_wait_and_disassociate, "eventlink wait and disassociate", * * Create an eventlink object, associate threads and test wait signal. */ -T_DECL(test_eventlink_wait_and_signal, "eventlink wait and signal", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_and_signal, "eventlink wait and signal", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -608,7 +609,8 @@ T_DECL(test_eventlink_wait_and_signal, "eventlink wait and signal", T_META_ASROO * * Create an eventlink object, associate threads and test wait_signal. */ -T_DECL(test_eventlink_wait_signal, "eventlink wait_signal", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_signal, "eventlink wait_signal", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -648,7 +650,8 @@ T_DECL(test_eventlink_wait_signal, "eventlink wait_signal", T_META_ASROOT(YES), * * Create an eventlink object, associate threads and test wait_signal with no wait. */ -T_DECL(test_eventlink_wait_signal_no_wait, "eventlink wait_signal with no wait", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_signal_no_wait, "eventlink wait_signal with no wait", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -688,7 +691,8 @@ T_DECL(test_eventlink_wait_signal_no_wait, "eventlink wait_signal with no wait", * * Create an eventlink object, associate threads and test wait_signal with prepost. */ -T_DECL(test_eventlink_wait_signal_prepost, "eventlink wait_signal with prepost", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_signal_prepost, "eventlink wait_signal with prepost", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -728,7 +732,8 @@ T_DECL(test_eventlink_wait_signal_prepost, "eventlink wait_signal with prepost", * * Create an eventlink object, set associate on wait on one side and test wait_signal. 
*/ -T_DECL(test_eventlink_wait_signal_associate_on_wait, "eventlink wait_signal associate on wait", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_signal_associate_on_wait, "eventlink wait_signal associate on wait", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -777,7 +782,8 @@ T_DECL(test_eventlink_wait_signal_associate_on_wait, "eventlink wait_signal asso * * Create an eventlink object, associate threads and test wait_signal in a loop. */ -T_DECL(test_eventlink_wait_signal_loop, "eventlink wait_signal in loop", T_META_ASROOT(YES), T_META_TAG_VM_PREFERRED) +T_DECL(test_eventlink_wait_signal_loop, "eventlink wait_signal in loop", + T_META_ASROOT(true), T_META_RUN_CONCURRENTLY(true), T_META_TAG_VM_PREFERRED) { kern_return_t kr; mach_port_t port_pair[2]; @@ -814,14 +820,6 @@ T_DECL(test_eventlink_wait_signal_loop, "eventlink wait_signal in loop", T_META_ } -static uint64_t -nanos_to_abs(uint64_t nanos) -{ - static mach_timebase_info_data_t timebase_info; - mach_timebase_info(&timebase_info); - return nanos * timebase_info.denom / timebase_info.numer; -} - static const uint64_t DEFAULT_INTERVAL_NS = 15000000; // 15 ms static void @@ -850,6 +848,8 @@ static _Atomic bool suspend_resume_thread_stop = false; static void * test_suspend_resume_thread(void *arg) { + T_ASSERT_NE(g_sem_done, SEMAPHORE_NULL, "g_sem_done should be initialized"); + uint64_t count = 0; mach_port_t suspend_resume_other_thread_port = (mach_port_t) (uintptr_t)arg; kern_return_t kr1 = KERN_SUCCESS, kr2 = KERN_SUCCESS; @@ -860,24 +860,42 @@ test_suspend_resume_thread(void *arg) count++; } - T_ASSERT_MACH_SUCCESS(kr1, "thread_suspend #%lld", count); - T_ASSERT_MACH_SUCCESS(kr2, "thread_resume #%lld", count); + T_LOG("thread suspend/resume count: %llu", count); + T_EXPECT_MACH_SUCCESS(kr1, "thread_suspend"); + T_EXPECT_MACH_SUCCESS(kr2, "thread_resume"); + + /* Signal that it is now safe to exit the thread under test. */ + int kr = semaphore_signal(g_sem_done); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "semaphore_signal(g_sem_done)"); return NULL; } /* * Test 16: Test suspension of a thread in the middle of a wait-signal operation - * rdar://120761588 rdar://123887338 + * Also tests that the eventlink signal leads to a real handoff. 
+ * rdar://120761588 rdar://123887338 rdar://138657435 */ -T_DECL(test_eventlink_wait_signal_suspend_loop, "eventlink wait_signal + thread_suspend/resume in loop", T_META_ASROOT(YES)) -{ +T_DECL(test_eventlink_wait_signal_suspend_loop, "eventlink wait_signal + thread_suspend/resume in loop", + T_META_RUN_CONCURRENTLY(false), /* Test uses global handoff counter */ + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(!TARGET_OS_VISION) /* */ + ) { kern_return_t kr; mach_port_t port_pair[2]; pthread_t pthread, suspend_thread; mach_port_t self = mach_thread_self(); uint64_t count = 0; int i; + uint64_t handoffs_start, handoffs_end; + + size_t handoffs_start_size = sizeof(handoffs_start); + kr = sysctlbyname("kern.mach_eventlink_handoff_success_count", &handoffs_start, &handoffs_start_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kr, "sysctlbyname(kern.mach_eventlink_handoff_success_count)"); + T_LOG("handoffs_start: %llu", handoffs_start); + + kr = semaphore_create(mach_task_self(), &g_sem_done, SYNC_POLICY_FIFO, 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create"); /* Create an eventlink and associate threads to it */ kr = test_eventlink_create(port_pair); @@ -914,6 +932,20 @@ T_DECL(test_eventlink_wait_signal_suspend_loop, "eventlink wait_signal + thread_ pthread_join(suspend_thread, NULL); pthread_join(pthread, NULL); - mach_port_deallocate(mach_task_self(), port_pair[0]); - mach_port_deallocate(mach_task_self(), port_pair[1]); + kr = mach_port_deallocate(mach_task_self(), port_pair[0]); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_deallocate"); + kr = mach_port_deallocate(mach_task_self(), port_pair[1]); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_deallocate"); + kr = semaphore_destroy(mach_task_self(), g_sem_done); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_destroy"); + + size_t handoffs_end_size = sizeof(handoffs_end); + kr = sysctlbyname("kern.mach_eventlink_handoff_success_count", &handoffs_end, &handoffs_end_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kr, "sysctlbyname(kern.mach_eventlink_handoff_success_count)"); + T_LOG("handoffs_end: %llu", handoffs_end); + + T_QUIET; T_ASSERT_GE(handoffs_end, handoffs_start, "kern.mach_eventlink_handoff_success_count did not overflow"); + const uint64_t successful_handoffs = handoffs_end - handoffs_start; + const uint64_t min_handoffs = MAX(2 * g_loop_iterations, 2) - 2; + T_EXPECT_GE(successful_handoffs, min_handoffs, "found at least %llu handoffs", min_handoffs); } diff --git a/tests/mach_service_port.c b/tests/mach_service_port.c index 168756670..ac71bd2e0 100644 --- a/tests/mach_service_port.c +++ b/tests/mach_service_port.c @@ -50,8 +50,6 @@ service_port_get_throttled(int *is_throttled) T_DECL(mach_service_port, "Create a port with a service port label", T_META_CHECK_LEAKS(false)) { mach_port_t connection_port; - mach_port_t notify_port; - mach_port_t previous; uint64_t fpid = 0; boolean_t is_throttled; @@ -78,23 +76,7 @@ T_DECL(mach_service_port, "Create a port with a service port label", T_META_CHEC T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct %u", connection_port); kr = mach_port_is_connection_for_service(mach_task_self(), connection_port, service_port, &fpid); - if (kr != KERN_SUCCESS || kr != KERN_NOT_SUPPORTED) { - T_LOG("mach_port_is_connection_for_service kr = %d, fpid = %llu", kr, fpid); - } - - // notification port for the service port to come back on - kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &notify_port); - T_ASSERT_MACH_SUCCESS(kr, "mach_port_allocate notify_port"); - - kr =
mach_port_insert_right(mach_task_self(), notify_port, notify_port, MACH_MSG_TYPE_MAKE_SEND); - T_ASSERT_MACH_SUCCESS(kr, "mach_port_insert_right notify_port"); - - T_LOG("service port: 0x%x, notify port: 0x%x\n", service_port, notify_port); - - kr = mach_port_request_notification(mach_task_self(), service_port, MACH_NOTIFY_PORT_DESTROYED, 0, notify_port, - MACH_MSG_TYPE_MAKE_SEND_ONCE, &previous); - T_ASSERT_MACH_SUCCESS(kr, "mach_port_request_notification service_port"); - T_ASSERT_EQ(previous, MACH_PORT_NULL, "previous null"); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_is_connection_for_service"); /* Test port throttling flag */ kr = service_port_get_throttled(&is_throttled); @@ -119,12 +101,6 @@ T_DECL(mach_service_port, "Create a port with a service port label", T_META_CHEC kr = mach_port_destruct(mach_task_self(), service_port, 0, SP_CONTEXT); T_ASSERT_MACH_SUCCESS(kr, "mach_port_destruct service_port"); - /* - * Recover the service port because the port must have been destroyed and sent the notification by now - */ - kr = mach_msg_server_once(notify_server, MACH_MSG_SIZE_RELIABLE, notify_port, MACH_RCV_TIMEOUT); - T_ASSERT_MACH_SUCCESS(kr, "mach_msg_server_once notify_port"); - T_LOG("done"); } diff --git a/tests/mcast_group_race_82820812.c b/tests/mcast_group_race_82820812.c index c6d85fc06..4a1a1ac20 100644 --- a/tests/mcast_group_race_82820812.c +++ b/tests/mcast_group_race_82820812.c @@ -4,6 +4,10 @@ #include #include +#include + +#include "net_test_lib.h" + volatile static int lock_a; volatile static int lock_b; @@ -61,4 +65,6 @@ T_DECL(mcast_group_race_82820812, "Race between multicast group join operations. T_ASSERT_POSIX_ZERO(pthread_join(th, NULL), "pthread_join"); T_ASSERT_POSIX_SUCCESS(close(fd), "close"); } + + force_zone_gc(); } diff --git a/tests/mcast_ssm.c b/tests/mcast_ssm.c new file mode 100644 index 000000000..46680e16d --- /dev/null +++ b/tests/mcast_ssm.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +#include "net_test_lib.h" + +network_interface interface; + +static void +cleanup(void) +{ + network_interface_destroy(&interface); +} + +T_DECL(net_multicast_igmp_ssm, "IGMP SSM test", T_META_ASROOT(true)) +{ + int s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + + T_ATEND(cleanup); + network_interface_create(&interface, FETH_NAME); + struct in_addr addr; + addr.s_addr = inet_addr("192.168.55.1"); + struct in_addr mask; + mask.s_addr = inet_addr("255.255.255.0"); + ifnet_add_ip_address(interface.if_name, addr, mask); + + struct ip_mreq_source mr = {}; + mr.imr_sourceaddr.s_addr = inet_addr("192.168.55.2"); + mr.imr_multiaddr.s_addr = inet_addr("239.1.2.3"); + mr.imr_interface.s_addr = INADDR_ANY; + + for (int i = 0; i < 20; i++) { + mr.imr_sourceaddr.s_addr += i; + T_ASSERT_POSIX_SUCCESS(setsockopt(s, IPPROTO_IP, + IP_ADD_SOURCE_MEMBERSHIP, &mr, + sizeof(mr)), + "IP_ADD_SOURCE_MEMBERSHIP"); + } + close(s); +} + +T_DECL(net_multicast_mld_ssm, "MLD SSM test", T_META_ASROOT(true)) +{ + int s6 = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); + + T_ATEND(cleanup); + network_interface_create(&interface, FETH_NAME); + ifnet_start_ipv6(interface.if_name); + + struct sockaddr_storage group_storage = {}, source_storage = {}; + + struct sockaddr_in6 *group = (struct sockaddr_in6 *)&group_storage; + + group->sin6_family = AF_INET6; + group->sin6_len = sizeof(*group); + char address[128] = {}; + snprintf(address, sizeof(address), "ff02::1234%%%s", interface.if_name); + inet_pton(AF_INET6, address, &group->sin6_addr); + + struct sockaddr_in6 *source = (struct sockaddr_in6 *)&source_storage; + source->sin6_family = AF_INET6; + source->sin6_len = sizeof(*source); + inet_pton(AF_INET6, "2001:db8::1", &source->sin6_addr); + + struct group_source_req gr = {}; + gr.gsr_interface = interface.if_index; + gr.gsr_group = group_storage; + gr.gsr_source = source_storage; + + for (int i = 0; i < 20; i++) { + ((struct sockaddr_in6 *)&gr.gsr_source)->sin6_addr.__u6_addr.__u6_addr8[15] += i; + T_ASSERT_POSIX_SUCCESS(setsockopt(s6, IPPROTO_IPV6, + MCAST_JOIN_SOURCE_GROUP, &gr, + sizeof(gr)), + "MCAST_JOIN_SOURCE_GROUP"); + } +} diff --git a/tests/memorystatus_is_assertion.c b/tests/memorystatus_is_assertion.c index 2e3cc42d3..02f96ea1b 100644 --- a/tests/memorystatus_is_assertion.c +++ b/tests/memorystatus_is_assertion.c @@ -26,7 +26,8 @@ T_GLOBAL_META( T_META_RADAR_COMPONENT_VERSION("VM"), T_META_CHECK_LEAKS(false), T_META_RUN_CONCURRENTLY(true), - T_META_TAG_VM_PREFERRED + T_META_TAG_VM_PREFERRED, + T_META_BOOTARGS_SET("memstat_no_task_limit_increase=1") ); #define IDLE_AGEOUT_S 30 diff --git a/tests/microstackshot_tests.c b/tests/microstackshot_tests.c index bf7665e21..59b008b79 100644 --- a/tests/microstackshot_tests.c +++ b/tests/microstackshot_tests.c @@ -312,18 +312,10 @@ T_DECL(excessive_sampling, "ensure that microstackshots are not being sampled too frequently", T_META_REQUIRES_SYSCTL_EQ("kern.monotonic.supported", 1), T_META_TAG_VM_NOT_ELIGIBLE) { - unsigned int interrupt_sample_rate = 0; - size_t sysctl_size = sizeof(interrupt_sample_rate); - T_QUIET; - T_ASSERT_POSIX_SUCCESS(sysctlbyname( - "kern.microstackshot.interrupt_sample_rate", - &interrupt_sample_rate, &sysctl_size, NULL, 0), - "query interrupt sample rate"); unsigned int pmi_counter = 0; uint64_t pmi_period = 0; (void)query_pmi_params(&pmi_counter, &pmi_period); - T_LOG("interrupt sample rate: %uHz", interrupt_sample_rate); T_LOG("PMI counter: %u", pmi_counter); 
T_LOG("PMI period: %llu", pmi_period); diff --git a/tests/mktimer_kobject.c b/tests/mktimer_kobject.c deleted file mode 100644 index db69d9e0a..000000000 --- a/tests/mktimer_kobject.c +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include - -#include -#include - -#include - -T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true), - T_META_NAMESPACE("xnu.ipc"), - T_META_RADAR_COMPONENT_NAME("xnu"), - T_META_RADAR_COMPONENT_VERSION("IPC")); - -T_DECL(mktimer_kobject, "mktimer_kobject()", T_META_ALL_VALID_ARCHS(true), T_META_IGNORECRASHES(".*mktimer_kobject.*"), T_META_TAG_VM_PREFERRED) -{ - mach_port_t timer_port = MACH_PORT_NULL; - mach_port_t notify_port = MACH_PORT_NULL; - - kern_return_t kr = KERN_SUCCESS; - task_exc_guard_behavior_t old, new; - - /* - * Disable [optional] Mach port guard exceptions to avoid fatal crash - */ - kr = task_get_exc_guard_behavior(mach_task_self(), &old); - T_ASSERT_MACH_SUCCESS(kr, "task_get_exc_guard_behavior"); - new = (old & ~TASK_EXC_GUARD_MP_DELIVER); - kr = task_set_exc_guard_behavior(mach_task_self(), new); - T_ASSERT_MACH_SUCCESS(kr, "task_set_exc_guard_behavior new"); - - /* - * timer port - * This is a receive right which is also a kobject - */ - timer_port = mk_timer_create(); - T_ASSERT_NE(timer_port, (mach_port_t)MACH_PORT_NULL, "mk_timer_create: %s", mach_error_string(kr)); - - mach_port_set_context(mach_task_self(), timer_port, (mach_port_context_t) 0x1); - T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_set_context(timer_port): %s", mach_error_string(kr)); - - /* notification port for the mk_timer port to come back on */ - kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &notify_port); - T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_allocate(notify_port): %s", mach_error_string(kr)); - - kr = mach_port_set_context(mach_task_self(), notify_port, (mach_port_context_t) 0x2); - T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_set_context(notify_port): %s", mach_error_string(kr)); - - T_LOG("timer: 0x%x, notify: 0x%x", timer_port, notify_port); - - /* - * This code generates a mach port guard exception and should be tested with an exception catcher. - * Will be updated in - */ - mach_port_t previous = MACH_PORT_NULL; - - /* request a port-destroyed notification on the timer port */ - kr = mach_port_request_notification(mach_task_self(), timer_port, MACH_NOTIFY_PORT_DESTROYED, - 0, notify_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &previous); - /* this will ordinarily fail with a guard exception! */ - T_ASSERT_MACH_ERROR(kr, KERN_INVALID_RIGHT, "notifications should NOT work on mk_timer ports!"); - - /* restore the old guard behavior */ - kr = task_set_exc_guard_behavior(mach_task_self(), old); - T_ASSERT_MACH_SUCCESS(kr, "task_set_exc_guard_behavior old"); - - T_LOG("done"); -} diff --git a/tests/net_bridge.c b/tests/net_bridge.c index af22a8014..b7358e24c 100644 --- a/tests/net_bridge.c +++ b/tests/net_bridge.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024 Apple Inc. All rights reserved. + * Copyright (c) 2019-2025 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1993,10 +1993,8 @@ static void mac_nat_test_arp_in(switch_port_list_t port_list) { u_int i; - struct in_addr ip_src; switch_port_t port; - ip_src = get_external_ipv4_address(); for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { if (port->mac_nat) { continue; @@ -2256,7 +2254,6 @@ validate_mac_nat_nd6_out(switch_port_t port, const ether_header_t * eh_p, static void mac_nat_test_nd6_out(switch_port_list_t port_list) { - ether_addr_t * ext_mac; switch_port_t ext_port; u_int i; union ifbrip ip_dst; @@ -2266,7 +2263,6 @@ mac_nat_test_nd6_out(switch_port_list_t port_list) ext_port = port_list->list; T_QUIET; T_ASSERT_TRUE(ext_port->mac_nat, NULL); - ext_mac = &ext_port->member_mac; for (i = 0, port = port_list->list; i < port_list->count; i++, port++) { if (port->mac_nat) { continue; @@ -2814,10 +2810,8 @@ fake_set_lro(bool enable) { int error; int lro; - size_t len; lro = (enable) ? 1 : 0; - len = sizeof(fake_bsd_mode); error = sysctlbyname("net.link.fake.lro", NULL, 0, &lro, sizeof(lro)); T_ASSERT_EQ(error, 0, "sysctl net.link.fake.lro %d", lro); diff --git a/tests/net_siocdifaddr.c b/tests/net_siocdifaddr.c new file mode 100644 index 000000000..3064de784 --- /dev/null +++ b/tests/net_siocdifaddr.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * net_siocdifaddr.c + * - verify that SIOCDIFADDR succeeds + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "net_test_lib.h" + +T_GLOBAL_META(T_META_NAMESPACE("xnu.net"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("networking"), + T_META_ASROOT(true)); + +static char ifname[IF_NAMESIZE]; + +static void +fake_set_fail_ioctl(bool fail) +{ + int error; + int val; + + val = fail ? 
1 : 0; +#define FAKE_FAIL_IOCTL "net.link.fake.fail_ioctl" + error = sysctlbyname(FAKE_FAIL_IOCTL, NULL, 0, + &val, sizeof(val)); + T_ASSERT_EQ(error, 0, FAKE_FAIL_IOCTL " %d", val); +} + +static void +test_cleanup(void) +{ + if (ifname[0] != '\0') { + (void)ifnet_destroy(ifname, false); + T_LOG("ifnet_destroy %s", ifname); + } + fake_set_fail_ioctl(false); +} + +static void +sigint_cleanup(__unused int sig) +{ + signal(SIGINT, SIG_DFL); + test_cleanup(); +} + +static void +test_siocdifaddr(void) +{ + struct in_addr addr; + int error; + struct in_addr mask; + + addr.s_addr = htonl(IN_LINKLOCALNETNUM + 1); + mask.s_addr = htonl(IN_CLASSB_NET); + + signal(SIGINT, sigint_cleanup); + T_ATEND(test_cleanup); + + strlcpy(ifname, FETH_NAME, sizeof(ifname)); + error = ifnet_create_2(ifname, sizeof(ifname)); + if (error != 0) { + ifname[0] = '\0'; + T_FAIL("ifnet_create_2 %s", FETH_NAME); + } + fake_set_fail_ioctl(true); + ifnet_add_ip_address(ifname, addr, mask); + ifnet_remove_ip_address(ifname, addr, mask); +} + +T_DECL(siocdifaddr, + "Verify SIOCDIFADDR succeeds when interface returns failure", + T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) +{ + test_siocdifaddr(); +} diff --git a/tests/net_test_lib.c b/tests/net_test_lib.c index 228ed4351..368aee5b8 100644 --- a/tests/net_test_lib.c +++ b/tests/net_test_lib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024 Apple Inc. All rights reserved. + * Copyright (c) 2019-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -37,7 +37,7 @@ #define RTM_BUFLEN (sizeof(struct rt_msghdr) + 6 * SOCK_MAXADDRLEN) #define ROUNDUP(a) \ -((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long)) +((a) > 0 ? (1 + (((a) - 1) | (sizeof(uint32_t) - 1))) : sizeof(uint32_t)) bool G_debug; @@ -63,7 +63,6 @@ siocll_start(int s, const char * ifname) result = ioctl(s, SIOCLL_START, &ifra_in6); T_QUIET; T_ASSERT_POSIX_SUCCESS(result, "SIOCLL_START %s", ifname); - return; } static void @@ -92,7 +91,6 @@ nd_flags_set(int s, const char * if_name, "SIOCSIFINFO_FLAGS(%s) 0x%x", if_name, nd.ndi.flags); } - return; } @@ -106,15 +104,17 @@ siocprotoattach_in6(int s, const char * name) strncpy(ifra.ifra_name, name, sizeof(ifra.ifra_name)); result = ioctl(s, SIOCPROTOATTACH_IN6, &ifra); T_ASSERT_POSIX_SUCCESS(result, "SIOCPROTOATTACH_IN6(%s)", name); - return; } static void -siocaifaddr(int s, char *ifname, struct in_addr addr, struct in_addr mask) +sioc_a_or_d_ifaddr(int s, char *ifname, struct in_addr addr, struct in_addr mask, + bool add) { struct ifaliasreq ifra; char ntopbuf_ip[INET_ADDRSTRLEN]; char ntopbuf_mask[INET_ADDRSTRLEN]; + unsigned long request; + const char * request_str; int ret; struct sockaddr_in * sin; @@ -131,12 +131,30 @@ siocaifaddr(int s, char *ifname, struct in_addr addr, struct in_addr mask) sin->sin_family = AF_INET; sin->sin_addr = mask; - ret = ioctl(s, SIOCAIFADDR, &ifra); + if (add) { + request = SIOCAIFADDR; + request_str = "SIOCAIFADDR"; + } else { + request = SIOCDIFADDR; + request_str = "SIOCDIFADDR"; + } + ret = ioctl(s, request, &ifra); inet_ntop(AF_INET, &addr, ntopbuf_ip, sizeof(ntopbuf_ip)); inet_ntop(AF_INET, &sin->sin_addr, ntopbuf_mask, sizeof(ntopbuf_mask)); - T_ASSERT_POSIX_SUCCESS(ret, "SIOCAIFADDR %s %s %s", + T_ASSERT_POSIX_SUCCESS(ret, "%s %s %s %s", request_str, ifname, ntopbuf_ip, ntopbuf_mask); - return; +} + +static void +siocaifaddr(int s, char *ifname, struct in_addr addr, struct in_addr mask) +{ + sioc_a_or_d_ifaddr(s, ifname, addr, mask, true); +} + +static void +siocdifaddr(int s, char 
*ifname, struct in_addr addr, struct in_addr mask) +{ + sioc_a_or_d_ifaddr(s, ifname, addr, mask, false); } @@ -534,6 +552,14 @@ ifnet_add_ip_address(char *ifname, struct in_addr addr, struct in_addr mask) siocaifaddr(s, ifname, addr, mask); } +void +ifnet_remove_ip_address(char *ifname, struct in_addr addr, struct in_addr mask) +{ + int s = inet_dgram_socket_get(); + + siocdifaddr(s, ifname, addr, mask); +} + int ifnet_set_mtu(const char *ifname, int mtu) { @@ -982,3 +1008,145 @@ bridge_add_member(const char * bridge, const char * member) T_ASSERT_POSIX_SUCCESS(ret, "%s %s %s", __func__, bridge, member); return ret; } + +/* +** stolen from bootp/bootplib/util.c +** +**/ + +static int +rt_xaddrs(char * cp, const char * cplim, struct rt_addrinfo * rtinfo) +{ + int i; + struct sockaddr * sa; + + bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info)); + for (i = 0; (i < RTAX_MAX) && (cp < cplim); i++) { + if ((rtinfo->rti_addrs & (1 << i)) == 0) { + continue; + } + sa = (struct sockaddr *)cp; + if ((cp + sa->sa_len) > cplim) { + return EINVAL; + } + rtinfo->rti_info[i] = sa; + cp += ROUNDUP(sa->sa_len); + } + return 0; +} + +/** +** stolen from bootp/IPConfiguration.bproj/iputil.c +** +** inet6_addrlist_* +**/ + +#define s6_addr16 __u6_addr.__u6_addr16 + +static char * +copy_if_info(unsigned int if_index, int af, int *ret_len_p) +{ + char * buf = NULL; + size_t buf_len = 0; + int mib[6]; + + mib[0] = CTL_NET; + mib[1] = PF_ROUTE; + mib[2] = 0; + mib[3] = af; + mib[4] = NET_RT_IFLIST; + mib[5] = (int)if_index; + + *ret_len_p = 0; + if (sysctl(mib, 6, NULL, &buf_len, NULL, 0) < 0) { + T_LOG("sysctl() size failed: %s", strerror(errno)); + goto failed; + } + buf_len *= 2; /* just in case something changes */ + buf = malloc(buf_len); + if (sysctl(mib, 6, buf, &buf_len, NULL, 0) < 0) { + free(buf); + buf = NULL; + T_LOG("sysctl() failed: %s", strerror(errno)); + goto failed; + } + *ret_len_p = (int)buf_len; + +failed: + return buf; +} + +bool +inet6_get_linklocal_address(unsigned int if_index, struct in6_addr *ret_addr) +{ + char * buf = NULL; + char * buf_end; + int buf_len; + bool found = FALSE; + char *scan; + struct rt_msghdr *rtm; + + bzero(ret_addr, sizeof(*ret_addr)); + buf = copy_if_info(if_index, AF_INET6, &buf_len); + if (buf == NULL) { + goto done; + } + buf_end = buf + buf_len; + for (scan = buf; scan < buf_end; scan += rtm->rtm_msglen) { + struct ifa_msghdr * ifam; + struct rt_addrinfo info; + + /* ALIGN: buf aligned (from calling copy_if_info), scan aligned, + * cast ok. */ + rtm = (struct rt_msghdr *)(void *)scan; + T_LOG("rtm_version %d rtm_type %d", rtm->rtm_version, rtm->rtm_type); + if (rtm->rtm_version != RTM_VERSION) { + continue; + } + if (rtm->rtm_type == RTM_NEWADDR) { + errno_t error; + struct sockaddr_in6 *sin6_p; + + ifam = (struct ifa_msghdr *)rtm; + info.rti_addrs = ifam->ifam_addrs; + error = rt_xaddrs((char *)(ifam + 1), + ((char *)ifam) + ifam->ifam_msglen, + &info); + if (error) { + T_LOG("couldn't extract rt_addrinfo %s (%d)\n", + strerror(error), error); + goto done; + } + /* ALIGN: info.rti_info aligned (sockaddr), cast ok. 
*/ + sin6_p = (struct sockaddr_in6 *)(void *)info.rti_info[RTAX_IFA]; + if (sin6_p == NULL + || sin6_p->sin6_len < sizeof(struct sockaddr_in6)) { + continue; + } + if (IN6_IS_ADDR_LINKLOCAL(&sin6_p->sin6_addr)) { + *ret_addr = sin6_p->sin6_addr; + ret_addr->s6_addr16[1] = 0; /* mask scope id */ + found = TRUE; + break; + } + } + } + +done: + if (buf != NULL) { + free(buf); + } + return found; +} + +void +force_zone_gc(void) +{ + kern_return_t kr = mach_zone_force_gc(mach_host_self()); + + if (kr != KERN_SUCCESS) { + T_LOG("mach_zone_force_gc(): failed with error %s\n", mach_error_string(kr)); + } else { + T_LOG("mach_zone_force_gc(): success\n"); + } +} diff --git a/tests/net_test_lib.h b/tests/net_test_lib.h index 39e52ac00..409e0d7c9 100644 --- a/tests/net_test_lib.h +++ b/tests/net_test_lib.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024 Apple Inc. All rights reserved. + * Copyright (c) 2023-2025 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,6 +68,11 @@ #include "bpflib.h" #include "in_cksum.h" +#include +#include + +extern kern_return_t mach_zone_force_gc(host_t host); + extern bool G_debug; /* @@ -181,6 +186,8 @@ void inet_dgram_socket_close(void); extern int inet6_dgram_socket_get(void); void inet6_dgram_socket_close(void); +extern bool inet6_get_linklocal_address(unsigned int if_index, struct in6_addr *ret_addr); + extern int ifnet_create(const char * ifname); extern int ifnet_create_2(char * ifname, size_t len); @@ -201,6 +208,9 @@ extern int ifnet_set_flags(const char * ifname, extern void ifnet_add_ip_address(char *ifname, struct in_addr addr, struct in_addr mask); +extern void ifnet_remove_ip_address(char *ifname, struct in_addr addr, + struct in_addr mask); + extern int ifnet_set_mtu(const char * ifname, int mtu); extern int siocdrvspec(const char * ifname, @@ -241,4 +251,6 @@ extern bool has_ipv6_default_route(void); extern int bridge_add_member(const char * bridge, const char * member); +extern void force_zone_gc(void); + #endif /* __net_test_lib_h__ */ diff --git a/tests/net_tuntests.c b/tests/net_tuntests.c index 3c27b91cd..332a7bc6d 100644 --- a/tests/net_tuntests.c +++ b/tests/net_tuntests.c @@ -89,9 +89,7 @@ static int g_OPT_GET_CHANNEL_UUID = -1; static int g_OPT_IFNAME = -1; static char *g_CONTROL_NAME = NULL; -static int create_tunsock_old(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]); -static int create_tunsock_new(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]); -static int (*create_tunsock)(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]); +static int create_tunsock(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]); static void setup_ipsec_test(void) @@ -103,7 +101,6 @@ setup_ipsec_test(void) g_OPT_GET_CHANNEL_UUID = IPSEC_OPT_GET_CHANNEL_UUID; g_OPT_IFNAME = IPSEC_OPT_IFNAME; g_CONTROL_NAME = IPSEC_CONTROL_NAME; - create_tunsock = create_tunsock_new; g_is_ipsec_test = true; } @@ -117,7 +114,6 @@ setup_utun_test(void) g_OPT_GET_CHANNEL_UUID = UTUN_OPT_GET_CHANNEL_UUID; g_OPT_IFNAME = UTUN_OPT_IFNAME; g_CONTROL_NAME = UTUN_CONTROL_NAME; - create_tunsock = create_tunsock_old; g_is_utun_test = true; } @@ -398,159 +394,10 @@ create_sa(const char ifname[IFXNAMSIZ], uint8_t type, uint32_t spi, struct in_ad T_QUIET; T_EXPECT_EQ(slen, (ssize_t)sizeof(addcmd), NULL); } -/* This version of the test expects channels to be enabled after connect. 
- * Once the utun driver is converted, switch to create_tunsock_new - */ -static int -create_tunsock_old(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) -{ - int tunsock; - struct ctl_info kernctl_info; - struct sockaddr_ctl kernctl_addr; - uuid_t scratchuuid[channel_count]; - if (!uuid) { - uuid = scratchuuid; - } - socklen_t uuidlen; - -startover: - - T_QUIET; T_EXPECT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL); - - memset(&kernctl_info, 0, sizeof(kernctl_info)); - strlcpy(kernctl_info.ctl_name, g_CONTROL_NAME, sizeof(kernctl_info.ctl_name)); - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL); - - memset(&kernctl_addr, 0, sizeof(kernctl_addr)); - kernctl_addr.sc_len = sizeof(kernctl_addr); - kernctl_addr.sc_family = AF_SYSTEM; - kernctl_addr.ss_sysaddr = AF_SYS_CONTROL; - kernctl_addr.sc_id = kernctl_info.ctl_id; - kernctl_addr.sc_unit = 0; - - T_LOG("%s: enable_netif = %d, enable_flowswitch = %d, channel_count = %d", - __func__, enable_netif, enable_flowswitch, channel_count); - - T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, - &enable_netif, sizeof(enable_netif)), EINVAL, NULL); - T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, - &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL); - T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &channel_count, sizeof(channel_count)), EINVAL, NULL); - for (int i = 0; i < channel_count; i++) { - uuid_clear(uuid[i]); - } - uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; - T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, - uuid, &uuidlen), EINVAL, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); - for (int i = 0; i < channel_count; i++) { - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); - } - - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL); - - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, - &enable_netif, sizeof(enable_netif)), NULL); - T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, - &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL); - T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &channel_count, sizeof(channel_count)), EINVAL, NULL); - for (int i = 0; i < channel_count; i++) { - uuid_clear(uuid[i]); - } - uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; - T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, - uuid, &uuidlen), ENXIO, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); - for (int i = 0; i < channel_count; i++) { - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); - } - - int error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)); - if (error == -1 && errno == EBUSY) { - /* XXX remove this retry nonsense when this is fixed: - * creating an interface without specifying specific interface name should not return EBUSY - */ - close(tunsock); - T_LOG("connect got EBUSY, sleeping 1 second before retry"); - sleep(1); - goto startover; - } - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(error, "connect()"); 
- - T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF, - &enable_netif, sizeof(enable_netif)), EINVAL, NULL); - - if (is_netagent_enabled()) { - if (enable_netif) { - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, - &enable_flowswitch, sizeof(enable_flowswitch)), NULL); - } else { - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, - &enable_flowswitch, sizeof(enable_flowswitch)), ENOENT, NULL); - } - } else { - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH, - &enable_flowswitch, sizeof(enable_flowswitch)), ENOTSUP, NULL); - } - - if (channel_count) { - if (g_is_ipsec_test && !enable_netif) { - /* ipsec doesn't support channels without a netif */ - T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &channel_count, sizeof(channel_count)), EOPNOTSUPP, NULL); - for (int i = 0; i < channel_count; i++) { - uuid_clear(uuid[i]); - } - uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; - T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, - uuid, &uuidlen), ENXIO, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); - for (int i = 0; i < channel_count; i++) { - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); - } - } else { - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &channel_count, sizeof(channel_count)), NULL); - for (int i = 0; i < channel_count; i++) { - uuid_clear(uuid[i]); - } - uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; - T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, - uuid, &uuidlen), NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); - for (int i = 0; i < channel_count; i++) { - T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid[i]), NULL); - } - } - } else { - T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL, - &channel_count, sizeof(channel_count)), ENXIO, NULL); - for (int i = 0; i < channel_count; i++) { - uuid_clear(uuid[i]); - } - uuidlen = sizeof(uuid_t) * (unsigned int)channel_count; - T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID, - uuid, &uuidlen), ENXIO, NULL); - T_QUIET; T_EXPECT_EQ_ULONG((unsigned long)uuidlen, sizeof(uuid_t) * (unsigned long)channel_count, NULL); - for (int i = 0; i < channel_count; i++) { - T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid[i]), NULL); - } - } - - check_enables(tunsock, enable_netif, enable_flowswitch, channel_count, uuid); - - //T_LOG("Returning tunsock %d", tunsock); - - return tunsock; -} - /* This version of the test expects channels to be enabled before connect - * Once the utun driver is converted, rename this to just create_tunsock */ static int -create_tunsock_new(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) +create_tunsock(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) { int tunsock; struct ctl_info kernctl_info; @@ -695,8 +542,6 @@ startover: return tunsock; } -static int (*create_tunsock)(int enable_netif, int enable_flowswitch, int channel_count, uuid_t uuid[]) = create_tunsock_new; - #if 0 static void ipsec_stats(void) diff --git 
a/tests/nox86exec.c b/tests/nox86exec.c new file mode 100644 index 000000000..b4a9001bb --- /dev/null +++ b/tests/nox86exec.c @@ -0,0 +1,26 @@ +#include +#include +#include +#include +#include + +T_DECL(nox86exec, "make sure the nox86exec boot-arg is honored", T_META_ALL_VALID_ARCHS(false), T_META_BOOTARGS_SET("nox86exec=1")) +{ +#if TARGET_OS_OSX && defined(__arm64__) + int spawn_ret, pid; + char path[1024]; + uint32_t size = sizeof(path); + + T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL); + T_ASSERT_LT(strlcat(path, "_helper", size), (unsigned long)size, NULL); + + spawn_ret = posix_spawn(&pid, path, NULL, NULL, NULL, NULL); + if (spawn_ret == 0) { + int wait_ret = 0; + waitpid(pid, &wait_ret, 0); + T_ASSERT_FALSE(WIFEXITED(wait_ret), "x86_64 helper should not run"); + } +#else + T_SKIP("Skipping. Test only runs on arm64 macOS."); +#endif +} diff --git a/tests/nox86exec_helper.c b/tests/nox86exec_helper.c new file mode 100644 index 000000000..f3ec9f025 --- /dev/null +++ b/tests/nox86exec_helper.c @@ -0,0 +1,8 @@ +#include +#include + +T_DECL(nox86exec_helper, "x86_64 binary that nox86exec test attempts to spawn") +{ + printf("Hello, Rosetta!"); + T_SKIP("I'm just a helper, in the world. That's all that you'll let me be."); +} diff --git a/tests/nvram_tests/nvram_nonentitled.c b/tests/nvram_tests/nvram_nonentitled.c index 285851e43..b62a8de5f 100644 --- a/tests/nvram_tests/nvram_nonentitled.c +++ b/tests/nvram_tests/nvram_nonentitled.c @@ -337,4 +337,18 @@ T_DECL(TestNVRAMOblit, "Test NVRAM Obliterate") ReleaseOptionsRef(optionsRef); } + +// Clear all test variables +T_DECL(TestZ, "Clear test variables") +{ + char * varToTest = "clear-test-vars"; + + optionsRef = CreateOptionsRef(); + + TestVarOp(OP_SET, varToTest, DefaultSetVal, KERN_SUCCESS, optionsRef); + TestVarOp(OP_GET, "testNeverDel", NULL, KERN_FAILURE, optionsRef); + + ReleaseOptionsRef(optionsRef); +} + #endif /* !(__x86_64__) */ diff --git a/tests/os_refcnt.c b/tests/os_refcnt.c index ebff6fd0c..5e99dcd0c 100644 --- a/tests/os_refcnt.c +++ b/tests/os_refcnt.c @@ -371,7 +371,7 @@ T_DECL(refcnt_overflow, "Overflow") (void)os_ref_retain_raw(&rc, NULL); T_FAIL("overflow not caught"); } else { - T_ASSERT_EQ_INT(x, OSREF_RETAIN, "overflow caught"); + T_ASSERT_EQ_INT(x, OSREF_OVERFLOW, "overflow caught"); } } diff --git a/tests/pac_exception_entitlement.c b/tests/pac_exception_entitlement.c index 2994061a5..a5ef371ac 100644 --- a/tests/pac_exception_entitlement.c +++ b/tests/pac_exception_entitlement.c @@ -50,7 +50,8 @@ T_GLOBAL_META( #if __arm64e__ static size_t exception_handler(mach_port_t task __unused, mach_port_t thread __unused, - exception_type_t type __unused, mach_exception_data_t codes __unused) + exception_type_t type __unused, mach_exception_data_t codes __unused, + uint64_t exception_pc __unused) { T_ASSERT_FAIL("kernel ran exception handler instead of terminating process"); } diff --git a/tests/perf_vmfault.c b/tests/perf_vmfault.c deleted file mode 100644 index 75a6d0170..000000000 --- a/tests/perf_vmfault.c +++ /dev/null @@ -1,464 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "benchmark/helpers.h" -#include "test_utils.h" - -T_GLOBAL_META( - T_META_NAMESPACE("xnu.vm.perf"), - T_META_RADAR_COMPONENT_NAME("xnu"), - T_META_RADAR_COMPONENT_VERSION("VM"), - T_META_CHECK_LEAKS(false), - T_META_TAG_PERF - ); - -#ifdef DT_IOSMARK -#define MEMSIZE (1UL<<29) /* 512 MB */ -#else -#define MEMSIZE (1UL<<27) /* 128 MB */ -#endif - -#define VM_TAG1 100 
-#define VM_TAG2 101 - -enum { - SOFT_FAULT, - ZERO_FILL, - NUM_FAULT_TYPES -}; - -enum { - VARIANT_DEFAULT = 1, - VARIANT_SINGLE_REGION, - VARIANT_MULTIPLE_REGIONS, - NUM_MAPPING_VARIANTS -}; - -static char *variant_str[] = { - "none", - "default", - "single-region", - "multiple-regions" -}; - - -typedef struct { - char *region_addr; - char *shared_region_addr; - size_t region_len; -} memregion_config; - -static memregion_config *memregion_config_per_thread; - -static size_t pgsize; -static int num_threads; -static int ready_thread_count; -static int finished_thread_count; -static dt_stat_time_t runtime; -static pthread_cond_t start_cvar; -static pthread_cond_t threads_ready_cvar; -static pthread_cond_t threads_finished_cvar; -static pthread_mutex_t ready_thread_count_lock; -static pthread_mutex_t finished_thread_count_lock; - -static void map_mem_regions_default(int fault_type, size_t memsize); -static void map_mem_regions_single(int fault_type, size_t memsize); -static void map_mem_regions_multiple(int fault_type, size_t memsize); -static void map_mem_regions(int fault_type, int mapping_variant, size_t memsize); -static void unmap_mem_regions(int mapping_variant, size_t memsize); -static void setup_per_thread_regions(char *memblock, char *memblock_share, int fault_type, size_t memsize); -static void fault_pages(int thread_id); -static void execute_threads(void); -static void *thread_setup(void *arg); -static void run_test(int fault_type, int mapping_variant, size_t memsize); -static void setup_and_run_test(int test, int threads); - -/* Allocates memory using the default mmap behavior. Each VM region created is capped at 128 MB. */ -static void -map_mem_regions_default(int fault_type, size_t memsize) -{ - volatile char val; - vm_prot_t curprot, maxprot; - char *ptr, *memblock, *memblock_share = NULL; - - memblock = (char *)mmap(NULL, memsize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); - T_QUIET; T_ASSERT_NE((void *)memblock, MAP_FAILED, "mmap"); - - if (fault_type == SOFT_FAULT) { - /* Fault in all the pages of the original region. */ - for (ptr = memblock; ptr < memblock + memsize; ptr += pgsize) { - val = *ptr; - } - /* Remap the region so that subsequent accesses result in read soft faults. */ - T_QUIET; T_ASSERT_MACH_SUCCESS(vm_remap(mach_task_self(), (vm_address_t *)&memblock_share, - memsize, 0, VM_FLAGS_ANYWHERE, mach_task_self(), (vm_address_t)memblock, FALSE, - &curprot, &maxprot, VM_INHERIT_DEFAULT), "vm_remap"); - } - setup_per_thread_regions(memblock, memblock_share, fault_type, memsize); -} - -/* Creates a single VM region by mapping in a named memory entry. */ -static void -map_mem_regions_single(int fault_type, size_t memsize) -{ - volatile char val; - vm_prot_t curprot, maxprot; - char *ptr, *memblock = NULL, *memblock_share = NULL; - vm_size_t size = memsize; - vm_offset_t addr1 = 0; - mach_port_t mem_handle = MACH_PORT_NULL; - - /* Allocate a region and fault in all the pages. */ - T_QUIET; T_ASSERT_MACH_SUCCESS(vm_allocate(mach_task_self(), &addr1, size, VM_FLAGS_ANYWHERE), "vm_allocate"); - for (ptr = (char *)addr1; ptr < (char *)addr1 + memsize; ptr += pgsize) { - val = *ptr; - } - - /* Create a named memory entry from the region allocated above, and de-allocate said region. 
*/ - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_make_memory_entry(mach_task_self(), &size, addr1, VM_PROT_ALL | MAP_MEM_NAMED_CREATE, - &mem_handle, MACH_PORT_NULL), "mach_make_memory_entry"); - T_QUIET; T_ASSERT_MACH_SUCCESS(vm_deallocate(mach_task_self(), addr1, size), "vm_deallocate"); - - /* Map in the named entry and deallocate it. */ - T_QUIET; T_ASSERT_MACH_SUCCESS(vm_map(mach_task_self(), (vm_address_t *)&memblock, size, 0, VM_FLAGS_ANYWHERE, mem_handle, 0, - FALSE, VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_NONE), "vm_map"); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_port_deallocate(mach_task_self(), mem_handle), "mach_port_deallocate"); - - if (fault_type == SOFT_FAULT) { - /* Fault in all the pages of the original region. */ - for (ptr = memblock; ptr < memblock + memsize; ptr += pgsize) { - val = *ptr; - } - /* Remap the region so that subsequent accesses result in read soft faults. */ - T_QUIET; T_ASSERT_MACH_SUCCESS(vm_remap(mach_task_self(), (vm_address_t *)&memblock_share, - memsize, 0, VM_FLAGS_ANYWHERE, mach_task_self(), (vm_address_t)memblock, FALSE, - &curprot, &maxprot, VM_INHERIT_DEFAULT), "vm_remap"); - } - setup_per_thread_regions(memblock, memblock_share, fault_type, memsize); -} - -/* Allocates a separate VM region for each thread. */ -static void -map_mem_regions_multiple(int fault_type, size_t memsize) -{ - int i; - size_t region_len, num_pages; - volatile char val; - char *ptr, *memblock, *memblock_share; - vm_prot_t curprot, maxprot; - - num_pages = memsize / pgsize; - - for (i = 0; i < num_threads; i++) { - memblock = NULL; - - region_len = num_pages / (size_t)num_threads; - if ((size_t)i < num_pages % (size_t)num_threads) { - region_len++; - } - region_len *= pgsize; - - int fd = VM_MAKE_TAG((i % 2)? VM_TAG1 : VM_TAG2); - memblock = (char *)mmap(NULL, region_len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, fd, 0); - T_QUIET; T_ASSERT_NE((void *)memblock, MAP_FAILED, "mmap"); - memregion_config_per_thread[i].region_addr = memblock; - memregion_config_per_thread[i].shared_region_addr = 0; - memregion_config_per_thread[i].region_len = region_len; - - if (fault_type == SOFT_FAULT) { - /* Fault in all the pages of the original region. */ - for (ptr = memblock; ptr < memblock + region_len; ptr += pgsize) { - val = *ptr; - } - memblock_share = NULL; - /* Remap the region so that subsequent accesses result in read soft faults. 
*/ - T_QUIET; T_ASSERT_MACH_SUCCESS(vm_remap(mach_task_self(), (vm_address_t *)&memblock_share, - region_len, 0, VM_FLAGS_ANYWHERE, mach_task_self(), (vm_address_t)memblock, FALSE, - &curprot, &maxprot, VM_INHERIT_DEFAULT), "vm_remap"); - memregion_config_per_thread[i].shared_region_addr = memblock_share; - } - } -} - -static void -map_mem_regions(int fault_type, int mapping_variant, size_t memsize) -{ - memregion_config_per_thread = (memregion_config *)malloc(sizeof(*memregion_config_per_thread) * (size_t)num_threads); - switch (mapping_variant) { - case VARIANT_SINGLE_REGION: - map_mem_regions_single(fault_type, memsize); - break; - case VARIANT_MULTIPLE_REGIONS: - map_mem_regions_multiple(fault_type, memsize); - break; - case VARIANT_DEFAULT: - default: - map_mem_regions_default(fault_type, memsize); - } -} - -static void -setup_per_thread_regions(char *memblock, char *memblock_share, int fault_type, size_t memsize) -{ - int i; - size_t region_len, region_start, num_pages; - - num_pages = memsize / pgsize; - for (i = 0; i < num_threads; i++) { - region_len = num_pages / (size_t)num_threads; - region_start = region_len * (size_t)i; - - if ((size_t)i < num_pages % (size_t)num_threads) { - region_start += (size_t)i; - region_len++; - } else { - region_start += num_pages % (size_t)num_threads; - } - - region_start *= pgsize; - region_len *= pgsize; - - memregion_config_per_thread[i].region_addr = memblock + region_start; - memregion_config_per_thread[i].shared_region_addr = ((fault_type == SOFT_FAULT) ? - memblock_share + region_start : 0); - memregion_config_per_thread[i].region_len = region_len; - } -} - -static void -unmap_mem_regions(int mapping_variant, size_t memsize) -{ - if (mapping_variant == VARIANT_MULTIPLE_REGIONS) { - int i; - for (i = 0; i < num_threads; i++) { - if (memregion_config_per_thread[i].shared_region_addr != 0) { - T_QUIET; T_ASSERT_MACH_SUCCESS(munmap(memregion_config_per_thread[i].shared_region_addr, - memregion_config_per_thread[i].region_len), "munmap"); - } - T_QUIET; T_ASSERT_MACH_SUCCESS(munmap(memregion_config_per_thread[i].region_addr, - memregion_config_per_thread[i].region_len), "munmap"); - } - } else { - if (memregion_config_per_thread[0].shared_region_addr != 0) { - T_QUIET; T_ASSERT_MACH_SUCCESS(munmap(memregion_config_per_thread[0].shared_region_addr, memsize), "munmap"); - } - T_QUIET; T_ASSERT_MACH_SUCCESS(munmap(memregion_config_per_thread[0].region_addr, memsize), "munmap"); - } -} - -static void -fault_pages(int thread_id) -{ - char *ptr, *block; - volatile char val; - - block = memregion_config_per_thread[thread_id].shared_region_addr ? 
- memregion_config_per_thread[thread_id].shared_region_addr : - memregion_config_per_thread[thread_id].region_addr; - for (ptr = block; ptr < block + memregion_config_per_thread[thread_id].region_len; ptr += pgsize) { - val = *ptr; - } -} - -static void * -thread_setup(void *arg) -{ - int my_index = *((int *)arg); - - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&ready_thread_count_lock), "pthread_mutex_lock"); - ready_thread_count++; - if (ready_thread_count == num_threads) { - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_signal(&threads_ready_cvar), "pthread_cond_signal"); - } - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&start_cvar, &ready_thread_count_lock), "pthread_cond_wait"); - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock"); - - fault_pages(my_index); - - /* Up the finished count */ - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&finished_thread_count_lock), "pthread_mutex_lock"); - finished_thread_count++; - if (finished_thread_count == num_threads) { - /* All the threads are done. Wake up the main thread */ - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_signal(&threads_finished_cvar), "pthread_cond_signal"); - } - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&finished_thread_count_lock), "pthread_mutex_unlock"); - return NULL; -} - -static void -execute_threads(void) -{ - int thread_index, thread_retval; - int *thread_indices; - void *thread_retval_ptr = &thread_retval; - pthread_t* threads; - - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&threads_ready_cvar, NULL), "pthread_cond_init"); - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&start_cvar, NULL), "pthread_cond_init"); - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_init(&ready_thread_count_lock, NULL), "pthread_mutex_init"); - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&threads_finished_cvar, NULL), "pthread_cond_init"); - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_init(&finished_thread_count_lock, NULL), "pthread_mutex_init"); - ready_thread_count = 0; - finished_thread_count = 0; - - threads = (pthread_t *)malloc(sizeof(*threads) * (size_t)num_threads); - thread_indices = (int *)malloc(sizeof(*thread_indices) * (size_t)num_threads); - for (thread_index = 0; thread_index < num_threads; thread_index++) { - thread_indices[thread_index] = thread_index; - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&threads[thread_index], NULL, - thread_setup, (void *)&thread_indices[thread_index]), "pthread_create"); - } - - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&ready_thread_count_lock), "pthread_mutex_lock"); - while (ready_thread_count != num_threads) { - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&threads_ready_cvar, &ready_thread_count_lock), - "pthread_cond_wait"); - } - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock"); - - T_STAT_MEASURE(runtime) { - /* Ungate the threads */ - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_broadcast(&start_cvar), "pthread_cond_broadcast"); - /* Wait for the threads to finish */ - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&finished_thread_count_lock), "pthread_mutex_lock"); - while (finished_thread_count != num_threads) { - T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&threads_finished_cvar, &finished_thread_count_lock), "pthread_cond_wait"); - } - }; - - /* Join the threads */ - for (thread_index = 0; thread_index < num_threads; thread_index++) { - T_QUIET; 
T_ASSERT_POSIX_SUCCESS(pthread_join(threads[thread_index], &thread_retval_ptr), - "pthread_join"); - } - - free(threads); - free(thread_indices); -} - -static void -run_test(int fault_type, int mapping_variant, size_t memsize) -{ - char metric_str[32]; - size_t num_pages; - size_t sysctl_size = sizeof(pgsize); - int ret = sysctlbyname("vm.pagesize", &pgsize, &sysctl_size, NULL, 0); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pagesize failed"); - - num_pages = memsize / pgsize; - - T_QUIET; T_ASSERT_LT(fault_type, NUM_FAULT_TYPES, "invalid test type"); - T_QUIET; T_ASSERT_LT(mapping_variant, NUM_MAPPING_VARIANTS, "invalid mapping variant"); - T_QUIET; T_ASSERT_GT(num_threads, 0, "num_threads <= 0"); - T_QUIET; T_ASSERT_GT((int)num_pages / num_threads, 0, "num_pages/num_threads <= 0"); - - T_LOG("No. of cpus: %d", get_ncpu()); - T_LOG("No. of threads: %d", num_threads); - T_LOG("No. of pages: %ld", num_pages); - T_LOG("Pagesize: %ld", pgsize); - T_LOG("Allocation size: %ld MB", memsize / (1024 * 1024)); - T_LOG("Mapping variant: %s", variant_str[mapping_variant]); - - snprintf(metric_str, 32, "Runtime-%s", variant_str[mapping_variant]); - runtime = dt_stat_time_create(metric_str); - - while (!dt_stat_stable(runtime)) { - map_mem_regions(fault_type, mapping_variant, memsize); - execute_threads(); - unmap_mem_regions(mapping_variant, memsize); - } - - dt_stat_finalize(runtime); - T_LOG("Throughput-%s (MB/s): %lf\n\n", variant_str[mapping_variant], (double)memsize / (1024 * 1024) / dt_stat_mean((dt_stat_t)runtime)); -} - -static void -setup_and_run_test(int fault_type, int threads) -{ - int i, mapping_variant; - size_t memsize; - char *e; - - mapping_variant = VARIANT_DEFAULT; - memsize = MEMSIZE; - num_threads = threads; - - if ((e = getenv("NTHREADS"))) { - if (threads == 1) { - T_SKIP("Custom environment variables specified. Skipping single threaded version."); - } - num_threads = (int)strtol(e, NULL, 0); - } - - if ((e = getenv("MEMSIZEMB"))) { - memsize = (size_t)strtol(e, NULL, 0) * 1024 * 1024; - } - - if ((e = getenv("VARIANT"))) { - mapping_variant = (int)strtol(e, NULL, 0); - run_test(fault_type, mapping_variant, memsize); - } else { - for (i = VARIANT_DEFAULT; i < NUM_MAPPING_VARIANTS; i++) { - run_test(fault_type, i, memsize); - } - } - - T_END; -} - -T_DECL(read_soft_fault, - "Read soft faults (single thread)", T_META_TAG_VM_NOT_ELIGIBLE) -{ - setup_and_run_test(SOFT_FAULT, 1); -} - -T_DECL(read_soft_fault_multithreaded, - "Read soft faults (multi-threaded)", T_META_TAG_VM_NOT_ELIGIBLE) -{ - char *e; - int nthreads; - - /* iOSMark passes in the no. of threads via an env. variable */ - if ((e = getenv("DT_STAT_NTHREADS"))) { - nthreads = (int)strtol(e, NULL, 0); - } else { - nthreads = get_ncpu(); - if (nthreads == 1) { - T_SKIP("Skipping multi-threaded test on single core device."); - } - } - setup_and_run_test(SOFT_FAULT, nthreads); -} - -T_DECL(zero_fill_fault, - "Zero fill faults (single thread)", T_META_TAG_VM_NOT_ELIGIBLE) -{ - setup_and_run_test(ZERO_FILL, 1); -} - -T_DECL(zero_fill_fault_multithreaded, - "Zero fill faults (multi-threaded)", - XNU_T_META_SOC_SPECIFIC, T_META_TAG_VM_NOT_ELIGIBLE) -{ - char *e; - int nthreads; - - /* iOSMark passes in the no. of threads via an env. 
variable */ - if ((e = getenv("DT_STAT_NTHREADS"))) { - nthreads = (int)strtol(e, NULL, 0); - } else { - nthreads = get_ncpu(); - if (nthreads == 1) { - T_SKIP("Skipping multi-threaded test on single core device."); - } - } - setup_and_run_test(ZERO_FILL, nthreads); -} diff --git a/tests/pmap_fault_on_commpage.c b/tests/pmap_fault_on_commpage.c new file mode 100644 index 000000000..e493247b2 --- /dev/null +++ b/tests/pmap_fault_on_commpage.c @@ -0,0 +1,67 @@ +#include +#include +#include "test_utils.h" + +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.arm"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("arm"), + T_META_ENABLED(TARGET_CPU_ARM64), + T_META_OWNER("xi_han"), + T_META_RUN_CONCURRENTLY(true), + XNU_T_META_SOC_SPECIFIC + ); + +#if defined(__LP64__) +#define SIGNAL_EXPECTED SIGBUS +#define SIGNAL_EXPECTED_STR "SIGBUS" +#else +/* On arm64_32, _COMM_PAGE_START_ADDRESS is out of normal VA range, so a SIGSEGV is expected if there's a fault. */ +#define SIGNAL_EXPECTED SIGSEGV +#define SIGNAL_EXPECTED_STR "SIGSEGV" +#endif + +#define TEST_STATE_TESTING_NONE 0 +#define TEST_STATE_TESTING_READ 1 +#define TEST_STATE_TESTING_WRITE 2 +static volatile sig_atomic_t test_state; + +static void +test_handler(int signum) +{ + T_ASSERT_EQ(signum, SIGNAL_EXPECTED, "received signal"); + + if (test_state == TEST_STATE_TESTING_READ) { + T_FAIL("read access triggered a %s", SIGNAL_EXPECTED_STR); + } else if (test_state == TEST_STATE_TESTING_WRITE) { + T_PASS("write access triggered a %s", SIGNAL_EXPECTED_STR); + exit(EXIT_SUCCESS); + } else { + T_FAIL("unexpected %s in test state %u", SIGNAL_EXPECTED_STR, (unsigned int)test_state); + } +} + +T_DECL(pmap_commpage_access_test, + "Verify system behavior on user access to the commpage", T_META_TAG_VM_NOT_PREFERRED) +{ + test_state = TEST_STATE_TESTING_NONE; + + struct sigaction sa; + sa.sa_handler = test_handler; + sa.sa_mask = 0; + sa.sa_flags = 0; + sigaction(SIGNAL_EXPECTED, &sa, NULL); + + test_state = TEST_STATE_TESTING_READ; + *(volatile uint32_t *)_COMM_PAGE_START_ADDRESS; + + T_PASS("read access must not trigger a %s", SIGNAL_EXPECTED_STR); + + test_state = TEST_STATE_TESTING_WRITE; + *(volatile uint32_t *)_COMM_PAGE_START_ADDRESS = 0; + + T_FAIL("write access must trigger a %s", SIGNAL_EXPECTED_STR); +} diff --git a/tests/pmap_stress.c b/tests/pmap_stress.c index 80fec6615..d27509ae7 100644 --- a/tests/pmap_stress.c +++ b/tests/pmap_stress.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "test_utils.h" T_GLOBAL_META( @@ -96,8 +97,21 @@ T_DECL(pmap_huge_pv_list_test, } hugepv_in; hugepv_in.num_loops = 500; hugepv_in.num_mappings = 500000; + + /** + * This test spawns a number of long-running and CPU-intensive kernel worker threads + * which inherit the main thread's priority. Temporarily drop our priority down to + * the default (low) userspace priority to avoid producing a bunch of foreground- + * priority kernel threads that may starve other threads on smaller/slower devices. 
+ */ + qos_class_t prev_qos; + pthread_get_qos_class_np(pthread_self(), &prev_qos, NULL); + pthread_set_qos_class_self_np(QOS_CLASS_DEFAULT, 0); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.pmap_huge_pv_list_test", NULL, NULL, &hugepv_in, sizeof(hugepv_in)), "kern.pmap_huge_pv_list_test"); + + pthread_set_qos_class_self_np(prev_qos, 0); } T_DECL(pmap_reentrance_test, @@ -108,3 +122,23 @@ T_DECL(pmap_reentrance_test, T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.pmap_reentrance_test", NULL, NULL, &num_loops, sizeof(num_loops)), "kern.pmap_reentrance_test, %d loops", num_loops); } + +T_DECL(surt_test, + "Test that surt can handle a surge of SURT requests", + T_META_REQUIRES_SYSCTL_EQ("kern.page_protection_type", 2), + T_META_REQUIRES_SYSCTL_EQ("kern.surt_ready", 1), + T_META_TAG_VM_NOT_ELIGIBLE) +{ + /* Use maxproc to get the theoretical upper bound on the sizeof a SURT request surge. */ + int maxproc; + size_t maxproc_size = sizeof(maxproc); + sysctlbyname("kern.maxproc", &maxproc, &maxproc_size, NULL, 0); + + int num_surts = maxproc; + const int num_loops = 100; + + for (int i = 0; i < num_loops; i++) { + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.surt_test", NULL, NULL, &num_surts, sizeof(num_surts)), + "kern.surt_test, %d surts", num_surts); + } +} diff --git a/tests/poll.c b/tests/poll.c index b244bbe74..6f44b909c 100644 --- a/tests/poll.c +++ b/tests/poll.c @@ -9,6 +9,7 @@ #include #include #include +#include T_GLOBAL_META(T_META_NAMESPACE("xnu.poll"), T_META_RUN_CONCURRENTLY(true)); @@ -143,6 +144,10 @@ leak_thread(void *ptr) memset(buffer, 0, 16392 * 8); // Dump the kevent udatas for self + // PT: Note that this is exposed by libproc but isn't declared in the header, + // hence the forward declaration here. It seems fine to expose this in the header, + // but I'm unsure of the implications. + int proc_list_uptrs(int pid, uint64_t *buf, uint32_t bufsz); int ret = proc_list_uptrs(getpid(), buffer, 16392 * 8); if (ret > 0) { diff --git a/tests/posix_sem.c b/tests/posix_sem.c new file mode 100644 index 000000000..fd322e13d --- /dev/null +++ b/tests/posix_sem.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include + + +T_GLOBAL_META( + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("bsd"), + T_META_OWNER("m_staveleytaylor"), + T_META_RUN_CONCURRENTLY(true)); + +#define NUM_TEST_SEMAPHORES 50 + +static char open_test_prefix[PSEMNAMLEN + 1]; +static char open_sem_invalid[PSEMNAMLEN + 1]; +static char open_sem_a[PSEMNAMLEN + 1]; +static char open_sem_b[PSEMNAMLEN + 1]; + +static void +cleanup_open() +{ + sem_unlink(open_sem_invalid); + sem_unlink(open_sem_a); + sem_unlink(open_sem_b); + + for (int i = 0; i < NUM_TEST_SEMAPHORES; i++) { + char name_buf[PSEMNAMLEN]; + snprintf(name_buf, sizeof(name_buf), "%s/many%d", open_test_prefix, i); + sem_unlink(name_buf); + } +} + +T_DECL(posix_sem_open, "POSIX sem_open", + T_META_TAG_VM_PREFERRED) +{ + sem_t *sem; + + T_SETUPBEGIN; + srand(time(NULL)); + snprintf(open_test_prefix, sizeof(open_test_prefix), "xnutest%d", rand() % 10000); + snprintf(open_sem_invalid, sizeof(open_sem_invalid), "%s/invalid", open_test_prefix); + snprintf(open_sem_a, sizeof(open_sem_a), "%s/a", open_test_prefix); + snprintf(open_sem_b, sizeof(open_sem_b), "%s/b", open_test_prefix); + + T_ATEND(cleanup_open); + T_SETUPEND; + + sem = sem_open(open_sem_invalid, 0); + T_EXPECT_EQ_PTR(sem, SEM_FAILED, "sem_open without O_CREAT fails"); + T_EXPECT_EQ(errno, ENOENT, "sem_open without O_CREAT gives ENOENT"); + + sem = sem_open(open_sem_a, O_CREAT, 0755, 0); + T_WITH_ERRNO; + T_EXPECT_NE_PTR(sem, SEM_FAILED, "sem_open(O_CREAT) succeeds"); + + sem = sem_open(open_sem_a, O_CREAT, 0755, 0); + T_WITH_ERRNO; + T_EXPECT_NE_PTR(sem, SEM_FAILED, "sem_open(O_CREAT) on existing succeeds"); + + sem = sem_open(open_sem_a, O_CREAT | O_EXCL, 0755, 0); + T_EXPECT_EQ_PTR(sem, SEM_FAILED, "sem_open(O_CREAT | O_EXCL) on existing fails"); + T_EXPECT_EQ(errno, EEXIST, "sem_open(O_CREAT | O_EXCL) on existing gives EEXIST"); + + sem = sem_open(open_sem_b, O_CREAT | O_EXCL, 0755, 0); + T_WITH_ERRNO; + T_EXPECT_NE_PTR(sem, SEM_FAILED, "sem_open(O_CREAT | O_EXCL) on non-existing succeeds"); + + for (int i = 0; i < NUM_TEST_SEMAPHORES; i++) { + char name_buf[PSEMNAMLEN]; + snprintf(name_buf, sizeof(name_buf), "%s/many%d", open_test_prefix, i); + + int oflag = O_CREAT; + if (rand() % 2 == 0) { + oflag |= O_EXCL; + } + + sem = sem_open(name_buf, oflag, 0755, 0); + T_WITH_ERRNO; + T_EXPECT_NE_PTR(sem, SEM_FAILED, "sem_open name=%s oflag=%d succeeds", name_buf, oflag); + } + + /* Fisher-Yates shuffle to randomize order in which we unlink semaphores */ + int unlink_order[NUM_TEST_SEMAPHORES] = { 0 }; + for (int i = 0; i < NUM_TEST_SEMAPHORES; i++) { + unlink_order[i] = i; + } + for (int i = 0; i < NUM_TEST_SEMAPHORES; i++) { + int next_index = rand() % (NUM_TEST_SEMAPHORES - i); + + int semaphore = unlink_order[i + next_index]; + unlink_order[i + next_index] = unlink_order[i]; + + char name_buf[PSEMNAMLEN + 1]; + snprintf(name_buf, sizeof(name_buf), "%s/many%d", open_test_prefix, semaphore); + + T_WITH_ERRNO; + T_EXPECT_POSIX_SUCCESS(sem_unlink(name_buf), "sem_unlink(%s)", name_buf); + } +} + +static char namespace_test_sem_name[PSEMNAMLEN + 1]; + +static int +find_helper(char* test_path, int team_id) +{ + char binpath[MAXPATHLEN]; + char* dirpath; + uint32_t size = sizeof(binpath); + int retval; + + retval = _NSGetExecutablePath(binpath, &size); + assert(retval == 0); + dirpath = dirname(binpath); + + snprintf(test_path, MAXPATHLEN, "%s/posix_sem_namespace_helper_team%d", 
dirpath, team_id); + if (access(test_path, F_OK) == 0) { + return 0; + } else { + return -1; + } +} + +static void +do_namespace_op(const char *namespace, const char *op) +{ + int ret, exit_status, signum; + + dt_pipe_data_handler_t stdout_handler = ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { + T_LOG("%s: %s", (char *)context->user_context, data); + return false; + }; + dt_pipe_data_handler_t stderr_handler = ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { + T_LOG("%s (stderr): %s", (char *)context->user_context, data); + return false; + }; + + pid_t pid = dt_launch_tool_pipe((char *[]){ (char *)namespace, namespace_test_sem_name, (char *)op, NULL}, false, NULL, stdout_handler, stderr_handler, BUFFER_PATTERN_LINE, (void *)namespace); + + T_QUIET; + T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool_pipe %s (%s) - %s", op, namespace, namespace_test_sem_name); + + ret = dt_waitpid(pid, &exit_status, &signum, 60 * 5); + T_QUIET; T_ASSERT_EQ(ret, 1, "dt_waitpid (exit=%d,signum=%d)", exit_status, signum); + T_QUIET; T_ASSERT_EQ(exit_status, 0, "dt_waitpid: exit_status"); + T_QUIET; T_ASSERT_EQ(signum, 0, "dt_waitpid: signum"); +} + +static void +cleanup_namespace() +{ + sem_unlink(namespace_test_sem_name); +} + +/* + * Unfortunately this test suffers from two issues that mean we must leave it disabled on BATS: + * 1. rdar://75835929 means that XBS strips the team ID from our helper binaries after we've signed them. + * 2. BATS infrastructure boots with amfi_get_out_of_my_way=1, which treats signatures as CS_PLATFORM_BINARY and causes the team ID to be ignored. + */ +T_DECL(posix_sem_open_team_id_namespace, "POSIX sem_open team ID namespace", + T_META_BOOTARGS_SET("amfi_allow_any_signature=1"), + T_META_ENABLED(FALSE), + T_META_TAG_VM_PREFERRED) +{ + T_SETUPBEGIN; + srand(time(NULL)); + snprintf(namespace_test_sem_name, sizeof(namespace_test_sem_name), "xnutest%d/ns", rand() % 10000); + + T_ATEND(cleanup_namespace); + + char team0_helper[MAXPATHLEN], team1_helper[MAXPATHLEN]; + find_helper(team0_helper, 0); + find_helper(team1_helper, 1); + printf("found helpers at '%s' and '%s'\n", team0_helper, team1_helper); + + /* Quite difficult to register cleanup handlers for this, so we'll perform cleanup now */ + T_LOG("Performing sem_unlink cleanup"); + do_namespace_op(team0_helper, "unlink_force"); + do_namespace_op(team1_helper, "unlink_force"); + + T_SETUPEND; + + /* Check that semaphores created by 1st party applications can be discovered by 3rd party applications. 
*/ + T_LOG("Check 3rd party sees 1st party"); + + sem_t *sem = sem_open(namespace_test_sem_name, O_CREAT | O_EXCL, 0755, 0); + T_EXPECT_NE_PTR(sem, SEM_FAILED, "sem_open(O_CREAT | O_EXCL)"); + sem_close(sem); + + do_namespace_op(team0_helper, "check_access"); + T_ASSERT_POSIX_SUCCESS(sem_unlink(namespace_test_sem_name), "sem_unlink"); + do_namespace_op(team0_helper, "check_no_access"); + +#if TARGET_OS_OSX + T_LOG("macOS only: check 1st party sees 3rd party"); + do_namespace_op(team0_helper, "open_excl"); + do_namespace_op(team0_helper, "check_access"); + + sem = sem_open(namespace_test_sem_name, 0); + T_EXPECT_NE_PTR(sem, SEM_FAILED, "sem_open on 3rd party semaphore"); + sem_close(sem); + + do_namespace_op(team0_helper, "unlink"); + + T_LOG("macOS only: check 3rd party sees other 3rd party" ); + do_namespace_op(team0_helper, "check_no_access"); + do_namespace_op(team1_helper, "check_no_access"); + + do_namespace_op(team0_helper, "open_excl"); + do_namespace_op(team0_helper, "check_access"); + do_namespace_op(team1_helper, "check_access"); + + do_namespace_op(team1_helper, "unlink"); + do_namespace_op(team0_helper, "check_no_access"); + do_namespace_op(team1_helper, "check_no_access"); +#else + /* 1st party applications should not be able to look up semaphores created by 3rd party applications. */ + T_LOG("Check 1st party doesn't see 3rd party"); + + do_namespace_op(team0_helper, "open_excl"); + do_namespace_op(team0_helper, "check_access"); + + sem = sem_open(namespace_test_sem_name, 0); + T_EXPECT_EQ_PTR(sem, SEM_FAILED, "sem_open on 3rd party semaphore"); + sem_close(sem); + + do_namespace_op(team0_helper, "unlink"); + + /* 3rd party applications should not be able to interfere with eachother. */ + T_LOG("Check 3rd party doesn't see other 3rd party"); + + do_namespace_op(team0_helper, "check_no_access"); + do_namespace_op(team1_helper, "check_no_access"); + + do_namespace_op(team0_helper, "open_excl"); + do_namespace_op(team0_helper, "check_access"); + do_namespace_op(team1_helper, "check_no_access"); + + do_namespace_op(team1_helper, "open_excl"); + do_namespace_op(team0_helper, "check_access"); + do_namespace_op(team1_helper, "check_access"); + + do_namespace_op(team0_helper, "unlink"); + do_namespace_op(team0_helper, "check_no_access"); + do_namespace_op(team1_helper, "check_access"); + + do_namespace_op(team1_helper, "unlink"); + do_namespace_op(team0_helper, "check_no_access"); + do_namespace_op(team1_helper, "check_no_access"); + #endif +} diff --git a/tests/posix_sem_namespace_helper.c b/tests/posix_sem_namespace_helper.c new file mode 100644 index 000000000..b12c85f7d --- /dev/null +++ b/tests/posix_sem_namespace_helper.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include + +/* spawned helper binary, so we don't have darwintest here */ +/* usage: posix_sem_namespace_helper_teamN */ +int +main(int argc, char *argv[]) +{ + if (argc != 3) { + fprintf(stderr, "error: wrong number of arguments (%d)\n", argc); + return -1; + } + + assert(argv[0] != NULL && strlen(argv[0]) > 0); + int team_id = argv[0][strlen(argv[0]) - 1] - '0'; + if (team_id != 0 && team_id != 1) { + fprintf(stderr, "error: invalid team_id %d\n", team_id); + return -1; + } + + char *sem_name = argv[1]; + char *op = argv[2]; + + printf("running %s (%s)\n", op, sem_name); + fflush(stdout); + + if (!strcmp(op, "open_excl")) { + if (sem_open(sem_name, O_CREAT | O_EXCL, 0755, 0) == SEM_FAILED) { + fprintf(stderr, "%s: ", sem_name); + perror("sem_open (create exclusive)"); + return -1; + } + } else if (!strcmp(op, "check_access")) { + if (sem_open(sem_name, 0) == SEM_FAILED) { + fprintf(stderr, "%s: ", sem_name); + perror("sem_open (check_access)"); + return -1; + } + } else if (!strcmp(op, "check_no_access")) { + if (sem_open(sem_name, 0) != SEM_FAILED) { + fprintf(stderr, "%s: sem_open unexpectedly succeeded\n", sem_name); + return -1; + } + } else if (!strcmp(op, "unlink")) { + if (sem_unlink(sem_name) != 0) { + fprintf(stderr, "%s: ", sem_name); + perror("sem_unlink"); + return -1; + } + } else if (!strcmp(op, "unlink_force")) { + sem_unlink(sem_name); + } +} diff --git a/tests/posix_spawn_file_actions.c b/tests/posix_spawn_file_actions.c index c5c0df790..596953ac2 100644 --- a/tests/posix_spawn_file_actions.c +++ b/tests/posix_spawn_file_actions.c @@ -21,7 +21,7 @@ T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); /* TEST_PATH needs to be something that exists, but is not the cwd */ #define TEST_PATH "/System/Library/Caches" -T_DECL(posix_spawn_file_actions_addchdir_np, "Check posix_spawn_file_actions_addchdir_np", +T_DECL(posix_spawn_file_actions_addchdir, "Check posix_spawn_file_actions_addchdir", T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) { posix_spawn_file_actions_t file_actions; @@ -31,8 +31,8 @@ T_DECL(posix_spawn_file_actions_addchdir_np, "Check posix_spawn_file_actions_add T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init"); - ret = posix_spawn_file_actions_addchdir_np(&file_actions, TEST_PATH); - T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_addchdir_np"); + ret = posix_spawn_file_actions_addchdir(&file_actions, TEST_PATH); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_addchdir"); char * const prog = "/bin/sh"; char * const argv_child[] = { prog, @@ -61,7 +61,7 @@ T_DECL(posix_spawn_file_actions_addchdir_np, "Check posix_spawn_file_actions_add T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success"); } -T_DECL(posix_spawn_file_actions_addchdir_np_errors, "Check posix_spawn_file_actions_addchdir_np errors", +T_DECL(posix_spawn_file_actions_addchdir_errors, "Check posix_spawn_file_actions_addchdir errors", T_META_ASROOT(true), 
T_META_TAG_VM_PREFERRED) { char longpath[PATH_MAX + 1]; @@ -75,10 +75,10 @@ T_DECL(posix_spawn_file_actions_addchdir_np_errors, "Check posix_spawn_file_acti T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init"); - ret = posix_spawn_file_actions_addchdir_np(NULL, "/"); + ret = posix_spawn_file_actions_addchdir(NULL, "/"); T_ASSERT_EQ(ret, EINVAL, "NULL *file_actions returns EINVAL"); - ret = posix_spawn_file_actions_addchdir_np(&file_actions, longpath); + ret = posix_spawn_file_actions_addchdir(&file_actions, longpath); T_ASSERT_EQ(ret, ENAMETOOLONG, "Path longer than PATH_MAX returns ENAMETOOLONG"); ret = posix_spawn_file_actions_destroy(&file_actions); @@ -86,7 +86,7 @@ T_DECL(posix_spawn_file_actions_addchdir_np_errors, "Check posix_spawn_file_acti T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_destroy"); } -T_DECL(posix_spawn_file_actions_addfchdir_np, "Check posix_spawn_file_actions_addfchdir_np", +T_DECL(posix_spawn_file_actions_addfchdir, "Check posix_spawn_file_actions_addfchdir", T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) { posix_spawn_file_actions_t file_actions; @@ -100,8 +100,8 @@ T_DECL(posix_spawn_file_actions_addfchdir_np, "Check posix_spawn_file_actions_ad test_fd = open(TEST_PATH, O_RDONLY | O_CLOEXEC); T_ASSERT_POSIX_SUCCESS(test_fd, "open " TEST_PATH); - ret = posix_spawn_file_actions_addfchdir_np(&file_actions, test_fd); - T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_addfchdir_np"); + ret = posix_spawn_file_actions_addfchdir(&file_actions, test_fd); + T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_addfchdir"); char * const prog = "/bin/sh"; char * const argv_child[] = { prog, @@ -134,7 +134,7 @@ T_DECL(posix_spawn_file_actions_addfchdir_np, "Check posix_spawn_file_actions_ad T_ASSERT_POSIX_SUCCESS(ret, "close test fd"); } -T_DECL(posix_spawn_file_actions_addfchdir_np_errors, "Check posix_spawn_file_actions_addfchdir_np errors", +T_DECL(posix_spawn_file_actions_addfchdir_errors, "Check posix_spawn_file_actions_addfchdir errors", T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) { posix_spawn_file_actions_t file_actions; @@ -144,10 +144,10 @@ T_DECL(posix_spawn_file_actions_addfchdir_np_errors, "Check posix_spawn_file_act T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn_file_actions_init"); - ret = posix_spawn_file_actions_addfchdir_np(NULL, 0); + ret = posix_spawn_file_actions_addfchdir(NULL, 0); T_ASSERT_EQ(ret, EINVAL, "NULL *file_actions returns EINVAL"); - ret = posix_spawn_file_actions_addfchdir_np(&file_actions, -1); + ret = posix_spawn_file_actions_addfchdir(&file_actions, -1); T_ASSERT_EQ(ret, EBADF, "-1 file descriptor returns EBADF"); ret = posix_spawn_file_actions_destroy(&file_actions); diff --git a/tests/prng.c b/tests/prng.c index 18b6ee869..a16e24eaa 100644 --- a/tests/prng.c +++ b/tests/prng.c @@ -79,3 +79,18 @@ T_DECL(prng, "prng test") free(buf); } + +T_DECL(prng_write_random, "Test writing to /dev/random") +{ + uint8_t entropy[128] = {0}; + + int rndfd = open("/dev/random", O_WRONLY, S_IWUSR); + T_ASSERT_POSIX_SUCCESS(rndfd, "Open /dev/random"); + + T_ASSERT_EQ_INT((int)write(rndfd, entropy, 128), 128, "write 128 bytes to /dev/random"); + T_ASSERT_EQ_INT((int)write(rndfd, entropy, 65), 65, "write 65 bytes to /dev/random"); + T_ASSERT_EQ_INT((int)write(rndfd, entropy, 64), 64, "write 64 bytes to /dev/random"); + T_ASSERT_EQ_INT((int)write(rndfd, entropy, 1), 1, "write 1 byte to /dev/random"); + + close(rndfd); +} diff --git a/tests/proc_archinfo.c b/tests/proc_archinfo.c new file mode 100644 index 
000000000..aa0d2b666 --- /dev/null +++ b/tests/proc_archinfo.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include + +T_GLOBAL_META( + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("libsyscall"), + T_META_OWNER("m_staveleytaylor"), + T_META_RUN_CONCURRENTLY(true) + ); + +T_DECL(proc_archinfo, "Check proc_archinfo is exposed in public headers") +{ + struct proc_archinfo pai = {0}; + pid_t pid = getpid(); + + T_EXPECT_POSIX_SUCCESS(proc_pidinfo(pid, PROC_PIDARCHINFO, 0, &pai, sizeof(pai)), "proc_pidinfo(PROC_PIDARCHINFO)"); + + /* checks from tests/proc_info.c */ +#if defined(__arm__) || defined(__arm64__) + bool arm = (pai.p_cputype & CPU_TYPE_ARM) == CPU_TYPE_ARM; + bool arm64 = (pai.p_cputype & CPU_TYPE_ARM64) == CPU_TYPE_ARM64; + if (!arm && !arm64) { + T_EXPECT_EQ_INT(pai.p_cputype, CPU_TYPE_ARM, "PROC_PIDARCHINFO returned valid value for p_cputype"); + } + T_EXPECT_EQ_INT((pai.p_cpusubtype & CPU_SUBTYPE_ARM_ALL), CPU_SUBTYPE_ARM_ALL, + "PROC_PIDARCHINFO returned valid value for p_cpusubtype"); +#else + bool x86 = (pai.p_cputype & CPU_TYPE_X86) == CPU_TYPE_X86; + bool x86_64 = (pai.p_cputype & CPU_TYPE_X86_64) == CPU_TYPE_X86_64; + if (!x86 && !x86_64) { + T_EXPECT_EQ_INT(pai.p_cputype, CPU_TYPE_X86, "PROC_PIDARCHINFO returned valid value for p_cputype"); + } +#endif +} diff --git a/tests/proc_info.c b/tests/proc_info.c index 3a6f4895f..28b51a8c4 100644 --- a/tests/proc_info.c +++ b/tests/proc_info.c @@ -77,6 +77,11 @@ CONF_TMP_FILE_OPEN(char path[PATH_MAX]) return fd; } +// Forward declarations for currently 'private' functions implemented by libproc. 
+int proc_list_uptrs(int pid, uint64_t *buf, uint32_t bufsz); +int proc_list_dynkqueueids(int pid, kqueue_id_t *buf, uint32_t bufsz); +int proc_piddynkqueueinfo(int pid, int flavor, kqueue_id_t kq_id, void *buffer, int buffersize); + uint32_t get_tty_dev(void); #define WAIT_FOR_CHILDREN(pipefd, action, child_count) \ @@ -850,7 +855,9 @@ T_DECL(proc_info_listpids_ppid_only, T_DECL(proc_info_listpids_uid_only, "proc_info API test to verify PROC_INFO_CALL_LISTPIDS", - T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) + T_META_ASROOT(true), + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(false) /* rdar://134505671 */) { proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler); T_LOG("Test to verify PROC_UID_ONLY returns correct value"); @@ -869,7 +876,9 @@ T_DECL(proc_info_listpids_uid_only, T_DECL(proc_info_listpids_ruid_only, "proc_info API test to verify PROC_INFO_CALL_LISTPIDS", - T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) + T_META_ASROOT(true), + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(false) /* rdar://134505671 */) { proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler); T_LOG("Test to verify PROC_RUID_ONLY returns correct value"); @@ -1008,7 +1017,9 @@ T_DECL(ensure_ppidversion_is_not_updated_after_exec, T_DECL(proc_info_pidinfo_proc_pidtbsdinfo, "Test to verify PROC_PIDTBSDINFO returns valid information about the process", - T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) + T_META_ASROOT(true), + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(false) /* rdar://134505671 */) { void * proc_info[2]; int child_pid = 0; @@ -1069,7 +1080,9 @@ T_DECL(proc_info_pidt_shortbsdinfo, T_DECL(proc_info_pidt_bsdinfowithuniqid, "Test to verify PROC_PIDT_BSDINFOWITHUNIQID returns valid information about the process", - T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) + T_META_ASROOT(true), + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(false) /* rdar://134505671 */) { void * proc_info[4]; int child_pid = 0; @@ -1176,7 +1189,9 @@ T_DECL(proc_info_proc_pidtask_info, T_DECL(proc_info_proc_pidtaskallinfo, "Test to verify PROC_PIDTASKALLINFO returns valid information about the process", - T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) + T_META_ASROOT(true), + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(false) /* rdar://134505671 */) { void * proc_info[4]; int child_pid = 0; @@ -1249,7 +1264,9 @@ T_DECL(proc_info_proc_pidlistthreads, T_DECL(proc_info_proc_pidthreadinfo, "Test to verify PROC_PIDTHREADINFO returns valid information about the process", - T_META_ASROOT(true), T_META_TAG_VM_PREFERRED) + T_META_ASROOT(true), + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(false) /* rdar://134505671 */) { void * proc_info[2]; int child_pid = 0; diff --git a/tests/ptrauth_failure.c b/tests/ptrauth_failure.c index 55d4d8999..4bd1b651f 100644 --- a/tests/ptrauth_failure.c +++ b/tests/ptrauth_failure.c @@ -55,7 +55,8 @@ pac_exception_handler( __unused mach_port_t task, __unused mach_port_t thread, exception_type_t type, - mach_exception_data_t codes) + mach_exception_data_t codes, + __unused uint64_t exception_pc) { T_ASSERT_EQ(type, EXC_BAD_ACCESS, "Caught an EXC_BAD_ACCESS exception"); T_ASSERT_EQ(codes[0], (uint64_t)EXC_ARM_PAC_FAIL, "The subcode is EXC_ARM_PAC_FAIL"); diff --git a/tests/recount/coalition_info_tests.c b/tests/recount/coalition_info_tests.c index 16e11518c..cda6c3255 100644 --- a/tests/recount/coalition_info_tests.c +++ b/tests/recount/coalition_info_tests.c @@ -38,10 +38,30 @@ skip_if_monotonic_unsupported(void) } } +// Don't rely on FastSim's 
CPMU to produce reliable data. +// In particular, S3_2_C15_C1_0 (instructions retired) seems to be zero on some devices (rdar://143157256). +static void +skip_if_fastsim(void) +{ + char buffer[64] = ""; + size_t buffer_size = sizeof(buffer); + + int r = sysctlbyname("hw.targettype", buffer, &buffer_size, NULL, 0); + if (r < 0) { + T_WITH_ERRNO; + T_SKIP("could not find \"hw.targettype\" sysctl"); + } + + if (strstr(buffer, "sim") != NULL) { + T_SKIP("CPU performance counters are unreliable on FastSim"); + } +} + T_DECL(coalition_resource_info_counters, "ensure that coalition resource info produces valid counter data", T_META_TAG_VM_NOT_ELIGIBLE) { skip_if_monotonic_unsupported(); + skip_if_fastsim(); T_SETUPBEGIN; diff --git a/tests/recount/recount_test_utils.c b/tests/recount/recount_test_utils.c index dfaa67d23..a78a485f8 100644 --- a/tests/recount/recount_test_utils.c +++ b/tests/recount/recount_test_utils.c @@ -188,8 +188,9 @@ run_on_all_perf_levels(void) } T_SETUPBEGIN; - bind_to_cluster('P'); - bind_to_cluster('E'); + for (unsigned int i = 0; i < perf_level_count(); i++) { + bind_to_cluster(perf_level_name(i)[0]); + } // Return to the kernel to synchronize timings with the scheduler. (void)getppid(); _unbind_from_cluster(); @@ -213,13 +214,14 @@ run_in_exclaves_on_all_perf_levels(void) { if (perf_level_count() == 1) { _run_on_exclaves(); + return; } T_SETUPBEGIN; - bind_to_cluster('P'); - _run_on_exclaves(); - bind_to_cluster('E'); - _run_on_exclaves(); + for (unsigned int i = 0; i < perf_level_count(); i++) { + bind_to_cluster(perf_level_name(i)[0]); + _run_on_exclaves(); + } _unbind_from_cluster(); T_SETUPEND; } diff --git a/tests/recount/recount_test_utils.h b/tests/recount/recount_test_utils.h index c6eda93fd..9ad7b2f2e 100644 --- a/tests/recount/recount_test_utils.h +++ b/tests/recount/recount_test_utils.h @@ -15,7 +15,7 @@ #define REQUIRE_RECOUNT_ENERGY \ T_META_REQUIRES_SYSTCL_EQ("kern.pervasive_energy", 1) #define REQUIRE_MULTIPLE_PERF_LEVELS \ - T_META_REQUIRES_SYSCTL_EQ("hw.nperflevels", 2) + T_META_REQUIRES_SYSCTL_NE("hw.nperflevels", 1) #define REQUIRE_EXCLAVES \ T_META_REQUIRES_SYSCTL_EQ("kern.exclaves_status", 1) #define SET_THREAD_BIND_BOOTARG \ diff --git a/tests/recount/recount_tests.c b/tests/recount/recount_tests.c index 9a18da799..9b62a5599 100644 --- a/tests/recount/recount_tests.c +++ b/tests/recount/recount_tests.c @@ -308,17 +308,14 @@ T_DECL(thread_selfusage_sanity, "ensure thread_selfusage times are sane", T_META T_DECL(proc_pid_rusage_perf_levels, "ensure proc_pid_rusage fills in per-perf level information", REQUIRE_RECOUNT_PMCS, - // REQUIRE_MULTIPLE_PERF_LEVELS, disabled due to rdar://111297938 + REQUIRE_MULTIPLE_PERF_LEVELS, SET_THREAD_BIND_BOOTARG, T_META_TAG_VM_NOT_ELIGIBLE) { + T_QUIET; T_ASSERT_GT(perf_level_count(), 1, "Platform should be AMP"); + struct rusage_info_v6 before = { 0 }; struct rusage_info_v6 after = { 0 }; - // Until rdar://111297938, manually skip the test if there aren't multiple perf levels. - if (perf_level_count() < 2) { - T_SKIP("device is not eligible for checking perf levels because it is SMP"); - } - _get_proc_pid_rusage(getpid(), &before); run_on_all_perf_levels(); _get_proc_pid_rusage(getpid(), &after); @@ -595,3 +592,81 @@ T_DECL(proc_pidthreadcounts_invalid_tid, "fail"); T_ASSERT_EQ(errno, ESRCH, "should fail with ESRCH"); } + +// Shared state for the getrusage_thread_terminate_increasing test. 
+ +static struct { + pthread_mutex_t lock; + pthread_cond_t wait_for_thread; + pthread_cond_t wait_for_test; +} _getrusage_thread_state = { + .lock = PTHREAD_MUTEX_INITIALIZER, + .wait_for_thread = PTHREAD_COND_INITIALIZER, + .wait_for_test = PTHREAD_COND_INITIALIZER, +}; + + +static void * +_thread_spin_and_exit(void * __unused arg) +{ + pthread_mutex_lock(&_getrusage_thread_state.lock); + + volatile int counter = 0; + while (counter++ < 100000) {} + + pthread_cond_signal(&_getrusage_thread_state.wait_for_thread); + pthread_cond_wait(&_getrusage_thread_state.wait_for_test, + &_getrusage_thread_state.lock); + pthread_mutex_unlock(&_getrusage_thread_state.lock); + return NULL; +} + +static uint64_t +_rusage_to_time_us(struct rusage *usage) +{ + return usage->ru_utime.tv_sec * USEC_PER_SEC + usage->ru_utime.tv_usec; +} + +T_DECL(getrusage_thread_terminate_increasing, + "check that getrusage(2) is monotonically increasing, even with threads terminating", + T_META_TAG_VM_PREFERRED) +{ + const uint64_t test_duration_secs = 2; + uint64_t now_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC); + uint64_t end_ns = now_ns + test_duration_secs * NSEC_PER_SEC; + + while (clock_gettime_nsec_np(CLOCK_MONOTONIC) < end_ns) { + pthread_t thread; + struct rusage usage; + uint64_t old_usage_us, new_usage_us; + + // Start the thread running and doing work. + pthread_mutex_lock(&_getrusage_thread_state.lock); + pthread_create(&thread, NULL, _thread_spin_and_exit, NULL); + pthread_cond_wait(&_getrusage_thread_state.wait_for_thread, + &_getrusage_thread_state.lock); + pthread_mutex_unlock(&_getrusage_thread_state.lock); + + // Gather the current process user and system time accumulation. + T_QUIET; T_ASSERT_POSIX_SUCCESS(getrusage(RUSAGE_SELF, &usage), NULL); + old_usage_us = _rusage_to_time_us(&usage); + + // Let the thread terminate. + pthread_cond_signal(&_getrusage_thread_state.wait_for_test); + pthread_mutex_unlock(&_getrusage_thread_state.lock); + pthread_join(thread, NULL); + + // Gather the times again, which might have gone backwards if the + // thread's time was temporarily lost due to a race condition in + // getrusage(2). + T_QUIET; T_ASSERT_POSIX_SUCCESS(getrusage(RUSAGE_SELF, &usage), NULL); + + new_usage_us = _rusage_to_time_us(&usage); + T_QUIET; + T_ASSERT_GE(new_usage_us, old_usage_us, + "getrusage(2) times were not monotonically increasing"); + } + + T_PASS("checked getrusage(2) times for %llu second%s while threads terminated", + test_duration_secs, test_duration_secs == 1 ? "" : "s"); +} diff --git a/tests/recount/thread_selfcounts_tests.c b/tests/recount/thread_selfcounts_tests.c index be239e3fa..1d381fc2a 100644 --- a/tests/recount/thread_selfcounts_tests.c +++ b/tests/recount/thread_selfcounts_tests.c @@ -144,16 +144,13 @@ T_DECL(thread_selfcounts_cpi_sanity, "check the current thread's CPI", T_DECL(thread_selfcounts_perf_level_sanity, "check per-perf level time, energy, and CPI", REQUIRE_RECOUNT_PMCS, - // REQUIRE_MULTIPLE_PERF_LEVELS, disabled due to rdar://111297938 + REQUIRE_MULTIPLE_PERF_LEVELS, SET_THREAD_BIND_BOOTARG, T_META_ASROOT(true), T_META_TAG_VM_NOT_ELIGIBLE) { unsigned int level_count = perf_level_count(); + T_QUIET; T_ASSERT_GT(level_count, 1, "Platform should be AMP"); - // Until rdar://111297938, manually skip the test if there aren't multiple perf levels. 
- if (level_count < 2) { - T_SKIP("device is not eligible for checking perf levels because it is SMP"); - } struct thsc_time_energy_cpi *before = calloc(level_count, sizeof(*before)); struct thsc_time_energy_cpi *after = calloc(level_count, sizeof(*after)); @@ -197,7 +194,7 @@ _expect_counts_on_perf_level(unsigned int perf_level_index, T_ASSERT_POSIX_ZERO(err, "thread_selfcounts(THSC_TIME_ENERGY_CPI_PER_PERF_LEVEL, ...)"); - char *name = perf_level_name(perf_level_index); + const char *name = perf_level_name(perf_level_index); _check_usage(&before[perf_level_index], &after[perf_level_index], name); } @@ -219,51 +216,48 @@ _expect_no_counts_on_perf_level(unsigned int perf_level_index, T_ASSERT_POSIX_ZERO(err, "thread_selfcounts(THSC_TIME_ENERGY_CPI_PER_PERF_LEVEL, ...)"); - char *name = perf_level_name(perf_level_index); + const char *name = perf_level_name(perf_level_index); _check_no_usage(&before[perf_level_index], &after[perf_level_index], name); } T_DECL(thread_selfcounts_perf_level_correct, "check that runtimes on each perf level match binding request", REQUIRE_RECOUNT_PMCS, - // REQUIRE_MULTIPLE_PERF_LEVELS, disabled due to rdar://111297938 + REQUIRE_MULTIPLE_PERF_LEVELS, SET_THREAD_BIND_BOOTARG, T_META_ASROOT(true), T_META_TAG_VM_NOT_ELIGIBLE) { unsigned int level_count = perf_level_count(); + T_QUIET; T_ASSERT_GT(level_count, 1, "Platform should be AMP"); - // Until rdar://111297938, manually skip the test if there aren't multiple perf levels. - if (level_count < 2) { - T_SKIP("device is not eligible for checking perf levels because it is SMP"); - } T_LOG("Currently running the \"%s\" scheduler policy", sched_policy_name()); bool is_edge_scheduler = strcmp(sched_policy_name(), "edge") == 0; - for (unsigned int i = 0; i < level_count; i++) { - T_LOG("Level %d: %s", i, perf_level_name(i)); - } struct thsc_time_energy_cpi *before = calloc(level_count, sizeof(*before)); struct thsc_time_energy_cpi *after = calloc(level_count, sizeof(*after)); - T_LOG("Binding to Efficiency cluster, should only see counts from E-cores"); - T_SETUPBEGIN; - bind_to_cluster('E'); - T_SETUPEND; - _expect_counts_on_perf_level(1, before, after); - _expect_no_counts_on_perf_level(0, before, after); + for (unsigned int i = 0; i < level_count; i++) { + T_LOG("Binding to \"%s\" cluster, should only see counts from %c-cores", + perf_level_name(i), perf_level_name(i)[0]); - T_LOG("Binding to Performance cluster, should only see counts from P-cores"); - T_SETUPBEGIN; - bind_to_cluster('P'); - T_SETUPEND; - if (!is_edge_scheduler) { - T_QUIET; T_EXPECT_EQ_STR(sched_policy_name(), "amp", "Unexpected multicluster scheduling policy"); - T_LOG("The AMP scheduler doesn't guarantee that a P-bound thread will " - "only run on P-cores, so the following expects may fail."); - set_expects_may_fail(true); + T_SETUPBEGIN; + bind_to_cluster(perf_level_name(i)[0]); + + if (!is_edge_scheduler && (perf_level_name(i)[0] == 'P')) { + T_QUIET; T_EXPECT_EQ_STR(sched_policy_name(), "amp", "Unexpected multicluster scheduling policy"); + T_LOG("The AMP scheduler doesn't guarantee that a P-bound thread will " + "only run on P-cores, so the following expects may fail."); + set_expects_may_fail(true); + } + T_SETUPEND; + + _expect_counts_on_perf_level(i, before, after); + for (unsigned int j = 0; j < level_count; j++) { + if (j != i) { + _expect_no_counts_on_perf_level(j, before, after); + } + } } - _expect_counts_on_perf_level(0, before, after); - _expect_no_counts_on_perf_level(1, before, after); free(before); free(after); diff --git 
a/tests/recv_link_addr_type.c b/tests/recv_link_addr_type.c new file mode 100644 index 000000000..61b7595f3 --- /dev/null +++ b/tests/recv_link_addr_type.c @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#define __APPLE_USE_RFC_3542 1 + +#include + +#include +#include +#include +#include + +#include "net_test_lib.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.net"), + T_META_ASROOT(true), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("networking"), + T_META_CHECK_LEAKS(false)); + +static char *ifname1; +static char *ifname2; + +#define IPV4_MULTICAST_ADDR_STR "239.1.2.3" +#define IPV6_MULTICAST_ADDR_STR "FF12:0:0:0:0:0:0:FC" + +#define TEN_NET 0x0a000000 +#define TEN_1_NET (TEN_NET | 0x010000) +#define TEN_1_BROADCAST (TEN_1_NET | 0xff) + +static network_interface_pair_list_t S_feth_pairs; + + +static void +get_ipv4_address(u_int unit, u_int addr_index, struct in_addr *ip) +{ + /* up to 255 units, 255 addresses */ + ip->s_addr = htonl(TEN_1_NET | (unit << 8) | addr_index); + return; +} + +static void +network_interface_assign_address(network_interface_t netif, + unsigned int unit, unsigned int address_index) +{ + get_ipv4_address(unit, address_index, &netif->ip); + ifnet_add_ip_address(netif->if_name, netif->ip, + inet_class_c_subnet_mask); + route_add_inet_scoped_subnet(netif->if_name, netif->if_index, + netif->ip, inet_class_c_subnet_mask); + ifnet_start_ipv6(netif->if_name); + T_ASSERT_EQ(inet6_get_linklocal_address(netif->if_index, &netif->ip6), 1, NULL); +} + +static void +initialize_feth_pairs(u_int n, bool need_address) +{ + network_interface_pair_t scan; + + S_feth_pairs = network_interface_pair_list_alloc(n); + scan = S_feth_pairs->list; + for (unsigned int i = 0; i < n; i++, scan++) { + network_interface_create(&scan->one, FETH_NAME); + network_interface_create(&scan->two, FETH_NAME); + if (need_address) { + network_interface_assign_address(&scan->one, i, 1); + network_interface_assign_address(&scan->two, i, 2); + } + fake_set_peer(scan->one.if_name, scan->two.if_name); + } + + ifname1 = S_feth_pairs->list->one.if_name; + ifname2 = S_feth_pairs->list->two.if_name; +} + +static void +cleanup(void) +{ + network_interface_pair_list_destroy(S_feth_pairs); +} + 
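For reference, the get_ipv4_address() helper above packs the feth pair number and address index into the third and fourth octets of a 10.1.0.0/16 address, so pair 0 is assigned 10.1.0.1 and 10.1.0.2, and the TEN_1_BROADCAST destination used by the sender below is 10.1.0.255, the directed broadcast of that class C subnet. A minimal standalone C sketch of the same encoding (independent of the test harness, reusing only the constants defined above) that can be used to sanity-check the expected addresses:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

#define TEN_NET         0x0a000000              /* 10.0.0.0 */
#define TEN_1_NET       (TEN_NET | 0x010000)    /* 10.1.0.0 */
#define TEN_1_BROADCAST (TEN_1_NET | 0xff)      /* 10.1.0.255 */

int
main(void)
{
	/* Same packing as get_ipv4_address(): 10.1.<unit>.<addr_index> */
	for (unsigned int unit = 0; unit < 2; unit++) {
		for (unsigned int addr_index = 1; addr_index <= 2; addr_index++) {
			struct in_addr ip;
			ip.s_addr = htonl(TEN_1_NET | (unit << 8) | addr_index);
			printf("pair %u address %u -> %s\n", unit, addr_index, inet_ntoa(ip));
		}
	}
	struct in_addr bcast = { .s_addr = htonl(TEN_1_BROADCAST) };
	printf("broadcast destination -> %s\n", inet_ntoa(bcast)); /* 10.1.0.255 */
	return 0;
}

With that numbering, the unicast, broadcast and multicast sendto() targets in the IPv4 test below are 10.1.0.1, 10.1.0.255 and 239.1.2.3 respectively.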
+static void +init(void) +{ + T_ATEND(cleanup); + + initialize_feth_pairs(1, true); +} + +T_DECL(ip_recv_link_addr_type, "IP_RECV_LINK_ADDR_TYPE") +{ + int receive_fd; + int sender_fd; + socklen_t solen; + int optval; + struct ip_mreq mreq = {}; + struct sockaddr_in sin = {}; + struct in_addr addr; + char *str; + ssize_t retval; + in_port_t port; + + init(); + + /* + * Setup receiver bound to ifname1 + */ + T_ASSERT_POSIX_SUCCESS(receive_fd = socket(AF_INET, SOCK_DGRAM, 0), NULL); + + solen = strlen(ifname1); + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname1, solen), NULL); + + /* + * Verify the IP_RECV_LINK_ADDR_TYPE option is setable + */ + solen = sizeof(int); + + T_ASSERT_POSIX_SUCCESS(getsockopt(receive_fd, IPPROTO_IP, IP_RECV_LINK_ADDR_TYPE, &optval, &solen), NULL); + T_LOG("IP_RECV_LINK_ADDR_TYPE default: %d", optval); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, IPPROTO_IP, IP_RECV_LINK_ADDR_TYPE, &optval, solen), NULL); + + T_ASSERT_POSIX_SUCCESS(getsockopt(receive_fd, IPPROTO_IP, IP_RECV_LINK_ADDR_TYPE, &optval, &solen), NULL); + T_LOG("IP_RECV_LINK_ADDR_TYPE enabled: %d", optval); + + /* + * Join multicast group on ifname1 + */ + inet_aton(IPV4_MULTICAST_ADDR_STR, &mreq.imr_multiaddr); + mreq.imr_interface = S_feth_pairs->list->one.ip; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)), NULL); + + struct timeval timeo = { .tv_sec = 1, .tv_usec = 0 }; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, IPPROTO_IP, IP_RECVDSTADDR, &optval, sizeof(optval)), NULL); + + /* + * Bind to an ephemeral port + */ + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + T_ASSERT_POSIX_SUCCESS(bind(receive_fd, (struct sockaddr *)&sin, sizeof(struct sockaddr_in)), NULL); + + solen = sizeof(struct sockaddr_in); + T_ASSERT_POSIX_SUCCESS(getsockname(receive_fd, (struct sockaddr *)&sin, &solen), NULL); + + port = sin.sin_port; + T_LOG("receiver bound to port %u", ntohs(port)); + + + /* + * Setup receiver bound to ifname2 + */ + T_ASSERT_POSIX_SUCCESS(sender_fd = socket(AF_INET, SOCK_DGRAM, 0), NULL); + + solen = strlen(ifname2); + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname2, solen), NULL); + + addr = S_feth_pairs->list->two.ip; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)), NULL); + + u_char ttl = 255; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, SOL_SOCKET, SO_BROADCAST, &optval, sizeof(optval)), NULL); + + /* + * Send unicast, broadcast and multicast a few times to allow for ARP to do its job + */ + for (int i = 0; i < 3; i++) { + str = "unicast"; + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr = S_feth_pairs->list->one.ip; + sin.sin_port = port; + T_ASSERT_POSIX_SUCCESS(retval = sendto(sender_fd, str, strlen(str) + 1, 0, (struct sockaddr *)&sin, sin.sin_len), NULL); + + str = "broadcast"; + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr.s_addr = htonl(TEN_1_BROADCAST); + sin.sin_port = port; + T_ASSERT_POSIX_SUCCESS(retval = sendto(sender_fd, str, strlen(str) + 1, 0, (struct sockaddr *)&sin, sin.sin_len), NULL); + + str = "multicast"; + sin.sin_family = AF_INET; + sin.sin_len = 
sizeof(struct sockaddr_in); + inet_aton(IPV4_MULTICAST_ADDR_STR, &sin.sin_addr); + sin.sin_port = port; + T_ASSERT_POSIX_SUCCESS(retval = sendto(sender_fd, str, strlen(str) + 1, 0, (struct sockaddr *)&sin, sin.sin_len), NULL); + + usleep(50); + } + + while (true) { + char control_space[CMSG_SPACE(8192)] = {}; + struct msghdr recvmsghdr = {}; + char packet_space[1500] = {}; + struct cmsghdr *cmsg; + int addr_type = -1; + + struct iovec recv_iov; + recv_iov.iov_len = sizeof(packet_space); + recv_iov.iov_base = &packet_space; + + recvmsghdr.msg_iov = &recv_iov; + recvmsghdr.msg_iovlen = 1; + recvmsghdr.msg_control = &control_space; + recvmsghdr.msg_controllen = sizeof(control_space); + recvmsghdr.msg_flags = 0; + + retval = recvmsg(receive_fd, &recvmsghdr, 0); + if (retval < 0) { + break; + } + + for (cmsg = CMSG_FIRSTHDR(&recvmsghdr); cmsg != NULL; cmsg = CMSG_NXTHDR(&recvmsghdr, cmsg)) { + if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVDSTADDR) { + addr.s_addr = *(in_addr_t *)CMSG_DATA(cmsg); + } + if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECV_LINK_ADDR_TYPE) { + addr_type = *(int *)CMSG_DATA(cmsg); + } + } + T_LOG("received packet to: %s address type: %d", inet_ntoa(addr), addr_type); + + if (IN_MULTICAST(ntohl(addr.s_addr))) { + T_ASSERT_EQ(addr_type, IP_RECV_LINK_ADDR_MULTICAST, "multicast"); + } else if ((ntohl(addr.s_addr) & 0x000000ff) == 0x000000ff) { + T_ASSERT_EQ(addr_type, IP_RECV_LINK_ADDR_BROADCAST, "broadcast"); + } else { + T_ASSERT_EQ(addr_type, IP_RECV_LINK_ADDR_UNICAST, "unicast"); + } + } +} + +T_DECL(ipv6_recv_link_addr_type, "IPV6_RECV_LINK_ADDR_TYPE") +{ + int receive_fd; + int sender_fd; + socklen_t solen; + int optval; + struct ipv6_mreq mreq = {}; + struct sockaddr_in6 sin6 = {}; + char *str; + ssize_t retval; + in_port_t port; + char addrstr[INET6_ADDRSTRLEN]; + + init(); + + inet_ntop(AF_INET6, &S_feth_pairs->list->one.ip6, addrstr, sizeof(addrstr)); + T_LOG("feth one: %s index: %u ip: %s ip6: %s", + S_feth_pairs->list->one.if_name, + S_feth_pairs->list->one.if_index, + inet_ntoa(S_feth_pairs->list->one.ip), + addrstr); + + inet_ntop(AF_INET6, &S_feth_pairs->list->two.ip6, addrstr, sizeof(addrstr)); + T_LOG("feth two: %s index: %u ip: %s ip6: %s", + S_feth_pairs->list->two.if_name, + S_feth_pairs->list->two.if_index, + inet_ntoa(S_feth_pairs->list->two.ip), + addrstr); + + + /* + * Setup receiver bound to ifname1 + */ + T_ASSERT_POSIX_SUCCESS(receive_fd = socket(AF_INET6, SOCK_DGRAM, 0), NULL); + + solen = strlen(ifname1); + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname1, solen), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, IPPROTO_IPV6, IPV6_V6ONLY, &optval, sizeof(optval)), NULL); + + /* + * Verify the IPV6_RECV_LINK_ADDR_TYPE option is settable + */ + solen = sizeof(int); + + T_ASSERT_POSIX_SUCCESS(getsockopt(receive_fd, IPPROTO_IPV6, IPV6_RECV_LINK_ADDR_TYPE, &optval, &solen), NULL); + T_LOG("IPV6_RECV_LINK_ADDR_TYPE default: %d", optval); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, IPPROTO_IPV6, IPV6_RECV_LINK_ADDR_TYPE, &optval, solen), NULL); + + T_ASSERT_POSIX_SUCCESS(getsockopt(receive_fd, IPPROTO_IPV6, IPV6_RECV_LINK_ADDR_TYPE, &optval, &solen), NULL); + T_LOG("IPV6_RECV_LINK_ADDR_TYPE enabled: %d", optval); + + /* + * Join multicast group on ifname1 + */ + inet_pton(AF_INET6, IPV6_MULTICAST_ADDR_STR, &mreq.ipv6mr_multiaddr); + mreq.ipv6mr_interface = S_feth_pairs->list->one.if_index; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd,
IPPROTO_IPV6, IPV6_JOIN_GROUP, &mreq, sizeof(mreq)), NULL); + + struct timeval timeo = { .tv_sec = 1, .tv_usec = 0 }; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(receive_fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &optval, sizeof(optval)), NULL); + + /* + * Bind to an ephemeral port + */ + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + T_ASSERT_POSIX_SUCCESS(bind(receive_fd, (struct sockaddr *)&sin6, sizeof(struct sockaddr_in6)), NULL); + + solen = sizeof(struct sockaddr_in6); + T_ASSERT_POSIX_SUCCESS(getsockname(receive_fd, (struct sockaddr *)&sin6, &solen), NULL); + + port = sin6.sin6_port; + T_LOG("receiver bound to port %u", ntohs(port)); + + + /* + * Setup receiver bound to ifname2 + */ + T_ASSERT_POSIX_SUCCESS(sender_fd = socket(AF_INET6, SOCK_DGRAM, 0), NULL); + + solen = strlen(ifname2); + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname2, solen), NULL); + + optval = S_feth_pairs->list->two.if_index; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, IPPROTO_IPV6, IPV6_MULTICAST_IF, &optval, sizeof(optval)), NULL); + + optval = IPV6_DEFHLIM; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &optval, sizeof(optval)), NULL); + + optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(sender_fd, SOL_SOCKET, SO_BROADCAST, &optval, sizeof(optval)), NULL); + + /* + * Send unicast, broadcast and multicast a few times to allow for ND to do its job + */ + for (int i = 0; i < 3; i++) { + str = "unicast"; + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_addr = S_feth_pairs->list->one.ip6; + sin6.sin6_port = port; + sin6.sin6_scope_id = S_feth_pairs->list->two.if_index; + T_ASSERT_POSIX_SUCCESS(retval = sendto(sender_fd, str, strlen(str) + 1, 0, (struct sockaddr *)&sin6, sin6.sin6_len), NULL); + + str = "multicast"; + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + inet_pton(AF_INET6, IPV6_MULTICAST_ADDR_STR, &sin6.sin6_addr); + sin6.sin6_port = port; + T_ASSERT_POSIX_SUCCESS(retval = sendto(sender_fd, str, strlen(str) + 1, 0, (struct sockaddr *)&sin6, sin6.sin6_len), NULL); + + usleep(50); + } + + while (true) { + char control_space[CMSG_SPACE(8192)] = {}; + struct msghdr recvmsghdr = {}; + char packet_space[1500] = {}; + struct cmsghdr *cmsg; + int addr_type = -1; + struct in6_pktinfo pktinfo = {}; + + struct iovec recv_iov; + recv_iov.iov_len = sizeof(packet_space); + recv_iov.iov_base = &packet_space; + + recvmsghdr.msg_iov = &recv_iov; + recvmsghdr.msg_iovlen = 1; + recvmsghdr.msg_control = &control_space; + recvmsghdr.msg_controllen = sizeof(control_space); + recvmsghdr.msg_flags = 0; + + retval = recvmsg(receive_fd, &recvmsghdr, 0); + if (retval < 0) { + break; + } + + for (cmsg = CMSG_FIRSTHDR(&recvmsghdr); cmsg != NULL; cmsg = CMSG_NXTHDR(&recvmsghdr, cmsg)) { + if (cmsg->cmsg_level == IPPROTO_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) { + pktinfo = *(struct in6_pktinfo *)CMSG_DATA(cmsg); + } + if (cmsg->cmsg_level == IPPROTO_IPV6 && cmsg->cmsg_type == IPV6_RECV_LINK_ADDR_TYPE) { + addr_type = *(int *)CMSG_DATA(cmsg); + } + } + inet_ntop(AF_INET6, &pktinfo.ipi6_addr, addrstr, sizeof(addrstr)); + T_LOG("received packet to: %s address type: %d", addrstr, addr_type); + + if (IN6_IS_ADDR_MULTICAST(&pktinfo.ipi6_addr)) { + T_ASSERT_EQ(addr_type, IP_RECV_LINK_ADDR_MULTICAST, "multicast"); + } else { + T_ASSERT_EQ(addr_type, 
IP_RECV_LINK_ADDR_UNICAST, "unicast"); + } + } +} diff --git a/tests/reply_port_defense.c b/tests/reply_port_defense.c index 0bfa9bd67..7836d6b40 100644 --- a/tests/reply_port_defense.c +++ b/tests/reply_port_defense.c @@ -5,9 +5,9 @@ #include #include #include +#include #include #include "excserver_protect_state.h" -#include "../osfmk/ipc/ipc_init.h" #include "../osfmk/mach/port.h" #include "../osfmk/kern/exc_guard.h" #include "exc_helpers.h" @@ -15,7 +15,6 @@ #include "cs_helpers.h" #include -#define MAX_TEST_NUM 10 #define MAX_ARGV 3 extern char **environ; @@ -35,6 +34,82 @@ T_GLOBAL_META( T_META_TIMEOUT(10), T_META_RUN_CONCURRENTLY(TRUE)); +static bool +check_current_cs_flags(code_signing_config_t expected_cs_config) +{ + code_signing_config_t cur_cs_config = 0; + size_t cs_config_size = sizeof(cur_cs_config); + sysctlbyname("security.codesigning.config", &cur_cs_config, &cs_config_size, NULL, 0); + return cur_cs_config & expected_cs_config; +} + +static bool +unrestricted_debugging() +{ + /* AMFI often disables security features if debugging is unrestricted */ + bool unrestricted_debugging = check_current_cs_flags(CS_CONFIG_UNRESTRICTED_DEBUGGING); + if (unrestricted_debugging) { + T_LOG("UNRESTRICTED DEBUGGING"); + } + return unrestricted_debugging; +} + +static bool +sip_disabled() +{ +#if XNU_TARGET_OS_OSX || XNU_TARGET_OS_BRIDGE + /* SIP can only be disabled on macOS */ + bool sip_disabled = csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0; + if (sip_disabled) { + T_LOG("SIP DISABLED"); + } + return sip_disabled; +#else + return false; +#endif +} + +static bool +ipc_hardening_disabled() +{ +#if TARGET_OS_OSX || TARGET_OS_BRIDGE + /* + * CS_CONFIG_GET_OUT_OF_MY_WAY (enabled via AMFI boot-args) + * disables IPC security features. This boot-arg previously + * caused a headache for developers on macos, who frequently use it for + * testing purposes, because all of their 3rd party apps will + * crash due to being treated as platform code. Unfortunately + * BATS runs with this boot-arg enabled very frequently. 
+ */ + bool enforcement_disabled = check_current_cs_flags(CS_CONFIG_GET_OUT_OF_MY_WAY); + if (enforcement_disabled) { + T_LOG("IPC HARDENING ENFORCEMENT IS DISABLED"); + } else { + T_LOG("IPC HARDENING ENABLED"); + } + return enforcement_disabled; +#else /* TARGET_OS_OSX || TARGET_OS_BRIDGE */ + /* mach hardening is only disabled by boot-args on macOS */ + return false; +#endif +} + +static bool +is_release_kernel() +{ + int kernel_type = 0; + size_t kernel_type_size = sizeof(kernel_type); + int r; + r = sysctlbyname("kern.development", &kernel_type, &kernel_type_size, NULL, 0); + if (r < 0) { + T_WITH_ERRNO; + T_SKIP("could not find \"kern.development\" sysctl"); + } + return kernel_type == 0; +} + + + static mach_port_t alloc_exception_port(void) { @@ -116,9 +191,10 @@ catch_mach_exception_raise_identity_protected( { #pragma unused(exception_port, thread_id, task_id_token) - T_ASSERT_GT_UINT(codeCnt, 0, "CodeCnt"); + T_ASSERT_GT_UINT(codeCnt, 1, "CodeCnt"); - T_LOG("Caught exception type: %d code: 0x%llx", exception, codes[0]); + T_LOG("Caught %d codes", codeCnt); + T_LOG("Caught exception type: %d code[0]: 0x%llx code[1]:0x%llx", exception, codes[0], codes[1]); exception_taken = exception; if (exception == EXC_GUARD) { received_exception_code = EXC_GUARD_DECODE_GUARD_FLAVOR((uint64_t)codes[0]); @@ -157,6 +233,8 @@ exception_server_thread(void *arg) static void reply_port_defense(const bool thirdparty_hardened, int test_index, mach_exception_data_type_t expected_exception_code, bool triggers_exception) { + /* reset exception code before running this test */ + received_exception_code = 0; int ret = 0; uint32_t task_exc_guard = 0; @@ -234,32 +312,22 @@ T_DECL(reply_port_defense, "Test reply port semantics violations", T_META_IGNORECRASHES(".*reply_port_defense_client.*"), T_META_CHECK_LEAKS(false), - T_META_TAG_VM_NOT_PREFERRED) { + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(!TARGET_OS_OSX && !TARGET_OS_BRIDGE)) { + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } bool triggers_exception = true; - /* The first test is setup as moving immovable receive right of a reply port. 
*/ - reply_port_defense(true, 0, kGUARD_EXC_IMMOVABLE, triggers_exception); - reply_port_defense(false, 0, kGUARD_EXC_IMMOVABLE, triggers_exception); - + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_IMMOVABLE; + int test_num = 0; int rp_defense_max_test_idx = 3; - /* Run the reply_port_defense tests 1, 2, and 3 */ - mach_exception_data_type_t expected_exception_code = kGUARD_EXC_INVALID_RIGHT; - for (int i = 1; i <= rp_defense_max_test_idx; i++) { + /* Run the reply_port_defense tests 0, 1, 2 */ + for (int i = test_num; i < rp_defense_max_test_idx; i++) { reply_port_defense(true, i, expected_exception_code, triggers_exception); reply_port_defense(false, i, expected_exception_code, triggers_exception); } -} - -T_DECL(test_move_provisional_reply_port, - "provisional reply ports are movable", - T_META_IGNORECRASHES(".*reply_port_defense_client.*"), - T_META_CHECK_LEAKS(false), - T_META_ENABLED(TARGET_OS_OSX || TARGET_OS_BRIDGE)) { - int test_num = 4; - mach_exception_data_type_t expected_exception_code = 0; - bool triggers_exception = false; - - reply_port_defense(true, test_num, expected_exception_code, triggers_exception); - reply_port_defense(false, test_num, expected_exception_code, triggers_exception); + reply_port_defense(true, 3, kGUARD_EXC_INVALID_RIGHT, triggers_exception); + reply_port_defense(false, 3, kGUARD_EXC_INVALID_RIGHT, triggers_exception); } T_DECL(test_unentitled_thread_set_exception_ports, @@ -270,32 +338,12 @@ T_DECL(test_unentitled_thread_set_exception_ports, mach_exception_data_type_t expected_exception_code = kGUARD_EXC_EXCEPTION_BEHAVIOR_ENFORCE; bool triggers_exception = true; -#if TARGET_OS_OSX - T_SKIP("Test disabled on macOS due to SIP disabled and AMFI boot args usage on BATS"); - /* - * CS_CONFIG_GET_OUT_OF_MY_WAY (enabled via AMFI boot-args) - * disables this security feature. This boot-arg previously - * caused a headache for developers on macos, who frequently use it for - * testing purposes, because all of their 3rd party apps will - * crash due to being treated as platform code. Unfortunately - * BATS runs with this boot-arg enabled. 
- */ - code_signing_config_t cs_config = 0; - size_t cs_config_size = sizeof(cs_config); - sysctlbyname("security.codesigning.config", &cs_config, &cs_config_size, NULL, 0); - if (cs_config & CS_CONFIG_GET_OUT_OF_MY_WAY) { - expected_exception_code = 0; - triggers_exception = false; - T_LOG("task identity security policy for thread_set_exception_ports" - " disabled due to AMFI boot-args."); - } else -#endif /* TARGET_OS_OSX */ - { - T_LOG("task identity security policy for thread_set_exception_ports enabled"); + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); } reply_port_defense(true, test_num, expected_exception_code, triggers_exception); - reply_port_defense(false, test_num, expected_exception_code, triggers_exception); + // reply_port_defense(false, test_num, expected_exception_code, triggers_exception); } T_DECL(test_unentitled_thread_set_state, @@ -305,13 +353,12 @@ T_DECL(test_unentitled_thread_set_state, T_META_ENABLED(false /* rdar://133955889 */)) { int test_num = 6; - mach_exception_data_type_t expected_exception_code = (mach_exception_data_type_t)kGUARD_EXC_THREAD_SET_STATE; + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_THREAD_SET_STATE; bool triggers_exception = true; -#if TARGET_OS_OSX - T_SKIP("Test disabled on macOS due to mach hardening opt out"); -#endif /* TARGET_OS_OSX */ - reply_port_defense(true, test_num, expected_exception_code, triggers_exception); reply_port_defense(false, test_num, expected_exception_code, triggers_exception); } @@ -321,7 +368,10 @@ T_DECL(unentitled_set_exception_ports_pass, T_META_IGNORECRASHES(".*reply_port_defense_client.*"), T_META_CHECK_LEAKS(false)) { int test_num = 7; - mach_exception_data_type_t expected_exception_code = (mach_exception_data_type_t)0; + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_NONE; bool triggers_exception = false; reply_port_defense(true, test_num, expected_exception_code, triggers_exception); reply_port_defense(false, test_num, expected_exception_code, triggers_exception); @@ -331,29 +381,189 @@ T_DECL(unentitled_set_exception_ports_pass, T_DECL(kobject_reply_port_defense, "sending messages to kobjects without a proper reply port should crash", T_META_IGNORECRASHES(".*reply_port_defense_client.*"), + T_META_TAG_VM_PREFERRED, T_META_CHECK_LEAKS(false), - T_META_ENABLED(!TARGET_OS_OSX)) { /* disable on macOS due to BATS boot-args */ + T_META_ENABLED(!TARGET_OS_OSX && !TARGET_OS_BRIDGE)) { /* disable on macOS due to BATS boot-args */ + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } int test_num = 9; -#if __x86_64__ - mach_exception_data_type_t expected_exception_code = (mach_exception_data_type_t)kGUARD_EXC_REQUIRE_REPLY_PORT_SEMANTICS; -#else - mach_exception_data_type_t expected_exception_code = (mach_exception_data_type_t)kGUARD_EXC_SEND_INVALID_REPLY; -#endif + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_KOBJECT_REPLY_PORT_SEMANTICS; bool triggers_exception = true; + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); reply_port_defense(false, test_num, expected_exception_code, triggers_exception); } -T_DECL(test_alloc_provisional_reply_port, - "1p is not allowed to create provisional reply ports on iOS+", +T_DECL(test_alloc_weak_reply_port, + "1p is not allowed to create weak reply 
ports", T_META_IGNORECRASHES(".*reply_port_defense_client.*"), - T_META_CHECK_LEAKS(false), - T_META_ENABLED(!TARGET_OS_OSX && !TARGET_OS_BRIDGE && !TARGET_OS_XR)) { + T_META_CHECK_LEAKS(false)) { + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } + int test_num = 10; - mach_exception_data_type_t expected_exception_code = kGUARD_EXC_PROVISIONAL_REPLY_PORT; + mach_exception_data_type_t expected_exception_code; bool triggers_exception = true; +#if TARGET_OS_OSX || TARGET_OS_BRIDGE + expected_exception_code = kGUARD_EXC_PROVISIONAL_REPLY_PORT; +#else + expected_exception_code = kGUARD_EXC_INVALID_MPO_ENTITLEMENT; +#endif /* TARGET_OS_OSX || TARGET_OS_BRIDGE */ + /* rdar://136996362 (iOS+ telemetry for restricting 1P usage of provisional reply port) */ reply_port_defense(true, test_num, expected_exception_code, triggers_exception); reply_port_defense(false, test_num, expected_exception_code, triggers_exception); } + +T_DECL(test_move_service_port, + "service ports are immovable", + T_META_IGNORECRASHES(".*reply_port_defense_client.*"), + T_META_CHECK_LEAKS(false)) { + int test_num = 11; + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_SERVICE_PORT_VIOLATION_FATAL; + bool triggers_exception = true; + + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); +} + + +T_DECL(test_notification_policy, + "registering notifications on an mktimer crashes", + T_META_IGNORECRASHES(".*reply_port_defense_client.*"), + T_META_CHECK_LEAKS(false), + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(!TARGET_OS_OSX && !TARGET_OS_BRIDGE)) { /* disable on macOS due to BATS boot-args */ + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } + + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_INVALID_NOTIFICATION_REQ; + bool triggers_exception = true; + + int test_num = 12; + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); + + test_num = 13; + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); + + test_num = 14; + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); +} + + +T_DECL(test_reply_port_extract_right_disallowed, + "mach_port_extract_right disallowed on reply port", + T_META_IGNORECRASHES(".*reply_port_defense_client.*"), + T_META_CHECK_LEAKS(false)) { + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } + + int test_num = 15; + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_INVALID_RIGHT; + bool triggers_exception = true; + + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); +} + +T_DECL(test_mach_task_self_send_movability, + "mach_task_self is immovable unless you have called ", + T_META_IGNORECRASHES(".*reply_port_defense_client.*"), + T_META_CHECK_LEAKS(false)) { + if (ipc_hardening_disabled()) { + T_SKIP("hardening disabled due to boot-args"); + } + + int test_num = 16; + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_IMMOVABLE; + bool triggers_exception = true; + + if (sip_disabled() 
|| unrestricted_debugging()) { + /* + * see `proc_check_get_movable_control_port`: + * enforcement is always controlled by entitlements and + * unrestricted debugging boot-arg + * or if SIP is disabled + */ + expected_exception_code = 0; + triggers_exception = false; + } + + /* + * it should fail on reply_port_defense_client_3P_hardened because it doesn't + * have com.apple.security.get-movable-control-port + */ + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + test_num = 17; /* These tests crash the same way */ + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + + /* + * it should succeed on reply_port_defense_client because it + * has com.apple.security.get-movable-control-port + */ + test_num = 16; + expected_exception_code = 0; + triggers_exception = false; + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); + test_num = 17; + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); +} + + +T_DECL(test_send_immovability, + "ensure that send immovability is set on ports, even if they are not copied out", + T_META_IGNORECRASHES(".*reply_port_defense_client.*"), + T_META_CHECK_LEAKS(false)) { + /* attempt to move ports created by mach port construct */ + + + /* test_move_newly_constructed_port_immovable_send */ + int test_num = 18; + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_IMMOVABLE; + bool triggers_exception = true; + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); + + /* test_move_special_reply_port */ + test_num = 19; + expected_exception_code = kGUARD_EXC_IMMOVABLE; + triggers_exception = true; + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); +} + +T_DECL(test_reply_port_header_disposition, + "Ensure only make_send_once is allowed for reply port", + T_META_IGNORECRASHES(".*reply_port_defense_client.*"), + T_META_CHECK_LEAKS(false), + T_META_ENABLED(!TARGET_OS_OSX && !TARGET_OS_BRIDGE)) { +#if TARGET_OS_OSX || TARGET_OS_BRIDGE + T_SKIP("disabled on macos"); +#endif + int test_num = 20; + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_SEND_INVALID_REPLY; + bool triggers_exception = true; + + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); +} + +T_DECL(test_service_port_as_exception_port, + "Ensure both service and weak service port can be used as exception port", + T_META_IGNORECRASHES(".*reply_port_defense_client.*"), + T_META_CHECK_LEAKS(false)) { + int test_num = 21; + mach_exception_data_type_t expected_exception_code = kGUARD_EXC_NONE; + bool triggers_exception = false; + + reply_port_defense(true, test_num, expected_exception_code, triggers_exception); + reply_port_defense(false, test_num, expected_exception_code, triggers_exception); +} diff --git a/tests/reply_port_defense_client.c b/tests/reply_port_defense_client.c index 11bc28397..8b2eef2ba 100644 --- a/tests/reply_port_defense_client.c +++ b/tests/reply_port_defense_client.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -10,10 +11,11 @@ #include #include #include - +#include +#include #include "cs_helpers.h" -#define MAX_TEST_NUM 11 +#define MAX_TEST_NUM 22 #if __arm64__ #define 
machine_thread_state_t arm_thread_state64_t @@ -27,6 +29,9 @@ #error Unsupported architecture #endif +/* in xpc/launch_private.h */ +#define XPC_DOMAIN_SYSTEM 1 + static mach_port_t alloc_server_port(void) { @@ -34,48 +39,92 @@ alloc_server_port(void) kern_return_t kr; kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &server_port); - assert(kr == 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "alloc_server_port"); kr = mach_port_insert_right(mach_task_self(), server_port, server_port, MACH_MSG_TYPE_MAKE_SEND); - assert(kr == 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "alloc_server_port mach_port_insert_right"); return server_port; } -static mach_port_t -alloc_provisional_reply_port() -{ - kern_return_t kr; - mach_port_t reply_port = MACH_PORT_NULL; - mach_port_t task = mach_task_self(); - - mach_port_options_t opts = { - .flags = MPO_PROVISIONAL_REPLY_PORT | MPO_INSERT_SEND_RIGHT, - }; - - kr = mach_port_construct(mach_task_self(), &opts, 0, &reply_port); - assert(kr == 0); - - return reply_port; -} - static mach_port_t alloc_reply_port() { kern_return_t kr; mach_port_t reply_port = MACH_PORT_NULL; - mach_port_t task = mach_task_self(); mach_port_options_t opts = { .flags = MPO_REPLY_PORT | MPO_INSERT_SEND_RIGHT, }; kr = mach_port_construct(mach_task_self(), &opts, 0, &reply_port); - assert(kr == 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "alloc_reply_port"); + T_QUIET; T_ASSERT_NE(reply_port, (mach_port_t)MACH_PORT_NULL, "reply_port_create: %s", mach_error_string(kr)); return reply_port; } +static mach_port_t +alloc_weak_reply_port() +{ + kern_return_t kr; + mach_port_t reply_port = MACH_PORT_NULL; + + mach_port_options_t opts = { + .flags = MPO_PROVISIONAL_REPLY_PORT, + }; + + kr = mach_port_construct(mach_task_self(), &opts, 0, &reply_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "alloc_reply_port"); + T_QUIET; T_ASSERT_NE(reply_port, (mach_port_t)MACH_PORT_NULL, "weak_reply_port_create: %s", mach_error_string(kr)); + + return reply_port; +} + +static mach_port_t +alloc_service_port(void) +{ + kern_return_t kr; + mach_port_t service_port = MACH_PORT_NULL; + + struct mach_service_port_info sp_info = { + .mspi_string_name = "com.apple.testservice", + .mspi_domain_type = XPC_DOMAIN_SYSTEM, + }; + + mach_port_options_t opts = { + .flags = MPO_STRICT_SERVICE_PORT | MPO_INSERT_SEND_RIGHT, + .service_port_info = &sp_info, + }; + + kr = mach_port_construct(mach_task_self(), &opts, 0, &service_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "alloc_service_port"); + + return service_port; +} + +static mach_port_t +alloc_weak_service_port(void) +{ + kern_return_t kr; + mach_port_t weak_service_port = MACH_PORT_NULL; + + struct mach_service_port_info sp_info = { + .mspi_string_name = "com.apple.testservice", + .mspi_domain_type = XPC_DOMAIN_SYSTEM, + }; + + mach_port_options_t opts = { + .flags = MPO_SERVICE_PORT | MPO_INSERT_SEND_RIGHT, + .service_port_info = &sp_info, + }; + + kr = mach_port_construct(mach_task_self(), &opts, 0, &weak_service_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "alloc_weak_service_port"); + + return weak_service_port; +} + /* The rcv right of the port would be marked immovable. 
*/ static void test_immovable_receive_right(void) @@ -116,6 +165,13 @@ test_make_send_once_right(void) printf("[reply_port_defense_client test_make_send_once_right]: mach_port_insert_right() returned %d\n", kr); } +static void +test_alloc_weak_reply_port(void) +{ + mach_port_t reply_port = alloc_weak_reply_port(); + printf("[reply_port_defense_client test_alloc_weak_reply_port]: did not crash with port=%d\n", reply_port); +} + /* The send right of the port would only used for guarding a name in ipc space, it would not allow to send a message. */ static void test_using_send_right(void) @@ -166,43 +222,13 @@ test_move_send_right(void) printf("[reply_port_defense_client test_move_send_right]: mach_msg2() returned %d\n", kr); } -static void -test_move_provisional_reply_port(void) -{ - kern_return_t kr; - mach_port_t server_port = MACH_PORT_NULL, reply_port = MACH_PORT_NULL; - struct { - mach_msg_header_t header; - mach_msg_body_t body; - mach_msg_port_descriptor_t desc; - } msg; - - server_port = alloc_server_port(); - reply_port = alloc_provisional_reply_port(); - - msg.header.msgh_remote_port = server_port; - msg.header.msgh_local_port = MACH_PORT_NULL; - msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0) | MACH_MSGH_BITS_COMPLEX; - msg.header.msgh_size = sizeof msg; - - msg.body.msgh_descriptor_count = 1; - - msg.desc.name = reply_port; - msg.desc.disposition = MACH_MSG_TYPE_MOVE_RECEIVE; - msg.desc.type = MACH_MSG_PORT_DESCRIPTOR; - - kr = mach_msg_send(&msg.header); - - printf("[reply_port_defense_client test_immovable_receive_right]: mach_msg2() returned %d\n", kr); -} - static void test_unentitled_thread_set_state(void) { machine_thread_state_t ts; mach_msg_type_number_t count = MACHINE_THREAD_STATE_COUNT; - /* thread_set_state as a hardened binary should fail */ + /* thread_set_state as a platform restrictions binary should fail */ kern_return_t kr = thread_get_state(mach_thread_self(), MACHINE_THREAD_STATE, (thread_state_t)&ts, &count); kr = thread_set_state(mach_thread_self(), MACHINE_THREAD_STATE, (thread_state_t)&ts, count); @@ -211,11 +237,11 @@ test_unentitled_thread_set_state(void) } static void -unentitled_set_exception_ports_crash(void) +test_unentitled_thread_set_exception_ports(void) { mach_port_t exc_port = alloc_server_port(); - /* thread_set_exception_ports as a hardened binary should fail */ + /* thread_set_exception_ports as a platform restrictions binary should fail without identity protected options */ kern_return_t kr = thread_set_exception_ports( mach_thread_self(), EXC_MASK_ALL, @@ -243,7 +269,7 @@ unentitled_set_exception_ports_pass(void) exc_port, (exception_behavior_t)((unsigned int)EXCEPTION_STATE_IDENTITY_PROTECTED | MACH_EXCEPTION_CODES), EXCEPTION_THREAD_STATE); - assert(kr == 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_set_exception_ports EXCEPTION_STATE_IDENTITY_PROTECTED"); kr = thread_set_exception_ports( mach_thread_self(), @@ -251,7 +277,7 @@ unentitled_set_exception_ports_pass(void) exc_port, (exception_behavior_t)((unsigned int)EXCEPTION_IDENTITY_PROTECTED | MACH_EXCEPTION_CODES), EXCEPTION_THREAD_STATE); - assert(kr == 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_set_exception_ports EXCEPTION_IDENTITY_PROTECTED"); return; } @@ -266,7 +292,7 @@ exception_ports_crash(void) }; kr = mach_port_construct(mach_task_self(), &opts, 0ull, &exc_port); - assert(kr == KERN_SUCCESS); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "exception_ports_crash mach_port_construct"); kr = task_register_hardened_exception_handler(current_task(), 0, 
EXC_MASK_BAD_ACCESS, @@ -290,24 +316,269 @@ kobject_reply_port_defense(void) mach_port_t port = MACH_PORT_NULL; kern_return_t kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port); - assert(kr == KERN_SUCCESS); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "kobject_reply_port_defense mach_port_allocate"); // make a kobject call kr = thread_get_state(mach_thread_self(), MACHINE_THREAD_STATE, (thread_state_t)&ts, &count); - assert(kr == KERN_SUCCESS); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "kobject_reply_port_defense thread_get_state"); // set the MIG reply port to a "normal" port _os_tsd_set_direct(__TSD_MIG_REPLY, (void *)(uintptr_t)port); kr = thread_get_state(mach_thread_self(), MACHINE_THREAD_STATE, (thread_state_t)&ts, &count); - printf("kobject call did not crash: %d\n", kr); + T_FAIL("kobject call did not crash: %d\n", kr); } static void -test_alloc_provisional_reply_port(void) +test_move_service_port(void) { - mach_port_t __unused reply_port = alloc_provisional_reply_port(); + kern_return_t kr; + mach_port_t server_port = MACH_PORT_NULL, service_port = MACH_PORT_NULL; + struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t desc; + } msg; + + server_port = alloc_server_port(); + service_port = alloc_service_port(); + + msg.header.msgh_remote_port = server_port; + msg.header.msgh_local_port = MACH_PORT_NULL; + msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0) | MACH_MSGH_BITS_COMPLEX; + msg.header.msgh_size = sizeof msg; + + msg.body.msgh_descriptor_count = 1; + + msg.desc.name = service_port; + msg.desc.disposition = MACH_MSG_TYPE_MOVE_RECEIVE; + msg.desc.type = MACH_MSG_PORT_DESCRIPTOR; + + kr = mach_msg_send(&msg.header); + T_FAIL("move service port did not crash: %d\n", kr); +} + +static void +test_mktimer_notification_policy(void) +{ + mach_port_t timer_port = MACH_PORT_NULL; + mach_port_t notify_port = MACH_PORT_NULL; + mach_port_t previous = MACH_PORT_NULL; + + kern_return_t kr = KERN_SUCCESS; + + timer_port = mk_timer_create(); + T_ASSERT_NE(timer_port, (mach_port_t)MACH_PORT_NULL, "mk_timer_create: %s", mach_error_string(kr)); + + /* notification port for the mk_timer port to come back on */ + kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, ¬ify_port); + T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_allocate(notify_port): %s", mach_error_string(kr)); + + T_LOG("timer: 0x%x, notify: 0x%x", timer_port, notify_port); + + /* request a port-destroyed notification on the timer port, which should crash */ + kr = mach_port_request_notification(mach_task_self(), timer_port, MACH_NOTIFY_PORT_DESTROYED, + 0, notify_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &previous); + + T_FAIL("mktimer did not crash with exc_guard kr=%d", kr); +} + +static void +test_reply_port_port_destroyed_notification_policy(void) +{ + mach_port_t reply_port = MACH_PORT_NULL; + mach_port_t previous = MACH_PORT_NULL; + mach_port_t notify_port = MACH_PORT_NULL; + + kern_return_t kr = KERN_SUCCESS; + mach_port_options_t opts = {}; + + reply_port = alloc_reply_port(); + + kr = mach_port_construct(mach_task_self(), &opts, 0, ¬ify_port); + T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_allocate(notify_port): %s", mach_error_string(kr)); + + /* request a port-destroyed notification on the reply port */ + kr = mach_port_request_notification(mach_task_self(), reply_port, MACH_NOTIFY_PORT_DESTROYED, + 0, notify_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &previous); + + printf("reply port did not crash kr=%d\n", kr); +} + +static void 
+test_reply_port_no_senders_notification_policy(void) +{ + mach_port_t reply_port = MACH_PORT_NULL; + mach_port_t previous = MACH_PORT_NULL; + mach_port_t notify_port = MACH_PORT_NULL; + + kern_return_t kr = KERN_SUCCESS; + + reply_port = alloc_reply_port(); + mach_port_options_t opts = {}; + + kr = mach_port_construct(mach_task_self(), &opts, 0, ¬ify_port); + T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_allocate(notify_port): %s", mach_error_string(kr)); + + /* request a no-senders notification on the reply port */ + kr = mach_port_request_notification(mach_task_self(), reply_port, MACH_NOTIFY_NO_SENDERS, + 0, notify_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &previous); + + T_FAIL("reply port did not crash kr=%d", kr); +} + +static void +test_reply_port_insert_right_disallowed(void) +{ + mach_port_t reply_port = MACH_PORT_NULL; + mach_port_t send_reply_port = MACH_PORT_NULL; + mach_msg_type_name_t right = 0; + + kern_return_t kr = KERN_SUCCESS; + reply_port = alloc_reply_port(); + kr = mach_port_extract_right(mach_task_self(), reply_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &send_reply_port, &right); + T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_extract_right(reply_port, make_send_once): %s", mach_error_string(kr)); + + T_FAIL("reply port make send once outside of kmsg did not crash kr=%d", kr); +} + +static kern_return_t +move_port(mach_port_t immovable_port) +{ + mach_port_t server_port = MACH_PORT_NULL; + struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t desc; + } msg; + + server_port = alloc_server_port(); + + msg.header.msgh_remote_port = server_port; + msg.header.msgh_local_port = MACH_PORT_NULL; + msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0) | MACH_MSGH_BITS_COMPLEX; + msg.header.msgh_size = sizeof msg; + + msg.body.msgh_descriptor_count = 1; + + msg.desc.name = immovable_port; + msg.desc.disposition = MACH_MSG_TYPE_MOVE_SEND; + msg.desc.type = MACH_MSG_PORT_DESCRIPTOR; + return mach_msg_send(&msg.header); +} + +/* attempt to move mach_task_self */ +static void +test_mach_task_self_send_movability(void) +{ + kern_return_t kr = move_port(mach_task_self()); + printf("[reply_port_defense_client test_task_self_immovable]: mach_msg2() returned %d\n", kr); +} + +/* mach_task_self() is movable before and after calling task_get_special_port, when entitled */ +static void +test_task_self_movable_send(void) +{ + kern_return_t kr; + mach_port_t task_self = MACH_PORT_NULL; + + kr = move_port(mach_task_self()); + T_EXPECT_MACH_SUCCESS(kr, "move mach_task_self"); + + kr = task_get_special_port(mach_task_self(), TASK_KERNEL_PORT, &task_self); + T_EXPECT_MACH_SUCCESS(kr, "task_get_special_port"); + + kr = move_port(mach_task_self()); + T_EXPECT_MACH_SUCCESS(kr, "move mach_task_self again"); + + mach_port_t thread_port = pthread_mach_thread_np(pthread_main_thread_np()); + kr = move_port(thread_port); + T_EXPECT_MACH_SUCCESS(kr, "move main_thread_port"); +} + +static void +test_move_newly_constructed_port_immovable_send(void) +{ + kern_return_t kr; + mach_port_t port = MACH_PORT_NULL; + + mach_port_options_t opts = { + .flags = MPO_INSERT_SEND_RIGHT | MPO_CONNECTION_PORT, + .service_port_name = MPO_ANONYMOUS_SERVICE, + }; + + kr = mach_port_construct(mach_task_self(), &opts, 0, &port); + + kr = move_port(port); + printf("kr=%d\n", kr); + T_EXPECT_MACH_ERROR(kr, KERN_DENIED, "move port with immovable send rights"); +} + +static void +test_move_special_reply_port(void) +{ + kern_return_t kr; + mach_port_t special_reply_port = thread_get_special_reply_port(); + 
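+	/*
+	 * Illustrative note: thread_get_special_reply_port() returns the
+	 * calling thread's special reply port, whose send right is expected
+	 * to be immovable. The move_port() attempt below should therefore
+	 * be rejected with KERN_DENIED rather than transferring the right.
+	 */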
+ kr = move_port(special_reply_port); + T_EXPECT_MACH_ERROR(kr, KERN_DENIED, "move special reply port"); +} + +static void +test_reply_port_header_disposition(void) +{ + kern_return_t kr; + mach_port_t server_port = MACH_PORT_NULL; + mach_port_t reply_port1 = MACH_PORT_NULL, reply_port2 = MACH_PORT_NULL; + struct { + mach_msg_header_t header; + } msg; + + server_port = alloc_server_port(); + reply_port1 = alloc_reply_port(); + reply_port2 = alloc_reply_port(); + + msg.header.msgh_remote_port = server_port; + msg.header.msgh_size = sizeof msg; + + /* sending with make_send_once should succeed */ + msg.header.msgh_local_port = reply_port1; + msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, + MACH_MSG_TYPE_MAKE_SEND_ONCE); + kr = mach_msg_send(&msg.header); + T_EXPECT_MACH_SUCCESS(kr, "reply_port_disposition make_send_once"); + + /* sending with make_send should fail */ + msg.header.msgh_local_port = reply_port2; + msg.header.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, + MACH_MSG_TYPE_MAKE_SEND); + kr = mach_msg_send(&msg.header); + T_ASSERT_MACH_ERROR(kr, MACH_SEND_INVALID_REPLY, "reply_port_disposition make_send"); +} + +static void +test_service_port_as_exception_port(void) +{ + kern_return_t kr; + mach_port_t service_port = alloc_service_port(); + mach_port_t weak_service_port = alloc_weak_service_port(); + + kr = thread_set_exception_ports( + mach_thread_self(), + EXC_MASK_ALL, + service_port, + (exception_behavior_t)((unsigned int)EXCEPTION_STATE_IDENTITY_PROTECTED | MACH_EXCEPTION_CODES), + EXCEPTION_THREAD_STATE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "test_service_port_as_exception_port IOT_SERVICE_PORT"); + + kr = thread_set_exception_ports( + mach_thread_self(), + EXC_MASK_ALL, + weak_service_port, + (exception_behavior_t)((unsigned int)EXCEPTION_STATE_IDENTITY_PROTECTED | MACH_EXCEPTION_CODES), + EXCEPTION_THREAD_STATE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "test_service_port_as_exception_port IOT_WEAK_SERVICE_PORT"); } int @@ -315,9 +586,10 @@ main(int argc, char *argv[]) { uint32_t my_csflags = 0; bool thirdparty_hardened = !strcmp(argv[0], "./reply_port_defense_client_3P_hardened"); + T_ASSERT_POSIX_ZERO(csops(getpid(), CS_OPS_STATUS, &my_csflags, sizeof(my_csflags)), NULL); /* TODO add some sysctl which disabled platform binary bit here */ - if (my_csflags & CS_PLATFORM_BINARY == thirdparty_hardened) { + if ((my_csflags & CS_PLATFORM_BINARY) == thirdparty_hardened) { printf("platform binary does not match expected\n"); return -1; } @@ -325,16 +597,27 @@ main(int argc, char *argv[]) void (*tests[MAX_TEST_NUM])(void) = { test_immovable_receive_right, /* 0 */ - test_make_send_once_right, - test_using_send_right, /* 2 */ - test_move_send_right, - test_move_provisional_reply_port, /* 4 */ - unentitled_set_exception_ports_crash, + test_using_send_right, /* 1 */ + test_move_send_right, /* 2 */ + test_make_send_once_right, /* 3 */ + NULL, /* 4 */ + test_unentitled_thread_set_exception_ports, /* 5 */ test_unentitled_thread_set_state, /* 6 */ unentitled_set_exception_ports_pass, exception_ports_crash, /* 8 */ kobject_reply_port_defense, /* 9 */ - test_alloc_provisional_reply_port, /* 10 */ + test_alloc_weak_reply_port, /* 10 */ + test_move_service_port, /* 11 */ + test_mktimer_notification_policy, /* 12 */ + test_reply_port_port_destroyed_notification_policy, /* 13 */ + test_reply_port_no_senders_notification_policy, /* 14 */ + test_reply_port_insert_right_disallowed, /* 15 */ + test_mach_task_self_send_movability, /* 16 */ + test_task_self_movable_send, /* 17 */ + 
test_move_newly_constructed_port_immovable_send, /* 18 */ + test_move_special_reply_port, /* 19 */ + test_reply_port_header_disposition, /* 20 */ + test_service_port_as_exception_port, /* 21 */ }; if (argc < 2) { diff --git a/tests/rm/coalition_info_resource_usage.c b/tests/rm/coalition_info_resource_usage.c new file mode 100644 index 000000000..bb13b2ccb --- /dev/null +++ b/tests/rm/coalition_info_resource_usage.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_NAMESPACE("xnu.rm"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("rm"), + T_META_OWNER("m_staveleytaylor")); + +static uint64_t +create_coalition(int type) +{ + uint64_t id = 0; + uint32_t flags = 0; + uint64_t param[2]; + int ret; + + COALITION_CREATE_FLAGS_SET_TYPE(flags, type); + ret = coalition_create(&id, flags); + T_ASSERT_POSIX_SUCCESS(ret, "coalition_create"); + T_QUIET; + T_ASSERT_GE(id, 0ULL, "coalition_create returned a valid id"); + + T_LOG("coalition has id %lld\n", id); + + /* disable notifications for this coalition so launchd doesn't freak out */ + param[0] = id; + param[1] = 0; + ret = sysctlbyname("kern.coalition_notify", NULL, NULL, param, sizeof(param)); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "kern.coalition_notify"); + + return id; +} + +static pid_t +spawn_helper_in_coalition(char *helper_name, uint64_t coal_id) +{ + int ret; + posix_spawnattr_t attr; + extern char **environ; + pid_t new_pid = 0; + char path[PATH_MAX]; + uint32_t path_size = sizeof(path); + + T_QUIET; + T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), + "_NSGetExecutablePath"); + char *args[] = {path, "-n", helper_name, NULL}; + + ret = posix_spawnattr_init(&attr); + T_QUIET; + T_ASSERT_POSIX_ZERO(ret, "posix_spawnattr_init"); + + T_QUIET; + T_ASSERT_POSIX_ZERO(ret, "posix_spawnattr_setcoalition_np"); + ret = posix_spawnattr_setcoalition_np(&attr, coal_id, + COALITION_TYPE_RESOURCE, + COALITION_TASKROLE_LEADER); + T_QUIET; + T_ASSERT_POSIX_ZERO(ret, "posix_spawnattr_setcoalition_np"); + + T_LOG("posix_spawn %s %s %s", args[0], args[1], args[2]); + ret = posix_spawn(&new_pid, path, 
NULL, &attr, args, environ); + T_QUIET; + T_ASSERT_POSIX_ZERO(ret, "posix_spawn"); + + ret = posix_spawnattr_destroy(&attr); + T_QUIET; + T_ASSERT_POSIX_ZERO(ret, "posix_spawnattr_destroy"); + return new_pid; +} + +T_HELPER_DECL(qos_expense, "qos_expense") +{ + mach_timebase_info_data_t tb_info; + mach_timebase_info(&tb_info); + + T_LOG("starting busy work in child"); + + uint64_t start_ns = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); + + /* Do 500ms of busy work to pad our QoS stats */ + while (true) { + uint64_t now_ns = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); + uint64_t diff_ms = (now_ns - start_ns) / (1000ULL * 1000ULL); + if (diff_ms > 500) { + break; + } + } + + T_PASS("finished busy work in child"); +} + +static uint64_t +get_qos_sum(uint64_t coalition_id) +{ + struct coalition_resource_usage cru; + int ret = coalition_info_resource_usage(coalition_id, &cru, sizeof(cru)); + T_ASSERT_POSIX_SUCCESS(ret, "coalition_info_resource_usage"); + + uint64_t sum = 0; + for (int i = 0; i < COALITION_NUM_THREAD_QOS_TYPES; i++) { + sum += cru.cpu_time_eqos[i]; + } + return sum; +} + +static uint64_t coalition_id; + +static void +terminate_and_reap_coalition(void) +{ + T_LOG("coalition_terminate"); coalition_terminate(coalition_id, 0); + T_LOG("coalition_reap"); coalition_reap(coalition_id, 0); +} + +T_DECL(coalition_info_resource_usage_qos_monotonic, + "Make sure CPU time QoS values are accumulated from dead tasks", + T_META_ASROOT(true), + T_META_SYSCTL_INT("kern.unrestrict_coalitions=1"), + T_META_TAG_VM_PREFERRED) +{ + T_SETUPBEGIN; + coalition_id = create_coalition(COALITION_TYPE_RESOURCE); + T_ATEND(terminate_and_reap_coalition); + T_SETUPEND; + + T_ASSERT_EQ_ULLONG(get_qos_sum(coalition_id), 0ULL, "cpu_time_eqos == 0"); + + pid_t child_pid = spawn_helper_in_coalition("qos_expense", coalition_id); + + T_LOG("waitpid(%d)\n", child_pid); + int stat; + int ret = waitpid(child_pid, &stat, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "waitpid"); + T_QUIET; T_ASSERT_TRUE(WIFEXITED(stat), "child exited."); + T_QUIET; T_ASSERT_EQ(WEXITSTATUS(stat), 0, "child exited cleanly."); + + T_ASSERT_GT_ULLONG(get_qos_sum(coalition_id), 0ULL, "cpu_time_eqos > 0"); +} diff --git a/tests/runaway_mitigation.c b/tests/runaway_mitigation.c new file mode 100644 index 000000000..9bb432c99 --- /dev/null +++ b/tests/runaway_mitigation.c @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* test that the header doesn't implicitly depend on others */ +#include +#include + +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include /* TODO: this should be installed for userspace */ +extern int ledger(int cmd, caddr_t arg1, caddr_t arg2, caddr_t arg3); + +#include +extern int __microstackshot(char *tracebuf, uint32_t tracebuf_size, uint32_t flags); + + +T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("scheduler"), + T_META_OWNER("chimene"), + T_META_RUN_CONCURRENTLY(false), /* because of messing with global SFI */ + T_META_ASROOT(true), /* for TASK_POLICY_STATE, and setting SFI */ + T_META_TAG_VM_PREFERRED); + +static void +check_is_bg(bool wants_bg) +{ + kern_return_t kr; + struct task_policy_state policy_state; + + mach_msg_type_number_t count = TASK_POLICY_STATE_COUNT; + boolean_t get_default = FALSE; + + kr = task_policy_get(mach_task_self(), TASK_POLICY_STATE, + (task_policy_t)&policy_state, &count, &get_default); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_policy_get(TASK_POLICY_STATE)"); + + /* + * A test reporting type=APPLICATION should have the live donor bit set. + * If this fails, the test may have been launched as a daemon instead. + */ + T_QUIET; T_ASSERT_BITS_SET(policy_state.flags, TASK_IMP_LIVE_DONOR, "test should be live donor enabled"); + + /* + * The BG bit is updated via task_policy_update_internal_locked, + * checking this proves that the first phase update ran on this task. + */ + if (wants_bg) { + T_ASSERT_BITS_SET(policy_state.effective, POLICY_EFF_DARWIN_BG, "%d: is BG", getpid()); + } else { + T_ASSERT_BITS_NOTSET(policy_state.effective, POLICY_EFF_DARWIN_BG, "%d: is not BG", getpid()); + } + + /* + * The live donor bit is updated via task_policy_update_complete_unlocked, + * checking this proves that the second phase update ran on this task. 
+ */ + if (wants_bg) { + T_ASSERT_BITS_NOTSET(policy_state.flags, TASK_IMP_DONOR, "%d: is not live donor", getpid()); + } else { + T_ASSERT_BITS_SET(policy_state.flags, TASK_IMP_DONOR, "%d: is live donor", getpid()); + } +} + +static void +check_runaway_mode(bool expected_mode) +{ + int runaway_mode = getpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0); + + T_QUIET; + T_ASSERT_POSIX_SUCCESS(runaway_mode, "getpriority(PRIO_DARWIN_RUNAWAY_MITIGATION)"); + + T_LOG("pid %d: runaway mitigation mode is: %d", getpid(), runaway_mode); + + if (expected_mode) { + T_QUIET; + T_ASSERT_EQ(runaway_mode, PRIO_DARWIN_RUNAWAY_MITIGATION_ON, "should be on"); + check_is_bg(true); + } else { + T_QUIET; + T_ASSERT_EQ(runaway_mode, PRIO_DARWIN_RUNAWAY_MITIGATION_OFF, "should be off"); + check_is_bg(false); + } +} + +T_DECL(entitled_runaway_mode, "runaway mitigation mode should be settable while entitled") +{ + T_LOG("uid: %d", getuid()); + + check_runaway_mode(false); + + T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_ON), + "setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_ON)"); + + check_runaway_mode(true); + + T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_OFF), + "setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_OFF)"); + + check_runaway_mode(false); +} + +T_DECL(entitled_runaway_mode_read_root, "runaway mitigation mode should be readable as root", + T_META_ASROOT(true)) +{ + T_LOG("uid: %d", getuid()); + + check_runaway_mode(false); +} + +T_DECL(entitled_runaway_mode_read_notroot, "runaway mitigation mode should be readable as not root but entitled", + T_META_ASROOT(false)) +{ + T_LOG("uid: %d", getuid()); + + int runaway_mode = getpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, getpid()); + + T_QUIET; + T_ASSERT_POSIX_SUCCESS(runaway_mode, "getpriority(PRIO_DARWIN_RUNAWAY_MITIGATION)"); + + T_ASSERT_EQ(runaway_mode, PRIO_DARWIN_RUNAWAY_MITIGATION_OFF, "should be off"); +} + +T_DECL(runaway_mode_child_exit, "runaway mitigation mode should disappear when child exits") +{ + T_LOG("uid: %d", getuid()); + + check_runaway_mode(false); + + T_LOG("Spawning child"); + + pid_t child_pid = fork(); + + if (child_pid == 0) { + /* child process */ + + check_runaway_mode(false); + + T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_ON), + "setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_ON)"); + + check_runaway_mode(true); + + T_LOG("Exit pid %d with runaway mitigation mode on", getpid()); + + exit(0); + } else { + T_ASSERT_POSIX_SUCCESS(child_pid, "fork, pid %d", child_pid); + + /* wait for child process to exit */ + int exit_status = 0, signum = 0; + + T_ASSERT_TRUE(dt_waitpid(child_pid, &exit_status, &signum, 5), + "wait for child (%d) complete", child_pid); + + T_QUIET; T_ASSERT_EQ(exit_status, 0, "dt_waitpid: exit_status"); + T_QUIET; T_ASSERT_EQ(signum, 0, "dt_waitpid: signum"); + } + + check_runaway_mode(false); +} + +T_DECL(runaway_mode_child_set, "runaway mitigation mode should be settable on child pid") +{ + T_LOG("uid: %d", getuid()); + + check_runaway_mode(false); + + int fd[2]; + + T_QUIET; T_ASSERT_POSIX_SUCCESS(pipe(fd), "pipe()"); + + T_LOG("Spawning child"); + + pid_t child_pid = fork(); + + if (child_pid == 0) { + char buf[10]; + + /* child process */ + T_ASSERT_POSIX_SUCCESS(child_pid, "fork, in child with pid %d", getpid()); + + T_ASSERT_POSIX_SUCCESS(close(fd[1]), 
"close(fd[1])"); + + T_ASSERT_POSIX_SUCCESS(read(fd[0], buf, sizeof(buf)), "read(fd[0], buf, sizeof(buf)"); + + T_ASSERT_POSIX_SUCCESS(close(fd[0]), "close(fd[0])"); + + check_runaway_mode(true); + + T_LOG("Exit pid %d with runaway mitigation mode on", getpid()); + + exit(0); + } else { + T_ASSERT_POSIX_SUCCESS(child_pid, "fork parent: child pid %d", child_pid); + + T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, child_pid, PRIO_DARWIN_RUNAWAY_MITIGATION_ON), + "setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, child_pid, PRIO_DARWIN_RUNAWAY_MITIGATION_ON)"); + + int runaway_mode = getpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, child_pid); + + T_QUIET; + T_ASSERT_POSIX_SUCCESS(runaway_mode, "getpriority(PRIO_DARWIN_RUNAWAY_MITIGATION)"); + + T_ASSERT_EQ(runaway_mode, PRIO_DARWIN_RUNAWAY_MITIGATION_ON, "should be on"); + + T_QUIET; T_LOG("Signalling child to continue"); + T_ASSERT_POSIX_SUCCESS(close(fd[1]), "close(fd[1])"); + + /* wait for child process to exit */ + int exit_status = 0, signum = 0; + + T_ASSERT_TRUE(dt_waitpid(child_pid, &exit_status, &signum, 5), + "wait for child (%d) complete", child_pid); + + T_QUIET; T_ASSERT_EQ(exit_status, 0, "dt_waitpid: exit_status"); + T_QUIET; T_ASSERT_EQ(signum, 0, "dt_waitpid: signum"); + } + + check_runaway_mode(false); +} + + +/* + * TODO: This should be in a test utils library, + * but it requires including Kernel.framework header kern/ledger.h, which is Bad + */ +static size_t +ledger_index_for_string(size_t *num_entries, char* string) +{ + struct ledger_info li; + struct ledger_template_info *templateInfo = NULL; + int ret; + size_t i, footprint_index; + bool found = false; + + ret = ledger(LEDGER_INFO, (caddr_t)(uintptr_t)getpid(), (caddr_t)&li, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "ledger(LEDGER_INFO)"); + + T_QUIET; T_ASSERT_GT(li.li_entries, (int64_t) 0, "num ledger entries is valid"); + *num_entries = (size_t) li.li_entries; + templateInfo = malloc((size_t)li.li_entries * sizeof(struct ledger_template_info)); + T_QUIET; T_ASSERT_NOTNULL(templateInfo, "malloc entries"); + + footprint_index = 0; + ret = ledger(LEDGER_TEMPLATE_INFO, (caddr_t) templateInfo, (caddr_t) num_entries, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "ledger(LEDGER_TEMPLATE_INFO)"); + for (i = 0; i < *num_entries; i++) { + if (strcmp(templateInfo[i].lti_name, string) == 0) { + footprint_index = i; + found = true; + } + } + free(templateInfo); + T_QUIET; T_ASSERT_TRUE(found, "found %s in ledger", string); + return footprint_index; +} + +/* + * sadly there's no 'get just this one ledger index' syscall, + * we have to read all ledgers and filter for the one we want + */ +static int64_t +get_ledger_entry_for_pid(pid_t pid, size_t index, size_t num_entries) +{ + int ret; + int64_t value; + struct ledger_entry_info *lei = NULL; + + lei = malloc(num_entries * sizeof(*lei)); + ret = ledger(LEDGER_ENTRY_INFO, (caddr_t) (uintptr_t) pid, (caddr_t) lei, (caddr_t) &num_entries); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "ledger(LEDGER_ENTRY_INFO)"); + value = lei[index].lei_balance; + free(lei); + return value; +} + + +uint64_t initial_sfi_window = 0, initial_class_offtime = 0; + +static void +restore_sfi_state(void) +{ + T_LOG("Restoring initial system SFI window %lld, SFI_CLASS_RUNAWAY_MITIGATION class offtime %lld", + initial_sfi_window, initial_class_offtime); + + /* + * Setting window will fail if there is a larger offtime set, and + * setting class will fail if the window is smaller. 
+ * To avoid this, disable the window, configure new values, then finally + * re-enable the window. + */ + + T_QUIET; T_ASSERT_POSIX_SUCCESS(system_set_sfi_window(0), + "system_set_sfi_window(0)"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(sfi_set_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, initial_class_offtime), + "system_set_sfi_window(%lld)", initial_class_offtime); + T_QUIET; T_ASSERT_POSIX_SUCCESS(system_set_sfi_window(initial_sfi_window), + "system_set_sfi_window(%lld)", initial_sfi_window); +} + +const int spin_seconds = 1; + + +static void * +spin_thread(void *arg) +{ + static mach_timebase_info_data_t timebase_info; + mach_timebase_info(&timebase_info); + + uint64_t duration = spin_seconds * NSEC_PER_SEC * timebase_info.denom / timebase_info.numer; + uint64_t deadline = mach_absolute_time() + duration; + + while (mach_absolute_time() < deadline) { + ; + } + + return NULL; +} + +T_DECL(runaway_mode_child_sfi, "runaway mitigation mode should cause SFI") +{ + T_LOG("uid: %d", getuid()); + + check_runaway_mode(false); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(system_get_sfi_window(&initial_sfi_window), + "system_get_sfi_window(&initial_sfi_window)"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(sfi_get_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, &initial_class_offtime), + "sfi_get_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, &initial_class_offtime)"); + + T_LOG("Initial System SFI window %lld, SFI_CLASS_RUNAWAY_MITIGATION class offtime %lld\n", initial_sfi_window, initial_class_offtime); + + size_t num_ledger_entries = 0; + size_t ledger_index = ledger_index_for_string(&num_ledger_entries, "SFI_CLASS_RUNAWAY_MITIGATION"); + uint64_t sfi_time_before = get_ledger_entry_for_pid(getpid(), ledger_index, num_ledger_entries); + + T_LOG("SFI_CLASS_RUNAWAY_MITIGATION ledger index: %zu out of %zu\n", ledger_index, num_ledger_entries); + + T_LOG("Initial accumulated SFI time: %lld\n", sfi_time_before); + + T_ATEND(restore_sfi_state); + + uint64_t custom_sfi_window = 100000; /* microseconds */ + uint64_t custom_class_offtime = 50000; + + T_LOG("Setting custom system SFI window %lld, SFI_CLASS_RUNAWAY_MITIGATION class offtime %lld", + custom_sfi_window, custom_class_offtime); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(system_set_sfi_window(0), + "system_set_sfi_window(0)"); + T_ASSERT_POSIX_SUCCESS(sfi_set_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, custom_class_offtime), + "sfi_set_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, %lld)", custom_class_offtime); + T_ASSERT_POSIX_SUCCESS(system_set_sfi_window(custom_sfi_window), + "system_set_sfi_window(%lld)", custom_sfi_window); + + pthread_t thread; + + T_LOG("Spawning thread to spin for %d seconds\n", spin_seconds); + + int rv = pthread_create(&thread, NULL, spin_thread, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create"); + + T_LOG("Enable mitigation mode\n"); + + T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_ON), + "setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_ON)"); + + check_runaway_mode(true); + + T_LOG("Wait %d seconds for spin to finish\n", spin_seconds); + + rv = pthread_join(thread, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_join"); + + T_LOG("Thread joined, disable mitigation mode\n"); + + T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_OFF), + "setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_OFF)"); + + uint64_t sfi_time_after = get_ledger_entry_for_pid(getpid(), ledger_index, 
num_ledger_entries); + + T_LOG("Ending accumulated SFI time: %lld\n", sfi_time_after); + + T_ASSERT_LT(sfi_time_before, sfi_time_after, "SFI_CLASS_RUNAWAY_MITIGATION SFI time must have increased"); + + check_runaway_mode(false); + + uint64_t final_sfi_window = 0, final_class_offtime = 0; + + T_QUIET; T_ASSERT_POSIX_SUCCESS(system_get_sfi_window(&final_sfi_window), + "system_get_sfi_window(&final_sfi_window)"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(sfi_get_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, &final_class_offtime), + "sfi_get_class_offtime(SFI_CLASS_RUNAWAY_MITIGATION, &final_class_offtime)"); + + /* + * If the System SFI configuration was changed out from under us during the test, either us or them will be confused. + */ + T_QUIET; T_ASSERT_EQ(custom_sfi_window, final_sfi_window, "System SFI window should not unexpectedly change during the test"); + T_QUIET; T_ASSERT_EQ(custom_class_offtime, final_class_offtime, "System SFI offtime should not unexpectedly change during the test"); +} + +#if defined(__arm64__) + +static bool found_flag = false; +static bool found_self = false; + +static const size_t microstackshot_buf_size = 16 * 1024; + +static bool +search_for_self_microstackshot(bool log_details) +{ + void *buf = calloc(microstackshot_buf_size, 1); + T_QUIET; T_ASSERT_NOTNULL(buf, "allocate buffer"); + + int ret = __microstackshot(buf, microstackshot_buf_size, STACKSHOT_GET_MICROSTACKSHOT); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "microstackshot"); + + if (!log_details) { + T_QUIET; + } + T_EXPECT_EQ(*(uint32_t *)buf, + (uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC, + "magic value for microstackshot matches"); + + uint32_t magic = STACKSHOT_TASK_SNAPSHOT_MAGIC; + + void* next_tsnap = memmem(buf, microstackshot_buf_size, &magic, sizeof(magic)); + + void* buf_end = buf + microstackshot_buf_size; + + while (next_tsnap != NULL && next_tsnap + sizeof(struct task_snapshot) < buf_end) { + struct task_snapshot *tsnap = (struct task_snapshot *)next_tsnap; + unsigned int offset = next_tsnap - buf; + + if (log_details) { + T_LOG("%6d: found snap pid %d name %s\n", offset, tsnap->pid, (char*)&tsnap->p_comm); + } + + if (tsnap->pid == getpid()) { + if (log_details) { + T_LOG("%6d: found self snap: flags 0x%x 0x%llx\n", offset, tsnap->ss_flags, tsnap->disk_reads_count); + } + found_self = true; + + if (tsnap->disk_reads_count & kTaskRunawayMitigation) { + T_LOG("%6d: found runaway flag: pid %d, name %s, flags: 0x%x 0x%llx, \n", + offset, tsnap->pid, (char*)&tsnap->p_comm, tsnap->ss_flags, tsnap->disk_reads_count); + found_flag = true; + } + } + + void* search_start = next_tsnap + sizeof(struct task_snapshot); + size_t remaining_size = buf_end - search_start; + next_tsnap = memmem(search_start, remaining_size, &magic, sizeof(magic)); + } + + free(buf); + + return found_flag; +} + +T_DECL(runaway_mode_microstackshot_flag, + "check that mitigated processes show up in microstackshot", + T_META_REQUIRES_SYSCTL_EQ("kern.monotonic.supported", 1), + T_META_TAG_VM_NOT_ELIGIBLE, T_META_TIMEOUT(120)) +{ + unsigned int pmi_counter; + size_t sysctl_size = sizeof(pmi_counter); + int ret = sysctlbyname( + "kern.microstackshot.pmi_sample_counter", + &pmi_counter, &sysctl_size, NULL, 0); + if (ret == -1 && errno == ENOENT) { + T_SKIP("no PMI support"); + } else { + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "query PMI counter"); + } + uint64_t pmi_period; + sysctl_size = sizeof(pmi_period); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(sysctlbyname( + "kern.microstackshot.pmi_sample_period", + &pmi_period, &sysctl_size, NULL, 0), + 
"query PMI period"); + + T_LOG("PMI counter: %u", pmi_counter); + T_LOG("PMI period: %llu", pmi_period); + + if (pmi_period == 0) { + T_SKIP("PMI microstackshots not enabled"); + } + + T_LOG("Enable mitigation mode on self\n"); + + T_EXPECT_POSIX_SUCCESS(setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, + 0, PRIO_DARWIN_RUNAWAY_MITIGATION_ON), + "setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_ON)"); + + uint32_t iterations = 100; + + /* Over-spin to make it likely we get sampled at least once before failing */ + uint32_t multiplier = 10; + uint64_t target_cycles = multiplier * pmi_period; + + T_LOG("Spinning for %d iterations or %lld*%d cycles or until self-sample is found\n", + iterations, pmi_period, multiplier); + + struct rusage_info_v6 ru = {}; + + for (int i = 0; i < iterations; i++) { + spin_thread(NULL); + + int rv = proc_pid_rusage(getpid(), RUSAGE_INFO_V6, (rusage_info_t *)&ru); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "proc_pid_rusage"); + + T_LOG("iteration %3d: %14lld / %14lld cycles executed (%.2f%%)\n", i, + ru.ri_cycles, target_cycles, + ((double)ru.ri_cycles) * 100.0 / (double)target_cycles); + + T_QUIET; T_ASSERT_NE(ru.ri_cycles, (uint64_t)0, + "should be able to measure cycles with proc_pid_rusage"); + + bool found = search_for_self_microstackshot(false); + if (ru.ri_cycles > target_cycles || found) { + break; + } + } + + T_LOG("Complete, executed %lld cycles. Disable mitigation mode.\n", ru.ri_cycles); + + T_EXPECT_POSIX_SUCCESS(setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, + 0, PRIO_DARWIN_RUNAWAY_MITIGATION_OFF), + "setpriority(PRIO_DARWIN_RUNAWAY_MITIGATION, 0, PRIO_DARWIN_RUNAWAY_MITIGATION_OFF)"); + + search_for_self_microstackshot(true); + + T_EXPECT_EQ(found_self, true, + "Should have found self in microstackshot buffer"); + T_EXPECT_EQ(found_flag, true, + "Should have found kTaskRunawayMitigation flag in microstackshot buffer"); +} +#endif // defined(__arm64__) diff --git a/tests/runaway_mitigation.entitlements b/tests/runaway_mitigation.entitlements new file mode 100644 index 000000000..ae96eb026 --- /dev/null +++ b/tests/runaway_mitigation.entitlements @@ -0,0 +1,10 @@ + + + + + com.apple.private.runaway-mitigation + + com.apple.private.kernel.selective-forced-idle + + + diff --git a/tests/sched/Makefile b/tests/sched/Makefile index 780e5036b..b168b92fc 100644 --- a/tests/sched/Makefile +++ b/tests/sched/Makefile @@ -12,6 +12,7 @@ sched/all_cores_running: $(SCHED_UTILS) SCHED_TARGETS += sched/all_cores_running +sched/cluster_bound_threads: OTHER_CFLAGS += -Wno-int-to-void-pointer-cast sched/cluster_bound_threads: OTHER_LDFLAGS += $(SCHED_UTILS_FLAGS) sched/cluster_bound_threads: $(SCHED_UTILS) SCHED_TARGETS += sched/cluster_bound_threads @@ -53,14 +54,24 @@ sched/thread_group_flags_workload_config.h: sched/thread_group_flags_workload_co xxd -i $< > $@ SCHED_TARGETS += sched/thread_group_flags +sched/setitimer: OTHER_LDFLAGS += $(SCHED_UTILS_FLAGS) -framework perfdata +sched/setitimer: $(SCHED_UTILS) +SCHED_TARGETS += sched/setitimer + sched/yield_aggressor: OTHER_CFLAGS += -Wno-atomic-implicit-seq-cst sched/yield_aggressor: OTHER_LDFLAGS += -framework perfdata $(SCHED_UTILS_FLAGS) sched/yield_aggressor: $(SCHED_UTILS) SCHED_TARGETS += sched/yield_aggressor sched/zero_to_n_tests: OTHER_LDFLAGS += -framework perfdata $(SCHED_UTILS_FLAGS) +sched/zero_to_n_tests: $(SCHED_UTILS) SCHED_TARGETS += sched/zero_to_n_tests +sched/rttimer: CODE_SIGN_ENTITLEMENTS = sched/rttimer.entitlements +sched/rttimer: OTHER_LDFLAGS += $(SCHED_UTILS_FLAGS) 
+sched/rttimer: $(SCHED_UTILS) +SCHED_TARGETS += sched/rttimer + # Convenience command for building all of the test targets under sched/ .PHONY: sched/all sched/all: $(SCHED_TARGETS) diff --git a/tests/sched/all_cores_running.c b/tests/sched/all_cores_running.c index 4d613a174..3eeb042a1 100644 --- a/tests/sched/all_cores_running.c +++ b/tests/sched/all_cores_running.c @@ -190,7 +190,7 @@ T_DECL(all_cores_running, * Now after we have logged all of the relevant information, enforce that each * of the cores was recommended and had test threads scheduled on it. */ - T_ASSERT_EQ((unsigned int)__builtin_popcountll(final_visited_cores_bitmask), cpu_count, "Each core ran at least one of the test threads"); + T_ASSERT_EQ((unsigned int)__builtin_popcountll(final_visited_cores_bitmask), cpu_count, "[%s] Each core ran at least one of the test threads", platform_train_descriptor()); for (unsigned int i = 0; i < cpu_count; i++) { T_QUIET; T_ASSERT_GT(non_idle_ticks[i], 0, "One or more cores were idle during the work period"); } @@ -237,5 +237,5 @@ T_DECL(recommended_cores_mask, } } - T_ASSERT_EQ(passed_test, 1, "kern.sched_recommended_cores reflects that all expected cores are recommended"); + T_ASSERT_EQ(passed_test, 1, "[%s] kern.sched_recommended_cores reflects that all expected cores are recommended", platform_train_descriptor()); } diff --git a/tests/sched/cluster_bound_threads.c b/tests/sched/cluster_bound_threads.c index 3b49696e5..2cec73990 100644 --- a/tests/sched/cluster_bound_threads.c +++ b/tests/sched/cluster_bound_threads.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -146,19 +145,15 @@ T_DECL(cluster_soft_binding, trace_handle_t trace = begin_collect_trace(argc, argv, "cluster_soft_binding"); T_SETUPEND; - for (int p = 0; p < 2; p++) { + for (unsigned int p = 0; p < platform_nperflevels(); p++) { /* Ensure all cores recommended */ char *restore_dynamic_control_args[] = {"-d", NULL}; execute_clpcctrl(restore_dynamic_control_args, false); bool all_cores_recommended = check_recommended_core_mask(NULL); T_QUIET; T_EXPECT_TRUE(all_cores_recommended, "Not all cores are recommended for scheduling"); - void *arg; - if (p == 0) { - arg = (void *)'P'; - } else { - arg = (void *)'E'; - } + char perflevel_char = platform_perflevel_name(p)[0]; + void *arg = (void *)perflevel_char; pthread_t bound_thread; create_thread(&bound_thread, NULL, spin_bound_thread, arg); sleep(1); @@ -171,12 +166,8 @@ T_DECL(cluster_soft_binding, "%c-bound thread didn't run at least %f of %d seconds", (char)arg, runtime_threshold, observe_seconds); /* Derecommend the bound cluster type */ - char *derecommend_args[] = {"-C", "X", NULL}; - if (p == 0) { - derecommend_args[1] = "e"; - } else { - derecommend_args[1] = "p"; - } + char perflevel_arg[2] = {perflevel_char, '\0'}; + char *derecommend_args[] = {"-C", perflevel_arg, NULL}; execute_clpcctrl(derecommend_args, false); check_recommended_core_mask(NULL); sleep(1); @@ -211,7 +202,7 @@ spin_cluster_binding(void *) if (running_on_cluster != bind_cluster) { T_LOG("Failed on iteration %d", t); /* Mark this failure in the recorded trace */ - kdebug_trace(ARIADNEDBG_CODE(0, 0), (uint64_t)t, (uint64_t)bind_cluster, (uint64_t)running_on_cluster, 0); + sched_kdebug_test_fail(t, bind_cluster, running_on_cluster, 0); } } } diff --git a/tests/sched/clutch_runqueue.c b/tests/sched/clutch_runqueue.c index 3c14b9352..108db364c 100644 --- a/tests/sched/clutch_runqueue.c +++ b/tests/sched/clutch_runqueue.c @@ -410,8 +410,8 @@ 
SCHED_POLICY_T_DECL(runq_tracepoint_thread_select, /* Test the cluster_id field */ test_thread_t bound_thread = create_thread(TH_BUCKET_SHARE_DF, same_tg, root_bucket_to_highest_pri[TH_BUCKET_SHARE_DF]); set_thread_cluster_bound(bound_thread, 1); - enqueue_thread(cluster_target(1), bound_thread); - ret = dequeue_thread_expect(cluster_target(1), bound_thread); + enqueue_thread(pset_target(1), bound_thread); + ret = dequeue_thread_expect(pset_target(1), bound_thread); T_QUIET; T_ASSERT_TRUE(ret, "Dequeue single thread on cluster 1"); root_bucket_arg = SELECTION_WAS_EDF | CTS_VERSION | SELECTION_WAS_CLUSTER_BOUND | CLUSTER_ID(1); ret = tracepoint_expect(CLUTCH_THREAD_SELECT, 10, 0, TH_BUCKET_SHARE_DF, root_bucket_arg); diff --git a/tests/sched/edge_migration.c b/tests/sched/edge_migration.c index d6537367c..83ee0634c 100644 --- a/tests/sched/edge_migration.c +++ b/tests/sched/edge_migration.c @@ -26,7 +26,7 @@ SCHED_POLICY_T_DECL(migration_cluster_bound, set_pset_load_avg(i, TH_BUCKET_SHARE_DF, low_load); } for (int i = 0; i < dual_die.num_psets; i++) { - set_current_processor(cluster_id_to_cpu_id(i)); + set_current_processor(pset_id_to_cpu_id(i)); for (int j = 0; j < dual_die.num_psets; j++) { /* Add extra load to the bound cluster, so we're definitely not just idle short-circuiting */ set_pset_load_avg(j, TH_BUCKET_SHARE_DF, high_load); @@ -69,25 +69,125 @@ SCHED_POLICY_T_DECL(migration_should_yield, cpu_set_thread_current(0, yielder); ret = cpu_check_should_yield(0, false); T_QUIET; T_EXPECT_TRUE(ret, "No thread present to yield to"); - enqueue_thread(cluster_target(0), background); + enqueue_thread(pset_target(0), background); ret = cpu_check_should_yield(0, true); T_QUIET; T_EXPECT_TRUE(ret, "Should yield to a low priority thread on the current runqueue"); SCHED_POLICY_PASS("Basic yield behavior on single pset"); - ret = dequeue_thread_expect(cluster_target(0), background); + ret = dequeue_thread_expect(pset_target(0), background); T_QUIET; T_EXPECT_TRUE(ret, "Only background thread in runqueue"); cpu_set_thread_current(0, yielder); /* Reset current thread */ - enqueue_thread(cluster_target(1), background); + enqueue_thread(pset_target(1), background); ret = cpu_check_should_yield(0, true); T_QUIET; T_EXPECT_TRUE(ret, "Should yield in order to steal thread"); - ret = dequeue_thread_expect(cluster_target(1), background); + ret = dequeue_thread_expect(pset_target(1), background); T_QUIET; T_EXPECT_TRUE(ret, "Only background thread in runqueue"); - cpu_set_thread_current(cluster_id_to_cpu_id(1), background); - ret = cpu_check_should_yield(cluster_id_to_cpu_id(1), false); + cpu_set_thread_current(pset_id_to_cpu_id(1), background); + ret = cpu_check_should_yield(pset_id_to_cpu_id(1), false); T_QUIET; T_EXPECT_TRUE(ret, "Should not yield in order to rebalance (presumed) native thread"); SCHED_POLICY_PASS("Thread yields in order to steal from other psets"); } +SCHED_POLICY_T_DECL(migration_stir_the_pot_basic, + "Verify stir-the-pot succeeds in rotating threads across P and E-cores after " + "their respective quanta have expired") +{ + int ret; + init_migration_harness(basic_amp); + struct thread_group *tg = create_tg(0); + test_thread_t starts_p = create_thread(TH_BUCKET_SHARE_DF, tg, root_bucket_to_highest_pri[TH_BUCKET_SHARE_DF]); + test_thread_t starts_e = create_thread(TH_BUCKET_SHARE_DF, tg, root_bucket_to_highest_pri[TH_BUCKET_SHARE_DF]); + int p_cpu = 0; + int e_cpu = 2; + int other_e_cpu = 3; + int other_p_cpu = 1; + cpu_set_thread_current(p_cpu, starts_p); + cpu_set_thread_current(e_cpu,
starts_e); + int p_pset = 0; + set_pset_load_avg(p_pset, TH_BUCKET_SHARE_DF, 10000000); + int e_pset = 1; + + /* Thread on low core type "pays its dues" */ + cpu_expire_quantum(e_cpu); + + /* Thread on high core type should locate swap candidate */ + cpu_expire_quantum(p_cpu); + ret = ipi_expect(e_cpu, TEST_IPI_IMMEDIATE); + T_QUIET; T_EXPECT_TRUE(ret, "Should have found stir-the-pot candidate with expired quantum"); + + /* Thread on low core type should respond to IPI by preempting... */ + ret = thread_avoid_processor_expect(starts_e, e_cpu, false, true); + T_QUIET; T_EXPECT_TRUE(ret, "Thread should preempt to get on P-core"); + + /* (Simulate as if we are switching to another quantum-expired thread) */ + test_thread_t other_expired_thread = create_thread(TH_BUCKET_SHARE_DF, tg, root_bucket_to_highest_pri[TH_BUCKET_SHARE_DF]); + cpu_set_thread_current(other_e_cpu, other_expired_thread); + cpu_expire_quantum(other_e_cpu); + cpu_clear_thread_current(other_e_cpu); + cpu_set_thread_current(e_cpu, other_expired_thread); + + /* ...and choosing the corresponding P-core for swap */ + ret = choose_pset_for_thread_expect(starts_e, p_pset); + T_QUIET; T_EXPECT_TRUE(ret, "Should choose P-cores despite no idle cores there"); + + /* Upon arrival, thread swapping in should preempt its predecessor */ + enqueue_thread(pset_target(p_pset), starts_e); + ret = cpu_check_preempt_current(p_cpu, true); + T_QUIET; T_EXPECT_TRUE(ret, "P-core should preempt quantum expired thread"); + + /* ...and preempted thread on P-core should spill down to E, completing the swap */ + ret = dequeue_thread_expect(pset_target(p_pset), starts_e); + T_QUIET; T_ASSERT_TRUE(ret, "e_starts was enqueued on P"); + cpu_set_thread_current(p_cpu, starts_e); + ret = choose_pset_for_thread_expect(starts_p, e_pset); + T_QUIET; T_EXPECT_TRUE(ret, "p_starts spilled to E, completing swap"); + + /* + * And a second swap should be initiated for the other E-expired thread + * that switched on-core afterwards. 
+ */ + test_thread_t other_p_thread = create_thread(TH_BUCKET_SHARE_DF, tg, root_bucket_to_highest_pri[TH_BUCKET_SHARE_DF]); + cpu_set_thread_current(other_p_cpu, other_p_thread); + cpu_expire_quantum(other_p_cpu); + ret = ipi_expect(e_cpu, TEST_IPI_IMMEDIATE); + T_QUIET; T_EXPECT_TRUE(ret, "Should have found stir-the-pot candidate with expired quantum"); + + SCHED_POLICY_PASS("Stir-the-pot successfully initiated by P-core and completed"); + + /* Clean-up and reset to initial conditions */ + cpu_set_thread_current(p_cpu, starts_p); + cpu_set_thread_current(e_cpu, starts_e); + cpu_set_thread_current(other_p_cpu, other_p_thread); + cpu_set_thread_current(other_e_cpu, other_expired_thread); + + /* Now P-core expires quantum first */ + cpu_expire_quantum(p_cpu); + + /* Thread on E-core "pays its dues" and responds to self-message by preempting */ + cpu_expire_quantum(e_cpu); + ret = thread_avoid_processor_expect(starts_e, e_cpu, false, true); + T_QUIET; T_EXPECT_TRUE(ret, "Thread should preempt to get on P-core"); + + /* ...and choosing the corresponding P-core for swap */ + cpu_clear_thread_current(e_cpu); + ret = choose_pset_for_thread_expect(starts_e, p_pset); + T_QUIET; T_EXPECT_TRUE(ret, "Should choose P-cores despite no idle cores there"); + + /* Upon arrival, thread swapping in should preempt its predecessor */ + enqueue_thread(pset_target(p_pset), starts_e); + ret = cpu_check_preempt_current(p_cpu, true); + T_QUIET; T_EXPECT_TRUE(ret, "P-core should preempt quantum expired thread"); + + /* ...and preempted thread on P-core should spill down to E, completing the swap */ + ret = dequeue_thread_expect(pset_target(p_pset), starts_e); + T_QUIET; T_ASSERT_TRUE(ret, "e_starts was enqueued on P"); + cpu_set_thread_current(p_cpu, starts_e); + ret = choose_pset_for_thread_expect(starts_p, e_pset); + T_QUIET; T_EXPECT_TRUE(ret, "p_starts spilled to E, completing swap"); + + SCHED_POLICY_PASS("Stir-the-pot successfully initiated by E-core and completed"); +} + SCHED_POLICY_T_DECL(migration_ipi_policy, "Verify we send the right type of IPI in different cross-core preemption scenarios") { @@ -163,3 +263,240 @@ SCHED_POLICY_T_DECL(migration_max_parallelism, } SCHED_POLICY_PASS("Correct recommended parallel width for all configurations"); } + +SCHED_POLICY_T_DECL(migration_rebalance_basic, "Verify that basic rebalance steal and " + "running rebalance mechanisms kick in") +{ + int ret; + test_hw_topology_t topo = SCHED_POLICY_DEFAULT_TOPO; + init_migration_harness(topo); + int sched_bucket = TH_BUCKET_SHARE_DF; + struct thread_group *tg = create_tg(0); + thread_t thread = create_thread(sched_bucket, tg, root_bucket_to_highest_pri[sched_bucket]); + + for (int preferred_pset_id = 0; preferred_pset_id < topo.num_psets; preferred_pset_id++) { + set_tg_sched_bucket_preferred_pset(tg, sched_bucket, preferred_pset_id); + sched_policy_push_metadata("preferred_pset_id", preferred_pset_id); + for (int running_on_pset_id = 0; running_on_pset_id < topo.num_psets; running_on_pset_id++) { + /* Running rebalance */ + int running_on_cpu = pset_id_to_cpu_id(running_on_pset_id); + cpu_set_thread_current(running_on_cpu, thread); + sched_policy_push_metadata("running_on_pset_id", running_on_pset_id); + for (int c = 0; c < topo.total_cpus; c++) { + sched_policy_push_metadata("evaluate_cpu", c); + int evaluate_pset = cpu_id_to_pset_id(c); + bool want_rebalance = cpu_processor_balance(c); + if (evaluate_pset == running_on_pset_id) { + T_QUIET; T_EXPECT_FALSE(want_rebalance, "should be no thread available for rebalance %s", + 
sched_policy_dump_metadata()); + sched_policy_pop_metadata(); + continue; + } + bool should_rebalance = (topo.psets[evaluate_pset].cpu_type == topo.psets[preferred_pset_id].cpu_type) && + (topo.psets[running_on_pset_id].cpu_type != topo.psets[preferred_pset_id].cpu_type); + T_QUIET; T_EXPECT_EQ(want_rebalance, should_rebalance, "should rebalance to move thread to preferred type " + "if not there already %s", sched_policy_dump_metadata()); + if (should_rebalance) { + ret = thread_avoid_processor_expect(thread, running_on_cpu, false, true); + T_QUIET; T_EXPECT_TRUE(ret, "thread will preempt in response to running rebalance IPI %s", + sched_policy_dump_metadata()); + /* Try loading all other cores of the preferred type, forcing this decision to find the idle one */ + for (int p = 0; p < topo.num_psets; p++) { + if ((topo.psets[p].cpu_type == topo.psets[preferred_pset_id].cpu_type) && + (p != evaluate_pset)) { + set_pset_load_avg(p, sched_bucket, 10000000); + } + } + ret = thread_avoid_processor_expect(thread, running_on_cpu, false, true); + T_QUIET; T_EXPECT_TRUE(ret, "...even if all other cores (except rebalancer) are full %s", + sched_policy_dump_metadata()); + /* Unload cores for clean-up */ + for (int p = 0; p < topo.num_psets; p++) { + if ((topo.psets[p].cpu_type == topo.psets[preferred_pset_id].cpu_type) && + (p != evaluate_pset)) { + set_pset_load_avg(p, sched_bucket, 0); + } + } + } + sched_policy_pop_metadata(); + } + cpu_clear_thread_current(running_on_cpu); + sched_policy_pop_metadata(); + + /* Rebalance steal */ + int enqueued_pset = running_on_pset_id; + enqueue_thread(pset_target(enqueued_pset), thread); + sched_policy_push_metadata("enqueued_pset", enqueued_pset); + for (int c = 0; c < topo.total_cpus; c++) { + sched_policy_push_metadata("evaluate_cpu", c); + int evaluate_pset = cpu_id_to_pset_id(c); + if ((topo.psets[evaluate_pset].cpu_type != topo.psets[enqueued_pset].cpu_type) && + ((topo.psets[enqueued_pset].cpu_type != TEST_CPU_TYPE_PERFORMANCE) || + (topo.psets[preferred_pset_id].cpu_type != TEST_CPU_TYPE_PERFORMANCE))) { + /* Only evaluate steal between mismatching cluster types and where spill is not allowed */ + thread_t stolen_thread = cpu_steal_thread(c); + bool should_rebalance_steal = (topo.psets[evaluate_pset].cpu_type == topo.psets[preferred_pset_id].cpu_type) && + (topo.psets[enqueued_pset].cpu_type != topo.psets[preferred_pset_id].cpu_type); + bool did_rebalance_steal = (stolen_thread == thread); + if (stolen_thread != NULL) { + T_QUIET; T_EXPECT_EQ(stolen_thread, thread, "should only be one thread to steal?"); + } + T_QUIET; T_EXPECT_EQ(did_rebalance_steal, should_rebalance_steal, "should rebalance steal to move " + "thread to preferred type if not already there %s", sched_policy_dump_metadata()); + if (did_rebalance_steal) { + /* Put back stolen thread */ + enqueue_thread(pset_target(enqueued_pset), thread); + } + } + sched_policy_pop_metadata(); + } + + ret = dequeue_thread_expect(pset_target(enqueued_pset), thread); + T_QUIET; T_EXPECT_TRUE(ret, "thread correctly where we left it"); + sched_policy_pop_metadata(); + } + sched_policy_pop_metadata(); + } + SCHED_POLICY_PASS("Rebalance mechanisms kicking in!"); +} + +SCHED_POLICY_T_DECL(migration_harmonious_chosen_pset, + "Verify that different migration mechanisms agree about where a thread " + "should be, given current system conditions") +{ + int ret; + test_hw_topology_t topo = SCHED_POLICY_DEFAULT_TOPO; + init_migration_harness(topo); + int sched_bucket = TH_BUCKET_SHARE_DF; + struct thread_group *tg = 
create_tg(0); + thread_t thread = create_thread(sched_bucket, tg, root_bucket_to_highest_pri[sched_bucket]); + int max_load_threads = 20; + test_thread_t load_threads[max_load_threads]; + for (int i = 0; i < max_load_threads; i++) { + load_threads[i] = create_thread(sched_bucket, tg, root_bucket_to_highest_pri[sched_bucket]); + } + + /* Iterate conditions with different preferred psets and pset loads */ + for (int preferred_pset_id = 0; preferred_pset_id < topo.num_psets; preferred_pset_id++) { + set_tg_sched_bucket_preferred_pset(tg, sched_bucket, preferred_pset_id); + sched_policy_push_metadata("preferred_pset_id", preferred_pset_id); + for (int loaded_pset_id = 0; loaded_pset_id < topo.num_psets; loaded_pset_id++) { + // TODO: Test properly updated load average + enqueue_threads_arr(pset_target(loaded_pset_id), max_load_threads, load_threads); + bool preferred_is_idle = preferred_pset_id != loaded_pset_id; + sched_policy_push_metadata("loaded_pset_id", loaded_pset_id); + + /* Where the thread proactively wants to go */ + int chosen_pset = choose_pset_for_thread(thread); + bool chose_the_preferred_pset = chosen_pset == preferred_pset_id; + if (preferred_is_idle) { + T_QUIET; T_EXPECT_TRUE(chose_the_preferred_pset, "Should always choose the preferred pset if idle %s", + sched_policy_dump_metadata()); + } + + /* Thread generally should not avoid a processor in its chosen pset */ + for (int c = 0; c < topo.psets[chosen_pset].num_cpus; c++) { + int avoid_cpu_id = pset_id_to_cpu_id(chosen_pset) + c; + sched_policy_push_metadata("avoid_cpu_id", avoid_cpu_id); + ret = thread_avoid_processor_expect(thread, avoid_cpu_id, false, false); + T_QUIET; T_EXPECT_TRUE(ret, "Thread should not want to leave processor in just chosen pset %s", + sched_policy_dump_metadata()); + sched_policy_pop_metadata(); + } + + /* Extra assertions we can make based on the preferred pset being idle */ + if (preferred_is_idle) { + /* Thread should avoid processor in non-preferred pset to get to the idle preferred pset */ + for (int c = 0; c < topo.total_cpus; c++) { + if (cpu_id_to_pset_id(c) != preferred_pset_id) { + sched_policy_push_metadata("avoid_non_preferred_cpu_id", c); + ret = thread_avoid_processor_expect(thread, c, false, true); + T_QUIET; T_EXPECT_TRUE(ret, "Thread should avoid processor in non-preferred pset to get to idle " + "preferred pset %s", sched_policy_dump_metadata()); + sched_policy_pop_metadata(); + } + } + } + + /* Other cores should not want to rebalance the running thread away from its chosen pset */ + int chosen_cpu = pset_id_to_cpu_id(chosen_pset); + cpu_set_thread_current(chosen_cpu, thread); + for (int c = 0; c < topo.total_cpus; c++) { + if ((cpu_id_to_pset_id(c) != chosen_pset) && (cpu_id_to_pset_id(c) != loaded_pset_id)) { + sched_policy_push_metadata("stealing_cpu_id", c); + thread_t stolen_thread = cpu_steal_thread(c); + if (stolen_thread != NULL) { + T_QUIET; T_EXPECT_NE(stolen_thread, thread, "Should not steal back thread from its chosen_pset %s", + sched_policy_dump_metadata()); + if (stolen_thread != thread) { + /* Put back the stolen load thread */ + enqueue_thread(pset_target(loaded_pset_id), stolen_thread); + } + } + bool want_rebalance = cpu_processor_balance(c); + T_QUIET; T_EXPECT_FALSE(want_rebalance, "Should not rebalance thread away from its chosen_pset %s", + sched_policy_dump_metadata()); + sched_policy_pop_metadata(); + } + } + + (void)dequeue_threads_expect_ordered_arr(pset_target(loaded_pset_id), max_load_threads, load_threads); + for (int pset = 0; pset < 
topo.num_psets; pset++) { + T_QUIET; T_EXPECT_TRUE(runqueue_empty(pset_target(pset)), "pset %d wasn't cleared at the end of test " + "scenario %s", pset, sched_policy_dump_metadata()); + } + sched_policy_pop_metadata(); + } + sched_policy_pop_metadata(); + } + SCHED_POLICY_PASS("Policy is harmonious on the subject of a thread's chosen pset"); +} + +SCHED_POLICY_T_DECL(migration_search_order, + "Verify that we iterate psets for spill and steal in the expected order") +{ + int ret; + init_migration_harness(dual_die); + int expected_orders[6][6] = { + {0, 3, 1, 2, 4, 5}, + {1, 2, 4, 5, 0, 3}, + {2, 1, 4, 5, 0, 3}, + {3, 0, 4, 5, 1, 2}, + {4, 5, 1, 2, 3, 0}, + {5, 4, 1, 2, 3, 0}, + }; + for (int src_pset_id = 0; src_pset_id < dual_die.num_psets; src_pset_id++) { + ret = iterate_pset_search_order_expect(src_pset_id, UINT64_MAX, 0, expected_orders[src_pset_id], dual_die.num_psets); + T_QUIET; T_EXPECT_EQ(ret, -1, "Mismatched search order at ind %d for src_pset_id %d", + ret, src_pset_id); + } + SCHED_POLICY_PASS("Search order sorts on migration weight, then locality, then pset id"); + uint64_t p_mask = 0b110110; + int expected_p_orders[6][6] = { + {1, 2, 4, 5, -1, -1}, + {1, 2, 4, 5, -1, -1}, + {2, 1, 4, 5, -1, -1}, + {4, 5, 1, 2, -1, -1}, + {4, 5, 1, 2, -1, -1}, + {5, 4, 1, 2, -1, -1}, + }; + uint64_t e_mask = 0b001001; + int expected_e_orders[6][6] = { + {0, 3, -1, -1, -1, -1}, + {0, 3, -1, -1, -1, -1}, + {0, 3, -1, -1, -1, -1}, + {3, 0, -1, -1, -1, -1}, + {3, 0, -1, -1, -1, -1}, + {3, 0, -1, -1, -1, -1}, + }; + for (int i = 0; i < 2; i++) { + for (int src_pset_id = 0; src_pset_id < dual_die.num_psets; src_pset_id++) { + uint64_t mask = (i == 0) ? p_mask : e_mask; + int *expected_order_masked = (i == 0) ? expected_p_orders[src_pset_id] : expected_e_orders[src_pset_id]; + ret = iterate_pset_search_order_expect(src_pset_id, mask, 0, expected_order_masked, dual_die.num_psets); + T_QUIET; T_EXPECT_EQ(ret, -1, "Mismatched masked search order at ind %d for src_pset_id %d", + ret, src_pset_id); + } + } + SCHED_POLICY_PASS("Search order traversal respects candidate mask"); +} diff --git a/tests/sched/edge_runqueue.c b/tests/sched/edge_runqueue.c index 7e1152ea1..ba2d26701 100644 --- a/tests/sched/edge_runqueue.c +++ b/tests/sched/edge_runqueue.c @@ -16,7 +16,7 @@ SCHED_POLICY_T_DECL(runq_shared_rsrc_bound, init_migration_harness(single_core); struct thread_group *tg = create_tg(0); /* Test both shared resource types */ - for (int i = 0; i < 2; i++) { + for (int i = 0; i < CLUSTER_SHARED_RSRC_TYPE_COUNT; i++) { thread_t thread = create_thread(TH_BUCKET_SHARE_DF, tg, root_bucket_to_highest_pri[TH_BUCKET_SHARE_DF]); edge_set_thread_shared_rsrc(thread, i); enqueue_thread(default_target, thread); diff --git a/tests/sched/rt_migration.c b/tests/sched/rt_migration.c new file mode 100644 index 000000000..9369d73e1 --- /dev/null +++ b/tests/sched/rt_migration.c @@ -0,0 +1,396 @@ +// Copyright (c) 2024 Apple Inc. All rights reserved. 
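+/* + * Unit tests for realtime thread migration under the edge scheduler harness: cluster-bound pset selection, + * the realtime spill policy and computed spill search orders, thread_avoid_processor behavior for realtime + * threads, cross-pset steal/choose_thread decisions, and followup IPIs for stealable realtime threads. + */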
+ +#include "sched_test_harness/sched_policy_darwintest.h" +#include "sched_test_harness/sched_edge_harness.h" + +T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("scheduler"), + T_META_RUN_CONCURRENTLY(true), + T_META_OWNER("m_zinn")); + +static mach_timebase_info_data_t timebase_info; + +uint64_t +nanos_to_abs(uint64_t nanos) +{ + static mach_timebase_info_data_t timebase = {}; + + if (timebase.numer == 0 || timebase.denom == 0) { + kern_return_t kr; + + kr = mach_timebase_info(&timebase_info); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info"); + + timebase = timebase_info; + } + return nanos * timebase.denom / timebase.numer; +} + +SCHED_POLICY_T_DECL(rt_migration_cluster_bound, + "Verify that cluster-bound realtime threads always choose the bound " + "cluster except when its derecommended") +{ + int ret; + init_migration_harness(dual_die); + struct thread_group *tg = create_tg(0); + test_thread_t threads[dual_die.num_psets]; + for (int i = 0; i < dual_die.num_psets; i++) { + threads[i] = create_thread(TH_BUCKET_FIXPRI, tg, BASEPRI_RTQUEUES); + set_thread_cluster_bound(threads[i], i); + } + for (int i = 0; i < dual_die.num_psets; i++) { + set_current_processor(pset_id_to_cpu_id(i)); + for (int j = 0; j < dual_die.num_psets; j++) { + ret = choose_pset_for_thread_expect(threads[j], j); + T_QUIET; T_EXPECT_TRUE(ret, "Expecting the bound cluster"); + } + } + SCHED_POLICY_PASS("Cluster bound chooses bound cluster"); + /* Derecommend the bound cluster */ + for (int i = 0; i < dual_die.num_psets; i++) { + set_pset_derecommended(i); + int replacement_pset = -1; + for (int j = 0; j < dual_die.num_psets; j++) { + /* Find the first homogenous cluster and mark it as idle so we choose it */ + if ((i != j) && (dual_die.psets[i].cpu_type == dual_die.psets[j].cpu_type)) { + replacement_pset = j; + break; + } + } + ret = choose_pset_for_thread_expect(threads[i], replacement_pset); + T_QUIET; T_EXPECT_TRUE(ret, "Expecting the idle pset when the bound cluster is derecommended"); + /* Restore pset conditions */ + set_pset_recommended(i); + } + SCHED_POLICY_PASS("Cluster binding is soft"); +} + +SCHED_POLICY_T_DECL(rt_choose_processor, + "Verify the realtime spill policy") +{ + test_hw_topology_t topo = dual_die; + init_migration_harness(topo); + + uint64_t start = mach_absolute_time(); + + const uint64_t period = 0; + const uint64_t computation = nanos_to_abs(5000000ULL); /* 5ms */ + const uint64_t constraint = nanos_to_abs(10000000ULL); /* 10ms */ + const bool preemptible = false; + const uint8_t priority_offset = 0; + + struct thread_group *tg = create_tg(0); + thread_t thread = create_thread(TH_BUCKET_FIXPRI, tg, BASEPRI_RTQUEUES); + set_thread_sched_mode(thread, TH_MODE_REALTIME); + const uint64_t deadline = rt_deadline_add(start, nanos_to_abs(10000000ULL /* 10ms */)); + set_thread_realtime(thread, period, computation, constraint, preemptible, priority_offset, deadline); + + test_thread_t earlier_threads[topo.total_cpus] = {}; + for (int i = 0; i < topo.total_cpus; i++) { + earlier_threads[i] = create_thread(TH_BUCKET_FIXPRI, tg, BASEPRI_RTQUEUES); + set_thread_sched_mode(earlier_threads[i], TH_MODE_REALTIME); + const uint64_t early_deadline = rt_deadline_add(start, nanos_to_abs(5000000) /* 5ms */); + set_thread_realtime(earlier_threads[i], period, computation, constraint, preemptible, priority_offset, early_deadline); + } + + test_thread_t later_thread = create_thread(TH_BUCKET_FIXPRI, tg, BASEPRI_RTQUEUES); + 
set_thread_sched_mode(later_thread, TH_MODE_REALTIME); + const uint64_t late_deadline = rt_deadline_add(start, nanos_to_abs(20000000ULL) /* 20ms */); + set_thread_realtime(later_thread, period, computation, constraint, preemptible, priority_offset, late_deadline); + + for (int preferred_pset_id = 0; preferred_pset_id < topo.num_psets; preferred_pset_id++) { + set_tg_sched_bucket_preferred_pset(tg, TH_BUCKET_FIXPRI, preferred_pset_id); + sched_policy_push_metadata("preferred_pset_id", preferred_pset_id); + + /* Unloaded system. Expect to choose the preferred pset. */ + choose_pset_for_thread_expect(thread, preferred_pset_id); + + /* + * Load the preferred pset with earlier-deadline threads. Should cause + * the thread to spill (since the die has multiple clusters of each + * performance type). + */ + for (int i = 0; i < topo.psets[preferred_pset_id].num_cpus; i++) { + int cpu_id = pset_id_to_cpu_id(preferred_pset_id) + i; + cpu_set_thread_current(cpu_id, earlier_threads[i]); + } + int chosen = choose_pset_for_thread(thread); + T_QUIET; T_EXPECT_GE(chosen, 0, "chose a valid cluster"); + T_QUIET; T_EXPECT_NE(chosen, preferred_pset_id, "chose an unloaded cluster"); + T_QUIET; T_EXPECT_EQ(topo.psets[chosen].cpu_type, topo.psets[preferred_pset_id].cpu_type, "chose a pset of the same performance type"); + + /* Replace the first earlier-deadline thread with a later-deadline thread. Should cause the thread to preempt. */ + cpu_set_thread_current(pset_id_to_cpu_id(preferred_pset_id), later_thread); + chosen = choose_pset_for_thread(thread); + T_QUIET; T_EXPECT_EQ(chosen, preferred_pset_id, "preempting later-deadline thread"); + + /* Load all psets of the same performance type with early-deadline threads. Expected preferred pset to be chosen. */ + for (int i = 0; i < topo.num_psets; i++) { + if (topo.psets[i].cpu_type != topo.psets[preferred_pset_id].cpu_type) { + continue; + } + for (int j = 0; j < topo.psets[i].num_cpus; j++) { + int cpu_id = pset_id_to_cpu_id(i) + j; + cpu_set_thread_current(cpu_id, earlier_threads[cpu_id]); + } + } + choose_pset_for_thread_expect(thread, preferred_pset_id); + + /* Clean up */ + for (int i = 0; i < topo.total_cpus; i++) { + cpu_clear_thread_current(i); + } + + sched_policy_pop_metadata(/* preferred_pset_id */); + } + + SCHED_POLICY_PASS("sched_rt_choose_processor selects the right pset"); +} + +SCHED_POLICY_T_DECL(rt_spill_order, "Verify computed realtime spill orders.") +{ + init_migration_harness(dual_die); + + /* Test setup: reset all edges. */ + for (uint src_id = 0; src_id < dual_die.num_psets; src_id++) { + for (uint dst_id = 0; dst_id < dual_die.num_psets; dst_id++) { + sched_rt_config_set(src_id, dst_id, (sched_clutch_edge) {}); + } + } + + /* First test: create edges from pset 5 to psets 0-3. */ + for (unsigned i = 0; i < 4; i++) { + sched_rt_config_set(5, i, (sched_clutch_edge) { + .sce_migration_allowed = 1, + .sce_steal_allowed = 0, + .sce_migration_weight = i % 3 /* create ties to test die-locality */ + }); + } + /* Disallow spill from 5 to 4, despite being the same perf level. 
*/ + sched_rt_config_set(5, 4, (sched_clutch_edge) { + .sce_migration_allowed = 0, + .sce_steal_allowed = 0, + .sce_migration_weight = 0 + }); + + rt_pset_recompute_spill_order(5); + + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(5, 0), 3, "spso_search_order[0] == 3"); + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(5, 1), 0, "spso_search_order[1] == 0"); + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(5, 2), 1, "spso_search_order[2] == 1"); + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(5, 3), 2, "spso_search_order[3] == 2"); + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(5, 4), PSET_ID_INVALID, "spso_search_order[4] == PSET_ID_INVALID"); + + /* Second test: create edges from 0 to psets 1, 2, 4, and 5. */ + sched_rt_config_set(0, 1, (sched_clutch_edge) { + .sce_migration_allowed = 1, + .sce_steal_allowed = 0, + .sce_migration_weight = 2 + }); + sched_rt_config_set(0, 2, (sched_clutch_edge) { + .sce_migration_allowed = 1, + .sce_steal_allowed = 0, + .sce_migration_weight = 1 + }); + sched_rt_config_set(0, 4, (sched_clutch_edge) { + .sce_migration_allowed = 1, + .sce_steal_allowed = 0, + .sce_migration_weight = 0 + }); + sched_rt_config_set(0, 5, (sched_clutch_edge) { + .sce_migration_allowed = 1, + .sce_steal_allowed = 0, + .sce_migration_weight = 1 + }); + + rt_pset_recompute_spill_order(0); + + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(0, 0), 4, "spso_search_order[0] == 4"); + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(0, 1), 2, "spso_search_order[1] == 2"); + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(0, 2), 5, "spso_search_order[2] == 5"); + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(0, 3), 1, "spso_search_order[3] == 1"); + T_QUIET; T_EXPECT_EQ(rt_pset_spill_search_order_at_offset(0, 4), PSET_ID_INVALID, "spso_search_order[4] == PSET_ID_INVALID"); + + SCHED_POLICY_PASS("Realtime spill orders are computed correctly."); +} + +SCHED_POLICY_T_DECL(rt_thread_avoid_processor, + "Verify that thread_avoid_processor is correct for realtime threads") +{ + int ret; + test_hw_topology_t topo = dual_die; + init_migration_harness(topo); + struct thread_group *tg = create_tg(0); + thread_t thread = create_thread(TH_BUCKET_FIXPRI, tg, BASEPRI_RTQUEUES); + + /* Iterate conditions with different preferred psets and pset loads */ + for (int preferred_pset_id = 0; preferred_pset_id < topo.num_psets; preferred_pset_id++) { + set_tg_sched_bucket_preferred_pset(tg, TH_BUCKET_FIXPRI, preferred_pset_id); + sched_policy_push_metadata("preferred_pset_id", preferred_pset_id); + + /* Where the thread proactively wants to go */ + int chosen_pset = choose_pset_for_thread(thread); + T_QUIET; T_EXPECT_EQ(preferred_pset_id, chosen_pset, "Thread should choose un-loaded preferred pset %s", + sched_policy_dump_metadata()); + + /* Thread generally should not avoid a processor in its chosen pset */ + for (int c = 0; c < topo.psets[chosen_pset].num_cpus; c++) { + int avoid_cpu_id = pset_id_to_cpu_id(chosen_pset) + c; + sched_policy_push_metadata("avoid_cpu_id", avoid_cpu_id); + ret = thread_avoid_processor_expect(thread, avoid_cpu_id, false, false); + T_QUIET; T_EXPECT_TRUE(ret, "Thread should not want to leave processor in just chosen pset %s", + sched_policy_dump_metadata()); + sched_policy_pop_metadata(); + } + + /* Thread should avoid processor if not allowed to run on the pset */ + for (int c = 0; c < topo.total_cpus; c++) { + sched_clutch_edge edge = sched_rt_config_get(preferred_pset_id, 
cpu_id_to_pset_id(c)); + if (cpu_id_to_pset_id(c) != preferred_pset_id && !(edge.sce_migration_allowed || edge.sce_steal_allowed)) { + sched_policy_push_metadata("avoid_non_preferred_cpu_id", c); + ret = thread_avoid_processor_expect(thread, c, false, true); + T_QUIET; T_EXPECT_TRUE(ret, "Thread should avoid processor in non-preferred pset to get to idle " + "preferred pset %s", sched_policy_dump_metadata()); + sched_policy_pop_metadata(); + } + } + + sched_policy_pop_metadata(); + } + SCHED_POLICY_PASS("thread_avoid_processor works for realtime threads"); +} + +static thread_t +create_realtime_thread_with_deadline(uint64_t deadline_nanos) +{ + test_thread_t thread = create_thread( + TH_BUCKET_FIXPRI, + create_tg(0) /* realtime policies don't consider thread groups */, + BASEPRI_RTQUEUES); + set_thread_sched_mode(thread, TH_MODE_REALTIME); + set_thread_realtime( + thread, + 0, + (uint32_t) nanos_to_abs(5000000ULL /* 5ms */), + (uint32_t) nanos_to_abs(10000000ULL /* 10ms */), + false, + 0, + nanos_to_abs(deadline_nanos)); + return thread; +} + +static void +fill_all_cpus_with_realtime_threads(uint64_t deadline_nanos) +{ + for (int i = 0; i < get_hw_topology().total_cpus; i++) { + cpu_set_thread_current(i, create_realtime_thread_with_deadline(deadline_nanos)); + } +} + +SCHED_POLICY_T_DECL(rt_choose_thread, "Verify realtime thread selection policy and mechanism") +{ + int ret; + test_hw_topology_t topo = dual_die; + init_migration_harness(topo); + + const uint64_t start = mach_absolute_time(); + const uint64_t deadline = rt_deadline_add(start, nanos_to_abs(5000000)); /* start + 5ms */ + const uint64_t later_deadline = rt_deadline_add(start, nanos_to_abs(6000000)); /* start + 6ms */ + + fill_all_cpus_with_realtime_threads(later_deadline); + + /* One of these threads will be on the stealing pset runqueue: */ + test_thread_t later_deadline_thread = create_realtime_thread_with_deadline(later_deadline); + test_thread_t earlier_deadline_thread = create_realtime_thread_with_deadline(deadline); + + /* And this thread will be on another runqueue: */ + test_thread_t stealable_thread = create_realtime_thread_with_deadline(deadline); + + /* Check that sched_rt_choose_thread obeys the steal policies configured by + * the realtime matrix. A pset should only steal if the thread's deadline + * is earlier than that of any thread on the pset's runqueue. 
*/ + + for (uint stealing_pset_id = 0; stealing_pset_id < topo.num_psets; stealing_pset_id++) { + sched_policy_push_metadata("stealing_pset", stealing_pset_id); + for (uint off = 1; off < topo.num_psets; off++) { + uint other_pset_id = (stealing_pset_id + off) % topo.num_psets; + sched_policy_push_metadata("other_pset", other_pset_id); + + enqueue_thread(pset_target(other_pset_id), stealable_thread); + + enqueue_thread(pset_target(stealing_pset_id), earlier_deadline_thread); + ret = dequeue_thread_expect(pset_target(stealing_pset_id), earlier_deadline_thread); + T_QUIET; T_ASSERT_TRUE(ret, "when deadlines are equal, prefer thread from local runqueue %s", sched_policy_dump_metadata()); + + enqueue_thread(pset_target(stealing_pset_id), later_deadline_thread); + if (topo.psets[other_pset_id].cpu_type == topo.psets[stealing_pset_id].cpu_type) { + T_QUIET; T_ASSERT_TRUE(sched_rt_config_get(other_pset_id, stealing_pset_id).sce_steal_allowed, "steal allowed between psets of the same type %s", sched_policy_dump_metadata()); + + ret = dequeue_thread_expect(pset_target(stealing_pset_id), stealable_thread); + T_QUIET; T_ASSERT_TRUE(ret, "steal because the other pset has an earlier-deadline thread %s", sched_policy_dump_metadata()); + + ret = dequeue_thread_expect(pset_target(stealing_pset_id), later_deadline_thread); + T_QUIET; T_ASSERT_TRUE(ret, "take thread from local runqueue because no earlier-deadline threads on other psets %s", sched_policy_dump_metadata()); + } else { + T_QUIET; T_ASSERT_FALSE(sched_rt_config_get(other_pset_id, stealing_pset_id).sce_steal_allowed, "steal disallowed between psets of different types %s", sched_policy_dump_metadata()); + + ret = dequeue_thread_expect(pset_target(stealing_pset_id), later_deadline_thread); + T_QUIET; T_ASSERT_TRUE(ret, "take later-deadline thread because policy disallows steal %s", sched_policy_dump_metadata()); + + ret = dequeue_thread_expect(pset_target(other_pset_id), stealable_thread); + T_QUIET; T_ASSERT_TRUE(ret, "removed stealable thread %s", sched_policy_dump_metadata()); + } + sched_policy_pop_metadata(/* other_pset */); + } + sched_policy_pop_metadata(/* stealing_pset */); + } + + SCHED_POLICY_PASS("Verified realtime thread selection"); +} + +SCHED_POLICY_T_DECL(rt_followup_ipi, "Verify that followup IPIs are sent when there are stealable realtime threads and idle processors") +{ + int ret; + test_hw_topology_t topo = dual_die; + init_migration_harness(topo); + + const uint64_t start = mach_absolute_time(); + const uint64_t deadline = rt_deadline_add(start, nanos_to_abs(5000000)); /* start + 5ms */ + + fill_all_cpus_with_realtime_threads(deadline); + + /* This thread is used to load a runqueue. */ + test_thread_t thread = create_realtime_thread_with_deadline(deadline); + + for (int target_cpu = 0; target_cpu < topo.total_cpus; target_cpu++) { + sched_policy_push_metadata("target_cpu", target_cpu); + for (int idle_cpu = 0; idle_cpu < topo.total_cpus; idle_cpu++) { + if (target_cpu == idle_cpu) { + continue; + } + + sched_policy_push_metadata("idle_cpu", idle_cpu); + enqueue_thread(cpu_target(target_cpu), thread); + test_thread_t saved_idle_thread = cpu_clear_thread_current(idle_cpu); + + /* idle_cpu is now "idle," now simulate thread_select() on target_cpu: */ + cpu_set_thread_current(target_cpu, cpu_clear_thread_current(target_cpu)); + + /* That should result in a deferred followup IPI, if spill is allowed between target_cpu and idle_cpu. 
*/ + if (topo.psets[cpu_id_to_pset_id(idle_cpu)].cpu_type == topo.psets[cpu_id_to_pset_id(target_cpu)].cpu_type) { + ret = ipi_expect(idle_cpu, TEST_IPI_DEFERRED); + T_QUIET; T_ASSERT_TRUE(ret, "should send a followup IPI %s", sched_policy_dump_metadata()); + } + + /* Clean up for the next iteration. */ + ret = dequeue_thread_expect(cpu_target(target_cpu), thread); + T_QUIET; T_ASSERT_TRUE(ret, "cleaning up %s", sched_policy_dump_metadata()); + cpu_set_thread_current(idle_cpu, saved_idle_thread); + sched_policy_pop_metadata(/* idle_cpu */); + } + sched_policy_pop_metadata(/* target_cpu */); + } + + SCHED_POLICY_PASS("Realtime followup IPIs work"); +} diff --git a/tests/sched/rttimer.c b/tests/sched/rttimer.c new file mode 100644 index 000000000..3f413225e --- /dev/null +++ b/tests/sched/rttimer.c @@ -0,0 +1,291 @@ +// Copyright (c) 2024 Apple Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "../test_utils.h" +#include "sched_test_utils.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.scheduler"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("scheduler") + ); + +static const uint32_t CALIBRATION_CYCLES = 10000; + +uint64_t waitStart = 0ULL; +uint64_t waitEnd = 8ULL * NSEC_PER_MSEC; +#if TARGET_OS_WATCH || TARGET_OS_TV +/* Increase step stride for slower APs. */ +uint64_t waitStep = 2000ULL * NSEC_PER_USEC; +#else /* TARGET_OS_WATCH || TARGET_OS_TV */ +uint64_t waitStep = 500ULL * NSEC_PER_USEC; +#endif /* TARGET_OS_WATCH || TARGET_OS_TV */ +uint64_t testDuration = 5ULL * NSEC_PER_SEC; +uint64_t wasteCPUThreads = 0ULL; +uint64_t wasteRTCPUThreads = 0ULL; +uint64_t wasteCPUTimeQuanta = 10ULL * NSEC_PER_MSEC; +uint64_t wasteCPUTimePercentActive = 50ULL; +uint64_t wasteCPUTimeQuantaRandomVariationPercent = 50ULL; +uint32_t rtPolicyPeriod = 0ULL * USEC_PER_SEC; +uint64_t rtPolicyComputation = 5ULL * USEC_PER_SEC; +uint64_t rtPolicyConstraint = 10ULL * USEC_PER_SEC; +bool rtPolicyPreemptible = false; + +/* Workgroup (for CLPC, and required to get RT on visionOS) */ +os_workgroup_t g_rt_workgroup = NULL; +os_workgroup_join_token_s g_rt_workgroup_join_token = { 0 }; + +static const char workload_config_plist[] = { +#embed "rttimer.workload_config.plist" suffix(,) + 0, +}; + + +static void +workload_config_load(void) +{ + /* Try to load the test workload config plist. */ + size_t len = 0; + int ret = sysctlbyname("kern.workload_config", NULL, &len, (void*) (const void*) workload_config_plist, strlen(workload_config_plist)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname(kern.workload_config)"); +} + +static void +workload_config_unload(void) +{ + /* clear the loaded workload config plist.. */ + size_t len = 0; + sysctlbyname("kern.workload_config", NULL, &len, "", 1); +} + +static void +setup_workgroup(void) +{ + int ret; + /* Create a named workgroup. 
*/ + os_workgroup_attr_s attr = OS_WORKGROUP_ATTR_INITIALIZER_DEFAULT; + ret = os_workgroup_attr_set_flags(&attr, OS_WORKGROUP_ATTR_NONPROPAGATING); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "os_workgroup_set_flags(OS_WORKGROUP_ATTR_NONPROPAGATING)"); + g_rt_workgroup = os_workgroup_create_with_workload_id("rttimer", "com.apple.test", &attr); + T_QUIET; T_ASSERT_NOTNULL(g_rt_workgroup, "created the test workgroup"); +} + +static thread_basic_info_data_t +thread_info_get() +{ + thread_basic_info_data_t value; + mach_msg_type_number_t info_count = THREAD_BASIC_INFO_COUNT; + thread_info(pthread_mach_thread_np(pthread_self()), THREAD_BASIC_INFO, (thread_info_t)&value, &info_count); + return value; +} + + +static void +make_realtime() +{ + thread_time_constraint_policy_data_t policy; + policy.period = (uint32_t)(nanos_to_abs(rtPolicyPeriod)); + policy.computation = (uint32_t)(nanos_to_abs(rtPolicyComputation)); + policy.constraint = (uint32_t)(nanos_to_abs(rtPolicyConstraint)); + policy.preemptible = rtPolicyPreemptible; + + int ret = thread_policy_set( + pthread_mach_thread_np(pthread_self()), + THREAD_TIME_CONSTRAINT_POLICY, + (thread_policy_t)&policy, + THREAD_TIME_CONSTRAINT_POLICY_COUNT); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "thread_policy_set self to realtime"); +} + +static void * +cpu_waster(void * arg) +{ + int ret; + char * name; + ret = asprintf(&name, "cpu_waster#%d", (int) arg); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "asprintf"); + ret = pthread_setname_np(name); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_setname_np(\"%s\")", name); + + while (1) { + uint64_t time_quanta_in_ns = wasteCPUTimeQuanta; + if (wasteCPUTimeQuantaRandomVariationPercent) { + uint64_t maximum_possible_variation_in_ns = wasteCPUTimeQuanta * wasteCPUTimeQuantaRandomVariationPercent / 100ULL; + uint64_t actual_variation_in_ns = arc4random_uniform((uint32_t)maximum_possible_variation_in_ns); + time_quanta_in_ns += actual_variation_in_ns; + } + + uint64_t time_active_in_ns = time_quanta_in_ns * wasteCPUTimePercentActive / 100ULL; + uint64_t time_sleeping_in_ns = time_quanta_in_ns - time_active_in_ns; + + // Chew some CPU + uint64_t time_active_in_abs = nanos_to_abs(time_active_in_ns); + uint64_t test_start_time = mach_absolute_time(); + uint64_t test_desired_end_time = test_start_time + time_active_in_abs; + while (mach_absolute_time() < test_desired_end_time) { + } + + // Sleep a bit + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = time_sleeping_in_ns; + nanosleep(&ts, NULL); + } + return NULL; +} + +static void * +perform_test(__unused void * arg) +{ + make_realtime(); + + T_LOG("Requested Test Average Worst"); + T_LOG("WAIT(ns) CPU(us) cpu%% Elapsed(ns) Miss(ns) Miss(ns)"); + + for (uint64_t delay_in_ns = waitStart; delay_in_ns <= waitEnd; delay_in_ns += waitStep) { + uint64_t delay_in_abs = nanos_to_abs(delay_in_ns); + + uint64_t test_start_time = mach_absolute_time(); + uint64_t test_desired_end_time = test_start_time + nanos_to_abs(testDuration); + uint64_t test_actual_end_time = 0; + uint64_t elapsed_reading_count = 0; + uint64_t total_elapsed_time = 0; + uint64_t avg_elapsed_reading = 0; + uint64_t worst_miss = 0; + + thread_basic_info_data_t start_info = thread_info_get(); + do { + // This is the actual timer wait + uint64_t t1 = mach_absolute_time(); + mach_wait_until(t1 + delay_in_abs); + uint64_t t2 = mach_absolute_time(); + + // Now we calculate the elapsed time + int64_t elapsed_ns = abs_to_nanos(t2 - t1 - delay_in_abs); + elapsed_reading_count++; + total_elapsed_time += elapsed_ns; +
avg_elapsed_reading = total_elapsed_time / elapsed_reading_count; + + if (elapsed_ns > worst_miss) { + worst_miss = elapsed_ns; + } + } while ((test_actual_end_time = mach_absolute_time()) < test_desired_end_time); + + thread_basic_info_data_t end_info = thread_info_get(); + + uint64_t user_delta_micros = ((end_info.user_time.seconds * USEC_PER_SEC) + end_info.user_time.microseconds) - + ((start_info.user_time.seconds * USEC_PER_SEC) + start_info.user_time.microseconds); + + uint64_t system_delta_micros = ((end_info.system_time.seconds * USEC_PER_SEC) + end_info.system_time.microseconds) - + ((start_info.system_time.seconds * USEC_PER_SEC) + start_info.system_time.microseconds); + + uint64_t total_delta_micros = user_delta_micros + system_delta_micros; + uint64_t test_actual_elapsed = abs_to_nanos(test_actual_end_time - test_start_time); + avg_elapsed_reading = total_elapsed_time / elapsed_reading_count; + + T_LOG("%09llu, %7llu, %4.1f, %10llu, %09llu, %09llu", + delay_in_ns, total_delta_micros, (double)end_info.cpu_usage / 10.0, + test_actual_elapsed, avg_elapsed_reading, worst_miss); + + T_QUIET; T_EXPECT_LE(avg_elapsed_reading, 500 * NSEC_PER_USEC, "average miss is <=0.5ms."); + if (avg_elapsed_reading > 500 * NSEC_PER_USEC) { + sched_kdebug_test_fail(delay_in_ns, total_delta_micros, avg_elapsed_reading, worst_miss); + } + } + + return NULL; +} + +static void * +calibration(__unused void * arg) +{ + make_realtime(); + + uint64_t delta_measurement = 0; + for (uint32_t i = 0; i < CALIBRATION_CYCLES; ++i) { + uint64_t last_time = mach_absolute_time(); + uint64_t delta = mach_absolute_time() - last_time; + delta_measurement += abs_to_nanos(delta); + } + + T_LOG( "mach_absolute_time minimum resolution: %llu ns", abs_to_nanos(1ULL)); + T_LOG( "averaged minimum measurement time: %llu ns", delta_measurement / CALIBRATION_CYCLES); + T_LOG( "testDuration: %llu ns", testDuration); + T_LOG( "waitStep: %llu ns", waitStep); + + return NULL; +} + +T_DECL(rttimer, "Check that realtime thread timer's average miss is <= 0.5ms", + T_META_TAG_VM_NOT_ELIGIBLE, XNU_T_META_SOC_SPECIFIC, + T_META_CHECK_LEAKS(false), /* could affect timing */ + T_META_RUN_CONCURRENTLY(false), + T_META_ASROOT(true) /* needed to set workload config */ + ) +{ + T_QUIET; T_ASSERT_POSIX_SUCCESS(proc_disable_wakemon(getpid()), "proc_disable_wakemon(getpid())"); + + if (platform_is_virtual_machine()) { + T_SKIP("Test not supposed to run on virtual machine. rdar://132930927"); + } + + pthread_t thread = NULL; + int ret; + + /* Load the workload config. */ + workload_config_load(); + T_ATEND(workload_config_unload); + + /* Create the workgroup. The main thread does not need to join and become realtime. */ + setup_workgroup(); + + /* Calibration */ + ret = pthread_create_with_workgroup_np(&thread, g_rt_workgroup, NULL, calibration, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create(calibration)"); + ret = pthread_join(thread, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_join(calibration)"); + + /* No-load tests */ + T_LOG(""); + T_LOG("Performing no-load tests."); + ret = pthread_create_with_workgroup_np(&thread, g_rt_workgroup, NULL, perform_test, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create(perform_test) no-load"); + ret = pthread_join(thread, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_join(perform_test) no-load"); + + /* Heavy-load tests */ + int thread_count = 2 * dt_ncpu(); + T_LOG(""); + T_LOG("Performing heavy-load tests. 
Spawning %d default priority cpu waster threads.", thread_count); + for (int i = 0; i < thread_count; i++) { + ret = pthread_create(&thread, NULL, cpu_waster, (void *) (uintptr_t) i); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create(cpu_waster#%d)", i); + } + ret = pthread_create_with_workgroup_np(&thread, g_rt_workgroup, NULL, perform_test, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_create(perform_test) heavy-load"); + ret = pthread_join(thread, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "pthread_join(perform_test) heavy-load"); + + T_PASS("realtime thread timer's average miss is <= 0.5ms"); + T_END; +} diff --git a/tests/sched/rttimer.entitlements b/tests/sched/rttimer.entitlements new file mode 100644 index 000000000..5726ec2c6 --- /dev/null +++ b/tests/sched/rttimer.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.kernel.work-interval + + + diff --git a/tests/sched/rttimer.workload_config.plist b/tests/sched/rttimer.workload_config.plist new file mode 100644 index 000000000..d5916dba3 --- /dev/null +++ b/tests/sched/rttimer.workload_config.plist @@ -0,0 +1,27 @@ + + + + + WorkloadIDTable + + com.apple.test + + Phases + + Realtime + + WorkIntervalType + DEFAULT + WorkloadClass + REALTIME + + + Root + + DefaultPhase + Realtime + + + + + \ No newline at end of file diff --git a/tests/sched/sched_test_harness/Makefile b/tests/sched/sched_test_harness/Makefile deleted file mode 100644 index 75b876cd7..000000000 --- a/tests/sched/sched_test_harness/Makefile +++ /dev/null @@ -1,93 +0,0 @@ -ifneq ($(PLATFORM),MacOSX) -# Exclude building for any platform except MacOSX, due to arch/target incompatibility -EXCLUDED_SOURCES += sched/clutch_runqueue.c sched/edge_runqueue.c sched/edge_migration.c -else - -SCHED_HARNESS := sched/sched_test_harness -SCHED_HARNESS_SHADOW := $(SCHED_HARNESS)/shadow_headers -SCHED_CLUTCH_DISABLED_WARNINGS := -Wno-declaration-after-statement -Wno-nullability-completeness -Wno-missing-prototypes -Wno-gnu-statement-expression-from-macro-expansion -Wno-implicit-int-conversion -Wno-sign-conversion -Wno-c++98-compat -Wno-language-extension-token -Wno-c2x-extensions -Wno-format-nonliteral -Wno-unused-function -SCHED_TEST_DISABLED_WARNINGS := -Wno-gnu-binary-literal -Wno-format-nonliteral -Wno-language-extension-token -Wno-sign-conversion -SCHED_HARNESS_DEFINES := -DSCHED_TEST_HARNESS=1 -DCONFIG_SCHED_CLUTCH=1 -DCONFIG_SCHED_TIMESHARE_CORE=1 -DCONFIG_THREAD_GROUPS=1 -# Configure osmfk/kern/queue.h to define symbol __queue_element_linkage_invalid() -SCHED_HARNESS_DEFINES += -DDRIVERKIT_FRAMEWORK_INCLUDE=1 -SCHED_EDGE_DEFINES := -DCONFIG_SCHED_EDGE=1 -D__AMP__=1 - -# Enable some ASan/UBSan in the test binary for MacOS target -SCHED_HARNESS_DEBUG_FLAGS := -fsanitize=bounds -fsanitize=null -fsanitize=address -gfull - -XNU_SRC := .. 
-# List the shadow_headers directory first before ../osfmk/ so that headers present in the -# harness directory override the versions in xnu source proper -SCHED_HARNESS_COMPILER_SEARCH_ORDER := -I $(SCHED_HARNESS_SHADOW)/ -I $(XNU_SRC)/osfmk/ - -# Track file modifications correctly in the recipe -SCHED_HARNESS_DEPS := $(shell find $(SCHED_HARNESS) -name "*.c" -o -name "*.h") -SCHED_CLUTCH_DEPS := $(XNU_SRC)/osfmk/kern/sched_clutch.c $(XNU_SRC)/osfmk/kern/sched_clutch.h $(XNU_SRC)/osfmk/kern/queue.h $(XNU_SRC)/osfmk/kern/circle_queue.h $(XNU_SRC)/osfmk/kern/bits.h $(XNU_SRC)/osfmk/kern/sched.h - -# Guard-out some unwanted includes without needing to modify the original header files -SCHED_CLUTCH_UNWANTED_HDRS := mach/policy.h kern/smp.h kern/timer_call.h kern/macro_help.h kern/spl.h kern/misc_protos.h kern/thread.h -clutch_setup_placehold_hdrs: - mkdir -p $(SCHED_HARNESS_SHADOW)/kern - mkdir -p $(SCHED_HARNESS_SHADOW)/mach - for hdr in $(SCHED_CLUTCH_UNWANTED_HDRS); do \ - echo "/* Empty file used as a placeholder for " $$hdr " that we don't want to import */" > $(SCHED_HARNESS_SHADOW)/$$hdr; \ - done - # Replace osfmk/mach/mach_types.h with smaller set of dependencies combined in misc_needed_defines.h, - # ultimately satisfying dependencies needed by osfmk/kern/sched.h - echo '#include "misc_needed_defines.h"' > $(SCHED_HARNESS_SHADOW)/mach/mach_types.h - -# Make it convenient to build all of the tests in one go -SCHED_USERSPACE_UNIT_TESTS = sched/clutch_runqueue sched/edge_runqueue sched/edge_migration -.PHONY: sched/userspace_unit_tests -sched/userspace_unit_tests: $(SCHED_USERSPACE_UNIT_TESTS) -SCHED_TARGETS += $(SCHED_USERSPACE_UNIT_TESTS) - -# Link together all the object files (built below) with the unit tests into a final binary -sched/clutch_runqueue: INVALID_ARCHS = $(filter-out arm64e%,$(ARCH_CONFIGS)) -sched/clutch_runqueue: OTHER_CFLAGS += $(SCHED_HARNESS_DEFINES) $(SCHED_HARNESS_DEBUG_FLAGS) $(SCHED_HARNESS_COMPILER_SEARCH_ORDER) $(SCHED_TEST_DISABLED_WARNINGS) -DTEST_RUNQ_POLICY="clutch" -sched/clutch_runqueue: OTHER_LDFLAGS += -ldarwintest_utils $(SCHED_HARNESS_DEBUG_FLAGS) $(OBJROOT)/sched_clutch_harness.o $(OBJROOT)/priority_queue.o $(OBJROOT)/sched_runqueue_harness.o -sched/clutch_runqueue: $(OBJROOT)/sched_clutch_harness.o $(OBJROOT)/priority_queue.o $(OBJROOT)/sched_runqueue_harness.o -sched/clutch_runqueue: CONFIG_FLAGS := $(filter-out -O%,$(CONFIG_FLAGS)) -O0 -gfull - -sched/edge_runqueue: INVALID_ARCHS = $(filter-out arm64e%,$(ARCH_CONFIGS)) -sched/edge_runqueue: OTHER_CFLAGS += $(SCHED_HARNESS_DEFINES) $(SCHED_EDGE_DEFINES) $(SCHED_HARNESS_DEBUG_FLAGS) $(SCHED_HARNESS_COMPILER_SEARCH_ORDER) $(SCHED_TEST_DISABLED_WARNINGS) -DTEST_RUNQ_POLICY="edge" -sched/edge_runqueue: OTHER_LDFLAGS += -ldarwintest_utils $(SCHED_HARNESS_DEBUG_FLAGS) $(OBJROOT)/sched_edge_harness.o $(OBJROOT)/priority_queue.o $(OBJROOT)/sched_runqueue_harness.o $(OBJROOT)/sched_migration_harness.o -sched/edge_runqueue: $(OBJROOT)/sched_edge_harness.o $(OBJROOT)/priority_queue.o $(OBJROOT)/sched_runqueue_harness.o $(OBJROOT)/sched_migration_harness.o -sched/edge_runqueue: CONFIG_FLAGS := $(filter-out -O%,$(CONFIG_FLAGS)) -O0 -gfull - -sched/edge_migration: INVALID_ARCHS = $(filter-out arm64e%,$(ARCH_CONFIGS)) -sched/edge_migration: OTHER_CFLAGS += $(SCHED_HARNESS_DEFINES) $(SCHED_EDGE_DEFINES) $(SCHED_HARNESS_DEBUG_FLAGS) $(SCHED_HARNESS_COMPILER_SEARCH_ORDER) $(SCHED_TEST_DISABLED_WARNINGS) -DTEST_RUNQ_POLICY="edge" -sched/edge_migration: OTHER_LDFLAGS += -ldarwintest_utils 
$(SCHED_HARNESS_DEBUG_FLAGS) $(OBJROOT)/sched_edge_harness.o $(OBJROOT)/priority_queue.o $(OBJROOT)/sched_runqueue_harness.o $(OBJROOT)/sched_migration_harness.o -sched/edge_migration: $(OBJROOT)/sched_edge_harness.o $(OBJROOT)/priority_queue.o $(OBJROOT)/sched_runqueue_harness.o $(OBJROOT)/sched_migration_harness.o -sched/edge_migration: CONFIG_FLAGS := $(filter-out -O%,$(CONFIG_FLAGS)) -O0 -gfull - -# Runqueue harness -$(OBJROOT)/sched_runqueue_harness.o: OTHER_CFLAGS += $(SCHED_HARNESS_DEBUG_FLAGS) -$(OBJROOT)/sched_runqueue_harness.o: $(SCHED_HARNESS)/sched_runqueue_harness.c - $(MAKE) clutch_setup_placehold_hdrs - $(CC) $(OTHER_CFLAGS) $(CFLAGS) -c $< -o $@ - -# Migration harness -$(OBJROOT)/sched_migration_harness.o: OTHER_CFLAGS += $(SCHED_HARNESS_DEBUG_FLAGS) -$(OBJROOT)/sched_migration_harness.o: $(SCHED_HARNESS)/sched_migration_harness.c - $(MAKE) clutch_setup_placehold_hdrs - $(CC) $(OTHER_CFLAGS) $(CFLAGS) -c $< -o $@ - -# Clutch harness -$(OBJROOT)/sched_clutch_harness.o: OTHER_CFLAGS += -DRUNQUEUE_HARNESS_IMPLEMENTATION=1 $(SCHED_HARNESS_DEFINES) $(SCHED_HARNESS_DEBUG_FLAGS) $(SCHED_CLUTCH_DISABLED_WARNINGS) $(SCHED_HARNESS_COMPILER_SEARCH_ORDER) -$(OBJROOT)/sched_clutch_harness.o: $(SCHED_HARNESS)/sched_clutch_harness.c $(SCHED_HARNESS_DEPS) $(SCHED_CLUTCH_DEPS) - $(MAKE) clutch_setup_placehold_hdrs - $(CC) $(OTHER_CFLAGS) $(CFLAGS) -c $< -o $@ - -# Edge harness -$(OBJROOT)/sched_edge_harness.o: OTHER_CFLAGS += $(SCHED_HARNESS_DEFINES) $(SCHED_EDGE_DEFINES) $(SCHED_HARNESS_DEBUG_FLAGS) $(SCHED_CLUTCH_DISABLED_WARNINGS) $(SCHED_HARNESS_COMPILER_SEARCH_ORDER) -$(OBJROOT)/sched_edge_harness.o: $(SCHED_HARNESS)/sched_edge_harness.c $(SCHED_HARNESS)/sched_clutch_harness_impl.c $(SCHED_HARNESS_DEPS) $(SCHED_CLUTCH_DEPS) - $(MAKE) clutch_setup_placehold_hdrs - $(CC) $(OTHER_CFLAGS) $(CFLAGS) -c $< -o $@ - -# Priority queue C++ dependency -$(OBJROOT)/priority_queue.o: OTHER_CXXFLAGS += -std=c++11 $(SCHED_HARNESS_DEFINES) $(SCHED_HARNESS_DEBUG_FLAGS) $(SCHED_HARNESS_COMPILER_SEARCH_ORDER) -$(OBJROOT)/priority_queue.o: $(SCHED_HARNESS_SHADOW)/priority_queue.cpp - $(CXX) $(OTHER_CXXFLAGS) $(CXXFLAGS) -c $< -o $@ - -endif diff --git a/tests/sched/sched_test_harness/sched_clutch_harness.c b/tests/sched/sched_test_harness/sched_clutch_harness.c index cff9bead1..9fc3175dc 100644 --- a/tests/sched/sched_test_harness/sched_clutch_harness.c +++ b/tests/sched/sched_test_harness/sched_clutch_harness.c @@ -11,10 +11,12 @@ impl_init_runqueue(void) assert(processor_avail_count == 1); sched_clutch_init(); sched_clutch_pset_init(&pset0); + sched_rt_init_pset(&pset0); sched_clutch_processor_init(&cpu0); increment_mock_time(100); clutch_impl_init_params(); clutch_impl_init_tracepoints(); + sched_rt_init_completed(); } struct thread_group * @@ -29,12 +31,6 @@ impl_create_thread(int root_bucket, struct thread_group *tg, int pri) return clutch_impl_create_thread(root_bucket, tg, pri); } -void -impl_set_thread_sched_mode(test_thread_t thread, int mode) -{ - clutch_impl_set_thread_sched_mode(thread, mode); -} - void impl_set_thread_processor_bound(test_thread_t thread, int cpu_id) { @@ -47,21 +43,30 @@ impl_cpu_set_thread_current(int cpu_id, test_thread_t thread) clutch_impl_cpu_set_thread_current(cpu_id, thread); } -void +test_thread_t impl_cpu_clear_thread_current(int cpu_id) { - clutch_impl_cpu_clear_thread_current(cpu_id); + return clutch_impl_cpu_clear_thread_current(cpu_id); } void impl_cpu_enqueue_thread(int cpu_id, test_thread_t thread) { - sched_clutch_processor_enqueue(cpus[cpu_id], thread, 
SCHED_TAILQ); + if (impl_get_thread_is_realtime(thread)) { + rt_runq_insert(cpus[cpu_id], cpus[cpu_id]->processor_set, (thread_t) thread); + } else { + sched_clutch_processor_enqueue(cpus[cpu_id], (thread_t) thread, SCHED_TAILQ); + } } test_thread_t impl_cpu_dequeue_thread(int cpu_id) { + test_thread_t chosen_thread = sched_rt_choose_thread(cpus[cpu_id]); + if (chosen_thread != THREAD_NULL) { + return chosen_thread; + } + /* No realtime threads. */ return sched_clutch_choose_thread(cpus[cpu_id], MINPRI, NULL, 0); } @@ -69,6 +74,7 @@ test_thread_t impl_cpu_dequeue_thread_compare_current(int cpu_id) { assert(cpus[cpu_id]->active_thread != NULL); + assert(impl_get_thread_is_realtime(cpus[cpu_id]) == false); /* should not be called when realtime threads are running */ return sched_clutch_choose_thread(cpus[cpu_id], MINPRI, cpus[cpu_id]->active_thread, 0); } diff --git a/tests/sched/sched_test_harness/sched_clutch_harness.h b/tests/sched/sched_test_harness/sched_clutch_harness.h index 3d07e7747..483a07a71 100644 --- a/tests/sched/sched_test_harness/sched_clutch_harness.h +++ b/tests/sched/sched_test_harness/sched_clutch_harness.h @@ -28,6 +28,6 @@ extern test_thread_t clutch_impl_create_thread(int root_bucket, struct thread_gr extern void clutch_impl_set_thread_sched_mode(test_thread_t thread, int mode); extern void clutch_impl_set_thread_processor_bound(test_thread_t thread, int cpu_id); extern void clutch_impl_cpu_set_thread_current(int cpu_id, test_thread_t thread); -extern void clutch_impl_cpu_clear_thread_current(int cpu_id); +extern test_thread_t clutch_impl_cpu_clear_thread_current(int cpu_id); extern void clutch_impl_log_tracepoint(uint64_t trace_code, uint64_t a1, uint64_t a2, uint64_t a3, uint64_t a4); extern void clutch_impl_pop_tracepoint(uint64_t *clutch_trace_code, uint64_t *arg1, uint64_t *arg2, uint64_t *arg3, uint64_t *arg4); diff --git a/tests/sched/sched_test_harness/sched_clutch_harness_impl.c b/tests/sched/sched_test_harness/sched_clutch_harness_impl.c index 354db2b7b..2e39417b9 100644 --- a/tests/sched/sched_test_harness/sched_clutch_harness_impl.c +++ b/tests/sched/sched_test_harness/sched_clutch_harness_impl.c @@ -19,8 +19,9 @@ #include "shadow_headers/sched_prim.c" static test_hw_topology_t curr_hw_topo = { - .num_psets = 0, .psets = NULL, + .num_psets = 0, + .total_cpus = 0, }; static int _curr_cpu = 0; @@ -30,6 +31,12 @@ ml_get_cluster_count(void) return (unsigned int)curr_hw_topo.num_psets; } +unsigned int +ml_get_cpu_count(void) +{ + return (unsigned int)curr_hw_topo.total_cpus; +} + /* * Mocked HW details * For simplicity, we mock a platform with 1 pset comprised of 1 CPU @@ -43,6 +50,9 @@ static struct processor *cpus[MAX_CPUS]; struct processor_set pset0; struct processor cpu0; +/* pset_nodes indexed by CPU type */ +pset_node_t pset_node_by_cpu_type[TEST_CPU_TYPE_MAX]; + /* Mocked-out Clutch functions */ static boolean_t sched_thread_sched_pri_promoted(thread_t thread) @@ -53,6 +63,10 @@ sched_thread_sched_pri_promoted(thread_t thread) /* Clutch policy code under-test, safe to include now after satisfying its dependencies */ #include +#include + +/* Realtime policy code under-test */ +#include /* Implementation of sched_clutch_harness.h interface */ @@ -75,11 +89,13 @@ unsigned int CLUTCH_THREAD_SELECT = -1; static test_pset_t single_pset = { .cpu_type = TEST_CPU_TYPE_PERFORMANCE, .num_cpus = 1, + .cluster_id = 0, .die_id = 0, }; test_hw_topology_t single_core = { .psets = &single_pset, .num_psets = 1, + .total_cpus = 1, }; static char @@ -95,11 +111,25 @@ 
test_cpu_type_to_char(test_cpu_type_t cpu_type) } } +static uint64_t unique_tg_id = 0; +static uint64_t unique_thread_id = 0; + void clutch_impl_init_topology(test_hw_topology_t hw_topology) { printf("🗺️ Mock HW Topology: %d psets {", hw_topology.num_psets); assert(hw_topology.num_psets <= MAX_PSETS); + + /* Initialize pset nodes for each distinct CPU type. */ + for (int i = 0; i < hw_topology.num_psets; i++) { + if (pset_node_by_cpu_type[hw_topology.psets[i].cpu_type] == PSET_NODE_NULL) { + pset_node_by_cpu_type[hw_topology.psets[i].cpu_type] = (pset_node_t) malloc(sizeof(struct pset_node)); + pset_node_t node = pset_node_by_cpu_type[hw_topology.psets[i].cpu_type]; + bzero(&node->pset_map, sizeof(node->pset_map)); + node->psets = PROCESSOR_SET_NULL; + } + } + int total_cpus = 0; for (int i = 0; i < hw_topology.num_psets; i++) { assert((total_cpus + hw_topology.psets[i].num_cpus) <= MAX_CPUS); @@ -111,7 +141,15 @@ clutch_impl_init_topology(test_hw_topology_t hw_topology) psets[i]->pset_cluster_id = i; psets[i]->pset_id = i; psets[i]->cpu_set_low = total_cpus; + psets[i]->cpu_set_count = hw_topology.psets[i].num_cpus; psets[i]->cpu_bitmask = 0; + + pset_node_t node = pset_node_by_cpu_type[hw_topology.psets[i].cpu_type]; + psets[i]->node = node; + psets[i]->pset_list = node->psets; + node->psets = psets[i]; + node->pset_map |= BIT(i); + printf(" (%d: %d %c CPUs)", i, hw_topology.psets[i].num_cpus, test_cpu_type_to_char(hw_topology.psets[i].cpu_type)); for (int c = total_cpus; c < total_cpus + hw_topology.psets[i].num_cpus; c++) { if (c == 0) { @@ -122,18 +160,26 @@ clutch_impl_init_topology(test_hw_topology_t hw_topology) cpus[c]->cpu_id = c; cpus[c]->processor_set = psets[i]; bit_set(psets[i]->cpu_bitmask, c); - cpus[c]->active_thread = NULL; + struct thread_group *not_real_idle_tg = create_tg(0); + thread_t idle_thread = clutch_impl_create_thread(TH_BUCKET_SHARE_BG, not_real_idle_tg, IDLEPRI); + idle_thread->bound_processor = cpus[c]; + idle_thread->state = (TH_RUN | TH_IDLE); + cpus[c]->idle_thread = idle_thread; + cpus[c]->active_thread = cpus[c]->idle_thread; + cpus[c]->state = PROCESSOR_IDLE; } psets[i]->recommended_bitmask = psets[i]->cpu_bitmask; psets[i]->cpu_available_map = psets[i]->cpu_bitmask; + bzero(&psets[i]->realtime_map, sizeof(psets[i]->realtime_map)); total_cpus += hw_topology.psets[i].num_cpus; } processor_avail_count = total_cpus; printf(" }\n"); + /* After mock idle thread creation, reset thread/TG start IDs, as the idle threads shouldn't count! 
*/ + unique_tg_id = 0; + unique_thread_id = 0; } -static uint64_t unique_tg_id = 0; -static uint64_t unique_thread_id = 0; #define NUM_LOGGED_TRACE_CODES 1 #define NUM_TRACEPOINT_FIELDS 5 static uint64_t logged_trace_codes[NUM_LOGGED_TRACE_CODES]; @@ -186,10 +232,12 @@ clutch_impl_create_thread(int root_bucket, struct thread_group *tg, int pri) thread_t thread = malloc(sizeof(struct thread)); thread->base_pri = pri; thread->sched_pri = pri; + thread->sched_flags = 0; thread->thread_group = tg; thread->th_sched_bucket = root_bucket; thread->bound_processor = NULL; thread->__runq.runq = PROCESSOR_NULL; + queue_chain_init(thread->runq_links); thread->thread_id = unique_thread_id++; #if CONFIG_SCHED_EDGE thread->th_bound_cluster_enqueued = false; @@ -197,19 +245,31 @@ clutch_impl_create_thread(int root_bucket, struct thread_group *tg, int pri) thread->th_shared_rsrc_enqueued[shared_rsrc_type] = false; thread->th_shared_rsrc_heavy_user[shared_rsrc_type] = false; thread->th_shared_rsrc_heavy_perf_control[shared_rsrc_type] = false; + thread->th_expired_quantum_on_lower_core = false; + thread->th_expired_quantum_on_higher_core = false; } #endif /* CONFIG_SCHED_EDGE */ thread->th_bound_cluster_id = THREAD_BOUND_CLUSTER_NONE; thread->reason = AST_NONE; thread->sched_mode = TH_MODE_TIMESHARE; - thread->sched_flags = 0; + bzero(&thread->realtime, sizeof(thread->realtime)); + thread->last_made_runnable_time = 0; + thread->state = TH_RUN; return thread; } + void -clutch_impl_set_thread_sched_mode(test_thread_t thread, int mode) +impl_set_thread_sched_mode(test_thread_t thread, int mode) { ((thread_t)thread)->sched_mode = (sched_mode_t)mode; } + +bool +impl_get_thread_is_realtime(test_thread_t thread) +{ + return ((thread_t)thread)->sched_pri >= BASEPRI_RTQUEUES; +} + void clutch_impl_set_thread_processor_bound(test_thread_t thread, int cpu_id) { @@ -220,17 +280,28 @@ void clutch_impl_cpu_set_thread_current(int cpu_id, test_thread_t thread) { cpus[cpu_id]->active_thread = thread; - cpus[cpu_id]->first_timeslice = true; + cpus[cpu_id]->first_timeslice = TRUE; /* Equivalent logic of processor_state_update_from_thread() */ cpus[cpu_id]->current_pri = ((thread_t)thread)->sched_pri; cpus[cpu_id]->current_thread_group = ((thread_t)thread)->thread_group; cpus[cpu_id]->current_is_bound = ((thread_t)thread)->bound_processor != PROCESSOR_NULL; + + if (((thread_t) thread)->sched_pri >= BASEPRI_RTQUEUES) { + bit_set(cpus[cpu_id]->processor_set->realtime_map, cpu_id); + cpus[cpu_id]->deadline = ((thread_t) thread)->realtime.deadline; + } else { + bit_clear(cpus[cpu_id]->processor_set->realtime_map, cpu_id); + cpus[cpu_id]->deadline = UINT64_MAX; + } } -void +test_thread_t clutch_impl_cpu_clear_thread_current(int cpu_id) { - cpus[cpu_id]->active_thread = NULL; + test_thread_t thread = cpus[cpu_id]->active_thread; + cpus[cpu_id]->active_thread = cpus[cpu_id]->idle_thread; + bit_clear(cpus[cpu_id]->processor_set->realtime_map, cpu_id); + return thread; } static bool @@ -284,3 +355,96 @@ clutch_impl_pop_tracepoint(uint64_t *clutch_trace_code, uint64_t *arg1, uint64_t *arg4 = logged_tracepoints[expect_tracepoint_ind * NUM_TRACEPOINT_FIELDS + 4]; expect_tracepoint_ind++; } + +#pragma mark - Realtime + +static test_thread_t +impl_dequeue_realtime_thread(processor_set_t pset) +{ + thread_t thread = rt_runq_dequeue(&pset->rt_runq); + pset_update_rt_stealable_state(pset); + return thread; +} + +void +impl_set_thread_realtime(test_thread_t thread, uint32_t period, uint32_t computation, uint32_t constraint, bool preemptible, 
uint8_t priority_offset, uint64_t deadline) +{ + thread_t t = (thread_t) thread; + t->realtime.period = period; + t->realtime.computation = computation; + t->realtime.constraint = constraint; + t->realtime.preemptible = preemptible; + t->realtime.priority_offset = priority_offset; + t->realtime.deadline = deadline; +} + +void +impl_sched_rt_spill_policy_set(unsigned policy) +{ + sched_rt_spill_policy = policy; +} + +void +impl_sched_rt_steal_policy_set(unsigned policy) +{ + sched_rt_steal_policy = policy; +} + +void +impl_sched_rt_init_completed() +{ + sched_rt_init_completed(); +} + +#pragma mark -- IPI Subsystem + +sched_ipi_type_t +sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event) +{ + /* Forward to the policy-specific implementation */ + return SCHED(ipi_policy)(dst, thread, (dst->active_thread == dst->idle_thread), event); +} + +#define MAX_LOGGED_IPIS 10000 +typedef struct { + int cpu_id; + sched_ipi_type_t ipi_type; +} logged_ipi_t; +static logged_ipi_t logged_ipis[MAX_LOGGED_IPIS]; +static uint32_t curr_ipi_ind = 0; +static uint32_t expect_ipi_ind = 0; + +void +sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi) +{ + /* Record the IPI type and where we sent it */ + logged_ipis[curr_ipi_ind].cpu_id = dst->cpu_id; + logged_ipis[curr_ipi_ind].ipi_type = ipi; + curr_ipi_ind++; +} + +sched_ipi_type_t +sched_ipi_policy(processor_t dst, thread_t thread, + boolean_t dst_idle, sched_ipi_event_t event) +{ + (void)dst; + (void)thread; + (void)dst_idle; + (void)event; + if (event == SCHED_IPI_EVENT_REBALANCE) { + return SCHED_IPI_IMMEDIATE; + } + /* For now, default to deferred IPI */ + return SCHED_IPI_DEFERRED; +} + +sched_ipi_type_t +sched_ipi_deferred_policy(processor_set_t pset, + processor_t dst, thread_t thread, sched_ipi_event_t event) +{ + (void)pset; + (void)dst; + (void)thread; + (void)event; + return SCHED_IPI_DEFERRED; +} diff --git a/tests/sched/sched_test_harness/sched_edge_harness.c b/tests/sched/sched_test_harness/sched_edge_harness.c index d829aefa9..a769046dd 100644 --- a/tests/sched/sched_test_harness/sched_edge_harness.c +++ b/tests/sched/sched_test_harness/sched_edge_harness.c @@ -30,7 +30,14 @@ ml_cpu_signal_deferred_get_timer(void) static test_cpu_type_t cluster_type_to_test_cpu_type(cluster_type_t cluster_type) { - return (test_cpu_type_t)(cluster_type - 1); + switch (cluster_type) { + case CLUSTER_TYPE_E: + return TEST_CPU_TYPE_EFFICIENCY; + case CLUSTER_TYPE_P: + return TEST_CPU_TYPE_PERFORMANCE; + default: + assert(false); + } } static unsigned int cpu_count_for_type[TEST_CPU_TYPE_MAX] = { 0 }; @@ -58,112 +65,73 @@ ml_get_cluster_number_type(cluster_type_t cluster_type) int sched_amp_spill_deferred_ipi = 1; int sched_amp_pcores_preempt_immediate_ipi = 1; -sched_ipi_type_t -sched_ipi_action(processor_t dst, thread_t thread, sched_ipi_event_t event) -{ - /* Forward to the policy-specific implementation */ - return SCHED(ipi_policy)(dst, thread, (dst->active_thread == NULL), event); -} - -#define MAX_LOGGED_IPIS 10000 -typedef struct { - int cpu_id; - sched_ipi_type_t ipi_type; -} logged_ipi_t; -static logged_ipi_t logged_ipis[MAX_LOGGED_IPIS]; -static uint32_t curr_ipi_ind = 0; -static uint32_t expect_ipi_ind = 0; - -void -sched_ipi_perform(processor_t dst, sched_ipi_type_t ipi) -{ - /* Record the IPI type and where we sent it */ - logged_ipis[curr_ipi_ind].cpu_id = dst->cpu_id; - logged_ipis[curr_ipi_ind].ipi_type = ipi; - curr_ipi_ind++; -} - -sched_ipi_type_t -sched_ipi_policy(processor_t dst, thread_t thread, - boolean_t dst_idle, 
sched_ipi_event_t event) -{ - (void)dst; - (void)thread; - (void)dst_idle; - (void)event; - /* For now, only send IPIs based on a policy-specific decision */ - return SCHED_IPI_NONE; -} - -sched_ipi_type_t -sched_ipi_deferred_policy(processor_set_t pset, - processor_t dst, thread_t thread, sched_ipi_event_t event) -{ - (void)pset; - (void)dst; - (void)thread; - (void)event; - return SCHED_IPI_NONE; -} - /* Implementation of sched_runqueue_harness.h interface */ static test_pset_t basic_amp_psets[2] = { { .cpu_type = TEST_CPU_TYPE_PERFORMANCE, .num_cpus = 2, + .cluster_id = 0, .die_id = 0, }, { .cpu_type = TEST_CPU_TYPE_EFFICIENCY, .num_cpus = 4, + .cluster_id = 1, .die_id = 0, }, }; test_hw_topology_t basic_amp = { .psets = &basic_amp_psets[0], .num_psets = 2, + .total_cpus = 6, }; static test_pset_t dual_die_psets[6] = { { .cpu_type = TEST_CPU_TYPE_EFFICIENCY, .num_cpus = 2, + .cluster_id = 0, .die_id = 0, }, { .cpu_type = TEST_CPU_TYPE_PERFORMANCE, .num_cpus = 4, + .cluster_id = 1, .die_id = 0, }, { .cpu_type = TEST_CPU_TYPE_PERFORMANCE, .num_cpus = 4, + .cluster_id = 2, .die_id = 0, }, { .cpu_type = TEST_CPU_TYPE_EFFICIENCY, .num_cpus = 2, + .cluster_id = 3, .die_id = 1, }, { .cpu_type = TEST_CPU_TYPE_PERFORMANCE, .num_cpus = 4, + .cluster_id = 4, .die_id = 1, }, { .cpu_type = TEST_CPU_TYPE_PERFORMANCE, .num_cpus = 4, + .cluster_id = 5, .die_id = 1, }, }; test_hw_topology_t dual_die = { .psets = &dual_die_psets[0], .num_psets = 6, + .total_cpus = 20, }; #define MAX_NODES 2 -static struct pset_node node_array[MAX_NODES]; static void edge_impl_set_cluster_type(processor_set_t pset, test_cpu_type_t type) @@ -171,11 +139,13 @@ edge_impl_set_cluster_type(processor_set_t pset, test_cpu_type_t type) switch (type) { case TEST_CPU_TYPE_EFFICIENCY: pset->pset_cluster_type = PSET_AMP_E; - pset->node = &node_array[0]; + pset->node = &pset_nodes[0]; + bitmap_set(&pset_nodes[0].pset_map, pset->pset_cluster_id); break; case TEST_CPU_TYPE_PERFORMANCE: pset->pset_cluster_type = PSET_AMP_P; - pset->node = &node_array[1]; + pset->node = &pset_nodes[1]; + bitmap_set(&pset_nodes[1].pset_map, pset->pset_cluster_id); break; default: assert(false); @@ -183,18 +153,21 @@ edge_impl_set_cluster_type(processor_set_t pset, test_cpu_type_t type) } } +struct mock_topology_info_struct mock_topology_info; + static void edge_impl_init_runqueues(void) { assert(curr_hw_topo.num_psets != 0); clutch_impl_init_topology(curr_hw_topo); + mock_topology_info.num_cpus = curr_hw_topo.total_cpus; sched_edge_init(); - node_array[0].pset_cluster_type = PSET_AMP_E; - os_atomic_store(&node_array[0].pset_recommended_map, 0, relaxed); - atomic_bit_set(&node_array[0].pset_recommended_map, 0, memory_order_relaxed); - node_array[1].pset_cluster_type = PSET_AMP_P; - os_atomic_store(&node_array[1].pset_recommended_map, 0, relaxed); - atomic_bit_set(&node_array[1].pset_recommended_map, 1, memory_order_relaxed); + bzero(pset_nodes, sizeof(pset_nodes)); + pset_nodes[0].pset_cluster_type = PSET_AMP_E; + pset_nodes[1].pset_cluster_type = PSET_AMP_P; + for (int i = 0; i < MAX_NODES; i++) { + os_atomic_store(&pset_nodes[i].pset_recommended_map, 0, relaxed); + } for (int i = 0; i < curr_hw_topo.num_psets; i++) { pset_array[i] = psets[i]; edge_impl_set_cluster_type(psets[i], curr_hw_topo.psets[i].cpu_type); @@ -210,12 +183,21 @@ edge_impl_init_runqueues(void) cpu_count_for_type[curr_hw_topo.psets[i].cpu_type] += curr_hw_topo.psets[i].num_cpus; recommended_cpu_count_for_type[curr_hw_topo.psets[i].cpu_type] += curr_hw_topo.psets[i].num_cpus; + 
impl_set_pset_recommended(i); + psets[i]->cpu_running_foreign = 0; + for (uint state = 0; state < PROCESSOR_STATE_LEN; state++) { + psets[i]->cpu_state_map[state] = 0; + } + /* Initialize realtime queues */ + pset_rt_init(psets[i]); } for (unsigned int j = 0; j < processor_avail_count; j++) { processor_array[j] = cpus[j]; sched_clutch_processor_init(cpus[j]); + os_atomic_store(&cpus[j]->stir_the_pot_inbox_cpu, -1, relaxed); } sched_edge_cpu_init_completed(); + sched_rt_init_completed(); increment_mock_time(100); clutch_impl_init_params(); clutch_impl_init_tracepoints(); @@ -249,12 +231,6 @@ impl_create_thread(int root_bucket, struct thread_group *tg, int pri) return clutch_impl_create_thread(root_bucket, tg, pri); } -void -impl_set_thread_sched_mode(test_thread_t thread, int mode) -{ - clutch_impl_set_thread_sched_mode(thread, mode); -} - void impl_set_thread_processor_bound(test_thread_t thread, int cpu_id) { @@ -270,31 +246,78 @@ impl_set_thread_cluster_bound(test_thread_t thread, int cluster_id) ((thread_t)thread)->th_bound_cluster_id = cluster_id; } +static void +processor_state_update_running_foreign(processor_t processor, thread_t thread) +{ + cluster_type_t current_processor_type = pset_type_for_id(processor->processor_set->pset_cluster_id); + cluster_type_t thread_type = pset_type_for_id(sched_edge_thread_preferred_cluster(thread)); + + boolean_t non_rt_thr = (processor->current_pri < BASEPRI_RTQUEUES); + boolean_t non_bound_thr = (thread->bound_processor == PROCESSOR_NULL); + if (non_rt_thr && non_bound_thr && (current_processor_type != thread_type)) { + bit_set(processor->processor_set->cpu_running_foreign, processor->cpu_id); + } else { + bit_clear(processor->processor_set->cpu_running_foreign, processor->cpu_id); + } +} + void impl_cpu_set_thread_current(int cpu_id, test_thread_t thread) { _curr_cpu = cpu_id; + processor_set_t pset = cpus[cpu_id]->processor_set; clutch_impl_cpu_set_thread_current(cpu_id, thread); + processor_state_update_running_foreign(cpus[cpu_id], (thread_t)thread); + pset_update_processor_state(pset, cpus[cpu_id], PROCESSOR_RUNNING); + sched_bucket_t bucket = ((((thread_t)thread)->state & TH_IDLE) || (((thread_t)thread)->bound_processor != PROCESSOR_NULL)) ? 
TH_BUCKET_SCHED_MAX : ((thread_t)thread)->th_sched_bucket; + os_atomic_store(&cpus[cpu_id]->processor_set->cpu_running_buckets[cpu_id], bucket, relaxed); + sched_edge_stir_the_pot_update_registry_state((thread_t)thread); + + /* Send followup IPIs for realtime, as needed */ + bit_clear(pset->rt_pending_spill_cpu_mask, cpu_id); + processor_t next_rt_processor = PROCESSOR_NULL; + sched_ipi_type_t next_rt_ipi_type = SCHED_IPI_NONE; + if (rt_pset_has_stealable_threads(pset)) { + rt_choose_next_processor_for_spill_IPI(pset, cpus[cpu_id], &next_rt_processor, &next_rt_ipi_type); + } else if (rt_pset_needs_a_followup_IPI(pset)) { + rt_choose_next_processor_for_followup_IPI(pset, cpus[cpu_id], &next_rt_processor, &next_rt_ipi_type); + } + if (next_rt_processor != PROCESSOR_NULL) { + sched_ipi_perform(next_rt_processor, next_rt_ipi_type); + } } -void +test_thread_t impl_cpu_clear_thread_current(int cpu_id) { _curr_cpu = cpu_id; - clutch_impl_cpu_clear_thread_current(cpu_id); + test_thread_t thread = clutch_impl_cpu_clear_thread_current(cpu_id); + pset_update_processor_state(cpus[cpu_id]->processor_set, cpus[cpu_id], PROCESSOR_IDLE); + os_atomic_store(&cpus[cpu_id]->processor_set->cpu_running_buckets[cpu_id], TH_BUCKET_SCHED_MAX, relaxed); + sched_edge_stir_the_pot_clear_registry_entry(); + return thread; } void impl_cpu_enqueue_thread(int cpu_id, test_thread_t thread) { _curr_cpu = cpu_id; - sched_clutch_processor_enqueue(cpus[cpu_id], thread, SCHED_TAILQ); + if (((thread_t) thread)->sched_pri >= BASEPRI_RTQUEUES) { + rt_runq_insert(cpus[cpu_id], cpus[cpu_id]->processor_set, (thread_t) thread); + } else { + sched_clutch_processor_enqueue(cpus[cpu_id], (thread_t) thread, SCHED_TAILQ); + } } test_thread_t impl_cpu_dequeue_thread(int cpu_id) { _curr_cpu = cpu_id; + test_thread_t chosen_thread = sched_rt_choose_thread(cpus[cpu_id]); + if (chosen_thread != THREAD_NULL) { + return chosen_thread; + } + /* No realtime threads. */ return sched_clutch_choose_thread(cpus[cpu_id], MINPRI, NULL, 0); } @@ -303,7 +326,12 @@ impl_cpu_dequeue_thread_compare_current(int cpu_id) { _curr_cpu = cpu_id; assert(cpus[cpu_id]->active_thread != NULL); - return sched_clutch_choose_thread(cpus[cpu_id], MINPRI, cpus[cpu_id]->active_thread, 0); + processor_set_t pset = cpus[cpu_id]->processor_set; + if (rt_runq_count(pset) > 0) { + return impl_dequeue_realtime_thread(pset); + } else { + return sched_clutch_choose_thread(cpus[cpu_id], MINPRI, cpus[cpu_id]->active_thread, 0); + } } bool @@ -326,11 +354,41 @@ int impl_choose_pset_for_thread(test_thread_t thread) { /* Begins search starting from current pset */ + sched_options_t options = SCHED_NONE; processor_t chosen_processor = sched_edge_choose_processor( - current_processor()->processor_set, current_processor(), (thread_t)thread); + current_processor()->processor_set, current_processor(), (thread_t)thread, &options); return chosen_processor->processor_set->pset_id; } +bool +impl_thread_avoid_processor(test_thread_t thread, int cpu_id, bool quantum_expired) +{ + _curr_cpu = cpu_id; + return sched_edge_thread_avoid_processor(cpus[cpu_id], (thread_t)thread, quantum_expired ? 
AST_QUANTUM : AST_NONE); +} + +void +impl_cpu_expire_quantum(int cpu_id) +{ + _curr_cpu = cpu_id; + sched_edge_quantum_expire(cpus[cpu_id]->active_thread); + cpus[cpu_id]->first_timeslice = FALSE; +} + +test_thread_t +impl_steal_thread(int cpu_id) +{ + _curr_cpu = cpu_id; + return sched_edge_processor_idle(psets[cpu_id_to_pset_id(cpu_id)]); +} + +bool +impl_processor_balance(int cpu_id) +{ + _curr_cpu = cpu_id; + return sched_edge_balance(cpus[cpu_id], psets[cpu_id_to_pset_id(cpu_id)]); +} + void impl_set_current_processor(int cpu_id) { @@ -340,7 +398,7 @@ impl_set_current_processor(int cpu_id) void impl_set_tg_sched_bucket_preferred_pset(struct thread_group *tg, int sched_bucket, int cluster_id) { - assert(sched_bucket > 0 && sched_bucket < TH_BUCKET_SCHED_MAX); + assert(sched_bucket < TH_BUCKET_SCHED_MAX); sched_clutch_t clutch = sched_clutch_for_thread_group(tg); bitmap_t modify_bitmap[BITMAP_LEN(TH_BUCKET_SCHED_MAX)] = {0}; bitmap_set(modify_bitmap, sched_bucket); @@ -409,8 +467,55 @@ impl_send_ipi(int cpu_id, test_thread_t thread, test_ipi_event_t event) sched_ipi_perform(cpus[cpu_id], triggered_ipi); } +int +rt_pset_spill_search_order_at_offset(int src_pset_id, int offset) +{ + return psets[src_pset_id]->sched_rt_spill_search_order.spso_search_order[offset]; +} + +void +rt_pset_recompute_spill_order(int src_pset_id) +{ + sched_rt_config_pset_push(psets[src_pset_id]); +} + uint32_t impl_qos_max_parallelism(int qos, uint64_t options) { return sched_edge_qos_max_parallelism(qos, options); } + +int * +impl_iterate_pset_search_order(int src_pset_id, uint64_t candidate_map, int sched_bucket) +{ + int *psets = (int *)malloc(sizeof(int) * curr_hw_topo.num_psets); + for (int i = 0; i < curr_hw_topo.num_psets; i++) { + psets[i] = -1; + } + sched_pset_iterate_state_t istate = SCHED_PSET_ITERATE_STATE_INIT; + int ind = 0; + processor_set_t starting_pset = pset_array[src_pset_id]; + while (sched_iterate_psets_ordered(starting_pset, + &starting_pset->spill_search_order[sched_bucket], candidate_map, &istate)) { + psets[ind++] = istate.spis_pset_id; + } + return psets; +} + +test_thread_t +impl_rt_choose_thread(int cpu_id) +{ + return sched_rt_choose_thread(cpus[cpu_id]); +} + +void +sched_rt_spill_policy_set(unsigned policy) +{ + impl_sched_rt_spill_policy_set(policy); +} + +void +sched_rt_steal_policy_set(unsigned policy) +{ + impl_sched_rt_steal_policy_set(policy); +} diff --git a/tests/sched/sched_test_harness/sched_edge_harness.h b/tests/sched/sched_test_harness/sched_edge_harness.h index ce4cf5a51..038271a04 100644 --- a/tests/sched/sched_test_harness/sched_edge_harness.h +++ b/tests/sched/sched_test_harness/sched_edge_harness.h @@ -4,5 +4,19 @@ #include "sched_harness_impl.h" #include "sched_clutch_harness.h" +/* To get sched_clutch_edge and cluster_shared_rsrc_type_t */ +#include +/* To get PSET_ID_INVALID */ +#include extern void edge_set_thread_shared_rsrc(test_thread_t thread, bool native_first); + +#pragma mark - Realtime + +extern void sched_rt_config_set(uint8_t src, uint8_t dst, sched_clutch_edge edge); +extern sched_clutch_edge sched_rt_config_get(uint8_t src, uint8_t dst); +extern uint64_t rt_deadline_add(uint64_t d, uint64_t e); +extern void rt_pset_recompute_spill_order(int src_pset_id); +extern int rt_pset_spill_search_order_at_offset(int src_pset_id, int offset); +extern void sched_rt_spill_policy_set(unsigned policy); +extern void sched_rt_steal_policy_set(unsigned policy); diff --git a/tests/sched/sched_test_harness/sched_harness_impl.h 
b/tests/sched/sched_test_harness/sched_harness_impl.h index 64e918774..c20657b8f 100644 --- a/tests/sched/sched_test_harness/sched_harness_impl.h +++ b/tests/sched/sched_test_harness/sched_harness_impl.h @@ -12,9 +12,10 @@ extern void impl_init_runqueue(void); extern struct thread_group *impl_create_tg(int interactivity_score); extern test_thread_t impl_create_thread(int th_sched_bucket, struct thread_group *tg, int pri); extern void impl_set_thread_sched_mode(test_thread_t thread, int mode); +extern bool impl_get_thread_is_realtime(test_thread_t thread); extern void impl_set_thread_processor_bound(test_thread_t thread, int cpu_id); extern void impl_cpu_set_thread_current(int cpu_id, test_thread_t thread); -extern void impl_cpu_clear_thread_current(int cpu_id); +extern test_thread_t impl_cpu_clear_thread_current(int cpu_id); extern void impl_cpu_enqueue_thread(int cpu_id, test_thread_t thread); extern test_thread_t impl_cpu_dequeue_thread(int cpu_id); extern test_thread_t impl_cpu_dequeue_thread_compare_current(int cpu_id); @@ -29,8 +30,18 @@ extern void impl_init_migration_harness(test_hw_topology_t hw_t extern void impl_set_tg_sched_bucket_preferred_pset(struct thread_group *tg, int sched_bucket, int cluster_id); extern void impl_set_thread_cluster_bound(test_thread_t thread, int cluster_id); extern int impl_choose_pset_for_thread(test_thread_t thread); +extern bool impl_thread_avoid_processor(test_thread_t thread, int cpu_id, bool quantum_expiry); +extern void impl_cpu_expire_quantum(int cpu_id); +extern test_thread_t impl_steal_thread(int cpu_id); +extern bool impl_processor_balance(int cpu_id); extern void impl_set_current_processor(int cpu_id); extern void impl_set_pset_load_avg(int cluster_id, int QoS, uint64_t load_avg); extern void impl_set_pset_derecommended(int cluster_id); +extern int impl_iterate_pset_search_order_rt(int src_pset_id, int offset); extern void impl_set_pset_recommended(int cluster_id); extern uint32_t impl_qos_max_parallelism(int qos, uint64_t options); +extern int *impl_iterate_pset_search_order(int src_pset_id, uint64_t candidate_map, int sched_bucket); + +/* Realtime */ +extern void impl_set_thread_realtime(test_thread_t thread, uint32_t period, uint32_t computation, uint32_t constraint, bool preemptible, uint8_t priority_offset, uint64_t deadline); +extern test_thread_t impl_rt_choose_thread(int cpu_id); diff --git a/tests/sched/sched_test_harness/sched_migration_harness.c b/tests/sched/sched_test_harness/sched_migration_harness.c index b6623008d..621b2cea3 100644 --- a/tests/sched/sched_test_harness/sched_migration_harness.c +++ b/tests/sched/sched_test_harness/sched_migration_harness.c @@ -15,6 +15,7 @@ init_migration_harness(test_hw_topology_t hw_topology) /* Sets up _log and ATEND to close it */ init_harness_logging(T_NAME); assert(_log != NULL); + assert(hw_topology.num_psets > 0 && hw_topology.total_cpus > 0); fprintf(_log, "\tinitializing migration harness\n"); set_hw_topology(hw_topology); @@ -35,15 +36,55 @@ set_thread_cluster_bound(test_thread_t thread, int cluster_id) impl_set_thread_cluster_bound(thread, cluster_id); } +int +choose_pset_for_thread(test_thread_t thread) +{ + int chosen_pset_id = impl_choose_pset_for_thread(thread); + fprintf(_log, "for thread %p we chose pset_id %d\n", (void *)thread, chosen_pset_id); + return chosen_pset_id; +} + bool choose_pset_for_thread_expect(test_thread_t thread, int expected_cluster_id) { - int chosen_pset_id = impl_choose_pset_for_thread(thread); + int chosen_pset_id = choose_pset_for_thread(thread); 
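/*
 * Illustrative sketch only (not part of this patch): roughly how a policy darwintest might
 * exercise the new realtime-aware harness entry points, assuming the usual harness includes
 * (sched_policy_darwintest.h, sched_runqueue_harness.h, sched_migration_harness.h). The
 * topology, priorities, realtime parameters, and expectations below are assumptions chosen
 * for the example, not behavior guaranteed by the patch.
 */
SCHED_POLICY_T_DECL(rt_runqueue_example, "sketch: realtime thread round-trips through the RT runqueue")
{
	init_migration_harness(basic_amp);    /* 2 P-cores + 4 E-cores */
	set_current_processor(0);

	struct thread_group *tg = create_tg(0);

	/* A timeshare thread should land on one of the psets */
	test_thread_t ts_thread = create_thread(TH_BUCKET_SHARE_DF, tg, 31);
	int chosen = choose_pset_for_thread(ts_thread);
	T_QUIET; T_EXPECT_GE(chosen, 0, "chose a valid pset");

	/* A thread at priority >= BASEPRI_RTQUEUES (assumed 97) is treated as realtime by the harness */
	test_thread_t rt_thread = create_thread(TH_BUCKET_FIXPRI, tg, 97);
	set_thread_realtime(rt_thread, 0 /* period */, 5000 /* computation */, 10000 /* constraint */,
	    true /* preemptible */, 0 /* priority_offset */, 100000 /* deadline */);
	enqueue_thread(pset_target(0), rt_thread);
	T_EXPECT_TRUE(dequeue_thread_expect(pset_target(0), rt_thread), "realtime thread dequeued first");
}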
fprintf(_log, "%s: for thread %p we chose pset_id %d, expecting %d\n", chosen_pset_id == expected_cluster_id ? "PASS" : "FAIL", (void *)thread, chosen_pset_id, expected_cluster_id); return chosen_pset_id == expected_cluster_id; } +bool +thread_avoid_processor_expect(test_thread_t thread, int cpu_id, bool quantum_expiry, bool avoid_expected) +{ + bool avoiding = impl_thread_avoid_processor(thread, cpu_id, quantum_expiry); + fprintf(_log, "%s: thread %p would avoid cpu %d? %d, expecting to avoid? %d\n", avoiding == avoid_expected ? + "PASS" : "FAIL", (void *)thread, cpu_id, avoiding, avoid_expected); + return avoiding == avoid_expected; +} + +void +cpu_expire_quantum(int cpu_id) +{ + impl_cpu_expire_quantum(cpu_id); + fprintf(_log, "cpu %d expired quantum\n", cpu_id); +} + +test_thread_t +cpu_steal_thread(int cpu_id) +{ + test_thread_t stolen_thread = impl_steal_thread(cpu_id); + fprintf(_log, "on cpu %d, stole thread %p\n", cpu_id, (void *)stolen_thread); + return stolen_thread; +} + +bool +cpu_processor_balance(int cpu_id) +{ + bool doing_rebalance = impl_processor_balance(cpu_id); + fprintf(_log, "on cpu %d, doing rebalance? %d\n", cpu_id, doing_rebalance); + return doing_rebalance; +} + void set_current_processor(int cpu_id) { @@ -109,3 +150,25 @@ max_parallelism_expect(int qos, uint64_t options, uint32_t expected_parallelism) expected_parallelism, qos, options, found_parallelism); return found_parallelism == expected_parallelism; } + +int +iterate_pset_search_order_expect(int src_pset_id, uint64_t candidate_map, int sched_bucket, + int *expected_pset_ids, int num_psets) +{ + int *search_order = impl_iterate_pset_search_order(src_pset_id, candidate_map, sched_bucket); + fprintf(_log, "for src pset %d candidate map %llx bucket %d, we found search order:\t", + src_pset_id, candidate_map, sched_bucket); + int first_failure_ind = -1; + for (int i = 0; (i < num_psets) && (search_order[i] != -1); i++) { + fprintf(_log, "%2d ", search_order[i]); + if ((expected_pset_ids[i] != search_order[i]) && (first_failure_ind == -1)) { + first_failure_ind = i; + } + } + fprintf(_log, "\n\t%s: expected search order:\t", (first_failure_ind == -1) ? 
"PASS" : "FAIL"); + for (int i = 0; i < num_psets; i++) { + fprintf(_log, "%2d ", expected_pset_ids[i]); + } + fprintf(_log, "\n"); + return first_failure_ind; +} diff --git a/tests/sched/sched_test_harness/sched_migration_harness.h b/tests/sched/sched_test_harness/sched_migration_harness.h index 3f0f46f9f..2714f558c 100644 --- a/tests/sched/sched_test_harness/sched_migration_harness.h +++ b/tests/sched/sched_test_harness/sched_migration_harness.h @@ -18,16 +18,18 @@ typedef enum { typedef struct { test_cpu_type_t cpu_type; int num_cpus; + int cluster_id; int die_id; } test_pset_t; typedef struct { test_pset_t *psets; int num_psets; + int total_cpus; } test_hw_topology_t; -extern int cpu_id_to_cluster_id(int cpu_id); -extern int cluster_id_to_cpu_id(int cluster_id); +extern int pset_id_to_cpu_id(int pset_id); +extern int cpu_id_to_pset_id(int cpu_id); extern test_hw_topology_t get_hw_topology(void); extern void set_hw_topology(test_hw_topology_t hw_topology); @@ -40,7 +42,12 @@ extern test_hw_topology_t dual_die; // 2E + 4P + 4P + 2E + 4P + 4P extern void init_migration_harness(test_hw_topology_t hw_topology); extern void set_tg_sched_bucket_preferred_pset(struct thread_group *tg, int sched_bucket, int cluster_id); extern void set_thread_cluster_bound(test_thread_t thread, int cluster_id); +extern int choose_pset_for_thread(test_thread_t thread); extern bool choose_pset_for_thread_expect(test_thread_t thread, int expected_cluster_id); +extern test_thread_t cpu_steal_thread(int cpu_id); +extern bool cpu_processor_balance(int cpu_id); +extern bool thread_avoid_processor_expect(test_thread_t thread, int cpu_id, bool quantum_expiry, bool avoid_expected); +extern void cpu_expire_quantum(int cpu_id); extern void set_current_processor(int cpu_id); extern void set_pset_load_avg(int cluster_id, int QoS, uint64_t load_avg); extern void set_pset_recommended(int cluster_id); @@ -64,3 +71,4 @@ extern void cpu_send_ipi_for_thread(int cpu_id, test_thread_t thread, test_ #define QOS_PARALLELISM_REALTIME 0x2 #define QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE 0x4 extern bool max_parallelism_expect(int qos, uint64_t options, uint32_t expected_parallelism); +extern int iterate_pset_search_order_expect(int src_pset_id, uint64_t candidate_map, int sched_bucket, int *expected_pset_ids, int num_psets); diff --git a/tests/sched/sched_test_harness/sched_policy_darwintest.h b/tests/sched/sched_test_harness/sched_policy_darwintest.h index ce8a78393..83194b594 100644 --- a/tests/sched/sched_test_harness/sched_policy_darwintest.h +++ b/tests/sched/sched_test_harness/sched_policy_darwintest.h @@ -36,7 +36,7 @@ sched_policy_final_pass(void) #define PASTER(a, b) a##_##b #define SCHED_POLICY_TEST_NAME(policy_name, test_name) PASTER(policy_name, test_name) -#define SCHED_POLICY_T_DECL(test_name, description, ...) T_DECL(SCHED_POLICY_TEST_NAME(TEST_RUNQ_POLICY, test_name), description, ##__VA_ARGS__) +#define SCHED_POLICY_T_DECL(test_name, description, ...) 
T_DECL(SCHED_POLICY_TEST_NAME(TEST_RUNQ_POLICY, test_name), description, T_META_TAG_VM_PREFERRED, ##__VA_ARGS__) static unsigned int sched_policy_fails_so_far = 0; static unsigned int sched_policy_passes_so_far = 0; @@ -65,3 +65,41 @@ static bool sched_policy_setup_final_pass = false; } \ }) /* END IGNORE CODESTYLE */ + +/* Test scenario metadata printing utilities */ + +#define MAX_METADATA 64 +#define MAX_METADATA_STR 256 +static char metadata_log[MAX_METADATA][MAX_METADATA_STR]; +static int metadata_ind = 0; + +static void +sched_policy_push_metadata(char *metadata_name, uint64_t value) +{ + snprintf(metadata_log[metadata_ind++], MAX_METADATA_STR, "%s %llu", metadata_name, value); +} + +static void +sched_policy_pop_metadata(void) +{ + T_QUIET; T_EXPECT_GT(metadata_ind, 0, "no metadata left to pop"); + metadata_ind--; +} + +#define MAX_METADATA_DUMP_STR (MAX_METADATA_STR * MAX_METADATA) +static char metadata_dump[MAX_METADATA_DUMP_STR]; +static char * +sched_policy_dump_metadata(void) +{ + metadata_dump[0] = '('; + for (int i = 0; i < metadata_ind; i++) { + if (i == 0) { + snprintf(&metadata_dump[1], MAX_METADATA_STR, "%s", metadata_log[i]); + } else { + strcat(metadata_dump, ", "); + strcat(metadata_dump, metadata_log[i]); + } + } + strcat(metadata_dump, ") 🗃️ "); + return metadata_dump; +} diff --git a/tests/sched/sched_test_harness/sched_runqueue_harness.c b/tests/sched/sched_test_harness/sched_runqueue_harness.c index fce109208..c47aa85a8 100644 --- a/tests/sched/sched_test_harness/sched_runqueue_harness.c +++ b/tests/sched/sched_test_harness/sched_runqueue_harness.c @@ -67,33 +67,32 @@ get_hw_topology(void) } int -cpu_id_to_cluster_id(int cpu_id) +pset_id_to_cpu_id(int pset_id) { test_hw_topology_t topo = get_hw_topology(); int cpu_index = 0; for (int p = 0; p < topo.num_psets; p++) { - for (int c = 0; c < topo.psets[p].num_cpus; c++) { - if (cpu_index == cpu_id) { - return (int)p; - } - cpu_index++; - } - } - T_QUIET; T_ASSERT_FAIL("cpu id %d never found out of %d cpus", cpu_id, cpu_index); -} - -int -cluster_id_to_cpu_id(int cluster_id) -{ - test_hw_topology_t topo = get_hw_topology(); - int cpu_index = 0; - for (int p = 0; p < topo.num_psets; p++) { - if (p == cluster_id) { + if (p == pset_id) { return cpu_index; } cpu_index += topo.psets[p].num_cpus; } - T_QUIET; T_ASSERT_FAIL("pset id %d never found out of %d psets", cluster_id, topo.num_psets); + T_QUIET; T_ASSERT_FAIL("pset id %d never found out of %d psets", pset_id, topo.num_psets); +} + +int +cpu_id_to_pset_id(int cpu_id) +{ + test_hw_topology_t topo = get_hw_topology(); + T_QUIET; T_ASSERT_LT(cpu_id, topo.total_cpus, "cpu id out of bounds"); + int cpu_count = 0; + for (int p = 0; p < topo.num_psets; p++) { + cpu_count += topo.psets[p].num_cpus; + if (cpu_id < cpu_count) { + return p; + } + } + T_QUIET; T_ASSERT_FAIL("failed to find pset for cpu %d somehow", cpu_id); } static char _log_filepath[MAXPATHLEN]; @@ -164,6 +163,13 @@ cpu_set_thread_current(int cpu_id, test_thread_t thread) fprintf(_log, "\tset %p as current thread on cpu %d\n", thread, cpu_id); } +test_thread_t +cpu_clear_thread_current(int cpu_id) +{ + fprintf(_log, "\tclearing the current thread from cpu %d\n", cpu_id); + return impl_cpu_clear_thread_current(cpu_id); +} + bool runqueue_empty(test_runq_target_t runq_target) { @@ -176,8 +182,8 @@ runq_target_to_cpu_id(test_runq_target_t runq_target) switch (runq_target.target_type) { case TEST_RUNQ_TARGET_TYPE_CPU: return runq_target.target_id; - case TEST_RUNQ_TARGET_TYPE_CLUSTER: - return cluster_id_to_cpu_id(runq_target.target_id); + 
case TEST_RUNQ_TARGET_TYPE_PSET: + return pset_id_to_cpu_id(runq_target.target_id); default: T_ASSERT_FAIL("unexpected type %d", runq_target.target_type); } @@ -202,11 +208,11 @@ cpu_enqueue_thread(int cpu_id, test_thread_t thread) } test_runq_target_t -cluster_target(int cluster_id) +pset_target(int pset_id) { test_runq_target_t target = { - .target_type = TEST_RUNQ_TARGET_TYPE_CLUSTER, - .target_id = cluster_id, + .target_type = TEST_RUNQ_TARGET_TYPE_PSET, + .target_id = pset_id, }; return target; } @@ -253,7 +259,7 @@ enqueue_threads_rand_order(test_runq_target_t runq_target, unsigned int random_s { va_list args; va_start(args, num_threads); - test_thread_t *tmp = (test_thread_t *)malloc(sizeof(test_thread_t) * (size_t)num_threads); + test_thread_t *tmp = (test_thread_t *)calloc(num_threads, sizeof(test_thread_t)); for (int i = 0; i < num_threads; i++) { test_thread_t thread = va_arg(args, test_thread_t); tmp[i] = thread; @@ -290,11 +296,11 @@ dequeue_thread_expect(test_runq_target_t runq_target, test_thread_t expected_thr if (chosen_thread != expected_thread) { return false; } - if (expected_thread != NULL && auto_current_thread_disabled == false) { + if (expected_thread != NULL && auto_current_thread_disabled == false && impl_get_thread_is_realtime(chosen_thread) == false) { /* * Additionally verify that chosen_thread still gets returned as the highest * thread, even when compared against the remaining runqueue as the currently - * running thread + * running thread. */ cpu_set_thread_current(cpu_id, expected_thread); bool pass = cpu_dequeue_thread_expect_compare_current(cpu_id, expected_thread); @@ -385,3 +391,14 @@ reenable_auto_current_thread(void) { auto_current_thread_disabled = false; } + +#pragma mark - Realtime + +void +set_thread_realtime(test_thread_t thread, uint32_t period, uint32_t computation, uint32_t constraint, bool preemptible, uint8_t priority_offset, uint64_t deadline) +{ + fprintf(_log, "\tsetting realtime deadline on thread %p: period=0x%x, computation=0x%x, constraint=0x%x," + " preemptible=%s, priority_offset=%x, deadline=%llx\n", (void *) thread, period, computation, constraint, + preemptible ? 
"true" : "false", priority_offset, deadline); + impl_set_thread_realtime(thread, period, computation, constraint, preemptible, priority_offset, deadline); +} diff --git a/tests/sched/sched_test_harness/sched_runqueue_harness.h b/tests/sched/sched_test_harness/sched_runqueue_harness.h index bc90a8d3b..16edc6fcb 100644 --- a/tests/sched/sched_test_harness/sched_runqueue_harness.h +++ b/tests/sched/sched_test_harness/sched_runqueue_harness.h @@ -22,7 +22,7 @@ extern void increment_mock_time_us(uint64_t added_us); /* Specifying a runqueue */ typedef enum { TEST_RUNQ_TARGET_TYPE_CPU, - TEST_RUNQ_TARGET_TYPE_CLUSTER, + TEST_RUNQ_TARGET_TYPE_PSET, } test_runq_target_type_t; typedef struct { @@ -33,7 +33,7 @@ typedef struct { extern test_runq_target_t default_target; extern int get_default_cpu(void); -extern test_runq_target_t cluster_target(int cluster_id); +extern test_runq_target_t pset_target(int cluster_id); extern test_runq_target_t cpu_target(int cpu_id); /* Test harness utilities */ @@ -45,6 +45,7 @@ extern test_thread_t create_thread(int th_sched_bucket, struct thread_gr extern void set_thread_sched_mode(test_thread_t thread, int mode); extern void set_thread_processor_bound(test_thread_t thread, int cpu_id); extern void cpu_set_thread_current(int cpu_id, test_thread_t thread); +extern test_thread_t cpu_clear_thread_current(int cpu_id); extern bool runqueue_empty(test_runq_target_t runq_target); extern void enqueue_thread(test_runq_target_t runq_target, test_thread_t thread); extern void enqueue_threads(test_runq_target_t runq_target, int num_threads, ...); @@ -60,3 +61,6 @@ extern bool tracepoint_expect(uint64_t trace_code, uint64_t arg extern void disable_auto_current_thread(void); extern void reenable_auto_current_thread(void); extern bool cpu_check_should_yield(int cpu_id, bool yield_expected); + +/* Realtime thread utilities */ +extern void set_thread_realtime(test_thread_t thread, uint32_t period, uint32_t computation, uint32_t constraint, bool preemptible, uint8_t priority_offset, uint64_t deadline); diff --git a/tests/sched/sched_test_harness/shadow_headers/misc_needed_defines.h b/tests/sched/sched_test_harness/shadow_headers/misc_needed_defines.h index 7f16809e3..3871bed59 100644 --- a/tests/sched/sched_test_harness/shadow_headers/misc_needed_defines.h +++ b/tests/sched/sched_test_harness/shadow_headers/misc_needed_defines.h @@ -3,6 +3,8 @@ #ifndef _MISC_NEEDED_DEFINES_H_ #define _MISC_NEEDED_DEFINES_H_ +#include + /* * Include non-kernel header dependencies to make up for the equivalent kernel header * dependencies which are not safe to compile in a userspace binary @@ -32,6 +34,7 @@ typedef struct processor_set *processor_set_t; #define TASK_NULL ((task_t) 0) #define THREAD_NULL ((thread_t) 0) #define PROCESSOR_NULL ((processor_t) 0) +#define PROCESSOR_SET_NULL ((processor_set_t) 0) /* Defines from osfmk/kern/timer_call.h */ typedef void *timer_call_param_t; @@ -49,9 +52,26 @@ typedef enum { CLUSTER_TYPE_P = 2, MAX_CPU_TYPES, } cluster_type_t; +#define MAX_AMP_CLUSTER_TYPES (MAX_PSET_TYPES - 1) extern unsigned int ml_get_die_id(unsigned int cluster_id); extern uint64_t ml_cpu_signal_deferred_get_timer(void); extern unsigned int ml_get_cpu_number_type(cluster_type_t cluster_type, bool logical, bool available); extern unsigned int ml_get_cluster_number_type(cluster_type_t cluster_type); +/* Defines from osfmk/kern/thread.h */ +#define assert_thread_magic(thread) do { (void)(thread); } while (0) + +/* Defines from osfmk/kern/startup.h */ +#define TUNABLE(type_t, var, boot_arg, 
default_value) \ + type_t var = default_value + +/* Defines from bsd/sys/kdebug_kernel.h */ +#define __kdebug_only __unused + +struct mock_topology_info_struct { + unsigned int num_cpus; +}; +extern struct mock_topology_info_struct mock_topology_info; +#define ml_get_topology_info() (&mock_topology_info) + #endif /* _MISC_NEEDED_DEFINES_H_ */ diff --git a/tests/sched/sched_test_harness/shadow_headers/misc_needed_deps.c b/tests/sched/sched_test_harness/shadow_headers/misc_needed_deps.c index dda248ca9..807ad375e 100644 --- a/tests/sched/sched_test_harness/shadow_headers/misc_needed_deps.c +++ b/tests/sched/sched_test_harness/shadow_headers/misc_needed_deps.c @@ -12,9 +12,11 @@ /* Overrides necessary for userspace code */ #define panic(...) ({ printf("Panicking:\n"); printf(__VA_ARGS__); abort(); }) #define KDBG(...) (void)0 +#define KDBG_RELEASE(...) (void)0 #define kalloc_type(x, y, z) calloc((size_t)y, sizeof(x)) #define kfree_type(x, y, z) free(z) #define PE_parse_boot_argn(x, y, z) FALSE +#define kprintf(...) printf(__VA_ARGS__) /* Mock locks */ typedef void *lck_ticket_t; @@ -22,15 +24,57 @@ typedef void *lck_ticket_t; #define decl_simple_lock_data(class, name) class int name #define pset_lock(x) (void)x #define pset_unlock(x) (void)x +#define change_locked_pset(x, y) y #define pset_assert_locked(x) (void)x #define thread_lock(x) (void)x #define thread_unlock(x) (void)x +#define simple_lock(...) +#define simple_unlock(...) /* Processor-related */ #define PERCPU_DECL(type_t, name) type_t name #include processor_t processor_array[MAX_SCHED_CPUS]; processor_set_t pset_array[MAX_PSETS]; +struct pset_node pset_nodes[MAX_AMP_CLUSTER_TYPES]; +#define pset_node0 (pset_nodes[0]) + +pset_node_t +pset_node_for_pset_cluster_type(pset_cluster_type_t pset_cluster_type) +{ + for (unsigned i = 0; i < MAX_AMP_CLUSTER_TYPES; i++) { + if (bitmap_is_empty(&pset_nodes[i].pset_map, MAX_PSETS)) { + continue; + } + if (pset_nodes[i].pset_cluster_type == pset_cluster_type) { + return &pset_nodes[i]; + } + } + return PSET_NODE_NULL; +} + +pset_cluster_type_t +cluster_type_to_pset_cluster_type(cluster_type_t cluster_type) +{ + switch (cluster_type) { +#if __AMP__ + case CLUSTER_TYPE_E: + return PSET_AMP_E; + case CLUSTER_TYPE_P: + return PSET_AMP_P; +#endif /* __AMP__ */ + case CLUSTER_TYPE_SMP: + return PSET_SMP; + default: + panic("Unexpected cluster type %d", cluster_type); + } +} + +cpumap_t +pset_available_cpumap(processor_set_t pset) +{ + return pset->cpu_available_map & pset->recommended_bitmask; +} /* Expected global(s) */ static task_t kernel_task = NULL; @@ -93,7 +137,21 @@ struct thread { bool th_shared_rsrc_enqueued[CLUSTER_SHARED_RSRC_TYPE_COUNT]; bool th_shared_rsrc_heavy_user[CLUSTER_SHARED_RSRC_TYPE_COUNT]; bool th_shared_rsrc_heavy_perf_control[CLUSTER_SHARED_RSRC_TYPE_COUNT]; + bool th_expired_quantum_on_lower_core; + bool th_expired_quantum_on_higher_core; #endif /* CONFIG_SCHED_EDGE */ + + /* real-time parameters */ + struct { /* see mach/thread_policy.h */ + uint32_t period; + uint32_t computation; + uint32_t constraint; + bool preemptible; + uint8_t priority_offset; /* base_pri = BASEPRI_RTQUEUES + priority_offset */ + uint64_t deadline; + } realtime; + + uint64_t last_made_runnable_time; /* time when thread was unblocked or preempted */ }; void @@ -135,6 +193,12 @@ thread_tid( return thread != THREAD_NULL? 
thread->thread_id: 0; } +void +thread_clear_runq_locked(thread_t thread) +{ + thread->__runq.runq = PROCESSOR_NULL; +} + /* Satisfy recount dependency needed by osfmk/kern/sched.h */ #define recount_thread_time_mach(thread) (thread->mock_recount_time.user_time + thread->mock_recount_time.system_time) diff --git a/tests/sched/sched_test_harness/shadow_headers/sched_prim.c b/tests/sched/sched_test_harness/shadow_headers/sched_prim.c index 49720dda1..cdf30d2da 100644 --- a/tests/sched/sched_test_harness/shadow_headers/sched_prim.c +++ b/tests/sched/sched_test_harness/shadow_headers/sched_prim.c @@ -76,6 +76,7 @@ #include #include +#include boolean_t priority_is_urgent(int priority) @@ -240,9 +241,9 @@ run_queue_peek( } } -uint32_t sched_run_buckets[TH_BUCKET_MAX]; -unsigned sched_tick = 0; -int8_t sched_load_shifts[NRQS]; +uint32_t sched_run_buckets[TH_BUCKET_MAX]; +_Atomic uint32_t sched_tick = 0; +int8_t sched_load_shifts[NRQS]; #define DEFAULT_PREEMPTION_RATE 100 /* (1/s) */ int default_preemption_rate = DEFAULT_PREEMPTION_RATE; @@ -325,6 +326,23 @@ pset_type_for_id(uint32_t cluster_id) return pset_array[cluster_id]->pset_type; } +cluster_type_t +pset_cluster_type_to_cluster_type(pset_cluster_type_t pset_cluster_type) +{ + switch (pset_cluster_type) { +#if __AMP__ + case PSET_AMP_E: + return CLUSTER_TYPE_E; + case PSET_AMP_P: + return CLUSTER_TYPE_P; +#endif /* __AMP__ */ + case PSET_SMP: + return CLUSTER_TYPE_SMP; + default: + panic("Unexpected pset cluster type %d", pset_cluster_type); + } +} + #if CONFIG_SCHED_EDGE uint64_t @@ -344,9 +362,15 @@ processor_t choose_processor( processor_set_t starting_pset, processor_t processor, - thread_t thread) + thread_t thread, + __unused sched_options_t *options) { (void)processor; + + if (thread->sched_pri >= BASEPRI_RTQUEUES) { + return sched_rt_choose_processor(starting_pset, processor, thread); + } + if (thread->bound_processor != NULL) { return thread->bound_processor; } @@ -354,12 +378,6 @@ choose_processor( return processor_array[starting_pset->cpu_set_low]; } -static cpumap_t -pset_available_cpumap(processor_set_t pset) -{ - return pset->cpu_available_map & pset->recommended_bitmask; -} - int pset_available_cpu_count(processor_set_t pset) { @@ -392,9 +410,18 @@ sched_update_pset_load_average(processor_set_t pset, uint64_t curtime) (void)curtime; } -int -rt_runq_count(processor_set_t pset) +void +thread_setrun(thread_t thread, sched_options_t options) { - (void)pset; - return 0; + (void)thread; + (void)options; + assertf(false, "unimplemented"); } + +bool +sched_steal_thread_enabled(processor_set_t pset) +{ + return bit_count(pset->node->pset_map) > 1; +} + +int sched_rt_n_backup_processors = SCHED_DEFAULT_BACKUP_PROCESSORS; diff --git a/tests/sched/sched_test_utils.c b/tests/sched/sched_test_utils.c index b2f4a5fbb..dec2e91ec 100644 --- a/tests/sched/sched_test_utils.c +++ b/tests/sched/sched_test_utils.c @@ -1,10 +1,11 @@ -#include -#include -#include #include -#include -#include +#include #include +#include +#include +#include +#include +#include #include #include @@ -27,30 +28,35 @@ reenable_verbose_sched_utils(void) } static mach_timebase_info_data_t timebase_info; -static bool initialized_timebase = false; uint64_t nanos_to_abs(uint64_t nanos) { - kern_return_t kr; - if (!initialized_timebase) { + mach_timebase_info_data_t timebase = timebase_info; + + if (timebase.numer == 0 || timebase.denom == 0) { + kern_return_t kr; + kr = mach_timebase_info(&timebase_info); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info"); - 
initialized_timebase = true; + + timebase = timebase_info; } - return nanos * timebase_info.denom / timebase_info.numer; + return nanos * timebase.denom / timebase.numer; } uint64_t abs_to_nanos(uint64_t abs) { - kern_return_t kr; - if (!initialized_timebase) { + mach_timebase_info_data_t timebase = timebase_info; + + if (timebase.numer == 0 || timebase.denom == 0) { + kern_return_t kr; kr = mach_timebase_info(&timebase_info); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info"); - initialized_timebase = true; + timebase = timebase_info; } - return abs * timebase_info.numer / timebase_info.denom; + return abs * timebase.numer / timebase.denom; } static int num_perf_levels = 0; @@ -74,11 +80,25 @@ platform_perflevel_name(unsigned int perflevel) char sysctl_name[64] = { 0 }; snprintf(sysctl_name, sizeof(sysctl_name), "hw.perflevel%d.name", perflevel); ret = sysctlbyname(sysctl_name, &perflevel_names[perflevel], &(size_t){ sizeof(perflevel_names[perflevel]) }, NULL, 0); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, sysctl_name); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "%s", sysctl_name); } return (const char *)perflevel_names[perflevel]; } +static unsigned int perflevel_ncpus[64] = {0}; +unsigned int +platform_perflevel_ncpus(unsigned int perflevel) +{ + if (perflevel_ncpus[perflevel] == 0) { + int ret; + char sysctl_name[64] = { 0 }; + snprintf(sysctl_name, sizeof(sysctl_name), "hw.perflevel%d.logicalcpu", perflevel); + ret = sysctlbyname(sysctl_name, &perflevel_ncpus[perflevel], &(size_t){ sizeof(perflevel_ncpus[perflevel]) }, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "%s", sysctl_name); + } + return perflevel_ncpus[perflevel]; +} + static bool reported_is_amp = false; bool platform_is_amp(void) @@ -98,10 +118,14 @@ platform_is_virtual_machine(void) int vmm_present = 0; ret = sysctlbyname("kern.hv_vmm_present", &vmm_present, &(size_t){ sizeof(vmm_present) }, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.hv_vmm_present"); - if (vmm_present && verbosity_enabled) { - T_LOG("🛰️ Platform is a virtual machine!"); + if (vmm_present) { + if (verbosity_enabled) { + T_LOG("🛰️ Platform is a virtual machine!"); + } + return true; } - return (bool)vmm_present; + + return false; } static char sched_policy_name[64]; @@ -261,6 +285,26 @@ create_threads(int num_threads, int priority, return thread_handles; } +const char * +platform_train_descriptor(void) +{ +#if TARGET_OS_XR + return "visionOS"; +#elif TARGET_OS_TV + return "tvOS"; +#elif TARGET_OS_WATCH + return "watchOS"; +#elif TARGET_OS_BRIDGE + return "bridgeOS"; +#elif TARGET_OS_OSX + return "macOS"; +#elif TARGET_OS_IOS + return "iOS"; +#else + return "unknown"; +#endif +} + static const double default_idle_threshold = 0.9; static const int default_timeout_sec = 3; @@ -522,7 +566,7 @@ sched_utils_tracing_supported(void) trace_handle_t begin_collect_trace(int argc, char *const argv[], char *filename) { - return begin_collect_trace_fmt(argc, argv, filename); + return begin_collect_trace_fmt(COLLECT_TRACE_FLAG_NONE, argc, argv, filename); } static bool first_trace = true; @@ -538,10 +582,14 @@ static char *begin_notification = "🖊️_trace_begun..."; static char *end_notification = "🖊️_trace_ended..."; static char *trigger_end_notification = "🖊️_stopping_trace..."; +#if !(TARGET_OS_WATCH || TARGET_OS_TV) static const int waiting_timeout_sec = 60 * 2; /* 2 minutes, allows trace post-processing to finish */ +#else /* !(TARGET_OS_WATCH || TARGET_OS_TV) */ +static const int waiting_timeout_sec = 60 * 3 + 30; /* 3 minutes and 30 seconds for slower 
targets */ +#endif /* !(TARGET_OS_WATCH || TARGET_OS_TV) */ trace_handle_t -begin_collect_trace_fmt(int argc, char *const argv[], char *fmt, ...) +begin_collect_trace_fmt(collect_trace_flags_t flags, int argc, char *const argv[], char *fmt, ...) { /* Check trace requirements */ if (!sched_utils_tracing_supported() || !trace_requested(argc, argv)) { @@ -589,10 +637,31 @@ begin_collect_trace_fmt(int argc, char *const argv[], char *fmt, ...) T_QUIET; T_WITH_ERRNO; T_EXPECT_EQ(ret, 0, "dt_launch_tool"); /* Launch trace record */ - char *trace_args[] = {trace_bin, "record", handle->abs_filename, "--plan", "default", "--unsafe", - "--kdebug-filter-include", "C0x01", "--omit", "Logging", "--kdebug-buffer-size", "1gb", - "--notify-after-start", begin_notification, "--notify-after-end", end_notification, - "--end-on-notification", trigger_end_notification, "&", NULL}; + char *trace_args_base[18] = {trace_bin, "record", handle->abs_filename, "--plan", "default", "--unsafe", + "--kdebug-filter-include", "C0x01", + "--omit", "Logging", "--kdebug-buffer-size", "1gb", + "--notify-after-start", begin_notification, "--notify-after-end", end_notification, + "--end-on-notification", trigger_end_notification}; + const unsigned trace_args_cap = 32; /* INCREASE THIS if there are too many trace args */ + char* trace_args[trace_args_cap]; + unsigned trace_args_len = 0; + for (unsigned i = 0; i < sizeof(trace_args_base) / sizeof(char *); ++i) { + trace_args[trace_args_len++] = trace_args_base[i]; + T_QUIET; T_ASSERT_LT(trace_args_len, trace_args_cap, "too many trace args"); + } + if (flags & COLLECT_TRACE_FLAG_DISABLE_SYSCALLS) { + trace_args[trace_args_len++] = "--omit=syscalls,syscall-sampling"; + T_QUIET; T_ASSERT_LT(trace_args_len, trace_args_cap, "too many trace args"); + trace_args[trace_args_len++] = "--kdebug-filter-exclude=S0x0103,S0x040c"; + T_QUIET; T_ASSERT_LT(trace_args_len, trace_args_cap, "too many trace args"); + } + if (flags & COLLECT_TRACE_FLAG_DISABLE_CLUTCH) { + trace_args[trace_args_len++] = "--kdebug-filter-exclude=S0x01A9"; + T_QUIET; T_ASSERT_LT(trace_args_len, trace_args_cap, "too many trace args"); + } + trace_args[trace_args_len++] = NULL; + T_QUIET; T_ASSERT_LT(trace_args_len, trace_args_cap, "too many trace args"); + pid_t trace_pid = dt_launch_tool_pipe(trace_args, false, NULL, ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { T_LOG("🖊️ [trace] %s", data); @@ -607,7 +676,7 @@ begin_collect_trace_fmt(int argc, char *const argv[], char *fmt, ...) 
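/*
 * Illustrative sketch only (not part of this patch): how a test might adopt the new
 * flag-taking begin_collect_trace_fmt(). Assumes <darwintest.h>, <unistd.h>, and
 * "sched_test_utils.h" are included; the test name, filename format, and the decision to
 * discard the trace are assumptions for the example.
 */
T_DECL(traced_sched_example, "sketch: record a filtered trace around a workload", T_META_ASROOT(true))
{
	trace_handle_t trace = NULL;
	if (geteuid() == 0) {
		/* Flags may be OR'd together to trim the recorded kdebug streams */
		trace = begin_collect_trace_fmt(
			COLLECT_TRACE_FLAG_DISABLE_SYSCALLS | COLLECT_TRACE_FLAG_DISABLE_CLUTCH,
			argc, argv, "traced_sched_example_%s", platform_sched_policy());
	}

	/* ... run the scheduling workload under test here ... */

	if (trace != NULL) {
		end_collect_trace(trace);
		discard_collected_trace(trace); /* or save_collected_trace(trace) to keep the file */
	}
}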
T_LOG("🖊️ Starting trace collection for \"%s\" trace[%u]", handle->trace_filename, trace_pid); /* Wait for tracing to start */ - int signal_num; + int signal_num = 0; ret = dt_waitpid(handle->wait_on_start_pid, NULL, &signal_num, waiting_timeout_sec); T_QUIET; T_EXPECT_TRUE(ret, "dt_waitpid for trace start signal_num %d", signal_num); @@ -636,8 +705,8 @@ end_collect_trace(trace_handle_t handle) /* Wait for tracing to actually stop */ T_LOG("🖊️ Now waiting on trace to finish up..."); - int signal_num; - int exit_status; + int signal_num = 0; + int exit_status = 0; ret = dt_waitpid(trace_state->wait_on_end_pid, &exit_status, &signal_num, waiting_timeout_sec); T_QUIET; T_EXPECT_TRUE(ret, "dt_waitpid for trace stop, exit status %d signal_num %d", exit_status, signal_num); @@ -710,3 +779,9 @@ discard_collected_trace(trace_handle_t handle) } trace_state->status = DISCARDED; } + +void +sched_kdebug_test_fail(uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3) +{ + kdebug_trace(ARIADNEDBG_CODE(0, 0), arg0, arg1, arg2, arg3); +} diff --git a/tests/sched/sched_test_utils.h b/tests/sched/sched_test_utils.h index 201af49e3..3f2a02d6d 100644 --- a/tests/sched/sched_test_utils.h +++ b/tests/sched/sched_test_utils.h @@ -3,6 +3,8 @@ #define XNU_SCHED_TEST_UTILS_H #include +#include +#include /* -- Meta-controls -- */ @@ -61,7 +63,9 @@ bool platform_is_virtual_machine(void); char *platform_sched_policy(void); unsigned int platform_num_clusters(void); const char *platform_perflevel_name(unsigned int perflevel); +unsigned int platform_perflevel_ncpus(unsigned int perflevel); unsigned int platform_nperflevels(void); +const char *platform_train_descriptor(void); /* -- 📈🕒 Monitor system performance state -- */ @@ -99,6 +103,12 @@ uint64_t execute_clpcctrl(char *clpcctrl_args[], bool read_value); typedef void *trace_handle_t; +__options_decl(collect_trace_flags_t, uint32_t, { + COLLECT_TRACE_FLAG_NONE = 0x00, + COLLECT_TRACE_FLAG_DISABLE_SYSCALLS = 0x01, + COLLECT_TRACE_FLAG_DISABLE_CLUTCH = 0x02, +}); + /* * Begins trace collection, using the specified name as a prefix for all * generated filenames. Arguments are parsed to check for --no-trace or @@ -110,7 +120,7 @@ typedef void *trace_handle_t; * the period of interest. */ trace_handle_t begin_collect_trace(int argc, char *const argv[], char *filename); -trace_handle_t begin_collect_trace_fmt(int argc, char *const argv[], char *filename_fmt, ...); +trace_handle_t begin_collect_trace_fmt(collect_trace_flags_t flags, int argc, char *const argv[], char *filename_fmt, ...); /* * NOTE: It's possible that tests may induce CPU starvation that can @@ -129,4 +139,7 @@ void save_collected_trace(trace_handle_t handle); /* Deletes the recorded trace */ void discard_collected_trace(trace_handle_t handle); +/* Drop a tracepoint for test failure. 
*/ +void sched_kdebug_test_fail(uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3); + #endif /* XNU_SCHED_TEST_UTILS_H */ diff --git a/tests/setitimer.c b/tests/sched/setitimer.c similarity index 88% rename from tests/setitimer.c rename to tests/sched/setitimer.c index b5e6067f8..a9395ccc0 100644 --- a/tests/setitimer.c +++ b/tests/sched/setitimer.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include "sched_test_utils.h" #if __has_include() #include @@ -56,7 +58,8 @@ T_GLOBAL_META( T_META_RADAR_COMPONENT_VERSION("scheduler"), T_META_OWNER("chimene"), T_META_ENABLED(TARGET_OS_OSX), - T_META_TAG_VM_NOT_ELIGIBLE + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_ASROOT(true) // for trace recording ); static void *stat_thread(void *arg); @@ -96,12 +99,6 @@ static bool is_rosetta = false; static mach_timebase_info_data_t timebase_info; -static uint64_t -abs_to_nanos(uint64_t abs) -{ - return abs * timebase_info.numer / timebase_info.denom; -} - /* Some statistics APIs return host abstime instead of Rosetta-translated abstime */ static uint64_t abs_to_nanos_host(uint64_t abstime) @@ -140,11 +137,20 @@ fill_thread_stats(uint32_t i) T_DECL(setitimer, "Test various setitimer delivered signals to CPU-burning threads") { + T_SETUPBEGIN; + int rv; kern_return_t kr; uint32_t ncpu; size_t ncpu_size = sizeof(ncpu); + trace_handle_t trace = NULL; + + if (geteuid() == 0) { + trace = begin_collect_trace_fmt(COLLECT_TRACE_FLAG_DISABLE_SYSCALLS, + argc, argv, "test_setitimer"); + } + struct sched_param self_param = {.sched_priority = 47}; rv = pthread_setschedparam(pthread_self(), SCHED_FIFO, &self_param); @@ -153,6 +159,9 @@ T_DECL(setitimer, kr = mach_timebase_info(&timebase_info); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info"); + long ticks = sysconf(_SC_CLK_TCK); + T_LOG("sysconf(_SC_CLK_TCK) = %ld\n", ticks); + is_rosetta = processIsTranslated(); rv = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0); @@ -220,6 +229,14 @@ T_DECL(setitimer, (void *)gThreadList[i].stack_size); } + check_recommended_core_mask(NULL); + + wait_for_quiescence_default(argc, argv); + + T_SETUPEND; + + T_LOG("Finished wait_for_quiescence_default, starting test\n"); + sigset_t sigmk; sigemptyset(&sigmk); @@ -252,10 +269,10 @@ T_DECL(setitimer, T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)"); rv = setitimer(ITIMER_VIRTUAL, &itime, NULL); - T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_VIRTUAL)"); rv = setitimer(ITIMER_PROF, &itime, NULL); - T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_PROF)"); struct rlimit rlim = {}; @@ -275,6 +292,14 @@ T_DECL(setitimer, struct timespec timenow = {}; uint64_t time_start; + clock_t st_time, en_time; + struct tms st_cpu, en_cpu; + st_time = times(&st_cpu); + + struct rusage ru_start, ru_end; + rv = getrusage(RUSAGE_SELF, &ru_start); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "getrusage(RUSAGE_SELF, &ru_start)"); + kr = mach_get_times(&time_start, NULL, &timenow); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_get_times()"); @@ -310,6 +335,14 @@ T_DECL(setitimer, rv = setitimer(ITIMER_PROF, &itime_stop, NULL); T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_PROF)"); + rlim.rlim_cur = RLIM_INFINITY; + rv = setrlimit(RLIMIT_CPU, &rlim); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setrlimit(RLIMIT_CPU)"); + + en_time = times(&en_cpu); + rv = getrusage(RUSAGE_SELF, &ru_end); + T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, 
"getrusage(RUSAGE_SELF, &ru_end)"); + break; } else { T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_cond_timedwait(&gShouldExitCondition, ...)"); @@ -324,19 +357,37 @@ T_DECL(setitimer, T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_join"); } - uint64_t test_duration = time_end - time_start; - uint64_t test_duration_ns = abs_to_nanos(test_duration); - - double elapsed_secs = (double) test_duration_ns / (uint64_t)NSEC_PER_SEC; - - T_LOG("test duration %3.3f seconds\n", elapsed_secs); - fill_thread_stats(0); struct rusage_info_v6 ru = {}; rv = proc_pid_rusage(getpid(), RUSAGE_INFO_V6, (rusage_info_t *)&ru); T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "proc_pid_rusage"); + uint64_t test_duration = time_end - time_start; + uint64_t test_duration_ns = abs_to_nanos(test_duration); + + double elapsed_secs = (double) test_duration_ns / (uint64_t)NSEC_PER_SEC; + + T_LOG("test duration %3.3f seconds (%lld)\n", elapsed_secs, test_duration_ns); + + uintmax_t real_ticks = en_time - st_time; + uintmax_t user_ticks = en_cpu.tms_utime - st_cpu.tms_utime; + uintmax_t system_ticks = en_cpu.tms_stime - st_cpu.tms_stime; + + T_LOG("times(): Real Time: %jd (%3.3f seconds), User Time %jd (%3.3f seconds), System Time %jd (%3.3f seconds)\n", + real_ticks, (double)real_ticks / (double)ticks, + user_ticks, (double)user_ticks / (double)ticks, + system_ticks, (double)system_ticks / (double)ticks); + + struct timeval ru_udelta, ru_sdelta; + + timersub(&ru_end.ru_utime, &ru_start.ru_utime, &ru_udelta); + timersub(&ru_end.ru_stime, &ru_start.ru_stime, &ru_sdelta); + + T_LOG("rusage(): User Time %ld.%06d, System Time %ld.%06d\n", + ru_udelta.tv_sec, ru_udelta.tv_usec, + ru_sdelta.tv_sec, ru_sdelta.tv_usec); + uint64_t total_user_time_ns = abs_to_nanos_host(ru.ri_user_time); double total_user_time_s = (double)total_user_time_ns / (uint64_t)NSEC_PER_SEC; @@ -396,7 +447,19 @@ T_DECL(setitimer, double percentage_user = (double)user_time / (double) total_time * 100; double percentage_system = (double)system_time / (double) total_time * 100; - double percentage_not_running = (double)(test_duration_ns - total_time) / (double) test_duration_ns * 100; + + uint64_t not_running_time = test_duration_ns - total_time; + /* + * The worker threads spend a little bit of time waking up before the test + * start time is captured, so they can occasionally have a larger total_time + * than the test duration, leading to underflow in not_running_time and + * test failures. + * rdar://106147865 + */ + if (total_time > test_duration_ns) { + not_running_time = 0; + } + double percentage_not_running = (double)(not_running_time) / (double) test_duration_ns * 100; char* thread_type_str = ""; char* warning_str = ""; @@ -477,11 +540,15 @@ T_DECL(setitimer, } /* - * SIGXCPU should be delivered exactly once. + * SIGXCPU should be delivered at least once. 
*/ if (total_xcpu == 0) { T_FAIL("SIGXCPU delivered %d times (expected at least once)", total_xcpu); } + + if (trace) { + end_collect_trace(trace); + } } static void * @@ -684,14 +751,6 @@ T_DECL(setitimer_prof_multi_threaded, T_SETUPBEGIN; (void)signal(SIGPROF, sigprof_received); - pthread_t cpu_threads[PROF_EXTRA_THREAD_COUNT] = { 0 }; - bool spin_while_true = true; - for (unsigned int i = 0; i < PROF_EXTRA_THREAD_COUNT; i++) { - int error = pthread_create(&cpu_threads[i], NULL, cpu_thread_main, &spin_while_true); - T_QUIET; T_ASSERT_POSIX_ZERO(error, "create thread %d", i); - } - T_LOG("spinning %d threads on CPU", PROF_EXTRA_THREAD_COUNT + 1); - uint64_t start_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC); uint64_t expected_end_ns = start_ns + (EXPECTED_PROF_DURATION_SECS * NSEC_PER_SEC); uint64_t end_timeout_ns = expected_end_ns + (EXTRA_TIMEOUT_SECS * NSEC_PER_SEC); @@ -703,6 +762,28 @@ T_DECL(setitimer_prof_multi_threaded, int ret = setitimer(ITIMER_PROF, &prof_timer, NULL); T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)", ITIMER_PROF_SECS); + + /* + * If we start the extra thread before capturing start_ns, + * then it immediately starts accumulating time. + * Additionally, setitimer does a lazy timestamp capture in task_vtimer_set, + * which won't see that previously on-core accumulated time. + * These together could cause SIGPROF to deliver slightly before 1s. + * Create the threads after the timer starts to ensure that only CPU + * work done after the timer starts contributes to SIGPROF. + * + * rdar://134811651 + */ + + pthread_t cpu_threads[PROF_EXTRA_THREAD_COUNT] = { 0 }; + bool spin_while_true = true; + for (unsigned int i = 0; i < PROF_EXTRA_THREAD_COUNT; i++) { + int error = pthread_create(&cpu_threads[i], NULL, cpu_thread_main, &spin_while_true); + T_QUIET; T_ASSERT_POSIX_ZERO(error, "create thread %d", i); + } + + T_LOG("spinning %d threads on CPU", PROF_EXTRA_THREAD_COUNT + 1); + T_SETUPEND; uint64_t last_ns = 0; diff --git a/tests/sched/thread_group_fairness.c b/tests/sched/thread_group_fairness.c index 30b29a5be..b56b0b273 100644 --- a/tests/sched/thread_group_fairness.c +++ b/tests/sched/thread_group_fairness.c @@ -8,10 +8,12 @@ #include #include #include +#include #include #include #include +#include "test_utils.h" #include "sched_test_utils.h" #include "thread_group_fairness_workload_config.h" @@ -227,3 +229,175 @@ T_DECL(thread_group_fairness, T_END; } + +static uint64_t +get_thread_group_cpu_time(int sched_bucket) +{ + int ret; + uint64_t cpu_stats[2]; + size_t cpu_stats_len = sizeof(uint64_t) * 2; + ret = sysctlbyname("kern.clutch_bucket_group_cpu_stats", cpu_stats, &cpu_stats_len, + &sched_bucket, sizeof(sched_bucket)); + if (ret != 0 && errno == ENOTSUP) { + T_LOG("Test only supported on Clutch/Edge scheduler (current policy is \"%s\") " + "platforms on development/debug build variants", platform_sched_policy()); + T_SKIP("kern.clutch_bucket_group_cpu_stats development-only sysctl not present"); + } + T_QUIET; T_WITH_ERRNO; T_ASSERT_POSIX_SUCCESS(ret, "kern.clutch_bucket_group_cpu_stats"); + return cpu_stats[0]; +} + +static volatile uint64_t mach_deadline = 0; +static const int seconds = 2; +static _Atomic volatile uint64_t count = 0; +static const int iters_per_lock_hold = 100000; +static const int low_qos = QOS_CLASS_USER_INITIATED; +static const int low_sched_bucket = 2; // TH_BUCKET_SHARE_IN +static const int high_qos = QOS_CLASS_USER_INTERACTIVE; +static const int high_sched_bucket = 1; // TH_BUCKET_SHARE_FG +static _Atomic volatile bool 
recorder_picked = false; + +static void * +boost_while_working(void *arg) +{ + int ret; + work_interval_t wi = (work_interval_t)arg; + ret = work_interval_join(wi); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "work_interval_join"); + + bool is_recorder = os_atomic_cmpxchg(&recorder_picked, false, true, relaxed); + uint64_t cpu_time_begin_low = 0; + uint64_t cpu_time_begin_high = 0; + if (is_recorder) { + cpu_time_begin_low = get_thread_group_cpu_time(low_sched_bucket); + cpu_time_begin_high = get_thread_group_cpu_time(high_sched_bucket); + } + + while (mach_absolute_time() < mach_deadline) { + /* Assume high priority */ + ret = pthread_set_qos_class_self_np(high_qos, 0); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "pthread_set_qos_class_self_np UI"); + T_QUIET; T_ASSERT_EQ(qos_class_self(), high_qos, "qos_class_self"); + /* Complete a "work item" */ + for (volatile int i = 0; i < iters_per_lock_hold; i++) { + os_atomic_inc(&count, relaxed); + } + /* Drop priority down before parking to sleep */ + ret = pthread_set_qos_class_self_np(low_qos, 0); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "pthread_set_qos_class_self_np IN"); + T_QUIET; T_ASSERT_EQ(qos_class_self(), low_qos, "qos_class_self"); + usleep(2 * 1000); // 2ms + } + + if (is_recorder) { + uint64_t cpu_time_end_low = get_thread_group_cpu_time(low_sched_bucket); + uint64_t cpu_time_end_high = get_thread_group_cpu_time(high_sched_bucket); + + T_QUIET; T_ASSERT_GE(cpu_time_end_high, cpu_time_begin_high, + "non-monotonic thread group CPU time"); + uint64_t high_cpu_time = cpu_time_end_high - cpu_time_begin_high; + T_QUIET; T_ASSERT_GE(cpu_time_end_low, cpu_time_begin_low, + "non-monotonic thread group CPU time"); + uint64_t low_cpu_time = cpu_time_end_low - cpu_time_begin_low; + + T_QUIET; T_ASSERT_GT(high_cpu_time + low_cpu_time, 0ULL, + "CPU not attributed to either expected bucket"); + T_LOG("High ticks: %llu, Low ticks: %llu, High-to-low ratio: %.3f", + high_cpu_time, low_cpu_time, high_cpu_time * 1.0 / (high_cpu_time + low_cpu_time)); + T_EXPECT_GE(high_cpu_time, low_cpu_time, "More work accounted to the high QoS"); + T_EXPECT_LE(low_cpu_time * 1.0, high_cpu_time * 0.2, + "Vast majority of work accounted to the high QoS"); + } + return NULL; +} + +/* + * Note, preemption due to non-test threads poses a special problem for + * this test because time the test threads spend preempted at their low + * QoS, in between processing work items, translates to "blocked" time + * for the thread group at its high QoS. This leads to CPU usage aging + * out more quickly for the high QoS, causing the test to fail. + * + * Additionally, the test must be run like an application in the QoS + * engine, without a QoS ceiling which would prevent the test threads + * from performing adequately high QoS boosts. 
For example: + * sudo taskpolicy -a ./thread_group_fairness -n interactivity_cpu_accounting + */ +T_DECL(interactivity_cpu_accounting, + "Ensure that CPU runtime tracked for calculating interactivity score " + "gets attributed to the right QoS that performed the work, even if we " + "switch QoS while on-core (rdar://125045167)", + T_META_ENABLED(TARGET_CPU_ARM64 && !TARGET_OS_BRIDGE), +#if TARGET_OS_WATCH + T_META_MAYFAIL("Watches too noisy with high priority spinners (rdar://150323037)"), +#elif TARGET_OS_TV + T_META_MAYFAIL("TVs too noisy with high priority audio (rdar://149974201)"), +#endif + T_META_ASROOT(YES)) +{ + /* Skips the test if needed sysctl isn't present */ + get_thread_group_cpu_time(0); + + /* Ensure we don't have a QoS ceiling that would prevent high enough boosts */ + struct task_policy_state policy_state; + mach_msg_type_number_t count = TASK_POLICY_STATE_COUNT; + boolean_t get_default = FALSE; + kern_return_t kr = task_policy_get(mach_task_self(), TASK_POLICY_STATE, + (task_policy_t)&policy_state, &count, &get_default); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_policy_get(self, TASK_POLICY_STATE)"); + int requested_app_type = (policy_state.requested & POLICY_REQ_APPTYPE_MASK) >> POLICY_REQ_APPTYPE_SHIFT; + T_QUIET; T_ASSERT_EQ(requested_app_type, TASK_APPTYPE_APP_DEFAULT, + "Test needs to be run like an application for QoS boosting above pri 37 to succeed"); + + wait_for_quiescence(argc, argv, 0.9, 10); + + trace_handle_t trace = begin_collect_trace(argc, argv, T_NAME); + T_SETUPEND; + + if (platform_is_amp()) { + /* + * Isolate-out the effects of cluster recommendation, since that + * causes threads to be preempted sometimes for rebalancing purposes. + */ + char *clpcctrl_args[] = {"-C", "p", NULL}; + execute_clpcctrl(clpcctrl_args, false); + } + + mach_deadline = mach_absolute_time() + nanos_to_abs(seconds * NSEC_PER_SEC); + + /* + * Create threads in their own TG that will run work at "boosted" + * priority and after a work item is complete, lower their + * priority back down to a low QoS before "parking" via usleep(). + * + * We expect that the interactivity score for the high QoS for this + * TG will be the one to lower, rather than the low QoS which the + * threads are switching down to before context-switching off-core. + */ + int num_boosters = MIN(4, dt_ncpu()); + work_interval_t wi_handle; + make_work_interval(&wi_handle, WORK_INTERVAL_TYPE_DEFAULT); + pthread_t threads[num_boosters]; + for (int i = 0; i < num_boosters; i++) { + create_thread(&threads[i], NULL, boost_while_working, wi_handle); + } + + /* + * Wait for test deadline to pass, to avoid priority boosting + * with pthread_join(), which would affect the results. 
+ */ + uint64_t curr_time = mach_absolute_time(); + if (curr_time < mach_deadline) { + usleep(abs_to_nanos(mach_deadline - curr_time) / NSEC_PER_USEC); + } + for (int i = 0; i < num_boosters; i++) { + pthread_join(threads[i], NULL); + } + + if (platform_is_amp()) { + /* Reenable all cores to speed up trace post-processing */ + char *recommend_all_cores_args[] = {"-C", "all", NULL}; + execute_clpcctrl(recommend_all_cores_args, false); + } + end_collect_trace(trace); +} diff --git a/tests/sched/zero_to_n_tests.c b/tests/sched/zero_to_n_tests.c index 8d04d907d..9a66dff93 100644 --- a/tests/sched/zero_to_n_tests.c +++ b/tests/sched/zero_to_n_tests.c @@ -1,13 +1,14 @@ #include #include +#include #include #include #include #include #include +#include "sched_test_utils.h" #include "test_utils.h" -#if defined(__arm64__) T_GLOBAL_META( T_META_TAG_PERF, T_META_RUN_CONCURRENTLY(false), @@ -19,23 +20,9 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.scheduler"), T_META_RADAR_COMPONENT_NAME("xnu"), T_META_RADAR_COMPONENT_VERSION("scheduler"), - T_META_OWNER("chimene"), + T_META_OWNER("m_zinn"), T_META_TAG_VM_NOT_ELIGIBLE ); -#else -T_GLOBAL_META( - T_META_TAG_PERF, - T_META_RUN_CONCURRENTLY(false), - T_META_CHECK_LEAKS(false), - T_META_ASROOT(true), - T_META_REQUIRES_SYSCTL_EQ("kern.hv_vmm_present", 0), - T_META_NAMESPACE("xnu.scheduler"), - T_META_RADAR_COMPONENT_NAME("xnu"), - T_META_RADAR_COMPONENT_VERSION("scheduler"), - T_META_OWNER("chimene"), - T_META_TAG_VM_NOT_ELIGIBLE - ); -#endif static void log_cmd(char **cmd) @@ -53,75 +40,110 @@ log_cmd(char **cmd) } static void -run_zn(char *name, char **cmd) +run_zn(char *name, char **cmd, int argc, char *const argv[]) { - char tracefile_path[MAXPATHLEN] = "zn.artrace"; - snprintf(tracefile_path, MAXPATHLEN, "%s.artrace", name); - - int ret = dt_resultfile(tracefile_path, sizeof(tracefile_path)); - if (ret) { - T_ASSERT_FAIL("get file path for trace file failed with errno %d", errno); - } - - cmd[3] = tracefile_path; log_cmd(cmd); + trace_handle_t trace = begin_collect_trace_fmt(COLLECT_TRACE_FLAG_DISABLE_SYSCALLS | COLLECT_TRACE_FLAG_DISABLE_CLUTCH, argc, argv, name); + __block bool test_failed = true; __block bool test_skipped = false; + __block dispatch_semaphore_t stdout_finished_sem = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(stdout_finished_sem, "dispatch_semaphore_create()"); + dt_launch_pipe_t *pipes = NULL; pid_t test_pid; - test_pid = dt_launch_tool_pipe(cmd, false, NULL, ^bool (__unused char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { + test_pid = dt_launch_tool_pipe(cmd, false, &pipes, NULL, NULL, NULL, NULL); + T_QUIET; T_ASSERT_NE(test_pid, 0, "dt_launch_tool_pipe() failed unexpectedly with errno %d", errno); + T_QUIET; T_ASSERT_NOTNULL(pipes, "dt_launch_tool_pipe returned non-null pipes"); + + dispatch_block_t cleanup_handler = + ^{ dispatch_semaphore_signal(stdout_finished_sem); }; + + dt_pipe_data_handler_t stdout_handler = ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { T_LOG("%s", data); if (strstr(data, "TEST PASSED")) { - test_failed = false; + test_failed = false; + return true; } if (strstr(data, "TEST FAILED")) { - test_failed = true; + test_failed = true; + return true; } if (strstr(data, "TEST SKIPPED")) { - test_skipped = true; + test_skipped = true; + return true; } return false; - }, ^bool (__unused char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { + }; + + dt_pipe_data_handler_t 
stderr_handler = ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) { T_LOG("%s", data); return false; - }, BUFFER_PATTERN_LINE, NULL); + }; - if (test_pid == 0) { - T_ASSERT_FAIL("dt_launch_tool_pipe() failed unexpectedly with errno %d", errno); - } + dispatch_source_t stdout_reader = dt_create_dispatch_file_reader(pipes->pipe_out[0], BUFFER_PATTERN_LINE, stdout_handler, cleanup_handler, NULL); + T_QUIET; T_ASSERT_NOTNULL(stdout_reader, "create darwintest dispatch file reader for stdout"); + dispatch_source_t stderr_reader = dt_create_dispatch_file_reader(pipes->pipe_err[0], BUFFER_PATTERN_LINE, stderr_handler, ^{}, NULL); + T_QUIET; T_ASSERT_NOTNULL(stderr_reader, "create darwintest dispatch file reader for stderr"); + + /* Wait for zero-to-n to exit, and check its return value. */ int exitstatus; - dt_waitpid(test_pid, &exitstatus, NULL, 0); - if (exitstatus != 0) { - T_LOG("ktrace artrace exitstatus=%d\n", exitstatus); + if (!dt_waitpid(test_pid, &exitstatus, NULL, 0) || exitstatus != 0) { + T_FAIL("zero-to-n exitstatus=%d\n", exitstatus); } + + /* Test exited, end the trace. */ + end_collect_trace(trace); + + /* Wait for the readers to finish. */ + intptr_t rv = dispatch_semaphore_wait(stdout_finished_sem, dispatch_time(DISPATCH_TIME_NOW, 30 * NSEC_PER_SEC)); + T_QUIET; T_ASSERT_EQ((uint64_t) rv, 0ULL, "zn should finish within 30 seconds"); + + /* Free the pipes. */ + free(pipes); + if (test_skipped) { - unlink(tracefile_path); T_SKIP("%s", name); } else if (test_failed) { T_FAIL("%s", name); } else { - unlink(tracefile_path); T_PASS("%s", name); } - pdwriter_t writer = pdwriter_open_tmp("xnu", name, 0, 0, NULL, 0); - T_WITH_ERRNO; - T_ASSERT_NOTNULL(writer, "pdwriter_open_tmp"); - pdwriter_new_value(writer, "scheduler_ok", PDUNIT_CUSTOM(passing), !test_failed); - pdwriter_close(writer); T_END; } -T_DECL(zn_rt, "Schedule 1 RT thread per performance core, and test max latency", T_META_ENABLED(!TARGET_OS_TV), XNU_T_META_SOC_SPECIFIC) +T_DECL(zn_rt, "Schedule 1 RT thread per performance core, and test max latency", + XNU_T_META_SOC_SPECIFIC) { - char *cmd[] = {"/usr/bin/ktrace", "artrace", "-o", "zn.artrace", "-c", - "/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", "0", "broadcast-single-sem", "realtime", "1000", "--spin-time", "200000", "--spin-all", "--test-rt", +#if defined(__arm64__) + "--bind", "P", /* */ + "--trace", "500000", +#elif defined(__x86_64__) + "--trace", "2000000", +#endif + NULL}; + + run_zn("zn_rt", cmd, argc, argv); +} + +T_DECL(zn_rt_ival, "Schedule 1 RT thread per performance core, and test max latency", + XNU_T_META_SOC_SPECIFIC, + T_META_ENABLED(false) /* TODO: Enable once is fixed. 
*/) +{ + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + "0", "broadcast-single-sem", "realtime", "1000", + "--spin-time", "200000", + "--spin-all", + "--test-rt", + "--rt-interval", #if defined(__x86_64__) "--trace", "2000000", #else @@ -129,13 +151,33 @@ T_DECL(zn_rt, "Schedule 1 RT thread per performance core, and test max latency", #endif NULL}; - run_zn("zn_rt", cmd); + run_zn("zn_rt_ival", cmd, argc, argv); } -T_DECL(zn_rt_smt, "Schedule 1 RT thread per primary core, verify that the secondaries are idle iff the RT threads are running", T_META_ENABLED(TARGET_CPU_X86_64)) +T_DECL(zn_rt_ival_ll, "Schedule 1 RT thread per performance core, and test max latency", + XNU_T_META_SOC_SPECIFIC) { - char *cmd[] = {"/usr/bin/ktrace", "artrace", "-o", "zn.artrace", "-c", - "/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + "0", "broadcast-single-sem", "realtime", "1000", + "--spin-time", "200000", + "--spin-all", + "--test-rt", + "--rt-interval", + "--rt-ll", +#if defined(__x86_64__) + "--trace", "2000000", +#else + "--trace", "500000", +#endif + NULL}; + + run_zn("zn_rt_ival_ll", cmd, argc, argv); +} + +T_DECL(zn_rt_smt, "Schedule 1 RT thread per primary core, verify that the secondaries are idle iff the RT threads are running", + T_META_ENABLED(TARGET_CPU_X86_64)) +{ + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", "0", "broadcast-single-sem", "realtime", "1000", "--spin-time", "200000", "--spin-all", @@ -144,13 +186,29 @@ T_DECL(zn_rt_smt, "Schedule 1 RT thread per primary core, verify that the second "--trace", "2000000", NULL}; - run_zn("zn_rt_smt", cmd); + run_zn("zn_rt_smt", cmd, argc, argv); } -T_DECL(zn_rt_avoid0, "Schedule 1 RT thread per primary core except for CPU 0", T_META_ASROOT(true), T_META_ENABLED(TARGET_CPU_X86_64)) +T_DECL(zn_rt_ival_smt, "Schedule 1 RT thread per primary core, verify that the secondaries are idle iff the RT threads are running", + T_META_ENABLED(TARGET_CPU_X86_64)) { - char *cmd[] = {"/usr/bin/ktrace", "artrace", "-o", "zn.artrace", "-c", - "/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + "0", "broadcast-single-sem", "realtime", "1000", + "--spin-time", "200000", + "--spin-all", + "--churn-pri", "4", + "--test-rt-smt", + "--rt-interval", + "--trace", "2000000", + NULL}; + + run_zn("zn_rt_ival_smt", cmd, argc, argv); +} + +T_DECL(zn_rt_avoid0, "Schedule 1 RT thread per primary core except for CPU 0", + T_META_ENABLED(TARGET_CPU_X86_64)) +{ + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", "0", "broadcast-single-sem", "realtime", "1000", "--spin-time", "200000", "--spin-all", @@ -158,19 +216,55 @@ T_DECL(zn_rt_avoid0, "Schedule 1 RT thread per primary core except for CPU 0", T "--trace", "2000000", NULL}; - run_zn("zn_rt_avoid0", cmd); + run_zn("zn_rt_avoid0", cmd, argc, argv); } -T_DECL(zn_rt_apt, "Emulate AVID Pro Tools with default latency deadlines", T_META_ENABLED(!TARGET_OS_TV)) +T_DECL(zn_rt_ival_avoid0, "Schedule 1 RT thread per primary core except for CPU 0", + T_META_ENABLED(TARGET_CPU_X86_64)) { - char *cmd[] = {"/usr/bin/ktrace", "artrace", "-o", "zn.artrace", "-c", - "/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + "0", "broadcast-single-sem", "realtime", "1000", + "--spin-time", "200000", + "--spin-all", + "--test-rt-avoid0", + "--rt-interval", + "--trace", "2000000", + NULL}; + + run_zn("zn_rt_ival_avoid0", 
cmd, argc, argv); +} + +T_DECL(zn_rt_apt, "Emulate AVID Pro Tools with default latency deadlines") +{ + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", "0", "chain", "realtime", "1000", "--extra-thread-count", "-3", "--spin-time", "200000", "--spin-all", "--churn-pri", "31", "--churn-random", "--test-rt", +#if defined(__arm64__) + "--bind", "P", /* */ + "--trace", "500000", +#elif defined(__x86_64__) + "--trace", "2000000", +#endif + NULL}; + + run_zn("zn_rt_apt", cmd, argc, argv); +} + +T_DECL(zn_rt_ival_apt, "Emulate AVID Pro Tools with default latency deadlines", + T_META_ENABLED(false) /* TODO: Enable once is fixed. */) +{ + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + "0", "chain", "realtime", "1000", + "--extra-thread-count", "-3", + "--spin-time", "200000", + "--spin-all", + "--churn-pri", "31", "--churn-random", + "--test-rt", + "--rt-interval", #if defined(__x86_64__) "--trace", "2000000", #else @@ -178,13 +272,13 @@ T_DECL(zn_rt_apt, "Emulate AVID Pro Tools with default latency deadlines", T_MET #endif NULL}; - run_zn("zn_rt_apt", cmd); + run_zn("zn_rt_ival_apt", cmd, argc, argv); } -T_DECL(zn_rt_apt_ll, "Emulate AVID Pro Tools with low latency deadlines", XNU_T_META_SOC_SPECIFIC) +T_DECL(zn_rt_apt_ll, "Emulate AVID Pro Tools with low latency deadlines", + XNU_T_META_SOC_SPECIFIC) { - char *cmd[] = {"/usr/bin/ktrace", "artrace", "-o", "zn.artrace", "-c", - "/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", "0", "chain", "realtime", "1000", "--extra-thread-count", "-3", "--spin-time", "200000", @@ -192,22 +286,66 @@ T_DECL(zn_rt_apt_ll, "Emulate AVID Pro Tools with low latency deadlines", XNU_T_ "--churn-pri", "31", "--churn-random", "--test-rt", "--rt-ll", +#if defined(__arm64__) + "--bind", "P", /* */ +#endif /* __arm64__*/ "--trace", "500000", NULL}; - run_zn("zn_rt_apt_ll", cmd); + run_zn("zn_rt_apt_ll", cmd, argc, argv); } -T_DECL(zn_rt_edf, "Test max latency of earliest deadline RT threads in the presence of later deadline threads", T_META_ENABLED(!TARGET_OS_TV), XNU_T_META_SOC_SPECIFIC) +T_DECL(zn_rt_ival_apt_ll, "Emulate AVID Pro Tools with low latency deadlines", + XNU_T_META_SOC_SPECIFIC) { - char *cmd[] = {"/usr/bin/ktrace", "artrace", "-o", "zn.artrace", "-c", - "/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + "0", "chain", "realtime", "1000", + "--extra-thread-count", "-3", + "--spin-time", "200000", + "--spin-all", + "--churn-pri", "31", "--churn-random", + "--test-rt", + "--rt-ll", + "--rt-interval", + "--trace", "500000", + NULL}; + + run_zn("zn_rt_ival_apt_ll", cmd, argc, argv); +} + +T_DECL(zn_rt_edf, "Test max latency of earliest deadline RT threads in the presence of later deadline threads", + XNU_T_META_SOC_SPECIFIC) +{ + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", "0", "broadcast-single-sem", "realtime", "1000", "--extra-thread-count", "-1", "--spin-time", "200000", "--spin-all", "--rt-churn", "--test-rt", +#if defined(__arm64__) + "--bind", "P", /* */ + "--trace", "500000", +#elif defined(__x86_64__) + "--trace", "2000000", +#endif + NULL}; + + run_zn("zn_rt_edf", cmd, argc, argv); +} + +T_DECL(zn_rt_ival_edf, "Test max latency of earliest deadline RT threads in the presence of later deadline threads", + XNU_T_META_SOC_SPECIFIC) +{ + char *cmd[] = {"/AppleInternal/CoreOS/tests/xnu/zero-to-n/zn", + "0", "broadcast-single-sem", "realtime", "1000", + "--extra-thread-count", "-1", + 
"--spin-time", "200000", + "--spin-all", + "--rt-churn", + "--test-rt", + "--rt-ll", /* TODO: remove low-latency constraint once is fixed */ + "--rt-interval", #if defined(__x86_64__) "--trace", "2000000", #else @@ -215,5 +353,6 @@ T_DECL(zn_rt_edf, "Test max latency of earliest deadline RT threads in the prese #endif NULL}; - run_zn("zn_rt_edf", cmd); + run_zn("zn_rt_ival_edf", cmd, argc, argv); } + diff --git a/tests/select_stress.c b/tests/select_stress.c index e3dd2c6cc..0772a0e70 100644 --- a/tests/select_stress.c +++ b/tests/select_stress.c @@ -11,6 +11,12 @@ #include #include +T_GLOBAL_META( + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("file descriptors")); + +#define BACKOFF_SLEEP_SECONDS 3 + /* Select parameters */ #define TIMEOUT_CHANCE 17 /* one in this many times, timeout */ #define TIMEOUT_POLLCHANCE 11 /* one in this many is a poll */ @@ -226,6 +232,31 @@ do_select(void *arg) return NULL; } +/* + * Work around rdar://87992172: pthread_join() doesn't wait for the thread to be + * fully cleaned up, so create/join loops may spuriously fail with too many + * threads. + */ +static int +create_thread_backoff(pthread_t *pthread, const pthread_attr_t *attr, void *(*thread_func)(void *), void *arg) +{ + int ret, tries; + + for (tries = 0; tries < 3; tries++) { + ret = pthread_create(pthread, NULL, thread_func, arg); + if (ret != EAGAIN) { + break; + } + + T_LOG("warning: pthread_create failed with %d (%s), backing off for %d seconds...", ret, strerror(ret), BACKOFF_SLEEP_SECONDS); + sleep(BACKOFF_SLEEP_SECONDS); + } + + T_QUIET; + T_ASSERT_POSIX_ZERO(ret, "pthread_create (after %d retries)", tries); + return tries; +} + static void test_select_stress(int nthreads, uint64_t duration_seconds) @@ -260,10 +291,7 @@ test_select_stress(int nthreads, uint64_t duration_seconds) struct endpoint *e = &th[i].ep; th[i].setup = setup_stress_event; th[i].work = do_stress_events; - T_QUIET; - T_WITH_ERRNO; - T_ASSERT_POSIX_ZERO(pthread_create(&e->pth, 0, thread_sync, &th[i]), - "pthread_create:do_stress_events"); + create_thread_backoff(&e->pth, 0, thread_sync, &th[i]); } /* @@ -319,10 +347,11 @@ handle_ebadf: } sarg.ret = 0; - T_QUIET; - T_WITH_ERRNO; - T_ASSERT_POSIX_ZERO(pthread_create(&sarg.pth, 0, do_select, &sarg), - "pthread_create:do_select"); + int retries = create_thread_backoff(&sarg.pth, 0, do_select, &sarg); + if (retries > 0) { + /* we had backoff for a few seconds, so extend our deadline */ + deadline += ns_to_abs(NSEC_PER_SEC * BACKOFF_SLEEP_SECONDS * retries); + } T_QUIET; T_WITH_ERRNO; diff --git a/tests/sendmsg_test.c b/tests/sendmsg_test.c new file mode 100644 index 000000000..4e8e43252 --- /dev/null +++ b/tests/sendmsg_test.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.net"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("networking"), + T_META_CHECK_LEAKS(false)); + +static void +do_test(int type) +{ + ssize_t retval; + int sock[2]; + + T_LOG("test socket type %d", type); + + T_ASSERT_POSIX_SUCCESS(socketpair(PF_LOCAL, type, 0, sock), "socketpair()"); + + T_LOG("socketpair: [%d, %d]", sock[0], sock[1]); + + int optval = 1; + T_ASSERT_POSIX_SUCCESS(setsockopt(sock[0], SOL_SOCKET, SO_DEBUG, &optval, sizeof(optval)), "setsockopt(SO_DEBUG)"); + + struct timeval tv = { .tv_sec = 1, .tv_usec = 0 }; + T_ASSERT_POSIX_SUCCESS(setsockopt(sock[0], SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(struct timeval)), "setsockopt(SO_RCVTIMEO)"); + + struct iovec iov0 = { .iov_base = NULL, .iov_len = 0 }; + + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + struct msghdr msghdr1 = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov0, + .msg_iovlen = 1, + .msg_control = cmsg_buf, + .msg_controllen = sizeof(cmsg_buf), + .msg_flags = 0 + }; + + struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msghdr1); + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + *((int *) CMSG_DATA(cmsg)) = sock[0]; + + retval = sendmsg(sock[1], &msghdr1, 0); + if (retval == -1) { + T_LOG("sendmsg(msghdr1) error: %s", strerror(errno)); + } else { + T_LOG("sendmsg msghdr1 %ld", retval); + } + + struct iovec iov1 = { .iov_base = NULL, .iov_len = 0 }; + struct msghdr msghdr2 = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov1, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + retval = recvmsg(sock[0], &msghdr2, MSG_WAITALL); + if (retval == -1) { + T_LOG("recvmsg(msghdr2) error: %s", strerror(errno)); + } else { + T_LOG("recvmsg msghdr2 %ld", retval); + } + + char * buf[0x10] = { 0 }; + struct iovec iov2 = { + .iov_base = buf, + .iov_len = sizeof(buf), + }; + + struct msghdr msghdr3 = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov2, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + retval = recvmsg(sock[0], &msghdr3, MSG_WAITALL); + if (retval == -1) { + T_LOG("recvmsg(msghdr3) error: %s", strerror(errno)); + } else { + T_LOG("recvmsg msghdr3 %ld", retval); + } + + close(sock[0]); + close(sock[1]); + + T_PASS("%s", __func__); +} + +T_DECL(send_zero_payload_dgram, "repro-124040738 SOCK_DGRAM", T_META_ASROOT(true)) +{ + do_test(SOCK_DGRAM); +} + +T_DECL(send_zero_payload_stream, "repro-124040738 SOCK_STREAM", T_META_ASROOT(true)) +{ + do_test(SOCK_STREAM); +} diff --git a/tests/signal_exit_reason.c b/tests/signal_exit_reason.c index b4543e190..c2395ec27 100644 --- a/tests/signal_exit_reason.c +++ b/tests/signal_exit_reason.c @@ -226,9 +226,10 @@ __test_exit_reason_delegate_terminate() ret = proc_terminate_delegate(instigator, token, &sentsignal); 
T_EXPECT_EQ_INT(ret, ESRCH, "expect no such process return: %d", ret); // Terminating PID 1 should fail with EPERM - audit_token_for_pid(1, &token); - ret = proc_terminate_delegate(instigator, token, &sentsignal); - T_EXPECT_EQ_INT(ret, EPERM, "expected eperm return: %d", ret); + if (audit_token_for_pid(1, &token)) { + ret = proc_terminate_delegate(instigator, token, &sentsignal); + T_EXPECT_EQ_INT(ret, EPERM, "expected eperm return: %d", ret); + } } else { pause(); // This exit should not hit, but we exit abnormally in case something went wrong @@ -356,7 +357,69 @@ __test_exit_reason_signal_with_audittoken_fail_bad_signal(int signal) } } -T_DECL(proc_signal_delegate_success, "proc_signal_delegate should work", T_META_TAG_VM_PREFERRED) +// Required signal handler for sigwait to work properly +static void +null_signal_handler(int sig) +{ +} + +static void +__test_signal_zombie(void) +{ + pid_t child; + sigset_t set; + sigset_t oldset; + int sig = 0, ret = 0; + audit_token_t token = INVALID_AUDIT_TOKEN_VALUE; + + // Set signal handler + signal(SIGCHLD, &null_signal_handler); + + // Mask SIGCHLD so it becomes pending + // when the child dies. + sigemptyset(&set); + sigaddset(&set, SIGCHLD); + sigprocmask(SIG_BLOCK, &set, &oldset); + + // Immediately exit child + if ((child = fork()) == 0) { + sleep(1); + exit(0); + } + + // Calculate target audit token + T_EXPECT_TRUE(audit_token_for_pid(child, &token), "audit token determined"); + + // Wait for the kernel to notify us of a dead child, which means it's now in a + // zombie state. + sigwait(&set, &sig); + + // Restore process mask + sigprocmask(SIG_SETMASK, &oldset, NULL); + + // First test that kill succeeds for POSIX compliance + T_EXPECT_EQ_INT(kill(child, 0), 0, "kill() succeeds on a zombie"); + + // Then test that the proc_info path has a sensible error code + ret = proc_signal_with_audittoken(&token, SIGHUP); + T_EXPECT_EQ_INT(ret, ESRCH, "expect no such process return: %d", ret); + + // Cleanup zombie child + wait_with_timeout_expected(child, 0); +} + + +T_DECL(signal_zombie, "signaling a zombie should work", T_META_TAG_VM_PREFERRED) +{ + dispatch_test(^{ + __test_signal_zombie(); + T_END; + }); +} + +T_DECL(proc_signal_delegate_success, "proc_signal_delegate should work", + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(false) /* rdar://146369624 */) { dispatch_test(^{ __test_exit_reason_delegate_signal(SIGABRT); @@ -454,7 +517,7 @@ __test_exit_reason_pthread_kill_self(int signal) } else { pthread_t t; struct pthread_kill_helper_args args = {&t, signal}; - pthread_create(&t, NULL, pthread_kill_helper, (void *)&args); + pthread_create(&t, NULL, (void*(*)(void*))pthread_kill_helper, (void *)&args); pthread_join(t, NULL); } } diff --git a/tests/signal_initproc.c b/tests/signal_initproc.c new file mode 100644 index 000000000..307bb7998 --- /dev/null +++ b/tests/signal_initproc.c @@ -0,0 +1,32 @@ +#include +#include +#include + +T_GLOBAL_META( + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("signals"), + T_META_OWNER("m_staveleytaylor"), + T_META_RUN_CONCURRENTLY(true) + ); + +T_DECL(signal_initproc_prohibited, "Check that signalling initproc is prohibited", T_META_ASROOT(TRUE), T_META_TAG_VM_PREFERRED) +{ + /* All user-initiated signals to launchd are prohibited.
*/ + bool saw_sigterm = false; + bool saw_sigkill = false; + int signal_max = SIGUSR2; + + for (int signal = 1; signal < signal_max; signal++) { + T_WITH_ERRNO; + T_ASSERT_POSIX_FAILURE(kill(1, signal), + EPERM, + "Shouldn't be able to send signal '%s' to initproc", + strsignal(signal)); + + saw_sigkill |= signal == SIGKILL; + saw_sigterm |= signal == SIGTERM; + } + + T_ASSERT_TRUE(saw_sigkill, "Tried sigkill"); + T_ASSERT_TRUE(saw_sigterm, "Tried sigterm"); +} diff --git a/tests/skywalk/skt_badring.c b/tests/skywalk/skt_badring.c index c4674b802..c4835f4ff 100644 --- a/tests/skywalk/skt_badring.c +++ b/tests/skywalk/skt_badring.c @@ -62,7 +62,7 @@ skt_badringtx_common(int argc, char *argv[], int method) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); @@ -181,7 +181,7 @@ skt_badringrx_common(int argc, char *argv[], int method) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); diff --git a/tests/skywalk/skt_bind.c b/tests/skywalk/skt_bind.c index d3a79a661..52c0f30ed 100644 --- a/tests/skywalk/skt_bind.c +++ b/tests/skywalk/skt_bind.c @@ -195,14 +195,14 @@ skt_bind_common(nexus_type_t type, nexus_port_t port0, nexus_port_t port1) /* this must fail since the key attribute is missing */ ch00 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch00 == NULL); SKTC_ASSERT_ERR(errno == EACCES); /* this must work (key attributes match) */ ch00 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, attr0, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch00 != NULL); /* we assume this won't change, so retrieve now */ @@ -219,14 +219,14 @@ skt_bind_common(nexus_type_t type, nexus_port_t port0, nexus_port_t port1) /* this must fail since the key attribute is missing */ ch10 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch10 == NULL); SKTC_ASSERT_ERR(errno == EACCES); /* this must work (key attributes match) */ ch10 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, attr1, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch10 != NULL); os_channel_destroy(ch10); @@ -244,7 +244,7 @@ skt_bind_common(nexus_type_t type, nexus_port_t port0, nexus_port_t port1) /* this must fail (key attributes swapped) */ ch00 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, attr1, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch00 == NULL); SKTC_ASSERT_ERR(errno == EACCES); @@ -254,26 +254,26 @@ skt_bind_common(nexus_type_t type, nexus_port_t port0, nexus_port_t port1) */ ch00 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, ringid, NULL, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch00 == NULL); SKTC_ASSERT_ERR(errno == EACCES);
assert(ch01 == NULL); ch01 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, (ringid + 1), NULL, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch01 == NULL); SKTC_ASSERT_ERR(errno == EACCES); /* these all must work (key attributes match) */ ch00 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, ringid, attr0, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch00 != NULL); ch01 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, (ringid + 1), attr0, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch01 != NULL); os_channel_destroy(ch00); @@ -291,7 +291,7 @@ skt_bind_common(nexus_type_t type, nexus_port_t port0, nexus_port_t port1) /* this must fail (key attributes swapped) */ ch10 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, attr0, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch10 == NULL); SKTC_ASSERT_ERR(errno == EACCES); @@ -301,26 +301,26 @@ skt_bind_common(nexus_type_t type, nexus_port_t port0, nexus_port_t port1) */ ch10 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, ringid, NULL, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch10 == NULL); SKTC_ASSERT_ERR(errno == EACCES); assert(ch11 == NULL); ch11 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, (ringid + 1), NULL, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch11 == NULL); SKTC_ASSERT_ERR(errno == EACCES); /* these all must work (key attributes match) */ ch10 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, ringid, attr1, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch10 != NULL); ch11 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, (ringid + 1), attr1, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch11 != NULL); os_channel_destroy(ch10); @@ -338,25 +338,25 @@ skt_bind_common(nexus_type_t type, nexus_port_t port0, nexus_port_t port1) /* these all must fail (key attributes swapped) */ ch00 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, ringid, attr1, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch00 == NULL); SKTC_ASSERT_ERR(errno == EACCES); ch01 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, (ringid + 1), attr1, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch01 == NULL); SKTC_ASSERT_ERR(errno == EACCES); /* these all must work (key attributes match) */ ch00 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, ringid, attr0, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch00 != NULL); ch01 = sktu_channel_create_extended(sktc_instance_uuid, port0, CHANNEL_DIR_TX_RX, (ringid + 1), attr0, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch01 != NULL); os_channel_destroy(ch00); @@ -376,25 +376,25 @@ skt_bind_common(nexus_type_t type, nexus_port_t port0, nexus_port_t port1) /* these all must fail (key attributes swapped) */ ch10 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, 
ringid, attr0, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch10 == NULL); SKTC_ASSERT_ERR(errno == EACCES); ch11 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, (ringid + 1), attr0, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch11 == NULL); SKTC_ASSERT_ERR(errno == EACCES); /* these all must work (key attributes match) */ ch10 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, ringid, attr1, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch10 != NULL); ch11 = sktu_channel_create_extended(sktc_instance_uuid, port1, CHANNEL_DIR_TX_RX, (ringid + 1), attr1, - -1, -1, -1, -1, -1, -1, upp, 1, -1, -1); + -1, -1, -1, -1, -1, upp, 1, -1, -1); assert(ch11 != NULL); os_channel_destroy(ch10); diff --git a/tests/skywalk/skt_closecfd.c b/tests/skywalk/skt_closecfd.c index 1636cc545..cf127b9ba 100644 --- a/tests/skywalk/skt_closecfd.c +++ b/tests/skywalk/skt_closecfd.c @@ -49,7 +49,7 @@ skt_closecfd_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); @@ -86,7 +86,7 @@ skt_writecfd_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); @@ -124,7 +124,7 @@ skt_readcfd_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); diff --git a/tests/skywalk/skt_debug_verify.c b/tests/skywalk/skt_debug_verify.c index fee21bf0e..dbaef70a1 100644 --- a/tests/skywalk/skt_debug_verify.c +++ b/tests/skywalk/skt_debug_verify.c @@ -62,7 +62,7 @@ skt_debug_verify_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); diff --git a/tests/skywalk/skt_features.c b/tests/skywalk/skt_features.c index ac0f7a395..b5f81fec6 100644 --- a/tests/skywalk/skt_features.c +++ b/tests/skywalk/skt_features.c @@ -54,7 +54,6 @@ skt_features_main(int argc, char *argv[]) assert(features & SK_FEATURE_NETNS); assert(features & SK_FEATURE_NEXUS_USER_PIPE); assert(features & SK_FEATURE_NEXUS_KERNEL_PIPE); - assert(features & SK_FEATURE_NEXUS_MONITOR); assert(features & SK_FEATURE_NEXUS_FLOWSWITCH); assert(features & SK_FEATURE_NEXUS_NETIF); diff --git a/tests/skywalk/skt_filter.c b/tests/skywalk/skt_filter.c index a3233ea0d..e4dcdb653 100644 --- a/tests/skywalk/skt_filter.c +++ b/tests/skywalk/skt_filter.c @@ -383,7 +383,6 @@ custom_ether_verify(packet_t pkt, custom_ether_args_t *args) } } else { struct ether_vlan_header *evh = buf; - boolean_t tag_in_pkt; uint16_t etype, evl_tag, tag; int err; @@ -401,7 +400,7 @@ custom_ether_verify(packet_t pkt, custom_ether_args_t *args) evl_tag = ntohs(evh->evl_tag); /* vlan tag metadata is not expected for this test case */ - err = os_packet_get_vlan_tag(pkt, &tag, &tag_in_pkt); + err = 
os_packet_get_vlan_tag(pkt, &tag); if (err == 0) { SKD1("tag not expected: 0x%x\n", tag); return FALSE; diff --git a/tests/skywalk/skt_flow.c b/tests/skywalk/skt_flow.c index 742df5d4b..f93f368e0 100644 --- a/tests/skywalk/skt_flow.c +++ b/tests/skywalk/skt_flow.c @@ -235,6 +235,87 @@ skt_flow_config_main(int argc, char *argv[]) return 0; } +int +skt_flow_conn_idle_main(int argc, char *argv[]) +{ + ifname = FETH0_NAME; + our_mask = sktc_make_in_addr(IN_CLASSC_NET); + our_ip = sktc_feth0_in_addr(); + dst_ip = sktc_feth1_in_addr(); + + T_LOG("\nTesting flow connection idle API\n"); + + bzero(&handles, sizeof(handles)); + strlcpy(handles.netif_ifname, ifname, sizeof(handles.netif_ifname)); + handles.netif_addr = our_ip; + handles.netif_mask = our_mask; + sktc_create_flowswitch_no_address(&handles, -1, -1, -1, -1, 0); + + T_LOG("add a flow\n"); + struct sktu_flow *flow; + flow = sktu_create_nexus_flow(&handles, AF_INET, &our_ip, &dst_ip, IPPROTO_TCP, 1234, 1234); + assert(flow); + + T_LOG("verify flow default (negative) CONNECTION_IDLE flag\n"); + struct sk_stats_flow sf; + int ret = sktu_get_nexus_flow_stats(flow->uuid, &sf); + assert(ret == 0); + assert((sf.sf_flags & SFLOWF_CONNECTION_IDLE) == 0); + + uuid_t rand_uuid; + do { + uuid_generate(rand_uuid); + } while (uuid_compare(rand_uuid, flow->uuid) == 0); + + // should return ENOENT with mismatching flow uuid + T_LOG("verify ENOENT with INVALID flow\n"); + ret = os_nexus_flow_set_connection_idle(handles.fsw_nx_uuid, rand_uuid, false); + assert(ret != 0); + assert(errno == ENOENT); + + /* should fail with EPERM from another PID */ + T_LOG("verify EPERM with INVALID PID\n"); + int child_pid; + if ((child_pid = fork()) == -1) { + SKT_LOG("fork: %s\n", strerror(errno)); + exit(1); + } + if (child_pid == 0) { + ret = os_nexus_flow_set_connection_idle(handles.fsw_nx_uuid, flow->uuid, false); + exit(errno); + } else { + int child_status; + wait(&child_status); + assert(WIFEXITED(child_status)); + assert(WEXITSTATUS(child_status) == EPERM); + } + + T_LOG("verify setting flow CONNECTION_IDLE\n"); + ret = os_nexus_flow_set_connection_idle(handles.fsw_nx_uuid, flow->uuid, true); + assert(ret == 0); + + ret = sktu_get_nexus_flow_stats(flow->uuid, &sf); + assert(ret == 0); + assert((sf.sf_flags & SFLOWF_CONNECTION_IDLE) != 0); + + T_LOG("verify clearing flow CONNECTION_IDLE\n"); + ret = os_nexus_flow_set_connection_idle(handles.fsw_nx_uuid, flow->uuid, false); + assert(ret == 0); + + ret = sktu_get_nexus_flow_stats(flow->uuid, &sf); + assert(ret == 0); + assert((sf.sf_flags & SFLOWF_CONNECTION_IDLE) == 0); + + T_LOG("verify EPERM with netif nexus\n"); + ret = os_nexus_flow_set_connection_idle(handles.netif_nx_uuid, flow->uuid, true); + assert(ret != 0); + assert(errno == EPERM); + + T_LOG("\n"); + + return 0; +} + int skt_flow_req_main(int argc, char *argv[]) { @@ -320,3 +401,10 @@ struct skywalk_test skt_flow_config = { skt_flow_config_main, { NULL }, skt_flow_req_net_init, skt_flow_req_net_fini, }; + +struct skywalk_test skt_flow_conn_idle = { + "flowconnidle", "test skywalk flow connection idle api", + SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_FLOWSWITCH | SK_FEATURE_NETNS, + skt_flow_conn_idle_main, { NULL }, + skt_flow_req_net_init, skt_flow_req_net_fini, +}; diff --git a/tests/skywalk/skt_fsw29301703.c b/tests/skywalk/skt_fsw29301703.c index 2502677d6..1fb554e2b 100644 --- a/tests/skywalk/skt_fsw29301703.c +++ b/tests/skywalk/skt_fsw29301703.c @@ -86,7 +86,7 @@ skt_fsw29301703_common(int nchannels) /* must fail without user packet pool set (flow 
switch) */ assert(sktu_channel_create_extended(fsw_instance, 2, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1) == NULL); + -1, -1, -1, -1, -1, -1, 1, -1, -1) == NULL); /* * Open many channels from userland to the flowswitch. @@ -95,7 +95,7 @@ skt_fsw29301703_common(int nchannels) for (int i = 0; i < sizeof(channels) / sizeof(channels[0]); i++) { channels[i] = sktu_channel_create_extended(fsw_instance, i + 2, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, 1, 1, -1, -1); + -1, -1, -1, -1, -1, 1, 1, -1, -1); if (!channels[i]) { SKT_LOG("failed on channel %d errno %d\n", 1 + i, errno); result = 1; diff --git a/tests/skywalk/skt_fullupipe.c b/tests/skywalk/skt_fullupipe.c index bac35fbfb..74b8966ee 100644 --- a/tests/skywalk/skt_fullupipe.c +++ b/tests/skywalk/skt_fullupipe.c @@ -81,11 +81,11 @@ skt_fullupipe_main(int argc, char *argv[]) channel0 = sktu_channel_create_extended(instance_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel0); channel1 = sktu_channel_create_extended(instance_uuid, 1, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel1); txring0 = os_channel_tx_ring(channel0, os_channel_ring_id(channel0, CHANNEL_FIRST_TX_RING)); @@ -277,11 +277,11 @@ skt_upipepeerclosure_main(int argc, char *argv[]) channel0 = sktu_channel_create_extended(instance_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel0); channel1 = sktu_channel_create_extended(instance_uuid, 1, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel1); txring0 = os_channel_tx_ring(channel0, os_channel_ring_id(channel0, diff --git a/tests/skywalk/skt_kqueue.c b/tests/skywalk/skt_kqueue.c index a481e4583..0cb6f32a7 100644 --- a/tests/skywalk/skt_kqueue.c +++ b/tests/skywalk/skt_kqueue.c @@ -150,7 +150,7 @@ skt_kqueue_basic_rx(void *ctx_) ctx->nexus_uuid, port, ring_dir, ring_id, ch_attr, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel != NULL); ring_id = os_channel_ring_id(channel, CHANNEL_FIRST_RX_RING); @@ -231,7 +231,7 @@ skt_kqueue_basic_tx(void *ctx_) ctx->nexus_uuid, port, ring_dir, ring_id, ch_attr, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel != NULL); ring_id = os_channel_ring_id(channel, CHANNEL_FIRST_TX_RING); @@ -363,7 +363,7 @@ skt_kqueue_lowat_basic_rx(void *ctx_) ctx->nexus_uuid, port, ring_dir, ring_id, ch_attr, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel != NULL); ring_id = os_channel_ring_id(channel, CHANNEL_FIRST_RX_RING); @@ -482,7 +482,7 @@ skt_kqueue_lowat_basic_tx(void *ctx_) ctx->nexus_uuid, port, ring_dir, ring_id, ch_attr, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel != NULL); slot_size = channel->chd_info->cinfo_nxprov_params.nxp_buf_size; diff --git a/tests/skywalk/skt_mangle.c b/tests/skywalk/skt_mangle.c index dd0893474..0fed829ab 100644 --- a/tests/skywalk/skt_mangle.c +++ b/tests/skywalk/skt_mangle.c @@ -172,7 +172,7 @@ skt_mangle_rx(void *ctx_) ctx->nexus_uuid, port, ring_dir, ring_id, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, 
-1); assert(channel != NULL); ring_id = os_channel_ring_id(channel, CHANNEL_FIRST_RX_RING); @@ -263,7 +263,7 @@ skt_mangle_tx(void *ctx_) ctx->nexus_uuid, port, ring_dir, ring_id, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel != NULL); ring_id = os_channel_ring_id(channel, CHANNEL_FIRST_TX_RING); diff --git a/tests/skywalk/skt_manyflows.c b/tests/skywalk/skt_manyflows.c index a02d2e8f9..f5e6c0090 100644 --- a/tests/skywalk/skt_manyflows.c +++ b/tests/skywalk/skt_manyflows.c @@ -330,7 +330,7 @@ skt_mcflows_main(int argc, char *argv[]) channel = sktu_channel_create_extended(fsw_uuid, NEXUS_PORT_FLOW_SWITCH_CLIENT + child, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); if ((ret = write(MPTEST_SEQ_FILENO, buf, sizeof(buf))) == -1) { diff --git a/tests/skywalk/skt_netifcompat.c b/tests/skywalk/skt_netifcompat.c index 3e84c04d7..75a70de52 100644 --- a/tests/skywalk/skt_netifcompat.c +++ b/tests/skywalk/skt_netifcompat.c @@ -57,12 +57,12 @@ skt_netifcompat_common(void) channel = sktu_channel_create_extended(sktc_instance_uuid, NEXUS_PORT_NET_IF_HOST, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(!channel); channel = sktu_channel_create_extended(sktc_instance_uuid, NEXUS_PORT_NET_IF_DEV, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(!channel); uuid_generate_random(if_uuid); @@ -84,7 +84,7 @@ skt_netifcompat_common(void) channel = sktu_channel_create_extended(sktc_instance_uuid, NEXUS_PORT_NET_IF_DEV, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); if (channel != NULL) { error = __os_nexus_ifdetach(sktc_nexus_controller, sktc_instance_uuid, if_uuid); @@ -254,12 +254,12 @@ skt_fsw_common(const char *name) /* must fail without user packet pool set (flow switch) */ assert(sktu_channel_create_extended(ms_instance, 2, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1) == NULL); + -1, -1, -1, -1, -1, -1, 1, -1, -1) == NULL); /* Open and close channel to the flow switch */ channel = sktu_channel_create_extended(ms_instance, 2, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, 1, 1, -1, -1); + -1, -1, -1, -1, -1, 1, 1, -1, -1); assert(channel); os_channel_destroy(channel); diff --git a/tests/skywalk/skt_nslots.c b/tests/skywalk/skt_nslots.c index 8f622e4c5..76a1d8833 100644 --- a/tests/skywalk/skt_nslots.c +++ b/tests/skywalk/skt_nslots.c @@ -71,7 +71,7 @@ skt_nslots_common(int argc, char *argv[], uint32_t nslots, uint32_t interval, in channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); diff --git a/tests/skywalk/skt_oneslot.c b/tests/skywalk/skt_oneslot.c index e348a3bef..af359abb0 100644 --- a/tests/skywalk/skt_oneslot.c +++ b/tests/skywalk/skt_oneslot.c @@ -64,7 +64,7 @@ skt_oneslot_common(int argc, char *argv[], int method, bool defunct) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = 
os_channel_get_fd(channel); diff --git a/tests/skywalk/skt_pllupipe.c b/tests/skywalk/skt_pllupipe.c index 7e79ca736..f77312d69 100644 --- a/tests/skywalk/skt_pllupipe.c +++ b/tests/skywalk/skt_pllupipe.c @@ -84,7 +84,7 @@ skt_pllupipe_txk_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); ringid = os_channel_ring_id(channel, CHANNEL_FIRST_TX_RING); @@ -110,7 +110,7 @@ skt_pllupipe_txs_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); ringid = os_channel_ring_id(channel, CHANNEL_FIRST_TX_RING); @@ -136,7 +136,7 @@ skt_pllupipe_txp_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); ringid = os_channel_ring_id(channel, CHANNEL_FIRST_TX_RING); @@ -162,7 +162,7 @@ skt_pllupipe_rxk_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); ringid = os_channel_ring_id(channel, CHANNEL_FIRST_RX_RING); @@ -188,7 +188,7 @@ skt_pllupipe_rxs_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); ringid = os_channel_ring_id(channel, CHANNEL_FIRST_RX_RING); @@ -214,7 +214,7 @@ skt_pllupipe_rxp_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); ringid = os_channel_ring_id(channel, CHANNEL_FIRST_RX_RING); diff --git a/tests/skywalk/skt_restricted_port.c b/tests/skywalk/skt_restricted_port.c index 35874d541..814468b2b 100644 --- a/tests/skywalk/skt_restricted_port.c +++ b/tests/skywalk/skt_restricted_port.c @@ -45,44 +45,22 @@ static int skt_reserve_restricted_port() { int error; - int old_first, old_last; - int restricted_port = 55555; - size_t size; - - size = sizeof(old_first); - error = sysctlbyname("net.inet.ip.portrange.first", &old_first, &size, &restricted_port, sizeof(restricted_port)); - SKTC_ASSERT_ERR(!error); - assert(size == sizeof(old_first)); - - size = sizeof(old_last); - error = sysctlbyname("net.inet.ip.portrange.last", &old_last, &size, &restricted_port, sizeof(restricted_port)); - SKTC_ASSERT_ERR(!error); - assert(size == sizeof(old_last)); + int restricted_port = 55555; // restricted for lights_out_management struct sktc_nexus_handles handles; sktc_create_flowswitch(&handles, 0); - uuid_t flow; - /* try reserve one of the restricted ephemeral ports */ + uuid_t flow; uuid_generate_random(flow); error = sktc_bind_tcp4_flow(handles.controller, handles.fsw_nx_uuid, - 0, NEXUS_PORT_FLOW_SWITCH_CLIENT, flow); + restricted_port, NEXUS_PORT_FLOW_SWITCH_CLIENT, flow); SKTC_ASSERT_ERR(error == -1); - SKTC_ASSERT_ERR(errno == EADDRNOTAVAIL); + SKTC_ASSERT_ERR(errno == EADDRINUSE); uuid_clear(flow); sktc_cleanup_flowswitch(&handles); - size = sizeof(old_first); - 
error = sysctlbyname("net.inet.ip.portrange.first", NULL, NULL, &old_first, size); - SKTC_ASSERT_ERR(!error); - assert(size == sizeof(old_first)); - - size = sizeof(old_last); - error = sysctlbyname("net.inet.ip.portrange.last", NULL, NULL, &old_last, size); - SKTC_ASSERT_ERR(!error); - assert(size == sizeof(old_last)); return 0; } @@ -92,7 +70,6 @@ skt_reserve_restricted_port_main(int argc, char *argv[]) return skt_reserve_restricted_port(); } - struct skywalk_test skt_restricted_port = { "restricted_port", "test reserve a restricted ephemeral port", SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_NETIF | SK_FEATURE_NEXUS_FLOWSWITCH | SK_FEATURE_NETNS, diff --git a/tests/skywalk/skt_ringid.c b/tests/skywalk/skt_ringid.c index 096951a9e..329c3f4ca 100644 --- a/tests/skywalk/skt_ringid.c +++ b/tests/skywalk/skt_ringid.c @@ -83,7 +83,7 @@ skt_ringid_main_common(int argc, char *argv[], uint32_t num, channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); fringid = os_channel_ring_id(channel, first); @@ -114,7 +114,7 @@ skt_ringid_main_common(int argc, char *argv[], uint32_t num, channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, ringid, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); ringid2 = os_channel_ring_id(channel, first); @@ -136,7 +136,7 @@ skt_ringid_main_common(int argc, char *argv[], uint32_t num, assert(ringid == lringid + 1); channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, ringid, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(!channel); return 0; diff --git a/tests/skywalk/skt_shutdown.c b/tests/skywalk/skt_shutdown.c index 26ca3bdc7..80fd30b84 100644 --- a/tests/skywalk/skt_shutdown.c +++ b/tests/skywalk/skt_shutdown.c @@ -66,7 +66,7 @@ skt_shutdown_common(int argc, char *argv[], int method) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); diff --git a/tests/skywalk/skt_shutdown2.c b/tests/skywalk/skt_shutdown2.c index 7b193c7bc..1e5a84417 100644 --- a/tests/skywalk/skt_shutdown2.c +++ b/tests/skywalk/skt_shutdown2.c @@ -74,7 +74,7 @@ skt_shutdown2_common(int argc, char *argv[], int method) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); channelfd = os_channel_get_fd(channel); diff --git a/tests/skywalk/skt_steering.c b/tests/skywalk/skt_steering.c index bdb97c8ef..127b01082 100644 --- a/tests/skywalk/skt_steering.c +++ b/tests/skywalk/skt_steering.c @@ -34,6 +34,33 @@ #define TEST_RPORT 45678 #define TEST_QSET_ID 0x0001 +static void +fill_traffic_descriptor_eth(struct ifnet_traffic_descriptor_eth *td, + uint8_t mask, bool fill_valid_td) +{ + ether_addr_t dst_mac_addr = {0}; + int err; + + bzero(td, sizeof(*td)); + + td->eth_common.itd_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_ETH; + td->eth_common.itd_len = sizeof(*td); + td->eth_common.itd_flags = IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND | + IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND; + + td->eth_mask = mask; + if (mask & IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE) { + td->eth_type = (fill_valid_td) ? 
ETHERTYPE_PAE : ETHERTYPE_IP; + } + if (mask & IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR) { + if (fill_valid_td) { + err = sktc_get_mac_addr(FETH0_NAME, dst_mac_addr.octet); + assert(err == 0); + } + bcopy(&dst_mac_addr, &td->eth_raddr, ETHER_ADDR_LEN); + } +} + static void fill_traffic_descriptor_v4(struct ifnet_traffic_descriptor_inet *td) { @@ -106,8 +133,11 @@ skt_steering_main(int argc, char *argv[]) { nexus_controller_t ctl; struct ifnet_traffic_descriptor_inet td; + struct ifnet_traffic_descriptor_eth td_eth; struct ifnet_traffic_rule_action_steer ra; uuid_t v4_rule, v6_rule; + uuid_t eth_type_rule, eth_raddr_rule, eth_rule; + uint8_t mask; int err; ctl = os_nexus_controller_create(); @@ -115,24 +145,113 @@ skt_steering_main(int argc, char *argv[]) fill_traffic_rule_action(&ra); + //Adding v4 rule is successful fill_traffic_descriptor_v4(&td); err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, (struct ifnet_traffic_descriptor_common *)&td, (struct ifnet_traffic_rule_action *)&ra, 0, &v4_rule); assert(err == 0); + //Adding eth & inet rules concurrently is not successful + mask = IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE; + fill_traffic_descriptor_eth(&td_eth, mask, true); + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_type_rule); + assert(err != 0); + + //Adding v6 rule is successful fill_traffic_descriptor_v6(&td); err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, (struct ifnet_traffic_descriptor_common *)&td, (struct ifnet_traffic_rule_action *)&ra, 0, &v6_rule); assert(err == 0); + //Removing v4 rule is successful err = os_nexus_controller_remove_traffic_rule(ctl, v4_rule); assert(err == 0); + //Removing v6 rule is successful err = os_nexus_controller_remove_traffic_rule(ctl, v6_rule); assert(err == 0); + //Adding eth type rule is successful (All the inet rules are removed) + mask = IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE; + fill_traffic_descriptor_eth(&td_eth, mask, true); + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_type_rule); + assert(err == 0); + + //Adding duplicate eth type rule is not successful + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_rule); + assert(err != 0); + + //Adding eth & inet rules concurrently is not successful + fill_traffic_descriptor_v4(&td); + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td, + (struct ifnet_traffic_rule_action *)&ra, 0, &v4_rule); + assert(err != 0); + + //Adding eth raddr rule is successful + mask = IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR; + fill_traffic_descriptor_eth(&td_eth, mask, true); + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_raddr_rule); + assert(err == 0); + + //Adding duplicate eth raddr rule is not successful + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_raddr_rule); + assert(err != 0); + + //Adding a different eth raddr rule is successful + bcopy(ether_aton("11:22:33:44:55:66"), &td_eth.eth_raddr, ETHER_ADDR_LEN); + err = os_nexus_controller_add_traffic_rule(ctl, 
FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_rule); + assert(err == 0); + + //Removing eth type rule is successful + err = os_nexus_controller_remove_traffic_rule(ctl, eth_type_rule); + assert(err == 0); + + //Removing eth raddr rule is successful + err = os_nexus_controller_remove_traffic_rule(ctl, eth_raddr_rule); + assert(err == 0); + err = os_nexus_controller_remove_traffic_rule(ctl, eth_rule); + assert(err == 0); + + //Adding eth type & raddr rule is not successful + mask = IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE | + IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR; + fill_traffic_descriptor_eth(&td_eth, mask, true); + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_rule); + assert(err != 0); + + //Adding invalid eth type rule is not successful + mask = IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_ETHER_TYPE; + fill_traffic_descriptor_eth(&td_eth, mask, false); + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_rule); + assert(err != 0); + + //Adding invalid eth raddr rule is not successful + mask = IFNET_TRAFFIC_DESCRIPTOR_ETH_MASK_RADDR; + fill_traffic_descriptor_eth(&td_eth, mask, false); + err = os_nexus_controller_add_traffic_rule(ctl, FETH0_NAME, + (struct ifnet_traffic_descriptor_common *)&td_eth, + (struct ifnet_traffic_rule_action *)&ra, 0, ð_rule); + assert(err != 0); + os_nexus_controller_destroy(ctl); return 0; } diff --git a/tests/skywalk/skt_teardown.c b/tests/skywalk/skt_teardown.c index e3cf1d94a..9dd35fab8 100644 --- a/tests/skywalk/skt_teardown.c +++ b/tests/skywalk/skt_teardown.c @@ -136,11 +136,11 @@ skt_teardown_pass(int count, int *permute) /* must fail without user packet pool set (flow switch) */ assert(sktu_channel_create_extended(ms_instance, NEXUS_PORT_FLOW_SWITCH_CLIENT, CHANNEL_DIR_TX_RX, - CHANNEL_RING_ID_ANY, NULL, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1) == NULL); + CHANNEL_RING_ID_ANY, NULL, -1, -1, -1, -1, -1, -1, 1, -1, -1) == NULL); channel = sktu_channel_create_extended(ms_instance, NEXUS_PORT_FLOW_SWITCH_CLIENT, CHANNEL_DIR_TX_RX, - CHANNEL_RING_ID_ANY, NULL, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1); + CHANNEL_RING_ID_ANY, NULL, -1, -1, -1, -1, -1, 1, 1, -1, -1); assert(channel); /* Allow us to permute teardown steps */ diff --git a/tests/skywalk/skt_utun27302538.c b/tests/skywalk/skt_utun27302538.c index 3d4d645bd..807eeb3c5 100644 --- a/tests/skywalk/skt_utun27302538.c +++ b/tests/skywalk/skt_utun27302538.c @@ -80,7 +80,7 @@ thread1(void *unused) static void skt_utun27302538_common(void) { - tunsock = sktu_create_interface(SKTU_IFT_UTUN, SKTU_IFF_ENABLE_NETIF); + tunsock = sktu_create_interface(SKTU_IFT_UTUN, SKTU_IFF_ENABLE_NETIF | SKTU_IFF_ENABLE_CHANNEL); assert(tunsock); usleep(100000); diff --git a/tests/skywalk/skt_utunloop.c b/tests/skywalk/skt_utunloop.c index 4e4087d2d..6f9d00502 100644 --- a/tests/skywalk/skt_utunloop.c +++ b/tests/skywalk/skt_utunloop.c @@ -134,6 +134,7 @@ skt_utunloop_xfer_slots(int kq, prevslotcount = slotcount = 0; prevbytecount = bytecount = 0; int stallcount = 0; + then = time(NULL); start = time(NULL); while (!g_die) { @@ -834,7 +835,7 @@ dotraffic(void *(*sourcefunc)(void *), void *(*sinkfunc)(void *), static void -skt_tunloop_common(bool doutun, bool enable_netif, bool udp, bool udpduplex, bool tcp, bool tcpduplex, bool dualstream) 
+skt_tunloop_common(bool doutun, bool enable_netif, bool enable_channel, bool udp, bool udpduplex, bool tcp, bool tcpduplex, bool dualstream) { int error; int utun1, utun2; @@ -871,6 +872,7 @@ skt_tunloop_common(bool doutun, bool enable_netif, bool udp, bool udpduplex, boo sktu_if_type_t type = doutun ? SKTU_IFT_UTUN : SKTU_IFT_IPSEC; sktu_if_flag_t flags = enable_netif ? SKTU_IFF_ENABLE_NETIF : 0; + flags |= enable_channel ? SKTU_IFF_ENABLE_CHANNEL : 0; utun1 = sktu_create_interface(type, flags); utun2 = sktu_create_interface(type, flags); @@ -1011,45 +1013,11 @@ skt_tunloop_common(bool doutun, bool enable_netif, bool udp, bool udpduplex, boo /****************************************************************/ -static int -skt_utunloopn4u1_main(int argc, char *argv[]) -{ - g_assert_stalls12 = true; - skt_tunloop_common(true, false, true, false, false, false, false); - return 0; -} - -static int -skt_utunloopn4u2_main(int argc, char *argv[]) -{ - g_assert_stalls12 = true; - g_assert_stalls21 = true; - skt_tunloop_common(true, false, true, true, false, false, false); - return 0; -} - -static int -skt_utunloopn4t1_main(int argc, char *argv[]) -{ - g_assert_stalls12 = true; - skt_tunloop_common(true, false, false, false, true, false, false); - return 0; -} - -static int -skt_utunloopn4t2_main(int argc, char *argv[]) -{ - g_assert_stalls12 = true; - g_assert_stalls21 = true; - skt_tunloop_common(true, false, false, false, true, true, false); - return 0; -} - static int skt_utunloopy4u1_main(int argc, char *argv[]) { g_assert_stalls12 = true; - skt_tunloop_common(true, true, true, false, false, false, false); + skt_tunloop_common(true, true, true, true, false, false, false, false); return 0; } @@ -1058,7 +1026,7 @@ skt_utunloopy4u2_main(int argc, char *argv[]) { g_assert_stalls12 = true; g_assert_stalls21 = true; - skt_tunloop_common(true, true, true, true, false, false, false); + skt_tunloop_common(true, true, true, true, true, false, false, false); return 0; } @@ -1066,7 +1034,7 @@ static int skt_utunloopy4t1_main(int argc, char *argv[]) { g_assert_stalls12 = true; - skt_tunloop_common(true, true, false, false, true, false, false); + skt_tunloop_common(true, true, true, false, false, true, false, false); return 0; } @@ -1075,48 +1043,17 @@ skt_utunloopy4t2_main(int argc, char *argv[]) { g_assert_stalls12 = true; g_assert_stalls21 = true; - skt_tunloop_common(true, true, false, false, true, true, false); - return 0; -} - -static int -skt_utunloopn1000_main(int argc, char *argv[]) -{ - skt_tunloop_common(true, false, false, false, false, false, false); + skt_tunloop_common(true, true, true, false, false, true, true, false); return 0; } static int skt_utunloopy1000_main(int argc, char *argv[]) { - skt_tunloop_common(true, true, false, false, false, false, false); + skt_tunloop_common(true, true, true, false, false, false, false, false); return 0; } -struct skywalk_test skt_utunloopn4u1 = { - "utunloopn4u1", "open 2 utuns without netif and floods ipv4 udp packets in one direction", - SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_KERNEL_PIPE, - skt_utunloopn4u1_main, -}; - -struct skywalk_test skt_utunloopn4u2 = { - "utunloopn4u2", "open 2 utuns without netif and floods ipv4 udp packets in two directions", - SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_KERNEL_PIPE, - skt_utunloopn4u2_main, -}; - -struct skywalk_test skt_utunloopn4t1 = { - "utunloopn4t1", "open 2 utuns without netif and floods ipv4 tcp packets in one direction", - SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_KERNEL_PIPE, - skt_utunloopn4t1_main, -}; - 
-struct skywalk_test skt_utunloopn4t2 = { - "utunloopn4t2", "open 2 utuns without netif and floods ipv4 tcp packets in two directions", - SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_KERNEL_PIPE, - skt_utunloopn4t2_main, -}; - struct skywalk_test skt_utunloopy4u1 = { "utunloopy4u1", "open 2 utuns with netif and floods ipv4 udp packets in one direction", SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_KERNEL_PIPE, @@ -1141,12 +1078,6 @@ struct skywalk_test skt_utunloopy4t2 = { skt_utunloopy4t2_main, }; -struct skywalk_test skt_utunloopn1000 = { - "utunloopn1000", "open 2 utuns without netif and sleeps for 1000 seconds", - SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_KERNEL_PIPE, - skt_utunloopn1000_main, -}; - struct skywalk_test skt_utunloopy1000 = { "utunloopy1000", "open 2 utuns with netif and sleeps for 1000 seconds", SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_KERNEL_PIPE, @@ -1159,7 +1090,7 @@ static int skt_ipsecloopy4u1_main(int argc, char *argv[]) { g_assert_stalls12 = true; - skt_tunloop_common(false, true, true, false, false, false, false); + skt_tunloop_common(false, true, true, true, false, false, false, false); return 0; } @@ -1168,7 +1099,7 @@ skt_ipsecloopy4u2_main(int argc, char *argv[]) { g_assert_stalls12 = true; g_assert_stalls21 = true; - skt_tunloop_common(false, true, true, true, false, false, false); + skt_tunloop_common(false, true, true, true, true, false, false, false); return 0; } @@ -1176,7 +1107,7 @@ static int skt_ipsecloopy4t1_main(int argc, char *argv[]) { g_assert_stalls12 = true; - skt_tunloop_common(false, true, false, false, true, false, false); + skt_tunloop_common(false, true, true, false, false, true, false, false); return 0; } @@ -1185,14 +1116,14 @@ skt_ipsecloopy4t2_main(int argc, char *argv[]) { g_assert_stalls12 = true; g_assert_stalls21 = true; - skt_tunloop_common(false, true, false, false, true, true, false); + skt_tunloop_common(false, true, true, false, false, true, true, false); return 0; } static int skt_ipsecloopy1000_main(int argc, char *argv[]) { - skt_tunloop_common(false, true, false, false, false, false, false); + skt_tunloop_common(false, true, true, false, false, false, false, false); return 0; } diff --git a/tests/skywalk/skt_writemem.c b/tests/skywalk/skt_writemem.c index 7949d3caf..9c896b8ad 100644 --- a/tests/skywalk/skt_writemem.c +++ b/tests/skywalk/skt_writemem.c @@ -51,7 +51,7 @@ skt_writeif_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); *(char *)channel->chd_schema->csm_kern_name = 'X'; @@ -82,7 +82,7 @@ skt_writering_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); ring_id_t ringid; @@ -136,7 +136,7 @@ skt_readsmap_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); //T_LOG("ch_if 0x%p offset 0x%llx pointer 0x%p\n", @@ -172,7 +172,7 @@ skt_writesmap_main(int argc, char *argv[]) channel = sktu_channel_create_extended(channel_uuid, 0, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); //T_LOG("ch_if 0x%p offset 0x%llx pointer 0x%p\n", @@ 
-291,11 +291,11 @@ skt_nxregion_verify_main(int argc, char *argv[]) /* must fail without user packet pool set (flow switch) */ assert(sktu_channel_create_extended(fsw_uuid, NEXUS_PORT_FLOW_SWITCH_CLIENT, CHANNEL_DIR_TX_RX, - CHANNEL_RING_ID_ANY, NULL, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1) == NULL); + CHANNEL_RING_ID_ANY, NULL, -1, -1, -1, -1, -1, -1, 1, -1, -1) == NULL); channel = sktu_channel_create_extended(fsw_uuid, NEXUS_PORT_FLOW_SWITCH_CLIENT, CHANNEL_DIR_TX_RX, - CHANNEL_RING_ID_ANY, NULL, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1); + CHANNEL_RING_ID_ANY, NULL, -1, -1, -1, -1, -1, 1, 1, -1, -1); assert(channel); switch (test_id) { diff --git a/tests/skywalk/skt_xfer.c b/tests/skywalk/skt_xfer.c index 76e74fb1f..283f48a8d 100644 --- a/tests/skywalk/skt_xfer.c +++ b/tests/skywalk/skt_xfer.c @@ -153,7 +153,7 @@ static const struct fsw_inject_codes fsw_inject_codes[] = { _S1(1, FSW_STATS_RX_FLOW_EXTRACT_ERR), /* ms_copy_to_dev_mbuf() sets mbuf to NULL */ - /*_S2(11, FSW_STATS_DROP, FSW_STATS_DROP_NOMEM_MBUF), */ + _S2(11, FSW_STATS_DROP, FSW_STATS_DROP_NOMEM_MBUF), /* ms_copy_to_dev_pkt() set pkt to NULL */ _S2(12, FSW_STATS_DROP, FSW_STATS_DROP_NOMEM_PKT), @@ -296,6 +296,7 @@ connect_flow(nexus_controller_t ncd, nfr.nfr_daddr.sin.sin_addr = dst_addr; nfr.nfr_flowadv_idx = FLOWADV_IDX_NONE; nfr.nfr_qset_id = qset_id; + error = __os_nexus_flow_add(ncd, fsw, &nfr); if (error) { @@ -1840,8 +1841,8 @@ ping_pong(channel_port_t port, uuid_t flow_id, uint16_t src_port, } if (demux_offset <= MAX_DEMUX_OFFSET) { - payload.data[demux_offset] = DEMUX_PAYLOAD_VALUE; - payload.data[demux_offset + 1] = DEMUX_PAYLOAD_VALUE >> 8; + payload.data[demux_offset] = (char)DEMUX_PAYLOAD_VALUE; + payload.data[demux_offset + 1] = (char)DEMUX_PAYLOAD_VALUE >> 8; } if (child == 0) { @@ -4123,6 +4124,105 @@ skt_xfer_udp_parent_child(int id, uint16_t demux_offset) return 0; } +static int +skt_xfer_rx_flow_steering_drop_packets(int child, bool drop_tx) +{ + char buf[1] = { 0 }; + int error; + const char * ifname; + uuid_t flow_id = {}; + struct in_addr our_ip; + struct in_addr our_mask; + uint16_t our_port; + struct in_addr peer_ip; + uint16_t peer_port; + channel_port port; + ssize_t ret; + flowadv_idx_t flowadv_idx; + struct fsw_stats stats_before, stats_after; + uint64_t counter = 0; + uint16_t flags = 0; + + our_mask = sktc_make_in_addr(IN_CLASSC_NET); + + if (child == 0) { + ifname = FETH0_NAME; + our_ip = sktc_feth0_in_addr(); + peer_ip = sktc_feth1_in_addr(); + our_port = FETH0_PORT; + peer_port = FETH1_PORT; + flags = drop_tx ? NXFLOWREQF_AOP_OFFLOAD : 0; + } else { + child = 1; + ifname = FETH1_NAME; + our_ip = sktc_feth1_in_addr(); + peer_ip = sktc_feth0_in_addr(); + our_port = FETH1_PORT; + peer_port = FETH0_PORT; + flags = !drop_tx ? 
NXFLOWREQF_AOP_OFFLOAD : 0; + } + + /* set up the flowswitch over the right interface */ + error = setup_flowswitch_and_flow(&handles, ifname, IPPROTO_UDP, + flags, our_ip, our_mask, our_port, getpid(), peer_ip, + peer_port, flow_id, &flowadv_idx, -1, -1, -1, -1, false); + if (error == 0) { + sktu_channel_port_init(&port, handles.fsw_nx_uuid, + OUR_FLOWSWITCH_PORT, ENABLE_UPP, false, false); + assert(port.chan != NULL); + } + if ((ret = write(MPTEST_SEQ_FILENO, buf, sizeof(buf))) == -1) { + SKT_LOG("write fail: %s\n", strerror(errno)); + return 1; + } + assert(ret == 1); +#if SKT_XFER_DEBUG + T_LOG("child %d signaled\n", child); +#endif + /* Wait for go signal */ + if ((ret = read(MPTEST_SEQ_FILENO, buf, sizeof(buf))) == -1) { + SKT_LOG("read fail: %s\n", strerror(errno)); + return 1; + } + assert(ret == 1); + if (error != 0) { + return 1; + } +#if SKT_XFER_DEBUG + T_LOG("got input %d from parent in child %d, starting test\n", + buf[0], child); +#endif + port.ip_addr = our_ip; + + if (flags == NXFLOWREQF_AOP_OFFLOAD) { + ret = get_fsw_stats(&stats_before); + assert(ret == 0); + } + + ping_pong(&port, flow_id, our_port, peer_ip, peer_port, + 1, 1, child, TRUE, flowadv_idx, + FALSE, FALSE, MAX_DEMUX_OFFSET + 1); + + if (flags == NXFLOWREQF_AOP_OFFLOAD) { + ret = get_fsw_stats(&stats_after); + assert(ret == 0); + + if (drop_tx) { + counter = STATS_VAL(&stats_after, FSW_STATS_TX_DISABLED); + counter -= STATS_VAL(&stats_before, FSW_STATS_TX_DISABLED); + } else { + counter = STATS_VAL(&stats_after, FSW_STATS_RX_DISABLED); + counter -= STATS_VAL(&stats_before, FSW_STATS_RX_DISABLED); + } + if (counter == 0) { + T_LOG("Offload packets wasn't dropped"); + assert(0); + } + T_LOG("Offload packets dropped %"PRIu64"\n", counter); + } + return 0; +} + static int skt_xfer_udp_main(int argc, char *argv[]) { @@ -4493,6 +4593,32 @@ skt_xfer_parent_child_flow_main_offset_400(int argc, char *argv[]) return skt_xfer_udp_parent_child(child, 400); } +static int +skt_xfer_rx_flow_steering_drop_tx_packets_main(int argc, char *argv[]) +{ + int child; + + assert(!strcmp(argv[3], "--child")); + child = atoi(argv[4]); + + skt_xfer_rx_flow_steering_drop_packets(child, true); + + return 0; +} + +static int +skt_xfer_rx_flow_steering_drop_rx_packets_main(int argc, char *argv[]) +{ + int child; + + assert(!strcmp(argv[3], "--child")); + child = atoi(argv[4]); + + skt_xfer_rx_flow_steering_drop_packets(child, false); + + return 0; +} + static void skt_xfer_init_txstart(void) { @@ -4923,6 +5049,26 @@ skt_xfer_fini_parent_child_flow(void) sktc_restore_fsw_rx_agg_tcp(); } +static void +skt_xfer_init_rx_flow_steering(void) +{ + int rx_flow_steering = 1; + + assert(sysctlbyname("net.link.fake.rx_flow_steering_support", + NULL, 0, &rx_flow_steering, sizeof(rx_flow_steering)) == 0); + skt_xfer_init_native(); +} + +static void +skt_xfer_fini_rx_flow_steering(void) +{ + int rx_flow_steering = 0; + + skt_xfer_fini(); + assert(sysctlbyname("net.link.fake.rx_flow_steering_support", + NULL, 0, &rx_flow_steering, sizeof(rx_flow_steering)) == 0); +} + struct skywalk_mptest skt_xferudp = { "xferudp", "UDP bi-directional transfer over fake ethernet pair", SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_NETIF | @@ -5515,3 +5661,23 @@ struct skywalk_mptest skt_xferparentchildflown_offset_400 = { { NULL, NULL, NULL, NULL, NULL, NULL }, skt_xfer_init_parent_child_flow_native, skt_xfer_fini_parent_child_flow, {}, }; + +struct skywalk_mptest skt_xferrxflowsteeringdroptxpackets = { + "skt_xferrxflowsteeringdroptxpackets", + "drop aop2 offload Tx packets in 
flowswitch", + SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_NETIF | SK_FEATURE_DEV_OR_DEBUG | + SK_FEATURE_NEXUS_FLOWSWITCH | SK_FEATURE_NETNS, + 2, skt_xfer_rx_flow_steering_drop_tx_packets_main, + { NULL, NULL, NULL, NULL, NULL, NULL }, + skt_xfer_init_rx_flow_steering, skt_xfer_fini_rx_flow_steering, {}, +}; + +struct skywalk_mptest skt_xferrxflowsteeringdroprxpackets = { + "skt_xferrxflowsteeringdroprxpackets", + "drop aop2 offload Rx packets in flowswitch", + SK_FEATURE_SKYWALK | SK_FEATURE_NEXUS_NETIF | SK_FEATURE_DEV_OR_DEBUG | + SK_FEATURE_NEXUS_FLOWSWITCH | SK_FEATURE_NETNS, + 2, skt_xfer_rx_flow_steering_drop_rx_packets_main, + { NULL, NULL, NULL, NULL, NULL, NULL }, + skt_xfer_init_rx_flow_steering, skt_xfer_fini_rx_flow_steering, {}, +}; diff --git a/tests/skywalk/skywalk_mptest_driver.c b/tests/skywalk/skywalk_mptest_driver.c index 35075e056..7c5afde0c 100644 --- a/tests/skywalk/skywalk_mptest_driver.c +++ b/tests/skywalk/skywalk_mptest_driver.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "skywalk_test_driver.h" #include "skywalk_test_common.h" @@ -96,6 +97,32 @@ skywalk_mptest_driver_SIGINT_handler(int sig) exit(0); } +static void +print_fsw_stats(void) +{ + struct sk_stats_flow_switch *sfsw; + struct sk_stats_flow_switch *entry; + size_t len; + int ret; + + ret = sktu_get_nexus_flowswitch_stats(&sfsw, &len); + assert(ret == 0); + + os_log(OS_LOG_DEFAULT, "Flowswitch stats\n"); + for (entry = sfsw; (void *)entry < (void *)sfsw + len; entry++) { + uuid_string_t uuid_str; + uuid_unparse_upper(entry->sfs_nx_uuid, uuid_str); + os_log(OS_LOG_DEFAULT, "%s: %s\n", entry->sfs_if_name, uuid_str); + __fsw_stats_print(&entry->sfs_fsws); + } +} + +void +skywalk_mptest_driver_SIGABRT_handler(int s) +{ + print_fsw_stats(); +} + void skywalk_mptest_driver_init(void) { @@ -114,6 +141,7 @@ skywalk_mptest_driver_init(void) curr_test = NULL; signal(SIGINT, skywalk_mptest_driver_SIGINT_handler); + signal(SIGABRT, skywalk_mptest_driver_SIGABRT_handler); } diff --git a/tests/skywalk/skywalk_mptests.c b/tests/skywalk/skywalk_mptests.c index 959c4498b..a84540c5d 100644 --- a/tests/skywalk/skywalk_mptests.c +++ b/tests/skywalk/skywalk_mptests.c @@ -54,8 +54,6 @@ X(xfertcprstflood, "TCP RST flood") \ X(xferudpwitherrors, "UDP bi-directional transfer over native fake ethernet pair with injected errors") \ X(xferudpwitherrorscompat, "UDP bi-directional transfer over compat fake ethernet pair with injected errors") \ - X(xferudpping_aqm, "UDP ping-pong over fake ethernet pair with AQM") \ - X(xferudppingn_aqm, "UDP ping-pong over native fake ethernet pair with AQM") \ X(xfertcpportzero, "TCP connect to port 0") \ X(xferudpportzero, "UDP connect to port 0") \ X(xfersetuponly, "setup fake ethernet pair only") \ @@ -78,7 +76,6 @@ X(netifdirectifadvdisable, "netif interface advisory disabled test") \ X(netifdirectchanevents, "netif interface channel events test") \ X(netifdirectexpiryevents, "netif interface expiry events test") \ - X(xferudpifadvenable, "flowswitch interface advisory enabled test") \ X(xferudpifadvdisable, "flowswitch interface advisory disabled test") \ X(xferudpchanevents, "flowswitch channel events test") \ X(xferudpchaneventsasync, "flowswitch channel events in async mode test") \ @@ -90,7 +87,12 @@ X(xferparentchildflown, "flowswitch parent child flows on native fake ethernet interface test") \ X(xferparentchildflow_offset_400, "flowswitch parent child flows test with demux offset 400") \ X(xferparentchildflown_offset_400, "flowswitch parent child flows on native 
fake ethernet interface test with demux offset 400") \ - X(xferrdudpping, "UDP ping-pong between redirect and fake ethernet interface") + X(xferrdudpping, "UDP ping-pong between redirect and fake ethernet interface") \ + X(xferrxflowsteeringdroptxpackets, "drop aop2 offload Tx packets in flowswitch") + +#define RDAR_133412076_FAILING_TESTS \ + X(xferudpifadvenable, "flowswitch interface advisory enabled test") \ + X(xferrxflowsteeringdroprxpackets, "drop aop2 offload Rx packets in flowswitch") /* * This is equivalent to the following legacy test command: @@ -109,3 +111,18 @@ } BATS_TESTS; #undef X + +#define X(test, desc, ...) \ + T_DECL(test, desc, T_META_NAMESPACE("xnu.skywalk_mptests"), \ + T_META_ENABLED(false) /* rdar://133412076 */ ) \ + { \ + const char *ignorefail_str = getenv("ignorefail"); \ + bool ignorefail = false; \ + if (ignorefail_str) { \ + T_LOG("ignorefail option present"); \ + ignorefail = true; \ + } \ + skywalk_mptest_driver_run(&skt_##test, ignorefail); \ + } +RDAR_133412076_FAILING_TESTS +#undef X diff --git a/tests/skywalk/skywalk_test_common.c b/tests/skywalk/skywalk_test_common.c index 070ad087f..42bca1dbc 100644 --- a/tests/skywalk/skywalk_test_common.c +++ b/tests/skywalk/skywalk_test_common.c @@ -637,7 +637,7 @@ sktc_setup_channel_worker(uuid_t instance_uuid, nexus_port_t channel_port, sktc_channel = sktu_channel_create_extended(instance_uuid, channel_port, CHANNEL_DIR_TX_RX, ringid, attr, - -1, -1, -1, -1, -1, -1, -1, defunct_ok ? 1 : -1, -1, -1); + -1, -1, -1, -1, -1, -1, defunct_ok ? 1 : -1, -1, -1); assert(sktc_channel); if (attr) { @@ -2220,30 +2220,10 @@ sktc_ifnet_add_scoped_default_route(char * ifname, struct in_addr ifa) /* interval in nanoseconds */ int -sktc_set_classq_update_interval(uint64_t ns, sktc_classq_type_t type) +sktc_set_classq_update_interval(uint64_t ns) { int error; - char *sysctl_name; - - switch (type) { - case SKTC_CLASSQ_DEF_C: - sysctl_name = "net.classq.def_c_update_interval"; - break; - case SKTC_CLASSQ_DEF_L4S: - sysctl_name = "net.classq.def_l4s_update_interval"; - break; - case SKTC_CLASSQ_LL_C: - sysctl_name = "net.classq.ll_c_update_interval"; - break; - case SKTC_CLASSQ_LL_L4S: - sysctl_name = "net.classq.ll_l4s_update_interval"; - break; - - default: - assert(0); - __builtin_unreachable(); - break; - } + char *sysctl_name = "net.classq.fq_codel.update_interval"; error = sysctlbyname(sysctl_name, NULL, NULL, &ns, sizeof(ns)); @@ -2262,22 +2242,16 @@ sktc_set_classq_update_intervals(uint64_t ns) { int error; - error = sktc_set_classq_update_interval(ns, SKTC_CLASSQ_DEF_C); - assert(error == 0); - error = sktc_set_classq_update_interval(ns, SKTC_CLASSQ_DEF_L4S); - assert(error == 0); - error = sktc_set_classq_update_interval(ns, SKTC_CLASSQ_LL_C); - assert(error == 0); - error = sktc_set_classq_update_interval(ns, SKTC_CLASSQ_LL_L4S); + error = sktc_set_classq_update_interval(ns); assert(error == 0); return 0; } int -sktc_reset_classq_update_interval(sktc_classq_type_t type) +sktc_reset_classq_update_interval() { - return sktc_set_classq_update_interval(0, type); + return sktc_set_classq_update_interval(0); } int @@ -2288,30 +2262,10 @@ sktc_reset_classq_update_intervals(void) /* interval in nanoseconds */ int -sktc_set_classq_target_qdelay(uint64_t ns, sktc_classq_type_t type) +sktc_set_classq_target_qdelay(uint64_t ns) { int error; - char *sysctl_name; - - switch (type) { - case SKTC_CLASSQ_DEF_C: - sysctl_name = "net.classq.def_c_target_qdelay"; - break; - case SKTC_CLASSQ_DEF_L4S: - sysctl_name = 
"net.classq.def_l4s_target_qdelay"; - break; - case SKTC_CLASSQ_LL_C: - sysctl_name = "net.classq.ll_c_target_qdelay"; - break; - case SKTC_CLASSQ_LL_L4S: - sysctl_name = "net.classq.ll_l4s_target_qdelay"; - break; - - default: - assert(0); - __builtin_unreachable(); - break; - } + char *sysctl_name = "net.classq.fq_codel.target_qdelay"; error = sysctlbyname(sysctl_name, NULL, NULL, &ns, sizeof(ns)); @@ -2330,22 +2284,16 @@ sktc_set_classq_target_qdelays(uint64_t ns) { int error; - error = sktc_set_classq_target_qdelay(ns, SKTC_CLASSQ_DEF_C); - assert(error == 0); - error = sktc_set_classq_target_qdelay(ns, SKTC_CLASSQ_DEF_L4S); - assert(error == 0); - error = sktc_set_classq_target_qdelay(ns, SKTC_CLASSQ_LL_C); - assert(error == 0); - error = sktc_set_classq_target_qdelay(ns, SKTC_CLASSQ_LL_L4S); + error = sktc_set_classq_target_qdelay(ns); assert(error == 0); return 0; } int -sktc_reset_classq_target_qdelay(sktc_classq_type_t type) +sktc_reset_classq_target_qdelay() { - return sktc_set_classq_target_qdelay(0, type); + return sktc_set_classq_target_qdelay(0); } int diff --git a/tests/skywalk/skywalk_test_common.h b/tests/skywalk/skywalk_test_common.h index 4050790ab..377e9de07 100644 --- a/tests/skywalk/skywalk_test_common.h +++ b/tests/skywalk/skywalk_test_common.h @@ -246,7 +246,7 @@ sktc_feth1_in_addr(void) static inline struct in_addr sktc_rd0_in_addr(void) { - _CASSERT(RD0_INADDR == FETH0_INADDR); + static_assert(RD0_INADDR == FETH0_INADDR); return sktc_make_in_addr(RD0_INADDR); } @@ -277,16 +277,10 @@ extern bool sktc_get_flowswitch_nexus(const char *ifname, uuid_t fsw); extern int sktc_ifnet_feth0_set_dequeue_stall(boolean_t enable); extern int sktc_ifnet_feth1_set_dequeue_stall(boolean_t enable); -typedef enum : uint8_t { - SKTC_CLASSQ_DEF_C, - SKTC_CLASSQ_DEF_L4S, - SKTC_CLASSQ_LL_C, - SKTC_CLASSQ_LL_L4S, -} sktc_classq_type_t; -extern int sktc_set_classq_update_interval(uint64_t ns, sktc_classq_type_t type); -extern int sktc_reset_classq_update_interval(sktc_classq_type_t type); -extern int sktc_set_classq_target_qdelay(uint64_t ns, sktc_classq_type_t type); -extern int sktc_reset_classq_target_qdelay(sktc_classq_type_t type); +extern int sktc_set_classq_update_interval(uint64_t ns); +extern int sktc_reset_classq_update_interval(); +extern int sktc_set_classq_target_qdelay(uint64_t ns); +extern int sktc_reset_classq_target_qdelay(); extern int sktc_set_classq_update_intervals(uint64_t ns); extern int sktc_reset_classq_update_intervals(void); extern int sktc_set_classq_target_qdelays(uint64_t ns); diff --git a/tests/skywalk/skywalk_test_driver.c b/tests/skywalk/skywalk_test_driver.c index 1b711fa0a..16392a552 100644 --- a/tests/skywalk/skywalk_test_driver.c +++ b/tests/skywalk/skywalk_test_driver.c @@ -190,7 +190,6 @@ skywalk_test_driver_run(struct skywalk_test *skt, int argc, char **argv, exception_behavior_t behaviors[2]; thread_state_flavor_t flavors[2]; int pid, child_status; - int testid; size_t len; int error; int itercount = -1; @@ -297,7 +296,7 @@ skywalk_test_driver_run(struct skywalk_test *skt, int argc, char **argv, } if (error == -1 && errno != ESRCH) { - SKT_LOG(stderr, "pid_shutdown_sockets: %s", strerror(errno)); + SKT_LOG("pid_shutdown_sockets: %s", strerror(errno)); test_exit(1); } diff --git a/tests/skywalk/skywalk_test_driver.h b/tests/skywalk/skywalk_test_driver.h index df73ca8b6..da675cc59 100644 --- a/tests/skywalk/skywalk_test_driver.h +++ b/tests/skywalk/skywalk_test_driver.h @@ -153,15 +153,10 @@ extern struct skywalk_test skt_utun27302538d; extern struct 
skywalk_test skt_utun27646755; extern struct skywalk_test skt_utun27646755slow; extern struct skywalk_test skt_utunleak; -extern struct skywalk_test skt_utunloopn4u1; -extern struct skywalk_test skt_utunloopn4u2; -extern struct skywalk_test skt_utunloopn4t1; -extern struct skywalk_test skt_utunloopn4t2; extern struct skywalk_test skt_utunloopy4u1; extern struct skywalk_test skt_utunloopy4u2; extern struct skywalk_test skt_utunloopy4t1; extern struct skywalk_test skt_utunloopy4t2; -extern struct skywalk_test skt_utunloopn1000; extern struct skywalk_test skt_utunloopy1000; extern struct skywalk_test skt_ipsecloopy4u1; extern struct skywalk_test skt_ipsecloopy4u2; @@ -230,6 +225,7 @@ extern struct skywalk_test skt_memory; extern struct skywalk_test skt_flow_req; extern struct skywalk_test skt_flow_req_ll; extern struct skywalk_test skt_flow_config; +extern struct skywalk_test skt_flow_conn_idle; extern struct skywalk_test skt_flowlookup; extern struct skywalk_test skt_libcuckoo; extern struct skywalk_test skt_restricted_port; @@ -310,6 +306,8 @@ extern struct skywalk_mptest skt_xferparentchildflown; extern struct skywalk_mptest skt_xferparentchildflow_offset_400; extern struct skywalk_mptest skt_xferparentchildflown_offset_400; extern struct skywalk_mptest skt_xferrdudpping; +extern struct skywalk_mptest skt_xferrxflowsteeringdroptxpackets; +extern struct skywalk_mptest skt_xferrxflowsteeringdroprxpackets; extern struct skywalk_mptest_check skt_filternative_check; extern struct skywalk_mptest_check skt_filtercompat_check; diff --git a/tests/skywalk/skywalk_test_utils.c b/tests/skywalk/skywalk_test_utils.c index 00bafbb42..5cec47db2 100644 --- a/tests/skywalk/skywalk_test_utils.c +++ b/tests/skywalk/skywalk_test_utils.c @@ -252,14 +252,11 @@ sktc_build_nexus(nexus_controller_t ncd, struct sktc_nexus_attr *sktc_attr, #define SKTU_CHANNEL_CREATE_NOMEM_RETRIES 16 channel_t -sktu_channel_create_extended(const uuid_t uuid, - const nexus_port_t port, const ring_dir_t dir, - const ring_id_t rid, const channel_attr_t attr, - uint64_t exclusive, uint64_t monitor, - uint64_t txlowatunit, uint64_t txlowatval, - uint64_t rxlowatunit, uint64_t rxlowatval, - uint64_t userpacketpool, uint64_t defunctok, - uint64_t event_ring, uint64_t low_latency) +sktu_channel_create_extended(const uuid_t uuid, const nexus_port_t port, + const ring_dir_t dir, const ring_id_t rid, const channel_attr_t attr, + uint64_t exclusive, uint64_t txlowatunit, uint64_t txlowatval, + uint64_t rxlowatunit, uint64_t rxlowatval, uint64_t userpacketpool, + uint64_t defunctok, uint64_t event_ring, uint64_t low_latency) { channel_attr_t tmpattr; int error; @@ -279,11 +276,6 @@ sktu_channel_create_extended(const uuid_t uuid, SKTC_ASSERT_ERR(!error); } - if (monitor != -1) { - error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_MONITOR, monitor); - SKTC_ASSERT_ERR(!error); - } - if (txlowatunit != -1) { error = os_channel_attr_set(tmpattr, CHANNEL_ATTR_TX_LOWAT_UNIT, txlowatunit); SKTC_ASSERT_ERR(!error); @@ -340,12 +332,6 @@ retry: assert(scratch != 1); assert(exclusive == -1 || exclusive == scratch); - scratch = -1; - error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_MONITOR, &scratch); - SKTC_ASSERT_ERR(!error); - assert(scratch != -1); - assert(exclusive == -1 || monitor == scratch); - scratch = -1; error = os_channel_attr_get(tmpattr, CHANNEL_ATTR_TX_LOWAT_UNIT, &scratch); SKTC_ASSERT_ERR(!error); @@ -744,7 +730,7 @@ sktc_bind_tcp4_flow(nexus_controller_t ncd, const uuid_t fsw, in_port_t in_port, // XXX fails, see the fswbind25 for 
standalone test for this assert(nfr.nfr_nx_port == nx_port); - T_LOG("got ephemeral port %d\n", ntohs(nfr.nfr_saddr.sin.sin_port)); + SKT_LOG("got ephemeral port %d\n", ntohs(nfr.nfr_saddr.sin.sin_port)); /* Validate the ephemeral ports */ if (!error && !in_port) { @@ -1085,8 +1071,8 @@ sktu_create_interface(sktu_if_type_t type, sktu_if_flag_t flags) int error; int tunsock; const char *CONTROL_NAME; - int OPT_ENABLE_NETIF, OPT_ATTACH_FSW; - int enable_netif, attach_fsw; + int OPT_ENABLE_NETIF, OPT_ATTACH_FSW, OPT_ENABLE_CHANNEL; + int enable_netif, attach_fsw, enable_channel; int scratch; assert(type == SKTU_IFT_UTUN || type == SKTU_IFT_IPSEC); @@ -1094,14 +1080,17 @@ sktu_create_interface(sktu_if_type_t type, sktu_if_flag_t flags) CONTROL_NAME = UTUN_CONTROL_NAME; OPT_ENABLE_NETIF = UTUN_OPT_ENABLE_NETIF; OPT_ATTACH_FSW = UTUN_OPT_ATTACH_FLOWSWITCH; + OPT_ENABLE_CHANNEL = UTUN_OPT_ENABLE_CHANNEL; } else { CONTROL_NAME = IPSEC_CONTROL_NAME; OPT_ENABLE_NETIF = IPSEC_OPT_ENABLE_NETIF; OPT_ATTACH_FSW = 0; + OPT_ENABLE_CHANNEL = IPSEC_OPT_ENABLE_CHANNEL; } enable_netif = ((flags & SKTU_IFF_ENABLE_NETIF) != 0) ? 1 : 0; attach_fsw = ((flags & SKTU_IFF_NO_ATTACH_FSW) != 0) ? 0 : 1; + enable_channel = ((flags & SKTU_IFF_ENABLE_CHANNEL) != 0) ? 1 : 0; /* XXX Remove this retry nonsense when this is fixed: * creating an interface without specifying specific interface name should not return EBUSY @@ -1151,6 +1140,14 @@ sktu_create_interface(sktu_if_type_t type, sktu_if_flag_t flags) assert(scratchlen == sizeof(scratch)); assert(enable_netif == scratch); + error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &enable_channel, sizeof(enable_channel)); + SKTC_ASSERT_ERR(!error); + scratchlen = sizeof(scratch); + error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &scratch, &scratchlen); + SKTC_ASSERT_ERR(!error); + assert(scratchlen == sizeof(scratch)); + assert(enable_channel == scratch); + /* only applicable for utun */ if (type == SKTU_IFT_UTUN) { error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ATTACH_FSW, &attach_fsw, sizeof(attach_fsw)); @@ -1164,6 +1161,7 @@ sktu_create_interface(sktu_if_type_t type, sktu_if_flag_t flags) tunsock = -1; continue; } + SKTC_ASSERT_ERR(!error); error = fcntl(tunsock, F_SETFD, FD_CLOEXEC); if (error != 0) { @@ -1190,34 +1188,15 @@ sktu_create_interface_channel(sktu_if_type_t type, int tunsock) channel_t channel; socklen_t uuidlen; int error; - int OPT_ENABLE_CHANNEL; int OPT_GET_CHANNEL_UUID; if (type == SKTU_IFT_UTUN) { - OPT_ENABLE_CHANNEL = UTUN_OPT_ENABLE_CHANNEL; OPT_GET_CHANNEL_UUID = UTUN_OPT_GET_CHANNEL_UUID; } else { assert(type == SKTU_IFT_IPSEC); - OPT_ENABLE_CHANNEL = IPSEC_OPT_ENABLE_CHANNEL; OPT_GET_CHANNEL_UUID = IPSEC_OPT_GET_CHANNEL_UUID; } - if (type == SKTU_IFT_UTUN) { - int enable = 1; - error = setsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &enable, sizeof(enable)); - if (error != 0) { - SKT_LOG("setsockopt returned error %d, errno %d\n", error, errno); - } - SKTC_ASSERT_ERR(error == 0); - } - - int scratch; - socklen_t scratchlen = sizeof(scratch); - error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_ENABLE_CHANNEL, &scratch, &scratchlen); - SKTC_ASSERT_ERR(!error); - assert(scratchlen == sizeof(scratch)); - assert(1 == scratch); - uuidlen = sizeof(uuid); error = getsockopt(tunsock, SYSPROTO_CONTROL, OPT_GET_CHANNEL_UUID, uuid, &uuidlen); SKTC_ASSERT_ERR(error == 0); @@ -1227,7 +1206,7 @@ sktu_create_interface_channel(sktu_if_type_t type, int tunsock) channel = sktu_channel_create_extended(uuid, 
NEXUS_PORT_KERNEL_PIPE_CLIENT, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, attr, - -1, -1, -1, -1, -1, -1, -1, 1, -1, -1); + -1, -1, -1, -1, -1, -1, 1, -1, -1); assert(channel); return channel; @@ -1648,7 +1627,7 @@ sktu_channel_port_init(channel_port_t ch_port, uuid_t instance, bzero(ch_port, sizeof(*ch_port)); chan = sktu_channel_create_extended(instance, port, CHANNEL_DIR_TX_RX, CHANNEL_RING_ID_ANY, NULL, - -1, -1, -1, -1, -1, -1, enable_upp ? 1 : -1, 1, + -1, -1, -1, -1, -1, enable_upp ? 1 : -1, 1, enable_event_ring ? 1 : -1, low_latency ? 1 : -1); if (chan == NULL) { SKT_LOG("Can't open channel on port %d, %s\n", port, diff --git a/tests/skywalk/skywalk_test_utils.h b/tests/skywalk/skywalk_test_utils.h index 0b2439dd6..fa644160a 100644 --- a/tests/skywalk/skywalk_test_utils.h +++ b/tests/skywalk/skywalk_test_utils.h @@ -46,6 +46,7 @@ typedef enum sktu_if_type { #define SKTU_IFF_ENABLE_NETIF 0x00000001 // no-netif (txstart bsd interface) by default #define SKTU_IFF_NO_ATTACH_FSW 0x00000002 // auto-attach fsw for netif by default +#define SKTU_IFF_ENABLE_CHANNEL 0x00000004 // auto-attach kpipe typedef uint32_t sktu_if_flag_t; typedef struct sktc_nexus_handles { @@ -213,14 +214,11 @@ typedef struct sktu_flow { } *sktu_nexus_flow_t; channel_t -sktu_channel_create_extended(const uuid_t uuid, - const nexus_port_t port, const ring_dir_t dir, - const ring_id_t rid, const channel_attr_t attr, - uint64_t exclusive, uint64_t monitor, - uint64_t txlowatunit, uint64_t txlowatval, - uint64_t rxlowatunit, uint64_t rxlowatval, - uint64_t userpacketpool, uint64_t defunctok, - uint64_t event_ring, uint64_t low_latency); +sktu_channel_create_extended(const uuid_t uuid, const nexus_port_t port, + const ring_dir_t dir, const ring_id_t rid, const channel_attr_t attr, + uint64_t exclusive, uint64_t txlowatunit, uint64_t txlowatval, + uint64_t rxlowatunit, uint64_t rxlowatval, uint64_t userpacketpool, + uint64_t defunctok, uint64_t event_ring, uint64_t low_latency); void permutefuncP(int n, int *permute, void (*func)(int, int *permute)); void permutefuncH(int n, int *permute, void (*func)(int, int *permute)); void permutefuncR(int n, int *permute, void (*func)(int, int *permute), int total, unsigned seed); diff --git a/tests/skywalk_test.entitlements b/tests/skywalk_test.entitlements index e41e0106e..623cdd122 100644 --- a/tests/skywalk_test.entitlements +++ b/tests/skywalk_test.entitlements @@ -20,11 +20,11 @@ com.apple.security.network.server - com.apple.private.network.restricted.port.lights_out_management - com.apple.private.skywalk.low-latency-channel com.apple.private.set-exception-port - + + com.apple.private.network.aop2_offload + diff --git a/tests/skywalk_tests.c b/tests/skywalk_tests.c index 757187c3d..47516631c 100644 --- a/tests/skywalk_tests.c +++ b/tests/skywalk_tests.c @@ -100,7 +100,6 @@ T_GLOBAL_META( X(kqueue_lowat_note_slots, "test kqueue low watermark (slot watermark on knote)") \ X(change_len, "test kernel resilience to modified slot lengths") \ X(big_len, "test unrealistically large slot lengths") \ - X(internalizemetdata, "test internalizex packet metadata verification") \ X(ringidtx, "test opening tx ringids") \ X(ringidrx, "test opening rx ringids") \ X(closekqk, "test closing kqueue in kqueue") \ @@ -112,10 +111,7 @@ T_GLOBAL_META( X(bindupipeanon, "test binds a channel to an anonymous user pipe nexus") \ X(bindupipekey, "test binds a channel to a non-anonymous user pipe nexus") \ X(bindfswanon, "test binds a channel to an anonymous flowswitch nexus") \ - X(bindfswkey, "test binds a 
channel to a non-anonymous flowswitch nexus") \ X(bindnetifkey, "test binds a channel to a non-anonymous netif nexus") \ - X(flowswitch_ns_reserve, "test confirms that flowswitches can reserve L4 ports") \ - X(flowswitch_ns_reserve2, "thorough test of netns for both BSD & flowswitch, IPv4/v6") \ X(netifcompata, "Test setup and teardown netifcompat on feth0") \ X(netifcompatb, "Test setup and teardown netifcompat on feth0 with deferred close channel") \ X(netifcompatc, "Test setup and teardown netifcompat on feth0 with deferred detach and close channel") \ @@ -126,31 +122,25 @@ T_GLOBAL_META( X(fsw29301703a, "Test open 63 channels to a flowswitch") \ X(fsw29301703b, "Test open 200 channels to a flowswitch") \ X(fsw29301703c, "Open too many channels to a flowswitch") \ - X(fswbindany, "Test attempts to bind to port -1 of flowswitch") \ X(fswbind0, "Test attempts to bind to port 0 of flowswitch") \ X(fswbind1, "Test attempts to bind to port 1 of flowswitch") \ X(fswbind512, "Test attempts to bind to port 512 of flowswitch") \ - X(fswbind2, "Test attempts to bind to port 2 of flowswitch") \ - X(fswbind5, "Test attempts to bind to port 5 of flowswitch") \ X(fullupipe, "Test rx on full tx pipe") \ X(upipepeerclosure, "Test channel operations on upipe with no peer") \ - X(listener, "Test skywalk listener flow creation check") \ - X(listener_stress, "Test stress skywalk listener flows") \ - X(listener_reuse, "Test stress skywalk listener reuse") \ X(copy_cksum_single, "Test copy/checksum code: single buffer") \ X(copy_cksum_multi, "Test copy/checksum code: buffer chain") \ + X(reass, "UDP fragmentation reassembly (channel flow Rx)") \ + X(reass_default_setting, "UDP fragmentation reassembly (channel flow Rx) (without forcing ip_reass sysctl)") \ X(reass_timeout, "Test send partial fragment to flowswitch and check for ICMPv6 time exceeded reply") \ X(reass_bad_fraglen, "Test send fragment with bad fragment length (!= 8*) to flowswitch and check for ICMPv6 param header reply") \ X(reass_atomic, "Test send atomic ICMP echo fragment to flowswitch and check for reply") \ X(reass_fuzz_queue_limit, "Test fuzz flowswitch to hit fragment limit") \ X(cksum, "Test checksum code") \ X(memory, "Test skmem allocator basic and advanced tests") \ - X(flow_req, "Test skywalk flow request api") \ X(flow_req_ll, "Test skywalk flow request api for low latency flows") \ X(flow_config, "Test skywalk flow config api") \ - X(flowlookup, "Test test flow lookup by send/receive of packets") \ + X(flow_conn_idle, "Test skywalk flow connection idle api") \ X(libcuckoo, "Test Cuckoo hashtable library basic and advanced tests") \ - X(restricted_port, "Test reserve a restricted ephemeral port") \ X(steering, "Test steering rules") \ X(listen_stress, "Test stress posix socket listen") \ X(pllutxk, "Test send 10000000 slots to upipe sink using kqueue") @@ -182,10 +172,6 @@ T_GLOBAL_META( X(utun27302538c, "test cleaning up utun kpipe while channel is in kevent (case c)") \ X(utun27302538d, "test cleaning up utun kpipe while channel is in kevent (case d)") \ X(utun27646755, "race cleaning up channel and utun socket (20 iterations)") \ - X(utunloopn4u1, "open 2 utuns without netif and floods ipv4 udp packets in one direction") \ - X(utunloopn4u2, "open 2 utuns without netif and floods ipv4 udp packets in two directions") \ - X(utunloopn4t1, "open 2 utuns without netif and floods ipv4 tcp packets in one direction") \ - X(utunloopn4t2, "open 2 utuns without netif and floods ipv4 tcp packets in two directions") \ X(utunloopy4u1, "open 2 
utuns with netif and floods ipv4 udp packets in one direction") \ X(utunloopy4u2, "open 2 utuns with netif and floods ipv4 udp packets in two directions") \ X(utunloopy4t1, "open 2 utuns with netif and floods ipv4 tcp packets in one direction") \ @@ -241,7 +227,6 @@ T_GLOBAL_META( X(kqueue_lowat_note_bytes, "test kqueue low watermark (byte watermark on knote)") \ X(change_len, "test kernel resilience to modified slot lengths") \ X(big_len, "test unrealistically large slot lengths") \ - X(internalizemetdata, "test internalizex packet metadata verification") \ X(ringidtx, "test opening tx ringids") \ X(ringidrx, "test opening rx ringids") \ X(debug_verify_u, "test confirms that skywalk is storing checksums of slots received on a upipe when in SKF_VERIFY debug mode") \ @@ -251,10 +236,6 @@ T_GLOBAL_META( X(utun27302538c, "test cleaning up utun kpipe while channel is in kevent (case c)") \ X(utun27302538d, "test cleaning up utun kpipe while channel is in kevent (case d)") \ X(utun27646755, "race cleaning up channel and utun socket (20 iterations)") \ - X(utunloopn4u1, "open 2 utuns without netif and floods ipv4 udp packets in one direction") \ - X(utunloopn4u2, "open 2 utuns without netif and floods ipv4 udp packets in two directions") \ - X(utunloopn4t1, "open 2 utuns without netif and floods ipv4 tcp packets in one direction") \ - X(utunloopn4t2, "open 2 utuns without netif and floods ipv4 tcp packets in two directions") \ X(utunloopy4u1, "open 2 utuns with netif and floods ipv4 udp packets in one direction") \ X(utunloopy4u2, "open 2 utuns with netif and floods ipv4 udp packets in two directions") \ X(utunloopy4t1, "open 2 utuns with netif and floods ipv4 tcp packets in one direction") \ @@ -264,12 +245,9 @@ T_GLOBAL_META( X(ipsecloopy4t1, "open 2 ipsecs with netif and floods ipv4 tcp packets in one direction") \ X(ipsecloopy4t2, "open 2 ipsecs with netif and floods ipv4 tcp packets in two directions") \ X(bindupipekey, "test binds a channel to a non-anonymous user pipe nexus") \ - X(bindfswkey, "test binds a channel to a non-anonymous flowswitch nexus") \ X(netifcompata, "Test setup and teardown netifcompat on feth0") \ X(netifcompatb, "Test setup and teardown netifcompat on feth0 with deferred close channel") \ X(netifcompatc, "Test setup and teardown netifcompat on feth0 with deferred detach and close channel") \ - X(teardown, "Test setup complicated topology tear it down") \ - X(teardownb, "Test setup complicated topology tear it down backwards") \ X(teardownr, "setup complicated topology tear it down randomly (1000 iterations)") \ X(teardownz, "setup complicated topology tear it down with each stage in an out of order position") \ X(fsw29301703a, "Test open 63 channels to a flowswitch") \ @@ -288,18 +266,26 @@ T_GLOBAL_META( X(pllurxs, "receive 10000000 slots from upipe source using select") \ X(pllurxp, "receive 10000000 slots to upipe source using poll") -/* - * These tests have failure rate > 2%, so we turn on extra logging. 
- */ -#define BATS_FAILING_TESTS \ - X(utunloopn4u1, "open 2 utuns without netif and floods ipv4 udp packets in one direction") \ - X(utunloopn4u2, "open 2 utuns without netif and floods ipv4 udp packets in two directions") \ - X(utunloopn4t1, "open 2 utuns without netif and floods ipv4 tcp packets in one direction") \ - X(utunloopn4t2, "open 2 utuns without netif and floods ipv4 tcp packets in two directions") \ +#define RDAR_145328590_FAILING_TESTS \ + X(flow_req, "Test skywalk flow request api") \ + X(flowlookup, "Test test flow lookup by send/receive of packets") \ + X(flowswitch_ns_reserve, "test confirms that flowswitches can reserve L4 ports") \ + X(flowswitch_ns_reserve2, "thorough test of netns for both BSD & flowswitch, IPv4/v6") \ + X(fswbind2, "Test attempts to bind to port 2 of flowswitch") \ + X(fswbind5, "Test attempts to bind to port 5 of flowswitch") \ + X(fswbindany, "Test attempts to bind to port -1 of flowswitch") \ + X(internalizemetdata, "test internalizex packet metadata verification") \ + X(listener, "Test skywalk listener flow creation check") \ + X(listener_stress, "Test stress skywalk listener flows") \ + X(listener_reuse, "Test stress skywalk listener reuse") \ + X(restricted_port, "Test reserve a restricted ephemeral port") \ + X(teardown, "Test setup complicated topology tear it down") \ + X(teardownb, "Test setup complicated topology tear it down backwards") \ X(utunloopy4u1, "open 2 utuns with netif and floods ipv4 udp packets in one direction") \ X(utunloopy4u2, "open 2 utuns with netif and floods ipv4 udp packets in two directions") \ X(utunloopy4t1, "open 2 utuns with netif and floods ipv4 tcp packets in one direction") \ - X(utunloopy4t2, "open 2 utuns with netif and floods ipv4 tcp packets in two directions") + X(utunloopy4t2, "open 2 utuns with netif and floods ipv4 tcp packets in two directions") \ + X(bindfswkey, "test binds a channel to a non-anonymous flowswitch nexus") #define EXPAND_TO_T_DECL_COMMON(test, desc) \ { \ @@ -422,17 +408,3 @@ T_DECL_REF(noop_memcleanup, noop, "run noop test to cleanup memory failure sysct EXPAND_TO_T_DECL_COMMON(test, desc) SHUTDOWN_TESTS; #undef X - -/* - * These tests are known to have failure rate > 2% so we turn on extra logging. - * Uncrustify does not handle T_META_MAYFAIL being used in X macros properly. - */ -/* BEGIN IGNORE CODESTYLE */ -#define X(test, desc, ...) 
\ - T_DECL(failing_##test, desc, \ - T_META_SYSCTL_INT("kern.skywalk.verbose=16492674416640"), \ - T_META_MAYFAIL("rdar://126364642, bind call fails with EADDRNOTAVAIL")) \ - EXPAND_TO_T_DECL_COMMON(test, desc) -BATS_FAILING_TESTS; -#undef X -/* END IGNORE CODESTYLE */ diff --git a/tests/socket_bind_35243417.c b/tests/socket_bind_35243417.c index a7a1ea879..c65e5b6b5 100644 --- a/tests/socket_bind_35243417.c +++ b/tests/socket_bind_35243417.c @@ -8,6 +8,8 @@ #include #include +#include "net_test_lib.h" + static int sockv6_open(void) { @@ -87,6 +89,9 @@ loop_done: T_ASSERT_TRUE(bound_count == bind_attempts, "number of successful binds %d (out of %d)", bound_count, bind_attempts); + + force_zone_gc(); + return success; } diff --git a/tests/socket_bind_35685803.c b/tests/socket_bind_35685803.c index a183ced5b..ef945315c 100644 --- a/tests/socket_bind_35685803.c +++ b/tests/socket_bind_35685803.c @@ -13,6 +13,8 @@ #include #include +#include "net_test_lib.h" + T_GLOBAL_META(T_META_RUN_CONCURRENTLY(true)); static bool debug; @@ -177,6 +179,8 @@ run_multithreaded_bind_test(int number_of_runs, bool v6, int socket_count) multithreaded_bind_test(v6, socket_count); } T_PASS("multithreaded_bind_test %s", v6 ? "IPv6" : "IPv4"); + + force_zone_gc(); #endif /* TARGET_OS_BRIDGE */ } diff --git a/tests/socket_v4mappedv6.c b/tests/socket_v4mappedv6.c index b604f4c13..56d616eb6 100644 --- a/tests/socket_v4mappedv6.c +++ b/tests/socket_v4mappedv6.c @@ -20,7 +20,8 @@ sockv6_open(void) T_DECL(v4_mapped_v6_ops, "v4 mapped v6 sock operations around bind/connect", T_META_ASROOT(false), - T_META_CHECK_LEAKS(false)) + T_META_CHECK_LEAKS(false), + T_META_ENABLED(false) /* rdar://134506000 */) { int s6 = -1; int ret = 0; diff --git a/tests/stackshot_tests.m b/tests/stackshot_tests.m index 1bbd737a1..411257262 100644 --- a/tests/stackshot_tests.m +++ b/tests/stackshot_tests.m @@ -18,6 +18,7 @@ #include #include #include +#import #import #import @@ -156,6 +157,7 @@ struct scenario { bool no_recordfile; pid_t target_pid; bool target_kernel; + bool nocheck_recordfile; uint64_t since_timestamp; uint32_t size_hint; dt_stat_time_t timer; @@ -267,6 +269,61 @@ retry: ; T_QUIET; T_ASSERT_POSIX_SUCCESS(written, "wrote stackshot to file"); fclose(f); + + // the xnu lldbmacros include a kcdata dumper which is used by + // panic triage and other things to process the recorded data + // from panics. With `-s foo.ips`, this generates a panic + // report similar. It's really important that this continues to + // work. + // + // We also ship the same code as /usr/local/bin/kcdata. To make + // sure the *.ips continues to work without aborting or otherwise + // tripping over the current data being output by xnu, we do a + // `kcdata.py -s /dev/null` run on the *first* kcdata we get for + // a given test, and save the stdout/err to files that get + // reported in the test report. Typically it will tell you the + // shared cache UUID and maybe complain about missing exclaves + // data. 
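+	// (Roughly the same check can be run by hand against a saved
+	// stackshot: `/usr/local/bin/kcdata -s /dev/null <file>.kcdata`
+	// should exit with status 0, which is what the dt_waitpid()
+	// expectations below assert.)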
+ // + // This only works on full stackshots, so we skip it for DELTAs, + // and BridgeOS is missing python, so we make sure everything we + // need is executable before trying +#define PYTHON3_PATH "/usr/local/bin/python3" +#define KCDATA_PATH "/usr/local/bin/kcdata" + if (!(scenario->flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) && + !scenario->nocheck_recordfile && + access(PYTHON3_PATH, X_OK) == 0 && access(KCDATA_PATH, X_OK) == 0) { + + scenario->nocheck_recordfile = true; // don't do this more than once per scenario + char outpath[MAXPATHLEN]; + strlcpy(outpath, scenario->name, sizeof(outpath)); + strlcat(outpath, ".kcdpy-out", sizeof(outpath)); + char errpath[MAXPATHLEN]; + strlcpy(errpath, scenario->name, sizeof(errpath)); + strlcat(errpath, ".kcdpy-err", sizeof(errpath)); + T_QUIET; T_ASSERT_POSIX_ZERO(dt_resultfile(outpath, sizeof(outpath)), "create py-out path"); + T_QUIET; T_ASSERT_POSIX_ZERO(dt_resultfile(errpath, sizeof(errpath)), "create py-err path"); + + char *launch_tool_args[] = { + KCDATA_PATH, + "-s", + "/dev/null", + sspath, + NULL + }; + pid_t child_pid = -1; + int ret = dt_launch_tool(&child_pid, launch_tool_args, false, outpath, errpath); + T_WITH_ERRNO; T_EXPECT_EQ(ret, 0, "dt_launch_tool(\"" KCDATA_PATH " -s /dev/null kcdata\") should succeed"); + if (ret == 0) { + int exit_status = 0, signum = 0; + ret = dt_waitpid(child_pid, &exit_status, &signum, 60); + T_QUIET; T_EXPECT_EQ(ret, 1, "dt_waitpid() on "KCDATA_PATH); + if (ret == 1) { + T_EXPECT_EQ(exit_status, 0, "kcdata.py should successfully run against our output"); + T_QUIET; T_EXPECT_EQ(signum, 0, "kcdata.py shouldn't get a signal"); + } + } + } } cb(buf, size); if (compress_ok) { @@ -1563,6 +1620,8 @@ T_DECL(dump_page_tables, "test stackshot page table dumping support", T_META_TAG }); } + + static void stackshot_verify_current_proc_uuid_info(void **ssbuf, size_t sslen, uint64_t expected_offset, const struct proc_uniqidentifierinfo *proc_info_data) { const uuid_t *current_uuid = (const uuid_t *)(&proc_info_data->p_uuid); @@ -2255,10 +2314,11 @@ T_DECL(throttled_sp, } +char *const clpc_path = "/usr/local/bin/clpc"; char *const clpcctrl_path = "/usr/local/bin/clpcctrl"; static void -run_clpcctrl(char *const argv[]) { +run_clpc(char *const argv[]) { posix_spawnattr_t sattr; pid_t pid; int wstatus; @@ -2277,17 +2337,73 @@ run_clpcctrl(char *const argv[]) { } static void -restore_clpcctrl() { - run_clpcctrl((char *const []) { clpcctrl_path, "-d", NULL }); +restore_clpc() { + /* + * For some reason, the new CLPC utility always returns with a nonzero + * exit status when re-enabling dynamic control. So, we use the old + * one here. + */ + run_clpc((char *const []) { clpcctrl_path, "-d", NULL }); } -#define CLUSTER_TYPE_SMP 0 -#define CLUSTER_TYPE_E 1 -#define CLUSTER_TYPE_P 2 +struct cpu_cluster { + int type; + uint64_t mask; +}; + +static NSArray* +get_cpu_clusters() { + NSTask *task = [[NSTask alloc] init]; + [task setLaunchPath:[NSString stringWithUTF8String:clpc_path]]; + [task setArguments:@[@"topologies", @"-f", @"json"]]; + + NSPipe *pipe = [NSPipe pipe]; + [task setStandardOutput:pipe]; + [task setStandardError:nil]; + + [task launch]; + [task waitUntilExit]; + + NSData *data = [[pipe fileHandleForReading] readDataToEndOfFile]; + NSString *data_string = [[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding]; + + /* + * The CLPC util outputs the CPU and ANE topology as JSON objects _not_ + * separated by a comma, so we have to fix it up... 
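+	 * In other words, the raw output is two top-level JSON objects back
+	 * to back (the CPU topology object first, then the ANE one).
+	 * Splicing a comma in between and wrapping the whole thing in []
+	 * below turns it into a valid JSON array, and
+	 * json[0][@"CPU Topology"][@"Clusters"] is the part we consume.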
+ */ + data_string = [data_string stringByReplacingOccurrencesOfString:@"\n}\n" withString:@"\n},\n"]; + data_string = [NSString stringWithFormat:@"[%@]", data_string]; + data = [data_string dataUsingEncoding:NSUTF8StringEncoding]; + + T_QUIET; T_ASSERT_EQ(task.terminationStatus, 0, "clpc exit status"); + + NSError *jsonError = nil; + NSArray *json = [NSJSONSerialization JSONObjectWithData:data options:0 error:&jsonError]; + + if (jsonError) { + T_FAIL("clpc topologies failed. output: %s\nerror: %s", + [data_string cStringUsingEncoding:NSUTF8StringEncoding], + [[jsonError localizedDescription] cStringUsingEncoding:NSUTF8StringEncoding]); + T_END; + } + + NSMutableArray* out = [[NSMutableArray alloc] init]; + struct cpu_cluster cluster; + + for (NSDictionary *cluster_json in json[0][@"CPU Topology"][@"Clusters"]) { + cluster = (struct cpu_cluster) { + .type = [cluster_json[@"Type"] intValue] + 1, + .mask = [cluster_json[@"CoreMask"] unsignedLongLongValue] + }; + [out addObject:[NSValue valueWithBytes:&cluster objCType:@encode(struct cpu_cluster)]]; + } + + return out; +} void test_stackshot_cpu_info(void *ssbuf, size_t sslen, int exp_cpus, NSArray *exp_cluster_types) { kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - bool seen = false; + bool seen_cpu = false, seen_buffer = false; int singlethread_override = 0; size_t singlethread_override_sz = sizeof(singlethread_override); T_QUIET; T_ASSERT_POSIX_SUCCESS( @@ -2299,36 +2415,62 @@ void test_stackshot_cpu_info(void *ssbuf, size_t sslen, int exp_cpus, NSArray *e } KCDATA_ITER_FOREACH(iter) { - if ((kcdata_iter_type(iter) != KCDATA_TYPE_ARRAY) || (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_LATENCY_INFO_CPU)) { + if (kcdata_iter_type(iter) != KCDATA_TYPE_ARRAY) { continue; } - seen = true; + int n_elems = kcdata_iter_array_elem_count(iter); - /* Check ncpus */ - int ncpus = kcdata_iter_array_elem_count(iter); - if (exp_cpus != -1) { - T_QUIET; T_ASSERT_EQ(exp_cpus, ncpus, "Expected number of CPUs matches number of CPUs used for stackshot"); - } + switch (kcdata_iter_array_elem_type(iter)) { + case STACKSHOT_KCTYPE_LATENCY_INFO_CPU: + seen_cpu = true; - if (exp_cluster_types == nil) { - continue; - } + /* Check ncpus */ + if (exp_cpus != -1) { + T_QUIET; T_ASSERT_EQ(exp_cpus, n_elems, "Expected number of CPUs matches number of CPUs used for stackshot"); + } - /* Check cluster types */ - struct stackshot_latency_cpu *latencies = (struct stackshot_latency_cpu *) kcdata_iter_payload(iter); - for (int i = 0; i < ncpus; i++) { - NSNumber *cluster_type = [NSNumber numberWithInt:latencies[i].cluster_type]; - T_QUIET; T_ASSERT_TRUE([exp_cluster_types containsObject:cluster_type], "Type of CPU cluster in expected CPU cluster types"); + if (exp_cluster_types == nil) { + continue; + } + + /* Check cluster types */ + struct stackshot_latency_cpu *latencies = (struct stackshot_latency_cpu *) kcdata_iter_payload(iter); + for (int i = 0; i < n_elems; i++) { + NSNumber *cluster_type = [NSNumber numberWithInt:latencies[i].cluster_type]; + T_QUIET; T_ASSERT_TRUE([exp_cluster_types containsObject:cluster_type], "Type of CPU cluster in expected CPU cluster types"); + } + break; + case STACKSHOT_KCTYPE_LATENCY_INFO_BUFFER: + seen_buffer = true; + + if (exp_cluster_types == nil) { + continue; + } + + /* Check that we have a buffer for each cluster */ + struct stackshot_latency_buffer *buffers = (struct stackshot_latency_buffer *) kcdata_iter_payload(iter); + for (int i = 0; i < n_elems; i++) { + if (buffers[i].size == 0) { + continue; + } + NSNumber 
*cluster_type = [NSNumber numberWithInt:buffers[i].cluster_type]; + T_QUIET; T_ASSERT_TRUE([exp_cluster_types containsObject:cluster_type], "Type of CPU cluster for buffer in expected cluster types"); + } + break; + default: + /* Not either of these, continue; */ + break; } } - T_QUIET; T_ASSERT_TRUE(seen || !is_development_kernel(), "Seen CPU latency info or is release kernel"); + T_QUIET; T_ASSERT_TRUE(seen_cpu || !is_development_kernel(), "Seen CPU latency info or is release kernel"); + T_QUIET; T_ASSERT_TRUE(seen_buffer || !is_development_kernel(), "Seen buffer info or is release kernel"); } -void test_stackshot_with_clpcctrl(char *const name, char *const argv[], int exp_cpus, NSArray *exp_cluster_types) { - T_LOG("Stackshot CLPC scenario %s", name); - run_clpcctrl(argv); +static void +test_stackshot_with_clpcctrl(const char *name, char *const argv[], int exp_cpus, NSArray *exp_cluster_types) { + run_clpc(argv); struct scenario scenario = { .name = name, .flags = (STACKSHOT_KCDATA_FORMAT | STACKSHOT_SAVE_LOADINFO | @@ -2340,6 +2482,8 @@ void test_stackshot_with_clpcctrl(char *const name, char *const argv[], int exp_ }); } +#define N_CLUSTER_TYPES 2 + T_DECL(core_masks, "test that stackshot works under various core masks on ARM systems", T_META_REQUIRES_SYSCTL_EQ("hw.optional.arm64", 1), @@ -2372,18 +2516,6 @@ T_DECL(core_masks, return; } - - T_ATEND(restore_clpcctrl); - - /* Test with 1 and 2 CPUs for basic functionality */ - test_stackshot_with_clpcctrl( - "core_masks_1cpu", (char *const[]) {clpcctrl_path, "-c", "1", NULL}, - 1, nil); - - test_stackshot_with_clpcctrl( - "core_masks_2cpus", (char *const[]) {clpcctrl_path, "-c", "2", NULL}, - 2, nil); - /* Check nperflevels to see if we're on an AMP system */ int nperflevels = 1; size_t nperflevels_sz = sizeof(int); @@ -2391,28 +2523,49 @@ T_DECL(core_masks, sysctlbyname("hw.nperflevels", &nperflevels, &nperflevels_sz, NULL, 0), "get hw.nperflevels"); if (nperflevels == 1) { - T_LOG("On SMP system, skipping stackshot core_masks AMP tests"); - return; + T_SKIP("On SMP system, skipping stackshot core_masks tests"); } - T_QUIET; T_ASSERT_EQ(nperflevels, 2, "nperflevels is 1 or 2"); - T_LOG("On AMP system, performing stackshot core_masks AMP tests"); + T_ATEND(restore_clpc); + + uint64_t cluster_masks[N_CLUSTER_TYPES] = {0}; + NSArray* clusters = get_cpu_clusters(); + for (NSValue *data in clusters) { + struct cpu_cluster cluster; + [data getValue:&cluster]; + + T_QUIET; T_ASSERT_LT(cluster.type - 1, N_CLUSTER_TYPES, "valid cluster type"); + cluster_masks[cluster.type - 1] |= cluster.mask; + } + + NSMutableArray* cluster_types = [[NSMutableArray alloc] init]; + char const* scenario_names[] = { + "core_masks_amp_ecpus", + "core_masks_amp_pcpus", + }; + for (int type = 0; type < N_CLUSTER_TYPES; type++) { + if (!cluster_masks[type]) { + continue; + } + + NSNumber *cluster_type_num = [NSNumber numberWithInt:(type + 1)]; + [cluster_types addObject:cluster_type_num]; + + char mask_str[19]; + sprintf(mask_str, "0x%llx", cluster_masks[type]); + + test_stackshot_with_clpcctrl( + scenario_names[type], + (char *const[]) {clpc_path, "control", "-C", mask_str, NULL}, + -1, @[cluster_type_num]); + } + + T_ASSERT_GE((int) [cluster_types count], 2, "at least two cluster types"); - /* Perform AMP tests with different cluster types active */ test_stackshot_with_clpcctrl( "core_masks_amp_allcpus", - (char *const[]) {clpcctrl_path, "-C", "all", NULL}, - -1, @[@CLUSTER_TYPE_E, @CLUSTER_TYPE_P]); - - test_stackshot_with_clpcctrl( - "core_masks_amp_ecpus", - 
(char *const[]) {clpcctrl_path, "-C", "e", NULL}, - -1, @[@CLUSTER_TYPE_E]); - - test_stackshot_with_clpcctrl( - "core_masks_amp_pcpus", - (char *const[]) {clpcctrl_path, "-C", "p", NULL}, - -1, @[@CLUSTER_TYPE_P]); + (char *const[]) {clpc_path, "control", "-C", "all", NULL}, + -1, cluster_types); } #pragma mark performance tests @@ -3222,6 +3375,9 @@ parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, NSD T_QUIET; T_EXPECT_NOTNULL(sharedCaches[sharedcache_id], "sharedCacheID %d should exist", [sharedcache_id intValue]); } } else { + if ((sharedregion_flags == kTaskSharedRegionOther) && (task_flags & kTaskSharedRegionInfoUnavailable)) { + T_LOG("kTaskSharedRegionOther does not have shared region info available."); + } T_QUIET; T_EXPECT_NULL(sharedregion_info, "non-kTaskSharedRegionOther should have no shared_cache_dyld_load_info struct"); T_QUIET; T_EXPECT_NULL(sharedcache_id, "non-kTaskSharedRegionOther should have no sharedCacheID"); } diff --git a/tests/sysctl_hw.c b/tests/sysctl_hw.c index adfb5fb42..f86a010cb 100644 --- a/tests/sysctl_hw.c +++ b/tests/sysctl_hw.c @@ -15,6 +15,13 @@ T_DECL(sysctl_hw_cpu, "ensure vital product and CPU-related sysctls exist") buffer_size = sizeof(buffer); + ret = sysctlbyname("hw.jetsam_properties_product_type", buffer, + &buffer_size, NULL, 0); + T_ASSERT_POSIX_SUCCESS(ret, "hw.jetsam_properties_product_type sysctl"); + T_LOG("hw.jetsam_properties_product_type = %s", buffer); + + buffer_size = sizeof(buffer); + ret = sysctlbyname("hw.product", buffer, &buffer_size, NULL, 0); T_ASSERT_POSIX_SUCCESS(ret, "hw.product sysctl"); diff --git a/tests/sysctl_wire_limits.c b/tests/sysctl_wire_limits.c index 12eac2c6b..74a211334 100644 --- a/tests/sysctl_wire_limits.c +++ b/tests/sysctl_wire_limits.c @@ -13,7 +13,9 @@ static const char *g_sysctl_no_wire_name = "vm.global_no_user_wire_amount"; static const char *g_sysctl_wire_name = "vm.global_user_wire_limit"; static const char *g_sysctl_per_task_wire_name = "vm.user_wire_limit"; static const char *g_sysctl_current_wired_count_name = "vm.page_wire_count"; +#if __x86_64__ static const char *g_sysctl_current_free_count_name = "vm.lopage_free_count"; +#endif /* __x86_64__ */ static const char *g_sysctl_vm_page_size_name = "vm.pagesize"; static const char *g_sysctl_memsize_name = "hw.memsize"; @@ -149,12 +151,18 @@ wire_to_limit(size_t limit, size_t *size) size_t buffer_size, offset_from_limit; void *buffer; size_t current_wired_size = sizeof(current_wired); +#if __x86_64__ size_t current_free_size = sizeof(current_free); +#endif /* __x86_64__ */ while (true) { ret = sysctlbyname(g_sysctl_current_wired_count_name, ¤t_wired, ¤t_wired_size, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "get current wired count failed"); +#if __x86_64__ ret = sysctlbyname(g_sysctl_current_free_count_name, ¤t_free, ¤t_free_size, NULL, 0); T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "get current free count failed"); +#else + current_free = 0; +#endif /* __x86_64__ */ offset_from_limit = ptoa(current_wired + current_free + wiggle_room_pages); T_QUIET; T_ASSERT_GE(limit, offset_from_limit, "more pages are wired than the limit."); buffer_size = limit - offset_from_limit; @@ -171,7 +179,9 @@ wire_to_limit(size_t limit, size_t *size) } T_DECL(wire_stress_test, "wire up to global_user_wire_limit and spin for 120 seconds.", - T_META_REQUIRES_SYSCTL_NE("kern.hv_vmm_present", 1), T_META_TAG_VM_NOT_ELIGIBLE) + T_META_REQUIRES_SYSCTL_NE("kern.hv_vmm_present", 1), + T_META_TAG_VM_NOT_ELIGIBLE, + T_META_ENABLED(false) /* 
rdar://145613247 */) { static const int kNumSecondsToSpin = 120; int ret; diff --git a/tests/task_suspend_stats.c b/tests/task_suspend_stats.c index a2ddce5a1..6f90b297b 100644 --- a/tests/task_suspend_stats.c +++ b/tests/task_suspend_stats.c @@ -336,6 +336,9 @@ T_DECL(suspend_stats_update_on_forkcorpse, T_LOG("Generating corpse of helper..."); kr = task_generate_corpse(child_task, &cp); + if (kr == KERN_RESOURCE_SHORTAGE) { + T_SKIP("Corpse slot unavailable"); + } T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_generate_corpse"); get_stats(child_task, &post); diff --git a/tests/task_vm_info_decompressions.c b/tests/task_vm_info_decompressions.c index e58b9dfd2..e507b52be 100644 --- a/tests/task_vm_info_decompressions.c +++ b/tests/task_vm_info_decompressions.c @@ -1,7 +1,10 @@ #include #include +#include #include #include +#include +#include #include #include #include @@ -18,17 +21,10 @@ T_GLOBAL_META( T_META_RADAR_COMPONENT_VERSION("VM")); #define KB 1024 -#define MALLOC_SIZE_PER_THREAD (64 * KB) -#define freezer_path "/usr/local/bin/freeze" - -/* BridgeOS could spend more time execv freezer */ -#if TARGET_OS_BRIDGE -static int timeout = 600; -#else -static int timeout = 120; -#endif +#define VM_SIZE_PER_THREAD (64 * KB) static _Atomic int thread_malloc_count = 0; +static _Atomic int thread_compressed_count = 0; static _Atomic int thread_thawed_count = 0; static _Atomic int phase = 0; @@ -36,41 +32,6 @@ struct thread_args { int id; }; -static void -freeze_pid(pid_t pid) -{ - char pid_str[6]; - char *args[3]; - pid_t child_pid; - int status; - - sprintf(pid_str, "%d", pid); - child_pid = fork(); - if (child_pid == 0) { - /* Launch freezer */ - args[0] = freezer_path; - args[1] = pid_str; - args[2] = NULL; - execv(freezer_path, args); - /* execve() does not return on success */ - perror("execve"); - T_FAIL("execve() failed"); - } - - /* Wait for freezer to complete */ - T_LOG("Waiting for freezer %d to complete", child_pid); - while (0 == waitpid(child_pid, &status, WNOHANG)) { - if (timeout < 0) { - kill(child_pid, SIGKILL); - T_FAIL("Freezer took too long to freeze the test"); - } - sleep(1); - timeout--; - } - if (WIFEXITED(status) != 1 || WEXITSTATUS(status) != 0) { - T_FAIL("Freezer error'd out"); - } -} static void * worker_thread_function(void *args) { @@ -79,8 +40,13 @@ worker_thread_function(void *args) char *array; /* Allocate memory */ - array = malloc(MALLOC_SIZE_PER_THREAD); - T_EXPECT_NOTNULL(array, "thread %d allocated heap memory to be dirtied", thread_id); + mach_vm_address_t addr; + kern_return_t kr; + kr = mach_vm_allocate(mach_task_self(), &addr, VM_SIZE_PER_THREAD, + VM_FLAGS_ANYWHERE | VM_PROT_DEFAULT | VM_MAKE_TAG(VM_MEMORY_APPLICATION_SPECIFIC_1)); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_allocate()"); + array = (char *)addr; + T_QUIET; T_EXPECT_NOTNULL(array, "thread %d allocated heap memory to be dirtied", thread_id); /* Waiting for phase 1 (touch pages) to start */ while (atomic_load(&phase) != 1) { @@ -88,8 +54,8 @@ worker_thread_function(void *args) } /* Phase 1: touch pages */ - T_LOG("thread %d phase 1: dirtying %d heap pages (%d bytes)", thread_id, MALLOC_SIZE_PER_THREAD / (int)PAGE_SIZE, MALLOC_SIZE_PER_THREAD); - memset(&array[0], 1, MALLOC_SIZE_PER_THREAD); + T_LOG("thread %d phase 1: dirtying %d heap pages (%d bytes)", thread_id, VM_SIZE_PER_THREAD / (int)PAGE_SIZE, VM_SIZE_PER_THREAD); + memset(&array[0], 1, VM_SIZE_PER_THREAD); atomic_fetch_add(&thread_malloc_count, 1); /* Wait for process to be frozen */ @@ -97,21 +63,24 @@ worker_thread_function(void 
*args) ; } - /* Phase 2, process thawed, trigger decompressions by re-faulting pages */ - T_LOG("thread %d phase 2: faulting pages back in to trigger decompressions", thread_id); - memset(&array[0], 1, MALLOC_SIZE_PER_THREAD); + /* Phase 2: compress pages */ + kr = mach_vm_behavior_set(mach_task_self(), addr, VM_SIZE_PER_THREAD, VM_BEHAVIOR_PAGEOUT); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_behavior_set()"); + atomic_fetch_add(&thread_compressed_count, 1); + + while (atomic_load(&phase) != 3) { + ; + } + + /* Phase 3, process thawed, trigger decompressions by re-faulting pages */ + T_LOG("thread %d phase 3: faulting pages back in to trigger decompressions", thread_id); + memset(&array[0], 1, VM_SIZE_PER_THREAD); /* Main thread will retrieve vm statistics once all threads are thawed */ atomic_fetch_add(&thread_thawed_count, 1); - free(array); - - -#if 0 /* Test if the thread's decompressions counter was added to the task decompressions counter when a thread terminates */ - if (thread_id < 2) { - sleep(10); - } -#endif + kr = mach_vm_deallocate(mach_task_self(), addr, VM_SIZE_PER_THREAD); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_deallocate()"); return NULL; } @@ -150,6 +119,7 @@ T_DECL(task_vm_info_decompressions, "Test multithreaded per-task decompressions counter", T_META_TAG_VM_NOT_ELIGIBLE) { int err; + mach_error_t kr; int ncpu; size_t ncpu_size = sizeof(ncpu); int npages; @@ -176,7 +146,7 @@ T_DECL(task_vm_info_decompressions, T_EXPECT_EQ_INT(0, err, "Detected %d cpus\n", ncpu); /* Set total number of pages to be frozen */ - npages = ncpu * MALLOC_SIZE_PER_THREAD / (int)PAGE_SIZE; + npages = ncpu * VM_SIZE_PER_THREAD / (int)PAGE_SIZE; T_LOG("Test will be freezing at least %d heap pages\n", npages); /* Change state to freezable */ @@ -206,9 +176,19 @@ T_DECL(task_vm_info_decompressions, } T_EXPECT_EQ(ncpu, atomic_load(&thread_malloc_count), "%d threads finished writing to malloc pages\n", ncpu); + count = TASK_VM_INFO_COUNT; + err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); + T_QUIET; T_EXPECT_EQ(count, TASK_VM_INFO_COUNT, "count == TASK_VM_INFO_COUNT: %d", count); + T_QUIET; T_EXPECT_EQ_INT(0, err, "task_info(TASK_VM_INFO) returned 0"); + T_EXPECT_EQ(0, vm_info.decompressions, "Expected 0 decompressions before compressions"); + /* Launch freezer to compress the dirty pages */ - T_LOG("Running freezer to compress pages for pid %d", getpid()); - freeze_pid(getpid()); + atomic_fetch_add(&phase, 1); + /* Wait for all threads to compress their pages */ + while (atomic_load(&thread_compressed_count) != ncpu) { + sleep(1); + } + T_EXPECT_EQ(ncpu, atomic_load(&thread_compressed_count), "%d threads finished writing to malloc pages\n", ncpu); /* Phase 2: triger decompression in threads */ atomic_fetch_add(&phase, 1); @@ -220,9 +200,9 @@ T_DECL(task_vm_info_decompressions, /* Phase 3: Call into kernel to retrieve vm_info and to get the updated decompressions counter */ count = TASK_VM_INFO_COUNT; - err = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); - T_EXPECT_EQ(count, TASK_VM_INFO_COUNT, "count == TASK_VM_INFO_COUNT: %d", count); - T_EXPECT_EQ(0, err, "task_info(TASK_VM_INFO) returned 0"); + kr = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); + T_QUIET; T_EXPECT_EQ(count, TASK_VM_INFO_COUNT, "count == TASK_VM_INFO_COUNT: %d", count); + T_QUIET; T_EXPECT_MACH_SUCCESS(kr, "task_info(TASK_VM_INFO)"); /* Make sure this task has decompressed at least all of the dirtied memory */ 
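+	/*
+	 * (npages is the floor here: each of the ncpu worker threads dirtied
+	 * VM_SIZE_PER_THREAD bytes, paged them out with VM_BEHAVIOR_PAGEOUT,
+	 * and then faulted them back in.)
+	 */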
T_EXPECT_GE_INT(vm_info.decompressions, npages, "decompressed %d pages (>= heap pages: %d)", vm_info.decompressions, npages); diff --git a/tests/tcp_cache_entitlements.plist b/tests/tcp_cache_entitlements.plist new file mode 100644 index 000000000..3a4843aec --- /dev/null +++ b/tests/tcp_cache_entitlements.plist @@ -0,0 +1,10 @@ + + + + + com.apple.private.tcp.heuristics_list + + com.apple.private.tcp.cache_list + + + diff --git a/tests/tcp_cache_test.c b/tests/tcp_cache_test.c new file mode 100644 index 000000000..895233a17 --- /dev/null +++ b/tests/tcp_cache_test.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2025 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +T_DECL(tcp_cache_list_sysctl, "Test retrieving TCP cache list via sysctl") +{ + size_t size = 0; + int ret; + + // First call to get the required buffer size + ret = sysctlbyname("net.inet.tcp.cache_list", NULL, &size, NULL, 0); + if (ret == -1) { + T_SKIP("sysctlbyname(\"net.inet.tcp.cache_list\") error: %d", errno); + } + + T_LOG("TCP cache list size: %zu bytes", size); + + if (size == 0) { + T_PASS("No TCP cache entries found"); + } + + // Allocate buffer and retrieve the data + void *buffer = malloc(size); + T_QUIET; T_ASSERT_NOTNULL(buffer, "malloc buffer"); + + ret = sysctlbyname("net.inet.tcp.cache_list", buffer, &size, NULL, 0); + T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname to get data"); + + // Calculate number of entries + size_t num_entries = size / sizeof(struct tcp_cache_data); + T_LOG("Found %zu TCP cache entries", num_entries); + + struct tcp_cache_data *entries = (struct tcp_cache_data *)buffer; + + // Log all fields of each entry + for (size_t i = 0; i < num_entries; i++) { + struct tcp_cache_data *entry = &entries[i]; + + T_LOG("Entry %zu:", i); + T_LOG(" tc_last_access: %u", entry->tc_last_access); + T_LOG(" tc_key.tck_family: %d", entry->tc_key.tck_family); + + // Log source key info + T_LOG(" tc_key.tck_src.thk_family: %d", entry->tc_key.tck_src.thk_family); + if (entry->tc_key.tck_src.thk_family == AF_INET) { + T_LOG(" tc_key.tck_src IP: %s", inet_ntoa(entry->tc_key.tck_src.thk_ip.addr)); + } else if (entry->tc_key.tck_src.thk_family == AF_INET6) { + char addr_str[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &entry->tc_key.tck_src.thk_ip.addr6, addr_str, sizeof(addr_str)); + T_LOG(" tc_key.tck_src IPv6: %s", addr_str); + } + + // Log destination address + if (entry->tc_key.tck_family == AF_INET) { + T_LOG(" tc_key.tck_dst IP: %s", inet_ntoa(entry->tc_key.tck_dst.addr)); + } else if (entry->tc_key.tck_family == AF_INET6) { + char addr_str[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &entry->tc_key.tck_dst.addr6, addr_str, sizeof(addr_str)); + T_LOG(" tc_key.tck_dst IPv6: %s", addr_str); + } + + // Log TFO cookie info + T_LOG(" tc_tfo_cookie_len: %u", entry->tc_tfo_cookie_len); + if (entry->tc_tfo_cookie_len > 0) { + char cookie_hex[TFO_COOKIE_LEN_MAX * 2 + 1] = {0}; + for (int j = 0; j < entry->tc_tfo_cookie_len && j < TFO_COOKIE_LEN_MAX; j++) { + snprintf(cookie_hex + j * 2, 3, "%02x", entry->tc_tfo_cookie[j]); + } + T_LOG(" tc_tfo_cookie: %s", cookie_hex); + } + + // Log MPTCP info + T_LOG(" tc_mptcp_version_confirmed: %u", entry->tc_mptcp_version_confirmed); + T_LOG(" tc_mptcp_version: %u", entry->tc_mptcp_version); + T_LOG(" tc_mptcp_next_version_try: %u", entry->tc_mptcp_next_version_try); + T_LOG(""); // Empty line between entries + } + + free(buffer); + + T_PASS("%s", __func__); +} + +T_DECL(tcp_heuristics_list_sysctl, "Test retrieving TCP heuristics list via sysctl") +{ + size_t size = 0; + int ret; + + // First call to get the required buffer size + ret = sysctlbyname("net.inet.tcp.heuristics_list", NULL, &size, NULL, 0); + if (ret == -1) { + T_SKIP("sysctlbyname(\"net.inet.tcp.cache_list\") error: %d", errno); + } + + T_LOG("TCP heuristics list size: %zu bytes", size); + + if (size == 0) { + T_PASS("No TCP heuristics entries found"); + } + + // Allocate buffer and retrieve the data + void *buffer = malloc(size); + T_QUIET; T_ASSERT_NOTNULL(buffer, "malloc buffer"); + + ret = sysctlbyname("net.inet.tcp.heuristics_list", buffer, &size, NULL, 0); + 
T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname to get data"); + + // Calculate number of entries + size_t num_entries = size / sizeof(struct tcp_heuristics_data); + T_LOG("Found %zu TCP heuristics entries", num_entries); + + struct tcp_heuristics_data *entries = (struct tcp_heuristics_data *)buffer; + + // Log all fields of each entry + for (size_t i = 0; i < num_entries; i++) { + struct tcp_heuristics_data *entry = &entries[i]; + + T_LOG("Heuristics Entry %zu:", i); + T_LOG(" th_last_access: %u", entry->th_last_access); + T_LOG(" th_key.thk_family: %d", entry->th_key.thk_family); + + // Log source key info + if (entry->th_key.thk_family == AF_INET) { + T_LOG(" th_key.thk_ip IP: %s", inet_ntoa(entry->th_key.thk_ip.addr)); + } else if (entry->th_key.thk_family == AF_INET6) { + char addr_str[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &entry->th_key.thk_ip.addr6, addr_str, sizeof(addr_str)); + T_LOG(" th_key.thk_ip IPv6: %s", addr_str); + } + + // Log TFO heuristics + T_LOG(" th_tfo_data_loss: %u", entry->th_tfo_data_loss); + T_LOG(" th_tfo_req_loss: %u", entry->th_tfo_req_loss); + T_LOG(" th_tfo_data_rst: %u", entry->th_tfo_data_rst); + + + T_LOG(" th_tfo_req_rst: %u", entry->th_tfo_req_rst); + + // Log MPTCP heuristics + T_LOG(" th_mptcp_loss: %u", entry->th_mptcp_loss); + T_LOG(" th_mptcp_success: %u", entry->th_mptcp_success); + + // Log ECN heuristics + T_LOG(" th_ecn_droprst: %u", entry->th_ecn_droprst); + T_LOG(" th_ecn_synrst: %u", entry->th_ecn_synrst); + + // Log timing information + T_LOG(" th_tfo_enabled_time: %u", entry->th_tfo_enabled_time); + T_LOG(" th_tfo_backoff_until: %u", entry->th_tfo_backoff_until); + T_LOG(" th_tfo_backoff: %u", entry->th_tfo_backoff); + T_LOG(" th_mptcp_backoff: %u", entry->th_mptcp_backoff); + T_LOG(" th_ecn_backoff: %u", entry->th_ecn_backoff); + + // Log flags + T_LOG(" th_tfo_in_backoff: %u", entry->th_tfo_in_backoff); + T_LOG(" th_mptcp_in_backoff: %u", entry->th_mptcp_in_backoff); + T_LOG(" th_mptcp_heuristic_disabled: %u", entry->th_mptcp_heuristic_disabled); + T_LOG(""); // Empty line between entries + } + + free(buffer); + + T_PASS("%s", __func__); +} diff --git a/tests/tcp_input_outputopts_uaf_56155583.c b/tests/tcp_input_outputopts_uaf_56155583.c index cf0347c2c..e7b3f8c3b 100644 --- a/tests/tcp_input_outputopts_uaf_56155583.c +++ b/tests/tcp_input_outputopts_uaf_56155583.c @@ -13,6 +13,8 @@ #include +#include "net_test_lib.h" + /* sizeof(struct ip6_pktopts) */ #define SIZEOF_STRUCT_IP6_PKTOPTS 192 @@ -99,4 +101,6 @@ T_DECL(tcp_input_outputopts_uaf_56155583, "Use-after-free when accepting TCP6 co for (n = 0; n < nthreads; ++n) { pthread_join(threads[n], NULL); } + + force_zone_gc(); } diff --git a/tests/test_utils.c b/tests/test_utils.c index 5eba1f486..d7cea7ec0 100644 --- a/tests/test_utils.c +++ b/tests/test_utils.c @@ -29,6 +29,32 @@ is_development_kernel(void) return is_development; } + +bool +process_is_translated() +{ + static dispatch_once_t is_translated_once; + static bool is_translated; + + dispatch_once(&is_translated_once, ^{ + int out_value = 0; + size_t inout_size = sizeof(out_value); + if (sysctlbyname("sysctl.proc_translated", &out_value, &inout_size, NULL, 0) != 0) { + /* + * ENOENT means the sysctl is not present and therefore + * this process is not translated. Any other error is bad. 
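+			 * (When present, sysctl.proc_translated reports 1 for a
+			 * translated process, e.g. one running under Rosetta, and
+			 * 0 for a native one.)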
+ */ + T_QUIET; T_ASSERT_POSIX_ERROR(errno, ENOENT, "sysctlbyname(sysctl.proc_translated)"); + is_translated = false; + } else { + T_QUIET; T_ASSERT_GE(inout_size, sizeof(out_value), "sysctlbyname(sysctl.proc_translated)"); + is_translated = (bool)out_value; + } + }); + return is_translated; +} + + pid_t launch_background_helper( const char* variant, diff --git a/tests/test_utils.h b/tests/test_utils.h index 344e88844..6b2feb854 100644 --- a/tests/test_utils.h +++ b/tests/test_utils.h @@ -2,10 +2,17 @@ #define XNU_DARWINTEST_UTILS_H #include +#include /* Misc. utility functions for writing darwintests. */ bool is_development_kernel(void); +/* + * Returns true if the process is translated according to sysctl.proc_translated. + * For example, Rosetta processes are translated processes. + */ +bool process_is_translated(void); + /* Launches the given helper variant as a managed process. */ pid_t launch_background_helper( const char* variant, diff --git a/tests/trial_experiment_factors.c b/tests/trial_experiment_factors.c new file mode 100644 index 000000000..2b22cd7fc --- /dev/null +++ b/tests/trial_experiment_factors.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.trial"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("sysctl"), + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true), + T_META_TAG_VM_PREFERRED, + T_META_ASROOT(false)); + + +#if defined(ENTITLED) +T_DECL(kern_trial_sysctl_entitled, + "test that kern.trial sysctls can be read-from/written-to if the proper " + "entitlement is granted") +#else +T_DECL(kern_trial_sysctl_unentitled, + "test that kern.trial sysctls cannot be read-from/written-to without " + "the proper entitlement") +#endif +{ + int ret; + int32_t val; + size_t sz = sizeof(val); + + ret = sysctlbyname("kern.trial.test", &val, &sz, NULL, 0); +#if defined(ENTITLED) + T_ASSERT_POSIX_SUCCESS(ret, "kern.trial.test can be read from"); +#else + T_EXPECT_POSIX_FAILURE(ret, EPERM, "kern.trial.test cannot be written to"); +#endif + + val = 1; + ret = sysctlbyname("kern.trial.test", NULL, 0, &val, sizeof(val)); +#if !defined(ENTITLED) + T_EXPECT_POSIX_FAILURE(ret, EPERM, "kern.trial.test cannot be written to"); +#else + T_EXPECT_POSIX_SUCCESS(ret, "kern.trial.test can be written to with a valid value"); + + ret = sysctlbyname("kern.trial.test", &val, &sz, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.trial.test can be read from"); + T_EXPECT_EQ(val, 1, "kern.trial.test written value took effect"); + + val = UINT32_MAX; + ret = sysctlbyname("kern.trial.test", NULL, 0, &val, sizeof(val)); + T_EXPECT_POSIX_FAILURE(ret, EINVAL, "kern.trial.test cannot be written to with an invalid value"); +#endif +} diff --git a/tests/try_read_write.c b/tests/try_read_write.c new file mode 100644 index 000000000..1221626fc --- /dev/null +++ b/tests/try_read_write.c @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * try_read_write.c + * + * Helper functions for userspace tests to read or write memory and + * verify that EXC_BAD_ACCESS is or is not generated by that operation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "exc_helpers.h" +#include "try_read_write.h" + +/* + * -- Implementation overview -- + * + * try_read_byte() and try_write_byte() operate by performing + * a read or write instruction with a Mach exception handler + * in place. + * + * The exception handler catches EXC_BAD_ACCESS. If the bad access + * came from our designated read or write instructions then it + * records the exception that occurred to thread-local storage + * and moves that thread's program counter to resume execution + * and recover from the exception. + * + * Unrecognized exceptions, and EXC_BAD_ACCESS exceptions from + * unrecognized instructions, either go uncaught or are caught and + * re-raised. In either case they lead to an ordinary crash. This + * means we don't get false positives where the test expects one + * crash but incorrectly passes after crashing in some unrelated way. + * We can be precise about what the fault was and where it came from. + * + * We use Mach exceptions instead of signals because + * on watchOS signal handlers do not receive the thread + * state so they cannot recover from the signal. + * + * try_read_write_exception_handler() + * our exception handler, installed using tests/exc_helpers.c + * + * read_byte() and write_byte() + * our designated read and write instructions, recognized by + * the exception handler and specially structured to allow + * recovery by changing the PC + * + * try_read_write_thread_t + * thread-local storage to record the caught exception + */ + +static dispatch_once_t try_read_write_initializer; +static mach_port_t try_read_write_exc_port; + +/* + * Bespoke thread-local storage for threads inside try_read_write. + * We can't use pthread local storage because the Mach exception + * handler needs to access it and that exception handler runs on + * a different thread. + * + * Access by the Mach exception thread is safe because the real thread + * is suspended at that point. (This scheme would be unsound if the + * real thread raised an exception while manipulating the thread-local + * data, but we don't try to cover that case.) + */ +typedef struct { + mach_port_t thread; + kern_return_t exception_kr; /* EXC_BAD_ADDRESS sub-code */ + uint64_t exception_pc; /* PC of faulting instruction */ + uint64_t exception_memory; /* Memory address of faulting access */ +} try_read_write_thread_t; + +#define TRY_READ_WRITE_MAX_THREADS 128 +static pthread_mutex_t try_read_write_thread_list_mutex = PTHREAD_MUTEX_INITIALIZER; +static unsigned try_read_write_thread_count = 0; +static try_read_write_thread_t try_read_write_thread_list[TRY_READ_WRITE_MAX_THREADS]; +static __thread try_read_write_thread_t *try_read_write_thread_self; + +/* + * Look up the try_read_write_thread_t for a Mach thread. + * If create == true and no info was found, add it to the list. + * Returns NULL if no info was found and create == false. 
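+ *
+ * begin_expected_exceptions() calls this with create == true for the
+ * current thread; the exception handler uses the create == false
+ * overload to look up the faulting thread and asserts that an entry
+ * already exists.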
+ */ +static __attribute__((overloadable)) +try_read_write_thread_t * +thread_info_for_mach_thread(mach_port_t thread_port, bool create) +{ + /* first look for a cached value in real thread-local storage */ + if (mach_thread_self() == thread_port) { + try_read_write_thread_t *info = try_read_write_thread_self; + if (info) { + return info; + } + } + + int err = pthread_mutex_lock(&try_read_write_thread_list_mutex); + assert(err == 0); + + /* search the list */ + for (unsigned i = 0; i < try_read_write_thread_count; i++) { + try_read_write_thread_t *info = &try_read_write_thread_list[i]; + if (info->thread == thread_port) { + pthread_mutex_unlock(&try_read_write_thread_list_mutex); + if (mach_thread_self() == thread_port) { + try_read_write_thread_self = info; + } + return info; + } + } + + /* not in list - create if requested */ + if (create) { + assert(try_read_write_thread_count < TRY_READ_WRITE_MAX_THREADS); + try_read_write_thread_t *info = &try_read_write_thread_list[try_read_write_thread_count++]; + info->thread = thread_port; + info->exception_kr = 0; + pthread_mutex_unlock(&try_read_write_thread_list_mutex); + if (mach_thread_self() == thread_port) { + try_read_write_thread_self = info; + } + return info; + } + + pthread_mutex_unlock(&try_read_write_thread_list_mutex); + return NULL; +} + +static __attribute__((overloadable)) +try_read_write_thread_t * +thread_info_for_mach_thread(mach_port_t thread_port) +{ + return thread_info_for_mach_thread(thread_port, false /* create */); +} + + +/* + * read_byte() and write_byte() are functions that + * read or write memory as their first instruction. + * Used to test memory access that may provoke an exception. + * + * try_read_write_exception_handler() below checks if the exception PC + * is equal to one of these functions. The first instruction must be + * the memory access instruction. + * + * try_read_write_exception_handler() below increments the PC by four bytes. + * The memory access instruction must be padded to exactly four bytes. + */ + +static uint64_t __attribute__((naked)) +read_byte(mach_vm_address_t addr) +{ +#if __arm64__ + asm("\n ldrb w0, [x0]" + "\n ret"); +#elif __x86_64__ + asm("\n movb (%rdi), %al" + "\n nop" /* pad load to four bytes */ + "\n nop" + "\n ret"); +#else +# error unknown architecture +#endif +} + +static void __attribute__((naked)) +write_byte(mach_vm_address_t addr, uint8_t value) +{ +#if __arm64__ + asm("\n strb w1, [x0]" + "\n ret"); +#elif __x86_64__ + asm("\n movb %sil, (%rdi)" + "\n nop" /* pad store to four bytes */ + "\n ret"); +#else +# error unknown architecture +#endif +} + + +/* + * Mach exception handler for EXC_BAD_ACCESS called by exc_helpers. + * Returns the number of bytes to advance the PC to resolve the exception. 
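+ * That count is always 4 here: read_byte() and write_byte() pad their
+ * faulting instruction to exactly four bytes on both arm64 and x86_64.
+ * Exceptions that did not come from those two functions return
+ * EXC_HELPER_HALT instead, so they still crash the test.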
+ */ +static size_t +try_read_write_exception_handler( + __unused mach_port_t task, + mach_port_t thread, + exception_type_t exception, + mach_exception_data_t codes, + uint64_t exception_pc) +{ + assert(exception == EXC_BAD_ACCESS); + try_read_write_thread_t *info = thread_info_for_mach_thread(thread); + assert(info); /* we do not expect exceptions from other threads */ + + uint64_t read_byte_pc = (uint64_t)ptrauth_strip(&read_byte, ptrauth_key_function_pointer); + uint64_t write_byte_pc = (uint64_t)ptrauth_strip(&write_byte, ptrauth_key_function_pointer); + + if (exception_pc != read_byte_pc && exception_pc != write_byte_pc) { + /* this exception isn't one of ours - re-raise it */ + if (verbose_exc_helper) { + T_LOG("not a try_read_write exception"); + } + return EXC_HELPER_HALT; + } + + assert(info->exception_kr == 0); /* no nested exceptions allowed */ + + info->exception_pc = exception_pc; + info->exception_kr = codes[0]; + info->exception_memory = codes[1]; + if (verbose_exc_helper) { + T_LOG("try_read_write exception: pc 0x%llx kr %d mem 0x%llx", + info->exception_pc, info->exception_kr, info->exception_memory); + } + + /* advance pc by 4 bytes to recover */ + return 4; +} + +/* + * Create an exc_helpers exception handler port and thread, + * and install the exception handler port on this thread. + */ +static void +initialize_exception_handlers(void) +{ + try_read_write_exc_port = create_exception_port(EXC_MASK_BAD_ACCESS); + repeat_exception_handler(try_read_write_exc_port, try_read_write_exception_handler); +} + +/* + * Begin try_read_write exception handling on this thread. + */ +static void +begin_expected_exceptions(void) +{ + dispatch_once(&try_read_write_initializer, ^{ + initialize_exception_handlers(); + }); + + try_read_write_thread_t *info = try_read_write_thread_self; + if (!info) { + set_thread_exception_port(try_read_write_exc_port, EXC_MASK_BAD_ACCESS); + info = thread_info_for_mach_thread(mach_thread_self(), true /* create */); + } + + info->exception_kr = 0; + info->exception_pc = 0; + info->exception_memory = 0; +} + +/* + * End try_read_write exception handling on this thread. + * Returns the caught exception data, if any. + */ +static void +end_expected_exceptions( + kern_return_t * const out_kr, + uint64_t * const out_pc, + uint64_t * const out_memory) +{ + try_read_write_thread_t *info = try_read_write_thread_self; + assert(info); + *out_kr = info->exception_kr; + *out_pc = info->exception_pc; + *out_memory = info->exception_memory; +} + + +extern bool +try_read_byte( + mach_vm_address_t addr, + uint8_t * const out_byte, + kern_return_t * const out_error) +{ + kern_return_t exception_kr; + uint64_t exception_pc; + uint64_t exception_memory; + + begin_expected_exceptions(); + *out_byte = read_byte(addr); + end_expected_exceptions(&exception_kr, &exception_pc, &exception_memory); + + /* + * pc was verified inside the exception handler. + * kr will be verified by the caller. + * Verify address here. + */ + + if (exception_kr != KERN_SUCCESS) { + assert(exception_memory == addr); + } + + *out_error = exception_kr; + return exception_kr == 0; +} + +extern bool +try_write_byte( + mach_vm_address_t addr, + uint8_t byte, + kern_return_t * const out_error) +{ + kern_return_t exception_kr; + uint64_t exception_pc; + uint64_t exception_memory; + + begin_expected_exceptions(); + write_byte(addr, byte); + end_expected_exceptions(&exception_kr, &exception_pc, &exception_memory); + + /* + * pc was verified inside the exception handler. + * kr will be verified by the caller. 
+ * Verify address here. + */ + + if (exception_kr != KERN_SUCCESS) { + assert(exception_memory == addr); + } + + *out_error = exception_kr; + return exception_kr == 0; +} diff --git a/tests/try_read_write.h b/tests/try_read_write.h new file mode 100644 index 000000000..2cde3e2fb --- /dev/null +++ b/tests/try_read_write.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +/* + * try_read_write.h + * + * Helper functions for userspace tests to read or write memory and + * verify that EXC_BAD_ACCESS is or is not generated by that operation. + * + * To use these functions in your test you must set additional build options. + * See target `try_read_write_test` in tests/Makefile for an example. + */ + +#include +#include +#include + +/* + * Set verbose_exc_helper = true to log exception information with T_LOG(). + * The default is true. + */ +extern bool verbose_exc_helper; + +/* + * Tries to read a single byte from an address. + * Returns true if the read succeeded. + * Aborts if an exception other than EXC_BAD_ACCESS is generated. + * On exit: + * *out_byte is the value read, or an indeterminate value if the read failed. + * *out_error is the EXC_BAD_ACCESS error code + * (typically KERN_PROTECTION_FAILURE or KERN_INVALID_ADDRESS) + * or 0 if the read succeeded. + * + * To use this function in your test you must set additional build options. + * See target `try_read_write_test` in tests/Makefile for an example. + */ +extern bool +try_read_byte( + mach_vm_address_t addr, + uint8_t * const out_byte, + kern_return_t * const out_error); + +/* + * Tries to write a single byte to an address. + * Returns true if the write succeeded. + * Aborts if an exception other than EXC_BAD_ACCESS is generated. + * On exit: + * *out_error is the EXC_BAD_ACCESS error code + * (typically KERN_PROTECTION_FAILURE or KERN_INVALID__ADDRESS) + * or 0 if the write succeeded. + * + * To use this function in your test you must set additional build options. + * See target `try_read_write_test` in tests/Makefile for an example. 
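+ *
+ * A minimal, hypothetical call site (the address and byte value are only
+ * illustrative), checking that a write to a read-only page faults with
+ * KERN_PROTECTION_FAILURE:
+ *
+ *     kern_return_t err;
+ *     if (!try_write_byte(addr, 0x5a, &err)) {
+ *         T_EXPECT_EQ(err, KERN_PROTECTION_FAILURE, "write faulted");
+ *     }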
+ */ +extern bool +try_write_byte( + mach_vm_address_t addr, + uint8_t byte, + kern_return_t * const out_error); diff --git a/tests/try_read_write_test.c b/tests/try_read_write_test.c new file mode 100644 index 000000000..396296234 --- /dev/null +++ b/tests/try_read_write_test.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * try_read_write_test.c + * + * Test the testing helper functions in try_read_write.h. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "try_read_write.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vm"), + T_META_RUN_CONCURRENTLY(true), + T_META_ALL_VALID_ARCHS(true) + ); + +#define MAYBE_QUIET(quiet) \ + do { \ + if (quiet) { \ + T_QUIET; \ + } \ + } while (0) + +static void +test_try_read_byte_maybe_quietly( + mach_vm_address_t addr, + uint8_t expected_byte, + kern_return_t expected_error, + bool quiet, + const char *message) +{ + bool expected_result = (expected_error == 0); + bool actual_result; + uint8_t actual_byte; + kern_return_t actual_error; + + actual_result = try_read_byte(addr, &actual_byte, &actual_error); + + MAYBE_QUIET(quiet); T_EXPECT_EQ(expected_result, actual_result, "%s: try_read_byte return value", message); + MAYBE_QUIET(quiet); T_EXPECT_EQ(expected_error, actual_error, "%s: try_read_byte error code", message); + if (expected_error == 0 && actual_error == 0) { + MAYBE_QUIET(quiet); T_EXPECT_EQ(expected_byte, actual_byte, "%s: try_read_byte value read", message); + } +} + +static void +test_try_read_byte( + mach_vm_address_t addr, + uint8_t expected_byte, + kern_return_t expected_error, + const char *message) +{ + test_try_read_byte_maybe_quietly(addr, expected_byte, expected_error, false /* quiet */, message); +} + +static void +test_try_read_byte_quietly( + mach_vm_address_t addr, + uint8_t expected_byte, + kern_return_t expected_error, + const char *message) +{ + test_try_read_byte_maybe_quietly(addr, expected_byte, expected_error, true /* quiet */, message); +} + +static void +test_try_write_byte_maybe_quietly( + mach_vm_address_t addr, + uint8_t expected_byte, + kern_return_t expected_error, + bool quiet, + const char *message) +{ 
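+	/*
+	 * Write expected_byte through try_write_byte() and check both the
+	 * boolean result and the reported error code; on a successful write,
+	 * read the byte back directly to confirm the store landed.
+	 */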
+ bool expected_result = (expected_error == 0); + bool actual_result; + uint8_t actual_byte; + kern_return_t actual_error; + + actual_result = try_write_byte(addr, expected_byte, &actual_error); + + MAYBE_QUIET(quiet); T_EXPECT_EQ(expected_result, actual_result, "%s: try_write_byte return value", message); + MAYBE_QUIET(quiet); T_EXPECT_EQ(expected_error, actual_error, "%s: try_write_byte error code", message); + if (expected_error == 0 && actual_error == 0) { + actual_byte = *(volatile uint8_t *)addr; + MAYBE_QUIET(quiet); T_EXPECT_EQ(expected_byte, actual_byte, "%s: try_write_byte value written", message); + } +} + +static void +test_try_write_byte( + mach_vm_address_t addr, + uint8_t expected_byte, + kern_return_t expected_error, + const char *message) +{ + test_try_write_byte_maybe_quietly(addr, expected_byte, expected_error, false /* quiet */, message); +} + +static void +test_try_write_byte_quietly( + mach_vm_address_t addr, + uint8_t expected_byte, + kern_return_t expected_error, + const char *message) +{ + test_try_write_byte_maybe_quietly(addr, expected_byte, expected_error, true /* quiet */, message); +} + +static mach_vm_address_t +allocate_page_with_prot(vm_prot_t prot) +{ + mach_vm_address_t addr; + kern_return_t kr; + + kr = mach_vm_allocate(mach_task_self(), &addr, PAGE_SIZE, VM_FLAGS_ANYWHERE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_allocate"); + kr = mach_vm_protect(mach_task_self(), addr, PAGE_SIZE, false /* set max */, prot); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_protect"); + return addr; +} + +static void +deallocate_page(mach_vm_address_t addr) +{ + kern_return_t kr = mach_vm_deallocate(mach_task_self(), addr, PAGE_SIZE); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate"); +} + +/* + * Generate some r-x memory with a known value. 
+ */ +static void __attribute__((naked)) +instruction_byte_ff(void) +{ + asm(".quad 0xffffffff"); +} + +T_DECL(try_read_write_test, + "test the test helper functions try_read_byte and try_write_byte") +{ + mach_vm_address_t addr; + + /* read and write an unmapped address */ + test_try_read_byte(0, 0, KERN_INVALID_ADDRESS, "read unmapped address"); + test_try_write_byte(0, 0, KERN_INVALID_ADDRESS, "write unmapped address"); + + /* read and write --- */ + addr = allocate_page_with_prot(VM_PROT_NONE); + test_try_read_byte(addr, 0, KERN_PROTECTION_FAILURE, "read prot ---"); + test_try_write_byte(addr, 1, KERN_PROTECTION_FAILURE, "write prot ---"); + deallocate_page(addr); + + /* read and write r-- */ + addr = allocate_page_with_prot(VM_PROT_READ); + test_try_read_byte(addr, 0, KERN_SUCCESS, "read prot r--"); + test_try_write_byte(addr, 1, KERN_PROTECTION_FAILURE, "write prot r--"); + deallocate_page(addr); + + /* read and write -w- */ + addr = allocate_page_with_prot(VM_PROT_WRITE); + test_try_read_byte(addr, 0, KERN_PROTECTION_FAILURE, "read prot -w-"); + test_try_write_byte(addr, 1, KERN_PROTECTION_FAILURE, "write prot -w-"); + deallocate_page(addr); + + /* read and write rw- */ + addr = allocate_page_with_prot(VM_PROT_READ | VM_PROT_WRITE); + *(uint8_t *)addr = 1; + test_try_read_byte(addr, 1, KERN_SUCCESS, "read prot rw-"); + test_try_write_byte(addr, 2, KERN_SUCCESS, "write prot rw-"); + test_try_read_byte(addr, 2, KERN_SUCCESS, "read prot rw- again"); + deallocate_page(addr); + + /* read and write r-x */ + addr = (mach_vm_address_t)ptrauth_strip(&instruction_byte_ff, ptrauth_key_function_pointer); + test_try_read_byte(addr, 0xff, KERN_SUCCESS, "read prot r-x"); + test_try_write_byte(addr, 1, KERN_PROTECTION_FAILURE, "write prot r-x"); +} + + +/* this test provokes THREAD_COUNT * REP_COUNT * PAGE_SIZE exceptions */ +#define THREAD_COUNT 10 +#define REP_COUNT 5 + +struct test_alloc { + mach_vm_address_t addr; + vm_prot_t prot; + kern_return_t expected_read_error; + kern_return_t expected_write_error; +}; + +static struct test_alloc +allocate_page_with_random_prot(void) +{ + struct test_alloc result; + + switch (random() % 4) { + case 0: + result.prot = VM_PROT_NONE; + result.expected_read_error = KERN_PROTECTION_FAILURE; + result.expected_write_error = KERN_PROTECTION_FAILURE; + break; + case 1: + result.prot = VM_PROT_READ; + result.expected_read_error = KERN_SUCCESS; + result.expected_write_error = KERN_PROTECTION_FAILURE; + break; + case 2: + result.prot = VM_PROT_WRITE; + result.expected_read_error = KERN_PROTECTION_FAILURE; + result.expected_write_error = KERN_PROTECTION_FAILURE; + break; + case 3: + result.prot = VM_PROT_READ | VM_PROT_WRITE; + result.expected_read_error = KERN_SUCCESS; + result.expected_write_error = KERN_SUCCESS; + break; + } + + result.addr = allocate_page_with_prot(result.prot); + return result; +} + +static void * +multithreaded_test(void *arg) +{ + struct test_alloc alloc = *(struct test_alloc *)arg; + + /* Read and write a lot from our page. 
*/ + for (int reps = 0; reps < REP_COUNT; reps++) { + for (int offset = 0; offset < PAGE_SIZE; offset++) { + test_try_read_byte_quietly(alloc.addr + offset, 0, alloc.expected_read_error, "thread read"); + test_try_write_byte_quietly(alloc.addr + offset, 0, alloc.expected_write_error, "thread write"); + } + } + + return NULL; +} + +T_DECL(try_read_write_test_multithreaded, + "test try_read_byte and try_write_byte from multiple threads") +{ + verbose_exc_helper = false; + + pthread_t threads[THREAD_COUNT]; + struct test_alloc allocs[THREAD_COUNT]; + + /* each thread gets a page with a random prot to read and write on */ + + for (int i = 0; i < THREAD_COUNT; i++) { + allocs[i] = allocate_page_with_random_prot(); + } + + T_LOG("running %d threads each %d times", THREAD_COUNT, REP_COUNT); + + for (int i = 0; i < THREAD_COUNT; i++) { + pthread_create(&threads[i], NULL, multithreaded_test, &allocs[i]); + } + + for (int i = 0; i < THREAD_COUNT; i++) { + pthread_join(threads[i], NULL); + deallocate_page(allocs[i].addr); + } +} diff --git a/tests/try_read_write_test_unexpected.c b/tests/try_read_write_test_unexpected.c new file mode 100644 index 000000000..f92dc451c --- /dev/null +++ b/tests/try_read_write_test_unexpected.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * try_read_write_test_unexpected.c + * + * Test the testing helper functions in try_read_write.h. + * The exception handler used by try_read_byte/try_write_byte + * should allow other exceptions to continue to a crash. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "try_read_write.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vm"), + T_META_RUN_CONCURRENTLY(true), + T_META_ALL_VALID_ARCHS(true), + + /* these tests are expected to crash */ + T_META_IGNORECRASHES(".*try_read_write_test_unexpected.*") + ); + +static void +install_exception_handler(void) +{ + kern_return_t kr; + bool result; + + result = try_write_byte(0, 0, &kr); + T_QUIET; T_ASSERT_EQ(result, false, "try_write_byte to NULL"); + T_QUIET; T_ASSERT_EQ(kr, KERN_INVALID_ADDRESS, "try_write_byte to NULL"); +} + +static void +test_crasher(void (^crashing_block)(void)) +{ + pid_t child_pid; + if ((child_pid = fork())) { + /* parent */ + int status; + int err = waitpid(child_pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "waitpid"); + T_EXPECT_TRUE(WIFSIGNALED(status), "parent: child process should crash"); + } else { + /* child */ + T_LOG("-- Calling try_write_byte() to install exception handlers --"); + install_exception_handler(); + T_LOG("-- The next exception should crash --"); + crashing_block(); + T_FAIL("child: process should have crashed"); + } +} + +static void __attribute__((noinline)) +THIS_IS_EXPECTED_TO_CRASH_EXC_BAD_ACCESS(void) +{ + *(volatile int *)0 = 1; +} + +static void __attribute__((noinline)) +THIS_IS_EXPECTED_TO_CRASH_BUILTIN_TRAP(void) +{ + __builtin_trap(); +} + + +T_DECL(try_read_write_unexpected_bad_access, + "test an unrelated EXC_BAD_ACCESS exception " + "with the try_read_write exception handler in place") +{ + test_crasher(^{ + /* + * Provoke EXC_BAD_ACCESS outside try_read_byte and try_write_byte. + * The try_read_write exception handler should catch and rethrow it. + */ + THIS_IS_EXPECTED_TO_CRASH_EXC_BAD_ACCESS(); + }); +} + + +T_DECL(try_read_write_unexpected_trap, + "test an unrelated non-EXC_BAD_ACCESS exception " + "with the try_read_write exception handler in place") +{ + test_crasher(^{ + /* + * Provoke a non-EXC_BAD_ACCESS exception outside of try_read_byte and try_write_byte. + * The try_read_write exception handler should not catch it. 
+ */ + THIS_IS_EXPECTED_TO_CRASH_BUILTIN_TRAP(); + }); +} diff --git a/tests/udp_kao_opt.c b/tests/udp_kao_opt.c index ec2676096..a63f1f1a8 100644 --- a/tests/udp_kao_opt.c +++ b/tests/udp_kao_opt.c @@ -125,7 +125,8 @@ set_udp_kao_opt(int expected_errno, int domain, const char *domain_str, #define SET_UDP_KAO_OPT(e, d, t, p) set_udp_kao_opt(e, d, #d, t, #t, p, #p) -T_DECL(test_udp_keep_alive_option, "TCP bind with a IPv6 multicast address") +T_DECL(test_udp_keep_alive_option, "TCP bind with a IPv6 multicast address", + T_META_ENABLED(false) /* rdar://134506000 */) { tcp_listen(); diff --git a/tests/unit/Makefile b/tests/unit/Makefile new file mode 100644 index 000000000..e96657f0c --- /dev/null +++ b/tests/unit/Makefile @@ -0,0 +1,346 @@ +# example of how to run this from XNU root, see README.md: +# make -C tests/unit SDKROOT=macosx.internal example_test_osfmk + +PROJECT := xnu/unit_tests + +ifdef BASEDSTROOT +override DSTROOT = $(BASEDSTROOT) +endif + +DEVELOPER_DIR ?= $(shell xcode-select -p) +INVALID_ARCHS = $(filter-out arm64%,$(ARCH_CONFIGS)) + +.DEFAULT_GOAL := install + +ORIG_SYMROOT := $(SYMROOT) + +ifdef OBJROOT +XNU_OBJROOT := $(OBJROOT) +else +XNU_OBJROOT = $(XNU_ROOT)/BUILD/obj +endif + +include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common + +ifndef KERNEL_CONFIG +KERNEL_CONFIG = development +endif +ifndef PRODUCT_CONFIG +PRODUCT_CONFIG = j414c +endif + +# find the name of the XNU library +XNU_ROOT := $(abspath $(SRCROOT)/../..) +KERNEL_CONFIG_UPPER := $(shell echo $(KERNEL_CONFIG) | tr '[:lower:]' '[:upper:]') +XNU_DETAILS := $(shell $(SRCROOT)/tools/get_target_details.py $(SDKROOT) $(PRODUCT_CONFIG)) +XNU_ARCH := $(word 1,$(XNU_DETAILS)) +XNU_KERNEL_PLATFORM := $(word 2,$(XNU_DETAILS)) +XNU_KERNEL_PLATFORM_UPPER := $(shell echo $(XNU_KERNEL_PLATFORM) | tr '[:lower:]' '[:upper:]') +XNU_FILE_NAME_PREFIX := $(word 3,$(XNU_DETAILS)) + +XNU_BUILD_DIR := $(XNU_OBJROOT)/$(KERNEL_CONFIG_UPPER)_$(XNU_ARCH)_$(XNU_KERNEL_PLATFORM_UPPER) +XNU_LIB_FILE_BASE := lib$(XNU_FILE_NAME_PREFIX).$(KERNEL_CONFIG).$(XNU_KERNEL_PLATFORM) +XNU_LIB_FILE := $(XNU_BUILD_DIR)/$(XNU_LIB_FILE_BASE).a + +# avoid annoyances +OTHER_CFLAGS += -Wno-missing-prototypes -Wno-unused-parameter -Wno-missing-variable-declarations +# darwintest config +DT_CFLAGS = -UT_NAMESPACE_PREFIX -DT_NAMESPACE_PREFIX=xnu -DT_LEAKS_DISABLE=1 -DBUILD_NO_STD_HEADERS -I$(DT_SYMLINKS_DIR) +OTHER_CFLAGS += $(DT_CFLAGS) +OTHER_CXXFLAGS += $(DT_CFLAGS) +OTHER_LDFLAGS += -ldarwintest_utils +# interpose header +INTERPOSE_CFLAGS = -I$(SDKROOT)/usr/local/include/mach-o +OTHER_CFLAGS += $(INTERPOSE_CFLAGS) +# we build with clang but XNU contains C++ so we add this manually +OTHER_LDFLAGS += -lc++ + +_v = $(if $(filter YES,$(or $(VERBOSE),$(RC_XBS))),,@) + +LD := "$(shell xcrun -sdk "$(SDKROOT)" -find ld)" +DYLD_INFO := "$(shell xcrun -sdk "$(SDKROOT)" -find dyld_info)" +LIBTOOL := "$(shell xcrun -sdk "$(SDKROOT)" -find libtool)" + +XNU_MAKE_FLAGS = BUILD_LTO=0 PRODUCT_CONFIGS=$(PRODUCT_CONFIG) KERNEL_CONFIGS=$(KERNEL_CONFIG) +XNU_CFLAGS_EXTRA = +COVERAGE_FLAGS = + +ifeq ($(BUILD_CODE_COVERAGE),1) +# make XNU build coverage +XNU_MAKE_FLAGS += BUILD_CODE_COVERAGE=1 +# make mocks code not mock some coverage related llvm functions +COVERAGE_FLAGS += -D__BUILDING_FOR_COVERAGE__=1 +endif # BUILD_CODE_COVERAGE + +BUILD_SANITIZERS = 0 +SANITIZERS_FLAGS = +ATTACH_SANITIZERS_SOURCES = +SANCOV_FLAG = -fsanitize-coverage=bb,no-prune,trace-pc-guard + +ifeq ($(FIBERS_PREEMPTION),1) +# trace-loads,trace-stores depends on either 
trace-pc-guard or libfuzzer instrumentation +BUILD_SANCOV = 1 +# compile with memory operations instrumentation +XNU_CFLAGS_EXTRA += -fsanitize-coverage=trace-loads,trace-stores +# make mocks code aware of sanitizers runtime being linked +SANITIZERS_FLAGS += -D__BUILDING_WITH_SANCOV_LOAD_STORES__=1 +endif # FIBERS_PREEMPTION + +ifeq ($(BUILD_ASAN),1) +BUILD_SANITIZERS = 1 +# compile XNU with asan +# TODO: enable globals instrumentation and write a proper ignorelist for problematic global vars +XNU_CFLAGS_EXTRA += -fsanitize=address -mllvm -asan-globals=0 +# make mocks code aware of sanitizers runtime being linked +SANITIZERS_FLAGS += -D__BUILDING_WITH_ASAN__=1 +endif # BUILD_ASAN + +ifeq ($(BUILD_UBSAN),1) +BUILD_SANITIZERS = 1 +# compile XNU with ubsan +# TODO: add more ubsan support +XNU_CFLAGS_EXTRA += -fsanitize=signed-integer-overflow,shift,pointer-overflow,bounds,object-size,vla-bound,builtin +# make mocks code aware of sanitizers runtime being linked +SANITIZERS_FLAGS += -D__BUILDING_WITH_UBSAN__=1 +endif # BUILD_UBSAN + +ifeq ($(BUILD_TSAN),1) +BUILD_SANITIZERS = 1 +# compile XNU with tsan +XNU_CFLAGS_EXTRA += -fsanitize=thread +# make mocks code aware of sanitizers runtime being linked +SANITIZERS_FLAGS += -D__BUILDING_WITH_TSAN__=1 +endif # BUILD_TSAN + +# SanitizerCoverage is used for preemption simulation +ifeq ($(BUILD_SANCOV),1) +BUILD_SANITIZERS = 1 +# compile XNU with bb sancov +XNU_CFLAGS_EXTRA += $(SANCOV_FLAG) -fsanitize-coverage-ignorelist=$(SRCROOT)/tools/sanitizers-ignorelist +# make mocks code aware of sanitizers runtime being linked +SANITIZERS_FLAGS += -D__BUILDING_WITH_SANCOV__=1 +endif # BUILD_SANCOV + +ifeq ($(BUILD_SANITIZERS),1) +# include the ignorelist to disable instrumentation of some file/functions https://clang.llvm.org/docs/SanitizerSpecialCaseList.html +XNU_CFLAGS_EXTRA += -fsanitize-ignorelist=$(SRCROOT)/tools/sanitizers-ignorelist +# make mocks code aware of sanitizers runtime being linked +SANITIZERS_FLAGS += -D__BUILDING_WITH_SANITIZER__=1 +ATTACH_SANITIZERS_SOURCES += mocks/san_attached.c +endif # BUILD_SANITIZERS + +ifneq ($(strip $(XNU_CFLAGS_EXTRA)),) +# add CFLAGS_EXTRA to the XNU make flags if any, wrap between "" as XNU_CFLAGS_EXTRA contains spaces and replace inner " with \" +XNU_MAKE_FLAGS += CFLAGS_EXTRA="$(subst ",\",$(XNU_CFLAGS_EXTRA))" +endif + +# sources for the mocks .dylib which overrides functions from XNU +MOCK_SOURCES = mocks/mock_alloc.c \ + mocks/mock_misc.c \ + mocks/mock_pmap.c \ + mocks/mock_thread.c \ + mocks/mock_unimpl.c \ + mocks/mock_cpu.c \ + mocks/unit_test_utils.c \ + mocks/fibers/random.c \ + mocks/fibers/fibers.c \ + mocks/fibers/mutex.c \ + mocks/fibers/condition.c \ + mocks/fibers/rwlock.c \ + mocks/fibers/checker.c + +# sources for the mocks that are linked into the same .dylib as the XNU static lib +# fake_kinit.c needs to be first because it contains the initialization entry point +ATTACH_MOCK_SOURCES = mocks/fake_kinit.c \ + mocks/mock_3rd_party.c \ + mocks/mock_mem.c \ + mocks/mock_attached.c \ + $(ATTACH_SANITIZERS_SOURCES) + +# sources that are added to the compilation of every test target +TEST_SIDE_SOURCES = mocks/dt_proxy.c + +# --------------------- individual tests customization ---------------------------- + +INCLUDED_TEST_SOURCE_DIRS += example_dir + + +# --------------------------------------------------------------------------------- +# we don't want to pass our arguments to the XNU makefile since that would confuse it, but we do want to pass any +# -j argument if it existed +unexport SRCROOT 
+unexport SDKROOT +unexport MAKEFLAGS +MKJOBS = $(filter --jobs=%,$(MAKEFLAGS)) + +.FORCE: +ifeq ($(SKIP_XNU),) +# The XNU make needs to run every time since if anything changed in XNU, we want to rebuild the tests +# This extra wait time can be skipped buy adding SKIP_XNU=1 to the make command line +$(XNU_LIB_FILE): .FORCE + SDKROOT=$(SDKROOT) $(MAKE) -C $(XNU_ROOT) RC_ProjectName=xnu_libraries XNU_LibAllFiles=1 XNU_LibFlavour=UNITTEST SDKROOT=$(SDKROOT) $(XNU_MAKE_FLAGS) $(MKJOBS) +endif # SKIP_XNU + +# Structure of unit-test linking: +# Mocking of XNU functions relies on the interposable functions mechanism which requires the interposed and interposable +# functions to be in different .dylibs +# - tester (executable) +# - compiles tester.c +# - statically linked to libdarwintest.a +# - statically linked to libside.a +# - dynamically linked to libmocks.dylib +# - dynamically linked to libkernel.xxx.dylib +# - libside.a +# - compiles mocks/dt_proxy.c ($(TEST_SIDE_SOURCES)) which bridges PT_xxx calls from libmocks and libkernel +# to libdarwintest. This is needed since the test executable is the only mach-o which links to libdarwintest +# - libmocks.dylib +# - compiles mocks/mock_xxx.c - $(MOCK_SOURCES) files which contain T_MOCK() definitions to override functions from XNU +# via the interposable functions mechanism. +# - dynamically linked with libkernel.xxx.dylib +# - libkernel.xxx.dylib +# - compiles mocks/xxx.c - $(ATTACH_MOCK_SOURCES) files which contain dependencies needed for linking the XNU static library +# - statically linked to libkernel.xxx.prelinked.a +# - makes all functions interposable so that internal calls are routed to mocks +# - libkernel.xxx.prelinked.a +# - This is the same content as libkernel.xxx.a, passed through "ld -r" so that some symbols that are also +# defined in libc can be unexported, so that anything outside XNU (i.e. mock and darwintest code) doesn't end up +# calling them. e.g get_pid() +# - libkernel.xxx.a +# - This is the static lib produced by XNU make which contains all of the XNU code. + +# flags that .c files under MODULE (osfmk/bsd) are built with. MODULE should be defined per target +MODULE_CFLAGS = $(shell $(SRCROOT)/tools/quote_defines.py $(XNU_BUILD_DIR)/$(MODULE)/$(KERNEL_CONFIG_UPPER)/.CFLAGS) -I$(XNU_BUILD_DIR)/$(MODULE)/$(KERNEL_CONFIG_UPPER) $(MODULE_FLAG) $(SANITIZERS_FLAGS) +MODULE_CXXFLAGS = $(shell $(SRCROOT)/tools/quote_defines.py $(XNU_BUILD_DIR)/$(MODULE)/$(KERNEL_CONFIG_UPPER)/.CXXFLAGS) -I$(XNU_BUILD_DIR)/$(MODULE)/$(KERNEL_CONFIG_UPPER) $(MODULE_FLAG) $(SANITIZERS_FLAGS) +OSFMK_CFLAGS = $(shell $(SRCROOT)/tools/quote_defines.py $(XNU_BUILD_DIR)/osfmk/$(KERNEL_CONFIG_UPPER)/.CFLAGS) -I$(XNU_BUILD_DIR)/osfmk/$(KERNEL_CONFIG_UPPER) +MOCKS_CFLAGS = $(OSFMK_CFLAGS) $(INTERPOSE_CFLAGS) $(COVERAGE_FLAGS) $(SANITIZERS_FLAGS) + +# We need to link the darwintest headers from an empty folder and include from there since we can't add their +# original folder as -I since that would allow XNU headers to include SDK headers. +DT_SYMLINKS_DIR := $(OBJROOT)/darwintest_headers +DT_ORIG_DIR := $(SDKROOT)/usr/local/include +# Target to check if the sdk changed. content of the file is the path to the last sdk used for building +$(OBJROOT)/sdk_updated: .FORCE + $(_v)if [ ! -f $(OBJROOT)/sdk_updated ] || [ "$$(cat $(OBJROOT)/sdk_updated)" != "$(SDKROOT)" ]; then \ + echo $(SDKROOT) > $(OBJROOT)/sdk_updated; \ + fi +# Do we need to update the darwintest headers symlinks? 
+$(DT_SYMLINKS_DIR): $(DT_ORIG_DIR)/darwintest.h $(OBJROOT)/sdk_updated + $(_v)mkdir -p $(DT_SYMLINKS_DIR) + $(_v)for file in $(DT_ORIG_DIR)/darwintest*; do ln -sfn "$${file}" $(DT_SYMLINKS_DIR); done + +# The LD invocation below can't get the arch and platform from the prelinked.a file so we need this empty object +# to carry this information +ARCH_DEF_OBJ := $(OBJROOT)/arch_def.o +$(ARCH_DEF_OBJ): $(XNU_LIB_FILE) + $(_v)$(CC) -c -x c /dev/null -o $@ $(CFLAGS) + +SPTM_LIB := $(SDKROOT)/usr/local/lib/kernel/platform/libsptm_xnu.a +# this creates a dummy executable with libsptm_xnu.a so that the xbs dependency analysis know to build xnu_unittests after libsptm was installed +DUMMY_SPTM_EXE := $(OBJROOT)/ut_dummy_sptm +$(DUMMY_SPTM_EXE): $(SPTM_LIB) $(OBJROOT)/sdk_updated $(ARCH_DEF_OBJ) + $(_v)echo "int start() { return 0; }" > $(OBJROOT)/dummy_sptm_start.c + $(_v)$(CC) $(CFLAGS) $(OBJROOT)/dummy_sptm_start.c -Wl,-e,_start -Wl,-pie -nostdlib -Wl,-kernel -static -Wl,-force_load,$(SPTM_LIB) -o $@ + +XNU_LIB_PRELINKED := $(OBJROOT)/$(XNU_LIB_FILE_BASE).prelinked.a +$(XNU_LIB_PRELINKED): $(XNU_LIB_FILE) $(SRCROOT)/tools/xnu_lib.unexport $(ARCH_DEF_OBJ) + $(_v)$(LD) -r $(ARCH_DEF_OBJ) -all_load $(XNU_LIB_FILE) -o $@ -unexported_symbols_list $(SRCROOT)/tools/xnu_lib.unexport + +XNU_LIB_DYLIB := $(SYMROOT)/$(XNU_LIB_FILE_BASE).dylib +$(XNU_LIB_DYLIB): $(XNU_LIB_PRELINKED) $(ATTACH_MOCK_SOURCES) $(DT_SYMLINKS_DIR) + $(_v)$(CC) $(MOCKS_CFLAGS) $(CFLAGS) -dynamiclib $(ATTACH_MOCK_SOURCES) -Wl,-all_load,$(XNU_LIB_PRELINKED) -lc++ -Wl,-undefined,dynamic_lookup -Wl,-interposable -install_name @rpath/$(XNU_LIB_FILE_BASE).dylib -o $@ + +# Do we need to update the unimplemented functions mock? +$(OBJROOT)/func_unimpl.inc: $(XNU_LIB_DYLIB) + $(_v)echo "// Generated from undefined imports from: $(XNU_LIB_DYLIB)" > $@ + $(_v)$(DYLD_INFO) -imports $(XNU_LIB_DYLIB) | grep "flat-namespace" | awk '{print "UNIMPLEMENTED(" substr($$2, 2) ")"}' >> $@ +# The xnu dylib is linked with -undefined dynamic_lookup so that it's able to find undefined symbols at load time +# These are symbols that come from libsptm_xnu.a and libTightbeam.a. They still need to have an implementation for load +# to succeeds so this gets the list of undefined symbols and defines them in the mocks dylib +# use awk to discard the first underscore prefix of each symbol and wrap with UNIMPLEMENTED() macro expected in mock_unimpl.c + +MOCKS_DYLIB := $(SYMROOT)/libmocks.dylib +$(MOCKS_DYLIB): $(MOCK_SOURCES) $(XNU_LIB_DYLIB) $(OBJROOT)/func_unimpl.inc + $(_v)$(CC) $(MOCKS_CFLAGS) $(CFLAGS) -I$(OBJROOT) $(XNU_LIB_DYLIB) -dynamiclib -MJ $@.json $(MOCK_SOURCES) $(XNU_LIB_DYLIB) -install_name @rpath/libmocks.dylib -o $@ +# -install_name makes the test executables which link to these .dylibs find them in the same folder rather than the +# folder the .dylibs happen to be built at. 
The path is relative to rpath and rpath is defined by the executable +# itself to be to root of test executables, see below + +SIDE_LIB := $(OBJROOT)/libside.a +TEST_SIDE_OBJ := $(foreach source,$(TEST_SIDE_SOURCES),$(OBJROOT)/$(notdir $(basename $(source))).o) +$(TEST_SIDE_OBJ): $(TEST_SIDE_SOURCES) $(DT_SYMLINKS_DIR) $(XNU_LIB_DYLIB) + $(_v)$(CC) $(MOCKS_CFLAGS) $(CFLAGS) $(DT_CFLAGS) -c $< -o $@ +$(SIDE_LIB): $(TEST_SIDE_OBJ) + $(_v)$(LIBTOOL) -static $< -o $@ + +# this creates a shell script for running all the unit-tests on-desk (build all using target 'install' +.PHONY: run_unittests +install: run_unittests +RUN_UT_SCRIPT := $(SYMROOT)/run_unittests.sh +RUN_UT_LIST := $(SYMROOT)/run_unittests.targets +run_unittests: $(RUN_UT_SCRIPT) +$(RUN_UT_SCRIPT): $(SRCROOT)/tools/make_run_unittests.py + $(SRCROOT)/tools/make_run_unittests.py "$(TEST_TARGETS)" $@ $(RUN_UT_LIST) + chmod a+x $@ + +# inform the dt makefile that these need to be installed along with the test executables +CUSTOM_TARGETS += $(XNU_LIB_DYLIB) $(MOCKS_DYLIB) $(DUMMY_SPTM_EXE) $(RUN_UT_SCRIPT) +install-$(XNU_LIB_DYLIB): $(XNU_LIB_DYLIB) + mkdir -p $(INSTALLDIR) + cp $< $(INSTALLDIR)/ +install-$(MOCKS_DYLIB): $(MOCKS_DYLIB) + mkdir -p $(INSTALLDIR) + cp $< $(INSTALLDIR)/ +install-$(DUMMY_SPTM_EXE): $(DUMMY_SPTM_EXE) + echo "not copying $(DUMMY_SPTM_EXE)" +install-$(RUN_UT_SCRIPT): $(RUN_UT_SCRIPT) + mkdir -p $(INSTALLDIR) + cp $< $(INSTALLDIR)/ + cp $(RUN_UT_LIST) $(INSTALLDIR)/ + +OTHER_LDFLAGS += $(XNU_LIB_DYLIB) $(MOCKS_DYLIB) -Wl,-force_load,$(SIDE_LIB) + + +include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets + +# Every unit-test target needs to define a target-specific variable named UT_MODULE so that MODULE_CFLAGS is defined +# This is parsed from the .c file that needs to have a line like: #define UT_MODULE osfmk +# The clang invocation may produce errors due to missing include folders but it still generates the #defines list. +define assign_module +$(1): MODULE ?= $(shell for f in $(1).c $(1).cpp; do \ + [ -f $$f ] && $(CC) -E -dM $$f 2>/dev/null | grep -m1 '^#define UT_MODULE ' | awk '{print $$3}' && break; \ +done) +endef +$(foreach target, $(TEST_TARGETS), $(eval $(call assign_module, $(target), $(target)))) +# if no UT_MODULE was found, this will trigger an error in std_safe.h +MODULE_FLAG = -DUT_MODULE=$(if $(strip $(MODULE)),$(MODULE),-1) + +# All test targets depend on the XNU lib +$(TEST_TARGETS): $(XNU_LIB_DYLIB) $(MOCKS_DYLIB) $(SIDE_LIB) + +# if the test executalbe is in a sub-folder, it's rpath needs to point back to the root of all +# test executables. This is done by appending as many ".." to @executable_path as there are levels of sub-directories +define make_rpath +@executable_path$(shell \ + dir=$$(dirname "$1"); \ + if [ "$$dir" != "." ]; then \ + echo "$$dir" | sed 's|[^/][^/]*|..|g' | sed 's|^|/|'; \ + fi) +endef +# this sets the CFLAGS for building the test to be the same as files in it's UT_MODULE +$(TEST_TARGETS): OTHER_CFLAGS += $(MODULE_CFLAGS) -MJ $(OBJROOT)/$(subst /,__,$@).json -rpath $(call make_rpath,$@) +# C++ files get both CFLAGS and CXXFLAGS +$(TEST_TARGETS): OTHER_CXXFLAGS += $(MODULE_CXXFLAGS) $(MODULE_CFLAGS) -MJ $(OBJROOT)/$(subst /,__,$@).json -rpath $(call make_rpath,$@) + +# This is for creating a new version of $(XNU_ROOT)/compile_commands.json that includes +# the test and mock files. 
It's used by IDEs for understanding the code
+.PHONY: cmds_json proj_xcode proj_vscode proj_clion
+cmds_json:
+	$(SRCROOT)/tools/merge_cmds_json.py $(XNU_ROOT) $(XNU_BUILD_DIR) $(OBJROOT)
+
+proj_xcode: cmds_json
+	$(SRCROOT)/tools/generate_ut_proj.py xcode
+proj_vscode: cmds_json
+	$(SRCROOT)/tools/generate_ut_proj.py vscode
+proj_clion: cmds_json
+	$(SRCROOT)/tools/generate_ut_proj.py clion
+
diff --git a/tests/unit/README.md b/tests/unit/README.md
new file mode 100644
index 000000000..a12557a51
--- /dev/null
+++ b/tests/unit/README.md
@@ -0,0 +1,210 @@
+# XNU user-space unit-tests
+
+This folder contains unit-tests for in-kernel functionality, built as a user-space process.
+
+### Building a test:
+```
+> make -C tests/unit SDKROOT=macosx.internal <test_name>
+```
+This will build XNU as a library and link it into a test executable.
+`<test_name>` is the name of the test executable. There should be a corresponding `<test_name>.c`
+Examples for `<test_name>`: `example_test_osfmk`, `example_test_bsd`
+
+Useful customizations for the make command:
+- `VERBOSE=YES` - Show more of the build commands
+- `BUILD_WERROR=0` - When building XNU, do not convert warnings to errors
+- `SKIP_XNU=1` - Don't try to rebuild XNU
+- `KERNEL_CONFIG=release` - Build XNU in release rather than 'development'
+- `PRODUCT_CONFIG=...` - Build XNU for a device other than the default. Only macOS devices are supported
+- `BUILD_CODE_COVERAGE=1` - Build with coverage support, see section below
+- `FIBERS_PREEMPTION=1` - Build with memory operations instrumentation to simulate preemption, see section below
+- `BUILD_ASAN=1` - Build with AddressSanitizer support
+- `BUILD_UBSAN=1` - Build with UndefinedBehaviorSanitizer support
+- `BUILD_TSAN=1` - Build with ThreadSanitizer support
+
+### Running a test
+The darwintest executable is created in `tests/unit/build/sym/`. To run all tests in an executable:
+```text
+> ./tests/unit/build/sym/<test_name>
+```
+
+### Creating a new test
+- Add a `<test_name>.c` file in this directory with the test code.
+- In the added .c file, add a line that looks like `#define UT_MODULE osfmk`
+This determines the context in which the test is going to be built. This should be
+either "bsd" or "osfmk", depending on where the tested code resides. See example_test_bsd.c, example_test_osfmk.c.
+
+### Building all tests
+To build and run all the unit-test executables do:
+```
+> make -C tests/unit SDKROOT=macosx.internal install
+> ./tests/unit/build/sym/run_unittests.sh
+```
+Another option is to run through the main Makefile:
+```
+> make SDKROOT=macosx.internal xnu_unittests
+> ./BUILD/sym/run_unittests.sh
+```
+This is what the xnu_unittests build alias builds. Notice that the output folder is different from the first option.
+
+## Debugging a test
+```
+> xcrun -sdk macosx.internal lldb ./tests/unit/build/sym/<test_name>
+(lldb) run
+```
+Notice that if the test executable contains more than one `T_DECL()`, libdarwintest is going to run each `T_DECL()`
+in a separate child process, so invoking `run` in lldb without the name of a specific `T_DECL()` will debug just the top
+level process and not stop on breakpoints.
+For a better debugging experience wrap debugged code with
+```
+#pragma clang attribute push(__attribute__((noinline, optnone)), apply_to=function)
+...
+#pragma clang attribute pop
+```
+or annotate individual functions with `__attribute__((noinline, optnone))`
+
+The unit-tests Makefile is able to generate files that allow an easy debugging experience with various IDEs:
+```
+> make SDKROOT=macosx.internal cmds_json
+```
+This make target adds the unit-test executables that were built since the last `clean` to the `compile_commands.json`
+file at the root of the repository so that IDEs that support this file (VSCode, CLion) know about the tests' .c files
+as well as the XNU .c files.
+
+### Debugging with Xcode
+```
+> make SDKROOT=macosx.internal proj_xcode
+```
+This reads the `compile_commands.json` file and generates an Xcode project named `ut_xnu_proj.xcodeproj` with all of
+XNU and the unit-tests source files, and running schemes for the test targets.
+To debug using this project:
+- Start Xcode, open the `ut_xnu_proj.xcodeproj` project
+- At the top bar, select the running scheme named after the test executable name (`<test_name>`)
+- In the same menu, press "Edit Scheme", go to "Run"->"Arguments" and add as an argument the name of the `T_DECL()`
+to debug
+- Again at the top bar, to the right of the name of the scheme press `My Mac (arm64e)` to open the Location menu
+- Select `My Mac (arm64)` (instead of `My Mac (arm64e)`)
+- Set a breakpoint in the test
+- Press the Play button at the top bar
+
+### Debugging with VSCode
+```
+> make SDKROOT=macosx.internal proj_vscode
+```
+This reads the `compile_commands.json` file and generates a `.vscode/launch.json` file for VSCode to know about
+the executables to run.
+(if you already have such a file, it will be overwritten)
+To debug in VSCode:
+- (one time setup) Install the "LLDB DAP" extension
+  - the "LLDB DAP" extension uses the lldb from the currently selected Xcode.app
+- Open the XNU root folder
+- Press the "Run and Debug" tab at the left bar
+- Select the test executable name from the top menu (`<test_name>`)
+- Press the gear icon next to it to edit launch.json
+- In "args", write the name of the `T_DECL()` to debug
+- Press the green play arrow next to the test name
+
+### Debugging with CLion
+```
+> make SDKROOT=macosx.internal proj_clion
+```
+This reads the `compile_commands.json` file and edits the files in `.idea` for CLion to know about
+the executables to run.
+To debug in CLion you need CLion version 2025.1.3 or above, which supports a custom external lldb.
+- (one time setup) Add a new custom-lldb toolchain:
+  - Open Settings -> "Build, Execution, Deployment" -> Toolchains
+  - Press the "+" icon above the list
+  - Name the new toolchain "System"
+  - At the bottom, next to "Debugger:" add the path to an installed Xcode.app
+  - (it doesn't have to be the Xcode.app which is currently selected or the one which is used to build XNU)
+- Open the XNU root folder
+- At the top right select the test executable name (`<test_name>`) from the menu
+- Press the menu again "Edit Configurations..."
+- Next to "Program arguments:" write the name of the `T_DECL()` to debug
+- Press the bug icon at the top right to debug
+
+
+## Running Coverage Analysis
+1. Run the unit-test make command with the coverage option:
+```
+> make -C tests/unit SDKROOT=macosx.internal BUILD_CODE_COVERAGE=1
+```
+This will build XNU, the mocks dylib and the test executable with coverage instrumentation.
+2. Run the unit-test and tell the coverage lib where to save the .profraw file:
+```
+> LLVM_PROFILE_FILE="coverage_data.profraw" ./tests/unit/build/sym/<test_name>
+```
+3. 
Convert the .profraw file to a .profdata file:
+```
+> xcrun -sdk macosx.internal llvm-profdata merge -sparse coverage_data.profraw -o coverage_data.profdata
+```
+4. Generate reports
+
+High-level per-file textual report:
+```
+> xcrun -sdk macosx.internal llvm-cov report ./tests/unit/build/sym/libkernel.development.t6020.dylib -instr-profile=coverage_data.profdata
+```
+Low-level per-line html pages in a directory structure:
+```
+> xcrun -sdk macosx.internal llvm-cov show ./tests/unit/build/sym/libkernel.development.t6020.dylib -instr-profile=coverage_data.profdata --format=html -output-dir ./_cov_html
+> open ./_cov_html/index.html
+```
+Mind that both of these commands take the binary for which we want to show information, in this case the XNU dylib.
+If you want to show the coverage for the unit-test executable, put that instead. It's also possible to specify multiple binaries with the `-object` argument.
+
+Both these commands can take a `-sources` argument followed by the list of source files to limit the source files that show up in the report.
+The names need to be the real paths of the files (relative or absolute), not just the path part that appears in the `report` output.
+5. To check the coverage of a single function add `-name=<function_name>` to the `show` command.
+6. To manually filter out functions from the report, for instance if the source file contains test functions which
+are not interesting for coverage statistics:
+- Add `-show-functions` to the `report` command and redirect the output to a file.
+- From the output, take only the function names with:
+`cat report_output.txt | cut -d " " -f1 | sort | uniq > func_names.txt`
+- Edit the file and remove the function names that are not needed.
+Mind that in this list, static functions appear with the filename prefixed.
+- Add the prefix `allowlist_fun:` to every line in the file:
+`cat func_names.txt | sed 's/^/allowlist_fun:/' > allow_list.txt`
+- Add the argument `-name-allowlist=allow_list.txt` to the `show` command.
+
+See more documentation:
+https://clang.llvm.org/docs/SourceBasedCodeCoverage.html
+https://llvm.org/docs/CommandGuide/llvm-cov.html
+
+## Deterministic threading with fibers
+The mocks library provides a fibers implementation that can be used by tests that include the header files in `mocks/fibers/`.
+
+To access mocks that replace locking and scheduling APIs like lck_mtx_t and waitq functions, the test file must include `mocks/mock_thread.h`
+and use the `UT_USE_FIBERS(1)` macro in the global scope.
+
+By default, the context switch points are placed at the entry and exit of the fibers API (e.g. before and after a mutex lock), but preemption can be simulated using compiler instrumentation.
+If you add `FIBERS_PREEMPTION=1` to the make command line, every memory load and store in the XNU library and in your test file will be instrumented to be
+a possible context switch point for the deterministic scheduler.
+
+In addition, a data race detector can be enabled when the test is using fibers with preemption simulation.
+The checker is a probabilistic data race sanitizer based on the [DataCollider](https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Erickson.pdf) algorithm and can be used as
+a replacement for ThreadSanitizer (which works with the fibers implementation but can report false positives) or in combination with it.
+The checker can be enabled with the macro `UT_FIBERS_USE_CHECKER(1)` in the global scope of the test file or by setting the `FIBERS_CHECK_RACES` env var when executing a test with fibers.
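+
+A minimal sketch of what such a test can look like (the test name, `bump_counter`, and `shared_counter` below are invented for illustration; the macros and `fibers_*` calls are the ones declared by the mocks headers above):
+```
+#include <darwintest.h>
+
+#include "mocks/mock_thread.h"
+#include "mocks/fibers/fibers.h"
+#include "mocks/fibers/random.h"
+
+#define UT_MODULE osfmk
+
+UT_USE_FIBERS(1);               /* run test "threads" as fibers */
+UT_FIBERS_USE_CHECKER(1);       /* optionally enable the data race checker */
+
+static int shared_counter;      /* illustrative shared state */
+
+static void *
+bump_counter(void *arg)         /* hypothetical worker, runs on its own fiber */
+{
+	shared_counter++;       /* with FIBERS_PREEMPTION=1 this access is a potential switch point */
+	return arg;
+}
+
+T_DECL(fibers_sketch, "illustrative fibers test")
+{
+	random_set_seed(1234);  /* fixed seed gives a deterministic schedule */
+	fiber_t f = fibers_create(FIBERS_DEFAULT_STACK_SIZE, bump_counter, NULL);
+	shared_counter++;
+	fibers_join(f);         /* always join the fibers that were created */
+	T_EXPECT_EQ(shared_counter, 2, "both increments observed");
+}
+```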
+
+For an example test using fibers read `fibers_test`.
+
+
+## FAQ
+- Q: I'm trying to call function X but I get a linker error "Undefined symbols for architecture arm64e: X referenced from..."
+- A: This is likely due to the function being declared as hidden, either using `__private_extern__` at
+the function declaration or a `#pragma GCC visibility push(hidden)`/`#pragma GCC visibility pop` pair around
+where it's defined. You can verify this by doing:
+`nm -m tests/unit/build/obj/libkernel.development.t6020.dylib | grep <function_name>`
+and verifying that the function in question appears next to a lower-case `t` to mean it's a private symbol
+(as opposed to a capital `T` which means it's an exported symbol, or it not appearing at all which means there is
+no such function).
+To fix that, simply change `__private_extern__` to `__exported_hidden`, or replace the `#pragma` pair with
+`__exported_push_hidden`/`__exported_pop`. These keep the visibility the same (hidden) for a normal XNU build but
+drop to the default (visible) for the user-mode build.
+
+
+- Q: How to build XNU on-desk if it builds with warnings which are converted to errors?
+- A: In the make command line add `BUILD_WERROR=0`
+
+
+- Q: lldb startup takes a long time and shows many errors about loading symbols
+- A: Try doing `dsymForUUID --disable` to disable automatic symbol loading
diff --git a/tests/unit/bits_test.c b/tests/unit/bits_test.c
new file mode 100644
index 000000000..dd79a9d0a
--- /dev/null
+++ b/tests/unit/bits_test.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2025 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include
+#include "mocks/unit_test_utils.h"
+#include
+
+#define UT_MODULE osfmk
+T_GLOBAL_META(
+	T_META_NAMESPACE("xnu.scheduler"),
+	T_META_RADAR_COMPONENT_NAME("xnu"),
+	T_META_RADAR_COMPONENT_VERSION("scheduler"),
+	T_META_TAG_VM_PREFERRED
+	);
+
+T_DECL(xnu_bits, "snapshot tests for bit manipulation routines")
+{
+	T_EXPECT_EQ(bit_first(0ULL), -1, "bit_first");
+	T_EXPECT_EQ(lsb_first(0ULL), -1, "lsb_first");
+
+	for (int i = 0; i < 64; i++) {
+		T_EXPECT_EQ(bit_first(BIT(i)), i, "bit_first");
+		T_EXPECT_EQ(lsb_first(BIT(i)), i, "lsb_first");
+
+		T_EXPECT_EQ(bit_first(mask(i)), i - 1, "bit_first");
+		T_EXPECT_EQ(lsb_first(mask(i)), i > 0 ? 
0 : -1, "lsb_first"); + + T_EXPECT_EQ(bit_next(0ULL, i), -1, "bit_next"); + T_EXPECT_EQ(lsb_next(0ULL, i), -1, "lsb_next"); + + T_EXPECT_EQ(bit_next(~(0ULL), i), i - 1, "bit_next"); + T_EXPECT_EQ(lsb_next(~(0ULL), i), i < 63 ? i + 1 : -1, "lsb_next"); + } +} diff --git a/tests/unit/debugger_xcall_test.c b/tests/unit/debugger_xcall_test.c new file mode 100644 index 000000000..2289a9831 --- /dev/null +++ b/tests/unit/debugger_xcall_test.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#define UT_MODULE osfmk + +#include +#include +#include +#include +#include "mocks/mock_cpu.h" + + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.debugger_xcall_test"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RUN_CONCURRENTLY(false) + ); + + +T_DECL(debugger_xcall_enter_cpu_signal_fail, "DebuggerXCallEnter where cpu_signal fails.") +{ + T_MOCK_SET_RETVAL(cpu_signal, kern_return_t, KERN_FAILURE); + + // Init with 2 CPUs. We are cpu-0 + T_MOCK_SET_RETVAL(ml_get_max_cpu_number, int, 1); + cpu_data_t cpu_data_cpu1 = {0}; + CpuDataEntries[0].cpu_data_vaddr = getCpuDatap(); + CpuDataEntries[1].cpu_data_vaddr = &cpu_data_cpu1; + + kern_return_t result = DebuggerXCallEnter(false, false); + T_EXPECT_MACH_SUCCESS(result, "Expecting DebuggerXCallEnter() success."); +} diff --git a/tests/unit/ecc_test_remove_duplicates.c b/tests/unit/ecc_test_remove_duplicates.c new file mode 100644 index 000000000..003cc1add --- /dev/null +++ b/tests/unit/ecc_test_remove_duplicates.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +extern uint32_t remove_bad_ram_duplicates(uint32_t bad_pages_count, ppnum_t *bad_pages); + +#define UT_MODULE osfmk +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.ecc_test_remove_duplicates"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("tgal2"), + T_META_RUN_CONCURRENTLY(false) + ); + +T_DECL(ecc_rm_dups_zero_elements, "make sure 0 is returned for empty input") +{ + uint32_t bp_count = 0; + ppnum_t bp[] = {}; + bp_count = remove_bad_ram_duplicates(bp_count, bp); + if (bp_count != 0) { + T_FAIL("Expected: bp_count == 0 but got: bp_count == %u", bp_count); + } + T_PASS("bp_count == 0 as expected"); +} + +T_DECL(ecc_rm_dups_0, "make sure 1 is returned for [0]") +{ + uint32_t bp_count = 1; + ppnum_t bp[] = { 0 }; + bp_count = remove_bad_ram_duplicates(bp_count, bp); + if (bp_count != 1 || bp[0] != 0) { + T_FAIL("Expected: bp_count == 1 and bp[0] == 0 but got: bp_count == %u, bp[0] == %u", bp_count, bp[0]); + } else { + T_PASS("bp_count == 1 and bp[0] == 0 as expected"); + } +} + +T_DECL(ecc_rm_dups_00, "make sure 1 is returned for [0, 0]") +{ + uint32_t bp_count = 2; + ppnum_t bp[] = { 0, 0 }; + bp_count = remove_bad_ram_duplicates(bp_count, bp); + if (bp_count != 1 || bp[0] != 0) { + T_FAIL("Expected: bp_count == 1 and bp[0] == 0 but got: bp_count == %u, bp[0] == %u", bp_count, bp[0]); + } else { + T_PASS("bp_count == 1 and bp[0] == 0 as expected"); + } +} + +T_DECL(ecc_rm_dups_001, "make sure 2 is returned for [0, 0, 1]") +{ + uint32_t bp_count = 3; + ppnum_t bp[] = { 0, 0, 1 }; + bp_count = remove_bad_ram_duplicates(bp_count, bp); + if (bp_count != 2 || bp[0] != 0 || bp[1] != 1) { + T_FAIL("Expected: bp_count == 2 and bp == [0, 1] but got: bp_count == %u, bp == [%u, %u]", bp_count, bp[0], bp[1]); + } else { + T_PASS("bp_count == 2 and bp == [0, 1] as expected"); + } +} + +T_DECL(ecc_rm_dups_101, "make sure 2 is returned for [1, 0, 1]") +{ + uint32_t bp_count = 3; + ppnum_t bp[] = { 1, 0, 1 }; + bp_count = remove_bad_ram_duplicates(bp_count, bp); + if (bp_count != 2 || bp[0] != 0 || bp[1] != 1) { + T_FAIL("Expected: bp_count == 2 and bp == [0, 1] but got: bp_count == %u, bp == [%u, %u]", bp_count, bp[0], bp[1]); + } else { + T_PASS("bp_count == 2 and bp == [0, 1] as expected"); + } +} + +T_DECL(ecc_rm_dups_201, "make sure 3 is returned for [2, 0, 1]") +{ + uint32_t bp_count = 3; + ppnum_t bp[] = { 2, 0, 1 }; + bp_count = remove_bad_ram_duplicates(bp_count, bp); + if (bp_count != 3 || bp[0] != 0 || bp[1] != 1 || bp[2] != 2) { + T_FAIL("Expected: bp_count == 3 and bp == [0, 1, 2] but got: bp_count == %u, bp == [%u, %u, %u]", bp_count, bp[0], bp[1], bp[2]); + } else { + T_PASS("bp_count == 3 and bp == [0, 1, 2] as expected"); + } +} + +T_DECL(ecc_rm_dups_2012, "make sure 3 is returned for [2, 0, 1, 2]") +{ + uint32_t bp_count = 4; + ppnum_t bp[] = { 2, 0, 1, 2 }; + bp_count = remove_bad_ram_duplicates(bp_count, bp); + if 
(bp_count != 3 || bp[0] != 0 || bp[1] != 1 || bp[2] != 2) { + T_FAIL("Expected: bp_count == 3 and bp == [0, 1, 2] but got: bp_count == %u, bp == [%u, %u, %u]", bp_count, bp[0], bp[1], bp[2]); + } else { + T_PASS("bp_count == 3 and bp == [0, 1, 2] as expected"); + } +} + +T_DECL(ecc_rm_dups_large, "make sure large input is handled correctly") +{ + uint32_t bp_count = 1000; + ppnum_t bp[1000]; + for (uint32_t i = 0; i < bp_count; i++) { + bp[i] = (i % 10); // Repeated numbers [0-9] + } + bp_count = remove_bad_ram_duplicates(bp_count, bp); + bool valid = (bp_count == 10); + for (uint32_t i = 0; i < bp_count && valid; i++) { + if (bp[i] != i) { + valid = false; + } + } + if (!valid) { + T_FAIL("Expected: bp_count == 10 and bp == [0-9] but got: bp_count == %u", bp_count); + } else { + T_PASS("bp_count == 10 and bp == [0-9] as expected"); + } +} diff --git a/tests/unit/example_dir/example_test_in_dir.c b/tests/unit/example_dir/example_test_in_dir.c new file mode 100644 index 000000000..e93ca1ce7 --- /dev/null +++ b/tests/unit/example_dir/example_test_in_dir.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include "mocks/unit_test_utils.h" +#include + +#define UT_MODULE osfmk +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.example_test_in_dir"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("s_shalom"), + T_META_RUN_CONCURRENTLY(false) + ); + + +// If this test fails to build or run to success it means that something broke +// with the unit-test harness. + +T_DECL(xnu_example_test_in_dir, "an example test in a directory") +{ + T_EXPECT_MACH_SUCCESS(0, "hello"); +} diff --git a/tests/unit/example_test_bsd.c b/tests/unit/example_test_bsd.c new file mode 100644 index 000000000..3dbf00809 --- /dev/null +++ b/tests/unit/example_test_bsd.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include "mocks/unit_test_utils.h" +#include + +#define UT_MODULE bsd +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.example_test_bsd"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("s_shalom"), + T_META_RUN_CONCURRENTLY(false) + ); + +// If this test fails to build or run to success it means that something broke +// with the unit-test harness. + +T_DECL(xnu_example_test_bsd, "a BSD test") +{ + T_ASSERT_PANIC({ + proc_iterate(0, NULL, NULL, NULL, NULL); + }, "proc_iterate"); + + T_PASS("hello"); +} diff --git a/tests/unit/example_test_iokit.cpp b/tests/unit/example_test_iokit.cpp new file mode 100644 index 000000000..e4c48c190 --- /dev/null +++ b/tests/unit/example_test_iokit.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include "IOKit/IOUserClient.h" + +#define UT_MODULE iokit +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.example_test_iokit"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("s_shalom"), + T_META_RUN_CONCURRENTLY(false) + ); + +T_DECL(xnu_example_test_iokit, "an IOKit example") { + IOUserClient::clientHasPrivilege(NULL, "foo"); + T_PASS("ok"); +} diff --git a/tests/unit/example_test_osfmk.c b/tests/unit/example_test_osfmk.c new file mode 100644 index 000000000..210a64148 --- /dev/null +++ b/tests/unit/example_test_osfmk.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include "mocks/unit_test_utils.h" +#include + +#define UT_MODULE osfmk +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.example_test_osfmk"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("s_shalom"), + T_META_RUN_CONCURRENTLY(false) + ); + + +// If this test fails to build or run to success it means that something broke +// with the unit-test harness. + +T_DECL(xnu_example_test_osfmk, "an OSFMK test") +{ + T_ASSERT_PANIC({ + vm_object_copy_slowly(NULL, 0, 0, false, NULL); + }, "copy_slowly"); + + T_EXPECT_MACH_SUCCESS(0, "hello"); +} diff --git a/tests/unit/fibers_test.c b/tests/unit/fibers_test.c new file mode 100644 index 000000000..c58f58e20 --- /dev/null +++ b/tests/unit/fibers_test.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include "mocks/std_safe.h" +#include "mocks/mock_thread.h" + +#include "mocks/fibers/fibers.h" +#include "mocks/fibers/mutex.h" +#include "mocks/fibers/condition.h" +#include "mocks/fibers/random.h" + +// Use FIBERS_PREEMPTION=1 to have simulated preemption at memory operations. +// make -C tests/unit SDKROOT=macosx.internal fibers_test FIBERS_PREEMPTION=1 + +#define UT_MODULE osfmk +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.fibers"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("a_fioraldi"), + T_META_RUN_CONCURRENTLY(false) + ); +// use fibers for scheduling +UT_USE_FIBERS(1); +// use the data race checker +// UT_FIBERS_USE_CHECKER(1); + +static int third_fiber_id = -1; +static void* +coop_fibers_func(void* x) +{ + int *cooperative_counter = (int*)x; + + if (*cooperative_counter == 0) { + // main thread can jump here just after fibers_create + fibers_yield_to(0); // switch back to main thread and finish the fibers creation + } + + T_QUIET; T_ASSERT_EQ(*cooperative_counter, fibers_current->id, "invalid cooperative_counter"); + *cooperative_counter = fibers_current->id + 1; + + // switch to next fiber or to main fiber (id=0) if the current is the last + if (fibers_current->id == third_fiber_id) { + fibers_yield_to(0); + } else { + fibers_yield_to(fibers_current->id + 1); + } + + return NULL; +} + +T_DECL(coop_fibers, "cooperative scheduling using fibers") +{ + // disable preemption in case FIBERS_PREEMPTION=1 was using to compile + // context switches will still happen before and after locks / interrupt enable/disable / fibers creation + fibers_may_yield_probability = 0; + + random_set_seed(1234); + + int cooperative_counter = 0; + + fiber_t first = fibers_create(FIBERS_DEFAULT_STACK_SIZE, coop_fibers_func, (void*)&cooperative_counter); + fiber_t second = fibers_create(FIBERS_DEFAULT_STACK_SIZE, coop_fibers_func, (void*)&cooperative_counter); + fiber_t third = fibers_create(FIBERS_DEFAULT_STACK_SIZE, coop_fibers_func, (void*)&cooperative_counter); + + third_fiber_id = third->id; + + // Start the chain of ctxswitches from the main thread and switch to first + cooperative_counter = first->id; + fibers_yield_to(first->id); + + T_LOG("Done cooperative_counter=%d", cooperative_counter); + T_ASSERT_EQ(cooperative_counter, third->id + 1, "invalid cooperative schedule"); + + // always join the fibers + fibers_join(first); + fibers_join(second); + fibers_join(third); + + T_PASS("coop_fibers"); +} + +static int global_var; +static void* +tiny_race_func(void* x) +{ + global_var = 42; + return x; +} + +// Standard ThreadSanitizer example in the llvm doc to showcase a race +// TSan will not fail the test by default, you beed to set halt_on_error=1 in TSAN_OPTIONS +// the test will just run fine without TSan, the data race between fibers can be detected with the fibers data race checker too +T_DECL(tsan_tiny_race, "tsan_tiny_race") +{ + // This sometimes triggers a ThreadSanitizer data race depending on the OS scheduler + pthread_t thread; + 
pthread_create(&thread, NULL, tiny_race_func, NULL); + global_var = 43; + pthread_join(thread, NULL); + + T_LOG("Done pthread global_var=%d", global_var); + + // This always triggers a ThreadSanitizer data race thanks to the fixed seed + fibers_log_level = FIBERS_LOG_INFO; + fibers_may_yield_probability = 1; + random_set_seed(1234); + + fiber_t fiber = fibers_create(FIBERS_DEFAULT_STACK_SIZE, tiny_race_func, NULL); + global_var = 43; + fibers_join(fiber); + + T_LOG("Done fibers global_var=%d", global_var); + T_PASS("tsan_tiny_race"); +} + +#define NUM_INCREMENTS 100000 +#define NUM_THREADS 10 + +struct inc_state { + volatile int64_t counter; + //_Atomic int64_t counter; + lck_mtx_t mtx; + lck_grp_t grp; +}; + +void* +increment_counter(void* arg) +{ + struct inc_state *s = (struct inc_state *)arg; + for (int i = 0; i < NUM_INCREMENTS; i++) { + // Remove locks to fail the test and trigger a ThreadSanitizer data race + lck_mtx_lock(&s->mtx); + //lck_mtx_lock_spin(&s->mtx); + s->counter++; + //os_atomic_inc(&s->counter, relaxed); + lck_mtx_unlock(&s->mtx); + } + return NULL; +} + +T_DECL(mutex_mock_increment_int, "mutex mock test") +{ + // fibers_log_level = 1; + // fibers_may_yield_probability = 0; + random_set_seed(1234); + + fiber_t mythreads[NUM_THREADS] = {}; + struct inc_state s = {.counter = 0}; + lck_grp_init(&s.grp, "test_mutex", LCK_GRP_ATTR_NULL); + lck_mtx_init(&s.mtx, &s.grp, LCK_ATTR_NULL); + + // Create fibers + for (int i = 0; i < NUM_THREADS; i++) { + mythreads[i] = fibers_create(FIBERS_DEFAULT_STACK_SIZE, increment_counter, (void*)&s); + } + + // Wait for all fibers to finish + for (int i = 0; i < NUM_THREADS; i++) { + fibers_join(mythreads[i]); + } + lck_mtx_destroy(&s.mtx, &s.grp); + + T_LOG("Done counter=%lld", os_atomic_load(&s.counter, relaxed)); + T_ASSERT_EQ(s.counter, (int64_t)(NUM_INCREMENTS * NUM_THREADS), "race detected on counter"); + + T_PASS("mutex_mock_increment_int"); +} diff --git a/tests/unit/mach_vm_range_contains.c b/tests/unit/mach_vm_range_contains.c new file mode 100644 index 000000000..a5417ad0d --- /dev/null +++ b/tests/unit/mach_vm_range_contains.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
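The comments inside increment_counter() above name the two safe variants (the lck_mtx_lock/unlock pair and the commented-out os_atomic_inc) and note that dropping the lock is how the test is made to fail. As a hedged illustration of that remark, relying on the struct inc_state and NUM_INCREMENTS definitions above, the unlocked variant would look like this (the _racy name is hypothetical, not part of the imported sources):

/* Illustrative only: the unlocked variant the comment above refers to. */
void *
increment_counter_racy(void *arg)
{
    struct inc_state *s = (struct inc_state *)arg;
    for (int i = 0; i < NUM_INCREMENTS; i++) {
        s->counter++;   /* unsynchronized read-modify-write on a shared counter */
    }
    return NULL;
}

With fibers preemption (or TSan) active, interleaved read-modify-writes lose updates, the final counter comes out below NUM_INCREMENTS * NUM_THREADS, and the T_ASSERT_EQ above reports the race.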
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#define UT_MODULE osfmk +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.mach_vm_range_contains"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("tgal2"), + T_META_RUN_CONCURRENTLY(false) + ); + +T_DECL(prevent_overflow_with_large_address, "make sure false is returned for addr causing overflow (if !DEBUG && !DEVELOPMENT, it will panic)") +{ + const struct mach_vm_range r = {0x1000, 0x2000}; + mach_vm_offset_t addr = 0xFFFFFFFFFFFFFF00; + mach_vm_offset_t size = 0x1100; + + T_ASSERT_FALSE(mach_vm_range_contains(&r, addr, size), + "got true for overflow (exploit exploit)"); + + T_PASS("false returned for address overflow as expected"); +} + +T_DECL(prevent_overflow_with_large_size, "make sure false is returned for size causing overflow (if !DEBUG && !DEVELOPMENT, it will panic)") +{ + const struct mach_vm_range r = {0x1000, 0x3000}; + mach_vm_offset_t addr = 0x2000; + mach_vm_offset_t size = 0xFFFFFFFFFFFFFFF0; + + T_ASSERT_FALSE(mach_vm_range_contains(&r, addr, size), + "got true for overflow (exploit exploit)"); + + T_PASS("false returned for size overflow as expected"); +} + +T_DECL(allow_valid_range, "make sure true is returned for a valid range") +{ + const struct mach_vm_range r = {0x1000, 0x3000}; + mach_vm_offset_t addr = 0x1500; + mach_vm_offset_t size = 0x500; + + T_ASSERT_TRUE(mach_vm_range_contains(&r, addr, size), + "got false for valid range"); + + T_PASS("true returned for valid range as expected"); +} + +T_DECL(dont_allow_out_of_bounds_start, "make sure false is returned for address out of bounds") +{ + const struct mach_vm_range r = {0x1000, 0x3000}; + mach_vm_offset_t addr = 0x500; + mach_vm_offset_t size = 0x500; + + T_ASSERT_FALSE(mach_vm_range_contains(&r, addr, size), + "got true for out-of-bounds start address"); + + T_PASS("false returned for out-of-bounds start address as expected"); +} + +T_DECL(dont_allow_out_of_bounds_end, "make sure false is returned for size extending out of bounds") +{ + const struct mach_vm_range r = {0x1000, 0x3000}; + mach_vm_offset_t addr = 0x2000; + mach_vm_offset_t size = 0x2000; + + T_ASSERT_FALSE(mach_vm_range_contains(&r, addr, size), + "got true for out-of-bounds end address"); + + T_PASS("false returned for out-of-bounds end address as expected"); +} + +T_DECL(allow_exact_range_match_start, "make sure true is returned for exact range match - start of range") +{ + const struct mach_vm_range r = {0x1000, 0x3000}; + mach_vm_offset_t addr = 0x1000; + mach_vm_offset_t size = 0x0; + + T_ASSERT_TRUE(mach_vm_range_contains(&r, addr, size), + "got false for exact range match"); + + T_PASS("true returned for exact range match as expected"); +} + +T_DECL(allow_exact_range_match_end, "make sure true is returned for exact range match - end of range") +{ + const struct mach_vm_range r = {0x1000, 0x3000}; + mach_vm_offset_t addr = 0x1000; + mach_vm_offset_t size = 0x2000; + + T_ASSERT_TRUE(mach_vm_range_contains(&r, addr, size), + "got false for exact range match"); + + T_PASS("true returned for exact range match as expected"); +} + +T_DECL(prevent_invalid_size_zero, "make sure false is returned for size == 0") +{ + const struct mach_vm_range r = {0x1000, 0x3000}; + mach_vm_offset_t addr = 0x1500; + mach_vm_offset_t size = 0x0; + + T_ASSERT_TRUE(mach_vm_range_contains(&r, addr, size), + "got false for size == 0 with addr in range"); + + T_PASS("true returned for size == 0 with addr in range as expected"); +} diff --git a/tests/unit/mocks/dt_proxy.c 
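Taken together, these cases pin down the containment semantics being tested: the range includes its start, an access may end exactly at the range's end, a zero-size access only needs its address inside the range, and any addr + size wrap-around is rejected. A minimal sketch of a predicate with that behaviour, written against plain integer types (an editor's illustration consistent with the asserts above, not the XNU implementation):

#include <stdbool.h>
#include <stdint.h>

/* uint64_t stands in for mach_vm_offset_t; names are illustrative. */
static bool
range_contains_sketch(uint64_t range_min, uint64_t range_max,
    uint64_t addr, uint64_t size)
{
    uint64_t end = addr + size;
    if (end < addr) {
        return false;   /* addr + size wrapped around: reject */
    }
    return addr >= range_min && end <= range_max;
}

Per the test descriptions, kernels built without DEBUG or DEVELOPMENT panic on the overflow cases instead of returning false; the sketch only models the false-returning behaviour these unit tests assert.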
b/tests/unit/mocks/dt_proxy.c new file mode 100644 index 000000000..dde8411b0 --- /dev/null +++ b/tests/unit/mocks/dt_proxy.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "std_safe.h" +#include "dt_proxy.h" +#include + +static void +pt_assert_true(bool cond, const char *msg) +{ + T_ASSERT_TRUE(cond, "%s", msg); +} +static void +pt_assert_notnull(void *ptr, const char *msg) +{ + T_ASSERT_NOTNULL(ptr, "%s", msg); +} +static void +pt_assert_posix_zero(int v, const char *msg) +{ + T_ASSERT_POSIX_ZERO(v, "%s", msg); +} +static void +pt_log(const char *msg) +{ + T_LOG("%s", msg); +} +static void +pt_log_fmtstr(const char* fmt, const char *msg) +{ + T_LOG(fmt, msg); +} +static void +pt_fail(const char *msg) +{ + T_FAIL("%s", msg); +} +static void +pt_quiet(void) +{ + T_QUIET; +} + +static struct dt_proxy_callbacks dt_callbacks = { + .t_assert_true = &pt_assert_true, + .t_assert_notnull = &pt_assert_notnull, + .t_assert_posix_zero = &pt_assert_posix_zero, + .t_log = &pt_log, + .t_log_fmtstr = &pt_log_fmtstr, + .t_fail = &pt_fail, + .t_quiet = &pt_quiet +}; + +// This code is linked into every test executable to allow the XNU and mocks .dylibs access to some +// darwintest functionality. libdarwintest.a is only linked to the executable so code in the XNU and +// mocks .dylibs can't call into it directly +// due to how dyld works, this constructor is going to be called after the fake_kinit() constructor +// so during fake_kinit() dt_proxy is going to stay NULL and any output to darwintest asserts is lost. +__attribute__((constructor)) void +dt_init(void) +{ + set_dt_proxy_attached(&dt_callbacks); + set_dt_proxy_mock(&dt_callbacks); +} diff --git a/tests/unit/mocks/dt_proxy.h b/tests/unit/mocks/dt_proxy.h new file mode 100644 index 000000000..f11c165ae --- /dev/null +++ b/tests/unit/mocks/dt_proxy.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once +#include +#include "unit_test_utils.h" + +// -- darwin-test proxy macros and support -- +// libdarwintest.a is linked only to the main executable of the test and not to the XNU .dylib or +// the mocks .dylib, otherwise there would create 3 instances of darwintest in the same process, each with different +// state. +// On some occasions some code in these .dylibs need to call darwin-test macros so that the output +// is visible to the user. An example for this is when XNU code calls panic or assert. +// To achieve this, the file dt_proxy.c is built into every test executable and it contains a constructor function +// which registers a set of proxy functions that call the darwin-test macros. + +struct dt_proxy_callbacks { + void (*t_assert_true)(bool cond, const char *msg); + void (*t_assert_notnull)(void *ptr, const char *msg); + void (*t_assert_posix_zero)(int v, const char *msg); + void (*t_log)(const char *msg); + void (*t_log_fmtstr)(const char *fmt, const char *msg); + void (*t_fail)(const char *msg); + void (*t_quiet)(void); +}; + +// register the proxies to the XNU .dylib +extern void set_dt_proxy_attached(struct dt_proxy_callbacks *p); +// register the proxies to the mocks .dylib +extern void set_dt_proxy_mock(struct dt_proxy_callbacks *p); + +extern struct dt_proxy_callbacks *get_dt_proxy_attached(void); +extern struct dt_proxy_callbacks *get_dt_proxy_mock(void); + +// A pointer of this name appears in the XNU .dylib and the mocks .dylib +extern struct dt_proxy_callbacks *dt_proxy; + +#define PT_ASSERT_TRUE(cond, msg) do { if (dt_proxy) { dt_proxy->t_assert_true((cond), #cond msg); } } while(false) +#define PT_ASSERT_TRUE_S(cond, msg) do { if (dt_proxy) { dt_proxy->t_assert_true((cond), msg); } } while(false) +#define PT_ASSERT_NOTNULL(ptr, msg) do { if (dt_proxy) { dt_proxy->t_assert_notnull((ptr), msg); } } while(false) +#define PT_ASSERT_POSIX_ZERO(v, msg) do { if (dt_proxy) { dt_proxy->t_assert_posix_zero((v), msg); } } while(false) +#define PT_LOG(msg) do { if (dt_proxy) { dt_proxy->t_log(msg); } } while(false) +#define PT_LOG_FMTSTR(fmt, str) do { if (dt_proxy) { dt_proxy->t_log_fmtstr(fmt, str); } } while(false) +#define PT_LOG_OR_RAW_FMTSTR(fmt, str) do { \ + if (dt_proxy) { \ + dt_proxy->t_log_fmtstr(fmt, str); \ + } else { \ + raw_printf(fmt "\n", str); \ + } \ + } while(false) +#define PT_FAIL(msg) do { if (dt_proxy) { dt_proxy->t_fail(msg); } } while(false) +#define PT_QUIET do { if (dt_proxy) { dt_proxy->t_quiet(); } } while(false) diff --git a/tests/unit/mocks/fake_kinit.c 
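As a hedged illustration of how code inside the XNU or mocks .dylib is meant to report through this proxy (the helper below is hypothetical, not part of the imported sources): every PT_* macro checks dt_proxy first, so calls made before dt_init() has registered the callbacks, for example during fake_kinit(), quietly do nothing, while PT_LOG_OR_RAW_FMTSTR falls back to raw_printf() in that window.

/* Hypothetical mock-side helper, for illustration only. */
static void
mock_report_example(int err, void *obj)
{
    PT_LOG("mock_report_example called");            /* no-op until dt_init() runs */
    PT_ASSERT_NOTNULL(obj, "object must not be NULL");
    PT_ASSERT_POSIX_ZERO(err, "operation must succeed");
    PT_LOG_OR_RAW_FMTSTR("state: %s", "ready");      /* raw_printf() before dt_init(), T_LOG after */
}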
b/tests/unit/mocks/fake_kinit.c new file mode 100644 index 000000000..6fc153dfe --- /dev/null +++ b/tests/unit/mocks/fake_kinit.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include // for BootArgs +#include // for PE_state +#include // for bootargs +#include // for kernel_startup_initialize_upto +#include // for IOLock +#include // for vm_set_page_size +#include // for clock_config +#include + +#include "std_safe.h" + +// This define is supposed to come from the .CFLAGS parsing. if it's not, something is wrong with the Makefile +#ifndef __BUILDING_XNU_LIB_UNITTEST__ +#error "not building unittest, something is wrong" +#endif + + +extern void kernel_startup_bootstrap(void); +extern void scale_setup(void); +extern void vm_mem_bootstrap(void); +extern void waitq_bootstrap(void); +// can't include IOKit/IOKitDebug.h since it's a C++ file +extern void IOTrackingInit(void); +extern void mock_mem_init_vm_objects(void); + +extern lck_grp_t * IOLockGroup; +extern IOLock * sKextLoggingLock; +extern bitmap_t * asid_bitmap; +extern zone_t pmap_zone; + +void +fake_pmap_init(void) +{ + pmap_zone = zone_create_ext("pmap", sizeof(struct pmap), + ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL); + + static uint64_t asid_bits = 0; + asid_bitmap = &asid_bits; +} + + +void +fake_init_bootargs(void) +{ + // see PE_boot_args() + static boot_args ba; + PE_state.bootArgs = &ba; + PE_state.initialized = TRUE; + BootArgs = &ba; // arm_init() +} + +void +fake_kernel_bootstrap(void) +{ + mem_size = 0x0000000080000000ULL; // 2 GB + max_mem = mem_size; + scale_setup(); + + vm_set_page_size(); // called from arm_init() -> arm_vm_init() + vm_mem_bootstrap(); + fake_pmap_init(); + clock_config(); +} + + +void +fake_iokit_init(void) +{ + // these are needed for static initializations in iokit to not crash + IOLockGroup = lck_grp_alloc_init("IOKit", LCK_GRP_ATTR_NULL); +#if IOTRACKING + IOTrackingInit(); +#endif + sKextLoggingLock = IOLockAlloc(); +} + +// This is the first function that is called before any initialization in libkernel. 
+// It's made to be first by the order of object files in the linker command line in Makefile +__attribute__((constructor)) void +fake_kinit(void) +{ + fake_init_bootargs(); + kernel_startup_bootstrap(); + fake_kernel_bootstrap(); + fake_iokit_init(); + + kernel_startup_initialize_only(STARTUP_SUB_MACH_IPC); + kernel_startup_initialize_only(STARTUP_SUB_SYSCTL); + + mock_mem_init_vm_objects(); +} diff --git a/tests/unit/mocks/fibers/checker.c b/tests/unit/mocks/fibers/checker.c new file mode 100644 index 000000000..89ddd96a2 --- /dev/null +++ b/tests/unit/mocks/fibers/checker.c @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "checker.h" +#include + +#define WATCHPOINT_MAP_INITIAL_CAPACITY 4096 /* must be power of 2 */ +#define WATCHPOINT_MAP_MAX_LOAD_FACTOR 0.75 + +static inline size_t +hash_address(uintptr_t addr, size_t capacity) +{ + size_t hash = (size_t)addr; + hash = (hash ^ (hash >> 16)) * 31; + hash = (hash ^ (hash >> 16)) * 31; + hash = hash ^ (hash >> 16); + return hash % capacity; +} + +struct watchpoint_entry { + union { + void *pc; /* program point of the memory operation instruction */ + struct backtrace_array *backtrace; /* backtrace collected at program point of the memory operation instruction */ + }; + uintptr_t address; /* address of the memory operation happening on the fiber with id=fiber_id */ + int fiber_id; /* id of the fiber in which the memory operation is happening */ + uint8_t size; /* size of the memory operation (up to 16 bytes) */ + uint8_t access_type; /* enum access_type */ + uint8_t has_backtrace; /* if true, use the backtrace field of the union */ +}; + +static void +watchpoint_entry_init(struct watchpoint_entry* entry, uintptr_t address, enum access_type type, size_t size, fiber_t fiber) +{ + FIBERS_ASSERT(entry != NULL, "watchpoint_entry_init: null entry"); + FIBERS_ASSERT(size <= 16, "watchpoint_entry_init: invalid size"); // no access is bigger than sizeof(int128) + + entry->address = address; + entry->fiber_id = fiber->id; + entry->access_type = (uint8_t)type; + entry->size = (uint8_t)size; + // default to pc=0 + entry->pc = 0; + entry->has_backtrace = 0; +} + +struct watchpoint_node { + struct watchpoint_entry entry; + struct watchpoint_node *next; +}; + +// A simple hashmap to store watchpoint_entry indexed by watchpoint_entry.address +struct watchpoint_map { + struct watchpoint_node **buckets; + size_t count; + size_t capacity; +}; + +static bool +watchpoint_map_is_initialized(struct watchpoint_map* map) +{ + return map->capacity != 0 && map->buckets != NULL; +} + +void +watchpoint_map_init(struct watchpoint_map* map) +{ + map->count = 0; + map->capacity = WATCHPOINT_MAP_INITIAL_CAPACITY; + map->buckets = calloc(map->capacity, sizeof(struct watchpoint_node*)); +} + +// Currently the watchpoint map has a global scope, so this function is unnecessary +// We keep it here for future usage +/* + * void + * watchpoint_map_destroy(struct watchpoint_map* map) + * { + * if (map->buckets) { + * for (size_t i = 0; i < map->capacity; ++i) { + * struct watchpoint_node* current = map->buckets[i]; + * while (current != NULL) { + * struct watchpoint_node* to_free = current; + * current = current->next; + * free(to_free); + * } + * } + * free(map->buckets); + * } + * map->buckets = NULL; + * map->count = 0; + * map->capacity = 0; + * } + */ + +static bool +watchpoint_map_resize(struct watchpoint_map* map, size_t new_capacity) +{ + if (new_capacity < map->count) { + return false; + } + struct watchpoint_node** new_buckets = calloc(new_capacity, sizeof(struct watchpoint_node*)); + if (new_buckets == NULL) { + return false; + } + + /* rehash all existing entries into the new buckets */ + for (size_t i = 0; i < map->capacity; ++i) { + struct watchpoint_node* current = map->buckets[i]; + while (current != NULL) { + struct watchpoint_node* node_to_move = current; + current = current->next; + + size_t new_index = hash_address(node_to_move->entry.address, new_capacity); + node_to_move->next = new_buckets[new_index]; + new_buckets[new_index] = node_to_move; + } + } + + free(map->buckets); + map->buckets = new_buckets; + 
map->capacity = new_capacity; + return true; +} + +void +watchpoint_map_add(struct watchpoint_map* map, struct watchpoint_entry entry) +{ + if (((double)map->count / map->capacity) >= WATCHPOINT_MAP_MAX_LOAD_FACTOR) { + watchpoint_map_resize(map, map->capacity * 2); + } + + struct watchpoint_node* new_node = malloc(sizeof(struct watchpoint_node)); + new_node->entry = entry; + new_node->next = NULL; + + size_t index = hash_address(entry.address, map->capacity); + new_node->next = map->buckets[index]; + map->buckets[index] = new_node; + map->count++; +} + +bool +watchpoint_map_find_remove(struct watchpoint_map* map, uintptr_t address, fiber_t fiber, struct watchpoint_entry* removed_entry) +{ + size_t index = hash_address(address, map->capacity); + + struct watchpoint_node* current = map->buckets[index]; + struct watchpoint_node* prev = NULL; + + while (current != NULL) { + if (current->entry.address == address && current->entry.fiber_id == fiber->id) { + if (removed_entry) { + memcpy(removed_entry, ¤t->entry, sizeof(struct watchpoint_entry)); + } + + if (prev == NULL) { + map->buckets[index] = current->next; + } else { + prev->next = current->next; + } + free(current); + map->count--; + return true; + } + prev = current; + current = current->next; + } + + return false; +} + +static void +report_race(uintptr_t current_addr, size_t current_size, enum access_type current_type, struct watchpoint_entry* conflicting_entry) +{ + raw_printf("==== Warning: Fibers data race checker violation ====\n"); + raw_printf("%s of size %d at %p by fiber %d\n", current_type == ACCESS_TYPE_STORE ? "Write" : "Read", current_size, (void*)current_addr, fibers_current->id); + if (fibers_debug) { + print_current_backtrace(); + } + + raw_printf("Previous %s of size %d at %p by fiber %d\n", conflicting_entry->access_type == ACCESS_TYPE_STORE ? "write" : "read", conflicting_entry->size, (void*)conflicting_entry->address, conflicting_entry->fiber_id); + if (conflicting_entry->has_backtrace) { + print_collected_backtrace(conflicting_entry->backtrace); + } else { + struct backtrace_array bt = { .buffer = {(void*)conflicting_entry->pc}, .nptrs = 1 }; + print_collected_backtrace(&bt); + } + + if (fibers_abort_on_error) { + abort(); + } +} + +static inline void +report_missing_race(uintptr_t current_addr, size_t current_size, enum access_type current_type) +{ + raw_printf("==== Warning: Fibers data race checker violation ====\n"); + raw_printf("%s of size %d at %p by fiber %d\n", current_type == ACCESS_TYPE_STORE ? "Write" : "Read", current_size, (void*)current_addr, fibers_current->id); + if (fibers_debug) { + print_current_backtrace(); + } + + raw_printf("Watchpoint was unexpectedly missing or modified by another fiber during yield.\n"); + if (fibers_abort_on_error) { + abort(); + } +} + +void +report_value_race(uintptr_t current_addr, size_t current_size, enum access_type current_type) +{ + raw_printf("==== Warning: Fibers data race checker violation ====\n"); + raw_printf("%s of size %d at %p by fiber %d\n", current_type == ACCESS_TYPE_STORE ? 
"Write" : "Read", current_size, (void*)current_addr, fibers_current->id); + if (fibers_debug) { + print_current_backtrace(); + } + + raw_printf("Value was modified in between the operation by another fiber during yield.\n"); + if (fibers_abort_on_error) { + abort(); + } +} + +static inline bool +ranges_overlap(uintptr_t addr1, size_t size1, uintptr_t addr2, size_t size2) +{ + if (size1 == 0 || size2 == 0) { + return false; + } + uintptr_t end1 = addr1 + size1; + uintptr_t end2 = addr2 + size2; + if (end1 < addr1 || end2 < addr2) { + return false; + } + return addr1 < end2 && addr2 < end1; +} + +/* + * Check for conflicting memory accesses to the same region happening in another fiber. + * Concurrent loads are allowed, loads in-between a store are not. + */ +static inline bool +check_for_conflicts(struct watchpoint_map* map, uintptr_t current_addr, size_t current_size, enum access_type current_type) +{ + /* range: [current_addr - 16, current_addr + 16] (33 addresses) */ + uintptr_t start_check_addr = (current_addr > 16) ? (current_addr - 16) : 0; + uintptr_t end_check_addr = current_addr + 16; + + for (uintptr_t check_addr = start_check_addr;; ++check_addr) { + size_t index = hash_address(check_addr, map->capacity); + struct watchpoint_node* node = map->buckets[index]; + + while (node != NULL) { + struct watchpoint_entry* existing_entry = &node->entry; + + if (ranges_overlap(current_addr, current_size, existing_entry->address, existing_entry->size)) { + if (current_type == ACCESS_TYPE_STORE) { + /* any access in between a store is a race */ + report_race(current_addr, current_size, current_type, existing_entry); + return true; + } else if (existing_entry->access_type == ACCESS_TYPE_STORE) { + /* allow other loads in between a load, but not a store */ + report_race(current_addr, current_size, current_type, existing_entry); + return true; + } + } + node = node->next; + } + if (check_addr == end_check_addr) { + break; + } + } + + return false; +} + +static struct watchpoint_map checker_watchpoints; + +bool +check_and_set_watchpoint(void *pc, uintptr_t address, size_t size, enum access_type access_type) +{ + if (!watchpoint_map_is_initialized(&checker_watchpoints)) { + watchpoint_map_init(&checker_watchpoints); + } + + if (check_for_conflicts(&checker_watchpoints, address, size, access_type)) { + return false; + } else { + struct watchpoint_entry new_watchpoint; + watchpoint_entry_init(&new_watchpoint, address, access_type, size, fibers_current); + if (fibers_debug) { + new_watchpoint.backtrace = collect_current_backtrace(); + new_watchpoint.has_backtrace = 1; + } else { + new_watchpoint.pc = pc; + } + + watchpoint_map_add(&checker_watchpoints, new_watchpoint); + return true; + } +} + +void +post_check_and_remove_watchpoint(uintptr_t address, size_t size, enum access_type access_type) +{ + struct watchpoint_entry removed_entry; + if (watchpoint_map_find_remove(&checker_watchpoints, address, fibers_current, &removed_entry)) { + FIBERS_ASSERT(removed_entry.address == address, "race? internal error"); + FIBERS_ASSERT(removed_entry.access_type == access_type, "race? internal error"); + FIBERS_ASSERT(removed_entry.size == size, "race? internal error"); + FIBERS_ASSERT(removed_entry.fiber_id == fibers_current->id, "race? 
internal error"); + + if (removed_entry.has_backtrace) { + free(removed_entry.backtrace); + } + } else { + report_missing_race(address, size, access_type); + } +} diff --git a/tests/unit/mocks/fibers/checker.h b/tests/unit/mocks/fibers/checker.h new file mode 100644 index 000000000..51874f0cd --- /dev/null +++ b/tests/unit/mocks/fibers/checker.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include "fibers.h" + +/* + * The fibers data racer checker is a watchpoint-based checker inspired by DataCollider: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Erickson.pdf + * Unlike the original paper, here everything is implemented in software and called from the load/store instrumentation in mock_thread.c + * Check to SANCOV_LOAD_STORE_DATA_CHECKER macro to see how the checker API is used. + */ + +enum access_type { + ACCESS_TYPE_LOAD = 0, + ACCESS_TYPE_STORE = 1 +}; + +// check for concurrent accesses on the same region and, if no data race is detected, install a watchpoint so that other fibers can perform the same check +extern bool check_and_set_watchpoint(void *pc, uintptr_t address, size_t size, enum access_type access_type); +// remove the watchpoint after the memory access is completed +extern void post_check_and_remove_watchpoint(uintptr_t address, size_t size, enum access_type access_type); +// report a data race +extern void report_value_race(uintptr_t current_addr, size_t current_size, enum access_type current_type); diff --git a/tests/unit/mocks/fibers/condition.c b/tests/unit/mocks/fibers/condition.c new file mode 100644 index 000000000..e257ccd57 --- /dev/null +++ b/tests/unit/mocks/fibers/condition.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "condition.h" +#include "random.h" + +void +fibers_condition_wakeup_one(fibers_condition_t *cond) +{ + fibers_condition_wakeup_some(cond, 1, NULL, NULL); +} + +int +fibers_condition_wakeup_some(fibers_condition_t *cond, int num_fibers, void (*callback)(void *, fiber_t), void *arg) +{ + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_WAKEUP | + FIBERS_YIELD_REASON_ORDER_PRE); + + if (num_fibers < 0 || num_fibers > cond->wait_queue.count) { + num_fibers = cond->wait_queue.count; + } + + unsigned int num_awakened = 0; + while (num_fibers > 0) { + fiber_t target = fibers_queue_pop(&cond->wait_queue, random_below(cond->wait_queue.count)); + FIBERS_ASSERT(target->state == FIBER_WAIT, "fibers_condition_wakeup_some: waking up %d that is not FIBER_WAIT", target->id); + FIBERS_LOG(FIBERS_LOG_DEBUG, "waking up %d waiting on condition %p", target->id, cond); + if (callback) { + callback(arg, target); + } + fibers_queue_push(&fibers_run_queue, target); + --num_fibers; + num_awakened++; + } + + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_WAKEUP | + FIBERS_YIELD_REASON_ORDER_POST | + FIBERS_YIELD_REASON_ERROR_IF(num_awakened == 0)); + + return num_fibers; +} + +void +fibers_condition_wait(fibers_condition_t *cond) +{ + FIBERS_LOG(FIBERS_LOG_DEBUG, "waiting on condition %p", cond); + FIBERS_ASSERT(fibers_current->may_yield_disabled == 0, "fibers_condition_wait: waiting on a condition with fibers_current->may_yield_disabled not 0"); + //FIBERS_ASSERT(fibers_queue_count(&cond->wait_queue) == cond->wait_queue.count, "fibers_queue_count"); + + fibers_queue_push(&cond->wait_queue, fibers_current); + fibers_choose_next(FIBER_WAIT); +} + +void +fibers_condition_destroy(fibers_condition_t *cond) +{ + FIBERS_LOG(FIBERS_LOG_DEBUG, "destroy condition %p", cond); + FIBERS_ASSERT(cond->wait_queue.count == 0, "fibers_mutex_destroy: tried to destroy condition with non empty wait queue"); +} + +fiber_t +fibers_condition_identify(fibers_condition_t *cond) +{ + FIBERS_LOG(FIBERS_LOG_DEBUG, "identify from wait queue of %d fibers", cond->wait_queue.count); + //FIBERS_ASSERT(fibers_queue_count(&cond->wait_queue) == cond->wait_queue.count, "fibers_queue_count"); + if (cond->wait_queue.count == 0) { + return NULL; + } + size_t index = random_below(cond->wait_queue.count); + fiber_t iter = cond->wait_queue.top; + while (iter != NULL) { + if (index == 0) { + return iter; + } + index--; + iter = iter->next; + } + FIBERS_ASSERT(false, "fibers_condition_identify: unreachable"); + return NULL; +} + +bool 
+fibers_condition_wakeup_identified(fibers_condition_t *cond, fiber_t target) +{ + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_WAKEUP | + FIBERS_YIELD_REASON_ORDER_PRE); + + //FIBERS_ASSERT(fibers_queue_count(&cond->wait_queue) == cond->wait_queue.count, "fibers_queue_count"); + //FIBERS_ASSERT(fibers_queue_remove(&cond->wait_queue, target), "fibers_condition_wakeup_identified: target not in wait queue"); + if (!fibers_queue_remove(&cond->wait_queue, target)) { + return false; + } + + FIBERS_ASSERT(target->state == FIBER_WAIT, "fibers_condition_wakeup_identified: waking up %d that is not FIBER_WAIT", target->id); + FIBERS_LOG(FIBERS_LOG_DEBUG, "waking up %d waiting on condition %p", target->id, cond); + fibers_queue_push(&fibers_run_queue, target); + + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_WAKEUP | + FIBERS_YIELD_REASON_ORDER_POST); + + return true; +} diff --git a/tests/unit/mocks/fibers/condition.h b/tests/unit/mocks/fibers/condition.h new file mode 100644 index 000000000..8d9366903 --- /dev/null +++ b/tests/unit/mocks/fibers/condition.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include "fibers.h" + +#define FIBERS_CONDITION_ALL (-1) + +typedef struct fibers_condition fibers_condition_t; + +struct fibers_condition { + struct fibers_queue wait_queue; +}; + +extern void fibers_condition_wakeup_one(fibers_condition_t *cond); +extern int fibers_condition_wakeup_some(fibers_condition_t *cond, int num_fibers, void (*callback)(void*, fiber_t), void *arg); +extern void fibers_condition_wait(fibers_condition_t *cond); +extern void fibers_condition_destroy(fibers_condition_t *cond); +extern fiber_t fibers_condition_identify(fibers_condition_t *cond); +extern bool fibers_condition_wakeup_identified(fibers_condition_t *cond, fiber_t target); diff --git a/tests/unit/mocks/fibers/fibers.c b/tests/unit/mocks/fibers/fibers.c new file mode 100644 index 000000000..a44a34360 --- /dev/null +++ b/tests/unit/mocks/fibers/fibers.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. 
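A hedged usage sketch of the condition API above (the producer/consumer names and the ready flag are illustrative, not from the sources): a fiber that finds its condition unsatisfied parks itself with fibers_condition_wait(), and whichever fiber establishes the condition wakes it with fibers_condition_wakeup_one().

/* Illustrative only. */
static fibers_condition_t ready_cond;   /* zero-initialised: empty wait queue */
static bool ready;

static void *
consumer_fiber(void *arg)
{
    while (!ready) {
        fibers_condition_wait(&ready_cond);       /* parks this fiber until woken */
    }
    return arg;
}

static void *
producer_fiber(void *arg)
{
    ready = true;
    fibers_condition_wakeup_one(&ready_cond);     /* moves one waiter back onto the run queue */
    return arg;
}

Because fibers only switch at explicit yield points, the flag check and the wait need no mutex the way a pthread condition variable would; fibers_condition_wakeup_some() covers waking several waiters, and fibers_condition_identify()/fibers_condition_wakeup_identified() let the waker pick a specific one.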
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#define _XOPEN_SOURCE // To use *context deprecated API on OSX +#define BSD_KERNEL_PRIVATE + +#include "fibers.h" +#include "random.h" + +#include +#include +#include + +#ifdef __BUILDING_WITH_TSAN__ +#include +#endif +#ifdef __BUILDING_WITH_ASAN__ +#include +#endif + +// from ucontext.h +#include +extern void makecontext(ucontext_t *ucp, void (*func)(), int argc, ...); +extern int swapcontext(ucontext_t *oucp, const ucontext_t *ucp); +extern int getcontext(ucontext_t *ucp); +extern int setcontext(const ucontext_t *ucp); + +int fibers_log_level; +bool fibers_debug; +int fibers_abort_on_error = 0; + +uint64_t fibers_may_yield_probability = FIBERS_DEFAULT_YIELD_PROB; + +struct fiber_context fibers_main = { + .id = 0, + .state = FIBER_RUN, +}; +static int fibers_last_forged_id = 0; + +fiber_t fibers_current = &fibers_main; /* currently running */ +struct fibers_queue fibers_run_queue; /* ready to be scheduled */ +struct fibers_queue fibers_existing_queue = { .top = &fibers_main, .count = 1 }; /* existing fibers */ + +static void +fibers_default_choose_next(__unused void *arg, int state) +{ + fibers_switch_random(state); +} + +static bool +fibers_default_should_yield(__unused void *arg, uint64_t probability, __unused fiber_yield_reason_t reason) +{ + return probability && random_below(probability) == 0; +} + +struct fibers_scheduler_t fibers_default_scheduler = { + .fibers_choose_next = &fibers_default_choose_next, + .fibers_should_yield = &fibers_default_should_yield +}; + +struct fibers_scheduler_t *fibers_scheduler = &fibers_default_scheduler; +void *fibers_scheduler_context = 0; + +void +fibers_scheduler_get(struct fibers_scheduler_t **scheduler, void **context) +{ + *scheduler = fibers_scheduler; + *context = fibers_scheduler_context; +} + +void +fibers_scheduler_set(struct fibers_scheduler_t *scheduler, void *context) +{ + fibers_scheduler = scheduler; + fibers_scheduler_context = context; +} + +struct fibers_create_trampoline_args { + fiber_t fiber; + void *start_routine_arg; + jmp_buf parent_env; +}; + +static void +fibers_create_trampoline(int arg1, int arg2) +{ + struct fibers_create_trampoline_args *args = (struct fibers_create_trampoline_args *)(((uintptr_t)arg1 << 32) | (uintptr_t)arg2); + // Copy fiber and arg to 
the local scope as by the time start_routine is called the parent fibers_create stack may have been deallocated + fiber_t fiber = args->fiber; + void *start_routine_arg = args->start_routine_arg; + + #ifdef __BUILDING_WITH_ASAN__ + __sanitizer_finish_switch_fiber(&fiber->sanitizer_fake_stack, &fiber->stack_bottom, &fiber->stack_size); + #endif + + // setjmp/longjmp are faster context switch primitives compared to swapcontext + if (setjmp(fiber->env) == 0) { + // The first time the setjmp is called to save the current context in fiber->env + // we end un in this branch in which we switch back to fibers_create + // When the fiber will be scheduled for the first time, setjmp(fiber->env) != 0 + // and thus the execution will continue in the other branch that calls args.start_routine +#ifdef __BUILDING_WITH_ASAN__ + __sanitizer_start_switch_fiber(&fibers_current->sanitizer_fake_stack, fibers_current->stack_bottom, fibers_current->stack_size); +#endif +#ifdef __BUILDING_WITH_TSAN__ + __tsan_switch_to_fiber(fibers_current->tsan_fiber, 0); +#endif + longjmp(args->parent_env, 1337); + } + + #ifdef __BUILDING_WITH_ASAN__ + __sanitizer_finish_switch_fiber(&fiber->sanitizer_fake_stack, &fiber->stack_bottom, &fiber->stack_size); + #endif + + fibers_current = fiber; + FIBERS_LOG(FIBERS_LOG_INFO, "starting to execute the routine"); + + void *ret_value = fiber->start_routine(start_routine_arg); + fibers_exit(ret_value); +} + +fiber_t +fibers_create(size_t stack_size, void* (*start_routine)(void*), void* arg) +{ + if (fibers_current == &fibers_main && fibers_main.stack_bottom == NULL) { + // fibers_main has no stack_bottom or stack_size, get them here the first time + void* stackaddr = pthread_get_stackaddr_np(pthread_self()); + size_t stacksize = pthread_get_stacksize_np(pthread_self()); + fibers_main.stack_bottom = stackaddr - stacksize; + fibers_main.stack_size = stacksize; + +#ifdef __BUILDING_WITH_TSAN__ + fibers_main.tsan_fiber = __tsan_get_current_fiber(); + __tsan_set_fiber_name(fibers_main.tsan_fiber, "fiber0"); +#endif + } + + void *stack_addr = malloc(stack_size); + + fiber_t fiber = calloc(1, sizeof(struct fiber_context)); + fiber->id = ++fibers_last_forged_id; + FIBERS_ASSERT(fibers_last_forged_id != 0, "fibers_create: new fiber id integer overflow"); + fiber->state = FIBER_STOP; + fiber->start_routine = start_routine; + fiber->stack_size = stack_size; + fiber->stack_bottom = stack_addr; + FIBERS_ASSERT(fiber->stack_bottom, "fibers_create: stack malloc failed"); + +#ifdef __BUILDING_WITH_TSAN__ + fiber->tsan_fiber = __tsan_create_fiber(0); + char tsan_fiber_name[32]; + snprintf(tsan_fiber_name, 32, "fiber%d", fiber->id); + __tsan_set_fiber_name(fiber->tsan_fiber, tsan_fiber_name); +#endif + + ucontext_t tmp_uc; + ucontext_t child_uc = {0}; + FIBERS_ASSERT(getcontext(&child_uc) == 0, "fibers_create: getcontext"); + child_uc.uc_stack.ss_sp = stack_addr; + child_uc.uc_stack.ss_size = stack_size; + child_uc.uc_link = 0; + + struct fibers_create_trampoline_args trampoline_args = {0}; + trampoline_args.fiber = fiber; + trampoline_args.start_routine_arg = arg; + + int trampoline_args1 = (int)((uintptr_t)&trampoline_args >> 32); + int trampoline_args2 = (int)((uintptr_t)&trampoline_args); + + makecontext(&child_uc, (void (*)())fibers_create_trampoline, 2, trampoline_args1, trampoline_args2); + + // switch to the trampoline to setup the setjmp env of the fiber on the newly created stack, then switch back + // setjmp/longjmp are faster context switch primitives, swapcontext will never be used again for 
this fiber + // ref. the ThreadSanitizer fibers example in LLVM at compiler-rt/test/tsan/fiber_longjmp.cpp + if (setjmp(trampoline_args.parent_env) == 0) { +#ifdef __BUILDING_WITH_ASAN__ + __sanitizer_start_switch_fiber(&fiber->sanitizer_fake_stack, fiber->stack_bottom, fiber->stack_size); +#endif +#ifdef __BUILDING_WITH_TSAN__ + __tsan_switch_to_fiber(fiber->tsan_fiber, 0); +#endif + FIBERS_ASSERT(swapcontext(&tmp_uc, &child_uc) == 0, "fibers_create: swapcontext"); + } + +#ifdef __BUILDING_WITH_ASAN__ + // fibers_create_trampoline did not change fibers_current + __sanitizer_finish_switch_fiber(&fibers_current->sanitizer_fake_stack, &fibers_current->stack_bottom, &fibers_current->stack_size); +#endif + + fibers_queue_push(&fibers_run_queue, fiber); + fibers_existing_push(fiber); + + FIBERS_LOG(FIBERS_LOG_INFO, "fiber %d created", fiber->id); + + /* chance to schedule the newly created fiber */ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_CREATE); + return fiber; +} + +static void +fibers_dispose(fiber_t fiber) +{ + FIBERS_LOG(FIBERS_LOG_DEBUG, "dispose %d", fiber->id); + + fibers_existing_remove(fiber); + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_destroy_fiber(fiber->tsan_fiber); +#endif + + if (fiber->extra_cleanup_routine) { + fiber->extra_cleanup_routine(fiber->extra); + } + + free((void*)fiber->stack_bottom); + free(fiber); +} + +void +fibers_exit(void *ret_value) +{ + FIBERS_ASSERT(fibers_current->may_yield_disabled == 0, "fibers_exit: fibers_current->may_yield_disabled is not 0"); + + fibers_current->ret_value = ret_value; + if (fibers_current->joiner) { + FIBERS_LOG(FIBERS_LOG_INFO, "exiting, joined by %d", fibers_current->joiner->id); + fibers_queue_push(&fibers_run_queue, fibers_current->joiner); + } else { + FIBERS_LOG(FIBERS_LOG_INFO, "exiting, no joiner"); + } + + fibers_choose_next(FIBER_DEAD); + FIBERS_ASSERT(false, "fibers_exit: unreachable"); +} + +void * +fibers_join(fiber_t target) +{ + FIBERS_ASSERT(fibers_current->may_yield_disabled == 0, "fibers_join: fibers_current->may_yield_disabled is not 0"); + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_JOIN | FIBERS_YIELD_REASON_ORDER_PRE); + + FIBERS_LOG(FIBERS_LOG_INFO, "join %d", target->id); + if (target->state != FIBER_DEAD) { + FIBERS_ASSERT(target->joiner == NULL, "fibers_join: %d already joined by %d", target->id, target->joiner->id); + + target->joiner = fibers_current; + fibers_current->joining = target; + + // RANGELOCKINGTODO rdar://150845975 maybe have a queue for fibers in join to output debug info in case of deadlock + fibers_choose_next(FIBER_JOIN); + } + + FIBERS_LOG(FIBERS_LOG_INFO, "finish joining %d", target->id); + FIBERS_ASSERT(target->state == FIBER_DEAD, "fibers_join: not dead"); + + void *ret_value = target->ret_value; + fibers_dispose(target); + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_JOIN | FIBERS_YIELD_REASON_ORDER_POST); + return ret_value; +} + +void +fibers_switch_helper(fiber_t target, int state) +{ + if (target == fibers_current) { + target->state = FIBER_RUN; + return; + } + FIBERS_LOG(FIBERS_LOG_TRACE, "switch to %d, state=%d", target->id, state); + + fibers_current->state = state; + fiber_t save = fibers_current; + + if (setjmp(save->env) == 0) { +#ifdef __BUILDING_WITH_ASAN__ + __sanitizer_start_switch_fiber(&target->sanitizer_fake_stack, target->stack_bottom, target->stack_size); +#endif +#ifdef __BUILDING_WITH_TSAN__ + __tsan_switch_to_fiber(target->tsan_fiber, state == FIBER_DEAD ? 
0 : __tsan_switch_to_fiber_no_sync); +#endif + longjmp(target->env, 1337); + } +#ifdef __BUILDING_WITH_ASAN__ + __sanitizer_finish_switch_fiber(&save->sanitizer_fake_stack, &save->stack_bottom, &save->stack_size); +#endif + + fibers_current = save; + save->state = FIBER_RUN; +} + +void +fibers_choose_next(int state) +{ + fibers_scheduler->fibers_choose_next(fibers_scheduler_context, state); +} + +void +fibers_switch_to(fiber_t target, int state) +{ + FIBERS_ASSERT(fibers_queue_remove(&fibers_run_queue, target), "fibers_switch_to"); + fibers_switch_helper(target, state); +} + +void +fibers_switch_to_by_id(int target_id, int state) +{ + fiber_t target = fibers_queue_remove_by_id(&fibers_run_queue, target_id); + FIBERS_ASSERT(target != NULL, "fibers_switch_to_by_id"); + fibers_switch_helper(target, state); +} + +void +fibers_switch_top(int state) +{ + fiber_t target = fibers_queue_pop(&fibers_run_queue, 0); + fibers_switch_helper(target, state); +} + +void +fibers_switch_random(int state) +{ + fiber_t target = fibers_queue_pop(&fibers_run_queue, random_below(fibers_run_queue.count)); + fibers_switch_helper(target, state); +} + +void +fibers_yield_to(int fiber_id) +{ + fibers_queue_push(&fibers_run_queue, fibers_current); + fibers_switch_to_by_id(fiber_id, FIBER_STOP); +} + +void +fibers_yield(void) +{ + fibers_queue_push(&fibers_run_queue, fibers_current); + fibers_choose_next(FIBER_STOP); +} + +bool +fibers_may_yield_internal(void) +{ + return fibers_may_yield_with_prob_and_reason(FIBERS_INTERNAL_YIELD_PROB, FIBERS_YIELD_REASON_UNKNOWN); +} + +bool +fibers_may_yield_internal_with_reason(fiber_yield_reason_t reason) +{ + return fibers_may_yield_with_prob_and_reason(FIBERS_INTERNAL_YIELD_PROB, reason); +} + +bool +fibers_may_yield(void) +{ + return fibers_may_yield_with_prob(fibers_may_yield_probability); +} + +bool +fibers_may_yield_with_prob(uint64_t probability) +{ + return fibers_may_yield_with_prob_and_reason(probability, FIBERS_YIELD_REASON_UNKNOWN); +} + +bool +fibers_may_yield_with_reason(fiber_yield_reason_t reason) +{ + return fibers_may_yield_with_prob_and_reason(fibers_may_yield_probability, reason); +} + +bool +fibers_may_yield_with_prob_and_reason(uint64_t probability, fiber_yield_reason_t reason) +{ + if (fibers_current->may_yield_disabled) { + return false; + } + + if (fibers_scheduler->fibers_should_yield(fibers_scheduler_context, probability, reason)) { + fibers_queue_push(&fibers_run_queue, fibers_current); + fibers_choose_next(FIBER_STOP); + return true; + } + + return false; +} diff --git a/tests/unit/mocks/fibers/fibers.h b/tests/unit/mocks/fibers/fibers.h new file mode 100644 index 000000000..d8e515c90 --- /dev/null +++ b/tests/unit/mocks/fibers/fibers.h @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
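The scheduler is pluggable through fibers_scheduler_set(): the two hooks in struct fibers_scheduler_t decide which runnable fiber runs next and whether an instrumented point yields at all, with fibers_default_scheduler picking at random via fibers_switch_random(). As a hedged sketch of a replacement a test could install before creating fibers (names below are illustrative), a fully deterministic FIFO scheduler that suppresses the probabilistic yields:

/* Illustrative only: a deterministic replacement for fibers_default_scheduler. */
static void
fifo_choose_next(__unused void *arg, int state)
{
    fibers_switch_top(state);            /* always run the head of the run queue */
}

static bool
fifo_should_yield(__unused void *arg, __unused uint64_t probability,
    __unused fiber_yield_reason_t reason)
{
    return false;                        /* only explicit yields and blocking switch fibers */
}

static struct fibers_scheduler_t fifo_scheduler = {
    .fibers_choose_next = &fifo_choose_next,
    .fibers_should_yield = &fifo_should_yield,
};

A test would call fibers_scheduler_set(&fifo_scheduler, NULL) before fibers_create(), and can restore the previous hooks afterwards using the pair saved by fibers_scheduler_get().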
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include "mocks/std_safe.h" +#include "mocks/unit_test_utils.h" + +#define FIBERS_DEFAULT_STACK_SIZE 1048576 // 1mb +#define FIBERS_INTERNAL_YIELD_PROB 4 // switch on internal yield points 1/4 of the time +#define FIBERS_DEFAULT_YIELD_PROB 256 + +/* Configuration variables */ +extern int fibers_log_level; // see FIBERS_LOG and the levels FIBERS_LOG_* +extern bool fibers_debug; // Mostly used to collect backtraces at fibers event points. The slowdown is huge. +extern int fibers_abort_on_error; // By default errors do not stop execution; set to non-zero to abort. +extern uint64_t fibers_may_yield_probability; // FIBERS_DEFAULT_YIELD_PROB by default + +typedef struct fiber_context *fiber_t; + +typedef uint32_t fiber_yield_reason_t; + +#define FIBERS_YIELD_REASON_ORDER_PRE_SHIFT (16) +#define FIBERS_YIELD_REASON_ORDER_PRE (0 << FIBERS_YIELD_REASON_ORDER_PRE_SHIFT) +#define FIBERS_YIELD_REASON_ORDER_POST (1 << FIBERS_YIELD_REASON_ORDER_PRE_SHIFT) +#define FIBERS_YIELD_REASON_ORDER(x) ((x) & (1 << FIBERS_YIELD_REASON_ORDER_PRE_SHIFT)) + +#define FIBERS_YIELD_REASON_ERROR_SHIFT (17) +#define FIBERS_YIELD_REASON_ERROR (1 << FIBERS_YIELD_REASON_ERROR_SHIFT) +#define FIBERS_YIELD_REASON_ERROR_IF(x) ((x) ?
FIBERS_YIELD_REASON_ERROR : 0) +#define FIBERS_YIELD_REASON_IS_ERROR(x) (!!((x) & FIBERS_YIELD_REASON_ERROR)) + +#define FIBERS_YIELD_REASON_MUTEX_SHIFT (18) +#define FIBERS_YIELD_REASON_MUTEX_LOCK (0 << FIBERS_YIELD_REASON_MUTEX_SHIFT) +#define FIBERS_YIELD_REASON_MUTEX_UNLOCK (1 << FIBERS_YIELD_REASON_MUTEX_SHIFT) +#define FIBERS_YIELD_REASON_MUTEX_DESTROY (2 << FIBERS_YIELD_REASON_MUTEX_SHIFT) +#define FIBERS_YIELD_REASON_MUTEX_STATE(x) ((x) & (3 << FIBERS_YIELD_REASON_MUTEX_SHIFT)) + +#define FIBERS_YIELD_REASON_CATEGORY(x) ((x) & 0xffff) +#define FIBERS_YIELD_REASON_UNKNOWN 0 +#define FIBERS_YIELD_REASON_MUTEX 1 +#define FIBERS_YIELD_REASON_PREEMPTION_CONTROL 2 +#define FIBERS_YIELD_REASON_PREEMPTION_TRIGGER 3 +#define FIBERS_YIELD_REASON_BLOCKED 4 +#define FIBERS_YIELD_REASON_WAKEUP 5 +#define FIBERS_YIELD_REASON_CREATE 6 +#define FIBERS_YIELD_REASON_JOIN 7 + +#define FIBERS_YIELD_REASON_PREEMPTION_WILL_ENABLE (FIBERS_YIELD_REASON_PREEMPTION_CONTROL | \ + FIBERS_YIELD_REASON_MUTEX_UNLOCK | \ + FIBERS_YIELD_REASON_ORDER_PRE) + +#define FIBERS_YIELD_REASON_PREEMPTION_DID_ENABLE (FIBERS_YIELD_REASON_PREEMPTION_CONTROL | \ + FIBERS_YIELD_REASON_MUTEX_UNLOCK | \ + FIBERS_YIELD_REASON_ORDER_POST) + +#define FIBERS_YIELD_REASON_PREEMPTION_WILL_DISABLE (FIBERS_YIELD_REASON_PREEMPTION_CONTROL | \ + FIBERS_YIELD_REASON_MUTEX_LOCK | \ + FIBERS_YIELD_REASON_ORDER_PRE) + +#define FIBERS_YIELD_REASON_PREEMPTION_DID_DISABLE (FIBERS_YIELD_REASON_PREEMPTION_CONTROL | \ + FIBERS_YIELD_REASON_MUTEX_LOCK | \ + FIBERS_YIELD_REASON_ORDER_POST) + +#define FIBERS_YIELD_REASON_MUTEX_WILL_LOCK (FIBERS_YIELD_REASON_MUTEX | \ + FIBERS_YIELD_REASON_MUTEX_LOCK | \ + FIBERS_YIELD_REASON_ORDER_PRE) + +#define FIBERS_YIELD_REASON_MUTEX_DID_LOCK (FIBERS_YIELD_REASON_MUTEX | \ + FIBERS_YIELD_REASON_MUTEX_LOCK | \ + FIBERS_YIELD_REASON_ORDER_POST) + +#define FIBERS_YIELD_REASON_MUTEX_TRY_LOCK_FAIL (FIBERS_YIELD_REASON_MUTEX_DID_LOCK | \ + FIBERS_YIELD_REASON_ERROR) + +#define FIBERS_YIELD_REASON_MUTEX_WILL_UNLOCK (FIBERS_YIELD_REASON_MUTEX | \ + FIBERS_YIELD_REASON_MUTEX_UNLOCK | \ + FIBERS_YIELD_REASON_ORDER_PRE) + +#define FIBERS_YIELD_REASON_MUTEX_DID_UNLOCK (FIBERS_YIELD_REASON_MUTEX | \ + FIBERS_YIELD_REASON_MUTEX_UNLOCK | \ + FIBERS_YIELD_REASON_ORDER_POST) + + +extern fiber_t fibers_current; +extern struct fibers_queue fibers_run_queue; +extern struct fibers_queue fibers_existing_queue; + +#define FIBERS_ASSERT(expr, msg, ...) do { \ + if (!(expr)) { \ + raw_printf("fibers failure: current=%d expr=" #expr ": " msg "\n", (fibers_current ? fibers_current->id : -1 ), ##__VA_ARGS__); \ + if (fibers_debug) print_current_backtrace(); \ + if (fibers_abort_on_error) abort(); \ + } \ + } while (0) + +struct fibers_scheduler_t { + void (*fibers_choose_next)(void *arg, int state); + bool (*fibers_should_yield)(void *arg, uint64_t probability, fiber_yield_reason_t reason); +}; + +extern void fibers_scheduler_get(struct fibers_scheduler_t **scheduler, void **context); +extern void fibers_scheduler_set(struct fibers_scheduler_t *scheduler, void *context); + +extern struct fibers_scheduler_t *fibers_scheduler; +extern void *fibers_scheduler_context; + +#define FIBERS_LOG_WARN 0 +#define FIBERS_LOG_INFO 1 +#define FIBERS_LOG_DEBUG 2 +#define FIBERS_LOG_TRACE 3 +#define FIBERS_LOG(level, msg, ...) do { \ + if (fibers_log_level >= (level)) { \ + raw_printf("fibers log(%d): current=%d: " msg "\n", (level), (fibers_current ?
fibers_current->id : -1 ), ##__VA_ARGS__); \ + if (fibers_debug) print_current_backtrace(); \ + } \ + } while (0) + +struct fiber_context { + int id; /* unique fiber id assigned at creation */ + int state; /* current state */ +#define FIBER_RUN 0x1 +#define FIBER_STOP 0x2 +#define FIBER_WAIT 0x4 +#define FIBER_JOIN 0x8 +#define FIBER_DEAD 0x10 + + int may_yield_disabled; + int disable_race_checker; + + fiber_t joining; /* waiting for this fiber if FIBER_JOIN */ + fiber_t joiner; /* signal this fiber on termination */ + fiber_t next; /* next fiber on the same queue (run or wait queue) */ + fiber_t next_existing; /* next fiber in the list of existing fibers */ + + void* (*start_routine)(void*); /* start routine function pointer */ + void *ret_value; /* return value upon exit */ + jmp_buf env; /* buf to jump when run */ + const void *stack_bottom; /* stack bottom addr, 16 bytes aligned */ + size_t stack_size; + + void *extra; /* per-fiber extra data */ + void (*extra_cleanup_routine)(void*); + +#ifdef __BUILDING_WITH_ASAN__ + void *sanitizer_fake_stack; /* set by asan to track fake stack switches */ +#endif +#ifdef __BUILDING_WITH_TSAN__ + void *tsan_fiber; +#endif +}; + +static void +fibers_checker_atomic_begin(void) +{ + fibers_current->disable_race_checker++; +} + +static void +fibers_checker_atomic_end(void) +{ + fibers_current->disable_race_checker--; +} + +struct fibers_queue { + fiber_t top; + size_t count; +}; + +static inline void +fibers_queue_push(struct fibers_queue *queue, fiber_t fiber) +{ + FIBERS_ASSERT(fiber->next == NULL, "fibers_queue_push: already on another queue"); + fiber->next = queue->top; + queue->top = fiber; + queue->count++; +} + +static inline fiber_t +fibers_queue_pop(struct fibers_queue *queue, size_t index) +{ + FIBERS_ASSERT(queue->count > 0, "fibers_queue_pop: empty queue"); + FIBERS_ASSERT(queue->count > index, "fibers_queue_pop: invalid index"); + fiber_t *iter = &queue->top; + while (*iter != NULL) { + if (index == 0) { + fiber_t fiber = *iter; + *iter = fiber->next; + fiber->next = NULL; + queue->count--; + return fiber; + } + index--; + iter = &(*iter)->next; + } + FIBERS_ASSERT(false, "fibers_queue_pop: unreachable"); + return NULL; +} + +static inline fiber_t +fibers_queue_peek(struct fibers_queue *queue) +{ + for (fiber_t *iter = &queue->top; + *iter != NULL; + iter = &(*iter)->next) { + if ((*iter)->next == NULL) { + return *iter; + } + } + return NULL; +} + +static inline bool +fibers_queue_contains(struct fibers_queue *queue, fiber_t fiber) +{ + fiber_t iter = queue->top; + while (iter != NULL) { + if (iter == fiber) { + return true; + } + iter = iter->next; + } + return false; +} + +static inline bool +fibers_queue_remove(struct fibers_queue *queue, fiber_t fiber) +{ + fiber_t *iter = &queue->top; + while (*iter != NULL) { + if (*iter == fiber) { + *iter = fiber->next; + fiber->next = NULL; + queue->count--; + return true; + } + iter = &(*iter)->next; + } + return false; +} + +static inline fiber_t +fibers_queue_remove_by_id(struct fibers_queue *queue, int fiber_id) +{ + fiber_t *iter = &queue->top; + while (*iter != NULL) { + if ((*iter)->id == fiber_id) { + fiber_t fiber = *iter; + *iter = fiber->next; + fiber->next = NULL; + queue->count--; + return fiber; + } + iter = &(*iter)->next; + } + return NULL; +} + +static inline size_t +fibers_queue_count(struct fibers_queue *queue) +{ + fiber_t iter = queue->top; + size_t count = 0; + while (iter != NULL) { + count++; + iter = iter->next; + } + return count; +} + +static inline void 
+fibers_existing_push(fiber_t fiber) +{ + FIBERS_ASSERT(fiber->next_existing == NULL, "fibers_existing_push: already on existing queue"); + fiber->next_existing = fibers_existing_queue.top; + fibers_existing_queue.top = fiber; + fibers_existing_queue.count++; +} + +static inline bool +fibers_existing_remove(fiber_t fiber) +{ + fiber_t *iter = &fibers_existing_queue.top; + while (*iter != NULL) { + if (*iter == fiber) { + *iter = fiber->next_existing; + fiber->next_existing = NULL; + fibers_existing_queue.count--; + return true; + } + iter = &(*iter)->next_existing; + } + return false; +} + +// Create, exit and join are similar to pthread. +// Detaching is not supported at the moment. +extern fiber_t fibers_create(size_t stack_size, void *(*start_routine)(void*), void *arg); +extern void fibers_exit(void *ret_value); +extern void *fibers_join(fiber_t target); + +extern void fibers_switch_to(fiber_t target, int state); +extern void fibers_switch_to_by_id(int target_id, int state); +extern void fibers_switch_top(int state); +extern void fibers_switch_random(int state); +extern void fibers_switch_helper(fiber_t target, int state); +extern void fibers_choose_next(int state); + +// Force a context switch +extern void fibers_yield(void); +// Force a context switch to a specific fiber (must be ready to be scheduled) +extern void fibers_yield_to(int fiber_id); +// Context switch with fibers_may_yield_probability +extern bool fibers_may_yield(void); +// Context switch with a default priority for infrastructure +extern bool fibers_may_yield_internal(); +// Context switch with a default priority for infrastructure and explicit reason +extern bool fibers_may_yield_internal_with_reason(fiber_yield_reason_t reason); +// Context switch with custom probability +extern bool fibers_may_yield_with_prob(uint64_t probability); +// Context switch with fibers_may_yield_probability and an explicit reason +extern bool fibers_may_yield_with_reason(fiber_yield_reason_t reason); +// Context switch with custom probability and explicit reason +extern bool fibers_may_yield_with_prob_and_reason(uint64_t probability, fiber_yield_reason_t reason); diff --git a/tests/unit/mocks/fibers/mutex.c b/tests/unit/mocks/fibers/mutex.c new file mode 100644 index 000000000..987671e1c --- /dev/null +++ b/tests/unit/mocks/fibers/mutex.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
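The create/exit/join API declared above mirrors pthreads (minus detaching). A hypothetical test body using it might read as follows; harness startup and the creation of the initial fiber are assumed to happen elsewhere:

#include "fibers.h"

static int shared_counter; /* fibers only switch at yield points, so no lock is needed here */

static void *
worker(void *arg)
{
	for (int i = 0; i < 100; i++) {
		shared_counter++;
		fibers_may_yield(); /* give the scheduler a chance to interleave */
	}
	fibers_exit(arg);   /* record the return value and switch away for good */
	return NULL;        /* not reached */
}

static void
example_create_join(void)
{
	fiber_t a = fibers_create(FIBERS_DEFAULT_STACK_SIZE, worker, (void *)1);
	fiber_t b = fibers_create(FIBERS_DEFAULT_STACK_SIZE, worker, (void *)2);

	/* fibers_join blocks the calling fiber until the target exits,
	 * returns its exit value, and disposes of the dead fiber. */
	void *ra = fibers_join(a);
	void *rb = fibers_join(b);
	(void)ra;
	(void)rb;
}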
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "mutex.h" +#include "random.h" + +#include +#include +#include + +#ifdef __BUILDING_WITH_TSAN__ +#include +#endif + +void +fibers_mutex_init(fibers_mutex_t *mtx) +{ + mtx->holder = 0; + mtx->wait_queue = (struct fibers_queue){0, 0}; +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_create(mtx, __tsan_mutex_not_static); +#endif +} + +static void +fibers_mutex_lock_helper(fibers_mutex_t *mtx, bool check_may_yield) +{ +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_lock(mtx, 0); +#endif + + if (mtx->holder) { + FIBERS_ASSERT(mtx->holder != fibers_current, "fibers_mutex_lock_helper: tried to lock mutex already held by %d", mtx->holder->id); + // TODO rdar://150846598 add support for recursive locks + FIBERS_LOG(FIBERS_LOG_DEBUG, "waiting on mutex %p locked by %d", mtx, mtx->holder->id); + if (check_may_yield) { + // check for mutexes but not spinlocks + FIBERS_ASSERT(fibers_current->may_yield_disabled == 0, "fibers_mutex_lock_helper: waiting on a mutex with fibers_current->may_yield_disabled not 0"); + } + + fibers_queue_push(&mtx->wait_queue, fibers_current); +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_divert(mtx, 0); +#endif + fibers_choose_next(FIBER_WAIT); +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_divert(mtx, 0); +#endif + FIBERS_ASSERT(mtx->holder == fibers_current, "fibers_mutex_lock_helper: waken up without being the holder of %p", mtx); + } else { + FIBERS_LOG(FIBERS_LOG_DEBUG, "locking mutex %p", mtx); + mtx->holder = fibers_current; + } + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(mtx, 0, 0); +#endif + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_DID_LOCK); +} + +static void +fibers_mutex_unlock_helper(fibers_mutex_t *mtx) +{ + FIBERS_ASSERT(mtx->holder == fibers_current, "fibers_mutex_unlock_helper: tried to unlock mutex held by %d", mtx->holder ? 
mtx->holder->id : -1); + FIBERS_LOG(FIBERS_LOG_DEBUG, "unlocking mutex %p", mtx); + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_unlock(mtx, 0); +#endif + + mtx->holder = NULL; + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_unlock(mtx, 0); +#endif + + if (mtx->wait_queue.count) { + fiber_t new_holder = fibers_queue_pop(&mtx->wait_queue, random_below(mtx->wait_queue.count)); + FIBERS_ASSERT(new_holder->state == FIBER_WAIT, "fibers_mutex_unlock_helper: new holder %d is not FIBER_WAIT", new_holder->id); + FIBERS_LOG(FIBERS_LOG_DEBUG, "waking up %d waiting on mutex %p", new_holder->id, mtx); + mtx->holder = new_holder; + + fibers_queue_push(&fibers_run_queue, new_holder); + } + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_DID_UNLOCK); +} + +static int +fibers_mutex_try_lock_helper(fibers_mutex_t *mtx) +{ +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_lock(mtx, __tsan_mutex_try_lock); +#endif + + if (mtx->holder) { +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(mtx, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0); +#endif + return EBUSY; + } else { + FIBERS_LOG(FIBERS_LOG_DEBUG, "locking mutex %p", mtx); + mtx->holder = fibers_current; + } + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(mtx, __tsan_mutex_try_lock, 0); +#endif + return 0; +} + +void +fibers_mutex_lock(fibers_mutex_t *mtx, bool check_may_yield) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_LOCK); + fibers_mutex_lock_helper(mtx, check_may_yield); +} + +void +fibers_mutex_unlock(fibers_mutex_t *mtx) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_UNLOCK); + fibers_mutex_unlock_helper(mtx); +} + +int +fibers_mutex_try_lock(fibers_mutex_t *mtx) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_LOCK); + int err = fibers_mutex_try_lock_helper(mtx); + fibers_may_yield_internal_with_reason(err == 0 ? FIBERS_YIELD_REASON_MUTEX_DID_LOCK : FIBERS_YIELD_REASON_MUTEX_TRY_LOCK_FAIL); + return err; +} + +void +fibers_mutex_destroy(fibers_mutex_t *mtx) +{ + FIBERS_ASSERT(mtx->holder == NULL, "fibers_mutex_destroy: tried to destroy mutex held by %d", mtx->holder->id); + FIBERS_ASSERT(mtx->wait_queue.count == 0, "fibers_mutex_destroy: tried to destroy mutex with non empty wait queue"); + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_destroy(mtx, __tsan_mutex_not_static); +#endif + + fibers_may_yield_internal(); +} diff --git a/tests/unit/mocks/fibers/mutex.h b/tests/unit/mocks/fibers/mutex.h new file mode 100644 index 000000000..df1846d35 --- /dev/null +++ b/tests/unit/mocks/fibers/mutex.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
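The mutex above blocks only at fiber yield points and wakes a random waiter on unlock, so lock-ordering bugs become reproducible under a fixed seed. A usage sketch (names are illustrative, initialization shown inline):

#include "fibers.h"
#include "mutex.h"

static fibers_mutex_t example_mtx;
static int protected_value;

static void *
locker(void *arg)
{
	/* second argument: assert that yielding is allowed while blocking
	 * (true for mutex-like locks, false when modelling a spinlock) */
	fibers_mutex_lock(&example_mtx, true);
	protected_value++;
	fibers_mutex_unlock(&example_mtx);
	fibers_exit(arg);
	return NULL; /* not reached */
}

static void
example_mutex(void)
{
	fibers_mutex_init(&example_mtx);
	fiber_t f = fibers_create(FIBERS_DEFAULT_STACK_SIZE, locker, NULL);

	if (fibers_mutex_try_lock(&example_mtx) == 0) { /* 0 on success, EBUSY otherwise */
		protected_value++;
		fibers_mutex_unlock(&example_mtx);
	}

	fibers_join(f);
	fibers_mutex_destroy(&example_mtx);
}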
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include "fibers.h" + +typedef struct fibers_mutex fibers_mutex_t; + +struct fibers_mutex { + fiber_t holder; + struct fibers_queue wait_queue; +}; + +extern void fibers_mutex_init(fibers_mutex_t *mtx); +extern void fibers_mutex_lock(fibers_mutex_t *mtx, bool check_may_yield); +extern void fibers_mutex_unlock(fibers_mutex_t *mtx); +extern int fibers_mutex_try_lock(fibers_mutex_t *mtx); +extern void fibers_mutex_destroy(fibers_mutex_t *mtx); diff --git a/tests/unit/mocks/fibers/random.c b/tests/unit/mocks/fibers/random.c new file mode 100644 index 000000000..893fb8bd1 --- /dev/null +++ b/tests/unit/mocks/fibers/random.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
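The mutex code above tags every yield point with a FIBERS_YIELD_REASON_* value: a 16-bit category plus PRE/POST, error, and mutex-state bits. A scheduler's fibers_should_yield hook can decode those bits to bias the interleaving; the sketch below (an assumption: plugged in via fibers_scheduler_set, not part of the patch) yields more aggressively right after a mutex operation completes and never on failed try-locks:

#include "fibers.h"
#include "random.h"

static bool
biased_should_yield(void *arg, uint64_t probability, fiber_yield_reason_t reason)
{
	(void)arg;
	if (FIBERS_YIELD_REASON_IS_ERROR(reason)) {
		return false; /* e.g. FIBERS_YIELD_REASON_MUTEX_TRY_LOCK_FAIL */
	}
	if (FIBERS_YIELD_REASON_CATEGORY(reason) == FIBERS_YIELD_REASON_MUTEX &&
	    FIBERS_YIELD_REASON_ORDER(reason) == FIBERS_YIELD_REASON_ORDER_POST) {
		probability = probability > 1 ? probability / 2 : 1; /* double the yield odds */
	}
	return probability != 0 && random_below(probability) == 0;
}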
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "random.h" + +#include + +// written in 2015 by Sebastiano Vigna https://prng.di.unimi.it/splitmix64.c +static inline uint64_t +splitmix64_next(uint64_t *state) +{ + uint64_t z = (*state += 0x9e3779b97f4a7c15); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + return z ^ (z >> 31); +} + +static inline uint64_t +rotl64(uint64_t x, int8_t r) +{ + return (x << r) | (x >> (64 - r)); +} + +// fast alternative to x % n +static inline uint64_t +fast_bound(uint64_t x, uint64_t n) +{ + uint128_t mul = (uint128_t)x * (uint128_t)n; + return (uint64_t)(mul >> 64); +} + +// initial state as if random_set_seed(1337) was called +uint64_t romuduojr_x_state = 13161956497586561035ull; +uint64_t romuduojr_y_state = 14663483216071361993ull; + +void +random_set_seed(uint64_t seed) +{ + romuduojr_x_state = splitmix64_next(&seed); + romuduojr_y_state = splitmix64_next(&seed); +} + +uint64_t +random_next(void) +{ + const uint64_t xp = romuduojr_x_state; + romuduojr_x_state = 15241094284759029579ull * romuduojr_y_state; + romuduojr_y_state = romuduojr_y_state - xp; + romuduojr_y_state = rotl64(romuduojr_y_state, 27); + return xp; +} + +uint64_t +random_below(uint64_t upper_bound) +{ + return fast_bound(random_next(), upper_bound); +} diff --git a/tests/unit/mocks/fibers/random.h b/tests/unit/mocks/fibers/random.h new file mode 100644 index 000000000..b58f7b14e --- /dev/null +++ b/tests/unit/mocks/fibers/random.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include + +extern void random_set_seed(uint64_t seed); +extern uint64_t random_next(void); +extern uint64_t random_below(uint64_t upper_bound); diff --git a/tests/unit/mocks/fibers/rwlock.c b/tests/unit/mocks/fibers/rwlock.c new file mode 100644 index 000000000..d69915bbf --- /dev/null +++ b/tests/unit/mocks/fibers/rwlock.c @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
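random.c above pairs a splitmix64-expanded seed with a RomuDuoJr generator and a multiply-high reduction as its fast alternative to x % n, so the entire fiber interleaving is reproducible from a single seed. A minimal sketch of how a test might pin it:

#include "random.h"

static void
example_reproducible_run(uint64_t seed)
{
	/* Reseeding makes every random_below() choice -- and therefore the
	 * scheduler's interleaving -- identical across runs with this seed. */
	random_set_seed(seed);

	uint64_t raw = random_next();    /* full 64-bit output */
	uint64_t idx = random_below(10); /* bounded to [0, 10) without a modulo */
	(void)raw;
	(void)idx;
}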
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "rwlock.h" +#include "random.h" + +#include +#include + +#ifdef __BUILDING_WITH_TSAN__ +#include +#endif + +void +fibers_rwlock_init(fibers_rwlock_t *rwlock) +{ + rwlock->writer_active = NULL; + rwlock->reader_count = 0; + rwlock->reader_wait_queue = (struct fibers_queue){0, 0}; + rwlock->writer_wait_queue = (struct fibers_queue){0, 0}; + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_create(rwlock, __tsan_mutex_not_static); +#endif +} + +static void +fibers_rwlock_rdlock_helper(fibers_rwlock_t *rwlock, bool check_may_yield) +{ +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_lock(rwlock, __tsan_mutex_read_lock); +#endif + + // stop a reader if there are writers waiting (RANGELOCKINGTODO rdar://150845975 use the PRNG to choose?) + if (rwlock->writer_active != NULL || rwlock->writer_wait_queue.count > 0) { + FIBERS_LOG(FIBERS_LOG_DEBUG, "waiting for read lock %p (writer %p active, %d writers waiting)", + rwlock, rwlock->writer_active, rwlock->writer_wait_queue.count); + if (check_may_yield) { + FIBERS_ASSERT(fibers_current->may_yield_disabled == 0, "fibers_rwlock_rdlock_helper: waiting on rwlock with fibers_current->may_yield_disabled not 0"); + } + + fibers_queue_push(&rwlock->reader_wait_queue, fibers_current); +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_divert(rwlock, 0); +#endif + fibers_choose_next(FIBER_WAIT); +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_divert(rwlock, 0); +#endif + FIBERS_ASSERT(rwlock->writer_active == NULL, "fibers_rwlock_rdlock_helper: woken up while writer %d still active", rwlock->writer_active ? 
rwlock->writer_active->id : -1); + } else { + rwlock->reader_count++; + FIBERS_LOG(FIBERS_LOG_DEBUG, "acquired read lock %p (now %u readers)", rwlock, rwlock->reader_count); + } + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(rwlock, __tsan_mutex_read_lock, 0); +#endif + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_DID_LOCK); +} + +static int +fibers_rwlock_try_rdlock_helper(fibers_rwlock_t *rwlock) +{ +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_lock(rwlock, __tsan_mutex_try_read_lock); +#endif + + if (rwlock->writer_active != NULL || rwlock->writer_wait_queue.count > 0) { +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(rwlock, __tsan_mutex_try_read_lock | __tsan_mutex_try_read_lock_failed, 0); +#endif + return EBUSY; + } else { + rwlock->reader_count++; + FIBERS_LOG(FIBERS_LOG_DEBUG, "try acquired read lock %p (now %u readers)", rwlock, rwlock->reader_count); +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(rwlock, __tsan_mutex_try_read_lock, 0); +#endif + return 0; + } +} + +static void +fibers_rwlock_wrlock_helper(fibers_rwlock_t *rwlock, bool check_may_yield) +{ +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_lock(rwlock, 0); +#endif + + if (rwlock->writer_active != NULL || rwlock->reader_count > 0) { + FIBERS_ASSERT(rwlock->writer_active != fibers_current, "fibers_rwlock_wrlock_helper: recursive write lock attempted by %d", fibers_current->id); + FIBERS_LOG(FIBERS_LOG_DEBUG, "waiting for write lock %p (writer %p active, %u readers active)", + rwlock, rwlock->writer_active, rwlock->reader_count); + if (check_may_yield) { + FIBERS_ASSERT(fibers_current->may_yield_disabled == 0, "fibers_rwlock_wrlock_helper: waiting on rwlock with fibers_current->may_yield_disabled not 0"); + } + + fibers_queue_push(&rwlock->writer_wait_queue, fibers_current); +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_divert(rwlock, 0); +#endif + fibers_choose_next(FIBER_WAIT); +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_divert(rwlock, 0); +#endif + FIBERS_ASSERT(rwlock->writer_active == fibers_current, "fibers_rwlock_wrlock_helper: woken up but not writer holder (%p != %p)", rwlock->writer_active, fibers_current); + FIBERS_ASSERT(rwlock->reader_count == 0, "fibers_rwlock_wrlock_helper: woken up as writer but %u readers still active?", rwlock->reader_count); + } else { + FIBERS_LOG(FIBERS_LOG_DEBUG, "acquired write lock %p", rwlock); + rwlock->writer_active = fibers_current; + } + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(rwlock, 0, 0); +#endif + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_DID_LOCK); +} + +static int +fibers_rwlock_try_wrlock_helper(fibers_rwlock_t *rwlock) +{ +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_lock(rwlock, __tsan_mutex_try_lock); +#endif + + if (rwlock->writer_active != NULL || rwlock->reader_count > 0) { +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(rwlock, __tsan_mutex_try_lock | __tsan_mutex_try_lock_failed, 0); +#endif + return EBUSY; + } else { + // Acquire write lock + FIBERS_LOG(FIBERS_LOG_DEBUG, "try acquired write lock %p", rwlock); + rwlock->writer_active = fibers_current; +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(rwlock, __tsan_mutex_try_lock, 0); +#endif + return 0; + } +} + +static void +fibers_rwlock_rdunlock_helper(fibers_rwlock_t *rwlock) +{ + FIBERS_ASSERT(rwlock->writer_active == NULL, "fibers_rwlock_rdunlock_helper: trying to read-unlock while writer %d active", rwlock->writer_active ? 
rwlock->writer_active->id : -1); + FIBERS_ASSERT(rwlock->reader_count > 0, "fibers_rwlock_rdunlock_helper: trying to read-unlock with zero readers"); + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_unlock(rwlock, __tsan_mutex_read_lock); +#endif + + rwlock->reader_count--; + FIBERS_LOG(FIBERS_LOG_DEBUG, "released read lock %p (readers remaining %u)", rwlock, rwlock->reader_count); + + // if last reader out and writers are waiting, wake one writer + if (rwlock->reader_count == 0 && rwlock->writer_wait_queue.count > 0) { + fiber_t new_writer = fibers_queue_pop(&rwlock->writer_wait_queue, random_below(rwlock->writer_wait_queue.count)); + FIBERS_ASSERT(new_writer->state == FIBER_WAIT, "fibers_rwlock_rdunlock_helper: woken writer %d is not FIBER_WAIT", new_writer->id); + FIBERS_LOG(FIBERS_LOG_DEBUG, "waking up writer %d waiting on rwlock %p", new_writer->id, rwlock); + rwlock->writer_active = new_writer; + + fibers_queue_push(&fibers_run_queue, new_writer); + } + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_unlock(rwlock, __tsan_mutex_read_lock); +#endif + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_DID_UNLOCK); +} + +static void +fibers_rwlock_wrunlock_helper(fibers_rwlock_t *rwlock) +{ + FIBERS_ASSERT(rwlock->writer_active == fibers_current, "fibers_rwlock_wrunlock_helper: trying to write-unlock lock not held by current fiber %d (holder %d)", fibers_current->id, rwlock->writer_active ? rwlock->writer_active->id : -1); + FIBERS_ASSERT(rwlock->reader_count == 0, "fibers_rwlock_wrunlock_helper: trying to write-unlock while %u readers active?", rwlock->reader_count); + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_unlock(rwlock, 0); +#endif + + FIBERS_LOG(FIBERS_LOG_DEBUG, "releasing write lock %p", rwlock); + rwlock->writer_active = NULL; + + if (rwlock->writer_wait_queue.count > 0) { + fiber_t new_writer = fibers_queue_pop(&rwlock->writer_wait_queue, random_below(rwlock->writer_wait_queue.count)); + FIBERS_ASSERT(new_writer->state == FIBER_WAIT, "fibers_rwlock_wrunlock_helper: woken writer %d is not FIBER_WAIT", new_writer->id); + FIBERS_LOG(FIBERS_LOG_DEBUG, "waking up writer %d waiting on rwlock %p", new_writer->id, rwlock); + rwlock->writer_active = new_writer; + + fibers_queue_push(&fibers_run_queue, new_writer); + } else if (rwlock->reader_wait_queue.count > 0) { + FIBERS_LOG(FIBERS_LOG_DEBUG, "waking up %d readers waiting on rwlock %p", rwlock->reader_wait_queue.count, rwlock); + + unsigned int initial_count = rwlock->reader_wait_queue.count; + while (rwlock->reader_wait_queue.count > 0) { + fiber_t new_reader = fibers_queue_pop(&rwlock->reader_wait_queue, random_below(rwlock->reader_wait_queue.count)); + FIBERS_ASSERT(new_reader->state == FIBER_WAIT, "fibers_rwlock_wrunlock_helper: woken reader %d is not FIBER_WAIT", new_reader->id); + rwlock->reader_count++; + + fibers_queue_push(&fibers_run_queue, new_reader); + } + FIBERS_ASSERT(rwlock->reader_count == initial_count, "fibers_rwlock_wrunlock_helper: reader count mismatch after waking readers (%u != %u)", rwlock->reader_count, initial_count); + FIBERS_LOG(FIBERS_LOG_DEBUG, "rwlock %p now held by %u readers", rwlock, rwlock->reader_count); + } + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_unlock(rwlock, 0); +#endif + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_DID_UNLOCK); +} + +void +fibers_rwlock_rdlock(fibers_rwlock_t *rwlock, bool check_may_yield) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_LOCK); + fibers_rwlock_rdlock_helper(rwlock, 
check_may_yield); +} + +void +fibers_rwlock_wrlock(fibers_rwlock_t *rwlock, bool check_may_yield) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_LOCK); + fibers_rwlock_wrlock_helper(rwlock, check_may_yield); +} + +int +fibers_rwlock_try_rdlock(fibers_rwlock_t *rwlock) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_LOCK); + int err = fibers_rwlock_try_rdlock_helper(rwlock); + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_MUTEX | + FIBERS_YIELD_REASON_ERROR_IF(err != 0)); + return err; +} + +int +fibers_rwlock_try_wrlock(fibers_rwlock_t *rwlock) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_LOCK); + int err = fibers_rwlock_try_wrlock_helper(rwlock); + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_MUTEX | + FIBERS_YIELD_REASON_ERROR_IF(err != 0)); + return err; +} + +void +fibers_rwlock_rdunlock(fibers_rwlock_t *rwlock) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_UNLOCK); + fibers_rwlock_rdunlock_helper(rwlock); +} + +void +fibers_rwlock_wrunlock(fibers_rwlock_t *rwlock) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_UNLOCK); + fibers_rwlock_wrunlock_helper(rwlock); +} + +void +fibers_rwlock_unlock(fibers_rwlock_t *rwlock) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_UNLOCK); + if (rwlock->writer_active) { + fibers_rwlock_wrunlock_helper(rwlock); + } else { + fibers_rwlock_rdunlock_helper(rwlock); + } +} + +void +fibers_rwlock_destroy(fibers_rwlock_t *rwlock) +{ + FIBERS_ASSERT(rwlock->writer_active == NULL, "fibers_rwlock_destroy: tried to destroy rwlock with active writer %d", rwlock->writer_active ? rwlock->writer_active->id : -1); + FIBERS_ASSERT(rwlock->reader_count == 0, "fibers_rwlock_destroy: tried to destroy rwlock with %u active readers", rwlock->reader_count); + FIBERS_ASSERT(rwlock->reader_wait_queue.count == 0, "fibers_rwlock_destroy: tried to destroy rwlock with %d waiting readers", rwlock->reader_wait_queue.count); + FIBERS_ASSERT(rwlock->writer_wait_queue.count == 0, "fibers_rwlock_destroy: tried to destroy rwlock with %d waiting writers", rwlock->writer_wait_queue.count); + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_destroy(rwlock, __tsan_mutex_not_static); +#endif + + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_MUTEX | + FIBERS_YIELD_REASON_MUTEX_DESTROY | + FIBERS_YIELD_REASON_ORDER_POST); +} + +bool +fibers_rwlock_upgrade(fibers_rwlock_t *rwlock) +{ + fibers_may_yield_with_prob(FIBERS_INTERNAL_YIELD_PROB); + + FIBERS_ASSERT(rwlock->writer_active == NULL, "fibers_rwlock_upgrade: trying to upgrade lock while writer %d active", rwlock->writer_active ? 
rwlock->writer_active->id : -1); + FIBERS_ASSERT(rwlock->reader_count > 0, "fibers_rwlock_upgrade: trying to upgrade with zero readers"); + + // if another fiber want to upgrade fail, release the lock and bail out + if (rwlock->flags & FIBERS_RWLOCK_WANT_UPGRADE) { + fibers_rwlock_rdunlock_helper(rwlock); + return false; + } + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_unlock(rwlock, __tsan_mutex_read_lock); +#endif + + // mark that we want to upgrade and we arrived here first + rwlock->flags |= FIBERS_RWLOCK_WANT_UPGRADE; + rwlock->reader_count--; + + // wait for the other readers to finish + if (rwlock->reader_count > 0) { + FIBERS_LOG(FIBERS_LOG_DEBUG, "fibers_rwlock_upgrade: waiting for remaining readers (%u) to finish on rwlock %p", rwlock->reader_count, rwlock); + + fibers_queue_push(&rwlock->writer_wait_queue, fibers_current); + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_divert(rwlock, 0); +#endif + fibers_choose_next(FIBER_WAIT); +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_divert(rwlock, 0); +#endif + + // when we wake up, we should be the only ones holding the lock. + FIBERS_ASSERT(rwlock->writer_active == fibers_current, "fibers_rwlock_upgrade: woken up but not writer holder (%p != %p)", rwlock->writer_active, fibers_current); + FIBERS_ASSERT(rwlock->reader_count == 0, "fibers_rwlock_upgrade: woken up as writer but %u readers still active?", rwlock->reader_count); + } else { + // we were the only reader, so we can immediately become the writer. + FIBERS_LOG(FIBERS_LOG_DEBUG, "fibers_rwlock_upgrade: no other readers, acquiring write lock %p", rwlock); + rwlock->writer_active = fibers_current; + } + + rwlock->flags &= ~FIBERS_RWLOCK_WANT_UPGRADE; + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_unlock(rwlock, __tsan_mutex_read_lock); + __tsan_mutex_pre_lock(rwlock, 0); + __tsan_mutex_post_lock(rwlock, 0, 0); +#endif + fibers_may_yield_with_prob(FIBERS_INTERNAL_YIELD_PROB); + + return true; +} + +void +fibers_rwlock_downgrade(fibers_rwlock_t *rwlock) +{ + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_WILL_UNLOCK); + + FIBERS_ASSERT(rwlock->writer_active == fibers_current, "fibers_rwlock_downgrade: trying to downgrade lock not held exclusively by current fiber %d (holder %d)", fibers_current->id, rwlock->writer_active ? 
rwlock->writer_active->id : -1); + FIBERS_ASSERT(rwlock->reader_count == 0, "fibers_rwlock_downgrade: trying to downgrade while %u readers unexpectedly active?", rwlock->reader_count); + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_pre_unlock(rwlock, 0); +#endif + + FIBERS_LOG(FIBERS_LOG_DEBUG, "downgrading write lock %p to read lock", rwlock); + + // release the write hold, acquire a read hold for the current fiber + rwlock->writer_active = NULL; + rwlock->reader_count = 1; + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_unlock(rwlock, 0); + __tsan_mutex_pre_lock(rwlock, __tsan_mutex_read_lock); +#endif + + if (rwlock->reader_wait_queue.count > 0) { + FIBERS_LOG(FIBERS_LOG_DEBUG, "downgrade: waking up %d readers waiting on rwlock %p", rwlock->reader_wait_queue.count, rwlock); + unsigned int initial_woken_count = rwlock->reader_wait_queue.count; + unsigned int readers_woken = 0; + while (rwlock->reader_wait_queue.count > 0) { + fiber_t new_reader = fibers_queue_pop(&rwlock->reader_wait_queue, random_below(rwlock->reader_wait_queue.count)); + FIBERS_ASSERT(new_reader->state == FIBER_WAIT, "fibers_rwlock_downgrade: woken reader %d is not FIBER_WAIT", new_reader->id); + rwlock->reader_count++; + readers_woken++; + fibers_queue_push(&fibers_run_queue, new_reader); + // TSan: Each woken reader will execute its post_lock upon resuming. + } + FIBERS_ASSERT(readers_woken == initial_woken_count, "fibers_rwlock_downgrade: reader wakeup count mismatch (%u != %u)", readers_woken, initial_woken_count); + FIBERS_LOG(FIBERS_LOG_DEBUG, "rwlock %p now held by %u readers after downgrade", rwlock, rwlock->reader_count); + } else { + FIBERS_LOG(FIBERS_LOG_DEBUG, "rwlock %p now held by 1 reader (self) after downgrade", rwlock); + } + +#ifdef __BUILDING_WITH_TSAN__ + __tsan_mutex_post_lock(rwlock, __tsan_mutex_read_lock, 0); +#endif + + fibers_may_yield_internal_with_reason(FIBERS_YIELD_REASON_MUTEX_DID_UNLOCK); +} + +void +fibers_rwlock_assert(fibers_rwlock_t *rwlock, unsigned int type) +{ + fiber_t current = fibers_current; + bool condition_met = false; + const char *fail_msg = "Unknown assertion failure"; + + switch (type) { + case FIBERS_RWLOCK_ASSERT_SHARED: + if (rwlock->reader_count > 0 && rwlock->writer_active == NULL) { + condition_met = true; + } else { + fail_msg = "Lock not held in shared mode"; + } + break; + + case FIBERS_RWLOCK_ASSERT_EXCLUSIVE: + if (rwlock->writer_active == current && rwlock->reader_count == 0) { + condition_met = true; + } else { + fail_msg = "Lock not held exclusively by current fiber"; + } + break; + + case FIBERS_RWLOCK_ASSERT_HELD: + if ((rwlock->reader_count > 0 && rwlock->writer_active == NULL) || + (rwlock->writer_active == current && rwlock->reader_count == 0)) { + condition_met = true; + } else { + fail_msg = "Lock not held by current fiber (exclusively) or any fiber (shared)"; + } + break; + + case FIBERS_RWLOCK_ASSERT_NOTHELD: + if (rwlock->reader_count == 0 && rwlock->writer_active == NULL) { + condition_met = true; + } else { + fail_msg = "Lock is held"; + } + break; + + case FIBERS_RWLOCK_ASSERT_NOT_OWNED: + if (rwlock->writer_active != current) { + condition_met = true; + } else { + fail_msg = "Lock is held exclusively by current fiber"; + } + break; + + default: + fail_msg = "Unknown assertion type requested"; + break; + } + + FIBERS_ASSERT( + condition_met, + "fibers_rwlock_assert(%p) failed: type=0x%x (%s). State: writer=%d, readers=%u", (void *)rwlock, type, fail_msg, + rwlock->writer_active ? 
rwlock->writer_active->id : -1, + rwlock->reader_count + ); +} diff --git a/tests/unit/mocks/fibers/rwlock.h b/tests/unit/mocks/fibers/rwlock.h new file mode 100644 index 000000000..de917dc4a --- /dev/null +++ b/tests/unit/mocks/fibers/rwlock.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include "fibers.h" + +#define FIBERS_RWLOCK_ASSERT_SHARED 0x01 +#define FIBERS_RWLOCK_ASSERT_EXCLUSIVE 0x02 +#define FIBERS_RWLOCK_ASSERT_HELD 0x03 +#define FIBERS_RWLOCK_ASSERT_NOTHELD 0x04 +#define FIBERS_RWLOCK_ASSERT_NOT_OWNED 0x05 + +#define FIBERS_RWLOCK_WANT_UPGRADE 0x1 + +typedef struct fibers_rwlock fibers_rwlock_t; + +struct fibers_rwlock { + fiber_t writer_active; + unsigned int reader_count; + unsigned int flags; + + struct fibers_queue reader_wait_queue; + struct fibers_queue writer_wait_queue; +}; + +extern void fibers_rwlock_init(fibers_rwlock_t *rwlock); +extern void fibers_rwlock_rdlock(fibers_rwlock_t *rwlock, bool check_may_yield); +extern void fibers_rwlock_wrlock(fibers_rwlock_t *rwlock, bool check_may_yield); +extern int fibers_rwlock_try_rdlock(fibers_rwlock_t *rwlock); +extern int fibers_rwlock_try_wrlock(fibers_rwlock_t *rwlock); +extern void fibers_rwlock_rdunlock(fibers_rwlock_t *rwlock); +extern void fibers_rwlock_wrunlock(fibers_rwlock_t *rwlock); +extern void fibers_rwlock_unlock(fibers_rwlock_t *rwlock); +extern void fibers_rwlock_destroy(fibers_rwlock_t *rwlock); +extern bool fibers_rwlock_upgrade(fibers_rwlock_t *rwlock); +extern void fibers_rwlock_downgrade(fibers_rwlock_t *rwlock); +extern void fibers_rwlock_assert(fibers_rwlock_t *rwlock, unsigned int type); diff --git a/tests/unit/mocks/mock_3rd_party.c b/tests/unit/mocks/mock_3rd_party.c new file mode 100644 index 000000000..f2bcf57e7 --- /dev/null +++ b/tests/unit/mocks/mock_3rd_party.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
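The rwlock above layers writer preference, upgrade, and downgrade on the same wait-queue machinery; fibers_rwlock_upgrade returns false (and drops the read hold) when another fiber already requested the upgrade, so the caller must re-acquire before retrying. An illustrative sketch, assuming fibers_rwlock_init ran during test setup:

#include "fibers.h"
#include "rwlock.h"

static fibers_rwlock_t example_rw;
static int shared_state;

static void
example_reader_to_writer(void)
{
	fibers_rwlock_rdlock(&example_rw, true);
	fibers_rwlock_assert(&example_rw, FIBERS_RWLOCK_ASSERT_SHARED);

	if (shared_state == 0) {
		if (fibers_rwlock_upgrade(&example_rw)) {
			/* now exclusive: the read hold was converted in place */
			shared_state = 1;
			fibers_rwlock_downgrade(&example_rw); /* back to shared */
		} else {
			/* lost the upgrade race: the read hold is already gone,
			 * so take the write lock from scratch */
			fibers_rwlock_wrlock(&example_rw, true);
			shared_state = 1;
			fibers_rwlock_wrunlock(&example_rw);
			return;
		}
	}

	fibers_rwlock_rdunlock(&example_rw);
}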
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include +#include + +// This usually comes from the tightbeam library. It is not needed by the tester, so it is mocked here as a no-op +void +tb_transport_startup(void) +{ +} + +#if !__BUILDING_FOR_COVERAGE__ +// These __llvm_* functions usually come from the cc_kext library and are used for profiling. +// This functionality is not needed by the tester so they are mocked here as no-ops to resolve their linking. +// Linking to cc_kext is not possible because it is compiled with -mkernel. +// When building unit-tests for coverage, these functions come from the user-mode coverage lib, so they shouldn't +// be redefined here. +uint64_t +__llvm_profile_get_size_for_buffer_internal(const char *DataBegin, + const char *DataEnd, + const char *CountersBegin, + const char *CountersEnd, + const char *NamesBegin, + const char *NamesEnd) +{ + return 0; +} + +int +__llvm_profile_write_buffer_internal(char *Buffer, + const char *DataBegin, + const char *DataEnd, + const char *CountersBegin, + const char *CountersEnd, + const char *NamesBegin, + const char *NamesEnd) +{ + return 0; +} +#endif // !__BUILDING_FOR_COVERAGE__ + +// These __firehose_* functions usually come from libfirehose_kernel and are used for logging. +// This functionality is not needed and is mocked by the tester, so these functions are mocked as no-ops +// to resolve their linking. +// Linking to libfirehose_kernel is not possible because it is compiled with -mkernel + +firehose_buffer_t +__firehose_buffer_create(size_t *size) +{ + return NULL; +} + +void +__firehose_buffer_tracepoint_flush(firehose_tracepoint_t vat, + firehose_tracepoint_id_u vatid) +{ +} + +firehose_tracepoint_t +__firehose_buffer_tracepoint_reserve(uint64_t stamp, firehose_stream_t stream, + uint16_t pubsize, uint16_t privsize, uint8_t **privptr) +{ + return NULL; +} + +int +__firehose_kernel_configuration_valid(uint8_t chunk_count, uint8_t io_pages) +{ + return 0; +} + +bool +__firehose_merge_updates(firehose_push_reply_t update) +{ + return false; +} + + diff --git a/tests/unit/mocks/mock_alloc.c b/tests/unit/mocks/mock_alloc.c new file mode 100644 index 000000000..558f5f6cf --- /dev/null +++ b/tests/unit/mocks/mock_alloc.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License').
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "std_safe.h" +#include "unit_test_utils.h" + +#include +#include +#include +#include + + +#undef kalloc_ext + +T_MOCK(struct kalloc_result, +kalloc_ext, ( + void *kheap_or_kt_view, + vm_size_t size, + zalloc_flags_t flags, + void *owner)) +{ + void* addr = calloc(1, size); + return (struct kalloc_result){ .addr = addr, .size = size }; +} + + +T_MOCK(void, +kfree_ext, (void *kheap_or_kt_view, void *data, vm_size_t size)) +{ + free(data); +} + +T_MOCK(void *, +kalloc_type_impl_internal, (kalloc_type_view_t kt_view, zalloc_flags_t flags)) +{ + return calloc(1, kt_view->kt_size); +} +T_MOCK(void *, +kalloc_type_impl_external, (kalloc_type_view_t kt_view, zalloc_flags_t flags)) +{ + return calloc(1, kt_view->kt_size); +} + +T_MOCK(kmem_return_t, +kmem_alloc_guard, ( + vm_map_t map, + vm_size_t size, + vm_offset_t mask, + kma_flags_t flags, + kmem_guard_t guard)) +{ + kmem_return_t kmr = { }; + kmr.kmr_address = (vm_address_t)calloc(1, size); + // TODO verify allocation rdar://136915968 + // TODO malloc with guard pages? 
+ kmr.kmr_return = KERN_SUCCESS; + return kmr; +} + +T_MOCK(vm_size_t, +kmem_free_guard, ( + vm_map_t map, + vm_offset_t req_addr, + vm_size_t req_size, + kmf_flags_t flags, + kmem_guard_t guard)) +{ + // TODO rdar://136915968 + return req_size; +} + +T_MOCK(void *, +zalloc_permanent_tag, (vm_size_t size, vm_offset_t mask, vm_tag_t tag)) +{ + // mask is align-1, see ZALIGN() + return checked_alloc_align(size, mask + 1); +} + +T_MOCK(void *, +zalloc_percpu_permanent, (vm_size_t size, vm_offset_t mask)) +{ + return MOCK_zalloc_permanent_tag(size, mask, 0); +} + +T_MOCK(void, +zalloc_ro_mut, (zone_id_t zid, void *elem, vm_offset_t offset, const void *new_data, vm_size_t new_data_size)) +{ + memcpy((void *)((uintptr_t)elem + offset), new_data, new_data_size); +} + +T_MOCK(void, +zone_require, (zone_t zone, void *addr)) +{ + // TODO rdar://136915968 +} + +T_MOCK(void, +zone_id_require, (zone_id_t zid, vm_size_t esize, void *addr)) +{ + // TODO rdar://136915968 +} + +T_MOCK(void, +zone_enable_caching, (zone_t zone)) +{ +} + +void *mock_mem_alloc_vm_object(void); + +T_MOCK(struct kalloc_result, +zalloc_ext, (zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)) +{ + void* addr = NULL; + if (strcmp(zone->z_name, "vm objects") == 0) { + addr = mock_mem_alloc_vm_object(); + } else { + addr = calloc(1, zone->z_elem_size); + } + return (struct kalloc_result){ (void *)addr, zone->z_elem_size }; +} + +T_MOCK(void, +zfree_ext, (zone_t zone, zone_stats_t zstats, void *addr, uint64_t combined_size)) +{ + // TODO rdar://136915968 +} + +T_MOCK(void, +zone_enable_smr, (zone_t zone, struct smr *smr, zone_smr_free_cb_t free_cb)) +{ +} diff --git a/tests/unit/mocks/mock_attached.c b/tests/unit/mocks/mock_attached.c new file mode 100644 index 000000000..e1e240261 --- /dev/null +++ b/tests/unit/mocks/mock_attached.c @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
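mock_alloc.c uses the T_MOCK() wrapper pulled in via the headers above to replace XNU's allocators with plain libc allocations; other fixed-behaviour mocks follow the same shape. A sketch of the pattern with a deliberately hypothetical target function (other mocks remain reachable through their MOCK_ prefix, as MOCK_zalloc_permanent_tag is above):

#include "std_safe.h"
#include "unit_test_utils.h"

/* Hypothetical example only: T_MOCK(return type, name, (argument list)) { body } */
T_MOCK(int, example_not_a_real_xnu_function, (int input))
{
	return input + 1;
}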
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "std_safe.h" +#include "dt_proxy.h" +#include "unit_test_utils.h" + +#include // for vm_map_offset_t +#include +#include +#include + +// This file is linked to the same .dylib as the XNU code static library +// to provide some utilities that XNU code relies on when building for unit-test + +// This symbol normally comes from lastkernelconstructor.o but this object is not linked to libkernel since it is +// a zero length symbol, which fails to prelink with ld -r +// It is defined here again to resolve the external. +// It is used for debugging and in kext related functions which are not needed by the tester +void* last_kernel_symbol = NULL; + +// In normal XNU build this is a global of type mach_header_64 that the linker adds. The user-space linker +// doesn't add it so it's added here to resolve the external. It's not needed by the tester. see +int _mh_execute_header; + +// called from fake_init, setup proxies for darwintest asserts +struct dt_proxy_callbacks *dt_proxy = NULL; +void +set_dt_proxy_attached(struct dt_proxy_callbacks *p) +{ + dt_proxy = p; +} +struct dt_proxy_callbacks * +get_dt_proxy_attached(void) +{ + return dt_proxy; +} + +// check if panic/assert were expected by the test using T_ASSERT_PANIC +struct ut_expected_panic_s ut_expected_panic; + +void +ut_check_expected_panic(const char* panic_str) +{ + if (!ut_expected_panic.expect_panic) { + return; + } + ut_expected_panic.expect_panic = false; + if (ut_expected_panic.str_contains != NULL) { + if (strstr(panic_str, ut_expected_panic.str_contains) == NULL) { + PT_LOG_FMTSTR("Panic with unexpected panic-string, expected: `%s`", ut_expected_panic.str_contains); + return; + } + } + PT_LOG("Panic was expected"); + longjmp(ut_expected_panic.jb, 1); +} + +// This function is called on an assert instead of invoking a brk instruction which would trap the kernel +__attribute__((noreturn)) void +ut_assert_trap(int code, long a, long b, long c) +{ + struct kernel_panic_reason pr = {}; + if (code == MACH_ASSERT_TRAP_CODE) { + panic_assert_format(pr.buf, sizeof(pr.buf), (struct mach_assert_hdr *)a, b, c); + PT_LOG_OR_RAW_FMTSTR("%s", pr.buf); + } else { + snprintf(pr.buf, sizeof(pr.buf), "%x", code); + PT_LOG_OR_RAW_FMTSTR("Unknown assert code %s", pr.buf); + } + + ut_check_expected_panic(pr.buf); // may not return + PT_FAIL("Unexpected assert fail, exiting"); + abort(); +} + +// This function can be called from the tested code to force a context switch when using fibers +// See the mock implementation +void +ut_fibers_ctxswitch(void) +{ +} + +// This function can be called from the tested code to force a context switch to a specific fiber +// See the mock implementation +void +ut_fibers_ctxswitch_to(int fiber_id) +{ +} + +// This function can be called from the tested code to get the current fiber id when using fibers, -1 otherwise +// See the mock implementation +int +ut_fibers_current_id(void) +{ + return -1; +} + +static void +fail_not_mocked() +{ + PT_FAIL("This function should never be called since it is mocked by the mocks dylib"); +} + +// This function is changed from being a macro. 
It needs to have an implementation +// in the code attached to XNU so that it can be mocked +__mockable void +lock_disable_preemption_for_thread(thread_t t) +{ + fail_not_mocked(); +} + +__mockable __attribute__((const)) thread_t +current_thread_fast(void) +{ + fail_not_mocked(); + return NULL; +} + + + +extern vm_map_t vm_map_create_external( + pmap_t pmap, + vm_map_offset_t min, + vm_map_offset_t max, + boolean_t pageable); + +// this function alias is done during linking of XNU, which doesn't happen when building the library +vm_map_t +vm_map_create( + pmap_t pmap, + vm_map_offset_t min, + vm_map_offset_t max, + boolean_t pageable) +{ + return vm_map_create_external(pmap, min, max, pageable); +} diff --git a/tests/unit/mocks/mock_cpu.c b/tests/unit/mocks/mock_cpu.c new file mode 100644 index 000000000..0ca13c7f0 --- /dev/null +++ b/tests/unit/mocks/mock_cpu.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "unit_test_utils.h" +#include "mock_cpu.h" + + +T_MOCK_DYNAMIC(kern_return_t, cpu_signal, + (cpu_data_t * target, cpu_signal_t signal, + void *p0, void *p1), + (target, signal, p0, p1)); + +T_MOCK_DYNAMIC(int, ml_get_max_cpu_number, (void), ()); diff --git a/tests/unit/mocks/mock_cpu.h b/tests/unit/mocks/mock_cpu.h new file mode 100644 index 000000000..951dda888 --- /dev/null +++ b/tests/unit/mocks/mock_cpu.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + +#include +#include "mock_dynamic.h" + + +T_MOCK_DYNAMIC_DECLARE( + kern_return_t, + cpu_signal, + (cpu_data_t * target, cpu_signal_t signal, + void *p0, void *p1)); + + +T_MOCK_DYNAMIC_DECLARE(int, ml_get_max_cpu_number, (void)); diff --git a/tests/unit/mocks/mock_dynamic.h b/tests/unit/mocks/mock_dynamic.h new file mode 100644 index 000000000..4b92c59d4 --- /dev/null +++ b/tests/unit/mocks/mock_dynamic.h @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once + + +/* BEGIN IGNORE CODESTYLE */ + +/* Dynamic mock allows an individual test executable to control what a mock does. + * T_MOCK_DYNAMIC_DECLARE() + * Declare a dynamic mock. This declaration should come in a header file under the mocks/ folder. + * The header file should be included in both the respective .c file and in the test .c file that + * wants to set the behaviour of the mock. + * It declares the signature of the mocked function so that if the signature changes the compiler + * can assure that the mock and its setters are in sync. + * T_MOCK_DYNAMIC() + * Define the dynamic mock. This should come in a .c file under the mocks/ folder. + * This defines the mock function itself using the T_MOCK() macro. + * + * The test has 4 possible way to control the mock. It can temporarily set the return value, + * it can set a temporary block callback, it can set a permanent return value or a permanent function. + * @argument args_def is how the function arguments are defined in a function definition. + * This can be copy-pasted directly from the original function definition. 
+ * @argument args_invoke is how the same arguments are passed to a function call
+ * @argument (optional) default_action should be a scope of code that will be executed if no mock control
+ *              is set up. It can reference the arguments in args_def and also call the original
+ *              function. If this argument is not supplied, the default action is to call the original XNU
+ *              function with the same arguments.
+ *
+ * Example:
+ * // we want to mock a function from XNU that has the signature:
+ * size_t foobar(int a, char b);
+ *
+ * // in a header in the mocks library (tests/unit/mocks) add:
+ * T_MOCK_DYNAMIC_DECLARE(size_t, foobar, (int a, char b));
+ *
+ * // in a .c file in the mock library (tests/unit/mocks) add:
+ * T_MOCK_DYNAMIC(size_t, foobar, (int a, char b), (a, b), { return 0; });
+ *
+ * // Now to control the mock, in a T_DECL test you can do:
+ * T_DECL(test, "test") {
+ *     T_MOCK_SET_RETVAL(foobar, size_t, 42);
+ *     // ... call into XNU which will call foobar()
+ *
+ *     T_MOCK_SET_CALLBACK(foobar, size_t, (int a, char b), {
+ *        T_ASSERT_EQ(a, b, "args equal");
+ *        return a + b;
+ *     });
+ *     // ... call into XNU which will call foobar()
+ * }
+ *
+ * // The third option is to define a permanent return value for the mock that will
+ * // be in effect for all tests in the executable.
+ * // This essentially overrides the default-value that's defined in the T_MOCK_DYNAMIC()
+ * T_MOCK_SET_PERM_RETVAL(foobar, size_t, 43);
+ *
+ * // The fourth option is for the test to define a permanent function in the global scope
+ * // that will be called every time the mock is called.
+ * T_MOCK_SET_PERM_FUNC(size_t, foobar, (int a, char b)) {
+ *     return b - a;
+ * }
+ *
+ * It's possible for multiple mock controls of different types to be active at the same time. The priority
+ * in which the dynamic mock tries to find them is:
+ * 1. ret-val
+ * 2. block callback
+ * 3. permanent ret-val / permanent function
+ * The effect of the ret-val and callback setters is limited to the scope they are in. This
+ * is achieved using a cleanup function in the setter.
+ * It is possible for multiple setters of the same type to be invoked during the flow of the same scope.
+ * In that case, the last setter that was invoked is in effect.
+ *
+ * It is not possible to have multiple static function setters and/or permanent ret-val setters for the
+ * same mock in the same test executable. This would cause a compile/link error due to a duplicate symbol.
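+ *
+ * A further illustrative sketch (reusing the hypothetical foobar() mock from the example above,
+ * not an additional API) of how the priority order above plays out when a permanent function and
+ * a scoped ret-val are combined:
+ *
+ * // global scope: default behaviour for every test in this executable
+ * T_MOCK_SET_PERM_FUNC(size_t, foobar, (int a, char b)) {
+ *     return (size_t)(a + b);
+ * }
+ *
+ * T_DECL(test_override, "test_override") {
+ *     // inside this scope the temporary ret-val (priority 1) wins over the permanent function (priority 3)
+ *     T_MOCK_SET_RETVAL(foobar, size_t, 7);
+ *     // ... call into XNU which will call foobar() and observe 7
+ * }
+ * // once the scope ends, the cleanup resets the ret-val and foobar() returns a + b again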
+ */ + +#define _T_MOCK_RETVAL_CALLBACK(name) _mock_retval_callback_ ## name +#define _T_MOCK_CALLBACK(name) _mock_callback_ ## name +#define _T_MOCK_PERM_RETVAL_FUNC(name) _mock_p_retval_func_ ## name +#define _T_MOCK_PERM_FUNC(name) _mock_func_ ## name + +#define T_MOCK_DYNAMIC_DECLARE(ret, name, args_def) \ + extern ret (^_T_MOCK_RETVAL_CALLBACK(name))(void); \ + extern ret (^_T_MOCK_CALLBACK(name)) args_def; \ + extern ret (*_T_MOCK_PERM_RETVAL_FUNC(name))(void); \ + extern ret (*_T_MOCK_PERM_FUNC(name)) args_def; \ + extern ret name args_def + +#define _T_MOCK_DYNAMIC_WITH_IMPL(ret, name, args_def, args_invoke, default_action) \ + ret (^_T_MOCK_RETVAL_CALLBACK(name)) (void) = NULL; \ + ret (^_T_MOCK_CALLBACK(name)) args_def = NULL; \ + ret (*_T_MOCK_PERM_RETVAL_FUNC(name)) (void) = NULL; \ + ret (*_T_MOCK_PERM_FUNC(name)) args_def = NULL; \ + T_MOCK(ret, name, args_def) { \ + if (_T_MOCK_RETVAL_CALLBACK(name) != NULL) { \ + return _T_MOCK_RETVAL_CALLBACK(name)(); \ + } \ + if (_T_MOCK_CALLBACK(name) != NULL) { \ + return _T_MOCK_CALLBACK(name) args_invoke; \ + } \ + if (_T_MOCK_PERM_RETVAL_FUNC(name) != NULL) { \ + return _T_MOCK_PERM_RETVAL_FUNC(name)(); \ + } \ + if (_T_MOCK_PERM_FUNC(name) != NULL) { \ + return _T_MOCK_PERM_FUNC(name) args_invoke; \ + } \ + default_action; \ + } + +#define _T_MOCK_DYNAMIC_DEFAULT_IMPL(ret, name, args_def, args_invoke) \ + _T_MOCK_DYNAMIC_WITH_IMPL(ret, name, args_def, args_invoke, { return name args_invoke; }) + +/* T_MOCK_DYNAMIC() selects which of the above versions to call depending on the number of arguments it gets + * - T_MOCK_DYNAMIC(a, b, c, d) with 4 arguments expands to + * _T_MOCK_GET_INSTANCE(a, b, c, d, _T_MOCK_DYNAMIC_WITH_IMPL, _T_MOCK_DYNAMIC_DEFAULT_IMPL)(a, b, c, d) + * then NAME is _T_MOCK_DYNAMIC_DEFAULT_IMPL so this expands to + * _T_MOCK_DYNAMIC_DEFAULT_IMPL(a, b, c, d) + * - T_MOCK_DYNAMIC(a, b, c, d, e) with 5 arguments expands to + * _T_MOCK_GET_INSTANCE(a, b, c, d, e, _T_MOCK_DYNAMIC_WITH_IMPL, _T_MOCK_DYNAMIC_DEFAULT_IMPL)(a, b, c, d, e) + * then NAME is _T_MOCK_DYNAMIC_WITH_IMPL so this expands to + * _T_MOCK_DYNAMIC_WITH_IMPL(a, b, c, e, e) + */ +#define _T_MOCK_GET_INSTANCE(_1, _2, _3, _4, _5, NAME, ...) NAME +#define T_MOCK_DYNAMIC(...) _T_MOCK_GET_INSTANCE(__VA_ARGS__, _T_MOCK_DYNAMIC_WITH_IMPL, _T_MOCK_DYNAMIC_DEFAULT_IMPL)(__VA_ARGS__) + + + +#define _UT_CONCAT2(a, b) a ## b +#define _UT_CONCAT(a, b) _UT_CONCAT2(a, b) + +static inline void +_mock_set_cleaner(void ***ptr) { + **ptr = NULL; +} + +/* How it works? + * - For each mock that is defined using T_MOCK_DYNAMIC() the macro above defines a few + * global variables with the function name suffixed, and also defines the mock function to check + * these global variables. + * - The test executable can then set any of them using the T_MOCK_SET_X() macros below + * - T_MOCK_SET_RETVAL() and T_MOCK_SET_CALLBACK() should be used from inside T_DECL and have a + * cleaner that undoes their effect at the end of the scope they are defined in. 
+ * The cleaner has a __COUNTER__ concatenated so that it's possible to have more than one such + * T_MOCK_SET_X() invocation in the same scope + * - T_MOCK_SET_PERM_RETVAL() and T_MOCK_SET_PERM_FUNC() should be used in the global scope + * and has a constructor function that sets the global variable when the executable loads + */ + +#define _T_MOCK_CLEANER(name) _UT_CONCAT(_cleaner_ ## name, __COUNTER__) +#define _T_MOCK_RETVAL_CAPTURE(name, N) _UT_CONCAT(_mock_retval_capture_ ## name, N) + +/* to set a return value, we set a global that holds a callback block that returns the value. + * The callback variable is a pointer and NULL indicates it's not set + * The value expression the user gives is first captured in a local variable since some + * expressions can't be captured by a block (array reference for instance) */ +#define _T_MOCK_SET_RETVAL_IMPL(name, ret, val, N) \ + ret _T_MOCK_RETVAL_CAPTURE(name, N) = val; \ + _T_MOCK_RETVAL_CALLBACK(name) = ^ret(void) { return _T_MOCK_RETVAL_CAPTURE(name, N); }; \ + __attribute__((cleanup(_mock_set_cleaner))) void **_T_MOCK_CLEANER(name) = \ + (void**)&_T_MOCK_RETVAL_CALLBACK(name) +#define T_MOCK_SET_RETVAL(name, ret, val) _T_MOCK_SET_RETVAL_IMPL(name, ret, val, __COUNTER__) + +/* to set a mock callback block from the user we set a dedicated callback for that, so it doesn't + * interfere with SET_RETVAL */ +#define T_MOCK_SET_CALLBACK(name, ret, args_def, body) \ + _T_MOCK_CALLBACK(name) = ^ret args_def body; \ + __attribute__((cleanup(_mock_set_cleaner))) void **_T_MOCK_CLEANER(name) = \ + (void**)&_T_MOCK_CALLBACK(name) + +#define _T_MOCK_CTOR_SETTER(name) _ctor_setter_ ## name +#define _T_MOCK_PERM_HOOK(name) PERM_HOOK_ ## name + +/* To set a permanent return value, we define a function that returns it, and set it to the + * extern global in a constructor. + * This setter needs to be in the global scope of the tester */ +#define T_MOCK_SET_PERM_RETVAL(name, ret, val) \ + ret _T_MOCK_PERM_HOOK(name)(void) { return (val); } \ + __attribute__((constructor)) void _T_MOCK_CTOR_SETTER(name)() { \ + _T_MOCK_PERM_RETVAL_FUNC(name) = _T_MOCK_PERM_HOOK(name); \ + } + +/* To set a permanent function that will be called from the mock we declare it, set it to the extern + * in a constructor and define it. + * This needs to be in the global scope and the body of the function needs to follows it immediately */ +#define T_MOCK_SET_PERM_FUNC(ret, name, args_def) \ + ret _T_MOCK_PERM_HOOK(name) args_def; \ + __attribute__((constructor)) void _T_MOCK_CTOR_SETTER(name)() { \ + _T_MOCK_PERM_FUNC(name) = _T_MOCK_PERM_HOOK(name); \ + } \ + ret _T_MOCK_PERM_HOOK(name) args_def + + +/* T_MOCK_CALL_QUEUE() + * Allow tests to define a call expectation queue for a mock + * + * This macro wraps a definition of a struct and defines easy helpers to + * manage a global queue of elements of that struct. + * A test can use this along with a mock callback to verify and control what the mock + * does in every call it gets. + * @argument type_name the name of the struct to define + * @argument struct_body the elements of the struct + * + * Example: + * // for mocking the function foobar() we'll define a struct that will allow the mock + * // to verify its arguments and control its return value. The elements of the struct can + * // be anything. 
+ * T_MOCK_CALL_QUEUE(fb_call, { + * int expected_a_eq; + * bool expected_b_small; + * size_t ret_val; + * }) + * + * T_MOCK_SET_PERM_FUNC(size_t, foobar, (int a, char b)) { + * fb_call call = dequeue_fb_call(); + * T_ASSERT_EQ(a, call.expected_a_eq, "a arg"); + * if (call.expected_b_small) + * T_ASSERT_LE(b, 127, "b arg too big"); + * return call.ret_val; + * } + * + * // in the test we set up the expected calls before calling the code that ends up in the mock + * T_DECL(test, "test") { + * enqueue_fb_call( (fb_call){ .expected_a = 1, .expected_b = 2, .ret_val = 3 }); + * enqueue_fb_call( (fb_call){ .expected_a = 10, .expected_b = 20, .ret_val = 30 }); + * // ... call into XNU which will call foobar() + * assert_empty_fb_call(); // check all calls were consumed + * } + */ + +#define _T_MOCK_CALL_LST(type_name) _lst_ ## type_name + +#define T_MOCK_CALL_QUEUE(type_name, struct_body) \ + typedef struct s_ ## type_name struct_body type_name; \ + struct _node_ ## type_name { \ + STAILQ_ENTRY(_node_ ## type_name) next; \ + type_name d; \ + }; \ + static STAILQ_HEAD(, _node_ ## type_name) _T_MOCK_CALL_LST(type_name) = \ + STAILQ_HEAD_INITIALIZER(_T_MOCK_CALL_LST(type_name)); \ + static void enqueue_ ## type_name (type_name value) { \ + struct _node_ ## type_name *node = calloc(1, sizeof(struct _node_ ## type_name)); \ + node->d = value; \ + STAILQ_INSERT_TAIL(&_T_MOCK_CALL_LST(type_name), node, next); \ + } \ + static type_name dequeue_ ## type_name (void) { \ + struct _node_ ## type_name *node = STAILQ_FIRST(&_T_MOCK_CALL_LST(type_name)); \ + T_QUIET; T_ASSERT_NOTNULL(node, "consumed too many " #type_name); \ + type_name d = node->d; \ + STAILQ_REMOVE_HEAD(&_T_MOCK_CALL_LST(type_name), next); \ + free(node); \ + return d; \ + } \ + static void assert_empty_ ## type_name (void) { \ + T_QUIET; T_ASSERT_TRUE( STAILQ_EMPTY(&_T_MOCK_CALL_LST(type_name)), \ + "calls not fully consumed " #type_name); \ + } \ + static void clear_ ## type_name (void) { \ + STAILQ_INIT(&_T_MOCK_CALL_LST(type_name)); \ + } + +/* END IGNORE CODESTYLE */ diff --git a/tests/unit/mocks/mock_mem.c b/tests/unit/mocks/mock_mem.c new file mode 100644 index 000000000..141c1f248 --- /dev/null +++ b/tests/unit/mocks/mock_mem.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "std_safe.h" +#include "dt_proxy.h" +#include "unit_test_utils.h" + +/* This is an implementation of simple fixed size same-size objects pool */ +struct mock_mem_pool { + size_t elem_size; + char *buffer; + char *free_head; + uint32_t free_count; +}; + +void +mock_mem_init(struct mock_mem_pool* mm, size_t elem_sz, uint32_t count) +{ + mm->elem_size = elem_sz; + size_t buf_size = elem_sz * count; + mm->buffer = aligned_alloc(8, buf_size); + PT_QUIET; PT_ASSERT_NOTNULL(mm->buffer, "failed alloc"); + memset(mm->buffer, 0, buf_size); + mm->free_head = mm->buffer; + mm->free_count = count; +} + +void * +mock_mem_alloc(struct mock_mem_pool* mm) +{ + PT_QUIET; PT_ASSERT_NOTNULL(mm->buffer, "mock mem not allocated"); + PT_QUIET; PT_ASSERT_TRUE(mm->free_count > 0, "no more space left"); + void *ret = mm->free_head; + mm->free_head += mm->elem_size; + mm->free_count--; + return ret; +} + +void +mock_mem_free(struct mock_mem_pool* mm, void *ptr) +{ + // not implemeted yet rdar://136915968 +} + +struct mock_mem_pool mm_vm_objects; + + +// this is used for vm_object and vm_page pointer packing +uintptr_t mock_page_ptr_base; + +void +mock_mem_init_vm_objects(void) +{ + mock_mem_init(&mm_vm_objects, 256, 100); + mock_page_ptr_base = (uintptr_t)mm_vm_objects.buffer; +} +void * +mock_mem_alloc_vm_object(void) +{ + return mock_mem_alloc(&mm_vm_objects); +} diff --git a/tests/unit/mocks/mock_misc.c b/tests/unit/mocks/mock_misc.c new file mode 100644 index 000000000..9719af803 --- /dev/null +++ b/tests/unit/mocks/mock_misc.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "mock_misc.h" +#include "std_safe.h" +#include "unit_test_utils.h" +#include "dt_proxy.h" + +#include "fibers/random.h" + +#include +#include +#include + +// This initialized the darwintest asserts proxies in the mocks .dylib +struct dt_proxy_callbacks *dt_proxy = NULL; +void +set_dt_proxy_mock(struct dt_proxy_callbacks *p) +{ + dt_proxy = p; +} +struct dt_proxy_callbacks * +get_dt_proxy_mock(void) +{ + return dt_proxy; +} + + +// for cpu_data_startup_init +T_MOCK(unsigned int, +ml_get_cpu_count, (void)) +{ + return 1; +} + +T_MOCK(vm_offset_t, +min_valid_stack_address, (void)) +{ + return 0; +} + +T_MOCK(vm_offset_t, +max_valid_stack_address, (void)) +{ + return 0; +} + +T_MOCK(u_int32_t, +RandomULong, (void)) +{ + return (u_int32_t)random_next(); +} + +T_MOCK(uint64_t, +early_random, (void)) +{ + return random_next(); +} + +// needed because in-kernel impl for some reason got to libcorecrypt dyld +T_MOCK(void, +read_erandom, (void * buffer, unsigned int numBytes)) +{ + unsigned char *cbuf = (unsigned char *)buffer; + for (int i = 0; i < numBytes; ++i) { + cbuf[i] = (unsigned char)(random_next() % 0xFF); + } +} + +T_MOCK(void, +read_random, (void * buffer, unsigned int numbytes)) +{ + read_erandom(buffer, numbytes); +} + +T_MOCK(uint32_t, +PE_get_random_seed, (unsigned char *dst_random_seed, uint32_t request_size)) +{ + for (uint32_t i = 0; i < request_size; i++, dst_random_seed++) { + *dst_random_seed = 0; + } + return request_size; +} + +T_MOCK(bool, +ml_unsafe_kernel_text, (void)) +{ + return true; +} + + +T_MOCK(__attribute__((noinline, not_tail_called)) void, +os_log_with_args, (void* oslog, uint8_t type, const char *fmt, va_list args, void *addr)) +{ + char buf[PRINT_BUF_SIZE]; + int printed = vsnprintf(buf, PRINT_BUF_SIZE, fmt, args); + if (printed > PRINT_BUF_SIZE - 1) { + printed = PRINT_BUF_SIZE - 1; + } +#if 0 // this can be switched on if we want pre-main logs + buf[printed] = '\n'; + write(STDOUT_FILENO, buf, printed); +#else + PT_LOG(buf); +#endif +} + + +// The panic() mock works in conjunction with T_ASSERT_PANIC() +// XNU code that panics doesn't expect panic() to return so any function that calls panic() doesn't bother +// to return gracefully to its caller with an error. +// In a unit-test we still want to call a function that is expected to panic, and then be able to run code after it. +// T_ASSERT_PANIC creates a setjmp() point before the call that is expected to panic. +// Once the panic callback panic_trap_to_debugger() is called it does a longjmp() to that jump point. +// This has a similar effect as C++ exceptions, except that any memory allocations performed by the code +// prior to the panic are going to be leaked. + +T_MOCK(void, +panic_trap_to_debugger, (const char *panic_format_str, va_list * panic_args, +unsigned int reason, void *ctx, uint64_t panic_options_mask, void *panic_data, +unsigned long panic_caller, const char *panic_initiator)) +{ + char buf[PRINT_BUF_SIZE]; + vsnprintf(buf, PRINT_BUF_SIZE, panic_format_str, *panic_args); + PT_LOG_OR_RAW_FMTSTR("panic! 
%s", buf); + ut_check_expected_panic(buf); // may not return + PT_FAIL("Panic was unexpected, exiting"); + abort(); +} + +T_MOCK(void, +vm_sanitize_send_telemetry, ( + vm_sanitize_method_t method, + vm_sanitize_checker_t checker, + vm_sanitize_checker_count_t checker_count, + enum vm_sanitize_subsys_error_codes ktriage_code, + uint64_t arg1, + uint64_t arg2, + uint64_t arg3, + uint64_t arg4, + uint64_t future_ret, + uint64_t past_ret)) +{ +} + +#if (DEBUG || DEVELOPMENT) + +T_MOCK(vm_size_t, +zone_element_info, ( + void *addr, + vm_tag_t * ptag)) +{ + return 0; +} + +#endif // DEBUG || DEVELOPMENT + +// added for setup_nested_submap() +T_MOCK(kern_return_t, +csm_setup_nested_address_space, ( + pmap_t pmap, + const vm_address_t region_addr, + const vm_size_t region_size)) +{ + return KERN_SUCCESS; +} + +T_MOCK(btref_t, +btref_get, ( + void *fp, + btref_get_flags_t flags)) +{ + return 0; +} + +#if (DEBUG || DEVELOPMENT) +// these are used for testing the mocking framework, xnu has them only in development || debug +T_MOCK_DYNAMIC(size_t, kernel_func1, (int a, char b), (a, b), { return 0; }); +T_MOCK_DYNAMIC(size_t, kernel_func2, (int a, char b), (a, b), { return 0; }); +T_MOCK_DYNAMIC(size_t, kernel_func3, (int a, char b), (a, b), { return 0; }); +T_MOCK_DYNAMIC(size_t, kernel_func4, (int a, char b), (a, b), { return 0; }); +T_MOCK_DYNAMIC(size_t, kernel_func5, (int a, char b), (a, b), { return kernel_func5(a, b); }); +T_MOCK_DYNAMIC(void, kernel_func6, (int a, char b), (a, b), { kernel_func6(a, b); }); +T_MOCK_DYNAMIC(size_t, kernel_func7, (int a, char b), (a, b)); +T_MOCK_DYNAMIC(void, kernel_func8, (int a, char b), (a, b)); +#endif // DEBUG || DEVELOPMENT diff --git a/tests/unit/mocks/mock_misc.h b/tests/unit/mocks/mock_misc.h new file mode 100644 index 000000000..e38f042a8 --- /dev/null +++ b/tests/unit/mocks/mock_misc.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once +#include "std_safe.h" +#include "mock_dynamic.h" + + +T_MOCK_DYNAMIC_DECLARE(size_t, kernel_func1, (int a, char b)); +T_MOCK_DYNAMIC_DECLARE(size_t, kernel_func2, (int a, char b)); +T_MOCK_DYNAMIC_DECLARE(size_t, kernel_func3, (int a, char b)); +T_MOCK_DYNAMIC_DECLARE(size_t, kernel_func4, (int a, char b)); +T_MOCK_DYNAMIC_DECLARE(size_t, kernel_func5, (int a, char b)); +T_MOCK_DYNAMIC_DECLARE(void, kernel_func6, (int a, char b)); +T_MOCK_DYNAMIC_DECLARE(size_t, kernel_func7, (int a, char b)); +T_MOCK_DYNAMIC_DECLARE(void, kernel_func8, (int a, char b)); diff --git a/tests/unit/mocks/mock_pmap.c b/tests/unit/mocks/mock_pmap.c new file mode 100644 index 000000000..10195ad68 --- /dev/null +++ b/tests/unit/mocks/mock_pmap.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "std_safe.h" +#include "unit_test_utils.h" +#include "mock_pmap.h" + +#include + +T_MOCK(void *, +pmap_steal_memory, (vm_size_t size, vm_size_t alignment)) +{ + return checked_alloc_align(size, alignment); +} + + +T_MOCK(void, +pmap_startup, (vm_offset_t * startp, vm_offset_t * endp)) +{ + // TODO rdar://136915968 +} + +T_MOCK(boolean_t, +pmap_virtual_region, (unsigned int region_select, vm_map_offset_t * startp, vm_map_size_t * size)) +{ + return false; // TODO rdar://136915968 +} + +extern const struct page_table_attr * const native_pt_attr; + + +T_MOCK(pmap_t, +pmap_create_options, ( + ledger_t ledger, + vm_map_size_t size, + unsigned int flags)) +{ + pmap_t p = (pmap_t)calloc(1, sizeof(struct pmap)); + // this is needed for pmap_shared_region_size_min() + p->pmap_pt_attr = native_pt_attr; + + return p; +} + +T_MOCK(void, +pmap_set_nested, ( + pmap_t pmap)) +{ +} + +T_MOCK(kern_return_t, +pmap_nest, ( + pmap_t grand, + pmap_t subord, + addr64_t vstart, + uint64_t size)) +{ + return KERN_SUCCESS; +} + +T_MOCK(kern_return_t, +pmap_unnest_options, ( + pmap_t grand, + addr64_t vaddr, + uint64_t size, + unsigned int option)) +{ + return KERN_SUCCESS; +} + +T_MOCK(void, +pmap_remove_options, ( + pmap_t pmap, + vm_map_address_t start, + vm_map_address_t end, + int options)) +{ +} + +T_MOCK(void, +pmap_destroy, ( + pmap_t pmap)) +{ +} +T_MOCK_DYNAMIC(uint64_t, + pmap_shared_region_size_min, (pmap_t pmap), (pmap), +{ + // the default behaviour for arm64 + return 0x0000000002000000ULL; +}) + +T_MOCK_DYNAMIC( + unsigned int, + pmap_cache_attributes, + (ppnum_t phys), (phys), + { return 0; }) + +T_MOCK_DYNAMIC( + pmap_paddr_t, + kvtophys, + (vm_offset_t offs), (offs), + { return 0; }) diff --git a/tests/unit/mocks/mock_pmap.h b/tests/unit/mocks/mock_pmap.h new file mode 100644 index 000000000..7ed9b90ec --- /dev/null +++ b/tests/unit/mocks/mock_pmap.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#pragma once
+
+#include 
+#include 
+#include "mock_dynamic.h"
+
+T_MOCK_DYNAMIC_DECLARE(
+	unsigned int,
+	pmap_cache_attributes,
+	(ppnum_t phys));
+
+T_MOCK_DYNAMIC_DECLARE(
+	pmap_paddr_t,
+	kvtophys,
+	(vm_offset_t offs));
+
+T_MOCK_DYNAMIC_DECLARE(
+	uint64_t,
+	pmap_shared_region_size_min,
+	(pmap_t pmap));
+
+// This is a useful override for some tests that don't want to deal with huge sizes
+// due to the pmap min region size
+#define T_MOCK_pmap_shared_region_size_min_RET_PAGE_SIZE() \
+	T_MOCK_SET_RETVAL(pmap_shared_region_size_min, uint64_t, PAGE_SIZE)
diff --git a/tests/unit/mocks/mock_thread.c b/tests/unit/mocks/mock_thread.c
new file mode 100644
index 000000000..95a0193bc
--- /dev/null
+++ b/tests/unit/mocks/mock_thread.c
@@ -0,0 +1,1741 @@
+/*
+ * Copyright (c) 2000-2025 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include "std_safe.h"
+#include "dt_proxy.h"
+#include "mock_thread.h"
+#include "unit_test_utils.h"
+#include "mock_thread.h"
+
+#include "fibers/fibers.h"
+#include "fibers/mutex.h"
+#include "fibers/condition.h"
+#include "fibers/rwlock.h"
+#include "fibers/random.h"
+#include "fibers/checker.h"
+
+#include  // for cpu_data
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define UNDEFINED_MOCK \
+	raw_printf("%s: WIP mock, this should not be called\n", __FUNCTION__); \
+	print_current_backtrace();
+
+/*
+ * Unit tests that want to use fibers must redefine this global with a non-zero value.
+ * The test executable should not do this directly; instead it should invoke the UT_USE_FIBERS macro in its global scope.
+ *
+ * We use a weak global rather than a macro that defines a constructor: initialization code that runs before
+ * such a constructor would still observe ut_mocks_use_fibers=0, i.e. before the constructor had changed its value.
+ * Switching from the pthread mocks to fibers is not supported; the choice must be consistent from the very beginning.
+ */
+int ut_mocks_use_fibers __attribute__((weak)) = 0;
+
+/*
+ * Unit tests that want to use fibers with data race checking must redefine this global with a non-zero value.
+ * Setting FIBERS_CHECKER=1 as an env var will do the same job.
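+ *
+ * Illustrative sketch (an assumption about the mechanism, not copied from the macros themselves):
+ * the UT_USE_FIBERS / UT_FIBERS_* macros are expected to boil down to strong definitions in the
+ * test executable's global scope that override these weak defaults, e.g.
+ *
+ *   int ut_mocks_use_fibers = 1;              // run the mocks on fibers
+ *   int ut_fibers_use_data_race_checker = 1;  // also enable the data race checker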
+ */
+int ut_fibers_use_data_race_checker __attribute__((weak)) = 0;
+
+/*
+ * Unit tests can set this variable to force `lck_rw_lock_shared_to_exclusive` to fail.
+ *
+ * RANGELOCKINGTODO rdar://150846598 model when to return FALSE
+ */
+bool ut_mocks_lock_upgrade_fail = 0;
+
+/*
+ * This constructor is used to set the configuration variables of the fibers using env vars.
+ * The main use case is fuzzing; unit tests should set the variables in the test function or
+ * by calling the corresponding macros (UT_FIBERS_*, see mock_thread.h) in their global scope.
+ */
+__attribute__((constructor))
+static void
+initialize_fiber_settings(void)
+{
+	const char *debug_env = getenv("FIBERS_DEBUG");
+	if (debug_env != NULL) {
+		fibers_debug = atoi(debug_env);
+	}
+
+	const char *err_env = getenv("FIBERS_ABORT_ON_ERROR");
+	if (err_env != NULL) {
+		fibers_abort_on_error = atoi(err_env);
+	}
+
+	const char *verbose_env = getenv("FIBERS_LOG");
+	if (verbose_env != NULL) {
+		fibers_log_level = atoi(verbose_env);
+	}
+
+	const char *prob_env = getenv("FIBERS_MAY_YIELD_PROB");
+	if (prob_env != NULL) {
+		fibers_may_yield_probability = atoi(prob_env);
+	}
+
+	const char *checker_env = getenv("FIBERS_CHECK_RACES");
+	if (checker_env != NULL) {
+#ifndef __BUILDING_WITH_SANCOV_LOAD_STORES__
+		raw_printf("==== Fibers data race checker disabled ====\n");
+		raw_printf("You cannot enable the data race checker if the FIBERS_PREEMPTION=1 flag was not used as a make parameter.");
+		return;
+#else
+		if (!ut_mocks_use_fibers) {
+			raw_printf("==== Fibers data race checker disabled ====\n");
+			raw_printf("You cannot enable the data race checker if the test is not using fibers (see UT_USE_FIBERS in the readme).");
+			return;
+		}
+		ut_fibers_use_data_race_checker = atoi(checker_env);
+		if (ut_fibers_use_data_race_checker) {
+			raw_printf("==== Fibers data race checker enabled ====\n");
+		} else {
+			raw_printf("==== Fibers data race checker disabled ====\n");
+		}
+#endif // __BUILDING_WITH_SANCOV_LOAD_STORES__
+	}
+}
+
+// --------------- proc and thread ------------------
+
+struct proc;
+typedef struct proc * proc_t;
+
+extern void init_thread_from_template(thread_t thread);
+extern void ctid_table_init(void);
+extern void ctid_table_add(thread_t thread);
+extern void ctid_table_remove(thread_t thread);
+extern void thread_ro_create(task_t parent_task, thread_t th, thread_ro_t tro_tpl);
+extern task_t proc_get_task_raw(proc_t proc);
+extern void task_zone_init(void);
+
+extern struct compact_id_table ctid_table;
+extern lck_grp_t thread_lck_grp;
+extern size_t proc_struct_size;
+extern proc_t kernproc;
+
+void mock_init_proc(proc_t p, void* (*calloc_call)(size_t, size_t));
+
+// a pointer to this object is kept per thread in thread-local storage
+struct mock_thread {
+	struct thread th;
+	fiber_t fiber;
+	struct mock_thread* wq_next;
+	bool interrupts_disabled;
+};
+
+struct pthread_mock_event_table_entry {
+	event_t ev;
+	pthread_cond_t cond;
+	// the condition variable is owned by the table and is initialized on the first use of the entry
+	bool cond_inited;
+};
+#define PTHREAD_EVENTS_TABLE_SIZE 1000
+
+struct mock_process_state {
+	void *proctask; // buffer for proc and task
+	struct proc *main_proc;
+	struct task *main_task;
+	struct cpu_data cpud;
+	struct mock_thread *main_thread;
+	uint64_t thread_unique_id;
+	uint64_t _faults;
+	uint64_t _pageins;
+	uint64_t _cow_faults;
+
+	// pthread
+	pthread_key_t tls_thread_key;
+	pthread_mutex_t interrupts_mutex; // if this mutex is locked interrupts are disabled
+	
pthread_mutex_t events_mutex; // for all event condition variables + struct pthread_mock_event_table_entry events[PTHREAD_EVENTS_TABLE_SIZE]; + // !pthread + + // fibers + int interrupts_disabled; + // !fibers +}; + +static void +mock_destroy_thread(void *th_p) +{ + struct mock_thread *mth = (struct mock_thread *)th_p; + // raw_printf("thread_t finished ctid=%u\n", mth->th.ctid); + + ctid_table_remove(&mth->th); + + free(mth->th.t_tro); + free(mth); +} + +static struct mock_thread * +mock_init_new_thread(struct mock_process_state* s) +{ + struct mock_thread *new_mock_thread = calloc(1, sizeof(struct mock_thread)); + struct thread *new_thread = &new_mock_thread->th; + + if (ut_mocks_use_fibers) { + new_mock_thread->fiber = fibers_current; + fibers_current->extra = new_mock_thread; + fibers_current->extra_cleanup_routine = &mock_destroy_thread; + } else { + pthread_setspecific(s->tls_thread_key, new_mock_thread); + } + + static int mock_init_new_thread_first_call = 1; + if (mock_init_new_thread_first_call) { + mock_init_new_thread_first_call = 0; + compact_id_table_init(&ctid_table); + ctid_table_init(); + } + + init_thread_from_template(new_thread); + + // maybe call thread_create_internal() ? + // machine is needed by _enable_preemption_write_count() + machine_thread_create(new_thread, s->main_task, true); + new_thread->machine.CpuDatap = &s->cpud; + new_thread->thread_id = ++s->thread_unique_id; + //new_thread->ctid = (uint32_t)new_thread->thread_id; + ctid_table_add(new_thread); + + thread_lock_init(new_thread); + wake_lock_init(new_thread); + + fake_init_lock(&new_thread->mutex); + + new_thread->t_tro = calloc(1, sizeof(struct thread_ro)); + new_thread->t_tro->tro_owner = new_thread; + new_thread->t_tro->tro_task = s->main_task; + new_thread->t_tro->tro_proc = s->main_proc; + + // for the main thread this happens before zalloc init so don't do the following which uses zalloc + //struct thread_ro tro_tpl = { }; + //thread_ro_create(&s->main_task, new_thread, &tro_tpl); + + new_thread->state = TH_RUN; + + // raw_printf("thread_t created ctid=%u\n", new_thread->ctid); + return new_mock_thread; +} + +void +fake_init_task(task_t new_task) +{ + // can't call task_create_internal() since it does zalloc + fake_init_lock(&new_task->lock); + fake_init_lock(&new_task->task_objq_lock); + queue_init(&new_task->task_objq); + queue_init(&new_task->threads); + new_task->suspend_count = 0; + new_task->thread_count = 0; + new_task->active_thread_count = 0; + new_task->user_stop_count = 0; + new_task->legacy_stop_count = 0; + new_task->active = TRUE; + new_task->halting = FALSE; + new_task->priv_flags = 0; + new_task->t_flags = 0; + new_task->t_procflags = 0; + new_task->t_returnwaitflags = 0; + new_task->importance = 0; + new_task->crashed_thread_id = 0; + new_task->watchports = NULL; + new_task->t_rr_ranges = NULL; + + new_task->bank_context = NULL; + + new_task->pageins = calloc(1, sizeof(uint64_t)); + + fake_init_lock(&new_task->task_objq_lock); + queue_init(&new_task->task_objq); +} + +static void +mock_init_threads_state(struct mock_process_state* s) +{ + //task_zone_init(); + s->proctask = calloc(1, proc_struct_size + sizeof(struct task)); + s->main_proc = (proc_t)s->proctask; + s->main_task = proc_get_task_raw(s->main_proc); + + memset(s->main_proc, 0, proc_struct_size); + mock_init_proc(s->main_proc, calloc); + kernproc = s->main_proc; // set global variable + + memset(s->main_task, 0, sizeof(*s->main_task)); + fake_init_task(s->main_task); + s->_faults = 0; + s->main_task->faults = &s->_faults; 
+ s->_pageins = 0; + s->main_task->pageins = &s->_pageins; + s->_cow_faults = 0; + s->main_task->cow_faults = &s->_cow_faults; + + kernel_task = s->main_task; // without this machine_thread_create allocates + + cpu_data_init(&s->cpud); + s->thread_unique_id = 100; + + if (!ut_mocks_use_fibers) { + int ret = pthread_key_create(&s->tls_thread_key, &mock_destroy_thread); + if (ret != 0) { + raw_printf("failed pthread_key_create"); + exit(1); + } + + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + ret = pthread_mutex_init(&s->interrupts_mutex, &attr); + if (ret != 0) { + raw_printf("failed pthread_key_create"); + exit(1); + } + pthread_mutexattr_destroy(&attr); + + ret = pthread_mutex_init(&s->events_mutex, NULL); + if (ret != 0) { + raw_printf("failed pthread_key_create"); + exit(1); + } + memset(&s->events, 0, sizeof(s->events)); + } + + s->main_thread = mock_init_new_thread(s); +} + +struct mock_process_state * +get_proc_state(void) +{ + static struct mock_process_state s; + static bool initialized = false; + if (!initialized) { // TODO move to fake_kinit.c ? + initialized = true; + mock_init_threads_state(&s); + } + return &s; +} + +struct mock_thread * +get_mock_thread(void) +{ + struct mock_process_state *s = get_proc_state(); + + struct mock_thread *mth; + if (ut_mocks_use_fibers) { + mth = (struct mock_thread *)fibers_current->extra; + } else { + mth = pthread_getspecific(s->tls_thread_key); + } + + if (mth == NULL) { + mth = mock_init_new_thread(s); + } + return mth; +} + +T_MOCK(thread_t, +current_thread_fast, (void)) +{ + return &get_mock_thread()->th; +} + +T_MOCK(uint32_t, +kauth_cred_getuid, (void* cred)) +{ + return 0; +} + +// --------------- interrupts disable (spl) --------------------- + +T_MOCK(boolean_t, +ml_get_interrupts_enabled, (void)) +{ + if (ut_mocks_use_fibers) { + return get_mock_thread()->interrupts_disabled == 0; + } else { + pthread_mutex_t *m = &get_proc_state()->interrupts_mutex; + int r = pthread_mutex_trylock(m); + if (r == 0) { + // it's locked, meaning interrupts are disabled + pthread_mutex_unlock(m); + return false; + } + PT_QUIET; PT_ASSERT_TRUE(r == EBUSY, "unexpected value in get_interrupts_enabled"); + return true; + } +} + +// original calls DAIF +// interupts disable is mocked by disabling context switches with fiber_t.may_yield_disabled +T_MOCK(boolean_t, +ml_set_interrupts_enabled, (boolean_t enable)) +{ + if (ut_mocks_use_fibers) { + bool prev_interrupts_disabled = get_mock_thread()->interrupts_disabled; + + FIBERS_LOG(FIBERS_LOG_DEBUG, "ml_set_interrupts_enabled: enable=%d, previous state=%d, may_yield_disabled=%d", enable, !get_mock_thread()->interrupts_disabled, fibers_current->may_yield_disabled); + + fibers_may_yield_internal_with_reason( + (enable ? 
FIBERS_YIELD_REASON_PREEMPTION_WILL_ENABLE : FIBERS_YIELD_REASON_PREEMPTION_WILL_DISABLE) | + FIBERS_YIELD_REASON_ERROR_IF(enable != prev_interrupts_disabled)); + + // Track the interrupt state per fiber through yield_disabled + if (enable && prev_interrupts_disabled) { + get_mock_thread()->interrupts_disabled = false; + fibers_current->may_yield_disabled--; + } else if (!enable && !prev_interrupts_disabled) { + get_mock_thread()->interrupts_disabled = true; + fibers_current->may_yield_disabled++; + } + + FIBERS_LOG(FIBERS_LOG_DEBUG, "ml_set_interrupts_enabled exit: enable=%d, state=%d, may_yield_disabled=%d", enable, !get_mock_thread()->interrupts_disabled, fibers_current->may_yield_disabled); + + fibers_may_yield_internal_with_reason( + (enable ? FIBERS_YIELD_REASON_PREEMPTION_DID_ENABLE : FIBERS_YIELD_REASON_PREEMPTION_DID_DISABLE) | + FIBERS_YIELD_REASON_ERROR_IF(enable != prev_interrupts_disabled)); + + return !prev_interrupts_disabled; + } else { + pthread_mutex_t *m = &get_proc_state()->interrupts_mutex; + if (enable) { + int ret = pthread_mutex_unlock(m); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "interrupts pthread_mutex_unlock"); + } else { + // disable interrupts locks + int ret = pthread_mutex_lock(m); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "interrupts pthread_mutex_lock"); + } + } + return true; +} + +T_MOCK(boolean_t, +ml_set_interrupts_enabled_with_debug, (boolean_t enable, boolean_t __unused debug)) +{ + return MOCK_ml_set_interrupts_enabled(enable); +} + +T_MOCK(void, +_disable_preemption, (void)) +{ + if (ut_mocks_use_fibers) { + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_PREEMPTION_WILL_DISABLE | + FIBERS_YIELD_REASON_ERROR_IF(fibers_current->may_yield_disabled != 0)); + + fibers_current->may_yield_disabled++; + + FIBERS_LOG(FIBERS_LOG_DEBUG, "disable_preemption: may_yield_disabled=%d", fibers_current->may_yield_disabled); + + thread_t thread = MOCK_current_thread_fast(); + unsigned int count = thread->machine.preemption_count; + os_atomic_store(&thread->machine.preemption_count, count + 1, compiler_acq_rel); + + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_PREEMPTION_DID_DISABLE | + FIBERS_YIELD_REASON_ERROR_IF(fibers_current->may_yield_disabled != 1)); + } else { + pthread_mutex_t *m = &get_proc_state()->interrupts_mutex; + + int ret = pthread_mutex_lock(m); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "_disable_preemption pthread_mutex_lock"); + + thread_t thread = MOCK_current_thread_fast(); + unsigned int count = thread->machine.preemption_count; + os_atomic_store(&thread->machine.preemption_count, count + 1, compiler_acq_rel); + } +} + +T_MOCK(void, +_disable_preemption_without_measurements, (void)) +{ + MOCK__disable_preemption(); +} + +T_MOCK(void, +lock_disable_preemption_for_thread, (thread_t t)) +{ + MOCK__disable_preemption(); +} + +T_MOCK(void, +_enable_preemption, (void)) +{ + if (ut_mocks_use_fibers) { + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_PREEMPTION_WILL_ENABLE | + FIBERS_YIELD_REASON_ERROR_IF(fibers_current->may_yield_disabled != 1)); + + fibers_current->may_yield_disabled--; + + FIBERS_LOG(FIBERS_LOG_DEBUG, "enable_preemption: may_yield_disabled=%d", fibers_current->may_yield_disabled); + + thread_t thread = current_thread(); + unsigned int count = thread->machine.preemption_count; + os_atomic_store(&thread->machine.preemption_count, count - 1, compiler_acq_rel); + + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_PREEMPTION_DID_ENABLE | + 
FIBERS_YIELD_REASON_ERROR_IF(fibers_current->may_yield_disabled != 0)); + } else { + thread_t thread = current_thread(); + unsigned int count = thread->machine.preemption_count; + os_atomic_store(&thread->machine.preemption_count, count - 1, compiler_acq_rel); + + pthread_mutex_t *m = &get_proc_state()->interrupts_mutex; + + int ret = pthread_mutex_unlock(m); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "_enable_preemption pthread_mutex_unlock"); + } +} + +// --------------- mutex ------------------ + +struct mock_lck_mtx_t { + union { + pthread_mutex_t *pt_m; + fibers_mutex_t *f_m; + }; + lck_mtx_state_t lck_mtx; +}; +static_assert(sizeof(struct mock_lck_mtx_t) == sizeof(lck_mtx_t)); + +void +fake_init_lock(lck_mtx_t * lck) +{ + struct mock_lck_mtx_t* mlck = (struct mock_lck_mtx_t*)lck; + if (ut_mocks_use_fibers) { + mlck->f_m = calloc(1, sizeof(fibers_mutex_t)); + fibers_mutex_init(mlck->f_m); + } else { + mlck->pt_m = calloc(1, sizeof(pthread_mutex_t)); + int ret = pthread_mutex_init(mlck->pt_m, NULL); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "pthread_mutex_init"); + } +} + +T_MOCK(void, +lck_mtx_init, (lck_mtx_t * lck, lck_grp_t * grp, lck_attr_t * attr)) +{ + fake_init_lock(lck); +} + +T_MOCK(void, +lck_mtx_destroy, (lck_mtx_t * lck, lck_grp_t * grp)) +{ + struct mock_lck_mtx_t* mlck = (struct mock_lck_mtx_t*)lck; + if (ut_mocks_use_fibers) { + fibers_mutex_destroy(mlck->f_m); + free(mlck->f_m); + mlck->f_m = NULL; + } else { + int ret = pthread_mutex_destroy(mlck->pt_m); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "pthread_mutex_destroy"); + free(mlck->pt_m); + mlck->pt_m = NULL; + } +} + +T_MOCK(void, +lck_mtx_lock, (lck_mtx_t * lock)) +{ + uint32_t ctid = MOCK_current_thread_fast()->ctid; + + struct mock_lck_mtx_t* mlck = (struct mock_lck_mtx_t*)lock; + if (ut_mocks_use_fibers) { + fibers_mutex_lock(mlck->f_m, true); + } else { + int ret = pthread_mutex_lock(mlck->pt_m); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "pthread_mutex_lock"); + } + mlck->lck_mtx.owner = ctid; +} + +T_MOCK(void, +lck_mtx_lock_spin, (lck_mtx_t * lock)) +{ + uint32_t ctid = MOCK_current_thread_fast()->ctid; + + struct mock_lck_mtx_t* mlck = (struct mock_lck_mtx_t*)lock; + if (ut_mocks_use_fibers) { + fibers_mutex_lock(mlck->f_m, false); // do not check for disabled preemption if spinlock + } else { + int ret = pthread_mutex_lock(mlck->pt_m); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "pthread_mutex_lock"); + } + mlck->lck_mtx.owner = ctid; +} + +T_MOCK(boolean_t, +lck_mtx_try_lock, (lck_mtx_t * lock)) +{ + uint32_t ctid = MOCK_current_thread_fast()->ctid; + + struct mock_lck_mtx_t* mlck = (struct mock_lck_mtx_t*)lock; + int ret; + if (ut_mocks_use_fibers) { + ret = fibers_mutex_try_lock(mlck->f_m); + } else { + int ret = pthread_mutex_trylock(mlck->pt_m); + } + if (ret == 0) { + mlck->lck_mtx.owner = ctid; + return TRUE; + } else { + return FALSE; + } +} + +T_MOCK(void, +lck_mtx_unlock, (lck_mtx_t * lock)) +{ + struct mock_lck_mtx_t* mlck = (struct mock_lck_mtx_t*)lock; + mlck->lck_mtx.owner = 0; + if (ut_mocks_use_fibers) { + fibers_mutex_unlock(mlck->f_m); + } else { + int ret = pthread_mutex_unlock(mlck->pt_m); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "pthread_mutex_unlock"); + } +} + +T_MOCK(void, +mutex_pause, (uint32_t collisions)) +{ + if (ut_mocks_use_fibers) { + // we can't sleep to not break determinism, trigger a ctxswitch instead + fibers_yield(); + } else { + mutex_pause(collisions); + } +} + +// --------------- rwlocks ------------------ + +struct mock_lck_rw_t { + fibers_rwlock_t *rw; + // lck_rw_word_t lck_rw; // 
RANGELOCKINGTODO rdar://150846598 + uint32_t lck_rw_owner; +}; +static_assert(sizeof(struct mock_lck_rw_t) == sizeof(lck_rw_t)); + +static_assert(LCK_RW_ASSERT_SHARED == FIBERS_RWLOCK_ASSERT_SHARED); +static_assert(LCK_RW_ASSERT_EXCLUSIVE == FIBERS_RWLOCK_ASSERT_EXCLUSIVE); +static_assert(LCK_RW_ASSERT_HELD == FIBERS_RWLOCK_ASSERT_HELD); +static_assert(LCK_RW_ASSERT_NOTHELD == FIBERS_RWLOCK_ASSERT_NOTHELD); + +void +fake_init_rwlock(struct mock_lck_rw_t *mlck) +{ + mlck->rw = calloc(1, sizeof(fibers_rwlock_t)); + fibers_rwlock_init(mlck->rw); +} + +static boolean_t +fake_rw_try_lock(struct mock_lck_rw_t *mlck, lck_rw_type_t lck_rw_type) +{ + int ret; + // RANGELOCKINGTODO rdar://150846598 handle old lock can_sleep + lck_rw_lock_count_inc(MOCK_current_thread_fast(), (const void*)mlck); + + if (lck_rw_type == LCK_RW_TYPE_SHARED) { + ret = fibers_rwlock_try_rdlock(mlck->rw); + } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) { + ret = fibers_rwlock_try_wrlock(mlck->rw); + if (ret == 0) { + mlck->lck_rw_owner = MOCK_current_thread_fast()->ctid; + } + } else { + PT_FAIL("lck_rw_try_lock: Invalid lock type"); + } + + if (ret != 0) { + // RANGELOCKINGTODO rdar://150846598 handle old lock can_sleep + lck_rw_lock_count_dec(MOCK_current_thread_fast(), (const void*)mlck); + } + return ret == 0; +} + +static bool +fake_rw_lock_would_yield_exclusive(struct mock_lck_rw_t *mlck, lck_rw_yield_t mode) +{ + fibers_rwlock_assert(mlck->rw, FIBERS_RWLOCK_ASSERT_EXCLUSIVE); + + bool yield = false; + if (mode == LCK_RW_YIELD_ALWAYS) { + yield = true; + } else { + if (mlck->rw->writer_wait_queue.count > 0) { + yield = true; + } else if (mode == LCK_RW_YIELD_ANY_WAITER) { + yield = (mlck->rw->reader_wait_queue.count != 0); + } + } + return yield; +} + +T_MOCK(void, +lck_rw_init, ( + lck_rw_t * lck, + lck_grp_t * grp, + lck_attr_t * attr)) +{ + if (!ut_mocks_use_fibers) { + lck_rw_init(lck, grp, attr); + return; + } + + // RANGELOCKINGTODO rdar://150846598 mock attr, especially lck_rw_can_sleep + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + fake_init_rwlock(mlck); +} + +T_MOCK(void, +lck_rw_destroy, (lck_rw_t * lck, lck_grp_t * grp)) +{ + if (!ut_mocks_use_fibers) { + lck_rw_destroy(lck, grp); + return; + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + fibers_rwlock_destroy(mlck->rw); + free(mlck->rw); + mlck->rw = NULL; +} + +T_MOCK(void, +lck_rw_unlock, (lck_rw_t * lck, lck_rw_type_t lck_rw_type)) +{ + if (!ut_mocks_use_fibers) { + lck_rw_unlock(lck, lck_rw_type); + return; + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + if (mlck->rw->writer_active) { + mlck->lck_rw_owner = 0; + } + fibers_rwlock_unlock(mlck->rw); + + // RANGELOCKINGTODO rdar://150846598 handle old lock can_sleep + lck_rw_lock_count_dec(MOCK_current_thread_fast(), (const void*)mlck); +} + +static void +lck_rw_old_mock_unlock_shared(lck_rw_t * lck) +{ + if (!ut_mocks_use_fibers) { + lck_rw_unlock_shared(lck); + return; + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + fibers_rwlock_rdunlock(mlck->rw); + + // RANGELOCKINGTODO rdar://150846598 handle old lock can_sleep + lck_rw_lock_count_dec(MOCK_current_thread_fast(), (const void*)mlck); +} + +T_MOCK(void, +lck_rw_unlock_shared, (lck_rw_t * lck)) +{ + lck_rw_old_mock_unlock_shared(lck); +} + +T_MOCK(void, +lck_rw_unlock_exclusive, (lck_rw_t * lck)) +{ + if (!ut_mocks_use_fibers) { + lck_rw_unlock_exclusive(lck); + return; + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + mlck->lck_rw_owner = 0; + 
fibers_rwlock_wrunlock(mlck->rw); + + // RANGELOCKINGTODO rdar://150846598 handle old lock can_sleep + lck_rw_lock_count_dec(MOCK_current_thread_fast(), (const void*)mlck); +} + +T_MOCK(void, +lck_rw_lock_exclusive, (lck_rw_t * lck)) +{ + if (!ut_mocks_use_fibers) { + lck_rw_lock_exclusive(lck); + return; + } + + // RANGELOCKINGTODO rdar://150846598 handle old lock can_sleep + lck_rw_lock_count_inc(MOCK_current_thread_fast(), (const void*)lck); + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + fibers_rwlock_wrlock(mlck->rw, true); + mlck->lck_rw_owner = MOCK_current_thread_fast()->ctid; +} + +T_MOCK(void, +lck_rw_lock_shared, (lck_rw_t * lck)) +{ + if (!ut_mocks_use_fibers) { + lck_rw_lock_shared(lck); + return; + } + + // RANGELOCKINGTODO rdar://150846598 handle old lock can_sleep + lck_rw_lock_count_inc(MOCK_current_thread_fast(), (const void*)lck); + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + fibers_rwlock_rdlock(mlck->rw, true); +} + +T_MOCK(boolean_t, +lck_rw_try_lock, (lck_rw_t * lck, lck_rw_type_t lck_rw_type)) +{ + if (!ut_mocks_use_fibers) { + return lck_rw_try_lock(lck, lck_rw_type); + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + return fake_rw_try_lock(mlck, lck_rw_type); +} + +T_MOCK(boolean_t, +lck_rw_try_lock_exclusive, (lck_rw_t * lck)) +{ + if (!ut_mocks_use_fibers) { + return lck_rw_try_lock_exclusive(lck); + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + return fake_rw_try_lock(mlck, LCK_RW_TYPE_EXCLUSIVE); +} + +T_MOCK(boolean_t, +lck_rw_try_lock_shared, (lck_rw_t * lck)) +{ + if (!ut_mocks_use_fibers) { + return lck_rw_try_lock_shared(lck); + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + return fake_rw_try_lock(mlck, LCK_RW_TYPE_SHARED); +} + +T_MOCK(lck_rw_type_t, +lck_rw_done, (lck_rw_t * lck)) +{ + if (!ut_mocks_use_fibers) { + return lck_rw_done(lck); + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + mlck->lck_rw_owner = 0; + // If there is a writer locking it must be the current fiber or will trigger an assertion in fibers_rwlock_wrunlock + lck_rw_type_t ret = mlck->rw->writer_active ? 
LCK_RW_TYPE_EXCLUSIVE : LCK_RW_TYPE_SHARED; + fibers_rwlock_unlock(mlck->rw); + + // RANGELOCKINGTODO rdar://150846598 handle old lock can_sleep + lck_rw_lock_count_dec(MOCK_current_thread_fast(), (const void*)mlck); + + return ret; +} + +T_MOCK(boolean_t, +lck_rw_lock_shared_to_exclusive, (lck_rw_t * lck)) +{ + if (ut_mocks_lock_upgrade_fail) { + lck_rw_old_mock_unlock_shared(lck); + return false; + } + + if (!ut_mocks_use_fibers) { + return lck_rw_lock_shared_to_exclusive(lck); + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + return fibers_rwlock_upgrade(mlck->rw); +} + +T_MOCK(void, +lck_rw_lock_exclusive_to_shared, (lck_rw_t * lck)) +{ + if (!ut_mocks_use_fibers) { + lck_rw_lock_exclusive_to_shared(lck); + return; + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + fibers_rwlock_downgrade(mlck->rw); +} + +T_MOCK(void, +lck_rw_assert, ( + lck_rw_t * lck, + unsigned int type)) +{ + if (!ut_mocks_use_fibers) { + lck_rw_assert(lck, type); + return; + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + fibers_rwlock_assert(mlck->rw, type); +} + +T_MOCK(bool, +lck_rw_lock_would_yield_exclusive, ( + lck_rw_t * lck, + lck_rw_yield_t mode)) +{ + if (!ut_mocks_use_fibers) { + return lck_rw_lock_would_yield_exclusive(lck, mode); + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + return fake_rw_lock_would_yield_exclusive(mlck, mode); +} + +T_MOCK(bool, +lck_rw_lock_would_yield_shared, (lck_rw_t * lck)) +{ + if (!ut_mocks_use_fibers) { + return lck_rw_lock_would_yield_shared(lck); + } + + struct mock_lck_rw_t* mlck = (struct mock_lck_rw_t*)lck; + fibers_rwlock_assert(mlck->rw, FIBERS_RWLOCK_ASSERT_SHARED); + return mlck->rw->writer_wait_queue.count != 0; +} + +// Note: No need to mock lck_rw_sleep as it uses lck_rw_* API and waitq, we already mock everything the function uses + +// --------------- waitq ------------------ + +/* + * If the 4 bytes of mock_waitq.mock_magic are not matching MOCK_WAITQ_MAGIC + * it means the waitq comes from an unsupported location and was not created with mock_waitq_init(). 
+ */ +#define MOCK_WAITQ_MAGIC 0xb60d0d8f + +struct mock_waitq_extra { + bool valid; + fibers_condition_t cond; + fibers_mutex_t mutex; + + struct mock_thread *waiting_threads; + int waiting_thread_count; // Count of waiting threads +}; + +struct mock_waitq { // 24 bytes + WAITQ_FLAGS(waitq, waitq_eventmask:_EVENT_MASK_BITS); + unsigned int mock_magic; + event64_t current_event; // delete when every waiting thread is removed + struct mock_waitq_extra *extra; +}; + +static_assert(sizeof(struct waitq) == sizeof(struct mock_waitq)); + +#define MWQCAST(xnu_wq) ((struct mock_waitq *)(xnu_wq).wq_q) + +static bool +waitq_use_real_impl(waitq_t wq) +{ + return !ut_mocks_use_fibers || waitq_type(wq) != WQT_QUEUE; +} + +int +mock_waitq_init(struct mock_waitq *wq) +{ + if (!wq) { + return EINVAL; + } + wq->mock_magic = MOCK_WAITQ_MAGIC; + wq->current_event = 0; + + wq->extra = calloc(sizeof(struct mock_waitq_extra), 1); + wq->extra->valid = true; + fibers_mutex_init(&wq->extra->mutex); + + return 0; +} + +int +mock_waitq_destroy(struct mock_waitq *wq) +{ + if (!wq) { + return EINVAL; + } + PT_QUIET; PT_ASSERT_TRUE(wq->mock_magic == MOCK_WAITQ_MAGIC, "missing mock_waitq magic"); + + fibers_condition_destroy(&wq->extra->cond); + fibers_mutex_destroy(&wq->extra->mutex); + free(wq->extra); + wq->extra = NULL; + + return 0; +} + +static inline bool +waitq_should_unlock(waitq_wakeup_flags_t flags) +{ + return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED)) == WAITQ_UNLOCK; +} + +static inline bool +waitq_should_enable_interrupts(waitq_wakeup_flags_t flags) +{ + return (flags & (WAITQ_UNLOCK | WAITQ_KEEP_LOCKED | WAITQ_ENABLE_INTERRUPTS)) == (WAITQ_UNLOCK | WAITQ_ENABLE_INTERRUPTS); +} + + +T_MOCK(void, +waitq_init, (waitq_t wq, waitq_type_t type, int policy)) +{ + if (!ut_mocks_use_fibers || type == WQT_PORT) { + waitq_init(wq, type, policy); + return; + } + + *wq.wq_q = (struct waitq){ + .waitq_type = type, + .waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0), + }; + + // RANGELOCKINGTODO rdar://150846598 + PT_QUIET; PT_ASSERT_TRUE(type == WQT_QUEUE, "invalid waitq type"); + mock_waitq_init(MWQCAST(wq)); + + if (policy & SYNC_POLICY_INIT_LOCKED) { + fibers_mutex_lock(&MWQCAST(wq)->extra->mutex, false); + } +} + +T_MOCK(void, +waitq_deinit, (waitq_t wq)) +{ + if (waitq_use_real_impl(wq)) { + waitq_deinit(wq); + return; + } + + PT_QUIET; PT_ASSERT_TRUE(MWQCAST(wq)->mock_magic == MOCK_WAITQ_MAGIC, "missing mock_waitq magic"); + mock_waitq_destroy(MWQCAST(wq)); +} + +T_MOCK(void, +waitq_lock, (waitq_t wq)) +{ + if (waitq_use_real_impl(wq)) { + waitq_lock(wq); + return; + } + + PT_QUIET; PT_ASSERT_TRUE(MWQCAST(wq)->mock_magic == MOCK_WAITQ_MAGIC, "missing mock_waitq magic"); + fibers_mutex_lock(&MWQCAST(wq)->extra->mutex, false); +} + +T_MOCK(void, +waitq_unlock, (waitq_t wq)) +{ + if (waitq_use_real_impl(wq)) { + waitq_unlock(wq); + return; + } + + PT_QUIET; PT_ASSERT_TRUE(MWQCAST(wq)->mock_magic == MOCK_WAITQ_MAGIC, "missing mock_waitq magic"); + fibers_mutex_unlock(&MWQCAST(wq)->extra->mutex); +} + +T_MOCK(bool, +waitq_is_valid, (waitq_t wq)) +{ + if (waitq_use_real_impl(wq)) { + return waitq_is_valid(wq); + } + + PT_QUIET; PT_ASSERT_TRUE(MWQCAST(wq)->mock_magic == MOCK_WAITQ_MAGIC, "missing mock_waitq magic"); + return MWQCAST(wq)->extra->valid; +} + +T_MOCK(void, +waitq_invalidate, (waitq_t wq)) +{ + if (waitq_use_real_impl(wq)) { + return waitq_invalidate(wq); + } + + PT_QUIET; PT_ASSERT_TRUE(MWQCAST(wq)->mock_magic == MOCK_WAITQ_MAGIC, "missing mock_waitq magic"); + MWQCAST(wq)->extra->valid = false; +} + 
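+/*
+ * Illustrative sketch (kept out of the build with #if 0): the WQT_QUEUE mocks above are
+ * driven through the regular waitq API, so a minimal round trip looks like the code below.
+ * The SYNC_POLICY_FIFO policy and the assert are example choices, not requirements of the mocks.
+ */
+#if 0
+static void
+mock_waitq_example_roundtrip(void)
+{
+	struct waitq wq;
+
+	waitq_init(&wq, WQT_QUEUE, SYNC_POLICY_FIFO);	// routed to mock_waitq_init()
+	waitq_lock(&wq);				// backed by the fibers mutex in extra
+	waitq_unlock(&wq);
+	assert(waitq_is_valid(&wq));			// extra->valid is still true
+	waitq_invalidate(&wq);				// clears extra->valid, keeps the magic
+	waitq_deinit(&wq);				// mock_waitq_destroy() frees extra
+}
+#endif
+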
+T_MOCK(bool, +waitq_held, (waitq_t wq)) +{ + if (waitq_use_real_impl(wq)) { + return waitq_held(wq); + } + + PT_QUIET; PT_ASSERT_TRUE(MWQCAST(wq)->mock_magic == MOCK_WAITQ_MAGIC, "missing mock_waitq magic"); + return MWQCAST(wq)->extra->mutex.holder != NULL; +} + +T_MOCK(void, +waitq_lock_wait, (waitq_t wq, uint32_t ticket)) +{ + MOCK_waitq_lock(wq); +} + +T_MOCK(bool, +waitq_lock_try, (waitq_t wq)) +{ + if (waitq_use_real_impl(wq)) { + return waitq_lock_try(wq); + } + + PT_QUIET; PT_ASSERT_TRUE(MWQCAST(wq)->mock_magic == MOCK_WAITQ_MAGIC, "missing mock_waitq magic"); + return fibers_mutex_try_lock(&MWQCAST(wq)->extra->mutex) == 0; +} + +// --------------- events ------------------ + +#define MOCK_WAITQS_NUM 4096 +static struct mock_waitq global_mock_waitqs[MOCK_WAITQS_NUM]; +static int global_mock_waitqs_inited = 0; + +static void +global_mock_waitqs_init(void) +{ + for (int i = 0; i < MOCK_WAITQS_NUM; ++i) { + MOCK_waitq_init((struct waitq*)&global_mock_waitqs[i], WQT_QUEUE, SYNC_POLICY_FIFO); + } + global_mock_waitqs_inited = 1; +} + +struct mock_waitq* +find_mock_waitq(event64_t event) +{ + if (!global_mock_waitqs_inited) { + global_mock_waitqs_init(); + } + for (int i = 0; i < MOCK_WAITQS_NUM; ++i) { + if (global_mock_waitqs[i].current_event == event) { + return &global_mock_waitqs[i]; + } + } + return NULL; +} + +struct mock_waitq* +find_or_alloc_mock_waitq(event64_t event) +{ + if (!global_mock_waitqs_inited) { + global_mock_waitqs_init(); + } + int first_free = -1; + for (int i = 0; i < MOCK_WAITQS_NUM; ++i) { + if (global_mock_waitqs[i].current_event == event) { + return &global_mock_waitqs[i]; + } else if (first_free < 0 && global_mock_waitqs[i].current_event == 0) { + first_free = i; + } + } + PT_QUIET; PT_ASSERT_TRUE(first_free >= 0, "no more space in global_mock_waitqs"); + global_mock_waitqs[first_free].current_event = event; + return &global_mock_waitqs[first_free]; +} + +// --------------- waitq mocks ------------------ + +// pthread mocks + +struct pthread_mock_event_table_entry* +find_pthread_mock_event_entry(struct mock_process_state *s, event_t ev) +{ + for (int i = 0; i < PTHREAD_EVENTS_TABLE_SIZE; ++i) { + if (s->events[i].ev == ev) { + return &s->events[i]; + } + } + return NULL; +} + +T_MOCK_DYNAMIC(kern_return_t, + thread_wakeup_prim, ( + event_t event, + boolean_t one_thread, + wait_result_t result), + (event, one_thread, result), +{ + if (ut_mocks_use_fibers) { + // fibers is mocking waitq apis, go forward calling the real thread_wakeup_prim + return thread_wakeup_prim(event, one_thread, result); + } + + kern_return_t kr = KERN_SUCCESS; + + struct mock_process_state *s = get_proc_state(); + int ret = pthread_mutex_lock(&s->events_mutex); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "thread_wakeup pthread_mutex_lock"); + + struct pthread_mock_event_table_entry* event_entry = find_pthread_mock_event_entry(s, event); + if (event_entry == NULL) { + kr = KERN_NOT_WAITING; + goto done; + } + if (one_thread) { + ret = pthread_cond_signal(&event_entry->cond); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "thread_wakeup pthread_cond_signal"); + } else { + ret = pthread_cond_broadcast(&event_entry->cond); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "thread_wakeup pthread_cond_broadcast"); + } + done: + pthread_mutex_unlock(&s->events_mutex); + return kr; +}); + +wait_result_t +pthread_mock_thread_block_reason( + thread_continue_t continuation, + void *parameter, + ast_t reason) +{ + PT_QUIET; PT_ASSERT_TRUE(continuation == THREAD_CONTINUE_NULL && parameter == NULL && reason == AST_NONE, 
"thread_block argument"); + + struct mock_process_state *s = get_proc_state(); + int ret = pthread_mutex_lock(&s->events_mutex); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "thread_block pthread_mutex_lock"); + + // find empty entry in table + struct pthread_mock_event_table_entry *event_entry = find_pthread_mock_event_entry(s, 0); + PT_QUIET; PT_ASSERT_NOTNULL(event_entry, "empty entry not found"); + + // register the entry to this event + event_entry->ev = (event_t)MOCK_current_thread_fast()->wait_event; + + // if it doesn't have a condition variable yet, create one + if (!event_entry->cond_inited) { + ret = pthread_cond_init(&event_entry->cond, NULL); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "thread_block pthread_cond_init"); + event_entry->cond_inited = true; + } + + // wait on variable. This releases the mutex, waits and reaquires it before returning + ret = pthread_cond_wait(&event_entry->cond, &s->events_mutex); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "thread_block pthread_cond_wait"); + + // reset the entry so that it can be reused (will be done by all waiters that woke up) + event_entry->ev = 0; + + ret = pthread_mutex_unlock(&s->events_mutex); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "thread_block pthread_mutex_unlock"); + + return THREAD_AWAKENED; +} + +kern_return_t +pthread_mock_clear_wait( + thread_t thread, + wait_result_t result) +{ + struct mock_process_state *s = get_proc_state(); + int ret = pthread_mutex_lock(&s->events_mutex); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "clear_wait pthread_mutex_lock"); + + struct pthread_mock_event_table_entry *event_entry = find_pthread_mock_event_entry(s, 0); + PT_QUIET; PT_ASSERT_NOTNULL(event_entry, "empty entry not found"); + + event_entry->ev = 0; + + ret = pthread_mutex_unlock(&s->events_mutex); + PT_QUIET; PT_ASSERT_POSIX_ZERO(ret, "clear_wait pthread_mutex_unlock"); + return KERN_SUCCESS; +} + +// fibers mocks + +T_MOCK(struct waitq *, +_global_eventq, (event64_t event)) +{ + if (!ut_mocks_use_fibers) { + return _global_eventq(event); + } + + struct waitq *ret = (struct waitq *)find_or_alloc_mock_waitq(event); + return ret; +} + +T_MOCK(wait_result_t, +waitq_assert_wait64_locked, ( + waitq_t waitq, + event64_t wait_event, + wait_interrupt_t interruptible, + wait_timeout_urgency_t urgency, + uint64_t deadline, + uint64_t leeway, + thread_t thread)) +{ + if (waitq_use_real_impl(waitq)) { + return waitq_assert_wait64_locked(waitq, wait_event, interruptible, urgency, deadline, leeway, thread); + } + + struct mock_waitq *wq = MWQCAST(waitq); + + if (wq->current_event == 0) { + wq->current_event = wait_event; + } + + PT_QUIET; PT_ASSERT_TRUE(wq->current_event == wait_event, "waitq_assert_wait64_locked another event queue"); + + struct mock_thread * mock_thread = (struct mock_thread*)thread; // !!! 
ASSUME every thread_t is created from mock_thread + mock_thread->wq_next = wq->extra->waiting_threads; + wq->extra->waiting_threads = mock_thread; + wq->extra->waiting_thread_count++; + + thread->wait_event = wait_event; // Store waiting event in thread context + thread->state |= TH_WAIT; // Set thread state to waiting + thread->waitq = waitq; + + return THREAD_WAITING; // Indicate thread is now waiting, but not blocked yet +} + +T_MOCK(wait_result_t, +waitq_assert_wait64, ( + struct waitq *waitq, + event64_t wait_event, + wait_interrupt_t interruptible, + uint64_t deadline)) +{ + if (waitq_use_real_impl(waitq)) { + return waitq_assert_wait64(waitq, wait_event, interruptible, deadline); + } + + thread_t thread = MOCK_current_thread_fast(); + + MOCK_waitq_lock(waitq); + wait_result_t res = MOCK_waitq_assert_wait64_locked(waitq, wait_event, interruptible, + TIMEOUT_URGENCY_SYS_NORMAL, deadline, TIMEOUT_NO_LEEWAY, thread); + MOCK_waitq_unlock(waitq); + return res; +} + +static void +mock_waitq_clear_wait(struct mock_thread * thread, struct mock_waitq *wq) +{ + struct mock_thread ** mock_thread = &wq->extra->waiting_threads; + int removed = 0; + while (*mock_thread) { + if (*mock_thread == thread) { + *mock_thread = (*mock_thread)->wq_next; + removed = 1; + break; + } + mock_thread = &(*mock_thread)->wq_next; + } + PT_QUIET; PT_ASSERT_TRUE(removed, "thread_block thread not in wq"); + thread->wq_next = NULL; + + wq->extra->waiting_thread_count--; + if (wq->extra->waiting_thread_count == 0) { + wq->current_event = 0; // reset current_event + } + PT_QUIET; PT_ASSERT_TRUE(wq->extra->waiting_thread_count >= 0, "something bad"); +} + +static struct mock_thread * +mock_waitq_pop_wait(struct mock_waitq *wq) +{ + if (wq->extra->waiting_thread_count == 0) { + return NULL; + } + + struct mock_thread * thread = wq->extra->waiting_threads; + wq->extra->waiting_threads = thread->wq_next; + thread->wq_next = NULL; + + wq->extra->waiting_thread_count--; + if (wq->extra->waiting_thread_count == 0) { + wq->current_event = 0; // reset current_event + } + PT_QUIET; PT_ASSERT_TRUE(wq->extra->waiting_thread_count >= 0, "something bad"); + + return thread; +} + +T_MOCK_DYNAMIC(wait_result_t, + thread_block_reason, ( + thread_continue_t continuation, + void *parameter, + ast_t reason), ( + continuation, + parameter, + reason), +{ + if (!ut_mocks_use_fibers) { + return pthread_mock_thread_block_reason(continuation, parameter, reason); + } + + PT_QUIET; PT_ASSERT_TRUE(continuation == THREAD_CONTINUE_NULL && parameter == NULL && reason == AST_NONE, "thread_block argument"); + + thread_t thread = current_thread(); + PT_QUIET; PT_ASSERT_TRUE(thread->state & TH_WAIT, "thread_block called but thread state is not TH_WAIT"); + + /* + * In case of a window between assert_wait and thread_block + * another thread could wake up the current thread after being added to the waitq + * but before the block. + * In this case, the thread will still be TH_WAIT but without an assigned waitq. + * TH_WAKING must be set. + */ + struct mock_waitq *wq = MWQCAST(thread->waitq); + if (wq == NULL) { + PT_QUIET; PT_ASSERT_TRUE(thread->state & TH_WAKING, "with waitq == NULL there must be TH_WAKING set"); + thread->state &= ~TH_WAKING; + goto awake_thread; + } + + fibers_condition_wait(&wq->extra->cond); + + if (thread->state & TH_WAKING) { + thread->state &= ~TH_WAKING; + } else { + // is this possible? 
TH_WAKING is always set ATM in the mocks, keep this code to be more robust + thread->waitq.wq_q = NULL; + mock_waitq_clear_wait((struct mock_thread *)thread, wq); + } + + awake_thread: + thread->state &= ~TH_WAIT; + thread->state |= TH_RUN; + + return thread->wait_result; +}); + +T_MOCK(kern_return_t, +clear_wait, (thread_t thread, wait_result_t wresult)) +{ + if (!ut_mocks_use_fibers) { + return pthread_mock_clear_wait(thread, wresult); + } + + struct mock_waitq *wq = MWQCAST(thread->waitq); + PT_QUIET; PT_ASSERT_TRUE(wq != NULL, "thread->waitq is NULL"); + + thread->state &= ~TH_WAIT; + thread->waitq.wq_q = NULL; + thread->wait_result = wresult; + + mock_waitq_clear_wait((struct mock_thread *)thread, wq); + + return KERN_SUCCESS; +} + +typedef struct { + wait_result_t wait_result; +} waitq_wakeup_args_t; + +static void +waitq_wakeup_fiber_callback(void *arg, fiber_t target) +{ + waitq_wakeup_args_t *wakeup_args = (waitq_wakeup_args_t*)arg; + struct mock_thread *thread = (struct mock_thread *)target->extra; + assert(thread); + + struct mock_waitq *wq = MWQCAST(thread->th.waitq); + assert(wq); + + thread->th.state |= TH_WAKING; + thread->th.waitq.wq_q = NULL; + thread->th.wait_result = wakeup_args->wait_result; + + mock_waitq_clear_wait(thread, wq); +} + +// Called from thread_wakeup_nthreads_prim +T_MOCK(uint32_t, +waitq_wakeup64_nthreads_locked, ( + waitq_t waitq, + event64_t wake_event, + wait_result_t result, + waitq_wakeup_flags_t flags, + uint32_t nthreads)) +{ + if (waitq_use_real_impl(waitq)) { + return waitq_wakeup64_nthreads_locked(waitq, wake_event, result, flags, nthreads); + } + + // RANGELOCKINGTODO rdar://150846598 flags + waitq_wakeup_args_t wakeup_args = { + .wait_result = result + }; + + struct mock_waitq *wq = MWQCAST(waitq); + PT_QUIET; PT_ASSERT_TRUE(wq->current_event == wake_event, "waitq_wakeup64_nthreads current_event is wrong"); + + // Avoid to trigger a switch in fibers_condition_wakeup_some before a valid state in the waitq + fibers_current->may_yield_disabled++; + + FIBERS_LOG(FIBERS_LOG_DEBUG, "waitq_wakeup64_nthreads_locked nthreads=%u wake_event=%lld", nthreads, wake_event); + + int count = fibers_condition_wakeup_some(&wq->extra->cond, nthreads, &waitq_wakeup_fiber_callback, &wakeup_args); + + /* + * In case of a window in which a thread is pushed to the waitq but thread_block was still not called + * when another thread wakes up the threads in the waitq here. + * fibers_condition_wakeup_some will not find these fibers as they are not waiting on the condition, + * In this case these fibers must be in FIBER_STOP that means that they are ready to be scheduled, + * but we still need to take action here to remove them from the waitq and clear the state. 
+ */ + while (wq->extra->waiting_thread_count && count < nthreads) { + struct mock_thread *thread = mock_waitq_pop_wait(wq); + PT_QUIET; PT_ASSERT_TRUE(thread->fiber->state & FIBER_STOP, "leftover fiber in waitq not in FIBER_STOP"); + thread->th.state |= TH_WAKING; + thread->th.waitq.wq_q = NULL; + thread->th.wait_result = result; + ++count; + } + + fibers_current->may_yield_disabled--; + + if (waitq_should_unlock(flags)) { + MOCK_waitq_unlock(waitq); + } + if (waitq_should_enable_interrupts(flags)) { + MOCK_ml_set_interrupts_enabled(1); + } + + return (uint32_t)count; +} + +T_MOCK(thread_t, +waitq_wakeup64_identify_locked, ( + waitq_t waitq, + event64_t wake_event, + waitq_wakeup_flags_t flags)) +{ + if (waitq_use_real_impl(waitq)) { + return waitq_wakeup64_identify_locked(waitq, wake_event, flags); + } + + // RANGELOCKINGTODO rdar://150846598 flags + + struct mock_waitq *wq = MWQCAST(waitq); + PT_QUIET; PT_ASSERT_TRUE(wq->current_event == wake_event, "waitq_wakeup64_identify_locked current_event is wrong"); + + // RANGELOCKINGTODO rdar://150845975 for fuzzing select random, not the top of the queue + struct mock_thread * mock_thread = wq->extra->waiting_threads; + if (mock_thread == NULL) { + return THREAD_NULL; + } + + // Preemption will be re-enabled when the thread is resumed in `waitq_resume_identify_thread` + MOCK__disable_preemption(); + + mock_thread->th.state |= TH_WAKING; + mock_thread->th.waitq.wq_q = NULL; + mock_thread->th.wait_result = THREAD_AWAKENED; + + mock_waitq_clear_wait(mock_thread, wq); + + FIBERS_LOG(FIBERS_LOG_DEBUG, "waitq_wakeup64_identify_locked identified fiber %d", mock_thread->fiber->id); + + if (waitq_should_unlock(flags)) { + MOCK_waitq_unlock(waitq); + } + if (waitq_should_enable_interrupts(flags)) { + MOCK_ml_set_interrupts_enabled(1); + } + + fibers_may_yield_internal(); + + return &mock_thread->th; +} + +T_MOCK(void, +waitq_resume_identified_thread, ( + waitq_t waitq, + thread_t thread, + wait_result_t result, + waitq_wakeup_flags_t flags)) +{ + if (waitq_use_real_impl(waitq)) { + return waitq_resume_identified_thread(waitq, thread, result, flags); + } + + // RANGELOCKINGTODO rdar://150846598 other flags + + struct mock_thread * mock_thread = (struct mock_thread*)thread; // !!! ASSUME every thread_t is created from mock_thread + struct mock_waitq *wq = MWQCAST(waitq); + + bool found = fibers_condition_wakeup_identified(&wq->extra->cond, mock_thread->fiber); + if (!found) { + /* + * In case of a window in which a thread is pushed to the waitq but thread_block was still not called + * when the thread is identified by another one and resumed, we pop it from the waitq in waitq_wakeup64_identify_locked + * but we will not find it in wq->cond.wait_queue. + * In this case it is not needed any action as the fiber must be in FIBER_STOP and can already be scheduled. 
+ */ + PT_QUIET; PT_ASSERT_TRUE(mock_thread->fiber->state & FIBER_STOP, "waitq_resume_identified_thread fiber not found in condition and not in FIBER_STOP"); + } + + // Paired with the call to `waitq_wakeup64_identify_locked` + MOCK__enable_preemption(); + + fibers_may_yield_internal_with_reason( + FIBERS_YIELD_REASON_WAKEUP | + FIBERS_YIELD_REASON_ERROR_IF(!found)); +} + +// Allow to cause a context switch from a function that can be called from XNU +T_MOCK(void, +ut_fibers_ctxswitch, (void)) +{ + if (ut_mocks_use_fibers) { + fibers_yield(); + } +} + +// Allow to cause a context switch to a specific fiber from a function that can be called from XNU +T_MOCK(void, +ut_fibers_ctxswitch_to, (int fiber_id)) +{ + if (ut_mocks_use_fibers) { + fibers_yield_to(fiber_id); + } +} + +// Get the current fiber id from a function that can be called from XNU +T_MOCK(int, +ut_fibers_current_id, (void)) +{ + if (ut_mocks_use_fibers) { + return fibers_current->id; + } + return -1; +} + +// --------------- preemption ------------------ + +#ifdef __BUILDING_WITH_SANCOV_LOAD_STORES__ +// Optional: uncomment to enable yield at every basic block entry +/* + * T_MOCK(void, + * __sanitizer_cov_trace_pc_guard, (uint32_t * guard)) + * { + * fibers_may_yield(); + * } + */ + +#define IS_ALIGNED(ptr, size) ( (((uintptr_t)(ptr)) & (((uintptr_t)(size)) - 1)) == 0 ) +#define IS_ATOMIC(ptr, size) ( (size) <= sizeof(uint64_t) && IS_ALIGNED(ptr, size) ) + +// These functions can be called from XNU to enter/exit atomic regions in which the data checker is disabled +T_MOCK(void, +data_race_checker_atomic_begin, (void)) +{ + fibers_checker_atomic_begin(); +} +T_MOCK(void, +data_race_checker_atomic_end, (void)) +{ + fibers_checker_atomic_end(); +} + +/* + * Detecting data races on memory operations: + * Memory operation functions are used to check for data races using the fibers checkers API, a software implementation of DataCollider. + * The idea is to set a watchpoint before context switching and report a data race every time a concurrent access (watchpoint hit) is in between a write or a write in between a load. + * To be more robust, we also check that the value pointed the memory operation address before the context switch is still the same after the context switch. + * If not, very likely it is a data race. Atomic memory operations should be excluded from this, we use the IS_ATOMIC macro to filter memory loads. + * Note: atomic_fetch_add_explicit() et al. on ARM64 are compiled to LDADD et al. that seem to not be supported by __sanitizer_cov_loadX, ok for us we want to skip atomic operations. + */ +#define SANCOV_LOAD_STORE_DATA_CHECKER(type, size, access_type) do { \ + if (fibers_current->may_yield_disabled) { \ + return; \ + } \ + if (fibers_scheduler->fibers_should_yield(fibers_scheduler_context, \ + fibers_may_yield_probability, FIBERS_YIELD_REASON_PREEMPTION_TRIGGER)) { \ + volatile type before = *addr; \ + void *pc = __builtin_return_address(0); \ + bool has_wp = check_and_set_watchpoint(pc, (uintptr_t)addr, size, access_type); \ + \ + fibers_queue_push(&fibers_run_queue, fibers_current); \ + fibers_choose_next(FIBER_STOP); \ + \ + if (has_wp) { \ + post_check_and_remove_watchpoint((uintptr_t)addr, size, access_type); \ + } \ + type after = *addr; \ + if (before != after) { \ + report_value_race((uintptr_t)addr, size, access_type); \ + } \ + } \ + } while (0) + +/* + * Mock the SanitizerCoverage load/store instrumentation callbacks (original in san_attached.c). 
+ * The functions are execute at every memory operations in libxnu and in the test binary, libmocks is excluded. + * Functions and files in tools/sanitizers-ignorelist are excluded from instrumentation. + */ +#define MOCK_SANCOV_LOAD_STORE(type, size) \ + __attribute__((optnone)) \ + T_MOCK(void, \ + __sanitizer_cov_load##size, (type* addr)) \ + { \ + if (!ut_fibers_use_data_race_checker || IS_ATOMIC(addr, size) || fibers_current->disable_race_checker) { \ + fibers_may_yield_with_reason(FIBERS_YIELD_REASON_PREEMPTION_TRIGGER); \ + return; \ + } \ + SANCOV_LOAD_STORE_DATA_CHECKER(type, size, ACCESS_TYPE_LOAD); \ + } \ + \ + __attribute__((optnone)) \ + T_MOCK(void, \ + __sanitizer_cov_store##size, (type* addr)) \ + { /* do not care about atomicity for stores */ \ + if (!ut_fibers_use_data_race_checker || fibers_current->disable_race_checker) { \ + fibers_may_yield_with_reason(FIBERS_YIELD_REASON_PREEMPTION_TRIGGER); \ + return; \ + } \ + SANCOV_LOAD_STORE_DATA_CHECKER(type, size, ACCESS_TYPE_STORE); \ + } + +MOCK_SANCOV_LOAD_STORE(uint8_t, 1) +MOCK_SANCOV_LOAD_STORE(uint16_t, 2) +MOCK_SANCOV_LOAD_STORE(uint32_t, 4) +MOCK_SANCOV_LOAD_STORE(uint64_t, 8) +MOCK_SANCOV_LOAD_STORE(__uint128_t, 16) + +#endif // __BUILDING_WITH_SANCOV__ diff --git a/tests/unit/mocks/mock_thread.h b/tests/unit/mocks/mock_thread.h new file mode 100644 index 000000000..dbb7f4ac9 --- /dev/null +++ b/tests/unit/mocks/mock_thread.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#pragma once
+
+#include "mock_dynamic.h"
+#include
+#include
+#include
+#include
+
+#include "fibers/fibers.h"
+
+// Unit tests that want to use fibers must invoke this macro at global scope with val=1
+#define UT_USE_FIBERS(val) int ut_mocks_use_fibers = (val)
+// Unit tests using fibers that want to enable the data race checker must invoke this macro at global scope with val=1
+#define UT_FIBERS_USE_CHECKER(val) int ut_fibers_use_data_race_checker = (val)
+
+extern int ut_mocks_use_fibers __attribute__((weak));
+extern int ut_fibers_use_data_race_checker __attribute__((weak));
+
+// You can set the fibers configuration variables either by assigning a value to them in the test function (see fibers_test.c)
+// or by using these macros at global scope
+#define UT_FIBERS_LOG_LEVEL(val) \
+	__attribute__((constructor)) \
+	static void \
+	_ut_fibers_set_log_level(void) \
+	{ \
+	        fibers_log_level = (val); \
+	}
+#define UT_FIBERS_DEBUG(val) \
+	__attribute__((constructor)) \
+	static void \
+	_ut_fibers_set_log_debug(void) \
+	{ \
+	        fibers_debug = (val); \
+	}
+#define UT_FIBERS_ABORT_ON_ERROR(val) \
+	__attribute__((constructor)) \
+	static void \
+	_ut_fibers_set_abort_on_error(void) \
+	{ \
+	        fibers_abort_on_error = (val); \
+	}
+#define UT_FIBERS_MAY_YIELD_PROB(val) \
+	__attribute__((constructor)) \
+	static void \
+	_ut_fibers_set_may_yield_prob(void) \
+	{ \
+	        fibers_may_yield_probability = (val); \
+	}
+
+/*
+ * Writing tests using fibers:
+ *
+ * If UT_USE_FIBERS(1) is used, every test defined in the same test executable will use the threading mocks implemented using the fibers API.
+ * However, this is not sufficient to write a test with multiple "threads"; the test itself is responsible for creating the fibers.
+ * For some working examples, see the fibers_test.c file.
+ *
+ * The test file must include the needed headers from mocks/fibers/ depending on what it uses.
+ * The fibers API is very similar to pthread, and if FIBERS_PREEMPTION=1 is used at compile time it behaves much like real threads.
+ * The main difference is that blocking operations block every fiber:
+ * you should not call sleep() in your test, and if some kernel function calls a similar blocking function you should mock it with
+ * a call to one or more fibers_yield() to trigger a context switch.
+ * The scheduler is deterministic; the interleaving can be changed either by setting a different seed for the PRNG with random_set_seed()
+ * or by any change to the code itself, as possible context switch points are located inside the fibers API or, even more drastically,
+ * at every memory load/store when FIBERS_PREEMPTION=1.
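+ *
+ * For illustration only (the values below are arbitrary examples, not recommended defaults,
+ * and the real knobs are the macros and globals declared above), a fibers-based test file
+ * can configure the mocks at global scope with:
+ *
+ *     UT_USE_FIBERS(1);              // route the lck_mtx/lck_rw/waitq mocks through fibers
+ *     UT_FIBERS_USE_CHECKER(1);      // optionally enable the data race checker
+ *     UT_FIBERS_MAY_YIELD_PROB(10);  // example probability value, pick per test
+ *
+ * and then, inside the test body, select a deterministic interleaving with random_set_seed()
+ * before creating its fibers.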
+ * + * Target code in XNU (like sysctl tests) can trigger a fibers context switch using the following API (see mock_attached.c): + * void ut_fibers_ctxswitch(void); // Switch to a random fiber + * void ut_fibers_ctxswitch_to(int fiber_id); // Switch to a specific fiber by id + * int ut_fibers_current_id(void); // Get the current fiber id + */ + +extern void fake_init_lock(lck_mtx_t *mtx); +extern void fake_init_task(task_t new_task); + +T_MOCK_DYNAMIC_DECLARE( + kern_return_t, + thread_wakeup_prim, ( + event_t event, + boolean_t one_thread, + wait_result_t result)); + +T_MOCK_DYNAMIC_DECLARE( + wait_result_t, + thread_block_reason, ( + thread_continue_t continuation, + void *parameter, + ast_t reason)); diff --git a/tests/unit/mocks/mock_unimpl.c b/tests/unit/mocks/mock_unimpl.c new file mode 100644 index 000000000..b9ca6b4f8 --- /dev/null +++ b/tests/unit/mocks/mock_unimpl.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2000-2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "std_safe.h" +#include "dt_proxy.h" + +// The normal XNU executable links to libsptm_xnu.a and libTightbeam.a for platforms that require it. +// The unit-test environment however isn't supposed to call into these libraries +// and can't link to them anyway because they are built for arch arm64.kernel, not arm64. +// Instead, the required symbols are discovered at build time and defined in this translation unit. +// This is done to satisfy the linker and to show an error if one of these function +// ends up being called. +// These definitions ignore the real return value and arguments of +// the functions to keep it simple, and the linker doesn't care. + + +#define UNIMPLEMENTED(name) void name(void) { PT_FAIL("unimplemented: " #name); } +#include "func_unimpl.inc" +#undef UNIMPLEMENTED diff --git a/tests/unit/mocks/san_attached.c b/tests/unit/mocks/san_attached.c new file mode 100644 index 000000000..bfbcfca34 --- /dev/null +++ b/tests/unit/mocks/san_attached.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "dt_proxy.h" +#include + +#define NOT_MOCKED(name) PT_FAIL(#name ": this function should never be called since it is mocked by the mocks dylib") + +void +data_race_checker_atomic_begin(void) +{ +} + +void +data_race_checker_atomic_end(void) +{ +} + +__mockable void +__sanitizer_cov_trace_pc_guard(uint32_t *guard) +{ + // do nothing +} + +// Called before a load of appropriate size. Addr is the address of the load. +__mockable void +__sanitizer_cov_load1(uint8_t *addr) +{ + NOT_MOCKED(__sanitizer_cov_load1); +} +__mockable void +__sanitizer_cov_load2(uint16_t *addr) +{ + NOT_MOCKED(__sanitizer_cov_load2); +} +__mockable void +__sanitizer_cov_load4(uint32_t *addr) +{ + NOT_MOCKED(__sanitizer_cov_load4); +} +__mockable void +__sanitizer_cov_load8(uint64_t *addr) +{ + NOT_MOCKED(__sanitizer_cov_load8); +} +__mockable void +__sanitizer_cov_load16(__int128 *addr) +{ + NOT_MOCKED(__sanitizer_cov_load16); +} +// Called before a store of appropriate size. Addr is the address of the store. +void +__sanitizer_cov_store1(uint8_t *addr) +{ + NOT_MOCKED(__sanitizer_cov_store1); +} +void +__sanitizer_cov_store2(uint16_t *addr) +{ + NOT_MOCKED(__sanitizer_cov_store2); +} +void +__sanitizer_cov_store4(uint32_t *addr) +{ + NOT_MOCKED(__sanitizer_cov_store4); +} +void +__sanitizer_cov_store8(uint64_t *addr) +{ + NOT_MOCKED(__sanitizer_cov_store8); +} +void +__sanitizer_cov_store16(__int128 *addr) +{ + NOT_MOCKED(__sanitizer_cov_store16); +} diff --git a/tests/unit/mocks/std_safe.h b/tests/unit/mocks/std_safe.h new file mode 100644 index 000000000..ad62a3d75 --- /dev/null +++ b/tests/unit/mocks/std_safe.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#pragma once
+
+// check that we're being compiled from the unit-tests makefile and that UT_MODULE was found
+#if defined(UT_MODULE) && (UT_MODULE == -1)
+#error "UT_MODULE not defined, did you forget to add a `#define UT_MODULE ` in your test?"
+#endif
+
+#include
+#include
+#include
+#include
+#include
+
+// This file declares some functions from libc that are used by the mocks.
+// The testers are built with -nostdlibinc, so the system header folders (from the SDK) are not available when
+// compiling these files.
+// Having the system header folders available in the include search path would create conflicts when XNU code includes
+// headers like string.h, sys/types.h etc. (which need to come from XNU).
+// This is why it's not possible to include such system headers here.
+// Furthermore, even if we could have included system headers like stdlib.h, having them and XNU headers in the same
+// translation unit would have created conflicts with XNU-defined types which have the same name but a different
+// definition in the system includes.
+// For example, every port type in mach/mach_types.h is typedef'd as mach_port_t in the system headers.
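+//
+// Illustrative sketch (hypothetical snippet, not part of this header): test and mock code
+// simply uses the declarations below and lets the symbols resolve against libc at link time,
+// rather than pulling in SDK headers, e.g.:
+//
+//     char *buf = calloc(1, 64);
+//     snprintf(buf, 64, "pid %d", 1);
+//     free(buf);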
+ +// from stdlib.h +extern void *calloc(size_t count, size_t size); +extern void *aligned_alloc(size_t alignment, size_t size); +extern void free(void *ptr); +extern int rand(void); +extern void srand(unsigned seed); +extern __attribute__((noreturn)) void exit(int status); +extern __attribute__((noreturn)) void abort(void); +extern char * getenv(const char *name); +extern int atoi(const char *str); + +extern void * calloc(size_t count, size_t size); +extern void free(void *ptr); +extern void * malloc(size_t size); +extern void *realloc(void *ptr, size_t size); + +// from stdio.h +extern int vsnprintf(char * str, size_t size, const char * format, va_list ap); +extern int snprintf(char * str, size_t size, const char * format, ...); + +// from string.h +extern void *memcpy(void *restrict dst, const void *restrict src, size_t n); +extern void *memset(void *b, int c, size_t len); +extern int strcmp(const char *s1, const char *s2); +extern char *strstr(const char *haystack, const char *needle); +extern char *strdup(const char *s1); + +// from unistd.h +extern size_t write(int fildes, const void *buf, size_t nbyte); +#define STDOUT_FILENO 1 + +// from pthread.h +#if defined(__LP64__) +#define __PTHREAD_SIZE__ 8176 +#define __PTHREAD_ATTR_SIZE__ 56 +#define __PTHREAD_MUTEX_SIZE__ 56 +#define __PTHREAD_MUTEXATTR_SIZE__ 8 +#define __PTHREAD_COND_SIZE__ 40 +#define __PTHREAD_CONDATTR_SIZE__ 8 +#else // !__LP64__ +#define __PTHREAD_SIZE__ 4088 +#define __PTHREAD_ATTR_SIZE__ 36 +#define __PTHREAD_MUTEX_SIZE__ 40 +#define __PTHREAD_MUTEXATTR_SIZE__ 8 +#define __PTHREAD_COND_SIZE__ 24 +#define __PTHREAD_CONDATTR_SIZE__ 4 +#endif +#if defined(__arm__) || defined(__arm64__) +#define PTHREAD_STACK_MIN 16384 +#else +#define PTHREAD_STACK_MIN 8192 +#endif + +struct _opaque_pthread_attr_t { + long __sig; + char __opaque[__PTHREAD_ATTR_SIZE__]; +}; +struct _opaque_pthread_t { + long __sig; + void *__cleanup_stack; + char __opaque[__PTHREAD_SIZE__]; +}; +struct _opaque_pthread_mutex_t { + long __sig; + char __opaque[__PTHREAD_MUTEX_SIZE__]; +}; +struct _opaque_pthread_mutexattr_t { + long __sig; + char __opaque[__PTHREAD_MUTEXATTR_SIZE__]; +}; +struct _opaque_pthread_cond_t { + long __sig; + char __opaque[__PTHREAD_COND_SIZE__]; +}; +struct _opaque_pthread_condattr_t { + long __sig; + char __opaque[__PTHREAD_CONDATTR_SIZE__]; +}; +typedef struct _opaque_pthread_attr_t __darwin_pthread_attr_t; +typedef struct _opaque_pthread_t *__darwin_pthread_t; +typedef unsigned long __darwin_pthread_key_t; +typedef struct _opaque_pthread_mutex_t __darwin_pthread_mutex_t; +typedef struct _opaque_pthread_mutexattr_t __darwin_pthread_mutexattr_t; +typedef struct _opaque_pthread_cond_t __darwin_pthread_cond_t; +typedef struct _opaque_pthread_condattr_t __darwin_pthread_condattr_t; + +typedef __darwin_pthread_t pthread_t; +typedef __darwin_pthread_attr_t pthread_attr_t; +typedef __darwin_pthread_key_t pthread_key_t; +typedef __darwin_pthread_mutex_t pthread_mutex_t; +typedef __darwin_pthread_mutexattr_t pthread_mutexattr_t; +typedef __darwin_pthread_cond_t pthread_cond_t; +typedef __darwin_pthread_condattr_t pthread_condattr_t; + +extern pthread_t pthread_self(void); +extern int pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg); +extern int pthread_join(pthread_t thread, void **value_ptr); +extern int pthread_setspecific(pthread_key_t key, const void *value); +extern void *pthread_getspecific(pthread_key_t key); +extern int pthread_key_create(pthread_key_t *key, void 
(*destructor)(void *)); +extern int pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *attr); +extern int pthread_mutex_destroy(pthread_mutex_t *mutex); +extern int pthread_mutex_lock(pthread_mutex_t *mutex); +extern int pthread_mutex_trylock(pthread_mutex_t *mutex); +extern int pthread_mutex_unlock(pthread_mutex_t *mutex); +extern int pthread_mutexattr_init(pthread_mutexattr_t *attr); +extern int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); +extern int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type); +#define PTHREAD_MUTEX_RECURSIVE 2 +extern int pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr); +extern int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *mutex); +extern int pthread_cond_signal(pthread_cond_t *cond); +extern int pthread_cond_broadcast(pthread_cond_t *cond); +extern int pthread_cond_destroy(pthread_cond_t *cond); +extern size_t pthread_get_stacksize_np(pthread_t); +extern void* pthread_get_stackaddr_np(pthread_t); + +// errno.h +#define EBUSY 16 + +extern int * __error(void); +#define errno (*__error()) + +// sysctl.h +extern int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen); + +// from setjmp.h +#if defined(__x86_64__) +# define _JBLEN ((9 * 2) + 3 + 16) +#elif defined(__i386__) +# define _JBLEN (18) +#elif defined(__arm__) && !defined(__ARM_ARCH_7K__) +# define _JBLEN (10 + 16 + 2) +#elif defined(__arm64__) || defined(__ARM_ARCH_7K__) +# define _JBLEN ((14 + 8 + 2) * 2) +#else +# error Undefined platform for setjmp +#endif + +typedef int jmp_buf[_JBLEN]; + +extern int setjmp(jmp_buf); +extern void longjmp(jmp_buf, int) __attribute__((__noreturn__)); + +// from time.h +#include +extern time_t time(time_t *tloc); diff --git a/tests/unit/mocks/unit_test_utils.c b/tests/unit/mocks/unit_test_utils.c new file mode 100644 index 000000000..75998bbdb --- /dev/null +++ b/tests/unit/mocks/unit_test_utils.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "unit_test_utils.h" +#include "dt_proxy.h" +#include "sys/queue.h" +#include + +extern int backtrace(void **array, int size); +extern char **backtrace_symbols(void *const *array, int size); + +#ifdef __BUILDING_WITH_SANITIZER__ +extern void __sanitizer_symbolize_pc(void *pc, const char *fmt, char *out_buf, size_t out_buf_size); +#endif + +extern int kernel_sysctlbyname(const char *, void *, size_t *, void *, size_t); + +int64_t +run_sysctl_test(const char *t, int64_t value, int argc, char* const* argv) +{ + char name[1024]; + int64_t result = 0; + size_t s = sizeof(value); + int rc; + + snprintf(name, sizeof(name), "debug.test.%s", t); + + bool run_real = (argc > 0 && strcmp(argv[0], "real_sysctl") == 0); + if (!run_real) { + rc = kernel_sysctlbyname(name, &result, &s, &value, s); + } else { + rc = sysctlbyname(name, &result, &s, &value, s); + } + PT_QUIET; PT_ASSERT_POSIX_ZERO(rc, "sysctlbyname()"); + return result; +} + +void * +checked_alloc_align(size_t size, size_t align) +{ + void *ptr = NULL; + if (align < sizeof(void *)) { + ptr = calloc(1, size); + PT_QUIET; PT_ASSERT_NOTNULL(ptr, "failed alloc"); + } else { + ptr = aligned_alloc(align, size); + PT_QUIET; PT_ASSERT_NOTNULL(ptr, "failed alloc"); + memset(ptr, 0, size); + } + return ptr; +} + +struct backtrace_array * +collect_current_backtrace(void) +{ + struct backtrace_array *bt = malloc(sizeof(struct backtrace_array)); + bt->nptrs = backtrace(bt->buffer, 100); + return bt; +} + +void +print_collected_backtrace(struct backtrace_array *bt) +{ +#ifdef __BUILDING_WITH_SANITIZER__ + // If compiled with any sanitizer, use __sanitizer_symbolize_pc as it gives much more info compared to backtrace_symbols + char description[1024]; + for (int idx = 0; idx < bt->nptrs; idx++) { + __sanitizer_symbolize_pc(bt->buffer[idx], "%p %F %L", description, + sizeof(description)); + raw_printf("%d\t%s\n", idx, description); + } +#else + char** strings = backtrace_symbols(bt->buffer, bt->nptrs); + PT_QUIET; PT_ASSERT_NOTNULL(strings, "backtrace_symbols"); + for (int idx = 0; idx < bt->nptrs; idx++) { + raw_printf("%s\n", strings[idx]); + } + free(strings); +#endif + raw_printf("\n"); +} + +void +print_current_backtrace(void) +{ + struct backtrace_array bt; + bt.nptrs = backtrace(bt.buffer, 100); + print_collected_backtrace(&bt); +} diff --git a/tests/unit/mocks/unit_test_utils.h b/tests/unit/mocks/unit_test_utils.h new file mode 100644 index 000000000..e2636ee6f --- /dev/null +++ b/tests/unit/mocks/unit_test_utils.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#pragma once +#include "std_safe.h" +#include + +struct ut_expected_panic_s { + bool expect_panic; + jmp_buf jb; + const char* str_contains; +}; +extern struct ut_expected_panic_s ut_expected_panic; + +// Wrap a call that's expected to panic +// This assumes tester is single threaded +#define T_ASSERT_PANIC_CONTAINS(code_block, s_contains, msg) do { \ + if (setjmp(ut_expected_panic.jb) == 0) { \ + ut_expected_panic.expect_panic = true; \ + ut_expected_panic.str_contains = s_contains; \ + { \ + code_block \ + } \ + T_FAIL("did not panic() %s", msg); \ + } \ + else { \ + T_PASS("OK panic()ed %s", msg); \ + } \ + } while(false) + +#define T_ASSERT_PANIC(code_block, msg) \ + T_ASSERT_PANIC_CONTAINS(code_block, NULL, msg) + +extern void ut_check_expected_panic(const char* panic_str); + +static inline void raw_printf(const char *fmt, ...) __attribute__((format(printf, 1, 0))); + +#define PRINT_BUF_SIZE 1024 +static inline void +raw_printf(const char *fmt, ...) +{ + va_list listp; + va_start(listp, fmt); + char buf[PRINT_BUF_SIZE]; + int printed = vsnprintf(buf, PRINT_BUF_SIZE, fmt, listp); + if (printed > PRINT_BUF_SIZE - 1) { + printed = PRINT_BUF_SIZE - 1; + } + write(STDOUT_FILENO, buf, printed); + va_end(listp); +} + +extern void *checked_alloc_align(size_t size, size_t mask); + +#define BACKTRACE_ARRAY_SIZE 100 +struct backtrace_array { + void* buffer[BACKTRACE_ARRAY_SIZE]; + int nptrs; +}; +extern struct backtrace_array *collect_current_backtrace(void); +extern void print_collected_backtrace(struct backtrace_array *bt); +extern void print_current_backtrace(void); + +extern void ut_set_perm_quiet(bool v); + +extern int64_t run_sysctl_test(const char *t, int64_t value, int argc, char* const* argv); + +#define T_MOCK(ret, name, args) \ + extern ret name args; \ + static ret MOCK_ ## name args; \ + DYLD_INTERPOSE(MOCK_ ## name, name) \ + static ret MOCK_ ## name args diff --git a/tests/unit/mocks_test.c b/tests/unit/mocks_test.c new file mode 100644 index 000000000..271da6aaa --- /dev/null +++ b/tests/unit/mocks_test.c @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include +#include "mocks/std_safe.h" +#include "mocks/unit_test_utils.h" +#include "mocks/mock_misc.h" +#include "mocks/dt_proxy.h" +#include +#include +#include + +#define UT_MODULE osfmk +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.mocks"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("s_shalom"), + T_META_RUN_CONCURRENTLY(false) + ); + + +#define NUM_INCREMENTS 100000 +#define NUM_THREADS 10 + +struct inc_state { + volatile int64_t counter; + //_Atomic int64_t counter; + lck_mtx_t mtx; + lck_grp_t grp; +}; + +void* +increment_counter(void* arg) +{ + struct inc_state *s = (struct inc_state *)arg; + for (int i = 0; i < NUM_INCREMENTS; i++) { + lck_mtx_lock(&s->mtx); + //lck_mtx_lock_spin(&s->mtx); + s->counter++; + //os_atomic_inc(&s->counter, relaxed); + lck_mtx_unlock(&s->mtx); + } + return NULL; +} + + +T_DECL(mutex_mock_increment_int, "mutex mock test") +{ + pthread_t mythreads[NUM_THREADS] = {}; + struct inc_state s = {.counter = 0}; + lck_grp_init(&s.grp, "test_mutex", LCK_GRP_ATTR_NULL); + lck_mtx_init(&s.mtx, &s.grp, LCK_ATTR_NULL); + + // Create threads + for (int i = 0; i < NUM_THREADS; i++) { + pthread_create(&mythreads[i], NULL, increment_counter, (void*)&s); + } + + // Wait for all threads to finish + for (int i = 0; i < NUM_THREADS; i++) { + pthread_join(mythreads[i], NULL); + } + lck_mtx_destroy(&s.mtx, &s.grp); + + T_LOG("Done counter=%lld", os_atomic_load(&s.counter, relaxed)); + T_ASSERT_EQ(s.counter, (int64_t)(NUM_INCREMENTS * NUM_THREADS), "eq"); +} + +struct wait_state { + event_t event; + volatile bool thread_did_sleep; +}; + +// from unistd.h. 
+// This can't be in stdsafe.h since it conflicts with a definition in bsd/sys/proc_internal.h +unsigned int sleep(unsigned int seconds); + +void* +do_sleep_and_wake(void *arg) +{ + struct wait_state *s = (struct wait_state *)arg; + sleep(1); + s->thread_did_sleep = true; + kern_return_t ret = thread_wakeup(s->event); + T_ASSERT_EQ(ret, KERN_SUCCESS, "thread_wakeup"); + return NULL; +} + +T_DECL(mocks_can_call_dt, "check that mocks can call T_x macros via PT_x") +{ + T_ASSERT_NOTNULL(get_dt_proxy_mock(), "mock dt_proxy null"); + T_ASSERT_NOTNULL(get_dt_proxy_attached(), "attached dt_proxy null"); +} + +// this test is meant to fail in order to verify that we're linking with the mock unimplemented sptm functions +// it's useful when debugging the Makefile +void libsptm_init(void); +T_DECL(sptm_link_unimpl, "sptm_link_unimpl", T_META_EXPECTFAIL("fail due to unimplemented sptm mock")) +{ + libsptm_init(); +} + +// --------------- dynamic mocks --------------------------------- + +#if (DEBUG || DEVELOPMENT) +// disabled in release since the kernel_funcX() functions are not defined by xnu in release + +T_DECL(mock_with_callback, "mock_with_callback") +{ + size_t ret1 = kernel_func1(1, 2); + T_ASSERT_EQ(ret1, (size_t)0, "expected return before - default value from mock"); + { + T_MOCK_SET_CALLBACK(kernel_func1, + size_t, + (int a, char b), + { + T_ASSERT_EQ(a, 3, "expected a"); + T_ASSERT_EQ(b, 4, "expected b"); + return a + b; + }); + + size_t ret2 = kernel_func1(3, 4); + T_ASSERT_EQ(ret2, (size_t)7, "expected return sum"); + + + T_MOCK_SET_CALLBACK(kernel_func1, + size_t, + (int a, char b), + { + return a - b; + }); + + size_t ret3 = kernel_func1(40, 30); + T_ASSERT_EQ(ret3, (size_t)10, "expected return second in the same scope"); + } + + size_t ret4 = kernel_func1(5, 6); + T_ASSERT_EQ(ret4, (size_t)0, "expected return before - mock default value"); +} + + +T_DECL(mock_with_retval, "mock_with_retval") +{ + size_t r1 = kernel_func1(0, 1); + T_ASSERT_EQ(r1, (size_t)0, "expected value before - mock default value"); + + { + T_MOCK_SET_RETVAL(kernel_func1, size_t, 42); + + size_t r2 = kernel_func1(0, 1); + T_ASSERT_EQ(r2, (size_t)42, "expected value with mock"); + + + T_MOCK_SET_RETVAL(kernel_func1, size_t, 43); + + size_t r3 = kernel_func1(0, 1); + T_ASSERT_EQ(r3, (size_t)43, "expected value with mock second in the same scope"); + } + + size_t r4 = kernel_func1(0, 1); + T_ASSERT_EQ(r4, (size_t)0, "expected value after - mock default value"); +} + + +T_MOCK_SET_PERM_FUNC(size_t, + kernel_func2, + (int a, char b)) +{ + T_ASSERT_EQ((int)a % 2, 0, "a is even"); + return a * 2; +} + +T_DECL(mock_with_static_func, "mock_with_static_func") +{ + size_t r = kernel_func2(10, 1); + T_ASSERT_EQ(r, (size_t)20, "expected return value"); +} + + +T_MOCK_SET_PERM_RETVAL(kernel_func3, size_t, 42); + +T_DECL(mock_with_perm_retval, "mock_with_perm_retval") +{ + size_t r = kernel_func3(1, 2); + T_ASSERT_EQ(r, (size_t)42, "expected return value"); +} + + +T_MOCK_CALL_QUEUE(fb_call, { + int expected_a; + char expected_b; + size_t ret_val; +}) + +T_DECL(mock_call_queue, "mock_call_queue") +{ + enqueue_fb_call((fb_call){ .expected_a = 1, .expected_b = 2, .ret_val = 3 }); + enqueue_fb_call((fb_call){ .expected_a = 10, .expected_b = 20, .ret_val = 30 }); + + { + fb_call c1 = dequeue_fb_call(); + T_ASSERT_EQ(c1.expected_a, 1, "a arg"); + T_ASSERT_EQ(c1.expected_b, 2, "b arg"); + T_ASSERT_EQ(c1.ret_val, (size_t)3, "a arg"); + } + { + fb_call c2 = dequeue_fb_call(); + T_ASSERT_EQ(c2.expected_a, 10, "a arg"); + 
T_ASSERT_EQ(c2.expected_b, 20, "b arg"); + T_ASSERT_EQ(c2.ret_val, (size_t)30, "a arg"); + } +} + + +T_MOCK_SET_PERM_FUNC(size_t, + kernel_func4, + (int a, char b)) +{ + fb_call c = dequeue_fb_call(); + T_ASSERT_EQ(a, c.expected_a, "a arg"); + T_ASSERT_EQ(b, c.expected_b, "b arg"); + return c.ret_val; +} + +T_DECL(mock_call_queue_in_a_mock, "mock_call_queue_in_a_mock") +{ + enqueue_fb_call((fb_call){ .expected_a = 1, .expected_b = 2, .ret_val = 3 }); + enqueue_fb_call((fb_call){ .expected_a = 10, .expected_b = 20, .ret_val = 30 }); + + size_t r1 = kernel_func4(1, 2); + T_ASSERT_EQ(r1, (size_t)3, "r1 ret"); + size_t r2 = kernel_func4(10, 20); + T_ASSERT_EQ(r2, (size_t)30, "r2 ret"); +} + +// a mock that calls the original function explicitly +T_DECL(mock_default_calling_original, "mock_default_calling_original") +{ + size_t r = kernel_func5(1, 2); + T_ASSERT_EQ(r, (size_t)5000, "r ret"); +} + +// a mock that calls the original function implicitly through _T_MOCK_DYNAMIC_DEFAULT_IMPL +T_DECL(mock_default_calling_original_implicit, "mock_default_calling_original_auto_define") +{ + size_t r = kernel_func7(1, 2); + T_ASSERT_EQ(r, (size_t)7000, "r ret"); +} + +T_DECL(mock_void_ret, "mock_void_ret") +{ + extern int kernel_func6_was_called; + kernel_func6_was_called = 0; + kernel_func6(3, 4); + T_ASSERT_EQ(kernel_func6_was_called, 3, "original called"); + + kernel_func6_was_called = 0; + T_MOCK_SET_CALLBACK(kernel_func6, + void, + (int a, char b), + { + T_ASSERT_EQ(a, 3, "expected a"); + T_ASSERT_EQ(b, 4, "expected b"); + }); + kernel_func6(3, 4); + T_ASSERT_EQ(kernel_func6_was_called, 0, "original called"); +} + +// void function with the default action that calls the original function +T_DECL(mock_void_ret_original_implicit, "mock_void_ret_original_implicit") +{ + extern int kernel_func8_was_called; + kernel_func8_was_called = 0; + kernel_func8(3, 4); + T_ASSERT_EQ(kernel_func8_was_called, 3, "original called"); +} + +#endif // (DEBUG || DEVELOPMENT) diff --git a/tests/unit/panic_path_test.c b/tests/unit/panic_path_test.c new file mode 100644 index 000000000..99c617128 --- /dev/null +++ b/tests/unit/panic_path_test.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include "mocks/unit_test_utils.h" +#include "mocks/mock_pmap.h" + +#include +#include +#include +#include +#include + +#define UT_MODULE osfmk + +kern_return_t +memory_backing_aware_buffer_stage_outproc( + struct kdp_output_stage *stage, + unsigned int request, + char *corename, + uint64_t length, + void * panic_data); + +static kern_return_t +kosf_outproc_mock( + __unused struct kdp_output_stage *stage, + __unused unsigned int request, + __unused char *corename, + __unused uint64_t length, + __unused void *panic_data + ) +{ + return KERN_SUCCESS; +} + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.unit.panic_path_test"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("e_zisman"), + T_META_RUN_CONCURRENTLY(false) + ); + +T_DECL(xnu_osfmk_kdp_memory_backing_aware_buffer_stage_outproc, "memory_backing_aware_buffer_stage_outproc") +{ + // No need to actually fill with data. + char panic_data[18 * 1024] __attribute__((aligned)); + + STAILQ_HEAD(, kdp_output_stage) stages; + struct kdp_output_stage stage1; + struct kdp_output_stage stage2; + char data1[32]; + char data2[32]; + + stage1.kos_data = data1; + stage2.kos_data = data2; + stage1.kos_funcs.kosf_outproc = kosf_outproc_mock; + stage2.kos_funcs.kosf_outproc = kosf_outproc_mock; + + STAILQ_INIT(&stages); + STAILQ_INSERT_HEAD(&stages, &stage1, kos_next); + STAILQ_INSERT_TAIL(&stages, &stage2, kos_next); + + struct { + char *test_name; + unsigned int pmap_cache_attributes_retval; + size_t panic_data_length; + kern_return_t expected; + } test_cases[] = { + { + .test_name = "normal memory flow", + .pmap_cache_attributes_retval = 0x02, // VM_WIMG_DEFAULT + .panic_data_length = sizeof(panic_data), + .expected = KERN_SUCCESS + }, + { + .test_name = "not-normal memory flow, 4-byte reads", + .pmap_cache_attributes_retval = 0x00, + .panic_data_length = sizeof(panic_data), + .expected = KERN_SUCCESS + }, + { + .test_name = "not-normal memory flow, 1-byte reads", + .pmap_cache_attributes_retval = 0x00, + .panic_data_length = sizeof(panic_data) - 1, // ensure length of panic data is not aligned to 4 bytes. + .expected = KERN_SUCCESS + }, + }; + + T_MOCK_SET_RETVAL(kvtophys, pmap_paddr_t, 0x12345678); // arbitrary value; isn't used anyways since we mock pmap_cache_attributes. + + for (size_t i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) { + T_MOCK_SET_RETVAL(pmap_cache_attributes, unsigned int, test_cases[i].pmap_cache_attributes_retval); + + T_EXPECT_EQ( + test_cases[i].expected, + memory_backing_aware_buffer_stage_outproc(&stage1, KDP_DATA, "corename", sizeof(panic_data), panic_data), + "return value matches expectation" + ); + } +} diff --git a/tests/unit/pmap_steal_memory_overflow.c b/tests/unit/pmap_steal_memory_overflow.c new file mode 100644 index 000000000..5f5b3d92c --- /dev/null +++ b/tests/unit/pmap_steal_memory_overflow.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2000-2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include "mach/arm/boolean.h" +#include "mocks/unit_test_utils.h" +#include +#include + +#define UT_MODULE osfmk + +extern void *pmap_steal_memory_internal( + vm_size_t size, + vm_size_t alignment, + boolean_t might_free, + unsigned int flags, + pmap_mapping_type_t mapping_type); + +T_DECL(size_overflow, "make sure we panic when size is greater than UINT64_MAX - 8") +{ + vm_size_t size = UINT64_MAX - 7; + vm_size_t alignment = 0; + boolean_t might_free = false; + unsigned int flags = 0; + pmap_mapping_type_t mapping_type = PMAP_MAPPING_TYPE_INFER; + T_ASSERT_PANIC({ + pmap_steal_memory_internal( + size, + alignment, + might_free, + flags, + mapping_type); + }, "should panic because of size overflow"); +} + +T_DECL(addr_plus_size_overflow, "make sure we panic when size is so big that addr + size will overflow") +{ + vm_size_t size = UINT64_MAX - 8; + vm_size_t alignment = 0; + boolean_t might_free = false; + unsigned int flags = 0; + pmap_mapping_type_t mapping_type = PMAP_MAPPING_TYPE_INFER; + T_ASSERT_PANIC({ + pmap_steal_memory_internal( + size, + alignment, + might_free, + flags, + mapping_type); + }, "should panic because of size overflow"); +} diff --git a/tests/unit/tools/fibers_lldb.py b/tests/unit/tools/fibers_lldb.py new file mode 100755 index 000000000..54679fab1 --- /dev/null +++ b/tests/unit/tools/fibers_lldb.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +''' +How to Use: + +Load in LLDB: +(lldb) command script import ./tests/unit/tools/fibers_lldb.py + +Run the commands: +(lldb) fibers_all # Lists all existing fibers +(lldb) fibers_ready # Lists fibers in the run queue +(lldb) fibers_current # Gets information about the current fiber +(lldb) fibers_regs [id] # Get the registers saved in the fiber end (default current fiber) +''' + +import lldb +import sys + +def fiber_state_to_string(state): + """Converts a fiber state integer to a human-readable string.""" + states = [] + if state & 0x1: + states.append("RUN") + if state & 0x2: + states.append("STOP") + if state & 0x4: + states.append("WAIT") + if state & 0x8: + states.append("JOIN") + if state & 0x10: + states.append("DEAD") + return "|".join(states) if states else "UNKNOWN" + +def strip_pointer(target, addr): + """Strips the PAC signature from a pointer.""" + val = target.CreateValueFromAddress("__tmp_strip_pac", lldb.SBAddress(addr, target), target.FindFirstType("unsigned long long")) + val.SetPreferDynamicValue(lldb.eNoDynamicValues) + val = val.AddressOf() + return val.GetValueAsAddress() + +def strip_fp_lr_sp(process, target, fp, lr, sp): + """Strip 
manged registers in the jmp buf from the munge token and PAC.""" + # get the munge token (see __longjmp impl) + frame = process.selected_thread.GetFrameAtIndex(0) + + # ref. os/tsd.h + # define __TSD_PTR_MUNGE 7 + munge_token = frame.EvaluateExpression('({void** r; __asm__("mrs %0, TPIDRRO_EL0" : "=r"(r)); r[7];})') + if munge_token.GetError().Fail(): + return None + munge_token = munge_token.GetValueAsAddress() + + fp = strip_pointer(target, fp ^ munge_token) + lr = strip_pointer(target, lr ^ munge_token) + sp = strip_pointer(target, sp ^ munge_token) + return (fp, lr, sp) + +def get_fiber_info(debugger, fiber_value): + """Retrieves information about a fiber from its SBValue address.""" + if not fiber_value or not fiber_value.IsValid(): + return None + + fiber_address = fiber_value.GetValueAsAddress() + + fiber_id_value = fiber_value.GetChildMemberWithName('id') + fiber_id_state = fiber_value.GetChildMemberWithName('state') + stack_bottom_value = fiber_value.GetChildMemberWithName('stack_bottom') + env_value = fiber_value.GetChildMemberWithName('env') + if not fiber_id_value.IsValid() or not fiber_id_state.IsValid() or not stack_bottom_value.IsValid() or not env_value.IsValid(): + print(f"Error reading fiber memory") + return None + + fiber_id = fiber_id_value.GetValueAsUnsigned() + fiber_state = fiber_id_state.GetValueAsUnsigned() + stack_bottom = stack_bottom_value.GetValueAsAddress() + env_address = env_value.AddressOf().GetValueAsAddress() + + return { + "id": fiber_id, + "address": fiber_address, + "state": fiber_state, + "state_str": fiber_state_to_string(fiber_state), + "stack_bottom": stack_bottom, + "env_address": env_address + } + +def print_stack_trace_from_jmp_buf(debugger, fiber_info, result, arch): + """Prints a stack trace by manually walking the stack.""" + target = debugger.GetSelectedTarget() + process = target.GetProcess() + env_address = fiber_info["env_address"] + error = lldb.SBError() + addr_size = target.GetAddressByteSize() + + if arch == "x86_64": + result.AppendMessage(f" Error: Register printing is not supported on x86_64.") + return + + elif arch == "arm64": + FP_OFFSET = 80 + LR_OFFSET = 88 + SP_OFFSET = 96 + + fp = process.ReadPointerFromMemory(env_address + FP_OFFSET, error) + lr = process.ReadPointerFromMemory(env_address + LR_OFFSET, error) + sp = process.ReadPointerFromMemory(env_address + SP_OFFSET, error) + + if error.Fail(): + result.AppendMessage(f" Error: Could not read registers from jmp_buf: {error}") + return + + strip_res = strip_fp_lr_sp(process, target, fp, lr, sp) + if strip_res is None: + result.AppendMessage(f" Error: Could not strip FP LR or SP") + return + fp, lr, sp = strip_res + + result.AppendMessage(f" Stack trace for fiber {fiber_info['id']} (manual backtrace):") + + for i in range(10): # Limit to 10 frames for simplicity + symbol_context = target.ResolveSymbolContextForAddress(lldb.SBAddress(lr, target), lldb.eSymbolContextEverything) + symbol = symbol_context.GetSymbol() + if symbol: + symbol_name = symbol.GetName() + else: + symbol_name = "unknown" + result.AppendMessage(f" #{i}: 0x{lr:x} {symbol_name}") + + next_fp = process.ReadPointerFromMemory(fp, error) + if error.Fail(): + result.AppendMessage(f" Error: Could not read next FP from memory: {error}") + break + + next_lr = process.ReadPointerFromMemory(fp + 8, error) # read next LR from the stack using current SP + if error.Fail(): + result.AppendMessage(f" Error: Could not read next LR from memory: {error}") + break + + if next_lr == 0: + result.AppendMessage(" End of stack or 
error reading memory.") + break + + next_lr = strip_pointer(target, next_lr) + lr = next_lr + fp = next_fp + + else: + result.AppendMessage(f" Error: Unsupported architecture: {arch}") + return + + +def list_fibers_all(debugger, command, result, internal_dict, arch): + """Lists all existing fibers.""" + list_fibers_from_queue(debugger, command, result, internal_dict, "fibers_existing_queue", "All Existing Fibers", arch) + +def list_fibers_ready(debugger, command, result, internal_dict, arch): + """Lists fibers in the run queue (now called 'ready').""" + list_fibers_from_queue(debugger, command, result, internal_dict, "fibers_run_queue", "Ready Fibers (Run Queue)", arch) + +def list_fibers_from_queue(debugger, command, result, internal_dict, queue_name, title, arch): + """Lists fibers from a specified queue.""" + + target = debugger.GetSelectedTarget() + + queue_var = target.FindFirstGlobalVariable(queue_name) + if not queue_var.IsValid(): + result.SetError(f"Could not find '{queue_name}' global variable.") + return + + result.AppendMessage(f"{title}:") + result.AppendMessage("-------") + + queue_top_value = queue_var.GetChildMemberWithName('top') + if not queue_top_value.IsValid(): + result.SetError(f"Could not find '{queue_name}.top' field.") + return + + fiber_value = queue_top_value + while fiber_value and fiber_value.IsValid(): + fiber = get_fiber_info(debugger, fiber_value) + if fiber: + result.AppendMessage(f" ID: {fiber['id']}, Address: 0x{fiber['address']:x}, State: {fiber['state_str']}, Stack Bottom: 0x{fiber['stack_bottom']:x}") + try: + print_stack_trace_from_jmp_buf(debugger, fiber, result, arch) # Optional: Add stack traces + except Exception as err: + result.AppendMessage(f"Error: failed to get a stacktrace: {err}") + break + else: + result.AppendMessage(f"Warning: Could not read fiber at address 0x{fiber_value.GetValueAsUnsigned():x}") + break + + if queue_name == "fibers_existing_queue": + next_fiber_value = fiber_value.GetChildMemberWithName('next_existing') + else: + next_fiber_value = fiber_value.GetChildMemberWithName('next') + + if not next_fiber_value.IsValid(): + break + + fiber_value = next_fiber_value + +def get_current_fiber_info(debugger, command, result, internal_dict, arch): + """Gets and prints information about the current fiber.""" + target = debugger.GetSelectedTarget() + + fibers_current_var = target.FindFirstGlobalVariable("fibers_current") + if not fibers_current_var.IsValid(): + result.SetError("Could not find 'fibers_current' global variable.") + return + + current_fiber = get_fiber_info(debugger, fibers_current_var) + + if not current_fiber: + result.AppendMessage("No current fiber.") + return + + result.AppendMessage("Current Fiber Information:") + result.AppendMessage("--------------------------") + result.AppendMessage(f" ID: {current_fiber['id']}") + result.AppendMessage(f" Address: 0x{current_fiber['address']:x}") + result.AppendMessage(f" State: {current_fiber['state_str']}") + result.AppendMessage(f" Stack Bottom: 0x{current_fiber['stack_bottom']:x}") + try: + print_stack_trace_from_jmp_buf(debugger, current_fiber, result, arch) # Optional: Add stack traces + except Exception as err: + print(f"Error: failed to get a stacktrace: {err}") + +def print_fiber_registers(debugger, command, result, internal_dict, arch, fiber_id=None): + """Prints the registers of a specified fiber.""" + target = debugger.GetSelectedTarget() + process = target.GetProcess() + + if fiber_id is None: + fibers_current_var = target.FindFirstGlobalVariable("fibers_current") + 
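+        # No explicit fiber ID: report on the fiber recorded in the 'fibers_current' global, the same lookup the fibers_current command uses.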
if not fibers_current_var.IsValid(): + result.SetError("Could not find 'fibers_current' global variable.") + return + + current_fiber = get_fiber_info(debugger, fibers_current_var) + if not current_fiber: + result.AppendMessage("No current fiber.") + return + + else: + # find the specified fiber in the existing queue + fiber_address = None + existing_queue_var = target.FindFirstGlobalVariable("fibers_existing_queue") + if not existing_queue_var.IsValid(): + result.SetError("Could not find 'fibers_existing_queue' global variable.") + return + + queue_top_value = existing_queue_var.GetChildMemberWithName('top') + if not queue_top_value.IsValid(): + result.SetError(f"Could not find '{existing_queue_var.GetName()}.top' field.") + return + + fiber_value = queue_top_value + while fiber_value and fiber_value.IsValid(): + temp_fiber = get_fiber_info(debugger, fiber_value) + if temp_fiber and temp_fiber['id'] == int(fiber_id): + current_fiber = temp_fiber + break + + next_fiber_value = fiber_value.GetChildMemberWithName('next_existing') + if not next_fiber_value.IsValid(): + break + + fiber_value = next_fiber_value + + if not current_fiber: + result.AppendMessage(f"Fiber with ID {fiber_id} not found.") + return + + env_address = current_fiber["env_address"] + error = lldb.SBError() + addr_size = target.GetAddressByteSize() + + if arch == "x86_64": + result.AppendMessage(f" Error: Register printing is not supported on x86_64.") + return + + elif arch == "arm64": + # reference: libplatform src/setjmp/arm64/setjmp.s __longjmp + X19_OFFSET = 0 + X20_OFFSET = 8 + X21_OFFSET = 16 + X22_OFFSET = 24 + X23_OFFSET = 32 + X24_OFFSET = 40 + X25_OFFSET = 48 + X26_OFFSET = 56 + X27_OFFSET = 64 + X28_OFFSET = 72 + + FP_OFFSET = 80 + LR_OFFSET = 88 + SP_OFFSET = 96 + + x19 = process.ReadPointerFromMemory(env_address + X19_OFFSET, error) + x20 = process.ReadPointerFromMemory(env_address + X20_OFFSET, error) + x21 = process.ReadPointerFromMemory(env_address + X21_OFFSET, error) + x22 = process.ReadPointerFromMemory(env_address + X22_OFFSET, error) + x23 = process.ReadPointerFromMemory(env_address + X23_OFFSET, error) + x24 = process.ReadPointerFromMemory(env_address + X24_OFFSET, error) + x25 = process.ReadPointerFromMemory(env_address + X25_OFFSET, error) + x26 = process.ReadPointerFromMemory(env_address + X26_OFFSET, error) + x27 = process.ReadPointerFromMemory(env_address + X27_OFFSET, error) + x28 = process.ReadPointerFromMemory(env_address + X28_OFFSET, error) + + fp = process.ReadPointerFromMemory(env_address + FP_OFFSET, error) + lr = process.ReadPointerFromMemory(env_address + LR_OFFSET, error) + sp = process.ReadPointerFromMemory(env_address + SP_OFFSET, error) + + if error.Fail(): + result.AppendMessage(f" Error: Could not read registers from jmp_buf: {error}") + return + + strip_res = strip_fp_lr_sp(process, target, fp, lr, sp) + if strip_res is None: + result.AppendMessage(f" Error: Could not strip FP LR or SP") + return + fp, lr, sp = strip_res + + result.AppendMessage(f"Fiber {current_fiber['id']} Registers (arm64):") + result.AppendMessage("-----------------------------") + result.AppendMessage(f" X19: 0x{x19:x}") + result.AppendMessage(f" X20: 0x{x20:x}") + result.AppendMessage(f" X21: 0x{x21:x}") + result.AppendMessage(f" X22: 0x{x22:x}") + result.AppendMessage(f" X23: 0x{x23:x}") + result.AppendMessage(f" X24: 0x{x24:x}") + result.AppendMessage(f" X25: 0x{x25:x}") + result.AppendMessage(f" X26: 0x{x26:x}") + result.AppendMessage(f" X27: 0x{x27:x}") + result.AppendMessage(f" X28: 0x{x28:x}") + 
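+        # x19-x28 are read straight from the jmp_buf; fp/lr/sp were munged (XORed with the thread's munge token) and PAC-signed by setjmp, so strip_fp_lr_sp() above decoded them before printing.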
result.AppendMessage(f" LR: 0x{lr:x}") + result.AppendMessage(f" FP: 0x{fp:x}") + result.AppendMessage(f" SP: 0x{sp:x}") + else: + result.AppendMessage(f" Error: Unsupported architecture: {arch}") + return + +arch = None + +def list_fibers_all_cmd(debugger, command, result, internal_dict): + list_fibers_all(debugger, command, result, internal_dict, arch) + +def list_fibers_ready_cmd(debugger, command, result, internal_dict): + list_fibers_ready(debugger, command, result, internal_dict, arch) + +def get_current_fiber_info_cmd(debugger, command, result, internal_dict): + get_current_fiber_info(debugger, command, result, internal_dict, arch) + +def print_fiber_registers_cmd(debugger, command, result, internal_dict): + """Prints the registers of a specified fiber.""" + args = command.split() + fiber_id = None + if len(args) > 0: + try: + fiber_id = int(args[0]) + except ValueError: + result.SetError("Invalid fiber ID. Please provide an integer.") + return + + print_fiber_registers(debugger, command, result, internal_dict, arch, fiber_id) + +def __lldb_init_module(debugger, internal_dict): + global arch + """LLDB calls this function to load the script.""" + + target = debugger.GetSelectedTarget() + platform = target.GetPlatform() + if platform: + platform_name = platform.GetTriple() + if "x86_64" in platform_name: + arch = "x86_64" + elif "arm64" in platform_name or "aarch64" in platform_name: + arch = "arm64" + else: + print(f"Warning: Unsupported architecture: {platform_name}. Stack traces may not work.") + arch = "unknown" + else: + print("Warning: Could not get platform information. Stack traces may not work.") + arch = "unknown" + + debugger.HandleCommand('command script add -f fibers_lldb.list_fibers_all_cmd fibers_all') + debugger.HandleCommand('command script add -f fibers_lldb.list_fibers_ready_cmd fibers_ready') + debugger.HandleCommand('command script add -f fibers_lldb.get_current_fiber_info_cmd fibers_current') + debugger.HandleCommand('command script add -f fibers_lldb.print_fiber_registers_cmd fibers_regs') + print("The 'fibers_all', 'fibers_ready', 'fibers_current', and 'fibers_regs' commands have been added.") + print(f"Detected architecture: {arch}") diff --git a/tests/unit/tools/generate_ut_proj.py b/tests/unit/tools/generate_ut_proj.py new file mode 100755 index 000000000..628d112e8 --- /dev/null +++ b/tests/unit/tools/generate_ut_proj.py @@ -0,0 +1,757 @@ +#!/usr/bin/env python3 +import json +import argparse +import os +import pathlib +import xml.etree.ElementTree as ET +import uuid + +# This scripts takes a compile_commands.json file that was generated using `make -C tests/unit cmds_json` +# and creates project files for an IDE that can be used for debugging user-space unit-tests +# The project is not able to build XNU or the test executable + +SRC_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) + +TESTS_UNIT_PREFIX = "tests/unit/" +TESTS_UNIT_BUILD_PREFIX = TESTS_UNIT_PREFIX + "build/sym/" + +def parse_command(entry): + file = entry['file'] + directory = entry["directory"] + if not file.startswith(SRC_ROOT): + full_file = directory + "/" + file + else: + full_file = file + assert full_file.startswith(SRC_ROOT), "unexpected path" + full_file + rel_file = full_file[len(SRC_ROOT)+1:] + + # arguments[0] is clang + args = entry['arguments'][1:] + + args.extend(['-I', directory]) + return rel_file, args + +# -------------------------------------- Xcode project ---------------------------------------- +# an Xcode project is a plist with a list 
of objects. each object has an ID and objects reference +# each other by their ID. + +def do_quote_lst(dash_split): + output = [] + # change ' -DX=y z' to ' -DX="y z"' + for i, s in enumerate(dash_split): + if i == 0: + continue # skip the clang executable + if '=' in s: + st = s.strip() + eq_sp = st.split('=') + if ' ' in eq_sp[1]: + output.append(f'{eq_sp[0]}=\\"{eq_sp[1]}\\"') + continue + + output.append(f"{s}") + return " ".join(output) + +class ObjType: + def __init__(self, idprefix, type_name): + self.type_name = type_name + self.id_prefix = idprefix + self.next_count = 1 + def make_id(self): + id = f"{self.id_prefix:016d}{self.next_count:08d}" + self.next_count += 1 + return id + +class ObjRegistry: + def __init__(self): + self.types = {} # map type-name to id-prefix (12 chars) + self.next_type_prefix = 1 + + self.objects = {} # map object-id to instance + + def register(self, type_name, obj): + if type_name not in self.types: + self.types[type_name] = ObjType(self.next_type_prefix, type_name) + self.next_type_prefix += 1 + id = self.types[type_name].make_id() + self.objects[id] = obj + return id + + +obj_reg = ObjRegistry() + +TYPE_SOURCE_C = "sourcecode.c.c" +TYPE_SOURCE_CPP = "sourcecode.cpp.cpp" +TYPE_SOURCE_ASM = "sourcecode.asm" +TYPE_HEADER = "sourcecode.c.h" +TYPE_STATIC_LIB = "archive.ar" +TYPE_EXE = '"compiled.mach-o.executable"' + +class ObjList: + def __init__(self, name=None): + self.name = name + self.objs = [] + def add(self, obj): + self.objs.append(obj) + def extend(self, lst): + self.objs.extend(lst) + +def tab(count): + return '\t' * count + +# The top-level object list is special in that it's grouped by the type of objects +# This class represents part of the top level objects list +class TopObjList(ObjList): + def write(self, out, lvl): + out.write(f"/* Begin {self.name} section */\n") + for obj in self.objs: + out.write(f"{tab(lvl)}{obj.id} = ") + obj.write(out, lvl) + out.write(f"/* End {self.name} section */\n\n") + +# a property that is serilized as a list of ids +class IdList(ObjList): + def write(self, out, lvl): + out.write("(\n") # after = + for obj in self.objs: + out.write(f"{tab(lvl+1)}{obj.id} /* {obj.name} */,\n") + out.write(f"{tab(lvl)});\n") + +class StrList: + def __init__(self, lst): + self.lst = lst + def write(self, out, lvl): + out.write("(\n") # after = + for v in self.lst: + out.write(f"{tab(lvl+1)}{v},\n") + out.write(f"{tab(lvl)});\n") + @classmethod + def list_sort_quote(cls, s): + l = list(s) + l.sort() + return cls([f'"{d}"' for d in l]) + +class StrEval: + def __init__(self, fn): + self.fn = fn + def write(self, out, lvl): + out.write(self.fn() + ";\n") +class LateEval: + def __init__(self, fn): + self.fn = fn + def write(self, out, lvl): + self.fn().write(out, lvl) + +class PDict: + def __init__(self, isa, inline=False): + self.d = {} + self.p = [] + self.inline = inline + if isa is not None: + self.isa = self.padd("isa", isa) + + def padd(self, k, v, comment=None): + self.p.append((k, v, comment)) + self.d[k] = v + return v + def pextend(self, d): + for k, v in d.items(): + self.padd(k, v) + + def write(self, out, lvl): + if self.inline: + out.write("{") + for k, v, comment in self.p: + assert isinstance(v, str) or isinstance(v, int), "complex value inline" + out.write(f"{k} = ") + if comment is None: + out.write(f"{v}; ") + else: + out.write(f"{v} /* {comment} */; ") + out.write("};\n") + else: + out.write("{\n") # comes after = + for k, v, comment in self.p: + out.write(f"{tab(lvl+1)}{k} = ") + if isinstance(v, str) or 
isinstance(v, int): + if comment is None: + out.write(f"{v};\n") + else: + out.write(f"{v} /* {comment} */;\n") + else: + v.write(out, lvl+1) + out.write(f"{tab(lvl)}}};\n") + + +class File: + def __init__(self, name, args): + self.name = name.split('/')[-1] + self.args = args + self.ref = None + + def type_str(self): + ext = os.path.splitext(self.name)[1] + if ext == ".c": + return TYPE_SOURCE_C + if ext == ".h": + return TYPE_HEADER + if ext == ".cpp": + return TYPE_SOURCE_CPP + if ext == ".a": + return TYPE_STATIC_LIB + if ext == ".s": + return TYPE_SOURCE_ASM + if ext == '': + return TYPE_EXE + return None + +class BuildFile(PDict): + def __init__(self, file): + PDict.__init__(self, "PBXBuildFile", inline=True) + self.id = obj_reg.register("build_file", self) + self.file = file + self.name = file.name + self.padd("fileRef", self.file.ref.id, comment=self.file.name) + +class FileRef(PDict): + def __init__(self, file): + PDict.__init__(self, "PBXFileReference", inline=True) + self.id = obj_reg.register("file_ref", self) + self.file = file + file.ref = self + typ = self.file.type_str() + assert typ is not None, "unknown file type " + self.file.name + if typ == TYPE_STATIC_LIB or typ == TYPE_EXE: + self.padd("explicitFileType", typ) + self.padd("includeInIndex", 0) + self.padd("path", f'"{self.file.name}"') + self.padd("sourceTree", "BUILT_PRODUCTS_DIR") + else: + self.padd("lastKnownFileType", typ) + self.padd("path", f'"{self.file.name}"') + self.padd("sourceTree", '""') + + @property + def name(self): + return self.file.name + +class Group(PDict): + def __init__(self, name=None, path=None): + PDict.__init__(self, "PBXGroup") + self.id = obj_reg.register("group", self) + self.children = self.padd("children", IdList()) + self.child_dict = {} # map name to Group/FileRef + if name is not None: + self.name = self.padd("name", name) + if path is not None: + self.name = self.padd("path", f'"{path}"') + self.padd("sourceTree", '""') + + def rec_add(self, sp_path, groups_lst, file_ref): + elem = sp_path[0] + if len(sp_path) == 1: + assert elem not in self.child_dict, f"already have file elem {elem} in {self.name}" + self.children.add(file_ref) + self.child_dict[elem] = file_ref + #file_ref.file.name = elem # remove the path from the name + else: + if elem in self.child_dict: + g = self.child_dict[elem] + else: + g = Group(path=elem) + groups_lst.add(g) + self.children.add(g) + self.child_dict[elem] = g + g.rec_add(sp_path[1:], groups_lst, file_ref) + + def sort(self): + self.children.objs.sort(key=lambda x: x.name) + for elem in self.children.objs: + if isinstance(elem, Group): + elem.sort() + +class BuildPhase(PDict): + def __init__(self, isa, name): + PDict.__init__(self, isa) + self.id = obj_reg.register("build_phase", self) + self.name = name + self.padd("buildActionMask", 2147483647) + self.files = self.padd("files", IdList()) + self.padd("runOnlyForDeploymentPostprocessing", 0) + +class Target(PDict): + def __init__(self, name, file_ref, cfg_lst, prod_type): + PDict.__init__(self, "PBXNativeTarget") + self.id = obj_reg.register("target", self) + self.cfg_lst = self.padd("buildConfigurationList", cfg_lst.id) + self.build_phases = self.padd("buildPhases", IdList()) + self.padd("buildRules", IdList()) + self.padd("dependencies", IdList()) + self.name = self.padd("name", name) + self.padd("packageProductDependencies", IdList()) + self.padd("productName", name) + self.padd("productReference", file_ref.id, comment=file_ref.name) + self.padd("productType", prod_type) + +class CfgList(PDict): + 
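+    # An XCConfigurationList: a named list of build configurations referenced by the project or a target.
+    # It serializes roughly as (illustrative, not verbatim output): <id> = {isa = XCConfigurationList; buildConfigurations = (<cfg-id> /* Release */,); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; };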
def __init__(self, name): + PDict.__init__(self, "XCConfigurationList") + self.id = obj_reg.register("config_list", self) + self.name = name # not used + self.configs = self.padd("buildConfigurations", IdList()) + self.padd("defaultConfigurationIsVisible", 0) + self.padd("defaultConfigurationName", StrEval(lambda: self.configs.objs[0].name)) + +class Config(PDict): + def __init__(self, name): + PDict.__init__(self, "XCBuildConfiguration") + self.id = obj_reg.register("config", self) + self.settings = self.padd("buildSettings", PDict(None)) + self.name = self.padd("name", name) + +class Project(PDict): + def __init__(self, cfg_lst, group_main, group_prod): + PDict.__init__(self, "PBXProject") + self.id = obj_reg.register("project", self) + self.targets = IdList("targets") + self.padd("attributes", LateEval(lambda: self.make_attr())) + self.padd("buildConfigurationList", cfg_lst.id, comment=cfg_lst.name) + self.padd("developmentRegion", "en") + self.padd("hasScannedForEncodings", "0") + self.padd("knownRegions", StrList(["en", "Base"])) + self.padd("mainGroup", group_main.id) + self.padd("minimizedProjectReferenceProxies", "1") + self.padd("preferredProjectObjectVersion", "77") + self.padd("productRefGroup", group_prod.id) + self.padd("projectDirPath", '""') + self.padd("projectRoot", '""') + self.padd("targets", self.targets) + + def make_attr(self): + a = PDict(None) + a.padd("BuildIndependentTargetsInParallel", 1) + a.padd("LastUpgradeCheck", 1700) + ta = a.padd("TargetAttributes", PDict(None)) + for t in self.targets.objs: + p = ta.padd(t.id, PDict(None)) + p.padd("CreatedOnToolsVersion", "17.0") + return a + + +class PbxProj: + def __init__(self): + self.top_obj = [] + self.build_files = self.add_top(TopObjList("PBXBuildFile")) + self.file_refs = self.add_top(TopObjList("PBXFileReference")) + self.groups = self.add_top(TopObjList("PBXGroup")) + self.build_phases = self.add_top(TopObjList("build phases")) + self.targets = self.add_top(TopObjList("PBXNativeTarget")) + self.projects = self.add_top(TopObjList("PBXProject")) + self.configs = self.add_top(TopObjList("XCBuildConfiguration")) + self.config_lists = self.add_top(TopObjList("XCConfigurationList")) + + self.group_main = self.add_group(Group()) + self.group_products = self.add_group(Group(name="Products")) + self.group_main.children.add(self.group_products) + + self.cfg_prod_release = self.add_config(Config("Release")) + self.cfg_prod_release.settings.pextend({"SDKROOT": "macosx", + "MACOSX_DEPLOYMENT_TARGET": "14.1", + }) + self.proj_cfg_lst = self.add_cfg_lst(CfgList("proj config list")) + self.proj_cfg_lst.configs.add(self.cfg_prod_release) + + self.root_proj = Project(self.proj_cfg_lst, self.group_main, self.group_products) + self.projects.add(self.root_proj) + + self.test_exec = [] + + def add_top(self, t): + self.top_obj.append(t) + return t + def add_group(self, g): + self.groups.add(g) + return g + def add_build_phase(self, p): + self.build_phases.add(p) + return p + def add_config(self, c): + self.configs.add(c) + return c + def add_cfg_lst(self, c): + self.config_lists.add(c) + return c + def add_target(self, t): + self.targets.add(t) + return t + + def add_xnu_archive(self): + f = File("libkernel.a", []) + fr = FileRef(f) + self.file_refs.add(fr) + self.group_products.children.add(fr) + self.xnu_phase_headers = self.add_build_phase(BuildPhase("PBXHeadersBuildPhase", "Headers")) + self.xnu_phase_sources = self.add_build_phase(BuildPhase("PBXSourcesBuildPhase", "Sources")) + + cfg_xnu_release = 
self.add_config(Config("Release")) + cfg_xnu_release.settings.pextend( { "CODE_SIGN_STYLE": "Automatic", + "EXECUTABLE_PREFIX": "lib", + "PRODUCT_NAME": '"$(TARGET_NAME)"', + "SKIP_INSTALL": "YES"}) + xnu_cfg_lst = self.add_cfg_lst(CfgList("target config list")) + xnu_cfg_lst.configs.add(cfg_xnu_release) + + target = self.add_target(Target("xnu_static_lib", fr, xnu_cfg_lst, '"com.apple.product-type.library.static"')) + target.build_phases.extend([self.xnu_phase_headers, self.xnu_phase_sources]) + self.root_proj.targets.add(target) + + def add_test_target(self, c_file_ref, c_build_file): + name = os.path.splitext(os.path.split(c_file_ref.name)[1])[0] + f = File(name, []) + fr = FileRef(f) + self.file_refs.add(fr) + self.group_products.children.add(fr) + phase_h = self.add_build_phase(BuildPhase("PBXHeadersBuildPhase", "Headers")) + phase_src = self.add_build_phase(BuildPhase("PBXSourcesBuildPhase", "Sources")) + phase_src.files.add(c_build_file) + + cfg_release = self.add_config(Config("Release")) + cfg_release.settings.pextend( { "CODE_SIGN_STYLE": "Automatic", + "PRODUCT_NAME": '"$(TARGET_NAME)"'}) + cfg_lst = self.add_cfg_lst(CfgList("target config list")) + cfg_lst.configs.add(cfg_release) + + target = self.add_target(Target(name, fr, cfg_lst, '"com.apple.product-type.tool"')) + target.build_phases.extend([phase_h, phase_src]) + self.root_proj.targets.add(target) + self.test_exec.append(target) + + def add_file(self, file_path, flags): + f = File(file_path, flags) + fr = FileRef(f) + bf = BuildFile(f) + self.build_files.add(bf) + self.file_refs.add(fr) + self.group_main.rec_add(file_path.split('/'), self.groups, fr) + typ = f.type_str() + if typ == TYPE_HEADER: + self.xnu_phase_headers.files.add(bf) + elif typ in [TYPE_SOURCE_C, TYPE_SOURCE_CPP, TYPE_SOURCE_ASM]: + self.xnu_phase_sources.files.add(bf) + return fr, bf + def add_ccj(self, ccj): + test_targets = [] + for entry in ccj: + src_file, flags = parse_command(entry) + if src_file.endswith('dt_proxy.c'): + continue + fr, bf = self.add_file(src_file, flags) + if src_file.startswith(TESTS_UNIT_PREFIX): + test_targets.append((fr, bf)) + test_targets.sort(key=lambda x:x[1].name) + for fr, bf in test_targets: + self.add_test_target(fr, bf) + + def add_headers(self): + for path in pathlib.Path(SRC_ROOT).rglob('*.h'): + full_file = str(path) + assert full_file.startswith(SRC_ROOT), "unexpected path" + full_file + rel_file = full_file[len(SRC_ROOT)+1:] + self.add_file(str(rel_file), None) + + def sort_groups(self): + self.group_main.sort() + + def write(self, out): + out.write("// !$*UTF8*$!\n{\n") + out.write("\tarchiveVersion = 1;\n\tclasses = {\n\t};\n\tobjectVersion = 77;\n\tobjects = {\n\n") + for t in self.top_obj: + t.write(out, 2) + out.write(f"\t}};\n\trootObject = {self.root_proj.id};\n") + out.write("}") + + def make_settings(self): + # go over all build files and find in their arguments a union of all the included folders + # this is useful for file navigation in xcode to work correctly + inc_dirs = set() + common_defines = None + for f in self.build_files.objs: + file_defines = set() + args = f.file.args + if args is None: + continue + for i, arg in enumerate(args): + if arg == '-I': + d = args[i + 1] + if d != ".": + inc_dirs.add(args[i + 1]) + elif arg == '-D': + file_defines.add(args[i+1]) + if common_defines is None: + common_defines = file_defines + else: + common_defines = common_defines.intersection(file_defines) + inc_str_lst = StrList.list_sort_quote(inc_dirs) + 
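+        # The same sorted, quoted include list feeds both HEADER_SEARCH_PATHS and SYSTEM_HEADER_SEARCH_PATHS so Xcode can index and navigate kernel headers even though this project never builds them.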
self.cfg_prod_release.settings.padd("HEADER_SEARCH_PATHS", inc_str_lst) + self.cfg_prod_release.settings.padd("SYSTEM_HEADER_SEARCH_PATHS", inc_str_lst) + str_common_defs = StrList.list_sort_quote(common_defines) + self.cfg_prod_release.settings.padd("GCC_PREPROCESSOR_DEFINITIONS", str_common_defs) + + def write_schemes(self, folder, container_dir): + for target in self.test_exec: + path = os.path.join(folder, target.name + ".xcscheme") + out = open(path, "w") + exec_path = SRC_ROOT + "/" + TESTS_UNIT_BUILD_PREFIX + target.name + out.write(f''' + + + + + + + + + + + + + + + + + + +''') + print(f"Wrote {path}") + +def gen_xcode(ccj): + p = PbxProj() + p.add_xnu_archive() + p.add_ccj(ccj) + p.add_headers() + p.sort_groups() + p.make_settings() + + output = os.path.join(SRC_ROOT, "ut_xnu_proj.xcodeproj") + os.makedirs(output, exist_ok=True) + proj_path = os.path.join(output, "project.pbxproj") + p.write(open(proj_path, "w")) + print(f'wrote file: {proj_path};') + + schemes_dir = output + "/xcshareddata/xcschemes" + os.makedirs(schemes_dir, exist_ok=True) + p.write_schemes(schemes_dir, output) + print(f'wrote schemes to: {schemes_dir}') + +# -------------------------------------- VSCode launch targets ---------------------------------------- + +class TargetsProject: + def __init__(self): + self.targets = [] + + def add_ccj(self, ccj): + for entry in ccj: + src_file, flags = parse_command(entry) + if src_file.startswith(TESTS_UNIT_PREFIX): + name = os.path.splitext(src_file[len(TESTS_UNIT_PREFIX):])[0] + self.targets.append(name) + self.targets.sort() + +class VsCodeLaunchJson(TargetsProject): + def write(self, f): + confs = [] + launch = {"version": "0.2.0", "configurations": confs } + for t in self.targets: + confs.append({ + "name": t, + "type": "lldb-dap", + "request": "launch", + "program": "${workspaceFolder}/" + TESTS_UNIT_BUILD_PREFIX + t, + "stopOnEntry": False, + "cwd": "${workspaceFolder}", + "args": [], + "env": [] + }) + json.dump(launch, f, indent=4) + + +def gen_vscode(ccj): + p = VsCodeLaunchJson() + p.add_ccj(ccj) + + output = os.path.join(SRC_ROOT, ".vscode/launch.json") + os.makedirs(os.path.join(SRC_ROOT, ".vscode"), exist_ok=True) + if os.path.exists(output): + print(f"deleting existing {output}") + os.unlink(output) + p.write(open(output, "w")) + print(f"wrote {output}") + +# -------------------------------------- CLion targets ---------------------------------------- + +def find_elem(root, tag, **kvarg): + assert len(kvarg.items()) == 1 + key, val = list(kvarg.items())[0] + for child in root: + assert child.tag == tag, f'unexpected child.tag {child.tag}' + if child.attrib[key] == val: + return child + return None + +def get_elem(root, tag, **kvarg): + child = find_elem(root, tag, **kvarg) + key, val = list(kvarg.items())[0] + if child is not None: + return child, False + comp = ET.SubElement(root, tag) + comp.attrib[key] = val + return comp, True + + +CLION_TOOLCHAIN_NAME = "System" +class CLionProject(TargetsProject): + def _get_root(self, path): + if os.path.exists(path): + print(f"Parsing existing file {path}") + root = ET.parse(path).getroot() + assert root.tag == 'project', f'unexpected root.tag {root.tag}' + else: + root = ET.Element('project') + root.attrib["version"] = "4" + return root + + def _write(self, root, path): + tree = ET.ElementTree(root) + ET.indent(tree, space=' ', level=0) + tree.write(open(path, "wb"), encoding="utf-8", xml_declaration=True) + print(f"Wrote {path}") + + def make_custom_targets(self): + # add a target that uses toolchain "System" + 
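+        # Reuse an existing TOOL-type target bound to the 'System' toolchain if customTargets.xml already has one; otherwise create it, so the run configurations added later have a target/configuration pair to reference.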
path = os.path.join(SRC_ROOT, ".idea/customTargets.xml") + root = self._get_root(path) + comp, _ = get_elem(root, "component", name="CLionExternalBuildManager") + # check if we already have the target we need + for target in comp: + if target.attrib["defaultType"] == "TOOL": + target_name = target.attrib["name"] + if len(target) == 1 and target[0].tag == "configuration": + conf = target[0] + if conf.attrib["toolchainName"] == CLION_TOOLCHAIN_NAME: + conf_name = conf.attrib["name"] + print(f"file {path} already has the needed target with name {target_name},{conf_name}") + return target_name, conf_name # it already exists, nothing to do + # add a new target + target_name = "test_default" + conf_name = "test_default" + + target = ET.SubElement(comp, "target") + target.attrib["id"] = str(uuid.uuid1()) + target.attrib["name"] = target_name + target.attrib["defaultType"] = "TOOL" + + conf = ET.SubElement(target, "configuration") + conf.attrib["id"] = str(uuid.uuid1()) + conf.attrib["name"] = conf_name + conf.attrib["toolchainName"] = CLION_TOOLCHAIN_NAME + print(f"Created target named {target_name}") + self._write(root, path) + return target_name, conf_name + + def add_to_workspace(self, target_name, conf_name): + path = os.path.join(SRC_ROOT, ".idea/workspace.xml") + root = self._get_root(path) + comp, _ = get_elem(root, "component", name="RunManager") + added_anything = False + for t in self.targets: + for conf in comp: + if conf.tag != "configuration": + continue + if conf.attrib["name"] == t: # already has this target + print(f"Found existing configuration named '{t}', not adding it") + break + else: + print(f"Adding configuration for '{t}'") + proj_name = os.path.basename(SRC_ROOT) + conf = ET.SubElement(comp, "configuration", name=t, + type="CLionExternalRunConfiguration", + factoryName="Application", + REDIRECT_INPUT="false", + ELEVATE="false", + USE_EXTERNAL_CONSOLE="false", + EMULATE_TERMINAL="false", + PASS_PARENT_ENVS_2="true", + PROJECT_NAME=proj_name, + TARGET_NAME=target_name, + CONFIG_NAME=conf_name, + RUN_PATH=f"$PROJECT_DIR$/{TESTS_UNIT_BUILD_PREFIX}{t}") + ET.SubElement(conf, "method", v="2") + added_anything = True + if added_anything: + self._write(root, path) + + +def gen_clion(ccj): + p = CLionProject() + p.add_ccj(ccj) + + os.makedirs(os.path.join(SRC_ROOT, ".idea"), exist_ok=True) + target_name, conf_name = p.make_custom_targets() + p.add_to_workspace(target_name, conf_name) + + +def main(): + parser = argparse.ArgumentParser(description='Generate xcode project from compile_commands.json') + parser.add_argument('mode', help='IDE to generate for', choices=['xcode', 'vscode', 'clion']) + parser.add_argument('compile_commands', help='Path to compile_commands.json', nargs='*', default=os.path.join(SRC_ROOT, "compile_commands.json")) + args = parser.parse_args() + + if not os.path.exists(args.compile_commands): + print(f"Can't find input {args.compile_commands}") + return 1 + + ccj = json.load(open(args.compile_commands, 'r')) + + if args.mode == 'xcode': + return gen_xcode(ccj) + elif args.mode == 'vscode': + return gen_vscode(ccj) + elif args.mode == 'clion': + return gen_clion(ccj) + + +if __name__ == '__main__': + main() + diff --git a/tests/unit/tools/get_target_details.py b/tests/unit/tools/get_target_details.py new file mode 100755 index 000000000..b6a93ebc5 --- /dev/null +++ b/tests/unit/tools/get_target_details.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +import sys +import subprocess + +# get the strings XNU build-folder strings for the given device +def main(): + 
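+    # argv[1] is the SDK name handed to 'xcrun --sdk', argv[2] a device target type (e.g. j414c); prints '<ARCH> <KernelPlatform> <file name prefix>' on one line.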
sdkroot = sys.argv[1] + target_name = sys.argv[2] # e.g. j414c + query = f"SELECT DISTINCT KernelMachOArchitecture, KernelPlatform, SDKPlatform FROM Targets WHERE TargetType == '{target_name}'" + r = subprocess.check_output(["xcrun", "--sdk", sdkroot, "embedded_device_map", "-query", query], encoding="ascii") + r = r.strip() + if len(r) == 0: + raise Exception(f"target not found {target_name}") + arch, kernel_platform, sdk_platform = r.split("|") + + if arch.startswith("arm64"): # can be arm64, arm64e + arch = "ARM64" + elif arch.startswith("arm"): + arch = "ARM" + else: + raise Exception(f"unsupported arch {arch}") + + if sdk_platform == "macosx": + file_name_prefix = "kernel" + else: + file_name_prefix = "mach" + print(arch + " " + kernel_platform + " " + file_name_prefix) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/unit/tools/make_run_unittests.py b/tests/unit/tools/make_run_unittests.py new file mode 100755 index 000000000..64373845c --- /dev/null +++ b/tests/unit/tools/make_run_unittests.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +import sys +import os + +template = '''#!/bin/zsh +tests=( +TEST_TARGETS +) + +s_dir=${0:A:h} +for file in ${tests[@]}; do + file_path=$s_dir/$file + echo "Running $file_path ..." + if [[ -f $file_path ]]; then + $file_path > /dev/null 2>/dev/null + ret=$? + if [[ $ret -eq 0 ]]; then + print -P "%F{green}*** PASS%f" + else + print -P "%F{red}*** FAILED: $file_path%f" + fi + else + print -P "%F{yellow}*** Missing%f" + fi +done +''' + +def main(): + targets_s = sys.argv[1] + output = sys.argv[2] + output_list = sys.argv[3] + + targets = targets_s.strip().split(' ') + target_lines = '\n'.join([('"' + t + '"') for t in targets]) + s = template.replace('TEST_TARGETS', target_lines) + open(output, 'w').write(s) + print(f"wrote {output}") + + open(output_list, 'w').write('\n'.join(targets) + '\n') + print(f"wrote {output_list}") + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/tests/unit/tools/merge_cmds_json.py b/tests/unit/tools/merge_cmds_json.py new file mode 100755 index 000000000..985176fe6 --- /dev/null +++ b/tests/unit/tools/merge_cmds_json.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +import sys +import os +import glob + +def main(): + xnu_root = sys.argv[1] + xnu_build_dir = sys.argv[2] + tests_obj_dir = sys.argv[3] + + xnu_json = os.path.join(xnu_build_dir, "compile_commands.json") + if not os.path.exists(xnu_json): + print(f"did not find xnu build json: {xnu_json}") + return 0 + root_json = os.path.join(xnu_root, "compile_commands.json") + + if os.path.exists(root_json): + if not os.path.islink(root_json): + print(f"root json is not a symlink, not removing it: {root_json}") + return 0 + + add_text = "" + for filename in glob.glob(os.path.join(tests_obj_dir, "*.json")): + if filename.endswith("compile_commands.json"): + continue + print(f"found {filename}") + text = open(filename).read() + add_text += text + add_text = add_text.rstrip() + if add_text[-1] == ',': + add_text = add_text[:-1] + + if len(add_text) == 0: + print(f"did not find any json files in {tests_obj_dir}") + return 0 + + xnu_j = open(xnu_json).read() + if xnu_j[-3:] != "\n]\n": + print(f"doesn't look like a legit compile_commands.json: {xnu_json}") + return 0 + + xnu_j_mod = xnu_j[:-3] + ",\n\n" + add_text + "]\n" + + tests_json = os.path.join(tests_obj_dir, "compile_commands.json") + open(tests_json, "w").write(xnu_j_mod) + print(f"saved {tests_json}") + + if os.path.exists(root_json): + print(f"removing old link 
{root_json}") + os.unlink(root_json) + os.symlink(tests_json, root_json) + print(f"added link {root_json}") + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/tests/unit/tools/quote_defines.py b/tests/unit/tools/quote_defines.py new file mode 100755 index 000000000..39f6ef3e3 --- /dev/null +++ b/tests/unit/tools/quote_defines.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +import sys + +# read a .CFLAGS file and print the appropriately quoted clang command line arguments +def main(): + in_path = sys.argv[1] + line = open(in_path).read() + # split by " -" (with space) to avoid issue with paths that contain dashes + dash_split = line.split(' -') + output = [] + # change ' -DX=y z' to ' -DX="y z"' + for i, s in enumerate(dash_split): + if i == 0: + continue # skip the clang executable + if '=' in s: + st = s.strip() + eq_sp = st.split('=') + if ' ' in eq_sp[1]: + output.append(f'-{eq_sp[0]}="{eq_sp[1]}"') + continue + + output.append(f"-{s}") + print(" ".join(output)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/unit/tools/sanitizers-ignorelist b/tests/unit/tools/sanitizers-ignorelist new file mode 100644 index 000000000..5b6e8ae63 --- /dev/null +++ b/tests/unit/tools/sanitizers-ignorelist @@ -0,0 +1,30 @@ +#!special-case-list-v1 +# rdar://139815990 + +[*] + +# ARM64 specific blacklist +# __SECURITY_STACK_DISALLOWED_PUSH +fun:sleh_panic_lockdown_should_initiate_el1_sp0_sync + +# [{coverage,thread}] + +# src:*tests/unit/*.c +src:*tests/unit/mocks/*.h +src:*tests/unit/mocks/*.c +src:*tests/unit/mocks/fibers/*.c +src:*tests/unit/mocks/fibers/*.h +# for libmocks.dylib compilation +src:mocks/*.c +src:mocks/*.h +src:mocks/fibers/*.c +src:mocks/fibers/*.h + +# disable ctx switches in printf-like functions +src:*bsd/kern/subr_prf.c +src:*osfmk/kern/printf.c +src:*osfmk/console/serial_general.c +src:*osfmk/console/serial_console.c + +# blacklist str* function, but maybe we want to context switch in there? +src:*osfmk/device/subrs.c \ No newline at end of file diff --git a/tests/unit/tools/xnu_lib.unexport b/tests/unit/tools/xnu_lib.unexport new file mode 100644 index 000000000..9165a80a3 --- /dev/null +++ b/tests/unit/tools/xnu_lib.unexport @@ -0,0 +1,25 @@ +# The symbols in this file are symbols that darwintest main uses from libc and that XNU defines. 
+_gettimeofday +_getpid +_getuid +_geteuid +_setpriority +_open +_write +_dup2 +_exit +_posix_spawn +_task_policy_set +__stack_chk_fail +# used by pthreads +_mach_vm_deallocate +# used by tests +_sleep +# used by mock library +_abort +# used by llvm compiler_rt when creating coverage file +_mkdir +# for debugging +_backtrace +# used by sanitizers +_task_set_exc_guard_behavior diff --git a/tests/unp_connect_thread_uaf.c b/tests/unp_connect_thread_uaf.c index 5120772c6..5a7f79640 100644 --- a/tests/unp_connect_thread_uaf.c +++ b/tests/unp_connect_thread_uaf.c @@ -10,6 +10,8 @@ #include #include +#include "net_test_lib.h" + int g_start = 0; int g_client = 0; int g_sever1 = 0; @@ -141,6 +143,8 @@ test_unp_connect_multithread() close(g_sever2); close(g_sever1); } + + force_zone_gc(); } T_DECL(unp_connect_thread_uaf, "Uaf due to multithreaded unp_connect", T_META_TAG_VM_PREFERRED) diff --git a/tests/unp_sock_release.c b/tests/unp_sock_release.c index 69f6d27cf..7b724a8eb 100644 --- a/tests/unp_sock_release.c +++ b/tests/unp_sock_release.c @@ -39,7 +39,8 @@ T_GLOBAL_META( T_META_ASROOT(true) ); -T_DECL(test_unp_sock_release, "UDS with sock_release()") +T_DECL(test_unp_sock_release, "UDS with sock_release()", + T_META_ENABLED(false) /* rdar://150253879 */) { int fds[2] = { -1, -1 }; struct nfsd_args nfsd_args = { 0 }; diff --git a/tests/unrecoverable_trap_test.c b/tests/unrecoverable_trap_test.c new file mode 100644 index 000000000..fc645907f --- /dev/null +++ b/tests/unrecoverable_trap_test.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2025 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "exc_helpers.h" +#include "test_utils.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("arm"), + T_META_OWNER("p_tennen") + ); + +static size_t +exception_handler_expect_not_called(mach_port_t task __unused, mach_port_t thread __unused, + exception_type_t type __unused, mach_exception_data_t codes __unused) +{ + T_ASSERT_FAIL("kernel ran exception handler instead of terminating process"); + return 0; +} + +static void +signal_handler_expect_not_called(int sig, siginfo_t *sip __unused, void *ucontext __unused) +{ + T_FAIL("kernel dispatched signal handler instead of terminating process"); +} + +T_DECL(uncatchable_fatal_trap_developer_mode_disabled, + "Ensure a maybe-unrecoverable trap label is uncatchable with !developer_mode", + T_META_REQUIRES_SYSCTL_EQ("security.mac.amfi.developer_mode_status", 0), + T_META_ENABLED(TARGET_CPU_ARM64) + ) +{ + /* Given a child process that sets up some mechanisms to catch an exception/signal */ + /* And developer mode is disabled and we're not being debugged */ + pid_t pid = fork(); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork"); + + if (pid == 0) { + /* + * Try to catch the exception in two ways: + * - via setting up a Mach exception handler, and + * - via sigaction + */ + mach_port_t exc_port = create_exception_port(EXC_MASK_ALL); + run_exception_handler(exc_port, (exc_handler_callback_t)exception_handler_expect_not_called); + + struct sigaction sa = { + .sa_sigaction = signal_handler_expect_not_called, + .sa_flags = SA_SIGINFO + }; + sigfillset(&sa.sa_mask); + + T_ASSERT_POSIX_ZERO(sigaction(SIGILL, &sa, NULL), NULL); + + /* When the child issues a maybe-fatal trap label */ + /* 0xB000 is the start of the 'runtimes-owned traps' range in xnu */ + os_fatal_trap(0xB000); + /* The brk above should have been treated as unrecoverable by the kernel */ + T_FAIL("child ran past unrecoverable brk"); + } else { + int status; + int err = waitpid(pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "waitpid"); + + /* Then the child does not have an opportunity to run its exception handlers, and is immediately killed */ + T_EXPECT_TRUE(WIFSIGNALED(status), "child terminated due to signal"); + T_EXPECT_EQ(SIGKILL, WTERMSIG(status), "child terminated due to SIGKILL"); + } +} + +T_DECL(uncatchable_fatal_trap_developer_mode_enabled, + "Ensure an maybe-unrecoverable trap label is uncatchable with developer_mode", + T_META_REQUIRES_SYSCTL_EQ("security.mac.amfi.developer_mode_status", 1), + T_META_ENABLED(TARGET_CPU_ARM64) + ) +{ + /* Given a child process that sets up some mechanisms to catch an exception/signal */ + /* And developer mode is enabled, but we're not being debugged */ + pid_t pid = fork(); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork"); + + if (pid == 0) { + /* + * Try to catch the exception in two ways: + * - via setting up a Mach exception handler, and + * - via sigaction + */ + mach_port_t exc_port = create_exception_port(EXC_MASK_ALL); + run_exception_handler(exc_port, (exc_handler_callback_t)exception_handler_expect_not_called); + + struct sigaction sa = { + .sa_sigaction = signal_handler_expect_not_called, + .sa_flags = SA_SIGINFO + }; + sigfillset(&sa.sa_mask); + + T_ASSERT_POSIX_ZERO(sigaction(SIGILL, &sa, NULL), NULL); + + /* When the child issues a maybe-fatal trap label */ + /* 0xB000 is the 
start of the 'runtimes-owned traps' range in xnu */ + os_fatal_trap(0xB000); + /* The brk above should have been treated as unrecoverable by the kernel */ + T_FAIL("child ran past brk"); + } else { + int status; + int err = waitpid(pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "waitpid"); + + /* Then the child does not have an opportunity to run its exception handlers, and is immediately killed */ + T_EXPECT_TRUE(WIFSIGNALED(status), "child terminated due to signal"); + T_EXPECT_EQ(SIGKILL, WTERMSIG(status), "child terminated due to SIGKILL"); + } +} + +static bool* shared_was_mach_exception_handler_called = NULL; +static bool* shared_was_posix_signal_handler_called = NULL; + +static size_t +exception_handler_expect_called(mach_port_t task __unused, mach_port_t thread __unused, + exception_type_t type __unused, mach_exception_data_t codes __unused) +{ + T_PASS("Our Mach exception handler ran"); + *shared_was_mach_exception_handler_called = true; + exit(0); + return 0; +} + +static void +signal_handler_expect_called(int sig, siginfo_t *sip __unused, void *ucontext __unused) +{ + T_PASS("Our BSD signal handler ran"); + *shared_was_posix_signal_handler_called = true; + exit(0); +} + + +T_DECL(uncatchable_fatal_trap_debugged, + "Ensure an maybe-unrecoverable trap label is catchable under a debugger", + T_META_REQUIRES_SYSCTL_EQ("security.mac.amfi.developer_mode_status", 1), + /* It's not straightforward to ptrace on platforms other than macOS, so don't bother */ + // T_META_ENABLED(TARGET_CPU_ARM64 && TARGET_OS_OSX) + T_META_ENABLED(false) /* rdar://153223014 */ + ) +{ + /* Given a child process that sets up some mechanisms to catch an exception/signal */ + /* And developer mode is enabled, and the child is being debugged */ + int ret; + + const char* memory_path = "uncatchable_fatal_trap_debugged"; + shm_unlink(memory_path); + int shm_fd = shm_open(memory_path, O_RDWR | O_CREAT); + T_ASSERT_POSIX_SUCCESS(shm_fd, "Created shared memory"); + ret = ftruncate(shm_fd, sizeof(bool) * 2); + T_ASSERT_POSIX_SUCCESS(ret, "ftruncate"); + + shared_was_mach_exception_handler_called = (bool*)mmap(NULL, sizeof(bool), PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); + shared_was_posix_signal_handler_called = (bool*)mmap(NULL, sizeof(bool), PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); + bool* has_parent_connected = (bool*)mmap(NULL, sizeof(bool), PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); + *has_parent_connected = false; + + pid_t pid = fork(); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork"); + + if (pid == 0) { + /* Allow the parent to attach */ + while (!*has_parent_connected) { + sleep(1); + } + + /* + * Try to catch the exception in two ways: + * - via setting up a Mach exception handler, and + * - via sigaction + */ + mach_port_t exc_port = create_exception_port(EXC_MASK_ALL); + run_exception_handler(exc_port, (exc_handler_callback_t)exception_handler_expect_called); + + struct sigaction sa = { + .sa_sigaction = signal_handler_expect_called, + .sa_flags = SA_SIGINFO + }; + sigfillset(&sa.sa_mask); + + T_ASSERT_POSIX_ZERO(sigaction(SIGILL, &sa, NULL), NULL); + + /* When the child issues a maybe-fatal trap label */ + /* 0xB000 is the start of the 'runtimes-owned traps' range in xnu */ + os_fatal_trap(0xB000); + /* The brk above should have terminated this thread */ + T_FAIL("child ran past brk"); + } else { + /* Attach to the child so it's marked as being debugged */ + ret = ptrace(PT_ATTACHEXC, pid, 0, 0); + T_EXPECT_POSIX_SUCCESS(ret, "ptrace PT_ATTACHEXC"); + ret = ptrace(PT_CONTINUE, 
pid, (caddr_t)1, 0); + T_EXPECT_POSIX_SUCCESS(ret, "ptrace PT_CONTINUE"); + /* And let the child know that it can carry on */ + *has_parent_connected = true; + + int status; + int err = waitpid(pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "waitpid"); + + /* + * Then the child is given an opportunity to run its exception handlers, + * which we witness by its setting of a shared boolean and clean exit(0). + */ + T_EXPECT_TRUE(WIFEXITED(status), "child exited"); + T_EXPECT_TRUE(*shared_was_mach_exception_handler_called + || *shared_was_posix_signal_handler_called, + "Expected one of our handlers to be dispatched"); + + T_ASSERT_POSIX_SUCCESS(close(shm_fd), "Closed shm fd"); + T_ASSERT_POSIX_SUCCESS(shm_unlink(memory_path), "Unlinked"); + } +} diff --git a/tests/vfs/devfd_access.c b/tests/vfs/devfd_access.c new file mode 100644 index 000000000..8b07c0a38 --- /dev/null +++ b/tests/vfs/devfd_access.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o devfd_access devfd_access.c -g -Weverything */ +/* sign: codesign --force --sign - --timestamp=none --entitlements devfd_access.entitlements devfd_access */ + +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)); + +static int +docheck(int fd, int perm) +{ + char path[MAXPATHLEN]; + + path[0] = '\0'; + snprintf(path, sizeof(path), "/dev/fd/%d", fd); + errno = 0; + + return access(path, perm); +} + +/* The devfs_access test should not run as root */ +T_DECL(devfd_access, "Calculate the allowed access based on the open-flags for fdesc vnodes") +{ + const char *path = "/dev/null"; + int fd_rdonly, fd_wronly, fd_evtonly, fd_evtonly_drw; + + if (geteuid() == 0) { + T_SKIP("Test should NOT run as root"); + } + + T_SETUPBEGIN; + + T_ASSERT_POSIX_SUCCESS(fd_rdonly = open(path, O_RDONLY), + "Setup: Opening file with O_RDONLY permissions, fd_rdonly = %d", + fd_rdonly); + + T_ASSERT_POSIX_SUCCESS(fd_wronly = open(path, O_WRONLY), + "Setup: Opening file with O_WRONLY permissions, fd_wronly = %d", + fd_wronly); + + T_ASSERT_POSIX_SUCCESS(fd_evtonly = open(path, O_EVTONLY), + "Setup: Opening file with O_EVTONLY permissions, fd_evtonly = %d", + fd_evtonly); + + T_ASSERT_POSIX_SUCCESS(setiopolicy_np(IOPOL_TYPE_VFS_DISALLOW_RW_FOR_O_EVTONLY, + IOPOL_SCOPE_PROCESS, + IOPOL_VFS_DISALLOW_RW_FOR_O_EVTONLY_ON), + "Setup: Disallowing RW for O_EVTONLY"); + + T_ASSERT_POSIX_SUCCESS(fd_evtonly_drw = open(path, O_EVTONLY), + "Setup: Opening file with O_EVTONLY permissions while RW is disabled, fd_evtonly_drw = %d", + fd_evtonly_drw); + + T_SETUPEND; + + T_LOG("Test rdonly-fd's access"); + T_EXPECT_POSIX_SUCCESS(docheck(fd_rdonly, R_OK), "Testing R_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_rdonly, W_OK), EACCES, "Testing W_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_rdonly, R_OK | W_OK), EACCES, "Testing R_OK | W_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_rdonly, X_OK), EACCES, "Testing X_OK permissions"); + + T_LOG("Test wronly-fd's access"); + T_EXPECT_POSIX_FAILURE(docheck(fd_wronly, R_OK), EACCES, "Testing R_OK permissions"); + T_EXPECT_POSIX_SUCCESS(docheck(fd_wronly, W_OK), "Testing W_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_wronly, R_OK | W_OK), EACCES, "Testing R_OK | W_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_wronly, X_OK), EACCES, "Testing X_OK permissions"); + + T_LOG("Test evtonly-fd's access"); + T_EXPECT_POSIX_SUCCESS(docheck(fd_evtonly, R_OK), "Testing R_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_evtonly, W_OK), EACCES, "Testing W_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_evtonly, R_OK | W_OK), EACCES, "Testing R_OK | W_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_evtonly, X_OK), EACCES, "Testing X_OK permissions"); + + T_LOG("Test evtonly-drw-fd's access"); + T_EXPECT_POSIX_FAILURE(docheck(fd_evtonly_drw, R_OK), EACCES, "Testing R_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_evtonly_drw, W_OK), EACCES, "Testing W_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_evtonly_drw, R_OK | W_OK), EACCES, "Testing R_OK | W_OK permissions"); + T_EXPECT_POSIX_FAILURE(docheck(fd_evtonly_drw, X_OK), EACCES, "Testing X_OK permissions"); + + /* Close open file descriptors */ + close(fd_rdonly); + close(fd_wronly); + close(fd_evtonly); 
+ close(fd_evtonly_drw); +} diff --git a/tests/vfs/devfd_access.entitlements b/tests/vfs/devfd_access.entitlements new file mode 100644 index 000000000..524047fc6 --- /dev/null +++ b/tests/vfs/devfd_access.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.vfs.disallow-rw-for-o-evtonly + + + diff --git a/tests/vfs/direntries_permissions.c b/tests/vfs/direntries_permissions.c new file mode 100644 index 000000000..de8196b3d --- /dev/null +++ b/tests/vfs/direntries_permissions.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o direntries_permissions direntries_permissions.c -g -Weverything */ + +#include +#include +#include +#include +#include + +#include +#include + +static char template[MAXPATHLEN]; +static char *testdir = NULL; +static char dir1[PATH_MAX], dir2[PATH_MAX]; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(true), + T_META_CHECK_LEAKS(false)); + +static int +switch_user(uid_t uid, gid_t gid) +{ + int ret; +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + ret = pthread_setugid_np(uid, gid); +#pragma clang diagnostic pop + return ret; +} + +static void +cleanup(void) +{ + switch_user(KAUTH_UID_NONE, KAUTH_GID_NONE); + + if (dir2[0] != '\0') { + rmdir(dir2); + } + if (dir1[0] != '\0') { + rmdir(dir1); + } + + if (rmdir(testdir)) { + T_FAIL("Unable to remove the test directory (%s)", testdir); + } +} + +#if 0 + +T_DECL(direntries_permissions_no_owner, + "Directory write permission should give full control of directory contents") +{ + dir1[0] = dir2[0] = '\0'; + + if (geteuid() != 0) { + T_SKIP("Test should run as root"); + } + + T_ATEND(cleanup); + T_SETUPBEGIN; + + /* Switch user to 20/501 */ + T_ASSERT_POSIX_SUCCESS(switch_user(501, 20), "Switching user to 501/20"); + + /* Create test root dir */ + snprintf(template, sizeof(template), "%s/direntries_permissions_no_owner-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + + /* Setup directory names */ + snprintf(dir1, sizeof(dir1), "%s/%s", testdir, "dir1"); + snprintf(dir2, sizeof(dir2), "%s/%s", testdir, "dir2"); + + /* Switch user to root */ + T_ASSERT_POSIX_SUCCESS(switch_user(KAUTH_UID_NONE, KAUTH_GID_NONE), "Switching user to root"); + + /* Create the second directory */ + T_ASSERT_POSIX_SUCCESS(mkdir(dir1, 0755), "Creating directory %s", dir1); + + /* Switch user to 20/501 */ + T_ASSERT_POSIX_SUCCESS(switch_user(501, 20), "Switching user to 501/20"); + + T_SETUPEND; + + /* Rename dir1 -> dir2 */ + T_ASSERT_POSIX_SUCCESS(rename(dir1, dir2), "Renaming %s -> %s", dir1, dir2); +} + +#endif /* 0 */ + +T_DECL(direntries_permissions_no_write, + "Directory without write permissions should not be renamed") +{ + dir1[0] = dir2[0] = '\0'; + + if (geteuid() != 0) { + T_SKIP("Test should run as root"); + } + + T_ATEND(cleanup); + T_SETUPBEGIN; + + /* Switch user to 20/501 */ + T_ASSERT_POSIX_SUCCESS(switch_user(501, 20), "Switching user to 501/20"); + + /* Create test root dir */ + snprintf(template, sizeof(template), "%s/direntries_permissions_no_write-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + + /* Changing directory */ + T_ASSERT_POSIX_SUCCESS(chdir(testdir), "Changing directory %s", testdir); + + /* Setup directory names */ + snprintf(dir1, sizeof(dir1), "%s/%s", testdir, "dir1"); + snprintf(dir2, sizeof(dir2), "%s/%s", testdir, "dir2"); + + /* Create the first directory */ + T_ASSERT_POSIX_SUCCESS(mkdir(dir1, 0777), "Creating directory %s", dir1); + + /* Setup mode */ + T_ASSERT_POSIX_SUCCESS(chmod(dir1, 0777), "Changing mode to directory 0777"); + + /* Create the second directory */ + T_ASSERT_POSIX_SUCCESS(mkdir(dir2, 0555), "Creating directory %s", dir2); + + /* Setup mode */ + T_ASSERT_POSIX_SUCCESS(chmod(dir2, 0555), "Changing mode to directory 0555"); + + T_SETUPEND; + + 
T_EXPECT_POSIX_FAILURE(rename(dir1, dir2), EACCES, "Renaming dir1 -> dir2. should fail with EACCES"); + T_EXPECT_POSIX_FAILURE(rename(dir2, dir1), EACCES, "Renaming dir2 -> dir1. should fail with EACCES"); +} diff --git a/tests/vfs/fmount_funmount.c b/tests/vfs/fmount_funmount.c new file mode 100644 index 000000000..c89742cc3 --- /dev/null +++ b/tests/vfs/fmount_funmount.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o fmount_funmount fmount_funmount.c -g -Weverything */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define RUN_TEST TARGET_OS_OSX + +#define FSTYPE_APFS "apfs" +#define FSTYPE_DEVFS "devfs" + +static char template[MAXPATHLEN]; +static char *testdir = NULL; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_ENABLED(RUN_TEST), + T_META_CHECK_LEAKS(false)); + +static int +verify_fstypename(const char *name) +{ + int error; + struct statfs statfs_buf;; + + error = statfs(testdir, &statfs_buf); + if (error) { + return errno; + } + + if (strncmp(name, statfs_buf.f_fstypename, MFSNAMELEN)) { + return EINVAL; + } + + return 0; +} + +static void +cleanup(void) +{ + if (testdir) { + rmdir(testdir); + } +} + +T_DECL(fmount_funmount, + "Test fmount() and funmount() system calls") +{ +#if (!RUN_TEST) + T_SKIP("Not macOS"); +#endif + + int fd; + + T_ATEND(cleanup); + + T_SETUPBEGIN; + + snprintf(template, sizeof(template), "%s/fmount_funmount-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + T_ASSERT_POSIX_ZERO(verify_fstypename(FSTYPE_APFS), "Verifing fstype name equals %s", FSTYPE_APFS); + + T_SETUPEND; + + /* Mount phase */ + T_ASSERT_POSIX_SUCCESS((fd = open(testdir, O_DIRECTORY)), "Open test root dir: %s", testdir); + T_ASSERT_POSIX_SUCCESS(fmount(FSTYPE_DEVFS, fd, MNT_RDONLY, NULL), "Mounting temporary %s mount using fmount(fd = %d)", FSTYPE_DEVFS, fd); + T_ASSERT_POSIX_ZERO(verify_fstypename(FSTYPE_DEVFS), "Verifing fstype name equals %s", FSTYPE_DEVFS); + T_ASSERT_POSIX_SUCCESS(close(fd), 
"Closing (fd = %d)", fd); + + /* Unmount phase */ + T_ASSERT_POSIX_SUCCESS((fd = open(testdir, O_DIRECTORY)), "Open test root dir: %s", testdir); + T_ASSERT_POSIX_SUCCESS(funmount(fd, MNT_FORCE), "Unmounting %s using funmount(fd = %d)", testdir, fd); + T_ASSERT_POSIX_ZERO(verify_fstypename(FSTYPE_APFS), "Verifing fstype name equals %s", FSTYPE_APFS); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing (fd = %d)", fd); +} diff --git a/tests/vfs/getattrlist_fullpath.c b/tests/vfs/getattrlist_fullpath.c new file mode 100644 index 000000000..647fc53a3 --- /dev/null +++ b/tests/vfs/getattrlist_fullpath.c @@ -0,0 +1,67 @@ +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o getattrlist_fullpath getattrlist_fullpath.c -g -Weverything */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define MAXLONGPATHLEN 4096 + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)); + +static char * +fast_realpath(const char *path, bool follow) +{ + struct { + uint32_t size; + attrreference_t fullPathAttr; + char fullPathBuf[MAXLONGPATHLEN]; + } __attribute__((aligned(4), packed)) buf; + + struct attrlist al = { + .bitmapcount = ATTR_BIT_MAP_COUNT, + .commonattr = ATTR_CMN_FULLPATH, + }; + + unsigned int options = FSOPT_ATTR_CMN_EXTENDED; + if (!follow) { + options |= FSOPT_NOFOLLOW; + } + + if (getattrlist(path, &al, &buf, sizeof(buf), options) < 0) { + return NULL; + } + + return strdup((char *)&buf.fullPathAttr + buf.fullPathAttr.attr_dataoffset); +} + +static void +test_realpath(char *input, char *output) +{ + T_ASSERT_EQ_STR(fast_realpath(input, false), output, "Testing input '%s', output '%s'", input, output); +} + +T_DECL(getattrlist_fullpath, + "getattrlist ATTR_CMN_FULLPATH should preserve input path prefix in output") +{ + test_realpath("/private/etc/hosts", "/private/etc/hosts"); + test_realpath("/etc/hosts", "/private/etc/hosts"); + + /* Test for .nofollow prefix */ + test_realpath("/.nofollow/etc/hosts", NULL); + test_realpath("/.nofollow/private/etc/hosts", "/.nofollow/private/etc/hosts"); + + /* Test for RESOLVE_NOFOLLOW_ANY resolve prefix */ + test_realpath("/.resolve/1/etc/hosts", NULL); + test_realpath("/.resolve/1/private/etc/hosts", "/.resolve/1/private/etc/hosts"); +} diff --git a/tests/vfs/getattrlist_mountextflags.c b/tests/vfs/getattrlist_mountextflags.c index bf6a78197..721736c50 100644 --- a/tests/vfs/getattrlist_mountextflags.c +++ b/tests/vfs/getattrlist_mountextflags.c @@ -28,20 +28,21 @@ /* compile: xcrun -sdk macosx.internal clang -ldarwintest -o getattrlist_mountextflags getattrlist_mountextflags.c -g -Weverything */ -#include #include #include #include #include #include +#include +#include + #if !TARGET_OS_OSX #define FSTYPE_LIFS "lifs" #endif /* !TARGET_OS_OSX */ #define FSTYPE_MSDOS "msdos" #define FSTYPE_APFS "apfs" -#define TEMPLATE "/private/var/tmp/getattrlist_mountextflags_test.XXXXXXXX" /* rdar://137970358: Disable the test for now until the root cause was determined */ #if 0 @@ -50,7 +51,7 @@ #define RUN_TEST 0 #endif -static char template[] = TEMPLATE; +static char template[MAXPATHLEN]; static char *testdir = NULL; static char *output_buffer = NULL; static char image_path[PATH_MAX]; @@ -225,6 +226,7 @@ T_DECL(getattrlist_mountextflags, output_buffer = malloc(PATH_MAX); /* Create test directory */ + snprintf(template, sizeof(template), "%s/getattrlist_mountextflags-XXXXXX", dt_tmpdir()); 
T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); /* Create image path */ diff --git a/tests/vfs/linkat_flags.c b/tests/vfs/linkat_flags.c new file mode 100644 index 000000000..99a81afcc --- /dev/null +++ b/tests/vfs/linkat_flags.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o linkat_flags linkat_flags.c -g -Weverything */ + +#include +#include +#include +#include + +#include +#include + +static char template[MAXPATHLEN]; +static char *testdir = NULL; +static char file[PATH_MAX], sym[PATH_MAX], symloop[PATH_MAX], dirloop[PATH_MAX]; +static char lfile1[PATH_MAX], lfile2[PATH_MAX], lfile3[PATH_MAX]; +static char lfile4[PATH_MAX], lfile5[PATH_MAX], lfile6[PATH_MAX]; +static char lfile7[PATH_MAX], lfile8[PATH_MAX]; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (lfile8[0] != '\0') { + unlink(lfile8); + } + if (lfile7[0] != '\0') { + unlink(lfile7); + } + if (lfile6[0] != '\0') { + unlink(lfile6); + } + if (lfile5[0] != '\0') { + unlink(lfile5); + } + if (lfile4[0] != '\0') { + unlink(lfile4); + } + if (lfile3[0] != '\0') { + unlink(lfile3); + } + if (lfile2[0] != '\0') { + unlink(lfile2); + } + if (lfile1[0] != '\0') { + unlink(lfile1); + } + if (dirloop[0] != '\0') { + unlink(dirloop); + } + if (sym[0] != '\0') { + unlink(sym); + } + if (file[0] != '\0') { + unlink(file); + } + if (testdir) { + rmdir(testdir); + } +} + +static void +verify_stat(nlink_t file_nlink, nlink_t sym_nlink) +{ + int error; + struct stat buf; + + /* Verify file's status */ + memset(&buf, 0, sizeof(buf)); + error = fstatat(AT_FDCWD, file, &buf, 0); + if (error) { + T_ASSERT_FAIL("Calling fstatat for the file failed with %s", strerror(errno)); + } + T_ASSERT_EQ(file_nlink, buf.st_nlink, "Validating file's nlink count of %d", file_nlink); + + /* Verify symlink's status */ + memset(&buf, 0, sizeof(buf)); + error = fstatat(AT_FDCWD, sym, &buf, AT_SYMLINK_NOFOLLOW); + if (error) { + T_ASSERT_FAIL("Calling fstatat for the symlink failed with %s", 
strerror(errno)); + } + T_ASSERT_EQ(sym_nlink, buf.st_nlink, "Validating symlink's nlink count of %d", sym_nlink); +} + +T_DECL(linkat_flags, + "Test linkat's AT_SYMLINK_FOLLOW and AT_SYMLINK_NOFOLLOW_ANY flags") +{ + int fd; + char testdir_path[MAXPATHLEN + 1]; + + file[0] = sym[0] = dirloop[0] = '\0'; + lfile1[0] = lfile2[0] = lfile3[0] = '\0'; + lfile4[0] = lfile5[0] = lfile6[0] = '\0'; + lfile7[0] = lfile8[0] = '\0'; + + T_ATEND(cleanup); + T_SETUPBEGIN; + + /* Create test root dir */ + snprintf(template, sizeof(template), "%s/linkat_flags-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + + /* Get testdir full path */ + T_ASSERT_POSIX_SUCCESS((fd = open(testdir, O_SEARCH, 0777)), "Opening the test root directory"); + T_ASSERT_POSIX_SUCCESS(fcntl(fd, F_GETPATH, testdir_path), "Calling fcntl() to get the path"); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", testdir); + + /* Setup file names */ + snprintf(file, sizeof(file), "%s/%s", testdir_path, "file"); + snprintf(sym, sizeof(sym), "%s/%s", testdir_path, "sym"); + snprintf(dirloop, sizeof(dirloop), "%s/%s", testdir_path, "dirloop"); + snprintf(symloop, sizeof(symloop), "%s/%s", dirloop, "sym"); + snprintf(lfile1, sizeof(lfile1), "%s/%s", testdir_path, "lfile1"); + snprintf(lfile2, sizeof(lfile2), "%s/%s", testdir_path, "lfile2"); + snprintf(lfile3, sizeof(lfile3), "%s/%s", testdir_path, "lfile3"); + snprintf(lfile4, sizeof(lfile4), "%s/%s", testdir_path, "lfile4"); + snprintf(lfile5, sizeof(lfile5), "%s/%s", testdir_path, "lfile5"); + snprintf(lfile6, sizeof(lfile6), "%s/%s", testdir_path, "lfile6"); + snprintf(lfile7, sizeof(lfile7), "%s/%s", testdir_path, "lfile7"); + snprintf(lfile8, sizeof(lfile8), "%s/%s", testdir_path, "lfile8"); + + /* Create the test files */ + T_ASSERT_POSIX_SUCCESS((fd = open(file, O_CREAT | O_RDWR, 0777)), "Creating %s", file); + T_ASSERT_POSIX_SUCCESS(symlink(file, sym), "Creating symbolic link %s ---> %s", sym, file); + T_ASSERT_POSIX_SUCCESS(symlink(testdir_path, dirloop), "Creating symbolic link %s ---> %s", dirloop, testdir_path); + + /* Validating nlink count */ + verify_stat(1, 1); + + /* Close the open files */ + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", file); + + T_SETUPEND; + + T_LOG("Testing linkat() using no flags"); + { + T_ASSERT_POSIX_SUCCESS(linkat(AT_FDCWD, file, AT_FDCWD, lfile1, 0), "Calling linkat() while name1 is a file"); + verify_stat(2, 1); + + T_ASSERT_POSIX_SUCCESS(linkat(AT_FDCWD, sym, AT_FDCWD, lfile2, 0), "Calling linkat() while name1 is a symbolic link"); + verify_stat(2, 2); + + T_ASSERT_POSIX_SUCCESS(linkat(AT_FDCWD, symloop, AT_FDCWD, lfile3, 0), "Calling linkat() while name1 is a symbolic link and it's path contains a symbolic"); + verify_stat(2, 3); + } + + T_LOG("Testing linkat() using the AT_SYMLINK_FOLLOW flag"); + { + T_ASSERT_POSIX_SUCCESS(linkat(AT_FDCWD, file, AT_FDCWD, lfile4, AT_SYMLINK_FOLLOW), "Calling linkat() while name1 is a file"); + verify_stat(3, 3); + + T_ASSERT_POSIX_SUCCESS(linkat(AT_FDCWD, sym, AT_FDCWD, lfile5, AT_SYMLINK_FOLLOW), "Calling linkat() while name1 is a symbolic link"); + verify_stat(4, 3); + + T_ASSERT_POSIX_SUCCESS(linkat(AT_FDCWD, symloop, AT_FDCWD, lfile6, AT_SYMLINK_FOLLOW), "Calling linkat() while name1 is a symbolic link and it's path contains a symbolic"); + verify_stat(5, 3); + } + + T_LOG("Testing linkat() using the AT_SYMLINK_NOFOLLOW_ANY flag"); + { + T_ASSERT_POSIX_SUCCESS(linkat(AT_FDCWD, file, AT_FDCWD, lfile7, AT_SYMLINK_NOFOLLOW_ANY), "Calling linkat() 
while name1 is a file %s", file); + verify_stat(6, 3); + + T_ASSERT_POSIX_SUCCESS(linkat(AT_FDCWD, sym, AT_FDCWD, lfile8, AT_SYMLINK_NOFOLLOW_ANY), "Calling linkat() while name1 is a symbolic link"); + verify_stat(6, 4); + + T_ASSERT_POSIX_FAILURE(linkat(AT_FDCWD, symloop, AT_FDCWD, "invalid_path", AT_SYMLINK_NOFOLLOW_ANY), ELOOP, "Calling linkat() while name1 is a symbolic link and it's path contains a symbolic"); + } + + /* See resolve_beneath.c for the AT_RESOLVE_BENEATH flag tests */ +} diff --git a/tests/vfs/longpaths.c b/tests/vfs/longpaths.c index 5c494015d..62b9fb431 100644 --- a/tests/vfs/longpaths.c +++ b/tests/vfs/longpaths.c @@ -425,6 +425,7 @@ test_getattrlist_fullpath(size_t pathlen, bool policy, int expected_errno) if (pathlen + 1 <= cwdlen) { // Test dir is longer than pathlen + slash, no sense running the test + free(cwd); return; } @@ -469,6 +470,7 @@ test_getattrlist_relpath(size_t pathlen, bool policy, int expected_errno) if (pathlen + 1 <= cwdlen) { // Test dir is longer than pathlen + slash, no sense running the test + free(cwd); return; } @@ -536,6 +538,7 @@ test_getattrlist_nofirmlinkpath(size_t pathlen, bool policy, int expected_errno) if (pathlen + 1 <= mtptlen + cwdlen) { // Test dir + mount point is longer than pathlen + slash, no sense running the test + free(cwd); return; } @@ -973,6 +976,7 @@ test_symlink_intermediate(size_t pathlen, bool policy, int expected_errno) // Find parent of path char *lastslash = strrchr(path, '/'); if (lastslash == NULL || lastslash == path) { + free(path); return; } @@ -1038,6 +1042,8 @@ test_symlink_intermediate(size_t pathlen, bool policy, int expected_errno) if (fd >= 0) { close(fd); } + free(basepath); + free(path); } static void diff --git a/tests/vfs/named_fork_path.c b/tests/vfs/named_fork_path.c new file mode 100644 index 000000000..4ffff1170 --- /dev/null +++ b/tests/vfs/named_fork_path.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -lsandbox -o named_fork_path named_fork_path.c -g -Weverything */ + +#include +#include +#include + +#include +#include + +#define RUN_TEST TARGET_OS_OSX + +static char template[MAXPATHLEN]; +static char *testdir = NULL; +static char rsrc[PATH_MAX]; +static char file[PATH_MAX], file_rfork[PATH_MAX]; +static char file2[PATH_MAX], file2_rfork[PATH_MAX]; +static sandbox_params_t params = NULL; +static sandbox_profile_t profile = NULL; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_ENABLED(RUN_TEST), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (profile) { + sandbox_free_profile(profile); + } + if (params) { + sandbox_free_params(params); + } + if (file[0] != '\0') { + unlink(file); + } + if (file2[0] != '\0') { + unlink(file2); + } + if (rsrc[0] != '\0') { + unlink(rsrc); + } + if (testdir) { + rmdir(testdir); + } +} + +static void +create_profile_string(char *buff, size_t size) +{ + snprintf(buff, size, "(version 1) \n\ + (allow default) \n\ + (import \"system.sb\") \n\ + (deny file-read-xattr file-write-xattr (path \"%s\")) \n\ + (deny file-read-xattr file-write-xattr (path \"%s\")) \n", + file, file2); +} + +T_DECL(named_fork_path, + "Named fork paths to check file-read-xattr and file-write-xattr Sandbox permissions") +{ +#if (!RUN_TEST) + T_SKIP("Not macOS"); +#endif + + int fd, fd2, fd3, fd_rfork; + char xattr_buff[100]; + char *sberror = NULL; + const char *xattr = "test1234"; + char profile_string[1000]; + size_t xattr_len = strlen(xattr); + char testdir_path[MAXPATHLEN]; + + file[0] = file2[0] = rsrc[0] = '\0'; + + T_ATEND(cleanup); + T_SETUPBEGIN; + + /* Create test root dir */ + snprintf(template, sizeof(template), "%s/named_fork_path-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + T_ASSERT_POSIX_SUCCESS((fd = open(testdir, O_SEARCH, 0777)), "Opening test root directory %s", testdir); + T_ASSERT_POSIX_SUCCESS(fcntl(fd, F_GETPATH, testdir_path), "Calling fcntl() to get the path"); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", testdir_path); + + /* Setup file names */ + snprintf(file, sizeof(file), "%s/%s", testdir_path, "file"); + snprintf(file_rfork, sizeof(file_rfork), "%s/..namedfork/rsrc", file); + + snprintf(file2, sizeof(file2), "%s/%s", testdir_path, "file2"); + snprintf(file2_rfork, sizeof(file2_rfork), "%s/..namedfork/rsrc", file2); + + snprintf(rsrc, sizeof(rsrc), "%s/%s", testdir_path, "rsrc"); + + /* Create the test files */ + T_ASSERT_POSIX_SUCCESS((fd = open(file, O_CREAT | O_RDWR, 0777)), "Creating %s", file); + T_ASSERT_POSIX_SUCCESS((fd2 = open(file2, O_CREAT | O_RDWR, 0777)), "Creating %s", file2); + T_ASSERT_POSIX_SUCCESS((fd3 = open(rsrc, O_CREAT | O_RDWR, 0777)), "Creating %s", rsrc); + + /* Set ResourceFork extended attribute */ + T_ASSERT_POSIX_SUCCESS(fsetxattr(fd, XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), "Setting ResourceFork of %s to '%s'", file, xattr); + + /* Create sandbox variables */ + T_ASSERT_POSIX_NOTNULL(params = sandbox_create_params(), "Creating Sandbox params object"); + create_profile_string(profile_string, sizeof(profile_string)); + T_ASSERT_POSIX_NOTNULL(profile = sandbox_compile_string(profile_string, params, &sberror), "Creating Sandbox profile object"); + + T_SETUPEND; + + /* Test rename to/from an ..namedfork/rsrc path */ 
+ T_ASSERT_POSIX_FAILURE(rename(file_rfork, rsrc), EPERM, "Verifying rename from an ..namedfork/rsrc path isn't a supported (EPERM)"); + T_ASSERT_POSIX_FAILURE(rename(rsrc, file_rfork), EPERM, "Verifying trename to an ..namedfork/rsrc path isn't a supported (EPERM)"); + + /* Read ResourceFork extended attribute using getxattr() */ + T_ASSERT_EQ((ssize_t)xattr_len, fgetxattr(fd, XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, 0), + "Trying to get ResourceFork extended attribute"); + T_ASSERT_EQ(0, strncmp(xattr, xattr_buff, xattr_len), "Verifying ResourceFork extended content"); + + /* Read ResourceFork extended attribute using the ..namedfork/rsrc path */ + T_ASSERT_POSIX_SUCCESS((fd_rfork = open(file_rfork, O_RDONLY, 0777)), "Opening %s", file_rfork); + T_ASSERT_EQ((ssize_t)xattr_len, read(fd_rfork, xattr_buff, sizeof(xattr_buff)), "Trying to read ResourceFork extended attribute"); + T_ASSERT_EQ(0, strncmp(xattr, xattr_buff, xattr_len), "Verifying ResourceFork extended content"); + T_ASSERT_POSIX_SUCCESS(close(fd_rfork), "Closing %s", file_rfork); + + /* Apply sandbox profile */ + T_ASSERT_POSIX_SUCCESS(sandbox_apply(profile), "Applying Sandbox profile"); + + /* Test ResourceFork extended attribute using fgetxattr(), fsetxattr() and fremovexattr() */ + T_ASSERT_POSIX_FAILURE(fgetxattr(fd, XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, 0), EPERM, "Verifying that fgetxattr() fails to get ResourceFork with EPERM"); + T_ASSERT_POSIX_FAILURE(fremovexattr(fd, XATTR_RESOURCEFORK_NAME, 0), EPERM, "Verifying that fremovexattr() fails to remove ResourceFork with EPERM"); + T_ASSERT_POSIX_FAILURE(fsetxattr(fd2, XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), EPERM, "Verifying that fsetxattr() fails to set ResourceFork with EPERM"); + + /* Test ResourceFork extended attribute using the ..namedfork/rsrc path */ + T_ASSERT_POSIX_FAILURE((fd_rfork = open(file_rfork, O_RDONLY, 0777)), EPERM, "Verifying that open() fails with EPERM"); + T_ASSERT_POSIX_FAILURE((fd_rfork = open(file2_rfork, O_CREAT | O_RDONLY, 0777)), EPERM, "Verifying that open(O_CREAT) fails with EPERM"); + T_ASSERT_POSIX_FAILURE(unlink(file_rfork), EPERM, "Verifying that unlink() fails with EPERM"); + + /* Close the open files */ + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", file); + T_ASSERT_POSIX_SUCCESS(close(fd2), "Closing %s", file2); + T_ASSERT_POSIX_SUCCESS(close(fd3), "Closing %s", rsrc); +} diff --git a/tests/vfs/open_symlink.c b/tests/vfs/open_symlink.c new file mode 100644 index 000000000..7176a2c8a --- /dev/null +++ b/tests/vfs/open_symlink.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o open_symlink open_symlink.c -g -Weverything */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static char template[MAXPATHLEN]; +static char *testdir = NULL; +static char file[PATH_MAX], sym[PATH_MAX]; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (sym[0] != '\0') { + unlink(sym); + } + if (file[0] != '\0') { + unlink(file); + } + if (testdir) { + rmdir(testdir); + } +} + +T_DECL(open_symlink, + "Verify that O_SYMLINK is not being ignored while used by open() in addition to O_CREAT") +{ + int fd; + char namebuf[MAXPATHLEN + 1]; + char namebuf2[MAXPATHLEN + 1]; + + file[0] = sym[0] = '\0'; + + T_ATEND(cleanup); + T_SETUPBEGIN; + + /* Create test root dir */ + snprintf(template, sizeof(template), "%s/open_symlink-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + + /* Setup file names */ + snprintf(file, sizeof(file), "%s/%s", testdir, "file"); + snprintf(sym, sizeof(sym), "%s/%s", testdir, "symlink"); + + /* Create the test file */ + T_ASSERT_POSIX_SUCCESS((fd = open(file, O_CREAT | O_RDWR, 0777)), "Creating file %s", file); + + /* Create the symlink */ + T_ASSERT_POSIX_SUCCESS(symlink(file, sym), "Creating symlink %s -> %s", sym, file); + + /* Close the test file */ + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", file); + + T_SETUPEND; + + /* Step 1 - Verify O_SYMLINK behaviour */ + T_ASSERT_POSIX_SUCCESS((fd = open(sym, O_SYMLINK, 0777)), "Opening %s using the O_SYMLINK flag", sym); + T_ASSERT_POSIX_SUCCESS(fcntl(fd, F_GETPATH, namebuf), "Calling fcntl() to get the path"); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", sym); + + /* Step 2 - Verify O_SYMLINK | O_CREAT behaviour */ + T_ASSERT_POSIX_SUCCESS((fd = open(sym, O_SYMLINK | O_CREAT, 0777)), "Opening %s using the O_SYMLINK | O_CREAT flags", sym); + T_ASSERT_POSIX_SUCCESS(fcntl(fd, F_GETPATH, namebuf2), "Calling fcntl() to get the path"); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", sym); + + /* Compare names */ + T_ASSERT_EQ(strncmp(namebuf, namebuf2, strlen(namebuf)), 0, "Verifying %s was opened, got %s", namebuf, namebuf2); +} diff --git a/tests/vfs/open_unique.c b/tests/vfs/open_unique.c new file mode 100644 index 000000000..1d089458e --- /dev/null +++ b/tests/vfs/open_unique.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o open_unique open_unique.c -g -Weverything */ + +#include +#include +#include + +#include +#include + +static char template[MAXPATHLEN]; +static char testdir_path[MAXPATHLEN + 1]; +static char *testdir = NULL; +static int testdir_fd = -1; + +#ifndef O_UNIQUE +#define O_UNIQUE 0x00002000 +#endif + +#define FILE "file.txt" +#define FILE2 "file2.txt" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (testdir_fd != -1) { + unlinkat(testdir_fd, FILE, 0); + unlinkat(testdir_fd, FILE2, 0); + + close(testdir_fd); + if (rmdir(testdir)) { + T_FAIL("Unable to remove the test directory (%s)", testdir); + } + } +} + +T_DECL(open_unique, + "Validate the functionality of the O_UNIQUE flag in the open/openat syscalls") +{ + int fd; + char file_path[MAXPATHLEN]; + struct stat statbuf; + + T_SETUPBEGIN; + T_ATEND(cleanup); + + /* Create test root directory */ + snprintf(template, sizeof(template), "%s/%s-XXXXXX", dt_tmpdir(), "open_unique"); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root directory"); + T_ASSERT_POSIX_SUCCESS((testdir_fd = open(testdir, O_SEARCH, 0777)), "Opening test root directory %s", testdir); + T_ASSERT_POSIX_SUCCESS(fcntl(testdir_fd, F_GETPATH, testdir_path), "Calling fcntl() to get the path"); + + /* Create test file path */ + snprintf(file_path, sizeof(file_path), "%s/%s", testdir_path, FILE); + + T_SETUPEND; + + /* Create the test file */ + T_ASSERT_POSIX_SUCCESS((fd = openat(testdir_fd, FILE, O_CREAT | O_RDWR | O_UNIQUE, 0777)), "Creating %s using openat() with O_UNIQUE -> Should PASS", FILE); + close(fd); + + /* Validate nlink count equals 1 */ + T_EXPECT_POSIX_SUCCESS((fstatat(testdir_fd, FILE, &statbuf, 0)), "Calling stat() for %s -> Should PASS", FILE); + T_EXPECT_EQ(statbuf.st_nlink, 1, "Validate nlink equals 1"); + T_EXPECT_POSIX_SUCCESS((fd = openat(testdir_fd, FILE, O_RDONLY | O_UNIQUE, 0)), "Opening %s using O_UNIQUE -> Should PASS", FILE); + close(fd); + + /* Increase nlink count */ + T_EXPECT_POSIX_SUCCESS(linkat(testdir_fd, FILE, testdir_fd, FILE2, 0), "Calling linkat() for %s, %s -> Should PASS", FILE, FILE2); + + /* Validate nlink count equals 2 */ + T_EXPECT_POSIX_SUCCESS((fstatat(testdir_fd, FILE, &statbuf, 0)), "Calling fstatat() for %s -> Should PASS", FILE); + T_EXPECT_EQ(statbuf.st_nlink, 2, "Validate nlink equals 2"); + 
T_EXPECT_POSIX_SUCCESS((fd = openat(testdir_fd, FILE, O_RDONLY, 0)), "Opening %s -> Should PASS", FILE); + close(fd); + + /* Validate ENOTCAPABLE */ + T_EXPECT_POSIX_FAILURE((fd = open(file_path, O_RDONLY | O_UNIQUE, 0)), ENOTCAPABLE, "Opening using open() with O_UNIQUE -> Should FAIL with ENOTCAPABLE"); + + T_EXPECT_POSIX_FAILURE((fd = openat(testdir_fd, FILE, O_WRONLY | O_UNIQUE, 0)), ENOTCAPABLE, "Opening %s using openat() with O_UNIQUE -> Should FAIL with ENOTCAPABLE", FILE); + + T_EXPECT_POSIX_FAILURE((fd = openat(testdir_fd, FILE2, O_CREAT | O_RDWR | O_UNIQUE, 0)), ENOTCAPABLE, "Opening %s using openat() with O_UNIQUE -> Should FAIL with ENOTCAPABLE", FILE2); + + /* Reduce nlink count */ + T_EXPECT_POSIX_SUCCESS(unlinkat(testdir_fd, FILE2, 0), "Calling unlinkat() for %s -> Should PASS", FILE2); + + /* Validate nlink count equals 1 */ + T_EXPECT_POSIX_SUCCESS((fstatat(testdir_fd, FILE, &statbuf, 0)), "Calling fstatat() for %s -> Should PASS", FILE); + T_EXPECT_EQ(statbuf.st_nlink, 1, "Validate nlink equals 1"); + T_EXPECT_POSIX_SUCCESS((fd = openat(testdir_fd, FILE, O_RDONLY | O_UNIQUE, 0)), "Opening %s -> Should PASS", FILE); + close(fd); +} diff --git a/tests/vfs/openbyid_stress.c b/tests/vfs/openbyid_stress.c new file mode 100644 index 000000000..b48a37e62 --- /dev/null +++ b/tests/vfs/openbyid_stress.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o openbyid_stress openbyid_stress.c -g -Weverything */ +/* sign: codesign --force --sign - --timestamp=none --entitlements openbyid_stress.entitlements openbyid_stress */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TEST_DURATION 10 /* seconds */ + +static char template[MAXPATHLEN]; +static char *testdir = NULL; +static char dir1[PATH_MAX], dir2[PATH_MAX]; +static char file1[PATH_MAX], file2[PATH_MAX]; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (file1[0] != '\0') { + unlink(file1); + } + if (file2[0] != '\0') { + unlink(file2); + } + if (dir1[0] != '\0') { + rmdir(dir1); + } + if (dir2[0] != '\0') { + rmdir(dir2); + } + if (testdir) { + rmdir(testdir); + } +} + +T_DECL(openbyid_stress, + "Test that openbyid_np does not open the wrong file") +{ + int fd; + struct stat buf_stat; + struct statfs buf_statfs; + __block int timeout = 0; + __block int error = 0; + int64_t interval = TEST_DURATION * NSEC_PER_SEC; + dispatch_queue_t queue; + dispatch_source_t timeout_source; + + dir1[0] = dir2[0] = '\0'; + file2[0] = file2[0] = '\0'; + + T_ATEND(cleanup); + + T_SETUPBEGIN; + + T_ASSERT_NOTNULL((queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0)), "Getting global queue"); + T_ASSERT_NOTNULL((timeout_source = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, queue)), "Creating dispatch source"); + + dispatch_source_set_timer(timeout_source, dispatch_time(DISPATCH_TIME_NOW, interval), DISPATCH_TIME_FOREVER, 0); + dispatch_source_set_event_handler(timeout_source, ^{ + timeout = 1; + T_LOG("%d seconds timeout expired", TEST_DURATION); + }); + + snprintf(template, sizeof(template), "%s/openbyid_stress-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + + snprintf(dir1, sizeof(dir1), "%s/%s", testdir, "dir1"); + snprintf(dir2, sizeof(dir2), "%s/%s", testdir, "dir2"); + + T_ASSERT_POSIX_SUCCESS(mkdir(dir1, 0777), "Creating dir1"); + T_ASSERT_POSIX_SUCCESS(mkdir(dir2, 0777), "Creating dir2"); + + snprintf(file1, sizeof(file1), "%s/%s", dir1, "file"); + snprintf(file2, sizeof(file2), "%s/%s", dir2, "file"); + + T_ASSERT_POSIX_SUCCESS((fd = open(file1, O_CREAT | O_RDWR, 0777)), "Creating %s", file1); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", file1); + + T_ASSERT_POSIX_SUCCESS((fd = open(file2, O_CREAT | O_RDWR, 0777)), "Creating %s", file2); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", file2); + + T_ASSERT_POSIX_SUCCESS(stat(file1, &buf_stat), "Calling stat() on %s", file1); + T_ASSERT_POSIX_SUCCESS(statfs(file1, &buf_statfs), "Calling statfs() on %s", file1); + + T_LOG("File successfully opened: fsid {%d, %d}, inode %llu", buf_statfs.f_fsid.val[0], buf_statfs.f_fsid.val[1], buf_stat.st_ino); + + T_SETUPEND; + + T_LOG("Running for %d seconds", TEST_DURATION); + dispatch_resume(timeout_source); + + /* Replace between dir1 and dir2 */ + dispatch_async(queue, ^(void) { + while (!timeout && !error) { + renamex_np(dir1, dir2, RENAME_SWAP); + } + }); + + /* Query openbyid_np */ + while (!timeout && !error) { + int fd2; + struct stat buf_stat2; + struct statfs buf_statfs2; + + if ((fd2 = openbyid_np(&buf_statfs.f_fsid, (fsobj_id_t 
*)&buf_stat.st_ino, 0)) < 0) { + T_FAIL("openbyid_np() failed %d", errno); + error = errno; + break; + } + + if ((error = fstatfs(fd2, &buf_statfs2)) < 0) { + T_FAIL("fstatfs() failed"); + error = errno; + close(fd2); + break; + } + + if ((error = fstat(fd2, &buf_stat2)) < 0) { + T_FAIL("fstat() failed"); + error = errno; + close(fd2); + break; + } + + if (buf_statfs.f_fsid.val[0] != buf_statfs2.f_fsid.val[0] || + buf_statfs.f_fsid.val[1] != buf_statfs2.f_fsid.val[1] || + buf_stat2.st_ino != buf_stat.st_ino) { + T_FAIL("Wrong file opened! fsid {%d, %d}, inode %llu", buf_statfs2.f_fsid.val[0], buf_statfs2.f_fsid.val[1], buf_stat2.st_ino); + error = EINVAL; + close(fd2); + break; + } + + close(fd2); + } + + T_ASSERT_POSIX_ZERO(error, "Test completed without error(s)"); +} diff --git a/tests/vfs/openbyid_stress.entitlements b/tests/vfs/openbyid_stress.entitlements new file mode 100644 index 000000000..ed83cb277 --- /dev/null +++ b/tests/vfs/openbyid_stress.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.vfs.open-by-id + + + diff --git a/tests/vfs/resolve_beneath.c b/tests/vfs/resolve_beneath.c index 3d707c998..4078df401 100644 --- a/tests/vfs/resolve_beneath.c +++ b/tests/vfs/resolve_beneath.c @@ -28,22 +28,51 @@ /* compile: xcrun -sdk macosx.internal clang -ldarwintest -o resolve_beneath resolve_beneath.c -g -Weverything */ -#include #include #include #include +#include +#include #include #include +#include +#include -#define TEMPLATE "/private/var/tmp/resolve_beneath.XXXXXXXX" -static char template[] = TEMPLATE; +#include +#include + +static char template[MAXPATHLEN]; static char *testdir = NULL; static int testdir_fd = -1, test_fd = -1; +#ifndef ENOTCAPABLE +#define ENOTCAPABLE 107 +#endif + #ifndef O_RESOLVE_BENEATH #define O_RESOLVE_BENEATH 0x1000 #endif +#ifndef AT_RESOLVE_BENEATH +#define AT_RESOLVE_BENEATH 0x2000 +#endif + +#ifndef XATTR_RESOLVE_BENEATH +#define XATTR_RESOLVE_BENEATH 0x0080 +#endif + +#ifndef CLONE_RESOLVE_BENEATH +#define CLONE_RESOLVE_BENEATH 0x0010 +#endif + +#ifndef RENAME_RESOLVE_BENEATH +#define RENAME_RESOLVE_BENEATH 0x0020 +#endif + +#ifndef FSOPT_RESOLVE_BENEATH +#define FSOPT_RESOLVE_BENEATH 0x1000 +#endif + #define TEST_DIR "test_dir" #define NESTED_DIR "test_dir/nested" #define OUTSIDE_FILE "outside_file.txt" @@ -68,13 +97,14 @@ T_GLOBAL_META( T_META_CHECK_LEAKS(false)); static void -setup(void) +setup(const char *dirname) { int fd; testdir_fd = test_fd = -1; /* Create test root directory */ + snprintf(template, sizeof(template), "%s/%s-XXXXXX", dt_tmpdir(), dirname); T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root directory"); T_ASSERT_POSIX_SUCCESS((testdir_fd = open(testdir, O_SEARCH, 0777)), "Opening test root directory %s", testdir); @@ -135,7 +165,7 @@ T_DECL(resolve_beneath_open, T_SETUPBEGIN; T_ATEND(cleanup); - setup(); + setup("resolve_beneath_open"); T_ASSERT_POSIX_SUCCESS((root_fd = open("/", O_SEARCH, 0777)), "Opening the root directory"); @@ -150,10 +180,10 @@ T_DECL(resolve_beneath_open, } /* Test Case 2: File using a symlink pointing outside */ - T_EXPECT_POSIX_FAILURE(openat(test_fd, "symlink", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 2: File using a symlink pointing outside"); + T_EXPECT_POSIX_FAILURE(openat(test_fd, "symlink", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ - T_EXPECT_POSIX_FAILURE(openat(test_fd, "../outside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 3: File using \"..\" to navigate outside"); + T_EXPECT_POSIX_FAILURE(openat(test_fd, "../outside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); /* Test Case 4: File within a nested directory */ T_EXPECT_POSIX_SUCCESS((fd = openat(test_fd, "nested/nested_file.txt", O_RDONLY | O_RESOLVE_BENEATH, 0777)), "Test Case 4: File within a nested directory"); @@ -168,70 +198,990 @@ T_DECL(resolve_beneath_open, } /* Test Case 6: File using an absolute path */ - T_EXPECT_POSIX_FAILURE(openat(test_fd, "/etc/passwd", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 6: File using an absolute path"); + T_EXPECT_POSIX_FAILURE(openat(test_fd, "/etc/passwd", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); /* Test Case 7: Valid symlink to parent directory */ - T_EXPECT_POSIX_FAILURE(openat(test_fd, "parent_symlink/outside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 7: Valid symlink to parent directory"); + T_EXPECT_POSIX_FAILURE(openat(test_fd, "parent_symlink/outside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); /* Test Case 8: Circular symlink within directory */ T_EXPECT_POSIX_FAILURE(openat(test_fd, "circular_symlink", O_RDONLY | O_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); /* Test Case 9: Path can not escape outside at any point of the resolution */ - T_EXPECT_POSIX_FAILURE(openat(test_fd, "../test_dir/inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 9: Path can not escape outside at any point of the resolution"); + T_EXPECT_POSIX_FAILURE(openat(test_fd, "../test_dir/inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); /* Test Case 10: File using a symlink pointing to absolute path */ - T_EXPECT_POSIX_FAILURE(openat(test_fd, "symlink_absolute/test_dir/inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 10: File using a symlink pointing to absolute path"); + T_EXPECT_POSIX_FAILURE(openat(test_fd, "symlink_absolute/test_dir/inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); /* Test Case 11: Absolute path relative to the root directory */ - T_EXPECT_POSIX_FAILURE(openat(root_fd, "/etc/passwd", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 11: Absolute path relative to the root directory"); + T_EXPECT_POSIX_FAILURE(openat(root_fd, "/etc/passwd", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 11: Absolute path relative to the root directory"); + + /* Test Case 12: Path can not escape outside of the root directory using dotdot */ + T_EXPECT_POSIX_FAILURE((fd = openat(root_fd, "../private", O_RESOLVE_BENEATH)), ENOTCAPABLE, "Test Case 12: Path can not escape outside of the root directory using dotdot"); /* Changing current directory to the test directory */ T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); T_LOG("Testing the open() syscall using O_RESOLVE_BENEATH"); - /* Test Case 12: Open a file within the directory */ - T_EXPECT_POSIX_SUCCESS((fd = open("inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH, 0777)), "Test Case 12: Open a file within the directory"); + /* Test Case 13: Open a file within the directory */ + 
T_EXPECT_POSIX_SUCCESS((fd = open("inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH, 0777)), "Test Case 13: Open a file within the directory"); if (fd >= 0) { close(fd); } - /* Test Case 13: Attempt to open a file using a symlink pointing outside */ - T_EXPECT_POSIX_FAILURE(open("symlink", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 13: Attempt to open a file using a symlink pointing outside"); + /* Test Case 14: Attempt to open a file using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(open("symlink", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 14: Attempt to open a file using a symlink pointing outside"); - /* Test Case 14: Attempt to open a file using ".." to navigate outside */ - T_EXPECT_POSIX_FAILURE(open("../outside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 14: Attempt to open a file using \"..\" to navigate outside"); + /* Test Case 15: Attempt to open a file using ".." to navigate outside */ + T_EXPECT_POSIX_FAILURE(open("../outside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 15: Attempt to open a file using \"..\" to navigate outside"); - /* Test Case 15: Open a file within a nested directory */ - T_EXPECT_POSIX_SUCCESS((fd = open("nested/nested_file.txt", O_RDONLY | O_RESOLVE_BENEATH, 0777)), "Test Case 15: Open a file within a nested directory"); + /* Test Case 16: Open a file within a nested directory */ + T_EXPECT_POSIX_SUCCESS((fd = open("nested/nested_file.txt", O_RDONLY | O_RESOLVE_BENEATH, 0777)), "Test Case 16: Open a file within a nested directory"); if (fd >= 0) { close(fd); } - /* Test Case 16: Symlink to a file in a nested directory */ - T_EXPECT_POSIX_SUCCESS((fd = open("symlink_to_nested", O_RDONLY | O_RESOLVE_BENEATH, 0777)), "Test Case 16: Symlink to a file within the same directory"); + /* Test Case 17: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS((fd = open("symlink_to_nested", O_RDONLY | O_RESOLVE_BENEATH, 0777)), "Test Case 17: Symlink to a file within the same directory"); if (fd >= 0) { close(fd); } - /* Test Case 17: Attempt to open a file using an absolute path */ - T_EXPECT_POSIX_FAILURE(open("/etc/passwd", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 17: Attempt to open a file using an absolute path"); + /* Test Case 18: Attempt to open a file using an absolute path */ + T_EXPECT_POSIX_FAILURE(open("/etc/passwd", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 18: Attempt to open a file using an absolute path"); - /* Test Case 18: Valid symlink to parent directory */ - T_EXPECT_POSIX_FAILURE(open("parent_symlink/outside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 18: Valid symlink to parent directory"); + /* Test Case 19: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(open("parent_symlink/outside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 19: Valid symlink to parent directory"); - /* Test Case 19: Circular symlink within directory */ - T_EXPECT_POSIX_FAILURE(open("circular_symlink", O_RDONLY | O_RESOLVE_BENEATH), ELOOP, "Test Case 19: Circular symlink within directory"); + /* Test Case 20: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(open("circular_symlink", O_RDONLY | O_RESOLVE_BENEATH), ELOOP, "Test Case 20: Circular symlink within directory"); - /* Test Case 20: Path can not escape outside at any point of the resolution */ - T_EXPECT_POSIX_FAILURE(open("../test_dir/inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 20: Path can not escape outside at any point of 
the resolution"); + /* Test Case 21: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(open("../test_dir/inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 21: Path can not escape outside at any point of the resolution"); - /* Test Case 21: Attempt to open a file using a symlink pointing to absolute path */ - T_EXPECT_POSIX_FAILURE(open("symlink_absolute/test_dir/inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 21: Attempt to open a file using a symlink pointing to absolute path"); + /* Test Case 22: Attempt to open a file using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(open("symlink_absolute/test_dir/inside_file.txt", O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 22: Attempt to open a file using a symlink pointing to absolute path"); - /* Test Case 22: Path can not escape outside at any point of the resolution using absolute path */ + /* Test Case 23: Path can not escape outside at any point of the resolution using absolute path */ snprintf(path, sizeof(path), "%s/%s", testdir, INSIDE_FILE); - T_EXPECT_POSIX_FAILURE(open(path, O_RDONLY | O_RESOLVE_BENEATH), EACCES, "Test Case 22: Path can not escape outside at any point of the resolution using absolute path"); + T_EXPECT_POSIX_FAILURE(open(path, O_RDONLY | O_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 23: Path can not escape outside at any point of the resolution using absolute path"); T_EXPECT_POSIX_SUCCESS(close(root_fd), "Closing the root directory"); } + +T_DECL(resolve_beneath_faccessat, + "test faccessat() using the AT_RESOLVE_BENEATH flag") +{ + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_faccessat"); + + T_SETUPEND; + + T_LOG("Testing the faccessat() syscall using AT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(faccessat(test_fd, "inside_file.txt", R_OK, AT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(faccessat(test_fd, "symlink", R_OK, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(faccessat(test_fd, "../outside_file.txt", R_OK, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(faccessat(test_fd, "nested/nested_file.txt", R_OK, AT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(faccessat(test_fd, "symlink_to_nested", R_OK, AT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(faccessat(test_fd, "/etc/passwd", R_OK, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(faccessat(test_fd, "parent_symlink/outside_file.txt", R_OK, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(faccessat(test_fd, "circular_symlink", R_OK, AT_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(faccessat(test_fd, "../test_dir/inside_file.txt", R_OK, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(faccessat(test_fd, "symlink_absolute/test_dir/inside_file.txt", R_OK, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); +} + +T_DECL(resolve_beneath_fstatat, + "test fstatat() using the AT_RESOLVE_BENEATH flag") +{ + struct stat buf; + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_fstatat"); + + T_SETUPEND; + + T_LOG("Testing the fstatat() syscall using AT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(fstatat(test_fd, "inside_file.txt", &buf, AT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(fstatat(test_fd, "symlink", &buf, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(fstatat(test_fd, "../outside_file.txt", &buf, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(fstatat(test_fd, "nested/nested_file.txt", &buf, AT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(fstatat(test_fd, "symlink_to_nested", &buf, AT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(fstatat(test_fd, "/etc/passwd", &buf, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(fstatat(test_fd, "parent_symlink/outside_file.txt", &buf, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(fstatat(test_fd, "circular_symlink", &buf, AT_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(fstatat(test_fd, "../test_dir/inside_file.txt", &buf, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(fstatat(test_fd, "symlink_absolute/test_dir/inside_file.txt", &buf, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); +} + +T_DECL(resolve_beneath_fchmodat, + "test fchmodat() using the AT_RESOLVE_BENEATH flag") +{ + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_fchmodat"); + + T_SETUPEND; + + T_LOG("Testing the fchmodat() syscall using AT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(fchmodat(test_fd, "inside_file.txt", S_IRWXU, AT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(fchmodat(test_fd, "symlink", S_IRWXU, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(fchmodat(test_fd, "../outside_file.txt", S_IRWXU, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(fchmodat(test_fd, "nested/nested_file.txt", S_IRWXU, AT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(fchmodat(test_fd, "symlink_to_nested", S_IRWXU, AT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(fchmodat(test_fd, "/etc/passwd", S_IRWXU, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(fchmodat(test_fd, "parent_symlink/outside_file.txt", S_IRWXU, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(fchmodat(test_fd, "circular_symlink", S_IRWXU, AT_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(fchmodat(test_fd, "../test_dir/inside_file.txt", S_IRWXU, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(fchmodat(test_fd, "symlink_absolute/test_dir/inside_file.txt", S_IRWXU, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); +} + +T_DECL(resolve_beneath_fchownat, + "test fchownat() using the AT_RESOLVE_BENEATH flag") +{ + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_fchownat"); + + T_SETUPEND; + + T_LOG("Testing the fchownat() syscall using AT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(fchownat(test_fd, "inside_file.txt", geteuid(), getgid(), AT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(fchownat(test_fd, "symlink", geteuid(), getgid(), AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(fchownat(test_fd, "../outside_file.txt", geteuid(), getgid(), AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(fchownat(test_fd, "nested/nested_file.txt", geteuid(), getgid(), AT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(fchownat(test_fd, "symlink_to_nested", geteuid(), getgid(), AT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(fchownat(test_fd, "/etc/passwd", geteuid(), getgid(), AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(fchownat(test_fd, "parent_symlink/outside_file.txt", geteuid(), getgid(), AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(fchownat(test_fd, "circular_symlink", geteuid(), getgid(), AT_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(fchownat(test_fd, "../test_dir/inside_file.txt", geteuid(), getgid(), AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(fchownat(test_fd, "symlink_absolute/test_dir/inside_file.txt", geteuid(), getgid(), AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); +} + +T_DECL(resolve_beneath_linkat, + "test linkat() using the AT_RESOLVE_BENEATH flag") +{ + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_linkat"); + + T_SETUPEND; + + T_LOG("Testing the linkat() syscall using AT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(linkat(test_fd, "inside_file.txt", test_fd, "inside_file_2.txt", AT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + unlinkat(test_fd, "inside_file_2.txt", 0); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(linkat(test_fd, "symlink/.", test_fd, "inside_file_2.txt", AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(linkat(test_fd, "inside_file.txt", test_fd, "../outside_file.txt", AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(linkat(test_fd, "nested/nested_file.txt", test_fd, "nested/nested_file_2.txt", AT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + unlinkat(test_fd, "nested/nested_file_2.txt", 0); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(linkat(test_fd, "symlink_to_nested", test_fd, "nested/nested_file_2.txt", AT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + unlinkat(test_fd, "nested/nested_file_2.txt", 0); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(linkat(test_fd, "/etc/passwd", test_fd, "inside_file_2.txt", AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); +} + +T_DECL(resolve_beneath_unlinkat, + "test unlinkat() using the AT_RESOLVE_BENEATH flag") +{ + int fd; + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_unlinkat"); + + T_SETUPEND; + + T_LOG("Testing the unlinkat() syscall using AT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(unlinkat(test_fd, "inside_file.txt", AT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + if ((fd = openat(testdir_fd, INSIDE_FILE, O_CREAT | O_RDWR, 0777)) < 0) { + T_FAIL("Unable to recreate %s", INSIDE_FILE); + } + close(fd); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_SUCCESS(unlinkat(test_fd, "symlink", AT_RESOLVE_BENEATH), "Test Case 2: File using a symlink pointing outside"); + if (symlinkat(SYMLINK_FROM, testdir_fd, SYMLINK) < 0) { + T_FAIL("Unable to recreate %s", SYMLINK); + } + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(unlinkat(test_fd, "../outside_file.txt", AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(unlinkat(test_fd, "nested/nested_file.txt", AT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + if ((fd = openat(testdir_fd, NESTED_FILE, O_CREAT | O_RDWR, 0777)) < 0) { + T_FAIL("Unable to recreate %s", NESTED_FILE); + } + close(fd); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(unlinkat(test_fd, "symlink_to_nested", AT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + if (symlinkat(SYMLINK_TO_NESTED_FROM, testdir_fd, SYMLINK_TO_NESTED) < 0) { + T_FAIL("Unable to recreate %s", SYMLINK_TO_NESTED); + } + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(unlinkat(test_fd, "/etc/passwd", AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(unlinkat(test_fd, "parent_symlink/outside_file.txt", AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_SUCCESS(unlinkat(test_fd, "circular_symlink", AT_RESOLVE_BENEATH), "Test Case 8: Circular symlink within directory"); + if (symlinkat(CIRCULAR_SYMLINK_FROM, testdir_fd, CIRCULAR_SYMLINK) < 0) { + T_FAIL("Unable to recreate %s", CIRCULAR_SYMLINK); + } + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(unlinkat(test_fd, "../test_dir/inside_file.txt", AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(unlinkat(test_fd, "symlink_absolute/test_dir/inside_file.txt", AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); +} + +T_DECL(resolve_beneath_utimensat, + "test utimensat() using the AT_RESOLVE_BENEATH flag") +{ + static const struct timespec tptr[] = { + { 0x12345678, 987654321 }, + { 0x15263748, 123456789 }, + }; + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_utimensat"); + + T_SETUPEND; + + T_LOG("Testing the utimensat() syscall using AT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(utimensat(test_fd, "inside_file.txt", tptr, AT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(utimensat(test_fd, "symlink", tptr, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(utimensat(test_fd, "../outside_file.txt", tptr, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(utimensat(test_fd, "nested/nested_file.txt", tptr, AT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(utimensat(test_fd, "symlink_to_nested", tptr, AT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(utimensat(test_fd, "/etc/passwd", tptr, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(utimensat(test_fd, "parent_symlink/outside_file.txt", tptr, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(utimensat(test_fd, "circular_symlink", tptr, AT_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(utimensat(test_fd, "../test_dir/inside_file.txt", tptr, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(utimensat(test_fd, "symlink_absolute/test_dir/inside_file.txt", tptr, AT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); +} + +T_DECL(resolve_beneath_getxattr, + "test getxattr()/fgetxattr() using the XATTR_RESOLVE_BENEATH flag") +{ + char xattr_buff[100]; + const char *xattr = "test1234"; + size_t xattr_len = strlen(xattr); + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_getxattr"); + + /* Changing current directory to the test directory */ + T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); + + /* Setting extended attributes */ + T_ASSERT_POSIX_SUCCESS(setxattr("inside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), "Setting extended attributes to inside_file.txt"); + T_ASSERT_POSIX_SUCCESS(setxattr("../outside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), "Setting extended attributes to outside_file.txt"); + T_ASSERT_POSIX_SUCCESS(setxattr("nested/nested_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), "Setting extended attributes to nested_file.txt"); + + T_SETUPEND; + + T_LOG("Testing the getxattr() syscall using XATTR_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(getxattr("inside_file.txt", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(getxattr("symlink", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(getxattr("../outside_file.txt", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(getxattr("nested/nested_file.txt", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(getxattr("symlink_to_nested", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(getxattr("/etc/passwd", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(getxattr("parent_symlink/outside_file.txt", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(getxattr("circular_symlink", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(getxattr("../test_dir/inside_file.txt", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(getxattr("symlink_absolute/test_dir/inside_file.txt", XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); + + T_LOG("Testing the fgetxattr() syscall using XATTR_RESOLVE_BENEATH"); + + /* Test Case 11: Verifying that fgetxattr() fails with EINVAL */ + T_EXPECT_POSIX_FAILURE(fgetxattr(test_fd, XATTR_RESOURCEFORK_NAME, xattr_buff, sizeof(xattr_buff), 0, XATTR_RESOLVE_BENEATH), EINVAL, "Test Case 11: Verifying that fgetxattr() fails with EINVAL"); +} + +T_DECL(resolve_beneath_setxattr, + "test setxattr()/fsetxattr() using the XATTR_RESOLVE_BENEATH flag") +{ + const char *xattr = "test1234"; + size_t xattr_len = strlen(xattr); + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_setxattr"); + + /* Changing current directory to the test directory */ + T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); + + T_SETUPEND; + + T_LOG("Testing the setxattr() syscall using XATTR_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(setxattr("inside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(setxattr("symlink", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(setxattr("../outside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(setxattr("nested/nested_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(setxattr("symlink_to_nested", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(setxattr("/etc/passwd", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(setxattr("parent_symlink/outside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(setxattr("circular_symlink", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(setxattr("../test_dir/inside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(setxattr("symlink_absolute/test_dir/inside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); + + T_LOG("Testing the fsetxattr() syscall using XATTR_RESOLVE_BENEATH"); + + /* Test Case 11: Verifying that fsetxattr() fails with EINVAL */ + T_EXPECT_POSIX_FAILURE(fsetxattr(test_fd, XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, XATTR_RESOLVE_BENEATH), EINVAL, "Test Case 11: Verifying that fsetxattr() fails with EINVAL"); +} + +T_DECL(resolve_beneath_listxattr, + "test listxattr()/flistxattr() using the XATTR_RESOLVE_BENEATH flag") +{ + char xattr_buff[100]; + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_listxattr"); + + /* Changing current directory to the test directory */ + T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); + + T_SETUPEND; + + T_LOG("Testing the listxattr() syscall using XATTR_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(listxattr("inside_file.txt", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(listxattr("symlink", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(listxattr("../outside_file.txt", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(listxattr("nested/nested_file.txt", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(listxattr("symlink_to_nested", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(listxattr("/etc/passwd", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(listxattr("parent_symlink/outside_file.txt", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(listxattr("circular_symlink", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(listxattr("../test_dir/inside_file.txt", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(listxattr("symlink_absolute/test_dir/inside_file.txt", xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); + + T_LOG("Testing the flistxattr() syscall using XATTR_RESOLVE_BENEATH"); + + /* Test Case 11: Verifying that flistxattr() fails with EINVAL */ + T_EXPECT_POSIX_FAILURE(flistxattr(test_fd, xattr_buff, sizeof(xattr_buff), XATTR_RESOLVE_BENEATH), EINVAL, "Test Case 11: Verifying that flistxattr() fails with EINVAL"); +} + +T_DECL(resolve_beneath_removexattr, + "test removexattr()/fremovexattr() using the XATTR_RESOLVE_BENEATH flag") +{ + const char *xattr = "test1234"; + size_t xattr_len = strlen(xattr); + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_removexattr"); + + /* Changing current directory to the test directory */ + T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); + + /* Setting extended attributes */ + T_ASSERT_POSIX_SUCCESS(setxattr("inside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), "Setting extended attributes to inside_file.txt"); + T_ASSERT_POSIX_SUCCESS(setxattr("../outside_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), "Setting extended attributes to outside_file.txt"); + T_ASSERT_POSIX_SUCCESS(setxattr("nested/nested_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), "Setting extended attributes to nested_file.txt"); + + T_SETUPEND; + + T_LOG("Testing the removexattr() syscall using XATTR_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(removexattr("inside_file.txt", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + 
T_EXPECT_POSIX_FAILURE(removexattr("symlink", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." to navigate outside */ + T_EXPECT_POSIX_FAILURE(removexattr("../outside_file.txt", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(removexattr("nested/nested_file.txt", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + if (setxattr("nested/nested_file.txt", XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0) < 0) { + T_FAIL("Unable to setxattr to nested_file.txt"); + } + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(removexattr("symlink_to_nested", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(removexattr("/etc/passwd", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(removexattr("parent_symlink/outside_file.txt", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(removexattr("circular_symlink", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(removexattr("../test_dir/inside_file.txt", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(removexattr("symlink_absolute/test_dir/inside_file.txt", XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); + + T_LOG("Testing the fremovexattr() syscall using XATTR_RESOLVE_BENEATH"); + + /* Test Case 11: Verifying that fremovexattr() fails with EINVAL */ + T_EXPECT_POSIX_FAILURE(fremovexattr(test_fd, XATTR_RESOURCEFORK_NAME, XATTR_RESOLVE_BENEATH), EINVAL, "Test Case 11: Verifying that fremovexattr() fails with EINVAL"); +} + +T_DECL(resolve_beneath_clonefile, + "test clonefile()/clonefileat()/fclonefileat() using the CLONE_RESOLVE_BENEATH flag") +{ + int fd; + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_clonefile"); + + /* Changing current directory to the test directory */ + T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); + + /* Open test file */ + T_ASSERT_POSIX_SUCCESS((fd = open("inside_file.txt", O_RDWR, 0777)), "Opening %s", INSIDE_FILE); + + T_SETUPEND; + + T_LOG("Testing the clonefile() syscall using CLONE_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(clonefile("inside_file.txt", "inside_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + unlink("inside_file_2.txt"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(clonefile("symlink", "inside_file_2.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test 
Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." to navigate outside */ + T_EXPECT_POSIX_FAILURE(clonefile("inside_file.txt", "../outside_file.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(clonefile("nested/nested_file.txt", "nested/nested_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + unlinkat(test_fd, "nested/nested_file_2.txt", 0); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(clonefile("symlink_to_nested", "nested/nested_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + unlinkat(test_fd, "nested/nested_file_2.txt", 0); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(clonefile("/etc/passwd", "inside_file_2.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + T_LOG("Testing the clonefileat() syscall using CLONE_RESOLVE_BENEATH"); + + /* Test Case 7: File within the directory */ + T_EXPECT_POSIX_SUCCESS(clonefileat(test_fd, "inside_file.txt", test_fd, "inside_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 7: File within the directory"); + unlinkat(test_fd, "inside_file_2.txt", 0); + + /* Test Case 8: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(clonefileat(test_fd, "symlink", test_fd, "inside_file_2.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 8: File using a symlink pointing outside"); + + /* Test Case 9: Attempt to open a file using ".." to navigate outside */ + T_EXPECT_POSIX_FAILURE(clonefileat(test_fd, "inside_file.txt", test_fd, "../outside_file.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: File using \"..\" to navigate outside"); + + /* Test Case 10: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(clonefileat(test_fd, "nested/nested_file.txt", test_fd, "nested/nested_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 10: File within a nested directory"); + unlinkat(test_fd, "nested/nested_file_2.txt", 0); + + /* Test Case 11: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(clonefileat(test_fd, "symlink_to_nested", test_fd, "nested/nested_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 11: Symlink to a file within the same directory"); + unlinkat(test_fd, "nested/nested_file_2.txt", 0); + + /* Test Case 12: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(clonefileat(test_fd, "/etc/passwd", test_fd, "inside_file_2.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 12: File using an absolute path"); + + T_LOG("Testing the fclonefileat() syscall using CLONE_RESOLVE_BENEATH"); + + /* Test Case 13: File within the directory */ + T_EXPECT_POSIX_SUCCESS(fclonefileat(fd, test_fd, "inside_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 13: File within the directory"); + unlinkat(test_fd, "inside_file_2.txt", 0); + + /* Test Case 14: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(fclonefileat(fd, test_fd, "symlink_absolute/test_dir/inside_file.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 14: File using a symlink pointing outside"); + + /* Test Case 15: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(fclonefileat(fd, test_fd, "../outside_file.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 15: File using \"..\" to navigate outside"); + + /* Test Case 16: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(fclonefileat(fd, test_fd, "nested/nested_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 16: File within a nested directory"); + unlinkat(test_fd, "nested/nested_file_2.txt", 0); + + /* Test Case 17: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(fclonefileat(fd, test_fd, "nested/nested_file_2.txt", CLONE_RESOLVE_BENEATH), "Test Case 17: Symlink to a file within the same directory"); + unlinkat(test_fd, "nested/nested_file_2.txt", 0); + + /* Test Case 18: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(fclonefileat(fd, test_fd, "/etc/inside_file_2.txt", CLONE_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 18: File using an absolute path"); + + T_EXPECT_POSIX_SUCCESS(close(fd), "Closing %s", INSIDE_FILE); +} + +T_DECL(resolve_beneath_renamex_np, + "test renamex_np()/renameatx_np() using the RENAME_RESOLVE_BENEATH flag") +{ + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_renamex_np"); + + /* Changing current directory to the test directory */ + T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); + + T_SETUPEND; + + T_LOG("Testing the renamex_np() syscall using RENAME_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(renamex_np("inside_file.txt", "inside_file_2.txt", RENAME_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + if (renamex_np("inside_file_2.txt", "inside_file.txt", 0)) { + T_FAIL("Unable to rename inside_file_2.txt to inside_file.txt"); + } + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(renamex_np("symlink/.", "inside_file_2.txt", RENAME_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(renamex_np("inside_file.txt", "../outside_file.txt", RENAME_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(renamex_np("nested/nested_file.txt", "nested/nested_file_2.txt", RENAME_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + if (renamex_np("nested/nested_file_2.txt", "nested/nested_file.txt", 0)) { + T_FAIL("Unable to rename nested/nested_file_2.txt to nested/nested_file.txt"); + } + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(renamex_np("symlink_to_nested", "nested/nested_file_2.txt", RENAME_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + if (renamex_np("nested/nested_file_2.txt", "symlink_to_nested", 0)) { + T_FAIL("Unable to rename nested/nested_file_2.txt to symlink_to_nested"); + } + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(renamex_np("/etc/passwd", "inside_file_2.txt", RENAME_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + T_LOG("Testing the renameatx_np() syscall using RENAME_RESOLVE_BENEATH"); + + /* Test Case 7: File within the directory */ + T_EXPECT_POSIX_SUCCESS(renameatx_np(test_fd, "inside_file.txt", test_fd, "inside_file_2.txt", RENAME_RESOLVE_BENEATH), "Test Case 7: File within the directory"); + if (renamex_np("inside_file_2.txt", "inside_file.txt", 0)) { + T_FAIL("Unable to rename inside_file_2.txt to inside_file.txt"); + } + + /* Test Case 8: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(renameatx_np(test_fd, "symlink/.", test_fd, "inside_file_2.txt", RENAME_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 8: File using a symlink pointing outside"); + + /* Test Case 9: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(renameatx_np(test_fd, "inside_file.txt", test_fd, "../outside_file.txt", RENAME_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: File using \"..\" to navigate outside"); + + /* Test Case 10: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(renameatx_np(test_fd, "nested/nested_file.txt", test_fd, "nested/nested_file_2.txt", RENAME_RESOLVE_BENEATH), "Test Case 10: File within a nested directory"); + if (renamex_np("nested/nested_file_2.txt", "nested/nested_file.txt", 0)) { + T_FAIL("Unable to rename nested/nested_file_2.txt to nested/nested_file.txt"); + } + + /* Test Case 11: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(renameatx_np(test_fd, "symlink_to_nested", test_fd, "nested/nested_file_2.txt", RENAME_RESOLVE_BENEATH), "Test Case 11: Symlink to a file within the same directory"); + if (renamex_np("nested/nested_file_2.txt", "symlink_to_nested", 0)) { + T_FAIL("Unable to rename nested/nested_file_2.txt to symlink_to_nested"); + } + + /* Test Case 12: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(renameatx_np(test_fd, "/etc/passwd", test_fd, "inside_file_2.txt", RENAME_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 12: File using an absolute path"); +} + +T_DECL(resolve_beneath_getattrlist, + "test getattrlist()/fgetattrlist()/getattrlistat() using the FSOPT_RESOLVE_BENEATH flag") +{ + int fd; + + struct myattrbuf { + uint32_t length; + attribute_set_t returned_attrs; + vol_attributes_attr_t vol_attributes; + attrreference_t fstypename_ref; + uint32_t fssubtype; + char fstypename[MFSTYPENAMELEN]; + } attrbuf; + + struct attrlist attrs = { + .bitmapcount = ATTR_BIT_MAP_COUNT, + .commonattr = ATTR_CMN_RETURNED_ATTRS, + /* + * Request ATTR_VOL_ATTRIBUTES to ensure that + * ATTR_VOL_FSTYPENAME and ATTR_VOL_FSSUBTYPE + * are packed into the buffer *after*. + */ + .volattr = ATTR_VOL_INFO | ATTR_VOL_ATTRIBUTES | + ATTR_VOL_FSTYPENAME | ATTR_VOL_FSSUBTYPE, + }; + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_beneath_getattrlist"); + + /* Changing current directory to the test directory */ + T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); + + /* Open test file */ + T_ASSERT_POSIX_SUCCESS((fd = open("inside_file.txt", O_RDWR, 0777)), "Opening %s", INSIDE_FILE); + + T_SETUPEND; + + T_LOG("Testing the getattrlist() syscall using FSOPT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(getattrlist("inside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(getattrlist("symlink", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(getattrlist("../outside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(getattrlist("nested/nested_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(getattrlist("symlink_to_nested", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(getattrlist("/etc/passwd", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(getattrlist("parent_symlink/outside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(getattrlist("circular_symlink", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(getattrlist("../test_dir/inside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(getattrlist("symlink_absolute/test_dir/inside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); + + T_LOG("Testing the fgetattrlist() syscall using FSOPT_RESOLVE_BENEATH"); + + /* Test Case 11: fgetattrlist() syscall using FSOPT_RESOLVE_BENEATH */ + T_EXPECT_POSIX_SUCCESS(fgetattrlist(fd, &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), "Test Case 11: fgetattrlist() syscall using FSOPT_RESOLVE_BENEATH"); + + T_LOG("Testing the getattrlistat() syscall using FSOPT_RESOLVE_BENEATH"); + + /* Test Case 12: File within the directory */ + T_EXPECT_POSIX_SUCCESS(getattrlistat(test_fd, "inside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), "Test Case 12: File within the directory"); + + /* Test Case 13: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(getattrlistat(test_fd, "symlink", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 13: File using a symlink pointing outside"); + + /* Test Case 14: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(getattrlistat(test_fd, "../outside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 14: File using \"..\" to navigate outside"); + + /* Test Case 15: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(getattrlistat(test_fd, "nested/nested_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), "Test Case 15: File within a nested directory"); + + /* Test Case 16: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(getattrlistat(test_fd, "symlink_to_nested", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), "Test Case 16: Symlink to a file within the same directory"); + + /* Test Case 17: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(getattrlistat(test_fd, "/etc/passwd", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 17: File using an absolute path"); + + /* Test Case 18: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(getattrlistat(test_fd, "parent_symlink/outside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 18: Valid symlink to parent directory"); + + /* Test Case 19: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(getattrlistat(test_fd, "circular_symlink", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ELOOP, "Test Case 19: Circular symlink within directory"); + + /* Test Case 20: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(getattrlistat(test_fd, "../test_dir/inside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 20: Path can not escape outside at any point of the resolution"); + + /* Test Case 21: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(getattrlistat(test_fd, "symlink_absolute/test_dir/inside_file.txt", &attrs, &attrbuf, sizeof(attrbuf), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 21: File using a symlink pointing to absolute path"); + + T_EXPECT_POSIX_SUCCESS(close(fd), "Closing %s", INSIDE_FILE); +} + +T_DECL(resolve_beneath_setattrlist, + "test setattrlist()/fsetattrlist()/setattrlistat() using the FSOPT_RESOLVE_BENEATH flag") +{ + int fd; + int flags; + struct attrlist attrlist; + + T_SETUPBEGIN; + + flags = 0; + memset(&attrlist, 0, sizeof(attrlist)); + attrlist.bitmapcount = ATTR_BIT_MAP_COUNT; + attrlist.commonattr = ATTR_CMN_FLAGS; + + T_ATEND(cleanup); + setup("resolve_beneath_setattrlist"); + + /* Changing current directory to the test directory */ + T_ASSERT_POSIX_SUCCESS(fchdir(test_fd), "Changing directory to %s/%s", testdir, TEST_DIR); + + /* Open test file */ + T_ASSERT_POSIX_SUCCESS((fd = open("inside_file.txt", O_RDWR, 0777)), "Opening %s", INSIDE_FILE); + + T_SETUPEND; + + T_LOG("Testing the setattrlist() syscall using FSOPT_RESOLVE_BENEATH"); + + /* Test Case 1: File within the directory */ + T_EXPECT_POSIX_SUCCESS(setattrlist("inside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), "Test Case 1: File within the directory"); + + /* Test Case 2: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(setattrlist("symlink", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 2: File using a symlink pointing outside"); + + /* Test Case 3: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(setattrlist("../outside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 3: File using \"..\" to navigate outside"); + + /* Test Case 4: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(setattrlist("nested/nested_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), "Test Case 4: File within a nested directory"); + + /* Test Case 5: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(setattrlist("symlink_to_nested", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), "Test Case 5: Symlink to a file within the same directory"); + + /* Test Case 6: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(setattrlist("/etc/passwd", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 6: File using an absolute path"); + + /* Test Case 7: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(setattrlist("parent_symlink/outside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 7: Valid symlink to parent directory"); + + /* Test Case 8: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(setattrlist("circular_symlink", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ELOOP, "Test Case 8: Circular symlink within directory"); + + /* Test Case 9: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(setattrlist("../test_dir/inside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 9: Path can not escape outside at any point of the resolution"); + + /* Test Case 10: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(setattrlist("symlink_absolute/test_dir/inside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 10: File using a symlink pointing to absolute path"); + + T_LOG("Testing the fsetattrlist() syscall using FSOPT_RESOLVE_BENEATH"); + + /* Test Case 11: fsetattrlist() syscall using FSOPT_RESOLVE_BENEATH */ + T_EXPECT_POSIX_SUCCESS(fsetattrlist(fd, &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), "Test Case 11: fsetattrlist() syscall using FSOPT_RESOLVE_BENEATH"); + + T_LOG("Testing the setattrlistat() syscall using FSOPT_RESOLVE_BENEATH"); + + /* Test Case 12: File within the directory */ + T_EXPECT_POSIX_SUCCESS(setattrlistat(test_fd, "inside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), "Test Case 12: File within the directory"); + + /* Test Case 13: File using a symlink pointing outside */ + T_EXPECT_POSIX_FAILURE(setattrlistat(test_fd, "symlink", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 13: File using a symlink pointing outside"); + + /* Test Case 14: Attempt to open a file using ".." 
to navigate outside */ + T_EXPECT_POSIX_FAILURE(setattrlistat(test_fd, "../outside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 14: File using \"..\" to navigate outside"); + + /* Test Case 15: File within a nested directory */ + T_EXPECT_POSIX_SUCCESS(setattrlistat(test_fd, "nested/nested_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), "Test Case 15: File within a nested directory"); + + /* Test Case 16: Symlink to a file in a nested directory */ + T_EXPECT_POSIX_SUCCESS(setattrlistat(test_fd, "symlink_to_nested", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), "Test Case 16: Symlink to a file within the same directory"); + + /* Test Case 17: File using an absolute path */ + T_EXPECT_POSIX_FAILURE(setattrlistat(test_fd, "/etc/passwd", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 17: File using an absolute path"); + + /* Test Case 18: Valid symlink to parent directory */ + T_EXPECT_POSIX_FAILURE(setattrlistat(test_fd, "parent_symlink/outside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 18: Valid symlink to parent directory"); + + /* Test Case 19: Circular symlink within directory */ + T_EXPECT_POSIX_FAILURE(setattrlistat(test_fd, "circular_symlink", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ELOOP, "Test Case 19: Circular symlink within directory"); + + /* Test Case 20: Path can not escape outside at any point of the resolution */ + T_EXPECT_POSIX_FAILURE(setattrlistat(test_fd, "../test_dir/inside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 20: Path can not escape outside at any point of the resolution"); + + /* Test Case 21: File using a symlink pointing to absolute path */ + T_EXPECT_POSIX_FAILURE(setattrlistat(test_fd, "symlink_absolute/test_dir/inside_file.txt", &attrlist, &flags, sizeof(flags), FSOPT_RESOLVE_BENEATH), ENOTCAPABLE, "Test Case 21: File using a symlink pointing to absolute path"); + + T_EXPECT_POSIX_SUCCESS(close(fd), "Closing %s", INSIDE_FILE); +} diff --git a/tests/vfs/resolve_namespace.c b/tests/vfs/resolve_namespace.c new file mode 100644 index 000000000..9434780a4 --- /dev/null +++ b/tests/vfs/resolve_namespace.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
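The resolve_beneath cases above all follow one calling convention: a directory fd opened with O_SEARCH, an attrlist describing the wanted attributes, and FSOPT_RESOLVE_BENEATH in the options word, with any lookup that escapes the starting directory failing with ENOTCAPABLE. A minimal sketch of that pattern, not taken from the patch itself: the directory and file names below are hypothetical, and the ENOTCAPABLE fallback mirrors the #ifndef guard the resolve_namespace test in this patch uses.

    #include <sys/attr.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <errno.h>

    #ifndef ENOTCAPABLE
    #define ENOTCAPABLE 107   /* same fallback value the tests define */
    #endif

    /* Query one attribute of "name.txt" relative to "some_dir", refusing any
     * lookup that escapes the starting directory. */
    static int
    query_beneath(void)
    {
        struct attrlist al = {
            .bitmapcount = ATTR_BIT_MAP_COUNT,
            .commonattr  = ATTR_CMN_OBJTYPE,
        };
        char buf[64];
        int dirfd = open("some_dir", O_SEARCH);
        if (dirfd < 0) {
            return -1;
        }
        int ret = getattrlistat(dirfd, "name.txt", &al, buf, sizeof(buf),
            FSOPT_RESOLVE_BENEATH);
        if (ret == -1 && errno == ENOTCAPABLE) {
            /* the path tried to escape "some_dir" (e.g. "..", absolute
             * path, or a symlink pointing outside) */
        }
        (void)close(dirfd);
        return ret;
    }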
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o resolve_namespace resolve_namespace.c -g -Weverything */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +static char template[MAXPATHLEN]; +static char testdir_path[MAXPATHLEN + 1]; +static char *testdir = NULL; +static int testdir_fd = -1, test_fd = -1; + +#ifndef ENOTCAPABLE +#define ENOTCAPABLE 107 +#endif + +#ifndef RESOLVE_NOFOLLOW_ANY +#define RESOLVE_NOFOLLOW_ANY 0x00000001 +#endif + +#ifndef RESOLVE_NODOTDOT +#define RESOLVE_NODOTDOT 0x00000002 +#endif + +#ifndef RESOLVE_NODEVFS +#define RESOLVE_NODEVFS 0x00000008 +#endif + +#ifndef RESOLVE_UNIQUE +#define RESOLVE_UNIQUE 0x00000020 +#endif + +#ifndef RESOLVE_NOXATTRS +#define RESOLVE_NOXATTRS 0x00000040 +#endif + +#define TEST_DIR "test_dir" +#define FILE "test_dir/file.txt" +#define FILE2 "test_dir/file2.txt" +#define FILE3 "test_dir/file3.txt" +#define DIR_SYMLINK "test_dir/dir_symlink" +#define FILE_SYMLINK "test_dir/dir_symlink/file_symlink.txt" +#define FILE_SYMLINK_2 "test_dir/dir_symlink/file_symlink_2.txt" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (test_fd != -1) { + close(test_fd); + } + if (testdir_fd != -1) { + unlinkat(testdir_fd, FILE_SYMLINK, 0); + unlinkat(testdir_fd, FILE_SYMLINK_2, 0); + unlinkat(testdir_fd, FILE, 0); + unlinkat(testdir_fd, DIR_SYMLINK, 0); + unlinkat(testdir_fd, TEST_DIR, AT_REMOVEDIR); + + close(testdir_fd); + if (rmdir(testdir)) { + T_FAIL("Unable to remove the test directory (%s)", testdir); + } + } +} + +static void +setup(const char *dirname) +{ + int fd; + char symlink_path[PATH_MAX]; + + testdir_fd = test_fd = -1; + + /* Create test root directory */ + snprintf(template, sizeof(template), "%s/%s-XXXXXX", dt_tmpdir(), dirname); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root directory"); + T_ASSERT_POSIX_SUCCESS((testdir_fd = open(testdir, O_SEARCH, 0777)), "Opening test root directory %s", testdir); + T_ASSERT_POSIX_SUCCESS(fcntl(testdir_fd, F_GETPATH, testdir_path), "Calling fcntl() to get the path"); + + /* Create test directories */ + T_ASSERT_POSIX_SUCCESS(mkdirat(testdir_fd, TEST_DIR, 0777), "Creating %s/%s", testdir_path, TEST_DIR); + T_ASSERT_POSIX_SUCCESS((test_fd = openat(testdir_fd, TEST_DIR, O_SEARCH, 0777)), "Opening test directory %s/%s", testdir_path, TEST_DIR); + + /* Create the test files */ + snprintf(symlink_path, sizeof(symlink_path), "%s/%s/../", testdir_path, TEST_DIR); + T_ASSERT_POSIX_SUCCESS(symlinkat(symlink_path, testdir_fd, DIR_SYMLINK), "Creating symbolic link %s ---> %s", DIR_SYMLINK, symlink_path); + T_ASSERT_POSIX_SUCCESS((fd = openat(testdir_fd, FILE, O_CREAT | O_RDWR, 0777)), "Creating %s", FILE); + close(fd); + T_ASSERT_POSIX_SUCCESS((fd = openat(testdir_fd, FILE_SYMLINK, O_CREAT | O_RDWR, 0777)), "Creating %s", FILE_SYMLINK); + close(fd); +} + +T_DECL(resolve_namespace_nofollow, + "Test the RESOLVE_NOFOLLOW_ANY prefix-path") +{ + int fd; + + char file_nofollow[PATH_MAX]; + char symlink_nofollow[PATH_MAX]; + char symlink_resolve[PATH_MAX]; + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_namespace_nofollow"); + + /* Setup file names */ + snprintf(file_nofollow, sizeof(file_nofollow), 
"/.nofollow/%s/%s", testdir_path, FILE); + snprintf(symlink_nofollow, sizeof(symlink_nofollow), "/.nofollow/%s/%s", testdir_path, FILE_SYMLINK); + snprintf(symlink_resolve, sizeof(symlink_resolve), "/.resolve/%d/%s/%s", RESOLVE_NOFOLLOW_ANY, testdir_path, FILE_SYMLINK_2); + + T_SETUPEND; + + T_EXPECT_POSIX_SUCCESS((fd = openat(testdir_fd, FILE, O_NOFOLLOW_ANY)), "Testing openat(O_NOFOLLOW_ANY) using path with no symlinks"); + close(fd); + + T_EXPECT_POSIX_SUCCESS((fd = open(file_nofollow, O_NOFOLLOW_ANY)), "Testing open() using path with no symlinks and '.nofollow' prefix"); + close(fd); + + T_EXPECT_POSIX_FAILURE((fd = openat(testdir_fd, FILE_SYMLINK, O_NOFOLLOW_ANY)), ELOOP, "Testing openat(O_NOFOLLOW_ANY) using path with a symlink"); + T_EXPECT_POSIX_FAILURE((fd = open(symlink_nofollow, 0)), ELOOP, "Testing open() using path with a symlink and '.nofollow' prefix"); + + T_EXPECT_POSIX_FAILURE((fd = openat(testdir_fd, FILE_SYMLINK_2, O_CREAT | O_NOFOLLOW_ANY)), ELOOP, "Testing openat(O_CREAT | O_NOFOLLOW_ANY) using path with a symlink"); + T_EXPECT_POSIX_FAILURE((fd = open(symlink_resolve, O_CREAT)), ELOOP, "Testing open(O_CREAT) using path with a symlink and '.resolve' prefix"); +} + +T_DECL(resolve_namespace_nodotdot, + "Test the RESOLVE_NODOTDOT prefix-path") +{ + int fd; + + char file_dotdot[PATH_MAX]; + char file_nodotdot[PATH_MAX]; + char symlink_dotdot[PATH_MAX]; + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_namespace_nodotdot"); + + /* Setup file names */ + snprintf(file_dotdot, sizeof(file_dotdot), "/.resolve/%d/%s/%s/../%s", RESOLVE_NODOTDOT, testdir_path, TEST_DIR, FILE); + snprintf(file_nodotdot, sizeof(file_nodotdot), "/.resolve/%d/%s/%s", RESOLVE_NODOTDOT, testdir_path, FILE); + snprintf(symlink_dotdot, sizeof(symlink_dotdot), "/.resolve/%d/%s/%s", RESOLVE_NODOTDOT, testdir_path, FILE_SYMLINK); + + T_SETUPEND; + + T_EXPECT_POSIX_SUCCESS((fd = open(file_nodotdot, O_RDONLY)), "Testing open(O_RDONLY) without '..'"); + close(fd); + + T_EXPECT_POSIX_FAILURE((fd = open(file_dotdot, O_RDONLY)), ENOTCAPABLE, "Testing open(O_RDONLY) using path including '..'"); + T_EXPECT_POSIX_FAILURE((fd = open(file_dotdot, O_RDONLY | O_CREAT)), ENOTCAPABLE, "Testing open(O_RDONLY | O_CREAT) using path including '..'"); + T_EXPECT_POSIX_FAILURE((fd = open(symlink_dotdot, O_RDONLY)), ENOTCAPABLE, "Testing open(O_RDONLY) using path with a symlink including '..'"); + T_EXPECT_POSIX_FAILURE((fd = open(symlink_dotdot, O_RDONLY | O_CREAT)), ENOTCAPABLE, "Testing open(O_RDONLY | O_CREAT) using path with a symlink including '..'"); +} + +T_DECL(resolve_namespace_nodevfs, + "Test the RESOLVE_NODEVFS prefix-path") +{ + int fd, dirfd; + struct stat statbuf; + char path[PATH_MAX]; + const char *dir = "/private/var/tmp/"; + + T_SETUPBEGIN; + + T_ASSERT_POSIX_SUCCESS((dirfd = open(dir, O_RDONLY | O_DIRECTORY)), "Opening %s", dir); + + T_SETUPEND; + + snprintf(path, sizeof(path), "/dev/null"); + T_EXPECT_POSIX_SUCCESS((fd = open(path, O_RDONLY)), "Opening %s -> should PASS", path); + close(fd); + + snprintf(path, sizeof(path), "/.resolve/%d/dev/null", RESOLVE_NODEVFS); + T_EXPECT_POSIX_FAILURE((fd = open(path, O_RDONLY)), ENOTCAPABLE, "Opening %s -> Should fail with ENOTCAPABLE", path); + + snprintf(path, sizeof(path), "/dev/nosuchdir/nosuchfile.txt"); + T_EXPECT_POSIX_FAILURE((fd = open(path, O_RDONLY)), ENOENT, "Opening a non-existent file %s -> Should fail with ENOENT", path); + + snprintf(path, sizeof(path), "/.resolve/%d/dev/nosuchdir/nosuchfile.txt", RESOLVE_NODEVFS); + T_EXPECT_POSIX_FAILURE((fd 
= open(path, O_RDONLY)), ENOTCAPABLE, "Opening a non-existent file %s -> Should fail with ENOTCAPABLE", path); + + snprintf(path, sizeof(path), "/dev/nosuchfile.txt"); + T_EXPECT_EQ((fd = open(path, O_RDWR | O_CREAT)), -1, "Creating a file %s -> Should fail with an error", path); + + snprintf(path, sizeof(path), "/.resolve/%d/dev/nosuchfile.txt", RESOLVE_NODEVFS); + T_EXPECT_POSIX_FAILURE((fd = open(path, O_RDWR | O_CREAT)), ENOTCAPABLE, "Creating a file %s -> Should fail with ENOTCAPABLE", path); + + snprintf(path, sizeof(path), "/dev/../"); + T_EXPECT_POSIX_SUCCESS((stat(path, &statbuf)), "Calling stat() for %s -> Should PASS", path); + T_EXPECT_POSIX_SUCCESS((fd = open(path, O_RDONLY)), "Opening %s -> Should PASS", path); + close(fd); + + snprintf(path, sizeof(path), "/.resolve/%d/dev/../", RESOLVE_NODEVFS); + T_EXPECT_POSIX_FAILURE((stat(path, &statbuf)), ENOTCAPABLE, "Calling stat() for %s -> Should fail with ENOTCAPABLE", path); + T_EXPECT_POSIX_FAILURE((fd = open(path, O_RDONLY)), ENOTCAPABLE, "Opening %s -> Should fail with ENOTCAPABLE", path); + + snprintf(path, sizeof(path), "/dev/fd/%d", dirfd); + T_EXPECT_POSIX_SUCCESS((stat(path, &statbuf)), "Calling stat() for %s -> Should PASS", path); + T_EXPECT_POSIX_SUCCESS((fd = open(path, O_RDONLY)), "Opening %s paths -> Should PASS", path); + close(fd); + + snprintf(path, sizeof(path), "/.resolve/%d/dev/fd/%d", RESOLVE_NODEVFS, dirfd); + T_EXPECT_POSIX_FAILURE((stat(path, &statbuf)), ENOTCAPABLE, "Calling stat() for %s -> Should fail with ENOTCAPABLE", path); + T_EXPECT_POSIX_FAILURE((fd = open(path, O_RDONLY)), ENOTCAPABLE, "Opening %s -> Should fail with ENOTCAPABLE", path); + + close(dirfd); +} + +T_DECL(resolve_namespace_unique, + "Test the RESOLVE_UNIQUE prefix-path") +{ + int fd; + struct stat statbuf; + char file_unique[PATH_MAX], file_unique_symlink[PATH_MAX], file3[PATH_MAX]; + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_namespace_unique"); + snprintf(file3, sizeof(file3), "%s/%s", testdir_path, FILE3); + snprintf(file_unique, sizeof(file_unique), "/.resolve/%d/%s/%s", RESOLVE_UNIQUE, testdir_path, FILE); + snprintf(file_unique_symlink, sizeof(file_unique_symlink), "/.resolve/%d/%s/%s/%s", RESOLVE_UNIQUE, testdir_path, DIR_SYMLINK, FILE); + + T_SETUPEND; + + /* Validate nlink count equals 1 */ + T_EXPECT_POSIX_SUCCESS((stat(file_unique, &statbuf)), "Calling stat() for %s -> Should PASS", file_unique); + T_EXPECT_EQ(statbuf.st_nlink, 1, "Validate nlink equals 1"); + T_EXPECT_POSIX_SUCCESS((fd = open(file_unique, O_RDONLY)), "Opening %s -> Should PASS", file_unique); + close(fd); + + /* Increase nlink count */ + T_EXPECT_POSIX_SUCCESS(linkat(testdir_fd, FILE, testdir_fd, FILE2, 0), "Calling linkat() for %s, %s -> Should PASS", FILE, FILE2); + + /* Validate nlink count equals 2 */ + T_EXPECT_POSIX_SUCCESS((fstatat(testdir_fd, FILE, &statbuf, 0)), "Calling fstatat() for %s -> Should PASS", FILE); + T_EXPECT_EQ(statbuf.st_nlink, 2, "Validate nlink equals 2"); + + /* Validate ENOTCAPABLE */ + T_EXPECT_POSIX_FAILURE((stat(file_unique, &statbuf)), ENOTCAPABLE, "Calling stat() for %s -> Should fail with ENOTCAPABLE", file_unique); + T_EXPECT_POSIX_FAILURE((fd = open(file_unique, O_RDONLY)), ENOTCAPABLE, "Opening %s -> Should fail with ENOTCAPABLE", file_unique); + T_EXPECT_POSIX_FAILURE(link(file_unique, file3), ENOTCAPABLE, "Calling link() for %s, %s -> Should fail with ENOTCAPABLE", file_unique, file3); + T_EXPECT_POSIX_FAILURE(rename(file_unique_symlink, file3), ENOTCAPABLE, "Calling rename() for %s, %s -> Should fail with 
ENOTCAPABLE", file_unique_symlink, file3); + + /* Reduce nlink count */ + T_EXPECT_POSIX_SUCCESS(unlinkat(testdir_fd, FILE2, 0), "Calling unlinkat() for %s -> Should PASS", FILE2); + + /* Validate nlink count equals 1 */ + T_EXPECT_POSIX_SUCCESS((stat(file_unique, &statbuf)), "Calling stat() for %s -> Should PASS", file_unique); + T_EXPECT_EQ(statbuf.st_nlink, 1, "Validate nlink equals 1"); + T_EXPECT_POSIX_SUCCESS((fd = open(file_unique, O_RDONLY)), "Opening %s -> Should PASS", file_unique); + close(fd); +} + +T_DECL(resolve_namespace_noxattrs, + "Test the RESOLVE_NOXATTRS prefix-path") +{ + int fd; + struct stat statbuf; + const char *xattr = "test1234"; + size_t xattr_len = strlen(xattr); + char file_path[PATH_MAX]; + char file_rfork[PATH_MAX], file_noxattrs_rfork[PATH_MAX]; + + T_SETUPBEGIN; + + T_ATEND(cleanup); + setup("resolve_namespace_noxattrs"); + snprintf(file_path, sizeof(file_path), "%s/%s", testdir_path, FILE); + + snprintf(file_rfork, sizeof(file_rfork), "%s/%s/%s", testdir_path, FILE, _PATH_RSRCFORKSPEC); + snprintf(file_noxattrs_rfork, sizeof(file_noxattrs_rfork), "/.resolve/%d/%s/%s/%s", RESOLVE_NOXATTRS, testdir_path, FILE, _PATH_RSRCFORKSPEC); + + /* Set ResourceFork extended attribute */ + T_ASSERT_POSIX_SUCCESS(setxattr(file_path, XATTR_RESOURCEFORK_NAME, xattr, xattr_len, 0, 0), "Setting ResourceFork of %s to '%s'", file_path, xattr); + + T_SETUPEND; + + /* Call stat() for the resource fork file */ + T_EXPECT_POSIX_SUCCESS((stat(file_rfork, &statbuf)), "Calling stat() for %s -> Should PASS", file_rfork); + T_EXPECT_POSIX_FAILURE((stat(file_noxattrs_rfork, &statbuf)), ENOTCAPABLE, "Calling stat() for %s -> Should fail with ENOTCAPABLE", file_noxattrs_rfork); + + /* Open the resource fork file */ + T_EXPECT_POSIX_SUCCESS((fd = open(file_rfork, O_RDONLY)), "Opening %s -> Should PASS", file_rfork); + close(fd); + T_EXPECT_POSIX_FAILURE((fd = open(file_noxattrs_rfork, O_RDONLY)), ENOTCAPABLE, "Opening %s -> Should fail with ENOTCAPABLE", file_noxattrs_rfork); +} diff --git a/tests/vfs/sandbox_appledouble_write.c b/tests/vfs/sandbox_appledouble_write.c new file mode 100644 index 000000000..70a2cfd51 --- /dev/null +++ b/tests/vfs/sandbox_appledouble_write.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -lsandbox -o sandbox_appledouble_write sandbox_appledouble_write.c -g -Weverything */ + +#include +#include +#include + +#include +#include + +#define RUN_TEST TARGET_OS_OSX + +#define FILE "file" +#define FILE_AD "._file" +#define TMP_FILE_AD "._tmpfile" + +#define FILE2 "f" +#define FILE2_AD "._f" +#define TMP_FILE2_AD "._g" + +static char template[MAXPATHLEN]; +static char *testdir = NULL; +static char file[PATH_MAX], file2[PATH_MAX]; +static sandbox_params_t params = NULL; +static sandbox_profile_t profile = NULL; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(true), + T_META_ENABLED(RUN_TEST), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (profile) { + sandbox_free_profile(profile); + } + if (params) { + sandbox_free_params(params); + } + if (file[0] != '\0') { + unlink(file); + } + if (file2[0] != '\0') { + unlink(file2); + } + if (testdir) { + unmount(testdir, MNT_FORCE); + rmdir(testdir); + } +} + +static void +create_profile_string(char *buff, size_t size) +{ + snprintf(buff, size, "(version 1) \n\ + (allow default) \n\ + (import \"system.sb\") \n\ + (deny file-write-xattr (path \"%s\")) \n\ + (deny file-write-xattr (path \"%s\")) \n", + file, file2); +} + +T_DECL(sandbox_appledouble_write, + "Verify that the 'file-write-xattr' permission is enforced for apple-double files") +{ + int testdirfd, fd; + char *sberror = NULL; + char profile_string[1000]; + char testdir_path[MAXPATHLEN]; + char mount_tmpfs_cmd[1000]; + file[0] = file2[0] = '\0'; + +#if (!RUN_TEST) + T_SKIP("Not macOS"); +#endif + + if (geteuid() != 0) { + T_SKIP("Test should run as root"); + } + + T_ATEND(cleanup); + T_SETUPBEGIN; + + /* Create test root dir */ + snprintf(template, sizeof(template), "%s/sandbox_appledouble_write-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + T_ASSERT_POSIX_SUCCESS((fd = open(testdir, O_SEARCH, 0777)), "Opening test root directory %s", testdir); + T_ASSERT_POSIX_SUCCESS(fcntl(fd, F_GETPATH, testdir_path), "Calling fcntl() to get the path"); + close(fd); + + /* mount tmpfs */ + snprintf(mount_tmpfs_cmd, sizeof(mount_tmpfs_cmd), "/sbin/mount_tmpfs -s 50m %s", testdir_path); + T_ASSERT_POSIX_SUCCESS(system(mount_tmpfs_cmd), "Mounting tmpfs mount -> Should PASS"); + + T_EXPECT_POSIX_SUCCESS((testdirfd = open(testdir_path, O_SEARCH, 0777)), "Opening test root directory"); + + /* Setup file names */ + snprintf(file, sizeof(file), "%s/%s", testdir_path, FILE); + snprintf(file2, sizeof(file2), "%s/%s", testdir_path, FILE2); + + /* Create the test files */ + T_ASSERT_POSIX_SUCCESS((fd = openat(testdirfd, FILE, O_CREAT | O_RDWR, 0777)), "Creating %s", FILE); + close(fd); + + T_ASSERT_POSIX_SUCCESS((fd = openat(testdirfd, FILE_AD, O_CREAT | O_RDWR, 0777)), "Creating %s", FILE_AD); + close(fd); + + T_ASSERT_POSIX_SUCCESS((fd = openat(testdirfd, FILE2, O_CREAT | O_RDWR, 0777)), "Creating %s", FILE2); + close(fd); + + T_ASSERT_POSIX_SUCCESS((fd = openat(testdirfd, FILE2_AD, O_CREAT | O_RDWR, 0777)), "Creating %s", FILE2_AD); + close(fd); + + /* Create sandbox variables */ + T_ASSERT_POSIX_NOTNULL(params = sandbox_create_params(), "Creating Sandbox params object"); + create_profile_string(profile_string, sizeof(profile_string)); + T_ASSERT_POSIX_NOTNULL(profile = sandbox_compile_string(profile_string, params, 
&sberror), "Creating Sandbox profile object"); + + T_SETUPEND; + + /* Validate SUCCESS for rename */ + T_EXPECT_POSIX_SUCCESS(renameat(testdirfd, FILE_AD, testdirfd, TMP_FILE_AD), "Verifying that rename() of '%s' -> '%s' succeeded", FILE_AD, TMP_FILE_AD); + T_EXPECT_POSIX_SUCCESS(renameat(testdirfd, TMP_FILE_AD, testdirfd, FILE_AD), "Verifying that rename() of '%s' -> '%s' succeeded", TMP_FILE_AD, FILE_AD); + + T_EXPECT_POSIX_SUCCESS(renameat(testdirfd, FILE2_AD, testdirfd, TMP_FILE2_AD), "Verifying that rename() of '%s' -> '%s' succeeded", FILE2_AD, TMP_FILE2_AD); + T_EXPECT_POSIX_SUCCESS(renameat(testdirfd, TMP_FILE2_AD, testdirfd, FILE2_AD), "Verifying that rename() of '%s' -> '%s' succeeded", TMP_FILE2_AD, FILE2_AD); + + /* Validate SUCCESS for unlink */ + T_EXPECT_POSIX_SUCCESS(unlinkat(testdirfd, FILE_AD, 0), "Verifying that unlink() of '%s' succeeded", FILE_AD); + T_EXPECT_POSIX_SUCCESS(unlinkat(testdirfd, FILE2_AD, 0), "Verifying that unlink() of '%s' succeeded", FILE2_AD); + + /* Validate SUCCESS for open/create */ + T_EXPECT_POSIX_SUCCESS((fd = openat(testdirfd, FILE_AD, O_CREAT | O_WRONLY, 0777)), "Verifying that open() with O_WRONLY of '%s' succeeded ", FILE_AD); + if (fd >= 0) { + close(fd); + } + T_EXPECT_POSIX_SUCCESS((fd = openat(testdirfd, FILE2_AD, O_CREAT | O_TRUNC, 0777)), "Verifying that open() with O_TRUNC of '%s' succeeded", FILE2_AD); + if (fd >= 0) { + close(fd); + } + + /* Apply sandbox profile */ + T_ASSERT_POSIX_SUCCESS(sandbox_apply(profile), "Applying Sandbox profile"); + + /* Validate EPERM for rename */ + T_EXPECT_POSIX_FAILURE(renameat(testdirfd, FILE_AD, testdirfd, TMP_FILE_AD), EPERM, "Verifying that rename() of '%s' -> '%s' fails with EPERM", FILE_AD, TMP_FILE_AD); + T_EXPECT_POSIX_FAILURE(renameat(testdirfd, FILE2_AD, testdirfd, TMP_FILE2_AD), EPERM, "Verifying that rename() of '%s' -> '%s' fails with EPERM", FILE2_AD, TMP_FILE2_AD); + + /* Validate EPERM for unlink */ + T_EXPECT_POSIX_FAILURE(unlinkat(testdirfd, FILE_AD, 0), EPERM, "Verifying that unlink() of '%s' fails with EPERM", FILE_AD); + T_EXPECT_POSIX_FAILURE(unlinkat(testdirfd, FILE2_AD, 0), EPERM, "Verifying that unlink() of '%s' fails with EPERM", FILE2_AD); + + /* Validate EPERM for open */ + T_EXPECT_POSIX_FAILURE((fd = openat(testdirfd, FILE_AD, O_WRONLY, 0777)), EPERM, "Verifying that open() with O_WRONLY of '%s' fails with EPERM", FILE_AD); + if (fd >= 0) { + close(fd); + } + T_EXPECT_POSIX_FAILURE((fd = openat(testdirfd, FILE2_AD, O_TRUNC, 0777)), EPERM, "Verifying that open() with O_TRUNC of '%s' fails with EPERM", FILE2_AD); + if (fd >= 0) { + close(fd); + } + + T_ASSERT_POSIX_SUCCESS(close(testdirfd), "Closing %s", testdir_path); +} diff --git a/tests/vfs/sandbox_fstat.c b/tests/vfs/sandbox_fstat.c new file mode 100644 index 000000000..feba57d97 --- /dev/null +++ b/tests/vfs/sandbox_fstat.c @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -lsandbox -o sandbox_fstat sandbox_fstat.c -g -Weverything */ + +#include +#include +#include +#include + +#include +#include + +#define RUN_TEST TARGET_OS_OSX + +static char template[MAXPATHLEN]; +static char *testdir = NULL; +static char file[PATH_MAX], file_rsrcfork[PATH_MAX]; +static sandbox_params_t params = NULL; +static sandbox_profile_t profile = NULL; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_ENABLED(RUN_TEST), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (profile) { + sandbox_free_profile(profile); + } + if (params) { + sandbox_free_params(params); + } + if (file[0] != '\0') { + unlink(file); + } + if (testdir) { + rmdir(testdir); + } +} + +static void +create_profile_string(char *buff, size_t size) +{ + snprintf(buff, size, "(version 1) \n\ + (allow default) \n\ + (import \"system.sb\") \n\ + (deny file-read-metadata (path \"%s\")) \n", + file); +} +static void +do_test(int expected_error) +{ + int fd; + struct stat sb; + + /* Test stat() */ + if (expected_error) { + T_EXPECT_POSIX_FAILURE(stat(file, &sb), expected_error, "Calling stat() should FAIL with '%s'", strerror(expected_error)); + } else { + T_EXPECT_POSIX_SUCCESS(stat(file, &sb), "Calling stat() for the file should PASS"); + } + + /* Test fstat() while the file is open with the O_CREAT | O_WRONLY flags */ + T_EXPECT_POSIX_SUCCESS(fd = open(file, O_CREAT | O_WRONLY, 0666), "Opening with the O_CREAT | O_WRONLY flags"); + if (fd != -1) { + if (expected_error) { + T_EXPECT_POSIX_FAILURE(fstat(fd, &sb), expected_error, "Calling fstat() should FAIL with '%s'", strerror(expected_error)); + } else { + T_EXPECT_POSIX_SUCCESS(fstat(fd, &sb), "Calling fstat() for the test file should PASS"); + } + close(fd); + } + + T_EXPECT_POSIX_SUCCESS(fd = open(file_rsrcfork, O_CREAT | O_WRONLY, 0666), "Opening rsrcfork with the O_CREAT | O_WRONLY flags"); + if (fd != -1) { + T_EXPECT_POSIX_SUCCESS(fstat(fd, &sb), "Calling fstat() for the rsrcfork should PASS"); + close(fd); + } +} + +T_DECL(sandbox_fstat, + "Prevent the information disclosure on files opened with O_WRONLY while sandbox profile denies 'file-read-metadata'") +{ +#if (!RUN_TEST) + T_SKIP("Not macOS"); +#endif + + int fd; + char *sberror = NULL; + char profile_string[1000]; + char testdir_path[MAXPATHLEN]; + + file[0] = '\0'; + + T_ATEND(cleanup); + T_SETUPBEGIN; + + /* Create test root dir */ + snprintf(template, 
sizeof(template), "%s/sandbox_fstat-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + T_ASSERT_POSIX_SUCCESS((fd = open(testdir, O_SEARCH, 0777)), "Opening test root directory '%s'", testdir); + T_ASSERT_POSIX_SUCCESS(fcntl(fd, F_GETPATH, testdir_path), "Calling fcntl() to get the path"); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing %s", testdir_path); + + /* Setup file names */ + snprintf(file, sizeof(file), "%s/%s", testdir_path, "file"); + snprintf(file_rsrcfork, sizeof(file_rsrcfork), "%s/%s", file, _PATH_RSRCFORKSPEC); + + /* Create the test file */ + T_ASSERT_POSIX_SUCCESS((fd = open(file, O_CREAT | O_RDWR, 0777)), "Creating '%s'", file); + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing '%s'", file); + + /* Create sandbox variables */ + T_ASSERT_POSIX_NOTNULL(params = sandbox_create_params(), "Creating Sandbox params object"); + create_profile_string(profile_string, sizeof(profile_string)); + T_ASSERT_POSIX_NOTNULL(profile = sandbox_compile_string(profile_string, params, &sberror), "Creating Sandbox profile object"); + + T_SETUPEND; + + /* Test stat()/fstat() */ + do_test(0); + + /* Apply sandbox profile */ + T_ASSERT_POSIX_SUCCESS(sandbox_apply(profile), "Applying Sandbox profile"); + + /* Test stat()/fstat() */ + do_test(EPERM); +} diff --git a/tests/vfs/sandbox_type_error.c b/tests/vfs/sandbox_type_error.c new file mode 100644 index 000000000..909fa277b --- /dev/null +++ b/tests/vfs/sandbox_type_error.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -lsandbox -o sandbox_type_error sandbox_type_error.c -g -Weverything */ + +#include +#include + +#include +#include + +#define RUN_TEST TARGET_OS_OSX + +static sandbox_params_t params = NULL; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_ENABLED(RUN_TEST), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (params) { + sandbox_free_params(params); + } +} + +static void +create_profile_string(char *buff, size_t size, char *path) +{ + snprintf(buff, size, "(version 1) \n\ + (allow default) \n\ + (deny file-read-metadata (path \"%s\")) \n", + path); +} + +static void +test_path(char *deny_path, char *stat_path, int expected_err) +{ + struct stat sb; + pid_t pid, res; + char *sberror = NULL; + char profile_string[1000]; + sandbox_profile_t profile = NULL; + int status, error, ret; + + /* Fork */ + pid = fork(); + if (pid < -1) { + T_FAIL("Failed to fork"); + return; + } + + switch (pid) { + case 0: + /* Create sandbox variables */ + create_profile_string(profile_string, sizeof(profile_string), deny_path); + if ((profile = sandbox_compile_string(profile_string, params, &sberror)) == NULL) { + T_FAIL("Creating Sandbox profile object"); + exit(EINVAL); + } + + error = sandbox_apply(profile); + if (error) { + T_FAIL("Applying Sandbox profile FAILED"); + sandbox_free_profile(profile); + exit(EINVAL); + } + + /* Query stat */ + error = stat(stat_path, &sb); + + /* Validate error */ + if ((!error && !expected_err) || (error == -1 && errno == expected_err)) { + ret = 0; + } else { + ret = errno; + } + + if (profile) { + sandbox_free_profile(profile); + } + exit(ret); + default: + do { + res = waitpid(pid, &status, WUNTRACED); + } while (res == -1 && errno == EINTR); + + if (res != pid) { + T_FAIL("(res != pid"); + break; + } + + if (!WIFEXITED(status)) { + T_FAIL("Stat of '%s' with deny path of '%s' FAILED", stat_path, deny_path); + break; + } + + if (WEXITSTATUS(status)) { + T_FAIL("Stat of '%s' with deny path of '%s' should FAIL with '%s', got '%s'", stat_path, deny_path, strerror(expected_err), strerror(WEXITSTATUS(status))); + break; + } + + if (expected_err) { + T_PASS("Stat of '%s' with deny path of '%s' should FAIL with '%s'", stat_path, deny_path, strerror(expected_err)); + } else { + T_PASS("Stat of '%s' with deny path of '%s' should PASS", stat_path, deny_path); + } + } +} + +T_DECL(sandbox_type_error, + "Prevent the information disclosure on resource type File/Directory/Symlink") +{ +#if (!RUN_TEST) + T_SKIP("Not macOS"); +#endif + + T_ATEND(cleanup); + T_SETUPBEGIN; + + T_ASSERT_POSIX_NOTNULL(params = sandbox_create_params(), "Creating Sandbox params object"); + + T_SETUPEND; + + /* Verify handling of non-existent files */ + test_path("/.file", "/.nofollow/notexist/", ENOENT); + + /* Prevent the information disclosure on the resource type for file */ + test_path("/.file", "/.nofollow/.file/", EPERM); + + /* Prevent the information disclosure on the resource type for directory */ + test_path("/private", "/.nofollow/private/", EPERM); + + /* Prevent the information disclosure on the resource type for symlink */ + test_path("/tmp", "/.nofollow/tmp/", EPERM); + + /* Prevent the information disclosure on the resource type for symlink child */ + test_path("/tmp", "/.nofollow/tmp/notexist", EPERM); +} diff --git a/tests/vfs/statfs_ext.c 
b/tests/vfs/statfs_ext.c index e3717d723..9c1fdef0b 100644 --- a/tests/vfs/statfs_ext.c +++ b/tests/vfs/statfs_ext.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include @@ -74,7 +74,7 @@ statfs_compare(const char *path, struct statfs *sfs_ext, int mode, int flag, int int fd; struct statfs sfs; - T_LOG("Testing: path %s, sfs_ext %p, mode %s, flag 0x%x, expected_err %d", path, (void *) sfs_ext, mode_name[mode], flag, expected_err); + T_LOG("Testing: path %s, sfs_ext %p, mode %s, flag 0x%x, expected_err %d", path, (void *) sfs_ext, mode_name[mode], (unsigned int)flag, expected_err); if (sfs_ext) { bzero(sfs_ext, sizeof(struct statfs)); diff --git a/tests/vfs/symlink_trailing_slash.c b/tests/vfs/symlink_trailing_slash.c new file mode 100644 index 000000000..5cfbfab52 --- /dev/null +++ b/tests/vfs/symlink_trailing_slash.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false), + T_META_TAG_VM_PREFERRED, + T_META_OWNER("m_staveleytaylor")); + +static char lstat_testdir[PATH_MAX]; +static char access_testdir[PATH_MAX]; + +static void +cleanup_lstat() +{ + rmdir("c"); + unlink("b"); + unlink("a"); + rmdir(lstat_testdir); +} + +static void +cleanup_access() +{ + unlink("test.f"); + unlink("test.lnk"); + unlink("test.d/test.df"); + rmdir("test.d"); + rmdir(access_testdir); +} + +T_DECL( + lstat_symlink_trailing_slash, + "Check symlinks-to-symlinks are resolved correctly when trailing slashes are involved" + ) { + struct stat st; + + T_ATEND(cleanup_lstat); + T_SETUPBEGIN; + + // Create test root dir + snprintf(lstat_testdir, sizeof(lstat_testdir), "%s/symlink_trailing_slash-lstat-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL(mkdtemp(lstat_testdir), "setup: create test root dir"); + + // CD into test root dir + T_ASSERT_POSIX_SUCCESS(chdir(lstat_testdir), "setup: cd testdir"); + + // Setup a scenario with 'a -> b -> c' (where -> means 'is a symlink to'). 
+ T_ASSERT_POSIX_SUCCESS(mkdir("c", 0755), "setup: mkdir c"); + T_ASSERT_POSIX_SUCCESS(symlink("c", "b"), "setup: ln c b"); + T_ASSERT_POSIX_SUCCESS(symlink("b", "a"), "setup: ln b a"); + + T_SETUPEND; + + // stat + + T_ASSERT_POSIX_SUCCESS(stat("a", &st), "stat a succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "stat thinks a is directory"); + + T_ASSERT_POSIX_SUCCESS(stat("b", &st), "stat b succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "stat thinks b is directory"); + + T_ASSERT_POSIX_SUCCESS(stat("b/", &st), "stat b/ succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "stat thinks b/ is directory"); + + T_ASSERT_POSIX_SUCCESS(stat("a/.", &st), "stat a/. succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "stat thinks a/. is directory"); + + T_ASSERT_POSIX_SUCCESS(stat("a/", &st), "stat a/ succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "stat thinks a/ is directory"); + + // lstat + + T_ASSERT_POSIX_SUCCESS(lstat("a", &st), "lstat a succeeds"); + T_ASSERT_TRUE(S_ISLNK(st.st_mode), "lstat thinks a is symlink"); + + T_ASSERT_POSIX_SUCCESS(lstat("b", &st), "lstat b succeeds"); + T_ASSERT_TRUE(S_ISLNK(st.st_mode), "lstat thinks b is symlink"); + + T_ASSERT_POSIX_SUCCESS(lstat("b/", &st), "lstat b/ succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "lstat thinks b/ is directory"); + + T_ASSERT_POSIX_SUCCESS(lstat("a/.", &st), "lstat a/. succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "lstat thinks a/. is directory"); + + // rdar://142559105 (lstat() of a name with trailing '/' is handled differently than other platforms) + T_ASSERT_POSIX_SUCCESS(lstat("a/", &st), "lstat a/ succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "lstat thinks a/ is directory"); + + // Now modify a such that it has a trailing slash in the link itself. + T_ASSERT_POSIX_SUCCESS(unlink("a"), "unlink a"); + T_ASSERT_POSIX_SUCCESS(symlink("b/", "a"), "symlink a -> b/"); + + T_ASSERT_POSIX_SUCCESS(lstat("a", &st), "lstat a succeeds"); + T_ASSERT_TRUE(S_ISLNK(st.st_mode), "lstat thinks a is symlink"); + + T_ASSERT_POSIX_SUCCESS(lstat("a/", &st), "lstat a/ succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "lstat thinks a/ is directory"); + + // Do the same for b. 
+ T_ASSERT_POSIX_SUCCESS(unlink("b"), "unlink b"); + T_ASSERT_POSIX_SUCCESS(symlink("c/", "b"), "symlink b -> c/"); + + T_ASSERT_POSIX_SUCCESS(lstat("a", &st), "lstat a succeeds"); + T_ASSERT_TRUE(S_ISLNK(st.st_mode), "lstat thinks a is symlink"); + + T_ASSERT_POSIX_SUCCESS(lstat("a/", &st), "lstat a/ succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "lstat thinks a/ is directory"); + + T_ASSERT_POSIX_SUCCESS(lstat("b", &st), "lstat b succeeds"); + T_ASSERT_TRUE(S_ISLNK(st.st_mode), "lstat thinks b is symlink"); + + T_ASSERT_POSIX_SUCCESS(lstat("b/", &st), "lstat b/ succeeds"); + T_ASSERT_TRUE(S_ISDIR(st.st_mode), "lstat thinks b/ is directory"); +} + +T_DECL( + access_symlink_trailing_slash, + "Check access returns ENOTDIR when symlink points to a file and trailing slash was used" + ) { + T_ATEND(cleanup_access); + T_SETUPBEGIN; + + // Create test root dir + snprintf(access_testdir, sizeof(access_testdir), "%s/symlink_trailing_slash-access-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL(mkdtemp(access_testdir), "setup: create test root dir"); + printf("testdir is %s\n", access_testdir); + + // CD into test root dir + T_ASSERT_POSIX_SUCCESS(chdir(access_testdir), "setup: cd testdir"); + + T_ASSERT_POSIX_SUCCESS(creat("test.f", 0755), "setup: touch test.f"); + T_ASSERT_POSIX_SUCCESS(symlink("test.f", "test.lnk"), "setup: ln test.f test.lnk"); + T_ASSERT_POSIX_SUCCESS(mkdir("test.d", 0755), "setup: mkdir test.d"); + T_ASSERT_POSIX_SUCCESS(creat("test.d/test.df", 0755), "setup: touch test.d/test.df"); + + T_SETUPEND; + + T_ASSERT_POSIX_SUCCESS(access("test.lnk", R_OK), "access test.lnk suceeds"); + + T_ASSERT_EQ(access("test.lnk/", R_OK), -1, "access test.lnk/ returns -1"); + T_ASSERT_POSIX_ERROR(errno, ENOTDIR, "access sets errno to ENOTDIR"); + + // Now modify test.lnk to contain a trailing slash in the link itself + T_ASSERT_POSIX_SUCCESS(unlink("test.lnk"), "rm test.lnk"); + T_ASSERT_POSIX_SUCCESS(symlink("test.f/", "test.lnk"), "ln -s test.f/ test.lnk"); + + T_ASSERT_EQ(access("test.lnk", R_OK), -1, "access test.lnk returns -1"); + T_ASSERT_POSIX_ERROR(errno, ENOTDIR, "access sets errno to ENOTDIR"); + + T_ASSERT_EQ(access("test.lnk/", R_OK), -1, "access test.lnk/ returns -1"); + T_ASSERT_POSIX_ERROR(errno, ENOTDIR, "access sets errno to ENOTDIR"); + + // Now introduce a directory so that we have: + // test.lnk -> test.d/ which contains test.f + // The trailing slash in test.lnk should not cause access("test.lnk/test.f") to fail + T_ASSERT_POSIX_SUCCESS(unlink("test.lnk"), "rm test.lnk"); + T_ASSERT_POSIX_SUCCESS(symlink("test.d/", "test.lnk"), "ln -s test.d/ test.lnk"); + T_ASSERT_POSIX_SUCCESS(access("test.lnk/test.df", R_OK), "access test.lnk/test.df"); +} diff --git a/tests/vfs/unlinkat_nodeletebusy.c b/tests/vfs/unlinkat_nodeletebusy.c new file mode 100644 index 000000000..331a2e7e9 --- /dev/null +++ b/tests/vfs/unlinkat_nodeletebusy.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o unlinkat_nodeletebusy unlinkat_nodeletebusy.c -g -Weverything */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef AT_NODELETEBUSY +#define AT_NODELETEBUSY 0x4000 +#endif + +static char template[MAXPATHLEN]; +static char *testdir = NULL; +static char file[PATH_MAX]; + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)); + +static void +cleanup(void) +{ + if (file[0] != '\0') { + unlink(file); + } + if (testdir) { + rmdir(testdir); + } +} + +T_DECL(unlinkat_nodeletebusy, + "Verify that O_SYMLINK is not being ignored while used by open() in addition to O_CREAT") +{ + int fd; + + file[0] = '\0'; + + T_ATEND(cleanup); + T_SETUPBEGIN; + + /* Create test root dir */ + snprintf(template, sizeof(template), "%s/unlinkat_nodeletebusy-XXXXXX", dt_tmpdir()); + T_ASSERT_POSIX_NOTNULL((testdir = mkdtemp(template)), "Creating test root dir"); + + /* Setup file name */ + snprintf(file, sizeof(file), "%s/%s", testdir, "file"); + + T_SETUPEND; + + /* Create the test file */ + T_ASSERT_POSIX_SUCCESS((fd = open(file, O_CREAT | O_RDWR, 0777)), "Creating test file"); + + /* Unlinking when file is opened */ + T_ASSERT_POSIX_SUCCESS(unlinkat(AT_FDCWD, file, 0), "Unlinking when file is opened"); + + /* Closing the test file */ + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing file"); + + /* Create the test file */ + T_ASSERT_POSIX_SUCCESS((fd = open(file, O_CREAT | O_RDWR, 0777)), "Creating test file"); + + /* Unlinking when file is opened using the AT_NODELETEBUSY flag */ + T_ASSERT_POSIX_FAILURE(unlinkat(AT_FDCWD, file, AT_NODELETEBUSY), EBUSY, "Unlinking when file is opened using the AT_NODELETEBUSY flag -> Should fail with EBUSY"); + + /* Closing the test file */ + T_ASSERT_POSIX_SUCCESS(close(fd), "Closing file"); + + /* Unlinking when file is NOT opened using the AT_NODELETEBUSY flag */ + T_ASSERT_POSIX_SUCCESS(unlinkat(AT_FDCWD, file, AT_NODELETEBUSY), "Unlinking when file is NOT opened using the AT_NODELETEBUSY flag -> Should pass"); +} diff --git a/tests/vfs/volfs_chroot.c b/tests/vfs/volfs_chroot.c new file mode 100644 index 000000000..6112420bc --- /dev/null +++ b/tests/vfs/volfs_chroot.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024 Apple Computer, Inc. All rights reserved. 
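The unlinkat_nodeletebusy test above exercises AT_NODELETEBUSY: with the flag set, unlinkat() refuses to remove a file that is still open somewhere and fails with EBUSY instead of deleting it. A small sketch of how a caller might use that, illustrative only; the 0x4000 fallback value is copied from the test's own #ifndef guard and the helper name is made up.

    #include <fcntl.h>
    #include <unistd.h>
    #include <errno.h>
    #include <stdio.h>

    #ifndef AT_NODELETEBUSY
    #define AT_NODELETEBUSY 0x4000   /* fallback value taken from the test */
    #endif

    /* Delete path only if nobody has it open; report -1/EBUSY otherwise. */
    static int
    unlink_if_idle(const char *path)
    {
        if (unlinkat(AT_FDCWD, path, AT_NODELETEBUSY) == -1) {
            if (errno == EBUSY) {
                fprintf(stderr, "%s is still open, leaving it in place\n", path);
            }
            return -1;
        }
        return 0;
    }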
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* compile: xcrun -sdk macosx.internal clang -ldarwintest -o volfs_chroot volfs_chroot.c -g -Weverything */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vfs"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("vfs"), + T_META_ENABLED(TARGET_OS_OSX), + T_META_ASROOT(true), + T_META_CHECK_LEAKS(false)); + +T_DECL(volfs_chroot, + "Check for and fail if the volfs path is not under the chroot") +{ +#if TARGET_OS_OSX + int fd; + char root_volfs[MAXPATHLEN]; + const char *root_path = "/", *private_path = "/private"; + struct stat root_stat, root_stat2, private_stat, fd_stat; + + T_SETUPBEGIN; + + T_ASSERT_POSIX_SUCCESS(stat(root_path, &root_stat), + "Setup: Calling stat() on %s", + root_path); + + T_ASSERT_POSIX_SUCCESS(snprintf(root_volfs, sizeof(root_volfs), "/.vol/%d/2", root_stat.st_dev), + "Setup: Creating root_volfs path"); + + T_ASSERT_POSIX_SUCCESS(stat(root_volfs, &root_stat2), + "Setup: Calling stat() on %s", + root_volfs); + + T_ASSERT_POSIX_SUCCESS(stat(private_path, &private_stat), + "Setup: Calling stat() on %s", + private_path); + + T_ASSERT_POSIX_SUCCESS(chroot(private_path), + "Setup: Calling chroot() on %s", + private_path); + + T_SETUPEND; + + T_ASSERT_EQ(root_stat.st_ino, root_stat2.st_ino, "Verifing %s and %s are the same file", root_path, root_volfs); + T_ASSERT_POSIX_SUCCESS((fd = open(root_path, 0)), "Opening the updated root path"); + T_ASSERT_POSIX_SUCCESS((fstat(fd, &fd_stat)), "Calling stat on the updated root path"); + T_ASSERT_EQ(fd_stat.st_ino, private_stat.st_ino, "Verifing %s was opened", private_path); + T_ASSERT_POSIX_FAILURE(open(root_volfs, 0), ENOENT, "Verifing %s can not be opened because path is not under the chroot", root_volfs); +#else + T_SKIP("Not macOS"); +#endif +} diff --git a/tests/vm/Makefile b/tests/vm/Makefile index e49c2141e..6294df8f5 100644 --- a/tests/vm/Makefile +++ b/tests/vm/Makefile @@ -1,8 +1,5 @@ INCLUDED_TEST_SOURCE_DIRS += vm -# Revert to legacy vm_test suite until gets solved -EXCLUDED_SOURCES += vm/vm_allocation.c - EXCLUDED_SOURCES += jumbo_va_spaces_common.c ifeq ($(PLATFORM),$(filter $(PLATFORM),iphoneos iPhoneOS XROS)) @@ -101,9 +98,18 @@ 
install-vm/hello: vm/hello mkdir -p $(INSTALLDIR)/vm cp $(SYMROOT)/vm/hello $(INSTALLDIR)/vm/ +vm/upl: exc_guard_helper.c exc_helpers.c test_utils.c excserver +vm/upl: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) +vm/upl: OTHER_LDFLAGS += -ldarwintest_utils +vm/upl: CODE_SIGN_ENTITLEMENTS = ./vm/upl.entitlements + vm/vm_tainted_executable: vm/hello vm/vm_ranges: CODE_SIGN_ENTITLEMENTS = ./vm/vm_ranges_entitlement.plist -vm_test_mach_map: CODE_SIGN_ENTITLEMENTS = ./vm_test_mach_map.plist +vm/vm_memory_entry: CODE_SIGN_ENTITLEMENTS=vm/memory-ownership-transfer.entitlements +vm/vm_reclaim: OTHER_CFLAGS += -Wno-language-extension-token -Wno-c++98-compat memorystatus_assertion_helpers.c +vm/vm_reclaim: OTHER_LDFLAGS += -ldarwintest_utils +vm/vm_reclaim: INVALID_ARCHS = armv7k arm64_32 +vm/vm_reclaim: CODE_SIGN_ENTITLEMENTS = vm/vm_reclaim.entitlements diff --git a/tests/vm/compression_sweep.c b/tests/vm/compression_sweep.c index 3736c5c3a..deeacab95 100644 --- a/tests/vm/compression_sweep.c +++ b/tests/vm/compression_sweep.c @@ -51,6 +51,7 @@ T_DECL(compression_sweep, kern_return_t kret = host_statistics64(mach_host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat_before, &count); T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "host_statistics64"); + T_QUIET; T_ASSERT_EQ(count, HOST_VM_INFO64_COUNT, "host_statistics64 size"); size_t size = sizeof(orig_age); int ret = sysctlbyname(ripe_target_age_sysctl, &orig_age, &size, @@ -72,6 +73,7 @@ T_DECL(compression_sweep, kret = host_statistics64(mach_host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat_after, &count); T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "host_statistics64"); + T_QUIET; T_ASSERT_EQ(count, HOST_VM_INFO64_COUNT, "host_statistics64 size"); T_LOG("compressed %llu pages", vm_stat_after.compressions - vm_stat_before.swapouts); diff --git a/tests/vm/configurator/vm_configurator.c b/tests/vm/configurator/vm_configurator.c new file mode 100644 index 000000000..2ff8d2bf4 --- /dev/null +++ b/tests/vm/configurator/vm_configurator.c @@ -0,0 +1,4322 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
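The compression_sweep change above is small but worth spelling out: host_statistics64() takes the expected element count in and writes back how many elements it actually filled, so asserting that the returned count still equals HOST_VM_INFO64_COUNT guards against a short reply leaving stale fields in the statistics buffer. A standalone sketch of that pattern, not part of the patch:

    #include <mach/mach.h>

    /* Fetch VM statistics and treat a short reply as a failure. */
    static kern_return_t
    read_vm_stats(vm_statistics64_data_t *out)
    {
        mach_msg_type_number_t count = HOST_VM_INFO64_COUNT;
        kern_return_t kr = host_statistics64(mach_host_self(), HOST_VM_INFO64,
            (host_info64_t)out, &count);
        if (kr == KERN_SUCCESS && count != HOST_VM_INFO64_COUNT) {
            kr = KERN_FAILURE;   /* kernel filled fewer fields than expected */
        }
        return kr;
    }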
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "try_read_write.h" +#include "exc_helpers.h" +#include "exc_guard_helper.h" +#include "vm_configurator.h" +#include "vm_configurator_tests.h" + +#pragma clang diagnostic ignored "-Wgnu-conditional-omitted-operand" +#pragma clang diagnostic ignored "-Wformat-pedantic" + +bool Verbose = false; + +/* TODO: sufficiently new SDK defines this */ +#ifndef VM_BEHAVIOR_LAST_VALID +#define VM_BEHAVIOR_LAST_VALID VM_BEHAVIOR_ZERO +#endif + +#define KB16 16384 +#define MB (1024*1024) + +/* pretty printing */ + +/* indentation printed in front of submap contents */ +#define SUBMAP_PREFIX " . " + +/* + * Used when printing attributes of checkers and vm regions. + * BadHighlight gets a highlighted color and "*" marker. + * NormalHighlight gets normal color. + * IgnoredHighlight gets dimmed color. + */ +typedef enum { + BadHighlight = 0, + NormalHighlight, + IgnoredHighlight, + HighlightCount +} attribute_highlight_t; + +/* + * Specify highlights for all entry and object attributes. + * Used when printing entire checkers or VM states. + */ +typedef struct { + attribute_highlight_t highlighting; + vm_entry_attribute_list_t entry; + vm_object_attribute_list_t object; +} attribute_highlights_t; + +/* + * Print all attributes as NormalHighlight. + */ +static attribute_highlights_t +normal_highlights(void) +{ + return (attribute_highlights_t) { + .highlighting = NormalHighlight, + .entry = vm_entry_attributes_with_default(true), + .object = vm_object_attributes_with_default(true), + }; +} + +/* + * Print bad_entry_attr and bad_object_attr as BadHighlight. + * Print other attributes as IgnoredHighlight. + */ +static attribute_highlights_t +bad_or_ignored_highlights( + vm_entry_attribute_list_t bad_entry_attr, + vm_object_attribute_list_t bad_object_attr) +{ + return (attribute_highlights_t) { + .highlighting = BadHighlight, + .entry = bad_entry_attr, + .object = bad_object_attr, + }; +} + +/* + * Print normal_entry_attr and normal_object_attr as NormalHighlight. + * Print other attributes as IgnoredHighlight. + */ +static attribute_highlights_t +normal_or_ignored_highlights( + vm_entry_attribute_list_t normal_entry_attr, + vm_object_attribute_list_t normal_object_attr) +{ + return (attribute_highlights_t) { + .highlighting = NormalHighlight, + .entry = normal_entry_attr, + .object = normal_object_attr, + }; +} + +/* Return true if we should print terminal color codes. */ +static bool +use_colors(void) +{ + static int stdout_is_tty = -1; + if (stdout_is_tty == -1) { + stdout_is_tty = isatty(STDOUT_FILENO); + } + return stdout_is_tty; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wpedantic" +/* -Wpedantic doesn't like "\e" */ + +#define ANSI_DIM "\e[2m" +#define ANSI_UNDIM "\e[22m" + +/* + * Returns a highlighting prefix string. + * Its printed length is one character, either ' ' or '*' + * It may include ANSI color codes. 
+ */ +static const char * +highlight_prefix(attribute_highlight_t highlight) +{ + assert(highlight >= 0 && highlight < HighlightCount); + static const char * highlights[2][HighlightCount] = { + [0] = { + /* no tty, omit color codes */ + [BadHighlight] = "*", + [NormalHighlight] = " ", + [IgnoredHighlight] = " ", + }, + [1] = { + /* tty, add color codes */ + [BadHighlight] = "*", + [NormalHighlight] = " ", + [IgnoredHighlight] = ANSI_DIM " ", + } + }; + + return highlights[use_colors()][highlight]; +} + +/* + * Returns a highlighting suffix string. + * Its printed length is zero characters. + * It may include ANSI color codes. + */ +static const char * +highlight_suffix(attribute_highlight_t highlight __unused) +{ + if (use_colors()) { + return ANSI_UNDIM; + } else { + return ""; + } +} + +#pragma clang diagnostic pop /* ignored -Wpedantic */ + +/* + * Format a value with highlighting. + * Usage: + * printf("%sFFFF%s", HIGHLIGHT(value, entry.some_attr)); + * where "FFFF" is the format string for `value` + * and `highlights.entry.some_attr` is true for highlighted values. + * + * Uses `highlights.highlighting` if `highlights.entry.some_attr` is true. + * Uses `IgnoredHighlight` if `highlights.entry.some_attr` is false. + */ +#define HIGHLIGHT(value, attr_path) \ + highlight_prefix(highlights.attr_path ? highlights.highlighting : IgnoredHighlight), \ + (value), \ + highlight_suffix(highlights.attr_path ? highlights.highlighting : IgnoredHighlight) + + +/* host_priv port wrappers */ + +host_priv_t +host_priv(void) +{ + host_priv_t result; + kern_return_t kr = host_get_host_priv_port(mach_host_self(), &result); + assert(kr == 0 && "cannot get host_priv port; try running as root"); + return result; +} + +bool +host_priv_allowed(void) +{ + host_priv_t result; + kern_return_t kr = host_get_host_priv_port(mach_host_self(), &result); + return kr == 0; +} + +/* math */ + +static bool +is_power_of_two(mach_vm_size_t n) +{ + return n > 0 && (n & (n - 1)) == 0; +} + +static bool +is_valid_alignment_mask(mach_vm_size_t mask) +{ + if (mask == 0) { + return true; + } + + mach_vm_size_t pow = mask + 1; /* may wrap around to zero */ + if (pow == 0) { + return true; /* mask is ~0, mask + 1 wrapped to zero */ + } + + return is_power_of_two(pow); +} + + +/* + * Some vm_behavior_t values have a persistent effect on the vm entry. + * Other behavior values are really one-shot memory operations. 
+ */ +static bool +is_persistent_vm_behavior(vm_behavior_t behavior) +{ + return + behavior == VM_BEHAVIOR_DEFAULT || + behavior == VM_BEHAVIOR_RANDOM || + behavior == VM_BEHAVIOR_SEQUENTIAL || + behavior == VM_BEHAVIOR_RSEQNTL; +} + + +const char * +name_for_entry_kind(vm_entry_template_kind_t kind) +{ + static const char *kind_name[] = { + "END_ENTRIES", "allocation", "hole", "submap parent" + }; + assert(kind < countof(kind_name)); + return kind_name[kind]; +} + +const char * +name_for_kr(kern_return_t kr) +{ + static const char *kr_name[] = { + "KERN_SUCCESS", "KERN_INVALID_ADDRESS", + "KERN_PROTECTION_FAILURE", "KERN_NO_SPACE", + "KERN_INVALID_ARGUMENT", "KERN_FAILURE", + "KERN_RESOURCE_SHORTAGE", "KERN_NOT_RECEIVER", + "KERN_NO_ACCESS", "KERN_MEMORY_FAILURE", + "KERN_MEMORY_ERROR", "KERN_ALREADY_IN_SET", + "KERN_NOT_IN_SET", "KERN_NAME_EXISTS", + "KERN_ABORTED", "KERN_INVALID_NAME", + "KERN_INVALID_TASK", "KERN_INVALID_RIGHT", + "KERN_INVALID_VALUE", "KERN_UREFS_OVERFLOW", + "KERN_INVALID_CAPABILITY", "KERN_RIGHT_EXISTS", + "KERN_INVALID_HOST", "KERN_MEMORY_PRESENT", + /* add other kern_return.h values here if desired */ + }; + + if ((size_t)kr < countof(kr_name)) { + return kr_name[kr]; + } + + /* TODO: recognize and/or decode mach_error format? */ + + return "??"; +} + +const char * +name_for_prot(vm_prot_t prot) +{ + assert(prot_contains_all(VM_PROT_ALL /* rwx */, prot)); + /* TODO: uexec? */ + static const char *prot_name[] = { + "---", "r--", "-w-", "rw-", + "--x", "r-x", "-wx", "rwx" + }; + return prot_name[prot]; +} + +const char * +name_for_inherit(vm_inherit_t inherit) +{ + static const char *inherit_name[] = { + [VM_INHERIT_SHARE] = "VM_INHERIT_SHARE", + [VM_INHERIT_COPY] = "VM_INHERIT_COPY", + [VM_INHERIT_NONE] = "VM_INHERIT_NONE", + }; + static_assert(countof(inherit_name) == VM_INHERIT_LAST_VALID + 1, + "new vm_inherit_t values need names"); + + assert(inherit <= VM_INHERIT_LAST_VALID); + return inherit_name[inherit]; +} + +const char * +name_for_behavior(vm_behavior_t behavior) +{ + static const char *behavior_name[] = { + [VM_BEHAVIOR_DEFAULT] = "VM_BEHAVIOR_DEFAULT", + [VM_BEHAVIOR_RANDOM] = "VM_BEHAVIOR_RANDOM", + [VM_BEHAVIOR_SEQUENTIAL] = "VM_BEHAVIOR_SEQUENTIAL", + [VM_BEHAVIOR_RSEQNTL] = "VM_BEHAVIOR_RSEQNTL", + [VM_BEHAVIOR_WILLNEED] = "VM_BEHAVIOR_WILLNEED", + [VM_BEHAVIOR_DONTNEED] = "VM_BEHAVIOR_DONTNEED", + [VM_BEHAVIOR_FREE] = "VM_BEHAVIOR_FREE", + [VM_BEHAVIOR_ZERO_WIRED_PAGES] = "VM_BEHAVIOR_ZERO_WIRED_PAGES", + [VM_BEHAVIOR_REUSABLE] = "VM_BEHAVIOR_REUSABLE", + [VM_BEHAVIOR_REUSE] = "VM_BEHAVIOR_REUSE", + [VM_BEHAVIOR_CAN_REUSE] = "VM_BEHAVIOR_CAN_REUSE", + [VM_BEHAVIOR_PAGEOUT] = "VM_BEHAVIOR_PAGEOUT", + [VM_BEHAVIOR_ZERO] = "VM_BEHAVIOR_ZERO", + }; + static_assert(countof(behavior_name) == VM_BEHAVIOR_LAST_VALID + 1, + "new vm_behavior_t values need names"); + + assert(behavior >= 0 && behavior <= VM_BEHAVIOR_LAST_VALID); + return behavior_name[behavior]; +} + +const char * +name_for_share_mode(uint8_t share_mode) +{ + assert(share_mode > 0); + static const char *share_mode_name[] = { + [0] = "(0)", + [SM_COW] = "SM_COW", + [SM_PRIVATE] = "SM_PRIVATE", + [SM_EMPTY] = "SM_EMPTY", + [SM_SHARED] = "SM_SHARED", + [SM_TRUESHARED] = "SM_TRUESHARED", + [SM_PRIVATE_ALIASED] = "SM_PRIVATE_ALIASED", + [SM_SHARED_ALIASED] = "SM_SHARED_ALIASED", + [SM_LARGE_PAGE] = "SM_LARGE_PAGE" + }; + + assert(share_mode < countof(share_mode_name)); + return share_mode_name[share_mode]; +} + +const char * +name_for_bool(boolean_t value) +{ + switch (value) { + case 0: return 
"false"; + case 1: return "true"; + default: return "true-but-not-1"; + } +} + + +void +clamp_start_end_to_start_end( + mach_vm_address_t * const inout_start, + mach_vm_address_t * const inout_end, + mach_vm_address_t limit_start, + mach_vm_address_t limit_end) +{ + if (*inout_start < limit_start) { + *inout_start = limit_start; + } + + if (*inout_end > limit_end) { + *inout_end = limit_end; + } + + if (*inout_start > *inout_end) { + /* no-overlap case */ + *inout_end = *inout_start; + } +} + +void +clamp_address_size_to_address_size( + mach_vm_address_t * const inout_address, + mach_vm_size_t * const inout_size, + mach_vm_address_t limit_address, + mach_vm_size_t limit_size) +{ + mach_vm_address_t end = *inout_address + *inout_size; + mach_vm_address_t limit_end = limit_address + limit_size; + clamp_start_end_to_start_end(inout_address, &end, limit_address, limit_end); + *inout_size = end - *inout_address; +} + +void +clamp_address_size_to_checker( + mach_vm_address_t * const inout_address, + mach_vm_size_t * const inout_size, + vm_entry_checker_t *checker) +{ + clamp_address_size_to_address_size( + inout_address, inout_size, + checker->address, checker->size); +} + +void +clamp_start_end_to_checker( + mach_vm_address_t * const inout_start, + mach_vm_address_t * const inout_end, + vm_entry_checker_t *checker) +{ + clamp_start_end_to_start_end( + inout_start, inout_end, + checker->address, checker_end_address(checker)); +} + + +uint64_t +get_object_id_for_address(mach_vm_address_t address) +{ + mach_vm_address_t info_address = address; + mach_vm_size_t info_size; + vm_region_submap_info_data_64_t info; + + bool found = get_info_for_address_fast(&info_address, &info_size, &info); + assert(found); + assert(info_address == address); + return info.object_id_full; +} + +uint16_t +get_user_tag_for_address(mach_vm_address_t address) +{ + mach_vm_address_t info_address = address; + mach_vm_size_t info_size; + vm_region_submap_info_data_64_t info; + + bool found = get_info_for_address_fast(&info_address, &info_size, &info); + if (found) { + return info.user_tag; + } else { + return 0; + } +} + +uint16_t +get_app_specific_user_tag_for_address(mach_vm_address_t address) +{ + uint16_t tag = get_user_tag_for_address(address); + if (tag < VM_MEMORY_APPLICATION_SPECIFIC_1 || + tag > VM_MEMORY_APPLICATION_SPECIFIC_16) { + /* tag is outside app-specific range, override it */ + return 0; + } + return tag; +} + +static void +set_vm_self_region_footprint(bool value) +{ + int value_storage = value; + int error = sysctlbyname("vm.self_region_footprint", NULL, NULL, &value_storage, sizeof(value_storage)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(error, "sysctl(vm.self_region_footprint)"); +} + +bool __attribute__((overloadable)) +get_info_for_address_fast( + mach_vm_address_t * const inout_address, + mach_vm_size_t * const out_size, + vm_region_submap_info_data_64_t * const out_info, + uint32_t submap_depth) +{ + kern_return_t kr; + + mach_vm_address_t query_address = *inout_address; + mach_vm_address_t actual_address = query_address; + uint32_t actual_depth = submap_depth; + mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64; + kr = mach_vm_region_recurse(mach_task_self(), + &actual_address, out_size, &actual_depth, + (vm_region_recurse_info_t)out_info, + &count); + + if (kr == KERN_INVALID_ADDRESS || actual_depth < submap_depth) { + /* query_address is unmapped, and so is everything after it */ + *inout_address = ~(mach_vm_address_t)0; + *out_size = 0; + return false; + } + assert(kr == 0); + if 
(actual_address > query_address) { + /* query_address is unmapped, but there is a subsequent mapping */ + *inout_address = actual_address; + /* *out_size already set */ + return false; + } + + /* query_address is mapped */ + *inout_address = actual_address; + /* *out_size already set */ + return true; +} + +bool __attribute__((overloadable)) +get_info_for_address( + mach_vm_address_t * const inout_address, + mach_vm_size_t * const out_size, + vm_region_submap_info_data_64_t * const out_info, + uint32_t submap_depth) +{ + mach_vm_address_t addr1, addr2; + mach_vm_size_t size1 = 0, size2 = 0; + vm_region_submap_info_data_64_t info1, info2; + bool result1, result2; + + /* + * VM's task_self_region_footprint() changes + * how vm_map_region_walk() counts things. + * + * We want the ref_count and shadow_depth from footprint==true + * (ignoring the specific pages in the objects) + * but we want pages_resident from footprint==false. + * + * Here we call vm_region once with footprint and once without, + * and pick out the values we want to return. + */ + + set_vm_self_region_footprint(true); + addr1 = *inout_address; + result1 = get_info_for_address_fast(&addr1, &size1, &info1, submap_depth); + + set_vm_self_region_footprint(false); + addr2 = *inout_address; + result2 = get_info_for_address_fast(&addr2, &size2, &info2, submap_depth); + assert(addr1 == addr2); + assert(size1 == size2); + assert(result1 == result2); + + info1.pages_resident = info2.pages_resident; + *out_info = info1; + *inout_address = addr1; + *out_size = size1; + + return result1; +} + +static bool +is_mapped(mach_vm_address_t address, uint32_t submap_depth) +{ + mach_vm_size_t size; + vm_region_submap_info_data_64_t info; + return get_info_for_address_fast(&address, &size, &info, submap_depth); +} + + +static void +dump_region_info( + mach_vm_address_t address, + mach_vm_size_t size, + uint32_t submap_depth, + vm_region_submap_info_data_64_t *info, + attribute_highlights_t highlights) +{ + mach_vm_address_t end = address + size; + + const char *suffix = ""; + if (info->is_submap) { + suffix = " (submap parent)"; + } else if (submap_depth > 0) { + suffix = " (allocation in submap)"; + } + + const char *submap_prefix = submap_depth > 0 ? SUBMAP_PREFIX : ""; + + /* Output order should match dump_checker_info() for the reader's convenience. 
*/ + + T_LOG("%sMAPPING 0x%llx..0x%llx (size 0x%llx)%s", submap_prefix, address, end, size, suffix); + T_LOG("%s %sprotection: %s%s", submap_prefix, HIGHLIGHT(name_for_prot(info->protection), entry.protection_attr)); + T_LOG("%s %smax protection: %s%s", submap_prefix, HIGHLIGHT(name_for_prot(info->max_protection), entry.max_protection_attr)); + T_LOG("%s %sinheritance: %s%s", submap_prefix, HIGHLIGHT(name_for_inherit(info->inheritance), entry.inheritance_attr)); + T_LOG("%s %sbehavior: %s%s", submap_prefix, HIGHLIGHT(name_for_behavior(info->behavior), entry.behavior_attr)); + T_LOG("%s %suser wired count: %d%s", submap_prefix, HIGHLIGHT(info->user_wired_count, entry.user_wired_count_attr)); + T_LOG("%s %suser tag: %d%s", submap_prefix, HIGHLIGHT(info->user_tag, entry.user_tag_attr)); + T_LOG("%s %sobject offset: 0x%llx%s", submap_prefix, HIGHLIGHT(info->offset, entry.object_offset_attr)); + T_LOG("%s %sobject id: 0x%llx%s", submap_prefix, HIGHLIGHT(info->object_id_full, object.object_id_attr)); + T_LOG("%s %sref count: %u%s", submap_prefix, HIGHLIGHT(info->ref_count, object.ref_count_attr)); + T_LOG("%s %sshadow depth: %hu%s", submap_prefix, HIGHLIGHT(info->shadow_depth, object.shadow_depth_attr)); + T_LOG("%s %spages resident: %u%s", submap_prefix, HIGHLIGHT(info->pages_resident, entry.pages_resident_attr)); + T_LOG("%s %spages shared now private: %u%s", submap_prefix, highlight_prefix(IgnoredHighlight), info->pages_shared_now_private, highlight_suffix(IgnoredHighlight)); + T_LOG("%s %spages swapped out: %u%s", submap_prefix, highlight_prefix(IgnoredHighlight), info->pages_swapped_out, highlight_suffix(IgnoredHighlight)); + T_LOG("%s %spages dirtied: %u%s", submap_prefix, highlight_prefix(IgnoredHighlight), info->pages_dirtied, highlight_suffix(IgnoredHighlight)); + T_LOG("%s %sexternal pager: %hhu%s", submap_prefix, highlight_prefix(IgnoredHighlight), info->external_pager, highlight_suffix(IgnoredHighlight)); + T_LOG("%s %sshare mode: %s%s", submap_prefix, HIGHLIGHT(name_for_share_mode(info->share_mode), entry.share_mode_attr)); + T_LOG("%s %sis submap: %s%s", submap_prefix, HIGHLIGHT(name_for_bool(info->is_submap), entry.is_submap_attr)); + T_LOG("%s %ssubmap depth: %u%s", submap_prefix, HIGHLIGHT(submap_depth, entry.submap_depth_attr)); +} + +static void +dump_hole_info( + mach_vm_address_t address, + mach_vm_size_t size, + uint32_t submap_depth, + attribute_highlights_t highlights) +{ + mach_vm_address_t end = address + size; + const char *submap_prefix = submap_depth > 0 ? SUBMAP_PREFIX : ""; + const char *suffix = ""; + if (submap_depth > 0) { + suffix = " (unallocated in submap)"; + } + + T_LOG("%sHOLE 0x%llx..0x%llx (size 0x%llx)%s", + submap_prefix, address, end, size, suffix); + if (submap_depth > 0) { + /* print submap depth to avoid confusion about holes inside submaps */ + T_LOG("%s %ssubmap depth: %u%s", submap_prefix, HIGHLIGHT(submap_depth, entry.submap_depth_attr)); + } +} + +__attribute__((overloadable)) +static void +dump_region_info_in_range( + mach_vm_address_t range_start, + mach_vm_size_t range_size, + uint32_t submap_depth, + bool recurse, + attribute_highlights_t highlights) +{ + mach_vm_address_t range_end = range_start + range_size; + mach_vm_address_t prev_end = range_start; + do { + mach_vm_address_t address = prev_end; + mach_vm_size_t size = 0; + vm_region_submap_info_data_64_t info; + (void)get_info_for_address(&address, &size, &info, submap_depth); + /* + * [address, address+size) is the next mapped region, + * or [~0, ~0) if there is no next mapping. 
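+	 * For instance (illustrative numbers): with prev_end = 0x1000 and the
+	 * next mapping at 0x3000..0x5000, this comes back as address = 0x3000
+	 * and size = 0x2000.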
+ * There may be a hole preceding that region. + * That region may be beyond our range. + */ + if (address > prev_end) { + /* don't report any part of the hole beyond range_end */ + mach_vm_address_t hole_end = min(address, range_end); + dump_hole_info(prev_end, hole_end - prev_end, submap_depth, highlights); + } + if (address < range_end) { + dump_region_info(address, size, submap_depth, &info, highlights); + if (info.is_submap && recurse) { + /* print submap contents within this window */ + mach_vm_address_t submap_start = max(prev_end, address); + mach_vm_address_t submap_end = min(range_end, address + size); + dump_region_info_in_range(submap_start, submap_end - submap_start, + submap_depth + 1, true, highlights); + } + } + prev_end = address + size; + } while (prev_end < range_end); +} + + +static void +dump_region_info_for_entry( + vm_entry_checker_t *checker, + attribute_highlights_t highlights) +{ + /* Try to print at the checker's submap depth only. Don't recurse. */ + dump_region_info_in_range(checker->address, checker->size, + checker->submap_depth, false /* recurse */, highlights); +} + +void +dump_region_info_for_entries(entry_checker_range_t list) +{ + /* + * Ignore the submap depth of the checkers themselves. + * Print starting at submap depth 0 and recurse. + * Don't specially highlight any attributes. + */ + mach_vm_address_t start = checker_range_start_address(list); + mach_vm_address_t end = checker_range_end_address(list); + dump_region_info_in_range( + start, end - start, + 0 /* submap depth */, true /* recurse */, + normal_highlights()); +} + +/* + * Count the number of templates in a END_ENTRIES-terminated list. + */ +static unsigned +count_entry_templates(const vm_entry_template_t *templates) +{ + if (templates == NULL) { + return 0; + } + for (unsigned count = 0;; count++) { + if (templates[count].kind == EndEntries) { + return count; + } + } +} + +/* + * Count the number of templates in a END_OBJECTS-terminated list. + */ +static unsigned +count_object_templates(const vm_object_template_t *templates) +{ + if (templates == NULL) { + return 0; + } + for (unsigned count = 0;; count++) { + if (templates[count].kind == EndObjects) { + return count; + } + } +} + +/* conveniences for some macros elsewhere */ +static unsigned +count_submap_object_templates(const vm_object_template_t *templates) +{ + return count_object_templates(templates); +} +static unsigned +count_submap_entry_templates(const vm_entry_template_t *templates) +{ + return count_entry_templates(templates); +} + + +static vm_object_checker_t * +object_checker_new(void) +{ + return calloc(sizeof(vm_object_checker_t), 1); +} + +/* + * Returns true if obj_checker refers to a NULL vm object. 
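+ *
+ * For example (illustrative): object_is_null(NULL) is true, and so is
+ * object_is_null(list->objects) for a top-level checker list `list`,
+ * whose first object checker is always the null object with object_id 0.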
+ */ +static bool +object_is_null(vm_object_checker_t *obj_checker) +{ + if (obj_checker == NULL) { + return true; + } + assert(obj_checker->kind != Deinited); + assert(obj_checker->kind != FreedObject); + assert(obj_checker->kind != EndObjects); + if (obj_checker->object_id_mode == object_has_known_id) { + return obj_checker->object_id == 0; + } + return false; +} + +static unsigned +object_checker_get_shadow_depth(vm_object_checker_t *obj_checker) +{ + if (obj_checker == NULL || obj_checker->shadow == NULL) { + return 0; + } + assert(!object_is_null(obj_checker)); /* null object must have no shadow */ + return 1 + object_checker_get_shadow_depth(obj_checker->shadow); +} + +static unsigned +object_checker_get_self_ref_count(vm_object_checker_t *obj_checker) +{ + if (object_is_null(obj_checker)) { + /* null object always has zero self_ref_count */ + return 0; + } else { + return obj_checker->self_ref_count; + } +} + +/* + * ref_count as reported by vm_region is: + * this object's self_ref_count + * plus all object self_ref_counts in its shadow chain + * minus the number of objects in its shadow chain + * (i.e. discounting the references internal to the shadow chain) + * TODO: also discounting references due to paging_in_progress + */ +static unsigned +object_checker_get_vm_region_ref_count(vm_object_checker_t *obj_checker) +{ + unsigned count = object_checker_get_self_ref_count(obj_checker); + while ((obj_checker = obj_checker->shadow)) { + count += object_checker_get_self_ref_count(obj_checker) - 1; + } + return count; +} + +/* + * Increments an object checker's refcount, mirroring the VM's refcount. + */ +static void +object_checker_reference(vm_object_checker_t *obj_checker) +{ + if (!object_is_null(obj_checker)) { + obj_checker->self_ref_count++; + } +} + +static void object_checker_deinit(vm_object_checker_t *obj_checker); /* forward */ +static void checker_list_free(checker_list_t *checker_list); /* forward */ + +/* + * Decrements an object checker's refcount, mirroring the VM's refcount. + */ +static void +object_checker_dereference(vm_object_checker_t *obj_checker) +{ + if (!object_is_null(obj_checker)) { + assert(obj_checker->self_ref_count > 0); + obj_checker->self_ref_count--; + if (obj_checker->self_ref_count == 0) { + /* + * We can't free this object checker because + * a checker list may still point to it. + * But we do tear down some of its contents. + */ + object_checker_deinit(obj_checker); + } + } +} + +static void +object_checker_deinit(vm_object_checker_t *obj_checker) +{ + if (obj_checker->kind != Deinited) { + object_checker_dereference(obj_checker->shadow); + obj_checker->shadow = NULL; + + if (obj_checker->submap_checkers) { + assert(obj_checker->kind == SubmapObject); + /* submap checker list must not store objects */ + assert(obj_checker->submap_checkers->objects == NULL); + checker_list_free(obj_checker->submap_checkers); + } + + /* + * Previously we kept the object_id intact so we could + * detect usage of an object that the checkers thought + * was dead. This caused false failures when the VM's + * vm_object_t allocator re-used an object pointer. + * Now we scrub the object_id of deinited objects + * so that vm_object_t pointer reuse is allowed. 
+ */ + obj_checker->object_id_mode = object_has_known_id; + obj_checker->object_id = ~0; + obj_checker->kind = Deinited; + } +} + +static void +object_checker_free(vm_object_checker_t *obj_checker) +{ + object_checker_deinit(obj_checker); + free(obj_checker); +} + +vm_object_checker_t * +object_checker_clone(vm_object_checker_t *obj_checker) +{ + assert(obj_checker->kind != SubmapObject); /* unimplemented */ + + vm_object_checker_t *result = object_checker_new(); + *result = *obj_checker; + + result->self_ref_count = 0; + result->object_id_mode = object_is_unknown; + result->object_id = 0; + result->shadow = NULL; + + result->next = NULL; + result->prev = NULL; + + return result; +} + + +/* + * Search a checker list for an object with the given object_id. + * Returns if no object is known to have that id. + */ +static vm_object_checker_t * +find_object_checker_for_object_id(checker_list_t *list, uint64_t object_id) +{ + /* object list is only stored in the top-level checker list */ + if (list->parent) { + return find_object_checker_for_object_id(list->parent, object_id); + } + + /* first object must be the null object */ + assert(list->objects && object_is_null(list->objects)); + + FOREACH_OBJECT_CHECKER(obj_checker, list) { + assert(obj_checker->kind != FreedObject); + switch (obj_checker->object_id_mode) { + case object_is_unknown: + case object_has_unknown_nonnull_id: + /* nope */ + break; + case object_has_known_id: + if (object_id == obj_checker->object_id) { + assert(obj_checker->kind != Deinited); + return obj_checker; + } + break; + } + } + + return NULL; +} + +/* + * Create a new object checker for the null vm object. + */ +static vm_object_checker_t * +make_null_object_checker(checker_list_t *checker_list) +{ + vm_object_checker_t *obj_checker = object_checker_new(); + obj_checker->kind = Anonymous; + obj_checker->verify = vm_object_attributes_with_default(true); + + obj_checker->object_id_mode = object_has_known_id; + obj_checker->object_id = 0; + + obj_checker->size = ~0u; + obj_checker->self_ref_count = 0; + obj_checker->fill_pattern.mode = DontFill; + + obj_checker->next = NULL; + obj_checker->prev = NULL; + + /* null object must be the first in the list */ + assert(checker_list->objects == NULL); + checker_list->objects = obj_checker; + + return obj_checker; +} + +/* + * Create a new object checker for anonymous memory. + * The new object checker is added to the checker list. + */ +static vm_object_checker_t * +make_anonymous_object_checker(checker_list_t *checker_list, mach_vm_size_t size) +{ + vm_object_checker_t *obj_checker = object_checker_new(); + obj_checker->kind = Anonymous; + obj_checker->verify = vm_object_attributes_with_default(true); + + /* don't know the object's id yet, we'll look it up later */ + obj_checker->object_id_mode = object_is_unknown; + obj_checker->object_id = 0; + + obj_checker->size = size; + obj_checker->self_ref_count = 0; + obj_checker->fill_pattern.mode = DontFill; + + obj_checker->next = NULL; + obj_checker->prev = NULL; + + checker_list_append_object(checker_list, obj_checker); + + return obj_checker; +} + +static void checker_list_move_objects_to_parent(checker_list_t *submap_list); /* forward */ + +/* + * Create a new object checker for a parent map submap entry's object. + * The submap's contents are verified using submap_checkers. + * The new object checker takes ownership of submap_checkers. + * The new object checker is added to the checker list. 
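+ *
+ * Typical flow (a sketch mirroring allocate_submap_storage_and_checker()
+ * below; `entries`, `objects`, their counts, and `parent_list` are
+ * hypothetical):
+ *   checker_list_t *submap_checkers = create_vm_state(entries, entry_count,
+ *       objects, object_count, SUBMAP_ALIGNMENT_MASK, "submap construction");
+ *   submapify(checker_range_start_address(submap_checkers->entries),
+ *       checker_range_end_address(submap_checkers->entries));
+ *   vm_object_checker_t *oc =
+ *       make_submap_object_checker(parent_list, submap_checkers);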
+ */
+static vm_object_checker_t *
+make_submap_object_checker(
+	checker_list_t *checker_list,
+	checker_list_t *submap_checkers)
+{
+	/* address range where the submap is currently mapped */
+	mach_vm_address_t submap_start = checker_range_start_address(submap_checkers->entries);
+	mach_vm_address_t submap_size = checker_range_size(submap_checkers->entries);
+	vm_object_checker_t *obj_checker = object_checker_new();
+	obj_checker->kind = SubmapObject;
+	obj_checker->verify = vm_object_attributes_with_default(true);
+
+	/* Look up the object_id stored in the parent map's submap entry. */
+	obj_checker->object_id = get_object_id_for_address(submap_start); /* submap_depth==0 */
+	obj_checker->object_id_mode = object_has_known_id;
+
+	obj_checker->size = submap_size;
+	obj_checker->self_ref_count = 0;
+	obj_checker->fill_pattern.mode = DontFill;
+
+	obj_checker->next = NULL;
+	obj_checker->prev = NULL;
+
+	obj_checker->submap_checkers = submap_checkers;
+
+	/*
+	 * Slide the submap checkers as if they were
+	 * checking a submap remapping at address 0.
+	 */
+	FOREACH_CHECKER(submap_checker, submap_checkers->entries) {
+		submap_checker->address -= submap_start;
+	}
+
+	/* Move the submap list's object checkers into the parent list. */
+	submap_checkers->parent = checker_list;
+	checker_list_move_objects_to_parent(submap_checkers);
+
+	checker_list_append_object(checker_list, obj_checker);
+
+	return obj_checker;
+}
+
+static vm_entry_checker_t *
+checker_new(void)
+{
+	return calloc(sizeof(vm_entry_checker_t), 1);
+}
+
+static void
+checker_free(vm_entry_checker_t *checker)
+{
+	object_checker_dereference(checker->object);
+	free(checker);
+}
+
+
+static checker_list_t *
+checker_list_new(void)
+{
+	checker_list_t *list = calloc(sizeof(*list), 1);
+
+	list->entries.head = NULL;
+	list->entries.tail = NULL;
+
+	make_null_object_checker(list);
+
+	return list;
+}
+
+void
+checker_list_append_object(
+	checker_list_t *list,
+	vm_object_checker_t *obj_checker)
+{
+	/* object list is only stored in the top-level checker list */
+	if (list->parent) {
+		return checker_list_append_object(list->parent, obj_checker);
+	}
+
+	/* first object must be the null object */
+	assert(list->objects && object_is_null(list->objects));
+
+	/* no additional null objects are allowed */
+	assert(!object_is_null(obj_checker));
+
+	/* new object must be currently unlinked */
+	assert(obj_checker->next == NULL && obj_checker->prev == NULL);
+
+	/* no duplicate IDs allowed */
+	if (obj_checker->object_id_mode == object_has_known_id) {
+		assert(!find_object_checker_for_object_id(list, obj_checker->object_id));
+	}
+
+	/* insert object after the null object */
+	vm_object_checker_t *left = list->objects;
+	vm_object_checker_t *right = list->objects->next;
+	obj_checker->prev = left;
+	obj_checker->next = right;
+	left->next = obj_checker;
+	if (right) {
+		right->prev = obj_checker;
+	}
+}
+
+/*
+ * Move object checkers from a submap checker list to its parent.
+ * Submap checker lists do not store objects.
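+ *
+ * Illustrative usage (a sketch; `parent_list` is hypothetical, and this
+ * mirrors make_submap_object_checker() above):
+ *   submap_checkers->parent = parent_list;
+ *   checker_list_move_objects_to_parent(submap_checkers);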
+ */ +static void +checker_list_move_objects_to_parent(checker_list_t *submap_list) +{ + vm_object_checker_t *obj_checker = submap_list->objects; + + checker_list_t *parent_list = submap_list->parent; + assert(parent_list != NULL); + + /* skip submap's null object, the parent should already have one */ + assert(obj_checker != NULL && object_is_null(obj_checker)); + obj_checker = obj_checker->next; + + while (obj_checker != NULL) { + vm_object_checker_t *cur = obj_checker; + obj_checker = obj_checker->next; + + cur->prev = cur->next = NULL; + checker_list_append_object(parent_list, cur); + } + + /* free submap's null object */ + object_checker_free(submap_list->objects); + submap_list->objects = NULL; +} + +unsigned +checker_range_count(entry_checker_range_t entry_range) +{ + unsigned count = 0; + FOREACH_CHECKER(checker, entry_range) { + count++; + } + return count; +} + +mach_vm_address_t +checker_range_start_address(entry_checker_range_t checker_range) +{ + return checker_range.head->address; +} + +mach_vm_address_t +checker_range_end_address(entry_checker_range_t checker_range) +{ + return checker_end_address(checker_range.tail); +} + +mach_vm_size_t +checker_range_size(entry_checker_range_t checker_range) +{ + return checker_range_end_address(checker_range) - checker_range_start_address(checker_range); +} + +/* + * Add a checker to the end of a checker range. + */ +static void +checker_range_append(entry_checker_range_t *list, vm_entry_checker_t *inserted) +{ + inserted->prev = list->tail; + if (!list->head) { + list->head = inserted; + } + if (list->tail) { + list->tail->next = inserted; + } + list->tail = inserted; +} + +/* + * Free a range of checkers. + * You probably don't want to call this. + * Use checker_list_free() or checker_list_free_range() instead. + */ +static void +checker_range_free(entry_checker_range_t range) +{ + /* not FOREACH_CHECKER due to use-after-free */ + vm_entry_checker_t *checker = range.head; + vm_entry_checker_t *end = range.tail->next; + while (checker != end) { + vm_entry_checker_t *dead = checker; + checker = checker->next; + checker_free(dead); + } +} + +static void +checker_list_free(checker_list_t *list) +{ + /* Free map entry checkers */ + checker_range_free(list->entries); + + /* Free object checkers. */ + vm_object_checker_t *obj_checker = list->objects; + while (obj_checker) { + vm_object_checker_t *dead = obj_checker; + obj_checker = obj_checker->next; + object_checker_free(dead); + } + + free(list); +} + +/* + * Clone a vm entry checker. + * The new clone increases its object's refcount. + * The new clone is unlinked from the checker list. + */ +static vm_entry_checker_t * +checker_clone(vm_entry_checker_t *old) +{ + vm_entry_checker_t *new_checker = checker_new(); + *new_checker = *old; + object_checker_reference(new_checker->object); + new_checker->prev = NULL; + new_checker->next = NULL; + return new_checker; +} + +static void +checker_set_pages_resident(vm_entry_checker_t *checker, mach_vm_size_t pages) +{ + checker->pages_resident = (uint32_t)pages; +} + +/* + * Return the nth checker in a linked list of checkers. + * Includes holes. + */ +static vm_entry_checker_t * +checker_nth(vm_entry_checker_t *checkers, unsigned n) +{ + assert(checkers != NULL); + if (n == 0) { + return checkers; + } else { + return checker_nth(checkers->next, n - 1); + } +} + +/* + * Return the nth checker in a checker list. + * Includes holes. 
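+ *
+ * For example (illustrative): with entries laid out as
+ * [allocation][hole][allocation], checker_list_nth(list, 1)
+ * returns the checker for the hole.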
+ */ +vm_entry_checker_t * +checker_list_nth(checker_list_t *list, unsigned n) +{ + return checker_nth(list->entries.head, n); +} + +static void +checker_list_apply_slide(checker_list_t *checker_list, mach_vm_address_t slide) +{ + FOREACH_CHECKER(checker, checker_list->entries) { + checker->address += slide; + } +} + +checker_list_t * +checker_get_and_slide_submap_checkers(vm_entry_checker_t *submap_parent) +{ + assert(submap_parent->kind == Submap); + assert(submap_parent->object); + checker_list_t *submap_checkers = submap_parent->object->submap_checkers; + assert(!submap_checkers->is_slid); + submap_checkers->is_slid = true; + submap_checkers->submap_slide = submap_parent->address - submap_parent->object_offset; + checker_list_apply_slide(submap_checkers, submap_checkers->submap_slide); + return submap_checkers; +} + +void +unslide_submap_checkers(checker_list_t *submap_checkers) +{ + assert(submap_checkers->is_slid); + submap_checkers->is_slid = false; + checker_list_apply_slide(submap_checkers, -submap_checkers->submap_slide); + submap_checkers->submap_slide = 0; +} + + +/* + * vm_region of submap contents clamps the reported + * address range to the parent map's submap entry, + * and also modifies some (but not all) fields to match. + * Our submap checkers model the submap's real contents. + * When verifying VM state, we "tweak" the checkers + * of submap contents to match what vm_region will + * report, and "untweak" the checkers afterwards. + * + * Note that these submap "tweaks" are separate from the + * submap "slide" (checker_get_and_slide_submap_checkers). + * Submap slide is applied any time the submap contents are used. + * Submap tweaks are applied only when comparing checkers to vm_region output. + */ + +typedef struct { + mach_vm_address_t address; + mach_vm_address_t size; + uint32_t pages_resident; +} checker_tweaks_t; + +typedef struct { + /* save the checker list so we can use attribute(cleanup) */ + checker_list_t *tweaked_checker_list; + + /* some entries are removed from the list; save them here */ + entry_checker_range_t original_entries; + + /* some entries are modified; save their old values here */ + vm_entry_checker_t new_head_original_contents; + vm_entry_checker_t new_tail_original_contents; +} checker_list_tweaks_t; + +static void +checker_tweak_for_vm_region(vm_entry_checker_t *checker, vm_entry_checker_t *submap_parent) +{ + /* clamp checker bounds to the submap window */ + mach_vm_size_t old_size = checker->size; + clamp_address_size_to_checker(&checker->address, &checker->size, submap_parent); + + /* + * scale pages_resident, on the assumption that either + * all pages are resident, or none of them (TODO page modeling) + */ + if (checker->size != old_size) { + assert(checker->size < old_size); + double scale = (double)checker->size / old_size; + checker->pages_resident *= scale; + } + + /* + * vm_region does NOT adjust the reported object offset, + * so don't tweak it here + */ +} + +static checker_list_tweaks_t +submap_checkers_tweak_for_vm_region( + checker_list_t *submap_checkers, + vm_entry_checker_t *submap_parent) +{ + assert(submap_checkers->is_slid); + + checker_list_tweaks_t tweaks; + tweaks.tweaked_checker_list = submap_checkers; + + /* The order below must reverse submap_checkers_untweak() */ + + /* + * Remove entries from the list that fall outside this submap window. 
+ * (we don't actually change the linked list, + * only the checker list's head and tail) + */ + tweaks.original_entries = submap_checkers->entries; + submap_checkers->entries = checker_list_find_range_including_holes(submap_checkers, + submap_parent->address, submap_parent->size); + + /* "clip" the new head and tail to the submap parent's bounds */ + vm_entry_checker_t *new_head = submap_checkers->entries.head; + vm_entry_checker_t *new_tail = submap_checkers->entries.tail; + + tweaks.new_head_original_contents = *new_head; + tweaks.new_tail_original_contents = *new_tail; + checker_tweak_for_vm_region(new_head, submap_parent); + checker_tweak_for_vm_region(new_tail, submap_parent); + + return tweaks; +} + +static void +cleanup_submap_checkers_untweak(checker_list_tweaks_t *tweaks) +{ + checker_list_t *submap_checkers = tweaks->tweaked_checker_list; + + /* The order below must reverse submap_checkers_tweak_for_vm_region() */ + + /* restore contents of narrowed head and tail */ + *submap_checkers->entries.tail = tweaks->new_tail_original_contents; + *submap_checkers->entries.head = tweaks->new_head_original_contents; + + /* + * restore entries clipped from the list + * + * old_prefix->head..old_prefix->tail <-> head..tail <-> old_suffix->head..old_suffix->tail + */ + submap_checkers->entries = tweaks->original_entries; +} + +#define DEFER_UNTWEAK __attribute__((cleanup(cleanup_submap_checkers_untweak))) + +/* + * Set an entry checker's object checker. + * Adjusts the refcount of the new object checker and (if any) the old object checker. + * Updates the entry's resident page count if the object has a fill pattern. + */ +void +checker_set_object(vm_entry_checker_t *checker, vm_object_checker_t *obj_checker) +{ + object_checker_reference(obj_checker); + if (checker->object) { + object_checker_dereference(checker->object); + } + checker->object = obj_checker; + + /* if the object has a fill pattern then the pages will be resident already */ + if (checker->object->fill_pattern.mode == Fill) { + checker_set_pages_resident(checker, checker->size / PAGE_SIZE); + } +} + +void +checker_make_shadow_object(checker_list_t *list, vm_entry_checker_t *checker) +{ + vm_object_checker_t *old_object = checker->object; + vm_object_checker_t *new_object = object_checker_clone(checker->object); + checker_list_append_object(list, new_object); + + new_object->size = checker->size; + checker->object_offset = 0; + + new_object->shadow = old_object; + object_checker_reference(old_object); + checker_set_object(checker, new_object); +} + +/* + * Set an entry checker's object to the null object. + */ +void +checker_set_null_object(checker_list_t *list, vm_entry_checker_t *checker) +{ + checker_set_object(checker, find_object_checker_for_object_id(list, 0)); +} + +/* + * vm_region computes share_mode from several other entry and object attributes. + * Mimic that here. + */ +uint8_t +checker_share_mode(vm_entry_checker_t *checker) +{ + vm_object_checker_t *obj_checker = checker->object; + + if (object_is_null(obj_checker)) { + return SM_EMPTY; + } + if (checker_is_submap(checker)) { + return SM_PRIVATE; + } + if (object_checker_get_shadow_depth(obj_checker) > 0) { + return SM_COW; + } + if (checker->needs_copy) { + return SM_COW; + } + if (object_checker_get_self_ref_count(obj_checker) == 1) { + /* TODO: self_ref_count == 2 && named */ + return SM_PRIVATE; + } + + return SM_SHARED; +} + + +/* + * Translate a share mode into a "narrowed" form. 
+ * - SM_TRUESHARED is mapped to SM_SHARED + * - SM_SHARED_ALIASED is unsupported. + * - TODO: SM_LARGE_PAGE + */ +static unsigned +narrow_share_mode(unsigned share_mode) +{ + switch (share_mode) { + case SM_TRUESHARED: + return SM_SHARED; + case SM_PRIVATE_ALIASED: + return SM_PRIVATE_ALIASED; + case SM_SHARED_ALIASED: + T_FAIL("unexpected/unimplemented share mode SM_SHARED_ALIASED"); + case SM_LARGE_PAGE: + T_FAIL("unexpected/unimplemented share mode SM_LARGE_PAGE"); + default: + return share_mode; + } +} + +/* + * Return true if a region and a checker have the same share_mode, + * after accounting for share mode distinctions that the checkers do not enforce. + */ +static bool +same_share_mode(vm_region_submap_info_data_64_t *info, vm_entry_checker_t *checker) +{ + return narrow_share_mode(info->share_mode) == + narrow_share_mode(checker_share_mode(checker)); +} + +/* + * Allocate an entry checker using designated initializer syntax. + */ +#define vm_entry_checker(...) \ + checker_clone(&(vm_entry_checker_t){ __VA_ARGS__ }) + +/* + * Allocate a new checker for an unallocated hole. + * The new checker is not linked into the list. + */ +static vm_entry_checker_t * +make_checker_for_hole(mach_vm_address_t address, mach_vm_size_t size) +{ + return vm_entry_checker( + .address = address, + .size = size, + .kind = Hole, + .verify = vm_entry_attributes_with_default(true) + ); +} + +static vm_entry_checker_t * +make_checker_for_anonymous_private( + checker_list_t *list, + vm_entry_template_kind_t kind, + mach_vm_address_t address, + mach_vm_size_t size, + vm_prot_t protection, + vm_prot_t max_protection, + uint16_t user_tag, + bool permanent) +{ + // fixme hack: if you ask for protection --x you get r-x + // fixme arm only? + if (protection == VM_PROT_EXECUTE) { + protection = VM_PROT_READ | VM_PROT_EXECUTE; + } + + assert(user_tag < 256); + + vm_entry_checker_t *checker = vm_entry_checker( + .kind = kind, + + .address = address, + .size = size, + + .object = NULL, /* set below */ + + .protection = protection, + .max_protection = max_protection, + .inheritance = VM_INHERIT_DEFAULT, + .behavior = VM_BEHAVIOR_DEFAULT, + .permanent = permanent, + + .user_wired_count = 0, + .user_tag = (uint8_t)user_tag, + + .object_offset = 0, + .pages_resident = 0, + .needs_copy = false, + + .verify = vm_entry_attributes_with_default(true) + ); + + checker_set_null_object(list, checker); + + return checker; +} + +vm_entry_checker_t * +make_checker_for_vm_allocate( + checker_list_t *list, + mach_vm_address_t address, + mach_vm_size_t size, + int flags_and_tag) +{ + /* Complain about flags not understood by this code. */ + + /* these flags are permitted but have no effect on the checker */ + int ignored_flags = + VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE | VM_FLAGS_RANDOM_ADDR | + VM_FLAGS_OVERWRITE; + + /* these flags are handled by this code */ + int handled_flags = VM_FLAGS_ALIAS_MASK /* tag */ | VM_FLAGS_PERMANENT; + + int allowed_flags = ignored_flags | handled_flags; + assert((flags_and_tag & ~allowed_flags) == 0); + + bool permanent = flags_and_tag & VM_FLAGS_PERMANENT; + uint16_t tag; + VM_GET_FLAGS_ALIAS(flags_and_tag, tag); + + return make_checker_for_anonymous_private( + list, Allocation, address, size, + VM_PROT_DEFAULT, VM_PROT_ALL, tag, permanent); +} + +/* + * Build a vm_checker for a newly-created shared memory region. + * The region is assumed to be a remapping of anonymous memory. + * Attributes not otherwise specified are assumed to have + * default values as set by mach_vm_map(). 
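+ *
+ * Illustrative call (a sketch; `list`, `addr`, and `size` are hypothetical,
+ * with the backing object coming from make_anonymous_object_checker()):
+ *   vm_object_checker_t *oc = make_anonymous_object_checker(list, size);
+ *   vm_entry_checker_t *ck = make_checker_for_shared(list, Allocation,
+ *       addr, size, 0, VM_PROT_DEFAULT, VM_PROT_ALL,
+ *       VM_MEMORY_APPLICATION_SPECIFIC_1, false, oc);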
+ * The new checker is not linked into the list. + */ +static vm_entry_checker_t * +make_checker_for_shared( + checker_list_t *list __unused, + vm_entry_template_kind_t kind, + mach_vm_address_t address, + mach_vm_size_t size, + mach_vm_address_t object_offset, + vm_prot_t protection, + vm_prot_t max_protection, + uint16_t user_tag, + bool permanent, + vm_object_checker_t *obj_checker) +{ + // fixme hack: if you ask for protection --x you get r-x + // fixme arm only? + if (protection == VM_PROT_EXECUTE) { + protection = VM_PROT_READ | VM_PROT_EXECUTE; + } + + assert(user_tag < 256); + vm_entry_checker_t *checker = vm_entry_checker( + .kind = kind, + + .address = address, + .size = size, + + .object = NULL, /* set below */ + + .protection = protection, + .max_protection = max_protection, + .inheritance = VM_INHERIT_DEFAULT, + .behavior = VM_BEHAVIOR_DEFAULT, + .permanent = permanent, + + .user_wired_count = 0, + .user_tag = (uint8_t)user_tag, + + .object_offset = object_offset, + .pages_resident = 0, + .needs_copy = false, + + .verify = vm_entry_attributes_with_default(true) + ); + + checker_set_object(checker, obj_checker); + + return checker; +} + +/* + * Build a checker for a parent map's submap entry. + */ +vm_entry_checker_t * +make_checker_for_submap( + mach_vm_address_t address, + mach_vm_size_t size, + mach_vm_address_t object_offset, + vm_object_checker_t *submap_object_checker) +{ + vm_entry_checker_t *checker = vm_entry_checker( + .kind = Submap, + .address = address, + .size = size, + .object = NULL, /* set below */ + .protection = VM_PROT_READ, + .max_protection = 0, /* set below */ + .inheritance = VM_INHERIT_SHARE, + .behavior = VM_BEHAVIOR_DEFAULT, + .permanent = false, /* see comment below */ + .user_wired_count = 0, + .user_tag = 0, + .submap_depth = 0, + .object_offset = object_offset, + .pages_resident = 0, + .needs_copy = false, + + .verify = vm_entry_attributes_with_default(true), + ); + + /* + * Submap max_protection differs on x86_64. + * (see VM_MAP_POLICY_WRITABLE_SHARED_REGION + * and vm_shared_region_insert_submap) + */ +#if __x86_64__ + checker->max_protection = VM_PROT_ALL; +#else + checker->max_protection = VM_PROT_READ; +#endif + + checker_set_object(checker, submap_object_checker); + + /* + * Real submap entries for the shared region are sometimes + * permanent (see shared_region_make_permanent()). + * This test does not attempt to duplicate that because + * permanent entries are difficult to manage in userspace. + */ + + return checker; +} + + +/* + * Print a checker's fields with optional highlighting. + */ +static void +dump_checker_info_with_highlighting( + vm_entry_checker_t *checker, + attribute_highlights_t highlights) +{ + const char *submap_prefix = checker->submap_depth > 0 ? SUBMAP_PREFIX : ""; + + /* Output order should match dump_region_info() for the reader's convenience. */ + + T_LOG("%sCHECKER %s0x%llx%s..%s0x%llx%s %s(size 0x%llx)%s (%s%s)", + submap_prefix, + HIGHLIGHT(checker->address, entry.address_attr), + HIGHLIGHT(checker_end_address(checker), entry.size_attr), + HIGHLIGHT(checker->size, entry.size_attr), + name_for_entry_kind(checker->kind), + checker->submap_depth > 0 ? 
" in submap" : ""); + + if (checker->kind == Hole) { + if (checker->submap_depth != 0) { + /* print submap depth to avoid confusion about holes inside submaps */ + T_LOG("%s %ssubmap_depth: %u%s", submap_prefix, HIGHLIGHT(checker->submap_depth, entry.submap_depth_attr)); + } + return; + } + + T_LOG("%s %sprotection: %s%s", submap_prefix, HIGHLIGHT(name_for_prot(checker->protection), entry.protection_attr)); + T_LOG("%s %smax protection: %s%s", submap_prefix, HIGHLIGHT(name_for_prot(checker->max_protection), entry.max_protection_attr)); + T_LOG("%s %sinheritance: %s%s", submap_prefix, HIGHLIGHT(name_for_inherit(checker->inheritance), entry.inheritance_attr)); + T_LOG("%s %sbehavior: %s%s", submap_prefix, HIGHLIGHT(name_for_behavior(checker->behavior), entry.behavior_attr)); + T_LOG("%s %suser wired count: %d%s", submap_prefix, HIGHLIGHT(checker->user_wired_count, entry.user_wired_count_attr)); + T_LOG("%s %suser tag: %d%s", submap_prefix, HIGHLIGHT(checker->user_tag, entry.user_tag_attr)); + T_LOG("%s %sobject offset: 0x%llx%s", submap_prefix, HIGHLIGHT(checker->object_offset, entry.object_offset_attr)); + + vm_object_checker_t *obj_checker = checker->object; + if (object_is_null(obj_checker)) { + T_LOG("%s %sobject id: %d%s", submap_prefix, HIGHLIGHT(0, entry.object_attr)); + } else if (obj_checker->object_id_mode == object_is_unknown) { + T_LOG("%s %sobject id: %s%s", submap_prefix, HIGHLIGHT("unknown", entry.object_attr)); + } else if (obj_checker->object_id_mode == object_has_unknown_nonnull_id) { + T_LOG("%s %sobject id: %s%s", submap_prefix, HIGHLIGHT("unknown, not null", entry.object_attr)); + } else { + assert(obj_checker->object_id_mode == object_has_known_id); + T_LOG("%s %sobject id: 0x%llx%s", submap_prefix, HIGHLIGHT(obj_checker->object_id, object.object_id_attr)); + for (vm_object_checker_t *shadow = obj_checker->shadow; shadow; shadow = shadow->shadow) { + T_LOG("%s %sshadow: 0x%llx%s", submap_prefix, HIGHLIGHT(shadow->object_id, object.object_id_attr)); + } + T_LOG("%s %sobject size: 0x%llx%s", submap_prefix, HIGHLIGHT(obj_checker->size, object.size_attr)); + T_LOG("%s %sref_count: %u%s", submap_prefix, HIGHLIGHT(object_checker_get_vm_region_ref_count(obj_checker), object.ref_count_attr)); + T_LOG("%s %sshadow_depth: %u%s", submap_prefix, HIGHLIGHT(object_checker_get_shadow_depth(obj_checker), object.shadow_depth_attr)); + T_LOG("%s %sself_ref_count: %u%s", submap_prefix, HIGHLIGHT(object_checker_get_self_ref_count(obj_checker), object.ref_count_attr)); + } + + T_LOG("%s %spages resident: %u%s", submap_prefix, HIGHLIGHT(checker->pages_resident, entry.pages_resident_attr)); + T_LOG("%s %sshare mode: %s%s", submap_prefix, HIGHLIGHT(name_for_share_mode(checker_share_mode(checker)), entry.share_mode_attr)); + T_LOG("%s %sis submap: %s%s", submap_prefix, HIGHLIGHT(name_for_bool(checker_is_submap(checker)), entry.is_submap_attr)); + T_LOG("%s %ssubmap_depth: %u%s", submap_prefix, HIGHLIGHT(checker->submap_depth, entry.submap_depth_attr)); + T_LOG("%s %spermanent: %s%s", submap_prefix, HIGHLIGHT(name_for_bool(checker->permanent), entry.permanent_attr)); +} + + +static void +dump_checker_info(vm_entry_checker_t *checker) +{ + /* + * Verified attributes are printed normally. + * Unverified attributes are printed ignored. 
+ */ + vm_entry_attribute_list_t verified_entry_attr = checker->verify; + vm_object_attribute_list_t verified_object_attr; + if (checker->verify.object_attr == false) { + /* object verification disabled entirely */ + verified_object_attr = vm_object_attributes_with_default(false); + } else if (checker->object == NULL) { + verified_object_attr = vm_object_attributes_with_default(true); + } else { + verified_object_attr = checker->object->verify; + } + + dump_checker_info_with_highlighting(checker, + normal_or_ignored_highlights(verified_entry_attr, verified_object_attr)); +} + +void +dump_checker_range( + entry_checker_range_t list) +{ + FOREACH_CHECKER(checker, list) { + dump_checker_info(checker); + if (checker_is_submap(checker)) { + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(checker); + dump_checker_range(submap_checkers->entries); + } + } +} + +/* + * Print a checker that failed verification, + * and the real VM regions overlapping it. + * Attributes in bad_entry_attr and bad_object_attr are printed as BadHighlight. + * Other attributes are printed as IgnoredHighlight. + */ +static void +warn_bad_checker( + vm_entry_checker_t *checker, + vm_entry_attribute_list_t bad_entry_attr, + vm_object_attribute_list_t bad_object_attr, + const char *message) +{ + attribute_highlights_t highlights = + bad_or_ignored_highlights(bad_entry_attr, bad_object_attr); + T_LOG("*** %s: expected ***", message); + dump_checker_info_with_highlighting(checker, highlights); + T_LOG("*** %s: actual ***", message); + dump_region_info_for_entry(checker, highlights); +} + +static mach_vm_size_t +overestimate_size(const vm_entry_template_t templates[], unsigned count) +{ + mach_vm_size_t size = 0; + for (unsigned i = 0; i < count; i++) { + bool overflowed = __builtin_add_overflow(size, templates[i].size, &size); + assert(!overflowed); + } + return size; +} + +/* + * The arena is a contiguous address range where the VM regions for + * a test are placed. Here we allocate the entire space to reserve it. + * Later, it is overwritten by each desired map entry or unallocated hole. + * + * Problem: We want to generate unallocated holes and verify later that + * they are still unallocated. But code like Rosetta compilation and + * Mach exceptions can allocate VM space outside out control. If those + * allocations land in our unallocated holes then a test may spuriously fail. + * Solution: The arena is allocated with VM_FLAGS_RANDOM_ADDR to keep it + * well away from the VM's allocation frontier. This does not prevent the + * problem entirely but so far it appears to dodge it with high probability. + * TODO: make this more reliable or completely safe somehow. + */ +static void +allocate_arena( + mach_vm_size_t arena_size, + mach_vm_size_t arena_alignment_mask, + mach_vm_address_t * const out_arena_address) +{ + mach_vm_size_t arena_unaligned_size; + mach_vm_address_t allocated = 0; + kern_return_t kr; + + /* + * VM_FLAGS_RANDOM_ADDR will often spuriously fail + * when using a large alignment mask. + * We instead allocate oversized and perform the alignment manually. 
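+	 *
+	 * Worked example (illustrative numbers): for arena_size = 0x8000 and
+	 * arena_alignment_mask = 0xfffff (1 MB - 1), we map
+	 * 0x8000 + 0x100000 bytes anywhere, round the start up to a 1 MB
+	 * boundary inside that block, and deallocate the leading and trailing
+	 * slop so that exactly 0x8000 aligned bytes remain reserved.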
+ */ + if (arena_alignment_mask > PAGE_MASK) { + arena_unaligned_size = arena_size + arena_alignment_mask + 1; + } else { + arena_unaligned_size = arena_size; + } + + kr = mach_vm_map(mach_task_self(), &allocated, arena_unaligned_size, + 0 /* alignment mask */, VM_FLAGS_ANYWHERE | VM_FLAGS_RANDOM_ADDR, + 0, 0, 0, 0, 0, 0); + + if (kr == KERN_NO_SPACE) { + /* + * VM_FLAGS_RANDOM_ADDR can spuriously fail even without alignment. + * Try again without it. + */ + kr = mach_vm_map(mach_task_self(), &allocated, arena_unaligned_size, + 0 /* alignment mask */, VM_FLAGS_ANYWHERE, + 0, 0, 0, 0, 0, 0); + if (kr == KERN_SUCCESS) { + T_LOG("note: forced to allocate arena without VM_FLAGS_RANDOM_ADDR"); + } + } + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "arena allocation " + "(size 0x%llx, alignment 0x%llx)", arena_size, arena_alignment_mask); + + if (arena_alignment_mask > PAGE_MASK) { + /* Align manually within the oversized allocation. */ + mach_vm_address_t aligned = (allocated & ~arena_alignment_mask) + arena_alignment_mask + 1; + mach_vm_address_t aligned_end = aligned + arena_size; + mach_vm_address_t allocated_end = allocated + arena_unaligned_size; + + assert(aligned >= allocated && aligned_end <= allocated_end); + assert((aligned & arena_alignment_mask) == 0); + assert((aligned & PAGE_MASK) == 0); + + /* trim the overallocation */ + (void)mach_vm_deallocate(mach_task_self(), allocated, aligned - allocated); + (void)mach_vm_deallocate(mach_task_self(), aligned_end, allocated_end - aligned_end); + + *out_arena_address = aligned; + } else { + /* No alignment needed. */ + *out_arena_address = allocated; + } +} + +static void +write_fill_pattern( + mach_vm_address_t start, + mach_vm_size_t size, + fill_pattern_t fill_pattern) +{ + assert(start % sizeof(uint64_t) == 0); + if (fill_pattern.mode == Fill) { + for (mach_vm_address_t c = start; + c < start + size; + c += sizeof(uint64_t)) { + *(uint64_t *)c = fill_pattern.pattern; + } + } +} + +/* + * Returns true if the memory contents of [start, start + size) + * matches the fill pattern. + * A fill pattern of DontFill always matches and never reads the memory. + * If the pattern did not match, *first_bad_address is set to the + * first address (uint64_t aligned) that did not match. + */ +static bool +verify_fill_pattern( + mach_vm_address_t start, + mach_vm_size_t size, + fill_pattern_t fill_pattern, + mach_vm_address_t * const first_bad_address) +{ + mach_vm_address_t end = start + size; + bool good = true; + assert(start % sizeof(uint64_t) == 0); + if (fill_pattern.mode == Fill) { + for (mach_vm_address_t c = start; + c < end; + c += sizeof(uint64_t)) { + if (*(uint64_t *)c != fill_pattern.pattern) { + if (first_bad_address) { + *first_bad_address = c; + } + good = false; + break; + } + } + } + + return good; +} + +/* Debug syscall to manipulate submaps. 
*/ + +typedef enum { + vsto_make_submap = 1, /* make submap from entries in current_map() at start..end, offset ignored */ + vsto_remap_submap = 2, /* map in current_map() at start..end, from submap address offset */ + vsto_end +} vm_submap_test_op; + +typedef struct { + vm_submap_test_op op; + mach_vm_address_t submap_base_address; + mach_vm_address_t start; + mach_vm_address_t end; + mach_vm_address_t offset; +} vm_submap_test_args; + +static void +submap_op(vm_submap_test_args *args) +{ + int err = sysctlbyname("vm.submap_test_ctl", + NULL, NULL, args, sizeof(*args)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "sysctl(vm.submap_test_ctl)"); +} + +/* Lower address range [start..end) into a submap at that same address. */ +static void +submapify(mach_vm_address_t start, mach_vm_address_t end) +{ + vm_submap_test_args args = { + .op = vsto_make_submap, + .submap_base_address = 0, + .start = start, + .end = end, + .offset = 0, + }; + submap_op(&args); +} + +/* + * submap_base_address is the start of a submap created with submapify(). + * Remap that submap or a portion thereof at [start, end). + * Use offset as the VME_OFFSET field in the parent map's submap entry. + */ +static void +remap_submap( + mach_vm_address_t submap_base_address, + mach_vm_address_t start, + mach_vm_size_t size, + mach_vm_address_t offset) +{ + vm_submap_test_args args = { + .op = vsto_remap_submap, + .submap_base_address = submap_base_address, + .start = start, + .end = start + size, + .offset = offset, + }; + submap_op(&args); +} + +/* + * Temporary scratch space for newly-created VM objects. + * Used by create_vm_state() and its helpers. + */ +typedef struct { + /* computed from entry templates */ + unsigned entry_count; + bool is_private; + mach_vm_size_t min_size; /* size required by entries that use it */ + + /* + * set when allocating the object's temporary backing storage + */ + mach_vm_address_t allocated_address; + mach_vm_size_t allocated_size; + vm_object_checker_t *checker; +} object_scratch_t; + +static void +allocate_submap_storage_and_checker( + checker_list_t *checker_list, + const vm_object_template_t *object_tmpl, + object_scratch_t *object_scratch) +{ + assert(object_tmpl->kind == SubmapObject); + assert(object_tmpl->size == 0); + assert(object_scratch->min_size > 0); + assert(object_scratch->entry_count > 0); + + /* + * Submap size is determined by its contents. + * min_size is the minimum size required for + * the offset/size of the parent map entries + * that remap this submap. + * We allocate the submap first, then check min_size. + */ + + /* + * Check some preconditions on the submap contents. + * This is in addition to the checks performed by create_vm_state(). + */ + for (unsigned i = 0; i < object_tmpl->submap.entry_count; i++) { + const vm_entry_template_t *tmpl = &object_tmpl->submap.entries[i]; + + assert(tmpl->kind != Hole); /* no holes, vm_map_seal fills them */ + assert(tmpl->kind != Submap); /* no nested submaps */ + } + + /* + * Allocate the submap's entries into temporary space, + * space, lower them into a submap, and build checkers for them. + * Later there will be entry templates in the parent map that + * remap this space and clone these checkers. + * This temporary space will be cleaned up when + * the object_scratch is destroyed at the end of create_vm_state(). 
+ */ + checker_list_t *submap_checkers = create_vm_state( + object_tmpl->submap.entries, object_tmpl->submap.entry_count, + object_tmpl->submap.objects, object_tmpl->submap.object_count, + SUBMAP_ALIGNMENT_MASK, "submap construction"); + + /* + * Update the returned submap checkers for vm_map_seal and submap lowering. + * - set the submap depth + * - resolve null objects + * - disable share mode verification (TODO vm_region says SM_COW, we say SM_PRIVATE) + * - TODO resolve needs_copy COW and change to COPY_DELAY + */ + FOREACH_CHECKER(submap_checker, submap_checkers->entries) { + T_QUIET; T_ASSERT_EQ(submap_checker->submap_depth, 0, "nested submaps not allowed"); + submap_checker->submap_depth = 1; + checker_resolve_null_vm_object(submap_checkers, submap_checker); + submap_checker->verify.share_mode_attr = false; + } + + mach_vm_address_t submap_start = checker_range_start_address(submap_checkers->entries); + mach_vm_address_t submap_end = checker_range_end_address(submap_checkers->entries); + assert(submap_start < submap_end); + + /* verify that the submap is bigger than min_size */ + T_QUIET; T_ASSERT_GE(submap_end - submap_start, object_scratch->min_size, + "some submap entry extends beyond the end of the submap object"); + + /* make it a real boy^W submap */ + submapify(submap_start, submap_end); + + /* + * Make an object checker for the entire submap. + * This checker stores the entry and object checkers for the submap's contents. + */ + vm_object_checker_t *obj_checker = make_submap_object_checker( + checker_list, submap_checkers); + + object_scratch->allocated_address = submap_start; + object_scratch->allocated_size = submap_end - submap_start; + object_scratch->checker = obj_checker; +} + +static void +allocate_object_storage_and_checker( + checker_list_t *checker_list, + const vm_object_template_t *object_tmpl, + object_scratch_t *object_scratch) +{ + kern_return_t kr; + + assert(object_tmpl->kind != EndObjects); + assert(object_scratch->entry_count > 0); + assert(object_scratch->min_size > 0); + + /* + * min_size is the required object size as determined by + * the entries using this object and their sizes and offsets. + * + * tmpl->size may be zero, in which case we allocate min_size bytes + * OR tmpl->size may be non-zero, in which case we allocate tmpl->size bytes + * and verify that it is at least as large as min_size. + */ + mach_vm_size_t size = object_tmpl->size ?: object_scratch->min_size; + assert(size >= object_scratch->min_size); + + if (object_scratch->is_private == 1) { + /* + * Object is private memory for a single entry. + * It will be allocated when the entry is created. + */ + assert(object_scratch->entry_count == 1); + object_scratch->allocated_address = 0; + object_scratch->allocated_size = 0; + object_scratch->checker = NULL; + } else if (object_tmpl->kind == Anonymous) { + /* + * Object is anonymous memory and shared or COW + * by multiple entries. Allocate temporary space now. + * Each entry will copy or share it when the entries + * are created. Then this temporary allocation will be freed. 
+ */ + // fixme double-check that freeing this backing store + // does not interfere with COW state + mach_vm_address_t address = 0; + kr = mach_vm_allocate(mach_task_self(), &address, size, + VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_MEMORY_SCENEKIT)); + assert(kr == 0); + + object_scratch->allocated_address = address; + object_scratch->allocated_size = size; + + object_scratch->checker = make_anonymous_object_checker( + checker_list, size); + + write_fill_pattern(address, size, object_tmpl->fill_pattern); + object_scratch->checker->fill_pattern = object_tmpl->fill_pattern; + } else { + T_FAIL("unexpected/unimplemented: object is neither private nor anonymous nor submap"); + } +} + + +/* + * Choose an entry's user_tag value. + * If the requested value is an ordinary tag, use it. + * If the requested value is autoincrementing, pick the next + * autoincrementing tag. *inc stores the persistent increment + * state and should be cleared before the first call. + */ +static uint8_t +choose_user_tag(uint16_t requested_tag, uint8_t *inc) +{ + uint8_t assigned_tag; + if (requested_tag == VM_MEMORY_TAG_AUTOINCREMENTING) { + /* choose an incrementing tag 1..16 */ + assigned_tag = VM_MEMORY_APPLICATION_SPECIFIC_1 + *inc; + *inc = (*inc + 1) % 16; + } else { + /* ordinary tag */ + assert(requested_tag < 256); + assigned_tag = (uint8_t)requested_tag; + } + return assigned_tag; +} + + +/* + * SM_EMPTY is the default template share mode, + * but we allow other template values to implicitly + * override it. + */ +static uint8_t +template_real_share_mode(const vm_entry_template_t *tmpl) +{ + if (tmpl->share_mode != SM_EMPTY) { + return tmpl->share_mode; + } + + /* things that can override SM_EMPTY */ + if (tmpl->user_wired_count > 0) { + return SM_PRIVATE; + } + if (tmpl->object && tmpl->object->fill_pattern.mode == Fill) { + return SM_PRIVATE; + } + + return SM_EMPTY; +} + +static void +create_vm_hole( + const vm_entry_template_t *tmpl, + mach_vm_address_t dest_address, + checker_list_t *checker_list) +{ + kern_return_t kr; + + assert(dest_address % PAGE_SIZE == 0); + assert(tmpl->size % PAGE_SIZE == 0); + assert(tmpl->object == NULL); + + /* deallocate the hole */ + kr = mach_vm_deallocate(mach_task_self(), + dest_address, tmpl->size); + assert(kr == 0); + + /* add a checker for the unallocated space */ + checker_range_append(&checker_list->entries, + make_checker_for_hole(dest_address, tmpl->size)); +} + +static void +create_vm_submap( + const vm_entry_template_t *tmpl, + object_scratch_t *object_scratch, + mach_vm_address_t dest_address, + checker_list_t *checker_list) +{ + kern_return_t kr; + + /* entry must not extend beyond submap's backing store */ + assert(tmpl->offset + tmpl->size <= object_scratch->allocated_size); + + /* deallocate space for the new submap entry */ + /* TODO vsto_remap_submap should copy-overwrite */ + kr = mach_vm_deallocate(mach_task_self(), + dest_address, tmpl->size); + assert(kr == 0); + + remap_submap(object_scratch->allocated_address, + dest_address, tmpl->size, tmpl->offset); + + /* + * Create a map entry checker for the parent map's submap entry. + * Its object checker is the submap checker, which in turn + * contains the entry checkers for the submap's contents. 
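+ * tmpl->offset matches the VME_OFFSET that remap_submap() installed
+ * in the parent map's submap entry.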
+ */ + checker_range_append(&checker_list->entries, + make_checker_for_submap(dest_address, tmpl->size, tmpl->offset, + object_scratch->checker)); +} + +__attribute__((overloadable)) +checker_list_t * +create_vm_state( + const vm_entry_template_t entry_templates[], + unsigned entry_template_count, + const vm_object_template_t object_templates[], + unsigned object_template_count, + mach_vm_size_t alignment_mask, + const char *message) +{ + const vm_object_template_t *start_object_templates = &object_templates[0]; + const vm_object_template_t *end_object_templates = &object_templates[object_template_count]; + checker_list_t *checker_list = checker_list_new(); + uint8_t tag_increment = 0; + kern_return_t kr; + + /* temporary scratch space for new objects for shared and COW entries */ + object_scratch_t *new_objects = + calloc(sizeof(object_scratch_t), object_template_count); + + /* Check some preconditions */ + + assert(is_valid_alignment_mask(alignment_mask)); + assert(entry_template_count > 0); + + /* + * Check preconditions of each entry template + * and accumulate some info about their respective objects. + */ + for (unsigned i = 0; i < entry_template_count; i++) { + const vm_entry_template_t *tmpl = &entry_templates[i]; + + assert(tmpl->kind != EndEntries); + assert(tmpl->size > 0); + assert(tmpl->size % PAGE_SIZE == 0); + assert(tmpl->inheritance <= VM_INHERIT_LAST_VALID); + + /* reject VM_PROT_EXEC; TODO: support it somehow */ + T_QUIET; T_ASSERT_TRUE(prot_contains_all(VM_PROT_READ | VM_PROT_WRITE, tmpl->protection), + "entry template #%u protection 0x%x exceeds VM_PROT_READ | VM_PROT_WRITE", i, tmpl->protection); + + T_QUIET; T_ASSERT_TRUE(prot_contains_all(VM_PROT_ALL, tmpl->max_protection), + "entry template #%u max_protection 0x%x exceeds VM_PROT_ALL", i, tmpl->max_protection); + + T_QUIET; T_ASSERT_TRUE(prot_contains_all(tmpl->max_protection, tmpl->protection), + "entry template #%u protection exceeds max_protection (%s/%s)", + i, name_for_prot(tmpl->protection), name_for_prot(tmpl->max_protection)); + + /* entry can't be COW and wired at the same time */ + assert(!(tmpl->user_wired_count > 0 && template_real_share_mode(tmpl) == SM_COW)); + + /* + * We only allow vm_behavior_t values that are stored + * persistently in the entry. + * Non-persistent behaviors don't make sense here because + * they're really more like one-shot memory operations. + */ + assert(is_persistent_vm_behavior(tmpl->behavior)); + + /* + * Non-zero offset in object not implemented for + * SM_EMPTY and SM_PRIVATE. + * (TODO might be possible for SM_PRIVATE.) + */ + if (tmpl->kind != Submap) { + switch (template_real_share_mode(tmpl)) { + case SM_EMPTY: + case SM_PRIVATE: + assert(tmpl->offset == 0); /* unimplemented */ + break; + default: + break; + } + } else { + /* Submap entries are SM_PRIVATE and can be offset. 
*/ + } + + /* entry's object template must be NULL or in the object list */ + object_scratch_t *object_scratch = NULL; + if (tmpl->object) { + assert(tmpl->object >= start_object_templates && + tmpl->object < end_object_templates); + + object_scratch = + &new_objects[tmpl->object - start_object_templates]; + + /* object size must be large enough to span this entry */ + mach_vm_size_t min_size = tmpl->offset + tmpl->size; + if (object_scratch->min_size < min_size) { + object_scratch->min_size = min_size; + } + } + + if (tmpl->kind == Submap) { + /* submap */ + assert(tmpl->object); + assert(tmpl->object->kind == SubmapObject); + object_scratch->entry_count++; + object_scratch->is_private = false; + } else { + /* not submap */ + assert(tmpl->object == NULL || tmpl->object->kind != SubmapObject); + + /* + * object entry_count is the number of entries that use it + * + * object is_private if its only reference + * is an entry with share mode private + */ + switch (template_real_share_mode(tmpl)) { + case SM_EMPTY: + /* + * empty may not have an object + * (but note that some options may override SM_EMPTY, + * see template_real_share_mode()) + */ + assert(tmpl->object == NULL); + break; + case SM_PRIVATE: + /* + * private: + * object is optional + * object must not be used already + * object will be private + */ + if (tmpl->object) { + assert(object_scratch->entry_count == 0 && + "SM_PRIVATE entry template may not share " + "its object template with any other entry"); + object_scratch->entry_count = 1; + object_scratch->is_private = true; + } + break; + case SM_SHARED: + /* case SM_TRUESHARED, TODO maybe */ + case SM_COW: + /* + * shared or cow: + * object is required + * object must not be private already + */ + assert(tmpl->object); + assert(object_scratch->is_private == false); + object_scratch->entry_count++; + break; + default: + T_FAIL("unexpected/unimplemented: unsupported share mode"); + } + } + } + + /* + * Check that every SM_SHARED entry really does share + * its object with at least one other entry. + */ + for (unsigned i = 0; i < entry_template_count; i++) { + const vm_entry_template_t *tmpl = &entry_templates[i]; + const vm_object_template_t *object_tmpl = tmpl->object; + object_scratch_t *object_scratch = + tmpl->object ? &new_objects[object_tmpl - start_object_templates] : NULL; + + if (template_real_share_mode(tmpl) == SM_SHARED) { + assert(tmpl->object != NULL && + "SM_SHARED entry template must have an object template"); + assert(object_scratch->entry_count > 1 && + "SM_SHARED entry's object template must be used by at least one other entry"); + } + } + + /* + * Check some preconditions of object templates, + * and allocate backing storage and checkers for objects that are shared. + * (Objects that are private are handled when the entry is created.) + * + * This also allocates backing storage and checkers for submaps in a + * similar way to shared non-submaps. The submap mapping(s) into this + * arena's address range, and the checkers thereof, are handled later. + */ + for (unsigned i = 0; i < object_template_count; i++) { + const vm_object_template_t *object_tmpl = &object_templates[i]; + object_scratch_t *object_scratch = &new_objects[i]; + + if (object_tmpl->kind == SubmapObject) { + allocate_submap_storage_and_checker( + checker_list, object_tmpl, object_scratch); + } else { + allocate_object_storage_and_checker( + checker_list, object_tmpl, object_scratch); + } + } + + /* Allocate a range large enough to span all requested entries. 
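+ * The arena is carved up entry by entry below; any unused tail is
+ * deallocated at the end of this function.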
*/ + mach_vm_address_t arena_address = 0; + mach_vm_address_t arena_end = 0; + { + mach_vm_size_t arena_size = + overestimate_size(entry_templates, entry_template_count); + allocate_arena(arena_size, alignment_mask, &arena_address); + arena_end = arena_address + arena_size; + } + + /* Carve up the allocated range into the requested entries. */ + for (unsigned i = 0; i < entry_template_count; i++) { + const vm_entry_template_t *tmpl = &entry_templates[i]; + const vm_object_template_t *object_tmpl = tmpl->object; + object_scratch_t *object_scratch = + tmpl->object ? &new_objects[object_tmpl - start_object_templates] : NULL; + + /* + * Assign a user_tag, resolving autoincrementing if requested. + */ + uint8_t assigned_tag = choose_user_tag(tmpl->user_tag, &tag_increment); + + unsigned permanent_flag = tmpl->permanent ? VM_FLAGS_PERMANENT : 0; + + /* Allocate the entry. */ + + if (tmpl->kind == Hole) { + create_vm_hole(tmpl, arena_address, checker_list); + arena_address += tmpl->size; + continue; + } else if (tmpl->kind == Submap) { + create_vm_submap(tmpl, object_scratch, arena_address, checker_list); + arena_address += tmpl->size; + continue; + } else { + assert(tmpl->kind == Allocation); + } + + /* new entry is a real allocation */ + if (template_real_share_mode(tmpl) == SM_SHARED) { + /* + * New map entry is shared: it shares + * the same object as some other map entry. + * + * Create the entry using mach_make_memory_entry() + * and mach_vm_map(). The source is the object's + * temporary backing store (or a portion thereof). + * + * We don't use vm_remap to share because it can't + * set the user_tag. + */ + + /* must not extend beyond object's temporary backing store */ + assert(tmpl->offset + tmpl->size <= object_scratch->allocated_size); + + /* create the memory entry covering the entire source object */ + mach_vm_size_t size = tmpl->size; + mach_port_t memory_entry_port; + kr = mach_make_memory_entry_64(mach_task_self(), + &size, + object_scratch->allocated_address + tmpl->offset, /* src */ + tmpl->protection | MAP_MEM_VM_SHARE, + &memory_entry_port, MEMORY_OBJECT_NULL); + assert(kr == 0); + assert(size == tmpl->size); + + /* map the memory entry */ + mach_vm_address_t allocated_address = arena_address; + kr = mach_vm_map(mach_task_self(), + &allocated_address, + tmpl->size, + 0, /* alignment mask */ + VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(assigned_tag) | permanent_flag, + memory_entry_port, /* src */ + 0, /* offset - already applied during mmme */ + false, /* copy */ + tmpl->protection, + tmpl->max_protection, + VM_INHERIT_DEFAULT); + assert(kr == 0); + assert(allocated_address == arena_address); + + /* tear down the memory entry */ + mach_port_deallocate(mach_task_self(), memory_entry_port); + + /* set up the checkers */ + vm_entry_checker_t *checker = make_checker_for_shared( + checker_list, tmpl->kind, + allocated_address, tmpl->size, tmpl->offset, + tmpl->protection, tmpl->max_protection, + assigned_tag, tmpl->permanent, object_scratch->checker); + checker_range_append(&checker_list->entries, checker); + + arena_address = allocated_address + tmpl->size; + } else if (tmpl->object == NULL || tmpl->object->kind == Anonymous) { + /* + * New entry's object is null or anonymous private memory. + * Create the entry using mach_vm_map. + */ + + /* + * We attempt to map the memory with the correct protections + * from the start, because this is more capable than + * mapping with more permissive protections and then + * calling vm_protect. 
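+ * (For example, max_protection can only be lowered after the entry
+ * exists, never raised, so the initial mapping must already include
+ * every permission the entry will ever need.)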
+ * + * But sometimes we need to read or write the memory + * during setup. In that case we are forced to map + * permissively and vm_protect later. + */ + vm_prot_t initial_protection = tmpl->protection; + vm_prot_t initial_max_protection = tmpl->max_protection; + bool protect_last = false; + if (template_real_share_mode(tmpl) == SM_PRIVATE || + tmpl->object != NULL) { + protect_last = true; + initial_protection |= VM_PROT_READ | VM_PROT_WRITE; + initial_max_protection |= VM_PROT_READ | VM_PROT_WRITE; + } + + mach_vm_address_t allocated_address = arena_address; + kr = mach_vm_map(mach_task_self(), + &allocated_address, + tmpl->size, + 0, /* alignment mask */ + VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(assigned_tag) | permanent_flag, + 0, /* memory object */ + 0, /* object offset */ + false, /* copy */ + initial_protection, + initial_max_protection, + VM_INHERIT_DEFAULT); + assert(kr == 0); + assert(allocated_address == arena_address); + + vm_entry_checker_t *checker = make_checker_for_anonymous_private( + checker_list, + tmpl->kind, allocated_address, tmpl->size, + tmpl->protection, tmpl->max_protection, assigned_tag, + tmpl->permanent); + checker_range_append(&checker_list->entries, checker); + + arena_address = allocated_address + tmpl->size; + + if (template_real_share_mode(tmpl) == SM_PRIVATE) { + /* + * New entry needs a non-null object. + * tmpl->object may be NULL or have no fill pattern, + * in which case the caller wants a non-null + * object with no resident pages. + */ + vm_object_checker_t *obj_checker = + make_anonymous_object_checker(checker_list, + checker->object_offset + checker->size); + if (tmpl->object) { + obj_checker->fill_pattern = tmpl->object->fill_pattern; + write_fill_pattern(checker->address, checker->size, + obj_checker->fill_pattern); + } else { + /* + * no object template: fill with zeros + * to get a vm object, then kill its pages. + */ + write_fill_pattern(checker->address, checker->size, + (fill_pattern_t){Fill, 0}); + kr = mach_vm_behavior_set(mach_task_self(), + checker->address, checker->size, VM_BEHAVIOR_FREE); + assert(kr == 0); + kr = mach_vm_behavior_set(mach_task_self(), + checker->address, checker->size, VM_BEHAVIOR_PAGEOUT); + assert(kr == 0); + } + checker_set_object(checker, obj_checker); + } else if (tmpl->object != NULL) { + /* + * New entry needs a real object for COW. + * (SM_SHARED was handled above) + */ + assert(template_real_share_mode(tmpl) == SM_COW); + kr = mach_vm_copy(mach_task_self(), + object_scratch->allocated_address + tmpl->offset, + tmpl->size, allocated_address); + assert(kr == 0); + checker_set_object(checker, object_scratch->checker); + checker->needs_copy = true; + } + + if (protect_last) { + /* + * Set protection and max_protection + * if we couldn't do it up front. + */ + kr = mach_vm_protect(mach_task_self(), + allocated_address, tmpl->size, false /*set_max*/, tmpl->protection); + assert(kr == 0); + kr = mach_vm_protect(mach_task_self(), + allocated_address, tmpl->size, true /*set_max*/, tmpl->max_protection); + assert(kr == 0); + } + } else if (template_real_share_mode(tmpl) == SM_PRIVATE) { + /* + * New entry's object is private non-anonymous memory + * TODO named entries + */ + T_FAIL("unexpected/unimplemented: non-anonymous memory unimplemented"); + } else { + T_FAIL("unexpected/unimplemented: unrecognized share mode"); + } + } + + /* + * All entries now have their objects set. + * Deallocate temporary storage for shared objects. 
+ * Do this before verifying share_mode: any sharing from + * the temporary object storage itself should not count. + */ + for (unsigned i = 0; i < object_template_count; i++) { + object_scratch_t *object_scratch = &new_objects[i]; + + if (object_scratch->allocated_address > 0) { + kr = mach_vm_deallocate(mach_task_self(), + object_scratch->allocated_address, + object_scratch->allocated_size); + assert(kr == 0); + object_scratch->allocated_address = 0; + object_scratch->allocated_size = 0; + } + } + + /* + * All of the entries and checkers are in place. + * Now set each entry's properties. + */ + for (unsigned i = 0; i < entry_template_count; i++) { + const vm_entry_template_t *tmpl = &entry_templates[i]; + vm_entry_checker_t *checker = + checker_list_nth(checker_list, i); + + if (tmpl->kind == Hole) { + continue; /* nothing else to do for holes */ + } + if (tmpl->kind == Submap) { + continue; /* nothing else to do for submaps */ + } + assert(tmpl->kind == Allocation); + + /* user_tag - already set */ + + /* permanent - already set */ + + /* + * protection, max_protection - already set + * We set these in mach_vm_map() because setting default + * values in mach_vm_map() and then adjusting them with + * mach_vm_protect() is less capable. + */ + + /* inheritance */ + if (tmpl->inheritance != VM_INHERIT_DEFAULT) { + kr = mach_vm_inherit(mach_task_self(), + checker->address, checker->size, + tmpl->inheritance); + assert(kr == 0); + checker->inheritance = tmpl->inheritance; + } + + /* behavior */ + if (tmpl->behavior != VM_BEHAVIOR_DEFAULT) { + checker->behavior = tmpl->behavior; + kr = mach_vm_behavior_set(mach_task_self(), + checker->address, checker->size, tmpl->behavior); + assert(kr == 0); + } + + /* user_wired_count */ + if (tmpl->user_wired_count > 0) { + checker_resolve_null_vm_object(checker_list, checker); + checker->user_wired_count = tmpl->user_wired_count; + for (uint16_t w = 0; w < tmpl->user_wired_count; w++) { + kr = mach_vm_wire(host_priv(), mach_task_self(), + checker->address, checker->size, VM_PROT_READ); + assert(kr == 0); + } + } + + /* + * Verify that the template's share mode matches + * the checker's share mode, after allowing for + * some mismatches for usability purposes. + * Do this last. + */ + assert(template_real_share_mode(tmpl) == checker_share_mode(checker)); + } + + /* Deallocate any remaining arena space */ + kr = mach_vm_deallocate(mach_task_self(), + arena_address, arena_end - arena_address); + assert(kr == 0); + + /* Deallocate scratch space */ + free(new_objects); + + /* Verify that our entries and checkers match. */ + assert(verify_vm_state(checker_list, message)); + + return checker_list; +} + +void +create_vm_state_from_config( + vm_config_t *config, + checker_list_t ** const out_checker_list, + mach_vm_address_t * const out_start_address, + mach_vm_address_t * const out_end_address) +{ + checker_list_t *list = create_vm_state( + config->entry_templates, config->entry_template_count, + config->object_templates, config->object_template_count, + config->alignment_mask, "before test"); + + /* + * Adjusted start and end address are relative to the + * templates' first and last entry (holes ARE included) + */ + + *out_start_address = list->entries.head->address + config->start_adjustment; + *out_end_address = checker_end_address(list->entries.tail) + config->end_adjustment; + assert(*out_start_address < *out_end_address); + + *out_checker_list = list; +} + + +/* + * Deallocate the real memory and update the checker for the end of a test. 
+ * The checker itself may be deallocated and replaced. + */ +static void +checker_deallocate_allocation(checker_list_t *list, vm_entry_checker_t *checker) +{ + assert(checker->kind == Allocation || checker->kind == Submap); + + kern_return_t kr = mach_vm_deallocate(mach_task_self(), + checker->address, checker->size); + assert(kr == 0); + + if (checker->permanent) { + /* permanent entry becomes inaccessible */ + checker->protection = VM_PROT_NONE; + checker->max_protection = VM_PROT_NONE; + + /* + * hack: disable verification of some attributes + * that verify_vm_faultability perturbed + */ + checker->verify.object_attr = false; + checker->verify.share_mode_attr = false; + checker->verify.pages_resident_attr = false; + + /* + * Don't verify fill pattern because the verifier + * is noisy when the memory is inaccessible. + */ + if (checker->object) { + checker->object->verify.fill_pattern_attr = false; + } + } else { + /* nonpermanent entry becomes a deallocated hole */ + vm_entry_checker_t *new_hole = + make_checker_for_hole(checker->address, checker->size); + checker_list_replace_checker(list, checker, new_hole); + } +} + +/* + * Deallocate the VM allocations covered by the checkers. + * Updates the checkers so that entry permanence can be verified later. + * + * Not recommended after verification errors because the + * true VM allocations may not match the checkers' list. + */ +static void +deallocate_vm_allocations(checker_list_t *list) +{ + /* not FOREACH_CHECKER due to use-after-free */ + vm_entry_checker_t *checker = list->entries.head; + vm_entry_checker_t *end = list->entries.tail->next; + while (checker != end) { + vm_entry_checker_t *next = checker->next; + + if (checker->kind == Allocation || checker->kind == Submap) { + checker_deallocate_allocation(list, checker); + } + + checker = next; + } +} + +static void +learn_object_id( + checker_list_t *checker_list, + vm_object_checker_t *obj_checker, + uint64_t object_id, + vm_entry_attribute_list_t * const out_bad_entry_attr, + vm_object_attribute_list_t * const out_bad_object_attr, + const char *message) +{ + assert(obj_checker->object_id_mode != object_has_known_id); + + if (find_object_checker_for_object_id(checker_list, object_id)) { + /* + * This object should have its own id, + * but we already have another object + * checker with this id. That's bad. + */ + T_FAIL("%s: wrong object id (expected new id, got existing id)", message); + out_bad_entry_attr->object_attr = true; + out_bad_object_attr->object_id_attr = true; + } else { + /* + * Remember this object id. + * If other entries should have the same object + * but don't then the mismatch will be + * detected when they are verified. + */ + obj_checker->object_id_mode = object_has_known_id; + obj_checker->object_id = object_id; + } +} + +/* + * Verify VM state of an address range that is expected to be an allocation. + * Returns true if it looks correct. + * T_FAILs and logs details and returns false if it looks wrong. 
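+ * The actual state comes from vm_region (get_info_for_address())
+ * and is compared attribute by attribute against the checker.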
+ */ +static bool +verify_allocation( + checker_list_t *checker_list, + vm_entry_checker_t *checker, + const char *message) +{ + vm_entry_attribute_list_t bad_entry_attr = + vm_entry_attributes_with_default(false); + vm_object_attribute_list_t bad_object_attr = + vm_object_attributes_with_default(false); + + assert(checker->kind == Allocation || checker->kind == Submap); + + /* Call vm_region to get the actual VM state */ + mach_vm_address_t actual_address = checker->address; + mach_vm_size_t actual_size = 0; + vm_region_submap_info_data_64_t info; + if (!get_info_for_address(&actual_address, &actual_size, &info, checker->submap_depth)) { + /* address was unmapped - not a valid allocation */ + if (checker->submap_depth && is_mapped(checker->address, 0)) { + /* address was mapped, but checker wanted a submap */ + T_FAIL("%s: allocation was expected to be in a submap", message); + } else { + /* address was unmapped at every submap depth */ + T_FAIL("%s: allocation was not mapped", message); + } + bad_entry_attr.is_submap_attr = true; + bad_entry_attr.submap_depth_attr = true; + warn_bad_checker(checker, bad_entry_attr, bad_object_attr, message); + return false; + } + + /* Report any differences between the checker and the actual state. */ + + if (actual_address != checker->address || + actual_size != checker->size) { + /* address is mapped, but region doesn't match template exactly */ + T_FAIL("%s: entry bounds did not match", message); + bad_entry_attr.address_attr = true; + bad_entry_attr.size_attr = true; + } + + if (checker->verify.protection_attr && + info.protection != checker->protection) { + T_FAIL("%s: wrong protection", message); + bad_entry_attr.protection_attr = true; + } + if (checker->verify.max_protection_attr && + info.max_protection != checker->max_protection) { + T_FAIL("%s: wrong max protection", message); + bad_entry_attr.max_protection_attr = true; + } + if (checker->verify.inheritance_attr && + info.inheritance != checker->inheritance) { + T_FAIL("%s: wrong inheritance", message); + bad_entry_attr.inheritance_attr = true; + } + if (checker->verify.behavior_attr && + info.behavior != checker->behavior) { + T_FAIL("%s: wrong behavior", message); + bad_entry_attr.behavior_attr = true; + } + if (checker->verify.user_wired_count_attr && + info.user_wired_count != checker->user_wired_count) { + T_FAIL("%s: wrong user wired count", message); + bad_entry_attr.user_wired_count_attr = true; + } + if (checker->verify.user_tag_attr && + info.user_tag != checker->user_tag) { + T_FAIL("%s: wrong user tag", message); + bad_entry_attr.user_tag_attr = true; + } + if (checker->verify.is_submap_attr && + info.is_submap != checker_is_submap(checker)) { + T_FAIL("%s: wrong is_submap", message); + bad_entry_attr.is_submap_attr = true; + bad_entry_attr.submap_depth_attr = true; + } + + if (checker->verify.object_offset_attr && + info.offset != checker->object_offset) { + T_FAIL("%s: wrong object offset", message); + bad_entry_attr.object_offset_attr = true; + } + + if (checker->verify.object_attr) { + vm_object_checker_t *obj_checker = checker->object; + assert(obj_checker != NULL); + assert(obj_checker->kind != Deinited); + + unsigned vm_region_ref_count = object_checker_get_vm_region_ref_count(obj_checker); + unsigned shadow_depth = object_checker_get_shadow_depth(obj_checker); + + if (obj_checker->verify.object_id_attr) { + switch (obj_checker->object_id_mode) { + case object_is_unknown: + learn_object_id(checker_list, obj_checker, info.object_id_full, + &bad_entry_attr, &bad_object_attr, 
message); + break; + case object_has_unknown_nonnull_id: + /* + * We don't know the right object id, + * but we know that zero is wrong. + */ + if (info.object_id_full == 0) { + T_FAIL("%s: wrong object id (expected nonzero)", message); + bad_entry_attr.object_attr = true; + bad_object_attr.object_id_attr = true; + break; + } + learn_object_id(checker_list, obj_checker, info.object_id_full, + &bad_entry_attr, &bad_object_attr, message); + break; + case object_has_known_id: + if (info.object_id_full != obj_checker->object_id) { + T_FAIL("%s: wrong object id", message); + bad_entry_attr.object_attr = true; + bad_object_attr.object_id_attr = true; + } + break; + } + } + + /* + * can't check object's true size, but we can + * check that it is big enough for this vm entry + */ + if (obj_checker->verify.size_attr && + info.offset + actual_size > obj_checker->size) { + T_FAIL("%s: entry extends beyond object's expected size", message); + bad_entry_attr.object_attr = true; + bad_object_attr.size_attr = true; + } + + if (obj_checker->verify.ref_count_attr && + info.ref_count != vm_region_ref_count) { + T_FAIL("%s: wrong object ref count (want %u got %u)", + message, vm_region_ref_count, info.ref_count); + bad_entry_attr.object_attr = true; + bad_object_attr.ref_count_attr = true; + } + + if (obj_checker->verify.shadow_depth_attr && + info.shadow_depth != shadow_depth) { + T_FAIL("%s: wrong object shadow depth (want %u got %u)", + message, shadow_depth, info.shadow_depth); + bad_entry_attr.object_attr = true; + bad_object_attr.shadow_depth_attr = true; + } + + /* Verify fill pattern after checking the rest of the object */ + if (!obj_checker->verify.fill_pattern_attr) { + /* fill pattern check disabled */ + } else if (bad_entry_attr.address_attr || bad_entry_attr.size_attr) { + /* don't try to verify fill if the address or size were bad */ + } else if (obj_checker->fill_pattern.mode == DontFill) { + /* no fill pattern set, don't verify it */ + } else if (!(info.protection & VM_PROT_READ)) { + /* protection disallows read, can't verify fill pattern */ + T_LOG("note: %s: can't verify fill pattern of unreadable memory (%s/%s)", + message, name_for_prot(info.protection), name_for_prot(info.max_protection)); + } else { + /* verify the fill pattern */ + mach_vm_address_t first_bad_address; + if (!verify_fill_pattern(actual_address, actual_size, + obj_checker->fill_pattern, &first_bad_address)) { + T_FAIL("%s: wrong fill at address 0x%llx " + "(expected 0x%016llx, got 0x%016llx)", + message, first_bad_address, + obj_checker->fill_pattern.pattern, + *(uint64_t *)first_bad_address); + bad_entry_attr.object_attr = true; + bad_object_attr.fill_pattern_attr = true; + } + } + } + + /* do this after checking the object */ + if (checker->verify.share_mode_attr && + !same_share_mode(&info, checker)) { + T_FAIL("%s: wrong share mode", message); + bad_entry_attr.share_mode_attr = true; + } + + /* do this after checking the object */ + if (checker->verify.pages_resident_attr && + info.pages_resident != checker->pages_resident) { + T_FAIL("%s: wrong pages resident count (want %d, got %d)", + message, checker->pages_resident, info.pages_resident); + bad_entry_attr.pages_resident_attr = true; + } + + /* + * checker->permanent can only be tested destructively. + * We don't verify it until the end of the test. 
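+ * (See checker_deallocate_allocation(), called from
+ * deallocate_vm_allocations() when the test tears down.)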
+ */ + + if (bad_entry_attr.bits != 0 || bad_object_attr.bits != 0) { + warn_bad_checker(checker, bad_entry_attr, bad_object_attr, message); + return false; + } + + return true; +} + + +/* + * Verify VM state of an address range that is + * expected to be an unallocated hole. + * Returns true if it looks correct. + * T_FAILs and logs details and returns false if it looks wrong. + */ +static bool +verify_hole(vm_entry_checker_t *checker, const char *message) +{ + bool good = true; + + assert(checker->kind == Hole); + + /* zero-size hole is always presumed valid */ + if (checker->size == 0) { + return true; + } + + mach_vm_address_t actual_address = checker->address; + mach_vm_size_t actual_size = 0; + vm_region_submap_info_data_64_t info; + if (get_info_for_address_fast(&actual_address, &actual_size, &info)) { + /* address was mapped - not a hole */ + T_FAIL("%s: expected hole is not a hole", message); + good = false; + } else if (actual_address < checker_end_address(checker)) { + /* [address, address + size) was partly mapped - not a hole */ + T_FAIL("%s: expected hole is not a hole", message); + good = false; + } else { + /* [address, address + size) was entirely unmapped */ + } + + if (!good) { + warn_bad_checker(checker, + vm_entry_attributes_with_default(true), + vm_object_attributes_with_default(true), + message); + } + return good; +} + +test_result_t +verify_vm_state_nested(checker_list_t *checker_list, bool in_submap, const char *message) +{ + bool good = true; + + if (Verbose) { + T_LOG("*** %s: verifying vm entries %s ***", + message, in_submap ? "(in submap) " : ""); + } + + vm_entry_checker_t *last_checked = NULL; + FOREACH_CHECKER(checker, checker_list->entries) { + last_checked = checker; + + switch (checker->kind) { + case Allocation: + good &= verify_allocation(checker_list, checker, message); + break; + case Hole: + good &= verify_hole(checker, message); + break; + case Submap: { + /* Verify the submap entry in the parent map. */ + good &= verify_allocation(checker_list, checker, message); + + /* Verify the submap's contents. */ + + /* + * Adjust the submap content checkers to match + * vm_region output within this submap entry. + * Undo those adjustments at end of scope. + */ + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(checker); + checker_list_tweaks_t tweaks DEFER_UNTWEAK = + submap_checkers_tweak_for_vm_region(submap_checkers, checker); + + good &= verify_vm_state_nested(submap_checkers, true, message); + break; + } + case EndEntries: + default: + assert(0); + } + } + assert(last_checked == checker_list->entries.tail); + + if (in_submap) { + /* don't dump submap alone, wait until we're back at the top level */ + } else if (!good || Verbose) { + T_LOG("*** %s: all expected ***", message); + dump_checker_range(checker_list->entries); + T_LOG("*** %s: all actual ***", message); + dump_region_info_for_entries(checker_list->entries); + } + + return good ? TestSucceeded : TestFailed; +} + +test_result_t +verify_vm_state(checker_list_t *checker_list, const char *message) +{ + assert(!checker_list->is_slid); + return verify_vm_state_nested(checker_list, false, message); +} + + +/* + * Get the expected errors for read and write faults + * inside the given checker's memory. 
+ * The signals are: + * 0 (mapped and readable / writeable) + * KERN_PROTECTION_FAILURE (mapped but not readable / writeable) + * KERN_INVALID_ADDRESS (unmapped) + */ +static void +get_expected_errors_for_faults( + vm_entry_checker_t *checker, + kern_return_t * const out_read_error, + kern_return_t * const out_write_error) +{ + switch (checker->kind) { + case Allocation: + /* mapped: error is either none or protection failure */ + switch (checker->protection & (VM_PROT_READ | VM_PROT_WRITE)) { + case VM_PROT_READ | VM_PROT_WRITE: + /* mapped, read/write */ + *out_read_error = 0; + *out_write_error = 0; + break; + case VM_PROT_READ: + /* mapped, read-only */ + *out_read_error = 0; + *out_write_error = KERN_PROTECTION_FAILURE; + break; + case VM_PROT_WRITE: + /* mapped, "write-only" but inaccessible to faults */ + *out_read_error = KERN_PROTECTION_FAILURE; + *out_write_error = KERN_PROTECTION_FAILURE; + break; + case 0: + /* mapped, inaccessible */ + *out_read_error = KERN_PROTECTION_FAILURE; + *out_write_error = KERN_PROTECTION_FAILURE; + break; + default: + T_FAIL("unexpected protection %s", name_for_prot(checker->protection)); + } + break; + case Hole: + /* unmapped: error is invalid address */ + *out_read_error = KERN_INVALID_ADDRESS; + *out_write_error = KERN_INVALID_ADDRESS; + break; + case EndEntries: + default: + assert(0); + } +} + + +static fill_pattern_t +checker_fill_pattern(vm_entry_checker_t *checker) +{ + if (checker->object == NULL) { + return (fill_pattern_t){ .mode = DontFill, .pattern = 0 }; + } + return checker->object->fill_pattern; +} + +static bool +checker_should_verify_fill_pattern(vm_entry_checker_t *checker) +{ + return checker->verify.object_attr && + checker->object != NULL && + checker->object->verify.fill_pattern_attr && + checker->object->fill_pattern.mode == Fill; +} + +/* + * Verify read and/or write faults on every page of checker's address range. 
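+ * Expected fault results come from get_expected_errors_for_faults();
+ * expected byte values come from the object's fill pattern, if any.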
+ */ +bool +verify_checker_faultability( + vm_entry_checker_t *checker, + const char *message, + bool verify_reads, + bool verify_writes) +{ + return verify_checker_faultability_in_address_range(checker, message, + verify_reads, verify_writes, checker->address, checker->size); +} + +bool +verify_checker_faultability_in_address_range( + vm_entry_checker_t *checker, + const char *message, + bool verify_reads, + bool verify_writes, + mach_vm_address_t checked_address, + mach_vm_size_t checked_size) +{ + assert(verify_reads || verify_writes); + + if (Verbose) { + const char *faults; + if (verify_reads && verify_writes) { + faults = "read and write"; + } else if (verify_reads) { + faults = "read"; + } else { + faults = "write"; + } + T_LOG("%s: trying %s faults in [0x%llx..0x%llx)", + message, faults, checked_address, checked_address + checked_size); + } + + /* range to be checked must fall within the checker */ + assert(checked_size > 0); + assert(checker_contains_address(checker, checked_address)); + assert(checker_contains_address(checker, checked_address + checked_size - 1)); + + /* read and write use the fill pattern if any */ + fill_pattern_t fill_pattern = checker_fill_pattern(checker); + bool enforce_expected_byte = checker_should_verify_fill_pattern(checker); +#if BYTE_ORDER == LITTLE_ENDIAN + uint8_t expected_byte = fill_pattern.pattern & 0xff; +#else + uint8_t expected_byte = fill_pattern.pattern >> 56; +#endif + + bool good = true; + kern_return_t expected_read_error, expected_write_error; + get_expected_errors_for_faults(checker, + &expected_read_error, &expected_write_error); + + mach_vm_address_t start = checked_address; + mach_vm_address_t end = checked_address + checked_size; + for (mach_vm_address_t addr = start; addr < end; addr += PAGE_SIZE) { + if (verify_reads) { + uint8_t actual_byte; + kern_return_t actual_read_error; + try_read_byte(addr, &actual_byte, &actual_read_error); + if (expected_read_error != actual_read_error) { + T_FAIL("%s: wrong error %d %s (expected %d %s) " + "when reading from address 0x%llx", + message, actual_read_error, name_for_kr(actual_read_error), + expected_read_error, name_for_kr(expected_read_error), addr); + good = false; + break; + } + if (enforce_expected_byte && + actual_read_error == KERN_SUCCESS && + expected_byte != actual_byte) { + T_FAIL("%s: wrong byte 0x%hhx (expected 0x%hhx) " + "read from address 0x%llx", + message, actual_byte, expected_byte, addr); + good = false; + break; + } + } + + if (verify_writes) { + kern_return_t actual_write_error; + try_write_byte(addr, expected_byte, &actual_write_error); + if (expected_write_error != actual_write_error) { + T_FAIL("%s: wrong error %d %s (expected %d %s) " + "when writing to address 0x%llx", + message, actual_write_error, name_for_kr(actual_write_error), + expected_write_error, name_for_kr(expected_write_error), addr); + good = false; + break; + } + } + } + + if (!good) { + warn_bad_checker(checker, + vm_entry_attributes_with_default(true), + vm_object_attributes_with_default(true), + message); + } + + return good; +} + + +static test_result_t +verify_vm_faultability_nested( + checker_list_t *checker_list, + const char *message, + bool verify_reads, + bool verify_writes, + bool in_submap) +{ + bool good = true; + + if (Verbose) { + T_LOG("*** %s: verifying vm faultability %s ***", + message, in_submap ? 
"(in submap) " : ""); + } + + FOREACH_CHECKER(checker, checker_list->entries) { + bool really_verify_writes = verify_writes; + + if (prot_contains_all(checker->protection, VM_PROT_READ | VM_PROT_WRITE)) { + /* + * Don't try writing to "writeable" submap allocations. + * That provokes unnesting which confuses us, because + * we don't update the checkers for that unnesting here. + * TODO: implement write fault testing in writeable submaps + */ + if (checker_is_submap(checker)) { + /* checker is parent map's submap entry with +rw */ + really_verify_writes = false; + } else if (in_submap) { + /* checker is submap contents with +rw */ + really_verify_writes = false; + } + } + + /* Read and/or write from the checker's memory. */ + + if (checker_is_submap(checker)) { + /* Verify based on submap contents. */ + T_QUIET; T_ASSERT_FALSE(in_submap, "nested submaps not allowed"); + + /* + * Adjust the submap content checkers to match + * vm_region output within this submap entry. + * Undo those adjustments at end of scope. + */ + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(checker); + checker_list_tweaks_t tweaks DEFER_UNTWEAK = + submap_checkers_tweak_for_vm_region(submap_checkers, checker); + + good &= verify_vm_faultability_nested(submap_checkers, message, + verify_reads, really_verify_writes, true /* in_submap */); + } else { + good &= verify_checker_faultability(checker, + message, verify_reads, verify_writes); + } + } + + if (in_submap) { + /* don't dump submap alone, wait until we're back at the top level */ + } else if (!good || Verbose) { + T_LOG("*** %s: all expected ***", message); + dump_checker_range(checker_list->entries); + T_LOG("*** %s: all actual ***", message); + dump_region_info_for_entries(checker_list->entries); + } + + return good ? TestSucceeded : TestFailed; +} + +test_result_t +verify_vm_faultability( + checker_list_t *checker_list, + const char *message, + bool verify_reads, + bool verify_writes) +{ + return verify_vm_faultability_nested(checker_list, message, + verify_reads, verify_writes, false /* in_submap */); +} + + +/* Inserts new_left to the left of old_right. */ +static void +checker_insert_left( + vm_entry_checker_t *new_left, + vm_entry_checker_t *old_right) +{ + assert(new_left); + assert(old_right); + + new_left->prev = old_right->prev; + new_left->next = old_right; + + old_right->prev = new_left; + if (new_left->prev) { + new_left->prev->next = new_left; + } +} + +/* Inserts new_right to the right of old_left. */ +static void +checker_insert_right( + vm_entry_checker_t *old_left, + vm_entry_checker_t *new_right) +{ + assert(old_left); + assert(new_right); + + new_right->prev = old_left; + new_right->next = old_left->next; + + old_left->next = new_right; + if (new_right->next) { + new_right->next->prev = new_right; + } +} + +/* + * Split a checker into two checkers at an address. + * On entry, the checker has already been cloned into two identical checkers. + * This function modifies the clones to make two separate checkers. 
+ */ +static void +checker_split_clones( + vm_entry_checker_t *left, + vm_entry_checker_t *right, + mach_vm_address_t split) +{ + mach_vm_address_t start = left->address; + mach_vm_address_t end = checker_end_address(left); + + assert(split > start); + assert(split < end); + + assert(left->next == right); + assert(right->prev == left); + + left->address = start; + left->size = split - start; + right->address = split; + right->size = end - split; + + right->object_offset = left->object_offset + left->size; +} + +vm_entry_checker_t * +checker_clip_right( + checker_list_t *list, + vm_entry_checker_t *left, + mach_vm_address_t split) +{ + if (split > left->address && split < checker_end_address(left)) { + vm_entry_checker_t *right = checker_clone(left); + checker_insert_right(left, right); + checker_split_clones(left, right, split); + if (list && list->entries.tail == left) { + list->entries.tail = right; + } + return right; + } + return NULL; +} + +vm_entry_checker_t * +checker_clip_left( + checker_list_t *list, + vm_entry_checker_t *right, + mach_vm_address_t split) +{ + if (split > right->address && split < checker_end_address(right)) { + vm_entry_checker_t *left = checker_clone(right); + checker_insert_left(left, right); + checker_split_clones(left, right, split); + if (list && list->entries.head == right) { + list->entries.head = left; + } + return left; + } + return NULL; +} + +static entry_checker_range_t +checker_list_try_find_range_including_holes( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + mach_vm_address_t end = start + size; + vm_entry_checker_t *first = NULL; + vm_entry_checker_t *last = NULL; + + assert(start >= list->entries.head->address); + assert(end <= checker_end_address(list->entries.tail)); + assert(end >= start); + + FOREACH_CHECKER(checker, list->entries) { + /* find the first entry that ends after the start address */ + if (checker_end_address(checker) > start && !first) { + first = checker; + } + /* find the last entry that begins before the end address */ + if (checker->address < end) { + last = checker; + } + } + + return (entry_checker_range_t){ first, last }; +} + +entry_checker_range_t +checker_list_find_range_including_holes( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + entry_checker_range_t result = + checker_list_try_find_range_including_holes(list, start, size); + vm_entry_checker_t *first = result.head; + vm_entry_checker_t *last = result.tail; + + assert(first && last); + assert(first->address <= last->address); + + return result; +} + +entry_checker_range_t +checker_list_find_range( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + entry_checker_range_t result = + checker_list_find_range_including_holes(list, start, size); + + FOREACH_CHECKER(checker, result) { + assert(checker->kind != Hole); + } + + return result; +} + +vm_entry_checker_t * +checker_list_find_checker(checker_list_t *list, mach_vm_address_t addr) +{ + entry_checker_range_t found = + checker_list_try_find_range_including_holes(list, addr, 0); + vm_entry_checker_t *checker = found.head; + + if (!checker) { + return NULL; + } + if (addr < checker->address || addr >= checker_end_address(checker)) { + return NULL; + } + + return checker; +} + +vm_entry_checker_t * +checker_list_find_allocation(checker_list_t *list, mach_vm_address_t addr) +{ + vm_entry_checker_t *checker = checker_list_find_checker(list, addr); + + if (checker->kind != Allocation) { + return NULL; + } + + return checker; +} + 
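+/*
+ * Find the checkers covering [start, start+size) and clip the first
+ * and last of them so that the returned range starts and ends exactly
+ * at the requested addresses. The range must not contain holes
+ * (checker_list_find_range() asserts that).
+ */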
+entry_checker_range_t +checker_list_find_and_clip( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + entry_checker_range_t limit = checker_list_find_range(list, start, size); + checker_clip_left(list, limit.head, start); + checker_clip_right(list, limit.tail, start + size); + return limit; +} + +entry_checker_range_t +checker_list_find_and_clip_including_holes( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + mach_vm_address_t end = start + size; + entry_checker_range_t limit = + checker_list_find_range_including_holes(list, start, size); + + if (checker_contains_address(limit.head, start)) { + checker_clip_left(list, limit.head, start); + assert(limit.head->address == start); + } + if (checker_contains_address(limit.tail, end)) { + checker_clip_right(list, limit.tail, end); + assert(checker_end_address(limit.tail) == end); + } + + return limit; +} + +static bool +can_simplify_kind(vm_entry_checker_t *left, vm_entry_checker_t *right) +{ + return (left->kind == Allocation && right->kind == Allocation) || + (left->kind == Submap && right->kind == Submap); +} + +void +checker_simplify_left( + checker_list_t *list, + vm_entry_checker_t *right) +{ + vm_entry_checker_t *left = right->prev; + if (!left) { + return; + } + if (can_simplify_kind(left, right) && + left->protection == right->protection && + left->max_protection == right->max_protection && + left->inheritance == right->inheritance && + left->behavior == right->behavior && + left->user_wired_count == right->user_wired_count && + left->user_tag == right->user_tag && + left->submap_depth == right->submap_depth && + left->object == right->object && + left->object_offset + left->size == right->object_offset && + left->permanent == right->permanent) { + /* kill left and keep right so the simplify loop works unimpeded */ + right->address = left->address; + right->size += left->size; + right->object_offset = left->object_offset; + + /* update other properties that may differ */ + + if (left->verify.pages_resident_attr != right->verify.pages_resident_attr) { + T_LOG("note: can't verify page counts after simplify " + "merged two entries with different page count verification"); + } + right->pages_resident += left->pages_resident; + + /* + * unlink and free left checker + * update the checker list if we are deleting its head + */ + right->prev = left->prev; + if (left->prev) { + left->prev->next = right; + } + if (list->entries.head == left) { + list->entries.head = right; + } + checker_free(left); + } +} + +void +checker_list_simplify( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + mach_vm_address_t end = start + size; + entry_checker_range_t limit = checker_list_find_range_including_holes(list, start, size); + + /* vm_map_simplify_range() also includes any entry that starts at `end` */ + if (limit.tail && limit.tail->next && limit.tail->next->address == end) { + limit.tail = limit.tail->next; + } + + FOREACH_CHECKER(checker, limit) { + checker_simplify_left(list, checker); + } +} + +void +checker_list_replace_range( + checker_list_t *list, + entry_checker_range_t old_range, + entry_checker_range_t new_range) +{ + /* old_range and new_range must coincide */ + assert(checker_range_start_address(old_range) == checker_range_start_address(new_range)); + assert(checker_range_end_address(old_range) == checker_range_end_address(new_range)); + + /* + * Unlink old_range and link in new_range. + * Update list->entries if necessary. + * + * before: ... 
prev old_range next ... + * after: ... prev new_range next ... + * a.k.a: ... prev new_left ... new_right next ... + */ + vm_entry_checker_t *prev = old_range.head->prev; + vm_entry_checker_t *new_left = new_range.head; + new_left->prev = prev; + if (prev) { + prev->next = new_left; + } else { + list->entries.head = new_left; + } + + vm_entry_checker_t *next = old_range.tail->next; + vm_entry_checker_t *new_right = new_range.tail; + new_right->next = next; + if (next) { + next->prev = new_right; + } else { + list->entries.tail = new_right; + } + + /* Destroy the removed entries. */ + /* TODO: update checker state to account for the removal? */ + checker_range_free(old_range); +} + +void +checker_list_free_range( + checker_list_t *list, + entry_checker_range_t range) +{ + /* Make a new hole checker covering the removed range. */ + vm_entry_checker_t *new_hole = make_checker_for_hole( + checker_range_start_address(range), + checker_range_size(range)); + entry_checker_range_t new_range = { new_hole, new_hole }; + + /* Remove checkers in the old range and insert the new hole. */ + checker_list_replace_range(list, range, new_range); +} + + +static bool +checker_has_null_vm_object(vm_entry_checker_t *checker) +{ + return object_is_null(checker->object); +} + +void +checker_resolve_null_vm_object( + checker_list_t *checker_list, + vm_entry_checker_t *checker) +{ + if (checker_has_null_vm_object(checker)) { + /* entry's object offset is reset to zero */ + checker->object_offset = 0; + + /* entry gets a new object */ + vm_object_checker_t *obj_checker = + make_anonymous_object_checker(checker_list, checker->size); + checker_set_object(checker, obj_checker); + + /* don't know the object's id yet, but we know it isn't zero */ + obj_checker->object_id_mode = object_has_unknown_nonnull_id; + } +} + +void +checker_fault_for_prot_not_cow( + checker_list_t *checker_list, + vm_entry_checker_t *checker, + vm_prot_t fault_prot) +{ + assert(fault_prot != VM_PROT_NONE); + + /* write fault also requires read permission */ + vm_prot_t required_prot = fault_prot; + if (prot_contains_all(required_prot, VM_PROT_WRITE)) { + required_prot |= VM_PROT_READ; + } + if (!prot_contains_all(checker->protection, required_prot)) { + /* access denied */ + return; + } + + checker_resolve_null_vm_object(checker_list, checker); + if (fault_prot & VM_PROT_WRITE) { + /* cow resolution is hard, don't try it here */ + assert(checker_share_mode(checker) != SM_COW); + } + + /* entry is 100% resident */ + checker_set_pages_resident(checker, checker->size / PAGE_SIZE); +} + +vm_entry_checker_t * +checker_list_try_unnest_one_entry_in_submap( + checker_list_t *checker_list, + vm_entry_checker_t *submap_parent, + bool unnest_readonly, + bool all_overwritten, + mach_vm_address_t * const inout_next_address) +{ + mach_vm_address_t unnest_start; + mach_vm_address_t unnest_end; + vm_entry_checker_t *unnested_checker; + vm_prot_t submap_protection; + vm_prot_t submap_max_protection; + vm_object_checker_t *obj_checker; + + { + /* Find the checker for the entry inside the submap at this parent map address. */ + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(submap_parent); + vm_entry_checker_t *submap_content = + checker_list_find_checker(submap_checkers, *inout_next_address); + + /* Compute the range to be unnested if required, and advance past it. 
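+ * The range is the submap entry's bounds clamped to the
+ * parent map entry.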
*/ + unnest_start = submap_content->address; + unnest_end = checker_end_address(submap_content); + clamp_start_end_to_checker(&unnest_start, &unnest_end, submap_parent); + *inout_next_address = unnest_end; + + /* Return now if the submap content does not need to be unnested. */ + switch (submap_content->kind) { + case Allocation: + if (!(submap_content->protection & VM_PROT_WRITE) && !unnest_readonly) { + /* + * Allocation is read-only and unnest_readonly is not set. + * Don't unnest this. + */ + return NULL; + } + break; + case Hole: + /* Unallocated in submap. Don't unnest. */ + return NULL; + case Submap: + assert(0 && "nested submaps not allowed"); + default: + assert(0 && "unknown checker kind"); + } + + submap_protection = submap_content->protection; + submap_max_protection = submap_content->max_protection; + obj_checker = submap_content->object; + + /* + * Unslide the submap checkers now at end of scope. + * Changing the submap parent map entry from a submap + * to an allocation (below) may leave the submap checkers + * unreferenced and thus deallocated. + */ + } + + /* Clip the submap parent to the unnest bounds. */ + checker_clip_left(checker_list, submap_parent, unnest_start); + checker_clip_right(checker_list, submap_parent, unnest_end); + + /* + * unnested_checker (nee submap_parent) now matches the unnesting bounds. + * Change its object and other attributes to become the unnested entry. + * (this matches the behavior of vm_map_lookup_and_lock_object(), + * which also edits the parent map entry in place) + */ + + unnested_checker = submap_parent; + unnested_checker->kind = Allocation; + + /* + * Set unnested_checker's protection and inheritance. + * Copied from vm_map_lookup_and_lock_object. + */ + if (unnested_checker->protection != VM_PROT_READ) { + /* + * Someone has already altered the top entry's + * protections via vm_protect(VM_PROT_COPY). + * Respect these new values and ignore the + * submap entry's protections. + */ + } else { + /* + * Regular copy-on-write: propagate the submap + * entry's protections to the top map entry. + */ + unnested_checker->protection |= submap_protection; + } + unnested_checker->max_protection |= submap_max_protection; + if (unnested_checker->inheritance == VM_INHERIT_SHARE) { + unnested_checker->inheritance = VM_INHERIT_COPY; + } + + /* + * Set unnested_checker's vm object. + * unnesting is a copy-on-write copy, but in our + * tests it is sometimes immediately overwritten so we skip that step. 
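+ * Below: a null object is resolved (or left null); a writeable object
+ * that will be fully overwritten is cloned without COW; anything else
+ * gets a shadow object to model the COW copy.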
+ */ + checker_set_object(unnested_checker, obj_checker); + bool is_null = object_is_null(obj_checker); + if (is_null && all_overwritten) { + checker_resolve_null_vm_object(checker_list, unnested_checker); + } else if (is_null) { + /* no object change */ + } else if (all_overwritten && (submap_protection & VM_PROT_WRITE)) { + /* writeable and will be overwritten - skip COW representation */ + obj_checker = object_checker_clone(obj_checker); + checker_list_append_object(checker_list, obj_checker); + unnested_checker->needs_copy = false; + checker_set_object(unnested_checker, obj_checker); + unnested_checker->object_offset = 0; + } else { + /* won't be overwritten - model a COW copy */ + checker_make_shadow_object(checker_list, unnested_checker); + } + + /* TODO: tpro, permanent, VM_PROT_EXEC */ + + assert(*inout_next_address == checker_end_address(unnested_checker)); + + return unnested_checker; +} + +__attribute__((overloadable)) +vm_config_t * +make_vm_config( + const char *name, + vm_entry_template_t *entry_templates, + vm_object_template_t *object_templates, + vm_entry_template_t *submap_entry_templates, + vm_object_template_t *submap_object_templates, + mach_vm_size_t start_adjustment, + mach_vm_size_t end_adjustment, + mach_vm_size_t alignment_mask) +{ + /* + * Allocate a new vm_config_t and populate it with + * copies of the name string and all of the templates. + */ + vm_config_t *result = calloc(sizeof(vm_config_t), 1); + + result->config_name = strdup(name); + result->start_adjustment = start_adjustment; + result->end_adjustment = end_adjustment; + result->alignment_mask = alignment_mask; + + /* memcpy the templates */ + +#define COPY_TEMPLATE_LIST(T) \ + unsigned T##_template_count = count_##T##_templates(T##_templates); \ + size_t T##_template_bytes = T##_template_count * sizeof(T##_templates[0]); \ + result->T##_templates = calloc(1, T##_template_bytes); \ + result->T##_template_count = T##_template_count; \ + memcpy(result->T##_templates, T##_templates, T##_template_bytes) + + COPY_TEMPLATE_LIST(entry); + COPY_TEMPLATE_LIST(object); + COPY_TEMPLATE_LIST(submap_entry); + COPY_TEMPLATE_LIST(submap_object); + + /* fix up the pointers inside the templates */ + /* TODO: use indexes instead of pointers so that they don't need fixup */ + +#define ASSERT_IS_WITHIN(ptr, array, array_count) \ + assert((ptr) >= (array) && (ptr) < (array) + (array_count)) + + for (unsigned i = 0; i < result->entry_template_count; i++) { + vm_entry_template_t *tmpl = &result->entry_templates[i]; + if (tmpl->object) { + /* fix up entry's object to point into the copied templates */ + ASSERT_IS_WITHIN(tmpl->object, object_templates, object_template_count); + tmpl->object = &result->object_templates[tmpl->object - object_templates]; + } + } + for (unsigned i = 0; i < result->submap_entry_template_count; i++) { + vm_entry_template_t *tmpl = &result->submap_entry_templates[i]; + if (tmpl->object) { + /* fix up submap entry's object to point into the copied submap templates */ + ASSERT_IS_WITHIN(tmpl->object, submap_object_templates, submap_object_template_count); + tmpl->object = &result->submap_object_templates[tmpl->object - submap_object_templates]; + } + } + for (unsigned i = 0; i < result->object_template_count; i++) { + vm_object_template_t *tmpl = &result->object_templates[i]; + if (tmpl->kind != SubmapObject) { + continue; + } + /* fix up submap's template lists to point into the copied submap templates */ + assert(tmpl->submap.entries); /* submap must contain at least one entry */ + 
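+ /* an entry_count or object_count of zero means "the rest of the template list"; both are resolved below */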
ASSERT_IS_WITHIN(tmpl->submap.entries, submap_entry_templates, submap_entry_template_count); + ptrdiff_t submap_index = tmpl->submap.entries - submap_entry_templates; + tmpl->submap.entries = &result->submap_entry_templates[submap_index]; + if (tmpl->submap.entry_count == 0) { + tmpl->submap.entry_count = submap_entry_template_count - submap_index; + } + assert(submap_index + tmpl->submap.entry_count <= submap_entry_template_count); + + if (tmpl->submap.objects) { + ASSERT_IS_WITHIN(tmpl->submap.objects, submap_object_templates, submap_object_template_count); + ptrdiff_t object_index = tmpl->submap.objects - submap_object_templates; + tmpl->submap.objects = &result->submap_object_templates[object_index]; + if (tmpl->submap.object_count == 0) { + tmpl->submap.object_count = submap_object_template_count - object_index; + } + assert(object_index + tmpl->submap.object_count <= submap_object_template_count); + } + } + for (unsigned i = 0; i < result->submap_object_template_count; i++) { + /* no fixups needed inside submap_object_templates, they can't be nested submap objects */ + vm_object_template_t *tmpl = &result->submap_object_templates[i]; + assert(tmpl->kind != SubmapObject); + } + +#undef ASSERT_IS_WITHIN + + return result; +} + + +static void +free_vm_config(vm_config_t *config) +{ + free(config->entry_templates); + free(config->object_templates); + free(config->config_name); + free(config); +} + + +/* + * templates are initialized by vm_configurator_init() + * because PAGE_SIZE is not a compile-time constant + */ +vm_object_template_t END_OBJECTS; +vm_entry_template_t END_ENTRIES = {}; +vm_entry_template_t guard_entry_template = {}; +vm_entry_template_t hole_template = {}; + +__attribute__((constructor)) +static void +vm_configurator_init(void) +{ + /* + * Set Verbose if environment variable VERBOSE is set. + * Also set verbose_exc_helper to match. + */ + char *env_verbose = getenv("VERBOSE"); + if (env_verbose) { + if (0 == strcasecmp(env_verbose, "0") || + 0 == strcasecmp(env_verbose, "false") || + 0 == strcasecmp(env_verbose, "no")) { + /* + * VERBOSE is set to something false-ish like "NO". + * Don't enable it. + */ + } else { + Verbose = true; + } + } + + verbose_exc_helper = Verbose; + + /* + * Verify some preconditions about page sizes. + * These would be static_asserts but PAGE_SIZE isn't constant. + */ + assert(DEFAULT_PARTIAL_ENTRY_SIZE > 0); + assert(DEFAULT_PARTIAL_ENTRY_SIZE / 2 > 0); + + /* + * Initialize some useful templates. + * These would be static initializers but PAGE_SIZE isn't constant. + */ + guard_entry_template = vm_entry_template( + .protection = 0, .max_protection = 0, + .user_tag = VM_MEMORY_GUARD /* 31 */); + hole_template = + vm_entry_template(.kind = Hole); + END_ENTRIES = + vm_entry_template(.kind = EndEntries); + END_OBJECTS = (vm_object_template_t){.kind = EndObjects, .size = 0}; + + /* + * Initialize fault exception and guard exception handlers. + * Do this explicitly in the hope of avoiding memory allocations + * inside our unallocated address ranges later. + */ + exc_guard_helper_init(); + { + static const char unwriteable = 1; + kern_return_t kr; + bool succeeded = try_write_byte((mach_vm_address_t)&unwriteable, 0, &kr); + assert(!succeeded); + assert(kr == KERN_PROTECTION_FAILURE); + } + + /* + * host_priv is looked up lazily so we don't + * unnecessarily fail tests that don't need it. 
+ */ +} + +test_result_t +test_is_unimplemented( + checker_list_t *checker_list __unused, + mach_vm_address_t start __unused, + mach_vm_size_t size __unused) +{ + T_FAIL("don't call test_is_unimplemented()"); + return TestFailed; +} + +void +run_one_vm_test( + const char *filename, + const char *funcname, + const char *testname, + configure_fn_t configure_fn, + test_fn_t test_fn) +{ + vm_config_t *config; + checker_list_t *checker_list; + mach_vm_address_t vm_state_start_address; + mach_vm_address_t vm_state_end_address; + mach_vm_address_t test_fn_start_address; + mach_vm_address_t test_fn_end_address; + test_result_t result; + + const char *short_filename = strstr(filename, "tests/") ?: filename; + + if (test_fn == NULL) { + /* vm_tests_t field not set. The test file needs to be updated. */ + T_FAIL("test %s.%s not present in test file %s; please write it", + funcname, testname, short_filename); + return; + } else if (test_fn == test_is_unimplemented) { + /* Test is deliberately not implemented. */ + T_PASS("unimplemented test: %s %s %s", + short_filename, funcname, testname); + return; + } + + /* Prepare the VM state. */ + config = configure_fn(); + T_LOG("note: starting test: %s %s (%s) ...", funcname, testname, config->config_name); + + create_vm_state_from_config(config, &checker_list, + &test_fn_start_address, &test_fn_end_address); + vm_state_start_address = checker_range_start_address(checker_list->entries); + vm_state_end_address = checker_range_end_address(checker_list->entries); + + if (vm_state_start_address != test_fn_start_address || + vm_state_end_address != test_fn_end_address) { + T_LOG("note: prepared vm state is 0x%llx..0x%llx; calling tested function on 0x%llx..0x%llx", + vm_state_start_address, vm_state_end_address, + test_fn_start_address, test_fn_end_address); + } else { + T_LOG("note: prepared vm state is 0x%llx..0x%llx; calling tested function on the entire range", + vm_state_start_address, vm_state_end_address); + } + + /* Run the test. */ + result = test_fn(checker_list, test_fn_start_address, + test_fn_end_address - test_fn_start_address); + + /* + * Verify and/or deallocate depending on the initial test result. + * These operations may change the result to a failure. + */ + switch (result) { + case TestSucceeded: + /* + * Verify one more time, then perform + * destructive verifications and deallocate. + */ + result = verify_vm_state(checker_list, "after test"); + if (result == TestSucceeded) { + result = verify_vm_faultability(checker_list, "final faultability check", true, true); + } + if (result == TestSucceeded) { + deallocate_vm_allocations(checker_list); + result = verify_vm_state(checker_list, "after final deallocation"); + } + break; + case TestFailed: + /* + * we don't attempt to deallocate after a failure + * because we don't know where the real allocations are + */ + break; + } + + checker_list_free(checker_list); + + /* Report the final test result. */ + if (result == TestFailed) { + /* executable name is basename(short_filename) minus ".c" suffix */ + const char *exe_name = strrchr(short_filename, '/'); + exe_name = exe_name ? exe_name + 1 : short_filename; + int exe_name_len = strrchr(exe_name, '.') - exe_name; + const char *arch_cmd = isRosetta() ? 
"arch -x86_64 " : ""; + T_FAIL("%s %s %s (%s) failed above; run it locally with `env VERBOSE=1 %s%.*s -n %s %s`", + short_filename, funcname, testname, config->config_name, + arch_cmd, exe_name_len, exe_name, funcname, testname); + } else { + T_PASS("%s %s %s (%s)", + short_filename, funcname, testname, config->config_name); + } + + free_vm_config(config); +} diff --git a/tests/vm/configurator/vm_configurator.h b/tests/vm/configurator/vm_configurator.h new file mode 100644 index 000000000..df4fc44f9 --- /dev/null +++ b/tests/vm/configurator/vm_configurator.h @@ -0,0 +1,1522 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm_configurator.h + * + * Generator and checker of userspace virtual memory configurations. + */ + +#ifndef VM_CONFIGURATOR_H +#define VM_CONFIGURATOR_H + +/* + * -- Dramatis personae -- + * + * vm_entry_template_t + * Specification of a VM entry to create, + * or a hole in VM address space to skip over. + * Used to describe and create the VM state for the start of a test. + * + * vm_object_template_t + * Specification of a VM object to create for entries to copy or share. + * Used to describe and create the VM state for the start of a test. + * + * vm_config_t + * Specification of one or more contiguous VM entries, + * plus a test name and an address range within that VM + * space that is the range to be tested. + * Used to describe and create the VM state for the start of a test. + * + * vm_entry_checker_t + * Describes the expected state of a VM entry or a hole, + * and verifies that the live VM state matches the expected state. + * Updated by test code as test operations are performed. + * Used to verify the VM state during and after a test. + * + * vm_object_checker_t + * Describes the expected state of a VM object + * and verifies that the live VM state matches the expected state + * Updated by test code as test operations are performed. + * Used to verify the VM state during and after a test. + * + * -- Outline of a test -- + * + * 1. Describe the desired initial memory state + * with arrays of vm_entry_template_t and vm_object_template_t. + * 2. 
Call create_vm_state() to allocate the specified VM entries + * and lists of vm_entry_checker_t and vm_object_checker_t + * that match the newly-allocated state. + * 3. Perform the VM operations to be tested. Update the checkers + * with the state changes that you expect. If some field's value + * becomes indeterminate, or difficult to specify and unimportant + * for your test, disable that field in the checker. + * 4. Call verify_vm_state() to compare the live + * VM state to the checker's expected state. + * 5. Optionally repeat steps 3 and 4 to test a sequence of VM operations. + * + * See vm_configurator_tests.h for a set of templates used by + * many VM syscall tests, and some details on how to run them. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Set Verbose = true to log the complete VM state, both expected and actual, + * every time it is checked. + * Initialized from environment variable VERBOSE + */ +extern bool Verbose; + +/* + * Return values from individual test functions. + * These are ordered from "best" to "worst". + * + * TODO: docs + */ +typedef enum { + TestSucceeded = 1, + TestFailed, +} test_result_t; + +static inline test_result_t +worst_result(test_result_t *list, unsigned count) +{ + test_result_t worst = TestSucceeded; + for (unsigned i = 0; i < count; i++) { + if (list[i] > worst) { + worst = list[i]; + } + } + return worst; +} + +typedef enum { + DontFill = 0, /* must be zero */ + Fill = 1 +} fill_pattern_mode_t; + +typedef struct { + fill_pattern_mode_t mode; + uint64_t pattern; +} fill_pattern_t; + +/* + * EndObjects: for END_OBJECTS array terminator + * Deinited: an object that is no longer referenced and whose checker is now + * depopulated but is still allocated because some checker list may point to it + * Anonymous: anonymous memory such as vm_allocate() + * SubmapObject: an "object" that is really a submap + * TODO: support named/pageable objects + */ +typedef enum { + FreedObject = 0, /* use after free, shouldn't happen */ + EndObjects, + Deinited, + Anonymous, + SubmapObject, +} vm_object_template_kind_t; + +/* + * struct vm_object_template_t + * Declaratively specify VM objects to be created. + */ +typedef struct vm_object_template_s { + vm_object_template_kind_t kind; + + mach_vm_size_t size; /* size 0 means auto-compute from entry sizes */ + + fill_pattern_t fill_pattern; + struct { + struct vm_entry_template_s *entries; + struct vm_object_template_s *objects; + unsigned entry_count; + unsigned object_count; + } submap; +} vm_object_template_t; + +/* + * Convenience macro for initializing a vm_object_template_t. + * The macro sets all template fields to a default value. + * You may override any field using designated initializer syntax. + * + * Example usage: + * // all default values + * vm_object_template() + * + * // default, with custom size and fill pattern + * vm_object_template( + * .size = 20 * PAGE_SIZE, + * .fill_pattern = 0x1234567890abcdef) + */ +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#define vm_object_template(...) 
\ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Winitializer-overrides\"") \ + _Pragma("clang diagnostic ignored \"-Wmissing-field-initializers\"") \ + (vm_object_template_t){ \ + .size = 0, /* auto-computed */ \ + .kind = Anonymous, \ + .fill_pattern = {.mode = DontFill}, \ + __VA_ARGS__ \ + } \ + _Pragma("clang diagnostic pop") + +/* Convenience for submap objects */ +#define submap_object_template(...) \ + vm_object_template(.kind = SubmapObject, __VA_ARGS__) + +/* + * EndEntries: for END_ENTRIES array terminator + * Allocation: an ordinary VM entry + * Hole: an unallocated range of the address space. + * Submap: a mapping of a submap + */ +typedef enum { + EndEntries = 0, + Allocation, + Hole, + Submap, +} vm_entry_template_kind_t; + +/* + * struct vm_entry_template_t + * Declaratively specify VM entries to be created. + */ +typedef struct vm_entry_template_s { + mach_vm_size_t size; + vm_entry_template_kind_t kind; + + /* + * NULL object means either null vm_object_t or anonymous zerofilled + * memory, depending on the requirements of the other settings. + * (For example, non-zero wire count faults in the pages + * so it is no longer a null vm_object_t.) + * Used when .kind == Allocation. + */ + vm_object_template_t *object; + + mach_vm_offset_t offset; + + vm_prot_t protection; + vm_prot_t max_protection; + vm_inherit_t inheritance; + vm_behavior_t behavior; + bool permanent; + + /* New entry gets vm_wire'd this many times. */ + uint16_t user_wired_count; + + /* + * User tag may be a specific value, or autoincrementing. + * + * An autoincrementing tag is assigned by create_vm_state() + * in the VM_MEMORY_APPLICATION_SPECIFIC_1-16 range. Adjacent + * autoincrementing entries get distinct tags. This can be + * used to stop the VM from simplifying/coalescing vm entries + * that you want to remain separate. + */ + uint16_t user_tag; +#define VM_MEMORY_TAG_AUTOINCREMENTING 256 + + uint8_t share_mode; + + /* + * Code to update when adding new fields: + * vm_entry_template() macro + * create_vm_state() function + */ +} vm_entry_template_t; + + +/* + * Default size for vm_entries created by this generator + * Some tests require that this be above some minimum. + * 64 * PAGE_SIZE is big enough that 1/4 of an entry is + * still over the 32KB physical copy limit inside vm_map_copyin. + */ +#define DEFAULT_ENTRY_SIZE (64 * (mach_vm_address_t)PAGE_SIZE) + +/* + * Default size for address ranges that cover only part of a vm_entry. + * Some tests require that this be above some minimum. + */ +#define DEFAULT_PARTIAL_ENTRY_SIZE (DEFAULT_ENTRY_SIZE / 2u) + +/* + * Unnesting of submap nested pmaps occurs at L[N-1] page table + * boundaries (pmap "twig"). By default we avoid crossing those + * boundaries in tests because it affects the unnested map entries + * in the parent map. + * TODO: don't hardcode this, get it from pmap somehow + */ +#define SUBMAP_ALIGNMENT_MASK (0x2000000ull - 1) + +/* + * Convenience macro for initializing a vm_entry_template_t. + * The macro sets all template fields to a default value. + * You may override any field using designated initializer syntax. + * + * Example usage: + * // all default values + * vm_entry_template() + * + * // default, with custom size and protections + * vm_entry_template( + * .size = 20 * PAGE_SIZE, + * .protection = VM_PROT_READ, + * .max_protection = VM_PROT_READ | VM_PROT_WRITE) + */ +#define vm_entry_template(...) 
\ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Winitializer-overrides\"") \ + _Pragma("clang diagnostic ignored \"-Wmissing-field-initializers\"") \ + (vm_entry_template_t){ \ + .size = DEFAULT_ENTRY_SIZE, \ + .kind = Allocation, \ + .object = NULL, \ + .offset = 0, \ + .protection = VM_PROT_READ | VM_PROT_WRITE, \ + .max_protection = VM_PROT_READ | VM_PROT_WRITE, \ + .inheritance = VM_INHERIT_DEFAULT, /* inherit_copy */ \ + .behavior = VM_BEHAVIOR_DEFAULT, \ + .permanent = false, \ + .user_wired_count = 0, \ + .user_tag = VM_MEMORY_TAG_AUTOINCREMENTING, \ + .share_mode = SM_EMPTY, \ + __VA_ARGS__ \ + } \ + _Pragma("clang diagnostic pop") + +/* Convenience for submap entries */ +#define submap_entry_template(...) \ + vm_entry_template(.kind = Submap, __VA_ARGS__) + +/* + * Convenience templates. + * END_ENTRIES and END_OBJECTS: terminates a template list + * passed to create_vm_state() instead of passing an array count. + * (useful for hand-written template array initializers) + * guard_entry_template: an allocation that defaults to + * prot/max NONE/NONE and tag VM_MEMORY_GUARD + * hole_template: an unallocated hole in the address space. + */ +extern vm_object_template_t END_OBJECTS; +extern vm_entry_template_t END_ENTRIES; +extern vm_entry_template_t guard_entry_template; +extern vm_entry_template_t hole_template; + +/* + * Count the number of templates in an END_TEMPLATE-terminated array. + */ +extern unsigned +count_templates(const vm_entry_template_t *templates); + + +/* + * struct vm_entry_attribute_list_t + * A list of checkable entry attributes with one bool for each. + * Used to record which attributes should be verified by a checker, + * or which attributes failed to match during verification. + */ +typedef struct { + union { + uint64_t bits; + struct { + uint64_t address_attr:1; + uint64_t size_attr:1; + uint64_t object_attr:1; + uint64_t protection_attr:1; + uint64_t max_protection_attr:1; + uint64_t inheritance_attr:1; + uint64_t behavior_attr:1; + uint64_t permanent_attr:1; + uint64_t user_wired_count_attr:1; + uint64_t user_tag_attr:1; + uint64_t is_submap_attr:1; + uint64_t submap_depth_attr:1; + uint64_t object_offset_attr:1; + uint64_t pages_resident_attr:1; + uint64_t share_mode_attr:1; + }; + }; + + /* + * Code to update when adding new fields: + * dump_checker_info() + * vm_entry_attributes_with_default macro + * verify_allocation() + */ +} vm_entry_attribute_list_t; + +/* + * struct vm_object_attribute_list_t + * A list of checkable entry attributes with one bool for each. + * Used to record which attributes should be verified by a checker, + * or which attributes failed to match during verification. + */ +typedef struct { + union { + uint64_t bits; + struct { + uint64_t object_id_attr:1; + uint64_t size_attr:1; + uint64_t ref_count_attr:1; + uint64_t shadow_depth_attr:1; + uint64_t fill_pattern_attr:1; + }; + }; + + /* + * Code to update when adding new fields: + * dump_checker_info() + * vm_object_attributes_with_default macro + * verify_allocation() + */ +} vm_object_attribute_list_t; + +/* + * vm_entry_attributes_with_default() returns a vm_entry_attribute_list_t, + * with all attributes set to `default_value`, and the caller can set individual + * attributes to other values using designated initializer syntax. + */ +#define vm_entry_attributes_with_default(default_value, ...) 
\ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Winitializer-overrides\"") \ + _Pragma("clang diagnostic ignored \"-Wmissing-field-initializers\"") \ + (vm_entry_attribute_list_t){ \ + .address_attr = (default_value), \ + .size_attr = (default_value), \ + .object_attr = (default_value), \ + .protection_attr = (default_value), \ + .max_protection_attr = (default_value), \ + .inheritance_attr = (default_value), \ + .behavior_attr = (default_value), \ + .permanent_attr = (default_value), \ + .user_wired_count_attr = (default_value), \ + .user_tag_attr = (default_value), \ + .is_submap_attr = (default_value), \ + .submap_depth_attr = (default_value), \ + .object_offset_attr = (default_value), \ + .pages_resident_attr = (default_value), \ + .share_mode_attr = (default_value), \ + __VA_ARGS__ \ + } \ + _Pragma("clang diagnostic pop") + +/* + * vm_object_attributes_with_default() returns a vm_object_attribute_list_t, + * with all attributes set to `default_value`, and the caller can set individual + * attributes to other values using designated initializer syntax. + */ +#define vm_object_attributes_with_default(default_value, ...) \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Winitializer-overrides\"") \ + _Pragma("clang diagnostic ignored \"-Wmissing-field-initializers\"") \ + (vm_object_attribute_list_t){ \ + .object_id_attr = (default_value), \ + .size_attr = (default_value), \ + .ref_count_attr = (default_value), \ + .shadow_depth_attr = (default_value), \ + .fill_pattern_attr = (default_value), \ + __VA_ARGS__ \ + } \ + _Pragma("clang diagnostic pop") + +/* + * Description of a checker's current knowledge of an object's ID. + * object_is_unknown: object'd ID is unknown; it may be null + * object_has_unknown_nonnull_id: object's ID is expected to be non-null, + * but its actual value is unknown + * object_has_known_id: object's ID is expected to be checker->object_id + * + * During verification unknown object IDs are learned by reading them from the + * actual VM state. The learned IDs are applied to subsequent verifications or + * to subsequent uses of the same object in the same verification. + */ +typedef enum { + object_is_unknown = 0, + object_has_unknown_nonnull_id, + object_has_known_id +} object_id_mode_t; + +/* + * struct vm_object_checker_t + * Maintain and verify expected state of a VM object. + */ +typedef struct vm_object_checker_s { + struct vm_object_checker_s *prev; + struct vm_object_checker_s *next; + + vm_object_template_kind_t kind; + vm_object_attribute_list_t verify; + bool deinited; + + uint64_t object_id; + object_id_mode_t object_id_mode; + + /* + * This is the count of references to this object specifically. + * vm_region's reported ref_count also includes references to + * the shadow chain's objects, minus the shadow chain's references + * to each other. + */ + unsigned self_ref_count; + mach_vm_size_t size; + fill_pattern_t fill_pattern; + + /* + * Shadow chain. + * object->shadow moves away from entry. + * object->shadow is refcounted. + */ + struct vm_object_checker_s *shadow; + + /* + * Checkers for submap contents. + * These checkers are configured for a mapping of the whole + * submap at address 0. Verification of actual remappings will + * need to compensate for address offsets and bounds clipping. 
+ */ + struct checker_list_s *submap_checkers; + + /* + * Code to update when adding new fields: + * struct vm_object_attribute_list_t + * make_null_object_checker() + * make_anonymous_object_checker() + * make_submap_object_checker() + * dump_checker_info() + * verify_allocation() + * object_checker_clone() + */ +} vm_object_checker_t; + +/* + * Create a new object checker duplicating an existing checker. + * The new object is: + * - zero self_ref_count + * - unknown object_id + * - not linked into any checker_list + */ +extern vm_object_checker_t * +object_checker_clone(vm_object_checker_t *obj_checker); + +/* + * struct vm_entry_checker_t + * Maintain and verify expected state of a VM map entry. + * + * The `verify` bitmap specifies which properties should be checked. + * If a property's value is indeterminate, or is difficult to specify + * and not important to the test, that check can be disabled. + * + * Checkers are kept in a doubly-linked list in address order, + * similar to vm_map_entry_t but it is not a circular list. + * Submaps are recursive: the top-level list contains a Submap checker, + * and the Submap checker has its own list of contained checkers. + */ +typedef struct vm_entry_checker_s { + struct vm_entry_checker_s *prev; + struct vm_entry_checker_s *next; + + vm_entry_template_kind_t kind; + vm_entry_attribute_list_t verify; + + mach_vm_address_t address; + mach_vm_size_t size; + + vm_object_checker_t *object; + + vm_prot_t protection; + vm_prot_t max_protection; + vm_inherit_t inheritance; + vm_behavior_t behavior; + bool permanent; + + uint16_t user_wired_count; + uint8_t user_tag; + + bool is_submap; /* true when entry is a parent map's submap entry */ + uint32_t submap_depth; /* non-zero when entry is a submap's content */ + + uint64_t object_offset; + uint32_t pages_resident; /* TODO: track this in the object checker instead */ + + bool needs_copy; + + /* share_mode is computed from other entry and object attributes */ + + /* + * Code to update when adding new fields: + * struct vm_entry_attribute_list_t + * make_checker_for_anonymous_private() + * make_checker_for_vm_allocate() + * make_checker_for_shared() + * make_checker_for_submap() + * dump_checker_info() + * verify_allocation() + * checker_simplify_left() + */ +} vm_entry_checker_t; + +/* + * A list of consecutive entry checkers. May be a subset of the entire doubly-linked list. + */ +typedef struct { + vm_entry_checker_t *head; + vm_entry_checker_t *tail; +} entry_checker_range_t; + +/* + * Count the number of entries between + * checker_range->head and checker_range->tail, inclusive. + */ +extern unsigned +checker_range_count(entry_checker_range_t checker_range); + +/* + * Return the start address of the first entry in a range. + */ +extern mach_vm_address_t +checker_range_start_address(entry_checker_range_t checker_range); + +/* + * Return the end address of the last entry in a range. + */ +extern mach_vm_address_t +checker_range_end_address(entry_checker_range_t checker_range); + +/* + * Return size of all entries in a range. + */ +extern mach_vm_size_t +checker_range_size(entry_checker_range_t checker_range); + +/* + * Loop over all checkers between + * entry_range->head and entry_range->tail, inclusive. + * Does visit any submap parent entry. + * Does not descend into submap contents. + * + * You may clip_left the current checker. The new left entry is not visited. + * You may clip_right the current checker. The new right entry is visited next. 
+ * You may not delete the current checker, unless you also immediately break the loop. + */ +#define FOREACH_CHECKER(checker, entry_range) \ + for (vm_entry_checker_t *checker = (entry_range).head; \ + checker != (entry_range).tail->next; \ + checker = checker->next) + +/* + * The list of all entry and object checkers. + * The first and last entries may be changed by the test. + * The first object is the common null object, so it should not change. + * + * Submaps get their own checker_list_t. A submap checker + * list stores checkers for the submap's map entries. + * It does not store any objects; a single global list of objects is + * maintained in the top-level checker list so it can be searched by ID. + * + * submap_slide keeps track of a temporary address offset applied + * to the contained checkers. This is used for submap contents. + */ +typedef struct checker_list_s { + struct checker_list_s *parent; + entry_checker_range_t entries; + vm_object_checker_t *objects; /* must be NULL in submaps */ + uint64_t submap_slide; + bool is_slid; +} checker_list_t; + +#define FOREACH_OBJECT_CHECKER(obj_checker, list) \ + for (vm_object_checker_t *obj_checker = (list)->objects; \ + obj_checker != NULL; \ + obj_checker = obj_checker->next) + +/* + * Return the nth checker in the list. Aborts if n is out of range. + */ +extern vm_entry_checker_t * +checker_list_nth(checker_list_t *list, unsigned n); + +/* + * Search a list of checkers for an allocation that contains the given address. + * Returns NULL if no checker contains the address. + * Returns NULL if a non-Allocation checker contains the address. + * Does not descend into submaps. + */ +extern vm_entry_checker_t * +checker_list_find_allocation(checker_list_t *list, mach_vm_address_t addr); + +/* + * Search a list of checkers for a checker that contains the given address. + * May return checkers for holes. + * Returns NULL if no checker contains the address. + * Does not descend into submaps. + */ +extern vm_entry_checker_t * +checker_list_find_checker(checker_list_t *list, mach_vm_address_t addr); + +/* + * Add a new vm object checker to the list. + * Aborts if the new object is null and the list already has its null object. + * Aborts if the object's ID is the same as some other object. + */ +extern void +checker_list_append_object( + checker_list_t *list, + vm_object_checker_t *obj_checker); + +/* + * Return the list of entry checkers covering an address range. + * Aborts if the range includes any hole checkers. + */ +extern entry_checker_range_t +checker_list_find_range( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size); + +/* + * Return the list of entry checkers covering an address range. + * Hole checkers are allowed. + */ +extern entry_checker_range_t +checker_list_find_range_including_holes( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size); + +/* + * Like checker_list_find_range(), + * but the first and last entries are clipped to the address range. + */ +extern entry_checker_range_t +checker_list_find_and_clip( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size); + +/* + * Like checker_list_find_range_including_holes(), + * but the first and last entries (if any) are clipped to the address range. + */ +extern entry_checker_range_t +checker_list_find_and_clip_including_holes( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size); + +/* + * Attempts to simplify all entries in an address range. 
+ */ +extern void +checker_list_simplify( + checker_list_t *list, + mach_vm_address_t start, + mach_vm_size_t size); + +/* + * Replace and delete checkers in old_range + * with the checkers in new_range. + * The two ranges must have the same start address and size. + * Updates list->head and/or list->tail if necessary. + */ +extern void +checker_list_replace_range( + checker_list_t *list, + entry_checker_range_t old_range, + entry_checker_range_t new_range); + +/* + * Convenience function to replace one checker with another. + * The two checkers must have the same start address and size. + */ +static inline void +checker_list_replace_checker( + checker_list_t *list, + vm_entry_checker_t *old_checker, + vm_entry_checker_t *new_checker) +{ + checker_list_replace_range(list, + (entry_checker_range_t){ old_checker, old_checker }, + (entry_checker_range_t){ new_checker, new_checker }); +} + +/* + * Convenience function to replace one checker with several checkers. + * The old and the new must have the same start address and size. + */ +static inline void +checker_list_replace_checker_with_range( + checker_list_t *list, + vm_entry_checker_t *old_checker, + entry_checker_range_t new_checkers) +{ + checker_list_replace_range(list, + (entry_checker_range_t){ old_checker, old_checker }, + new_checkers); +} + +/* + * Remove a contiguous range of checkers from a checker list. + * The checkers are freed. + * The checkers are replaced by a new hole checker. + * VM allocations are unaffected. + */ +extern void +checker_list_free_range( + checker_list_t *list, + entry_checker_range_t range); + +/* Convenience function for checker_list_remove_range() of a single checker. */ +static inline void +checker_list_free_checker( + checker_list_t *list, + vm_entry_checker_t *checker) +{ + checker_list_free_range(list, (entry_checker_range_t){ checker, checker }); +} + +/* + * Compute the end address of an entry. + * `checker->address + checker->size`, with integer overflow protection. + */ +static inline mach_vm_address_t +checker_end_address(vm_entry_checker_t *checker) +{ + mach_vm_address_t end; + bool overflowed = __builtin_add_overflow(checker->address, checker->size, &end); + assert(!overflowed); + return end; +} + +/* + * Return true if address is within checker's [start, end) + */ +static inline bool +checker_contains_address(vm_entry_checker_t *checker, mach_vm_address_t address) +{ + return address >= checker->address && address < checker_end_address(checker); +} + +/* + * Compute the share_mode value of an entry. + * This value is computed from other values in the checker and its object. + */ +extern uint8_t +checker_share_mode( + vm_entry_checker_t *checker); + +/* + * Compute the is_submap value of a map entry. + */ +static inline bool +checker_is_submap(vm_entry_checker_t *checker) +{ + return checker->kind == Submap; +} + +/* + * Submap slide (checker_get_and_slide_submap_checkers) + * + * We want a 1:1 relationship between checkers and map entries. + * This is complicated in submaps, where the parent map's view + * of the submap uses different addresses. + * + * Our solution: + * 1. Submap content checkers store the address as if inside the submap. + * 2. When using a submap content checker in a parent map context, + * the checker is temporarily modified to use parent-relative + * addresses instead ("slide"). + * + * The checker_list_t for the submap keeps track of the slide state + * of its checkers. Some places assert that the submap is or is not slid. 
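+ *
+ * Illustrative example: a submap whose contents are described at
+ * submap-relative addresses starting at 0, mapped into the parent map
+ * at address P, is slid by P. While slid, a content checker that
+ * describes submap address A reports parent address P + A; unsliding
+ * restores the submap-relative addresses.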
+ * + * Note that this code only deals with constant submaps; therefore + * we don't need to worry about changing checker bounds while they + * are temporarily slid. + */ + +/* + * Return the nested checkers for a parent map's submap entry. + * Returns NULL if the checker is not a submap entry. + * The caller must call unslide_submap_checkers() when finished. + */ +extern checker_list_t * +checker_get_and_slide_submap_checkers(vm_entry_checker_t *checker); + +/* + * Undo the effects of get_and_slide_submap_checkers(). + */ +extern void +unslide_submap_checkers(checker_list_t *submap_checkers); + +/* + * Convenience macro to call unslide_submap_checkers() at end of scope. + * The caller may manually unslide and then set their variable to NULL + * to cancel the automatic unslide. + */ +static inline void +cleanup_unslide_submap_checkers(checker_list_t **inout_submap_checkers) +{ + if (*inout_submap_checkers) { + unslide_submap_checkers(*inout_submap_checkers); + *inout_submap_checkers = NULL; + } +} +#define DEFER_UNSLIDE \ + __attribute__((cleanup(cleanup_unslide_submap_checkers))) + + +/* + * Adjust a start/end so that it does not extend beyond a limit. + * If start/end falls outside the limit, the output's size will + * be zero and its start will be indeterminate. + */ +extern void +clamp_start_end_to_start_end( + mach_vm_address_t * const inout_start, + mach_vm_address_t * const inout_end, + mach_vm_address_t limit_start, + mach_vm_address_t limit_end); + + +/* + * Adjust a address/size so that it does not extend beyond a limit. + * If address/size falls outside the limit, the output size will + * be zero and the start will be indeterminate + */ +extern void +clamp_address_size_to_address_size( + mach_vm_address_t * const inout_address, + mach_vm_size_t * const inout_size, + mach_vm_address_t limit_address, + mach_vm_size_t limit_size); + + +/* + * Adjust an address range so it does not extend beyond an entry's bounds. + * When clamping to a submap entry: + * checker is a submap entry in the parent map. + * address and size are in the parent map's address space on entry and on exit. + */ +extern void +clamp_address_size_to_checker( + mach_vm_address_t * const inout_address, + mach_vm_size_t * const inout_size, + vm_entry_checker_t *checker); + +/* + * Adjust an address range so it does not extend beyond an entry's bounds. + * When clamping to a submap entry: + * checker is a submap entry in the parent map. + * address and size are in the parent map's address space on entry and on exit. + */ +extern void +clamp_start_end_to_checker( + mach_vm_address_t * const inout_start, + mach_vm_address_t * const inout_end, + vm_entry_checker_t *checker); + + +/* + * Set the VM object that an entry points to. + * Replaces any existing object. Updates self_ref_count of any objects. + */ +extern void +checker_set_object(vm_entry_checker_t *checker, vm_object_checker_t *obj_checker); + +/* + * Set an entry's object to the null object. + * Identical to `checker_set_object(checker, find_object_checker_for_object_id(list, 0))` + */ +extern void +checker_set_null_object(checker_list_t *list, vm_entry_checker_t *checker); + +/* + * Set an entry's object to a copy of its current object, + * with the new_object->shadow = old_object. + * The entry's current object must not be null. + */ +extern void +checker_make_shadow_object(checker_list_t *list, vm_entry_checker_t *checker); + +/* + * If checker has a null VM object, change it to a new anonymous object. 
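+ * This mirrors the VM's own behavior: touching the pages (for example
+ * faulting or wiring them) replaces the null object with anonymous
+ * zero-filled memory, so the checker is updated the same way.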
+ */ +extern void +checker_resolve_null_vm_object( + checker_list_t *checker_list, + vm_entry_checker_t *checker); + +/* + * Update an entry's checker as if a fault occurred inside it. + * Assumes that all pages in the entry were faulted. + * Aborts if the fault appears to be a copy-on-write fault; this code does + * not attempt to handle that case. + * + * - resolves null objects + * - sets the resident page count + */ +extern void +checker_fault_for_prot_not_cow( + checker_list_t *checker_list, + vm_entry_checker_t *checker, + vm_prot_t fault_prot); + + +/* + * Conditionally unnest one checker in a submap. + * + * submap_parent is a parent map's submap entry. + * *inout_next_address is the current address in the parent map, + * within the bounds of submap_parent. + * If the entry inside the submap that contains *inout_next_address is: + * - unallocated: + * advance *inout_next_address past the unallocated space and return NULL + * - a writeable allocation: + * unnest the appropriate range in the parent map, + * advance *inout_next_address past the unnested range, + * and return the unnested range's new checker + * - a readable allocation: + * - (unnest_readonly == false) advance past it, same as for unallocated holes + * - (unnest_readonly == true) unnest it, same as for writeable allocations + * + * Set all_overwritten = true if the newly-unnested memory will + * be promptly written to (thus resolving null objects and collapsing COW shadow chains). + */ +extern vm_entry_checker_t * +checker_list_try_unnest_one_entry_in_submap( + checker_list_t *checker_list, + vm_entry_checker_t *submap_parent, + bool unnest_readonly, + bool all_overwritten, + mach_vm_address_t * const inout_next_address); + +/* + * Perform a clip-left operation on a checker, similar to vm_map_clip_left. + * Entry `right` is divided at `split`. + * Returns the new left-hand entry. + * Returns NULL if no split occurred. + * Updates list->head and/or list->tail if necessary. + */ +extern vm_entry_checker_t * +checker_clip_left( + checker_list_t *list, + vm_entry_checker_t *right, + mach_vm_address_t split); + +/* + * Perform a clip-right operation on a checker, similar to vm_map_clip_right. + * Entry `left` is divided at `split`. + * Returns the new right-hand entry. + * Returns NULL if no split occurred. + * Updates list->head and/or list->tail if necessary. + */ +extern vm_entry_checker_t * +checker_clip_right( + checker_list_t *list, + vm_entry_checker_t *left, + mach_vm_address_t split); + +/* + * Perform a simplify operation on a checker and the entry to its left. + * If coalescing occurs, `right` is preserved and + * the entry to the left is destroyed. + */ +extern void +checker_simplify_left( + checker_list_t *list, + vm_entry_checker_t *right); + + +/* + * Build a vm_checker for a newly-created memory region. + * The region is assumed to be the result of vm_allocate(). + * The new checker is not linked into the list. + */ +extern vm_entry_checker_t * +make_checker_for_vm_allocate( + checker_list_t *list, + mach_vm_address_t address, + mach_vm_size_t size, + int flags_and_tag); + +/* + * Create VM entries and VM entry checkers + * for the given VM entry templates. + * + * Entries will be created consecutively in contiguous memory, as specified. + * "Holes" will be deallocated during construction; + * be warned that the holes may become filled by other allocations + * including Rosetta's translations, which will cause the checker to + * fail later. 
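+ *
+ * Illustrative usage sketch (the templates shown are the convenience
+ * templates declared above; adjust to taste):
+ *
+ *   vm_entry_template_t templates[] = {
+ *       vm_entry_template(.protection = VM_PROT_READ),
+ *       hole_template,
+ *       vm_entry_template(),
+ *       END_ENTRIES
+ *   };
+ *   checker_list_t *list = create_vm_state(templates);
+ *   (perform VM operations and update the checkers)
+ *   verify_vm_state(list, "after test operations");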
+ * + * Alignment handling: + * The first entry gets `alignment_mask` alignment. + * After that it is the caller's responsibility to arrange their + * templates in a way that yields the alignments they want. + */ +extern __attribute__((overloadable)) +checker_list_t * +create_vm_state( + const vm_entry_template_t entry_templates[], + unsigned entry_template_count, + const vm_object_template_t object_templates[], + unsigned object_template_count, + mach_vm_size_t alignment_mask, + const char *message); + +static inline __attribute__((overloadable)) +checker_list_t * +create_vm_state( + const vm_entry_template_t templates[], + unsigned count, + mach_vm_size_t alignment_mask) +{ + return create_vm_state(templates, count, NULL, 0, + alignment_mask, "create_vm_state"); +} + +/* + * Like create_vm_state, but the alignment mask defaults to PAGE_MASK + * and the template list is terminated by END_ENTRIES + */ +static inline __attribute__((overloadable)) +checker_list_t * +create_vm_state(const vm_entry_template_t templates[]) +{ + return create_vm_state(templates, count_templates(templates), PAGE_MASK); +} + +/* + * Like create_vm_state, but the alignment mask defaults to PAGE_MASK. + */ +static inline __attribute__((overloadable)) +checker_list_t * +create_vm_state(const vm_entry_template_t templates[], unsigned count) +{ + return create_vm_state(templates, count, PAGE_MASK); +} + + +/* + * Verify that the VM's state (as determined by vm_region) + * matches the expected state from a list of checkers. + * + * Returns TestSucceeded if the state is good, TestFailed otherwise. + * + * Failures are also reported as darwintest failures (typically T_FAIL) + * and failure details of expected and actual state are reported with T_LOG. + */ +extern test_result_t +verify_vm_state(checker_list_t *checker_list, const char *message); + +/* + * Perform VM read and/or write faults on every page spanned by a list of checkers, + * and verify that exceptions are delivered (or not) as expected. + * This is a destructive test: the faults may change VM state (for example + * resolving COW) but the checkers are not updated. + * + * Returns TestSucceeded if the state is good, TestFailed otherwise. + * + * Failures are also reported as darwintest failures (typically T_FAIL) + * and failure details of expected and actual state are reported with T_LOG. + */ +extern test_result_t +verify_vm_faultability( + checker_list_t *checker_list, + const char *message, + bool verify_reads, + bool verify_writes); + +/* + * Like verify_vm_faultability, but reads and/or writes + * from a single checker's memory. + * Returns true if the verification succeeded. + */ +extern bool +verify_checker_faultability( + vm_entry_checker_t *checker, + const char *message, + bool verify_reads, + bool verify_writes); + +/* + * Like verify_checker_faultability, but reads and/or writes + * only part of a single checker's memory. + * Returns true if the verification succeeded. + */ +extern bool +verify_checker_faultability_in_address_range( + vm_entry_checker_t *checker, + const char *message, + bool verify_reads, + bool verify_writes, + mach_vm_address_t checked_address, + mach_vm_size_t checked_size); + +/* + * Specification for a single trial: + * - the test's name + * - the templates for the virtual memory layout + * - the address range within that virtual memory + * layout that the tested operation should use. 
+ */ +typedef struct vm_config_s { + char *config_name; + + /* + * Test's start address is the start of the first + * entry plus start_adjustment. Test's end address + * is the end of the last entry plus end_adjustment. + * When not zero, start_adjustment is typically positive + * and end_adjustment is typically negative. + */ + mach_vm_size_t start_adjustment; + mach_vm_size_t end_adjustment; + + /* First map entry gets this alignment. */ + mach_vm_size_t alignment_mask; + + vm_entry_template_t *entry_templates; + unsigned entry_template_count; + vm_object_template_t *object_templates; + unsigned object_template_count; + + vm_entry_template_t *submap_entry_templates; + unsigned submap_entry_template_count; + vm_object_template_t *submap_object_templates; + unsigned submap_object_template_count; +} vm_config_t; + +__attribute__((overloadable)) +extern vm_config_t * +make_vm_config( + const char *name, + vm_entry_template_t *entry_templates, + vm_object_template_t *object_templates, + vm_entry_template_t *submap_entry_templates, + vm_object_template_t *submap_object_templates, + mach_vm_size_t start_adjustment, + mach_vm_size_t end_adjustment, + mach_vm_size_t alignment_mask); + +/* + * make_vm_config() variants with fewer parameters + * (convenient for hardcoded initializer syntax) + * + * Variants that allow submap entries force submap-compatible alignment. + * Variants without submap entries use no alignment. + */ + +__attribute__((overloadable)) +static inline vm_config_t * +make_vm_config( + const char *name, + vm_entry_template_t *entry_templates, + vm_object_template_t *object_templates, + vm_entry_template_t *submap_entry_templates, + vm_object_template_t *submap_object_templates, + mach_vm_size_t start_adjustment, + mach_vm_size_t end_adjustment) +{ + return make_vm_config(name, entry_templates, object_templates, + submap_entry_templates, submap_object_templates, + start_adjustment, end_adjustment, SUBMAP_ALIGNMENT_MASK); +} + +__attribute__((overloadable)) +static inline vm_config_t * +make_vm_config( + const char *name, + vm_entry_template_t *entry_templates, + vm_object_template_t *object_templates, + mach_vm_size_t start_adjustment, + mach_vm_size_t end_adjustment) +{ + return make_vm_config(name, entry_templates, object_templates, + NULL, NULL, + start_adjustment, end_adjustment, 0); +} + +__attribute__((overloadable)) +static inline vm_config_t * +make_vm_config( + const char *name, + vm_entry_template_t *entry_templates, + mach_vm_size_t start_adjustment, + mach_vm_size_t end_adjustment) +{ + return make_vm_config(name, entry_templates, NULL, + NULL, NULL, + start_adjustment, end_adjustment, 0); +} + +__attribute__((overloadable)) +static inline vm_config_t * +make_vm_config( + const char *name, + vm_entry_template_t *entry_templates, + vm_object_template_t *object_templates) +{ + return make_vm_config(name, entry_templates, object_templates, + NULL, NULL, + 0, 0, 0); +} + +__attribute__((overloadable)) +static inline vm_config_t * +make_vm_config( + const char *name, + vm_entry_template_t *entry_templates) +{ + return make_vm_config(name, entry_templates, NULL, + NULL, NULL, + 0, 0, 0); +} + + +/* + * Like create_vm_state, but also computes the config's desired address range. + */ +extern void +create_vm_state_from_config( + vm_config_t *config, + checker_list_t ** const out_checker_list, + mach_vm_address_t * const out_start_address, + mach_vm_address_t * const out_end_address); + + +/* + * Logs the contents of checkers. 
+ * Also logs the contents of submap checkers recursively. + */ +extern void +dump_checker_range(entry_checker_range_t list); + +/* + * Logs info from vm_region() for the address ranges spanned by the checkers. + * Also logs the contents of submaps recursively. + */ +extern void +dump_region_info_for_entries(entry_checker_range_t list); + + +/* + * Convenience functions for logging. + */ + +extern const char * +name_for_entry_kind(vm_entry_template_kind_t kind); + +extern const char * +name_for_kr(kern_return_t kr); + +extern const char * +name_for_prot(vm_prot_t prot); + +extern const char * +name_for_inherit(vm_inherit_t inheritance); + +extern const char * +name_for_behavior(vm_behavior_t behavior); + +extern const char * +name_for_bool(boolean_t value); + +extern const char * +name_for_share_mode(uint8_t share_mode); + +/* Convenience macro for compile-time array size */ +#define countof(array) \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic error \"-Wsizeof-pointer-div\"") \ + (sizeof(array)/sizeof((array)[0])) \ + _Pragma("clang diagnostic pop") + +/* Convenience macro for a heap allocated formatted string deallocated at end of scope. */ +static inline void +cleanup_cstring(char **ptr) +{ + free(*ptr); +} +#define CLEANUP_CSTRING __attribute__((cleanup(cleanup_cstring))) +#define TEMP_CSTRING(str, format, ...) \ + char *str CLEANUP_CSTRING; \ + asprintf(&str, format, __VA_ARGS__) + +/* + * Returns true if each bit set in `values` is also set in `container`. + */ +static inline bool +prot_contains_all(vm_prot_t container, vm_prot_t values) +{ + return (container & values) == values; +} + +/* + * Convenience functions for address arithmetic + */ + +static inline mach_vm_address_t +max(mach_vm_address_t a, mach_vm_address_t b) +{ + if (a > b) { + return a; + } else { + return b; + } +} + +static inline mach_vm_address_t +min(mach_vm_address_t a, mach_vm_address_t b) +{ + if (a < b) { + return a; + } else { + return b; + } +} + + +/* + * Call vm_region on an address. + * If the query address is mapped at that submap depth: + * - Sets *inout_address and *out_size to that map entry's address and size. + * [*inout_address, *inout_address + *out_size) contains the query address. + * - Sets the info from vm_region. + * - Returns true. + * If the query address is unmapped, or not mapped at that submap depth: + * - Sets *inout_address to the address of the next map entry, or ~0 if there is none. + * - Sets *out_size to zero. + * - Returns false. + */ +__attribute__((overloadable)) +extern bool +get_info_for_address( + mach_vm_address_t *inout_address, + mach_vm_size_t *out_size, + vm_region_submap_info_data_64_t *out_info, + uint32_t submap_depth); + +__attribute__((overloadable)) +static inline bool +get_info_for_address( + mach_vm_address_t * const inout_address, + mach_vm_size_t * const out_size, + vm_region_submap_info_data_64_t * const out_info) +{ + return get_info_for_address(inout_address, out_size, out_info, 0); +} + +/* + * Like get_info_for_address(), but + * (1) it's faster, and + * (2) it does not get the right ref_count or shadow_depth values from vm_region. 
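+ *
+ * Illustrative usage sketch (query_address is a placeholder):
+ *   mach_vm_address_t addr = query_address;
+ *   mach_vm_size_t size;
+ *   vm_region_submap_info_data_64_t info;
+ *   if (get_info_for_address_fast(&addr, &size, &info)) {
+ *       ...entry [addr, addr + size) contains query_address; info describes it...
+ *   }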
+ */ +__attribute__((overloadable)) +extern bool +get_info_for_address_fast( + mach_vm_address_t *inout_address, + mach_vm_size_t *out_size, + vm_region_submap_info_data_64_t *out_info, + uint32_t submap_depth); + +__attribute__((overloadable)) +static inline bool +get_info_for_address_fast( + mach_vm_address_t * const inout_address, + mach_vm_size_t * const out_size, + vm_region_submap_info_data_64_t * const out_info) +{ + return get_info_for_address_fast(inout_address, out_size, out_info, 0); +} + +/* + * Convenience function to get object_id_full from vm_region at an address. + * Returns zero if the address is mapped but has a null object. + * Aborts if the address is not mapped. + */ +extern uint64_t +get_object_id_for_address(mach_vm_address_t address); + +/* + * Convenience function to get user_tag from vm_region at an address. + * Returns zero if the address is not mapped. + */ +extern uint16_t +get_user_tag_for_address(mach_vm_address_t address); + +/* + * Convenience function to get user_tag from vm_region at an address, + * if that tag is within the app-specific tag range. + * Returns zero if the address is not mapped. + * Returns zero if the address's tag is not within the app-specific range + * [VM_MEMORY_APPLICATION_SPECIFIC_1, VM_MEMORY_APPLICATION_SPECIFIC_16] + * + * This is used by tests that copy user tags from nearby memory. + * The "nearby" memory might not be part of the tested range. + * Copying an arbitrary user tag from outside is undesirable + * because the VM changes some of its behavior for some tag + * values and the tests need to see consistent behavior instead. + */ +extern uint16_t +get_app_specific_user_tag_for_address(mach_vm_address_t address); + +/* + * Convenience functions for vm_wire's host_priv port. + * host_priv() returns the port, or halts if it can't. + * host_priv_allowed() returns true or false. + * The host_priv port requires root on macOS. + */ +extern host_priv_t +host_priv(void); + +extern bool +host_priv_allowed(void); + +#endif /* VM_CONFIGURATOR_H */ diff --git a/tests/vm/configurator/vm_configurator_helpers.h b/tests/vm/configurator/vm_configurator_helpers.h new file mode 100644 index 000000000..2c097da07 --- /dev/null +++ b/tests/vm/configurator/vm_configurator_helpers.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm_configurator_helpers.h + * + * Assorted functions used by multiple vm_configurator tests. + */ + +#ifndef VM_CONFIGURATOR_HELPERS_H +#define VM_CONFIGURATOR_HELPERS_H + +#include "vm_configurator.h" + +/* + * Clear some bits from EXC_GUARD behavior, then set some bits. + * Halt with T_FAIL if task_get/set_exc_guard_behavior() fails. + */ +static inline void +clear_then_set_exc_guard_behavior( + task_exc_guard_behavior_t clear, + task_exc_guard_behavior_t set) +{ + task_exc_guard_behavior_t behavior; + kern_return_t kr = task_get_exc_guard_behavior(mach_task_self(), &behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "get EXC_GUARD behavior"); + + behavior &= ~clear; + behavior |= set; + + kr = task_set_exc_guard_behavior(mach_task_self(), behavior); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "update EXC_GUARD behavior"); +} + +/* + * Disable VM EXC_GUARD exceptions. + * Halt with T_FAIL if they cannot be disabled. + */ +static inline void +disable_vm_exc_guard(void) +{ + clear_then_set_exc_guard_behavior( + TASK_EXC_GUARD_VM_ALL, /* clear */ + 0 /* set */); +} + +/* + * Enable VM EXC_GUARD fatal exceptions. + * Halt with T_FAIL if they cannot be enabled. + */ +static inline void +enable_fatal_vm_exc_guard(void) +{ + clear_then_set_exc_guard_behavior( + TASK_EXC_GUARD_VM_ALL, /* clear */ + TASK_EXC_GUARD_VM_DELIVER | TASK_EXC_GUARD_VM_FATAL /* set */); +} + +/* + * Enable VM EXC_GUARD non-fatal exceptions. + * Halt with T_FAIL if they cannot be enabled. + */ +static inline void +enable_non_fatal_vm_exc_guard(void) +{ + clear_then_set_exc_guard_behavior( + TASK_EXC_GUARD_VM_ALL, /* clear */ + TASK_EXC_GUARD_VM_DELIVER /* set */); +} + +/* + * Update the checker list after a successful call to vm_deallocate() + * of any number of ordinary allocations and holes. + * Don't use this if anything may be permanent entries. + */ +static inline void +checker_perform_successful_vm_deallocate( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* this may create adjacent hole checkers, but we don't care */ + entry_checker_range_t limit = + checker_list_find_and_clip_including_holes(checker_list, start, size); + checker_list_free_range(checker_list, limit); +} + +/* + * Update the checker list after a successful call to vm_allocate() + * of a permanent entry, which makes the memory inaccessible. + * On entry, the range must be a single checker for a permanent allocation. + */ +static inline void +checker_perform_vm_deallocate_permanent( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* Find the checker and verify its address range and permanence. */ + vm_entry_checker_t *checker = + checker_list_find_allocation(checker_list, start); + assert(checker); + assert(checker->address == start); + assert(checker->size == size); + assert(checker->permanent == true); + + /* Mark the memory as inaccessible. */ + checker->protection = VM_PROT_NONE; + checker->max_protection = VM_PROT_NONE; +} + +#endif /* VM_CONFIGURATOR_HELPERS_H */ diff --git a/tests/vm/configurator/vm_configurator_tests.h b/tests/vm/configurator/vm_configurator_tests.h new file mode 100644 index 000000000..a0ae9d72d --- /dev/null +++ b/tests/vm/configurator/vm_configurator_tests.h @@ -0,0 +1,1924 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. 
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * vm_configurator_tests.h
+ *
+ * Virtual memory configurations and a test wrapper
+ * available for use by tests that use vm_configurator.
+ */
+
+#ifndef VM_CONFIGURATOR_TESTS_H
+#define VM_CONFIGURATOR_TESTS_H
+
+#include "vm_configurator.h"
+
+/*
+ * Tests
+ *
+ * To add a new configuration for all VM API to be tested with:
+ * 1. Add a function definition `configure_<name>`
+ * that returns a vm_config_t representing the VM state
+ * and address range to be tested.
+ * 2. Add a field named `<name>` to struct vm_tests_t.
+ * 3. Add a call to `RUN_TEST(<name>)` in run_vm_tests() below.
+ *
+ * To help debug failing tests:
+ * - Run a test executable with environment variable VERBOSE=1
+ * to print the checker and VM state frequently.
+ * - Run a test executable with only a single VM configuration
+ * by naming that configuration on the command line.
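+ *
+ * Illustrative sketch of steps 1-3 above (the name `example_case` is
+ * hypothetical and does not exist in this file): a new configuration
+ * would look something like
+ *
+ *     static inline vm_config_t *
+ *     configure_example_case(void)
+ *     {
+ *             vm_entry_template_t templates[] = {
+ *                     vm_entry_template(),
+ *                     END_ENTRIES
+ *             };
+ *             return make_vm_config("example > one entry", templates);
+ *     }
+ *
+ * plus a `test_fn_t example_case;` field in struct vm_tests_t
+ * and a `RUN_TEST(example_case);` call in run_vm_tests().
+ *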
+ * Example of verbosely running only one read fault test: + * env VERBOSE=1 /path/to/configurator_fault -n fault_read permanent_before_allocation + */ + +typedef vm_config_t *(*configure_fn_t)(void); + +typedef test_result_t (*test_fn_t)( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size); + +/* single entry */ + +static inline vm_config_t * +configure_single_entry_1(void) +{ + /* one entry, tested address range is the entire entry */ + vm_entry_template_t templates[] = { + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("single entry > entire entry", templates); +} + +static inline vm_config_t * +configure_single_entry_2(void) +{ + /* one entry, tested address range includes only the first part of it */ + vm_entry_template_t templates[] = { + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("single entry > first pages", templates, + 0, -DEFAULT_PARTIAL_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_single_entry_3(void) +{ + /* one entry, tested address range includes only the last part of it */ + vm_entry_template_t templates[] = { + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("single entry > last pages", templates, + DEFAULT_PARTIAL_ENTRY_SIZE, 0); +} + +static inline vm_config_t * +configure_single_entry_4(void) +{ + /* one entry, tested address range includes only the middle part of it */ + vm_entry_template_t templates[] = { + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("single entry > middle pages", templates, + DEFAULT_PARTIAL_ENTRY_SIZE / 2, -(DEFAULT_PARTIAL_ENTRY_SIZE / 2)); +} + +/* multiple entries */ + +static inline vm_config_t * +configure_multiple_entries_1(void) +{ + /* two entries, tested address range includes both */ + vm_entry_template_t templates[] = { + vm_entry_template(), + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("multiple entries > two entries", templates); +} + +static inline vm_config_t * +configure_multiple_entries_2(void) +{ + /* three entries, tested address range includes all of them */ + vm_entry_template_t templates[] = { + vm_entry_template(), + vm_entry_template(), + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("multiple entries > three entries", templates); +} + +static inline vm_config_t * +configure_multiple_entries_3(void) +{ + /* many entries, tested address range includes all of them */ + vm_entry_template_t templates[] = { + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + vm_entry_template(), vm_entry_template(), vm_entry_template(), vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("multiple 
entries > many entries", templates);
+}
+
+static inline vm_config_t *
+configure_multiple_entries_4(void)
+{
+ /* three entries, tested address range excludes the end of the last one */
+ vm_entry_template_t templates[] = {
+ vm_entry_template(),
+ vm_entry_template(),
+ vm_entry_template(),
+ END_ENTRIES
+ };
+ return make_vm_config("multiple entries > three entries, except the last pages", templates,
+ 0, -DEFAULT_PARTIAL_ENTRY_SIZE);
+}
+
+static inline vm_config_t *
+configure_multiple_entries_5(void)
+{
+ /* three entries, tested address range excludes the start of the first one */
+ vm_entry_template_t templates[] = {
+ vm_entry_template(),
+ vm_entry_template(),
+ vm_entry_template(),
+ END_ENTRIES
+ };
+ return make_vm_config("multiple entries > three entries, except the first pages", templates,
+ DEFAULT_PARTIAL_ENTRY_SIZE, 0);
+}
+
+static inline vm_config_t *
+configure_multiple_entries_6(void)
+{
+ /*
+ * three entries, tested address range excludes both
+ * the start of the first one and the end of the last one
+ */
+ vm_entry_template_t templates[] = {
+ vm_entry_template(),
+ vm_entry_template(),
+ vm_entry_template(),
+ END_ENTRIES
+ };
+ assert(DEFAULT_PARTIAL_ENTRY_SIZE / 2 > 0);
+ return make_vm_config("multiple entries > three entries, except the first and last pages", templates,
+ DEFAULT_PARTIAL_ENTRY_SIZE / 2, -(DEFAULT_PARTIAL_ENTRY_SIZE / 2));
+}
+
+/* some holes but not entirely holes */
+
+static inline vm_config_t *
+configure_some_holes_1(void)
+{
+ /* test address range begins in a hole and ends in an allocation */
+ vm_entry_template_t templates[] = {
+ hole_template,
+ vm_entry_template(),
+ END_ENTRIES
+ };
+ return make_vm_config("some holes > hole then one entry", templates);
+}
+
+static inline vm_config_t *
+configure_some_holes_2(void)
+{
+ /* test address range begins in a hole and ends in three allocations */
+ vm_entry_template_t templates[] = {
+ hole_template,
+ vm_entry_template(),
+ vm_entry_template(),
+ vm_entry_template(),
+ END_ENTRIES
+ };
+ return make_vm_config("some holes > hole then multiple entries", templates);
+}
+
+static inline vm_config_t *
+configure_some_holes_3(void)
+{
+ /* test address range begins in a hole and ends in the middle of an allocation */
+ vm_entry_template_t templates[] = {
+ hole_template,
+ vm_entry_template(),
+ END_ENTRIES
+ };
+ return make_vm_config("some holes > hole then partial entry", templates,
+ 0, -DEFAULT_PARTIAL_ENTRY_SIZE);
+}
+
+static inline vm_config_t *
+configure_some_holes_4(void)
+{
+ /*
+ * test address range begins in a hole, covers two allocations,
+ * and ends in the middle of a third allocation
+ */
+ vm_entry_template_t templates[] = {
+ hole_template,
+ vm_entry_template(),
+ vm_entry_template(),
+ vm_entry_template(),
+ END_ENTRIES
+ };
+ return make_vm_config("some holes > hole then multiple entries then partial entry", templates,
+ 0, -DEFAULT_PARTIAL_ENTRY_SIZE);
+}
+
+static inline vm_config_t *
+configure_some_holes_5(void)
+{
+ /* test address range begins at an allocation and ends in a hole */
+ vm_entry_template_t templates[] = {
+ vm_entry_template(),
+ hole_template,
+ END_ENTRIES
+ };
+ return make_vm_config("some holes > one entry then hole", templates);
+}
+
+static inline vm_config_t *
+configure_some_holes_6(void)
+{
+ /*
+ * test address range begins at an allocation, covers two more allocations,
+ * and ends in a hole
+ */
+ vm_entry_template_t templates[] = {
+ vm_entry_template(),
+ vm_entry_template(),
+ vm_entry_template(),
+ hole_template,
+ END_ENTRIES
+ };
+
return make_vm_config("some holes > multiple entries then hole", templates); +} + +static inline vm_config_t * +configure_some_holes_7(void) +{ + /* test address range begins in the middle of an allocation and ends in a hole */ + vm_entry_template_t templates[] = { + vm_entry_template(), + hole_template, + END_ENTRIES + }; + return make_vm_config("some holes > partial entry then hole", templates, + DEFAULT_PARTIAL_ENTRY_SIZE, 0); +} + +static inline vm_config_t * +configure_some_holes_8(void) +{ + /* + * test address range begins in the middle of an allocation, covers + * two more allocations, and ends in a hole + */ + vm_entry_template_t templates[] = { + vm_entry_template(), + vm_entry_template(), + vm_entry_template(), + hole_template, + END_ENTRIES + }; + return make_vm_config("some holes > partial entry then multiple entries then hole", templates, + DEFAULT_PARTIAL_ENTRY_SIZE, 0); +} + +static inline vm_config_t * +configure_some_holes_9(void) +{ + /* test address range is an allocation, then a hole, then an allocation */ + vm_entry_template_t templates[] = { + vm_entry_template(), + hole_template, + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("some holes > hole in the middle", templates); +} + +static inline vm_config_t * +configure_some_holes_10(void) +{ + /* test address range is allocation-hole-allocation-hole-allocation */ + vm_entry_template_t templates[] = { + vm_entry_template(), + hole_template, + vm_entry_template(), + hole_template, + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("some holes > two holes, three entries", templates); +} + +static inline vm_config_t * +configure_some_holes_11(void) +{ + /* + * test address range is + * two allocations-hole-two allocations-hole-two allocations + */ + vm_entry_template_t templates[] = { + vm_entry_template(), + vm_entry_template(), + hole_template, + vm_entry_template(), + vm_entry_template(), + hole_template, + vm_entry_template(), + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("some holes > two holes, six entries", templates); +} + +static inline vm_config_t * +configure_some_holes_12(void) +{ + /* + * test address range is + * three allocations-hole-three allocations-hole-three allocations + */ + vm_entry_template_t templates[] = { + vm_entry_template(), + vm_entry_template(), + vm_entry_template(), + hole_template, + vm_entry_template(), + vm_entry_template(), + vm_entry_template(), + hole_template, + vm_entry_template(), + vm_entry_template(), + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("some holes > two holes, nine entries", templates); +} + +/* all holes */ + +static inline vm_config_t * +configure_all_holes_1(void) +{ + /* test address range is unallocated, with allocations on both sides */ + vm_entry_template_t templates[] = { + vm_entry_template(), + hole_template, + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("all holes > hole with entries on both sides", templates, + DEFAULT_ENTRY_SIZE, -DEFAULT_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_all_holes_2(void) +{ + /* + * test address range is unallocated, with an allocation before + * and more unallocated space after + */ + vm_entry_template_t templates[] = { + vm_entry_template(), + hole_template, + END_ENTRIES + }; + return make_vm_config("all holes > hole with entry before and hole after", templates, + DEFAULT_ENTRY_SIZE, -DEFAULT_PARTIAL_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_all_holes_3(void) +{ + /* + * test address range is 
unallocated, with more unallocated space before
+ * and an allocation after
+ */
+ vm_entry_template_t templates[] = {
+ hole_template,
+ vm_entry_template(),
+ END_ENTRIES
+ };
+ return make_vm_config("all holes > hole with hole before and entry after", templates,
+ DEFAULT_PARTIAL_ENTRY_SIZE, -DEFAULT_ENTRY_SIZE);
+}
+
+static inline vm_config_t *
+configure_all_holes_4(void)
+{
+ /* test address range is unallocated, with more unallocated space before and after */
+ vm_entry_template_t templates[] = {
+ hole_template,
+ END_ENTRIES
+ };
+ return make_vm_config("all holes > hole with holes on both sides", templates,
+ DEFAULT_PARTIAL_ENTRY_SIZE / 2, -(DEFAULT_PARTIAL_ENTRY_SIZE / 2));
+}
+
+/* residency and sharing */
+
+static inline vm_config_t *
+configure_null_entry(void)
+{
+ vm_entry_template_t templates[] = {
+ vm_entry_template(.share_mode = SM_EMPTY),
+ END_ENTRIES
+ };
+ return make_vm_config("residency > null entry", templates);
+}
+
+static inline vm_config_t *
+configure_nonresident_entry(void)
+{
+ vm_entry_template_t templates[] = {
+ vm_entry_template(.share_mode = SM_PRIVATE),
+ END_ENTRIES
+ };
+ return make_vm_config("residency > nonresident entry", templates);
+}
+
+static inline vm_config_t *
+configure_resident_entry(void)
+{
+ vm_object_template_t object_templates[] = {
+ vm_object_template(.fill_pattern = {Fill, 0}),
+ END_OBJECTS
+ };
+ vm_entry_template_t templates[] = {
+ vm_entry_template(.share_mode = SM_PRIVATE, .object = &object_templates[0]),
+ END_ENTRIES
+ };
+ return make_vm_config("residency > resident entry", templates, object_templates);
+}
+
+static inline vm_config_t *
+configure_shared_entry(void)
+{
+ /*
+ * Two entries sharing the same object.
+ * The address range covers only the left entry.
+ */
+ vm_object_template_t object_templates[] = {
+ vm_object_template(),
+ END_OBJECTS
+ };
+ vm_entry_template_t templates[] = {
+ vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0]),
+ vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0]),
+ END_ENTRIES
+ };
+ return make_vm_config("sharing > simple shared entry", templates, object_templates,
+ 0, -DEFAULT_ENTRY_SIZE);
+}
+
+static inline vm_config_t *
+configure_shared_entry_discontiguous(void)
+{
+ /*
+ * Two entries sharing the same object,
+ * but not the same range inside that object.
+ * The address range covers only the left entry.
+ */
+ vm_object_template_t object_templates[] = {
+ vm_object_template(),
+ END_OBJECTS
+ };
+ vm_entry_template_t templates[] = {
+ vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0],
+ .offset = 0),
+ vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0],
+ .offset = DEFAULT_ENTRY_SIZE),
+ END_ENTRIES
+ };
+ return make_vm_config("sharing > discontiguous shared entry", templates, object_templates,
+ 0, -DEFAULT_ENTRY_SIZE);
+}
+
+static inline vm_config_t *
+configure_shared_entry_partial(void)
+{
+ /*
+ * Two entries sharing the same object,
+ * but only partly overlapping inside that object.
+ * The address range covers only the left entry.
+ */ + vm_object_template_t object_templates[] = { + vm_object_template(), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0], + .offset = 0), + vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0], + .offset = DEFAULT_PARTIAL_ENTRY_SIZE), + END_ENTRIES + }; + return make_vm_config("sharing > partial shared entry", templates, object_templates, + 0, -DEFAULT_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_shared_entry_pairs(void) +{ + /* + * Four entries. The first and last are shared. The middle two are + * also shared, independently. + * The address range covers all four entries. + */ + vm_object_template_t object_templates[] = { + vm_object_template(.fill_pattern = {Fill, 0x1111111111111111}), + vm_object_template(.fill_pattern = {Fill, 0x2222222222222222}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0]), + vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[1]), + vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[1]), + vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("sharing > two pairs of shared entries", templates, object_templates); +} + +static inline vm_config_t * +configure_shared_entry_x1000(void) +{ + /* + * Many entries, all shared. + * The address range covers all entries. + */ + vm_object_template_t object_templates[] = { + vm_object_template(.size = PAGE_SIZE), + END_OBJECTS + }; + + const unsigned count = 1000; /* 1000 shared entries */ + vm_entry_template_t *templates = calloc(sizeof(templates[0]), count + 1); /* ... plus 1 END_ENTRIES entry */ + for (unsigned i = 0; i < count; i++) { + templates[i] = vm_entry_template(.share_mode = SM_SHARED, .object = &object_templates[0], .size = PAGE_SIZE); + } + templates[count] = END_ENTRIES; + vm_config_t *result = make_vm_config("sharing > 1000 shared entries", templates, object_templates); + free(templates); + return result; +} + +static inline vm_config_t * +configure_cow_entry(void) +{ + /* + * two entries that are COW copies of the same underlying object + * Operating range includes only the first entry. + */ + vm_object_template_t object_templates[] = { + /* fixme must use a fill pattern to get a non-null object to copy */ + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("cow > one COW entry", templates, object_templates, + 0, -DEFAULT_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_cow_unreferenced(void) +{ + /* + * one COW entry but the memory being copied has no other references + */ + vm_object_template_t object_templates[] = { + /* fixme must use a fill pattern to get a non-null object to copy */ + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("cow > COW with no other references", templates, object_templates); +} + +static inline vm_config_t * +configure_cow_nocow(void) +{ + /* + * one entry that is COW, then one ordinary entry. 
+ * Additional out-of-range entry is a second reference to the COW memory. + */ + vm_object_template_t object_templates[] = { + /* fixme must use a fill pattern to get a non-null object to copy */ + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + vm_entry_template(.share_mode = SM_PRIVATE), + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("cow > COW then not-COW", templates, object_templates, + 0, -DEFAULT_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_nocow_cow(void) +{ + /* + * one ordinary entry, then one entry that is COW. + * Additional out-of-range entry is a second reference to the COW memory. + */ + vm_object_template_t object_templates[] = { + /* fixme must use a fill pattern to get a non-null object to copy */ + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.share_mode = SM_PRIVATE), + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("cow > not-COW then COW", templates, object_templates, + 0, -DEFAULT_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_cow_unreadable(void) +{ + /* + * COW entry that is unreadable. + * Additional out-of-range entry is a second reference to the COW memory. + */ + vm_object_template_t object_templates[] = { + /* fixme must use a fill pattern to get a non-null object to copy */ + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0], + .protection = VM_PROT_NONE), + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("cow > COW but unreadable", templates, object_templates, + 0, -DEFAULT_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_cow_unwriteable(void) +{ + /* + * COW entry that is readable but unwriteable. + * Additional out-of-range entry is a second reference to the COW memory. 
+ */ + vm_object_template_t object_templates[] = { + /* fixme must use a fill pattern to get a non-null object to copy */ + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0], + .protection = VM_PROT_READ), + vm_entry_template(.share_mode = SM_COW, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("cow > COW but unwriteable", templates, object_templates, + 0, -DEFAULT_ENTRY_SIZE); +} + + +static inline vm_config_t * +configure_permanent_entry(void) +{ + /* one permanent entry */ + vm_object_template_t object_templates[] = { + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.permanent = true, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("permanent > one permanent entry", + templates, object_templates); +} + +static inline vm_config_t * +configure_permanent_before_permanent(void) +{ + /* two permanent entries, both in-range */ + vm_object_template_t object_templates[] = { + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.permanent = true, .object = &object_templates[0]), + vm_entry_template(.permanent = true, .share_mode = SM_EMPTY), + END_ENTRIES + }; + return make_vm_config("permanent > two permanent entries", + templates, object_templates); +} + +static inline vm_config_t * +configure_permanent_before_allocation(void) +{ + /* + * permanent entry followed by allocation + * The third entry, outside the tested address range, + * is an unallocated hole. This tests rdar://144128567 + * along with test configure_permanent_before_allocation_2 + */ + vm_object_template_t object_templates[] = { + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.permanent = true, .object = &object_templates[0]), + vm_entry_template(), + hole_template, + END_ENTRIES + }; + return make_vm_config("permanent > permanent entry before allocation, hole outside", + templates, object_templates, 0, -DEFAULT_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_permanent_before_allocation_2(void) +{ + /* + * permanent entry followed by allocation + * The third entry, outside the tested address range, + * is an allocation to provoke rdar://144128567. + * Other than that bug the behavior should be + * identical to configure_permanent_before_allocation. 
+ */ + vm_object_template_t object_templates[] = { + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.permanent = true, .object = &object_templates[0]), + vm_entry_template(), + vm_entry_template(), + END_ENTRIES + }; + return make_vm_config("permanent > permanent entry before allocation, allocation outside", + templates, object_templates, 0, -DEFAULT_ENTRY_SIZE); +} + +static inline vm_config_t * +configure_permanent_before_hole(void) +{ + /* permanent entry followed by a hole */ + vm_object_template_t object_templates[] = { + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(.permanent = true, .object = &object_templates[0]), + hole_template, + END_ENTRIES + }; + return make_vm_config("permanent > permanent entry before hole", + templates, object_templates); +} + +static inline vm_config_t * +configure_permanent_after_allocation(void) +{ + /* allocation followed by a permanent entry */ + vm_object_template_t object_templates[] = { + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + vm_entry_template(), + vm_entry_template(.permanent = true, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("permanent > permanent entry after allocation", + templates, object_templates); +} + +static inline vm_config_t * +configure_permanent_after_hole(void) +{ + /* hole followed by a permanent entry */ + vm_object_template_t object_templates[] = { + vm_object_template(.fill_pattern = {Fill, 0x1234567890abcdef}), + END_OBJECTS + }; + vm_entry_template_t templates[] = { + hole_template, + vm_entry_template(.permanent = true, .object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config("permanent > permanent entry after hole", + templates, object_templates); +} + + +static inline vm_config_t * +configure_protection_single_common(vm_prot_t prot, vm_prot_t max) +{ + vm_entry_template_t templates[] = { + vm_entry_template(.protection = prot, .max_protection = max), + END_ENTRIES + }; + + TEMP_CSTRING(name, "protection > single entry prot/max %s/%s", + name_for_prot(prot), name_for_prot(max)); + return make_vm_config(name, templates); +} + +static inline vm_config_t * +configure_protection_pairs_common(vm_prot_t prot_left, vm_prot_t prot_right) +{ + vm_prot_t max_prot = VM_PROT_READ | VM_PROT_WRITE; + vm_entry_template_t templates[] = { + vm_entry_template(.protection = prot_left, .max_protection = max_prot), + vm_entry_template(.protection = prot_right, .max_protection = max_prot), + END_ENTRIES + }; + + TEMP_CSTRING(name, "protection > two entries prot/max %s/%s and %s/%s", + name_for_prot(prot_left), name_for_prot(max_prot), + name_for_prot(prot_right), name_for_prot(max_prot)); + return make_vm_config(name, templates); +} + +/* single entry with every prot/max combination (fixme no PROT_EXEC) */ + +/* prot/max ---/--- */ +static inline vm_config_t * +configure_protection_single_000_000(void) +{ + return configure_protection_single_common(VM_PROT_NONE, VM_PROT_NONE); +} + +/* prot/max r--/--- is disallowed */ + +/* prot/max -w-/--- is disallowed */ + +/* prot/max rw-/--- is disallowed */ + + +/* prot/max ---/r-- */ +static inline vm_config_t * +configure_protection_single_000_r00(void) +{ + return configure_protection_single_common(VM_PROT_NONE, VM_PROT_READ); +} + +/* prot/max r--/r-- */ +static inline vm_config_t * 
+configure_protection_single_r00_r00(void) +{ + return configure_protection_single_common(VM_PROT_READ, VM_PROT_READ); +} + +/* prot/max -w-/r-- is disallowed */ + +/* prot/max rw-/r-- is disallowed */ + + +/* prot/max ---/w-- */ +static inline vm_config_t * +configure_protection_single_000_0w0(void) +{ + return configure_protection_single_common(VM_PROT_NONE, VM_PROT_WRITE); +} + +/* prot/max r--/-w- is disallowed */ + +/* prot/max -w-/-w- */ +static inline vm_config_t * +configure_protection_single_0w0_0w0(void) +{ + return configure_protection_single_common(VM_PROT_WRITE, VM_PROT_WRITE); +} + +/* prot/max rw-/-w- is disallowed */ + + +/* prot/max ---/rw- */ +static inline vm_config_t * +configure_protection_single_000_rw0(void) +{ + return configure_protection_single_common(VM_PROT_NONE, VM_PROT_READ | VM_PROT_WRITE); +} + +/* prot/max r--/rw- */ +static inline vm_config_t * +configure_protection_single_r00_rw0(void) +{ + return configure_protection_single_common(VM_PROT_READ, VM_PROT_READ | VM_PROT_WRITE); +} + +/* prot/max -w-/rw- */ +static inline vm_config_t * +configure_protection_single_0w0_rw0(void) +{ + return configure_protection_single_common(VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE); +} + +/* prot/max rw-/rw- */ +static inline vm_config_t * +configure_protection_single_rw0_rw0(void) +{ + return configure_protection_single_common(VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE); +} + + +/* two entries with every pair of protections (fixme no PROT_EXEC) */ + +static inline vm_config_t * +configure_protection_pairs_000_000(void) +{ + return configure_protection_pairs_common(VM_PROT_NONE, VM_PROT_NONE); +} + +static inline vm_config_t * +configure_protection_pairs_000_r00(void) +{ + return configure_protection_pairs_common(VM_PROT_NONE, VM_PROT_READ); +} + +static inline vm_config_t * +configure_protection_pairs_000_0w0(void) +{ + return configure_protection_pairs_common(VM_PROT_NONE, VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_protection_pairs_000_rw0(void) +{ + return configure_protection_pairs_common(VM_PROT_NONE, VM_PROT_READ | VM_PROT_WRITE); +} + + +static inline vm_config_t * +configure_protection_pairs_r00_000(void) +{ + return configure_protection_pairs_common(VM_PROT_READ, VM_PROT_NONE); +} + +static inline vm_config_t * +configure_protection_pairs_r00_r00(void) +{ + return configure_protection_pairs_common(VM_PROT_READ, VM_PROT_READ); +} + +static inline vm_config_t * +configure_protection_pairs_r00_0w0(void) +{ + return configure_protection_pairs_common(VM_PROT_READ, VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_protection_pairs_r00_rw0(void) +{ + return configure_protection_pairs_common(VM_PROT_READ, VM_PROT_READ | VM_PROT_WRITE); +} + + +static inline vm_config_t * +configure_protection_pairs_0w0_000(void) +{ + return configure_protection_pairs_common(VM_PROT_WRITE, VM_PROT_NONE); +} + +static inline vm_config_t * +configure_protection_pairs_0w0_r00(void) +{ + return configure_protection_pairs_common(VM_PROT_WRITE, VM_PROT_READ); +} + +static inline vm_config_t * +configure_protection_pairs_0w0_0w0(void) +{ + return configure_protection_pairs_common(VM_PROT_WRITE, VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_protection_pairs_0w0_rw0(void) +{ + return configure_protection_pairs_common(VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE); +} + + +static inline vm_config_t * +configure_protection_pairs_rw0_000(void) +{ + return configure_protection_pairs_common(VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE); +} + 
+static inline vm_config_t * +configure_protection_pairs_rw0_r00(void) +{ + return configure_protection_pairs_common(VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ); +} + +static inline vm_config_t * +configure_protection_pairs_rw0_0w0(void) +{ + return configure_protection_pairs_common(VM_PROT_READ | VM_PROT_WRITE, VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_protection_pairs_rw0_rw0(void) +{ + return configure_protection_pairs_common(VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE); +} + + +/* submaps */ + +/* + * Common code for tests that are a single submap whose contents are a single entry + * but test at different start and end offsets within that entry. + */ +static inline vm_config_t * +configure_single_submap_single_entry_common( + const char *testname, + mach_vm_size_t start_offset, + mach_vm_size_t end_offset) +{ + vm_object_template_t submap_objects[] = { + vm_object_template(.fill_pattern = {Fill, 0x1111111111111111}), + END_OBJECTS + }; + vm_entry_template_t submap_entries[] = { + vm_entry_template(.object = &submap_objects[0]), + END_ENTRIES + }; + vm_object_template_t object_templates[] = { + submap_object_template( + .submap.entries = submap_entries, + .submap.objects = submap_objects), + END_OBJECTS + }; + vm_entry_template_t entry_templates[] = { + submap_entry_template(.object = &object_templates[0]), + END_ENTRIES + }; + return make_vm_config(testname, + entry_templates, object_templates, submap_entries, submap_objects, + start_offset, end_offset); +} + +static inline vm_config_t * +configure_single_submap_single_entry(void) +{ + /* + * test range consists of a single submap mapping + * which in turn contains a single entry + */ + return configure_single_submap_single_entry_common( + "submap > single entry > entire entry", + 0, 0 /* start and end offsets */); +} + +static inline vm_config_t * +configure_single_submap_single_entry_first_pages(void) +{ + /* + * test range consists of a single submap mapping + * which in turn contains a single entry + * and the address range to be tested + * excludes the end of that entry + */ + return configure_single_submap_single_entry_common( + "submap > single entry > first pages", + 0, -DEFAULT_PARTIAL_ENTRY_SIZE /* start and end offsets */); +} + +static inline vm_config_t * +configure_single_submap_single_entry_last_pages(void) +{ + /* + * test range consists of a single submap mapping + * which in turn contains a single entry + * and the address range to be tested + * excludes the start of that entry + */ + return configure_single_submap_single_entry_common( + "submap > single entry > last pages", + DEFAULT_PARTIAL_ENTRY_SIZE, 0 /* start and end offsets */); +} + +static inline vm_config_t * +configure_single_submap_single_entry_middle_pages(void) +{ + /* + * test range consists of a single submap mapping + * which in turn contains a single entry + * and the address range to be tested + * excludes the start and end of that entry + */ + return configure_single_submap_single_entry_common( + "submap > single entry > middle pages", + DEFAULT_PARTIAL_ENTRY_SIZE / 2, -(DEFAULT_PARTIAL_ENTRY_SIZE / 2) /* start and end offsets */); +} + + +static inline vm_config_t * +configure_single_submap_oversize_entry_common( + const char *testname, + mach_vm_address_t parent_offset, + mach_vm_size_t parent_size) +{ + /* + * submap contains a single entry of default size, + * parent map's view of the submap excludes some part of that entry + */ + assert(parent_offset < DEFAULT_ENTRY_SIZE); + assert(parent_offset + parent_size 
<= DEFAULT_ENTRY_SIZE); + + vm_object_template_t submap_objects[] = { + vm_object_template(.fill_pattern = {Fill, 0x1111111111111111}), + END_OBJECTS + }; + vm_entry_template_t submap_entries[] = { + vm_entry_template(.object = &submap_objects[0]), + END_ENTRIES + }; + vm_object_template_t object_templates[] = { + submap_object_template( + .submap.entries = submap_entries, + .submap.objects = submap_objects), + END_OBJECTS + }; + vm_entry_template_t entry_templates[] = { + submap_entry_template( + .object = &object_templates[0], + .offset = parent_offset, + .size = parent_size), + END_ENTRIES + }; + return make_vm_config(testname, + entry_templates, object_templates, + submap_entries, submap_objects, + 0, 0); +} + +static inline vm_config_t * +configure_single_submap_oversize_entry_at_start(void) +{ + /* + * submap contains a single entry, + * parent map's view of the submap excludes the start of that entry + */ + return configure_single_submap_oversize_entry_common( + "submap > oversize entry > oversize at start", + DEFAULT_ENTRY_SIZE / 2 /* parent_offset */, + DEFAULT_ENTRY_SIZE / 2 /* parent_size */); +} + +static inline vm_config_t * +configure_single_submap_oversize_entry_at_end(void) +{ + /* + * submap contains a single entry, + * parent map's view of the submap excludes the end of that entry + */ + return configure_single_submap_oversize_entry_common( + "submap > oversize entry > oversize at end", + 0 /* parent_offset */, + DEFAULT_ENTRY_SIZE / 2 /* parent_size */); +} + +static inline vm_config_t * +configure_single_submap_oversize_entry_at_both(void) +{ + /* + * submap contains a single entry, + * parent map's view of the submap excludes the start and end of that entry + */ + return configure_single_submap_oversize_entry_common( + "submap > oversize entry > oversize at both start and end", + DEFAULT_ENTRY_SIZE / 4 /* parent_offset */, + DEFAULT_ENTRY_SIZE / 2 /* parent_size */); +} + + +/* + * Common code for tests of a submap before or after a hole or allocation. 
+ */ +static inline vm_config_t * +configure_submap_beafterfore_entry( + const char *testname, + vm_entry_template_kind_t first, + vm_entry_template_kind_t second, + int submap_protection) +{ + vm_object_template_t submap_objects[] = { + vm_object_template(.fill_pattern = {Fill, 0x1111111111111111}), + END_OBJECTS + }; + vm_entry_template_t submap_entries[] = { + vm_entry_template( + .object = &submap_objects[0], + .protection = submap_protection, + .max_protection = submap_protection), + END_ENTRIES + }; + vm_object_template_t object_templates[] = { + submap_object_template( + .submap.entries = submap_entries, + .submap.objects = submap_objects), + END_OBJECTS + }; + vm_entry_template_t template_options[] = { + [Hole] = hole_template, + [Allocation] = vm_entry_template(), + [Submap] = submap_entry_template(.object = &object_templates[0]) + }; + /* entries must be Hole or Allocation or Submap */ + assert(first == Hole || first == Allocation || first == Submap); + assert(second == Hole || second == Allocation || second == Submap); + /* exactly one entry must be Submap */ + assert((first == Submap && second != Submap) || + (first != Submap && second == Submap)); + vm_entry_template_t entry_templates[] = { + template_options[first], + template_options[second], + END_ENTRIES + }; + return make_vm_config(testname, + entry_templates, object_templates, submap_entries, submap_objects, + 0, 0); +} + +static inline vm_config_t * +configure_submap_before_allocation(void) +{ + return configure_submap_beafterfore_entry( + "submap > submap before allocation", Submap, Allocation, + VM_PROT_READ | VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_submap_before_allocation_ro(void) +{ + return configure_submap_beafterfore_entry( + "submap > submap before allocation, read-only", Submap, Allocation, + VM_PROT_READ); +} + +static inline vm_config_t * +configure_submap_after_allocation(void) +{ + return configure_submap_beafterfore_entry( + "submap > submap after allocation", Allocation, Submap, + VM_PROT_READ | VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_submap_after_allocation_ro(void) +{ + return configure_submap_beafterfore_entry( + "submap > submap after allocation, read-only", Allocation, Submap, + VM_PROT_READ); +} + +static inline vm_config_t * +configure_submap_before_hole(void) +{ + return configure_submap_beafterfore_entry( + "submap > submap before hole", Submap, Hole, + VM_PROT_READ | VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_submap_before_hole_ro(void) +{ + return configure_submap_beafterfore_entry( + "submap > submap before hole, read-only", Submap, Hole, + VM_PROT_READ); +} + +static inline vm_config_t * +configure_submap_after_hole(void) +{ + return configure_submap_beafterfore_entry( + "submap > submap after hole", Hole, Submap, + VM_PROT_READ | VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_submap_after_hole_ro(void) +{ + return configure_submap_beafterfore_entry( + "submap > submap after hole, read-only", Hole, Submap, + VM_PROT_READ); +} + +static inline vm_config_t * +configure_submap_allocation_submap_one_entry_common( + const char *testname, + int submap_protection) +{ + /* + * submap has a single entry, but parent map entries are + * submap-allocation-submap, as if part of the submap mapping + * had been deallocated or unnested + */ + + vm_object_template_t submap_objects[] = { + vm_object_template(.fill_pattern = {Fill, 0x1111111111111111}), + END_OBJECTS + }; + vm_entry_template_t submap_entries[] = { + vm_entry_template( + 
.object = &submap_objects[0], + .size = DEFAULT_ENTRY_SIZE * 3, + .protection = submap_protection, + .max_protection = submap_protection), + END_ENTRIES + }; + vm_object_template_t object_templates[] = { + submap_object_template( + .submap.entries = submap_entries, + .submap.objects = submap_objects), + END_OBJECTS + }; + vm_entry_template_t entry_templates[] = { + submap_entry_template( + .object = &object_templates[0], + .offset = 0), + vm_entry_template(), + submap_entry_template( + .object = &object_templates[0], + .offset = DEFAULT_ENTRY_SIZE * 2), + END_ENTRIES + }; + return make_vm_config(testname, + entry_templates, object_templates, + submap_entries, submap_objects, + 0, 0); +} + +static inline vm_config_t * +configure_submap_allocation_submap_one_entry(void) +{ + return configure_submap_allocation_submap_one_entry_common( + "submap > submap-allocation-submap, one entry in submap", + VM_PROT_READ | VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_submap_allocation_submap_one_entry_ro(void) +{ + return configure_submap_allocation_submap_one_entry_common( + "submap > submap-allocation-submap, one entry in submap, read-only", + VM_PROT_READ); +} + +static inline vm_config_t * +configure_submap_allocation_submap_two_entries_common( + const char *testname, + int submap_protection) +{ + /* + * submap has two entries, but parent map entries are + * submap-allocation-submap, as if part of the submap mapping + * had been deallocated or unnested (not matching the submap + * entry boundaries) + */ + + const mach_vm_size_t parent_entry_size = DEFAULT_ENTRY_SIZE; + const mach_vm_size_t total_size = parent_entry_size * 3; + const mach_vm_size_t submap_entry_size = total_size / 2; + assert(parent_entry_size * 3 == submap_entry_size * 2); + + vm_object_template_t submap_objects[] = { + vm_object_template(.fill_pattern = {Fill, 0x1111111111111111}), + vm_object_template(.fill_pattern = {Fill, 0x2222222222222222}), + END_OBJECTS + }; + vm_entry_template_t submap_entries[] = { + vm_entry_template( + .object = &submap_objects[0], + .size = submap_entry_size, + .protection = submap_protection, + .max_protection = submap_protection), + vm_entry_template( + .object = &submap_objects[1], + .size = submap_entry_size, + .protection = submap_protection, + .max_protection = submap_protection), + END_ENTRIES + }; + vm_object_template_t object_templates[] = { + submap_object_template( + .submap.entries = submap_entries, + .submap.objects = submap_objects), + END_OBJECTS + }; + vm_entry_template_t entry_templates[] = { + submap_entry_template( + .object = &object_templates[0], + .offset = 0, + .size = parent_entry_size), + vm_entry_template(), + submap_entry_template( + .object = &object_templates[0], + .offset = parent_entry_size * 2, + .size = parent_entry_size), + END_ENTRIES + }; + return make_vm_config(testname, + entry_templates, object_templates, + submap_entries, submap_objects, + 0, 0); +} + +static inline vm_config_t * +configure_submap_allocation_submap_two_entries(void) +{ + return configure_submap_allocation_submap_two_entries_common( + "submap > submap-allocation-submap, two entries in submap", + VM_PROT_READ | VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_submap_allocation_submap_two_entries_ro(void) +{ + return configure_submap_allocation_submap_two_entries_common( + "submap > submap-allocation-submap, two entries in submap, read-only", + VM_PROT_READ); +} + +static inline vm_config_t * +configure_submap_allocation_submap_three_entries_common( + const char *testname, + 
int submap_protection) +{ + /* + * submap has three entries, parent map entries are + * submap-allocation-submap, as if part of the submap mapping + * had been deallocated or unnested on the submap entry boundaries + */ + + vm_object_template_t submap_objects[] = { + vm_object_template(.fill_pattern = {Fill, 0x1111111111111111}), + vm_object_template(.fill_pattern = {Fill, 0x2222222222222222}), + vm_object_template(.fill_pattern = {Fill, 0x3333333333333333}), + END_OBJECTS + }; + vm_entry_template_t submap_entries[] = { + vm_entry_template( + .object = &submap_objects[0], + .protection = submap_protection, + .max_protection = submap_protection), + vm_entry_template( + .object = &submap_objects[1], + .protection = submap_protection, + .max_protection = submap_protection), + vm_entry_template( + .object = &submap_objects[2], + .protection = submap_protection, + .max_protection = submap_protection), + END_ENTRIES + }; + vm_object_template_t object_templates[] = { + submap_object_template( + .submap.entries = submap_entries, + .submap.objects = submap_objects), + END_OBJECTS + }; + vm_entry_template_t entry_templates[] = { + submap_entry_template( + .object = &object_templates[0], + .offset = 0), + vm_entry_template(), + submap_entry_template( + .object = &object_templates[0], + .offset = DEFAULT_ENTRY_SIZE * 2), + END_ENTRIES + }; + return make_vm_config(testname, + entry_templates, object_templates, + submap_entries, submap_objects, + 0, 0); +} + +static inline vm_config_t * +configure_submap_allocation_submap_three_entries(void) +{ + return configure_submap_allocation_submap_three_entries_common( + "submap > submap-allocation-submap, three entries in submap", + VM_PROT_READ | VM_PROT_WRITE); +} + +static inline vm_config_t * +configure_submap_allocation_submap_three_entries_ro(void) +{ + return configure_submap_allocation_submap_three_entries_common( + "submap > submap-allocation-submap, three entries in submap, read-only", + VM_PROT_READ); +} + + +/* add new tests here (configure_ functions) */ + + +typedef struct { + test_fn_t single_entry_1; + test_fn_t single_entry_2; + test_fn_t single_entry_3; + test_fn_t single_entry_4; + + test_fn_t multiple_entries_1; + test_fn_t multiple_entries_2; + test_fn_t multiple_entries_3; + test_fn_t multiple_entries_4; + test_fn_t multiple_entries_5; + test_fn_t multiple_entries_6; + + test_fn_t some_holes_1; + test_fn_t some_holes_2; + test_fn_t some_holes_3; + test_fn_t some_holes_4; + test_fn_t some_holes_5; + test_fn_t some_holes_6; + test_fn_t some_holes_7; + test_fn_t some_holes_8; + test_fn_t some_holes_9; + test_fn_t some_holes_10; + test_fn_t some_holes_11; + test_fn_t some_holes_12; + + test_fn_t all_holes_1; + test_fn_t all_holes_2; + test_fn_t all_holes_3; + test_fn_t all_holes_4; + + test_fn_t null_entry; + test_fn_t nonresident_entry; + test_fn_t resident_entry; + + test_fn_t shared_entry; + test_fn_t shared_entry_discontiguous; + test_fn_t shared_entry_partial; + test_fn_t shared_entry_pairs; + test_fn_t shared_entry_x1000; + + test_fn_t cow_entry; + test_fn_t cow_unreferenced; + test_fn_t cow_nocow; + test_fn_t nocow_cow; + test_fn_t cow_unreadable; + test_fn_t cow_unwriteable; + + test_fn_t permanent_entry; + test_fn_t permanent_before_permanent; + test_fn_t permanent_before_allocation; + test_fn_t permanent_before_allocation_2; + test_fn_t permanent_before_hole; + test_fn_t permanent_after_allocation; + test_fn_t permanent_after_hole; + + test_fn_t single_submap_single_entry; + test_fn_t single_submap_single_entry_first_pages; + 
test_fn_t single_submap_single_entry_last_pages; + test_fn_t single_submap_single_entry_middle_pages; + test_fn_t single_submap_oversize_entry_at_start; + test_fn_t single_submap_oversize_entry_at_end; + test_fn_t single_submap_oversize_entry_at_both; + + test_fn_t single_submap_single_entry_ro; + test_fn_t single_submap_single_entry_first_pages_ro; + test_fn_t single_submap_single_entry_last_pages_ro; + test_fn_t single_submap_single_entry_middle_pages_ro; + test_fn_t single_submap_oversize_entry_at_start_ro; + test_fn_t single_submap_oversize_entry_at_end_ro; + test_fn_t single_submap_oversize_entry_at_both_ro; + + test_fn_t submap_before_allocation; + test_fn_t submap_after_allocation; + test_fn_t submap_before_hole; + test_fn_t submap_after_hole; + test_fn_t submap_allocation_submap_one_entry; + test_fn_t submap_allocation_submap_two_entries; + test_fn_t submap_allocation_submap_three_entries; + + test_fn_t submap_before_allocation_ro; + test_fn_t submap_after_allocation_ro; + test_fn_t submap_before_hole_ro; + test_fn_t submap_after_hole_ro; + test_fn_t submap_allocation_submap_one_entry_ro; + test_fn_t submap_allocation_submap_two_entries_ro; + test_fn_t submap_allocation_submap_three_entries_ro; + + test_fn_t protection_single_000_000; + test_fn_t protection_single_000_r00; + test_fn_t protection_single_000_0w0; + test_fn_t protection_single_000_rw0; + test_fn_t protection_single_r00_r00; + test_fn_t protection_single_r00_rw0; + test_fn_t protection_single_0w0_0w0; + test_fn_t protection_single_0w0_rw0; + test_fn_t protection_single_rw0_rw0; + + test_fn_t protection_pairs_000_000; + test_fn_t protection_pairs_000_r00; + test_fn_t protection_pairs_000_0w0; + test_fn_t protection_pairs_000_rw0; + test_fn_t protection_pairs_r00_000; + test_fn_t protection_pairs_r00_r00; + test_fn_t protection_pairs_r00_0w0; + test_fn_t protection_pairs_r00_rw0; + test_fn_t protection_pairs_0w0_000; + test_fn_t protection_pairs_0w0_r00; + test_fn_t protection_pairs_0w0_0w0; + test_fn_t protection_pairs_0w0_rw0; + test_fn_t protection_pairs_rw0_000; + test_fn_t protection_pairs_rw0_r00; + test_fn_t protection_pairs_rw0_0w0; + test_fn_t protection_pairs_rw0_rw0; + + /* add new tests here */ +} vm_tests_t; + + +/* + * test_is_unimplemented is used by test files + * as a value in struct vm_tests_t to indicate that + * a particular test case is deliberately not implemented. + */ +extern test_result_t +test_is_unimplemented( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size); + +/* + * Return true if the process is running under Rosetta translation + * https://developer.apple.com/documentation/apple-silicon/about-the-rosetta-translation-environment#Determine-Whether-Your-App-Is-Running-as-a-Translated-Binary + */ +static bool +isRosetta() +{ +#if KERNEL + return false; +#else + int out_value = 0; + size_t io_size = sizeof(out_value); + if (sysctlbyname("sysctl.proc_translated", &out_value, &io_size, NULL, 0) == 0) { + assert(io_size >= sizeof(out_value)); + return out_value; + } + return false; +#endif +} + +/* + * Return true if the task map's page size is less than the VM page size. 
+ * (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) + * for example, Rosetta Intel on ARM + */ +static inline bool +task_page_size_less_than_vm_page_size(void) +{ + size_t map_page_size = PAGE_SIZE; + uint32_t vm_page_size = 0; + size_t len = sizeof(vm_page_size); + int err = sysctlbyname("vm.pagesize", &vm_page_size, &len, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname('vm.pagesize')"); + T_QUIET; T_ASSERT_GE(len, sizeof(vm_page_size), "sysctl result size"); + return map_page_size < vm_page_size; +} + +extern void +run_one_vm_test( + const char *filename, + const char *funcname, + const char *testname, + configure_fn_t configure_fn, + test_fn_t test_fn); + +static inline void +run_vm_tests( + const char *funcname, + const char *filename, + vm_tests_t *tests, + int argc, + char * const *argv) +{ + /* Allow naming a single test to run on the command line. */ + const char *test_to_run = NULL; + bool ran_a_test = false; + if (argc == 1) { + test_to_run = argv[0]; + T_LOG("RUNNING ONLY ONE TEST: %s %s", funcname, test_to_run); + } + + /* + * rdar://138495830 tests fail on Rosetta because of allocation holes + * We run tests that don't have holes and skip those that do. + */ + bool test_holes = true; + if (isRosetta()) { + T_LOG("SKIPPING TESTS of allocation holes on Rosetta (rdar://138495830)"); + test_holes = false; + } + +#define RUN_TEST(testname) \ + ({ \ + if (test_to_run == NULL || 0 == strcmp(#testname, test_to_run)) { \ + ran_a_test = true; \ + run_one_vm_test(filename, funcname, #testname, \ + configure_##testname, tests->testname); \ + } \ + }) + + /* single vm map entry and parts thereof, no holes */ + RUN_TEST(single_entry_1); + RUN_TEST(single_entry_2); + RUN_TEST(single_entry_3); + RUN_TEST(single_entry_4); + + /* multiple map entries and parts thereof, no holes */ + RUN_TEST(multiple_entries_1); + RUN_TEST(multiple_entries_2); + RUN_TEST(multiple_entries_3); + RUN_TEST(multiple_entries_4); + RUN_TEST(multiple_entries_5); + RUN_TEST(multiple_entries_6); + + /* ranges with holes */ + if (test_holes) { + RUN_TEST(some_holes_1); + RUN_TEST(some_holes_2); + RUN_TEST(some_holes_3); + RUN_TEST(some_holes_4); + RUN_TEST(some_holes_5); + RUN_TEST(some_holes_6); + RUN_TEST(some_holes_7); + RUN_TEST(some_holes_8); + RUN_TEST(some_holes_9); + RUN_TEST(some_holes_10); + RUN_TEST(some_holes_11); + RUN_TEST(some_holes_12); + } + + /* ranges that are nothing but holes */ + if (test_holes) { + RUN_TEST(all_holes_1); + RUN_TEST(all_holes_2); + RUN_TEST(all_holes_3); + RUN_TEST(all_holes_4); + } + + /* residency */ + RUN_TEST(null_entry); + RUN_TEST(nonresident_entry); // fixme broken in create_vm_state + RUN_TEST(resident_entry); + + /* sharing */ + RUN_TEST(shared_entry); + RUN_TEST(shared_entry_discontiguous); + RUN_TEST(shared_entry_partial); + RUN_TEST(shared_entry_pairs); + RUN_TEST(shared_entry_x1000); + + /* cow */ + RUN_TEST(cow_entry); + RUN_TEST(cow_unreferenced); + RUN_TEST(cow_nocow); + RUN_TEST(nocow_cow); + RUN_TEST(cow_unreadable); + RUN_TEST(cow_unwriteable); + + /* permanent */ + RUN_TEST(permanent_entry); + RUN_TEST(permanent_before_permanent); + if (test_holes) { + /* this test does have a required hole, after the other allocations */ + RUN_TEST(permanent_before_allocation); + } + RUN_TEST(permanent_before_allocation_2); + if (test_holes) { + RUN_TEST(permanent_before_hole); + } + RUN_TEST(permanent_after_allocation); + if (test_holes) { + RUN_TEST(permanent_after_hole); + } + + /* submaps */ + RUN_TEST(single_submap_single_entry); + 
RUN_TEST(single_submap_single_entry_first_pages); + RUN_TEST(single_submap_single_entry_last_pages); + RUN_TEST(single_submap_single_entry_middle_pages); + RUN_TEST(single_submap_oversize_entry_at_start); + RUN_TEST(single_submap_oversize_entry_at_end); + RUN_TEST(single_submap_oversize_entry_at_both); + + RUN_TEST(submap_before_allocation); + RUN_TEST(submap_before_allocation_ro); + RUN_TEST(submap_after_allocation); + RUN_TEST(submap_after_allocation_ro); + if (test_holes) { + RUN_TEST(submap_before_hole); + RUN_TEST(submap_before_hole_ro); + RUN_TEST(submap_after_hole); + RUN_TEST(submap_after_hole_ro); + } + RUN_TEST(submap_allocation_submap_one_entry); + RUN_TEST(submap_allocation_submap_one_entry_ro); + RUN_TEST(submap_allocation_submap_two_entries); + RUN_TEST(submap_allocation_submap_two_entries_ro); + RUN_TEST(submap_allocation_submap_three_entries); + RUN_TEST(submap_allocation_submap_three_entries_ro); + + /* protection */ + RUN_TEST(protection_single_000_000); + RUN_TEST(protection_single_000_r00); + RUN_TEST(protection_single_r00_r00); + RUN_TEST(protection_single_000_0w0); + RUN_TEST(protection_single_0w0_0w0); + RUN_TEST(protection_single_000_rw0); + RUN_TEST(protection_single_r00_rw0); + RUN_TEST(protection_single_0w0_rw0); + RUN_TEST(protection_single_rw0_rw0); + + RUN_TEST(protection_pairs_000_000); + RUN_TEST(protection_pairs_000_r00); + RUN_TEST(protection_pairs_000_0w0); + RUN_TEST(protection_pairs_000_rw0); + RUN_TEST(protection_pairs_r00_000); + RUN_TEST(protection_pairs_r00_r00); + RUN_TEST(protection_pairs_r00_0w0); + RUN_TEST(protection_pairs_r00_rw0); + RUN_TEST(protection_pairs_0w0_000); + RUN_TEST(protection_pairs_0w0_r00); + RUN_TEST(protection_pairs_0w0_0w0); + RUN_TEST(protection_pairs_0w0_rw0); + RUN_TEST(protection_pairs_rw0_000); + RUN_TEST(protection_pairs_rw0_r00); + RUN_TEST(protection_pairs_rw0_0w0); + RUN_TEST(protection_pairs_rw0_rw0); + + /* add new tests here */ + +#undef RUN_TEST + + if (test_to_run != NULL && !ran_a_test) { + T_FAIL("no test named '%s'", test_to_run); + } +} + +#endif /* VM_CONFIGURATOR_TESTS_H */ diff --git a/tests/vm/configurator_fault.c b/tests/vm/configurator_fault.c new file mode 100644 index 000000000..6774a6c5c --- /dev/null +++ b/tests/vm/configurator_fault.c @@ -0,0 +1,536 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_fault_read.c + * + * Test read and write faults with many different VM states. + */ + +#include +#include "configurator/vm_configurator_tests.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + + +static bool +test_fault_one_checker_in_address_range( + vm_entry_checker_t *checker, + bool is_write_fault, + bool in_submap, + mach_vm_address_t checked_address, + mach_vm_size_t checked_size) +{ + TEMP_CSTRING(message, "after %s 0x%llx..0x%llx%s", + is_write_fault ? "writing" : "reading", + checked_address, checked_address + checked_size, + in_submap ? " (in submap)" : ""); + bool verify_reads = !is_write_fault; + bool verify_writes = is_write_fault; + bool good = verify_checker_faultability_in_address_range(checker, + message, verify_reads, verify_writes, checked_address, checked_size); + return good; +} + +/* + * Call verify_checker_faultability() for one checker. + * Advance *inout_next_address_to_fault past it. + */ +static bool +test_fault_one_checker( + vm_entry_checker_t *checker, + bool is_write_fault, + bool in_submap, + mach_vm_address_t * const inout_next_address_to_fault) +{ + bool good = test_fault_one_checker_in_address_range(checker, + is_write_fault, in_submap, checker->address, checker->size); + *inout_next_address_to_fault = checker_end_address(checker); + return good; +} + +/* + * Call verify_checker_faultability() for one allocation checker. + * Advance *inout_next_address_to_fault past it. + */ +static bool +test_fault_one_allocation( + checker_list_t *checker_list, + vm_entry_checker_t *checker, + bool is_write_fault, + bool in_submap, + mach_vm_address_t *const inout_next_address_to_fault) +{ + /* fault should not affect COW */ + checker_fault_for_prot_not_cow(checker_list, checker, + is_write_fault ? VM_PROT_WRITE : VM_PROT_READ); + return test_fault_one_checker(checker, is_write_fault, in_submap, inout_next_address_to_fault); +} + +/* + * Call verify_checker_faultability() for one parent map submap checker, + * or some portion thereof. + * Advance *inout_next_address_to_fault past the verified range. + */ +static bool +test_fault_one_submap( + checker_list_t *checker_list, + vm_entry_checker_t *submap_parent, + bool is_write_fault, + mach_vm_address_t *const inout_next_address_to_fault) +{ + mach_vm_address_t next_address_to_fault = *inout_next_address_to_fault; + + /* + * Verify up to one entry in the submap. + * The caller's loop will proceed through all entries in the submap. + */ + + /* Write fault unnests up to one entry in the submap, if necessary. */ + if (is_write_fault) { + mach_vm_address_t unnest_address = next_address_to_fault; + vm_entry_checker_t *unnested_checker = + checker_list_try_unnest_one_entry_in_submap(checker_list, submap_parent, + true /* unnest_readonly */, true /* all_overwritten */, + &unnest_address); + if (unnested_checker != NULL) { + /* + * Unnest occurred. Don't change *inout_next_address_to_fault + * and instead let the caller test this unnested entry's + * faultability in its next iteration. + */ + return true; + } + } + + /* + * Did not unnest. Fault the nested entry (allocation or hole). 
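The faultability checks above ultimately come down to touching every page of a checker's range and observing whether the access faults; the helpers presumably do this under an exception catcher so that expected EXC_BAD_ACCESS faults are recorded rather than fatal. A minimal sketch of the page-touching side, valid only for ranges that are actually accessible:

#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>

/*
 * Touch every page of [start, start + size): one volatile load per page,
 * plus a store when do_write is set. Each access forces the kernel to
 * resolve a fault if the page is not already resident or writable.
 */
static void
touch_pages(uintptr_t start, size_t size, bool do_write)
{
	size_t page = (size_t)getpagesize();
	for (uintptr_t addr = start; addr < start + size; addr += page) {
		volatile uint8_t *p = (volatile uint8_t *)addr;
		uint8_t value = *p;             /* read fault, if any */
		if (do_write) {
			*p = value;             /* write fault, if any */
		}
	}
}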
+ * Don't fault outside the parent map's view of the submap. + */ + + /* Find the checker for the submap's entry at this address. */ + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(submap_parent); + vm_entry_checker_t *checker = + checker_list_find_checker(submap_checkers, next_address_to_fault); + + /* Compute the extent of the submap content checker that is visible to the parent map. */ + mach_vm_address_t clamped_checker_address = checker->address; + mach_vm_size_t clamped_checker_size = checker->size; + clamp_address_size_to_checker(&clamped_checker_address, &clamped_checker_size, submap_parent); + + assert(checker->kind == Allocation || checker->kind == Hole); + *inout_next_address_to_fault = clamped_checker_address + clamped_checker_size; + return test_fault_one_checker_in_address_range(checker, is_write_fault, true, + clamped_checker_address, clamped_checker_size); +} + +static test_result_t +test_fault_common( + checker_list_t *checker_list, + mach_vm_address_t range_start, + mach_vm_size_t range_size, + bool is_write_fault, /* true for write fault, false for read fault */ + bool in_submap) +{ + /* + * Read or write all pages in one checker, then verify the VM state. Repeat for all checkers. + * Reading or writing in holes must provoke EXC_BAD_ACCESS (KERN_INVALID_ADDRESS). + * Reading or writing unreadable regions must provoke EXC_BAD_ACCESS (KERN_PROTECTION_FAILURE). + * Writing unwriteable regions must provoke EXC_BAD_ACCESS (KERN_PROTECTION_FAILURE). + * + * (TODO page modeling) this accesses outside [range_start, range_size) + * when the range starts or ends inside an entry + * need more precise page tracking to do better + */ + + /* not FOREACH_CHECKER because submap unnesting breaks it */ + mach_vm_address_t next_address_to_fault = range_start; + while (next_address_to_fault < range_start + range_size) { + vm_entry_checker_t *checker = checker_list_find_checker(checker_list, next_address_to_fault); + switch (checker->kind) { + case Allocation: + if (!test_fault_one_allocation( + checker_list, checker, is_write_fault, + in_submap, &next_address_to_fault)) { + goto failed; + } + break; + case Hole: + if (!test_fault_one_checker( + checker, is_write_fault, + in_submap, &next_address_to_fault)) { + goto failed; + } + break; + case Submap: + assert(!in_submap && "nested submaps not allowed"); + if (!test_fault_one_submap( + checker_list, checker, is_write_fault, + &next_address_to_fault)) { + goto failed; + } + break; + default: + assert(0); + } + } + + return TestSucceeded; + +failed: + T_LOG("*** after incomplete verification of faults: all expected ***"); + dump_checker_range(checker_list->entries); + T_LOG("*** after incomplete verification of faults: all actual ***"); + dump_region_info_for_entries(checker_list->entries); + return TestFailed; +} + +static test_result_t +test_fault_read( + checker_list_t *checker_list, + mach_vm_address_t range_start, + mach_vm_size_t range_size) +{ + return test_fault_common(checker_list, range_start, range_size, + false /* is_write_fault */, false /* in_submap */); +} + +static test_result_t +test_fault_write( + checker_list_t *checker_list, + mach_vm_address_t range_start, + mach_vm_size_t range_size) +{ + return test_fault_common(checker_list, range_start, range_size, + true /* is_write_fault */, false /* in_submap */); +} + + +/* + * Resolves COW. Assumes the write operation writes to the entire object, + * so there are no shared pages remaining and the new object's shadow + * chain collapses. 
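The expectations spelled out in the test_fault_common() comment reduce to a small decision: an access in a hole should raise EXC_BAD_ACCESS with KERN_INVALID_ADDRESS, an access that lacks the needed permission should raise EXC_BAD_ACCESS with KERN_PROTECTION_FAILURE, and everything else should succeed. A sketch of that rule as a standalone function (entry_kind_t is an illustrative stand-in for the checker kinds; the real checkers carry more state than this):

#include <stdbool.h>
#include <mach/kern_return.h>
#include <mach/vm_prot.h>

/* Illustrative mirror of the checker kinds used in this file. */
typedef enum { EntryHole, EntryAllocation } entry_kind_t;

/*
 * Expected outcome of a user access: KERN_SUCCESS means no EXC_BAD_ACCESS is
 * expected; otherwise this is the code expected to accompany the exception.
 */
static kern_return_t
expected_fault_code(entry_kind_t kind, vm_prot_t protection, bool is_write)
{
	if (kind == EntryHole) {
		return KERN_INVALID_ADDRESS;
	}
	vm_prot_t needed = is_write ? VM_PROT_WRITE : VM_PROT_READ;
	if ((protection & needed) != needed) {
		return KERN_PROTECTION_FAILURE;
	}
	return KERN_SUCCESS;
}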
+ */ +static void +checker_make_cow_private_with_collapsed_shadow_chain( + checker_list_t *checker_list, + vm_entry_checker_t *checker) +{ + assert(checker->needs_copy); + + if (checker->object->self_ref_count == 1) { + /* + * COW but not shared with anything else. + * VM resolves COW by using the same object. + */ + checker->needs_copy = false; + return; + } + + /* make new object */ + vm_object_checker_t *obj_checker = object_checker_clone(checker->object); + checker_list_append_object(checker_list, obj_checker); + + /* change object and entry to private */ + checker->needs_copy = false; + + /* set new object (decreasing previous object's self_ref_count) */ + checker_set_object(checker, obj_checker); +} + +static test_result_t +test_fault_write_cow_1st( + checker_list_t *checker_list, + mach_vm_address_t range_start, + mach_vm_size_t range_size) +{ + /* + * 1st entry is COW. + * Resolve COW because we're writing to it. + * We write to the entire entry so no shadow chain remains. + */ + checker_make_cow_private_with_collapsed_shadow_chain( + checker_list, checker_list_nth(checker_list, 0)); + return test_fault_write(checker_list, range_start, range_size); +} + +static test_result_t +test_fault_write_cow_2nd( + checker_list_t *checker_list, + mach_vm_address_t range_start, + mach_vm_size_t range_size) +{ + /* + * 2nd entry is COW. + * Resolve COW because we're writing to it. + * We write to the entire entry so no shadow chain remains. + */ + checker_make_cow_private_with_collapsed_shadow_chain( + checker_list, checker_list_nth(checker_list, 1)); + return test_fault_write(checker_list, range_start, range_size); +} + +T_DECL(fault_read, + "perform read faults with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = test_fault_read, + .single_entry_2 = test_fault_read, + .single_entry_3 = test_fault_read, + .single_entry_4 = test_fault_read, + + .multiple_entries_1 = test_fault_read, + .multiple_entries_2 = test_fault_read, + .multiple_entries_3 = test_fault_read, + .multiple_entries_4 = test_fault_read, + .multiple_entries_5 = test_fault_read, + .multiple_entries_6 = test_fault_read, + + .some_holes_1 = test_fault_read, + .some_holes_2 = test_fault_read, + .some_holes_3 = test_fault_read, + .some_holes_4 = test_fault_read, + .some_holes_5 = test_fault_read, + .some_holes_6 = test_fault_read, + .some_holes_7 = test_fault_read, + .some_holes_8 = test_fault_read, + .some_holes_9 = test_fault_read, + .some_holes_10 = test_fault_read, + .some_holes_11 = test_fault_read, + .some_holes_12 = test_fault_read, + + .all_holes_1 = test_fault_read, + .all_holes_2 = test_fault_read, + .all_holes_3 = test_fault_read, + .all_holes_4 = test_fault_read, + + .null_entry = test_fault_read, + .nonresident_entry = test_fault_read, + .resident_entry = test_fault_read, + + /* TODO move pages_resident from entry checker to object checker */ + .shared_entry = test_is_unimplemented, + .shared_entry_discontiguous = test_is_unimplemented, + .shared_entry_partial = test_is_unimplemented, + .shared_entry_pairs = test_is_unimplemented, + .shared_entry_x1000 = test_is_unimplemented, + + .cow_entry = test_fault_read, + .cow_unreferenced = test_fault_read, + .cow_nocow = test_fault_read, + .nocow_cow = test_fault_read, + .cow_unreadable = test_fault_read, + .cow_unwriteable = test_fault_read, + + .permanent_entry = test_fault_read, + .permanent_before_permanent = test_fault_read, + .permanent_before_allocation = test_fault_read, + .permanent_before_allocation_2 = test_fault_read, + .permanent_before_hole 
= test_fault_read, + .permanent_after_allocation = test_fault_read, + .permanent_after_hole = test_fault_read, + + .single_submap_single_entry = test_fault_read, + .single_submap_single_entry_first_pages = test_fault_read, + .single_submap_single_entry_last_pages = test_fault_read, + .single_submap_single_entry_middle_pages = test_fault_read, + .single_submap_oversize_entry_at_start = test_fault_read, + .single_submap_oversize_entry_at_end = test_fault_read, + .single_submap_oversize_entry_at_both = test_fault_read, + + .submap_before_allocation = test_fault_read, + .submap_after_allocation = test_fault_read, + .submap_before_hole = test_fault_read, + .submap_after_hole = test_fault_read, + .submap_allocation_submap_one_entry = test_fault_read, + .submap_allocation_submap_two_entries = test_fault_read, + .submap_allocation_submap_three_entries = test_fault_read, + + .submap_before_allocation_ro = test_fault_read, + .submap_after_allocation_ro = test_fault_read, + .submap_before_hole_ro = test_fault_read, + .submap_after_hole_ro = test_fault_read, + .submap_allocation_submap_one_entry_ro = test_fault_read, + .submap_allocation_submap_two_entries_ro = test_fault_read, + .submap_allocation_submap_three_entries_ro = test_fault_read, + + .protection_single_000_000 = test_fault_read, + .protection_single_000_r00 = test_fault_read, + .protection_single_r00_r00 = test_fault_read, + .protection_single_000_0w0 = test_fault_read, + .protection_single_0w0_0w0 = test_fault_read, + .protection_single_000_rw0 = test_fault_read, + .protection_single_r00_rw0 = test_fault_read, + .protection_single_0w0_rw0 = test_fault_read, + .protection_single_rw0_rw0 = test_fault_read, + + .protection_pairs_000_000 = test_fault_read, + .protection_pairs_000_r00 = test_fault_read, + .protection_pairs_000_0w0 = test_fault_read, + .protection_pairs_000_rw0 = test_fault_read, + .protection_pairs_r00_000 = test_fault_read, + .protection_pairs_r00_r00 = test_fault_read, + .protection_pairs_r00_0w0 = test_fault_read, + .protection_pairs_r00_rw0 = test_fault_read, + .protection_pairs_0w0_000 = test_fault_read, + .protection_pairs_0w0_r00 = test_fault_read, + .protection_pairs_0w0_0w0 = test_fault_read, + .protection_pairs_0w0_rw0 = test_fault_read, + .protection_pairs_rw0_000 = test_fault_read, + .protection_pairs_rw0_r00 = test_fault_read, + .protection_pairs_rw0_0w0 = test_fault_read, + .protection_pairs_rw0_rw0 = test_fault_read, + }; + + run_vm_tests("fault_read", __FILE__, &tests, argc, argv); +} + + +T_DECL(fault_write, + "perform write faults with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = test_fault_write, + .single_entry_2 = test_fault_write, + .single_entry_3 = test_fault_write, + .single_entry_4 = test_fault_write, + + .multiple_entries_1 = test_fault_write, + .multiple_entries_2 = test_fault_write, + .multiple_entries_3 = test_fault_write, + .multiple_entries_4 = test_fault_write, + .multiple_entries_5 = test_fault_write, + .multiple_entries_6 = test_fault_write, + + .some_holes_1 = test_fault_write, + .some_holes_2 = test_fault_write, + .some_holes_3 = test_fault_write, + .some_holes_4 = test_fault_write, + .some_holes_5 = test_fault_write, + .some_holes_6 = test_fault_write, + .some_holes_7 = test_fault_write, + .some_holes_8 = test_fault_write, + .some_holes_9 = test_fault_write, + .some_holes_10 = test_fault_write, + .some_holes_11 = test_fault_write, + .some_holes_12 = test_fault_write, + + .all_holes_1 = test_fault_write, + .all_holes_2 = test_fault_write, + .all_holes_3 = 
test_fault_write, + .all_holes_4 = test_fault_write, + + .null_entry = test_fault_write, + .nonresident_entry = test_fault_write, + .resident_entry = test_fault_write, + + /* TODO move pages_resident from entry checker to object checker */ + .shared_entry = test_is_unimplemented, + .shared_entry_discontiguous = test_is_unimplemented, + .shared_entry_partial = test_is_unimplemented, + .shared_entry_pairs = test_is_unimplemented, + .shared_entry_x1000 = test_is_unimplemented, + + .cow_entry = test_fault_write_cow_1st, + .cow_unreferenced = test_fault_write_cow_1st, + .cow_nocow = test_fault_write_cow_1st, + .nocow_cow = test_fault_write_cow_2nd, + .cow_unreadable = test_fault_write, + .cow_unwriteable = test_fault_write, + + .permanent_entry = test_fault_write, + .permanent_before_permanent = test_fault_write, + .permanent_before_allocation = test_fault_write, + .permanent_before_allocation_2 = test_fault_write, + .permanent_before_hole = test_fault_write, + .permanent_after_allocation = test_fault_write, + .permanent_after_hole = test_fault_write, + + .single_submap_single_entry = test_fault_write, + .single_submap_single_entry_first_pages = test_fault_write, + .single_submap_single_entry_last_pages = test_fault_write, + .single_submap_single_entry_middle_pages = test_fault_write, + .single_submap_oversize_entry_at_start = test_fault_write, + .single_submap_oversize_entry_at_end = test_fault_write, + .single_submap_oversize_entry_at_both = test_fault_write, + + /* TODO: fix submap_allocation_submap tests */ + .submap_before_allocation = test_fault_write, + .submap_after_allocation = test_fault_write, + .submap_before_hole = test_fault_write, + .submap_after_hole = test_fault_write, + .submap_allocation_submap_one_entry = test_is_unimplemented, + .submap_allocation_submap_two_entries = test_is_unimplemented, + .submap_allocation_submap_three_entries = test_is_unimplemented, + + .submap_before_allocation_ro = test_fault_write, + .submap_after_allocation_ro = test_fault_write, + .submap_before_hole_ro = test_fault_write, + .submap_after_hole_ro = test_fault_write, + .submap_allocation_submap_one_entry_ro = test_is_unimplemented, + .submap_allocation_submap_two_entries_ro = test_is_unimplemented, + .submap_allocation_submap_three_entries_ro = test_is_unimplemented, + + .protection_single_000_000 = test_fault_write, + .protection_single_000_r00 = test_fault_write, + .protection_single_r00_r00 = test_fault_write, + .protection_single_000_0w0 = test_fault_write, + .protection_single_0w0_0w0 = test_fault_write, + .protection_single_000_rw0 = test_fault_write, + .protection_single_r00_rw0 = test_fault_write, + .protection_single_0w0_rw0 = test_fault_write, + .protection_single_rw0_rw0 = test_fault_write, + + .protection_pairs_000_000 = test_fault_write, + .protection_pairs_000_r00 = test_fault_write, + .protection_pairs_000_0w0 = test_fault_write, + .protection_pairs_000_rw0 = test_fault_write, + .protection_pairs_r00_000 = test_fault_write, + .protection_pairs_r00_r00 = test_fault_write, + .protection_pairs_r00_0w0 = test_fault_write, + .protection_pairs_r00_rw0 = test_fault_write, + .protection_pairs_0w0_000 = test_fault_write, + .protection_pairs_0w0_r00 = test_fault_write, + .protection_pairs_0w0_0w0 = test_fault_write, + .protection_pairs_0w0_rw0 = test_fault_write, + .protection_pairs_rw0_000 = test_fault_write, + .protection_pairs_rw0_r00 = test_fault_write, + .protection_pairs_rw0_0w0 = test_fault_write, + .protection_pairs_rw0_rw0 = test_fault_write, + }; + + run_vm_tests("fault_write", 
__FILE__, &tests, argc, argv); +} diff --git a/tests/vm/configurator_mincore.c b/tests/vm/configurator_mincore.c new file mode 100644 index 000000000..87a9a59c7 --- /dev/null +++ b/tests/vm/configurator_mincore.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_mincore.c + * + * Test mincore with many different VM states. + */ + +#include +#include +#include + +#include "configurator/vm_configurator_tests.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + +/* + * This implementation can model any successful call to mincore. + */ +static test_result_t +successful_mincore_nested( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* mincore returns one byte per page of address range */ + assert(size % PAGE_SIZE == 0); + mach_vm_size_t page_count = size / PAGE_SIZE; + uint8_t *page_infos = calloc(size / PAGE_SIZE, 1); + + /* No checker updates. mincore has no VM side effects. */ + int err = mincore((void *)start, size, (char *)page_infos); + assert(err == 0); + + /* Verify that mincore's result matches the checker's expectation. */ + for (mach_vm_size_t page_index = 0; + page_index < page_count; + page_index++) { + mach_vm_address_t page_address = start + page_index * PAGE_SIZE; + uint8_t page_info = page_infos[page_index]; + vm_entry_checker_t *checker = + checker_list_find_checker(checker_list, page_address); + + /* descend into submaps */ + if (checker != NULL && checker->kind == Submap) { + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(checker); + test_result_t result = successful_mincore_nested(submap_checkers, page_address, PAGE_SIZE); + if (result != TestSucceeded) { + return result; + } + continue; + } + + /* mappedness */ + if (checker == NULL) { + /* fixme mincore sets MINCORE_ANONYMOUS in unallocated space? 
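The checking loop above relies on mincore(2) filling one status byte per page of the queried range. A minimal standalone illustration of that calling convention, independent of the checker machinery:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	size_t page = (size_t)getpagesize();
	size_t npages = 4;
	size_t size = npages * page;

	/* Anonymous mapping: initially nothing is resident. */
	char *buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	assert(buf != MAP_FAILED);

	/* Touch the first two pages so they become resident. */
	memset(buf, 0xA5, 2 * page);

	/* mincore() fills one status byte per page of the range. */
	char *vec = calloc(npages, 1);
	assert(vec != NULL);
	int err = mincore(buf, size, vec);
	assert(err == 0);

	for (size_t i = 0; i < npages; i++) {
		printf("page %zu: %s\n", i,
		    (vec[i] & MINCORE_INCORE) ? "resident" : "not resident");
	}

	free(vec);
	munmap(buf, size);
	return 0;
}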
*/ + T_QUIET; T_EXPECT_EQ((page_info & ~MINCORE_ANONYMOUS), 0, + "empty space should have zero mincore state"); + continue; + } + + /* resident */ + bool mincore_resident = (page_info & MINCORE_INCORE); + /* TODO this assumes writes affect entire entries */ + bool checker_resident = (checker->pages_resident > 0); + if (mincore_resident != checker_resident) { + T_LOG("page residency mismatch, address 0x%llx: expected %s, " + "mincore reported %s (0x%02hhx & MINCORE_INCORE)", + page_address, name_for_bool(checker_resident), + name_for_bool(mincore_resident), page_info); + + entry_checker_range_t range = { .head = checker, .tail = checker }; + T_LOG("*** mincore expected ***"); + dump_checker_range(range); + T_LOG("*** actual ***"); + dump_region_info_for_entries(range); + + free(page_infos); + return TestFailed; + } + } + + free(page_infos); + return TestSucceeded; +} + +static test_result_t +successful_mincore( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + test_result_t result = successful_mincore_nested(checker_list, start, size); + if (result != TestSucceeded) { + return result; + } + return verify_vm_state(checker_list, "after mincore"); +} + + +T_DECL(mincore, + "run mincore with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = successful_mincore, + .single_entry_2 = successful_mincore, + .single_entry_3 = successful_mincore, + .single_entry_4 = successful_mincore, + + .multiple_entries_1 = successful_mincore, + .multiple_entries_2 = successful_mincore, + .multiple_entries_3 = successful_mincore, + .multiple_entries_4 = successful_mincore, + .multiple_entries_5 = successful_mincore, + .multiple_entries_6 = successful_mincore, + + .some_holes_1 = successful_mincore, + .some_holes_2 = successful_mincore, + .some_holes_3 = successful_mincore, + .some_holes_4 = successful_mincore, + .some_holes_5 = successful_mincore, + .some_holes_6 = successful_mincore, + .some_holes_7 = successful_mincore, + .some_holes_8 = successful_mincore, + .some_holes_9 = successful_mincore, + .some_holes_10 = successful_mincore, + .some_holes_11 = successful_mincore, + .some_holes_12 = successful_mincore, + + .all_holes_1 = successful_mincore, + .all_holes_2 = successful_mincore, + .all_holes_3 = successful_mincore, + .all_holes_4 = successful_mincore, + + .null_entry = successful_mincore, + .nonresident_entry = successful_mincore, + .resident_entry = successful_mincore, + + .shared_entry = successful_mincore, + .shared_entry_discontiguous = successful_mincore, + .shared_entry_partial = successful_mincore, + .shared_entry_pairs = successful_mincore, + .shared_entry_x1000 = successful_mincore, + + .cow_entry = successful_mincore, + .cow_unreferenced = successful_mincore, + .cow_nocow = successful_mincore, + .nocow_cow = successful_mincore, + .cow_unreadable = successful_mincore, + .cow_unwriteable = successful_mincore, + + .permanent_entry = successful_mincore, + .permanent_before_permanent = successful_mincore, + .permanent_before_allocation = successful_mincore, + .permanent_before_allocation_2 = successful_mincore, + .permanent_before_hole = successful_mincore, + .permanent_after_allocation = successful_mincore, + .permanent_after_hole = successful_mincore, + + .single_submap_single_entry = successful_mincore, + .single_submap_single_entry_first_pages = successful_mincore, + .single_submap_single_entry_last_pages = successful_mincore, + .single_submap_single_entry_middle_pages = successful_mincore, + .single_submap_oversize_entry_at_start = 
successful_mincore, + .single_submap_oversize_entry_at_end = successful_mincore, + .single_submap_oversize_entry_at_both = successful_mincore, + + .submap_before_allocation = successful_mincore, + .submap_after_allocation = successful_mincore, + .submap_before_hole = successful_mincore, + .submap_after_hole = successful_mincore, + .submap_allocation_submap_one_entry = successful_mincore, + .submap_allocation_submap_two_entries = successful_mincore, + .submap_allocation_submap_three_entries = successful_mincore, + + .submap_before_allocation_ro = successful_mincore, + .submap_after_allocation_ro = successful_mincore, + .submap_before_hole_ro = successful_mincore, + .submap_after_hole_ro = successful_mincore, + .submap_allocation_submap_one_entry_ro = successful_mincore, + .submap_allocation_submap_two_entries_ro = successful_mincore, + .submap_allocation_submap_three_entries_ro = successful_mincore, + + .protection_single_000_000 = successful_mincore, + .protection_single_000_r00 = successful_mincore, + .protection_single_000_0w0 = successful_mincore, + .protection_single_000_rw0 = successful_mincore, + .protection_single_r00_r00 = successful_mincore, + .protection_single_r00_rw0 = successful_mincore, + .protection_single_0w0_0w0 = successful_mincore, + .protection_single_0w0_rw0 = successful_mincore, + .protection_single_rw0_rw0 = successful_mincore, + + .protection_pairs_000_000 = successful_mincore, + .protection_pairs_000_r00 = successful_mincore, + .protection_pairs_000_0w0 = successful_mincore, + .protection_pairs_000_rw0 = successful_mincore, + .protection_pairs_r00_000 = successful_mincore, + .protection_pairs_r00_r00 = successful_mincore, + .protection_pairs_r00_0w0 = successful_mincore, + .protection_pairs_r00_rw0 = successful_mincore, + .protection_pairs_0w0_000 = successful_mincore, + .protection_pairs_0w0_r00 = successful_mincore, + .protection_pairs_0w0_0w0 = successful_mincore, + .protection_pairs_0w0_rw0 = successful_mincore, + .protection_pairs_rw0_000 = successful_mincore, + .protection_pairs_rw0_r00 = successful_mincore, + .protection_pairs_rw0_0w0 = successful_mincore, + .protection_pairs_rw0_rw0 = successful_mincore, + }; + + run_vm_tests("mincore", __FILE__, &tests, argc, argv); +} diff --git a/tests/vm/configurator_mmap.c b/tests/vm/configurator_mmap.c new file mode 100644 index 000000000..fd93d967c --- /dev/null +++ b/tests/vm/configurator_mmap.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_mmap.c + * + * Test mmap(MAP_ANON | MAP_FIXED) with many different VM states. + */ + +#include "configurator/vm_configurator_tests.h" +#include "configurator/vm_configurator_helpers.h" +#include "exc_guard_helper.h" +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + +/* + * rdar://143341561 mmap(FIXED) overwrite sometimes provokes EXC_GUARD + * Remove this when that bug is fixed. + * + * normal workaround: run mmap(FIXED) with the EXC_GUARD catcher in place + * when the test is expected to hit rdar://143341561 + * Rosetta workaround: EXC_GUARD catcher doesn't work on Rosetta, so don't run + * mmap(FIXED) when the test is expected to hit rdar://143341561 + */ +#define workaround_rdar_143341561 1 + +static void +checker_perform_successful_mmap_anon( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + uint16_t user_tag) +{ + /* Make a new checker for the allocation. */ + vm_entry_checker_t *new_checker = make_checker_for_vm_allocate( + checker_list, start, size, VM_MAKE_TAG(user_tag)); + entry_checker_range_t new_range = { new_checker, new_checker }; + + /* Find existing checkers in the address range. */ + entry_checker_range_t old_range = + checker_list_find_and_clip_including_holes(checker_list, start, size); + + /* Free the old checkers and insert the new checker.
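The checker update above models the defining property of mmap(MAP_FIXED): a successful call displaces whatever was previously mapped in the target range. A small standalone demonstration of that replacement behavior, independent of the checker machinery:

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	size_t size = (size_t)getpagesize();

	/* First mapping, with recognizable contents. */
	unsigned char *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);
	assert(p != MAP_FAILED);
	memset(p, 0xAB, size);

	/* MAP_FIXED at the same address replaces the old mapping in place. */
	void *q = mmap(p, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
	assert(q == (void *)p);

	/* Fresh anonymous memory is zero-filled; the 0xAB contents are gone. */
	printf("first byte after MAP_FIXED overwrite: 0x%02x\n", p[0]);

	munmap(p, size);
	return 0;
}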
*/ + checker_list_replace_range(checker_list, old_range, new_range); +} + +static test_result_t +successful_mmap_anon_fixed( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + void *ret = mmap((void *)start, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0); + mach_vm_address_t allocated = (mach_vm_address_t)ret; + if (ret == MAP_FAILED) { + T_EXPECT_POSIX_SUCCESS(ret, "mmap(ANON | FIXED)"); + return TestFailed; + } + if (allocated != start) { + T_FAIL("mmap(ANON | FIXED) returned address 0x%llx (expected 0x%llx)", allocated, start); + return TestFailed; + } + checker_perform_successful_mmap_anon(checker_list, start, size, 0); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + + +static test_result_t +successful_mmap_anon_fixed_with_tag( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + uint16_t tag) +{ + void *ret = mmap((void *)start, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, VM_MAKE_TAG(tag), 0); + mach_vm_address_t allocated = (mach_vm_address_t)ret; + if (ret == MAP_FAILED) { + T_EXPECT_POSIX_SUCCESS(ret, "mmap(ANON | FIXED, tag)"); + return TestFailed; + } + if (allocated != start) { + T_FAIL("mmap(ANON | FIXED, tag) returned address 0x%llx (expected 0x%llx)", allocated, start); + return TestFailed; + } + checker_perform_successful_mmap_anon(checker_list, start, size, tag); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED, tag)"); +} + +static test_result_t +successful_mmap_anon_fixed_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + * to probe simplify behavior. + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (TestFailed == successful_mmap_anon_fixed_with_tag( + checker_list, start, size, tag)) { + return TestFailed; + } + + /* + * Allocate again, with a tag matching the entry to the right, + * to probe simplify behavior. 
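The tagged variant relies on the Darwin convention that, for MAP_ANON mappings, the fd argument can carry a VM tag built with VM_MAKE_TAG(). A standalone sketch that tags an anonymous mapping and reads the tag back through the same region info the checkers compare against; VM_MEMORY_APPLICATION_SPECIFIC_1 is just an example tag value:

#include <assert.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_statistics.h>

int
main(void)
{
	size_t size = (size_t)getpagesize();

	/* With MAP_ANON, the "fd" argument may carry a VM tag. */
	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON,
	    VM_MAKE_TAG(VM_MEMORY_APPLICATION_SPECIFIC_1), 0);
	assert(p != MAP_FAILED);

	/* Read the tag back from the VM region info. */
	mach_vm_address_t addr = (mach_vm_address_t)p;
	mach_vm_size_t region_size = 0;
	natural_t depth = 0;
	vm_region_submap_info_data_64_t info;
	mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
	kern_return_t kr = mach_vm_region_recurse(mach_task_self(), &addr,
	    &region_size, &depth, (vm_region_recurse_info_t)&info, &count);
	assert(kr == KERN_SUCCESS);

	printf("user_tag = %u (expected %u)\n",
	    info.user_tag, (unsigned)VM_MEMORY_APPLICATION_SPECIFIC_1);

	munmap(p, size);
	return 0;
}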
+ */ + tag = get_app_specific_user_tag_for_address(start + size); + if (TestFailed == successful_mmap_anon_fixed_with_tag( + checker_list, start, size, tag)) { + return TestFailed; + } + + return TestSucceeded; +} + +static bool +call_mmap_anon_fixed_and_expect_ENOMEM( + mach_vm_address_t start, + mach_vm_size_t size, + uint16_t tag) +{ +#if workaround_rdar_143341561 + __block void *ret; + exc_guard_helper_info_t exc_info; + bool caught_exception = + block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + ret = mmap((void *)start, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, VM_MAKE_TAG(tag), 0); + }); + if (caught_exception) { + T_LOG("warning: rdar://143341561 mmap(fixed) should work " + "regardless of whether a mapping exists at the addr"); + } +#else /* not workaround_rdar_143341561 */ + void *ret = mmap((void *)start, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_FIXED, VM_MAKE_TAG(tag), 0); +#endif /* not workaround_rdar_143341561 */ + + if (ret != MAP_FAILED) { + T_EXPECT_POSIX_ERROR(ret, ENOMEM, "mmap(ANON | FIXED, tag)"); + return false; + } + return true; +} + + +static test_result_t +test_permanent_entry_fixed( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, 0)) { + return TestFailed; + } + + /* one permanent entry, it becomes inaccessible */ + checker_perform_vm_deallocate_permanent(checker_list, start, size); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_before_permanent_fixed( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, 0)) { + return TestFailed; + } + + /* two permanent entries, both become inaccessible */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_before_allocation_fixed( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, 0)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one nonpermanent allocation, unchanged + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* [start + size/2, start + size) unchanged */ + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_before_allocation_fixed_rdar144128567( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if 
(!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, 0)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one nonpermanent allocation, becomes deallocated (rdar://144128567) + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_successful_vm_deallocate(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_before_hole_fixed( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, 0)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one hole, unchanged + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* no change for addresses [start + size / 2, start + size) */ + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_after_allocation_fixed( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, 0)) { + return TestFailed; + } + + /* + * one nonpermanent allocation, becomes deallocated + * one permanent entry, becomes inaccessible + */ + checker_perform_successful_vm_deallocate(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_after_hole_fixed( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, 0)) { + return TestFailed; + } + + /* + * one hole, unchanged + * one permanent entry, becomes inaccessible + */ + /* no change for addresses [start, start + size / 2) */ + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + + +static test_result_t +test_permanent_entry_fixed_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, tag)) { + return TestFailed; + } + + /* one permanent entry, it becomes inaccessible */ + checker_perform_vm_deallocate_permanent(checker_list, start, size); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_before_permanent_fixed_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + 
mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, tag)) { + return TestFailed; + } + + /* two permanent entries, both become inaccessible */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_before_allocation_fixed_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, tag)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one nonpermanent allocation, unchanged + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* [start + size/2, start + size) unchanged */ + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_before_allocation_fixed_with_neighbor_tags_rdar144128567( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, tag)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one nonpermanent allocation, becomes deallocated (rdar://144128567) + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_successful_vm_deallocate(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_before_hole_fixed_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, tag)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one hole, unchanged + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* no change for addresses [start + size / 2, start + size) */ + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_after_allocation_fixed_with_neighbor_tags( + checker_list_t *checker_list, + 
mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, tag)) { + return TestFailed; + } + + /* + * one nonpermanent allocation, becomes deallocated + * one permanent entry, becomes inaccessible + */ + checker_perform_successful_vm_deallocate(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + +static test_result_t +test_permanent_after_hole_fixed_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_mmap_anon_fixed_and_expect_ENOMEM(start, size, tag)) { + return TestFailed; + } + + /* + * one hole, unchanged + * one permanent entry, becomes inaccessible + */ + /* no change for addresses [start, start + size / 2) */ + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after mmap(ANON | FIXED)"); +} + + +T_DECL(mmap_anon_fixed, + "run mmap(ANON | FIXED) with various vm configurations") +{ +#if workaround_rdar_143341561 + enable_non_fatal_vm_exc_guard(); +#endif + + vm_tests_t tests = { + .single_entry_1 = successful_mmap_anon_fixed, + .single_entry_2 = successful_mmap_anon_fixed, + .single_entry_3 = successful_mmap_anon_fixed, + .single_entry_4 = successful_mmap_anon_fixed, + + .multiple_entries_1 = successful_mmap_anon_fixed, + .multiple_entries_2 = successful_mmap_anon_fixed, + .multiple_entries_3 = successful_mmap_anon_fixed, + .multiple_entries_4 = successful_mmap_anon_fixed, + .multiple_entries_5 = successful_mmap_anon_fixed, + .multiple_entries_6 = successful_mmap_anon_fixed, + + .some_holes_1 = successful_mmap_anon_fixed, + .some_holes_2 = successful_mmap_anon_fixed, + .some_holes_3 = successful_mmap_anon_fixed, + .some_holes_4 = successful_mmap_anon_fixed, + .some_holes_5 = successful_mmap_anon_fixed, + .some_holes_6 = successful_mmap_anon_fixed, + .some_holes_7 = successful_mmap_anon_fixed, + .some_holes_8 = successful_mmap_anon_fixed, + .some_holes_9 = successful_mmap_anon_fixed, + .some_holes_10 = successful_mmap_anon_fixed, + .some_holes_11 = successful_mmap_anon_fixed, + .some_holes_12 = successful_mmap_anon_fixed, + + .all_holes_1 = successful_mmap_anon_fixed, + .all_holes_2 = successful_mmap_anon_fixed, + .all_holes_3 = successful_mmap_anon_fixed, + .all_holes_4 = successful_mmap_anon_fixed, + + .null_entry = successful_mmap_anon_fixed, + .nonresident_entry = successful_mmap_anon_fixed, + .resident_entry = successful_mmap_anon_fixed, + + .shared_entry = successful_mmap_anon_fixed, + .shared_entry_discontiguous = successful_mmap_anon_fixed, + .shared_entry_partial = successful_mmap_anon_fixed, + .shared_entry_pairs = successful_mmap_anon_fixed, + .shared_entry_x1000 = successful_mmap_anon_fixed, + + .cow_entry = 
successful_mmap_anon_fixed, + .cow_unreferenced = successful_mmap_anon_fixed, + .cow_nocow = successful_mmap_anon_fixed, + .nocow_cow = successful_mmap_anon_fixed, + .cow_unreadable = successful_mmap_anon_fixed, + .cow_unwriteable = successful_mmap_anon_fixed, + + .permanent_entry = test_permanent_entry_fixed, + .permanent_before_permanent = test_permanent_before_permanent_fixed, + .permanent_before_allocation = test_permanent_before_allocation_fixed, + .permanent_before_allocation_2 = test_permanent_before_allocation_fixed_rdar144128567, + .permanent_before_hole = test_permanent_before_hole_fixed, + .permanent_after_allocation = test_permanent_after_allocation_fixed, + .permanent_after_hole = test_permanent_after_hole_fixed, + + .single_submap_single_entry = successful_mmap_anon_fixed, + .single_submap_single_entry_first_pages = successful_mmap_anon_fixed, + .single_submap_single_entry_last_pages = successful_mmap_anon_fixed, + .single_submap_single_entry_middle_pages = successful_mmap_anon_fixed, + .single_submap_oversize_entry_at_start = successful_mmap_anon_fixed, + .single_submap_oversize_entry_at_end = successful_mmap_anon_fixed, + .single_submap_oversize_entry_at_both = successful_mmap_anon_fixed, + + .submap_before_allocation = successful_mmap_anon_fixed, + .submap_after_allocation = successful_mmap_anon_fixed, + .submap_before_hole = successful_mmap_anon_fixed, + .submap_after_hole = successful_mmap_anon_fixed, + .submap_allocation_submap_one_entry = successful_mmap_anon_fixed, + .submap_allocation_submap_two_entries = successful_mmap_anon_fixed, + .submap_allocation_submap_three_entries = successful_mmap_anon_fixed, + + .submap_before_allocation_ro = successful_mmap_anon_fixed, + .submap_after_allocation_ro = successful_mmap_anon_fixed, + .submap_before_hole_ro = successful_mmap_anon_fixed, + .submap_after_hole_ro = successful_mmap_anon_fixed, + .submap_allocation_submap_one_entry_ro = successful_mmap_anon_fixed, + .submap_allocation_submap_two_entries_ro = successful_mmap_anon_fixed, + .submap_allocation_submap_three_entries_ro = successful_mmap_anon_fixed, + + .protection_single_000_000 = successful_mmap_anon_fixed, + .protection_single_000_r00 = successful_mmap_anon_fixed, + .protection_single_000_0w0 = successful_mmap_anon_fixed, + .protection_single_000_rw0 = successful_mmap_anon_fixed, + .protection_single_r00_r00 = successful_mmap_anon_fixed, + .protection_single_r00_rw0 = successful_mmap_anon_fixed, + .protection_single_0w0_0w0 = successful_mmap_anon_fixed, + .protection_single_0w0_rw0 = successful_mmap_anon_fixed, + .protection_single_rw0_rw0 = successful_mmap_anon_fixed, + + .protection_pairs_000_000 = successful_mmap_anon_fixed, + .protection_pairs_000_r00 = successful_mmap_anon_fixed, + .protection_pairs_000_0w0 = successful_mmap_anon_fixed, + .protection_pairs_000_rw0 = successful_mmap_anon_fixed, + .protection_pairs_r00_000 = successful_mmap_anon_fixed, + .protection_pairs_r00_r00 = successful_mmap_anon_fixed, + .protection_pairs_r00_0w0 = successful_mmap_anon_fixed, + .protection_pairs_r00_rw0 = successful_mmap_anon_fixed, + .protection_pairs_0w0_000 = successful_mmap_anon_fixed, + .protection_pairs_0w0_r00 = successful_mmap_anon_fixed, + .protection_pairs_0w0_0w0 = successful_mmap_anon_fixed, + .protection_pairs_0w0_rw0 = successful_mmap_anon_fixed, + .protection_pairs_rw0_000 = successful_mmap_anon_fixed, + .protection_pairs_rw0_r00 = successful_mmap_anon_fixed, + .protection_pairs_rw0_0w0 = successful_mmap_anon_fixed, + .protection_pairs_rw0_rw0 = 
successful_mmap_anon_fixed, + }; + + run_vm_tests("mmap_anon_fixed", __FILE__, &tests, argc, argv); +} + + +T_DECL(mmap_anon_fixed_with_neighbor_tags, + "run mmap(ANON | FIXED, tag) with various vm configurations " + "and tags copied from neighboring entries") +{ +#if workaround_rdar_143341561 + enable_non_fatal_vm_exc_guard(); +#endif + + vm_tests_t tests = { + .single_entry_1 = successful_mmap_anon_fixed_with_neighbor_tags, + .single_entry_2 = successful_mmap_anon_fixed_with_neighbor_tags, + .single_entry_3 = successful_mmap_anon_fixed_with_neighbor_tags, + .single_entry_4 = successful_mmap_anon_fixed_with_neighbor_tags, + + .multiple_entries_1 = successful_mmap_anon_fixed_with_neighbor_tags, + .multiple_entries_2 = successful_mmap_anon_fixed_with_neighbor_tags, + .multiple_entries_3 = successful_mmap_anon_fixed_with_neighbor_tags, + .multiple_entries_4 = successful_mmap_anon_fixed_with_neighbor_tags, + .multiple_entries_5 = successful_mmap_anon_fixed_with_neighbor_tags, + .multiple_entries_6 = successful_mmap_anon_fixed_with_neighbor_tags, + + .some_holes_1 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_2 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_3 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_4 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_5 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_6 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_7 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_8 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_9 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_10 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_11 = successful_mmap_anon_fixed_with_neighbor_tags, + .some_holes_12 = successful_mmap_anon_fixed_with_neighbor_tags, + + .all_holes_1 = successful_mmap_anon_fixed_with_neighbor_tags, + .all_holes_2 = successful_mmap_anon_fixed_with_neighbor_tags, + .all_holes_3 = successful_mmap_anon_fixed_with_neighbor_tags, + .all_holes_4 = successful_mmap_anon_fixed_with_neighbor_tags, + + .null_entry = successful_mmap_anon_fixed_with_neighbor_tags, + .nonresident_entry = successful_mmap_anon_fixed_with_neighbor_tags, + .resident_entry = successful_mmap_anon_fixed_with_neighbor_tags, + + .shared_entry = successful_mmap_anon_fixed_with_neighbor_tags, + .shared_entry_discontiguous = successful_mmap_anon_fixed_with_neighbor_tags, + .shared_entry_partial = successful_mmap_anon_fixed_with_neighbor_tags, + .shared_entry_pairs = successful_mmap_anon_fixed_with_neighbor_tags, + .shared_entry_x1000 = successful_mmap_anon_fixed_with_neighbor_tags, + + .cow_entry = successful_mmap_anon_fixed_with_neighbor_tags, + .cow_unreferenced = successful_mmap_anon_fixed_with_neighbor_tags, + .cow_nocow = successful_mmap_anon_fixed_with_neighbor_tags, + .nocow_cow = successful_mmap_anon_fixed_with_neighbor_tags, + .cow_unreadable = successful_mmap_anon_fixed_with_neighbor_tags, + .cow_unwriteable = successful_mmap_anon_fixed_with_neighbor_tags, + + .permanent_entry = test_permanent_entry_fixed_with_neighbor_tags, + .permanent_before_permanent = test_permanent_before_permanent_fixed_with_neighbor_tags, + .permanent_before_allocation = test_permanent_before_allocation_fixed_with_neighbor_tags, + .permanent_before_allocation_2 = test_permanent_before_allocation_fixed_with_neighbor_tags_rdar144128567, + .permanent_before_hole = test_permanent_before_hole_fixed_with_neighbor_tags, + .permanent_after_allocation = 
test_permanent_after_allocation_fixed_with_neighbor_tags, + .permanent_after_hole = test_permanent_after_hole_fixed_with_neighbor_tags, + + .single_submap_single_entry = successful_mmap_anon_fixed_with_neighbor_tags, + .single_submap_single_entry_first_pages = successful_mmap_anon_fixed_with_neighbor_tags, + .single_submap_single_entry_last_pages = successful_mmap_anon_fixed_with_neighbor_tags, + .single_submap_single_entry_middle_pages = successful_mmap_anon_fixed_with_neighbor_tags, + .single_submap_oversize_entry_at_start = successful_mmap_anon_fixed_with_neighbor_tags, + .single_submap_oversize_entry_at_end = successful_mmap_anon_fixed_with_neighbor_tags, + .single_submap_oversize_entry_at_both = successful_mmap_anon_fixed_with_neighbor_tags, + + .submap_before_allocation = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_after_allocation = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_before_hole = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_after_hole = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_allocation_submap_one_entry = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_allocation_submap_two_entries = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_allocation_submap_three_entries = successful_mmap_anon_fixed_with_neighbor_tags, + + .submap_before_allocation_ro = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_after_allocation_ro = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_before_hole_ro = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_after_hole_ro = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_allocation_submap_one_entry_ro = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_allocation_submap_two_entries_ro = successful_mmap_anon_fixed_with_neighbor_tags, + .submap_allocation_submap_three_entries_ro = successful_mmap_anon_fixed_with_neighbor_tags, + + .protection_single_000_000 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_single_000_r00 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_single_000_0w0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_single_000_rw0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_single_r00_r00 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_single_r00_rw0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_single_0w0_0w0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_single_0w0_rw0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_single_rw0_rw0 = successful_mmap_anon_fixed_with_neighbor_tags, + + .protection_pairs_000_000 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_000_r00 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_000_0w0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_000_rw0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_r00_000 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_r00_r00 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_r00_0w0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_r00_rw0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_0w0_000 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_0w0_r00 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_0w0_0w0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_0w0_rw0 = successful_mmap_anon_fixed_with_neighbor_tags, + 
.protection_pairs_rw0_000 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_rw0_r00 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_rw0_0w0 = successful_mmap_anon_fixed_with_neighbor_tags, + .protection_pairs_rw0_rw0 = successful_mmap_anon_fixed_with_neighbor_tags, + }; + + run_vm_tests("mmap_anon_fixed_with_neighbor_tags", __FILE__, &tests, argc, argv); +} diff --git a/tests/vm/configurator_test.c b/tests/vm/configurator_test.c new file mode 100644 index 000000000..8848e3a74 --- /dev/null +++ b/tests/vm/configurator_test.c @@ -0,0 +1,2124 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_test.c + * + * Test vm_configurator itself. + * + * Verify that the VM states generated by vm_configurator are correct. + * This is intended to catch bugs in vm_configurator's + * template and checker system, as well as bugs in individual + * configurations used by other tests. + */ + +#include "configurator/vm_configurator_tests.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + + +/* + * Return true if [start, start + size) is a VM entry at the given submap depth. + */ +__attribute__((overloadable)) +static bool +is_entry(mach_vm_address_t start, mach_vm_size_t size, uint32_t submap_depth) +{ + mach_vm_address_t entry_start = start; + mach_vm_size_t entry_size; + vm_region_submap_info_data_64_t info; + + if (!get_info_for_address(&entry_start, &entry_size, &info, submap_depth)) { + return false; /* not mapped */ + } + if (entry_start != start || entry_size != size) { + return false; /* mapped, but wrong extent */ + } + + return true; +} + +/* + * Return true if [start, start + size) is a VM entry at submap depth 0. + */ +__attribute__((overloadable, used)) +static bool +is_entry(mach_vm_address_t start, mach_vm_size_t size) +{ + return is_entry(start, size, 0); +} + +/* + * Return true if [start, start + size) is an unallocated hole + * at the given submap depth. 
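The is_entry()/is_hole() helpers lean on the probing behavior of the region APIs: asked about an address, they report the entry containing it or the next one after it. A standalone sketch of a depth-0 hole check built directly on mach_vm_region(); the configurator's get_info_for_address() presumably wraps the recursing variant so that submap_depth is honored:

#include <stdbool.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>

/*
 * True if no VM entry in the current task intersects [start, start + size),
 * ignoring submap contents (depth 0 view).
 */
static bool
range_is_hole(mach_vm_address_t start, mach_vm_size_t size)
{
	mach_vm_address_t addr = start;      /* advanced to the containing or next entry */
	mach_vm_size_t entry_size = 0;
	vm_region_basic_info_data_64_t info;
	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
	mach_port_t object_name = MACH_PORT_NULL;

	kern_return_t kr = mach_vm_region(mach_task_self(), &addr, &entry_size,
	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info, &count, &object_name);
	if (kr == KERN_INVALID_ADDRESS) {
		return true;    /* nothing mapped at or above start */
	}
	if (kr != KERN_SUCCESS) {
		return false;   /* unexpected failure: treat as not-a-hole */
	}
	/* Something is mapped at or after start: a hole only if it begins past the range. */
	return addr >= start + size;
}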
+ */ +__attribute__((overloadable)) +static bool +is_hole(mach_vm_address_t start, mach_vm_size_t size, uint32_t submap_depth) +{ + mach_vm_address_t entry_start = start; + mach_vm_size_t entry_size; + vm_region_submap_info_data_64_t info; + + if (get_info_for_address(&entry_start, &entry_size, &info, submap_depth)) { + /* start address was mapped */ + return false; + } else if (entry_start < start + size) { + /* some address before the end of the expected hole was mapped */ + return false; + } + + /* [start, start + size) was entirely unmapped */ + return true; +} + +/* + * Return true if [start, start + size) is an unallocated hole at submap depth 0. + * Unallocate space inside a submap does not count. + */ +__attribute__((overloadable, used)) +static bool +is_hole(mach_vm_address_t start, mach_vm_size_t size) +{ + return is_hole(start, size, 0 /* submap_depth */); +} + +/* + * Verify the memory and the checker for an expected hole. + */ +static void +assert_hole_checker_and_entry( + vm_entry_checker_t *checker, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert(start % PAGE_SIZE == 0); + assert(size % PAGE_SIZE == 0); + assert(checker->kind == Hole); + assert(checker->address == start); + assert(checker->size == size); + + assert(is_hole(start, size, checker->submap_depth)); +} + +/* + * Verify the checker for an expected allocated entry. + * Does not verify the actual VM state. + */ +static void +assert_allocation_checker( + vm_entry_checker_t *checker, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert(start % PAGE_SIZE == 0); + assert(size % PAGE_SIZE == 0); + assert(checker->kind == Allocation); + assert(checker->address == start); + assert(checker->size == size); +} + +/* + * Verify the actual VM state for an expected allocated entry. + * Does not verify the matching checker. + * Returns the vm_region output for the memory. + */ +static void +assert_allocation_entry( + mach_vm_address_t start, + mach_vm_size_t size, + uint32_t submap_depth, + vm_region_submap_info_data_64_t * const out_info) +{ + mach_vm_address_t entry_start = start; + mach_vm_size_t entry_size; + assert(get_info_for_address(&entry_start, &entry_size, out_info, submap_depth)); + assert(entry_start == start); + assert(entry_size == size); +} + +/* + * Verify the memory and the checker for an expected allocated entry. + * Returns the vm_region output for the memory. + */ +static void +assert_allocation_checker_and_entry( + vm_entry_checker_t *checker, + mach_vm_address_t start, + mach_vm_size_t size, + vm_region_submap_info_data_64_t * const out_info) +{ + assert_allocation_checker(checker, start, size); + assert_allocation_entry(start, size, checker->submap_depth, out_info); +} + + +/* + * Verify that checker_list consists of a pattern of entries and holes + * Each allocation or hole is assumed to be DEFAULT_ENTRY_SIZE in length. 
+ * "##.#..#": allocation, allocation, hole, allocation, hole, hole, allocation + */ +static void +assert_allocation_and_hole_pattern( + checker_list_t *checker_list, + const char *pattern) +{ + mach_vm_address_t base = checker_range_start_address(checker_list->entries); + assert(checker_range_count(checker_list->entries) == strlen(pattern)); + + for (size_t i = 0; i < strlen(pattern); i++) { + vm_region_submap_info_data_64_t info; + mach_vm_address_t entry_address = base + i * DEFAULT_ENTRY_SIZE; + vm_entry_checker_t *checker = checker_list_nth(checker_list, i); + switch (pattern[i]) { + case '#': + assert_allocation_checker_and_entry(checker, + entry_address, DEFAULT_ENTRY_SIZE, &info); + break; + case '.': + assert_hole_checker_and_entry(checker, + entry_address, DEFAULT_ENTRY_SIZE); + break; + default: + T_ASSERT_FAIL("pattern character '%c' is neither '#' nor '.'", pattern[i]); + break; + } + } +} + +static void +assert_checker_and_entry_protection_equals( + vm_entry_checker_t *checker, + vm_region_submap_info_data_64_t *info, + int protection, + int max_protection) +{ + assert(checker->protection == protection); + assert(checker->max_protection == max_protection); + assert(info->protection == protection); + assert(info->max_protection == max_protection); +} + +/* + * Verify the memory and the checker for an expected permanent entry. + * This is destructive because it attempts to deallocate the permanent entry + * which makes its memory inaccessible, and updates the checker to match. + */ +static void +destructively_assert_permanent_checker_and_entry( + vm_entry_checker_t *checker, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert(start % PAGE_SIZE == 0); + assert(size % PAGE_SIZE == 0); + assert(checker->permanent == true); + assert(checker->address == start); + assert(checker->size == size); + + /* + * Permanent memory is indistinguishable in vm_region output. + * We can only try to deallocate it and then see if it is still there. + */ + + /* check that it exists */ + assert(is_entry(start, size, checker->submap_depth)); + + /* try to deallocate it */ + kern_return_t kr = mach_vm_deallocate(mach_task_self(), start, size); + assert(kr == 0); + + /* check that it still exists */ + assert(is_entry(start, size, checker->submap_depth)); + + /* update the checker because the memory is now inaccessible */ + checker->protection = VM_PROT_NONE; + checker->max_protection = VM_PROT_NONE; +} + +/* + * Verify the memory and checker for an expected non-permanent allocation. + * This is destructive because it deallocates the memory + * and updates the checker to match. + */ +static void +destructively_assert_nonpermanent_checker_and_entry( + checker_list_t *list, + vm_entry_checker_t *checker, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert(start % PAGE_SIZE == 0); + assert(size % PAGE_SIZE == 0); + assert(checker->permanent == false); + assert(checker->address == start); + assert(checker->size == size); + + /* + * Permanent memory is indistinguishable in vm_region output. + * We can only try to deallocate it and then see if it is still there. + */ + + /* check that it exists */ + assert(is_entry(start, size, checker->submap_depth)); + + /* try to deallocate it */ + kern_return_t kr = mach_vm_deallocate(mach_task_self(), start, size); + assert(kr == 0); + + /* check that it no longer exists */ + assert(!is_entry(start, size, checker->submap_depth)); + assert(is_hole(start, size, 0 /* submap_depth */)); + + /* + * Update the checker to match the now-deallocated memory. 
+ * The checker should be replaced by a hole checker. + * + * Save the checker's index first so we can + * look up and verify the hole checker after. + */ + unsigned index = 0; + while (checker_list_nth(list, index) != checker) { + index++; + } + + checker_list_free_checker(list, checker); + vm_entry_checker_t *new_hole = checker_list_nth(list, index); + assert_hole_checker_and_entry(new_hole, start, size); +} + +/* + * Verify the memory and the checker for an expected submap entry. + * Does not examine the contents of the submap. + * Returns the vm_region output for the entry in the parent map. + */ +static void +assert_submap_checker_and_entry( + vm_entry_checker_t *checker, + mach_vm_address_t start, + mach_vm_size_t size, + vm_region_submap_info_data_64_t * const out_info) +{ + assert(start % PAGE_SIZE == 0); + assert(size % PAGE_SIZE == 0); + assert(checker->kind == Submap); + assert(checker->address == start); + assert(checker->size == size); + assert(checker->submap_depth == 0); /* nested submaps not allowed */ + + mach_vm_address_t entry_start = start; + mach_vm_size_t entry_size; + assert(get_info_for_address(&entry_start, &entry_size, out_info, checker->submap_depth)); + assert(entry_start == start); + assert(entry_size == size); +} + + +static test_result_t +test_single_entry_1( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* test range is a single allocation */ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + assert_allocation_checker_and_entry(checker, start, size, &info); + + return TestSucceeded; +} + +static test_result_t +test_single_entry_2( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + + /* test range excludes the end of the allocation */ + assert(size == DEFAULT_ENTRY_SIZE - DEFAULT_PARTIAL_ENTRY_SIZE); + assert_allocation_checker_and_entry(checker, + start, DEFAULT_ENTRY_SIZE, &info); + + return TestSucceeded; +} + +static test_result_t +test_single_entry_3( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + + /* test range excludes the start of the allocation */ + assert(size == DEFAULT_ENTRY_SIZE - DEFAULT_PARTIAL_ENTRY_SIZE); + assert_allocation_checker_and_entry(checker, + start - DEFAULT_PARTIAL_ENTRY_SIZE, DEFAULT_ENTRY_SIZE, &info); + + return TestSucceeded; +} + +static test_result_t +test_single_entry_4( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + + /* test range excludes the start and end of the allocation */ + assert(size == DEFAULT_ENTRY_SIZE - DEFAULT_PARTIAL_ENTRY_SIZE); + assert_allocation_checker_and_entry(checker, + start - DEFAULT_PARTIAL_ENTRY_SIZE / 2, DEFAULT_ENTRY_SIZE, &info); + + return TestSucceeded; +} + + +static test_result_t +test_multiple_entries_1( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "##"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 2); + return TestSucceeded; +} + +static test_result_t +test_multiple_entries_2( + checker_list_t *checker_list, 
+ mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "###"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 3); + return TestSucceeded; +} + +static test_result_t +test_multiple_entries_3( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, + "############" + "############" + "############" + "############"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 4 * 12); + return TestSucceeded; +} + +static test_result_t +test_multiple_entries_4( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "###"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 3 - DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_multiple_entries_5( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "###"); + assert(start == checker_range_start_address(checker_list->entries) + DEFAULT_PARTIAL_ENTRY_SIZE); + assert(size == DEFAULT_ENTRY_SIZE * 3 - DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_multiple_entries_6( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "###"); + assert(start == checker_range_start_address(checker_list->entries) + DEFAULT_PARTIAL_ENTRY_SIZE / 2); + assert(size == DEFAULT_ENTRY_SIZE * 3 - DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + + +static test_result_t +test_some_holes_1( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, ".#"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 2); + return TestSucceeded; +} + +static test_result_t +test_some_holes_2( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, ".###"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 4); + return TestSucceeded; +} + +static test_result_t +test_some_holes_3( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, ".#"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 2 - DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_some_holes_4( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, ".###"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 4 - DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_some_holes_5( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "#."); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 2); + return TestSucceeded; +} + +static test_result_t +test_some_holes_6( + 
checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "###."); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 4); + return TestSucceeded; +} + +static test_result_t +test_some_holes_7( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "#."); + assert(start == checker_range_start_address(checker_list->entries) + DEFAULT_PARTIAL_ENTRY_SIZE); + assert(size == DEFAULT_ENTRY_SIZE * 2 - DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_some_holes_8( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "###."); + assert(start == checker_range_start_address(checker_list->entries) + DEFAULT_PARTIAL_ENTRY_SIZE); + assert(size == DEFAULT_ENTRY_SIZE * 4 - DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_some_holes_9( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "#.#"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 3); + return TestSucceeded; +} + +static test_result_t +test_some_holes_10( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "#.#.#"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 5); + return TestSucceeded; +} + +static test_result_t +test_some_holes_11( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "##.##.##"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 8); + return TestSucceeded; +} + +static test_result_t +test_some_holes_12( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "###.###.###"); + assert(start == checker_range_start_address(checker_list->entries)); + assert(size == DEFAULT_ENTRY_SIZE * 11); + return TestSucceeded; +} + + +static test_result_t +test_all_holes_1( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "#.#"); + assert(start == checker_range_start_address(checker_list->entries) + DEFAULT_ENTRY_SIZE); + assert(size == DEFAULT_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_all_holes_2( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "#."); + assert(start == checker_range_start_address(checker_list->entries) + DEFAULT_ENTRY_SIZE); + assert(size == DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_all_holes_3( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, ".#"); + assert(start == checker_range_start_address(checker_list->entries) + DEFAULT_PARTIAL_ENTRY_SIZE); + assert(size == DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_all_holes_4( + checker_list_t *checker_list, + mach_vm_address_t start, + 
mach_vm_size_t size) +{ + assert_allocation_and_hole_pattern(checker_list, "."); + assert(start == checker_range_start_address(checker_list->entries) + DEFAULT_PARTIAL_ENTRY_SIZE / 2); + assert(size == DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + + +static test_result_t +test_null_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + assert_allocation_checker_and_entry(checker, start, size, &info); + + /* entry's object is null */ + assert(info.object_id_full == 0); + assert(checker->object->object_id == 0); + + return TestSucceeded; +} + +static test_result_t +test_nonresident_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + assert_allocation_checker_and_entry(checker, start, size, &info); + + /* entry has an object, but its pages are not resident */ + assert(info.object_id_full != 0); + assert(info.pages_resident == 0); + + assert(checker->object->object_id != 0); + assert(checker->pages_resident == 0); + + return TestSucceeded; +} + +static test_result_t +test_resident_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + assert_allocation_checker_and_entry(checker, start, size, &info); + + /* entry has an object and its pages are resident */ + assert(info.pages_resident == size / PAGE_SIZE); + assert(checker->pages_resident == size / PAGE_SIZE); + assert(checker->object->object_id != 0); + + return TestSucceeded; +} + +/* common code for two-shared-entry tests */ +static void +test_one_shared_pair( + checker_list_t *checker_list, + mach_vm_address_t left_entry_start, + mach_vm_address_t right_entry_start, + mach_vm_size_t size, + mach_vm_address_t right_object_offset) +{ + /* + * Two entries, both have the same object with refcount two. + * Right entry's object offset varies. 
+ */ + vm_entry_checker_t *left_checker = + checker_list_find_allocation(checker_list, left_entry_start); + vm_entry_checker_t *right_checker = + checker_list_find_allocation(checker_list, right_entry_start); + assert(left_checker); + assert(right_checker); + + vm_region_submap_info_data_64_t left_info, right_info; + assert_allocation_checker_and_entry(left_checker, left_entry_start, size, &left_info); + assert_allocation_checker_and_entry(right_checker, right_entry_start, size, &right_info); + + assert(left_info.object_id_full != 0); + assert(left_info.object_id_full == right_info.object_id_full); + assert(left_info.ref_count == 2); + assert(right_info.ref_count == 2); + assert(left_info.share_mode == SM_TRUESHARED); + assert(right_info.share_mode == SM_TRUESHARED); + assert(left_info.offset == 0); + assert(right_info.offset == right_object_offset); + assert(left_info.user_tag != right_info.user_tag); + + assert(left_checker->object == right_checker->object); + /* checker doesn't distinguish SM_SHARED from SM_TRUESHARED */ + assert(checker_share_mode(left_checker) == SM_SHARED); + assert(checker_share_mode(right_checker) == SM_SHARED); + assert(left_checker->object_offset == 0); + assert(right_checker->object_offset == right_object_offset); + assert(left_checker->user_tag != right_checker->user_tag); +} + +static test_result_t +test_shared_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* entries are both at object offset 0 */ + test_one_shared_pair(checker_list, start, start + size, size, 0); + return TestSucceeded; +} + +static test_result_t +test_shared_entry_discontiguous( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * right entry's object offset begins + * after the left entry's range ends + */ + test_one_shared_pair(checker_list, start, start + size, size, DEFAULT_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_shared_entry_partial( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * right entry's object offset begins + * inside the left entry's range + */ + test_one_shared_pair(checker_list, start, start + size, size, DEFAULT_PARTIAL_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_shared_entry_pairs( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * two shared pairs + */ + mach_vm_size_t entry_size = size / 4; + mach_vm_address_t one = start; + mach_vm_address_t two = one + entry_size; + mach_vm_address_t three = two + entry_size; + mach_vm_address_t four = three + entry_size; + + test_one_shared_pair(checker_list, one, four, entry_size, 0); + test_one_shared_pair(checker_list, two, three, entry_size, 0); + + return TestSucceeded; +} + +static test_result_t +test_shared_entry_x1000( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* many entries, all of which share the same object */ + + entry_checker_range_t limit = checker_list_find_range(checker_list, start, size); + assert(checker_range_count(limit) == 1000); + + uint64_t shared_object_id = 0; + FOREACH_CHECKER(checker, limit) { + assert(checker->object); + assert(checker->object->object_id_mode == object_has_known_id); + if (!shared_object_id) { + assert(checker->object->object_id != 0); + shared_object_id = checker->object->object_id; + } + assert(checker->object->object_id == shared_object_id); + assert(get_object_id_for_address(checker->address) == 
shared_object_id); + } + + return TestSucceeded; +} + + +/* common code for two-shared-entry tests */ +static test_result_t +test_cow_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * two entries, both have the same object and its refcount is two. + * [start, start + size) is only the first entry + */ + vm_entry_checker_t *left_checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t left_info; + assert_allocation_checker_and_entry(left_checker, start, size, &left_info); + + vm_entry_checker_t *right_checker = checker_list_nth(checker_list, 1); + vm_region_submap_info_data_64_t right_info; + assert_allocation_checker_and_entry(right_checker, start + size, size, &right_info); + + assert(left_info.object_id_full != 0); + assert(left_info.object_id_full == right_info.object_id_full); + assert(left_info.ref_count == 2); + assert(right_info.ref_count == 2); + assert(left_info.share_mode == SM_COW); + assert(right_info.share_mode == SM_COW); + assert(left_info.offset == 0); + assert(right_info.offset == 0); + + assert(left_checker->object == right_checker->object); + assert(checker_share_mode(left_checker) == SM_COW); + assert(checker_share_mode(right_checker) == SM_COW); + assert(left_checker->object_offset == 0); + assert(right_checker->object_offset == 0); + + return TestSucceeded; +} + +static test_result_t +test_cow_unreferenced( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * one COW entry with refcount 1 + */ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + assert_allocation_checker_and_entry(checker, start, size, &info); + + assert(info.share_mode == SM_COW); + assert(info.object_id_full != 0); + assert(info.ref_count == 1); + assert(info.offset == 0); + + assert(checker->object_offset == 0); + assert(checker_share_mode(checker) == SM_COW); + assert(checker->object); + assert(checker->object->self_ref_count == 1); + + return TestSucceeded; +} + +/* common checks for cow_nocow and nocow_cow */ +static test_result_t +test_cow_nocow_common( + vm_entry_checker_t *cow_checker, + vm_entry_checker_t *plain_checker, + mach_vm_address_t cow_start_address, + mach_vm_address_t plain_start_address, + mach_vm_size_t entry_size) +{ + /* two entries: one is COW, one is not COW */ + vm_region_submap_info_data_64_t cow_info, plain_info; + assert_allocation_checker_and_entry(cow_checker, cow_start_address, entry_size, &cow_info); + assert_allocation_checker_and_entry(plain_checker, plain_start_address, entry_size, &plain_info); + + assert(cow_info.share_mode == SM_COW); + assert(plain_info.share_mode == SM_PRIVATE); + assert(cow_info.object_id_full != 0); + assert(cow_info.object_id_full != plain_info.object_id_full); + assert(cow_info.ref_count == 2); + assert(cow_info.offset == 0); + + assert(checker_share_mode(cow_checker) == SM_COW); + assert(checker_share_mode(plain_checker) == SM_PRIVATE); + assert(cow_checker->object != NULL); + assert(cow_checker->object != plain_checker->object); + assert(cow_checker->object_offset == 0); + + return TestSucceeded; +} + +static test_result_t +test_cow_nocow( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * two entries: first is COW, second is not + */ + vm_entry_checker_t *cow_checker = checker_list_nth(checker_list, 0); + vm_entry_checker_t *plain_checker = checker_list_nth(checker_list, 1); + + assert(size % 2 == 0); + mach_vm_address_t 
cow_start = start; + mach_vm_address_t plain_start = start + size / 2; + + return test_cow_nocow_common(cow_checker, plain_checker, + cow_start, plain_start, size / 2); +} + +static test_result_t +test_nocow_cow( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * two entries: first is not COW, second is COW + */ + vm_entry_checker_t *plain_checker = checker_list_nth(checker_list, 0); + vm_entry_checker_t *cow_checker = checker_list_nth(checker_list, 1); + + assert(size % 2 == 0); + mach_vm_address_t plain_start = start; + mach_vm_address_t cow_start = start + size / 2; + + return test_cow_nocow_common(cow_checker, plain_checker, + cow_start, plain_start, size / 2); +} + +static test_result_t +test_cow_unreadable( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * COW entry, unreadable + */ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + assert_allocation_checker_and_entry(checker, start, size, &info); + + assert(info.share_mode == SM_COW); + assert(info.protection == VM_PROT_NONE); + assert(info.ref_count == 2); + + assert(checker_share_mode(checker) == SM_COW); + assert(checker->protection == VM_PROT_NONE); + assert(checker->object != NULL); + assert(checker->object->self_ref_count == 2); + + return TestSucceeded; +} + +static test_result_t +test_cow_unwriteable( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * COW entry, readable but unwriteable + */ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + vm_region_submap_info_data_64_t info; + assert_allocation_checker_and_entry(checker, start, size, &info); + + assert(info.share_mode == SM_COW); + assert(info.protection == VM_PROT_READ); + assert(info.ref_count == 2); + + assert(checker_share_mode(checker) == SM_COW); + assert(checker->protection == VM_PROT_READ); + assert(checker->object != NULL); + assert(checker->object->self_ref_count == 2); + + return TestSucceeded; +} + +static test_result_t +test_permanent_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + destructively_assert_permanent_checker_and_entry( + checker_list_nth(checker_list, 0), start, size); + + return TestSucceeded; +} + +static test_result_t +test_permanent_before_permanent( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + destructively_assert_permanent_checker_and_entry( + checker_list_nth(checker_list, 0), + start, size / 2); + destructively_assert_permanent_checker_and_entry( + checker_list_nth(checker_list, 1), + start + size / 2, size / 2); + + return TestSucceeded; +} + +static test_result_t +test_permanent_before_allocation( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + destructively_assert_permanent_checker_and_entry( + checker_list_nth(checker_list, 0), + start, size / 2); + destructively_assert_nonpermanent_checker_and_entry( + checker_list, + checker_list_nth(checker_list, 1), + start + size / 2, size / 2); + assert_hole_checker_and_entry( + checker_list_nth(checker_list, 2), + start + size, DEFAULT_ENTRY_SIZE); + return TestSucceeded; +} + +static test_result_t +test_permanent_before_allocation_2( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_region_submap_info_data_64_t info; + destructively_assert_permanent_checker_and_entry( + checker_list_nth(checker_list, 0), + start, size / 2); + 
destructively_assert_nonpermanent_checker_and_entry( + checker_list, + checker_list_nth(checker_list, 1), + start + size / 2, size / 2); + assert_allocation_checker_and_entry( + checker_list_nth(checker_list, 2), + start + size, DEFAULT_ENTRY_SIZE, &info); + + return TestSucceeded; +} + +static test_result_t +test_permanent_before_hole( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + destructively_assert_permanent_checker_and_entry( + checker_list_nth(checker_list, 0), + start, size / 2); + assert_hole_checker_and_entry( + checker_list_nth(checker_list, 1), + start + size / 2, size / 2); + + return TestSucceeded; +} + +static test_result_t +test_permanent_after_allocation( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + destructively_assert_nonpermanent_checker_and_entry( + checker_list, + checker_list_nth(checker_list, 0), + start, size / 2); + destructively_assert_permanent_checker_and_entry( + checker_list_nth(checker_list, 1), + start + size / 2, size / 2); + + return TestSucceeded; +} + +static test_result_t +test_permanent_after_hole( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + assert_hole_checker_and_entry( + checker_list_nth(checker_list, 0), + start, size / 2); + destructively_assert_permanent_checker_and_entry( + checker_list_nth(checker_list, 1), + start + size / 2, size / 2); + + return TestSucceeded; +} + + +static test_result_t +test_single_submap_single_entry_common( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_region_submap_info_data_64_t info; + + vm_entry_checker_t *submap_parent = checker_list_nth(checker_list, 0); + assert_submap_checker_and_entry(submap_parent, start, size, &info); + + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(submap_parent); + + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker_and_entry(submap_content, start, size, &info); + + return TestSucceeded; +} + +static test_result_t +test_single_submap_single_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_single_submap_single_entry_common( + checker_list, start, size); +} + +static test_result_t +test_single_submap_single_entry_first_pages( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* tested range excludes the last 1/2 of the real entry */ + mach_vm_size_t submap_size = size * 2; + mach_vm_address_t submap_start = start; + return test_single_submap_single_entry_common( + checker_list, submap_start, submap_size); +} + +static test_result_t +test_single_submap_single_entry_last_pages( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* tested range excludes the first 1/2 of the real entry */ + mach_vm_size_t submap_size = size * 2; + mach_vm_address_t submap_start = start - submap_size / 2; + return test_single_submap_single_entry_common( + checker_list, submap_start, submap_size); +} + +static test_result_t +test_single_submap_single_entry_middle_pages( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* tested range excludes the first 1/4 and last 1/4 of the real entry */ + mach_vm_size_t submap_size = size * 2; + mach_vm_address_t submap_start = start - submap_size / 4; + return test_single_submap_single_entry_common( + checker_list, submap_start, submap_size); +} + 
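For context on the depth-aware lookups that back is_entry() and is_hole() above (and the submap cases that follow): they are, in effect, thin wrappers over mach_vm_region_recurse(). The sketch below is illustrative only; the helper name query_entry_at_depth and the exact semantics of the harness's real get_info_for_address() are assumptions, not taken from this file.

#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdbool.h>

/*
 * Illustrative stand-in for a get_info_for_address()-style helper
 * (hypothetical; the real helper lives in the configurator harness).
 * On a hit it reports the bounds of the entry containing *address at
 * the requested nesting depth; on a miss it leaves *address at the next
 * mapped entry so hole scans can tell "unmapped here" from "unmapped
 * everywhere above".
 */
static bool
query_entry_at_depth(
	mach_vm_address_t *address,            /* in: query; out: start of entry found */
	mach_vm_size_t *size,                  /* out: size of entry found */
	vm_region_submap_info_data_64_t *info,
	uint32_t submap_depth)                 /* 0 = top level, 1 = inside submaps */
{
	mach_vm_address_t query = *address;
	natural_t depth = submap_depth;
	mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;

	/* Returns the entry containing *address, or the next entry above it. */
	kern_return_t kr = mach_vm_region_recurse(mach_task_self(), address,
	    size, &depth, (vm_region_recurse_info_t)info, &count);
	if (kr != KERN_SUCCESS) {
		/* Nothing mapped at or above the query: park the cursor at the
		 * top of the address space so hole checks see no overlap. */
		*address = ~(mach_vm_address_t)0;
		return false;
	}
	/* A hit only if the entry found actually covers the queried address. */
	return *address <= query && query < *address + *size;
}

With a wrapper shaped like this, is_entry(start, size, depth) reduces to "hit, and the entry's bounds equal [start, start + size)", and is_hole() reduces to "miss, and the next mapped entry starts at or beyond start + size", which is the pattern the assertions in this file encode.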
+static test_result_t +test_single_submap_oversize_entry_common( + checker_list_t *checker_list, + mach_vm_address_t parent_start, + mach_vm_size_t parent_size, + mach_vm_address_t parent_offset, + mach_vm_size_t submap_size) +{ + vm_region_submap_info_data_64_t parent_info, content_info; + + vm_entry_checker_t *submap_parent = checker_list_nth(checker_list, 0); + assert_submap_checker_and_entry(submap_parent, parent_start, parent_size, &parent_info); + assert(submap_parent->object_offset == parent_offset); + assert(parent_info.offset == parent_offset); + + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(submap_parent); + + /* + * Actual entry in submap is clamped to the parent map submap view + * by vm_region. Checker for that entry is unchanged. + */ + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker(submap_content, parent_start - parent_offset, submap_size); + assert(submap_content->submap_depth == 1); + assert_allocation_entry(parent_start, parent_size, 1 /* submap_depth */, &content_info); + assert(submap_content->object_offset == 0); + assert(content_info.offset == 0); + + return TestSucceeded; +} + +static test_result_t +test_single_submap_oversize_entry_at_start( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * parent map: [start, start+size] + * submap: [0 (size) size*2] + */ + return test_single_submap_oversize_entry_common(checker_list, + start, size, + size /* parent_offset */, size * 2 /* submap_size */); +} + +static test_result_t +test_single_submap_oversize_entry_at_end( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * parent map: [start, start+size] + * submap: [0 (size) size*2] + */ + return test_single_submap_oversize_entry_common(checker_list, + start, size, + 0 /* parent_offset */, size * 2 /* submap_size */); +} + +static test_result_t +test_single_submap_oversize_entry_at_both( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * parent map: [start, start+size] + * submap: [0 (size / 2) size*2] + */ + return test_single_submap_oversize_entry_common(checker_list, + start, size, + size / 2 /* parent_offset */, size * 2 /* submap_size */); +} + +static test_result_t +test_submap_before_allocation_common( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + int submap_protection) +{ + vm_region_submap_info_data_64_t submap_parent_info; + vm_entry_checker_t *submap_parent = checker_list_nth(checker_list, 0); + assert_submap_checker_and_entry(submap_parent, + start, size / 2, &submap_parent_info); + + vm_region_submap_info_data_64_t allocation_info; + vm_entry_checker_t *allocation = checker_list_nth(checker_list, 1); + assert_allocation_checker_and_entry(allocation, + start + size / 2, size / 2, &allocation_info); + + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(submap_parent); + + vm_region_submap_info_data_64_t submap_content_info; + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker(submap_content, start, size / 2); + assert_allocation_entry(start, size / 2, 1 /* submap_depth */, &submap_content_info); + assert_checker_and_entry_protection_equals(submap_content, &submap_content_info, + submap_protection, submap_protection); + + return TestSucceeded; +} + +static test_result_t +test_submap_before_allocation( + checker_list_t 
*checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_before_allocation_common(checker_list, + start, size, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_submap_before_allocation_ro( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_before_allocation_common(checker_list, + start, size, VM_PROT_READ); +} + +static test_result_t +test_submap_after_allocation_common( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + int submap_protection) +{ + vm_region_submap_info_data_64_t allocation_info; + vm_entry_checker_t *allocation = checker_list_nth(checker_list, 0); + assert_allocation_checker_and_entry(allocation, + start, size / 2, &allocation_info); + + vm_region_submap_info_data_64_t submap_parent_info; + vm_entry_checker_t *submap_parent = checker_list_nth(checker_list, 1); + assert_submap_checker_and_entry(submap_parent, + start + size / 2, size / 2, &submap_parent_info); + + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(submap_parent); + + vm_region_submap_info_data_64_t submap_content_info; + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker(submap_content, start + size / 2, size / 2); + assert_allocation_entry(start + size / 2, size / 2, 1 /* submap_depth */, &submap_content_info); + assert_checker_and_entry_protection_equals(submap_content, &submap_content_info, + submap_protection, submap_protection); + + return TestSucceeded; +} + +static test_result_t +test_submap_after_allocation( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_after_allocation_common(checker_list, + start, size, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_submap_after_allocation_ro( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_after_allocation_common(checker_list, + start, size, VM_PROT_READ); +} + +static test_result_t +test_submap_before_hole_common( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + int submap_protection) +{ + vm_region_submap_info_data_64_t submap_parent_info; + vm_entry_checker_t *submap_parent = checker_list_nth(checker_list, 0); + assert_submap_checker_and_entry(submap_parent, + start, size / 2, &submap_parent_info); + + vm_entry_checker_t *hole = checker_list_nth(checker_list, 1); + assert_hole_checker_and_entry(hole, + start + size / 2, size / 2); + + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(submap_parent); + + vm_region_submap_info_data_64_t submap_content_info; + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker(submap_content, start, size / 2); + assert_allocation_entry(start, size / 2, 1 /* submap_depth */, &submap_content_info); + assert_checker_and_entry_protection_equals(submap_content, &submap_content_info, + submap_protection, submap_protection); + + return TestSucceeded; +} + +static test_result_t +test_submap_before_hole( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_before_hole_common(checker_list, + start, size, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_submap_before_hole_ro( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return 
test_submap_before_hole_common(checker_list, + start, size, VM_PROT_READ); +} + +static test_result_t +test_submap_after_hole_common( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + int submap_protection) +{ + vm_entry_checker_t *hole = checker_list_nth(checker_list, 0); + assert_hole_checker_and_entry(hole, + start, size / 2); + + vm_region_submap_info_data_64_t submap_parent_info; + vm_entry_checker_t *submap_parent = checker_list_nth(checker_list, 1); + assert_submap_checker_and_entry(submap_parent, + start + size / 2, size / 2, &submap_parent_info); + + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(submap_parent); + + vm_region_submap_info_data_64_t submap_content_info; + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker(submap_content, start + size / 2, size / 2); + assert_allocation_entry(start + size / 2, size / 2, 1 /* submap_depth */, &submap_content_info); + assert_checker_and_entry_protection_equals(submap_content, &submap_content_info, + submap_protection, submap_protection); + + return TestSucceeded; +} + +static test_result_t +test_submap_after_hole( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_after_hole_common(checker_list, + start, size, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_submap_after_hole_ro( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_after_hole_common(checker_list, + start, size, VM_PROT_READ); +} + + +/* + * Verify that the checker list consists of three entries, + * a submap mapping, an allocation, and a submap mapping, + * all of default size. + */ +static void +assert_submap_allocation_submap( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_region_submap_info_data_64_t info; + vm_entry_checker_t *checker; + mach_vm_size_t offset; + + assert(checker_range_count(checker_list->entries) == 3); + + offset = DEFAULT_ENTRY_SIZE * 0; + checker = checker_list_nth(checker_list, 0); + assert_submap_checker_and_entry(checker, + start + offset, DEFAULT_ENTRY_SIZE, &info); + assert(checker->object_offset == offset); + assert(info.offset == offset); + + offset = DEFAULT_ENTRY_SIZE * 1; + checker = checker_list_nth(checker_list, 1); + assert_allocation_checker_and_entry(checker, + start + offset, DEFAULT_ENTRY_SIZE, &info); + + offset = DEFAULT_ENTRY_SIZE * 2; + checker = checker_list_nth(checker_list, 2); + assert_submap_checker_and_entry(checker, + start + offset, DEFAULT_ENTRY_SIZE, &info); + assert(checker->object_offset == offset); + assert(info.offset == offset); + + offset = DEFAULT_ENTRY_SIZE * 3; + assert(size == offset); +} + +static test_result_t +test_submap_allocation_submap_one_entry_common( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + int submap_protection) +{ + /* parent map is submap - allocation - submap */ + assert_submap_allocation_submap(checker_list, start, size); + + /* submap is one allocation entry */ + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(checker_list_nth(checker_list, 0)); + assert(checker_range_count(submap_checkers->entries) == 1); + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker(submap_content, start, size); + assert(submap_content->protection == submap_protection); + 
assert(submap_content->max_protection == submap_protection); + + return TestSucceeded; +} + +static test_result_t +test_submap_allocation_submap_one_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_allocation_submap_one_entry_common(checker_list, + start, size, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_submap_allocation_submap_one_entry_ro( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_allocation_submap_one_entry_common(checker_list, + start, size, VM_PROT_READ); +} + +static test_result_t +test_submap_allocation_submap_two_entries_common( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + int submap_protection) +{ + /* parent map is submap - allocation - submap */ + assert_submap_allocation_submap(checker_list, start, size); + + /* submap is two allocation entries */ + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(checker_list_nth(checker_list, 0)); + assert(checker_range_count(submap_checkers->entries) == 2); + + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker(submap_content, start, size / 2); + assert(submap_content->protection == submap_protection); + assert(submap_content->max_protection == submap_protection); + + submap_content = checker_list_nth(submap_checkers, 1); + assert_allocation_checker(submap_content, start + size / 2, size / 2); + assert(submap_content->protection == submap_protection); + assert(submap_content->max_protection == submap_protection); + + return TestSucceeded; +} + +static test_result_t +test_submap_allocation_submap_two_entries( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_allocation_submap_two_entries_common(checker_list, + start, size, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_submap_allocation_submap_two_entries_ro( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_allocation_submap_two_entries_common(checker_list, + start, size, VM_PROT_READ); +} + +static test_result_t +test_submap_allocation_submap_three_entries_common( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + int submap_protection) +{ + /* parent map is submap - allocation - submap */ + assert_submap_allocation_submap(checker_list, start, size); + + /* submap is three allocation entries */ + checker_list_t *submap_checkers DEFER_UNSLIDE = + checker_get_and_slide_submap_checkers(checker_list_nth(checker_list, 0)); + assert(checker_range_count(submap_checkers->entries) == 3); + + vm_entry_checker_t *submap_content = checker_list_nth(submap_checkers, 0); + assert_allocation_checker(submap_content, start, size / 3); + assert(submap_content->protection == submap_protection); + assert(submap_content->max_protection == submap_protection); + + submap_content = checker_list_nth(submap_checkers, 1); + assert_allocation_checker(submap_content, start + size / 3, size / 3); + assert(submap_content->protection == submap_protection); + assert(submap_content->max_protection == submap_protection); + + submap_content = checker_list_nth(submap_checkers, 2); + assert_allocation_checker(submap_content, start + size / 3 * 2, size / 3); + assert(submap_content->protection == submap_protection); + assert(submap_content->max_protection == submap_protection); + + return 
TestSucceeded; +} + +static test_result_t +test_submap_allocation_submap_three_entries( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_allocation_submap_three_entries_common(checker_list, + start, size, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_submap_allocation_submap_three_entries_ro( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return test_submap_allocation_submap_three_entries_common(checker_list, + start, size, VM_PROT_READ); +} + + +static void +assert_protection( + mach_vm_address_t address, + vm_prot_t prot, + bool check_max, + uint32_t submap_depth) +{ + mach_vm_address_t info_address = address; + mach_vm_size_t info_size; + vm_region_submap_info_data_64_t info; + assert(get_info_for_address(&info_address, &info_size, &info, submap_depth)); + assert(info_address == address); + + if (check_max) { + T_QUIET; T_ASSERT_EQ(prot, info.max_protection, "entry max protection"); + } else { + T_QUIET; T_ASSERT_EQ(prot, info.protection, "entry protection"); + } +} + +static test_result_t +test_protection_single_common( + checker_list_t *checker_list, + mach_vm_address_t address, + vm_prot_t prot, vm_prot_t max) +{ + vm_entry_checker_t *checker = + checker_list_find_allocation(checker_list, address); + T_QUIET; T_ASSERT_NOTNULL(checker, "checker"); + T_QUIET; T_ASSERT_EQ(checker->protection, prot, "checker protection"); + T_QUIET; T_ASSERT_EQ(checker->max_protection, max, "checker max protection"); + + assert_protection(address, prot, false /* check max */, 0 /* submap depth */); + assert_protection(address, max, true /* check max */, 0 /* submap depth */); + + return TestSucceeded; +} + +static test_result_t +test_protection_pair_common( + checker_list_t *checker_list, + mach_vm_address_t address, + vm_prot_t left_prot, + vm_prot_t right_prot) +{ + vm_entry_checker_t *left_checker = + checker_list_find_allocation(checker_list, address); + vm_entry_checker_t *right_checker = left_checker->next; + + T_QUIET; T_ASSERT_NOTNULL(left_checker, "checker"); + T_QUIET; T_ASSERT_EQ(left_checker->protection, left_prot, "left entry protection"); + T_QUIET; T_ASSERT_EQ(right_checker->protection, right_prot, "right entry protection"); + + assert_protection(left_checker->address, left_prot, false /* check max */, 0 /* submap depth */); + assert_protection(right_checker->address, right_prot, false /* check max */, 0 /* submap depth */); + + return TestSucceeded; +} + +static test_result_t +test_protection_single_000_000( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_NONE, VM_PROT_NONE); +} + +static test_result_t +test_protection_single_000_r00( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_NONE, VM_PROT_READ); +} + +static test_result_t +test_protection_single_000_0w0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_NONE, VM_PROT_WRITE); +} + +static test_result_t +test_protection_single_000_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_NONE, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t 
+test_protection_single_r00_r00( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_READ, VM_PROT_READ); +} + +static test_result_t +test_protection_single_r00_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_READ, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_protection_single_0w0_0w0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_WRITE, VM_PROT_WRITE); +} + +static test_result_t +test_protection_single_0w0_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_protection_single_rw0_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_single_common( + checker_list, start, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE); +} + + +static test_result_t +test_protection_pair_000_000( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_NONE, VM_PROT_NONE); +} + +static test_result_t +test_protection_pair_000_r00( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_NONE, VM_PROT_READ); +} + +static test_result_t +test_protection_pair_000_0w0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_NONE, VM_PROT_WRITE); +} + +static test_result_t +test_protection_pair_000_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_NONE, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_protection_pair_r00_000( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_READ, VM_PROT_NONE); +} + +static test_result_t +test_protection_pair_r00_r00( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_READ, VM_PROT_READ); +} + +static test_result_t +test_protection_pair_r00_0w0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_READ, VM_PROT_WRITE); +} + +static test_result_t +test_protection_pair_r00_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_READ, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_protection_pair_0w0_000( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_WRITE, VM_PROT_NONE); +} + +static test_result_t +test_protection_pair_0w0_r00( + 
checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_WRITE, VM_PROT_READ); +} + +static test_result_t +test_protection_pair_0w0_0w0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_WRITE, VM_PROT_WRITE); +} + +static test_result_t +test_protection_pair_0w0_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE); +} + +static test_result_t +test_protection_pair_rw0_000( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE); +} + +static test_result_t +test_protection_pair_rw0_r00( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ); +} + +static test_result_t +test_protection_pair_rw0_0w0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_WRITE); +} + +static test_result_t +test_protection_pair_rw0_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size __unused) +{ + return test_protection_pair_common( + checker_list, start, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE); +} + + +T_DECL(test_vm_configurator, + "spot-check VM states generated by vm configurator") +{ + vm_tests_t tests = { + .single_entry_1 = test_single_entry_1, + .single_entry_2 = test_single_entry_2, + .single_entry_3 = test_single_entry_3, + .single_entry_4 = test_single_entry_4, + + .multiple_entries_1 = test_multiple_entries_1, + .multiple_entries_2 = test_multiple_entries_2, + .multiple_entries_3 = test_multiple_entries_3, + .multiple_entries_4 = test_multiple_entries_4, + .multiple_entries_5 = test_multiple_entries_5, + .multiple_entries_6 = test_multiple_entries_6, + + .some_holes_1 = test_some_holes_1, + .some_holes_2 = test_some_holes_2, + .some_holes_3 = test_some_holes_3, + .some_holes_4 = test_some_holes_4, + .some_holes_5 = test_some_holes_5, + .some_holes_6 = test_some_holes_6, + .some_holes_7 = test_some_holes_7, + .some_holes_8 = test_some_holes_8, + .some_holes_9 = test_some_holes_9, + .some_holes_10 = test_some_holes_10, + .some_holes_11 = test_some_holes_11, + .some_holes_12 = test_some_holes_12, + + .all_holes_1 = test_all_holes_1, + .all_holes_2 = test_all_holes_2, + .all_holes_3 = test_all_holes_3, + .all_holes_4 = test_all_holes_4, + + .null_entry = test_null_entry, + .nonresident_entry = test_nonresident_entry, + .resident_entry = test_resident_entry, + + .shared_entry = test_shared_entry, + .shared_entry_discontiguous = test_shared_entry_discontiguous, + .shared_entry_partial = test_shared_entry_partial, + .shared_entry_pairs = test_shared_entry_pairs, + .shared_entry_x1000 = test_shared_entry_x1000, + + .cow_entry = test_cow_entry, + .cow_unreferenced = test_cow_unreferenced, + .cow_nocow = test_cow_nocow, + .nocow_cow = test_nocow_cow, + .cow_unreadable = test_cow_unreadable, + .cow_unwriteable = test_cow_unwriteable, + + .permanent_entry = test_permanent_entry, 
+ .permanent_before_permanent = test_permanent_before_permanent, + .permanent_before_allocation = test_permanent_before_allocation, + .permanent_before_allocation_2 = test_permanent_before_allocation_2, + .permanent_before_hole = test_permanent_before_hole, + .permanent_after_allocation = test_permanent_after_allocation, + .permanent_after_hole = test_permanent_after_hole, + + .single_submap_single_entry = test_single_submap_single_entry, + .single_submap_single_entry_first_pages = test_single_submap_single_entry_first_pages, + .single_submap_single_entry_last_pages = test_single_submap_single_entry_last_pages, + .single_submap_single_entry_middle_pages = test_single_submap_single_entry_middle_pages, + .single_submap_oversize_entry_at_start = test_single_submap_oversize_entry_at_start, + .single_submap_oversize_entry_at_end = test_single_submap_oversize_entry_at_end, + .single_submap_oversize_entry_at_both = test_single_submap_oversize_entry_at_both, + + .submap_before_allocation = test_submap_before_allocation, + .submap_after_allocation = test_submap_after_allocation, + .submap_before_hole = test_submap_before_hole, + .submap_after_hole = test_submap_after_hole, + .submap_allocation_submap_one_entry = test_submap_allocation_submap_one_entry, + .submap_allocation_submap_two_entries = test_submap_allocation_submap_two_entries, + .submap_allocation_submap_three_entries = test_submap_allocation_submap_three_entries, + + .submap_before_allocation_ro = test_submap_before_allocation_ro, + .submap_after_allocation_ro = test_submap_after_allocation_ro, + .submap_before_hole_ro = test_submap_before_hole_ro, + .submap_after_hole_ro = test_submap_after_hole_ro, + .submap_allocation_submap_one_entry_ro = test_submap_allocation_submap_one_entry_ro, + .submap_allocation_submap_two_entries_ro = test_submap_allocation_submap_two_entries_ro, + .submap_allocation_submap_three_entries_ro = test_submap_allocation_submap_three_entries_ro, + + .protection_single_000_000 = test_protection_single_000_000, + .protection_single_000_r00 = test_protection_single_000_r00, + .protection_single_000_0w0 = test_protection_single_000_0w0, + .protection_single_000_rw0 = test_protection_single_000_rw0, + .protection_single_r00_r00 = test_protection_single_r00_r00, + .protection_single_r00_rw0 = test_protection_single_r00_rw0, + .protection_single_0w0_0w0 = test_protection_single_0w0_0w0, + .protection_single_0w0_rw0 = test_protection_single_0w0_rw0, + .protection_single_rw0_rw0 = test_protection_single_rw0_rw0, + + .protection_pairs_000_000 = test_protection_pair_000_000, + .protection_pairs_000_r00 = test_protection_pair_000_r00, + .protection_pairs_000_0w0 = test_protection_pair_000_0w0, + .protection_pairs_000_rw0 = test_protection_pair_000_rw0, + .protection_pairs_r00_000 = test_protection_pair_r00_000, + .protection_pairs_r00_r00 = test_protection_pair_r00_r00, + .protection_pairs_r00_0w0 = test_protection_pair_r00_0w0, + .protection_pairs_r00_rw0 = test_protection_pair_r00_rw0, + .protection_pairs_0w0_000 = test_protection_pair_0w0_000, + .protection_pairs_0w0_r00 = test_protection_pair_0w0_r00, + .protection_pairs_0w0_0w0 = test_protection_pair_0w0_0w0, + .protection_pairs_0w0_rw0 = test_protection_pair_0w0_rw0, + .protection_pairs_rw0_000 = test_protection_pair_rw0_000, + .protection_pairs_rw0_r00 = test_protection_pair_rw0_r00, + .protection_pairs_rw0_0w0 = test_protection_pair_rw0_0w0, + .protection_pairs_rw0_rw0 = test_protection_pair_rw0_rw0, + }; + + run_vm_tests("test_vm_configurator", __FILE__, &tests, argc, 
argv); +} diff --git a/tests/vm/configurator_vm_allocate.c b/tests/vm/configurator_vm_allocate.c new file mode 100644 index 000000000..6f32f1574 --- /dev/null +++ b/tests/vm/configurator_vm_allocate.c @@ -0,0 +1,1019 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_vm_allocate.c + * + * Test vm_allocate(FIXED and FIXED|OVERWRITE) with many different VM states. + */ + +#include "configurator/vm_configurator_tests.h" +#include "configurator/vm_configurator_helpers.h" +#include "exc_guard_helper.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + +/* + * rdar://143341561 vm_allocate(FIXED | OVERWRITE) sometimes provokes EXC_GUARD + * Remove this when that bug is fixed. + * + * normal workaround: run vm_allocate with the EXC_GUARD catcher in place + * when the test is expected to hit rdar://143341561 + * Rosetta workaround: EXC_GUARD catcher doesn't work on Rosetta, so don't run + * vm_allocate when the test is expected to hit rdar://143341561 + */ +#define workaround_rdar_143341561 1 + +/* + * Update the checker list after a successful call to vm_allocate(). + * Any pre-existing checkers inside this range are deleted and replaced. + */ +static void +checker_perform_successful_vm_allocate( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + uint16_t user_tag) +{ + /* Make a new checker for the allocation. */ + vm_entry_checker_t *new_checker = make_checker_for_vm_allocate( + checker_list, start, size, VM_MAKE_TAG(user_tag)); + entry_checker_range_t new_range = { new_checker, new_checker }; + + /* Find existing checkers in the address range. */ + entry_checker_range_t old_range = + checker_list_find_and_clip_including_holes(checker_list, start, size); + + /* Free the old checkers and insert the new checker. 
*/ + checker_list_replace_range(checker_list, old_range, new_range); +} + +#if workaround_rdar_143341561 +/* + * Return true if flags has VM_FLAGS_FIXED + * This is non-trivial because VM_FLAGS_FIXED is zero; + * the real value is the absence of VM_FLAGS_ANYWHERE. + */ +static bool +is_fixed(int flags) +{ + static_assert(VM_FLAGS_FIXED == 0, "this test requires VM_FLAGS_FIXED be zero"); + static_assert(VM_FLAGS_ANYWHERE != 0, "this test requires VM_FLAGS_ANYWHERE be nonzero"); + return !(flags & VM_FLAGS_ANYWHERE); +} + +/* Return true if flags has VM_FLAGS_FIXED and VM_FLAGS_OVERWRITE set. */ +static bool +is_fixed_overwrite(int flags) +{ + return is_fixed(flags) && (flags & VM_FLAGS_OVERWRITE); +} +#endif /* workaround_rdar_143341561 */ + +static bool +call_vm_allocate_and_expect_result( + mach_vm_address_t start, + mach_vm_size_t size, + int flags_and_tag, + kern_return_t expected_kr) +{ +#if workaround_rdar_143341561 + __block mach_vm_address_t allocated = start; + __block kern_return_t kr; + exc_guard_helper_info_t exc_info; + bool caught_exception = + block_raised_exc_guard_of_type_ignoring_translated(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + kr = mach_vm_allocate(mach_task_self(), &allocated, size, flags_and_tag); + }); + if (caught_exception) { + if (is_fixed_overwrite(flags_and_tag)) { + T_LOG("warning: rdar://143341561 mmap(FIXED) should work " + "regardless of whether a mapping exists at the addr"); + } else { + T_FAIL("unexpected EXC_GUARD during vm_allocate"); + return false; + } + } +#else /* not workaround_rdar_143341561 */ + mach_vm_address_t allocated = start; + kern_return_t kr = + mach_vm_allocate(mach_task_self(), &allocated, size, flags_and_tag); +#endif /* not workaround_rdar_143341561 */ + + if (kr != expected_kr) { + T_EXPECT_MACH_ERROR(kr, expected_kr, "mach_vm_allocate(flags 0x%x)", flags_and_tag); + return false; + } + if (allocated != start) { + T_FAIL("mach_vm_allocate(flags 0x%x) returned address 0x%llx (expected 0x%llx)", + flags_and_tag, allocated, start); + return false; + } + + return true; +} + +static bool +call_vm_allocate_and_expect_success( + mach_vm_address_t start, + mach_vm_size_t size, + int flags_and_tag) +{ + return call_vm_allocate_and_expect_result(start, size, flags_and_tag, KERN_SUCCESS); +} + +static bool +call_vm_allocate_and_expect_no_space( + mach_vm_address_t start, + mach_vm_size_t size, + int flags_and_tag) +{ + return call_vm_allocate_and_expect_result(start, size, flags_and_tag, KERN_NO_SPACE); +} + +static test_result_t +successful_vm_allocate_fixed( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + if (!call_vm_allocate_and_expect_success(start, size, VM_FLAGS_FIXED)) { + return TestFailed; + } + checker_perform_successful_vm_allocate(checker_list, start, size, 0); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED)"); +} + + +static test_result_t +test_permanent_entry_fixed_overwrite( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE)) { + return TestFailed; + } + + /* one permanent entry, it becomes inaccessible */ + checker_perform_vm_deallocate_permanent(checker_list, start, size); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + 
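/*
 * For reference, the flag semantics these helpers rely on, shown outside the
 * configurator harness (a minimal standalone sketch, not the test itself):
 * VM_FLAGS_FIXED is zero, so "fixed" is really the absence of
 * VM_FLAGS_ANYWHERE; a plain FIXED request over an occupied range fails with
 * KERN_NO_SPACE, while FIXED | OVERWRITE replaces the existing mapping at the
 * same address.  VM_MAKE_TAG() packs a user tag into the top byte of the same
 * flags word; the tag 240 below stands in for an application-specific tag and
 * is only an example.  The permanent-entry cases above instead expect
 * KERN_NO_SPACE from FIXED | OVERWRITE, with the permanent entry left
 * inaccessible, and the real tests arm an EXC_GUARD catcher around that call
 * (rdar://143341561).
 */
#include <assert.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>

static void
fixed_overwrite_sketch(void)
{
	mach_vm_address_t addr = 0;
	mach_vm_size_t size = 4 * (mach_vm_size_t)vm_page_size;
	kern_return_t kr;

	/* Let the kernel pick an address for a scratch allocation. */
	kr = mach_vm_allocate(mach_task_self(), &addr, size,
	    VM_FLAGS_ANYWHERE | VM_MAKE_TAG(240));
	assert(kr == KERN_SUCCESS);

	/* FIXED without OVERWRITE refuses to map over the existing entry. */
	mach_vm_address_t fixed_addr = addr;
	kr = mach_vm_allocate(mach_task_self(), &fixed_addr, size, VM_FLAGS_FIXED);
	assert(kr == KERN_NO_SPACE);

	/* FIXED | OVERWRITE replaces it in place, keeping the address. */
	fixed_addr = addr;
	kr = mach_vm_allocate(mach_task_self(), &fixed_addr, size,
	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(241));
	assert(kr == KERN_SUCCESS && fixed_addr == addr);

	(void)mach_vm_deallocate(mach_task_self(), addr, size);
}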
+static test_result_t +test_permanent_before_permanent_fixed_overwrite( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE)) { + return TestFailed; + } + + /* two permanent entries, both become inaccessible */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_before_allocation_fixed_overwrite( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one nonpermanent allocation, unchanged + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* [start + size/2, start + size) unchanged */ + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_before_allocation_fixed_overwrite_rdar144128567( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one nonpermanent allocation, becomes deallocated (rdar://144128567) + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_successful_vm_deallocate(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_before_hole_fixed_overwrite( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE)) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one hole, unchanged + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* no change for addresses [start + size / 2, start + size) */ + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_after_allocation_fixed_overwrite( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_vm_allocate_and_expect_no_space( + start, size, 
VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE)) { + return TestFailed; + } + + /* + * one nonpermanent allocation, becomes deallocated + * one permanent entry, becomes inaccessible + */ + checker_perform_successful_vm_deallocate(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_after_hole_fixed_overwrite( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE)) { + return TestFailed; + } + + /* + * one hole, unchanged + * one permanent entry, becomes inaccessible + */ + /* no change for addresses [start, start + size / 2) */ + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + + +static test_result_t +test_permanent_entry_fixed_overwrite_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(tag))) { + return TestFailed; + } + + /* one permanent entry, it becomes inaccessible */ + checker_perform_vm_deallocate_permanent(checker_list, start, size); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_before_permanent_fixed_overwrite_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(tag))) { + return TestFailed; + } + + /* two permanent entries, both become inaccessible */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_before_allocation_fixed_overwrite_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | 
VM_FLAGS_OVERWRITE | VM_MAKE_TAG(tag))) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one nonpermanent allocation, unchanged + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* [start + size/2, start + size) unchanged */ + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_before_allocation_fixed_overwrite_with_neighbor_tags_rdar144128567( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(tag))) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one nonpermanent allocation, becomes deallocated (rdar://144128567) + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_successful_vm_deallocate(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_before_hole_fixed_overwrite_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(tag))) { + return TestFailed; + } + + /* + * one permanent entry, becomes inaccessible + * one hole, unchanged + */ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* no change for addresses [start + size / 2, start + size) */ + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_after_allocation_fixed_overwrite_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(tag))) { + return TestFailed; + } + + /* + * one nonpermanent allocation, becomes deallocated + * one permanent entry, becomes inaccessible + */ + checker_perform_successful_vm_deallocate(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +test_permanent_after_hole_fixed_overwrite_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if workaround_rdar_143341561 + if (isRosetta()) { + T_LOG("warning: can't work around rdar://143341561 on Rosetta; just passing instead"); + return TestSucceeded; + } +#endif + + uint16_t tag; + + /* + * 
Allocate with a tag matching the entry to the left, + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (!call_vm_allocate_and_expect_no_space( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(tag))) { + return TestFailed; + } + + /* + * one hole, unchanged + * one permanent entry, becomes inaccessible + */ + /* no change for addresses [start, start + size / 2) */ + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +fixed_no_space( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + if (!call_vm_allocate_and_expect_no_space(start, size, VM_FLAGS_FIXED)) { + return TestFailed; + } + + /* no checker update here, call should have no side effects */ + + return verify_vm_state(checker_list, "after vm_allocate(FIXED)"); +} + + +static test_result_t +successful_vm_allocate_fixed_overwrite_with_tag( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + uint16_t tag) +{ + if (!call_vm_allocate_and_expect_success( + start, size, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE | VM_MAKE_TAG(tag))) { + return TestFailed; + } + checker_perform_successful_vm_allocate(checker_list, start, size, tag); + + return verify_vm_state(checker_list, "after vm_allocate(FIXED | OVERWRITE)"); +} + +static test_result_t +successful_vm_allocate_fixed_overwrite( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return successful_vm_allocate_fixed_overwrite_with_tag( + checker_list, start, size, 0); +} + +static test_result_t +successful_vm_allocate_fixed_overwrite_with_neighbor_tags( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + uint16_t tag; + + /* + * Allocate with a tag matching the entry to the left, + * to probe simplify behavior. + */ + tag = get_app_specific_user_tag_for_address(start - 1); + if (TestFailed == successful_vm_allocate_fixed_overwrite_with_tag( + checker_list, start, size, tag)) { + return TestFailed; + } + + /* + * Allocate again, with a tag matching the entry to the right, + * to probe simplify behavior. 
+ */ + tag = get_app_specific_user_tag_for_address(start + size); + if (TestFailed == successful_vm_allocate_fixed_overwrite_with_tag( + checker_list, start, size, tag)) { + return TestFailed; + } + + return TestSucceeded; +} + + +T_DECL(vm_allocate_fixed, + "run vm_allocate(FIXED) with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = fixed_no_space, + .single_entry_2 = fixed_no_space, + .single_entry_3 = fixed_no_space, + .single_entry_4 = fixed_no_space, + + .multiple_entries_1 = fixed_no_space, + .multiple_entries_2 = fixed_no_space, + .multiple_entries_3 = fixed_no_space, + .multiple_entries_4 = fixed_no_space, + .multiple_entries_5 = fixed_no_space, + .multiple_entries_6 = fixed_no_space, + + .some_holes_1 = fixed_no_space, + .some_holes_2 = fixed_no_space, + .some_holes_3 = fixed_no_space, + .some_holes_4 = fixed_no_space, + .some_holes_5 = fixed_no_space, + .some_holes_6 = fixed_no_space, + .some_holes_7 = fixed_no_space, + .some_holes_8 = fixed_no_space, + .some_holes_9 = fixed_no_space, + .some_holes_10 = fixed_no_space, + .some_holes_11 = fixed_no_space, + .some_holes_12 = fixed_no_space, + + .all_holes_1 = successful_vm_allocate_fixed, + .all_holes_2 = successful_vm_allocate_fixed, + .all_holes_3 = successful_vm_allocate_fixed, + .all_holes_4 = successful_vm_allocate_fixed, + + .null_entry = fixed_no_space, + .nonresident_entry = fixed_no_space, + .resident_entry = fixed_no_space, + + .shared_entry = fixed_no_space, + .shared_entry_discontiguous = fixed_no_space, + .shared_entry_partial = fixed_no_space, + .shared_entry_pairs = fixed_no_space, + .shared_entry_x1000 = fixed_no_space, + + .cow_entry = fixed_no_space, + .cow_unreferenced = fixed_no_space, + .cow_nocow = fixed_no_space, + .nocow_cow = fixed_no_space, + .cow_unreadable = fixed_no_space, + .cow_unwriteable = fixed_no_space, + + .permanent_entry = fixed_no_space, + .permanent_before_permanent = fixed_no_space, + .permanent_before_allocation = fixed_no_space, + .permanent_before_allocation_2 = fixed_no_space, + .permanent_before_hole = fixed_no_space, + .permanent_after_allocation = fixed_no_space, + .permanent_after_hole = fixed_no_space, + + .single_submap_single_entry = fixed_no_space, + .single_submap_single_entry_first_pages = fixed_no_space, + .single_submap_single_entry_last_pages = fixed_no_space, + .single_submap_single_entry_middle_pages = fixed_no_space, + .single_submap_oversize_entry_at_start = fixed_no_space, + .single_submap_oversize_entry_at_end = fixed_no_space, + .single_submap_oversize_entry_at_both = fixed_no_space, + + .submap_before_allocation = fixed_no_space, + .submap_after_allocation = fixed_no_space, + .submap_before_hole = fixed_no_space, + .submap_after_hole = fixed_no_space, + .submap_allocation_submap_one_entry = fixed_no_space, + .submap_allocation_submap_two_entries = fixed_no_space, + .submap_allocation_submap_three_entries = fixed_no_space, + + .submap_before_allocation_ro = fixed_no_space, + .submap_after_allocation_ro = fixed_no_space, + .submap_before_hole_ro = fixed_no_space, + .submap_after_hole_ro = fixed_no_space, + .submap_allocation_submap_one_entry_ro = fixed_no_space, + .submap_allocation_submap_two_entries_ro = fixed_no_space, + .submap_allocation_submap_three_entries_ro = fixed_no_space, + + .protection_single_000_000 = fixed_no_space, + .protection_single_000_r00 = fixed_no_space, + .protection_single_000_0w0 = fixed_no_space, + .protection_single_000_rw0 = fixed_no_space, + .protection_single_r00_r00 = fixed_no_space, + .protection_single_r00_rw0 
= fixed_no_space, + .protection_single_0w0_0w0 = fixed_no_space, + .protection_single_0w0_rw0 = fixed_no_space, + .protection_single_rw0_rw0 = fixed_no_space, + + .protection_pairs_000_000 = fixed_no_space, + .protection_pairs_000_r00 = fixed_no_space, + .protection_pairs_000_0w0 = fixed_no_space, + .protection_pairs_000_rw0 = fixed_no_space, + .protection_pairs_r00_000 = fixed_no_space, + .protection_pairs_r00_r00 = fixed_no_space, + .protection_pairs_r00_0w0 = fixed_no_space, + .protection_pairs_r00_rw0 = fixed_no_space, + .protection_pairs_0w0_000 = fixed_no_space, + .protection_pairs_0w0_r00 = fixed_no_space, + .protection_pairs_0w0_0w0 = fixed_no_space, + .protection_pairs_0w0_rw0 = fixed_no_space, + .protection_pairs_rw0_000 = fixed_no_space, + .protection_pairs_rw0_r00 = fixed_no_space, + .protection_pairs_rw0_0w0 = fixed_no_space, + .protection_pairs_rw0_rw0 = fixed_no_space, + }; + + run_vm_tests("vm_allocate_fixed", __FILE__, &tests, argc, argv); +} + + +T_DECL(vm_allocate_fixed_overwrite, + "run vm_allocate(FIXED|OVERWRITE) with various vm configurations") +{ +#if workaround_rdar_143341561 + enable_non_fatal_vm_exc_guard(); +#endif + + vm_tests_t tests = { + .single_entry_1 = successful_vm_allocate_fixed_overwrite, + .single_entry_2 = successful_vm_allocate_fixed_overwrite, + .single_entry_3 = successful_vm_allocate_fixed_overwrite, + .single_entry_4 = successful_vm_allocate_fixed_overwrite, + + .multiple_entries_1 = successful_vm_allocate_fixed_overwrite, + .multiple_entries_2 = successful_vm_allocate_fixed_overwrite, + .multiple_entries_3 = successful_vm_allocate_fixed_overwrite, + .multiple_entries_4 = successful_vm_allocate_fixed_overwrite, + .multiple_entries_5 = successful_vm_allocate_fixed_overwrite, + .multiple_entries_6 = successful_vm_allocate_fixed_overwrite, + + .some_holes_1 = successful_vm_allocate_fixed_overwrite, + .some_holes_2 = successful_vm_allocate_fixed_overwrite, + .some_holes_3 = successful_vm_allocate_fixed_overwrite, + .some_holes_4 = successful_vm_allocate_fixed_overwrite, + .some_holes_5 = successful_vm_allocate_fixed_overwrite, + .some_holes_6 = successful_vm_allocate_fixed_overwrite, + .some_holes_7 = successful_vm_allocate_fixed_overwrite, + .some_holes_8 = successful_vm_allocate_fixed_overwrite, + .some_holes_9 = successful_vm_allocate_fixed_overwrite, + .some_holes_10 = successful_vm_allocate_fixed_overwrite, + .some_holes_11 = successful_vm_allocate_fixed_overwrite, + .some_holes_12 = successful_vm_allocate_fixed_overwrite, + + .all_holes_1 = successful_vm_allocate_fixed_overwrite, + .all_holes_2 = successful_vm_allocate_fixed_overwrite, + .all_holes_3 = successful_vm_allocate_fixed_overwrite, + .all_holes_4 = successful_vm_allocate_fixed_overwrite, + + .null_entry = successful_vm_allocate_fixed_overwrite, + .nonresident_entry = successful_vm_allocate_fixed_overwrite, + .resident_entry = successful_vm_allocate_fixed_overwrite, + + .shared_entry = successful_vm_allocate_fixed_overwrite, + .shared_entry_discontiguous = successful_vm_allocate_fixed_overwrite, + .shared_entry_partial = successful_vm_allocate_fixed_overwrite, + .shared_entry_pairs = successful_vm_allocate_fixed_overwrite, + .shared_entry_x1000 = successful_vm_allocate_fixed_overwrite, + + .cow_entry = successful_vm_allocate_fixed_overwrite, + .cow_unreferenced = successful_vm_allocate_fixed_overwrite, + .cow_nocow = successful_vm_allocate_fixed_overwrite, + .nocow_cow = successful_vm_allocate_fixed_overwrite, + .cow_unreadable = successful_vm_allocate_fixed_overwrite, + 
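/*
 * The *_with_neighbor_tags helpers above copy the user tag from the entry
 * just left of the range (start - 1) or just right of it (start + size), to
 * probe whether the map simplifies (coalesces) the new entry with a
 * like-tagged neighbor.  get_app_specific_user_tag_for_address() is a harness
 * helper that is not shown in this file; a sketch of one way to recover a tag
 * for an address is below (an assumption -- the real helper may differ):
 */
#include <mach/mach.h>
#include <mach/mach_vm.h>

static unsigned int
user_tag_at_address(mach_vm_address_t where)
{
	vm_region_submap_info_data_64_t info;
	mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
	mach_vm_address_t addr = where;
	mach_vm_size_t size = 0;
	natural_t depth = 0;            /* stay at the top-level map */

	kern_return_t kr = mach_vm_region_recurse(mach_task_self(),
	    &addr, &size, &depth, (vm_region_recurse_info_t)&info, &count);
	if (kr != KERN_SUCCESS || addr > where) {
		return 0;               /* hole, or lookup failed */
	}
	return info.user_tag;
}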
.cow_unwriteable = successful_vm_allocate_fixed_overwrite, + + .permanent_entry = test_permanent_entry_fixed_overwrite, + .permanent_before_permanent = test_permanent_before_permanent_fixed_overwrite, + .permanent_before_allocation = test_permanent_before_allocation_fixed_overwrite, + .permanent_before_allocation_2 = test_permanent_before_allocation_fixed_overwrite_rdar144128567, + .permanent_before_hole = test_permanent_before_hole_fixed_overwrite, + .permanent_after_allocation = test_permanent_after_allocation_fixed_overwrite, + .permanent_after_hole = test_permanent_after_hole_fixed_overwrite, + + .single_submap_single_entry = successful_vm_allocate_fixed_overwrite, + .single_submap_single_entry_first_pages = successful_vm_allocate_fixed_overwrite, + .single_submap_single_entry_last_pages = successful_vm_allocate_fixed_overwrite, + .single_submap_single_entry_middle_pages = successful_vm_allocate_fixed_overwrite, + .single_submap_oversize_entry_at_start = successful_vm_allocate_fixed_overwrite, + .single_submap_oversize_entry_at_end = successful_vm_allocate_fixed_overwrite, + .single_submap_oversize_entry_at_both = successful_vm_allocate_fixed_overwrite, + + .submap_before_allocation = successful_vm_allocate_fixed_overwrite, + .submap_after_allocation = successful_vm_allocate_fixed_overwrite, + .submap_before_hole = successful_vm_allocate_fixed_overwrite, + .submap_after_hole = successful_vm_allocate_fixed_overwrite, + .submap_allocation_submap_one_entry = successful_vm_allocate_fixed_overwrite, + .submap_allocation_submap_two_entries = successful_vm_allocate_fixed_overwrite, + .submap_allocation_submap_three_entries = successful_vm_allocate_fixed_overwrite, + + .submap_before_allocation_ro = successful_vm_allocate_fixed_overwrite, + .submap_after_allocation_ro = successful_vm_allocate_fixed_overwrite, + .submap_before_hole_ro = successful_vm_allocate_fixed_overwrite, + .submap_after_hole_ro = successful_vm_allocate_fixed_overwrite, + .submap_allocation_submap_one_entry_ro = successful_vm_allocate_fixed_overwrite, + .submap_allocation_submap_two_entries_ro = successful_vm_allocate_fixed_overwrite, + .submap_allocation_submap_three_entries_ro = successful_vm_allocate_fixed_overwrite, + + .protection_single_000_000 = successful_vm_allocate_fixed_overwrite, + .protection_single_000_r00 = successful_vm_allocate_fixed_overwrite, + .protection_single_000_0w0 = successful_vm_allocate_fixed_overwrite, + .protection_single_000_rw0 = successful_vm_allocate_fixed_overwrite, + .protection_single_r00_r00 = successful_vm_allocate_fixed_overwrite, + .protection_single_r00_rw0 = successful_vm_allocate_fixed_overwrite, + .protection_single_0w0_0w0 = successful_vm_allocate_fixed_overwrite, + .protection_single_0w0_rw0 = successful_vm_allocate_fixed_overwrite, + .protection_single_rw0_rw0 = successful_vm_allocate_fixed_overwrite, + + .protection_pairs_000_000 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_000_r00 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_000_0w0 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_000_rw0 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_r00_000 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_r00_r00 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_r00_0w0 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_r00_rw0 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_0w0_000 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_0w0_r00 = 
successful_vm_allocate_fixed_overwrite, + .protection_pairs_0w0_0w0 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_0w0_rw0 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_rw0_000 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_rw0_r00 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_rw0_0w0 = successful_vm_allocate_fixed_overwrite, + .protection_pairs_rw0_rw0 = successful_vm_allocate_fixed_overwrite, + }; + + run_vm_tests("vm_allocate_fixed_overwrite", __FILE__, &tests, argc, argv); +} + +T_DECL(vm_allocate_fixed_overwrite_with_neighbor_tags, + "run vm_allocate(FIXED|OVERWRITE|tag) with various vm configurations " + "and tags copied from neighboring entries") +{ +#if workaround_rdar_143341561 + enable_non_fatal_vm_exc_guard(); +#endif + + vm_tests_t tests = { + .single_entry_1 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_entry_2 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_entry_3 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_entry_4 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .multiple_entries_1 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .multiple_entries_2 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .multiple_entries_3 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .multiple_entries_4 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .multiple_entries_5 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .multiple_entries_6 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .some_holes_1 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_2 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_3 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_4 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_5 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_6 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_7 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_8 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_9 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_10 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_11 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .some_holes_12 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .all_holes_1 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .all_holes_2 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .all_holes_3 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .all_holes_4 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .null_entry = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .nonresident_entry = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .resident_entry = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .shared_entry = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .shared_entry_discontiguous = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .shared_entry_partial = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .shared_entry_pairs = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .shared_entry_x1000 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .cow_entry = 
successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .cow_unreferenced = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .cow_nocow = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .nocow_cow = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .cow_unreadable = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .cow_unwriteable = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .permanent_entry = test_permanent_entry_fixed_overwrite_with_neighbor_tags, + .permanent_before_permanent = test_permanent_before_permanent_fixed_overwrite_with_neighbor_tags, + .permanent_before_allocation = test_permanent_before_allocation_fixed_overwrite_with_neighbor_tags, + .permanent_before_allocation_2 = test_permanent_before_allocation_fixed_overwrite_with_neighbor_tags_rdar144128567, + .permanent_before_hole = test_permanent_before_hole_fixed_overwrite_with_neighbor_tags, + .permanent_after_allocation = test_permanent_after_allocation_fixed_overwrite_with_neighbor_tags, + .permanent_after_hole = test_permanent_after_hole_fixed_overwrite_with_neighbor_tags, + + .single_submap_single_entry = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_submap_single_entry_first_pages = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_submap_single_entry_last_pages = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_submap_single_entry_middle_pages = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_submap_oversize_entry_at_start = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_submap_oversize_entry_at_end = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .single_submap_oversize_entry_at_both = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .submap_before_allocation = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_after_allocation = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_before_hole = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_after_hole = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_allocation_submap_one_entry = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_allocation_submap_two_entries = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_allocation_submap_three_entries = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .submap_before_allocation_ro = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_after_allocation_ro = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_before_hole_ro = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_after_hole_ro = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_allocation_submap_one_entry_ro = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_allocation_submap_two_entries_ro = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .submap_allocation_submap_three_entries_ro = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .protection_single_000_000 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_single_000_r00 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_single_000_0w0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_single_000_rw0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_single_r00_r00 = 
successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_single_r00_rw0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_single_0w0_0w0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_single_0w0_rw0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_single_rw0_rw0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + + .protection_pairs_000_000 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_000_r00 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_000_0w0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_000_rw0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_r00_000 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_r00_r00 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_r00_0w0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_r00_rw0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_0w0_000 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_0w0_r00 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_0w0_0w0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_0w0_rw0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_rw0_000 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_rw0_r00 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_rw0_0w0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + .protection_pairs_rw0_rw0 = successful_vm_allocate_fixed_overwrite_with_neighbor_tags, + }; + + run_vm_tests("vm_allocate_fixed_overwrite_with_neighbor_tags", __FILE__, &tests, argc, argv); +} diff --git a/tests/vm/configurator_vm_behavior_set.c b/tests/vm/configurator_vm_behavior_set.c new file mode 100644 index 000000000..db57c2eb9 --- /dev/null +++ b/tests/vm/configurator_vm_behavior_set.c @@ -0,0 +1,909 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_vm_behavior_set.c + * + * Test vm_behavior_set with many different VM states. + */ + +#include "configurator/vm_configurator_tests.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + +static void +write_one_memory( + checker_list_t *checker_list, + vm_entry_checker_t *checker) +{ + if (checker->kind == Allocation && + prot_contains_all(checker->protection, VM_PROT_READ | VM_PROT_WRITE)) { + checker_fault_for_prot_not_cow(checker_list, checker, VM_PROT_WRITE); + memset((char *)checker->address, 0xff, checker->size); + if (checker->object) { + checker->object->fill_pattern.mode = Fill; + checker->object->fill_pattern.pattern = 0xffffffffffffffff; + } + } +} + +static void +write_memory( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + entry_checker_range_t limit = + checker_list_find_range_including_holes(checker_list, start, size); + /* TODO: this writes beyond [start, size) */ + FOREACH_CHECKER(checker, limit) { + write_one_memory(checker_list, checker); + } +} + +/* Test vm_behavior_set(behavior). This supports several behaviors. */ +static test_result_t +vm_behavior_common_no_cow( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + vm_behavior_t behavior, + bool has_holes) +{ + kern_return_t kr; + test_result_t test_results[1]; + bool clip, reject_submaps; + + kern_return_t expected_kr = KERN_SUCCESS; + if (has_holes) { + expected_kr = KERN_INVALID_ADDRESS; + } + + switch (behavior) { + case VM_BEHAVIOR_DEFAULT: + clip = true; + reject_submaps = false; + break; + case VM_BEHAVIOR_FREE: + clip = false; + reject_submaps = false; + break; + case VM_BEHAVIOR_CAN_REUSE: + clip = false; + reject_submaps = true; + break; + default: + T_FAIL("don't know whether to clip with behavior %s", + name_for_behavior(behavior)); + return TestFailed; + } + + entry_checker_range_t limit; + if (has_holes) { + limit = checker_list_find_range_including_holes(checker_list, start, size); + } else { + limit = checker_list_find_range(checker_list, start, size); + if (clip) { + checker_clip_left(checker_list, limit.head, start); + } + bool rejected = false; + if (reject_submaps) { + FOREACH_CHECKER(checker, limit) { + if (checker->kind == Submap) { + expected_kr = KERN_INVALID_ADDRESS; + rejected = true; + break; + } + } + } + if (clip) { + checker_clip_right(checker_list, limit.tail, start + size); + } + } + + kr = mach_vm_behavior_set(mach_task_self(), start, size, behavior); + if (kr != expected_kr) { + T_FAIL("mach_vm_behavior_set(%s) failed (%s)", + name_for_behavior(behavior), name_for_kr(kr)); + return TestFailed; + } + + /* Some behaviors destroy the pages, which affects the fill. 
*/ + if (behavior == VM_BEHAVIOR_FREE) { + FOREACH_CHECKER(checker, limit) { + if (checker->object && checker->object->fill_pattern.mode == Fill) { + checker->object->fill_pattern.pattern = 0; + checker->object->fill_pattern.mode = DontFill; + } + } + } + + TEMP_CSTRING(when, "after vm_behavior_set(%s)", name_for_behavior(behavior)); + test_results[0] = verify_vm_state(checker_list, when); + + return worst_result(test_results, countof(test_results)); +} + +static test_result_t +vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + vm_behavior_t behavior, + bool rw, + bool has_holes) +{ + test_result_t result; + + result = vm_behavior_common_no_cow( + checker_list, start, size, behavior, has_holes); + if (result != TestSucceeded) { + return result; + } + + if (rw) { + /* write to the memory and do it again */ + write_memory(checker_list, start, size); + result = verify_vm_state(checker_list, "after write_memory"); + if (result != TestSucceeded) { + return result; + } + + result = vm_behavior_common_no_cow( + checker_list, start, size, behavior, has_holes); + if (result != TestSucceeded) { + return result; + } + } + + return result; +} + +static test_result_t +vm_behavior_default_no_cow_rw_no_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_DEFAULT, + true /* rw */, false /* holes */); +} + +static test_result_t +vm_behavior_default_no_cow_rw_with_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_DEFAULT, + true /* rw */, true /* holes */); +} + +static test_result_t +vm_behavior_default_no_cow_ro_no_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_DEFAULT, + false /* rw */, false /* holes */); +} + +static test_result_t +vm_behavior_default_no_cow_ro_with_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_DEFAULT, + false /* rw */, true /* holes */); +} + + +static test_result_t +vm_behavior_free_no_cow_rw_no_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_FREE, + true /* rw */, false /* holes */); +} + +static test_result_t +vm_behavior_free_no_cow_rw_with_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_FREE, + true /* rw */, true /* holes */); +} + +static test_result_t +vm_behavior_free_no_cow_ro_no_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_FREE, + false /* rw */, false /* holes */); +} + +static test_result_t +vm_behavior_free_no_cow_ro_with_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_FREE, + false /* rw */, true /* holes */); +} + + +static test_result_t 
+vm_behavior_can_reuse_no_cow_rw_no_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_CAN_REUSE, + true /* rw */, false /* holes */); +} + +static test_result_t +vm_behavior_can_reuse_no_cow_rw_with_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_CAN_REUSE, + true /* rw */, true /* holes */); +} + +static test_result_t +vm_behavior_can_reuse_no_cow_ro_no_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_CAN_REUSE, + false /* rw */, false /* holes */); +} + +static test_result_t +vm_behavior_can_reuse_no_cow_ro_with_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + return vm_behavior_no_cow_maybe_rw_maybe_holes( + checker_list, start, size, VM_BEHAVIOR_CAN_REUSE, + false /* rw */, true /* holes */); +} + + +static test_result_t +vm_behavior_zero_once( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + const char *message_suffix) +{ + kern_return_t expected_kr = KERN_SUCCESS; + kern_return_t kr; + entry_checker_range_t limit = + checker_list_find_range_including_holes(checker_list, start, size); + + /* + * vm_behavior_set(ZERO) stops at un-writeable pages + * so we can't use the common code from other behaviors + */ + + if (task_page_size_less_than_vm_page_size()) { + /* + * VM_BEHAVIOR_ZERO does nothing and returns KERN_NO_ACCESS + * if the map's page size is less than the VM's page size. + */ + T_LOG("note: VM_BEHAVIOR_ZERO does nothing on this platform"); + expected_kr = KERN_NO_ACCESS; + goto checker_update_done; + } + + /* Check for holes first. 
*/ + FOREACH_CHECKER(checker, limit) { + if (checker->kind == Hole) { + expected_kr = KERN_INVALID_ADDRESS; + goto checker_update_done; + } + } + + /* Zero the checkers' fill patterns, stopping if we hit an unacceptable entry */ + FOREACH_CHECKER(checker, limit) { + if (!prot_contains_all(checker->protection, VM_PROT_WRITE)) { + /* stop after the first unwriteable entry */ + expected_kr = KERN_PROTECTION_FAILURE; + goto checker_update_done; + } + if (checker->kind == Submap) { + /* stop at submaps */ + expected_kr = KERN_NO_ACCESS; + goto checker_update_done; + } + + /* writeable allocation: memory is now zeros */ + if (checker->object && checker->object->fill_pattern.mode == Fill) { + checker->object->fill_pattern.pattern = 0; + checker->object->fill_pattern.mode = DontFill; + } + } + +checker_update_done: + kr = mach_vm_behavior_set(mach_task_self(), start, size, VM_BEHAVIOR_ZERO); + if (kr != expected_kr) { + T_EXPECT_MACH_ERROR(kr, expected_kr, "mach_vm_behavior_set(VM_BEHAVIOR_ZERO)"); + return TestFailed; + } + + TEMP_CSTRING(when, "after vm_behavior_set(VM_BEHAVIOR_ZERO) %s", message_suffix); + return verify_vm_state(checker_list, when); +} + +static test_result_t +vm_behavior_zero( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + test_result_t result; + + result = vm_behavior_zero_once(checker_list, start, size, "first time"); + if (result != TestSucceeded) { + return result; + } + + /* write to the memory and do it again */ + bool any_written = false; + entry_checker_range_t limit = checker_list_find_range_including_holes(checker_list, start, size); + /* TODO: this writes beyond [start, size) */ + FOREACH_CHECKER(checker, limit) { + if (checker->kind != Allocation) { + continue; + } + if (prot_contains_all(checker->protection, VM_PROT_READ | VM_PROT_WRITE)) { + any_written = true; + write_one_memory(checker_list, checker); + } else { + /* stop after first unwriteable entry */ + break; + } + } + + if (any_written) { + result = verify_vm_state(checker_list, "after write_memory"); + if (result != TestSucceeded) { + return result; + } + + result = vm_behavior_zero_once(checker_list, start, size, "second time"); + if (result != TestSucceeded) { + return result; + } + } + + return result; +} + + +T_DECL(vm_behavior_set_default, + "run vm_behavior_set(DEFAULT) with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = vm_behavior_default_no_cow_rw_no_holes, + .single_entry_2 = vm_behavior_default_no_cow_rw_no_holes, + .single_entry_3 = vm_behavior_default_no_cow_rw_no_holes, + .single_entry_4 = vm_behavior_default_no_cow_rw_no_holes, + + .multiple_entries_1 = vm_behavior_default_no_cow_rw_no_holes, + .multiple_entries_2 = vm_behavior_default_no_cow_rw_no_holes, + .multiple_entries_3 = vm_behavior_default_no_cow_rw_no_holes, + .multiple_entries_4 = vm_behavior_default_no_cow_rw_no_holes, + .multiple_entries_5 = vm_behavior_default_no_cow_rw_no_holes, + .multiple_entries_6 = vm_behavior_default_no_cow_rw_no_holes, + + .some_holes_1 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_2 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_3 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_4 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_5 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_6 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_7 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_8 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_9 = 
vm_behavior_default_no_cow_rw_with_holes, + .some_holes_10 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_11 = vm_behavior_default_no_cow_rw_with_holes, + .some_holes_12 = vm_behavior_default_no_cow_rw_with_holes, + + .all_holes_1 = vm_behavior_default_no_cow_rw_with_holes, + .all_holes_2 = vm_behavior_default_no_cow_rw_with_holes, + .all_holes_3 = vm_behavior_default_no_cow_rw_with_holes, + .all_holes_4 = vm_behavior_default_no_cow_rw_with_holes, + + .null_entry = vm_behavior_default_no_cow_rw_no_holes, + .nonresident_entry = vm_behavior_default_no_cow_rw_no_holes, + .resident_entry = vm_behavior_default_no_cow_rw_no_holes, + + .shared_entry = test_is_unimplemented, + .shared_entry_discontiguous = test_is_unimplemented, + .shared_entry_partial = test_is_unimplemented, + .shared_entry_pairs = test_is_unimplemented, + .shared_entry_x1000 = test_is_unimplemented, + + .cow_entry = test_is_unimplemented, + .cow_unreferenced = test_is_unimplemented, + .cow_nocow = test_is_unimplemented, + .nocow_cow = test_is_unimplemented, + .cow_unreadable = test_is_unimplemented, + .cow_unwriteable = test_is_unimplemented, + + .permanent_entry = vm_behavior_default_no_cow_rw_no_holes, + .permanent_before_permanent = vm_behavior_default_no_cow_rw_no_holes, + .permanent_before_allocation = vm_behavior_default_no_cow_rw_no_holes, + .permanent_before_allocation_2 = vm_behavior_default_no_cow_rw_no_holes, + .permanent_before_hole = vm_behavior_default_no_cow_rw_with_holes, + .permanent_after_allocation = vm_behavior_default_no_cow_rw_no_holes, + .permanent_after_hole = vm_behavior_default_no_cow_rw_with_holes, + + .single_submap_single_entry = vm_behavior_default_no_cow_rw_no_holes, + .single_submap_single_entry_first_pages = vm_behavior_default_no_cow_rw_no_holes, + .single_submap_single_entry_last_pages = vm_behavior_default_no_cow_rw_no_holes, + .single_submap_single_entry_middle_pages = vm_behavior_default_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_start = vm_behavior_default_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_end = vm_behavior_default_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_both = vm_behavior_default_no_cow_rw_no_holes, + + .submap_before_allocation = vm_behavior_default_no_cow_rw_no_holes, + .submap_after_allocation = vm_behavior_default_no_cow_rw_no_holes, + .submap_before_hole = vm_behavior_default_no_cow_rw_with_holes, + .submap_after_hole = vm_behavior_default_no_cow_rw_with_holes, + .submap_allocation_submap_one_entry = vm_behavior_default_no_cow_rw_no_holes, + .submap_allocation_submap_two_entries = vm_behavior_default_no_cow_rw_no_holes, + .submap_allocation_submap_three_entries = vm_behavior_default_no_cow_rw_no_holes, + + .submap_before_allocation_ro = vm_behavior_default_no_cow_ro_no_holes, + .submap_after_allocation_ro = vm_behavior_default_no_cow_ro_no_holes, + .submap_before_hole_ro = vm_behavior_default_no_cow_ro_with_holes, + .submap_after_hole_ro = vm_behavior_default_no_cow_ro_with_holes, + .submap_allocation_submap_one_entry_ro = vm_behavior_default_no_cow_ro_no_holes, + .submap_allocation_submap_two_entries_ro = vm_behavior_default_no_cow_ro_no_holes, + .submap_allocation_submap_three_entries_ro = vm_behavior_default_no_cow_ro_no_holes, + + .protection_single_000_000 = vm_behavior_default_no_cow_ro_no_holes, + .protection_single_000_r00 = vm_behavior_default_no_cow_ro_no_holes, + .protection_single_r00_r00 = vm_behavior_default_no_cow_ro_no_holes, + .protection_single_000_0w0 = vm_behavior_default_no_cow_ro_no_holes, + 
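/*
 * Outside the checker framework, the behaviors exercised here reduce to plain
 * mach_vm_behavior_set() calls (madvise(2) is the usual front end for some of
 * them, e.g. MADV_FREE maps to VM_BEHAVIOR_FREE).  The helpers above also
 * encode that VM_BEHAVIOR_ZERO is stricter: it stops at unwriteable entries
 * and submaps, and is a no-op returning KERN_NO_ACCESS when the task page
 * size is smaller than the kernel's.  A minimal sketch of the return codes
 * these tables expect -- success on a writable anonymous range,
 * KERN_INVALID_ADDRESS once the range contains a hole:
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <mach/mach.h>
#include <mach/mach_vm.h>

static void
behavior_set_sketch(void)
{
	mach_vm_address_t addr = 0;
	mach_vm_size_t size = 4 * (mach_vm_size_t)vm_page_size;
	kern_return_t kr;

	kr = mach_vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE);
	assert(kr == KERN_SUCCESS);
	memset((void *)(uintptr_t)addr, 0xff, (size_t)size);    /* dirty the pages */

	/* FREE lets the pager discard the dirty contents. */
	kr = mach_vm_behavior_set(mach_task_self(), addr, size, VM_BEHAVIOR_FREE);
	assert(kr == KERN_SUCCESS);

	/* Punch a hole in the middle; the same call now reports the hole. */
	kr = mach_vm_deallocate(mach_task_self(), addr + size / 2, size / 4);
	assert(kr == KERN_SUCCESS);
	kr = mach_vm_behavior_set(mach_task_self(), addr, size, VM_BEHAVIOR_FREE);
	assert(kr == KERN_INVALID_ADDRESS);

	(void)mach_vm_deallocate(mach_task_self(), addr, size);
}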
.protection_single_0w0_0w0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_single_000_rw0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_single_r00_rw0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_single_0w0_rw0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_single_rw0_rw0 = vm_behavior_default_no_cow_rw_no_holes, + + .protection_pairs_000_000 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_000_r00 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_000_0w0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_000_rw0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_r00_000 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_r00_r00 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_r00_0w0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_r00_rw0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_0w0_000 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_0w0_r00 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_0w0_0w0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_0w0_rw0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_rw0_000 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_rw0_r00 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_rw0_0w0 = vm_behavior_default_no_cow_ro_no_holes, + .protection_pairs_rw0_rw0 = vm_behavior_default_no_cow_rw_no_holes, + }; + + run_vm_tests("vm_behavior_set_default", __FILE__, &tests, argc, argv); +} + + +T_DECL(vm_behavior_set_free, + "run vm_behavior_set(FREE) with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = vm_behavior_free_no_cow_rw_no_holes, + .single_entry_2 = vm_behavior_free_no_cow_rw_no_holes, + .single_entry_3 = vm_behavior_free_no_cow_rw_no_holes, + .single_entry_4 = vm_behavior_free_no_cow_rw_no_holes, + + .multiple_entries_1 = vm_behavior_free_no_cow_rw_no_holes, + .multiple_entries_2 = vm_behavior_free_no_cow_rw_no_holes, + .multiple_entries_3 = vm_behavior_free_no_cow_rw_no_holes, + .multiple_entries_4 = vm_behavior_free_no_cow_rw_no_holes, + .multiple_entries_5 = vm_behavior_free_no_cow_rw_no_holes, + .multiple_entries_6 = vm_behavior_free_no_cow_rw_no_holes, + + .some_holes_1 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_2 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_3 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_4 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_5 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_6 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_7 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_8 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_9 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_10 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_11 = vm_behavior_free_no_cow_rw_with_holes, + .some_holes_12 = vm_behavior_free_no_cow_rw_with_holes, + + .all_holes_1 = vm_behavior_free_no_cow_rw_with_holes, + .all_holes_2 = vm_behavior_free_no_cow_rw_with_holes, + .all_holes_3 = vm_behavior_free_no_cow_rw_with_holes, + .all_holes_4 = vm_behavior_free_no_cow_rw_with_holes, + + .null_entry = vm_behavior_free_no_cow_rw_no_holes, + .nonresident_entry = vm_behavior_free_no_cow_rw_no_holes, + .resident_entry = vm_behavior_free_no_cow_rw_no_holes, + + .shared_entry = test_is_unimplemented, + .shared_entry_discontiguous = test_is_unimplemented, + .shared_entry_partial = test_is_unimplemented, + .shared_entry_pairs = 
test_is_unimplemented, + .shared_entry_x1000 = test_is_unimplemented, + + .cow_entry = test_is_unimplemented, + .cow_unreferenced = test_is_unimplemented, + .cow_nocow = test_is_unimplemented, + .nocow_cow = test_is_unimplemented, + .cow_unreadable = test_is_unimplemented, + .cow_unwriteable = test_is_unimplemented, + + .permanent_entry = vm_behavior_free_no_cow_rw_no_holes, + .permanent_before_permanent = vm_behavior_free_no_cow_rw_no_holes, + .permanent_before_allocation = vm_behavior_free_no_cow_rw_no_holes, + .permanent_before_allocation_2 = vm_behavior_free_no_cow_rw_no_holes, + .permanent_before_hole = vm_behavior_free_no_cow_rw_with_holes, + .permanent_after_allocation = vm_behavior_free_no_cow_rw_no_holes, + .permanent_after_hole = vm_behavior_free_no_cow_rw_with_holes, + + .single_submap_single_entry = vm_behavior_free_no_cow_rw_no_holes, + .single_submap_single_entry_first_pages = vm_behavior_free_no_cow_rw_no_holes, + .single_submap_single_entry_last_pages = vm_behavior_free_no_cow_rw_no_holes, + .single_submap_single_entry_middle_pages = vm_behavior_free_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_start = vm_behavior_free_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_end = vm_behavior_free_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_both = vm_behavior_free_no_cow_rw_no_holes, + + .submap_before_allocation = vm_behavior_free_no_cow_rw_no_holes, + .submap_after_allocation = vm_behavior_free_no_cow_rw_no_holes, + .submap_before_hole = vm_behavior_free_no_cow_rw_with_holes, + .submap_after_hole = vm_behavior_free_no_cow_rw_with_holes, + .submap_allocation_submap_one_entry = vm_behavior_free_no_cow_rw_no_holes, + .submap_allocation_submap_two_entries = vm_behavior_free_no_cow_rw_no_holes, + .submap_allocation_submap_three_entries = vm_behavior_free_no_cow_rw_no_holes, + + .submap_before_allocation_ro = vm_behavior_free_no_cow_ro_no_holes, + .submap_after_allocation_ro = vm_behavior_free_no_cow_ro_no_holes, + .submap_before_hole_ro = vm_behavior_free_no_cow_ro_with_holes, + .submap_after_hole_ro = vm_behavior_free_no_cow_ro_with_holes, + .submap_allocation_submap_one_entry_ro = vm_behavior_free_no_cow_ro_no_holes, + .submap_allocation_submap_two_entries_ro = vm_behavior_free_no_cow_ro_no_holes, + .submap_allocation_submap_three_entries_ro = vm_behavior_free_no_cow_ro_no_holes, + + .protection_single_000_000 = vm_behavior_free_no_cow_ro_no_holes, + .protection_single_000_r00 = vm_behavior_free_no_cow_ro_no_holes, + .protection_single_r00_r00 = vm_behavior_free_no_cow_ro_no_holes, + .protection_single_000_0w0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_single_0w0_0w0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_single_000_rw0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_single_r00_rw0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_single_0w0_rw0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_single_rw0_rw0 = vm_behavior_free_no_cow_rw_no_holes, + + .protection_pairs_000_000 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_000_r00 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_000_0w0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_000_rw0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_r00_000 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_r00_r00 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_r00_0w0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_r00_rw0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_0w0_000 = 
vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_0w0_r00 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_0w0_0w0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_0w0_rw0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_rw0_000 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_rw0_r00 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_rw0_0w0 = vm_behavior_free_no_cow_ro_no_holes, + .protection_pairs_rw0_rw0 = vm_behavior_free_no_cow_rw_no_holes, + }; + + run_vm_tests("vm_behavior_set_free", __FILE__, &tests, argc, argv); +} + + +T_DECL(vm_behavior_set_can_reuse, + "run vm_behavior_set(CAN_REUSE) with various vm configurations") +{ + if (isRosetta()) { + /* + * CAN_REUSE requires vm_object page alignment, + * but Rosetta is less aligned than that and + * these tests don't yet have a way to adapt. + */ + T_PASS("warning: TODO wrong alignment for vm_behavior_set(CAN_REUSE) " + "on Rosetta; just passing instead"); + return; + } + + vm_tests_t tests = { + .single_entry_1 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_entry_2 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_entry_3 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_entry_4 = vm_behavior_can_reuse_no_cow_rw_no_holes, + + .multiple_entries_1 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .multiple_entries_2 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .multiple_entries_3 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .multiple_entries_4 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .multiple_entries_5 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .multiple_entries_6 = vm_behavior_can_reuse_no_cow_rw_no_holes, + + .some_holes_1 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_2 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_3 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_4 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_5 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_6 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_7 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_8 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_9 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_10 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_11 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .some_holes_12 = vm_behavior_can_reuse_no_cow_rw_with_holes, + + .all_holes_1 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .all_holes_2 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .all_holes_3 = vm_behavior_can_reuse_no_cow_rw_with_holes, + .all_holes_4 = vm_behavior_can_reuse_no_cow_rw_with_holes, + + .null_entry = vm_behavior_can_reuse_no_cow_rw_no_holes, + .nonresident_entry = vm_behavior_can_reuse_no_cow_rw_no_holes, + .resident_entry = vm_behavior_can_reuse_no_cow_rw_no_holes, + + .shared_entry = test_is_unimplemented, + .shared_entry_discontiguous = test_is_unimplemented, + .shared_entry_partial = test_is_unimplemented, + .shared_entry_pairs = test_is_unimplemented, + .shared_entry_x1000 = test_is_unimplemented, + + .cow_entry = test_is_unimplemented, + .cow_unreferenced = test_is_unimplemented, + .cow_nocow = test_is_unimplemented, + .nocow_cow = test_is_unimplemented, + .cow_unreadable = test_is_unimplemented, + .cow_unwriteable = test_is_unimplemented, + + .permanent_entry = vm_behavior_can_reuse_no_cow_rw_no_holes, + .permanent_before_permanent = vm_behavior_can_reuse_no_cow_rw_no_holes, + .permanent_before_allocation = vm_behavior_can_reuse_no_cow_rw_no_holes, + 
.permanent_before_allocation_2 = vm_behavior_can_reuse_no_cow_rw_no_holes, + .permanent_before_hole = vm_behavior_can_reuse_no_cow_rw_with_holes, + .permanent_after_allocation = vm_behavior_can_reuse_no_cow_rw_no_holes, + .permanent_after_hole = vm_behavior_can_reuse_no_cow_rw_with_holes, + + .single_submap_single_entry = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_submap_single_entry_first_pages = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_submap_single_entry_last_pages = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_submap_single_entry_middle_pages = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_start = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_end = vm_behavior_can_reuse_no_cow_rw_no_holes, + .single_submap_oversize_entry_at_both = vm_behavior_can_reuse_no_cow_rw_no_holes, + + .submap_before_allocation = vm_behavior_can_reuse_no_cow_rw_no_holes, + .submap_after_allocation = vm_behavior_can_reuse_no_cow_rw_no_holes, + .submap_before_hole = vm_behavior_can_reuse_no_cow_rw_with_holes, + .submap_after_hole = vm_behavior_can_reuse_no_cow_rw_with_holes, + .submap_allocation_submap_one_entry = vm_behavior_can_reuse_no_cow_rw_no_holes, + .submap_allocation_submap_two_entries = vm_behavior_can_reuse_no_cow_rw_no_holes, + .submap_allocation_submap_three_entries = vm_behavior_can_reuse_no_cow_rw_no_holes, + + .submap_before_allocation_ro = vm_behavior_can_reuse_no_cow_ro_no_holes, + .submap_after_allocation_ro = vm_behavior_can_reuse_no_cow_ro_no_holes, + .submap_before_hole_ro = vm_behavior_can_reuse_no_cow_ro_with_holes, + .submap_after_hole_ro = vm_behavior_can_reuse_no_cow_ro_with_holes, + .submap_allocation_submap_one_entry_ro = vm_behavior_can_reuse_no_cow_ro_no_holes, + .submap_allocation_submap_two_entries_ro = vm_behavior_can_reuse_no_cow_ro_no_holes, + .submap_allocation_submap_three_entries_ro = vm_behavior_can_reuse_no_cow_ro_no_holes, + + .protection_single_000_000 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_single_000_r00 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_single_r00_r00 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_single_000_0w0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_single_0w0_0w0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_single_000_rw0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_single_r00_rw0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_single_0w0_rw0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_single_rw0_rw0 = vm_behavior_can_reuse_no_cow_rw_no_holes, + + .protection_pairs_000_000 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_000_r00 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_000_0w0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_000_rw0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_r00_000 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_r00_r00 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_r00_0w0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_r00_rw0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_0w0_000 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_0w0_r00 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_0w0_0w0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_0w0_rw0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_rw0_000 = 
vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_rw0_r00 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_rw0_0w0 = vm_behavior_can_reuse_no_cow_ro_no_holes, + .protection_pairs_rw0_rw0 = vm_behavior_can_reuse_no_cow_rw_no_holes, + }; + + run_vm_tests("vm_behavior_set_can_reuse", __FILE__, &tests, argc, argv); +} + + +T_DECL(vm_behavior_set_zero, + "run vm_behavior_set(ZERO) with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = vm_behavior_zero, + .single_entry_2 = vm_behavior_zero, + .single_entry_3 = vm_behavior_zero, + .single_entry_4 = vm_behavior_zero, + + .multiple_entries_1 = vm_behavior_zero, + .multiple_entries_2 = vm_behavior_zero, + .multiple_entries_3 = vm_behavior_zero, + .multiple_entries_4 = vm_behavior_zero, + .multiple_entries_5 = vm_behavior_zero, + .multiple_entries_6 = vm_behavior_zero, + + .some_holes_1 = vm_behavior_zero, + .some_holes_2 = vm_behavior_zero, + .some_holes_3 = vm_behavior_zero, + .some_holes_4 = vm_behavior_zero, + .some_holes_5 = vm_behavior_zero, + .some_holes_6 = vm_behavior_zero, + .some_holes_7 = vm_behavior_zero, + .some_holes_8 = vm_behavior_zero, + .some_holes_9 = vm_behavior_zero, + .some_holes_10 = vm_behavior_zero, + .some_holes_11 = vm_behavior_zero, + .some_holes_12 = vm_behavior_zero, + + .all_holes_1 = vm_behavior_zero, + .all_holes_2 = vm_behavior_zero, + .all_holes_3 = vm_behavior_zero, + .all_holes_4 = vm_behavior_zero, + + .null_entry = vm_behavior_zero, + .nonresident_entry = vm_behavior_zero, + .resident_entry = vm_behavior_zero, + + .shared_entry = test_is_unimplemented, + .shared_entry_discontiguous = test_is_unimplemented, + .shared_entry_partial = test_is_unimplemented, + .shared_entry_pairs = test_is_unimplemented, + .shared_entry_x1000 = test_is_unimplemented, + + .cow_entry = test_is_unimplemented, + .cow_unreferenced = test_is_unimplemented, + .cow_nocow = test_is_unimplemented, + .nocow_cow = test_is_unimplemented, + .cow_unreadable = test_is_unimplemented, + .cow_unwriteable = test_is_unimplemented, + + .permanent_entry = vm_behavior_zero, + .permanent_before_permanent = vm_behavior_zero, + .permanent_before_allocation = vm_behavior_zero, + .permanent_before_allocation_2 = vm_behavior_zero, + .permanent_before_hole = vm_behavior_zero, + .permanent_after_allocation = vm_behavior_zero, + .permanent_after_hole = vm_behavior_zero, + + .single_submap_single_entry = vm_behavior_zero, + .single_submap_single_entry_first_pages = vm_behavior_zero, + .single_submap_single_entry_last_pages = vm_behavior_zero, + .single_submap_single_entry_middle_pages = vm_behavior_zero, + .single_submap_oversize_entry_at_start = vm_behavior_zero, + .single_submap_oversize_entry_at_end = vm_behavior_zero, + .single_submap_oversize_entry_at_both = vm_behavior_zero, + + .submap_before_allocation = vm_behavior_zero, + .submap_after_allocation = vm_behavior_zero, + .submap_before_hole = vm_behavior_zero, + .submap_after_hole = vm_behavior_zero, + .submap_allocation_submap_one_entry = vm_behavior_zero, + .submap_allocation_submap_two_entries = vm_behavior_zero, + .submap_allocation_submap_three_entries = vm_behavior_zero, + + .submap_before_allocation_ro = vm_behavior_zero, + .submap_after_allocation_ro = vm_behavior_zero, + .submap_before_hole_ro = vm_behavior_zero, + .submap_after_hole_ro = vm_behavior_zero, + .submap_allocation_submap_one_entry_ro = vm_behavior_zero, + .submap_allocation_submap_two_entries_ro = vm_behavior_zero, + .submap_allocation_submap_three_entries_ro = vm_behavior_zero, + + 
.protection_single_000_000 = vm_behavior_zero, + .protection_single_000_r00 = vm_behavior_zero, + .protection_single_r00_r00 = vm_behavior_zero, + .protection_single_000_0w0 = vm_behavior_zero, + .protection_single_0w0_0w0 = vm_behavior_zero, + .protection_single_000_rw0 = vm_behavior_zero, + .protection_single_r00_rw0 = vm_behavior_zero, + .protection_single_0w0_rw0 = vm_behavior_zero, + .protection_single_rw0_rw0 = vm_behavior_zero, + + .protection_pairs_000_000 = vm_behavior_zero, + .protection_pairs_000_r00 = vm_behavior_zero, + .protection_pairs_000_0w0 = vm_behavior_zero, + .protection_pairs_000_rw0 = vm_behavior_zero, + .protection_pairs_r00_000 = vm_behavior_zero, + .protection_pairs_r00_r00 = vm_behavior_zero, + .protection_pairs_r00_0w0 = vm_behavior_zero, + .protection_pairs_r00_rw0 = vm_behavior_zero, + .protection_pairs_0w0_000 = vm_behavior_zero, + .protection_pairs_0w0_r00 = vm_behavior_zero, + .protection_pairs_0w0_0w0 = vm_behavior_zero, + .protection_pairs_0w0_rw0 = vm_behavior_zero, + .protection_pairs_rw0_000 = vm_behavior_zero, + .protection_pairs_rw0_r00 = vm_behavior_zero, + .protection_pairs_rw0_0w0 = vm_behavior_zero, + .protection_pairs_rw0_rw0 = vm_behavior_zero, + }; + + run_vm_tests("vm_behavior_set_zero", __FILE__, &tests, argc, argv); +} diff --git a/tests/vm/configurator_vm_deallocate.c b/tests/vm/configurator_vm_deallocate.c new file mode 100644 index 000000000..e1b34d36b --- /dev/null +++ b/tests/vm/configurator_vm_deallocate.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_vm_deallocate.c + * + * Test vm_deallocate with many different VM states. 
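+ *
+ * A minimal sketch (not part of the original tests) of the "dealloc gap"
+ * case the guarded variants below exercise, assuming <mach/mach_vm.h> and
+ * <assert.h> are available and vm_page_size is the host page size:
+ *
+ *     mach_vm_address_t addr = 0;
+ *     kern_return_t kr = mach_vm_allocate(mach_task_self(), &addr,
+ *         3 * vm_page_size, VM_FLAGS_ANYWHERE);
+ *     assert(kr == KERN_SUCCESS);
+ *     // punch a hole in the middle page
+ *     kr = mach_vm_deallocate(mach_task_self(), addr + vm_page_size,
+ *         vm_page_size);
+ *     assert(kr == KERN_SUCCESS);
+ *     // deallocating across the gap still returns KERN_SUCCESS; with the
+ *     // non-fatal VM guard enabled (as these tests arrange) the kernel
+ *     // also raises EXC_GUARD with flavor kGUARD_EXC_DEALLOC_GAP
+ *     kr = mach_vm_deallocate(mach_task_self(), addr, 3 * vm_page_size);
+ *     assert(kr == KERN_SUCCESS);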
+ */ + +#include "configurator/vm_configurator_tests.h" +#include "configurator/vm_configurator_helpers.h" +#include "exc_guard_helper.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + +static bool +do_successful_vm_deallocate_guarded(mach_vm_address_t start, mach_vm_size_t size) +{ + __block kern_return_t kr; + exc_guard_helper_info_t exc_info; + bool caught_exception; + + caught_exception = + block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + kr = mach_vm_deallocate(mach_task_self(), start, size); + }); + + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + return false; + } + if (caught_exception) { + T_FAIL("unexpected EXC_GUARD during mach_vm_deallocate"); + return false; + } + + return true; +} + +static bool +do_vm_deallocate_holes_guarded(mach_vm_address_t start, mach_vm_size_t size) +{ + __block kern_return_t kr; + exc_guard_helper_info_t exc_info; + bool caught_exception; + + caught_exception = + block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + kr = mach_vm_deallocate(mach_task_self(), start, size); + }); + + /* non-fatal EXC_GUARD returns success */ + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate guarded"); + return false; + } + if (!caught_exception) { + T_FAIL("expected EXC_GUARD during mach_vm_deallocate"); + return false; + } + if (exc_info.catch_count != 1) { + T_EXPECT_EQ(exc_info.catch_count, 1, "caught exception count"); + return false; + } + if (exc_info.guard_flavor != kGUARD_EXC_DEALLOC_GAP) { + T_EXPECT_EQ(exc_info.guard_flavor, kGUARD_EXC_DEALLOC_GAP, "caught exception flavor"); + return false; + } + + return true; +} + +static test_result_t +successful_vm_deallocate( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + checker_perform_successful_vm_deallocate(checker_list, start, size); + kr = mach_vm_deallocate(mach_task_self(), start, size); + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +successful_vm_deallocate_guarded( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + checker_perform_successful_vm_deallocate(checker_list, start, size); + if (!do_successful_vm_deallocate_guarded(start, size)) { + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_holes_guarded( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + checker_perform_successful_vm_deallocate(checker_list, start, size); + if (!do_vm_deallocate_holes_guarded(start, size)) { + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + checker_perform_vm_deallocate_permanent(checker_list, start, size); + kr = mach_vm_deallocate(mach_task_self(), start, size); + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static 
test_result_t +vm_deallocate_permanent_before_permanent( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + kr = mach_vm_deallocate(mach_task_self(), start, size); + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_before_allocation( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_successful_vm_deallocate(checker_list, start + size / 2, size / 2); + kr = mach_vm_deallocate(mach_task_self(), start, size); + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_before_hole( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* no changes to checkers in [start + size / 2, start + size) */ + kr = mach_vm_deallocate(mach_task_self(), start, size); + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_after_allocation( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + checker_perform_successful_vm_deallocate(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + kr = mach_vm_deallocate(mach_task_self(), start, size); + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_after_hole( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + /* no changes to checkers in [start, start + size / 2) */ + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + kr = mach_vm_deallocate(mach_task_self(), start, size); + if (kr != KERN_SUCCESS) { + T_EXPECT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + + +static test_result_t +vm_deallocate_permanent_entry_guarded( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + checker_perform_vm_deallocate_permanent(checker_list, start, size); + if (!do_successful_vm_deallocate_guarded(start, size)) { + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_before_permanent_guarded( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + if (!do_successful_vm_deallocate_guarded(start, size)) { + return TestFailed; + } + + return 
verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_before_allocation_guarded( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + checker_perform_successful_vm_deallocate(checker_list, start + size / 2, size / 2); + if (!do_successful_vm_deallocate_guarded(start, size)) { + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_before_hole_guarded( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + checker_perform_vm_deallocate_permanent(checker_list, start, size / 2); + /* no changes to checkers in [start + size / 2, start + size) */ + if (!do_vm_deallocate_holes_guarded(start, size)) { + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_after_allocation_guarded( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + checker_perform_successful_vm_deallocate(checker_list, start, size / 2); + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + if (!do_successful_vm_deallocate_guarded(start, size)) { + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + +static test_result_t +vm_deallocate_permanent_after_hole_guarded( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* no changes to checkers in [start, start + size / 2) */ + checker_perform_vm_deallocate_permanent(checker_list, start + size / 2, size / 2); + if (!do_vm_deallocate_holes_guarded(start, size)) { + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_deallocate"); +} + + +T_DECL(vm_deallocate_unguarded, + "run vm_deallocate with various vm configurations; EXC_GUARD disabled") +{ + vm_tests_t tests = { + .single_entry_1 = successful_vm_deallocate, + .single_entry_2 = successful_vm_deallocate, + .single_entry_3 = successful_vm_deallocate, + .single_entry_4 = successful_vm_deallocate, + + .multiple_entries_1 = successful_vm_deallocate, + .multiple_entries_2 = successful_vm_deallocate, + .multiple_entries_3 = successful_vm_deallocate, + .multiple_entries_4 = successful_vm_deallocate, + .multiple_entries_5 = successful_vm_deallocate, + .multiple_entries_6 = successful_vm_deallocate, + + .some_holes_1 = successful_vm_deallocate, + .some_holes_2 = successful_vm_deallocate, + .some_holes_3 = successful_vm_deallocate, + .some_holes_4 = successful_vm_deallocate, + .some_holes_5 = successful_vm_deallocate, + .some_holes_6 = successful_vm_deallocate, + .some_holes_7 = successful_vm_deallocate, + .some_holes_8 = successful_vm_deallocate, + .some_holes_9 = successful_vm_deallocate, + .some_holes_10 = successful_vm_deallocate, + .some_holes_11 = successful_vm_deallocate, + .some_holes_12 = successful_vm_deallocate, + + .all_holes_1 = successful_vm_deallocate, + .all_holes_2 = successful_vm_deallocate, + .all_holes_3 = successful_vm_deallocate, + .all_holes_4 = successful_vm_deallocate, + + .null_entry = successful_vm_deallocate, + .nonresident_entry = successful_vm_deallocate, + .resident_entry = successful_vm_deallocate, + + .shared_entry = successful_vm_deallocate, + .shared_entry_discontiguous = successful_vm_deallocate, + .shared_entry_partial = successful_vm_deallocate, + .shared_entry_pairs = 
successful_vm_deallocate, + .shared_entry_x1000 = successful_vm_deallocate, + + .cow_entry = successful_vm_deallocate, + .cow_unreferenced = successful_vm_deallocate, + .cow_nocow = successful_vm_deallocate, + .nocow_cow = successful_vm_deallocate, + .cow_unreadable = successful_vm_deallocate, + .cow_unwriteable = successful_vm_deallocate, + + .permanent_entry = vm_deallocate_permanent_entry, + .permanent_before_permanent = vm_deallocate_permanent_before_permanent, + .permanent_before_allocation = vm_deallocate_permanent_before_allocation, + .permanent_before_allocation_2 = vm_deallocate_permanent_before_allocation, + .permanent_before_hole = vm_deallocate_permanent_before_hole, + .permanent_after_allocation = vm_deallocate_permanent_after_allocation, + .permanent_after_hole = vm_deallocate_permanent_after_hole, + + .single_submap_single_entry = successful_vm_deallocate, + .single_submap_single_entry_first_pages = successful_vm_deallocate, + .single_submap_single_entry_last_pages = successful_vm_deallocate, + .single_submap_single_entry_middle_pages = successful_vm_deallocate, + .single_submap_oversize_entry_at_start = successful_vm_deallocate, + .single_submap_oversize_entry_at_end = successful_vm_deallocate, + .single_submap_oversize_entry_at_both = successful_vm_deallocate, + + .submap_before_allocation = successful_vm_deallocate, + .submap_after_allocation = successful_vm_deallocate, + .submap_before_hole = successful_vm_deallocate, + .submap_after_hole = successful_vm_deallocate, + .submap_allocation_submap_one_entry = successful_vm_deallocate, + .submap_allocation_submap_two_entries = successful_vm_deallocate, + .submap_allocation_submap_three_entries = successful_vm_deallocate, + + .submap_before_allocation_ro = successful_vm_deallocate, + .submap_after_allocation_ro = successful_vm_deallocate, + .submap_before_hole_ro = successful_vm_deallocate, + .submap_after_hole_ro = successful_vm_deallocate, + .submap_allocation_submap_one_entry_ro = successful_vm_deallocate, + .submap_allocation_submap_two_entries_ro = successful_vm_deallocate, + .submap_allocation_submap_three_entries_ro = successful_vm_deallocate, + + .protection_single_000_000 = successful_vm_deallocate, + .protection_single_000_r00 = successful_vm_deallocate, + .protection_single_000_0w0 = successful_vm_deallocate, + .protection_single_000_rw0 = successful_vm_deallocate, + .protection_single_r00_r00 = successful_vm_deallocate, + .protection_single_r00_rw0 = successful_vm_deallocate, + .protection_single_0w0_0w0 = successful_vm_deallocate, + .protection_single_0w0_rw0 = successful_vm_deallocate, + .protection_single_rw0_rw0 = successful_vm_deallocate, + + .protection_pairs_000_000 = successful_vm_deallocate, + .protection_pairs_000_r00 = successful_vm_deallocate, + .protection_pairs_000_0w0 = successful_vm_deallocate, + .protection_pairs_000_rw0 = successful_vm_deallocate, + .protection_pairs_r00_000 = successful_vm_deallocate, + .protection_pairs_r00_r00 = successful_vm_deallocate, + .protection_pairs_r00_0w0 = successful_vm_deallocate, + .protection_pairs_r00_rw0 = successful_vm_deallocate, + .protection_pairs_0w0_000 = successful_vm_deallocate, + .protection_pairs_0w0_r00 = successful_vm_deallocate, + .protection_pairs_0w0_0w0 = successful_vm_deallocate, + .protection_pairs_0w0_rw0 = successful_vm_deallocate, + .protection_pairs_rw0_000 = successful_vm_deallocate, + .protection_pairs_rw0_r00 = successful_vm_deallocate, + .protection_pairs_rw0_0w0 = successful_vm_deallocate, + .protection_pairs_rw0_rw0 = 
successful_vm_deallocate, + }; + + disable_vm_exc_guard(); + run_vm_tests("vm_deallocate_unguarded", __FILE__, &tests, argc, argv); +} /* T_DECL(vm_deallocate_unguarded) */ + + +T_DECL(vm_deallocate_guarded, + "run vm_deallocate with various vm configurations; EXC_GUARD enabled") +{ + if (isRosetta()) { + /* Rosetta doesn't deliver VM guard exceptions to the test's exception handler. */ + T_PASS("can't test VM guard exceptions on Rosetta"); + return; + } + + vm_tests_t tests = { + .single_entry_1 = successful_vm_deallocate_guarded, + .single_entry_2 = successful_vm_deallocate_guarded, + .single_entry_3 = successful_vm_deallocate_guarded, + .single_entry_4 = successful_vm_deallocate_guarded, + + .multiple_entries_1 = successful_vm_deallocate_guarded, + .multiple_entries_2 = successful_vm_deallocate_guarded, + .multiple_entries_3 = successful_vm_deallocate_guarded, + .multiple_entries_4 = successful_vm_deallocate_guarded, + .multiple_entries_5 = successful_vm_deallocate_guarded, + .multiple_entries_6 = successful_vm_deallocate_guarded, + + .some_holes_1 = vm_deallocate_holes_guarded, + .some_holes_2 = vm_deallocate_holes_guarded, + .some_holes_3 = vm_deallocate_holes_guarded, + .some_holes_4 = vm_deallocate_holes_guarded, + .some_holes_5 = vm_deallocate_holes_guarded, + .some_holes_6 = vm_deallocate_holes_guarded, + .some_holes_7 = vm_deallocate_holes_guarded, + .some_holes_8 = vm_deallocate_holes_guarded, + .some_holes_9 = vm_deallocate_holes_guarded, + .some_holes_10 = vm_deallocate_holes_guarded, + .some_holes_11 = vm_deallocate_holes_guarded, + .some_holes_12 = vm_deallocate_holes_guarded, + + .all_holes_1 = vm_deallocate_holes_guarded, + .all_holes_2 = vm_deallocate_holes_guarded, + .all_holes_3 = vm_deallocate_holes_guarded, + .all_holes_4 = vm_deallocate_holes_guarded, + + .null_entry = successful_vm_deallocate_guarded, + .nonresident_entry = successful_vm_deallocate_guarded, + .resident_entry = successful_vm_deallocate_guarded, + + .shared_entry = successful_vm_deallocate_guarded, + .shared_entry_discontiguous = successful_vm_deallocate_guarded, + .shared_entry_partial = successful_vm_deallocate_guarded, + .shared_entry_pairs = successful_vm_deallocate_guarded, + .shared_entry_x1000 = successful_vm_deallocate_guarded, + + .cow_entry = successful_vm_deallocate_guarded, + .cow_unreferenced = successful_vm_deallocate_guarded, + .cow_nocow = successful_vm_deallocate_guarded, + .nocow_cow = successful_vm_deallocate_guarded, + .cow_unreadable = successful_vm_deallocate_guarded, + .cow_unwriteable = successful_vm_deallocate_guarded, + + .permanent_entry = vm_deallocate_permanent_entry_guarded, + .permanent_before_permanent = vm_deallocate_permanent_before_permanent_guarded, + .permanent_before_allocation = vm_deallocate_permanent_before_allocation_guarded, + .permanent_before_allocation_2 = vm_deallocate_permanent_before_allocation_guarded, + .permanent_before_hole = vm_deallocate_permanent_before_hole_guarded, + .permanent_after_allocation = vm_deallocate_permanent_after_allocation_guarded, + .permanent_after_hole = vm_deallocate_permanent_after_hole_guarded, + + .single_submap_single_entry = successful_vm_deallocate_guarded, + .single_submap_single_entry_first_pages = successful_vm_deallocate_guarded, + .single_submap_single_entry_last_pages = successful_vm_deallocate_guarded, + .single_submap_single_entry_middle_pages = successful_vm_deallocate_guarded, + .single_submap_oversize_entry_at_start = successful_vm_deallocate_guarded, + .single_submap_oversize_entry_at_end = 
successful_vm_deallocate_guarded, + .single_submap_oversize_entry_at_both = successful_vm_deallocate_guarded, + + .submap_before_allocation = successful_vm_deallocate_guarded, + .submap_after_allocation = successful_vm_deallocate_guarded, + .submap_before_hole = vm_deallocate_holes_guarded, + .submap_after_hole = vm_deallocate_holes_guarded, + .submap_allocation_submap_one_entry = successful_vm_deallocate_guarded, + .submap_allocation_submap_two_entries = successful_vm_deallocate_guarded, + .submap_allocation_submap_three_entries = successful_vm_deallocate_guarded, + + .submap_before_allocation_ro = successful_vm_deallocate_guarded, + .submap_after_allocation_ro = successful_vm_deallocate_guarded, + .submap_before_hole_ro = vm_deallocate_holes_guarded, + .submap_after_hole_ro = vm_deallocate_holes_guarded, + .submap_allocation_submap_one_entry_ro = successful_vm_deallocate_guarded, + .submap_allocation_submap_two_entries_ro = successful_vm_deallocate_guarded, + .submap_allocation_submap_three_entries_ro = successful_vm_deallocate_guarded, + + .protection_single_000_000 = successful_vm_deallocate_guarded, + .protection_single_000_r00 = successful_vm_deallocate_guarded, + .protection_single_000_0w0 = successful_vm_deallocate_guarded, + .protection_single_000_rw0 = successful_vm_deallocate_guarded, + .protection_single_r00_r00 = successful_vm_deallocate_guarded, + .protection_single_r00_rw0 = successful_vm_deallocate_guarded, + .protection_single_0w0_0w0 = successful_vm_deallocate_guarded, + .protection_single_0w0_rw0 = successful_vm_deallocate_guarded, + .protection_single_rw0_rw0 = successful_vm_deallocate_guarded, + + .protection_pairs_000_000 = successful_vm_deallocate_guarded, + .protection_pairs_000_r00 = successful_vm_deallocate_guarded, + .protection_pairs_000_0w0 = successful_vm_deallocate_guarded, + .protection_pairs_000_rw0 = successful_vm_deallocate_guarded, + .protection_pairs_r00_000 = successful_vm_deallocate_guarded, + .protection_pairs_r00_r00 = successful_vm_deallocate_guarded, + .protection_pairs_r00_0w0 = successful_vm_deallocate_guarded, + .protection_pairs_r00_rw0 = successful_vm_deallocate_guarded, + .protection_pairs_0w0_000 = successful_vm_deallocate_guarded, + .protection_pairs_0w0_r00 = successful_vm_deallocate_guarded, + .protection_pairs_0w0_0w0 = successful_vm_deallocate_guarded, + .protection_pairs_0w0_rw0 = successful_vm_deallocate_guarded, + .protection_pairs_rw0_000 = successful_vm_deallocate_guarded, + .protection_pairs_rw0_r00 = successful_vm_deallocate_guarded, + .protection_pairs_rw0_0w0 = successful_vm_deallocate_guarded, + .protection_pairs_rw0_rw0 = successful_vm_deallocate_guarded, + }; + + enable_non_fatal_vm_exc_guard(); + run_vm_tests("vm_deallocate_guarded", __FILE__, &tests, argc, argv); +} /* T_DECL(vm_deallocate_guarded) */ diff --git a/tests/vm/configurator_vm_inherit.c b/tests/vm/configurator_vm_inherit.c new file mode 100644 index 000000000..3167bf11d --- /dev/null +++ b/tests/vm/configurator_vm_inherit.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_vm_inherit.c + * + * Test vm_inherit with many different VM states. + */ + +#include "configurator/vm_configurator_tests.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + + +/* + * Update the checker state to mirror a vm_inherit call. + */ +static void +checker_perform_vm_inherit( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + vm_inherit_t inheritance) +{ + /* vm_inherit allows unallocated holes */ + entry_checker_range_t limit = + checker_list_find_range_including_holes(checker_list, start, size); + if (limit.head->kind != Hole) { + checker_clip_left(checker_list, limit.head, start); + } + if (limit.tail->kind != Hole) { + checker_clip_right(checker_list, limit.tail, start + size); + } + + FOREACH_CHECKER(checker, limit) { + if (checker->kind == Allocation) { + checker->inheritance = inheritance; + } + } +} + + +/* + * Perform and check a call to vm_inherit that is expected to succeed. 
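+ *
+ * The checkers verify the new inheritance through the configurator's own
+ * bookkeeping. Outside this harness, a hand-rolled check might read the
+ * value back with mach_vm_region() (a sketch, not part of the original
+ * test; start and size stand for the range under test, everything else is
+ * standard Mach API):
+ *
+ *     kern_return_t kr = mach_vm_inherit(mach_task_self(), start, size,
+ *         VM_INHERIT_SHARE);
+ *     assert(kr == KERN_SUCCESS);
+ *     mach_vm_address_t raddr = start;
+ *     mach_vm_size_t rsize = 0;
+ *     vm_region_basic_info_data_64_t info;
+ *     mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
+ *     mach_port_t object_name = MACH_PORT_NULL;
+ *     kr = mach_vm_region(mach_task_self(), &raddr, &rsize,
+ *         VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info,
+ *         &count, &object_name);
+ *     assert(kr == KERN_SUCCESS && info.inheritance == VM_INHERIT_SHARE);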
+ */ +static test_result_t +successful_vm_inherit( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + vm_inherit_t inherit = VM_INHERIT_SHARE; + + checker_perform_vm_inherit(checker_list, start, size, inherit); + kr = mach_vm_inherit(mach_task_self(), start, size, inherit); + if (kr != 0) { + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_inherit"); +} + + +T_DECL(vm_inherit, + "run vm_inherit with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = successful_vm_inherit, + .single_entry_2 = successful_vm_inherit, + .single_entry_3 = successful_vm_inherit, + .single_entry_4 = successful_vm_inherit, + + .multiple_entries_1 = successful_vm_inherit, + .multiple_entries_2 = successful_vm_inherit, + .multiple_entries_3 = successful_vm_inherit, + .multiple_entries_4 = successful_vm_inherit, + .multiple_entries_5 = successful_vm_inherit, + .multiple_entries_6 = successful_vm_inherit, + + .some_holes_1 = successful_vm_inherit, + .some_holes_2 = successful_vm_inherit, + .some_holes_3 = successful_vm_inherit, + .some_holes_4 = successful_vm_inherit, + .some_holes_5 = successful_vm_inherit, + .some_holes_6 = successful_vm_inherit, + .some_holes_7 = successful_vm_inherit, + .some_holes_8 = successful_vm_inherit, + .some_holes_9 = successful_vm_inherit, + .some_holes_10 = successful_vm_inherit, + .some_holes_11 = successful_vm_inherit, + .some_holes_12 = successful_vm_inherit, + + .all_holes_1 = successful_vm_inherit, + .all_holes_2 = successful_vm_inherit, + .all_holes_3 = successful_vm_inherit, + .all_holes_4 = successful_vm_inherit, + + .null_entry = successful_vm_inherit, + .nonresident_entry = successful_vm_inherit, + .resident_entry = successful_vm_inherit, + + .shared_entry = successful_vm_inherit, + .shared_entry_discontiguous = successful_vm_inherit, + .shared_entry_partial = successful_vm_inherit, + .shared_entry_pairs = successful_vm_inherit, + .shared_entry_x1000 = successful_vm_inherit, + + .cow_entry = successful_vm_inherit, + .cow_unreferenced = successful_vm_inherit, + .cow_nocow = successful_vm_inherit, + .nocow_cow = successful_vm_inherit, + .cow_unreadable = successful_vm_inherit, + .cow_unwriteable = successful_vm_inherit, + + .permanent_entry = successful_vm_inherit, + .permanent_before_permanent = successful_vm_inherit, + .permanent_before_allocation = successful_vm_inherit, + .permanent_before_allocation_2 = successful_vm_inherit, + .permanent_before_hole = successful_vm_inherit, + .permanent_after_allocation = successful_vm_inherit, + .permanent_after_hole = successful_vm_inherit, + + .single_submap_single_entry = successful_vm_inherit, + .single_submap_single_entry_first_pages = successful_vm_inherit, + .single_submap_single_entry_last_pages = successful_vm_inherit, + .single_submap_single_entry_middle_pages = successful_vm_inherit, + .single_submap_oversize_entry_at_start = successful_vm_inherit, + .single_submap_oversize_entry_at_end = successful_vm_inherit, + .single_submap_oversize_entry_at_both = successful_vm_inherit, + + .submap_before_allocation = successful_vm_inherit, + .submap_after_allocation = successful_vm_inherit, + .submap_before_hole = successful_vm_inherit, + .submap_after_hole = successful_vm_inherit, + .submap_allocation_submap_one_entry = successful_vm_inherit, + .submap_allocation_submap_two_entries = successful_vm_inherit, + .submap_allocation_submap_three_entries = successful_vm_inherit, + + .submap_before_allocation_ro = successful_vm_inherit, + 
.submap_after_allocation_ro = successful_vm_inherit, + .submap_before_hole_ro = successful_vm_inherit, + .submap_after_hole_ro = successful_vm_inherit, + .submap_allocation_submap_one_entry_ro = successful_vm_inherit, + .submap_allocation_submap_two_entries_ro = successful_vm_inherit, + .submap_allocation_submap_three_entries_ro = successful_vm_inherit, + + .protection_single_000_000 = successful_vm_inherit, + .protection_single_000_r00 = successful_vm_inherit, + .protection_single_000_0w0 = successful_vm_inherit, + .protection_single_000_rw0 = successful_vm_inherit, + .protection_single_r00_r00 = successful_vm_inherit, + .protection_single_r00_rw0 = successful_vm_inherit, + .protection_single_0w0_0w0 = successful_vm_inherit, + .protection_single_0w0_rw0 = successful_vm_inherit, + .protection_single_rw0_rw0 = successful_vm_inherit, + + .protection_pairs_000_000 = successful_vm_inherit, + .protection_pairs_000_r00 = successful_vm_inherit, + .protection_pairs_000_0w0 = successful_vm_inherit, + .protection_pairs_000_rw0 = successful_vm_inherit, + .protection_pairs_r00_000 = successful_vm_inherit, + .protection_pairs_r00_r00 = successful_vm_inherit, + .protection_pairs_r00_0w0 = successful_vm_inherit, + .protection_pairs_r00_rw0 = successful_vm_inherit, + .protection_pairs_0w0_000 = successful_vm_inherit, + .protection_pairs_0w0_r00 = successful_vm_inherit, + .protection_pairs_0w0_0w0 = successful_vm_inherit, + .protection_pairs_0w0_rw0 = successful_vm_inherit, + .protection_pairs_rw0_000 = successful_vm_inherit, + .protection_pairs_rw0_r00 = successful_vm_inherit, + .protection_pairs_rw0_0w0 = successful_vm_inherit, + .protection_pairs_rw0_rw0 = successful_vm_inherit, + }; + + run_vm_tests("vm_inherit", __FILE__, &tests, argc, argv); +} diff --git a/tests/vm/configurator_vm_protect.c b/tests/vm/configurator_vm_protect.c new file mode 100644 index 000000000..2b60c5457 --- /dev/null +++ b/tests/vm/configurator_vm_protect.c @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_vm_protect.c + * + * Test vm_protect with many different VM states. 
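+ *
+ * One behavior exercised repeatedly below: when the target range contains
+ * an unallocated hole, mach_vm_protect() fails up front with
+ * KERN_INVALID_ADDRESS and changes nothing. A minimal sketch (not part of
+ * the original tests, assuming <mach/mach_vm.h> and <assert.h>):
+ *
+ *     mach_vm_address_t addr = 0;
+ *     kern_return_t kr = mach_vm_allocate(mach_task_self(), &addr,
+ *         3 * vm_page_size, VM_FLAGS_ANYWHERE);
+ *     assert(kr == KERN_SUCCESS);
+ *     // punch a hole in the middle page
+ *     kr = mach_vm_deallocate(mach_task_self(), addr + vm_page_size,
+ *         vm_page_size);
+ *     assert(kr == KERN_SUCCESS);
+ *     kr = mach_vm_protect(mach_task_self(), addr, 3 * vm_page_size,
+ *         FALSE, VM_PROT_READ);
+ *     assert(kr == KERN_INVALID_ADDRESS);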
+ */ + +#include "configurator/vm_configurator_tests.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ASROOT(true), /* required for vm submap sysctls */ + T_META_ALL_VALID_ARCHS(true) + ); + + +/* + * Update checker state to mirror a successful call to vm_protect. + */ +static void +checker_perform_vm_protect( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + bool set_max, + vm_prot_t prot) +{ + entry_checker_range_t limit = + checker_list_find_and_clip(checker_list, start, size); + FOREACH_CHECKER(checker, limit) { + if (set_max) { + checker->max_protection = prot; + checker->protection &= checker->max_protection; + } else { + checker->protection = prot; + } + } + checker_list_simplify(checker_list, start, size); +} + +/* + * Perform and check a call to mach_vm_protect that is expected to succeed. + */ +static test_result_t +vm_protect_successfully( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + vm_prot_t prot) +{ + kern_return_t kr; + + bool set_max = false; + + checker_perform_vm_protect(checker_list, start, size, set_max, prot); + kr = mach_vm_protect(mach_task_self(), start, size, set_max, prot); + if (kr != 0) { + T_FAIL("mach_vm_protect(%s) failed (%s)", + name_for_prot(prot), name_for_kr(kr)); + return TestFailed; + } + + TEMP_CSTRING(name, "after vm_protect(%s)", name_for_prot(prot)); + return verify_vm_state(checker_list, name); +} + +/* + * Perform and check mach_vm_protect that is expected to fail due to holes. + */ +static test_result_t +vm_protect_with_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + /* + * No checker updates here. vm_map_protect preflights its checks, + * so it fails with no side effects when the address range has holes. + */ + + kr = mach_vm_protect(mach_task_self(), start, size, false, VM_PROT_READ); + if (kr != KERN_INVALID_ADDRESS) { + T_FAIL("mach_vm_protect(holes) expected %s, got %s\n", + name_for_kr(KERN_INVALID_ADDRESS), name_for_kr(kr)); + return TestFailed; + } + + return verify_vm_state(checker_list, "after vm_protect"); +} + +/* + * Perform and check mach_vm_protect that is expected to fail because + * the requested protections are more permissive than max_protection. + */ +static test_result_t +vm_protect_beyond_max_prot( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + vm_prot_t prot) +{ + kern_return_t kr; + + /* + * No checker updates here. vm_map_protect preflights its checks, + * so it fails with no effect. 
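+ *
+ * A standalone sketch of the same failure, not part of this test (here
+ * max_protection is lowered by hand rather than by the configurator):
+ *
+ *     kr = mach_vm_protect(mach_task_self(), start, size,
+ *         TRUE, VM_PROT_READ);                // TRUE: set maximum
+ *     assert(kr == KERN_SUCCESS);
+ *     kr = mach_vm_protect(mach_task_self(), start, size,
+ *         FALSE, VM_PROT_READ | VM_PROT_WRITE);
+ *     assert(kr == KERN_PROTECTION_FAILURE);  // beyond the new maximum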
+ */ + + kr = mach_vm_protect(mach_task_self(), start, size, false /*set max*/, prot); + if (kr != KERN_PROTECTION_FAILURE) { + T_FAIL("mach_vm_protect(%s which is beyond max) expected %s, got %s\n", + name_for_prot(prot), + name_for_kr(KERN_PROTECTION_FAILURE), name_for_kr(kr)); + return TestFailed; + } + + TEMP_CSTRING(name, "after vm_protect(%s)", name_for_prot(prot)); + return verify_vm_state(checker_list, name); +} + + +/* + * Perform multiple successful and unsuccessful vm_protect operations + * on a region whose max_protections are VM_PROT_NONE + */ +static test_result_t +vm_protect_max_000( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + test_result_t results[4]; + + results[0] = vm_protect_successfully(checker_list, start, size, VM_PROT_NONE); + results[1] = vm_protect_beyond_max_prot(checker_list, start, size, VM_PROT_READ); + results[2] = vm_protect_beyond_max_prot(checker_list, start, size, VM_PROT_WRITE); + results[3] = vm_protect_beyond_max_prot(checker_list, start, size, VM_PROT_READ | VM_PROT_WRITE); + + return worst_result(results, countof(results)); +} + +/* + * Perform multiple successful and unsuccessful vm_protect operations + * on a region whose max_protections are VM_PROT_READ + */ +static test_result_t +vm_protect_max_r00( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + test_result_t results[4]; + + results[0] = vm_protect_successfully(checker_list, start, size, VM_PROT_NONE); + results[1] = vm_protect_successfully(checker_list, start, size, VM_PROT_READ); + results[2] = vm_protect_beyond_max_prot(checker_list, start, size, VM_PROT_WRITE); + results[3] = vm_protect_beyond_max_prot(checker_list, start, size, VM_PROT_READ | VM_PROT_WRITE); + + return worst_result(results, countof(results)); +} + +/* + * Perform multiple successful and unsuccessful vm_protect operations + * on a region whose max_protections are VM_PROT_WRITE + */ +static test_result_t +vm_protect_max_0w0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + test_result_t results[4]; + + results[0] = vm_protect_successfully(checker_list, start, size, VM_PROT_NONE); + results[1] = vm_protect_beyond_max_prot(checker_list, start, size, VM_PROT_READ); + results[2] = vm_protect_successfully(checker_list, start, size, VM_PROT_WRITE); + results[3] = vm_protect_beyond_max_prot(checker_list, start, size, VM_PROT_READ | VM_PROT_WRITE); + + return worst_result(results, countof(results)); +} + + +/* + * Perform multiple successful and unsuccessful vm_protect operations + * on a region whose max_protections are VM_PROT_READ | VM_PROT_WRITE + */ +static test_result_t +vm_protect_max_rw0( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + test_result_t results[4]; + + results[0] = vm_protect_successfully(checker_list, start, size, VM_PROT_NONE); + results[1] = vm_protect_successfully(checker_list, start, size, VM_PROT_READ); + results[2] = vm_protect_successfully(checker_list, start, size, VM_PROT_WRITE); + results[3] = vm_protect_successfully(checker_list, start, size, VM_PROT_READ | VM_PROT_WRITE); + + return worst_result(results, countof(results)); +} + +#if __x86_64__ +/* + * Perform multiple successful and unsuccessful vm_protect operations + * on a region whose max_protections are VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXEC + */ +static test_result_t +vm_protect_max_rwx( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* TODO 
VM_PROT_EXEC */ + return vm_protect_max_rw0(checker_list, start, size); +} +#endif /* __x86_64__ */ + +/* + * Perform multiple successful and unsuccessful vm_protect operations + * on a region whose max_protections are VM_PROT_READ + * OR whose max protections are READ|WRITE|EXEC due to Intel submap unnesting. + */ +static test_result_t +vm_protect_max_r00_or_unnested_submap( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ +#if __x86_64__ + return vm_protect_max_rwx(checker_list, start, size); +#else /* not __x86_64__ */ + return vm_protect_max_r00(checker_list, start, size); +#endif /* not __x86_64__ */ +} + +T_DECL(vm_protect, + "run vm_protect with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = vm_protect_max_rw0, + .single_entry_2 = vm_protect_max_rw0, + .single_entry_3 = vm_protect_max_rw0, + .single_entry_4 = vm_protect_max_rw0, + + .multiple_entries_1 = vm_protect_max_rw0, + .multiple_entries_2 = vm_protect_max_rw0, + .multiple_entries_3 = vm_protect_max_rw0, + .multiple_entries_4 = vm_protect_max_rw0, + .multiple_entries_5 = vm_protect_max_rw0, + .multiple_entries_6 = vm_protect_max_rw0, + + .some_holes_1 = vm_protect_with_holes, + .some_holes_2 = vm_protect_with_holes, + .some_holes_3 = vm_protect_with_holes, + .some_holes_4 = vm_protect_with_holes, + .some_holes_5 = vm_protect_with_holes, + .some_holes_6 = vm_protect_with_holes, + .some_holes_7 = vm_protect_with_holes, + .some_holes_8 = vm_protect_with_holes, + .some_holes_9 = vm_protect_with_holes, + .some_holes_10 = vm_protect_with_holes, + .some_holes_11 = vm_protect_with_holes, + .some_holes_12 = vm_protect_with_holes, + + .all_holes_1 = vm_protect_with_holes, + .all_holes_2 = vm_protect_with_holes, + .all_holes_3 = vm_protect_with_holes, + .all_holes_4 = vm_protect_with_holes, + + .null_entry = vm_protect_max_rw0, + .nonresident_entry = vm_protect_max_rw0, + .resident_entry = vm_protect_max_rw0, + + .shared_entry = vm_protect_max_rw0, + .shared_entry_discontiguous = vm_protect_max_rw0, + .shared_entry_partial = vm_protect_max_rw0, + .shared_entry_pairs = vm_protect_max_rw0, + .shared_entry_x1000 = vm_protect_max_rw0, + + .cow_entry = vm_protect_max_rw0, + .cow_unreferenced = vm_protect_max_rw0, + .cow_nocow = vm_protect_max_rw0, + .nocow_cow = vm_protect_max_rw0, + .cow_unreadable = vm_protect_max_rw0, + .cow_unwriteable = vm_protect_max_rw0, + + .permanent_entry = vm_protect_max_rw0, + .permanent_before_permanent = vm_protect_max_rw0, + .permanent_before_allocation = vm_protect_max_rw0, + .permanent_before_allocation_2 = vm_protect_max_rw0, + .permanent_before_hole = vm_protect_with_holes, + .permanent_after_allocation = vm_protect_max_rw0, + .permanent_after_hole = vm_protect_with_holes, + + /* + * vm_protect without VM_PROT_COPY does not descend into submaps. + * The parent map's submap entry is r--/r--. 
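+ * On x86_64 the submap entry may already be unnested with rwx max
+ * protection, so these cases use vm_protect_max_r00_or_unnested_submap
+ * to accept either outcome.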
+ */ + .single_submap_single_entry = vm_protect_max_r00_or_unnested_submap, + .single_submap_single_entry_first_pages = vm_protect_max_r00_or_unnested_submap, + .single_submap_single_entry_last_pages = vm_protect_max_r00_or_unnested_submap, + .single_submap_single_entry_middle_pages = vm_protect_max_r00_or_unnested_submap, + .single_submap_oversize_entry_at_start = vm_protect_max_r00_or_unnested_submap, + .single_submap_oversize_entry_at_end = vm_protect_max_r00_or_unnested_submap, + .single_submap_oversize_entry_at_both = vm_protect_max_r00_or_unnested_submap, + + .submap_before_allocation = vm_protect_max_r00_or_unnested_submap, + .submap_after_allocation = vm_protect_max_r00_or_unnested_submap, + .submap_before_hole = vm_protect_with_holes, + .submap_after_hole = vm_protect_with_holes, + .submap_allocation_submap_one_entry = vm_protect_max_r00_or_unnested_submap, + .submap_allocation_submap_two_entries = vm_protect_max_r00_or_unnested_submap, + .submap_allocation_submap_three_entries = vm_protect_max_r00_or_unnested_submap, + + .submap_before_allocation_ro = vm_protect_max_r00_or_unnested_submap, + .submap_after_allocation_ro = vm_protect_max_r00_or_unnested_submap, + .submap_before_hole_ro = vm_protect_with_holes, + .submap_after_hole_ro = vm_protect_with_holes, + .submap_allocation_submap_one_entry_ro = vm_protect_max_r00_or_unnested_submap, + .submap_allocation_submap_two_entries_ro = vm_protect_max_r00_or_unnested_submap, + .submap_allocation_submap_three_entries_ro = vm_protect_max_r00_or_unnested_submap, + + .protection_single_000_000 = vm_protect_max_000, + .protection_single_000_r00 = vm_protect_max_r00, + .protection_single_r00_r00 = vm_protect_max_r00, + .protection_single_000_0w0 = vm_protect_max_0w0, + .protection_single_0w0_0w0 = vm_protect_max_0w0, + .protection_single_000_rw0 = vm_protect_max_rw0, + .protection_single_r00_rw0 = vm_protect_max_rw0, + .protection_single_0w0_rw0 = vm_protect_max_rw0, + .protection_single_rw0_rw0 = vm_protect_max_rw0, + + .protection_pairs_000_000 = vm_protect_max_rw0, + .protection_pairs_000_r00 = vm_protect_max_rw0, + .protection_pairs_000_0w0 = vm_protect_max_rw0, + .protection_pairs_000_rw0 = vm_protect_max_rw0, + .protection_pairs_r00_000 = vm_protect_max_rw0, + .protection_pairs_r00_r00 = vm_protect_max_rw0, + .protection_pairs_r00_0w0 = vm_protect_max_rw0, + .protection_pairs_r00_rw0 = vm_protect_max_rw0, + .protection_pairs_0w0_000 = vm_protect_max_rw0, + .protection_pairs_0w0_r00 = vm_protect_max_rw0, + .protection_pairs_0w0_0w0 = vm_protect_max_rw0, + .protection_pairs_0w0_rw0 = vm_protect_max_rw0, + .protection_pairs_rw0_000 = vm_protect_max_rw0, + .protection_pairs_rw0_r00 = vm_protect_max_rw0, + .protection_pairs_rw0_0w0 = vm_protect_max_rw0, + .protection_pairs_rw0_rw0 = vm_protect_max_rw0, + }; + + run_vm_tests("vm_protect", __FILE__, &tests, argc, argv); +} diff --git a/tests/vm/configurator_vm_wire.c b/tests/vm/configurator_vm_wire.c new file mode 100644 index 000000000..319e78681 --- /dev/null +++ b/tests/vm/configurator_vm_wire.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * vm/configurator_vm_wire.c + * + * Test vm_wire with many different VM states. + */ + +#include "configurator/vm_configurator_tests.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.configurator"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_RUN_CONCURRENTLY(true), + T_META_ALL_VALID_ARCHS(true), + T_META_ASROOT(true) /* root required for vm_wire on macOS */ + ); + +/* + * Update checker state to mirror a successful call to + * vm_wire(PROT_NONE) a.k.a. unwire + */ +static void +checker_perform_vm_unwire( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + entry_checker_range_t limit = checker_list_find_and_clip(checker_list, start, size); + FOREACH_CHECKER(checker, limit) { + assert(checker->user_wired_count > 0); + checker->user_wired_count--; + } + checker_list_simplify(checker_list, start, size); +} + + +/* + * Update checker state to mirror a successful call to + * vm_wire(PROT_NONE) a.k.a. unwire + * of a range that includes holes. + */ +static kern_return_t +checker_perform_vm_unwire_with_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + entry_checker_range_t limit = checker_list_find_range_including_holes(checker_list, start, size); + + if (limit.head && limit.head->kind == Allocation && + checker_contains_address(limit.head, start)) { + /* range begins with an allocation - proceed normally */ + } else { + /* range begins with a hole - do nothing, not even simplify */ + return KERN_INVALID_ADDRESS; + } + + FOREACH_CHECKER(checker, limit) { + if (checker->kind == Allocation) { + assert(checker->user_wired_count > 0); + checker->user_wired_count--; + } + } + + checker_list_simplify(checker_list, start, size); + return KERN_SUCCESS; +} + + +/* + * Update checker state to mirrow a successful call to vm_wire. + */ +static void +checker_perform_vm_wire( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + vm_prot_t wire_prot) +{ + assert(wire_prot != VM_PROT_NONE); + + entry_checker_range_t limit; + + /* + * Resolve null objects. + * vm_wire does this before clipping + */ + limit = checker_list_find_range_including_holes(checker_list, start, size); + FOREACH_CHECKER(checker, limit) { + checker_resolve_null_vm_object(checker_list, checker); + } + + /* + * Perform clipping. 
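+ * Only the boundary entries need clipping: the first checker is
+ * clipped at start and the last at start + size.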
+ */ + limit = checker_list_find_range(checker_list, start, size); + checker_clip_left(checker_list, limit.head, start); + checker_clip_right(checker_list, limit.tail, start + size); + + /* + * Fault and wire. + */ + + FOREACH_CHECKER(checker, limit) { + checker->user_wired_count++; + checker_fault_for_prot_not_cow(checker_list, checker, wire_prot); + } + checker_list_simplify(checker_list, start, size); +} + + +static void +checker_perform_failed_vm_wire( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size, + vm_prot_t wire_prot) +{ + assert(wire_prot != VM_PROT_NONE); + + /* + * failed vm_wire clips entries and resolves null vm_objects + * one at a time until the entry that it couldn't change + * + * failed vm_wire doesn't simplify clipped entries on exit + * + * failed vm_wire is inconsistent about resident page counts + */ + + entry_checker_range_t limit = + checker_list_find_range_including_holes(checker_list, start, size); + FOREACH_CHECKER(checker, limit) { + if (checker->kind != Allocation) { + /* stop at holes */ + break; + } + + /* wire of executable entry fails early */ + if (prot_contains_all(checker->protection, VM_PROT_EXECUTE)) { + // (fixme jit, tpro) + break; + } + + /* null vm_objects are resolved before clipping */ + checker_resolve_null_vm_object(checker_list, checker); + + if (checker == limit.head) { + checker_clip_left(checker_list, checker, start); + } + if (checker == limit.tail) { + checker_clip_right(checker_list, checker, start + size); + } + + if (!prot_contains_all(checker->protection, wire_prot)) { + /* stop at protection failures */ + break; + } + + if (checker != limit.tail && checker->next->kind != Allocation) { + /* stop if the *next* entry is in range and is an illegal hole */ + break; + } + + /* + * failed vm_wire simplifies and faults in, + * except for the cases already short-circuited above + */ + checker_fault_for_prot_not_cow(checker_list, checker, wire_prot); + checker_simplify_left(checker_list, checker); + } +} + + +static test_result_t +successful_vm_wire_read_not_cow( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + checker_perform_vm_wire(checker_list, start, size, VM_PROT_READ); + kr = mach_vm_wire(host_priv(), mach_task_self(), start, size, VM_PROT_READ); + if (kr) { + T_FAIL("mach_vm_wire failed (%s)", name_for_kr(kr)); + return TestFailed; + } + + if (verify_vm_state(checker_list, "after vm_wire") != TestSucceeded) { + return TestFailed; + } + + checker_perform_vm_unwire(checker_list, start, size); + kr = mach_vm_wire(host_priv(), mach_task_self(), start, size, VM_PROT_NONE); + if (kr) { + T_FAIL("mach_vm_wire(unwire) failed (%s)", name_for_kr(kr)); + return TestFailed; + } + + if (verify_vm_state(checker_list, "after vm_unwire") != TestSucceeded) { + return TestFailed; + } + + return TestSucceeded; +} + +static test_result_t +failed_vm_wire_read_not_cow( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr; + + checker_perform_failed_vm_wire(checker_list, start, size, VM_PROT_READ); + kr = mach_vm_wire(host_priv(), mach_task_self(), start, size, VM_PROT_READ); + if (kr == KERN_SUCCESS) { + T_FAIL("mach_vm_wire unexpectedly succeeded"); + return TestFailed; + } + + return verify_vm_state(checker_list, "after unsuccessful vm_wire"); +} + +static test_result_t +wire_shared_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* two entries each sharing the same 
object */ + vm_entry_checker_t *right_checker = checker_list_nth(checker_list, 1); + + kern_return_t kr; + + /* + * Wire the left entry. The right entry also faults in but + * stays at wire count zero. + */ + checker_perform_vm_wire(checker_list, start, size, VM_PROT_READ); + checker_fault_for_prot_not_cow(checker_list, right_checker, VM_PROT_READ); + kr = mach_vm_wire(host_priv(), mach_task_self(), start, size, VM_PROT_READ); + assert(kr == 0); + + return verify_vm_state(checker_list, "after vm_wire shared"); +} + +static test_result_t +wire_shared_entry_discontiguous( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * two entries each sharing the same object + * but only partially overlap inside that object. + * Wiring the left entry does not affect the right entry, + * so this looks like an ordinary vm_wire test. + */ + return successful_vm_wire_read_not_cow(checker_list, start, size); +} + +static test_result_t +wire_shared_entry_partial( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* + * two entries each sharing the same object + * but only partially overlap inside that object + */ + vm_entry_checker_t *right_checker = checker_list_nth(checker_list, 1); + mach_vm_address_t right_offset = DEFAULT_PARTIAL_ENTRY_SIZE; + + kern_return_t kr; + + /* + * Wire the left entry. The right entry stays at wire count zero + * and only the overlapping section faults in. + */ + checker_perform_vm_wire(checker_list, start, size, VM_PROT_READ); + right_checker->pages_resident = (uint32_t)((size - right_offset) / PAGE_SIZE); + kr = mach_vm_wire(host_priv(), mach_task_self(), start, size, VM_PROT_READ); + assert(kr == 0); + + return verify_vm_state(checker_list, "after vm_wire shared partial"); +} + +static void +checker_make_cow_private( + checker_list_t *checker_list, + vm_entry_checker_t *checker) +{ + if (checker->object->self_ref_count == 1) { + /* + * COW but not shared with anything else. + * VM resolves COW by using the same object. + */ + checker->needs_copy = false; + return; + } + + /* make new object */ + vm_object_checker_t *obj_checker = object_checker_clone(checker->object); + checker_list_append_object(checker_list, obj_checker); + + /* change object and entry to private */ + checker->needs_copy = false; + + /* set new object (decreasing previous object's self_ref_count) */ + checker_set_object(checker, obj_checker); +} + +static test_result_t +wire_cow_entry( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + /* Wiring a COW entry resolves COW but has no effect on other copies. 
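+ * checker_make_cow_private() below models that resolution: the entry
+ * keeps its object if it is no longer shared, otherwise it gets its own
+ * cloned object, and needs_copy is cleared either way.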
*/ + + vm_entry_checker_t *left_checker = checker_list_nth(checker_list, 0); + checker_make_cow_private(checker_list, left_checker); + + return successful_vm_wire_read_not_cow(checker_list, start, size); +} + +static test_result_t +wire_cow_nocow( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *left_checker = checker_list_nth(checker_list, 0); + checker_make_cow_private(checker_list, left_checker); + + return successful_vm_wire_read_not_cow(checker_list, start, size); +} + +static test_result_t +wire_nocow_cow( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *left_checker = checker_list_nth(checker_list, 0); + vm_entry_checker_t *right_checker = left_checker->next; + checker_make_cow_private(checker_list, right_checker); + + return successful_vm_wire_read_not_cow(checker_list, start, size); +} + +static test_result_t +wire_cow_unreadable( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + checker_make_shadow_object(checker_list, checker); + return failed_vm_wire_read_not_cow(checker_list, start, size); +} + +static test_result_t +wire_cow_unwriteable( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + vm_entry_checker_t *checker = checker_list_nth(checker_list, 0); + checker_make_cow_private(checker_list, checker); + + return successful_vm_wire_read_not_cow(checker_list, start, size); +} + + +/* + * Test vm_unwire with a range that includes holes. + * We wire each allocation separately, then unwire the entire range + * to test unwire's behavior across holes without reference to + * wire's behavior across holes. + */ +static test_result_t +vm_unwire_holes( + checker_list_t *checker_list, + mach_vm_address_t start, + mach_vm_size_t size) +{ + kern_return_t kr, expected_kr; + + /* + * Wire each allocation separately, + * then unwire the entire range at once. 
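+ * checker_perform_vm_unwire_with_holes() predicts the outcome: the
+ * unwire fails with KERN_INVALID_ADDRESS if the range begins with a
+ * hole, and otherwise succeeds and decrements user_wired_count on each
+ * allocation in the range.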
+ */ + + mach_vm_address_t end = start + size; + + entry_checker_range_t limit = + checker_list_find_range_including_holes(checker_list, start, size); + + FOREACH_CHECKER(checker, limit) { + if (checker->kind == Allocation) { + /* + * we manually "clip" our address range here + * because the real checker clipping must + * be done inside checker_perform_vm_wire() + * because wire's clip behavior is weird + */ + mach_vm_address_t clipped_address = max(start, checker->address); + mach_vm_address_t clipped_end = min(checker_end_address(checker), end); + mach_vm_size_t clipped_size = clipped_end - clipped_address; + kr = mach_vm_wire(host_priv(), mach_task_self(), + clipped_address, clipped_size, VM_PROT_READ); + assert(kr == 0); + checker_perform_vm_wire(checker_list, + clipped_address, clipped_size, VM_PROT_READ); + } + } + + if (verify_vm_state(checker_list, "before vm_unwire") != TestSucceeded) { + return TestFailed; + } + + expected_kr = checker_perform_vm_unwire_with_holes(checker_list, start, size); + kr = mach_vm_wire(host_priv(), mach_task_self(), start, size, VM_PROT_NONE); + if (kr != expected_kr) { + T_FAIL("mach_vm_wire(unwire) returned %d (%s), expected %d (%s)\n", + kr, name_for_kr(kr), expected_kr, name_for_kr(expected_kr)); + return TestFailed; + } + + if (verify_vm_state(checker_list, "after vm_unwire") != TestSucceeded) { + return TestFailed; + } + + return TestSucceeded; +} + +T_DECL(vm_wire, + "run vm_wire with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = successful_vm_wire_read_not_cow, + .single_entry_2 = successful_vm_wire_read_not_cow, + .single_entry_3 = successful_vm_wire_read_not_cow, + .single_entry_4 = successful_vm_wire_read_not_cow, + + .multiple_entries_1 = successful_vm_wire_read_not_cow, + .multiple_entries_2 = successful_vm_wire_read_not_cow, + .multiple_entries_3 = successful_vm_wire_read_not_cow, + .multiple_entries_4 = successful_vm_wire_read_not_cow, + .multiple_entries_5 = successful_vm_wire_read_not_cow, + .multiple_entries_6 = successful_vm_wire_read_not_cow, + + .some_holes_1 = failed_vm_wire_read_not_cow, + .some_holes_2 = failed_vm_wire_read_not_cow, + .some_holes_3 = failed_vm_wire_read_not_cow, + .some_holes_4 = failed_vm_wire_read_not_cow, + .some_holes_5 = failed_vm_wire_read_not_cow, + .some_holes_6 = failed_vm_wire_read_not_cow, + .some_holes_7 = failed_vm_wire_read_not_cow, + .some_holes_8 = failed_vm_wire_read_not_cow, + .some_holes_9 = failed_vm_wire_read_not_cow, + .some_holes_10 = failed_vm_wire_read_not_cow, + .some_holes_11 = failed_vm_wire_read_not_cow, + .some_holes_12 = failed_vm_wire_read_not_cow, + + .all_holes_1 = failed_vm_wire_read_not_cow, + .all_holes_2 = failed_vm_wire_read_not_cow, + .all_holes_3 = failed_vm_wire_read_not_cow, + .all_holes_4 = failed_vm_wire_read_not_cow, + + .null_entry = successful_vm_wire_read_not_cow, + .nonresident_entry = successful_vm_wire_read_not_cow, + .resident_entry = successful_vm_wire_read_not_cow, + + .shared_entry = wire_shared_entry, + .shared_entry_discontiguous = wire_shared_entry_discontiguous, + .shared_entry_partial = wire_shared_entry_partial, + .shared_entry_pairs = successful_vm_wire_read_not_cow, + .shared_entry_x1000 = successful_vm_wire_read_not_cow, + + .cow_entry = wire_cow_entry, + .cow_unreferenced = wire_cow_entry, + .cow_nocow = wire_cow_nocow, + .nocow_cow = wire_nocow_cow, + .cow_unreadable = wire_cow_unreadable, + .cow_unwriteable = wire_cow_unwriteable, + + .permanent_entry = successful_vm_wire_read_not_cow, + .permanent_before_permanent = 
successful_vm_wire_read_not_cow, + .permanent_before_allocation = successful_vm_wire_read_not_cow, + .permanent_before_allocation_2 = successful_vm_wire_read_not_cow, + .permanent_before_hole = failed_vm_wire_read_not_cow, + .permanent_after_allocation = successful_vm_wire_read_not_cow, + .permanent_after_hole = failed_vm_wire_read_not_cow, + + /* TODO: wire vs submaps */ + .single_submap_single_entry = test_is_unimplemented, + .single_submap_single_entry_first_pages = test_is_unimplemented, + .single_submap_single_entry_last_pages = test_is_unimplemented, + .single_submap_single_entry_middle_pages = test_is_unimplemented, + .single_submap_oversize_entry_at_start = test_is_unimplemented, + .single_submap_oversize_entry_at_end = test_is_unimplemented, + .single_submap_oversize_entry_at_both = test_is_unimplemented, + + .submap_before_allocation = test_is_unimplemented, + .submap_after_allocation = test_is_unimplemented, + .submap_before_hole = test_is_unimplemented, + .submap_after_hole = test_is_unimplemented, + .submap_allocation_submap_one_entry = test_is_unimplemented, + .submap_allocation_submap_two_entries = test_is_unimplemented, + .submap_allocation_submap_three_entries = test_is_unimplemented, + + .submap_before_allocation_ro = test_is_unimplemented, + .submap_after_allocation_ro = test_is_unimplemented, + .submap_before_hole_ro = test_is_unimplemented, + .submap_after_hole_ro = test_is_unimplemented, + .submap_allocation_submap_one_entry_ro = test_is_unimplemented, + .submap_allocation_submap_two_entries_ro = test_is_unimplemented, + .submap_allocation_submap_three_entries_ro = test_is_unimplemented, + + .protection_single_000_000 = failed_vm_wire_read_not_cow, + .protection_single_000_r00 = failed_vm_wire_read_not_cow, + .protection_single_000_0w0 = failed_vm_wire_read_not_cow, + .protection_single_000_rw0 = failed_vm_wire_read_not_cow, + .protection_single_r00_r00 = successful_vm_wire_read_not_cow, + .protection_single_r00_rw0 = successful_vm_wire_read_not_cow, + .protection_single_0w0_0w0 = failed_vm_wire_read_not_cow, + .protection_single_0w0_rw0 = failed_vm_wire_read_not_cow, + .protection_single_rw0_rw0 = successful_vm_wire_read_not_cow, + + .protection_pairs_000_000 = failed_vm_wire_read_not_cow, + .protection_pairs_000_r00 = failed_vm_wire_read_not_cow, + .protection_pairs_000_0w0 = failed_vm_wire_read_not_cow, + .protection_pairs_000_rw0 = failed_vm_wire_read_not_cow, + .protection_pairs_r00_000 = failed_vm_wire_read_not_cow, + .protection_pairs_r00_r00 = successful_vm_wire_read_not_cow, + .protection_pairs_r00_0w0 = failed_vm_wire_read_not_cow, + .protection_pairs_r00_rw0 = successful_vm_wire_read_not_cow, + .protection_pairs_0w0_000 = failed_vm_wire_read_not_cow, + .protection_pairs_0w0_r00 = failed_vm_wire_read_not_cow, + .protection_pairs_0w0_0w0 = failed_vm_wire_read_not_cow, + .protection_pairs_0w0_rw0 = failed_vm_wire_read_not_cow, + .protection_pairs_rw0_000 = failed_vm_wire_read_not_cow, + .protection_pairs_rw0_r00 = successful_vm_wire_read_not_cow, + .protection_pairs_rw0_0w0 = failed_vm_wire_read_not_cow, + .protection_pairs_rw0_rw0 = successful_vm_wire_read_not_cow, + }; + + run_vm_tests("vm_wire", __FILE__, &tests, argc, argv); +} + + +T_DECL(vm_unwire, + "run vm_unwire with various vm configurations") +{ + vm_tests_t tests = { + .single_entry_1 = test_is_unimplemented, + .single_entry_2 = test_is_unimplemented, + .single_entry_3 = test_is_unimplemented, + .single_entry_4 = test_is_unimplemented, + + .multiple_entries_1 = test_is_unimplemented, + 
.multiple_entries_2 = test_is_unimplemented, + .multiple_entries_3 = test_is_unimplemented, + .multiple_entries_4 = test_is_unimplemented, + .multiple_entries_5 = test_is_unimplemented, + .multiple_entries_6 = test_is_unimplemented, + + .some_holes_1 = vm_unwire_holes, + .some_holes_2 = vm_unwire_holes, + .some_holes_3 = vm_unwire_holes, + .some_holes_4 = vm_unwire_holes, + .some_holes_5 = vm_unwire_holes, + .some_holes_6 = vm_unwire_holes, + .some_holes_7 = vm_unwire_holes, + .some_holes_8 = vm_unwire_holes, + .some_holes_9 = vm_unwire_holes, + .some_holes_10 = vm_unwire_holes, + .some_holes_11 = vm_unwire_holes, + .some_holes_12 = vm_unwire_holes, + + .all_holes_1 = vm_unwire_holes, + .all_holes_2 = vm_unwire_holes, + .all_holes_3 = vm_unwire_holes, + .all_holes_4 = vm_unwire_holes, + + .null_entry = test_is_unimplemented, + .nonresident_entry = test_is_unimplemented, + .resident_entry = test_is_unimplemented, + + .shared_entry = test_is_unimplemented, + .shared_entry_discontiguous = test_is_unimplemented, + .shared_entry_partial = test_is_unimplemented, + .shared_entry_pairs = test_is_unimplemented, + .shared_entry_x1000 = test_is_unimplemented, + + .cow_entry = test_is_unimplemented, + .cow_unreferenced = test_is_unimplemented, + .cow_nocow = test_is_unimplemented, + .nocow_cow = test_is_unimplemented, + .cow_unreadable = test_is_unimplemented, + .cow_unwriteable = test_is_unimplemented, + + .permanent_entry = test_is_unimplemented, + .permanent_before_permanent = test_is_unimplemented, + .permanent_before_allocation = test_is_unimplemented, + .permanent_before_allocation_2 = test_is_unimplemented, + .permanent_before_hole = test_is_unimplemented, + .permanent_after_allocation = test_is_unimplemented, + .permanent_after_hole = test_is_unimplemented, + + .single_submap_single_entry = test_is_unimplemented, + .single_submap_single_entry_first_pages = test_is_unimplemented, + .single_submap_single_entry_last_pages = test_is_unimplemented, + .single_submap_single_entry_middle_pages = test_is_unimplemented, + .single_submap_oversize_entry_at_start = test_is_unimplemented, + .single_submap_oversize_entry_at_end = test_is_unimplemented, + .single_submap_oversize_entry_at_both = test_is_unimplemented, + + .submap_before_allocation = test_is_unimplemented, + .submap_after_allocation = test_is_unimplemented, + .submap_before_hole = test_is_unimplemented, + .submap_after_hole = test_is_unimplemented, + .submap_allocation_submap_one_entry = test_is_unimplemented, + .submap_allocation_submap_two_entries = test_is_unimplemented, + .submap_allocation_submap_three_entries = test_is_unimplemented, + + .submap_before_allocation_ro = test_is_unimplemented, + .submap_after_allocation_ro = test_is_unimplemented, + .submap_before_hole_ro = test_is_unimplemented, + .submap_after_hole_ro = test_is_unimplemented, + .submap_allocation_submap_one_entry_ro = test_is_unimplemented, + .submap_allocation_submap_two_entries_ro = test_is_unimplemented, + .submap_allocation_submap_three_entries_ro = test_is_unimplemented, + + .protection_single_000_000 = test_is_unimplemented, + .protection_single_000_r00 = test_is_unimplemented, + .protection_single_000_0w0 = test_is_unimplemented, + .protection_single_000_rw0 = test_is_unimplemented, + .protection_single_r00_r00 = test_is_unimplemented, + .protection_single_r00_rw0 = test_is_unimplemented, + .protection_single_0w0_0w0 = test_is_unimplemented, + .protection_single_0w0_rw0 = test_is_unimplemented, + .protection_single_rw0_rw0 = test_is_unimplemented, + + 
.protection_pairs_000_000 = test_is_unimplemented, + .protection_pairs_000_r00 = test_is_unimplemented, + .protection_pairs_000_0w0 = test_is_unimplemented, + .protection_pairs_000_rw0 = test_is_unimplemented, + .protection_pairs_r00_000 = test_is_unimplemented, + .protection_pairs_r00_r00 = test_is_unimplemented, + .protection_pairs_r00_0w0 = test_is_unimplemented, + .protection_pairs_r00_rw0 = test_is_unimplemented, + .protection_pairs_0w0_000 = test_is_unimplemented, + .protection_pairs_0w0_r00 = test_is_unimplemented, + .protection_pairs_0w0_0w0 = test_is_unimplemented, + .protection_pairs_0w0_rw0 = test_is_unimplemented, + .protection_pairs_rw0_000 = test_is_unimplemented, + .protection_pairs_rw0_r00 = test_is_unimplemented, + .protection_pairs_rw0_0w0 = test_is_unimplemented, + .protection_pairs_rw0_rw0 = test_is_unimplemented, + }; + + run_vm_tests("vm_unwire", __FILE__, &tests, argc, argv); +} diff --git a/tests/vm/corpse_footprint.c b/tests/vm/corpse_footprint.c new file mode 100644 index 000000000..5675ac4c6 --- /dev/null +++ b/tests/vm/corpse_footprint.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.corpse"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_CHECK_LEAKS(false)); + +static pid_t +spawn_munch(size_t footprint) +{ + char **launch_tool_args; + pid_t child_pid; + int ret; + + char size_arg[64]; + + T_LOG("Spawning munch with size %lu MiB", footprint >> 20); + + T_QUIET; T_ASSERT_POSIX_SUCCESS( + snprintf(size_arg, sizeof(size_arg), "--lim-size=%lub", footprint), + "snprintf()"); + + launch_tool_args = (char *[]){ + "/usr/local/bin/munch", + "--cfg-inprocess", + "--fill-cr=2.5", + "--type=malloc", + size_arg, + NULL + }; + + /* Spawn the child process. 
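+ * The child runs until the test ends; perf_fork_corpse_teardown(),
+ * registered with T_ATEND below, sends munch_pid SIGINT and waits for
+ * the process to exit.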
*/ + ret = dt_launch_tool(&child_pid, launch_tool_args, false, NULL, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dt_launch_tool"); + T_QUIET; T_ASSERT_GT(child_pid, 0, "child pid"); + + return child_pid; +} + +static pid_t munch_pid = 0; + +static void +perf_fork_corpse_teardown(void) +{ + int ret; + bool exited; + + ret = kill(munch_pid, SIGINT); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kill()"); + + exited = dt_waitpid(munch_pid, NULL, NULL, 30); + T_QUIET; T_ASSERT_TRUE(exited, "dt_wait_pid()"); +} + +T_DECL(perf_fork_corpse, + "Performance test for forking corpses", + // T_META_ENABLED(!(TARGET_OS_WATCH || TARGET_OS_BRIDGE || TARGET_OS_TV)), + T_META_ENABLED(false), /* rdar://148736982 */ + T_META_BOOTARGS_SET("amfi_unrestrict_task_for_pid=1"), + T_META_TAG_PERF, + T_META_TAG_VM_NOT_PREFERRED, + T_META_RUN_CONCURRENTLY(false)) +{ + size_t footprint = 512 << 20; // 512 MiB + mach_port_t corpse_port; + mach_port_t task_port; + kern_return_t kr; + + pid_t pid = spawn_munch(footprint); + + T_ATEND(perf_fork_corpse_teardown); + + kr = task_for_pid(mach_task_self(), pid, &task_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_for_pid()"); + T_QUIET; T_ASSERT_NE(task_port, MACH_PORT_NULL, "task_for_pid"); + + dt_stat_time_t stat = dt_stat_time_create("duration"); + + T_LOG("Collecting measurements..."); + while (!dt_stat_stable(stat)) { + T_STAT_MEASURE(stat) { + kr = task_generate_corpse(task_port, &corpse_port); + } + if (kr != KERN_SUCCESS) { + T_SKIP("Unable to generate a corpse (%d | %s)", kr, mach_error_string(kr)); + } + + mach_port_deallocate(mach_task_self(), corpse_port); + } + dt_stat_finalize(stat); +} diff --git a/tests/vm/corpse_owned_vmobjects.c b/tests/vm/corpse_owned_vmobjects.c index 8785bee34..3eefe69fb 100644 --- a/tests/vm/corpse_owned_vmobjects.c +++ b/tests/vm/corpse_owned_vmobjects.c @@ -56,6 +56,7 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.memorystatus"), T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("aaron_j_sonin"), T_META_RADAR_COMPONENT_VERSION("VM")); /* Globals */ @@ -253,6 +254,7 @@ exc_thread(void *arg) T_DECL(corpse_owned_vmobjects, "vm.get_owned_vmobjects sysctl on corpses", T_META_ASROOT(true), + T_META_BOOTARGS_SET("memstat_no_task_limit_increase=1"), T_META_TAG_VM_PREFERRED ) { diff --git a/tests/vm/entitlement_increased_memory_limit.c b/tests/vm/entitlement_increased_memory_limit.c index f6b49e7ee..ebfedad48 100644 --- a/tests/vm/entitlement_increased_memory_limit.c +++ b/tests/vm/entitlement_increased_memory_limit.c @@ -61,6 +61,7 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.vm"), T_META_RADAR_COMPONENT_NAME("xnu"), T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_BOOTARGS_SET("memstat_no_task_limit_increase=1"), T_META_TAG_VM_PREFERRED); static int32_t old_entitled_max_task_pmem = 0; diff --git a/tests/vm/entitlement_internal_bands.c b/tests/vm/entitlement_internal_bands.c index dccc19913..76847ac05 100644 --- a/tests/vm/entitlement_internal_bands.c +++ b/tests/vm/entitlement_internal_bands.c @@ -31,7 +31,7 @@ T_DECL(can_not_use_internal_bands_without_entitlement, "Can not move process int #if ENTITLED T_QUIET; T_ASSERT_EQ(set_band, band, "Able to use entitled band"); #else - T_QUIET; T_ASSERT_EQ(set_band, JETSAM_PRIORITY_IDLE, "Fell through to idle band"); + T_QUIET; T_ASSERT_EQ(set_band, JETSAM_PRIORITY_BACKGROUND, "Fell through to background band"); #endif } } diff --git a/tests/vm/memory-ownership-transfer.entitlements b/tests/vm/memory-ownership-transfer.entitlements new file mode 100644 index 000000000..be9976c3f --- /dev/null +++ 
b/tests/vm/memory-ownership-transfer.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.memory.ownership_transfer + + + diff --git a/tests/vm/memorystatus_convert_limit_bytes.c b/tests/vm/memorystatus_convert_limit_bytes.c index 23927b1c2..0e975e59d 100644 --- a/tests/vm/memorystatus_convert_limit_bytes.c +++ b/tests/vm/memorystatus_convert_limit_bytes.c @@ -11,6 +11,7 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.vm"), T_META_RADAR_COMPONENT_NAME("xnu"), T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_BOOTARGS_SET("memstat_no_task_limit_increase=1"), T_META_ENABLED(!TARGET_OS_OSX)); T_DECL(memorystatus_convert_limit_bytes, "memorystatus_convert_limit_bytes default limit", T_META_TAG_VM_PREFERRED) diff --git a/tests/vm/memorystatus_freeze_test.c b/tests/vm/memorystatus_freeze_test.c index e5faef2ef..bace290cd 100644 --- a/tests/vm/memorystatus_freeze_test.c +++ b/tests/vm/memorystatus_freeze_test.c @@ -6,10 +6,11 @@ #include #include #include -#include -#include -#include +#include #include +#include +#include +#include #include #include #include @@ -27,9 +28,8 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.memorystatus"), T_META_RADAR_COMPONENT_NAME("xnu"), - T_META_RADAR_COMPONENT_VERSION("VM - memory pressure"), + T_META_RADAR_COMPONENT_VERSION("VM"), T_META_CHECK_LEAKS(false), - T_META_OWNER("jarrad"), T_META_RUN_CONCURRENTLY(false) ); @@ -2264,7 +2264,7 @@ T_DECL(memorystatus_coalition_freezer_slot_limit, "Exhausting freezer slots and /* Create our coalitions and spawn the leader / "XPC service" members */ spawn_coalition_and_run(^{ - int i, j, ret, n_coal_frozen = 0; + int i, ret, n_coal_frozen = 0; memorystatus_jetsam_snapshot_t *snapshot; memorystatus_jetsam_snapshot_entry_t *entry; @@ -2323,7 +2323,7 @@ T_DECL(memorystatus_two_coalition_freeze, "Exhausting freezer slots with one coa sig_disp = run_block_after_signal(SIGUSR2, ^{ /* After our child signals, we can try spawning and freezing our coalition */ spawn_coalition_and_run(^{ - int i, j, ret, n_coal_frozen = 0; + int i, ret, n_coal_frozen = 0; memorystatus_jetsam_snapshot_t *snapshot; memorystatus_jetsam_snapshot_entry_t *entry; @@ -2390,3 +2390,23 @@ T_HELPER_DECL(coalition_freezer, "Spawns a coalition and freezes it", dispatch_main(); } + +T_DECL(do_fastwake_warmup_all, + "Test kern.memorystatus_do_fastwake_warmup_all", + T_META_ASROOT(true), + T_META_ENABLED(false /* rdar://149557081 !TARGET_OS_OSX && !TARGET_OS_BRIDGE */)) +{ + int val = 1; + int ret = sysctlbyname("kern.memorystatus_do_fastwake_warmup_all", NULL, NULL, &val, sizeof(val)); + T_ASSERT_POSIX_SUCCESS(ret, "sysctl(kern.memorystatus_do_fastwake_warmup_all)"); + + struct vm_compressor_q_lens cstats; + mach_msg_type_number_t count = VM_COMPRESSOR_Q_LENS_COUNT; + + kern_return_t kr = host_info(mach_host_self(), HOST_VM_COMPRESSOR_Q_LENS, + (host_info_t)&cstats, &count); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_info(HOST_VM_COMPRESSOR_Q_LENS)"); + + T_EXPECT_EQ(cstats.qcc_swappedout_count, 0, "Zero swapped-out segments after fastwake warmup"); + T_EXPECT_EQ(cstats.qcc_swappedout_sparse_count, 0, "Zero sparse swapped-out segments after fastwake warmup"); +} diff --git a/tests/vm/memorystatus_kill_counts.c b/tests/vm/memorystatus_kill_counts.c index d94c504a3..fe877bfb1 100644 --- a/tests/vm/memorystatus_kill_counts.c +++ b/tests/vm/memorystatus_kill_counts.c @@ -46,14 +46,18 @@ spawn_and_jetsam(int32_t band) T_QUIET; T_ASSERT_POSIX_ZERO(error, "spawn child"); prop.priority = band; prop.user_data = 0; - error = 
memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, child, 0, &prop, sizeof(prop)); - T_QUIET; T_ASSERT_POSIX_ZERO(error, "set child properties"); + error = memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED, child, 1, NULL, 0); + T_QUIET; T_ASSERT_POSIX_ZERO(error, "set child managed"); + error = memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, child, MEMORYSTATUS_SET_PRIORITY_ASSERTION, &prop, sizeof(prop)); + T_QUIET; T_ASSERT_POSIX_ZERO(error, "set child priority"); error = memorystatus_control(MEMORYSTATUS_CMD_TEST_JETSAM, child, 0, NULL, 0); T_QUIET; T_ASSERT_POSIX_ZERO(error, "jetsam child"); } #define N_TEST_BANDS 5 -int32_t test_bands[N_TEST_BANDS] = {0, 30, 35, 40, 45}; +// Insert at head to skip idle aging +int32_t test_bands[N_TEST_BANDS] = {JETSAM_PRIORITY_IDLE_HEAD, JETSAM_PRIORITY_BACKGROUND, 35, JETSAM_PRIORITY_MAIL, 45}; +int32_t expected_bands[N_TEST_BANDS] = {JETSAM_PRIORITY_IDLE, JETSAM_PRIORITY_BACKGROUND, 35, JETSAM_PRIORITY_MAIL, 45}; int32_t proc_counts[N_TEST_BANDS] = {2, 3, 1, 2, 4}; #define BUFFER_SIZE (sizeof(uint32_t) * (JETSAM_REASON_MEMORYSTATUS_MAX + 1)) @@ -76,8 +80,8 @@ T_DECL(memorystatus_kill_counts, "jetsam kill counts", void (^get_all_kill_counts)(uint32_t**, int) = ^(uint32_t **buffers, int flags){ int i, error; for (i = 0; i < N_TEST_BANDS; i++) { - error = get_kill_counts(buffers[i], BUFFER_SIZE, test_bands[i], flags); - T_ASSERT_POSIX_ZERO(error, "get kill counts (band %d)", test_bands[i]); + error = get_kill_counts(buffers[i], BUFFER_SIZE, expected_bands[i], flags); + T_ASSERT_POSIX_ZERO(error, "get kill counts (band %d)", expected_bands[i]); } }; diff --git a/tests/vm/memorystatus_rearm.c b/tests/vm/memorystatus_rearm.c new file mode 100644 index 000000000..9e7769b03 --- /dev/null +++ b/tests/vm/memorystatus_rearm.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* internal */ +#include +#include + +#define TEST_MEMLIMIT_MB 10 +#define SEM_TIMEOUT dispatch_time(DISPATCH_TIME_NOW, 1 * NSEC_PER_SEC) +#define REARM_TIMES 5 + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.memorystatus"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_OWNER("aaron_j_sonin"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_BOOTARGS_SET("memstat_no_task_limit_increase=1")); + +/* Globals */ +static dispatch_semaphore_t sync_sema; +static pid_t child_pid; + +/* Exception */ +kern_return_t +catch_mach_exception_raise(mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t code_count) +{ +#pragma unused(exception_port, thread, task, code_count) + if (exception != EXC_RESOURCE) { + T_LOG("Received unknown exception %d\n", exception); + return KERN_FAILURE; + } + + mach_exception_data_type_t resource = EXC_RESOURCE_DECODE_RESOURCE_TYPE(code[0]); + mach_exception_data_type_t flavor = EXC_RESOURCE_DECODE_FLAVOR(code[0]); + + if (resource != RESOURCE_TYPE_MEMORY) { + T_LOG("Received EXC_RESOURCE, but not for memory"); + return KERN_FAILURE; + } + + if (flavor != FLAVOR_HIGH_WATERMARK) { + T_LOG("Received EXC_RESOURCE, but not high watermark"); + return KERN_FAILURE; + } + + T_LOG("Received memory high watermark EXC_RESOURCE!\n"); + dispatch_semaphore_signal(sync_sema); + return KERN_SUCCESS; +} + +/* Unused, but necessary to link w/ excserver */ +kern_return_t +catch_mach_exception_raise_state(mach_port_t exception_port, + exception_type_t exception, + const mach_exception_data_t code, + mach_msg_type_number_t code_count, + int * flavor, + const thread_state_t old_state, + mach_msg_type_number_t old_state_count, + thread_state_t new_state, + mach_msg_type_number_t * new_state_count) +{ +#pragma unused(exception_port, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count) + T_FAIL("Unsupported catch_mach_exception_raise_state"); + return KERN_NOT_SUPPORTED; +} + +/* Unused, but necessary to link w/ excserver */ +kern_return_t +catch_mach_exception_raise_state_identity(mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t code_count, + int * flavor, + thread_state_t old_state, + mach_msg_type_number_t old_state_count, + thread_state_t new_state, + mach_msg_type_number_t * new_state_count) +{ +#pragma unused(exception_port, thread, task, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count) + T_FAIL("Unsupported catch_mach_exception_raise_state_identity"); + return KERN_NOT_SUPPORTED; +} + +void +eat_memory(int num_pages) +{ + int ret; + int i, j; + unsigned char *buf; + + for (i = 0; i < REARM_TIMES; i++) { + /* Allocate and touch all our pages */ + T_LOG("Allocating %d pages...", num_pages); + buf = mmap(NULL, vm_page_size * num_pages, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(buf, "mmap"); + for (j = 0; j < num_pages; j++) { + ((volatile unsigned char *)buf)[j * vm_page_size] = 1; + } + + /* Free them, hopefully putting us back under the limit */ + T_LOG("Freeing..."); + munmap((void*) ((size_t) buf + vm_page_size), vm_page_size * (num_pages - 1)); + + /* 
Re-arm EXC_RESOURCE */ + ret = memorystatus_control( + MEMORYSTATUS_CMD_REARM_MEMLIMIT, + getpid(), + MEMORYSTATUS_FLAGS_REARM_ACTIVE | MEMORYSTATUS_FLAGS_REARM_INACTIVE, + NULL, 0); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "memorystatus_control(MEMORYSTATUS_CMD_REARM_MEMLIMIT)"); + } + + exit(0); +} + +/* + * Background process that will allocate enough memory to push + * itself over the threshold, hopefully triggering EXC_RESOURCE. + */ +T_HELPER_DECL(memory_enjoyer, "") { + int ret; + sig_t sig; + dispatch_source_t dispatch; + int num_pages = 0; + + if (argc == 1) { + num_pages = atoi(argv[0]); + } + + /* Use dispatch to wait for the signal from our parent to start eating memory */ + sig = signal(SIGUSR1, SIG_IGN); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NE(sig, SIG_ERR, "signal(SIGUSR1)"); + dispatch = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); + T_QUIET; T_ASSERT_NOTNULL(dispatch, "dispatch_source_create"); + dispatch_source_set_event_handler(dispatch, ^{ + eat_memory(num_pages); + }); + dispatch_activate(dispatch); + + /* Signal parent that we're ready */ + ret = kill(getppid(), SIGUSR1); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "signal parent"); + + dispatch_main(); +} + +static void +kill_child(void) +{ + int ret = kill(child_pid, SIGKILL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kill"); +} + +static pid_t +launch_child(int num_pages) +{ + extern char **environ; + int ret; + pid_t pid; + char testpath[PATH_MAX]; + posix_spawnattr_t spawn_attrs; + + uint32_t testpath_buf_size = PATH_MAX; + char num_pages_str[32] = {0}; + char *argv[5] = {testpath, "-n", "memory_enjoyer", num_pages_str, NULL}; + + T_LOG("Spawning child process..."); + + ret = posix_spawnattr_init(&spawn_attrs); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init"); + ret = posix_spawnattr_setjetsam_ext(&spawn_attrs, 0, JETSAM_PRIORITY_FOREGROUND, TEST_MEMLIMIT_MB, TEST_MEMLIMIT_MB); + + ret = snprintf(num_pages_str, sizeof(num_pages_str), "%d", num_pages); + T_QUIET; T_ASSERT_LE((size_t) ret, sizeof(num_pages_str), "Don't allocate too many pages."); + ret = _NSGetExecutablePath(testpath, &testpath_buf_size); + T_QUIET; T_ASSERT_EQ(ret, 0, "_NSGetExecutablePath"); + ret = posix_spawn(&pid, argv[0], NULL, &spawn_attrs, argv, environ); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "posix_spawn"); + + T_ATEND(kill_child); + + return pid; +} + +static void * +exc_handler_thread(void * arg) +{ +#pragma unused(arg) + kern_return_t kret; + mach_port_t exception_port; + + /* Set up our exception port. 
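+ * The port is registered for EXC_MASK_RESOURCE with MACH_EXCEPTION_CODES;
+ * mach_msg_server() then dispatches incoming messages to
+ * catch_mach_exception_raise() above.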
*/ + + kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exception_port); + if (kret != KERN_SUCCESS) { + T_FAIL("mach_port_allocate: %s (%d)", mach_error_string(kret), kret); + } + + kret = mach_port_insert_right(mach_task_self(), exception_port, exception_port, MACH_MSG_TYPE_MAKE_SEND); + if (kret != KERN_SUCCESS) { + T_FAIL("mach_port_insert_right: %s (%d)", mach_error_string(kret), kret); + } + + kret = task_set_exception_ports(mach_task_self(), EXC_MASK_RESOURCE, exception_port, + (exception_behavior_t)(EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES), 0); + if (kret != KERN_SUCCESS) { + T_FAIL("task_set_exception_ports: %s (%d)", mach_error_string(kret), kret); + } + + dispatch_semaphore_signal(sync_sema); + + kret = mach_msg_server(mach_exc_server, MACH_MSG_SIZE_RELIABLE, exception_port, 0); + if (kret != KERN_SUCCESS) { + T_FAIL("mach_msg_server: %s (%d)", mach_error_string(kret), kret); + } + + return NULL; +} + +T_DECL(memorylimit_exception_tests, "EXC_RESOURCE re-arming", + T_META_ASROOT(true), + T_META_TAG_VM_PREFERRED, + T_META_ENABLED(!TARGET_OS_OSX) + ) +{ + int num_pages; + long dispatch_err; + sig_t sig_ret; + dispatch_source_t dispatch; + pthread_t handle_thread; + + T_SETUPBEGIN; + + sync_sema = dispatch_semaphore_create(0); + + /* Start our exception handling thread */ + T_ASSERT_POSIX_ZERO(pthread_create(&handle_thread, NULL, exc_handler_thread, NULL), "pthread_create"); + dispatch_err = dispatch_semaphore_wait(sync_sema, SEM_TIMEOUT); + T_QUIET; T_ASSERT_EQ(dispatch_err, 0L, "dispatch_semaphore_wait"); + + /* Make sure we handle SIGUSR1 */ + sig_ret = signal(SIGUSR1, SIG_IGN); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NE(sig_ret, SIG_ERR, "signal(SIGUSR1)"); + + /* + * When we receive SIGUSR1 from our child (to indicate that it's ready), send it a + * SIGUSR1 back to indicate that we're ready (i.e. we've attached to the child). + * Then, wait for EXC_RESOURCE to happen REARM_TIMES times. 
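+ * Each exception corresponds to one pass of eat_memory() in the child:
+ * it exceeds its TEST_MEMLIMIT_MB limit, frees the pages, and re-arms
+ * the limit with MEMORYSTATUS_CMD_REARM_MEMLIMIT so the next pass
+ * fires again.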
+ */ + dispatch = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); + T_QUIET; T_ASSERT_NOTNULL(dispatch, "dispatch_source_create"); + dispatch_source_set_event_handler(dispatch, ^{ + int ret; + + /* Attach to child */ + ret = ptrace(PT_ATTACHEXC, child_pid, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "ptrace"); + + /* Tell child we're ready */ + kill(child_pid, SIGUSR1); + + /* Wait for EXC_RESOURCEs to be delivered */ + for (int i = 0; i < REARM_TIMES; i++) { + long dispatch_err = dispatch_semaphore_wait(sync_sema, SEM_TIMEOUT); + T_QUIET; T_ASSERT_EQ(dispatch_err, 0L, "Received EXC_RESOURCE"); + } + T_END; + }); + dispatch_activate(dispatch); + + /* Spawn child and attach to it */ + num_pages = (TEST_MEMLIMIT_MB * (1 << 20)) / vm_page_size; + child_pid = launch_child(num_pages); + + T_SETUPEND; + + dispatch_main(); +} diff --git a/tests/vm/memorystatus_rearm.entitlements b/tests/vm/memorystatus_rearm.entitlements new file mode 100644 index 000000000..e3f128475 --- /dev/null +++ b/tests/vm/memorystatus_rearm.entitlements @@ -0,0 +1,12 @@ + + + + + com.apple.private.set-exception-port + + com.apple.private.amfi.can-set-exception-ports + + internal.com.apple.system-task-ports.control + + + diff --git a/tests/vm/memorystatus_sort_test.c b/tests/vm/memorystatus_sort_test.c index 2f854b628..73e2b3af7 100644 --- a/tests/vm/memorystatus_sort_test.c +++ b/tests/vm/memorystatus_sort_test.c @@ -17,25 +17,30 @@ T_GLOBAL_META( T_META_RADAR_COMPONENT_NAME("xnu"), T_META_RADAR_COMPONENT_VERSION("VM")); -#define kNumProcsInCoalition 4 +#define NUM_PER_ROLE 3 /* Number of procs per role in coalition (besides leader) */ +#define NUM_PROCS_IN_COALITION (NUM_PER_ROLE * (COALITION_NUM_TASKROLES - 1) + 1) +#define NUM_COALITIONS 3 + +#define COAL_ORDER_NUM_PIDS (NUM_PROCS_IN_COALITION + COALITION_NUM_TASKROLES - 1) typedef struct { - pid_t pids[kNumProcsInCoalition]; // An array of pids in this coalition. Owned by this struct. - pid_t expected_order[kNumProcsInCoalition]; // An array of pids in this coalition in proper sorted order. + pid_t pids[NUM_PROCS_IN_COALITION]; // An array of pids in this coalition. Owned by this struct. + pid_t expected_order[COAL_ORDER_NUM_PIDS]; // An array of pids in this coalition in proper sorted order. uint64_t ids[COALITION_NUM_TYPES]; + size_t leader_footprint; } coalition_info_t; /* * Children pids spawned by this test that need to be cleaned up. * Has to be a global because the T_ATEND API doesn't take any arguments. */ -#define kMaxChildrenProcs 16 +#define kMaxChildrenProcs NUM_PROCS_IN_COALITION * NUM_COALITIONS + 1 static pid_t children_pids[kMaxChildrenProcs]; static size_t num_children = 0; /* * Sets up a new coalition. */ -static void init_coalition(coalition_info_t*); +static void init_coalition(coalition_info_t*, size_t leader_fp); /* * Places all procs in the coalition in the given band. @@ -85,6 +90,136 @@ static int get_vmpage_size(void); static pid_t launch_proc_in_coalition(uint64_t *coalition_ids, int role, int num_pages); +static void +bufprint(char **buf, size_t *size, const char *fmt, ...) 
+{ + va_list list; + int n_written; + + va_start(list, fmt); + n_written = vsnprintf(*buf, *size, fmt, list); + va_end(list); + + if (n_written > 0) { + *buf += n_written; + *size -= n_written; + } +} + +static char * +pids_str(pid_t *pids, int n_pids) +{ + int i; + size_t buf_len = n_pids * 8 + 2; /* For good measure */ + char *buf = malloc(buf_len); + char *obuf = buf; + + bufprint(&buf, &buf_len, "("); + + for (i = 0; (i < n_pids) && (buf_len > 0); i++) { + if (pids[i] == -1) { + bufprint(&buf, &buf_len, "), ("); + } else { + bool is_last = (i == (n_pids - 1)) || (pids[i + 1] == -1); + bufprint(&buf, &buf_len, "%d%s", pids[i], is_last ? "" : ", "); + } + } + + bufprint(&buf, &buf_len, ")"); + + return obuf; +} + +/* + * Sorts the given jetsam band with the desired order and verifies that the + * sort was done correctly. + * `expected_order` is an array of groups of PIDs separated by `-1`, where PIDs + * in each group are re-orderable. For instance, for the expected order: + * [1, 2, -1, 3, -1, 4] + * the orderings of + * 1, 2, 3, 4 and 2, 1, 3, 4 are both valid since 1 and 2 are in the same group. + */ +static void +sort_and_verify( + unsigned int prio, + memorystatus_jetsam_sort_order_t order, + pid_t *expected_order, + size_t expected_order_len) +{ + size_t i, j, n_pids, group_idx; + bool in_order; + pid_t *actual_order; + pid_t *original_expected_order; + + /* Bigger than we need it, but that's fine */ + actual_order = malloc(sizeof(pid_t) * expected_order_len); + + /* Make a copy of expected_order since we'll be overwriting it */ + original_expected_order = malloc(sizeof(pid_t) * expected_order_len); + memcpy(original_expected_order, expected_order, sizeof(pid_t) * expected_order_len); + + /* + * Add only the actual pids from expected_order in to tell memorystatus which + * PIDs we care about + */ + n_pids = 0; + for (i = 0; i < expected_order_len; i++) { + if (expected_order[i] != -1) { + actual_order[n_pids] = expected_order[i]; + n_pids++; + } + } + + int ret = memorystatus_control(MEMORYSTATUS_CMD_TEST_JETSAM_SORT, prio, order, + actual_order, n_pids * sizeof(pid_t)); + T_QUIET; T_EXPECT_POSIX_SUCCESS(ret, "Band sorted and order copied out"); + + /* Check that the order we got was what we expected */ + group_idx = 0; /* idx of pid that starts current reorderable group */ + for (i = 0; i < n_pids; i++) { + /* + * Check if the current pid in actual_order is in the current group. + * If not, advance to the next group until we find it. This is essentially + * a ratcheting mechanism - we can move our search group forwards, but not + * backwards. + */ + for (j = group_idx; j < expected_order_len; j++) { + if (expected_order[j] == -1) { + /* Made it to the end of a group w/o finding the pid */ + group_idx = j + 1; + continue; + } else if (expected_order[j] == actual_order[i]) { + /* Found our pid. Mark it found */ + expected_order[j] = 0; + break; + } + } + } + + + /* Check that all pids were actually found */ + in_order = true; + for (i = 0; i < expected_order_len; i++) { + if ((expected_order[i] != -1) && (expected_order[i] != 0)) { + in_order = false; + break; + } + } + + T_EXPECT_TRUE(in_order, "Band in correct order when sorted in order (%d)", order); + + if (!in_order) { + char *exp_str = pids_str(original_expected_order, expected_order_len); + char *actual_str = pids_str(actual_order, n_pids); + T_LOG("Out of order! 
Expected:\n%s\nbut got\n%s\n", exp_str, actual_str); + free(exp_str); + free(actual_str); + } + + free(actual_order); + free(original_expected_order); +} + /* * Background process that will munch some memory, signal its parent, and * then sit in a loop. @@ -95,13 +230,42 @@ T_HELPER_DECL(coalition_member, "Mock coalition member") { num_pages = atoi(argv[0]); } allocate_pages(num_pages); + if (num_pages) { + printf("%d has %d\n", getpid(), num_pages); + } // Signal to the parent that we've touched all of our pages. if (kill(getppid(), SIGUSR1) != 0) { T_LOG("Unable to signal to parent process!"); exit(1); } while (true) { - ; + sleep(100); + } +} + +static void +random_order(int *arr, int size) +{ + int i, a, b, s; + for (i = 0; i < size; i++) { + arr[i] = i; + } + for (i = 0; i < size; i++) { + a = rand() % size; + b = rand() % size; + s = arr[a]; + arr[a] = arr[b]; + arr[b] = s; + } +} + +static void +add_coalition_to_order(pid_t *order, coalition_info_t *coal, int coal_idx) +{ + int order_idx = coal_idx * (COAL_ORDER_NUM_PIDS + 1); + memcpy(&order[order_idx], &coal->expected_order, sizeof(coal->expected_order)); + if (coal_idx != 0) { + order[order_idx - 1] = -1; } } @@ -111,37 +275,89 @@ T_HELPER_DECL(coalition_member, "Mock coalition member") { * has a different coalition role. Verifies that the coalition * is sorted properly by role. */ -T_DECL(memorystatus_sort_coalition, "Coalition sort order", +#define COALS_EXPECTED_ORDER_LEN ((COAL_ORDER_NUM_PIDS * NUM_COALITIONS) + (NUM_COALITIONS - 1)) +T_DECL(memorystatus_sort_coalitions_footprint, "Sort coalitions by leader footprint", T_META_ASROOT(true), - T_META_TAG_VM_PREFERRED, - T_META_ENABLED(false /* rdar://133461319 */) - ) + T_META_TAG_VM_PREFERRED) { - int ret; - sig_t res; - coalition_info_t coalition; + int i; + coalition_info_t *coalitions; + int coalition_order[NUM_COALITIONS]; + pid_t *expected_order; /* Expected order of all pids in all coalitions */ + if (!has_unrestrict_coalitions()) { T_SKIP("Unable to test coalitions on this kernel."); } - res = signal(SIGUSR1, SIG_IGN); - T_WITH_ERRNO; T_ASSERT_NE(res, SIG_ERR, "SIG_IGN SIGUSR1"); unrestrict_coalitions(); - // Set up a new coalition with various members. - init_coalition(&coalition); T_ATEND(cleanup_children); T_ATEND(restrict_coalitions); - // Place all procs in the coalition in the foreground band - place_coalition_in_band(&coalition, JETSAM_PRIORITY_FOREGROUND); - // Have the kernel sort the foreground bucket and verify that it's - // sorted correctly. 
- ret = memorystatus_control(MEMORYSTATUS_CMD_TEST_JETSAM_SORT, JETSAM_PRIORITY_FOREGROUND, 0, - coalition.expected_order, kNumProcsInCoalition * sizeof(pid_t)); - T_QUIET; T_ASSERT_EQ(ret, 0, "Error while sorting or validating sorted order.\n" - "Check os log output for details.\n" - "Look for memorystatus_verify_sort_order."); + + /* Initialize our coalitions */ + coalitions = malloc(sizeof(coalition_info_t) * NUM_COALITIONS); + expected_order = malloc(sizeof(pid_t) * COALS_EXPECTED_ORDER_LEN); + + /* Spawn the coalitions in random order */ + random_order(coalition_order, NUM_COALITIONS); + + /* Spawn coalitions, each with a different leader footprint */ + for (i = 0; i < NUM_COALITIONS; i++) { + int coal = coalition_order[i]; + init_coalition(&coalitions[coal], (NUM_COALITIONS - coal) * 50); + add_coalition_to_order(expected_order, &coalitions[coal], coal); + place_coalition_in_band(&coalitions[coal], JETSAM_PRIORITY_FOREGROUND); + } + + /* Sort by leader footprint and verify coalitions are sorted by leader footprint */ + sort_and_verify(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_FOOTPRINT, expected_order, COALS_EXPECTED_ORDER_LEN); + + free(coalitions); + free(expected_order); } +T_DECL(memorystatus_sort_coalitions_lru, "Sort coalitions by leader LRU", + T_META_ASROOT(true), + T_META_TAG_VM_PREFERRED) +{ + int i; + coalition_info_t *coalitions; + int coalition_order[NUM_COALITIONS]; + pid_t *expected_order; /* Expected order of all pids in all coalitions */ + + if (!has_unrestrict_coalitions()) { + T_SKIP("Unable to test coalitions on this kernel."); + } + unrestrict_coalitions(); + + T_ATEND(cleanup_children); + T_ATEND(restrict_coalitions); + + /* Initialize our coalitions */ + coalitions = malloc(sizeof(coalition_info_t) * NUM_COALITIONS); + expected_order = malloc(sizeof(pid_t) * COALS_EXPECTED_ORDER_LEN); + + /* Spawn coalitions */ + for (i = 0; i < NUM_COALITIONS; i++) { + init_coalition(&coalitions[i], 0); + } + + /* Add coalitions to foreground in random order*/ + random_order(coalition_order, NUM_COALITIONS); + for (i = 0; i < NUM_COALITIONS; i++) { + int coal = coalition_order[i]; + place_coalition_in_band(&coalitions[coal], JETSAM_PRIORITY_FOREGROUND); + add_coalition_to_order(expected_order, &coalitions[coal], i); + } + + + /* Sort by leader LRU and verify coalitions are sorted by leader LRU */ + sort_and_verify(JETSAM_PRIORITY_FOREGROUND, JETSAM_SORT_LRU, expected_order, COALS_EXPECTED_ORDER_LEN); + + free(coalitions); + free(expected_order); +} + + /* * Test that sorting the idle bucket in footprint order works properly. * @@ -158,9 +374,9 @@ T_DECL(memorystatus_sort_footprint, "Footprint sort order", * Note that procs should be sorted in descending footprint order. 
*/ static const int kExpectedOrder[kNumChildren] = {2, 0, 1}; - static const int kJetsamBand = JETSAM_PRIORITY_IDLE; + static const int kJetsamBand = JETSAM_PRIORITY_BACKGROUND; __block pid_t pid; - sig_t res; + sig_t res; dispatch_source_t ds_allocated; T_ATEND(cleanup_children); @@ -176,16 +392,10 @@ T_DECL(memorystatus_sort_footprint, "Footprint sort order", place_proc_in_band(pid, kJetsamBand); } else { pid_t expected_order[kNumChildren] = {0}; - int ret; for (int i = 0; i < kNumChildren; i++) { expected_order[i] = children_pids[kExpectedOrder[i]]; } - // Verify the sort order - ret = memorystatus_control(MEMORYSTATUS_CMD_TEST_JETSAM_SORT, kJetsamBand, 0, - expected_order, sizeof(expected_order)); - T_QUIET; T_ASSERT_EQ(ret, 0, "Error while sorting or validating sorted order.\n" - "Check os log output for details.\n" - "Look for memorystatus_verify_sort_order."); + sort_and_verify(kJetsamBand, JETSAM_SORT_FOOTPRINT_NOCOAL, expected_order, kNumChildren); T_END; } }); @@ -233,9 +443,13 @@ launch_proc_in_coalition(uint64_t *coalition_ids, int role, int num_pages) } static void -init_coalition(coalition_info_t *coalition) +init_coalition(coalition_info_t *coalition, size_t leader_fp) { - int ret; + /* This code will need updating if we add a role */ + static_assert(COALITION_NUM_TASKROLES == 4); + + sigset_t set; + int ret, i, sig; uint32_t flags = 0; memset(coalition, 0, sizeof(coalition_info_t)); for (int i = 0; i < COALITION_NUM_TYPES; i++) { @@ -244,38 +458,52 @@ init_coalition(coalition_info_t *coalition) T_QUIET; T_ASSERT_POSIX_ZERO(ret, "coalition_create"); } + sigemptyset(&set); + ret = sigaddset(&set, SIGUSR1); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sigaddset(SIGUSR1)"); + + coalition->leader_footprint = leader_fp; + /* * Spawn procs for each coalition role, and construct the expected * sorted order. */ - for (size_t i = 0; i < kNumProcsInCoalition; i++) { + int n_roles[COALITION_NUM_TASKROLES] = {0}; + int role_order_idx[COALITION_NUM_TASKROLES] = { + /* COALITION_TASKROLE_UNDEF */ 0, + /* COALITION_TASKROLE_LEADER */ (NUM_PER_ROLE + 1) * 3, + /* COALITION_TASKROLE_XPC */ (NUM_PER_ROLE + 1) * 2, + /* COALITION_TASKROLE_EXT */ NUM_PER_ROLE + 1 + }; + for (i = 1; i < COALITION_NUM_TASKROLES; i++) { + coalition->expected_order[role_order_idx[i] - 1] = -1; + } + for (size_t i = 0; i < NUM_PROCS_IN_COALITION; i++) { int role; - if (i == 0) { - role = COALITION_TASKROLE_LEADER; - } else if (i == 1) { - role = COALITION_TASKROLE_EXT; - } else if (i == 2) { - role = COALITION_TASKROLE_UNDEF; - } else { - role = COALITION_TASKROLE_XPC; + size_t pages = 0; + + while (true) { + role = rand() % COALITION_NUM_TASKROLES; + if ((role == COALITION_TASKROLE_LEADER) && n_roles[role]) { + continue; /* Already have a leader */ + } else if (n_roles[role] == NUM_PER_ROLE) { + continue; /* Already have all of this role */ + } + n_roles[role]++; + break; } - pid_t pid = launch_proc_in_coalition(coalition->ids, role, 0); - coalition->pids[i] = pid; - /* - * Determine the expected sorted order. 
- * After a bucket has been coalition sorted, coalition members should - * be in the following kill order: - * undefined coalition members, extensions, xpc services, leader - */ + if (role == COALITION_TASKROLE_LEADER) { - coalition->expected_order[3] = pid; - } else if (role == COALITION_TASKROLE_XPC) { - coalition->expected_order[2] = pid; - } else if (role == COALITION_TASKROLE_EXT) { - coalition->expected_order[1] = pid; - } else { - coalition->expected_order[0] = pid; + pages = leader_fp; } + + pid_t pid = launch_proc_in_coalition(coalition->ids, role, pages); + ret = sigwait(&set, &sig); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sigwait"); + T_QUIET; T_ASSERT_EQ(sig, SIGUSR1, "sigwait == SIGUSR1"); + coalition->pids[i] = pid; + coalition->expected_order[role_order_idx[role]] = pid; + role_order_idx[role]++; } } @@ -294,7 +522,7 @@ place_proc_in_band(pid_t pid, int band) static void place_coalition_in_band(const coalition_info_t *coalition, int band) { - for (size_t i = 0; i < kNumProcsInCoalition; i++) { + for (size_t i = 0; i < NUM_PROCS_IN_COALITION; i++) { pid_t curr = coalition->pids[i]; place_proc_in_band(curr, band); } diff --git a/tests/vm/mixed_pagesize.plist b/tests/vm/mixed_pagesize.plist index 53e05afd6..9cdf8882b 100644 --- a/tests/vm/mixed_pagesize.plist +++ b/tests/vm/mixed_pagesize.plist @@ -19,8 +19,8 @@ TestName xnu.vm.mixed_pagesize - Enabled - + Disabled + Timeout diff --git a/tests/vm/test_vm_no_pager.m b/tests/vm/test_vm_no_pager.m index 4043ba282..c30e5046a 100644 --- a/tests/vm/test_vm_no_pager.m +++ b/tests/vm/test_vm_no_pager.m @@ -329,3 +329,12 @@ T_DECL(vm_no_pager_ungraft, "test correct detection and propagation of reason fo current_expected_triage_string = UNGRAFTED_ERROR; run_test("2", argc, argv); } + +T_DECL(vm_no_pager_force_unmount_evict, "test object cache eviction when not having a pager (forced unmount)", + T_META_IGNORECRASHES(".*test_vm_no_pager.*"), + T_META_ENABLED(!TARGET_OS_BRIDGE), + T_META_ASROOT(true)) +{ + current_expected_triage_string = FORCED_UNMOUNT_ERROR; + run_test("3", argc, argv); +} diff --git a/tests/vm/test_vm_no_pager_helper.c b/tests/vm/test_vm_no_pager_helper.c index c758a8129..7482a6503 100644 --- a/tests/vm/test_vm_no_pager_helper.c +++ b/tests/vm/test_vm_no_pager_helper.c @@ -17,6 +17,7 @@ #include #pragma clang diagnostic ignored "-Wformat-nonliteral" +#pragma clang diagnostic ignored "-Wformat" static int verbose = 0; @@ -277,6 +278,94 @@ forced_unmount_crash_test(void) rmdir(FUNMOUNT_MOUNT_POINT); } +static void +forced_unmount_panic_test(void) +{ + char device_identifier[128]; + char *file_path; + int test_file_fd; + int ret; + char *mapped; + + setup_unmount_image(device_identifier, sizeof(device_identifier)); + ASSERT(!disk_image_attach(FUNMOUNT_IMAGE, FUNMOUNT_MOUNT_POINT), "attaching and mounting image '%s' failed\n", FUNMOUNT_IMAGE); + + // open file for write + file_path = FUNMOUNT_FILE; + if ((test_file_fd = open(file_path, O_RDWR)) == -1) { + printf("couldn't open file '%s'\n", file_path); + } + ret = ftruncate(test_file_fd, 12); + if (ret < 0) { + printf("ftruncate() errno %d\n", errno); + } + // map it for write + mapped = mmap(0, 1024, PROT_WRITE, MAP_SHARED, test_file_fd, 0);; + if (mapped == MAP_FAILED) { + close(test_file_fd); + printf("couldn't mmap file '%s', errno %d\n", file_path, errno); + } else { + PRINTF("mmap'd file: '%s'\n", file_path); + } + // add contents to 1st page + *mapped = 'A'; + // flush page + ret = msync(mapped, 1024, MS_SYNC | MS_INVALIDATE); + if (ret < 0) { + printf("msync() error 
%d\n", errno); + } + ret = munmap(mapped, 1024); + if (ret < 0) { + printf("munmap() error %d\n", errno); + } + close(test_file_fd); + // re-open file for read only + if ((test_file_fd = open(file_path, O_RDONLY)) == -1) { + printf("couldn't open file '%s'\n", file_path); + } + // map file read-only + mapped = mmap(0, 1024, PROT_READ, MAP_SHARED, test_file_fd, 0); + if (mapped == MAP_FAILED) { + close(test_file_fd); + printf("couldn't mmap file '%s' read-only, errno %d\n", file_path, errno); + } else { + PRINTF("mmap'd file: '%s'\n", file_path); + } + // close file + close(test_file_fd); + // page 1st page back in + printf("mapped[0] = '%c'\n", mapped[0]); + // wire page + ret = mlock(mapped, 1024); + if (ret < 0) { + printf("mlock() errno %d\n", errno); + } + // force unmount + printf("force unmount...\n"); + ASSERT(!disk_image_eject(device_identifier), "Failed to force unmount device '%s'", device_identifier); + // unwire page + ret = munlock(mapped, 1024); + if (ret < 0) { + printf("munlock() errno %d\n", errno); + } + // force object cache eviction + printf("object cache evict\n"); + fflush(stdout); + ret = sysctlbyname("vm.object_cache_evict", NULL, NULL, NULL, 0); + if (ret < 0) { + printf("sysctl(vm.object_cache_evict) errno %d\n", errno); + } else { + printf("object cache eviction did not cause a panic!\n"); + } + // crash + printf("mapped[0] = '%c'\n", mapped[0]); + + // Cleanup + my_system("rm -f " FUNMOUNT_IMAGE); + disk_image_eject(device_identifier); + rmdir(FUNMOUNT_MOUNT_POINT); +} + static void create_disk_image(const char* image_path, const char* volume_name, bool use_gpt, char* device_name_out, size_t device_len, char* partition_name_out, size_t partition_len) { @@ -410,8 +499,11 @@ main(int argc, char** argv) case 2: ungraft_crash_test(); break; + case 3: + forced_unmount_panic_test(); + break; default: - printf("Invalid test number passed'n"); + printf("Invalid test number passed (%d)\n", test_to_run); exit(1); } } diff --git a/tests/vm/upl.c b/tests/vm/upl.c new file mode 100644 index 000000000..023def705 --- /dev/null +++ b/tests/vm/upl.c @@ -0,0 +1,642 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + */ + +#include +#include + +#include + +#include +#include + +#include + +#include "exc_guard_helper.h" +#include "test_utils.h" + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_OWNER("jharmening"), + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true), + T_META_ALL_VALID_ARCHS(true)); + +typedef struct { + uint64_t ptr; + uint32_t size; + char test_pattern; + bool copy_expected; + bool should_fail; + bool upl_rw; +} upl_test_args; + +T_DECL(vm_upl_ro_on_rw, + "Generate RO UPL against RW memory region") +{ + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + + upl_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .test_pattern = 'a', + .copy_expected = false, .should_fail = false, .upl_rw = false }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr = (uint64_t)buf + 0x800; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr = (uint64_t)buf + 0x1000; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + munmap(buf, buf_size); +} + +T_DECL(vm_upl_ro_on_ro, + "Generate RO UPL against RO memory region") +{ + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + + T_QUIET; T_ASSERT_POSIX_SUCCESS(mprotect(buf, buf_size, PROT_READ), "mprotect"); + + upl_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .test_pattern = 'a', + .copy_expected = false, .should_fail = false, .upl_rw = false }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr = (uint64_t)buf + 0x800; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr = (uint64_t)buf + 0x1000; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + munmap(buf, buf_size); +} + +T_DECL(vm_upl_rw_on_rw, + "Generate RW UPL against RW memory region") +{ + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + + upl_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .test_pattern = 'b', + .copy_expected = false, .should_fail = false, .upl_rw = true }; + 
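The vm_upl_* tests in this file all drive the kernel-side handler the same way: the user address of an argument struct is passed as the sysctl new-value buffer, and a 64-bit status is read back as the old value, so the code under test builds the UPL against the caller's own mapping. Below is a minimal user-space sketch of that driver pattern; the struct and helper names here are illustrative rather than taken from the patch, and it assumes a kernel that actually registers the debug.test.vm_upl node (typically only kernels built with the corresponding test code).

/*
 * Minimal sketch of the shared sysctl driver pattern, assuming the
 * debug.test.vm_upl handler is present; names below are illustrative.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/sysctl.h>

typedef struct {
	uint64_t ptr;           /* user address the handler should build the UPL against */
	uint32_t size;          /* requested UPL size in bytes */
	char     test_pattern;  /* byte pattern the handler writes or verifies */
	bool     copy_expected; /* whether the UPL is expected to be backed by a copy */
	bool     should_fail;   /* whether UPL creation is expected to fail */
	bool     upl_rw;        /* request a writable UPL */
} upl_sketch_args_t;

/*
 * Pass the address of the argument block as the sysctl "new value" and read a
 * 64-bit status back as the "old value". Returns sysctlbyname()'s return
 * value; the handler reports its own pass/fail through *result.
 */
static int
run_upl_sysctl(upl_sketch_args_t *args, int64_t *result)
{
	int64_t addr = (int64_t)(uintptr_t)args;
	size_t s = sizeof(*result);

	return sysctlbyname("debug.test.vm_upl", result, &s, &addr, sizeof(addr));
}

Each test then repeats this call at a few offsets into its buffer (base, base + 0x800, base + 0x1000) so the kernel path is exercised with unaligned, 4K-aligned, and page-aligned starting addresses.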
+ int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + T_QUIET; T_ASSERT_EQ(buf[i], (unsigned int)'b' + i, + "buf[%u]='%u' == '%u'", + i, buf[i], (unsigned int)'b' + i); + } + bzero(buf, buf_size); + args.ptr = (uint64_t)buf + 0x800; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + if ((i < (0x800 / sizeof(*buf))) || (i >= ((0x800 + args.size) / sizeof(*buf)))) { + T_QUIET; T_ASSERT_EQ(buf[i], 0, + "buf[%u]='%u' == 0", i, buf[i]); + } else { + T_QUIET; T_ASSERT_EQ(buf[i], (unsigned int)'b' + i - (unsigned int)(0x800 / sizeof(*buf)), + "buf[%u]='%u' == '%u'", + i, buf[i], (unsigned int)'b' + i - (unsigned int)(0x800 / sizeof(*buf))); + } + } + + bzero(buf, buf_size); + args.ptr = (uint64_t)buf + 0x1000; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + if ((i < (0x1000 / sizeof(*buf))) || (i >= ((0x1000 + args.size) / sizeof(*buf)))) { + T_QUIET; T_ASSERT_EQ(buf[i], 0, + "buf[%u]='%u' == 0", i, buf[i]); + } else { + T_QUIET; T_ASSERT_EQ(buf[i], (unsigned int)'b' + i - (unsigned int)(0x1000 / sizeof(*buf)), + "buf[%u]='%u' == '%u'", + i, buf[i], (unsigned int)'b' + i - (unsigned int)(0x1000 / sizeof(*buf))); + } + } + + munmap(buf, buf_size); +} + +T_DECL(vm_upl_rw_on_ro, + "Generate RW UPL against RO memory region") +{ + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(mprotect(buf, buf_size, PROT_READ), "mprotect"); + + upl_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .test_pattern = 'b', + .copy_expected = false, .should_fail = true, .upl_rw = true }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr = (uint64_t)buf + 0x800; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr = (uint64_t)buf + 0x1000; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + munmap(buf, buf_size); +} + +static bool +sptm_enabled(void) +{ + int page_protection_type, err; + size_t size = sizeof(page_protection_type); + err = sysctlbyname("kern.page_protection_type", &page_protection_type, &size, NULL, 0); + T_ASSERT_POSIX_SUCCESS(err, "sysctl(\"kern.page_protection_type\");"); + return page_protection_type == 2; +} + +T_DECL(vm_upl_ro_on_rx, + "Generate RO UPL against RX memory region") +{ + bool copy_expected = true; +#if TARGET_OS_OSX + /** + * For embedded targets, UPL creation against RX mappings should always produce a copy due to codesigning. 
+ * For MacOS, a copy should only be produced if the SPTM is enabled, due to the SPTM's stricter requirements + * for DMA mappings of executable frame types. + */ + if (!sptm_enabled()) { + copy_expected = false; + } +#endif /* TARGET_OS_OSX */ + + upl_test_args args = { .ptr = (uint64_t)__builtin_return_address(0), .size = PAGE_SIZE, .test_pattern = 'a', + .copy_expected = copy_expected, .should_fail = false, .upl_rw = false }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr += 0x100; + args.size -= 0x200; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); +} + +T_DECL(vm_upl_rw_on_rx, + "Generate RW UPL against RX memory region") +{ + upl_test_args args = { .ptr = (uint64_t)__builtin_return_address(0), .size = PAGE_SIZE, .test_pattern = 'a', + .copy_expected = true, .should_fail = true, .upl_rw = true }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr += 0x100; + args.size -= 0x200; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); +} + +T_DECL(vm_upl_ro_on_jit, + "Generate RO UPL against JIT memory region") +{ + /** + * Direct RO UPLs against JIT pages should be allowed for non-SPTM targets. + * For SPTM targets, a copy is expected due to the SPTM's stricter requirements for DMA + * mappings of executable frame types. + */ + bool copy_expected = sptm_enabled(); + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + if (os_thread_self_restrict_rwx_is_supported()) { + os_thread_self_restrict_rwx_to_rw(); + } + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + + upl_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .test_pattern = 'b', + .copy_expected = copy_expected, .should_fail = false, .upl_rw = false }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr = (uint64_t)buf + 0x800; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + args.ptr = (uint64_t)buf + 0x1000; + args.size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + munmap(buf, buf_size); +} + +T_DECL(vm_upl_rw_on_jit, + "Generate RW UPL against JIT memory region") +{ + if (process_is_translated()) { + /* TODO: Remove this once rdar://142438840 is fixed. */ + T_SKIP("Guard exception handling does not work correctly with Rosetta (rdar://142438840), skipping..."); + } + const size_t buf_size = 10 * PAGE_SIZE; + /** + * Direct RW UPLs against JIT pages should be allowed for non-SPTM targets. 
+ * For SPTM targets, UPL creation should fail due to the SPTM's stricter requirements for DMA + * mappings of executable frame types. + */ + bool should_fail = sptm_enabled(); + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + upl_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .test_pattern = 'b', + .copy_expected = false, .should_fail = should_fail, .upl_rw = true }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + + /* Ensure that guard exceptions will not be fatal to the test process. */ + enable_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY); + + /** + * Iterate 3 times to guarantee buffer offsets that are neither 4K nor 16K aligned, + * and 4K but not necessarily 16K aligned. + */ + for (int i = 0; i < 2; i++) { + exc_guard_helper_info_t exc_info; + bool caught_exception = + block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + }); + if (args.should_fail) { + T_ASSERT_TRUE(caught_exception, "Failing test should also throw guard exception"); + T_ASSERT_EQ(exc_info.guard_flavor, kGUARD_EXC_SEC_UPL_WRITE_ON_EXEC_REGION, + "Failing test throws the expected guard exception flavor"); + T_ASSERT_EQ(exc_info.catch_count, 1, "Failing test should throw exactly one guard exception"); + } else { + T_ASSERT_FALSE(caught_exception, "Passing test should not throw guard exception"); + } + + args.ptr += 0x800; + args.size -= 0x1000; + } + + munmap(buf, buf_size); +} + +T_DECL(vm_upl_ro_on_commpage, + "Generate RO UPL against comm page") +{ +#if !TARGET_OS_OSX + T_SKIP("Comm page only guaranteed to be within user address range on MacOS, skipping..."); +#else +#ifndef __arm64__ + T_SKIP("Comm page only has UPL-incompatible mapping on arm64, skipping..."); +#else + upl_test_args args = { .ptr = (uint64_t)_COMM_PAGE_START_ADDRESS, .size = 0x1000, .test_pattern = 'b', + .copy_expected = false, .should_fail = true, .upl_rw = false }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); +#endif /* !defined(__arm64__) */ +#endif /* !TARGET_OS_OSX */ +} + +T_DECL(vm_upl_partial_cow, + "Generate a UPL that requires CoW setup for part of an object") +{ + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + + /* + * Mark a portion of the buffer RO, which will split off a separate vm_map_entry backed by the same + * vm_object. This will produce an internal COPY_SYMMETRIC object with refcount > 1, which is the + * baseline requirement for partial CoW setup by vm_map_create_upl(). + */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(mprotect((char*)buf + (8 * PAGE_SIZE), 2 * PAGE_SIZE, PROT_READ), "mprotect"); + + /* + * Request a non-page-aligned UPL against the RW region of the buffer, to ensure that partial CoW + * setup still ultimately uses a page-aligned buffer as required for vm_map_entry clipping. 
+ */ + upl_test_args args = { .ptr = (uint64_t)buf + 0x800, .size = 2 * PAGE_SIZE, .test_pattern = 'b', + .copy_expected = false, .should_fail = false, .upl_rw = true }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl)"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + if ((i < (0x800 / sizeof(*buf))) || (i >= ((0x800 + args.size) / sizeof(*buf)))) { + T_QUIET; T_ASSERT_EQ(buf[i], (unsigned int)'a' + i, + "buf[%u]='%u' == '%u'", i, buf[i], (unsigned int)'a' + i); + } else { + T_QUIET; T_ASSERT_EQ(buf[i], (unsigned int)'b' + i - (unsigned int)(0x800 / sizeof(*buf)), + "buf[%u]='%u' == '%u'", + i, buf[i], (unsigned int)'b' + i - (unsigned int)(0x800 / sizeof(*buf))); + } + } + + munmap(buf, buf_size); +} + +typedef struct { + uint64_t ptr; + uint32_t size; + bool upl_rw; + bool should_fail; + bool exec_fault; +} upl_object_test_args; + +T_DECL(vm_upl_rw_on_exec_object, + "Attempt to create a writable UPL against an object containing executable pages") +{ + /** + * This test is meant to exercise functionality that is currently SPTM-specific. + * It also relies on the assumption that JIT regions are faulted in an all-or-nothing + * manner, so that the write faults generated by our buffer fill below will also + * produce executable mappings of the underlying JIT pages. This happens to hold + * true on SPTM-enabled devices because all of them use xPRR, but may not hold true + * in general. + */ + if (!sptm_enabled()) { + T_SKIP("Exec object test only supported on SPTM-enabled devices, skipping..."); + } + if (process_is_translated()) { + /* TODO: Remove this once rdar://142438840 is fixed. */ + T_SKIP("Guard exception handling does not work correctly with Rosetta (rdar://142438840), skipping..."); + } + + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + if (os_thread_self_restrict_rwx_is_supported()) { + os_thread_self_restrict_rwx_to_rw(); + } + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + + /* Ensure that guard exceptions will not be fatal to the test process. 
*/ + enable_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY); + + upl_object_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .upl_rw = true, .should_fail = true, .exec_fault = false }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + exc_guard_helper_info_t exc_info; + bool caught_exception = + block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl_object", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl_object)"); + }); + if (args.should_fail) { + T_ASSERT_TRUE(caught_exception, "Failing test should also throw guard exception"); + T_ASSERT_EQ(exc_info.guard_flavor, kGUARD_EXC_SEC_IOPL_ON_EXEC_PAGE, + "Failing test throws the expected guard exception flavor"); + T_ASSERT_EQ(exc_info.catch_count, 1, "Failing test should throw exactly one guard exception"); + } else { + T_ASSERT_FALSE(caught_exception, "Passing test should not throw guard exception"); + } +} + +T_DECL(vm_upl_ro_with_exec_fault, + "Attempt to exec-fault a region while a UPL is in-flight for that region") +{ + /** + * This test is meant to exercise functionality that is currently SPTM-specific. + */ + if (!sptm_enabled()) { + T_SKIP("Exec-fault test only supported on SPTM-enabled devices, skipping..."); + } + if (process_is_translated()) { + /* TODO: Remove this once rdar://142438840 is fixed. */ + T_SKIP("Guard exception handling does not work correctly with Rosetta (rdar://142438840), skipping..."); + } + + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + if (os_thread_self_restrict_rwx_is_supported()) { + os_thread_self_restrict_rwx_to_rw(); + } + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + if (os_thread_self_restrict_rwx_is_supported()) { + os_thread_self_restrict_rwx_to_rx(); + } + + /* Ensure that guard exceptions will not be fatal to the test process. 
*/ + enable_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY); + + upl_object_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .upl_rw = false, .should_fail = false, .exec_fault = true }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + exc_guard_helper_info_t exc_info; + bool caught_exception = + block_raised_exc_guard_of_type(GUARD_TYPE_VIRT_MEMORY, &exc_info, ^{ + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl_object", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl_object)"); + }); + T_ASSERT_TRUE(caught_exception, "Exec fault should throw guard exception"); + T_ASSERT_EQ(exc_info.guard_flavor, kGUARD_EXC_SEC_EXEC_ON_IOPL_PAGE, + "Attempted exec fault throws the expected guard exception flavor"); + T_ASSERT_EQ(exc_info.catch_count, 1, "Attempted exec fault should throw exactly one guard exception"); +} + +typedef struct { + uint64_t ptr; + uint64_t upl_base; + uint32_t size; + uint32_t upl_size; + bool upl_rw; +} upl_submap_test_args; + +T_DECL(vm_upl_ro_on_submap, + "Generate RO UPL against a submap region") +{ + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + + upl_submap_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .upl_base = 0x180000000ULL, + .upl_size = buf_size, .upl_rw = false }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl_submap", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl_submap)"); + + args.upl_base += 0x800; + args.upl_size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl_submap", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl_submap)"); + + args.upl_base += 0x800; + args.upl_size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl_submap", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl_submap)"); + + munmap(buf, buf_size); +} + +T_DECL(vm_upl_rw_on_submap, + "Generate RW UPL against a submap region") +{ + const size_t buf_size = 10 * PAGE_SIZE; + unsigned int *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_NE_PTR(buf, MAP_FAILED, "map buffer"); + + for (unsigned int i = 0; i < (buf_size / sizeof(*buf)); i++) { + buf[i] = (unsigned int)'a' + i; + } + + upl_submap_test_args args = { .ptr = (uint64_t)buf, .size = buf_size, .upl_base = 0x180000000ULL, + .upl_size = buf_size, .upl_rw = true }; + + int64_t addr = (int64_t)&args; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl_submap", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl_submap)"); + + args.upl_base += 0x800; + args.upl_size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl_submap", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl_submap"); + + args.upl_base += 0x800; + args.upl_size -= 0x1000; + + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_upl_submap", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_upl_submap)"); + + munmap(buf, buf_size); +} diff --git a/tests/vm/upl.entitlements b/tests/vm/upl.entitlements new file mode 100644 index 000000000..d87df369a --- /dev/null +++ 
b/tests/vm/upl.entitlements @@ -0,0 +1,16 @@ + + + + + dynamic-codesigning + + com.apple.security.cs.allow-jit + + com.apple.internal.map-jit-without-sandbox + + com.apple.private.set-exception-port + + com.apple.private.amfi.can-set-exception-ports + + + diff --git a/tests/vm/vectorupl.c b/tests/vm/vectorupl.c new file mode 100644 index 000000000..45807070e --- /dev/null +++ b/tests/vm/vectorupl.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + */ + +#include +#include + +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_OWNER("jharmening"), + T_META_CHECK_LEAKS(false), + T_META_RUN_CONCURRENTLY(true)); + +#define NUM_IOVS 7 + +T_DECL(vm_vector_upl, + "Test for vector UPLs", + T_META_TAG_VM_PREFERRED) +{ + struct { + uint64_t base; + uint32_t len; + } w_iovs[NUM_IOVS]; + int64_t expected_bytes; + int w, w_idx; + + T_SETUPBEGIN; + expected_bytes = 0; + for (w = 0; w < NUM_IOVS; w++) { + w_iovs[w].len = (uint32_t) ((w + 1) * (int)PAGE_SIZE); + void *iov_base; + T_QUIET; T_ASSERT_POSIX_SUCCESS(posix_memalign(&iov_base, PAGE_SIZE, w_iovs[w].len), "alloc(w_iov_base[%d])", w); + memset(iov_base, 'a' + w, w_iovs[w].len); + w_iovs[w].base = (uint64_t)iov_base; + expected_bytes += w_iovs[w].len; + } + T_SETUPEND; + + struct { + uint64_t iov; + uint16_t iovcnt; + } arg; + + arg.iov = (uint64_t) &w_iovs[0]; + arg.iovcnt = NUM_IOVS; + + int64_t addr = (int64_t)&arg; + int64_t result = 0; + size_t s = sizeof(result); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("debug.test.vm_vector_upl", &result, &s, &addr, sizeof(addr)), + "sysctlbyname(debug.test.vm_vector_upl)"); + + T_EXPECT_EQ_LLONG(result, expected_bytes, "sysctl output"); + + w = 0; + w_idx = 0; + + /* Validate that the kernel sysctl handler mapped and mutated the page contents as expected. 
*/ + for (w = 0; w < NUM_IOVS; w++) { + char *iov_base = (char*)w_iovs[w].base; + for (w_idx = 0; w_idx < w_iovs[w].len; w_idx++) { + T_QUIET; T_ASSERT_EQ(iov_base[w_idx], 'a' + w + 1, + "w_iovs[%d].iov_base[%d]='%c' == '%c'", + w, w_idx, (unsigned char)iov_base[w_idx], (unsigned char)('a' + w + 1)); + } + } + + T_PASS("%s", __FUNCTION__); +} diff --git a/tests/vm/vm_allocation.c b/tests/vm/vm_allocation.c index 09946db4d..b1143adfe 100644 --- a/tests/vm/vm_allocation.c +++ b/tests/vm/vm_allocation.c @@ -40,6 +40,7 @@ #include #include #include +#include T_GLOBAL_META( T_META_NAMESPACE("xnu.vm"), @@ -56,8 +57,6 @@ T_GLOBAL_META( /* Private interface */ /*********************/ -static const char frameworkname[] = "vm_unitester"; - /* Type for test, fixture set up and fixture tear down functions. */ typedef void (*test_fn_t)(); @@ -65,6 +64,7 @@ typedef void (*test_fn_t)(); typedef struct { const char * name; test_fn_t test; + int expected_signal; } unit_test_t; /* Test suite structure. */ @@ -78,6 +78,7 @@ typedef struct { int _quietness = 0; int _expected_signal = 0; +int _expected_vm_exc_guard_signal = 0; struct { uintmax_t numoftests; @@ -122,7 +123,7 @@ static void log_suite_info(suite_t * suite) { logr("[TEST] %s", suite->name); - logr("Number of tests: %d\n", suite->numoftests); + logr("Number of tests: %d", suite->numoftests); } static void @@ -135,13 +136,17 @@ log_suite_results(suite_t * suite, int passed_tests) static void log_test_info(unit_test_t * unit_test, unsigned test_num) { - logr("[BEGIN] #%04d: %s", test_num, unit_test->name); + if (unit_test->expected_signal) { + logr("[BEGIN] #%04d: %s, SIGNAL(%d) expected", test_num, unit_test->name, unit_test->expected_signal); + } else { + logr("[BEGIN] #%04d: %s", test_num, unit_test->name); + } } static void log_test_result(unit_test_t * unit_test, boolean_t test_passed, unsigned test_num) { - logr("[%s] #%04d: %s\n", test_passed ? "PASS" : "FAIL", test_num, unit_test->name); + logr("[%s] #%04d: %s", test_passed ? "PASS" : "FAIL", test_num, unit_test->name); } /* Run a test with fixture set up and teardown, while enforcing the @@ -156,6 +161,19 @@ run_test(suite_t * suite, unit_test_t * unit_test, unsigned test_num) suite->tear_down(); } +/* Expected signal for a test, default is 0. */ +void +set_expected_signal(int signal) +{ + _expected_signal = signal; +} + +int +get_expected_signal() +{ + return _expected_signal; +} + /* Check a child return status. 
*/ static boolean_t child_terminated_normally(int child_status) @@ -169,11 +187,15 @@ child_terminated_normally(int child_status) exit_status); } else if (!_expected_signal) { normal_exit = TRUE; + } else { + T_LOG( + "Child process unexpectedly exited with zero, " + "where signal %d is expected.", _expected_signal); } } else if (WIFSIGNALED(child_status)) { int signal = WTERMSIG(child_status); if (signal == _expected_signal || - (_expected_signal == -1 && (signal == SIGBUS || signal == SIGSEGV))) { + (_expected_signal == -1 && (signal == SIGBUS || signal == SIGSEGV || signal == SIGKILL))) { if (_quietness <= 0) { T_LOG("Child process died with expected signal " "%d.", signal); @@ -196,6 +218,7 @@ child_test_passed(suite_t * suite, unit_test_t * unit_test) { int test_status; static unsigned test_num = 0; + boolean_t use_default_expected_signal = FALSE; test_num++; @@ -208,8 +231,21 @@ child_test_passed(suite_t * suite, unit_test_t * unit_test) while (waitpid(test_pid, &test_status, 0) != test_pid) { continue; } + + /* + * Allow overriding unit_test's default expected signal + */ + if ((get_expected_signal() == 0) && + (unit_test->expected_signal != 0)) { + set_expected_signal(unit_test->expected_signal); + use_default_expected_signal = TRUE; + } boolean_t test_result = child_terminated_normally(test_status); log_test_result(unit_test, test_result, test_num); + if (use_default_expected_signal) { + set_expected_signal(0); + } + return test_result; } @@ -262,19 +298,6 @@ _run_suite(int numoftests, test_fn_t set_up, UnitTests tests, test_fn_t tear_dow * variables. Should only be used outside of the test, set up and tear * down functions. */ -/* Expected signal for a test, default is 0. */ -void -set_expected_signal(int signal) -{ - _expected_signal = signal; -} - -int -get_expected_signal() -{ - return _expected_signal; -} - /* Logging verbosity. */ void set_quietness(int value) @@ -297,13 +320,13 @@ do_nothing() void log_aggregated_results() { - T_LOG("[SUMMARY] Aggregated Test Results\n"); + T_LOG("[SUMMARY] Aggregated Test Results"); T_LOG("Total: %ju", results.numoftests); T_LOG("Passed: %ju", results.passed_tests); - T_LOG("Failed: %ju\n", results.numoftests - results.passed_tests); + T_LOG("Failed: %ju", results.numoftests - results.passed_tests); - T_QUIET; T_ASSERT_EQ(results.passed_tests, results.numoftests, - "%d passed of total %d tests", + T_ASSERT_EQ(results.passed_tests, results.numoftests, + "%ju passed of total %ju tests", results.passed_tests, results.numoftests); } @@ -326,67 +349,47 @@ log_aggregated_results() static int vm_address_size = sizeof(mach_vm_address_t); -static char *progname = ""; - -/*************************/ -/* xnu version functions */ -/*************************/ - -/* Find the xnu version string. */ -char * -xnu_version_string() -{ - size_t length; - int mib[2]; - mib[0] = CTL_KERN; - mib[1] = KERN_VERSION; - - T_QUIET; - T_ASSERT_POSIX_SUCCESS(sysctl(mib, 2, NULL, &length, NULL, 0), "sysctl()"); - char * version = (char *)malloc(length); - T_QUIET; - T_WITH_ERRNO; - T_ASSERT_NOTNULL(version, "malloc()"); - T_QUIET; - T_EXPECT_POSIX_SUCCESS(sysctl(mib, 2, version, &length, NULL, 0), "sysctl()"); - if (T_RESULT == T_RESULT_FAIL) { - free(version); - T_END; - } - char * xnu_string = strstr(version, "xnu-"); - free(version); - T_QUIET; - T_ASSERT_NOTNULL(xnu_string, "%s: error finding xnu version string.", progname); - return xnu_string; -} - -/* Find the xnu major version number. 
*/ -unsigned int -xnu_major_version() -{ - char * endptr; - char * xnu_substring = xnu_version_string() + 4; - - errno = 0; - unsigned int xnu_version = strtoul(xnu_substring, &endptr, 0); - T_QUIET; - T_ASSERT_TRUE((errno != ERANGE && endptr != xnu_substring), - "%s: error finding xnu major version number.", progname); - return xnu_version; -} - /*************************/ /* Mach assert functions */ /*************************/ -static inline void -assert_mach_return(kern_return_t kr, kern_return_t expected_kr, const char * mach_routine) +#define assert_mach_return(kr, expected_kr, format, ...) \ + do { \ + /* fixme T_QUIET is not working */ \ + if ((kr) != (expected_kr)) { \ + T_QUIET; T_ASSERT_MACH_ERROR(kr, expected_kr, format, ##__VA_ARGS__); \ + } \ + } while (0) + +#define assert_mach_success(kr, format, ...) \ + do { \ + /* fixme T_QUIET is not working */ \ + if (kr != KERN_SUCCESS) { \ + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, format, ##__VA_ARGS__); \ + } \ + } while (0) \ + +#define assert_mach_failure(kr, format, ...) \ + do { \ + /* fixme T_QUIET is not working */ \ + if (kr == KERN_SUCCESS) { \ + T_QUIET; T_ASSERT_NE(kr, KERN_SUCCESS, format, ##__VA_ARGS__); \ + } \ + } while (0) \ + +/* Determine if TASK_EXC_GUARD_VM_FATAL is enabled for task */ +static boolean_t +get_task_exc_guard_vm_fatal(void) { - T_QUIET; T_ASSERT_EQ(kr, expected_kr, - "%s unexpectedly returned: %s." - "Should have returned: %s.", - mach_routine, mach_error_string(kr), - mach_error_string(expected_kr)); + task_exc_guard_behavior_t behavior; + + assert_mach_success(task_get_exc_guard_behavior(mach_task_self(), &behavior), "task_get_exc_guard_behavior"); + if ((behavior & TASK_EXC_GUARD_VM_DELIVER) && + (behavior & TASK_EXC_GUARD_VM_FATAL)) { + return TRUE; + } else { + return FALSE; + } } /*******************************/ @@ -467,7 +470,7 @@ memory_entry(mach_vm_size_t * size, mach_port_t *object_handle) } T_QUIET; T_ASSERT_EQ(*size, round_page(original_size), "mach_make_memory_entry_64() unexpectedly returned a named " - "entry of size 0x%jx (%ju).\n" + "entry of size 0x%jx (%ju). " "Should have returned a " "named entry of size 0x%jx (%ju).", (uintmax_t)*size, (uintmax_t)*size, (uintmax_t)original_size, (uintmax_t)original_size); @@ -486,7 +489,7 @@ wrapper_mach_vm_map_named_entry(vm_map_t map, mach_vm_address_t * address, mach_ check_fixed_address(address, size); kr = mach_vm_map(map, address, size, (mach_vm_offset_t)0, flags, object_handle, (memory_object_offset_t)0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_port_deallocate(mach_task_self(), object_handle), "mach_port_deallocate()"); + assert_mach_success(mach_port_deallocate(mach_task_self(), object_handle), "mach_port_deallocate()"); return kr; } @@ -586,11 +589,13 @@ static mach_vm_size_t _vm_size = DEFAULT_VM_SIZE; static int _address_flag = VM_FLAGS_ANYWHERE; static boolean_t _address_alignment = TRUE; static mach_vm_address_t _vm_address = 0x0; +static mach_vm_address_t _already_deallocated_vm_page = 0x0; /* Buffer for mach_vm_write(). */ static mach_vm_size_t _buffer_size = DEFAULT_VM_SIZE; static mach_vm_address_t _buffer_address = 0x0; static int _buffer_offset = 0; +static mach_vm_address_t _already_deallocated_buffer_page = 0x0; /* Post action for mach_vm_copy(). 
*/ static int _vmcopy_post_action = VMCOPY_MODIFY_SRC; @@ -655,6 +660,18 @@ get_vm_address() return _vm_address; } +static void +set_already_deallocated_vm_page(mach_vm_address_t address) +{ + _already_deallocated_vm_page = address; +} + +static mach_vm_address_t +get_already_deallocated_vm_page() +{ + return _already_deallocated_vm_page; +} + static void set_buffer_size(mach_vm_size_t size) { @@ -691,6 +708,18 @@ get_buffer_offset() return _buffer_offset; } +static void +set_already_deallocated_buffer_page(mach_vm_address_t address) +{ + _already_deallocated_buffer_page = address; +} + +static mach_vm_address_t +get_already_deallocated_buffer_page() +{ + return _already_deallocated_buffer_page; +} + static void set_vmcopy_post_action(int action) { @@ -745,14 +774,6 @@ static test_info_t test_info[] = { {NULL, NULL} }; -static void -die_on_invalid_value(int condition, const char * value_string) -{ - T_QUIET; - T_ASSERT_EQ(condition, 0, "%s: invalid value: %s.", - progname, value_string); -} - static void process_options(test_option_t options) { @@ -762,9 +783,12 @@ process_options(test_option_t options) set_vm_size(DEFAULT_VM_SIZE); set_quietness(DEFAULT_QUIETNESS); + if (get_task_exc_guard_vm_fatal()) { + _expected_vm_exc_guard_signal = SIGKILL; + } if (NULL != getenv("LTERDOS")) { - logr("LTERDOS=YES this is LeanTestEnvironment\nIncreasing quietness by 1."); + logr("LTERDOS=YES this is LeanTestEnvironment. Increasing quietness by 1."); set_quietness(get_quietness() + 1); } else { if (options.to_quietness > 0) { @@ -841,7 +865,7 @@ aligned_size(mach_vm_address_t address, mach_vm_size_t size) static inline void assert_aligned_address(mach_vm_address_t address) { - T_QUIET; T_ASSERT_EQ((address & get_mask()), 0, + T_QUIET; T_ASSERT_EQ((address & get_mask()), 0ull, "Address 0x%jx is unexpectedly " "unaligned.", (uintmax_t)address); @@ -908,7 +932,7 @@ void assert_read_success(mach_vm_address_t address, mach_vm_size_t size, vm_offset_t * data, mach_msg_type_number_t * data_size) { assert_read_return(address, size, data, data_size, KERN_SUCCESS); - T_QUIET; T_ASSERT_EQ(*data_size, size, + T_QUIET; T_ASSERT_EQ((mach_vm_size_t)*data_size, size, "Returned buffer size 0x%jx " "(%ju) is unexpectedly different from source size 0x%jx " "(%ju).", @@ -951,6 +975,13 @@ assert_copy_success(mach_vm_address_t source, mach_vm_size_t size, mach_vm_addre assert_copy_return(source, size, dest, KERN_SUCCESS); } +void +assert_copy_failure(mach_vm_address_t source, mach_vm_size_t size, mach_vm_address_t dest) +{ + assert_mach_failure(mach_vm_copy(mach_task_self(), source, size, dest), "mach_vm_copy()"); +} + + /*******************/ /* Memory patterns */ /*******************/ @@ -1058,8 +1089,8 @@ get_fixed_address(mach_vm_size_t size) * cause malloc() to use the desired range and tests will randomly fail. The allocate routines will * do the delayed vm_deallocate() to free the fixed memory just before allocation testing in the wrapper. 
*/ - T_QUIET; T_ASSERT_EQ(fixed_vm_address, 0, "previous fixed address not used"); - T_QUIET; T_ASSERT_EQ(fixed_vm_size, 0, "previous fixed size not used"); + T_QUIET; T_ASSERT_EQ(fixed_vm_address, 0ull, "previous fixed address not used"); + T_QUIET; T_ASSERT_EQ(fixed_vm_size, 0ull, "previous fixed size not used"); fixed_vm_address = address; fixed_vm_size = size; @@ -1164,6 +1195,18 @@ set_up_copy_shared_mode_variables() /* Allocation set up functions */ /*******************************/ +static void +log_allocation(mach_vm_size_t size, int flags, mach_vm_address_t address, const char *message) +{ + if (flags & VM_FLAGS_ANYWHERE) { + logv("Allocating 0x%jx (%ju) byte%s %s...", + (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s", message); + } else { + logv("Allocating 0x%jx (%ju) byte%s at address 0x%jx %s...", + (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s", (uintmax_t)address, message); + } +} + /* Allocate VM region of given size. */ void allocate(mach_vm_size_t size) @@ -1171,11 +1214,7 @@ allocate(mach_vm_size_t size) mach_vm_address_t address = get_vm_address(); int flag = get_address_flag(); - logv("Allocating 0x%jx (%ju) byte%s", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s"); - if (!(flag & VM_FLAGS_ANYWHERE)) { - logv(" at address 0x%jx", (uintmax_t)address); - } - logv("..."); + log_allocation(size, flag, address, ""); assert_allocate_success(&address, size, flag); logv( "Memory of rounded size 0x%jx (%ju) allocated at " @@ -1192,6 +1231,7 @@ allocate(mach_vm_size_t size) (uintmax_t)old_address, (uintmax_t)address); } set_vm_address(address); + set_already_deallocated_vm_page(0x0); } void @@ -1199,19 +1239,20 @@ allocate_buffer(mach_vm_size_t buffer_size) { mach_vm_address_t data = 0x0; - logv("Allocating 0x%jx (%ju) byte%s...", (uintmax_t)buffer_size, (uintmax_t)buffer_size, (buffer_size == 1) ? "" : "s"); + log_allocation(buffer_size, VM_FLAGS_ANYWHERE, 0, ""); assert_allocate_success(&data, buffer_size, VM_FLAGS_ANYWHERE); logv( "Memory of rounded size 0x%jx (%ju) allocated at " "address 0x%jx.", (uintmax_t)round_page(buffer_size), (uintmax_t)round_page(buffer_size), (uintmax_t)data); data += get_buffer_offset(); - T_QUIET; T_ASSERT_EQ((vm_offset_t)data, data, + T_QUIET; T_ASSERT_EQ((mach_vm_address_t)(vm_offset_t)data, data, "Address 0x%jx " "unexpectedly overflows to 0x%jx when cast as " "vm_offset_t type.", (uintmax_t)data, (uintmax_t)(vm_offset_t)data); set_buffer_address(data); + set_already_deallocated_buffer_page(0x0); } /****************************************************/ @@ -1287,10 +1328,31 @@ deallocate_range(mach_vm_address_t address, mach_vm_size_t size) assert_deallocate_success(address, size); } +/* + * Same as deallocate_range, buf if already_deallocated_address + * is not zero then that page of memory is not deallocated. 
+ */ +void +deallocate_range_except_page(mach_vm_address_t address, mach_vm_size_t size, + mach_vm_address_t already_deallocated_address) +{ + if (already_deallocated_address != 0) { + mach_vm_address_t end = mach_vm_round_page(address + size); + mach_vm_address_t already_deallocated_end = already_deallocated_address + vm_page_size; + deallocate_range(address, already_deallocated_address - address); + logv("Skipping already-deallocated page at 0x%jx (%ju bytes)", + (uintmax_t)already_deallocated_address, (uintmax_t)vm_page_size); + deallocate_range(already_deallocated_end, end - already_deallocated_end); + } else { + deallocate_range(address, size); + } +} + void deallocate() { - deallocate_range(get_vm_address(), get_vm_size()); + deallocate_range_except_page(get_vm_address(), get_vm_size(), get_already_deallocated_vm_page()); + set_already_deallocated_vm_page(0x0); } /* Deallocate source memory, including the extra page for unaligned @@ -1300,7 +1362,11 @@ deallocate_extra_page() { /* Set the address and size to their original allocation * values. */ - deallocate_range(mach_vm_trunc_page(get_vm_address()), get_vm_size() + 1); + deallocate_range_except_page( + mach_vm_trunc_page(get_vm_address()), + get_vm_size() + 1, + get_already_deallocated_vm_page()); + set_already_deallocated_vm_page(0x0); } /* Deallocate buffer and destination memory for mach_vm_write(), @@ -1308,8 +1374,58 @@ deallocate_extra_page() void deallocate_vm_and_buffer() { - deallocate_range(mach_vm_trunc_page(get_vm_address()), get_vm_size() + 1); - deallocate_range(mach_vm_trunc_page(get_buffer_address()), get_buffer_size() + get_buffer_offset()); + deallocate_range_except_page( + mach_vm_trunc_page(get_vm_address()), + get_vm_size() + 1, + get_already_deallocated_vm_page()); + set_already_deallocated_vm_page(0x0); + + deallocate_range_except_page( + mach_vm_trunc_page(get_buffer_address()), + get_buffer_size() + get_buffer_offset(), + get_already_deallocated_buffer_page()); + set_already_deallocated_buffer_page(0x0); +} + +/* + * Deallocate vm_page_size bytes within the source memory. + * Later deallocate() or deallocate_extra_page() or deallocate_vm_and_buffer() + * will not deallocate it again. + */ +void +deallocate_vm_page_early(mach_vm_address_t address) +{ + mach_vm_address_t vm_start = mach_vm_trunc_page(get_vm_address()); + mach_vm_address_t vm_end = mach_vm_round_page(vm_start + get_vm_size() + 1); + T_QUIET; T_ASSERT_EQ(get_already_deallocated_vm_page(), 0ull, + "deallocate_vm_page_early can only be used once per test"); + T_QUIET; T_ASSERT_EQ(address, mach_vm_trunc_page(address), + "deallocate_vm_page_early address must be page aligned"); + T_QUIET; T_ASSERT_TRUE(address >= vm_start && address + vm_page_size <= vm_end, + "deallocate_vm_page_early address must be within source memory"); + + assert_deallocate_success(address, vm_page_size); + set_already_deallocated_vm_page(address); +} + +/* + * Deallocate vm_page_size bytes within the mach_vm_write() buffer. + * Later deallocate_vm_and_buffer() will not deallocate it again. 
+ */ +void +deallocate_buffer_page_early(mach_vm_address_t address) +{ + mach_vm_address_t buffer_start = mach_vm_trunc_page(get_buffer_address()); + mach_vm_address_t buffer_end = mach_vm_round_page(buffer_start + get_buffer_size() + get_buffer_offset()); + T_QUIET; T_ASSERT_EQ(get_already_deallocated_buffer_page(), 0ull, + "deallocate_buffer_page_early can only be used once per test"); + T_QUIET; T_ASSERT_EQ(address, mach_vm_trunc_page(address), + "deallocate_buffer_page_early address must be page aligned"); + T_QUIET; T_ASSERT_TRUE(address >= buffer_start && address + vm_page_size <= buffer_end, + "deallocate_buffer_page_early address must be within buffer memory"); + + assert_deallocate_success(address, vm_page_size); + set_already_deallocated_buffer_page(address); } /***********************************/ @@ -1340,11 +1456,11 @@ read_deallocate() /* Promoting to mach_vm types after checking for overflow, and * setting the global address from the buffer's. */ - T_QUIET; T_ASSERT_EQ((mach_vm_address_t)read_address, read_address, + T_QUIET; T_ASSERT_EQ((vm_offset_t)(mach_vm_address_t)read_address, read_address, "Address 0x%jx unexpectedly overflows to 0x%jx when cast " "as mach_vm_address_t type.", (uintmax_t)read_address, (uintmax_t)(mach_vm_address_t)read_address); - T_QUIET; T_ASSERT_EQ((mach_vm_size_t)read_size, read_size, + T_QUIET; T_ASSERT_EQ((mach_msg_type_number_t)(mach_vm_size_t)read_size, read_size, "Size 0x%jx (%ju) unexpectedly overflows to 0x%jx (%ju) " "when cast as mach_vm_size_t type.", (uintmax_t)read_size, (uintmax_t)read_size, (uintmax_t)(mach_vm_size_t)read_size, (uintmax_t)(mach_vm_size_t)read_size); @@ -1414,7 +1530,7 @@ copy_deallocate(void) deallocate_range(mach_vm_trunc_page(source), size + 1); /* Promoting to mach_vm types after checking for overflow, and * setting the global address from the buffer's. */ - T_QUIET; T_ASSERT_EQ((vm_offset_t)dest, dest, + T_QUIET; T_ASSERT_EQ((mach_vm_address_t)(vm_offset_t)dest, dest, "Address 0x%jx unexpectedly overflows to 0x%jx when cast " "as mach_vm_address_t type.", (uintmax_t)dest, (uintmax_t)(vm_offset_t)dest); @@ -1471,7 +1587,7 @@ set_up_vm_variables_allocate_protect(vm_prot_t protection, const char * protecti "Setting %s-protection on 0x%jx (%ju) byte%s at address " "0x%jx...", protection_name, (uintmax_t)size, (uintmax_t)size, (size == 1) ? 
"" : "s", (uintmax_t)address); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), address, size, FALSE, protection), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), address, size, FALSE, protection), "mach_vm_protect()"); logv("Region %s-protected.", protection_name); } @@ -1538,7 +1654,7 @@ write_pattern( "and size 0x%jx (%ju)...", pattern_name, (uintmax_t)address, (uintmax_t)size, (uintmax_t)size); filter_addresses_do_else(filter, reversed, address, size, write_address, no_action, address); - logv("Pattern writen."); + logv("Pattern written."); } void @@ -1615,11 +1731,9 @@ test_reallocate_pages() (uintmax_t)address, (uintmax_t)size, (uintmax_t)size); for (i = address; i < address + size; i += vm_page_size) { kr = allocator(this_task, &i, vm_page_size, VM_FLAGS_FIXED); - T_QUIET; T_ASSERT_EQ(kr, KERN_NO_SPACE, - "Allocator " - "at address 0x%jx unexpectedly returned: %s.\n" - "Should have returned: %s.", - (uintmax_t)address, mach_error_string(kr), mach_error_string(KERN_NO_SPACE)); + assert_mach_return(kr, KERN_NO_SPACE, + "Allocator at address 0x%jx expected KERN_NO_SPACE", + (uintmax_t)address); } logv("Returned expected error at each page: %s.", mach_error_string(KERN_NO_SPACE)); } @@ -1632,11 +1746,7 @@ test_allocate_in_null_map() mach_vm_size_t size = get_vm_size(); int flag = get_address_flag(); - logv("Allocating 0x%jx (%ju) byte%s", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s"); - if (!(flag & VM_FLAGS_ANYWHERE)) { - logv(" at address 0x%jx", (uintmax_t)address); - } - logv(" in NULL VM map..."); + log_allocation(size, flag, address, "in NULL VM map"); assert_mach_return(get_allocator()(VM_MAP_NULL, &address, size, flag), MACH_SEND_INVALID_DEST, "Allocator"); logv("Returned expected error: %s.", mach_error_string(MACH_SEND_INVALID_DEST)); } @@ -1654,11 +1764,7 @@ test_allocate_with_kernel_flags() kern_return_t kr; int valid_flags = VM_FLAGS_USER_ALLOCATE | VM_FLAGS_USER_MAP | VM_FLAGS_USER_REMAP | VM_FLAGS_ALIAS_MASK; - logv("Allocating 0x%jx (%ju) byte%s", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s"); - if (!(flag & VM_FLAGS_ANYWHERE)) { - logv(" at address 0x%jx", (uintmax_t)address); - } - logv(" with various invalid flags..."); + log_allocation(size, flag, address, "with various invalid flags"); for (i = 0; i < sizeof(int) * 8; i++) { int test_flag = 1 << i; @@ -1669,11 +1775,9 @@ test_allocate_with_kernel_flags() bad_flag = test_flag | flag; kr = allocator(this_task, &address, size, bad_flag); - T_QUIET; T_ASSERT_EQ(kr, KERN_INVALID_ARGUMENT, - "Allocator " - "with invalid flag 0x%x unexpectedly returned: %s.\n" - "Should have returned: %s.", - bad_flag, mach_error_string(kr), mach_error_string(KERN_INVALID_ARGUMENT)); + assert_mach_return(kr, KERN_INVALID_ARGUMENT, + "Allocator with invalid flag 0x%x expected KERN_INVALID_ARGUMENT.", + bad_flag); } logv("Returned expected error with each invalid flag: %s.", mach_error_string(KERN_INVALID_ARGUMENT)); } @@ -1682,37 +1786,6 @@ test_allocate_with_kernel_flags() void test_allocate_superpage_with_incompatible_flags() { - allocate_fn_t allocator = get_allocator(); - vm_map_t this_task = mach_task_self(); - mach_vm_address_t address = get_vm_address(); - mach_vm_size_t size = get_vm_size(); - int flag = get_address_flag(); - int bad_flag, i; - kern_return_t kr; - int incompatible_flags = VM_FLAGS_PURGABLE | VM_FLAGS_TPRO; - - logv("Allocating 0x%jx (%ju) byte%s", (uintmax_t)size, (uintmax_t)size, (size == 1) ? 
"" : "s"); - if (!(flag & VM_FLAGS_ANYWHERE)) { - logv(" at address 0x%jx", (uintmax_t)address); - } - logv(" with various incompatible flags..."); - for (i = 0; i < sizeof(int) * 8; i++) { - int test_flag = 1 << i; - - /* Skip compatible flags */ - if (!(incompatible_flags & test_flag)) { - continue; - } - - bad_flag = test_flag | flag | VM_FLAGS_SUPERPAGE_SIZE_ANY; - kr = allocator(this_task, &address, size, bad_flag); - T_QUIET; T_ASSERT_EQ(kr, KERN_INVALID_ARGUMENT, - "Allocator " - "with invalid flag 0x%x unexpectedly returned: %s.\n" - "Should have returned: %s.", - bad_flag, mach_error_string(kr), mach_error_string(KERN_INVALID_ARGUMENT)); - } - logv("Returned expected error with each invalid flag: %s.", mach_error_string(KERN_INVALID_ARGUMENT)); } /*****************************/ @@ -1733,21 +1806,15 @@ test_mach_vm_map_protection_inheritance_error() : (mach_vm_offset_t)get_mask(); int flag = get_address_flag(); mach_port_t object_handle = MACH_PORT_NULL; - vm_prot_t cur_protections[] = {VM_PROT_DEFAULT, VM_PROT_ALL + 1, ~VM_PROT_IS_MASK, INT_MAX}; - vm_prot_t max_protections[] = {VM_PROT_ALL, VM_PROT_ALL + 1, ~VM_PROT_IS_MASK, INT_MAX}; + vm_prot_t cur_protections[] = {VM_PROT_DEFAULT, (VM_PROT_ALL | VM_PROT_ALLEXEC) + 1, ~VM_PROT_IS_MASK, INT_MAX}; + vm_prot_t max_protections[] = {VM_PROT_ALL, (VM_PROT_ALL | VM_PROT_ALLEXEC) + 1, ~VM_PROT_IS_MASK, INT_MAX}; vm_inherit_t inheritances[] = {VM_INHERIT_DEFAULT, VM_INHERIT_LAST_VALID + 1, UINT_MAX}; int i, j, k; if (get_allocator() == wrapper_mach_vm_map_named_entry) { assert_mach_success(memory_entry(&size, &object_handle), "mach_make_memory_entry_64()"); } - logv("Allocating 0x%jx (%ju) byte%s", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s"); - if (!(flag & VM_FLAGS_ANYWHERE)) { - logv(" at address 0x%jx", (uintmax_t)address); - } - logv( - " with various invalid protection/inheritance " - "arguments..."); + log_allocation(size, flag, address, "with various invalid protection/inheritance arguments"); for (i = 0; i < 4; i++) { for (j = 0; j < 4; j++) { @@ -1758,13 +1825,11 @@ test_mach_vm_map_protection_inheritance_error() } kr = mach_vm_map(my_task, &address, size, mask, flag, object_handle, (memory_object_offset_t)0, FALSE, cur_protections[i], max_protections[j], inheritances[k]); - T_QUIET; T_ASSERT_EQ(kr, KERN_INVALID_ARGUMENT, + assert_mach_return(kr, KERN_INVALID_ARGUMENT, "mach_vm_map() " "with cur_protection 0x%x, max_protection 0x%x, " - "inheritance 0x%x unexpectedly returned: %s.\n" - "Should have returned: %s.", - cur_protections[i], max_protections[j], inheritances[k], mach_error_string(kr), - mach_error_string(KERN_INVALID_ARGUMENT)); + "inheritance 0x%x expected KERN_INVALID_ARGUMENT", + cur_protections[i], max_protections[j], inheritances[k]); } } } @@ -1868,9 +1933,9 @@ test_allocate_at_zero() assert_allocate_return(&address, size, VM_FLAGS_FIXED, kr_expected); logv("Returned expected value: %s.", mach_error_string(kr_expected)); if (kr_expected == KERN_SUCCESS) { - T_QUIET; T_ASSERT_EQ(address, 0, + T_QUIET; T_ASSERT_EQ(address, 0ull, "Address 0x%jx is unexpectedly " - "nonzero.\n", + "nonzero.", (uintmax_t)address); logv("Allocated address 0x%jx is zero.", (uintmax_t)address); deallocate_range(address, size); @@ -1962,7 +2027,7 @@ test_allocate_first_fit_pages() logv("Allocating pages between 0x%jx and 0x%jx...", (uintmax_t)address1, (uintmax_t)address2); for (i = address1; i <= address2; i += vm_page_size) { kr = allocator(this_task, &i, vm_page_size, VM_FLAGS_FIXED); - T_QUIET; T_ASSERT_NE(kr, 
KERN_SUCCESS, + assert_mach_failure(kr, "Allocator at address 0x%jx " "unexpectedly succeeded.", (uintmax_t)i); @@ -1986,7 +2051,7 @@ access_deallocated_range_address(mach_vm_address_t address, const char * positio logv("Will deallocate and read from %s 0x%jx of deallocated range...", position, (uintmax_t)address); deallocate(); mach_vm_address_t bad_value = MACH_VM_ADDRESS_T(address); - T_ASSERT_FAIL("Unexpectedly read value 0x%jx at address 0x%jx.\n" + T_ASSERT_FAIL("Unexpectedly read value 0x%jx at address 0x%jx. " "Should have died with signal SIGSEGV.", (uintmax_t)bad_value, (uintmax_t)address); } @@ -2024,7 +2089,7 @@ test_deallocate_suicide() logv("Deallocating 0x%jx (%ju) bytes at address 0x%jx...", (uintmax_t)size, (uintmax_t)size, (uintmax_t)address); kern_return_t kr = mach_vm_deallocate(mach_task_self(), address, size); T_ASSERT_FAIL("mach_vm_deallocate() with address 0x%jx and " - "size 0x%jx (%ju) unexpectedly returned: %s.\n" + "size 0x%jx (%ju) unexpectedly returned: %s. " "Should have died with signal SIGSEGV or SIGBUS.", (uintmax_t)address, (uintmax_t)size, (uintmax_t)size, mach_error_string(kr)); } @@ -2084,46 +2149,9 @@ test_deallocate_zero_size_ranges() logv("Deallocating 0x0 (0) bytes at various addresses..."); for (i = 0; i < numofaddresses; i++) { kr = mach_vm_deallocate(this_task, addresses[i], 0); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_deallocate() at " - "address 0x%jx unexpectedly failed: %s.", - (uintmax_t)addresses[i], mach_error_string(kr)); - } - logv("Deallocations successful."); -} - -/* Deallocation succeeds if the end of the range rounds to 0x0. */ -void -test_deallocate_rounded_zero_end_ranges() -{ - int i; - kern_return_t kr; - vm_map_t this_task = mach_task_self(); - struct { - mach_vm_address_t address; - mach_vm_size_t size; - } ranges[] = { - {0x0, (mach_vm_size_t)UINTMAX_MAX}, - {0x0, (mach_vm_size_t)UINTMAX_MAX - vm_page_size + 2}, - {0x1, (mach_vm_size_t)UINTMAX_MAX - 1}, - {0x1, (mach_vm_size_t)UINTMAX_MAX - vm_page_size + 1}, - {0x2, (mach_vm_size_t)UINTMAX_MAX - 2}, - {0x2, (mach_vm_size_t)UINTMAX_MAX - vm_page_size}, - {(mach_vm_address_t)UINTMAX_MAX - vm_page_size + 1, vm_page_size - 1}, - {(mach_vm_address_t)UINTMAX_MAX - vm_page_size + 1, 1}, - {(mach_vm_address_t)UINTMAX_MAX - 1, 1}, - }; - int numofranges = sizeof(ranges) / sizeof(ranges[0]); - - logv( - "Deallocating various memory ranges whose end rounds to " - "0x0..."); - for (i = 0; i < numofranges; i++) { - kr = mach_vm_deallocate(this_task, ranges[i].address, ranges[i].size); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, - "mach_vm_deallocate() with address 0x%jx and size " - "0x%jx (%ju) unexpectedly returned: %s.\n" - "Should have succeeded.", - (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, (uintmax_t)ranges[i].size, mach_error_string(kr)); + assert_mach_success(kr, "mach_vm_deallocate() at " + "address 0x%jx unexpectedly failed", + (uintmax_t)addresses[i]); } logv("Deallocations successful."); } @@ -2149,14 +2177,14 @@ test_deallocate_wrapped_around_ranges() logv( "Deallocating various memory ranges wrapping around the " "address space..."); + for (i = 0; i < numofranges; i++) { kr = mach_vm_deallocate(this_task, ranges[i].address, ranges[i].size); - T_QUIET; T_ASSERT_EQ(kr, KERN_INVALID_ARGUMENT, + assert_mach_return(kr, KERN_INVALID_ARGUMENT, "mach_vm_deallocate() with address 0x%jx and size " - "0x%jx (%ju) unexpectedly returned: %s.\n" - "Should have returned: %s.", - (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, (uintmax_t)ranges[i].size, 
mach_error_string(kr), - mach_error_string(KERN_INVALID_ARGUMENT)); + "0x%jx (%ju) expected KERN_INVALID_ARGUMENT", + (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, + (uintmax_t)ranges[i].size); } logv("Returned expected error on each range: %s.", mach_error_string(KERN_INVALID_ARGUMENT)); } @@ -2167,13 +2195,9 @@ test_deallocate_in_null_map() { mach_vm_address_t address = get_vm_address(); mach_vm_size_t size = get_vm_size(); - int flag = get_address_flag(); - logv("Deallocating 0x%jx (%ju) byte%s", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s"); - if (!(flag & VM_FLAGS_ANYWHERE)) { - logv(" at address 0x%jx", (uintmax_t)address); - } - logv(" in NULL VM map..."); + logv("Deallocating 0x%jx (%ju) byte%s at address 0x%jx in NULL VM map...", + (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s", (uintmax_t)address); assert_mach_return(mach_vm_deallocate(VM_MAP_NULL, address, size), MACH_SEND_INVALID_DEST, "mach_vm_deallocate()"); logv("Returned expected error: %s.", mach_error_string(MACH_SEND_INVALID_DEST)); } @@ -2195,7 +2219,7 @@ test_read_address_offset() assert_aligned_address(address); logv("Buffer address 0x%jx is aligned as expected.", (uintmax_t)address); } else { - T_QUIET; T_ASSERT_EQ(((address - 1) & (vm_page_size - 1)), 0, + T_QUIET; T_ASSERT_EQ(((address - 1) & (vm_page_size - 1)), 0ull, "Buffer " "address 0x%jx does not have the expected boundary " "offset of 1.", @@ -2220,8 +2244,8 @@ test_read_null_map() "Reading 0x%jx (%ju) byte%s at address 0x%jx in NULL VM " "map...", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s", (uintmax_t)address); - assert_mach_return(mach_vm_read(VM_MAP_NULL, address, size, &read_address, &read_size), MACH_SEND_INVALID_DEST, - "mach_vm_read()"); + assert_mach_return(mach_vm_read(VM_MAP_NULL, address, size, &read_address, &read_size), + MACH_SEND_INVALID_DEST, "mach_vm_read()"); logv("Returned expected error: %s.", mach_error_string(MACH_SEND_INVALID_DEST)); } @@ -2236,7 +2260,7 @@ test_read_partially_deallocated_range() mach_msg_type_number_t read_size; logv("Deallocating a mid-range page at address 0x%jx...", (uintmax_t)mid_point); - assert_deallocate_success(mid_point, vm_page_size); + deallocate_vm_page_early(mid_point); logv("Page deallocated."); logv("Reading 0x%jx (%ju) byte%s at address 0x%jx...", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s", @@ -2261,7 +2285,7 @@ test_read_partially_unreadable_range() kern_return_t kr_expected = (size < vm_page_size * 2) ? KERN_INVALID_ADDRESS : KERN_PROTECTION_FAILURE; logv("Read-protecting a mid-range page at address 0x%jx...", (uintmax_t)mid_point); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), mid_point, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), mid_point, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); logv("Page read-protected."); logv("Reading 0x%jx (%ju) byte%s at address 0x%jx...", (uintmax_t)size, (uintmax_t)size, (size == 1) ? 
"" : "s", @@ -2294,11 +2318,10 @@ read_edge_size(mach_vm_size_t size, kern_return_t expected_kr) logv("Reading 0x%jx (%ju) bytes at various addresses...", (uintmax_t)size, (uintmax_t)size); for (i = 0; i < numofaddresses; i++) { kr = mach_vm_read(this_task, addresses[i], size, &read_address, &read_size); - T_QUIET; T_ASSERT_EQ(kr, expected_kr, + assert_mach_return(kr, expected_kr, "mach_vm_read() at " - "address 0x%jx unexpectedly returned: %s.\n" - "Should have returned: %s.", - (uintmax_t)addresses[i], mach_error_string(kr), mach_error_string(expected_kr)); + "address 0x%jx expected %s", + (uintmax_t)addresses[i], mach_error_string(expected_kr)); } logv( "mach_vm_read() returned expected value in each case: " @@ -2345,12 +2368,10 @@ test_read_wrapped_around_ranges() "address space..."); for (i = 0; i < numofranges; i++) { kr = mach_vm_read(this_task, ranges[i].address, ranges[i].size, &read_address, &read_size); - T_QUIET; T_ASSERT_EQ(kr, KERN_INVALID_ADDRESS, + assert_mach_return(kr, KERN_INVALID_ADDRESS, "mach_vm_read() at address 0x%jx with size " - "0x%jx (%ju) unexpectedly returned: %s.\n" - "Should have returned: %s.", - (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, (uintmax_t)ranges[i].size, mach_error_string(kr), - mach_error_string(KERN_INVALID_ADDRESS)); + "0x%jx (%ju) expected KERN_INVALID_ADDRESS", + (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, (uintmax_t)ranges[i].size); } logv("Returned expected error on each range: %s.", mach_error_string(KERN_INVALID_ADDRESS)); } @@ -2442,7 +2463,7 @@ test_write_partially_deallocated_buffer() "Deallocating a mid-range buffer page at address " "0x%jx...", (uintmax_t)buffer_mid_point); - assert_deallocate_success(buffer_mid_point, vm_page_size); + deallocate_buffer_page_early(buffer_mid_point); logv("Page deallocated."); logv( @@ -2466,7 +2487,7 @@ test_write_partially_unreadable_buffer() "Read-protecting a mid-range buffer page at address " "0x%jx...", (uintmax_t)buffer_mid_point); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), buffer_mid_point, vm_page_size, FALSE, VM_PROT_WRITE), + assert_mach_success(mach_vm_protect(mach_task_self(), buffer_mid_point, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); logv("Page read-protected."); @@ -2491,7 +2512,7 @@ test_write_on_partially_deallocated_range() "Deallocating the first destination page at address " "0x%jx...", (uintmax_t)start); - assert_deallocate_success(start, vm_page_size); + deallocate_vm_page_early(start); logv("Page deallocated."); logv( @@ -2521,7 +2542,7 @@ test_write_on_partially_unwritable_range() "Write-protecting the first destination page at address " "0x%jx...", (uintmax_t)start); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), start, vm_page_size, FALSE, VM_PROT_READ), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), start, vm_page_size, FALSE, VM_PROT_READ), "mach_vm_protect()"); logv("Page write-protected."); logv( @@ -2628,11 +2649,10 @@ copy_edge_size(mach_vm_size_t size, kern_return_t expected_kr) logv("Copying 0x%jx (%ju) bytes at various addresses...", (uintmax_t)size, (uintmax_t)size); for (i = 0; i < numofaddresses; i++) { kr = mach_vm_copy(this_task, addresses[i], size, dest); - T_QUIET; T_ASSERT_EQ(kr, expected_kr, + assert_mach_return(kr, expected_kr, "mach_vm_copy() at " - "address 0x%jx unexpectedly returned: %s.\n" - "Should have returned: %s.", - (uintmax_t)addresses[i], mach_error_string(kr), mach_error_string(expected_kr)); + "address 0x%jx expected %s", + 
(uintmax_t)addresses[i], mach_error_string(expected_kr)); } logv( "mach_vm_copy() returned expected value in each case: " @@ -2683,12 +2703,10 @@ test_copy_wrapped_around_ranges() "address space..."); for (i = 0; i < numofranges; i++) { kr = mach_vm_copy(this_task, ranges[i].address, ranges[i].size, dest); - T_QUIET; T_ASSERT_EQ(kr, KERN_INVALID_ADDRESS, + assert_mach_return(kr, KERN_INVALID_ADDRESS, "mach_vm_copy() at address 0x%jx with size " - "0x%jx (%ju) unexpectedly returned: %s.\n" - "Should have returned: %s.", - (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, (uintmax_t)ranges[i].size, mach_error_string(kr), - mach_error_string(KERN_INVALID_ADDRESS)); + "0x%jx (%ju) expected KERN_INVALID_ADDRESS", + (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, (uintmax_t)ranges[i].size); } logv("Returned expected error on each range: %s.", mach_error_string(KERN_INVALID_ADDRESS)); @@ -2766,7 +2784,7 @@ test_copy_partially_deallocated_range() mach_vm_address_t dest = 0; logv("Deallocating a mid-range page at address 0x%jx...", (uintmax_t)mid_point); - assert_deallocate_success(mid_point, vm_page_size); + deallocate_vm_page_early(mid_point); logv("Page deallocated."); logv("Copying 0x%jx (%ju) byte%s at address 0x%jx...", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s", @@ -2794,7 +2812,7 @@ test_copy_partially_unreadable_range() kern_return_t kr_expected = (size < vm_page_size) ? KERN_INVALID_ADDRESS : KERN_PROTECTION_FAILURE; logv("Read-protecting a mid-range page at address 0x%jx...", (uintmax_t)mid_point); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), mid_point, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), mid_point, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); logv("Page read-protected."); logv("Copying 0x%jx (%ju) byte%s at address 0x%jx...", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s", @@ -2813,12 +2831,12 @@ test_copy_dest_partially_deallocated_region() mach_vm_address_t source = get_buffer_address(); mach_msg_type_number_t size = (mach_msg_type_number_t)get_buffer_size(); mach_vm_address_t source_mid_point = (mach_vm_address_t)mach_vm_trunc_page(dest + size / 2); -#if __MAC_OX_X_VERSION_MIN_REQUIRED > 1080 + logv( "Deallocating a mid-range source page at address " "0x%jx...", (uintmax_t)source_mid_point); - assert_deallocate_success(source_mid_point, vm_page_size); + deallocate_vm_page_early(source_mid_point); logv("Page deallocated."); logv( @@ -2827,11 +2845,6 @@ test_copy_dest_partially_deallocated_region() (uintmax_t)source, (uintmax_t)size, (uintmax_t)size, (uintmax_t)dest); assert_copy_return(source, size, dest, KERN_INVALID_ADDRESS); logv("Returned expected error: %s.", mach_error_string(KERN_INVALID_ADDRESS)); -#else - logv( - "Bypassing partially deallocated region test " - "(See )"); -#endif /* __MAC_OX_X_VERSION_MIN_REQUIRED > 1080 */ } /* Copying from a partially deallocated region fails. 
*/ @@ -2847,7 +2860,7 @@ test_copy_source_partially_deallocated_region() "Deallocating a mid-range source page at address " "0x%jx...", (uintmax_t)source_mid_point); - assert_deallocate_success(source_mid_point, vm_page_size); + deallocate_vm_page_early(source_mid_point); logv("Page deallocated."); logv( @@ -2872,7 +2885,7 @@ test_copy_source_partially_unreadable_region() "Read-protecting a mid-range buffer page at address " "0x%jx...", (uintmax_t)mid_point); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), mid_point, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), mid_point, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); logv("Page read-protected."); logv( @@ -2888,35 +2901,25 @@ test_copy_source_partially_unreadable_region() void test_copy_dest_partially_unwriteable_region() { - kern_return_t kr; mach_vm_address_t dest = get_vm_address(); mach_vm_address_t source = get_buffer_address(); mach_msg_type_number_t size = (mach_msg_type_number_t)get_buffer_size(); mach_vm_address_t mid_point = (mach_vm_address_t)mach_vm_trunc_page(dest + size / 2); -#if __MAC_OX_X_VERSION_MIN_REQUIRED > 1080 logv( "Read-protecting a mid-range buffer page at address " "0x%jx...", (uintmax_t)mid_point); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), mid_point, vm_page_size, FALSE, VM_PROT_READ), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), mid_point, vm_page_size, FALSE, VM_PROT_READ), "mach_vm_protect()"); logv("Page read-protected."); logv( "Copying region at address 0x%jx and size 0x%jx (%ju), on " "memory at address 0x%jx...", (uintmax_t)source, (uintmax_t)size, (uintmax_t)size, (uintmax_t)dest); - if (size >= vm_page_size) { - kr = KERN_PROTECTION_FAILURE; - } else { - kr = KERN_INVALID_ADDRESS; - } - assert_copy_return(source, size, dest, kr); - logv("Returned expected error: %s.", mach_error_string(kr)); -#else - logv( - "Bypassing partially unwriteable region test " - "(See )"); -#endif /* __MAC_OX_X_VERSION_MIN_REQUIRED > 1080 */ + + // The type of failure is not guaranteed to be consistent between architectures, so we just make sure it fails. + assert_copy_failure(source, size, dest); + logv("Returned expected error."); } /* Copying on partially deallocated memory fails. 
*/ @@ -2932,7 +2935,7 @@ test_copy_source_on_partially_deallocated_range() "Deallocating the first source page at address " "0x%jx...", (uintmax_t)start); - assert_deallocate_success(start, vm_page_size); + deallocate_vm_page_early(start); logv("Page deallocated."); logv( @@ -2956,7 +2959,7 @@ test_copy_dest_on_partially_deallocated_range() "Deallocating the first destination page at address " "0x%jx...", (uintmax_t)start); - assert_deallocate_success(start, vm_page_size); + deallocate_buffer_page_early(start); logv("Page deallocated."); logv( @@ -2986,7 +2989,7 @@ test_copy_dest_on_partially_unwritable_range() "Write-protecting the first destination page at address " "0x%jx...", (uintmax_t)start); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), start, vm_page_size, FALSE, VM_PROT_READ), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), start, vm_page_size, FALSE, VM_PROT_READ), "mach_vm_protect()"); logv("Page write-protected."); logv( @@ -3016,7 +3019,7 @@ test_copy_source_on_partially_unreadable_range() "Read-protecting the first destination page at address " "0x%jx...", (uintmax_t)start); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), start, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), start, vm_page_size, FALSE, VM_PROT_WRITE), "mach_vm_protect()"); logv("Page read-protected."); logv( @@ -3047,7 +3050,7 @@ test_zero_filled_readprotect() logv("Setting read access on 0x%jx (%ju) byte%s at address 0x%jx...", (uintmax_t)size, (uintmax_t)size, (size == 1) ? "" : "s", (uintmax_t)address); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_protect(mach_task_self(), address, size, FALSE, VM_PROT_DEFAULT), "mach_vm_protect()"); + assert_mach_success(mach_vm_protect(mach_task_self(), address, size, FALSE, VM_PROT_DEFAULT), "mach_vm_protect()"); logv("Region has read access."); test_zero_filled_extended(); } @@ -3066,7 +3069,7 @@ verify_protection(vm_prot_t protection, const char * protection_name) "Verifying %s-protection on region of address 0x%jx and " "size 0x%jx (%ju) with mach_vm_region()...", protection_name, (uintmax_t)address, (uintmax_t)size, (uintmax_t)size); - T_QUIET; T_ASSERT_MACH_SUCCESS( + assert_mach_success( mach_vm_region(mach_task_self(), &address, &size, VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info, &count, &unused), "mach_vm_region()"); if (original_size) { @@ -3109,7 +3112,7 @@ access_readprotected_range_address(mach_vm_address_t address, const char * posit { logv("Reading from %s 0x%jx of read-protected range...", position, (uintmax_t)address); mach_vm_address_t bad_value = MACH_VM_ADDRESS_T(address); - T_ASSERT_FAIL("Unexpectedly read value 0x%jx at address 0x%jx." + T_ASSERT_FAIL("Unexpectedly read value 0x%jx at address 0x%jx. " "Should have died with signal SIGBUS.", (uintmax_t)bad_value, (uintmax_t)address); } @@ -3142,7 +3145,7 @@ write_writeprotected_range_address(mach_vm_address_t address, const char * posit { logv("Writing on %s 0x%jx of write-protected range...", position, (uintmax_t)address); MACH_VM_ADDRESS_T(address) = 0x0; - T_ASSERT_FAIL("Unexpectedly wrote value 0x0 value at address 0x%jx." + T_ASSERT_FAIL("Unexpectedly wrote value 0x0 at address 0x%jx. 
" "Should have died with signal SIGBUS.", (uintmax_t)address); } @@ -3193,7 +3196,7 @@ protect_zero_size(vm_prot_t protection, const char * protection_name) logv("%s-protecting 0x0 (0) bytes at various addresses...", protection_name); for (i = 0; i < numofaddresses; i++) { kr = mach_vm_protect(this_task, addresses[i], 0, FALSE, protection); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, + assert_mach_success(kr, "mach_vm_protect() at " "address 0x%jx unexpectedly failed: %s.", (uintmax_t)addresses[i], mach_error_string(kr)); @@ -3237,12 +3240,10 @@ protect_wrapped_around_ranges(vm_prot_t protection, const char * protection_name protection_name); for (i = 0; i < numofranges; i++) { kr = mach_vm_protect(this_task, ranges[i].address, ranges[i].size, FALSE, protection); - T_QUIET; T_ASSERT_EQ(kr, KERN_INVALID_ARGUMENT, + assert_mach_return(kr, KERN_INVALID_ARGUMENT, "mach_vm_protect() with address 0x%jx and size " - "0x%jx (%ju) unexpectedly returned: %s.\n" - "Should have returned: %s.", - (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, (uintmax_t)ranges[i].size, mach_error_string(kr), - mach_error_string(KERN_INVALID_ARGUMENT)); + "0x%jx (%ju) expected KERN_INVALID_ARGUMENT", + (uintmax_t)ranges[i].address, (uintmax_t)ranges[i].size, (uintmax_t)ranges[i].size); } logv("Returned expected error on each range: %s.", mach_error_string(KERN_INVALID_ARGUMENT)); } @@ -3267,30 +3268,6 @@ test_writeprotect_wrapped_around_ranges() void assert_share_mode(mach_vm_address_t address, unsigned share_mode, const char * share_mode_name) { - mach_vm_size_t size = get_vm_size(); - vm_region_extended_info_data_t info; - mach_msg_type_number_t count = VM_REGION_EXTENDED_INFO_COUNT; - mach_port_t unused; - -/* - * XXX Fails on UVM kernel. See - */ -#if notyet /* __MAC_OS_X_VERSION_MIN_REQUIRED < 1090 */ - logv( - "Verifying %s share mode on region of address 0x%jx and " - "size 0x%jx (%ju)...", - share_mode_name, (uintmax_t)address, (uintmax_t)size, (uintmax_t)size); - T_QUIET; T_ASSERT_MACH_SUCCESS( - mach_vm_region(mach_task_self(), &address, &size, VM_REGION_EXTENDED_INFO, (vm_region_info_t)&info, &count, &unused), - "mach_vm_region()"); - T_QUIET; T_ASSERT_EQ(info.share_mode, share_mode, - "Region's share mode " - " unexpectedly is not %s but %d.", - share_mode_name, info.share_mode); - logv("Region has a share mode of %s as expected.", share_mode_name); -#else - logv("Bypassing share_mode verification (See )"); -#endif /* __MAC_OS_X_VERSION_MIN_REQUIRED < 1090 */ } /* Do the vm_copy() and verify its success. 
*/ @@ -3302,12 +3279,11 @@ assert_vmcopy_success(vm_address_t src, vm_address_t dst, const char * source_na logv("Copying (using mach_vm_copy()) from a %s source...", source_name); kr = mach_vm_copy(mach_task_self(), src, size, dst); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, + assert_mach_success(kr, "mach_vm_copy() with the source address " - "0x%jx, designation address 0x%jx, and size 0x%jx (%ju) unexpectly " - "returned %s.\n Should have returned: %s.", - (uintmax_t)src, (uintmax_t)dst, (uintmax_t)size, (uintmax_t)size, mach_error_string(kr), - mach_error_string(KERN_SUCCESS)); + "0x%jx, destination address 0x%jx, and size 0x%jx (%ju) " + "unexpectedly failed.", + (uintmax_t)src, (uintmax_t)dst, (uintmax_t)size, (uintmax_t)size); logv("Copy (mach_vm_copy()) was successful as expected."); } @@ -3331,7 +3307,6 @@ verify_region(mach_vm_address_t address, mach_vm_address_t start) void modify_one_and_verify_all_regions(vm_address_t src, vm_address_t dst, vm_address_t shared_copied, boolean_t shared) { - mach_vm_size_t size = get_vm_size(); int action = get_vmcopy_post_action(); /* Do the post vm_copy() action. */ @@ -3412,7 +3387,7 @@ test_vmcopy_shared_source() assert_allocate_success(&src, size, TRUE); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_inherit(mach_task_self(), src, size, VM_INHERIT_SHARE), "mach_vm_inherit()"); + assert_mach_success(mach_vm_inherit(mach_task_self(), src, size, VM_INHERIT_SHARE), "mach_vm_inherit()"); write_region(src, 0); @@ -3467,7 +3442,7 @@ test_vmcopy_copied_from_source() assert_allocate_success(&src, size, TRUE); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_copy(mach_task_self(), copied, size, src), "mach_vm_copy()"); + assert_mach_success(mach_vm_copy(mach_task_self(), copied, size, src), "mach_vm_copy()"); assert_share_mode(src, SM_COW, "SM_COW"); @@ -3494,7 +3469,7 @@ test_vmcopy_copied_to_source() assert_allocate_success(&copied, size, TRUE); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_copy(mach_task_self(), src, size, copied), "mach_vm_copy()"); + assert_mach_success(mach_vm_copy(mach_task_self(), src, size, copied), "mach_vm_copy()"); assert_share_mode(src, SM_COW, "SM_COW"); @@ -3522,10 +3497,10 @@ test_vmcopy_trueshared_source() assert_allocate_success(&shared, size, TRUE); write_region(shared, 0); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_make_memory_entry_64(mach_task_self(), &size, (memory_object_offset_t)shared, cur_protect, &mem_obj, + assert_mach_success(mach_make_memory_entry_64(mach_task_self(), &size, (memory_object_offset_t)shared, cur_protect, &mem_obj, (mem_entry_name_port_t)NULL), "mach_make_memory_entry_64()"); - T_QUIET; T_ASSERT_MACH_SUCCESS( + assert_mach_success( mach_vm_map(mach_task_self(), &src, size, 0, TRUE, mem_obj, 0, FALSE, cur_protect, max_protect, VM_INHERIT_NONE), "mach_vm_map()"); @@ -3554,7 +3529,7 @@ test_vmcopy_private_aliased_source() assert_allocate_success(&shared, size, TRUE); write_region(shared, 0); - T_QUIET; T_ASSERT_MACH_SUCCESS(mach_vm_remap(mach_task_self(), &src, size, 0, TRUE, mach_task_self(), shared, FALSE, &cur_protect, + assert_mach_success(mach_vm_remap(mach_task_self(), &src, size, 0, TRUE, mach_task_self(), shared, FALSE, &cur_protect, &max_protect, VM_INHERIT_NONE), "mach_vm_remap()"); @@ -3578,48 +3553,44 @@ test_vmcopy_private_aliased_source() void run_allocate_test_suites() { - /* CoreOSZin 12Z30: VMUnitTest fails: - * error finding xnu major version number. 
*/ - /* unsigned int xnu_version = xnu_major_version(); */ - UnitTests allocate_main_tests = { - {"Allocated address is nonzero iff size is nonzero", test_nonzero_address_iff_nonzero_size}, - {"Allocated address is page-aligned", test_aligned_address}, - {"Allocated memory is zero-filled", test_zero_filled}, - {"Write and verify address-filled pattern", test_write_address_filled}, - {"Write and verify checkerboard pattern", test_write_checkerboard}, - {"Write and verify reverse checkerboard pattern", test_write_reverse_checkerboard}, - {"Write and verify page ends pattern", test_write_page_ends}, - {"Write and verify page interiors pattern", test_write_page_interiors}, - {"Reallocate allocated pages", test_reallocate_pages}, + {"Allocated address is nonzero iff size is nonzero", test_nonzero_address_iff_nonzero_size, 0}, + {"Allocated address is page-aligned", test_aligned_address, 0}, + {"Allocated memory is zero-filled", test_zero_filled, 0}, + {"Write and verify address-filled pattern", test_write_address_filled, 0}, + {"Write and verify checkerboard pattern", test_write_checkerboard, 0}, + {"Write and verify reverse checkerboard pattern", test_write_reverse_checkerboard, 0}, + {"Write and verify page ends pattern", test_write_page_ends, 0}, + {"Write and verify page interiors pattern", test_write_page_interiors, 0}, + {"Reallocate allocated pages", test_reallocate_pages, 0}, }; UnitTests allocate_address_error_tests = { - {"Allocate at address zero", test_allocate_at_zero}, + {"Allocate at address zero", test_allocate_at_zero, 0}, {"Allocate at a 2 MB boundary-unaligned, page-aligned " "address", - test_allocate_2MB_boundary_unaligned_page_aligned_address}, + test_allocate_2MB_boundary_unaligned_page_aligned_address, 0}, }; UnitTests allocate_argument_error_tests = { - {"Allocate in NULL VM map", test_allocate_in_null_map}, - {"Allocate with kernel flags", test_allocate_with_kernel_flags}, - {"Allocate super-page with incompatible flags", test_allocate_superpage_with_incompatible_flags}, + {"Allocate in NULL VM map", test_allocate_in_null_map, 0}, + {"Allocate with kernel flags", test_allocate_with_kernel_flags, 0}, + {"Allocate super-page with incompatible flags", test_allocate_superpage_with_incompatible_flags, 0}, }; UnitTests allocate_fixed_size_tests = { - {"Allocate zero size", test_allocate_zero_size}, - {"Allocate overflowing size", test_allocate_overflowing_size}, - {"Allocate a page with highest address hint", test_allocate_page_with_highest_address_hint}, - {"Allocate two pages and verify first fit strategy", test_allocate_first_fit_pages}, + {"Allocate zero size", test_allocate_zero_size, 0}, + {"Allocate overflowing size", test_allocate_overflowing_size, 0}, + {"Allocate a page with highest address hint", test_allocate_page_with_highest_address_hint, 0}, + {"Allocate two pages and verify first fit strategy", test_allocate_first_fit_pages, 0}, }; UnitTests allocate_invalid_large_size_test = { - {"Allocate invalid large size", test_allocate_invalid_large_size}, + {"Allocate invalid large size", test_allocate_invalid_large_size, 0}, }; UnitTests mach_vm_map_protection_inheritance_error_test = { {"mach_vm_map() with invalid protection/inheritance " "arguments", - test_mach_vm_map_protection_inheritance_error}, + test_mach_vm_map_protection_inheritance_error, 0}, }; UnitTests mach_vm_map_large_mask_overflow_error_test = { - {"mach_vm_map() with large address mask", test_mach_vm_map_large_mask_overflow_error}, + {"mach_vm_map() with large address mask", 
test_mach_vm_map_large_mask_overflow_error, 0}, }; /* Run the test suites with various allocators and VM sizes, and @@ -3677,16 +3648,8 @@ run_allocate_test_suites() } run_suite(set_up_allocator, allocate_fixed_size_tests, do_nothing, "%s fixed size allocation tests", allocators[allocators_idx].description); - /* CoreOSZin 12Z30: VMUnitTest fails: - * error finding xnu major version number. */ - /* mach_vm_map() with a named entry triggers a panic with this test - * unless under xnu-1598 or later, see 8048580. */ - /* if (allocators_idx != MACH_VM_MAP_NAMED_ENTRY - || xnu_version >= 1598) { */ - if (allocators_idx != MACH_VM_MAP_NAMED_ENTRY) { - run_suite(set_up_allocator, allocate_invalid_large_size_test, do_nothing, "%s invalid large size allocation test", - allocators[allocators_idx].description); - } + run_suite(set_up_allocator, allocate_invalid_large_size_test, do_nothing, "%s invalid large size allocation test", + allocators[allocators_idx].description); } /* mach_vm_map() only large mask overflow tests. */ for (sizes_idx = 0; sizes_idx < numofsizes; sizes_idx++) { @@ -3701,27 +3664,26 @@ void run_deallocate_test_suites() { UnitTests access_deallocated_memory_tests = { - {"Read start of deallocated range", test_access_deallocated_range_start}, - {"Read middle of deallocated range", test_access_deallocated_range_middle}, - {"Read end of deallocated range", test_access_deallocated_range_end}, + {"Read start of deallocated range", test_access_deallocated_range_start, SIGSEGV}, + {"Read middle of deallocated range", test_access_deallocated_range_middle, SIGSEGV}, + {"Read end of deallocated range", test_access_deallocated_range_end, SIGSEGV}, }; UnitTests deallocate_reallocate_tests = { {"Deallocate twice", test_deallocate_twice}, {"Write pattern, deallocate, reallocate (deallocated " "memory is inaccessible), and verify memory is " "zero-filled", - test_write_pattern_deallocate_reallocate_zero_filled}, + test_write_pattern_deallocate_reallocate_zero_filled, 0}, }; UnitTests deallocate_null_map_test = { - {"Deallocate in NULL VM map", test_deallocate_in_null_map}, + {"Deallocate in NULL VM map", test_deallocate_in_null_map, 0}, }; UnitTests deallocate_edge_case_tests = { - {"Deallocate zero size ranges", test_deallocate_zero_size_ranges}, - {"Deallocate memory ranges whose end rounds to 0x0", test_deallocate_rounded_zero_end_ranges}, - {"Deallocate wrapped around memory ranges", test_deallocate_wrapped_around_ranges}, + {"Deallocate zero size ranges", test_deallocate_zero_size_ranges, 0}, + {"Deallocate wrapped around memory ranges", test_deallocate_wrapped_around_ranges, 0}, }; UnitTests deallocate_suicide_test = { - {"Deallocate whole address space", test_deallocate_suicide}, + {"Deallocate whole address space", test_deallocate_suicide, -1}, }; /* All allocations done with mach_vm_allocate(). */ @@ -3740,7 +3702,6 @@ run_deallocate_test_suites() * fault. */ /* Nothing gets deallocated if size is zero. */ if (sizes_idx != ZERO_BYTES) { - set_expected_signal(SIGSEGV); run_suite(set_up_vm_variables_and_allocate, access_deallocated_memory_tests, do_nothing, "Deallocated memory access tests, " "%s%s address, %s size: 0x%jx (%ju)", @@ -3748,7 +3709,12 @@ run_deallocate_test_suites() (flags_idx == ANYWHERE) ? 
"" : address_alignments[alignments_idx].description, vm_sizes[sizes_idx].description, (uintmax_t)vm_sizes[sizes_idx].size, (uintmax_t)vm_sizes[sizes_idx].size); - set_expected_signal(0); + } + /* Deallocating zero size range should pass */ + if (vm_sizes[sizes_idx].size == 0) { + deallocate_reallocate_tests[0].expected_signal = 0; + } else { + deallocate_reallocate_tests[0].expected_signal = _expected_vm_exc_guard_signal; } run_suite(set_up_vm_variables_and_allocate, deallocate_reallocate_tests, do_nothing, "Deallocation and reallocation tests, %s%s " @@ -3768,37 +3734,34 @@ run_deallocate_test_suites() } } run_suite(do_nothing, deallocate_edge_case_tests, do_nothing, "Edge case deallocation tests"); - - set_expected_signal(-1); /* SIGSEGV or SIGBUS */ run_suite(do_nothing, deallocate_suicide_test, do_nothing, "Whole address space deallocation test"); - set_expected_signal(0); } void run_read_test_suites() { UnitTests read_main_tests = { - {"Read address is nonzero iff size is nonzero", test_nonzero_address_iff_nonzero_size}, - {"Read address has the correct boundary offset", test_read_address_offset}, - {"Reallocate read pages", test_reallocate_pages}, - {"Read and verify zero-filled memory", test_zero_filled}, + {"Read address is nonzero iff size is nonzero", test_nonzero_address_iff_nonzero_size, 0}, + {"Read address has the correct boundary offset", test_read_address_offset, 0}, + {"Reallocate read pages", test_reallocate_pages, 0}, + {"Read and verify zero-filled memory", test_zero_filled, 0}, }; UnitTests read_pattern_tests = { - {"Read address-filled pattern", test_read_address_filled}, - {"Read checkerboard pattern", test_read_checkerboard}, - {"Read reverse checkerboard pattern", test_read_reverse_checkerboard}, + {"Read address-filled pattern", test_read_address_filled, 0}, + {"Read checkerboard pattern", test_read_checkerboard, 0}, + {"Read reverse checkerboard pattern", test_read_reverse_checkerboard, 0}, }; UnitTests read_null_map_test = { - {"Read from NULL VM map", test_read_null_map}, + {"Read from NULL VM map", test_read_null_map, 0}, }; UnitTests read_edge_case_tests = { - {"Read zero size", test_read_zero_size}, - {"Read invalid large size", test_read_invalid_large_size}, - {"Read wrapped around memory ranges", test_read_wrapped_around_ranges}, + {"Read zero size", test_read_zero_size, 0}, + {"Read invalid large size", test_read_invalid_large_size, 0}, + {"Read wrapped around memory ranges", test_read_wrapped_around_ranges, 0}, }; UnitTests read_inaccessible_tests = { - {"Read partially decallocated memory", test_read_partially_deallocated_range}, - {"Read partially read-protected memory", test_read_partially_unreadable_range}, + {"Read partially deallocated memory", test_read_partially_deallocated_range, 0}, + {"Read partially read-protected memory", test_read_partially_unreadable_range, 0}, }; /* All allocations done with mach_vm_allocate(). 
*/ @@ -3854,21 +3817,21 @@ void run_write_test_suites() { UnitTests write_main_tests = { - {"Write and verify zero-filled memory", test_zero_filled_write}, + {"Write and verify zero-filled memory", test_zero_filled_write, 0}, }; UnitTests write_pattern_tests = { - {"Write address-filled pattern", test_address_filled_write}, - {"Write checkerboard pattern", test_checkerboard_write}, - {"Write reverse checkerboard pattern", test_reverse_checkerboard_write}, + {"Write address-filled pattern", test_address_filled_write, 0}, + {"Write checkerboard pattern", test_checkerboard_write, 0}, + {"Write reverse checkerboard pattern", test_reverse_checkerboard_write, 0}, }; UnitTests write_edge_case_tests = { - {"Write into NULL VM map", test_write_null_map}, {"Write zero size", test_write_zero_size}, + {"Write into NULL VM map", test_write_null_map, 0}, {"Write zero size", test_write_zero_size, 0}, }; UnitTests write_inaccessible_tests = { - {"Write partially decallocated buffer", test_write_partially_deallocated_buffer}, - {"Write partially read-protected buffer", test_write_partially_unreadable_buffer}, - {"Write on partially deallocated range", test_write_on_partially_deallocated_range}, - {"Write on partially write-protected range", test_write_on_partially_unwritable_range}, + {"Write partially deallocated buffer", test_write_partially_deallocated_buffer, 0}, + {"Write partially read-protected buffer", test_write_partially_unreadable_buffer, 0}, + {"Write on partially deallocated range", test_write_on_partially_deallocated_range, 0}, + {"Write on partially write-protected range", test_write_on_partially_unwritable_range, 0}, }; /* All allocations done with mach_vm_allocate(). */ @@ -3948,32 +3911,31 @@ void run_protect_test_suites() { UnitTests readprotection_main_tests = { - {"Read-protect, read-allow and verify zero-filled memory", test_zero_filled_readprotect}, + {"Read-protect, read-allow and verify zero-filled memory", test_zero_filled_readprotect, 0}, {"Verify that region is read-protected iff size is " "nonzero", - test_verify_readprotection}, + test_verify_readprotection, 0}, }; UnitTests access_readprotected_memory_tests = { - {"Read start of read-protected range", test_access_readprotected_range_start}, - {"Read middle of read-protected range", test_access_readprotected_range_middle}, - {"Read end of read-protected range", test_access_readprotected_range_end}, + {"Read start of read-protected range", test_access_readprotected_range_start, SIGBUS}, + {"Read middle of read-protected range", test_access_readprotected_range_middle, SIGBUS}, + {"Read end of read-protected range", test_access_readprotected_range_end, SIGBUS}, }; UnitTests writeprotection_main_tests = { - {"Write-protect and verify zero-filled memory", test_zero_filled_extended}, - {"Verify that region is write-protected iff size is " - "nonzero", - test_verify_writeprotection}, + {"Write-protect and verify zero-filled memory", test_zero_filled_extended, 0}, + {"Verify that region is write-protected iff size is nonzero", + test_verify_writeprotection, 0}, }; UnitTests write_writeprotected_memory_tests = { - {"Write at start of write-protected range", test_write_writeprotected_range_start}, - {"Write in middle of write-protected range", test_write_writeprotected_range_middle}, - {"Write at end of write-protected range", test_write_writeprotected_range_end}, + {"Write at start of write-protected range", test_write_writeprotected_range_start, SIGBUS}, + {"Write in middle of write-protected range", 
test_write_writeprotected_range_middle, SIGBUS}, + {"Write at end of write-protected range", test_write_writeprotected_range_end, SIGBUS}, }; UnitTests protect_edge_case_tests = { - {"Read-protect zero size ranges", test_readprotect_zero_size}, - {"Write-protect zero size ranges", test_writeprotect_zero_size}, - {"Read-protect wrapped around memory ranges", test_readprotect_wrapped_around_ranges}, - {"Write-protect wrapped around memory ranges", test_writeprotect_wrapped_around_ranges}, + {"Read-protect zero size ranges", test_readprotect_zero_size, 0}, + {"Write-protect zero size ranges", test_writeprotect_zero_size, 0}, + {"Read-protect wrapped around memory ranges", test_readprotect_wrapped_around_ranges, 0}, + {"Write-protect wrapped around memory ranges", test_writeprotect_wrapped_around_ranges, 0}, }; /* All allocations done with mach_vm_allocate(). */ @@ -4004,7 +3966,6 @@ run_protect_test_suites() (uintmax_t)vm_sizes[sizes_idx].size); /* Nothing gets protected if size is zero. */ if (sizes_idx != ZERO_BYTES) { - set_expected_signal(SIGBUS); /* Accessing read-protected memory should cause a bus * error. */ run_suite(set_up_vm_variables_allocate_readprotect, access_readprotected_memory_tests, deallocate_extra_page, @@ -4023,7 +3984,6 @@ run_protect_test_suites() (flags_idx == ANYWHERE) ? "" : address_alignments[alignments_idx].description, vm_sizes[sizes_idx].description, (uintmax_t)vm_sizes[sizes_idx].size, (uintmax_t)vm_sizes[sizes_idx].size); - set_expected_signal(0); } } } @@ -4036,39 +3996,39 @@ run_copy_test_suites() { /* Copy tests */ UnitTests copy_main_tests = { - {"Copy and verify zero-filled memory", test_zero_filled_copy_dest}, + {"Copy and verify zero-filled memory", test_zero_filled_copy_dest, 0}, }; UnitTests copy_pattern_tests = { - {"Copy address-filled pattern", test_copy_address_filled}, - {"Copy checkerboard pattern", test_copy_checkerboard}, - {"Copy reverse checkerboard pattern", test_copy_reverse_checkerboard}, + {"Copy address-filled pattern", test_copy_address_filled, 0}, + {"Copy checkerboard pattern", test_copy_checkerboard, 0}, + {"Copy reverse checkerboard pattern", test_copy_reverse_checkerboard, 0}, }; UnitTests copy_edge_case_tests = { - {"Copy with NULL VM map", test_copy_null_map}, - {"Copy zero size", test_copy_zero_size}, - {"Copy invalid large size", test_copy_invalid_large_size}, - {"Read wrapped around memory ranges", test_copy_wrapped_around_ranges}, + {"Copy with NULL VM map", test_copy_null_map, 0}, + {"Copy zero size", test_copy_zero_size, 0}, + {"Copy invalid large size", test_copy_invalid_large_size, 0}, + {"Copy wrapped around memory ranges", test_copy_wrapped_around_ranges, 0}, }; UnitTests copy_inaccessible_tests = { - {"Copy source partially decallocated region", test_copy_source_partially_deallocated_region}, + {"Copy source partially deallocated region", test_copy_source_partially_deallocated_region, 0}, /* XXX */ - {"Copy destination partially decallocated region", test_copy_dest_partially_deallocated_region}, - {"Copy source partially read-protected region", test_copy_source_partially_unreadable_region}, + {"Copy destination partially deallocated region", test_copy_dest_partially_deallocated_region, 0}, + {"Copy source partially read-protected region", test_copy_source_partially_unreadable_region, 0}, /* XXX */ - {"Copy destination partially write-protected region", test_copy_dest_partially_unwriteable_region}, - {"Copy source on partially deallocated range", test_copy_source_on_partially_deallocated_range}, - {"Copy destination on 
partially deallocated range", test_copy_dest_on_partially_deallocated_range}, - {"Copy source on partially read-protected range", test_copy_source_on_partially_unreadable_range}, - {"Copy destination on partially write-protected range", test_copy_dest_on_partially_unwritable_range}, + {"Copy destination partially write-protected region", test_copy_dest_partially_unwriteable_region, 0}, + {"Copy source on partially deallocated range", test_copy_source_on_partially_deallocated_range, 0}, + {"Copy destination on partially deallocated range", test_copy_dest_on_partially_deallocated_range, 0}, + {"Copy source on partially read-protected range", test_copy_source_on_partially_unreadable_range, 0}, + {"Copy destination on partially write-protected range", test_copy_dest_on_partially_unwritable_range, 0}, }; UnitTests copy_shared_mode_tests = { - {"Copy using freshly allocated source", test_vmcopy_fresh_source}, - {"Copy using shared source", test_vmcopy_shared_source}, - {"Copy using a \'copied from\' source", test_vmcopy_copied_from_source}, - {"Copy using a \'copied to\' source", test_vmcopy_copied_to_source}, - {"Copy using a true shared source", test_vmcopy_trueshared_source}, - {"Copy using a private aliased source", test_vmcopy_private_aliased_source}, + {"Copy using freshly allocated source", test_vmcopy_fresh_source, 0}, + {"Copy using shared source", test_vmcopy_shared_source, 0}, + {"Copy using a \'copied from\' source", test_vmcopy_copied_from_source, 0}, + {"Copy using a \'copied to\' source", test_vmcopy_copied_to_source, 0}, + {"Copy using a true shared source", test_vmcopy_trueshared_source, 0}, + {"Copy using a private aliased source", test_vmcopy_private_aliased_source, 0}, }; /* All allocations done with mach_vm_allocate(). */ @@ -4150,14 +4110,34 @@ run_copy_test_suites() } } +static int +set_disable_vm_sanitize_telemetry_via_sysctl(uint32_t val) +{ + int ret = sysctlbyname("debug.disable_vm_sanitize_telemetry", NULL, NULL, &val, sizeof(uint32_t)); + if (ret != 0) { + T_LOG("telemetry sysctl failed with errno %d.", errno); + } + return ret; +} + +static int +disable_vm_sanitize_telemetry(void) +{ + return set_disable_vm_sanitize_telemetry_via_sysctl(1); +} + +static int +reenable_vm_sanitize_telemetry(void) +{ + return set_disable_vm_sanitize_telemetry_via_sysctl(0); +} + void perform_test_with_options(test_option_t options) { - process_options(options); + disable_vm_sanitize_telemetry(); - /* CoreOSZin 12Z30: VMUnitTest fails: - * error finding xnu major version number. */ - /* printf("xnu version is %s.\n\n", xnu_version_string()); */ + process_options(options); if (flag_run_allocate_test) { run_allocate_test_suites(); @@ -4184,6 +4164,7 @@ perform_test_with_options(test_option_t options) } log_aggregated_results(); + reenable_vm_sanitize_telemetry(); } T_DECL(vm_test_allocate, "Allocate VM unit test") @@ -4196,6 +4177,7 @@ T_DECL(vm_test_allocate, "Allocate VM unit test") } T_DECL(vm_test_deallocate, "Deallocate VM unit test", + T_META_ENABLED(!TARGET_OS_BRIDGE), /* disabled on bridgeOS due to failures, rdar://137493917 */ T_META_IGNORECRASHES(".*vm_allocation.*")) { test_options.to_flags = VM_TEST_DEALLOCATE; diff --git a/tests/vm/vm_memory_entry.c b/tests/vm/vm_memory_entry.c new file mode 100644 index 000000000..18fb38d44 --- /dev/null +++ b/tests/vm/vm_memory_entry.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2025 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_TAG_VM_PREFERRED); + + +T_DECL(memory_entry_page_counts, + "Test that page counts are computed correctly for memory entries") +{ + mach_error_t err; + int ret; + uint64_t pages = 1024; + mach_vm_size_t size = pages * vm_page_size; + mach_port_t memory_entry = MACH_PORT_NULL; + uint64_t resident, dirty, swapped; + + T_LOG("Creating memory entry"); + err = mach_make_memory_entry_64(mach_task_self(), &size, + (memory_object_offset_t)0, + (MAP_MEM_NAMED_CREATE | MAP_MEM_LEDGER_TAGGED | VM_PROT_DEFAULT), + &memory_entry, MEMORY_OBJECT_NULL); + T_QUIET; T_ASSERT_MACH_SUCCESS(err, "mach_make_memory_entry()"); + T_QUIET; T_ASSERT_NE(memory_entry, MACH_PORT_NULL, "memory entry is non-null"); + + T_LOG("Mapping memory entry"); + mach_vm_address_t addr; + err = mach_vm_map(mach_task_self(), &addr, size, 0, + VM_FLAGS_ANYWHERE, memory_entry, 0, FALSE, + VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_NONE); + T_QUIET; T_ASSERT_MACH_SUCCESS(err, "mach_vm_map()"); + + T_LOG("Querying page counts"); + ret = mach_memory_entry_get_page_counts(memory_entry, &resident, &dirty, &swapped); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "mach_memory_entry_get_page_counts()"); + + T_EXPECT_EQ(resident, 0ull, "Entry should have no resident pages"); + T_EXPECT_EQ(dirty, 0ull, "Entry should have no dirty pages"); + T_EXPECT_EQ(swapped, 0ull, "Entry should have no swapped pages"); + + T_LOG("Faulting mapping"); + memset((void *)addr, 0xAB, size); + + T_LOG("Wiring mapping"); + ret = mlock((void *)addr, size); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "mlock()"); + + T_LOG("Querying page counts"); + ret = mach_memory_entry_get_page_counts(memory_entry, &resident, &dirty, &swapped); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "mach_memory_entry_get_page_counts()"); + + T_EXPECT_EQ(resident, pages, "Entry should have all resident pages"); + T_EXPECT_EQ(dirty, pages, "Entry should have all dirty pages"); + T_EXPECT_EQ(swapped, 0ull, "Entry should have no swapped pages"); + + T_LOG("Un-wiring mapping"); + ret = munlock((void *)addr, 
size); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "munlock()"); + + T_LOG("Evicting backing pages..."); + ret = madvise((void *)addr, size, MADV_PAGEOUT); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "madvise()"); + + /* MADV_PAGEOUT is asynchronous */ + sleep(1); + + T_LOG("Querying page counts"); + ret = mach_memory_entry_get_page_counts(memory_entry, &resident, &dirty, &swapped); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "mach_memory_entry_get_page_counts()"); + + T_EXPECT_EQ(resident, 0ull, "Entry should have no resident pages"); + T_EXPECT_EQ(dirty, 0ull, "Entry should have no dirty pages"); + T_EXPECT_EQ(swapped, pages, "Entry should have all swapped pages"); + + err = mach_vm_deallocate(mach_task_self(), addr, size); + T_QUIET; T_ASSERT_MACH_SUCCESS(err, "mach_vm_deallocate()"); + + err = mach_port_deallocate(mach_task_self(), memory_entry); + T_QUIET; T_ASSERT_MACH_SUCCESS(err, "mach_port_deallocate()"); +} diff --git a/tests/vm/vm_parameter_validation.c b/tests/vm/vm_parameter_validation.c index 58cbf6714..5b0950e56 100644 --- a/tests/vm/vm_parameter_validation.c +++ b/tests/vm/vm_parameter_validation.c @@ -27,16 +27,35 @@ // code shared with kernel/kext tests #include "../../osfmk/tests/vm_parameter_validation.h" -#define GOLDEN_FILES_VERSION "vm_parameter_validation_golden_images_edeef315.tar.xz" +#define GOLDEN_FILES_VERSION "vm_parameter_validation_golden_images_a2474e92.tar.xz" #define GOLDEN_FILES_ASSET_FILE_POINTER GOLDEN_FILES_VERSION +/* + * Architecture to pass to the golden file decompressor. + * watchOS passes 'arm64' or 'arm64_32'. + * Decompressor ignores this parameter on other platforms. + */ +#if TARGET_OS_WATCH +# if TARGET_CPU_ARM64 +# if TARGET_RT_64_BIT +# define GOLDEN_FILES_ARCH "arm64" +# else +# define GOLDEN_FILES_ARCH "arm64_32" +# endif +# else +# error unknown watchOS architecture +# endif +#else +# define GOLDEN_FILES_ARCH "unspecified" +#endif + T_GLOBAL_META( T_META_NAMESPACE("xnu.vm"), T_META_RADAR_COMPONENT_NAME("xnu"), T_META_RADAR_COMPONENT_VERSION("VM"), T_META_S3_ASSET(GOLDEN_FILES_ASSET_FILE_POINTER), T_META_ASROOT(true), /* required for vm_wire tests on macOS */ - T_META_RUN_CONCURRENTLY(false), /* vm_parameter_validation_kern uses kernel globals */ + T_META_RUN_CONCURRENTLY(false), /* tests should be concurrency-safe now, but keep this in case concurrent tests would provoke timeouts */ T_META_ALL_VALID_ARCHS(true), XNU_T_META_REQUIRES_DEVELOPMENT_KERNEL ); @@ -2454,8 +2473,10 @@ reenable_vm_sanitize_telemetry(void) #define KERN_GOLDEN_FILE TMP_DIR "kern_golden_image.log" -results_t *golden_list[MAX_NUM_TESTS]; -results_t *kern_list[MAX_NUM_TESTS]; +static results_t *golden_list[MAX_NUM_TESTS]; +static results_t *kern_list[MAX_NUM_TESTS]; +static uint32_t num_tests = 0; // num of tests in golden_list +static uint32_t num_kern_tests = 0; // num of tests in kern_list #define FILL_TRIALS_NAMES_AND_CONTINUE(results, trials, t_count) { \ for (unsigned i = 0; i < t_count; i++) { \ @@ -2808,21 +2829,203 @@ goldenprintf(const char *format, ...) GOLDEN_OUTPUT_BUF += printed; } +// Knobs controlled by environment variables + // Verbose output in dump_results, controlled by DUMP_RESULTS env. -bool dump = FALSE; +static bool dump = FALSE; // Output to create a golden test result, controlled by GENERATE_GOLDEN_IMAGE. -bool generate_golden = FALSE; +static bool generate_golden = FALSE; // Read existing golden file and print its contents in verbose format (like dump_results). Controlled by DUMP_GOLDEN_IMAGE. 
-bool dump_golden = FALSE;
+static bool dump_golden = FALSE;
 // Run tests as tests (i.e. emit TS_{PASS/FAIL}), enabled unless golden image generation is true.
-bool should_test_results = TRUE;
+static bool should_test_results = TRUE;
+
+static void
+read_env()
+{
+    dump = (getenv("DUMP_RESULTS") != NULL);
+    dump_golden = (getenv("DUMP_GOLDEN_IMAGE") != NULL);
+    // Shouldn't do both at once
+    generate_golden = (getenv("GENERATE_GOLDEN_IMAGE") != NULL) && !dump_golden;
+    // Only test when no other golden image flag is set
+    should_test_results = (getenv("SKIP_TESTS") == NULL) && !dump_golden && !generate_golden;
+}
+
+// Comparator function for sorting a result_t list by name
+static int
+compare_names(const void *a, const void *b)
+{
+    assert(((const result_t *)a)->name);
+    assert(((const result_t *)b)->name);
+    return strcmp(((const result_t *)a)->name, ((const result_t *)b)->name);
+}
+
+static unsigned
+binary_search(result_t *list, unsigned count, const result_t *trial)
+{
+    const char *name = trial->name;
+    unsigned left = 0, right = count;
+    while (left < right) {
+        // Range [left, right) is to be searched.
+        unsigned mid = left + (right - left) / 2;
+        int cmp = strcmp(list[mid].name, name);
+        if (cmp == 0) {
+            return mid;
+        } else if (cmp < 0) {
+            // Narrow search to [mid + 1, right).
+            left = mid + 1;
+        } else {
+            // Narrow search to [left, mid).
+            right = mid;
+        }
+    }
+    return UINT_MAX; // Not found
+}
+
+static inline bool
+trial_name_equals(const result_t *a, const result_t *b)
+{
+    // NB: compare lengths first so that a shorter 'bname' cannot match a longer 'aname'.
+    if (strlen(a->name) == strlen(b->name) && compare_names(a, b) == 0) {
+        return true;
+    }
+    return false;
+}
+
+static const result_t *
+get_golden_result(results_t *golden_results, const result_t *trial, unsigned trial_idx)
+{
+    if (golden_results->trialsformula == eUNKNOWN_TRIALS) {
+        // golden results don't contain trial names
+        T_LOG("%s: update test's alloc_results to have a valid trialsformula_t\n", golden_results->testname);
+        return NULL;
+    }
+
+    if (trial_idx < golden_results->count &&
+        golden_results->list[trial_idx].name &&
+        trial_name_equals(&golden_results->list[trial_idx], trial)) {
+        // "fast search" path, taken when the golden file is in sync with the test.
+        return &golden_results->list[trial_idx];
+    }
+
+    // "slow search" path, taken when test indices are not aligned. Sort the array
+    // by name and do a binary search.
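+    // Note: qsort() reorders golden_results->list in place, so subsequent
+    // trials for this test will typically take the binary-search path too.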
+ qsort(golden_results->list, golden_results->count, sizeof(result_t), compare_names); + unsigned g_idx = binary_search(golden_results->list, golden_results->count, trial); + if (g_idx < golden_results->count) { + return &golden_results->list[g_idx]; + } + + return NULL; +} + +static void +test_results(results_t *golden_results, results_t *results) +{ + bool passed = TRUE; + unsigned result_count = results->count; + unsigned acceptable_count = 0; + const unsigned acceptable_max = 16; // log up to this many ACCEPTABLE results + const result_t *golden_result = NULL; + if (golden_results->count != results->count) { + if (results->kernel_buffer_full) { + T_FAIL("%s: number of iterations mismatch (wanted %u, got %u) " + "(kernel output buffer full)", + results->testname, golden_results->count, results->count); + passed = FALSE; + } else { + T_LOG("%s: number of iterations mismatch (wanted %u, got %u)", + results->testname, golden_results->count, results->count); + } + } + for (unsigned i = 0; i < result_count; i++) { + golden_result = get_golden_result(golden_results, &results->list[i], i); + if (golden_result) { + if (results->list[i].ret == ACCEPTABLE) { + // trial has declared itself to be correct + // no matter what the golden result is + acceptable_count++; + if (acceptable_count <= acceptable_max) { + T_LOG("%s RESULT ACCEPTABLE (expected %d), %s\n", + results->testname, + golden_result->ret, results->list[i].name); + } + } else if (results->list[i].ret != golden_result->ret) { + T_FAIL("%s RESULT %d (expected %d), %s\n", + results->testname, results->list[i].ret, + golden_result->ret, results->list[i].name); + passed = FALSE; + } + } else { + /* + * This trial is not present in the golden results. + * + * This may be caused by new tests that require + * updates to the golden results. + * Or this may be caused by the last trial name being + * truncated when the kernel's output buffer is full. + * (Or both at once, in which case we only complain + * about one of them.) 
+ */ + const char *suggestion; + if (results->kernel_buffer_full && i == results->count - 1) { + suggestion = "kernel test output buffer is full"; + } else { + suggestion = "regenerate golden files to fix this"; + } + T_FAIL("%s NEW RESULT %d, %s -- %s\n", + results->testname, results->list[i].ret, + results->list[i].name, suggestion); + passed = FALSE; + } + } + + if (acceptable_count > acceptable_max) { + T_LOG("%s %u more RESULT ACCEPTABLE trials not logged\n", + results->testname, acceptable_count - acceptable_max); + } + if (passed) { + T_PASS("%s passed\n", results->testname); + } +} + +static results_t * +process_results(results_t *results) +{ + results_t *golden_results = NULL; + + if (dump && !generate_golden) { + __dump_results(results); + } + + if (generate_golden) { + dump_golden_results(results); + } + + if (should_test_results) { + golden_results = test_name_to_golden_results(results->testname); + + if (golden_results) { + test_results(golden_results, results); + } else { + T_FAIL("New test %s found, update golden list to allow return code testing", results->testname); + // Dump results if not done previously + if (!dump) { + __dump_results(results); + } + } + } + + return results; +} T_DECL(vm_parameter_validation_user, "parameter validation for userspace calls", T_META_SPAWN_TOOL(DECOMPRESS), T_META_SPAWN_TOOL_ARG("user"), T_META_SPAWN_TOOL_ARG(TMP_DIR), - T_META_SPAWN_TOOL_ARG(GOLDEN_FILES_VERSION) + T_META_SPAWN_TOOL_ARG(GOLDEN_FILES_VERSION), + T_META_SPAWN_TOOL_ARG(GOLDEN_FILES_ARCH) ) { if (disable_vm_sanitize_telemetry() != 0) { @@ -3458,8 +3661,6 @@ out: // The actual test code is in: // osfmk/tests/vm_parameter_validation_kern.c -#define KERN_RESULT_DELIMITER "\n" - #ifndef STRINGIFY #define __STR(x) #x #define STRINGIFY(x) __STR(x) @@ -3484,53 +3685,78 @@ static int populate_kernel_results(char *kern_buffer) { char *line = NULL; - char *sub_line = NULL; char *test_name = NULL; - char *result_name = NULL; - char *token = NULL; - char *s_num_kern_results = NULL; results_t *kern_results = NULL; - uint32_t num_kern_results = 0; - uint32_t result_number = 0; - int result_ret = 0; bool in_test = FALSE; line = strtok(kern_buffer, KERN_RESULT_DELIMITER); while (line != NULL) { if (strncmp(line, TESTNAME_DELIMITER, strlen(TESTNAME_DELIMITER)) == 0) { - sub_line = line + strlen(TESTNAME_DELIMITER); + char *sub_line = line + strlen(TESTNAME_DELIMITER); test_name = strdup(sub_line); - result_number = 0; in_test = TRUE; } else if (in_test && strncmp(line, RESULTCOUNT_DELIMITER, strlen(RESULTCOUNT_DELIMITER)) == 0) { - s_num_kern_results = line + strlen(RESULTCOUNT_DELIMITER); - num_kern_results = (uint32_t)strtoul(s_num_kern_results, NULL, 10); + char *s_num_kern_results = line + strlen(RESULTCOUNT_DELIMITER); + uint32_t num_kern_results = (uint32_t)strtoul(s_num_kern_results, NULL, 10); kern_results = alloc_results(test_name, eUNKNOWN_TRIALS, num_kern_results); - kern_results->count = num_kern_results; kern_list[num_kern_tests++] = kern_results; } else if (in_test && strncmp(line, TESTCONFIG_DELIMITER, strlen(TESTCONFIG_DELIMITER)) == 0) { - sub_line = line + strlen(TESTCONFIG_DELIMITER); + char *sub_line = line + strlen(TESTCONFIG_DELIMITER); kern_results->testconfig = strdup(sub_line); } else if (in_test && strstr(line, KERN_TESTRESULT_DELIMITER)) { // should have found TESTCONFIG already assert(kern_results->testconfig != NULL); + int result_ret = 0; + char *token; sscanf(line, KERN_TESTRESULT_DELIMITER "%d", &result_ret); // get result name (comes after the first ,) token 
= strchr(line, ','); - if (token) { + if (token && strlen(token) > 2) { token = token + 2; // skip the , and the extra space - result_name = strdup(token); - if (result_number >= num_kern_results) { - T_LOG("\tKERN Invalid output in test %s, seeing more results (%u) than expected (%u), ignoring trial RESULT %d, %s\n", - test_name, result_number, num_kern_results, result_ret, result_name); + char *result_name = strdup(token); + if (kern_results->count >= kern_results->capacity) { + T_LOG("\tKERN Invalid output in test %s, " + "too many results (expected %u), " + "ignoring trial RESULT %d, %s\n", + test_name, kern_results->capacity, result_ret, result_name); free(result_name); } else { - kern_results->list[result_number++] = (result_t){.ret = result_ret, .name = result_name}; + kern_results->list[kern_results->count++] = + (result_t){.ret = result_ret, .name = result_name}; } } + } else if (strncmp(line, KERN_FAILURE_DELIMITER, strlen(KERN_FAILURE_DELIMITER)) == 0) { + /* + * A fatal error message interrupted the output. + * (for example, the kernel test's output buffer is full) + * Clean up the last test because it may be + * invalid due to truncated output. + */ + T_FAIL("%s", line); + if (kern_results != NULL) { + if (kern_results->testconfig == NULL) { + // We didn't get any results for this test. + // Just drop it. + dealloc_results(kern_results); + kern_results = NULL; + kern_list[--num_kern_tests] = NULL; + } else { + kern_results->kernel_buffer_full = true; + } + } + + // Stop reading results now. + break; } else { + /* + * Unrecognized output text. + * One possible cause is that the kernel test's output + * buffer is full so this line was truncated beyond + * recognition. In that case we'll hit the + * KERN_FAILURE_DELIMITER line next. + */ + // T_LOG("Unknown kernel result line: %s\n", line); - //in_test = FALSE; } line = strtok(NULL, KERN_RESULT_DELIMITER); @@ -3551,6 +3777,16 @@ run_sysctl_test(const char *t, int64_t value) snprintf(name, sizeof(name), "debug.test.%s", t); rc = sysctlbyname(name, &result, &s, &value, s); + if (rc == -1 && errno == ENOENT) { + /* + * sysctl name not found. Probably an older kernel with the + * previous version of this test. + */ + T_FAIL("sysctl %s not found; may be running on an older kernel " + "that does not implement the current version of this test", + name); + exit(1); + } T_QUIET; T_ASSERT_POSIX_SUCCESS(rc, "sysctlbyname(%s)", t); return result; } @@ -3560,7 +3796,8 @@ T_DECL(vm_parameter_validation_kern, T_META_SPAWN_TOOL(DECOMPRESS), T_META_SPAWN_TOOL_ARG("kern"), T_META_SPAWN_TOOL_ARG(TMP_DIR), - T_META_SPAWN_TOOL_ARG(GOLDEN_FILES_VERSION) + T_META_SPAWN_TOOL_ARG(GOLDEN_FILES_VERSION), + T_META_SPAWN_TOOL_ARG(GOLDEN_FILES_ARCH) ) { if (disable_vm_sanitize_telemetry() != 0) { @@ -3596,22 +3833,37 @@ T_DECL(vm_parameter_validation_kern, // code print directly to the serial console, which takes many minutes // to transfer our test output at 14.4 KB/s. // We align this buffer to KB16 to allow the lower bits to be used for a fd. 
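+	// NB: the fd and the generate_golden flag are now passed explicitly in
+	// vm_parameter_validation_kern_args_t below, so the buffer's low bits are
+	// no longer used to encode the fd.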
- void *output; - int alloc_failed = posix_memalign(&output, KB16, SYSCTL_OUTPUT_BUFFER_SIZE); - assert(alloc_failed == 0); + char *output = calloc(SYSCTL_OUTPUT_BUFFER_SIZE, 1); - memset(output, 0, SYSCTL_OUTPUT_BUFFER_SIZE); + vm_parameter_validation_kern_args_t args = { + .sizeof_args = sizeof(args), + .output_buffer_address = (uint64_t)output, + .output_buffer_size = SYSCTL_OUTPUT_BUFFER_SIZE, + .file_descriptor = get_fd(), + .generate_golden = generate_golden + }; + int64_t result = run_sysctl_test("vm_parameter_validation_kern_v2", (int64_t)&args); - int fd = get_fd(); - assert((fd & ((int)KB16 - 1)) == fd); - if (generate_golden) { - // pass flag on the msb of the fd - assert((fd & ((int)(KB16 >> 1) - 1)) == fd); - fd |= KB16 >> 1; + switch (result) { + case KERN_TEST_SUCCESS: + break; + case KERN_TEST_BAD_ARGS: + T_FAIL("version mismatch between test and kernel: " + "sizeof(vm_parameter_validation_kern_args_t) did not match"); + goto out; + case KERN_TEST_FAILED: + if (output[0] == 0) { + // no output from the kernel test; print a generic error + T_FAIL("kernel test failed for unknown reasons"); + } else { + // kernel provided a message: print it + T_FAIL("kernel test failed: %s", output); + } + goto out; + default: + T_FAIL("kernel test failed with unknown error %llu", result); + goto out; } - int64_t result = run_sysctl_test("vm_parameter_validation_kern", (int64_t)output + fd); - - T_QUIET; T_EXPECT_EQ(1ull, result, "vm_parameter_validation_kern"); if (generate_golden) { if (!out_bad_param_in_kern_golden_results(output) || (dump && !should_test_results)) { diff --git a/tests/vm/vm_reclaim.c b/tests/vm/vm_reclaim.c index 25232e125..e59e53702 100644 --- a/tests/vm/vm_reclaim.c +++ b/tests/vm/vm_reclaim.c @@ -40,6 +40,7 @@ T_GLOBAL_META( T_META_OWNER("jarrad"), // Ensure we don't conflict with libmalloc's reclaim buffer T_META_ENVVAR("MallocDeferredReclaim=0"), + T_META_ENVVAR("MallocAllowInternalSecurity=1"), T_META_RUN_CONCURRENTLY(false), T_META_CHECK_LEAKS(false) ); @@ -61,9 +62,9 @@ T_DECL(vm_reclaim_init, "Set up and tear down a reclaim buffer", { mach_vm_reclaim_ring_t ringbuffer = ringbuffer_init(); T_ASSERT_NOTNULL(ringbuffer, "ringbuffer is allocated"); - T_EXPECT_EQ(os_atomic_load(&ringbuffer->indices.head, relaxed), 0ull, "head is zeroed"); - T_EXPECT_EQ(os_atomic_load(&ringbuffer->indices.busy, relaxed), 0ull, "busy is zeroed"); - T_EXPECT_EQ(os_atomic_load(&ringbuffer->indices.tail, relaxed), 0ull, "tail is zeroed"); + T_EXPECT_EQ(os_atomic_load(&ringbuffer->head, relaxed), 0ull, "head is zeroed"); + T_EXPECT_EQ(os_atomic_load(&ringbuffer->busy, relaxed), 0ull, "busy is zeroed"); + T_EXPECT_EQ(os_atomic_load(&ringbuffer->tail, relaxed), 0ull, "tail is zeroed"); size_t expected_len = (vm_page_size - offsetof(struct mach_vm_reclaim_ring_s, entries)) / sizeof(struct mach_vm_reclaim_entry_s); T_ASSERT_EQ((size_t)ringbuffer->len, expected_len, "length is set correctly"); @@ -555,91 +556,7 @@ T_DECL(vm_reclaim_update_reclaimable_bytes_threshold, "Kernel reclaims when num_ T_QUIET; T_ASSERT_LT(get_ledger_entry_for_pid(getpid(), phys_footprint_index, num_ledger_entries), (int64_t) ((kNumEntries) * kAllocationSize), "Entries were reclaimed as we crossed threshold"); } -#else /* !TARGET_OS_IPHONE */ -T_DECL(vm_reclaim_trim_minimum, - "update_accounting trims buffer according to sampling minimum", - T_META_VM_RECLAIM_ENABLED, T_META_TAG_VM_PREFERRED) -{ - kern_return_t kr; - int ret; - bool success, update_accounting; - mach_vm_reclaim_ring_t ringbuffer; - uint64_t 
sampling_period_ns; - size_t sampling_period_size = sizeof(sampling_period_ns); - uint32_t sizes[3] = {MiB(128), MiB(128), MiB(128)}; - mach_vm_address_t addrs[3] = {0}; - uint64_t ids[3] = {0}; - - ret = sysctlbyname("vm.reclaim.sampling_period_ns", &sampling_period_ns, &sampling_period_size, NULL, 0); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname(\"vm.reclaim.sampling_period_ns\")"); - struct timespec ts = { - .tv_sec = 2 * sampling_period_ns / NSEC_PER_SEC, - .tv_nsec = 2 * sampling_period_ns % NSEC_PER_SEC, - }; - - ringbuffer = ringbuffer_init(); - - // This should result in a sample taken (min 0) - kr = mach_vm_reclaim_update_kernel_accounting(ringbuffer); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_update_kernel_accounting()"); - - for (int i = 0; i < 3; i++) { - T_LOG("Placing entries[%d] into buffer", i); - ids[i] = allocate_and_defer_deallocate(sizes[i], ringbuffer, 0xAB, &addrs[i]); - } - - for (int i = 0; i < 3; i++) { - // The minimum for the first sample should be 0 - success = try_cancel(ringbuffer, ids[i], addrs[i], sizes[i], VM_RECLAIM_DEALLOCATE); - T_ASSERT_TRUE(success, "Entry %d should not be reclaimed", i); - kr = mach_vm_reclaim_try_enter(ringbuffer, addrs[i], sizes[i], VM_RECLAIM_DEALLOCATE, &ids[i], &update_accounting); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_try_enter()"); - if (update_accounting) { - kr = mach_vm_reclaim_update_kernel_accounting(ringbuffer); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_update_kernel_accounting()"); - } - } - - T_LOG("Sleeping for 2 sampling periods (%llu ns)", 2 * sampling_period_ns); - ret = nanosleep(&ts, NULL); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "nanosleep()"); - - // This should result in a sample taken (still min 0) - kr = mach_vm_reclaim_update_kernel_accounting(ringbuffer); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_update_kernel_accounting()"); - - for (int i = 0; i < 3; i++) { - success = try_cancel(ringbuffer, ids[i], addrs[i], sizes[i], VM_RECLAIM_DEALLOCATE); - T_EXPECT_TRUE(success, "Entry %d should not be reclaimed", i); - kr = mach_vm_reclaim_try_enter(ringbuffer, addrs[i], sizes[i], VM_RECLAIM_DEALLOCATE, &ids[i], &update_accounting); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_try_enter()"); - if (update_accounting) { - kr = mach_vm_reclaim_update_kernel_accounting(ringbuffer); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_update_kernel_accounting()"); - } - } - T_LOG("Sleeping for 2 sampling periods (%llu ns)", 2 * sampling_period_ns); - ret = nanosleep(&ts, NULL); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "nanosleep()"); - - // This should result in a sample taken (still min 0) - kr = mach_vm_reclaim_update_kernel_accounting(ringbuffer); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_update_kernel_accounting()"); - - T_LOG("Sleeping for 2 sampling periods (%llu ns)", 2 * sampling_period_ns); - ret = nanosleep(&ts, NULL); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "nanosleep()"); - - // This should result in a sample taken (min sum(sizeof(entries[i]))) - kr = mach_vm_reclaim_update_kernel_accounting(ringbuffer); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_update_kernel_accounting()"); - - for (int i = 0; i < 3; i++) { - success = try_cancel(ringbuffer, ids[i], addrs[i], sizes[i], VM_RECLAIM_DEALLOCATE); - T_EXPECT_FALSE(success, "Entry %d should not be reclaimed", i); - } -} -#endif /* TARGET_OS_IPHONE */ +#endif /* TARGET_OS_IPHONE && !TARGET_OS_VISION */ T_HELPER_DECL(deallocate_buffer, "deallocate the buffer from underneath the 
kernel") @@ -902,7 +819,6 @@ T_DECL(resize_buffer, kr = mach_vm_reclaim_ring_allocate(&ringbuffer, initial_len, max_len); T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_ring_allocate()"); - // Should be able to fit 1022 entries in a one-page buffer (two entries for indices) T_LOG("Filling buffer with entries"); mach_vm_reclaim_count_t old_capacity; kr = mach_vm_reclaim_ring_capacity(ringbuffer, &old_capacity); @@ -914,7 +830,7 @@ T_DECL(resize_buffer, } id_tmp = allocate_and_defer_deallocate(vm_page_size, ringbuffer, 'X', &addr_tmp); T_ASSERT_EQ(id_tmp, VM_RECLAIM_ID_NULL, "Unable to over-fill buffer"); - uint64_t initial_tail = os_atomic_load(&ringbuffer->indices.tail, relaxed); + uint64_t initial_tail = os_atomic_load(&ringbuffer->tail, relaxed); T_ASSERT_EQ(initial_tail, (uint64_t)old_capacity, "tail == capacity after fill"); T_LOG("Resizing buffer to 4x"); @@ -922,9 +838,9 @@ T_DECL(resize_buffer, T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_ring_resize()"); // All entries should be reclaimed after resize - T_EXPECT_EQ(os_atomic_load(&ringbuffer->indices.head, relaxed), initial_tail, "head is incremented"); - T_EXPECT_EQ(os_atomic_load(&ringbuffer->indices.busy, relaxed), initial_tail, "busy is incremented"); - T_EXPECT_EQ(os_atomic_load(&ringbuffer->indices.tail, relaxed), initial_tail, "tail is preserved"); + T_EXPECT_EQ(os_atomic_load(&ringbuffer->head, relaxed), initial_tail, "head is incremented"); + T_EXPECT_EQ(os_atomic_load(&ringbuffer->busy, relaxed), initial_tail, "busy is incremented"); + T_EXPECT_EQ(os_atomic_load(&ringbuffer->tail, relaxed), initial_tail, "tail is preserved"); mach_vm_reclaim_count_t new_capacity; kr = mach_vm_reclaim_ring_capacity(ringbuffer, &new_capacity); @@ -955,3 +871,162 @@ T_DECL(resize_buffer, T_QUIET; T_EXPECT_TRUE(usable, "Entry is available for re-use"); } } + +T_DECL(resize_after_drain, + "resize a buffer after draining it", + T_META_VM_RECLAIM_ENABLED, + T_META_TAG_VM_PREFERRED) +{ + int ret; + mach_vm_reclaim_error_t err; + mach_vm_reclaim_ring_t ring; + uint64_t sampling_period_ns; + size_t sampling_period_size = sizeof(sampling_period_ns); + + ret = sysctlbyname("vm.reclaim.sampling_period_ns", &sampling_period_ns, &sampling_period_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl(vm.reclaim.sampling_period_ns)"); + + T_LOG("Initializing ring"); + mach_vm_reclaim_count_t initial_len = mach_vm_reclaim_round_capacity(512); + mach_vm_reclaim_count_t max_len = 4 * initial_len; + err = mach_vm_reclaim_ring_allocate(&ring, initial_len, max_len); + T_QUIET; T_ASSERT_MACH_SUCCESS(err, "mach_vm_reclaim_ring_allocate()"); + + // Fill the buffer with some memory + T_LOG("Allocating and deferring memory"); + for (mach_vm_reclaim_count_t i = 0; i < 128; i++) { + mach_vm_address_t addr; + mach_vm_reclaim_id_t id = allocate_and_defer_deallocate(vm_page_size, ring, 'A', &addr); + T_QUIET; T_ASSERT_NE(id, VM_RECLAIM_ID_NULL, "Able to defer deallocation"); + } + + T_LOG("Draining ring"); + pid_t pid = getpid(); + ret = sysctlbyname("vm.reclaim.drain_pid", NULL, NULL, &pid, sizeof(pid)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl(vm.reclaim.drain_pid)"); + + err = mach_vm_reclaim_ring_resize(ring, 2 * initial_len); + T_ASSERT_MACH_SUCCESS(err, "mach_vm_reclaim_ring_resize()"); + + T_LOG("Sleeping for 1 sampling period..."); + struct timespec ts = { + .tv_sec = sampling_period_ns / NSEC_PER_SEC, + .tv_nsec = sampling_period_ns % NSEC_PER_SEC, + }; + ret = nanosleep(&ts, NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "nanosleep()"); 
+ + err = mach_vm_reclaim_update_kernel_accounting(ring); + T_ASSERT_MACH_SUCCESS(err, "mach_vm_reclaim_update_kernel_accounting()"); +} + +#define QUERY_BUFFER_RING_COUNT 25 + +static void +kill_child() +{ + kill(child_pid, SIGKILL); +} + + +kern_return_t +mach_vm_deferred_reclamation_buffer_remap(task_t source_task, + task_t dest_task, + mach_vm_address_t addr, + mach_vm_address_t *addr_u, + mach_vm_size_t *size_u); + +T_DECL(copy_and_query_buffer, + "verify that a reclaim ring may be queried correctly", + T_META_VM_RECLAIM_ENABLED, + T_META_TAG_VM_PREFERRED, + T_META_ASROOT(true)) +{ + kern_return_t kr; + mach_vm_reclaim_error_t rr; + mach_vm_reclaim_ring_t self_ring; + mach_vm_reclaim_id_t ids[QUERY_BUFFER_RING_COUNT]; + mach_vm_address_t addrs[QUERY_BUFFER_RING_COUNT]; + mach_vm_size_t sizes[QUERY_BUFFER_RING_COUNT]; + mach_vm_reclaim_action_t actions[QUERY_BUFFER_RING_COUNT]; + struct mach_vm_reclaim_region_s query_buffer[QUERY_BUFFER_RING_COUNT]; + mach_vm_reclaim_count_t query_count; + task_t child_task; + mach_vm_reclaim_count_t n_rings; + struct mach_vm_reclaim_ring_ref_s ring_ref; + mach_vm_reclaim_count_t capacity = mach_vm_reclaim_round_capacity(512); + mach_vm_reclaim_ring_copy_t copied_ring; + + T_SETUPBEGIN; + + T_LOG("Initializing buffer"); + kr = mach_vm_reclaim_ring_allocate(&self_ring, capacity, capacity); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_ring_allocate()"); + + T_LOG("Adding entries to buffer"); + for (mach_vm_reclaim_count_t i = 0; i < QUERY_BUFFER_RING_COUNT; i++) { + actions[i] = (rand() % 2 == 0) ? VM_RECLAIM_FREE : VM_RECLAIM_DEALLOCATE; + sizes[i] = ((rand() % 3) + 1) * vm_page_size; + addrs[i] = 0; + ids[i] = allocate_and_defer_free(sizes[i], self_ring, 'A', actions[i], &addrs[i]); + T_QUIET; T_ASSERT_NE(ids[i], VM_RECLAIM_ID_NULL, "Able to defer allocation"); + } + + child_pid = fork(); + if (child_pid == 0) { + while (true) { + sleep(1); + } + } + T_ATEND(kill_child); + + kr = task_for_pid(mach_task_self(), child_pid, &child_task); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_for_pid"); + + T_SETUPEND; + + T_LOG("Copying buffer"); + rr = mach_vm_reclaim_get_rings_for_task(child_task, NULL, &n_rings); + T_ASSERT_MACH_SUCCESS(rr, "Query ring count"); + T_ASSERT_EQ(n_rings, 1, "Task has one ring"); + rr = mach_vm_reclaim_get_rings_for_task(child_task, &ring_ref, &n_rings); + T_ASSERT_MACH_SUCCESS(rr, "Get ring reference"); + T_ASSERT_NE(ring_ref.addr, 0ULL, "Ring ref ring is not null"); + + kr = mach_vm_reclaim_ring_copy(child_task, &ring_ref, &copied_ring); + T_ASSERT_MACH_SUCCESS(kr, "mach_vm_reclaim_ring_copy()"); + T_ASSERT_NOTNULL(copied_ring, "copied ring is not null"); + + T_LOG("Querying buffer"); + + rr = mach_vm_reclaim_copied_ring_query(&copied_ring, NULL, &query_count); + T_QUIET; T_ASSERT_MACH_SUCCESS(rr, "query reclaim ring size"); + T_ASSERT_EQ(query_count, QUERY_BUFFER_RING_COUNT, "correct reclaim ring query size"); + + rr = mach_vm_reclaim_copied_ring_query(&copied_ring, query_buffer, &query_count); + T_QUIET; T_ASSERT_MACH_SUCCESS(rr, "query reclaim ring"); + T_ASSERT_EQ(query_count, QUERY_BUFFER_RING_COUNT, "query count is correct"); + + bool all_match = true; + for (mach_vm_reclaim_count_t i = 0; i < QUERY_BUFFER_RING_COUNT; i++) { + mach_vm_reclaim_region_t qentry = &query_buffer[i]; + if ((qentry->vmrr_addr != addrs[i]) || + (qentry->vmrr_size != sizes[i]) || + (qentry->vmrr_behavior != actions[i])) { + all_match = false; + } + T_QUIET; T_EXPECT_EQ(qentry->vmrr_addr, addrs[i], "query->vmrr_addr is correct"); + T_QUIET; 
T_EXPECT_EQ(qentry->vmrr_size, sizes[i], "query->vmrr_size is correct"); + T_QUIET; T_EXPECT_EQ(qentry->vmrr_behavior, actions[i], "query->vmrr_behavior is correct"); + } + T_ASSERT_TRUE(all_match, "query entries are correct"); + + query_count = 5; + rr = mach_vm_reclaim_copied_ring_query(&copied_ring, query_buffer, &query_count); + T_QUIET; T_ASSERT_MACH_SUCCESS(rr, "query reclaim ring with small buffer"); + T_ASSERT_EQ(query_count, 5, "query reclaim ring with small buffer returns correct size"); + + T_LOG("Freeing buffer"); + rr = mach_vm_reclaim_copied_ring_free(&copied_ring); + T_ASSERT_MACH_SUCCESS(rr, "free reclaim ring"); +} diff --git a/tests/vm/vm_reclaim.entitlements b/tests/vm/vm_reclaim.entitlements new file mode 100644 index 000000000..917212ddd --- /dev/null +++ b/tests/vm/vm_reclaim.entitlements @@ -0,0 +1,8 @@ + + + + + task_for_pid-allow + + + diff --git a/tests/vm/vm_stress.cpp b/tests/vm/vm_stress.cpp new file mode 100644 index 000000000..30748bf6b --- /dev/null +++ b/tests/vm/vm_stress.cpp @@ -0,0 +1,1483 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm"), + T_META_RADAR_COMPONENT_NAME("xnu"), + T_META_RADAR_COMPONENT_VERSION("VM"), + T_META_OWNER("tgal2")); + +/** The following are modes that determine the way in which the created objects will be re-mapped to the task's memory. + * The test behaves as follows according to the chosen policy: + * RandomPartition - creates a buffer for each (randomly sized) part of each object. Every page of every object will be re-mapped exactly once. + * OneToMany - creates multiple mappings of the entire object. + * Overwrite - same as OneToMany, only that a portion of each mapping's pages will be overwritten, creating double the amount of mappings in total. + * Topology - creates mappings according to different topologies. 
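+ * (The topology shapes exercised are chain, star, ternary, and random; see
+ * MappingsManager::topology below.)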
+ */ +enum class MappingPolicy { + RandomPartition, + OneToMany, + Overwrite, + Topology, +}; + +struct TestParams { + uint32_t num_objects; + uint64_t obj_size; + uint32_t runtime_secs; + uint32_t num_threads; + MappingPolicy policy; + uint32_t mpng_flags; + bool is_cow; + bool is_file; + bool slow_paging; +}; + +struct MappingArgs { + task_t arg_target_task = mach_task_self(); + mach_vm_address_t arg_target_address = 0; + uint64_t arg_mapping_size = 0; + uint32_t arg_mask = 0; + uint32_t arg_flags = 0; + task_t arg_src_task = mach_task_self(); + mach_vm_address_t arg_src_address = 0; + bool arg_copy = false; + uint32_t arg_cur_protection = 0; + uint32_t arg_max_protection = 0; + uint32_t arg_inheritance = VM_INHERIT_SHARE; +}; + +struct status_counters { + uint32_t success; + uint32_t fail; +} status_counters; + + +static uint64_t +random_between( + uint64_t a, uint64_t b) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(a, b); + return dis(gen); +} + +class TestRuntime +{ +public: + // Member functions: + int + wait_for_status( + int runtime_secs) + { + std::unique_lock lock(mutex); + auto now = std::chrono::system_clock::now(); + auto deadline = now + std::chrono::seconds(runtime_secs); + state = running; + while (state == running) { + if (cond.wait_until(lock, deadline) == std::cv_status::timeout) { + state = complete; + } + } + if (state == complete) { + return 0; + } else { + return 1; + } + } + + enum state { + paused, + running, + error, + complete + }; + + // Data members: + std::atomic state{paused}; + std::mutex mutex; + +private: + std::condition_variable cond; +}; + +TestRuntime runner; + +/** + * Responsible for creating the actual mapping into vm, performing actions on a + * mapping or a page, manage the threads which perform operations on this + * mapping. 
+ */ +class Mapping +{ + using vm_op = std::function; + +public: + // Constructor: + Mapping(uint32_t _id, uint64_t _offset_in_pages, MappingArgs _args, uint32_t _fd) + : id(_id), offset_in_pages(_offset_in_pages), args(_args), fd(_fd), lock(std::make_shared()), src_mapping(std::nullopt), is_mapped(false) + { + num_pages = args.arg_mapping_size / PAGE_SIZE; + op_denom = num_pages; + create_mapping(); + } + + // Comparator for sorting by id + static bool + compare_by_id( + const Mapping &a, const Mapping &b) + { + return a.id < b.id; + } + + // Member functions: + + // Creation: + + kern_return_t + remap_fixed() + { + kern_return_t kr = mach_vm_remap(args.arg_target_task, &args.arg_target_address, args.arg_mapping_size, + args.arg_mask, VM_FLAGS_OVERWRITE | VM_FLAGS_FIXED, args.arg_src_task, + args.arg_src_address + offset_in_pages * PAGE_SIZE, args.arg_copy, (vm_prot_t *)&(args.arg_cur_protection), + (vm_prot_t *)&(args.arg_max_protection), args.arg_inheritance); + if (kr != KERN_SUCCESS) { + return kr; + } + is_mapped = true; + return kr; + } + + int + create_mapping() + { + kern_return_t kr = remap_fixed(); + if (kr != KERN_SUCCESS) { + throw std::runtime_error("mach_vm_remap failed: " + std::string(mach_error_string(kr)) + "\n"); + } + return 0; + } + + void + set_src_mapping( + Mapping &other) + { + src_mapping = other; + } + + // Operations to be done by the ran threads: + + kern_return_t + deallocate_no_lock() + { + is_mapped = false; + kern_return_t kr = mach_vm_deallocate(args.arg_src_task, args.arg_target_address, args.arg_mapping_size); + return kr; + } + + bool + realloc_no_parent() + { + std::unique_lock my_unique(*lock); + + kern_return_t kr = remap_fixed(); + if (kr != KERN_SUCCESS) { + return false; + } + return true; + } + + bool + realloc_with_parent() + { + std::unique_lock my_unique(*lock, std::defer_lock); + std::unique_lock parent_unique(*(src_mapping->get().lock), std::defer_lock); + std::scoped_lock l{my_unique, parent_unique}; + + kern_return_t kr = remap_fixed(); + if (kr != KERN_SUCCESS) { + return false; + } + return true; + } + + bool + op_dealloc() + { + std::unique_lock my_unique(*lock); + + kern_return_t kr = deallocate_no_lock(); + if (kr != KERN_SUCCESS) { + return false; + } + return true; + } + + bool + op_realloc() + { + // std::this_thread::sleep_for(std::chrono::microseconds(50)); + if (src_mapping) { + return realloc_with_parent(); + } else { + return realloc_no_parent(); + } + } + + bool + op_protect() + { + kern_return_t kr = mach_vm_protect(mach_task_self(), (mach_vm_address_t)args.arg_target_address, + (num_pages / op_denom) * PAGE_SIZE, 0, VM_PROT_READ | VM_PROT_WRITE); + if (kr != KERN_SUCCESS) { + return false; + } + return true; + } + + bool + op_wire() + { + std::this_thread::sleep_for(std::chrono::microseconds(50)); + uint32_t err = mlock((void *)args.arg_target_address, (num_pages / op_denom) * PAGE_SIZE); + if (err) { + return false; + } + return true; + } + + bool + op_write() + { + std::shared_lock my_shared(*lock); + if (!is_mapped) { + return false; + } + // Modify only the last byte of each page. + for (uint64_t i = 1; i <= num_pages / op_denom; i++) { + ((char *)args.arg_target_address)[i * PAGE_SIZE - 1] = 'M'; // M marks it was written via the mapping (for debugging purposes) + } + + // No need to sync to the file. It will be written when paged-out (which happens all the time). 
+ + return true; + } + + + bool + op_unwire() + { + uint32_t err = munlock((void *)args.arg_target_address, (num_pages / op_denom) * PAGE_SIZE); + if (err) { + return false; + } + return true; + } + + bool + op_write_direct() + { + std::this_thread::sleep_for(std::chrono::microseconds(50)); + + if (!fd) { + return false; // Return early if no file descriptor (no file-backed mapping) + } + + std::shared_lock my_shared(*lock); + if (!is_mapped) { + return false; + } + + // Modify only the last byte of each page. + for (uint64_t i = 1; i <= num_pages / op_denom; i++) { + ((char *)args.arg_target_address)[i * PAGE_SIZE - 1] = 'D'; // D marks it was written using op_write_Direct (for debugging purposes) + } + + if (fcntl(fd, F_NOCACHE, true)) { + auto err = errno; + throw std::runtime_error("fcntl failed. err=" + std::to_string(err) + "\n"); + } + if (lseek(fd, 0, SEEK_SET) == -1) { + throw std::runtime_error("lseek failed to move cursor to beginning. err=" + std::to_string(errno)); + } + + int num_bytes = write(fd, (void *)(args.arg_target_address), (num_pages / op_denom) * PAGE_SIZE); + + if (num_bytes == -1) { + printf("num_bytes=%d", num_bytes); + return false; + } + + return true; + } + + bool + op_pageout() + { + if (madvise((void *)args.arg_target_address, (num_pages / op_denom) * PAGE_SIZE, MADV_PAGEOUT)) { + return false; + } + return true; + } + + bool + run_op(const std::pair *op) + { + bool ret = false; + ret = op->first(this); + + /* Never let the denominator be zero. */ + uint32_t new_denom = (op_denom * 2) % num_pages; + op_denom = new_denom > 0 ? new_denom : 1; + + return ret; + } + + // Miscellaneous: + + void + create_gap_before() + { + mach_vm_address_t to_dealloc = args.arg_target_address - PAGE_SIZE; + kern_return_t kr = mach_vm_deallocate(mach_task_self(), to_dealloc, PAGE_SIZE); + if (kr != KERN_SUCCESS) { + throw std::runtime_error("mach_vm_deallocate failed: " + std::string(mach_error_string(kr)) + "\n"); + } + } + + void + adjust_addresses_and_offset( + uint64_t detached_num_pages, uint64_t detached_size) + { + args.arg_src_address += detached_size; + args.arg_target_address += detached_size; + offset_in_pages += detached_num_pages; + } + + void + shrink_size( + uint64_t detached_num_pages, uint64_t detached_size) + { + num_pages -= detached_num_pages; + args.arg_mapping_size -= detached_size; + } + + /* Fix the wrapper of the mapping after overwriting a part of it, to keep it aligned to real vmmap_entry */ + void + fix_overwritten_mapping( + uint64_t detached_num_pages) + { + uint64_t detached_size = detached_num_pages * PAGE_SIZE; + id *= 2; + shrink_size(detached_num_pages, detached_size); + adjust_addresses_and_offset(detached_num_pages, detached_size); + create_gap_before(); + } + + void + print_mapping() + { + T_LOG("\tMAPPING #%2d, from address: %llx, to address: %llx, offset: %2llu, size: %4llu " + "pages\n", + id, args.arg_src_address, args.arg_target_address, offset_in_pages, num_pages); + } + + uint64_t + get_end() + { + return offset_in_pages + args.arg_mapping_size / PAGE_SIZE - 1; + } + + void + add_child(Mapping *other) + { + children.emplace_back(other); + } + + void + print_as_tree(const std::string &prefix = "", bool isLast = true) + { + T_LOG("%s%s%d", prefix.c_str(), (isLast ? "└── " : "├── "), id); + + std::string newPrefix = prefix + (isLast ? 
" " : "│ "); + + for (uint32_t i = 0; i < children.size(); i++) { + children[i]->print_as_tree(newPrefix, i == children.size() - 1); + } + } + + // Data members: + + uint32_t id = 0; + uint64_t offset_in_pages = 0; + MappingArgs args; + uint64_t num_pages = 0; + std::vector children; + uint32_t fd = 0; + std::shared_ptr lock; + std::optional > src_mapping; + bool is_mapped; // set on remap() and cleared on deallocate(). + + /** + * Regarding the locks: (reasoning for shared_ptr) + * In some cases (MAppingsManager::policy==MappingPolicy::Topology), the source for this mapping is another mapping. + * This case requires, in certain ops (op_de_re_allocate()), to also hold the source's lock. + * That means lock is going to be under shared ownership and therefore the locks should be in a shared_ptr. + */ + uint32_t op_denom = 1; // tells the various operations what part of num_pages to include. + static inline std::vector > ops = { + {&Mapping::op_protect, "protect"}, + {&Mapping::op_wire, "wire"}, + {&Mapping::op_write, "write"}, + {&Mapping::op_unwire, "unwire"}, + {&Mapping::op_pageout, "pageout"}}; + /* + * The following is disabled due to a deadlock it causes in the kernel too frequently + * (and we want a running stress test). See rdar://146761078 + * Once this deadlock is solved, we should uncomment it. + */ + // {&Mapping::op_write_direct, "write_direct"}, +}; + +/** + * Creates and wraps the memory object + */ +class Object +{ +public: + // Default constructor: + Object() : id(0), num_pages(0) + { + } + + // Constructor: + Object( + uint32_t _id, uint32_t num_pages) + : id(_id), num_pages(num_pages) + { + } + + // Memeber functions: + + // Creation: + + int + open_file_slow_paging() + { + std::string slow_file = std::string(slow_dmg_path) + "/file.txt"; + fd = open(slow_file.c_str(), O_CREAT | O_RDWR, S_IWUSR | S_IRUSR); + if (fd < 0) { + throw std::runtime_error("open() failed. err=" + std::to_string(errno) + "\n"); + } + + T_LOG("File created in slow ramdisk: %s\n", slow_file.c_str()); + + return fd; + } + + int + open_file() + { + std::string template_str = "/tmp/some_file_" + std::to_string(id) + "XXXXXX"; + char template_filename[template_str.size() + 1]; + strcpy(template_filename, template_str.c_str()); + + fd = mkstemp(template_filename); + if (fd == -1) { + throw std::runtime_error("mkstemp failed. err=" + std::to_string(errno) + "\n"); + } + + T_LOG("Temporary file created: %s\n", template_filename); + + return fd; + } + + void + close_file() + { + close(fd); + fd = 0; + } + + int + create_source_from_file(bool slow_paging) + { + // File opening/creation: + int fd = 0; + struct stat st; + + if (slow_paging) { + fd = open_file_slow_paging(); + } else { + fd = open_file(); + } + + if (fd < 0) { + return fd; + } + + if (ftruncate(fd, num_pages * PAGE_SIZE) < 0) { + throw std::runtime_error("ftruncate failed. err=" + std::to_string(errno) + "\n"); + } + + // Mapping file to memory: + src = (mach_vm_address_t)mmap(NULL, num_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if ((void *)src == MAP_FAILED) { + throw std::runtime_error("mmap failed. 
err=" + std::to_string(errno) + "\n"); + } + + return 0; + } + + int + create_source_anon() + { + uint32_t anywhere_flag = TRUE; + kern_return_t kr = mach_vm_allocate(mach_task_self(), &src, num_pages * PAGE_SIZE, anywhere_flag); + if (kr != KERN_SUCCESS) { + throw std::runtime_error("mach_vm_allocate failed: " + std::string(mach_error_string(kr)) + "\n"); + } + return 0; + } + + int + create_source( + bool is_file, bool slow_paging) + { + if (is_file) { + return create_source_from_file(slow_paging); + } else { + return create_source_anon(); + } + } + + static uint64_t + random_object_size( + uint64_t obj_size) + { + uint32_t min_obj_size = 16; // (in pages) + return random_between(min_obj_size, obj_size); + } + + // Miscellaneous: + + void + print_object() + { + T_LOG(" -----------------------------------------------------------------------------"); + T_LOG(" OBJECT #%d, size: %llu pages, object address: %llx\n", id, num_pages, src); + } + + // Data members: + uint32_t id = 0; + uint64_t num_pages = 0; + mach_vm_address_t src = 0; + int fd = 0; + static inline char slow_dmg_path[] = "/Volumes/apfs-slow"; +}; + +/** + * Creates and manages the different mappings of an object. + */ +class MappingsManager +{ +public: + // Constructor: + MappingsManager( + const Object &_obj, MappingPolicy _policy) + : obj(_obj), policy(_policy) + { + } + + // Destructor: + ~MappingsManager() + { + for (uint32_t i = 0; i < ranges.size(); i++) { + if (buffers[i]) { + mach_vm_deallocate(mach_task_self(), (mach_vm_address_t)buffers[i], ranges[i].second - ranges[i].first + 2); + buffers[i] = nullptr; + } + } + } + + enum topology { + chain, + star, + ternary, + random + }; + + // Member functions: + + std::string + topo_to_string() + { + switch (topo) { + case chain: + return "chain"; + case star: + return "star"; + case ternary: + return "ternary"; + case random: + return "random"; + default: + return "unknown"; + } + } + + // Partition stuff: + + void + create_general_borders( + std::vector &general_borders) + { + uint64_t gap = obj.num_pages / (num_mappings); + general_borders.emplace_back(1); + for (uint32_t i = 1; i < (num_mappings); i++) { + general_borders.emplace_back(gap * i); + } + } + + void + create_borders( + std::vector &borders) + { + std::vector general_borders; + create_general_borders(general_borders); + borders.emplace_back(0); + + for (uint32_t i = 0; i < general_borders.size() - 1; i++) { + borders.emplace_back( + random_between(general_borders[i], general_borders[i + 1] - 1)); + } + borders.emplace_back(obj.num_pages); + } + + void + convert_borders_to_ranges( + std::vector &borders) + { + for (uint32_t i = 0; i < borders.size() - 1; ++i) { + ranges.emplace_back(borders[i], borders[i + 1] - 1); + } + } + + void + make_random_partition() + { + std::vector borders; + create_borders(borders); + convert_borders_to_ranges(borders); + } + + void + print_partition() + { + printf("| PARTITION:\t| "); + for (const auto &range : ranges) { + printf("%3d -- %3d", range.first, range.second); + } + printf("%*s|\n", 30, ""); + for (auto &m : mappings) { + m.print_mapping(); + } + } + + // Creation: + + void + create_seq(std::vector &seq) + { + seq.emplace_back(0); + for (uint32_t i = 1; i < num_mappings; i++) { + switch (topo) { + case chain: + seq.emplace_back(i); + break; + + case random: + seq.emplace_back(random_between(0, i)); + break; + + case star: + seq.emplace_back(0); + break; + + case ternary: + seq.emplace_back(i / 3); + break; + + default: + throw std::runtime_error("create_seq: topology 
undefined"); + break; + } + } + T_LOG("topology: %s", topo_to_string().c_str()); + } + + void + allocate_buffer( + uint64_t num_pages_to_alloc) + { + // buffers.emplace_back((char *)malloc((obj.num_pages + 1) * PAGE_SIZE)); // One extra page for a gap + mach_vm_address_t buff; + kern_return_t kr = mach_vm_allocate(mach_task_self(), &buff, num_pages_to_alloc * PAGE_SIZE, TRUE); + if (kr != KERN_SUCCESS) { + throw std::runtime_error("Failed to allocate buffer in object #" + std::to_string(obj.id) + "\n"); + } + buffers.push_back((char *)buff); + } + + void + initialize_partition_buffers() + { + for (auto &range : ranges) { + allocate_buffer(range.second - range.first + 2); + } + } + + MappingArgs + initialize_basic_args() + { + MappingArgs args; + args.arg_src_address = obj.src; + args.arg_copy = is_cow; + args.arg_flags = mpng_flags; + return args; + } + + void + map_by_seq(std::vector &seq) + { + // First mapping of the source object: + MappingArgs args = initialize_basic_args(); + allocate_buffer(obj.num_pages + 1); + args.arg_target_address = (mach_vm_address_t)(buffers[0] + PAGE_SIZE); + args.arg_mapping_size = obj.num_pages * PAGE_SIZE; + mappings.emplace_back(Mapping(1, 0, args, obj.fd)); + + // Re-mappings of the first mappings, according to the given seqence: + for (uint32_t i = 1; i < num_mappings; i++) { + allocate_buffer(obj.num_pages + 1); + args.arg_src_address = mappings[seq[i - 1]].args.arg_target_address; + args.arg_target_address = (mach_vm_address_t)(buffers[i]); + mappings.emplace_back(Mapping(i + 1, 0, args, obj.fd)); + mappings[seq[i - 1]].add_child(&mappings[i]); + mappings[i].set_src_mapping(mappings[seq[i - 1]]); + } + mappings[0].print_as_tree(); + } + + /* Mode 1 - maps parts of the object to parts of the (only) buffer. Every page is mapped exactly once. */ + void + map_by_random_partition() + { + make_random_partition(); + initialize_partition_buffers(); + MappingArgs args = initialize_basic_args(); + for (uint32_t i = 0; i < num_mappings; i++) { + args.arg_target_address = (mach_vm_address_t)(buffers[i] + PAGE_SIZE); + args.arg_mapping_size = (ranges[i].second - ranges[i].first + 1) * PAGE_SIZE; + mappings.emplace_back(Mapping(i + 1, ranges[i].first, args, obj.fd)); + } + } + + /* Modes 2,4 - maps the entire object to different buffers (which all have the same size as the object). */ + void + map_one_to_many( + bool extra) + { + uint32_t num_pages_for_gaps = extra ? 2 : 1; + MappingArgs args = initialize_basic_args(); + for (uint32_t i = 0; i < num_mappings; i++) { + allocate_buffer(obj.num_pages + num_pages_for_gaps); + args.arg_target_address = (mach_vm_address_t)(buffers[i] + PAGE_SIZE * num_pages_for_gaps); + args.arg_mapping_size = obj.num_pages * PAGE_SIZE; + mappings.emplace_back(Mapping(i + 1, 0, args, obj.fd)); + } + } + + /* Mode 3 - maps the source object in a certain CoW-topology, based on the given sequence. */ + void + map_topo() + { + std::vector seq; + create_seq(seq); + map_by_seq(seq); + } + + void + map() + { + switch (policy) { + case MappingPolicy::RandomPartition: + map_by_random_partition(); + break; + case MappingPolicy::OneToMany: + map_one_to_many(false); + break; + case MappingPolicy::Overwrite: + map_one_to_many(true); + break; + case MappingPolicy::Topology: + num_mappings *= 4; + mappings.reserve(num_mappings); + topo = static_cast((obj.id - 1) % 4); // Each object (out of every 4 consecutive objects) will be remapped in a different CoW topology. 
+ map_topo(); + break; + default: + break; + } + } + + void + set_srcs() + { + for (uint32_t i = 1; i < mappings.size(); i++) { + mappings[i].set_src_mapping(mappings[i - 1]); + } + } + + /* Overwrites the first n/x pages of each mapping */ + void + overwrite_mappings() + { + uint64_t num_pages_to_overwrite = obj.num_pages / overwrite_denom; + MappingArgs args = initialize_basic_args(); + for (uint32_t i = 0; i < num_mappings; i++) { + args.arg_target_address = (mach_vm_address_t)(buffers[i] + PAGE_SIZE); + args.arg_mapping_size = num_pages_to_overwrite * PAGE_SIZE; + mappings.emplace_back(Mapping(2 * i + 1, 0, args, obj.fd)); + mappings[i].fix_overwritten_mapping(num_pages_to_overwrite); + } + std::sort(mappings.begin(), mappings.end(), Mapping::compare_by_id); + set_srcs(); // set the src (parent) lock for each newly created mapping to facilitate op_de_re_allocate(). + } + + // "User space" validation: + + bool + validate_sum() + { + uint64_t sum = 0; + + for (const auto &mapping : mappings) { + sum += mapping.num_pages; + } + if (sum != obj.num_pages) { + return false; + } + return true; + } + + bool + validate_consecutiveness() + { + for (int i = 0; i < mappings.size() - 1; i++) { + if (mappings[i].offset_in_pages + mappings[i].num_pages != + mappings[i + 1].offset_in_pages) { + return false; + } + } + return true; + } + + bool + validate_start_and_end() + { + for (int i = 0; i < mappings.size() - 1; i++) { + if (mappings[i].offset_in_pages + mappings[i].num_pages != + mappings[i + 1].offset_in_pages) { + return false; + } + } + return true; + } + + bool + validate_all_sizes() + { + for (const auto &mapping : mappings) { + if (mapping.num_pages != obj.num_pages) { + return false; + } + } + return true; + } + + bool + validate_partition() + { + return validate_sum() && validate_consecutiveness() && validate_start_and_end(); + } + + bool + validate_one_to_many() + { + return validate_all_sizes(); + } + + bool + validate_user_space() + { + switch (policy) { + case MappingPolicy::RandomPartition: + return validate_partition(); + break; + case MappingPolicy::OneToMany: + return validate_one_to_many(); + break; + default: + return true; + break; + } + } + + // Miscellaneous: + + void + set_flags( + uint32_t flags) + { + mpng_flags = flags; + } + + void + set_is_cow( + bool _is_cow) + { + is_cow = _is_cow; + } + + void + print_all_mappings() + { + for (auto &mpng : mappings) { + mpng.print_mapping(); + } + } + + // Data members: + uint32_t num_mappings = 4; + static inline uint32_t overwrite_denom = 2; + /** + * Sets the part to overwrite in case MappingsManager::policy==MappingPolicy::Overwrite. + * It's the same for all of the mappings and has to be visible outside of the class for logging purposes. Therefore it's static. 
+ */ + Object obj; + std::vector mappings; + MappingPolicy policy = MappingPolicy::OneToMany; + std::vector buffers; + std::vector > ranges; + uint32_t mpng_flags = 0; + bool is_cow = false; + topology topo = topology::random; +}; + +class Memory +{ + using vm_op = std::function; + +public: + // Member functions: + + // Creation: + + int + create_objects( + uint32_t num_objects, uint64_t obj_size, MappingPolicy policy, bool is_file, bool is_cow, bool slow_paging) + { + for (uint32_t i = 1; i <= num_objects; i++) { + Object o(i, obj_size); + if (o.create_source(is_file, slow_paging) == 0) { + managers.emplace_back(std::make_unique(o, policy)); + } else { + throw std::runtime_error("Error creating source object #" + std::to_string(i) + "\n"); + } + } + return 0; + } + + void + create_mappings( + uint32_t flags, bool is_cow) + { + for (auto &mngr : managers) { + mngr->set_flags(flags); + mngr->set_is_cow(is_cow); + mngr->map(); + } + } + + void + close_all_files() + { + for (auto &mngr : managers) { + mngr->obj.close_file(); + } + } + + // Thread-related operations: + + bool + run_op_on_all_mappings( + const std::pair *op, uint32_t op_idx) + { + for (auto &mngr : managers) { + for (auto &m : mngr->mappings) { + if (m.run_op(op)) { + op_status_counters[op_idx].success++; + } else { + op_status_counters[op_idx].fail++; + } + } + } + return true; + } + + void + num2op( + std::pair *op, uint32_t thread_number) + { + op->first = Mapping::ops[thread_number % Mapping::ops.size()].first; + op->second = Mapping::ops[thread_number % Mapping::ops.size()].second; + } + + void + print_thread_started( + uint32_t thread_number, std::string thread_name) + { + uint32_t allowed_prints = Mapping::ops.size() * 3; + if (thread_number < allowed_prints) { + T_LOG("Starting thread: %s", thread_name.c_str()); + } else if (thread_number == allowed_prints) { + T_LOG("...\n"); + } + // Else: we've printed enough, don't make a mess on the console + } + + std::future + start_thread( + uint32_t thread_number) + { + uint32_t op_name_length = 16; // Just the length of the longest op name, for nicer printing of op_count + std::pair operation; + std::string thread_name; + uint32_t thread_number_remainder = thread_number / Mapping::ops.size(); + num2op(&operation, thread_number); + std::string operation_name_aligned = operation.second; // For nice printing only + if (operation_name_aligned.length() < op_name_length) { + operation_name_aligned = operation_name_aligned + std::string(op_name_length - operation_name_aligned.length(), ' '); // Pad if shorter than op_name_length + } + thread_name = operation_name_aligned + " #" + std::to_string(thread_number_remainder + 1); + + print_thread_started(thread_number, thread_name); + + return std::async(std::launch::async, [this, operation, thread_name, thread_number]() { /* lambda: */ + while (runner.state != TestRuntime::error && + runner.state != TestRuntime::complete) { + if (runner.state == TestRuntime::running) { + bool running = this->run_op_on_all_mappings(&operation, thread_number % Mapping::ops.size()); + if (!running) { + break; + } + } + } + }); + } + + void + start_ops( + uint32_t num_threads) + { + for (uint32_t i = 0; i < Mapping::ops.size(); i++) { + op_status_counters.emplace_back(0, 0); + } + + for (uint32_t i = 0; i < num_threads * Mapping::ops.size(); i++) { + futures.emplace_back(start_thread(i)); + } + } + + void + join_threads() + { + for (auto &f : futures) { + f.get(); // This replaces thread.join() in order to propogate the exceptions raised from non main threads 
+ } + } + + // Miscellaneous: + + void + print_mem_layout() + { + T_LOG("\nmemory layout:"); + uint32_t allowed_prints = 3; + for (uint32_t i = 0; i < managers.size() && i < allowed_prints; i++) { + managers[i]->obj.print_object(); + managers[i]->print_all_mappings(); + } + T_LOG(" -----------------------------------------------------------------------------"); + T_LOG("...\n"); + } + + void + print_op_counts() + { + for (uint32_t i = 0; i < Mapping::ops.size(); i++) { + T_LOG("%16s: successes %7d :|: fails: %7d", Mapping::ops[i].second.c_str(), op_status_counters[i].success, op_status_counters[i].fail); + } + } + + void + overwrite_all() + { + for (auto &mngr : managers) { + mngr->overwrite_mappings(); + } + } + + bool + validate() + { + for (auto &mngr : managers) { + if (!mngr->validate_user_space()) { + return false; + } + } + return true; + } + + void + print_test_result() + { + T_LOG("\ninner validation: OBJECTS AND MAPPINGS APPEAR %s", validate() ? "AS EXPECTED" : "*NOT* AS EXPECTED"); + } + + // Data members: + + std::vector > managers; + std::vector > futures; + static inline std::vector op_status_counters; +}; + +uint32_t +run_test( + const TestParams &tp) +{ + Memory memory; + uint32_t status; + + int src_created_successfully = memory.create_objects(tp.num_objects, tp.obj_size, tp.policy, tp.is_file, tp.is_cow, tp.slow_paging); + if (src_created_successfully != 0) { + throw std::runtime_error("problem with creating source objects\n"); + } + + memory.create_mappings(tp.mpng_flags, tp.is_cow); + memory.print_mem_layout(); + + if (tp.policy == MappingPolicy::Overwrite) { + memory.overwrite_all(); + T_LOG("1 / %d of each mapping got overwritten\n", MappingsManager::overwrite_denom); + memory.print_mem_layout(); + } + + memory.start_ops(tp.num_threads); + + status = runner.wait_for_status(tp.runtime_secs); + + memory.join_threads(); + memory.print_op_counts(); + memory.close_all_files(); + memory.print_test_result(); + + T_LOG("test finished\n"); + return status; +} + +void +try_catch_test(TestParams &tp) +{ + try + { + if (run_test(tp)) { + T_FAIL("Test failed"); + } else { + T_PASS("Test passed"); + } + } + + catch (const std::runtime_error &e) + { + T_FAIL("Caught a runtime error: %s", e.what()); + } +} + +void +print_help() +{ + printf("\n\nUsage: /vm_stress config -- [-s]\n\n"); + + printf(" Number of objects the test will create and work on\n"); + printf(" Size of each object (>=16)\n"); + printf(" Test duration in seconds\n"); + printf(" Number of threads to use for each operation\n"); + printf(" Policy for mapping (part/one_to_many/over/topo)\n"); + printf(" Copy-on-write flag (0 or 1)\n"); + printf(" File flag (0 or 1)\n\n"); +} + +void +string_to_policy( + MappingPolicy &policy, std::string policy_str) +{ + const std::map string_to_policy = + { + {"part", MappingPolicy::RandomPartition}, + {"one_to_many", MappingPolicy::OneToMany}, + {"over", MappingPolicy::Overwrite}, + {"topo", MappingPolicy::Topology}, + }; + + auto it = string_to_policy.find(policy_str); + + if (it != string_to_policy.end()) { + policy = it->second; + } else { + throw std::runtime_error("Invalid policy string: \"" + policy_str + "\"\n"); + } +} + +T_DECL(config, "configurable", T_META_ENABLED(false) /* rdar://142726486 */) +{ + bool slow_paging = false; + int opt; + + for (int i = 0; i < argc; i++) { + if (strcmp(argv[i], "-s") == 0) { + slow_paging = true; + } else if (strcmp(argv[i], "-h") == 0) { + print_help(); + T_PASS("help configs"); + return; + } + } + + if (argc == 0) { + printf("\n\n\nNo 
arguments for configurable test, assuming intention was to skip it.\n\n\n"); + T_PASS("config - no args given"); + return; + } + + if (argc != 7 && argc != 8) { + printf("\n\n\nWrong number of arguments.\n"); + printf("Usage: /vm_stress config -- \nPolicies: part/one_to_many/over/topo\n\n"); + printf("Run \"/vm_stress config -- -h\" for more info\n\n\n"); + T_PASS("config - not enough/too many args"); + return; + } + + std::string policy_str(argv[0]); + MappingPolicy policy; + string_to_policy(policy, policy_str); + + uint32_t num_objects = strtoul(argv[1], NULL, 0); + + uint64_t obj_size = strtoull(argv[2], NULL, 0); // In pages + + if (obj_size < 16) { + throw std::runtime_error("obj_size must be more than 16\n"); + } + + uint32_t runtime_secs = strtoul(argv[3], NULL, 0); + + uint32_t num_threads = strtoul(argv[4], NULL, 0); + + bool is_cow = strtoul(argv[5], NULL, 0); + + bool is_file = strtoul(argv[6], NULL, 0); + + TestParams params = { + .num_objects = num_objects, + .obj_size = obj_size, + .runtime_secs = runtime_secs, + .num_threads = num_threads, + .policy = policy, + .is_cow = is_cow, + .is_file = is_file, + .slow_paging = slow_paging}; + + try_catch_test(params); +} + +T_DECL(vm_stress1, "partitions") +{ + TestParams params = { + .num_objects = 5, + .obj_size = 32, + .runtime_secs = 3, + .num_threads = 2, + .policy = MappingPolicy::RandomPartition, + .is_cow = true, + .is_file = true, + .slow_paging = false}; + + try_catch_test(params); +} + +T_DECL(vm_stress2, "cow topologies") +{ + TestParams params = { + .num_objects = 10, + .obj_size = 32, + .runtime_secs = 4, + .num_threads = 4, + .policy = MappingPolicy::Topology, + .is_cow = true, + .is_file = true, + .slow_paging = false}; + + try_catch_test(params); +} + +T_DECL(vm_stress3, "overwrite") +{ + TestParams params = { + .num_objects = 10, + .obj_size = 16, + .runtime_secs = 3, + .num_threads = 2, + .policy = MappingPolicy::Overwrite, + .is_cow = true, + .is_file = true, + .slow_paging = false}; + + try_catch_test(params); +} + +T_DECL(vm_stress4, "partitions - not file-backed") +{ + TestParams params = { + .num_objects = 5, + .obj_size = 32, + .runtime_secs = 3, + .num_threads = 2, + .policy = MappingPolicy::RandomPartition, + .is_cow = true, + .is_file = false, + .slow_paging = false}; + + try_catch_test(params); +} + +T_DECL(vm_stress5, "cow topologies - not file-backed") +{ + TestParams params = { + .num_objects = 10, + .obj_size = 32, + .runtime_secs = 4, + .num_threads = 4, + .policy = MappingPolicy::Topology, + .is_cow = true, + .is_file = false, + .slow_paging = false}; + + try_catch_test(params); +} + +T_DECL(vm_stress6, "overwrite - not file-backed") +{ + TestParams params = { + .num_objects = 10, + .obj_size = 16, + .runtime_secs = 3, + .num_threads = 2, + .policy = MappingPolicy::Overwrite, + .is_cow = true, + .is_file = false, + .slow_paging = false}; + + try_catch_test(params); +} + +T_DECL(vm_stress7, "one to many - not CoW and not file-backed") +{ + TestParams params = { + .num_objects = 5, + .obj_size = 100, + .runtime_secs = 10, + .num_threads = 3, + .policy = MappingPolicy::OneToMany, + .is_cow = false, + .is_file = false, + .slow_paging = false}; + + try_catch_test(params); +} + +T_DECL(vm_stress_hole, "Test locking of ranges with holes in them.") +{ + uint32_t num_secs = 5; + uint32_t half_of_num_mappings = 5; // To ensure num_mappings is an even number. 
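+	// Map every other page so that each 2-page minherit() range below covers a
+	// mapped page followed by an unmapped hole.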
+ std::vector mappings; + mach_vm_address_t addr0; + mach_vm_allocate(mach_task_self(), &addr0, PAGE_SIZE, TRUE); + mappings.emplace_back(addr0); + for (uint32_t i = 1; i < half_of_num_mappings * 2; i++) { + mach_vm_address_t addri = addr0 + PAGE_SIZE * 2 * i; + mach_vm_allocate(mach_task_self(), &addri, PAGE_SIZE, FALSE); + mappings.emplace_back(addri); + } + auto start_time = std::chrono::steady_clock::now(); + auto end_time = start_time + std::chrono::seconds(num_secs); + uint32_t inheritance = 1; + int err = 0; + while (std::chrono::steady_clock::now() < end_time) { + for (uint32_t i = 0; i < half_of_num_mappings * 2; i += 2) { + if ((err = minherit((void *)mappings[i], 2 * PAGE_SIZE, inheritance % 2)) != 0) { + break; + } + } + if (err < 0) { + break; + } + inheritance++; + } + T_QUIET; + T_ASSERT_EQ_INT(err, 0, "all calls to minherit returned successfully"); + if (err == 0) { + T_PASS("HOLE LOCKING PASSED"); + } else { + T_FAIL("SOME ERROR IN MINHERIT, err=%d", err); + } +} diff --git a/tests/vm/vm_stress_slow.sh b/tests/vm/vm_stress_slow.sh new file mode 100755 index 000000000..be7380516 --- /dev/null +++ b/tests/vm/vm_stress_slow.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env zsh -e -u + + +# This is a script that creates a disk image with slow IO (a fake, artificial disk that lives on ram resources), +# and then runs the vm stress test with one single configuration on objects that are backed by files in that disk. +# In the end it's going to eject the newly created volume. + +eject_volumes() { + diskutil list | awk '/disk image/{print $1}' | tail -r | xargs -L1 diskutil eject +} + +trap eject_volumes EXIT + +# Default values for the flags +SIZE_MB=2048 +HELP=false +RAMDISK_MP="/Volumes/apfs-dmg" +SLOW_DMG="slow-dmg.dmg" +TYPE="ssd" +IOQUEUE_DEPTH=1 +ACCESS_TIME=$((1 << 18)) # in microseconds +READ_THROUGHPUT=1000 # in MB/s +WRITE_THROUGHPUT=1000 # in MB/s +MAX_READ_CNT=$((1 << 10)) # max bytes per read (1Kb) +MAX_WRITE_CNT=$((1 << 10)) # max bytes per write (1Kb) +SEG_READ_CNT=$((1 << 10)) +SEG_WRITE_CNT=$((1 << 10)) + + +show_help() { + echo "Usage: sudo $0 [options]" + echo + echo "Running this script will create a ramdisk with a disk image configured to run slower than usual, " + echo "and then run the vm_stress test on a file that comes from this disk image." + echo + echo "Options:" + echo " -h, --help Show this help message" + echo " -s, --speed Set paging speed (slower, slowerer, slowest)" + echo +} + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + HELP=true + shift + ;; + + -s|--speed|-S) + if [[ -z "$2" ]]; then + echo "Error: --speed requires a value (slower, slowerer, slowest)." + exit 1 + fi + case "$2" in + slower) + ((ACCESS_TIME = ACCESS_TIME * 2)) + shift 2 + ;; + slowerer) + ((ACCESS_TIME = ACCESS_TIME * 3)) + shift 2 + ;; + slowest) + ((ACCESS_TIME = ACCESS_TIME * 4)) + shift 2 + ;; + *) + echo "Error: Invalid speed option '$2'. Valid options are: slow, slower, slowest." + exit 1 + ;; + esac + ;; + + # Invalid option + *) + echo "Error: Invalid option '$1'. Use --help for usage." 
+ exit 1 + ;; + esac +done + +# Show help if requested +if $HELP; then + show_help + exit 0 +fi + +echo "Selected speed: access = $ACCESS_TIME" + + +diskutil list | awk '/disk image/{print $1}' | tail -r | xargs -L1 diskutil eject # start fresh with no extra volumes +sysctl debug.didevice_queue_depth=1 +ramdisk_device=$(diskutil image attach "ram://${SIZE_MB}m" | awk '{print $1}') # attach ("create, make visible and mount") disk image ("virtual disk") on RAM (just a disk with no file system) +diskutil eraseDisk apfs apfs-dmg "$ramdisk_device" # put a file system on it + +diskutil image create blank "$RAMDISK_MP/$SLOW_DMG" -size "$((SIZE_MB / 2))m" -volumeName apfs-slow # create another (seemingly regular) disk image ("virtual disk") in the new ramdisk +slow_di_device=$(diskutil image attach "$RAMDISK_MP/$SLOW_DMG" | awk 'END{print $1}') # attach it ("make it visible and mount") + +purge # delete all caches + +# configure IO to be slow on the newly created inner volume, and then apply (start): +dmc configure "$RAMDISK_MP" "$TYPE" "$ACCESS_TIME" "$READ_THROUGHPUT" "$WRITE_THROUGHPUT" "$IOQUEUE_DEPTH" "$MAX_READ_CNT" "$MAX_WRITE_CNT" "$SEG_READ_CNT" "$SEG_WRITE_CNT" +dmc start "$RAMDISK_MP/" + +# Now that the ramdisk exists, find and execute the test: +SCRIPT_DIR=$(dirname "$(realpath "$0")") +TEST_EXEC_DIR=$(find "$SCRIPT_DIR/../" -iname "vm_stress" -maxdepth 5 -print -quit) +"$TEST_EXEC_DIR" config -- topo 6 50 5 5 1 1 -s +"$TEST_EXEC_DIR" config -- over 6 50 5 5 1 1 -s +"$TEST_EXEC_DIR" config -- part 6 50 5 5 1 1 -s +"$TEST_EXEC_DIR" config -- one_to_many 6 50 5 5 1 1 -s +"$TEST_EXEC_DIR" config -- one_to_many 6 50 5 5 0 0 -s +dmc stop "$RAMDISK_MP/" \ No newline at end of file diff --git a/tests/vm/vm_sysctl_tests.c b/tests/vm/vm_sysctl_tests.c index 5e35f051b..80a8c0835 100644 --- a/tests/vm/vm_sysctl_tests.c +++ b/tests/vm/vm_sysctl_tests.c @@ -25,13 +25,6 @@ run_sysctl_test(const char *t, int64_t value) return result; } -T_DECL(vm_map_non_aligned, - "Test that we can destroy map unaligned mappings (rdar://88969652)", - T_META_TAG_VM_PREFERRED) -{ - T_EXPECT_EQ(1ull, run_sysctl_test("vm_map_non_aligned", 0), "vm_map_non_aligned"); -} - T_DECL(vm_map_null, "Test that we can call vm_map functions with VM_MAP_NULL", T_META_TAG_VM_PREFERRED) @@ -40,17 +33,6 @@ T_DECL(vm_map_null, T_EXPECT_EQ(1ull, result, "vm_map_null"); } -T_DECL(vm_memory_entry_pgz, - "Test that we can make a memory entry of a pgz protected allocation (rdar://122836976)", - T_META_TAG_VM_PREFERRED) -{ - int64_t result = run_sysctl_test("vm_memory_entry_pgz", 0); - if (result == 2) { - T_SKIP("Unable to pgz_protect allocation. 
Pgz slots might be full."); - } - T_EXPECT_EQ(1ull, result, "vm_memory_entry_pgz"); -} - T_DECL(vm_map_copy_entry_subrange, "Test mapping a subrange of a copy entry") { @@ -88,3 +70,15 @@ T_DECL(vm_page_radix_verify, "verify the vm pages radix tree") T_EXPECT_EQ(1ull, run_sysctl_test("vm_page_radix_verify", 0), "vm_page_radix_verify"); } #endif + +T_DECL(vm_map_4k_16k_copyout, + "Make sure vm_map_copyout from 4k->16k maps doesn't lead to address space holes") +{ + T_EXPECT_EQ(1ULL, run_sysctl_test("vm_map_4k_16k", 0), "vm_map_4k_16k_copyout"); +} + +T_DECL(vm_map_4k_16k_copy_overwrite, + "Make sure vm_map_copy_overwrite from 4k->16k maps doesn't lead to address space holes") +{ + T_EXPECT_EQ(1ULL, run_sysctl_test("vm_map_4k_16k", 1), "vm_map_4k_16k_copy_overwrite"); +} diff --git a/tests/vm/vm_user.c b/tests/vm/vm_user.c index 342744ab9..eb638e7dd 100644 --- a/tests/vm/vm_user.c +++ b/tests/vm/vm_user.c @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -66,7 +67,6 @@ get_permanent_mapping(mach_vm_size_t size) T_DECL(permanent_mapping, "check permanent mappings semantics", T_META_TAG_VM_PREFERRED) { - kern_return_t kr; mach_vm_size_t size = 1 << 20; struct child_rc rc; @@ -186,3 +186,16 @@ T_DECL(permanent_mapping, "check permanent mappings semantics", T_META_TAG_VM_PR }); T_EXPECT_EQ(rc.sig, SIGBUS, "accessing the mapping caused a SIGBUS"); } + +T_DECL(vm_tag_describe, + "test mach_vm_tag_describe()", + T_META_TAG_VM_PREFERRED) +{ + for (unsigned int i = 0; i <= VM_MEMORY_COUNT; i++) { + const char *desc = mach_vm_tag_describe(i); + T_LOG("%i: %s", i, desc); + T_ASSERT_NOTNULL(desc, "Tag description (%i) is non-null", i); + T_EXPECT_NE_STR(desc, "", "Tag description (%i) is non-empty", i); + T_EXPECT_LE(strlen(desc), 24UL, "Tag description must be less than 24 characters"); + } +} diff --git a/tests/vm/zalloc.c b/tests/vm/zalloc.c index 78bfc1761..9371a91a8 100644 --- a/tests/vm/zalloc.c +++ b/tests/vm/zalloc.c @@ -46,9 +46,9 @@ T_DECL(zone_gc_stress_test, "stress test for zone_gc", T_META_TAG_VM_PREFERRED) #define ZLOG_ZONE "data.kalloc.128" -T_DECL(zlog_smoke_test, "check that zlog functions at all", +T_DECL(zlog_smoke_test, "check that zlog and zone tagging function at all", T_META_REQUIRES_SYSCTL_NE("kern.kasan.available", 1), - T_META_BOOTARGS_SET("zlog1=" ZLOG_ZONE), T_META_TAG_VM_PREFERRED) + T_META_BOOTARGS_SET("-zt zlog1=" ZLOG_ZONE), T_META_TAG_VM_PREFERRED) { char *cmd[] = { "/usr/local/bin/zlog", "-l", "-z", ZLOG_ZONE, NULL }; dispatch_semaphore_t sema = dispatch_semaphore_create(0); diff --git a/tests/vm_test_mach_map.c b/tests/vm_test_mach_map.c index 83d2d2767..ee0b205b2 100644 --- a/tests/vm_test_mach_map.c +++ b/tests/vm_test_mach_map.c @@ -7,6 +7,7 @@ */ #include #include +#include "try_read_write.h" #include #include @@ -35,6 +36,14 @@ #include #include +#include + +#if __has_include() +#import // for os_security_config_get() +#endif /* __has_include() */ + +#include "test_utils.h" + T_GLOBAL_META( T_META_NAMESPACE("xnu.vm"), T_META_RADAR_COMPONENT_NAME("xnu"), @@ -2246,3 +2255,309 @@ T_DECL(mach_vm_remap_new_task_read_port, T_EXPECT_EQ_INT(ret, pid, "waitpid: child was stopped or terminated"); } } + + +static void +zsm_vm_map(size_t size, + int flags, + mem_entry_name_port_t port, + memory_object_offset_t offset, + boolean_t copy, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance, + mach_vm_address_t *out_addr, + size_t *out_size + ) +{ + mach_vm_address_t addr_info = 0; + if (!(flags & VM_FLAGS_ANYWHERE)) { + 
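+		/*
+		 * The helper always forces VM_FLAGS_ANYWHERE so the kernel picks the
+		 * address; the memory-entry tests below only depend on the mapping's
+		 * size, protections and backing object, not on a fixed address.
+		 */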
flags |= VM_FLAGS_ANYWHERE; + } + + cur_protection &= max_protection; + kern_return_t kr = mach_vm_map(mach_task_self(), &addr_info, size, 0, flags, port, offset, + copy, cur_protection, max_protection, inheritance); + T_ASSERT_MACH_SUCCESS(kr, "mach_vm_map"); + T_LOG("mapped memory at %llx", addr_info); + + *out_addr = addr_info; + *out_size = size; +} + +static mem_entry_name_port_t +zsm_vm_mach_make_memory_entry(mach_vm_address_t addr, size_t size, int flags, + mem_entry_name_port_t parent, + kern_return_t expected_kr, bool discard) +{ + T_LOG("making memory entry for addr=%llx size=%zx", addr, size); + mem_entry_name_port_t port = 0; + kern_return_t kr = mach_make_memory_entry(mach_task_self(), &size, addr, flags, &port, parent); + T_ASSERT_EQ(kr, expected_kr, "mach_make_memory_entry expected return %d", kr); + if (kr == KERN_SUCCESS) { + T_ASSERT_NE(port, 0, "got non zero port"); + if (discard) { + mach_port_deallocate(mach_task_self(), port); + port = 0; + } + } + return port; +} + +T_DECL(memory_entry_zero_sized, + "Test that creating a zero-sized memory-entry with parent fails correctly") +{ + mach_vm_address_t addr = 0; + size_t size = 0; + kern_return_t kr; + zsm_vm_map(0xa7c000, + 0, /* flags */ + 0, /* port */ + 0, /* offset */ + 0, /* copy */ + VM_PROT_EXECUTE, VM_PROT_EXECUTE, + 0x1, /* inheritance */ + &addr, &size); + mem_entry_name_port_t parent_entry = zsm_vm_mach_make_memory_entry(addr, size, 0, 0, KERN_SUCCESS, false); + + zsm_vm_mach_make_memory_entry(0, 0, 0, parent_entry, KERN_INVALID_ARGUMENT, true); + zsm_vm_mach_make_memory_entry(0, 1, 0, parent_entry, KERN_SUCCESS, true); + zsm_vm_mach_make_memory_entry(1, 0, 0, parent_entry, KERN_SUCCESS, true); + + zsm_vm_mach_make_memory_entry(PAGE_SIZE, 0, 0, parent_entry, KERN_INVALID_ARGUMENT, true); + zsm_vm_mach_make_memory_entry(PAGE_SIZE, 1, 0, parent_entry, KERN_SUCCESS, true); + zsm_vm_mach_make_memory_entry(PAGE_SIZE + 1, 0, 0, parent_entry, KERN_SUCCESS, true); + + kr = mach_port_deallocate(mach_task_self(), parent_entry); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_deallocate"); + kr = mach_vm_deallocate(mach_task_self(), addr, size); + T_ASSERT_MACH_SUCCESS(kr, "mach_vm_deallocate"); +} + +T_DECL(memory_entry_null_obj, + "Test creating a memory-entry with null vm_object") +{ + mach_vm_address_t addr = 0; + size_t size = 0; + kern_return_t kr = 0; + uint8_t value = 0; + + // create an allocation with vm_object == NULL + zsm_vm_map(0x604000, /* size */ + 0, /* flags */ + 0, /* port */ + 0, /* offset */ + TRUE, /* copy */ + VM_PROT_NONE, VM_PROT_NONE, + 0x0, /* inheritance */ + &addr, &size); + + // verify it's NONE + bool read_success = try_read_byte(addr, &value, &kr); + T_ASSERT_FALSE(read_success, "can't read from NONE address"); + bool write_succeded = try_write_byte(addr, 42, &kr); + T_ASSERT_FALSE(write_succeded, "can't write to NONE address"); + + // size 0 entry of the allocated memory - should fail + zsm_vm_mach_make_memory_entry(addr, /*size=*/ 0, /*flags=*/ 0x0, /*parent=*/ 0x0, KERN_INVALID_ARGUMENT, true); + + // trying to get a 'copy' entry of a PROT_NONE entry + zsm_vm_mach_make_memory_entry(addr, size, /*flags=*/ 0x0, /*parent=*/ 0x0, KERN_PROTECTION_FAILURE, true); + + // get a 'share' entry of a PROT_NONE entry and remap it + mem_entry_name_port_t np = zsm_vm_mach_make_memory_entry(addr, size, MAP_MEM_VM_SHARE, 0x0, KERN_SUCCESS, false); + + mach_vm_address_t m_addr = 0; + size_t m_size = 0; + zsm_vm_map(size, + 0, /* size */ + np, + 0, /* offset */ + FALSE, /* copy */ + VM_PROT_NONE, VM_PROT_NONE, + 
0x0, /* inheritance */ + &m_addr, &m_size); + + // try to accessremapped area + read_success = try_read_byte(m_addr, &value, &kr); + T_ASSERT_FALSE(read_success, "can't read from NONE address"); + write_succeded = try_write_byte(m_addr, 42, &kr); + T_ASSERT_FALSE(write_succeded, "can't write to NONE address"); + + kr = mach_port_deallocate(mach_task_self(), np); + T_ASSERT_MACH_SUCCESS(kr, "mach_port_deallocate"); + kr = mach_vm_deallocate(mach_task_self(), addr, size); + T_ASSERT_MACH_SUCCESS(kr, "mach_vm_deallocate"); + kr = mach_vm_deallocate(mach_task_self(), m_addr, m_size); + T_ASSERT_MACH_SUCCESS(kr, "mach_vm_deallocate mapped"); +} + +#if __arm64e__ +#define TARGET_CPU_ARM64E true +#else +#define TARGET_CPU_ARM64E false +#endif + +T_DECL(vm_region_recurse_tpro_info, + "Ensure metadata returned by vm_region_recurse correct reflects TPRO status", + T_META_ENABLED(TARGET_CPU_ARM64E), + XNU_T_META_SOC_SPECIFIC, + T_META_ASROOT(true)) +{ + T_SETUPBEGIN; + + /* First things first, do nothing unless we're TPRO enabled */ + if (!(os_security_config_get() & OS_SECURITY_CONFIG_TPRO)) { + T_SKIP("Skipping because we're not running under TPRO"); + return; + } + + /* Given an allocation from dyld's heap */ + const char* tpro_allocation = _dyld_get_image_name(0); + + /* And an allocation from our own heap (which is not TPRO) */ + mach_vm_address_t non_tpro_allocation; + mach_vm_size_t alloc_size = PAGE_SIZE; + kern_return_t kr = mach_vm_allocate( + mach_task_self(), + &non_tpro_allocation, + alloc_size, + VM_FLAGS_ANYWHERE ); + T_ASSERT_MACH_SUCCESS(kr, "Allocated non-TPRO region"); + /* (And write to it to be sure we populate a VM object) */ + memset((uint8_t*)non_tpro_allocation, 0, alloc_size); + + T_SETUPEND; + + /* When we query the attributes of the region covering the TPRO-enabled buffer */ + mach_vm_address_t addr = (mach_vm_address_t)tpro_allocation; + mach_vm_size_t addr_size = 16; + uint32_t nesting_depth = UINT_MAX; + mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; + vm_region_submap_info_data_64_t region_info; + kr = vm_region_recurse_64(mach_task_self(), (vm_address_t*)&addr, (vm_size_t*)&addr_size, &nesting_depth, (vm_region_recurse_info_t)®ion_info, &count); + + /* Then our metadata confirms that the region contains a TPRO entry */ + T_ASSERT_MACH_SUCCESS(kr, "Query TPRO-enabled region"); + T_ASSERT_TRUE(region_info.flags & VM_REGION_FLAG_TPRO_ENABLED, "Expected metadata to reflect a TPRO entry"); + + /* And when we query the same thing via the 'short' info */ + addr = (mach_vm_address_t)tpro_allocation; + addr_size = alloc_size; + nesting_depth = UINT_MAX; + count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; + vm_region_submap_short_info_data_64_t short_info; + kr = mach_vm_region_recurse(mach_task_self(), (mach_vm_address_t*)&addr, (mach_vm_size_t*)&addr_size, &nesting_depth, (vm_region_info_t)&short_info, &count); + + /* Then the short metadata also confirms that the region contains a TPRO entry */ + T_ASSERT_MACH_SUCCESS(kr, "Query TPRO-enabled region"); + T_ASSERT_TRUE(short_info.flags & VM_REGION_FLAG_TPRO_ENABLED, "Expected metadata to reflect a TPRO entry"); + + /* And when we query the attributes of the region covering the non-TPRO allocation */ + addr = non_tpro_allocation; + addr_size = alloc_size; + nesting_depth = UINT_MAX; + count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; + memset(®ion_info, 0, sizeof(region_info)); + kr = mach_vm_region_recurse(mach_task_self(), (mach_vm_address_t*)&addr, (mach_vm_size_t*)&addr_size, &nesting_depth, 
(vm_region_info_t)®ion_info, &count); + + /* Then our metadata confirm that the region does not contain a TPRO entry */ + T_ASSERT_MACH_SUCCESS(kr, "Query non-TPRO region"); + T_ASSERT_FALSE(region_info.flags & VM_REGION_FLAG_TPRO_ENABLED, "Expected metadata to reflect no TPRO entry"); + + /* And when we query the same thing via the 'short' info */ + addr = non_tpro_allocation; + addr_size = alloc_size; + nesting_depth = UINT_MAX; + count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; + memset(&short_info, 0, sizeof(short_info)); + kr = mach_vm_region_recurse(mach_task_self(), (mach_vm_address_t*)&addr, (mach_vm_size_t*)&addr_size, &nesting_depth, (vm_region_info_t)&short_info, &count); + + /* Then the short metadata also confirms that the region does not contain a TPRO entry */ + T_ASSERT_MACH_SUCCESS(kr, "Query non-TPRO region"); + T_ASSERT_FALSE(short_info.flags & VM_REGION_FLAG_TPRO_ENABLED, "Expected metadata to reflect no TPRO entry"); + + /* Cleanup */ + kr = mach_vm_deallocate(mach_task_self(), non_tpro_allocation, alloc_size); + T_ASSERT_MACH_SUCCESS(kr, "deallocate memory"); +} + +T_DECL(vm_region_recurse_jit_info, + "Ensure metadata returned by vm_region_recurse correct reflects JIT status", + XNU_T_META_SOC_SPECIFIC, + /* Only attempt to run on macOS so we don't need to worry about JIT policy */ + T_META_ENABLED(TARGET_OS_OSX), + T_META_ASROOT(true)) +{ + T_SETUPBEGIN; + + /* Given a JIT region */ + mach_vm_size_t alloc_size = PAGE_SIZE * 4; + void* jit_region = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0); + T_ASSERT_NE_PTR(jit_region, MAP_FAILED, "MAP_JIT"); + + /* And a non-JIT region */ + mach_vm_address_t non_jit_allocation; + kern_return_t kr = mach_vm_allocate( + mach_task_self(), + &non_jit_allocation, + alloc_size, + VM_FLAGS_ANYWHERE); + T_ASSERT_MACH_SUCCESS(kr, "Allocated non-JIT region"); + /* (And write to it to be sure we populate a VM object) */ + memset((uint8_t*)non_jit_allocation, 0, alloc_size); + + T_SETUPEND; + + /* When we query the attributes of the region covering the JIT-enabled buffer */ + mach_vm_address_t addr = (mach_vm_address_t)jit_region; + mach_vm_size_t addr_size = alloc_size; + uint32_t nesting_depth = UINT_MAX; + mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; + vm_region_submap_info_data_64_t region_info; + kr = vm_region_recurse_64(mach_task_self(), (vm_address_t*)&addr, (vm_size_t*)&addr_size, &nesting_depth, (vm_region_recurse_info_t)®ion_info, &count); + + /* Then our metadata confirms that the region contains a JIT entry */ + T_ASSERT_MACH_SUCCESS(kr, "Query JIT-enabled region"); + T_ASSERT_TRUE(region_info.flags & VM_REGION_FLAG_JIT_ENABLED, "Expected metadata to reflect a JIT entry"); + + /* And when we query the same thing via the 'short' info */ + addr = (mach_vm_address_t)jit_region; + addr_size = alloc_size; + nesting_depth = UINT_MAX; + count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; + vm_region_submap_short_info_data_64_t short_info; + kr = mach_vm_region_recurse(mach_task_self(), (mach_vm_address_t*)&addr, (mach_vm_size_t*)&addr_size, &nesting_depth, (vm_region_info_t)&short_info, &count); + + /* Then the short metadata also confirms that the region contains a JIT entry */ + T_ASSERT_MACH_SUCCESS(kr, "Query JIT-enabled region"); + T_ASSERT_TRUE(short_info.flags & VM_REGION_FLAG_JIT_ENABLED, "Expected metadata to reflect a JIT entry"); + + /* And when we query the attributes of the region covering the non-JIT allocation */ + addr = non_jit_allocation; + addr_size = 
alloc_size; + nesting_depth = UINT_MAX; + count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; + memset(®ion_info, 0, sizeof(region_info)); + kr = mach_vm_region_recurse(mach_task_self(), (mach_vm_address_t*)&addr, (mach_vm_size_t*)&addr_size, &nesting_depth, (vm_region_info_t)®ion_info, &count); + + /* Then our metadata confirm that the region does not contain a JIT entry */ + T_ASSERT_MACH_SUCCESS(kr, "Query non-JIT region"); + T_ASSERT_FALSE(region_info.flags & VM_REGION_FLAG_JIT_ENABLED, "Expected metadata to reflect no JIT entry"); + + /* And when we query the same thing via the 'short' info */ + addr = non_jit_allocation; + addr_size = alloc_size; + nesting_depth = UINT_MAX; + count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; + memset(&short_info, 0, sizeof(short_info)); + kr = mach_vm_region_recurse(mach_task_self(), (mach_vm_address_t*)&addr, (mach_vm_size_t*)&addr_size, &nesting_depth, (vm_region_info_t)&short_info, &count); + + /* Then the short metadata also confirms that the region does not contain a JIT entry */ + T_ASSERT_MACH_SUCCESS(kr, "Query non-JIT region"); + T_ASSERT_FALSE(short_info.flags & VM_REGION_FLAG_JIT_ENABLED, "Expected metadata to reflect no JIT entry"); + + /* Cleanup */ + kr = mach_vm_deallocate(mach_task_self(), non_jit_allocation, alloc_size); + T_ASSERT_MACH_SUCCESS(kr, "deallocate memory"); +} diff --git a/tests/vm_test_mach_map.plist b/tests/vm_test_mach_map.plist index 594fd29fd..c0c864c54 100644 --- a/tests/vm_test_mach_map.plist +++ b/tests/vm_test_mach_map.plist @@ -2,7 +2,15 @@ - com.apple.system-task-ports.read - + com.apple.system-task-ports.read + + com.apple.security.cs.allow-jit + + + + com.apple.private.set-exception-port + + com.apple.private.amfi.can-set-exception-ports + - \ No newline at end of file + diff --git a/tests/vsock.c b/tests/vsock.c index 87dcd5adf..e2c4ec18b 100644 --- a/tests/vsock.c +++ b/tests/vsock.c @@ -26,171 +26,13 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include -#include -#include -#include -#include -#include - -#include -#include - -#define COUNT_ELEMS(array) (sizeof (array) / sizeof (array[0])) +#include T_GLOBAL_META( T_META_RUN_CONCURRENTLY(true), T_META_NAMESPACE("xnu.vsock") ); -static int -vsock_new_socket(void) -{ - int sock = socket(AF_VSOCK, SOCK_STREAM, 0); - if (sock < 0 && errno == ENODEV) { - T_SKIP("no vsock transport available"); - } - T_ASSERT_GT(sock, 0, "create new vsock socket"); - return sock; -} - -static uint32_t -vsock_get_local_cid(int socket) -{ - uint32_t cid = 0; - int result = ioctl(socket, IOCTL_VM_SOCKETS_GET_LOCAL_CID, &cid); - T_ASSERT_POSIX_SUCCESS(result, "vsock ioctl cid successful"); - T_ASSERT_GT(cid, VMADDR_CID_HOST, "cid is set"); - T_ASSERT_NE(cid, VMADDR_CID_ANY, "cid is valid"); - - return cid; -} - -static int -vsock_bind(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int *socket) -{ - *socket = vsock_new_socket(); - - bzero(addr, sizeof(*addr)); - addr->svm_port = port; - addr->svm_cid = cid; - - return bind(*socket, (struct sockaddr *) addr, sizeof(*addr)); -} - -static int -vsock_listen(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int backlog, int *socket) -{ - int result = vsock_bind(cid, port, addr, socket); - T_ASSERT_POSIX_SUCCESS(result, "vsock bind"); - return listen(*socket, backlog); -} - -static int -vsock_connect(uint32_t cid, uint32_t port, int *socket) -{ - *socket = vsock_new_socket(); - struct sockaddr_vm addr = (struct sockaddr_vm) { - .svm_cid = cid, - .svm_port = port, - }; - return connect(*socket, (struct sockaddr *)&addr, sizeof(addr)); 
-} - -static struct sockaddr_vm -vsock_getsockname(int socket) -{ - struct sockaddr_vm addr; - socklen_t length = sizeof(addr); - int result = getsockname(socket, (struct sockaddr *)&addr, &length); - T_ASSERT_POSIX_SUCCESS(result, "vsock getsockname"); - T_ASSERT_EQ_INT((int) sizeof(addr), length, "correct address length"); - T_ASSERT_GT(addr.svm_port, 0, "bound to non-zero local port"); - return addr; -} - -static void -vsock_close(int socket) -{ - int result = close(socket); - T_ASSERT_POSIX_SUCCESS(result, "vsock close"); -} - -static void -vsock_connect_peers(uint32_t cid, uint32_t port, int backlog, int *socketA, int *socketB) -{ - // Listen. - struct sockaddr_vm addr; - int listen_socket; - int result = vsock_listen(cid, port, &addr, backlog, &listen_socket); - T_ASSERT_POSIX_SUCCESS(result, "vsock listen"); - - const uint32_t connection_cid = vsock_get_local_cid(listen_socket); - - // Connect. - int connect_socket; - result = vsock_connect(connection_cid, addr.svm_port, &connect_socket); - T_ASSERT_POSIX_SUCCESS(result, "vsock connect"); - - // Accept. - struct sockaddr_vm accepted_addr; - socklen_t addrlen = sizeof(accepted_addr); - int accepted_socket = accept(listen_socket, (struct sockaddr *)&accepted_addr, &addrlen); - T_ASSERT_GT(accepted_socket, 0, "accepted socket"); - T_ASSERT_EQ_INT((int) sizeof(accepted_addr), addrlen, "correct address length"); - T_ASSERT_EQ_INT(connection_cid, accepted_addr.svm_cid, "same cid"); - T_ASSERT_NE_INT(VMADDR_CID_ANY, accepted_addr.svm_port, "some valid port"); - T_ASSERT_NE_INT(0, accepted_addr.svm_port, "some non-zero port"); - - *socketA = connect_socket; - *socketB = accepted_socket; -} - -static void -vsock_send(int socket, char *msg) -{ - T_ASSERT_NOTNULL(msg, "send message is not null"); - ssize_t sent_bytes = send(socket, msg, strlen(msg), 0); - T_ASSERT_EQ_LONG(strlen(msg), (unsigned long)sent_bytes, "sent all bytes"); -} - -static void -vsock_disable_sigpipe(int socket) -{ - int on = 1; - int result = setsockopt(socket, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on)); - T_ASSERT_POSIX_SUCCESS(result, "vsock disable SIGPIPE"); -} - -static bool -vsock_address_exists(struct xvsockpgen *buffer, struct sockaddr_vm addr) -{ - struct xvsockpgen *xvg = buffer; - struct xvsockpgen *oxvg = buffer; - - bool found = false; - for (xvg = (struct xvsockpgen *)((char *)xvg + xvg->xvg_len); - xvg->xvg_len > sizeof(struct xvsockpgen); - xvg = (struct xvsockpgen *)((char *)xvg + xvg->xvg_len)) { - struct xvsockpcb *xpcb = (struct xvsockpcb *)xvg; - - /* Ignore PCBs which were freed during copyout. 
*/ - if (xpcb->xvp_gencnt > oxvg->xvg_gen) { - continue; - } - - if (xpcb->xvp_local_cid == addr.svm_cid && xpcb->xvp_remote_cid == VMADDR_CID_ANY && - xpcb->xvp_local_port == addr.svm_port && xpcb->xvp_remote_port == VMADDR_PORT_ANY) { - found = true; - break; - } - } - - T_ASSERT_NE(xvg, oxvg, "first and last xvsockpgen were returned"); - - return found; -} - /* New Socket */ T_DECL(new_socket_getsockname, "vsock new - getsockname") @@ -241,7 +83,8 @@ T_DECL(bind, "vsock bind to specific port") { int socket; struct sockaddr_vm addr; - int result = vsock_bind(VMADDR_CID_ANY, 8888, &addr, &socket); + const uint32_t port = vsock_get_available_port(); + int result = vsock_bind(VMADDR_CID_ANY, port, &addr, &socket); T_ASSERT_POSIX_SUCCESS(result, "vsock bind to specific port"); } @@ -311,7 +154,7 @@ T_DECL(bind_zero, "vsock bind to port zero", T_META_ASROOT(true)) T_DECL(bind_double, "vsock double bind") { const uint32_t cid = VMADDR_CID_ANY; - const uint32_t port = 8899; + const uint32_t port = vsock_get_available_port(); int socket; struct sockaddr_vm addr; @@ -325,7 +168,7 @@ T_DECL(bind_double, "vsock double bind") T_DECL(bind_same, "vsock bind same address and port") { const uint32_t cid = VMADDR_CID_ANY; - const uint32_t port = 3399; + const uint32_t port = vsock_get_available_port(); int socket; struct sockaddr_vm addr; @@ -339,7 +182,7 @@ T_DECL(bind_same, "vsock bind same address and port") T_DECL(bind_port_reuse, "vsock bind port reuse") { const uint32_t cid = VMADDR_CID_ANY; - const uint32_t port = 9111; + const uint32_t port = vsock_get_available_port(); int socket; struct sockaddr_vm addr; @@ -376,43 +219,19 @@ T_DECL(bind_privileged_root, "vsock bind on privileged port - root", T_META_ASRO T_DECL(bind_no_family, "vsock bind with unspecified family") { - int socket = vsock_new_socket(); - - struct sockaddr_vm addr = (struct sockaddr_vm) { - .svm_family = AF_UNSPEC, - .svm_cid = VMADDR_CID_ANY, - .svm_port = 7321, - }; - - int result = bind(socket, (struct sockaddr *) &addr, sizeof(addr)); + int result = vsock_bind_family(AF_UNSPEC); T_ASSERT_POSIX_SUCCESS(result, "vsock bind with unspecified family"); } T_DECL(bind_vsock_family, "vsock bind with vsock family") { - int socket = vsock_new_socket(); - - struct sockaddr_vm addr = (struct sockaddr_vm) { - .svm_family = AF_VSOCK, - .svm_cid = VMADDR_CID_ANY, - .svm_port = 7322, - }; - - int result = bind(socket, (struct sockaddr *) &addr, sizeof(addr)); + int result = vsock_bind_family(AF_VSOCK); T_ASSERT_POSIX_SUCCESS(result, "vsock bind with vsock family"); } T_DECL(bind_wrong_family, "vsock bind with wrong family") { - int socket = vsock_new_socket(); - - struct sockaddr_vm addr = (struct sockaddr_vm) { - .svm_family = AF_INET, - .svm_cid = VMADDR_CID_ANY, - .svm_port = 7323, - }; - - int result = bind(socket, (struct sockaddr *) &addr, sizeof(addr)); + int result = vsock_bind_family(AF_INET); T_ASSERT_POSIX_FAILURE(result, EAFNOSUPPORT, "vsock bind with wrong family"); } @@ -422,7 +241,8 @@ T_DECL(listen, "vsock listen on specific port") { struct sockaddr_vm addr; int socket; - int result = vsock_listen(VMADDR_CID_ANY, 8889, &addr, 10, &socket); + const uint32_t port = vsock_get_available_port(); + int result = vsock_listen(VMADDR_CID_ANY, port, &addr, 10, &socket); T_ASSERT_POSIX_SUCCESS(result, "vsock listen"); } @@ -450,9 +270,7 @@ T_DECL(connect_non_listening_host, "vsock connect to non-listening host port") T_ASSERT_POSIX_FAILURE(result, EAGAIN, "vsock connect non-listening host port"); } 
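/*
 * The hard-coded vsock ports in this file are replaced with
 * vsock_get_available_port() from the new tests/vsock_helpers.c: the helper
 * binds a probe socket to VMADDR_PORT_ANY, reads the kernel-assigned port
 * back with getsockname(), closes the probe socket and returns that port, so
 * concurrently running tests no longer collide on fixed port numbers.
 * A minimal usage sketch, using only helpers added by this patch:
 *
 *     const uint32_t port = vsock_get_available_port();
 *     struct sockaddr_vm addr;
 *     int listen_socket;
 *     int result = vsock_listen(VMADDR_CID_ANY, port, &addr, 10, &listen_socket);
 *     T_ASSERT_POSIX_SUCCESS(result, "vsock listen on an unused port");
 */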
-T_DECL(connect_non_listening_hypervisor, "vsock connect to non-listening hypervisor port", - T_META_ENABLED(false /* rdar://133461431 */) - ) +T_DECL(connect_non_listening_hypervisor, "vsock connect to non-listening hypervisor port") { int socket; int result = vsock_connect(VMADDR_CID_HYPERVISOR, 4444, &socket); @@ -479,9 +297,10 @@ T_DECL(connect_timeout, "vsock connect with timeout") int result = setsockopt(socket, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout)); T_ASSERT_POSIX_SUCCESS(result, "vsock set socket timeout"); + const uint32_t port = vsock_get_available_port(); struct sockaddr_vm addr = (struct sockaddr_vm) { .svm_cid = VMADDR_CID_HOST, - .svm_port = 4321, + .svm_port = port, }; result = connect(socket, (struct sockaddr *)&addr, sizeof(addr)); T_ASSERT_POSIX_FAILURE(result, ETIMEDOUT, "vsock connect timeout"); @@ -491,7 +310,7 @@ T_DECL(connect_non_blocking, "vsock connect non-blocking") { int socket = vsock_new_socket(); - const uint32_t port = 4321; + const uint32_t port = vsock_get_available_port(); const uint32_t cid = vsock_get_local_cid(socket); // Listen. @@ -536,7 +355,8 @@ T_DECL(shutdown_not_connected, "vsock shutdown - not connected") T_DECL(shutdown_reads, "vsock shutdown - reads") { int socketA, socketB; - vsock_connect_peers(VMADDR_CID_ANY, 8989, 10, &socketA, &socketB); + const uint32_t port = vsock_get_available_port(); + vsock_connect_peers(VMADDR_CID_ANY, port, 10, &socketA, &socketB); char *msg = "This is test message.\n"; @@ -562,7 +382,8 @@ T_DECL(shutdown_reads, "vsock shutdown - reads") T_DECL(shutdown_writes, "vsock shutdown - writes") { int socketA, socketB; - vsock_connect_peers(VMADDR_CID_ANY, 8787, 10, &socketA, &socketB); + const uint32_t port = vsock_get_available_port(); + vsock_connect_peers(VMADDR_CID_ANY, port, 10, &socketA, &socketB); char *msg = "This is test message.\n"; @@ -595,7 +416,8 @@ T_DECL(shutdown_writes, "vsock shutdown - writes") T_DECL(shutdown_both, "vsock shutdown - both") { int socketA, socketB; - vsock_connect_peers(VMADDR_CID_ANY, 8686, 10, &socketA, &socketB); + const uint32_t port = vsock_get_available_port(); + vsock_connect_peers(VMADDR_CID_ANY, port, 10, &socketA, &socketB); char *msg = "This is test message.\n"; char buffer[1024] = {0}; @@ -638,7 +460,8 @@ T_DECL(shutdown_both, "vsock shutdown - both") T_DECL(talk_self, "vsock talk to self") { int socketA, socketB; - vsock_connect_peers(VMADDR_CID_ANY, 4545, 10, &socketA, &socketB); + const uint32_t port = vsock_get_available_port(); + vsock_connect_peers(VMADDR_CID_ANY, port, 10, &socketA, &socketB); char buffer[1024] = {0}; @@ -662,7 +485,8 @@ T_DECL(talk_self, "vsock talk to self") T_DECL(talk_self_double, "vsock talk to self - double sends") { int socketA, socketB; - vsock_connect_peers(VMADDR_CID_ANY, 4646, 10, &socketA, &socketB); + const uint32_t port = vsock_get_available_port(); + vsock_connect_peers(VMADDR_CID_ANY, port, 10, &socketA, &socketB); char buffer[1024] = {0}; @@ -692,7 +516,8 @@ T_DECL(talk_self_double, "vsock talk to self - double sends") T_DECL(talk_self_early_close, "vsock talk to self - peer closes early") { int socketA, socketB; - vsock_connect_peers(VMADDR_CID_ANY, 4646, 10, &socketA, &socketB); + const uint32_t port = vsock_get_available_port(); + vsock_connect_peers(VMADDR_CID_ANY, port, 10, &socketA, &socketB); char *msg = "This is a message."; vsock_send(socketA, msg); @@ -710,7 +535,7 @@ T_DECL(talk_self_early_close, "vsock talk to self - peer closes early") T_DECL(talk_self_connections, "vsock talk to self - too many 
connections") { - const uint32_t port = 4747; + const uint32_t port = vsock_get_available_port(); const int backlog = 1; struct sockaddr_vm listen_addr; @@ -741,7 +566,8 @@ T_DECL(talk_self_connections, "vsock talk to self - too many connections") T_DECL(talk_self_large_writes, "vsock talk to self with large writes") { int socketA, socketB; - vsock_connect_peers(VMADDR_CID_ANY, 4848, 10, &socketA, &socketB); + const uint32_t port = vsock_get_available_port(); + vsock_connect_peers(VMADDR_CID_ANY, port, 10, &socketA, &socketB); size_t size = 65536 * 4; char buffer[65536 * 4] = {0}; @@ -774,7 +600,8 @@ T_DECL(vsock_pcblist_simple, "vsock pcblist sysctl - simple") // Create some socket to discover in the pcblist. struct sockaddr_vm addr; int socket; - int result = vsock_listen(VMADDR_CID_ANY, 88899, &addr, 10, &socket); + const uint32_t port = vsock_get_available_port(); + int result = vsock_listen(VMADDR_CID_ANY, port, &addr, 10, &socket); T_ASSERT_POSIX_SUCCESS(result, "vsock listen on a port"); // Get the buffer length for the pcblist. @@ -814,7 +641,8 @@ T_DECL(vsock_pcblist_added, "vsock pcblist sysctl - socket added") // Create some socket to discover in the pcblist after making the first sysctl. struct sockaddr_vm addr; int socket; - result = vsock_listen(VMADDR_CID_ANY, 77799, &addr, 10, &socket); + const uint32_t port = vsock_get_available_port(); + result = vsock_listen(VMADDR_CID_ANY, port, &addr, 10, &socket); T_ASSERT_POSIX_SUCCESS(result, "vsock listen on a port"); // Allocate the buffer. @@ -838,7 +666,8 @@ T_DECL(vsock_pcblist_removed, "vsock pcblist sysctl - socket removed") // Create some socket to be removed after making the first sysctl. struct sockaddr_vm addr; int socket; - int result = vsock_listen(VMADDR_CID_ANY, 66699, &addr, 10, &socket); + const uint32_t port = vsock_get_available_port(); + int result = vsock_listen(VMADDR_CID_ANY, port, &addr, 10, &socket); T_ASSERT_POSIX_SUCCESS(result, "vsock listen on a port"); // Get the buffer length for the pcblist. @@ -866,3 +695,18 @@ T_DECL(vsock_pcblist_removed, "vsock pcblist sysctl - socket removed") free(buffer); } + +T_DECL(vsock_private_connect_without_entitlement, "vsock private connect should fail without entitlement") +{ + int socket; + int result = vsock_private_connect(VMADDR_CID_HOST, 1234, &socket); + T_ASSERT_POSIX_FAILURE(result, EPERM, "vsock connect without entitlement"); +} + +T_DECL(vsock_private_bind_without_entitlement, "vsock private bind should fail without entitlement") +{ + int socket; + struct sockaddr_vm addr; + int result = vsock_private_bind(VMADDR_CID_ANY, 1234, &addr, &socket); + T_ASSERT_POSIX_FAILURE(result, EPERM, "vsock bind without entitlement"); +} diff --git a/tests/vsock_entitled.c b/tests/vsock_entitled.c new file mode 100644 index 000000000..f7ae92510 --- /dev/null +++ b/tests/vsock_entitled.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +T_GLOBAL_META( + T_META_RUN_CONCURRENTLY(true), + T_META_NAMESPACE("xnu.vsock") + ); + +T_DECL(vsock_private_connect_with_entitlement, "vsock private connect should succeed with entitlement") +{ + const uint32_t port = 1234; + + struct sockaddr_vm listen_addr; + int listen_socket; + int result = vsock_private_listen(VMADDR_CID_ANY, port, &listen_addr, 1, &listen_socket); + T_ASSERT_POSIX_SUCCESS(result, "vsock listen with entitlement"); + + const uint32_t connection_cid = vsock_get_local_cid(listen_socket); + + int connected_socket = vsock_private_new_socket(); + struct sockaddr_vm addr = (struct sockaddr_vm) { + .svm_cid = connection_cid, + .svm_port = port, + }; + result = connect(connected_socket, (struct sockaddr *)&addr, sizeof(addr)); + T_ASSERT_POSIX_SUCCESS(result, "vsock connection with entitlement"); + + vsock_close(connected_socket); + vsock_close(listen_socket); +} + +T_DECL(vsock_private_bind_with_entitlement, "vsock private bind should succeed with entitlement") +{ + int socket; + struct sockaddr_vm addr; + int result = vsock_private_bind(VMADDR_CID_ANY, 1234, &addr, &socket); + T_ASSERT_POSIX_SUCCESS(result, "vsock bind with entitlement"); +} diff --git a/tests/vsock_entitlements.plist b/tests/vsock_entitlements.plist new file mode 100644 index 000000000..fbb8542e2 --- /dev/null +++ b/tests/vsock_entitlements.plist @@ -0,0 +1,8 @@ + + + + + com.apple.private.vsock + + + diff --git a/tests/vsock_helpers.c b/tests/vsock_helpers.c new file mode 100644 index 000000000..acc2fd53a --- /dev/null +++ b/tests/vsock_helpers.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +static int +_vsock_new_socket(uint16_t protocol) +{ + int sock = socket(AF_VSOCK, SOCK_STREAM, protocol); + if (sock < 0 && errno == ENODEV) { + T_SKIP("no vsock transport available"); + } + return sock; +} + +int +vsock_new_socket(void) +{ + int sock = _vsock_new_socket(VSOCK_PROTO_STANDARD); + T_ASSERT_GT(sock, 0, "create new vsock socket"); + return sock; +} + +int +vsock_private_new_socket(void) +{ + int sock = _vsock_new_socket(VSOCK_PROTO_PRIVATE); + T_ASSERT_GT(sock, 0, "create new private vsock socket"); + return sock; +} + +uint32_t +vsock_get_local_cid(int socket) +{ + uint32_t cid = 0; + int result = ioctl(socket, IOCTL_VM_SOCKETS_GET_LOCAL_CID, &cid); + T_ASSERT_POSIX_SUCCESS(result, "vsock ioctl cid successful"); + T_ASSERT_GT(cid, VMADDR_CID_HOST, "cid is set"); + T_ASSERT_NE(cid, VMADDR_CID_ANY, "cid is valid"); + + return cid; +} + +static int +_vsock_bind(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int *socket, uint16_t protocol) +{ + int sock = _vsock_new_socket(protocol); + if (sock < 0) { + return sock; + } + T_ASSERT_GT(sock, 0, "create new vsock socket"); + *socket = sock; + + bzero(addr, sizeof(*addr)); + addr->svm_port = port; + addr->svm_cid = cid; + + return bind(*socket, (struct sockaddr *) addr, sizeof(*addr)); +} + +int +vsock_bind(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int *socket) +{ + return _vsock_bind(cid, port, addr, socket, VSOCK_PROTO_STANDARD); +} + +int +vsock_private_bind(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int *socket) +{ + return _vsock_bind(cid, port, addr, socket, VSOCK_PROTO_PRIVATE); +} + +int +vsock_listen(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int backlog, int *socket) +{ + int result = vsock_bind(cid, port, addr, socket); + T_ASSERT_POSIX_SUCCESS(result, "vsock bind"); + return listen(*socket, backlog); +} + +int +vsock_private_listen(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int backlog, int *socket) +{ + int result = vsock_private_bind(cid, port, addr, socket); + T_ASSERT_POSIX_SUCCESS(result, "private vsock bind"); + return listen(*socket, backlog); +} + +static int +_vsock_connect(uint32_t cid, uint32_t port, int *socket, uint16_t protocol) +{ + int sock = _vsock_new_socket(protocol); + if (sock < 0) { + return sock; + } + T_ASSERT_GT(sock, 0, "create new vsock socket"); + *socket = sock; + + struct sockaddr_vm addr = (struct sockaddr_vm) { + .svm_cid = cid, + .svm_port = port, + }; + return connect(*socket, (struct sockaddr *)&addr, sizeof(addr)); +} + +int +vsock_connect(uint32_t cid, uint32_t port, int *socket) +{ + return _vsock_connect(cid, port, socket, VSOCK_PROTO_STANDARD); +} + +int +vsock_private_connect(uint32_t cid, uint32_t port, int *socket) +{ + return _vsock_connect(cid, port, socket, VSOCK_PROTO_PRIVATE); +} + +struct sockaddr_vm +vsock_getsockname(int socket) +{ + struct sockaddr_vm addr; + socklen_t length = sizeof(addr); + int result = getsockname(socket, (struct sockaddr *)&addr, &length); + 
T_ASSERT_POSIX_SUCCESS(result, "vsock getsockname"); + T_ASSERT_EQ_INT((int) sizeof(addr), length, "correct address length"); + T_ASSERT_GT(addr.svm_port, 0, "bound to non-zero local port"); + return addr; +} + +void +vsock_close(int socket) +{ + int result = close(socket); + T_ASSERT_POSIX_SUCCESS(result, "vsock close"); +} + +void +vsock_connect_peers(uint32_t cid, uint32_t port, int backlog, int *socketA, int *socketB) +{ + // Listen. + struct sockaddr_vm addr; + int listen_socket; + int result = vsock_listen(cid, port, &addr, backlog, &listen_socket); + T_ASSERT_POSIX_SUCCESS(result, "vsock listen"); + + const uint32_t connection_cid = vsock_get_local_cid(listen_socket); + + // Connect. + int connect_socket; + result = vsock_connect(connection_cid, addr.svm_port, &connect_socket); + T_ASSERT_POSIX_SUCCESS(result, "vsock connect"); + + // Accept. + struct sockaddr_vm accepted_addr; + socklen_t addrlen = sizeof(accepted_addr); + int accepted_socket = accept(listen_socket, (struct sockaddr *)&accepted_addr, &addrlen); + T_ASSERT_GT(accepted_socket, 0, "accepted socket"); + T_ASSERT_EQ_INT((int) sizeof(accepted_addr), addrlen, "correct address length"); + T_ASSERT_EQ_INT(connection_cid, accepted_addr.svm_cid, "same cid"); + T_ASSERT_NE_INT(VMADDR_CID_ANY, accepted_addr.svm_port, "some valid port"); + T_ASSERT_NE_INT(0, accepted_addr.svm_port, "some non-zero port"); + + *socketA = connect_socket; + *socketB = accepted_socket; +} + +void +vsock_send(int socket, char *msg) +{ + T_ASSERT_NOTNULL(msg, "send message is not null"); + ssize_t sent_bytes = send(socket, msg, strlen(msg), 0); + T_ASSERT_EQ_LONG(strlen(msg), (unsigned long)sent_bytes, "sent all bytes"); +} + +void +vsock_disable_sigpipe(int socket) +{ + int on = 1; + int result = setsockopt(socket, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on)); + T_ASSERT_POSIX_SUCCESS(result, "vsock disable SIGPIPE"); +} + +bool +vsock_address_exists(struct xvsockpgen *buffer, struct sockaddr_vm addr) +{ + struct xvsockpgen *xvg = buffer; + struct xvsockpgen *oxvg = buffer; + + bool found = false; + for (xvg = (struct xvsockpgen *)((char *)xvg + xvg->xvg_len); + xvg->xvg_len > sizeof(struct xvsockpgen); + xvg = (struct xvsockpgen *)((char *)xvg + xvg->xvg_len)) { + struct xvsockpcb *xpcb = (struct xvsockpcb *)xvg; + + /* Ignore PCBs which were freed during copyout. 
*/ + if (xpcb->xvp_gencnt > oxvg->xvg_gen) { + continue; + } + + if (xpcb->xvp_local_cid == addr.svm_cid && xpcb->xvp_remote_cid == VMADDR_CID_ANY && + xpcb->xvp_local_port == addr.svm_port && xpcb->xvp_remote_port == VMADDR_PORT_ANY) { + found = true; + break; + } + } + + T_ASSERT_NE(xvg, oxvg, "first and last xvsockpgen were returned"); + + return found; +} + +uint32_t +vsock_get_available_port(void) +{ + int socket; + struct sockaddr_vm addr; + int result = vsock_bind(VMADDR_CID_ANY, VMADDR_PORT_ANY, &addr, &socket); + T_ASSERT_POSIX_SUCCESS(result, "vsock bind to any port"); + + const struct sockaddr_vm bound_addr = vsock_getsockname(socket); + const uint32_t port = bound_addr.svm_port; + T_ASSERT_NE_INT(port, VMADDR_PORT_ANY, "port is specified"); + + vsock_close(socket); + + return port; +} + +int +vsock_bind_family(sa_family_t family) +{ + int socket = vsock_new_socket(); + const uint32_t port = vsock_get_available_port(); + + struct sockaddr_vm addr = (struct sockaddr_vm) { + .svm_family = family, + .svm_cid = VMADDR_CID_ANY, + .svm_port = port, + }; + + return bind(socket, (struct sockaddr *) &addr, sizeof(addr)); +} diff --git a/tests/vsock_helpers.h b/tests/vsock_helpers.h new file mode 100644 index 000000000..94bc978cc --- /dev/null +++ b/tests/vsock_helpers.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef VSOCK_HELPERS_H +#define VSOCK_HELPERS_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define COUNT_ELEMS(array) (sizeof (array) / sizeof (array[0])) + +int +vsock_new_socket(void); + +int +vsock_private_new_socket(void); + +uint32_t +vsock_get_local_cid(int socket); + +int +vsock_bind(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int *socket); + +int +vsock_private_bind(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int *socket); + +int +vsock_listen(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int backlog, int *socket); + +int +vsock_private_listen(uint32_t cid, uint32_t port, struct sockaddr_vm * addr, int backlog, int *socket); + +int +vsock_connect(uint32_t cid, uint32_t port, int *socket); + +int +vsock_private_connect(uint32_t cid, uint32_t port, int *socket); + +struct sockaddr_vm +vsock_getsockname(int socket); + +void +vsock_close(int socket); + +void +vsock_connect_peers(uint32_t cid, uint32_t port, int backlog, int *socketA, int *socketB); + +void +vsock_send(int socket, char *msg); + +void +vsock_disable_sigpipe(int socket); + +bool +vsock_address_exists(struct xvsockpgen *buffer, struct sockaddr_vm addr); + +uint32_t +vsock_get_available_port(void); + +int +vsock_bind_family(sa_family_t family); + +#endif /* VSOCK_HELPERS_H */ diff --git a/tests/x18_entitled.c b/tests/x18_entitled.c index ec9b149af..301c043a4 100644 --- a/tests/x18_entitled.c +++ b/tests/x18_entitled.c @@ -26,10 +26,12 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include "context_helpers.h" #include -#include #include #include +#include +#include T_GLOBAL_META( T_META_NAMESPACE("xnu.arm"), @@ -46,12 +48,23 @@ T_DECL(x18_entitled, #ifndef __arm64__ T_SKIP("Running on non-arm64 target, skipping..."); #else + bool did_csw = false; uint64_t x18_val; + for (uint64_t i = 0xFEEDB0B000000000ULL; i < 0xFEEDB0B000000000ULL + 10000; ++i) { asm volatile ("mov x18, %0" : : "r"(i)); - sched_yield(); + int32_t const nr_csw = get_csw_count(); + int const rc = usleep(10); + int32_t const nr_csw_after = get_csw_count(); + + // There isn't any guarantee usleep() will actually context switch so this is a best effort way + // to see if we've switched at least once in all these iterations. 
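+ // get_csw_count() comes from the shared "context_helpers.h" header these
+ // tests now include; it is assumed here to return the calling thread's
+ // cumulative context-switch count, so any increase across the usleep()
+ // shows the thread was switched out (and x18 saved and restored) at least once.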
+ did_csw = did_csw || (nr_csw_after > nr_csw); + T_QUIET; T_ASSERT_EQ(0, rc, "usleep"); asm volatile ("mov %0, x18" : "=r"(x18_val)); T_QUIET; T_ASSERT_EQ(x18_val, i, "check that x18 reads back correctly after yield"); } + + T_QUIET; T_ASSERT_TRUE(did_csw, "did not context switch, but should have."); #endif } diff --git a/tests/x18_legacy.c b/tests/x18_legacy.c index 1d43701d6..e5bcb4d70 100644 --- a/tests/x18_legacy.c +++ b/tests/x18_legacy.c @@ -26,10 +26,12 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include "context_helpers.h" #include -#include #include #include +#include +#include T_GLOBAL_META( T_META_NAMESPACE("xnu.arm"), @@ -45,12 +47,23 @@ T_DECL(x18_legacy, #ifndef __arm64__ T_SKIP("Running on non-arm64 target, skipping..."); #else + bool did_csw = false; uint64_t x18_val; + for (uint64_t i = 0xFEEDB0B000000000ULL; i < 0xFEEDB0B000000000ULL + 10000; ++i) { asm volatile ("mov x18, %0" : : "r"(i)); - sched_yield(); + int32_t const nr_csw = get_csw_count(); + int const rc = usleep(10); + int32_t const nr_csw_after = get_csw_count(); + + // There isn't any guarantee usleep() will actually context switch so this is a best effort way + // to see if we've switched at least once in all these iterations. + did_csw = did_csw || (nr_csw_after > nr_csw); + T_QUIET; T_ASSERT_EQ(0, rc, "usleep"); asm volatile ("mov %0, x18" : "=r"(x18_val)); T_QUIET; T_ASSERT_EQ(x18_val, i, "check that x18 reads back correctly after yield"); } + + T_QUIET; T_ASSERT_TRUE(did_csw, "did not context switch, but should have."); #endif } diff --git a/tests/x18_unentitled.c b/tests/x18_unentitled.c index 5154f6139..4d17ce99b 100644 --- a/tests/x18_unentitled.c +++ b/tests/x18_unentitled.c @@ -26,10 +26,12 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include "context_helpers.h" #include -#include #include #include +#include +#include T_GLOBAL_META( T_META_NAMESPACE("xnu.arm"), @@ -45,12 +47,23 @@ T_DECL(x18_unentitled, #ifndef __arm64__ T_SKIP("Running on non-arm64 target, skipping..."); #else + bool did_csw = false; uint64_t x18_val; + for (uint64_t i = 0xFEEDB0B000000000ULL; i < 0xFEEDB0B000000000ULL + 10000; ++i) { asm volatile ("mov x18, %0" : : "r"(i)); - sched_yield(); + int32_t const nr_csw = get_csw_count(); + int const rc = usleep(10); + int32_t const nr_csw_after = get_csw_count(); + + // There isn't any guarantee usleep() will actually context switch so this is a best effort way + // to see if we've switched at least once in all these iterations. + did_csw = did_csw || (nr_csw_after > nr_csw); + T_QUIET; T_ASSERT_EQ(0, rc, "usleep"); asm volatile ("mov %0, x18" : "=r"(x18_val)); T_QUIET; T_ASSERT_EQ(x18_val, 0ULL, "check that x18 is cleared after yield"); } + + T_QUIET; T_ASSERT_TRUE(did_csw, "did not context switch, but should have."); #endif } diff --git a/tools/cocci/remove-cassert.cocci b/tools/cocci/remove-cassert.cocci new file mode 100644 index 000000000..b003871e8 --- /dev/null +++ b/tools/cocci/remove-cassert.cocci @@ -0,0 +1,11 @@ +// To apply, at the top of xnu.git: +// $ spatch --max-width=120 --use-gitgrep --in-place --include-headers --sp-file tools/cocci/remove-cassert.cocci -dir . + +@@ +expression E; +@@ + +( +- _CASSERT(E) ++ static_assert(E) +) diff --git a/tools/lldbmacros/core/__init__.py b/tools/lldbmacros/core/__init__.py index 3f61e71fd..db7acaa37 100755 --- a/tools/lldbmacros/core/__init__.py +++ b/tools/lldbmacros/core/__init__.py @@ -4,3 +4,4 @@ Core classes and functions used for lldb kernel debugging. 
from .cvalue import value, gettype, getfieldoffset from .standard import xnu_format, xnu_vformat, SBValueFormatter from .iterators import * +from .kernelcore import OSHashPointer, OSHashU64 diff --git a/tools/lldbmacros/core/kernelcore.py b/tools/lldbmacros/core/kernelcore.py index 1cff3bd78..706c029c5 100755 --- a/tools/lldbmacros/core/kernelcore.py +++ b/tools/lldbmacros/core/kernelcore.py @@ -11,6 +11,10 @@ from .caching import ( cache_statically, ) from utils import * +from ctypes import ( + c_uint64, + c_int64, +) import lldb @@ -486,13 +490,13 @@ class KernelTarget(object): if self.arch != 'arm64e': return addr T0Sz = self.GetGlobalVariable('gT0Sz') - return StripPAC(addr, T0Sz) + return CanonicalAddress(addr, T0Sz) def StripKernelPAC(self, addr): if self.arch != 'arm64e': return addr T1Sz = self.GetGlobalVariable('gT1Sz') - return StripPAC(addr, T1Sz) + return CanonicalAddress(addr, T1Sz) PAGE_PROTECTION_TYPE_NONE = 0 PAGE_PROTECTION_TYPE_PPL = 1 @@ -667,3 +671,24 @@ class KernelTarget(object): return 0xffffff8000000000 - 0x80000000 else: return 0xffffffe00000000 + +def _swap32(i): + return struct.unpack("I", i))[0] + +def OSHashPointer(ptr): + h = c_uint64(c_int64(int(ptr) << 16).value >> 20).value + h *= 0x5052acdb + h &= 0xffffffff + return (h ^ _swap32(h)) & 0xffffffff + +def OSHashU64(u64): + u64 = c_uint64(int(u64)).value + u64 ^= (u64 >> 31) + u64 *= 0x7fb5d329728ea185 + u64 &= 0xffffffffffffffff + u64 ^= (u64 >> 27) + u64 *= 0x81dadef4bc2dd44d + u64 &= 0xffffffffffffffff + u64 ^= (u64 >> 33) + + return u64 & 0xffffffff diff --git a/tools/lldbmacros/ioreg.py b/tools/lldbmacros/ioreg.py index a462f7f3d..1db751e93 100755 --- a/tools/lldbmacros/ioreg.py +++ b/tools/lldbmacros/ioreg.py @@ -550,13 +550,17 @@ def GetIOPMWorkQueueSummary(wq): ioservicepm_header = "{:<20s}{:<4s}{:<4s}{:<4s}{:<4s}\n" iopmrequest_indent = " " iopmrequest_header = iopmrequest_indent + "{:<20s}{:<6s}{:<20s}{:<20s}{:<12s}{:<12s}{:<20s}{:<20s}{:<20s}\n" + head = kern.StripKernelPAC(addressof(wq.fWorkQueue)) + head = kern.GetValueFromAddress(head, 'queue_head_t *') - for next in IterateQueue(wq.fWorkQueue, 'IOServicePM *', 'WorkChain'): + for next in IterateQueue(head, 'IOServicePM *', 'WorkChain'): out_str += ioservicepm_header.format("IOService", "ps", "ms", "wr", "name") out_str += "0x{:<16x} {:<2d} {:<2d} {:<2d} {:= GetEnumValue("ipc_object_type_t", "__IKOT_FIRST") + + +def IPCPortTypeToString(ty): + s = GetEnumName("ipc_object_type_t", ty, "IKOT_") + if s.startswith("IOT_"): + s = s[4:].replace("_PORT", "") + return s.lower() + + +def IsPortType(ty, name): + return ty == GetEnumValue("ipc_object_type_t", name) + + +def IsPortSetType(ty): + return IsPortType(ty, "IOT_PORT_SET") + + +def GetPortLabel(port, ty): + addr = kern.StripKernelPAC(port.xGetScalarByPath(".ip_object.iol_pointer")) + return port.xCreateValueFromAddress(None, addr, gettype(ty)) + + +@lldb_type_summary(["struct ipc_entry_table *", "ipc_entry_table_t"]) +def PrintIpcEntryTable(array): + t, s = kalloc_array_decode(array, "struct ipc_entry") + return "ptr = {:#x}, size = {:d}, elem_type = struct ipc_entry".format( + unsigned(t), s + ) + + +@lldb_type_summary(["struct ipc_port_requests_table *", "ipc_port_requests_table_t"]) def PrintIpcPortRequestTable(array): - t, s = kalloc_array_decode(array, 'struct ipc_port_requests') - return "ptr = {:#x}, size = {:d}, elem_type = struct ipc_port_requests".format(unsigned(t), s) + t, s = kalloc_array_decode(array, "struct ipc_port_requests") + return "ptr = {:#x}, size = {:d}, elem_type = struct 
ipc_port_requests".format( + unsigned(t), s + ) + def GetSpaceTable(space): - """ Return the tuple of (entries, size) of the table for a space - """ + """Return the tuple of (entries, size) of the table for a space""" table = space.is_table.__smr_ptr if table: - return kalloc_array_decode(table, 'struct ipc_entry') + return kalloc_array_decode(table, "struct ipc_entry") return (None, 0) + def GetSpaceEntriesWithBits(is_tableval, num_entries, mask): base = is_tableval.GetSBValue().Dereference() return ( (index, iep) for index, iep in enumerate(base.xIterSiblings(1, num_entries), 1) - if iep.xGetIntegerByName('ie_bits') & mask + if iep.xGetIntegerByName("ie_bits") & mask ) + def GetSpaceObjectsWithBits(is_tableval, num_entries, mask, ty): base = is_tableval.GetSBValue().Dereference() return ( iep.xCreateValueFromAddress( None, - iep.xGetIntegerByName('ie_object'), + iep.xGetIntegerByName("ie_object"), ty, ) for iep in base.xIterSiblings(1, num_entries) - if iep.xGetIntegerByName('ie_bits') & mask + if iep.xGetIntegerByName("ie_bits") & mask ) -@header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <32s}".format("task", "pid", '#acts', "tablesize", "command")) -def GetTaskIPCSummary(task, show_busy = False): - """ Display a task's ipc summary. - params: - task : core.value represeting a Task in kernel - returns - str - string of ipc info for the task +def GetGenFromIEBits(ie_bits): + return (ie_bits >> 24) | 3 + + +def GetNameFromIndexAndIEBits(index, ie_bits): + return (index << 8) | GetGenFromIEBits(ie_bits) + + +@header( + "{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <32s}".format( + "task", "pid", "#acts", "tablesize", "command" + ) +) +def GetTaskIPCSummary(task, show_busy=False): + """Display a task's ipc summary. + params: + task : core.value represeting a Task in kernel + returns + str - string of ipc info for the task """ - out_string = '' + out_string = "" format_string = "{0: <#20x} {1: <6d} {2: <6d} {3: <10d} {4: <32s}" busy_format = " {0: <10d} {1: <6d}" - proc_name = '' + proc_name = "" if not task.active: - proc_name = 'terminated: ' + proc_name = "terminated: " if task.halting: - proc_name += 'halting: ' + proc_name += "halting: " proc_name += GetProcNameForTask(task) _, table_size = GetSpaceTable(task.itk_space) - out_string += format_string.format(task, GetProcPIDForTask(task), task.thread_count, table_size, proc_name) + out_string += format_string.format( + task, GetProcPIDForTask(task), task.thread_count, table_size, proc_name + ) if show_busy: nbusy, nmsgs = GetTaskBusyPortsSummary(task) out_string += busy_format.format(nbusy, nmsgs) return (out_string, table_size, nbusy, nmsgs) return (out_string, table_size) -@header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <32s} {5: <10s} {6: <6s}".format("task", "pid", '#acts', "tablesize", "command", "#busyports", "#kmsgs")) + +@header( + "{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <32s} {5: <10s} {6: <6s}".format( + "task", "pid", "#acts", "tablesize", "command", "#busyports", "#kmsgs" + ) +) def GetTaskBusyIPCSummary(task): return GetTaskIPCSummary(task, True) + def GetTaskBusyPortsSummary(task): is_tableval, num_entries = GetSpaceTable(task.itk_space) - port_ty = gettype('struct ipc_port') + gettype("struct ipc_port") nbusy = 0 nmsgs = 0 if is_tableval: - ports = GetSpaceObjectsWithBits(is_tableval, num_entries, 0x00020000, - gettype('struct ipc_port')) + ports = GetSpaceObjectsWithBits( + is_tableval, num_entries, 0x00020000, gettype("struct ipc_port") + ) for port in ports: if not port or port == xnudefines.MACH_PORT_DEAD: continue - 
count = port.xGetIntegerByPath('.ip_messages.imq_msgcount') + count = port.xGetIntegerByPath(".ip_messages.imq_msgcount") if count: nbusy += 1 nmsgs += count @@ -106,53 +164,69 @@ def GetTaskBusyPortsSummary(task): return (nbusy, nmsgs) -@header("{:<20s} {:<20s} {:<10s} {:>6s} {:<20s} {:>8s} {:<20s} {:s}".format( - "port", "waitqueue", "recvname", "refs", "receiver", "nmsgs", "service", "dest/kobject")) +@header( + "{:<20s} {:<20s} {:<10s} {:>6s} {:<20s} {:>8s} {:<20s} {:s}".format( + "port", + "waitqueue", + "recvname", + "refs", + "receiver", + "nmsgs", + "kind", + "dest/kobject", + ) +) def PrintPortSummary(port, show_kmsg_summary=True, show_sets=False, prefix="", O=None): - """ Display a port's summary - params: - port : core.value representing a port in the kernel - returns - str : string of ipc info for the given port + """Display a port's summary + params: + port : core.value representing a port in the kernel + returns + str : string of ipc info for the given port """ format_string = "{:<#20x} {:<#20x} {:#010x} {:>6d} {:<#20x} {:>8d} {:<20s} {:ith_voucher->iv_port) + disp_str = "V" ## Thread voucher (thread->ith_voucher->iv_port) ## Catch-all else: - disp_str = 'X' ## invalid + disp_str = "X" ## invalid return disp_str + def GetPortPDRequest(port): - """ Returns the port-destroyed notification port if any - """ + """Returns the port-destroyed notification port if any""" if port.ip_has_watchport: return port.ip_twe.twe_pdrequest - if not port.ip_specialreply: + if not IsPortType(port.ip_object.io_type, "IOT_SPECIAL_REPLY_PORT"): return port.ip_pdrequest return 0 + def GetKmsgHeader(kmsgp): - """ Helper to get mach message header of a kmsg. + """Helper to get mach message header of a kmsg. Assumes the kmsg has not been put to user. params: kmsgp : core.value representing the given ipc_kmsg_t struct returns: Mach message header for kmsgp """ - if kmsgp.ikm_type == GetEnumValue('ipc_kmsg_type_t', 'IKM_TYPE_ALL_INLINED'): - return kern.GetValueFromAddress(int(addressof(kmsgp.ikm_big_data)), 'mach_msg_header_t *') - if kmsgp.ikm_type == GetEnumValue('ipc_kmsg_type_t', 'IKM_TYPE_UDATA_OOL'): - return kern.GetValueFromAddress(int(addressof(kmsgp.ikm_small_data)), 'mach_msg_header_t *') - return kern.GetValueFromAddress(unsigned(kmsgp.ikm_kdata), 'mach_msg_header_t *') + if kmsgp.ikm_type == GetEnumValue("ipc_kmsg_type_t", "IKM_TYPE_ALL_INLINED"): + return kern.GetValueFromAddress( + int(addressof(kmsgp.ikm_big_data)), "mach_msg_header_t *" + ) + if kmsgp.ikm_type == GetEnumValue("ipc_kmsg_type_t", "IKM_TYPE_UDATA_OOL"): + return kern.GetValueFromAddress( + int(addressof(kmsgp.ikm_small_data)), "mach_msg_header_t *" + ) + return kern.GetValueFromAddress(unsigned(kmsgp.ikm_kdata), "mach_msg_header_t *") -@header("{:<20s} {:<20s} {:<20s} {:<10s} {:>6s} {:<20s} {:<8s} {:<26s} {:<26s}".format( - "", "kmsg", "header", "msgid", "size", "reply-port", "disp", "source", "destination")) + +@header( + "{:<20s} {:<20s} {:<20s} {:<10s} {:>6s} {:<20s} {:<8s} {:<26s} {:<12s} {:<20s}".format( + "", + "kmsg", + "header", + "msgid", + "size", + "reply-port", + "disp", + "source", + "destname", + "destination", + ) +) def GetKMsgSummary(kmsgp, prefix_str=""): - """ Display a summary for type ipc_kmsg_t - params: - kmsgp : core.value representing the given ipc_kmsg_t struct - returns: - str : string of summary info for the given ipc_kmsg_t instance + """Display a summary for type ipc_kmsg_t + params: + kmsgp : core.value representing the given ipc_kmsg_t struct + returns: + str : string of summary info for the 
given ipc_kmsg_t instance """ kmsghp = GetKmsgHeader(kmsgp) kmsgh = dereference(kmsghp) out_string = "" out_string += "{:<20s} {:<#20x} {:<#20x} {kmsgh.msgh_id:#010x} {kmsgh.msgh_size:>6d} {kmsgh.msgh_local_port:<#20x} ".format( - '', unsigned(kmsgp), unsigned(kmsghp), kmsgh=kmsghp) - prefix_str = "{:<20s} ".format(' ') + prefix_str + "", unsigned(kmsgp), unsigned(kmsghp), kmsgh=kmsghp + ) + prefix_str = "{:<20s} ".format(" ") + prefix_str disposition = "" - bits = kmsgh.msgh_bits & 0xff + bits = kmsgh.msgh_bits & 0xFF # remote port if bits == 17: disposition = "rS" elif bits == 18: disposition = "rO" - else : - disposition = "rX" # invalid + else: + disposition = "rX" # invalid out_string += "{:<2s}".format(disposition) # local port disposition = "" - bits = (kmsgh.msgh_bits & 0xff00) >> 8 + bits = (kmsgh.msgh_bits & 0xFF00) >> 8 if bits == 17: disposition = "lS" @@ -278,7 +371,7 @@ def GetKMsgSummary(kmsgp, prefix_str=""): # voucher disposition = "" - bits = (kmsgh.msgh_bits & 0xff0000) >> 16 + bits = (kmsgh.msgh_bits & 0xFF0000) >> 16 if bits == 17: disposition = "vS" @@ -301,49 +394,60 @@ def GetKMsgSummary(kmsgp, prefix_str=""): else: out_string += "{0: <1s}".format("-") - dest_proc_name = "" - if GetKmsgHeader(kmsgp).msgh_remote_port: - dest_proc_name = GetPortDestinationSummary(GetKmsgHeader(kmsgp).msgh_remote_port) + dest_port = GetKmsgHeader(kmsgp).msgh_remote_port + if dest_port: + name_str, dest_str, _ = GetPortDestProc(dest_port) + else: + name_str = dest_str = "" - out_string += " {:<26s} {:<26s}\n".format(GetKMsgSrc(kmsgp), dest_proc_name) + out_string += " {:<26s} {:<12s} {:<20s}".format( + GetKMsgSrc(kmsgp), name_str, dest_str + ) - if kmsgh.msgh_bits & 0x80000000: - out_string += prefix_str + "\t" + GetKMsgComplexBodyDesc.header + "\n" - out_string += prefix_str + "\t" + GetKMsgComplexBodyDesc(kmsgp, prefix_str + "\t") + "\n" + if kmsgh.msgh_bits & 0x80000000: # MACH_MSGH_BITS_COMPLEX + out_string += "\n" + prefix_str + "\t" + GetKMsgComplexBodyDesc.header + out_string += ( + "\n" + prefix_str + "\t" + GetKMsgComplexBodyDesc(kmsgp, prefix_str + "\t") + ) + + return out_string + "\n" - return out_string @header("{: <20s} {: <20s} {: <10s}".format("descriptor", "address", "size")) def GetMachMsgOOLDescriptorSummary(desc): - """ Returns description for mach_msg_ool_descriptor_t * object - """ + """Returns description for mach_msg_ool_descriptor_t * object""" format_string = "{: <#20x} {: <#20x} {:#010x}" out_string = format_string.format(desc, desc.address, desc.size) return out_string def GetKmsgDescriptors(kmsgp): - """ Get a list of descriptors in a complex message - """ + """Get a list of descriptors in a complex message""" kmsghp = GetKmsgHeader(kmsgp) kmsgh = dereference(kmsghp) - if not (kmsgh.msgh_bits & 0x80000000): # pragma pylint: disable=superfluous-parens + if not (kmsgh.msgh_bits & 0x80000000): # pragma pylint: disable=superfluous-parens return [] ## Something in the python/lldb types is not getting alignment correct here. 
## I'm grabbing a pointer to the body manually, and using tribal knowledge ## of the location of the descriptor count to get this correct - body = Cast(addressof(Cast(addressof(kmsgh), 'char *')[sizeof(kmsgh)]), 'mach_msg_body_t *') - #dsc_count = body.msgh_descriptor_count - dsc_count = dereference(Cast(body, 'uint32_t *')) - #dschead = Cast(addressof(body[1]), 'mach_msg_descriptor_t *') - dschead = Cast(addressof(Cast(addressof(body[0]), 'char *')[sizeof('uint32_t')]), 'mach_msg_descriptor_t *') + body = Cast( + addressof(Cast(addressof(kmsgh), "char *")[sizeof(kmsgh)]), "mach_msg_body_t *" + ) + # dsc_count = body.msgh_descriptor_count + dsc_count = dereference(Cast(body, "uint32_t *")) + # dschead = Cast(addressof(body[1]), 'mach_msg_descriptor_t *') + dschead = Cast( + addressof(Cast(addressof(body[0]), "char *")[sizeof("uint32_t")]), + "mach_msg_descriptor_t *", + ) dsc_list = [] for i in range(dsc_count): dsc_list.append(dschead[i]) return (body, dschead, dsc_list) + def GetKmsgTotalDescSize(kmsgp): - """ Helper to get total descriptor size of a kmsg. + """Helper to get total descriptor size of a kmsg. Assumes the kmsg has full kernel representation (header and descriptors) params: kmsgp : core.value representing the given ipc_kmsg_t struct @@ -354,44 +458,54 @@ def GetKmsgTotalDescSize(kmsgp): kmsgh = dereference(kmsghp) dsc_count = 0 - if kmsgh.msgh_bits & 0x80000000: # MACH_MSGH_BITS_COMPLEX + if kmsgh.msgh_bits & 0x80000000: # MACH_MSGH_BITS_COMPLEX (body, _, _) = GetKmsgDescriptors(kmsgp) - dsc_count = dereference(Cast(body, 'uint32_t *')) + dsc_count = dereference(Cast(body, "uint32_t *")) - return dsc_count * sizeof('mach_msg_descriptor_t') + return dsc_count * sizeof("mach_msg_descriptor_t") -@header("{: <20s} {: <8s} {: <20s} {: <10s} {: <20s}".format("kmsgheader", "size", "body", "ds_count", "dsc_head")) + +@header( + "{: <20s} {: <20s} {: >6s} {: <20s}".format( + "kmsgheader", "body", "descs", "dsc_head" + ) +) def GetKMsgComplexBodyDesc(kmsgp, prefix_str=""): - """ Routine that prints a complex kmsg's body - """ + """Routine that prints a complex kmsg's body""" kmsghp = GetKmsgHeader(kmsgp) kmsgh = dereference(kmsghp) if not (kmsgh.msgh_bits & 0x80000000): # pragma pylint: disable=superfluous-parens return "" - format_string = "{: <#20x} {: <#8x} {: <#20x} {:#010x} {: <#20x}" + format_string = "{: <#20x} {: <#20x} {:6d} {: <#20x}" out_string = "" (body, dschead, dsc_list) = GetKmsgDescriptors(kmsgp) - out_string += format_string.format(kmsghp, sizeof(dereference(kmsghp)), body, len(dsc_list), dschead) + out_string += format_string.format(kmsghp, body, len(dsc_list), dschead) for dsc in dsc_list: try: dsc_type = unsigned(dsc.type.type) - out_string += "\n" + prefix_str + "Descriptor: " + xnudefines.mach_msg_type_descriptor_strings[dsc_type] + out_string += ( + "\n" + + prefix_str + + "Descriptor: " + + xnudefines.mach_msg_type_descriptor_strings[dsc_type] + ) if dsc_type == 0: # its a port. 
p = dsc.port.name dstr = GetPortDispositionString(dsc.port.disposition) out_string += " disp:{:s}, name:{: <#20x}".format(dstr, p) - elif unsigned(dsc.type.type) in (1,3): + elif unsigned(dsc.type.type) in (1, 3): # its OOL DESCRIPTOR or OOL VOLATILE DESCRIPTOR ool = dsc.out_of_line out_string += " " + GetMachMsgOOLDescriptorSummary(addressof(ool)) - except: + except Exception: out_string += "\n" + prefix_str + "Invalid Descriptor: {}".format(dsc) return out_string + def GetKmsgTrailer(kmsgp): - """ Helper to get trailer address of a kmsg + """Helper to get trailer address of a kmsg params: kmsgp : core.value representing the given ipc_kmsg_t struct returns: @@ -400,38 +514,52 @@ def GetKmsgTrailer(kmsgp): kmsghp = GetKmsgHeader(kmsgp) kmsgh = dereference(kmsghp) - if (kmsgp.ikm_type == int(GetEnumValue('ipc_kmsg_type_t', 'IKM_TYPE_ALL_INLINED')) or - kmsgp.ikm_type == int(GetEnumValue('ipc_kmsg_type_t', 'IKM_TYPE_KDATA_OOL'))): - return kern.GetValueFromAddress(unsigned(kmsghp) + kmsgh.msgh_size, 'mach_msg_max_trailer_t *') + if kmsgp.ikm_type == int( + GetEnumValue("ipc_kmsg_type_t", "IKM_TYPE_ALL_INLINED") + ) or kmsgp.ikm_type == int(GetEnumValue("ipc_kmsg_type_t", "IKM_TYPE_KDATA_OOL")): + return kern.GetValueFromAddress( + unsigned(kmsghp) + kmsgh.msgh_size, "mach_msg_max_trailer_t *" + ) else: - if kmsgh.msgh_bits & 0x80000000: # MACH_MSGH_BITS_COMPLEX - content_size = kmsgh.msgh_size - sizeof('mach_msg_base_t') - GetKmsgTotalDescSize(kmsgp) + if kmsgh.msgh_bits & 0x80000000: # MACH_MSGH_BITS_COMPLEX + content_size = ( + kmsgh.msgh_size + - sizeof("mach_msg_base_t") + - GetKmsgTotalDescSize(kmsgp) + ) else: - content_size = kmsgh.msgh_size - sizeof('mach_msg_header_t') - return kern.GetValueFromAddress(unsigned(kmsgp.ikm_udata) + content_size, 'mach_msg_max_trailer_t *') + content_size = kmsgh.msgh_size - sizeof("mach_msg_header_t") + return kern.GetValueFromAddress( + unsigned(kmsgp.ikm_udata) + content_size, "mach_msg_max_trailer_t *" + ) + def GetKMsgSrc(kmsgp): - """ Routine that prints a kmsg's source process and pid details - params: - kmsgp : core.value representing the given ipc_kmsg_t struct - returns: - str : string containing the name and pid of the kmsg's source proc + """Routine that prints a kmsg's source process and pid details + params: + kmsgp : core.value representing the given ipc_kmsg_t struct + returns: + str : string containing the name and pid of the kmsg's source proc """ trailer = GetKmsgTrailer(kmsgp) - kmsgpid = Cast(trailer, 'uint *')[10] # audit_token.val[5] - return "{0:s} ({1:d})".format(GetProcNameForPid(kmsgpid), kmsgpid) + kmsgpid = Cast(trailer, "uint *")[10] # audit_token.val[5] + return "{0:s}({1:d})".format(GetProcNameForPid(kmsgpid), kmsgpid) -@header("{:<20s} {:<20s} {:<10s} {:>6s} {:<6s}".format( - "portset", "waitqueue", "name", "refs", "flags")) + +@header( + "{:<20s} {:<20s} {:<10s} {:>6s} {:<6s}".format( + "portset", "waitqueue", "name", "refs", "flags" + ) +) def PrintPortSetSummary(pset, space=0, verbose=True, O=None): - """ Display summary for a given struct ipc_pset * - params: - pset : core.value representing a pset in the kernel - returns: - str : string of summary information for the given pset + """Display summary for a given struct ipc_pset * + params: + pset : core.value representing a pset in the kernel + returns: + str : string of summary information for the given pset """ show_kmsg_summary = False - if config['verbosity'] > vHUMAN : + if config["verbosity"] > vHUMAN: show_kmsg_summary = True ips_wqset = pset.ips_wqset @@ 
-443,24 +571,30 @@ def PrintPortSetSummary(pset, space=0, verbose=True, O=None): is_tableval, _ = GetSpaceTable(space) if is_tableval: entry_val = GetObjectAtIndexFromArray(is_tableval, local_name >> 8) - local_name |= unsigned(entry_val.ie_bits) >> 24 + local_name |= GetGenFromIEBits(unsigned(entry_val.ie_bits)) dest = GetSpaceProcDesc(space) else: for wq in wqs.iterateMembers(): dest = GetSpaceProcDesc(wq.asPort().ip_receiver) - ips_object = pset.ips_object - if ips_object.io_bits & 0x80000000: - state = "ASet" + if unsigned(pset.ips_object.io_state): + pass else: - state = "DSet" + pass - print("{:<#20x} {:<#20x} {:#010x} {:>6d} {:<6s} {:<20s}".format( - unsigned(pset), addressof(ips_wqset), local_name, - ips_object.io_references, "ASet", dest)) + print( + "{:<#20x} {:<#20x} {:#010x} {:>6d} {:<6s} {:<20s}".format( + unsigned(pset), + addressof(pset.ips_wqset), + local_name, + pset.ips_object.io_references, + "ASet", + dest, + ) + ) if verbose and wqs.hasThreads(): - with O.table("{:<20s} {:<20s}".format('waiter', 'event'), indent=True): + with O.table("{:<20s} {:<20s}".format("waiter", "event"), indent=True): for thread in wqs.iterateThreads(): print("{:<#20x} {:<#20x}".format(unsigned(thread), thread.wait_event)) print("") @@ -472,18 +606,18 @@ def PrintPortSetSummary(pset, space=0, verbose=True, O=None): print("") - # Macro: showipc -@lldb_command('showipc') + +@lldb_command("showipc") def ShowIPC(cmd_args=None): - """ Routine to print data for the given IPC space - Usage: showipc
<address of ipc space>
+ """Routine to print data for the given IPC space + Usage: showipc <address of ipc space>
""" if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("No arguments passed") - ipc = kern.GetValueFromAddress(cmd_args[0], 'ipc_space *') + ipc = kern.GetValueFromAddress(cmd_args[0], "ipc_space *") if not ipc: print("unknown arguments:", str(cmd_args)) return False @@ -491,19 +625,21 @@ def ShowIPC(cmd_args=None): PrintIPCInformation(ipc, False, False) return True + # EndMacro: showipc # Macro: showtaskipc -@lldb_command('showtaskipc') + +@lldb_command("showtaskipc") def ShowTaskIPC(cmd_args=None): - """ Routine to print IPC summary of given task - Usage: showtaskipc
+ """Routine to print IPC summary of given task + Usage: showtaskipc <address of task>
""" if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("No arguments passed") - tval = kern.GetValueFromAddress(cmd_args[0], 'task *') + tval = kern.GetValueFromAddress(cmd_args[0], "task *") if not tval: print("unknown arguments:", str(cmd_args)) return False @@ -515,14 +651,16 @@ def ShowTaskIPC(cmd_args=None): print(summary) return True + # EndMacro: showtaskipc # Macro: showallipc -@lldb_command('showallipc') + +@lldb_command("showallipc") def ShowAllIPC(cmd_args=None): - """ Routine to print IPC summary of all tasks - Usage: showallipc + """Routine to print IPC summary of all tasks + Usage: showallipc """ for t in kern.tasks: print(GetTaskSummary.header + " " + GetProcSummary.header) @@ -532,19 +670,21 @@ def ShowAllIPC(cmd_args=None): PrintIPCInformation(t.itk_space, False, False) print("\n\n") + # EndMacro: showallipc -@lldb_command('showipcsummary', fancy=True) + +@lldb_command("showipcsummary", fancy=True) def ShowIPCSummary(cmd_args=None, cmd_options={}, O=None): - """ Summarizes the IPC state of all tasks. - This is a convenient way to dump some basic clues about IPC messaging. You can use the output to determine - tasks that are candidates for further investigation. + """Summarizes the IPC state of all tasks. + This is a convenient way to dump some basic clues about IPC messaging. You can use the output to determine + tasks that are candidates for further investigation. """ with O.table(GetTaskIPCSummary.header): ipc_table_size = 0 - l = [ GetTaskIPCSummary(t) for t in kern.tasks ] - l.sort(key = lambda e: e[1], reverse=True) + l = [GetTaskIPCSummary(t) for t in kern.tasks] + l.sort(key=lambda e: e[1], reverse=True) for e in l: print(e[0]) @@ -555,196 +695,243 @@ def ShowIPCSummary(cmd_args=None, cmd_options={}, O=None): print("Total Table size: {:d}".format(ipc_table_size)) + def GetKObjectFromPort(portval): - """ Get Kobject description from the port. - params: portval - core.value representation of 'ipc_port *' object - returns: str - string of kobject information + """Get Kobject description from the port. 
+ params: portval - core.value representation of 'ipc_port *' object + returns: str - string of kobject information """ if not portval or portval == xnudefines.MACH_PORT_DEAD: return "MACH_PORT_DEAD" - io_bits = unsigned(portval.ip_object.io_bits) - objtype_index = io_bits & 0x3ff - if not objtype_index: + otype = unsigned(portval.ip_object.io_type) + if not IsKObjectType(otype): return "not a kobject" - kobject_addr = kern.StripKernelPAC(unsigned(portval.ip_kobject)) - objtype_str = GetEnumName('ipc_kotype_t', objtype_index, "IKOT_") + kobject_addr = kern.StripKernelPAC(unsigned(portval.ip_kobject)) + objtype_str = IPCPortTypeToString(otype) - desc_str = "{:<#20x} {:<16s}".format(kobject_addr, objtype_str) + if kobject_addr and (objtype_str == 'iokit_object' or objtype_str == 'uext_object' or objtype_str == 'iokit_connect'): + iokit_object = CastIOKitClass(portval.ip_kobject, 'IOMachPort *').object + desc_str = "{:<#20x}".format(kern.StripKernelPAC(unsigned(iokit_object))) + else: + desc_str = "{:<#20x}".format(kobject_addr) if not kobject_addr: pass - elif objtype_str == 'IOKIT_OBJECT': - iokit_classnm = GetObjectTypeStr(portval.ip_kobject) + elif objtype_str == 'iokit_object' or objtype_str == 'uext_object' or objtype_str == 'iokit_connect' : + iokit_classnm = GetObjectTypeStr(iokit_object) if not iokit_classnm: desc_str += " " else: - desc_str += re.sub(r'vtable for ', r' ', iokit_classnm) + desc_str += re.sub(r"vtable for ", r" ", iokit_classnm) - elif objtype_str[:5] == 'TASK_' and objtype_str != 'TASK_ID_TOKEN': - task = value(portval.GetSBValue().xCreateValueFromAddress( - None, kobject_addr, gettype('struct task')).AddressOf()) + elif objtype_str[:5] == "task_" and objtype_str != "task_id_token": + task = value( + portval.GetSBValue() + .xCreateValueFromAddress(None, kobject_addr, gettype("struct task")) + .AddressOf() + ) if GetProcFromTask(task) is not None: - desc_str += " {:s}({:d})".format(GetProcNameForTask(task), GetProcPIDForTask(task)) + desc_str += " {:s}({:d})".format( + GetProcNameForTask(task), GetProcPIDForTask(task) + ) return desc_str + def GetSpaceProcDesc(space): - """ Display the name and pid of a space's task - params: - space: core.value representing a pointer to a space - returns: - str : string containing receiver's name and pid + """Display the name and pid of a space's task + params: + space: core.value representing a pointer to a space + returns: + str : string containing receiver's name and pid """ task = space.is_task if GetProcFromTask(task) is None: return "task {:<#20x}".format(unsigned(task)) return "{:s}({:d})".format(GetProcNameForTask(task), GetProcPIDForTask(task)) + def GetPortDestProc(port): - """ Display the name and pid of a given port's receiver - params: - port : core.value representing a pointer to a port in the kernel - returns: - str : string containing receiver's name and pid + """Display the name and pid of a given port's receiver + params: + port : core.value representing a pointer to a port in the kernel + returns: + a tuple of: + str : the name of that port in its destination (or dead/in-transit) + str : the destination/kobject value for this port + str : the service name for this port """ - bits = unsigned(port.ip_object.io_bits) # osfmk/ipc/ipc_object.h name = unsigned(port.ip_messages.imq_receiver_name) + otype = unsigned(port.ip_object.io_type) + dest_str = None + state = port.ip_object.io_state - port_is_kobject_port = bits & xnudefines.IO_BITS_KOTYPE + if state == GetEnumValue("ipc_object_state_t", "IO_STATE_INACTIVE"): + 
name_str = "dead" + dest_str = "" + elif state == GetEnumValue("ipc_object_state_t", "IO_STATE_IN_LIMBO"): + name_str = "in-limbo" + dest_str = "" + elif state == GetEnumValue("ipc_object_state_t", "IO_STATE_IN_LIMBO_PD"): + name_str = "in-limbo-pd" + dest_str = "" + elif state == GetEnumValue("ipc_object_state_t", "IO_STATE_IN_TRANSIT"): + name_str = "in-transit" + dest_str = "{:<#20x}".format(port.ip_destination) + elif state == GetEnumValue("ipc_object_state_t", "IO_STATE_IN_TRANSIT_PD"): + name_str = "in-transit-pd" + dest_str = "{:<#20x}".format(port.ip_destination) + else: # in-space / in-space immovable + if name == 1: # kobjects + name_str = "" + else: + name_str = "{:<#12x}".format(name) - if bits & xnudefines.IO_BITS_ACTIVE == 0: - if port_is_kobject_port: - return ('', 'inactive-kobject-port') + if IsKObjectType(otype): + return (name_str, GetKObjectFromPort(port), "") - return ('', 'inactive-port') + service = "" + if port.ip_object.io_type == GetEnumValue("ipc_object_type_t", "IOT_SERVICE_PORT"): + try: + splabel = GetPortLabel(port.GetSBValue(), "struct ipc_service_port_label") + service = splabel.xGetCStringByName("ispl_service_name") + except Exception: + service = "unknown" - if port_is_kobject_port: - return ('', GetKObjectFromPort(port)) + if dest_str is None: + dest_str = GetSpaceProcDesc(port.ip_receiver) - if name == 0: - return ('{:<#20x}'.format(port.ip_destination), 'in-transit') + return (name_str, dest_str, service) - return ('{:<#20x}'.format(name), GetSpaceProcDesc(port.ip_receiver)) -@header("{:<20s} {:<20s}".format("destname", "destination") ) -def GetPortDestinationSummary(port): - """ Get destination information for a port. - params: port - core.value representation of 'ipc_port *' object - returns: str - string of info about ports destination +@lldb_type_summary(["ipc_entry_t"]) +@header( + "{: <20s} {: <12s} {: <8s} {: <8s} {: <8s} {: <8s} {: <12s} {: <20s} {: <20s}".format( + "object", + "name", + "rite", + "urefs", + "nsets", + "nmsgs", + "destname", + "kind", + "dest/kobject", + ) +) +def GetIPCEntrySummary(entry, ipc_name="", rights_filter=0): + """Get summary of a ipc entry. + params: + entry - core.value representing ipc_entry_t in the kernel + ipc_name - str of format '0x0123' for display in summary. + returns: + str - string of ipc entry related information + + types of rights: + 'Dead' : Dead name + 'Set' : Port set + 'S' : Send right + 'R' : Receive right + 'O' : Send-once right + 'm' : Immovable send port + 'i' : Immovable receive port + types of notifications: + 'd' : Dead-Name notification requested + 's' : Send-Possible notification armed + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ - if not port or port == xnudefines.MACH_PORT_DEAD: - return "MACH_PORT_DEAD" - a, b = GetPortDestProc(port) - return "{:<20s} {:<20s}".format(a, b) - -@lldb_type_summary(['ipc_entry_t']) -@header("{: <20s} {: <12s} {: <8s} {: <8s} {: <8s} {: <8s} {: <20s} {: <20s}".format("object", "name", "rite", "urefs", "nsets", "nmsgs", "destname", "destination")) -def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0): - """ Get summary of a ipc entry. - params: - entry - core.value representing ipc_entry_t in the kernel - ipc_name - str of format '0x0123' for display in summary. 
- returns: - str - string of ipc entry related information - - types of rights: - 'Dead' : Dead name - 'Set' : Port set - 'S' : Send right - 'R' : Receive right - 'O' : Send-once right - 'm' : Immovable send port - 'i' : Immovable receive port - 'g' : No grant port - types of notifications: - 'd' : Dead-Name notification requested - 's' : Send-Possible notification armed - 'r' : Send-Possible notification requested - 'n' : No-Senders notification requested - 'x' : Port-destroy notification requested - """ - out_str = '' - entry_ptr = int(hex(entry), 16) - format_string = "{: <#20x} {: <12s} {: <8s} {: <8d} {: <8d} {: <8d} {: <20s} {: <20s}" - right_str = '' - destname_str = '' - destination_str = '' + out_str = "" + int(hex(entry), 16) + right_str = "" + name_str = "" + destname_str = "" + service_str = "" ie_object = entry.ie_object ie_bits = int(entry.ie_bits) - io_bits = int(ie_object.io_bits) if ie_object else 0 - urefs = int(ie_bits & 0xffff) + urefs = int(ie_bits & 0xFFFF) nsets = 0 nmsgs = 0 - if ie_bits & 0x00100000 : - right_str = 'Dead' + + if ie_bits & 0x00100000: + right_str = "Dead" + kind = "" elif ie_bits & 0x00080000: - right_str = 'Set' - psetval = kern.CreateTypedPointerFromAddress(unsigned(ie_object), 'struct ipc_pset') + right_str = "Set" + kind = "" + psetval = kern.CreateTypedPointerFromAddress( + unsigned(ie_object), "struct ipc_pset" + ) wqs = Waitq(addressof(psetval.ips_wqset)) members = 0 - for m in wqs.iterateMembers(): members += 1 + for m in wqs.iterateMembers(): + members += 1 destname_str = "{:d} Members".format(members) else: if ie_bits & 0x00010000: if ie_bits & 0x00020000: # SEND + RECV - right_str = 'SR' + right_str = "SR" else: # SEND only - right_str = 'S' + right_str = "S" elif ie_bits & 0x00020000: # RECV only - right_str = 'R' + right_str = "R" elif ie_bits & 0x00040000: # SEND_ONCE - right_str = 'O' - portval = kern.CreateTypedPointerFromAddress(unsigned(ie_object), 'struct ipc_port') + right_str = "O" + if ie_bits & xnudefines.IE_BITS_IMMOVABLE_SEND: + right_str += "m" + portval = kern.CreateTypedPointerFromAddress( + unsigned(ie_object), "struct ipc_port" + ) if int(entry.ie_request) != 0: - requestsval, _ = kalloc_array_decode(portval.ip_requests, 'struct ipc_port_request') + requestsval, _ = kalloc_array_decode( + portval.ip_requests, "struct ipc_port_request" + ) sorightval = requestsval[int(entry.ie_request)].ipr_soright soright_ptr = unsigned(sorightval) if soright_ptr != 0: # dead-name notification requested - right_str += 'd' + right_str += "d" # send-possible armed if soright_ptr & 0x1: - right_str +='s' + right_str += "s" # send-possible requested if soright_ptr & 0x2: - right_str +='r' + right_str += "r" # No-senders notification requested - if portval.ip_nsrequest != 0: - right_str += 'n' + if not IsKObjectType(ie_object.io_type) and portval.ip_nsrequest != 0: + right_str += "n" # port-destroy notification requested if GetPortPDRequest(portval): - right_str += 'x' + right_str += "x" # Immovable receive rights - if portval.ip_immovable_receive != 0: - right_str += 'i' + if portval.ip_object.io_state == GetEnumValue( + "ipc_object_state_t", "IO_STATE_IN_SPACE_IMMOVABLE" + ): + right_str += "i" # Immovable send rights - if portval.ip_immovable_send != 0: - right_str += 'm' - # No-grant Port - if portval.ip_no_grant != 0: - right_str += 'g' # Port with SB filtering on - if io_bits & 0x00001000 != 0: - right_str += 'f' + if portval.ip_object.io_filtered != 0: + right_str += "f" # early-out if the rights-filter doesn't match - if 
rights_filter != 0 and rights_filter != right_str: - return '' + if rights_filter != 0 and rights_filter not in right_str: + return "" # now show the port destination part - destname_str = GetPortDestinationSummary(portval) + name_str, destname_str, service_str = GetPortDestProc(portval) # Get the number of sets to which this port belongs nsets = len([s for s in Waitq(addressof(portval.ip_waitq)).iterateSets()]) nmsgs = portval.ip_messages.imq_msgcount + kind = IPCPortTypeToString(ie_object.io_type) # append the generation to the name value # (from osfmk/ipc/ipc_entry.h) @@ -753,101 +940,146 @@ def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0): # 0 1 48 # 1 0 32 # 1 1 16 - ie_gen_roll = { 0:'.64', 1:'.48', 2:'.32', 3:'.16' } - ipc_name = '{:s}{:s}'.format(ipc_name.strip(), ie_gen_roll[(ie_bits & 0x00c00000) >> 22]) + ie_gen_roll = {0: ".64", 1: ".48", 2: ".32", 3: ".16"} + ipc_name = "{:s}{:s}".format( + ipc_name.strip(), ie_gen_roll[(ie_bits & IE_BITS_ROLL_MASK) >> 24] + ) - if rights_filter == 0 or rights_filter == right_str: - out_str = format_string.format(ie_object, ipc_name, right_str, urefs, nsets, nmsgs, destname_str, destination_str) + if rights_filter == 0 or rights_filter in right_str: + format_string = "{: <#20x} {: <12s} {: <8s} {: <8d} {: <8d} {: <8d} {: <12s} {: <20s} {: <20s}" + out_str = format_string.format( + ie_object, + ipc_name, + right_str, + urefs, + nsets, + nmsgs, + name_str, + kind, + destname_str + " " + service_str, + ) return out_str -@header("{0: >20s}".format("user bt") ) + +@header("{0: >20s}".format("user bt")) def GetPortUserStack(port, task): - """ Get UserStack information for the given port & task. - params: port - core.value representation of 'ipc_port *' object - task - value representing 'task *' object - returns: str - string information on port's userstack + """Get UserStack information for the given port & task. + params: port - core.value representation of 'ipc_port *' object + task - value representing 'task *' object + returns: str - string information on port's userstack """ - out_str = '' + out_str = "" if not port or port == xnudefines.MACH_PORT_DEAD: return out_str pid = port.ip_made_pid proc_val = GetProcFromTask(task) if port.ip_made_bt: btlib = kmemory.BTLibrary.get_shared() - out_str += "\n".join(btlib.get_stack(port.ip_made_bt).symbolicated_frames()) + "\n" + out_str += ( + "\n".join(btlib.get_stack(port.ip_made_bt).symbolicated_frames()) + "\n" + ) if pid != GetProcPID(proc_val): out_str += " ({:<10d})\n".format(pid) return out_str -@lldb_type_summary(['ipc_space *']) -@header("{0: <20s} {1: <20s} {2: <20s} {3: <8s} {4: <10s} {5: >8s} {6: <8s}".format('ipc_space', 'is_task', 'is_table', 'flags', 'ports', 'low_mod', 'high_mod')) -def PrintIPCInformation(space, show_entries=False, show_userstack=False, rights_filter=0): - """ Provide a summary of the ipc space - """ - out_str = '' - format_string = "{0: <#20x} {1: <#20x} {2: <#20x} {3: <8s} {4: <10d} {5: >8d} {6: <8d}" - is_tableval, num_entries = GetSpaceTable(space) - flags ='' - if is_tableval: - flags += 'A' - else: - flags += ' ' - if (space.is_grower) != 0: - flags += 'G' - print(format_string.format(space, space.is_task, is_tableval if is_tableval else 0, flags, - num_entries, space.is_low_mod, space.is_high_mod)) - #should show the each individual entries if asked. 
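For reference, a standalone sketch of the entry-name encoding that GetIPCEntrySummary and PrintIPCInformation now share through the GetGenFromIEBits/GetNameFromIndexAndIEBits helpers introduced above. The sample ie_bits and index values are made up, and the IE_BITS_ROLL_MASK value is an assumption inferred from the shift, not taken from the patch.

def GetGenFromIEBits(ie_bits):
    # generation lives in the top byte of ie_bits; the low two bits read back as set
    return (ie_bits >> 24) | 3


def GetNameFromIndexAndIEBits(index, ie_bits):
    # a port name is the table index shifted past the generation byte
    return (index << 8) | GetGenFromIEBits(ie_bits)


ie_bits = 0x04010000  # hypothetical entry: generation byte 0x04, send right bit 0x00010000
index = 0x2a          # hypothetical table slot

assert GetNameFromIndexAndIEBits(index, ie_bits) == 0x2a07

# The ".64"/".48"/".32"/".16" suffix appended to names uses the same bits;
# IE_BITS_ROLL_MASK below is an assumed mask for bits 24-25.
IE_BITS_ROLL_MASK = 0x03000000
ie_gen_roll = {0: ".64", 1: ".48", 2: ".32", 3: ".16"}
assert ie_gen_roll[(ie_bits & IE_BITS_ROLL_MASK) >> 24] == ".64"

This mirrors how the rewritten PrintIPCInformation formats entry names before handing them to GetIPCEntrySummary.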
+@lldb_type_summary(["ipc_space *"]) +@header( + "{0: <20s} {1: <20s} {2: <20s} {3: <8s} {4: <10s} {5: >8s} {6: <8s}".format( + "ipc_space", "is_task", "is_table", "flags", "ports", "low_mod", "high_mod" + ) +) +def PrintIPCInformation( + space, show_entries=False, show_userstack=False, rights_filter=0 +): + """Provide a summary of the ipc space""" + out_str = "" + format_string = ( + "{0: <#20x} {1: <#20x} {2: <#20x} {3: <8s} {4: <10d} {5: >8d} {6: <8d}" + ) + is_tableval, num_entries = GetSpaceTable(space) + flags = "" + if is_tableval: + flags += "A" + else: + flags += " " + if (space.is_grower) != 0: + flags += "G" + print( + format_string.format( + space, + space.is_task, + is_tableval if is_tableval else 0, + flags, + num_entries, + space.is_low_mod, + space.is_high_mod, + ) + ) + + # should show the each individual entries if asked. if show_entries and is_tableval: print("\t" + GetIPCEntrySummary.header) entries = ( (index, value(iep.AddressOf())) - for index, iep - in GetSpaceEntriesWithBits(is_tableval, num_entries, 0x001f0000) + for index, iep in GetSpaceEntriesWithBits( + is_tableval, num_entries, 0x001F0000 + ) ) for index, entryval in entries: entry_ie_bits = unsigned(entryval.ie_bits) - entry_name = "{0: <#20x}".format( (index <<8 | entry_ie_bits >> 24) ) + entry_name = "{0: <#20x}".format( + GetNameFromIndexAndIEBits(index, entry_ie_bits) + ) entry_str = GetIPCEntrySummary(entryval, entry_name, rights_filter) if not entry_str: continue print("\t" + entry_str) if show_userstack: - entryport = Cast(entryval.ie_object, 'ipc_port *') - if entryval.ie_object and (int(entry_ie_bits) & 0x00070000) and entryport.ip_made_bt: - print(GetPortUserStack.header + GetPortUserStack(entryport, space.is_task)) + entryport = Cast(entryval.ie_object, "ipc_port *") + if ( + entryval.ie_object + and (int(entry_ie_bits) & 0x00070000) + and entryport.ip_made_bt + ): + print( + GetPortUserStack.header + + GetPortUserStack(entryport, space.is_task) + ) - #done with showing entries + # done with showing entries return out_str + # Macro: showrights -@lldb_command('showrights', 'R:') -def ShowRights(cmd_args=None, cmd_options={}): - """ Routine to print rights information for the given IPC space - Usage: showrights [-R rights_type]
- -R rights_type : only display rights matching the string 'rights_type' - types of rights: - 'Dead' : Dead name - 'Set' : Port set - 'S' : Send right - 'R' : Receive right - 'O' : Send-once right - types of notifications: - 'd' : Dead-Name notification requested - 's' : Send-Possible notification armed - 'r' : Send-Possible notification requested - 'n' : No-Senders notification requested - 'x' : Port-destroy notification requested +@lldb_command("showrights", "R:") +def ShowRights(cmd_args=None, cmd_options={}): + """Routine to print rights information for the given IPC space + Usage: showrights [-R rights_type]
+ -R rights_type : only display rights matching the string 'rights_type' + + types of rights: + 'Dead' : Dead name + 'Set' : Port set + 'S' : Send right + 'R' : Receive right + 'O' : Send-once right + types of notifications: + 'd' : Dead-Name notification requested + 's' : Send-Possible notification armed + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("No arguments passed") - ipc = kern.GetValueFromAddress(cmd_args[0], 'ipc_space *') + ipc = kern.GetValueFromAddress(cmd_args[0], "ipc_space *") if not ipc: print("unknown arguments:", str(cmd_args)) return False @@ -857,35 +1089,36 @@ def ShowRights(cmd_args=None, cmd_options={}): print(PrintIPCInformation.header) PrintIPCInformation(ipc, True, False, rights_type) + # EndMacro: showrights -@lldb_command('showtaskrights','R:') -def ShowTaskRights(cmd_args=None, cmd_options={}): - """ Routine to ipc rights information for a task - Usage: showtaskrights [-R rights_type] - -R rights_type : only display rights matching the string 'rights_type' - types of rights: - 'Dead' : Dead name - 'Set' : Port set - 'S' : Send right - 'R' : Receive right - 'O' : Send-once right - 'm' : Immovable send port - 'i' : Immovable receive port - 'g' : No grant port - 'f' : Port with SB filtering on - types of notifications: - 'd' : Dead-Name notification requested - 's' : Send-Possible notification armed - 'r' : Send-Possible notification requested - 'n' : No-Senders notification requested - 'x' : Port-destroy notification requested +@lldb_command("showtaskrights", "R:") +def ShowTaskRights(cmd_args=None, cmd_options={}): + """Routine to ipc rights information for a task + Usage: showtaskrights [-R rights_type] + -R rights_type : only display rights matching the string 'rights_type' + + types of rights: + 'Dead' : Dead name + 'Set' : Port set + 'S' : Send right + 'R' : Receive right + 'O' : Send-once right + 'm' : Immovable send port + 'i' : Immovable receive port + 'f' : Port with SB filtering on + types of notifications: + 'd' : Dead-Name notification requested + 's' : Send-Possible notification armed + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("No arguments passed") - tval = kern.GetValueFromAddress(cmd_args[0], 'task *') + tval = kern.GetValueFromAddress(cmd_args[0], "task *") if not tval: print("unknown arguments:", str(cmd_args)) return False @@ -898,65 +1131,68 @@ def ShowTaskRights(cmd_args=None, cmd_options={}): print(PrintIPCInformation.header) PrintIPCInformation(tval.itk_space, True, False, rights_type) + # Count the vouchers in a given task's ipc space @header("{: <20s} {: <6s} {: <20s} {: <8s}".format("task", "pid", "name", "#vouchers")) def GetTaskVoucherCount(t): is_tableval, num_entries = GetSpaceTable(t.itk_space) count = 0 - voucher_kotype = int(GetEnumValue('ipc_kotype_t', 'IKOT_VOUCHER')) + voucher_kotype = int(GetEnumValue("ipc_object_type_t", "IKOT_VOUCHER")) if is_tableval: - ports = GetSpaceObjectsWithBits(is_tableval, num_entries, 0x00070000, - gettype('struct ipc_port')) + ports = GetSpaceObjectsWithBits( + is_tableval, num_entries, 0x00070000, gettype("struct ipc_port") + ) for port in ports: - io_bits = port.xGetIntegerByPath('.ip_object.io_bits') - if io_bits & 0x3ff == voucher_kotype: + if port.xGetIntegerByPath(".ip_object.io_type") == 
voucher_kotype: count += 1 format_str = "{: <#20x} {: <6d} {: <20s} {: <8d}" pval = GetProcFromTask(t) return format_str.format(t, GetProcPID(pval), GetProcNameForTask(t), count) + # Macro: countallvouchers -@lldb_command('countallvouchers', fancy=True) +@lldb_command("countallvouchers", fancy=True) def CountAllVouchers(cmd_args=None, cmd_options={}, O=None): - """ Routine to count the number of vouchers by task. Useful for finding leaks. - Usage: countallvouchers + """Routine to count the number of vouchers by task. Useful for finding leaks. + Usage: countallvouchers """ with O.table(GetTaskVoucherCount.header): for t in kern.tasks: print(GetTaskVoucherCount(t)) + # Macro: showataskrightsbt -@lldb_command('showtaskrightsbt', 'R:') -def ShowTaskRightsBt(cmd_args=None, cmd_options={}): - """ Routine to ipc rights information with userstacks for a task - Usage: showtaskrightsbt [-R rights_type] - -R rights_type : only display rights matching the string 'rights_type' - types of rights: - 'Dead' : Dead name - 'Set' : Port set - 'S' : Send right - 'R' : Receive right - 'O' : Send-once right - 'm' : Immovable send port - 'i' : Immovable receive port - 'g' : No grant port - types of notifications: - 'd' : Dead-Name notification requested - 's' : Send-Possible notification armed - 'r' : Send-Possible notification requested - 'n' : No-Senders notification requested - 'x' : Port-destroy notification requested +@lldb_command("showtaskrightsbt", "R:") +def ShowTaskRightsBt(cmd_args=None, cmd_options={}): + """Routine to ipc rights information with userstacks for a task + Usage: showtaskrightsbt [-R rights_type] + -R rights_type : only display rights matching the string 'rights_type' + + types of rights: + 'Dead' : Dead name + 'Set' : Port set + 'S' : Send right + 'R' : Receive right + 'O' : Send-once right + 'm' : Immovable send port + 'i' : Immovable receive port + types of notifications: + 'd' : Dead-Name notification requested + 's' : Send-Possible notification armed + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("No arguments passed") - tval = kern.GetValueFromAddress(cmd_args[0], 'task *') + tval = kern.GetValueFromAddress(cmd_args[0], "task *") if not tval: print("unknown arguments:", str(cmd_args)) return False @@ -969,31 +1205,32 @@ def ShowTaskRightsBt(cmd_args=None, cmd_options={}): print(PrintIPCInformation.header) PrintIPCInformation(tval.itk_space, True, True, rights_type) + # EndMacro: showtaskrightsbt # Macro: showallrights -@lldb_command('showallrights', 'R:') -def ShowAllRights(cmd_args=None, cmd_options={}): - """ Routine to print rights information for IPC space of all tasks - Usage: showallrights [-R rights_type] - -R rights_type : only display rights matching the string 'rights_type' - types of rights: - 'Dead' : Dead name - 'Set' : Port set - 'S' : Send right - 'R' : Receive right - 'O' : Send-once right - 'm' : Immovable send port - 'i' : Immovable receive port - 'g' : No grant port - types of notifications: - 'd' : Dead-Name notification requested - 's' : Send-Possible notification armed - 'r' : Send-Possible notification requested - 'n' : No-Senders notification requested - 'x' : Port-destroy notification requested +@lldb_command("showallrights", "R:") +def ShowAllRights(cmd_args=None, cmd_options={}): + """Routine to print rights information for IPC space of all tasks + Usage: showallrights [-R rights_type] + -R rights_type : 
only display rights matching the string 'rights_type' + + types of rights: + 'Dead' : Dead name + 'Set' : Port set + 'S' : Send right + 'R' : Receive right + 'O' : Send-once right + 'm' : Immovable send port + 'i' : Immovable receive port + types of notifications: + 'd' : Dead-Name notification requested + 's' : Send-Possible notification armed + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ rights_type = 0 if "-R" in cmd_options: @@ -1007,53 +1244,65 @@ def ShowAllRights(cmd_args=None, cmd_options={}): PrintIPCInformation(t.itk_space, True, False, rights_type) + "\n\n" except (KeyboardInterrupt, SystemExit): raise - except: - print("Failed to get IPC information. Do individual showtaskrights to find the error. \n\n") + except Exception: + print( + "Failed to get IPC information. Do individual showtaskrights to find the error. \n\n" + ) + # EndMacro: showallrights def GetInTransitPortSummary(port, disp, holding_port, holding_kmsg): - """ String-ify the in-transit dispostion of a port. - """ + """String-ify the in-transit dispostion of a port.""" ## This should match the summary generated by GetIPCEntrySummary ## "object" "name" "rite" "urefs" "nsets" "nmsgs" "destname" "destination" - format_str = "\t{: <#20x} {: <12} {: <8s} {: <8d} {: <8d} {: <8d} p:{: <#19x} k:{: <#19x}" - portname = 'intransit' + format_str = ( + "\t{: <#20x} {: <12} {: <8s} {: <8d} {: <8d} {: <8d} p:{: <#19x} k:{: <#19x}" + ) disp_str = GetPortDispositionString(disp) - out_str = format_str.format(unsigned(port), 'in-transit', disp_str, 0, 0, port.ip_messages.imq_msgcount, unsigned(holding_port), unsigned(holding_kmsg)) + out_str = format_str.format( + unsigned(port), + "in-transit", + disp_str, + 0, + 0, + port.ip_messages.imq_msgcount, + unsigned(holding_port), + unsigned(holding_kmsg), + ) return out_str def GetDispositionFromEntryType(entry_bits): - """ Translate an IPC entry type into an in-transit disposition. This allows - the GetInTransitPortSummary function to be re-used to string-ify IPC - entry types. + """Translate an IPC entry type into an in-transit disposition. This allows + the GetInTransitPortSummary function to be re-used to string-ify IPC + entry types. 
""" ebits = int(entry_bits) - if (ebits & 0x003f0000) == 0: + if (ebits & 0x003F0000) == 0: return 0 if (ebits & 0x00010000) != 0: - return 17 ## MACH_PORT_RIGHT_SEND + return 17 ## MACH_PORT_RIGHT_SEND elif (ebits & 0x00020000) != 0: - return 16 ## MACH_PORT_RIGHT_RECEIVE + return 16 ## MACH_PORT_RIGHT_RECEIVE elif (ebits & 0x00040000) != 0: - return 18 ## MACH_PORT_RIGHT_SEND_ONCE + return 18 ## MACH_PORT_RIGHT_SEND_ONCE elif (ebits & 0x00080000) != 0: - return 100 ## MACH_PORT_RIGHT_PORT_SET + return 100 ## MACH_PORT_RIGHT_PORT_SET elif (ebits & 0x00100000) != 0: - return 101 ## MACH_PORT_RIGHT_DEAD_NAME + return 101 ## MACH_PORT_RIGHT_DEAD_NAME elif (ebits & 0x00200000) != 0: - return 102 ## MACH_PORT_RIGHT_LABELH + return 102 ## MACH_PORT_RIGHT_LABELH else: return 0 + def GetDispositionFromVoucherPort(th_vport): - """ Translate a thread's voucher port into a 'disposition' - """ + """Translate a thread's voucher port into a 'disposition'""" if unsigned(th_vport) > 0: return 103 ## Voucher type return 0 @@ -1061,17 +1310,18 @@ def GetDispositionFromVoucherPort(th_vport): g_kmsg_prog = 0 g_progmeter = { - 0 : '*', - 1 : '-', - 2 : '\\', - 3 : '|', - 4 : '/', - 5 : '-', - 6 : '\\', - 7 : '|', - 8 : '/', + 0: "*", + 1: "-", + 2: "\\", + 3: "|", + 4: "/", + 5: "-", + 6: "\\", + 7: "|", + 8: "/", } + def PrintProgressForKmsg(): global g_kmsg_prog global g_progmeter @@ -1080,15 +1330,14 @@ def PrintProgressForKmsg(): def CollectPortsForAnalysis(port, disposition): - """ - """ + """ """ if not port or port == xnudefines.MACH_PORT_DEAD: return - p = Cast(port, 'struct ipc_port *') + p = Cast(port, "struct ipc_port *") yield (p, disposition) # no-senders notification port - if unsigned(p.ip_nsrequest) not in (0, 1): # 1 is IP_KOBJECT_NSREQUEST_ARMED + if not IsKObjectType(p.ip_object.io_type) and p.ip_nsrequest != 0: PrintProgressForKmsg() yield (p.ip_nsrequest, -1) @@ -1100,48 +1349,56 @@ def CollectPortsForAnalysis(port, disposition): ## ports can have many send-possible notifications armed: go through the table! if unsigned(p.ip_requests) != 0: - table, table_sz = kalloc_array_decode(p.ip_requests, 'struct ipc_port_request') + table, table_sz = kalloc_array_decode(p.ip_requests, "struct ipc_port_request") for i in range(table_sz): if i == 0: continue ipr = table[i] - if unsigned(ipr.ipr_name) in (0, 0xfffffffe): + if unsigned(ipr.ipr_name) in (0, 0xFFFFFFFE): # 0xfffffffe is a host notify request continue ipr_bits = unsigned(ipr.ipr_soright) & 3 - ipr_port = kern.GetValueFromAddress(int(ipr.ipr_soright) & ~3, 'struct ipc_port *') + ipr_port = kern.GetValueFromAddress( + int(ipr.ipr_soright) & ~3, "struct ipc_port *" + ) # skip unused entries in the ipc table to avoid null dereferences if not ipr_port: continue ipr_disp = 0 - if ipr_bits & 3: ## send-possible armed and requested + if ipr_bits & 3: ## send-possible armed and requested ipr_disp = -5 - elif ipr_bits & 2: ## send-possible requested + elif ipr_bits & 2: ## send-possible requested ipr_disp = -4 - elif ipr_bits & 1: ## send-possible armed + elif ipr_bits & 1: ## send-possible armed ipr_disp = -3 PrintProgressForKmsg() yield (ipr_port, ipr_disp) return + def CollectKmsgPorts(task, task_port, kmsgp): - """ Look through a message, 'kmsgp' destined for 'task' - (enqueued on task_port). Collect any port descriptors, - remote, local, voucher, or other port references - into a (ipc_port_t, disposition) list. + """Look through a message, 'kmsgp' destined for 'task' + (enqueued on task_port). 
Collect any port descriptors, + remote, local, voucher, or other port references + into a (ipc_port_t, disposition) list. """ kmsgh = dereference(GetKmsgHeader(kmsgp)) p_list = [] PrintProgressForKmsg() - if kmsgh.msgh_remote_port and unsigned(kmsgh.msgh_remote_port) != unsigned(task_port): - disp = kmsgh.msgh_bits & 0x1f + if kmsgh.msgh_remote_port and unsigned(kmsgh.msgh_remote_port) != unsigned( + task_port + ): + disp = kmsgh.msgh_bits & 0x1F p_list += list(CollectPortsForAnalysis(kmsgh.msgh_remote_port, disp)) - if kmsgh.msgh_local_port and unsigned(kmsgh.msgh_local_port) != unsigned(task_port) \ - and unsigned(kmsgh.msgh_local_port) != unsigned(kmsgh.msgh_remote_port): - disp = (kmsgh.msgh_bits & 0x1f00) >> 8 + if ( + kmsgh.msgh_local_port + and unsigned(kmsgh.msgh_local_port) != unsigned(task_port) + and unsigned(kmsgh.msgh_local_port) != unsigned(kmsgh.msgh_remote_port) + ): + disp = (kmsgh.msgh_bits & 0x1F00) >> 8 p_list += list(CollectPortsForAnalysis(kmsgh.msgh_local_port, disp)) if kmsgp.ikm_voucher_port: @@ -1154,7 +1411,7 @@ def CollectKmsgPorts(task, task_port, kmsgp): for dsc in dsc_list: PrintProgressForKmsg() dsc_type = unsigned(dsc.type.type) - if dsc_type == 0 or dsc_type == 2: ## 0 == port, 2 == ool port + if dsc_type == 0 or dsc_type == 2: ## 0 == port, 2 == ool port if dsc_type == 0: ## its a port descriptor dsc_disp = dsc.port.disposition @@ -1162,36 +1419,41 @@ def CollectKmsgPorts(task, task_port, kmsgp): else: ## it's an ool_ports descriptor which is an array of ports dsc_disp = dsc.ool_ports.disposition - dispdata = Cast(dsc.ool_ports.address, 'struct ipc_port *') + dispdata = Cast(dsc.ool_ports.address, "struct ipc_port *") for pidx in range(dsc.ool_ports.count): PrintProgressForKmsg() - p_list += list(CollectPortsForAnalysis(dispdata[pidx], dsc_disp)) + p_list += list( + CollectPortsForAnalysis(dispdata[pidx], dsc_disp) + ) return p_list + def CollectKmsgPortRefs(task, task_port, kmsgp, p_refs): - """ Recursively collect all references to ports inside the kmsg 'kmsgp' - into the set 'p_refs' + """Recursively collect all references to ports inside the kmsg 'kmsgp' + into the set 'p_refs' """ p_list = CollectKmsgPorts(task, task_port, kmsgp) ## Iterate over each ports we've collected, to see if they ## have messages on them, and then recurse! for p, pdisp in p_list: - ptype = (p.ip_object.io_bits & 0x7fff0000) >> 16 + ptype = p.ip_object.io_type p_refs.add((p, pdisp, ptype)) - if ptype != 0: ## don't bother with port sets + if IsPortSetType(ptype): continue ## If the port that's in-transit has messages already enqueued, ## go through each of those messages and look for more ports! - for p_kmsgp in IterateCircleQueue(p.ip_messages.imq_messages, 'ipc_kmsg', 'ikm_link'): + for p_kmsgp in IterateCircleQueue( + p.ip_messages.imq_messages, "ipc_kmsg", "ikm_link" + ): CollectKmsgPortRefs(task, p, p_kmsgp, p_refs) def FindKmsgPortRefs(instr, task, task_port, kmsgp, qport): - """ Look through a message, 'kmsgp' destined for 'task'. If we find - any port descriptors, remote, local, voucher, or other port that - matches 'qport', return a short description - which should match the format of GetIPCEntrySummary. + """Look through a message, 'kmsgp' destined for 'task'. If we find + any port descriptors, remote, local, voucher, or other port that + matches 'qport', return a short description + which should match the format of GetIPCEntrySummary. 
""" out_str = instr @@ -1203,16 +1465,18 @@ def FindKmsgPortRefs(instr, task, task_port, kmsgp, qport): if unsigned(p) == unsigned(qport): ## the port we're looking for was found in this message! if len(out_str) > 0: - out_str += '\n' + out_str += "\n" out_str += GetInTransitPortSummary(p, pdisp, task_port, kmsgp) - ptype = (p.ip_object.io_bits & 0x7fff0000) >> 16 - if ptype != 0: ## don't bother with port sets + ptype = p.ip_object.io_type + if IsPortSetType(ptype): continue ## If the port that's in-transit has messages already enqueued, ## go through each of those messages and look for more ports! - for p_kmsgp in IterateCircleQueue(p.ip_messages.imq_messages, 'ipc_kmsg', 'ikm_link'): + for p_kmsgp in IterateCircleQueue( + p.ip_messages.imq_messages, "ipc_kmsg", "ikm_link" + ): out_str = FindKmsgPortRefs(out_str, task, p, p_kmsgp, qport) return out_str @@ -1225,9 +1489,10 @@ intransit_idx = -1000 taskports_idx = -2000 thports_idx = -3000 + def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should_log): - """ Iterate over all ports in the system, calling 'func' - for each entry in + """Iterate over all ports in the system, calling 'func' + for each entry in """ global port_iteration_do_print_taskname global intransit_idx, taskports_idx, thports_idx, registeredport_idx, excports_idx @@ -1236,7 +1501,7 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should entry_port_type_mask = 0x00070000 if include_psets: - entry_port_type_mask = 0x000f0000 + entry_port_type_mask = 0x000F0000 if tasklist is None: tasklist = list(kern.tasks) @@ -1251,11 +1516,15 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should if should_log: procname = "" if not t.active: - procname = 'terminated: ' + procname = "terminated: " if t.halting: - procname += 'halting: ' + procname += "halting: " procname += GetProcNameForTask(t) - sys.stderr.write(" checking {:s} ({}/{})...{:50s}\r".format(procname, tidx, len(tasklist), '')) + sys.stderr.write( + " checking {:s} ({}/{})...{:50s}\r".format( + procname, tidx, len(tasklist), "" + ) + ) tidx += 1 port_iteration_do_print_taskname = True @@ -1265,40 +1534,43 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should if not is_tableval: continue - base = is_tableval.GetSBValue().Dereference() - entries = ( - value(iep.AddressOf()) - for iep in base.xIterSiblings(1, num_entries) - ) + base = is_tableval.GetSBValue().Dereference() + entries = (value(iep.AddressOf()) for iep in base.xIterSiblings(1, num_entries)) for idx, entry_val in enumerate(entries, 1): - entry_bits= unsigned(entry_val.ie_bits) - entry_obj = 0 - entry_str = '' - entry_name = "{:x}".format( (idx << 8 | entry_bits >> 24) ) + entry_bits = unsigned(entry_val.ie_bits) + "{:x}".format(GetNameFromIndexAndIEBits(idx, entry_bits)) entry_disp = GetDispositionFromEntryType(entry_bits) ## If the entry in the table represents a port of some sort, ## then make the callback provided if int(entry_bits) & entry_port_type_mask: - eport = kern.CreateTypedPointerFromAddress(unsigned(entry_val.ie_object), 'struct ipc_port') + eport = kern.CreateTypedPointerFromAddress( + unsigned(entry_val.ie_object), "struct ipc_port" + ) ## Make the callback func(t, space, ctx, idx, entry_val, eport, entry_disp) ## if the port has pending messages, look through ## each message for ports (and recurse) - if follow_busyports and unsigned(eport) > 0 and eport.ip_messages.imq_msgcount > 0: + if ( + follow_busyports + and unsigned(eport) > 0 + and 
eport.ip_messages.imq_msgcount > 0 + ): ## collect all port references from all messages - for kmsgp in IterateCircleQueue(eport.ip_messages.imq_messages, 'ipc_kmsg', 'ikm_link'): + for kmsgp in IterateCircleQueue( + eport.ip_messages.imq_messages, "ipc_kmsg", "ikm_link" + ): p_refs = set() CollectKmsgPortRefs(t, eport, kmsgp, p_refs) - for (port, pdisp, ptype) in p_refs: + for port, pdisp, ptype in p_refs: func(t, space, ctx, intransit_idx, None, port, pdisp) ## for idx in xrange(1, num_entries) ## Task ports (send rights) - if getattr(t, 'itk_settable_self', 0) > 0: + if getattr(t, "itk_settable_self", 0) > 0: func(t, space, ctx, taskports_idx, 0, t.itk_settable_self, 17) if unsigned(t.itk_host) > 0: func(t, space, ctx, taskports_idx, 0, t.itk_host, 17) @@ -1308,9 +1580,9 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should func(t, space, ctx, taskports_idx, 0, t.itk_debug_control, 17) if unsigned(t.itk_task_access) > 0: func(t, space, ctx, taskports_idx, 0, t.itk_task_access, 17) - if unsigned(t.itk_task_ports[1]) > 0: ## task read port + if unsigned(t.itk_task_ports[1]) > 0: ## task read port func(t, space, ctx, taskports_idx, 0, t.itk_task_ports[1], 17) - if unsigned(t.itk_task_ports[2]) > 0: ## task inspect port + if unsigned(t.itk_task_ports[2]) > 0: ## task inspect port func(t, space, ctx, taskports_idx, 0, t.itk_task_ports[2], 17) ## Task name port (not a send right, just a naked ref); TASK_FLAVOR_NAME = 3 @@ -1329,37 +1601,49 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should if unsigned(tport) > 0: try: func(t, space, ctx, registeredport_idx, 0, tport, 17) - except Exception as e: - print("\texception looking through registered port {:d}/{:d} in {:s}".format(tr_idx,tr_max,t)) + except Exception: + print( + "\texception looking through registered port {:d}/{:d} in {:s}".format( + tr_idx, tr_max, t + ) + ) pass tr_idx += 1 ## Task exception ports exidx = 0 exmax = sizeof(t.exc_actions) // sizeof(t.exc_actions[0]) - while exidx < exmax: ## see: osfmk/mach/[arm|i386]/exception.h - export = t.exc_actions[exidx].port ## send right + while exidx < exmax: ## see: osfmk/mach/[arm|i386]/exception.h + export = t.exc_actions[exidx].port ## send right if unsigned(export) > 0: try: func(t, space, ctx, excports_idx, 0, export, 17) - except Exception as e: - print("\texception looking through exception port {:d}/{:d} in {:s}".format(exidx,exmax,t)) + except Exception: + print( + "\texception looking through exception port {:d}/{:d} in {:s}".format( + exidx, exmax, t + ) + ) pass exidx += 1 ## XXX: any ports still valid after clearing IPC space?! 
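
The IterateAllPorts hunks above keep the macro's existing contract: for every entry in a task's IPC-space table whose ie_bits type bits intersect entry_port_type_mask (0x00070000 for send/receive/send-once rights, 0x000F0000 when port sets are included), it reconstructs the user-visible port name from the table index and the generation byte of ie_bits and hands the port to the caller's callback, then reports the task's special and exception ports the same way. The sketch below is a minimal illustration of that filtering and naming scheme only; it is not part of the patch, and the entry tuples, helper names, and callback are invented for the example.

```python
# Illustrative sketch only -- mirrors how the macro filters IPC-space
# entries by the type bits in ie_bits and reports each match to a callback.

PORT_RIGHTS_MASK  = 0x00070000  # send | receive | send-once
PORT_OR_PSET_MASK = 0x000F0000  # the above plus port-set entries

def entry_name(index, ie_bits):
    # Same expression the macro uses to build the displayed name:
    # table index in the upper bits, generation (top byte of ie_bits) below.
    return (index << 8) | (ie_bits >> 24)

def iterate_entries(entries, callback, include_psets=False):
    mask = PORT_OR_PSET_MASK if include_psets else PORT_RIGHTS_MASK
    for idx, (ie_bits, ie_object) in enumerate(entries, 1):
        if ie_bits & mask:
            callback(idx, "{:x}".format(entry_name(idx, ie_bits)), ie_object)

# One hypothetical entry holding a send right (0x00010000), generation 3.
iterate_entries([(0x00010000 | (3 << 24), 0xffffff8012345678)],
                lambda idx, name, obj: print(idx, name, hex(obj)))
```
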
- for thval in IterateQueue(t.threads, 'thread *', 'task_threads'): + for thval in IterateQueue(t.threads, "thread *", "task_threads"): ## XXX: look at block reason to see if it's in mach_msg_receive - then look at saved state / message ## Thread port (send right) - if getattr(thval.t_tro, 'tro_settable_self_port', 0) > 0: + if getattr(thval.t_tro, "tro_settable_self_port", 0) > 0: thport = thval.t_tro.tro_settable_self_port - func(t, space, ctx, thports_idx, 0, thport, 17) ## see: osfmk/mach/message.h + func( + t, space, ctx, thports_idx, 0, thport, 17 + ) ## see: osfmk/mach/message.h ## Thread special reply port (send-once right) if unsigned(thval.ith_special_reply_port) > 0: thport = thval.ith_special_reply_port - func(t, space, ctx, thports_idx, 0, thport, 18) ## see: osfmk/mach/message.h + func( + t, space, ctx, thports_idx, 0, thport, 18 + ) ## see: osfmk/mach/message.h ## Thread voucher port if unsigned(thval.ith_voucher) > 0: vport = thval.ith_voucher.iv_port @@ -1369,13 +1653,17 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should ## Thread exception ports if unsigned(thval.t_tro.tro_exc_actions) > 0: exidx = 0 - while exidx < exmax: ## see: osfmk/mach/[arm|i386]/exception.h - export = thval.t_tro.tro_exc_actions[exidx].port ## send right + while exidx < exmax: ## see: osfmk/mach/[arm|i386]/exception.h + export = thval.t_tro.tro_exc_actions[exidx].port ## send right if unsigned(export) > 0: try: func(t, space, ctx, excports_idx, 0, export, 17) - except Exception as e: - print("\texception looking through exception port {:d}/{:d} in {:s}".format(exidx,exmax,t)) + except Exception: + print( + "\texception looking through exception port {:d}/{:d} in {:s}".format( + exidx, exmax, t + ) + ) pass exidx += 1 ## XXX: the message on a thread (that's currently being received) @@ -1385,61 +1673,74 @@ def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should # Macro: findportrights def FindPortRightsCallback(task, space, ctx, entry_idx, ipc_entry, ipc_port, port_disp): - """ Callback which uses 'ctx' as the (port,rights_types) tuple for which - a caller is seeking references. This should *not* be used from a - recursive call to IterateAllPorts. + """Callback which uses 'ctx' as the (port,rights_types) tuple for which + a caller is seeking references. This should *not* be used from a + recursive call to IterateAllPorts. 
""" global port_iteration_do_print_taskname (qport, rights_type) = ctx - entry_name = '' - entry_str = '' + entry_name = "" + entry_str = "" if unsigned(ipc_entry) != 0: entry_bits = unsigned(ipc_entry.ie_bits) - entry_name = "{:x}".format( (entry_idx << 8 | entry_bits >> 24) ) - if (int(entry_bits) & 0x001f0000) != 0 and unsigned(ipc_entry.ie_object) == unsigned(qport): + entry_name = "{:x}".format(GetNameFromIndexAndIEBits(entry_idx, entry_bits)) + if (int(entry_bits) & 0x001F0000) != 0 and unsigned( + ipc_entry.ie_object + ) == unsigned(qport): ## it's a valid entry, and it points to the port - entry_str = '\t' + GetIPCEntrySummary(ipc_entry, entry_name, rights_type) + entry_str = "\t" + GetIPCEntrySummary(ipc_entry, entry_name, rights_type) procname = GetProcNameForTask(task) - if ipc_port and ipc_port != xnudefines.MACH_PORT_DEAD and ipc_port.ip_messages.imq_msgcount > 0: - sys.stderr.write(" checking {:s} busy-port {}:{:#x}...{:30s}\r".format(procname, entry_name, unsigned(ipc_port), '')) + if ( + ipc_port + and ipc_port != xnudefines.MACH_PORT_DEAD + and ipc_port.ip_messages.imq_msgcount > 0 + ): + sys.stderr.write( + " checking {:s} busy-port {}:{:#x}...{:30s}\r".format( + procname, entry_name, unsigned(ipc_port), "" + ) + ) ## Search through busy ports to find descriptors which could ## contain the only reference to this port! - for kmsgp in IterateCircleQueue(ipc_port.ip_messages.imq_messages, 'ipc_kmsg', 'ikm_link'): + for kmsgp in IterateCircleQueue( + ipc_port.ip_messages.imq_messages, "ipc_kmsg", "ikm_link" + ): entry_str = FindKmsgPortRefs(entry_str, task, ipc_port, kmsgp, qport) if len(entry_str) > 0: - sys.stderr.write("{:80s}\r".format('')) + sys.stderr.write("{:80s}\r".format("")) if port_iteration_do_print_taskname: print("Task: {0: <#x} {1: ] - -S ipc_space : only search the specified ipc space - -R rights_type : only display rights matching the string 'rights_type' - types of rights: - 'Dead' : Dead name - 'Set' : Port set - 'S' : Send right - 'R' : Receive right - 'O' : Send-once right - types of notifications: - 'd' : Dead-Name notification requested - 's' : Send-Possible notification armed - 'r' : Send-Possible notification requested - 'n' : No-Senders notification requested - 'x' : Port-destroy notification requested +@lldb_command("findportrights", "R:S:") +def FindPortRights(cmd_args=None, cmd_options={}): + """Routine to locate and print all extant rights to a given port + Usage: findportrights [-R rights_type] [-S ] + -S ipc_space : only search the specified ipc space + -R rights_type : only display rights matching the string 'rights_type' + + types of rights: + 'Dead' : Dead name + 'Set' : Port set + 'S' : Send right + 'R' : Receive right + 'O' : Send-once right + types of notifications: + 'd' : Dead-Name notification requested + 's' : Send-Possible notification armed + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("no port address provided") - port = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_port *') + port = kern.GetValueFromAddress(cmd_args[0], "struct ipc_port *") rights_type = 0 if "-R" in cmd_options: @@ -1447,24 +1748,29 @@ def FindPortRights(cmd_args=None, cmd_options={}): tasklist = None if "-S" in cmd_options: - space = kern.GetValueFromAddress(cmd_options["-S"], 'struct ipc_space *') - tasklist = [ space.is_task ] + space = kern.GetValueFromAddress(cmd_options["-S"], "struct ipc_space *") + 
tasklist = [space.is_task] ## Don't include port sets ## Don't recurse on busy ports (we do that manually) ## DO log progress - IterateAllPorts(tasklist, FindPortRightsCallback, (port, rights_type), False, False, True) - sys.stderr.write("{:120s}\r".format(' ')) + IterateAllPorts( + tasklist, FindPortRightsCallback, (port, rights_type), False, False, True + ) + sys.stderr.write("{:120s}\r".format(" ")) print("Done.") + + # EndMacro: findportrights # Macro: countallports + def CountPortsCallback(task, space, ctx, entry_idx, ipc_entry, ipc_port, port_disp): - """ Callback which uses 'ctx' as the set of all ports found in the - iteration. This should *not* be used from a recursive - call to IterateAllPorts. + """Callback which uses 'ctx' as the set of all ports found in the + iteration. This should *not* be used from a recursive + call to IterateAllPorts. """ global intransit_idx @@ -1478,22 +1784,23 @@ def CountPortsCallback(task, space, ctx, entry_idx, ipc_entry, ipc_port, port_di p_intransit.add(ipc_port_addr) if task.active or (task.halting and not task.active): - if not task in p_bytask: - p_bytask[task] = { 'transit':0, 'table':0, 'other':0 } + if task not in p_bytask: + p_bytask[task] = {"transit": 0, "table": 0, "other": 0} if entry_idx == intransit_idx: - p_bytask[task]['transit'] += 1 + p_bytask[task]["transit"] += 1 elif entry_idx >= 0: - p_bytask[task]['table'] += 1 + p_bytask[task]["table"] += 1 else: - p_bytask[task]['other'] += 1 + p_bytask[task]["other"] += 1 -@header(f"{'#ports': <10s} {'in transit': <10s} {'Special': <10s}") -@lldb_command('countallports', 'P', fancy=True) + +@header(f"{'#ports': <10s} {'in transit': <10s} {'Special': <10s}") +@lldb_command("countallports", "P", fancy=True) def CountAllPorts(cmd_args=None, cmd_options={}, O=None): - """ Routine to search for all as many references to ipc_port structures in the kernel - that we can find. - Usage: countallports [-P] - -P : include port sets in the count (default: NO) + """Routine to search for all as many references to ipc_port structures in the kernel + that we can find. 
+ Usage: countallports [-P] + -P : include port sets in the count (default: NO) """ p_set = set() p_intransit = set() @@ -1506,90 +1813,110 @@ def CountAllPorts(cmd_args=None, cmd_options={}, O=None): ## optionally include port sets ## DO recurse on busy ports ## DO log progress - IterateAllPorts(None, CountPortsCallback, (p_set, p_intransit, p_bytask), find_psets, True, True) + IterateAllPorts( + None, CountPortsCallback, (p_set, p_intransit, p_bytask), find_psets, True, True + ) sys.stderr.write(f"{' ':120s}\r") - # sort by ipc table size - with O.table(GetTaskIPCSummary.header + ' ' + CountAllPorts.header): - for task, port_summary in sorted(p_bytask.items(), key=lambda item: item[1]['table'], reverse=True): + # sort by ipc table size + with O.table(GetTaskIPCSummary.header + " " + CountAllPorts.header): + for task, port_summary in sorted( + p_bytask.items(), key=lambda item: item[1]["table"], reverse=True + ): outstring, _ = GetTaskIPCSummary(task) outstring += f" {port_summary['table']: <10d} {port_summary['transit']: <10d} {port_summary['other']: <10d}" print(outstring) - + print(f"\nTotal ports found: {len(p_set)}") print(f"Number of ports In Transit: {len(p_intransit)}") - + + # EndMacro: countallports # Macro: showpipestats -@lldb_command('showpipestats') + +@lldb_command("showpipestats") def ShowPipeStats(cmd_args=None): - """ Display pipes usage information in the kernel - """ + """Display pipes usage information in the kernel""" print("Number of pipes: {: d}".format(kern.globals.amountpipes)) - print("Memory used by pipes: {:s}".format(sizeof_fmt(int(kern.globals.amountpipekva)))) - print("Max memory allowed for pipes: {:s}".format(sizeof_fmt(int(kern.globals.maxpipekva)))) + print( + "Memory used by pipes: {:s}".format(sizeof_fmt(int(kern.globals.amountpipekva))) + ) + print( + "Max memory allowed for pipes: {:s}".format( + sizeof_fmt(int(kern.globals.maxpipekva)) + ) + ) + # EndMacro: showpipestats # Macro: showtaskbusyports -@lldb_command('showtaskbusyports', fancy=True) + +@lldb_command("showtaskbusyports", fancy=True) def ShowTaskBusyPorts(cmd_args=None, cmd_options={}, O=None): - """ Routine to print information about receive rights belonging to this task that - have enqueued messages. This is often a sign of a blocked or hung process - Usage: showtaskbusyports + """Routine to print information about receive rights belonging to this task that + have enqueued messages. This is often a sign of a blocked or hung process + Usage: showtaskbusyports """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("No arguments passed. 
Please pass in the address of a task") - task = kern.GetValueFromAddress(cmd_args[0], 'task_t') + task = kern.GetValueFromAddress(cmd_args[0], "task_t") is_tableval, num_entries = GetSpaceTable(task.itk_space) if is_tableval: - ports = GetSpaceObjectsWithBits(is_tableval, num_entries, 0x00020000, - gettype('struct ipc_port')) + ports = GetSpaceObjectsWithBits( + is_tableval, num_entries, 0x00020000, gettype("struct ipc_port") + ) with O.table(PrintPortSummary.header): for port in ports: - if port.xGetIntegerByPath('.ip_messages.imq_msgcount'): + if port.xGetIntegerByPath(".ip_messages.imq_msgcount"): PrintPortSummary(value(port.AddressOf()), O=O) + # EndMacro: showtaskbusyports # Macro: showallbusyports -@lldb_command('showallbusyports', fancy=True) + +@lldb_command("showallbusyports", fancy=True) def ShowAllBusyPorts(cmd_args=None, cmd_options={}, O=None): - """ Routine to print information about all receive rights on the system that - have enqueued messages. + """Routine to print information about all receive rights on the system that + have enqueued messages. """ with O.table(PrintPortSummary.header): port_ty = gettype("struct ipc_port") for port in kmemory.Zone("ipc ports").iter_allocated(port_ty): - if port.xGetIntegerByPath('.ip_messages.imq_msgcount') > 0: + if port.xGetIntegerByPath(".ip_messages.imq_msgcount") > 0: PrintPortSummary(value(port.AddressOf()), O=O) + # EndMacro: showallbusyports # Macro: showallports -@lldb_command('showallports', fancy=True) -def ShowAllPorts(cmd_args=None, cmd_options={}, O=None): - """ Routine to print information about all allocated ports in the system - usage: showallports +@lldb_command("showallports", fancy=True) +def ShowAllPorts(cmd_args=None, cmd_options={}, O=None): + """Routine to print information about all allocated ports in the system + + usage: showallports """ with O.table(PrintPortSummary.header): port_ty = gettype("struct ipc_port") for port in kmemory.Zone("ipc ports").iter_allocated(port_ty): PrintPortSummary(value(port.AddressOf()), show_kmsg_summary=False, O=O) + # EndMacro: showallports # Macro: findkobjectport -@lldb_command('findkobjectport', fancy=True) -def FindKobjectPort(cmd_args=None, cmd_options={}, O=None): - """ Locate all ports pointing to a given kobject - usage: findkobjectport +@lldb_command("findkobjectport", fancy=True) +def FindKobjectPort(cmd_args=None, cmd_options={}, O=None): + """Locate all ports pointing to a given kobject + + usage: findkobjectport """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError() @@ -1600,31 +1927,35 @@ def FindKobjectPort(cmd_args=None, cmd_options={}, O=None): with O.table(PrintPortSummary.header): for port in kmemory.Zone("ipc ports").iter_allocated(port_ty): - if port.xGetIntegerByPath('.ip_object.io_bits') & 0x3ff == 0: + otype = port.xGetIntegerByPath(".ip_object.io_type") + if not itk_task_ports(otype): continue - ip_kobject = kmem.make_address(port.xGetScalarByName('ip_kobject')) + ip_kobject = kmem.make_address(port.xGetScalarByName("ip_kobject")) if ip_kobject == kobj_addr: PrintPortSummary(value(port.AddressOf()), show_kmsg_summary=False, O=O) + # EndMacro: findkobjectport # Macro: showtaskbusypsets -@lldb_command('showtaskbusypsets', fancy=True) + +@lldb_command("showtaskbusypsets", fancy=True) def ShowTaskBusyPortSets(cmd_args=None, cmd_options={}, O=None): - """ Routine to print information about port sets belonging to this task that - have enqueued messages. 
This is often a sign of a blocked or hung process - Usage: showtaskbusypsets + """Routine to print information about port sets belonging to this task that + have enqueued messages. This is often a sign of a blocked or hung process + Usage: showtaskbusypsets """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("No arguments passed. Please pass in the address of a task") - task = kern.GetValueFromAddress(cmd_args[0], 'task_t') + task = kern.GetValueFromAddress(cmd_args[0], "task_t") is_tableval, num_entries = GetSpaceTable(task.itk_space) if is_tableval: - psets = GetSpaceObjectsWithBits(is_tableval, num_entries, 0x00080000, - gettype('struct ipc_pset')) + psets = GetSpaceObjectsWithBits( + is_tableval, num_entries, 0x00080000, gettype("struct ipc_pset") + ) with O.table(PrintPortSetSummary.header): for pset in (value(v.AddressOf()) for v in psets): @@ -1632,13 +1963,15 @@ def ShowTaskBusyPortSets(cmd_args=None, cmd_options={}, O=None): if wq.asPort().ip_messages.imq_msgcount > 0: PrintPortSetSummary(pset, space=task.itk_space, O=O) + # EndMacro: showtaskbusyports # Macro: showallbusypsets -@lldb_command('showallbusypsets', fancy=True) + +@lldb_command("showallbusypsets", fancy=True) def ShowAllBusyPortSets(cmd_args=None, cmd_options={}, O=None): - """ Routine to print information about all port sets on the system that - have enqueued messages. + """Routine to print information about all port sets on the system that + have enqueued messages. """ with O.table(PrintPortSetSummary.header): pset_ty = gettype("struct ipc_pset") @@ -1649,29 +1982,32 @@ def ShowAllBusyPortSets(cmd_args=None, cmd_options={}, O=None): if port.ip_messages.imq_msgcount > 0: PrintPortSetSummary(pset, space=port.ip_receiver, O=O) + # EndMacro: showallbusyports # Macro: showallpsets -@lldb_command('showallpsets', fancy=True) -def ShowAllPortSets(cmd_args=None, cmd_options={}, O=None): - """ Routine to print information about all allocated psets in the system - usage: showallpsets +@lldb_command("showallpsets", fancy=True) +def ShowAllPortSets(cmd_args=None, cmd_options={}, O=None): + """Routine to print information about all allocated psets in the system + + usage: showallpsets """ with O.table(PrintPortSetSummary.header): pset_ty = gettype("struct ipc_pset") for pset in kmemory.Zone("ipc port sets").iter_allocated(pset_ty): PrintPortSetSummary(value(pset.AddressOf()), O=O) + # EndMacro: showallports # Macro: showbusyportsummary -@lldb_command('showbusyportsummary') + +@lldb_command("showbusyportsummary") def ShowBusyPortSummary(cmd_args=None): - """ Routine to print a summary of information about all receive rights - on the system that have enqueued messages. + """Routine to print a summary of information about all receive rights + on the system that have enqueued messages. 
""" - task_queue_head = kern.globals.tasks ipc_table_size = 0 ipc_busy_ports = 0 @@ -1690,78 +2026,88 @@ def ShowBusyPortSummary(cmd_args=None): ipc_busy_ports += nbusy ipc_msgs += nmsgs print(summary) - print("Total Table Size: {:d}, Busy Ports: {:d}, Messages in-flight: {:d}".format(ipc_table_size, ipc_busy_ports, ipc_msgs)) + print( + "Total Table Size: {:d}, Busy Ports: {:d}, Messages in-flight: {:d}".format( + ipc_table_size, ipc_busy_ports, ipc_msgs + ) + ) return + # EndMacro: showbusyportsummary # Macro: showport / showpset + def ShowPortOrPset(obj, space=0, O=None): - """ Routine that lists details about a given IPC port or pset - Syntax: (lldb) showport 0xaddr + """Routine that lists details about a given IPC port or pset + Syntax: (lldb) showport 0xaddr """ if not obj or obj == xnudefines.IPC_OBJECT_DEAD: print("IPC_OBJECT_DEAD") return - otype = (obj.io_bits & 0x7fff0000) >> 16 - if otype == 0: # IOT_PORT - with O.table(PrintPortSummary.header): - PrintPortSummary(cast(obj, 'ipc_port_t'), show_sets=True, O=O) - elif otype == 1: # IOT_PSET + if IsPortSetType(obj.io_type): with O.table(PrintPortSetSummary.header): - PrintPortSetSummary(cast(obj, 'ipc_pset_t'), space, O=O) + PrintPortSetSummary(cast(obj, "ipc_pset_t"), space, O=O) + else: + with O.table(PrintPortSummary.header): + PrintPortSummary(cast(obj, "ipc_port_t"), show_sets=True, O=O) -@lldb_command('showport', 'K', fancy=True) + +@lldb_command("showport", "K", fancy=True) def ShowPort(cmd_args=None, cmd_options={}, O=None): - """ Routine that lists details about a given IPC port + """Routine that lists details about a given IPC port - usage: showport
+ usage: showport
""" # -K is default and kept for backward compat, it used to mean "show kmsg queue" if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Missing port argument") - obj = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_object *') + obj = kern.GetValueFromAddress(cmd_args[0], "struct ipc_object *") ShowPortOrPset(obj, O=O) -@lldb_command('showpset', "S:", fancy=True) +@lldb_command("showpset", "S:", fancy=True) def ShowPSet(cmd_args=None, cmd_options={}, O=None): - """ Routine that prints details for a given ipc_pset * + """Routine that prints details for a given ipc_pset * - usage: showpset [-S ]
+ usage: showpset [-S ]
""" if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Missing port argument") space = 0 if "-S" in cmd_options: - space = kern.GetValueFromAddress(cmd_options["-S"], 'struct ipc_space *') - obj = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_object *') + space = kern.GetValueFromAddress(cmd_options["-S"], "struct ipc_space *") + obj = kern.GetValueFromAddress(cmd_args[0], "struct ipc_object *") ShowPortOrPset(obj, space=space, O=O) + # EndMacro: showport / showpset # Macro: showkmsg: -@lldb_command('showkmsg') + +@lldb_command("showkmsg") def ShowKMSG(cmd_args=[]): - """ Show detail information about a structure - Usage: (lldb) showkmsg + """Show detail information about a structure + Usage: (lldb) showkmsg """ if cmd_args is None or len(cmd_args) == 0: - raise ArgumentError('Invalid arguments') + raise ArgumentError("Invalid arguments") - kmsg = kern.GetValueFromAddress(cmd_args[0], 'ipc_kmsg_t') + kmsg = kern.GetValueFromAddress(cmd_args[0], "ipc_kmsg_t") print(GetKMsgSummary.header) print(GetKMsgSummary(kmsg)) + # EndMacro: showkmsg # IPC importance inheritance related macros. -@lldb_command('showalliits') + +@lldb_command("showalliits") def ShowAllIITs(cmd_args=[], cmd_options={}): - """ Development only macro. Show list of all iits allocated in the system. """ + """Development only macro. Show list of all iits allocated in the system.""" try: iit_queue = kern.globals.global_iit_alloc_queue except ValueError: @@ -1769,109 +2115,151 @@ def ShowAllIITs(cmd_args=[], cmd_options={}): return print(GetIPCImportantTaskSummary.header) - for iit in IterateQueue(iit_queue, 'struct ipc_importance_task *', 'iit_allocation'): + for iit in IterateQueue( + iit_queue, "struct ipc_importance_task *", "iit_allocation" + ): print(GetIPCImportantTaskSummary(iit)) return -@header("{: <18s} {: <3s} {: <18s} {: <32s} {: <18s} {: <8s}".format("ipc_imp_inherit", "don", "to_task", "proc_name", "from_elem", "depth")) -@lldb_type_summary(['ipc_importance_inherit *', 'ipc_importance_inherit_t']) + +@header( + "{: <18s} {: <3s} {: <18s} {: <32s} {: <18s} {: <8s}".format( + "ipc_imp_inherit", "don", "to_task", "proc_name", "from_elem", "depth" + ) +) +@lldb_type_summary(["ipc_importance_inherit *", "ipc_importance_inherit_t"]) def GetIPCImportanceInheritSummary(iii): - """ describes iii object of type ipc_importance_inherit_t * """ + """describes iii object of type ipc_importance_inherit_t *""" out_str = "" fmt = "{o: <#18x} {don: <3s} {o.iii_to_task.iit_task: <#18x} {task_name: <20s} {o.iii_from_elem: <#18x} {o.iii_depth: <#8x}" donating_str = "" if unsigned(iii.iii_donating): donating_str = "DON" taskname = GetProcNameForTask(iii.iii_to_task.iit_task) - if hasattr(iii.iii_to_task, 'iit_bsd_pid'): - taskname = "({:d}) {:s}".format(iii.iii_to_task.iit_bsd_pid, iii.iii_to_task.iit_procname) - out_str += fmt.format(o=iii, task_name = taskname, don=donating_str) + if hasattr(iii.iii_to_task, "iit_bsd_pid"): + taskname = "({:d}) {:s}".format( + iii.iii_to_task.iit_bsd_pid, iii.iii_to_task.iit_procname + ) + out_str += fmt.format(o=iii, task_name=taskname, don=donating_str) return out_str -@static_var('recursion_count', 0) -@header("{: <18s} {: <4s} {: <8s} {: <8s} {: <18s} {: <18s}".format("iie", "type", "refs", "made", "#kmsgs", "#inherits")) -@lldb_type_summary(['ipc_importance_elem *']) + +@static_var("recursion_count", 0) +@header( + "{: <18s} {: <4s} {: <8s} {: <8s} {: <18s} {: <18s}".format( + "iie", "type", "refs", "made", "#kmsgs", "#inherits" + ) +) +@lldb_type_summary(["ipc_importance_elem 
*"]) def GetIPCImportanceElemSummary(iie): - """ describes an ipc_importance_elem * object """ + """describes an ipc_importance_elem * object""" if GetIPCImportanceElemSummary.recursion_count > 500: GetIPCImportanceElemSummary.recursion_count = 0 return "Recursion of 500 reached" - out_str = '' + out_str = "" fmt = "{: <#18x} {: <4s} {: <8d} {: <8d} {: <#18x} {: <#18x}" if unsigned(iie.iie_bits) & xnudefines.IIE_TYPE_MASK: type_str = "INH" inherit_count = 0 else: - type_str = 'TASK' - iit = Cast(iie, 'struct ipc_importance_task *') - inherit_count = sum(1 for i in IterateQueue(iit.iit_inherits, 'struct ipc_importance_inherit *', 'iii_inheritance')) + type_str = "TASK" + iit = Cast(iie, "struct ipc_importance_task *") + inherit_count = sum( + 1 + for i in IterateQueue( + iit.iit_inherits, "struct ipc_importance_inherit *", "iii_inheritance" + ) + ) refs = unsigned(iie.iie_bits) >> xnudefines.IIE_TYPE_BITS made_refs = unsigned(iie.iie_made) - kmsg_count = sum(1 for i in IterateQueue(iie.iie_kmsgs, 'struct ipc_kmsg *', 'ikm_inheritance')) + kmsg_count = sum( + 1 for i in IterateQueue(iie.iie_kmsgs, "struct ipc_kmsg *", "ikm_inheritance") + ) out_str += fmt.format(iie, type_str, refs, made_refs, kmsg_count, inherit_count) - if config['verbosity'] > vHUMAN: + if config["verbosity"] > vHUMAN: if kmsg_count > 0: - out_str += "\n\t"+ GetKMsgSummary.header - for k in IterateQueue(iie.iie_kmsgs, 'struct ipc_kmsg *', 'ikm_inheritance'): - out_str += "\t" + "{: <#18x}".format(GetKmsgHeader(k).msgh_remote_port) + ' ' + GetKMsgSummary(k, "\t").lstrip() + out_str += "\n\t" + GetKMsgSummary.header + for k in IterateQueue( + iie.iie_kmsgs, "struct ipc_kmsg *", "ikm_inheritance" + ): + out_str += ( + "\t" + + "{: <#18x}".format(GetKmsgHeader(k).msgh_remote_port) + + " " + + GetKMsgSummary(k, "\t").lstrip() + ) out_str += "\n" if inherit_count > 0: out_str += "\n\t" + GetIPCImportanceInheritSummary.header + "\n" - for i in IterateQueue(iit.iit_inherits, 'struct ipc_importance_inherit *', 'iii_inheritance'): + for i in IterateQueue( + iit.iit_inherits, "struct ipc_importance_inherit *", "iii_inheritance" + ): out_str += "\t" + GetIPCImportanceInheritSummary(i) + "\n" out_str += "\n" if type_str == "INH": - iii = Cast(iie, 'struct ipc_importance_inherit *') + iii = Cast(iie, "struct ipc_importance_inherit *") out_str += "Inherit from: " + GetIPCImportanceElemSummary(iii.iii_from_elem) return out_str + @header("{: <18s} {: <18s} {: <32}".format("iit", "task", "name")) -@lldb_type_summary(['ipc_importance_task *']) +@lldb_type_summary(["ipc_importance_task *"]) def GetIPCImportantTaskSummary(iit): - """ iit is a ipc_importance_task value object. - """ + """iit is a ipc_importance_task value object.""" fmt = "{: <#18x} {: <#18x} {: <32}" - out_str='' + out_str = "" pname = GetProcNameForTask(iit.iit_task) - if hasattr(iit, 'iit_bsd_pid'): + if hasattr(iit, "iit_bsd_pid"): pname = "({:d}) {:s}".format(iit.iit_bsd_pid, iit.iit_procname) out_str += fmt.format(iit, iit.iit_task, pname) return out_str -@lldb_command('showallimportancetasks') + +@lldb_command("showallimportancetasks") def ShowIPCImportanceTasks(cmd_args=[], cmd_options={}): - """ display a list of all tasks with ipc importance information. - Usage: (lldb) showallimportancetasks - Tip: add "-v" to see detailed information on each kmsg or inherit elems + """display a list of all tasks with ipc importance information. 
+ Usage: (lldb) showallimportancetasks + Tip: add "-v" to see detailed information on each kmsg or inherit elems """ - print(' ' + GetIPCImportantTaskSummary.header + ' ' + GetIPCImportanceElemSummary.header) + print( + " " + + GetIPCImportantTaskSummary.header + + " " + + GetIPCImportanceElemSummary.header + ) for t in kern.tasks: s = "" if unsigned(t.task_imp_base): - s += ' ' + GetIPCImportantTaskSummary(t.task_imp_base) - s += ' ' + GetIPCImportanceElemSummary(addressof(t.task_imp_base.iit_elem)) + s += " " + GetIPCImportantTaskSummary(t.task_imp_base) + s += " " + GetIPCImportanceElemSummary(addressof(t.task_imp_base.iit_elem)) print(s) -@lldb_command('showipcimportance', '') + +@lldb_command("showipcimportance", "") def ShowIPCImportance(cmd_args=[], cmd_options={}): - """ Describe an importance from argument. - Usage: (lldb) showimportance + """Describe an importance from argument. + Usage: (lldb) showimportance """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Please provide valid argument") - elem = kern.GetValueFromAddress(cmd_args[0], 'ipc_importance_elem_t') + elem = kern.GetValueFromAddress(cmd_args[0], "ipc_importance_elem_t") print(GetIPCImportanceElemSummary.header) print(GetIPCImportanceElemSummary(elem)) -@header("{: <18s} {: <18s} {: <8s} {: <5s} {: <5s} {: <8s}".format("ivac", "tbl", "tblsize", "index", "Grow", "freelist")) -@lldb_type_summary(['ipc_voucher_attr_control *', 'ipc_voucher_attr_control_t']) + +@header( + "{: <18s} {: <18s} {: <8s} {: <5s} {: <5s} {: <8s}".format( + "ivac", "tbl", "tblsize", "index", "Grow", "freelist" + ) +) +@lldb_type_summary(["ipc_voucher_attr_control *", "ipc_voucher_attr_control_t"]) def GetIPCVoucherAttrControlSummary(ivac): - """ describes a voucher attribute control settings """ + """describes a voucher attribute control settings""" out_str = "" fmt = "{c: <#18x} {c.ivac_table: <#18x} {c.ivac_table_size: <8d} {c.ivac_key_index: <5d} {growing: <5s} {c.ivac_freelist: <8d}" growing_str = "" @@ -1879,39 +2267,47 @@ def GetIPCVoucherAttrControlSummary(ivac): if ivac == 0: return "{: <#18x}".format(ivac) - growing_str = "Y" if unsigned(ivac.ivac_is_growing) else "N" - out_str += fmt.format(c=ivac, growing = growing_str) + growing_str = "Y" if unsigned(ivac.ivac_is_growing) else "N" + out_str += fmt.format(c=ivac, growing=growing_str) return out_str -@lldb_command('showivac','') + +@lldb_command("showivac", "") def ShowIPCVoucherAttributeControl(cmd_args=[], cmd_options={}): - """ Show summary of voucher attribute contols. - Usage: (lldb) showivac + """Show summary of voucher attribute contols. 
+ Usage: (lldb) showivac """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Please provide correct arguments.") - ivac = kern.GetValueFromAddress(cmd_args[0], 'ipc_voucher_attr_control_t') + ivac = kern.GetValueFromAddress(cmd_args[0], "ipc_voucher_attr_control_t") print(GetIPCVoucherAttrControlSummary.header) print(GetIPCVoucherAttrControlSummary(ivac)) - if config['verbosity'] > vHUMAN: + if config["verbosity"] > vHUMAN: cur_entry_index = 0 last_entry_index = unsigned(ivac.ivac_table_size) print("index " + GetIPCVoucherAttributeEntrySummary.header) while cur_entry_index < last_entry_index: - print("{: <5d} ".format(cur_entry_index) + GetIPCVoucherAttributeEntrySummary(addressof(ivac.ivac_table[cur_entry_index]))) + print( + "{: <5d} ".format(cur_entry_index) + + GetIPCVoucherAttributeEntrySummary( + addressof(ivac.ivac_table[cur_entry_index]) + ) + ) cur_entry_index += 1 - - -@header("{: <18s} {: <30s} {: <30s} {: <30s} {: <30s}".format("ivam", "get_value_fn", "extract_fn", "release_value_fn", "command_fn")) -@lldb_type_summary(['ipc_voucher_attr_manager *', 'ipc_voucher_attr_manager_t']) +@header( + "{: <18s} {: <30s} {: <30s} {: <30s} {: <30s}".format( + "ivam", "get_value_fn", "extract_fn", "release_value_fn", "command_fn" + ) +) +@lldb_type_summary(["ipc_voucher_attr_manager *", "ipc_voucher_attr_manager_t"]) def GetIPCVoucherAttrManagerSummary(ivam): - """ describes a voucher attribute manager settings """ + """describes a voucher attribute manager settings""" out_str = "" fmt = "{: <#18x} {: <30s} {: <30s} {: <30s} {: <30s}" - if unsigned(ivam) == 0 : + if unsigned(ivam) == 0: return "{: <#18x}".format(ivam) get_value_fn = kern.Symbolicate(unsigned(ivam.ivam_get_value)) @@ -1921,31 +2317,51 @@ def GetIPCVoucherAttrManagerSummary(ivam): out_str += fmt.format(ivam, get_value_fn, extract_fn, release_value_fn, command_fn) return out_str + def iv_key_to_index(key): - """ ref: osfmk/ipc/ipc_voucher.c: iv_key_to_index """ - if (key == xnudefines.MACH_VOUCHER_ATTR_KEY_ALL) or (key > xnudefines.MACH_VOUCHER_ATTR_KEY_NUM): + """ref: osfmk/ipc/ipc_voucher.c: iv_key_to_index""" + if (key == xnudefines.MACH_VOUCHER_ATTR_KEY_ALL) or ( + key > xnudefines.MACH_VOUCHER_ATTR_KEY_NUM + ): return xnudefines.IV_UNUSED_KEYINDEX return key - 1 + def iv_index_to_key(index): - """ ref: osfmk/ipc/ipc_voucher.c: iv_index_to_key """ + """ref: osfmk/ipc/ipc_voucher.c: iv_index_to_key""" if index < xnudefines.MACH_VOUCHER_ATTR_KEY_NUM_WELL_KNOWN: return index + 1 return xnudefines.MACH_VOUCHER_ATTR_KEY_NONE -@header("{: <3s} {: <3s} {:s} {:s}".format("idx", "key", GetIPCVoucherAttrControlSummary.header.strip(), GetIPCVoucherAttrManagerSummary.header.strip())) -@lldb_type_summary(['ipc_voucher_global_table_element *', 'ipc_voucher_global_table_element_t']) + +@header( + "{: <3s} {: <3s} {:s} {:s}".format( + "idx", + "key", + GetIPCVoucherAttrControlSummary.header.strip(), + GetIPCVoucherAttrManagerSummary.header.strip(), + ) +) +@lldb_type_summary( + ["ipc_voucher_global_table_element *", "ipc_voucher_global_table_element_t"] +) def GetIPCVoucherGlobalTableElementSummary(idx, ivac, ivam): - """ describes a ipc_voucher_global_table_element object """ + """describes a ipc_voucher_global_table_element object""" out_str = "" fmt = "{idx: <3d} {key: <3d} {ctrl_s:s} {mgr_s:s}" - out_str += fmt.format(idx=idx, key=iv_index_to_key(idx), ctrl_s=GetIPCVoucherAttrControlSummary(addressof(ivac)), mgr_s=GetIPCVoucherAttrManagerSummary(ivam)) + out_str += fmt.format( + idx=idx, + key=iv_index_to_key(idx), + 
ctrl_s=GetIPCVoucherAttrControlSummary(addressof(ivac)), + mgr_s=GetIPCVoucherAttrManagerSummary(ivam), + ) return out_str -@lldb_command('showglobalvouchertable', '') + +@lldb_command("showglobalvouchertable", "") def ShowGlobalVoucherTable(cmd_args=[], cmd_options={}): - """ show detailed information of all voucher attribute managers registered with vouchers system - Usage: (lldb) showglobalvouchertable + """show detailed information of all voucher attribute managers registered with vouchers system + Usage: (lldb) showglobalvouchertable """ entry_size = sizeof(kern.globals.ivac_global_table[0]) elems = sizeof(kern.globals.ivac_global_table) // entry_size @@ -1957,72 +2373,97 @@ def ShowGlobalVoucherTable(cmd_args=[], cmd_options={}): continue print(GetIPCVoucherGlobalTableElementSummary(i, ivac, ivam)) + # Type summaries for Bag of Bits. -@lldb_type_summary(['user_data_value_element', 'user_data_element_t']) -@header("{0: <20s} {1: <16s} {2: <20s} {3: <20s} {4: <16s} {5: <20s}".format("user_data_ve", "maderefs", "checksum", "hash value", "size", "data")) + +@lldb_type_summary(["user_data_value_element", "user_data_element_t"]) +@header( + "{0: <20s} {1: <16s} {2: <20s} {3: <20s} {4: <16s} {5: <20s}".format( + "user_data_ve", "maderefs", "checksum", "hash value", "size", "data" + ) +) def GetBagofBitsElementSummary(data_element): - """ Summarizes the Bag of Bits element - params: data_element = value of the object of type user_data_value_element_t - returns: String with summary of the type. + """Summarizes the Bag of Bits element + params: data_element = value of the object of type user_data_value_element_t + returns: String with summary of the type. """ format_str = "{0: <#20x} {1: <16d} {2: <#20x} {3: <#20x} {4: <16d}" - out_string = format_str.format(data_element, unsigned(data_element.e_made), data_element.e_sum, data_element.e_hash, unsigned(data_element.e_size)) + out_string = format_str.format( + data_element, + unsigned(data_element.e_made), + data_element.e_sum, + data_element.e_hash, + unsigned(data_element.e_size), + ) out_string += " 0x" for i in range(0, (unsigned(data_element.e_size) - 1)): out_string += "{:02x}".format(int(data_element.e_data[i])) return out_string + def GetIPCHandleSummary(handle_ptr): - """ converts a handle value inside a voucher attribute table to ipc element and returns appropriate summary. - params: handle_ptr - uint64 number stored in handle of voucher. - returns: str - string summary of the element held in internal structure + """converts a handle value inside a voucher attribute table to ipc element and returns appropriate summary. + params: handle_ptr - uint64 number stored in handle of voucher. + returns: str - string summary of the element held in internal structure """ - elem = kern.GetValueFromAddress(handle_ptr, 'ipc_importance_elem_t') + elem = kern.GetValueFromAddress(handle_ptr, "ipc_importance_elem_t") if elem.iie_bits & xnudefines.IIE_TYPE_MASK: - iie = Cast(elem, 'struct ipc_importance_inherit *') + iie = Cast(elem, "struct ipc_importance_inherit *") return GetIPCImportanceInheritSummary(iie) else: - iit = Cast(elem, 'struct ipc_importance_task *') + iit = Cast(elem, "struct ipc_importance_task *") return GetIPCImportantTaskSummary(iit) + def GetATMHandleSummary(handle_ptr): - """ Convert a handle value to atm value and returns corresponding summary of its fields. 
- params: handle_ptr - uint64 number stored in handle of voucher - returns: str - summary of atm value + """Convert a handle value to atm value and returns corresponding summary of its fields. + params: handle_ptr - uint64 number stored in handle of voucher + returns: str - summary of atm value """ return "???" + def GetBankHandleSummary(handle_ptr): - """ converts a handle value inside a voucher attribute table to bank element and returns appropriate summary. - params: handle_ptr - uint64 number stored in handle of voucher. - returns: str - summary of bank element + """converts a handle value inside a voucher attribute table to bank element and returns appropriate summary. + params: handle_ptr - uint64 number stored in handle of voucher. + returns: str - summary of bank element """ - if handle_ptr == 1 : + if handle_ptr == 1: return "Bank task of Current task" - elem = kern.GetValueFromAddress(handle_ptr, 'bank_element_t') - if elem.be_type & 1 : - ba = Cast(elem, 'struct bank_account *') + elem = kern.GetValueFromAddress(handle_ptr, "bank_element_t") + if elem.be_type & 1: + ba = Cast(elem, "struct bank_account *") return GetBankAccountSummary(ba) else: - bt = Cast(elem, 'struct bank_task *') + bt = Cast(elem, "struct bank_task *") return GetBankTaskSummary(bt) + def GetBagofBitsHandleSummary(handle_ptr): - """ Convert a handle value to bag of bits value and returns corresponding summary of its fields. - params: handle_ptr - uint64 number stored in handle of voucher - returns: str - summary of bag of bits element + """Convert a handle value to bag of bits value and returns corresponding summary of its fields. + params: handle_ptr - uint64 number stored in handle of voucher + returns: str - summary of bag of bits element """ - elem = kern.GetValueFromAddress(handle_ptr, 'user_data_element_t') + elem = kern.GetValueFromAddress(handle_ptr, "user_data_element_t") return GetBagofBitsElementSummary(elem) -@static_var('attr_managers',{1: GetATMHandleSummary, 2: GetIPCHandleSummary, 3: GetBankHandleSummary, 7: GetBagofBitsHandleSummary}) + +@static_var( + "attr_managers", + { + 1: GetATMHandleSummary, + 2: GetIPCHandleSummary, + 3: GetBankHandleSummary, + 7: GetBagofBitsHandleSummary, + }, +) def GetHandleSummaryForKey(handle_ptr, key_num): - """ Get a summary of handle pointer from the voucher attribute manager. - For example key 2 -> ipc and it puts either ipc_importance_inherit_t or ipc_important_task_t. - key 3 -> Bank and it puts either bank_task_t or bank_account_t. - key 7 -> Bag of Bits and it puts user_data_element_t in handle. So summary of it would be Bag of Bits content and refs etc. + """Get a summary of handle pointer from the voucher attribute manager. + For example key 2 -> ipc and it puts either ipc_importance_inherit_t or ipc_important_task_t. + key 3 -> Bank and it puts either bank_task_t or bank_account_t. + key 7 -> Bag of Bits and it puts user_data_element_t in handle. So summary of it would be Bag of Bits content and refs etc. """ key_num = int(key_num) if key_num not in GetHandleSummaryForKey.attr_managers: @@ -2030,11 +2471,14 @@ def GetHandleSummaryForKey(handle_ptr, key_num): return GetHandleSummaryForKey.attr_managers[key_num](handle_ptr) -@header("{: <18s} {: <18s} {: <10s} {: <4s} {: <18s} {: <18s}".format("ivace", "value_handle", "#refs", "rel?", "maderefs", "next_layer")) -@lldb_type_summary(['ivac_entry *', 'ivac_entry_t']) -def GetIPCVoucherAttributeEntrySummary(ivace, manager_key_num = 0): - """ Get summary for voucher attribute entry. 
- """ +@header( + "{: <18s} {: <18s} {: <10s} {: <4s} {: <18s} {: <18s}".format( + "ivace", "value_handle", "#refs", "rel?", "maderefs", "next_layer" + ) +) +@lldb_type_summary(["ivac_entry *", "ivac_entry_t"]) +def GetIPCVoucherAttributeEntrySummary(ivace, manager_key_num=0): + """Get summary for voucher attribute entry.""" out_str = "" fmt = "{e: <#18x} {e.ivace_value: <#18x} {e.ivace_refs: <10d} {release: <4s} {made_refs: <18s} {next_layer: <18s}" release_str = "" @@ -2045,28 +2489,35 @@ def GetIPCVoucherAttributeEntrySummary(ivace, manager_key_num = 0): if unsigned(ivace.ivace_releasing): release_str = "Y" if unsigned(ivace.ivace_free): - free_str = 'F' + free_str = "F" if unsigned(ivace.ivace_layered): next_layer = "{: <#18x}".format(ivace.ivace_u.ivaceu_layer) else: made_refs = "{: <18d}".format(ivace.ivace_u.ivaceu_made) - out_str += fmt.format(e=ivace, release=release_str, made_refs=made_refs, next_layer=next_layer) - if config['verbosity'] > vHUMAN and manager_key_num > 0: - out_str += " " + GetHandleSummaryForKey(unsigned(ivace.ivace_value), manager_key_num) - if config['verbosity'] > vHUMAN : - out_str += ' {: <2s} {: <4d} {: <4d}'.format(free_str, ivace.ivace_next, ivace.ivace_index) + out_str += fmt.format( + e=ivace, release=release_str, made_refs=made_refs, next_layer=next_layer + ) + if config["verbosity"] > vHUMAN and manager_key_num > 0: + out_str += " " + GetHandleSummaryForKey( + unsigned(ivace.ivace_value), manager_key_num + ) + if config["verbosity"] > vHUMAN: + out_str += " {: <2s} {: <4d} {: <4d}".format( + free_str, ivace.ivace_next, ivace.ivace_index + ) return out_str -@lldb_command('showivacfreelist','') + +@lldb_command("showivacfreelist", "") def ShowIVACFreeList(cmd_args=[], cmd_options={}): - """ Walk the free list and print every entry in the list. - usage: (lldb) showivacfreelist + """Walk the free list and print every entry in the list. 
+ usage: (lldb) showivacfreelist """ if cmd_args is None or len(cmd_args) == 0: - raise ArgumentError('Please provide ') + raise ArgumentError("Please provide ") - ivac = kern.GetValueFromAddress(cmd_args[0], 'ipc_voucher_attr_control_t') + ivac = kern.GetValueFromAddress(cmd_args[0], "ipc_voucher_attr_control_t") print(GetIPCVoucherAttrControlSummary.header) print(GetIPCVoucherAttrControlSummary(ivac)) if unsigned(ivac.ivac_freelist) == 0: @@ -2075,63 +2526,85 @@ def ShowIVACFreeList(cmd_args=[], cmd_options={}): print("index " + GetIPCVoucherAttributeEntrySummary.header) next_free = unsigned(ivac.ivac_freelist) while next_free != 0: - print("{: <5d} ".format(next_free) + GetIPCVoucherAttributeEntrySummary(addressof(ivac.ivac_table[next_free]))) + print( + "{: <5d} ".format(next_free) + + GetIPCVoucherAttributeEntrySummary(addressof(ivac.ivac_table[next_free])) + ) next_free = unsigned(ivac.ivac_table[next_free].ivace_next) - -@header('{: <18s} {: <8s} {: <18s} {: <18s}'.format("ipc_voucher", "refs", "table", "voucher_port")) -@lldb_type_summary(['ipc_voucher *', 'ipc_voucher_t']) +@header( + "{: <18s} {: <8s} {: <18s} {: <18s}".format( + "ipc_voucher", "refs", "table", "voucher_port" + ) +) +@lldb_type_summary(["ipc_voucher *", "ipc_voucher_t"]) def GetIPCVoucherSummary(voucher, show_entries=False): - """ describe a voucher from its ipc_voucher * object """ + """describe a voucher from its ipc_voucher * object""" out_str = "" fmt = "{v: <#18x} {v.iv_refs: <8d} {table_addr: <#18x} {v.iv_port: <#18x}" - out_str += fmt.format(v = voucher, table_addr = addressof(voucher.iv_table)) - entries_str = '' - if show_entries or config['verbosity'] > vHUMAN: + out_str += fmt.format(v=voucher, table_addr=addressof(voucher.iv_table)) + entries_str = "" + if show_entries or config["verbosity"] > vHUMAN: elems = sizeof(voucher.iv_table) // sizeof(voucher.iv_table[0]) - entries_header_str = "\n\t" + "{: <5s} {: <3s} {: <16s} {: <30s}".format("index", "key", "value_index", "manager") + " " + GetIPCVoucherAttributeEntrySummary.header - fmt = "{: <5d} {: <3d} {: <16d} {: <30s}" + entries_header_str = ( + "\n\t" + + "{: <5s} {: <3s} {: <16s} {: <30s}".format( + "index", "key", "value_index", "manager" + ) + + " " + + GetIPCVoucherAttributeEntrySummary.header + ) + fmt = "{: <5d} {: <3d} {: <16d} {: <30s}" for i in range(elems): voucher_entry_index = unsigned(voucher.iv_table[i]) if voucher_entry_index: - s = fmt.format(i, GetVoucherManagerKeyForIndex(i), voucher_entry_index, GetVoucherAttributeManagerNameForIndex(i)) + s = fmt.format( + i, + GetVoucherManagerKeyForIndex(i), + voucher_entry_index, + GetVoucherAttributeManagerNameForIndex(i), + ) e = GetVoucherValueHandleFromVoucherForIndex(voucher, i) if e is not None: - s += " " + GetIPCVoucherAttributeEntrySummary(addressof(e), GetVoucherManagerKeyForIndex(i) ) - if entries_header_str : + s += " " + GetIPCVoucherAttributeEntrySummary( + addressof(e), GetVoucherManagerKeyForIndex(i) + ) + if entries_header_str: entries_str = entries_header_str - entries_header_str = '' + entries_header_str = "" entries_str += "\n\t" + s if not entries_header_str: entries_str += "\n\t" out_str += entries_str return out_str + def GetVoucherManagerKeyForIndex(idx): - """ Returns key number for index based on global table. Will raise index error if value is incorrect - """ + """Returns key number for index based on global table. 
Will raise index error if value is incorrect""" ret = iv_index_to_key(idx) if ret == xnudefines.MACH_VOUCHER_ATTR_KEY_NONE: raise IndexError("invalid voucher key") return ret + def GetVoucherAttributeManagerForKey(k): - """ Return the attribute manager name for a given key - params: k - int key number of the manager - return: cvalue - the attribute manager object. - None - if not found + """Return the attribute manager name for a given key + params: k - int key number of the manager + return: cvalue - the attribute manager object. + None - if not found """ idx = iv_key_to_index(k) if idx == xnudefines.IV_UNUSED_KEYINDEX: return None return kern.globals.ivam_global_table[idx] + def GetVoucherAttributeControllerForKey(k): - """ Return the attribute controller for a given key - params: k - int key number of the controller - return: cvalue - the attribute controller object. - None - if not found + """Return the attribute controller for a given key + params: k - int key number of the controller + return: cvalue - the attribute controller object. + None - if not found """ idx = iv_key_to_index(k) if idx == xnudefines.IV_UNUSED_KEYINDEX: @@ -2140,25 +2613,29 @@ def GetVoucherAttributeControllerForKey(k): def GetVoucherAttributeManagerName(ivam): - """ find the name of the ivam object - param: ivam - cvalue object of type ipc_voucher_attr_manager_t - returns: str - name of the manager + """find the name of the ivam object + param: ivam - cvalue object of type ipc_voucher_attr_manager_t + returns: str - name of the manager """ return kern.Symbolicate(unsigned(ivam)) + def GetVoucherAttributeManagerNameForIndex(idx): - """ get voucher attribute manager name for index - return: str - name of the attribute manager object + """get voucher attribute manager name for index + return: str - name of the attribute manager object """ - return GetVoucherAttributeManagerName(GetVoucherAttributeManagerForKey(GetVoucherManagerKeyForIndex(idx))) + return GetVoucherAttributeManagerName( + GetVoucherAttributeManagerForKey(GetVoucherManagerKeyForIndex(idx)) + ) + def GetVoucherValueHandleFromVoucherForIndex(voucher, idx): - """ traverse the voucher attrs and get value_handle in the voucher attr controls table - params: - voucher - cvalue object of type ipc_voucher_t - idx - int index in the entries for which you wish to get actual handle for - returns: cvalue object of type ivac_entry_t - None if no handle found. + """traverse the voucher attrs and get value_handle in the voucher attr controls table + params: + voucher - cvalue object of type ipc_voucher_t + idx - int index in the entries for which you wish to get actual handle for + returns: cvalue object of type ivac_entry_t + None if no handle found. 
""" manager_key = GetVoucherManagerKeyForIndex(idx) voucher_num_elems = sizeof(voucher.iv_table) // sizeof(voucher.iv_table[0]) @@ -2174,61 +2651,69 @@ def GetVoucherValueHandleFromVoucherForIndex(voucher, idx): ivace_table = ivac.ivac_table if voucher_entry_value >= unsigned(ivac.ivac_table_size): - print("Failed to get ivace for value %d in table of size %d" % (voucher_entry_value, unsigned(ivac.ivac_table_size))) + print( + "Failed to get ivace for value %d in table of size %d" + % (voucher_entry_value, unsigned(ivac.ivac_table_size)) + ) return None return ivace_table[voucher_entry_value] - -@lldb_command('showallvouchers') +@lldb_command("showallvouchers") def ShowAllVouchers(cmd_args=[], cmd_options={}): - """ Display a list of all vouchers in the global voucher hash table - Usage: (lldb) showallvouchers + """Display a list of all vouchers in the global voucher hash table + Usage: (lldb) showallvouchers """ print(GetIPCVoucherSummary.header) voucher_ty = gettype("struct ipc_voucher") for v in kmemory.Zone("ipc vouchers").iter_allocated(voucher_ty): print(GetIPCVoucherSummary(value(v.AddressOf()))) -@lldb_command('showvoucher', '') + +@lldb_command("showvoucher", "") def ShowVoucher(cmd_args=[], cmd_options={}): - """ Describe a voucher from argument. - Usage: (lldb) showvoucher + """Describe a voucher from argument. + Usage: (lldb) showvoucher """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Please provide valid argument") - voucher = kern.GetValueFromAddress(cmd_args[0], 'ipc_voucher_t') + voucher = kern.GetValueFromAddress(cmd_args[0], "ipc_voucher_t") print(GetIPCVoucherSummary.header) print(GetIPCVoucherSummary(voucher, show_entries=True)) -@lldb_command('showportsendrights') + +@lldb_command("showportsendrights") def ShowPortSendRights(cmd_args=[], cmd_options={}): - """ Display a list of send rights across all tasks for a given port. - Usage: (lldb) showportsendrights + """Display a list of send rights across all tasks for a given port. + Usage: (lldb) showportsendrights """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("no port address provided") - port = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_port *') + port = kern.GetValueFromAddress(cmd_args[0], "struct ipc_port *") if not port or port == xnudefines.MACH_PORT_DEAD: return - return FindPortRights(cmd_args=[unsigned(port)], cmd_options={'-R':'S'}) + return FindPortRights(cmd_args=[unsigned(port)], cmd_options={"-R": "S"}) -@lldb_command('showtasksuspenders') +@lldb_command("showtasksuspenders") def ShowTaskSuspenders(cmd_args=[], cmd_options={}): - """ Display the tasks and send rights that are holding a target task suspended. - Usage: (lldb) showtasksuspenders + """Display the tasks and send rights that are holding a target task suspended. + Usage: (lldb) showtasksuspenders """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("no task address provided") - task = kern.GetValueFromAddress(cmd_args[0], 'task_t') + task = kern.GetValueFromAddress(cmd_args[0], "task_t") if task.suspend_count == 0: - print("task {:#x} ({:s}) is not suspended".format(unsigned(task), GetProcNameForTask(task))) + print( + "task {:#x} ({:s}) is not suspended".format( + unsigned(task), GetProcNameForTask(task) + ) + ) return # If the task has been suspended by the kernel (potentially by @@ -2238,29 +2723,38 @@ def ShowTaskSuspenders(cmd_args=[], cmd_options={}): # which task did the suspension. port = task.itk_resume if task.pidsuspended: - print("task {:#x} ({:s}) has been `pid_suspend`ed. 
(Probably runningboardd's fault. Go look at the syslog for \"Suspending task.\")".format(unsigned(task), GetProcNameForTask(task))) + print( + 'task {:#x} ({:s}) has been `pid_suspend`ed. (Probably runningboardd\'s fault. Go look at the syslog for "Suspending task.")'.format( + unsigned(task), GetProcNameForTask(task) + ) + ) return elif not port: - print("task {:#x} ({:s}) is suspended but no resume port exists".format(unsigned(task), GetProcNameForTask(task))) + print( + "task {:#x} ({:s}) is suspended but no resume port exists".format( + unsigned(task), GetProcNameForTask(task) + ) + ) return - return FindPortRights(cmd_args=[unsigned(port)], cmd_options={'-R':'S'}) + return FindPortRights(cmd_args=[unsigned(port)], cmd_options={"-R": "S"}) # Macro: showmqueue: -@lldb_command('showmqueue', fancy=True) +@lldb_command("showmqueue", fancy=True) def ShowMQueue(cmd_args=None, cmd_options={}, O=None): - """ Routine that lists details about a given mqueue. - An mqueue is directly tied to a mach port, so it just shows the details of that port. - Syntax: (lldb) showmqueue
+ """Routine that lists details about a given mqueue. + An mqueue is directly tied to a mach port, so it just shows the details of that port. + Syntax: (lldb) showmqueue
""" if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Missing mqueue argument") - space = 0 - mqueue = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_mqueue *') - portoff = getfieldoffset('struct ipc_port', 'ip_messages') + kern.GetValueFromAddress(cmd_args[0], "struct ipc_mqueue *") + portoff = getfieldoffset("struct ipc_port", "ip_messages") port = unsigned(ArgumentStringToInt(cmd_args[0])) - unsigned(portoff) - obj = kern.GetValueFromAddress(port, 'struct ipc_object *') + obj = kern.GetValueFromAddress(port, "struct ipc_object *") ShowPortOrPset(obj, O=O) + + # EndMacro: showmqueue diff --git a/tools/lldbmacros/kcdata.py b/tools/lldbmacros/kcdata.py index 44974e582..396f85578 100755 --- a/tools/lldbmacros/kcdata.py +++ b/tools/lldbmacros/kcdata.py @@ -9,8 +9,8 @@ import base64 import argparse import logging import contextlib -import base64 import zlib +from operator import itemgetter long = int @@ -137,7 +137,9 @@ kcdata_type_def = { 'STACKSHOT_KCTYPE_EXCLAVE_TEXTLAYOUT_SEGMENTS' : 0x954, 'STACKSHOT_KCTYPE_KERN_EXCLAVES_CRASH_THREADINFO' : 0x955, 'STACKSHOT_KCTYPE_LATENCY_INFO_CPU': 0x956, - + 'STACKSHOT_KCTYPE_TASK_EXEC_META': 0x957, + 'STACKSHOT_KCTYPE_TASK_MEMORYSTATUS': 0x958, + 'STACKSHOT_KCTYPE_LATENCY_INFO_BUFFER': 0x95a, 'KCDATA_TYPE_BUFFER_END': 0xF19158ED, 'TASK_CRASHINFO_EXTMODINFO': 0x801, @@ -765,13 +767,6 @@ class KCData_item: """ header_size = 16 # (uint32_t + uint32_t + uint64_t) - def __init__(self, item_type, item_size, item_flags, item_data): - self.i_type = item_type - self.i_size = item_size - self.i_flags = item_flags - self.i_data = item_data - self.i_offset = None - def __init__(self, barray, pos=0): """ create an object by parsing data from bytes array returns : obj - if data is readable @@ -788,7 +783,7 @@ class KCData_item: def GetHeaderDescription(self): outs = "type: 0x%x size: 0x%x flags: 0x%x (%s)" % (self.i_type, self.i_size, self.i_flags, GetTypeNameForKey(self.i_type)) - if not self.i_offset is None: + if self.i_offset is not None: outs = "pos: 0x%x" % self.i_offset + outs return outs @@ -900,7 +895,9 @@ KNOWN_TYPES_COLLECTION[0x905] = KCTypeDescription(0x905, ( KCSubTypeElement.FromBasicCtype('ts_did_throttle', KCSUBTYPE_TYPE.KC_ST_UINT32, 76), KCSubTypeElement.FromBasicCtype('ts_latency_qos', KCSUBTYPE_TYPE.KC_ST_UINT32, 80), KCSubTypeElement.FromBasicCtype('ts_pid', KCSUBTYPE_TYPE.KC_ST_INT32, 84), - KCSubTypeElement('ts_p_comm', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(32, 1), 88, 1) + KCSubTypeElement('ts_p_comm', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(32, 1), 88, 1), + KCSubTypeElement.FromBasicCtype('ts_uid', KCSUBTYPE_TYPE.KC_ST_UINT32, 120), + KCSubTypeElement.FromBasicCtype('ts_gid', KCSUBTYPE_TYPE.KC_ST_UINT32, 124) ), 'task_snapshot' ) @@ -981,6 +978,11 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT')] = 'task_delta_snapshot' ) +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_TASK_EXEC_META')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_TASK_EXEC_META'), ( + KCSubTypeElement.FromBasicCtype('tem_flags', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), +), + 'task_exec_meta' +) KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_THREAD_NAME')] = KCSubTypeElement('pth_name', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(64, 1), 0, 1) @@ -1250,6 +1252,15 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_LATENCY_INFO_CPU')] = KC ), 'stackshot_latency_cpu') 
+KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_LATENCY_INFO_BUFFER')] = KCTypeDescription(GetTypeForName('STACKSHOT_LATENCY_INFO_BUFFER'), + ( + KCSubTypeElement.FromBasicCtype('cluster_type', KCSUBTYPE_TYPE.KC_ST_INT32, 0), + KCSubTypeElement.FromBasicCtype('size', KCSUBTYPE_TYPE.KC_ST_UINT64, 4), + KCSubTypeElement.FromBasicCtype('used', KCSUBTYPE_TYPE.KC_ST_UINT64, 12), + KCSubTypeElement.FromBasicCtype('overhead', KCSUBTYPE_TYPE.KC_ST_UINT64, 20), + ), + 'stackshot_latency_buffer') + KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_LATENCY_INFO_TASK')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_LATENCY_INFO_TASK'), ( KCSubTypeElement.FromBasicCtype('task_uniqueid', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), @@ -1544,6 +1555,15 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_EXCLAVE_TEXTLAYOUT_SEGME KCSubTypeElement.FromBasicCtype('layoutSegment_rawLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 24), ), 'exclave_textlayout_segments') +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_TASK_MEMORYSTATUS')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_TASK_MEMORYSTATUS'), + ( + KCSubTypeElement.FromBasicCtype('tms_current_memlimit', KCSUBTYPE_TYPE.KC_ST_INT32, 0), + KCSubTypeElement.FromBasicCtype('tms_effectivepriority', KCSUBTYPE_TYPE.KC_ST_INT32, 4), + KCSubTypeElement.FromBasicCtype('tms_requestedpriority', KCSUBTYPE_TYPE.KC_ST_INT32, 8), + KCSubTypeElement.FromBasicCtype('tms_assertionpriority', KCSUBTYPE_TYPE.KC_ST_INT32, 12), + ), 'task_memorystatus') + + def GetSecondsFromMATime(mat, tb): return (float(long(mat) * tb['numer']) / tb['denom']) / 1e9 @@ -1600,6 +1620,7 @@ def GetStateDescription(s): TH_TERMINATE2 = 0x20 TH_WAIT_REPORT = 0x40 TH_IDLE = 0x80 + TH_WAKING = 0x100 if (s & TH_WAIT): retval.append("TH_WAIT") if (s & TH_SUSP): @@ -1616,6 +1637,8 @@ def GetStateDescription(s): retval.append("TH_WAIT_REPORT") if (s & TH_IDLE): retval.append("TH_IDLE") + if (s & TH_WAKING): + retval.append("TH_WAKING") return retval @@ -1705,7 +1728,7 @@ def formatPortLabelID(portlabel_id, portlabels): if portlabels is not None: portlabel = portlabels.get(str(portlabel_id), {}) portlabel_name = portlabel_domain(portlabel.get('portlabel_domain')) + " " - portlabel_name += portlabel.get("portlabel_name", "!!!unknown, ID {} !!!".format(portlabel_id)); + portlabel_name += portlabel.get("portlabel_name", "!!!unknown, ID {} !!!".format(portlabel_id)) return " {" + portlabel_name + portThrottledSuffix(portlabel.get('portlabel_flags', 0)) + "}" if portlabel_id < 0: return " {labeled, info truncated" + portThrottledSuffix(portlabel.get('portlabel_flags', 0)) + "}" @@ -1915,27 +1938,25 @@ def FindTextLayout(text_layouts, text_layout_id): return layout return None -def BinaryImagesFromExclavesLayout(layout): - flags = layout['exclave_textlayout_info']['etl_flags'] - sharedCacheIndex = layout['exclave_textlayout_info'].get('sharedcache_index', 0xffffffff) +def BinaryImagesFromExclavesLayout(textlayout): + flags = textlayout['exclave_textlayout_info']['etl_flags'] + sharedCacheIndex = textlayout['exclave_textlayout_info'].get('sharedcache_index', 0xffffffff) layouts = [ [format_uuid(layout['layoutSegment_uuid']), layout['layoutSegment_loadAddress'], 'P'] for layout in textlayout['exclave_textlayout_segments'] ] # 0x4 == kExclaveTextLayoutHasSharedCache - if ((flags & 0x4) != 0 and sharedCacheIndex < length(layouts)): + if ((flags & 0x4) != 0 and sharedCacheIndex < len(layouts)): layouts[sharedCacheIndex][2] = "S" layouts.sort(key=itemgetter(1)) return layouts def 
GetExclaveLibs(text_layouts, text_layout_id): - from operator import itemgetter textlayout = text_layouts.get(str(text_layout_id)) # This fallback is needed to preserve compatibility with kcdata generated before rdar://123838752 # FindTextLayout function should be removed in future if not textlayout or textlayout['exclave_textlayout_info']['layout_id'] != text_layout_id: - textlayout = FindTextLayout(text_layouts, text_layout_id) + textlayout = FindTextLayout(text_layouts, text_layout_id) - return BinaryImagesFromExclavesLayout(layout) - + return BinaryImagesFromExclavesLayout(textlayout) # kcdata is json at path 'kcdata_stackshot/threads_exclave/0' def GetEASFrames(AllImageCatalog, kcdata, ipc_entry, notes, scid): @@ -1952,9 +1973,9 @@ def GetEASFrames(AllImageCatalog, kcdata, ipc_entry, notes, scid): return [] text_layout_id = as_info['exclave_addressspace_info']['eas_layoutid'] addr_space_name = as_info['exclave_addressspace_name'] - + exclave_libs = GetExclaveLibs(kcdata['exclave_textlayout'], text_layout_id) - + frames = [] stack = ipc_entry.get('secure_ecstack_entry', []) for stack_item in stack: @@ -1972,7 +1993,7 @@ def GetEASFrames(AllImageCatalog, kcdata, ipc_entry, notes, scid): notes.info("PID ${PID} TID ${TID} SCID %d ASID 0x%x has address space name '%s' (%s)" % (scid, asid, addr_space_name, frame_info)) notes.addToOffset(len(frames)) return frames - + def GetExclavesFrames(AllImageCatalog, json, scid, notes): kcdata = json['kcdata_stackshot'] @@ -2009,7 +2030,7 @@ def GetExclavesFrames(AllImageCatalog, json, scid, notes): frames.extend(entry_frames) return frames - + def InsertExclavesFrames(AllImageCatalog, json, thdata, notes, kernel_frames): thread_info = thdata.get('exclaves_thread_info') @@ -2022,13 +2043,13 @@ def InsertExclavesFrames(AllImageCatalog, json, thdata, notes, kernel_frames): notes.offset = offset exclaves_frames = GetExclavesFrames(AllImageCatalog, json, scid, notes) - + # insert exclaves frames to offset for i in range(len(exclaves_frames)): kernel_frames.insert(offset + i, exclaves_frames[i]) class NotesBuilder: - + notes = [] pid = None tis = None @@ -2045,7 +2066,7 @@ class NotesBuilder: note = note.replace('${PID}', str(self.pid)) note = note.replace('${TID}', str(self.tid)) return note + '\n' - + def warn(self, note): note = self.format(note) sys.stdout.write(note) @@ -2054,7 +2075,7 @@ class NotesBuilder: def info(self, note): note = self.format(note) self.notes.append(note) - + def isEmpty(self): return len(self.notes) == 0 @@ -2066,7 +2087,6 @@ class NotesBuilder: def SaveStackshotReport(j, outfile_name, incomplete): import time - from operator import itemgetter, attrgetter ss = j.get('kcdata_stackshot') if not ss: print("No KCDATA_BUFFER_BEGIN_STACKSHOT object found. Skipping writing report.") @@ -2099,7 +2119,7 @@ def SaveStackshotReport(j, outfile_name, incomplete): # is_intel = ('X86_64' in ss.get('osversion', "") and ss.get('kernel_page_size', 0) == 4096) - slidFirstMapping = shared_cache_info.get(SC_SLID_FIRSTMAPPING_KEY, -1); + slidFirstMapping = shared_cache_info.get(SC_SLID_FIRSTMAPPING_KEY, -1) if slidFirstMapping >= shared_cache_base_addr: shared_cache_base_addr = slidFirstMapping sc_note = "base-accurate" @@ -2212,9 +2232,9 @@ def SaveStackshotReport(j, outfile_name, incomplete): ttsnap = { key[1:] : value for key,value in ttsnap.items() } # Add a note to let people know obj["notes"] = obj["notes"] + "PID {} is a transitioning (exiting) task. 
".format(pid) - tasksnap = piddata.get('task_snapshot', ttsnap); + tasksnap = piddata.get('task_snapshot', ttsnap) if tasksnap is None: - continue; + continue tsnap["pid"] = tasksnap["ts_pid"] if 'ts_asid' in piddata: tsnap["asid"] = piddata["ts_asid"] @@ -2257,7 +2277,7 @@ def SaveStackshotReport(j, outfile_name, incomplete): thsnap["qosRequested"] = threadsnap["ths_rqos"] if "pth_name" in thdata: - thsnap["name"] = thdata["pth_name"]; + thsnap["name"] = thdata["pth_name"] if threadsnap['ths_continuation']: thsnap["continuation"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['ths_continuation']) @@ -2339,7 +2359,7 @@ def SaveStackshotReport(j, outfile_name, incomplete): def data_from_stream(stream): try: fmap = mmap.mmap(stream.fileno(), 0, mmap.MAP_SHARED, mmap.PROT_READ) - except: + except Exception: yield stream.buffer.read() else: try: @@ -2366,7 +2386,7 @@ def iterate_kcdatas(kcdata_file): if not isinstance(kcdata_buffer, KCBufferObject): try: decoded = base64.b64decode(data) - except: + except Exception: pass else: iterator = kcdata_item_iterator(decoded) @@ -2376,7 +2396,7 @@ def iterate_kcdatas(kcdata_file): from io import BytesIO try: decompressed = gzip.GzipFile(fileobj=BytesIO(data[:])).read() - except: + except Exception: pass else: iterator = kcdata_item_iterator(decompressed) @@ -2503,6 +2523,7 @@ PRETTIFY_FLAGS = { 'TH_TERMINATE2', 'TH_WAIT_REPORT', 'TH_IDLE', + 'TH_WAKING', ], 'ts_ss_flags': [ 'kUser64_p', @@ -2544,6 +2565,10 @@ PRETTIFY_FLAGS = { 'kTaskDyldCompactInfoTriedFault', 'kTaskWqExceededCooperativeThreadLimit', 'kTaskWqExceededActiveConstrainedThreadLimit', + 'kTaskRunawayMitigated', + 'kTaskIsActive', + 'kTaskIsManaged', + 'kTaskHasAssertion', ], 'turnstile_flags': [ 'turnstile_status_unknown', @@ -2666,7 +2691,7 @@ def decode_kcdata_file(kcdata_file, stackshot_file, multiple=False, prettyhex=Fa try: json_obj = json.loads(str_data) - except: + except Exception: print("JSON reparsing failed! Printing string data!\n", file=sys.stderr) import textwrap print(textwrap.fill(str_data, 100)) diff --git a/tools/lldbmacros/kmemory/btlog.py b/tools/lldbmacros/kmemory/btlog.py index 5b70f3110..fa6f2d20f 100755 --- a/tools/lldbmacros/kmemory/btlog.py +++ b/tools/lldbmacros/kmemory/btlog.py @@ -7,21 +7,11 @@ import struct import sys from collections import namedtuple -from core import caching, xnu_format +from core import caching, xnu_format, OSHashPointer # FIXME: should not import this from xnu from xnu import GetSourceInformationForAddress -def _swap32(i): - return struct.unpack("I", i))[0] - -def _hash_ptr(ptr): - h = ptr >> 4 - h *= 0x5052acdb - h &= 0xffffffff - return (h ^ _swap32(h)) & 0xffffffff - - class BTStack(object): """ Helper class to represent a backtrace in a library @@ -322,7 +312,7 @@ class BTLog(object): for i in range(h_mask + 1) ) else: - i = _hash_ptr(wantElement) & h_mask + i = OSHashPointer(wantElement) & h_mask heads = (target.xCreateValueFromAddress( None, h_base + i * h_tysz, h_ty).xGetIntegerByName('bthh_first'), ) diff --git a/tools/lldbmacros/kmemory/kmem.py b/tools/lldbmacros/kmemory/kmem.py index 6edbe30f2..e55eddea0 100755 --- a/tools/lldbmacros/kmemory/kmem.py +++ b/tools/lldbmacros/kmemory/kmem.py @@ -73,7 +73,7 @@ class KMem(object, metaclass=ABCMeta): heap data structures, globals, ... 
""" - _HEAP_NAMES = [ "", "shared.", "data.", "" ] + _HEAP_NAMES = [ "", "early.", "data.", "data_shared.", "" ] @staticmethod def _parse_range(zone_info_v, name): @@ -118,12 +118,6 @@ class KMem(object, metaclass=ABCMeta): self.meta_range = self._parse_range(zone_info, 'zi_meta_range') self.bits_range = self._parse_range(zone_info, 'zi_bits_range') self.zone_range = self._parse_range(zone_info, 'zi_map_range') - try: - self.pgz_range = self._parse_range(zone_info, 'zi_pgz_range') - self.pgz_bt = target.chkFindFirstGlobalVariable('pgz_backtraces').xDereference() - except: - self.pgz_range = MemoryRange(0, 0) - self.pgz_bt = None kmem_ranges = target.chkFindFirstGlobalVariable('kmem_ranges') count = kmem_ranges.GetByteSize() // target.GetAddressByteSize() @@ -184,6 +178,7 @@ class KMem(object, metaclass=ABCMeta): self.rwlde_caller_packing = None self.c_slot_packing = VMPointerUnpacker(target, 'c_slot_packing_params') + self.vm_map_entry_packing = VMPointerUnpacker(target, 'vm_map_entry_packing_params') @staticmethod @caching.cache_statically diff --git a/tools/lldbmacros/kmemory/zone.py b/tools/lldbmacros/kmemory/zone.py index 2b2b2b768..b7a5c093c 100755 --- a/tools/lldbmacros/kmemory/zone.py +++ b/tools/lldbmacros/kmemory/zone.py @@ -119,34 +119,6 @@ class ZonePageMetadata(MemoryObject): sbv = self.sbv return Zone(sbv.xGetIntegerByName('zm_index')) - @property - def pgz_slot(self): - addr = self.page_addr - kmem = self.kmem - if kmem.pgz_range.contains(addr): - return (addr - kmem.pgz_range.start) >> (kmem.page_shift + 1) - return None - - def _pgz_alloc_frames(self, index): - kmem = self.kmem - target = kmem.target - bt = kmem.pgz_bt.xGetSiblingValueAtIndex(index) - return ( - kmem.stext + pc - for pc in target.xIterAsInt32( - bt.xGetLoadAddressByName('pgz_bt'), - bt.xGetIntegerByName('pgz_depth') - ) - ) - - @property - def pgz_alloc_bt_frames(self): - return self._pgz_alloc_frames(2 * self.pgz_slot) - - @property - def pgz_free_bt_frames(self): - return self._pgz_alloc_frames(2 * self.pgz_slot + 1) - def describe(self, verbose=False): kmem = self.kmem sbv = self.sbv @@ -312,21 +284,11 @@ class ZoneHeapMemoryObject(MemoryObject): zone = meta.zone esize = zone.elem_outer_size - if kmem.pgz_range.contains(address): - real_addr = meta.sbv.xGetIntegerByName('zm_pgz_orig_addr') - page_mask = kmem.page_mask - elem_addr = (real_addr & page_mask) + (address & ~page_mask) - elem_idx = ((elem_addr & page_mask) - zone.elem_inner_offs) // esize - self.real_addr = real_addr - self.real_meta = ZonePageMetadata._create_with_zone_address(kmem, real_addr) - self.pgz = True - else: - base = meta.page_addr + zone.elem_inner_offs - elem_idx = (address - base) // esize if address >= base else -1 - elem_addr = base + elem_idx * esize if address >= base else None - self.real_addr = elem_addr - self.real_meta = meta - self.pgz = False + base = meta.page_addr + zone.elem_inner_offs + elem_idx = (address - base) // esize if address >= base else -1 + elem_addr = base + elem_idx * esize if address >= base else None + self.real_addr = elem_addr + self.real_meta = meta self.kmem = kmem self.meta = meta @@ -438,21 +400,7 @@ class ZoneHeapMemoryObject(MemoryObject): print(" element index : {}".format(self.elem_idx)) print(" chunk offset : {}".format(self.address - meta.page_addr)) print(" status : {}".format(status)) - if self.pgz: - print(" pgz orig address : {:#x}".format(self.real_addr)) - print() - - print("PGZ Allocation backtrace:") - for pc in meta.pgz_alloc_bt_frames: - print(" " + 
GetSourceInformationForAddress(pc)) - - if status == 'free': - print() - - print("PGZ Free backtrace:") - for pc in meta.pgz_free_bt_frames: - print(" " + GetSourceInformationForAddress(pc)) - elif btlog and (btlog.is_log() or status == 'allocated'): + if btlog and (btlog.is_log() or status == 'allocated'): record = next(btlog.iter_records( wantElement=self.elem_addr, reverse=True), None) if record: diff --git a/tools/lldbmacros/ktrace.py b/tools/lldbmacros/ktrace.py index 0cbc99fed..3edcdf910 100755 --- a/tools/lldbmacros/ktrace.py +++ b/tools/lldbmacros/ktrace.py @@ -424,7 +424,7 @@ class KDCPU(object): Represents all events from a single CPU. """ - def __init__(self, cpuid, verbose, starting_timestamp=None, humanize=False): + def __init__(self, cpuid, verbose, humanize=None, starting_timestamp=None, tid=None, triage=False): self.cpuid = cpuid self.iter_store = None self.verbose = verbose @@ -432,9 +432,11 @@ class KDCPU(object): self.disabled = False self.last_timestamp = 0 self.err = lldb.SBError() # avoid making it all the time, it's slow - self.kd_bufs = kern.globals.kd_buffer_trace.kd_bufs.GetSBValue() + self.buffer = kern.globals.kd_buffer_triage if triage else kern.globals.kd_buffer_trace + self.kd_bufs = self.buffer.kd_bufs.GetSBValue() + self.tid = tid - kdstorep = kern.globals.kd_buffer_trace.kdb_info[cpuid].kd_list_head + kdstorep = self.buffer.kdb_info[cpuid].kd_list_head self.load_kdstore(kdstorep.GetSBValue()) skipped_storage_count = 0 @@ -486,22 +488,24 @@ class KDCPU(object): self.iter_stamp_max = store.xGetScalarByName('kds_timestamp') # Event iterator implementation returns KDEvent64 instance - def __iter__(self): return self def __next__(self): offs = self.iter_offs - while offs >= self.iter_offs_max: - # This CPU is out of events. - if self.iter_store is None or self.disabled: - raise StopIteration + while True: + while offs >= self.iter_offs_max: + # This CPU is out of events. + if self.iter_store is None or self.disabled: + raise StopIteration - self.load_kdstore(self.iter_store.GetChildMemberWithName('kds_next')) - offs = self.iter_offs + self.load_kdstore(self.iter_store.GetChildMemberWithName('kds_next')) + offs = self.iter_offs - self.iter_offs = offs + KDE_SIZE - kdevent = KDEvent64(self.iter_buf, offs) + self.iter_offs = offs + KDE_SIZE + kdevent = KDEvent64(self.iter_buf, offs) + if self.tid is None or unsigned(kdevent.arg5) == self.tid: + break if self.verbose and self.last_timestamp == 0: print('first event from CPU {} is at time {}'.format( @@ -544,24 +548,24 @@ def GetKdebugCPUName(cpu_id): return '{}(unknown)'.format(cpu_id) -def IterateKdebugEvents(verbose=False, humanize=False, last=None, - include_coprocessors=True): +def IterateKdebugEvents(verbose=False, humanize=False, last=None, tid=None, + include_coprocessors=True, triage=False): """ Yield events from the in-memory kdebug trace buffers. """ ctrl = kern.globals.kd_control_trace - if (ctrl.kdc_flags & xnudefines.KDBG_BUFINIT) == 0: + if not triage and (ctrl.kdc_flags & xnudefines.KDBG_BUFINIT) == 0: return barrier_min = ctrl.kdc_oldest_time - if (ctrl.kdc_live_flags & xnudefines.KDBG_WRAPPED) != 0: + if not triage and (ctrl.kdc_live_flags & xnudefines.KDBG_WRAPPED) != 0: # TODO Yield a wrap event with the barrier_min timestamp. 
pass cpu_count = kern.globals.machine_info.logical_cpu_max - if include_coprocessors: + if include_coprocessors and not triage: cpu_count = ctrl.kdebug_cpus start_timestamp = None @@ -575,8 +579,7 @@ def IterateKdebugEvents(verbose=False, humanize=False, last=None, start_timestamp, now, duration)) # Merge sort all events from all CPUs. - cpus = [KDCPU(cpuid, verbose, - starting_timestamp=start_timestamp, humanize=humanize) + cpus = [KDCPU(cpuid, verbose, starting_timestamp=start_timestamp, humanize=humanize, tid=tid, triage=triage) for cpuid in range(cpu_count)] return heapq.merge(*cpus, key=operator.attrgetter('timestamp')) @@ -587,7 +590,6 @@ def GetKdebugEvent(event, symbolicate=False, O=None): """ Return a string representing a kdebug trace event. """ - def fmt_arg(a): return '0x{:016x}'.format(a) def sym_arg(a): @@ -604,6 +606,37 @@ def GetKdebugEvent(event, symbolicate=False, O=None): ts, debugid, fn(arg1), fn(arg2), fn(arg3), fn(arg4), cpuid, arg5) +def KtriageDecode(event): + subsystem = event.arg4 >> 24 + strings = kern.globals.ktriage_subsystems_strings[subsystem] + code = (event.arg4 >> 2) & 0x3fff + if code > strings.num_strings: + return '' + try: + return str(strings.strings[code]).strip() + except IndexError: + return '' + + +@header('{:>12s} {:>18s} {:>18s} {:>5s} {:>8s} {:<8s}'.format( + 'timestamp', 'debugid', 'arg1', 'cpuid', 'tid', 'message')) +def GetKtriageEvent(event, O=None): + """ + Return a string representing a kdebug trace event. + """ + timestamp = event.timestamp + cpuid = event.cpuid + def fmt_arg(a): + return '0x{:016x}'.format(unsigned(a)) + args = list(map( + fmt_arg, + [event.arg1, event.arg2, event.arg3, event.arg4])) + return O.format( + '{:12d} {:18s} {:18s} {:5d} {:8d} {:s}', + unsigned(timestamp), args[3], args[0], unsigned(cpuid), + unsigned(event.arg5), KtriageDecode(event)) + + @lldb_command('showkdebugtrace', 'L:S', fancy=True) def ShowKdebugTrace(cmd_args=None, cmd_options={}, O=None): """ @@ -632,6 +665,32 @@ def ShowKdebugTrace(cmd_args=None, cmd_options={}, O=None): event, symbolicate='-S' in cmd_options, O=O)) +@lldb_command('showktriage', 'L:', fancy=True) +def ShowKtriage(cmd_args=None, cmd_options={}, O=None): + """ + List the events present in the ktriage buffers. 
+ + (lldb) showktriage [-L ] [] + + -L : only show events from the last seconds + """ + tid = None if len(cmd_args) == 0 else cmd_args[0] + if tid: + tid = unsigned(tid) + last = cmd_options.get('-L', None) + if last: + try: + last = float(last) + except ValueError: + raise ArgumentError( + 'error: -L argument must be a number, not {}'.format(last)) + with O.table(GetKtriageEvent.header): + for event in IterateKdebugEvents( + config['verbosity'] > vHUMAN, humanize=True, last=last, tid=tid, + triage=True): + print(GetKtriageEvent(event, O=O)) + + def binary_plist(o): return plistlib.dumps(o, fmt=plistlib.FMT_BINARY) diff --git a/tools/lldbmacros/mbufs.py b/tools/lldbmacros/mbufs.py index 705ec18d6..2467ebf52 100755 --- a/tools/lldbmacros/mbufs.py +++ b/tools/lldbmacros/mbufs.py @@ -29,11 +29,10 @@ def MBufStat(cmd_args=None): entry_format = "{0: <16s} {1: >8d} {2: >8d} {3:>7d} / {4:<6d} {5: >8d} {6: >12d} {7: >8d} {8: >8d} {9: >8d} {10: >8d}" num_items = sizeof(kern.globals.mbuf_table) // sizeof(kern.globals.mbuf_table[0]) ncpus = int(kern.globals.ncpu) - mb_uses_mcache = int(kern.globals.mb_uses_mcache) for i in range(num_items): mbuf = kern.globals.mbuf_table[i] mcs = Cast(mbuf.mtbl_stats, 'mb_class_stat_t *') - if mb_uses_mcache == 0: + if kern.arch != 'x86_64': cname = str(mcs.mbcl_cname) if cname == "mbuf": zone = MbufZoneByName("mbuf") @@ -80,7 +79,7 @@ def MBufStat(cmd_args=None): # EndMacro: mbuf_stat def DumpMbufData(mp, count): - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mdata = mp.m_hdr.mh_data mlen = mp.m_hdr.mh_len flags = mp.m_hdr.mh_flags @@ -105,7 +104,7 @@ def DecodeMbufData(in_mp, decode_as="ether"): full_buf = b'' mp = scan while (mp): - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mdata = mp.m_hdr.mh_data mlen = unsigned(mp.m_hdr.mh_len) flags = mp.m_hdr.mh_flags @@ -145,7 +144,7 @@ def DecodeMbufData(in_mp, decode_as="ether"): except: break mp = mnext - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': scan = scan.m_hdr.mh_nextpkt else: scan = scan.M_hdr_common.M_hdr.mh_nextpkt @@ -178,7 +177,7 @@ def MbufDumpData(cmd_args=None, cmd_options={}): raise ArgumentError() mp = kern.GetValueFromAddress(cmd_args[0], 'mbuf *') - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mdata = mp.m_hdr.mh_data mhlen = mp.m_hdr.mh_len else: @@ -197,7 +196,7 @@ def MbufDumpData(cmd_args=None, cmd_options={}): def ShowMbuf(prefix, mp, count, total, dump_data_len): out_string = "" mca = "" - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mhlen = mp.m_hdr.mh_len mhtype = mp.m_hdr.mh_type mhflags = mp.m_hdr.mh_flags @@ -232,7 +231,7 @@ def ShowMbuf(prefix, mp, count, total, dump_data_len): def WalkMufNext(prefix, mp, count, total, dump_data_len): remaining_len = dump_data_len while (mp): - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mhlen = mp.m_hdr.mh_len mhnext = mp.m_hdr.mh_next else: @@ -272,7 +271,7 @@ def MbufWalkPacket(cmd_args=None, cmd_options={}): WalkMufNext(prefix, mp, count, total, dump_data_len) count_mbuf += count[0] total_len += total[0] - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mp = mp.m_hdr.mh_nextpkt else: mp = mp.M_hdr_common.M_hdr.mh_nextpkt @@ -308,7 +307,7 @@ def MbufBuf2Slab(cmd_args=None): if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Missing argument 0 in user function.") - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis") return @@ -327,7 +326,7 @@ 
def MbufBuf2Slab(cmd_args=None): def MbufBuf2Mca(cmd_args=None): """ Find the mcache audit structure of the corresponding mbuf """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis") return @@ -342,7 +341,7 @@ def MbufSlabs(cmd_args=None): """ Print all slabs in the group """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis or zprint") return @@ -422,7 +421,7 @@ def MbufSlabsTbl(cmd_args=None): out_string = "" x = 0 - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis or zprint") return @@ -454,7 +453,7 @@ def MbufSlabsTbl(cmd_args=None): def MbufDecode(mbuf, decode_pkt): # Ignore free'd mbufs. - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mhlen = mbuf.m_hdr.mh_len mhtype = mbuf.m_hdr.mh_type mhflags = mbuf.m_hdr.mh_flags @@ -472,7 +471,7 @@ def MbufDecode(mbuf, decode_pkt): out_string = "mbuf found @ 0x{0:x}, length {1:d}, {2:s}, {3:s}".format(mbuf, length, GetMbufFlags(mbuf), GetMbufPktCrumbs(mbuf)) print(out_string) if flags & M_PKTHDR: - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': rcvif = mbuf.M_dat.MH.MH_pkthdr.rcvif else: rcvif = mbuf.M_hdr_common.M_pkthdr.rcvif @@ -496,7 +495,7 @@ def MbufWalkSlabs(cmd_args=None): if len(cmd_args) > 0 and cmd_args[0] == 'decode': decode_pkt = True - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': for mbuf in kmemory.Zone("mbuf").iter_allocated(gettype("mbuf")): MbufDecode(value(mbuf.AddressOf()), decode_pkt) return @@ -695,7 +694,7 @@ def GetMbufFlagsAsString(mbuf_flags): def GetMbufFlags(m): out_string = "" if (m != 0): - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mhflags = m.m_hdr.mh_flags else: mhflags = m.M_hdr_common.M_hdr.mh_flags @@ -726,7 +725,7 @@ MBUF_TYPES[16] = "MT_TAG" def GetMbufType(m): out_string = "" if (m != 0): - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mhtype = m.m_hdr.mh_type else: mhtype = m.M_hdr_common.M_hdr.mh_type @@ -759,12 +758,12 @@ def GetMbufPktCrumbsAsString(mbuf_crumbs): def GetMbufPktCrumbs(m): out_string = "" if (m != 0): - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mhflags = m.m_hdr.mh_flags else: mhflags = m.M_hdr_common.M_hdr.mh_flags if (mhflags & M_PKTHDR) != 0: - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': pktcrumbs = m.M_dat.MH.MH_pkthdr.pkt_crumbs else: pktcrumbs = m.M_hdr_common.M_pkthdr.pkt_crumbs @@ -950,7 +949,7 @@ def MbufShowActive(cmd_args=None): Pass 2 to also display the mbuf flags and packet crumbs Pass 3 to limit display to mbuf and skip clusters """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': if cmd_args: GetMbufWalkZone(1, 0, ArgumentStringToInt(cmd_args[0])) else: @@ -968,7 +967,7 @@ def MbufShowActive(cmd_args=None): def MbufShowInactive(cmd_args=None): """ Print all freed/in-cache mbuf objects """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': GetMbufWalkZone(0, 1, 0) else: print(GetMbufWalkAllSlabs(0, 1, 0)) @@ -1004,7 +1003,7 @@ def MbufShowTypeSummary(cmd_args=None): def MbufShowMca(cmd_args=None): """ Print the contents of an mbuf mcache audit structure """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis or zstack_findelem") return out_string = "" @@ -1082,7 +1081,7 @@ def MbufShowMca(cmd_args=None): def 
MbufShowAll(cmd_args=None): """ Print all mbuf objects """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': GetMbufWalkZone(1, 1, 1) else: print(GetMbufWalkAllSlabs(1, 1, 1)) @@ -1103,19 +1102,19 @@ def MbufCountChain(cmd_args=None): while (mp): pkt = pkt + 1 - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mn = mp.m_hdr.mh_next else: mn = mp.M_hdr_common.M_hdr.mh_next while (mn): nxt = nxt + 1 - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mn = mn.m_hdr.mh_next else: mn = mn.M_hdr_common.M_hdr.mh_next print("mp 0x{:x} mn 0x{:x}".format(mp, mn)) - if kern.globals.mb_uses_mcache == 1: + if kern.arch == 'x86_64': mp = mp.m_hdr.mh_nextpkt else: mp = mp.M_hdr_common.M_hdr.mh_nextpkt @@ -1131,7 +1130,7 @@ def MbufCountChain(cmd_args=None): def MbufTopLeak(cmd_args=None): """ Print the top suspected mbuf leakers """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use zleak") return topcnt = 0 @@ -1163,7 +1162,7 @@ def GetMbufTraceLeak(trace): def MbufLargeFailures(cmd_args=None): """ Print the largest allocation failures """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, this macro is not available. use zleak to detect leaks") return topcnt = 0 @@ -1196,7 +1195,7 @@ def MbufTraceLeak(cmd_args=None): if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Missing argument 0 in user function.") - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis") return @@ -1213,7 +1212,7 @@ def McacheWalkObject(cmd_args=None): if cmd_args is None or len(cmd_args) == 0: raise ArgumentError("Missing argument 0 in user function.") - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis") return @@ -1234,7 +1233,7 @@ def McacheWalkObject(cmd_args=None): def McacheStat(cmd_args=None): """ Print all mcaches in the system. """ - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis") return @@ -1295,7 +1294,7 @@ def McacheStat(cmd_args=None): def McacheShowCache(cmd_args=None): """Display the number of objects in cache. 
""" - if int(kern.globals.mb_uses_mcache) == 0: + if kern.arch != 'x86_64': print("mcache is disabled, use kasan whatis") return out_string = "" diff --git a/tools/lldbmacros/memory.py b/tools/lldbmacros/memory.py index 73aece4fb..ab806f9d8 100755 --- a/tools/lldbmacros/memory.py +++ b/tools/lldbmacros/memory.py @@ -6,6 +6,7 @@ from xnu import * import sys import shlex import math +import re from utils import * import xnudefines from process import * @@ -131,6 +132,7 @@ def Memstats(cmd_args=None, cmd_options={}): memstats["vm_page_filecache_min"] = int(kern.globals.vm_pageout_state.vm_page_filecache_min) memstats["vm_page_pageable_int_count"] = int(kern.globals.vm_page_pageable_internal_count) memstats["vm_page_throttled_count"] = int(kern.globals.vm_page_throttled_count) + memstats["vm_page_speculative_count"] = int(kern.globals.vm_page_speculative_count) if hasattr(kern.globals, 'compressor_object'): memstats["compressor_count"] = int(kern.globals.compressor_object.resident_page_count) memstats["compressed_count"] = int(kern.globals.c_segment_pages_compressed) @@ -140,9 +142,9 @@ def Memstats(cmd_args=None, cmd_options={}): memstats["compression_ratio"] = 0 memstats["memorystatus_level"] = int(kern.globals.memorystatus_level) memstats["memorystatus_available_pages"] = int(kern.globals.memorystatus_available_pages) - memstats["memorystatus_available_pages_critical"] = int(kern.globals.memstat_critical_threshold) - memstats["memorystatus_available_pages_idle"] = int(kern.globals.memstat_idle_threshold) - memstats["memorystatus_available_pages_soft"] = int(kern.globals.memstat_soft_threshold) + memstats["memorystatus_available_pages_critical"] = int(getattr(kern.globals, 'memstat_critical_threshold', 0)) + memstats["memorystatus_available_pages_idle"] = int(getattr(kern.globals, 'memstat_idle_threshold', 0)) + memstats["memorystatus_available_pages_soft"] = int(getattr(kern.globals, 'memstat_soft_threshold', 0)) if kern.globals.memstat_policy_config & kPolicyClearTheDecks: memstats["memorystatus_clear_the_decks_offset"] = int(kern.globals.memstat_ctd_offset) else: @@ -185,71 +187,6 @@ def TestMemstats(kernel_target, config, lldb_obj, isConnected ): # EndMacro: memstats -# Macro: showpgz - -@lldb_command('showpgz', "A", fancy=True) -def PGZSummary(cmd_args=None, cmd_options={}, O=None): - """ Routine to show all live PGZ allocations - Usage: showpgz [-A] - - -A show freed entries too - """ - bt = uses = slots = 0 - try: - slots = unsigned(kern.GetGlobalVariable('pgz_slots')) - uses = unsigned(kern.GetGlobalVariable('pgz_uses')) - pgzbt = unsigned(kern.GetGlobalVariable('pgz_backtraces')) - guards = unsigned(kern.GetGlobalVariable('zone_guard_pages')) - except: - pass - if uses == 0: - print("PGZ disabled") - return - - if pgzbt == 0: - print("PGZ not initialized yet") - - zi = kern.GetGlobalVariable('zone_info') - page_size = unsigned(kern.globals.page_size) - pgz_min = unsigned(zi.zi_pgz_range.min_address) + page_size - pgz_max = unsigned(zi.zi_pgz_range.max_address) - - target = LazyTarget.GetTarget() - whatis = kmemory.WhatisProvider.get_shared() - - for i, addr in enumerate(range(pgz_min, pgz_max, 2 * page_size)): - mo = whatis.find_provider(addr).lookup(addr) - - if not mo.real_addr: - continue - - live = mo.status == 'allocated' - - if not live and "-A" not in cmd_options: - continue - - with O.table("Element {:4d}: {:<#20x} ({: - e.g. to sort by compressed memory use: - showallvmstats -S compressed + A sorting option may be provided of + e.g. 
to sort by total compressed memory use: + showallvmstats -S compressed_total Default behavior is to sort in descending order. To use ascending order, you may provide -A. e.g. to sort by pid in ascending order: showallvmstats -S pid -A @@ -1488,7 +1432,7 @@ def ShowAllVMStats(cmd_args=None, cmd_options={}): valid_sorting_options = ['wired_count', 'resident_count', 'resident_max', 'internal', \ 'external', 'reusable', 'compressed', 'compressed_peak', \ - 'compressed_lifetime', 'new_resident_count', \ + 'compressed_lifetime', 'compressed_total', 'new_resident_count', \ 'proc_name', 'pid', 'vsize', 'footprint'] if ('-S' in cmd_options) and (cmd_options['-S'] not in valid_sorting_options): @@ -1502,17 +1446,23 @@ def ShowAllVMStats(cmd_args=None, cmd_options={}): page_size = kern.globals.page_size - hdr_format = "{:>6s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:<20s} {:1s}" - print(hdr_format.format('#ents', 'wired', 'vsize', 'rsize', 'NEW RSIZE', 'max rsize', 'internal', 'external', 'reusable', 'footprint', 'footprint', 'compressed', 'compressed', 'compressed', 'pid', 'command', '')) - print(hdr_format.format('', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(peak)', '(current)', '(peak)', '(lifetime)', '', '', '')) - total_format = "{0: >6} {s.wired_count: >10d} {1: >10} {s.resident_count: >10d} {s.new_resident_count: >10d} {s.resident_max: >10d} {s.internal: >10d} {s.external: >10d} {s.reusable: >10d} {s.footprint: >10d} {s.footprint_peak: >10d} {s.compressed: >10d} {s.compressed_peak: >10d} {s.compressed_lifetime: >10d} {1: >10} {1: <32}" + hdr_format = "{:>6s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:<20s} {:1s}" + print(hdr_format.format('#ents', 'wired', 'vsize', 'rsize', 'NEW RSIZE', 'max rsize', 'internal', 'external', 'reusable', 'footprint', 'footprint', 'compressed', 'compressed', 'compressed', 'compressed', 'pid', 'command', '')) + print(hdr_format.format('', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(peak)', '(internal)', '(peak)', '(lifetime)', '(total)', '', '', '')) + total_format = "{0: >6} {s.wired_count: >10d} {1: >10} {s.resident_count: >10d} {s.new_resident_count: >10d} {s.resident_max: >10d} {s.internal: >10d} {s.external: >10d} {s.reusable: >10d} {s.footprint: >10d} {s.footprint_peak: >10d} {s.compressed: >10d} {s.compressed_peak: >10d} {s.compressed_lifetime: >10d} {s.compressed_total: >10d} {1: >10} {1: <32}" ledger_template = kern.globals.task_ledger_template entry_indices = {} - entry_keys = ['wired_mem', 'phys_mem', 'internal', 'external', 'reusable', 'internal_compressed', 'phys_footprint'] + entry_keys = ['wired_mem', 'phys_mem', 'internal', 'external', 'reusable', 'internal_compressed', 'phys_footprint', 'alternate_accounting_compressed'] + compressed_entry_indices = {} for key in entry_keys: entry_indices[key] = GetLedgerEntryIndex(ledger_template, key) assert(entry_indices[key] != -1) + for i in range(ledger_template.lt_cnt): + entry = ledger_template.lt_entries[i] + name = str(entry.et_key) + if compressed_ledger_re.match(name): + compressed_entry_indices[name] = i vmstats_totals = VmStats() vmstats_tasks = [] @@ -1544,6 +1494,13 @@ def ShowAllVMStats(cmd_args=None, cmd_options={}): vmstats.compressed = internal_compressed_entry['balance'] // page_size vmstats.compressed_peak = 
internal_compressed_entry['lifetime_max'] // page_size vmstats.compressed_lifetime = internal_compressed_entry['credit'] // page_size + + alternate_accounting_entry = GetLedgerEntryWithTemplate(ledger_template, task_ledgerp, entry_indices['alternate_accounting_compressed']) + vmstats.compressed_total = (internal_compressed_entry['balance'] - alternate_accounting_entry['balance']) // page_size + for key in compressed_entry_indices: + entry = GetLedgerEntryWithTemplate(ledger_template, task_ledgerp, compressed_entry_indices[key]) + vmstats.compressed_total += entry['balance'] // page_size + vmstats.new_resident_count = vmstats.internal + vmstats.external vmstats.proc = proc vmstats.proc_name = GetProcName(proc) @@ -1638,11 +1595,11 @@ def ShowVmTagBtLog(cmd_args=None): """Routine to print vmtag backtracing corresponding to boot-arg "vmtaglog" usage: showvmtagbtlog """ - + kmem = kmemory.KMem.get_shared() page_size = kern.globals.page_size map = kern.globals.kernel_map first_entry = map.hdr.links.next - last_entry = map.hdr.links.prev + last_entry = kmem.vm_map_entry_packing.unpack(unsigned(map.hdr.links.prev)) entry = first_entry btrefs = [] while entry != last_entry: @@ -1683,9 +1640,10 @@ def ShowMapRanges(cmd_args=None): map_val = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t') print(GetVMMapSummary.header) print(GetVMMapSummary(map_val)) - print(GetVMRangeSummary.header) - for idx in range(2): - print(GetVMRangeSummary(map_val.user_range[idx], idx)) + if map_val.uses_user_ranges: + print(GetVMRangeSummary.header) + print(GetVMRangeSummary(map_val.default_range)) + print(GetVMRangeSummary(map_val.data_range, 'data')) return None def GetResidentPageCount(vmmap): @@ -1731,15 +1689,10 @@ def GetVMEntrySummary(vme): @lldb_type_summary(['vm_map_range']) @header("{0: <20s} {1: <20s} {2: <20s} {3: <20s}".format("range", "min_address", "max_address", "size")) -def GetVMRangeSummary(vmrange, idx=0): +def GetVMRangeSummary(vmrange, range_name='default'): """ Display vm range specific information. """ - range_id = [ - "default", - "heap" - ] out_string = "" format_string = "{0: <20s} {1: <#020x} {2: <#020x} {3: <#20x}" - range_name = range_id[idx] min_address = vmrange.min_address max_address = vmrange.max_address range_size = max_address - min_address @@ -3300,6 +3253,7 @@ def ShowVMPage(cmd_args=None, cmd_options={}, O=None): pager=-1 paging_in_progress=-1 activity_in_progress=-1 + object=0 if(page.vmp_object): m_object_val = _vm_page_unpack_ptr(page.vmp_object) object = kern.GetValueFromAddress(m_object_val, 'vm_object_t') @@ -3456,7 +3410,9 @@ def showmaphdrvme(maphdr, pmap, start_vaddr, end_vaddr, show_pager_info, show_al print("{:<18s} {:>18s}:{:<18s} {:>10s} {:<8s} {:<16s} {:<18s} {:<18s}".format("entry","start","end","#pgs","tag.kmod","prot&flags","object","offset")) last_end = unsigned(maphdr.links.start) skipped_entries = 0 - for vme in IterateQueue(vme_list_head, vme_ptr_type, "links", reverse_order): + # vme_prev is packed, but vme_next is not. Only pass a function to unpack if we iterate backwards. 
+ vme_unpacking_fn = None if not reverse_order else kmemory.KMem.get_shared().vm_map_entry_packing.unpack + for vme in IterateQueue(vme_list_head, vme_ptr_type, "links", reverse_order, unpack_ptr_fn=vme_unpacking_fn): links = vme.links vme_start = links.start vme_end = links.end @@ -3487,7 +3443,7 @@ def showmaphdrvme(maphdr, pmap, start_vaddr, end_vaddr, show_pager_info, show_al if object_val == kern.globals.bufferhdr_map: object_str = "BUFFERHDR_MAP" - elif object_val == kern.globals.mb_map: + elif hasattr(kern.globals, 'mb_map') and object_val == kern.globals.mb_map: object_str = "MB_MAP" elif object_val == kern.globals.bsd_pageable_map: object_str = "BSD_PAGEABLE_MAP" @@ -3497,8 +3453,6 @@ def showmaphdrvme(maphdr, pmap, start_vaddr, end_vaddr, show_pager_info, show_al object_str = "IPC_KERNEL_COPY_MAP" elif hasattr(kern.globals, 'io_submap') and object_val == kern.globals.io_submap: object_str = "IO_SUBMAP" - elif hasattr(kern.globals, 'pgz_submap') and object_val == kern.globals.pgz_submap: - object_str = "ZALLOC:PGZ" elif hasattr(kern.globals, 'compressor_map') and object_val == kern.globals.compressor_map: object_str = "COMPRESSOR_MAP" elif hasattr(kern.globals, 'g_kext_map') and object_val == kern.globals.g_kext_map: @@ -3673,6 +3627,10 @@ FixedTags = { 33: "VM_KERN_MEMORY_RECOUNT", 34: "VM_KERN_MEMORY_TAG", 35: "VM_KERN_MEMORY_EXCLAVES", + 36: "VM_KERN_MEMORY_EXCLAVES_SHARED", + 37: "VM_KERN_MEMORY_KALLOC_SHARED", + 38: "VM_KERN_MEMORY_FIRST_DYNAMIC", + 39: "VM_KERN_MEMORY_CPUTRACE", 255:"VM_KERN_MEMORY_ANY", } @@ -4076,11 +4034,11 @@ def match_vm_page_attributes(page, matching_attributes): matched_attributes = 0 if "vmp_q_state" in matching_attributes and (page.vmp_q_state == matching_attributes["vmp_q_state"]): matched_attributes += 1 - if "vm_object" in matching_attributes and (unsigned(unpacked_vm_object) == unsigned(matching_attributes["vm_object"])): + if "vm_object" in matching_attributes and (unsigned(unpacked_vm_object) == matching_attributes["vm_object"]): matched_attributes += 1 - if "vmp_offset" in matching_attributes and (unsigned(page.vmp_offset) == unsigned(matching_attributes["vmp_offset"])): + if "vmp_offset" in matching_attributes and (unsigned(page.vmp_offset) == matching_attributes["vmp_offset"]): matched_attributes += 1 - if "phys_page" in matching_attributes and (unsigned(_vm_page_get_phys_page(page_ptr)) == unsigned(matching_attributes["phys_page"])): + if "phys_page" in matching_attributes and (unsigned(_vm_page_get_phys_page(page_ptr)) == matching_attributes["phys_page"]): matched_attributes += 1 if "bitfield" in matching_attributes and unsigned(page.__getattr__(matching_attributes["bitfield"])) == 1: matched_attributes += 1 @@ -4127,19 +4085,22 @@ def ScanVMPages(cmd_args=None, cmd_options={}): attribute_values = {} if "-S" in cmd_options: - attribute_values["vmp_q_state"] = kern.GetValueFromAddress(cmd_options["-S"], 'int') + try: + attribute_values["vmp_q_state"] = ArgumentStringToInt(cmd_options["-S"]) + except: + attribute_values["vmp_q_state"] = GetEnumValue('vm_page_q_state_t', cmd_options["-S"]) attribute_count += 1 if "-O" in cmd_options: - attribute_values["vm_object"] = kern.GetValueFromAddress(cmd_options["-O"], 'vm_object_t') + attribute_values["vm_object"] = ArgumentStringToAddress(cmd_options["-O"]) attribute_count += 1 if "-F" in cmd_options: - attribute_values["vmp_offset"] = kern.GetValueFromAddress(cmd_options["-F"], 'unsigned long long') + attribute_values["vmp_offset"] = ArgumentStringToAddress(cmd_options["-F"]) attribute_count += 1 
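With the change above, -S accepts either a raw queue-state number or a vm_page_q_state_t enumerator name. A minimal sketch of that fallback in isolation (ArgumentStringToInt and GetEnumValue are the macro helpers already used in this hunk; the standalone helper name is hypothetical):

def parse_queue_state(arg):
    # Accept either a numeric queue state ("3") or an enumerator
    # name ("VM_PAGE_ON_FREE_Q"), mirroring the -S handling above.
    try:
        return ArgumentStringToInt(arg)
    except Exception:
        return GetEnumValue('vm_page_q_state_t', arg)

Assuming the macro keeps its scan_vm_pages name, (lldb) scan_vm_pages -S VM_PAGE_ON_FREE_Q and (lldb) scan_vm_pages -S 3 should therefore select the same pages.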
if "-P" in cmd_options: - attribute_values["phys_page"] = kern.GetValueFromAddress(cmd_options["-P"], 'unsigned int') + attribute_values["phys_page"] = ArgumentStringToAddress(cmd_options["-P"]) attribute_count += 1 if "-B" in cmd_options: @@ -4234,8 +4195,6 @@ def ScanVMPages(cmd_args=None, cmd_options={}): #EndMacro scan_vm_pages -VM_PAGE_IS_WIRED = 1 - @header("{0: <10s} of {1: <10s} {2: <20s} {3: <20s} {4: <20s} {5: <10s} {6: <5s}\t{7: <28s}\t{8: <50s}".format("index", "total", "vm_page_t", "offset", "next", "phys_page", "wire#", "first bitfield", "second bitfield")) @lldb_command('vmobjectwalkpages', 'CSBNQP:O:') def VMObjectWalkPages(cmd_args=None, cmd_options={}): @@ -4349,7 +4308,7 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): print(out_string + " vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " points to different vm_object_t: " + "{0: <#020x}".format(unsigned(_vm_page_unpack_ptr(vmp.vmp_object)))) return - if (vmp.vmp_q_state == VM_PAGE_IS_WIRED) and (vmp.vmp_wire_count == 0): + if (vmp.vmp_q_state == GetEnumValue('vm_page_q_state_t', 'VM_PAGE_IS_WIRED')) and (vmp.vmp_wire_count == 0): print(out_string + " page in wired state with wire_count of 0\n") print("vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + "\n") print("stopping...\n") @@ -4992,10 +4951,9 @@ def vm_page_lookup_in_compressor(slot_ptr): C_SEG_SLOT_ARRAY_SIZE = 64 C_SEG_SLOT_ARRAY_MASK = C_SEG_SLOT_ARRAY_SIZE - 1 cs = GetObjectAtIndexFromArray(c_seg.c_slots[c_indx // C_SEG_SLOT_ARRAY_SIZE], c_indx & C_SEG_SLOT_ARRAY_MASK) - print(cs) kmem = kmemory.KMem.get_shared() c_slot_unpacked_ptr = kmem.c_slot_packing.unpack(unsigned(cs.c_packed_ptr)) - print("c_slot {: <#018x} c_offset {:#x} c_size {:#x} c_packed_ptr {:#x} (unpacked: {: <#018x})".format(cs, cs.c_offset, cs.c_size, cs.c_packed_ptr, unsigned(c_slot_unpacked_ptr))) + print(cs, "c_offset {:#x} c_size {:#x} c_packed_ptr {:#x} (unpacked: {: <#018x})".format(cs.c_offset, cs.c_size, cs.c_packed_ptr, unsigned(c_slot_unpacked_ptr))) if unsigned(slot_ptr) != unsigned(c_slot_unpacked_ptr): print("*** ERROR: compressor slot {: <#018x} points back to {: <#018x} instead of itself".format(slot_ptr, c_slot_unpacked_ptr)) if c_no_data == 0: @@ -5007,23 +4965,6 @@ def vm_page_lookup_in_compressor(slot_ptr): else: print("") -# From vm_page.h -VM_PAGE_NOT_ON_Q = 0 -VM_PAGE_IS_WIRED = 1 -VM_PAGE_USED_BY_COMPRESSOR = 2 -VM_PAGE_ON_FREE_Q = 3 -VM_PAGE_ON_FREE_LOCAL_Q = 4 -VM_PAGE_ON_FREE_LOPAGE_Q = 5 -VM_PAGE_ON_THROTTLED_Q = 6 -VM_PAGE_ON_PAGEOUT_Q = 7 -VM_PAGE_ON_SPECULATIVE_Q = 8 -VM_PAGE_ON_ACTIVE_LOCAL_Q = 9 -VM_PAGE_ON_ACTIVE_Q = 10 -VM_PAGE_ON_INACTIVE_INTERNAL_Q = 11 -VM_PAGE_ON_INACTIVE_EXTERNAL_Q = 12 -VM_PAGE_ON_INACTIVE_CLEANED_Q = 13 -VM_PAGE_ON_SECLUDED_Q = 14 - @lldb_command('vm_scan_all_pages') def VMScanAllPages(cmd_args=None): """Scans the vm_pages[] array @@ -5058,30 +4999,32 @@ def VMScanAllPages(cmd_args=None): if m_object_addr != 0 and (m_object := kern.CreateValueFromAddress(m_object_addr, "struct vm_object")).GetSBValue().IsValid() and m_object.internal: internal = True - m_vmp_q_state = int(m.vmp_q_state) - if m.vmp_wire_count != 0 and m_vmp_q_state != VM_PAGE_ON_ACTIVE_LOCAL_Q: + m_vmp_q_state = GetEnumName('vm_page_q_state_t', int(m.vmp_q_state)) + + if m.vmp_wire_count != 0 and m_vmp_q_state != 'VM_PAGE_ON_ACTIVE_LOCAL_Q': wired_count = wired_count + 1 pageable = 0 - elif m_vmp_q_state == VM_PAGE_ON_THROTTLED_Q: + elif m_vmp_q_state == 'VM_PAGE_ON_THROTTLED_Q': throttled_count = throttled_count + 1 pageable = 0 - elif m_vmp_q_state == 
VM_PAGE_ON_ACTIVE_Q: + elif m_vmp_q_state == 'VM_PAGE_ON_ACTIVE_Q': active_count = active_count + 1 pageable = 1 - elif m_vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q: + elif m_vmp_q_state == 'VM_PAGE_ON_ACTIVE_LOCAL_Q': local_active_count = local_active_count + 1 pageable = 0 - elif m_vmp_q_state in (VM_PAGE_ON_INACTIVE_CLEANED_Q, VM_PAGE_ON_INACTIVE_INTERNAL_Q, - VM_PAGE_ON_INACTIVE_EXTERNAL_Q): + elif m_vmp_q_state in ('VM_PAGE_ON_INACTIVE_CLEANED_Q', + 'VM_PAGE_ON_INACTIVE_INTERNAL_Q', + 'VM_PAGE_ON_INACTIVE_EXTERNAL_Q'): inactive_count = inactive_count + 1 pageable = 1 - elif m_vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q: + elif m_vmp_q_state == 'VM_PAGE_ON_SPECULATIVE_Q': speculative_count = speculative_count + 1 pageable = 0 - elif m_vmp_q_state == VM_PAGE_ON_FREE_Q: + elif m_vmp_q_state == 'VM_PAGE_ON_FREE_Q': free_count = free_count + 1 pageable = 0 - elif m_vmp_q_state == VM_PAGE_ON_SECLUDED_Q: + elif m_vmp_q_state == 'VM_PAGE_ON_SECLUDED_Q': secluded_count = secluded_count + 1 if m_object_addr == 0: secluded_free_count = secluded_free_count + 1 @@ -5091,7 +5034,7 @@ def VMScanAllPages(cmd_args=None): elif m_object_addr == 0 and m.vmp_busy: local_free_count = local_free_count + 1 pageable = 0 - elif m_vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR: + elif m_vmp_q_state == 'VM_PAGE_USED_BY_COMPRESSOR': compressor_count = compressor_count + 1 pageable = 0 else: @@ -5123,13 +5066,17 @@ def VMScanAllPages(cmd_args=None): print("secluded_inuse_count = {:d}\n".format(secluded_inuse_count)) -@lldb_command('show_all_vm_named_entries') -def ShowAllVMNamedEntries(cmd_args=None): +@lldb_command('show_all_vm_named_entries', 'B') +def ShowAllVMNamedEntries(cmd_args=None, cmd_options={}): """ Routine to print a summary listing of all the VM named entries + usage: show_all_vm_named_entries [-B] + options: + -B: show a backtrace collect at the time of each named-entry's creation """ + show_bt = '-B' in cmd_options kmem = kmemory.KMem.get_shared() - ikot_named_entry = GetEnumValue('ipc_kotype_t', 'IKOT_NAMED_ENTRY') + ikot_named_entry = GetEnumValue('ipc_object_type_t', 'IKOT_NAMED_ENTRY') port_ty = gettype('struct ipc_port') ent_ty = gettype('struct vm_named_entry') @@ -5138,25 +5085,28 @@ def ShowAllVMNamedEntries(cmd_args=None): port for port in kmemory.Zone("ipc ports").iter_allocated(port_ty) - if port.xGetScalarByPath(".ip_object.io_bits") & 0x3ff == ikot_named_entry + if port.xGetScalarByPath(".ip_object.io_type") == ikot_named_entry ) for idx, port in enumerate(named_entries): ko = kmem.make_address(port.xGetScalarByName('ip_kobject')) ent = port.xCreateValueFromAddress(None, ko, ent_ty) - showmemoryentry(value(ent.AddressOf()), idx=idx + 1, port=value(port.AddressOf())) + showmemoryentry(value(ent.AddressOf()), idx=idx + 1, port=value(port.AddressOf()), show_bt=show_bt) -@lldb_command('show_vm_named_entry') -def ShowVMNamedEntry(cmd_args=None): +@lldb_command('show_vm_named_entry', 'B') +def ShowVMNamedEntry(cmd_args=None, cmd_options={}): """ Routine to print a VM named entry + usage: show_vm_named_entry [-B] + options: + -B: show a backtrace collect at the time of the named-entry's creation """ if cmd_args is None or len(cmd_args) == 0: raise ArgumentError() - + show_bt = '-B' in cmd_options named_entry = kern.GetValueFromAddress(cmd_args[0], 'vm_named_entry_t') - showmemoryentry(named_entry) + showmemoryentry(named_entry, show_bt=show_bt) -def showmemoryentry(entry, idx=0, port=None): +def showmemoryentry(entry, idx=0, port=None, show_bt=False): """ Routine to print out a summary a VM memory entry 
params: entry - core.value : a object of type 'struct vm_named_entry *' @@ -5202,6 +5152,10 @@ def showmemoryentry(entry, idx=0, port=None): else: print("***** UNKNOWN TYPE *****") print() + if show_bt: + btl = kmemory.BTLibrary.get_shared() + btref = btl.get_stack(entry.named_entry_bt) + btref.describe() @lldb_command("showmaprb") def ShowMapRB(cmd_args=None): diff --git a/tools/lldbmacros/misc.py b/tools/lldbmacros/misc.py index 8ee3892de..8c0438c24 100755 --- a/tools/lldbmacros/misc.py +++ b/tools/lldbmacros/misc.py @@ -341,6 +341,24 @@ def showTimerWakeupStats(cmd_args=None): print('Task total wakeups: {:d} {:d}'.format( tot_wakes, tot_platform_wakes)) +def timer_deadine_string(timer, recent_timestamp): + EndOfAllTime = signed(-1) + + deadline = unsigned(timer.tc_pqlink.deadline) + deadlinediff = signed(deadline) - signed(recent_timestamp) + deadlinediff_s = kern.GetNanotimeFromAbstime(deadlinediff) / 1000000000.0 + + if signed(timer.tc_pqlink.deadline) == EndOfAllTime: + valid = False + else : + valid = True + + deadline_str = "{:18d}".format(deadline) if valid else "" + deadlinediff_str = "{:16.06f}".format(deadlinediff_s) if valid else "" + + return " {:18s} {:18s}".format(deadline_str, deadlinediff_str) + + @lldb_command('showrunningtimers') def ShowRunningTimers(cmd_args=None): """ @@ -352,10 +370,10 @@ def ShowRunningTimers(cmd_args=None): recent_timestamp = GetRecentTimestamp() - hdr = '{:4s} {:^10s} {:^18s} {:^18s} {:^18s} {:^18s}' - print(hdr.format('CPU', 'State', 'Quantum', 'To Go', 'kperf', 'To Go', 'Hard To Go')) + hdr = '{:4s} {:^10s} {:^18s} {:^18s} {:^18s} {:^18s} {:^18s} {:^18s} {:^18s} {:^18s}' + print(hdr.format('CPU', 'State', 'Quantum', 'To Go', 'Preempt', 'To Go', 'kperf', 'To Go', 'Perfcontrol', 'To Go')) - cpu = '{:3d}: {:^10s} {:18d} {:16.06f} {:18d} {:16.06f}' + cpu = '{:3d}: {:^10s}' i = 0 while processor_array[i] != 0: @@ -363,15 +381,14 @@ def ShowRunningTimers(cmd_args=None): statestr = 'runnning' if processor.running_timers_active else 'idle' - quantum = unsigned(processor.running_timers[0].tc_pqlink.deadline) - quantumdiff = signed(quantum) - signed(recent_timestamp) - quantumdiff_s = kern.GetNanotimeFromAbstime(quantumdiff) / 1000000000.0 + cpustr = cpu.format(i, statestr) - kperf = unsigned(processor.running_timers[1].tc_pqlink.deadline) - kperfdiff = signed(kperf) - signed(recent_timestamp) - kperfdiff_s = kern.GetNanotimeFromAbstime(kperfdiff) / 1000000000.0 + cpustr += timer_deadine_string(processor.running_timers[GetEnumValue('running_timer::RUNNING_TIMER_QUANTUM')], recent_timestamp) + cpustr += timer_deadine_string(processor.running_timers[GetEnumValue('running_timer::RUNNING_TIMER_PREEMPT')], recent_timestamp) + cpustr += timer_deadine_string(processor.running_timers[GetEnumValue('running_timer::RUNNING_TIMER_KPERF')], recent_timestamp) + cpustr += timer_deadine_string(processor.running_timers[GetEnumValue('running_timer::RUNNING_TIMER_PERFCONTROL')], recent_timestamp) - print (cpu.format(i, statestr, quantum, quantumdiff_s, kperf, kperfdiff_s)) + print (cpustr) i += 1 def DoReadMsr64(msr_address, lcpu): diff --git a/tools/lldbmacros/net.py b/tools/lldbmacros/net.py index 8e87020a9..991d0e5bc 100755 --- a/tools/lldbmacros/net.py +++ b/tools/lldbmacros/net.py @@ -106,53 +106,6 @@ def GetIfConfiguration(ifname): return ifnet return None -# Macro: net_get_always_on_pktap -@lldb_command('net_get_always_on_pktap') -def NetGetAlwaysOnPktap(cmd_args=None): - """ Dump the always-on packet capture to /tmp/dump.pktap - """ - for i in range(0, 10): - ifnet 
= GetIfConfiguration("pktap"+str(i)) - if not ifnet: - continue - if ifnet.if_bpf == 0: - ifnet = None - continue - if ifnet.if_bpf.bif_dlist.bd_headdrop == 0: - ifnet = None - continue - - break - - if not ifnet: - print("Could not find a pktap interface") - return - - bpf_d = ifnet.if_bpf.bif_dlist - - f = tempfile.NamedTemporaryFile(prefix="dump-", suffix=".pktap", dir="/tmp/", mode="wb", delete=False) - - err = lldb.SBError() - - if bpf_d.bd_hbuf != 0: - addr = bpf_d.bd_hbuf[0].GetSBValue().GetLoadAddress() - hlen = (unsigned(bpf_d.bd_hlen)+(4-1))&~(4-1) - buf = LazyTarget.GetProcess().ReadMemory(addr, hlen, err) - if err.fail: - print("Error, getting sbuf") - f.write(buf) - - addr = bpf_d.bd_sbuf[0].GetSBValue().GetLoadAddress() - slen = (unsigned(bpf_d.bd_slen)+(4-1))&~(4-1) - buf = LazyTarget.GetProcess().ReadMemory(addr, slen, err) - if err.fail: - print("Error, getting sbuf") - f.write(buf) - - print(f.name) - f.close() -# EndMacro: net_get_always_on_pktap - #Macro: ifconfig_dlil @lldb_command('ifconfig_dlil') def ShowIfconfigDlil(cmd_args=None) : @@ -281,8 +234,8 @@ def GetIfaddrs(ifp): if (ifp != 0): i = 1 for ifaddr in IterateTAILQ_HEAD(ifp.if_addrhead, "ifa_link"): - format_string = "\t{0: ".format(value)) + return out_string + +# Macro: tcp_walk_timer_list +@lldb_command('tcp_walk_timer_list', 'V') +def TCPWalkTimerList(cmd_args=None, cmd_options={}): + """ Walk the list of tcptimerentry from tcp_timer_list lhead field + Usage: tcp_walk_timer_list [-V] + -V show detail of the TCP control block + """ + verbose = False + if "-V" in cmd_options: + verbose = True + + field_offset = getfieldoffset("struct tcpcb", "tentry.te_le.le_next") + + timer_list = addressof(kern.globals.tcp_timer_list) + + timer_entry = Cast(timer_list.lhead.lh_first, 'tcptimerentry *') + cnt = 0 + + print("Walking entries of tcp_timer_list at 0x{:x}".format(unsigned(timer_list))) + + timer_header_format = "{0:6s} {1:>18s} {2:>12s} {3:>14s} {4:>6s} {5:>12s} {6:>18s} {7:>18s} {8:>18s}" + out_string = timer_header_format.format("Entry#", "(tcptimerentry *)", "timer_start", "index", "mode", "runtime", "le_next", "(tcpcb *)", "(inpcb *)") + print(out_string) + + while timer_entry != 0: + cnt += 1 + next_entry = timer_entry.te_le.le_next + tp = Cast(kern.GetValueFromAddress(Cast(timer_entry, 'char *') - field_offset), 'tcpcb *') + timer_entry_format = "{0:6d} 0x{1:<16x} {2:>12d} {3:>14s} {4:>6d} {5:>12d} 0x{6:<16x} 0x{7:<16x} 0x{8:<16x}" + out_string = timer_entry_format.format( + cnt, + unsigned(timer_entry), + unsigned(timer_entry.te_timer_start), + GetTCPTimerAsString(timer_entry.te_index), + unsigned(timer_entry.te_mode), + unsigned(timer_entry.te_runtime), + unsigned(next_entry) if next_entry else 0, + unsigned(tp), + unsigned(tp.t_inpcb) + ) + print(out_string) + + if verbose: + print(GetInPcb(tp.t_inpcb, IPPROTO_TCP)) + + timer_entry = Cast(next_entry, 'tcptimerentry *') + + # Safety check to prevent infinite loops + if cnt > 10000: + print("Warning: Stopped after 10000 entries to prevent infinite loop") + break + + print("Total timer entries: {:d}".format(cnt)) +# EndMacro: tcp_walk_timer_list + +def ShowBPFDevice(i, bpf_d): + out_string = "" + if bpf_d != 0: + bd_sbuf = cast(bpf_d.bd_sbuf, 'char *') + bd_hbuf = cast(bpf_d.bd_hbuf, 'char *') + ifname = "" + bd_bif = cast(bpf_d.bd_bif, 'struct bpf_if *') + if bd_bif != 0: + bif_ifp = cast(bd_bif.bif_ifp, 'struct ifnet *') + if bif_ifp != 0: + ifname = bif_ifp.if_xname + format_string = "bpf{0:<3d} (struct bpf_d *)0x{1:16x} {2:7d} 0x{3:<16x} {4:7d} 
0x{5:<16x} {6:16s}" + out_string += format_string.format(i, bpf_d, bpf_d.bd_slen, bd_sbuf, bpf_d.bd_hlen, bd_hbuf, ifname) + return out_string + +# Macro: show_bpf_devices +@lldb_command('show_bpf_devices') +def ShowBPFDevices(cmd_args=None): + """ Walk the bpf device array + """ + format_string = "{0:6s} {1:34s} {2:>7s} {3:18s} {4:>7s} {5:18s} {6:16s}" + out_string = format_string.format("device", "address", "bd_slen", "bd_sbuf", "bd_hlen", "bd_hbuf", "bif_ifp") + print(out_string) + + bpf_dtab_size = int(kern.globals.bpf_dtab_size) + for i in range(0, bpf_dtab_size): + bpf_d = cast(kern.globals.bpf_dtab[i], 'struct bpf_d *') + if bpf_d == 0: + continue + out_string = ShowBPFDevice(i, bpf_d) + print(out_string) +# EndMacro: show_bpf_devices + +def DumpBPFToFile(bpf_d): + bd_bif = cast(bpf_d.bd_bif, 'struct bpf_if *') + if bd_bif == 0: + print("bd_bif is NULL") + return + + bif_ifp = Cast(bd_bif.bif_ifp, 'struct ifnet *') + if bif_ifp == 0: + print("bd_bif.bif_ifp is NULL") + return + + ifname = cast(bif_ifp.if_xname, 'char *') + print("ifname: ", ifname); + + dlt = bd_bif.bif_dlt + if dlt == 149: + suffix = ".pktap" + else: + suffix = ".bpf" + + format_string = "{0:s}-dlt-{1:d}-" + prefix = format_string.format(ifname, dlt) + + f = tempfile.NamedTemporaryFile(prefix=prefix, suffix=suffix, dir="/tmp/", mode="wb", delete=False) + + err = lldb.SBError() + + if bpf_d.bd_hlen != 0: + addr = bpf_d.bd_hbuf[0].GetSBValue().GetLoadAddress() + hlen = (unsigned(bpf_d.bd_hlen)+(4-1))&~(4-1) + if hlen != 0: + buf = LazyTarget.GetProcess().ReadMemory(addr, hlen, err) + if err.fail: + print("Error, getting sbuf") + f.write(buf) + + if bpf_d.bd_slen != 0: + addr = bpf_d.bd_sbuf[0].GetSBValue().GetLoadAddress() + slen = (unsigned(bpf_d.bd_slen)+(4-1))&~(4-1) + if slen != 0: + buf = LazyTarget.GetProcess().ReadMemory(addr, slen, err) + if err.fail: + print("Error, getting sbuf") + f.write(buf) + + print(f.name) + f.close() + +# Macro: net_get_always_on_pktap +@lldb_command('save_bfp_buffers') +def SaveBPFBuffer(cmd_args=None): + """ Dump the buffers of a BPF to a file in /tmp/ + """ + if cmd_args is None or len(cmd_args) == 0: + raise ArgumentError() + + bpf_d = kern.GetValueFromAddress(cmd_args[0], 'struct bpf_d *') + + DumpBPFToFile(bpf_d) + +# Macro: net_get_always_on_pktap +@lldb_command('net_get_always_on_pktap') +def NetGetAlwaysOnPktap(cmd_args=None): + """ Dump the always-on packet capture to a file in /tmp/ + """ + for i in range(0, 10): + ifnet = GetIfConfiguration("pktap"+str(i)) + if not ifnet: + continue + if ifnet.if_bpf == 0: + ifnet = None + continue + if ifnet.if_bpf.bif_dlist.bd_headdrop == 0: + ifnet = None + continue + break + + if not ifnet: + print("Could not find a pktap interface") + return + + bpf_d = ifnet.if_bpf.bif_dlist + + DumpBPFToFile(bpf_d) +# EndMacro: net_get_always_on_pktap diff --git a/tools/lldbmacros/netdefines.py b/tools/lldbmacros/netdefines.py index f3f66ce80..9d0682d13 100755 --- a/tools/lldbmacros/netdefines.py +++ b/tools/lldbmacros/netdefines.py @@ -41,7 +41,8 @@ if_flags_strings = ["UP", if_refflags_strings = ["IFRF_EMBRYONIC", "IFRF_ATTACHED", - "IFRF_DETACHING" + "IFRF_DETACHING", + "IFRF_READY" ] if_eflags_strings = ["AUTOCONFIGURING", @@ -92,20 +93,37 @@ if_xflags_strings = ["WAKE_ON_MAGIC_PACKET", "MARK_WAKE_PKT", "FAST_PACKET_DELIVERY", "NO_TRAFFIC_SHAPING", - "MANAGEMENT" - "ULTRA_CONSTRAINED", + "MANAGEMENT", + "ULTA_CONSTRAINED", "IS_VPN", - "DELAYWAKEPKTEVENT", + "DELAY_WAKE_PACKET_EVENTS", "DISABLE_INPUT", "CONGESTED_LINK", - "UNUSED_0x00100000", - 
"UNUSED_0x00200000", + "IS_COMPANIONLINK", + "RX_FLOW_STEERING", "UNUSED_0x00400000", "LINK_HEURISTICS", "LINK_HEUR_OFF_PENDING", "POINTOPOINT_MDNS", + "INBAND_WAKE_PKT_TAGGING", + "LOW_POWER_WAKE", + "REQUIRE_CELL_THREAD_GROUP" ] +tcp_timer_strings = ["TCPT_PTO", + "TCPT_DELAYFR", + "TCPT_REORDER", + "TCPT_REXMT", + "TCPT_DELACK", + "TCPT_PERSIST", + "TCPT_KEEP", + "TCPT_2MSL", + "TCPT_JACK_RXMT", + "TCPT_CELLICON" + ] + +TCPT_MAX = 9 + AF_INET = 2 AF_INET6 = 30 AF_LINK = 18 diff --git a/tools/lldbmacros/ntstat.py b/tools/lldbmacros/ntstat.py index 5a69cc411..65f2d187f 100755 --- a/tools/lldbmacros/ntstat.py +++ b/tools/lldbmacros/ntstat.py @@ -37,6 +37,26 @@ class NSTAT_PROVIDER(IntEnum): ###################################### # Helper functions ###################################### + +def FieldPtrToStructPtr(field_ptr, field_name, element_type): + """ Given a pointer to a field with a structure, return a pointer to the structure itself + params: + field_ptr - value : pointer to the field + field_name - str : string name of the field which holds the list links. + element_type - str : type of elements to be linked in the list + returns: + value : A pointer to the start of the structure + """ + out_string = "" + if (field_ptr) : + tmp_element = Cast(field_ptr, element_type) + tmp_element_plus_offset = addressof(tmp_element.__getattr__(field_name)) + offset = tmp_element_plus_offset - tmp_element + original_ptr_as_char_ptr = Cast(field_ptr, 'char *') + amended_ptr = original_ptr_as_char_ptr - offset + return kern.GetValueFromAddress(unsigned(amended_ptr), element_type) + return field_ptr + def ReverseIterateTAILQ_AnonymousHEAD(headval, field_name, element_type): """ reverse iterate over a TAILQ_HEAD in kernel. refer to bsd/sys/queue.h params: @@ -52,13 +72,17 @@ def ReverseIterateTAILQ_AnonymousHEAD(headval, field_name, element_type): print(entry) """ head_first = headval.__getattr__('tqh_first') - head_first_addr = addressof(head_first) - iter_val = headval.__getattr__('tqh_last') - while (unsigned(iter_val) != unsigned(head_first_addr)) and (unsigned(iter_val) != 0) : - yield iter_val - element = Cast(iter_val, element_type) - iter_val = element.__getattr__(field_name).__getattr__('tqe_prev') - #end of yield loop + if head_first: + head_first_addr = FieldPtrToStructPtr(addressof(head_first),field_name, element_type) + head_last = headval.__getattr__('tqh_last') + iter_val = FieldPtrToStructPtr(head_last, field_name, element_type) + + while (unsigned(iter_val) != unsigned(head_first_addr)) and (unsigned(iter_val) != 0) : + yield iter_val + element = Cast(iter_val, element_type) + tmp = element.__getattr__(field_name).__getattr__('tqe_prev') + iter_val = FieldPtrToStructPtr(tmp, field_name, element_type) + #end of yield loop def ShowNstatTUShadow(inshadow): """ Display summary for an nstat_tu_shadow struct @@ -121,7 +145,12 @@ def ShowNstatGShadow(inshadow): format_string = " INVALID proc magic {0: <#0x}" out_string += format_string.format(procmagic) - print(out_string) + print(out_string) + + for src in IterateTAILQ_HEAD(gshad.gshad_locus.ntl_src_queue, 'nts_locus_link'): + ShowNstatSrc(src) + else: + print(out_string) def GetNstatProcdetailsBrief(procdetails): """ Display a brief summary for an nstat_procdetails struct @@ -157,6 +186,24 @@ def ShowNstatProcdetails(procdetails): print(out_string) +def ShowNstatSockLocus(locus): + """ Display a summary for an nstat_sock_locus struct + params: + locus : cvalue object which points to 'struct nstat_sock_locus *' + """ + locus = Cast(locus, 
'struct nstat_sock_locus *') + out_string = "" + if (locus) : + format_string = "nstat_sock_locus: {0: <#020x} next={1: <#020x} prev={2: <#020x}" + out_string += format_string.format(locus, locus.nsl_link.tqe_next, locus.nsl_link.tqe_prev) + out_string += GetNstatTULocusBrief(locus); + + print(out_string) + iterator = IterateTAILQ_HEAD(locus.nsl_locus.ntl_src_queue, 'nts_locus_link') + for src in iterator: + ShowNstatSrc(src) + + def GetNstatTUShadowBrief(shadow): """ Display a summary for an nstat_tu_shadow struct params: @@ -212,20 +259,20 @@ def GetNstatGenericShadowBrief(shadow): return out_string -def GetNstatTUCookieBrief(cookie): - """ Display a summary for an nstat_tucookie struct +def GetNstatTULocusBrief(cookie): + """ Display a summary for an nnstat_sock_locus struct params: - shadow : cvalue object which points to 'struct nstat_tucookie *' + cookie : cvalue object which points to 'struct nstat_sock_locus *' returns: - str : A string describing various information for the nstat_tucookie structure + str : A string describing various information for the nstat_sock_locus structure """ out_string = "" - tucookie = Cast(cookie, 'struct nstat_tucookie *') - inp = tucookie.inp - pname = tucookie.pname + sol = Cast(cookie, 'struct nstat_sock_locus *') + inp = sol.nsl_inp inpcb = Cast(inp, 'struct inpcb *') inp_socket = inpcb.inp_socket sock = Cast(inp_socket, 'struct socket *') + pname = sol.nsl_pname format_string = " inpcb={0: <#0x}: socket={1: <#020x} process={2: kern.PAGE_PROTECTION_TYPE_PPL: - pvh_flags.append("SLEEPABLE_LOCK") - if pvh_raw & (1 << 52): if kern.globals.page_protection_type <= kern.PAGE_PROTECTION_TYPE_PPL: pvh_flags.append("SECURE_FLUSH_NEEDED") + else: + pvh_flags.append("SLEEPABLE_LOCK") if kern.arch.startswith('arm64') and pvh_raw & (1 << 61): pvh_flags.append("LOCK") @@ -1550,3 +1549,63 @@ def PmapPaIndex(cmd_args=None): print("Physical Address: {:#x}".format(phys_addr)) print("PAI: {:d}".format(pai)) + +@lldb_command('pmapdumpsurts') +def PmapDumpSurts(cmd_args=None): + """ Dump the SURT list. + + Syntax: (lldb) pmapdumpsurts + """ + from scheduler import IterateBitmap + + if "surt_list" not in kern.globals: + raise NotImplementedError("SURT is not supported on this device.") + + i = 0 + for surt_page in IterateLinkageChain(kern.globals.surt_list, 'surt_page_t *', 'surt_chain'): + print(f"SURT Page {i} at physical address {hex(surt_page.surt_page_pa)}") + print('') + print('Allocation status (O: free, X: allocated):') + bitmap_visual = bytearray('X' * 128, 'ascii') + for free_bit in IterateBitmap(surt_page.surt_page_free_bitmap[0]): + bitmap_index = 127 - free_bit + bitmap_visual[bitmap_index:(bitmap_index + 1)] = b'O' + for free_bit in IterateBitmap(surt_page.surt_page_free_bitmap[1]): + bitmap_index = 127 - (free_bit + 64) + bitmap_visual[bitmap_index:(bitmap_index + 1)] = b'O' + + for j in range(0, 128, 8): + print(f"{bitmap_visual[j:(j+8)].decode('ascii')} bit [{127 - j}:{120 - j}]") + + print('') + print('SURT list structure raw:') + print(dereference(surt_page)) + print('') + print('') + + i = i + 1 + +@lldb_command('showallpmaps') +def ShowAllPmaps(cmd_args=None): + """ Dump all pmaps. + + Syntax: (lldb) showallpmaps + """ + for pmap in IterateQueue(kern.globals.map_pmap_list, 'pmap_t', 'pmaps'): + print(dereference(pmap)) + print() + +@lldb_command('pmapforroottablepa') +def PmapForRootTablePa(cmd_args=None): + """ Dump the pmap with matching root TTE physical address. 
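        Walks map_pmap_list and prints every pmap whose ttep matches the
        supplied physical address.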
+ + Syntax: (lldb) pmapforroottablepa + """ + if cmd_args is None or len(cmd_args) == 0: + raise ArgumentError('Invalid argument, expecting the physical address of a root translation table') + + pa = kern.GetValueFromAddress(cmd_args[0], 'unsigned long') + for pmap in IterateQueue(kern.globals.map_pmap_list, 'pmap_t', 'pmaps'): + if pmap.ttep == pa: + print(dereference(pmap)) + print() diff --git a/tools/lldbmacros/process.py b/tools/lldbmacros/process.py index 7210fd4b0..efee9ff0b 100755 --- a/tools/lldbmacros/process.py +++ b/tools/lldbmacros/process.py @@ -119,9 +119,12 @@ def GetProcInfo(proc): str : A string describing various information for process. """ out_string = "" + task = GetTaskFromProc(proc) + if task is None: + task = 0 out_string += ("Process {p: <#020x}\n\tname {0: <32s}\n\tpid:{1: <6d} " + "task:{task: <#020x} p_stat:{p.p_stat: <6d} parent pid: {p.p_ppid: <6d}\n" - ).format(GetProcName(proc), GetProcPID(proc), task=GetTaskFromProc(proc), p=proc) + ).format(GetProcName(proc), GetProcPID(proc), task=task, p=proc) #print the Creds ucred = proc.p_proc_ro.p_ucred.__smr_ptr if ucred: @@ -1943,6 +1946,7 @@ def GetLedgerEntryWithTemplate(ledger_template, ledgerp, i): """ lf_refill_scheduled = 0x0400 lf_tracking_max = 0x4000 + lf_is_counter = 0x80000 entry = {} @@ -1951,10 +1955,17 @@ def GetLedgerEntryWithTemplate(ledger_template, ledgerp, i): et_size = et.et_size if et_size == sizeof("struct ledger_entry_small"): les = ledgerp.l_entries[et.et_offset] - entry["credit"] = unsigned(les.les_credit) + flags = int(les.les_flags) entry["debit"] = 0 - entry["flags"] = int(les.les_flags) + entry["flags"] = flags entry["limit"] = ledger_limit_infinity + if (flags & lf_is_counter) and (hasattr(ledger_template, "lt_counters")): + credit = 0 + for v in memory.IterateZPerCPU(cast(les.les_credit, "scalable_counter_t")): + credit += v + entry["credit"] = credit + else: + entry["credit"] = unsigned(les.les_credit) elif et_size == sizeof("struct ledger_entry"): le = cast(addressof(ledgerp.l_entries[et.et_offset]), "struct ledger_entry *") entry["credit"] = unsigned(le.le_credit) diff --git a/tools/lldbmacros/recount.py b/tools/lldbmacros/recount.py index 9a2e000fd..a3990ee7c 100755 --- a/tools/lldbmacros/recount.py +++ b/tools/lldbmacros/recount.py @@ -1,73 +1,94 @@ from xnu import ( - kern, ArgumentError, unsigned, lldb_command, header, GetEnumValue, - GetEnumValues, GetEnumName, GetThreadName, GetProcStartAbsTimeForTask, - GetRecentTimestamp, GetProcNameForTask, FindTasksByName, IterateQueue) + kern, + ArgumentError, + unsigned, + lldb_command, + header, + GetEnumValue, + GetEnumValues, + GetEnumName, + GetThreadName, + GetProcStartAbsTimeForTask, + GetRecentTimestamp, + GetProcNameForTask, + FindTasksByName, + IterateQueue, +) def validate_args(opts, valid_flags): valid_flags = set(valid_flags) for k in opts.keys(): if k[1:] not in valid_flags: - raise ArgumentError('-{} not supported in subcommand'.format(k)) + raise ArgumentError("-{} not supported in subcommand".format(k)) -@lldb_command('recount', 'AF:MT', fancy=True) +@lldb_command("recount", "AF:MT", fancy=True) def Recount(cmd_args=None, cmd_options={}, O=None): # noqa: E741 - """ Inspect counters maintained by the Recount subsystem on various resource - aggregators, like tasks or threads. + """Inspect counters maintained by the Recount subsystem on various resource + aggregators, like tasks or threads. - recount task [-TM] [...] | -F - recount thread [-M] [...] - recount coalition [-M] [...] - recount processor [-ATM] [] [...] 
+ recount task [-TM] [...] | -F + recount thread [-M] [...] + recount coalition [-M] [...] + recount processor [-ATM] [] [...] - Options: - -T : break out active threads for a task or processor - -M : show times in the Mach timebase - -A : show all processors + Options: + -T : break out active threads for a task or processor + -M : show times in the Mach timebase + -A : show all processors - Diagnostic macros: - recount diagnose task - - Ensure resource accounting consistency in a task. - recount triage - - Print out statistics useful for general panic triage. + Diagnostic macros: + recount diagnose task + - Ensure resource accounting consistency in a task. + recount triage + - Print out statistics useful for general panic triage. """ if cmd_args is None or len(cmd_args) == 0: - raise ArgumentError('subcommand required') + raise ArgumentError("subcommand required") - if cmd_args[0] == 'coalition': - validate_args(cmd_options, ['M']) + if cmd_args[0] == "coalition": + validate_args(cmd_options, ["M"]) RecountCoalition(cmd_args[1:], cmd_options=cmd_options, O=O) - elif cmd_args[0] == 'task': - validate_args(cmd_options, ['F', 'M', 'T']) + elif cmd_args[0] == "task": + validate_args(cmd_options, ["F", "M", "T"]) RecountTask(cmd_args[1:], cmd_options=cmd_options, O=O) - elif cmd_args[0] == 'thread': - validate_args(cmd_options, ['M']) + elif cmd_args[0] == "thread": + validate_args(cmd_options, ["M"]) RecountThread(cmd_args[1:], cmd_options=cmd_options, O=O) - elif cmd_args[0] == 'processor': - validate_args(cmd_options, ['A', 'M', 'T']) + elif cmd_args[0] == "processor": + validate_args(cmd_options, ["A", "M", "T"]) RecountProcessor(cmd_args[1:], cmd_options=cmd_options, O=O) - elif cmd_args[0] == 'diagnose': + elif cmd_args[0] == "diagnose": RecountDiagnose(cmd_args[1:], cmd_options=cmd_options, O=O) - elif cmd_args[0] == 'triage': + elif cmd_args[0] == "triage": validate_args(cmd_options, []) RecountTriage(cmd_options=cmd_options, O=O) else: - raise ArgumentError('{}: invalid subcommand'.format(cmd_args[0])) + raise ArgumentError("{}: invalid subcommand".format(cmd_args[0])) -def scale_suffix(val, unit=''): +def scale_suffix(val, unit=""): si_units = [ - (1e21, 'Z'), (1e18, 'E'), (1e15, 'P'), (1e12, 'T'), (1e9, 'B'), - (1e6, 'M'), (1e3, 'k'), (1, ' '), (1e-3, 'm'), (1e-6, 'u'), - (1e-9, 'n')] - scale, sfx = (1, '') - for (si_scale, si_sfx) in si_units: + (1e21, "Z"), + (1e18, "E"), + (1e15, "P"), + (1e12, "T"), + (1e9, "B"), + (1e6, "M"), + (1e3, "k"), + (1, " "), + (1e-3, "m"), + (1e-6, "u"), + (1e-9, "n"), + ] + scale, sfx = (1, "") + for si_scale, si_sfx in si_units: if val >= si_scale: scale, sfx = (si_scale, si_sfx) break - return '{:>7.3f}{:<1s}{}'.format(val / scale, sfx, unit) + return "{:>7.3f}{:<1s}{}".format(val / scale, sfx, unit) class RecountSum(object): @@ -85,19 +106,19 @@ class RecountSum(object): self._valid_count = 0 def add_usage(self, usage): - for (_, level) in self._levels: + for _, level in self._levels: metrics = usage.ru_metrics[level] self._times_mach[level] += unsigned(metrics.rm_time_mach) - if hasattr(metrics, 'rm_cycles'): + if hasattr(metrics, "rm_cycles"): self._instructions[level] += unsigned(metrics.rm_instructions) self._cycles[level] += unsigned(metrics.rm_cycles) if unsigned(metrics.rm_cycles) != 0: self._valid_count += 1 - if hasattr(usage, 'ru_energy_nj'): + if hasattr(usage, "ru_energy_nj"): self._energy_nj += unsigned(usage.ru_energy_nj) def user_sys_times(self): - user_level = GetEnumValue('recount_level_t', 'RCT_LVL_USER') + user_level = 
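# The "system" half returned below is simply total minus user time, i.e.
# the kernel level plus the secure level when one is tracked.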
GetEnumValue("recount_level_t", "RCT_LVL_USER") user_time = self._times_mach[user_level] return (user_time, sum(self._times_mach) - user_time) @@ -118,41 +139,52 @@ class RecountSum(object): return kern.GetNanotimeFromAbstime(time) def fmt_args(self): - level_args = [[ + level_args = [ + [ level_name, self._convert_time(self._times_mach[level]), scale_suffix(self._cycles[level]), self.div_valid( - self._cycles[level], - kern.GetNanotimeFromAbstime(self._times_mach[level])), + self._cycles[level], + kern.GetNanotimeFromAbstime(self._times_mach[level]), + ), scale_suffix(self._instructions[level]), self.div_valid(self._cycles[level], self._instructions[level]), - '-', - '-'] for (level_name, level) in - RecountPlan.levels()] + "-", + "-", + ] + for (level_name, level) in RecountPlan.levels() + ] total_time_ns = kern.GetNanotimeFromAbstime(sum(self._times_mach)) total_cycles = sum(self._cycles) total_insns = sum(self._instructions) power_w = self._energy_nj / total_time_ns if total_time_ns != 0 else 0 - level_args.append([ - '*', - total_time_ns / 1e9, scale_suffix(total_cycles), + level_args.append( + [ + "*", + total_time_ns / 1e9, + scale_suffix(total_cycles), self.div_valid(total_cycles, total_time_ns), scale_suffix(total_insns), self.div_valid(total_cycles, total_insns), - scale_suffix(self._energy_nj / 1e9, 'J'), - scale_suffix(power_w, 'W')]) + scale_suffix(self._energy_nj / 1e9, "J"), + scale_suffix(power_w, "W"), + ] + ) return level_args def fmt_basic_args(self): - return [[ + return [ + [ level_name, self._convert_time(self._times_mach[level]), self._cycles[level], self._instructions[level], - '-'] for (level_name, level) in - RecountPlan.levels()] + "-", + ] + for (level_name, level) in RecountPlan.levels() + ] class RecountPlan(object): @@ -165,35 +197,36 @@ class RecountPlan(object): self._group_names = [] self._group_column = None - plan = kern.GetGlobalVariable('recount_' + name + '_plan') + plan = kern.GetGlobalVariable("recount_" + name + "_plan") topo = plan.rpl_topo - if topo == GetEnumValue('recount_topo_t', 'RCT_TOPO_CPU'): - self._group_column = 'cpu' + if topo == GetEnumValue("recount_topo_t", "RCT_TOPO_CPU"): + self._group_column = "cpu" self._group_count = unsigned(kern.globals.real_ncpus) - self._group_names = [ - 'cpu-{}'.format(i) for i in range(self._group_count)] - elif topo == GetEnumValue('recount_topo_t', 'RCT_TOPO_CPU_KIND'): - if kern.arch.startswith('arm64'): - self._group_column = 'cpu-kind' + self._group_names = ["cpu-{}".format(i) for i in range(self._group_count)] + elif topo == GetEnumValue("recount_topo_t", "RCT_TOPO_CPU_KIND"): + if kern.arch.startswith("arm64"): + self._group_column = "cpu-kind" cluster_mask = int(kern.globals.topology_info.cluster_types) - self._group_count = bin(cluster_mask).count('1') + self._group_count = bin(cluster_mask).count("1") self._group_names = [ - GetEnumName('recount_cpu_kind_t', i)[8:][:4] - for i in range(self._group_count)] + GetEnumName("recount_cpu_kind_t", i)[8:][:4] + for i in range(self._group_count) + ] else: self._group_count = 1 - elif topo == GetEnumValue('recount_topo_t', 'RCT_TOPO_SYSTEM'): + elif topo == GetEnumValue("recount_topo_t", "RCT_TOPO_SYSTEM"): self._group_count = 1 else: - raise RuntimeError('{}: Unexpected recount topography', topo) + raise RuntimeError("{}: Unexpected recount topography", topo) def time_fmt(self): - return '{:>12d}' if self._mach_times else '{:>12.05f}' + return "{:>12d}" if self._mach_times else "{:>12.05f}" def _usage_fmt(self): - prefix = '{n}{{:>6s}} {t} '.format( - 
t=self.time_fmt(), n='{:>8s} ' if self._group_column else '') - return prefix + '{:>8s} {:>7.3g} {:>8s} {:>5.03f} {:>9s} {:>9s}' + prefix = "{n}{{:>6s}} {t} ".format( + t=self.time_fmt(), n="{:>8s} " if self._group_column else "" + ) + return prefix + "{:>8s} {:>7.3g} {:>8s} {:>5.03f} {:>9s} {:>9s}" def usages(self, usages): for i in range(self._group_count): @@ -204,20 +237,32 @@ class RecountPlan(object): yield tracks[i].rt_usage def usage_header(self): - fmt = '{:>6s} {:>12s} {:>8s} {:>7s} {:>8s} {:>5s} {:>9s} {:>9s}'.format( # noqa: E501 - 'level', 'time', 'cycles', 'GHz', 'insns', - 'CPI', 'energy', 'power',) + fmt = "{:>6s} {:>12s} {:>8s} {:>7s} {:>8s} {:>5s} {:>9s} {:>9s}".format( # noqa: E501 + "level", + "time", + "cycles", + "GHz", + "insns", + "CPI", + "energy", + "power", + ) if self._group_column: - fmt = '{:>8s} '.format(self._group_column) + fmt + fmt = "{:>8s} ".format(self._group_column) + fmt return fmt def levels(): - names = ['kernel', 'user'] - levels = list(zip(names, GetEnumValues('recount_level_t', [ - 'RCT_LVL_' + name.upper() for name in names]))) + names = ["kernel", "user"] + levels = list( + zip( + names, + GetEnumValues( + "recount_level_t", ["RCT_LVL_" + name.upper() for name in names] + ), + ) + ) try: - levels.append(('secure', - GetEnumValue('recount_level_t', 'RCT_LVL_SECURE'))) + levels.append(("secure", GetEnumValue("recount_level_t", "RCT_LVL_SECURE"))) except KeyError: # RCT_LVL_SECURE is not defined on this system. pass @@ -231,7 +276,7 @@ class RecountPlan(object): total_time_ns = 0 total_cycles = 0 total_insns = 0 - for (level_name, level) in levels: + for level_name, level in levels: metrics = usage.ru_metrics[level] time = unsigned(metrics.rm_time_mach) time_ns = kern.GetNanotimeFromAbstime(time) @@ -239,7 +284,7 @@ class RecountPlan(object): if not self._mach_times: time = time_ns / 1e9 total_time += time - if hasattr(metrics, 'rm_cycles'): + if hasattr(metrics, "rm_cycles"): cycles = unsigned(metrics.rm_cycles) total_cycles += cycles freq = cycles / time_ns if time_ns != 0 else 0 @@ -251,11 +296,20 @@ class RecountPlan(object): freq = 0 insns = 0 cpi = 0 - rows.append([ - level_name, time, scale_suffix(cycles), freq, - scale_suffix(insns), cpi, '-', '-']) + rows.append( + [ + level_name, + time, + scale_suffix(cycles), + freq, + scale_suffix(insns), + cpi, + "-", + "-", + ] + ) - if hasattr(usage, 'ru_energy_nj'): + if hasattr(usage, "ru_energy_nj"): energy_nj = unsigned(usage.ru_energy_nj) if total_time_ns != 0: power_w = energy_nj / total_time_ns @@ -271,11 +325,18 @@ class RecountPlan(object): total_freq = 0 total_cpi = 0 - rows.append([ - '*', total_time, scale_suffix(total_cycles), total_freq, - scale_suffix(total_insns), total_cpi, - scale_suffix(energy_nj / 1e9, 'J'), - scale_suffix(power_w, 'W')]) + rows.append( + [ + "*", + total_time, + scale_suffix(total_cycles), + total_freq, + scale_suffix(total_insns), + total_cpi, + scale_suffix(energy_nj / 1e9, "J"), + scale_suffix(power_w, "W"), + ] + ) if sum: sum.add_usage(usage) @@ -289,29 +350,29 @@ class RecountPlan(object): def format_sum(self, sum, O=None): lines = [] for line in sum.fmt_args(): - lines.append(O.format(self._usage_fmt(), '*', *line)) + lines.append(O.format(self._usage_fmt(), "*", *line)) return lines def format_usages(self, usages, O=None): # noqa: E741 sum = RecountSum(self._mach_times) if self._group_count > 1 else None - str = '' - for (i, usage) in enumerate(self.usages(usages)): + str = "" + for i, usage in enumerate(self.usages(usages)): name = 
self._group_names[i] if i < len(self._group_names) else None lines = self.format_usage(usage, name=name, sum=sum, O=O) - str += '\n'.join(lines) + '\n' + str += "\n".join(lines) + "\n" if sum: - str += '\n'.join(self.format_sum(sum, O=O)) + str += "\n".join(self.format_sum(sum, O=O)) return str def format_tracks(self, tracks, O=None): # noqa: E741 sum = RecountSum(self._mach_times) if self._group_count > 1 else None - str = '' - for (i, usage) in enumerate(self.track_usages(tracks)): + str = "" + for i, usage in enumerate(self.track_usages(tracks)): name = self._group_names[i] if i < len(self._group_names) else None lines = self.format_usage(usage, name=name, sum=sum, O=O) - str += '\n'.join(lines) + '\n' + str += "\n".join(lines) + "\n" if sum: - str += '\n'.join(self.format_sum(sum, O=O)) + str += "\n".join(self.format_sum(sum, O=O)) return str def sum_usages(self, usages, sum=None): @@ -330,7 +391,7 @@ class RecountPlan(object): def GetTaskTerminatedUserSysTime(task): - plan = RecountPlan('task_terminated') + plan = RecountPlan("task_terminated") sum = RecountSum() for usage in plan.usages(task.tk_recount.rtk_terminated): sum.add_usage(usage) @@ -338,7 +399,7 @@ def GetTaskTerminatedUserSysTime(task): def GetThreadUserSysTime(thread): - plan = RecountPlan('thread') + plan = RecountPlan("thread") sum = RecountSum() for usage in plan.track_usages(thread.th_recount.rth_lifetime): sum.add_usage(usage) @@ -347,17 +408,21 @@ def GetThreadUserSysTime(thread): def print_threads(plan, thread_ptrs, indent=False, O=None): # noqa: E741 for thread_ptr in thread_ptrs: - thread = kern.GetValueFromAddress(thread_ptr, 'thread_t') - print('{}thread 0x{:x} 0x{:x} {}'.format( - ' ' if indent else '', unsigned(thread.thread_id), - unsigned(thread), GetThreadName(thread))) + thread = kern.GetValueFromAddress(thread_ptr, "thread_t") + print( + "{}thread 0x{:x} 0x{:x} {}".format( + " " if indent else "", + unsigned(thread.thread_id), + unsigned(thread), + GetThreadName(thread), + ) + ) with O.table(plan.usage_header(), indent=indent): print(plan.format_tracks(thread.th_recount.rth_lifetime, O=O)) -def RecountThread( - thread_ptrs, cmd_options={}, indent=False, O=None): # noqa: E741 - plan = RecountPlan('thread', mach_times='-M' in cmd_options) +def RecountThread(thread_ptrs, cmd_options={}, indent=False, O=None): # noqa: E741 + plan = RecountPlan("thread", mach_times="-M" in cmd_options) print_threads(plan, thread_ptrs, indent=indent, O=O) @@ -372,45 +437,44 @@ def print_task_description(task): task_name = GetProcNameForTask(task) task_age_ns = get_task_age_ns(task) if task_age_ns is not None: - duration_desc = '{:.3f}s'.format(task_age_ns / 1e9) + duration_desc = "{:.3f}s".format(task_age_ns / 1e9) else: - duration_desc = '-s' - print('task 0x{:x} {} ({} old)'.format( - unsigned(task), task_name, duration_desc)) + duration_desc = "-s" + print("task 0x{:x} {} ({} old)".format(unsigned(task), task_name, duration_desc)) return task_name def RecountTask(task_ptrs, cmd_options={}, O=None): # noqa: E741 - if '-F' in cmd_options: - tasks = FindTasksByName(cmd_options['-F']) + if "-F" in cmd_options: + tasks = FindTasksByName(cmd_options["-F"]) else: - tasks = [kern.GetValueFromAddress(t, 'task_t') for t in task_ptrs] - mach_times = '-M' in cmd_options - plan = RecountPlan('task', mach_times=mach_times) - terminated_plan = RecountPlan('task_terminated', mach_times=mach_times) - active_threads = '-T' in cmd_options + tasks = [kern.GetValueFromAddress(t, "task_t") for t in task_ptrs] + mach_times = "-M" in cmd_options 
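    # Two plans are consulted per task: the live "task" plan for the lifetime
    # counters and the "task_terminated" plan for threads that have already
    # exited; -T additionally breaks out each active thread.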
+ plan = RecountPlan("task", mach_times=mach_times) + terminated_plan = RecountPlan("task_terminated", mach_times=mach_times) + active_threads = "-T" in cmd_options if active_threads: - thread_plan = RecountPlan('thread', mach_times=mach_times) + thread_plan = RecountPlan("thread", mach_times=mach_times) for task in tasks: task_name = print_task_description(task) with O.table(plan.usage_header()): print(plan.format_tracks(task.tk_recount.rtk_lifetime, O=O)) if active_threads: - threads = [unsigned(t) for t in IterateQueue( - task.threads, 'thread *', 'task_threads')] + threads = [ + unsigned(t) + for t in IterateQueue(task.threads, "thread *", "task_threads") + ] print_threads(thread_plan, threads, indent=True, O=O) - print('task (terminated threads) 0x{:x} {}'.format( - unsigned(task), task_name)) + print("task (terminated threads) 0x{:x} {}".format(unsigned(task), task_name)) with O.table(terminated_plan.usage_header()): - print(terminated_plan.format_usages( - task.tk_recount.rtk_terminated, O=O)) + print(terminated_plan.format_usages(task.tk_recount.rtk_terminated, O=O)) def RecountCoalition(coal_ptrs, cmd_options={}, O=None): # noqa: E741 - plan = RecountPlan('coalition', mach_times='-M' in cmd_options) - coals = [kern.GetValueFromAddress(c, 'coalition_t') for c in coal_ptrs] + plan = RecountPlan("coalition", mach_times="-M" in cmd_options) + coals = [kern.GetValueFromAddress(c, "coalition_t") for c in coal_ptrs] for coal in coals: - print('coalition 0x{:x} {}'.format(unsigned(coal), unsigned(coal.id))) + print("coalition 0x{:x} {}".format(unsigned(coal), unsigned(coal.id))) with O.table(plan.usage_header()): print(plan.format_usages(coal.r.co_recount.rco_exited, O=O)) @@ -418,21 +482,20 @@ def RecountCoalition(coal_ptrs, cmd_options={}, O=None): # noqa: E741 def get_processor(ptr_or_id): ptr_or_id = unsigned(ptr_or_id) if ptr_or_id < 1024: - processor_list = kern.GetGlobalVariable('processor_list') + processor_list = kern.GetGlobalVariable("processor_list") current_processor = processor_list while unsigned(current_processor) > 0: if unsigned(current_processor.cpu_id) == ptr_or_id: return current_processor current_processor = current_processor.processor_list - raise ArgumentError('no processor found with CPU ID {}'.format( - ptr_or_id)) + raise ArgumentError("no processor found with CPU ID {}".format(ptr_or_id)) else: - return kern.GetValueFromAddress(ptr_or_id, 'processor_t') + return kern.GetValueFromAddress(ptr_or_id, "processor_t") def get_all_processors(): processors = [] - processor_list = kern.GetGlobalVariable('processor_list') + processor_list = kern.GetGlobalVariable("processor_list") current_processor = processor_list while unsigned(current_processor) > 0: processors.append(current_processor) @@ -441,19 +504,23 @@ def get_all_processors(): def RecountProcessor(pr_ptrs_or_ids, cmd_options={}, O=None): # noqa: E741 - mach_times = '-M' in cmd_options - plan = RecountPlan('processor', mach_times=mach_times) - if '-A' in cmd_options: + mach_times = "-M" in cmd_options + plan = RecountPlan("processor", mach_times=mach_times) + if "-A" in cmd_options: prs = get_all_processors() else: prs = [get_processor(p) for p in pr_ptrs_or_ids] - active_threads = '-T' in cmd_options + active_threads = "-T" in cmd_options if active_threads: - thread_plan = RecountPlan('thread', mach_times=mach_times) - hdr_prefix = '{:>18s} {:>4s} {:>4s} '.format('processor', 'cpu', 'kind',) - header_fmt = ' {:>12s} {:>12s} {:>8s}' - hdr_suffix = header_fmt.format('idle-time', 'total-time', 'idle-pct') - 
null_suffix = header_fmt.format('-', '-', '-') + thread_plan = RecountPlan("thread", mach_times=mach_times) + hdr_prefix = "{:>18s} {:>4s} {:>4s} ".format( + "processor", + "cpu", + "kind", + ) + header_fmt = " {:>12s} {:>12s} {:>8s}" + hdr_suffix = header_fmt.format("idle-time", "total-time", "idle-pct") + null_suffix = header_fmt.format("-", "-", "-") levels = RecountPlan.levels() with O.table(hdr_prefix + plan.usage_header() + hdr_suffix): for pr in prs: @@ -465,38 +532,41 @@ def RecountProcessor(pr_ptrs_or_ids, cmd_options={}, O=None): # noqa: E741 idle_time = kern.GetNanotimeFromAbstime(idle_time) / 1e9 total_time = kern.GetNanotimeFromAbstime(total_time) / 1e9 pset = pr.processor_set - cluster_kind = 'SMP' + cluster_kind = "SMP" if unsigned(pset.pset_cluster_type) != 0: - cluster_kind = GetEnumName('pset_cluster_type_t', - pset.pset_cluster_type, 'PSET_AMP_') - prefix = '{:<#018x} {:>4d} {:>4s} '.format( - unsigned(pr), pr.cpu_id, cluster_kind) + cluster_kind = GetEnumName( + "pset_cluster_type_t", pset.pset_cluster_type, "PSET_AMP_" + ) + prefix = "{:<#018x} {:>4d} {:>4s} ".format( + unsigned(pr), pr.cpu_id, cluster_kind + ) suffix = ( - ' ' + plan.time_fmt().format(idle_time) + ' ' + - plan.time_fmt().format(total_time) + - ' {:>7.2f}%'.format(idle_time / total_time * 100)) + " " + + plan.time_fmt().format(idle_time) + + " " + + plan.time_fmt().format(total_time) + + " {:>7.2f}%".format(idle_time / total_time * 100) + ) usage_lines = plan.format_usage(usage, O=O) - for (i, line) in enumerate(usage_lines): + for i, line in enumerate(usage_lines): line_suffix = null_suffix if i + 1 == len(usage_lines): line_suffix = suffix - O.write(prefix + line + line_suffix + '\n') + O.write(prefix + line + line_suffix + "\n") if active_threads: active_thread = unsigned(pr.active_thread) if active_thread != 0: - print_threads( - thread_plan, [active_thread], indent=True, O=O) + print_threads(thread_plan, [active_thread], indent=True, O=O) -@header('{:>4s} {:>20s} {:>20s} {:>20s}'.format( - 'cpu', 'time-mach', 'cycles', 'insns')) +@header("{:>4s} {:>20s} {:>20s} {:>20s}".format("cpu", "time-mach", "cycles", "insns")) def GetRecountSnapshot(cpu, snap, O=None): (insns, cycles) = (0, 0) - if hasattr(snap, 'rsn_cycles'): + if hasattr(snap, "rsn_cycles"): (insns, cycles) = (snap.rsn_insns, snap.rsn_cycles) return O.format( - '{:4d} {:20d} {:20d} {:20d}', cpu, snap.rsn_time_mach, - cycles, insns) + "{:4d} {:20d} {:20d} {:20d}", cpu, snap.rsn_time_mach, cycles, insns + ) def GetRecountProcessorState(pr): @@ -504,92 +574,108 @@ def GetRecountProcessorState(pr): state = state_time >> 63 return ( pr.pr_recount.rpr_snap, - 'I' if state == 1 else 'A', - state_time & ~(0x1 << 63)) + "I" if state == 1 else "A", + state_time & ~(0x1 << 63), + ) -@header('{:>20s} {:>4s} {:>6s} {:>18s} {:>18s} {:>18s} {:>18s} {:>18s}'.format( - 'processor', 'cpu', 'state', 'last-idle-change', 'last-user-change', - 'last-disp', 'since-idle-change', 'since-user-change')) +@header( + "{:>20s} {:>4s} {:>6s} {:>18s} {:>18s} {:>18s} {:>18s} {:>18s}".format( + "processor", + "cpu", + "state", + "last-idle-change", + "last-user-change", + "last-disp", + "since-idle-change", + "since-user-change", + ) +) def GetRecountProcessorDiagnostics(pr, cur_time, O=None): (snap, state, time) = GetRecountProcessorState(pr) cpu_id = unsigned(pr.cpu_id) last_usrchg = snap.rsn_time_mach since_usrchg = cur_time - last_usrchg - last_disp = '{}{:>d}'.format( - '*' if cur_time == unsigned(pr.last_dispatch) else '', - pr.last_dispatch) + last_disp = 
"{}{:>d}".format( + "*" if cur_time == unsigned(pr.last_dispatch) else "", pr.last_dispatch + ) return O.format( - '{:>#20x} {:4d} {:>6s} {:>18d} {:>18d} {:>18s} {:>18d} {:>18d}', - unsigned(pr), cpu_id, state, time, last_usrchg, last_disp, - cur_time - time, since_usrchg) + "{:>#20x} {:4d} {:>6s} {:>18d} {:>18d} {:>18s} {:>18d} {:>18d}", + unsigned(pr), + cpu_id, + state, + time, + last_usrchg, + last_disp, + cur_time - time, + since_usrchg, + ) -@header('{:>12s} {:>6s} {:>12s} {:>20s} {:>20s}'.format( - 'group', 'level', 'time', 'cycles', 'insns')) +@header( + "{:>12s} {:>6s} {:>12s} {:>20s} {:>20s}".format( + "group", "level", "time", "cycles", "insns" + ) +) def RecountDiagnoseTask(task_ptrs, cmd_options={}, O=None): # noqa: E74 - if '-F' in cmd_options: - tasks = FindTasksByName(cmd_options['-F']) + if "-F" in cmd_options: + tasks = FindTasksByName(cmd_options["-F"]) else: - tasks = [kern.GetValueFromAddress(t, 'task_t') for t in task_ptrs] + tasks = [kern.GetValueFromAddress(t, "task_t") for t in task_ptrs] - line_fmt = '{:20s} = {:10.3f}' - row_fmt = '{:>12s} {:>6s} {:>12.3f} {:>20d} {:>20d}' + line_fmt = "{:20s} = {:10.3f}" + row_fmt = "{:>12s} {:>6s} {:>12.3f} {:>20d} {:>20d}" - task_plan = RecountPlan('task', mach_times=False) - term_plan = RecountPlan('task_terminated', mach_times=False) + task_plan = RecountPlan("task", mach_times=False) + term_plan = RecountPlan("task_terminated", mach_times=False) for task in tasks: print_task_description(task) with O.table(RecountDiagnoseTask.header): task_sum = task_plan.sum_tracks(task.tk_recount.rtk_lifetime) for line in task_sum.fmt_basic_args(): line = line[:-1] - print(O.format(row_fmt, 'task', *line)) + print(O.format(row_fmt, "task", *line)) term_sum = term_plan.sum_usages(task.tk_recount.rtk_terminated) for line in term_sum.fmt_basic_args(): - print(O.format(row_fmt, 'terminated', *line)) + print(O.format(row_fmt, "terminated", *line)) term_sum_ns = term_sum.time() threads_sum = RecountSum(mach_times=True) threads_time_mach = threads_sum.time() - for thread in IterateQueue( - task.threads, 'thread *', 'task_threads'): + for thread in IterateQueue(task.threads, "thread *", "task_threads"): usr_time, sys_time = GetThreadUserSysTime(thread) threads_time_mach += usr_time + sys_time threads_sum_ns = kern.GetNanotimeFromAbstime(threads_time_mach) - print(line_fmt.format('threads CPU', threads_sum_ns / 1e9)) + print(line_fmt.format("threads CPU", threads_sum_ns / 1e9)) all_threads_sum_ns = threads_sum_ns + term_sum_ns - print(line_fmt.format('all threads CPU', all_threads_sum_ns / 1e9)) + print(line_fmt.format("all threads CPU", all_threads_sum_ns / 1e9)) - print(line_fmt.format( - 'discrepancy', task_sum.time() - all_threads_sum_ns)) + print(line_fmt.format("discrepancy", task_sum.time() - all_threads_sum_ns)) def RecountDiagnose(cmd_args=[], cmd_options={}, O=None): # noqa: E741 if cmd_args is None or len(cmd_args) == 0: - raise ArgumentError('diagnose subcommand required') + raise ArgumentError("diagnose subcommand required") - if cmd_args[0] == 'task': - validate_args(cmd_options, ['F']) + if cmd_args[0] == "task": + validate_args(cmd_options, ["F"]) RecountDiagnoseTask(cmd_args[1:], cmd_options=cmd_options, O=O) else: - raise ArgumentError('{}: invalid diagnose subcommand'.format( - cmd_args[0])) + raise ArgumentError("{}: invalid diagnose subcommand".format(cmd_args[0])) def RecountTriage(cmd_options={}, O=None): # noqa: E741 prs = get_all_processors() - print('processors') + print("processors") with 
O.table(GetRecountProcessorDiagnostics.header, indent=True): max_dispatch = max([unsigned(pr.last_dispatch) for pr in prs]) for pr in prs: - print(GetRecountProcessorDiagnostics( - pr, cur_time=max_dispatch, O=O)) + print(GetRecountProcessorDiagnostics(pr, cur_time=max_dispatch, O=O)) - print('snapshots') + print("snapshots") with O.table(GetRecountSnapshot.header, indent=True): - for (i, pr) in enumerate(prs): + for i, pr in enumerate(prs): print(GetRecountSnapshot(i, pr.pr_recount.rpr_snap, O=O)) diff --git a/tools/lldbmacros/ruff.toml b/tools/lldbmacros/ruff.toml index d78823bf4..6b49c303a 100644 --- a/tools/lldbmacros/ruff.toml +++ b/tools/lldbmacros/ruff.toml @@ -1,10 +1,5 @@ -# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. -select = ["E", "F"] -ignore = ["E741"] - -# Allow autofix for all enabled rules (when `--fix`) is provided. -fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] -unfixable = [] +# Assume Python 3.9. +target-version = "py39" # Assume several lldb and xnu-specific globals are built-in. builtins = ["lldb", "kern", "cast", "addressof"] @@ -18,14 +13,18 @@ exclude = [ "dist", ] -# Same as Black. -line-length = 88 +[lint] +# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. +select = ["E", "F"] +# Disable ambiguous variable names and line lengths. +ignore = ["E741", "E501"] + +# Allow autofix for all enabled rules (when `--fix`) is provided. +fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] +unfixable = [] # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -# Assume Python 3.9. -target-version = "py39" - # Unlike Flake8, default to a complexity level of 10. mccabe.max-complexity = 10 diff --git a/tools/lldbmacros/scheduler.py b/tools/lldbmacros/scheduler.py index a0a30646d..fff5a750d 100755 --- a/tools/lldbmacros/scheduler.py +++ b/tools/lldbmacros/scheduler.py @@ -631,7 +631,13 @@ def ShowScheduler(cmd_args=None): """ Routine to print information of all psets and processors Usage: showscheduler """ - node = addressof(kern.globals.pset_node0) + if GetEnumValue('pset_cluster_type_t', 'MAX_PSET_TYPES') > 1: + # AMP platform + node = addressof(kern.globals.pset_nodes[0]) + else: + # SMP platform + node = addressof(kern.globals.pset_node0) + show_priority_runq = 0 show_priority_pset_runq = 0 show_clutch = 0 diff --git a/tools/lldbmacros/tests/integration_smoke/test_lldb_macros.py b/tools/lldbmacros/tests/integration_smoke/test_lldb_macros.py index b2adaae96..a16398444 100755 --- a/tools/lldbmacros/tests/integration_smoke/test_lldb_macros.py +++ b/tools/lldbmacros/tests/integration_smoke/test_lldb_macros.py @@ -157,7 +157,6 @@ MACROS = [ ("addkextaddr", _arbitrary_kext), ("showzpcpu", ""), # TODO: ? 
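    # Entries are either a bare macro name or a (name, arguments) tuple when
    # the macro under test needs an argument.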
"memstats", - "showpgz", ("whatis", _arbitrary_kext), "showzcache", "zprint", diff --git a/tools/lldbmacros/userspace.py b/tools/lldbmacros/userspace.py index c53713606..591cee776 100755 --- a/tools/lldbmacros/userspace.py +++ b/tools/lldbmacros/userspace.py @@ -455,7 +455,7 @@ def _ExtractDataFromString(strdata, offset, data_type, length=0): data = struct.unpack(unpack_str, strdata[offset:(offset + length)])[0] if data_type == 'string': - return data.decode() + return data.decode(errors='backslashreplace') return data diff --git a/tools/lldbmacros/utils.py b/tools/lldbmacros/utils.py index 12d04dcb0..9d853adab 100755 --- a/tools/lldbmacros/utils.py +++ b/tools/lldbmacros/utils.py @@ -621,14 +621,23 @@ def print_hex_data(data, start=0, desc="", marks={}, prefix=" ", extra=None): def Ones(x): return (1 << x)-1 -def StripPAC(x, TySz): +def CanonicalAddress(x, TySz): + """ Canonicalize an address. That is to say, sign-extend the upper 64-TySz + address bits with either 0 or 1 depending on bit 55. + + params: + x: The address to modify. + TySz: Size of the corresponding VA region. + """ sign_mask = 1 << 55 ptr_mask = Ones(64-TySz) - pac_mask = ~ptr_mask + msb_mask = ~ptr_mask sign = x & sign_mask if sign: - return (x | pac_mask) + 2**64 + # Sign-extend + return (x | msb_mask) + 2**64 else: + # Zero-extend return x & ptr_mask @cache_statically diff --git a/tools/lldbmacros/waitq.py b/tools/lldbmacros/waitq.py index 6878494a1..c35f75216 100755 --- a/tools/lldbmacros/waitq.py +++ b/tools/lldbmacros/waitq.py @@ -1,24 +1,16 @@ from xnu import * from utils import * from core.configuration import * +from core import OSHashPointer import sys import struct -def _swap32(i): - return struct.unpack("I", i))[0] - def _getSafeQ(queue): g_wqs = kern.GetGlobalVariable('global_waitqs') g_cnt = unsigned(kern.GetGlobalVariable('g_num_waitqs')) - q_hash = unsigned(queue) - q_hash >>= 4 - q_hash *= 0x5052acdb - q_hash &= 0xffffffff - q_hash ^= _swap32(q_hash) - - return addressof(g_wqs[q_hash & (g_cnt - 1)]) + return addressof(g_wqs[OSHashPointer(queue) & (g_cnt - 1)]) class Waitq(object): """ @@ -179,9 +171,11 @@ def ShowWaitqHelper(waitq, O=None): if waitq.hasThreads(): print("Waiters:") - with O.table("{:<20s} {:<20s}".format('waiter', 'event'), indent=True): + with O.table("{:<20s} {:<20s} {:s}".format('waiter', 'event', 'hint'), indent=True): for thread in waitq.iterateThreads(): - print("{:<#20x} {:<#20x}".format(unsigned(thread), thread.wait_event)) + hint = thread.block_hint or thread.pending_block_hint + hint = GetEnumName('block_hint_t', hint, 'kThreadWait'); + print(f"{unsigned(thread):<#20x} {thread.wait_event:<#20x} {hint:s}"); if waitq.hasSets(): print("Sets:") diff --git a/tools/lldbmacros/xnudefines.py b/tools/lldbmacros/xnudefines.py index efc9d922f..decd3c9a0 100755 --- a/tools/lldbmacros/xnudefines.py +++ b/tools/lldbmacros/xnudefines.py @@ -187,10 +187,6 @@ P_PLATFORM_TVOSSIMULATOR = 8 P_PLATFORM_WATCHOSSIMULATOR = 9 P_PLATFORM_DRIVERKIT = 10 -# File: osfmk/ipc/ipc_object.h -IO_BITS_ACTIVE = 0x80000000 -IO_BITS_KOTYPE = 0x3ff - # File: kern_memorystatus.h JETSAM_PRIORITY_MAX = 210 P_MEMSTAT_FROZEN = 0x00000002 @@ -212,6 +208,8 @@ IV_UNUSED_KEYINDEX = ~0 MACH_VOUCHER_ATTR_KEY_ALL = ~0 MACH_VOUCHER_ATTR_KEY_NUM = 8 +#File: osfmk/ipc/ipc_entry.h +IE_BITS_IMMOVABLE_SEND = 0x00800000 + if __name__ == "__main__": pass - diff --git a/tools/pre-commit.sh b/tools/pre-commit.sh index 4d05a807d..eaa0cc7e9 100755 --- a/tools/pre-commit.sh +++ b/tools/pre-commit.sh @@ -1,17 +1,75 @@ -#!/bin/sh -set -e 
+#!/bin/bash +# # Abort a commit if the code style is incorrect. +# -# Get a list of paths with staged changes. -FILES=$(git diff --staged --name-only --diff-filter=d) -# Check the paths for style issues. -RESULT=0 -if [ ! -z "$FILES" ]; then - # Stash any unstaged changes. - git stash --quiet --keep-index - ./tools/uncrustify.sh $FILES || RESULT=$? - # Restore the unstaged changes. - git stash pop --quiet +DENYLIST=tools/uncrustify-denylist +UNCRUSTIFY="$(xcrun -f uncrustify)" + +if git rev-parse --verify HEAD >/dev/null 2>&1 ; then + printf >&2 "Validating code style diff against previous commit...\n" + against=HEAD +else + # Initial commit: diff against an empty tree object + printf >&2 "Validating code style diff for entire source tree...\n" + against=$(git hash-object -t tree /dev/null) fi -exit $RESULT + +diff_with_stdin() +{ + if which colordiff >/dev/null 2>&1; then + diff -u "$1" - | colordiff + else + diff -u "$1" - + fi +} + +# Keep track of offending files +staged_paths_with_format_errors=() + +# Note that we exclude staged deletions via --diff-filter +for path in $(git diff --staged --name-only --diff-filter="d" $against); do + # Parse our deny-list to find what to skip + while IFS= read -r deny_path; do + # Skip empty lines and comments + if [[ -z "$deny_path" || "$deny_path" == \#* ]]; then + continue + fi + + # (Prepend ./ to the path in question to match the format used in the denylist) + # Note that excluded directories must specify a trailing slash (or the latter string here needs tweaking) + if [[ "./$path" == "$deny_path" || "./$path" == "$deny_path"* ]]; then + # (Continue outer loop of files to be committed) + continue 2 + fi + done < "$DENYLIST" + + # Skip non-C/++ files + case "$path" in + *.c|*.h|*.cpp) + ;; + *) + continue + ;; + esac + + printf >&2 "Validating code style for $path: " + + if "$UNCRUSTIFY" -q -c tools/xnu-uncrustify.cfg --check -f "$path" >/dev/null 2>&1; then + printf >&2 "\e[1;32mok\e[0m.\n" + else + printf >&2 "\e[1;31minvalid style\e[0m.\n" + "$UNCRUSTIFY" -q -c tools/xnu-uncrustify.cfg -f "$path" | diff_with_stdin "$path" + staged_paths_with_format_errors+=($path) + fi +done + +if [ ${#staged_paths_with_format_errors[@]} -ne 0 ]; then + path_list="${staged_paths_with_format_errors[*]}" + printf >&2 "\e[1;31mSome files have invalid code style, aborting commit. 
To reformat:\n" + printf >&2 "$ $UNCRUSTIFY -q -c tools/xnu-uncrustify.cfg --replace --no-backup $path_list\e[0m\n" + exit 1 +fi + +exit 0 diff --git a/tools/syscall_map.lua b/tools/syscall_map.lua index a1d8ac68c..a068ede94 100755 --- a/tools/syscall_map.lua +++ b/tools/syscall_map.lua @@ -388,6 +388,12 @@ local syscalls = { 'mach_timespec_t *wakeup_time', }, }, + { number = 63, name = 'mach_vm_reclaim_update_kernel_accounting_trap', + arguments = { + 'mach_port_name_t target', + 'uint64_t *bytes_reclaimed', + }, + }, { number = 70, name = 'host_create_mach_voucher', arguments = { diff --git a/tools/tests/zero-to-n/Makefile b/tools/tests/zero-to-n/Makefile index 293e845b2..4ea3d39e2 100644 --- a/tools/tests/zero-to-n/Makefile +++ b/tools/tests/zero-to-n/Makefile @@ -1,6 +1,6 @@ include ../Makefile.common -CFLAGS := -Os -g $(ARCH_FLAGS) -isysroot $(SDKROOT) -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders +CFLAGS := -std=c23 -Os -g $(ARCH_FLAGS) -isysroot $(SDKROOT) -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders DSTROOT?=$(shell /bin/pwd) SYMROOT?=$(shell /bin/pwd) diff --git a/tools/tests/zero-to-n/zero-to-n.c b/tools/tests/zero-to-n/zero-to-n.c index ea1950897..9fde93fb1 100644 --- a/tools/tests/zero-to-n/zero-to-n.c +++ b/tools/tests/zero-to-n/zero-to-n.c @@ -25,6 +25,8 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#define __STDC_WANT_LIB_EXT1__ 1 + #include #include #include @@ -33,13 +35,12 @@ #include #include #include -#include #include #include #include #include #include - +#include #include #include #include @@ -60,11 +61,9 @@ #include #include -#if TARGET_OS_XR #include #include #include -#endif /* TARGET_OS_XR */ typedef enum wake_type { WAKE_BROADCAST_ONESEM, WAKE_BROADCAST_PERTHREAD, WAKE_CHAIN, WAKE_HOP } wake_type_t; typedef enum my_policy_type { MY_POLICY_REALTIME, MY_POLICY_TIMESHARE, MY_POLICY_TIMESHARE_NO_SMT, MY_POLICY_FIXEDPRI } my_policy_type_t; @@ -114,6 +113,7 @@ static semaphore_t g_main_sem; static uint64_t *g_thread_endtimes_abs; static boolean_t g_verbose = FALSE; static boolean_t g_do_affinity = FALSE; +static boolean_t g_rt_workgroup_interval = FALSE; static uint64_t g_starttime_abs; static uint32_t g_iteration_sleeptime_us = 0; static uint32_t g_priority = 0; @@ -159,6 +159,11 @@ static boolean_t g_test_rt = FALSE; static boolean_t g_rt_churn = FALSE; +/* If true, churn threads will join the same work interval as non-churn. This + * will not change the work interval's start or deadline. Useful if churn threads + * are meant to pre-warm the workgroup. 
*/ +static boolean_t g_rt_churn_same_wg = FALSE; + /* On SMT machines, test whether realtime threads are scheduled on the correct CPUs */ static boolean_t g_test_rt_smt = FALSE; @@ -194,11 +199,13 @@ static semaphore_t g_rt_churn_start_sem; static semaphore_t *g_semarr; -#if TARGET_OS_XR -/* Workgroup which allows RT on xrOS */ +/* Workgroup (for CLPC, and required to get RT on visionOS) */ os_workgroup_t g_rt_workgroup = NULL; -os_workgroup_join_token_s g_rt_workgroup_join_token = { 0 }; -#endif /* TARGET_OS_XR */ +os_workgroup_interval_t g_rt_churn_workgroup = NULL; +__thread os_workgroup_join_token_s th_rt_workgroup_join = { 0 }; + +/* Cluster to bind to, if any */ +static char g_bind_cluster_type = '\0'; typedef struct { __attribute__((aligned(128))) uint32_t current; @@ -224,9 +231,9 @@ inline static void yield(void) { #if defined(__arm64__) - asm volatile ("yield"); + __asm__ volatile ("yield"); #elif defined(__x86_64__) || defined(__i386__) - asm volatile ("pause"); + __asm__ volatile ("pause"); #else #error Unrecognized architecture #endif @@ -235,77 +242,11 @@ yield(void) #define BIT(b) (1ULL << (b)) #define mask(width) (width >= 64 ? -1ULL : (BIT(width) - 1)) - #if TARGET_OS_XR -/* - * The plist (in JSON as it's more compact). - * - * { - * "WorkloadIDTable":{ - * "com.apple.test":{ - * "Phases":{ - * "Realtime":{ - * "WorkIntervalType":"DEFAULT", - * "WorkloadClass":"REALTIME" - * } - * }, - * "Root":{"DefaultPhase":"Realtime"}} - * } - * } - */ -static uint8_t workload_config_plist[] = { - 0x3c, 0x3f, 0x78, 0x6d, 0x6c, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, - 0x6e, 0x3d, 0x22, 0x31, 0x2e, 0x30, 0x22, 0x20, 0x65, 0x6e, 0x63, 0x6f, - 0x64, 0x69, 0x6e, 0x67, 0x3d, 0x22, 0x55, 0x54, 0x46, 0x2d, 0x38, 0x22, - 0x3f, 0x3e, 0x0a, 0x3c, 0x21, 0x44, 0x4f, 0x43, 0x54, 0x59, 0x50, 0x45, - 0x20, 0x70, 0x6c, 0x69, 0x73, 0x74, 0x20, 0x50, 0x55, 0x42, 0x4c, 0x49, - 0x43, 0x20, 0x22, 0x2d, 0x2f, 0x2f, 0x41, 0x70, 0x70, 0x6c, 0x65, 0x2f, - 0x2f, 0x44, 0x54, 0x44, 0x20, 0x50, 0x4c, 0x49, 0x53, 0x54, 0x20, 0x31, - 0x2e, 0x30, 0x2f, 0x2f, 0x45, 0x4e, 0x22, 0x20, 0x22, 0x68, 0x74, 0x74, - 0x70, 0x3a, 0x2f, 0x2f, 0x77, 0x77, 0x77, 0x2e, 0x61, 0x70, 0x70, 0x6c, - 0x65, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x44, 0x54, 0x44, 0x73, 0x2f, 0x50, - 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x4c, 0x69, 0x73, 0x74, 0x2d, - 0x31, 0x2e, 0x30, 0x2e, 0x64, 0x74, 0x64, 0x22, 0x3e, 0x0a, 0x3c, 0x70, - 0x6c, 0x69, 0x73, 0x74, 0x20, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, - 0x3d, 0x22, 0x31, 0x2e, 0x30, 0x22, 0x3e, 0x0a, 0x3c, 0x64, 0x69, 0x63, - 0x74, 0x3e, 0x0a, 0x09, 0x3c, 0x6b, 0x65, 0x79, 0x3e, 0x57, 0x6f, 0x72, - 0x6b, 0x6c, 0x6f, 0x61, 0x64, 0x49, 0x44, 0x54, 0x61, 0x62, 0x6c, 0x65, - 0x3c, 0x2f, 0x6b, 0x65, 0x79, 0x3e, 0x0a, 0x09, 0x3c, 0x64, 0x69, 0x63, - 0x74, 0x3e, 0x0a, 0x09, 0x09, 0x3c, 0x6b, 0x65, 0x79, 0x3e, 0x63, 0x6f, - 0x6d, 0x2e, 0x61, 0x70, 0x70, 0x6c, 0x65, 0x2e, 0x74, 0x65, 0x73, 0x74, - 0x3c, 0x2f, 0x6b, 0x65, 0x79, 0x3e, 0x0a, 0x09, 0x09, 0x3c, 0x64, 0x69, - 0x63, 0x74, 0x3e, 0x0a, 0x09, 0x09, 0x09, 0x3c, 0x6b, 0x65, 0x79, 0x3e, - 0x50, 0x68, 0x61, 0x73, 0x65, 0x73, 0x3c, 0x2f, 0x6b, 0x65, 0x79, 0x3e, - 0x0a, 0x09, 0x09, 0x09, 0x3c, 0x64, 0x69, 0x63, 0x74, 0x3e, 0x0a, 0x09, - 0x09, 0x09, 0x09, 0x3c, 0x6b, 0x65, 0x79, 0x3e, 0x52, 0x65, 0x61, 0x6c, - 0x74, 0x69, 0x6d, 0x65, 0x3c, 0x2f, 0x6b, 0x65, 0x79, 0x3e, 0x0a, 0x09, - 0x09, 0x09, 0x09, 0x3c, 0x64, 0x69, 0x63, 0x74, 0x3e, 0x0a, 0x09, 0x09, - 0x09, 0x09, 0x09, 0x3c, 0x6b, 0x65, 0x79, 0x3e, 0x57, 0x6f, 0x72, 0x6b, - 0x49, 0x6e, 0x74, 0x65, 0x72, 
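	/* The raw bytes above and below are the old inline XML workload-config
	 * plist that this change removes; the replacement #embed pulls the
	 * config from zero_to_n_workload_config.plist at build time. */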
0x76, 0x61, 0x6c, 0x54, 0x79, 0x70, 0x65, - 0x3c, 0x2f, 0x6b, 0x65, 0x79, 0x3e, 0x0a, 0x09, 0x09, 0x09, 0x09, 0x09, - 0x3c, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3e, 0x44, 0x45, 0x46, 0x41, - 0x55, 0x4c, 0x54, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3e, - 0x0a, 0x09, 0x09, 0x09, 0x09, 0x09, 0x3c, 0x6b, 0x65, 0x79, 0x3e, 0x57, - 0x6f, 0x72, 0x6b, 0x6c, 0x6f, 0x61, 0x64, 0x43, 0x6c, 0x61, 0x73, 0x73, - 0x3c, 0x2f, 0x6b, 0x65, 0x79, 0x3e, 0x0a, 0x09, 0x09, 0x09, 0x09, 0x09, - 0x3c, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3e, 0x52, 0x45, 0x41, 0x4c, - 0x54, 0x49, 0x4d, 0x45, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, - 0x3e, 0x0a, 0x09, 0x09, 0x09, 0x09, 0x3c, 0x2f, 0x64, 0x69, 0x63, 0x74, - 0x3e, 0x0a, 0x09, 0x09, 0x09, 0x3c, 0x2f, 0x64, 0x69, 0x63, 0x74, 0x3e, - 0x0a, 0x09, 0x09, 0x09, 0x3c, 0x6b, 0x65, 0x79, 0x3e, 0x52, 0x6f, 0x6f, - 0x74, 0x3c, 0x2f, 0x6b, 0x65, 0x79, 0x3e, 0x0a, 0x09, 0x09, 0x09, 0x3c, - 0x64, 0x69, 0x63, 0x74, 0x3e, 0x0a, 0x09, 0x09, 0x09, 0x09, 0x3c, 0x6b, - 0x65, 0x79, 0x3e, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x68, - 0x61, 0x73, 0x65, 0x3c, 0x2f, 0x6b, 0x65, 0x79, 0x3e, 0x0a, 0x09, 0x09, - 0x09, 0x09, 0x3c, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3e, 0x52, 0x65, - 0x61, 0x6c, 0x74, 0x69, 0x6d, 0x65, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x69, - 0x6e, 0x67, 0x3e, 0x0a, 0x09, 0x09, 0x09, 0x3c, 0x2f, 0x64, 0x69, 0x63, - 0x74, 0x3e, 0x0a, 0x09, 0x09, 0x3c, 0x2f, 0x64, 0x69, 0x63, 0x74, 0x3e, - 0x0a, 0x09, 0x3c, 0x2f, 0x64, 0x69, 0x63, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, - 0x64, 0x69, 0x63, 0x74, 0x3e, 0x0a, 0x3c, 0x2f, 0x70, 0x6c, 0x69, 0x73, - 0x74, 0x3e, 0x0a +static const char workload_config_plist[] = { +#embed "zero_to_n_workload_config.plist" suffix(,) + 0, }; -static const size_t workload_config_plist_len = 591; static bool workload_config_load(void) @@ -313,7 +254,7 @@ workload_config_load(void) /* Try to load the test workload config plist. */ size_t len = 0; int result = sysctlbyname("kern.workload_config", NULL, &len, - &workload_config_plist[0], workload_config_plist_len); + (void*) (const void*) workload_config_plist, strlen(workload_config_plist)); if (result != 0) { warnx("failed to load the workload config: %d", errno); return false; @@ -328,9 +269,6 @@ workload_config_unload(void) /* clear the loaded workload config plist.. 
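	 * Writing a 1-byte empty string to kern.workload_config is what drops
	 * the table loaded above.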
*/ size_t len = 0; sysctlbyname("kern.workload_config", NULL, &len, "", 1); - - /* Leave the workgroup */ - os_workgroup_leave(g_rt_workgroup, &g_rt_workgroup_join_token); } #endif /* TARGET_OS_XR */ @@ -395,12 +333,7 @@ create_churn_threads() for (uint32_t i = 0; i < g_churn_count; i++) { pthread_t new_thread; -#if TARGET_OS_XR - err = pthread_create_with_workgroup_np(&new_thread, g_rt_workgroup, - &attr, churn_thread, NULL); -#else err = pthread_create(&new_thread, &attr, churn_thread, NULL); -#endif /* TARGET_OS_XR */ if (err) { errc(EX_OSERR, err, "pthread_create"); @@ -459,6 +392,14 @@ rt_churn_thread(__unused void *arg) { rt_churn_thread_setup(); + int kr; + if (g_rt_churn_same_wg) { + kr = os_workgroup_join(g_rt_workgroup, &th_rt_workgroup_join); + } else { + kr = os_workgroup_join(g_rt_churn_workgroup, &th_rt_workgroup_join); + } + mach_assert_zero_t(0, kr); + for (uint32_t i = 0; i < g_iterations; i++) { kern_return_t kr = semaphore_wait_signal(g_rt_churn_start_sem, g_rt_churn_sem); mach_assert_zero_t(0, kr); @@ -473,9 +414,15 @@ rt_churn_thread(__unused void *arg) } } - kern_return_t kr = semaphore_signal(g_rt_churn_sem); + kr = semaphore_signal(g_rt_churn_sem); mach_assert_zero_t(0, kr); + if (g_rt_churn_same_wg) { + os_workgroup_leave(g_rt_workgroup, &th_rt_workgroup_join); + } else { + os_workgroup_leave(g_rt_churn_workgroup, &th_rt_workgroup_join); + } + return NULL; } @@ -529,13 +476,7 @@ create_rt_churn_threads(void) for (uint32_t i = 0; i < g_rt_churn_count; i++) { pthread_t new_thread; -#if TARGET_OS_XR - err = pthread_create_with_workgroup_np(&new_thread, g_rt_workgroup, - &attr, rt_churn_thread, NULL); -#else err = pthread_create(&new_thread, &attr, rt_churn_thread, NULL); -#endif /* TARGET_OS_XR */ - if (err) { errc(EX_OSERR, err, "pthread_create"); } @@ -699,6 +640,13 @@ worker_thread(void *arg) /* Set policy and so forth */ thread_setup(my_id); + if (g_rt_workgroup != NULL) { + kr = os_workgroup_join(g_rt_workgroup, &th_rt_workgroup_join); + if (kr) { + errc(EX_OSERR, kr, "os_workgroup_join from worker thread %d", my_id); + } + } + for (uint32_t i = 0; i < g_iterations; i++) { if (my_id == 0) { /* @@ -735,7 +683,6 @@ worker_thread(void *arg) } /* Signal main thread and wait for start of iteration */ - kr = semaphore_wait_signal(g_leadersem, g_main_sem); mach_assert_zero_t(my_id, kr); @@ -745,6 +692,17 @@ worker_thread(void *arg) assert_zero_t(my_id, atomic_load_explicit(&g_done_threads, memory_order_relaxed)); + if (g_rt_workgroup_interval) { + uint64_t interval_start = mach_absolute_time(); + uint64_t constraint_nanos = g_rt_ll ? LL_CONSTRAINT_NANOS : CONSTRAINT_NANOS; + uint64_t deadline = interval_start + nanos_to_abs(constraint_nanos); + debug_log("Starting work interval %u at %llu, deadline %llu\n", i, interval_start, deadline); + kr = os_workgroup_interval_start(g_rt_workgroup, interval_start, deadline, NULL); + if (kr != 0) { + printf("WARN: os_workgroup_interval_start returned %d; overlapping intervals?\n", kr); + } + } + switch (g_waketype) { case WAKE_BROADCAST_ONESEM: kr = semaphore_signal_all(g_broadcastsem); @@ -877,7 +835,16 @@ worker_thread(void *arg) } } - debug_log("Thread %p done spinning, iteration %d\n", pthread_self(), i); + debug_log("Thread %u[%p] done spinning, iteration %d\n", my_id, pthread_self(), i); + + if (g_rt_workgroup_interval && my_id == 0) { + debug_log("Finishing work interval %u at %llu\n", i, mach_absolute_time()); + /* Finish the work interval. 
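		 * Only thread 0, which started the interval before waking the
		 * other workers, issues the matching finish here.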
*/ + kr = os_workgroup_interval_finish(g_rt_workgroup, NULL); + if (kr != 0) { + printf("WARN: os_workgroup_interval_start returned %d; overlapping intervals?\n", kr); + } + } } if (my_id == 0) { @@ -915,6 +882,10 @@ worker_thread(void *arg) time_value_add(&worker_threads_total_runtime, &runtime); os_unfair_lock_unlock(&runtime_lock); + if (g_rt_workgroup != NULL) { + os_workgroup_leave(g_rt_workgroup, &th_rt_workgroup_join); + } + return 0; } @@ -1053,22 +1024,50 @@ main(int argc, char **argv) printf("TEST SKIPPED\n"); exit(0); } +#endif /* TARGET_OS_XR */ - os_workgroup_attr_s attr = OS_WORKGROUP_ATTR_INITIALIZER_DEFAULT; - g_rt_workgroup = os_workgroup_create_with_workload_id("test", "com.apple.test", &attr); - if (g_rt_workgroup == NULL) { - err(EX_OSERR, "failed to create the test workgroup"); + if (g_rt_workgroup_interval) { + assert(g_policy == MY_POLICY_REALTIME); + + os_workgroup_attr_s attr = OS_WORKGROUP_ATTR_INITIALIZER_DEFAULT; + /* Pretend to be an audio client so that os_workgroup_max_parallel_threads is accurate. */ + ret = os_workgroup_attr_set_interval_type(&attr, OS_WORKGROUP_INTERVAL_TYPE_AUDIO_CLIENT); + if (ret != 0) { + errx(EX_OSERR, "os_workgroup_attr_set_interval_type(OS_WORKGROUP_INTERVAL_TYPE_AUDIO_CLIENT)"); + } + g_rt_workgroup = os_workgroup_interval_create_with_workload_id("zero-to-n", "com.apple.test.zero-to-n.audio", OS_CLOCK_MACH_ABSOLUTE_TIME, &attr); + if (g_rt_workgroup == NULL) { + errx(EX_OSERR, "Failed to create zero-to-n workgroup interval."); + } + } else if (g_policy == MY_POLICY_REALTIME) { + os_workgroup_attr_s attr = OS_WORKGROUP_ATTR_INITIALIZER_DEFAULT; + g_rt_workgroup = os_workgroup_create_with_workload_id("zero-to-n", "com.apple.test.zero-to-n.default", &attr); + if (g_rt_workgroup == NULL) { + errx(EX_OSERR, "Failed to create zero-to-n workgroup."); + } } - /* Join the main thread to the workgroup. */ - ret = os_workgroup_join(g_rt_workgroup, &g_rt_workgroup_join_token); - assert_zero_t(0, ret); -#endif /* TARGET_OS_XR */ + if (g_rt_churn && !g_rt_churn_same_wg) { + os_workgroup_attr_s attr = OS_WORKGROUP_ATTR_INITIALIZER_DEFAULT; + g_rt_churn_workgroup = os_workgroup_create_with_workload_id("churn", "com.apple.test.zero-to-n.churn", &attr); + if (g_rt_churn_workgroup == NULL) { + errx(EX_OSERR, "Failed to create RT churn workgroup."); + } + } + + if (g_bind_cluster_type != '\0') { + ret = set_recommended_cluster(g_bind_cluster_type); + if (ret != 0) { + warn("Failed to bind to cluster type %c", g_bind_cluster_type); + } else { + printf("Bound to cluster type %c\n", g_bind_cluster_type); + } + } size_t maxcpu_size = sizeof(g_maxcpus); ret = sysctlbyname("hw.ncpu", &g_maxcpus, &maxcpu_size, NULL, 0); if (ret) { - err(EX_OSERR, "Failed sysctlbyname(hw.ncpu)"); + errc(EX_OSERR, ret, "Failed sysctlbyname(hw.ncpu)"); } assert(g_maxcpus <= 64); /* g_cpu_map needs to be extended for > 64 cpus */ @@ -1077,21 +1076,11 @@ main(int argc, char **argv) if (ret) { /* hw.perflevel0.logicalcpu failed so falling back to hw.ncpu */ g_numcpus = g_maxcpus; - } else { - /* Test for multiple perf levels */ - uint32_t result = 0; - size_t result_size = sizeof(result); - ret = sysctlbyname("hw.perflevel1.logicalcpu", &result, &result_size, NULL, 0); - if ((ret == 0) && (result > 0)) { - /* */ - /* Multiple perf levels detected, so bind this task to the highest perf node */ - ret = set_recommended_cluster('p'); - if (ret && g_test_rt) { - printf("set_recommended_cluster('p') failed. 
-				printf("TEST SKIPPED\n");
-				exit(0);
-			}
-		}
+	}
+
+	if (g_rt_workgroup_interval) {
+		/* Use the os_workgroup's max parallelism instead of any heuristic. */
+		g_numcpus = os_workgroup_max_parallel_threads(g_rt_workgroup, NULL);
 	}
 
 	size_t physicalcpu_size = sizeof(g_nphysicalcpu);
@@ -1105,13 +1094,10 @@ main(int argc, char **argv)
 	}
 
 	size_t logicalcpu_size = sizeof(g_nlogicalcpu);
-	ret = sysctlbyname("hw.perflevel0.logicalcpu", &g_nlogicalcpu, &logicalcpu_size, NULL, 0);
+	/* Read the logical CPU count directly from hw.logicalcpu */
+	ret = sysctlbyname("hw.logicalcpu", &g_nlogicalcpu, &logicalcpu_size, NULL, 0);
 	if (ret) {
-		/* hw.perflevel0.logicalcpu failed so falling back to hw.logicalcpu */
-		ret = sysctlbyname("hw.logicalcpu", &g_nlogicalcpu, &logicalcpu_size, NULL, 0);
-		if (ret) {
-			err(EX_OSERR, "Failed sysctlbyname(hw.logicalcpu)");
-		}
+		err(EX_OSERR, "Failed sysctlbyname(hw.logicalcpu)");
 	}
 
 	if (g_test_rt) {
@@ -1124,6 +1110,7 @@ main(int argc, char **argv)
 			g_numthreads = 2;
 		}
 	}
+	g_policy = MY_POLICY_REALTIME;
 	g_histogram = true;
 
 	/* Don't change g_traceworthy_latency_ns if it's explicitly been set to something other than the default */
@@ -1306,12 +1293,7 @@ main(int argc, char **argv)
 
 	/* Create the threads */
 	for (uint32_t i = 0; i < g_numthreads; i++) {
-#if TARGET_OS_XR
-		ret = pthread_create_with_workgroup_np(&threads[i], g_rt_workgroup,
-		    NULL, worker_thread, (void*)(uintptr_t)i);
-#else
 		ret = pthread_create(&threads[i], NULL, worker_thread, (void*)(uintptr_t)i);
-#endif
 		if (ret) {
 			errc(EX_OSERR, ret, "pthread_create %d", i);
 		}
@@ -1324,8 +1306,6 @@ main(int argc, char **argv)
 
 	bool recommended_cores_warning = false;
 
-	thread_setup(0);
-
 	g_starttime_abs = mach_absolute_time();
 
 	if (g_churn_pri) {
@@ -1651,11 +1631,12 @@ usage()
 {
 	errx(EX_USAGE, "Usage: %s <threads> <chain | hop | broadcast-single-sem | broadcast-per-thread> "
 	    "<realtime | timeshare | fixed> <iterations>\n\t\t"
-	    "[--trace <traceworthy latency in ns>] "
+	    "[--trace <traceworthy latency in ns>]\n\t\t"
+	    "[--rt-interval] [--bind <cluster type>]\n\t\t"
 	    "[--verbose] [--spin-one] [--spin-all] [--spin-time <nanos>] [--affinity]\n\t\t"
 	    "[--no-sleep] [--drop-priority] [--churn-pri <pri>] [--churn-count <n>] [--churn-random]\n\t\t"
 	    "[--extra-thread-count <n>]\n\t\t"
-	    "[--rt-churn] [--rt-churn-count <n>] [--rt-ll]\n\t\t"
+	    "[--rt-churn] [--rt-churn-same-wg] [--rt-churn-count <n>] [--rt-ll]\n\t\t"
 	    "[--test-rt] [--test-rt-smt] [--test-rt-avoid0] [--test-strict-fail]",
 	    getprogname());
 }
@@ -1695,6 +1676,22 @@ read_signed_dec_arg()
 	return arg_val;
 }
 
+static char
+read_cluster_type_arg()
+{
+	char cluster = optarg[0];
+	switch (cluster) {
+	case 'E':
+	case 'P':
+		/* Cluster type is valid. */
+		return cluster;
+	default:
+		errx(EX_USAGE, "arg --%s should be a valid cluster type, found \"%s\"",
+		    g_longopts[option_index].name, optarg);
+		return 'P';
+	}
+}
+
 static void
 parse_args(int argc, char *argv[])
 {
@@ -1707,6 +1704,7 @@ parse_args(int argc, char *argv[])
 		OPT_CHURN_COUNT,
 		OPT_RT_CHURN_COUNT,
 		OPT_EXTRA_THREAD_COUNT,
+		OPT_BIND_CLUSTER,
 	};
 
 	static struct option longopts[] = {
@@ -1718,12 +1716,14 @@ parse_args(int argc, char *argv[])
 		{ "churn-count", required_argument, NULL, OPT_CHURN_COUNT },
 		{ "rt-churn-count", required_argument, NULL, OPT_RT_CHURN_COUNT },
 		{ "extra-thread-count", required_argument, NULL, OPT_EXTRA_THREAD_COUNT },
+		{ "bind", required_argument, NULL, OPT_BIND_CLUSTER },
 		{ "churn-random", no_argument, (int*)&g_churn_random, TRUE },
 		{ "switched_apptype", no_argument, (int*)&g_seen_apptype, TRUE },
 		{ "spin-one", no_argument, (int*)&g_do_one_long_spin, TRUE },
 		{ "intel-only", no_argument, (int*)&g_run_on_intel_only, TRUE },
 		{ "spin-all", no_argument, (int*)&g_do_all_spin, TRUE },
 		{ "affinity", no_argument, (int*)&g_do_affinity, TRUE },
+		{ "rt-interval", no_argument, (int*)&g_rt_workgroup_interval, TRUE },
 		{ "no-sleep", no_argument, (int*)&g_do_sleep, FALSE },
 		{ "drop-priority", no_argument, (int*)&g_drop_priority, TRUE },
 		{ "test-rt", no_argument, (int*)&g_test_rt, TRUE },
@@ -1731,6 +1731,7 @@ parse_args(int argc, char *argv[])
 		{ "test-rt-avoid0", no_argument, (int*)&g_test_rt_avoid0, TRUE },
 		{ "test-strict-fail", no_argument, (int*)&g_test_strict_fail, TRUE },
 		{ "rt-churn", no_argument, (int*)&g_rt_churn, TRUE },
+		{ "rt-churn-same-wg", no_argument, (int*)&g_rt_churn_same_wg, TRUE },
 		{ "rt-ll", no_argument, (int*)&g_rt_ll, TRUE },
 		{ "histogram", no_argument, (int*)&g_histogram, TRUE },
 		{ "verbose", no_argument, (int*)&g_verbose, TRUE },
@@ -1769,6 +1770,9 @@ parse_args(int argc, char *argv[])
 		case OPT_EXTRA_THREAD_COUNT:
 			g_extra_thread_count = read_signed_dec_arg();
 			break;
+		case OPT_BIND_CLUSTER:
+			g_bind_cluster_type = read_cluster_type_arg();
+			break;
 		case '?':
 		case 'h':
 		default:
@@ -1828,4 +1832,12 @@ parse_args(int argc, char *argv[])
 	if (g_numthreads == 1 && g_waketype == WAKE_HOP) {
 		errx(EX_USAGE, "hop mode requires more than one thread");
 	}
+
+	if (g_rt_churn_same_wg && !g_rt_churn) {
+		errx(EX_USAGE, "--rt-churn-same-wg requires --rt-churn");
+	}
+
+	if (g_rt_workgroup_interval && g_policy != MY_POLICY_REALTIME) {
+		errx(EX_USAGE, "--rt-interval can only be used with realtime policy.");
+	}
 }
diff --git a/tools/tests/zero-to-n/zero_to_n_workload_config.plist b/tools/tests/zero-to-n/zero_to_n_workload_config.plist
new file mode 100644
index 000000000..904936867
--- /dev/null
+++ b/tools/tests/zero-to-n/zero_to_n_workload_config.plist
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>WorkloadIDTable</key>
+	<dict>
+		<key>com.apple.test.zero-to-n.audio</key>
+		<dict>
+			<key>Phases</key>
+			<dict>
+				<key>Realtime</key>
+				<dict>
+					<key>WorkIntervalType</key>
+					<string>AUDIO_CLIENT</string>
+					<key>WorkloadClass</key>
+					<string>REALTIME</string>
+				</dict>
+			</dict>
+			<key>Root</key>
+			<dict>
+				<key>DefaultPhase</key>
+				<string>Realtime</string>
+			</dict>
+		</dict>
+		<key>com.apple.test.zero-to-n.default</key>
+		<dict>
+			<key>Phases</key>
+			<dict>
+				<key>Realtime</key>
+				<dict>
+					<key>WorkIntervalType</key>
+					<string>DEFAULT</string>
+					<key>WorkloadClass</key>
+					<string>REALTIME</string>
+				</dict>
+			</dict>
+			<key>Root</key>
+			<dict>
+				<key>DefaultPhase</key>
+				<string>Realtime</string>
+			</dict>
+		</dict>
+		<key>com.apple.test.zero-to-n.churn</key>
+		<dict>
+			<key>Phases</key>
+			<dict>
+				<key>Realtime</key>
+				<dict>
+					<key>WorkIntervalType</key>
+					<string>DEFAULT</string>
+					<key>WorkloadClass</key>
+					<string>REALTIME</string>
+				</dict>
+			</dict>
+			<key>Root</key>
+			<dict>
+				<key>DefaultPhase</key>
+				<string>Realtime</string>
+			</dict>
+		</dict>
+	</dict>
+</dict>
+</plist>
\ No newline at end of file